1965 files changed, 118479 insertions, 35366 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index e9d8e85..71a7426 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -74,6 +74,7 @@ set(LLVM_EXAMPLES_BINARY_DIR ${LLVM_BINARY_DIR}/examples)
 set(LLVM_LIBDIR_SUFFIX "" CACHE STRING
   "Define suffix of library directory name (32/64)" )
 
 set(LLVM_ALL_TARGETS
+  AArch64
   ARM
   CppBackend
   Hexagon
@@ -375,6 +376,13 @@ set(CMAKE_INCLUDE_CURRENT_DIR ON)
 include_directories( ${LLVM_BINARY_DIR}/include ${LLVM_MAIN_INCLUDE_DIR})
 
+if( ${CMAKE_SYSTEM_NAME} MATCHES FreeBSD )
+  # On FreeBSD, /usr/local/* is not used by default. In order to build LLVM
+  # with libxml2, iconv.h, etc., we must add /usr/local paths.
+  include_directories("/usr/local/include")
+  link_directories("/usr/local/lib")
+endif( ${CMAKE_SYSTEM_NAME} MATCHES FreeBSD )
+
 if( ${CMAKE_SYSTEM_NAME} MATCHES SunOS )
    SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -include llvm/Support/Solaris.h")
 endif( ${CMAKE_SYSTEM_NAME} MATCHES SunOS )
diff --git a/CODE_OWNERS.TXT b/CODE_OWNERS.TXT
index c02e71c..10bf071 100644
--- a/CODE_OWNERS.TXT
+++ b/CODE_OWNERS.TXT
@@ -90,6 +90,10 @@ E: sabre@nondot.org
 W: http://nondot.org/~sabre/
 D: Everything not covered by someone else
 
+N: Tim Northover
+E: Tim.Northover@arm.com
+D: AArch64 backend
+
 N: Jakob Olesen
 D: Register allocators and TableGen
 
@@ -109,6 +113,10 @@ N: Duncan Sands
 E: baldrick@free.fr
 D: DragonEgg
 
+N: Michael Spencer
+E: bigcheesegs@gmail.com
+D: Windows parts of Support, Object, ar, nm, objdump, ranlib, size
+
 N: Tom Stellard
 E: thomas.stellard@amd.com
 E: mesa-dev@lists.freedesktop.org
diff --git a/Makefile b/Makefile
--- a/Makefile
+++ b/Makefile
@@ -248,13 +248,26 @@ build-for-llvm-top:
 SVN = svn
 SVN-UPDATE-OPTIONS =
 AWK = awk
-SUB-SVN-DIRS = $(AWK) '/I|\? / {print $$2}' \
-               | LC_ALL=C xargs $(SVN) info 2>/dev/null \
-               | $(AWK) '/^Path:\ / {print $$2}'
+
+# Multiline variable defining a recursive function for finding svn repos rooted at
+# a given path. svnup() requires one argument: the root to search from.
+define SUB_SVN_DIRS
+svnup() {
+  dirs=`svn status --no-ignore $$1 | awk '/I|\? / {print $$2}' | LC_ALL=C xargs svn info 2>/dev/null | awk '/^Path:\ / {print $$2}'`;
+  if [ "$$dirs" = "" ]; then
+    return;
+  fi;
+  for f in $$dirs; do
+    echo $$f;
+    svnup $$f;
+  done
+}
+endef
+export SUB_SVN_DIRS
 
 update:
 	$(SVN) $(SVN-UPDATE-OPTIONS) update $(LLVM_SRC_ROOT)
-	@ $(SVN) status --no-ignore $(LLVM_SRC_ROOT) | $(SUB-SVN-DIRS) | xargs $(SVN) $(SVN-UPDATE-OPTIONS) update
+	@eval $$SUB_SVN_DIRS; $(SVN) status --no-ignore $(LLVM_SRC_ROOT) | svnup $(LLVM_SRC_ROOT) | xargs $(SVN) $(SVN-UPDATE-OPTIONS) update
 
 happiness: update all check-all
diff --git a/Makefile.config.in b/Makefile.config.in
index 3c4f7b7..d3b57e9 100644
--- a/Makefile.config.in
+++ b/Makefile.config.in
@@ -134,6 +134,9 @@ BUILD_CXX=@BUILD_CXX@
 # Triple for configuring build tools when cross-compiling
 BUILD_TRIPLE=@build@
 
+# Target triple (cpu-vendor-os) which LLVM is compiled for
+HOST_TRIPLE=@host@
+
 # Target triple (cpu-vendor-os) for which we should generate code
 TARGET_TRIPLE=@target@
 
@@ -153,8 +156,17 @@ CXX = @CXX@
 # Path to the CC binary, which use used by testcases for native builds.
 CC := @CC@
 
+# C/C++ preprocessor flags.
+CPPFLAGS += @CPPFLAGS@
+
+# C compiler flags.
+CFLAGS += @CFLAGS@
+
+# C++ compiler flags.
+CXXFLAGS += @CXXFLAGS@
+
 # Linker flags.
-LDFLAGS+=@LDFLAGS@
+LDFLAGS += @LDFLAGS@
 
 # Path to the library archiver program.
 AR_PATH = @AR@
@@ -176,6 +188,7 @@ RANLIB := @RANLIB@
 RM       := @RM@
 SED      := @SED@
 TAR      := @TAR@
+PYTHON   := @PYTHON@
 
 # Paths to miscellaneous programs we hope are present but might not be
 BZIP2    := @BZIP2@
@@ -287,7 +300,7 @@ ENABLE_DOCS = @ENABLE_DOCS@
 ENABLE_DOXYGEN = @ENABLE_DOXYGEN@
 
 # Do we want to enable threads?
-ENABLE_THREADS := @ENABLE_THREADS@
+ENABLE_THREADS := @LLVM_ENABLE_THREADS@
 
 # Do we want to build with position independent code?
 ENABLE_PIC := @ENABLE_PIC@
@@ -358,6 +371,12 @@ NO_MISSING_FIELD_INITIALIZERS = @NO_MISSING_FIELD_INITIALIZERS@
 NO_VARIADIC_MACROS = @NO_VARIADIC_MACROS@
 # -Wcovered-switch-default
 COVERED_SWITCH_DEFAULT = @COVERED_SWITCH_DEFAULT@
+# -Wno-uninitialized
+NO_UNINITIALIZED = @NO_UNINITIALIZED@
+# -Wno-maybe-uninitialized
+NO_MAYBE_UNINITIALIZED = @NO_MAYBE_UNINITIALIZED@
+# -Wno-nested-anon-types
+NO_NESTED_ANON_TYPES = @NO_NESTED_ANON_TYPES@
 
 # Was polly found in tools/polly?
 LLVM_HAS_POLLY = @LLVM_HAS_POLLY@
diff --git a/Makefile.rules b/Makefile.rules
index c64275f..c737965 100644
--- a/Makefile.rules
+++ b/Makefile.rules
@@ -97,7 +97,7 @@ endif
 $(LLVMBuildMakeFrag): $(PROJ_SRC_ROOT)/Makefile.rules \
 		      $(PROJ_OBJ_ROOT)/Makefile.config
	$(Echo) Constructing LLVMBuild project information.
-	$(Verb) $(LLVMBuildTool) \
+	$(Verb)$(PYTHON) $(LLVMBuildTool) \
	  --native-target "$(TARGET_NATIVE_ARCH)" \
	  --enable-targets "$(TARGETS_TO_BUILD)" \
	  --enable-optional-components "$(OPTIONAL_COMPONENTS)" \
@@ -583,16 +583,24 @@ ifeq ($(HOST_OS),Darwin)
   LoadableModuleOptions := -Wl,-flat_namespace -Wl,-undefined,suppress
   SharedLinkOptions := -dynamiclib
-  ifneq ($(ARCH),ARM)
-    SharedLinkOptions += -mmacosx-version-min=$(DARWIN_VERSION)
+  ifdef DEPLOYMENT_TARGET
+    SharedLinkOptions += $(DEPLOYMENT_TARGET)
+  else
+    ifneq ($(ARCH),ARM)
+      SharedLinkOptions += -mmacosx-version-min=$(DARWIN_VERSION)
+    endif
   endif
 else
   SharedLinkOptions=-shared
 endif
 
 ifeq ($(TARGET_OS),Darwin)
-  ifneq ($(ARCH),ARM)
-    TargetCommonOpts += -mmacosx-version-min=$(DARWIN_VERSION)
+  ifdef DEPLOYMENT_TARGET
+    TargetCommonOpts += $(DEPLOYMENT_TARGET)
+  else
+    ifneq ($(ARCH),ARM)
+      TargetCommonOpts += -mmacosx-version-min=$(DARWIN_VERSION)
+    endif
   endif
 endif
 
@@ -648,7 +656,7 @@ else
   ifneq ($(DARWIN_MAJVERS),4)
     LD.Flags += $(RPATH) -Wl,@executable_path/../lib
   endif
-  ifeq ($(RC_BUILDIT),YES)
+  ifeq ($(RC_XBS),YES)
    TempFile := $(shell mkdir -p ${OBJROOT}/dSYMs ; mktemp ${OBJROOT}/dSYMs/llvm-lto.XXXXXX)
    LD.Flags += -Wl,-object_path_lto -Wl,$(TempFile)
   endif
@@ -665,10 +673,12 @@ LD.Flags += $(EXTRA_LD_OPTIONS)
 endif
 
 ifndef NO_PEDANTIC
-CompileCommonOpts += -pedantic -Wno-long-long
+CompileCommonOpts += -pedantic -Wno-long-long $(NO_NESTED_ANON_TYPES)
 endif
 CompileCommonOpts += -Wall -W -Wno-unused-parameter -Wwrite-strings \
-                     $(EXTRA_OPTIONS) $(COVERED_SWITCH_DEFAULT)
+                     $(EXTRA_OPTIONS) $(COVERED_SWITCH_DEFAULT) \
+                     $(NO_UNINITIALIZED) $(NO_MAYBE_UNINITIALIZED) \
+                     $(NO_MISSING_FIELD_INITIALIZERS)
 
 # Enable cast-qual for C++; the workaround is to use const_cast.
 CXX.Flags += -Wcast-qual
@@ -824,7 +834,7 @@ ObjectsBC := $(BaseNameSources:%=$(ObjDir)/%.bc)
 #----------------------------------------------------------
 
 ifeq (-mingw32,$(findstring -mingw32,$(BUILD_TRIPLE)))
-  ECHOPATH := $(Verb)python -u -c "import sys;print ' '.join(sys.argv[1:])"
+  ECHOPATH := $(Verb)$(PYTHON) -u -c "import sys;print ' '.join(sys.argv[1:])"
 else
   ECHOPATH := $(Verb)$(ECHO)
 endif
diff --git a/autoconf/config.sub b/autoconf/config.sub
index 9942491..a8d8528 100755
--- a/autoconf/config.sub
+++ b/autoconf/config.sub
@@ -251,7 +251,8 @@ case $basic_machine in
	| alpha64 | alpha64ev[4-8] | alpha64ev56 | alpha64ev6[78] | alpha64pca5[67] \
	| am33_2.0 \
	| arc | arm | arm[bl]e | arme[lb] | armv[2345] | armv[345][lb] | avr | avr32 \
-	| be32 | be64 \
+	| aarch64 \
+	| be32 | be64 \
	| bfin \
	| c4x | clipper \
	| d10v | d30v | dlx | dsp16xx \
@@ -359,6 +360,7 @@ case $basic_machine in
	| alpha64-* | alpha64ev[4-8]-* | alpha64ev56-* | alpha64ev6[78]-* \
	| alphapca5[67]-* | alpha64pca5[67]-* | arc-* \
	| arm-* | armbe-* | armle-* | armeb-* | armv*-* \
+	| aarch64-* \
	| avr-* | avr32-* \
	| be32-* | be64-* \
	| bfin-* | bs2000-* \
diff --git a/autoconf/configure.ac b/autoconf/configure.ac
index 102147c..b8f1766 100644
--- a/autoconf/configure.ac
+++ b/autoconf/configure.ac
@@ -59,12 +59,43 @@ if test ${srcdir} != "." ; then
   fi
 fi
 
+dnl Default to empty (i.e. assigning the null string to) CFLAGS and CXXFLAGS,
+dnl instead of the autoconf default (for example, '-g -O2' for CC=gcc).
+${CFLAGS=}
+${CXXFLAGS=}
+
 dnl We need to check for the compiler up here to avoid anything else
 dnl starting with a different one.
 AC_PROG_CC(clang llvm-gcc gcc)
 AC_PROG_CXX(clang++ llvm-g++ g++)
 AC_PROG_CPP
 
+dnl If CXX is Clang, check that it can find and parse C++ standard library
+dnl headers.
+if test "$CXX" = "clang++" ; then
+  AC_MSG_CHECKING([whether clang works])
+  AC_LANG_PUSH([C++])
+  dnl Note that space between 'include' and '(' is required. There's a broken
+  dnl regex in aclocal that otherwise will think that we call m4's include
+  dnl builtin.
+  AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[#include <limits>
+#if __has_include (<cxxabi.h>)
+#include <cxxabi.h>
+#endif
+#if __has_include (<unwind.h>)
+#include <unwind.h>
+#endif
+]])],
+[
+  AC_MSG_RESULT([yes])
+],
+[
+  AC_MSG_RESULT([no])
+  AC_MSG_ERROR([Selected compiler could not find or parse C++ standard library headers. Rerun with CC=c-compiler CXX=c++-compiler ./configure ...])
+])
+  AC_LANG_POP([C++])
+fi
+
 dnl Configure all of the projects present in our source tree. While we could
 dnl just AC_CONFIG_SUBDIRS on the set of directories in projects that have a
 dnl configure script, that usage of the AC_CONFIG_SUBDIRS macro is deprecated.
@@ -363,6 +394,7 @@ AC_CACHE_CHECK([target architecture],[llvm_cv_target_arch],
   sparc*-*)               llvm_cv_target_arch="Sparc" ;;
   powerpc*-*)             llvm_cv_target_arch="PowerPC" ;;
   arm*-*)                 llvm_cv_target_arch="ARM" ;;
+  aarch64*-*)             llvm_cv_target_arch="AArch64" ;;
   mips-* | mips64-*)      llvm_cv_target_arch="Mips" ;;
   mipsel-* | mips64el-*)  llvm_cv_target_arch="Mips" ;;
   xcore-*)                llvm_cv_target_arch="XCore" ;;
@@ -396,6 +428,7 @@ case $host in
   sparc*-*)               host_arch="Sparc" ;;
   powerpc*-*)             host_arch="PowerPC" ;;
   arm*-*)                 host_arch="ARM" ;;
+  aarch64*-*)             host_arch="AArch64" ;;
   mips-* | mips64-*)      host_arch="Mips" ;;
   mipsel-* | mips64el-*)  host_arch="Mips" ;;
   xcore-*)                host_arch="XCore" ;;
@@ -614,6 +647,7 @@ else
     PowerPC)     AC_SUBST(TARGET_HAS_JIT,1) ;;
     x86_64)      AC_SUBST(TARGET_HAS_JIT,1) ;;
     ARM)         AC_SUBST(TARGET_HAS_JIT,1) ;;
+    AArch64)     AC_SUBST(TARGET_HAS_JIT,0) ;;
     Mips)        AC_SUBST(TARGET_HAS_JIT,1) ;;
     XCore)       AC_SUBST(TARGET_HAS_JIT,0) ;;
     MSP430)      AC_SUBST(TARGET_HAS_JIT,0) ;;
@@ -745,20 +779,21 @@ dnl Allow specific targets to be specified for building (or not)
 TARGETS_TO_BUILD=""
 AC_ARG_ENABLE([targets],AS_HELP_STRING([--enable-targets],
     [Build specific host targets: all or target1,target2,... Valid targets are:
-     host, x86, x86_64, sparc, powerpc, arm, mips, hexagon,
+     host, x86, x86_64, sparc, powerpc, arm, aarch64, mips, hexagon,
      xcore, msp430, nvptx, and cpp (default=all)]),,
     enableval=all)
 if test "$enableval" = host-only ; then
   enableval=host
 fi
 case "$enableval" in
-  all) TARGETS_TO_BUILD="X86 Sparc PowerPC ARM Mips XCore MSP430 CppBackend MBlaze NVPTX Hexagon" ;;
+  all) TARGETS_TO_BUILD="X86 Sparc PowerPC AArch64 ARM Mips XCore MSP430 CppBackend MBlaze NVPTX Hexagon" ;;
   *)for a_target in `echo $enableval|sed -e 's/,/ /g' ` ; do
       case "$a_target" in
         x86)      TARGETS_TO_BUILD="X86 $TARGETS_TO_BUILD" ;;
         x86_64)   TARGETS_TO_BUILD="X86 $TARGETS_TO_BUILD" ;;
         sparc)    TARGETS_TO_BUILD="Sparc $TARGETS_TO_BUILD" ;;
         powerpc)  TARGETS_TO_BUILD="PowerPC $TARGETS_TO_BUILD" ;;
+        aarch64)  TARGETS_TO_BUILD="AArch64 $TARGETS_TO_BUILD" ;;
         arm)      TARGETS_TO_BUILD="ARM $TARGETS_TO_BUILD" ;;
         mips)     TARGETS_TO_BUILD="Mips $TARGETS_TO_BUILD" ;;
         mipsel)   TARGETS_TO_BUILD="Mips $TARGETS_TO_BUILD" ;;
@@ -1211,10 +1246,15 @@ fi
 dnl Verify that GCC is version 3.0 or higher
 if test "$GCC" = "yes"
 then
-  AC_COMPILE_IFELSE([[#if !defined(__GNUC__) || __GNUC__ < 3
-#error Unsupported GCC version
-#endif
-]], [], [AC_MSG_ERROR([gcc 3.x required, but you have a lower version])])
+  AC_COMPILE_IFELSE(
+[
+  AC_LANG_SOURCE([[
+    #if !defined(__GNUC__) || __GNUC__ < 3
+    #error Unsupported GCC version
+    #endif
+  ]])
+],
+[], [AC_MSG_ERROR([gcc 3.x required, but you have a lower version])])
 fi
 
 dnl Check for GNU Make.  We use its extensions, so don't build without it
@@ -1230,8 +1270,55 @@ dnl Check optional compiler flags.
 AC_MSG_CHECKING([optional compiler flags])
 CXX_FLAG_CHECK(NO_VARIADIC_MACROS, [-Wno-variadic-macros])
 CXX_FLAG_CHECK(NO_MISSING_FIELD_INITIALIZERS, [-Wno-missing-field-initializers])
+CXX_FLAG_CHECK(NO_NESTED_ANON_TYPES, [-Wno-nested-anon-types])
 CXX_FLAG_CHECK(COVERED_SWITCH_DEFAULT, [-Wcovered-switch-default])
-AC_MSG_RESULT([$NO_VARIADIC_MACROS $NO_MISSING_FIELD_INITIALIZERS $COVERED_SWITCH_DEFAULT])
+
+dnl GCC's potential uninitialized use analysis is weak and presents lots of
+dnl false positives, so disable it.
+NO_UNINITIALIZED=
+NO_MAYBE_UNINITIALIZED=
+if test "$GXX" = "yes"
+then
+  CXX_FLAG_CHECK(NO_MAYBE_UNINITIALIZED, [-Wno-maybe-uninitialized])
+  dnl gcc 4.7 introduced -Wmaybe-uninitialized to distinguish cases which are
+  dnl known to be uninitialized from cases which might be uninitialized.  We
+  dnl still want to catch the first kind of errors.
+  if test -z "$NO_MAYBE_UNINITIALIZED"
+  then
+    CXX_FLAG_CHECK(NO_UNINITIALIZED, [-Wno-uninitialized])
+  fi
+fi
+AC_MSG_RESULT([$NO_VARIADIC_MACROS $NO_MISSING_FIELD_INITIALIZERS $NO_NESTED_ANON_TYPES $COVERED_SWITCH_DEFAULT $NO_UNINITIALIZED $NO_MAYBE_UNINITIALIZED])
+
+AC_ARG_WITH([python],
+            [AS_HELP_STRING([--with-python], [path to python])],
+            [PYTHON="$withval"])
+
+if test -n "$PYTHON" && test -x "$PYTHON" ; then
+  AC_MSG_CHECKING([for python])
+  AC_MSG_RESULT([user defined: $with_python])
+else
+  if test -n "$PYTHON" ; then
+    AC_MSG_WARN([specified python ($PYTHON) is not usable, searching path])
+  fi
+
+  AC_PATH_PROG([PYTHON], [python python2 python26],
+               [AC_MSG_RESULT([not found])
+    AC_MSG_ERROR([could not find python 2.5 or higher])])
+fi
+
+AC_MSG_CHECKING([for python >= 2.5])
+ac_python_version=`$PYTHON -c 'import sys; print sys.version.split()[[0]]'`
+ac_python_version_major=`echo $ac_python_version | cut -d'.' -f1`
+ac_python_version_minor=`echo $ac_python_version | cut -d'.' -f2`
+ac_python_version_patch=`echo $ac_python_version | cut -d'.' -f3`
+if test "$ac_python_version_major" -eq "2" \
+   && test "$ac_python_version_minor" -ge "5" ; then
+  AC_MSG_RESULT([$PYTHON ($ac_python_version)])
+else
+  AC_MSG_RESULT([not found])
+  AC_MSG_FAILURE([found python $ac_python_version ($PYTHON); required >= 2.5])
+fi
 
 dnl===-----------------------------------------------------------------------===
 dnl===
@@ -1426,18 +1513,23 @@ AC_CHECK_HEADERS([CrashReporterClient.h])
 dnl Try to find Darwin specific crash reporting global.
 AC_MSG_CHECKING([__crashreporter_info__])
 AC_LINK_IFELSE(
-  AC_LANG_SOURCE(
-    [[extern const char *__crashreporter_info__;
-      int main() {
-        __crashreporter_info__ = "test";
-        return 0;
-      }
-    ]]),
-  AC_MSG_RESULT(yes)
-  AC_DEFINE(HAVE_CRASHREPORTER_INFO, 1, Can use __crashreporter_info__),
-  AC_MSG_RESULT(no)
-  AC_DEFINE(HAVE_CRASHREPORTER_INFO, 0,
-            Define if __crashreporter_info__ exists.))
+[
+  AC_LANG_SOURCE([[
+    extern const char *__crashreporter_info__;
+    int main() {
+      __crashreporter_info__ = "test";
+      return 0;
+    }
+  ]])
+],
+[
+  AC_MSG_RESULT([yes])
+  AC_DEFINE([HAVE_CRASHREPORTER_INFO], [1], [can use __crashreporter_info__])
+],
+[
+  AC_MSG_RESULT([no])
+  AC_DEFINE([HAVE_CRASHREPORTER_INFO], [0], [can use __crashreporter_info__])
+])
 
 dnl===-----------------------------------------------------------------------===
 dnl===
@@ -1463,6 +1555,7 @@ dnl===-----------------------------------------------------------------------===
 AC_CHECK_FUNCS([backtrace ceilf floorf roundf rintf nearbyintf getcwd ])
 AC_CHECK_FUNCS([powf fmodf strtof round ])
+AC_CHECK_FUNCS([log log2 log10 exp exp2])
 AC_CHECK_FUNCS([getpagesize getrusage getrlimit setrlimit gettimeofday ])
 AC_CHECK_FUNCS([isatty mkdtemp mkstemp ])
 AC_CHECK_FUNCS([mktemp posix_spawn pread realpath sbrk setrlimit strdup ])
@@ -1500,10 +1593,15 @@ fi
 dnl Check Win32 API EnumerateLoadedModules.
if test "$llvm_cv_os_type" = "MingW" ; then AC_MSG_CHECKING([whether EnumerateLoadedModules() accepts new decl]) - AC_COMPILE_IFELSE([[#include <windows.h> -#include <imagehlp.h> -extern void foo(PENUMLOADED_MODULES_CALLBACK); -extern void foo(BOOL(CALLBACK*)(PCSTR,ULONG_PTR,ULONG,PVOID));]], + AC_COMPILE_IFELSE( +[ + AC_LANG_SOURCE([[ + #include <windows.h> + #include <imagehlp.h> + extern void foo(PENUMLOADED_MODULES_CALLBACK); + extern void foo(BOOL(CALLBACK*)(PCSTR,ULONG_PTR,ULONG,PVOID)); + ]]) +], [ AC_MSG_RESULT([yes]) llvm_cv_win32_elmcb_pcstr="PCSTR" @@ -1544,22 +1642,28 @@ dnl Since we'll be using these atomic builtins in C++ files we should test dnl the C++ compiler. AC_LANG_PUSH([C++]) AC_LINK_IFELSE( - AC_LANG_SOURCE( - [[int main() { - volatile unsigned long val = 1; - __sync_synchronize(); - __sync_val_compare_and_swap(&val, 1, 0); - __sync_add_and_fetch(&val, 1); - __sync_sub_and_fetch(&val, 1); - return 0; - } - ]]), - AC_LANG_POP([C++]) - AC_MSG_RESULT(yes) - AC_DEFINE(LLVM_HAS_ATOMICS, 1, Has gcc/MSVC atomic intrinsics), - AC_MSG_RESULT(no) - AC_DEFINE(LLVM_HAS_ATOMICS, 0, Has gcc/MSVC atomic intrinsics) - AC_MSG_WARN([LLVM will be built thread-unsafe because atomic builtins are missing])) +[ + AC_LANG_SOURCE([[ + int main() { + volatile unsigned long val = 1; + __sync_synchronize(); + __sync_val_compare_and_swap(&val, 1, 0); + __sync_add_and_fetch(&val, 1); + __sync_sub_and_fetch(&val, 1); + return 0; + } + ]]) +], +[ + AC_MSG_RESULT([yes]) + AC_DEFINE([LLVM_HAS_ATOMICS], [1], [Has gcc/MSVC atomic intrinsics]) +], +[ + AC_MSG_RESULT([no]) + AC_DEFINE([LLVM_HAS_ATOMICS], [0], [Has gcc/MSVC atomic intrinsics]) + AC_MSG_WARN([LLVM will be built thread-unsafe because atomic builtins are missing]) +]) +AC_LANG_POP([C++]) dnl===-----------------------------------------------------------------------=== dnl=== diff --git a/autoconf/m4/cxx_flag_check.m4 b/autoconf/m4/cxx_flag_check.m4 index 62454b7..4b09744 100644 --- a/autoconf/m4/cxx_flag_check.m4 +++ b/autoconf/m4/cxx_flag_check.m4 @@ -1,2 +1,2 @@ AC_DEFUN([CXX_FLAG_CHECK], - [AC_SUBST($1, `$CXX -Werror $2 -fsyntax-only -xc /dev/null 2>/dev/null && echo $2`)]) + [AC_SUBST($1, `$CXX -Werror patsubst($2, [^-Wno-], [-W]) -fsyntax-only -xc /dev/null 2>/dev/null && echo $2`)]) diff --git a/autoconf/m4/func_isinf.m4 b/autoconf/m4/func_isinf.m4 index 22ba81d..40dc48b 100644 --- a/autoconf/m4/func_isinf.m4 +++ b/autoconf/m4/func_isinf.m4 @@ -1,34 +1,40 @@ -# -# This function determins if the isinf function isavailable on this -# platform. -# +dnl +dnl This function determins if the isinf function isavailable on this +dnl platform. 
+dnl
+
 AC_DEFUN([AC_FUNC_ISINF],[
+
 AC_SINGLE_CXX_CHECK([ac_cv_func_isinf_in_math_h],
                     [isinf], [<math.h>],
                     [float f; isinf(f);])
 if test "$ac_cv_func_isinf_in_math_h" = "yes" ; then
-  AC_DEFINE([HAVE_ISINF_IN_MATH_H],1,[Set to 1 if the isinf function is found in <math.h>])
+  AC_DEFINE([HAVE_ISINF_IN_MATH_H], [1],
+            [Set to 1 if the isinf function is found in <math.h>])
 fi
 
 AC_SINGLE_CXX_CHECK([ac_cv_func_isinf_in_cmath],
                     [isinf], [<cmath>],
                     [float f; isinf(f);])
 if test "$ac_cv_func_isinf_in_cmath" = "yes" ; then
-  AC_DEFINE([HAVE_ISINF_IN_CMATH],1,[Set to 1 if the isinf function is found in <cmath>])
+  AC_DEFINE([HAVE_ISINF_IN_CMATH], [1],
+            [Set to 1 if the isinf function is found in <cmath>])
 fi
 
 AC_SINGLE_CXX_CHECK([ac_cv_func_std_isinf_in_cmath],
                     [std::isinf], [<cmath>],
                     [float f; std::isinf(f);])
 if test "$ac_cv_func_std_isinf_in_cmath" = "yes" ; then
-  AC_DEFINE([HAVE_STD_ISINF_IN_CMATH],1,[Set to 1 if the std::isinf function is found in <cmath>])
+  AC_DEFINE([HAVE_STD_ISINF_IN_CMATH], [1],
+            [Set to 1 if the std::isinf function is found in <cmath>])
 fi
 
 AC_SINGLE_CXX_CHECK([ac_cv_func_finite_in_ieeefp_h],
                     [finite], [<ieeefp.h>],
                     [float f; finite(f);])
 if test "$ac_cv_func_finite_in_ieeefp_h" = "yes" ; then
-  AC_DEFINE([HAVE_FINITE_IN_IEEEFP_H],1,[Set to 1 if the finite function is found in <ieeefp.h>])
+  AC_DEFINE([HAVE_FINITE_IN_IEEEFP_H], [1],
+            [Set to 1 if the finite function is found in <ieeefp.h>])
 fi
 
 ])
diff --git a/autoconf/m4/huge_val.m4 b/autoconf/m4/huge_val.m4
index 6c9a22e..d224d7c 100644
--- a/autoconf/m4/huge_val.m4
+++ b/autoconf/m4/huge_val.m4
@@ -7,12 +7,10 @@ AC_DEFUN([AC_HUGE_VAL_CHECK],[
   AC_LANG_PUSH([C++])
   ac_save_CXXFLAGS=$CXXFLAGS
   CXXFLAGS="$CXXFLAGS -pedantic"
-  AC_RUN_IFELSE(
-    AC_LANG_PROGRAM(
-      [#include <math.h>],
-      [double x = HUGE_VAL; return x != x; ]),
-    [ac_cv_huge_val_sanity=yes],[ac_cv_huge_val_sanity=no],
-    [ac_cv_huge_val_sanity=yes])
+  AC_RUN_IFELSE([AC_LANG_PROGRAM([[#include <math.h>]],
+                                 [[double x = HUGE_VAL; return x != x;]])],
+                [ac_cv_huge_val_sanity=yes],[ac_cv_huge_val_sanity=no],
+                [ac_cv_huge_val_sanity=yes])
   CXXFLAGS=$ac_save_CXXFLAGS
   AC_LANG_POP([C++])
   ])
diff --git a/autoconf/m4/single_cxx_check.m4 b/autoconf/m4/single_cxx_check.m4
index 21efa4b..cb47326 100644
--- a/autoconf/m4/single_cxx_check.m4
+++ b/autoconf/m4/single_cxx_check.m4
@@ -1,10 +1,16 @@
+dnl
 dnl AC_SINGLE_CXX_CHECK(CACHEVAR, FUNCTION, HEADER, PROGRAM)
-dnl                     $1,       $2,       $3,     $4,
-dnl
+dnl                     $1,       $2,       $3,     $4,
+
 AC_DEFUN([AC_SINGLE_CXX_CHECK],
-  [AC_CACHE_CHECK([for $2 in $3], [$1],
-   [AC_LANG_PUSH([C++])
-    AC_COMPILE_IFELSE(AC_LANG_PROGRAM([#include $3],[$4]),[$1=yes],[$1=no])
-    AC_LANG_POP([C++])])
-  ])
+[
+  AC_CACHE_CHECK([for $2 in $3], [$1],
+  [
+    AC_LANG_PUSH([C++])
+    AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[#include ]][$3], [$4])],
+                      [$1][[=yes]],
+                      [$1][[=no]])
+    AC_LANG_POP([C++])
+  ])
+])
diff --git a/cmake/config-ix.cmake b/cmake/config-ix.cmake
index c625b87..7496622 100755
--- a/cmake/config-ix.cmake
+++ b/cmake/config-ix.cmake
@@ -54,6 +54,7 @@ check_include_file(ndir.h HAVE_NDIR_H)
 if( NOT PURE_WINDOWS )
   check_include_file(pthread.h HAVE_PTHREAD_H)
 endif()
+check_include_file(sanitizer/msan_interface.h HAVE_SANITIZER_MSAN_INTERFACE_H)
 check_include_file(setjmp.h HAVE_SETJMP_H)
 check_include_file(signal.h HAVE_SIGNAL_H)
 check_include_file(stdint.h HAVE_STDINT_H)
@@ -118,6 +119,12 @@ check_symbol_exists(isnan math.h HAVE_ISNAN_IN_MATH_H)
 check_symbol_exists(ceilf math.h HAVE_CEILF)
 check_symbol_exists(floorf math.h HAVE_FLOORF)
 check_symbol_exists(fmodf math.h HAVE_FMODF)
+check_symbol_exists(log math.h HAVE_LOG)
+check_symbol_exists(log2 math.h HAVE_LOG2)
+check_symbol_exists(log10 math.h HAVE_LOG10)
+check_symbol_exists(exp math.h HAVE_EXP)
+check_symbol_exists(exp2 math.h HAVE_EXP2)
+check_symbol_exists(exp10 math.h HAVE_EXP10)
 if( HAVE_SETJMP_H )
   check_symbol_exists(longjmp setjmp.h HAVE_LONGJMP)
   check_symbol_exists(setjmp setjmp.h HAVE_SETJMP)
@@ -304,6 +311,24 @@ include(CheckCXXCompilerFlag)
 check_cxx_compiler_flag("-Wno-variadic-macros" SUPPORTS_NO_VARIADIC_MACROS_FLAG)
 
+set(USE_NO_MAYBE_UNINITIALIZED 0)
+set(USE_NO_UNINITIALIZED 0)
+
+# Disable gcc's potentially uninitialized use analysis as it presents lots of
+# false positives.
+if (CMAKE_COMPILER_IS_GNUCXX)
+  check_cxx_compiler_flag("-Wmaybe-uninitialized" HAS_MAYBE_UNINITIALIZED)
+  if (HAS_MAYBE_UNINITIALIZED)
+    set(USE_NO_MAYBE_UNINITIALIZED 1)
+  else()
+    # Only recent versions of gcc make the distinction between -Wuninitialized
+    # and -Wmaybe-uninitialized. If -Wmaybe-uninitialized isn't supported, just
+    # turn off all uninitialized use warnings.
+    check_cxx_compiler_flag("-Wuninitialized" HAS_UNINITIALIZED)
+    set(USE_NO_UNINITIALIZED ${HAS_UNINITIALIZED})
+  endif()
+endif()
+
 include(GetHostTriple)
 get_host_triple(LLVM_HOST_TRIPLE)
diff --git a/cmake/modules/AddLLVM.cmake b/cmake/modules/AddLLVM.cmake
index 516e023..f0b31ce 100755
--- a/cmake/modules/AddLLVM.cmake
+++ b/cmake/modules/AddLLVM.cmake
@@ -155,20 +155,9 @@ macro(add_llvm_external_project name)
   endif()
 endmacro(add_llvm_external_project)
 
-# Returns directory where unittest should reside.
-function(get_unittest_directory dir)
-  if (CMAKE_BUILD_TYPE)
-    set(result ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE})
-  else()
-    set(result ${CMAKE_CURRENT_BINARY_DIR})
-  endif()
-  set(${dir} ${result} PARENT_SCOPE)
-endfunction()
-
 # Generic support for adding a unittest.
 function(add_unittest test_suite test_name)
-  get_unittest_directory(OUTPUT_DIR)
-  set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${OUTPUT_DIR})
+  set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
   if( NOT LLVM_BUILD_TESTS )
     set(EXCLUDE_FROM_ALL ON)
   endif()
@@ -245,8 +234,8 @@ function(configure_lit_site_cfg input output)
   set(LLVM_SOURCE_DIR ${LLVM_MAIN_SRC_DIR})
   set(LLVM_BINARY_DIR ${LLVM_BINARY_DIR})
-  set(LLVM_TOOLS_DIR "${LLVM_TOOLS_BINARY_DIR}/%(build_config)s")
-  set(LLVM_LIBS_DIR "${LLVM_BINARY_DIR}/lib/%(build_config)s")
+  set(LLVM_TOOLS_DIR "${LLVM_TOOLS_BINARY_DIR}/%(build_mode)s")
+  set(LLVM_LIBS_DIR "${LLVM_BINARY_DIR}/lib/%(build_mode)s")
   set(PYTHON_EXECUTABLE ${PYTHON_EXECUTABLE})
   set(ENABLE_SHARED ${LLVM_SHARED_LIBS_ENABLED})
   set(SHLIBPATH_VAR ${SHLIBPATH_VAR})
@@ -257,8 +246,8 @@ function(configure_lit_site_cfg input output)
     set(ENABLE_ASSERTIONS "0")
   endif()
 
-  set(HOST_OS ${CMAKE_HOST_SYSTEM_NAME})
-  set(HOST_ARCH ${CMAKE_HOST_SYSTEM_PROCESSOR})
+  set(HOST_OS ${CMAKE_SYSTEM_NAME})
+  set(HOST_ARCH ${CMAKE_SYSTEM_PROCESSOR})
 
   configure_file(${input} ${output} @ONLY)
 endfunction()
@@ -272,8 +261,7 @@ function(add_lit_target target comment)
   set(LIT_COMMAND
     ${PYTHON_EXECUTABLE}
     ${LLVM_MAIN_SRC_DIR}/utils/lit/lit.py
-    --param build_config=${CMAKE_CFG_INTDIR}
-    --param build_mode=${RUNTIME_BUILD_MODE}
+    --param build_mode=${CMAKE_CFG_INTDIR}
     ${LIT_ARGS}
     )
   foreach(param ${ARG_PARAMS})
diff --git a/cmake/modules/HandleLLVMOptions.cmake b/cmake/modules/HandleLLVMOptions.cmake
index 711fe96..9121080 100644
--- a/cmake/modules/HandleLLVMOptions.cmake
+++ b/cmake/modules/HandleLLVMOptions.cmake
@@ -11,20 +11,6 @@ elseif( "${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang" )
   set(LLVM_COMPILER_IS_GCC_COMPATIBLE ON)
 endif()
 
-# Run-time build mode; It is used for unittests.
-if(MSVC_IDE)
-  # Expect "$(Configuration)", "$(OutDir)", etc.
-  # It is expanded by msbuild or similar.
-  set(RUNTIME_BUILD_MODE "${CMAKE_CFG_INTDIR}")
-elseif(NOT CMAKE_BUILD_TYPE STREQUAL "")
-  # Expect "Release" "Debug", etc.
-  # Or unittests could not run.
-  set(RUNTIME_BUILD_MODE ${CMAKE_BUILD_TYPE})
-else()
-  # It might be "."
-  set(RUNTIME_BUILD_MODE "${CMAKE_CFG_INTDIR}")
-endif()
-
 if( LLVM_ENABLE_ASSERTIONS )
   # MSVC doesn't like _DEBUG on release builds. See PR 4379.
   if( NOT MSVC )
@@ -169,6 +155,7 @@ if( MSVC )
     -wd4551 # Suppress 'function call missing argument list'
     -wd4624 # Suppress ''derived class' : destructor could not be generated because a base class destructor is inaccessible'
     -wd4715 # Suppress ''function' : not all control paths return a value'
+    -wd4722 # Suppress ''function' : destructor never returns, potential memory leak'
    -wd4800 # Suppress ''type' : forcing value to bool 'true' or 'false' (performance warning)'
 
     # Promoted warnings.
@@ -191,8 +178,25 @@ if( MSVC )
 elseif( LLVM_COMPILER_IS_GCC_COMPATIBLE )
   if (LLVM_ENABLE_WARNINGS)
     add_llvm_definitions( -Wall -W -Wno-unused-parameter -Wwrite-strings )
+
+    # Turn off missing field initializer warnings for gcc to avoid noise from
+    # false positives with empty {}. Turn them on otherwise (they're off by
+    # default for clang).
+    check_cxx_compiler_flag("-Wmissing-field-initializers" CXX_SUPPORTS_MISSING_FIELD_INITIALIZERS_FLAG)
+    if (CXX_SUPPORTS_MISSING_FIELD_INITIALIZERS_FLAG)
+      if (CMAKE_COMPILER_IS_GNUCXX)
+        add_llvm_definitions( -Wno-missing-field-initializers )
+      else()
+        add_llvm_definitions( -Wmissing-field-initializers )
+      endif()
+    endif()
+
     if (LLVM_ENABLE_PEDANTIC)
       add_llvm_definitions( -pedantic -Wno-long-long )
+      check_cxx_compiler_flag("-Werror -Wnested-anon-types" CXX_SUPPORTS_NO_NESTED_ANON_TYPES_FLAG)
+      if( CXX_SUPPORTS_NO_NESTED_ANON_TYPES_FLAG )
+        set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-nested-anon-types" )
+      endif()
     endif (LLVM_ENABLE_PEDANTIC)
     check_cxx_compiler_flag("-Werror -Wcovered-switch-default" CXX_SUPPORTS_COVERED_SWITCH_DEFAULT_FLAG)
     if( CXX_SUPPORTS_COVERED_SWITCH_DEFAULT_FLAG )
@@ -202,6 +206,12 @@ elseif( LLVM_COMPILER_IS_GCC_COMPATIBLE )
     if( C_SUPPORTS_COVERED_SWITCH_DEFAULT_FLAG )
       set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wcovered-switch-default" )
     endif()
+    if (USE_NO_UNINITIALIZED)
+      set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-uninitialized")
+    endif()
+    if (USE_NO_MAYBE_UNINITIALIZED)
+      set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-maybe-uninitialized")
+    endif()
   endif (LLVM_ENABLE_WARNINGS)
   if (LLVM_ENABLE_WERROR)
     add_llvm_definitions( -Werror )
diff --git a/configure b/configure
--- a/configure
+++ b/configure
@@ -766,7 +766,11 @@ CONVENIENCE_LTDL_FALSE
 LIBADD_DL
 NO_VARIADIC_MACROS
 NO_MISSING_FIELD_INITIALIZERS
+NO_NESTED_ANON_TYPES
 COVERED_SWITCH_DEFAULT
+NO_MAYBE_UNINITIALIZED
+NO_UNINITIALIZED
+PYTHON
 USE_UDIS86
 USE_OPROFILE
 USE_INTEL_JITEVENTS
@@ -1435,8 +1439,8 @@ Optional Features:
                           YES)
   --enable-targets        Build specific host targets: all or
                           target1,target2,... Valid targets are: host, x86,
-                          x86_64, sparc, powerpc, arm, mips, hexagon, xcore,
-                          msp430, nvptx, and cpp (default=all)
+                          x86_64, sparc, powerpc, arm, aarch64, mips, hexagon,
+                          xcore, msp430, nvptx, and cpp (default=all)
   --enable-experimental-targets
                           Build experimental host targets: disable or
                           target1,target2,... (default=disable)
@@ -1467,6 +1471,7 @@ Optional Packages:
   --with-bug-report-url   Specify the URL where bug reports should be
                           submitted (default=http://llvm.org/bugs/)
   --with-internal-prefix  Installation directory for internal files
+  --with-python           path to python
   --with-udis86=<path>    Use udis86 external x86 disassembler library
   --with-oprofile=<prefix>
                           Tell OProfile >= 0.9.4 how to symbolize JIT output
@@ -1976,6 +1981,9 @@ echo "$as_me: error: Already configured in ${srcdir}" >&2;}
   fi
 fi
 
+${CFLAGS=}
+${CXXFLAGS=}
+
 ac_ext=c
 ac_cpp='$CPP $CPPFLAGS'
 ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
@@ -3471,6 +3479,98 @@ ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
 ac_compiler_gnu=$ac_cv_c_compiler_gnu
 
 
+if test "$CXX" = "clang++" ; then
+  { echo "$as_me:$LINENO: checking whether clang works" >&5
+echo $ECHO_N "checking whether clang works... $ECHO_C" >&6; }
+  ac_ext=cpp
+ac_cpp='$CXXCPP $CPPFLAGS'
+ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_cxx_compiler_gnu
+
+
+cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h.  */
+#include <limits>
+#if __has_include (<cxxabi.h>)
+#include <cxxabi.h>
+#endif
+#if __has_include (<unwind.h>)
+#include <unwind.h>
+#endif
+
+int
+main ()
+{
+
+  ;
+  return 0;
+}
+_ACEOF
+rm -f conftest.$ac_objext
+if { (ac_try="$ac_compile"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_compile") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } &&
+	 { ac_try='test -z "$ac_cxx_werror_flag" || test ! -s conftest.err'
+  { (case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_try") 2>&5
+  ac_status=$?
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); }; } &&
+	 { ac_try='test -s conftest.$ac_objext'
+  { (case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_try") 2>&5
+  ac_status=$?
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); }; }; then
+
+  { echo "$as_me:$LINENO: result: yes" >&5
+echo "${ECHO_T}yes" >&6; }
+
+else
+  echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+
+  { echo "$as_me:$LINENO: result: no" >&5
+echo "${ECHO_T}no" >&6; }
+  { { echo "$as_me:$LINENO: error: Selected compiler could not find or parse C++ standard library headers. Rerun with CC=c-compiler CXX=c++-compiler ./configure ..." >&5
+echo "$as_me: error: Selected compiler could not find or parse C++ standard library headers. Rerun with CC=c-compiler CXX=c++-compiler ./configure ..." >&2;}
+   { (exit 1); exit 1; }; }
+
+fi
+
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+  ac_ext=c
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_c_compiler_gnu
+
+fi
+
 
 if test -d ${srcdir}/projects/llvm-gcc ; then
@@ -3912,6 +4012,7 @@ else
   sparc*-*)               llvm_cv_target_arch="Sparc" ;;
   powerpc*-*)             llvm_cv_target_arch="PowerPC" ;;
   arm*-*)                 llvm_cv_target_arch="ARM" ;;
+  aarch64*-*)             llvm_cv_target_arch="AArch64" ;;
   mips-* | mips64-*)      llvm_cv_target_arch="Mips" ;;
   mipsel-* | mips64el-*)  llvm_cv_target_arch="Mips" ;;
   xcore-*)                llvm_cv_target_arch="XCore" ;;
@@ -3945,6 +4046,7 @@ case $host in
   sparc*-*)               host_arch="Sparc" ;;
   powerpc*-*)             host_arch="PowerPC" ;;
   arm*-*)                 host_arch="ARM" ;;
+  aarch64*-*)             host_arch="AArch64" ;;
   mips-* | mips64-*)      host_arch="Mips" ;;
   mipsel-* | mips64el-*)  host_arch="Mips" ;;
   xcore-*)                host_arch="XCore" ;;
@@ -5277,6 +5379,8 @@ else
     ;;
   ARM)         TARGET_HAS_JIT=1
     ;;
+  AArch64)     TARGET_HAS_JIT=0
+    ;;
   Mips)        TARGET_HAS_JIT=1
     ;;
   XCore)       TARGET_HAS_JIT=0
@@ -5498,13 +5602,14 @@ if test "$enableval" = host-only ; then
   enableval=host
 fi
 case "$enableval" in
-  all) TARGETS_TO_BUILD="X86 Sparc PowerPC ARM Mips XCore MSP430 CppBackend MBlaze NVPTX Hexagon" ;;
+  all) TARGETS_TO_BUILD="X86 Sparc PowerPC AArch64 ARM Mips XCore MSP430 CppBackend MBlaze NVPTX Hexagon" ;;
   *)for a_target in `echo $enableval|sed -e 's/,/ /g' ` ; do
       case "$a_target" in
         x86)      TARGETS_TO_BUILD="X86 $TARGETS_TO_BUILD" ;;
         x86_64)   TARGETS_TO_BUILD="X86 $TARGETS_TO_BUILD" ;;
         sparc)    TARGETS_TO_BUILD="Sparc $TARGETS_TO_BUILD" ;;
         powerpc)  TARGETS_TO_BUILD="PowerPC $TARGETS_TO_BUILD" ;;
+        aarch64)  TARGETS_TO_BUILD="AArch64 $TARGETS_TO_BUILD" ;;
         arm)      TARGETS_TO_BUILD="ARM $TARGETS_TO_BUILD" ;;
         mips)     TARGETS_TO_BUILD="Mips $TARGETS_TO_BUILD" ;;
         mipsel)   TARGETS_TO_BUILD="Mips $TARGETS_TO_BUILD" ;;
@@ -10393,7 +10498,7 @@ else
   lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
   lt_status=$lt_dlunknown
   cat > conftest.$ac_ext <<EOF
-#line 10396 "configure"
+#line 10501 "configure"
 #include "confdefs.h"
 
 #if HAVE_DLFCN_H
@@ -12084,9 +12189,17 @@ fi
 if test "$GCC" = "yes"
 then
   cat >conftest.$ac_ext <<_ACEOF
-#if !defined(__GNUC__) || __GNUC__ < 3
-#error Unsupported GCC version
-#endif
+
+  /* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h.  */
+
+    #if !defined(__GNUC__) || __GNUC__ < 3
+    #error Unsupported GCC version
+    #endif
+
 _ACEOF
 rm -f conftest.$ac_objext
@@ -12148,14 +12261,114 @@ echo "${ECHO_T}ok" >&6; }
 { echo "$as_me:$LINENO: checking optional compiler flags" >&5
 echo $ECHO_N "checking optional compiler flags... $ECHO_C" >&6; }
-NO_VARIADIC_MACROS=`$CXX -Werror -Wno-variadic-macros -fsyntax-only -xc /dev/null 2>/dev/null && echo -Wno-variadic-macros`
+NO_VARIADIC_MACROS=`$CXX -Werror -Wvariadic-macros -fsyntax-only -xc /dev/null 2>/dev/null && echo -Wno-variadic-macros`
+
+NO_MISSING_FIELD_INITIALIZERS=`$CXX -Werror -Wmissing-field-initializers -fsyntax-only -xc /dev/null 2>/dev/null && echo -Wno-missing-field-initializers`
 
-NO_MISSING_FIELD_INITIALIZERS=`$CXX -Werror -Wno-missing-field-initializers -fsyntax-only -xc /dev/null 2>/dev/null && echo -Wno-missing-field-initializers`
+NO_NESTED_ANON_TYPES=`$CXX -Werror -Wnested-anon-types -fsyntax-only -xc /dev/null 2>/dev/null && echo -Wno-nested-anon-types`
 
 COVERED_SWITCH_DEFAULT=`$CXX -Werror -Wcovered-switch-default -fsyntax-only -xc /dev/null 2>/dev/null && echo -Wcovered-switch-default`
-{ echo "$as_me:$LINENO: result: $NO_VARIADIC_MACROS $NO_MISSING_FIELD_INITIALIZERS $COVERED_SWITCH_DEFAULT" >&5
-echo "${ECHO_T}$NO_VARIADIC_MACROS $NO_MISSING_FIELD_INITIALIZERS $COVERED_SWITCH_DEFAULT" >&6; }
+
+NO_UNINITIALIZED=
+NO_MAYBE_UNINITIALIZED=
+if test "$GXX" = "yes"
+then
+  NO_MAYBE_UNINITIALIZED=`$CXX -Werror -Wmaybe-uninitialized -fsyntax-only -xc /dev/null 2>/dev/null && echo -Wno-maybe-uninitialized`
+
+  if test -z "$NO_MAYBE_UNINITIALIZED"
+  then
+    NO_UNINITIALIZED=`$CXX -Werror -Wuninitialized -fsyntax-only -xc /dev/null 2>/dev/null && echo -Wno-uninitialized`
+
+  fi
+fi
+{ echo "$as_me:$LINENO: result: $NO_VARIADIC_MACROS $NO_MISSING_FIELD_INITIALIZERS $NO_NESTED_ANON_TYPES $COVERED_SWITCH_DEFAULT $NO_UNINITIALIZED $NO_MAYBE_UNINITIALIZED" >&5
+echo "${ECHO_T}$NO_VARIADIC_MACROS $NO_MISSING_FIELD_INITIALIZERS $NO_NESTED_ANON_TYPES $COVERED_SWITCH_DEFAULT $NO_UNINITIALIZED $NO_MAYBE_UNINITIALIZED" >&6; }
+
+
+# Check whether --with-python was given.
+if test "${with_python+set}" = set; then
+  withval=$with_python; PYTHON="$withval"
+fi
+
+
+if test -n "$PYTHON" && test -x "$PYTHON" ; then
+  { echo "$as_me:$LINENO: checking for python" >&5
+echo $ECHO_N "checking for python... $ECHO_C" >&6; }
+  { echo "$as_me:$LINENO: result: user defined: $with_python" >&5
+echo "${ECHO_T}user defined: $with_python" >&6; }
+else
+  if test -n "$PYTHON" ; then
+    { echo "$as_me:$LINENO: WARNING: specified python ($PYTHON) is not usable, searching path" >&5
+echo "$as_me: WARNING: specified python ($PYTHON) is not usable, searching path" >&2;}
+  fi
+
+  # Extract the first word of "python python2 python26", so it can be a program name with args.
+set dummy python python2 python26; ac_word=$2
+{ echo "$as_me:$LINENO: checking for $ac_word" >&5
+echo $ECHO_N "checking for $ac_word... $ECHO_C" >&6; }
+if test "${ac_cv_path_PYTHON+set}" = set; then
+  echo $ECHO_N "(cached) $ECHO_C" >&6
+else
+  case $PYTHON in
+  [\\/]* | ?:[\\/]*)
+  ac_cv_path_PYTHON="$PYTHON" # Let the user override the test with a path.
+  ;;
+  *)
+  as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+ for ac_exec_ext in '' $ac_executable_extensions; do + if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_executable_p "$as_dir/$ac_word$ac_exec_ext"; }; then + ac_cv_path_PYTHON="$as_dir/$ac_word$ac_exec_ext" + echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done +done +IFS=$as_save_IFS + + test -z "$ac_cv_path_PYTHON" && ac_cv_path_PYTHON="{ echo "$as_me:$LINENO: result: not found" >&5 +echo "${ECHO_T}not found" >&6; } + { { echo "$as_me:$LINENO: error: could not find python 2.5 or higher" >&5 +echo "$as_me: error: could not find python 2.5 or higher" >&2;} + { (exit 1); exit 1; }; }" + ;; +esac +fi +PYTHON=$ac_cv_path_PYTHON +if test -n "$PYTHON"; then + { echo "$as_me:$LINENO: result: $PYTHON" >&5 +echo "${ECHO_T}$PYTHON" >&6; } +else + { echo "$as_me:$LINENO: result: no" >&5 +echo "${ECHO_T}no" >&6; } +fi + + +fi + +{ echo "$as_me:$LINENO: checking for python >= 2.5" >&5 +echo $ECHO_N "checking for python >= 2.5... $ECHO_C" >&6; } +ac_python_version=`$PYTHON -c 'import sys; print sys.version.split()[0]'` +ac_python_version_major=`echo $ac_python_version | cut -d'.' -f1` +ac_python_version_minor=`echo $ac_python_version | cut -d'.' -f2` +ac_python_version_patch=`echo $ac_python_version | cut -d'.' -f3` +if test "$ac_python_version_major" -eq "2" \ + && test "$ac_python_version_minor" -ge "5" ; then + { echo "$as_me:$LINENO: result: $PYTHON ($ac_python_version)" >&5 +echo "${ECHO_T}$PYTHON ($ac_python_version)" >&6; } +else + { echo "$as_me:$LINENO: result: not found" >&5 +echo "${ECHO_T}not found" >&6; } + { { echo "$as_me:$LINENO: error: found python $ac_python_version ($PYTHON); required >= 2.5 +See \`config.log' for more details." >&5 +echo "$as_me: error: found python $ac_python_version ($PYTHON); required >= 2.5 +See \`config.log' for more details." >&2;} + { (exit 1); exit 1; }; } +fi @@ -16259,16 +16472,19 @@ done { echo "$as_me:$LINENO: checking __crashreporter_info__" >&5 echo $ECHO_N "checking __crashreporter_info__... $ECHO_C" >&6; } cat >conftest.$ac_ext <<_ACEOF -/* confdefs.h. */ + + /* confdefs.h. */ _ACEOF cat confdefs.h >>conftest.$ac_ext cat >>conftest.$ac_ext <<_ACEOF /* end confdefs.h. */ -extern const char *__crashreporter_info__; - int main() { - __crashreporter_info__ = "test"; - return 0; - } + + extern const char *__crashreporter_info__; + int main() { + __crashreporter_info__ = "test"; + return 0; + } + _ACEOF rm -f conftest.$ac_objext conftest$ac_exeext @@ -16305,6 +16521,7 @@ eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5 ac_status=$? echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); }; }; then + { echo "$as_me:$LINENO: result: yes" >&5 echo "${ECHO_T}yes" >&6; } @@ -16312,17 +16529,20 @@ cat >>confdefs.h <<\_ACEOF #define HAVE_CRASHREPORTER_INFO 1 _ACEOF + else echo "$as_me: failed program was:" >&5 sed 's/^/| /' conftest.$ac_ext >&5 - { echo "$as_me:$LINENO: result: no" >&5 + + { echo "$as_me:$LINENO: result: no" >&5 echo "${ECHO_T}no" >&6; } cat >>confdefs.h <<\_ACEOF #define HAVE_CRASHREPORTER_INFO 0 _ACEOF + fi rm -f core conftest.err conftest.$ac_objext \ @@ -16330,7 +16550,6 @@ rm -f core conftest.err conftest.$ac_objext \ - { echo "$as_me:$LINENO: checking for HUGE_VAL sanity" >&5 echo $ECHO_N "checking for HUGE_VAL sanity... 
$ECHO_C" >&6; } if test "${ac_cv_huge_val_sanity+set}" = set; then @@ -17133,6 +17352,120 @@ done +for ac_func in log log2 log10 exp exp2 +do +as_ac_var=`echo "ac_cv_func_$ac_func" | $as_tr_sh` +{ echo "$as_me:$LINENO: checking for $ac_func" >&5 +echo $ECHO_N "checking for $ac_func... $ECHO_C" >&6; } +if { as_var=$as_ac_var; eval "test \"\${$as_var+set}\" = set"; }; then + echo $ECHO_N "(cached) $ECHO_C" >&6 +else + cat >conftest.$ac_ext <<_ACEOF +/* confdefs.h. */ +_ACEOF +cat confdefs.h >>conftest.$ac_ext +cat >>conftest.$ac_ext <<_ACEOF +/* end confdefs.h. */ +/* Define $ac_func to an innocuous variant, in case <limits.h> declares $ac_func. + For example, HP-UX 11i <limits.h> declares gettimeofday. */ +#define $ac_func innocuous_$ac_func + +/* System header to define __stub macros and hopefully few prototypes, + which can conflict with char $ac_func (); below. + Prefer <limits.h> to <assert.h> if __STDC__ is defined, since + <limits.h> exists even on freestanding compilers. */ + +#ifdef __STDC__ +# include <limits.h> +#else +# include <assert.h> +#endif + +#undef $ac_func + +/* Override any GCC internal prototype to avoid an error. + Use char because int might match the return type of a GCC + builtin and then its argument prototype would still apply. */ +#ifdef __cplusplus +extern "C" +#endif +char $ac_func (); +/* The GNU C library defines this for functions which it implements + to always fail with ENOSYS. Some functions are actually named + something starting with __ and the normal name is an alias. */ +#if defined __stub_$ac_func || defined __stub___$ac_func +choke me +#endif + +int +main () +{ +return $ac_func (); + ; + return 0; +} +_ACEOF +rm -f conftest.$ac_objext conftest$ac_exeext +if { (ac_try="$ac_link" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5 + (eval "$ac_link") 2>conftest.er1 + ac_status=$? + grep -v '^ *+' conftest.er1 >conftest.err + rm -f conftest.er1 + cat conftest.err >&5 + echo "$as_me:$LINENO: \$? = $ac_status" >&5 + (exit $ac_status); } && + { ac_try='test -z "$ac_c_werror_flag" || test ! -s conftest.err' + { (case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5 + (eval "$ac_try") 2>&5 + ac_status=$? + echo "$as_me:$LINENO: \$? = $ac_status" >&5 + (exit $ac_status); }; } && + { ac_try='test -s conftest$ac_exeext' + { (case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5 + (eval "$ac_try") 2>&5 + ac_status=$? + echo "$as_me:$LINENO: \$? 
= $ac_status" >&5 + (exit $ac_status); }; }; then + eval "$as_ac_var=yes" +else + echo "$as_me: failed program was:" >&5 +sed 's/^/| /' conftest.$ac_ext >&5 + + eval "$as_ac_var=no" +fi + +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +fi +ac_res=`eval echo '${'$as_ac_var'}'` + { echo "$as_me:$LINENO: result: $ac_res" >&5 +echo "${ECHO_T}$ac_res" >&6; } +if test `eval echo '${'$as_ac_var'}'` = yes; then + cat >>confdefs.h <<_ACEOF +#define `echo "HAVE_$ac_func" | $as_tr_cpp` 1 +_ACEOF + +fi +done + + + + + + for ac_func in getpagesize getrusage getrlimit setrlimit gettimeofday do as_ac_var=`echo "ac_cv_func_$ac_func" | $as_tr_sh` @@ -17905,18 +18238,20 @@ _ACEOF fi -{ echo "$as_me:$LINENO: checking for srand48/lrand48/drand48 in <stdlib.h>" >&5 + + { echo "$as_me:$LINENO: checking for srand48/lrand48/drand48 in <stdlib.h>" >&5 echo $ECHO_N "checking for srand48/lrand48/drand48 in <stdlib.h>... $ECHO_C" >&6; } if test "${ac_cv_func_rand48+set}" = set; then echo $ECHO_N "(cached) $ECHO_C" >&6 else - ac_ext=cpp + + ac_ext=cpp ac_cpp='$CXXCPP $CPPFLAGS' ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5' ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' ac_compiler_gnu=$ac_cv_cxx_compiler_gnu - cat >conftest.$ac_ext <<_ACEOF + cat >conftest.$ac_ext <<_ACEOF /* confdefs.h. */ _ACEOF cat confdefs.h >>conftest.$ac_ext @@ -17974,12 +18309,13 @@ sed 's/^/| /' conftest.$ac_ext >&5 fi rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext - ac_ext=c + ac_ext=c ac_cpp='$CPP $CPPFLAGS' ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' ac_compiler_gnu=$ac_cv_c_compiler_gnu + fi { echo "$as_me:$LINENO: result: $ac_cv_func_rand48" >&5 echo "${ECHO_T}$ac_cv_func_rand48" >&6; } @@ -19449,10 +19785,19 @@ if test "$llvm_cv_os_type" = "MingW" ; then { echo "$as_me:$LINENO: checking whether EnumerateLoadedModules() accepts new decl" >&5 echo $ECHO_N "checking whether EnumerateLoadedModules() accepts new decl... $ECHO_C" >&6; } cat >conftest.$ac_ext <<_ACEOF -#include <windows.h> -#include <imagehlp.h> -extern void foo(PENUMLOADED_MODULES_CALLBACK); -extern void foo(BOOL(CALLBACK*)(PCSTR,ULONG_PTR,ULONG,PVOID)); + + /* confdefs.h. */ +_ACEOF +cat confdefs.h >>conftest.$ac_ext +cat >>conftest.$ac_ext <<_ACEOF +/* end confdefs.h. */ + + #include <windows.h> + #include <imagehlp.h> + extern void foo(PENUMLOADED_MODULES_CALLBACK); + extern void foo(BOOL(CALLBACK*)(PCSTR,ULONG_PTR,ULONG,PVOID)); + + _ACEOF rm -f conftest.$ac_objext if { (ac_try="$ac_compile" @@ -19513,18 +19858,20 @@ _ACEOF fi -{ echo "$as_me:$LINENO: checking for isnan in <math.h>" >&5 + + { echo "$as_me:$LINENO: checking for isnan in <math.h>" >&5 echo $ECHO_N "checking for isnan in <math.h>... $ECHO_C" >&6; } if test "${ac_cv_func_isnan_in_math_h+set}" = set; then echo $ECHO_N "(cached) $ECHO_C" >&6 else - ac_ext=cpp + + ac_ext=cpp ac_cpp='$CXXCPP $CPPFLAGS' ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5' ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' ac_compiler_gnu=$ac_cv_cxx_compiler_gnu - cat >conftest.$ac_ext <<_ACEOF + cat >conftest.$ac_ext <<_ACEOF /* confdefs.h. 
*/ _ACEOF cat confdefs.h >>conftest.$ac_ext @@ -19582,12 +19929,13 @@ sed 's/^/| /' conftest.$ac_ext >&5 fi rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext - ac_ext=c + ac_ext=c ac_cpp='$CPP $CPPFLAGS' ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' ac_compiler_gnu=$ac_cv_c_compiler_gnu + fi { echo "$as_me:$LINENO: result: $ac_cv_func_isnan_in_math_h" >&5 echo "${ECHO_T}$ac_cv_func_isnan_in_math_h" >&6; } @@ -19601,18 +19949,20 @@ _ACEOF fi -{ echo "$as_me:$LINENO: checking for isnan in <cmath>" >&5 + + { echo "$as_me:$LINENO: checking for isnan in <cmath>" >&5 echo $ECHO_N "checking for isnan in <cmath>... $ECHO_C" >&6; } if test "${ac_cv_func_isnan_in_cmath+set}" = set; then echo $ECHO_N "(cached) $ECHO_C" >&6 else - ac_ext=cpp + + ac_ext=cpp ac_cpp='$CXXCPP $CPPFLAGS' ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5' ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' ac_compiler_gnu=$ac_cv_cxx_compiler_gnu - cat >conftest.$ac_ext <<_ACEOF + cat >conftest.$ac_ext <<_ACEOF /* confdefs.h. */ _ACEOF cat confdefs.h >>conftest.$ac_ext @@ -19670,12 +20020,13 @@ sed 's/^/| /' conftest.$ac_ext >&5 fi rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext - ac_ext=c + ac_ext=c ac_cpp='$CPP $CPPFLAGS' ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' ac_compiler_gnu=$ac_cv_c_compiler_gnu + fi { echo "$as_me:$LINENO: result: $ac_cv_func_isnan_in_cmath" >&5 echo "${ECHO_T}$ac_cv_func_isnan_in_cmath" >&6; } @@ -19688,18 +20039,20 @@ _ACEOF fi -{ echo "$as_me:$LINENO: checking for std::isnan in <cmath>" >&5 + + { echo "$as_me:$LINENO: checking for std::isnan in <cmath>" >&5 echo $ECHO_N "checking for std::isnan in <cmath>... $ECHO_C" >&6; } if test "${ac_cv_func_std_isnan_in_cmath+set}" = set; then echo $ECHO_N "(cached) $ECHO_C" >&6 else - ac_ext=cpp + + ac_ext=cpp ac_cpp='$CXXCPP $CPPFLAGS' ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5' ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' ac_compiler_gnu=$ac_cv_cxx_compiler_gnu - cat >conftest.$ac_ext <<_ACEOF + cat >conftest.$ac_ext <<_ACEOF /* confdefs.h. */ _ACEOF cat confdefs.h >>conftest.$ac_ext @@ -19757,12 +20110,13 @@ sed 's/^/| /' conftest.$ac_ext >&5 fi rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext - ac_ext=c + ac_ext=c ac_cpp='$CPP $CPPFLAGS' ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' ac_compiler_gnu=$ac_cv_c_compiler_gnu + fi { echo "$as_me:$LINENO: result: $ac_cv_func_std_isnan_in_cmath" >&5 echo "${ECHO_T}$ac_cv_func_std_isnan_in_cmath" >&6; } @@ -19776,18 +20130,21 @@ _ACEOF fi -{ echo "$as_me:$LINENO: checking for isinf in <math.h>" >&5 + + + { echo "$as_me:$LINENO: checking for isinf in <math.h>" >&5 echo $ECHO_N "checking for isinf in <math.h>... $ECHO_C" >&6; } if test "${ac_cv_func_isinf_in_math_h+set}" = set; then echo $ECHO_N "(cached) $ECHO_C" >&6 else - ac_ext=cpp + + ac_ext=cpp ac_cpp='$CXXCPP $CPPFLAGS' ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5' ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' ac_compiler_gnu=$ac_cv_cxx_compiler_gnu - cat >conftest.$ac_ext <<_ACEOF + cat >conftest.$ac_ext <<_ACEOF /* confdefs.h. 
*/ _ACEOF cat confdefs.h >>conftest.$ac_ext @@ -19845,12 +20202,13 @@ sed 's/^/| /' conftest.$ac_ext >&5 fi rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext - ac_ext=c + ac_ext=c ac_cpp='$CPP $CPPFLAGS' ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' ac_compiler_gnu=$ac_cv_c_compiler_gnu + fi { echo "$as_me:$LINENO: result: $ac_cv_func_isinf_in_math_h" >&5 echo "${ECHO_T}$ac_cv_func_isinf_in_math_h" >&6; } @@ -19863,18 +20221,20 @@ _ACEOF fi -{ echo "$as_me:$LINENO: checking for isinf in <cmath>" >&5 + + { echo "$as_me:$LINENO: checking for isinf in <cmath>" >&5 echo $ECHO_N "checking for isinf in <cmath>... $ECHO_C" >&6; } if test "${ac_cv_func_isinf_in_cmath+set}" = set; then echo $ECHO_N "(cached) $ECHO_C" >&6 else - ac_ext=cpp + + ac_ext=cpp ac_cpp='$CXXCPP $CPPFLAGS' ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5' ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' ac_compiler_gnu=$ac_cv_cxx_compiler_gnu - cat >conftest.$ac_ext <<_ACEOF + cat >conftest.$ac_ext <<_ACEOF /* confdefs.h. */ _ACEOF cat confdefs.h >>conftest.$ac_ext @@ -19932,12 +20292,13 @@ sed 's/^/| /' conftest.$ac_ext >&5 fi rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext - ac_ext=c + ac_ext=c ac_cpp='$CPP $CPPFLAGS' ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' ac_compiler_gnu=$ac_cv_c_compiler_gnu + fi { echo "$as_me:$LINENO: result: $ac_cv_func_isinf_in_cmath" >&5 echo "${ECHO_T}$ac_cv_func_isinf_in_cmath" >&6; } @@ -19950,18 +20311,20 @@ _ACEOF fi -{ echo "$as_me:$LINENO: checking for std::isinf in <cmath>" >&5 + + { echo "$as_me:$LINENO: checking for std::isinf in <cmath>" >&5 echo $ECHO_N "checking for std::isinf in <cmath>... $ECHO_C" >&6; } if test "${ac_cv_func_std_isinf_in_cmath+set}" = set; then echo $ECHO_N "(cached) $ECHO_C" >&6 else - ac_ext=cpp + + ac_ext=cpp ac_cpp='$CXXCPP $CPPFLAGS' ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5' ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' ac_compiler_gnu=$ac_cv_cxx_compiler_gnu - cat >conftest.$ac_ext <<_ACEOF + cat >conftest.$ac_ext <<_ACEOF /* confdefs.h. */ _ACEOF cat confdefs.h >>conftest.$ac_ext @@ -20019,12 +20382,13 @@ sed 's/^/| /' conftest.$ac_ext >&5 fi rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext - ac_ext=c + ac_ext=c ac_cpp='$CPP $CPPFLAGS' ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' ac_compiler_gnu=$ac_cv_c_compiler_gnu + fi { echo "$as_me:$LINENO: result: $ac_cv_func_std_isinf_in_cmath" >&5 echo "${ECHO_T}$ac_cv_func_std_isinf_in_cmath" >&6; } @@ -20037,18 +20401,20 @@ _ACEOF fi -{ echo "$as_me:$LINENO: checking for finite in <ieeefp.h>" >&5 + + { echo "$as_me:$LINENO: checking for finite in <ieeefp.h>" >&5 echo $ECHO_N "checking for finite in <ieeefp.h>... $ECHO_C" >&6; } if test "${ac_cv_func_finite_in_ieeefp_h+set}" = set; then echo $ECHO_N "(cached) $ECHO_C" >&6 else - ac_ext=cpp + + ac_ext=cpp ac_cpp='$CXXCPP $CPPFLAGS' ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5' ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' ac_compiler_gnu=$ac_cv_cxx_compiler_gnu - cat >conftest.$ac_ext <<_ACEOF + cat >conftest.$ac_ext <<_ACEOF /* confdefs.h. 
*/ _ACEOF cat confdefs.h >>conftest.$ac_ext @@ -20106,12 +20472,13 @@ sed 's/^/| /' conftest.$ac_ext >&5 fi rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext - ac_ext=c + ac_ext=c ac_cpp='$CPP $CPPFLAGS' ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' ac_compiler_gnu=$ac_cv_c_compiler_gnu + fi { echo "$as_me:$LINENO: result: $ac_cv_func_finite_in_ieeefp_h" >&5 echo "${ECHO_T}$ac_cv_func_finite_in_ieeefp_h" >&6; } @@ -20728,19 +21095,22 @@ ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ex ac_compiler_gnu=$ac_cv_cxx_compiler_gnu cat >conftest.$ac_ext <<_ACEOF -/* confdefs.h. */ + + /* confdefs.h. */ _ACEOF cat confdefs.h >>conftest.$ac_ext cat >>conftest.$ac_ext <<_ACEOF /* end confdefs.h. */ -int main() { - volatile unsigned long val = 1; - __sync_synchronize(); - __sync_val_compare_and_swap(&val, 1, 0); - __sync_add_and_fetch(&val, 1); - __sync_sub_and_fetch(&val, 1); - return 0; - } + + int main() { + volatile unsigned long val = 1; + __sync_synchronize(); + __sync_val_compare_and_swap(&val, 1, 0); + __sync_add_and_fetch(&val, 1); + __sync_sub_and_fetch(&val, 1); + return 0; + } + _ACEOF rm -f conftest.$ac_objext conftest$ac_exeext @@ -20757,7 +21127,7 @@ eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5 cat conftest.err >&5 echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); } && - { ac_try='test -z "$ac_c_werror_flag" || test ! -s conftest.err' + { ac_try='test -z "$ac_cxx_werror_flag" || test ! -s conftest.err' { (case "(($ac_try" in *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; *) ac_try_echo=$ac_try;; @@ -20777,11 +21147,6 @@ eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5 ac_status=$? echo "$as_me:$LINENO: \$? 
= $ac_status" >&5 (exit $ac_status); }; }; then - ac_ext=c -ac_cpp='$CPP $CPPFLAGS' -ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' -ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' -ac_compiler_gnu=$ac_cv_c_compiler_gnu { echo "$as_me:$LINENO: result: yes" >&5 echo "${ECHO_T}yes" >&6; } @@ -20790,11 +21155,13 @@ cat >>confdefs.h <<\_ACEOF #define LLVM_HAS_ATOMICS 1 _ACEOF + else echo "$as_me: failed program was:" >&5 sed 's/^/| /' conftest.$ac_ext >&5 - { echo "$as_me:$LINENO: result: no" >&5 + + { echo "$as_me:$LINENO: result: no" >&5 echo "${ECHO_T}no" >&6; } cat >>confdefs.h <<\_ACEOF @@ -20803,10 +21170,17 @@ _ACEOF { echo "$as_me:$LINENO: WARNING: LLVM will be built thread-unsafe because atomic builtins are missing" >&5 echo "$as_me: WARNING: LLVM will be built thread-unsafe because atomic builtins are missing" >&2;} + fi rm -f core conftest.err conftest.$ac_objext \ conftest$ac_exeext conftest.$ac_ext +ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu + if test "$llvm_cv_os_type" = "Linux" -a "$llvm_cv_target_arch" = "x86_64" ; then @@ -22211,7 +22585,11 @@ CONVENIENCE_LTDL_FALSE!$CONVENIENCE_LTDL_FALSE$ac_delim LIBADD_DL!$LIBADD_DL$ac_delim NO_VARIADIC_MACROS!$NO_VARIADIC_MACROS$ac_delim NO_MISSING_FIELD_INITIALIZERS!$NO_MISSING_FIELD_INITIALIZERS$ac_delim +NO_NESTED_ANON_TYPES!$NO_NESTED_ANON_TYPES$ac_delim COVERED_SWITCH_DEFAULT!$COVERED_SWITCH_DEFAULT$ac_delim +NO_MAYBE_UNINITIALIZED!$NO_MAYBE_UNINITIALIZED$ac_delim +NO_UNINITIALIZED!$NO_UNINITIALIZED$ac_delim +PYTHON!$PYTHON$ac_delim USE_UDIS86!$USE_UDIS86$ac_delim USE_OPROFILE!$USE_OPROFILE$ac_delim USE_INTEL_JITEVENTS!$USE_INTEL_JITEVENTS$ac_delim @@ -22237,10 +22615,6 @@ BINDINGS_TO_BUILD!$BINDINGS_TO_BUILD$ac_delim ALL_BINDINGS!$ALL_BINDINGS$ac_delim OCAML_LIBDIR!$OCAML_LIBDIR$ac_delim ENABLE_VISIBILITY_INLINES_HIDDEN!$ENABLE_VISIBILITY_INLINES_HIDDEN$ac_delim -RPATH!$RPATH$ac_delim -RDYNAMIC!$RDYNAMIC$ac_delim -program_prefix!$program_prefix$ac_delim -LIBOBJS!$LIBOBJS$ac_delim _ACEOF if test `sed -n "s/.*$ac_delim\$/X/p" conf$$subs.sed | grep -c X` = 97; then @@ -22282,10 +22656,14 @@ _ACEOF ac_delim='%!_!# ' for ac_last_try in false false false false false :; do cat >conf$$subs.sed <<_ACEOF +RPATH!$RPATH$ac_delim +RDYNAMIC!$RDYNAMIC$ac_delim +program_prefix!$program_prefix$ac_delim +LIBOBJS!$LIBOBJS$ac_delim LTLIBOBJS!$LTLIBOBJS$ac_delim _ACEOF - if test `sed -n "s/.*$ac_delim\$/X/p" conf$$subs.sed | grep -c X` = 1; then + if test `sed -n "s/.*$ac_delim\$/X/p" conf$$subs.sed | grep -c X` = 5; then break elif $ac_last_try; then { { echo "$as_me:$LINENO: error: could not make $CONFIG_STATUS" >&5 diff --git a/docs/AliasAnalysis.rst b/docs/AliasAnalysis.rst index 54b4a4a..712d57d 100644 --- a/docs/AliasAnalysis.rst +++ b/docs/AliasAnalysis.rst @@ -1,5 +1,3 @@ -.. _alias_analysis: - ================================== LLVM Alias Analysis Infrastructure ================================== diff --git a/docs/Atomics.rst b/docs/Atomics.rst index 1bca53e..705d73f 100644 --- a/docs/Atomics.rst +++ b/docs/Atomics.rst @@ -1,5 +1,3 @@ -.. 
_atomics: - ============================================== LLVM Atomic Instructions and Concurrency Guide ============================================== diff --git a/docs/BitCodeFormat.rst b/docs/BitCodeFormat.rst index 333e79b..c83b6c1 100644 --- a/docs/BitCodeFormat.rst +++ b/docs/BitCodeFormat.rst @@ -1,5 +1,3 @@ -.. _bitcode_format: - .. role:: raw-html(raw) :format: html diff --git a/docs/BranchWeightMetadata.rst b/docs/BranchWeightMetadata.rst index 2667ce3..71ecd34 100644 --- a/docs/BranchWeightMetadata.rst +++ b/docs/BranchWeightMetadata.rst @@ -1,5 +1,3 @@ -.. _branch_weight: - =========================== LLVM Branch Weight Metadata =========================== diff --git a/docs/Bugpoint.rst b/docs/Bugpoint.rst index 047129f..1a5fc8c 100644 --- a/docs/Bugpoint.rst +++ b/docs/Bugpoint.rst @@ -1,5 +1,3 @@ -.. _bugpoint: - ==================================== LLVM bugpoint tool: design and usage ==================================== diff --git a/docs/CMake.rst b/docs/CMake.rst index f895788..6eab04b 100644 --- a/docs/CMake.rst +++ b/docs/CMake.rst @@ -1,5 +1,3 @@ -.. _building-with-cmake: - ======================== Building LLVM with CMake ======================== diff --git a/docs/CodeGenerator.rst b/docs/CodeGenerator.rst index ce23667..b5d4180 100644 --- a/docs/CodeGenerator.rst +++ b/docs/CodeGenerator.rst @@ -1,5 +1,3 @@ -.. _code_generator: - ========================================== The LLVM Target-Independent Code Generator ========================================== @@ -17,6 +15,8 @@ The LLVM Target-Independent Code Generator .partial { background-color: #F88017 } .yes { background-color: #0F0; } .yes:before { content: "Y" } + .na { background-color: #6666FF; } + .na:before { content: "N/A" } </style> .. contents:: @@ -285,12 +285,10 @@ The ``TargetInstrInfo`` class ----------------------------- The ``TargetInstrInfo`` class is used to describe the machine instructions -supported by the target. It is essentially an array of ``TargetInstrDescriptor`` -objects, each of which describes one instruction the target -supports. Descriptors define things like the mnemonic for the opcode, the number -of operands, the list of implicit register uses and defs, whether the -instruction has certain target-independent properties (accesses memory, is -commutable, etc), and holds any target-specific flags. +supported by the target. Descriptions define things like the mnemonic for +the opcode, the number of operands, the list of implicit register uses and defs, +whether the instruction has certain target-independent properties (accesses +memory, is commutable, etc), and holds any target-specific flags. 
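A minimal sketch of consulting these per-opcode descriptions from target-independent code, assuming the 3.3-era ``TargetInstrInfo`` and ``MCInstrDesc`` interfaces (the helper name is illustrative):

.. code-block:: c++

    #include "llvm/MC/MCInstrDesc.h"
    #include "llvm/Target/TargetInstrInfo.h"

    using namespace llvm;

    // Illustrative helper: inspect the description of a single opcode.
    static void inspectOpcode(const TargetInstrInfo &TII, unsigned Opcode) {
      const MCInstrDesc &Desc = TII.get(Opcode);
      unsigned NumOps = Desc.getNumOperands(); // declared operand count
      bool ReadsMem   = Desc.mayLoad();        // may read memory
      bool WritesMem  = Desc.mayStore();       // may write memory
      bool Commutes   = Desc.isCommutable();   // operands may be swapped
      (void)NumOps; (void)ReadsMem; (void)WritesMem; (void)Commutes;
    }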
The ``TargetFrameInfo`` class ----------------------------- @@ -1748,12 +1746,14 @@ the key: :raw-html:`<table border="1" cellspacing="0">` :raw-html:`<tr>` :raw-html:`<th>Unknown</th>` +:raw-html:`<th>Not Applicable</th>` :raw-html:`<th>No support</th>` :raw-html:`<th>Partial Support</th>` :raw-html:`<th>Complete Support</th>` :raw-html:`</tr>` :raw-html:`<tr>` :raw-html:`<td class="unknown"></td>` +:raw-html:`<td class="na"></td>` :raw-html:`<td class="no"></td>` :raw-html:`<td class="partial"></td>` :raw-html:`<td class="yes"></td>` @@ -1773,7 +1773,7 @@ Here is the table: :raw-html:`<th>MBlaze</th>` :raw-html:`<th>MSP430</th>` :raw-html:`<th>Mips</th>` -:raw-html:`<th>PTX</th>` +:raw-html:`<th>NVPTX</th>` :raw-html:`<th>PowerPC</th>` :raw-html:`<th>Sparc</th>` :raw-html:`<th>X86</th>` @@ -1787,7 +1787,7 @@ Here is the table: :raw-html:`<td class="no"></td> <!-- MBlaze -->` :raw-html:`<td class="unknown"></td> <!-- MSP430 -->` :raw-html:`<td class="yes"></td> <!-- Mips -->` -:raw-html:`<td class="no"></td> <!-- PTX -->` +:raw-html:`<td class="yes"></td> <!-- NVPTX -->` :raw-html:`<td class="yes"></td> <!-- PowerPC -->` :raw-html:`<td class="yes"></td> <!-- Sparc -->` :raw-html:`<td class="yes"></td> <!-- X86 -->` @@ -1801,7 +1801,7 @@ Here is the table: :raw-html:`<td class="yes"></td> <!-- MBlaze -->` :raw-html:`<td class="no"></td> <!-- MSP430 -->` :raw-html:`<td class="no"></td> <!-- Mips -->` -:raw-html:`<td class="no"></td> <!-- PTX -->` +:raw-html:`<td class="no"></td> <!-- NVPTX -->` :raw-html:`<td class="no"></td> <!-- PowerPC -->` :raw-html:`<td class="no"></td> <!-- Sparc -->` :raw-html:`<td class="yes"></td> <!-- X86 -->` @@ -1815,7 +1815,7 @@ Here is the table: :raw-html:`<td class="yes"></td> <!-- MBlaze -->` :raw-html:`<td class="no"></td> <!-- MSP430 -->` :raw-html:`<td class="no"></td> <!-- Mips -->` -:raw-html:`<td class="no"></td> <!-- PTX -->` +:raw-html:`<td class="na"></td> <!-- NVPTX -->` :raw-html:`<td class="no"></td> <!-- PowerPC -->` :raw-html:`<td class="no"></td> <!-- Sparc -->` :raw-html:`<td class="yes"></td> <!-- X86 -->` @@ -1829,7 +1829,7 @@ Here is the table: :raw-html:`<td class="yes"></td> <!-- MBlaze -->` :raw-html:`<td class="unknown"></td> <!-- MSP430 -->` :raw-html:`<td class="no"></td> <!-- Mips -->` -:raw-html:`<td class="unknown"></td> <!-- PTX -->` +:raw-html:`<td class="yes"></td> <!-- NVPTX -->` :raw-html:`<td class="yes"></td> <!-- PowerPC -->` :raw-html:`<td class="unknown"></td> <!-- Sparc -->` :raw-html:`<td class="yes"></td> <!-- X86 -->` @@ -1843,7 +1843,7 @@ Here is the table: :raw-html:`<td class="no"></td> <!-- MBlaze -->` :raw-html:`<td class="unknown"></td> <!-- MSP430 -->` :raw-html:`<td class="yes"></td> <!-- Mips -->` -:raw-html:`<td class="unknown"></td> <!-- PTX -->` +:raw-html:`<td class="na"></td> <!-- NVPTX -->` :raw-html:`<td class="yes"></td> <!-- PowerPC -->` :raw-html:`<td class="unknown"></td> <!-- Sparc -->` :raw-html:`<td class="yes"></td> <!-- X86 -->` @@ -1857,7 +1857,7 @@ Here is the table: :raw-html:`<td class="yes"></td> <!-- MBlaze -->` :raw-html:`<td class="no"></td> <!-- MSP430 -->` :raw-html:`<td class="no"></td> <!-- Mips -->` -:raw-html:`<td class="no"></td> <!-- PTX -->` +:raw-html:`<td class="na"></td> <!-- NVPTX -->` :raw-html:`<td class="no"></td> <!-- PowerPC -->` :raw-html:`<td class="no"></td> <!-- Sparc -->` :raw-html:`<td class="yes"></td> <!-- X86 -->` @@ -1871,7 +1871,7 @@ Here is the table: :raw-html:`<td class="no"></td> <!-- MBlaze -->` :raw-html:`<td class="unknown"></td> <!-- MSP430 -->` 
:raw-html:`<td class="no"></td> <!-- Mips -->` -:raw-html:`<td class="unknown"></td> <!-- PTX -->` +:raw-html:`<td class="no"></td> <!-- NVPTX -->` :raw-html:`<td class="yes"></td> <!-- PowerPC -->` :raw-html:`<td class="unknown"></td> <!-- Sparc -->` :raw-html:`<td class="yes"></td> <!-- X86 -->` @@ -1885,7 +1885,7 @@ Here is the table: :raw-html:`<td class="no"></td> <!-- MBlaze -->` :raw-html:`<td class="no"></td> <!-- MSP430 -->` :raw-html:`<td class="no"></td> <!-- Mips -->` -:raw-html:`<td class="no"></td> <!-- PTX -->` +:raw-html:`<td class="no"></td> <!-- NVPTX -->` :raw-html:`<td class="no"></td> <!-- PowerPC -->` :raw-html:`<td class="no"></td> <!-- Sparc -->` :raw-html:`<td class="partial"><a href="#feat_segstacks_x86">*</a></td> <!-- X86 -->` @@ -2367,17 +2367,17 @@ Dynamic Allocation TODO - More to come. -The PTX backend ---------------- +The NVPTX backend +----------------- -The PTX code generator lives in the lib/Target/PTX directory. It is currently a -work-in-progress, but already supports most of the code generation functionality -needed to generate correct PTX kernels for CUDA devices. +The NVPTX code generator under lib/Target/NVPTX is an open-source version of +the NVIDIA NVPTX code generator for LLVM. It is contributed by NVIDIA and is +a port of the code generator used in the CUDA compiler (nvcc). It targets the +PTX 3.0/3.1 ISA and can target any compute capability greater than or equal to +2.0 (Fermi). -The code generator can target PTX 2.0+, and shader model 1.0+. The PTX ISA -Reference Manual is used as the primary source of ISA information, though an -effort is made to make the output of the code generator match the output of the -NVidia nvcc compiler, whenever possible. +This target is of production quality and should be completely compatible with +the official NVIDIA toolchain. Code Generator Options: @@ -2387,39 +2387,28 @@ Code Generator Options: :raw-html:`<th>Description</th>` :raw-html:`</tr>` :raw-html:`<tr>` -:raw-html:`<td>``double``</td>` -:raw-html:`<td align="left">If enabled, the map_f64_to_f32 directive is disabled in the PTX output, allowing native double-precision arithmetic</td>` +:raw-html:`<td>sm_20</td>` +:raw-html:`<td align="left">Set shader model/compute capability to 2.0</td>` +:raw-html:`</tr>` +:raw-html:`<tr>` +:raw-html:`<td>sm_21</td>` +:raw-html:`<td align="left">Set shader model/compute capability to 2.1</td>` +:raw-html:`</tr>` +:raw-html:`<tr>` +:raw-html:`<td>sm_30</td>` +:raw-html:`<td align="left">Set shader model/compute capability to 3.0</td>` +:raw-html:`</tr>` +:raw-html:`<tr>` +:raw-html:`<td>sm_35</td>` +:raw-html:`<td align="left">Set shader model/compute capability to 3.5</td>` :raw-html:`</tr>` :raw-html:`<tr>` -:raw-html:`<td>``no-fma``</td>` -:raw-html:`<td align="left">Disable generation of Fused-Multiply Add instructions, which may be beneficial for some devices</td>` +:raw-html:`<td>ptx30</td>` +:raw-html:`<td align="left">Target PTX 3.0</td>` :raw-html:`</tr>` :raw-html:`<tr>` -:raw-html:`<td>``smxy / computexy``</td>` -:raw-html:`<td align="left">Set shader model/compute capability to x.y, e.g. 
sm20 or compute13</td>` +:raw-html:`<td>ptx31</td>` +:raw-html:`<td align="left">Target PTX 3.1</td>` :raw-html:`</tr>` :raw-html:`</table>` -Working: - -* Arithmetic instruction selection (including combo FMA) - -* Bitwise instruction selection - -* Control-flow instruction selection - -* Function calls (only on SM 2.0+ and no return arguments) - -* Addresses spaces (0 = global, 1 = constant, 2 = local, 4 = shared) - -* Thread synchronization (bar.sync) - -* Special register reads ([N]TID, [N]CTAID, PMx, CLOCK, etc.) - -In Progress: - -* Robust call instruction selection - -* Stack frame allocation - -* Device-specific instruction scheduling optimizations diff --git a/docs/CodingStandards.rst b/docs/CodingStandards.rst index 8003c12..4d66ad7 100644 --- a/docs/CodingStandards.rst +++ b/docs/CodingStandards.rst @@ -1,5 +1,3 @@ -.. _coding_standards: - ===================== LLVM Coding Standards ===================== @@ -1090,6 +1088,34 @@ flushes the output stream. In other words, these are equivalent: Most of the time, you probably have no reason to flush the output stream, so it's better to use a literal ``'\n'``. +Don't use ``inline`` when defining a function in a class definition +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +A member function defined in a class definition is implicitly inline, so don't +put the ``inline`` keyword in this case. + +Don't: + +.. code-block:: c++ + + class Foo { + public: + inline void bar() { + // ... + } + }; + +Do: + +.. code-block:: c++ + + class Foo { + public: + void bar() { + // ... + } + }; + Microscopic Details ------------------- @@ -1304,7 +1330,7 @@ namespace just because it was declared there. See Also ======== -A lot of these comments and recommendations have been culled for other sources. +A lot of these comments and recommendations have been culled from other sources. Two particularly important books for our work are: #. `Effective C++ diff --git a/docs/CommandGuide/FileCheck.rst b/docs/CommandGuide/FileCheck.rst index 256970b..fce63ba 100644 --- a/docs/CommandGuide/FileCheck.rst +++ b/docs/CommandGuide/FileCheck.rst @@ -43,7 +43,8 @@ OPTIONS By default, FileCheck canonicalizes input horizontal whitespace (spaces and tabs) which causes it to ignore these differences (a space will match a tab). - The :option:`--strict-whitespace` argument disables this behavior. + The :option:`--strict-whitespace` argument disables this behavior. End-of-line + sequences are canonicalized to UNIX-style '\n' in all modes. .. option:: -version diff --git a/docs/CommandGuide/index.rst b/docs/CommandGuide/index.rst index 73a4835..9bfa964 100644 --- a/docs/CommandGuide/index.rst +++ b/docs/CommandGuide/index.rst @@ -1,5 +1,3 @@ -.. _commands: - LLVM Command Guide ------------------ diff --git a/docs/CommandGuide/lit.rst b/docs/CommandGuide/lit.rst index 1dcaff1..40c7646 100644 --- a/docs/CommandGuide/lit.rst +++ b/docs/CommandGuide/lit.rst @@ -151,10 +151,6 @@ ADDITIONAL OPTIONS List the discovered test suites as part of the standard output. -.. option:: --no-tcl-as-sh - - Run Tcl scripts internally (instead of converting to shell scripts). - .. option:: --repeat=N Run each test ``N`` times. Currently this is primarily useful for timing @@ -298,14 +294,13 @@ executed, two important global variables are predefined: tests in the suite. **suffixes** For **lit** test formats which scan directories for tests, this - variable is a list of suffixes to identify test files. Used by: *ShTest*, - *TclTest*. 
+ variable is a list of suffixes to identify test files. Used by: *ShTest*. **substitutions** For **lit** test formats which substitute variables into a test - script, the list of substitutions to perform. Used by: *ShTest*, *TclTest*. + script, the list of substitutions to perform. Used by: *ShTest*. **unsupported** Mark an unsupported directory, all tests within it will be - reported as unsupported. Used by: *ShTest*, *TclTest*. + reported as unsupported. Used by: *ShTest*. **parent** The parent configuration, this is the config object for the directory containing the test suite, or None. diff --git a/docs/CommandLine.rst b/docs/CommandLine.rst index 302f5a4..073958b 100644 --- a/docs/CommandLine.rst +++ b/docs/CommandLine.rst @@ -1,5 +1,3 @@ -.. _commandline: - ============================== CommandLine 2.0 Library Manual ============================== @@ -68,9 +66,7 @@ CommandLine library to have the following features: This document will hopefully let you jump in and start using CommandLine in your utility quickly and painlessly. Additionally it should be a simple reference -manual to figure out how stuff works. If it is failing in some area (or you -want an extension to the library), nag the author, `Chris -Lattner <mailto:sabre@nondot.org>`_. +manual to figure out how stuff works. Quick Start Guide ================= diff --git a/docs/CompilerWriterInfo.rst b/docs/CompilerWriterInfo.rst index f214919..bc0b996 100644 --- a/docs/CompilerWriterInfo.rst +++ b/docs/CompilerWriterInfo.rst @@ -1,5 +1,3 @@ -.. _compiler_writer_info: - ======================================================== Architecture & Platform Information for Compiler Writers ======================================================== @@ -24,6 +22,11 @@ ARM * `ABI <http://www.arm.com/products/DevTools/ABI.html>`_ +AArch64 +------- + +* `ARMv8 Instruction Set Overview <http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.genc010197a/index.html>`_ + Itanium (ia64) -------------- @@ -40,19 +43,15 @@ PowerPC IBM - Official manuals and docs ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -* `PowerPC Architecture Book <http://www-106.ibm.com/developerworks/eserver/articles/archguide.html>`_ - - * Book I: `PowerPC User Instruction Set Architecture <http://www-106.ibm.com/developerworks/eserver/pdfs/archpub1.pdf>`_ - - * Book II: `PowerPC Virtual Environment Architecture <http://www-106.ibm.com/developerworks/eserver/pdfs/archpub2.pdf>`_ +* `Power Instruction Set Architecture, Versions 2.03 through 2.06 (authentication required, free sign-up) <https://www.power.org/technology-introduction/standards-specifications>`_ - * Book III: `PowerPC Operating Environment Architecture <http://www-106.ibm.com/developerworks/eserver/pdfs/archpub3.pdf>`_ +* `PowerPC Compiler Writer's Guide <http://www.ibm.com/chips/techlib/techlib.nsf/techdocs/852569B20050FF7785256996007558C6>`_ -* `PowerPC Compiler Writer's Guide <http://www-3.ibm.com/chips/techlib/techlib.nsf/techdocs/852569B20050FF7785256996007558C6>`_ +* `Intro to PowerPC Architecture <http://www.ibm.com/developerworks/linux/library/l-powarch/>`_ -* `PowerPC Processor Manuals <http://www-3.ibm.com/chips/techlib/techlib.nsf/products/PowerPC>`_ +* `PowerPC Processor Manuals (embedded) <http://www.ibm.com/chips/techlib/techlib.nsf/products/PowerPC>`_ -* `Intro to PowerPC Architecture <http://www-106.ibm.com/developerworks/linux/library/l-powarch/>`_ +* `Various IBM specifications and white papers 
<https://www.power.org/documentation/?document_company=105&document_category=all&publish_year=all&grid_order=DESC&grid_sort=title>`_ * `IBM AIX/5L for POWER Assembly Reference <http://publibn.boulder.ibm.com/doc_link/en_US/a_doc_lib/aixassem/alangref/alangreftfrm.htm>`_ @@ -101,6 +100,8 @@ Linux ----- * `PowerPC 64-bit ELF ABI Supplement <http://www.linuxbase.org/spec/ELF/ppc64/>`_ +* `Procedure Call Standard for the AArch64 Architecture <http://infocenter.arm.com/help/topic/com.arm.doc.ihi0055a/IHI0055A_aapcs64.pdf>`_ +* `ELF for the ARM 64-bit Architecture (AArch64) <http://infocenter.arm.com/help/topic/com.arm.doc.ihi0056a/IHI0056A_aaelf64.pdf>`_ OS X ---- diff --git a/docs/DebuggingJITedCode.rst b/docs/DebuggingJITedCode.rst index eeb2f77..d6101d5 100644 --- a/docs/DebuggingJITedCode.rst +++ b/docs/DebuggingJITedCode.rst @@ -1,11 +1,7 @@ -.. _debugging-jited-code: - ============================== Debugging JIT-ed Code With GDB ============================== -.. sectionauthor:: Reid Kleckner and Eli Bendersky - Background ========== diff --git a/docs/DeveloperPolicy.rst b/docs/DeveloperPolicy.rst index 925e769..43bdc85 100644 --- a/docs/DeveloperPolicy.rst +++ b/docs/DeveloperPolicy.rst @@ -1,5 +1,3 @@ -.. _developer_policy: - ===================== LLVM Developer Policy ===================== diff --git a/docs/ExceptionHandling.rst b/docs/ExceptionHandling.rst index 190f182..0a86607 100644 --- a/docs/ExceptionHandling.rst +++ b/docs/ExceptionHandling.rst @@ -1,5 +1,3 @@ -.. _exception_handling: - ========================== Exception Handling in LLVM ========================== @@ -34,13 +32,13 @@ execution of an application. A more complete description of the Itanium ABI exception handling runtime support of can be found at `Itanium C++ ABI: Exception Handling -<http://www.codesourcery.com/cxx-abi/abi-eh.html>`_. A description of the +<http://mentorembedded.github.com/cxx-abi/abi-eh.html>`_. A description of the exception frame format can be found at `Exception Frames -<http://refspecs.freestandards.org/LSB_3.0.0/LSB-Core-generic/LSB-Core-generic/ehframechpt.html>`_, +<http://refspecs.linuxfoundation.org/LSB_3.0.0/LSB-Core-generic/LSB-Core-generic/ehframechpt.html>`_, with details of the DWARF 4 specification at `DWARF 4 Standard <http://dwarfstd.org/Dwarf4Std.php>`_. A description for the C++ exception table formats can be found at `Exception Handling Tables -<http://www.codesourcery.com/cxx-abi/exceptions.pdf>`_. +<http://mentorembedded.github.com/cxx-abi/exceptions.pdf>`_. Setjmp/Longjmp Exception Handling --------------------------------- @@ -151,10 +149,10 @@ type info index are passed in as arguments. The landing pad saves the exception structure reference and then proceeds to select the catch block that corresponds to the type info of the exception object. -The LLVM `landingpad instruction <LangRef.html#i_landingpad>`_ is used to convey -information about the landing pad to the back end. For C++, the ``landingpad`` -instruction returns a pointer and integer pair corresponding to the pointer to -the *exception structure* and the *selector value* respectively. +The LLVM :ref:`i_landingpad` is used to convey information about the landing +pad to the back end. For C++, the ``landingpad`` instruction returns a pointer +and integer pair corresponding to the pointer to the *exception structure* and +the *selector value* respectively. The ``landingpad`` instruction takes a reference to the personality function to be used for this ``try``/``catch`` sequence. 
The remainder of the instruction is @@ -203,10 +201,9 @@ A cleanup is extra code which needs to be run as part of unwinding a scope. C++ destructors are a typical example, but other languages and language extensions provide a variety of different kinds of cleanups. In general, a landing pad may need to run arbitrary amounts of cleanup code before actually entering a catch -block. To indicate the presence of cleanups, a `landingpad -instruction <LangRef.html#i_landingpad>`_ should have a *cleanup* -clause. Otherwise, the unwinder will not stop at the landing pad if there are no -catches or filters that require it to. +block. To indicate the presence of cleanups, a :ref:`i_landingpad` should have +a *cleanup* clause. Otherwise, the unwinder will not stop at the landing pad if +there are no catches or filters that require it to. .. note:: @@ -226,9 +223,9 @@ Throw Filters C++ allows the specification of which exception types may be thrown from a function. To represent this, a top level landing pad may exist to filter out -invalid types. To express this in LLVM code the `landingpad -instruction <LangRef.html#i_landingpad>`_ will have a filter clause. The clause -consists of an array of type infos. ``landingpad`` will return a negative value +invalid types. To express this in LLVM code the :ref:`i_landingpad` will have a +filter clause. The clause consists of an array of type infos. +``landingpad`` will return a negative value if the exception does not match any of the type infos. If no match is found then a call to ``__cxa_call_unexpected`` should be made, otherwise ``_Unwind_Resume``. Each of these functions requires a reference to the @@ -269,8 +266,8 @@ handling information at various points in generated code. .. _llvm.eh.typeid.for: -llvm.eh.typeid.for ------------------- +``llvm.eh.typeid.for`` +---------------------- .. code-block:: llvm @@ -283,8 +280,8 @@ function. This value can be used to compare against the result of .. _llvm.eh.sjlj.setjmp: -llvm.eh.sjlj.setjmp -------------------- +``llvm.eh.sjlj.setjmp`` +----------------------- .. code-block:: llvm @@ -305,8 +302,8 @@ available for use in a target-specific manner. .. _llvm.eh.sjlj.longjmp: -llvm.eh.sjlj.longjmp --------------------- +``llvm.eh.sjlj.longjmp`` +------------------------ .. code-block:: llvm @@ -318,8 +315,8 @@ a buffer populated by `llvm.eh.sjlj.setjmp`_. The frame pointer and stack pointer are restored from the buffer, then control is transferred to the destination address. -llvm.eh.sjlj.lsda ------------------ +``llvm.eh.sjlj.lsda`` +--------------------- .. code-block:: llvm @@ -330,8 +327,8 @@ the address of the Language Specific Data Area (LSDA) for the current function. The SJLJ front-end code stores this address in the exception handling function context for use by the runtime. -llvm.eh.sjlj.callsite ---------------------- +``llvm.eh.sjlj.callsite`` +------------------------- .. code-block:: llvm diff --git a/docs/ExtendingLLVM.rst b/docs/ExtendingLLVM.rst index 6df08ee..3d8e9ee 100644 --- a/docs/ExtendingLLVM.rst +++ b/docs/ExtendingLLVM.rst @@ -1,5 +1,3 @@ -.. _extending_llvm: - ============================================================ Extending LLVM: Adding instructions, intrinsics, types, etc. ============================================================ diff --git a/docs/FAQ.rst b/docs/FAQ.rst index b4c6261..e4ab2c1 100644 --- a/docs/FAQ.rst +++ b/docs/FAQ.rst @@ -1,5 +1,3 @@ -.. 
_faq: - ================================ Frequently Asked Questions (FAQ) ================================ diff --git a/docs/GarbageCollection.rst b/docs/GarbageCollection.rst index 7765bd7..5c3a1af 100644 --- a/docs/GarbageCollection.rst +++ b/docs/GarbageCollection.rst @@ -5,9 +5,6 @@ Accurate Garbage Collection with LLVM .. contents:: :local: -.. sectionauthor:: Chris Lattner <sabre@nondot.org> and - Gordon Henriksen - Introduction ============ diff --git a/docs/GetElementPtr.rst b/docs/GetElementPtr.rst index 3b57d78..306a2a8 100644 --- a/docs/GetElementPtr.rst +++ b/docs/GetElementPtr.rst @@ -1,5 +1,3 @@ -.. _gep: - ======================================= The Often Misunderstood GEP Instruction ======================================= diff --git a/docs/GettingStarted.rst b/docs/GettingStarted.rst index 5009d1c..539c75e 100644 --- a/docs/GettingStarted.rst +++ b/docs/GettingStarted.rst @@ -1,5 +1,3 @@ -.. _getting_started: - ==================================== Getting Started with the LLVM System ==================================== @@ -126,6 +124,8 @@ LLVM is known to work on the following platforms: +-----------------+----------------------+-------------------------+ |Linux | amd64 | GCC | +-----------------+----------------------+-------------------------+ +|Linux | ARM\ :sup:`13` | GCC | ++-----------------+----------------------+-------------------------+ |Solaris | V9 (Ultrasparc) | GCC | +-----------------+----------------------+-------------------------+ |FreeBSD | x86\ :sup:`1` | GCC | @@ -161,8 +161,6 @@ LLVM has partial support for the following platforms: .. note:: - Code generation supported for Pentium processors and up - #. Code generation supported for Pentium processors and up #. Code generation supported for 32-bit ABI only #. No native code generation @@ -182,9 +180,9 @@ LLVM has partial support for the following platforms: Windows-specifics that will cause the build to fail. #. To use LLVM modules on Win32-based system, you may configure LLVM with ``--enable-shared``. - #. To compile SPU backend, you need to add ``LDFLAGS=-Wl,--stack,16777216`` to configure. + #. MCJIT not working well pre-v7, old JIT engine not supported any more. Note that you will need about 1-3 GB of space for a full LLVM build in Debug mode, depending on the system (it is so large because of all the debugging @@ -219,11 +217,7 @@ uses the package and provides other details. 
+--------------------------------------------------------------+-----------------+---------------------------------------------+ | `SVN <http://subversion.tigris.org/project_packages.html>`_ | >=1.3 | Subversion access to LLVM\ :sup:`2` | +--------------------------------------------------------------+-----------------+---------------------------------------------+ -| `DejaGnu <http://savannah.gnu.org/projects/dejagnu>`_ | 1.4.2 | Automated test suite\ :sup:`3` | -+--------------------------------------------------------------+-----------------+---------------------------------------------+ -| `tcl <http://www.tcl.tk/software/tcltk/>`_ | 8.3, 8.4 | Automated test suite\ :sup:`3` | -+--------------------------------------------------------------+-----------------+---------------------------------------------+ -| `expect <http://expect.nist.gov/>`_ | 5.38.0 | Automated test suite\ :sup:`3` | +| `python <http://www.python.org/>`_ | >=2.4 | Automated test suite\ :sup:`3` | +--------------------------------------------------------------+-----------------+---------------------------------------------+ | `perl <http://www.perl.com/download.csp>`_ | >=5.6.0 | Utilities | +--------------------------------------------------------------+-----------------+---------------------------------------------+ @@ -368,6 +362,9 @@ optimizations are turned on. The symptom is an infinite loop in ``-O0``. A test failure in ``test/Assembler/alignstack.ll`` is one symptom of the problem. +**GCC 4.6.3 on ARM**: Miscompiles ``llvm-readobj`` at ``-O3``. A test failure +in ``test/Object/readobj-shared-object.test`` is one symptom of the problem. + **GNU ld 2.16.X**. Some 2.16.X versions of the ld linker will produce very long warning messages complaining that some "``.gnu.linkonce.t.*``" symbol was defined in a discarded section. You can safely ignore these messages as they are @@ -467,6 +464,8 @@ The files are as follows, with *x.y* marking the version number: Binary release of the llvm-gcc-4.2 front end for a specific platform. +.. _checkout: + Checkout LLVM from Subversion ----------------------------- @@ -643,6 +642,34 @@ This leaves your working directories on their master branches, so you'll need to ``checkout`` each working branch individually and ``rebase`` it on top of its parent branch. +For those who wish to be able to update an llvm repo in a simpler fashion, +consider placing the following git script in your path under the name +``git-svnup``: + +.. code-block:: bash + + #!/bin/bash + + STATUS=$(git status -s | grep -v "??") + + if [ ! -z "$STATUS" ]; then + STASH="yes" + git stash >/dev/null + fi + + git fetch + OLD_BRANCH=$(git rev-parse --abbrev-ref HEAD) + git checkout master 2> /dev/null + git svn rebase -l + git checkout $OLD_BRANCH 2> /dev/null + + if [ ! -z $STASH ]; then + git stash pop >/dev/null + fi + +Then to perform the aforementioned update steps go into your source directory +and just type ``git-svnup`` or ``git svnup`` and everything will just work. + To commit back changes via git-svn, use ``dcommit``: .. code-block:: console diff --git a/docs/GettingStartedVS.rst b/docs/GettingStartedVS.rst index 35f97f0..4c80f2c 100644 --- a/docs/GettingStartedVS.rst +++ b/docs/GettingStartedVS.rst @@ -1,5 +1,3 @@ -.. 
_winvs: - ================================================================== Getting Started with the LLVM System using Microsoft Visual Studio ================================================================== diff --git a/docs/GoldPlugin.rst b/docs/GoldPlugin.rst index 300aea9..17bbeb8 100644 --- a/docs/GoldPlugin.rst +++ b/docs/GoldPlugin.rst @@ -1,11 +1,7 @@ -.. _gold-plugin: - ==================== The LLVM gold plugin ==================== -.. sectionauthor:: Nick Lewycky - Introduction ============ diff --git a/docs/HowToAddABuilder.rst b/docs/HowToAddABuilder.rst index b0cd290..893f12d 100644 --- a/docs/HowToAddABuilder.rst +++ b/docs/HowToAddABuilder.rst @@ -1,11 +1,7 @@ -.. _how_to_add_a_builder: - =================================================================== How To Add Your Build Configuration To LLVM Buildbot Infrastructure =================================================================== -.. sectionauthor:: Galina Kistanova <gkistanova@gmail.com> - Introduction ============ diff --git a/docs/HowToBuildOnARM.rst b/docs/HowToBuildOnARM.rst index d786a7d..32ae39b 100644 --- a/docs/HowToBuildOnARM.rst +++ b/docs/HowToBuildOnARM.rst @@ -1,11 +1,7 @@ -.. _how_to_build_on_arm: - =================================================================== How To Build On ARM =================================================================== -.. sectionauthor:: Wei-Ren Chen (陳韋任) <chenwj@iis.sinica.edu.tw> - Introduction ============ @@ -40,8 +36,8 @@ on the ARMv6 and ARMv7 architectures and may be inapplicable to older chips. .. code-block:: bash - ./configure --build=armv7l-unknown-linux-gnueabihf - --host=armv7l-unknown-linux-gnueabihf - --target=armv7l-unknown-linux-gnueabihf --with-cpu=cortex-a9 - --with-float=hard --with-abi=aapcs-vfp --with-fpu=neon - --enable-targets=arm --disable-optimized --enable-assertions + ./configure --build=armv7l-unknown-linux-gnueabihf \ + --host=armv7l-unknown-linux-gnueabihf \ + --target=armv7l-unknown-linux-gnueabihf --with-cpu=cortex-a9 \ + --with-float=hard --with-abi=aapcs-vfp --with-fpu=neon \ + --enable-targets=arm --enable-optimized --enable-assertions diff --git a/docs/HowToReleaseLLVM.rst b/docs/HowToReleaseLLVM.rst index eb6c838..31877bd 100644 --- a/docs/HowToReleaseLLVM.rst +++ b/docs/HowToReleaseLLVM.rst @@ -6,11 +6,6 @@ How To Release LLVM To The Public :local: :depth: 1 -.. sectionauthor:: Tanya Lattner <tonic@nondot.org>, - Reid Spencer <rspencer@x10sys.com>, - John Criswell <criswell@cs.uiuc.edu> and - Bill Wendling <wendling@apple.com> - Introduction ============ @@ -201,7 +196,7 @@ Build LLVM Build ``Debug``, ``Release+Asserts``, and ``Release`` versions of ``llvm`` on all supported platforms. Directions to build ``llvm`` -are :ref:`here <getting_started>`. +are :doc:`here <GettingStarted>`. Build Clang Binary Distribution ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -268,7 +263,7 @@ no regressions when using either ``clang`` or ``dragonegg`` with the Qualify Clang ^^^^^^^^^^^^^ -``Clang`` is qualified when front-end specific tests in the ``llvm`` dejagnu +``Clang`` is qualified when front-end specific tests in the ``llvm`` regression test suite all pass, clang's own test suite passes cleanly, and there are no regressions in the ``test-suite``. 
@@ -278,26 +273,26 @@ Specific Target Qualification Details +--------------+-------------+----------------+-----------------------------+ | Architecture | OS | clang baseline | tests | +==============+=============+================+=============================+ -| x86-32 | Linux | last release | llvm dejagnu, | -| | | | clang tests, | +| x86-32 | Linux | last release | llvm regression tests, | +| | | | clang regression tests, | | | | | test-suite (including spec) | +--------------+-------------+----------------+-----------------------------+ -| x86-32 | FreeBSD | last release | llvm dejagnu, | -| | | | clang tests, | +| x86-32 | FreeBSD | last release | llvm regression tests, | +| | | | clang regression tests, | | | | | test-suite | +--------------+-------------+----------------+-----------------------------+ | x86-32 | mingw | none | QT | +--------------+-------------+----------------+-----------------------------+ -| x86-64 | Mac OS 10.X | last release | llvm dejagnu, | -| | | | clang tests, | +| x86-64 | Mac OS 10.X | last release | llvm regression tests, | +| | | | clang regression tests, | | | | | test-suite (including spec) | +--------------+-------------+----------------+-----------------------------+ -| x86-64 | Linux | last release | llvm dejagnu, | -| | | | clang tests, | +| x86-64 | Linux | last release | llvm regression tests, | +| | | | clang regression tests, | | | | | test-suite (including spec) | +--------------+-------------+----------------+-----------------------------+ -| x86-64 | FreeBSD | last release | llvm dejagnu, | -| | | | clang tests, | +| x86-64 | FreeBSD | last release | llvm regression tests, | +| | | | clang regression tests, | | | | | test-suite | +--------------+-------------+----------------+-----------------------------+ diff --git a/docs/HowToSetUpLLVMStyleRTTI.rst b/docs/HowToSetUpLLVMStyleRTTI.rst index aa1ad84..b906b25 100644 --- a/docs/HowToSetUpLLVMStyleRTTI.rst +++ b/docs/HowToSetUpLLVMStyleRTTI.rst @@ -1,11 +1,7 @@ -.. _how-to-set-up-llvm-style-rtti: - ====================================================== How to set up LLVM-style RTTI for your class hierarchy ====================================================== -.. sectionauthor:: Sean Silva <silvas@purdue.edu> - .. contents:: Background diff --git a/docs/HowToSubmitABug.rst b/docs/HowToSubmitABug.rst index ff2d649..45be282 100644 --- a/docs/HowToSubmitABug.rst +++ b/docs/HowToSubmitABug.rst @@ -1,11 +1,7 @@ -.. _how-to-submit-a-bug-report: - ================================ How to submit an LLVM bug report ================================ -.. sectionauthor:: Chris Lattner <sabre@nondot.org> and Misha Brukman <http://misha.brukman.net> - Introduction - Got bugs? ======================== diff --git a/docs/HowToUseAttributes.rst b/docs/HowToUseAttributes.rst new file mode 100644 index 0000000..66c44c0 --- /dev/null +++ b/docs/HowToUseAttributes.rst @@ -0,0 +1,81 @@ +===================== +How To Use Attributes +===================== + +.. contents:: + :local: + +Introduction +============ + +Attributes in LLVM have changed in some fundamental ways. It was necessary to +do this to support expanding the attributes to encompass more than a handful of +attributes --- e.g. command line options. The old way of handling attributes +consisted of representing them as a bit mask of values. This bit mask was +stored in a "list" structure that was reference counted. The advantage of this +was that attributes could be manipulated with 'or's and 'and's. 
The +disadvantage of this was that there was limited room for expansion, and +virtually no support for attribute-value pairs other than alignment. + +In the new scheme, an ``Attribute`` object represents a single attribute that's +uniqued. You use the ``Attribute::get`` methods to create a new ``Attribute`` +object. An attribute can be a single "enum" value (the enum being the +``Attribute::AttrKind`` enum), a string representing a target-dependent +attribute, or an attribute-value pair. Some examples: + +* Target-independent: ``noinline``, ``zext`` +* Target-dependent: ``"no-sse"``, ``"thumb2"`` +* Attribute-value pair: ``"cpu" = "cortex-a8"``, ``align = 4`` + +Note: for an attribute-value pair, we expect a target-dependent attribute to +have a string for the value. + +``Attribute`` +============= +An ``Attribute`` object is designed to be passed around by value. + +Because attributes are no longer represented as a bit mask, you will need to +convert any code which still treats them as a bit mask to use the new query +methods on the Attribute class. + +``AttributeSet`` +================ + +The ``AttributeSet`` class replaces the old ``AttributeList`` class. The +``AttributeSet`` stores a collection of Attribute objects for each kind of +object that may have an attribute associated with it: the function as a +whole, the return type, or the function's parameters. A function's attributes +are at index ``AttributeSet::FunctionIndex``; the return type's attributes are +at index ``AttributeSet::ReturnIndex``; and the function's parameters' +attributes are at indices 1, ..., n (where 'n' is the number of parameters). +Most methods on the ``AttributeSet`` class take an index parameter. + +An ``AttributeSet`` is also a uniqued and immutable object. You create an +``AttributeSet`` through the ``AttributeSet::get`` methods. You can add and +remove attributes, which results in the creation of a new ``AttributeSet``. + +An ``AttributeSet`` object is designed to be passed around by value. + +Note: It is advised that you do *not* use the ``AttributeSet`` "introspection" +methods (e.g. ``Raw``, ``getRawPointer``, etc.). These methods break +encapsulation, and may be removed in a future release (i.e. LLVM 4.0). + +``AttrBuilder`` +=============== + +Lastly, we have a "builder" class to help create the ``AttributeSet`` object +without having to create several different intermediate uniqued +``AttributeSet`` objects. The ``AttrBuilder`` class allows you to add and +remove attributes at will. The attributes won't be uniqued until you call the +appropriate ``AttributeSet::get`` method. + +An ``AttrBuilder`` object is *not* designed to be passed around by value. It +should be passed by reference. + +Note: It is advised that you do *not* use the ``AttrBuilder::addRawValue()`` +method or the ``AttrBuilder(uint64_t Val)`` constructor. These are for +backwards compatibility and may be removed in a future release (i.e. LLVM 4.0). + +And that's basically it! A lot of functionality is hidden behind these classes, +but the interfaces are pretty straightforward. + diff --git a/docs/HowToUseInstrMappings.rst b/docs/HowToUseInstrMappings.rst index bf9278e..8a3e7c8 100644 --- a/docs/HowToUseInstrMappings.rst +++ b/docs/HowToUseInstrMappings.rst @@ -1,11 +1,7 @@ -.. _how_to_use_instruction_mappings: - =============================== How To Use Instruction Mappings =============================== -.. sectionauthor:: Jyotsna Verma <jverma@codeaurora.org> - ..
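Tying the ``HowToUseAttributes.rst`` pieces above together, a rough usage sketch against the 3.3-era interfaces (a sketch only; exact signatures may differ slightly):

.. code-block:: c++

    #include "llvm/IR/Attributes.h"
    #include "llvm/IR/LLVMContext.h"

    using namespace llvm;

    // Sketch: collect attributes in an AttrBuilder, then unique them into
    // a function-level AttributeSet in a single step.
    AttributeSet makeFnAttrs(LLVMContext &Ctx) {
      AttrBuilder B;
      B.addAttribute(Attribute::NoInline); // enum attribute
      B.addAttribute("no-sse");            // target-dependent string attribute
      B.addStackAlignmentAttr(4);          // attribute-value pair (alignstack=4)
      // Nothing is uniqued until this call:
      return AttributeSet::get(Ctx, AttributeSet::FunctionIndex, B);
    }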
contents:: :local: diff --git a/docs/LangRef.rst b/docs/LangRef.rst index 3c49ed0..c08cee1 100644 --- a/docs/LangRef.rst +++ b/docs/LangRef.rst @@ -111,7 +111,7 @@ After strength reduction: .. code-block:: llvm - %result = shl i32 %X, i8 3 + %result = shl i32 %X, 3 And the hard way: @@ -148,20 +148,20 @@ symbol table entries. Here is an example of the "hello world" module: .. code-block:: llvm - ; Declare the string constant as a global constant. - @.str = private unnamed_addr constant [13 x i8] c"hello world\0A\00" + ; Declare the string constant as a global constant. + @.str = private unnamed_addr constant [13 x i8] c"hello world\0A\00" - ; External declaration of the puts function - declare i32 @puts(i8* nocapture) nounwind + ; External declaration of the puts function + declare i32 @puts(i8* nocapture) nounwind ; Definition of main function - define i32 @main() { ; i32()* - ; Convert [13 x i8]* to i8 *... + define i32 @main() { ; i32()* + ; Convert [13 x i8]* to i8 *... %cast210 = getelementptr [13 x i8]* @.str, i64 0, i64 0 - ; Call puts function to write out the string to stdout. + ; Call puts function to write out the string to stdout. call i32 @puts(i8* %cast210) - ret i32 0 + ret i32 0 } ; Named metadata @@ -262,7 +262,7 @@ linkage: Some languages allow differing globals to be merged, such as two functions with different semantics. Other languages, such as ``C++``, ensure that only equivalent globals are ever merged (the - "one definition rule" — "ODR"). Such languages can use the + "one definition rule" --- "ODR"). Such languages can use the ``linkonce_odr`` and ``weak_odr`` linkage types to indicate that the global will only be merged with equivalent globals. These linkage types are otherwise the same as their non-``odr`` versions. @@ -465,11 +465,11 @@ more information on under which circumstances the different models may be used. The target may choose a different TLS model if the specified model is not supported, or if a better choice of model can be made. -A variable may be defined as a global "constant," which indicates that +A variable may be defined as a global ``constant``, which indicates that the contents of the variable will **never** be modified (enabling better optimization, allowing the global data to be placed in the read-only section of an executable, etc). Note that variables that need runtime -initialization cannot be marked "constant" as there is a store to the +initialization cannot be marked ``constant`` as there is a store to the variable. LLVM explicitly allows *declarations* of global variables to be marked @@ -501,6 +501,14 @@ is zero. The address space qualifier must precede any other attributes. LLVM allows an explicit section to be specified for globals. If the target supports it, it will emit globals to the section specified. +By default, global initializers are optimized by assuming that global +variables defined within the module are not modified from their +initial values before the start of the global initializer. This is +true even for variables potentially accessible from outside the +module, including those with external linkage or appearing in +``@llvm.used``. This assumption may be suppressed by marking the +variable with ``externally_initialized``. + An explicit alignment may be specified for a global, which must be a power of 2. 
If not present, or if the alignment is set to zero, the alignment of the global is set by the target to whatever it feels @@ -679,7 +687,7 @@ Currently, only the following parameter attributes are defined: This indicates that the pointer parameter specifies the address of a structure that is the return value of the function in the source program. This pointer must be guaranteed by the caller to be valid: - loads and stores to the structure may be assumed by the callee to + loads and stores to the structure may be assumed by the callee not to trap and to be properly aligned. This may only be applied to the first parameter. This is not a valid attribute for return values. @@ -712,6 +720,11 @@ Currently, only the following parameter attributes are defined: This indicates that the pointer parameter can be excised using the :ref:`trampoline intrinsics <int_trampoline>`. This is not a valid attribute for return values. +``nobuiltin`` + This indicates that the callee function at a call site is not + recognized as a built-in function. LLVM will retain the original call + and not replace it with equivalent code based on the semantics of the + built-in function. .. _gc: @@ -729,6 +742,36 @@ The compiler declares the supported values of *name*. Specifying a collector which will cause the compiler to alter its output in order to support the named garbage collection algorithm. +.. _attrgrp: + +Attribute Groups +---------------- + +Attribute groups are groups of attributes that are referenced by objects within +the IR. They are important for keeping ``.ll`` files readable, because a lot of +functions will use the same set of attributes. In the degenerate case of a +``.ll`` file that corresponds to a single ``.c`` file, the single attribute +group will capture the important command line flags used to build that file. + +An attribute group is a module-level object. To use an attribute group, an +object references the attribute group's ID (e.g. ``#37``). An object may refer +to more than one attribute group. In that situation, the attributes from the +different groups are merged. + +Here is an example of attribute groups for a function that should always be +inlined, has a stack alignment of 4, and which shouldn't use SSE instructions: + +.. code-block:: llvm + + ; Target-independent attributes: + #0 = attributes { alwaysinline alignstack=4 } + + ; Target-dependent attributes: + #1 = attributes { "no-sse" } + + ; Function @f has attributes: alwaysinline, alignstack=4, and "no-sse". + define void @f() #0 #1 { ... } + .. _fnattrs: Function Attributes @@ -750,9 +793,6 @@ example: define void @f() alwaysinline optsize { ... } define void @f() optsize { ... } -``address_safety`` - This attribute indicates that the address safety analysis is enabled - for this function. ``alignstack(<n>)`` This attribute indicates that, when emitting the prologue and epilogue, the backend should forcibly align the stack pointer. @@ -774,6 +814,17 @@ example: ``naked`` This attribute disables prologue / epilogue emission for the function. This can have very system-specific consequences. +``noduplicate`` + This attribute indicates that calls to the function cannot be + duplicated. A call to a ``noduplicate`` function may be moved + within its parent function, but may not be duplicated within + its parent function. + + A function containing a ``noduplicate`` call may still + be an inlining candidate, provided that the call is not + duplicated by inlining.
That implies that the function has + internal linkage and only has one call site, so the original + call is dead after inlining. ``noimplicitfloat`` This attribute disables implicit floating point instructions. ``noinline`` @@ -819,13 +870,27 @@ example: ``setjmp`` is an example of such a function. The compiler disables some optimizations (like tail calls) in the caller of these functions. +``sanitize_address`` + This attribute indicates that AddressSanitizer checks + (dynamic address safety analysis) are enabled for this function. +``sanitize_memory`` + This attribute indicates that MemorySanitizer checks (dynamic detection + of accesses to uninitialized memory) are enabled for this function. +``sanitize_thread`` + This attribute indicates that ThreadSanitizer checks + (dynamic thread safety analysis) are enabled for this function. ``ssp`` This attribute indicates that the function should emit a stack - smashing protector. It is in the form of a "canary"—a random value + smashing protector. It is in the form of a "canary" --- a random value placed on the stack before the local variables that's checked upon return from the function to see if it has been overwritten. A heuristic is used to determine if a function needs stack protectors - or not. + or not. The heuristic used will enable protectors for functions with: + + - Character arrays larger than ``ssp-buffer-size`` (default 8). + - Aggregates containing character arrays larger than ``ssp-buffer-size``. + - Calls to alloca() with variable sizes or constant sizes greater than + ``ssp-buffer-size``. If a function that has an ``ssp`` attribute is inlined into a function that doesn't have an ``ssp`` attribute, then the resulting @@ -837,25 +902,30 @@ example: If a function that has an ``sspreq`` attribute is inlined into a function that doesn't have an ``sspreq`` attribute or which has an - ``ssp`` attribute, then the resulting function will have an - ``sspreq`` attribute. + ``ssp`` or ``sspstrong`` attribute, then the resulting function will have + an ``sspreq`` attribute. +``sspstrong`` + This attribute indicates that the function should emit a stack smashing + protector. This attribute causes a strong heuristic to be used when + determining if a function needs stack protectors. The strong heuristic + will enable protectors for functions with: + + - Arrays of any size and type + - Aggregates containing an array of any size and type. + - Calls to alloca(). + - Local variables that have had their address taken. + + This overrides the ``ssp`` function attribute. + + If a function that has an ``sspstrong`` attribute is inlined into a + function that doesn't have an ``sspstrong`` attribute, then the + resulting function will have an ``sspstrong`` attribute. ``uwtable`` This attribute indicates that the ABI being targeted requires that an unwind table entry be produced for this function even if we can show that no exceptions pass by it. This is normally the case for the ELF x86-64 ABI, but it can be disabled for some compilation units. -``noduplicate`` - This attribute indicates that calls to the function cannot be - duplicated. A call to a ``noduplicate`` function may be moved - within its parent function, but may not be duplicated within - its parent function. - - A function containing a ``noduplicate`` call may still - be an inlining candidate, provided that the call is not - duplicated by inlining. That implies that the function has - internal linkage and only has one call site, so the original - call is dead after inlining. ..
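As a small illustration of the ``sspstrong`` semantics documented above, a hedged sketch of how a frontend might request the strong heuristic through the 3.3-era C++ API (the wrapper name is illustrative):

.. code-block:: c++

    #include "llvm/IR/Attributes.h"
    #include "llvm/IR/Function.h"

    using namespace llvm;

    // Sketch: mark a function so the strong stack-protector heuristic is
    // used; this corresponds to the 'sspstrong' attribute described above.
    void requestStrongStackProtector(Function &F) {
      F.addFnAttr(Attribute::StackProtectStrong);
    }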
_moduleasm: @@ -950,22 +1020,20 @@ specifications are given in this list: - ``E`` - big endian - ``p:64:64:64`` - 64-bit pointers with 64-bit alignment -- ``p1:32:32:32`` - 32-bit pointers with 32-bit alignment for address - space 1 -- ``p2:16:32:32`` - 16-bit pointers with 32-bit alignment for address - space 2 +- ``S0`` - natural stack alignment is unspecified - ``i1:8:8`` - i1 is 8-bit (byte) aligned - ``i8:8:8`` - i8 is 8-bit (byte) aligned - ``i16:16:16`` - i16 is 16-bit aligned - ``i32:32:32`` - i32 is 32-bit aligned - ``i64:32:64`` - i64 has ABI alignment of 32-bits but preferred alignment of 64-bits +- ``f16:16:16`` - half is 16-bit aligned - ``f32:32:32`` - float is 32-bit aligned - ``f64:64:64`` - double is 64-bit aligned +- ``f128:128:128`` - quad is 128-bit aligned - ``v64:64:64`` - 64-bit vector is 64-bit aligned - ``v128:128:128`` - 128-bit vector is 128-bit aligned -- ``a0:0:1`` - aggregates are 8-bit aligned -- ``s0:64:64`` - stack objects are 64-bit aligned +- ``a0:0:64`` - aggregates are 64-bit aligned When LLVM is determining the alignment for a given type, it uses the following rules: @@ -1061,6 +1129,21 @@ volatile operations. The optimizers *may* change the order of volatile operations relative to non-volatile operations. This is not Java's "volatile" and has no cross-thread synchronization behavior. +IR-level volatile loads and stores cannot safely be optimized into +llvm.memcpy or llvm.memmove intrinsics even when those intrinsics are +flagged volatile. Likewise, the backend should never split or merge +target-legal volatile load/store instructions. + +.. admonition:: Rationale + + Platforms may rely on volatile loads and stores of natively supported + data width to be executed as single instruction. For example, in C + this holds for an l-value of volatile primitive type with native + hardware support, but not necessarily for aggregate types. The + frontend upholds these expectations, which are intentionally + unspecified in the IR. The rules above ensure that IR transformation + do not violate the frontend's contract with the language. + .. _memmodel: Memory Model for Concurrent Operations @@ -1554,7 +1637,7 @@ Examples: +---------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | ``i32 (i32)`` | function taking an ``i32``, returning an ``i32`` | +---------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| ``float (i16, i32 *) *`` | :ref:`Pointer <t_pointer>` to a function that takes an ``i16`` and a :ref:`pointer <t_pointer>` to ``i32``, returning ``float``. | +| ``float (i16, i32 *) *`` | :ref:`Pointer <t_pointer>` to a function that takes an ``i16`` and a :ref:`pointer <t_pointer>` to ``i32``, returning ``float``. | +---------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | ``i32 (i8*, ...)`` | A vararg function that takes at least one :ref:`pointer <t_pointer>` to ``i8`` (char in C), which returns an integer. This is the signature for ``printf`` in LLVM. 
| +---------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------+ @@ -1605,7 +1688,7 @@ Examples: +------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | ``{ i32, i32, i32 }`` | A triple of three ``i32`` values | +------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| ``{ float, i32 (i32) * }`` | A pair, where the first element is a ``float`` and the second element is a :ref:`pointer <t_pointer>` to a :ref:`function <t_function>` that takes an ``i32``, returning an ``i32``. | +| ``{ float, i32 (i32) * }`` | A pair, where the first element is a ``float`` and the second element is a :ref:`pointer <t_pointer>` to a :ref:`function <t_function>` that takes an ``i32``, returning an ``i32``. | +------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | ``<{ i8, i32 }>`` | A packed struct known to be 5 bytes in size. | +------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ @@ -1754,7 +1837,7 @@ and disassembly do not cause any bits to change in the constants. When using the hexadecimal form, constants of types half, float, and double are represented using the 16-digit form shown above (which matches the IEEE754 representation for double); half and float values -must, however, be exactly representable as IEE754 half and single +must, however, be exactly representable as IEEE 754 half and single precision, respectively. Hexadecimal format is always used for long double, and there are three forms of long double. The 80-bit format used by x86 is represented as ``0xK`` followed by 20 hexadecimal digits. The @@ -2079,7 +2162,7 @@ Taking the address of the entry block is illegal. This value only has defined behavior when used as an operand to the ':ref:`indirectbr <i_indirectbr>`' instruction, or for comparisons against null. Pointer equality tests between labels addresses results in -undefined behavior — though, again, comparison against null is ok, and +undefined behavior --- though, again, comparison against null is ok, and no label is equal to the null pointer. This may be passed around as an opaque pointer sized value as long as the bits are not inspected. This allows ``ptrtoint`` and arithmetic to be performed on these values so @@ -2444,14 +2527,132 @@ Examples: !2 = metadata !{ i8 0, i8 2, i8 3, i8 6 } !3 = metadata !{ i8 -2, i8 0, i8 3, i8 6 } +'``llvm.loop``' +^^^^^^^^^^^^^^^ + +It is sometimes useful to attach information to loop constructs. Currently, +loop metadata is implemented as metadata attached to the branch instruction +in the loop latch block. This type of metadata refer to a metadata node that is +guaranteed to be separate for each loop. The loop-level metadata is prefixed +with ``llvm.loop``. 
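A hedged sketch of how such a loop identifier node might be built and attached through the 3.3-era C++ API; the self-reference trick it relies on is spelled out in the paragraphs that follow (the function name is illustrative):

.. code-block:: c++

    #include "llvm/IR/Instructions.h"
    #include "llvm/IR/LLVMContext.h"
    #include "llvm/IR/Metadata.h"

    using namespace llvm;

    // Sketch: build "!0 = metadata !{ metadata !0 }" and attach it to the
    // branch in a loop's latch block under the llvm.loop.parallel kind.
    void tagParallelLoop(BranchInst *LatchBr) {
      LLVMContext &Ctx = LatchBr->getContext();
      // Create the node with a null placeholder operand, then point the
      // operand at the node itself so the identifier is never merged away.
      Value *Ops[] = { 0 };
      MDNode *LoopID = MDNode::get(Ctx, Ops);
      LoopID->replaceOperandWith(0, LoopID);
      LatchBr->setMetadata("llvm.loop.parallel", LoopID);
    }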
+ +The loop identifier metadata is implemented using a metadata that refers to +itself to avoid merging it with any other identifier metadata, e.g., +during module linkage or function inlining. That is, each loop should refer +to their own identification metadata even if they reside in separate functions. +The following example contains loop identifier metadata for two separate loop +constructs: + +.. code-block:: llvm + + !0 = metadata !{ metadata !0 } + !1 = metadata !{ metadata !1 } + + +'``llvm.loop.parallel``' Metadata +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +This loop metadata can be used to communicate that a loop should be considered +a parallel loop. The semantics of parallel loops in this case is the one +with the strongest cross-iteration instruction ordering freedom: the +iterations in the loop can be considered completely independent of each +other (also known as embarrassingly parallel loops). + +This metadata can originate from a programming language with parallel loop +constructs. In such a case it is completely the programmer's responsibility +to ensure the instructions from the different iterations of the loop can be +executed in an arbitrary order, in parallel, or intertwined. No loop-carried +dependency checking at all must be expected from the compiler. + +In order to fulfill the LLVM requirement for metadata to be safely ignored, +it is important to ensure that a parallel loop is converted to +a sequential loop in case an optimization (agnostic of the parallel loop +semantics) converts the loop back to such. This happens when new memory +accesses that do not fulfill the requirement of free ordering across iterations +are added to the loop. Therefore, this metadata is required, but not +sufficient, to consider the loop at hand a parallel loop. For a loop +to be parallel, all its memory accessing instructions need to be +marked with the ``llvm.mem.parallel_loop_access`` metadata that refer +to the same loop identifier metadata that identify the loop at hand. + +'``llvm.mem``' +^^^^^^^^^^^^^^^ + +Metadata types used to annotate memory accesses with information helpful +for optimizations are prefixed with ``llvm.mem``. + +'``llvm.mem.parallel_loop_access``' Metadata +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +For a loop to be parallel, in addition to using +the ``llvm.loop.parallel`` metadata to mark the loop latch branch instruction, +also all of the memory accessing instructions in the loop body need to be +marked with the ``llvm.mem.parallel_loop_access`` metadata. If there +is at least one memory accessing instruction not marked with the metadata, +the loop, despite it possibly using the ``llvm.loop.parallel`` metadata, +must be considered a sequential loop. This causes parallel loops to be +converted to sequential loops due to optimization passes that are unaware of +the parallel semantics and that insert new memory instructions to the loop +body. + +Example of a loop that is considered parallel due to its correct use of +both ``llvm.loop.parallel`` and ``llvm.mem.parallel_loop_access`` +metadata types that refer to the same loop identifier metadata. + +.. code-block:: llvm + + for.body: + ... + %0 = load i32* %arrayidx, align 4, !llvm.mem.parallel_loop_access !0 + ... + store i32 %0, i32* %arrayidx4, align 4, !llvm.mem.parallel_loop_access !0 + ... + br i1 %exitcond, label %for.end, label %for.body, !llvm.loop.parallel !0 + + for.end: + ... + !0 = metadata !{ metadata !0 } + +It is also possible to have nested parallel loops. 
+In that case the memory accesses refer to a list of loop identifier metadata
+nodes instead of the loop identifier metadata node directly:
+
+.. code-block:: llvm
+
+   outer.for.body:
+     ...
+
+   inner.for.body:
+     ...
+     %0 = load i32* %arrayidx, align 4, !llvm.mem.parallel_loop_access !0
+     ...
+     store i32 %0, i32* %arrayidx4, align 4, !llvm.mem.parallel_loop_access !0
+     ...
+     br i1 %exitcond, label %inner.for.end, label %inner.for.body, !llvm.loop.parallel !1
+
+   inner.for.end:
+     ...
+     %1 = load i32* %arrayidx, align 4, !llvm.mem.parallel_loop_access !0
+     ...
+     store i32 %1, i32* %arrayidx4, align 4, !llvm.mem.parallel_loop_access !0
+     ...
+     br i1 %exitcond, label %outer.for.end, label %outer.for.body, !llvm.loop.parallel !2
+
+   outer.for.end:                                          ; preds = %inner.for.end
+     ...
+   !0 = metadata !{ metadata !1, metadata !2 } ; a list of parallel loop identifiers
+   !1 = metadata !{ metadata !1 } ; an identifier for the inner parallel loop
+   !2 = metadata !{ metadata !2 } ; an identifier for the outer parallel loop
+
+
 Module Flags Metadata
 =====================
 
 Information about the module as a whole is difficult to convey to LLVM's
 subsystems. The LLVM IR isn't sufficient to transmit this
 information. The ``llvm.module.flags`` named metadata exists in order to
 facilitate
-this. These flags are in the form of key / value pairs — much like a
-dictionary — making it easy for any subsystem who cares about a flag to
+this. These flags are in the form of key / value pairs --- much like a
+dictionary --- making it easy for any subsystem that cares about a flag to
 look it up.
 
 The ``llvm.module.flags`` metadata contains a list of metadata triplets.
@@ -2462,14 +2663,16 @@ Each triplet has the following form:
   (or more) metadata with the same ID. The supported behaviors are described
   below.
 - The second element is a metadata string that is a unique ID for the
-  metadata. How each ID is interpreted is documented below.
+  metadata. Each module may only have one flag entry for each unique ID (not
+  including entries with the **Require** behavior).
 - The third element is the value of the flag.
 
 When two (or more) modules are merged together, the resulting
-``llvm.module.flags`` metadata is the union of the modules'
-``llvm.module.flags`` metadata. The only exception being a flag with the
-*Override* behavior, which may override another flag's value (see
-below).
+``llvm.module.flags`` metadata is the union of the modules' flags. That is, for
+each unique metadata ID string, there will be exactly one entry in the merged
+module's ``llvm.module.flags`` metadata table, and the value for that entry will
+be determined by the merge behavior flag, as described below. The only exception
+is that entries with the *Require* behavior are always preserved.
 
 The following behaviors are supported:
 
@@ -2482,25 +2685,43 @@ The following behaviors are supported:
 
   * - 1
     - **Error**
-      Emits an error if two values disagree. It is an error to have an
-      ID with both an Error and a Warning behavior.
+      Emits an error if two values disagree; otherwise the resulting value
+      is that of the operands.
 
   * - 2
     - **Warning**
-      Emits a warning if two values disagree.
+      Emits a warning if two values disagree. The result value will be the
+      operand for the flag from the first module being linked.
 
   * - 3
     - **Require**
-      Emits an error when the specified value is not present or doesn't
-      have the specified value. It is an error for two (or more)
-      ``llvm.module.flags`` with the same ID to have the Require behavior
-      but different values.
There may be multiple Require flags per ID. + Adds a requirement that another module flag be present and have a + specified value after linking is performed. The value must be a + metadata pair, where the first element of the pair is the ID of the + module flag to be restricted, and the second element of the pair is + the value the module flag should be restricted to. This behavior can + be used to restrict the allowable results (via triggering of an + error) of linking IDs with the **Override** behavior. * - 4 - **Override** - Uses the specified value if the two values disagree. It is an - error for two (or more) ``llvm.module.flags`` with the same ID - to have the Override behavior but different values. + Uses the specified value, regardless of the behavior or value of the + other module. If both modules specify **Override**, but the values + differ, an error will be emitted. + + * - 5 + - **Append** + Appends the two values, which are required to be metadata nodes. + + * - 6 + - **AppendUnique** + Appends the two values, which are required to be metadata + nodes. However, duplicate entries in the second list are dropped + during the append operation. + +It is an error for a particular unique flag ID to have multiple behaviors, +except in the case of **Require** (which adds restrictions on another metadata +value) or **Override**. An example of module flags: @@ -2522,7 +2743,7 @@ An example of module flags: - Metadata ``!1`` has the ID ``!"bar"`` and the value '37'. The behavior if two or more ``!"bar"`` flags are seen is to use the value - '37' if their values are not equal. + '37'. - Metadata ``!2`` has the ID ``!"qux"`` and the value '42'. The behavior if two or more ``!"qux"`` flags are seen is to emit a @@ -2534,10 +2755,9 @@ An example of module flags: metadata !{ metadata !"foo", i32 1 } - The behavior is to emit an error if the ``llvm.module.flags`` does - not contain a flag with the ID ``!"foo"`` that has the value '1'. If - two or more ``!"qux"`` flags exist, then they must have the same - value or an error will be issued. + The behavior is to emit an error if the ``llvm.module.flags`` does not + contain a flag with the ID ``!"foo"`` that has the value '1' after linking is + performed. Objective-C Garbage Collection Module Flags Metadata ---------------------------------------------------- @@ -2559,26 +2779,26 @@ following key-value pairs: * - Key - Value - * - ``Objective-C Version`` - - **[Required]** — The Objective-C ABI version. Valid values are 1 and 2. + * - ``Objective-C Version`` + - **[Required]** --- The Objective-C ABI version. Valid values are 1 and 2. - * - ``Objective-C Image Info Version`` - - **[Required]** — The version of the image info section. Currently + * - ``Objective-C Image Info Version`` + - **[Required]** --- The version of the image info section. Currently always 0. - * - ``Objective-C Image Info Section`` - - **[Required]** — The section to place the metadata. Valid values are + * - ``Objective-C Image Info Section`` + - **[Required]** --- The section to place the metadata. Valid values are ``"__OBJC, __image_info, regular"`` for Objective-C ABI version 1, and ``"__DATA,__objc_imageinfo, regular, no_dead_strip"`` for Objective-C ABI version 2. - * - ``Objective-C Garbage Collection`` - - **[Required]** — Specifies whether garbage collection is supported or + * - ``Objective-C Garbage Collection`` + - **[Required]** --- Specifies whether garbage collection is supported or not. 
Valid values are 0, for no garbage collection, and 2, for garbage collection supported. - * - ``Objective-C GC Only`` - - **[Optional]** — Specifies that only garbage collection is supported. + * - ``Objective-C GC Only`` + - **[Optional]** --- Specifies that only garbage collection is supported. If present, its value must be 6. This flag requires that the ``Objective-C Garbage Collection`` flag have the value 2. @@ -2591,6 +2811,40 @@ Some important flag interactions: - A module with ``Objective-C Garbage Collection`` set to 0 cannot be merged with a module with ``Objective-C GC Only`` set to 6. +Automatic Linker Flags Module Flags Metadata +-------------------------------------------- + +Some targets support embedding flags to the linker inside individual object +files. Typically this is used in conjunction with language extensions which +allow source files to explicitly declare the libraries they depend on, and have +these automatically be transmitted to the linker via object files. + +These flags are encoded in the IR using metadata in the module flags section, +using the ``Linker Options`` key. The merge behavior for this flag is required +to be ``AppendUnique``, and the value for the key is expected to be a metadata +node which should be a list of other metadata nodes, each of which should be a +list of metadata strings defining linker options. + +For example, the following metadata section specifies two separate sets of +linker options, presumably to link against ``libz`` and the ``Cocoa`` +framework:: + + !0 = metadata !{ i32 6, metadata !"Linker Options", + metadata !{ + metadata !{ metadata !"-lz" }, + metadata !{ metadata !"-framework", metadata !"Cocoa" } } } + !llvm.module.flags = !{ !0 } + +The metadata encoding as lists of lists of options, as opposed to a collapsed +list of options, is chosen so that the IR encoding can use multiple option +strings to specify e.g., a single library, while still having that specifier be +preserved as an atomic element that can be recognized by a target specific +assembly writer or object file emitter. + +Each individual option is required to be either a valid option for the target's +linker, or an option that is reserved by the target specific assembly writer or +object file emitter. No other aspect of these options is defined by the IR. + Intrinsic Global Variables ========================== @@ -5777,7 +6031,7 @@ Overview: The '``landingpad``' instruction is used by `LLVM's exception handling system <ExceptionHandling.html#overview>`_ to specify that a basic block -is a landing pad — one where the exception lands, and corresponds to the +is a landing pad --- one where the exception lands, and corresponds to the code found in the ``catch`` portion of a ``try``/``catch`` sequence. It defines values supplied by the personality function (``pers_fn``) upon re-entry to the function. The ``resultval`` has the type ``resultty``. @@ -5789,7 +6043,7 @@ This instruction takes a ``pers_fn`` value. This is the personality function associated with the unwinding mechanism. The optional ``cleanup`` flag indicates that the landing pad block is a cleanup. -A ``clause`` begins with the clause type — ``catch`` or ``filter`` — and +A ``clause`` begins with the clause type --- ``catch`` or ``filter`` --- and contains the global variable representing the "type" that may be caught or filtered respectively. Unlike the ``catch`` clause, the ``filter`` clause takes an array constant as its argument. 
Use @@ -7388,7 +7642,7 @@ Semantics: """""""""" The '``llvm.sadd.with.overflow``' family of intrinsic functions perform -a signed addition of the two variables. They return a structure — the +a signed addition of the two variables. They return a structure --- the first element of which is the signed summation, and the second element of which is a bit specifying if the signed summation resulted in an overflow. @@ -7438,7 +7692,7 @@ Semantics: """""""""" The '``llvm.uadd.with.overflow``' family of intrinsic functions perform -an unsigned addition of the two arguments. They return a structure — the +an unsigned addition of the two arguments. They return a structure --- the first element of which is the sum, and the second element of which is a bit specifying if the unsigned summation resulted in a carry. @@ -7487,7 +7741,7 @@ Semantics: """""""""" The '``llvm.ssub.with.overflow``' family of intrinsic functions perform -a signed subtraction of the two arguments. They return a structure — the +a signed subtraction of the two arguments. They return a structure --- the first element of which is the subtraction, and the second element of which is a bit specifying if the signed subtraction resulted in an overflow. @@ -7537,7 +7791,7 @@ Semantics: """""""""" The '``llvm.usub.with.overflow``' family of intrinsic functions perform -an unsigned subtraction of the two arguments. They return a structure — +an unsigned subtraction of the two arguments. They return a structure --- the first element of which is the subtraction, and the second element of which is a bit specifying if the unsigned subtraction resulted in an overflow. @@ -7587,7 +7841,7 @@ Semantics: """""""""" The '``llvm.smul.with.overflow``' family of intrinsic functions perform -a signed multiplication of the two arguments. They return a structure — +a signed multiplication of the two arguments. They return a structure --- the first element of which is the multiplication, and the second element of which is a bit specifying if the signed multiplication resulted in an overflow. @@ -7637,8 +7891,8 @@ Semantics: """""""""" The '``llvm.umul.with.overflow``' family of intrinsic functions perform -an unsigned multiplication of the two arguments. They return a structure -— the first element of which is the multiplication, and the second +an unsigned multiplication of the two arguments. They return a structure --- +the first element of which is the multiplication, and the second element of which is a bit specifying if the unsigned multiplication resulted in an overflow. @@ -7670,8 +7924,10 @@ Overview: """"""""" The '``llvm.fmuladd.*``' intrinsic functions represent multiply-add -expressions that can be fused if the code generator determines that the -fused expression would be legal and efficient. +expressions that can be fused if the code generator determines that (a) the +target instruction set has support for a fused operation, and (b) that the +fused operation is more efficient than the equivalent, separate pair of mul +and add instructions. Arguments: """""""""" diff --git a/docs/Lexicon.rst b/docs/Lexicon.rst index cbe1585..11f1341 100644 --- a/docs/Lexicon.rst +++ b/docs/Lexicon.rst @@ -1,5 +1,3 @@ -.. _lexicon: - ================ The LLVM Lexicon ================ @@ -17,6 +15,21 @@ A **ADCE** Aggressive Dead Code Elimination +**AST** + Abstract Syntax Tree. 
+ + Due to Clang's influence (mostly the fact that parsing and semantic + analysis are so intertwined for C and especially C++), the typical + working definition of AST in the LLVM community is roughly "the + compiler's first complete symbolic (as opposed to textual) + representation of an input program". + As such, an "AST" might be a more general graph instead of a "tree" + (consider the symbolic representation for the type of a typical "linked + list node"). This working definition is closer to what some authors + call an "annotated abstract syntax tree". + + Consult your favorite compiler book or search engine for more details. + B - diff --git a/docs/LinkTimeOptimization.rst b/docs/LinkTimeOptimization.rst index 822196c..c15abd3 100644 --- a/docs/LinkTimeOptimization.rst +++ b/docs/LinkTimeOptimization.rst @@ -1,5 +1,3 @@ -.. _lto: - ====================================================== LLVM Link Time Optimization: Design and Implementation ====================================================== diff --git a/docs/MakefileGuide.rst b/docs/MakefileGuide.rst index 4988fe6..3e90907 100644 --- a/docs/MakefileGuide.rst +++ b/docs/MakefileGuide.rst @@ -1,5 +1,3 @@ -.. _makefile_guide: - =================== LLVM Makefile Guide =================== @@ -348,9 +346,9 @@ details. This target should be implemented by the ``Makefile`` in the project's ``test`` directory. It is invoked by the ``check`` target elsewhere. Each project is free to define the actions of ``check-local`` as appropriate for that -project. The LLVM project itself uses dejagnu to run a suite of feature and -regression tests. Other projects may choose to use dejagnu or any other testing -mechanism. +project. The LLVM project itself uses the :doc:`Lit <CommandGuide/lit>` testing +tool to run a suite of feature and regression tests. Other projects may choose +to use :program:`lit` or any other testing mechanism. ``clean`` --------- @@ -701,6 +699,9 @@ The override variables are given below: ``CFLAGS`` Additional flags to be passed to the 'C' compiler. +``CPPFLAGS`` + Additional flags passed to the C/C++ preprocessor. + ``CXX`` Specifies the path to the C++ compiler. diff --git a/docs/MarkedUpDisassembly.rst b/docs/MarkedUpDisassembly.rst index e1282e1..cc4dbc8 100644 --- a/docs/MarkedUpDisassembly.rst +++ b/docs/MarkedUpDisassembly.rst @@ -1,5 +1,3 @@ -.. _marked_up_disassembly: - ======================================= LLVM's Optional Rich Disassembly Output ======================================= diff --git a/docs/Packaging.rst b/docs/Packaging.rst index 6e74158..7c2dc95 100644 --- a/docs/Packaging.rst +++ b/docs/Packaging.rst @@ -1,5 +1,3 @@ -.. _packaging: - ======================== Advice on Packaging LLVM ======================== diff --git a/docs/Phabricator.rst b/docs/Phabricator.rst index b454497..efab10c 100644 --- a/docs/Phabricator.rst +++ b/docs/Phabricator.rst @@ -88,6 +88,12 @@ diffs between different versions of the patch as it was reviewed in the *Revision Update History*. Most features are self descriptive - explore, and if you have a question, drop by on #llvm in IRC to get help. +Note that as e-mail is the system of reference for code reviews, and some +people prefer it over a web interface, we do not generate automated mail +when a review changes state, for example by clicking "Accept Revision" in +the web interface. Thus, please type LGTM into the comment box to accept +a change from Phabricator. 
+ Status ------ diff --git a/docs/ProgrammersManual.rst b/docs/ProgrammersManual.rst index 2b272de..4fc4597 100644 --- a/docs/ProgrammersManual.rst +++ b/docs/ProgrammersManual.rst @@ -6,14 +6,7 @@ LLVM Programmer's Manual :local: .. warning:: - This is a work in progress. - -.. sectionauthor:: Chris Lattner <sabre@nondot.org>, - Dinakar Dhurjati <dhurjati@cs.uiuc.edu>, - Gabor Greif <ggreif@gmail.com>, - Joel Stanley <jstanley@cs.uiuc.edu>, - Reid Spencer <rspencer@x10sys.com> and - Owen Anderson <owen@apple.com> + This is always a work in progress. .. _introduction: @@ -84,8 +77,8 @@ Here are some useful links: (even better, get the book) <http://www.mindview.net/Books/TICPP/ThinkingInCPP2e.html>`_. -You are also encouraged to take a look at the :ref:`LLVM Coding Standards -<coding_standards>` guide which focuses on how to write maintainable code more +You are also encouraged to take a look at the :doc:`LLVM Coding Standards +<CodingStandards>` guide which focuses on how to write maintainable code more than where to put your curly braces. .. _resources: @@ -185,8 +178,8 @@ rarely have to include this file directly). These five templates can be used with any classes, whether they have a v-table or not. If you want to add support for these templates, see the document -:ref:`How to set up LLVM-style RTTI for your class hierarchy -<how-to-set-up-llvm-style-rtti>` +:doc:`How to set up LLVM-style RTTI for your class hierarchy +<HowToSetUpLLVMStyleRTTI>` .. _string_apis: @@ -1059,6 +1052,22 @@ SparseSet is useful for algorithms that need very fast clear/find/insert/erase and fast iteration over small sets. It is not intended for building composite data structures. +.. _dss_sparsemultiset: + +llvm/ADT/SparseMultiSet.h +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +SparseMultiSet adds multiset behavior to SparseSet, while retaining SparseSet's +desirable attributes. Like SparseSet, it typically uses a lot of memory, but +provides operations that are almost as fast as a vector. Typical keys are +physical registers, virtual registers, or numbered basic blocks. + +SparseMultiSet is useful for algorithms that need very fast +clear/find/insert/erase of the entire collection, and iteration over sets of +elements sharing a key. It is often a more efficient choice than using composite +data structures (e.g. vector-of-vectors, map-of-vectors). It is not intended for +building composite data structures. + .. _dss_FoldingSet: llvm/ADT/FoldingSet.h @@ -2256,13 +2265,13 @@ accomplished by the following scheme: A bit-encoding in the 2 LSBits (least significant bits) of the ``Use::Prev`` allows to find the start of the ``User`` object: -* ``00`` –> binary digit 0 +* ``00`` --- binary digit 0 -* ``01`` –> binary digit 1 +* ``01`` --- binary digit 1 -* ``10`` –> stop and calculate (``s``) +* ``10`` --- stop and calculate (``s``) -* ``11`` –> full stop (``S``) +* ``11`` --- full stop (``S``) Given a ``Use*``, all we have to do is to walk till we get a stop and we either have a ``User`` immediately behind or we have to walk to the next stop picking diff --git a/docs/Projects.rst b/docs/Projects.rst index c5d03d3..3246e3f 100644 --- a/docs/Projects.rst +++ b/docs/Projects.rst @@ -1,5 +1,3 @@ -.. _projects: - ======================== Creating an LLVM Project ======================== @@ -153,12 +151,10 @@ Underneath your top level directory, you should have the following directories: Currently, the LLVM build system provides basic support for tests. 
The LLVM system provides the following: -* LLVM provides a ``tcl`` procedure that is used by ``Dejagnu`` to run tests. - It can be found in ``llvm/lib/llvm-dg.exp``. This test procedure uses ``RUN`` +* LLVM contains regression tests in ``llvm/test``. These tests are run by the + :doc:`Lit <CommandGuide/lit>` testing tool. This test procedure uses ``RUN`` lines in the actual test case to determine how to run the test. See the - :doc:`TestingGuide` for more details. You can easily write Makefile - support similar to the Makefiles in ``llvm/test`` to use ``Dejagnu`` to - run your project's tests. + :doc:`TestingGuide` for more details. * LLVM contains an optional package called ``llvm-test``, which provides benchmarks and programs that are known to compile with the Clang front diff --git a/docs/ReleaseNotes.rst b/docs/ReleaseNotes.rst index 2de4ebb..9383c5b 100644 --- a/docs/ReleaseNotes.rst +++ b/docs/ReleaseNotes.rst @@ -1,27 +1,21 @@ -.. raw:: html - - <style> .red {color:red} </style> - -.. role:: red - ====================== -LLVM 3.2 Release Notes +LLVM 3.3 Release Notes ====================== .. contents:: :local: -Written by the `LLVM Team <http://llvm.org/>`_ +.. warning:: + These are in-progress notes for the upcoming LLVM 3.3 release. You may + prefer the `LLVM 3.2 Release Notes <http://llvm.org/releases/3.2/docs + /ReleaseNotes.html>`_. -:red:`These are in-progress notes for the upcoming LLVM 3.2 release. You may -prefer the` `LLVM 3.1 Release Notes <http://llvm.org/releases/3.1/docs -/ReleaseNotes.html>`_. Introduction ============ This document contains the release notes for the LLVM Compiler Infrastructure, -release 3.2. Here we describe the status of LLVM, including major improvements +release 3.3. Here we describe the status of LLVM, including major improvements from the previous release, improvements in various subprojects of LLVM, and some of the current users of the code. All LLVM releases may be downloaded from the `LLVM releases web site <http://llvm.org/releases/>`_. @@ -37,504 +31,90 @@ LLVM web page, this document applies to the *next* release, not the current one. To see the release notes for a specific release, please see the `releases page <http://llvm.org/releases/>`_. -Sub-project Status Update -========================= - -The LLVM 3.2 distribution currently consists of code from the core LLVM -repository, which roughly includes the LLVM optimizers, code generators and -supporting tools, and the Clang repository. In addition to this code, the LLVM -Project includes other sub-projects that are in development. Here we include -updates on these subprojects. - -Clang: C/C++/Objective-C Frontend Toolkit ------------------------------------------ - -`Clang <http://clang.llvm.org/>`_ is an LLVM front end for the C, C++, and -Objective-C languages. Clang aims to provide a better user experience through -expressive diagnostics, a high level of conformance to language standards, fast -compilation, and low memory use. Like LLVM, Clang provides a modular, -library-based architecture that makes it suitable for creating or integrating -with other development tools. Clang is considered a production-quality -compiler for C, Objective-C, C++ and Objective-C++ on x86 (32- and 64-bit), and -for Darwin/ARM targets. - -In the LLVM 3.2 time-frame, the Clang team has made many improvements. -Highlights include: - -#. More powerful warnings, especially `-Wuninitialized` -#. Template type diffing in diagnostic messages -#. 
Higher quality and more efficient debug info generation - -For more details about the changes to Clang since the 3.1 release, see the -`Clang release notes. <http://clang.llvm.org/docs/ReleaseNotes.html>`_ - -If Clang rejects your code but another compiler accepts it, please take a look -at the `language compatibility <http://clang.llvm.org/compatibility.html>`_ -guide to make sure this is not intentional or a known issue. - -DragonEgg: GCC front-ends, LLVM back-end ----------------------------------------- - -`DragonEgg <http://dragonegg.llvm.org/>`_ is a `gcc plugin -<http://gcc.gnu.org/wiki/plugins>`_ that replaces GCC's optimizers and code -generators with LLVM's. It works with gcc-4.5 and gcc-4.6 (and partially with -gcc-4.7), can target the x86-32/x86-64 and ARM processor families, and has been -successfully used on the Darwin, FreeBSD, KFreeBSD, Linux and OpenBSD -platforms. It fully supports Ada, C, C++ and Fortran. It has partial support -for Go, Java, Obj-C and Obj-C++. - -The 3.2 release has the following notable changes: - -#. Able to load LLVM plugins such as Polly. -#. Supports thread-local storage models. -#. Passes knowledge of variable lifetimes to the LLVM optimizers. -#. No longer requires GCC to be built with LTO support. - -compiler-rt: Compiler Runtime Library -------------------------------------- - -The new LLVM `compiler-rt project <http://compiler-rt.llvm.org/>`_ is a simple -library that provides an implementation of the low-level target-specific hooks -required by code generation and other runtime components. For example, when -compiling for a 32-bit target, converting a double to a 64-bit unsigned integer -is compiled into a runtime call to the ``__fixunsdfdi`` function. The -``compiler-rt`` library provides highly optimized implementations of this and -other low-level routines (some are 3x faster than the equivalent libgcc -routines). - -The 3.2 release has the following notable changes: - -#. ... - -LLDB: Low Level Debugger ------------------------- - -`LLDB <http://lldb.llvm.org>`_ is a ground-up implementation of a command line -debugger, as well as a debugger API that can be used from other applications. -LLDB makes use of the Clang parser to provide high-fidelity expression parsing -(particularly for C++) and uses the LLVM JIT for target support. - -The 3.2 release has the following notable changes: - -#. ... - -libc++: C++ Standard Library ----------------------------- +Non-comprehensive list of changes in this release +================================================= -Like compiler_rt, libc++ is now :ref:`dual licensed -<copyright-license-patents>` under the MIT and UIUC license, allowing it to be -used more permissively. +.. NOTE + For small 1-3 sentence descriptions, just add an entry at the end of + this list. If your description won't fit comfortably in one bullet + point (e.g. maybe you would like to give an example of the + functionality, or simply have a lot to talk about), see the `NOTE` below + for adding a new subsection. -Within the LLVM 3.2 time-frame there were the following highlights: +* The CellSPU port has been removed. It can still be found in older versions. -#. ... +* The IR-level extended linker APIs (for example, to link bitcode files out of + archives) have been removed. Any existing clients of these features should + move to using a linker with integrated LTO support. 
-VMKit ------ +* LLVM and Clang's documentation has been migrated to the `Sphinx + <http://sphinx-doc.org/>`_ documentation generation system which uses + easy-to-write reStructuredText. See `llvm/docs/README.txt` for more + information. -The `VMKit project <http://vmkit.llvm.org/>`_ is an implementation of a Java -Virtual Machine (Java VM or JVM) that uses LLVM for static and just-in-time -compilation. +* TargetTransformInfo (TTI) is a new interface that can be used by IR-level + passes to obtain target-specific information, such as the costs of + instructions. Only "Lowering" passes such as LSR and the vectorizer are + allowed to use the TTI infrastructure. -The 3.2 release has the following notable changes: +* We've improved the X86 and ARM cost model. -#. ... +* The Attributes classes have been completely rewritten and expanded. They now + support not only enumerated attributes and alignments, but "string" + attributes, which are useful for passing information to code generation. See + :doc:`HowToUseAttributes` for more details. -Polly: Polyhedral Optimizer ---------------------------- +* ... next change ... -`Polly <http://polly.llvm.org/>`_ is an *experimental* optimizer for data -locality and parallelism. It provides high-level loop optimizations and -automatic parallelisation. +.. NOTE + If you would like to document a larger change, then you can add a + subsection about it right here. You can copy the following boilerplate + and un-indent it (the indentation causes it to be inside this comment). -Within the LLVM 3.2 time-frame there were the following highlights: + Special New Feature + ------------------- -#. isl, the integer set library used by Polly, was relicensed to the MIT license -#. isl based code generation -#. MIT licensed replacement for CLooG (LGPLv2) -#. Fine grained option handling (separation of core and border computations, - control overhead vs. code size) -#. Support for FORTRAN and dragonegg -#. OpenMP code generation fixes + Makes programs 10x faster by doing Special New Thing. -External Open Source Projects Using LLVM 3.2 -============================================ +AArch64 target +-------------- -An exciting aspect of LLVM is that it is used as an enabling technology for a -lot of other language and tools projects. This section lists some of the -projects that have already been updated to work with LLVM 3.2. +We've added support for AArch64, ARM's 64-bit architecture. Development is still +in fairly early stages, but we expect successful compilation when: -Crack ------ +- compiling standard compliant C99 and C++03 with Clang; +- using Linux as a target platform; +- where code + static data doesn't exceed 4GB in size (heap allocated data has + no limitation). -`Crack <http://code.google.com/p/crack-language/>`_ aims to provide the ease of -development of a scripting language with the performance of a compiled -language. The language derives concepts from C++, Java and Python, -incorporating object-oriented programming, operator overloading and strong -typing. +Some additional functionality is also implemented, notably DWARF debugging, +GNU-style thread local storage and inline assembly. -FAUST ------ - -`FAUST <http://faust.grame.fr/>`_ is a compiled language for real-time audio -signal processing. The name FAUST stands for Functional AUdio STream. Its -programming model combines two approaches: functional programming and block -diagram composition. 
In addition with the C, C++, Java, JavaScript output -formats, the Faust compiler can generate LLVM bitcode, and works with LLVM -2.7-3.1. - -Glasgow Haskell Compiler (GHC) ------------------------------- - -`GHC <http://www.haskell.org/ghc/>`_ is an open source compiler and programming -suite for Haskell, a lazy functional programming language. It includes an -optimizing static compiler generating good code for a variety of platforms, -together with an interactive system for convenient, quick development. - -GHC 7.0 and onwards include an LLVM code generator, supporting LLVM 2.8 and -later. - -Julia ------ - -`Julia <https://github.com/JuliaLang/julia>`_ is a high-level, high-performance -dynamic language for technical computing. It provides a sophisticated -compiler, distributed parallel execution, numerical accuracy, and an extensive -mathematical function library. The compiler uses type inference to generate -fast code without any type declarations, and uses LLVM's optimization passes -and JIT compiler. The `Julia Language <http://julialang.org/>`_ is designed -around multiple dispatch, giving programs a large degree of flexibility. It is -ready for use on many kinds of problems. - -LLVM D Compiler +Loop Vectorizer --------------- -`LLVM D Compiler <https://github.com/ldc-developers/ldc>`_ (LDC) is a compiler -for the D programming Language. It is based on the DMD frontend and uses LLVM -as backend. - -Open Shading Language ---------------------- - -`Open Shading Language (OSL) -<https://github.com/imageworks/OpenShadingLanguage/>`_ is a small but rich -language for programmable shading in advanced global illumination renderers and -other applications, ideal for describing materials, lights, displacement, and -pattern generation. It uses LLVM to JIT complex shader networks to x86 code at -runtime. - -OSL was developed by Sony Pictures Imageworks for use in its in-house renderer -used for feature film animation and visual effects, and is distributed as open -source software with the "New BSD" license. - -Portable OpenCL (pocl) ----------------------- - -In addition to producing an easily portable open source OpenCL implementation, -another major goal of `pocl <http://pocl.sourceforge.net/>`_ is improving -performance portability of OpenCL programs with compiler optimizations, -reducing the need for target-dependent manual optimizations. An important part -of pocl is a set of LLVM passes used to statically parallelize multiple -work-items with the kernel compiler, even in the presence of work-group -barriers. This enables static parallelization of the fine-grained static -concurrency in the work groups in multiple ways (SIMD, VLIW, superscalar, ...). - -Pure ----- - -`Pure <http://pure-lang.googlecode.com/>`_ is an algebraic/functional -programming language based on term rewriting. Programs are collections of -equations which are used to evaluate expressions in a symbolic fashion. The -interpreter uses LLVM as a backend to JIT-compile Pure programs to fast native -code. Pure offers dynamic typing, eager and lazy evaluation, lexical closures, -a hygienic macro system (also based on term rewriting), built-in list and -matrix support (including list and matrix comprehensions) and an easy-to-use -interface to C and other programming languages (including the ability to load -LLVM bitcode modules, and inline C, C++, Fortran and Faust code in Pure -programs if the corresponding LLVM-enabled compilers are installed). 
- -Pure version 0.54 has been tested and is known to work with LLVM 3.1 (and -continues to work with older LLVM releases >= 2.5). - -TTA-based Co-design Environment (TCE) -------------------------------------- - -`TCE <http://tce.cs.tut.fi/>`_ is a toolset for designing application-specific -processors (ASP) based on the Transport triggered architecture (TTA). The -toolset provides a complete co-design flow from C/C++ programs down to -synthesizable VHDL/Verilog and parallel program binaries. Processor -customization points include the register files, function units, supported -operations, and the interconnection network. - -TCE uses Clang and LLVM for C/C++ language support, target independent -optimizations and also for parts of code generation. It generates new -LLVM-based code generators "on the fly" for the designed TTA processors and -loads them in to the compiler backend as runtime libraries to avoid per-target -recompilation of larger parts of the compiler chain. - -Installation Instructions -========================= - -See :doc:`GettingStarted`. - -What's New in LLVM 3.2? -======================= - -This release includes a huge number of bug fixes, performance tweaks and minor -improvements. Some of the major improvements and new features are listed in -this section. - -Major New Features ------------------- - -.. - - Features that need text if they're finished for 3.2: - ARM EHABI - combiner-aa? - strong phi elim - loop dependence analysis - CorrelatedValuePropagation - Integrated assembler on by default for arm/thumb? - - Near dead: - Analysis/RegionInfo.h + Dom Frontiers - SparseBitVector: used in LiveVar. - llvm/lib/Archive - replace with lib object? - - -LLVM 3.2 includes several major changes and big features: - -#. New NVPTX back-end (replacing existing PTX back-end) based on NVIDIA sources -#. ... - -LLVM IR and Core Improvements ------------------------------ - -LLVM IR has several new features for better support of new targets and that -expose new optimization opportunities: - -#. Thread local variables may have a specified TLS model. See the :ref:`Language - Reference Manual <globalvars>`. -#. ... - -Optimizer Improvements ----------------------- - -In addition to many minor performance tweaks and bug fixes, this release -includes a few major enhancements and additions to the optimizers: - -Loop Vectorizer - We've added a loop vectorizer and we are now able to -vectorize small loops. The loop vectorizer is disabled by default and can be -enabled using the ``-mllvm -vectorize-loops`` flag. The SIMD vector width can -be specified using the flag ``-mllvm -force-vector-width=4``. The default -value is ``0`` which means auto-select. - -We can now vectorize this function: - -.. code-block:: c++ - - unsigned sum_arrays(int *A, int *B, int start, int end) { - unsigned sum = 0; - for (int i = start; i < end; ++i) - sum += A[i] + B[i] + i; - return sum; - } - -We vectorize under the following loops: - -#. The inner most loops must have a single basic block. -#. The number of iterations are known before the loop starts to execute. -#. The loop counter needs to be incremented by one. -#. The loop trip count **can** be a variable. -#. Loops do **not** need to start at zero. -#. The induction variable can be used inside the loop. -#. Loop reductions are supported. -#. Arrays with affine access pattern do **not** need to be marked as - '``noalias``' and are checked at runtime. -#. ... 
- -SROA - We've re-written SROA to be significantly more powerful and generate -code which is much more friendly to the rest of the optimization pipeline. -Previously this pass had scaling problems that required it to only operate on -relatively small aggregates, and at times it would mistakenly replace a large -aggregate with a single very large integer in order to make it a scalar SSA -value. The result was a large number of i1024 and i2048 values representing any -small stack buffer. These in turn slowed down many subsequent optimization -paths. - -The new SROA pass uses a different algorithm that allows it to only promote to -scalars the pieces of the aggregate actively in use. Because of this it doesn't -require any thresholds. It also always deduces the scalar values from the uses -of the aggregate rather than the specific LLVM type of the aggregate. These -features combine to both optimize more code with the pass but to improve the -compile time of many functions dramatically. - -#. Branch weight metadata is preseved through more of the optimizer. -#. ... - -MC Level Improvements ---------------------- - -The LLVM Machine Code (aka MC) subsystem was created to solve a number of -problems in the realm of assembly, disassembly, object file format handling, -and a number of other related areas that CPU instruction-set level tools work -in. For more information, please see the `Intro to the LLVM MC Project Blog -Post <http://blog.llvm.org/2010/04/intro-to-llvm-mc-project.html>`_. - -#. ... - -.. _codegen: - -Target Independent Code Generator Improvements ----------------------------------------------- - -We have put a significant amount of work into the code generator -infrastructure, which allows us to implement more aggressive algorithms and -make it run faster: - -#. ... - -Stack Coloring - We have implemented a new optimization pass to merge stack -objects which are used in disjoin areas of the code. This optimization reduces -the required stack space significantly, in cases where it is clear to the -optimizer that the stack slot is not shared. We use the lifetime markers to -tell the codegen that a certain alloca is used within a region. - -We now merge consecutive loads and stores. - -X86-32 and X86-64 Target Improvements -------------------------------------- - -New features and major changes in the X86 target include: - -#. ... - -.. _ARM: - -ARM Target Improvements ------------------------ - -New features of the ARM target include: - -#. ... - -.. _armintegratedassembler: - -MIPS Target Improvements ------------------------- - -New features and major changes in the MIPS target include: - -#. ... - -PowerPC Target Improvements ---------------------------- - -Many fixes and changes across LLVM (and Clang) for better compliance with the -64-bit PowerPC ELF Application Binary Interface, interoperability with GCC, and -overall 64-bit PowerPC support. Some highlights include: - -#. MCJIT support added. -#. PPC64 relocation support and (small code model) TOC handling added. -#. Parameter passing and return value fixes (alignment issues, padding, varargs - support, proper register usage, odd-sized structure support, float support, - extension of return values for i32 return values). -#. Fixes in spill and reload code for vector registers. -#. C++ exception handling enabled. -#. Changes to remediate double-rounding compatibility issues with respect to - GCC behavior. -#. Refactoring to disentangle ``ppc64-elf-linux`` ABI from Darwin ppc64 ABI - support. -#. 
Assorted new test cases and test case fixes (endian and word size issues). -#. Fixes for big-endian codegen bugs, instruction encodings, and instruction - constraints. -#. Implemented ``-integrated-as`` support. -#. Additional support for Altivec compare operations. -#. IBM long double support. - -There have also been code generation improvements for both 32- and 64-bit code. -Instruction scheduling support for the Freescale e500mc and e5500 cores has -been added. - -PTX/NVPTX Target Improvements ------------------------------ - -The PTX back-end has been replaced by the NVPTX back-end, which is based on the -LLVM back-end used by NVIDIA in their CUDA (nvcc) and OpenCL compiler. Some -highlights include: - -#. Compatibility with PTX 3.1 and SM 3.5. -#. Support for NVVM intrinsics as defined in the NVIDIA Compiler SDK. -#. Full compatibility with old PTX back-end, with much greater coverage of LLVM - SIR. - -Please submit any back-end bugs to the LLVM Bugzilla site. - -Other Target Specific Improvements ----------------------------------- - -#. ... - -Major Changes and Removed Features ----------------------------------- - -If you're already an LLVM user or developer with out-of-tree changes based on -LLVM 3.2, this section lists some "gotchas" that you may run into upgrading -from the previous release. - -#. The CellSPU port has been removed. It can still be found in older versions. -#. ... - -Internal API Changes --------------------- - -In addition, many APIs have changed in this release. Some of the major LLVM -API changes are: - -We've added a new interface for allowing IR-level passes to access -target-specific information. A new IR-level pass, called -``TargetTransformInfo`` provides a number of low-level interfaces. LSR and -LowerInvoke already use the new interface. - -The ``TargetData`` structure has been renamed to ``DataLayout`` and moved to -``VMCore`` to remove a dependency on ``Target``. - -#. ... - -Tools Changes -------------- - -In addition, some tools have changed in this release. Some of the changes are: - -#. ... - -Python Bindings ---------------- - -Officially supported Python bindings have been added! Feature support is far -from complete. The current bindings support interfaces to: - -#. ... +We've continued the work on the loop vectorizer. The loop vectorizer now +has the following features: -Known Problems -============== +- Loops with unknown trip count. +- Runtime checks of pointers +- Reductions, Inductions +- If Conversion +- Pointer induction variables +- Reverse iterators +- Vectorization of mixed types +- Vectorization of function calls +- Partial unrolling during vectorization -LLVM is generally a production quality compiler, and is used by a broad range -of applications and shipping in many products. That said, not every subsystem -is as mature as the aggregate, particularly the more obscure1 targets. If you -run into a problem, please check the `LLVM bug database -<http://llvm.org/bugs/>`_ and submit a bug if there isn't already one or ask on -the `LLVMdev list <http://lists.cs.uiuc.edu/mailman/listinfo/llvmdev>`_. +R600 Backend +------------ -Known problem areas include: +The R600 backend was added in this release, it supports AMD GPUs +(HD2XXX - HD7XXX). This backend is used in AMD's Open Source +graphics / compute drivers which are developed as part of the `Mesa3D +<http://www.mesa3d.org>`_ project. -#. The MSP430 and XCore backends are experimental. -#. The integrated assembler, disassembler, and JIT is not supported by several - targets. 
If an integrated assembler is not supported, then a system - assembler is required. For more details, see the - :ref:`target-feature-matrix`. Additional Information ====================== diff --git a/docs/SegmentedStacks.rst b/docs/SegmentedStacks.rst index f97d62a..e44ce423 100644 --- a/docs/SegmentedStacks.rst +++ b/docs/SegmentedStacks.rst @@ -1,5 +1,3 @@ -.. _segmented_stacks: - ======================== Segmented Stacks in LLVM ======================== diff --git a/docs/SourceLevelDebugging.rst b/docs/SourceLevelDebugging.rst index 781824c..78ce4e0 100644 --- a/docs/SourceLevelDebugging.rst +++ b/docs/SourceLevelDebugging.rst @@ -2,8 +2,6 @@ Source Level Debugging with LLVM ================================ -.. sectionauthor:: Chris Lattner <sabre@nondot.org> and Jim Laskey <jlaskey@mac.com> - .. contents:: :local: @@ -300,7 +298,6 @@ Subprogram descriptors metadata, ;; Reference to type descriptor i1, ;; True if the global is local to compile unit (static) i1, ;; True if the global is defined in the compile unit (not extern) - i32, ;; Line number where the scope of the subprogram begins i32, ;; Virtuality, e.g. dwarf::DW_VIRTUALITY__virtual i32, ;; Index into a virtual function metadata, ;; indicates which base type contains the vtable pointer for the @@ -310,7 +307,8 @@ Subprogram descriptors Function * , ;; Pointer to LLVM function metadata, ;; Lists function template parameters metadata, ;; Function declaration descriptor - metadata ;; List of function variables + metadata, ;; List of function variables + i32 ;; Line number where the scope of the subprogram begins } These descriptors provide debug information about functions, methods and diff --git a/docs/SphinxQuickstartTemplate.rst b/docs/SphinxQuickstartTemplate.rst index b0002ba..8f6d71e 100644 --- a/docs/SphinxQuickstartTemplate.rst +++ b/docs/SphinxQuickstartTemplate.rst @@ -2,8 +2,6 @@ Sphinx Quickstart Template ========================== -.. sectionauthor:: Sean Silva <silvas@purdue.edu> - Introduction and Quickstart =========================== @@ -65,7 +63,7 @@ Your text can be *emphasized*, **bold**, or ``monospace``. Use blank lines to separate paragraphs. -Headings (like ``Example Section`` just above) give your document +Headings (like ``Example Section`` just above) give your document its structure. Use the same kind of adornments (e.g. ``======`` vs. ``------``) as are used in this document. The adornment must be the same length as the text above it. For Vim users, variations of ``yypVr=`` might be handy. @@ -86,7 +84,7 @@ Lists can be made like this: #. This is a second list element. - #. They nest too. + #. Use indentation to create nested lists. You can also use unordered lists. @@ -104,7 +102,7 @@ You can make blocks of code like this: .. code-block:: c++ int main() { - return 0 + return 0; } For a shell session, use a ``console`` code block (some existing docs use diff --git a/docs/SystemLibrary.rst b/docs/SystemLibrary.rst index 88404f4..0d0f4fa 100644 --- a/docs/SystemLibrary.rst +++ b/docs/SystemLibrary.rst @@ -2,8 +2,6 @@ System Library ============== -.. sectionauthor:: Reid Spencer <rspencer@x10sys.com> - Abstract ======== diff --git a/docs/TableGen/LangRef.rst b/docs/TableGen/LangRef.rst index 34098a0..c9e1efb 100644 --- a/docs/TableGen/LangRef.rst +++ b/docs/TableGen/LangRef.rst @@ -74,6 +74,11 @@ TableGen also has two string-like literals: TokString: '"' <non-'"' characters and C-like escapes> '"' TokCodeFragment: "[{" <shortest text not containing "}]"> "}]" +.. 
note:: + The current implementation accepts the following C-like escapes:: + + \\ \' \" \t \n + TableGen also has the following keywords:: bit bits class code dag @@ -81,11 +86,13 @@ TableGen also has the following keywords:: int let list multiclass string TableGen also has "bang operators" which have a -wide variety of meanings:: +wide variety of meanings: - !eq !if !head !tail !con - !shl !sra !srl - !cast !empty !subst !foreach !strconcat +.. productionlist:: + BangOperator: one of + :!eq !if !head !tail !con + :!add !shl !sra !srl + :!cast !empty !subst !foreach !strconcat Syntax ====== @@ -291,14 +298,14 @@ Bodies .. productionlist:: ObjectBody: `BaseClassList` `Body` - BaseClassList: [`BaseClassListNE`] + BaseClassList: [":" `BaseClassListNE`] BaseClassListNE: `SubClassRef` ("," `SubClassRef`)* - SubClassRef: (`ClassID` | `DefmID`) ["<" `ValueList` ">"] + SubClassRef: (`ClassID` | `MultiClassID`) ["<" `ValueList` ">"] DefmID: `TokIdentifier` -The version with the :token:`DefmID` is only valid in the +The version with the :token:`MultiClassID` is only valid in the :token:`BaseClassList` of a ``defm``. -The :token:`DefmID` should be the name of a ``multiclass``. +The :token:`MultiClassID` should be the name of a ``multiclass``. .. put this somewhere else @@ -336,7 +343,7 @@ a ``foreach``. -------- .. productionlist:: - Defm: "defm" `TokIdentifier` ":" `BaseClassList` ";" + Defm: "defm" `TokIdentifier` ":" `BaseClassListNE` ";" Note that in the :token:`BaseClassList`, all of the ``multiclass``'s must precede any ``class``'s that appear. @@ -370,6 +377,7 @@ applied at the end of parsing the base classes of a record. .. productionlist:: MultiClass: "multiclass" `TokIdentifier` [`TemplateArgList`] - : [":" `BaseMultiClassList`] "{" `MultiClassDef`+ "}" + : [":" `BaseMultiClassList`] "{" `MultiClassObject`+ "}" BaseMultiClassList: `MultiClassID` ("," `MultiClassID`)* MultiClassID: `TokIdentifier` + MultiClassObject: `Def` | `Defm` | `Let` | `Foreach` diff --git a/docs/TableGenFundamentals.rst b/docs/TableGenFundamentals.rst index 73bcd66..4fe4bb9 100644 --- a/docs/TableGenFundamentals.rst +++ b/docs/TableGenFundamentals.rst @@ -1,5 +1,3 @@ -.. _tablegen: - ===================== TableGen Fundamentals ===================== @@ -793,6 +791,10 @@ Expressions used by code generator to describe instructions and isel patterns: TableGen backends ================= +Until we get a step-by-step HowTo for writing TableGen backends, you can at +least grab the boilerplate (build system, new files, etc.) from Clang's +r173931. + TODO: How they work, how to write one. This section should not contain details about any particular backend, except maybe ``-print-enums`` as an example. This should highlight the APIs in ``TableGen/Record.h``. diff --git a/docs/TestSuiteMakefileGuide.rst b/docs/TestSuiteMakefileGuide.rst index b10379e..e2852a0 100644 --- a/docs/TestSuiteMakefileGuide.rst +++ b/docs/TestSuiteMakefileGuide.rst @@ -2,9 +2,6 @@ LLVM test-suite Makefile Guide ============================== -Written by John T. Criswell, Daniel Dunbar, Reid Spencer, and Tanya -Lattner - .. contents:: :local: diff --git a/docs/TestingGuide.rst b/docs/TestingGuide.rst index f26d1bf..4d8c8ce 100644 --- a/docs/TestingGuide.rst +++ b/docs/TestingGuide.rst @@ -2,9 +2,6 @@ LLVM Testing Infrastructure Guide ================================= -Written by John T. Criswell, Daniel Dunbar, Reid Spencer, and Tanya -Lattner - .. contents:: :local: @@ -234,33 +231,21 @@ what you can use in yours. 
The major differences are: - You can't do ``2>&1``. That will cause :program:`lit` to write to a file named ``&1``. Usually this is done to get stderr to go through a pipe. You can do that with ``|&`` so replace this idiom: - ``... 2>&1 | grep`` with ``... |& grep`` + ``... 2>&1 | FileCheck`` with ``... |& FileCheck`` - You can only redirect to a file, not to another descriptor and not from a here document. There are some quoting rules that you must pay attention to when writing your RUN lines. In general nothing needs to be quoted. :program:`lit` won't strip off any quote characters so they will get passed to the invoked program. -For example: - -.. code-block:: bash - - ... | grep 'find this string' - -This will fail because the ``'`` characters are passed to ``grep``. This would -make ``grep`` to look for ``'find`` in the files ``this`` and -``string'``. To avoid this use curly braces to tell :program:`lit` that it -should treat everything enclosed as one value. So our example would become: - -.. code-block:: bash - - ... | grep {find this string} +To avoid this use curly braces to tell :program:`lit` that it should treat +everything enclosed as one value. In general, you should strive to keep your RUN lines as simple as possible, -using them only to run tools that generate the output you can then examine. The -recommended way to examine output to figure out if the test passes it using the -:doc:`FileCheck tool <CommandGuide/FileCheck>`. The usage of ``grep`` in RUN -lines is discouraged. +using them only to run tools that generate textual output you can then examine. +The recommended way to examine output to figure out if the test passes it using +the :doc:`FileCheck tool <CommandGuide/FileCheck>`. *[The usage of grep in RUN +lines is deprecated - please do not send or commit patches that use it.]* Fragile tests ------------- @@ -299,24 +284,6 @@ This test will fail if placed into a ``download`` directory. To make your tests robust, always use ``opt ... < %s`` in the RUN line. :program:`opt` does not output a ``ModuleID`` when input comes from stdin. -The FileCheck utility ---------------------- - -A powerful feature of the RUN lines is that it allows any arbitrary -commands to be executed as part of the test harness. While standard -(portable) unix tools like ``grep`` work fine on run lines, as you see -above, there are a lot of caveats due to interaction with shell syntax, -and we want to make sure the run lines are portable to a wide range of -systems. Another major problem is that ``grep`` is not very good at checking -to verify that the output of a tools contains a series of different -output in a specific order. The :program:`FileCheck` tool was designed to -help with these problems. - -:program:`FileCheck` is designed to read a file to check from standard input, -and the set of things to verify from a file specified as a command line -argument. :program:`FileCheck` is described in :doc:`the FileCheck man page -<CommandGuide/FileCheck>`. - Variables and substitutions --------------------------- diff --git a/docs/Vectorizers.rst b/docs/Vectorizers.rst index 0748634..0894b1e 100644 --- a/docs/Vectorizers.rst +++ b/docs/Vectorizers.rst @@ -206,6 +206,25 @@ vectorization is profitable. A[i] += 4 * B[i]; } +Global Structures Alias Analysis +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Access to global structures can also be vectorized, with alias analysis being +used to make sure accesses don't alias. Run-time checks can also be added on +pointer access to structure members. 
+
+Many variations are supported, but some that rely on undefined behaviour
+being ignored (as other compilers do) are currently left un-vectorized.
+
+.. code-block:: c++
+
+  struct { int A[100], K, B[100]; } Foo;
+
+  void foo() {
+    for (int i = 0; i < 100; ++i)
+      Foo.A[i] = Foo.B[i] + 100;
+  }
+
 
 Vectorization of function calls
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
diff --git a/docs/WritingAnLLVMBackend.rst b/docs/WritingAnLLVMBackend.rst
index 868ca20..6d6c2a1 100644
--- a/docs/WritingAnLLVMBackend.rst
+++ b/docs/WritingAnLLVMBackend.rst
@@ -7,8 +7,6 @@ Writing an LLVM Compiler Backend
 
    HowToUseInstrMappings
 
-.. sectionauthor:: Mason Woo <http://www.woo.com> and Misha Brukman <http://misha.brukman.net>
-
 .. contents::
    :local:
 
diff --git a/docs/WritingAnLLVMPass.rst b/docs/WritingAnLLVMPass.rst
index db47fef..b10d98f 100644
--- a/docs/WritingAnLLVMPass.rst
+++ b/docs/WritingAnLLVMPass.rst
@@ -5,9 +5,6 @@ Writing an LLVM Pass
 .. contents::
     :local:
 
-Written by `Chris Lattner <mailto:sabre@nondot.org>`_ and
-`Jim Laskey <mailto:jlaskey@mac.com>`_
-
 Introduction --- What is a pass?
 ================================
 
diff --git a/docs/YamlIO.rst b/docs/YamlIO.rst
index b009b67..ac50292 100644
--- a/docs/YamlIO.rst
+++ b/docs/YamlIO.rst
@@ -1,5 +1,3 @@
-.. _yamlio:
-
 =====================
 YAML I/O
 =====================
@@ -87,13 +85,13 @@ locations, making it hard for a human to write such YAML correctly.
 
 In relational database theory there is a design step called normalization
 in which you reorganize fields and tables. The same considerations need
 to go into the design of your YAML encoding. But, you may not want to change
-your exisiting native data structures. Therefore, when writing out YAML
+your existing native data structures. Therefore, when writing out YAML
 there may be a normalization step, and when reading YAML there would be a
 corresponding denormalization step.
 
 YAML I/O uses a non-invasive, traits based design. YAML I/O defines some
 abstract base templates. You specialize those templates on your data types.
-For instance, if you have an eumerated type FooBar you could specialize
+For instance, if you have an enumerated type FooBar you could specialize
 ScalarEnumerationTraits on that type and define the enumeration() method:
 
 .. code-block:: c++
 
@@ -115,17 +113,17 @@ values and the YAML string representation is only in place.
 This assures that the code for writing and parsing of YAML stays in sync.
 
 To specify a YAML mapping, you define a specialization on
-llvm::yaml::MapppingTraits.
+llvm::yaml::MappingTraits.
 If your native data structure happens to be a struct that is already normalized,
 then the specialization is simple. For example:
 
 .. code-block:: c++
 
-   using llvm::yaml::MapppingTraits;
+   using llvm::yaml::MappingTraits;
    using llvm::yaml::IO;
 
    template <>
-   struct MapppingTraits<Person> {
+   struct MappingTraits<Person> {
     static void mapping(IO &io, Person &info) {
       io.mapRequired("name", info.name);
       io.mapOptional("hat-size", info.hatSize);
@@ -133,7 +131,7 @@ then the specialization is simple. For example:
    };
 
 
-A YAML sequence is automatically infered if you data type has begin()/end()
+A YAML sequence is automatically inferred if your data type has begin()/end()
 iterators and a push_back() method. Therefore any of the STL containers
 (such as std::vector<>) will automatically translate to YAML sequences.
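+
+As a hedged sketch of how these pieces might fit together (the ``Person``
+struct, the ``writePeople`` function, and the use of ``llvm::yaml::Output``
+here are illustrative assumptions, not taken verbatim from the YAML I/O
+documentation), a vector of mapped structs can be emitted as a YAML sequence
+using the vector macro described under "Utility Macros" below:
+
+.. code-block:: c++
+
+   #include "llvm/Support/YAMLTraits.h"
+   #include "llvm/Support/raw_ostream.h"
+   #include <string>
+   #include <vector>
+
+   struct Person {                  // assumed example type
+     std::string name;
+   };
+
+   namespace llvm {
+   namespace yaml {
+   template <>
+   struct MappingTraits<Person> {
+     static void mapping(IO &io, Person &info) {
+       io.mapRequired("name", info.name);
+     }
+   };
+   } // namespace yaml
+   } // namespace llvm
+
+   // Gives std::vector<Person> SequenceTraits, as described below.
+   LLVM_YAML_IS_SEQUENCE_VECTOR(Person)
+
+   // Each Person becomes one element of the resulting YAML sequence.
+   void writePeople(std::vector<Person> &people) {
+     llvm::yaml::Output yout(llvm::outs());
+     yout << people;
+   }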
@@ -245,7 +243,7 @@ The following types have built-in support in YAML I/O: * uint16_t * uint8_t -That is, you can use those types in fields of MapppingTraits or as element type +That is, you can use those types in fields of MappingTraits or as element type in sequence. When reading, YAML I/O will validate that the string found is convertible to that type and error out if not. @@ -313,7 +311,7 @@ as a field type: .. code-block:: c++ using llvm::yaml::ScalarEnumerationTraits; - using llvm::yaml::MapppingTraits; + using llvm::yaml::MappingTraits; using llvm::yaml::IO; template <> @@ -326,7 +324,7 @@ as a field type: }; template <> - struct MapppingTraits<Info> { + struct MappingTraits<Info> { static void mapping(IO &io, Info &info) { io.mapRequired("cpu", info.cpu); io.mapOptional("flags", info.flags, 0); @@ -363,7 +361,7 @@ on MyFlags and provide the bit values and their names. .. code-block:: c++ using llvm::yaml::ScalarBitSetTraits; - using llvm::yaml::MapppingTraits; + using llvm::yaml::MappingTraits; using llvm::yaml::IO; template <> @@ -382,7 +380,7 @@ on MyFlags and provide the bit values and their names. }; template <> - struct MapppingTraits<Info> { + struct MappingTraits<Info> { static void mapping(IO &io, Info& info) { io.mapRequired("name", info.name); io.mapRequired("flags", info.flags); @@ -436,18 +434,18 @@ Mappings ======== To be translated to or from a YAML mapping for your type T you must specialize -llvm::yaml::MapppingTraits on T and implement the "void mapping(IO &io, T&)" +llvm::yaml::MappingTraits on T and implement the "void mapping(IO &io, T&)" method. If your native data structures use pointers to a class everywhere, you can specialize on the class pointer. Examples: .. code-block:: c++ - using llvm::yaml::MapppingTraits; + using llvm::yaml::MappingTraits; using llvm::yaml::IO; // Example of struct Foo which is used by value template <> - struct MapppingTraits<Foo> { + struct MappingTraits<Foo> { static void mapping(IO &io, Foo &foo) { io.mapOptional("size", foo.size); ... @@ -456,7 +454,7 @@ you can specialize on the class pointer. Examples: // Example of struct Bar which is natively always a pointer template <> - struct MapppingTraits<Bar*> { + struct MappingTraits<Bar*> { static void mapping(IO &io, Bar *&bar) { io.mapOptional("size", bar->size); ... @@ -474,11 +472,11 @@ bind the struct's fields to YAML key names. For example: .. code-block:: c++ - using llvm::yaml::MapppingTraits; + using llvm::yaml::MappingTraits; using llvm::yaml::IO; template <> - struct MapppingTraits<Person> { + struct MappingTraits<Person> { static void mapping(IO &io, Person &info) { io.mapRequired("name", info.name); io.mapOptional("hat-size", info.hatSize); @@ -513,17 +511,17 @@ is, you want the yaml to look like: x: 10.3 y: -4.7 -You can support this by defining a MapppingTraits that normalizes the polar +You can support this by defining a MappingTraits that normalizes the polar coordinates to x,y coordinates when writing YAML and denormalizes x,y -coordindates into polar when reading YAML. +coordinates into polar when reading YAML. .. code-block:: c++ - using llvm::yaml::MapppingTraits; + using llvm::yaml::MappingTraits; using llvm::yaml::IO; template <> - struct MapppingTraits<Polar> { + struct MappingTraits<Polar> { class NormalizedPolar { public: @@ -568,7 +566,7 @@ could be returned by the denormalize() method, except that the temporary normalized instance is stack allocated. In these cases, the utility template MappingNormalizationHeap<> can be used instead. 
It is just like MappingNormalization<> except that it heap allocates the normalized object
-when reading YAML. It never destroyes the normalized object. The denormalize()
+when reading YAML. It never destroys the normalized object. The denormalize()
method can then return "this".

@@ -614,7 +612,7 @@ This works for both reading and writing. For example:

.. code-block:: c++

-   using llvm::yaml::MapppingTraits;
+   using llvm::yaml::MappingTraits;
   using llvm::yaml::IO;

   struct Info {
     CPUs      cpu;
     MyFlags   flags;
   };

   template <>
-   struct MapppingTraits<Info> {
+   struct MappingTraits<Info> {
     static void mapping(IO &io, Info &info) {
       io.mapRequired("cpu", info.cpu);
       // flags must come after cpu for this to work when reading yaml
@@ -640,8 +638,8 @@ Sequence
========

To be translated to or from a YAML sequence for your type T you must specialize
llvm::yaml::SequenceTraits on T and implement two methods:
-“size_t size(IO &io, T&)” and “T::value_type& element(IO &io, T&, size_t indx)”.
-For example:
+``size_t size(IO &io, T&)`` and
+``T::value_type& element(IO &io, T&, size_t indx)``. For example:

.. code-block:: c++

@@ -678,13 +676,13 @@ add "static const bool flow = true;". For instance:

   };

With the above, if you used MyList as the data type in your native data
-strucutures, then then when converted to YAML, a flow sequence of integers
+structures, then when converted to YAML, a flow sequence of integers
will be used (e.g. [ 10, -3, 4 ]).

Utility Macros
--------------

-Since a common source of sequences is std::vector<>, YAML I/O provids macros:
+Since a common source of sequences is std::vector<>, YAML I/O provides macros:
LLVM_YAML_IS_SEQUENCE_VECTOR() and LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR() which
can be used to easily specify SequenceTraits<> on a std::vector type. YAML
I/O does not partially specialize SequenceTraits on std::vector<> because that
diff --git a/docs/design_and_overview.rst b/docs/design_and_overview.rst
deleted file mode 100644
index 4b1d627..0000000
--- a/docs/design_and_overview.rst
+++ /dev/null
@@ -1,37 +0,0 @@
-.. _design_and_overview:
-
-LLVM Design & Overview
-======================
-
-.. toctree::
-   :hidden:
-
-   LangRef
-   GetElementPtr
-
-* :doc:`LangRef`
-
-  Defines the LLVM intermediate representation.
-
-* `Introduction to the LLVM Compiler <http://llvm.org/pubs/2008-10-04-ACAT-LLVM-Intro.html>`_
-
-  Presentation providing a users introduction to LLVM.
-
-* `Intro to LLVM <http://www.aosabook.org/en/llvm.html>`_
-
-  Book chapter providing a compiler hacker's introduction to LLVM.
-
-* `LLVM: A Compilation Framework for Lifelong Program Analysis & Transformation
-  <http://llvm.org/pubs/2004-01-30-CGO-LLVM.html>`_
-
-  Design overview.
-
-* `LLVM: An Infrastructure for Multi-Stage Optimization
-  <http://llvm.org/pubs/2002-12-LattnerMSThesis.html>`_
-
-  More details (quite old now).
-
-* :ref:`gep`
-
-  Answers to some very frequent questions about LLVM's most frequently
-  misunderstood instruction.
diff --git a/docs/development_process.rst b/docs/development_process.rst
deleted file mode 100644
index ecd4c6a..0000000
--- a/docs/development_process.rst
+++ /dev/null
@@ -1,32 +0,0 @@
-.. _development_process:
-
-Development Process Documentation
-=================================
-
-.. toctree::
-   :hidden:
-
-   MakefileGuide
-   Projects
-   LLVMBuild
-   HowToReleaseLLVM
-
-* :ref:`projects`
-
-  How-to guide and templates for new projects that *use* the LLVM
-  infrastructure.
-  The templates (directory organization, Makefiles, and test
-  tree) allow the project code to be located outside (or inside) the ``llvm/``
-  tree, while using LLVM header files and libraries.
-
-* :doc:`LLVMBuild`
-
-  Describes the LLVMBuild organization and files used by LLVM to specify
-  component descriptions.
-
-* :ref:`makefile_guide`
-
-  Describes how the LLVM makefiles work and how to use them.
-
-* :doc:`HowToReleaseLLVM`
-
-  This is a guide to preparing LLVM releases. Most developers can ignore it.
diff --git a/docs/gcc-loops.png b/docs/gcc-loops.png
Binary files differ
index 6606bfd..8923a31 100644
--- a/docs/gcc-loops.png
+++ b/docs/gcc-loops.png
diff --git a/docs/index.rst b/docs/index.rst
index d406b52..8f22ef2 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -1,5 +1,3 @@
-.. _contents:
-
Overview
========

@@ -15,54 +13,382 @@ research projects.
Similarly, documentation is broken down into several high-level groupings
targeted at different audiences:

-* **Design & Overview**
+LLVM Design & Overview
+======================
+
+Several introductory papers and presentations.
+
+.. toctree::
+   :hidden:
+
+   LangRef
+   GetElementPtr
+
+:doc:`LangRef`
+  Defines the LLVM intermediate representation.
+
+`Introduction to the LLVM Compiler`__
+  Presentation providing a user's introduction to LLVM.
+
+  .. __: http://llvm.org/pubs/2008-10-04-ACAT-LLVM-Intro.html
+
+`Intro to LLVM`__
+  Book chapter providing a compiler hacker's introduction to LLVM.
+
+  .. __: http://www.aosabook.org/en/llvm.html
+
+
+`LLVM: A Compilation Framework for Lifelong Program Analysis & Transformation`__
+  Design overview.
+
+  .. __: http://llvm.org/pubs/2004-01-30-CGO-LLVM.html
+
+`LLVM: An Infrastructure for Multi-Stage Optimization`__
+  More details (quite old now).
+
+  .. __: http://llvm.org/pubs/2002-12-LattnerMSThesis.html
+
+:doc:`GetElementPtr`
+  Answers to some very frequent questions about LLVM's most frequently
+  misunderstood instruction.
+
+`Publications mentioning LLVM <http://llvm.org/pubs>`_
+   ..
+
+User Guides
+===========
+
+For those new to the LLVM system.
+
+NOTE: If you are a user who is only interested in using LLVM-based
+compilers, you should look into `Clang <http://clang.llvm.org>`_ or
+`DragonEgg <http://dragonegg.llvm.org>`_ instead. The documentation here is
+intended for users who have a need to work with the intermediate LLVM
+representation.
+
+.. toctree::
+   :hidden:
+
+   CMake
+   HowToBuildOnARM
+   CommandGuide/index
+   DeveloperPolicy
+   GettingStarted
+   GettingStartedVS
+   FAQ
+   Lexicon
+   HowToAddABuilder
+   yaml2obj
+   HowToSubmitABug
+   SphinxQuickstartTemplate
+   Phabricator
+   TestingGuide
+   tutorial/index
+   ReleaseNotes
+   Passes
+   YamlIO
+
+:doc:`GettingStarted`
+  Discusses how to get up and running quickly with the LLVM infrastructure.
+  Everything from unpacking and compilation of the distribution to execution
+  of some tools.
+
+:doc:`CMake`
+  An addendum to the main Getting Started guide for those using the `CMake
+  build system <http://www.cmake.org>`_.
+
+:doc:`HowToBuildOnARM`
+  Notes on building and testing LLVM/Clang on ARM.
+
+:doc:`GettingStartedVS`
+  An addendum to the main Getting Started guide for those using Visual Studio
+  on Windows.
+
+:doc:`tutorial/index`
+  Tutorials about using LLVM. Includes a tutorial about making a custom
+  language with LLVM.
+
+:doc:`DeveloperPolicy`
+  The LLVM project's policy towards developers and their contributions.
+
+:doc:`LLVM Command Guide <CommandGuide/index>`
+  A reference manual for the LLVM command line utilities ("man" pages for LLVM
+  tools).
+
+:doc:`Passes`
+  A list of optimizations and analyses implemented in LLVM.
+
+:doc:`FAQ`
+  A list of common questions and problems and their solutions.
+
+:doc:`Release notes for the current release <ReleaseNotes>`
+  This describes new features, known bugs, and other limitations.
+
+:doc:`HowToSubmitABug`
+  Instructions for properly submitting information about any bugs you run into
+  in the LLVM system.
+
+:doc:`SphinxQuickstartTemplate`
+  A template + tutorial for writing new Sphinx documentation. It is meant
+  to be read in source form.
+
+:doc:`LLVM Testing Infrastructure Guide <TestingGuide>`
+  A reference manual for using the LLVM testing infrastructure.
+
+`How to build the C, C++, ObjC, and ObjC++ front end`__
+  Instructions for building the clang front-end from source.
+
+  .. __: http://clang.llvm.org/get_started.html
+
+:doc:`Lexicon`
+  Definition of acronyms, terms and concepts used in LLVM.
+
+:doc:`HowToAddABuilder`
+  Instructions for adding a new builder to the LLVM buildbot master.
+
+:doc:`YamlIO`
+  A reference guide for using LLVM's YAML I/O library.
+
+IRC
+===
+
+Users and developers of the LLVM project (including subprojects such as Clang)
+can be found in #llvm on `irc.oftc.net <irc://irc.oftc.net/llvm>`_.
+
+This channel has several bots.
+
+* Buildbot reporters
+
+  * llvmbb - Bot for the main LLVM buildbot master.
+    http://lab.llvm.org:8011/console
+  * bb-chapuni - An individually run buildbot master. http://bb.pgr.jp/console
+  * smooshlab - Apple's internal buildbot master.
+
+* robot - Bugzilla linker. %bug <number>
-  Several introductory papers and presentations are available at
-  :ref:`design_and_overview`.
+* clang-bot - A `geordi <http://www.eelis.net/geordi/>`_ instance running
+  near-trunk clang instead of gcc.
-* **Publications**
+Programming Documentation
+=========================
-  The list of `publications <http://llvm.org/pubs>`_ based on LLVM.
+For developers of applications which use LLVM as a library.
-* **User Guides**
+.. toctree::
+   :hidden:
+
+   Atomics
+   CodingStandards
+   CommandLine
+   CompilerWriterInfo
+   ExtendingLLVM
+   HowToSetUpLLVMStyleRTTI
+   ProgrammersManual
+
+:doc:`LLVM Language Reference Manual <LangRef>`
+  Defines the LLVM intermediate representation and the assembly form of the
+  different nodes.
-  Those new to the LLVM system should first visit the :ref:`userguides`.
+:doc:`Atomics`
+  Information about LLVM's concurrency model.
-  NOTE: If you are a user who is only interested in using LLVM-based
-  compilers, you should look into `Clang <http://clang.llvm.org>`_ or
-  `DragonEgg <http://dragonegg.llvm.org>`_ instead. The documentation here is
-  intended for users who have a need to work with the intermediate LLVM
-  representation.
+:doc:`ProgrammersManual`
+  Introduction to the general layout of the LLVM sourcebase, important classes
+  and APIs, and some tips & tricks.
-* **API Clients**
+:doc:`CommandLine`
+  Provides information on using the command line parsing library.
-  Developers of applications which use LLVM as a library should visit the
-  :ref:`programming`.
+:doc:`CodingStandards`
+  Details the LLVM coding standards and provides useful information on writing
+  efficient C++ code.
-* **Subsystems**
+:doc:`HowToSetUpLLVMStyleRTTI`
+  How to make ``isa<>``, ``dyn_cast<>``, etc. available for clients of your
+  class hierarchy.
-  API clients and LLVM developers may be interested in the
-  :ref:`subsystems` documentation.
+:doc:`ExtendingLLVM`
+  Look here to see how to add instructions and intrinsics to LLVM.
-* **Development Process**
+`Doxygen generated documentation <http://llvm.org/doxygen/>`_
+  (`classes <http://llvm.org/doxygen/inherits.html>`_)
+  (`tarball <http://llvm.org/doxygen/doxygen.tar.gz>`_)
-  Additional documentation on the LLVM project can be found at
-  :ref:`development_process`.
+`ViewVC Repository Browser <http://llvm.org/viewvc/>`_
+  ..
-* **Mailing Lists**
+:doc:`CompilerWriterInfo`
+  A list of helpful links for compiler writers.
-  For more information, consider consulting the LLVM :ref:`mailing_lists`.
+Subsystem Documentation
+=======================
+
+For API clients and LLVM developers.

.. toctree::
-   :maxdepth: 2
-
-   design_and_overview
-   userguides
-   programming
-   subsystems
-   development_process
-   mailing_lists
-
+   :hidden:
+
+   AliasAnalysis
+   BitCodeFormat
+   BranchWeightMetadata
+   Bugpoint
+   CodeGenerator
+   ExceptionHandling
+   LinkTimeOptimization
+   SegmentedStacks
+   TableGenFundamentals
+   DebuggingJITedCode
+   GoldPlugin
+   MarkedUpDisassembly
+   SystemLibrary
+   SourceLevelDebugging
+   Vectorizers
+   WritingAnLLVMBackend
+   GarbageCollection
+   WritingAnLLVMPass
+   TableGen/LangRef
+   HowToUseAttributes
+
+:doc:`WritingAnLLVMPass`
+  Information on how to write LLVM transformations and analyses (a minimal
+  pass is sketched at the end of this section).
+
+:doc:`WritingAnLLVMBackend`
+  Information on how to write LLVM backends for machine targets.
+
+:doc:`CodeGenerator`
+  The design and implementation of the LLVM code generator. Useful if you are
+  working on retargeting LLVM to a new architecture, designing a new codegen
+  pass, or enhancing existing components.
+
+:doc:`TableGenFundamentals`
+  Describes the TableGen tool, which is used heavily by the LLVM code
+  generator.
+
+:doc:`AliasAnalysis`
+  Information on how to write a new alias analysis implementation or how to
+  use existing analyses.
+
+:doc:`GarbageCollection`
+  The interfaces source-language compilers should use for compiling GC'd
+  programs.
+
+:doc:`Source Level Debugging with LLVM <SourceLevelDebugging>`
+  This document describes the design and philosophy behind the LLVM
+  source-level debugger.
+
+:doc:`Vectorizers`
+  This document describes the current status of vectorization in LLVM.
+
+:doc:`ExceptionHandling`
+  This document describes the design and implementation of exception handling
+  in LLVM.
+
+:doc:`Bugpoint`
+  Automatic bug finder and test-case reducer description and usage
+  information.
+
+:doc:`BitCodeFormat`
+  This describes the file format and encoding used for LLVM "bc" files.
+
+:doc:`System Library <SystemLibrary>`
+  This document describes the LLVM System Library (``lib/System``) and
+  how to keep LLVM source code portable.
+
+:doc:`LinkTimeOptimization`
+  This document describes the interface between the LLVM intermodular
+  optimizer and the linker, and its design.
+
+:doc:`GoldPlugin`
+  How to build your programs with link-time optimization on Linux.
+
+:doc:`DebuggingJITedCode`
+  How to debug JITed code with GDB.
+
+:doc:`BranchWeightMetadata`
+  Provides information about the branch weight metadata used to encode
+  branch prediction information.
+
+:doc:`SegmentedStacks`
+  This document describes segmented stacks and how they are used in LLVM.
+
+:doc:`MarkedUpDisassembly`
+  This document describes the optional rich disassembly output syntax.
+
+:doc:`HowToUseAttributes`
+  Answers some questions about the new Attributes infrastructure.
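+
+As a taste of what writing a pass involves, here is a minimal sketch in the
+spirit of the "Hello" example from :doc:`WritingAnLLVMPass` (see that document
+for the real walkthrough, build setup, and registration details):
+
+.. code-block:: c++
+
+  #include "llvm/Pass.h"
+  #include "llvm/IR/Function.h"
+  #include "llvm/Support/raw_ostream.h"
+
+  using namespace llvm;
+
+  namespace {
+  // A function pass that prints the name of every function it visits.
+  struct Hello : public FunctionPass {
+    static char ID; // Pass identification.
+    Hello() : FunctionPass(ID) {}
+
+    virtual bool runOnFunction(Function &F) {
+      errs() << "Hello: " << F.getName() << "\n";
+      return false; // The IR was not modified.
+    }
+  };
+  }
+
+  char Hello::ID = 0;
+  static RegisterPass<Hello> X("hello", "Hello World Pass");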
+
+Development Process Documentation
+=================================
+
+Information about LLVM's development process.
+
+.. toctree::
+   :hidden:
+
+   MakefileGuide
+   Projects
+   LLVMBuild
+   HowToReleaseLLVM
+   Packaging
+
+:doc:`Projects`
+  How-to guide and templates for new projects that *use* the LLVM
+  infrastructure. The templates (directory organization, Makefiles, and test
+  tree) allow the project code to be located outside (or inside) the ``llvm/``
+  tree, while using LLVM header files and libraries.
+
+:doc:`LLVMBuild`
+  Describes the LLVMBuild organization and files used by LLVM to specify
+  component descriptions.
+
+:doc:`MakefileGuide`
+  Describes how the LLVM makefiles work and how to use them.
+
+:doc:`HowToReleaseLLVM`
+  This is a guide to preparing LLVM releases. Most developers can ignore it.
+
+:doc:`Packaging`
+  Advice on packaging LLVM into a distribution.
+
+Mailing Lists
+=============
+
+If you can't find what you need in these docs, try consulting the mailing
+lists.
+
+`LLVM Announcements List`__
+  This is a low volume list that provides important announcements regarding
+  LLVM. It gets email about once a month.
+
+  .. __: http://lists.cs.uiuc.edu/mailman/listinfo/llvm-announce
+
+`Developer's List`__
+  This list is for people who want to be included in technical discussions of
+  LLVM. People post to this list when they have questions about writing code
+  for or using the LLVM tools. It is relatively low volume.
+
+  .. __: http://lists.cs.uiuc.edu/mailman/listinfo/llvmdev
+
+`Bugs & Patches Archive`__
+  This list gets emailed every time a bug is opened and closed, and when people
+  submit patches to be included in LLVM. It is higher volume than the LLVMdev
+  list.
+
+  .. __: http://lists.cs.uiuc.edu/pipermail/llvmbugs/
+
+`Commits Archive`__
+  This list contains all commit messages that are made when LLVM developers
+  commit code changes to the repository. It is useful for those who want to
+  stay on the bleeding edge of LLVM development. This list is very high volume.
+
+  .. __: http://lists.cs.uiuc.edu/pipermail/llvm-commits/
+
+`Test Results Archive`__
+  A message is automatically sent to this list by every active nightly tester
+  when it completes. As such, this list gets email several times each day,
+  making it a high volume list.
+
+  .. __: http://lists.cs.uiuc.edu/pipermail/llvm-testresults/
+
Indices and tables
==================

diff --git a/docs/linpack-pc.png b/docs/linpack-pc.png
Binary files differ
index 4294892..bbbee7d 100644
--- a/docs/linpack-pc.png
+++ b/docs/linpack-pc.png
diff --git a/docs/mailing_lists.rst b/docs/mailing_lists.rst
deleted file mode 100644
index 106f1da..0000000
--- a/docs/mailing_lists.rst
+++ /dev/null
@@ -1,35 +0,0 @@
-.. _mailing_lists:
-
-Mailing Lists
-=============
-
- * `LLVM Announcements List
-   <http://lists.cs.uiuc.edu/mailman/listinfo/llvm-announce>`_
-
-   This is a low volume list that provides important announcements regarding
-   LLVM. It gets email about once a month.
-
- * `Developer's List <http://lists.cs.uiuc.edu/mailman/listinfo/llvmdev>`_
-
-   This list is for people who want to be included in technical discussions of
-   LLVM. People post to this list when they have questions about writing code
-   for or using the LLVM tools. It is relatively low volume.
-
- * `Bugs & Patches Archive <http://lists.cs.uiuc.edu/pipermail/llvmbugs/>`_
-
-   This list gets emailed every time a bug is opened and closed, and when people
-   submit patches to be included in LLVM. It is higher volume than the LLVMdev
-   list.
- - * `Commits Archive <http://lists.cs.uiuc.edu/pipermail/llvm-commits/>`_ - - This list contains all commit messages that are made when LLVM developers - commit code changes to the repository. It is useful for those who want to - stay on the bleeding edge of LLVM development. This list is very high volume. - - * `Test Results Archive - <http://lists.cs.uiuc.edu/pipermail/llvm-testresults/>`_ - - A message is automatically sent to this list by every active nightly tester - when it completes. As such, this list gets email several times each day, - making it a high volume list. diff --git a/docs/programming.rst b/docs/programming.rst deleted file mode 100644 index 3fea6ed..0000000 --- a/docs/programming.rst +++ /dev/null @@ -1,58 +0,0 @@ -.. _programming: - -Programming Documentation -========================= - -.. toctree:: - :hidden: - - Atomics - CodingStandards - CommandLine - CompilerWriterInfo - ExtendingLLVM - HowToSetUpLLVMStyleRTTI - ProgrammersManual - -* `LLVM Language Reference Manual <LangRef.html>`_ - - Defines the LLVM intermediate representation and the assembly form of the - different nodes. - -* :ref:`atomics` - - Information about LLVM's concurrency model. - -* :doc:`ProgrammersManual` - - Introduction to the general layout of the LLVM sourcebase, important classes - and APIs, and some tips & tricks. - -* :ref:`commandline` - - Provides information on using the command line parsing library. - -* :ref:`coding_standards` - - Details the LLVM coding standards and provides useful information on writing - efficient C++ code. - -* :doc:`HowToSetUpLLVMStyleRTTI` - - How to make ``isa<>``, ``dyn_cast<>``, etc. available for clients of your - class hierarchy. - -* :ref:`extending_llvm` - - Look here to see how to add instructions and intrinsics to LLVM. - -* `Doxygen generated documentation <http://llvm.org/doxygen/>`_ - - (`classes <http://llvm.org/doxygen/inherits.html>`_) - (`tarball <http://llvm.org/doxygen/doxygen.tar.gz>`_) - -* `ViewVC Repository Browser <http://llvm.org/viewvc/>`_ - -* :ref:`compiler_writer_info` - - A list of helpful links for compiler writers. diff --git a/docs/subsystems.rst b/docs/subsystems.rst deleted file mode 100644 index 505c573..0000000 --- a/docs/subsystems.rst +++ /dev/null @@ -1,110 +0,0 @@ -.. _subsystems: - -Subsystem Documentation -======================= - -.. toctree:: - :hidden: - - AliasAnalysis - BitCodeFormat - BranchWeightMetadata - Bugpoint - CodeGenerator - ExceptionHandling - LinkTimeOptimization - SegmentedStacks - TableGenFundamentals - DebuggingJITedCode - GoldPlugin - MarkedUpDisassembly - SystemLibrary - SourceLevelDebugging - Vectorizers - WritingAnLLVMBackend - GarbageCollection - WritingAnLLVMPass - TableGen/LangRef - -* :doc:`WritingAnLLVMPass` - - Information on how to write LLVM transformations and analyses. - -* :doc:`WritingAnLLVMBackend` - - Information on how to write LLVM backends for machine targets. - -* :ref:`code_generator` - - The design and implementation of the LLVM code generator. Useful if you are - working on retargetting LLVM to a new architecture, designing a new codegen - pass, or enhancing existing components. - -* :ref:`tablegen` - - Describes the TableGen tool, which is used heavily by the LLVM code - generator. - -* :ref:`alias_analysis` - - Information on how to write a new alias analysis implementation or how to - use existing analyses. - -* :doc:`GarbageCollection` - - The interfaces source-language compilers should use for compiling GC'd - programs. 
- -* :doc:`Source Level Debugging with LLVM <SourceLevelDebugging>` - - This document describes the design and philosophy behind the LLVM - source-level debugger. - -* :doc:`Vectorizers` - - This document describes the current status of vectorization in LLVM. - -* :ref:`exception_handling` - - This document describes the design and implementation of exception handling - in LLVM. - -* :ref:`bugpoint` - - Automatic bug finder and test-case reducer description and usage - information. - -* :ref:`bitcode_format` - - This describes the file format and encoding used for LLVM "bc" files. - -* :doc:`System Library <SystemLibrary>` - - This document describes the LLVM System Library (``lib/System``) and - how to keep LLVM source code portable - -* :ref:`lto` - - This document describes the interface between LLVM intermodular optimizer - and the linker and its design - -* :ref:`gold-plugin` - - How to build your programs with link-time optimization on Linux. - -* :ref:`debugging-jited-code` - - How to debug JITed code with GDB. - -* :ref:`branch_weight` - - Provides information about Branch Prediction Information. - -* :ref:`segmented_stacks` - - This document describes segmented stacks and how they are used in LLVM. - -* :ref:`marked_up_disassembly` - - This document describes the optional rich disassembly output syntax. - diff --git a/docs/tutorial/index.rst b/docs/tutorial/index.rst index a7e2c4f..69a9aee 100644 --- a/docs/tutorial/index.rst +++ b/docs/tutorial/index.rst @@ -25,7 +25,7 @@ Kaleidoscope: Implementing a Language with LLVM in Objective Caml External Tutorials ================== -`Write An LLVM Backend Tutorial For Cpu0 <http://jonathan2251.github.com/lbd/>`_ +`Tutorial: Creating an LLVM Backend for the Cpu0 Architecture <http://jonathan2251.github.com/lbd/>`_ A step-by-step tutorial for developing an LLVM backend. Under active development at `<https://github.com/Jonathan2251/lbd>`_ (please contribute!). diff --git a/docs/userguides.rst b/docs/userguides.rst deleted file mode 100644 index 7e4e3b7..0000000 --- a/docs/userguides.rst +++ /dev/null @@ -1,112 +0,0 @@ -.. _userguides: - -User Guides -=========== - -.. toctree:: - :hidden: - - CMake - HowToBuildOnARM - CommandGuide/index - DeveloperPolicy - GettingStarted - GettingStartedVS - FAQ - Lexicon - Packaging - HowToAddABuilder - yaml2obj - HowToSubmitABug - SphinxQuickstartTemplate - Phabricator - TestingGuide - tutorial/index - ReleaseNotes - Passes - YamlIO - -* :ref:`getting_started` - - Discusses how to get up and running quickly with the LLVM infrastructure. - Everything from unpacking and compilation of the distribution to execution - of some tools. - -* :ref:`building-with-cmake` - - An addendum to the main Getting Started guide for those using the `CMake - build system <http://www.cmake.org>`_. - -* :ref:`how_to_build_on_arm` - - Notes on building and testing LLVM/Clang on ARM. - -* :doc:`GettingStartedVS` - - An addendum to the main Getting Started guide for those using Visual Studio - on Windows. - -* :doc:`tutorial/index` - - A walk through the process of using LLVM for a custom language, and the - facilities LLVM offers in tutorial form. - -* :ref:`developer_policy` - - The LLVM project's policy towards developers and their contributions. - -* :ref:`LLVM Command Guide <commands>` - - A reference manual for the LLVM command line utilities ("man" pages for LLVM - tools). - -* :doc:`Passes` - - A list of optimizations and analyses implemented in LLVM. 
- -* :ref:`faq` - - A list of common questions and problems and their solutions. - -* :doc:`Release notes for the current release <ReleaseNotes>` - - This describes new features, known bugs, and other limitations. - -* :ref:`how-to-submit-a-bug-report` - - Instructions for properly submitting information about any bugs you run into - in the LLVM system. -* :doc:`SphinxQuickstartTemplate` - - A template + tutorial for writing new Sphinx documentation. It is meant - to be read in source form. - -* :doc:`LLVM Testing Infrastructure Guide <TestingGuide>` - - A reference manual for using the LLVM testing infrastructure. - -* `How to build the C, C++, ObjC, and ObjC++ front end <http://clang.llvm.org/get_started.html>`_ - - Instructions for building the clang front-end from source. - -* :ref:`packaging` - - Advice on packaging LLVM into a distribution. - -* :ref:`lexicon` - - Definition of acronyms, terms and concepts used in LLVM. - -* :ref:`how_to_add_a_builder` - - Instructions for adding new builder to LLVM buildbot master. - -* :ref:`yamlio` - - A reference guide for using LLVM's YAML I/O library. - -* **IRC** -- You can probably find help on the unofficial LLVM IRC. - - We often are on irc.oftc.net in the #llvm channel. If you are using the - mozilla browser, and have chatzilla installed, you can `join #llvm on - irc.oftc.net <irc://irc.oftc.net/llvm>`_. diff --git a/docs/yaml2obj.rst b/docs/yaml2obj.rst index d051e7e..b269806 100644 --- a/docs/yaml2obj.rst +++ b/docs/yaml2obj.rst @@ -1,5 +1,3 @@ -.. _yaml2obj: - yaml2obj ======== diff --git a/examples/ExceptionDemo/ExceptionDemo.cpp b/examples/ExceptionDemo/ExceptionDemo.cpp index 7c3b7de..264ef54 100644 --- a/examples/ExceptionDemo/ExceptionDemo.cpp +++ b/examples/ExceptionDemo/ExceptionDemo.cpp @@ -41,7 +41,7 @@ // Cases -1 and 7 are caught by a C++ test harness where the validity of // of a C++ catch(...) clause catching a generated exception with a // type info type of 7 is explained by: example in rules 1.6.4 in -// http://sourcery.mentor.com/public/cxx-abi/abi-eh.html (v1.22) +// http://mentorembedded.github.com/cxx-abi/abi-eh.html (v1.22) // // This code uses code from the llvm compiler-rt project and the llvm // Kaleidoscope project. @@ -82,7 +82,7 @@ #endif // System C++ ABI unwind types from: -// http://sourcery.mentor.com/public/cxx-abi/abi-eh.html (v1.22) +// http://mentorembedded.github.com/cxx-abi/abi-eh.html (v1.22) extern "C" { @@ -151,7 +151,7 @@ struct OurExceptionType_t { /// /// Note: The above unwind.h defines struct _Unwind_Exception to be aligned /// on a double word boundary. This is necessary to match the standard: -/// http://refspecs.freestandards.org/abi-eh-1.21.html +/// http://mentorembedded.github.com/cxx-abi/abi-eh.html struct OurBaseException_t { struct OurExceptionType_t type; @@ -339,7 +339,7 @@ void deleteOurException(OurUnwindException *expToDelete) { /// This function is the struct _Unwind_Exception API mandated delete function /// used by foreign exception handlers when deleting our exception /// (OurException), instances. -/// @param reason @link http://refspecs.freestandards.org/abi-eh-1.21.html +/// @param reason @link http://mentorembedded.github.com/cxx-abi/abi-eh.html /// @unlink /// @param expToDelete exception instance to delete void deleteFromUnwindOurException(_Unwind_Reason_Code reason, @@ -512,7 +512,7 @@ static uintptr_t readEncodedPointer(const uint8_t **data, uint8_t encoding) { /// are supported. Filters are not supported. 
/// See Variable Length Data in: /// @link http://dwarfstd.org/Dwarf3.pdf @unlink -/// Also see @link http://refspecs.freestandards.org/abi-eh-1.21.html @unlink +/// Also see @link http://mentorembedded.github.com/cxx-abi/abi-eh.html @unlink /// @param resultAction reference variable which will be set with result /// @param classInfo our array of type info pointers (to globals) /// @param actionEntry index into above type info array or 0 (clean up). @@ -599,7 +599,7 @@ static bool handleActionValue(int64_t *resultAction, /// Deals with the Language specific data portion of the emitted dwarf code. -/// See @link http://refspecs.freestandards.org/abi-eh-1.21.html @unlink +/// See @link http://mentorembedded.github.com/cxx-abi/abi-eh.html @unlink /// @param version unsupported (ignored), unwind version /// @param lsda language specific data area /// @param _Unwind_Action actions minimally supported unwind stage @@ -783,7 +783,7 @@ static _Unwind_Reason_Code handleLsda(int version, /// This is the personality function which is embedded (dwarf emitted), in the /// dwarf unwind info block. Again see: JITDwarfEmitter.cpp. -/// See @link http://refspecs.freestandards.org/abi-eh-1.21.html @unlink +/// See @link http://mentorembedded.github.com/cxx-abi/abi-eh.html @unlink /// @param version unsupported (ignored), unwind version /// @param _Unwind_Action actions minimally supported unwind stage /// (forced specifically not supported) @@ -831,7 +831,7 @@ _Unwind_Reason_Code ourPersonality(int version, /// Generates our _Unwind_Exception class from a given character array. /// thereby handling arbitrary lengths (not in standard), and handling /// embedded \0s. -/// See @link http://refspecs.freestandards.org/abi-eh-1.21.html @unlink +/// See @link http://mentorembedded.github.com/cxx-abi/abi-eh.html @unlink /// @param classChars char array to encode. NULL values not checkedf /// @param classCharsSize number of chars in classChars. Value is not checked. /// @returns class value @@ -1592,7 +1592,7 @@ void runExceptionThrow(llvm::ExecutionEngine *engine, catch (...) { // Catch all exceptions including our generated ones. This latter // functionality works according to the example in rules 1.6.4 of - // http://sourcery.mentor.com/public/cxx-abi/abi-eh.html (v1.22), + // http://mentorembedded.github.com/cxx-abi/abi-eh.html (v1.22), // given that these will be exceptions foreign to C++ // (the _Unwind_Exception::exception_class should be different from // the one used by C++). diff --git a/include/llvm-c/Core.h b/include/llvm-c/Core.h index f2fe764..e85fb97 100644 --- a/include/llvm-c/Core.h +++ b/include/llvm-c/Core.h @@ -173,10 +173,11 @@ typedef enum { LLVMUWTable = 1 << 30, LLVMNonLazyBind = 1 << 31 - /* FIXME: This attribute is currently not included in the C API as + /* FIXME: These attributes are currently not included in the C API as a temporary measure until the API/ABI impact to the C API is understood and the path forward agreed upon. - LLVMAddressSafety = 1ULL << 32 + LLVMAddressSafety = 1ULL << 32, + LLVMStackProtectStrongAttribute = 1ULL<<33 */ } LLVMAttribute; @@ -357,6 +358,11 @@ typedef enum { void LLVMInitializeCore(LLVMPassRegistryRef R); +/** Deallocate and destroy all ManagedStatic variables. 
+    @see llvm::llvm_shutdown
+    @see ManagedStatic */
+void LLVMShutdown();
+

/*===-- Error handling ----------------------------------------------------===*/

@@ -2547,6 +2553,13 @@ LLVMBool LLVMCreateMemoryBufferWithContentsOfFile(const char *Path,
                                                   char **OutMessage);
LLVMBool LLVMCreateMemoryBufferWithSTDIN(LLVMMemoryBufferRef *OutMemBuf,
                                          char **OutMessage);
+LLVMMemoryBufferRef LLVMCreateMemoryBufferWithMemoryRange(const char *InputData,
+                                                          size_t InputDataLength,
+                                                          const char *BufferName,
+                                                          LLVMBool RequiresNullTerminator);
+LLVMMemoryBufferRef LLVMCreateMemoryBufferWithMemoryRangeCopy(const char *InputData,
+                                                              size_t InputDataLength,
+                                                              const char *BufferName);
void LLVMDisposeMemoryBuffer(LLVMMemoryBufferRef MemBuf);

/**
@@ -2619,6 +2632,34 @@ void LLVMDisposePassManager(LLVMPassManagerRef PM);
 */

/**
+ * @defgroup LLVMCCoreThreading Threading
+ *
+ * Handle the structures needed to make LLVM safe for multithreading.
+ *
+ * @{
+ */
+
+/** Allocate and initialize structures needed to make LLVM safe for
+    multithreading. The return value indicates whether multithreaded
+    initialization succeeded. Must be executed in isolation from all
+    other LLVM API calls.
+    @see llvm::llvm_start_multithreaded */
+LLVMBool LLVMStartMultithreaded();
+
+/** Deallocate structures necessary to make LLVM safe for multithreading.
+    Must be executed in isolation from all other LLVM API calls.
+    @see llvm::llvm_stop_multithreaded */
+void LLVMStopMultithreaded();
+
+/** Check whether LLVM is executing in thread-safe mode or not.
+    @see llvm::llvm_is_multithreaded */
+LLVMBool LLVMIsMultithreaded();
+
+/**
+ * @}
+ */
+
+/**
 * @}
 */

diff --git a/include/llvm-c/Initialization.h b/include/llvm-c/Initialization.h
index cb3ab9e..ada4738 100644
--- a/include/llvm-c/Initialization.h
+++ b/include/llvm-c/Initialization.h
@@ -34,6 +34,7 @@ extern "C" {
void LLVMInitializeCore(LLVMPassRegistryRef R);
void LLVMInitializeTransformUtils(LLVMPassRegistryRef R);
void LLVMInitializeScalarOpts(LLVMPassRegistryRef R);
+void LLVMInitializeObjCARCOpts(LLVMPassRegistryRef R);
void LLVMInitializeVectorization(LLVMPassRegistryRef R);
void LLVMInitializeInstCombine(LLVMPassRegistryRef R);
void LLVMInitializeIPO(LLVMPassRegistryRef R);
diff --git a/include/llvm-c/LinkTimeOptimizer.h b/include/llvm-c/LinkTimeOptimizer.h
index 5338d3f..7a0fbf6 100644
--- a/include/llvm-c/LinkTimeOptimizer.h
+++ b/include/llvm-c/LinkTimeOptimizer.h
@@ -13,8 +13,8 @@
//
//===----------------------------------------------------------------------===//

-#ifndef __LTO_CAPI_H__
-#define __LTO_CAPI_H__
+#ifndef LLVM_C_LINKTIMEOPTIMIZER_H
+#define LLVM_C_LINKTIMEOPTIMIZER_H

#ifdef __cplusplus
extern "C" {
diff --git a/include/llvm-c/Transforms/PassManagerBuilder.h b/include/llvm-c/Transforms/PassManagerBuilder.h
index cee6e5a..0e5e3ca 100644
--- a/include/llvm-c/Transforms/PassManagerBuilder.h
+++ b/include/llvm-c/Transforms/PassManagerBuilder.h
@@ -11,8 +11,8 @@
|*                                                                            *|
\*===----------------------------------------------------------------------===*/

-#ifndef LLVM_C_PASSMANAGERBUILDER
-#define LLVM_C_PASSMANAGERBUILDER
+#ifndef LLVM_C_TRANSFORMS_PASSMANAGERBUILDER_H
+#define LLVM_C_TRANSFORMS_PASSMANAGERBUILDER_H

#include "llvm-c/Core.h"
diff --git a/include/llvm-c/lto.h b/include/llvm-c/lto.h
index 74915c0..40110fd 100644
--- a/include/llvm-c/lto.h
+++ b/include/llvm-c/lto.h
@@ -13,8 +13,8 @@
|*                                                                            *|
\*===----------------------------------------------------------------------===*/

-#ifndef LTO_H
-#define LTO_H 1
+#ifndef LLVM_C_LTO_H
+#define LLVM_C_LTO_H

#include
<stdbool.h> #include <stddef.h> diff --git a/include/llvm/ADT/APFloat.h b/include/llvm/ADT/APFloat.h index 93d343a..14bcaef 100644 --- a/include/llvm/ADT/APFloat.h +++ b/include/llvm/ADT/APFloat.h @@ -97,8 +97,8 @@ nexttoward. */ -#ifndef LLVM_FLOAT_H -#define LLVM_FLOAT_H +#ifndef LLVM_ADT_APFLOAT_H +#define LLVM_ADT_APFLOAT_H // APInt contains static functions implementing bignum arithmetic. #include "llvm/ADT/APInt.h" @@ -184,9 +184,9 @@ namespace llvm { APFloat(const fltSemantics &, integerPart); APFloat(const fltSemantics &, fltCategory, bool negative); APFloat(const fltSemantics &, uninitializedTag); + APFloat(const fltSemantics &, const APInt &); explicit APFloat(double d); explicit APFloat(float f); - explicit APFloat(const APInt &, bool isIEEE = false); APFloat(const APFloat &); ~APFloat(); @@ -300,7 +300,7 @@ namespace llvm { /* The definition of equality is not straightforward for floating point, so we won't use operator==. Use one of the following, or write whatever it is you really mean. */ - // bool operator==(const APFloat &) const; // DO NOT IMPLEMENT + bool operator==(const APFloat &) const LLVM_DELETED_FUNCTION; /* IEEE comparison with another floating point number (NaNs compare unordered, 0==-0). */ @@ -423,7 +423,7 @@ namespace llvm { APInt convertQuadrupleAPFloatToAPInt() const; APInt convertF80LongDoubleAPFloatToAPInt() const; APInt convertPPCDoubleDoubleAPFloatToAPInt() const; - void initFromAPInt(const APInt& api, bool isIEEE = false); + void initFromAPInt(const fltSemantics *Sem, const APInt& api); void initFromHalfAPInt(const APInt& api); void initFromFloatAPInt(const APInt& api); void initFromDoubleAPInt(const APInt& api); @@ -463,4 +463,4 @@ namespace llvm { hash_code hash_value(const APFloat &Arg); } /* namespace llvm */ -#endif /* LLVM_FLOAT_H */ +#endif /* LLVM_ADT_APFLOAT_H */ diff --git a/include/llvm/ADT/APInt.h b/include/llvm/ADT/APInt.h index 95cd23d..13b353c 100644 --- a/include/llvm/ADT/APInt.h +++ b/include/llvm/ADT/APInt.h @@ -12,8 +12,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_APINT_H -#define LLVM_APINT_H +#ifndef LLVM_ADT_APINT_H +#define LLVM_ADT_APINT_H #include "llvm/ADT/ArrayRef.h" #include "llvm/Support/Compiler.h" @@ -505,6 +505,17 @@ public: return getAllOnesValue(numBits).lshr(numBits - loBitsSet); } + /// \brief Return a value containing V broadcasted over NewLen bits. + static APInt getSplat(unsigned NewLen, const APInt &V) { + assert(NewLen >= V.getBitWidth() && "Can't splat to smaller bit width!"); + + APInt Val = V.zextOrSelf(NewLen); + for (unsigned I = V.getBitWidth(); I < NewLen; I <<= 1) + Val |= Val << I; + + return Val; + } + /// \brief Determine if two APInts have the same value, after zero-extending /// one of them (if needed!) to ensure that the bit-widths match. static bool isSameValue(const APInt &I1, const APInt &I2) { @@ -799,16 +810,7 @@ public: /// Signed divide this APInt by APInt RHS. /// @brief Signed division function for APInt. - APInt sdiv(const APInt &RHS) const { - if (isNegative()) - if (RHS.isNegative()) - return (-(*this)).udiv(-RHS); - else - return -((-(*this)).udiv(RHS)); - else if (RHS.isNegative()) - return -(this->udiv(-RHS)); - return this->udiv(RHS); - } + APInt sdiv(const APInt &RHS) const; /// Perform an unsigned remainder operation on this APInt with RHS being the /// divisor. Both this and RHS are treated as unsigned quantities for purposes @@ -821,16 +823,7 @@ public: /// Signed remainder operation on APInt. 
/// @brief Function for signed remainder operation. - APInt srem(const APInt &RHS) const { - if (isNegative()) - if (RHS.isNegative()) - return -((-(*this)).urem(-RHS)); - else - return -((-(*this)).urem(RHS)); - else if (RHS.isNegative()) - return this->urem(-RHS); - return this->urem(RHS); - } + APInt srem(const APInt &RHS) const; /// Sometimes it is convenient to divide two APInt values and obtain both the /// quotient and remainder. This function does both operations in the same @@ -842,24 +835,9 @@ public: APInt &Quotient, APInt &Remainder); static void sdivrem(const APInt &LHS, const APInt &RHS, - APInt &Quotient, APInt &Remainder) { - if (LHS.isNegative()) { - if (RHS.isNegative()) - APInt::udivrem(-LHS, -RHS, Quotient, Remainder); - else { - APInt::udivrem(-LHS, RHS, Quotient, Remainder); - Quotient = -Quotient; - } - Remainder = -Remainder; - } else if (RHS.isNegative()) { - APInt::udivrem(LHS, -RHS, Quotient, Remainder); - Quotient = -Quotient; - } else { - APInt::udivrem(LHS, RHS, Quotient, Remainder); - } - } - - + APInt &Quotient, APInt &Remainder); + + // Operations that return overflow indicators. APInt sadd_ov(const APInt &RHS, bool &Overflow) const; APInt uadd_ov(const APInt &RHS, bool &Overflow) const; @@ -1191,7 +1169,8 @@ public: /// APInt. This is used in conjunction with getActiveData to extract the raw /// value of the APInt. unsigned getActiveWords() const { - return whichWord(getActiveBits()-1) + 1; + unsigned numActiveBits = getActiveBits(); + return numActiveBits ? whichWord(numActiveBits - 1) + 1 : 1; } /// Computes the minimum bit width for this APInt while considering it to be diff --git a/include/llvm/ADT/APSInt.h b/include/llvm/ADT/APSInt.h index 4a5e7a3..42e1239 100644 --- a/include/llvm/ADT/APSInt.h +++ b/include/llvm/ADT/APSInt.h @@ -12,8 +12,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_APSINT_H -#define LLVM_APSINT_H +#ifndef LLVM_ADT_APSINT_H +#define LLVM_ADT_APSINT_H #include "llvm/ADT/APInt.h" diff --git a/include/llvm/ADT/ArrayRef.h b/include/llvm/ADT/ArrayRef.h index 1e35d62..c555c1c 100644 --- a/include/llvm/ADT/ArrayRef.h +++ b/include/llvm/ADT/ArrayRef.h @@ -33,6 +33,8 @@ namespace llvm { typedef const T *const_iterator; typedef size_t size_type; + typedef std::reverse_iterator<iterator> reverse_iterator; + private: /// The start of the array, in an external buffer. const T *Data; @@ -84,6 +86,9 @@ namespace llvm { iterator begin() const { return Data; } iterator end() const { return Data + Length; } + reverse_iterator rbegin() const { return reverse_iterator(end()); } + reverse_iterator rend() const { return reverse_iterator(begin()); } + /// empty - Check if the array is empty. bool empty() const { return Length == 0; } @@ -171,41 +176,41 @@ namespace llvm { /// Construct an empty ArrayRef. /*implicit*/ MutableArrayRef() : ArrayRef<T>() {} - + /// Construct an MutableArrayRef from a single element. /*implicit*/ MutableArrayRef(T &OneElt) : ArrayRef<T>(OneElt) {} - + /// Construct an MutableArrayRef from a pointer and length. /*implicit*/ MutableArrayRef(T *data, size_t length) : ArrayRef<T>(data, length) {} - + /// Construct an MutableArrayRef from a range. MutableArrayRef(T *begin, T *end) : ArrayRef<T>(begin, end) {} - + /// Construct an MutableArrayRef from a SmallVector. /*implicit*/ MutableArrayRef(SmallVectorImpl<T> &Vec) : ArrayRef<T>(Vec) {} - + /// Construct a MutableArrayRef from a std::vector. 
/*implicit*/ MutableArrayRef(std::vector<T> &Vec) : ArrayRef<T>(Vec) {} - + /// Construct an MutableArrayRef from a C array. template <size_t N> /*implicit*/ MutableArrayRef(T (&Arr)[N]) : ArrayRef<T>(Arr) {} - + T *data() const { return const_cast<T*>(ArrayRef<T>::data()); } iterator begin() const { return data(); } iterator end() const { return data() + this->size(); } - + /// front - Get the first element. T &front() const { assert(!this->empty()); return data()[0]; } - + /// back - Get the last element. T &back() const { assert(!this->empty()); @@ -217,14 +222,14 @@ namespace llvm { assert(N <= this->size() && "Invalid specifier"); return MutableArrayRef<T>(data()+N, this->size()-N); } - + /// slice(n, m) - Chop off the first N elements of the array, and keep M /// elements in the array. MutableArrayRef<T> slice(unsigned N, unsigned M) const { assert(N+M <= this->size() && "Invalid specifier"); return MutableArrayRef<T>(data()+N, M); } - + /// @} /// @name Operator Overloads /// @{ @@ -301,5 +306,5 @@ namespace llvm { static const bool value = true; }; } - + #endif diff --git a/include/llvm/ADT/DenseMap.h b/include/llvm/ADT/DenseMap.h index 01f7e90..d410619 100644 --- a/include/llvm/ADT/DenseMap.h +++ b/include/llvm/ADT/DenseMap.h @@ -159,6 +159,24 @@ public: return std::make_pair(iterator(TheBucket, getBucketsEnd(), true), true); } +#if LLVM_HAS_RVALUE_REFERENCES + // Inserts key,value pair into the map if the key isn't already in the map. + // If the key is already in the map, it returns false and doesn't update the + // value. + std::pair<iterator, bool> insert(std::pair<KeyT, ValueT> &&KV) { + BucketT *TheBucket; + if (LookupBucketFor(KV.first, TheBucket)) + return std::make_pair(iterator(TheBucket, getBucketsEnd(), true), + false); // Already in map. + + // Otherwise, insert the new element. + TheBucket = InsertIntoBucket(std::move(KV.first), + std::move(KV.second), + TheBucket); + return std::make_pair(iterator(TheBucket, getBucketsEnd(), true), true); + } +#endif + /// insert - Range insertion of pairs. template<typename InputIt> void insert(InputIt I, InputIt E) { @@ -475,7 +493,6 @@ private: if (KeyInfoT::isEqual(ThisBucket->first, EmptyKey)) { // If we've already seen a tombstone while probing, fill it in instead // of the empty bucket we eventually probed to. - if (FoundTombstone) ThisBucket = FoundTombstone; FoundBucket = FoundTombstone ? 
FoundTombstone : ThisBucket; return false; } diff --git a/include/llvm/ADT/ImmutableIntervalMap.h b/include/llvm/ADT/ImmutableIntervalMap.h index fa7ccb9..6793c6b 100644 --- a/include/llvm/ADT/ImmutableIntervalMap.h +++ b/include/llvm/ADT/ImmutableIntervalMap.h @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_ADT_IMMUTABLE_INTERVAL_MAP_H -#define LLVM_ADT_IMMUTABLE_INTERVAL_MAP_H +#ifndef LLVM_ADT_IMMUTABLEINTERVALMAP_H +#define LLVM_ADT_IMMUTABLEINTERVALMAP_H #include "llvm/ADT/ImmutableMap.h" diff --git a/include/llvm/ADT/ImmutableList.h b/include/llvm/ADT/ImmutableList.h index 998d785..7f0c239 100644 --- a/include/llvm/ADT/ImmutableList.h +++ b/include/llvm/ADT/ImmutableList.h @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_ADT_IMLIST_H -#define LLVM_ADT_IMLIST_H +#ifndef LLVM_ADT_IMMUTABLELIST_H +#define LLVM_ADT_IMMUTABLELIST_H #include "llvm/ADT/FoldingSet.h" #include "llvm/Support/Allocator.h" diff --git a/include/llvm/ADT/ImmutableMap.h b/include/llvm/ADT/ImmutableMap.h index f9baec2..a667479 100644 --- a/include/llvm/ADT/ImmutableMap.h +++ b/include/llvm/ADT/ImmutableMap.h @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_ADT_IMMAP_H -#define LLVM_ADT_IMMAP_H +#ifndef LLVM_ADT_IMMUTABLEMAP_H +#define LLVM_ADT_IMMUTABLEMAP_H #include "llvm/ADT/ImmutableSet.h" @@ -211,17 +211,22 @@ public: friend class ImmutableMap; public: - value_type_ref operator*() const { return itr->getValue(); } - value_type* operator->() const { return &itr->getValue(); } + typedef typename ImmutableMap<KeyT,ValT,ValInfo>::value_type value_type; + typedef typename ImmutableMap<KeyT,ValT,ValInfo>::value_type_ref reference; + typedef typename iterator::value_type *pointer; + typedef std::bidirectional_iterator_tag iterator_category; + + typename iterator::reference operator*() const { return itr->getValue(); } + typename iterator::pointer operator->() const { return &itr->getValue(); } key_type_ref getKey() const { return itr->getValue().first; } data_type_ref getData() const { return itr->getValue().second; } - iterator& operator++() { ++itr; return *this; } iterator operator++(int) { iterator tmp(*this); ++itr; return tmp; } iterator& operator--() { --itr; return *this; } iterator operator--(int) { iterator tmp(*this); --itr; return tmp; } + bool operator==(const iterator& RHS) const { return RHS.itr == itr; } bool operator!=(const iterator& RHS) const { return RHS.itr != itr; } }; diff --git a/include/llvm/ADT/ImmutableSet.h b/include/llvm/ADT/ImmutableSet.h index 0982657..fbdf066 100644 --- a/include/llvm/ADT/ImmutableSet.h +++ b/include/llvm/ADT/ImmutableSet.h @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_ADT_IMSET_H -#define LLVM_ADT_IMSET_H +#ifndef LLVM_ADT_IMMUTABLESET_H +#define LLVM_ADT_IMMUTABLESET_H #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/FoldingSet.h" @@ -1054,18 +1054,27 @@ public: class iterator { typename TreeTy::iterator itr; + + iterator() {} iterator(TreeTy* t) : itr(t) {} friend class ImmutableSet<ValT,ValInfo>; + public: - iterator() {} - inline value_type_ref operator*() const { return itr->getValue(); } - inline iterator& operator++() { ++itr; return *this; } - inline iterator operator++(int) { iterator tmp(*this); ++itr; return tmp; } - inline iterator& operator--() { --itr; return *this; } - inline iterator 
operator--(int) { iterator tmp(*this); --itr; return tmp; } - inline bool operator==(const iterator& RHS) const { return RHS.itr == itr; } - inline bool operator!=(const iterator& RHS) const { return RHS.itr != itr; } - inline value_type *operator->() const { return &(operator*()); } + typedef typename ImmutableSet<ValT,ValInfo>::value_type value_type; + typedef typename ImmutableSet<ValT,ValInfo>::value_type_ref reference; + typedef typename iterator::value_type *pointer; + typedef std::bidirectional_iterator_tag iterator_category; + + typename iterator::reference operator*() const { return itr->getValue(); } + typename iterator::pointer operator->() const { return &(operator*()); } + + iterator& operator++() { ++itr; return *this; } + iterator operator++(int) { iterator tmp(*this); ++itr; return tmp; } + iterator& operator--() { --itr; return *this; } + iterator operator--(int) { iterator tmp(*this); --itr; return tmp; } + + bool operator==(const iterator& RHS) const { return RHS.itr == itr; } + bool operator!=(const iterator& RHS) const { return RHS.itr != itr; } }; iterator begin() const { return iterator(Root); } diff --git a/include/llvm/ADT/IntrusiveRefCntPtr.h b/include/llvm/ADT/IntrusiveRefCntPtr.h index 5849503..9e5ab02 100644 --- a/include/llvm/ADT/IntrusiveRefCntPtr.h +++ b/include/llvm/ADT/IntrusiveRefCntPtr.h @@ -18,8 +18,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_ADT_INTRUSIVE_REF_CNT_PTR -#define LLVM_ADT_INTRUSIVE_REF_CNT_PTR +#ifndef LLVM_ADT_INTRUSIVEREFCNTPTR_H +#define LLVM_ADT_INTRUSIVEREFCNTPTR_H #include "llvm/Support/Casting.h" #include "llvm/Support/Compiler.h" @@ -240,4 +240,4 @@ namespace llvm { } // end namespace llvm -#endif // LLVM_ADT_INTRUSIVE_REF_CNT_PTR +#endif // LLVM_ADT_INTRUSIVEREFCNTPTR_H diff --git a/include/llvm/ADT/MapVector.h b/include/llvm/ADT/MapVector.h index c34e32a..f6fcb08 100644 --- a/include/llvm/ADT/MapVector.h +++ b/include/llvm/ADT/MapVector.h @@ -19,6 +19,7 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/STLExtras.h" #include <vector> namespace llvm { @@ -63,6 +64,11 @@ public: return Vector.empty(); } + std::pair<KeyT, ValueT> &front() { return Vector.front(); } + const std::pair<KeyT, ValueT> &front() const { return Vector.front(); } + std::pair<KeyT, ValueT> &back() { return Vector.back(); } + const std::pair<KeyT, ValueT> &back() const { return Vector.back(); } + void clear() { Map.clear(); Vector.clear(); @@ -84,6 +90,18 @@ public: return Pos == Map.end()? ValueT() : Vector[Pos->second].second; } + std::pair<iterator, bool> insert(const std::pair<KeyT, ValueT> &KV) { + std::pair<KeyT, unsigned> Pair = std::make_pair(KV.first, 0); + std::pair<typename MapType::iterator, bool> Result = Map.insert(Pair); + unsigned &I = Result.first->second; + if (Result.second) { + Vector.push_back(std::make_pair(KV.first, KV.second)); + I = Vector.size() - 1; + return std::make_pair(llvm::prior(end()), true); + } + return std::make_pair(begin() + I, false); + } + unsigned count(const KeyT &Key) const { typename MapType::const_iterator Pos = Map.find(Key); return Pos == Map.end()? 0 : 1; @@ -100,6 +118,13 @@ public: return Pos == Map.end()? Vector.end() : (Vector.begin() + Pos->second); } + + /// \brief Remove the last element from the vector. 
+ void pop_back() { + typename MapType::iterator Pos = Map.find(Vector.back().first); + Map.erase(Pos); + Vector.pop_back(); + } }; } diff --git a/include/llvm/ADT/None.h b/include/llvm/ADT/None.h new file mode 100644 index 0000000..5793bd2 --- /dev/null +++ b/include/llvm/ADT/None.h @@ -0,0 +1,27 @@ +//===-- None.h - Simple null value for implicit construction ------*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file provides None, an enumerator for use in implicit constructors +// of various (usually templated) types to make such construction more +// terse. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_ADT_NONE_H +#define LLVM_ADT_NONE_H + +namespace llvm { +/// \brief A simple null object to allow implicit construction of Optional<T> +/// and similar types without having to spell out the specialization's name. +enum NoneType { + None +}; +} + +#endif diff --git a/include/llvm/ADT/NullablePtr.h b/include/llvm/ADT/NullablePtr.h index a9c47a1..8ddfd5d 100644 --- a/include/llvm/ADT/NullablePtr.h +++ b/include/llvm/ADT/NullablePtr.h @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_ADT_NULLABLE_PTR_H -#define LLVM_ADT_NULLABLE_PTR_H +#ifndef LLVM_ADT_NULLABLEPTR_H +#define LLVM_ADT_NULLABLEPTR_H #include <cassert> #include <cstddef> diff --git a/include/llvm/ADT/Optional.h b/include/llvm/ADT/Optional.h index 06e5af0..81d73ed 100644 --- a/include/llvm/ADT/Optional.h +++ b/include/llvm/ADT/Optional.h @@ -13,10 +13,12 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_ADT_OPTIONAL -#define LLVM_ADT_OPTIONAL +#ifndef LLVM_ADT_OPTIONAL_H +#define LLVM_ADT_OPTIONAL_H +#include "llvm/ADT/None.h" #include "llvm/Support/Compiler.h" +#include "llvm/Support/AlignOf.h" #include <cassert> #if LLVM_HAS_RVALUE_REFERENCES @@ -27,37 +29,102 @@ namespace llvm { template<typename T> class Optional { - T x; + AlignedCharArrayUnion<T> storage; bool hasVal; public: - explicit Optional() : x(), hasVal(false) {} - Optional(const T &y) : x(y), hasVal(true) {} + Optional(NoneType) : hasVal(false) {} + explicit Optional() : hasVal(false) {} + Optional(const T &y) : hasVal(true) { + new (storage.buffer) T(y); + } + Optional(const Optional &O) : hasVal(O.hasVal) { + if (hasVal) + new (storage.buffer) T(*O); + } #if LLVM_HAS_RVALUE_REFERENCES - Optional(T &&y) : x(std::forward<T>(y)), hasVal(true) {} + Optional(T &&y) : hasVal(true) { + new (storage.buffer) T(std::forward<T>(y)); + } + Optional(Optional<T> &&O) : hasVal(O) { + if (O) { + new (storage.buffer) T(std::move(*O)); + O.reset(); + } + } + Optional &operator=(T &&y) { + if (hasVal) + **this = std::move(y); + else { + new (storage.buffer) T(std::move(y)); + hasVal = true; + } + return *this; + } + Optional &operator=(Optional &&O) { + if (!O) + reset(); + else { + *this = std::move(*O); + O.reset(); + } + return *this; + } #endif static inline Optional create(const T* y) { return y ? 
Optional(*y) : Optional(); } + // FIXME: these assignments (& the equivalent const T&/const Optional& ctors) + // could be made more efficient by passing by value, possibly unifying them + // with the rvalue versions above - but this could place a different set of + // requirements (notably: the existence of a default ctor) when implemented + // in that way. Careful SFINAE to avoid such pitfalls would be required. Optional &operator=(const T &y) { - x = y; - hasVal = true; + if (hasVal) + **this = y; + else { + new (storage.buffer) T(y); + hasVal = true; + } + return *this; + } + + Optional &operator=(const Optional &O) { + if (!O) + reset(); + else + *this = *O; return *this; } - - const T* getPointer() const { assert(hasVal); return &x; } - const T& getValue() const LLVM_LVALUE_FUNCTION { assert(hasVal); return x; } - operator bool() const { return hasVal; } + void reset() { + if (hasVal) { + (**this).~T(); + hasVal = false; + } + } + + ~Optional() { + reset(); + } + + const T* getPointer() const { assert(hasVal); return reinterpret_cast<const T*>(storage.buffer); } + T* getPointer() { assert(hasVal); return reinterpret_cast<T*>(storage.buffer); } + const T& getValue() const LLVM_LVALUE_FUNCTION { assert(hasVal); return *getPointer(); } + T& getValue() LLVM_LVALUE_FUNCTION { assert(hasVal); return *getPointer(); } + + LLVM_EXPLICIT operator bool() const { return hasVal; } bool hasValue() const { return hasVal; } const T* operator->() const { return getPointer(); } - const T& operator*() const LLVM_LVALUE_FUNCTION { assert(hasVal); return x; } + T* operator->() { return getPointer(); } + const T& operator*() const LLVM_LVALUE_FUNCTION { assert(hasVal); return *getPointer(); } + T& operator*() LLVM_LVALUE_FUNCTION { assert(hasVal); return *getPointer(); } #if LLVM_HAS_RVALUE_REFERENCE_THIS - T&& getValue() && { assert(hasVal); return std::move(x); } - T&& operator*() && { assert(hasVal); return std::move(x); } + T&& getValue() && { assert(hasVal); return std::move(*getPointer()); } + T&& operator*() && { assert(hasVal); return std::move(*getPointer()); } #endif }; @@ -75,11 +142,17 @@ template <typename T> struct simplify_type<Optional<T> > : public simplify_type<const Optional<T> > {}; +template <typename T> struct isPodLike; +template <typename T> struct isPodLike<Optional<T> > { + // An Optional<T> is pod-like if T is. + static const bool value = isPodLike<T>::value; +}; + /// \brief Poison comparison between two \c Optional objects. Clients needs to /// explicitly compare the underlying values and account for empty \c Optional /// objects. /// -/// This routine will never be defined. It returns \c void to help diagnose +/// This routine will never be defined. It returns \c void to help diagnose /// errors at compile time. template<typename T, typename U> void operator==(const Optional<T> &X, const Optional<U> &Y); @@ -88,7 +161,7 @@ void operator==(const Optional<T> &X, const Optional<U> &Y); /// explicitly compare the underlying values and account for empty \c Optional /// objects. /// -/// This routine will never be defined. It returns \c void to help diagnose +/// This routine will never be defined. It returns \c void to help diagnose /// errors at compile time. template<typename T, typename U> void operator!=(const Optional<T> &X, const Optional<U> &Y); @@ -97,7 +170,7 @@ void operator!=(const Optional<T> &X, const Optional<U> &Y); /// explicitly compare the underlying values and account for empty \c Optional /// objects. /// -/// This routine will never be defined. 
It returns \c void to help diagnose +/// This routine will never be defined. It returns \c void to help diagnose /// errors at compile time. template<typename T, typename U> void operator<(const Optional<T> &X, const Optional<U> &Y); @@ -106,7 +179,7 @@ void operator<(const Optional<T> &X, const Optional<U> &Y); /// explicitly compare the underlying values and account for empty \c Optional /// objects. /// -/// This routine will never be defined. It returns \c void to help diagnose +/// This routine will never be defined. It returns \c void to help diagnose /// errors at compile time. template<typename T, typename U> void operator<=(const Optional<T> &X, const Optional<U> &Y); @@ -115,7 +188,7 @@ void operator<=(const Optional<T> &X, const Optional<U> &Y); /// explicitly compare the underlying values and account for empty \c Optional /// objects. /// -/// This routine will never be defined. It returns \c void to help diagnose +/// This routine will never be defined. It returns \c void to help diagnose /// errors at compile time. template<typename T, typename U> void operator>=(const Optional<T> &X, const Optional<U> &Y); @@ -124,7 +197,7 @@ void operator>=(const Optional<T> &X, const Optional<U> &Y); /// explicitly compare the underlying values and account for empty \c Optional /// objects. /// -/// This routine will never be defined. It returns \c void to help diagnose +/// This routine will never be defined. It returns \c void to help diagnose /// errors at compile time. template<typename T, typename U> void operator>(const Optional<T> &X, const Optional<U> &Y); diff --git a/include/llvm/ADT/OwningPtr.h b/include/llvm/ADT/OwningPtr.h index ea22991..86f9fee 100644 --- a/include/llvm/ADT/OwningPtr.h +++ b/include/llvm/ADT/OwningPtr.h @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_ADT_OWNING_PTR_H -#define LLVM_ADT_OWNING_PTR_H +#ifndef LLVM_ADT_OWNINGPTR_H +#define LLVM_ADT_OWNINGPTR_H #include "llvm/Support/Compiler.h" #include <cassert> diff --git a/include/llvm/ADT/PriorityQueue.h b/include/llvm/ADT/PriorityQueue.h index bf8a687..827d0b3 100644 --- a/include/llvm/ADT/PriorityQueue.h +++ b/include/llvm/ADT/PriorityQueue.h @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_ADT_PRIORITY_QUEUE_H -#define LLVM_ADT_PRIORITY_QUEUE_H +#ifndef LLVM_ADT_PRIORITYQUEUE_H +#define LLVM_ADT_PRIORITYQUEUE_H #include <algorithm> #include <queue> diff --git a/include/llvm/ADT/SmallString.h b/include/llvm/ADT/SmallString.h index 8da99d1..2cfb5b9 100644 --- a/include/llvm/ADT/SmallString.h +++ b/include/llvm/ADT/SmallString.h @@ -77,7 +77,7 @@ public: void append(in_iter S, in_iter E) { SmallVectorImpl<char>::append(S, E); } - + void append(size_t NumInputs, char Elt) { SmallVectorImpl<char>::append(NumInputs, Elt); } diff --git a/include/llvm/ADT/SmallVector.h b/include/llvm/ADT/SmallVector.h index 951e574..9167f87 100644 --- a/include/llvm/ADT/SmallVector.h +++ b/include/llvm/ADT/SmallVector.h @@ -145,16 +145,20 @@ public: } reference front() { + assert(!empty()); return begin()[0]; } const_reference front() const { + assert(!empty()); return begin()[0]; } reference back() { + assert(!empty()); return end()[-1]; } const_reference back() const { + assert(!empty()); return end()[-1]; } }; diff --git a/include/llvm/ADT/SparseMultiSet.h b/include/llvm/ADT/SparseMultiSet.h new file mode 100644 index 0000000..7f2a6f7 --- /dev/null +++ b/include/llvm/ADT/SparseMultiSet.h @@ -0,0 
+1,526 @@
+//===--- llvm/ADT/SparseMultiSet.h - Sparse multiset ------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the SparseMultiSet class, which adds multiset behavior to
+// the SparseSet.
+//
+// A sparse multiset holds a small number of objects identified by integer keys
+// from a moderately sized universe. The sparse multiset uses more memory than
+// other containers in order to provide faster operations. Any key can map to
+// multiple values. A SparseMultiSetNode class is provided, which serves as a
+// convenient base class for the contents of a SparseMultiSet.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ADT_SPARSEMULTISET_H
+#define LLVM_ADT_SPARSEMULTISET_H
+
+#include "llvm/ADT/SparseSet.h"
+
+namespace llvm {
+
+/// Fast multiset implementation for objects that can be identified by small
+/// unsigned keys.
+///
+/// SparseMultiSet allocates memory proportional to the size of the key
+/// universe, so it is not recommended for building composite data structures.
+/// It is useful for algorithms that require a single set with fast operations.
+///
+/// Compared to DenseSet and DenseMap, SparseMultiSet provides a constant-time
+/// clear() that is as fast as a vector's. The find(), insert(), and erase()
+/// operations are all constant time, and typically faster than a hash table.
+/// The iteration order doesn't depend on numerical key values; it depends only
+/// on the order of insert() and erase() operations, i.e. iteration order is
+/// insertion order. Iteration is only provided over elements of equivalent
+/// keys, but iterators are bidirectional.
+///
+/// Compared to BitVector, SparseMultiSet<unsigned> uses 8x-40x more memory, but
+/// offers constant-time clear() and size() operations as well as fast iteration
+/// independent of the size of the universe.
+///
+/// SparseMultiSet contains a dense vector holding all the objects and a sparse
+/// array holding indexes into the dense vector. Most of the memory is used by
+/// the sparse array, which is the size of the key universe. The SparseT
+/// template parameter provides a space/speed tradeoff for sets holding many
+/// elements.
+///
+/// When SparseT is uint32_t, find() only touches up to 3 cache lines, but the
+/// sparse array uses 4 x Universe bytes.
+///
+/// When SparseT is uint8_t (the default), find() touches up to 3+[N/256] cache
+/// lines, but the sparse array is 4x smaller. N is the number of elements in
+/// the set.
+///
+/// For sets that may grow to thousands of elements, SparseT should be set to
+/// uint16_t or uint32_t.
+///
+/// Multiset behavior is provided by doubly linked lists of values, inlined in
+/// the dense vector. SparseMultiSet is a good choice when one desires a
+/// growable number of entries per key, as it will retain the SparseSet
+/// algorithmic properties despite being growable. Thus, it is often a better
+/// choice than a SparseSet of growable containers or a vector of vectors.
+/// SparseMultiSet also keeps iterators valid after erasure (provided the
+/// iterators don't point to the element erased), allowing for more intuitive
+/// and fast removal.
+///
+/// @tparam ValueT The type of objects in the set.
+/// @tparam KeyFunctorT A functor that computes an unsigned index from KeyT.
+/// @tparam SparseT An unsigned integer type. See above.
+///
+template<typename ValueT,
+         typename KeyFunctorT = llvm::identity<unsigned>,
+         typename SparseT = uint8_t>
+class SparseMultiSet {
+  /// The actual data that's stored, as a doubly-linked list implemented via
+  /// indices into the DenseVector. The doubly linked list is implemented as
+  /// circular in Prev indices, and INVALID-terminated in Next indices. This
+  /// provides efficient access to list tails. These nodes can also be
+  /// tombstones, in which case they are actually nodes in a singly-linked
+  /// freelist of recyclable slots.
+  struct SMSNode {
+    static const unsigned INVALID = ~0U;
+
+    ValueT Data;
+    unsigned Prev;
+    unsigned Next;
+
+    SMSNode(ValueT D, unsigned P, unsigned N) : Data(D), Prev(P), Next(N) { }
+
+    /// List tails have invalid Nexts.
+    bool isTail() const {
+      return Next == INVALID;
+    }
+
+    /// Whether this node is a tombstone node, and thus is in our freelist.
+    bool isTombstone() const {
+      return Prev == INVALID;
+    }
+
+    /// Since the list is circular in Prev, all non-tombstone nodes have a
+    /// valid Prev.
+    bool isValid() const { return Prev != INVALID; }
+  };
+
+  typedef typename KeyFunctorT::argument_type KeyT;
+  typedef SmallVector<SMSNode, 8> DenseT;
+  DenseT Dense;
+  SparseT *Sparse;
+  unsigned Universe;
+  KeyFunctorT KeyIndexOf;
+  SparseSetValFunctor<KeyT, ValueT, KeyFunctorT> ValIndexOf;
+
+  /// We have a built-in recycler for reusing tombstone slots. This recycler
+  /// puts a singly-linked free list into tombstone slots, allowing us quick
+  /// erasure, iterator preservation, and a dense size.
+  unsigned FreelistIdx;
+  unsigned NumFree;
+
+  unsigned sparseIndex(const ValueT &Val) const {
+    assert(ValIndexOf(Val) < Universe &&
+           "Invalid key in set. Did object mutate?");
+    return ValIndexOf(Val);
+  }
+  unsigned sparseIndex(const SMSNode &N) const { return sparseIndex(N.Data); }
+
+  // Disable copy construction and assignment.
+  // This data structure is not meant to be used that way.
+  SparseMultiSet(const SparseMultiSet&) LLVM_DELETED_FUNCTION;
+  SparseMultiSet &operator=(const SparseMultiSet&) LLVM_DELETED_FUNCTION;
+
+  /// Whether the given entry is the head of the list. List heads' previous
+  /// pointers are to the tail of the list, allowing for efficient access to
+  /// the list tail. D must be a valid entry node.
+  bool isHead(const SMSNode &D) const {
+    assert(D.isValid() && "Invalid node for head");
+    return Dense[D.Prev].isTail();
+  }
+
+  /// Whether the given entry is a singleton entry, i.e. the only entry with
+  /// that key.
+  bool isSingleton(const SMSNode &N) const {
+    assert(N.isValid() && "Invalid node for singleton");
+    // Is N its own predecessor?
+    return &Dense[N.Prev] == &N;
+  }
+
+  /// Add the given SMSNode. Uses a free entry in our freelist if
+  /// available. Returns the index of the added node.
+  unsigned addValue(const ValueT& V, unsigned Prev, unsigned Next) {
+    if (NumFree == 0) {
+      Dense.push_back(SMSNode(V, Prev, Next));
+      return Dense.size() - 1;
+    }
+
+    // Peel off a free slot.
+    unsigned Idx = FreelistIdx;
+    unsigned NextFree = Dense[Idx].Next;
+    assert(Dense[Idx].isTombstone() && "Non-tombstone free?");
+
+    Dense[Idx] = SMSNode(V, Prev, Next);
+    FreelistIdx = NextFree;
+    --NumFree;
+    return Idx;
+  }
+
+  /// Make the current index a new tombstone. Pushes it onto the freelist.
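Before the implementation continues below, a brief usage sketch (illustrative only; the demo function is hypothetical): the universe is sized once up front, after which duplicate keys may be inserted freely.

  #include "llvm/ADT/SparseMultiSet.h"
  #include <cassert>

  void sparseMultiSetDemo() {
    // With the default identity key functor, the unsigned values themselves
    // serve as keys into the sparse array.
    llvm::SparseMultiSet<unsigned> Set;
    Set.setUniverse(128); // must precede the first insert()
    Set.insert(5);
    Set.insert(5);        // multiset: duplicate keys are fine
    Set.insert(7);
    assert(Set.contains(5) && Set.count(5) == 2 && Set.size() == 3);
    Set.eraseAll(5);
    assert(!Set.contains(5) && Set.size() == 1);
  }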
+ void makeTombstone(unsigned Idx) { + Dense[Idx].Prev = SMSNode::INVALID; + Dense[Idx].Next = FreelistIdx; + FreelistIdx = Idx; + ++NumFree; + } + +public: + typedef ValueT value_type; + typedef ValueT &reference; + typedef const ValueT &const_reference; + typedef ValueT *pointer; + typedef const ValueT *const_pointer; + + SparseMultiSet() + : Sparse(0), Universe(0), FreelistIdx(SMSNode::INVALID), NumFree(0) { } + + ~SparseMultiSet() { free(Sparse); } + + /// Set the universe size which determines the largest key the set can hold. + /// The universe must be sized before any elements can be added. + /// + /// @param U Universe size. All object keys must be less than U. + /// + void setUniverse(unsigned U) { + // It's not hard to resize the universe on a non-empty set, but it doesn't + // seem like a likely use case, so we can add that code when we need it. + assert(empty() && "Can only resize universe on an empty map"); + // Hysteresis prevents needless reallocations. + if (U >= Universe/4 && U <= Universe) + return; + free(Sparse); + // The Sparse array doesn't actually need to be initialized, so malloc + // would be enough here, but that will cause tools like valgrind to + // complain about branching on uninitialized data. + Sparse = reinterpret_cast<SparseT*>(calloc(U, sizeof(SparseT))); + Universe = U; + } + + /// Our iterators are iterators over the collection of objects that share a + /// key. + template<typename SMSPtrTy> + class iterator_base : public std::iterator<std::bidirectional_iterator_tag, + ValueT> { + friend class SparseMultiSet; + SMSPtrTy SMS; + unsigned Idx; + unsigned SparseIdx; + + iterator_base(SMSPtrTy P, unsigned I, unsigned SI) + : SMS(P), Idx(I), SparseIdx(SI) { } + + /// Whether our iterator has fallen outside our dense vector. + bool isEnd() const { + if (Idx == SMSNode::INVALID) + return true; + + assert(Idx < SMS->Dense.size() && "Out of range, non-INVALID Idx?"); + return false; + } + + /// Whether our iterator is properly keyed, i.e. 
the SparseIdx is valid + bool isKeyed() const { return SparseIdx < SMS->Universe; } + + unsigned Prev() const { return SMS->Dense[Idx].Prev; } + unsigned Next() const { return SMS->Dense[Idx].Next; } + + void setPrev(unsigned P) { SMS->Dense[Idx].Prev = P; } + void setNext(unsigned N) { SMS->Dense[Idx].Next = N; } + + public: + typedef std::iterator<std::bidirectional_iterator_tag, ValueT> super; + typedef typename super::value_type value_type; + typedef typename super::difference_type difference_type; + typedef typename super::pointer pointer; + typedef typename super::reference reference; + + iterator_base(const iterator_base &RHS) + : SMS(RHS.SMS), Idx(RHS.Idx), SparseIdx(RHS.SparseIdx) { } + + const iterator_base &operator=(const iterator_base &RHS) { + SMS = RHS.SMS; + Idx = RHS.Idx; + SparseIdx = RHS.SparseIdx; + return *this; + } + + reference operator*() const { + assert(isKeyed() && SMS->sparseIndex(SMS->Dense[Idx].Data) == SparseIdx && + "Dereferencing iterator of invalid key or index"); + + return SMS->Dense[Idx].Data; + } + pointer operator->() const { return &operator*(); } + + /// Comparison operators + bool operator==(const iterator_base &RHS) const { + // end compares equal + if (SMS == RHS.SMS && Idx == RHS.Idx) { + assert((isEnd() || SparseIdx == RHS.SparseIdx) && + "Same dense entry, but different keys?"); + return true; + } + + return false; + } + + bool operator!=(const iterator_base &RHS) const { + return !operator==(RHS); + } + + /// Increment and decrement operators + iterator_base &operator--() { // predecrement - Back up + assert(isKeyed() && "Decrementing an invalid iterator"); + assert((isEnd() || !SMS->isHead(SMS->Dense[Idx])) && + "Decrementing head of list"); + + // If we're at the end, then issue a new find() + if (isEnd()) + Idx = SMS->findIndex(SparseIdx).Prev(); + else + Idx = Prev(); + + return *this; + } + iterator_base &operator++() { // preincrement - Advance + assert(!isEnd() && isKeyed() && "Incrementing an invalid/end iterator"); + Idx = Next(); + return *this; + } + iterator_base operator--(int) { // postdecrement + iterator_base I(*this); + --*this; + return I; + } + iterator_base operator++(int) { // postincrement + iterator_base I(*this); + ++*this; + return I; + } + }; + typedef iterator_base<SparseMultiSet *> iterator; + typedef iterator_base<const SparseMultiSet *> const_iterator; + + // Convenience types + typedef std::pair<iterator, iterator> RangePair; + + /// Returns an iterator past this container. Note that such an iterator cannot + /// be decremented, but will compare equal to other end iterators. + iterator end() { return iterator(this, SMSNode::INVALID, SMSNode::INVALID); } + const_iterator end() const { + return const_iterator(this, SMSNode::INVALID, SMSNode::INVALID); + } + + /// Returns true if the set is empty. + /// + /// This is not the same as BitVector::empty(). + /// + bool empty() const { return size() == 0; } + + /// Returns the number of elements in the set. + /// + /// This is not the same as BitVector::size() which returns the size of the + /// universe. + /// + unsigned size() const { + assert(NumFree <= Dense.size() && "Out-of-bounds free entries"); + return Dense.size() - NumFree; + } + + /// Clears the set. This is a very fast constant time operation. + /// + void clear() { + // Sparse does not need to be cleared, see find(). + Dense.clear(); + NumFree = 0; + FreelistIdx = SMSNode::INVALID; + } + + /// Find an element by its index. + /// + /// @param Idx A valid index to find. 
+ /// @returns An iterator to the element identified by key, or end(). + /// + iterator findIndex(unsigned Idx) { + assert(Idx < Universe && "Key out of range"); + assert(std::numeric_limits<SparseT>::is_integer && + !std::numeric_limits<SparseT>::is_signed && + "SparseT must be an unsigned integer type"); + const unsigned Stride = std::numeric_limits<SparseT>::max() + 1u; + for (unsigned i = Sparse[Idx], e = Dense.size(); i < e; i += Stride) { + const unsigned FoundIdx = sparseIndex(Dense[i]); + // Check that we're pointing at the correct entry and that it is the head + // of a valid list. + if (Idx == FoundIdx && Dense[i].isValid() && isHead(Dense[i])) + return iterator(this, i, Idx); + // Stride is 0 when SparseT >= unsigned. We don't need to loop. + if (!Stride) + break; + } + return end(); + } + + /// Find an element by its key. + /// + /// @param Key A valid key to find. + /// @returns An iterator to the element identified by key, or end(). + /// + iterator find(const KeyT &Key) { + return findIndex(KeyIndexOf(Key)); + } + + const_iterator find(const KeyT &Key) const { + iterator I = const_cast<SparseMultiSet*>(this)->findIndex(KeyIndexOf(Key)); + return const_iterator(I.SMS, I.Idx, KeyIndexOf(Key)); + } + + /// Returns the number of elements identified by Key. This will be linear in + /// the number of elements of that key. + unsigned count(const KeyT &Key) const { + unsigned Ret = 0; + for (const_iterator It = find(Key); It != end(); ++It) + ++Ret; + + return Ret; + } + + /// Returns true if this set contains an element identified by Key. + bool contains(const KeyT &Key) const { + return find(Key) != end(); + } + + /// Return the head and tail of the subset's list, otherwise returns end(). + iterator getHead(const KeyT &Key) { return find(Key); } + iterator getTail(const KeyT &Key) { + iterator I = find(Key); + if (I != end()) + I = iterator(this, I.Prev(), KeyIndexOf(Key)); + return I; + } + + /// The bounds of the range of items sharing Key K. First member is the head + /// of the list, and the second member is a decrementable end iterator for + /// that key. + RangePair equal_range(const KeyT &K) { + iterator B = find(K); + iterator E = iterator(this, SMSNode::INVALID, B.SparseIdx); + return make_pair(B, E); + } + + /// Insert a new element at the tail of the subset list. Returns an iterator + /// to the newly added entry. + iterator insert(const ValueT &Val) { + unsigned Idx = sparseIndex(Val); + iterator I = findIndex(Idx); + + unsigned NodeIdx = addValue(Val, SMSNode::INVALID, SMSNode::INVALID); + + if (I == end()) { + // Make a singleton list + Sparse[Idx] = NodeIdx; + Dense[NodeIdx].Prev = NodeIdx; + return iterator(this, NodeIdx, Idx); + } + + // Stick it at the end. + unsigned HeadIdx = I.Idx; + unsigned TailIdx = I.Prev(); + Dense[TailIdx].Next = NodeIdx; + Dense[HeadIdx].Prev = NodeIdx; + Dense[NodeIdx].Prev = TailIdx; + + return iterator(this, NodeIdx, Idx); + } + + /// Erases an existing element identified by a valid iterator. + /// + /// This invalidates iterators pointing at the same entry, but erase() returns + /// an iterator pointing to the next element in the subset's list. 
This makes + /// it possible to erase selected elements while iterating over the subset: + /// + /// tie(I, E) = Set.equal_range(Key); + /// while (I != E) + /// if (test(*I)) + /// I = Set.erase(I); + /// else + /// ++I; + /// + /// Note that if the last element in the subset list is erased, this will + /// return an end iterator which can be decremented to get the new tail (if it + /// exists): + /// + /// tie(B, I) = Set.equal_range(Key); + /// for (bool isBegin = B == I; !isBegin; /* empty */) { + /// isBegin = (--I) == B; + /// if (test(I)) + /// break; + /// I = erase(I); + /// } + iterator erase(iterator I) { + assert(I.isKeyed() && !I.isEnd() && !Dense[I.Idx].isTombstone() && + "erasing invalid/end/tombstone iterator"); + + // First, unlink the node from its list. Then swap the node out with the + // dense vector's last entry + iterator NextI = unlink(Dense[I.Idx]); + + // Put in a tombstone. + makeTombstone(I.Idx); + + return NextI; + } + + /// Erase all elements with the given key. This invalidates all + /// iterators of that key. + void eraseAll(const KeyT &K) { + for (iterator I = find(K); I != end(); /* empty */) + I = erase(I); + } + +private: + /// Unlink the node from its list. Returns the next node in the list. + iterator unlink(const SMSNode &N) { + if (isSingleton(N)) { + // Singleton is already unlinked + assert(N.Next == SMSNode::INVALID && "Singleton has next?"); + return iterator(this, SMSNode::INVALID, ValIndexOf(N.Data)); + } + + if (isHead(N)) { + // If we're the head, then update the sparse array and our next. + Sparse[sparseIndex(N)] = N.Next; + Dense[N.Next].Prev = N.Prev; + return iterator(this, N.Next, ValIndexOf(N.Data)); + } + + if (N.isTail()) { + // If we're the tail, then update our head and our previous. + findIndex(sparseIndex(N)).setPrev(N.Prev); + Dense[N.Prev].Next = N.Next; + + // Give back an end iterator that can be decremented + iterator I(this, N.Prev, ValIndexOf(N.Data)); + return ++I; + } + + // Otherwise, just drop us + Dense[N.Next].Prev = N.Prev; + Dense[N.Prev].Next = N.Next; + return iterator(this, N.Next, ValIndexOf(N.Data)); + } +}; + +} // end namespace llvm + +#endif diff --git a/include/llvm/ADT/StringExtras.h b/include/llvm/ADT/StringExtras.h index 9503e0f..d2887c5 100644 --- a/include/llvm/ADT/StringExtras.h +++ b/include/llvm/ADT/StringExtras.h @@ -27,6 +27,17 @@ static inline char hexdigit(unsigned X, bool LowerCase = false) { return X < 10 ? '0' + X : HexChar + X - 10; } +/// Interpret the given character \p C as a hexadecimal digit and return its +/// value. +/// +/// If \p C is not a valid hex digit, -1U is returned. +static inline unsigned hexDigitValue(char C) { + if (C >= '0' && C <= '9') return C-'0'; + if (C >= 'a' && C <= 'f') return C-'a'+10U; + if (C >= 'A' && C <= 'F') return C-'A'+10U; + return -1U; +} + /// utohex_buffer - Emit the specified number into the buffer specified by /// BufferEnd, returning a pointer to the start of the string. 
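The -1U sentinel of hexDigitValue() lets callers fold validation into parsing; a minimal sketch (parseHexByte is a hypothetical helper, not part of this change):

  #include "llvm/ADT/StringExtras.h"
  #include "llvm/ADT/StringRef.h"

  static bool parseHexByte(llvm::StringRef S, unsigned &Out) {
    if (S.size() < 2)
      return false;
    unsigned Hi = llvm::hexDigitValue(S[0]);
    unsigned Lo = llvm::hexDigitValue(S[1]);
    if (Hi == -1U || Lo == -1U)
      return false; // not hex digits
    Out = Hi * 16 + Lo;
    return true;
  }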
This can be used /// like this: (note that the buffer must be large enough to handle any number): diff --git a/include/llvm/ADT/StringMap.h b/include/llvm/ADT/StringMap.h index 0d68d11..d01437b 100644 --- a/include/llvm/ADT/StringMap.h +++ b/include/llvm/ADT/StringMap.h @@ -237,6 +237,10 @@ public: explicit StringMap(AllocatorTy A) : StringMapImpl(static_cast<unsigned>(sizeof(MapEntryTy))), Allocator(A) {} + StringMap(unsigned InitialSize, AllocatorTy A) + : StringMapImpl(InitialSize, static_cast<unsigned>(sizeof(MapEntryTy))), + Allocator(A) {} + StringMap(const StringMap &RHS) : StringMapImpl(static_cast<unsigned>(sizeof(MapEntryTy))) { assert(RHS.empty() && @@ -334,8 +338,8 @@ public: StringMapEntryBase *&Bucket = TheTable[I]; if (Bucket && Bucket != getTombstoneVal()) { static_cast<MapEntryTy*>(Bucket)->Destroy(Allocator); - Bucket = 0; } + Bucket = 0; } NumItems = 0; diff --git a/include/llvm/ADT/StringRef.h b/include/llvm/ADT/StringRef.h index 0e93f51..224855e 100644 --- a/include/llvm/ADT/StringRef.h +++ b/include/llvm/ADT/StringRef.h @@ -57,14 +57,14 @@ namespace llvm { // integer works around this bug. static size_t min(size_t a, size_t b) { return a < b ? a : b; } static size_t max(size_t a, size_t b) { return a > b ? a : b; } - + // Workaround memcmp issue with null pointers (undefined behavior) // by providing a specialized version static int compareMemory(const char *Lhs, const char *Rhs, size_t Length) { if (Length == 0) { return 0; } return ::memcmp(Lhs,Rhs,Length); } - + public: /// @name Constructors /// @{ @@ -387,7 +387,7 @@ namespace llvm { Start = min(Start, Length); return StringRef(Data + Start, min(N, Length - Start)); } - + /// Return a StringRef equal to 'this' but with the first \p N elements /// dropped. StringRef drop_front(unsigned N = 1) const { @@ -535,7 +535,7 @@ namespace llvm { return LHS.compare(RHS) != -1; } - inline std::string &operator+=(std::string &buffer, llvm::StringRef string) { + inline std::string &operator+=(std::string &buffer, StringRef string) { return buffer.append(string.data(), string.size()); } diff --git a/include/llvm/ADT/Triple.h b/include/llvm/ADT/Triple.h index 49d9f68..8fac222 100644 --- a/include/llvm/ADT/Triple.h +++ b/include/llvm/ADT/Triple.h @@ -44,6 +44,7 @@ public: UnknownArch, arm, // ARM; arm, armv.*, xscale + aarch64, // AArch64: aarch64 hexagon, // Hexagon: hexagon mips, // MIPS: mips, mipsallegrex mipsel, // MIPSEL: mipsel, mipsallegrexel @@ -111,6 +112,7 @@ public: GNU, GNUEABI, GNUEABIHF, + GNUX32, EABI, MachO, Android, @@ -295,9 +297,14 @@ public: return getOS() == Triple::Darwin || getOS() == Triple::MacOSX; } + /// Is this an iOS triple. + bool isiOS() const { + return getOS() == Triple::IOS; + } + /// isOSDarwin - Is this a "Darwin" OS (OS X or iOS). 
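A short sketch of the new isiOS() predicate and its relationship to isOSDarwin() (illustrative; the function and the triple string are examples, not part of this change):

  #include "llvm/ADT/Triple.h"

  bool demoIsiOS() {
    llvm::Triple T("armv7-apple-ios");
    // isiOS() factors the iOS test out of isOSDarwin(); both hold for this
    // triple, while an x86_64-apple-macosx triple satisfies only isMacOSX().
    return T.isiOS() && T.isOSDarwin();
  }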
bool isOSDarwin() const { - return isMacOSX() || getOS() == Triple::IOS; + return isMacOSX() || isiOS(); } /// \brief Tests for either Cygwin or MinGW OS diff --git a/include/llvm/ADT/VariadicFunction.h b/include/llvm/ADT/VariadicFunction.h index a7f83a6..0497aa7 100644 --- a/include/llvm/ADT/VariadicFunction.h +++ b/include/llvm/ADT/VariadicFunction.h @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_ADT_VARIADIC_FUNCTION_H -#define LLVM_ADT_VARIADIC_FUNCTION_H +#ifndef LLVM_ADT_VARIADICFUNCTION_H +#define LLVM_ADT_VARIADICFUNCTION_H #include "llvm/ADT/ArrayRef.h" @@ -328,4 +328,4 @@ struct VariadicFunction3 { } // end namespace llvm -#endif // LLVM_ADT_VARIADIC_FUNCTION_H +#endif // LLVM_ADT_VARIADICFUNCTION_H diff --git a/include/llvm/ADT/ilist.h b/include/llvm/ADT/ilist.h index 20cdae3..aeb7848 100644 --- a/include/llvm/ADT/ilist.h +++ b/include/llvm/ADT/ilist.h @@ -234,17 +234,17 @@ public: pointer getNodePtrUnchecked() const { return NodePtr; } }; -// do not implement. this is to catch errors when people try to use -// them as random access iterators +// These are to catch errors when people try to use them as random access +// iterators. template<typename T> -void operator-(int, ilist_iterator<T>); +void operator-(int, ilist_iterator<T>) LLVM_DELETED_FUNCTION; template<typename T> -void operator-(ilist_iterator<T>,int); +void operator-(ilist_iterator<T>,int) LLVM_DELETED_FUNCTION; template<typename T> -void operator+(int, ilist_iterator<T>); +void operator+(int, ilist_iterator<T>) LLVM_DELETED_FUNCTION; template<typename T> -void operator+(ilist_iterator<T>,int); +void operator+(ilist_iterator<T>,int) LLVM_DELETED_FUNCTION; // operator!=/operator== - Allow mixed comparisons without dereferencing // the iterator, which could very likely be pointing to end(). diff --git a/include/llvm/ADT/ilist_node.h b/include/llvm/ADT/ilist_node.h index f008003..0361244 100644 --- a/include/llvm/ADT/ilist_node.h +++ b/include/llvm/ADT/ilist_node.h @@ -12,8 +12,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_ADT_ILIST_NODE_H -#define LLVM_ADT_ILIST_NODE_H +#ifndef LLVM_ADT_ILISTNODE_H +#define LLVM_ADT_ILISTNODE_H namespace llvm { diff --git a/include/llvm/Analysis/AliasAnalysis.h b/include/llvm/Analysis/AliasAnalysis.h index 3ce4732..d703f21 100644 --- a/include/llvm/Analysis/AliasAnalysis.h +++ b/include/llvm/Analysis/AliasAnalysis.h @@ -34,8 +34,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_ANALYSIS_ALIAS_ANALYSIS_H -#define LLVM_ANALYSIS_ALIAS_ANALYSIS_H +#ifndef LLVM_ANALYSIS_ALIASANALYSIS_H +#define LLVM_ANALYSIS_ALIASANALYSIS_H #include "llvm/ADT/DenseMap.h" #include "llvm/Support/CallSite.h" @@ -593,11 +593,6 @@ bool isNoAliasCall(const Value *V); /// bool isIdentifiedObject(const Value *V); -/// isKnownNonNull - Return true if this pointer couldn't possibly be null by -/// its definition. This returns true for allocas, non-extern-weak globals and -/// byval arguments. 
-bool isKnownNonNull(const Value *V); - } // End llvm namespace #endif diff --git a/include/llvm/Analysis/CallGraphSCCPass.h b/include/llvm/Analysis/CallGraphSCCPass.h index 4446777..e609dac 100644 --- a/include/llvm/Analysis/CallGraphSCCPass.h +++ b/include/llvm/Analysis/CallGraphSCCPass.h @@ -18,8 +18,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CALL_GRAPH_SCC_PASS_H -#define LLVM_CALL_GRAPH_SCC_PASS_H +#ifndef LLVM_ANALYSIS_CALLGRAPHSCCPASS_H +#define LLVM_ANALYSIS_CALLGRAPHSCCPASS_H #include "llvm/Analysis/CallGraph.h" #include "llvm/Pass.h" diff --git a/include/llvm/Analysis/CallPrinter.h b/include/llvm/Analysis/CallPrinter.h new file mode 100644 index 0000000..5f5d160 --- /dev/null +++ b/include/llvm/Analysis/CallPrinter.h @@ -0,0 +1,27 @@ +//===-- CallPrinter.h - Call graph printer external interface ----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines external functions that can be called to explicitly +// instantiate the call graph printer. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_ANALYSIS_CALLPRINTER_H +#define LLVM_ANALYSIS_CALLPRINTER_H + +namespace llvm { + + class ModulePass; + + ModulePass *createCallGraphViewerPass(); + ModulePass *createCallGraphPrinterPass(); + +} // end namespace llvm + +#endif diff --git a/include/llvm/Analysis/CodeMetrics.h b/include/llvm/Analysis/CodeMetrics.h index 35eabec..086934d 100644 --- a/include/llvm/Analysis/CodeMetrics.h +++ b/include/llvm/Analysis/CodeMetrics.h @@ -19,80 +19,75 @@ #include "llvm/Support/CallSite.h" namespace llvm { - class BasicBlock; - class Function; - class Instruction; - class DataLayout; - class Value; +class BasicBlock; +class Function; +class Instruction; +class DataLayout; +class TargetTransformInfo; +class Value; + +/// \brief Check whether a call will lower to something small. +/// +/// This tests checks whether this callsite will lower to something +/// significantly cheaper than a traditional call, often a single +/// instruction. Note that if isInstructionFree(CS.getInstruction()) would +/// return true, so will this function. +bool callIsSmall(ImmutableCallSite CS); + +/// \brief Utility to calculate the size and a few similar metrics for a set +/// of basic blocks. +struct CodeMetrics { + /// \brief True if this function contains a call to setjmp or other functions + /// with attribute "returns twice" without having the attribute itself. + bool exposesReturnsTwice; + + /// \brief True if this function calls itself. + bool isRecursive; + + /// \brief True if this function cannot be duplicated. + /// + /// True if this function contains one or more indirect branches, or it contains + /// one or more 'noduplicate' instructions. + bool notDuplicatable; + + /// \brief True if this function calls alloca (in the C sense). + bool usesDynamicAlloca; + + /// \brief Number of instructions in the analyzed blocks. + unsigned NumInsts; - /// \brief Check whether an instruction is likely to be "free" when lowered. - bool isInstructionFree(const Instruction *I, const DataLayout *TD = 0); + /// \brief Number of analyzed blocks. + unsigned NumBlocks; - /// \brief Check whether a call will lower to something small. + /// \brief Keeps track of basic block code size estimates. 
+ DenseMap<const BasicBlock *, unsigned> NumBBInsts; + + /// \brief Keep track of the number of calls to 'big' functions. + unsigned NumCalls; + + /// \brief The number of calls to internal functions with a single caller. /// - /// This tests checks whether this callsite will lower to something - /// significantly cheaper than a traditional call, often a single - /// instruction. Note that if isInstructionFree(CS.getInstruction()) would - /// return true, so will this function. - bool callIsSmall(ImmutableCallSite CS); - - /// \brief Utility to calculate the size and a few similar metrics for a set - /// of basic blocks. - struct CodeMetrics { - /// \brief True if this function contains a call to setjmp or other functions - /// with attribute "returns twice" without having the attribute itself. - bool exposesReturnsTwice; - - /// \brief True if this function calls itself. - bool isRecursive; - - /// \brief True if this function cannot be duplicated. - /// - /// True if this function contains one or more indirect branches, or it contains - /// one or more 'noduplicate' instructions. - bool notDuplicatable; - - /// \brief True if this function calls alloca (in the C sense). - bool usesDynamicAlloca; - - /// \brief Number of instructions in the analyzed blocks. - unsigned NumInsts; - - /// \brief Number of analyzed blocks. - unsigned NumBlocks; - - /// \brief Keeps track of basic block code size estimates. - DenseMap<const BasicBlock *, unsigned> NumBBInsts; - - /// \brief Keep track of the number of calls to 'big' functions. - unsigned NumCalls; - - /// \brief The number of calls to internal functions with a single caller. - /// - /// These are likely targets for future inlining, likely exposed by - /// interleaved devirtualization. - unsigned NumInlineCandidates; - - /// \brief How many instructions produce vector values. - /// - /// The inliner is more aggressive with inlining vector kernels. - unsigned NumVectorInsts; - - /// \brief How many 'ret' instructions the blocks contain. - unsigned NumRets; - - CodeMetrics() : exposesReturnsTwice(false), isRecursive(false), - notDuplicatable(false), usesDynamicAlloca(false), - NumInsts(0), NumBlocks(0), NumCalls(0), - NumInlineCandidates(0), NumVectorInsts(0), - NumRets(0) {} - - /// \brief Add information about a block to the current state. - void analyzeBasicBlock(const BasicBlock *BB, const DataLayout *TD = 0); - - /// \brief Add information about a function to the current state. - void analyzeFunction(Function *F, const DataLayout *TD = 0); - }; + /// These are likely targets for future inlining, likely exposed by + /// interleaved devirtualization. + unsigned NumInlineCandidates; + + /// \brief How many instructions produce vector values. + /// + /// The inliner is more aggressive with inlining vector kernels. + unsigned NumVectorInsts; + + /// \brief How many 'ret' instructions the blocks contain. + unsigned NumRets; + + CodeMetrics() + : exposesReturnsTwice(false), isRecursive(false), notDuplicatable(false), + usesDynamicAlloca(false), NumInsts(0), NumBlocks(0), NumCalls(0), + NumInlineCandidates(0), NumVectorInsts(0), NumRets(0) {} + + /// \brief Add information about a block to the current state. 
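A sketch of driving the reworked interface (illustrative; the helper is hypothetical, and the include path for TargetTransformInfo is an assumption about this revision):

  #include "llvm/Analysis/CodeMetrics.h"
  #include "llvm/Analysis/TargetTransformInfo.h" // assumed header location
  #include "llvm/IR/Function.h"

  unsigned countInsts(llvm::Function &F, const llvm::TargetTransformInfo &TTI) {
    llvm::CodeMetrics Metrics;
    // analyzeBasicBlock() now consults TTI rather than DataLayout to decide
    // which instructions are free.
    for (llvm::Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
      Metrics.analyzeBasicBlock(&*BB, TTI);
    return Metrics.NumInsts;
  }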
+ void analyzeBasicBlock(const BasicBlock *BB, const TargetTransformInfo &TTI); +}; + } #endif diff --git a/include/llvm/Analysis/DOTGraphTraitsPass.h b/include/llvm/Analysis/DOTGraphTraitsPass.h index bd8c4a1..0fc1c2d 100644 --- a/include/llvm/Analysis/DOTGraphTraitsPass.h +++ b/include/llvm/Analysis/DOTGraphTraitsPass.h @@ -11,27 +11,25 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_ANALYSIS_DOT_GRAPHTRAITS_PASS_H -#define LLVM_ANALYSIS_DOT_GRAPHTRAITS_PASS_H +#ifndef LLVM_ANALYSIS_DOTGRAPHTRAITSPASS_H +#define LLVM_ANALYSIS_DOTGRAPHTRAITSPASS_H #include "llvm/Analysis/CFGPrinter.h" #include "llvm/Pass.h" namespace llvm { -template <class Analysis, bool Simple> -struct DOTGraphTraitsViewer : public FunctionPass { - std::string Name; - DOTGraphTraitsViewer(std::string GraphName, char &ID) : FunctionPass(ID) { - Name = GraphName; - } +template <class Analysis, bool Simple> +class DOTGraphTraitsViewer : public FunctionPass { +public: + DOTGraphTraitsViewer(StringRef GraphName, char &ID) + : FunctionPass(ID), Name(GraphName) {} virtual bool runOnFunction(Function &F) { - Analysis *Graph; - std::string Title, GraphName; - Graph = &getAnalysis<Analysis>(); - GraphName = DOTGraphTraits<Analysis*>::getGraphName(Graph); - Title = GraphName + " for '" + F.getName().str() + "' function"; + Analysis *Graph = &getAnalysis<Analysis>(); + std::string GraphName = DOTGraphTraits<Analysis*>::getGraphName(Graph); + std::string Title = GraphName + " for '" + F.getName().str() + "' function"; + ViewGraph(Graph, Name, Simple, Title); return false; @@ -41,36 +39,92 @@ struct DOTGraphTraitsViewer : public FunctionPass { AU.setPreservesAll(); AU.addRequired<Analysis>(); } + +private: + std::string Name; }; template <class Analysis, bool Simple> -struct DOTGraphTraitsPrinter : public FunctionPass { +class DOTGraphTraitsPrinter : public FunctionPass { +public: + DOTGraphTraitsPrinter(StringRef GraphName, char &ID) + : FunctionPass(ID), Name(GraphName) {} + + virtual bool runOnFunction(Function &F) { + Analysis *Graph = &getAnalysis<Analysis>(); + std::string Filename = Name + "." + F.getName().str() + ".dot"; + std::string ErrorInfo; + + errs() << "Writing '" << Filename << "'..."; + + raw_fd_ostream File(Filename.c_str(), ErrorInfo); + std::string GraphName = DOTGraphTraits<Analysis*>::getGraphName(Graph); + std::string Title = GraphName + " for '" + F.getName().str() + "' function"; + + if (ErrorInfo.empty()) + WriteGraph(File, Graph, Simple, Title); + else + errs() << " error opening file for writing!"; + errs() << "\n"; + + return false; + } + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesAll(); + AU.addRequired<Analysis>(); + } +private: std::string Name; +}; + +template <class Analysis, bool Simple> +class DOTGraphTraitsModuleViewer : public ModulePass { +public: + DOTGraphTraitsModuleViewer(StringRef GraphName, char &ID) + : ModulePass(ID), Name(GraphName) {} + + virtual bool runOnModule(Module &M) { + Analysis *Graph = &getAnalysis<Analysis>(); + std::string Title = DOTGraphTraits<Analysis*>::getGraphName(Graph); - DOTGraphTraitsPrinter(std::string GraphName, char &ID) - : FunctionPass(ID) { - Name = GraphName; + ViewGraph(Graph, Name, Simple, Title); + + return false; } - virtual bool runOnFunction(Function &F) { - Analysis *Graph; - std::string Filename = Name + "." 
+ F.getName().str() + ".dot"; - errs() << "Writing '" << Filename << "'..."; + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesAll(); + AU.addRequired<Analysis>(); + } +private: + std::string Name; +}; + +template <class Analysis, bool Simple> +class DOTGraphTraitsModulePrinter : public ModulePass { +public: + DOTGraphTraitsModulePrinter(StringRef GraphName, char &ID) + : ModulePass(ID), Name(GraphName) {} + + virtual bool runOnModule(Module &M) { + Analysis *Graph = &getAnalysis<Analysis>(); + std::string Filename = Name + ".dot"; std::string ErrorInfo; - raw_fd_ostream File(Filename.c_str(), ErrorInfo); - Graph = &getAnalysis<Analysis>(); - std::string Title, GraphName; - GraphName = DOTGraphTraits<Analysis*>::getGraphName(Graph); - Title = GraphName + " for '" + F.getName().str() + "' function"; + errs() << "Writing '" << Filename << "'..."; + + raw_fd_ostream File(Filename.c_str(), ErrorInfo); + std::string Title = DOTGraphTraits<Analysis*>::getGraphName(Graph); if (ErrorInfo.empty()) WriteGraph(File, Graph, Simple, Title); else errs() << " error opening file for writing!"; errs() << "\n"; + return false; } @@ -78,6 +132,11 @@ struct DOTGraphTraitsPrinter : public FunctionPass { AU.setPreservesAll(); AU.addRequired<Analysis>(); } + +private: + std::string Name; }; -} + +} // end namespace llvm + #endif diff --git a/include/llvm/Analysis/DependenceAnalysis.h b/include/llvm/Analysis/DependenceAnalysis.h index f49efd9..d0ead39 100644 --- a/include/llvm/Analysis/DependenceAnalysis.h +++ b/include/llvm/Analysis/DependenceAnalysis.h @@ -244,8 +244,8 @@ namespace llvm { /// DependenceAnalysis - This class is the main dependence-analysis driver. /// class DependenceAnalysis : public FunctionPass { - void operator=(const DependenceAnalysis &); // do not implement - DependenceAnalysis(const DependenceAnalysis &); // do not implement + void operator=(const DependenceAnalysis &) LLVM_DELETED_FUNCTION; + DependenceAnalysis(const DependenceAnalysis &) LLVM_DELETED_FUNCTION; public: /// depends - Tests for a dependence between the Src and Dst instructions. /// Returns NULL if no dependence; otherwise, returns a Dependence (or a diff --git a/include/llvm/Analysis/InlineCost.h b/include/llvm/Analysis/InlineCost.h index 42e329e..bc7924e 100644 --- a/include/llvm/Analysis/InlineCost.h +++ b/include/llvm/Analysis/InlineCost.h @@ -14,125 +14,130 @@ #ifndef LLVM_ANALYSIS_INLINECOST_H #define LLVM_ANALYSIS_INLINECOST_H -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/ValueMap.h" #include "llvm/Analysis/CodeMetrics.h" -#include "llvm/IR/Function.h" +#include "llvm/Analysis/CallGraphSCCPass.h" #include <cassert> #include <climits> -#include <vector> namespace llvm { +class CallSite; +class DataLayout; +class Function; +class TargetTransformInfo; + +namespace InlineConstants { + // Various magic constants used to adjust heuristics. + const int InstrCost = 5; + const int IndirectCallThreshold = 100; + const int CallPenalty = 25; + const int LastCallToStaticBonus = -15000; + const int ColdccPenalty = 2000; + const int NoreturnPenalty = 10000; + /// Do not inline functions which allocate this many bytes on the stack + /// when the caller is recursive. + const unsigned TotalAllocaSizeRecursiveCaller = 1024; +} + +/// \brief Represents the cost of inlining a function. +/// +/// This supports special values for functions which should "always" or +/// "never" be inlined. 
Otherwise, the cost represents a unitless amount; +/// smaller values increase the likelihood of the function being inlined. +/// +/// Objects of this type also provide the adjusted threshold for inlining +/// based on the information available for a particular callsite. They can be +/// directly tested to determine if inlining should occur given the cost and +/// threshold for this cost metric. +class InlineCost { + enum SentinelValues { + AlwaysInlineCost = INT_MIN, + NeverInlineCost = INT_MAX + }; + + /// \brief The estimated cost of inlining this callsite. + const int Cost; + + /// \brief The adjusted threshold against which this cost was computed. + const int Threshold; + + // Trivial constructor, interesting logic in the factory functions below. + InlineCost(int Cost, int Threshold) : Cost(Cost), Threshold(Threshold) {} + +public: + static InlineCost get(int Cost, int Threshold) { + assert(Cost > AlwaysInlineCost && "Cost crosses sentinel value"); + assert(Cost < NeverInlineCost && "Cost crosses sentinel value"); + return InlineCost(Cost, Threshold); + } + static InlineCost getAlways() { + return InlineCost(AlwaysInlineCost, 0); + } + static InlineCost getNever() { + return InlineCost(NeverInlineCost, 0); + } - class CallSite; - class DataLayout; - - namespace InlineConstants { - // Various magic constants used to adjust heuristics. - const int InstrCost = 5; - const int IndirectCallThreshold = 100; - const int CallPenalty = 25; - const int LastCallToStaticBonus = -15000; - const int ColdccPenalty = 2000; - const int NoreturnPenalty = 10000; - /// Do not inline functions which allocate this many bytes on the stack - /// when the caller is recursive. - const unsigned TotalAllocaSizeRecursiveCaller = 1024; + /// \brief Test whether the inline cost is low enough for inlining. + operator bool() const { + return Cost < Threshold; } - /// \brief Represents the cost of inlining a function. + bool isAlways() const { return Cost == AlwaysInlineCost; } + bool isNever() const { return Cost == NeverInlineCost; } + bool isVariable() const { return !isAlways() && !isNever(); } + + /// \brief Get the inline cost estimate. + /// It is an error to call this on an "always" or "never" InlineCost. + int getCost() const { + assert(isVariable() && "Invalid access of InlineCost"); + return Cost; + } + + /// \brief Get the cost delta from the threshold for inlining. + /// Only valid if the cost is of the variable kind. Returns a negative + /// value if the cost is too high to inline. + int getCostDelta() const { return Threshold - getCost(); } +}; + +/// \brief Cost analyzer used by inliner. +class InlineCostAnalysis : public CallGraphSCCPass { + const DataLayout *TD; + const TargetTransformInfo *TTI; + +public: + static char ID; + + InlineCostAnalysis(); + ~InlineCostAnalysis(); + + // Pass interface implementation. + void getAnalysisUsage(AnalysisUsage &AU) const; + bool runOnSCC(CallGraphSCC &SCC); + + /// \brief Get an InlineCost object representing the cost of inlining this + /// callsite. /// - /// This supports special values for functions which should "always" or - /// "never" be inlined. Otherwise, the cost represents a unitless amount; - /// smaller values increase the likelihood of the function being inlined. + /// Note that threshold is passed into this function. Only costs below the + /// threshold are computed with any accuracy. The threshold can be used to + /// bound the computation necessary to determine whether the cost is + /// sufficiently low to warrant inlining. 
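A sketch of interrogating an InlineCost through the new InlineCostAnalysis pass (illustrative; shouldInline and the threshold value are made up):

  #include "llvm/Analysis/InlineCost.h"
  #include "llvm/Support/CallSite.h"

  bool shouldInline(llvm::InlineCostAnalysis &ICA, llvm::CallSite CS) {
    llvm::InlineCost IC = ICA.getInlineCost(CS, /*Threshold=*/225);
    if (IC.isAlways())
      return true;  // sentinel: always profitable to inline
    if (IC.isNever())
      return false; // sentinel: never inline
    // For variable costs, operator bool() tests Cost < Threshold;
    // getCostDelta() reports the remaining slack below the threshold.
    return static_cast<bool>(IC);
  }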
/// - /// Objects of this type also provide the adjusted threshold for inlining - /// based on the information available for a particular callsite. They can be - /// directly tested to determine if inlining should occur given the cost and - /// threshold for this cost metric. - class InlineCost { - enum SentinelValues { - AlwaysInlineCost = INT_MIN, - NeverInlineCost = INT_MAX - }; - - /// \brief The estimated cost of inlining this callsite. - const int Cost; - - /// \brief The adjusted threshold against which this cost was computed. - const int Threshold; - - // Trivial constructor, interesting logic in the factory functions below. - InlineCost(int Cost, int Threshold) - : Cost(Cost), Threshold(Threshold) {} - - public: - static InlineCost get(int Cost, int Threshold) { - assert(Cost > AlwaysInlineCost && "Cost crosses sentinel value"); - assert(Cost < NeverInlineCost && "Cost crosses sentinel value"); - return InlineCost(Cost, Threshold); - } - static InlineCost getAlways() { - return InlineCost(AlwaysInlineCost, 0); - } - static InlineCost getNever() { - return InlineCost(NeverInlineCost, 0); - } - - /// \brief Test whether the inline cost is low enough for inlining. - operator bool() const { - return Cost < Threshold; - } - - bool isAlways() const { return Cost == AlwaysInlineCost; } - bool isNever() const { return Cost == NeverInlineCost; } - bool isVariable() const { return !isAlways() && !isNever(); } - - /// \brief Get the inline cost estimate. - /// It is an error to call this on an "always" or "never" InlineCost. - int getCost() const { - assert(isVariable() && "Invalid access of InlineCost"); - return Cost; - } - - /// \brief Get the cost delta from the threshold for inlining. - /// Only valid if the cost is of the variable kind. Returns a negative - /// value if the cost is too high to inline. - int getCostDelta() const { return Threshold - getCost(); } - }; + /// Also note that calling this function *dynamically* computes the cost of + /// inlining the callsite. It is an expensive, heavyweight call. + InlineCost getInlineCost(CallSite CS, int Threshold); + + /// \brief Get an InlineCost with the callee explicitly specified. + /// This allows you to calculate the cost of inlining a function via a + /// pointer. This behaves exactly as the version with no explicit callee + /// parameter in all other respects. + // + // Note: This is used by out-of-tree passes, please do not remove without + // adding a replacement API. + InlineCost getInlineCost(CallSite CS, Function *Callee, int Threshold); + + /// \brief Minimal filter to detect invalid constructs for inlining. + bool isInlineViable(Function &Callee); +}; - /// InlineCostAnalyzer - Cost analyzer used by inliner. - class InlineCostAnalyzer { - // DataLayout if available, or null. - const DataLayout *TD; - - public: - InlineCostAnalyzer(): TD(0) {} - - void setDataLayout(const DataLayout *TData) { TD = TData; } - - /// \brief Get an InlineCost object representing the cost of inlining this - /// callsite. - /// - /// Note that threshold is passed into this function. Only costs below the - /// threshold are computed with any accuracy. The threshold can be used to - /// bound the computation necessary to determine whether the cost is - /// sufficiently low to warrant inlining. - InlineCost getInlineCost(CallSite CS, int Threshold); - - /// \brief Get an InlineCost with the callee explicitly specified. - /// This allows you to calculate the cost of inlining a function via a - /// pointer. 
This behaves exactly as the version with no explicit callee - /// parameter in all other respects. - // - // Note: This is used by out-of-tree passes, please do not remove without - // adding a replacement API. - InlineCost getInlineCost(CallSite CS, Function *Callee, int Threshold); - - /// \brief Minimal filter to detect invalid constructs for inlining. - bool isInlineViable(Function &Callee); - }; } #endif diff --git a/include/llvm/Analysis/InstructionSimplify.h b/include/llvm/Analysis/InstructionSimplify.h index b653e79..d760a4c 100644 --- a/include/llvm/Analysis/InstructionSimplify.h +++ b/include/llvm/Analysis/InstructionSimplify.h @@ -14,6 +14,19 @@ // ("and i32 %x, %x" -> "%x"). If the simplification is also an instruction // then it dominates the original instruction. // +// These routines implicitly resolve undef uses. The easiest way to be safe when +// using these routines to obtain simplified values for existing instructions is +// to always replace all uses of the instructions with the resulting simplified +// values. This will prevent other code from seeing the same undef uses and +// resolving them to different values. +// +// These routines are designed to tolerate moderately incomplete IR, such as +// instructions that are not connected to basic blocks yet. However, they do +// require that all the IR that they encounter be valid. In particular, they +// require that all non-constant values be defined in the same function, and the +// same call context of that function (and not split between caller and callee +// contexts of a directly recursive call, for example). +// //===----------------------------------------------------------------------===// #ifndef LLVM_ANALYSIS_INSTRUCTIONSIMPLIFY_H diff --git a/include/llvm/Analysis/Interval.h b/include/llvm/Analysis/Interval.h index ca8ad73..5ce1260 100644 --- a/include/llvm/Analysis/Interval.h +++ b/include/llvm/Analysis/Interval.h @@ -17,8 +17,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_INTERVAL_H -#define LLVM_INTERVAL_H +#ifndef LLVM_ANALYSIS_INTERVAL_H +#define LLVM_ANALYSIS_INTERVAL_H #include "llvm/ADT/GraphTraits.h" #include <vector> diff --git a/include/llvm/Analysis/IntervalIterator.h b/include/llvm/Analysis/IntervalIterator.h index bd9ab20..333f289 100644 --- a/include/llvm/Analysis/IntervalIterator.h +++ b/include/llvm/Analysis/IntervalIterator.h @@ -30,8 +30,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_INTERVAL_ITERATOR_H -#define LLVM_INTERVAL_ITERATOR_H +#ifndef LLVM_ANALYSIS_INTERVALITERATOR_H +#define LLVM_ANALYSIS_INTERVALITERATOR_H #include "llvm/Analysis/IntervalPartition.h" #include "llvm/IR/Function.h" diff --git a/include/llvm/Analysis/IntervalPartition.h b/include/llvm/Analysis/IntervalPartition.h index bce84be..8cade58 100644 --- a/include/llvm/Analysis/IntervalPartition.h +++ b/include/llvm/Analysis/IntervalPartition.h @@ -20,8 +20,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_INTERVAL_PARTITION_H -#define LLVM_INTERVAL_PARTITION_H +#ifndef LLVM_ANALYSIS_INTERVALPARTITION_H +#define LLVM_ANALYSIS_INTERVALPARTITION_H #include "llvm/Analysis/Interval.h" #include "llvm/Pass.h" diff --git a/include/llvm/Analysis/LibCallAliasAnalysis.h b/include/llvm/Analysis/LibCallAliasAnalysis.h index 243234b..c01b210 100644 --- a/include/llvm/Analysis/LibCallAliasAnalysis.h +++ b/include/llvm/Analysis/LibCallAliasAnalysis.h @@ -11,8 +11,8 @@ // 
//===----------------------------------------------------------------------===// -#ifndef LLVM_ANALYSIS_LIBCALL_AA_H -#define LLVM_ANALYSIS_LIBCALL_AA_H +#ifndef LLVM_ANALYSIS_LIBCALLALIASANALYSIS_H +#define LLVM_ANALYSIS_LIBCALLALIASANALYSIS_H #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Pass.h" diff --git a/include/llvm/Analysis/LoopInfo.h b/include/llvm/Analysis/LoopInfo.h index 830754d..a20e065 100644 --- a/include/llvm/Analysis/LoopInfo.h +++ b/include/llvm/Analysis/LoopInfo.h @@ -27,21 +27,17 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_ANALYSIS_LOOP_INFO_H -#define LLVM_ANALYSIS_LOOP_INFO_H +#ifndef LLVM_ANALYSIS_LOOPINFO_H +#define LLVM_ANALYSIS_LOOPINFO_H #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" -#include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/GraphTraits.h" -#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/Dominators.h" #include "llvm/Pass.h" -#include "llvm/Support/CFG.h" #include "llvm/Support/raw_ostream.h" #include <algorithm> -#include <map> namespace llvm { @@ -381,6 +377,20 @@ public: /// isSafeToClone - Return true if the loop body is safe to clone in practice. bool isSafeToClone() const; + /// Returns true if the loop is annotated parallel. + /// + /// A parallel loop can be assumed to not contain any dependencies between + /// iterations by the compiler. That is, any loop-carried dependency checking + /// can be skipped completely when parallelizing the loop on the target + /// machine. Thus, if the parallel loop information originates from the + /// programmer, e.g. via the OpenMP parallel for pragma, it is the + /// programmer's responsibility to ensure there are no loop-carried + /// dependencies. The final execution order of the instructions across + /// iterations is not guaranteed, thus, the end result might or might not + /// implement actual concurrent execution of instructions across multiple + /// iterations. + bool isAnnotatedParallel() const; + /// hasDedicatedExits - Return true if no exit block for the loop /// has a predecessor that is outside the loop. bool hasDedicatedExits() const; diff --git a/include/llvm/Analysis/LoopInfoImpl.h b/include/llvm/Analysis/LoopInfoImpl.h index 4b8e4c9..5485f3c 100644 --- a/include/llvm/Analysis/LoopInfoImpl.h +++ b/include/llvm/Analysis/LoopInfoImpl.h @@ -12,10 +12,11 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_ANALYSIS_LOOP_INFO_IMPL_H -#define LLVM_ANALYSIS_LOOP_INFO_IMPL_H +#ifndef LLVM_ANALYSIS_LOOPINFOIMPL_H +#define LLVM_ANALYSIS_LOOPINFOIMPL_H #include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/Analysis/LoopInfo.h" namespace llvm { diff --git a/include/llvm/Analysis/LoopIterator.h b/include/llvm/Analysis/LoopIterator.h index 68f25f7..e3dd963 100644 --- a/include/llvm/Analysis/LoopIterator.h +++ b/include/llvm/Analysis/LoopIterator.h @@ -21,10 +21,9 @@ // reachable from the loop header. 
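A sketch of querying the new Loop::isAnnotatedParallel() flag (illustrative; the helper is hypothetical):

  #include "llvm/Analysis/LoopInfo.h"

  bool canSkipDependenceChecks(const llvm::Loop *L) {
    // True only when the loop carries parallel-loop metadata (e.g. from an
    // OpenMP-style pragma); loop-carried dependence checks may then be
    // skipped when parallelizing or vectorizing.
    return L->isAnnotatedParallel();
  }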
//===----------------------------------------------------------------------===// -#ifndef LLVM_ANALYSIS_LOOP_ITERATOR_H -#define LLVM_ANALYSIS_LOOP_ITERATOR_H +#ifndef LLVM_ANALYSIS_LOOPITERATOR_H +#define LLVM_ANALYSIS_LOOPITERATOR_H -#include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/Analysis/LoopInfo.h" diff --git a/include/llvm/Analysis/LoopPass.h b/include/llvm/Analysis/LoopPass.h index f78472a..5767c19 100644 --- a/include/llvm/Analysis/LoopPass.h +++ b/include/llvm/Analysis/LoopPass.h @@ -12,11 +12,10 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_LOOP_PASS_H -#define LLVM_LOOP_PASS_H +#ifndef LLVM_ANALYSIS_LOOPPASS_H +#define LLVM_ANALYSIS_LOOPPASS_H #include "llvm/Analysis/LoopInfo.h" -#include "llvm/IR/Function.h" #include "llvm/Pass.h" #include "llvm/PassManagers.h" #include <deque> diff --git a/include/llvm/Analysis/MemoryBuiltins.h b/include/llvm/Analysis/MemoryBuiltins.h index f2c564f..f07658f 100644 --- a/include/llvm/Analysis/MemoryBuiltins.h +++ b/include/llvm/Analysis/MemoryBuiltins.h @@ -158,7 +158,7 @@ class ObjectSizeOffsetVisitor bool RoundToAlign; unsigned IntTyBits; APInt Zero; - SmallPtrSet<Value*, 8> SeenInsts; + SmallPtrSet<Instruction *, 8> SeenInsts; APInt align(APInt Size, uint64_t Align); diff --git a/include/llvm/Analysis/MemoryDependenceAnalysis.h b/include/llvm/Analysis/MemoryDependenceAnalysis.h index b954840..b87f886 100644 --- a/include/llvm/Analysis/MemoryDependenceAnalysis.h +++ b/include/llvm/Analysis/MemoryDependenceAnalysis.h @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_ANALYSIS_MEMORY_DEPENDENCE_H -#define LLVM_ANALYSIS_MEMORY_DEPENDENCE_H +#ifndef LLVM_ANALYSIS_MEMORYDEPENDENCEANALYSIS_H +#define LLVM_ANALYSIS_MEMORYDEPENDENCEANALYSIS_H #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/OwningPtr.h" @@ -34,14 +34,14 @@ namespace llvm { class PredIteratorCache; class DominatorTree; class PHITransAddr; - + /// MemDepResult - A memory dependence query can return one of three different /// answers, described below. class MemDepResult { enum DepType { /// Invalid - Clients of MemDep never see this. Invalid = 0, - + /// Clobber - This is a dependence on the specified instruction which /// clobbers the desired value. The pointer member of the MemDepResult /// pair holds the instruction that clobbers the memory. For example, @@ -72,7 +72,7 @@ namespace llvm { /// and no intervening clobbers. No validation is done that the /// operands to the calls are the same. Def, - + /// Other - This marker indicates that the query has no known dependency /// in the specified block. More detailed state info is encoded in the /// upper part of the pair (i.e. the Instruction*) @@ -99,7 +99,7 @@ namespace llvm { explicit MemDepResult(PairTy V) : Value(V) {} public: MemDepResult() : Value(0, Invalid) {} - + /// get methods: These are static ctor methods for creating various /// MemDepResult kinds. static MemDepResult getDef(Instruction *Inst) { @@ -130,7 +130,7 @@ namespace llvm { /// isDef - Return true if this MemDepResult represents a query that is /// an instruction definition dependency. bool isDef() const { return Value.getInt() == Def; } - + /// isNonLocal - Return true if this MemDepResult represents a query that /// is transparent to the start of the block, but where a non-local hasn't /// been done. 
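A minimal sketch of how a client typically consumes the MemDepResult answers described above, via MemoryDependenceAnalysis::getDependency(); the helper is hypothetical, not part of this patch:

#include "llvm/Analysis/MemoryDependenceAnalysis.h"
#include "llvm/IR/Instruction.h"

using namespace llvm;

// Hypothetical helper: return the local def or clobber of a memory
// instruction, or null when the dependence is non-local or unknown and the
// non-local query interfaces below would be needed instead.
static Instruction *getLocalDepInst(MemoryDependenceAnalysis &MDA,
                                    Instruction *MemInst) {
  MemDepResult Dep = MDA.getDependency(MemInst);
  if (Dep.isDef() || Dep.isClobber())
    return Dep.getInst();
  return 0;
}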
@@ -145,7 +145,7 @@ namespace llvm { return Value.getInt() == Other && Value.getPointer() == reinterpret_cast<Instruction*>(NonFuncLocal); } - + /// isUnknown - Return true if this MemDepResult represents a query which /// cannot and/or will not be computed. bool isUnknown() const { @@ -159,7 +159,7 @@ namespace llvm { if (Value.getInt() == Other) return NULL; return Value.getPointer(); } - + bool operator==(const MemDepResult &M) const { return Value == M.Value; } bool operator!=(const MemDepResult &M) const { return Value != M.Value; } bool operator<(const MemDepResult &M) const { return Value < M.Value; } @@ -175,11 +175,11 @@ namespace llvm { /// In a default-constructed MemDepResult object, the type will be Dirty /// and the instruction pointer will be null. /// - + /// isDirty - Return true if this is a MemDepResult in its dirty/invalid. /// state. bool isDirty() const { return Value.getInt() == Invalid; } - + static MemDepResult getDirty(Instruction *Inst) { return MemDepResult(PairTy(Inst, Invalid)); } @@ -199,16 +199,16 @@ namespace llvm { // BB is the sort key, it can't be changed. BasicBlock *getBB() const { return BB; } - + void setResult(const MemDepResult &R) { Result = R; } const MemDepResult &getResult() const { return Result; } - + bool operator<(const NonLocalDepEntry &RHS) const { return BB < RHS.BB; } }; - + /// NonLocalDepResult - This is a result from a NonLocal dependence query. /// For each BasicBlock (the BB entry) it keeps a MemDepResult and the /// (potentially phi translated) address that was live in the block. @@ -218,17 +218,17 @@ namespace llvm { public: NonLocalDepResult(BasicBlock *bb, MemDepResult result, Value *address) : Entry(bb, result), Address(address) {} - + // BB is the sort key, it can't be changed. BasicBlock *getBB() const { return Entry.getBB(); } - + void setResult(const MemDepResult &R, Value *Addr) { Entry.setResult(R); Address = Addr; } - + const MemDepResult &getResult() const { return Entry.getResult(); } - + /// getAddress - Return the address of this pointer in this block. This can /// be different than the address queried for the non-local result because /// of phi translation. This returns null if the address was not available @@ -238,7 +238,7 @@ namespace llvm { /// The address is always null for a non-local 'call' dependence. Value *getAddress() const { return Address; } }; - + /// MemoryDependenceAnalysis - This is an analysis that determines, for a /// given memory operation, what preceding memory operations it depends on. /// It builds on alias analysis information, and tries to provide a lazy, @@ -297,30 +297,30 @@ namespace llvm { CachedNonLocalPointerInfo NonLocalPointerDeps; // A map from instructions to their non-local pointer dependencies. - typedef DenseMap<Instruction*, + typedef DenseMap<Instruction*, SmallPtrSet<ValueIsLoadPair, 4> > ReverseNonLocalPtrDepTy; ReverseNonLocalPtrDepTy ReverseNonLocalPtrDeps; - + /// PerInstNLInfo - This is the instruction we keep for each cached access /// that we have for an instruction. The pointer is an owning pointer and /// the bool indicates whether we have any dirty bits in the set. typedef std::pair<NonLocalDepInfo, bool> PerInstNLInfo; - + // A map from instructions to their non-local dependencies. typedef DenseMap<Instruction*, PerInstNLInfo> NonLocalDepMapType; - + NonLocalDepMapType NonLocalDeps; - + // A reverse mapping from dependencies to the dependees. This is // used when removing instructions to keep the cache coherent. 
typedef DenseMap<Instruction*, SmallPtrSet<Instruction*, 4> > ReverseDepMapType; ReverseDepMapType ReverseLocalDeps; - + // A reverse mapping from dependencies to the non-local dependees. ReverseDepMapType ReverseNonLocalDeps; - + /// Current AA implementation, just a cache. AliasAnalysis *AA; DataLayout *TD; @@ -333,15 +333,15 @@ namespace llvm { /// Pass Implementation stuff. This doesn't do any analysis eagerly. bool runOnFunction(Function &); - + /// Clean up memory in between runs void releaseMemory(); - + /// getAnalysisUsage - Does not modify anything. It uses Value Numbering /// and Alias Analysis. /// virtual void getAnalysisUsage(AnalysisUsage &AU) const; - + /// getDependency - Return the instruction on which a memory operation /// depends. See the class comment for more details. It is illegal to call /// this on non-memory instructions. @@ -360,8 +360,8 @@ namespace llvm { /// removed. Clients must copy this data if they want it around longer than /// that. const NonLocalDepInfo &getNonLocalCallDependency(CallSite QueryCS); - - + + /// getNonLocalPointerDependency - Perform a full dependency query for an /// access to the specified (non-volatile) memory location, returning the /// set of instructions that either define or clobber the value. @@ -374,7 +374,7 @@ namespace llvm { /// removeInstruction - Remove an instruction from the dependence analysis, /// updating the dependence of instructions that previously depended on it. void removeInstruction(Instruction *InstToRemove); - + /// invalidateCachedPointerInfo - This method is used to invalidate cached /// information about the specified pointer, because it may be too /// conservative in memdep. This is an optional call that can be used when @@ -387,7 +387,7 @@ namespace llvm { /// This needs to be done when the CFG changes, e.g., due to splitting /// critical edges. void invalidateCachedPredecessors(); - + /// getPointerDependencyFrom - Return the instruction on which a memory /// location depends. If isLoad is true, this routine ignores may-aliases /// with read-only operations. If isLoad is false, this routine ignores @@ -396,11 +396,11 @@ namespace llvm { /// Note that this is an uncached query, and thus may be inefficient. /// MemDepResult getPointerDependencyFrom(const AliasAnalysis::Location &Loc, - bool isLoad, + bool isLoad, BasicBlock::iterator ScanIt, BasicBlock *BB); - - + + /// getLoadLoadClobberFullWidthSize - This is a little bit of analysis that /// looks at a memory location for a load (specified by MemLocBase, Offs, /// and Size) and compares it against a load. If the specified load could @@ -413,7 +413,7 @@ namespace llvm { unsigned MemLocSize, const LoadInst *LI, const DataLayout &TD); - + private: MemDepResult getCallSiteDependencyFrom(CallSite C, bool isReadOnlyCall, BasicBlock::iterator ScanIt, @@ -430,11 +430,11 @@ namespace llvm { unsigned NumSortedEntries); void RemoveCachedNonLocalPointerDependencies(ValueIsLoadPair P); - + /// verifyRemoved - Verify that the specified instruction does not occur /// in our internal data structures. 
void verifyRemoved(Instruction *Inst) const; - + }; } // End llvm namespace diff --git a/include/llvm/Analysis/PathNumbering.h b/include/llvm/Analysis/PathNumbering.h index 86b1520..400a37d 100644 --- a/include/llvm/Analysis/PathNumbering.h +++ b/include/llvm/Analysis/PathNumbering.h @@ -23,8 +23,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_PATH_NUMBERING_H -#define LLVM_PATH_NUMBERING_H +#ifndef LLVM_ANALYSIS_PATHNUMBERING_H +#define LLVM_ANALYSIS_PATHNUMBERING_H #include "llvm/Analysis/ProfileInfoTypes.h" #include "llvm/IR/BasicBlock.h" diff --git a/include/llvm/Analysis/PathProfileInfo.h b/include/llvm/Analysis/PathProfileInfo.h index 8684f41..4fce16e 100644 --- a/include/llvm/Analysis/PathProfileInfo.h +++ b/include/llvm/Analysis/PathProfileInfo.h @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_PATHPROFILEINFO_H -#define LLVM_PATHPROFILEINFO_H +#ifndef LLVM_ANALYSIS_PATHPROFILEINFO_H +#define LLVM_ANALYSIS_PATHPROFILEINFO_H #include "llvm/Analysis/PathNumbering.h" #include "llvm/IR/BasicBlock.h" diff --git a/include/llvm/Analysis/PostDominators.h b/include/llvm/Analysis/PostDominators.h index 0eddb91..d082297 100644 --- a/include/llvm/Analysis/PostDominators.h +++ b/include/llvm/Analysis/PostDominators.h @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_ANALYSIS_POST_DOMINATORS_H -#define LLVM_ANALYSIS_POST_DOMINATORS_H +#ifndef LLVM_ANALYSIS_POSTDOMINATORS_H +#define LLVM_ANALYSIS_POSTDOMINATORS_H #include "llvm/Analysis/Dominators.h" diff --git a/include/llvm/Analysis/RegionInfo.h b/include/llvm/Analysis/RegionInfo.h index 48d7ee6..69cc293 100644 --- a/include/llvm/Analysis/RegionInfo.h +++ b/include/llvm/Analysis/RegionInfo.h @@ -24,8 +24,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_ANALYSIS_REGION_INFO_H -#define LLVM_ANALYSIS_REGION_INFO_H +#ifndef LLVM_ANALYSIS_REGIONINFO_H +#define LLVM_ANALYSIS_REGIONINFO_H #include "llvm/ADT/PointerIntPair.h" #include "llvm/Analysis/DominanceFrontier.h" diff --git a/include/llvm/Analysis/RegionIterator.h b/include/llvm/Analysis/RegionIterator.h index bcff227..8fd4263 100644 --- a/include/llvm/Analysis/RegionIterator.h +++ b/include/llvm/Analysis/RegionIterator.h @@ -8,8 +8,8 @@ //===----------------------------------------------------------------------===// // This file defines the iterators to iterate over the elements of a Region. 
//===----------------------------------------------------------------------===// -#ifndef LLVM_ANALYSIS_REGION_ITERATOR_H -#define LLVM_ANALYSIS_REGION_ITERATOR_H +#ifndef LLVM_ANALYSIS_REGIONITERATOR_H +#define LLVM_ANALYSIS_REGIONITERATOR_H #include "llvm/ADT/GraphTraits.h" #include "llvm/ADT/PointerIntPair.h" diff --git a/include/llvm/Analysis/RegionPass.h b/include/llvm/Analysis/RegionPass.h index 54f96d6..0690ac5 100644 --- a/include/llvm/Analysis/RegionPass.h +++ b/include/llvm/Analysis/RegionPass.h @@ -13,8 +13,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_REGION_PASS_H -#define LLVM_REGION_PASS_H +#ifndef LLVM_ANALYSIS_REGIONPASS_H +#define LLVM_ANALYSIS_REGIONPASS_H #include "llvm/Analysis/RegionInfo.h" #include "llvm/IR/Function.h" diff --git a/include/llvm/Analysis/ScalarEvolutionExpander.h b/include/llvm/Analysis/ScalarEvolutionExpander.h index ea45aff..00779fc 100644 --- a/include/llvm/Analysis/ScalarEvolutionExpander.h +++ b/include/llvm/Analysis/ScalarEvolutionExpander.h @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_ANALYSIS_SCALAREVOLUTION_EXPANDER_H -#define LLVM_ANALYSIS_SCALAREVOLUTION_EXPANDER_H +#ifndef LLVM_ANALYSIS_SCALAREVOLUTIONEXPANDER_H +#define LLVM_ANALYSIS_SCALAREVOLUTIONEXPANDER_H #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/ScalarEvolutionNormalization.h" @@ -40,8 +40,10 @@ namespace llvm { // New instructions receive a name to identify them with the current pass. const char* IVName; - std::map<std::pair<const SCEV *, Instruction *>, AssertingVH<Value> > + // InsertedExpressions caches Values for reuse, so must track RAUW. + std::map<std::pair<const SCEV *, Instruction *>, TrackingVH<Value> > InsertedExpressions; + // InsertedValues only flags inserted instructions so needs no RAUW. std::set<AssertingVH<Value> > InsertedValues; std::set<AssertingVH<Value> > InsertedPostIncValues; diff --git a/include/llvm/Analysis/ScalarEvolutionExpressions.h b/include/llvm/Analysis/ScalarEvolutionExpressions.h index b74cb33..eac9113 100644 --- a/include/llvm/Analysis/ScalarEvolutionExpressions.h +++ b/include/llvm/Analysis/ScalarEvolutionExpressions.h @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_ANALYSIS_SCALAREVOLUTION_EXPRESSIONS_H -#define LLVM_ANALYSIS_SCALAREVOLUTION_EXPRESSIONS_H +#ifndef LLVM_ANALYSIS_SCALAREVOLUTIONEXPRESSIONS_H +#define LLVM_ANALYSIS_SCALAREVOLUTIONEXPRESSIONS_H #include "llvm/ADT/SmallPtrSet.h" #include "llvm/Analysis/ScalarEvolution.h" @@ -548,6 +548,151 @@ namespace llvm { SCEVTraversal<SV> T(Visitor); T.visitAll(Root); } + + /// The SCEVRewriter takes a scalar evolution expression and copies all its + /// components. The result after a rewrite is an identical SCEV. 
+ struct SCEVRewriter + : public SCEVVisitor<SCEVRewriter, const SCEV*> { + public: + SCEVRewriter(ScalarEvolution &S) : SE(S) {} + + virtual ~SCEVRewriter() {} + + virtual const SCEV *visitConstant(const SCEVConstant *Constant) { + return Constant; + } + + virtual const SCEV *visitTruncateExpr(const SCEVTruncateExpr *Expr) { + const SCEV *Operand = visit(Expr->getOperand()); + return SE.getTruncateExpr(Operand, Expr->getType()); + } + + virtual const SCEV *visitZeroExtendExpr(const SCEVZeroExtendExpr *Expr) { + const SCEV *Operand = visit(Expr->getOperand()); + return SE.getZeroExtendExpr(Operand, Expr->getType()); + } + + virtual const SCEV *visitSignExtendExpr(const SCEVSignExtendExpr *Expr) { + const SCEV *Operand = visit(Expr->getOperand()); + return SE.getSignExtendExpr(Operand, Expr->getType()); + } + + virtual const SCEV *visitAddExpr(const SCEVAddExpr *Expr) { + SmallVector<const SCEV *, 2> Operands; + for (int i = 0, e = Expr->getNumOperands(); i < e; ++i) + Operands.push_back(visit(Expr->getOperand(i))); + return SE.getAddExpr(Operands); + } + + virtual const SCEV *visitMulExpr(const SCEVMulExpr *Expr) { + SmallVector<const SCEV *, 2> Operands; + for (int i = 0, e = Expr->getNumOperands(); i < e; ++i) + Operands.push_back(visit(Expr->getOperand(i))); + return SE.getMulExpr(Operands); + } + + virtual const SCEV *visitUDivExpr(const SCEVUDivExpr *Expr) { + return SE.getUDivExpr(visit(Expr->getLHS()), visit(Expr->getRHS())); + } + + virtual const SCEV *visitAddRecExpr(const SCEVAddRecExpr *Expr) { + SmallVector<const SCEV *, 2> Operands; + for (int i = 0, e = Expr->getNumOperands(); i < e; ++i) + Operands.push_back(visit(Expr->getOperand(i))); + return SE.getAddRecExpr(Operands, Expr->getLoop(), + Expr->getNoWrapFlags()); + } + + virtual const SCEV *visitSMaxExpr(const SCEVSMaxExpr *Expr) { + SmallVector<const SCEV *, 2> Operands; + for (int i = 0, e = Expr->getNumOperands(); i < e; ++i) + Operands.push_back(visit(Expr->getOperand(i))); + return SE.getSMaxExpr(Operands); + } + + virtual const SCEV *visitUMaxExpr(const SCEVUMaxExpr *Expr) { + SmallVector<const SCEV *, 2> Operands; + for (int i = 0, e = Expr->getNumOperands(); i < e; ++i) + Operands.push_back(visit(Expr->getOperand(i))); + return SE.getUMaxExpr(Operands); + } + + virtual const SCEV *visitUnknown(const SCEVUnknown *Expr) { + return Expr; + } + + virtual const SCEV *visitCouldNotCompute(const SCEVCouldNotCompute *Expr) { + return Expr; + } + + protected: + ScalarEvolution &SE; + }; + + typedef DenseMap<const Value*, Value*> ValueToValueMap; + + /// The SCEVParameterRewriter takes a scalar evolution expression and updates + /// the SCEVUnknown components following the Map (Value -> Value). + struct SCEVParameterRewriter: public SCEVRewriter { + public: + static const SCEV *rewrite(const SCEV *Scev, ScalarEvolution &SE, + ValueToValueMap &Map) { + SCEVParameterRewriter Rewriter(SE, Map); + return Rewriter.visit(Scev); + } + SCEVParameterRewriter(ScalarEvolution &S, ValueToValueMap &M) + : SCEVRewriter(S), Map(M) {} + + virtual const SCEV *visitUnknown(const SCEVUnknown *Expr) { + Value *V = Expr->getValue(); + if (Map.count(V)) + return SE.getUnknown(Map[V]); + return Expr; + } + + private: + ValueToValueMap &Map; + }; + + typedef DenseMap<const Loop*, const SCEV*> LoopToScevMapT; + + /// The SCEVApplyRewriter takes a scalar evolution expression and applies + /// the Map (Loop -> SCEV) to all AddRecExprs. 
+ struct SCEVApplyRewriter: public SCEVRewriter { + public: + static const SCEV *rewrite(const SCEV *Scev, LoopToScevMapT &Map, + ScalarEvolution &SE) { + SCEVApplyRewriter Rewriter(SE, Map); + return Rewriter.visit(Scev); + } + SCEVApplyRewriter(ScalarEvolution &S, LoopToScevMapT &M) + : SCEVRewriter(S), Map(M) {} + + virtual const SCEV *visitAddRecExpr(const SCEVAddRecExpr *Expr) { + SmallVector<const SCEV *, 2> Operands; + for (int i = 0, e = Expr->getNumOperands(); i < e; ++i) + Operands.push_back(visit(Expr->getOperand(i))); + + const Loop *L = Expr->getLoop(); + const SCEV *Res = SE.getAddRecExpr(Operands, L, Expr->getNoWrapFlags()); + + if (0 == Map.count(L)) + return Res; + + const SCEVAddRecExpr *Rec = (const SCEVAddRecExpr *) Res; + return Rec->evaluateAtIteration(Map[L], SE); + } + + private: + LoopToScevMapT &Map; + }; + +/// Applies the Map (Loop -> SCEV) to the given Scev. +static inline const SCEV *apply(const SCEV *Scev, LoopToScevMapT &Map, + ScalarEvolution &SE) { + return SCEVApplyRewriter::rewrite(Scev, Map, SE); +} + } #endif diff --git a/include/llvm/Analysis/ScalarEvolutionNormalization.h b/include/llvm/Analysis/ScalarEvolutionNormalization.h index 342e5937..7c6423a 100644 --- a/include/llvm/Analysis/ScalarEvolutionNormalization.h +++ b/include/llvm/Analysis/ScalarEvolutionNormalization.h @@ -33,8 +33,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_ANALYSIS_SCALAREVOLUTION_NORMALIZATION_H -#define LLVM_ANALYSIS_SCALAREVOLUTION_NORMALIZATION_H +#ifndef LLVM_ANALYSIS_SCALAREVOLUTIONNORMALIZATION_H +#define LLVM_ANALYSIS_SCALAREVOLUTIONNORMALIZATION_H #include "llvm/ADT/SmallPtrSet.h" diff --git a/include/llvm/Analysis/SparsePropagation.h b/include/llvm/Analysis/SparsePropagation.h index 604e306..76c8ccf 100644 --- a/include/llvm/Analysis/SparsePropagation.h +++ b/include/llvm/Analysis/SparsePropagation.h @@ -12,8 +12,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_ANALYSIS_SPARSE_PROPAGATION_H -#define LLVM_ANALYSIS_SPARSE_PROPAGATION_H +#ifndef LLVM_ANALYSIS_SPARSEPROPAGATION_H +#define LLVM_ANALYSIS_SPARSEPROPAGATION_H #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallPtrSet.h" @@ -203,4 +203,4 @@ private: } // end namespace llvm -#endif // LLVM_ANALYSIS_SPARSE_PROPAGATION_H +#endif // LLVM_ANALYSIS_SPARSEPROPAGATION_H diff --git a/include/llvm/Analysis/TargetTransformInfo.h b/include/llvm/Analysis/TargetTransformInfo.h index ddf615f..e1331a1 100644 --- a/include/llvm/Analysis/TargetTransformInfo.h +++ b/include/llvm/Analysis/TargetTransformInfo.h @@ -19,9 +19,10 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_ANALYSIS_TARGET_TRANSFORM_INTERFACE -#define LLVM_ANALYSIS_TARGET_TRANSFORM_INTERFACE +#ifndef LLVM_ANALYSIS_TARGETTRANSFORMINFO_H +#define LLVM_ANALYSIS_TARGETTRANSFORMINFO_H +#include "llvm/CodeGen/ValueTypes.h" #include "llvm/IR/GlobalValue.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/Type.h" @@ -67,6 +68,123 @@ public: /// This class is intended to be subclassed by real implementations. virtual ~TargetTransformInfo() = 0; + /// \name Generic Target Information + /// @{ + + /// \brief Underlying constants for 'cost' values in this interface. + /// + /// Many APIs in this interface return a cost. This enum defines the + /// fundamental values that should be used to interpret (and produce) those + /// costs. 
The costs are returned as an unsigned rather than a member of this + /// enumeration because it is expected that the cost of one IR instruction + /// may have a multiplicative factor to it or otherwise won't fit directly + /// into the enum. Moreover, it is common to sum or average costs which works + /// better as simple integral values. Thus this enum only provides constants. + /// + /// Note that these costs should usually reflect the intersection of code-size + /// cost and execution cost. A free instruction is typically one that folds + /// into another instruction. For example, reg-to-reg moves can often be + /// skipped by renaming the registers in the CPU, but they still are encoded + /// and thus wouldn't be considered 'free' here. + enum TargetCostConstants { + TCC_Free = 0, ///< Expected to fold away in lowering. + TCC_Basic = 1, ///< The cost of a typical 'add' instruction. + TCC_Expensive = 4 ///< The cost of a 'div' instruction on x86. + }; + + /// \brief Estimate the cost of a specific operation when lowered. + /// + /// Note that this is designed to work on an arbitrary synthetic opcode, and + /// thus work for hypothetical queries before an instruction has even been + /// formed. However, this does *not* work for GEPs, and must not be called + /// for a GEP instruction. Instead, use the dedicated getGEPCost interface as + /// analyzing a GEP's cost requires more information. + /// + /// Typically only the result type is required, and the operand type can be + /// omitted. However, if the opcode is one of the cast instructions, the + /// operand type is required. + /// + /// The returned cost is defined in terms of \c TargetCostConstants, see its + /// comments for a detailed explanation of the cost values. + virtual unsigned getOperationCost(unsigned Opcode, Type *Ty, + Type *OpTy = 0) const; + + /// \brief Estimate the cost of a GEP operation when lowered. + /// + /// The contract for this function is the same as \c getOperationCost except + /// that it supports an interface that provides extra information specific to + /// the GEP operation. + virtual unsigned getGEPCost(const Value *Ptr, + ArrayRef<const Value *> Operands) const; + + /// \brief Estimate the cost of a function call when lowered. + /// + /// The contract for this is the same as \c getOperationCost except that it + /// supports an interface that provides extra information specific to call + /// instructions. + /// + /// This is the most basic query for estimating call cost: it only knows the + /// function type and (potentially) the number of arguments at the call site. + /// The latter is only interesting for varargs function types. + virtual unsigned getCallCost(FunctionType *FTy, int NumArgs = -1) const; + + /// \brief Estimate the cost of calling a specific function when lowered. + /// + /// This overload adds the ability to reason about the particular function + /// being called in the event it is a library call with special lowering. + virtual unsigned getCallCost(const Function *F, int NumArgs = -1) const; + + /// \brief Estimate the cost of calling a specific function when lowered. + /// + /// This overload allows specifying a set of candidate argument values. + virtual unsigned getCallCost(const Function *F, + ArrayRef<const Value *> Arguments) const; + + /// \brief Estimate the cost of an intrinsic when lowered. + /// + /// Mirrors the \c getCallCost method but uses an intrinsic identifier. 
+ virtual unsigned getIntrinsicCost(Intrinsic::ID IID, Type *RetTy, + ArrayRef<Type *> ParamTys) const; + + /// \brief Estimate the cost of an intrinsic when lowered. + /// + /// Mirrors the \c getCallCost method but uses an intrinsic identifier. + virtual unsigned getIntrinsicCost(Intrinsic::ID IID, Type *RetTy, + ArrayRef<const Value *> Arguments) const; + + /// \brief Estimate the cost of a given IR user when lowered. + /// + /// This can estimate the cost of either a ConstantExpr or Instruction when + /// lowered. It has two primary advantages over the \c getOperationCost and + /// \c getGEPCost above, and one significant disadvantage: it can only be + /// used when the IR construct has already been formed. + /// + /// The advantages are that it can inspect the SSA use graph to reason more + /// accurately about the cost. For example, all-constant-GEPs can often be + /// folded into a load or other instruction, but if they are used in some + /// other context they may not be folded. This routine can distinguish such + /// cases. + /// + /// The returned cost is defined in terms of \c TargetCostConstants, see its + /// comments for a detailed explanation of the cost values. + virtual unsigned getUserCost(const User *U) const; + + /// \brief Test whether calls to a function lower to actual program function + /// calls. + /// + /// The idea is to test whether the program is likely to require a 'call' + /// instruction or equivalent in order to call the given function. + /// + /// FIXME: It's not clear that this is a good or useful query API. Clients + /// should probably move to simpler cost metrics using the above. + /// Alternatively, we could split the cost interface into distinct code-size + /// and execution-speed costs. This would allow modelling the core of this + /// query more accurately as a call is a single small instruction, but + /// incurs significant execution cost. + virtual bool isLoweredToCall(const Function *F) const; + + /// @} + /// \name Scalar Target Information /// @{ @@ -148,6 +266,14 @@ public: /// set to false, it returns the number of scalar registers. virtual unsigned getNumberOfRegisters(bool Vector) const; + /// \return The width of the largest scalar or vector register type. + virtual unsigned getRegisterBitWidth(bool Vector) const; + + /// \return The maximum unroll factor that the vectorizer should try to + /// perform for this target. This number depends on the level of parallelism + /// and the number of execution units in the CPU. + virtual unsigned getMaximumUnrollFactor() const; + /// \return The expected cost of arithmetic ops, such as mul, xor, fsub, etc. virtual unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty) const; @@ -188,6 +314,12 @@ public: /// split during legalization. Zero is returned when the answer is unknown. virtual unsigned getNumberOfParts(Type *Tp) const; + /// \returns The cost of the address computation. For most targets this can be + /// merged into the instruction indexing mode. Some targets might want to + /// distinguish between address computation for memory operations on vector + /// types and scalar types. Such targets should override this function. + virtual unsigned getAddressComputationCost(Type *Ty) const; + /// @} /// Analysis group identification. 
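A minimal sketch of how the new cost interface composes in practice: getUserCost() results are expressed in TargetCostConstants units and can simply be summed. The helper below is hypothetical, not part of this patch:

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/BasicBlock.h"

using namespace llvm;

// Hypothetical helper: sum TTI user costs over a block. The result is in
// TargetCostConstants units, so TCC_Free instructions contribute nothing.
static unsigned estimateBlockCost(const BasicBlock &BB,
                                  const TargetTransformInfo &TTI) {
  unsigned Cost = 0;
  for (BasicBlock::const_iterator I = BB.begin(), E = BB.end(); I != E; ++I)
    Cost += TTI.getUserCost(&*I);
  return Cost;
}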
diff --git a/include/llvm/Analysis/Trace.h b/include/llvm/Analysis/Trace.h index 26947fe..bedd654 100644 --- a/include/llvm/Analysis/Trace.h +++ b/include/llvm/Analysis/Trace.h @@ -116,4 +116,4 @@ public: } // end namespace llvm -#endif // TRACE_H +#endif // LLVM_ANALYSIS_TRACE_H diff --git a/include/llvm/Analysis/ValueTracking.h b/include/llvm/Analysis/ValueTracking.h index 875c47d..3775ec9 100644 --- a/include/llvm/Analysis/ValueTracking.h +++ b/include/llvm/Analysis/ValueTracking.h @@ -117,10 +117,10 @@ namespace llvm { /// it can be expressed as a base pointer plus a constant offset. Return the /// base and offset to the caller. Value *GetPointerBaseWithConstantOffset(Value *Ptr, int64_t &Offset, - const DataLayout &TD); + const DataLayout *TD); static inline const Value * GetPointerBaseWithConstantOffset(const Value *Ptr, int64_t &Offset, - const DataLayout &TD) { + const DataLayout *TD) { return GetPointerBaseWithConstantOffset(const_cast<Value*>(Ptr), Offset,TD); } @@ -183,6 +183,11 @@ namespace llvm { bool isSafeToSpeculativelyExecute(const Value *V, const DataLayout *TD = 0); + /// isKnownNonNull - Return true if this pointer couldn't possibly be null by + /// its definition. This returns true for allocas, non-extern-weak globals + /// and byval arguments. + bool isKnownNonNull(const Value *V); + } // end namespace llvm #endif diff --git a/include/llvm/Assembly/PrintModulePass.h b/include/llvm/Assembly/PrintModulePass.h index 239fbcc..02b9bd9 100644 --- a/include/llvm/Assembly/PrintModulePass.h +++ b/include/llvm/Assembly/PrintModulePass.h @@ -23,6 +23,7 @@ namespace llvm { class FunctionPass; class ModulePass; + class BasicBlockPass; class raw_ostream; /// createPrintModulePass - Create and return a pass that writes the @@ -37,6 +38,11 @@ namespace llvm { raw_ostream *OS, bool DeleteStream=false); + /// createPrintBasicBlockPass - Create and return a pass that writes the + /// BB to the specified raw_ostream. + BasicBlockPass *createPrintBasicBlockPass(raw_ostream *OS, + bool DeleteStream=false, + const std::string &Banner = ""); } // End llvm namespace #endif diff --git a/include/llvm/Bitcode/BitstreamReader.h b/include/llvm/Bitcode/BitstreamReader.h index 5b60f72..2d2976c 100644 --- a/include/llvm/Bitcode/BitstreamReader.h +++ b/include/llvm/Bitcode/BitstreamReader.h @@ -12,8 +12,8 @@ // //===----------------------------------------------------------------------===// -#ifndef BITSTREAM_READER_H -#define BITSTREAM_READER_H +#ifndef LLVM_BITCODE_BITSTREAMREADER_H +#define LLVM_BITCODE_BITSTREAMREADER_H #include "llvm/ADT/OwningPtr.h" #include "llvm/Bitcode/BitCodes.h" @@ -27,6 +27,11 @@ namespace llvm { class Deserializer; +/// BitstreamReader - This class is used to read from an LLVM bitcode stream, +/// maintaining information that is global to decoding the entire file. While +/// a file is being read, multiple cursors can be independently advanced or +/// skipped around within the file. These are represented by the +/// BitstreamCursor class. class BitstreamReader { public: /// BlockInfo - This contains information emitted to BLOCKINFO_BLOCK blocks. @@ -119,20 +124,63 @@ public: BlockInfoRecords.back().BlockID = BlockID; return BlockInfoRecords.back(); } +}; + +/// BitstreamEntry - When advancing through a bitstream cursor, each advance can +/// discover a few different kinds of entries: +/// Error - Malformed bitcode was found. +/// EndBlock - We've reached the end of the current block (or the end of the +/// file, which is treated like a series of EndBlock records). 
+/// SubBlock - This is the start of a new subblock of a specific ID. +/// Record - This is a record with a specific AbbrevID. +/// +struct BitstreamEntry { + enum { + Error, + EndBlock, + SubBlock, + Record + } Kind; + + unsigned ID; + + static BitstreamEntry getError() { + BitstreamEntry E; E.Kind = Error; return E; + } + static BitstreamEntry getEndBlock() { + BitstreamEntry E; E.Kind = EndBlock; return E; + } + static BitstreamEntry getSubBlock(unsigned ID) { + BitstreamEntry E; E.Kind = SubBlock; E.ID = ID; return E; + } + static BitstreamEntry getRecord(unsigned AbbrevID) { + BitstreamEntry E; E.Kind = Record; E.ID = AbbrevID; return E; + } }; +/// BitstreamCursor - This represents a position within a bitcode file. There +/// may be multiple independent cursors reading within one bitstream, each +/// maintaining their own local state. +/// +/// Unlike iterators, BitstreamCursors are heavy-weight objects that should not +/// be passed by value. class BitstreamCursor { friend class Deserializer; BitstreamReader *BitStream; size_t NextChar; - /// CurWord - This is the current data we have pulled from the stream but have - /// not returned to the client. - uint32_t CurWord; + + /// CurWord/word_t - This is the current data we have pulled from the stream + /// but have not returned to the client. This is specifically and + /// intentionally defined to follow the word size of the host machine for + /// efficiency. We use word_t in places that are aware of this to make it + /// perfectly explicit what is going on. + typedef uint32_t word_t; + word_t CurWord; /// BitsInCurWord - This is the number of bits in CurWord that are valid. This - /// is always from [0...31] inclusive. + /// is always from [0...31/63] inclusive (depending on word size). unsigned BitsInCurWord; // CurCodeSize - This is the declared size of code values used for the current @@ -151,6 +199,7 @@ class BitstreamCursor { /// BlockScope - This tracks the codesize of parent blocks. SmallVector<Block, 8> BlockScope; + public: BitstreamCursor() : BitStream(0), NextChar(0) { } @@ -179,53 +228,10 @@ public: freeState(); } - void operator=(const BitstreamCursor &RHS) { - freeState(); - - BitStream = RHS.BitStream; - NextChar = RHS.NextChar; - CurWord = RHS.CurWord; - BitsInCurWord = RHS.BitsInCurWord; - CurCodeSize = RHS.CurCodeSize; - - // Copy abbreviations, and bump ref counts. - CurAbbrevs = RHS.CurAbbrevs; - for (unsigned i = 0, e = static_cast<unsigned>(CurAbbrevs.size()); - i != e; ++i) - CurAbbrevs[i]->addRef(); - - // Copy block scope and bump ref counts. - BlockScope = RHS.BlockScope; - for (unsigned S = 0, e = static_cast<unsigned>(BlockScope.size()); - S != e; ++S) { - std::vector<BitCodeAbbrev*> &Abbrevs = BlockScope[S].PrevAbbrevs; - for (unsigned i = 0, e = static_cast<unsigned>(Abbrevs.size()); - i != e; ++i) - Abbrevs[i]->addRef(); - } - } - - void freeState() { - // Free all the Abbrevs. - for (unsigned i = 0, e = static_cast<unsigned>(CurAbbrevs.size()); - i != e; ++i) - CurAbbrevs[i]->dropRef(); - CurAbbrevs.clear(); - - // Free all the Abbrevs in the block scope. - for (unsigned S = 0, e = static_cast<unsigned>(BlockScope.size()); - S != e; ++S) { - std::vector<BitCodeAbbrev*> &Abbrevs = BlockScope[S].PrevAbbrevs; - for (unsigned i = 0, e = static_cast<unsigned>(Abbrevs.size()); - i != e; ++i) - Abbrevs[i]->dropRef(); - } - BlockScope.clear(); - } - - /// GetAbbrevIDWidth - Return the number of bits used to encode an abbrev #. 
- unsigned GetAbbrevIDWidth() const { return CurCodeSize; } + void operator=(const BitstreamCursor &RHS); + void freeState(); + bool isEndPos(size_t pos) { return BitStream->getBitcodeBytes().isObjectEnd(static_cast<uint64_t>(pos)); } @@ -236,26 +242,19 @@ public: static_cast<uint64_t>(pos - 1)); } - unsigned char getByte(size_t pos) { - uint8_t byte = -1; - BitStream->getBitcodeBytes().readByte(pos, &byte); - return byte; - } - uint32_t getWord(size_t pos) { - uint8_t buf[sizeof(uint32_t)]; - memset(buf, 0xFF, sizeof(buf)); - BitStream->getBitcodeBytes().readBytes(pos, - sizeof(buf), - buf, - NULL); + uint8_t buf[4] = { 0xFF, 0xFF, 0xFF, 0xFF }; + BitStream->getBitcodeBytes().readBytes(pos, sizeof(buf), buf, NULL); return *reinterpret_cast<support::ulittle32_t *>(buf); } bool AtEndOfStream() { - return isEndPos(NextChar) && BitsInCurWord == 0; + return BitsInCurWord == 0 && isEndPos(NextChar); } + /// getAbbrevIDWidth - Return the number of bits used to encode an abbrev #. + unsigned getAbbrevIDWidth() const { return CurCodeSize; } + /// GetCurrentBitNo - Return the bit # of the bit we are reading. uint64_t GetCurrentBitNo() const { return NextChar*CHAR_BIT - BitsInCurWord; @@ -268,11 +267,64 @@ public: return BitStream; } + /// Flags that modify the behavior of advance(). + enum { + /// AF_DontPopBlockAtEnd - If this flag is used, the advance() method does + /// not automatically pop the block scope when the end of a block is + /// reached. + AF_DontPopBlockAtEnd = 1, + + /// AF_DontAutoprocessAbbrevs - If this flag is used, abbrev entries are + /// returned just like normal records. + AF_DontAutoprocessAbbrevs = 2 + }; + + /// advance - Advance the current bitstream, returning the next entry in the + /// stream. + BitstreamEntry advance(unsigned Flags = 0) { + while (1) { + unsigned Code = ReadCode(); + if (Code == bitc::END_BLOCK) { + // Pop the end of the block unless Flags tells us not to. + if (!(Flags & AF_DontPopBlockAtEnd) && ReadBlockEnd()) + return BitstreamEntry::getError(); + return BitstreamEntry::getEndBlock(); + } + + if (Code == bitc::ENTER_SUBBLOCK) + return BitstreamEntry::getSubBlock(ReadSubBlockID()); + + if (Code == bitc::DEFINE_ABBREV && + !(Flags & AF_DontAutoprocessAbbrevs)) { + // We read and accumulate abbrev's, the client can't do anything with + // them anyway. + ReadAbbrevRecord(); + continue; + } + + return BitstreamEntry::getRecord(Code); + } + } + + /// advanceSkippingSubblocks - This is a convenience function for clients that + /// don't expect any subblocks. This just skips over them automatically. + BitstreamEntry advanceSkippingSubblocks(unsigned Flags = 0) { + while (1) { + // If we found a normal entry, return it. + BitstreamEntry Entry = advance(Flags); + if (Entry.Kind != BitstreamEntry::SubBlock) + return Entry; + + // If we found a sub-block, just skip over it and check the next entry. + if (SkipBlock()) + return BitstreamEntry::getError(); + } + } /// JumpToBit - Reset the stream to the specified bit number. void JumpToBit(uint64_t BitNo) { - uintptr_t ByteNo = uintptr_t(BitNo/8) & ~3; - uintptr_t WordBitNo = uintptr_t(BitNo) & 31; + uintptr_t ByteNo = uintptr_t(BitNo/8) & ~(sizeof(word_t)-1); + unsigned WordBitNo = unsigned(BitNo & (sizeof(word_t)*8-1)); assert(canSkipToPos(ByteNo) && "Invalid location"); // Move the cursor to the right word. @@ -281,16 +333,22 @@ public: CurWord = 0; // Skip over any bits that are already consumed. 
- if (WordBitNo) - Read(static_cast<unsigned>(WordBitNo)); + if (WordBitNo) { + if (sizeof(word_t) > 4) + Read64(WordBitNo); + else + Read(WordBitNo); + } } uint32_t Read(unsigned NumBits) { - assert(NumBits <= 32 && "Cannot return more than 32 bits!"); + assert(NumBits && NumBits <= 32 && + "Cannot return zero or more than 32 bits!"); + // If the field is fully contained by CurWord, return it quickly. if (BitsInCurWord >= NumBits) { - uint32_t R = CurWord & ((1U << NumBits)-1); + uint32_t R = uint32_t(CurWord) & (~0U >> (32-NumBits)); CurWord >>= NumBits; BitsInCurWord -= NumBits; return R; @@ -303,24 +361,37 @@ public: return 0; } - unsigned R = CurWord; + uint32_t R = uint32_t(CurWord); // Read the next word from the stream. - CurWord = getWord(NextChar); - NextChar += 4; + uint8_t Array[sizeof(word_t)] = {0}; + + BitStream->getBitcodeBytes().readBytes(NextChar, sizeof(Array), + Array, NULL); + + // Handle big-endian byte-swapping if necessary. + support::detail::packed_endian_specific_integral + <word_t, support::little, support::unaligned> EndianValue; + memcpy(&EndianValue, Array, sizeof(Array)); + + CurWord = EndianValue; + + NextChar += sizeof(word_t); // Extract NumBits-BitsInCurWord from what we just read. unsigned BitsLeft = NumBits-BitsInCurWord; - // Be careful here, BitsLeft is in the range [1..32] inclusive. - R |= (CurWord & (~0U >> (32-BitsLeft))) << BitsInCurWord; + // Be careful here, BitsLeft is in the range [1..32]/[1..64] inclusive. + R |= uint32_t((CurWord & (word_t(~0ULL) >> (sizeof(word_t)*8-BitsLeft))) + << BitsInCurWord); - // BitsLeft bits have just been used up from CurWord. - if (BitsLeft != 32) + // BitsLeft bits have just been used up from CurWord. BitsLeft is in the + // range [1..32]/[1..64] so be careful how we shift. + if (BitsLeft != sizeof(word_t)*8) CurWord >>= BitsLeft; else CurWord = 0; - BitsInCurWord = 32-BitsLeft; + BitsInCurWord = sizeof(word_t)*8-BitsLeft; return R; } @@ -369,10 +440,21 @@ public: } } - void SkipToWord() { +private: + void SkipToFourByteBoundary() { + // If word_t is 64-bits and if we've read less than 32 bits, just dump + // the bits we have up to the next 32-bit boundary. + if (sizeof(word_t) > 4 && + BitsInCurWord >= 32) { + CurWord >>= BitsInCurWord-32; + BitsInCurWord = 32; + return; + } + BitsInCurWord = 0; CurWord = 0; } +public: unsigned ReadCode() { return Read(CurCodeSize); @@ -395,62 +477,37 @@ public: // Read and ignore the codelen value. Since we are skipping this block, we // don't care what code widths are used inside of it. ReadVBR(bitc::CodeLenWidth); - SkipToWord(); - unsigned NumWords = Read(bitc::BlockSizeWidth); + SkipToFourByteBoundary(); + unsigned NumFourBytes = Read(bitc::BlockSizeWidth); // Check that the block wasn't partially defined, and that the offset isn't // bogus. - size_t SkipTo = NextChar + NumWords*4; - if (AtEndOfStream() || !canSkipToPos(SkipTo)) + size_t SkipTo = GetCurrentBitNo() + NumFourBytes*4*8; + if (AtEndOfStream() || !canSkipToPos(SkipTo/8)) return true; - NextChar = SkipTo; + JumpToBit(SkipTo); return false; } /// EnterSubBlock - Having read the ENTER_SUBBLOCK abbrevid, enter /// the block, and return true if the block has an error. - bool EnterSubBlock(unsigned BlockID, unsigned *NumWordsP = 0) { - // Save the current block's state on BlockScope. - BlockScope.push_back(Block(CurCodeSize)); - BlockScope.back().PrevAbbrevs.swap(CurAbbrevs); - - // Add the abbrevs specific to this block to the CurAbbrevs list. 
- if (const BitstreamReader::BlockInfo *Info = - BitStream->getBlockInfo(BlockID)) { - for (unsigned i = 0, e = static_cast<unsigned>(Info->Abbrevs.size()); - i != e; ++i) { - CurAbbrevs.push_back(Info->Abbrevs[i]); - CurAbbrevs.back()->addRef(); - } - } - - // Get the codesize of this block. - CurCodeSize = ReadVBR(bitc::CodeLenWidth); - SkipToWord(); - unsigned NumWords = Read(bitc::BlockSizeWidth); - if (NumWordsP) *NumWordsP = NumWords; - - // Validate that this block is sane. - if (CurCodeSize == 0 || AtEndOfStream()) - return true; - - return false; - } - + bool EnterSubBlock(unsigned BlockID, unsigned *NumWordsP = 0); + bool ReadBlockEnd() { if (BlockScope.empty()) return true; // Block tail: // [END_BLOCK, <align4bytes>] - SkipToWord(); + SkipToFourByteBoundary(); - PopBlockScope(); + popBlockScope(); return false; } private: - void PopBlockScope() { + + void popBlockScope() { CurCodeSize = BlockScope.back().PrevCodeSize; // Delete abbrevs from popped scope. @@ -462,36 +519,17 @@ private: BlockScope.pop_back(); } - //===--------------------------------------------------------------------===// + //===--------------------------------------------------------------------===// // Record Processing //===--------------------------------------------------------------------===// private: - void ReadAbbreviatedLiteral(const BitCodeAbbrevOp &Op, - SmallVectorImpl<uint64_t> &Vals) { - assert(Op.isLiteral() && "Not a literal"); - // If the abbrev specifies the literal value to use, use it. - Vals.push_back(Op.getLiteralValue()); - } - - void ReadAbbreviatedField(const BitCodeAbbrevOp &Op, - SmallVectorImpl<uint64_t> &Vals) { - assert(!Op.isLiteral() && "Use ReadAbbreviatedLiteral for literals!"); - - // Decode the value as we are commanded. - switch (Op.getEncoding()) { - default: llvm_unreachable("Unknown encoding!"); - case BitCodeAbbrevOp::Fixed: - Vals.push_back(Read((unsigned)Op.getEncodingData())); - break; - case BitCodeAbbrevOp::VBR: - Vals.push_back(ReadVBR64((unsigned)Op.getEncodingData())); - break; - case BitCodeAbbrevOp::Char6: - Vals.push_back(BitCodeAbbrevOp::DecodeChar6(Read(6))); - break; - } - } + void readAbbreviatedLiteral(const BitCodeAbbrevOp &Op, + SmallVectorImpl<uint64_t> &Vals); + void readAbbreviatedField(const BitCodeAbbrevOp &Op, + SmallVectorImpl<uint64_t> &Vals); + void skipAbbreviatedField(const BitCodeAbbrevOp &Op); + public: /// getAbbrev - Return the abbreviation for the specified AbbrevId. @@ -501,166 +539,18 @@ public: return CurAbbrevs[AbbrevNo]; } - unsigned ReadRecord(unsigned AbbrevID, SmallVectorImpl<uint64_t> &Vals, - const char **BlobStart = 0, unsigned *BlobLen = 0) { - if (AbbrevID == bitc::UNABBREV_RECORD) { - unsigned Code = ReadVBR(6); - unsigned NumElts = ReadVBR(6); - for (unsigned i = 0; i != NumElts; ++i) - Vals.push_back(ReadVBR64(6)); - return Code; - } - - const BitCodeAbbrev *Abbv = getAbbrev(AbbrevID); - - for (unsigned i = 0, e = Abbv->getNumOperandInfos(); i != e; ++i) { - const BitCodeAbbrevOp &Op = Abbv->getOperandInfo(i); - if (Op.isLiteral()) { - ReadAbbreviatedLiteral(Op, Vals); - } else if (Op.getEncoding() == BitCodeAbbrevOp::Array) { - // Array case. Read the number of elements as a vbr6. - unsigned NumElts = ReadVBR(6); - - // Get the element encoding. - assert(i+2 == e && "array op not second to last?"); - const BitCodeAbbrevOp &EltEnc = Abbv->getOperandInfo(++i); - - // Read all the elements. 
- for (; NumElts; --NumElts) - ReadAbbreviatedField(EltEnc, Vals); - } else if (Op.getEncoding() == BitCodeAbbrevOp::Blob) { - // Blob case. Read the number of bytes as a vbr6. - unsigned NumElts = ReadVBR(6); - SkipToWord(); // 32-bit alignment - - // Figure out where the end of this blob will be including tail padding. - size_t NewEnd = NextChar+((NumElts+3)&~3); - - // If this would read off the end of the bitcode file, just set the - // record to empty and return. - if (!canSkipToPos(NewEnd)) { - Vals.append(NumElts, 0); - NextChar = BitStream->getBitcodeBytes().getExtent(); - break; - } - - // Otherwise, read the number of bytes. If we can return a reference to - // the data, do so to avoid copying it. - if (BlobStart) { - *BlobStart = (const char*)BitStream->getBitcodeBytes().getPointer( - NextChar, NumElts); - *BlobLen = NumElts; - } else { - for (; NumElts; ++NextChar, --NumElts) - Vals.push_back(getByte(NextChar)); - } - // Skip over tail padding. - NextChar = NewEnd; - } else { - ReadAbbreviatedField(Op, Vals); - } - } - - unsigned Code = (unsigned)Vals[0]; - Vals.erase(Vals.begin()); - return Code; - } - - unsigned ReadRecord(unsigned AbbrevID, SmallVectorImpl<uint64_t> &Vals, - const char *&BlobStart, unsigned &BlobLen) { - return ReadRecord(AbbrevID, Vals, &BlobStart, &BlobLen); - } - + /// skipRecord - Read the current record and discard it. + void skipRecord(unsigned AbbrevID); + + unsigned readRecord(unsigned AbbrevID, SmallVectorImpl<uint64_t> &Vals, + StringRef *Blob = 0); //===--------------------------------------------------------------------===// // Abbrev Processing //===--------------------------------------------------------------------===// - - void ReadAbbrevRecord() { - BitCodeAbbrev *Abbv = new BitCodeAbbrev(); - unsigned NumOpInfo = ReadVBR(5); - for (unsigned i = 0; i != NumOpInfo; ++i) { - bool IsLiteral = Read(1) ? true : false; - if (IsLiteral) { - Abbv->Add(BitCodeAbbrevOp(ReadVBR64(8))); - continue; - } - - BitCodeAbbrevOp::Encoding E = (BitCodeAbbrevOp::Encoding)Read(3); - if (BitCodeAbbrevOp::hasEncodingData(E)) - Abbv->Add(BitCodeAbbrevOp(E, ReadVBR64(5))); - else - Abbv->Add(BitCodeAbbrevOp(E)); - } - CurAbbrevs.push_back(Abbv); - } - -public: - - bool ReadBlockInfoBlock() { - // If this is the second stream to get to the block info block, skip it. - if (BitStream->hasBlockInfoRecords()) - return SkipBlock(); - - if (EnterSubBlock(bitc::BLOCKINFO_BLOCK_ID)) return true; - - SmallVector<uint64_t, 64> Record; - BitstreamReader::BlockInfo *CurBlockInfo = 0; - - // Read all the records for this module. - while (1) { - unsigned Code = ReadCode(); - if (Code == bitc::END_BLOCK) - return ReadBlockEnd(); - if (Code == bitc::ENTER_SUBBLOCK) { - ReadSubBlockID(); - if (SkipBlock()) return true; - continue; - } - - // Read abbrev records, associate them with CurBID. - if (Code == bitc::DEFINE_ABBREV) { - if (!CurBlockInfo) return true; - ReadAbbrevRecord(); - - // ReadAbbrevRecord installs the abbrev in CurAbbrevs. Move it to the - // appropriate BlockInfo. - BitCodeAbbrev *Abbv = CurAbbrevs.back(); - CurAbbrevs.pop_back(); - CurBlockInfo->Abbrevs.push_back(Abbv); - continue; - } - - // Read a record. - Record.clear(); - switch (ReadRecord(Code, Record)) { - default: break; // Default behavior, ignore unknown content. 
- case bitc::BLOCKINFO_CODE_SETBID: - if (Record.size() < 1) return true; - CurBlockInfo = &BitStream->getOrCreateBlockInfo((unsigned)Record[0]); - break; - case bitc::BLOCKINFO_CODE_BLOCKNAME: { - if (!CurBlockInfo) return true; - if (BitStream->isIgnoringBlockInfoNames()) break; // Ignore name. - std::string Name; - for (unsigned i = 0, e = Record.size(); i != e; ++i) - Name += (char)Record[i]; - CurBlockInfo->Name = Name; - break; - } - case bitc::BLOCKINFO_CODE_SETRECORDNAME: { - if (!CurBlockInfo) return true; - if (BitStream->isIgnoringBlockInfoNames()) break; // Ignore name. - std::string Name; - for (unsigned i = 1, e = Record.size(); i != e; ++i) - Name += (char)Record[i]; - CurBlockInfo->RecordNames.push_back(std::make_pair((unsigned)Record[0], - Name)); - break; - } - } - } - } + void ReadAbbrevRecord(); + + bool ReadBlockInfoBlock(); }; } // End llvm namespace diff --git a/include/llvm/Bitcode/BitstreamWriter.h b/include/llvm/Bitcode/BitstreamWriter.h index 1021fbd..7b68f87 100644 --- a/include/llvm/Bitcode/BitstreamWriter.h +++ b/include/llvm/Bitcode/BitstreamWriter.h @@ -12,8 +12,8 @@ // //===----------------------------------------------------------------------===// -#ifndef BITSTREAM_WRITER_H -#define BITSTREAM_WRITER_H +#ifndef LLVM_BITCODE_BITSTREAMWRITER_H +#define LLVM_BITCODE_BITSTREAMWRITER_H #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" diff --git a/include/llvm/Bitcode/LLVMBitCodes.h b/include/llvm/Bitcode/LLVMBitCodes.h index 601777d..f9690d5 100644 --- a/include/llvm/Bitcode/LLVMBitCodes.h +++ b/include/llvm/Bitcode/LLVMBitCodes.h @@ -29,13 +29,12 @@ namespace bitc { // Module sub-block id's. PARAMATTR_BLOCK_ID, - - UNUSED_ID1, + PARAMATTR_GROUP_BLOCK_ID, CONSTANTS_BLOCK_ID, FUNCTION_BLOCK_ID, - UNUSED_ID2, + UNUSED_ID1, VALUE_SYMTAB_BLOCK_ID, METADATA_BLOCK_ID, @@ -69,7 +68,7 @@ namespace bitc { // ALIAS: [alias type, aliasee val#, linkage, visibility] MODULE_CODE_ALIAS = 9, - /// MODULE_CODE_PURGEVALS: [numvals] + // MODULE_CODE_PURGEVALS: [numvals] MODULE_CODE_PURGEVALS = 10, MODULE_CODE_GCNAME = 11 // GCNAME: [strchr x N] @@ -77,7 +76,12 @@ namespace bitc { /// PARAMATTR blocks have code for defining a parameter attribute set. enum AttributeCodes { - PARAMATTR_CODE_ENTRY = 1 // ENTRY: [paramidx0, attr0, paramidx1, attr1...] + // FIXME: Remove `PARAMATTR_CODE_ENTRY_OLD' in 4.0 + PARAMATTR_CODE_ENTRY_OLD = 1, // ENTRY: [paramidx0, attr0, + // paramidx1, attr1...] + PARAMATTR_CODE_ENTRY = 2, // ENTRY: [paramidx0, attrgrp0, + // paramidx1, attrgrp1, ...] + PARAMATTR_GRP_CODE_ENTRY = 3 // ENTRY: [id, attr0, att1, ...] }; /// TYPE blocks have codes for each type primitive they use. @@ -143,6 +147,7 @@ namespace bitc { METADATA_NAMED_NODE = 10, // NAMED_NODE: [n x mdnodes] METADATA_ATTACHMENT = 11 // [m x [value, [n x [id, mdnode]]] }; + // The constants block (CONSTANTS_BLOCK_ID) describes emission for each // constant and maintains an implicit current type value. 
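For orientation, the reading idiom the new BitstreamCursor advance()/readRecord() interface above is designed for looks roughly like this (a hypothetical sketch, not part of this patch):

#include "llvm/ADT/SmallVector.h"
#include "llvm/Bitcode/BitstreamReader.h"

using namespace llvm;

// Hypothetical helper: read every record in the current block, skipping
// nested sub-blocks. Returns true on malformed bitcode.
static bool readCurrentBlock(BitstreamCursor &Stream) {
  SmallVector<uint64_t, 64> Record;
  while (1) {
    BitstreamEntry Entry = Stream.advanceSkippingSubblocks();
    if (Entry.Kind == BitstreamEntry::Error)
      return true;
    if (Entry.Kind == BitstreamEntry::EndBlock)
      return false;
    // Otherwise Entry.Kind == BitstreamEntry::Record.
    Record.clear();
    unsigned Code = Stream.readRecord(Entry.ID, Record);
    (void)Code; // A real reader would switch on Code and consume Record.
  }
}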
enum ConstantsCodes { diff --git a/include/llvm/Bitcode/ReaderWriter.h b/include/llvm/Bitcode/ReaderWriter.h index dd96b04..78f40ca 100644 --- a/include/llvm/Bitcode/ReaderWriter.h +++ b/include/llvm/Bitcode/ReaderWriter.h @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_BITCODE_H -#define LLVM_BITCODE_H +#ifndef LLVM_BITCODE_READERWRITER_H +#define LLVM_BITCODE_READERWRITER_H #include <string> diff --git a/include/llvm/CodeGen/Analysis.h b/include/llvm/CodeGen/Analysis.h index 81e75d8..ce9ca0a 100644 --- a/include/llvm/CodeGen/Analysis.h +++ b/include/llvm/CodeGen/Analysis.h @@ -86,8 +86,7 @@ ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred); /// between it and the return. /// /// This function only tests target-independent requirements. -bool isInTailCallPosition(ImmutableCallSite CS, Attribute CalleeRetAttr, - const TargetLowering &TLI); +bool isInTailCallPosition(ImmutableCallSite CS, const TargetLowering &TLI); } // End llvm namespace diff --git a/include/llvm/CodeGen/CommandFlags.h b/include/llvm/CodeGen/CommandFlags.h index a66e05b..9a27661 100644 --- a/include/llvm/CodeGen/CommandFlags.h +++ b/include/llvm/CodeGen/CommandFlags.h @@ -13,8 +13,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CODEGEN_COMMAND_LINE_FLAGS_H -#define LLVM_CODEGEN_COMMAND_LINE_FLAGS_H +#ifndef LLVM_CODEGEN_COMMANDFLAGS_H +#define LLVM_CODEGEN_COMMANDFLAGS_H #include "llvm/Support/CodeGen.h" #include "llvm/Support/CommandLine.h" diff --git a/include/llvm/CodeGen/FastISel.h b/include/llvm/CodeGen/FastISel.h index 0561905..57273d8 100644 --- a/include/llvm/CodeGen/FastISel.h +++ b/include/llvm/CodeGen/FastISel.h @@ -90,6 +90,11 @@ public: /// getCurDebugLoc() - Return current debug location information. DebugLoc getCurDebugLoc() const { return DL; } + + /// LowerArguments - Do "fast" instruction selection for function arguments + /// and append machine instructions to the current block. Return true if + /// it is successful. + bool LowerArguments(); /// SelectInstruction - Do "fast" instruction selection for the given /// LLVM IR instruction, and append generated machine instructions to @@ -160,6 +165,11 @@ protected: /// virtual bool TargetSelectInstruction(const Instruction *I) = 0; + + /// FastLowerArguments - This method is called by target-independent code to + /// do target specific argument lowering. It returns true if it was + /// successful. 
+ virtual bool FastLowerArguments(); /// FastEmit_r - This method is called by target-independent code /// to request that an instruction with the given type and opcode @@ -362,6 +372,11 @@ protected: return 0; } + /// Whether we should skip target-independent fast-isel + virtual bool SkipTargetIndependentFastISel() { + return false; + } + private: bool SelectBinaryOp(const User *I, unsigned ISDOpcode); diff --git a/include/llvm/CodeGen/FunctionLoweringInfo.h b/include/llvm/CodeGen/FunctionLoweringInfo.h index 72e0937..ea6cb27 100644 --- a/include/llvm/CodeGen/FunctionLoweringInfo.h +++ b/include/llvm/CodeGen/FunctionLoweringInfo.h @@ -17,17 +17,13 @@ #include "llvm/ADT/APInt.h" #include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/DenseSet.h" #include "llvm/ADT/IndexedMap.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/Analysis/BranchProbabilityInfo.h" -#include "llvm/CodeGen/ISDOpcodes.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/ValueTypes.h" #include "llvm/IR/InlineAsm.h" #include "llvm/IR/Instructions.h" -#include "llvm/Support/CallSite.h" #include "llvm/Target/TargetRegisterInfo.h" #include <vector> @@ -35,6 +31,7 @@ namespace llvm { class AllocaInst; class BasicBlock; +class BranchProbabilityInfo; class CallInst; class Function; class GlobalVariable; diff --git a/include/llvm/CodeGen/ISDOpcodes.h b/include/llvm/CodeGen/ISDOpcodes.h index 5d0a3b4..092fa5a 100644 --- a/include/llvm/CodeGen/ISDOpcodes.h +++ b/include/llvm/CodeGen/ISDOpcodes.h @@ -455,6 +455,9 @@ namespace ISD { FNEG, FABS, FSQRT, FSIN, FCOS, FPOWI, FPOW, FLOG, FLOG2, FLOG10, FEXP, FEXP2, FCEIL, FTRUNC, FRINT, FNEARBYINT, FFLOOR, + + /// FSINCOS - Compute both fsin and fcos as a single operation. + FSINCOS, /// LOAD and STORE have token chains as their first operand, then the same /// operands as an LLVM load/store instruction, then an offset node that diff --git a/include/llvm/CodeGen/JITCodeEmitter.h b/include/llvm/CodeGen/JITCodeEmitter.h index 21a3f48..9a73214 100644 --- a/include/llvm/CodeGen/JITCodeEmitter.h +++ b/include/llvm/CodeGen/JITCodeEmitter.h @@ -207,8 +207,7 @@ public: /// emitString - This callback is invoked when a String needs to be /// written to the output stream. 
void emitString(const std::string &String) { - for (unsigned i = 0, N = static_cast<unsigned>(String.size()); - i < N; ++i) { + for (size_t i = 0, N = String.size(); i < N; ++i) { uint8_t C = String[i]; emitByte(C); } } diff --git a/include/llvm/CodeGen/LatencyPriorityQueue.h b/include/llvm/CodeGen/LatencyPriorityQueue.h index 8fb31aa..d454347 100644 --- a/include/llvm/CodeGen/LatencyPriorityQueue.h +++ b/include/llvm/CodeGen/LatencyPriorityQueue.h @@ -13,8 +13,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LATENCY_PRIORITY_QUEUE_H -#define LATENCY_PRIORITY_QUEUE_H +#ifndef LLVM_CODEGEN_LATENCYPRIORITYQUEUE_H +#define LLVM_CODEGEN_LATENCYPRIORITYQUEUE_H #include "llvm/CodeGen/ScheduleDAG.h" diff --git a/include/llvm/CodeGen/LexicalScopes.h b/include/llvm/CodeGen/LexicalScopes.h index 3cb6b4d..ff65db4 100644 --- a/include/llvm/CodeGen/LexicalScopes.h +++ b/include/llvm/CodeGen/LexicalScopes.h @@ -159,9 +159,6 @@ public: LexicalScope(LexicalScope *P, const MDNode *D, const MDNode *I, bool A) : Parent(P), Desc(D), InlinedAtLocation(I), AbstractScope(A), LastInsn(0), FirstInsn(0), DFSIn(0), DFSOut(0) { -#ifndef NDEBUG - IndentLevel = 0; -#endif if (Parent) Parent->addChild(this); } @@ -228,7 +225,7 @@ public: void setDFSIn(unsigned I) { DFSIn = I; } /// dump - print lexical scope. - void dump() const; + void dump(unsigned Indent = 0) const; private: LexicalScope *Parent; // Parent to this scope. @@ -244,9 +241,6 @@ private: const MachineInstr *FirstInsn; // First instruction of this scope. unsigned DFSIn, DFSOut; // In & Out Depth use to determine // scope nesting. -#ifndef NDEBUG - mutable unsigned IndentLevel; // Private state for dump() -#endif }; } // end llvm namespace diff --git a/include/llvm/CodeGen/LiveInterval.h b/include/llvm/CodeGen/LiveInterval.h index 9576075..244be9c 100644 --- a/include/llvm/CodeGen/LiveInterval.h +++ b/include/llvm/CodeGen/LiveInterval.h @@ -86,9 +86,10 @@ namespace llvm { SlotIndex end; // End point of the interval (exclusive) VNInfo *valno; // identifier for the value contained in this interval. + LiveRange() : valno(0) {} + LiveRange(SlotIndex S, SlotIndex E, VNInfo *V) : start(S), end(E), valno(V) { - assert(S < E && "Cannot create empty or backwards range"); } @@ -373,8 +374,8 @@ namespace llvm { /// addRange - Add the specified LiveRange to this interval, merging /// intervals as appropriate. This returns an iterator to the inserted live /// range (which may have grown since it was inserted). - void addRange(LiveRange LR) { - addRangeFrom(LR, ranges.begin()); + iterator addRange(LiveRange LR) { + return addRangeFrom(LR, ranges.begin()); } /// extendInBlock - If this interval is live before Kill in the basic block @@ -460,9 +461,6 @@ namespace llvm { void extendIntervalEndTo(Ranges::iterator I, SlotIndex NewEnd); Ranges::iterator extendIntervalStartTo(Ranges::iterator I, SlotIndex NewStr); void markValNoForDeletion(VNInfo *V); - void mergeIntervalRanges(const LiveInterval &RHS, - VNInfo *LHSValNo = 0, - const VNInfo *RHSValNo = 0); LiveInterval& operator=(const LiveInterval& rhs) LLVM_DELETED_FUNCTION; @@ -473,6 +471,64 @@ namespace llvm { return OS; } + /// Helper class for performant LiveInterval bulk updates. + /// + /// Calling LiveInterval::addRange() repeatedly can be expensive on large + /// live ranges because segments after the insertion point may need to be + /// shifted. The LiveRangeUpdater class can defer the shifting when adding + /// many segments in order.
+ /// + /// The LiveInterval will be in an invalid state until flush() is called. + class LiveRangeUpdater { + LiveInterval *LI; + SlotIndex LastStart; + LiveInterval::iterator WriteI; + LiveInterval::iterator ReadI; + SmallVector<LiveRange, 16> Spills; + void mergeSpills(); + + public: + /// Create a LiveRangeUpdater for adding segments to LI. + /// LI will temporarily be in an invalid state until flush() is called. + LiveRangeUpdater(LiveInterval *li = 0) : LI(li) {} + + ~LiveRangeUpdater() { flush(); } + + /// Add a segment to LI and coalesce when possible, just like LI.addRange(). + /// Segments should be added in increasing start order for best performance. + void add(LiveRange); + + void add(SlotIndex Start, SlotIndex End, VNInfo *VNI) { + add(LiveRange(Start, End, VNI)); + } + + /// Return true if the LI is currently in an invalid state, and flush() + /// needs to be called. + bool isDirty() const { return LastStart.isValid(); } + + /// Flush the updater state to LI so it is valid and contains all added + /// segments. + void flush(); + + /// Select a different destination live range. + void setDest(LiveInterval *li) { + if (LI != li && isDirty()) + flush(); + LI = li; + } + + /// Get the current destination live range. + LiveInterval *getDest() const { return LI; } + + void dump() const; + void print(raw_ostream&) const; + }; + + inline raw_ostream &operator<<(raw_ostream &OS, const LiveRangeUpdater &X) { + X.print(OS); + return OS; + } + /// LiveRangeQuery - Query information about a live range around a given /// instruction. This class hides the implementation details of live ranges, /// and it should be used as the primary interface for examining live ranges diff --git a/include/llvm/CodeGen/LiveIntervalAnalysis.h b/include/llvm/CodeGen/LiveIntervalAnalysis.h index 9e89519..4632368 100644 --- a/include/llvm/CodeGen/LiveIntervalAnalysis.h +++ b/include/llvm/CodeGen/LiveIntervalAnalysis.h @@ -53,7 +53,6 @@ namespace llvm { const TargetRegisterInfo* TRI; const TargetInstrInfo* TII; AliasAnalysis *AA; - LiveVariables* LV; SlotIndexes* Indexes; MachineDominatorTree *DomTree; LiveRangeCalc *LRCalc; @@ -215,6 +214,13 @@ namespace llvm { return Indexes->getMBBFromIndex(index); } + void insertMBBInMaps(MachineBasicBlock *MBB) { + Indexes->insertMBBInMaps(MBB); + assert(unsigned(MBB->getNumber()) == RegMaskBlocks.size() && + "Blocks must be added in order."); + RegMaskBlocks.push_back(std::make_pair(RegMaskSlots.size(), 0)); + } + SlotIndex InsertMachineInstrInMaps(MachineInstr *MI) { return Indexes->insertMachineInstrInMaps(MI); } @@ -275,6 +281,21 @@ namespace llvm { void handleMoveIntoBundle(MachineInstr* MI, MachineInstr* BundleStart, bool UpdateFlags = false); + /// repairIntervalsInRange - Update live intervals for instructions in a + /// range of iterators. It is intended for use after target hooks that may + /// insert or remove instructions, and is only efficient for a small number + /// of instructions. + /// + /// OrigRegs is a vector of registers that were originally used by the + /// instructions in the range between the two iterators. + /// + /// Currently, the only changes that are supported are simple removal + /// and addition of uses. + void repairIntervalsInRange(MachineBasicBlock *MBB, + MachineBasicBlock::iterator Begin, + MachineBasicBlock::iterator End, + ArrayRef<unsigned> OrigRegs); +
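A usage sketch for the updater may help; this is illustrative only, and the helper name and its ArrayRef parameter are assumptions rather than code from the patch:

#include "llvm/ADT/ArrayRef.h"
#include "llvm/CodeGen/LiveInterval.h"
using namespace llvm;

// Hypothetical helper: append many segments to LI in increasing start order.
// LiveRangeUpdater defers the tail shifting that repeated LI->addRange()
// calls would do; flush() (also run by the destructor) makes LI valid again.
static void addSegmentsInOrder(LiveInterval *LI, ArrayRef<LiveRange> Segs) {
  LiveRangeUpdater Updater(LI);
  for (unsigned i = 0, e = Segs.size(); i != e; ++i)
    Updater.add(Segs[i]); // coalesces with neighbors, like LI->addRange()
  Updater.flush();
}

// Register mask functions.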
// // Machine instructions may use a register mask operand to indicate that a @@ -352,36 +373,12 @@ namespace llvm { } private: - /// computeIntervals - Compute live intervals. - void computeIntervals(); - /// Compute live intervals for all virtual registers. void computeVirtRegs(); /// Compute RegMaskSlots and RegMaskBits. void computeRegMasks(); - /// handleRegisterDef - update intervals for a register def - /// (calls handleVirtualRegisterDef) - void handleRegisterDef(MachineBasicBlock *MBB, - MachineBasicBlock::iterator MI, - SlotIndex MIIdx, - MachineOperand& MO, unsigned MOIdx); - - /// isPartialRedef - Return true if the specified def at the specific index - /// is partially re-defining the specified live interval. A common case of - /// this is a definition of the sub-register. - bool isPartialRedef(SlotIndex MIIdx, MachineOperand &MO, - LiveInterval &interval); - - /// handleVirtualRegisterDef - update intervals for a virtual - /// register def - void handleVirtualRegisterDef(MachineBasicBlock *MBB, - MachineBasicBlock::iterator MI, - SlotIndex MIIdx, MachineOperand& MO, - unsigned MOIdx, - LiveInterval& interval); - static LiveInterval* createInterval(unsigned Reg); void printInstrs(raw_ostream &O) const; diff --git a/include/llvm/CodeGen/LiveIntervalUnion.h b/include/llvm/CodeGen/LiveIntervalUnion.h index 6a61614..615b339 100644 --- a/include/llvm/CodeGen/LiveIntervalUnion.h +++ b/include/llvm/CodeGen/LiveIntervalUnion.h @@ -14,8 +14,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CODEGEN_LIVEINTERVALUNION -#define LLVM_CODEGEN_LIVEINTERVALUNION +#ifndef LLVM_CODEGEN_LIVEINTERVALUNION_H +#define LLVM_CODEGEN_LIVEINTERVALUNION_H #include "llvm/ADT/IntervalMap.h" #include "llvm/CodeGen/LiveInterval.h" @@ -202,4 +202,4 @@ public: } // end namespace llvm -#endif // !defined(LLVM_CODEGEN_LIVEINTERVALUNION) +#endif // !defined(LLVM_CODEGEN_LIVEINTERVALUNION_H) diff --git a/include/llvm/CodeGen/LiveStackAnalysis.h b/include/llvm/CodeGen/LiveStackAnalysis.h index a3b1855..92c35f7 100644 --- a/include/llvm/CodeGen/LiveStackAnalysis.h +++ b/include/llvm/CodeGen/LiveStackAnalysis.h @@ -13,8 +13,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CODEGEN_LIVESTACK_ANALYSIS_H -#define LLVM_CODEGEN_LIVESTACK_ANALYSIS_H +#ifndef LLVM_CODEGEN_LIVESTACKANALYSIS_H +#define LLVM_CODEGEN_LIVESTACKANALYSIS_H #include "llvm/CodeGen/LiveInterval.h" #include "llvm/CodeGen/MachineFunctionPass.h" diff --git a/include/llvm/CodeGen/MachORelocation.h b/include/llvm/CodeGen/MachORelocation.h index 21fe74f..8c9b7a8 100644 --- a/include/llvm/CodeGen/MachORelocation.h +++ b/include/llvm/CodeGen/MachORelocation.h @@ -12,8 +12,8 @@ //===----------------------------------------------------------------------===// -#ifndef LLVM_CODEGEN_MACHO_RELOCATION_H -#define LLVM_CODEGEN_MACHO_RELOCATION_H +#ifndef LLVM_CODEGEN_MACHORELOCATION_H +#define LLVM_CODEGEN_MACHORELOCATION_H #include "llvm/Support/DataTypes.h" @@ -53,4 +53,4 @@ namespace llvm { } // end llvm namespace -#endif // LLVM_CODEGEN_MACHO_RELOCATION_H +#endif // LLVM_CODEGEN_MACHORELOCATION_H diff --git a/include/llvm/CodeGen/MachineCodeInfo.h b/include/llvm/CodeGen/MachineCodeInfo.h index c5c0c44..ba9dfab 100644 --- a/include/llvm/CodeGen/MachineCodeInfo.h +++ b/include/llvm/CodeGen/MachineCodeInfo.h @@ -14,8 +14,8 @@ // //===----------------------------------------------------------------------===// -#ifndef EE_MACHINE_CODE_INFO_H -#define 
EE_MACHINE_CODE_INFO_H +#ifndef LLVM_CODEGEN_MACHINECODEINFO_H +#define LLVM_CODEGEN_MACHINECODEINFO_H #include "llvm/Support/DataTypes.h" diff --git a/include/llvm/CodeGen/MachineDominators.h b/include/llvm/CodeGen/MachineDominators.h index 40b2542..e41d206 100644 --- a/include/llvm/CodeGen/MachineDominators.h +++ b/include/llvm/CodeGen/MachineDominators.h @@ -41,15 +41,15 @@ class MachineDominatorTree : public MachineFunctionPass { public: static char ID; // Pass ID, replacement for typeid DominatorTreeBase<MachineBasicBlock>* DT; - + MachineDominatorTree(); - + ~MachineDominatorTree(); - + DominatorTreeBase<MachineBasicBlock>& getBase() { return *DT; } - + virtual void getAnalysisUsage(AnalysisUsage &AU) const; - + /// getRoots - Return the root blocks of the current CFG. This may include /// multiple blocks if we are computing post dominators. For forward /// dominators, this will always be a single block (the entry node). @@ -57,33 +57,35 @@ public: inline const std::vector<MachineBasicBlock*> &getRoots() const { return DT->getRoots(); } - + inline MachineBasicBlock *getRoot() const { return DT->getRoot(); } - + inline MachineDomTreeNode *getRootNode() const { return DT->getRootNode(); } - + virtual bool runOnMachineFunction(MachineFunction &F); - - inline bool dominates(MachineDomTreeNode* A, MachineDomTreeNode* B) const { + + inline bool dominates(const MachineDomTreeNode* A, + const MachineDomTreeNode* B) const { return DT->dominates(A, B); } - - inline bool dominates(MachineBasicBlock* A, MachineBasicBlock* B) const { + + inline bool dominates(const MachineBasicBlock* A, + const MachineBasicBlock* B) const { return DT->dominates(A, B); } - + // dominates - Return true if A dominates B. This performs the // special checks necessary if A and B are in the same basic block. - bool dominates(MachineInstr *A, MachineInstr *B) const { - MachineBasicBlock *BBA = A->getParent(), *BBB = B->getParent(); + bool dominates(const MachineInstr *A, const MachineInstr *B) const { + const MachineBasicBlock *BBA = A->getParent(), *BBB = B->getParent(); if (BBA != BBB) return DT->dominates(BBA, BBB); // Loop through the basic block until we find A or B. - MachineBasicBlock::iterator I = BBA->begin(); + MachineBasicBlock::const_iterator I = BBA->begin(); for (; &*I != A && &*I != B; ++I) /*empty*/ ; @@ -95,43 +97,43 @@ public: // return &*I == B; //} } - + inline bool properlyDominates(const MachineDomTreeNode* A, - MachineDomTreeNode* B) const { + const MachineDomTreeNode* B) const { return DT->properlyDominates(A, B); } - - inline bool properlyDominates(MachineBasicBlock* A, - MachineBasicBlock* B) const { + + inline bool properlyDominates(const MachineBasicBlock* A, + const MachineBasicBlock* B) const { return DT->properlyDominates(A, B); } - + /// findNearestCommonDominator - Find nearest common dominator basic block /// for basic block A and B. If there is no such block then return NULL. inline MachineBasicBlock *findNearestCommonDominator(MachineBasicBlock *A, MachineBasicBlock *B) { return DT->findNearestCommonDominator(A, B); } - + inline MachineDomTreeNode *operator[](MachineBasicBlock *BB) const { return DT->getNode(BB); } - + /// getNode - return the (Post)DominatorTree node for the specified basic /// block. This is the same as using operator[] on this class. /// inline MachineDomTreeNode *getNode(MachineBasicBlock *BB) const { return DT->getNode(BB); } - + /// addNewBlock - Add a new node to the dominator tree information. 
This - /// creates a new node as a child of DomBB dominator node,linking it into + /// creates a new node as a child of DomBB dominator node, linking it into /// the children list of the immediate dominator. inline MachineDomTreeNode *addNewBlock(MachineBasicBlock *BB, MachineBasicBlock *DomBB) { return DT->addNewBlock(BB, DomBB); } - + /// changeImmediateDominator - This method is used to update the dominator /// tree information when a node's immediate dominator changes. /// @@ -139,19 +141,19 @@ public: MachineBasicBlock* NewIDom) { DT->changeImmediateDominator(N, NewIDom); } - + inline void changeImmediateDominator(MachineDomTreeNode *N, MachineDomTreeNode* NewIDom) { DT->changeImmediateDominator(N, NewIDom); } - + /// eraseNode - Removes a node from the dominator tree. Block must not /// dominate any other blocks. Removes node from its immediate dominator's /// children list. Deletes dominator node associated with basic block BB. inline void eraseNode(MachineBasicBlock *BB) { DT->eraseNode(BB); } - + /// splitBlock - BB is split and now it has one successor. Update dominator /// tree to reflect this change. inline void splitBlock(MachineBasicBlock* NewBB) { @@ -160,12 +162,12 @@ public: /// isReachableFromEntry - Return true if A is dominated by the entry /// block of the function containing it. - bool isReachableFromEntry(MachineBasicBlock *A) { + bool isReachableFromEntry(const MachineBasicBlock *A) { return DT->isReachableFromEntry(A); } virtual void releaseMemory(); - + virtual void print(raw_ostream &OS, const Module*) const; }; @@ -179,7 +181,7 @@ template<class T> struct GraphTraits; template <> struct GraphTraits<MachineDomTreeNode *> { typedef MachineDomTreeNode NodeType; typedef NodeType::iterator ChildIteratorType; - + static NodeType *getEntryNode(NodeType *N) { return N; } diff --git a/include/llvm/CodeGen/MachineFunction.h b/include/llvm/CodeGen/MachineFunction.h index cd704c1..82c4cd6 100644 --- a/include/llvm/CodeGen/MachineFunction.h +++ b/include/llvm/CodeGen/MachineFunction.h @@ -131,6 +131,9 @@ class MachineFunction { /// about the control flow of such functions. bool ExposesReturnsTwice; + /// True if the function includes MS-style inline assembly. + bool HasMSInlineAsm; + MachineFunction(const MachineFunction &) LLVM_DELETED_FUNCTION; void operator=(const MachineFunction&) LLVM_DELETED_FUNCTION; public: @@ -214,6 +217,17 @@ public: void setExposesReturnsTwice(bool B) { ExposesReturnsTwice = B; } + + /// Returns true if the function contains any MS-style inline assembly. + bool hasMSInlineAsm() const { + return HasMSInlineAsm; + } + + /// Set a flag that indicates that the function contains MS-style inline + /// assembly. + void setHasMSInlineAsm(bool B) { + HasMSInlineAsm = B; + } /// getInfo - Keep track of various per-function pieces of information for /// backends that would like to do so.
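To show how the new flag is meant to be driven, here is a hypothetical scan; it is not code from the patch, and real clients would typically set the flag during instruction selection:

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/IR/InlineAsm.h"
using namespace llvm;

// Hypothetical helper: record once whether MF contains MS-style (Intel
// dialect) inline asm so later passes can query hasMSInlineAsm() instead
// of rescanning every instruction.
static void markMSInlineAsm(MachineFunction &MF) {
  for (MachineFunction::iterator BB = MF.begin(), BE = MF.end(); BB != BE; ++BB)
    for (MachineBasicBlock::iterator MI = BB->begin(), ME = BB->end();
         MI != ME; ++MI)
      if (MI->isInlineAsm() && MI->getInlineAsmDialect() == InlineAsm::AD_Intel)
        MF.setHasMSInlineAsm(true);
}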
diff --git a/include/llvm/CodeGen/MachineFunctionAnalysis.h b/include/llvm/CodeGen/MachineFunctionAnalysis.h index 50ea206..112f07e 100644 --- a/include/llvm/CodeGen/MachineFunctionAnalysis.h +++ b/include/llvm/CodeGen/MachineFunctionAnalysis.h @@ -11,15 +11,15 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CODEGEN_MACHINE_FUNCTION_ANALYSIS_H -#define LLVM_CODEGEN_MACHINE_FUNCTION_ANALYSIS_H +#ifndef LLVM_CODEGEN_MACHINEFUNCTIONANALYSIS_H +#define LLVM_CODEGEN_MACHINEFUNCTIONANALYSIS_H #include "llvm/Pass.h" -#include "llvm/Target/TargetMachine.h" namespace llvm { class MachineFunction; +class TargetMachine; /// MachineFunctionAnalysis - This class is a Pass that manages a /// MachineFunction object. diff --git a/include/llvm/CodeGen/MachineFunctionPass.h b/include/llvm/CodeGen/MachineFunctionPass.h index b7bf0a3..04881e5 100644 --- a/include/llvm/CodeGen/MachineFunctionPass.h +++ b/include/llvm/CodeGen/MachineFunctionPass.h @@ -16,8 +16,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CODEGEN_MACHINE_FUNCTION_PASS_H -#define LLVM_CODEGEN_MACHINE_FUNCTION_PASS_H +#ifndef LLVM_CODEGEN_MACHINEFUNCTIONPASS_H +#define LLVM_CODEGEN_MACHINEFUNCTIONPASS_H #include "llvm/Pass.h" diff --git a/include/llvm/CodeGen/MachineInstr.h b/include/llvm/CodeGen/MachineInstr.h index 89521e8..195cce7 100644 --- a/include/llvm/CodeGen/MachineInstr.h +++ b/include/llvm/CodeGen/MachineInstr.h @@ -313,11 +313,11 @@ public: /// The second argument indicates whether the query should look inside /// instruction bundles. bool hasProperty(unsigned MCFlag, QueryType Type = AnyInBundle) const { - // Inline the fast path. - if (Type == IgnoreBundle || !isBundle()) + // Inline the fast path for unbundled or bundle-internal instructions. + if (Type == IgnoreBundle || !isBundled() || isBundledWithPred()) return getDesc().getFlags() & (1 << MCFlag); - // If we have a bundle, take the slow path. + // If this is the first instruction in a bundle, take the slow path. return hasPropertyInBundle(1 << MCFlag, Type); } @@ -642,6 +642,9 @@ public: bool isKill() const { return getOpcode() == TargetOpcode::KILL; } bool isImplicitDef() const { return getOpcode()==TargetOpcode::IMPLICIT_DEF; } bool isInlineAsm() const { return getOpcode() == TargetOpcode::INLINEASM; } + bool isMSInlineAsm() const { + return getOpcode() == TargetOpcode::INLINEASM && getInlineAsmDialect(); + } bool isStackAligningInlineAsm() const; InlineAsm::AsmDialect getInlineAsmDialect() const; bool isInsertSubreg() const { @@ -699,7 +702,11 @@ public: } } - /// getBundleSize - Return the number of instructions inside the MI bundle. + /// Return the number of instructions inside the MI bundle, excluding the + /// bundle header. + /// + /// This is the number of instructions that MachineBasicBlock::iterator + /// skips, 0 for unbundled instructions. 
unsigned getBundleSize() const; /// readsRegister - Return true if the MachineInstr reads the specified @@ -944,7 +951,8 @@ public: // // Debugging support // - void print(raw_ostream &OS, const TargetMachine *TM = 0) const; + void print(raw_ostream &OS, const TargetMachine *TM = 0, + bool SkipOpers = false) const; void dump() const; //===--------------------------------------------------------------------===// diff --git a/include/llvm/CodeGen/MachineInstrBuilder.h b/include/llvm/CodeGen/MachineInstrBuilder.h index a2d3d63..92c8da9 100644 --- a/include/llvm/CodeGen/MachineInstrBuilder.h +++ b/include/llvm/CodeGen/MachineInstrBuilder.h @@ -18,6 +18,7 @@ #define LLVM_CODEGEN_MACHINEINSTRBUILDER_H #include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBundle.h" #include "llvm/Support/ErrorHandling.h" namespace llvm { @@ -391,11 +392,7 @@ public: /// Create an MIBundleBuilder representing an existing instruction or bundle /// that has MI as its head. explicit MIBundleBuilder(MachineInstr *MI) - : MBB(*MI->getParent()), Begin(MI) { - MachineBasicBlock::iterator I = MI; - ++I; - End = I.getInstrIterator(); - } + : MBB(*MI->getParent()), Begin(MI), End(getBundleEnd(MI)) {} /// Return a reference to the basic block containing this bundle. MachineBasicBlock &getMBB() const { return MBB; } diff --git a/include/llvm/CodeGen/MachineInstrBundle.h b/include/llvm/CodeGen/MachineInstrBundle.h index 3c60ad1..9519edb 100644 --- a/include/llvm/CodeGen/MachineInstrBundle.h +++ b/include/llvm/CodeGen/MachineInstrBundle.h @@ -45,18 +45,36 @@ bool finalizeBundles(MachineFunction &MF); /// inline MachineInstr *getBundleStart(MachineInstr *MI) { MachineBasicBlock::instr_iterator I = MI; - while (I->isInsideBundle()) + while (I->isBundledWithPred()) --I; return I; } inline const MachineInstr *getBundleStart(const MachineInstr *MI) { MachineBasicBlock::const_instr_iterator I = MI; - while (I->isInsideBundle()) + while (I->isBundledWithPred()) --I; return I; } +/// Return an iterator pointing beyond the bundle containing MI. +inline MachineBasicBlock::instr_iterator +getBundleEnd(MachineInstr *MI) { + MachineBasicBlock::instr_iterator I = MI; + while (I->isBundledWithSucc()) + ++I; + return ++I; +} + +/// Return an iterator pointing beyond the bundle containing MI. 
+inline MachineBasicBlock::const_instr_iterator +getBundleEnd(const MachineInstr *MI) { + MachineBasicBlock::const_instr_iterator I = MI; + while (I->isBundledWithSucc()) + ++I; + return ++I; +} + //===----------------------------------------------------------------------===// // MachineOperand iterator // diff --git a/include/llvm/CodeGen/MachineLoopInfo.h b/include/llvm/CodeGen/MachineLoopInfo.h index 99da6b2..b058ecb 100644 --- a/include/llvm/CodeGen/MachineLoopInfo.h +++ b/include/llvm/CodeGen/MachineLoopInfo.h @@ -27,8 +27,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CODEGEN_MACHINE_LOOP_INFO_H -#define LLVM_CODEGEN_MACHINE_LOOP_INFO_H +#ifndef LLVM_CODEGEN_MACHINELOOPINFO_H +#define LLVM_CODEGEN_MACHINELOOPINFO_H #include "llvm/Analysis/LoopInfo.h" #include "llvm/CodeGen/MachineFunctionPass.h" diff --git a/include/llvm/CodeGen/MachineModuleInfo.h b/include/llvm/CodeGen/MachineModuleInfo.h index 6194d05..1db6252 100644 --- a/include/llvm/CodeGen/MachineModuleInfo.h +++ b/include/llvm/CodeGen/MachineModuleInfo.h @@ -35,7 +35,6 @@ #include "llvm/ADT/PointerIntPair.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/IR/GlobalValue.h" #include "llvm/IR/Metadata.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MachineLocation.h" diff --git a/include/llvm/CodeGen/MachineRegisterInfo.h b/include/llvm/CodeGen/MachineRegisterInfo.h index 36427e9..875285b 100644 --- a/include/llvm/CodeGen/MachineRegisterInfo.h +++ b/include/llvm/CodeGen/MachineRegisterInfo.h @@ -99,13 +99,11 @@ class MachineRegisterInfo { /// started. BitVector ReservedRegs; - /// LiveIns/LiveOuts - Keep track of the physical registers that are - /// livein/liveout of the function. Live in values are typically arguments in - /// registers, live out values are typically return values in registers. - /// LiveIn values are allowed to have virtual registers associated with them, - /// stored in the second element. + /// Keep track of the physical registers that are live in to the function. + /// Live in values are typically arguments in registers, live out values are + /// typically return values in registers. LiveIn values are allowed to have + /// virtual registers associated with them, stored in the second element. std::vector<std::pair<unsigned, unsigned> > LiveIns; - std::vector<unsigned> LiveOuts; MachineRegisterInfo(const MachineRegisterInfo&) LLVM_DELETED_FUNCTION; void operator=(const MachineRegisterInfo&) LLVM_DELETED_FUNCTION; @@ -379,6 +377,12 @@ public: return false; } + /// Mark the specified register unit as used in this function. + /// This should only be called during and after register allocation. + void setRegUnitUsed(unsigned RegUnit) { + UsedRegUnits.set(RegUnit); + } + /// setPhysRegUsed - Mark the specified register used in this function. /// This should only be called during and after register allocation. void setPhysRegUsed(unsigned Reg) { @@ -468,7 +472,6 @@ public: void addLiveIn(unsigned Reg, unsigned vreg = 0) { LiveIns.push_back(std::make_pair(Reg, vreg)); } - void addLiveOut(unsigned Reg) { LiveOuts.push_back(Reg); } // Iteration support for live in/out sets. These sets are kept in sorted // order by their register number. 
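For illustration, walking the remaining live-in set uses the iterators shown in the next hunk; the helper below is a hypothetical sketch, not part of the patch:

#include "llvm/CodeGen/MachineRegisterInfo.h"
using namespace llvm;

// Hypothetical helper: count live-in entries that carry an associated
// virtual register. Each entry pairs a physical register with a vreg;
// a vreg of 0 means none was assigned.
static unsigned countLiveInVRegs(const MachineRegisterInfo &MRI) {
  unsigned N = 0;
  for (MachineRegisterInfo::livein_iterator I = MRI.livein_begin(),
         E = MRI.livein_end(); I != E; ++I)
    if (I->second != 0)
      ++N;
  return N;
}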
@@ -478,12 +481,8 @@ public: livein_iterator livein_begin() const { return LiveIns.begin(); } livein_iterator livein_end() const { return LiveIns.end(); } bool livein_empty() const { return LiveIns.empty(); } - liveout_iterator liveout_begin() const { return LiveOuts.begin(); } - liveout_iterator liveout_end() const { return LiveOuts.end(); } - bool liveout_empty() const { return LiveOuts.empty(); } bool isLiveIn(unsigned Reg) const; - bool isLiveOut(unsigned Reg) const; /// getLiveInPhysReg - If VReg is a live-in virtual register, return the /// corresponding live-in physical register. diff --git a/include/llvm/CodeGen/MachineScheduler.h b/include/llvm/CodeGen/MachineScheduler.h index 88f347e..57febe7 100644 --- a/include/llvm/CodeGen/MachineScheduler.h +++ b/include/llvm/CodeGen/MachineScheduler.h @@ -24,8 +24,8 @@ // //===----------------------------------------------------------------------===// -#ifndef MACHINESCHEDULER_H -#define MACHINESCHEDULER_H +#ifndef LLVM_CODEGEN_MACHINESCHEDULER_H +#define LLVM_CODEGEN_MACHINESCHEDULER_H #include "llvm/CodeGen/MachinePassRegistry.h" #include "llvm/CodeGen/RegisterPressure.h" @@ -43,6 +43,7 @@ class MachineDominatorTree; class MachineLoopInfo; class RegisterClassInfo; class ScheduleDAGInstrs; +class SchedDFSResult; /// MachineSchedContext provides enough context from the MachineScheduler pass /// for the target to instantiate a scheduler. @@ -119,6 +120,9 @@ public: /// be scheduled at the bottom. virtual SUnit *pickNode(bool &IsTopNode) = 0; + /// \brief Scheduler callback to notify that a new subtree is scheduled. + virtual void scheduleTree(unsigned SubtreeID) {} + /// Notify MachineSchedStrategy that ScheduleDAGMI has scheduled an /// instruction and updated scheduled/remaining flags in the DAG nodes. virtual void schedNode(SUnit *SU, bool IsTopNode) = 0; @@ -164,6 +168,8 @@ public: iterator end() { return Queue.end(); } + ArrayRef<SUnit*> elements() { return Queue; } + iterator find(SUnit *SU) { return std::find(Queue.begin(), Queue.end(), SU); } @@ -202,6 +208,11 @@ protected: RegisterClassInfo *RegClassInfo; MachineSchedStrategy *SchedImpl; + /// Information about DAG subtrees. If DFSResult is NULL, then ScheduledTrees + /// will be empty. + SchedDFSResult *DFSResult; + BitVector ScheduledTrees; + /// Topo - A topological ordering for SUnits which permits fast IsReachable /// and similar queries. ScheduleDAGTopologicalSort Topo; @@ -243,7 +254,7 @@ protected: public: ScheduleDAGMI(MachineSchedContext *C, MachineSchedStrategy *S): ScheduleDAGInstrs(*C->MF, *C->MLI, *C->MDT, /*IsPostRA=*/false, C->LIS), - AA(C->AA), RegClassInfo(C->RegClassInfo), SchedImpl(S), + AA(C->AA), RegClassInfo(C->RegClassInfo), SchedImpl(S), DFSResult(0), Topo(SUnits, &ExitSU), RPTracker(RegPressure), CurrentTop(), TopRPTracker(TopPressure), CurrentBottom(), BotRPTracker(BotPressure), NextClusterPred(NULL), NextClusterSucc(NULL) { @@ -252,10 +263,7 @@ public: #endif } - virtual ~ScheduleDAGMI() { - DeleteContainerPointers(Mutations); - delete SchedImpl; - } + virtual ~ScheduleDAGMI(); /// Add a postprocessing step to the DAG builder. /// Mutations are applied in the order that they are added after normal DAG @@ -308,6 +316,18 @@ public: const SUnit *getNextClusterSucc() const { return NextClusterSucc; } + /// Compute a DFSResult after DAG building is complete, and before any + /// queue comparisons. + void computeDFSResult(); +
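A hypothetical helper shows the intended flow, assuming (as the MachineScheduler implementation of that era does) that computeDFSResult() allocates the result when none exists:

#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/ScheduleDFS.h"
using namespace llvm;

// Hypothetical helper: ensure DFS data exists, then report a node's ILP.
static ILPValue getNodeILP(ScheduleDAGMI *DAG, const SUnit *SU) {
  if (!DAG->getDFSResult())
    DAG->computeDFSResult();
  return DAG->getDFSResult()->getILP(SU);
}

+ /// Return a non-null DFS result if the scheduling strategy initialized it.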
+ const SchedDFSResult *getDFSResult() const { return DFSResult; } + + BitVector &getScheduledTrees() { return ScheduledTrees; } + + void viewGraph(const Twine &Name, const Twine &Title) LLVM_OVERRIDE; + void viewGraph() LLVM_OVERRIDE; + protected: // Top-Level entry points for the schedule() driver... @@ -321,8 +341,8 @@ protected: /// instances of ScheduleDAGMI to perform custom DAG postprocessing. void postprocessDAG(); - /// Identify DAG roots and setup scheduler queues. - void initQueues(); + /// Release ExitSU predecessors and setup scheduler queues. + void initQueues(ArrayRef<SUnit*> TopRoots, ArrayRef<SUnit*> BotRoots); /// Move an instruction and update register pressure. void scheduleMI(SUnit *SU, bool IsTopNode); @@ -340,12 +360,13 @@ protected: void initRegPressure(); - void updateScheduledPressure(std::vector<unsigned> NewMaxPressure); + void updateScheduledPressure(const std::vector<unsigned> &NewMaxPressure); void moveInstruction(MachineInstr *MI, MachineBasicBlock::iterator InsertPos); bool checkSchedLimit(); - void releaseRoots(); + void findRootsAndBiasEdges(SmallVectorImpl<SUnit*> &TopRoots, + SmallVectorImpl<SUnit*> &BotRoots); void releaseSucc(SUnit *SU, SDep *SuccEdge); void releaseSuccessors(SUnit *SU); diff --git a/lib/CodeGen/MachineTraceMetrics.h b/include/llvm/CodeGen/MachineTraceMetrics.h index 460730b..460730b 100644 --- a/lib/CodeGen/MachineTraceMetrics.h +++ b/include/llvm/CodeGen/MachineTraceMetrics.h diff --git a/include/llvm/CodeGen/PBQP/Math.h b/include/llvm/CodeGen/PBQP/Math.h index 4e51913..08f8b98 100644 --- a/include/llvm/CodeGen/PBQP/Math.h +++ b/include/llvm/CodeGen/PBQP/Math.h @@ -7,7 +7,7 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CODEGEN_PBQP_MATH_H +#ifndef LLVM_CODEGEN_PBQP_MATH_H #define LLVM_CODEGEN_PBQP_MATH_H #include <algorithm> diff --git a/include/llvm/CodeGen/Passes.h b/include/llvm/CodeGen/Passes.h index 0d02700..5f710e6 100644 --- a/include/llvm/CodeGen/Passes.h +++ b/include/llvm/CodeGen/Passes.h @@ -25,6 +25,7 @@ namespace llvm { class MachineFunctionPass; class PassInfo; class PassManagerBase; + class TargetLoweringBase; class TargetLowering; class TargetRegisterClass; class raw_ostream; @@ -180,6 +181,16 @@ protected: /// instructions in SSA form. virtual void addMachineSSAOptimization(); + /// Add passes that optimize instruction level parallelism for out-of-order + /// targets. These passes are run while the machine code is still in SSA + /// form, so they can use MachineTraceMetrics to control their heuristics. + /// + /// All passes added here should preserve the MachineDominatorTree, + /// MachineLoopInfo, and MachineTraceMetrics analyses. + virtual bool addILPOpts() { + return false; + } + /// addPreRegAlloc - This method may be implemented by targets that want to /// run passes immediately before register allocation. This should return /// true if -print-machineinstrs should print after these passes. @@ -284,7 +295,8 @@ namespace llvm { /// /// This pass implements the target transform info analysis using the target /// independent information available to the LLVM code generator. 
- ImmutablePass *createBasicTargetTransformInfoPass(const TargetLowering *TLI); + ImmutablePass * + createBasicTargetTransformInfoPass(const TargetLoweringBase *TLI); /// createUnreachableBlockEliminationPass - The LLVM code generator does not /// work well with unreachable basic blocks (what live ranges make sense for a @@ -481,7 +493,7 @@ namespace llvm { /// createStackProtectorPass - This pass adds stack protectors to functions. /// - FunctionPass *createStackProtectorPass(const TargetLowering *tli); + FunctionPass *createStackProtectorPass(const TargetLoweringBase *tli); /// createMachineVerifierPass - This pass verifies generated machine code /// instructions for correctness. @@ -495,7 +507,7 @@ namespace llvm { /// createSjLjEHPreparePass - This pass adapts exception handling code to use /// the GCC-style builtin setjmp/longjmp (sjlj) to handle EH control flow. /// - FunctionPass *createSjLjEHPreparePass(const TargetLowering *tli); + FunctionPass *createSjLjEHPreparePass(const TargetLoweringBase *tli); /// LocalStackSlotAllocation - This pass assigns local frame indices to stack /// slots relative to one another and allocates base registers to access them diff --git a/include/llvm/CodeGen/RegAllocRegistry.h b/include/llvm/CodeGen/RegAllocRegistry.h index 100e357..ca49577 100644 --- a/include/llvm/CodeGen/RegAllocRegistry.h +++ b/include/llvm/CodeGen/RegAllocRegistry.h @@ -12,8 +12,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CODEGENREGALLOCREGISTRY_H -#define LLVM_CODEGENREGALLOCREGISTRY_H +#ifndef LLVM_CODEGEN_REGALLOCREGISTRY_H +#define LLVM_CODEGEN_REGALLOCREGISTRY_H #include "llvm/CodeGen/MachinePassRegistry.h" diff --git a/include/llvm/CodeGen/RegisterClassInfo.h b/include/llvm/CodeGen/RegisterClassInfo.h index 12bd1c6..3ad22e6 100644 --- a/include/llvm/CodeGen/RegisterClassInfo.h +++ b/include/llvm/CodeGen/RegisterClassInfo.h @@ -29,9 +29,14 @@ class RegisterClassInfo { unsigned Tag; unsigned NumRegs; bool ProperSubClass; + uint8_t MinCost; + uint16_t LastCostChange; OwningArrayPtr<MCPhysReg> Order; - RCInfo() : Tag(0), NumRegs(0), ProperSubClass(false) {} + RCInfo() + : Tag(0), NumRegs(0), ProperSubClass(false), MinCost(0), + LastCostChange(0) {} + operator ArrayRef<MCPhysReg>() const { return makeArrayRef(Order.get(), NumRegs); } @@ -106,6 +111,21 @@ public: return CalleeSaved[N-1]; return 0; } + + /// Get the minimum register cost in RC's allocation order. + /// This is the smallest value returned by TRI->getCostPerUse(Reg) for all + /// the registers in getOrder(RC). + unsigned getMinCost(const TargetRegisterClass *RC) { + return get(RC).MinCost; + } +
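A sketch of how an allocator might consume the new cost data; the helper is hypothetical, while getOrder() and TRI->getCostPerUse() are pre-existing APIs:

#include "llvm/CodeGen/RegisterClassInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"
using namespace llvm;

// Hypothetical helper: return the first register in RC's allocation order
// whose per-use cost matches the class minimum, or 0 if the order is empty.
static unsigned firstMinCostReg(RegisterClassInfo &RCI,
                                const TargetRegisterInfo *TRI,
                                const TargetRegisterClass *RC) {
  ArrayRef<MCPhysReg> Order = RCI.getOrder(RC);
  unsigned MinCost = RCI.getMinCost(RC);
  for (unsigned i = 0, e = Order.size(); i != e; ++i)
    if (TRI->getCostPerUse(Order[i]) == MinCost)
      return Order[i];
  return 0;
}

+ /// Get the position of the last cost change in getOrder(RC). + /// + /// All registers in getOrder(RC).slice(getLastCostChange(RC)) will have the + /// same cost according to TRI->getCostPerUse().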
+ unsigned getLastCostChange(const TargetRegisterClass *RC) { + return get(RC).LastCostChange; + } }; } // end namespace llvm diff --git a/include/llvm/CodeGen/RegisterScavenging.h b/include/llvm/CodeGen/RegisterScavenging.h index 6604d4c..0119920 100644 --- a/include/llvm/CodeGen/RegisterScavenging.h +++ b/include/llvm/CodeGen/RegisterScavenging.h @@ -14,8 +14,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CODEGEN_REGISTER_SCAVENGING_H -#define LLVM_CODEGEN_REGISTER_SCAVENGING_H +#ifndef LLVM_CODEGEN_REGISTERSCAVENGING_H +#define LLVM_CODEGEN_REGISTERSCAVENGING_H #include "llvm/ADT/BitVector.h" #include "llvm/CodeGen/MachineBasicBlock.h" diff --git a/include/llvm/CodeGen/ResourcePriorityQueue.h b/include/llvm/CodeGen/ResourcePriorityQueue.h index 66a6039..f20a9fc 100644 --- a/include/llvm/CodeGen/ResourcePriorityQueue.h +++ b/include/llvm/CodeGen/ResourcePriorityQueue.h @@ -14,8 +14,8 @@ // //===----------------------------------------------------------------------===// -#ifndef RESOURCE_PRIORITY_QUEUE_H -#define RESOURCE_PRIORITY_QUEUE_H +#ifndef LLVM_CODEGEN_RESOURCEPRIORITYQUEUE_H +#define LLVM_CODEGEN_RESOURCEPRIORITYQUEUE_H #include "llvm/CodeGen/DFAPacketizer.h" #include "llvm/CodeGen/ScheduleDAG.h" diff --git a/include/llvm/CodeGen/RuntimeLibcalls.h b/include/llvm/CodeGen/RuntimeLibcalls.h index 1a9dc1c..41289a4 100644 --- a/include/llvm/CodeGen/RuntimeLibcalls.h +++ b/include/llvm/CodeGen/RuntimeLibcalls.h @@ -158,6 +158,11 @@ namespace RTLIB { COS_F80, COS_F128, COS_PPCF128, + SINCOS_F32, + SINCOS_F64, + SINCOS_F80, + SINCOS_F128, + SINCOS_PPCF128, POW_F32, POW_F64, POW_F80, diff --git a/include/llvm/CodeGen/ScheduleDAG.h b/include/llvm/CodeGen/ScheduleDAG.h index aa91b03..d80bdf8 100644 --- a/include/llvm/CodeGen/ScheduleDAG.h +++ b/include/llvm/CodeGen/ScheduleDAG.h @@ -17,11 +17,10 @@ #define LLVM_CODEGEN_SCHEDULEDAG_H #include "llvm/ADT/BitVector.h" -#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/GraphTraits.h" #include "llvm/ADT/PointerIntPair.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/Target/TargetLowering.h" namespace llvm { @@ -258,6 +257,8 @@ namespace llvm { /// SUnit - Scheduling unit. This is a node in the scheduling DAG. class SUnit { private: + enum { BoundaryID = ~0u }; + SDNode *Node; // Representative node. MachineInstr *Instr; // Alternatively, a MachineInstr. public: @@ -343,7 +344,7 @@ namespace llvm { /// SUnit - Construct a placeholder SUnit. SUnit() - : Node(0), Instr(0), OrigNode(0), SchedClass(0), NodeNum(~0u), + : Node(0), Instr(0), OrigNode(0), SchedClass(0), NodeNum(BoundaryID), NodeQueueId(0), NumPreds(0), NumSuccs(0), NumPredsLeft(0), NumSuccsLeft(0), WeakPredsLeft(0), WeakSuccsLeft(0), NumRegDefsLeft(0), Latency(0), isVRegCycle(false), isCall(false), isCallOp(false), @@ -354,6 +355,15 @@ namespace llvm { isDepthCurrent(false), isHeightCurrent(false), Depth(0), Height(0), TopReadyCycle(0), BotReadyCycle(0), CopyDstRC(NULL), CopySrcRC(NULL) {} + /// \brief Boundary nodes are placeholders for the boundary of the + /// scheduling region. + /// + /// BoundaryNodes can have DAG edges, including Data edges, but they do not + /// correspond to schedulable entities (e.g. instructions) and do not have a + /// valid ID. Consequently, always check for boundary nodes before accessing + /// an associative data structure keyed on node ID.
+ bool isBoundaryNode() const { return NodeNum == BoundaryID; }; + /// setNode - Assign the representative SDNode for this SUnit. /// This may be used during pre-regalloc scheduling. void setNode(SDNode *N) { @@ -455,6 +465,10 @@ namespace llvm { return NumSuccsLeft == 0; } + /// \brief Order this node's predecessor edges such that the critical path + /// edge occurs first. + void biasCriticalPath(); + void dump(const ScheduleDAG *G) const; void dumpAll(const ScheduleDAG *G) const; void print(raw_ostream &O, const ScheduleDAG *G) const; @@ -563,8 +577,8 @@ namespace llvm { /// viewGraph - Pop up a GraphViz/gv window with the ScheduleDAG rendered /// using 'dot'. /// - void viewGraph(const Twine &Name, const Twine &Title); - void viewGraph(); + virtual void viewGraph(const Twine &Name, const Twine &Title); + virtual void viewGraph(); virtual void dumpNode(const SUnit *SU) const = 0; diff --git a/include/llvm/CodeGen/ScheduleDAGInstrs.h b/include/llvm/CodeGen/ScheduleDAGInstrs.h index ffc442e..94abec2 100644 --- a/include/llvm/CodeGen/ScheduleDAGInstrs.h +++ b/include/llvm/CodeGen/ScheduleDAGInstrs.h @@ -12,11 +12,12 @@ // //===----------------------------------------------------------------------===// -#ifndef SCHEDULEDAGINSTRS_H -#define SCHEDULEDAGINSTRS_H +#ifndef LLVM_CODEGEN_SCHEDULEDAGINSTRS_H +#define LLVM_CODEGEN_SCHEDULEDAGINSTRS_H #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SparseSet.h" +#include "llvm/ADT/SparseMultiSet.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/ScheduleDAG.h" @@ -48,56 +49,18 @@ namespace llvm { struct PhysRegSUOper { SUnit *SU; int OpIdx; + unsigned Reg; - PhysRegSUOper(SUnit *su, int op): SU(su), OpIdx(op) {} - }; - - /// Combine a SparseSet with a 1x1 vector to track physical registers. - /// The SparseSet allows iterating over the (few) live registers for quickly - /// comparing against a regmask or clearing the set. - /// - /// Storage for the map is allocated once for the pass. The map can be - /// cleared between scheduling regions without freeing unused entries. - class Reg2SUnitsMap { - SparseSet<unsigned> PhysRegSet; - std::vector<std::vector<PhysRegSUOper> > SUnits; - public: - typedef SparseSet<unsigned>::const_iterator const_iterator; - - // Allow iteration over register numbers (keys) in the map. If needed, we - // can provide an iterator over SUnits (values) as well. - const_iterator reg_begin() const { return PhysRegSet.begin(); } - const_iterator reg_end() const { return PhysRegSet.end(); } - - /// Initialize the map with the number of registers. - /// If the map is already large enough, no allocation occurs. - /// For simplicity we expect the map to be empty(). - void setRegLimit(unsigned Limit); - - /// Returns true if the map is empty. - bool empty() const { return PhysRegSet.empty(); } + PhysRegSUOper(SUnit *su, int op, unsigned R): SU(su), OpIdx(op), Reg(R) {} - /// Clear the map without deallocating storage. - void clear(); - - bool contains(unsigned Reg) const { return PhysRegSet.count(Reg); } - - /// If this register is mapped, return its existing SUnits vector. - /// Otherwise map the register and return an empty SUnits vector. - std::vector<PhysRegSUOper> &operator[](unsigned Reg) { - bool New = PhysRegSet.insert(Reg).second; - assert((!New || SUnits[Reg].empty()) && "stale SUnits vector"); - (void)New; - return SUnits[Reg]; - } - - /// Erase an existing element without freeing memory. 
- void erase(unsigned Reg) { - PhysRegSet.erase(Reg); - SUnits[Reg].clear(); - } + unsigned getSparseSetIndex() const { return Reg; } }; + /// Use a SparseMultiSet to track physical registers. Storage is only + /// allocated once for the pass. It can be cleared in constant time and reused + /// without any frees. + typedef SparseMultiSet<PhysRegSUOper, llvm::identity<unsigned>, uint16_t> Reg2SUnitsMap; + /// Use SparseSet as a SparseMap by relying on the fact that it never /// compares ValueT's, only unsigned keys. This allows the set to be cleared /// between scheduling regions in constant time as long as ValueT does not diff --git a/include/llvm/CodeGen/ScheduleDFS.h b/include/llvm/CodeGen/ScheduleDFS.h index fbbadd9..73ce99f 100644 --- a/include/llvm/CodeGen/ScheduleDFS.h +++ b/include/llvm/CodeGen/ScheduleDFS.h @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CODEGEN_SCHEDULEDAGILP_H -#define LLVM_CODEGEN_SCHEDULEDAGILP_H +#ifndef LLVM_CODEGEN_SCHEDULEDFS_H +#define LLVM_CODEGEN_SCHEDULEDFS_H #include "llvm/CodeGen/ScheduleDAG.h" #include "llvm/Support/DataTypes.h" @@ -27,6 +27,9 @@ class SUnit; /// \brief Represent the ILP of the subDAG rooted at a DAG node. /// +/// ILPValues summarize the DAG subtree rooted at each node. ILPValues are +/// valid for all nodes regardless of their subtree membership. +/// /// When computed using bottom-up DFS, this metric assumes that the DAG is a /// forest of trees with roots at the bottom of the schedule branching upward. struct ILPValue { @@ -62,19 +65,30 @@ struct ILPValue { }; /// \brief Compute the values of each DAG node for various metrics during DFS. -/// -/// ILPValues summarize the DAG subtree rooted at each node up to -/// SubtreeLimit. ILPValues are also valid for interior nodes of a subtree, not -/// just the root. class SchedDFSResult { friend class SchedDFSImpl; + static const unsigned InvalidSubtreeID = ~0u; + /// \brief Per-SUnit data computed during DFS for various metrics. + /// + /// A node's SubtreeID is set to itself when it is visited to indicate that it + /// is the root of a subtree. Later it is set to its parent to indicate an + /// interior node. Finally, it is set to a representative subtree ID during + /// finalization. struct NodeData { unsigned InstrCount; unsigned SubtreeID; - NodeData(): InstrCount(0), SubtreeID(0) {} + NodeData(): InstrCount(0), SubtreeID(InvalidSubtreeID) {} + }; + + /// \brief Per-Subtree data computed during DFS. + struct TreeData { + unsigned ParentTreeID; + unsigned SubInstrCount; + + TreeData(): ParentTreeID(InvalidSubtreeID), SubInstrCount(0) {} }; /// \brief Record a connection between subtrees and the connection level. @@ -88,7 +102,10 @@ class SchedDFSResult { bool IsBottomUp; unsigned SubtreeLimit; /// DFS results for each SUnit in this DAG. - std::vector<NodeData> DFSData; + std::vector<NodeData> DFSNodeData; + + // Store per-tree data indexed on tree ID. + SmallVector<TreeData, 16> DFSTreeData; // For each subtree discovered during DFS, record its connections to other // subtrees. @@ -102,41 +119,68 @@ public: SchedDFSResult(bool IsBU, unsigned lim) : IsBottomUp(IsBU), SubtreeLimit(lim) {} + /// \brief Get the node cutoff before subtrees are considered significant. + unsigned getSubtreeLimit() const { return SubtreeLimit; } +
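The intended lifecycle, sketched as a hypothetical driver (clear(), resize() and compute() are declared just below; the wrapper itself is an assumption):

#include "llvm/ADT/ArrayRef.h"
#include "llvm/CodeGen/ScheduleDFS.h"
using namespace llvm;

// Hypothetical helper: size the per-node arrays for the current DAG,
// then populate them in one pass.
static void recomputeDFS(SchedDFSResult &DFS, ArrayRef<SUnit> SUnits) {
  DFS.clear();
  DFS.resize(SUnits.size());
  DFS.compute(SUnits);
}

+ /// \brief Return true if this DFSResult is uninitialized. + /// + /// resize() initializes DFSResult, while compute() populates it.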
+ bool empty() const { return DFSNodeData.empty(); } + /// \brief Clear the results. void clear() { - DFSData.clear(); + DFSNodeData.clear(); + DFSTreeData.clear(); SubtreeConnections.clear(); SubtreeConnectLevels.clear(); } /// \brief Initialize the result data with the size of the DAG. void resize(unsigned NumSUnits) { - DFSData.resize(NumSUnits); + DFSNodeData.resize(NumSUnits); } /// \brief Compute various metrics for the DAG with given roots. - void compute(ArrayRef<SUnit *> Roots); + void compute(ArrayRef<SUnit> SUnits); + + /// \brief Get the number of instructions in the given subtree and its + /// children. + unsigned getNumInstrs(const SUnit *SU) const { + return DFSNodeData[SU->NodeNum].InstrCount; + } + + /// \brief Get the number of instructions in the given subtree not including + /// children. + unsigned getNumSubInstrs(unsigned SubtreeID) const { + return DFSTreeData[SubtreeID].SubInstrCount; + } /// \brief Get the ILP value for a DAG node. /// /// A leaf node has an ILP of 1/1. - ILPValue getILP(const SUnit *SU) { - return ILPValue(DFSData[SU->NodeNum].InstrCount, 1 + SU->getDepth()); + ILPValue getILP(const SUnit *SU) const { + return ILPValue(DFSNodeData[SU->NodeNum].InstrCount, 1 + SU->getDepth()); } /// \brief The number of subtrees detected in this DAG. unsigned getNumSubtrees() const { return SubtreeConnectLevels.size(); } /// \brief Get the ID of the subtree the given DAG node belongs to. - unsigned getSubtreeID(const SUnit *SU) { - return DFSData[SU->NodeNum].SubtreeID; + /// + /// For convenience, if DFSResults have not been computed yet, give everything + /// tree ID 0. + unsigned getSubtreeID(const SUnit *SU) const { + if (empty()) + return 0; + assert(SU->NodeNum < DFSNodeData.size() && "New Node"); + return DFSNodeData[SU->NodeNum].SubtreeID; } /// \brief Get the connection level of a subtree. /// /// For bottom-up trees, the connection level is the latency depth (in cycles) /// of the deepest connection to another subtree. - unsigned getSubtreeLevel(unsigned SubtreeID) { + unsigned getSubtreeLevel(unsigned SubtreeID) const { return SubtreeConnectLevels[SubtreeID]; } diff --git a/include/llvm/CodeGen/SchedulerRegistry.h b/include/llvm/CodeGen/SchedulerRegistry.h index 836b73a..51ac7f2 100644 --- a/include/llvm/CodeGen/SchedulerRegistry.h +++ b/include/llvm/CodeGen/SchedulerRegistry.h @@ -12,8 +12,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CODEGENSCHEDULERREGISTRY_H -#define LLVM_CODEGENSCHEDULERREGISTRY_H +#ifndef LLVM_CODEGEN_SCHEDULERREGISTRY_H +#define LLVM_CODEGEN_SCHEDULERREGISTRY_H #include "llvm/CodeGen/MachinePassRegistry.h" #include "llvm/Target/TargetMachine.h" diff --git a/include/llvm/CodeGen/SelectionDAG.h b/include/llvm/CodeGen/SelectionDAG.h index c5cc8f5..c25497a 100644 --- a/include/llvm/CodeGen/SelectionDAG.h +++ b/include/llvm/CodeGen/SelectionDAG.h @@ -935,6 +935,20 @@ public: } } + /// Returns an APFloat semantics tag appropriate for the given type. If VT is + /// a vector type, the element semantics are returned. 
+ static const fltSemantics &EVTToAPFloatSemantics(EVT VT) { + switch (VT.getScalarType().getSimpleVT().SimpleTy) { + default: llvm_unreachable("Unknown FP format"); + case MVT::f16: return APFloat::IEEEhalf; + case MVT::f32: return APFloat::IEEEsingle; + case MVT::f64: return APFloat::IEEEdouble; + case MVT::f80: return APFloat::x87DoubleExtended; + case MVT::f128: return APFloat::IEEEquad; + case MVT::ppcf128: return APFloat::PPCDoubleDouble; + } + } + /// AssignOrdering - Assign an order to the SDNode. void AssignOrdering(const SDNode *SD, unsigned Order); @@ -978,10 +992,8 @@ public: SDValue CreateStackTemporary(EVT VT1, EVT VT2); /// FoldConstantArithmetic - - SDValue FoldConstantArithmetic(unsigned Opcode, - EVT VT, - ConstantSDNode *Cst1, - ConstantSDNode *Cst2); + SDValue FoldConstantArithmetic(unsigned Opcode, EVT VT, + SDNode *Cst1, SDNode *Cst2); /// FoldSetCC - Constant fold a setcc to true or false. SDValue FoldSetCC(EVT VT, SDValue N1, diff --git a/include/llvm/CodeGen/SelectionDAGISel.h b/include/llvm/CodeGen/SelectionDAGISel.h index 16aacc0..af7993f 100644 --- a/include/llvm/CodeGen/SelectionDAGISel.h +++ b/include/llvm/CodeGen/SelectionDAGISel.h @@ -12,8 +12,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CODEGEN_SELECTIONDAG_ISEL_H -#define LLVM_CODEGEN_SELECTIONDAG_ISEL_H +#ifndef LLVM_CODEGEN_SELECTIONDAGISEL_H +#define LLVM_CODEGEN_SELECTIONDAGISEL_H #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/SelectionDAG.h" @@ -281,4 +281,4 @@ private: } -#endif /* LLVM_CODEGEN_SELECTIONDAG_ISEL_H */ +#endif /* LLVM_CODEGEN_SELECTIONDAGISEL_H */ diff --git a/include/llvm/CodeGen/SlotIndexes.h b/include/llvm/CodeGen/SlotIndexes.h index b46d153..78fb233 100644 --- a/include/llvm/CodeGen/SlotIndexes.h +++ b/include/llvm/CodeGen/SlotIndexes.h @@ -112,7 +112,7 @@ namespace llvm { return lie.getPointer(); } - int getIndex() const { + unsigned getIndex() const { return listEntry()->getIndex() | getSlot(); } @@ -360,6 +360,11 @@ namespace llvm { /// Renumber the index list, providing space for new instructions. void renumberIndexes(); + /// Repair indexes after adding and removing instructions. + void repairIndexesInRange(MachineBasicBlock *MBB, + MachineBasicBlock::iterator Begin, + MachineBasicBlock::iterator End); + /// Returns the zero index for this analysis. 
SlotIndex getZeroIndex() { assert(indexList.front().getIndex() == 0 && "First index is not 0?"); @@ -602,29 +607,35 @@ namespace llvm { void insertMBBInMaps(MachineBasicBlock *mbb) { MachineFunction::iterator nextMBB = llvm::next(MachineFunction::iterator(mbb)); - IndexListEntry *startEntry = createEntry(0, 0); - IndexListEntry *stopEntry = createEntry(0, 0); - IndexListEntry *nextEntry = 0; + IndexListEntry *startEntry = 0; + IndexListEntry *endEntry = 0; + IndexList::iterator newItr; if (nextMBB == mbb->getParent()->end()) { - nextEntry = indexList.end(); + startEntry = &indexList.back(); + endEntry = createEntry(0, 0); + newItr = indexList.insertAfter(startEntry, endEntry); } else { - nextEntry = getMBBStartIdx(nextMBB).listEntry(); + startEntry = createEntry(0, 0); + endEntry = getMBBStartIdx(nextMBB).listEntry(); + newItr = indexList.insert(endEntry, startEntry); } - indexList.insert(nextEntry, startEntry); - indexList.insert(nextEntry, stopEntry); - SlotIndex startIdx(startEntry, SlotIndex::Slot_Block); - SlotIndex endIdx(nextEntry, SlotIndex::Slot_Block); + SlotIndex endIdx(endEntry, SlotIndex::Slot_Block); + + MachineFunction::iterator prevMBB(mbb); + assert(prevMBB != mbb->getParent()->end() && + "Can't insert a new block at the beginning of a function."); + --prevMBB; + MBBRanges[prevMBB->getNumber()].second = startIdx; assert(unsigned(mbb->getNumber()) == MBBRanges.size() && "Blocks must be added in order"); MBBRanges.push_back(std::make_pair(startIdx, endIdx)); - idx2MBBMap.push_back(IdxMBBPair(startIdx, mbb)); - renumberIndexes(); + renumberIndexes(newItr); std::sort(idx2MBBMap.begin(), idx2MBBMap.end(), Idx2MBBCompare()); } diff --git a/include/llvm/CodeGen/TargetSchedule.h b/include/llvm/CodeGen/TargetSchedule.h index 4c4a2a8..3e22252 100644 --- a/include/llvm/CodeGen/TargetSchedule.h +++ b/include/llvm/CodeGen/TargetSchedule.h @@ -13,8 +13,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_TARGET_TARGETSCHEDMODEL_H -#define LLVM_TARGET_TARGETSCHEDMODEL_H +#ifndef LLVM_CODEGEN_TARGETSCHEDULE_H +#define LLVM_CODEGEN_TARGETSCHEDULE_H #include "llvm/ADT/SmallVector.h" #include "llvm/MC/MCInstrItineraries.h" @@ -84,6 +84,9 @@ public: /// \brief Maximum number of micro-ops that may be scheduled per cycle. unsigned getIssueWidth() const { return SchedModel.IssueWidth; } + /// \brief Number of cycles the OOO processor is expected to hide. + unsigned getILPWindow() const { return SchedModel.ILPWindow; } + /// \brief Return the number of issue slots required for this MI. unsigned getNumMicroOps(const MachineInstr *MI, const MCSchedClassDesc *SC = 0) const; diff --git a/include/llvm/CodeGen/ValueTypes.h b/include/llvm/CodeGen/ValueTypes.h index f8a6503..ec48b67 100644 --- a/include/llvm/CodeGen/ValueTypes.h +++ b/include/llvm/CodeGen/ValueTypes.h @@ -827,7 +827,7 @@ namespace llvm { /// types are returned as Other, otherwise they are invalid. static EVT getEVT(Type *Ty, bool HandleUnknown = false); - intptr_t getRawBits() { + intptr_t getRawBits() const { if (isSimple()) return V.SimpleTy; else diff --git a/include/llvm/Config/config.h.cmake b/include/llvm/Config/config.h.cmake index ff765cc..0a26857 100644 --- a/include/llvm/Config/config.h.cmake +++ b/include/llvm/Config/config.h.cmake @@ -72,7 +72,7 @@ /* Define to 1 if you have the <CrashReporterClient.h> header file. */ #undef HAVE_CRASHREPORTERCLIENT_H -/* Define if __crashreporter_info__ exists. 
*/ +/* can use __crashreporter_info__ */ #undef HAVE_CRASHREPORTER_INFO /* Define to 1 if you have the <ctype.h> header file. */ @@ -146,6 +146,24 @@ /* Define to 1 if you have the `floorf' function. */ #cmakedefine HAVE_FLOORF ${HAVE_FLOORF} +/* Define to 1 if you have the `log' function. */ +#cmakedefine HAVE_LOG ${HAVE_LOG} + +/* Define to 1 if you have the `log2' function. */ +#cmakedefine HAVE_LOG2 ${HAVE_LOG2} + +/* Define to 1 if you have the `log10' function. */ +#cmakedefine HAVE_LOG10 ${HAVE_LOG10} + +/* Define to 1 if you have the `exp' function. */ +#cmakedefine HAVE_EXP ${HAVE_EXP} + +/* Define to 1 if you have the `exp2' function. */ +#cmakedefine HAVE_EXP2 ${HAVE_EXP2} + +/* Define to 1 if you have the `exp10' function. */ +#cmakedefine HAVE_EXP10 ${HAVE_EXP10} + /* Define to 1 if you have the `fmodf' function. */ #cmakedefine HAVE_FMODF ${HAVE_FMODF} diff --git a/include/llvm/Config/config.h.in b/include/llvm/Config/config.h.in index a4f8af4..2f9e6ff 100644 --- a/include/llvm/Config/config.h.in +++ b/include/llvm/Config/config.h.in @@ -72,7 +72,7 @@ /* Define to 1 if you have the <CrashReporterClient.h> header file. */ #undef HAVE_CRASHREPORTERCLIENT_H -/* Define if __crashreporter_info__ exists. */ +/* can use __crashreporter_info__ */ #undef HAVE_CRASHREPORTER_INFO /* Define to 1 if you have the <ctype.h> header file. */ @@ -122,6 +122,12 @@ /* Define to 1 if you have the <execinfo.h> header file. */ #undef HAVE_EXECINFO_H +/* Define to 1 if you have the `exp' function. */ +#undef HAVE_EXP + +/* Define to 1 if you have the `exp2' function. */ +#undef HAVE_EXP2 + /* Define to 1 if you have the <fcntl.h> header file. */ #undef HAVE_FCNTL_H @@ -225,6 +231,15 @@ the current directory to the dynamic linker search path. */ #undef HAVE_LINK_R +/* Define to 1 if you have the `log' function. */ +#undef HAVE_LOG + +/* Define to 1 if you have the `log10' function. */ +#undef HAVE_LOG10 + +/* Define to 1 if you have the `log2' function. */ +#undef HAVE_LOG2 + /* Define to 1 if you have the `longjmp' function. */ #undef HAVE_LONGJMP diff --git a/include/llvm/Config/llvm-config.h.cmake b/include/llvm/Config/llvm-config.h.cmake index fbc3040..eda17ee 100644 --- a/include/llvm/Config/llvm-config.h.cmake +++ b/include/llvm/Config/llvm-config.h.cmake @@ -124,4 +124,7 @@ /* Minor version of the LLVM API */ #cmakedefine LLVM_VERSION_MINOR ${LLVM_VERSION_MINOR} +/* Define to 1 if you have the <sanitizer/msan_interface.h> header file. */ +#cmakedefine HAVE_SANITIZER_MSAN_INTERFACE_H ${HAVE_SANITIZER_MSAN_INTERFACE_H} + #endif diff --git a/include/llvm/DIBuilder.h b/include/llvm/DIBuilder.h index 0a59cdd..d598475 100644 --- a/include/llvm/DIBuilder.h +++ b/include/llvm/DIBuilder.h @@ -12,8 +12,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_ANALYSIS_DIBUILDER_H -#define LLVM_ANALYSIS_DIBUILDER_H +#ifndef LLVM_DIBUILDER_H +#define LLVM_DIBUILDER_H #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/StringRef.h" @@ -28,6 +28,9 @@ namespace llvm { class LLVMContext; class MDNode; class StringRef; + class DIBasicType; + class DICompositeType; + class DIDerivedType; class DIDescriptor; class DIFile; class DIEnumerator; @@ -88,9 +91,12 @@ namespace llvm { /// by a tool analyzing generated debugging information. /// @param RV This indicates runtime version for languages like /// Objective-C. + /// @param SplitName The name of the file that we'll split debug info out + /// into.
void createCompileUnit(unsigned Lang, StringRef File, StringRef Dir, - StringRef Producer, - bool isOptimized, StringRef Flags, unsigned RV); + StringRef Producer, bool isOptimized, + StringRef Flags, unsigned RV, + StringRef SplitName = StringRef()); /// createFile - Create a file descriptor to hold debugging information /// for a file. @@ -108,32 +114,32 @@ namespace llvm { /// @param SizeInBits Size of the type. /// @param AlignInBits Type alignment. /// @param Encoding DWARF encoding code, e.g. dwarf::DW_ATE_float. - DIType createBasicType(StringRef Name, uint64_t SizeInBits, - uint64_t AlignInBits, unsigned Encoding); + DIBasicType createBasicType(StringRef Name, uint64_t SizeInBits, + uint64_t AlignInBits, unsigned Encoding); /// createQualifiedType - Create debugging information entry for a qualified /// type, e.g. 'const int'. /// @param Tag Tag identifing type, e.g. dwarf::TAG_volatile_type /// @param FromTy Base Type. - DIType createQualifiedType(unsigned Tag, DIType FromTy); + DIDerivedType createQualifiedType(unsigned Tag, DIType FromTy); /// createPointerType - Create debugging information entry for a pointer. /// @param PointeeTy Type pointed by this pointer. /// @param SizeInBits Size. /// @param AlignInBits Alignment. (optional) /// @param Name Pointer type name. (optional) - DIType createPointerType(DIType PointeeTy, uint64_t SizeInBits, - uint64_t AlignInBits = 0, - StringRef Name = StringRef()); + DIDerivedType + createPointerType(DIType PointeeTy, uint64_t SizeInBits, + uint64_t AlignInBits = 0, StringRef Name = StringRef()); /// \brief Create debugging information entry for a pointer to member. /// @param PointeeTy Type pointed to by this pointer. /// @param Class Type for which this pointer points to members of. - DIType createMemberPointerType(DIType PointeeTy, DIType Class); + DIDerivedType createMemberPointerType(DIType PointeeTy, DIType Class); /// createReferenceType - Create debugging information entry for a c++ /// style reference or rvalue reference type. - DIType createReferenceType(unsigned Tag, DIType RTy); + DIDerivedType createReferenceType(unsigned Tag, DIType RTy); /// createTypedef - Create debugging information entry for a typedef. /// @param Ty Original type. @@ -141,8 +147,8 @@ namespace llvm { /// @param File File where this type is defined. /// @param LineNo Line number. /// @param Context The surrounding context for the typedef. - DIType createTypedef(DIType Ty, StringRef Name, DIFile File, - unsigned LineNo, DIDescriptor Context); + DIDerivedType createTypedef(DIType Ty, StringRef Name, DIFile File, + unsigned LineNo, DIDescriptor Context); /// createFriend - Create debugging information entry for a 'friend'. DIType createFriend(DIType Ty, DIType FriendTy); @@ -154,8 +160,8 @@ namespace llvm { /// @param BaseOffset Base offset. /// @param Flags Flags to describe inheritance attribute, /// e.g. private - DIType createInheritance(DIType Ty, DIType BaseTy, uint64_t BaseOffset, - unsigned Flags); + DIDerivedType createInheritance(DIType Ty, DIType BaseTy, + uint64_t BaseOffset, unsigned Flags); /// createMemberType - Create debugging information entry for a member. /// @param Scope Member scope. @@ -167,10 +173,23 @@ namespace llvm { /// @param OffsetInBits Member offset. /// @param Flags Flags to encode member attribute, e.g. private /// @param Ty Parent type. 
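// A minimal sketch, assuming a live DIBuilder: the narrowed return types let
// clients keep DIBasicType/DIDerivedType handles without downcasting. The
// sizes and names are illustrative.
#include "llvm/DIBuilder.h"
#include "llvm/DebugInfo.h"
#include "llvm/Support/Dwarf.h"
static llvm::DIDerivedType makeConstIntPtr(llvm::DIBuilder &DIB) {
  llvm::DIBasicType Int =
      DIB.createBasicType("int", 32, 32, llvm::dwarf::DW_ATE_signed);
  llvm::DIDerivedType ConstInt =
      DIB.createQualifiedType(llvm::dwarf::DW_TAG_const_type, Int);
  return DIB.createPointerType(ConstInt, 64);
}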
- DIType createMemberType(DIDescriptor Scope, StringRef Name, DIFile File, - unsigned LineNo, uint64_t SizeInBits, - uint64_t AlignInBits, uint64_t OffsetInBits, - unsigned Flags, DIType Ty); + DIDerivedType + createMemberType(DIDescriptor Scope, StringRef Name, DIFile File, + unsigned LineNo, uint64_t SizeInBits, uint64_t AlignInBits, + uint64_t OffsetInBits, unsigned Flags, DIType Ty); + + /// createStaticMemberType - Create debugging information entry for a + /// C++ static data member. + /// @param Scope Member scope. + /// @param Name Member name. + /// @param File File where this member is declared. + /// @param LineNo Line number. + /// @param Ty Type of the static member. + /// @param Flags Flags to encode member attribute, e.g. private. + /// @param Val Const initializer of the member. + DIType createStaticMemberType(DIDescriptor Scope, StringRef Name, + DIFile File, unsigned LineNo, DIType Ty, + unsigned Flags, llvm::Value *Val); /// createObjCIVar - Create debugging information entry for Objective-C /// instance variable. @@ -263,10 +282,12 @@ namespace llvm { /// @param Flags Flags to encode member attribute, e.g. private /// @param Elements Struct elements. /// @param RunTimeLang Optional parameter, Objective-C runtime version. - DIType createStructType(DIDescriptor Scope, StringRef Name, DIFile File, - unsigned LineNumber, uint64_t SizeInBits, - uint64_t AlignInBits, unsigned Flags, - DIArray Elements, unsigned RunTimeLang = 0); + DICompositeType createStructType(DIDescriptor Scope, StringRef Name, + DIFile File, unsigned LineNumber, + uint64_t SizeInBits, uint64_t AlignInBits, + unsigned Flags, DIType DerivedFrom, + DIArray Elements, unsigned RunTimeLang = 0, + MDNode *VTableHolder = 0); /// createUnionType - Create debugging information entry for an union. /// @param Scope Scope in which this union is defined. @@ -278,10 +299,10 @@ namespace llvm { /// @param Flags Flags to encode member attribute, e.g. private /// @param Elements Union elements. /// @param RunTimeLang Optional parameter, Objective-C runtime version. - DIType createUnionType(DIDescriptor Scope, StringRef Name, DIFile File, - unsigned LineNumber, uint64_t SizeInBits, - uint64_t AlignInBits, unsigned Flags, - DIArray Elements, unsigned RunTimeLang = 0); + DICompositeType createUnionType( + DIDescriptor Scope, StringRef Name, DIFile File, unsigned LineNumber, + uint64_t SizeInBits, uint64_t AlignInBits, unsigned Flags, + DIArray Elements, unsigned RunTimeLang = 0); /// createTemplateTypeParameter - Create debugging information for template /// type parameter. @@ -316,8 +337,8 @@ namespace llvm { /// @param AlignInBits Alignment. /// @param Ty Element type. /// @param Subscripts Subscripts. - DIType createArrayType(uint64_t Size, uint64_t AlignInBits, - DIType Ty, DIArray Subscripts); + DICompositeType createArrayType(uint64_t Size, uint64_t AlignInBits, + DIType Ty, DIArray Subscripts); /// createVectorType - Create debugging information entry for a vector type. /// @param Size Array size. @@ -336,16 +357,16 @@ namespace llvm { /// @param SizeInBits Member size. /// @param AlignInBits Member alignment. /// @param Elements Enumeration elements. 
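// Sketch under stated assumptions: describing a C++ `static const int k = 42;`
// member with the new entry point. DIB, Scope, File and IntTy are assumed to
// exist already; recording the initializer lets a DWARF emitter produce
// DW_AT_const_value for the member later.
#include "llvm/DIBuilder.h"
#include "llvm/DebugInfo.h"
#include "llvm/IR/Constants.h"
static llvm::DIType describeStaticMember(llvm::DIBuilder &DIB,
                                         llvm::DIDescriptor Scope,
                                         llvm::DIFile File, llvm::DIType IntTy,
                                         llvm::LLVMContext &Ctx) {
  llvm::Value *Init = llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 42);
  return DIB.createStaticMemberType(Scope, "k", File, /*LineNo=*/2, IntTy,
                                    /*Flags=*/0, Init);
}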
- DIType createEnumerationType(DIDescriptor Scope, StringRef Name, - DIFile File, unsigned LineNumber, - uint64_t SizeInBits, uint64_t AlignInBits, - DIArray Elements, DIType ClassType); + DICompositeType createEnumerationType( + DIDescriptor Scope, StringRef Name, DIFile File, unsigned LineNumber, + uint64_t SizeInBits, uint64_t AlignInBits, DIArray Elements, + DIType ClassType); /// createSubroutineType - Create subroutine type. /// @param File File in which this subroutine is defined. /// @param ParameterTypes An array of subroutine parameter types. This /// includes return type at 0th index. - DIType createSubroutineType(DIFile File, DIArray ParameterTypes); + DICompositeType createSubroutineType(DIFile File, DIArray ParameterTypes); /// createArtificialType - Create a new DIType with "artificial" flag set. DIType createArtificialType(DIType Ty); @@ -402,10 +423,12 @@ namespace llvm { /// @param isLocalToUnit Boolean flag indicate whether this variable is /// externally visible or not. /// @param Val llvm::Value of the variable. + /// @param Decl Reference to the corresponding declaration. DIGlobalVariable createStaticVariable(DIDescriptor Context, StringRef Name, StringRef LinkageName, DIFile File, unsigned LineNo, - DIType Ty, bool isLocalToUnit, llvm::Value *Val); + DIType Ty, bool isLocalToUnit, llvm::Value *Val, + MDNode *Decl = NULL); /// createLocalVariable - Create a new descriptor for the specified diff --git a/include/llvm/DebugInfo.h b/include/llvm/DebugInfo.h index cc1c6a0..c557a7a 100644 --- a/include/llvm/DebugInfo.h +++ b/include/llvm/DebugInfo.h @@ -14,8 +14,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_ANALYSIS_DEBUGINFO_H -#define LLVM_ANALYSIS_DEBUGINFO_H +#ifndef LLVM_DEBUGINFO_H +#define LLVM_DEBUGINFO_H #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" @@ -62,7 +62,8 @@ namespace llvm { FlagPrototyped = 1 << 8, FlagObjcClassComplete = 1 << 9, FlagObjectPointer = 1 << 10, - FlagVector = 1 << 11 + FlagVector = 1 << 11, + FlagStaticMember = 1 << 12 }; protected: const MDNode *DbgNode; @@ -186,12 +187,12 @@ namespace llvm { /// isMain - Each input file is encoded as a separate compile unit in LLVM /// debugging information output. However, many target specific tool chains /// prefer to encode only one compile unit in an object file. In this - /// situation, the LLVM code generator will include debugging information + /// situation, the LLVM code generator will include debugging information /// entities in the compile unit that is marked as main compile unit. The /// code generator accepts maximum one main compile unit per module. If a /// module does not contain any main compile unit then the code generator /// will emit multiple compile units in the output object file. - + // TODO: This can be removed when we remove the legacy debug information. bool isMain() const { return getUnsignedField(6) != 0; } bool isOptimized() const { return getUnsignedField(7) != 0; } StringRef getFlags() const { return getStringField(8); } @@ -202,6 +203,8 @@ namespace llvm { DIArray getSubprograms() const; DIArray getGlobalVariables() const; + StringRef getSplitDebugFilename() const { return getStringField(14); } + /// Verify - Verify that a compile unit is well formed. 
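// Sketch: the new trailing Decl parameter ties the out-of-class definition of
// the member above back to its in-class declaration. GV and MemberDecl are
// assumed; the mangled linkage name is illustrative.
#include "llvm/DIBuilder.h"
#include "llvm/DebugInfo.h"
#include "llvm/IR/GlobalVariable.h"
static llvm::DIGlobalVariable
describeDefinition(llvm::DIBuilder &DIB, llvm::DIDescriptor Context,
                   llvm::DIFile File, llvm::DIType Ty,
                   llvm::GlobalVariable *GV, llvm::MDNode *MemberDecl) {
  return DIB.createStaticVariable(Context, "k", /*LinkageName=*/"_ZN1S1kE",
                                  File, /*LineNo=*/5, Ty,
                                  /*isLocalToUnit=*/false, GV,
                                  /*Decl=*/MemberDecl);
}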
bool Verify() const; }; @@ -300,6 +303,9 @@ namespace llvm { bool isVector() const { return (getFlags() & FlagVector) != 0; } + bool isStaticMember() const { + return (getFlags() & FlagStaticMember) != 0; + } bool isValid() const { return DbgNode && (isBasicType() || isDerivedType() || isCompositeType()); } @@ -337,7 +343,8 @@ namespace llvm { }; /// DIDerivedType - A simple derived type, like a const qualified type, - /// a typedef, a pointer or reference, etc. + /// a typedef, a pointer or reference, et cetera. Or, a data member of + /// a class/struct/union. class DIDerivedType : public DIType { friend class DIDescriptor; void printInternal(raw_ostream &OS) const; @@ -363,6 +370,11 @@ namespace llvm { return getFieldAs<DIType>(10); } + Constant *getConstant() const { + assert((getTag() == dwarf::DW_TAG_member) && isStaticMember()); + return getConstantField(10); + } + StringRef getObjCPropertyName() const { if (getVersion() > LLVMDebugVersion11) return StringRef(); @@ -512,6 +524,10 @@ namespace llvm { return getFieldAs<DICompositeType>(13); } + unsigned getFlags() const { + return getUnsignedField(14); + } + unsigned isArtificial() const { if (getVersion() <= llvm::LLVMDebugVersion8) return getUnsignedField(14); @@ -560,6 +576,10 @@ namespace llvm { return getFieldAs<DIFile>(6).getDirectory(); } + DIFile getFile() const { + return getFieldAs<DIFile>(6); + } + /// getScopeLineNumber - Get the beginning of the scope of the /// function, not necessarily where the name of the program /// starts. @@ -620,6 +640,9 @@ namespace llvm { GlobalVariable *getGlobal() const { return getGlobalVariableField(11); } Constant *getConstant() const { return getConstantField(11); } + DIDerivedType getStaticDataMemberDeclaration() const { + return getFieldAs<DIDerivedType>(12); + } /// Verify - Verify that a global variable descriptor is well formed. bool Verify() const; @@ -692,7 +715,7 @@ namespace llvm { return getType().isBlockByrefStruct(); } - /// isInlinedFnArgument - Return trule if this variable provides debugging + /// isInlinedFnArgument - Return true if this variable provides debugging /// information for an inlined function arguments. bool isInlinedFnArgument(const Function *CurFn); @@ -721,7 +744,7 @@ namespace llvm { class DILexicalBlockFile : public DIScope { public: explicit DILexicalBlockFile(const MDNode *N = 0) : DIScope(N) {} - DIScope getContext() const { return getScope().getContext(); } + DIScope getContext() const { if (getScope().isSubprogram()) return getScope(); return getScope().getContext(); } unsigned getLineNumber() const { return getScope().getLineNumber(); } unsigned getColumnNumber() const { return getScope().getColumnNumber(); } StringRef getDirectory() const { diff --git a/include/llvm/DebugInfo/DIContext.h b/include/llvm/DebugInfo/DIContext.h index 5ebf4b0..8fcd9e0 100644 --- a/include/llvm/DebugInfo/DIContext.h +++ b/include/llvm/DebugInfo/DIContext.h @@ -58,6 +58,8 @@ public: } }; +typedef SmallVector<std::pair<uint64_t, DILineInfo>, 16> DILineInfoTable; + /// DIInliningInfo - a format-neutral container for inlined code description. class DIInliningInfo { SmallVector<DILineInfo, 4> Frames; @@ -92,6 +94,24 @@ public: } }; +/// Selects which debug sections get dumped. 
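// Sketch: exercising the widened DIContext interface, using the DIDumpType
// enum introduced just below for section-selective dumping plus the new
// range-based line-table query. Ctx, Addr and Size are assumed.
#include "llvm/DebugInfo/DIContext.h"
#include "llvm/Support/raw_ostream.h"
static void showLines(llvm::DIContext &Ctx, uint64_t Addr, uint64_t Size) {
  Ctx.dump(llvm::outs(), llvm::DIDT_Info); // dump only .debug_info
  llvm::DILineInfoTable Lines = Ctx.getLineInfoForAddressRange(Addr, Size);
  for (unsigned i = 0, e = Lines.size(); i != e; ++i)
    llvm::outs() << Lines[i].first << " -> line "
                 << Lines[i].second.getLine() << "\n";
}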
+enum DIDumpType { + DIDT_Null, + DIDT_All, + DIDT_Abbrev, + DIDT_AbbrevDwo, + DIDT_Aranges, + DIDT_Frames, + DIDT_Info, + DIDT_InfoDwo, + DIDT_Line, + DIDT_Ranges, + DIDT_Pubnames, + DIDT_Str, + DIDT_StrDwo, + DIDT_StrOffsetsDwo +}; + // In place of applying the relocations to the data we've read from disk we use // a separate mapping table to the side and checking that at locations in the // dwarf where we expect relocated values. This adds a bit of complexity to the @@ -106,10 +126,12 @@ public: /// getDWARFContext - get a context for binary DWARF data. static DIContext *getDWARFContext(object::ObjectFile *); - virtual void dump(raw_ostream &OS) = 0; + virtual void dump(raw_ostream &OS, DIDumpType DumpType = DIDT_All) = 0; virtual DILineInfo getLineInfoForAddress(uint64_t Address, DILineInfoSpecifier Specifier = DILineInfoSpecifier()) = 0; + virtual DILineInfoTable getLineInfoForAddressRange(uint64_t Address, + uint64_t Size, DILineInfoSpecifier Specifier = DILineInfoSpecifier()) = 0; virtual DIInliningInfo getInliningInfoForAddress(uint64_t Address, DILineInfoSpecifier Specifier = DILineInfoSpecifier()) = 0; }; diff --git a/include/llvm/ExecutionEngine/ExecutionEngine.h b/include/llvm/ExecutionEngine/ExecutionEngine.h index 2d60c36..3fd69e266 100644 --- a/include/llvm/ExecutionEngine/ExecutionEngine.h +++ b/include/llvm/ExecutionEngine/ExecutionEngine.h @@ -12,8 +12,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_EXECUTION_ENGINE_H -#define LLVM_EXECUTION_ENGINE_H +#ifndef LLVM_EXECUTIONENGINE_EXECUTIONENGINE_H +#define LLVM_EXECUTIONENGINE_EXECUTIONENGINE_H #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallVector.h" diff --git a/include/llvm/ExecutionEngine/GenericValue.h b/include/llvm/ExecutionEngine/GenericValue.h index a2fed98..e160e3a 100644 --- a/include/llvm/ExecutionEngine/GenericValue.h +++ b/include/llvm/ExecutionEngine/GenericValue.h @@ -12,8 +12,8 @@ //===----------------------------------------------------------------------===// -#ifndef GENERIC_VALUE_H -#define GENERIC_VALUE_H +#ifndef LLVM_EXECUTIONENGINE_GENERICVALUE_H +#define LLVM_EXECUTIONENGINE_GENERICVALUE_H #include "llvm/ADT/APInt.h" #include "llvm/Support/DataTypes.h" diff --git a/include/llvm/ExecutionEngine/Interpreter.h b/include/llvm/ExecutionEngine/Interpreter.h index 72d97ef..f49d0c4 100644 --- a/include/llvm/ExecutionEngine/Interpreter.h +++ b/include/llvm/ExecutionEngine/Interpreter.h @@ -12,8 +12,8 @@ // //===----------------------------------------------------------------------===// -#ifndef EXECUTION_ENGINE_INTERPRETER_H -#define EXECUTION_ENGINE_INTERPRETER_H +#ifndef LLVM_EXECUTIONENGINE_INTERPRETER_H +#define LLVM_EXECUTIONENGINE_INTERPRETER_H #include "llvm/ExecutionEngine/ExecutionEngine.h" #include <cstdlib> diff --git a/include/llvm/ExecutionEngine/JIT.h b/include/llvm/ExecutionEngine/JIT.h index b4cda1d..581d6e6 100644 --- a/include/llvm/ExecutionEngine/JIT.h +++ b/include/llvm/ExecutionEngine/JIT.h @@ -12,8 +12,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_EXECUTION_ENGINE_JIT_H -#define LLVM_EXECUTION_ENGINE_JIT_H +#ifndef LLVM_EXECUTIONENGINE_JIT_H +#define LLVM_EXECUTIONENGINE_JIT_H #include "llvm/ExecutionEngine/ExecutionEngine.h" #include <cstdlib> diff --git a/include/llvm/ExecutionEngine/JITEventListener.h b/include/llvm/ExecutionEngine/JITEventListener.h index 3aa014e..ed66102 100644 --- a/include/llvm/ExecutionEngine/JITEventListener.h +++ 
b/include/llvm/ExecutionEngine/JITEventListener.h @@ -12,8 +12,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_EXECUTION_ENGINE_JIT_EVENTLISTENER_H -#define LLVM_EXECUTION_ENGINE_JIT_EVENTLISTENER_H +#ifndef LLVM_EXECUTIONENGINE_JITEVENTLISTENER_H +#define LLVM_EXECUTIONENGINE_JITEVENTLISTENER_H #include "llvm/Config/llvm-config.h" #include "llvm/Support/DataTypes.h" @@ -127,4 +127,4 @@ public: } // end namespace llvm. -#endif // defined LLVM_EXECUTION_ENGINE_JIT_EVENTLISTENER_H +#endif // defined LLVM_EXECUTIONENGINE_JITEVENTLISTENER_H diff --git a/include/llvm/ExecutionEngine/JITMemoryManager.h b/include/llvm/ExecutionEngine/JITMemoryManager.h index 29e01aa..714a980 100644 --- a/include/llvm/ExecutionEngine/JITMemoryManager.h +++ b/include/llvm/ExecutionEngine/JITMemoryManager.h @@ -7,8 +7,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_EXECUTION_ENGINE_JIT_MEMMANAGER_H -#define LLVM_EXECUTION_ENGINE_JIT_MEMMANAGER_H +#ifndef LLVM_EXECUTIONENGINE_JITMEMORYMANAGER_H +#define LLVM_EXECUTIONENGINE_JITMEMORYMANAGER_H #include "llvm/ExecutionEngine/RuntimeDyld.h" #include "llvm/Support/DataTypes.h" diff --git a/include/llvm/ExecutionEngine/MCJIT.h b/include/llvm/ExecutionEngine/MCJIT.h index ac16bdc..66ddb7c 100644 --- a/include/llvm/ExecutionEngine/MCJIT.h +++ b/include/llvm/ExecutionEngine/MCJIT.h @@ -12,8 +12,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_EXECUTION_ENGINE_MCJIT_H -#define LLVM_EXECUTION_ENGINE_MCJIT_H +#ifndef LLVM_EXECUTIONENGINE_MCJIT_H +#define LLVM_EXECUTIONENGINE_MCJIT_H #include "llvm/ExecutionEngine/ExecutionEngine.h" #include <cstdlib> diff --git a/include/llvm/ExecutionEngine/OProfileWrapper.h b/include/llvm/ExecutionEngine/OProfileWrapper.h index 99553a3..05da594 100644 --- a/include/llvm/ExecutionEngine/OProfileWrapper.h +++ b/include/llvm/ExecutionEngine/OProfileWrapper.h @@ -17,8 +17,8 @@ // //===----------------------------------------------------------------------===// -#ifndef OPROFILE_WRAPPER_H -#define OPROFILE_WRAPPER_H +#ifndef LLVM_EXECUTIONENGINE_OPROFILEWRAPPER_H +#define LLVM_EXECUTIONENGINE_OPROFILEWRAPPER_H #include "llvm/Support/DataTypes.h" #include <opagent.h> @@ -121,4 +121,4 @@ private: } // namespace llvm -#endif //OPROFILE_WRAPPER_H +#endif // LLVM_EXECUTIONENGINE_OPROFILEWRAPPER_H diff --git a/include/llvm/ExecutionEngine/ObjectImage.h b/include/llvm/ExecutionEngine/ObjectImage.h index d09e4de..9fddca7 100644 --- a/include/llvm/ExecutionEngine/ObjectImage.h +++ b/include/llvm/ExecutionEngine/ObjectImage.h @@ -50,6 +50,8 @@ public: virtual StringRef getData() const = 0; + virtual object::ObjectFile* getObjectFile() const = 0; + // Subclasses can override these methods to provide JIT debugging support virtual void registerWithDebugger() = 0; virtual void deregisterWithDebugger() = 0; diff --git a/include/llvm/ExecutionEngine/RuntimeDyld.h b/include/llvm/ExecutionEngine/RuntimeDyld.h index e2905e3..4222d53 100644 --- a/include/llvm/ExecutionEngine/RuntimeDyld.h +++ b/include/llvm/ExecutionEngine/RuntimeDyld.h @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_RUNTIME_DYLD_H -#define LLVM_RUNTIME_DYLD_H +#ifndef LLVM_EXECUTIONENGINE_RUNTIMEDYLD_H +#define LLVM_EXECUTIONENGINE_RUNTIMEDYLD_H #include "llvm/ADT/StringRef.h" #include "llvm/ExecutionEngine/ObjectBuffer.h" diff --git 
a/include/llvm/ExecutionEngine/SectionMemoryManager.h b/include/llvm/ExecutionEngine/SectionMemoryManager.h index ba4ba8d0..ae5004e 100644 --- a/include/llvm/ExecutionEngine/SectionMemoryManager.h +++ b/include/llvm/ExecutionEngine/SectionMemoryManager.h @@ -12,8 +12,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_EXECUTION_ENGINE_SECTION_MEMORY_MANAGER_H -#define LLVM_EXECUTION_ENGINE_SECTION_MEMORY_MANAGER_H +#ifndef LLVM_EXECUTIONENGINE_SECTIONMEMORYMANAGER_H +#define LLVM_EXECUTIONENGINE_SECTIONMEMORYMANAGER_H #include "llvm/ADT/SmallVector.h" #include "llvm/ExecutionEngine/JITMemoryManager.h" diff --git a/include/llvm/GVMaterializer.h b/include/llvm/GVMaterializer.h index c143552..1e5c426 100644 --- a/include/llvm/GVMaterializer.h +++ b/include/llvm/GVMaterializer.h @@ -15,8 +15,8 @@ // //===----------------------------------------------------------------------===// -#ifndef GVMATERIALIZER_H -#define GVMATERIALIZER_H +#ifndef LLVM_GVMATERIALIZER_H +#define LLVM_GVMATERIALIZER_H #include <string> diff --git a/include/llvm/IR/Argument.h b/include/llvm/IR/Argument.h index f737e40..ef4e4fc 100644 --- a/include/llvm/IR/Argument.h +++ b/include/llvm/IR/Argument.h @@ -7,12 +7,12 @@ // //===----------------------------------------------------------------------===// // -// This file declares the Argument class. +// This file declares the Argument class. // //===----------------------------------------------------------------------===// -#ifndef LLVM_ARGUMENT_H -#define LLVM_ARGUMENT_H +#ifndef LLVM_IR_ARGUMENT_H +#define LLVM_IR_ARGUMENT_H #include "llvm/ADT/Twine.h" #include "llvm/ADT/ilist_node.h" @@ -24,11 +24,14 @@ namespace llvm { template<typename ValueSubClass, typename ItemParentClass> class SymbolTableListTraits; -/// A class to represent an incoming formal argument to a Function. An argument -/// is a very simple Value. It is essentially a named (optional) type. When used -/// in the body of a function, it represents the value of the actual argument -/// the function was called with. -/// @brief LLVM Argument representation +/// \brief LLVM Argument representation +/// +/// This class represents an incoming formal argument to a Function. A formal +/// argument, since it is ``formal'', does not contain an actual value but +/// instead represents the type, argument number, and attributes of an argument +/// for a specific function. When used in the body of said function, the +/// argument of course represents the value of the actual argument that the +/// function was called with. class Argument : public Value, public ilist_node<Argument> { virtual void anchor(); Function *Parent; @@ -37,50 +40,52 @@ class Argument : public Value, public ilist_node<Argument> { void setParent(Function *parent); public: - /// Argument ctor - If Function argument is specified, this argument is - /// inserted at the end of the argument list for the function. + /// \brief Constructor. /// + /// If \p F is specified, the argument is inserted at the end of the argument + /// list for \p F. explicit Argument(Type *Ty, const Twine &Name = "", Function *F = 0); inline const Function *getParent() const { return Parent; } inline Function *getParent() { return Parent; } - /// getArgNo - Return the index of this formal argument in its containing - /// function. For example in "void foo(int a, float b)" a is 0 and b is 1. + /// \brief Return the index of this formal argument in its containing + /// function. 
+ /// + /// For example in "void foo(int a, float b)" a is 0 and b is 1. unsigned getArgNo() const; - - /// hasByValAttr - Return true if this argument has the byval attribute on it - /// in its containing function. + + /// \brief Return true if this argument has the byval attribute on it in its + /// containing function. bool hasByValAttr() const; - - /// getParamAlignment - If this is a byval argument, return its alignment. + + /// \brief If this is a byval argument, return its alignment. unsigned getParamAlignment() const; - /// hasNestAttr - Return true if this argument has the nest attribute on - /// it in its containing function. + /// \brief Return true if this argument has the nest attribute on it in its + /// containing function. bool hasNestAttr() const; - /// hasNoAliasAttr - Return true if this argument has the noalias attribute on - /// it in its containing function. + /// \brief Return true if this argument has the noalias attribute on it in its + /// containing function. bool hasNoAliasAttr() const; - - /// hasNoCaptureAttr - Return true if this argument has the nocapture - /// attribute on it in its containing function. + + /// \brief Return true if this argument has the nocapture attribute on it in + /// its containing function. bool hasNoCaptureAttr() const; - - /// hasStructRetAttr - Return true if this argument has the sret attribute on - /// it in its containing function. + + /// \brief Return true if this argument has the sret attribute on it in its + /// containing function. bool hasStructRetAttr() const; - /// addAttr - Add a Attribute to an argument - void addAttr(Attribute); - - /// removeAttr - Remove a Attribute from an argument - void removeAttr(Attribute); + /// \brief Add an Attribute to an argument. + void addAttr(AttributeSet AS); - /// classof - Methods for support type inquiry through isa, cast, and - /// dyn_cast: - /// + /// \brief Remove an Attribute from an argument. + void removeAttr(AttributeSet AS); + + /// \brief Method for support type inquiry through isa, cast, and + /// dyn_cast. static inline bool classof(const Value *V) { return V->getValueID() == ArgumentVal; } diff --git a/include/llvm/IR/Attributes.h b/include/llvm/IR/Attributes.h index 95b654c..30c0965 100644 --- a/include/llvm/IR/Attributes.h +++ b/include/llvm/IR/Attributes.h @@ -13,20 +13,25 @@ /// //===----------------------------------------------------------------------===// -#ifndef LLVM_ATTRIBUTES_H -#define LLVM_ATTRIBUTES_H +#ifndef LLVM_IR_ATTRIBUTES_H +#define LLVM_IR_ATTRIBUTES_H #include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/DenseSet.h" -#include "llvm/Support/MathExtras.h" +#include "llvm/ADT/FoldingSet.h" +#include "llvm/Support/PointerLikeTypeTraits.h" +#include <bitset> #include <cassert> +#include <map> #include <string> namespace llvm { class AttrBuilder; class AttributeImpl; +class AttributeSetImpl; +class AttributeSetNode; class Constant; +template<typename T> struct DenseMapInfo; class LLVMContext; class Type; @@ -39,13 +44,13 @@ class Type; class Attribute { public: /// This enumeration lists the attributes that can be associated with - /// parameters, function results or the function itself. + /// parameters, function results, or the function itself. /// - /// Note: uwtable is about the ABI or the user mandating an entry in the - /// unwind table. The nounwind attribute is about an exception passing by the - /// function. + /// Note: The `uwtable' attribute is about the ABI or the user mandating an + /// entry in the unwind table. The `nounwind' attribute is about an exception + /// passing by the function. ///
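// Illustrative helper, not in the patch: with addAttr() now taking an
// AttributeSet, a single enum attribute is wrapped at the argument's 1-based
// attribute index before being applied.
#include "llvm/IR/Argument.h"
#include "llvm/IR/Attributes.h"
static void markNoCapture(llvm::Argument &A) {
  llvm::LLVMContext &C = A.getContext();
  A.addAttr(llvm::AttributeSet::get(C, A.getArgNo() + 1,
                                    llvm::Attribute::NoCapture));
}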
- /// In a theoretical system that uses tables for profiling and sjlj for + /// In a theoretical system that uses tables for profiling and SjLj for /// exceptions, they would be fully independent. In a normal system that uses /// tables for both, the semantics are: /// @@ -58,7 +63,6 @@ public: enum AttrKind { // IR-Level Attributes None, ///< No attributes have been set - AddressSafety, ///< Address safety checking is on. Alignment, ///< Alignment of parameter (5 bits) ///< stored as log2 of alignment with +1 bias ///< 0 means unaligned (different from align(1)) @@ -70,6 +74,7 @@ public: Naked, ///< Naked function Nest, ///< Nested function static chain NoAlias, ///< Considered to not alias after call + NoBuiltin, ///< Callee isn't recognized as a builtin NoCapture, ///< Function creates no aliases of pointer NoDuplicate, ///< Call cannot be duplicated NoImplicitFloat, ///< Disable implicit floating point insts @@ -90,14 +95,15 @@ public: ///< alignstack=(1)) StackProtect, ///< Stack protection. StackProtectReq, ///< Stack protection required. + StackProtectStrong, ///< Strong Stack protection. StructRet, ///< Hidden pointer to structure to return + SanitizeAddress, ///< AddressSanitizer is on. + SanitizeThread, ///< ThreadSanitizer is on. + SanitizeMemory, ///< MemorySanitizer is on. UWTable, ///< Function must be in a unwind table ZExt, ///< Zero extended before/after call - EndAttrKinds, ///< Sentinal value useful for loops - - AttrKindEmptyKey, ///< Empty key value for DenseMapInfo - AttrKindTombstoneKey ///< Tombstone key value for DenseMapInfo + EndAttrKinds ///< Sentinel value useful for loops }; private: AttributeImpl *pImpl; @@ -105,126 +111,88 @@ private: public: Attribute() : pImpl(0) {} - /// \brief Return a uniquified Attribute object. This takes the uniquified - /// value from the Builder and wraps it in the Attribute class. - static Attribute get(LLVMContext &Context, ArrayRef<AttrKind> Vals); - static Attribute get(LLVMContext &Context, AttrBuilder &B); + //===--------------------------------------------------------------------===// + // Attribute Construction + //===--------------------------------------------------------------------===// - /// \brief Return true if the attribute is present. - bool hasAttribute(AttrKind Val) const; + /// \brief Return a uniquified Attribute object. + static Attribute get(LLVMContext &Context, AttrKind Kind, uint64_t Val = 0); + static Attribute get(LLVMContext &Context, StringRef Kind, + StringRef Val = StringRef()); - /// \brief Return true if attributes exist - bool hasAttributes() const; + /// \brief Return a uniquified Attribute object that has the specific + /// alignment set. + static Attribute getWithAlignment(LLVMContext &Context, uint64_t Align); + static Attribute getWithStackAlignment(LLVMContext &Context, uint64_t Align); - /// \brief Returns the alignment field of an attribute as a byte alignment - /// value. - unsigned getAlignment() const; + //===--------------------------------------------------------------------===// + // Attribute Accessors + //===--------------------------------------------------------------------===// - /// \brief Set the alignment field of an attribute. - void setAlignment(unsigned Align); + /// \brief Return true if the attribute is an Attribute::AttrKind type. + bool isEnumAttribute() const; - /// \brief Returns the stack alignment field of an attribute as a byte - /// alignment value.
- unsigned getStackAlignment() const; + /// \brief Return true if the attribute is an alignment attribute. + bool isAlignAttribute() const; - /// \brief Set the stack alignment field of an attribute. - void setStackAlignment(unsigned Align); + /// \brief Return true if the attribute is a string (target-dependent) + /// attribute. + bool isStringAttribute() const; - /// \brief Equality and non-equality query methods. - bool operator==(AttrKind K) const; - bool operator!=(AttrKind K) const; + /// \brief Return true if the attribute is present. + bool hasAttribute(AttrKind Val) const; - // FIXME: Remove these 'operator' methods. - bool operator==(const Attribute &A) const { - return pImpl == A.pImpl; - } - bool operator!=(const Attribute &A) const { - return pImpl != A.pImpl; - } + /// \brief Return true if the target-dependent attribute is present. + bool hasAttribute(StringRef Val) const; - uint64_t getBitMask() const; + /// \brief Return the attribute's kind as an enum (Attribute::AttrKind). This + /// requires the attribute to be an enum or alignment attribute. + Attribute::AttrKind getKindAsEnum() const; - /// \brief Which attributes cannot be applied to a type. - static Attribute typeIncompatible(Type *Ty); + /// \brief Return the attribute's value as an integer. This requires that the + /// attribute be an alignment attribute. + uint64_t getValueAsInt() const; - /// \brief This returns an integer containing an encoding of all the LLVM - /// attributes found in the given attribute bitset. Any change to this - /// encoding is a breaking change to bitcode compatibility. - static uint64_t encodeLLVMAttributesForBitcode(Attribute Attrs); + /// \brief Return the attribute's kind as a string. This requires the + /// attribute to be a string attribute. + StringRef getKindAsString() const; - /// \brief This returns an attribute bitset containing the LLVM attributes - /// that have been decoded from the given integer. This function must stay in - /// sync with 'encodeLLVMAttributesForBitcode'. - static Attribute decodeLLVMAttributesForBitcode(LLVMContext &C, - uint64_t EncodedAttrs); + /// \brief Return the attribute's value as a string. This requires the + /// attribute to be a string attribute. + StringRef getValueAsString() const; + + /// \brief Returns the alignment field of an attribute as a byte alignment + /// value. + unsigned getAlignment() const; + + /// \brief Returns the stack alignment field of an attribute as a byte + /// alignment value. + unsigned getStackAlignment() const; /// \brief The Attribute is converted to a string of equivalent mnemonic. This /// is, presumably, for writing out the mnemonics for the assembly writer. - std::string getAsString() const; -}; + std::string getAsString(bool InAttrGrp = false) const; -//===----------------------------------------------------------------------===// -/// \class -/// \brief Provide DenseMapInfo for Attribute::AttrKinds. This is used by -/// AttrBuilder. -template<> struct DenseMapInfo<Attribute::AttrKind> { - static inline Attribute::AttrKind getEmptyKey() { - return Attribute::AttrKindEmptyKey; - } - static inline Attribute::AttrKind getTombstoneKey() { - return Attribute::AttrKindTombstoneKey; - } - static unsigned getHashValue(const Attribute::AttrKind &Val) { - return Val * 37U; - } - static bool isEqual(const Attribute::AttrKind &LHS, - const Attribute::AttrKind &RHS) { - return LHS == RHS; - } -}; + /// \brief Equality and non-equality operators. 
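// Sketch of the reworked factory interface: enum, alignment and
// target-dependent string attributes all flow through one class. The
// "cpu"/"cortex-a15" pair is an illustrative key, not a defined one.
#include "llvm/IR/Attributes.h"
#include <cassert>
static void demoAttributes(llvm::LLVMContext &C) {
  llvm::Attribute NU = llvm::Attribute::get(C, llvm::Attribute::NoUnwind);
  llvm::Attribute Al = llvm::Attribute::getWithAlignment(C, 16);
  llvm::Attribute TD = llvm::Attribute::get(C, "cpu", "cortex-a15");
  assert(NU.isEnumAttribute() && NU.hasAttribute(llvm::Attribute::NoUnwind));
  assert(Al.isAlignAttribute() && Al.getAlignment() == 16);
  assert(TD.isStringAttribute() && TD.getValueAsString() == "cortex-a15");
  (void)NU; (void)Al; (void)TD;
}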
+ bool operator==(Attribute A) const { return pImpl == A.pImpl; } + bool operator!=(Attribute A) const { return pImpl != A.pImpl; } -//===----------------------------------------------------------------------===// -/// \class -/// \brief This is just a pair of values to associate a set of attributes with -/// an index. -struct AttributeWithIndex { - Attribute Attrs; ///< The attributes that are set, or'd together. - Constant *Val; ///< Value attached to attribute, e.g. alignment. - unsigned Index; ///< Index of the parameter for which the attributes apply. - ///< Index 0 is used for return value attributes. - ///< Index ~0U is used for function attributes. - - static AttributeWithIndex get(LLVMContext &C, unsigned Idx, - ArrayRef<Attribute::AttrKind> Attrs) { - return get(Idx, Attribute::get(C, Attrs)); - } - static AttributeWithIndex get(unsigned Idx, Attribute Attrs) { - AttributeWithIndex P; - P.Index = Idx; - P.Attrs = Attrs; - P.Val = 0; - return P; - } - static AttributeWithIndex get(unsigned Idx, Attribute Attrs, Constant *Val) { - AttributeWithIndex P; - P.Index = Idx; - P.Attrs = Attrs; - P.Val = Val; - return P; + /// \brief Less-than operator. Useful for sorting the attributes list. + bool operator<(Attribute A) const; + + void Profile(FoldingSetNodeID &ID) const { + ID.AddPointer(pImpl); } }; //===----------------------------------------------------------------------===// -// AttributeSet Smart Pointer -//===----------------------------------------------------------------------===// - -class AttrBuilder; -class AttributeSetImpl; - -//===----------------------------------------------------------------------===// /// \class -/// \brief This class manages the ref count for the opaque AttributeSetImpl -/// object and provides accessors for it. +/// \brief This class holds the attributes for a function, its return value, and +/// its parameters. You access the attributes for each of them via an index into +/// the AttributeSet object. The function attributes are at index +/// `AttributeSet::FunctionIndex', the return value is at index +/// `AttributeSet::ReturnIndex', and the attributes for the parameters start at +/// index `1'. class AttributeSet { public: enum AttrIndex { @@ -233,105 +201,141 @@ public: }; private: friend class AttrBuilder; + friend class AttributeSetImpl; + template <typename Ty> friend struct DenseMapInfo; - /// \brief The attributes that we are managing. This can be null to represent + /// \brief The attributes that we are managing. This can be null to represent /// the empty attributes list. - AttributeSetImpl *AttrList; + AttributeSetImpl *pImpl; + + /// \brief The attributes for the specified index are returned. + AttributeSetNode *getAttributes(unsigned Idx) const; - /// \brief The attributes for the specified index are returned. Attributes - /// for the result are denoted with Idx = 0. - Attribute getAttributes(unsigned Idx) const; + /// \brief Create an AttributeSet with the specified parameters in it. 
+ static AttributeSet get(LLVMContext &C, + ArrayRef<std::pair<unsigned, Attribute> > Attrs); + static AttributeSet get(LLVMContext &C, + ArrayRef<std::pair<unsigned, + AttributeSetNode*> > Attrs); - explicit AttributeSet(AttributeSetImpl *LI) : AttrList(LI) {} + static AttributeSet getImpl(LLVMContext &C, + ArrayRef<std::pair<unsigned, + AttributeSetNode*> > Attrs); + + + explicit AttributeSet(AttributeSetImpl *LI) : pImpl(LI) {} public: - AttributeSet() : AttrList(0) {} - AttributeSet(const AttributeSet &P) : AttrList(P.AttrList) {} - const AttributeSet &operator=(const AttributeSet &RHS); + AttributeSet() : pImpl(0) {} + AttributeSet(const AttributeSet &P) : pImpl(P.pImpl) {} + const AttributeSet &operator=(const AttributeSet &RHS) { + pImpl = RHS.pImpl; + return *this; + } //===--------------------------------------------------------------------===// - // Attribute List Construction and Mutation + // AttributeSet Construction and Mutation //===--------------------------------------------------------------------===// /// \brief Return an AttributeSet with the specified parameters in it. - static AttributeSet get(LLVMContext &C, ArrayRef<AttributeWithIndex> Attrs); + static AttributeSet get(LLVMContext &C, ArrayRef<AttributeSet> Attrs); + static AttributeSet get(LLVMContext &C, unsigned Idx, + ArrayRef<Attribute::AttrKind> Kind); static AttributeSet get(LLVMContext &C, unsigned Idx, AttrBuilder &B); - /// \brief Add the specified attribute at the specified index to this - /// attribute list. Since attribute lists are immutable, this returns the new - /// list. - AttributeSet addAttr(LLVMContext &C, unsigned Idx, Attribute Attrs) const; + /// \brief Add an attribute to the attribute set at the given index. Since + /// attribute sets are immutable, this returns a new set. + AttributeSet addAttribute(LLVMContext &C, unsigned Idx, + Attribute::AttrKind Attr) const; + + /// \brief Add attributes to the attribute set at the given index. Since + /// attribute sets are immutable, this returns a new set. + AttributeSet addAttributes(LLVMContext &C, unsigned Idx, + AttributeSet Attrs) const; /// \brief Remove the specified attribute at the specified index from this - /// attribute list. Since attribute lists are immutable, this returns the new + /// attribute list. Since attribute lists are immutable, this returns the new + /// list. + AttributeSet removeAttribute(LLVMContext &C, unsigned Idx, + Attribute::AttrKind Attr) const; + + /// \brief Remove the specified attributes at the specified index from this + /// attribute list. Since attribute lists are immutable, this returns the new /// list. - AttributeSet removeAttr(LLVMContext &C, unsigned Idx, Attribute Attrs) const; + AttributeSet removeAttributes(LLVMContext &C, unsigned Idx, + AttributeSet Attrs) const; //===--------------------------------------------------------------------===// - // Attribute List Accessors + // AttributeSet Accessors //===--------------------------------------------------------------------===// + /// \brief Retrieve the LLVM context. + LLVMContext &getContext() const; + /// \brief The attributes for the specified index are returned. - Attribute getParamAttributes(unsigned Idx) const { - return getAttributes(Idx); - } + AttributeSet getParamAttributes(unsigned Idx) const; /// \brief The attributes for the ret value are returned. - Attribute getRetAttributes() const { - return getAttributes(ReturnIndex); - } + AttributeSet getRetAttributes() const; /// \brief The function attributes are returned. 
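// Sketch: building and querying attributes with the index scheme described
// above; parameters start at index 1, function attributes live at
// AttributeSet::FunctionIndex.
#include "llvm/IR/Attributes.h"
#include <cassert>
static llvm::AttributeSet makeAttrs(llvm::LLVMContext &C) {
  llvm::AttrBuilder B;
  B.addAttribute(llvm::Attribute::NoCapture);
  llvm::AttributeSet AS = llvm::AttributeSet::get(C, /*Idx=*/1, B);
  AS = AS.addAttribute(C, llvm::AttributeSet::FunctionIndex,
                       llvm::Attribute::NoUnwind);
  assert(AS.hasAttribute(1, llvm::Attribute::NoCapture));
  assert(AS.hasAttribute(llvm::AttributeSet::FunctionIndex,
                         llvm::Attribute::NoUnwind));
  return AS;
}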
- Attribute getFnAttributes() const { - return getAttributes(FunctionIndex); - } - - /// \brief Return the alignment for the specified function parameter. - unsigned getParamAlignment(unsigned Idx) const { - return getAttributes(Idx).getAlignment(); - } + AttributeSet getFnAttributes() const; /// \brief Return true if the attribute exists at the given index. bool hasAttribute(unsigned Index, Attribute::AttrKind Kind) const; + /// \brief Return true if the attribute exists at the given index. + bool hasAttribute(unsigned Index, StringRef Kind) const; + /// \brief Return true if attribute exists at the given index. bool hasAttributes(unsigned Index) const; + /// \brief Return true if the specified attribute is set for at least one + /// parameter or for the return value. + bool hasAttrSomewhere(Attribute::AttrKind Attr) const; + + /// \brief Return the attribute object that exists at the given index. + Attribute getAttribute(unsigned Index, Attribute::AttrKind Kind) const; + + /// \brief Return the attribute object that exists at the given index. + Attribute getAttribute(unsigned Index, StringRef Kind) const; + + /// \brief Return the alignment for the specified function parameter. + unsigned getParamAlignment(unsigned Idx) const; + /// \brief Get the stack alignment. unsigned getStackAlignment(unsigned Index) const; /// \brief Return the attributes at the index as a string. - std::string getAsString(unsigned Index) const; + std::string getAsString(unsigned Index, bool InAttrGrp = false) const; - uint64_t getBitMask(unsigned Index) const; + typedef ArrayRef<Attribute>::iterator iterator; - /// \brief Return true if the specified attribute is set for at least one - /// parameter or for the return value. - bool hasAttrSomewhere(Attribute::AttrKind Attr) const; + iterator begin(unsigned Idx) const; + iterator end(unsigned Idx) const; /// operator==/!= - Provide equality predicates. bool operator==(const AttributeSet &RHS) const { - return AttrList == RHS.AttrList; + return pImpl == RHS.pImpl; } bool operator!=(const AttributeSet &RHS) const { - return AttrList != RHS.AttrList; + return pImpl != RHS.pImpl; } //===--------------------------------------------------------------------===// - // Attribute List Introspection + // AttributeSet Introspection //===--------------------------------------------------------------------===// + // FIXME: Remove this. + uint64_t Raw(unsigned Index) const; + /// \brief Return a raw pointer that uniquely identifies this attribute list. void *getRawPointer() const { - return AttrList; + return pImpl; } - // Attributes are stored as a dense set of slots, where there is one slot for - // each argument that has an attribute. This allows walking over the dense - // set instead of walking the sparse list of attributes. - /// \brief Return true if there are no attributes. bool isEmpty() const { - return AttrList == 0; + return getNumSlots() == 0; } /// \brief Return the number of slots used in this attribute list. This is @@ -339,56 +343,101 @@ public: /// function itself). unsigned getNumSlots() const; - /// \brief Return the AttributeWithIndex at the specified slot. This holds a - /// index number plus a set of attributes. - const AttributeWithIndex &getSlot(unsigned Slot) const; + /// \brief Return the index for the given slot. + uint64_t getSlotIndex(unsigned Slot) const; + + /// \brief Return the attributes at the given slot. 
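// Sketch: walking the slot structure, one slot per index that actually
// carries attributes, the way a printer or bitcode writer might.
#include "llvm/IR/Attributes.h"
#include "llvm/Support/raw_ostream.h"
static void printSlots(llvm::AttributeSet AS) {
  for (unsigned S = 0, E = AS.getNumSlots(); S != E; ++S) {
    unsigned Index = unsigned(AS.getSlotIndex(S));
    llvm::errs() << "index " << Index << ": " << AS.getAsString(Index) << "\n";
  }
}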
+ AttributeSet getSlotAttributes(unsigned Slot) const; void dump() const; }; //===----------------------------------------------------------------------===// /// \class +/// \brief Provide DenseMapInfo for AttributeSet. +template<> struct DenseMapInfo<AttributeSet> { + static inline AttributeSet getEmptyKey() { + uintptr_t Val = static_cast<uintptr_t>(-1); + Val <<= PointerLikeTypeTraits<void*>::NumLowBitsAvailable; + return AttributeSet(reinterpret_cast<AttributeSetImpl*>(Val)); + } + static inline AttributeSet getTombstoneKey() { + uintptr_t Val = static_cast<uintptr_t>(-2); + Val <<= PointerLikeTypeTraits<void*>::NumLowBitsAvailable; + return AttributeSet(reinterpret_cast<AttributeSetImpl*>(Val)); + } + static unsigned getHashValue(AttributeSet AS) { + return (unsigned((uintptr_t)AS.pImpl) >> 4) ^ + (unsigned((uintptr_t)AS.pImpl) >> 9); + } + static bool isEqual(AttributeSet LHS, AttributeSet RHS) { return LHS == RHS; } +}; + +//===----------------------------------------------------------------------===// +/// \class /// \brief This class is used in conjunction with the Attribute::get method to /// create an Attribute object. The object itself is uniquified. The Builder's /// value, however, is not. So this can be used as a quick way to test for /// equality, presence of attributes, etc. class AttrBuilder { - DenseSet<Attribute::AttrKind> Attrs; + std::bitset<Attribute::EndAttrKinds> Attrs; + std::map<std::string, std::string> TargetDepAttrs; uint64_t Alignment; uint64_t StackAlignment; public: - AttrBuilder() : Alignment(0), StackAlignment(0) {} - explicit AttrBuilder(uint64_t B) : Alignment(0), StackAlignment(0) { - addRawValue(B); + AttrBuilder() : Attrs(0), Alignment(0), StackAlignment(0) {} + explicit AttrBuilder(uint64_t Val) + : Attrs(0), Alignment(0), StackAlignment(0) { + addRawValue(Val); } - AttrBuilder(const Attribute &A) : Alignment(0), StackAlignment(0) { - addAttributes(A); + AttrBuilder(const Attribute &A) : Attrs(0), Alignment(0), StackAlignment(0) { + addAttribute(A); } AttrBuilder(AttributeSet AS, unsigned Idx); + AttrBuilder(const AttrBuilder &B) + : Attrs(B.Attrs), + TargetDepAttrs(B.TargetDepAttrs.begin(), B.TargetDepAttrs.end()), + Alignment(B.Alignment), StackAlignment(B.StackAlignment) {} void clear(); /// \brief Add an attribute to the builder. AttrBuilder &addAttribute(Attribute::AttrKind Val); + /// \brief Add the Attribute object to the builder. + AttrBuilder &addAttribute(Attribute A); + + /// \brief Add the target-dependent attribute to the builder. + AttrBuilder &addAttribute(StringRef A, StringRef V = StringRef()); + /// \brief Remove an attribute from the builder. AttrBuilder &removeAttribute(Attribute::AttrKind Val); - /// \brief Add the attributes from A to the builder. - AttrBuilder &addAttributes(const Attribute &A); + /// \brief Remove the attributes in \p A at \p Index from the builder. + AttrBuilder &removeAttributes(AttributeSet A, uint64_t Index); - /// \brief Remove the attributes from A from the builder. - AttrBuilder &removeAttributes(const Attribute &A); + /// \brief Remove the target-dependent attribute from the builder. + AttrBuilder &removeAttribute(StringRef A); + + /// \brief Merge the attributes of \p B into the builder. + AttrBuilder &merge(const AttrBuilder &B); /// \brief Return true if the builder has the specified attribute.
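// Sketch: target-dependent string attributes now ride in a side map, and
// merge() unions both the enum bitset and that map. The key/value pair is
// illustrative.
#include "llvm/IR/Attributes.h"
static void combineInto(llvm::AttrBuilder &Dst) {
  llvm::AttrBuilder B;
  B.addAttribute(llvm::Attribute::NoUnwind);
  B.addAttribute("use-soft-float", "false");
  Dst.merge(B);
}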
- bool contains(Attribute::AttrKind A) const; + bool contains(Attribute::AttrKind A) const { + assert((unsigned)A < Attribute::EndAttrKinds && "Attribute out of range!"); + return Attrs[A]; + } + + /// \brief Return true if the builder has the specified target-dependent + /// attribute. + bool contains(StringRef A) const; /// \brief Return true if the builder has IR-level attributes. bool hasAttributes() const; /// \brief Return true if the builder has any attribute that's in the /// specified attribute. - bool hasAttributes(const Attribute &A) const; + bool hasAttributes(AttributeSet A, uint64_t Index) const; /// \brief Return true if the builder has an alignment attribute. bool hasAlignmentAttr() const; @@ -407,52 +456,44 @@ public: /// the form used internally in Attribute. AttrBuilder &addStackAlignmentAttr(unsigned Align); - typedef DenseSet<Attribute::AttrKind>::iterator iterator; - typedef DenseSet<Attribute::AttrKind>::const_iterator const_iterator; + /// \brief Return true if the builder contains no target-independent + /// attributes. + bool empty() const { return Attrs.none(); } - iterator begin() { return Attrs.begin(); } - iterator end() { return Attrs.end(); } + // Iterators for target-dependent attributes. + typedef std::pair<std::string, std::string> td_type; + typedef std::map<std::string, std::string>::iterator td_iterator; + typedef std::map<std::string, std::string>::const_iterator td_const_iterator; - const_iterator begin() const { return Attrs.begin(); } - const_iterator end() const { return Attrs.end(); } + td_iterator td_begin() { return TargetDepAttrs.begin(); } + td_iterator td_end() { return TargetDepAttrs.end(); } - /// \brief Add the raw value to the internal representation. - /// - /// N.B. This should be used ONLY for decoding LLVM bitcode! - AttrBuilder &addRawValue(uint64_t Val); + td_const_iterator td_begin() const { return TargetDepAttrs.begin(); } + td_const_iterator td_end() const { return TargetDepAttrs.end(); } - /// \brief Remove attributes that are used on functions only. - void removeFunctionOnlyAttrs() { - removeAttribute(Attribute::NoReturn) - .removeAttribute(Attribute::NoUnwind) - .removeAttribute(Attribute::ReadNone) - .removeAttribute(Attribute::ReadOnly) - .removeAttribute(Attribute::NoInline) - .removeAttribute(Attribute::AlwaysInline) - .removeAttribute(Attribute::OptimizeForSize) - .removeAttribute(Attribute::StackProtect) - .removeAttribute(Attribute::StackProtectReq) - .removeAttribute(Attribute::NoRedZone) - .removeAttribute(Attribute::NoImplicitFloat) - .removeAttribute(Attribute::Naked) - .removeAttribute(Attribute::InlineHint) - .removeAttribute(Attribute::StackAlignment) - .removeAttribute(Attribute::UWTable) - .removeAttribute(Attribute::NonLazyBind) - .removeAttribute(Attribute::ReturnsTwice) - .removeAttribute(Attribute::AddressSafety) - .removeAttribute(Attribute::MinSize) - .removeAttribute(Attribute::NoDuplicate); - } + bool td_empty() const { return TargetDepAttrs.empty(); } - uint64_t getBitMask() const; + /// \brief Remove attributes that are used on functions only. + void removeFunctionOnlyAttrs(); bool operator==(const AttrBuilder &B); bool operator!=(const AttrBuilder &B) { return !(*this == B); } + + // FIXME: Remove this in 4.0. + + /// \brief Add the raw value to the internal representation. + AttrBuilder &addRawValue(uint64_t Val); }; +namespace AttributeFuncs { + +/// \brief Which attributes cannot be applied to a type. 
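// Sketch of a verifier-style check built on the relocated helper: collect the
// attributes present at Idx and ask whether any of them is illegal for the
// value's type.
#include "llvm/IR/Attributes.h"
static bool paramAttrsFitType(llvm::Type *Ty, llvm::AttributeSet AS,
                              unsigned Idx) {
  llvm::AttrBuilder Present(AS, Idx);
  return !Present.hasAttributes(
      llvm::AttributeFuncs::typeIncompatible(Ty, Idx), Idx);
}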
+AttributeSet typeIncompatible(Type *Ty, uint64_t Index); + +} // end AttributeFuncs namespace + } // end llvm namespace #endif diff --git a/include/llvm/IR/BasicBlock.h b/include/llvm/IR/BasicBlock.h index 15f4aa7..ea5695a 100644 --- a/include/llvm/IR/BasicBlock.h +++ b/include/llvm/IR/BasicBlock.h @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_BASICBLOCK_H -#define LLVM_BASICBLOCK_H +#ifndef LLVM_IR_BASICBLOCK_H +#define LLVM_IR_BASICBLOCK_H #include "llvm/ADT/Twine.h" #include "llvm/ADT/ilist.h" @@ -29,19 +29,19 @@ class BlockAddress; template<> struct ilist_traits<Instruction> : public SymbolTableListTraits<Instruction, BasicBlock> { - // createSentinel is used to get hold of a node that marks the end of - // the list... - // The sentinel is relative to this instance, so we use a non-static - // method. + + /// \brief Return a node that marks the end of a list. + /// + /// The sentinel is relative to this instance, so we use a non-static + /// method. Instruction *createSentinel() const { - // since i(p)lists always publicly derive from the corresponding - // traits, placing a data member in this class will augment i(p)list. - // But since the NodeTy is expected to publicly derive from - // ilist_node<NodeTy>, there is a legal viable downcast from it - // to NodeTy. We use this trick to superpose i(p)list with a "ghostly" - // NodeTy, which becomes the sentinel. Dereferencing the sentinel is - // forbidden (save the ilist_node<NodeTy>) so no one will ever notice - // the superposition. + // Since i(p)lists always publicly derive from their corresponding traits, + // placing a data member in this class will augment the i(p)list. But since + // the NodeTy is expected to publicly derive from ilist_node<NodeTy>, + // there is a legal viable downcast from it to NodeTy. We use this trick to + // superimpose an i(p)list with a "ghostly" NodeTy, which becomes the + // sentinel. Dereferencing the sentinel is forbidden (save the + // ilist_node<NodeTy>), so no one will ever notice the superposition. return static_cast<Instruction*>(&Sentinel); } static void destroySentinel(Instruction*) {} @@ -53,6 +53,8 @@ private: mutable ilist_half_node<Instruction> Sentinel; }; +/// \brief LLVM Basic Block Representation +/// /// This represents a single basic block in LLVM. A basic block is simply a /// container of instructions that execute sequentially. Basic blocks are Values /// because they are referenced by instructions such as branches and switch /// @@ -66,7 +68,6 @@ private: /// occur because it may be useful in the intermediate stage of constructing or /// modifying a program. However, the verifier will ensure that basic blocks /// are "well formed". -/// @brief LLVM Basic Block Representation class BasicBlock : public Value, // Basic blocks are data objects also public ilist_node<BasicBlock> { friend class BlockAddress; @@ -82,14 +83,15 @@ private: BasicBlock(const BasicBlock &) LLVM_DELETED_FUNCTION; void operator=(const BasicBlock &) LLVM_DELETED_FUNCTION; - /// BasicBlock ctor - If the function parameter is specified, the basic block - /// is automatically inserted at either the end of the function (if - /// InsertBefore is null), or before the specified basic block. + /// \brief Constructor. /// + /// If the function parameter is specified, the basic block is automatically + /// inserted at either the end of the function (if InsertBefore is null), or + /// before the specified basic block.
explicit BasicBlock(LLVMContext &C, const Twine &Name = "", Function *Parent = 0, BasicBlock *InsertBefore = 0); public: - /// getContext - Get the context in which this basic block lives. + /// \brief Get the context in which this basic block lives. LLVMContext &getContext() const; /// Instruction iterators... @@ -98,88 +100,87 @@ public: typedef InstListType::reverse_iterator reverse_iterator; typedef InstListType::const_reverse_iterator const_reverse_iterator; - /// Create - Creates a new BasicBlock. If the Parent parameter is specified, - /// the basic block is automatically inserted at either the end of the - /// function (if InsertBefore is 0), or before the specified basic block. + /// \brief Creates a new BasicBlock. + /// + /// If the Parent parameter is specified, the basic block is automatically + /// inserted at either the end of the function (if InsertBefore is 0), or + /// before the specified basic block. static BasicBlock *Create(LLVMContext &Context, const Twine &Name = "", Function *Parent = 0,BasicBlock *InsertBefore = 0) { return new BasicBlock(Context, Name, Parent, InsertBefore); } ~BasicBlock(); - /// getParent - Return the enclosing method, or null if none - /// + /// \brief Return the enclosing method, or null if none. const Function *getParent() const { return Parent; } Function *getParent() { return Parent; } - /// getTerminator() - If this is a well formed basic block, then this returns - /// a pointer to the terminator instruction. If it is not, then you get a - /// null pointer back. - /// + /// \brief Returns the terminator instruction if the block is well formed or + /// null if the block is not well formed. TerminatorInst *getTerminator(); const TerminatorInst *getTerminator() const; - /// Returns a pointer to the first instructon in this block that is not a - /// PHINode instruction. When adding instruction to the beginning of the - /// basic block, they should be added before the returned value, not before - /// the first instruction, which might be PHI. - /// Returns 0 is there's no non-PHI instruction. + /// \brief Returns a pointer to the first instruction in this block that is + /// not a PHINode instruction. + /// + /// When adding instructions to the beginning of the basic block, they should + /// be added before the returned value, not before the first instruction, + /// which might be PHI. Returns 0 if there's no non-PHI instruction. Instruction* getFirstNonPHI(); const Instruction* getFirstNonPHI() const { return const_cast<BasicBlock*>(this)->getFirstNonPHI(); } - // Same as above, but also skip debug intrinsics. + /// \brief Returns a pointer to the first instruction in this block that is not + /// a PHINode or a debug intrinsic. Instruction* getFirstNonPHIOrDbg(); const Instruction* getFirstNonPHIOrDbg() const { return const_cast<BasicBlock*>(this)->getFirstNonPHIOrDbg(); } - // Same as above, but also skip lifetime intrinsics. + /// \brief Returns a pointer to the first instruction in this block that is not + /// a PHINode, a debug intrinsic, or a lifetime intrinsic. Instruction* getFirstNonPHIOrDbgOrLifetime(); const Instruction* getFirstNonPHIOrDbgOrLifetime() const { return const_cast<BasicBlock*>(this)->getFirstNonPHIOrDbgOrLifetime(); } - /// getFirstInsertionPt - Returns an iterator to the first instruction in this - /// block that is suitable for inserting a non-PHI instruction. In particular, - /// it skips all PHIs and LandingPad instructions.
+ /// \brief Returns an iterator to the first instruction in this block that is + /// suitable for inserting a non-PHI instruction. + /// + /// In particular, it skips all PHIs and LandingPad instructions. iterator getFirstInsertionPt(); const_iterator getFirstInsertionPt() const { return const_cast<BasicBlock*>(this)->getFirstInsertionPt(); } - /// removeFromParent - This method unlinks 'this' from the containing - /// function, but does not delete it. - /// + /// \brief Unlink 'this' from the containing function, but do not delete it. void removeFromParent(); - /// eraseFromParent - This method unlinks 'this' from the containing function - /// and deletes it. - /// + /// \brief Unlink 'this' from the containing function and delete it. void eraseFromParent(); - /// moveBefore - Unlink this basic block from its current function and - /// insert it into the function that MovePos lives in, right before MovePos. + /// \brief Unlink this basic block from its current function and insert it + /// into the function that \p MovePos lives in, right before \p MovePos. void moveBefore(BasicBlock *MovePos); - /// moveAfter - Unlink this basic block from its current function and - /// insert it into the function that MovePos lives in, right after MovePos. + /// \brief Unlink this basic block from its current function and insert it + /// right after \p MovePos in the function \p MovePos lives in. void moveAfter(BasicBlock *MovePos); - /// getSinglePredecessor - If this basic block has a single predecessor block, - /// return the block, otherwise return a null pointer. + /// \brief Return this block if it has a single predecessor block. Otherwise + /// return a null pointer. BasicBlock *getSinglePredecessor(); const BasicBlock *getSinglePredecessor() const { return const_cast<BasicBlock*>(this)->getSinglePredecessor(); } - /// getUniquePredecessor - If this basic block has a unique predecessor block, - /// return the block, otherwise return a null pointer. + /// \brief Return this block if it has a unique predecessor block. Otherwise return a null pointer. + /// /// Note that unique predecessor doesn't mean single edge, there can be - /// multiple edges from the unique predecessor to this block (for example - /// a switch statement with multiple cases having the same destination). + /// multiple edges from the unique predecessor to this block (for example a + /// switch statement with multiple cases having the same destination). BasicBlock *getUniquePredecessor(); const BasicBlock *getUniquePredecessor() const { return const_cast<BasicBlock*>(this)->getUniquePredecessor(); @@ -205,49 +206,52 @@ public: inline const Instruction &back() const { return InstList.back(); } inline Instruction &back() { return InstList.back(); } - /// getInstList() - Return the underlying instruction list container. You - /// need to access it directly if you want to modify it currently. + /// \brief Return the underlying instruction list container. /// + /// Currently you need to access the underlying instruction list container + /// directly if you want to modify it. const InstListType &getInstList() const { return InstList; } InstListType &getInstList() { return InstList; } - /// getSublistAccess() - returns pointer to member of instruction list + /// \brief Returns a pointer to a member of the instruction list. 
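To make the single/unique predecessor distinction above concrete, here is a hedged sketch (the helper names are invented): a block targeted twice by one switch has a unique predecessor but no single predecessor.

  #include "llvm/IR/BasicBlock.h"

  // Exactly one incoming edge.
  static bool hasOneIncomingEdge(llvm::BasicBlock *B) {
    return B->getSinglePredecessor() != 0;
  }

  // Exactly one predecessor block, however many edges it contributes.
  static bool hasOnePredecessorBlock(llvm::BasicBlock *B) {
    return B->getUniquePredecessor() != 0;
  }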
static iplist<Instruction> BasicBlock::*getSublistAccess(Instruction*) { return &BasicBlock::InstList; } - /// getValueSymbolTable() - returns pointer to symbol table (if any) + /// \brief Returns a pointer to the symbol table if one exists. ValueSymbolTable *getValueSymbolTable(); - /// Methods for support type inquiry through isa, cast, and dyn_cast: + /// \brief Methods for support type inquiry through isa, cast, and dyn_cast. static inline bool classof(const Value *V) { return V->getValueID() == Value::BasicBlockVal; } - /// dropAllReferences() - This function causes all the subinstructions to "let - /// go" of all references that they are maintaining. This allows one to - /// 'delete' a whole class at a time, even though there may be circular - /// references... first all references are dropped, and all use counts go to - /// zero. Then everything is delete'd for real. Note that no operations are - /// valid on an object that has "dropped all references", except operator - /// delete. + /// \brief Cause all subinstructions to "let go" of all the references that + /// said subinstructions are maintaining. /// + /// This allows one to 'delete' a whole class at a time, even though there may + /// be circular references... first all references are dropped, and all use + /// counts go to zero. Then everything is delete'd for real. Note that no + /// operations are valid on an object that has "dropped all references", + /// except operator delete. void dropAllReferences(); - /// removePredecessor - This method is used to notify a BasicBlock that the - /// specified Predecessor of the block is no longer able to reach it. This is - /// actually not used to update the Predecessor list, but is actually used to - /// update the PHI nodes that reside in the block. Note that this should be - /// called while the predecessor still refers to this block. + /// \brief Notify the BasicBlock that the predecessor \p Pred is no longer + /// able to reach it. /// + /// This is actually not used to update the Predecessor list, but is actually + /// used to update the PHI nodes that reside in the block. Note that this + /// should be called while the predecessor still refers to this block. void removePredecessor(BasicBlock *Pred, bool DontDeleteUselessPHIs = false); - /// splitBasicBlock - This splits a basic block into two at the specified - /// instruction. Note that all instructions BEFORE the specified iterator - /// stay as part of the original basic block, an unconditional branch is added - /// to the original BB, and the rest of the instructions in the BB are moved - /// to the new BB, including the old terminator. The newly formed BasicBlock - /// is returned. This function invalidates the specified iterator. + /// \brief Split the basic block into two basic blocks at the specified + /// instruction. + /// + /// Note that all instructions BEFORE the specified iterator stay as part of + /// the original basic block, an unconditional branch is added to the original + /// BB, and the rest of the instructions in the BB are moved to the new BB, + /// including the old terminator. The newly formed BasicBlock is returned. + /// This function invalidates the specified iterator. /// /// Note that this only works on well formed basic blocks (must have a /// terminator), and 'I' must not be the end of instruction list (which would @@ -256,37 +260,39 @@ public: /// /// Also note that this doesn't preserve any passes. 
To split blocks while /// keeping loop information consistent, use the SplitBlock utility function. - /// BasicBlock *splitBasicBlock(iterator I, const Twine &BBName = ""); - /// hasAddressTaken - returns true if there are any uses of this basic block - /// other than direct branches, switches, etc. to it. + /// \brief Returns true if there are any uses of this basic block other than + /// direct branches, switches, etc. to it. bool hasAddressTaken() const { return getSubclassDataFromValue() != 0; } - /// replaceSuccessorsPhiUsesWith - Update all phi nodes in all our successors - /// to refer to basic block New instead of to us. + /// \brief Update all phi nodes in this basic block's successors to refer to + /// basic block \p New instead of to it. void replaceSuccessorsPhiUsesWith(BasicBlock *New); - /// isLandingPad - Return true if this basic block is a landing pad. I.e., - /// it's the destination of the 'unwind' edge of an invoke instruction. + /// \brief Return true if this basic block is a landing pad. + /// + /// Being a ``landing pad'' means that the basic block is the destination of + /// the 'unwind' edge of an invoke instruction. bool isLandingPad() const; - /// getLandingPadInst() - Return the landingpad instruction associated with - /// the landing pad. + /// \brief Return the landingpad instruction associated with the landing pad. LandingPadInst *getLandingPadInst(); const LandingPadInst *getLandingPadInst() const; private: - /// AdjustBlockAddressRefCount - BasicBlock stores the number of BlockAddress - /// objects using it. This is almost always 0, sometimes one, possibly but - /// almost never 2, and inconceivably 3 or more. + /// \brief Increment the internal refcount of the number of BlockAddresses + /// referencing this BasicBlock by \p Amt. + /// + /// This is almost always 0, sometimes one, possibly but almost never 2, and + /// inconceivably 3 or more. void AdjustBlockAddressRefCount(int Amt) { setValueSubclassData(getSubclassDataFromValue()+Amt); assert((int)(signed char)getSubclassDataFromValue() >= 0 && "Refcount wrap-around"); } - // Shadow Value::setValueSubclassData with a private forwarding method so that - // any future subclasses cannot accidentally use it. + /// \brief Shadow Value::setValueSubclassData with a private forwarding method + /// so that any future subclasses cannot accidentally use it. void setValueSubclassData(unsigned short D) { Value::setValueSubclassData(D); } diff --git a/include/llvm/IR/CallingConv.h b/include/llvm/IR/CallingConv.h index 699cea3..6f3ab20 100644 --- a/include/llvm/IR/CallingConv.h +++ b/include/llvm/IR/CallingConv.h @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CALLINGCONV_H -#define LLVM_CALLINGCONV_H +#ifndef LLVM_IR_CALLINGCONV_H +#define LLVM_IR_CALLINGCONV_H namespace llvm { diff --git a/include/llvm/IR/Constant.h b/include/llvm/IR/Constant.h index 36a4538..26bad1d 100644 --- a/include/llvm/IR/Constant.h +++ b/include/llvm/IR/Constant.h @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CONSTANT_H -#define LLVM_CONSTANT_H +#ifndef LLVM_IR_CONSTANT_H +#define LLVM_IR_CONSTANT_H #include "llvm/IR/User.h" @@ -61,6 +61,9 @@ public: /// by getZeroValueForNegation. bool isNegativeZeroValue() const; + /// Return true if the value is negative zero or a null value. + bool isZeroValue() const; + /// canTrap - Return true if evaluation of this constant could trap.
This is /// true for things like constant expressions that could divide by zero. bool canTrap() const; diff --git a/include/llvm/IR/Constants.h b/include/llvm/IR/Constants.h index 0f49fe5..ad258f9 100644 --- a/include/llvm/IR/Constants.h +++ b/include/llvm/IR/Constants.h @@ -18,8 +18,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CONSTANTS_H -#define LLVM_CONSTANTS_H +#ifndef LLVM_IR_CONSTANTS_H +#define LLVM_IR_CONSTANTS_H #include "llvm/ADT/APFloat.h" #include "llvm/ADT/APInt.h" diff --git a/include/llvm/IR/DataLayout.h b/include/llvm/IR/DataLayout.h index cc02017..bfdb057 100644 --- a/include/llvm/IR/DataLayout.h +++ b/include/llvm/IR/DataLayout.h @@ -17,8 +17,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_DATALAYOUT_H -#define LLVM_DATALAYOUT_H +#ifndef LLVM_IR_DATALAYOUT_H +#define LLVM_IR_DATALAYOUT_H #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallVector.h" @@ -100,7 +100,7 @@ private: SmallVector<unsigned char, 8> LegalIntWidths; ///< Legal Integers. - /// Alignments- Where the primitive type alignment data is stored. + /// Alignments - Where the primitive type alignment data is stored. /// /// @sa init(). /// @note Could support multiple size pointer alignments, e.g., 32-bit @@ -324,19 +324,16 @@ public: /// an integer type of the specified bitwidth. unsigned getABIIntegerTypeAlignment(unsigned BitWidth) const; - /// getCallFrameTypeAlignment - Return the minimum ABI-required alignment /// for the specified type when it is part of a call frame. unsigned getCallFrameTypeAlignment(Type *Ty) const; - /// getPrefTypeAlignment - Return the preferred stack/global alignment for /// the specified type. This is always at least as good as the ABI alignment. unsigned getPrefTypeAlignment(Type *Ty) const; /// getPreferredTypeAlignmentShift - Return the preferred alignment for the /// specified type, returned as log2 of the value (a shift amount). - /// unsigned getPreferredTypeAlignmentShift(Type *Ty) const; /// getIntPtrType - Return an integer type with size at least as big as that @@ -350,7 +347,6 @@ public: /// getIndexedOffset - return the offset from the beginning of the type for /// the specified indices. This is used to implement getelementptr. 
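For example (an illustrative sketch; the helper and the layout string are made up), the byte offset of field 1 of { i32, double } can be computed with getIndexedOffset:

  #include "llvm/IR/Constants.h"
  #include "llvm/IR/DataLayout.h"
  #include "llvm/IR/DerivedTypes.h"

  static uint64_t fieldOneOffset(llvm::LLVMContext &C) {
    // Hypothetical target: 64-bit pointers, natural alignments.
    llvm::DataLayout DL("e-p:64:64:64-i32:32:32-f64:64:64");
    llvm::Type *Elts[] = { llvm::Type::getInt32Ty(C),
                           llvm::Type::getDoubleTy(C) };
    llvm::StructType *ST = llvm::StructType::get(C, Elts);
    llvm::Value *Idx[] = {
      llvm::ConstantInt::get(llvm::Type::getInt32Ty(C), 0),
      llvm::ConstantInt::get(llvm::Type::getInt32Ty(C), 1)
    };
    // GEP-style indexing; with this layout the i32 is padded out to the
    // f64 alignment, so the result is 8.
    return DL.getIndexedOffset(ST->getPointerTo(), Idx);
  }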
- /// uint64_t getIndexedOffset(Type *Ty, ArrayRef<Value *> Indices) const; /// getStructLayout - Return a StructLayout object, indicating the alignment diff --git a/include/llvm/IR/DerivedTypes.h b/include/llvm/IR/DerivedTypes.h index 5a941520..3bf7885 100644 --- a/include/llvm/IR/DerivedTypes.h +++ b/include/llvm/IR/DerivedTypes.h @@ -15,8 +15,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_DERIVED_TYPES_H -#define LLVM_DERIVED_TYPES_H +#ifndef LLVM_IR_DERIVEDTYPES_H +#define LLVM_IR_DERIVEDTYPES_H #include "llvm/IR/Type.h" #include "llvm/Support/Compiler.h" diff --git a/include/llvm/IR/Function.h b/include/llvm/IR/Function.h index 1762b2c..568b55c 100644 --- a/include/llvm/IR/Function.h +++ b/include/llvm/IR/Function.h @@ -15,8 +15,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_FUNCTION_H -#define LLVM_FUNCTION_H +#ifndef LLVM_IR_FUNCTION_H +#define LLVM_IR_FUNCTION_H #include "llvm/IR/Argument.h" #include "llvm/IR/Attributes.h" @@ -85,7 +85,7 @@ private: BasicBlockListType BasicBlocks; ///< The basic blocks mutable ArgumentListType ArgumentList; ///< The formal arguments ValueSymbolTable *SymTab; ///< Symbol table of args/instructions - AttributeSet AttributeList; ///< Parameter attributes + AttributeSet AttributeSets; ///< Parameter attributes // HasLazyArguments is stored in Value::SubclassData. /*bool HasLazyArguments;*/ @@ -162,24 +162,25 @@ public: /// getAttributes - Return the attribute list for this Function. /// - const AttributeSet &getAttributes() const { return AttributeList; } + AttributeSet getAttributes() const { return AttributeSets; } /// setAttributes - Set the attribute list for this Function. /// - void setAttributes(const AttributeSet &attrs) { AttributeList = attrs; } + void setAttributes(AttributeSet attrs) { AttributeSets = attrs; } /// addFnAttr - Add function attributes to this function. /// void addFnAttr(Attribute::AttrKind N) { - // Function Attribute are stored at ~0 index - addAttribute(AttributeSet::FunctionIndex, Attribute::get(getContext(), N)); + setAttributes(AttributeSets.addAttribute(getContext(), + AttributeSet::FunctionIndex, N)); } - /// removeFnAttr - Remove function attributes from this function. - /// - void removeFnAttr(Attribute N) { - // Function Attribute are stored at ~0 index - removeAttribute(~0U, N); + /// \brief Return true if the function has the attribute. + bool hasFnAttribute(Attribute::AttrKind Kind) const { + return AttributeSets.hasAttribute(AttributeSet::FunctionIndex, Kind); + } + bool hasFnAttribute(StringRef Kind) const { + return AttributeSets.hasAttribute(AttributeSet::FunctionIndex, Kind); } /// hasGC/getGC/setGC/clearGC - The name of the garbage collection algorithm @@ -189,20 +190,23 @@ public: void setGC(const char *Str); void clearGC(); - /// addAttribute - adds the attribute to the list of attributes. - void addAttribute(unsigned i, Attribute attr); + /// @brief adds the attribute to the list of attributes. + void addAttribute(unsigned i, Attribute::AttrKind attr); + + /// @brief adds the attributes to the list of attributes. + void addAttributes(unsigned i, AttributeSet attrs); - /// removeAttribute - removes the attribute from the list of attributes. - void removeAttribute(unsigned i, Attribute attr); + /// @brief removes the attributes from the list of attributes. + void removeAttributes(unsigned i, AttributeSet attr); /// @brief Extract the alignment for a call or parameter (0=unknown). 
unsigned getParamAlignment(unsigned i) const { - return AttributeList.getParamAlignment(i); + return AttributeSets.getParamAlignment(i); } /// @brief Determine if the function does not access memory. bool doesNotAccessMemory() const { - return AttributeList.hasAttribute(AttributeSet::FunctionIndex, + return AttributeSets.hasAttribute(AttributeSet::FunctionIndex, Attribute::ReadNone); } void setDoesNotAccessMemory() { @@ -212,7 +216,7 @@ public: /// @brief Determine if the function does not access or only reads memory. bool onlyReadsMemory() const { return doesNotAccessMemory() || - AttributeList.hasAttribute(AttributeSet::FunctionIndex, + AttributeSets.hasAttribute(AttributeSet::FunctionIndex, Attribute::ReadOnly); } void setOnlyReadsMemory() { @@ -221,7 +225,7 @@ public: /// @brief Determine if the function cannot return. bool doesNotReturn() const { - return AttributeList.hasAttribute(AttributeSet::FunctionIndex, + return AttributeSets.hasAttribute(AttributeSet::FunctionIndex, Attribute::NoReturn); } void setDoesNotReturn() { @@ -230,7 +234,7 @@ public: /// @brief Determine if the function cannot unwind. bool doesNotThrow() const { - return AttributeList.hasAttribute(AttributeSet::FunctionIndex, + return AttributeSets.hasAttribute(AttributeSet::FunctionIndex, Attribute::NoUnwind); } void setDoesNotThrow() { @@ -239,7 +243,7 @@ public: /// @brief Determine if the call cannot be duplicated. bool cannotDuplicate() const { - return AttributeList.hasAttribute(AttributeSet::FunctionIndex, + return AttributeSets.hasAttribute(AttributeSet::FunctionIndex, Attribute::NoDuplicate); } void setCannotDuplicate() { @@ -249,7 +253,7 @@ public: /// @brief True if the ABI mandates (or the user requested) that this /// function be in a unwind table. bool hasUWTable() const { - return AttributeList.hasAttribute(AttributeSet::FunctionIndex, + return AttributeSets.hasAttribute(AttributeSet::FunctionIndex, Attribute::UWTable); } void setHasUWTable() { @@ -264,25 +268,25 @@ public: /// @brief Determine if the function returns a structure through first /// pointer argument. bool hasStructRetAttr() const { - return AttributeList.hasAttribute(1, Attribute::StructRet); + return AttributeSets.hasAttribute(1, Attribute::StructRet); } /// @brief Determine if the parameter does not alias other parameters. /// @param n The parameter to check. 1 is the first parameter, 0 is the return bool doesNotAlias(unsigned n) const { - return AttributeList.hasAttribute(n, Attribute::NoAlias); + return AttributeSets.hasAttribute(n, Attribute::NoAlias); } void setDoesNotAlias(unsigned n) { - addAttribute(n, Attribute::get(getContext(), Attribute::NoAlias)); + addAttribute(n, Attribute::NoAlias); } /// @brief Determine if the parameter can be captured. /// @param n The parameter to check. 
1 is the first parameter, 0 is the return bool doesNotCapture(unsigned n) const { - return AttributeList.hasAttribute(n, Attribute::NoCapture); + return AttributeSets.hasAttribute(n, Attribute::NoCapture); } void setDoesNotCapture(unsigned n) { - addAttribute(n, Attribute::get(getContext(), Attribute::NoCapture)); + addAttribute(n, Attribute::NoCapture); } /// copyAttributesFrom - copy all additional attributes (those not needed to diff --git a/include/llvm/IR/GlobalAlias.h b/include/llvm/IR/GlobalAlias.h index 03c1648..883814a 100644 --- a/include/llvm/IR/GlobalAlias.h +++ b/include/llvm/IR/GlobalAlias.h @@ -12,8 +12,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_GLOBAL_ALIAS_H -#define LLVM_GLOBAL_ALIAS_H +#ifndef LLVM_IR_GLOBALALIAS_H +#define LLVM_IR_GLOBALALIAS_H #include "llvm/ADT/Twine.h" #include "llvm/ADT/ilist_node.h" diff --git a/include/llvm/IR/GlobalValue.h b/include/llvm/IR/GlobalValue.h index 246e00b..f398bc1 100644 --- a/include/llvm/IR/GlobalValue.h +++ b/include/llvm/IR/GlobalValue.h @@ -15,8 +15,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_GLOBALVALUE_H -#define LLVM_GLOBALVALUE_H +#ifndef LLVM_IR_GLOBALVALUE_H +#define LLVM_IR_GLOBALVALUE_H #include "llvm/IR/Constant.h" diff --git a/include/llvm/IR/GlobalVariable.h b/include/llvm/IR/GlobalVariable.h index 5c42d76..bfed507 100644 --- a/include/llvm/IR/GlobalVariable.h +++ b/include/llvm/IR/GlobalVariable.h @@ -17,8 +17,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_GLOBAL_VARIABLE_H -#define LLVM_GLOBAL_VARIABLE_H +#ifndef LLVM_IR_GLOBALVARIABLE_H +#define LLVM_IR_GLOBALVARIABLE_H #include "llvm/ADT/Twine.h" #include "llvm/ADT/ilist_node.h" @@ -40,9 +40,14 @@ class GlobalVariable : public GlobalValue, public ilist_node<GlobalVariable> { void setParent(Module *parent); - bool isConstantGlobal : 1; // Is this a global constant? - unsigned threadLocalMode : 3; // Is this symbol "Thread Local", - // if so, what is the desired model? + bool isConstantGlobal : 1; // Is this a global constant? + unsigned threadLocalMode : 3; // Is this symbol "Thread Local", + // if so, what is the desired + // model? + bool isExternallyInitializedConstant : 1; // Is this a global whose value + // can change from its initial + // value before global + // initializers are run? public: // allocate space for exactly one operand @@ -62,15 +67,15 @@ public: /// automatically inserted into the end of the specified modules global list. GlobalVariable(Type *Ty, bool isConstant, LinkageTypes Linkage, Constant *Initializer = 0, const Twine &Name = "", - ThreadLocalMode = NotThreadLocal, unsigned AddressSpace = 0); + ThreadLocalMode = NotThreadLocal, unsigned AddressSpace = 0, + bool isExternallyInitialized = false); /// GlobalVariable ctor - This creates a global and inserts it before the /// specified other global. 
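To illustrate the new externally_initialized support (a sketch; the variable name is invented), such a global can be created like this:

  #include "llvm/IR/Constants.h"
  #include "llvm/IR/GlobalVariable.h"
  #include "llvm/IR/Module.h"

  static llvm::GlobalVariable *makeExternallyInitialized(llvm::Module &M) {
    llvm::Type *Int32Ty = llvm::Type::getInt32Ty(M.getContext());
    // The zero initializer is only a placeholder: the runtime may overwrite
    // it before any C++ initializers run, so optimizers must not fold loads
    // of this global to 0.
    return new llvm::GlobalVariable(
        M, Int32Ty, /*isConstant=*/false,
        llvm::GlobalValue::ExternalLinkage,
        llvm::ConstantInt::get(Int32Ty, 0), "env_word",
        /*InsertBefore=*/0, llvm::GlobalVariable::NotThreadLocal,
        /*AddressSpace=*/0, /*isExternallyInitialized=*/true);
  }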
GlobalVariable(Module &M, Type *Ty, bool isConstant, LinkageTypes Linkage, Constant *Initializer, - const Twine &Name = "", - GlobalVariable *InsertBefore = 0, - ThreadLocalMode = NotThreadLocal, - unsigned AddressSpace = 0); + const Twine &Name = "", GlobalVariable *InsertBefore = 0, + ThreadLocalMode = NotThreadLocal, unsigned AddressSpace = 0, + bool isExternallyInitialized = false); ~GlobalVariable() { NumOperands = 1; // FIXME: needed by operator delete @@ -105,7 +110,10 @@ public: return hasInitializer() && // The initializer of a global variable with weak linkage may change at // link time. - !mayBeOverridden(); + !mayBeOverridden() && + // The initializer of a global variable with the externally_initialized + // marker may change at runtime before C++ initializers are evaluated. + !isExternallyInitialized(); } /// hasUniqueInitializer - Whether the global variable has an initializer, and @@ -118,7 +126,11 @@ public: // instead. It is wrong to modify the initializer of a global variable // with *_odr linkage because then different instances of the global may // have different initializers, breaking the One Definition Rule. - !isWeakForLinker(); + !isWeakForLinker() && + // It is not safe to modify initializers of global variables with the + // externally_initialized marker since the value may be changed at runtime + // before C++ initializers are evaluated. + !isExternallyInitialized(); } /// getInitializer - Return the initializer for this global variable. It is @@ -155,6 +167,13 @@ public: return static_cast<ThreadLocalMode>(threadLocalMode); } + bool isExternallyInitialized() const { + return isExternallyInitializedConstant; + } + void setExternallyInitialized(bool Val) { + isExternallyInitializedConstant = Val; + } + /// copyAttributesFrom - copy all additional attributes (those not needed to /// create a GlobalVariable) from the GlobalVariable Src to this one. void copyAttributesFrom(const GlobalValue *Src); diff --git a/include/llvm/IR/IRBuilder.h b/include/llvm/IR/IRBuilder.h index 27f990e..1c71d0a 100644 --- a/include/llvm/IR/IRBuilder.h +++ b/include/llvm/IR/IRBuilder.h @@ -12,8 +12,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_IRBUILDER_H -#define LLVM_IRBUILDER_H +#ifndef LLVM_IR_IRBUILDER_H +#define LLVM_IR_IRBUILDER_H #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/StringRef.h" @@ -28,10 +28,11 @@ namespace llvm { class MDNode; -/// IRBuilderDefaultInserter - This provides the default implementation of the -/// IRBuilder 'InsertHelper' method that is called whenever an instruction is -/// created by IRBuilder and needs to be inserted. By default, this inserts the -/// instruction at the insertion point. +/// \brief This provides the default implementation of the IRBuilder +/// 'InsertHelper' method that is called whenever an instruction is created by +/// IRBuilder and needs to be inserted. +/// +/// By default, this inserts the instruction at the insertion point. template <bool preserveNames = true> class IRBuilderDefaultInserter { protected: @@ -43,7 +44,7 @@ protected: } }; -/// IRBuilderBase - Common base class shared among various IRBuilders. +/// \brief Common base class shared among various IRBuilders.
class IRBuilderBase { DebugLoc CurDbgLocation; protected: @@ -61,8 +62,8 @@ public: // Builder configuration methods //===--------------------------------------------------------------------===// - /// ClearInsertionPoint - Clear the insertion point: created instructions will - /// not be inserted into a block. + /// \brief Clear the insertion point: created instructions will not be + /// inserted into a block. void ClearInsertionPoint() { BB = 0; } @@ -71,30 +72,30 @@ public: BasicBlock::iterator GetInsertPoint() const { return InsertPt; } LLVMContext &getContext() const { return Context; } - /// SetInsertPoint - This specifies that created instructions should be - /// appended to the end of the specified block. + /// \brief This specifies that created instructions should be appended to the + /// end of the specified block. void SetInsertPoint(BasicBlock *TheBB) { BB = TheBB; InsertPt = BB->end(); } - /// SetInsertPoint - This specifies that created instructions should be - /// inserted before the specified instruction. + /// \brief This specifies that created instructions should be inserted before + /// the specified instruction. void SetInsertPoint(Instruction *I) { BB = I->getParent(); InsertPt = I; SetCurrentDebugLocation(I->getDebugLoc()); } - /// SetInsertPoint - This specifies that created instructions should be - /// inserted at the specified point. + /// \brief This specifies that created instructions should be inserted at the + /// specified point. void SetInsertPoint(BasicBlock *TheBB, BasicBlock::iterator IP) { BB = TheBB; InsertPt = IP; } - /// SetInsertPoint(Use) - Find the nearest point that dominates this use, and - /// specify that created instructions should be inserted at this point. + /// \brief Find the nearest point that dominates this use, and specify that + /// created instructions should be inserted at this point. void SetInsertPoint(Use &U) { Instruction *UseInst = cast<Instruction>(U.getUser()); if (PHINode *Phi = dyn_cast<PHINode>(UseInst)) { @@ -106,25 +107,23 @@ public: SetInsertPoint(UseInst); } - /// SetCurrentDebugLocation - Set location information used by debugging - /// information. + /// \brief Set location information used by debugging information. void SetCurrentDebugLocation(const DebugLoc &L) { CurDbgLocation = L; } - /// getCurrentDebugLocation - Get location information used by debugging - /// information. + /// \brief Get location information used by debugging information. DebugLoc getCurrentDebugLocation() const { return CurDbgLocation; } - /// SetInstDebugLocation - If this builder has a current debug location, set - /// it on the specified instruction. + /// \brief If this builder has a current debug location, set it on the + /// specified instruction. void SetInstDebugLocation(Instruction *I) const { if (!CurDbgLocation.isUnknown()) I->setDebugLoc(CurDbgLocation); } - /// getCurrentFunctionReturnType - Get the return type of the current function - /// that we're emitting into. + /// \brief Get the return type of the current function that we're emitting + /// into. Type *getCurrentFunctionReturnType() const; /// InsertPoint - A saved insertion point. @@ -133,35 +132,33 @@ public: BasicBlock::iterator Point; public: - /// Creates a new insertion point which doesn't point to anything. + /// \brief Creates a new insertion point which doesn't point to anything. InsertPoint() : Block(0) {} - /// Creates a new insertion point at the given location. + /// \brief Creates a new insertion point at the given location. 
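A small usage sketch for the insertion-point machinery above (the helper is hypothetical); saveIP and restoreIP, defined nearby, bracket a temporary change of position:

  #include "llvm/IR/IRBuilder.h"

  // Emit an instruction before I, then resume at the original position.
  static void emitBefore(llvm::IRBuilder<> &B, llvm::Instruction *I) {
    llvm::IRBuilderBase::InsertPoint Saved = B.saveIP();
    B.SetInsertPoint(I);  // new instructions now go in front of I
    B.CreateAdd(B.getInt32(1), B.getInt32(2), "tmp");
    B.restoreIP(Saved);   // back to the saved block and iterator
  }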
InsertPoint(BasicBlock *InsertBlock, BasicBlock::iterator InsertPoint) : Block(InsertBlock), Point(InsertPoint) {} - /// isSet - Returns true if this insert point is set. + /// \brief Returns true if this insert point is set. bool isSet() const { return (Block != 0); } llvm::BasicBlock *getBlock() const { return Block; } llvm::BasicBlock::iterator getPoint() const { return Point; } }; - /// saveIP - Returns the current insert point. + /// \brief Returns the current insert point. InsertPoint saveIP() const { return InsertPoint(GetInsertBlock(), GetInsertPoint()); } - /// saveAndClearIP - Returns the current insert point, clearing it - /// in the process. + /// \brief Returns the current insert point, clearing it in the process. InsertPoint saveAndClearIP() { InsertPoint IP(GetInsertBlock(), GetInsertPoint()); ClearInsertionPoint(); return IP; } - /// restoreIP - Sets the current insert point to a previously-saved - /// location. + /// \brief Sets the current insert point to a previously-saved location. void restoreIP(InsertPoint IP) { if (IP.isSet()) SetInsertPoint(IP.getBlock(), IP.getPoint()); @@ -173,49 +170,50 @@ public: // Miscellaneous creation methods. //===--------------------------------------------------------------------===// - /// CreateGlobalString - Make a new global variable with an initializer that - /// has array of i8 type filled in with the nul terminated string value - /// specified. The new global variable will be marked mergable with any - /// others of the same contents. If Name is specified, it is the name of the - /// global variable created. + /// \brief Make a new global variable with an array-of-i8 initializer. + /// + /// Make a new global variable with an initializer of array-of-i8 type, + /// filled in with the null-terminated string value specified. The new global + /// variable will be marked mergeable with any others of the same contents. If + /// Name is specified, it is the name of the global variable created. Value *CreateGlobalString(StringRef Str, const Twine &Name = ""); - /// getInt1 - Get a constant value representing either true or false. + /// \brief Get a constant value representing either true or false. ConstantInt *getInt1(bool V) { return ConstantInt::get(getInt1Ty(), V); } - /// getTrue - Get the constant value for i1 true. + /// \brief Get the constant value for i1 true. ConstantInt *getTrue() { return ConstantInt::getTrue(Context); } - /// getFalse - Get the constant value for i1 false. + /// \brief Get the constant value for i1 false. ConstantInt *getFalse() { return ConstantInt::getFalse(Context); } - /// getInt8 - Get a constant 8-bit value. + /// \brief Get a constant 8-bit value. ConstantInt *getInt8(uint8_t C) { return ConstantInt::get(getInt8Ty(), C); } - /// getInt16 - Get a constant 16-bit value. + /// \brief Get a constant 16-bit value. ConstantInt *getInt16(uint16_t C) { return ConstantInt::get(getInt16Ty(), C); } - /// getInt32 - Get a constant 32-bit value. + /// \brief Get a constant 32-bit value. ConstantInt *getInt32(uint32_t C) { return ConstantInt::get(getInt32Ty(), C); } - /// getInt64 - Get a constant 64-bit value. + /// \brief Get a constant 64-bit value. ConstantInt *getInt64(uint64_t C) { return ConstantInt::get(getInt64Ty(), C); } - /// getInt - Get a constant integer value. + /// \brief Get a constant integer value.
ConstantInt *getInt(const APInt &AI) { return ConstantInt::get(Context, AI); } @@ -224,50 +222,52 @@ public: // Type creation methods //===--------------------------------------------------------------------===// - /// getInt1Ty - Fetch the type representing a single bit + /// \brief Fetch the type representing a single bit. IntegerType *getInt1Ty() { return Type::getInt1Ty(Context); } - /// getInt8Ty - Fetch the type representing an 8-bit integer. + /// \brief Fetch the type representing an 8-bit integer. IntegerType *getInt8Ty() { return Type::getInt8Ty(Context); } - /// getInt16Ty - Fetch the type representing a 16-bit integer. + /// \brief Fetch the type representing a 16-bit integer. IntegerType *getInt16Ty() { return Type::getInt16Ty(Context); } - /// getInt32Ty - Fetch the type representing a 32-bit integer. + /// \brief Fetch the type representing a 32-bit integer. IntegerType *getInt32Ty() { return Type::getInt32Ty(Context); } - /// getInt64Ty - Fetch the type representing a 64-bit integer. + /// \brief Fetch the type representing a 64-bit integer. IntegerType *getInt64Ty() { return Type::getInt64Ty(Context); } - /// getFloatTy - Fetch the type representing a 32-bit floating point value. + /// \brief Fetch the type representing a 32-bit floating point value. Type *getFloatTy() { return Type::getFloatTy(Context); } - /// getDoubleTy - Fetch the type representing a 64-bit floating point value. + /// \brief Fetch the type representing a 64-bit floating point value. Type *getDoubleTy() { return Type::getDoubleTy(Context); } - /// getVoidTy - Fetch the type representing void. + /// \brief Fetch the type representing void. Type *getVoidTy() { return Type::getVoidTy(Context); } + /// \brief Fetch the type representing a pointer to an 8-bit integer value. PointerType *getInt8PtrTy(unsigned AddrSpace = 0) { return Type::getInt8PtrTy(Context, AddrSpace); } + /// \brief Fetch the type representing a pointer to an integer value. IntegerType* getIntPtrTy(DataLayout *DL, unsigned AddrSpace = 0) { return DL->getIntPtrType(Context, AddrSpace); } @@ -276,9 +276,11 @@ public: // Intrinsic creation methods //===--------------------------------------------------------------------===// - /// CreateMemSet - Create and insert a memset to the specified pointer and the - /// specified value. If the pointer isn't an i8*, it will be converted. If a - /// TBAA tag is specified, it will be added to the instruction. + /// \brief Create and insert a memset to the specified pointer and the + /// specified value. + /// + /// If the pointer isn't an i8*, it will be converted. If a TBAA tag is + /// specified, it will be added to the instruction. CallInst *CreateMemSet(Value *Ptr, Value *Val, uint64_t Size, unsigned Align, bool isVolatile = false, MDNode *TBAATag = 0) { return CreateMemSet(Ptr, Val, getInt64(Size), Align, isVolatile, TBAATag); @@ -287,7 +289,8 @@ public: CallInst *CreateMemSet(Value *Ptr, Value *Val, Value *Size, unsigned Align, bool isVolatile = false, MDNode *TBAATag = 0); - /// CreateMemCpy - Create and insert a memcpy between the specified pointers. + /// \brief Create and insert a memcpy between the specified pointers. + /// /// If the pointers aren't i8*, they will be converted. If a TBAA tag is /// specified, it will be added to the instruction.
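As a sketch of the memset/memcpy helpers just described (the helper name is invented; the buffer values are assumed to be pointers of any element type):

  #include "llvm/IR/IRBuilder.h"

  static void zeroAndCopy(llvm::IRBuilder<> &B, llvm::Value *Dst,
                          llvm::Value *Src) {
    // Both pointers are bitcast to i8* internally if necessary.
    B.CreateMemSet(Dst, B.getInt8(0), /*Size=*/64, /*Align=*/8);
    B.CreateMemCpy(Dst, Src, /*Size=*/64, /*Align=*/8);
  }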
CallInst *CreateMemCpy(Value *Dst, Value *Src, uint64_t Size, unsigned Align, @@ -301,9 +304,11 @@ public: bool isVolatile = false, MDNode *TBAATag = 0, MDNode *TBAAStructTag = 0); - /// CreateMemMove - Create and insert a memmove between the specified - /// pointers. If the pointers aren't i8*, they will be converted. If a TBAA - /// tag is specified, it will be added to the instruction. + /// \brief Create and insert a memmove between the specified + /// pointers. + /// + /// If the pointers aren't i8*, they will be converted. If a TBAA tag is + /// specified, it will be added to the instruction. CallInst *CreateMemMove(Value *Dst, Value *Src, uint64_t Size, unsigned Align, bool isVolatile = false, MDNode *TBAATag = 0) { return CreateMemMove(Dst, Src, getInt64(Size), Align, isVolatile, TBAATag); @@ -312,21 +317,23 @@ public: CallInst *CreateMemMove(Value *Dst, Value *Src, Value *Size, unsigned Align, bool isVolatile = false, MDNode *TBAATag = 0); - /// CreateLifetimeStart - Create a lifetime.start intrinsic. If the pointer - /// isn't i8* it will be converted. + /// \brief Create a lifetime.start intrinsic. + /// + /// If the pointer isn't i8* it will be converted. CallInst *CreateLifetimeStart(Value *Ptr, ConstantInt *Size = 0); - /// CreateLifetimeEnd - Create a lifetime.end intrinsic. If the pointer isn't - /// i8* it will be converted. + /// \brief Create a lifetime.end intrinsic. + /// + /// If the pointer isn't i8* it will be converted. CallInst *CreateLifetimeEnd(Value *Ptr, ConstantInt *Size = 0); private: Value *getCastedInt8PtrValue(Value *Ptr); }; -/// IRBuilder - This provides a uniform API for creating instructions and -/// inserting them into a basic block: either at the end of a BasicBlock, or -/// at a specific iterator location in a block. +/// \brief This provides a uniform API for creating instructions and inserting +/// them into a basic block: either at the end of a BasicBlock, or at a specific +/// iterator location in a block. /// /// Note that the builder does not expose the full generality of LLVM /// instructions. For access to extra instruction properties, use the mutators @@ -396,38 +403,37 @@ public: SetInsertPoint(TheBB, IP); } - /// getFolder - Get the constant folder being used. + /// \brief Get the constant folder being used. const T &getFolder() { return Folder; } - /// getDefaultFPMathTag - Get the floating point math metadata being used. + /// \brief Get the floating point math metadata being used. MDNode *getDefaultFPMathTag() const { return DefaultFPMathTag; } - /// Get the flags to be applied to created floating point ops + /// \brief Get the flags to be applied to created floating point ops. FastMathFlags getFastMathFlags() const { return FMF; } - /// Clear the fast-math flags. + /// \brief Clear the fast-math flags. void clearFastMathFlags() { FMF.clear(); } - /// SetDefaultFPMathTag - Set the floating point math metadata to be used. + /// \brief Set the floating point math metadata to be used. void SetDefaultFPMathTag(MDNode *FPMathTag) { DefaultFPMathTag = FPMathTag; } - /// Set the fast-math flags to be used with generated fp-math operators + /// \brief Set the fast-math flags to be used with generated fp-math operators. void SetFastMathFlags(FastMathFlags NewFMF) { FMF = NewFMF; } - /// isNamePreserving - Return true if this builder is configured to actually /// add the requested names to IR created through it.
+ /// \brief Return true if this builder is configured to actually add the + /// requested names to IR created through it. bool isNamePreserving() const { return preserveNames; } - /// Insert - Insert and return the specified instruction. + /// \brief Insert and return the specified instruction. template<typename InstTy> InstTy *Insert(InstTy *I, const Twine &Name = "") const { this->InsertHelper(I, Name, BB, InsertPt); - if (!getCurrentDebugLocation().isUnknown()) - this->SetInstDebugLocation(I); + this->SetInstDebugLocation(I); return I; } - /// Insert - No-op overload to handle constants. + /// \brief No-op overload to handle constants. Constant *Insert(Constant *C, const Twine& = "") const { return C; } @@ -447,25 +453,23 @@ private: } public: - /// CreateRetVoid - Create a 'ret void' instruction. + /// \brief Create a 'ret void' instruction. ReturnInst *CreateRetVoid() { return Insert(ReturnInst::Create(Context)); } - /// @verbatim - /// CreateRet - Create a 'ret <val>' instruction. - /// @endverbatim + /// \brief Create a 'ret <val>' instruction. ReturnInst *CreateRet(Value *V) { return Insert(ReturnInst::Create(Context, V)); } - /// CreateAggregateRet - Create a sequence of N insertvalue instructions, + /// \brief Create a sequence of N insertvalue instructions, /// with one Value from the retVals array each, that build an aggregate /// return value one value at a time, and a ret instruction to return - /// the resulting aggregate value. This is a convenience function for - /// code that uses aggregate return values as a vehicle for having - /// multiple return values. + /// the resulting aggregate value. /// + /// This is a convenience function for code that uses aggregate return values + /// as a vehicle for having multiple return values. ReturnInst *CreateAggregateRet(Value *const *retVals, unsigned N) { Value *V = UndefValue::get(getCurrentFunctionReturnType()); for (unsigned i = 0; i != N; ++i) @@ -473,12 +477,12 @@ public: return Insert(ReturnInst::Create(Context, V)); } - /// CreateBr - Create an unconditional 'br label X' instruction. + /// \brief Create an unconditional 'br label X' instruction. BranchInst *CreateBr(BasicBlock *Dest) { return Insert(BranchInst::Create(Dest)); } - /// CreateCondBr - Create a conditional 'br Cond, TrueDest, FalseDest' + /// \brief Create a conditional 'br Cond, TrueDest, FalseDest' /// instruction. BranchInst *CreateCondBr(Value *Cond, BasicBlock *True, BasicBlock *False, MDNode *BranchWeights = 0) { @@ -486,18 +490,18 @@ BranchWeights)); } - /// CreateSwitch - Create a switch instruction with the specified value, - /// default dest, and with a hint for the number of cases that will be added - /// (for efficient allocation). + /// \brief Create a switch instruction with the specified value, default dest, + /// and with a hint for the number of cases that will be added (for efficient + /// allocation). SwitchInst *CreateSwitch(Value *V, BasicBlock *Dest, unsigned NumCases = 10, MDNode *BranchWeights = 0) { return Insert(addBranchWeights(SwitchInst::Create(V, Dest, NumCases), BranchWeights)); } - /// CreateIndirectBr - Create an indirect branch instruction with the - /// specified address operand, with an optional hint for the number of - /// destinations that will be added (for efficient allocation). + /// \brief Create an indirect branch instruction with the specified address + /// operand, with an optional hint for the number of destinations that will be + /// added (for efficient allocation).
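For instance (a sketch; the block and condition names are hypothetical), a profile-annotated conditional branch combines CreateCondBr with MDBuilder's createBranchWeights:

  #include "llvm/IR/IRBuilder.h"
  #include "llvm/IR/MDBuilder.h"

  static void emitWeightedBranch(llvm::IRBuilder<> &B, llvm::Value *Cond,
                                 llvm::BasicBlock *Hot,
                                 llvm::BasicBlock *Cold) {
    // Hint that the true edge is taken about 95% of the time.
    llvm::MDNode *W =
        llvm::MDBuilder(B.getContext()).createBranchWeights(95, 5);
    B.CreateCondBr(Cond, Hot, Cold, W);
  }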
IndirectBrInst *CreateIndirectBr(Value *Addr, unsigned NumDests = 10) { return Insert(IndirectBrInst::Create(Addr, NumDests)); } @@ -522,7 +526,7 @@ public: return Insert(InvokeInst::Create(Callee, NormalDest, UnwindDest, Args), Name); } - /// CreateInvoke - Create an invoke instruction. + /// \brief Create an invoke instruction. InvokeInst *CreateInvoke(Value *Callee, BasicBlock *NormalDest, BasicBlock *UnwindDest, ArrayRef<Value *> Args, const Twine &Name = "") { @@ -825,7 +829,7 @@ public: const Twine &Name = "") { return Insert(new AllocaInst(Ty, ArraySize), Name); } - // Provided to resolve 'CreateLoad(Ptr, "...")' correctly, instead of + // \brief Provided to resolve 'CreateLoad(Ptr, "...")' correctly, instead of // converting the string to 'bool' for the isVolatile parameter. LoadInst *CreateLoad(Value *Ptr, const char *Name) { return Insert(new LoadInst(Ptr), Name); @@ -839,8 +843,9 @@ public: StoreInst *CreateStore(Value *Val, Value *Ptr, bool isVolatile = false) { return Insert(new StoreInst(Val, Ptr, isVolatile)); } - // Provided to resolve 'CreateAlignedLoad(Ptr, Align, "...")' correctly, - // instead of converting the string to 'bool' for the isVolatile parameter. + // \brief Provided to resolve 'CreateAlignedLoad(Ptr, Align, "...")' + // correctly, instead of converting the string to 'bool' for the isVolatile + // parameter. LoadInst *CreateAlignedLoad(Value *Ptr, unsigned Align, const char *Name) { LoadInst *LI = CreateLoad(Ptr, Name); LI->setAlignment(Align); @@ -1002,8 +1007,8 @@ public: return CreateConstInBoundsGEP2_32(Ptr, 0, Idx, Name); } - /// CreateGlobalStringPtr - Same as CreateGlobalString, but return a pointer - /// with "i8*" type instead of a pointer to array of i8. + /// \brief Same as CreateGlobalString, but return a pointer with "i8*" type + /// instead of a pointer to array of i8. Value *CreateGlobalStringPtr(StringRef Str, const Twine &Name = "") { Value *gv = CreateGlobalString(Str, Name); Value *zero = ConstantInt::get(Type::getInt32Ty(Context), 0); @@ -1024,27 +1029,31 @@ public: Value *CreateSExt(Value *V, Type *DestTy, const Twine &Name = "") { return CreateCast(Instruction::SExt, V, DestTy, Name); } - /// CreateZExtOrTrunc - Create a ZExt or Trunc from the integer value V to - /// DestTy. Return the value untouched if the type of V is already DestTy. - Value *CreateZExtOrTrunc(Value *V, IntegerType *DestTy, + /// \brief Create a ZExt or Trunc from the integer value V to DestTy. Return + /// the value untouched if the type of V is already DestTy. + Value *CreateZExtOrTrunc(Value *V, Type *DestTy, const Twine &Name = "") { - assert(isa<IntegerType>(V->getType()) && "Can only zero extend integers!"); - IntegerType *IntTy = cast<IntegerType>(V->getType()); - if (IntTy->getBitWidth() < DestTy->getBitWidth()) + assert(V->getType()->isIntOrIntVectorTy() && + DestTy->isIntOrIntVectorTy() && + "Can only zero extend/truncate integers!"); + Type *VTy = V->getType(); + if (VTy->getScalarSizeInBits() < DestTy->getScalarSizeInBits()) return CreateZExt(V, DestTy, Name); - if (IntTy->getBitWidth() > DestTy->getBitWidth()) + if (VTy->getScalarSizeInBits() > DestTy->getScalarSizeInBits()) return CreateTrunc(V, DestTy, Name); return V; } - /// CreateSExtOrTrunc - Create a SExt or Trunc from the integer value V to - /// DestTy. Return the value untouched if the type of V is already DestTy. - Value *CreateSExtOrTrunc(Value *V, IntegerType *DestTy, + /// \brief Create a SExt or Trunc from the integer value V to DestTy. 
Return + /// the value untouched if the type of V is already DestTy. + Value *CreateSExtOrTrunc(Value *V, Type *DestTy, const Twine &Name = "") { - assert(isa<IntegerType>(V->getType()) && "Can only sign extend integers!"); - IntegerType *IntTy = cast<IntegerType>(V->getType()); - if (IntTy->getBitWidth() < DestTy->getBitWidth()) + assert(V->getType()->isIntOrIntVectorTy() && + DestTy->isIntOrIntVectorTy() && + "Can only sign extend/truncate integers!"); + Type *VTy = V->getType(); + if (VTy->getScalarSizeInBits() < DestTy->getScalarSizeInBits()) return CreateSExt(V, DestTy, Name); - if (IntTy->getBitWidth() > DestTy->getBitWidth()) + if (VTy->getScalarSizeInBits() > DestTy->getScalarSizeInBits()) return CreateTrunc(V, DestTy, Name); return V; } @@ -1128,8 +1137,9 @@ public: return Insert(CastInst::CreateIntegerCast(V, DestTy, isSigned), Name); } private: - // Provided to resolve 'CreateIntCast(Ptr, Ptr, "...")', giving a compile time - // error, instead of converting the string to bool for the isSigned parameter. + // \brief Provided to resolve 'CreateIntCast(Ptr, Ptr, "...")', giving a + // compile time error, instead of converting the string to bool for the + // isSigned parameter. Value *CreateIntCast(Value *, Type *, const char *) LLVM_DELETED_FUNCTION; public: Value *CreateFPCast(Value *V, Type *DestTy, const Twine &Name = "") { @@ -1339,23 +1349,24 @@ public: // Utility creation methods //===--------------------------------------------------------------------===// - /// CreateIsNull - Return an i1 value testing if \p Arg is null. + /// \brief Return an i1 value testing if \p Arg is null. Value *CreateIsNull(Value *Arg, const Twine &Name = "") { return CreateICmpEQ(Arg, Constant::getNullValue(Arg->getType()), Name); } - /// CreateIsNotNull - Return an i1 value testing if \p Arg is not null. + /// \brief Return an i1 value testing if \p Arg is not null. Value *CreateIsNotNull(Value *Arg, const Twine &Name = "") { return CreateICmpNE(Arg, Constant::getNullValue(Arg->getType()), Name); } - /// CreatePtrDiff - Return the i64 difference between two pointer values, - /// dividing out the size of the pointed-to objects. This is intended to - /// implement C-style pointer subtraction. As such, the pointers must be - /// appropriately aligned for their element types and pointing into the - /// same object. + /// \brief Return the i64 difference between two pointer values, dividing out + /// the size of the pointed-to objects. + /// + /// This is intended to implement C-style pointer subtraction. As such, the + /// pointers must be appropriately aligned for their element types and + /// pointing into the same object. Value *CreatePtrDiff(Value *LHS, Value *RHS, const Twine &Name = "") { assert(LHS->getType() == RHS->getType() && "Pointer subtraction operand types must match!"); @@ -1368,8 +1379,8 @@ public: Name); } - /// CreateVectorSplat - Return a vector value that contains \arg V broadcasted - /// to \p NumElts elements. + /// \brief Return a vector value that contains \arg V broadcasted to \p + /// NumElts elements. 
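An illustrative sketch of the generalized casts (the helper name is invented): because these helpers now take any integer or integer-vector Type, a splatted value can be widened directly:

  #include "llvm/IR/DerivedTypes.h"
  #include "llvm/IR/IRBuilder.h"

  static llvm::Value *widenSplat(llvm::IRBuilder<> &B, llvm::Value *I8Val) {
    // Broadcast an i8 to <4 x i8>, then zero-extend to <4 x i32>.
    llvm::Value *V = B.CreateVectorSplat(4, I8Val, "splat");
    llvm::Type *V4i32 = llvm::VectorType::get(B.getInt32Ty(), 4);
    return B.CreateZExtOrTrunc(V, V4i32, "wide");
  }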
Value *CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name = "") { assert(NumElts > 0 && "Cannot splat to an empty vector!"); diff --git a/include/llvm/IR/InlineAsm.h b/include/llvm/IR/InlineAsm.h index 15d4fb0..33e4ab8 100644 --- a/include/llvm/IR/InlineAsm.h +++ b/include/llvm/IR/InlineAsm.h @@ -13,8 +13,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_INLINEASM_H -#define LLVM_INLINEASM_H +#ifndef LLVM_IR_INLINEASM_H +#define LLVM_IR_INLINEASM_H #include "llvm/ADT/StringRef.h" #include "llvm/IR/Value.h" diff --git a/include/llvm/IR/InstrTypes.h b/include/llvm/IR/InstrTypes.h index 66bf8dd..3e6903c 100644 --- a/include/llvm/IR/InstrTypes.h +++ b/include/llvm/IR/InstrTypes.h @@ -13,8 +13,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_INSTRUCTION_TYPES_H -#define LLVM_INSTRUCTION_TYPES_H +#ifndef LLVM_IR_INSTRTYPES_H +#define LLVM_IR_INSTRTYPES_H #include "llvm/ADT/Twine.h" #include "llvm/IR/DerivedTypes.h" @@ -309,7 +309,7 @@ public: /// NEG, FNeg, or NOT instruction. /// static bool isNeg(const Value *V); - static bool isFNeg(const Value *V); + static bool isFNeg(const Value *V, bool IgnoreZeroSign = false); static bool isNot(const Value *V); /// getNegArgument, getNotArgument - Helper functions to extract the diff --git a/include/llvm/IR/Instruction.h b/include/llvm/IR/Instruction.h index 2731bfb..5721d8f 100644 --- a/include/llvm/IR/Instruction.h +++ b/include/llvm/IR/Instruction.h @@ -12,8 +12,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_INSTRUCTION_H -#define LLVM_INSTRUCTION_H +#ifndef LLVM_IR_INSTRUCTION_H +#define LLVM_IR_INSTRUCTION_H #include "llvm/ADT/ilist_node.h" #include "llvm/IR/User.h" @@ -309,6 +309,12 @@ public: /// bool mayThrow() const; + /// mayReturn - Return true if control may flow past this instruction. + /// This is true for all normal instructions; the only exception is a call + /// to a function marked with the 'noreturn' attribute. + /// + bool mayReturn() const; + /// mayHaveSideEffects - Return true if the instruction may have side effects. /// /// Note that this does not consider malloc and alloca to have side @@ -316,7 +322,7 @@ /// instructions which don't use the returned value. For cases where this /// matters, isSafeToSpeculativelyExecute may be more appropriate. bool mayHaveSideEffects() const { - return mayWriteToMemory() || mayThrow(); + return mayWriteToMemory() || mayThrow() || !mayReturn(); } /// clone() - Create a copy of 'this' instruction that is identical in all diff --git a/include/llvm/IR/Instructions.h b/include/llvm/IR/Instructions.h index bb4ea55..1ae8850 100644 --- a/include/llvm/IR/Instructions.h +++ b/include/llvm/IR/Instructions.h @@ -13,8 +13,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_INSTRUCTIONS_H -#define LLVM_INSTRUCTIONS_H +#ifndef LLVM_IR_INSTRUCTIONS_H +#define LLVM_IR_INSTRUCTIONS_H #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SmallVector.h" @@ -91,7 +91,7 @@ public: /// getType - Overload to return most specific pointer type /// PointerType *getType() const { - return reinterpret_cast<PointerType*>(Instruction::getType()); + return cast<PointerType>(Instruction::getType()); } /// getAllocatedType - Return the type that is being allocated by the @@ -762,9 +762,9 @@ public: /// Transparently provide more efficient getOperand methods.
DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value); - // getType - Overload to return most specific pointer type... - PointerType *getType() const { - return reinterpret_cast<PointerType*>(Instruction::getType()); + // getType - Overload to return most specific sequential type. + SequentialType *getType() const { + return cast<SequentialType>(Instruction::getType()); } /// \brief Returns the address space of this instruction's pointer type. @@ -953,7 +953,7 @@ public: "Both operands to ICmp instruction are not of the same type!"); // Check that the operands are the right type assert((getOperand(0)->getType()->isIntOrIntVectorTy() || - getOperand(0)->getType()->isPointerTy()) && + getOperand(0)->getType()->getScalarType()->isPointerTy()) && "Invalid operand types for ICmp instruction"); } @@ -1570,7 +1570,7 @@ public: const Value *getIndexOperand() const { return Op<1>(); } VectorType *getVectorOperandType() const { - return reinterpret_cast<VectorType*>(getVectorOperand()->getType()); + return cast<VectorType>(getVectorOperand()->getType()); } @@ -1629,7 +1629,7 @@ public: /// getType - Overload to return most specific vector type. /// VectorType *getType() const { - return reinterpret_cast<VectorType*>(Instruction::getType()); + return cast<VectorType>(Instruction::getType()); } /// Transparently provide more efficient getOperand methods. @@ -1681,14 +1681,14 @@ public: /// getType - Overload to return most specific vector type. /// VectorType *getType() const { - return reinterpret_cast<VectorType*>(Instruction::getType()); + return cast<VectorType>(Instruction::getType()); } /// Transparently provide more efficient getOperand methods. DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value); Constant *getMask() const { - return reinterpret_cast<Constant*>(getOperand(2)); + return cast<Constant>(getOperand(2)); } /// getMaskValue - Return the index from the shuffle mask for the specified diff --git a/include/llvm/IR/IntrinsicInst.h b/include/llvm/IR/IntrinsicInst.h index a8dfcbd..8344c56 100644 --- a/include/llvm/IR/IntrinsicInst.h +++ b/include/llvm/IR/IntrinsicInst.h @@ -21,8 +21,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_INTRINSICINST_H -#define LLVM_INTRINSICINST_H +#ifndef LLVM_IR_INTRINSICINST_H +#define LLVM_IR_INTRINSICINST_H #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" diff --git a/include/llvm/IR/Intrinsics.h b/include/llvm/IR/Intrinsics.h index 1e20fd7..c97cd91 100644 --- a/include/llvm/IR/Intrinsics.h +++ b/include/llvm/IR/Intrinsics.h @@ -13,8 +13,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_INTRINSICS_H -#define LLVM_INTRINSICS_H +#ifndef LLVM_IR_INTRINSICS_H +#define LLVM_IR_INTRINSICS_H #include "llvm/ADT/ArrayRef.h" #include <string> @@ -79,7 +79,7 @@ namespace Intrinsic { /// getIntrinsicInfoTableEntries. 
struct IITDescriptor { enum IITDescriptorKind { - Void, MMX, Metadata, Float, Double, + Void, MMX, Metadata, Half, Float, Double, Integer, Vector, Pointer, Struct, Argument, ExtendVecArgument, TruncVecArgument } Kind; diff --git a/include/llvm/IR/Intrinsics.td b/include/llvm/IR/Intrinsics.td index e9fbba9..e252664 100644 --- a/include/llvm/IR/Intrinsics.td +++ b/include/llvm/IR/Intrinsics.td @@ -106,6 +106,7 @@ def llvm_i8_ty : LLVMType<i8>; def llvm_i16_ty : LLVMType<i16>; def llvm_i32_ty : LLVMType<i32>; def llvm_i64_ty : LLVMType<i64>; +def llvm_half_ty : LLVMType<f16>; def llvm_float_ty : LLVMType<f32>; def llvm_double_ty : LLVMType<f64>; def llvm_f80_ty : LLVMType<f80>; diff --git a/include/llvm/IR/IntrinsicsNVVM.td b/include/llvm/IR/IntrinsicsNVVM.td index 1853c99..ebfd03e 100644 --- a/include/llvm/IR/IntrinsicsNVVM.td +++ b/include/llvm/IR/IntrinsicsNVVM.td @@ -805,6 +805,16 @@ def int_nvvm_ldu_global_p : Intrinsic<[llvm_anyptr_ty], [LLVMPointerType<LLVMMatchType<0>>], [IntrReadMem, NoCapture<0>], "llvm.nvvm.ldu.global.p">; +// Generated within nvvm. Use for ldg on sm_35 or later +def int_nvvm_ldg_global_i : Intrinsic<[llvm_anyint_ty], + [LLVMPointerType<LLVMMatchType<0>>], [IntrReadMem, NoCapture<0>], + "llvm.nvvm.ldg.global.i">; +def int_nvvm_ldg_global_f : Intrinsic<[llvm_anyfloat_ty], + [LLVMPointerType<LLVMMatchType<0>>], [IntrReadMem, NoCapture<0>], + "llvm.nvvm.ldg.global.f">; +def int_nvvm_ldg_global_p : Intrinsic<[llvm_anyptr_ty], + [LLVMPointerType<LLVMMatchType<0>>], [IntrReadMem, NoCapture<0>], + "llvm.nvvm.ldg.global.p">; // Use for generic pointers // - These intrinsics are used to convert address spaces. @@ -815,36 +825,36 @@ def int_nvvm_ldu_global_p : Intrinsic<[llvm_anyptr_ty], // of pointer to another type of pointer, while the address space remains // the same. def int_nvvm_ptr_local_to_gen: Intrinsic<[llvm_anyptr_ty], - [llvm_anyptr_ty], [IntrNoMem, NoCapture<0>], + [llvm_anyptr_ty], [IntrNoMem], "llvm.nvvm.ptr.local.to.gen">; def int_nvvm_ptr_shared_to_gen: Intrinsic<[llvm_anyptr_ty], - [llvm_anyptr_ty], [IntrNoMem, NoCapture<0>], + [llvm_anyptr_ty], [IntrNoMem], "llvm.nvvm.ptr.shared.to.gen">; def int_nvvm_ptr_global_to_gen: Intrinsic<[llvm_anyptr_ty], - [llvm_anyptr_ty], [IntrNoMem, NoCapture<0>], + [llvm_anyptr_ty], [IntrNoMem], "llvm.nvvm.ptr.global.to.gen">; def int_nvvm_ptr_constant_to_gen: Intrinsic<[llvm_anyptr_ty], - [llvm_anyptr_ty], [IntrNoMem, NoCapture<0>], + [llvm_anyptr_ty], [IntrNoMem], "llvm.nvvm.ptr.constant.to.gen">; def int_nvvm_ptr_gen_to_global: Intrinsic<[llvm_anyptr_ty], - [llvm_anyptr_ty], [IntrNoMem, NoCapture<0>], + [llvm_anyptr_ty], [IntrNoMem], "llvm.nvvm.ptr.gen.to.global">; def int_nvvm_ptr_gen_to_shared: Intrinsic<[llvm_anyptr_ty], - [llvm_anyptr_ty], [IntrNoMem, NoCapture<0>], + [llvm_anyptr_ty], [IntrNoMem], "llvm.nvvm.ptr.gen.to.shared">; def int_nvvm_ptr_gen_to_local: Intrinsic<[llvm_anyptr_ty], - [llvm_anyptr_ty], [IntrNoMem, NoCapture<0>], + [llvm_anyptr_ty], [IntrNoMem], "llvm.nvvm.ptr.gen.to.local">; def int_nvvm_ptr_gen_to_constant: Intrinsic<[llvm_anyptr_ty], - [llvm_anyptr_ty], [IntrNoMem, NoCapture<0>], + [llvm_anyptr_ty], [IntrNoMem], "llvm.nvvm.ptr.gen.to.constant">; // Used in nvvm internally to help address space opt and ptx code generation // This is for params that are passed to kernel functions by pointer by-val. 
def int_nvvm_ptr_gen_to_param: Intrinsic<[llvm_anyptr_ty], [llvm_anyptr_ty], - [IntrNoMem, NoCapture<0>], + [IntrNoMem], "llvm.nvvm.ptr.gen.to.param">; // Move intrinsics, used in nvvm internally diff --git a/include/llvm/IR/LLVMContext.h b/include/llvm/IR/LLVMContext.h index 58fb39f..b3c558b 100644 --- a/include/llvm/IR/LLVMContext.h +++ b/include/llvm/IR/LLVMContext.h @@ -12,8 +12,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_LLVMCONTEXT_H -#define LLVM_LLVMCONTEXT_H +#ifndef LLVM_IR_LLVMCONTEXT_H +#define LLVM_IR_LLVMCONTEXT_H #include "llvm/Support/Compiler.h" @@ -58,39 +58,27 @@ public: void getMDKindNames(SmallVectorImpl<StringRef> &Result) const; - typedef void (*DiagHandlerTy)(const SMDiagnostic&, void *Context, - unsigned LocCookie); + typedef void (*InlineAsmDiagHandlerTy)(const SMDiagnostic&, void *Context, + unsigned LocCookie); - /// setDiagnosticHandler - This method sets a handler that is invoked - /// when problems are detected by the backend. The first argument is a - /// function pointer and the second is a context pointer that gets passed - /// into the DiagHandler. + /// setInlineAsmDiagnosticHandler - This method sets a handler that is invoked + /// when problems with inline asm are detected by the backend. The first + /// argument is a function pointer and the second is a context pointer that + /// gets passed into the DiagHandler. /// /// LLVMContext doesn't take ownership or interpret either of these /// pointers. - void setDiagnosticHandler(DiagHandlerTy DiagHandler, void *DiagContext = 0); + void setInlineAsmDiagnosticHandler(InlineAsmDiagHandlerTy DiagHandler, + void *DiagContext = 0); - /// getDiagnosticHandler - Return the diagnostic handler set by - /// setDiagnosticHandler. - DiagHandlerTy getDiagnosticHandler() const; + /// getInlineAsmDiagnosticHandler - Return the diagnostic handler set by + /// setInlineAsmDiagnosticHandler. + InlineAsmDiagHandlerTy getInlineAsmDiagnosticHandler() const; - /// getDiagnosticContext - Return the diagnostic context set by - /// setDiagnosticHandler. - void *getDiagnosticContext() const; + /// getInlineAsmDiagnosticContext - Return the diagnostic context set by + /// setInlineAsmDiagnosticHandler. + void *getInlineAsmDiagnosticContext() const; - /// FIXME: Temporary copies of the old names; to be removed as soon as - /// clang switches to the new ones. - typedef DiagHandlerTy InlineAsmDiagHandlerTy; - void setInlineAsmDiagnosticHandler(InlineAsmDiagHandlerTy DiagHandler, - void *DiagContext = 0) { - setDiagnosticHandler(DiagHandler, DiagContext); - } - InlineAsmDiagHandlerTy getInlineAsmDiagnosticHandler() const { - return getDiagnosticHandler(); - } - void *getInlineAsmDiagnosticContext() const { - return getDiagnosticContext(); - } /// emitError - Emit an error message to the currently installed error handler /// with optional location information. This function returns, so code should @@ -101,12 +89,6 @@ public: void emitError(const Instruction *I, const Twine &ErrorStr); void emitError(const Twine &ErrorStr); - /// emitWarning - This is similar to emitError but it emits a warning instead - /// of an error. 
- void emitWarning(unsigned LocCookie, const Twine &ErrorStr); - void emitWarning(const Instruction *I, const Twine &ErrorStr); - void emitWarning(const Twine &ErrorStr); - private: LLVMContext(LLVMContext&) LLVM_DELETED_FUNCTION; void operator=(LLVMContext&) LLVM_DELETED_FUNCTION; diff --git a/include/llvm/IR/MDBuilder.h b/include/llvm/IR/MDBuilder.h index d3ca644..604051b 100644 --- a/include/llvm/IR/MDBuilder.h +++ b/include/llvm/IR/MDBuilder.h @@ -12,150 +12,151 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_MDBUILDER_H -#define LLVM_MDBUILDER_H +#ifndef LLVM_IR_MDBUILDER_H +#define LLVM_IR_MDBUILDER_H -#include "llvm/ADT/APInt.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" -#include "llvm/IR/LLVMContext.h" #include "llvm/IR/Metadata.h" namespace llvm { - class MDBuilder { - LLVMContext &Context; - - public: - MDBuilder(LLVMContext &context) : Context(context) {} - - /// \brief Return the given string as metadata. - MDString *createString(StringRef Str) { - return MDString::get(Context, Str); - } - - //===------------------------------------------------------------------===// - // FPMath metadata. - //===------------------------------------------------------------------===// - - /// \brief Return metadata with the given settings. The special value 0.0 - /// for the Accuracy parameter indicates the default (maximal precision) - /// setting. - MDNode *createFPMath(float Accuracy) { - if (Accuracy == 0.0) - return 0; - assert(Accuracy > 0.0 && "Invalid fpmath accuracy!"); - Value *Op = ConstantFP::get(Type::getFloatTy(Context), Accuracy); - return MDNode::get(Context, Op); - } - - //===------------------------------------------------------------------===// - // Prof metadata. - //===------------------------------------------------------------------===// - - /// \brief Return metadata containing two branch weights. - MDNode *createBranchWeights(uint32_t TrueWeight, uint32_t FalseWeight) { - uint32_t Weights[] = { TrueWeight, FalseWeight }; - return createBranchWeights(Weights); - } - - /// \brief Return metadata containing a number of branch weights. - MDNode *createBranchWeights(ArrayRef<uint32_t> Weights) { - assert(Weights.size() >= 2 && "Need at least two branch weights!"); - - SmallVector<Value *, 4> Vals(Weights.size()+1); - Vals[0] = createString("branch_weights"); - - Type *Int32Ty = Type::getInt32Ty(Context); - for (unsigned i = 0, e = Weights.size(); i != e; ++i) - Vals[i+1] = ConstantInt::get(Int32Ty, Weights[i]); - - return MDNode::get(Context, Vals); - } - - //===------------------------------------------------------------------===// - // Range metadata. - //===------------------------------------------------------------------===// - - /// \brief Return metadata describing the range [Lo, Hi). - MDNode *createRange(const APInt &Lo, const APInt &Hi) { - assert(Lo.getBitWidth() == Hi.getBitWidth() && "Mismatched bitwidths!"); - // If the range is everything then it is useless. - if (Hi == Lo) - return 0; - - // Return the range [Lo, Hi). - Type *Ty = IntegerType::get(Context, Lo.getBitWidth()); - Value *Range[2] = { ConstantInt::get(Ty, Lo), ConstantInt::get(Ty, Hi) }; - return MDNode::get(Context, Range); - } - - - //===------------------------------------------------------------------===// - // TBAA metadata. - //===------------------------------------------------------------------===// - - /// \brief Return metadata appropriate for a TBAA root node. 
Each returned - /// node is distinct from all other metadata and will never be identified - /// (uniqued) with anything else. - MDNode *createAnonymousTBAARoot() { - // To ensure uniqueness the root node is self-referential. - MDNode *Dummy = MDNode::getTemporary(Context, ArrayRef<Value*>()); - MDNode *Root = MDNode::get(Context, Dummy); - // At this point we have - // !0 = metadata !{} <- dummy - // !1 = metadata !{metadata !0} <- root - // Replace the dummy operand with the root node itself and delete the dummy. - Root->replaceOperandWith(0, Root); - MDNode::deleteTemporary(Dummy); - // We now have - // !1 = metadata !{metadata !1} <- self-referential root - return Root; - } - - /// \brief Return metadata appropriate for a TBAA root node with the given - /// name. This may be identified (uniqued) with other roots with the same - /// name. - MDNode *createTBAARoot(StringRef Name) { - return MDNode::get(Context, createString(Name)); - } - - /// \brief Return metadata for a non-root TBAA node with the given name, - /// parent in the TBAA tree, and value for 'pointsToConstantMemory'. - MDNode *createTBAANode(StringRef Name, MDNode *Parent, - bool isConstant = false) { - if (isConstant) { - Constant *Flags = ConstantInt::get(Type::getInt64Ty(Context), 1); - Value *Ops[3] = { createString(Name), Parent, Flags }; - return MDNode::get(Context, Ops); - } else { - Value *Ops[2] = { createString(Name), Parent }; - return MDNode::get(Context, Ops); - } +class APInt; +class LLVMContext; + +class MDBuilder { + LLVMContext &Context; + +public: + MDBuilder(LLVMContext &context) : Context(context) {} + + /// \brief Return the given string as metadata. + MDString *createString(StringRef Str) { + return MDString::get(Context, Str); + } + + //===------------------------------------------------------------------===// + // FPMath metadata. + //===------------------------------------------------------------------===// + + /// \brief Return metadata with the given settings. The special value 0.0 + /// for the Accuracy parameter indicates the default (maximal precision) + /// setting. + MDNode *createFPMath(float Accuracy) { + if (Accuracy == 0.0) + return 0; + assert(Accuracy > 0.0 && "Invalid fpmath accuracy!"); + Value *Op = ConstantFP::get(Type::getFloatTy(Context), Accuracy); + return MDNode::get(Context, Op); + } + + //===------------------------------------------------------------------===// + // Prof metadata. + //===------------------------------------------------------------------===// + + /// \brief Return metadata containing two branch weights. + MDNode *createBranchWeights(uint32_t TrueWeight, uint32_t FalseWeight) { + uint32_t Weights[] = { TrueWeight, FalseWeight }; + return createBranchWeights(Weights); + } + + /// \brief Return metadata containing a number of branch weights. + MDNode *createBranchWeights(ArrayRef<uint32_t> Weights) { + assert(Weights.size() >= 2 && "Need at least two branch weights!"); + + SmallVector<Value *, 4> Vals(Weights.size()+1); + Vals[0] = createString("branch_weights"); + + Type *Int32Ty = Type::getInt32Ty(Context); + for (unsigned i = 0, e = Weights.size(); i != e; ++i) + Vals[i+1] = ConstantInt::get(Int32Ty, Weights[i]); + + return MDNode::get(Context, Vals); + } + + //===------------------------------------------------------------------===// + // Range metadata. + //===------------------------------------------------------------------===// + + /// \brief Return metadata describing the range [Lo, Hi). 
+ MDNode *createRange(const APInt &Lo, const APInt &Hi) { + assert(Lo.getBitWidth() == Hi.getBitWidth() && "Mismatched bitwidths!"); + // If the range is everything then it is useless. + if (Hi == Lo) + return 0; + + // Return the range [Lo, Hi). + Type *Ty = IntegerType::get(Context, Lo.getBitWidth()); + Value *Range[2] = { ConstantInt::get(Ty, Lo), ConstantInt::get(Ty, Hi) }; + return MDNode::get(Context, Range); + } + + + //===------------------------------------------------------------------===// + // TBAA metadata. + //===------------------------------------------------------------------===// + + /// \brief Return metadata appropriate for a TBAA root node. Each returned + /// node is distinct from all other metadata and will never be identified + /// (uniqued) with anything else. + MDNode *createAnonymousTBAARoot() { + // To ensure uniqueness the root node is self-referential. + MDNode *Dummy = MDNode::getTemporary(Context, ArrayRef<Value*>()); + MDNode *Root = MDNode::get(Context, Dummy); + // At this point we have + // !0 = metadata !{} <- dummy + // !1 = metadata !{metadata !0} <- root + // Replace the dummy operand with the root node itself and delete the dummy. + Root->replaceOperandWith(0, Root); + MDNode::deleteTemporary(Dummy); + // We now have + // !1 = metadata !{metadata !1} <- self-referential root + return Root; + } + + /// \brief Return metadata appropriate for a TBAA root node with the given + /// name. This may be identified (uniqued) with other roots with the same + /// name. + MDNode *createTBAARoot(StringRef Name) { + return MDNode::get(Context, createString(Name)); + } + + /// \brief Return metadata for a non-root TBAA node with the given name, + /// parent in the TBAA tree, and value for 'pointsToConstantMemory'. + MDNode *createTBAANode(StringRef Name, MDNode *Parent, + bool isConstant = false) { + if (isConstant) { + Constant *Flags = ConstantInt::get(Type::getInt64Ty(Context), 1); + Value *Ops[3] = { createString(Name), Parent, Flags }; + return MDNode::get(Context, Ops); + } else { + Value *Ops[2] = { createString(Name), Parent }; + return MDNode::get(Context, Ops); } + } + + struct TBAAStructField { + uint64_t Offset; + uint64_t Size; + MDNode *TBAA; + TBAAStructField(uint64_t Offset, uint64_t Size, MDNode *TBAA) : + Offset(Offset), Size(Size), TBAA(TBAA) {} + }; - struct TBAAStructField { - uint64_t Offset; - uint64_t Size; - MDNode *TBAA; - TBAAStructField(uint64_t Offset, uint64_t Size, MDNode *TBAA) : - Offset(Offset), Size(Size), TBAA(TBAA) {} - }; - - /// \brief Return metadata for a tbaa.struct node with the given - /// struct field descriptions. - MDNode *createTBAAStructNode(ArrayRef<TBAAStructField> Fields) { - SmallVector<Value *, 4> Vals(Fields.size() * 3); - Type *Int64 = IntegerType::get(Context, 64); - for (unsigned i = 0, e = Fields.size(); i != e; ++i) { - Vals[i * 3 + 0] = ConstantInt::get(Int64, Fields[i].Offset); - Vals[i * 3 + 1] = ConstantInt::get(Int64, Fields[i].Size); - Vals[i * 3 + 2] = Fields[i].TBAA; - } - return MDNode::get(Context, Vals); + /// \brief Return metadata for a tbaa.struct node with the given + /// struct field descriptions. 
+ MDNode *createTBAAStructNode(ArrayRef<TBAAStructField> Fields) { + SmallVector<Value *, 4> Vals(Fields.size() * 3); + Type *Int64 = IntegerType::get(Context, 64); + for (unsigned i = 0, e = Fields.size(); i != e; ++i) { + Vals[i * 3 + 0] = ConstantInt::get(Int64, Fields[i].Offset); + Vals[i * 3 + 1] = ConstantInt::get(Int64, Fields[i].Size); + Vals[i * 3 + 2] = Fields[i].TBAA; } + return MDNode::get(Context, Vals); + } - }; +}; } // end namespace llvm diff --git a/include/llvm/IR/Metadata.h b/include/llvm/IR/Metadata.h index 402abb3..8c2cfac 100644 --- a/include/llvm/IR/Metadata.h +++ b/include/llvm/IR/Metadata.h @@ -13,8 +13,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_METADATA_H -#define LLVM_METADATA_H +#ifndef LLVM_IR_METADATA_H +#define LLVM_IR_METADATA_H #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/FoldingSet.h" @@ -29,8 +29,8 @@ class Module; template <typename T> class SmallVectorImpl; template<typename ValueSubClass, typename ItemParentClass> class SymbolTableListTraits; - - + + //===----------------------------------------------------------------------===// /// MDString - a single uniqued string. /// These are used to efficiently contain a byte sequence for metadata. @@ -51,7 +51,7 @@ public: unsigned getLength() const { return (unsigned)getName().size(); } typedef StringRef::iterator iterator; - + /// begin() - Pointer to the first byte of the string. iterator begin() const { return getName().begin(); } @@ -64,9 +64,9 @@ public: } }; - + class MDNodeOperand; - + //===----------------------------------------------------------------------===// /// MDNode - a tuple of other values. class MDNode : public Value, public FoldingSetNode { @@ -82,37 +82,37 @@ class MDNode : public Value, public FoldingSetNode { /// NumOperands - This many 'MDNodeOperand' items are co-allocated onto the /// end of this MDNode. unsigned NumOperands; - + // Subclass data enums. enum { /// FunctionLocalBit - This bit is set if this MDNode is function local. /// This is true when it (potentially transitively) contains a reference to /// something in a function, like an argument, basicblock, or instruction. FunctionLocalBit = 1 << 0, - + /// NotUniquedBit - This is set on MDNodes that are not uniqued because they /// have a null operand. NotUniquedBit = 1 << 1, - + /// DestroyFlag - This bit is set by destroy() so the destructor can assert /// that the node isn't being destroyed with a plain 'delete'. DestroyFlag = 1 << 2 }; - + // FunctionLocal enums. enum FunctionLocalness { FL_Unknown = -1, FL_No = 0, FL_Yes = 1 }; - - /// replaceOperand - Replace each instance of F from the operand list of this + + /// replaceOperand - Replace each instance of F from the operand list of this /// node with T. void replaceOperand(MDNodeOperand *Op, Value *NewVal); ~MDNode(); MDNode(LLVMContext &C, ArrayRef<Value*> Vals, bool isFunctionLocal); - + static MDNode *getMDNode(LLVMContext &C, ArrayRef<Value*> Vals, FunctionLocalness FL, bool Insert = true); public: @@ -123,7 +123,7 @@ public: static MDNode *getWhenValsUnresolved(LLVMContext &Context, ArrayRef<Value*> Vals, bool isFunctionLocal); - + static MDNode *getIfExists(LLVMContext &Context, ArrayRef<Value*> Vals); /// getTemporary - Return a temporary MDNode, for use in constructing @@ -137,22 +137,22 @@ public: /// replaceOperandWith - Replace a specific operand. void replaceOperandWith(unsigned i, Value *NewVal); - + /// getOperand - Return specified operand. 
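The MDBuilder rewrite above is purely a reindentation, but its API deserves a usage sketch (a minimal example, assuming an LLVMContext &Ctx, a conditional BranchInst *BI, and a LoadInst *LI are in scope):

  MDBuilder MDB(Ctx);
  // Annotate a branch as 90% taken / 10% not taken.
  BI->setMetadata(LLVMContext::MD_prof, MDB.createBranchWeights(90, 10));
  // Record that the loaded value always lies in [0, 256).
  LI->setMetadata(LLVMContext::MD_range,
                  MDB.createRange(APInt(32, 0), APInt(32, 256)));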
Value *getOperand(unsigned i) const; - + /// getNumOperands - Return number of MDNode operands. unsigned getNumOperands() const { return NumOperands; } - + /// isFunctionLocal - Return whether MDNode is local to a function. bool isFunctionLocal() const { return (getSubclassDataFromValue() & FunctionLocalBit) != 0; } - + // getFunction - If this metadata is function-local and recursively has a // function-local operand, return the first such operand's parent function. // Otherwise, return null. getFunction() should not be used for performance- - // critical code because it recursively visits all the MDNode's operands. + // critical code because it recursively visits all the MDNode's operands. const Function *getFunction() const; /// Profile - calculate a unique identifier for this MDNode to collapse @@ -172,11 +172,11 @@ private: // destroy - Delete this node. Only when there are no uses. void destroy(); - bool isNotUniqued() const { + bool isNotUniqued() const { return (getSubclassDataFromValue() & NotUniquedBit) != 0; } void setIsNotUniqued(); - + // Shadow Value::setValueSubclassData with a private forwarding method so that // any future subclasses cannot accidentally use it. void setValueSubclassData(unsigned short D) { @@ -220,7 +220,7 @@ public: /// getOperand - Return specified operand. MDNode *getOperand(unsigned i) const; - + /// getNumOperands - Return the number of NamedMDNode operands. unsigned getNumOperands() const; diff --git a/include/llvm/IR/Module.h b/include/llvm/IR/Module.h index 4e1ca43..4460aa4 100644 --- a/include/llvm/IR/Module.h +++ b/include/llvm/IR/Module.h @@ -12,8 +12,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_MODULE_H -#define LLVM_MODULE_H +#ifndef LLVM_IR_MODULE_H +#define LLVM_IR_MODULE_H #include "llvm/ADT/OwningPtr.h" #include "llvm/IR/Function.h" @@ -147,30 +147,38 @@ public: /// An enumeration for describing the size of a pointer on the target machine. enum PointerSize { AnyPointerSize, Pointer32, Pointer64 }; - /// An enumeration for the supported behaviors of module flags. The following - /// module flags behavior values are supported: - /// - /// Value Behavior - /// ----- -------- - /// 1 Error - /// Emits an error if two values disagree. - /// - /// 2 Warning - /// Emits a warning if two values disagree. - /// - /// 3 Require - /// Emits an error when the specified value is not present - /// or doesn't have the specified value. It is an error for - /// two (or more) llvm.module.flags with the same ID to have - /// the Require behavior but different values. There may be - /// multiple Require flags per ID. - /// - /// 4 Override - /// Uses the specified value if the two values disagree. It - /// is an error for two (or more) llvm.module.flags with the - /// same ID to have the Override behavior but different - /// values. - enum ModFlagBehavior { Error = 1, Warning = 2, Require = 3, Override = 4 }; + /// This enumeration defines the supported behaviors of module flags. + enum ModFlagBehavior { + /// Emits an error if two values disagree, otherwise the resulting value is + /// that of the operands. + Error = 1, + + /// Emits a warning if two values disagree. The result value will be the + /// operand for the flag from the first module being linked. + Warning = 2, + + /// Adds a requirement that another module flag be present and have a + /// specified value after linking is performed. 
The value must be a metadata + /// pair, where the first element of the pair is the ID of the module flag + /// to be restricted, and the second element of the pair is the value the + /// module flag should be restricted to. This behavior can be used to + /// restrict the allowable results (via triggering of an error) of linking + /// IDs with the **Override** behavior. + Require = 3, + + /// Uses the specified value, regardless of the behavior or value of the + /// other module. If both modules specify **Override**, but the values + /// differ, an error will be emitted. + Override = 4, + + /// Appends the two values, which are required to be metadata nodes. + Append = 5, + + /// Appends the two values, which are required to be metadata + /// nodes. However, duplicate entries in the second list are dropped + /// during the append operation. + AppendUnique = 6 + }; struct ModuleFlagEntry { ModFlagBehavior Behavior; diff --git a/include/llvm/IR/OperandTraits.h b/include/llvm/IR/OperandTraits.h index 60c8b09..0e4b195 100644 --- a/include/llvm/IR/OperandTraits.h +++ b/include/llvm/IR/OperandTraits.h @@ -12,8 +12,8 @@ // the operands in the most efficient manner. // -#ifndef LLVM_OPERAND_TRAITS_H -#define LLVM_OPERAND_TRAITS_H +#ifndef LLVM_IR_OPERANDTRAITS_H +#define LLVM_IR_OPERANDTRAITS_H #include "llvm/IR/User.h" diff --git a/include/llvm/IR/Operator.h b/include/llvm/IR/Operator.h index 577b41a..13ab72c 100644 --- a/include/llvm/IR/Operator.h +++ b/include/llvm/IR/Operator.h @@ -12,8 +12,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_OPERATOR_H -#define LLVM_OPERATOR_H +#ifndef LLVM_IR_OPERATOR_H +#define LLVM_IR_OPERATOR_H #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" @@ -33,8 +33,8 @@ class ConstantExpr; /// class Operator : public User { private: - // Do not implement any of these. The Operator class is intended to be used - // as a utility, and is never itself instantiated. + // The Operator class is intended to be used as a utility, and is never itself + // instantiated. 
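A usage sketch for the module-flag behaviors documented above, via Module::addModuleFlag (assuming Module *M and LLVMContext &Ctx; the flag names are illustrative only, not keys this patch defines):

  // Warning: a mismatch between linked modules only warns.
  M->addModuleFlag(Module::Warning, "frame-pointer", 1);
  // Require: the value is an {ID, value} pair that linked modules must match.
  Value *Req[] = { MDString::get(Ctx, "frame-pointer"),
                   ConstantInt::get(Type::getInt32Ty(Ctx), 1) };
  M->addModuleFlag(Module::Require, "frame-pointer-req", MDNode::get(Ctx, Req));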
void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION; void *operator new(size_t s) LLVM_DELETED_FUNCTION; Operator() LLVM_DELETED_FUNCTION; diff --git a/include/llvm/IR/SymbolTableListTraits.h b/include/llvm/IR/SymbolTableListTraits.h index ec5c88f..561ce01 100644 --- a/include/llvm/IR/SymbolTableListTraits.h +++ b/include/llvm/IR/SymbolTableListTraits.h @@ -22,8 +22,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_SYMBOLTABLELISTTRAITS_H -#define LLVM_SYMBOLTABLELISTTRAITS_H +#ifndef LLVM_IR_SYMBOLTABLELISTTRAITS_H +#define LLVM_IR_SYMBOLTABLELISTTRAITS_H #include "llvm/ADT/ilist.h" diff --git a/include/llvm/IR/Type.h b/include/llvm/IR/Type.h index def4575..d89ae24 100644 --- a/include/llvm/IR/Type.h +++ b/include/llvm/IR/Type.h @@ -12,11 +12,13 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_TYPE_H -#define LLVM_TYPE_H +#ifndef LLVM_IR_TYPE_H +#define LLVM_IR_TYPE_H +#include "llvm/ADT/APFloat.h" #include "llvm/Support/Casting.h" #include "llvm/Support/DataTypes.h" +#include "llvm/Support/ErrorHandling.h" namespace llvm { @@ -162,6 +164,18 @@ public: getTypeID() == PPC_FP128TyID; } + const fltSemantics &getFltSemantics() const { + switch (getTypeID()) { + case HalfTyID: return APFloat::IEEEhalf; + case FloatTyID: return APFloat::IEEEsingle; + case DoubleTyID: return APFloat::IEEEdouble; + case X86_FP80TyID: return APFloat::x87DoubleExtended; + case FP128TyID: return APFloat::IEEEquad; + case PPC_FP128TyID: return APFloat::PPCDoubleDouble; + default: llvm_unreachable("Invalid floating type"); + } + } + /// isX86_MMXTy - Return true if this is X86 MMX. bool isX86_MMXTy() const { return getTypeID() == X86_MMXTyID; } diff --git a/include/llvm/IR/TypeBuilder.h b/include/llvm/IR/TypeBuilder.h index 83ca2a3..80c60a0 100644 --- a/include/llvm/IR/TypeBuilder.h +++ b/include/llvm/IR/TypeBuilder.h @@ -12,8 +12,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_TYPEBUILDER_H -#define LLVM_TYPEBUILDER_H +#ifndef LLVM_IR_TYPEBUILDER_H +#define LLVM_IR_TYPEBUILDER_H #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/LLVMContext.h" diff --git a/include/llvm/IR/Use.h b/include/llvm/IR/Use.h index 8080445..33a69c4 100644 --- a/include/llvm/IR/Use.h +++ b/include/llvm/IR/Use.h @@ -22,8 +22,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_USE_H -#define LLVM_USE_H +#ifndef LLVM_IR_USE_H +#define LLVM_IR_USE_H #include "llvm/ADT/PointerIntPair.h" #include "llvm/Support/Compiler.h" @@ -66,7 +66,6 @@ public: typedef PointerIntPair<User*, 1, unsigned> UserRef; private: - /// Copy ctor - do not implement Use(const Use &U) LLVM_DELETED_FUNCTION; /// Destructor - Only for zap() diff --git a/include/llvm/IR/User.h b/include/llvm/IR/User.h index 843d20b..a4d0a5d 100644 --- a/include/llvm/IR/User.h +++ b/include/llvm/IR/User.h @@ -16,8 +16,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_USER_H -#define LLVM_USER_H +#ifndef LLVM_IR_USER_H +#define LLVM_IR_USER_H #include "llvm/IR/Value.h" #include "llvm/Support/ErrorHandling.h" diff --git a/include/llvm/IR/Value.h b/include/llvm/IR/Value.h index 85fdfe3..a4f7862 100644 --- a/include/llvm/IR/Value.h +++ b/include/llvm/IR/Value.h @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_VALUE_H -#define LLVM_VALUE_H +#ifndef LLVM_IR_VALUE_H 
+#define LLVM_IR_VALUE_H #include "llvm/IR/Use.h" #include "llvm/Support/Casting.h" diff --git a/include/llvm/IR/ValueSymbolTable.h b/include/llvm/IR/ValueSymbolTable.h index 3ea095b..bf1fade 100644 --- a/include/llvm/IR/ValueSymbolTable.h +++ b/include/llvm/IR/ValueSymbolTable.h @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_VALUE_SYMBOL_TABLE_H -#define LLVM_VALUE_SYMBOL_TABLE_H +#ifndef LLVM_IR_VALUESYMBOLTABLE_H +#define LLVM_IR_VALUESYMBOLTABLE_H #include "llvm/ADT/StringMap.h" #include "llvm/IR/Value.h" diff --git a/include/llvm/InitializePasses.h b/include/llvm/InitializePasses.h index 79037f7..e5e21f3 100644 --- a/include/llvm/InitializePasses.h +++ b/include/llvm/InitializePasses.h @@ -31,6 +31,10 @@ void initializeTransformUtils(PassRegistry&); /// ScalarOpts library. void initializeScalarOpts(PassRegistry&); +/// initializeObjCARCOpts - Initialize all passes linked into the ObjCARCOpts +/// library. +void initializeObjCARCOpts(PassRegistry&); + /// initializeVectorization - Initialize all passes linked into the /// Vectorize library. void initializeVectorization(PassRegistry&); @@ -77,6 +81,8 @@ void initializeBoundsCheckingPass(PassRegistry&); void initializeBranchFolderPassPass(PassRegistry&); void initializeBranchProbabilityInfoPass(PassRegistry&); void initializeBreakCriticalEdgesPass(PassRegistry&); +void initializeCallGraphPrinterPass(PassRegistry&); +void initializeCallGraphViewerPass(PassRegistry&); void initializeCFGOnlyPrinterPass(PassRegistry&); void initializeCFGOnlyViewerPass(PassRegistry&); void initializeCFGPrinterPass(PassRegistry&); @@ -130,6 +136,7 @@ void initializeIPSCCPPass(PassRegistry&); void initializeIVUsersPass(PassRegistry&); void initializeIfConverterPass(PassRegistry&); void initializeIndVarSimplifyPass(PassRegistry&); +void initializeInlineCostAnalysisPass(PassRegistry&); void initializeInstCombinerPass(PassRegistry&); void initializeInstCountPass(PassRegistry&); void initializeInstNamerPass(PassRegistry&); @@ -210,6 +217,7 @@ void initializePreVerifierPass(PassRegistry&); void initializePrintDbgInfoPass(PassRegistry&); void initializePrintFunctionPassPass(PassRegistry&); void initializePrintModulePassPass(PassRegistry&); +void initializePrintBasicBlockPassPass(PassRegistry&); void initializeProcessImplicitDefsPass(PassRegistry&); void initializeProfileEstimatorPassPass(PassRegistry&); void initializeProfileInfoAnalysisGroup(PassRegistry&); diff --git a/include/llvm/LinkAllVMCore.h b/include/llvm/LinkAllIR.h index 6039a63..4c1aaca 100644 --- a/include/llvm/LinkAllVMCore.h +++ b/include/llvm/LinkAllIR.h @@ -1,4 +1,4 @@ -//===- LinkAllVMCore.h - Reference All VMCore Code --------------*- C++ -*-===// +//===----- LinkAllIR.h - Reference All VMCore Code --------------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -13,8 +13,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_LINKALLVMCORE_H -#define LLVM_LINKALLVMCORE_H +#ifndef LLVM_LINKALLIR_H +#define LLVM_LINKALLIR_H #include "llvm/Analysis/Verifier.h" #include "llvm/IR/InlineAsm.h" diff --git a/include/llvm/LinkAllPasses.h b/include/llvm/LinkAllPasses.h index d29775a..cb727f6 100644 --- a/include/llvm/LinkAllPasses.h +++ b/include/llvm/LinkAllPasses.h @@ -16,6 +16,7 @@ #define LLVM_LINKALLPASSES_H #include "llvm/Analysis/AliasSetTracker.h" +#include "llvm/Analysis/CallPrinter.h" #include "llvm/Analysis/DomPrinter.h" #include "llvm/Analysis/FindUsedTypes.h" #include 
"llvm/Analysis/IntervalPartition.h" @@ -30,6 +31,7 @@ #include "llvm/IR/Function.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/Instrumentation.h" +#include "llvm/Transforms/ObjCARC.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h" #include "llvm/Transforms/Vectorize.h" @@ -57,6 +59,8 @@ namespace { (void) llvm::createBlockPlacementPass(); (void) llvm::createBoundsCheckingPass(); (void) llvm::createBreakCriticalEdgesPass(); + (void) llvm::createCallGraphPrinterPass(); + (void) llvm::createCallGraphViewerPass(); (void) llvm::createCFGSimplificationPass(); (void) llvm::createConstantMergePass(); (void) llvm::createConstantPropagationPass(); @@ -147,6 +151,7 @@ namespace { (void) llvm::createMergeFunctionsPass(); (void) llvm::createPrintModulePass(0); (void) llvm::createPrintFunctionPass("", 0); + (void) llvm::createPrintBasicBlockPass(0); (void) llvm::createDbgInfoPrinterPass(); (void) llvm::createModuleDebugInfoPrinterPass(); (void) llvm::createPartialInliningPass(); diff --git a/include/llvm/Linker.h b/include/llvm/Linker.h index 1ebcd6b..a349806 100644 --- a/include/llvm/Linker.h +++ b/include/llvm/Linker.h @@ -43,14 +43,6 @@ class Linker { /// @name Types /// @{ public: - /// This type is used to pass the linkage items (libraries and files) to - /// the LinkItems function. It is composed of string/bool pairs. The string - /// provides the name of the file or library (as with the -l option). The - /// bool should be true for libraries and false for files, signifying - /// "isLibrary". - /// @brief A list of linkage items - typedef std::vector<std::pair<std::string,bool> > ItemList; - /// This enumeration is used to control various optional features of the /// linker. enum ControlFlags { @@ -153,93 +145,6 @@ class Linker { /// @brief Set control flags. void setFlags(unsigned flags) { Flags = flags; } - /// This method is the main interface to the linker. It can be used to - /// link a set of linkage items into a module. A linkage item is either a - /// file name with fully qualified path, or a library for which the Linker's - /// LibraryPath will be utilized to locate the library. The bool value in - /// the LinkItemKind should be set to true for libraries. This function - /// allows linking to preserve the order of specification associated with - /// the command line, or for other purposes. Each item will be linked in - /// turn as it occurs in \p Items. - /// @returns true if an error occurred, false otherwise - /// @see LinkItemKind - /// @see getLastError - bool LinkInItems ( - const ItemList& Items, ///< Set of libraries/files to link in - ItemList& NativeItems ///< Output list of native files/libs - ); - - /// This function links the bitcode \p Files into the composite module. - /// Note that this does not do any linking of unresolved symbols. The \p - /// Files are all completely linked into \p HeadModule regardless of - /// unresolved symbols. This function just loads each bitcode file and - /// calls LinkInModule on them. - /// @returns true if an error occurs, false otherwise - /// @see getLastError - /// @brief Link in multiple files. - bool LinkInFiles ( - const std::vector<sys::Path> & Files ///< Files to link in - ); - - /// This function links a single bitcode file, \p File, into the composite - /// module. Note that this does not attempt to resolve symbols. This method - /// just loads the bitcode file and calls LinkInModule on it. If an error - /// occurs, the Linker's error string is set. 
- /// @returns true if an error occurs, false otherwise - /// @see getLastError - /// @brief Link in a single file. - bool LinkInFile( - const sys::Path& File, ///< File to link in. - bool &is_native ///< Indicates if the file is native object file - ); - - /// This function provides a way to selectively link in a set of modules, - /// found in libraries, based on the unresolved symbols in the composite - /// module. Each item in \p Libraries should be the base name of a library, - /// as if given with the -l option of a linker tool. The Linker's LibPaths - /// are searched for the \p Libraries and any found will be linked in with - /// LinkInArchive. If an error occurs, the Linker's error string is set. - /// @see LinkInArchive - /// @see getLastError - /// @returns true if an error occurs, false otherwise - /// @brief Link libraries into the module - bool LinkInLibraries ( - const std::vector<std::string> & Libraries ///< Libraries to link in - ); - - /// This function provides a way to selectively link in a set of modules, - /// found in one library, based on the unresolved symbols in the composite - /// module.The \p Library should be the base name of a library, as if given - /// with the -l option of a linker tool. The Linker's LibPaths are searched - /// for the \p Library and if found, it will be linked in with via the - /// LinkInArchive method. If an error occurs, the Linker's error string is - /// set. - /// @see LinkInArchive - /// @see getLastError - /// @returns true if an error occurs, false otherwise - /// @brief Link one library into the module - bool LinkInLibrary ( - StringRef Library, ///< The library to link in - bool& is_native ///< Indicates if lib a native library - ); - - /// This function links one bitcode archive, \p Filename, into the module. - /// The archive is searched to resolve outstanding symbols. Any modules in - /// the archive that resolve outstanding symbols will be linked in. The - /// library is searched repeatedly until no more modules that resolve - /// symbols can be found. If an error occurs, the error string is set. - /// To speed up this function, ensure the archive has been processed - /// llvm-ranlib or the S option was given to llvm-ar when the archive was - /// created. These tools add a symbol table to the archive which makes the - /// search for undefined symbols much faster. - /// @see getLastError - /// @returns true if an error occurs, otherwise false. - /// @brief Link in one archive. - bool LinkInArchive( - const sys::Path& Filename, ///< Filename of the archive to link - bool& is_native ///< Indicates if archive is a native archive - ); - /// This method links the \p Src module into the Linker's Composite module /// by calling LinkModules. All the other LinkIn* methods eventually /// result in calling this method to link a Module into the Linker's @@ -268,21 +173,10 @@ class Linker { static bool LinkModules(Module* Dest, Module* Src, unsigned Mode, std::string* ErrorMsg); - /// This function looks through the Linker's LibPaths to find a library with - /// the name \p Filename. If the library cannot be found, the returned path - /// will be empty (i.e. sys::Path::isEmpty() will return true). - /// @returns A sys::Path to the found library - /// @brief Find a library from its short name. - sys::Path FindLib(StringRef Filename); - /// @} /// @name Implementation /// @{ private: - /// Read in and parse the bitcode file named by FN and return the - /// Module it contains (wrapped in an auto_ptr), or 0 if an error occurs. 
- std::auto_ptr<Module> LoadObject(const sys::Path& FN); - bool warning(StringRef message); bool error(StringRef message); void verbose(StringRef message); diff --git a/include/llvm/MC/MCAsmInfo.h b/include/llvm/MC/MCAsmInfo.h index 38cf060..28256b3 100644 --- a/include/llvm/MC/MCAsmInfo.h +++ b/include/llvm/MC/MCAsmInfo.h @@ -13,8 +13,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_TARGET_ASM_INFO_H -#define LLVM_TARGET_ASM_INFO_H +#ifndef LLVM_MC_MCASMINFO_H +#define LLVM_MC_MCASMINFO_H #include "llvm/MC/MCDirectives.h" #include "llvm/MC/MachineLocation.h" @@ -48,6 +48,11 @@ namespace llvm { /// Default is 4. unsigned PointerSize; + /// CalleeSaveStackSlotSize - Size of the stack slot reserved for + /// callee-saved registers, in bytes. + /// Default is same as pointer size. + unsigned CalleeSaveStackSlotSize; + /// IsLittleEndian - True if target is little endian. /// Default is true. bool IsLittleEndian; @@ -343,7 +348,13 @@ namespace llvm { return PointerSize; } - /// islittleendian - True if the target is little endian. + /// getCalleeSaveStackSlotSize - Get the callee-saved register stack slot + /// size in bytes. + unsigned getCalleeSaveStackSlotSize() const { + return CalleeSaveStackSlotSize; + } + + /// isLittleEndian - True if the target is little endian. bool isLittleEndian() const { return IsLittleEndian; } diff --git a/include/llvm/MC/MCAsmInfoCOFF.h b/include/llvm/MC/MCAsmInfoCOFF.h index 0ff3e12..7286151 100644 --- a/include/llvm/MC/MCAsmInfoCOFF.h +++ b/include/llvm/MC/MCAsmInfoCOFF.h @@ -7,8 +7,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_COFF_TARGET_ASM_INFO_H -#define LLVM_COFF_TARGET_ASM_INFO_H +#ifndef LLVM_MC_MCASMINFOCOFF_H +#define LLVM_MC_MCASMINFOCOFF_H #include "llvm/MC/MCAsmInfo.h" @@ -33,4 +33,4 @@ namespace llvm { } -#endif // LLVM_COFF_TARGET_ASM_INFO_H +#endif // LLVM_MC_MCASMINFOCOFF_H diff --git a/include/llvm/MC/MCAsmInfoDarwin.h b/include/llvm/MC/MCAsmInfoDarwin.h index af552de..3d249f9 100644 --- a/include/llvm/MC/MCAsmInfoDarwin.h +++ b/include/llvm/MC/MCAsmInfoDarwin.h @@ -12,8 +12,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_DARWIN_TARGET_ASM_INFO_H -#define LLVM_DARWIN_TARGET_ASM_INFO_H +#ifndef LLVM_MC_MCASMINFODARWIN_H +#define LLVM_MC_MCASMINFODARWIN_H #include "llvm/MC/MCAsmInfo.h" @@ -26,4 +26,4 @@ namespace llvm { } -#endif // LLVM_DARWIN_TARGET_ASM_INFO_H +#endif // LLVM_MC_MCASMINFODARWIN_H diff --git a/include/llvm/MC/MCAsmLayout.h b/include/llvm/MC/MCAsmLayout.h index 8cb4601..3058b7b 100644 --- a/include/llvm/MC/MCAsmLayout.h +++ b/include/llvm/MC/MCAsmLayout.h @@ -60,9 +60,10 @@ public: /// Get the assembler object this is a layout for. MCAssembler &getAssembler() const { return Assembler; } - /// \brief Invalidate the fragments after F because it has been resized. - /// The fragment's size should have already been updated. - void invalidateFragmentsAfter(MCFragment *F); + /// \brief Invalidate the fragments starting with F because it has been + /// resized. The fragment's size should have already been updated, but + /// its bundle padding will be recomputed. 
+ void invalidateFragmentsFrom(MCFragment *F); /// \brief Perform layout for a single fragment, assuming that the previous /// fragment has already been laid out correctly, and the parent section has diff --git a/include/llvm/MC/MCAssembler.h b/include/llvm/MC/MCAssembler.h index 40766b4..6aeb8b8 100644 --- a/include/llvm/MC/MCAssembler.h +++ b/include/llvm/MC/MCAssembler.h @@ -52,6 +52,7 @@ public: enum FragmentType { FT_Align, FT_Data, + FT_CompactEncodedInst, FT_Fill, FT_Relaxable, FT_Org, @@ -113,6 +114,7 @@ public: /// \brief Should this fragment be placed at the end of an aligned bundle? virtual bool alignToBundleEnd() const { return false; } + virtual void setAlignToBundleEnd(bool V) { } /// \brief Get the padding size that must be inserted before this fragment. /// Used for bundling. By default, no padding is inserted. @@ -131,6 +133,9 @@ public: void dump(); }; +/// Interface implemented by fragments that contain encoded instructions and/or +/// data. +/// class MCEncodedFragment : public MCFragment { virtual void anchor(); @@ -142,20 +147,9 @@ public: } virtual ~MCEncodedFragment(); - typedef SmallVectorImpl<MCFixup>::const_iterator const_fixup_iterator; - typedef SmallVectorImpl<MCFixup>::iterator fixup_iterator; - virtual SmallVectorImpl<char> &getContents() = 0; virtual const SmallVectorImpl<char> &getContents() const = 0; - virtual SmallVectorImpl<MCFixup> &getFixups() = 0; - virtual const SmallVectorImpl<MCFixup> &getFixups() const = 0; - - virtual fixup_iterator fixup_begin() = 0; - virtual const_fixup_iterator fixup_begin() const = 0; - virtual fixup_iterator fixup_end() = 0; - virtual const_fixup_iterator fixup_end() const = 0; - virtual uint8_t getBundlePadding() const { return BundlePadding; } @@ -166,13 +160,52 @@ public: static bool classof(const MCFragment *F) { MCFragment::FragmentType Kind = F->getKind(); + switch (Kind) { + default: + return false; + case MCFragment::FT_Relaxable: + case MCFragment::FT_CompactEncodedInst: + case MCFragment::FT_Data: + return true; + } + } +}; + +/// Interface implemented by fragments that contain encoded instructions and/or +/// data and also have fixups registered. +/// +class MCEncodedFragmentWithFixups : public MCEncodedFragment { + virtual void anchor(); + +public: + MCEncodedFragmentWithFixups(MCFragment::FragmentType FType, + MCSectionData *SD = 0) + : MCEncodedFragment(FType, SD) + { + } + + virtual ~MCEncodedFragmentWithFixups(); + + typedef SmallVectorImpl<MCFixup>::const_iterator const_fixup_iterator; + typedef SmallVectorImpl<MCFixup>::iterator fixup_iterator; + + virtual SmallVectorImpl<MCFixup> &getFixups() = 0; + virtual const SmallVectorImpl<MCFixup> &getFixups() const = 0; + + virtual fixup_iterator fixup_begin() = 0; + virtual const_fixup_iterator fixup_begin() const = 0; + virtual fixup_iterator fixup_end() = 0; + virtual const_fixup_iterator fixup_end() const = 0; + + static bool classof(const MCFragment *F) { + MCFragment::FragmentType Kind = F->getKind(); return Kind == MCFragment::FT_Relaxable || Kind == MCFragment::FT_Data; } }; /// Fragment for data and encoded instructions. /// -class MCDataFragment : public MCEncodedFragment { +class MCDataFragment : public MCEncodedFragmentWithFixups { virtual void anchor(); /// \brief Does this fragment contain encoded instructions anywhere in it? 
@@ -187,7 +220,7 @@ class MCDataFragment : public MCEncodedFragment { SmallVector<MCFixup, 4> Fixups; public: MCDataFragment(MCSectionData *SD = 0) - : MCEncodedFragment(FT_Data, SD), + : MCEncodedFragmentWithFixups(FT_Data, SD), HasInstructions(false), AlignToBundleEnd(false) { } @@ -220,10 +253,43 @@ public: } }; +/// This is a compact (memory-size-wise) fragment for holding an encoded +/// instruction (non-relaxable) that has no fixups registered. When applicable, +/// it can be used instead of MCDataFragment and lead to lower memory +/// consumption. +/// +class MCCompactEncodedInstFragment : public MCEncodedFragment { + virtual void anchor(); + + /// \brief Should this fragment be aligned to the end of a bundle? + bool AlignToBundleEnd; + + SmallVector<char, 4> Contents; +public: + MCCompactEncodedInstFragment(MCSectionData *SD = 0) + : MCEncodedFragment(FT_CompactEncodedInst, SD), AlignToBundleEnd(false) + { + } + + virtual bool hasInstructions() const { + return true; + } + + virtual SmallVectorImpl<char> &getContents() { return Contents; } + virtual const SmallVectorImpl<char> &getContents() const { return Contents; } + + virtual bool alignToBundleEnd() const { return AlignToBundleEnd; } + virtual void setAlignToBundleEnd(bool V) { AlignToBundleEnd = V; } + + static bool classof(const MCFragment *F) { + return F->getKind() == MCFragment::FT_CompactEncodedInst; + } +}; + /// A relaxable fragment holds on to its MCInst, since it may need to be /// relaxed during the assembler layout and relaxation stage. /// -class MCRelaxableFragment : public MCEncodedFragment { +class MCRelaxableFragment : public MCEncodedFragmentWithFixups { virtual void anchor(); /// Inst - The instruction this is a fragment for. @@ -237,7 +303,7 @@ class MCRelaxableFragment { public: MCRelaxableFragment(const MCInst &_Inst, MCSectionData *SD = 0) - : MCEncodedFragment(FT_Relaxable, SD), Inst(_Inst) { + : MCEncodedFragmentWithFixups(FT_Relaxable, SD), Inst(_Inst) { } virtual SmallVectorImpl<char> &getContents() { return Contents; } @@ -791,6 +857,10 @@ private: std::vector<IndirectSymbolData> IndirectSymbols; std::vector<DataRegionData> DataRegions; + + /// The list of linker options to propagate into the object file. + std::vector<std::vector<std::string> > LinkerOptions; + /// The set of function symbols for which a .thumb_func directive has /// been seen. // @@ -809,6 +879,12 @@ private: unsigned NoExecStack : 1; unsigned SubsectionsViaSymbols : 1; + /// ELF specific e_header flags + // It would be good if there were an MCELFAssembler class to hold this. + // ELF header flags are used both by the integrated and standalone assemblers. + // Access to the flags is necessary in cases where assembler directives affect + // which flags are set. + unsigned ELFHeaderEFlags; private: /// Evaluate a fixup to a relocatable expression and the value which should be /// placed into the fixup. @@ -886,6 +962,10 @@ public: /// Flag a function symbol as the target of a .thumb_func directive. void setIsThumbFunc(const MCSymbol *Func) { ThumbFuncs.insert(Func); } + /// ELF e_header flags + unsigned getELFHeaderEFlags() const {return ELFHeaderEFlags;} + void setELFHeaderEFlags(unsigned Flags) { ELFHeaderEFlags = Flags;} + public: /// Construct a new assembler instance.
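A consequence of splitting MCEncodedFragmentWithFixups out of MCEncodedFragment above: code that walks fixups should first test for the fixup-bearing subclass, since MCCompactEncodedInstFragment deliberately carries none. A hedged sketch (assuming MCFragment *F taken from a section's fragment list):

  if (MCEncodedFragmentWithFixups *EF =
          dyn_cast<MCEncodedFragmentWithFixups>(F)) {
    // Only FT_Data and FT_Relaxable fragments get here.
    for (MCEncodedFragmentWithFixups::fixup_iterator I = EF->fixup_begin(),
                                                     E = EF->fixup_end();
         I != E; ++I) {
      // ... process the MCFixup *I ...
    }
  }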
/// @@ -1004,6 +1084,14 @@ public: size_t indirect_symbol_size() const { return IndirectSymbols.size(); } /// @} + /// @name Linker Option List Access + /// @{ + + std::vector<std::vector<std::string> > &getLinkerOptions() { + return LinkerOptions; + } + + /// @} /// @name Data Region List Access /// @{ diff --git a/include/llvm/MC/MCContext.h b/include/llvm/MC/MCContext.h index e92d3b9..20d52eb 100644 --- a/include/llvm/MC/MCContext.h +++ b/include/llvm/MC/MCContext.h @@ -129,6 +129,10 @@ namespace llvm { /// non-empty. StringRef DwarfDebugFlags; + /// The string to embed in as the dwarf AT_producer for the compile unit, if + /// non-empty. + StringRef DwarfDebugProducer; + /// Honor temporary labels, this is useful for debugging semantic /// differences between temporary and non-temporary labels (primarily on /// Darwin). @@ -140,6 +144,10 @@ namespace llvm { /// We need a deterministic iteration order, so we remember the order /// the elements were added. std::vector<const MCSection *> MCLineSectionOrder; + /// The Compile Unit ID that we are currently processing. + unsigned DwarfCompileUnitID; + /// The line table start symbol for each Compile Unit. + DenseMap<unsigned, MCSymbol *> MCLineTableSymbols; void *MachOUniquingMap, *ELFUniquingMap, *COFFUniquingMap; @@ -300,6 +308,25 @@ namespace llvm { MCLineSections[Sec] = Line; MCLineSectionOrder.push_back(Sec); } + unsigned getDwarfCompileUnitID() { + return DwarfCompileUnitID; + } + void setDwarfCompileUnitID(unsigned CUIndex) { + DwarfCompileUnitID = CUIndex; + } + const DenseMap<unsigned, MCSymbol *> &getMCLineTableSymbols() const { + return MCLineTableSymbols; + } + MCSymbol *getMCLineTableSymbol(unsigned ID) const { + DenseMap<unsigned, MCSymbol *>::const_iterator CIter = + MCLineTableSymbols.find(ID); + if (CIter == MCLineTableSymbols.end()) + return NULL; + return CIter->second; + } + void setMCLineTableSymbol(MCSymbol *Sym, unsigned ID) { + MCLineTableSymbols[ID] = Sym; + } /// setCurrentDwarfLoc - saves the information from the currently parsed /// dwarf .loc directive and sets DwarfLocSeen. When the next instruction @@ -346,6 +373,9 @@ namespace llvm { void setDwarfDebugFlags(StringRef S) { DwarfDebugFlags = S; } StringRef getDwarfDebugFlags() { return DwarfDebugFlags; } + void setDwarfDebugProducer(StringRef S) { DwarfDebugProducer = S; } + StringRef getDwarfDebugProducer() { return DwarfDebugProducer; } + /// @} char *getSecureLogFile() { return SecureLogFile; } diff --git a/include/llvm/MC/MCDisassembler.h b/include/llvm/MC/MCDisassembler.h index 1b6beb2..36fbcb0 100644 --- a/include/llvm/MC/MCDisassembler.h +++ b/include/llvm/MC/MCDisassembler.h @@ -6,8 +6,8 @@ // License. See LICENSE.TXT for details. 
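The LinkerOptions vector exposed above gives assembler directives a place to stash option groups for the object writer; on Mach-O they are consumed by the WriteLinkerOptionsLoadCommand added further below. A minimal sketch (assuming MCAssembler &Asm):

  // Each entry is one option group, itself a list of strings.
  std::vector<std::string> Option;
  Option.push_back("-framework");
  Option.push_back("Foundation");
  Asm.getLinkerOptions().push_back(Option);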
// //===----------------------------------------------------------------------===// -#ifndef MCDISASSEMBLER_H -#define MCDISASSEMBLER_H +#ifndef LLVM_MC_MCDISASSEMBLER_H +#define LLVM_MC_MCDISASSEMBLER_H #include "llvm-c/Disassembler.h" #include "llvm/Support/DataTypes.h" diff --git a/include/llvm/MC/MCDwarf.h b/include/llvm/MC/MCDwarf.h index 3160b61..1a392e8 100644 --- a/include/llvm/MC/MCDwarf.h +++ b/include/llvm/MC/MCDwarf.h @@ -19,6 +19,7 @@ #include "llvm/Support/Compiler.h" #include "llvm/Support/Dwarf.h" #include "llvm/Support/raw_ostream.h" +#include <map> #include <vector> namespace llvm { @@ -186,29 +187,43 @@ namespace llvm { MCLineSection() {} // addLineEntry - adds an entry to this MCLineSection's line entries - void addLineEntry(const MCLineEntry &LineEntry) { - MCLineEntries.push_back(LineEntry); + void addLineEntry(const MCLineEntry &LineEntry, unsigned CUID) { + MCLineDivisions[CUID].push_back(LineEntry); } typedef std::vector<MCLineEntry> MCLineEntryCollection; typedef MCLineEntryCollection::iterator iterator; typedef MCLineEntryCollection::const_iterator const_iterator; + typedef std::map<unsigned, MCLineEntryCollection> MCLineDivisionMap; private: - MCLineEntryCollection MCLineEntries; + // A collection of MCLineEntry for each Compile Unit ID. + MCLineDivisionMap MCLineDivisions; public: - const MCLineEntryCollection *getMCLineEntries() const { - return &MCLineEntries; + // Returns whether MCLineSection contains entries for a given Compile + // Unit ID. + bool containEntriesForID(unsigned CUID) const { + return MCLineDivisions.count(CUID); + } + // Returns the collection of MCLineEntry for a given Compile Unit ID. + const MCLineEntryCollection &getMCLineEntries(unsigned CUID) const { + MCLineDivisionMap::const_iterator CIter = MCLineDivisions.find(CUID); + assert(CIter != MCLineDivisions.end()); + return CIter->second; } }; class MCDwarfFileTable { public: // - // This emits the Dwarf file and the line tables. + // This emits the Dwarf file and the line tables for all Compile Units. // static const MCSymbol *Emit(MCStreamer *MCOS); + // + // This emits the Dwarf file and the line tables for a given Compile Unit. 
+ // + static const MCSymbol *EmitCU(MCStreamer *MCOS, unsigned ID); }; class MCDwarfLineAddr { diff --git a/include/llvm/MC/MCELF.h b/include/llvm/MC/MCELF.h index e08f1e6..7e59911 100644 --- a/include/llvm/MC/MCELF.h +++ b/include/llvm/MC/MCELF.h @@ -28,6 +28,8 @@ class MCELF { static unsigned GetType(const MCSymbolData &SD); static void SetVisibility(MCSymbolData &SD, unsigned Visibility); static unsigned GetVisibility(MCSymbolData &SD); + static void setOther(MCSymbolData &SD, unsigned Other); + static unsigned getOther(MCSymbolData &SD); }; } diff --git a/include/llvm/MC/MCELFObjectWriter.h b/include/llvm/MC/MCELFObjectWriter.h index 38cdc72..a59776d 100644 --- a/include/llvm/MC/MCELFObjectWriter.h +++ b/include/llvm/MC/MCELFObjectWriter.h @@ -79,7 +79,6 @@ public: virtual unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup, bool IsPCRel, bool IsRelocWithSymbol, int64_t Addend) const = 0; - virtual unsigned getEFlags() const; virtual const MCSymbol *ExplicitRelSym(const MCAssembler &Asm, const MCValue &Target, const MCFragment &F, diff --git a/include/llvm/MC/MCELFStreamer.h b/include/llvm/MC/MCELFStreamer.h index ab20ee8..6fb2d22 100644 --- a/include/llvm/MC/MCELFStreamer.h +++ b/include/llvm/MC/MCELFStreamer.h @@ -28,15 +28,20 @@ class MCSymbolData; class raw_ostream; class MCELFStreamer : public MCObjectStreamer { -public: - MCELFStreamer(MCContext &Context, MCAsmBackend &TAB, +protected: + MCELFStreamer(StreamerKind Kind, MCContext &Context, MCAsmBackend &TAB, raw_ostream &OS, MCCodeEmitter *Emitter) - : MCObjectStreamer(Context, TAB, OS, Emitter) {} + : MCObjectStreamer(Kind, Context, TAB, OS, Emitter) {} + +public: + MCELFStreamer(MCContext &Context, MCAsmBackend &TAB, raw_ostream &OS, + MCCodeEmitter *Emitter) + : MCObjectStreamer(SK_ELFStreamer, Context, TAB, OS, Emitter) {} - MCELFStreamer(MCContext &Context, MCAsmBackend &TAB, - raw_ostream &OS, MCCodeEmitter *Emitter, - MCAssembler *Assembler) - : MCObjectStreamer(Context, TAB, OS, Emitter, Assembler) {} + MCELFStreamer(MCContext &Context, MCAsmBackend &TAB, raw_ostream &OS, + MCCodeEmitter *Emitter, MCAssembler *Assembler) + : MCObjectStreamer(SK_ELFStreamer, Context, TAB, OS, Emitter, + Assembler) {} virtual ~MCELFStreamer(); @@ -44,6 +49,7 @@ public: /// @{ virtual void InitSections(); + virtual void InitToTextSection(); virtual void ChangeSection(const MCSection *Section); virtual void EmitLabel(MCSymbol *Symbol); virtual void EmitDebugLabel(MCSymbol *Symbol); @@ -59,6 +65,8 @@ public: virtual void EmitCOFFSymbolType(int Type); virtual void EndCOFFSymbolDef(); + virtual MCSymbolData &getOrCreateSymbolData(MCSymbol *Symbol); + virtual void EmitELFSize(MCSymbol *Symbol, const MCExpr *Value); virtual void EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size, @@ -80,6 +88,10 @@ public: virtual void FinishImpl(); /// @} + static bool classof(const MCStreamer *S) { + return S->getKind() == SK_ELFStreamer || S->getKind() == SK_ARMELFStreamer; + } + private: virtual void EmitInstToFragment(const MCInst &Inst); virtual void EmitInstToData(const MCInst &Inst); diff --git a/include/llvm/MC/MCExpr.h b/include/llvm/MC/MCExpr.h index 475981f..b5bfed1 100644 --- a/include/llvm/MC/MCExpr.h +++ b/include/llvm/MC/MCExpr.h @@ -472,6 +472,8 @@ public: virtual void AddValueSymbols(MCAssembler *) const = 0; virtual const MCSection *FindAssociatedSection() const = 0; + virtual void fixELFSymbolsInTLSFixups(MCAssembler &) const = 0; + static bool classof(const MCExpr *E) { return E->getKind() == MCExpr::Target; } diff --git 
a/include/llvm/MC/MCFixedLenDisassembler.h b/include/llvm/MC/MCFixedLenDisassembler.h index 22b3c32..ad99943 100644 --- a/include/llvm/MC/MCFixedLenDisassembler.h +++ b/include/llvm/MC/MCFixedLenDisassembler.h @@ -8,8 +8,8 @@ //===----------------------------------------------------------------------===// // Fixed length disassembler decoder state machine driver. //===----------------------------------------------------------------------===// -#ifndef MCFIXEDLENDISASSEMBLER_H -#define MCFIXEDLENDISASSEMBLER_H +#ifndef LLVM_MC_MCFIXEDLENDISASSEMBLER_H +#define LLVM_MC_MCFIXEDLENDISASSEMBLER_H namespace llvm { diff --git a/include/llvm/MC/MCMachObjectWriter.h b/include/llvm/MC/MCMachObjectWriter.h index 3cd278e..3c9a588 100644 --- a/include/llvm/MC/MCMachObjectWriter.h +++ b/include/llvm/MC/MCMachObjectWriter.h @@ -196,6 +196,8 @@ public: void WriteLinkeditLoadCommand(uint32_t Type, uint32_t DataOffset, uint32_t DataSize); + void WriteLinkerOptionsLoadCommand(const std::vector<std::string> &Options); + // FIXME: We really need to improve the relocation validation. Basically, we // want to implement a separate computation which evaluates the relocation // entry as the linker would, and verifies that the resultant fixup value is diff --git a/include/llvm/MC/MCObjectFileInfo.h b/include/llvm/MC/MCObjectFileInfo.h index c8444fd..c8d7484 100644 --- a/include/llvm/MC/MCObjectFileInfo.h +++ b/include/llvm/MC/MCObjectFileInfo.h @@ -97,6 +97,9 @@ protected: const MCSection *DwarfARangesSection; const MCSection *DwarfRangesSection; const MCSection *DwarfMacroInfoSection; + // The pubnames section is no longer generated by default. The generation + // can be enabled by a compiler flag. + const MCSection *DwarfPubNamesSection; // DWARF5 Experimental Debug Info Sections /// DwarfAccelNamesSection, DwarfAccelObjCSection, @@ -115,6 +118,7 @@ protected: const MCSection *DwarfLineDWOSection; const MCSection *DwarfLocDWOSection; const MCSection *DwarfStrOffDWOSection; + const MCSection *DwarfAddrSection; // Extra TLS Variable Data section. If the target needs to put additional // information for a TLS variable, it'll go here. 
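A sketch of how a DWARF emitter might use the sections newly exposed above (assuming MCStreamer &Streamer and a populated MCObjectFileInfo *MOFI; EmitPubNames is a hypothetical flag, reflecting that pubnames generation is now opt-in):

  bool EmitPubNames = false; // hypothetical opt-in compiler flag
  if (EmitPubNames)
    Streamer.SwitchSection(MOFI->getDwarfPubNamesSection());
  // The split-DWARF address table gets its own section.
  Streamer.SwitchSection(MOFI->getDwarfAddrSection());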
@@ -208,6 +212,7 @@ public: const MCSection *getDwarfInfoSection() const { return DwarfInfoSection; } const MCSection *getDwarfLineSection() const { return DwarfLineSection; } const MCSection *getDwarfFrameSection() const { return DwarfFrameSection; } + const MCSection *getDwarfPubNamesSection() const{return DwarfPubNamesSection;} const MCSection *getDwarfPubTypesSection() const{return DwarfPubTypesSection;} const MCSection *getDwarfDebugInlineSection() const { return DwarfDebugInlineSection; @@ -251,6 +256,9 @@ public: const MCSection *getDwarfStrOffDWOSection() const { return DwarfStrOffDWOSection; } + const MCSection *getDwarfAddrSection() const { + return DwarfAddrSection; + } const MCSection *getTLSExtraDataSection() const { return TLSExtraDataSection; diff --git a/include/llvm/MC/MCObjectStreamer.h b/include/llvm/MC/MCObjectStreamer.h index 0ece092..f06c49f 100644 --- a/include/llvm/MC/MCObjectStreamer.h +++ b/include/llvm/MC/MCObjectStreamer.h @@ -38,9 +38,9 @@ class MCObjectStreamer : public MCStreamer { virtual void EmitCFIEndProcImpl(MCDwarfFrameInfo &Frame); protected: - MCObjectStreamer(MCContext &Context, MCAsmBackend &TAB, + MCObjectStreamer(StreamerKind Kind, MCContext &Context, MCAsmBackend &TAB, raw_ostream &_OS, MCCodeEmitter *_Emitter); - MCObjectStreamer(MCContext &Context, MCAsmBackend &TAB, + MCObjectStreamer(StreamerKind Kind, MCContext &Context, MCAsmBackend &TAB, raw_ostream &_OS, MCCodeEmitter *_Emitter, MCAssembler *_Assembler); ~MCObjectStreamer(); @@ -86,7 +86,7 @@ public: virtual void EmitBundleAlignMode(unsigned AlignPow2); virtual void EmitBundleLock(bool AlignToEnd); virtual void EmitBundleUnlock(); - virtual void EmitBytes(StringRef Data, unsigned AddrSpace); + virtual void EmitBytes(StringRef Data, unsigned AddrSpace = 0); virtual void EmitValueToAlignment(unsigned ByteAlignment, int64_t Value = 0, unsigned ValueSize = 1, @@ -103,10 +103,14 @@ public: virtual void EmitGPRel32Value(const MCExpr *Value); virtual void EmitGPRel64Value(const MCExpr *Value); virtual void EmitFill(uint64_t NumBytes, uint8_t FillValue, - unsigned AddrSpace); + unsigned AddrSpace = 0); virtual void FinishImpl(); /// @} + + static bool classof(const MCStreamer *S) { + return S->getKind() >= SK_ELFStreamer && S->getKind() <= SK_WinCOFFStreamer; + } }; } // end namespace llvm diff --git a/include/llvm/MC/MCObjectWriter.h b/include/llvm/MC/MCObjectWriter.h index 9d5c1a7..4939a3f 100644 --- a/include/llvm/MC/MCObjectWriter.h +++ b/include/llvm/MC/MCObjectWriter.h @@ -10,6 +10,7 @@ #ifndef LLVM_MC_MCOBJECTWRITER_H #define LLVM_MC_MCOBJECTWRITER_H +#include "llvm/ADT/SmallVector.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/DataTypes.h" #include "llvm/Support/raw_ostream.h" @@ -61,15 +62,15 @@ public: /// @name High-Level API /// @{ - /// Perform any late binding of symbols (for example, to assign symbol indices - /// for use when generating relocations). + /// \brief Perform any late binding of symbols (for example, to assign symbol + /// indices for use when generating relocations). /// /// This routine is called by the assembler after layout and relaxation is /// complete. virtual void ExecutePostLayoutBinding(MCAssembler &Asm, const MCAsmLayout &Layout) = 0; - /// Record a relocation entry. + /// \brief Record a relocation entry. /// /// This routine is called by the assembler after layout and relaxation, and /// post layout binding. The implementation is responsible for storing @@ -99,8 +100,7 @@ public: bool InSet, bool IsPCRel) const; - - /// Write the object file. 
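The classof() added above compares the streamer's kind tag (the StreamerKind enum itself is introduced in the MCStreamer.h hunk further down) against the contiguous SK_ELFStreamer..SK_WinCOFFStreamer range; that is what lets llvm::isa<> and llvm::dyn_cast<> dispatch on streamers without C++ RTTI. A minimal sketch of the idiom, with an invented function around APIs shown in this patch:

    #include "llvm/MC/MCObjectStreamer.h"
    #include "llvm/Support/Casting.h"
    using namespace llvm;

    void setBundleAlignmentIfObjectStreamer(MCStreamer *S) {
      // dyn_cast<> invokes MCObjectStreamer::classof(S), i.e. the range check
      // on S->getKind(): any object streamer (ELF, ARM ELF, Mach-O, COFF, ...)
      // matches, while e.g. the textual asm streamer does not.
      if (MCObjectStreamer *OS = dyn_cast<MCObjectStreamer>(S))
        OS->EmitBundleAlignMode(4); // object-streamer-only API, shown above
    }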
+ /// \brief Write the object file. /// /// This routine is called by the assembler after layout and relaxation is /// complete, fixups have been evaluated and applied, and relocations @@ -176,7 +176,7 @@ public: OS << StringRef(Zeros, N % 16); } - void WriteBytes(SmallVectorImpl<char> &ByteVec, unsigned ZeroFillSize = 0) { + void WriteBytes(const SmallVectorImpl<char> &ByteVec, unsigned ZeroFillSize = 0) { WriteBytes(StringRef(ByteVec.data(), ByteVec.size()), ZeroFillSize); } diff --git a/include/llvm/MC/MCParser/AsmCond.h b/include/llvm/MC/MCParser/AsmCond.h index 92a115e..a918b56 100644 --- a/include/llvm/MC/MCParser/AsmCond.h +++ b/include/llvm/MC/MCParser/AsmCond.h @@ -7,8 +7,8 @@ // //===----------------------------------------------------------------------===// -#ifndef ASMCOND_H -#define ASMCOND_H +#ifndef LLVM_MC_MCPARSER_ASMCOND_H +#define LLVM_MC_MCPARSER_ASMCOND_H namespace llvm { diff --git a/include/llvm/MC/MCParser/AsmLexer.h b/include/llvm/MC/MCParser/AsmLexer.h index e102dfb..0dab314 100644 --- a/include/llvm/MC/MCParser/AsmLexer.h +++ b/include/llvm/MC/MCParser/AsmLexer.h @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef ASMLEXER_H -#define ASMLEXER_H +#ifndef LLVM_MC_MCPARSER_ASMLEXER_H +#define LLVM_MC_MCPARSER_ASMLEXER_H #include "llvm/ADT/StringRef.h" #include "llvm/MC/MCParser/MCAsmLexer.h" diff --git a/include/llvm/MC/MCParser/MCAsmLexer.h b/include/llvm/MC/MCParser/MCAsmLexer.h index 37a69e2..53b380f 100644 --- a/include/llvm/MC/MCParser/MCAsmLexer.h +++ b/include/llvm/MC/MCParser/MCAsmLexer.h @@ -7,8 +7,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_MC_MCASMLEXER_H -#define LLVM_MC_MCASMLEXER_H +#ifndef LLVM_MC_MCPARSER_MCASMLEXER_H +#define LLVM_MC_MCPARSER_MCASMLEXER_H #include "llvm/ADT/StringRef.h" #include "llvm/Support/Compiler.h" diff --git a/include/llvm/MC/MCParser/MCAsmParser.h b/include/llvm/MC/MCParser/MCAsmParser.h index b9490fa..d7e3902 100644 --- a/include/llvm/MC/MCParser/MCAsmParser.h +++ b/include/llvm/MC/MCParser/MCAsmParser.h @@ -7,14 +7,15 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_MC_MCASMPARSER_H -#define LLVM_MC_MCASMPARSER_H +#ifndef LLVM_MC_MCPARSER_MCASMPARSER_H +#define LLVM_MC_MCPARSER_MCASMPARSER_H #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/MC/MCParser/AsmLexer.h" #include "llvm/Support/DataTypes.h" namespace llvm { -class AsmToken; class MCAsmInfo; class MCAsmLexer; class MCAsmParserExtension; @@ -22,13 +23,11 @@ class MCContext; class MCExpr; class MCInstPrinter; class MCInstrInfo; -class MCParsedAsmOperand; class MCStreamer; class MCTargetAsmParser; class SMLoc; class SMRange; class SourceMgr; -class StringRef; class Twine; /// MCAsmParserSemaCallback - Generic Sema callback for assembly parser. @@ -36,16 +35,21 @@ class MCAsmParserSemaCallback { public: virtual ~MCAsmParserSemaCallback(); virtual void *LookupInlineAsmIdentifier(StringRef Name, void *Loc, - unsigned &Size) = 0; + unsigned &Length, unsigned &Size, + unsigned &Type, bool &IsVarDecl) = 0; + virtual bool LookupInlineAsmField(StringRef Base, StringRef Member, unsigned &Offset) = 0; }; + /// MCAsmParser - Generic assembler parser interface, for use by target specific /// assembly parsers. 
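The widened LookupInlineAsmIdentifier() in the hunk above now reports the identifier's token length, the object's size and type, and whether it names a variable declaration; the real implementation lives in the front end's Sema. A sketch of what an override might look like, with every returned value a placeholder:

    #include "llvm/ADT/StringRef.h"
    #include "llvm/MC/MCParser/MCAsmParser.h"
    using namespace llvm;

    class SemaCallbackSketch : public MCAsmParserSemaCallback {
    public:
      void *LookupInlineAsmIdentifier(StringRef Name, void *Loc,
                                      unsigned &Length, unsigned &Size,
                                      unsigned &Type, bool &IsVarDecl) {
        Length = Name.size(); // placeholder: extent of the identifier token
        Size = 4;             // placeholder: byte size of the named object
        Type = 0;             // placeholder: front-end type code
        IsVarDecl = true;     // placeholder: Name refers to a variable
        return 0;             // placeholder: opaque declaration handle
      }
      bool LookupInlineAsmField(StringRef Base, StringRef Member,
                                unsigned &Offset) {
        Offset = 0;           // placeholder struct-field offset
        return true;          // treat every lookup as failed in this sketch
      }
    };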
class MCAsmParser { public: typedef bool (*DirectiveHandler)(MCAsmParserExtension*, StringRef, SMLoc); + typedef std::pair<MCAsmParserExtension*, DirectiveHandler> + ExtensionDirectiveHandler; private: MCAsmParser(const MCAsmParser &) LLVM_DELETED_FUNCTION; @@ -61,9 +65,8 @@ protected: // Can only create subclasses. public: virtual ~MCAsmParser(); - virtual void AddDirectiveHandler(MCAsmParserExtension *Object, - StringRef Directive, - DirectiveHandler Handler) = 0; + virtual void addDirectiveHandler(StringRef Directive, + ExtensionDirectiveHandler Handler) = 0; virtual SourceMgr &getSourceManager() = 0; @@ -89,8 +92,8 @@ public: virtual void setParsingInlineAsm(bool V) = 0; virtual bool isParsingInlineAsm() = 0; - /// ParseMSInlineAsm - Parse ms-style inline assembly. - virtual bool ParseMSInlineAsm(void *AsmLoc, std::string &AsmString, + /// parseMSInlineAsm - Parse ms-style inline assembly. + virtual bool parseMSInlineAsm(void *AsmLoc, std::string &AsmString, unsigned &NumOutputs, unsigned &NumInputs, SmallVectorImpl<std::pair<void *, bool> > &OpDecls, SmallVectorImpl<std::string> &Constraints, @@ -123,42 +126,50 @@ public: bool TokError(const Twine &Msg, ArrayRef<SMRange> Ranges = ArrayRef<SMRange>()); - /// ParseIdentifier - Parse an identifier or string (as a quoted identifier) + /// parseIdentifier - Parse an identifier or string (as a quoted identifier) /// and set \p Res to the identifier contents. - virtual bool ParseIdentifier(StringRef &Res) = 0; + virtual bool parseIdentifier(StringRef &Res) = 0; /// \brief Parse up to the end of statement and return the contents from the /// current token until the end of the statement; the current token on exit /// will be either the EndOfStatement or EOF. - virtual StringRef ParseStringToEndOfStatement() = 0; + virtual StringRef parseStringToEndOfStatement() = 0; - /// EatToEndOfStatement - Skip to the end of the current statement, for error + /// parseEscapedString - Parse the current token as a string which may include + /// escaped characters and return the string contents. + virtual bool parseEscapedString(std::string &Data) = 0; + + /// eatToEndOfStatement - Skip to the end of the current statement, for error /// recovery. - virtual void EatToEndOfStatement() = 0; + virtual void eatToEndOfStatement() = 0; - /// ParseExpression - Parse an arbitrary expression. + /// parseExpression - Parse an arbitrary expression. /// /// @param Res - The value of the expression. The result is undefined /// on error. /// @result - False on success. - virtual bool ParseExpression(const MCExpr *&Res, SMLoc &EndLoc) = 0; - bool ParseExpression(const MCExpr *&Res); + virtual bool parseExpression(const MCExpr *&Res, SMLoc &EndLoc) = 0; + bool parseExpression(const MCExpr *&Res); - /// ParseParenExpression - Parse an arbitrary expression, assuming that an + /// parseParenExpression - Parse an arbitrary expression, assuming that an /// initial '(' has already been consumed. /// /// @param Res - The value of the expression. The result is undefined /// on error. /// @result - False on success. - virtual bool ParseParenExpression(const MCExpr *&Res, SMLoc &EndLoc) = 0; + virtual bool parseParenExpression(const MCExpr *&Res, SMLoc &EndLoc) = 0; - /// ParseAbsoluteExpression - Parse an expression which must evaluate to an + /// parseAbsoluteExpression - Parse an expression which must evaluate to an /// absolute value. /// /// @param Res - The value of the absolute expression. The result is undefined /// on error. /// @result - False on success. 
- virtual bool ParseAbsoluteExpression(int64_t &Res) = 0; + virtual bool parseAbsoluteExpression(int64_t &Res) = 0; + + /// checkForValidSection - Ensure that we have a valid section set in the + /// streamer. Otherwise, report an error and switch to .text. + virtual void checkForValidSection() = 0; }; /// \brief Create an MCAsmParser instance. diff --git a/include/llvm/MC/MCParser/MCAsmParserExtension.h b/include/llvm/MC/MCParser/MCAsmParserExtension.h index 84b33b5..2eda3a9 100644 --- a/include/llvm/MC/MCParser/MCAsmParserExtension.h +++ b/include/llvm/MC/MCParser/MCAsmParserExtension.h @@ -7,8 +7,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_MC_MCASMPARSEREXTENSION_H -#define LLVM_MC_MCASMPARSEREXTENSION_H +#ifndef LLVM_MC_MCPARSER_MCASMPARSEREXTENSION_H +#define LLVM_MC_MCPARSER_MCASMPARSEREXTENSION_H #include "llvm/ADT/StringRef.h" #include "llvm/MC/MCParser/MCAsmParser.h" diff --git a/include/llvm/MC/MCParser/MCParsedAsmOperand.h b/include/llvm/MC/MCParser/MCParsedAsmOperand.h index 60e7887..c78cd97 100644 --- a/include/llvm/MC/MCParser/MCParsedAsmOperand.h +++ b/include/llvm/MC/MCParser/MCParsedAsmOperand.h @@ -7,8 +7,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_MC_MCASMOPERAND_H -#define LLVM_MC_MCASMOPERAND_H +#ifndef LLVM_MC_MCPARSER_MCPARSEDASMOPERAND_H +#define LLVM_MC_MCPARSER_MCPARSEDASMOPERAND_H namespace llvm { class SMLoc; @@ -70,6 +70,10 @@ public: /// care of the rewrites. Only valid when parsing MS-style inline assembly. virtual bool needAsmRewrite() const { return true; } + /// needAddressOf - Do we need to emit code to get the address of the + /// variable/label? Only valid when parsing MS-style inline assembly. + virtual bool needAddressOf() const { return false; } + /// isOffsetOf - Do we need to emit code to get the offset of the variable, /// rather than the value of the variable? Only valid when parsing MS-style /// inline assembly. diff --git a/include/llvm/MC/MCSchedule.h b/include/llvm/MC/MCSchedule.h index 0c71ee5..defa299 100644 --- a/include/llvm/MC/MCSchedule.h +++ b/include/llvm/MC/MCSchedule.h @@ -12,8 +12,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_MC_MCSCHEDMODEL_H -#define LLVM_MC_MCSCHEDMODEL_H +#ifndef LLVM_MC_MCSCHEDULE_H +#define LLVM_MC_MCSCHEDULE_H #include "llvm/Support/DataTypes.h" #include <cassert> @@ -155,7 +155,7 @@ public: // Optional InstrItinerary OperandCycles provides expected latency. // TODO: can't yet specify both min and expected latency per operand. int MinLatency; - static const unsigned DefaultMinLatency = -1; + static const int DefaultMinLatency = -1; // LoadLatency is the expected latency of load instructions. // @@ -172,6 +172,16 @@ public: unsigned HighLatency; static const unsigned DefaultHighLatency = 10; + // ILPWindow is the number of cycles that the scheduler effectively ignores + // before attempting to hide latency. This should be zero for in-order cpus to + // always hide expected latency. For out-of-order cpus, it may be tweaked as + // desired to roughly approximate instruction buffers. The actual threshold is + // not very important for an OOO processor, as long as it isn't too high. A + // nonzero value helps avoid rescheduling to hide latency when it is fairly + // obviously useless and makes register pressure heuristics more effective. 
+ unsigned ILPWindow; + static const unsigned DefaultILPWindow = 0; + // MispredictPenalty is the typical number of extra cycles the processor // takes to recover from a branch misprediction. unsigned MispredictPenalty; @@ -196,6 +206,7 @@ public: MinLatency(DefaultMinLatency), LoadLatency(DefaultLoadLatency), HighLatency(DefaultHighLatency), + ILPWindow(DefaultILPWindow), MispredictPenalty(DefaultMispredictPenalty), ProcID(0), ProcResourceTable(0), SchedClassTable(0), NumProcResourceKinds(0), NumSchedClasses(0), @@ -205,12 +216,12 @@ public: } // Table-gen driven ctor. - MCSchedModel(unsigned iw, int ml, unsigned ll, unsigned hl, unsigned mp, - unsigned pi, const MCProcResourceDesc *pr, + MCSchedModel(unsigned iw, int ml, unsigned ll, unsigned hl, unsigned ilp, + unsigned mp, unsigned pi, const MCProcResourceDesc *pr, const MCSchedClassDesc *sc, unsigned npr, unsigned nsc, const InstrItinerary *ii): IssueWidth(iw), MinLatency(ml), LoadLatency(ll), HighLatency(hl), - MispredictPenalty(mp), ProcID(pi), ProcResourceTable(pr), + ILPWindow(ilp), MispredictPenalty(mp), ProcID(pi), ProcResourceTable(pr), SchedClassTable(sc), NumProcResourceKinds(npr), NumSchedClasses(nsc), InstrItineraries(ii) {} diff --git a/include/llvm/MC/MCStreamer.h b/include/llvm/MC/MCStreamer.h index 05a33c5..d247066 100644 --- a/include/llvm/MC/MCStreamer.h +++ b/include/llvm/MC/MCStreamer.h @@ -16,10 +16,12 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCDirectives.h" #include "llvm/MC/MCDwarf.h" #include "llvm/MC/MCWin64EH.h" #include "llvm/Support/DataTypes.h" +#include <string> namespace llvm { class MCAsmBackend; @@ -45,6 +47,23 @@ namespace llvm { /// a .s file, and implementations that write out .o files of various formats. /// class MCStreamer { + public: + enum StreamerKind { + SK_AsmStreamer, + SK_NullStreamer, + SK_RecordStreamer, + + // MCObjectStreamer subclasses. + SK_ELFStreamer, + SK_ARMELFStreamer, + SK_MachOStreamer, + SK_PureStreamer, + SK_MipsELFStreamer, + SK_WinCOFFStreamer + }; + + private: + const StreamerKind Kind; MCContext &Context; MCStreamer(const MCStreamer&) LLVM_DELETED_FUNCTION; @@ -73,7 +92,7 @@ namespace llvm { bool AutoInitSections; protected: - MCStreamer(MCContext &Ctx); + MCStreamer(StreamerKind Kind, MCContext &Ctx); const MCExpr *BuildSymbolDiff(MCContext &Context, const MCSymbol *A, const MCSymbol *B); @@ -92,6 +111,8 @@ namespace llvm { public: virtual ~MCStreamer(); + StreamerKind getKind() const { return Kind; } + /// State management /// virtual void reset(); @@ -234,6 +255,9 @@ namespace llvm { /// InitSections - Create the default sections and set the initial one. virtual void InitSections() = 0; + /// InitToTextSection - Create a text section and switch the streamer to it. + virtual void InitToTextSection() = 0; + /// EmitLabel - Emit a label for @p Symbol into the current section. /// /// This corresponds to an assembler statement such as: @@ -252,6 +276,10 @@ namespace llvm { /// EmitAssemblerFlag - Note in the output the specified @p Flag. virtual void EmitAssemblerFlag(MCAssemblerFlag Flag) = 0; + /// EmitLinkerOptions - Emit the given list @p Options of strings as linker + /// options into the output. + virtual void EmitLinkerOptions(ArrayRef<std::string> Kind) {} + /// EmitDataRegion - Note in the output the specified region @p Kind. virtual void EmitDataRegion(MCDataRegionType Kind) {} @@ -259,6 +287,9 @@ namespace llvm { /// a Thumb mode function (ARM target only). 
virtual void EmitThumbFunc(MCSymbol *Func) = 0; + /// getOrCreateSymbolData - Get symbol data for given symbol. + virtual MCSymbolData &getOrCreateSymbolData(MCSymbol *Symbol); + /// EmitAssignment - Emit an assignment of @p Value to @p Symbol. /// /// This corresponds to an assembler statement such as: @@ -366,7 +397,7 @@ namespace llvm { /// /// This is used to implement assembler directives such as .byte, .ascii, /// etc. - virtual void EmitBytes(StringRef Data, unsigned AddrSpace) = 0; + virtual void EmitBytes(StringRef Data, unsigned AddrSpace = 0) = 0; /// EmitValue - Emit the expression @p Value into the output as a native /// integer of the given @p Size bytes. @@ -400,8 +431,8 @@ namespace llvm { /// EmitULEB128Value - Special case of EmitULEB128Value that avoids the /// client having to pass in a MCExpr for constant integers. - void EmitULEB128IntValue(uint64_t Value, unsigned AddrSpace = 0, - unsigned Padding = 0); + void EmitULEB128IntValue(uint64_t Value, unsigned Padding = 0, + unsigned AddrSpace = 0); /// EmitSLEB128Value - Special case of EmitSLEB128Value that avoids the /// client having to pass in a MCExpr for constant integers. @@ -429,11 +460,11 @@ namespace llvm { /// EmitFill - Emit NumBytes bytes worth of the value specified by /// FillValue. This implements directives such as '.space'. virtual void EmitFill(uint64_t NumBytes, uint8_t FillValue, - unsigned AddrSpace); + unsigned AddrSpace = 0); /// EmitZeros - Emit NumBytes worth of zeros. This is a convenience /// function that just wraps EmitFill. - void EmitZeros(uint64_t NumBytes, unsigned AddrSpace) { + void EmitZeros(uint64_t NumBytes, unsigned AddrSpace = 0) { EmitFill(NumBytes, 0, AddrSpace); } diff --git a/include/llvm/MC/MCTargetAsmParser.h b/include/llvm/MC/MCTargetAsmParser.h index 483a80b..4c5b176 100644 --- a/include/llvm/MC/MCTargetAsmParser.h +++ b/include/llvm/MC/MCTargetAsmParser.h @@ -22,6 +22,7 @@ class MCInst; template <typename T> class SmallVectorImpl; enum AsmRewriteKind { + AOK_Align, // Rewrite align as .align. AOK_DotOperator, // Rewrite a dot operator expression as an immediate. // E.g., [eax].foo.bar -> [eax].8 AOK_Emit, // Rewrite _emit as .byte. @@ -142,6 +143,15 @@ public: MCStreamer &Out, unsigned &ErrorInfo, bool MatchingInlineAsm) = 0; + /// Allow a target to add special case operand matching for things that + /// tblgen doesn't/can't handle effectively. For example, literal + /// immediates on ARM. TableGen expects a token operand, but the parser + /// will recognize them as immediates. + virtual unsigned validateTargetOperandClass(MCParsedAsmOperand *Op, + unsigned Kind) { + return Match_InvalidOperand; + } + /// checkTargetMatchPredicate - Validate the instruction match against /// any complex target predicates not expressible via match classes. 
virtual unsigned checkTargetMatchPredicate(MCInst &Inst) { diff --git a/include/llvm/Object/Archive.h b/include/llvm/Object/Archive.h index 8046efd..e2478f6 100644 --- a/include/llvm/Object/Archive.h +++ b/include/llvm/Object/Archive.h @@ -14,22 +14,78 @@ #ifndef LLVM_OBJECT_ARCHIVE_H #define LLVM_OBJECT_ARCHIVE_H +#include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Twine.h" #include "llvm/Object/Binary.h" #include "llvm/Support/DataTypes.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MemoryBuffer.h" namespace llvm { namespace object { +struct ArchiveMemberHeader { + char Name[16]; + char LastModified[12]; + char UID[6]; + char GID[6]; + char AccessMode[8]; + char Size[10]; ///< Size of data, not including header or padding. + char Terminator[2]; + + ///! Get the name without looking up long names. + llvm::StringRef getName() const { + char EndCond; + if (Name[0] == '/' || Name[0] == '#') + EndCond = ' '; + else + EndCond = '/'; + llvm::StringRef::size_type end = + llvm::StringRef(Name, sizeof(Name)).find(EndCond); + if (end == llvm::StringRef::npos) + end = sizeof(Name); + assert(end <= sizeof(Name) && end > 0); + // Don't include the EndCond if there is one. + return llvm::StringRef(Name, end); + } + + uint64_t getSize() const { + uint64_t ret; + if (llvm::StringRef(Size, sizeof(Size)).rtrim(" ").getAsInteger(10, ret)) + llvm_unreachable("Size is not an integer."); + return ret; + } +}; + +static const ArchiveMemberHeader *ToHeader(const char *base) { + return reinterpret_cast<const ArchiveMemberHeader *>(base); +} class Archive : public Binary { virtual void anchor(); public: class Child { const Archive *Parent; + /// \brief Includes header but not padding byte. StringRef Data; + /// \brief Offset from Data to the start of the file. + uint16_t StartOfFile; public: - Child(const Archive *p, StringRef d) : Parent(p), Data(d) {} + Child(const Archive *p, StringRef d) : Parent(p), Data(d) { + if (!p || d.empty()) + return; + // Setup StartOfFile and PaddingBytes. + StartOfFile = sizeof(ArchiveMemberHeader); + // Don't include attached name. + StringRef Name = ToHeader(Data.data())->getName(); + if (Name.startswith("#1/")) { + uint64_t NameSize; + if (Name.substr(3).rtrim(" ").getAsInteger(10, NameSize)) + llvm_unreachable("Long name length is not an integer"); + StartOfFile += NameSize; + } + } bool operator ==(const Child &other) const { return (Parent == other.Parent) && (Data.begin() == other.Data.begin()); @@ -39,16 +95,48 @@ public: return Data.begin() < other.Data.begin(); } - Child getNext() const; + Child getNext() const { + size_t SpaceToSkip = Data.size(); + // If it's odd, add 1 to make it even. + if (SpaceToSkip & 1) + ++SpaceToSkip; + + const char *NextLoc = Data.data() + SpaceToSkip; + + // Check to see if this is past the end of the archive. + if (NextLoc >= Parent->Data->getBufferEnd()) + return Child(Parent, StringRef(0, 0)); + + size_t NextSize = + sizeof(ArchiveMemberHeader) + ToHeader(NextLoc)->getSize(); + + return Child(Parent, StringRef(NextLoc, NextSize)); + } + error_code getName(StringRef &Result) const; int getLastModified() const; int getUID() const; int getGID() const; int getAccessMode() const; - ///! Return the size of the archive member without the header or padding. - uint64_t getSize() const; + /// \return the size of the archive member without the header or padding. 
+ uint64_t getSize() const { return Data.size() - StartOfFile; } + + StringRef getBuffer() const { + return StringRef(Data.data() + StartOfFile, getSize()); + } + + error_code getMemoryBuffer(OwningPtr<MemoryBuffer> &Result, + bool FullPath = false) const { + StringRef Name; + if (error_code ec = getName(Name)) + return ec; + SmallString<128> Path; + Result.reset(MemoryBuffer::getMemBuffer( + getBuffer(), FullPath ? (Twine(Parent->getFileName()) + "(" + Name + + ")").toStringRef(Path) : Name, false)); + return error_code::success(); + } - MemoryBuffer *getBuffer() const; error_code getAsBinary(OwningPtr<Binary> &Result) const; }; diff --git a/include/llvm/Object/Binary.h b/include/llvm/Object/Binary.h index d555de3..8bbcd8b 100644 --- a/include/llvm/Object/Binary.h +++ b/include/llvm/Object/Binary.h @@ -49,8 +49,8 @@ protected: ID_EndObjects }; - static inline unsigned int getELFType(bool isLittleEndian, bool is64Bits) { - if (isLittleEndian) + static inline unsigned int getELFType(bool isLE, bool is64Bits) { + if (isLE) return is64Bits ? ID_ELF64L : ID_ELF32L; else return is64Bits ? ID_ELF64B : ID_ELF32B; @@ -85,6 +85,10 @@ public: bool isCOFF() const { return TypeID == ID_COFF; } + + bool isLittleEndian() const { + return !(TypeID == ID_ELF32B || TypeID == ID_ELF64B); + } }; /// @brief Create a Binary from Source, autodetecting the file type. diff --git a/include/llvm/Object/ELF.h b/include/llvm/Object/ELF.h index aa26d17..d5ff6f9 100644 --- a/include/llvm/Object/ELF.h +++ b/include/llvm/Object/ELF.h @@ -37,6 +37,13 @@ namespace object { using support::endianness; +template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> +struct ELFType { + static const endianness TargetEndianness = target_endianness; + static const std::size_t MaxAlignment = max_alignment; + static const bool Is64Bits = is64Bits; +}; + template<typename T, int max_align> struct MaximumAlignment { enum {value = AlignOf<T>::Alignment > max_align ? max_align @@ -72,59 +79,59 @@ struct ELFDataTypeTypedefHelperCommon { MaximumAlignment<int64_t, max_alignment>::value> Elf_Sxword; }; -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> +template<class ELFT> struct ELFDataTypeTypedefHelper; /// ELF 32bit types. -template<endianness target_endianness, std::size_t max_alignment> -struct ELFDataTypeTypedefHelper<target_endianness, max_alignment, false> - : ELFDataTypeTypedefHelperCommon<target_endianness, max_alignment> { +template<template<endianness, std::size_t, bool> class ELFT, + endianness TargetEndianness, std::size_t MaxAlign> +struct ELFDataTypeTypedefHelper<ELFT<TargetEndianness, MaxAlign, false> > + : ELFDataTypeTypedefHelperCommon<TargetEndianness, MaxAlign> { typedef uint32_t value_type; typedef support::detail::packed_endian_specific_integral - <value_type, target_endianness, - MaximumAlignment<value_type, max_alignment>::value> Elf_Addr; + <value_type, TargetEndianness, + MaximumAlignment<value_type, MaxAlign>::value> Elf_Addr; typedef support::detail::packed_endian_specific_integral - <value_type, target_endianness, - MaximumAlignment<value_type, max_alignment>::value> Elf_Off; + <value_type, TargetEndianness, + MaximumAlignment<value_type, MaxAlign>::value> Elf_Off; }; /// ELF 64bit types. 
-template<endianness target_endianness, std::size_t max_alignment> -struct ELFDataTypeTypedefHelper<target_endianness, max_alignment, true> - : ELFDataTypeTypedefHelperCommon<target_endianness, max_alignment>{ +template<template<endianness, std::size_t, bool> class ELFT, + endianness TargetEndianness, std::size_t MaxAlign> +struct ELFDataTypeTypedefHelper<ELFT<TargetEndianness, MaxAlign, true> > + : ELFDataTypeTypedefHelperCommon<TargetEndianness, MaxAlign> { typedef uint64_t value_type; typedef support::detail::packed_endian_specific_integral - <value_type, target_endianness, - MaximumAlignment<value_type, max_alignment>::value> Elf_Addr; + <value_type, TargetEndianness, + MaximumAlignment<value_type, MaxAlign>::value> Elf_Addr; typedef support::detail::packed_endian_specific_integral - <value_type, target_endianness, - MaximumAlignment<value_type, max_alignment>::value> Elf_Off; + <value_type, TargetEndianness, + MaximumAlignment<value_type, MaxAlign>::value> Elf_Off; }; // I really don't like doing this, but the alternative is copypasta. -#define LLVM_ELF_IMPORT_TYPES(target_endianness, max_alignment, is64Bits) \ -typedef typename ELFDataTypeTypedefHelper \ - <target_endianness, max_alignment, is64Bits>::Elf_Addr Elf_Addr; \ -typedef typename ELFDataTypeTypedefHelper \ - <target_endianness, max_alignment, is64Bits>::Elf_Off Elf_Off; \ -typedef typename ELFDataTypeTypedefHelper \ - <target_endianness, max_alignment, is64Bits>::Elf_Half Elf_Half; \ -typedef typename ELFDataTypeTypedefHelper \ - <target_endianness, max_alignment, is64Bits>::Elf_Word Elf_Word; \ -typedef typename ELFDataTypeTypedefHelper \ - <target_endianness, max_alignment, is64Bits>::Elf_Sword Elf_Sword; \ -typedef typename ELFDataTypeTypedefHelper \ - <target_endianness, max_alignment, is64Bits>::Elf_Xword Elf_Xword; \ -typedef typename ELFDataTypeTypedefHelper \ - <target_endianness, max_alignment, is64Bits>::Elf_Sxword Elf_Sxword; +#define LLVM_ELF_IMPORT_TYPES(ELFT) \ +typedef typename ELFDataTypeTypedefHelper <ELFT>::Elf_Addr Elf_Addr; \ +typedef typename ELFDataTypeTypedefHelper <ELFT>::Elf_Off Elf_Off; \ +typedef typename ELFDataTypeTypedefHelper <ELFT>::Elf_Half Elf_Half; \ +typedef typename ELFDataTypeTypedefHelper <ELFT>::Elf_Word Elf_Word; \ +typedef typename ELFDataTypeTypedefHelper <ELFT>::Elf_Sword Elf_Sword; \ +typedef typename ELFDataTypeTypedefHelper <ELFT>::Elf_Xword Elf_Xword; \ +typedef typename ELFDataTypeTypedefHelper <ELFT>::Elf_Sxword Elf_Sxword; + +// This is required to get template types into a macro :( +#define LLVM_ELF_COMMA , // Section header. 
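A note on the LLVM_ELF_COMMA trick defined above: the preprocessor splits macro arguments on commas before it knows anything about template angle brackets, so a template-id containing commas cannot be passed to LLVM_ELF_IMPORT_TYPES directly; hiding each comma behind a one-token macro defers it until expansion. A contrived, self-contained illustration (Pair, DECLARE_ALIAS, and COMMA are invented stand-ins):

    template <typename A, typename B> struct Pair {};
    #define DECLARE_ALIAS(T) typedef T Alias // expects exactly one argument
    #define COMMA ,                          // same trick as LLVM_ELF_COMMA

    // DECLARE_ALIAS(Pair<int, bool>);       // error: macro passed 2 arguments
    DECLARE_ALIAS(Pair<int COMMA bool>);     // OK: the comma only reappears
                                             // when the macro body expands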
-template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> +template<class ELFT> struct Elf_Shdr_Base; -template<endianness target_endianness, std::size_t max_alignment> -struct Elf_Shdr_Base<target_endianness, max_alignment, false> { - LLVM_ELF_IMPORT_TYPES(target_endianness, max_alignment, false) +template<template<endianness, std::size_t, bool> class ELFT, + endianness TargetEndianness, std::size_t MaxAlign> +struct Elf_Shdr_Base<ELFT<TargetEndianness, MaxAlign, false> > { + LLVM_ELF_IMPORT_TYPES(ELFT<TargetEndianness LLVM_ELF_COMMA + MaxAlign LLVM_ELF_COMMA false>) Elf_Word sh_name; // Section name (index into string table) Elf_Word sh_type; // Section type (SHT_*) Elf_Word sh_flags; // Section flags (SHF_*) @@ -137,9 +144,11 @@ struct Elf_Shdr_Base<target_endianness, max_alignment, false> { Elf_Word sh_entsize; // Size of records contained within the section }; -template<endianness target_endianness, std::size_t max_alignment> -struct Elf_Shdr_Base<target_endianness, max_alignment, true> { - LLVM_ELF_IMPORT_TYPES(target_endianness, max_alignment, true) +template<template<endianness, std::size_t, bool> class ELFT, + endianness TargetEndianness, std::size_t MaxAlign> +struct Elf_Shdr_Base<ELFT<TargetEndianness, MaxAlign, true> > { + LLVM_ELF_IMPORT_TYPES(ELFT<TargetEndianness LLVM_ELF_COMMA + MaxAlign LLVM_ELF_COMMA true>) Elf_Word sh_name; // Section name (index into string table) Elf_Word sh_type; // Section type (SHT_*) Elf_Xword sh_flags; // Section flags (SHF_*) @@ -152,11 +161,10 @@ struct Elf_Shdr_Base<target_endianness, max_alignment, true> { Elf_Xword sh_entsize; // Size of records contained within the section }; -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -struct Elf_Shdr_Impl - : Elf_Shdr_Base<target_endianness, max_alignment, is64Bits> { - using Elf_Shdr_Base<target_endianness, max_alignment, is64Bits>::sh_entsize; - using Elf_Shdr_Base<target_endianness, max_alignment, is64Bits>::sh_size; +template<class ELFT> +struct Elf_Shdr_Impl : Elf_Shdr_Base<ELFT> { + using Elf_Shdr_Base<ELFT>::sh_entsize; + using Elf_Shdr_Base<ELFT>::sh_size; /// @brief Get the number of entities this section contains if it has any. 
unsigned getEntityCount() const { @@ -166,14 +174,14 @@ struct Elf_Shdr_Impl } }; -template< endianness target_endianness - , std::size_t max_alignment - , bool is64Bits> +template<class ELFT> struct Elf_Sym_Base; -template<endianness target_endianness, std::size_t max_alignment> -struct Elf_Sym_Base<target_endianness, max_alignment, false> { - LLVM_ELF_IMPORT_TYPES(target_endianness, max_alignment, false) +template<template<endianness, std::size_t, bool> class ELFT, + endianness TargetEndianness, std::size_t MaxAlign> +struct Elf_Sym_Base<ELFT<TargetEndianness, MaxAlign, false> > { + LLVM_ELF_IMPORT_TYPES(ELFT<TargetEndianness LLVM_ELF_COMMA + MaxAlign LLVM_ELF_COMMA false>) Elf_Word st_name; // Symbol name (index into string table) Elf_Addr st_value; // Value or address associated with the symbol Elf_Word st_size; // Size of the symbol @@ -182,9 +190,11 @@ struct Elf_Sym_Base<target_endianness, max_alignment, false> { Elf_Half st_shndx; // Which section (header table index) it's defined in }; -template<endianness target_endianness, std::size_t max_alignment> -struct Elf_Sym_Base<target_endianness, max_alignment, true> { - LLVM_ELF_IMPORT_TYPES(target_endianness, max_alignment, true) +template<template<endianness, std::size_t, bool> class ELFT, + endianness TargetEndianness, std::size_t MaxAlign> +struct Elf_Sym_Base<ELFT<TargetEndianness, MaxAlign, true> > { + LLVM_ELF_IMPORT_TYPES(ELFT<TargetEndianness LLVM_ELF_COMMA + MaxAlign LLVM_ELF_COMMA true>) Elf_Word st_name; // Symbol name (index into string table) unsigned char st_info; // Symbol's type and binding attributes unsigned char st_other; // Must be zero; reserved @@ -193,10 +203,9 @@ struct Elf_Sym_Base<target_endianness, max_alignment, true> { Elf_Xword st_size; // Size of the symbol }; -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -struct Elf_Sym_Impl - : Elf_Sym_Base<target_endianness, max_alignment, is64Bits> { - using Elf_Sym_Base<target_endianness, max_alignment, is64Bits>::st_info; +template<class ELFT> +struct Elf_Sym_Impl : Elf_Sym_Base<ELFT> { + using Elf_Sym_Base<ELFT>::st_info; // These accessors and mutators correspond to the ELF32_ST_BIND, // ELF32_ST_TYPE, and ELF32_ST_INFO macros defined in the ELF specification: @@ -211,22 +220,21 @@ struct Elf_Sym_Impl /// Elf_Versym: This is the structure of entries in the SHT_GNU_versym section /// (.gnu.version). This structure is identical for ELF32 and ELF64. -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> +template<class ELFT> struct Elf_Versym_Impl { - LLVM_ELF_IMPORT_TYPES(target_endianness, max_alignment, is64Bits) + LLVM_ELF_IMPORT_TYPES(ELFT) Elf_Half vs_index; // Version index with flags (e.g. VERSYM_HIDDEN) }; -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> +template<class ELFT> struct Elf_Verdaux_Impl; /// Elf_Verdef: This is the structure of entries in the SHT_GNU_verdef section /// (.gnu.version_d). This structure is identical for ELF32 and ELF64. -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> +template<class ELFT> struct Elf_Verdef_Impl { - LLVM_ELF_IMPORT_TYPES(target_endianness, max_alignment, is64Bits) - typedef - Elf_Verdaux_Impl<target_endianness, max_alignment, is64Bits> Elf_Verdaux; + LLVM_ELF_IMPORT_TYPES(ELFT) + typedef Elf_Verdaux_Impl<ELFT> Elf_Verdaux; Elf_Half vd_version; // Version of this structure (e.g. 
VER_DEF_CURRENT) Elf_Half vd_flags; // Bitwise flags (VER_DEF_*) Elf_Half vd_ndx; // Version index, used in .gnu.version entries @@ -243,18 +251,18 @@ struct Elf_Verdef_Impl { /// Elf_Verdaux: This is the structure of auxiliary data in the SHT_GNU_verdef /// section (.gnu.version_d). This structure is identical for ELF32 and ELF64. -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> +template<class ELFT> struct Elf_Verdaux_Impl { - LLVM_ELF_IMPORT_TYPES(target_endianness, max_alignment, is64Bits) + LLVM_ELF_IMPORT_TYPES(ELFT) Elf_Word vda_name; // Version name (offset in string table) Elf_Word vda_next; // Offset to next Verdaux entry (in bytes) }; /// Elf_Verneed: This is the structure of entries in the SHT_GNU_verneed /// section (.gnu.version_r). This structure is identical for ELF32 and ELF64. -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> +template<class ELFT> struct Elf_Verneed_Impl { - LLVM_ELF_IMPORT_TYPES(target_endianness, max_alignment, is64Bits) + LLVM_ELF_IMPORT_TYPES(ELFT) Elf_Half vn_version; // Version of this structure (e.g. VER_NEED_CURRENT) Elf_Half vn_cnt; // Number of associated Vernaux entries Elf_Word vn_file; // Library name (string table offset) @@ -264,9 +272,9 @@ struct Elf_Verneed_Impl { /// Elf_Vernaux: This is the structure of auxiliary data in SHT_GNU_verneed /// section (.gnu.version_r). This structure is identical for ELF32 and ELF64. -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> +template<class ELFT> struct Elf_Vernaux_Impl { - LLVM_ELF_IMPORT_TYPES(target_endianness, max_alignment, is64Bits) + LLVM_ELF_IMPORT_TYPES(ELFT) Elf_Word vna_hash; // Hash of dependency name Elf_Half vna_flags; // Bitwise Flags (VER_FLAG_*) Elf_Half vna_other; // Version index, used in .gnu.version entries @@ -276,12 +284,14 @@ struct Elf_Vernaux_Impl { /// Elf_Dyn_Base: This structure matches the form of entries in the dynamic /// table section (.dynamic). -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> +template<class ELFT> struct Elf_Dyn_Base; -template<endianness target_endianness, std::size_t max_alignment> -struct Elf_Dyn_Base<target_endianness, max_alignment, false> { - LLVM_ELF_IMPORT_TYPES(target_endianness, max_alignment, false) +template<template<endianness, std::size_t, bool> class ELFT, + endianness TargetEndianness, std::size_t MaxAlign> +struct Elf_Dyn_Base<ELFT<TargetEndianness, MaxAlign, false> > { + LLVM_ELF_IMPORT_TYPES(ELFT<TargetEndianness LLVM_ELF_COMMA + MaxAlign LLVM_ELF_COMMA false>) Elf_Sword d_tag; union { Elf_Word d_val; @@ -289,9 +299,11 @@ struct Elf_Dyn_Base<target_endianness, max_alignment, false> { } d_un; }; -template<endianness target_endianness, std::size_t max_alignment> -struct Elf_Dyn_Base<target_endianness, max_alignment, true> { - LLVM_ELF_IMPORT_TYPES(target_endianness, max_alignment, true) +template<template<endianness, std::size_t, bool> class ELFT, + endianness TargetEndianness, std::size_t MaxAlign> +struct Elf_Dyn_Base<ELFT<TargetEndianness, MaxAlign, true> > { + LLVM_ELF_IMPORT_TYPES(ELFT<TargetEndianness LLVM_ELF_COMMA + MaxAlign LLVM_ELF_COMMA true>) Elf_Sxword d_tag; union { Elf_Xword d_val; @@ -300,92 +312,67 @@ struct Elf_Dyn_Base<target_endianness, max_alignment, true> { }; /// Elf_Dyn_Impl: This inherits from Elf_Dyn_Base, adding getters and setters. 
-template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -struct Elf_Dyn_Impl : Elf_Dyn_Base<target_endianness, max_alignment, is64Bits> { - using Elf_Dyn_Base<target_endianness, max_alignment, is64Bits>::d_tag; - using Elf_Dyn_Base<target_endianness, max_alignment, is64Bits>::d_un; +template<class ELFT> +struct Elf_Dyn_Impl : Elf_Dyn_Base<ELFT> { + using Elf_Dyn_Base<ELFT>::d_tag; + using Elf_Dyn_Base<ELFT>::d_un; int64_t getTag() const { return d_tag; } uint64_t getVal() const { return d_un.d_val; } uint64_t getPtr() const { return d_un.d_ptr; } }; -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -class ELFObjectFile; - -// DynRefImpl: Reference to an entry in the dynamic table -// This is an ELF-specific interface. -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -class DynRefImpl { - typedef Elf_Dyn_Impl<target_endianness, max_alignment, is64Bits> Elf_Dyn; - typedef ELFObjectFile<target_endianness, max_alignment, is64Bits> OwningType; - - DataRefImpl DynPimpl; - const OwningType *OwningObject; - -public: - DynRefImpl() : OwningObject(NULL) { } - - DynRefImpl(DataRefImpl DynP, const OwningType *Owner); - - bool operator==(const DynRefImpl &Other) const; - bool operator <(const DynRefImpl &Other) const; - - error_code getNext(DynRefImpl &Result) const; - int64_t getTag() const; - uint64_t getVal() const; - uint64_t getPtr() const; - - DataRefImpl getRawDataRefImpl() const; -}; - // Elf_Rel: Elf Relocation -template< endianness target_endianness , std::size_t max_alignment , bool is64Bits , bool isRela> +template<class ELFT, bool isRela> struct Elf_Rel_Base; -template<endianness target_endianness, std::size_t max_alignment> -struct Elf_Rel_Base<target_endianness, max_alignment, false, false> { - LLVM_ELF_IMPORT_TYPES(target_endianness, max_alignment, false) +template<template<endianness, std::size_t, bool> class ELFT, + endianness TargetEndianness, std::size_t MaxAlign> +struct Elf_Rel_Base<ELFT<TargetEndianness, MaxAlign, false>, false> { + LLVM_ELF_IMPORT_TYPES(ELFT<TargetEndianness LLVM_ELF_COMMA + MaxAlign LLVM_ELF_COMMA false>) Elf_Addr r_offset; // Location (file byte offset, or program virtual addr) Elf_Word r_info; // Symbol table index and type of relocation to apply }; -template<endianness target_endianness, std::size_t max_alignment> -struct Elf_Rel_Base<target_endianness, max_alignment, true, false> { - LLVM_ELF_IMPORT_TYPES(target_endianness, max_alignment, true) +template<template<endianness, std::size_t, bool> class ELFT, + endianness TargetEndianness, std::size_t MaxAlign> +struct Elf_Rel_Base<ELFT<TargetEndianness, MaxAlign, true>, false> { + LLVM_ELF_IMPORT_TYPES(ELFT<TargetEndianness LLVM_ELF_COMMA + MaxAlign LLVM_ELF_COMMA true>) Elf_Addr r_offset; // Location (file byte offset, or program virtual addr) Elf_Xword r_info; // Symbol table index and type of relocation to apply }; -template<endianness target_endianness, std::size_t max_alignment> -struct Elf_Rel_Base<target_endianness, max_alignment, false, true> { - LLVM_ELF_IMPORT_TYPES(target_endianness, max_alignment, false) +template<template<endianness, std::size_t, bool> class ELFT, + endianness TargetEndianness, std::size_t MaxAlign> +struct Elf_Rel_Base<ELFT<TargetEndianness, MaxAlign, false>, true> { + LLVM_ELF_IMPORT_TYPES(ELFT<TargetEndianness LLVM_ELF_COMMA + MaxAlign LLVM_ELF_COMMA false>) Elf_Addr r_offset; // Location (file byte offset, or program virtual addr) Elf_Word r_info; // Symbol table index and type of 
relocation to apply Elf_Sword r_addend; // Compute value for relocatable field by adding this }; -template<endianness target_endianness, std::size_t max_alignment> -struct Elf_Rel_Base<target_endianness, max_alignment, true, true> { - LLVM_ELF_IMPORT_TYPES(target_endianness, max_alignment, true) +template<template<endianness, std::size_t, bool> class ELFT, + endianness TargetEndianness, std::size_t MaxAlign> +struct Elf_Rel_Base<ELFT<TargetEndianness, MaxAlign, true>, true> { + LLVM_ELF_IMPORT_TYPES(ELFT<TargetEndianness LLVM_ELF_COMMA + MaxAlign LLVM_ELF_COMMA true>) Elf_Addr r_offset; // Location (file byte offset, or program virtual addr) Elf_Xword r_info; // Symbol table index and type of relocation to apply Elf_Sxword r_addend; // Compute value for relocatable field by adding this. }; -template< endianness target_endianness - , std::size_t max_alignment - , bool is64Bits - , bool isRela> +template<class ELFT, bool isRela> struct Elf_Rel_Impl; -template<endianness target_endianness, std::size_t max_alignment, bool isRela> -struct Elf_Rel_Impl<target_endianness, max_alignment, true, isRela> - : Elf_Rel_Base<target_endianness, max_alignment, true, isRela> { - using Elf_Rel_Base<target_endianness, max_alignment, true, isRela>::r_info; - LLVM_ELF_IMPORT_TYPES(target_endianness, max_alignment, true) +template<template<endianness, std::size_t, bool> class ELFT, + endianness TargetEndianness, std::size_t MaxAlign, bool isRela> +struct Elf_Rel_Impl<ELFT<TargetEndianness, MaxAlign, true>, isRela> + : Elf_Rel_Base<ELFT<TargetEndianness, MaxAlign, true>, isRela> { + using Elf_Rel_Base<ELFT<TargetEndianness, MaxAlign, true>, isRela>::r_info; + LLVM_ELF_IMPORT_TYPES(ELFT<TargetEndianness LLVM_ELF_COMMA + MaxAlign LLVM_ELF_COMMA true>) // These accessors and mutators correspond to the ELF64_R_SYM, ELF64_R_TYPE, // and ELF64_R_INFO macros defined in the ELF specification: @@ -400,11 +387,13 @@ struct Elf_Rel_Impl<target_endianness, max_alignment, true, isRela> } }; -template<endianness target_endianness, std::size_t max_alignment, bool isRela> -struct Elf_Rel_Impl<target_endianness, max_alignment, false, isRela> - : Elf_Rel_Base<target_endianness, max_alignment, false, isRela> { - using Elf_Rel_Base<target_endianness, max_alignment, false, isRela>::r_info; - LLVM_ELF_IMPORT_TYPES(target_endianness, max_alignment, false) +template<template<endianness, std::size_t, bool> class ELFT, + endianness TargetEndianness, std::size_t MaxAlign, bool isRela> +struct Elf_Rel_Impl<ELFT<TargetEndianness, MaxAlign, false>, isRela> + : Elf_Rel_Base<ELFT<TargetEndianness, MaxAlign, false>, isRela> { + using Elf_Rel_Base<ELFT<TargetEndianness, MaxAlign, false>, isRela>::r_info; + LLVM_ELF_IMPORT_TYPES(ELFT<TargetEndianness LLVM_ELF_COMMA + MaxAlign LLVM_ELF_COMMA false>) // These accessors and mutators correspond to the ELF32_R_SYM, ELF32_R_TYPE, // and ELF32_R_INFO macros defined in the ELF specification: @@ -417,9 +406,9 @@ struct Elf_Rel_Impl<target_endianness, max_alignment, false, isRela> } }; -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> +template<class ELFT> struct Elf_Ehdr_Impl { - LLVM_ELF_IMPORT_TYPES(target_endianness, max_alignment, is64Bits) + LLVM_ELF_IMPORT_TYPES(ELFT) unsigned char e_ident[ELF::EI_NIDENT]; // ELF Identification bytes Elf_Half e_type; // Type of file (see ET_*) Elf_Half e_machine; // Required architecture for this file (see EM_*) @@ -442,12 +431,14 @@ struct Elf_Ehdr_Impl { unsigned char getDataEncoding() const { return e_ident[ELF::EI_DATA]; } }; 
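With the ELFType tag from the top of this file, the endianness / alignment / word-size triple travels as one template parameter, so a particular ELF flavor can be named once and reused. A sketch; the MaxAlignment value of 4 is illustrative, not taken from the patch:

    #include "llvm/Object/ELF.h"
    using namespace llvm;
    using namespace llvm::object;

    // One tag instead of three template parameters at every use site.
    typedef ELFType<support::little, 4, false> ELF32LE;
    typedef ELFObjectFile<ELF32LE> ELF32LEObjectFile;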
-template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> +template<class ELFT> struct Elf_Phdr_Impl; -template<endianness target_endianness, std::size_t max_alignment> -struct Elf_Phdr_Impl<target_endianness, max_alignment, false> { - LLVM_ELF_IMPORT_TYPES(target_endianness, max_alignment, false) +template<template<endianness, std::size_t, bool> class ELFT, + endianness TargetEndianness, std::size_t MaxAlign> +struct Elf_Phdr_Impl<ELFT<TargetEndianness, MaxAlign, false> > { + LLVM_ELF_IMPORT_TYPES(ELFT<TargetEndianness LLVM_ELF_COMMA + MaxAlign LLVM_ELF_COMMA false>) Elf_Word p_type; // Type of segment Elf_Off p_offset; // FileOffset where segment is located, in bytes Elf_Addr p_vaddr; // Virtual Address of beginning of segment @@ -458,9 +449,11 @@ struct Elf_Phdr_Impl<target_endianness, max_alignment, false> { Elf_Word p_align; // Segment alignment constraint }; -template<endianness target_endianness, std::size_t max_alignment> -struct Elf_Phdr_Impl<target_endianness, max_alignment, true> { - LLVM_ELF_IMPORT_TYPES(target_endianness, max_alignment, true) +template<template<endianness, std::size_t, bool> class ELFT, + endianness TargetEndianness, std::size_t MaxAlign> +struct Elf_Phdr_Impl<ELFT<TargetEndianness, MaxAlign, true> > { + LLVM_ELF_IMPORT_TYPES(ELFT<TargetEndianness LLVM_ELF_COMMA + MaxAlign LLVM_ELF_COMMA true>) Elf_Word p_type; // Type of segment Elf_Word p_flags; // Segment flags Elf_Off p_offset; // FileOffset where segment is located, in bytes @@ -471,72 +464,18 @@ struct Elf_Phdr_Impl<target_endianness, max_alignment, true> { Elf_Xword p_align; // Segment alignment constraint }; -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> +template<class ELFT> class ELFObjectFile : public ObjectFile { - LLVM_ELF_IMPORT_TYPES(target_endianness, max_alignment, is64Bits) - - typedef Elf_Ehdr_Impl<target_endianness, max_alignment, is64Bits> Elf_Ehdr; - typedef Elf_Shdr_Impl<target_endianness, max_alignment, is64Bits> Elf_Shdr; - typedef Elf_Sym_Impl<target_endianness, max_alignment, is64Bits> Elf_Sym; - typedef Elf_Dyn_Impl<target_endianness, max_alignment, is64Bits> Elf_Dyn; - typedef Elf_Phdr_Impl<target_endianness, max_alignment, is64Bits> Elf_Phdr; - typedef - Elf_Rel_Impl<target_endianness, max_alignment, is64Bits, false> Elf_Rel; - typedef - Elf_Rel_Impl<target_endianness, max_alignment, is64Bits, true> Elf_Rela; - typedef - Elf_Verdef_Impl<target_endianness, max_alignment, is64Bits> Elf_Verdef; - typedef - Elf_Verdaux_Impl<target_endianness, max_alignment, is64Bits> Elf_Verdaux; - typedef - Elf_Verneed_Impl<target_endianness, max_alignment, is64Bits> Elf_Verneed; - typedef - Elf_Vernaux_Impl<target_endianness, max_alignment, is64Bits> Elf_Vernaux; - typedef - Elf_Versym_Impl<target_endianness, max_alignment, is64Bits> Elf_Versym; - typedef DynRefImpl<target_endianness, max_alignment, is64Bits> DynRef; - typedef content_iterator<DynRef> dyn_iterator; - -protected: - // This flag is used for classof, to distinguish ELFObjectFile from - // its subclass. If more subclasses will be created, this flag will - // have to become an enum. - bool isDyldELFObject; - -private: - typedef SmallVector<const Elf_Shdr*, 1> Sections_t; - typedef DenseMap<unsigned, unsigned> IndexMap_t; - typedef DenseMap<const Elf_Shdr*, SmallVector<uint32_t, 1> > RelocMap_t; - - const Elf_Ehdr *Header; - const Elf_Shdr *SectionHeaderTable; - const Elf_Shdr *dot_shstrtab_sec; // Section header string table. 
- const Elf_Shdr *dot_strtab_sec; // Symbol header string table. - const Elf_Shdr *dot_dynstr_sec; // Dynamic symbol string table. - - // SymbolTableSections[0] always points to the dynamic string table section - // header, or NULL if there is no dynamic string table. - Sections_t SymbolTableSections; - IndexMap_t SymbolTableSectionsIndexMap; - DenseMap<const Elf_Sym*, ELF::Elf64_Word> ExtendedSymbolTable; - - const Elf_Shdr *dot_dynamic_sec; // .dynamic - const Elf_Shdr *dot_gnu_version_sec; // .gnu.version - const Elf_Shdr *dot_gnu_version_r_sec; // .gnu.version_r - const Elf_Shdr *dot_gnu_version_d_sec; // .gnu.version_d - - // Pointer to SONAME entry in dynamic string table - // This is set the first time getLoadName is called. - mutable const char *dt_soname; + LLVM_ELF_IMPORT_TYPES(ELFT) public: /// \brief Iterate over constant sized entities. template<class EntT> class ELFEntityIterator { public: - typedef void difference_type; + typedef ptrdiff_t difference_type; typedef EntT value_type; - typedef std::forward_iterator_tag iterator_category; + typedef std::random_access_iterator_tag iterator_category; typedef value_type &reference; typedef value_type *pointer; @@ -576,11 +515,74 @@ public: return Tmp; } + ELFEntityIterator &operator =(const ELFEntityIterator &Other) { + EntitySize = Other.EntitySize; + Current = Other.Current; + return *this; + } + + difference_type operator -(const ELFEntityIterator &Other) const { + assert(EntitySize == Other.EntitySize && + "Subtracting iterators of different EntitiySize!"); + return (Current - Other.Current) / EntitySize; + } + + const char *get() const { return Current; } + private: - const uint64_t EntitySize; + uint64_t EntitySize; const char *Current; }; + typedef Elf_Ehdr_Impl<ELFT> Elf_Ehdr; + typedef Elf_Shdr_Impl<ELFT> Elf_Shdr; + typedef Elf_Sym_Impl<ELFT> Elf_Sym; + typedef Elf_Dyn_Impl<ELFT> Elf_Dyn; + typedef Elf_Phdr_Impl<ELFT> Elf_Phdr; + typedef Elf_Rel_Impl<ELFT, false> Elf_Rel; + typedef Elf_Rel_Impl<ELFT, true> Elf_Rela; + typedef Elf_Verdef_Impl<ELFT> Elf_Verdef; + typedef Elf_Verdaux_Impl<ELFT> Elf_Verdaux; + typedef Elf_Verneed_Impl<ELFT> Elf_Verneed; + typedef Elf_Vernaux_Impl<ELFT> Elf_Vernaux; + typedef Elf_Versym_Impl<ELFT> Elf_Versym; + typedef ELFEntityIterator<const Elf_Dyn> Elf_Dyn_iterator; + typedef ELFEntityIterator<const Elf_Sym> Elf_Sym_iterator; + typedef ELFEntityIterator<const Elf_Rela> Elf_Rela_Iter; + typedef ELFEntityIterator<const Elf_Rel> Elf_Rel_Iter; + +protected: + // This flag is used for classof, to distinguish ELFObjectFile from + // its subclass. If more subclasses will be created, this flag will + // have to become an enum. + bool isDyldELFObject; + +private: + typedef SmallVector<const Elf_Shdr *, 2> Sections_t; + typedef DenseMap<unsigned, unsigned> IndexMap_t; + typedef DenseMap<const Elf_Shdr*, SmallVector<uint32_t, 1> > RelocMap_t; + + const Elf_Ehdr *Header; + const Elf_Shdr *SectionHeaderTable; + const Elf_Shdr *dot_shstrtab_sec; // Section header string table. + const Elf_Shdr *dot_strtab_sec; // Symbol header string table. + const Elf_Shdr *dot_dynstr_sec; // Dynamic symbol string table. + + // SymbolTableSections[0] always points to the dynamic string table section + // header, or NULL if there is no dynamic string table. 
+ Sections_t SymbolTableSections; + IndexMap_t SymbolTableSectionsIndexMap; + DenseMap<const Elf_Sym*, ELF::Elf64_Word> ExtendedSymbolTable; + + const Elf_Shdr *dot_dynamic_sec; // .dynamic + const Elf_Shdr *dot_gnu_version_sec; // .gnu.version + const Elf_Shdr *dot_gnu_version_r_sec; // .gnu.version_r + const Elf_Shdr *dot_gnu_version_d_sec; // .gnu.version_d + + // Pointer to SONAME entry in dynamic string table + // This is set the first time getLoadName is called. + mutable const char *dt_soname; + private: // Records for each version index the corresponding Verdef or Vernaux entry. // This is filled the first time LoadVersionMap() is called. @@ -617,6 +619,7 @@ private: return getSection(Rel.w.b); } +public: bool isRelocationHasAddend(DataRefImpl Rel) const; template<typename T> const T *getEntry(uint16_t Section, uint32_t Entry) const; @@ -661,9 +664,6 @@ protected: section_iterator &Res) const; virtual error_code getSymbolValue(DataRefImpl Symb, uint64_t &Val) const; - friend class DynRefImpl<target_endianness, max_alignment, is64Bits>; - virtual error_code getDynNext(DataRefImpl DynData, DynRef &Result) const; - virtual error_code getLibraryNext(DataRefImpl Data, LibraryRef &Result) const; virtual error_code getLibraryPath(DataRefImpl Data, StringRef &Res) const; @@ -717,11 +717,34 @@ public: virtual library_iterator begin_libraries_needed() const; virtual library_iterator end_libraries_needed() const; - virtual dyn_iterator begin_dynamic_table() const; - virtual dyn_iterator end_dynamic_table() const; + const Elf_Shdr *getDynamicSymbolTableSectionHeader() const { + return SymbolTableSections[0]; + } - typedef ELFEntityIterator<const Elf_Rela> Elf_Rela_Iter; - typedef ELFEntityIterator<const Elf_Rel> Elf_Rel_Iter; + const Elf_Shdr *getDynamicStringTableSectionHeader() const { + return dot_dynstr_sec; + } + + Elf_Dyn_iterator begin_dynamic_table() const; + /// \param NULLEnd use one past the first DT_NULL entry as the end instead of + /// the section size. + Elf_Dyn_iterator end_dynamic_table(bool NULLEnd = false) const; + + Elf_Sym_iterator begin_elf_dynamic_symbols() const { + const Elf_Shdr *DynSymtab = SymbolTableSections[0]; + if (DynSymtab) + return Elf_Sym_iterator(DynSymtab->sh_entsize, + (const char *)base() + DynSymtab->sh_offset); + return Elf_Sym_iterator(0, 0); + } + + Elf_Sym_iterator end_elf_dynamic_symbols() const { + const Elf_Shdr *DynSymtab = SymbolTableSections[0]; + if (DynSymtab) + return Elf_Sym_iterator(DynSymtab->sh_entsize, (const char *)base() + + DynSymtab->sh_offset + DynSymtab->sh_size); + return Elf_Sym_iterator(0, 0); + } Elf_Rela_Iter beginELFRela(const Elf_Shdr *sec) const { return Elf_Rela_Iter(sec->sh_entsize, @@ -777,16 +800,15 @@ public: // Methods for type inquiry through isa, cast, and dyn_cast bool isDyldType() const { return isDyldELFObject; } static inline bool classof(const Binary *v) { - return v->getType() == getELFType(target_endianness == support::little, - is64Bits); + return v->getType() == getELFType(ELFT::TargetEndianness == support::little, + ELFT::Is64Bits); } }; // Iterate through the version definitions, and place each Elf_Verdef // in the VersionMap according to its index. 
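A sketch of how the typed iterators above compose: scanning the dynamic table for a DT_SONAME entry, using end_dynamic_table(true) to stop one past the first DT_NULL. It reuses the illustrative 32-bit little-endian instantiation from the earlier sketch, and goes through ELFEntityIterator::get() so it leans only on members visible in this patch:

    #include "llvm/Object/ELF.h"
    using namespace llvm;
    using namespace llvm::object;

    typedef ELFObjectFile<ELFType<support::little, 4, false> > ObjT;

    uint64_t findSONameOffset(const ObjT &Obj) {
      for (ObjT::Elf_Dyn_iterator I = Obj.begin_dynamic_table(),
                                  E = Obj.end_dynamic_table(true);
           I != E; ++I) {
        const ObjT::Elf_Dyn *D =
            reinterpret_cast<const ObjT::Elf_Dyn *>(I.get());
        if (D->getTag() == ELF::DT_SONAME)
          return D->getVal(); // offset of the SONAME string in .dynstr
      }
      return 0; // no DT_SONAME entry present
    }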
-template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -void ELFObjectFile<target_endianness, max_alignment, is64Bits>:: - LoadVersionDefs(const Elf_Shdr *sec) const { +template<class ELFT> +void ELFObjectFile<ELFT>::LoadVersionDefs(const Elf_Shdr *sec) const { unsigned vd_size = sec->sh_size; // Size of section in bytes unsigned vd_count = sec->sh_info; // Number of Verdef entries const char *sec_start = (const char*)base() + sec->sh_offset; @@ -810,9 +832,8 @@ void ELFObjectFile<target_endianness, max_alignment, is64Bits>:: // Iterate through the versions needed section, and place each Elf_Vernaux // in the VersionMap according to its index. -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -void ELFObjectFile<target_endianness, max_alignment, is64Bits>:: - LoadVersionNeeds(const Elf_Shdr *sec) const { +template<class ELFT> +void ELFObjectFile<ELFT>::LoadVersionNeeds(const Elf_Shdr *sec) const { unsigned vn_size = sec->sh_size; // Size of section in bytes unsigned vn_count = sec->sh_info; // Number of Verneed entries const char *sec_start = (const char*)base() + sec->sh_offset; @@ -843,9 +864,8 @@ void ELFObjectFile<target_endianness, max_alignment, is64Bits>:: } } -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -void ELFObjectFile<target_endianness, max_alignment, is64Bits> - ::LoadVersionMap() const { +template<class ELFT> +void ELFObjectFile<ELFT>::LoadVersionMap() const { // If there is no dynamic symtab or version table, there is nothing to do. if (SymbolTableSections[0] == NULL || dot_gnu_version_sec == NULL) return; @@ -866,9 +886,9 @@ void ELFObjectFile<target_endianness, max_alignment, is64Bits> LoadVersionNeeds(dot_gnu_version_r_sec); } -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -void ELFObjectFile<target_endianness, max_alignment, is64Bits> - ::validateSymbol(DataRefImpl Symb) const { +template<class ELFT> +void ELFObjectFile<ELFT>::validateSymbol(DataRefImpl Symb) const { +#ifndef NDEBUG const Elf_Sym *symb = getSymbol(Symb); const Elf_Shdr *SymbolTableSection = SymbolTableSections[Symb.d.b]; // FIXME: We really need to do proper error handling in the case of an invalid @@ -883,12 +903,12 @@ void ELFObjectFile<target_endianness, max_alignment, is64Bits> + SymbolTableSection->sh_size))) // FIXME: Proper error handling. 
report_fatal_error("Symb must point to a valid symbol!"); +#endif } -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -error_code ELFObjectFile<target_endianness, max_alignment, is64Bits> - ::getSymbolNext(DataRefImpl Symb, - SymbolRef &Result) const { +template<class ELFT> +error_code ELFObjectFile<ELFT>::getSymbolNext(DataRefImpl Symb, + SymbolRef &Result) const { validateSymbol(Symb); const Elf_Shdr *SymbolTableSection = SymbolTableSections[Symb.d.b]; @@ -913,20 +933,18 @@ error_code ELFObjectFile<target_endianness, max_alignment, is64Bits> return object_error::success; } -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -error_code ELFObjectFile<target_endianness, max_alignment, is64Bits> - ::getSymbolName(DataRefImpl Symb, - StringRef &Result) const { +template<class ELFT> +error_code ELFObjectFile<ELFT>::getSymbolName(DataRefImpl Symb, + StringRef &Result) const { validateSymbol(Symb); const Elf_Sym *symb = getSymbol(Symb); return getSymbolName(SymbolTableSections[Symb.d.b], symb, Result); } -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -error_code ELFObjectFile<target_endianness, max_alignment, is64Bits> - ::getSymbolVersion(SymbolRef SymRef, - StringRef &Version, - bool &IsDefault) const { +template<class ELFT> +error_code ELFObjectFile<ELFT>::getSymbolVersion(SymbolRef SymRef, + StringRef &Version, + bool &IsDefault) const { DataRefImpl Symb = SymRef.getRawDataRefImpl(); validateSymbol(Symb); const Elf_Sym *symb = getSymbol(Symb); @@ -934,19 +952,17 @@ error_code ELFObjectFile<target_endianness, max_alignment, is64Bits> Version, IsDefault); } -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -ELF::Elf64_Word ELFObjectFile<target_endianness, max_alignment, is64Bits> - ::getSymbolTableIndex(const Elf_Sym *symb) const { +template<class ELFT> +ELF::Elf64_Word ELFObjectFile<ELFT> + ::getSymbolTableIndex(const Elf_Sym *symb) const { if (symb->st_shndx == ELF::SHN_XINDEX) return ExtendedSymbolTable.lookup(symb); return symb->st_shndx; } -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -const typename ELFObjectFile<target_endianness, max_alignment, is64Bits> - ::Elf_Shdr * -ELFObjectFile<target_endianness, max_alignment, is64Bits> - ::getSection(const Elf_Sym *symb) const { +template<class ELFT> +const typename ELFObjectFile<ELFT>::Elf_Shdr * +ELFObjectFile<ELFT>::getSection(const Elf_Sym *symb) const { if (symb->st_shndx == ELF::SHN_XINDEX) return getSection(ExtendedSymbolTable.lookup(symb)); if (symb->st_shndx >= ELF::SHN_LORESERVE) @@ -954,38 +970,31 @@ ELFObjectFile<target_endianness, max_alignment, is64Bits> return getSection(symb->st_shndx); } -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -const typename ELFObjectFile<target_endianness, max_alignment, is64Bits> - ::Elf_Shdr * -ELFObjectFile<target_endianness, max_alignment, is64Bits> - ::getElfSection(section_iterator &It) const { +template<class ELFT> +const typename ELFObjectFile<ELFT>::Elf_Shdr * +ELFObjectFile<ELFT>::getElfSection(section_iterator &It) const { llvm::object::DataRefImpl ShdrRef = It->getRawDataRefImpl(); return reinterpret_cast<const Elf_Shdr *>(ShdrRef.p); } -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -const typename ELFObjectFile<target_endianness, max_alignment, is64Bits> - ::Elf_Sym * -ELFObjectFile<target_endianness, max_alignment, is64Bits> - ::getElfSymbol(symbol_iterator 
&It) const { +template<class ELFT> +const typename ELFObjectFile<ELFT>::Elf_Sym * +ELFObjectFile<ELFT>::getElfSymbol(symbol_iterator &It) const { return getSymbol(It->getRawDataRefImpl()); } -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -const typename ELFObjectFile<target_endianness, max_alignment, is64Bits> - ::Elf_Sym * -ELFObjectFile<target_endianness, max_alignment, is64Bits> - ::getElfSymbol(uint32_t index) const { +template<class ELFT> +const typename ELFObjectFile<ELFT>::Elf_Sym * +ELFObjectFile<ELFT>::getElfSymbol(uint32_t index) const { DataRefImpl SymbolData; SymbolData.d.a = index; SymbolData.d.b = 1; return getSymbol(SymbolData); } -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -error_code ELFObjectFile<target_endianness, max_alignment, is64Bits> - ::getSymbolFileOffset(DataRefImpl Symb, - uint64_t &Result) const { +template<class ELFT> +error_code ELFObjectFile<ELFT>::getSymbolFileOffset(DataRefImpl Symb, + uint64_t &Result) const { validateSymbol(Symb); const Elf_Sym *symb = getSymbol(Symb); const Elf_Shdr *Section; @@ -1003,7 +1012,7 @@ error_code ELFObjectFile<target_endianness, max_alignment, is64Bits> switch (symb->getType()) { case ELF::STT_SECTION: - Result = Section ? Section->sh_addr : UnknownAddressOrSize; + Result = Section ? Section->sh_offset : UnknownAddressOrSize; return object_error::success; case ELF::STT_FUNC: case ELF::STT_OBJECT: @@ -1017,10 +1026,9 @@ error_code ELFObjectFile<target_endianness, max_alignment, is64Bits> } } -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -error_code ELFObjectFile<target_endianness, max_alignment, is64Bits> - ::getSymbolAddress(DataRefImpl Symb, - uint64_t &Result) const { +template<class ELFT> +error_code ELFObjectFile<ELFT>::getSymbolAddress(DataRefImpl Symb, + uint64_t &Result) const { validateSymbol(Symb); const Elf_Sym *symb = getSymbol(Symb); const Elf_Shdr *Section; @@ -1061,10 +1069,9 @@ error_code ELFObjectFile<target_endianness, max_alignment, is64Bits> } } -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -error_code ELFObjectFile<target_endianness, max_alignment, is64Bits> - ::getSymbolSize(DataRefImpl Symb, - uint64_t &Result) const { +template<class ELFT> +error_code ELFObjectFile<ELFT>::getSymbolSize(DataRefImpl Symb, + uint64_t &Result) const { validateSymbol(Symb); const Elf_Sym *symb = getSymbol(Symb); if (symb->st_size == 0) @@ -1073,10 +1080,9 @@ error_code ELFObjectFile<target_endianness, max_alignment, is64Bits> return object_error::success; } -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -error_code ELFObjectFile<target_endianness, max_alignment, is64Bits> - ::getSymbolNMTypeChar(DataRefImpl Symb, - char &Result) const { +template<class ELFT> +error_code ELFObjectFile<ELFT>::getSymbolNMTypeChar(DataRefImpl Symb, + char &Result) const { validateSymbol(Symb); const Elf_Sym *symb = getSymbol(Symb); const Elf_Shdr *Section = getSection(symb); @@ -1138,10 +1144,9 @@ error_code ELFObjectFile<target_endianness, max_alignment, is64Bits> return object_error::success; } -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -error_code ELFObjectFile<target_endianness, max_alignment, is64Bits> - ::getSymbolType(DataRefImpl Symb, - SymbolRef::Type &Result) const { +template<class ELFT> +error_code ELFObjectFile<ELFT>::getSymbolType(DataRefImpl Symb, + SymbolRef::Type &Result) const { validateSymbol(Symb); const 
Elf_Sym *symb = getSymbol(Symb); @@ -1170,10 +1175,9 @@ error_code ELFObjectFile<target_endianness, max_alignment, is64Bits> return object_error::success; } -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -error_code ELFObjectFile<target_endianness, max_alignment, is64Bits> - ::getSymbolFlags(DataRefImpl Symb, - uint32_t &Result) const { +template<class ELFT> +error_code ELFObjectFile<ELFT>::getSymbolFlags(DataRefImpl Symb, + uint32_t &Result) const { validateSymbol(Symb); const Elf_Sym *symb = getSymbol(Symb); @@ -1205,10 +1209,9 @@ error_code ELFObjectFile<target_endianness, max_alignment, is64Bits> return object_error::success; } -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -error_code ELFObjectFile<target_endianness, max_alignment, is64Bits> - ::getSymbolSection(DataRefImpl Symb, - section_iterator &Res) const { +template<class ELFT> +error_code ELFObjectFile<ELFT>::getSymbolSection(DataRefImpl Symb, + section_iterator &Res) const { validateSymbol(Symb); const Elf_Sym *symb = getSymbol(Symb); const Elf_Shdr *sec = getSection(symb); @@ -1222,19 +1225,18 @@ error_code ELFObjectFile<target_endianness, max_alignment, is64Bits> return object_error::success; } -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -error_code ELFObjectFile<target_endianness, max_alignment, is64Bits> - ::getSymbolValue(DataRefImpl Symb, - uint64_t &Val) const { +template<class ELFT> +error_code ELFObjectFile<ELFT>::getSymbolValue(DataRefImpl Symb, + uint64_t &Val) const { validateSymbol(Symb); const Elf_Sym *symb = getSymbol(Symb); Val = symb->st_value; return object_error::success; } -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -error_code ELFObjectFile<target_endianness, max_alignment, is64Bits> - ::getSectionNext(DataRefImpl Sec, SectionRef &Result) const { +template<class ELFT> +error_code ELFObjectFile<ELFT>::getSectionNext(DataRefImpl Sec, + SectionRef &Result) const { const uint8_t *sec = reinterpret_cast<const uint8_t *>(Sec.p); sec += Header->e_shentsize; Sec.p = reinterpret_cast<intptr_t>(sec); @@ -1242,65 +1244,58 @@ error_code ELFObjectFile<target_endianness, max_alignment, is64Bits> return object_error::success; } -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -error_code ELFObjectFile<target_endianness, max_alignment, is64Bits> - ::getSectionName(DataRefImpl Sec, - StringRef &Result) const { +template<class ELFT> +error_code ELFObjectFile<ELFT>::getSectionName(DataRefImpl Sec, + StringRef &Result) const { const Elf_Shdr *sec = reinterpret_cast<const Elf_Shdr *>(Sec.p); Result = StringRef(getString(dot_shstrtab_sec, sec->sh_name)); return object_error::success; } -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -error_code ELFObjectFile<target_endianness, max_alignment, is64Bits> - ::getSectionAddress(DataRefImpl Sec, - uint64_t &Result) const { +template<class ELFT> +error_code ELFObjectFile<ELFT>::getSectionAddress(DataRefImpl Sec, + uint64_t &Result) const { const Elf_Shdr *sec = reinterpret_cast<const Elf_Shdr *>(Sec.p); Result = sec->sh_addr; return object_error::success; } -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -error_code ELFObjectFile<target_endianness, max_alignment, is64Bits> - ::getSectionSize(DataRefImpl Sec, - uint64_t &Result) const { +template<class ELFT> +error_code ELFObjectFile<ELFT>::getSectionSize(DataRefImpl Sec, + uint64_t &Result) 
const { const Elf_Shdr *sec = reinterpret_cast<const Elf_Shdr *>(Sec.p); Result = sec->sh_size; return object_error::success; } -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -error_code ELFObjectFile<target_endianness, max_alignment, is64Bits> - ::getSectionContents(DataRefImpl Sec, - StringRef &Result) const { +template<class ELFT> +error_code ELFObjectFile<ELFT>::getSectionContents(DataRefImpl Sec, + StringRef &Result) const { const Elf_Shdr *sec = reinterpret_cast<const Elf_Shdr *>(Sec.p); const char *start = (const char*)base() + sec->sh_offset; Result = StringRef(start, sec->sh_size); return object_error::success; } -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -error_code ELFObjectFile<target_endianness, max_alignment, is64Bits> - ::getSectionContents(const Elf_Shdr *Sec, - StringRef &Result) const { +template<class ELFT> +error_code ELFObjectFile<ELFT>::getSectionContents(const Elf_Shdr *Sec, + StringRef &Result) const { const char *start = (const char*)base() + Sec->sh_offset; Result = StringRef(start, Sec->sh_size); return object_error::success; } -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -error_code ELFObjectFile<target_endianness, max_alignment, is64Bits> - ::getSectionAlignment(DataRefImpl Sec, - uint64_t &Result) const { +template<class ELFT> +error_code ELFObjectFile<ELFT>::getSectionAlignment(DataRefImpl Sec, + uint64_t &Result) const { const Elf_Shdr *sec = reinterpret_cast<const Elf_Shdr *>(Sec.p); Result = sec->sh_addralign; return object_error::success; } -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -error_code ELFObjectFile<target_endianness, max_alignment, is64Bits> - ::isSectionText(DataRefImpl Sec, - bool &Result) const { +template<class ELFT> +error_code ELFObjectFile<ELFT>::isSectionText(DataRefImpl Sec, + bool &Result) const { const Elf_Shdr *sec = reinterpret_cast<const Elf_Shdr *>(Sec.p); if (sec->sh_flags & ELF::SHF_EXECINSTR) Result = true; @@ -1309,10 +1304,9 @@ error_code ELFObjectFile<target_endianness, max_alignment, is64Bits> return object_error::success; } -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -error_code ELFObjectFile<target_endianness, max_alignment, is64Bits> - ::isSectionData(DataRefImpl Sec, - bool &Result) const { +template<class ELFT> +error_code ELFObjectFile<ELFT>::isSectionData(DataRefImpl Sec, + bool &Result) const { const Elf_Shdr *sec = reinterpret_cast<const Elf_Shdr *>(Sec.p); if (sec->sh_flags & (ELF::SHF_ALLOC | ELF::SHF_WRITE) && sec->sh_type == ELF::SHT_PROGBITS) @@ -1322,10 +1316,9 @@ error_code ELFObjectFile<target_endianness, max_alignment, is64Bits> return object_error::success; } -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -error_code ELFObjectFile<target_endianness, max_alignment, is64Bits> - ::isSectionBSS(DataRefImpl Sec, - bool &Result) const { +template<class ELFT> +error_code ELFObjectFile<ELFT>::isSectionBSS(DataRefImpl Sec, + bool &Result) const { const Elf_Shdr *sec = reinterpret_cast<const Elf_Shdr *>(Sec.p); if (sec->sh_flags & (ELF::SHF_ALLOC | ELF::SHF_WRITE) && sec->sh_type == ELF::SHT_NOBITS) @@ -1335,10 +1328,9 @@ error_code ELFObjectFile<target_endianness, max_alignment, is64Bits> return object_error::success; } -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -error_code ELFObjectFile<target_endianness, max_alignment, is64Bits> - 
::isSectionRequiredForExecution(DataRefImpl Sec, - bool &Result) const { +template<class ELFT> +error_code ELFObjectFile<ELFT>::isSectionRequiredForExecution( + DataRefImpl Sec, bool &Result) const { const Elf_Shdr *sec = reinterpret_cast<const Elf_Shdr *>(Sec.p); if (sec->sh_flags & ELF::SHF_ALLOC) Result = true; @@ -1347,10 +1339,9 @@ error_code ELFObjectFile<target_endianness, max_alignment, is64Bits> return object_error::success; } -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -error_code ELFObjectFile<target_endianness, max_alignment, is64Bits> - ::isSectionVirtual(DataRefImpl Sec, - bool &Result) const { +template<class ELFT> +error_code ELFObjectFile<ELFT>::isSectionVirtual(DataRefImpl Sec, + bool &Result) const { const Elf_Shdr *sec = reinterpret_cast<const Elf_Shdr *>(Sec.p); if (sec->sh_type == ELF::SHT_NOBITS) Result = true; @@ -1359,10 +1350,9 @@ error_code ELFObjectFile<target_endianness, max_alignment, is64Bits> return object_error::success; } -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -error_code ELFObjectFile<target_endianness, max_alignment, is64Bits> - ::isSectionZeroInit(DataRefImpl Sec, - bool &Result) const { +template<class ELFT> +error_code ELFObjectFile<ELFT>::isSectionZeroInit(DataRefImpl Sec, + bool &Result) const { const Elf_Shdr *sec = reinterpret_cast<const Elf_Shdr *>(Sec.p); // For ELF, all zero-init sections are virtual (that is, they occupy no space // in the object image) and vice versa. @@ -1370,10 +1360,9 @@ error_code ELFObjectFile<target_endianness, max_alignment, is64Bits> return object_error::success; } -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -error_code ELFObjectFile<target_endianness, max_alignment, is64Bits> - ::isSectionReadOnlyData(DataRefImpl Sec, - bool &Result) const { +template<class ELFT> +error_code ELFObjectFile<ELFT>::isSectionReadOnlyData(DataRefImpl Sec, + bool &Result) const { const Elf_Shdr *sec = reinterpret_cast<const Elf_Shdr *>(Sec.p); if (sec->sh_flags & ELF::SHF_WRITE || sec->sh_flags & ELF::SHF_EXECINSTR) Result = false; @@ -1382,19 +1371,18 @@ error_code ELFObjectFile<target_endianness, max_alignment, is64Bits> return object_error::success; } -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -error_code ELFObjectFile<target_endianness, max_alignment, is64Bits> - ::sectionContainsSymbol(DataRefImpl Sec, - DataRefImpl Symb, - bool &Result) const { +template<class ELFT> +error_code ELFObjectFile<ELFT>::sectionContainsSymbol(DataRefImpl Sec, + DataRefImpl Symb, + bool &Result) const { // FIXME: Unimplemented. 
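  // (A sketch of what a real implementation could do here: resolve the
  //  symbol's defining section with getSection(getSymbol(Symb)) and compare
  //  it against the Elf_Shdr behind Sec.)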
Result = false; return object_error::success; } -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -relocation_iterator ELFObjectFile<target_endianness, max_alignment, is64Bits> - ::getSectionRelBegin(DataRefImpl Sec) const { +template<class ELFT> +relocation_iterator +ELFObjectFile<ELFT>::getSectionRelBegin(DataRefImpl Sec) const { DataRefImpl RelData; const Elf_Shdr *sec = reinterpret_cast<const Elf_Shdr *>(Sec.p); typename RelocMap_t::const_iterator ittr = SectionRelocMap.find(sec); @@ -1406,9 +1394,9 @@ relocation_iterator ELFObjectFile<target_endianness, max_alignment, is64Bits> return relocation_iterator(RelocationRef(RelData, this)); } -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -relocation_iterator ELFObjectFile<target_endianness, max_alignment, is64Bits> - ::getSectionRelEnd(DataRefImpl Sec) const { +template<class ELFT> +relocation_iterator +ELFObjectFile<ELFT>::getSectionRelEnd(DataRefImpl Sec) const { DataRefImpl RelData; const Elf_Shdr *sec = reinterpret_cast<const Elf_Shdr *>(Sec.p); typename RelocMap_t::const_iterator ittr = SectionRelocMap.find(sec); @@ -1424,10 +1412,9 @@ relocation_iterator ELFObjectFile<target_endianness, max_alignment, is64Bits> } // Relocations -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -error_code ELFObjectFile<target_endianness, max_alignment, is64Bits> - ::getRelocationNext(DataRefImpl Rel, - RelocationRef &Result) const { +template<class ELFT> +error_code ELFObjectFile<ELFT>::getRelocationNext(DataRefImpl Rel, + RelocationRef &Result) const { ++Rel.w.c; const Elf_Shdr *relocsec = getSection(Rel.w.b); if (Rel.w.c >= (relocsec->sh_size / relocsec->sh_entsize)) { @@ -1453,10 +1440,9 @@ error_code ELFObjectFile<target_endianness, max_alignment, is64Bits> return object_error::success; } -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -error_code ELFObjectFile<target_endianness, max_alignment, is64Bits> - ::getRelocationSymbol(DataRefImpl Rel, - SymbolRef &Result) const { +template<class ELFT> +error_code ELFObjectFile<ELFT>::getRelocationSymbol(DataRefImpl Rel, + SymbolRef &Result) const { uint32_t symbolIdx; const Elf_Shdr *sec = getSection(Rel.w.b); switch (sec->sh_type) { @@ -1481,10 +1467,9 @@ error_code ELFObjectFile<target_endianness, max_alignment, is64Bits> return object_error::success; } -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -error_code ELFObjectFile<target_endianness, max_alignment, is64Bits> - ::getRelocationAddress(DataRefImpl Rel, - uint64_t &Result) const { +template<class ELFT> +error_code ELFObjectFile<ELFT>::getRelocationAddress(DataRefImpl Rel, + uint64_t &Result) const { uint64_t offset; const Elf_Shdr *sec = getSection(Rel.w.b); switch (sec->sh_type) { @@ -1504,10 +1489,9 @@ error_code ELFObjectFile<target_endianness, max_alignment, is64Bits> return object_error::success; } -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -error_code ELFObjectFile<target_endianness, max_alignment, is64Bits> - ::getRelocationOffset(DataRefImpl Rel, - uint64_t &Result) const { +template<class ELFT> +error_code ELFObjectFile<ELFT>::getRelocationOffset(DataRefImpl Rel, + uint64_t &Result) const { uint64_t offset; const Elf_Shdr *sec = getSection(Rel.w.b); switch (sec->sh_type) { @@ -1527,10 +1511,9 @@ error_code ELFObjectFile<target_endianness, max_alignment, is64Bits> return object_error::success; } -template<endianness target_endianness, 
std::size_t max_alignment, bool is64Bits> -error_code ELFObjectFile<target_endianness, max_alignment, is64Bits> - ::getRelocationType(DataRefImpl Rel, - uint64_t &Result) const { +template<class ELFT> +error_code ELFObjectFile<ELFT>::getRelocationType(DataRefImpl Rel, + uint64_t &Result) const { const Elf_Shdr *sec = getSection(Rel.w.b); switch (sec->sh_type) { default : @@ -1550,10 +1533,9 @@ error_code ELFObjectFile<target_endianness, max_alignment, is64Bits> #define LLVM_ELF_SWITCH_RELOC_TYPE_NAME(enum) \ case ELF::enum: res = #enum; break; -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -error_code ELFObjectFile<target_endianness, max_alignment, is64Bits> - ::getRelocationTypeName(DataRefImpl Rel, - SmallVectorImpl<char> &Result) const { +template<class ELFT> +error_code ELFObjectFile<ELFT>::getRelocationTypeName( + DataRefImpl Rel, SmallVectorImpl<char> &Result) const { const Elf_Shdr *sec = getSection(Rel.w.b); uint32_t type; StringRef res; @@ -1654,6 +1636,86 @@ error_code ELFObjectFile<target_endianness, max_alignment, is64Bits> res = "Unknown"; } break; + case ELF::EM_AARCH64: + switch (type) { + LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_NONE); + LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_ABS64); + LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_ABS32); + LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_ABS16); + LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_PREL64); + LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_PREL32); + LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_PREL16); + LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_MOVW_UABS_G0); + LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_MOVW_UABS_G0_NC); + LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_MOVW_UABS_G1); + LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_MOVW_UABS_G1_NC); + LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_MOVW_UABS_G2); + LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_MOVW_UABS_G2_NC); + LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_MOVW_UABS_G3); + LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_MOVW_SABS_G0); + LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_MOVW_SABS_G1); + LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_MOVW_SABS_G2); + LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_LD_PREL_LO19); + LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_ADR_PREL_LO21); + LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_ADR_PREL_PG_HI21); + LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_ADD_ABS_LO12_NC); + LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_LDST8_ABS_LO12_NC); + LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TSTBR14); + LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_CONDBR19); + LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_JUMP26); + LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_CALL26); + LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_LDST16_ABS_LO12_NC); + LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_LDST32_ABS_LO12_NC); + LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_LDST64_ABS_LO12_NC); + LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_LDST128_ABS_LO12_NC); + LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_ADR_GOT_PAGE); + LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_LD64_GOT_LO12_NC); + LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_MOVW_DTPREL_G2); + LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_MOVW_DTPREL_G1); + LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_MOVW_DTPREL_G1_NC); + LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_MOVW_DTPREL_G0); + LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_MOVW_DTPREL_G0_NC); + LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_ADD_DTPREL_HI12); + LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_ADD_DTPREL_LO12); + 
LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_ADD_DTPREL_LO12_NC); + LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_LDST8_DTPREL_LO12); + LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_LDST8_DTPREL_LO12_NC); + LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_LDST16_DTPREL_LO12); + LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_LDST16_DTPREL_LO12_NC); + LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_LDST32_DTPREL_LO12); + LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_LDST32_DTPREL_LO12_NC); + LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_LDST64_DTPREL_LO12); + LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_LDST64_DTPREL_LO12_NC); + LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSIE_MOVW_GOTTPREL_G1); + LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSIE_MOVW_GOTTPREL_G0_NC); + LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21); + LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC); + LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSIE_LD_GOTTPREL_PREL19); + LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_MOVW_TPREL_G2); + LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_MOVW_TPREL_G1); + LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_MOVW_TPREL_G1_NC); + LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_MOVW_TPREL_G0); + LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_MOVW_TPREL_G0_NC); + LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_ADD_TPREL_HI12); + LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_ADD_TPREL_LO12); + LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_ADD_TPREL_LO12_NC); + LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_LDST8_TPREL_LO12); + LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_LDST8_TPREL_LO12_NC); + LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_LDST16_TPREL_LO12); + LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_LDST16_TPREL_LO12_NC); + LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_LDST32_TPREL_LO12); + LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_LDST32_TPREL_LO12_NC); + LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_LDST64_TPREL_LO12); + LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_LDST64_TPREL_LO12_NC); + LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSDESC_ADR_PAGE); + LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSDESC_LD64_LO12_NC); + LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSDESC_ADD_LO12_NC); + LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSDESC_CALL); + + default: + res = "Unknown"; + } + break; case ELF::EM_ARM: switch (type) { LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_NONE); @@ -1892,10 +1954,9 @@ error_code ELFObjectFile<target_endianness, max_alignment, is64Bits> #undef LLVM_ELF_SWITCH_RELOC_TYPE_NAME -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -error_code ELFObjectFile<target_endianness, max_alignment, is64Bits> - ::getRelocationAdditionalInfo(DataRefImpl Rel, - int64_t &Result) const { +template<class ELFT> +error_code ELFObjectFile<ELFT>::getRelocationAdditionalInfo( + DataRefImpl Rel, int64_t &Result) const { const Elf_Shdr *sec = getSection(Rel.w.b); switch (sec->sh_type) { default : @@ -1911,10 +1972,9 @@ error_code ELFObjectFile<target_endianness, max_alignment, is64Bits> } } -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -error_code ELFObjectFile<target_endianness, max_alignment, is64Bits> - ::getRelocationValueString(DataRefImpl Rel, - SmallVectorImpl<char> &Result) const { +template<class ELFT> +error_code ELFObjectFile<ELFT>::getRelocationValueString( + DataRefImpl Rel, SmallVectorImpl<char> &Result) const { const Elf_Shdr *sec = 
getSection(Rel.w.b); uint8_t type; StringRef res; @@ -1969,6 +2029,7 @@ error_code ELFObjectFile<target_endianness, max_alignment, is64Bits> res = "Unknown"; } break; + case ELF::EM_AARCH64: case ELF::EM_ARM: case ELF::EM_HEXAGON: res = symname; @@ -1982,20 +2043,21 @@ error_code ELFObjectFile<target_endianness, max_alignment, is64Bits> } // Verify that the last byte in the string table in a null. -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -void ELFObjectFile<target_endianness, max_alignment, is64Bits> - ::VerifyStrTab(const Elf_Shdr *sh) const { +template<class ELFT> +void ELFObjectFile<ELFT>::VerifyStrTab(const Elf_Shdr *sh) const { const char *strtab = (const char*)base() + sh->sh_offset; if (strtab[sh->sh_size - 1] != 0) // FIXME: Proper error handling. report_fatal_error("String table must end with a null terminator!"); } -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -ELFObjectFile<target_endianness, max_alignment, is64Bits> - ::ELFObjectFile(MemoryBuffer *Object, error_code &ec) - : ObjectFile(getELFType(target_endianness == support::little, is64Bits), - Object, ec) +template<class ELFT> +ELFObjectFile<ELFT>::ELFObjectFile(MemoryBuffer *Object, error_code &ec) + : ObjectFile(getELFType( + static_cast<endianness>(ELFT::TargetEndianness) == support::little, + ELFT::Is64Bits), + Object, + ec) , isDyldELFObject(false) , SectionHeaderTable(0) , dot_shstrtab_sec(0) @@ -2153,9 +2215,8 @@ ELFObjectFile<target_endianness, max_alignment, is64Bits> } // Get the symbol table index in the symtab section given a symbol -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -uint64_t ELFObjectFile<target_endianness, max_alignment, is64Bits> - ::getSymbolIndex(const Elf_Sym *Sym) const { +template<class ELFT> +uint64_t ELFObjectFile<ELFT>::getSymbolIndex(const Elf_Sym *Sym) const { assert(SymbolTableSections.size() == 1 && "Only one symbol table supported!"); const Elf_Shdr *SymTab = *SymbolTableSections.begin(); uintptr_t SymLoc = uintptr_t(Sym); @@ -2167,9 +2228,8 @@ uint64_t ELFObjectFile<target_endianness, max_alignment, is64Bits> return SymOffset / SymTab->sh_entsize; } -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -symbol_iterator ELFObjectFile<target_endianness, max_alignment, is64Bits> - ::begin_symbols() const { +template<class ELFT> +symbol_iterator ELFObjectFile<ELFT>::begin_symbols() const { DataRefImpl SymbolData; if (SymbolTableSections.size() <= 1) { SymbolData.d.a = std::numeric_limits<uint32_t>::max(); @@ -2181,18 +2241,16 @@ symbol_iterator ELFObjectFile<target_endianness, max_alignment, is64Bits> return symbol_iterator(SymbolRef(SymbolData, this)); } -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -symbol_iterator ELFObjectFile<target_endianness, max_alignment, is64Bits> - ::end_symbols() const { +template<class ELFT> +symbol_iterator ELFObjectFile<ELFT>::end_symbols() const { DataRefImpl SymbolData; SymbolData.d.a = std::numeric_limits<uint32_t>::max(); SymbolData.d.b = std::numeric_limits<uint32_t>::max(); return symbol_iterator(SymbolRef(SymbolData, this)); } -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -symbol_iterator ELFObjectFile<target_endianness, max_alignment, is64Bits> - ::begin_dynamic_symbols() const { +template<class ELFT> +symbol_iterator ELFObjectFile<ELFT>::begin_dynamic_symbols() const { DataRefImpl SymbolData; if (SymbolTableSections[0] == NULL) { 
SymbolData.d.a = std::numeric_limits<uint32_t>::max(); @@ -2204,26 +2262,23 @@ symbol_iterator ELFObjectFile<target_endianness, max_alignment, is64Bits> return symbol_iterator(SymbolRef(SymbolData, this)); } -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -symbol_iterator ELFObjectFile<target_endianness, max_alignment, is64Bits> - ::end_dynamic_symbols() const { +template<class ELFT> +symbol_iterator ELFObjectFile<ELFT>::end_dynamic_symbols() const { DataRefImpl SymbolData; SymbolData.d.a = std::numeric_limits<uint32_t>::max(); SymbolData.d.b = std::numeric_limits<uint32_t>::max(); return symbol_iterator(SymbolRef(SymbolData, this)); } -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -section_iterator ELFObjectFile<target_endianness, max_alignment, is64Bits> - ::begin_sections() const { +template<class ELFT> +section_iterator ELFObjectFile<ELFT>::begin_sections() const { DataRefImpl ret; ret.p = reinterpret_cast<intptr_t>(base() + Header->e_shoff); return section_iterator(SectionRef(ret, this)); } -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -section_iterator ELFObjectFile<target_endianness, max_alignment, is64Bits> - ::end_sections() const { +template<class ELFT> +section_iterator ELFObjectFile<ELFT>::end_sections() const { DataRefImpl ret; ret.p = reinterpret_cast<intptr_t>(base() + Header->e_shoff @@ -2231,58 +2286,46 @@ section_iterator ELFObjectFile<target_endianness, max_alignment, is64Bits> return section_iterator(SectionRef(ret, this)); } -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -typename ELFObjectFile<target_endianness, max_alignment, is64Bits>::dyn_iterator -ELFObjectFile<target_endianness, max_alignment, is64Bits> - ::begin_dynamic_table() const { - DataRefImpl DynData; - if (dot_dynamic_sec == NULL || dot_dynamic_sec->sh_size == 0) { - DynData.d.a = std::numeric_limits<uint32_t>::max(); - } else { - DynData.d.a = 0; - } - return dyn_iterator(DynRef(DynData, this)); -} - -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -typename ELFObjectFile<target_endianness, max_alignment, is64Bits>::dyn_iterator -ELFObjectFile<target_endianness, max_alignment, is64Bits> - ::end_dynamic_table() const { - DataRefImpl DynData; - DynData.d.a = std::numeric_limits<uint32_t>::max(); - return dyn_iterator(DynRef(DynData, this)); -} - -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -error_code ELFObjectFile<target_endianness, max_alignment, is64Bits> - ::getDynNext(DataRefImpl DynData, - DynRef &Result) const { - ++DynData.d.a; - - // Check to see if we are at the end of .dynamic - if (DynData.d.a >= dot_dynamic_sec->getEntityCount()) { - // We are at the end. Return the terminator. 
- DynData.d.a = std::numeric_limits<uint32_t>::max(); +template<class ELFT> +typename ELFObjectFile<ELFT>::Elf_Dyn_iterator +ELFObjectFile<ELFT>::begin_dynamic_table() const { + if (dot_dynamic_sec) + return Elf_Dyn_iterator(dot_dynamic_sec->sh_entsize, + (const char *)base() + dot_dynamic_sec->sh_offset); + return Elf_Dyn_iterator(0, 0); +} + +template<class ELFT> +typename ELFObjectFile<ELFT>::Elf_Dyn_iterator +ELFObjectFile<ELFT>::end_dynamic_table(bool NULLEnd) const { + if (dot_dynamic_sec) { + Elf_Dyn_iterator Ret(dot_dynamic_sec->sh_entsize, + (const char *)base() + dot_dynamic_sec->sh_offset + + dot_dynamic_sec->sh_size); + + if (NULLEnd) { + Elf_Dyn_iterator Start = begin_dynamic_table(); + for (; Start != Ret && Start->getTag() != ELF::DT_NULL; ++Start) + ; + // Include the DT_NULL. + if (Start != Ret) + ++Start; + Ret = Start; + } + return Ret; } - - Result = DynRef(DynData, this); - return object_error::success; + return Elf_Dyn_iterator(0, 0); } -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -StringRef -ELFObjectFile<target_endianness, max_alignment, is64Bits>::getLoadName() const { +template<class ELFT> +StringRef ELFObjectFile<ELFT>::getLoadName() const { if (!dt_soname) { // Find the DT_SONAME entry - dyn_iterator it = begin_dynamic_table(); - dyn_iterator ie = end_dynamic_table(); - error_code ec; - while (it != ie) { + Elf_Dyn_iterator it = begin_dynamic_table(); + Elf_Dyn_iterator ie = end_dynamic_table(); + for (; it != ie; ++it) { if (it->getTag() == ELF::DT_SONAME) break; - it.increment(ec); - if (ec) - report_fatal_error("dynamic table iteration failed"); } if (it != ie) { if (dot_dynstr_sec == NULL) @@ -2295,57 +2338,46 @@ ELFObjectFile<target_endianness, max_alignment, is64Bits>::getLoadName() const { return dt_soname; } -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -library_iterator ELFObjectFile<target_endianness, max_alignment, is64Bits> - ::begin_libraries_needed() const { +template<class ELFT> +library_iterator ELFObjectFile<ELFT>::begin_libraries_needed() const { // Find the first DT_NEEDED entry - dyn_iterator i = begin_dynamic_table(); - dyn_iterator e = end_dynamic_table(); - error_code ec; - while (i != e) { + Elf_Dyn_iterator i = begin_dynamic_table(); + Elf_Dyn_iterator e = end_dynamic_table(); + for (; i != e; ++i) { if (i->getTag() == ELF::DT_NEEDED) break; - i.increment(ec); - if (ec) - report_fatal_error("dynamic table iteration failed"); } - // Use the same DataRefImpl format as DynRef. - return library_iterator(LibraryRef(i->getRawDataRefImpl(), this)); + + DataRefImpl DRI; + DRI.p = reinterpret_cast<uintptr_t>(i.get()); + return library_iterator(LibraryRef(DRI, this)); } -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -error_code ELFObjectFile<target_endianness, max_alignment, is64Bits> - ::getLibraryNext(DataRefImpl Data, - LibraryRef &Result) const { +template<class ELFT> +error_code ELFObjectFile<ELFT>::getLibraryNext(DataRefImpl Data, + LibraryRef &Result) const { // Use the same DataRefImpl format as DynRef. - dyn_iterator i = dyn_iterator(DynRef(Data, this)); - dyn_iterator e = end_dynamic_table(); + Elf_Dyn_iterator i = Elf_Dyn_iterator(dot_dynamic_sec->sh_entsize, + reinterpret_cast<const char *>(Data.p)); + Elf_Dyn_iterator e = end_dynamic_table(); // Skip the current dynamic table entry. 
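  // (Data.p still points at the entry handed out by the previous call, so
  //  advance one step before scanning for the next DT_NEEDED tag.)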
- error_code ec; - if (i != e) { - i.increment(ec); - // TODO: proper error handling - if (ec) - report_fatal_error("dynamic table iteration failed"); - } + ++i; // Find the next DT_NEEDED entry. - while (i != e) { - if (i->getTag() == ELF::DT_NEEDED) - break; - i.increment(ec); - if (ec) - report_fatal_error("dynamic table iteration failed"); - } - Result = LibraryRef(i->getRawDataRefImpl(), this); + for (; i != e && i->getTag() != ELF::DT_NEEDED; ++i); + + DataRefImpl DRI; + DRI.p = reinterpret_cast<uintptr_t>(i.get()); + Result = LibraryRef(DRI, this); return object_error::success; } -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -error_code ELFObjectFile<target_endianness, max_alignment, is64Bits> - ::getLibraryPath(DataRefImpl Data, StringRef &Res) const { - dyn_iterator i = dyn_iterator(DynRef(Data, this)); +template<class ELFT> +error_code ELFObjectFile<ELFT>::getLibraryPath(DataRefImpl Data, + StringRef &Res) const { + Elf_Dyn_iterator i = Elf_Dyn_iterator(dot_dynamic_sec->sh_entsize, + reinterpret_cast<const char *>(Data.p)); if (i == end_dynamic_table()) report_fatal_error("getLibraryPath() called on iterator end"); @@ -2363,23 +2395,21 @@ error_code ELFObjectFile<target_endianness, max_alignment, is64Bits> return object_error::success; } -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -library_iterator ELFObjectFile<target_endianness, max_alignment, is64Bits> - ::end_libraries_needed() const { - dyn_iterator e = end_dynamic_table(); - // Use the same DataRefImpl format as DynRef. - return library_iterator(LibraryRef(e->getRawDataRefImpl(), this)); +template<class ELFT> +library_iterator ELFObjectFile<ELFT>::end_libraries_needed() const { + Elf_Dyn_iterator e = end_dynamic_table(); + DataRefImpl DRI; + DRI.p = reinterpret_cast<uintptr_t>(e.get()); + return library_iterator(LibraryRef(DRI, this)); } -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -uint8_t ELFObjectFile<target_endianness, max_alignment, is64Bits> - ::getBytesInAddress() const { - return is64Bits ? 8 : 4; +template<class ELFT> +uint8_t ELFObjectFile<ELFT>::getBytesInAddress() const { + return ELFT::Is64Bits ? 
8 : 4; } -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -StringRef ELFObjectFile<target_endianness, max_alignment, is64Bits> - ::getFileFormatName() const { +template<class ELFT> +StringRef ELFObjectFile<ELFT>::getFileFormatName() const { switch(Header->e_ident[ELF::EI_CLASS]) { case ELF::ELFCLASS32: switch(Header->e_machine) { @@ -2391,6 +2421,8 @@ StringRef ELFObjectFile<target_endianness, max_alignment, is64Bits> return "ELF32-arm"; case ELF::EM_HEXAGON: return "ELF32-hexagon"; + case ELF::EM_MIPS: + return "ELF32-mips"; default: return "ELF32-unknown"; } @@ -2400,6 +2432,8 @@ StringRef ELFObjectFile<target_endianness, max_alignment, is64Bits> return "ELF64-i386"; case ELF::EM_X86_64: return "ELF64-x86-64"; + case ELF::EM_AARCH64: + return "ELF64-aarch64"; case ELF::EM_PPC64: return "ELF64-ppc64"; default: @@ -2411,20 +2445,21 @@ StringRef ELFObjectFile<target_endianness, max_alignment, is64Bits> } } -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -unsigned ELFObjectFile<target_endianness, max_alignment, is64Bits> - ::getArch() const { +template<class ELFT> +unsigned ELFObjectFile<ELFT>::getArch() const { switch(Header->e_machine) { case ELF::EM_386: return Triple::x86; case ELF::EM_X86_64: return Triple::x86_64; + case ELF::EM_AARCH64: + return Triple::aarch64; case ELF::EM_ARM: return Triple::arm; case ELF::EM_HEXAGON: return Triple::hexagon; case ELF::EM_MIPS: - return (target_endianness == support::little) ? + return (ELFT::TargetEndianness == support::little) ? Triple::mipsel : Triple::mips; case ELF::EM_PPC64: return Triple::ppc64; @@ -2433,9 +2468,8 @@ unsigned ELFObjectFile<target_endianness, max_alignment, is64Bits> } } -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -uint64_t ELFObjectFile<target_endianness, max_alignment, is64Bits> - ::getNumSections() const { +template<class ELFT> +uint64_t ELFObjectFile<ELFT>::getNumSections() const { assert(Header && "Header not initialized!"); if (Header->e_shnum == ELF::SHN_UNDEF) { assert(SectionHeaderTable && "SectionHeaderTable not initialized!"); @@ -2444,10 +2478,9 @@ uint64_t ELFObjectFile<target_endianness, max_alignment, is64Bits> return Header->e_shnum; } -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> +template<class ELFT> uint64_t -ELFObjectFile<target_endianness, max_alignment, is64Bits> - ::getStringTableIndex() const { +ELFObjectFile<ELFT>::getStringTableIndex() const { if (Header->e_shnum == ELF::SHN_UNDEF) { if (Header->e_shstrndx == ELF::SHN_HIRESERVE) return SectionHeaderTable->sh_link; @@ -2457,62 +2490,44 @@ ELFObjectFile<target_endianness, max_alignment, is64Bits> return Header->e_shstrndx; } -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> +template<class ELFT> template<typename T> inline const T * -ELFObjectFile<target_endianness, max_alignment, is64Bits> - ::getEntry(uint16_t Section, uint32_t Entry) const { +ELFObjectFile<ELFT>::getEntry(uint16_t Section, uint32_t Entry) const { return getEntry<T>(getSection(Section), Entry); } -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> +template<class ELFT> template<typename T> inline const T * -ELFObjectFile<target_endianness, max_alignment, is64Bits> - ::getEntry(const Elf_Shdr * Section, uint32_t Entry) const { +ELFObjectFile<ELFT>::getEntry(const Elf_Shdr * Section, uint32_t Entry) const { return reinterpret_cast<const T *>( base() + Section->sh_offset + (Entry * 
Section->sh_entsize)); } -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -const typename ELFObjectFile<target_endianness, max_alignment, is64Bits> - ::Elf_Sym * -ELFObjectFile<target_endianness, max_alignment, is64Bits> - ::getSymbol(DataRefImpl Symb) const { +template<class ELFT> +const typename ELFObjectFile<ELFT>::Elf_Sym * +ELFObjectFile<ELFT>::getSymbol(DataRefImpl Symb) const { return getEntry<Elf_Sym>(SymbolTableSections[Symb.d.b], Symb.d.a); } -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -const typename ELFObjectFile<target_endianness, max_alignment, is64Bits> - ::Elf_Dyn * -ELFObjectFile<target_endianness, max_alignment, is64Bits> - ::getDyn(DataRefImpl DynData) const { - return getEntry<Elf_Dyn>(dot_dynamic_sec, DynData.d.a); -} - -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -const typename ELFObjectFile<target_endianness, max_alignment, is64Bits> - ::Elf_Rel * -ELFObjectFile<target_endianness, max_alignment, is64Bits> - ::getRel(DataRefImpl Rel) const { +template<class ELFT> +const typename ELFObjectFile<ELFT>::Elf_Rel * +ELFObjectFile<ELFT>::getRel(DataRefImpl Rel) const { return getEntry<Elf_Rel>(Rel.w.b, Rel.w.c); } -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -const typename ELFObjectFile<target_endianness, max_alignment, is64Bits> - ::Elf_Rela * -ELFObjectFile<target_endianness, max_alignment, is64Bits> - ::getRela(DataRefImpl Rela) const { +template<class ELFT> +const typename ELFObjectFile<ELFT>::Elf_Rela * +ELFObjectFile<ELFT>::getRela(DataRefImpl Rela) const { return getEntry<Elf_Rela>(Rela.w.b, Rela.w.c); } -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -const typename ELFObjectFile<target_endianness, max_alignment, is64Bits> - ::Elf_Shdr * -ELFObjectFile<target_endianness, max_alignment, is64Bits> - ::getSection(DataRefImpl Symb) const { +template<class ELFT> +const typename ELFObjectFile<ELFT>::Elf_Shdr * +ELFObjectFile<ELFT>::getSection(DataRefImpl Symb) const { const Elf_Shdr *sec = getSection(Symb.d.b); if (sec->sh_type != ELF::SHT_SYMTAB || sec->sh_type != ELF::SHT_DYNSYM) // FIXME: Proper error handling. 
@@ -2520,11 +2535,9 @@ ELFObjectFile<target_endianness, max_alignment, is64Bits> return sec; } -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -const typename ELFObjectFile<target_endianness, max_alignment, is64Bits> - ::Elf_Shdr * -ELFObjectFile<target_endianness, max_alignment, is64Bits> - ::getSection(uint32_t index) const { +template<class ELFT> +const typename ELFObjectFile<ELFT>::Elf_Shdr * +ELFObjectFile<ELFT>::getSection(uint32_t index) const { if (index == 0) return 0; if (!SectionHeaderTable || index >= getNumSections()) @@ -2536,17 +2549,15 @@ ELFObjectFile<target_endianness, max_alignment, is64Bits> + (index * Header->e_shentsize)); } -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -const char *ELFObjectFile<target_endianness, max_alignment, is64Bits> - ::getString(uint32_t section, - ELF::Elf32_Word offset) const { +template<class ELFT> +const char *ELFObjectFile<ELFT>::getString(uint32_t section, + ELF::Elf32_Word offset) const { return getString(getSection(section), offset); } -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -const char *ELFObjectFile<target_endianness, max_alignment, is64Bits> - ::getString(const Elf_Shdr *section, - ELF::Elf32_Word offset) const { +template<class ELFT> +const char *ELFObjectFile<ELFT>::getString(const Elf_Shdr *section, + ELF::Elf32_Word offset) const { assert(section && section->sh_type == ELF::SHT_STRTAB && "Invalid section!"); if (offset >= section->sh_size) // FIXME: Proper error handling. @@ -2554,11 +2565,10 @@ const char *ELFObjectFile<target_endianness, max_alignment, is64Bits> return (const char *)base() + section->sh_offset + offset; } -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -error_code ELFObjectFile<target_endianness, max_alignment, is64Bits> - ::getSymbolName(const Elf_Shdr *section, - const Elf_Sym *symb, - StringRef &Result) const { +template<class ELFT> +error_code ELFObjectFile<ELFT>::getSymbolName(const Elf_Shdr *section, + const Elf_Sym *symb, + StringRef &Result) const { if (symb->st_name == 0) { const Elf_Shdr *section = getSection(symb); if (!section) @@ -2578,20 +2588,18 @@ error_code ELFObjectFile<target_endianness, max_alignment, is64Bits> return object_error::success; } -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -error_code ELFObjectFile<target_endianness, max_alignment, is64Bits> - ::getSectionName(const Elf_Shdr *section, - StringRef &Result) const { +template<class ELFT> +error_code ELFObjectFile<ELFT>::getSectionName(const Elf_Shdr *section, + StringRef &Result) const { Result = StringRef(getString(dot_shstrtab_sec, section->sh_name)); return object_error::success; } -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -error_code ELFObjectFile<target_endianness, max_alignment, is64Bits> - ::getSymbolVersion(const Elf_Shdr *section, - const Elf_Sym *symb, - StringRef &Version, - bool &IsDefault) const { +template<class ELFT> +error_code ELFObjectFile<ELFT>::getSymbolVersion(const Elf_Shdr *section, + const Elf_Sym *symb, + StringRef &Version, + bool &IsDefault) const { // Handle non-dynamic symbols. 
if (section != SymbolTableSections[0]) { // Non-dynamic symbols can have versions in their names @@ -2669,54 +2677,6 @@ error_code ELFObjectFile<target_endianness, max_alignment, is64Bits> return object_error::success; } -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -inline DynRefImpl<target_endianness, max_alignment, is64Bits> - ::DynRefImpl(DataRefImpl DynP, const OwningType *Owner) - : DynPimpl(DynP) - , OwningObject(Owner) {} - -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -inline bool DynRefImpl<target_endianness, max_alignment, is64Bits> - ::operator==(const DynRefImpl &Other) const { - return DynPimpl == Other.DynPimpl; -} - -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -inline bool DynRefImpl<target_endianness, max_alignment, is64Bits> - ::operator <(const DynRefImpl &Other) const { - return DynPimpl < Other.DynPimpl; -} - -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -inline error_code DynRefImpl<target_endianness, max_alignment, is64Bits> - ::getNext(DynRefImpl &Result) const { - return OwningObject->getDynNext(DynPimpl, Result); -} - -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -inline int64_t DynRefImpl<target_endianness, max_alignment, is64Bits> - ::getTag() const { - return OwningObject->getDyn(DynPimpl)->d_tag; -} - -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -inline uint64_t DynRefImpl<target_endianness, max_alignment, is64Bits> - ::getVal() const { - return OwningObject->getDyn(DynPimpl)->d_un.d_val; -} - -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -inline uint64_t DynRefImpl<target_endianness, max_alignment, is64Bits> - ::getPtr() const { - return OwningObject->getDyn(DynPimpl)->d_un.d_ptr; -} - -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -inline DataRefImpl DynRefImpl<target_endianness, max_alignment, is64Bits> - ::getRawDataRefImpl() const { - return DynPimpl; -} - /// This is a generic interface for retrieving GNU symbol version /// information from an ELFObjectFile. 
static inline error_code GetELFSymbolVersion(const ObjectFile *Obj, @@ -2724,23 +2684,23 @@ static inline error_code GetELFSymbolVersion(const ObjectFile *Obj, StringRef &Version, bool &IsDefault) { // Little-endian 32-bit - if (const ELFObjectFile<support::little, 4, false> *ELFObj = - dyn_cast<ELFObjectFile<support::little, 4, false> >(Obj)) + if (const ELFObjectFile<ELFType<support::little, 4, false> > *ELFObj = + dyn_cast<ELFObjectFile<ELFType<support::little, 4, false> > >(Obj)) return ELFObj->getSymbolVersion(Sym, Version, IsDefault); // Big-endian 32-bit - if (const ELFObjectFile<support::big, 4, false> *ELFObj = - dyn_cast<ELFObjectFile<support::big, 4, false> >(Obj)) + if (const ELFObjectFile<ELFType<support::big, 4, false> > *ELFObj = + dyn_cast<ELFObjectFile<ELFType<support::big, 4, false> > >(Obj)) return ELFObj->getSymbolVersion(Sym, Version, IsDefault); // Little-endian 64-bit - if (const ELFObjectFile<support::little, 8, true> *ELFObj = - dyn_cast<ELFObjectFile<support::little, 8, true> >(Obj)) + if (const ELFObjectFile<ELFType<support::little, 8, true> > *ELFObj = + dyn_cast<ELFObjectFile<ELFType<support::little, 8, true> > >(Obj)) return ELFObj->getSymbolVersion(Sym, Version, IsDefault); // Big-endian 64-bit - if (const ELFObjectFile<support::big, 8, true> *ELFObj = - dyn_cast<ELFObjectFile<support::big, 8, true> >(Obj)) + if (const ELFObjectFile<ELFType<support::big, 8, true> > *ELFObj = + dyn_cast<ELFObjectFile<ELFType<support::big, 8, true> > >(Obj)) return ELFObj->getSymbolVersion(Sym, Version, IsDefault); llvm_unreachable("Object passed to GetELFSymbolVersion() is not ELF"); diff --git a/include/llvm/Object/MachOFormat.h b/include/llvm/Object/MachOFormat.h index a17d58d..ffca391 100644 --- a/include/llvm/Object/MachOFormat.h +++ b/include/llvm/Object/MachOFormat.h @@ -64,7 +64,10 @@ namespace mach { CSARM_V7 = 9, CSARM_V7F = 10, CSARM_V7S = 11, - CSARM_V7K = 12 + CSARM_V7K = 12, + CSARM_V6M = 14, + CSARM_V7M = 15, + CSARM_V7EM = 16 }; /// \brief PowerPC Machine Subtypes. @@ -145,7 +148,8 @@ namespace macho { LCT_CodeSignature = 0x1d, LCT_SegmentSplitInfo = 0x1e, LCT_FunctionStarts = 0x26, - LCT_DataInCode = 0x29 + LCT_DataInCode = 0x29, + LCT_LinkerOptions = 0x2D }; /// \brief Load command structure. @@ -233,6 +237,14 @@ namespace macho { uint32_t DataSize; }; + struct LinkerOptionsLoadCommand { + uint32_t Type; + uint32_t Size; + uint32_t Count; + // Load command is followed by Count number of zero-terminated UTF8 strings, + // and then zero-filled to be 4-byte aligned. 
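+    // For example, a (hypothetical) payload with Count = 2 holding the
+    // bytes "-framework\0Foundation\0" is 22 bytes of string data followed
+    // by two NUL pad bytes to reach the next 4-byte boundary.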
+ }; + /// @} /// @name Section Data /// @{ diff --git a/include/llvm/Object/MachOObject.h b/include/llvm/Object/MachOObject.h index e32a85d..9e4ab19 100644 --- a/include/llvm/Object/MachOObject.h +++ b/include/llvm/Object/MachOObject.h @@ -153,6 +153,9 @@ public: void ReadLinkeditDataLoadCommand( const LoadCommandInfo &LCI, InMemoryStruct<macho::LinkeditDataLoadCommand> &Res) const; + void ReadLinkerOptionsLoadCommand( + const LoadCommandInfo &LCI, + InMemoryStruct<macho::LinkerOptionsLoadCommand> &Res) const; void ReadIndirectSymbolTableEntry( const macho::DysymtabLoadCommand &DLC, unsigned Index, diff --git a/include/llvm/Object/ObjectFile.h b/include/llvm/Object/ObjectFile.h index e63f554..6a66653 100644 --- a/include/llvm/Object/ObjectFile.h +++ b/include/llvm/Object/ObjectFile.h @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_OBJECT_OBJECT_FILE_H -#define LLVM_OBJECT_OBJECT_FILE_H +#ifndef LLVM_OBJECT_OBJECTFILE_H +#define LLVM_OBJECT_OBJECTFILE_H #include "llvm/ADT/StringRef.h" #include "llvm/Object/Binary.h" diff --git a/include/llvm/Object/RelocVisitor.h b/include/llvm/Object/RelocVisitor.h index 2c2dd03..2dcbdf9 100644 --- a/include/llvm/Object/RelocVisitor.h +++ b/include/llvm/Object/RelocVisitor.h @@ -13,13 +13,13 @@ // //===----------------------------------------------------------------------===// -#ifndef _LLVM_OBJECT_RELOCVISITOR -#define _LLVM_OBJECT_RELOCVISITOR +#ifndef LLVM_OBJECT_RELOCVISITOR_H +#define LLVM_OBJECT_RELOCVISITOR_H #include "llvm/ADT/StringRef.h" -#include "llvm/Object/ELF.h" #include "llvm/Object/ObjectFile.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/ELF.h" #include "llvm/Support/raw_ostream.h" namespace llvm { @@ -40,7 +40,7 @@ struct RelocToApply { /// @brief Base class for object file relocation visitors. 
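/// Sketched use (assuming the caller already walks the object's
/// relocations): construct with ObjectFile::getFileFormatName(), hand each
/// relocation's type and resolved symbol value to the visitor, and check
/// error() before applying the returned RelocToApply's Value over Width
/// bytes.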
class RelocVisitor { public: - explicit RelocVisitor(llvm::StringRef FileFormat) + explicit RelocVisitor(StringRef FileFormat) : FileFormat(FileFormat), HasError(false) {} // TODO: Should handle multiple applied relocations via either passing in the @@ -76,14 +76,41 @@ public: HasError = true; return RelocToApply(); } + } else if (FileFormat == "ELF64-ppc64") { + switch (RelocType) { + case llvm::ELF::R_PPC64_ADDR32: + return visitELF_PPC64_ADDR32(R, Value); + default: + HasError = true; + return RelocToApply(); + } + } else if (FileFormat == "ELF32-mips") { + switch (RelocType) { + case llvm::ELF::R_MIPS_32: + return visitELF_MIPS_32(R, Value); + default: + HasError = true; + return RelocToApply(); + } + } else if (FileFormat == "ELF64-aarch64") { + switch (RelocType) { + case llvm::ELF::R_AARCH64_ABS32: + return visitELF_AARCH64_ABS32(R, Value); + case llvm::ELF::R_AARCH64_ABS64: + return visitELF_AARCH64_ABS64(R, Value); + default: + HasError = true; + return RelocToApply(); + } } + HasError = true; return RelocToApply(); } bool error() { return HasError; } private: - llvm::StringRef FileFormat; + StringRef FileFormat; bool HasError; /// Operations @@ -139,6 +166,42 @@ private: int32_t Res = (Value + Addend) & 0xFFFFFFFF; return RelocToApply(Res, 4); } + + /// PPC64 ELF + RelocToApply visitELF_PPC64_ADDR32(RelocationRef R, uint64_t Value) { + int64_t Addend; + R.getAdditionalInfo(Addend); + uint32_t Res = (Value + Addend) & 0xFFFFFFFF; + return RelocToApply(Res, 4); + } + + /// MIPS ELF + RelocToApply visitELF_MIPS_32(RelocationRef R, uint64_t Value) { + int64_t Addend; + R.getAdditionalInfo(Addend); + uint32_t Res = (Value + Addend) & 0xFFFFFFFF; + return RelocToApply(Res, 4); + } + + // AArch64 ELF + RelocToApply visitELF_AARCH64_ABS32(RelocationRef R, uint64_t Value) { + int64_t Addend; + R.getAdditionalInfo(Addend); + int64_t Res = Value + Addend; + + // Overflow check allows for both signed and unsigned interpretation. 
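+      // Res was computed in 64 bits, so any value in [INT32_MIN, UINT32_MAX]
+      // truncates faithfully to the 32-bit field: -1 (signed) and 0xFFFFFFFF
+      // (unsigned) are both accepted and share the same truncated bits.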
+ if (Res < INT32_MIN || Res > UINT32_MAX) + HasError = true; + + return RelocToApply(static_cast<uint32_t>(Res), 4); + } + + RelocToApply visitELF_AARCH64_ABS64(RelocationRef R, uint64_t Value) { + int64_t Addend; + R.getAdditionalInfo(Addend); + return RelocToApply(Value + Addend, 8); + } + }; } diff --git a/include/llvm/Option/Arg.h b/include/llvm/Option/Arg.h index baa4b6a..6b8ed3f 100644 --- a/include/llvm/Option/Arg.h +++ b/include/llvm/Option/Arg.h @@ -12,8 +12,8 @@ /// //===----------------------------------------------------------------------===// -#ifndef LLVM_SUPPORT_ARG_H_ -#define LLVM_SUPPORT_ARG_H_ +#ifndef LLVM_OPTION_ARG_H +#define LLVM_OPTION_ARG_H #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" diff --git a/include/llvm/Option/ArgList.h b/include/llvm/Option/ArgList.h index 378b58a..d3accfe 100644 --- a/include/llvm/Option/ArgList.h +++ b/include/llvm/Option/ArgList.h @@ -7,8 +7,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_SUPPORT_ARGLIST_H_ -#define LLVM_SUPPORT_ARGLIST_H_ +#ifndef LLVM_OPTION_ARGLIST_H +#define LLVM_OPTION_ARGLIST_H #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" diff --git a/include/llvm/Option/OptSpecifier.h b/include/llvm/Option/OptSpecifier.h index 3bc9bb2..02bc6b1 100644 --- a/include/llvm/Option/OptSpecifier.h +++ b/include/llvm/Option/OptSpecifier.h @@ -7,8 +7,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_SUPPORT_OPTSPECIFIER_H -#define LLVM_SUPPORT_OPTSPECIFIER_H +#ifndef LLVM_OPTION_OPTSPECIFIER_H +#define LLVM_OPTION_OPTSPECIFIER_H namespace llvm { namespace opt { @@ -19,7 +19,7 @@ namespace opt { unsigned ID; private: - explicit OptSpecifier(bool); // DO NOT IMPLEMENT + explicit OptSpecifier(bool) LLVM_DELETED_FUNCTION; public: OptSpecifier() : ID(0) {} diff --git a/include/llvm/Option/OptTable.h b/include/llvm/Option/OptTable.h index 930549f..a93acbf 100644 --- a/include/llvm/Option/OptTable.h +++ b/include/llvm/Option/OptTable.h @@ -7,8 +7,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_SUPPORT_OPTTABLE_H -#define LLVM_SUPPORT_OPTTABLE_H +#ifndef LLVM_OPTION_OPTTABLE_H +#define LLVM_OPTION_OPTTABLE_H #include "llvm/ADT/StringSet.h" #include "llvm/Option/OptSpecifier.h" diff --git a/include/llvm/Option/Option.h b/include/llvm/Option/Option.h index ee5eec4..541aa8d 100644 --- a/include/llvm/Option/Option.h +++ b/include/llvm/Option/Option.h @@ -7,8 +7,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_SUPPORT_OPTION_H_ -#define LLVM_SUPPORT_OPTION_H_ +#ifndef LLVM_OPTION_OPTION_H +#define LLVM_OPTION_OPTION_H #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" diff --git a/include/llvm/PassAnalysisSupport.h b/include/llvm/PassAnalysisSupport.h index 1cc5741..a581802 100644 --- a/include/llvm/PassAnalysisSupport.h +++ b/include/llvm/PassAnalysisSupport.h @@ -16,8 +16,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_PASS_ANALYSIS_SUPPORT_H -#define LLVM_PASS_ANALYSIS_SUPPORT_H +#ifndef LLVM_PASSANALYSISSUPPORT_H +#define LLVM_PASSANALYSISSUPPORT_H #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" diff --git a/include/llvm/PassManagers.h b/include/llvm/PassManagers.h index fac7928..7afb0a0 100644 --- a/include/llvm/PassManagers.h +++ b/include/llvm/PassManagers.h @@ -352,7 +352,7 @@ public: return 
PMT_Unknown; } - std::map<AnalysisID, Pass*> *getAvailableAnalysis() { + DenseMap<AnalysisID, Pass*> *getAvailableAnalysis() { return &AvailableAnalysis; } @@ -375,8 +375,7 @@ protected: // Collection of Analysis provided by Parent pass manager and // used by current pass manager. At any time there cannot be more // than PMT_Last active pass managers. - std::map<AnalysisID, Pass *> *InheritedAnalysis[PMT_Last]; - + DenseMap<AnalysisID, Pass *> *InheritedAnalysis[PMT_Last]; /// isPassDebuggingExecutionsOrMore - Return true if -debug-pass=Executions /// or higher is specified. @@ -390,7 +389,7 @@ private: // pass. If a pass requires an analysis which is not available then // the required analysis pass is scheduled to run before the pass itself is // scheduled to run. - std::map<AnalysisID, Pass*> AvailableAnalysis; + DenseMap<AnalysisID, Pass*> AvailableAnalysis; // Collection of higher level analysis used by the pass managed by // this manager. diff --git a/include/llvm/PassSupport.h b/include/llvm/PassSupport.h index 3633f47..ccc7934 100644 --- a/include/llvm/PassSupport.h +++ b/include/llvm/PassSupport.h @@ -18,8 +18,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_PASS_SUPPORT_H -#define LLVM_PASS_SUPPORT_H +#ifndef LLVM_PASSSUPPORT_H +#define LLVM_PASSSUPPORT_H #include "Pass.h" #include "llvm/InitializePasses.h" diff --git a/include/llvm/Support/AlignOf.h b/include/llvm/Support/AlignOf.h index 0894316..bba3424 100644 --- a/include/llvm/Support/AlignOf.h +++ b/include/llvm/Support/AlignOf.h @@ -169,17 +169,20 @@ LLVM_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(128) namespace detail { template <typename T1, - typename T2 = char, typename T3 = char, typename T4 = char> + typename T2 = char, typename T3 = char, typename T4 = char, + typename T5 = char, typename T6 = char, typename T7 = char> class AlignerImpl { - T1 t1; T2 t2; T3 t3; T4 t4; + T1 t1; T2 t2; T3 t3; T4 t4; T5 t5; T6 t6; T7 t7; AlignerImpl(); // Never defined or instantiated. }; template <typename T1, - typename T2 = char, typename T3 = char, typename T4 = char> + typename T2 = char, typename T3 = char, typename T4 = char, + typename T5 = char, typename T6 = char, typename T7 = char> union SizerImpl { - char arr1[sizeof(T1)], arr2[sizeof(T2)], arr3[sizeof(T3)], arr4[sizeof(T4)]; + char arr1[sizeof(T1)], arr2[sizeof(T2)], arr3[sizeof(T3)], arr4[sizeof(T4)], + arr5[sizeof(T5)], arr6[sizeof(T6)], arr7[sizeof(T7)]; }; } // end namespace detail @@ -188,14 +191,14 @@ union SizerImpl { /// /// These types may be arrays, structs, or any other types. The goal is to /// expose a char array buffer member which can be used as suitable storage for -/// a placement new of any of these types. Support for more than four types can +/// a placement new of any of these types. Support for more than seven types can /// be added at the cost of more boilerplate.
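/// A minimal usage sketch (the member types here are arbitrary examples):
/// \code
///   AlignedCharArrayUnion<int, double> Buf;
///   new (Buf.buffer) double(1.0); // placement-construct one member type
///   double *D = reinterpret_cast<double *>(Buf.buffer);
///   D->~double(); // destroy before reusing the storage for another type
/// \endcode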
template <typename T1, - typename T2 = char, typename T3 = char, typename T4 = char> -struct AlignedCharArrayUnion : - llvm::AlignedCharArray<AlignOf<detail::AlignerImpl<T1, T2, T3, T4> > - ::Alignment, - sizeof(detail::SizerImpl<T1, T2, T3, T4>)> { + typename T2 = char, typename T3 = char, typename T4 = char, + typename T5 = char, typename T6 = char, typename T7 = char> +struct AlignedCharArrayUnion : llvm::AlignedCharArray< + AlignOf<detail::AlignerImpl<T1, T2, T3, T4, T5, T6, T7> >::Alignment, + sizeof(detail::SizerImpl<T1, T2, T3, T4, T5, T6, T7>)> { }; } // end namespace llvm #endif diff --git a/include/llvm/Support/ArrayRecycler.h b/include/llvm/Support/ArrayRecycler.h index 0fbe79c..c7e0cba 100644 --- a/include/llvm/Support/ArrayRecycler.h +++ b/include/llvm/Support/ArrayRecycler.h @@ -12,8 +12,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_SUPPORT_ARRAY_RECYCLER_H -#define LLVM_SUPPORT_ARRAY_RECYCLER_H +#ifndef LLVM_SUPPORT_ARRAYRECYCLER_H +#define LLVM_SUPPORT_ARRAYRECYCLER_H #include "llvm/ADT/SmallVector.h" #include "llvm/Support/MathExtras.h" diff --git a/include/llvm/Support/Atomic.h b/include/llvm/Support/Atomic.h index 1a6c606..9ec23e8 100644 --- a/include/llvm/Support/Atomic.h +++ b/include/llvm/Support/Atomic.h @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_SYSTEM_ATOMIC_H -#define LLVM_SYSTEM_ATOMIC_H +#ifndef LLVM_SUPPORT_ATOMIC_H +#define LLVM_SUPPORT_ATOMIC_H #include "llvm/Support/DataTypes.h" diff --git a/include/llvm/Support/COFF.h b/include/llvm/Support/COFF.h index ba8adb0..02c44cd 100644 --- a/include/llvm/Support/COFF.h +++ b/include/llvm/Support/COFF.h @@ -20,8 +20,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_SUPPORT_WIN_COFF_H -#define LLVM_SUPPORT_WIN_COFF_H +#ifndef LLVM_SUPPORT_COFF_H +#define LLVM_SUPPORT_COFF_H #include "llvm/Support/DataTypes.h" #include <cassert> diff --git a/include/llvm/Support/Casting.h b/include/llvm/Support/Casting.h index 0c71882..80f09db 100644 --- a/include/llvm/Support/Casting.h +++ b/include/llvm/Support/Casting.h @@ -55,8 +55,8 @@ struct isa_impl { /// \brief Always allow upcasts, and perform no dynamic check for them. 
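/// (For an upcast the answer is known at compile time: with Instruction
/// deriving from Value, isa<Value>(SomeInstruction) is always true, so this
/// specialization skips the dynamic check entirely.)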
template <typename To, typename From> struct isa_impl<To, From, - typename llvm::enable_if_c< - llvm::is_base_of<To, From>::value + typename enable_if< + llvm::is_base_of<To, From> >::type > { static inline bool doit(const From &) { return true; } @@ -204,12 +204,35 @@ template<class To, class FromTy> struct cast_convert_val<To,FromTy,FromTy> { // cast<Instruction>(myVal)->getParent() // template <class X, class Y> -inline typename cast_retty<X, Y>::ret_type cast(const Y &Val) { +inline typename enable_if_c< + !is_same<Y, typename simplify_type<Y>::SimpleType>::value, + typename cast_retty<X, Y>::ret_type +>::type cast(const Y &Val) { assert(isa<X>(Val) && "cast<Ty>() argument of incompatible type!"); return cast_convert_val<X, Y, typename simplify_type<Y>::SimpleType>::doit(Val); } +template <class X, class Y> +inline typename enable_if< + is_same<Y, typename simplify_type<Y>::SimpleType>, + typename cast_retty<X, Y>::ret_type +>::type cast(Y &Val) { + assert(isa<X>(Val) && "cast<Ty>() argument of incompatible type!"); + return cast_convert_val<X, Y, + typename simplify_type<Y>::SimpleType>::doit(Val); +} + +template <class X, class Y> +inline typename enable_if< + is_same<Y, typename simplify_type<Y>::SimpleType>, + typename cast_retty<X, Y*>::ret_type +>::type cast(Y *Val) { + assert(isa<X>(Val) && "cast<Ty>() argument of incompatible type!"); + return cast_convert_val<X, Y*, + typename simplify_type<Y*>::SimpleType>::doit(Val); +} + // cast_or_null<X> - Functionally identical to cast, except that a null value is // accepted. // @@ -230,8 +253,27 @@ inline typename cast_retty<X, Y*>::ret_type cast_or_null(Y *Val) { // template <class X, class Y> -inline typename cast_retty<X, Y>::ret_type dyn_cast(const Y &Val) { - return isa<X>(Val) ? cast<X, Y>(Val) : 0; +inline typename enable_if_c< + !is_same<Y, typename simplify_type<Y>::SimpleType>::value, + typename cast_retty<X, Y>::ret_type +>::type dyn_cast(const Y &Val) { + return isa<X>(Val) ? cast<X>(Val) : 0; +} + +template <class X, class Y> +inline typename enable_if< + is_same<Y, typename simplify_type<Y>::SimpleType>, + typename cast_retty<X, Y>::ret_type +>::type dyn_cast(Y &Val) { + return isa<X>(Val) ? cast<X>(Val) : 0; +} + +template <class X, class Y> +inline typename enable_if< + is_same<Y, typename simplify_type<Y>::SimpleType>, + typename cast_retty<X, Y*>::ret_type +>::type dyn_cast(Y *Val) { + return isa<X>(Val) ? 
cast<X>(Val) : 0; } // dyn_cast_or_null<X> - Functionally identical to dyn_cast, except that a null diff --git a/include/llvm/Support/CommandLine.h b/include/llvm/Support/CommandLine.h index 0ab39fc..bf7823e 100644 --- a/include/llvm/Support/CommandLine.h +++ b/include/llvm/Support/CommandLine.h @@ -469,8 +469,7 @@ public: template<class Opt> void apply(Opt &O) const { - for (unsigned i = 0, e = static_cast<unsigned>(Values.size()); - i != e; ++i) + for (size_t i = 0, e = Values.size(); i != e; ++i) O.getParser().addLiteralOption(Values[i].first, Values[i].second.first, Values[i].second.second); } @@ -629,8 +628,7 @@ public: else ArgVal = ArgName; - for (unsigned i = 0, e = static_cast<unsigned>(Values.size()); - i != e; ++i) + for (size_t i = 0, e = Values.size(); i != e; ++i) if (Values[i].Name == ArgVal) { V = Values[i].V.getValue(); return false; diff --git a/include/llvm/Support/Compiler.h b/include/llvm/Support/Compiler.h index b32939e..25f42a9 100644 --- a/include/llvm/Support/Compiler.h +++ b/include/llvm/Support/Compiler.h @@ -15,6 +15,8 @@ #ifndef LLVM_SUPPORT_COMPILER_H #define LLVM_SUPPORT_COMPILER_H +#include "llvm/Config/llvm-config.h" + #ifndef __has_feature # define __has_feature(x) 0 #endif @@ -42,6 +44,43 @@ #define LLVM_HAS_RVALUE_REFERENCE_THIS 0 #endif +/// \macro LLVM_HAS_CXX11_TYPETRAITS +/// \brief Does the compiler have the C++11 type traits. +/// +/// #include <type_traits> +/// +/// * enable_if +/// * {true,false}_type +/// * is_constructible +/// * etc... +#if defined(__GXX_EXPERIMENTAL_CXX0X__) \ + || (defined(_MSC_VER) && _MSC_VER >= 1700) +#define LLVM_HAS_CXX11_TYPETRAITS 1 +#else +#define LLVM_HAS_CXX11_TYPETRAITS 0 +#endif + +/// \macro LLVM_HAS_CXX11_STDLIB +/// \brief Does the compiler have the C++11 standard library. +/// +/// Implies LLVM_HAS_RVALUE_REFERENCES, LLVM_HAS_CXX11_TYPETRAITS +#if defined(__GXX_EXPERIMENTAL_CXX0X__) \ + || (defined(_MSC_VER) && _MSC_VER >= 1700) +#define LLVM_HAS_CXX11_STDLIB 1 +#else +#define LLVM_HAS_CXX11_STDLIB 0 +#endif + +/// \macro LLVM_HAS_VARIADIC_TEMPLATES +/// \brief Does this compiler support variadic templates. +/// +/// Implies LLVM_HAS_RVALUE_REFERENCES and the existence of std::forward. +#if __has_feature(cxx_variadic_templates) +# define LLVM_HAS_VARIADIC_TEMPLATES 1 +#else +# define LLVM_HAS_VARIADIC_TEMPLATES 0 +#endif + /// llvm_move - Expands to ::std::move if the compiler supports /// r-value references; otherwise, expands to the argument. #if LLVM_HAS_RVALUE_REFERENCES @@ -81,7 +120,8 @@ /// LLVM_FINAL - Expands to 'final' if the compiler supports it. /// Use to mark classes or virtual methods as final. -#if (__has_feature(cxx_override_control)) +#if __has_feature(cxx_override_control) \ + || (defined(_MSC_VER) && _MSC_VER >= 1700) #define LLVM_FINAL final #else #define LLVM_FINAL @@ -89,12 +129,19 @@ /// LLVM_OVERRIDE - Expands to 'override' if the compiler supports it. /// Use to mark virtual methods as overriding a base class method. -#if (__has_feature(cxx_override_control)) +#if __has_feature(cxx_override_control) \ + || (defined(_MSC_VER) && _MSC_VER >= 1700) #define LLVM_OVERRIDE override #else #define LLVM_OVERRIDE #endif +#if __has_feature(cxx_constexpr) || defined(__GXX_EXPERIMENTAL_CXX0X__) +# define LLVM_CONSTEXPR constexpr +#else +# define LLVM_CONSTEXPR +#endif + /// LLVM_LIBRARY_VISIBILITY - If a class marked with this attribute is linked /// into a shared library, then the class should be private to the library and /// not accessible from outside it. 
Can also be used to mark variables and @@ -249,4 +296,59 @@ # define LLVM_ASSUME_ALIGNED(p, a) (p) #endif +/// \macro LLVM_FUNCTION_NAME +/// \brief Expands to __func__ on compilers which support it. Otherwise, +/// expands to a compiler-dependent replacement. +#if defined(_MSC_VER) +# define LLVM_FUNCTION_NAME __FUNCTION__ +#else +# define LLVM_FUNCTION_NAME __func__ +#endif + +#if defined(HAVE_SANITIZER_MSAN_INTERFACE_H) +# include <sanitizer/msan_interface.h> +#else +# define __msan_allocated_memory(p, size) +# define __msan_unpoison(p, size) +#endif + +/// \macro LLVM_MEMORY_SANITIZER_BUILD +/// \brief Whether LLVM itself is built with MemorySanitizer instrumentation. +#if __has_feature(memory_sanitizer) +# define LLVM_MEMORY_SANITIZER_BUILD 1 +#else +# define LLVM_MEMORY_SANITIZER_BUILD 0 +#endif + +/// \macro LLVM_ADDRESS_SANITIZER_BUILD +/// \brief Whether LLVM itself is built with AddressSanitizer instrumentation. +#if __has_feature(address_sanitizer) || defined(__SANITIZE_ADDRESS__) +# define LLVM_ADDRESS_SANITIZER_BUILD 1 +#else +# define LLVM_ADDRESS_SANITIZER_BUILD 0 +#endif + +/// \macro LLVM_IS_UNALIGNED_ACCESS_FAST +/// \brief Is unaligned memory access fast on the host machine. +/// +/// Don't specialize on alignment for platforms where unaligned memory accesses +/// generates the same code as aligned memory accesses for common types. +#if defined(_M_AMD64) || defined(_M_IX86) || defined(__amd64) || \ + defined(__amd64__) || defined(__x86_64) || defined(__x86_64__) || \ + defined(_X86_) || defined(__i386) || defined(__i386__) +# define LLVM_IS_UNALIGNED_ACCESS_FAST 1 +#else +# define LLVM_IS_UNALIGNED_ACCESS_FAST 0 +#endif + +/// \macro LLVM_EXPLICIT +/// \brief Expands to explicit on compilers which support explicit conversion +/// operators. Otherwise expands to nothing. +#if (__has_feature(cxx_explicit_conversions) \ + || defined(__GXX_EXPERIMENTAL_CXX0X__)) +#define LLVM_EXPLICIT explicit +#else +#define LLVM_EXPLICIT +#endif + #endif diff --git a/include/llvm/Support/ConstantRange.h b/include/llvm/Support/ConstantRange.h index 90dd69f..0f29256 100644 --- a/include/llvm/Support/ConstantRange.h +++ b/include/llvm/Support/ConstantRange.h @@ -29,8 +29,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_SUPPORT_CONSTANT_RANGE_H -#define LLVM_SUPPORT_CONSTANT_RANGE_H +#ifndef LLVM_SUPPORT_CONSTANTRANGE_H +#define LLVM_SUPPORT_CONSTANTRANGE_H #include "llvm/ADT/APInt.h" #include "llvm/Support/DataTypes.h" diff --git a/include/llvm/Support/ConvertUTF.h b/include/llvm/Support/ConvertUTF.h new file mode 100644 index 0000000..1eae6d6 --- /dev/null +++ b/include/llvm/Support/ConvertUTF.h @@ -0,0 +1,228 @@ +/*===--- ConvertUTF.h - Universal Character Names conversions ---------------=== + * + * The LLVM Compiler Infrastructure + * + * This file is distributed under the University of Illinois Open Source + * License. See LICENSE.TXT for details. + * + *==------------------------------------------------------------------------==*/ +/* + * Copyright 2001-2004 Unicode, Inc. + * + * Disclaimer + * + * This source code is provided as is by Unicode, Inc. No claims are + * made as to fitness for any particular purpose. No warranties of any + * kind are expressed or implied. The recipient agrees to determine + * applicability of information provided. If this file has been + * purchased on magnetic or optical media from Unicode, Inc., the + * sole remedy for any claim will be exchange of defective media + * within 90 days of receipt. 
+ * + * Limitations on Rights to Redistribute This Code + * + * Unicode, Inc. hereby grants the right to freely use the information + * supplied in this file in the creation of products supporting the + * Unicode Standard, and to make copies of this file in any form + * for internal or external distribution as long as this notice + * remains attached. + */ + +/* --------------------------------------------------------------------- + + Conversions between UTF-32, UTF-16, and UTF-8. Header file. + + Several functions are included here, forming a complete set of + conversions between the three formats. UTF-7 is not included + here, but is handled in a separate source file. + + Each of these routines takes pointers to input buffers and output + buffers. The input buffers are const. + + Each routine converts the text between *sourceStart and sourceEnd, + putting the result into the buffer between *targetStart and + targetEnd. Note: the end pointers are *after* the last item: e.g. + *(sourceEnd - 1) is the last item. + + The return result indicates whether the conversion was successful, + and if not, whether the problem was in the source or target buffers. + (Only the first encountered problem is indicated.) + + After the conversion, *sourceStart and *targetStart are both + updated to point to the end of the last text successfully converted in + the respective buffers. + + Input parameters: + sourceStart - pointer to a pointer to the source buffer. + The contents of this are modified on return so that + it points at the next thing to be converted. + targetStart - similarly, pointer to pointer to the target buffer. + sourceEnd, targetEnd - respectively pointers to the ends of the + two buffers, for overflow checking only. + + These conversion functions take a ConversionFlags argument. When this + flag is set to strict, both irregular sequences and isolated surrogates + will cause an error. When the flag is set to lenient, both irregular + sequences and isolated surrogates are converted. + + Whether the flag is strict or lenient, all illegal sequences will cause + an error return. This includes sequences such as: <F4 90 80 80>, <C0 80>, + or <A0> in UTF-8, and values above 0x10FFFF in UTF-32. Conformant code + must check for illegal sequences. + + When the flag is set to lenient, characters over 0x10FFFF are converted + to the replacement character; otherwise (when the flag is set to strict) + they constitute an error. + + Output parameters: + The value "sourceIllegal" is returned from some routines if the input + sequence is malformed. When "sourceIllegal" is returned, the source + value will point to the illegal value that caused the problem. E.g., + in UTF-8 when a sequence is malformed, it points to the start of the + malformed sequence. + + Author: Mark E. Davis, 1994. + Rev History: Rick McGowan, fixes & updates May 2001. + Fixes & updates, Sept 2001. + +------------------------------------------------------------------------ */ + +#ifndef LLVM_SUPPORT_CONVERTUTF_H +#define LLVM_SUPPORT_CONVERTUTF_H + +/* --------------------------------------------------------------------- + The following 4 definitions are compiler-specific. + The C standard does not guarantee that wchar_t has at least + 16 bits, so wchar_t is no less portable than unsigned short! + All should be unsigned values to avoid sign extension during + bit mask & shift operations.
+------------------------------------------------------------------------ */ + +typedef unsigned int UTF32; /* at least 32 bits */ +typedef unsigned short UTF16; /* at least 16 bits */ +typedef unsigned char UTF8; /* typically 8 bits */ +typedef unsigned char Boolean; /* 0 or 1 */ + +/* Some fundamental constants */ +#define UNI_REPLACEMENT_CHAR (UTF32)0x0000FFFD +#define UNI_MAX_BMP (UTF32)0x0000FFFF +#define UNI_MAX_UTF16 (UTF32)0x0010FFFF +#define UNI_MAX_UTF32 (UTF32)0x7FFFFFFF +#define UNI_MAX_LEGAL_UTF32 (UTF32)0x0010FFFF + +#define UNI_MAX_UTF8_BYTES_PER_CODE_POINT 4 + +typedef enum { + conversionOK, /* conversion successful */ + sourceExhausted, /* partial character in source, but hit end */ + targetExhausted, /* insuff. room in target for conversion */ + sourceIllegal /* source sequence is illegal/malformed */ +} ConversionResult; + +typedef enum { + strictConversion = 0, + lenientConversion +} ConversionFlags; + +/* This is for C++ and does no harm in C */ +#ifdef __cplusplus +extern "C" { +#endif + +ConversionResult ConvertUTF8toUTF16 ( + const UTF8** sourceStart, const UTF8* sourceEnd, + UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags); + +ConversionResult ConvertUTF8toUTF32 ( + const UTF8** sourceStart, const UTF8* sourceEnd, + UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags); + +ConversionResult ConvertUTF16toUTF8 ( + const UTF16** sourceStart, const UTF16* sourceEnd, + UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags); + +ConversionResult ConvertUTF32toUTF8 ( + const UTF32** sourceStart, const UTF32* sourceEnd, + UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags); + +ConversionResult ConvertUTF16toUTF32 ( + const UTF16** sourceStart, const UTF16* sourceEnd, + UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags); + +ConversionResult ConvertUTF32toUTF16 ( + const UTF32** sourceStart, const UTF32* sourceEnd, + UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags); + +Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd); + +Boolean isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd); + +unsigned getNumBytesForUTF8(UTF8 firstByte); + +#ifdef __cplusplus +} + +/*************************************************************************/ +/* Below are LLVM-specific wrappers of the functions above. */ + +#include "llvm/ADT/StringRef.h" + +namespace llvm { + +/** + * Convert an UTF8 StringRef to UTF8, UTF16, or UTF32 depending on + * WideCharWidth. The converted data is written to ResultPtr, which needs to + * point to at least WideCharWidth * (Source.Size() + 1) bytes. On success, + * ResultPtr will point one after the end of the copied string. On failure, + * ResultPtr will not be changed, and ErrorPtr will be set to the location of + * the first character which could not be converted. + * \return true on success. + */ +bool ConvertUTF8toWide(unsigned WideCharWidth, llvm::StringRef Source, + char *&ResultPtr, const UTF8 *&ErrorPtr); + +/** + * Convert an Unicode code point to UTF8 sequence. + * + * \param Source a Unicode code point. + * \param [in,out] ResultPtr pointer to the output buffer, needs to be at least + * \c UNI_MAX_UTF8_BYTES_PER_CODE_POINT bytes. On success \c ResultPtr is + * updated one past end of the converted sequence. + * + * \returns true on success. + */ +bool ConvertCodePointToUTF8(unsigned Source, char *&ResultPtr); + +/** + * Convert the first UTF8 sequence in the given source buffer to a UTF32 + * code point. 
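+ *
+ * Illustrative use, assuming Buf and Size describe a UTF-8 buffer:
+ * \code
+ *   const UTF8 *Cur = Buf, *End = Buf + Size;
+ *   UTF32 CP;
+ *   if (convertUTF8Sequence(&Cur, End, &CP, strictConversion) == conversionOK)
+ *     process(CP); // Cur now points just past the consumed bytes
+ * \endcode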
+ * + * \param [in,out] source A pointer to the source buffer. If the conversion + * succeeds, this pointer will be updated to point to the byte just past the + * end of the converted sequence. + * \param sourceEnd A pointer just past the end of the source buffer. + * \param [out] target The converted code + * \param flags Whether the conversion is strict or lenient. + * + * \returns conversionOK on success + * + * \sa ConvertUTF8toUTF32 + */ +static inline ConversionResult convertUTF8Sequence(const UTF8 **source, + const UTF8 *sourceEnd, + UTF32 *target, + ConversionFlags flags) { + if (*source == sourceEnd) + return sourceExhausted; + unsigned size = getNumBytesForUTF8(**source); + if ((ptrdiff_t)size > sourceEnd - *source) + return sourceExhausted; + return ConvertUTF8toUTF32(source, *source + size, &target, target + 1, flags); +} +} /* end namespace llvm */ + +#endif + +/* --------------------------------------------------------------------- */ + +#endif diff --git a/include/llvm/Support/DOTGraphTraits.h b/include/llvm/Support/DOTGraphTraits.h index 483f267..95e37c0 100644 --- a/include/llvm/Support/DOTGraphTraits.h +++ b/include/llvm/Support/DOTGraphTraits.h @@ -79,6 +79,11 @@ public: return false; } + template<typename GraphType> + static std::string getNodeDescription(const void *, const GraphType &) { + return ""; + } + /// If you want to specify custom node attributes, this is the place to do so /// template<typename GraphType> diff --git a/include/llvm/Support/DataExtractor.h b/include/llvm/Support/DataExtractor.h index a3ae782..e8a19cd 100644 --- a/include/llvm/Support/DataExtractor.h +++ b/include/llvm/Support/DataExtractor.h @@ -18,22 +18,24 @@ namespace llvm { class DataExtractor { StringRef Data; uint8_t IsLittleEndian; - uint8_t PointerSize; + uint8_t AddressSize; public: /// Construct with a buffer that is owned by the caller. /// /// This constructor allows us to use data that is owned by the /// caller. The data must stay around as long as this object is /// valid. - DataExtractor(StringRef Data, bool IsLittleEndian, uint8_t PointerSize) - : Data(Data), IsLittleEndian(IsLittleEndian), PointerSize(PointerSize) {} + DataExtractor(StringRef Data, bool IsLittleEndian, uint8_t AddressSize) + : Data(Data), IsLittleEndian(IsLittleEndian), AddressSize(AddressSize) {} - /// getData - Get the data pointed to by this extractor. + /// \brief Get the data pointed to by this extractor. StringRef getData() const { return Data; } - /// isLittleEndian - Get the endianess for this extractor. + /// \brief Get the endianess for this extractor. bool isLittleEndian() const { return IsLittleEndian; } - /// getAddressSize - Get the address size for this extractor. - uint8_t getAddressSize() const { return PointerSize; } + /// \brief Get the address size for this extractor. + uint8_t getAddressSize() const { return AddressSize; } + /// \brief Set the address size for this extractor. + void setAddressSize(uint8_t Size) { AddressSize = Size; } /// Extract a C string from \a *offset_ptr. /// @@ -113,7 +115,7 @@ public: /// /// Extract a single pointer from the data and update the offset /// pointed to by \a offset_ptr. The size of the extracted pointer - /// comes from the \a m_addr_size member variable and should be + /// is \a getAddressSize(), so the address size has to be /// set correctly prior to extracting any pointer values. /// /// @param[in,out] offset_ptr @@ -126,7 +128,7 @@ public: /// @return /// The extracted pointer value as a 64 integer. 
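///
/// A usage sketch (Data is an assumed StringRef over raw bytes):
/// \code
///   DataExtractor DE(Data, /*IsLittleEndian=*/true, /*AddressSize=*/4);
///   uint32_t Off = 0;
///   uint64_t A32 = DE.getAddress(&Off); // consumes 4 bytes
///   DE.setAddressSize(8);
///   uint64_t A64 = DE.getAddress(&Off); // consumes 8 more bytes
/// \endcode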
uint64_t getAddress(uint32_t *offset_ptr) const { - return getUnsigned(offset_ptr, PointerSize); + return getUnsigned(offset_ptr, AddressSize); } /// Extract a uint8_t value from \a *offset_ptr. diff --git a/include/llvm/Support/DataStream.h b/include/llvm/Support/DataStream.h index fedb0c9..8bc4133 100644 --- a/include/llvm/Support/DataStream.h +++ b/include/llvm/Support/DataStream.h @@ -14,8 +14,8 @@ //===----------------------------------------------------------------------===// -#ifndef LLVM_SUPPORT_DATASTREAM_H_ -#define LLVM_SUPPORT_DATASTREAM_H_ +#ifndef LLVM_SUPPORT_DATASTREAM_H +#define LLVM_SUPPORT_DATASTREAM_H #include <string> diff --git a/include/llvm/Support/DebugLoc.h b/include/llvm/Support/DebugLoc.h index 0498075..3596be8 100644 --- a/include/llvm/Support/DebugLoc.h +++ b/include/llvm/Support/DebugLoc.h @@ -109,4 +109,4 @@ namespace llvm { }; } // end namespace llvm -#endif /* LLVM_DEBUGLOC_H */ +#endif /* LLVM_SUPPORT_DEBUGLOC_H */ diff --git a/include/llvm/Support/Dwarf.h b/include/llvm/Support/Dwarf.h index c703da5..b52914f 100644 --- a/include/llvm/Support/Dwarf.h +++ b/include/llvm/Support/Dwarf.h @@ -16,6 +16,9 @@ #ifndef LLVM_SUPPORT_DWARF_H #define LLVM_SUPPORT_DWARF_H +#include "llvm/Support/DataTypes.h" + + namespace llvm { //===----------------------------------------------------------------------===// @@ -53,10 +56,16 @@ enum llvm_dwarf_constants { DW_TAG_user_base = 0x1000, // Recommended base for user tags. - DW_CIE_VERSION = 1, // Common frame information version. - DW_CIE_ID = 0xffffffff // Common frame information mark. + DW_CIE_VERSION = 1 // Common frame information version. }; + +// Special ID values that distinguish a CIE from a FDE in DWARF CFI. +// Not inside an enum because a 64-bit value is needed. +const uint32_t DW_CIE_ID = UINT32_MAX; +const uint64_t DW64_CIE_ID = UINT64_MAX; + + enum dwarf_constants { DWARF_VERSION = 2, diff --git a/include/llvm/Support/DynamicLibrary.h b/include/llvm/Support/DynamicLibrary.h index 0f59cbf..1e2d16c 100644 --- a/include/llvm/Support/DynamicLibrary.h +++ b/include/llvm/Support/DynamicLibrary.h @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_SYSTEM_DYNAMIC_LIBRARY_H -#define LLVM_SYSTEM_DYNAMIC_LIBRARY_H +#ifndef LLVM_SYSTEM_DYNAMICLIBRARY_H +#define LLVM_SYSTEM_DYNAMICLIBRARY_H #include <string> diff --git a/include/llvm/Support/ELF.h b/include/llvm/Support/ELF.h index d6e54be..8e6b91e 100644 --- a/include/llvm/Support/ELF.h +++ b/include/llvm/Support/ELF.h @@ -271,6 +271,7 @@ enum { EM_SLE9X = 179, // Infineon Technologies SLE9X core EM_L10M = 180, // Intel L10M EM_K10M = 181, // Intel K10M + EM_AARCH64 = 183, // ARM AArch64 EM_AVR32 = 185, // Atmel Corporation 32-bit microprocessor family EM_STM8 = 186, // STMicroeletronics STM8 8-bit microcontroller EM_TILE64 = 187, // Tilera TILE64 multicore architecture family @@ -366,7 +367,8 @@ enum { R_X86_64_SIZE64 = 33, R_X86_64_GOTPC32_TLSDESC = 34, R_X86_64_TLSDESC_CALL = 35, - R_X86_64_TLSDESC = 36 + R_X86_64_TLSDESC = 36, + R_X86_64_IRELATIVE = 37 }; // i386 relocations. 
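The machine and relocation enums here are keyed by the ELF header's e_machine field, and the FileFormat strings that RelocVisitor dispatches on encode the same information. A sketch of that correspondence (illustrative only; the authoritative strings come from the ObjectFile implementations, and MIPS can equally be 64-bit):

  const char *formatForMachine(unsigned EMachine) {
    switch (EMachine) {
    case llvm::ELF::EM_X86_64:  return "ELF64-x86-64";
    case llvm::ELF::EM_AARCH64: return "ELF64-aarch64";
    case llvm::ELF::EM_PPC64:   return "ELF64-ppc64";
    case llvm::ELF::EM_MIPS:    return "ELF32-mips";
    default:                    return "unknown";
    }
  }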
@@ -469,6 +471,7 @@ enum { R_PPC64_ADDR16_HI = 5, R_PPC64_ADDR14 = 7, R_PPC64_REL24 = 10, + R_PPC64_REL32 = 26, R_PPC64_ADDR64 = 38, R_PPC64_ADDR16_HIGHER = 39, R_PPC64_ADDR16_HIGHEST = 41, @@ -480,6 +483,7 @@ enum { R_PPC64_TOC16_DS = 63, R_PPC64_TOC16_LO_DS = 64, R_PPC64_TLS = 67, + R_PPC64_TPREL16_LO = 70, R_PPC64_DTPREL16_LO = 75, R_PPC64_DTPREL16_HA = 77, R_PPC64_GOT_TLSGD16_LO = 80, @@ -492,8 +496,106 @@ enum { R_PPC64_TLSLD = 108 }; +// ELF Relocation types for AArch64 + +enum { + R_AARCH64_NONE = 0x100, + + R_AARCH64_ABS64 = 0x101, + R_AARCH64_ABS32 = 0x102, + R_AARCH64_ABS16 = 0x103, + R_AARCH64_PREL64 = 0x104, + R_AARCH64_PREL32 = 0x105, + R_AARCH64_PREL16 = 0x106, + + R_AARCH64_MOVW_UABS_G0 = 0x107, + R_AARCH64_MOVW_UABS_G0_NC = 0x108, + R_AARCH64_MOVW_UABS_G1 = 0x109, + R_AARCH64_MOVW_UABS_G1_NC = 0x10a, + R_AARCH64_MOVW_UABS_G2 = 0x10b, + R_AARCH64_MOVW_UABS_G2_NC = 0x10c, + R_AARCH64_MOVW_UABS_G3 = 0x10d, + R_AARCH64_MOVW_SABS_G0 = 0x10e, + R_AARCH64_MOVW_SABS_G1 = 0x10f, + R_AARCH64_MOVW_SABS_G2 = 0x110, + + R_AARCH64_LD_PREL_LO19 = 0x111, + R_AARCH64_ADR_PREL_LO21 = 0x112, + R_AARCH64_ADR_PREL_PG_HI21 = 0x113, + R_AARCH64_ADD_ABS_LO12_NC = 0x115, + R_AARCH64_LDST8_ABS_LO12_NC = 0x116, + + R_AARCH64_TSTBR14 = 0x117, + R_AARCH64_CONDBR19 = 0x118, + R_AARCH64_JUMP26 = 0x11a, + R_AARCH64_CALL26 = 0x11b, + + R_AARCH64_LDST16_ABS_LO12_NC = 0x11c, + R_AARCH64_LDST32_ABS_LO12_NC = 0x11d, + R_AARCH64_LDST64_ABS_LO12_NC = 0x11e, + + R_AARCH64_LDST128_ABS_LO12_NC = 0x12b, + + R_AARCH64_ADR_GOT_PAGE = 0x137, + R_AARCH64_LD64_GOT_LO12_NC = 0x138, + + R_AARCH64_TLSLD_MOVW_DTPREL_G2 = 0x20b, + R_AARCH64_TLSLD_MOVW_DTPREL_G1 = 0x20c, + R_AARCH64_TLSLD_MOVW_DTPREL_G1_NC = 0x20d, + R_AARCH64_TLSLD_MOVW_DTPREL_G0 = 0x20e, + R_AARCH64_TLSLD_MOVW_DTPREL_G0_NC = 0x20f, + R_AARCH64_TLSLD_ADD_DTPREL_HI12 = 0x210, + R_AARCH64_TLSLD_ADD_DTPREL_LO12 = 0x211, + R_AARCH64_TLSLD_ADD_DTPREL_LO12_NC = 0x212, + R_AARCH64_TLSLD_LDST8_DTPREL_LO12 = 0x213, + R_AARCH64_TLSLD_LDST8_DTPREL_LO12_NC = 0x214, + R_AARCH64_TLSLD_LDST16_DTPREL_LO12 = 0x215, + R_AARCH64_TLSLD_LDST16_DTPREL_LO12_NC = 0x216, + R_AARCH64_TLSLD_LDST32_DTPREL_LO12 = 0x217, + R_AARCH64_TLSLD_LDST32_DTPREL_LO12_NC = 0x218, + R_AARCH64_TLSLD_LDST64_DTPREL_LO12 = 0x219, + R_AARCH64_TLSLD_LDST64_DTPREL_LO12_NC = 0x21a, + + R_AARCH64_TLSIE_MOVW_GOTTPREL_G1 = 0x21b, + R_AARCH64_TLSIE_MOVW_GOTTPREL_G0_NC = 0x21c, + R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21 = 0x21d, + R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC = 0x21e, + R_AARCH64_TLSIE_LD_GOTTPREL_PREL19 = 0x21f, + + R_AARCH64_TLSLE_MOVW_TPREL_G2 = 0x220, + R_AARCH64_TLSLE_MOVW_TPREL_G1 = 0x221, + R_AARCH64_TLSLE_MOVW_TPREL_G1_NC = 0x222, + R_AARCH64_TLSLE_MOVW_TPREL_G0 = 0x223, + R_AARCH64_TLSLE_MOVW_TPREL_G0_NC = 0x224, + R_AARCH64_TLSLE_ADD_TPREL_HI12 = 0x225, + R_AARCH64_TLSLE_ADD_TPREL_LO12 = 0x226, + R_AARCH64_TLSLE_ADD_TPREL_LO12_NC = 0x227, + R_AARCH64_TLSLE_LDST8_TPREL_LO12 = 0x228, + R_AARCH64_TLSLE_LDST8_TPREL_LO12_NC = 0x229, + R_AARCH64_TLSLE_LDST16_TPREL_LO12 = 0x22a, + R_AARCH64_TLSLE_LDST16_TPREL_LO12_NC = 0x22b, + R_AARCH64_TLSLE_LDST32_TPREL_LO12 = 0x22c, + R_AARCH64_TLSLE_LDST32_TPREL_LO12_NC = 0x22d, + R_AARCH64_TLSLE_LDST64_TPREL_LO12 = 0x22e, + R_AARCH64_TLSLE_LDST64_TPREL_LO12_NC = 0x22f, + + R_AARCH64_TLSDESC_ADR_PAGE = 0x232, + R_AARCH64_TLSDESC_LD64_LO12_NC = 0x233, + R_AARCH64_TLSDESC_ADD_LO12_NC = 0x234, + + R_AARCH64_TLSDESC_CALL = 0x239 +}; + // ARM Specific e_flags -enum { EF_ARM_EABIMASK = 0xFF000000U }; +enum { + EF_ARM_EABI_UNKNOWN = 0x00000000U, + EF_ARM_EABI_VER1 = 
0x01000000U, + EF_ARM_EABI_VER2 = 0x02000000U, + EF_ARM_EABI_VER3 = 0x03000000U, + EF_ARM_EABI_VER4 = 0x04000000U, + EF_ARM_EABI_VER5 = 0x05000000U, + EF_ARM_EABIMASK = 0xFF000000U +}; // ELF Relocation types for ARM // Meets 2.08 ABI Specs. @@ -637,6 +739,13 @@ enum { EF_MIPS_NOREORDER = 0x00000001, // Don't reorder instructions EF_MIPS_PIC = 0x00000002, // Position independent code EF_MIPS_CPIC = 0x00000004, // Call object with Position independent code + EF_MIPS_ABI_O32 = 0x00001000, // This file follows the first MIPS 32 bit ABI + + //ARCH_ASE + EF_MIPS_MICROMIPS = 0x02000000, // microMIPS + EF_MIPS_ARCH_ASE_M16 = + 0x04000000, // Has Mips-16 ISA extensions + //ARCH EF_MIPS_ARCH_1 = 0x00000000, // MIPS1 instruction set EF_MIPS_ARCH_2 = 0x10000000, // MIPS2 instruction set EF_MIPS_ARCH_3 = 0x20000000, // MIPS3 instruction set @@ -707,6 +816,11 @@ enum { R_MIPS_NUM = 218 }; +// Special values for the st_other field in the symbol table entry for MIPS. +enum { + STO_MIPS_MICROMIPS = 0x80 // MIPS Specific ISA for MicroMips +}; + // Hexagon Specific e_flags // Release 5 ABI enum { @@ -726,14 +840,14 @@ enum { }; // Hexagon specific Section indexes for common small data -// Release 5 ABI +// Release 5 ABI enum { SHN_HEXAGON_SCOMMON = 0xff00, // Other access sizes SHN_HEXAGON_SCOMMON_1 = 0xff01, // Byte-sized access SHN_HEXAGON_SCOMMON_2 = 0xff02, // Half-word-sized access SHN_HEXAGON_SCOMMON_4 = 0xff03, // Word-sized access SHN_HEXAGON_SCOMMON_8 = 0xff04 // Double-word-size access -}; +}; // ELF Relocation types for Hexagon // Release 5 ABI @@ -894,8 +1008,7 @@ enum { SHT_GNU_verneed = 0x6ffffffe, // GNU version references. SHT_GNU_versym = 0x6fffffff, // GNU symbol versions table. SHT_HIOS = 0x6fffffff, // Highest operating system-specific type. - SHT_LOPROC = 0x70000000, // Lowest processor architecture-specific type. - + SHT_LOPROC = 0x70000000, // Lowest processor arch-specific type. // Fixme: All this is duplicated in MCSectionELF. Why?? // Exception Index table SHT_ARM_EXIDX = 0x70000001U, @@ -905,10 +1018,14 @@ enum { SHT_ARM_ATTRIBUTES = 0x70000003U, SHT_ARM_DEBUGOVERLAY = 0x70000004U, SHT_ARM_OVERLAYSECTION = 0x70000005U, - + SHT_HEX_ORDERED = 0x70000000, // Link editor is to sort the entries in + // this section based on their sizes SHT_X86_64_UNWIND = 0x70000001, // Unwind information - SHT_HIPROC = 0x7fffffff, // Highest processor architecture-specific type. + SHT_MIPS_REGINFO = 0x70000006, // Register usage information + SHT_MIPS_OPTIONS = 0x7000000d, // General options + + SHT_HIPROC = 0x7fffffff, // Highest processor arch-specific type. SHT_LOUSER = 0x80000000, // Lowest type reserved for applications. SHT_HIUSER = 0xffffffff // Highest type reserved for applications. }; @@ -970,7 +1087,14 @@ enum { // sets this flag besides being able to refer to data in a section that does // not set it; likewise, a small code model object can refer only to code in a // section that does not set this flag. - SHF_X86_64_LARGE = 0x10000000 + SHF_X86_64_LARGE = 0x10000000, + + // All sections with the GPREL flag are grouped into a global data area + // for faster accesses + SHF_HEX_GPREL = 0x10000000, + + // Do not strip this section. FIXME: We need target specific SHF_ enums. 
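+ // (Processor-specific flags are interpreted relative to e_machine, which is
+ // how SHF_X86_64_LARGE and SHF_HEX_GPREL can share the value 0x10000000;
+ // SHF_MIPS_NOSTRIP's 0x8000000 additionally falls outside the SHF_MASKPROC
+ // range, hence the FIXME above.)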
+ SHF_MIPS_NOSTRIP = 0x8000000 }; // Section Group Flags @@ -1005,7 +1129,7 @@ struct Elf64_Sym { Elf64_Word st_name; // Symbol name (index into string table) unsigned char st_info; // Symbol's type and binding attributes unsigned char st_other; // Must be zero; reserved - Elf64_Half st_shndx; // Which section (header table index) it's defined in + Elf64_Half st_shndx; // Which section (header tbl index) it's defined in Elf64_Addr st_value; // Value or address associated with the symbol Elf64_Xword st_size; // Size of the symbol @@ -1060,6 +1184,11 @@ enum { STV_PROTECTED = 3 // Visible in other components but not preemptable }; +// Symbol number. +enum { + STN_UNDEF = 0 +}; + // Relocation entry, without explicit addend. struct Elf32_Rel { Elf32_Addr r_offset; // Location (file byte offset, or program virtual addr) @@ -1100,14 +1229,14 @@ struct Elf64_Rel { // These accessors and mutators correspond to the ELF64_R_SYM, ELF64_R_TYPE, // and ELF64_R_INFO macros defined in the ELF specification: - Elf64_Xword getSymbol() const { return (r_info >> 32); } - unsigned char getType() const { - return (unsigned char) (r_info & 0xffffffffL); + Elf64_Word getSymbol() const { return (r_info >> 32); } + Elf64_Word getType() const { + return (Elf64_Word) (r_info & 0xffffffffL); } - void setSymbol(Elf32_Word s) { setSymbolAndType(s, getType()); } - void setType(unsigned char t) { setSymbolAndType(getSymbol(), t); } - void setSymbolAndType(Elf64_Xword s, unsigned char t) { - r_info = (s << 32) + (t&0xffffffffL); + void setSymbol(Elf64_Word s) { setSymbolAndType(s, getType()); } + void setType(Elf64_Word t) { setSymbolAndType(getSymbol(), t); } + void setSymbolAndType(Elf64_Word s, Elf64_Word t) { + r_info = ((Elf64_Xword)s << 32) + (t&0xffffffffL); } }; @@ -1119,14 +1248,14 @@ struct Elf64_Rela { // These accessors and mutators correspond to the ELF64_R_SYM, ELF64_R_TYPE, // and ELF64_R_INFO macros defined in the ELF specification: - Elf64_Xword getSymbol() const { return (r_info >> 32); } - unsigned char getType() const { - return (unsigned char) (r_info & 0xffffffffL); + Elf64_Word getSymbol() const { return (r_info >> 32); } + Elf64_Word getType() const { + return (Elf64_Word) (r_info & 0xffffffffL); } - void setSymbol(Elf64_Xword s) { setSymbolAndType(s, getType()); } - void setType(unsigned char t) { setSymbolAndType(getSymbol(), t); } - void setSymbolAndType(Elf64_Xword s, unsigned char t) { - r_info = (s << 32) + (t&0xffffffffL); + void setSymbol(Elf64_Word s) { setSymbolAndType(s, getType()); } + void setType(Elf64_Word t) { setSymbolAndType(getSymbol(), t); } + void setSymbolAndType(Elf64_Word s, Elf64_Word t) { + r_info = ((Elf64_Xword)s << 32) + (t&0xffffffffL); } }; @@ -1148,7 +1277,7 @@ struct Elf64_Phdr { Elf64_Word p_flags; // Segment flags Elf64_Off p_offset; // File offset where segment is located, in bytes Elf64_Addr p_vaddr; // Virtual address of beginning of segment - Elf64_Addr p_paddr; // Physical address of beginning of segment (OS-specific) + Elf64_Addr p_paddr; // Physical addr of beginning of segment (OS-specific) Elf64_Xword p_filesz; // Num. of bytes in file image of segment (may be zero) Elf64_Xword p_memsz; // Num. of bytes in mem image of segment (may be zero) Elf64_Xword p_align; // Segment alignment constraint @@ -1179,7 +1308,7 @@ enum { PT_GNU_RELRO = 0x6474e552, // Read-only after relocation. // ARM program header types. 
- PT_ARM_ARCHEXT = 0x70000000, // Platform architecture compatibility information + PT_ARM_ARCHEXT = 0x70000000, // Platform architecture compatibility info // These all contain stack unwind tables. PT_ARM_EXIDX = 0x70000001, PT_ARM_UNWIND = 0x70000001 diff --git a/include/llvm/Support/Errno.h b/include/llvm/Support/Errno.h index 150bdb7..8e145c7 100644 --- a/include/llvm/Support/Errno.h +++ b/include/llvm/Support/Errno.h @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_SYSTEM_ERRNO_H -#define LLVM_SYSTEM_ERRNO_H +#ifndef LLVM_SUPPORT_ERRNO_H +#define LLVM_SUPPORT_ERRNO_H #include <string> diff --git a/include/llvm/Support/ErrorOr.h b/include/llvm/Support/ErrorOr.h new file mode 100644 index 0000000..ceec33d --- /dev/null +++ b/include/llvm/Support/ErrorOr.h @@ -0,0 +1,488 @@ +//===- llvm/Support/ErrorOr.h - Error Smart Pointer -----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// +/// Provides ErrorOr<T> smart pointer. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_SUPPORT_ERROR_OR_H +#define LLVM_SUPPORT_ERROR_OR_H + +#include "llvm/ADT/PointerIntPair.h" +#include "llvm/Support/AlignOf.h" +#include "llvm/Support/system_error.h" +#include "llvm/Support/type_traits.h" + +#include <cassert> +#if LLVM_HAS_CXX11_TYPETRAITS +#include <type_traits> +#endif + +namespace llvm { +struct ErrorHolderBase { + error_code Error; + uint16_t RefCount; + bool HasUserData; + + ErrorHolderBase() : RefCount(1) {} + + void aquire() { + ++RefCount; + } + + void release() { + if (--RefCount == 0) + delete this; + } + +protected: + virtual ~ErrorHolderBase() {} +}; + +template<class T> +struct ErrorHolder : ErrorHolderBase { +#if LLVM_HAS_RVALUE_REFERENCES + ErrorHolder(T &&UD) : UserData(llvm_move(UD)) {} +#else + ErrorHolder(T &UD) : UserData(UD) {} +#endif + T UserData; +}; + +template<class Tp> struct ErrorOrUserDataTraits : llvm::false_type {}; + +#if LLVM_HAS_CXX11_TYPETRAITS && LLVM_HAS_RVALUE_REFERENCES +template<class T, class V> +typename std::enable_if< std::is_constructible<T, V>::value + , typename std::remove_reference<V>::type>::type && + moveIfMoveConstructible(V &Val) { + return std::move(Val); +} + +template<class T, class V> +typename std::enable_if< !std::is_constructible<T, V>::value + , typename std::remove_reference<V>::type>::type & +moveIfMoveConstructible(V &Val) { + return Val; +} +#else +template<class T, class V> +V &moveIfMoveConstructible(V &Val) { + return Val; +} +#endif + +/// \brief Stores a reference that can be changed. +template <typename T> +class ReferenceStorage { + T *Storage; + +public: + ReferenceStorage(T &Ref) : Storage(&Ref) {} + + operator T &() const { return *Storage; } + T &get() const { return *Storage; } +}; + +/// \brief Represents either an error or a value T. +/// +/// ErrorOr<T> is a pointer-like class that represents the result of an +/// operation. The result is either an error, or a value of type T. This is +/// designed to emulate the usage of returning a pointer where nullptr indicates +/// failure. However, instead of just knowing that the operation failed, we also +/// have an error_code and optional user data that describes why it failed. +/// +/// It is used like the following.
+/// \code +/// ErrorOr<Buffer> getBuffer(); +/// void handleError(error_code ec); +/// +/// auto buffer = getBuffer(); +/// if (!buffer) +/// handleError(buffer); +/// buffer->write("adena"); +/// \endcode +/// +/// ErrorOr<T> also supports user-defined data for specific error_codes. To use +/// this feature you must first add a template specialization of +/// ErrorOrUserDataTraits derived from std::true_type for your type in the llvm +/// namespace. This specialization must have a static error_code error() +/// function that returns the error_code this data is used with. +/// +/// getError<UserData>() may be called to get either the stored user data, or +/// a default constructed UserData if none was stored. +/// +/// Example: +/// \code +/// struct InvalidArgError { +/// InvalidArgError() {} +/// InvalidArgError(std::string S) : ArgName(S) {} +/// std::string ArgName; +/// }; +/// +/// namespace llvm { +/// template<> +/// struct ErrorOrUserDataTraits<InvalidArgError> : std::true_type { +/// static error_code error() { +/// return make_error_code(errc::invalid_argument); +/// } +/// }; +/// } // end namespace llvm +/// +/// using namespace llvm; +/// +/// ErrorOr<int> foo() { +/// return InvalidArgError("adena"); +/// } +/// +/// int main() { +/// auto a = foo(); +/// if (!a && error_code(a) == errc::invalid_argument) +/// llvm::errs() << a.getError<InvalidArgError>().ArgName << "\n"; +/// } +/// \endcode +/// +/// An implicit conversion to bool provides a way to check if there was an +/// error. The unary * and -> operators provide pointer-like access to the +/// value. Accessing the value when there is an error has undefined behavior. +/// +/// When T is a reference type, the behavior is slightly different. The reference +/// is held in a ReferenceStorage<remove_reference<T>::type>, and +/// there is special handling to make operator -> work as if T were not a +/// reference. +/// +/// T cannot be an rvalue reference.
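+///
+/// Internally the value and the error share storage: the class holds a union
+/// of an AlignedCharArrayUnion<storage_type> and an ErrorHolderBase pointer,
+/// discriminated by the HasError bit, while IsValid records whether this
+/// ErrorOr was default constructed (see the members at the end of the class).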
+template<class T> +class ErrorOr { + template <class OtherT> friend class ErrorOr; + static const bool isRef = is_reference<T>::value; + typedef ReferenceStorage<typename remove_reference<T>::type> wrap; + +public: + typedef typename + conditional< isRef + , wrap + , T + >::type storage_type; + +private: + typedef typename remove_reference<T>::type &reference; + typedef typename remove_reference<T>::type *pointer; + +public: + ErrorOr() : IsValid(false) {} + + ErrorOr(llvm::error_code EC) : HasError(true), IsValid(true) { + Error = new ErrorHolderBase; + Error->Error = EC; + Error->HasUserData = false; + } + + template<class UserDataT> + ErrorOr(UserDataT UD, typename + enable_if_c<ErrorOrUserDataTraits<UserDataT>::value>::type* = 0) + : HasError(true), IsValid(true) { + Error = new ErrorHolder<UserDataT>(llvm_move(UD)); + Error->Error = ErrorOrUserDataTraits<UserDataT>::error(); + Error->HasUserData = true; + } + + ErrorOr(T Val) : HasError(false), IsValid(true) { + new (get()) storage_type(moveIfMoveConstructible<storage_type>(Val)); + } + + ErrorOr(const ErrorOr &Other) : IsValid(false) { + copyConstruct(Other); + } + + template <class OtherT> + ErrorOr(const ErrorOr<OtherT> &Other) : IsValid(false) { + copyConstruct(Other); + } + + ErrorOr &operator =(const ErrorOr &Other) { + copyAssign(Other); + return *this; + } + + template <class OtherT> + ErrorOr &operator =(const ErrorOr<OtherT> &Other) { + copyAssign(Other); + return *this; + } + +#if LLVM_HAS_RVALUE_REFERENCES + ErrorOr(ErrorOr &&Other) : IsValid(false) { + moveConstruct(std::move(Other)); + } + + template <class OtherT> + ErrorOr(ErrorOr<OtherT> &&Other) : IsValid(false) { + moveConstruct(std::move(Other)); + } + + ErrorOr &operator =(ErrorOr &&Other) { + moveAssign(std::move(Other)); + return *this; + } + + template <class OtherT> + ErrorOr &operator =(ErrorOr<OtherT> &&Other) { + moveAssign(std::move(Other)); + return *this; + } +#endif + + ~ErrorOr() { + if (!IsValid) + return; + if (HasError) + Error->release(); + else + get()->~storage_type(); + } + + template<class ET> + ET getError() const { + assert(IsValid && "Cannot get the error of a default constructed ErrorOr!"); + assert(HasError && "Cannot get an error if none exists!"); + assert(ErrorOrUserDataTraits<ET>::error() == Error->Error && + "Incorrect user error data type for error!"); + if (!Error->HasUserData) + return ET(); + return reinterpret_cast<const ErrorHolder<ET>*>(Error)->UserData; + } + + typedef void (*unspecified_bool_type)(); + static void unspecified_bool_true() {} + + /// \brief Return false if there is an error. + operator unspecified_bool_type() const { + assert(IsValid && "Can't do anything on a default constructed ErrorOr!"); + return HasError ? 0 : unspecified_bool_true; + } + + operator llvm::error_code() const { + assert(IsValid && "Can't do anything on a default constructed ErrorOr!"); + return HasError ? Error->Error : llvm::error_code::success(); + } + + pointer operator ->() { + return toPointer(get()); + } + + reference operator *() { + return *get(); + } + +private: + template <class OtherT> + void copyConstruct(const ErrorOr<OtherT> &Other) { + // Construct an invalid ErrorOr if other is invalid. + if (!Other.IsValid) + return; + IsValid = true; + if (!Other.HasError) { + // Get the other value. + HasError = false; + new (get()) storage_type(*Other.get()); + } else { + // Get other's error. 
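+ // The error holder is shared by bumping its reference count rather than
+ // cloned, so copying an ErrorOr never duplicates the user data itself.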
+ Error = Other.Error; + HasError = true; + Error->aquire(); + } + } + + template <class T1> + static bool compareThisIfSameType(const T1 &a, const T1 &b) { + return &a == &b; + } + + template <class T1, class T2> + static bool compareThisIfSameType(const T1 &a, const T2 &b) { + return false; + } + + template <class OtherT> + void copyAssign(const ErrorOr<OtherT> &Other) { + if (compareThisIfSameType(*this, Other)) + return; + + this->~ErrorOr(); + new (this) ErrorOr(Other); + } + +#if LLVM_HAS_RVALUE_REFERENCES + template <class OtherT> + void moveConstruct(ErrorOr<OtherT> &&Other) { + // Construct an invalid ErrorOr if other is invalid. + if (!Other.IsValid) + return; + IsValid = true; + if (!Other.HasError) { + // Get the other value. + HasError = false; + new (get()) storage_type(std::move(*Other.get())); + // Tell other not to do any destruction. + Other.IsValid = false; + } else { + // Get other's error. + Error = Other.Error; + HasError = true; + // Tell other not to do any destruction. + Other.IsValid = false; + } + } + + template <class OtherT> + void moveAssign(ErrorOr<OtherT> &&Other) { + if (compareThisIfSameType(*this, Other)) + return; + + this->~ErrorOr(); + new (this) ErrorOr(std::move(Other)); + } +#endif + + pointer toPointer(pointer Val) { + return Val; + } + + pointer toPointer(wrap *Val) { + return &Val->get(); + } + + storage_type *get() { + assert(IsValid && "Can't do anything on a default constructed ErrorOr!"); + assert(!HasError && "Cannot get value when an error exists!"); + return reinterpret_cast<storage_type*>(TStorage.buffer); + } + + const storage_type *get() const { + assert(IsValid && "Can't do anything on a default constructed ErrorOr!"); + assert(!HasError && "Cannot get value when an error exists!"); + return reinterpret_cast<const storage_type*>(TStorage.buffer); + } + + union { + AlignedCharArrayUnion<storage_type> TStorage; + ErrorHolderBase *Error; + }; + bool HasError : 1; + bool IsValid : 1; +}; + +// ErrorOr specialization for void. +template <> +class ErrorOr<void> { +public: + ErrorOr() : Error(0, 0) {} + + ErrorOr(llvm::error_code EC) : Error(0, 0) { + if (EC == errc::success) { + Error.setInt(1); + return; + } + ErrorHolderBase *E = new ErrorHolderBase; + E->Error = EC; + E->HasUserData = false; + Error.setPointer(E); + } + + template<class UserDataT> + ErrorOr(UserDataT UD, typename + enable_if_c<ErrorOrUserDataTraits<UserDataT>::value>::type* = 0) + : Error(0, 0) { + ErrorHolderBase *E = new ErrorHolder<UserDataT>(llvm_move(UD)); + E->Error = ErrorOrUserDataTraits<UserDataT>::error(); + E->HasUserData = true; + Error.setPointer(E); + } + + ErrorOr(const ErrorOr &Other) : Error(0, 0) { + Error = Other.Error; + if (Other.Error.getPointer()->Error) { + Error.getPointer()->aquire(); + } + } + + ErrorOr &operator =(const ErrorOr &Other) { + if (this == &Other) + return *this; + + this->~ErrorOr(); + new (this) ErrorOr(Other); + + return *this; + } + +#if LLVM_HAS_RVALUE_REFERENCES + ErrorOr(ErrorOr &&Other) : Error(0) { + // Get other's error. + Error = Other.Error; + // Tell other not to do any destruction. 
+ Other.Error.setPointer(0); + } + + ErrorOr &operator =(ErrorOr &&Other) { + if (this == &Other) + return *this; + + this->~ErrorOr(); + new (this) ErrorOr(std::move(Other)); + + return *this; + } +#endif + + ~ErrorOr() { + if (Error.getPointer()) + Error.getPointer()->release(); + } + + template<class ET> + ET getError() const { + assert(ErrorOrUserDataTraits<ET>::error() == *this && + "Incorrect user error data type for error!"); + if (!Error.getPointer()->HasUserData) + return ET(); + return reinterpret_cast<const ErrorHolder<ET> *>( + Error.getPointer())->UserData; + } + + typedef void (*unspecified_bool_type)(); + static void unspecified_bool_true() {} + + /// \brief Return false if there is an error. + operator unspecified_bool_type() const { + return Error.getInt() ? unspecified_bool_true : 0; + } + + operator llvm::error_code() const { + return Error.getInt() ? make_error_code(errc::success) + : Error.getPointer()->Error; + } + +private: + // If the bit is 1, the error is success. + llvm::PointerIntPair<ErrorHolderBase *, 1> Error; +}; + +template<class T, class E> +typename enable_if_c<is_error_code_enum<E>::value || + is_error_condition_enum<E>::value, bool>::type +operator ==(ErrorOr<T> &Err, E Code) { + return error_code(Err) == Code; +} +} // end namespace llvm + +#endif diff --git a/include/llvm/Support/FEnv.h b/include/llvm/Support/FEnv.h index f6f4333..d9cf724 100644 --- a/include/llvm/Support/FEnv.h +++ b/include/llvm/Support/FEnv.h @@ -12,8 +12,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_SYSTEM_FENV_H -#define LLVM_SYSTEM_FENV_H +#ifndef LLVM_SUPPORT_FENV_H +#define LLVM_SUPPORT_FENV_H #include "llvm/Config/config.h" #include <cerrno> diff --git a/include/llvm/Support/FileSystem.h b/include/llvm/Support/FileSystem.h index f1cbe97..bae3a5a 100644 --- a/include/llvm/Support/FileSystem.h +++ b/include/llvm/Support/FileSystem.h @@ -24,8 +24,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_SUPPORT_FILE_SYSTEM_H -#define LLVM_SUPPORT_FILE_SYSTEM_H +#ifndef LLVM_SUPPORT_FILESYSTEM_H +#define LLVM_SUPPORT_FILESYSTEM_H #include "llvm/ADT/IntrusiveRefCntPtr.h" #include "llvm/ADT/OwningPtr.h" diff --git a/include/llvm/Support/GCOV.h b/include/llvm/Support/GCOV.h index e552315..f1040f5 100644 --- a/include/llvm/Support/GCOV.h +++ b/include/llvm/Support/GCOV.h @@ -12,8 +12,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_GCOV_H -#define LLVM_GCOV_H +#ifndef LLVM_SUPPORT_GCOV_H +#define LLVM_SUPPORT_GCOV_H #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringMap.h" diff --git a/include/llvm/Support/GetElementPtrTypeIterator.h b/include/llvm/Support/GetElementPtrTypeIterator.h index 52b06ad..5a90553 100644 --- a/include/llvm/Support/GetElementPtrTypeIterator.h +++ b/include/llvm/Support/GetElementPtrTypeIterator.h @@ -12,8 +12,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_SUPPORT_GETELEMENTPTRTYPE_H -#define LLVM_SUPPORT_GETELEMENTPTRTYPE_H +#ifndef LLVM_SUPPORT_GETELEMENTPTRTYPEITERATOR_H +#define LLVM_SUPPORT_GETELEMENTPTRTYPEITERATOR_H #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/User.h" diff --git a/include/llvm/Support/GraphWriter.h b/include/llvm/Support/GraphWriter.h index 30cfe61..22181d4 100644 --- a/include/llvm/Support/GraphWriter.h +++ b/include/llvm/Support/GraphWriter.h @@ -34,6 +34,10 @@ namespace llvm { namespace DOT { // Private 
functions... std::string EscapeString(const std::string &Label); + + /// \brief Get a color string for this node number. Simply round-robin selects + /// from a reasonable number of colors. + StringRef getColorString(unsigned NodeNumber); } namespace GraphProgram { @@ -173,6 +177,10 @@ public: // If we should include the address of the node in the label, do so now. if (DTraits.hasNodeAddressLabel(Node, G)) O << "|" << static_cast<const void*>(Node); + + std::string NodeDesc = DTraits.getNodeDescription(Node, G); + if (!NodeDesc.empty()) + O << "|" << DOT::EscapeString(NodeDesc); } std::string edgeSourceLabels; @@ -193,6 +201,10 @@ public: // If we should include the address of the node in the label, do so now. if (DTraits.hasNodeAddressLabel(Node, G)) O << "|" << static_cast<const void*>(Node); + + std::string NodeDesc = DTraits.getNodeDescription(Node, G); + if (!NodeDesc.empty()) + O << "|" << DOT::EscapeString(NodeDesc); } if (DTraits.hasEdgeDestLabels()) { diff --git a/include/llvm/Support/Host.h b/include/llvm/Support/Host.h index b331016..3a44405 100644 --- a/include/llvm/Support/Host.h +++ b/include/llvm/Support/Host.h @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_SYSTEM_HOST_H -#define LLVM_SYSTEM_HOST_H +#ifndef LLVM_SUPPORT_HOST_H +#define LLVM_SUPPORT_HOST_H #include "llvm/ADT/StringMap.h" #include <string> @@ -42,6 +42,10 @@ namespace sys { /// CPU_TYPE-VENDOR-KERNEL-OPERATING_SYSTEM std::string getDefaultTargetTriple(); + /// getProcessTriple() - Return an appropriate target triple for generating + /// code to be loaded into the current process, e.g. when using the JIT. + std::string getProcessTriple(); + /// getHostCPUName - Get the LLVM name for the host CPU. The particular format /// of the name is target dependent, and suitable for passing as -mcpu to the /// target which matches the host. diff --git a/include/llvm/Support/IncludeFile.h b/include/llvm/Support/IncludeFile.h index a931972..2067e34 100644 --- a/include/llvm/Support/IncludeFile.h +++ b/include/llvm/Support/IncludeFile.h @@ -12,8 +12,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_SYSTEM_INCLUDEFILE_H -#define LLVM_SYSTEM_INCLUDEFILE_H +#ifndef LLVM_SUPPORT_INCLUDEFILE_H +#define LLVM_SUPPORT_INCLUDEFILE_H /// This macro is the public interface that IncludeFile.h exports. 
This gives /// us the option to implement the "link the definition" capability in any diff --git a/include/llvm/Support/IntegersSubset.h b/include/llvm/Support/IntegersSubset.h index 9e5361c..ce34d78 100644 --- a/include/llvm/Support/IntegersSubset.h +++ b/include/llvm/Support/IntegersSubset.h @@ -15,8 +15,8 @@ // //===----------------------------------------------------------------------===// -#ifndef CONSTANTRANGESSET_H_ -#define CONSTANTRANGESSET_H_ +#ifndef LLVM_SUPPORT_INTEGERSSUBSET_H +#define LLVM_SUPPORT_INTEGERSSUBSET_H #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" @@ -537,4 +537,4 @@ public: } -#endif /* CONSTANTRANGESSET_H_ */ +#endif /* LLVM_SUPPORT_INTEGERSSUBSET_H */ diff --git a/include/llvm/Support/IntegersSubsetMapping.h b/include/llvm/Support/IntegersSubsetMapping.h index 7635d5e..641ce78 100644 --- a/include/llvm/Support/IntegersSubsetMapping.h +++ b/include/llvm/Support/IntegersSubsetMapping.h @@ -17,8 +17,8 @@ // //===----------------------------------------------------------------------===// -#ifndef CRSBUILDER_H_ -#define CRSBUILDER_H_ +#ifndef LLVM_SUPPORT_INTEGERSSUBSETMAPPING_H +#define LLVM_SUPPORT_INTEGERSSUBSETMAPPING_H #include "llvm/Support/IntegersSubset.h" #include <list> @@ -585,4 +585,4 @@ typedef IntegersSubsetMapping<BasicBlock> IntegersSubsetToBB; } -#endif /* CRSBUILDER_H_ */ +#endif /* LLVM_SUPPORT_INTEGERSSUBSETMAPPING_H */ diff --git a/include/llvm/Support/LEB128.h b/include/llvm/Support/LEB128.h index b52e5bc..802b4f3 100644 --- a/include/llvm/Support/LEB128.h +++ b/include/llvm/Support/LEB128.h @@ -12,8 +12,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_SYSTEM_LEB128_H -#define LLVM_SYSTEM_LEB128_H +#ifndef LLVM_SUPPORT_LEB128_H +#define LLVM_SUPPORT_LEB128_H #include "llvm/Support/raw_ostream.h" diff --git a/include/llvm/Support/Locale.h b/include/llvm/Support/Locale.h index b0f1295..b384d58 100644 --- a/include/llvm/Support/Locale.h +++ b/include/llvm/Support/Locale.h @@ -1,5 +1,5 @@ -#ifndef LLVM_SUPPORT_LOCALE -#define LLVM_SUPPORT_LOCALE +#ifndef LLVM_SUPPORT_LOCALE_H +#define LLVM_SUPPORT_LOCALE_H #include "llvm/ADT/StringRef.h" @@ -14,4 +14,4 @@ bool isPrint(int c); } } -#endif // LLVM_SUPPORT_LOCALE +#endif // LLVM_SUPPORT_LOCALE_H diff --git a/include/llvm/Support/LockFileManager.h b/include/llvm/Support/LockFileManager.h index 8c4a760..9df8675 100644 --- a/include/llvm/Support/LockFileManager.h +++ b/include/llvm/Support/LockFileManager.h @@ -41,6 +41,7 @@ public: }; private: + SmallString<128> FileName; SmallString<128> LockFileName; SmallString<128> UniqueLockFileName; diff --git a/include/llvm/Support/MathExtras.h b/include/llvm/Support/MathExtras.h index 11f9e63..214bb6c 100644 --- a/include/llvm/Support/MathExtras.h +++ b/include/llvm/Support/MathExtras.h @@ -16,6 +16,10 @@ #include "llvm/Support/SwapByteOrder.h" +#ifdef _MSC_VER +# include <intrin.h> +#endif + namespace llvm { // NOTE: The following support functions use the _32/_64 extensions instead of @@ -61,7 +65,7 @@ inline bool isShiftedInt(int64_t x) { /// isUInt - Checks if an unsigned integer fits into the given bit width. template<unsigned N> inline bool isUInt(uint64_t x) { - return N >= 64 || x < (UINT64_C(1)<<N); + return N >= 64 || x < (UINT64_C(1)<<(N)); } // Template specializations to get better code for common cases.
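As a sanity check on the semantics preserved by the parenthesization tweak above, a few spot checks (standalone sketch; isInt is the signed counterpart declared in the same header):

    #include "llvm/Support/MathExtras.h"
    #include <cassert>

    void checkWidths() {
      assert(llvm::isUInt<8>(255) && !llvm::isUInt<8>(256));    // unsigned 8-bit range
      assert(llvm::isInt<12>(-2048) && !llvm::isInt<12>(2048)); // signed 12-bit range
      assert(llvm::isUInt<64>(UINT64_C(0xffffffffffffffff)));   // N >= 64 always fits
    }

(The common-case specializations follow.)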
template<> diff --git a/include/llvm/Support/Memory.h b/include/llvm/Support/Memory.h index 025eee7..a08c796 100644 --- a/include/llvm/Support/Memory.h +++ b/include/llvm/Support/Memory.h @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_SYSTEM_MEMORY_H -#define LLVM_SYSTEM_MEMORY_H +#ifndef LLVM_SUPPORT_MEMORY_H +#define LLVM_SUPPORT_MEMORY_H #include "llvm/Support/DataTypes.h" #include "llvm/Support/system_error.h" diff --git a/include/llvm/Support/MemoryObject.h b/include/llvm/Support/MemoryObject.h index b778b08..732b0f0 100644 --- a/include/llvm/Support/MemoryObject.h +++ b/include/llvm/Support/MemoryObject.h @@ -7,8 +7,8 @@ // //===----------------------------------------------------------------------===// -#ifndef MEMORYOBJECT_H -#define MEMORYOBJECT_H +#ifndef LLVM_SUPPORT_MEMORYOBJECT_H +#define LLVM_SUPPORT_MEMORYOBJECT_H #include "llvm/Support/DataTypes.h" diff --git a/include/llvm/Support/Mutex.h b/include/llvm/Support/Mutex.h index 6abc533..496a438 100644 --- a/include/llvm/Support/Mutex.h +++ b/include/llvm/Support/Mutex.h @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_SYSTEM_MUTEX_H -#define LLVM_SYSTEM_MUTEX_H +#ifndef LLVM_SUPPORT_MUTEX_H +#define LLVM_SUPPORT_MUTEX_H #include "llvm/Support/Compiler.h" #include "llvm/Support/Threading.h" diff --git a/include/llvm/Support/PassNameParser.h b/include/llvm/Support/PassNameParser.h index bdfab39..317416c 100644 --- a/include/llvm/Support/PassNameParser.h +++ b/include/llvm/Support/PassNameParser.h @@ -20,8 +20,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_SUPPORT_PASS_NAME_PARSER_H -#define LLVM_SUPPORT_PASS_NAME_PARSER_H +#ifndef LLVM_SUPPORT_PASSNAMEPARSER_H +#define LLVM_SUPPORT_PASSNAMEPARSER_H #include "llvm/ADT/STLExtras.h" #include "llvm/Pass.h" diff --git a/include/llvm/Support/PathV1.h b/include/llvm/Support/PathV1.h index 643ee8c..86328f0 100644 --- a/include/llvm/Support/PathV1.h +++ b/include/llvm/Support/PathV1.h @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_SYSTEM_PATH_H -#define LLVM_SYSTEM_PATH_H +#ifndef LLVM_SUPPORT_PATHV1_H +#define LLVM_SUPPORT_PATHV1_H #include "llvm/ADT/StringRef.h" #include "llvm/Support/Compiler.h" diff --git a/include/llvm/Support/Process.h b/include/llvm/Support/Process.h index ea8da6d..4256d4a 100644 --- a/include/llvm/Support/Process.h +++ b/include/llvm/Support/Process.h @@ -22,8 +22,8 @@ /// //===----------------------------------------------------------------------===// -#ifndef LLVM_SYSTEM_PROCESS_H -#define LLVM_SYSTEM_PROCESS_H +#ifndef LLVM_SUPPORT_PROCESS_H +#define LLVM_SUPPORT_PROCESS_H #include "llvm/Config/llvm-config.h" #include "llvm/Support/DataTypes.h" diff --git a/include/llvm/Support/Program.h b/include/llvm/Support/Program.h index 028565d..a0cc27c 100644 --- a/include/llvm/Support/Program.h +++ b/include/llvm/Support/Program.h @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_SYSTEM_PROGRAM_H -#define LLVM_SYSTEM_PROGRAM_H +#ifndef LLVM_SUPPORT_PROGRAM_H +#define LLVM_SUPPORT_PROGRAM_H #include "llvm/Support/Path.h" diff --git a/include/llvm/Support/RegistryParser.h b/include/llvm/Support/RegistryParser.h index 2cc5783..a6997b6 100644 --- a/include/llvm/Support/RegistryParser.h +++ b/include/llvm/Support/RegistryParser.h @@ -11,8 +11,8 @@ // 
//===----------------------------------------------------------------------===// -#ifndef LLVM_SUPPORT_REGISTRY_PARSER_H -#define LLVM_SUPPORT_REGISTRY_PARSER_H +#ifndef LLVM_SUPPORT_REGISTRYPARSER_H +#define LLVM_SUPPORT_REGISTRYPARSER_H #include "llvm/Support/CommandLine.h" #include "llvm/Support/Registry.h" @@ -52,4 +52,4 @@ namespace llvm { } -#endif // LLVM_SUPPORT_REGISTRY_PARSER_H +#endif // LLVM_SUPPORT_REGISTRYPARSER_H diff --git a/include/llvm/Support/SMLoc.h b/include/llvm/Support/SMLoc.h index 2b0e9df..0906471 100644 --- a/include/llvm/Support/SMLoc.h +++ b/include/llvm/Support/SMLoc.h @@ -12,8 +12,8 @@ // //===----------------------------------------------------------------------===// -#ifndef SUPPORT_SMLOC_H -#define SUPPORT_SMLOC_H +#ifndef LLVM_SUPPORT_SMLOC_H +#define LLVM_SUPPORT_SMLOC_H #include <cassert> diff --git a/include/llvm/Support/SaveAndRestore.h b/include/llvm/Support/SaveAndRestore.h index ffa99b9..6330bec 100644 --- a/include/llvm/Support/SaveAndRestore.h +++ b/include/llvm/Support/SaveAndRestore.h @@ -12,8 +12,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_ADT_SAVERESTORE -#define LLVM_ADT_SAVERESTORE +#ifndef LLVM_SUPPORT_SAVEANDRESTORE_H +#define LLVM_SUPPORT_SAVEANDRESTORE_H namespace llvm { diff --git a/include/llvm/Support/Signals.h b/include/llvm/Support/Signals.h index 634f4cf..465656b 100644 --- a/include/llvm/Support/Signals.h +++ b/include/llvm/Support/Signals.h @@ -12,10 +12,11 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_SYSTEM_SIGNALS_H -#define LLVM_SYSTEM_SIGNALS_H +#ifndef LLVM_SUPPORT_SIGNALS_H +#define LLVM_SUPPORT_SIGNALS_H #include "llvm/Support/Path.h" +#include <cstdio> namespace llvm { namespace sys { @@ -38,6 +39,9 @@ namespace sys { /// @brief Print a stack trace if a fatal signal occurs. void PrintStackTraceOnErrorSignal(); + /// \brief Print the stack trace using the given \c FILE object. + void PrintStackTrace(FILE *); + /// AddSignalHandler - Add a function to be called when an abort/kill signal /// is delivered to the process. The handler can have a cookie passed to it /// to identify what instance of the handler it is. diff --git a/include/llvm/Support/Solaris.h b/include/llvm/Support/Solaris.h index 57eee2c..6228c4b 100644 --- a/include/llvm/Support/Solaris.h +++ b/include/llvm/Support/Solaris.h @@ -11,8 +11,8 @@ * *===----------------------------------------------------------------------===*/ -#ifndef LLVM_SYSTEM_SOLARIS_H -#define LLVM_SYSTEM_SOLARIS_H +#ifndef LLVM_SUPPORT_SOLARIS_H +#define LLVM_SUPPORT_SOLARIS_H #include <sys/types.h> #include <sys/regset.h> diff --git a/include/llvm/Support/SourceMgr.h b/include/llvm/Support/SourceMgr.h index aaee344..02abf92 100644 --- a/include/llvm/Support/SourceMgr.h +++ b/include/llvm/Support/SourceMgr.h @@ -13,10 +13,12 @@ // //===----------------------------------------------------------------------===// -#ifndef SUPPORT_SOURCEMGR_H -#define SUPPORT_SOURCEMGR_H +#ifndef LLVM_SUPPORT_SOURCEMGR_H +#define LLVM_SUPPORT_SOURCEMGR_H #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Twine.h" #include "llvm/Support/SMLoc.h" #include <string> @@ -24,6 +26,7 @@ namespace llvm { class MemoryBuffer; class SourceMgr; class SMDiagnostic; + class SMFixIt; class Twine; class raw_ostream; @@ -143,6 +146,7 @@ public: /// the default error handler is used. 
void PrintMessage(SMLoc Loc, DiagKind Kind, const Twine &Msg, ArrayRef<SMRange> Ranges = ArrayRef<SMRange>(), + ArrayRef<SMFixIt> FixIts = ArrayRef<SMFixIt>(), bool ShowColors = true) const; @@ -152,7 +156,8 @@ public: /// @param Msg If non-null, the kind of message (e.g., "error") which is /// prefixed to the message. SMDiagnostic GetMessage(SMLoc Loc, DiagKind Kind, const Twine &Msg, - ArrayRef<SMRange> Ranges = ArrayRef<SMRange>()) const; + ArrayRef<SMRange> Ranges = ArrayRef<SMRange>(), + ArrayRef<SMFixIt> FixIts = ArrayRef<SMFixIt>()) const; /// PrintIncludeStack - Prints the names of included files and the line of the /// file they were included from. A diagnostic handler can use this before @@ -164,6 +169,38 @@ public: }; +/// Represents a single fixit, a replacement of one range of text with another. +class SMFixIt { + SMRange Range; + + std::string Text; + +public: + // FIXME: Twine.str() is not very efficient. + SMFixIt(SMLoc Loc, const Twine &Insertion) + : Range(Loc, Loc), Text(Insertion.str()) { + assert(Loc.isValid()); + } + + // FIXME: Twine.str() is not very efficient. + SMFixIt(SMRange R, const Twine &Replacement) + : Range(R), Text(Replacement.str()) { + assert(R.isValid()); + } + + StringRef getText() const { return Text; } + SMRange getRange() const { return Range; } + + bool operator<(const SMFixIt &Other) const { + if (Range.Start.getPointer() != Other.Range.Start.getPointer()) + return Range.Start.getPointer() < Other.Range.Start.getPointer(); + if (Range.End.getPointer() != Other.Range.End.getPointer()) + return Range.End.getPointer() < Other.Range.End.getPointer(); + return Text < Other.Text; + } +}; + + /// SMDiagnostic - Instances of this class encapsulate one diagnostic report, /// allowing printing to a raw_ostream as a caret diagnostic. class SMDiagnostic { @@ -174,35 +211,46 @@ class SMDiagnostic { SourceMgr::DiagKind Kind; std::string Message, LineContents; std::vector<std::pair<unsigned, unsigned> > Ranges; + SmallVector<SMFixIt, 4> FixIts; public: // Null diagnostic. SMDiagnostic() : SM(0), LineNo(0), ColumnNo(0), Kind(SourceMgr::DK_Error) {} // Diagnostic with no location (e.g. file not found, command line arg error). - SMDiagnostic(const std::string &filename, SourceMgr::DiagKind Knd, - const std::string &Msg) + SMDiagnostic(StringRef filename, SourceMgr::DiagKind Knd, StringRef Msg) : SM(0), Filename(filename), LineNo(-1), ColumnNo(-1), Kind(Knd), Message(Msg) {} // Diagnostic with a location. 
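To show how the new FixIts parameter is meant to be driven, a hypothetical caller might do the following (SM is assumed to be a live SourceMgr and Loc a valid location inside one of its buffers; neither name comes from this patch):

    // Sketch: attach a fix-it hint to a SourceMgr diagnostic.
    llvm::SmallVector<llvm::SMFixIt, 1> FixIts;
    FixIts.push_back(llvm::SMFixIt(Loc, ";"));       // suggest inserting ';' at Loc
    SM.PrintMessage(Loc, llvm::SourceMgr::DK_Error,
                    "expected ';' after expression",
                    llvm::ArrayRef<llvm::SMRange>(), // no highlighted ranges
                    FixIts);

(The diagnostic-with-location constructor follows.)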
- SMDiagnostic(const SourceMgr &sm, SMLoc L, const std::string &FN, + SMDiagnostic(const SourceMgr &sm, SMLoc L, StringRef FN, int Line, int Col, SourceMgr::DiagKind Kind, - const std::string &Msg, const std::string &LineStr, - ArrayRef<std::pair<unsigned,unsigned> > Ranges); + StringRef Msg, StringRef LineStr, + ArrayRef<std::pair<unsigned,unsigned> > Ranges, + ArrayRef<SMFixIt> FixIts = ArrayRef<SMFixIt>()); const SourceMgr *getSourceMgr() const { return SM; } SMLoc getLoc() const { return Loc; } - const std::string &getFilename() const { return Filename; } + StringRef getFilename() const { return Filename; } int getLineNo() const { return LineNo; } int getColumnNo() const { return ColumnNo; } SourceMgr::DiagKind getKind() const { return Kind; } - const std::string &getMessage() const { return Message; } - const std::string &getLineContents() const { return LineContents; } - const std::vector<std::pair<unsigned, unsigned> > &getRanges() const { + StringRef getMessage() const { return Message; } + StringRef getLineContents() const { return LineContents; } + ArrayRef<std::pair<unsigned, unsigned> > getRanges() const { return Ranges; } - void print(const char *ProgName, raw_ostream &S, bool ShowColors = true) const; + + void addFixIt(const SMFixIt &Hint) { + FixIts.push_back(Hint); + } + + ArrayRef<SMFixIt> getFixIts() const { + return FixIts; + } + + void print(const char *ProgName, raw_ostream &S, + bool ShowColors = true) const; }; } // end llvm namespace diff --git a/include/llvm/Support/StreamableMemoryObject.h b/include/llvm/Support/StreamableMemoryObject.h index 2e1163f..3855485 100644 --- a/include/llvm/Support/StreamableMemoryObject.h +++ b/include/llvm/Support/StreamableMemoryObject.h @@ -8,8 +8,8 @@ //===----------------------------------------------------------------------===// -#ifndef STREAMABLEMEMORYOBJECT_H_ -#define STREAMABLEMEMORYOBJECT_H_ +#ifndef LLVM_SUPPORT_STREAMABLEMEMORYOBJECT_H +#define LLVM_SUPPORT_STREAMABLEMEMORYOBJECT_H #include "llvm/ADT/OwningPtr.h" #include "llvm/Support/Compiler.h" diff --git a/include/llvm/Support/SwapByteOrder.h b/include/llvm/Support/SwapByteOrder.h index 6c0592c..e65f9cc 100644 --- a/include/llvm/Support/SwapByteOrder.h +++ b/include/llvm/Support/SwapByteOrder.h @@ -12,8 +12,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_SYSTEM_SWAP_BYTE_ORDER_H -#define LLVM_SYSTEM_SWAP_BYTE_ORDER_H +#ifndef LLVM_SUPPORT_SWAPBYTEORDER_H +#define LLVM_SUPPORT_SWAPBYTEORDER_H #include "llvm/Support/DataTypes.h" #include <cstddef> diff --git a/include/llvm/Support/ThreadLocal.h b/include/llvm/Support/ThreadLocal.h index 01f735c..7518626 100644 --- a/include/llvm/Support/ThreadLocal.h +++ b/include/llvm/Support/ThreadLocal.h @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_SYSTEM_THREAD_LOCAL_H -#define LLVM_SYSTEM_THREAD_LOCAL_H +#ifndef LLVM_SUPPORT_THREADLOCAL_H +#define LLVM_SUPPORT_THREADLOCAL_H #include "llvm/Support/DataTypes.h" #include "llvm/Support/Threading.h" diff --git a/include/llvm/Support/Threading.h b/include/llvm/Support/Threading.h index 9017afb..a7e8774 100644 --- a/include/llvm/Support/Threading.h +++ b/include/llvm/Support/Threading.h @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_SYSTEM_THREADING_H -#define LLVM_SYSTEM_THREADING_H +#ifndef LLVM_SUPPORT_THREADING_H +#define LLVM_SUPPORT_THREADING_H namespace llvm { /// llvm_start_multithreaded - 
Allocate and initialize structures needed to diff --git a/include/llvm/Support/TimeValue.h b/include/llvm/Support/TimeValue.h index ab7401a..4b48b84 100644 --- a/include/llvm/Support/TimeValue.h +++ b/include/llvm/Support/TimeValue.h @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_SYSTEM_TIMEVALUE_H -#define LLVM_SYSTEM_TIMEVALUE_H +#ifndef LLVM_SUPPORT_TIMEVALUE_H +#define LLVM_SUPPORT_TIMEVALUE_H #include "llvm/Support/DataTypes.h" #include <string> @@ -240,7 +240,7 @@ namespace sys { /// Posix, correcting for the difference in Posix zero time. /// @brief Convert to unix time (100 nanoseconds since 12:00:00a Jan 1,1970) uint64_t toPosixTime() const { - uint64_t result = seconds_ - PosixZeroTime.seconds_; + uint64_t result = seconds_ - PosixZeroTimeSeconds; result += nanos_ / NANOSECONDS_PER_POSIX_TICK; return result; } @@ -248,14 +248,14 @@ namespace sys { /// Converts the TimeValue into the corresponding number of seconds /// since the epoch (00:00:00 Jan 1,1970). uint64_t toEpochTime() const { - return seconds_ - PosixZeroTime.seconds_; + return seconds_ - PosixZeroTimeSeconds; } /// Converts the TimeValue into the corresponding number of "ticks" for /// Win32 platforms, correcting for the difference in Win32 zero time. /// @brief Convert to windows time (seconds since 12:00:00a Jan 1, 1601) uint64_t toWin32Time() const { - uint64_t result = seconds_ - Win32ZeroTime.seconds_; + uint64_t result = seconds_ - Win32ZeroTimeSeconds; result += nanos_ / NANOSECONDS_PER_WIN32_TICK; return result; } @@ -264,7 +264,7 @@ namespace sys { /// correction for the Posix zero time. /// @brief Convert to timespec time (ala POSIX.1b) void getTimespecTime( uint64_t& seconds, uint32_t& nanos ) const { - seconds = seconds_ - PosixZeroTime.seconds_; + seconds = seconds_ - PosixZeroTimeSeconds; nanos = nanos_; } @@ -331,7 +331,7 @@ namespace sys { /// TimeValue and assigns that value to \p this. /// @brief Convert seconds form PosixTime to TimeValue void fromEpochTime( SecondsType seconds ) { - seconds_ = seconds + PosixZeroTime.seconds_; + seconds_ = seconds + PosixZeroTimeSeconds; nanos_ = 0; this->normalize(); } @@ -340,7 +340,7 @@ namespace sys { /// corresponding TimeValue and assigns that value to \p this. /// @brief Convert seconds form Windows FILETIME to TimeValue void fromWin32Time( uint64_t win32Time ) { - this->seconds_ = win32Time / 10000000 + Win32ZeroTime.seconds_; + this->seconds_ = win32Time / 10000000 + Win32ZeroTimeSeconds; this->nanos_ = NanoSecondsType(win32Time % 10000000) * 100; } @@ -360,6 +360,9 @@ namespace sys { /// Store the values as a <timeval>. SecondsType seconds_;///< Stores the seconds part of the TimeVal NanoSecondsType nanos_; ///< Stores the nanoseconds part of the TimeVal + + static const SecondsType PosixZeroTimeSeconds; + static const SecondsType Win32ZeroTimeSeconds; /// @} }; diff --git a/include/llvm/Support/Timer.h b/include/llvm/Support/Timer.h index 789c05f..d009d7f 100644 --- a/include/llvm/Support/Timer.h +++ b/include/llvm/Support/Timer.h @@ -6,11 +6,6 @@ // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// -// -// This file defines three classes: Timer, TimeRegion, and TimerGroup, -// documented below. 
-// -//===----------------------------------------------------------------------===// #ifndef LLVM_SUPPORT_TIMER_H #define LLVM_SUPPORT_TIMER_H @@ -78,7 +73,7 @@ public: /// invocations of its startTimer()/stopTimer() methods. Given appropriate OS /// support it can also keep track of the RSS of the program at various points. /// By default, the Timer will print the amount of time it has captured to -/// standard error when the laster timer is destroyed, otherwise it is printed +/// standard error when the last timer is destroyed, otherwise it is printed /// when its TimerGroup is destroyed. Timers do not print their information /// if they are never started. /// @@ -126,7 +121,7 @@ private: /// The TimeRegion class is used as a helper class to call the startTimer() and /// stopTimer() methods of the Timer class. When the object is constructed, it -/// starts the timer specified as it's argument. When it is destroyed, it stops +/// starts the timer specified as its argument. When it is destroyed, it stops /// the relevant timer. This makes it easy to time a region of code. /// class TimeRegion { diff --git a/include/llvm/Support/ToolOutputFile.h b/include/llvm/Support/ToolOutputFile.h index 65b182a..b3b7c57 100644 --- a/include/llvm/Support/ToolOutputFile.h +++ b/include/llvm/Support/ToolOutputFile.h @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_SUPPORT_TOOL_OUTPUT_FILE_H -#define LLVM_SUPPORT_TOOL_OUTPUT_FILE_H +#ifndef LLVM_SUPPORT_TOOLOUTPUTFILE_H +#define LLVM_SUPPORT_TOOLOUTPUTFILE_H #include "llvm/Support/raw_ostream.h" diff --git a/include/llvm/Support/YAMLParser.h b/include/llvm/Support/YAMLParser.h index 5b728bc..6e4f57f 100644 --- a/include/llvm/Support/YAMLParser.h +++ b/include/llvm/Support/YAMLParser.h @@ -35,8 +35,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_SUPPORT_YAML_PARSER_H -#define LLVM_SUPPORT_YAML_PARSER_H +#ifndef LLVM_SUPPORT_YAMLPARSER_H +#define LLVM_SUPPORT_YAMLPARSER_H #include "llvm/ADT/OwningPtr.h" #include "llvm/ADT/SmallString.h" diff --git a/include/llvm/Support/YAMLTraits.h b/include/llvm/Support/YAMLTraits.h index 1862286..801868f 100644 --- a/include/llvm/Support/YAMLTraits.h +++ b/include/llvm/Support/YAMLTraits.h @@ -7,8 +7,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_YAML_TRAITS_H_ -#define LLVM_YAML_TRAITS_H_ +#ifndef LLVM_SUPPORT_YAMLTRAITS_H +#define LLVM_SUPPORT_YAMLTRAITS_H #include "llvm/ADT/DenseMap.h" @@ -195,7 +195,7 @@ public: template <class T> struct has_ScalarTraits { - typedef llvm::StringRef (*Signature_input)(llvm::StringRef, void*, T&); + typedef StringRef (*Signature_input)(StringRef, void*, T&); typedef void (*Signature_output)(const T&, void*, llvm::raw_ostream&); template <typename U> @@ -528,73 +528,73 @@ yamlize(IO &io, T &Seq, bool) { template<> struct ScalarTraits<bool> { static void output(const bool &, void*, llvm::raw_ostream &); - static llvm::StringRef input(llvm::StringRef , void*, bool &); + static StringRef input(StringRef, void*, bool &); }; template<> struct ScalarTraits<StringRef> { static void output(const StringRef &, void*, llvm::raw_ostream &); - static llvm::StringRef input(llvm::StringRef , void*, StringRef &); + static StringRef input(StringRef, void*, StringRef &); }; template<> struct ScalarTraits<uint8_t> { static void output(const uint8_t &, void*, llvm::raw_ostream &); - static llvm::StringRef input(llvm::StringRef , 
void*, uint8_t &); + static StringRef input(StringRef, void*, uint8_t &); }; template<> struct ScalarTraits<uint16_t> { static void output(const uint16_t &, void*, llvm::raw_ostream &); - static llvm::StringRef input(llvm::StringRef , void*, uint16_t &); + static StringRef input(StringRef, void*, uint16_t &); }; template<> struct ScalarTraits<uint32_t> { static void output(const uint32_t &, void*, llvm::raw_ostream &); - static llvm::StringRef input(llvm::StringRef , void*, uint32_t &); + static StringRef input(StringRef, void*, uint32_t &); }; template<> struct ScalarTraits<uint64_t> { static void output(const uint64_t &, void*, llvm::raw_ostream &); - static llvm::StringRef input(llvm::StringRef , void*, uint64_t &); + static StringRef input(StringRef, void*, uint64_t &); }; template<> struct ScalarTraits<int8_t> { static void output(const int8_t &, void*, llvm::raw_ostream &); - static llvm::StringRef input(llvm::StringRef , void*, int8_t &); + static StringRef input(StringRef, void*, int8_t &); }; template<> struct ScalarTraits<int16_t> { static void output(const int16_t &, void*, llvm::raw_ostream &); - static llvm::StringRef input(llvm::StringRef , void*, int16_t &); + static StringRef input(StringRef, void*, int16_t &); }; template<> struct ScalarTraits<int32_t> { static void output(const int32_t &, void*, llvm::raw_ostream &); - static llvm::StringRef input(llvm::StringRef , void*, int32_t &); + static StringRef input(StringRef, void*, int32_t &); }; template<> struct ScalarTraits<int64_t> { static void output(const int64_t &, void*, llvm::raw_ostream &); - static llvm::StringRef input(llvm::StringRef , void*, int64_t &); + static StringRef input(StringRef, void*, int64_t &); }; template<> struct ScalarTraits<float> { static void output(const float &, void*, llvm::raw_ostream &); - static llvm::StringRef input(llvm::StringRef , void*, float &); + static StringRef input(StringRef, void*, float &); }; template<> struct ScalarTraits<double> { static void output(const double &, void*, llvm::raw_ostream &); - static llvm::StringRef input(llvm::StringRef , void*, double &); + static StringRef input(StringRef, void*, double &); }; @@ -913,25 +913,25 @@ LLVM_YAML_STRONG_TYPEDEF(uint64_t, Hex64) template<> struct ScalarTraits<Hex8> { static void output(const Hex8 &, void*, llvm::raw_ostream &); - static llvm::StringRef input(llvm::StringRef , void*, Hex8 &); + static StringRef input(StringRef, void*, Hex8 &); }; template<> struct ScalarTraits<Hex16> { static void output(const Hex16 &, void*, llvm::raw_ostream &); - static llvm::StringRef input(llvm::StringRef , void*, Hex16 &); + static StringRef input(StringRef, void*, Hex16 &); }; template<> struct ScalarTraits<Hex32> { static void output(const Hex32 &, void*, llvm::raw_ostream &); - static llvm::StringRef input(llvm::StringRef , void*, Hex32 &); + static StringRef input(StringRef, void*, Hex32 &); }; template<> struct ScalarTraits<Hex64> { static void output(const Hex64 &, void*, llvm::raw_ostream &); - static llvm::StringRef input(llvm::StringRef , void*, Hex64 &); + static StringRef input(StringRef, void*, Hex64 &); }; @@ -1101,4 +1101,4 @@ operator<<(Output &yout, T &seq) { -#endif // LLVM_YAML_TRAITS_H_ +#endif // LLVM_SUPPORT_YAMLTRAITS_H diff --git a/include/llvm/Support/raw_ostream.h b/include/llvm/Support/raw_ostream.h index eab0f2d..d2b4a2a 100644 --- a/include/llvm/Support/raw_ostream.h +++ b/include/llvm/Support/raw_ostream.h @@ -29,7 +29,6 @@ namespace llvm { /// a chunk at a time. 
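The normalized input/output signatures above are also the shape user code follows for its own scalar types; a hypothetical specialization (MyColor is illustrative, and ScalarTraits lives in namespace llvm::yaml):

    // Sketch: a user-defined scalar following the ScalarTraits pattern above.
    struct MyColor { unsigned Hex; };

    namespace llvm {
    namespace yaml {
    template <> struct ScalarTraits<MyColor> {
      static void output(const MyColor &C, void *, raw_ostream &OS) {
        OS.write_hex(C.Hex);            // serialize as hex digits
      }
      static StringRef input(StringRef S, void *, MyColor &C) {
        if (S.getAsInteger(16, C.Hex))
          return "invalid hex color";   // non-empty result reports a parse error
        return StringRef();             // empty result means success
      }
    };
    } // end namespace yaml
    } // end namespace llvm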
class raw_ostream { private: - // Do not implement. raw_ostream is noncopyable. void operator=(const raw_ostream &) LLVM_DELETED_FUNCTION; raw_ostream(const raw_ostream &) LLVM_DELETED_FUNCTION; diff --git a/include/llvm/Support/system_error.h b/include/llvm/Support/system_error.h index 0d164f6..43dace6 100644 --- a/include/llvm/Support/system_error.h +++ b/include/llvm/Support/system_error.h @@ -14,8 +14,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_SYSTEM_SYSTEM_ERROR_H -#define LLVM_SYSTEM_SYSTEM_ERROR_H +#ifndef LLVM_SUPPORT_SYSTEM_ERROR_H +#define LLVM_SUPPORT_SYSTEM_ERROR_H #include "llvm/Support/Compiler.h" diff --git a/include/llvm/Support/type_traits.h b/include/llvm/Support/type_traits.h index f930639..db43ccf 100644 --- a/include/llvm/Support/type_traits.h +++ b/include/llvm/Support/type_traits.h @@ -145,6 +145,10 @@ template <typename T> struct is_pointer<T* const> : true_type {}; template <typename T> struct is_pointer<T* volatile> : true_type {}; template <typename T> struct is_pointer<T* const volatile> : true_type {}; +/// \brief Metafunction that determines whether the given type is a reference. +template <typename T> struct is_reference : false_type {}; +template <typename T> struct is_reference<T&> : true_type {}; + /// \brief Metafunction that determines whether the given type is either an /// integral type or an enumeration type. /// diff --git a/include/llvm/TableGen/Record.h b/include/llvm/TableGen/Record.h index 8968534..3cf4f1f 100644 --- a/include/llvm/TableGen/Record.h +++ b/include/llvm/TableGen/Record.h @@ -175,8 +175,7 @@ public: }; -// BitsRecTy - 'bits<n>' - Represent a fixed number of bits -/// BitsRecTy - 'bits<n>' - Represent a fixed number of bits +/// BitsRecTy - 'bits<n>' - Represent a fixed number of bits /// class BitsRecTy : public RecTy { unsigned Size; @@ -288,10 +287,8 @@ public: } }; -// ListRecTy - 'list<Ty>' - Represent a list of values, all of which must be of -// the specified type. -/// ListRecTy - 'list<Ty>' - Represent a list of values, all of which must -/// be of the specified type. +/// ListRecTy - 'list<Ty>' - Represent a list of values, all of which must be of +/// the specified type. /// class ListRecTy : public RecTy { RecTy *Ty; @@ -930,7 +927,7 @@ public: /// class BinOpInit : public OpInit { public: - enum BinaryOp { SHL, SRA, SRL, STRCONCAT, CONCAT, EQ }; + enum BinaryOp { ADD, SHL, SRA, SRL, STRCONCAT, CONCAT, EQ }; private: BinaryOp Opc; Init *LHS, *RHS; @@ -1387,12 +1384,14 @@ class Record { SmallVector<SMLoc, 4> Locs; std::vector<Init *> TemplateArgs; std::vector<RecordVal> Values; - std::vector<Record*> SuperClasses; + std::vector<Record *> SuperClasses; + std::vector<SMRange> SuperClassRanges; // Tracks Record instances. Not owned by Record. RecordKeeper &TrackedRecords; DefInit *TheInit; + bool IsAnonymous; void init(); void checkName(); @@ -1401,14 +1400,15 @@ public: // Constructs a record.
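The is_reference metafunction added in the type_traits.h hunk above can be checked at compile time without C++11 static_assert (standalone sketch using the negative-array-size trick):

    #include "llvm/Support/type_traits.h"

    // Each typedef compiles only if its condition holds.
    typedef char AssertIsRef[llvm::is_reference<int &>::value ? 1 : -1];
    typedef char AssertNotRef[!llvm::is_reference<int>::value ? 1 : -1];

(The Record constructors follow.)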
explicit Record(const std::string &N, ArrayRef<SMLoc> locs, - RecordKeeper &records) : + RecordKeeper &records, bool Anonymous = false) : ID(LastID++), Name(StringInit::get(N)), Locs(locs.begin(), locs.end()), - TrackedRecords(records), TheInit(0) { + TrackedRecords(records), TheInit(0), IsAnonymous(Anonymous) { init(); } - explicit Record(Init *N, ArrayRef<SMLoc> locs, RecordKeeper &records) : + explicit Record(Init *N, ArrayRef<SMLoc> locs, RecordKeeper &records, + bool Anonymous = false) : ID(LastID++), Name(N), Locs(locs.begin(), locs.end()), - TrackedRecords(records), TheInit(0) { + TrackedRecords(records), TheInit(0), IsAnonymous(Anonymous) { init(); } @@ -1417,7 +1417,8 @@ public: Record(const Record &O) : ID(LastID++), Name(O.Name), Locs(O.Locs), TemplateArgs(O.TemplateArgs), Values(O.Values), SuperClasses(O.SuperClasses), - TrackedRecords(O.TrackedRecords), TheInit(O.TheInit) { } + SuperClassRanges(O.SuperClassRanges), TrackedRecords(O.TrackedRecords), + TheInit(O.TheInit), IsAnonymous(O.IsAnonymous) { } ~Record() {} @@ -1448,6 +1449,7 @@ public: } const std::vector<RecordVal> &getValues() const { return Values; } const std::vector<Record*> &getSuperClasses() const { return SuperClasses; } + ArrayRef<SMRange> getSuperClassRanges() const { return SuperClassRanges; } bool isTemplateArg(Init *Name) const { for (unsigned i = 0, e = TemplateArgs.size(); i != e; ++i) @@ -1522,9 +1524,10 @@ public: return false; } - void addSuperClass(Record *R) { + void addSuperClass(Record *R, SMRange Range) { assert(!isSubClassOf(R) && "Already subclassing record!"); SuperClasses.push_back(R); + SuperClassRanges.push_back(Range); } /// resolveReferences - If there are any field references that refer to fields @@ -1541,6 +1544,10 @@ public: return TrackedRecords; } + bool isAnonymous() const { + return IsAnonymous; + } + void dump() const; //===--------------------------------------------------------------------===// diff --git a/include/llvm/TableGen/StringMatcher.h b/include/llvm/TableGen/StringMatcher.h index 1b0572f..99cbcad 100644 --- a/include/llvm/TableGen/StringMatcher.h +++ b/include/llvm/TableGen/StringMatcher.h @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef STRINGMATCHER_H -#define STRINGMATCHER_H +#ifndef LLVM_TABLEGEN_STRINGMATCHER_H +#define LLVM_TABLEGEN_STRINGMATCHER_H #include "llvm/ADT/StringRef.h" #include <string> diff --git a/include/llvm/Target/CostTable.h b/include/llvm/Target/CostTable.h new file mode 100644 index 0000000..a974b56 --- /dev/null +++ b/include/llvm/Target/CostTable.h @@ -0,0 +1,64 @@ +//===-- CostTable.h - Instruction Cost Table handling -----------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief Cost tables and simple lookup functions +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TARGET_COSTTABLE_H_ +#define LLVM_TARGET_COSTTABLE_H_ + +namespace llvm { + +/// Cost Table Entry +template <class TypeTy> +struct CostTblEntry { + int ISD; + TypeTy Type; + unsigned Cost; +}; + +/// Find in cost table, TypeTy must be comparable by == +template <class TypeTy> +int CostTableLookup(const CostTblEntry<TypeTy> *Tbl, + unsigned len, int ISD, TypeTy Ty) { + for (unsigned int i = 0; i < len; ++i) + if (Tbl[i].ISD == ISD && Tbl[i].Type == Ty) + return i; + + // Could not find an entry. + return -1; +} + +/// Type Conversion Cost Table +template <class TypeTy> +struct TypeConversionCostTblEntry { + int ISD; + TypeTy Dst; + TypeTy Src; + unsigned Cost; +}; + +/// Find in type conversion cost table, TypeTy must be comparable by == +template <class TypeTy> +int ConvertCostTableLookup(const TypeConversionCostTblEntry<TypeTy> *Tbl, + unsigned len, int ISD, TypeTy Dst, TypeTy Src) { + for (unsigned int i = 0; i < len; ++i) + if (Tbl[i].ISD == ISD && Tbl[i].Src == Src && Tbl[i].Dst == Dst) + return i; + + // Could not find an entry. + return -1; +} + +} // namespace llvm + + +#endif /* LLVM_TARGET_COSTTABLE_H_ */ diff --git a/include/llvm/Target/Mangler.h b/include/llvm/Target/Mangler.h index a50f54a..9500f1c 100644 --- a/include/llvm/Target/Mangler.h +++ b/include/llvm/Target/Mangler.h @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_SUPPORT_MANGLER_H -#define LLVM_SUPPORT_MANGLER_H +#ifndef LLVM_TARGET_MANGLER_H +#define LLVM_TARGET_MANGLER_H #include "llvm/ADT/DenseMap.h" @@ -69,4 +69,4 @@ public: } // End llvm namespace -#endif // LLVM_SUPPORT_MANGLER_H +#endif // LLVM_TARGET_MANGLER_H diff --git a/include/llvm/Target/Target.td b/include/llvm/Target/Target.td index 1781e47..876479b 100644 --- a/include/llvm/Target/Target.td +++ b/include/llvm/Target/Target.td @@ -367,8 +367,9 @@ class Instruction { // hasSideEffects - The instruction has side effects that are not // captured by any operands of the instruction or other flags. // - // neverHasSideEffects - Set on an instruction with no pattern if it has no - // side effects. + // neverHasSideEffects (deprecated) - Set on an instruction with no pattern + // if it has no side effects. This is now equivalent to setting + // "hasSideEffects = 0". bit hasSideEffects = ?; bit neverHasSideEffects = 0; diff --git a/include/llvm/Target/TargetFrameLowering.h b/include/llvm/Target/TargetFrameLowering.h index 1958f90..58bfcec 100644 --- a/include/llvm/Target/TargetFrameLowering.h +++ b/include/llvm/Target/TargetFrameLowering.h @@ -120,6 +120,10 @@ public: /// by adding a check even before the "normal" function prologue. virtual void adjustForSegmentedStacks(MachineFunction &MF) const { } + /// Adjust the prologue to add Erlang Run-Time System (ERTS) specific code in + /// the assembly prologue to explicitly handle the stack. 
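A consumer of the new CostTable.h, typically a target's cost model, declares a static table and probes it. A sketch with int standing in for TypeTy (real targets use MVT::SimpleValueType; the table entries here are illustrative):

    #include "llvm/Target/CostTable.h"

    static const llvm::CostTblEntry<int> MulTbl[] = {
      { /*ISD=*/49, /*Type=*/1, /*Cost=*/1 },
      { /*ISD=*/49, /*Type=*/2, /*Cost=*/4 },
    };

    unsigned getMulCost(int ISD, int Ty) {
      int Idx = llvm::CostTableLookup(MulTbl, sizeof(MulTbl) / sizeof(MulTbl[0]),
                                      ISD, Ty);
      return Idx < 0 ? 1 : MulTbl[Idx].Cost;  // fall back to a default cost
    }

(The HiPE prologue hook declaration follows.)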
+ virtual void adjustForHiPEPrologue(MachineFunction &MF) const { } + /// spillCalleeSavedRegisters - Issues instruction(s) to spill all callee /// saved registers and returns true if it isn't possible / profitable to do /// so by issuing a series of store instructions via @@ -192,6 +196,21 @@ public: /// virtual void processFunctionBeforeFrameFinalized(MachineFunction &MF) const { } + + /// eliminateCallFramePseudoInstr - This method is called during prolog/epilog + /// code insertion to eliminate call frame setup and destroy pseudo + /// instructions (but only if the Target is using them). It is responsible + /// for eliminating these instructions, replacing them with concrete + /// instructions. This method need only be implemented if using call frame + /// setup/destroy pseudo instructions. + /// + virtual void + eliminateCallFramePseudoInstr(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI) const { + llvm_unreachable("Call Frame Pseudo Instructions do not exist on this " + "target!"); + } }; } // End llvm namespace diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h index eee25c9..a28d63a 100644 --- a/include/llvm/Target/TargetLowering.h +++ b/include/llvm/Target/TargetLowering.h @@ -68,17 +68,12 @@ namespace llvm { }; } -//===----------------------------------------------------------------------===// -/// TargetLowering - This class defines information used to lower LLVM code to -/// legal SelectionDAG operators that the target instruction selector can accept -/// natively. -/// -/// This class also defines callbacks that targets must implement to lower -/// target-specific constructs to SelectionDAG operators. -/// -class TargetLowering { - TargetLowering(const TargetLowering&) LLVM_DELETED_FUNCTION; - void operator=(const TargetLowering&) LLVM_DELETED_FUNCTION; +/// TargetLoweringBase - This base class for TargetLowering contains the +/// SelectionDAG-independent parts that can be used from the rest of CodeGen. +class TargetLoweringBase { + TargetLoweringBase(const TargetLoweringBase&) LLVM_DELETED_FUNCTION; + void operator=(const TargetLoweringBase&) LLVM_DELETED_FUNCTION; + public: /// LegalizeAction - This enum indicates whether operations are valid for a /// target, and if not, what action should be used to make them valid. @@ -136,9 +131,9 @@ public: } /// NOTE: The constructor takes ownership of TLOF. - explicit TargetLowering(const TargetMachine &TM, - const TargetLoweringObjectFile *TLOF); - virtual ~TargetLowering(); + explicit TargetLoweringBase(const TargetMachine &TM, + const TargetLoweringObjectFile *TLOF); + virtual ~TargetLoweringBase(); const TargetMachine &getTargetMachine() const { return TM; } const DataLayout *getDataLayout() const { return TD; } @@ -188,7 +183,7 @@ public: /// isPredictableSelectExpensive - Return true if selects are only cheaper /// than branches if the branch is unlikely to be predicted right. bool isPredictableSelectExpensive() const { - return predictableSelectIsExpensive; + return PredictableSelectIsExpensive; } /// getSetCCResultType - Return the ValueType of the result of SETCC @@ -664,7 +659,7 @@ public: /// return the limit for functions that have OptSize attribute. /// @brief Get maximum # of store operations permitted for llvm.memset unsigned getMaxStoresPerMemset(bool OptSize) const { - return OptSize ? maxStoresPerMemsetOptSize : maxStoresPerMemset; + return OptSize ? 
MaxStoresPerMemsetOptSize : MaxStoresPerMemset; } /// This function returns the maximum number of store operations permitted @@ -673,7 +668,7 @@ public: /// return the limit for functions that have OptSize attribute. /// @brief Get maximum # of store operations permitted for llvm.memcpy unsigned getMaxStoresPerMemcpy(bool OptSize) const { - return OptSize ? maxStoresPerMemcpyOptSize : maxStoresPerMemcpy; + return OptSize ? MaxStoresPerMemcpyOptSize : MaxStoresPerMemcpy; } /// This function returns the maximum number of store operations permitted @@ -682,7 +677,7 @@ public: /// return the limit for functions that have OptSize attribute. /// @brief Get maximum # of store operations permitted for llvm.memmove unsigned getMaxStoresPerMemmove(bool OptSize) const { - return OptSize ? maxStoresPerMemmoveOptSize : maxStoresPerMemmove; + return OptSize ? MaxStoresPerMemmoveOptSize : MaxStoresPerMemmove; } /// This function returns true if the target allows unaligned memory accesses. @@ -701,7 +696,7 @@ public: /// optimization. /// @brief Determine if the target should perform code placement optimization. bool shouldOptimizeCodePlacement() const { - return benefitFromCodePlacementOpt; + return BenefitFromCodePlacementOpt; } /// getOptimalMemOpType - Returns the target specific optimal type for load @@ -829,55 +824,6 @@ public: return InsertFencesForAtomic; } - /// getPreIndexedAddressParts - returns true by value, base pointer and - /// offset pointer and addressing mode by reference if the node's address - /// can be legally represented as pre-indexed load / store address. - virtual bool getPreIndexedAddressParts(SDNode * /*N*/, SDValue &/*Base*/, - SDValue &/*Offset*/, - ISD::MemIndexedMode &/*AM*/, - SelectionDAG &/*DAG*/) const { - return false; - } - - /// getPostIndexedAddressParts - returns true by value, base pointer and - /// offset pointer and addressing mode by reference if this node can be - /// combined with a load / store to form a post-indexed load / store. - virtual bool getPostIndexedAddressParts(SDNode * /*N*/, SDNode * /*Op*/, - SDValue &/*Base*/, SDValue &/*Offset*/, - ISD::MemIndexedMode &/*AM*/, - SelectionDAG &/*DAG*/) const { - return false; - } - - /// getJumpTableEncoding - Return the entry encoding for a jump table in the - /// current function. The returned value is a member of the - /// MachineJumpTableInfo::JTEntryKind enum. - virtual unsigned getJumpTableEncoding() const; - - virtual const MCExpr * - LowerCustomJumpTableEntry(const MachineJumpTableInfo * /*MJTI*/, - const MachineBasicBlock * /*MBB*/, unsigned /*uid*/, - MCContext &/*Ctx*/) const { - llvm_unreachable("Need to implement this hook if target has custom JTIs"); - } - - /// getPICJumpTableRelocaBase - Returns relocation base for the given PIC - /// jumptable. - virtual SDValue getPICJumpTableRelocBase(SDValue Table, - SelectionDAG &DAG) const; - - /// getPICJumpTableRelocBaseExpr - This returns the relocation base for the - /// given PIC jumptable, the same as getPICJumpTableRelocBase, but as an - /// MCExpr. - virtual const MCExpr * - getPICJumpTableRelocBaseExpr(const MachineFunction *MF, - unsigned JTI, MCContext &Ctx) const; - - /// isOffsetFoldingLegal - Return true if folding a constant offset - /// with the given GlobalAddress is legal. It is frequently not legal in - /// PIC relocation models. 
- virtual bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const; - /// getStackCookieLocation - Return true if the target stores stack /// protector cookies at a fixed offset in some non-standard address /// space, and populates the address space and offset as @@ -906,152 +852,6 @@ public: /// @} //===--------------------------------------------------------------------===// - // TargetLowering Optimization Methods - // - - /// TargetLoweringOpt - A convenience struct that encapsulates a DAG, and two - /// SDValues for returning information from TargetLowering to its clients - /// that want to combine - struct TargetLoweringOpt { - SelectionDAG &DAG; - bool LegalTys; - bool LegalOps; - SDValue Old; - SDValue New; - - explicit TargetLoweringOpt(SelectionDAG &InDAG, - bool LT, bool LO) : - DAG(InDAG), LegalTys(LT), LegalOps(LO) {} - - bool LegalTypes() const { return LegalTys; } - bool LegalOperations() const { return LegalOps; } - - bool CombineTo(SDValue O, SDValue N) { - Old = O; - New = N; - return true; - } - - /// ShrinkDemandedConstant - Check to see if the specified operand of the - /// specified instruction is a constant integer. If so, check to see if - /// there are any bits set in the constant that are not demanded. If so, - /// shrink the constant and return true. - bool ShrinkDemandedConstant(SDValue Op, const APInt &Demanded); - - /// ShrinkDemandedOp - Convert x+y to (VT)((SmallVT)x+(SmallVT)y) if the - /// casts are free. This uses isZExtFree and ZERO_EXTEND for the widening - /// cast, but it could be generalized for targets with other types of - /// implicit widening casts. - bool ShrinkDemandedOp(SDValue Op, unsigned BitWidth, const APInt &Demanded, - DebugLoc dl); - }; - - /// SimplifyDemandedBits - Look at Op. At this point, we know that only the - /// DemandedMask bits of the result of Op are ever used downstream. If we can - /// use this information to simplify Op, create a new simplified DAG node and - /// return true, returning the original and new nodes in Old and New. - /// Otherwise, analyze the expression and return a mask of KnownOne and - /// KnownZero bits for the expression (used to simplify the caller). - /// The KnownZero/One bits may only be accurate for those bits in the - /// DemandedMask. - bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedMask, - APInt &KnownZero, APInt &KnownOne, - TargetLoweringOpt &TLO, unsigned Depth = 0) const; - - /// computeMaskedBitsForTargetNode - Determine which of the bits specified in - /// Mask are known to be either zero or one and return them in the - /// KnownZero/KnownOne bitsets. - virtual void computeMaskedBitsForTargetNode(const SDValue Op, - APInt &KnownZero, - APInt &KnownOne, - const SelectionDAG &DAG, - unsigned Depth = 0) const; - - /// ComputeNumSignBitsForTargetNode - This method can be implemented by - /// targets that want to expose additional information about sign bits to the - /// DAG Combiner. - virtual unsigned ComputeNumSignBitsForTargetNode(SDValue Op, - unsigned Depth = 0) const; - - struct DAGCombinerInfo { - void *DC; // The DAG Combiner object. 
- CombineLevel Level; - bool CalledByLegalizer; - public: - SelectionDAG &DAG; - - DAGCombinerInfo(SelectionDAG &dag, CombineLevel level, bool cl, void *dc) - : DC(dc), Level(level), CalledByLegalizer(cl), DAG(dag) {} - - bool isBeforeLegalize() const { return Level == BeforeLegalizeTypes; } - bool isBeforeLegalizeOps() const { return Level < AfterLegalizeVectorOps; } - bool isAfterLegalizeVectorOps() const { - return Level == AfterLegalizeDAG; - } - CombineLevel getDAGCombineLevel() { return Level; } - bool isCalledByLegalizer() const { return CalledByLegalizer; } - - void AddToWorklist(SDNode *N); - void RemoveFromWorklist(SDNode *N); - SDValue CombineTo(SDNode *N, const std::vector<SDValue> &To, - bool AddTo = true); - SDValue CombineTo(SDNode *N, SDValue Res, bool AddTo = true); - SDValue CombineTo(SDNode *N, SDValue Res0, SDValue Res1, bool AddTo = true); - - void CommitTargetLoweringOpt(const TargetLoweringOpt &TLO); - }; - - /// SimplifySetCC - Try to simplify a setcc built with the specified operands - /// and cc. If it is unable to simplify it, return a null SDValue. - SDValue SimplifySetCC(EVT VT, SDValue N0, SDValue N1, - ISD::CondCode Cond, bool foldBooleans, - DAGCombinerInfo &DCI, DebugLoc dl) const; - - /// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the - /// node is a GlobalAddress + offset. - virtual bool - isGAPlusOffset(SDNode *N, const GlobalValue* &GA, int64_t &Offset) const; - - /// PerformDAGCombine - This method will be invoked for all target nodes and - /// for any target-independent nodes that the target has registered with - /// invoke it for. - /// - /// The semantics are as follows: - /// Return Value: - /// SDValue.Val == 0 - No change was made - /// SDValue.Val == N - N was replaced, is dead, and is already handled. - /// otherwise - N should be replaced by the returned Operand. - /// - /// In addition, methods provided by DAGCombinerInfo may be used to perform - /// more complex transformations. - /// - virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const; - - /// isTypeDesirableForOp - Return true if the target has native support for - /// the specified value type and it is 'desirable' to use the type for the - /// given node type. e.g. On x86 i16 is legal, but undesirable since i16 - /// instruction encodings are longer and some i16 instructions are slow. - virtual bool isTypeDesirableForOp(unsigned /*Opc*/, EVT VT) const { - // By default, assume all legal types are desirable. - return isTypeLegal(VT); - } - - /// isDesirableToPromoteOp - Return true if it is profitable for dag combiner - /// to transform a floating point op of specified opcode to a equivalent op of - /// an integer type. e.g. f32 load -> i32 load can be profitable on ARM. - virtual bool isDesirableToTransformToIntegerOp(unsigned /*Opc*/, - EVT /*VT*/) const { - return false; - } - - /// IsDesirableToPromoteOp - This method query the target whether it is - /// beneficial for dag combiner to promote the specified node. If true, it - /// should return the desired promotion type by reference. - virtual bool IsDesirableToPromoteOp(SDValue /*Op*/, EVT &/*PVT*/) const { - return false; - } - - //===--------------------------------------------------------------------===// // TargetLowering Configuration Methods - These methods should be invoked by // the derived class constructor to configure this object for the target. 
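The configuration methods referred to here are the long-standing setOperationAction family; the usual shape of a backend constructor is sketched below (MyTargetLowering, MyTargetMachine, and the register class are illustrative names, not from this patch):

    // Sketch: a target constructor driving the configuration methods.
    MyTargetLowering::MyTargetLowering(MyTargetMachine &TM)
        : TargetLowering(TM, new TargetLoweringObjectFileELF()) {
      addRegisterClass(MVT::i32, &MyTarget::GPRRegClass); // hypothetical class
      setOperationAction(ISD::SDIV, MVT::i32, Expand);    // no native divide
      setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
      computeRegisterProperties();
    }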
// @@ -1302,387 +1102,6 @@ protected: public: //===--------------------------------------------------------------------===// - // Lowering methods - These methods must be implemented by targets so that - // the SelectionDAGBuilder code knows how to lower these. - // - - /// LowerFormalArguments - This hook must be implemented to lower the - /// incoming (formal) arguments, described by the Ins array, into the - /// specified DAG. The implementation should fill in the InVals array - /// with legal-type argument values, and return the resulting token - /// chain value. - /// - virtual SDValue - LowerFormalArguments(SDValue /*Chain*/, CallingConv::ID /*CallConv*/, - bool /*isVarArg*/, - const SmallVectorImpl<ISD::InputArg> &/*Ins*/, - DebugLoc /*dl*/, SelectionDAG &/*DAG*/, - SmallVectorImpl<SDValue> &/*InVals*/) const { - llvm_unreachable("Not Implemented"); - } - - struct ArgListEntry { - SDValue Node; - Type* Ty; - bool isSExt : 1; - bool isZExt : 1; - bool isInReg : 1; - bool isSRet : 1; - bool isNest : 1; - bool isByVal : 1; - uint16_t Alignment; - - ArgListEntry() : isSExt(false), isZExt(false), isInReg(false), - isSRet(false), isNest(false), isByVal(false), Alignment(0) { } - }; - typedef std::vector<ArgListEntry> ArgListTy; - - /// CallLoweringInfo - This structure contains all information that is - /// necessary for lowering calls. It is passed to TLI::LowerCallTo when the - /// SelectionDAG builder needs to lower a call, and targets will see this - /// struct in their LowerCall implementation. - struct CallLoweringInfo { - SDValue Chain; - Type *RetTy; - bool RetSExt : 1; - bool RetZExt : 1; - bool IsVarArg : 1; - bool IsInReg : 1; - bool DoesNotReturn : 1; - bool IsReturnValueUsed : 1; - - // IsTailCall should be modified by implementations of - // TargetLowering::LowerCall that perform tail call conversions. - bool IsTailCall; - - unsigned NumFixedArgs; - CallingConv::ID CallConv; - SDValue Callee; - ArgListTy &Args; - SelectionDAG &DAG; - DebugLoc DL; - ImmutableCallSite *CS; - SmallVector<ISD::OutputArg, 32> Outs; - SmallVector<SDValue, 32> OutVals; - SmallVector<ISD::InputArg, 32> Ins; - - - /// CallLoweringInfo - Constructs a call lowering context based on the - /// ImmutableCallSite \p cs. - CallLoweringInfo(SDValue chain, Type *retTy, - FunctionType *FTy, bool isTailCall, SDValue callee, - ArgListTy &args, SelectionDAG &dag, DebugLoc dl, - ImmutableCallSite &cs) - : Chain(chain), RetTy(retTy), RetSExt(cs.paramHasAttr(0, Attribute::SExt)), - RetZExt(cs.paramHasAttr(0, Attribute::ZExt)), IsVarArg(FTy->isVarArg()), - IsInReg(cs.paramHasAttr(0, Attribute::InReg)), - DoesNotReturn(cs.doesNotReturn()), - IsReturnValueUsed(!cs.getInstruction()->use_empty()), - IsTailCall(isTailCall), NumFixedArgs(FTy->getNumParams()), - CallConv(cs.getCallingConv()), Callee(callee), Args(args), DAG(dag), - DL(dl), CS(&cs) {} - - /// CallLoweringInfo - Constructs a call lowering context based on the - /// provided call information. 
- CallLoweringInfo(SDValue chain, Type *retTy, bool retSExt, bool retZExt, - bool isVarArg, bool isInReg, unsigned numFixedArgs, - CallingConv::ID callConv, bool isTailCall, - bool doesNotReturn, bool isReturnValueUsed, SDValue callee, - ArgListTy &args, SelectionDAG &dag, DebugLoc dl) - : Chain(chain), RetTy(retTy), RetSExt(retSExt), RetZExt(retZExt), - IsVarArg(isVarArg), IsInReg(isInReg), DoesNotReturn(doesNotReturn), - IsReturnValueUsed(isReturnValueUsed), IsTailCall(isTailCall), - NumFixedArgs(numFixedArgs), CallConv(callConv), Callee(callee), - Args(args), DAG(dag), DL(dl), CS(NULL) {} - }; - - /// LowerCallTo - This function lowers an abstract call to a function into an - /// actual call. This returns a pair of operands. The first element is the - /// return value for the function (if RetTy is not VoidTy). The second - /// element is the outgoing token chain. It calls LowerCall to do the actual - /// lowering. - std::pair<SDValue, SDValue> LowerCallTo(CallLoweringInfo &CLI) const; - - /// LowerCall - This hook must be implemented to lower calls into the - /// the specified DAG. The outgoing arguments to the call are described - /// by the Outs array, and the values to be returned by the call are - /// described by the Ins array. The implementation should fill in the - /// InVals array with legal-type return values from the call, and return - /// the resulting token chain value. - virtual SDValue - LowerCall(CallLoweringInfo &/*CLI*/, - SmallVectorImpl<SDValue> &/*InVals*/) const { - llvm_unreachable("Not Implemented"); - } - - /// HandleByVal - Target-specific cleanup for formal ByVal parameters. - virtual void HandleByVal(CCState *, unsigned &, unsigned) const {} - - /// CanLowerReturn - This hook should be implemented to check whether the - /// return values described by the Outs array can fit into the return - /// registers. If false is returned, an sret-demotion is performed. - /// - virtual bool CanLowerReturn(CallingConv::ID /*CallConv*/, - MachineFunction &/*MF*/, bool /*isVarArg*/, - const SmallVectorImpl<ISD::OutputArg> &/*Outs*/, - LLVMContext &/*Context*/) const - { - // Return true by default to get preexisting behavior. - return true; - } - - /// LowerReturn - This hook must be implemented to lower outgoing - /// return values, described by the Outs array, into the specified - /// DAG. The implementation should return the resulting token chain - /// value. - /// - virtual SDValue - LowerReturn(SDValue /*Chain*/, CallingConv::ID /*CallConv*/, - bool /*isVarArg*/, - const SmallVectorImpl<ISD::OutputArg> &/*Outs*/, - const SmallVectorImpl<SDValue> &/*OutVals*/, - DebugLoc /*dl*/, SelectionDAG &/*DAG*/) const { - llvm_unreachable("Not Implemented"); - } - - /// isUsedByReturnOnly - Return true if result of the specified node is used - /// by a return node only. It also compute and return the input chain for the - /// tail call. - /// This is used to determine whether it is possible - /// to codegen a libcall as tail call at legalization time. - virtual bool isUsedByReturnOnly(SDNode *, SDValue &Chain) const { - return false; - } - - /// mayBeEmittedAsTailCall - Return true if the target may be able emit the - /// call instruction as a tail call. This is used by optimization passes to - /// determine if it's profitable to duplicate return instructions to enable - /// tailcall optimization. 
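A typical target override of the hook documented above looks like the following (sketch only; real targets layer calling-convention and ABI checks on top, as X86 does):

    // Sketch: a call is a tail-call candidate only if its block falls
    // straight through to a return.
    virtual bool mayBeEmittedAsTailCall(CallInst *CI) const {
      return isa<ReturnInst>(CI->getParent()->getTerminator());
    }

(The default implementation, which conservatively returns false, follows.)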
- virtual bool mayBeEmittedAsTailCall(CallInst *) const { - return false; - } - - /// getTypeForExtArgOrReturn - Return the type that should be used to zero or - /// sign extend a zeroext/signext integer argument or return value. - /// FIXME: Most C calling convention requires the return type to be promoted, - /// but this is not true all the time, e.g. i1 on x86-64. It is also not - /// necessary for non-C calling conventions. The frontend should handle this - /// and include all of the necessary information. - virtual MVT getTypeForExtArgOrReturn(MVT VT, - ISD::NodeType /*ExtendKind*/) const { - MVT MinVT = getRegisterType(MVT::i32); - return VT.bitsLT(MinVT) ? MinVT : VT; - } - - /// LowerOperationWrapper - This callback is invoked by the type legalizer - /// to legalize nodes with an illegal operand type but legal result types. - /// It replaces the LowerOperation callback in the type Legalizer. - /// The reason we can not do away with LowerOperation entirely is that - /// LegalizeDAG isn't yet ready to use this callback. - /// TODO: Consider merging with ReplaceNodeResults. - - /// The target places new result values for the node in Results (their number - /// and types must exactly match those of the original return values of - /// the node), or leaves Results empty, which indicates that the node is not - /// to be custom lowered after all. - /// The default implementation calls LowerOperation. - virtual void LowerOperationWrapper(SDNode *N, - SmallVectorImpl<SDValue> &Results, - SelectionDAG &DAG) const; - - /// LowerOperation - This callback is invoked for operations that are - /// unsupported by the target, which are registered to use 'custom' lowering, - /// and whose defined values are all legal. - /// If the target has no operations that require custom lowering, it need not - /// implement this. The default implementation of this aborts. - virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const; - - /// ReplaceNodeResults - This callback is invoked when a node result type is - /// illegal for the target, and the operation was registered to use 'custom' - /// lowering for that result type. The target places new result values for - /// the node in Results (their number and types must exactly match those of - /// the original return values of the node), or leaves Results empty, which - /// indicates that the node is not to be custom lowered after all. - /// - /// If the target has no operations that require custom lowering, it need not - /// implement this. The default implementation aborts. - virtual void ReplaceNodeResults(SDNode * /*N*/, - SmallVectorImpl<SDValue> &/*Results*/, - SelectionDAG &/*DAG*/) const { - llvm_unreachable("ReplaceNodeResults not implemented for this target!"); - } - - /// getTargetNodeName() - This method returns the name of a target specific - /// DAG node. - virtual const char *getTargetNodeName(unsigned Opcode) const; - - /// createFastISel - This method returns a target specific FastISel object, - /// or null if the target does not support "fast" ISel. - virtual FastISel *createFastISel(FunctionLoweringInfo &, - const TargetLibraryInfo *) const { - return 0; - } - - //===--------------------------------------------------------------------===// - // Inline Asm Support hooks - // - - /// ExpandInlineAsm - This hook allows the target to expand an inline asm - /// call to be explicit llvm code if it wants to. 
This is useful for - /// turning simple inline asms into LLVM intrinsics, which gives the - /// compiler more information about the behavior of the code. - virtual bool ExpandInlineAsm(CallInst *) const { - return false; - } - - enum ConstraintType { - C_Register, // Constraint represents specific register(s). - C_RegisterClass, // Constraint represents any of register(s) in class. - C_Memory, // Memory constraint. - C_Other, // Something else. - C_Unknown // Unsupported constraint. - }; - - enum ConstraintWeight { - // Generic weights. - CW_Invalid = -1, // No match. - CW_Okay = 0, // Acceptable. - CW_Good = 1, // Good weight. - CW_Better = 2, // Better weight. - CW_Best = 3, // Best weight. - - // Well-known weights. - CW_SpecificReg = CW_Okay, // Specific register operands. - CW_Register = CW_Good, // Register operands. - CW_Memory = CW_Better, // Memory operands. - CW_Constant = CW_Best, // Constant operand. - CW_Default = CW_Okay // Default or don't know type. - }; - - /// AsmOperandInfo - This contains information for each constraint that we are - /// lowering. - struct AsmOperandInfo : public InlineAsm::ConstraintInfo { - /// ConstraintCode - This contains the actual string for the code, like "m". - /// TargetLowering picks the 'best' code from ConstraintInfo::Codes that - /// most closely matches the operand. - std::string ConstraintCode; - - /// ConstraintType - Information about the constraint code, e.g. Register, - /// RegisterClass, Memory, Other, Unknown. - TargetLowering::ConstraintType ConstraintType; - - /// CallOperandval - If this is the result output operand or a - /// clobber, this is null, otherwise it is the incoming operand to the - /// CallInst. This gets modified as the asm is processed. - Value *CallOperandVal; - - /// ConstraintVT - The ValueType for the operand value. - MVT ConstraintVT; - - /// isMatchingInputConstraint - Return true of this is an input operand that - /// is a matching constraint like "4". - bool isMatchingInputConstraint() const; - - /// getMatchedOperand - If this is an input matching constraint, this method - /// returns the output operand it matches. - unsigned getMatchedOperand() const; - - /// Copy constructor for copying from an AsmOperandInfo. - AsmOperandInfo(const AsmOperandInfo &info) - : InlineAsm::ConstraintInfo(info), - ConstraintCode(info.ConstraintCode), - ConstraintType(info.ConstraintType), - CallOperandVal(info.CallOperandVal), - ConstraintVT(info.ConstraintVT) { - } - - /// Copy constructor for copying from a ConstraintInfo. - AsmOperandInfo(const InlineAsm::ConstraintInfo &info) - : InlineAsm::ConstraintInfo(info), - ConstraintType(TargetLowering::C_Unknown), - CallOperandVal(0), ConstraintVT(MVT::Other) { - } - }; - - typedef std::vector<AsmOperandInfo> AsmOperandInfoVector; - - /// ParseConstraints - Split up the constraint string from the inline - /// assembly value into the specific constraints and their prefixes, - /// and also tie in the associated operand values. - /// If this returns an empty vector, and if the constraint string itself - /// isn't empty, there was an error parsing. - virtual AsmOperandInfoVector ParseConstraints(ImmutableCallSite CS) const; - - /// Examine constraint type and operand type and determine a weight value. - /// The operand object must already have been set up with the operand type. - virtual ConstraintWeight getMultipleConstraintMatchWeight( - AsmOperandInfo &info, int maIndex) const; - - /// Examine constraint string and operand type and determine a weight value. 
- /// The operand object must already have been set up with the operand type. - virtual ConstraintWeight getSingleConstraintMatchWeight( - AsmOperandInfo &info, const char *constraint) const; - - /// ComputeConstraintToUse - Determines the constraint code and constraint - /// type to use for the specific AsmOperandInfo, setting - /// OpInfo.ConstraintCode and OpInfo.ConstraintType. If the actual operand - /// being passed in is available, it can be passed in as Op, otherwise an - /// empty SDValue can be passed. - virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, - SDValue Op, - SelectionDAG *DAG = 0) const; - - /// getConstraintType - Given a constraint, return the type of constraint it - /// is for this target. - virtual ConstraintType getConstraintType(const std::string &Constraint) const; - - /// getRegForInlineAsmConstraint - Given a physical register constraint (e.g. - /// {edx}), return the register number and the register class for the - /// register. - /// - /// Given a register class constraint, like 'r', if this corresponds directly - /// to an LLVM register class, return a register of 0 and the register class - /// pointer. - /// - /// This should only be used for C_Register constraints. On error, - /// this returns a register number of 0 and a null register class pointer.. - virtual std::pair<unsigned, const TargetRegisterClass*> - getRegForInlineAsmConstraint(const std::string &Constraint, - EVT VT) const; - - /// LowerXConstraint - try to replace an X constraint, which matches anything, - /// with another that has more specific requirements based on the type of the - /// corresponding operand. This returns null if there is no replacement to - /// make. - virtual const char *LowerXConstraint(EVT ConstraintVT) const; - - /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops - /// vector. If it is invalid, don't add anything to Ops. - virtual void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint, - std::vector<SDValue> &Ops, - SelectionDAG &DAG) const; - - //===--------------------------------------------------------------------===// - // Instruction Emitting Hooks - // - - // EmitInstrWithCustomInserter - This method should be implemented by targets - // that mark instructions with the 'usesCustomInserter' flag. These - // instructions are special in various ways, which require special support to - // insert. The specified MachineInstr is created but not inserted into any - // basic blocks, and this method is called to expand it into a sequence of - // instructions, potentially also creating new basic blocks and control flow. - virtual MachineBasicBlock * - EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *MBB) const; - - /// AdjustInstrPostInstrSelection - This method should be implemented by - /// targets that mark instructions with the 'hasPostISelHook' flag. These - /// instructions must be adjusted after instruction selection by target hooks. - /// e.g. To fill in optional defs for ARM 's' setting instructions. - virtual void - AdjustInstrPostInstrSelection(MachineInstr *MI, SDNode *Node) const; - - //===--------------------------------------------------------------------===// // Addressing mode description hooks (used by LSR etc). 
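
As a concrete counterpart to the inline-asm hooks documented above, here is a hedged sketch of the three most commonly overridden ones, assuming a hypothetical backend with an invented GPR32 register class and an 8-bit 'I' immediate constraint; the fallbacks defer to the base TargetLowering implementations.

TargetLowering::ConstraintType
MyTargetLowering::getConstraintType(const std::string &Constraint) const {
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    default: break;
    case 'r': return C_RegisterClass; // any general-purpose register
    case 'I': return C_Other;         // (hypothetical) 8-bit immediate
    }
  }
  return TargetLowering::getConstraintType(Constraint);
}

std::pair<unsigned, const TargetRegisterClass*>
MyTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
                                               EVT VT) const {
  if (Constraint == "r" && VT == MVT::i32)
    // Register number 0 means "any register in this class", per the
    // contract described above.
    return std::make_pair(0U, &MyTarget::GPR32RegClass);
  // Fall back to generic handling of explicit {physreg} constraints.
  return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
}

void MyTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
                                                    std::string &Constraint,
                                                    std::vector<SDValue> &Ops,
                                                    SelectionDAG &DAG) const {
  if (Constraint.length() == 1 && Constraint[0] == 'I') {
    // Accept only constants that fit the (hypothetical) 8-bit immediate
    // field; leaving Ops empty reports the operand as invalid.
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op))
      if (C->getZExtValue() <= 255)
        Ops.push_back(DAG.getTargetConstant(C->getZExtValue(),
                                            Op.getValueType()));
    return;
  }
  TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}
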
// @@ -1798,17 +1217,6 @@ public: } //===--------------------------------------------------------------------===// - // Div utility functions - // - SDValue BuildExactSDIV(SDValue Op1, SDValue Op2, DebugLoc dl, - SelectionDAG &DAG) const; - SDValue BuildSDIV(SDNode *N, SelectionDAG &DAG, bool IsAfterLegalization, - std::vector<SDNode*> *Created) const; - SDValue BuildUDIV(SDNode *N, SelectionDAG &DAG, bool IsAfterLegalization, - std::vector<SDNode*> *Created) const; - - - //===--------------------------------------------------------------------===// // Runtime Library hooks // @@ -2190,11 +1598,11 @@ protected: /// with 16-bit alignment would result in four 2-byte stores and one 1-byte /// store. This only applies to setting a constant array of a constant size. /// @brief Specify maximum number of store instructions per memset call. - unsigned maxStoresPerMemset; + unsigned MaxStoresPerMemset; /// Maximum number of stores operations that may be substituted for the call /// to memset, used for functions with OptSize attribute. - unsigned maxStoresPerMemsetOptSize; + unsigned MaxStoresPerMemsetOptSize; /// When lowering \@llvm.memcpy this field specifies the maximum number of /// store operations that may be substituted for a call to memcpy. Targets @@ -2206,11 +1614,11 @@ protected: /// and one 1-byte store. This only applies to copying a constant array of /// constant size. /// @brief Specify maximum bytes of store instructions per memcpy call. - unsigned maxStoresPerMemcpy; + unsigned MaxStoresPerMemcpy; /// Maximum number of store operations that may be substituted for a call /// to memcpy, used for functions with OptSize attribute. - unsigned maxStoresPerMemcpyOptSize; + unsigned MaxStoresPerMemcpyOptSize; /// When lowering \@llvm.memmove this field specifies the maximum number of /// store instructions that may be substituted for a call to memmove. Targets @@ -2221,26 +1629,641 @@ protected: /// with 8-bit alignment would result in nine 1-byte stores. This only /// applies to copying a constant array of constant size. /// @brief Specify maximum bytes of store instructions per memmove call. - unsigned maxStoresPerMemmove; + unsigned MaxStoresPerMemmove; /// Maximum number of store instructions that may be substituted for a call /// to memmove, used for functions with OpSize attribute. - unsigned maxStoresPerMemmoveOptSize; + unsigned MaxStoresPerMemmoveOptSize; /// This field specifies whether the target can benefit from code placement /// optimization. - bool benefitFromCodePlacementOpt; + bool BenefitFromCodePlacementOpt; - /// predictableSelectIsExpensive - Tells the code generator that select is + /// PredictableSelectIsExpensive - Tells the code generator that select is /// more expensive than a branch if the branch is usually predicted right. - bool predictableSelectIsExpensive; + bool PredictableSelectIsExpensive; -private: +protected: /// isLegalRC - Return true if the value types that can be represented by the /// specified register class are all legal. bool isLegalRC(const TargetRegisterClass *RC) const; }; +//===----------------------------------------------------------------------===// +/// TargetLowering - This class defines information used to lower LLVM code to +/// legal SelectionDAG operators that the target instruction selector can accept +/// natively. +/// +/// This class also defines callbacks that targets must implement to lower +/// target-specific constructs to SelectionDAG operators. 
+///
+class TargetLowering : public TargetLoweringBase {
+  TargetLowering(const TargetLowering&) LLVM_DELETED_FUNCTION;
+  void operator=(const TargetLowering&) LLVM_DELETED_FUNCTION;
+
+public:
+  /// NOTE: The constructor takes ownership of TLOF.
+  explicit TargetLowering(const TargetMachine &TM,
+                          const TargetLoweringObjectFile *TLOF);
+
+  /// getPreIndexedAddressParts - returns true by value, base pointer and
+  /// offset pointer and addressing mode by reference if the node's address
+  /// can be legally represented as pre-indexed load / store address.
+  virtual bool getPreIndexedAddressParts(SDNode * /*N*/, SDValue &/*Base*/,
+                                         SDValue &/*Offset*/,
+                                         ISD::MemIndexedMode &/*AM*/,
+                                         SelectionDAG &/*DAG*/) const {
+    return false;
+  }
+
+  /// getPostIndexedAddressParts - returns true by value, base pointer and
+  /// offset pointer and addressing mode by reference if this node can be
+  /// combined with a load / store to form a post-indexed load / store.
+  virtual bool getPostIndexedAddressParts(SDNode * /*N*/, SDNode * /*Op*/,
+                                          SDValue &/*Base*/, SDValue &/*Offset*/,
+                                          ISD::MemIndexedMode &/*AM*/,
+                                          SelectionDAG &/*DAG*/) const {
+    return false;
+  }
+
+  /// getJumpTableEncoding - Return the entry encoding for a jump table in the
+  /// current function. The returned value is a member of the
+  /// MachineJumpTableInfo::JTEntryKind enum.
+  virtual unsigned getJumpTableEncoding() const;
+
+  virtual const MCExpr *
+  LowerCustomJumpTableEntry(const MachineJumpTableInfo * /*MJTI*/,
+                            const MachineBasicBlock * /*MBB*/, unsigned /*uid*/,
+                            MCContext &/*Ctx*/) const {
+    llvm_unreachable("Need to implement this hook if target has custom JTIs");
+  }
+
+  /// getPICJumpTableRelocBase - Returns relocation base for the given PIC
+  /// jumptable.
+  virtual SDValue getPICJumpTableRelocBase(SDValue Table,
+                                           SelectionDAG &DAG) const;
+
+  /// getPICJumpTableRelocBaseExpr - This returns the relocation base for the
+  /// given PIC jumptable, the same as getPICJumpTableRelocBase, but as an
+  /// MCExpr.
+  virtual const MCExpr *
+  getPICJumpTableRelocBaseExpr(const MachineFunction *MF,
+                               unsigned JTI, MCContext &Ctx) const;
+
+  /// isOffsetFoldingLegal - Return true if folding a constant offset
+  /// with the given GlobalAddress is legal. It is frequently not legal in
+  /// PIC relocation models.
+ virtual bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const; + + bool isInTailCallPosition(SelectionDAG &DAG, SDNode *Node, + SDValue &Chain) const; + + void softenSetCCOperands(SelectionDAG &DAG, EVT VT, + SDValue &NewLHS, SDValue &NewRHS, + ISD::CondCode &CCCode, DebugLoc DL) const; + + SDValue makeLibCall(SelectionDAG &DAG, RTLIB::Libcall LC, EVT RetVT, + const SDValue *Ops, unsigned NumOps, + bool isSigned, DebugLoc dl) const; + + //===--------------------------------------------------------------------===// + // TargetLowering Optimization Methods + // + + /// TargetLoweringOpt - A convenience struct that encapsulates a DAG, and two + /// SDValues for returning information from TargetLowering to its clients + /// that want to combine + struct TargetLoweringOpt { + SelectionDAG &DAG; + bool LegalTys; + bool LegalOps; + SDValue Old; + SDValue New; + + explicit TargetLoweringOpt(SelectionDAG &InDAG, + bool LT, bool LO) : + DAG(InDAG), LegalTys(LT), LegalOps(LO) {} + + bool LegalTypes() const { return LegalTys; } + bool LegalOperations() const { return LegalOps; } + + bool CombineTo(SDValue O, SDValue N) { + Old = O; + New = N; + return true; + } + + /// ShrinkDemandedConstant - Check to see if the specified operand of the + /// specified instruction is a constant integer. If so, check to see if + /// there are any bits set in the constant that are not demanded. If so, + /// shrink the constant and return true. + bool ShrinkDemandedConstant(SDValue Op, const APInt &Demanded); + + /// ShrinkDemandedOp - Convert x+y to (VT)((SmallVT)x+(SmallVT)y) if the + /// casts are free. This uses isZExtFree and ZERO_EXTEND for the widening + /// cast, but it could be generalized for targets with other types of + /// implicit widening casts. + bool ShrinkDemandedOp(SDValue Op, unsigned BitWidth, const APInt &Demanded, + DebugLoc dl); + }; + + /// SimplifyDemandedBits - Look at Op. At this point, we know that only the + /// DemandedMask bits of the result of Op are ever used downstream. If we can + /// use this information to simplify Op, create a new simplified DAG node and + /// return true, returning the original and new nodes in Old and New. + /// Otherwise, analyze the expression and return a mask of KnownOne and + /// KnownZero bits for the expression (used to simplify the caller). + /// The KnownZero/One bits may only be accurate for those bits in the + /// DemandedMask. + bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedMask, + APInt &KnownZero, APInt &KnownOne, + TargetLoweringOpt &TLO, unsigned Depth = 0) const; + + /// computeMaskedBitsForTargetNode - Determine which of the bits specified in + /// Mask are known to be either zero or one and return them in the + /// KnownZero/KnownOne bitsets. + virtual void computeMaskedBitsForTargetNode(const SDValue Op, + APInt &KnownZero, + APInt &KnownOne, + const SelectionDAG &DAG, + unsigned Depth = 0) const; + + /// ComputeNumSignBitsForTargetNode - This method can be implemented by + /// targets that want to expose additional information about sign bits to the + /// DAG Combiner. + virtual unsigned ComputeNumSignBitsForTargetNode(SDValue Op, + unsigned Depth = 0) const; + + struct DAGCombinerInfo { + void *DC; // The DAG Combiner object. 
+    CombineLevel Level;
+    bool CalledByLegalizer;
+  public:
+    SelectionDAG &DAG;
+
+    DAGCombinerInfo(SelectionDAG &dag, CombineLevel level, bool cl, void *dc)
+      : DC(dc), Level(level), CalledByLegalizer(cl), DAG(dag) {}
+
+    bool isBeforeLegalize() const { return Level == BeforeLegalizeTypes; }
+    bool isBeforeLegalizeOps() const { return Level < AfterLegalizeVectorOps; }
+    bool isAfterLegalizeVectorOps() const {
+      return Level == AfterLegalizeDAG;
+    }
+    CombineLevel getDAGCombineLevel() { return Level; }
+    bool isCalledByLegalizer() const { return CalledByLegalizer; }
+
+    void AddToWorklist(SDNode *N);
+    void RemoveFromWorklist(SDNode *N);
+    SDValue CombineTo(SDNode *N, const std::vector<SDValue> &To,
+                      bool AddTo = true);
+    SDValue CombineTo(SDNode *N, SDValue Res, bool AddTo = true);
+    SDValue CombineTo(SDNode *N, SDValue Res0, SDValue Res1, bool AddTo = true);
+
+    void CommitTargetLoweringOpt(const TargetLoweringOpt &TLO);
+  };
+
+  /// SimplifySetCC - Try to simplify a setcc built with the specified operands
+  /// and cc. If it is unable to simplify it, return a null SDValue.
+  SDValue SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
+                        ISD::CondCode Cond, bool foldBooleans,
+                        DAGCombinerInfo &DCI, DebugLoc dl) const;
+
+  /// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the
+  /// node is a GlobalAddress + offset.
+  virtual bool
+  isGAPlusOffset(SDNode *N, const GlobalValue* &GA, int64_t &Offset) const;
+
+  /// PerformDAGCombine - This method will be invoked for all target nodes and
+  /// for any target-independent nodes that the target has registered with
+  /// invoke it for.
+  ///
+  /// The semantics are as follows:
+  /// Return Value:
+  ///   SDValue.Val == 0 - No change was made
+  ///   SDValue.Val == N - N was replaced, is dead, and is already handled.
+  ///   otherwise        - N should be replaced by the returned Operand.
+  ///
+  /// In addition, methods provided by DAGCombinerInfo may be used to perform
+  /// more complex transformations.
+  ///
+  virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+
+  /// isTypeDesirableForOp - Return true if the target has native support for
+  /// the specified value type and it is 'desirable' to use the type for the
+  /// given node type. e.g. On x86 i16 is legal, but undesirable since i16
+  /// instruction encodings are longer and some i16 instructions are slow.
+  virtual bool isTypeDesirableForOp(unsigned /*Opc*/, EVT VT) const {
+    // By default, assume all legal types are desirable.
+    return isTypeLegal(VT);
+  }
+
+  /// isDesirableToTransformToIntegerOp - Return true if it is profitable for
+  /// dag combiner to transform a floating point op of specified opcode to an
+  /// equivalent op of an integer type. e.g. f32 load -> i32 load can be
+  /// profitable on ARM.
+  virtual bool isDesirableToTransformToIntegerOp(unsigned /*Opc*/,
+                                                 EVT /*VT*/) const {
+    return false;
+  }
+
+  /// IsDesirableToPromoteOp - This method queries the target whether it is
+  /// beneficial for dag combiner to promote the specified node. If true, it
+  /// should return the desired promotion type by reference.
+  virtual bool IsDesirableToPromoteOp(SDValue /*Op*/, EVT &/*PVT*/) const {
+    return false;
+  }
+
+  //===--------------------------------------------------------------------===//
+  // Lowering methods - These methods must be implemented by targets so that
+  // the SelectionDAGBuilder code knows how to lower these.
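
A hedged sketch of how a target combine typically uses DAGCombinerInfo together with TargetLoweringOpt and SimplifyDemandedBits(); the premise that a node only observes the low 8 bits of its operand is invented, and a helper like this would be called from the target's PerformDAGCombine() override.

static SDValue performLow8Combine(SDNode *N,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const TargetLowering &TLI) {
  // Hypothetical premise: N only ever observes the low 8 bits of operand 0.
  SDValue Op = N->getOperand(0);
  APInt DemandedMask = APInt::getLowBitsSet(Op.getValueSizeInBits(), 8);

  APInt KnownZero, KnownOne;
  TargetLowering::TargetLoweringOpt TLO(DCI.DAG, !DCI.isBeforeLegalize(),
                                        !DCI.isBeforeLegalizeOps());
  if (TLI.SimplifyDemandedBits(Op, DemandedMask, KnownZero, KnownOne, TLO)) {
    // Commit the replacement recorded in TLO.Old/TLO.New to the DAG.
    DCI.CommitTargetLoweringOpt(TLO);
    return SDValue(N, 0); // "N was replaced and is already handled"
  }
  return SDValue(); // no change was made
}
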
+ // + + /// LowerFormalArguments - This hook must be implemented to lower the + /// incoming (formal) arguments, described by the Ins array, into the + /// specified DAG. The implementation should fill in the InVals array + /// with legal-type argument values, and return the resulting token + /// chain value. + /// + virtual SDValue + LowerFormalArguments(SDValue /*Chain*/, CallingConv::ID /*CallConv*/, + bool /*isVarArg*/, + const SmallVectorImpl<ISD::InputArg> &/*Ins*/, + DebugLoc /*dl*/, SelectionDAG &/*DAG*/, + SmallVectorImpl<SDValue> &/*InVals*/) const { + llvm_unreachable("Not Implemented"); + } + + struct ArgListEntry { + SDValue Node; + Type* Ty; + bool isSExt : 1; + bool isZExt : 1; + bool isInReg : 1; + bool isSRet : 1; + bool isNest : 1; + bool isByVal : 1; + uint16_t Alignment; + + ArgListEntry() : isSExt(false), isZExt(false), isInReg(false), + isSRet(false), isNest(false), isByVal(false), Alignment(0) { } + }; + typedef std::vector<ArgListEntry> ArgListTy; + + /// CallLoweringInfo - This structure contains all information that is + /// necessary for lowering calls. It is passed to TLI::LowerCallTo when the + /// SelectionDAG builder needs to lower a call, and targets will see this + /// struct in their LowerCall implementation. + struct CallLoweringInfo { + SDValue Chain; + Type *RetTy; + bool RetSExt : 1; + bool RetZExt : 1; + bool IsVarArg : 1; + bool IsInReg : 1; + bool DoesNotReturn : 1; + bool IsReturnValueUsed : 1; + + // IsTailCall should be modified by implementations of + // TargetLowering::LowerCall that perform tail call conversions. + bool IsTailCall; + + unsigned NumFixedArgs; + CallingConv::ID CallConv; + SDValue Callee; + ArgListTy &Args; + SelectionDAG &DAG; + DebugLoc DL; + ImmutableCallSite *CS; + SmallVector<ISD::OutputArg, 32> Outs; + SmallVector<SDValue, 32> OutVals; + SmallVector<ISD::InputArg, 32> Ins; + + + /// CallLoweringInfo - Constructs a call lowering context based on the + /// ImmutableCallSite \p cs. + CallLoweringInfo(SDValue chain, Type *retTy, + FunctionType *FTy, bool isTailCall, SDValue callee, + ArgListTy &args, SelectionDAG &dag, DebugLoc dl, + ImmutableCallSite &cs) + : Chain(chain), RetTy(retTy), RetSExt(cs.paramHasAttr(0, Attribute::SExt)), + RetZExt(cs.paramHasAttr(0, Attribute::ZExt)), IsVarArg(FTy->isVarArg()), + IsInReg(cs.paramHasAttr(0, Attribute::InReg)), + DoesNotReturn(cs.doesNotReturn()), + IsReturnValueUsed(!cs.getInstruction()->use_empty()), + IsTailCall(isTailCall), NumFixedArgs(FTy->getNumParams()), + CallConv(cs.getCallingConv()), Callee(callee), Args(args), DAG(dag), + DL(dl), CS(&cs) {} + + /// CallLoweringInfo - Constructs a call lowering context based on the + /// provided call information. + CallLoweringInfo(SDValue chain, Type *retTy, bool retSExt, bool retZExt, + bool isVarArg, bool isInReg, unsigned numFixedArgs, + CallingConv::ID callConv, bool isTailCall, + bool doesNotReturn, bool isReturnValueUsed, SDValue callee, + ArgListTy &args, SelectionDAG &dag, DebugLoc dl) + : Chain(chain), RetTy(retTy), RetSExt(retSExt), RetZExt(retZExt), + IsVarArg(isVarArg), IsInReg(isInReg), DoesNotReturn(doesNotReturn), + IsReturnValueUsed(isReturnValueUsed), IsTailCall(isTailCall), + NumFixedArgs(numFixedArgs), CallConv(callConv), Callee(callee), + Args(args), DAG(dag), DL(dl), CS(NULL) {} + }; + + /// LowerCallTo - This function lowers an abstract call to a function into an + /// actual call. This returns a pair of operands. The first element is the + /// return value for the function (if RetTy is not VoidTy). 
The second
+  /// element is the outgoing token chain. It calls LowerCall to do the actual
+  /// lowering.
+  std::pair<SDValue, SDValue> LowerCallTo(CallLoweringInfo &CLI) const;
+
+  /// LowerCall - This hook must be implemented to lower calls into the
+  /// specified DAG. The outgoing arguments to the call are described by the
+  /// Outs array, and the values to be returned by the call are described by
+  /// the Ins array. The implementation should fill in the InVals array with
+  /// legal-type return values from the call, and return the resulting token
+  /// chain value.
+  virtual SDValue
+  LowerCall(CallLoweringInfo &/*CLI*/,
+            SmallVectorImpl<SDValue> &/*InVals*/) const {
+    llvm_unreachable("Not Implemented");
+  }
+
+  /// HandleByVal - Target-specific cleanup for formal ByVal parameters.
+  virtual void HandleByVal(CCState *, unsigned &, unsigned) const {}
+
+  /// CanLowerReturn - This hook should be implemented to check whether the
+  /// return values described by the Outs array can fit into the return
+  /// registers. If false is returned, an sret-demotion is performed.
+  ///
+  virtual bool CanLowerReturn(CallingConv::ID /*CallConv*/,
+                              MachineFunction &/*MF*/, bool /*isVarArg*/,
+                              const SmallVectorImpl<ISD::OutputArg> &/*Outs*/,
+                              LLVMContext &/*Context*/) const
+  {
+    // Return true by default to get preexisting behavior.
+    return true;
+  }
+
+  /// LowerReturn - This hook must be implemented to lower outgoing
+  /// return values, described by the Outs array, into the specified
+  /// DAG. The implementation should return the resulting token chain
+  /// value.
+  ///
+  virtual SDValue
+  LowerReturn(SDValue /*Chain*/, CallingConv::ID /*CallConv*/,
+              bool /*isVarArg*/,
+              const SmallVectorImpl<ISD::OutputArg> &/*Outs*/,
+              const SmallVectorImpl<SDValue> &/*OutVals*/,
+              DebugLoc /*dl*/, SelectionDAG &/*DAG*/) const {
+    llvm_unreachable("Not Implemented");
+  }
+
+  /// isUsedByReturnOnly - Return true if result of the specified node is used
+  /// by a return node only. It also computes and returns the input chain for
+  /// the tail call.
+  /// This is used to determine whether it is possible
+  /// to codegen a libcall as tail call at legalization time.
+  virtual bool isUsedByReturnOnly(SDNode *, SDValue &Chain) const {
+    return false;
+  }
+
+  /// mayBeEmittedAsTailCall - Return true if the target may be able to emit
+  /// the call instruction as a tail call. This is used by optimization passes
+  /// to determine if it's profitable to duplicate return instructions to
+  /// enable tailcall optimization.
+  virtual bool mayBeEmittedAsTailCall(CallInst *) const {
+    return false;
+  }
+
+  /// getTypeForExtArgOrReturn - Return the type that should be used to zero or
+  /// sign extend a zeroext/signext integer argument or return value.
+  /// FIXME: Most C calling conventions require the return type to be promoted,
+  /// but this is not true all the time, e.g. i1 on x86-64. It is also not
+  /// necessary for non-C calling conventions. The frontend should handle this
+  /// and include all of the necessary information.
+  virtual MVT getTypeForExtArgOrReturn(MVT VT,
+                                       ISD::NodeType /*ExtendKind*/) const {
+    MVT MinVT = getRegisterType(MVT::i32);
+    return VT.bitsLT(MinVT) ? MinVT : VT;
+  }
+
+  /// LowerOperationWrapper - This callback is invoked by the type legalizer
+  /// to legalize nodes with an illegal operand type but legal result types.
+  /// It replaces the LowerOperation callback in the type Legalizer.
+ /// The reason we can not do away with LowerOperation entirely is that + /// LegalizeDAG isn't yet ready to use this callback. + /// TODO: Consider merging with ReplaceNodeResults. + + /// The target places new result values for the node in Results (their number + /// and types must exactly match those of the original return values of + /// the node), or leaves Results empty, which indicates that the node is not + /// to be custom lowered after all. + /// The default implementation calls LowerOperation. + virtual void LowerOperationWrapper(SDNode *N, + SmallVectorImpl<SDValue> &Results, + SelectionDAG &DAG) const; + + /// LowerOperation - This callback is invoked for operations that are + /// unsupported by the target, which are registered to use 'custom' lowering, + /// and whose defined values are all legal. + /// If the target has no operations that require custom lowering, it need not + /// implement this. The default implementation of this aborts. + virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const; + + /// ReplaceNodeResults - This callback is invoked when a node result type is + /// illegal for the target, and the operation was registered to use 'custom' + /// lowering for that result type. The target places new result values for + /// the node in Results (their number and types must exactly match those of + /// the original return values of the node), or leaves Results empty, which + /// indicates that the node is not to be custom lowered after all. + /// + /// If the target has no operations that require custom lowering, it need not + /// implement this. The default implementation aborts. + virtual void ReplaceNodeResults(SDNode * /*N*/, + SmallVectorImpl<SDValue> &/*Results*/, + SelectionDAG &/*DAG*/) const { + llvm_unreachable("ReplaceNodeResults not implemented for this target!"); + } + + /// getTargetNodeName() - This method returns the name of a target specific + /// DAG node. + virtual const char *getTargetNodeName(unsigned Opcode) const; + + /// createFastISel - This method returns a target specific FastISel object, + /// or null if the target does not support "fast" ISel. + virtual FastISel *createFastISel(FunctionLoweringInfo &, + const TargetLibraryInfo *) const { + return 0; + } + + //===--------------------------------------------------------------------===// + // Inline Asm Support hooks + // + + /// ExpandInlineAsm - This hook allows the target to expand an inline asm + /// call to be explicit llvm code if it wants to. This is useful for + /// turning simple inline asms into LLVM intrinsics, which gives the + /// compiler more information about the behavior of the code. + virtual bool ExpandInlineAsm(CallInst *) const { + return false; + } + + enum ConstraintType { + C_Register, // Constraint represents specific register(s). + C_RegisterClass, // Constraint represents any of register(s) in class. + C_Memory, // Memory constraint. + C_Other, // Something else. + C_Unknown // Unsupported constraint. + }; + + enum ConstraintWeight { + // Generic weights. + CW_Invalid = -1, // No match. + CW_Okay = 0, // Acceptable. + CW_Good = 1, // Good weight. + CW_Better = 2, // Better weight. + CW_Best = 3, // Best weight. + + // Well-known weights. + CW_SpecificReg = CW_Okay, // Specific register operands. + CW_Register = CW_Good, // Register operands. + CW_Memory = CW_Better, // Memory operands. + CW_Constant = CW_Best, // Constant operand. + CW_Default = CW_Okay // Default or don't know type. 
+  };
+
+  /// AsmOperandInfo - This contains information for each constraint that we are
+  /// lowering.
+  struct AsmOperandInfo : public InlineAsm::ConstraintInfo {
+    /// ConstraintCode - This contains the actual string for the code, like "m".
+    /// TargetLowering picks the 'best' code from ConstraintInfo::Codes that
+    /// most closely matches the operand.
+    std::string ConstraintCode;
+
+    /// ConstraintType - Information about the constraint code, e.g. Register,
+    /// RegisterClass, Memory, Other, Unknown.
+    TargetLowering::ConstraintType ConstraintType;
+
+    /// CallOperandVal - If this is the result output operand or a
+    /// clobber, this is null, otherwise it is the incoming operand to the
+    /// CallInst. This gets modified as the asm is processed.
+    Value *CallOperandVal;
+
+    /// ConstraintVT - The ValueType for the operand value.
+    MVT ConstraintVT;
+
+    /// isMatchingInputConstraint - Return true if this is an input operand that
+    /// is a matching constraint like "4".
+    bool isMatchingInputConstraint() const;
+
+    /// getMatchedOperand - If this is an input matching constraint, this method
+    /// returns the output operand it matches.
+    unsigned getMatchedOperand() const;
+
+    /// Copy constructor for copying from an AsmOperandInfo.
+    AsmOperandInfo(const AsmOperandInfo &info)
+      : InlineAsm::ConstraintInfo(info),
+        ConstraintCode(info.ConstraintCode),
+        ConstraintType(info.ConstraintType),
+        CallOperandVal(info.CallOperandVal),
+        ConstraintVT(info.ConstraintVT) {
+    }
+
+    /// Copy constructor for copying from a ConstraintInfo.
+    AsmOperandInfo(const InlineAsm::ConstraintInfo &info)
+      : InlineAsm::ConstraintInfo(info),
+        ConstraintType(TargetLowering::C_Unknown),
+        CallOperandVal(0), ConstraintVT(MVT::Other) {
+    }
+  };
+
+  typedef std::vector<AsmOperandInfo> AsmOperandInfoVector;
+
+  /// ParseConstraints - Split up the constraint string from the inline
+  /// assembly value into the specific constraints and their prefixes,
+  /// and also tie in the associated operand values.
+  /// If this returns an empty vector, and if the constraint string itself
+  /// isn't empty, there was an error parsing.
+  virtual AsmOperandInfoVector ParseConstraints(ImmutableCallSite CS) const;
+
+  /// Examine constraint type and operand type and determine a weight value.
+  /// The operand object must already have been set up with the operand type.
+  virtual ConstraintWeight getMultipleConstraintMatchWeight(
+      AsmOperandInfo &info, int maIndex) const;
+
+  /// Examine constraint string and operand type and determine a weight value.
+  /// The operand object must already have been set up with the operand type.
+  virtual ConstraintWeight getSingleConstraintMatchWeight(
+      AsmOperandInfo &info, const char *constraint) const;
+
+  /// ComputeConstraintToUse - Determines the constraint code and constraint
+  /// type to use for the specific AsmOperandInfo, setting
+  /// OpInfo.ConstraintCode and OpInfo.ConstraintType. If the actual operand
+  /// being passed in is available, it can be passed in as Op, otherwise an
+  /// empty SDValue can be passed.
+  virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo,
+                                      SDValue Op,
+                                      SelectionDAG *DAG = 0) const;
+
+  /// getConstraintType - Given a constraint, return the type of constraint it
+  /// is for this target.
+  virtual ConstraintType getConstraintType(const std::string &Constraint) const;
+
+  /// getRegForInlineAsmConstraint - Given a physical register constraint (e.g.
+  /// {edx}), return the register number and the register class for the
+  /// register.
+ /// + /// Given a register class constraint, like 'r', if this corresponds directly + /// to an LLVM register class, return a register of 0 and the register class + /// pointer. + /// + /// This should only be used for C_Register constraints. On error, + /// this returns a register number of 0 and a null register class pointer.. + virtual std::pair<unsigned, const TargetRegisterClass*> + getRegForInlineAsmConstraint(const std::string &Constraint, + EVT VT) const; + + /// LowerXConstraint - try to replace an X constraint, which matches anything, + /// with another that has more specific requirements based on the type of the + /// corresponding operand. This returns null if there is no replacement to + /// make. + virtual const char *LowerXConstraint(EVT ConstraintVT) const; + + /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops + /// vector. If it is invalid, don't add anything to Ops. + virtual void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint, + std::vector<SDValue> &Ops, + SelectionDAG &DAG) const; + + //===--------------------------------------------------------------------===// + // Div utility functions + // + SDValue BuildExactSDIV(SDValue Op1, SDValue Op2, DebugLoc dl, + SelectionDAG &DAG) const; + SDValue BuildSDIV(SDNode *N, SelectionDAG &DAG, bool IsAfterLegalization, + std::vector<SDNode*> *Created) const; + SDValue BuildUDIV(SDNode *N, SelectionDAG &DAG, bool IsAfterLegalization, + std::vector<SDNode*> *Created) const; + + //===--------------------------------------------------------------------===// + // Instruction Emitting Hooks + // + + // EmitInstrWithCustomInserter - This method should be implemented by targets + // that mark instructions with the 'usesCustomInserter' flag. These + // instructions are special in various ways, which require special support to + // insert. The specified MachineInstr is created but not inserted into any + // basic blocks, and this method is called to expand it into a sequence of + // instructions, potentially also creating new basic blocks and control flow. + virtual MachineBasicBlock * + EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *MBB) const; + + /// AdjustInstrPostInstrSelection - This method should be implemented by + /// targets that mark instructions with the 'hasPostISelHook' flag. These + /// instructions must be adjusted after instruction selection by target hooks. + /// e.g. To fill in optional defs for ARM 's' setting instructions. + virtual void + AdjustInstrPostInstrSelection(MachineInstr *MI, SDNode *Node) const; +}; + /// GetReturnInfo - Given an LLVM IR type and return type attributes, /// compute the return value EVTs and flags, and optionally also /// the offsets, if the return value is being lowered to memory. diff --git a/include/llvm/Target/TargetMachine.h b/include/llvm/Target/TargetMachine.h index aa049f0..35cf20a 100644 --- a/include/llvm/Target/TargetMachine.h +++ b/include/llvm/Target/TargetMachine.h @@ -59,10 +59,6 @@ protected: // Can only create subclasses. TargetMachine(const Target &T, StringRef TargetTriple, StringRef CPU, StringRef FS, const TargetOptions &Options); - /// getSubtargetImpl - virtual method implemented by subclasses that returns - /// a reference to that target's TargetSubtargetInfo-derived member variable. - virtual const TargetSubtargetInfo *getSubtargetImpl() const { return 0; } - /// TheTarget - The Target that this machine was created for. 
const Target &TheTarget; @@ -95,6 +91,10 @@ public: const StringRef getTargetCPU() const { return TargetCPU; } const StringRef getTargetFeatureString() const { return TargetFS; } + /// getSubtargetImpl - virtual method implemented by subclasses that returns + /// a reference to that target's TargetSubtargetInfo-derived member variable. + virtual const TargetSubtargetInfo *getSubtargetImpl() const { return 0; } + TargetOptions Options; // Interfaces to the major aspects of target machine information: diff --git a/include/llvm/Target/TargetOptions.h b/include/llvm/Target/TargetOptions.h index 8fc566e..c31db24 100644 --- a/include/llvm/Target/TargetOptions.h +++ b/include/llvm/Target/TargetOptions.h @@ -48,10 +48,10 @@ namespace llvm { UseSoftFloat(false), NoZerosInBSS(false), JITExceptionHandling(false), JITEmitDebugInfo(false), JITEmitDebugInfoToDisk(false), GuaranteedTailCallOpt(false), DisableTailCalls(false), - StackAlignmentOverride(0), RealignStack(true), EnableFastISel(false), - PositionIndependentExecutable(false), EnableSegmentedStacks(false), - UseInitArray(false), TrapFuncName(""),FloatABIType(FloatABI::Default), - AllowFPOpFusion(FPOpFusion::Standard) + StackAlignmentOverride(0), RealignStack(true), SSPBufferSize(0), + EnableFastISel(false), PositionIndependentExecutable(false), + EnableSegmentedStacks(false), UseInitArray(false), TrapFuncName(""), + FloatABIType(FloatABI::Default), AllowFPOpFusion(FPOpFusion::Standard) {} /// PrintMachineCode - This flag is enabled when the -print-machineinstrs diff --git a/include/llvm/Target/TargetRegisterInfo.h b/include/llvm/Target/TargetRegisterInfo.h index 2a8ef98..6b1e70b 100644 --- a/include/llvm/Target/TargetRegisterInfo.h +++ b/include/llvm/Target/TargetRegisterInfo.h @@ -733,21 +733,6 @@ public: llvm_unreachable("isFrameOffsetLegal does not exist on this target"); } - /// eliminateCallFramePseudoInstr - This method is called during prolog/epilog - /// code insertion to eliminate call frame setup and destroy pseudo - /// instructions (but only if the Target is using them). It is responsible - /// for eliminating these instructions, replacing them with concrete - /// instructions. This method need only be implemented if using call frame - /// setup/destroy pseudo instructions. - /// - virtual void - eliminateCallFramePseudoInstr(MachineFunction &MF, - MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI) const { - llvm_unreachable("Call Frame Pseudo Instructions do not exist on this " - "target!"); - } - /// saveScavengerRegister - Spill the register so it can be used by the /// register scavenger. Return true if the register was spilled, false @@ -767,10 +752,11 @@ public: /// referenced by the iterator contains an MO_FrameIndex operand which must be /// eliminated by this method. This method may modify or replace the /// specified instruction, as long as it keeps the iterator pointing at the - /// finished product. SPAdj is the SP adjustment due to call frame setup - /// instruction. + /// finished product. SPAdj is the SP adjustment due to call frame setup + /// instruction. FIOperandNum is the FI operand number. virtual void eliminateFrameIndex(MachineBasicBlock::iterator MI, - int SPAdj, RegScavenger *RS=NULL) const = 0; + int SPAdj, unsigned FIOperandNum, + RegScavenger *RS = NULL) const = 0; //===--------------------------------------------------------------------===// /// Debug information queries. 
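
The eliminateFrameIndex() signature change above threads the frame-index operand number through explicitly instead of having each target rediscover it by scanning the instruction. A hedged sketch of a target override under the new signature; MyTargetRegisterInfo, MyTarget::SP, and the fixed reg+imm operand layout are hypothetical.

void MyTargetRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
                                               int SPAdj,
                                               unsigned FIOperandNum,
                                               RegScavenger *RS) const {
  MachineInstr &MI = *II;
  MachineFunction &MF = *MI.getParent()->getParent();

  // The frame-index operand position is now handed to us directly.
  int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
  int64_t Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex) + SPAdj +
                   MI.getOperand(FIOperandNum + 1).getImm();

  // Rewrite "FI, imm" in place as "SP, offset"; the iterator keeps pointing
  // at the finished instruction, as the contract above requires.
  MI.getOperand(FIOperandNum).ChangeToRegister(MyTarget::SP, /*isDef=*/false);
  MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset);
}
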
diff --git a/include/llvm/Target/TargetSchedule.td b/include/llvm/Target/TargetSchedule.td index 0da82fd..b7920ba 100644 --- a/include/llvm/Target/TargetSchedule.td +++ b/include/llvm/Target/TargetSchedule.td @@ -76,6 +76,7 @@ class SchedMachineModel { int IssueWidth = -1; // Max micro-ops that may be scheduled per cycle. int MinLatency = -1; // Determines which instrucions are allowed in a group. // (-1) inorder (0) ooo, (1): inorder +var latencies. + int ILPWindow = -1; // Cycles of latency likely hidden by hardware buffers. int LoadLatency = -1; // Cycles for loads to access the cache. int HighLatency = -1; // Approximation of cycles for "high latency" ops. int MispredictPenalty = -1; // Extra cycles for a mispredicted branch. diff --git a/include/llvm/Target/TargetSubtargetInfo.h b/include/llvm/Target/TargetSubtargetInfo.h index 3f22f47..b2d405d 100644 --- a/include/llvm/Target/TargetSubtargetInfo.h +++ b/include/llvm/Target/TargetSubtargetInfo.h @@ -19,6 +19,7 @@ namespace llvm { +class MachineFunction; class MachineInstr; class SDep; class SUnit; @@ -73,6 +74,9 @@ public: // the latency of a schedule dependency. virtual void adjustSchedDependency(SUnit *def, SUnit *use, SDep& dep) const { } + + /// \brief Reset the features for the subtarget. + virtual void resetSubtargetFeatures(const MachineFunction *MF) { } }; } // End llvm namespace diff --git a/include/llvm/Transforms/IPO/PassManagerBuilder.h b/include/llvm/Transforms/IPO/PassManagerBuilder.h index 3ea0a42..209f68d 100644 --- a/include/llvm/Transforms/IPO/PassManagerBuilder.h +++ b/include/llvm/Transforms/IPO/PassManagerBuilder.h @@ -12,8 +12,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_SUPPORT_PASSMANAGERBUILDER_H -#define LLVM_SUPPORT_PASSMANAGERBUILDER_H +#ifndef LLVM_TRANSFORMS_IPO_PASSMANAGERBUILDER_H +#define LLVM_TRANSFORMS_IPO_PASSMANAGERBUILDER_H #include <vector> diff --git a/include/llvm/Transforms/Instrumentation.h b/include/llvm/Transforms/Instrumentation.h index 5504dc9..fed92c8 100644 --- a/include/llvm/Transforms/Instrumentation.h +++ b/include/llvm/Transforms/Instrumentation.h @@ -39,9 +39,11 @@ ModulePass *createGCOVProfilerPass(bool EmitNotes = true, bool EmitData = true, // Insert AddressSanitizer (address sanity checking) instrumentation FunctionPass *createAddressSanitizerFunctionPass( bool CheckInitOrder = false, bool CheckUseAfterReturn = false, - bool CheckLifetime = false, StringRef BlacklistFile = StringRef()); + bool CheckLifetime = false, StringRef BlacklistFile = StringRef(), + bool ZeroBaseShadow = false); ModulePass *createAddressSanitizerModulePass( - bool CheckInitOrder = false, StringRef BlacklistFile = StringRef()); + bool CheckInitOrder = false, StringRef BlacklistFile = StringRef(), + bool ZeroBaseShadow = false); // Insert MemorySanitizer instrumentation (detection of uninitialized reads) FunctionPass *createMemorySanitizerPass(bool TrackOrigins = false, diff --git a/include/llvm/Transforms/ObjCARC.h b/include/llvm/Transforms/ObjCARC.h new file mode 100644 index 0000000..b3c19c0 --- /dev/null +++ b/include/llvm/Transforms/ObjCARC.h @@ -0,0 +1,49 @@ +//===-- ObjCARC.h - ObjCARC Scalar Transformations --------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This header file defines prototypes for accessor functions that expose passes +// in the ObjCARC Scalar Transformations library. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_OBJCARC_H +#define LLVM_TRANSFORMS_OBJCARC_H + +namespace llvm { + +class Pass; + +//===----------------------------------------------------------------------===// +// +// ObjCARCAPElim - ObjC ARC autorelease pool elimination. +// +Pass *createObjCARCAPElimPass(); + +//===----------------------------------------------------------------------===// +// +// ObjCARCExpand - ObjC ARC preliminary simplifications. +// +Pass *createObjCARCExpandPass(); + +//===----------------------------------------------------------------------===// +// +// ObjCARCContract - Late ObjC ARC cleanups. +// +Pass *createObjCARCContractPass(); + +//===----------------------------------------------------------------------===// +// +// ObjCARCOpt - ObjC ARC optimization. +// +Pass *createObjCARCOptPass(); + +} // End llvm namespace + +#endif + diff --git a/include/llvm/Transforms/Scalar.h b/include/llvm/Transforms/Scalar.h index d465127..e89759a 100644 --- a/include/llvm/Transforms/Scalar.h +++ b/include/llvm/Transforms/Scalar.h @@ -334,30 +334,6 @@ Pass *createCorrelatedValuePropagationPass(); //===----------------------------------------------------------------------===// // -// ObjCARCAPElim - ObjC ARC autorelease pool elimination. -// -Pass *createObjCARCAPElimPass(); - -//===----------------------------------------------------------------------===// -// -// ObjCARCExpand - ObjC ARC preliminary simplifications. -// -Pass *createObjCARCExpandPass(); - -//===----------------------------------------------------------------------===// -// -// ObjCARCContract - Late ObjC ARC cleanups. -// -Pass *createObjCARCContractPass(); - -//===----------------------------------------------------------------------===// -// -// ObjCARCOpt - ObjC ARC optimization. -// -Pass *createObjCARCOptPass(); - -//===----------------------------------------------------------------------===// -// // InstructionSimplifier - Remove redundant instructions. // FunctionPass *createInstructionSimplifierPass(); diff --git a/include/llvm/Transforms/Utils/BasicBlockUtils.h b/include/llvm/Transforms/Utils/BasicBlockUtils.h index d08f75e..8f1a6e2 100644 --- a/include/llvm/Transforms/Utils/BasicBlockUtils.h +++ b/include/llvm/Transforms/Utils/BasicBlockUtils.h @@ -12,14 +12,13 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_TRANSFORMS_UTILS_BASICBLOCK_H -#define LLVM_TRANSFORMS_UTILS_BASICBLOCK_H +#ifndef LLVM_TRANSFORMS_UTILS_BASICBLOCKUTILS_H +#define LLVM_TRANSFORMS_UTILS_BASICBLOCKUTILS_H // FIXME: Move to this file: BasicBlock::removePredecessor, BB::splitBasicBlock #include "llvm/IR/BasicBlock.h" #include "llvm/Support/CFG.h" -#include "llvm/Support/DebugLoc.h" namespace llvm { diff --git a/lib/Transforms/Instrumentation/BlackList.h b/include/llvm/Transforms/Utils/BlackList.h index ee18a98..f19470e 100644 --- a/lib/Transforms/Instrumentation/BlackList.h +++ b/include/llvm/Transforms/Utils/BlackList.h @@ -42,17 +42,17 @@ class BlackList { public: BlackList(const StringRef Path); // Returns whether either this function or it's source file are blacklisted. 
- bool isIn(const Function &F); + bool isIn(const Function &F) const; // Returns whether either this global or it's source file are blacklisted. - bool isIn(const GlobalVariable &G); + bool isIn(const GlobalVariable &G) const; // Returns whether this module is blacklisted by filename. - bool isIn(const Module &M); + bool isIn(const Module &M) const; // Returns whether a global should be excluded from initialization checking. - bool isInInit(const GlobalVariable &G); + bool isInInit(const GlobalVariable &G) const; private: StringMap<Regex*> Entries; - bool inSection(const StringRef Section, const StringRef Query); + bool inSection(const StringRef Section, const StringRef Query) const; }; } // namespace llvm diff --git a/include/llvm/Transforms/Utils/BuildLibCalls.h b/include/llvm/Transforms/Utils/BuildLibCalls.h index ca2d8f6..181ed07 100644 --- a/include/llvm/Transforms/Utils/BuildLibCalls.h +++ b/include/llvm/Transforms/Utils/BuildLibCalls.h @@ -12,8 +12,8 @@ // //===----------------------------------------------------------------------===// -#ifndef TRANSFORMS_UTILS_BUILDLIBCALLS_H -#define TRANSFORMS_UTILS_BUILDLIBCALLS_H +#ifndef LLVM_TRANSFORMS_UTILS_BUILDLIBCALLS_H +#define LLVM_TRANSFORMS_UTILS_BUILDLIBCALLS_H #include "llvm/IR/IRBuilder.h" diff --git a/include/llvm/Transforms/Utils/BypassSlowDivision.h b/include/llvm/Transforms/Utils/BypassSlowDivision.h index b11be05..0d081c0 100644 --- a/include/llvm/Transforms/Utils/BypassSlowDivision.h +++ b/include/llvm/Transforms/Utils/BypassSlowDivision.h @@ -15,8 +15,8 @@ // //===----------------------------------------------------------------------===// -#ifndef TRANSFORMS_UTILS_BYPASSSLOWDIVISION_H -#define TRANSFORMS_UTILS_BYPASSSLOWDIVISION_H +#ifndef LLVM_TRANSFORMS_UTILS_BYPASSSLOWDIVISION_H +#define LLVM_TRANSFORMS_UTILS_BYPASSSLOWDIVISION_H #include "llvm/ADT/DenseMap.h" #include "llvm/IR/Function.h" diff --git a/include/llvm/Transforms/Utils/IntegerDivision.h b/include/llvm/Transforms/Utils/IntegerDivision.h index cecc807..27d3c58 100644 --- a/include/llvm/Transforms/Utils/IntegerDivision.h +++ b/include/llvm/Transforms/Utils/IntegerDivision.h @@ -14,8 +14,8 @@ // //===----------------------------------------------------------------------===// -#ifndef TRANSFORMS_UTILS_INTEGERDIVISION_H -#define TRANSFORMS_UTILS_INTEGERDIVISION_H +#ifndef LLVM_TRANSFORMS_UTILS_INTEGERDIVISION_H +#define LLVM_TRANSFORMS_UTILS_INTEGERDIVISION_H namespace llvm { class BinaryOperator; @@ -43,6 +43,20 @@ namespace llvm { /// @brief Replace Div with generated code. bool expandDivision(BinaryOperator* Div); + /// Generate code to calculate the remainder of two integers, replacing Rem + /// with the generated code. Uses the above 32bit routine, therefore adequate + /// for targets with little or no support for less than 32 bit arithmetic. + /// + /// @brief Replace Rem with generated code. + bool expandRemainderUpTo32Bits(BinaryOperator *Rem); + + /// Generate code to divide two integers, replacing Div with the generated + /// code. Uses the above 32bit routine, therefore adequate for targets with + /// little or no support for less than 32 bit arithmetic. + /// + /// @brief Replace Rem with generated code. 
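
A hedged usage sketch for the expandRemainderUpTo32Bits() entry point declared above: collect the narrow remainder instructions first and only then expand them, since the expansion rewrites the CFG and erases the original instruction. The driver function is hypothetical.

#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Transforms/Utils/IntegerDivision.h"
using namespace llvm;

// Hypothetical driver: expand every remainder narrower than 32 bits.
static bool expandSmallRemainders(Function &F) {
  SmallVector<BinaryOperator *, 8> Worklist;
  for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
    for (BasicBlock::iterator I = BB->begin(), IE = BB->end(); I != IE; ++I)
      if (BinaryOperator *BO = dyn_cast<BinaryOperator>(&*I))
        if ((BO->getOpcode() == Instruction::URem ||
             BO->getOpcode() == Instruction::SRem) &&
            BO->getType()->isIntegerTy() &&
            BO->getType()->getPrimitiveSizeInBits() < 32)
          Worklist.push_back(BO);

  bool Changed = false;
  for (unsigned i = 0, e = Worklist.size(); i != e; ++i)
    Changed |= expandRemainderUpTo32Bits(Worklist[i]);
  return Changed;
}
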
+ bool expandDivisionUpTo32Bits(BinaryOperator *Div); + } // End llvm namespace #endif diff --git a/include/llvm/Transforms/Utils/ModuleUtils.h b/include/llvm/Transforms/Utils/ModuleUtils.h index 2c0ec9b..bb7fc06 100644 --- a/include/llvm/Transforms/Utils/ModuleUtils.h +++ b/include/llvm/Transforms/Utils/ModuleUtils.h @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_TRANSFORMS_UTILS_MODULE_UTILS_H -#define LLVM_TRANSFORMS_UTILS_MODULE_UTILS_H +#ifndef LLVM_TRANSFORMS_UTILS_MODULEUTILS_H +#define LLVM_TRANSFORMS_UTILS_MODULEUTILS_H namespace llvm { @@ -30,4 +30,4 @@ void appendToGlobalDtors(Module &M, Function *F, int Priority); } // End llvm namespace -#endif // LLVM_TRANSFORMS_UTILS_MODULE_UTILS_H +#endif // LLVM_TRANSFORMS_UTILS_MODULEUTILS_H diff --git a/include/llvm/Transforms/Utils/PromoteMemToReg.h b/include/llvm/Transforms/Utils/PromoteMemToReg.h index 0bb6ec6..52a6157 100644 --- a/include/llvm/Transforms/Utils/PromoteMemToReg.h +++ b/include/llvm/Transforms/Utils/PromoteMemToReg.h @@ -12,8 +12,8 @@ // //===----------------------------------------------------------------------===// -#ifndef TRANSFORMS_UTILS_PROMOTEMEMTOREG_H -#define TRANSFORMS_UTILS_PROMOTEMEMTOREG_H +#ifndef LLVM_TRANSFORMS_UTILS_PROMOTEMEMTOREG_H +#define LLVM_TRANSFORMS_UTILS_PROMOTEMEMTOREG_H #include <vector> diff --git a/include/llvm/Transforms/Vectorize.h b/include/llvm/Transforms/Vectorize.h index 1ba4d22..d205dbd 100644 --- a/include/llvm/Transforms/Vectorize.h +++ b/include/llvm/Transforms/Vectorize.h @@ -84,6 +84,9 @@ struct VectorizeConfig { /// @brief The maximum number of pairable instructions per group. unsigned MaxInsts; + /// @brief The maximum number of candidate instruction pairs per group. + unsigned MaxPairs; + /// @brief The maximum number of pairing iterations. unsigned MaxIter; diff --git a/lib/Analysis/AliasAnalysis.cpp b/lib/Analysis/AliasAnalysis.cpp index f32bd70..210b80a 100644 --- a/lib/Analysis/AliasAnalysis.cpp +++ b/lib/Analysis/AliasAnalysis.cpp @@ -555,19 +555,3 @@ bool llvm::isIdentifiedObject(const Value *V) { return A->hasNoAliasAttr() || A->hasByValAttr(); return false; } - -/// isKnownNonNull - Return true if we know that the specified value is never -/// null. -bool llvm::isKnownNonNull(const Value *V) { - // Alloca never returns null, malloc might. - if (isa<AllocaInst>(V)) return true; - - // A byval argument is never null. - if (const Argument *A = dyn_cast<Argument>(V)) - return A->hasByValAttr(); - - // Global values are not null unless extern weak. 
- if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) - return !GV->hasExternalWeakLinkage(); - return false; -} diff --git a/lib/Analysis/CMakeLists.txt b/lib/Analysis/CMakeLists.txt index 78abe0f..4c64c4a 100644 --- a/lib/Analysis/CMakeLists.txt +++ b/lib/Analysis/CMakeLists.txt @@ -18,7 +18,6 @@ add_llvm_library(LLVMAnalysis DomPrinter.cpp DominanceFrontier.cpp IVUsers.cpp - InlineCost.cpp InstCount.cpp InstructionSimplify.cpp Interval.cpp diff --git a/lib/Analysis/CodeMetrics.cpp b/lib/Analysis/CodeMetrics.cpp index 1dff3d4..8cda01a 100644 --- a/lib/Analysis/CodeMetrics.cpp +++ b/lib/Analysis/CodeMetrics.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Analysis/CodeMetrics.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Function.h" #include "llvm/IR/IntrinsicInst.h" @@ -19,114 +20,14 @@ using namespace llvm; -/// callIsSmall - If a call is likely to lower to a single target instruction, -/// or is otherwise deemed small return true. -/// TODO: Perhaps calls like memcpy, strcpy, etc? -bool llvm::callIsSmall(ImmutableCallSite CS) { - if (isa<IntrinsicInst>(CS.getInstruction())) - return true; - - const Function *F = CS.getCalledFunction(); - if (!F) return false; - - if (F->hasLocalLinkage()) return false; - - if (!F->hasName()) return false; - - StringRef Name = F->getName(); - - // These will all likely lower to a single selection DAG node. - if (Name == "copysign" || Name == "copysignf" || Name == "copysignl" || - Name == "fabs" || Name == "fabsf" || Name == "fabsl" || - Name == "sin" || Name == "sinf" || Name == "sinl" || - Name == "cos" || Name == "cosf" || Name == "cosl" || - Name == "sqrt" || Name == "sqrtf" || Name == "sqrtl" ) - return true; - - // These are all likely to be optimized into something smaller. - if (Name == "pow" || Name == "powf" || Name == "powl" || - Name == "exp2" || Name == "exp2l" || Name == "exp2f" || - Name == "floor" || Name == "floorf" || Name == "ceil" || - Name == "round" || Name == "ffs" || Name == "ffsl" || - Name == "abs" || Name == "labs" || Name == "llabs") - return true; - - return false; -} - -bool llvm::isInstructionFree(const Instruction *I, const DataLayout *TD) { - if (isa<PHINode>(I)) - return true; - - // If a GEP has all constant indices, it will probably be folded with - // a load/store. - if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I)) - return GEP->hasAllConstantIndices(); - - if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { - switch (II->getIntrinsicID()) { - default: - return false; - case Intrinsic::dbg_declare: - case Intrinsic::dbg_value: - case Intrinsic::invariant_start: - case Intrinsic::invariant_end: - case Intrinsic::lifetime_start: - case Intrinsic::lifetime_end: - case Intrinsic::objectsize: - case Intrinsic::ptr_annotation: - case Intrinsic::var_annotation: - // These intrinsics don't count as size. - return true; - } - } - - if (const CastInst *CI = dyn_cast<CastInst>(I)) { - // Noop casts, including ptr <-> int, don't count. - if (CI->isLosslessCast()) - return true; - - Value *Op = CI->getOperand(0); - // An inttoptr cast is free so long as the input is a legal integer type - // which doesn't contain values outside the range of a pointer. 
- if (isa<IntToPtrInst>(CI) && TD && - TD->isLegalInteger(Op->getType()->getScalarSizeInBits()) && - Op->getType()->getScalarSizeInBits() <= TD->getPointerSizeInBits()) - return true; - - // A ptrtoint cast is free so long as the result is large enough to store - // the pointer, and a legal integer type. - if (isa<PtrToIntInst>(CI) && TD && - TD->isLegalInteger(Op->getType()->getScalarSizeInBits()) && - Op->getType()->getScalarSizeInBits() >= TD->getPointerSizeInBits()) - return true; - - // trunc to a native type is free (assuming the target has compare and - // shift-right of the same width). - if (TD && isa<TruncInst>(CI) && - TD->isLegalInteger(TD->getTypeSizeInBits(CI->getType()))) - return true; - // Result of a cmp instruction is often extended (to be used by other - // cmp instructions, logical or return instructions). These are usually - // nop on most sane targets. - if (isa<CmpInst>(CI->getOperand(0))) - return true; - } - - return false; -} - /// analyzeBasicBlock - Fill in the current structure with information gleaned /// from the specified block. void CodeMetrics::analyzeBasicBlock(const BasicBlock *BB, - const DataLayout *TD) { + const TargetTransformInfo &TTI) { ++NumBlocks; unsigned NumInstsBeforeThisBB = NumInsts; for (BasicBlock::const_iterator II = BB->begin(), E = BB->end(); II != E; ++II) { - if (isInstructionFree(II, TD)) - continue; - // Special handling for calls. if (isa<CallInst>(II) || isa<InvokeInst>(II)) { ImmutableCallSite CS(cast<Instruction>(II)); @@ -144,12 +45,10 @@ void CodeMetrics::analyzeBasicBlock(const BasicBlock *BB, // for that case. if (F == BB->getParent()) isRecursive = true; - } - - if (!callIsSmall(CS)) { - // Each argument to a call takes on average one instruction to set up. - NumInsts += CS.arg_size(); + if (TTI.isLoweredToCall(F)) + ++NumCalls; + } else { // We don't want inline asm to count as a call - that would prevent loop // unrolling. The argument setup cost is still real, though. if (!isa<InlineAsm>(CS.getCalledValue())) @@ -173,7 +72,7 @@ void CodeMetrics::analyzeBasicBlock(const BasicBlock *BB, if (InvI->hasFnAttr(Attribute::NoDuplicate)) notDuplicatable = true; - ++NumInsts; + NumInsts += TTI.getUserCost(&*II); } if (isa<ReturnInst>(BB->getTerminator())) @@ -195,18 +94,3 @@ void CodeMetrics::analyzeBasicBlock(const BasicBlock *BB, // Remember NumInsts for this BB. NumBBInsts[BB] = NumInsts - NumInstsBeforeThisBB; } - -void CodeMetrics::analyzeFunction(Function *F, const DataLayout *TD) { - // If this function contains a call that "returns twice" (e.g., setjmp or - // _setjmp) and it isn't marked with "returns twice" itself, never inline it. - // This is a hack because we depend on the user marking their local variables - // as volatile if they are live across a setjmp call, and they probably - // won't do this in callers. - exposesReturnsTwice = F->callsFunctionThatReturnsTwice() && - !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex, - Attribute::ReturnsTwice); - - // Look at the size of the callee. - for (Function::const_iterator BB = F->begin(), E = F->end(); BB != E; ++BB) - analyzeBasicBlock(&*BB, TD); -} diff --git a/lib/Analysis/ConstantFolding.cpp b/lib/Analysis/ConstantFolding.cpp index 2b7d3bd..09d7608 100644 --- a/lib/Analysis/ConstantFolding.cpp +++ b/lib/Analysis/ConstantFolding.cpp @@ -54,13 +54,12 @@ static Constant *FoldBitCast(Constant *C, Type *DestTy, // Handle a vector->integer cast. 
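// (Illustrative note, not part of the patch: folding a constant
// 'bitcast <2 x i32> to i64' takes this path; the element bits are shifted and
// inserted into the result below.)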
if (IntegerType *IT = dyn_cast<IntegerType>(DestTy)) { - ConstantDataVector *CDV = dyn_cast<ConstantDataVector>(C); - if (CDV == 0) + VectorType *VTy = dyn_cast<VectorType>(C->getType()); + if (VTy == 0) return ConstantExpr::getBitCast(C, DestTy); - unsigned NumSrcElts = CDV->getType()->getNumElements(); - - Type *SrcEltTy = CDV->getType()->getElementType(); + unsigned NumSrcElts = VTy->getNumElements(); + Type *SrcEltTy = VTy->getElementType(); // If the vector is a vector of floating point, convert it to vector of int // to simplify things. @@ -70,9 +69,12 @@ static Constant *FoldBitCast(Constant *C, Type *DestTy, VectorType::get(IntegerType::get(C->getContext(), FPWidth), NumSrcElts); // Ask IR to do the conversion now that #elts line up. C = ConstantExpr::getBitCast(C, SrcIVTy); - CDV = cast<ConstantDataVector>(C); } + ConstantDataVector *CDV = dyn_cast<ConstantDataVector>(C); + if (CDV == 0) + return ConstantExpr::getBitCast(C, DestTy); + // Now that we know that the input value is a vector of integers, just shift // and insert them into our result. unsigned BitShift = TD.getTypeAllocSizeInBits(SrcEltTy); @@ -218,10 +220,10 @@ static Constant *FoldBitCast(Constant *C, Type *DestTy, /// from a global, return the global and the constant. Because of /// constantexprs, this function is recursive. static bool IsConstantOffsetFromGlobal(Constant *C, GlobalValue *&GV, - int64_t &Offset, const DataLayout &TD) { + APInt &Offset, const DataLayout &TD) { // Trivial case, constant is the global. if ((GV = dyn_cast<GlobalValue>(C))) { - Offset = 0; + Offset.clearAllBits(); return true; } @@ -235,34 +237,13 @@ static bool IsConstantOffsetFromGlobal(Constant *C, GlobalValue *&GV, return IsConstantOffsetFromGlobal(CE->getOperand(0), GV, Offset, TD); // i32* getelementptr ([5 x i32]* @a, i32 0, i32 5) - if (CE->getOpcode() == Instruction::GetElementPtr) { - // Cannot compute this if the element type of the pointer is missing size - // info. - if (!cast<PointerType>(CE->getOperand(0)->getType()) - ->getElementType()->isSized()) - return false; - + if (GEPOperator *GEP = dyn_cast<GEPOperator>(CE)) { // If the base isn't a global+constant, we aren't either. if (!IsConstantOffsetFromGlobal(CE->getOperand(0), GV, Offset, TD)) return false; // Otherwise, add any offset that our operands provide. - gep_type_iterator GTI = gep_type_begin(CE); - for (User::const_op_iterator i = CE->op_begin() + 1, e = CE->op_end(); - i != e; ++i, ++GTI) { - ConstantInt *CI = dyn_cast<ConstantInt>(*i); - if (!CI) return false; // Index isn't a simple constant? - if (CI->isZero()) continue; // Not adding anything. 
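// (Editorial note: the removed per-index walk below, which adds the
// StructLayout field offset for struct indices and index * getTypeAllocSize
// for sequential types, is exactly the accumulation that the new
// GEPOperator::accumulateConstantOffset call performs.)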
- - if (StructType *ST = dyn_cast<StructType>(*GTI)) { - // N = N + Offset - Offset += TD.getStructLayout(ST)->getElementOffset(CI->getZExtValue()); - } else { - SequentialType *SQT = cast<SequentialType>(*GTI); - Offset += TD.getTypeAllocSize(SQT->getElementType())*CI->getSExtValue(); - } - } - return true; + return GEP->accumulateConstantOffset(TD, Offset); } return false; @@ -310,6 +291,10 @@ static bool ReadDataFromGlobal(Constant *C, uint64_t ByteOffset, C = FoldBitCast(C, Type::getInt32Ty(C->getContext()), TD); return ReadDataFromGlobal(C, ByteOffset, CurPtr, BytesLeft, TD); } + if (CFP->getType()->isHalfTy()){ + C = FoldBitCast(C, Type::getInt16Ty(C->getContext()), TD); + return ReadDataFromGlobal(C, ByteOffset, CurPtr, BytesLeft, TD); + } return false; } @@ -402,7 +387,9 @@ static Constant *FoldReinterpretLoadFromConstPtr(Constant *C, // that address spaces don't matter here since we're not going to result in // an actual new load. Type *MapTy; - if (LoadTy->isFloatTy()) + if (LoadTy->isHalfTy()) + MapTy = Type::getInt16PtrTy(C->getContext()); + else if (LoadTy->isFloatTy()) MapTy = Type::getInt32PtrTy(C->getContext()); else if (LoadTy->isDoubleTy()) MapTy = Type::getInt64PtrTy(C->getContext()); @@ -423,7 +410,7 @@ static Constant *FoldReinterpretLoadFromConstPtr(Constant *C, if (BytesLoaded > 32 || BytesLoaded == 0) return 0; GlobalValue *GVal; - int64_t Offset; + APInt Offset(TD.getPointerSizeInBits(), 0); if (!IsConstantOffsetFromGlobal(C, GVal, Offset, TD)) return 0; @@ -434,14 +421,15 @@ static Constant *FoldReinterpretLoadFromConstPtr(Constant *C, // If we're loading off the beginning of the global, some bytes may be valid, // but we don't try to handle this. - if (Offset < 0) return 0; + if (Offset.isNegative()) return 0; // If we're not accessing anything in this constant, the result is undefined. - if (uint64_t(Offset) >= TD.getTypeAllocSize(GV->getInitializer()->getType())) + if (Offset.getZExtValue() >= + TD.getTypeAllocSize(GV->getInitializer()->getType())) return UndefValue::get(IntType); unsigned char RawBytes[32] = {0}; - if (!ReadDataFromGlobal(GV->getInitializer(), Offset, RawBytes, + if (!ReadDataFromGlobal(GV->getInitializer(), Offset.getZExtValue(), RawBytes, BytesLoaded, TD)) return 0; @@ -550,10 +538,10 @@ static Constant *ConstantFoldLoadInst(const LoadInst *LI, const DataLayout *TD){ /// SymbolicallyEvaluateBinop - One of Op0/Op1 is a constant expression. /// Attempt to symbolically evaluate the result of a binary operator merging -/// these together. If target data info is available, it is provided as TD, -/// otherwise TD is null. +/// these together. If target data info is available, it is provided as DL, +/// otherwise DL is null. static Constant *SymbolicallyEvaluateBinop(unsigned Opc, Constant *Op0, - Constant *Op1, const DataLayout *TD){ + Constant *Op1, const DataLayout *DL){ // SROA // Fold (and 0xffffffff00000000, (shl x, 32)) -> shl. @@ -561,17 +549,44 @@ static Constant *SymbolicallyEvaluateBinop(unsigned Opc, Constant *Op0, // bits. + if (Opc == Instruction::And && DL) { + unsigned BitWidth = DL->getTypeSizeInBits(Op0->getType()); + APInt KnownZero0(BitWidth, 0), KnownOne0(BitWidth, 0); + APInt KnownZero1(BitWidth, 0), KnownOne1(BitWidth, 0); + ComputeMaskedBits(Op0, KnownZero0, KnownOne0, DL); + ComputeMaskedBits(Op1, KnownZero1, KnownOne1, DL); + if ((KnownOne1 | KnownZero0).isAllOnesValue()) { + // All the bits of Op0 that the 'and' could be masking are already zero. 
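// (Illustrative case, not from the patch: with i64 operands Op0 = shl X, 32
// and Op1 = 0xFFFFFFFF00000000, the low 32 bits of Op0 are known zero and the
// mask keeps only the high 32 bits, so KnownOne1 | KnownZero0 is all ones and
// the 'and' folds to Op0, matching the '(and 0xffffffff00000000, (shl x, 32))
// -> shl' fold cited above.)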
+ return Op0; + } + if ((KnownOne0 | KnownZero1).isAllOnesValue()) { + // All the bits of Op1 that the 'and' could be masking are already zero. + return Op1; + } + + APInt KnownZero = KnownZero0 | KnownZero1; + APInt KnownOne = KnownOne0 & KnownOne1; + if ((KnownZero | KnownOne).isAllOnesValue()) { + return ConstantInt::get(Op0->getType(), KnownOne); + } + } + // If the constant expr is something like &A[123] - &A[4].f, fold this into a // constant. This happens frequently when iterating over a global array. - if (Opc == Instruction::Sub && TD) { + if (Opc == Instruction::Sub && DL) { GlobalValue *GV1, *GV2; - int64_t Offs1, Offs2; + unsigned PtrSize = DL->getPointerSizeInBits(); + unsigned OpSize = DL->getTypeSizeInBits(Op0->getType()); + APInt Offs1(PtrSize, 0), Offs2(PtrSize, 0); - if (IsConstantOffsetFromGlobal(Op0, GV1, Offs1, *TD)) - if (IsConstantOffsetFromGlobal(Op1, GV2, Offs2, *TD) && + if (IsConstantOffsetFromGlobal(Op0, GV1, Offs1, *DL)) + if (IsConstantOffsetFromGlobal(Op1, GV2, Offs2, *DL) && GV1 == GV2) { // (&GV+C1) - (&GV+C2) -> C1-C2, pointer arithmetic cannot overflow. - return ConstantInt::get(Op0->getType(), Offs1-Offs2); + // PtrToInt may change the bitwidth so we have to convert to the right size + // first. + return ConstantInt::get(Op0->getType(), Offs1.zextOrTrunc(OpSize) - + Offs2.zextOrTrunc(OpSize)); } } @@ -1104,6 +1119,13 @@ Constant *llvm::ConstantFoldLoadThroughGEPIndices(Constant *C, bool llvm::canConstantFoldCallTo(const Function *F) { switch (F->getIntrinsicID()) { + case Intrinsic::fabs: + case Intrinsic::log: + case Intrinsic::log2: + case Intrinsic::log10: + case Intrinsic::exp: + case Intrinsic::exp2: + case Intrinsic::floor: case Intrinsic::sqrt: case Intrinsic::pow: case Intrinsic::powi: @@ -1142,8 +1164,7 @@ llvm::canConstantFoldCallTo(const Function *F) { switch (Name[0]) { default: return false; case 'a': - return Name == "acos" || Name == "asin" || - Name == "atan" || Name == "atan2"; + return Name == "acos" || Name == "asin" || Name == "atan" || Name == "atan2"; case 'c': return Name == "cos" || Name == "ceil" || Name == "cosf" || Name == "cosh"; case 'e': @@ -1171,11 +1192,17 @@ static Constant *ConstantFoldFP(double (*NativeFP)(double), double V, return 0; } + if (Ty->isHalfTy()) { + APFloat APF(V); + bool unused; + APF.convert(APFloat::IEEEhalf, APFloat::rmNearestTiesToEven, &unused); + return ConstantFP::get(Ty->getContext(), APF); + } if (Ty->isFloatTy()) return ConstantFP::get(Ty->getContext(), APFloat((float)V)); if (Ty->isDoubleTy()) return ConstantFP::get(Ty->getContext(), APFloat(V)); - llvm_unreachable("Can only constant fold float/double"); + llvm_unreachable("Can only constant fold half/float/double"); } static Constant *ConstantFoldBinaryFP(double (*NativeFP)(double, double), @@ -1187,11 +1214,17 @@ static Constant *ConstantFoldBinaryFP(double (*NativeFP)(double, double), return 0; } + if (Ty->isHalfTy()) { + APFloat APF(V); + bool unused; + APF.convert(APFloat::IEEEhalf, APFloat::rmNearestTiesToEven, &unused); + return ConstantFP::get(Ty->getContext(), APF); + } if (Ty->isFloatTy()) return ConstantFP::get(Ty->getContext(), APFloat((float)V)); if (Ty->isDoubleTy()) return ConstantFP::get(Ty->getContext(), APFloat(V)); - llvm_unreachable("Can only constant fold float/double"); + llvm_unreachable("Can only constant fold half/float/double"); } /// ConstantFoldConvertToInt - Attempt to fold an SSE floating point to integer @@ -1243,7 +1276,7 @@ llvm::ConstantFoldCall(Function *F, ArrayRef<Constant *> Operands, if (!TLI) return 0; - if
(!Ty->isFloatTy() && !Ty->isDoubleTy()) + if (!Ty->isHalfTy() && !Ty->isFloatTy() && !Ty->isDoubleTy()) return 0; /// We only fold functions with finite arguments. Folding NaN and inf is @@ -1256,8 +1289,46 @@ llvm::ConstantFoldCall(Function *F, ArrayRef<Constant *> Operands, /// the host native double versions. Float versions are not called /// directly but for all these it is true (float)(f((double)arg)) == /// f(arg). Long double not supported yet. - double V = Ty->isFloatTy() ? (double)Op->getValueAPF().convertToFloat() : - Op->getValueAPF().convertToDouble(); + double V; + if (Ty->isFloatTy()) + V = Op->getValueAPF().convertToFloat(); + else if (Ty->isDoubleTy()) + V = Op->getValueAPF().convertToDouble(); + else { + bool unused; + APFloat APF = Op->getValueAPF(); + APF.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven, &unused); + V = APF.convertToDouble(); + } + + switch (F->getIntrinsicID()) { + default: break; + case Intrinsic::fabs: + return ConstantFoldFP(fabs, V, Ty); +#if HAVE_LOG2 + case Intrinsic::log2: + return ConstantFoldFP(log2, V, Ty); +#endif +#if HAVE_LOG + case Intrinsic::log: + return ConstantFoldFP(log, V, Ty); +#endif +#if HAVE_LOG10 + case Intrinsic::log10: + return ConstantFoldFP(log10, V, Ty); +#endif +#if HAVE_EXP + case Intrinsic::exp: + return ConstantFoldFP(exp, V, Ty); +#endif +#if HAVE_EXP2 + case Intrinsic::exp2: + return ConstantFoldFP(exp2, V, Ty); +#endif + case Intrinsic::floor: + return ConstantFoldFP(floor, V, Ty); + } + switch (Name[0]) { case 'a': if (Name == "acos" && TLI->has(LibFunc::acos)) @@ -1299,7 +1370,7 @@ llvm::ConstantFoldCall(Function *F, ArrayRef<Constant *> Operands, else if (Name == "log10" && V > 0 && TLI->has(LibFunc::log10)) return ConstantFoldFP(log10, V, Ty); else if (F->getIntrinsicID() == Intrinsic::sqrt && - (Ty->isFloatTy() || Ty->isDoubleTy())) { + (Ty->isHalfTy() || Ty->isFloatTy() || Ty->isDoubleTy())) { if (V >= -0.0) return ConstantFoldFP(sqrt, V, Ty); else // Undefined @@ -1337,7 +1408,7 @@ llvm::ConstantFoldCall(Function *F, ArrayRef<Constant *> Operands, case Intrinsic::ctpop: return ConstantInt::get(Ty, Op->getValue().countPopulation()); case Intrinsic::convert_from_fp16: { - APFloat Val(Op->getValue()); + APFloat Val(APFloat::IEEEhalf, Op->getValue()); bool lost = false; APFloat::opStatus status = @@ -1391,18 +1462,35 @@ llvm::ConstantFoldCall(Function *F, ArrayRef<Constant *> Operands, if (Operands.size() == 2) { if (ConstantFP *Op1 = dyn_cast<ConstantFP>(Operands[0])) { - if (!Ty->isFloatTy() && !Ty->isDoubleTy()) + if (!Ty->isHalfTy() && !Ty->isFloatTy() && !Ty->isDoubleTy()) return 0; - double Op1V = Ty->isFloatTy() ? - (double)Op1->getValueAPF().convertToFloat() : - Op1->getValueAPF().convertToDouble(); + double Op1V; + if (Ty->isFloatTy()) + Op1V = Op1->getValueAPF().convertToFloat(); + else if (Ty->isDoubleTy()) + Op1V = Op1->getValueAPF().convertToDouble(); + else { + bool unused; + APFloat APF = Op1->getValueAPF(); + APF.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven, &unused); + Op1V = APF.convertToDouble(); + } + if (ConstantFP *Op2 = dyn_cast<ConstantFP>(Operands[1])) { if (Op2->getType() != Op1->getType()) return 0; - double Op2V = Ty->isFloatTy() ? 
- (double)Op2->getValueAPF().convertToFloat(): - Op2->getValueAPF().convertToDouble(); + double Op2V; + if (Ty->isFloatTy()) + Op2V = Op2->getValueAPF().convertToFloat(); + else if (Ty->isDoubleTy()) + Op2V = Op2->getValueAPF().convertToDouble(); + else { + bool unused; + APFloat APF = Op2->getValueAPF(); + APF.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven, &unused); + Op2V = APF.convertToDouble(); + } if (F->getIntrinsicID() == Intrinsic::pow) { return ConstantFoldBinaryFP(pow, Op1V, Op2V, Ty); @@ -1416,6 +1504,10 @@ llvm::ConstantFoldCall(Function *F, ArrayRef<Constant *> Operands, if (Name == "atan2" && TLI->has(LibFunc::atan2)) return ConstantFoldBinaryFP(atan2, Op1V, Op2V, Ty); } else if (ConstantInt *Op2C = dyn_cast<ConstantInt>(Operands[1])) { + if (F->getIntrinsicID() == Intrinsic::powi && Ty->isHalfTy()) + return ConstantFP::get(F->getContext(), + APFloat((float)std::pow((float)Op1V, + (int)Op2C->getZExtValue()))); if (F->getIntrinsicID() == Intrinsic::powi && Ty->isFloatTy()) return ConstantFP::get(F->getContext(), APFloat((float)std::pow((float)Op1V, @@ -1468,12 +1560,12 @@ llvm::ConstantFoldCall(Function *F, ArrayRef<Constant *> Operands, return ConstantStruct::get(cast<StructType>(F->getReturnType()), Ops); } case Intrinsic::cttz: - // FIXME: This should check for Op2 == 1, and become unreachable if - // Op1 == 0. + if (Op2->isOne() && Op1->isZero()) // cttz(0, 1) is undef. + return UndefValue::get(Ty); return ConstantInt::get(Ty, Op1->getValue().countTrailingZeros()); case Intrinsic::ctlz: - // FIXME: This should check for Op2 == 1, and become unreachable if - // Op1 == 0. + if (Op2->isOne() && Op1->isZero()) // ctlz(0, 1) is undef. + return UndefValue::get(Ty); return ConstantInt::get(Ty, Op1->getValue().countLeadingZeros()); } } diff --git a/lib/Analysis/CostModel.cpp b/lib/Analysis/CostModel.cpp index 1784512..44684a9 100644 --- a/lib/Analysis/CostModel.cpp +++ b/lib/Analysis/CostModel.cpp @@ -80,11 +80,23 @@ CostModelAnalysis::runOnFunction(Function &F) { return false; } +static bool isReverseVectorMask(SmallVector<int, 16> &Mask) { + for (unsigned i = 0, MaskSize = Mask.size(); i < MaskSize; ++i) + if (Mask[i] > 0 && Mask[i] != (int)(MaskSize - 1 - i)) + return false; + return true; +} + unsigned CostModelAnalysis::getInstructionCost(const Instruction *I) const { if (!TTI) return -1; switch (I->getOpcode()) { + case Instruction::GetElementPtr:{ + Type *ValTy = I->getOperand(0)->getType()->getPointerElementType(); + return TTI->getAddressComputationCost(ValTy); + } + case Instruction::Ret: case Instruction::PHI: case Instruction::Br: { @@ -166,6 +178,17 @@ unsigned CostModelAnalysis::getInstructionCost(const Instruction *I) const { return TTI->getVectorInstrCost(I->getOpcode(), IE->getType(), Idx); } + case Instruction::ShuffleVector: { + const ShuffleVectorInst *Shuffle = cast<ShuffleVectorInst>(I); + Type *VecTypOp0 = Shuffle->getOperand(0)->getType(); + unsigned NumVecElems = VecTypOp0->getVectorNumElements(); + SmallVector<int, 16> Mask = Shuffle->getShuffleMask(); + + if (NumVecElems == Mask.size() && isReverseVectorMask(Mask)) + return TTI->getShuffleCost(TargetTransformInfo::SK_Reverse, VecTypOp0, 0, + 0); + return -1; + } default: // We don't have any information on this instruction. 
return -1; diff --git a/lib/Analysis/IPA/CMakeLists.txt b/lib/Analysis/IPA/CMakeLists.txt index 34d6d1b..67b4135 100644 --- a/lib/Analysis/IPA/CMakeLists.txt +++ b/lib/Analysis/IPA/CMakeLists.txt @@ -1,9 +1,11 @@ add_llvm_library(LLVMipa CallGraph.cpp CallGraphSCCPass.cpp + CallPrinter.cpp FindUsedTypes.cpp GlobalsModRef.cpp IPA.cpp + InlineCost.cpp ) add_dependencies(LLVMipa intrinsics_gen) diff --git a/lib/Analysis/IPA/CallPrinter.cpp b/lib/Analysis/IPA/CallPrinter.cpp new file mode 100644 index 0000000..306ae7a --- /dev/null +++ b/lib/Analysis/IPA/CallPrinter.cpp @@ -0,0 +1,87 @@ +//===- CallPrinter.cpp - DOT printer for call graph -----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines '-dot-callgraph', which emits a callgraph.<fnname>.dot +// containing the call graph of a module. +// +// There is also a pass available to directly call dotty ('-view-callgraph'). +// +//===----------------------------------------------------------------------===// + +#include "llvm/Analysis/CallGraph.h" +#include "llvm/Analysis/CallPrinter.h" +#include "llvm/Analysis/DOTGraphTraitsPass.h" + +using namespace llvm; + +namespace llvm { + +template<> +struct DOTGraphTraits<CallGraph*> : public DefaultDOTGraphTraits { + DOTGraphTraits (bool isSimple=false) : DefaultDOTGraphTraits(isSimple) {} + + static std::string getGraphName(CallGraph *Graph) { + return "Call graph"; + } + + std::string getNodeLabel(CallGraphNode *Node, CallGraph *Graph) { + if (Function *Func = Node->getFunction()) + return Func->getName(); + + return "external node"; + } +}; + +} // end llvm namespace + +namespace { + +struct CallGraphViewer + : public DOTGraphTraitsModuleViewer<CallGraph, true> { + static char ID; + + CallGraphViewer() + : DOTGraphTraitsModuleViewer<CallGraph, true>("callgraph", ID) { + initializeCallGraphViewerPass(*PassRegistry::getPassRegistry()); + } +}; + +struct CallGraphPrinter + : public DOTGraphTraitsModulePrinter<CallGraph, true> { + static char ID; + + CallGraphPrinter() + : DOTGraphTraitsModulePrinter<CallGraph, true>("callgraph", ID) { + initializeCallGraphPrinterPass(*PassRegistry::getPassRegistry()); + } +}; + +} // end anonymous namespace + +char CallGraphViewer::ID = 0; +INITIALIZE_PASS(CallGraphViewer, "view-callgraph", + "View call graph", + false, false) + +char CallGraphPrinter::ID = 0; +INITIALIZE_PASS(CallGraphPrinter, "dot-callgraph", + "Print call graph to 'dot' file", + false, false) + +// Create methods available outside of this file, to use them in +// "include/llvm/LinkAllPasses.h". Otherwise the pass would be deleted by +// the link time optimization.
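// A minimal sketch of the usual forcing idiom (assuming the standard
// LinkAllPasses.h pattern; only the creator names below come from this file):
//
//   (void) llvm::createCallGraphViewerPass();
//   (void) llvm::createCallGraphPrinterPass();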
+ +ModulePass *llvm::createCallGraphViewerPass() { + return new CallGraphViewer(); +} + +ModulePass *llvm::createCallGraphPrinterPass() { + return new CallGraphPrinter(); +} diff --git a/lib/Analysis/IPA/IPA.cpp b/lib/Analysis/IPA/IPA.cpp index 0ba2e04..aa5164e 100644 --- a/lib/Analysis/IPA/IPA.cpp +++ b/lib/Analysis/IPA/IPA.cpp @@ -20,6 +20,8 @@ using namespace llvm; void llvm::initializeIPA(PassRegistry &Registry) { initializeBasicCallGraphPass(Registry); initializeCallGraphAnalysisGroup(Registry); + initializeCallGraphPrinterPass(Registry); + initializeCallGraphViewerPass(Registry); initializeFindUsedTypesPass(Registry); initializeGlobalsModRefPass(Registry); } diff --git a/lib/Analysis/InlineCost.cpp b/lib/Analysis/IPA/InlineCost.cpp index 6e5c035..3292e00 100644 --- a/lib/Analysis/InlineCost.cpp +++ b/lib/Analysis/IPA/InlineCost.cpp @@ -20,6 +20,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/GlobalAlias.h" @@ -44,6 +45,9 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> { // DataLayout if available, or null. const DataLayout *const TD; + /// The TargetTransformInfo available for this compilation. + const TargetTransformInfo &TTI; + // The called function. Function &F; @@ -130,16 +134,17 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> { bool visitCallSite(CallSite CS); public: - CallAnalyzer(const DataLayout *TD, Function &Callee, int Threshold) - : TD(TD), F(Callee), Threshold(Threshold), Cost(0), - IsCallerRecursive(false), IsRecursiveCall(false), - ExposesReturnsTwice(false), HasDynamicAlloca(false), ContainsNoDuplicateCall(false), - AllocatedSize(0), NumInstructions(0), NumVectorInstructions(0), - FiftyPercentVectorBonus(0), TenPercentVectorBonus(0), VectorBonus(0), - NumConstantArgs(0), NumConstantOffsetPtrArgs(0), NumAllocaArgs(0), - NumConstantPtrCmps(0), NumConstantPtrDiffs(0), - NumInstructionsSimplified(0), SROACostSavings(0), SROACostSavingsLost(0) { - } + CallAnalyzer(const DataLayout *TD, const TargetTransformInfo &TTI, + Function &Callee, int Threshold) + : TD(TD), TTI(TTI), F(Callee), Threshold(Threshold), Cost(0), + IsCallerRecursive(false), IsRecursiveCall(false), + ExposesReturnsTwice(false), HasDynamicAlloca(false), + ContainsNoDuplicateCall(false), AllocatedSize(0), NumInstructions(0), + NumVectorInstructions(0), FiftyPercentVectorBonus(0), + TenPercentVectorBonus(0), VectorBonus(0), NumConstantArgs(0), + NumConstantOffsetPtrArgs(0), NumAllocaArgs(0), NumConstantPtrCmps(0), + NumConstantPtrDiffs(0), NumInstructionsSimplified(0), + SROACostSavings(0), SROACostSavingsLost(0) {} bool analyzeCall(CallSite CS); @@ -417,7 +422,7 @@ bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) { if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt)) SROAArgValues[&I] = SROAArg; - return isInstructionFree(&I, TD); + return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I); } bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) { @@ -447,7 +452,7 @@ bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) { if (lookupSROAArgAndCost(Op, SROAArg, CostIt)) SROAArgValues[&I] = SROAArg; - return isInstructionFree(&I, TD); + return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I); } bool CallAnalyzer::visitCastInst(CastInst &I) { @@ -464,7 +469,7 @@ bool CallAnalyzer::visitCastInst(CastInst &I) { // Disable SROA in the face of arbitrary casts we don't 
whitelist elsewhere. disableSROA(I.getOperand(0)); - return isInstructionFree(&I, TD); + return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I); } bool CallAnalyzer::visitUnaryInstruction(UnaryInstruction &I) { @@ -731,7 +736,7 @@ bool CallAnalyzer::visitCallSite(CallSite CS) { return false; } - if (!callIsSmall(CS)) { + if (TTI.isLoweredToCall(F)) { // We account for the average 1 instruction per call argument setup // here. Cost += CS.arg_size() * InlineConstants::InstrCost; @@ -764,7 +769,7 @@ bool CallAnalyzer::visitCallSite(CallSite CS) { // during devirtualization and so we want to give it a hefty bonus for // inlining, but cap that bonus in the event that inlining wouldn't pan // out. Pretend to inline the function, with a custom threshold. - CallAnalyzer CA(TD, *F, InlineConstants::IndirectCallThreshold); + CallAnalyzer CA(TD, TTI, *F, InlineConstants::IndirectCallThreshold); if (CA.analyzeCall(CS)) { // We were able to inline the indirect call! Subtract the cost from the // bonus we want to apply, but don't go below zero. @@ -777,7 +782,7 @@ bool CallAnalyzer::visitCallSite(CallSite CS) { bool CallAnalyzer::visitInstruction(Instruction &I) { // Some instructions are free. All of the free intrinsics can also be // handled by SROA, etc. - if (isInstructionFree(&I, TD)) + if (TargetTransformInfo::TCC_Free == TTI.getUserCost(&I)) return true; // We found something we don't understand or can't handle. Mark any SROA-able @@ -1132,11 +1137,35 @@ void CallAnalyzer::dump() { } #endif -InlineCost InlineCostAnalyzer::getInlineCost(CallSite CS, int Threshold) { +INITIALIZE_PASS_BEGIN(InlineCostAnalysis, "inline-cost", "Inline Cost Analysis", + true, true) +INITIALIZE_AG_DEPENDENCY(TargetTransformInfo) +INITIALIZE_PASS_END(InlineCostAnalysis, "inline-cost", "Inline Cost Analysis", + true, true) + +char InlineCostAnalysis::ID = 0; + +InlineCostAnalysis::InlineCostAnalysis() : CallGraphSCCPass(ID), TD(0) {} + +InlineCostAnalysis::~InlineCostAnalysis() {} + +void InlineCostAnalysis::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesAll(); + AU.addRequired<TargetTransformInfo>(); + CallGraphSCCPass::getAnalysisUsage(AU); +} + +bool InlineCostAnalysis::runOnSCC(CallGraphSCC &SCC) { + TD = getAnalysisIfAvailable<DataLayout>(); + TTI = &getAnalysis<TargetTransformInfo>(); + return false; +} + +InlineCost InlineCostAnalysis::getInlineCost(CallSite CS, int Threshold) { return getInlineCost(CS, CS.getCalledFunction(), Threshold); } -InlineCost InlineCostAnalyzer::getInlineCost(CallSite CS, Function *Callee, +InlineCost InlineCostAnalysis::getInlineCost(CallSite CS, Function *Callee, int Threshold) { // Cannot inline indirect calls. 
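// (That is, call sites where CS.getCalledFunction() yields a null Callee,
// such as calls through a function pointer.)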
if (!Callee) @@ -1163,7 +1192,7 @@ InlineCost InlineCostAnalyzer::getInlineCost(CallSite CS, Function *Callee, DEBUG(llvm::dbgs() << " Analyzing call of " << Callee->getName() << "...\n"); - CallAnalyzer CA(TD, *Callee, Threshold); + CallAnalyzer CA(TD, *TTI, *Callee, Threshold); bool ShouldInline = CA.analyzeCall(CS); DEBUG(CA.dump()); @@ -1177,9 +1206,10 @@ InlineCost InlineCostAnalyzer::getInlineCost(CallSite CS, Function *Callee, return llvm::InlineCost::get(CA.getCost(), CA.getThreshold()); } -bool InlineCostAnalyzer::isInlineViable(Function &F) { - bool ReturnsTwice =F.getAttributes().hasAttribute(AttributeSet::FunctionIndex, - Attribute::ReturnsTwice); +bool InlineCostAnalysis::isInlineViable(Function &F) { + bool ReturnsTwice = + F.getAttributes().hasAttribute(AttributeSet::FunctionIndex, + Attribute::ReturnsTwice); for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) { // Disallow inlining of functions which contain an indirect branch. if (isa<IndirectBrInst>(BI->getTerminator())) diff --git a/lib/Analysis/InstructionSimplify.cpp b/lib/Analysis/InstructionSimplify.cpp index d97e226..4a3c74e 100644 --- a/lib/Analysis/InstructionSimplify.cpp +++ b/lib/Analysis/InstructionSimplify.cpp @@ -21,10 +21,10 @@ #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/GlobalAlias.h" #include "llvm/IR/Operator.h" @@ -663,12 +663,20 @@ Value *llvm::SimplifyAddInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW, /// accumulates the total constant offset applied in the returned constant. It /// returns 0 if V is not a pointer, and returns the constant '0' if there are /// no constant offsets applied. -static Constant *stripAndComputeConstantOffsets(const DataLayout &TD, +/// +/// This is very similar to GetPointerBaseWithConstantOffset except it doesn't +/// follow non-inbounds geps. This allows it to remain usable for icmp ult/etc. +/// folding. +static Constant *stripAndComputeConstantOffsets(const DataLayout *TD, Value *&V) { - if (!V->getType()->isPointerTy()) - return 0; + assert(V->getType()->getScalarType()->isPointerTy()); + + // Without DataLayout, just be conservative for now. Theoretically, more could + // be done in this case. 
+ if (!TD) + return ConstantInt::get(IntegerType::get(V->getContext(), 64), 0); - unsigned IntPtrWidth = TD.getPointerSizeInBits(); + unsigned IntPtrWidth = TD->getPointerSizeInBits(); APInt Offset = APInt::getNullValue(IntPtrWidth); // Even though we don't look through PHI nodes, we could be called on an @@ -677,7 +685,7 @@ static Constant *stripAndComputeConstantOffsets(const DataLayout &TD, Visited.insert(V); do { if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) { - if (!GEP->isInBounds() || !GEP->accumulateConstantOffset(TD, Offset)) + if (!GEP->isInBounds() || !GEP->accumulateConstantOffset(*TD, Offset)) break; V = GEP->getPointerOperand(); } else if (Operator::getOpcode(V) == Instruction::BitCast) { @@ -689,23 +697,24 @@ static Constant *stripAndComputeConstantOffsets(const DataLayout &TD, } else { break; } - assert(V->getType()->isPointerTy() && "Unexpected operand type!"); + assert(V->getType()->getScalarType()->isPointerTy() && + "Unexpected operand type!"); } while (Visited.insert(V)); - Type *IntPtrTy = TD.getIntPtrType(V->getContext()); - return ConstantInt::get(IntPtrTy, Offset); + Type *IntPtrTy = TD->getIntPtrType(V->getContext()); + Constant *OffsetIntPtr = ConstantInt::get(IntPtrTy, Offset); + if (V->getType()->isVectorTy()) + return ConstantVector::getSplat(V->getType()->getVectorNumElements(), + OffsetIntPtr); + return OffsetIntPtr; } /// \brief Compute the constant difference between two pointer values. /// If the difference is not a constant, returns zero. -static Constant *computePointerDifference(const DataLayout &TD, +static Constant *computePointerDifference(const DataLayout *TD, Value *LHS, Value *RHS) { Constant *LHSOffset = stripAndComputeConstantOffsets(TD, LHS); - if (!LHSOffset) - return 0; Constant *RHSOffset = stripAndComputeConstantOffsets(TD, RHS); - if (!RHSOffset) - return 0; // If LHS and RHS are not related via constant offsets to the same base // value, there is nothing we can do here. @@ -819,9 +828,9 @@ static Value *SimplifySubInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW, return W; // Variations on GEP(base, I, ...) - GEP(base, i, ...) -> GEP(null, I-i, ...). - if (Q.TD && match(Op0, m_PtrToInt(m_Value(X))) && + if (match(Op0, m_PtrToInt(m_Value(X))) && match(Op1, m_PtrToInt(m_Value(Y)))) - if (Constant *Result = computePointerDifference(*Q.TD, X, Y)) + if (Constant *Result = computePointerDifference(Q.TD, X, Y)) return ConstantExpr::getIntegerCast(Result, Op0->getType(), true); // Mul distributes over Sub. Try some generic simplifications based on this. @@ -1684,9 +1693,48 @@ static Value *ExtractEquivalentCondition(Value *V, CmpInst::Predicate Pred, return 0; } -static Constant *computePointerICmp(const DataLayout &TD, +// A significant optimization not implemented here is assuming that alloca +// addresses are not equal to incoming argument values. They don't *alias*, +// as we say, but that doesn't mean they aren't equal, so we take a +// conservative approach. +// +// This is inspired in part by C++11 5.10p1: +// "Two pointers of the same type compare equal if and only if they are both +// null, both point to the same function, or both represent the same +// address." +// +// This is pretty permissive. 
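// (Illustrative reading, not from the patch: for adjacent objects a and b, a
// pointer one past the end of a may 'represent the same address' as &b, so
// C++11 permits those two pointers to compare equal.)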
+// +// It's also partly due to C11 6.5.9p6: +// "Two pointers compare equal if and only if both are null pointers, both are +// pointers to the same object (including a pointer to an object and a +// subobject at its beginning) or function, both are pointers to one past the +// last element of the same array object, or one is a pointer to one past the +// end of one array object and the other is a pointer to the start of a +// different array object that happens to immediately follow the first array +// object in the address space.) +// +// C11's version is more restrictive, however there's no reason why an argument +// couldn't be a one-past-the-end value for a stack object in the caller and be +// equal to the beginning of a stack object in the callee. +// +// If the C and C++ standards are ever made sufficiently restrictive in this +// area, it may be possible to update LLVM's semantics accordingly and reinstate +// this optimization. +static Constant *computePointerICmp(const DataLayout *TD, + const TargetLibraryInfo *TLI, CmpInst::Predicate Pred, Value *LHS, Value *RHS) { + // First, skip past any trivial no-ops. + LHS = LHS->stripPointerCasts(); + RHS = RHS->stripPointerCasts(); + + // A non-null pointer is not equal to a null pointer. + if (llvm::isKnownNonNull(LHS) && isa<ConstantPointerNull>(RHS) && + (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE)) + return ConstantInt::get(GetCompareTy(LHS), + !CmpInst::isTrueWhenEqual(Pred)); + // We can only fold certain predicates on pointer comparisons. switch (Pred) { default: @@ -1709,19 +1757,83 @@ static Constant *computePointerICmp(const DataLayout &TD, break; } + // Strip off any constant offsets so that we can reason about them. + // It's tempting to use getUnderlyingObject or even just stripInBoundsOffsets + // here and compare base addresses like AliasAnalysis does, however there are + // numerous hazards. AliasAnalysis and its utilities rely on special rules + // governing loads and stores which don't apply to icmps. Also, AliasAnalysis + // doesn't need to guarantee pointer inequality when it says NoAlias. Constant *LHSOffset = stripAndComputeConstantOffsets(TD, LHS); - if (!LHSOffset) - return 0; Constant *RHSOffset = stripAndComputeConstantOffsets(TD, RHS); - if (!RHSOffset) - return 0; - // If LHS and RHS are not related via constant offsets to the same base - // value, there is nothing we can do here. - if (LHS != RHS) - return 0; + // If LHS and RHS are related via constant offsets to the same base + // value, we can replace it with an icmp which just compares the offsets. + if (LHS == RHS) + return ConstantExpr::getICmp(Pred, LHSOffset, RHSOffset); + + // Various optimizations for (in)equality comparisons. + if (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) { + // Different non-empty allocations that exist at the same time have + // different addresses (if the program can tell). Global variables always + // exist, so they always exist during the lifetime of each other and all + // allocas. Two different allocas usually have different addresses... + // + // However, if there's an @llvm.stackrestore dynamically in between two + // allocas, they may have the same address. It's tempting to reduce the + // scope of the problem by only looking at *static* allocas here. That would + // cover the majority of allocas while significantly reducing the likelihood + // of having an @llvm.stackrestore pop up in the middle. 
However, it's not + // actually impossible for an @llvm.stackrestore to pop up in the middle of + // an entry block. Also, if we have a block that's not attached to a + // function, we can't tell if it's "static" under the current definition. + // Theoretically, this problem could be fixed by creating a new + // instruction kind specifically for static allocas. Such a new instruction + // could be required to be at the top of the entry block, thus preventing it + // from being subject to a @llvm.stackrestore. Instcombine could even + // convert regular allocas into these special allocas. It'd be nifty. + // However, until then, this problem remains open. + // + // So, we'll assume that two non-empty allocas have different addresses + // for now. + // + // With all that, if the offsets are within the bounds of their allocations + // (and not one-past-the-end! so we can't use inbounds!), and their + // allocations aren't the same, the pointers are not equal. + // + // Note that it's not necessary to check for LHS being a global variable + // address, due to canonicalization and constant folding. + if (isa<AllocaInst>(LHS) && + (isa<AllocaInst>(RHS) || isa<GlobalVariable>(RHS))) { + ConstantInt *LHSOffsetCI = dyn_cast<ConstantInt>(LHSOffset); + ConstantInt *RHSOffsetCI = dyn_cast<ConstantInt>(RHSOffset); + uint64_t LHSSize, RHSSize; + if (LHSOffsetCI && RHSOffsetCI && + getObjectSize(LHS, LHSSize, TD, TLI) && + getObjectSize(RHS, RHSSize, TD, TLI)) { + const APInt &LHSOffsetValue = LHSOffsetCI->getValue(); + const APInt &RHSOffsetValue = RHSOffsetCI->getValue(); + if (!LHSOffsetValue.isNegative() && + !RHSOffsetValue.isNegative() && + LHSOffsetValue.ult(LHSSize) && + RHSOffsetValue.ult(RHSSize)) { + return ConstantInt::get(GetCompareTy(LHS), + !CmpInst::isTrueWhenEqual(Pred)); + } + } + + // Repeat the above check but this time without depending on DataLayout + // or being able to compute a precise size. + if (!cast<PointerType>(LHS->getType())->isEmptyTy() && + !cast<PointerType>(RHS->getType())->isEmptyTy() && + LHSOffset->isNullValue() && + RHSOffset->isNullValue()) + return ConstantInt::get(GetCompareTy(LHS), + !CmpInst::isTrueWhenEqual(Pred)); + } + } - return ConstantExpr::getICmp(Pred, LHSOffset, RHSOffset); + // Otherwise, fail. + return 0; } /// SimplifyICmpInst - Given operands for an ICmpInst, see if we can @@ -1786,62 +1898,6 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, } } - // icmp <object*>, <object*/null> - Different identified objects have - // different addresses (unless null), and what's more the address of an - // identified local is never equal to another argument (again, barring null). - // Note that generalizing to the case where LHS is a global variable address - // or null is pointless, since if both LHS and RHS are constants then we - // already constant folded the compare, and if only one of them is then we - // moved it to RHS already. - Value *LHSPtr = LHS->stripPointerCasts(); - Value *RHSPtr = RHS->stripPointerCasts(); - if (LHSPtr == RHSPtr) - return ConstantInt::get(ITy, CmpInst::isTrueWhenEqual(Pred)); - - // Be more aggressive about stripping pointer adjustments when checking a - // comparison of an alloca address to another object. We can rip off all - // inbounds GEP operations, even if they are variable.
- LHSPtr = LHSPtr->stripInBoundsOffsets(); - if (llvm::isIdentifiedObject(LHSPtr)) { - RHSPtr = RHSPtr->stripInBoundsOffsets(); - if (llvm::isKnownNonNull(LHSPtr) || llvm::isKnownNonNull(RHSPtr)) { - // If both sides are different identified objects, they aren't equal - // unless they're null. - if (LHSPtr != RHSPtr && llvm::isIdentifiedObject(RHSPtr) && - Pred == CmpInst::ICMP_EQ) - return ConstantInt::get(ITy, false); - - // A local identified object (alloca or noalias call) can't equal any - // incoming argument, unless they're both null or they belong to - // different functions. The latter happens during inlining. - if (Instruction *LHSInst = dyn_cast<Instruction>(LHSPtr)) - if (Argument *RHSArg = dyn_cast<Argument>(RHSPtr)) - if (LHSInst->getParent()->getParent() == RHSArg->getParent() && - Pred == CmpInst::ICMP_EQ) - return ConstantInt::get(ITy, false); - } - - // Assume that the constant null is on the right. - if (llvm::isKnownNonNull(LHSPtr) && isa<ConstantPointerNull>(RHSPtr)) { - if (Pred == CmpInst::ICMP_EQ) - return ConstantInt::get(ITy, false); - else if (Pred == CmpInst::ICMP_NE) - return ConstantInt::get(ITy, true); - } - } else if (Argument *LHSArg = dyn_cast<Argument>(LHSPtr)) { - RHSPtr = RHSPtr->stripInBoundsOffsets(); - // An alloca can't be equal to an argument unless they come from separate - // functions via inlining. - if (AllocaInst *RHSInst = dyn_cast<AllocaInst>(RHSPtr)) { - if (LHSArg->getParent() == RHSInst->getParent()->getParent()) { - if (Pred == CmpInst::ICMP_EQ) - return ConstantInt::get(ITy, false); - else if (Pred == CmpInst::ICMP_NE) - return ConstantInt::get(ITy, true); - } - } - } - // If we are comparing with zero then try hard since this is a common case. if (match(RHS, m_Zero())) { bool LHSKnownNonNegative, LHSKnownNegative; @@ -2468,8 +2524,8 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, // Simplify comparisons of related pointers using a powerful, recursive // GEP-walk when we have target data available.. 
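// (Illustrative case, not from the patch: an icmp of &A[4] against &A[123]
// strips both sides to the base of A, so the compare reduces to the two
// constant byte offsets.)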
- if (Q.TD && LHS->getType()->isPointerTy() && RHS->getType()->isPointerTy()) - if (Constant *C = computePointerICmp(*Q.TD, Pred, LHS, RHS)) + if (LHS->getType()->isPointerTy()) + if (Constant *C = computePointerICmp(Q.TD, Q.TLI, Pred, LHS, RHS)) return C; if (GetElementPtrInst *GLHS = dyn_cast<GetElementPtrInst>(LHS)) { @@ -2869,6 +2925,37 @@ Value *llvm::SimplifyCmpInst(unsigned Predicate, Value *LHS, Value *RHS, RecursionLimit); } +static bool IsIdempotent(Intrinsic::ID ID) { + switch (ID) { + default: return false; + + // Unary idempotent: f(f(x)) = f(x) + case Intrinsic::fabs: + case Intrinsic::floor: + case Intrinsic::ceil: + case Intrinsic::trunc: + case Intrinsic::rint: + case Intrinsic::nearbyint: + return true; + } +} + +template <typename IterTy> +static Value *SimplifyIntrinsic(Intrinsic::ID IID, IterTy ArgBegin, IterTy ArgEnd, + const Query &Q, unsigned MaxRecurse) { + // Perform idempotent optimizations + if (!IsIdempotent(IID)) + return 0; + + // Unary Ops + if (std::distance(ArgBegin, ArgEnd) == 1) + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(*ArgBegin)) + if (II->getIntrinsicID() == IID) + return II; + + return 0; +} + template <typename IterTy> static Value *SimplifyCall(Value *V, IterTy ArgBegin, IterTy ArgEnd, const Query &Q, unsigned MaxRecurse) { @@ -2885,6 +2972,11 @@ static Value *SimplifyCall(Value *V, IterTy ArgBegin, IterTy ArgEnd, if (!F) return 0; + if (unsigned IID = F->getIntrinsicID()) + if (Value *Ret = + SimplifyIntrinsic((Intrinsic::ID) IID, ArgBegin, ArgEnd, Q, MaxRecurse)) + return Ret; + if (!canConstantFoldCallTo(F)) return 0; diff --git a/lib/Analysis/LazyValueInfo.cpp b/lib/Analysis/LazyValueInfo.cpp index 1c94d10..66b5e85 100644 --- a/lib/Analysis/LazyValueInfo.cpp +++ b/lib/Analysis/LazyValueInfo.cpp @@ -16,7 +16,6 @@ #include "llvm/Analysis/LazyValueInfo.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/STLExtras.h" -#include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Constants.h" diff --git a/lib/Analysis/Lint.cpp b/lib/Analysis/Lint.cpp index fd10a6b..9393508 100644 --- a/lib/Analysis/Lint.cpp +++ b/lib/Analysis/Lint.cpp @@ -412,51 +412,49 @@ void Lint::visitMemoryReference(Instruction &I, } // Check for buffer overflows and misalignment. - if (TD) { - // Only handles memory references that read/write something simple like an - // alloca instruction or a global variable. - int64_t Offset = 0; - if (Value *Base = GetPointerBaseWithConstantOffset(Ptr, Offset, *TD)) { - // OK, so the access is to a constant offset from Ptr. Check that Ptr is - // something we can handle and if so extract the size of this base object - // along with its alignment. - uint64_t BaseSize = AliasAnalysis::UnknownSize; - unsigned BaseAlign = 0; - - if (AllocaInst *AI = dyn_cast<AllocaInst>(Base)) { - Type *ATy = AI->getAllocatedType(); - if (!AI->isArrayAllocation() && ATy->isSized()) - BaseSize = TD->getTypeAllocSize(ATy); - BaseAlign = AI->getAlignment(); - if (BaseAlign == 0 && ATy->isSized()) - BaseAlign = TD->getABITypeAlignment(ATy); - } else if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Base)) { - // If the global may be defined differently in another compilation unit - // then don't warn about funky memory accesses. 
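// (Editorial note: hasDefinitiveInitializer() is false for, e.g., weak
// definitions, where another translation unit could legitimately supply a
// global of different size or alignment.)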
- if (GV->hasDefinitiveInitializer()) { - Type *GTy = GV->getType()->getElementType(); - if (GTy->isSized()) - BaseSize = TD->getTypeAllocSize(GTy); - BaseAlign = GV->getAlignment(); - if (BaseAlign == 0 && GTy->isSized()) - BaseAlign = TD->getABITypeAlignment(GTy); - } + // Only handles memory references that read/write something simple like an + // alloca instruction or a global variable. + int64_t Offset = 0; + if (Value *Base = GetPointerBaseWithConstantOffset(Ptr, Offset, TD)) { + // OK, so the access is to a constant offset from Ptr. Check that Ptr is + // something we can handle and if so extract the size of this base object + // along with its alignment. + uint64_t BaseSize = AliasAnalysis::UnknownSize; + unsigned BaseAlign = 0; + + if (AllocaInst *AI = dyn_cast<AllocaInst>(Base)) { + Type *ATy = AI->getAllocatedType(); + if (TD && !AI->isArrayAllocation() && ATy->isSized()) + BaseSize = TD->getTypeAllocSize(ATy); + BaseAlign = AI->getAlignment(); + if (TD && BaseAlign == 0 && ATy->isSized()) + BaseAlign = TD->getABITypeAlignment(ATy); + } else if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Base)) { + // If the global may be defined differently in another compilation unit + // then don't warn about funky memory accesses. + if (GV->hasDefinitiveInitializer()) { + Type *GTy = GV->getType()->getElementType(); + if (TD && GTy->isSized()) + BaseSize = TD->getTypeAllocSize(GTy); + BaseAlign = GV->getAlignment(); + if (TD && BaseAlign == 0 && GTy->isSized()) + BaseAlign = TD->getABITypeAlignment(GTy); } - - // Accesses from before the start or after the end of the object are not - // defined. - Assert1(Size == AliasAnalysis::UnknownSize || - BaseSize == AliasAnalysis::UnknownSize || - (Offset >= 0 && Offset + Size <= BaseSize), - "Undefined behavior: Buffer overflow", &I); - - // Accesses that say that the memory is more aligned than it is are not - // defined. - if (Align == 0 && Ty && Ty->isSized()) - Align = TD->getABITypeAlignment(Ty); - Assert1(!BaseAlign || Align <= MinAlign(BaseAlign, Offset), - "Undefined behavior: Memory reference address is misaligned", &I); } + + // Accesses from before the start or after the end of the object are not + // defined. + Assert1(Size == AliasAnalysis::UnknownSize || + BaseSize == AliasAnalysis::UnknownSize || + (Offset >= 0 && Offset + Size <= BaseSize), + "Undefined behavior: Buffer overflow", &I); + + // Accesses that say that the memory is more aligned than it is are not + // defined. 
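// (Illustrative case, not from the patch: an i64 load annotated 'align 8'
// from a global whose actual alignment is only 4 would trip the misalignment
// assert below.)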
+ if (TD && Align == 0 && Ty && Ty->isSized()) + Align = TD->getABITypeAlignment(Ty); + Assert1(!BaseAlign || Align <= MinAlign(BaseAlign, Offset), + "Undefined behavior: Memory reference address is misaligned", &I); } } diff --git a/lib/Analysis/Loads.cpp b/lib/Analysis/Loads.cpp index 3158873..0902a39 100644 --- a/lib/Analysis/Loads.cpp +++ b/lib/Analysis/Loads.cpp @@ -57,8 +57,7 @@ bool llvm::isSafeToLoadUnconditionally(Value *V, Instruction *ScanFrom, unsigned Align, const DataLayout *TD) { int64_t ByteOffset = 0; Value *Base = V; - if (TD) - Base = GetPointerBaseWithConstantOffset(V, ByteOffset, *TD); + Base = GetPointerBaseWithConstantOffset(V, ByteOffset, TD); if (ByteOffset < 0) // out of bounds return false; diff --git a/lib/Analysis/LoopInfo.cpp b/lib/Analysis/LoopInfo.cpp index 4d4c627..f1ad650 100644 --- a/lib/Analysis/LoopInfo.cpp +++ b/lib/Analysis/LoopInfo.cpp @@ -24,6 +24,7 @@ #include "llvm/Assembly/Writer.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/Metadata.h" #include "llvm/Support/CFG.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -233,6 +234,55 @@ bool Loop::isSafeToClone() const { return true; } +bool Loop::isAnnotatedParallel() const { + + BasicBlock *latch = getLoopLatch(); + if (latch == NULL) + return false; + + MDNode *desiredLoopIdMetadata = + latch->getTerminator()->getMetadata("llvm.loop.parallel"); + + if (!desiredLoopIdMetadata) + return false; + + // The loop branch contains the parallel loop metadata. In order to ensure + // that any parallel-loop-unaware optimization pass hasn't added loop-carried + // dependencies (thus converted the loop back to a sequential loop), check + // that all the memory instructions in the loop contain parallelism metadata + // that point to the same unique "loop id metadata" the loop branch does. + for (block_iterator BB = block_begin(), BE = block_end(); BB != BE; ++BB) { + for (BasicBlock::iterator II = (*BB)->begin(), EE = (*BB)->end(); + II != EE; II++) { + + if (!II->mayReadOrWriteMemory()) + continue; + + if (!II->getMetadata("llvm.mem.parallel_loop_access")) + return false; + + // The memory instruction can refer to the loop identifier metadata + // directly or indirectly through another list metadata (in case of + // nested parallel loops). The loop identifier metadata refers to + // itself so we can check both cases with the same routine. + MDNode *loopIdMD = + dyn_cast<MDNode>(II->getMetadata("llvm.mem.parallel_loop_access")); + bool loopIdMDFound = false; + for (unsigned i = 0, e = loopIdMD->getNumOperands(); i < e; ++i) { + if (loopIdMD->getOperand(i) == desiredLoopIdMetadata) { + loopIdMDFound = true; + break; + } + } + + if (!loopIdMDFound) + return false; + } + } + return true; +} + + /// hasDedicatedExits - Return true if no exit block for the loop /// has a predecessor that is outside the loop. bool Loop::hasDedicatedExits() const { diff --git a/lib/Analysis/MemoryBuiltins.cpp b/lib/Analysis/MemoryBuiltins.cpp index f88affb..0fc0550 100644 --- a/lib/Analysis/MemoryBuiltins.cpp +++ b/lib/Analysis/MemoryBuiltins.cpp @@ -385,21 +385,16 @@ ObjectSizeOffsetVisitor::ObjectSizeOffsetVisitor(const DataLayout *TD, SizeOffsetType ObjectSizeOffsetVisitor::compute(Value *V) { V = V->stripPointerCasts(); - - if (isa<Instruction>(V) || isa<GEPOperator>(V)) { - // If we have already seen this instruction, bail out. - if (!SeenInsts.insert(V)) + if (Instruction *I = dyn_cast<Instruction>(V)) { + // If we have already seen this instruction, bail out. 
Cycles can happen in + // unreachable code after constant propagation. + if (!SeenInsts.insert(I)) return unknown(); - SizeOffsetType Ret; if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) - Ret = visitGEPOperator(*GEP); - else - Ret = visit(cast<Instruction>(*V)); - SeenInsts.erase(V); - return Ret; + return visitGEPOperator(*GEP); + return visit(*I); } - if (Argument *A = dyn_cast<Argument>(V)) return visitArgument(*A); if (ConstantPointerNull *P = dyn_cast<ConstantPointerNull>(V)) @@ -413,6 +408,8 @@ SizeOffsetType ObjectSizeOffsetVisitor::compute(Value *V) { if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V)) { if (CE->getOpcode() == Instruction::IntToPtr) return unknown(); // clueless + if (CE->getOpcode() == Instruction::GetElementPtr) + return visitGEPOperator(cast<GEPOperator>(*CE)); } DEBUG(dbgs() << "ObjectSizeOffsetVisitor::compute() unhandled value: " << *V @@ -546,21 +543,9 @@ SizeOffsetType ObjectSizeOffsetVisitor::visitLoadInst(LoadInst&) { return unknown(); } -SizeOffsetType ObjectSizeOffsetVisitor::visitPHINode(PHINode &PHI) { - if (PHI.getNumIncomingValues() == 0) - return unknown(); - - SizeOffsetType Ret = compute(PHI.getIncomingValue(0)); - if (!bothKnown(Ret)) - return unknown(); - - // verify that all PHI incoming pointers have the same size and offset - for (unsigned i = 1, e = PHI.getNumIncomingValues(); i != e; ++i) { - SizeOffsetType EdgeData = compute(PHI.getIncomingValue(i)); - if (!bothKnown(EdgeData) || EdgeData != Ret) - return unknown(); - } - return Ret; +SizeOffsetType ObjectSizeOffsetVisitor::visitPHINode(PHINode&) { + // too complex to analyze statically. + return unknown(); } SizeOffsetType ObjectSizeOffsetVisitor::visitSelectInst(SelectInst &I) { diff --git a/lib/Analysis/MemoryDependenceAnalysis.cpp b/lib/Analysis/MemoryDependenceAnalysis.cpp index eee7607..38bf5dd 100644 --- a/lib/Analysis/MemoryDependenceAnalysis.cpp +++ b/lib/Analysis/MemoryDependenceAnalysis.cpp @@ -262,7 +262,7 @@ isLoadLoadClobberIfExtendedToFullWidth(const AliasAnalysis::Location &MemLoc, // If we haven't already computed the base/offset of MemLoc, do so now. if (MemLocBase == 0) - MemLocBase = GetPointerBaseWithConstantOffset(MemLoc.Ptr, MemLocOffs, *TD); + MemLocBase = GetPointerBaseWithConstantOffset(MemLoc.Ptr, MemLocOffs, TD); unsigned Size = MemoryDependenceAnalysis:: getLoadLoadClobberFullWidthSize(MemLocBase, MemLocOffs, MemLoc.Size, @@ -283,11 +283,17 @@ getLoadLoadClobberFullWidthSize(const Value *MemLocBase, int64_t MemLocOffs, const DataLayout &TD) { // We can only extend simple integer loads. if (!isa<IntegerType>(LI->getType()) || !LI->isSimple()) return 0; + + // Load widening is hostile to ThreadSanitizer: it may cause false positives + // or make the reports more cryptic (access sizes are wrong). + if (LI->getParent()->getParent()->getAttributes(). + hasAttribute(AttributeSet::FunctionIndex, Attribute::SanitizeThread)) + return 0; // Get the base of this load. int64_t LIOffs = 0; const Value *LIBase = - GetPointerBaseWithConstantOffset(LI->getPointerOperand(), LIOffs, TD); + GetPointerBaseWithConstantOffset(LI->getPointerOperand(), LIOffs, &TD); // If the two pointers are not based on the same pointer, we can't tell that // they are related. @@ -328,7 +334,7 @@ getLoadLoadClobberFullWidthSize(const Value *MemLocBase, int64_t MemLocOffs, if (LIOffs+NewLoadByteSize > MemLocEnd && LI->getParent()->getParent()->getAttributes(). 
- hasAttribute(AttributeSet::FunctionIndex, Attribute::AddressSafety)) + hasAttribute(AttributeSet::FunctionIndex, Attribute::SanitizeAddress)) // We will be reading past the location accessed by the original program. // While this is safe in a regular build, Address Safety analysis tools // may start reporting false warnings. So, don't do widening. diff --git a/lib/Analysis/ProfileDataLoaderPass.cpp b/lib/Analysis/ProfileDataLoaderPass.cpp index 51b7f1d..2ee0093 100644 --- a/lib/Analysis/ProfileDataLoaderPass.cpp +++ b/lib/Analysis/ProfileDataLoaderPass.cpp @@ -177,8 +177,8 @@ bool ProfileMetadataLoaderPass::runOnModule(Module &M) { unsigned ReadCount = matchEdges(M, PB, Counters); if (ReadCount != Counters.size()) { - M.getContext().emitWarning("profile information is inconsistent " - "with the current program"); + errs() << "WARNING: profile information is inconsistent with " + << "the current program!\n"; } NumEdgesRead = ReadCount; diff --git a/lib/Analysis/ProfileInfoLoaderPass.cpp b/lib/Analysis/ProfileInfoLoaderPass.cpp index 094c107..346f8d6 100644 --- a/lib/Analysis/ProfileInfoLoaderPass.cpp +++ b/lib/Analysis/ProfileInfoLoaderPass.cpp @@ -19,7 +19,6 @@ #include "llvm/Analysis/ProfileInfoLoader.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/InstrTypes.h" -#include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/Pass.h" #include "llvm/Support/CFG.h" @@ -171,8 +170,8 @@ bool LoaderPass::runOnModule(Module &M) { } } if (ReadCount != Counters.size()) { - M.getContext().emitWarning("profile information is inconsistent " - "with the current program"); + errs() << "WARNING: profile information is inconsistent with " + << "the current program!\n"; } NumEdgesRead = ReadCount; } @@ -219,8 +218,8 @@ bool LoaderPass::runOnModule(Module &M) { } } if (ReadCount != Counters.size()) { - M.getContext().emitWarning("profile information is inconsistent " - "with the current program"); + errs() << "WARNING: profile information is inconsistent with " + << "the current program!\n"; } NumEdgesRead = ReadCount; } @@ -240,8 +239,8 @@ bool LoaderPass::runOnModule(Module &M) { BlockInformation[F][BB] = (double)Counters[ReadCount++]; } if (ReadCount != Counters.size()) { - M.getContext().emitWarning("profile information is inconsistent " - "with the current program"); + errs() << "WARNING: profile information is inconsistent with " + << "the current program!\n"; } } @@ -259,8 +258,8 @@ bool LoaderPass::runOnModule(Module &M) { FunctionInformation[F] = (double)Counters[ReadCount++]; } if (ReadCount != Counters.size()) { - M.getContext().emitWarning("profile information is inconsistent " - "with the current program"); + errs() << "WARNING: profile information is inconsistent with " + << "the current program!\n"; } } diff --git a/lib/Analysis/ScalarEvolutionExpander.cpp b/lib/Analysis/ScalarEvolutionExpander.cpp index b87ad75..fcd7ce2 100644 --- a/lib/Analysis/ScalarEvolutionExpander.cpp +++ b/lib/Analysis/ScalarEvolutionExpander.cpp @@ -1523,9 +1523,8 @@ Value *SCEVExpander::expand(const SCEV *S) { } // Check to see if we already expanded this here. 
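// (Editorial note: TrackingVH, unlike AssertingVH, follows its value across
// replaceAllUsesWith, which appears to be the motivation for changing the
// cached-expression map's value handle type below.)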
- std::map<std::pair<const SCEV *, Instruction *>, - AssertingVH<Value> >::iterator I = - InsertedExpressions.find(std::make_pair(S, InsertPt)); + std::map<std::pair<const SCEV *, Instruction *>, TrackingVH<Value> >::iterator + I = InsertedExpressions.find(std::make_pair(S, InsertPt)); if (I != InsertedExpressions.end()) return I->second; diff --git a/lib/Analysis/TargetTransformInfo.cpp b/lib/Analysis/TargetTransformInfo.cpp index 63f495a..72421a0 100644 --- a/lib/Analysis/TargetTransformInfo.cpp +++ b/lib/Analysis/TargetTransformInfo.cpp @@ -9,6 +9,12 @@ #define DEBUG_TYPE "tti" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Operator.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Instructions.h" +#include "llvm/Support/CallSite.h" #include "llvm/Support/ErrorHandling.h" using namespace llvm; @@ -43,6 +49,49 @@ void TargetTransformInfo::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired<TargetTransformInfo>(); } +unsigned TargetTransformInfo::getOperationCost(unsigned Opcode, Type *Ty, + Type *OpTy) const { + return PrevTTI->getOperationCost(Opcode, Ty, OpTy); +} + +unsigned TargetTransformInfo::getGEPCost( + const Value *Ptr, ArrayRef<const Value *> Operands) const { + return PrevTTI->getGEPCost(Ptr, Operands); +} + +unsigned TargetTransformInfo::getCallCost(FunctionType *FTy, + int NumArgs) const { + return PrevTTI->getCallCost(FTy, NumArgs); +} + +unsigned TargetTransformInfo::getCallCost(const Function *F, + int NumArgs) const { + return PrevTTI->getCallCost(F, NumArgs); +} + +unsigned TargetTransformInfo::getCallCost( + const Function *F, ArrayRef<const Value *> Arguments) const { + return PrevTTI->getCallCost(F, Arguments); +} + +unsigned TargetTransformInfo::getIntrinsicCost( + Intrinsic::ID IID, Type *RetTy, ArrayRef<Type *> ParamTys) const { + return PrevTTI->getIntrinsicCost(IID, RetTy, ParamTys); +} + +unsigned TargetTransformInfo::getIntrinsicCost( + Intrinsic::ID IID, Type *RetTy, ArrayRef<const Value *> Arguments) const { + return PrevTTI->getIntrinsicCost(IID, RetTy, Arguments); +} + +unsigned TargetTransformInfo::getUserCost(const User *U) const { + return PrevTTI->getUserCost(U); +} + +bool TargetTransformInfo::isLoweredToCall(const Function *F) const { + return PrevTTI->isLoweredToCall(F); +} + bool TargetTransformInfo::isLegalAddImmediate(int64_t Imm) const { return PrevTTI->isLegalAddImmediate(Imm); } @@ -92,6 +141,14 @@ unsigned TargetTransformInfo::getNumberOfRegisters(bool Vector) const { return PrevTTI->getNumberOfRegisters(Vector); } +unsigned TargetTransformInfo::getRegisterBitWidth(bool Vector) const { + return PrevTTI->getRegisterBitWidth(Vector); +} + +unsigned TargetTransformInfo::getMaximumUnrollFactor() const { + return PrevTTI->getMaximumUnrollFactor(); +} + unsigned TargetTransformInfo::getArithmeticInstrCost(unsigned Opcode, Type *Ty) const { return PrevTTI->getArithmeticInstrCost(Opcode, Ty); @@ -139,18 +196,25 @@ unsigned TargetTransformInfo::getNumberOfParts(Type *Tp) const { return PrevTTI->getNumberOfParts(Tp); } +unsigned TargetTransformInfo::getAddressComputationCost(Type *Tp) const { + return PrevTTI->getAddressComputationCost(Tp); +} namespace { struct NoTTI : ImmutablePass, TargetTransformInfo { - NoTTI() : ImmutablePass(ID) { + const DataLayout *DL; + + NoTTI() : ImmutablePass(ID), DL(0) { initializeNoTTIPass(*PassRegistry::getPassRegistry()); } virtual void initializePass() { // Note that this subclass is special, and must *not* call 
initializeTTI as // it does not chain. + TopTTI = this; PrevTTI = 0; + DL = getAnalysisIfAvailable<DataLayout>(); } virtual void getAnalysisUsage(AnalysisUsage &AU) const { @@ -168,6 +232,213 @@ struct NoTTI : ImmutablePass, TargetTransformInfo { return this; } + unsigned getOperationCost(unsigned Opcode, Type *Ty, Type *OpTy) const { + switch (Opcode) { + default: + // By default, just classify everything as 'basic'. + return TCC_Basic; + + case Instruction::GetElementPtr: + llvm_unreachable("Use getGEPCost for GEP operations!"); + + case Instruction::BitCast: + assert(OpTy && "Cast instructions must provide the operand type"); + if (Ty == OpTy || (Ty->isPointerTy() && OpTy->isPointerTy())) + // Identity and pointer-to-pointer casts are free. + return TCC_Free; + + // Otherwise, the default basic cost is used. + return TCC_Basic; + + case Instruction::IntToPtr: + // An inttoptr cast is free so long as the input is a legal integer type + // which doesn't contain values outside the range of a pointer. + if (DL && DL->isLegalInteger(OpTy->getScalarSizeInBits()) && + OpTy->getScalarSizeInBits() <= DL->getPointerSizeInBits()) + return TCC_Free; + + // Otherwise it's not a no-op. + return TCC_Basic; + + case Instruction::PtrToInt: + // A ptrtoint cast is free so long as the result is large enough to store + // the pointer, and a legal integer type. + if (DL && DL->isLegalInteger(OpTy->getScalarSizeInBits()) && + OpTy->getScalarSizeInBits() >= DL->getPointerSizeInBits()) + return TCC_Free; + + // Otherwise it's not a no-op. + return TCC_Basic; + + case Instruction::Trunc: + // trunc to a native type is free (assuming the target has compare and + // shift-right of the same width). + if (DL && DL->isLegalInteger(DL->getTypeSizeInBits(Ty))) + return TCC_Free; + + return TCC_Basic; + } + } + + unsigned getGEPCost(const Value *Ptr, + ArrayRef<const Value *> Operands) const { + // In the basic model, we just assume that all-constant GEPs will be folded + // into their uses via addressing modes. + for (unsigned Idx = 0, Size = Operands.size(); Idx != Size; ++Idx) + if (!isa<Constant>(Operands[Idx])) + return TCC_Basic; + + return TCC_Free; + } + + unsigned getCallCost(FunctionType *FTy, int NumArgs = -1) const { + assert(FTy && "FunctionType must be provided to this routine."); + + // The target-independent implementation just measures the size of the + // function by approximating that each argument will take on average one + // instruction to prepare. + + if (NumArgs < 0) + // Set the argument number to the number of explicit arguments in the + // function. + NumArgs = FTy->getNumParams(); + + return TCC_Basic * (NumArgs + 1); + } + + unsigned getCallCost(const Function *F, int NumArgs = -1) const { + assert(F && "A concrete function must be provided to this routine."); + + if (NumArgs < 0) + // Set the argument number to the number of explicit arguments in the + // function. + NumArgs = F->arg_size(); + + if (Intrinsic::ID IID = (Intrinsic::ID)F->getIntrinsicID()) { + FunctionType *FTy = F->getFunctionType(); + SmallVector<Type *, 8> ParamTys(FTy->param_begin(), FTy->param_end()); + return TopTTI->getIntrinsicCost(IID, FTy->getReturnType(), ParamTys); + } + + if (!TopTTI->isLoweredToCall(F)) + return TCC_Basic; // Give a basic cost if it will be lowered directly. + + return TopTTI->getCallCost(F->getFunctionType(), NumArgs); + } + + unsigned getCallCost(const Function *F, + ArrayRef<const Value *> Arguments) const { + // Simply delegate to generic handling of the call. 
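// To make the generic model above concrete (assuming TCC_Free == 0 and
// TCC_Basic == 1, the values these cost constants conventionally take): a
// direct call to a three-argument, non-intrinsic function is costed at
// 1 * (3 + 1) = 4 units, one unit to materialize each argument plus one for
// the call itself, while a function that is never lowered to an actual call
// collapses to the flat TCC_Basic of 1.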
+ // FIXME: We should use instsimplify or something else to catch calls which + // will constant fold with these arguments. + return TopTTI->getCallCost(F, Arguments.size()); + } + + unsigned getIntrinsicCost(Intrinsic::ID IID, Type *RetTy, + ArrayRef<Type *> ParamTys) const { + switch (IID) { + default: + // Intrinsics rarely (if ever) have normal argument setup constraints. + // Model them as having a basic instruction cost. + // FIXME: This is wrong for libc intrinsics. + return TCC_Basic; + + case Intrinsic::dbg_declare: + case Intrinsic::dbg_value: + case Intrinsic::invariant_start: + case Intrinsic::invariant_end: + case Intrinsic::lifetime_start: + case Intrinsic::lifetime_end: + case Intrinsic::objectsize: + case Intrinsic::ptr_annotation: + case Intrinsic::var_annotation: + // These intrinsics don't actually represent code after lowering. + return TCC_Free; + } + } + + unsigned getIntrinsicCost(Intrinsic::ID IID, Type *RetTy, + ArrayRef<const Value *> Arguments) const { + // Delegate to the generic intrinsic handling code. This mostly provides an + // opportunity for targets to (for example) special case the cost of + // certain intrinsics based on constants used as arguments. + SmallVector<Type *, 8> ParamTys; + ParamTys.reserve(Arguments.size()); + for (unsigned Idx = 0, Size = Arguments.size(); Idx != Size; ++Idx) + ParamTys.push_back(Arguments[Idx]->getType()); + return TopTTI->getIntrinsicCost(IID, RetTy, ParamTys); + } + + unsigned getUserCost(const User *U) const { + if (isa<PHINode>(U)) + return TCC_Free; // Model all PHI nodes as free. + + if (const GEPOperator *GEP = dyn_cast<GEPOperator>(U)) + // In the basic model we just assume that all-constant GEPs will be + // folded into their uses via addressing modes. + return GEP->hasAllConstantIndices() ? TCC_Free : TCC_Basic; + + if (ImmutableCallSite CS = U) { + const Function *F = CS.getCalledFunction(); + if (!F) { + // Just use the called value type. + Type *FTy = CS.getCalledValue()->getType()->getPointerElementType(); + return TopTTI->getCallCost(cast<FunctionType>(FTy), CS.arg_size()); + } + + SmallVector<const Value *, 8> Arguments; + for (ImmutableCallSite::arg_iterator AI = CS.arg_begin(), + AE = CS.arg_end(); + AI != AE; ++AI) + Arguments.push_back(*AI); + + return TopTTI->getCallCost(F, Arguments); + } + + if (const CastInst *CI = dyn_cast<CastInst>(U)) { + // Result of a cmp instruction is often extended (to be used by other + // cmp instructions, logical or return instructions). These are usually + // nop on most sane targets. + if (isa<CmpInst>(CI->getOperand(0))) + return TCC_Free; + } + + // Otherwise delegate to the fully generic implementations. + return getOperationCost(Operator::getOpcode(U), U->getType(), + U->getNumOperands() == 1 ? + U->getOperand(0)->getType() : 0); + } + + bool isLoweredToCall(const Function *F) const { + // FIXME: These should almost certainly not be handled here, and instead + // handled with the help of TLI or the target itself. This was largely + // ported from existing analysis heuristics here so that such refactorings + // can take place in the future. + + if (F->isIntrinsic()) + return false; + + if (F->hasLocalLinkage() || !F->hasName()) + return true; + + StringRef Name = F->getName(); + + // These will all likely lower to a single selection DAG node. 
+ if (Name == "copysign" || Name == "copysignf" || Name == "copysignl" || + Name == "fabs" || Name == "fabsf" || Name == "fabsl" || Name == "sin" || + Name == "sinf" || Name == "sinl" || Name == "cos" || Name == "cosf" || + Name == "cosl" || Name == "sqrt" || Name == "sqrtf" || Name == "sqrtl") + return false; + + // These are all likely to be optimized into something smaller. + if (Name == "pow" || Name == "powf" || Name == "powl" || Name == "exp2" || + Name == "exp2l" || Name == "exp2f" || Name == "floor" || Name == + "floorf" || Name == "ceil" || Name == "round" || Name == "ffs" || + Name == "ffsl" || Name == "abs" || Name == "labs" || Name == "llabs") + return false; + + return true; + } bool isLegalAddImmediate(int64_t Imm) const { return false; @@ -216,6 +487,14 @@ struct NoTTI : ImmutablePass, TargetTransformInfo { return 8; } + unsigned getRegisterBitWidth(bool Vector) const { + return 32; + } + + unsigned getMaximumUnrollFactor() const { + return 1; + } + unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty) const { return 1; } @@ -259,6 +538,10 @@ struct NoTTI : ImmutablePass, TargetTransformInfo { unsigned getNumberOfParts(Type *Tp) const { return 0; } + + unsigned getAddressComputationCost(Type *Tp) const { + return 0; + } }; } // end anonymous namespace diff --git a/lib/Analysis/ValueTracking.cpp b/lib/Analysis/ValueTracking.cpp index efb9b08..8e3994e 100644 --- a/lib/Analysis/ValueTracking.cpp +++ b/lib/Analysis/ValueTracking.cpp @@ -1510,7 +1510,7 @@ static Value *BuildSubAggregate(Value *From, Value* To, Type *IndexedType, SmallVector<unsigned, 10> &Idxs, unsigned IdxSkip, Instruction *InsertBefore) { - llvm::StructType *STy = llvm::dyn_cast<llvm::StructType>(IndexedType); + llvm::StructType *STy = dyn_cast<llvm::StructType>(IndexedType); if (STy) { // Save the original To argument so we can modify it Value *OrigTo = To; @@ -1671,8 +1671,10 @@ Value *llvm::FindInsertedValue(Value *V, ArrayRef<unsigned> idx_range, /// it can be expressed as a base pointer plus a constant offset. Return the /// base and offset to the caller. Value *llvm::GetPointerBaseWithConstantOffset(Value *Ptr, int64_t &Offset, - const DataLayout &TD) { - unsigned BitWidth = TD.getPointerSizeInBits(); + const DataLayout *TD) { + // Without DataLayout, conservatively assume 64-bit offsets, which is + // the widest we support. + unsigned BitWidth = TD ? TD->getPointerSizeInBits() : 64; APInt ByteOffset(BitWidth, 0); while (1) { if (Ptr->getType()->isVectorTy()) @@ -1680,7 +1682,7 @@ Value *llvm::GetPointerBaseWithConstantOffset(Value *Ptr, int64_t &Offset, if (GEPOperator *GEP = dyn_cast<GEPOperator>(Ptr)) { APInt GEPOffset(BitWidth, 0); - if (!GEP->accumulateConstantOffset(TD, GEPOffset)) + if (TD && !GEP->accumulateConstantOffset(*TD, GEPOffset)) break; ByteOffset += GEPOffset; Ptr = GEP->getPointerOperand(); @@ -2012,3 +2014,19 @@ bool llvm::isSafeToSpeculativelyExecute(const Value *V, return false; // Misc instructions which have effects } } + +/// isKnownNonNull - Return true if we know that the specified value is never +/// null. +bool llvm::isKnownNonNull(const Value *V) { + // Alloca never returns null, malloc might. + if (isa<AllocaInst>(V)) return true; + + // A byval argument is never null. + if (const Argument *A = dyn_cast<Argument>(V)) + return A->hasByValAttr(); + + // Global values are not null unless extern weak. 
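// Putting the cases of isKnownNonNull together, with hand-written IR as the
// illustration (a sketch, not taken from the patch's tests):
//
//   %a = alloca i32                 ; known non-null: a stack slot
//   define void @f(i32* byval %p)   ; %p known non-null: caller-made copy
//   @g = global i32 0               ; known non-null: ordinary definition
//   @w = extern_weak global i32     ; NOT known: may resolve to null at link
//
// Everything else (malloc results, plain pointer arguments, loaded pointers)
// conservatively reports false.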
+ if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) + return !GV->hasExternalWeakLinkage(); + return false; +} diff --git a/lib/Archive/ArchiveReader.cpp b/lib/Archive/ArchiveReader.cpp index 26b05e0..2604fdf 100644 --- a/lib/Archive/ArchiveReader.cpp +++ b/lib/Archive/ArchiveReader.cpp @@ -13,6 +13,7 @@ #include "llvm/Bitcode/Archive.h" #include "ArchiveInternals.h" +#include "llvm/ADT/OwningPtr.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/Bitcode/ReaderWriter.h" #include "llvm/IR/Module.h" @@ -20,7 +21,6 @@ #include <cctype> #include <cstdio> #include <cstdlib> -#include <memory> using namespace llvm; /// Read a variable-bit-rate encoded unsigned integer @@ -178,7 +178,7 @@ Archive::parseMemberHeader(const char*& At, const char* End, std::string* error) } if (p >= endp) { if (error) - *error = "missing name termiantor in string table"; + *error = "missing name terminator in string table"; return 0; } } else { @@ -327,14 +327,14 @@ Archive::loadArchive(std::string* error) { // Open and completely load the archive file. Archive* -Archive::OpenAndLoad(const sys::Path& file, LLVMContext& C, +Archive::OpenAndLoad(const sys::Path& File, LLVMContext& C, std::string* ErrorMessage) { - std::auto_ptr<Archive> result ( new Archive(file, C)); + OwningPtr<Archive> result ( new Archive(File, C)); if (result->mapToMemory(ErrorMessage)) - return 0; + return NULL; if (!result->loadArchive(ErrorMessage)) - return 0; - return result.release(); + return NULL; + return result.take(); } // Get all the bitcode modules from the archive @@ -441,15 +441,15 @@ Archive::loadSymbolTable(std::string* ErrorMsg) { } // Open the archive and load just the symbol tables -Archive* Archive::OpenAndLoadSymbols(const sys::Path& file, +Archive* Archive::OpenAndLoadSymbols(const sys::Path& File, LLVMContext& C, std::string* ErrorMessage) { - std::auto_ptr<Archive> result ( new Archive(file, C) ); + OwningPtr<Archive> result ( new Archive(File, C) ); if (result->mapToMemory(ErrorMessage)) - return 0; + return NULL; if (!result->loadSymbolTable(ErrorMessage)) - return 0; - return result.release(); + return NULL; + return result.take(); } // Look up one symbol in the symbol table and return the module that defines diff --git a/lib/AsmParser/LLLexer.cpp b/lib/AsmParser/LLLexer.cpp index 32e709c..f46383b 100644 --- a/lib/AsmParser/LLLexer.cpp +++ b/lib/AsmParser/LLLexer.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "LLLexer.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/ADT/Twine.h" #include "llvm/Assembly/Parser.h" #include "llvm/IR/DerivedTypes.h" @@ -55,22 +56,12 @@ uint64_t LLLexer::atoull(const char *Buffer, const char *End) { return Result; } -static char parseHexChar(char C) { - if (C >= '0' && C <= '9') - return C-'0'; - if (C >= 'A' && C <= 'F') - return C-'A'+10; - if (C >= 'a' && C <= 'f') - return C-'a'+10; - return 0; -} - uint64_t LLLexer::HexIntToVal(const char *Buffer, const char *End) { uint64_t Result = 0; for (; Buffer != End; ++Buffer) { uint64_t OldRes = Result; Result *= 16; - Result += parseHexChar(*Buffer); + Result += hexDigitValue(*Buffer); if (Result < OldRes) { // Uh, oh, overflow detected!!! 
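// Two things happen in this hunk: the file-local parseHexChar helper is
// retired in favor of the shared hexDigitValue from StringExtras.h, and the
// overflow probe keeps its shape -- after Result = OldRes * 16 + digit, a
// result smaller than OldRes can only mean the accumulation wrapped, so
// "Result < OldRes" is a cheap detector. Note it is one-sided rather than
// exact: a multiply that sheds only the topmost bits can, in principle,
// still compare larger and slip through undetected.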
Error("constant bigger than 64 bits detected!"); @@ -86,12 +77,12 @@ void LLLexer::HexToIntPair(const char *Buffer, const char *End, for (int i=0; i<16; i++, Buffer++) { assert(Buffer != End); Pair[0] *= 16; - Pair[0] += parseHexChar(*Buffer); + Pair[0] += hexDigitValue(*Buffer); } Pair[1] = 0; for (int i=0; i<16 && Buffer != End; i++, Buffer++) { Pair[1] *= 16; - Pair[1] += parseHexChar(*Buffer); + Pair[1] += hexDigitValue(*Buffer); } if (Buffer != End) Error("constant bigger than 128 bits detected!"); @@ -105,12 +96,12 @@ void LLLexer::FP80HexToIntPair(const char *Buffer, const char *End, for (int i=0; i<4 && Buffer != End; i++, Buffer++) { assert(Buffer != End); Pair[1] *= 16; - Pair[1] += parseHexChar(*Buffer); + Pair[1] += hexDigitValue(*Buffer); } Pair[0] = 0; for (int i=0; i<16; i++, Buffer++) { Pair[0] *= 16; - Pair[0] += parseHexChar(*Buffer); + Pair[0] += hexDigitValue(*Buffer); } if (Buffer != End) Error("constant bigger than 128 bits detected!"); @@ -128,8 +119,10 @@ static void UnEscapeLexed(std::string &Str) { if (BIn < EndBuffer-1 && BIn[1] == '\\') { *BOut++ = '\\'; // Two \ becomes one BIn += 2; - } else if (BIn < EndBuffer-2 && isxdigit(BIn[1]) && isxdigit(BIn[2])) { - *BOut = parseHexChar(BIn[1]) * 16 + parseHexChar(BIn[2]); + } else if (BIn < EndBuffer-2 && + isxdigit(static_cast<unsigned char>(BIn[1])) && + isxdigit(static_cast<unsigned char>(BIn[2]))) { + *BOut = hexDigitValue(BIn[1]) * 16 + hexDigitValue(BIn[2]); BIn += 3; // Skip over handled chars ++BOut; } else { @@ -144,7 +137,8 @@ static void UnEscapeLexed(std::string &Str) { /// isLabelChar - Return true for [-a-zA-Z$._0-9]. static bool isLabelChar(char C) { - return isalnum(C) || C == '-' || C == '$' || C == '.' || C == '_'; + return isalnum(static_cast<unsigned char>(C)) || C == '-' || C == '$' || + C == '.' || C == '_'; } @@ -197,7 +191,7 @@ lltok::Kind LLLexer::LexToken() { switch (CurChar) { default: // Handle letters: [a-zA-Z_] - if (isalpha(CurChar) || CurChar == '_') + if (isalpha(static_cast<unsigned char>(CurChar)) || CurChar == '_') return LexIdentifier(); return lltok::Error; @@ -235,6 +229,7 @@ lltok::Kind LLLexer::LexToken() { SkipLineComment(); return LexToken(); case '!': return LexExclaim(); + case '#': return LexHash(); case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': case '-': @@ -290,8 +285,8 @@ lltok::Kind LLLexer::LexAt() { return lltok::GlobalVar; // Handle GlobalVarID: @[0-9]+ - if (isdigit(CurPtr[0])) { - for (++CurPtr; isdigit(CurPtr[0]); ++CurPtr) + if (isdigit(static_cast<unsigned char>(CurPtr[0]))) { + for (++CurPtr; isdigit(static_cast<unsigned char>(CurPtr[0])); ++CurPtr) /*empty*/; uint64_t Val = atoull(TokStart+1, CurPtr); @@ -325,10 +320,12 @@ lltok::Kind LLLexer::ReadString(lltok::Kind kind) { /// ReadVarName - Read the rest of a token containing a variable name. bool LLLexer::ReadVarName() { const char *NameStart = CurPtr; - if (isalpha(CurPtr[0]) || CurPtr[0] == '-' || CurPtr[0] == '$' || + if (isalpha(static_cast<unsigned char>(CurPtr[0])) || + CurPtr[0] == '-' || CurPtr[0] == '$' || CurPtr[0] == '.' || CurPtr[0] == '_') { ++CurPtr; - while (isalnum(CurPtr[0]) || CurPtr[0] == '-' || CurPtr[0] == '$' || + while (isalnum(static_cast<unsigned char>(CurPtr[0])) || + CurPtr[0] == '-' || CurPtr[0] == '$' || CurPtr[0] == '.' 
|| CurPtr[0] == '_') ++CurPtr; @@ -354,8 +351,8 @@ lltok::Kind LLLexer::LexPercent() { return lltok::LocalVar; // Handle LocalVarID: %[0-9]+ - if (isdigit(CurPtr[0])) { - for (++CurPtr; isdigit(CurPtr[0]); ++CurPtr) + if (isdigit(static_cast<unsigned char>(CurPtr[0]))) { + for (++CurPtr; isdigit(static_cast<unsigned char>(CurPtr[0])); ++CurPtr) /*empty*/; uint64_t Val = atoull(TokStart+1, CurPtr); @@ -389,10 +386,12 @@ lltok::Kind LLLexer::LexQuote() { /// ! lltok::Kind LLLexer::LexExclaim() { // Lex a metadata name as a MetadataVar. - if (isalpha(CurPtr[0]) || CurPtr[0] == '-' || CurPtr[0] == '$' || + if (isalpha(static_cast<unsigned char>(CurPtr[0])) || + CurPtr[0] == '-' || CurPtr[0] == '$' || CurPtr[0] == '.' || CurPtr[0] == '_' || CurPtr[0] == '\\') { ++CurPtr; - while (isalnum(CurPtr[0]) || CurPtr[0] == '-' || CurPtr[0] == '$' || + while (isalnum(static_cast<unsigned char>(CurPtr[0])) || + CurPtr[0] == '-' || CurPtr[0] == '$' || CurPtr[0] == '.' || CurPtr[0] == '_' || CurPtr[0] == '\\') ++CurPtr; @@ -403,6 +402,24 @@ lltok::Kind LLLexer::LexExclaim() { return lltok::exclaim; } +/// LexHash - Lex all tokens that start with a # character: +/// AttrGrpID ::= #[0-9]+ +lltok::Kind LLLexer::LexHash() { + // Handle AttrGrpID: #[0-9]+ + if (isdigit(static_cast<unsigned char>(CurPtr[0]))) { + for (++CurPtr; isdigit(static_cast<unsigned char>(CurPtr[0])); ++CurPtr) + /*empty*/; + + uint64_t Val = atoull(TokStart+1, CurPtr); + if ((unsigned)Val != Val) + Error("invalid value number (too large)!"); + UIntVal = unsigned(Val); + return lltok::AttrGrpID; + } + + return lltok::Error; +} + /// LexIdentifier: Handle several related productions: /// Label [-a-zA-Z$._0-9]+: /// IntegerType i[0-9]+ @@ -415,8 +432,11 @@ lltok::Kind LLLexer::LexIdentifier() { for (; isLabelChar(*CurPtr); ++CurPtr) { // If we decide this is an integer, remember the end of the sequence. - if (!IntEnd && !isdigit(*CurPtr)) IntEnd = CurPtr; - if (!KeywordEnd && !isalnum(*CurPtr) && *CurPtr != '_') KeywordEnd = CurPtr; + if (!IntEnd && !isdigit(static_cast<unsigned char>(*CurPtr))) + IntEnd = CurPtr; + if (!KeywordEnd && !isalnum(static_cast<unsigned char>(*CurPtr)) && + *CurPtr != '_') + KeywordEnd = CurPtr; } // If we stopped due to a colon, this really is a label. 
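The recurring edit in these lexer hunks is the static_cast<unsigned char> wrapped around every isdigit/isalpha/isalnum/isxdigit argument. The <cctype> classification functions require an argument representable as unsigned char (or EOF); on platforms where plain char is signed, a byte such as 0xE9 arrives as a negative value and the call is undefined behavior. A minimal standalone sketch of the pattern, assuming nothing beyond the standard library (the wrapper names are illustrative, not from the patch):

#include <cctype>

// Route every character classification through the unsigned-char domain.
static bool isDigitSafe(char C) {
  return std::isdigit(static_cast<unsigned char>(C)) != 0;
}

static bool isLabelCharSafe(char C) {
  // Mirrors isLabelChar above: [-a-zA-Z$._0-9].
  return std::isalnum(static_cast<unsigned char>(C)) || C == '-' ||
         C == '$' || C == '.' || C == '_';
}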
@@ -445,9 +465,11 @@ lltok::Kind LLLexer::LexIdentifier() { CurPtr = KeywordEnd; --StartChar; unsigned Len = CurPtr-StartChar; -#define KEYWORD(STR) \ - if (Len == strlen(#STR) && !memcmp(StartChar, #STR, strlen(#STR))) \ - return lltok::kw_##STR; +#define KEYWORD(STR) \ + do { \ + if (Len == strlen(#STR) && !memcmp(StartChar, #STR, strlen(#STR))) \ + return lltok::kw_##STR; \ + } while (0) KEYWORD(true); KEYWORD(false); KEYWORD(declare); KEYWORD(define); @@ -472,6 +494,7 @@ lltok::Kind LLLexer::LexIdentifier() { KEYWORD(hidden); KEYWORD(protected); KEYWORD(unnamed_addr); + KEYWORD(externally_initialized); KEYWORD(extern_weak); KEYWORD(external); KEYWORD(thread_local); @@ -498,11 +521,11 @@ lltok::Kind LLLexer::LexIdentifier() { KEYWORD(seq_cst); KEYWORD(singlethread); - KEYWORD(nnan) - KEYWORD(ninf) - KEYWORD(nsz) - KEYWORD(arcp) - KEYWORD(fast) + KEYWORD(nnan); + KEYWORD(ninf); + KEYWORD(nsz); + KEYWORD(arcp); + KEYWORD(fast); KEYWORD(nuw); KEYWORD(nsw); KEYWORD(exact); @@ -537,34 +560,39 @@ lltok::Kind LLLexer::LexIdentifier() { KEYWORD(cc); KEYWORD(c); - KEYWORD(signext); - KEYWORD(zeroext); + KEYWORD(attributes); + + KEYWORD(alwaysinline); + KEYWORD(byval); + KEYWORD(inlinehint); KEYWORD(inreg); - KEYWORD(sret); - KEYWORD(nounwind); - KEYWORD(noreturn); + KEYWORD(minsize); + KEYWORD(naked); + KEYWORD(nest); KEYWORD(noalias); + KEYWORD(nobuiltin); KEYWORD(nocapture); - KEYWORD(byval); - KEYWORD(nest); + KEYWORD(noduplicate); + KEYWORD(noimplicitfloat); + KEYWORD(noinline); + KEYWORD(nonlazybind); + KEYWORD(noredzone); + KEYWORD(noreturn); + KEYWORD(nounwind); + KEYWORD(optsize); KEYWORD(readnone); KEYWORD(readonly); - KEYWORD(uwtable); KEYWORD(returns_twice); - - KEYWORD(inlinehint); - KEYWORD(noinline); - KEYWORD(alwaysinline); - KEYWORD(optsize); + KEYWORD(signext); + KEYWORD(sret); KEYWORD(ssp); KEYWORD(sspreq); - KEYWORD(noredzone); - KEYWORD(noimplicitfloat); - KEYWORD(naked); - KEYWORD(nonlazybind); - KEYWORD(address_safety); - KEYWORD(minsize); - KEYWORD(noduplicate); + KEYWORD(sspstrong); + KEYWORD(sanitize_address); + KEYWORD(sanitize_thread); + KEYWORD(sanitize_memory); + KEYWORD(uwtable); + KEYWORD(zeroext); KEYWORD(type); KEYWORD(opaque); @@ -659,7 +687,8 @@ lltok::Kind LLLexer::LexIdentifier() { // Check for [us]0x[0-9A-Fa-f]+ which are Hexadecimal constant generated by // the CFE to avoid forcing it to deal with 64-bit numbers. if ((TokStart[0] == 'u' || TokStart[0] == 's') && - TokStart[1] == '0' && TokStart[2] == 'x' && isxdigit(TokStart[3])) { + TokStart[1] == '0' && TokStart[2] == 'x' && + isxdigit(static_cast<unsigned char>(TokStart[3]))) { int len = CurPtr-TokStart-3; uint32_t bits = len * 4; APInt Tmp(bits, StringRef(TokStart+3, len), 16); @@ -699,13 +728,13 @@ lltok::Kind LLLexer::Lex0x() { Kind = 'J'; } - if (!isxdigit(CurPtr[0])) { + if (!isxdigit(static_cast<unsigned char>(CurPtr[0]))) { // Bad token, return it as an error. 
CurPtr = TokStart+1; return lltok::Error; } - while (isxdigit(CurPtr[0])) + while (isxdigit(static_cast<unsigned char>(CurPtr[0]))) ++CurPtr; if (Kind == 'J') { @@ -722,20 +751,21 @@ lltok::Kind LLLexer::Lex0x() { case 'K': // F80HexFPConstant - x87 long double in hexadecimal format (10 bytes) FP80HexToIntPair(TokStart+3, CurPtr, Pair); - APFloatVal = APFloat(APInt(80, Pair)); + APFloatVal = APFloat(APFloat::x87DoubleExtended, APInt(80, Pair)); return lltok::APFloat; case 'L': // F128HexFPConstant - IEEE 128-bit in hexadecimal format (16 bytes) HexToIntPair(TokStart+3, CurPtr, Pair); - APFloatVal = APFloat(APInt(128, Pair), true); + APFloatVal = APFloat(APFloat::IEEEquad, APInt(128, Pair)); return lltok::APFloat; case 'M': // PPC128HexFPConstant - PowerPC 128-bit in hexadecimal format (16 bytes) HexToIntPair(TokStart+3, CurPtr, Pair); - APFloatVal = APFloat(APInt(128, Pair)); + APFloatVal = APFloat(APFloat::PPCDoubleDouble, APInt(128, Pair)); return lltok::APFloat; case 'H': - APFloatVal = APFloat(APInt(16,HexIntToVal(TokStart+3, CurPtr))); + APFloatVal = APFloat(APFloat::IEEEhalf, + APInt(16,HexIntToVal(TokStart+3, CurPtr))); return lltok::APFloat; } } @@ -751,7 +781,8 @@ lltok::Kind LLLexer::Lex0x() { /// HexPPC128Constant 0xM[0-9A-Fa-f]+ lltok::Kind LLLexer::LexDigitOrNegative() { // If the letter after the negative is not a number, this is probably a label. - if (!isdigit(TokStart[0]) && !isdigit(CurPtr[0])) { + if (!isdigit(static_cast<unsigned char>(TokStart[0])) && + !isdigit(static_cast<unsigned char>(CurPtr[0]))) { // Okay, this is not a number after the -, it's probably a label. if (const char *End = isLabelTail(CurPtr)) { StrVal.assign(TokStart, End-1); @@ -765,7 +796,7 @@ lltok::Kind LLLexer::LexDigitOrNegative() { // At this point, it is either a label, int or fp constant. // Skip digits, we have at least one. - for (; isdigit(CurPtr[0]); ++CurPtr) + for (; isdigit(static_cast<unsigned char>(CurPtr[0])); ++CurPtr) /*empty*/; // Check to see if this really is a label afterall, e.g. "-1:". @@ -802,13 +833,14 @@ lltok::Kind LLLexer::LexDigitOrNegative() { ++CurPtr; // Skip over [0-9]*([eE][-+]?[0-9]+)? - while (isdigit(CurPtr[0])) ++CurPtr; + while (isdigit(static_cast<unsigned char>(CurPtr[0]))) ++CurPtr; if (CurPtr[0] == 'e' || CurPtr[0] == 'E') { - if (isdigit(CurPtr[1]) || - ((CurPtr[1] == '-' || CurPtr[1] == '+') && isdigit(CurPtr[2]))) { + if (isdigit(static_cast<unsigned char>(CurPtr[1])) || + ((CurPtr[1] == '-' || CurPtr[1] == '+') && + isdigit(static_cast<unsigned char>(CurPtr[2])))) { CurPtr += 2; - while (isdigit(CurPtr[0])) ++CurPtr; + while (isdigit(static_cast<unsigned char>(CurPtr[0]))) ++CurPtr; } } @@ -820,11 +852,11 @@ lltok::Kind LLLexer::LexDigitOrNegative() { lltok::Kind LLLexer::LexPositive() { // If the letter after the negative is a number, this is probably not a // label. - if (!isdigit(CurPtr[0])) + if (!isdigit(static_cast<unsigned char>(CurPtr[0]))) return lltok::Error; // Skip digits. - for (++CurPtr; isdigit(CurPtr[0]); ++CurPtr) + for (++CurPtr; isdigit(static_cast<unsigned char>(CurPtr[0])); ++CurPtr) /*empty*/; // At this point, we need a '.'. @@ -836,13 +868,14 @@ lltok::Kind LLLexer::LexPositive() { ++CurPtr; // Skip over [0-9]*([eE][-+]?[0-9]+)? 
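// Also worth noting in the Lex0x cases above: each APFloat is now built from
// an explicit semantics tag (x87DoubleExtended, IEEEquad, PPCDoubleDouble,
// IEEEhalf) plus the raw APInt payload. The old width-based constructors
// were ambiguous for the two 128-bit encodings, which is why case 'L'
// previously had to pass an extra boolean to distinguish IEEE quad from PPC
// double-double.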
- while (isdigit(CurPtr[0])) ++CurPtr; + while (isdigit(static_cast<unsigned char>(CurPtr[0]))) ++CurPtr; if (CurPtr[0] == 'e' || CurPtr[0] == 'E') { - if (isdigit(CurPtr[1]) || - ((CurPtr[1] == '-' || CurPtr[1] == '+') && isdigit(CurPtr[2]))) { + if (isdigit(static_cast<unsigned char>(CurPtr[1])) || + ((CurPtr[1] == '-' || CurPtr[1] == '+') && + isdigit(static_cast<unsigned char>(CurPtr[2])))) { CurPtr += 2; - while (isdigit(CurPtr[0])) ++CurPtr; + while (isdigit(static_cast<unsigned char>(CurPtr[0]))) ++CurPtr; } } diff --git a/lib/AsmParser/LLLexer.h b/lib/AsmParser/LLLexer.h index 1a307a8..85703c7 100644 --- a/lib/AsmParser/LLLexer.h +++ b/lib/AsmParser/LLLexer.h @@ -81,6 +81,7 @@ namespace llvm { lltok::Kind LexPercent(); lltok::Kind LexQuote(); lltok::Kind Lex0x(); + lltok::Kind LexHash(); uint64_t atoull(const char *Buffer, const char *End); uint64_t HexIntToVal(const char *Buffer, const char *End); diff --git a/lib/AsmParser/LLParser.cpp b/lib/AsmParser/LLParser.cpp index fea5ec8..c8da1f8 100644 --- a/lib/AsmParser/LLParser.cpp +++ b/lib/AsmParser/LLParser.cpp @@ -65,6 +65,64 @@ bool LLParser::ValidateEndOfModule() { ForwardRefInstMetadata.clear(); } + // Handle any function attribute group forward references. + for (std::map<Value*, std::vector<unsigned> >::iterator + I = ForwardRefAttrGroups.begin(), E = ForwardRefAttrGroups.end(); + I != E; ++I) { + Value *V = I->first; + std::vector<unsigned> &Vec = I->second; + AttrBuilder B; + + for (std::vector<unsigned>::iterator VI = Vec.begin(), VE = Vec.end(); + VI != VE; ++VI) + B.merge(NumberedAttrBuilders[*VI]); + + if (Function *Fn = dyn_cast<Function>(V)) { + AttributeSet AS = Fn->getAttributes(); + AttrBuilder FnAttrs(AS.getFnAttributes(), AttributeSet::FunctionIndex); + AS = AS.removeAttributes(Context, AttributeSet::FunctionIndex, + AS.getFnAttributes()); + + FnAttrs.merge(B); + + // If the alignment was parsed as an attribute, move to the alignment + // field. + if (FnAttrs.hasAlignmentAttr()) { + Fn->setAlignment(FnAttrs.getAlignment()); + FnAttrs.removeAttribute(Attribute::Alignment); + } + + AS = AS.addAttributes(Context, AttributeSet::FunctionIndex, + AttributeSet::get(Context, + AttributeSet::FunctionIndex, + FnAttrs)); + Fn->setAttributes(AS); + } else if (CallInst *CI = dyn_cast<CallInst>(V)) { + AttributeSet AS = CI->getAttributes(); + AttrBuilder FnAttrs(AS.getFnAttributes(), AttributeSet::FunctionIndex); + AS = AS.removeAttributes(Context, AttributeSet::FunctionIndex, + AS.getFnAttributes()); + FnAttrs.merge(B); + AS = AS.addAttributes(Context, AttributeSet::FunctionIndex, + AttributeSet::get(Context, + AttributeSet::FunctionIndex, + FnAttrs)); + CI->setAttributes(AS); + } else if (InvokeInst *II = dyn_cast<InvokeInst>(V)) { + AttributeSet AS = II->getAttributes(); + AttrBuilder FnAttrs(AS.getFnAttributes(), AttributeSet::FunctionIndex); + AS = AS.removeAttributes(Context, AttributeSet::FunctionIndex, + AS.getFnAttributes()); + FnAttrs.merge(B); + AS = AS.addAttributes(Context, AttributeSet::FunctionIndex, + AttributeSet::get(Context, + AttributeSet::FunctionIndex, + FnAttrs)); + II->setAttributes(AS); + } else { + llvm_unreachable("invalid object with forward attribute group reference"); + } + } // If there are entries in ForwardRefBlockAddresses at this point, they are // references after the function was defined. Resolve those now. 
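The resolution loop above exists because textual IR may reference an attribute group before the group itself has been parsed. A hand-written input of the following shape (an illustrative sketch, not one of the patch's test cases) exercises exactly that path: the function is materialized first with the group number recorded in ForwardRefAttrGroups, and the attributes are merged in once ValidateEndOfModule runs:

define void @f() #0 {
  ret void
}

attributes #0 = { nounwind readonly }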
@@ -174,7 +232,7 @@ bool LLParser::ParseTopLevelEntities() { case lltok::GlobalID: if (ParseUnnamedGlobal()) return true; break; case lltok::GlobalVar: if (ParseNamedGlobal()) return true; break; case lltok::exclaim: if (ParseStandaloneMetadata()) return true; break; - case lltok::MetadataVar: if (ParseNamedMetadata()) return true; break; + case lltok::MetadataVar:if (ParseNamedMetadata()) return true; break; // The Global variable production with no name can have many different // optional leading prefixes, the production is: @@ -220,6 +278,8 @@ bool LLParser::ParseTopLevelEntities() { case lltok::kw_global: // GlobalType if (ParseGlobal("", SMLoc(), 0, false, 0)) return true; break; + + case lltok::kw_attributes: if (ParseUnnamedAttrGrp()) return true; break; } } } @@ -632,9 +692,11 @@ bool LLParser::ParseAlias(const std::string &Name, LocTy NameLoc, /// ParseGlobal /// ::= GlobalVar '=' OptionalLinkage OptionalVisibility OptionalThreadLocal -/// OptionalAddrSpace OptionalUnNammedAddr GlobalType Type Const +/// OptionalAddrSpace OptionalUnNammedAddr +/// OptionalExternallyInitialized GlobalType Type Const /// ::= OptionalLinkage OptionalVisibility OptionalThreadLocal -/// OptionalAddrSpace OptionalUnNammedAddr GlobalType Type Const +/// OptionalAddrSpace OptionalUnNammedAddr +/// OptionalExternallyInitialized GlobalType Type Const /// /// Everything through visibility has been parsed already. /// @@ -642,9 +704,10 @@ bool LLParser::ParseGlobal(const std::string &Name, LocTy NameLoc, unsigned Linkage, bool HasLinkage, unsigned Visibility) { unsigned AddrSpace; - bool IsConstant, UnnamedAddr; + bool IsConstant, UnnamedAddr, IsExternallyInitialized; GlobalVariable::ThreadLocalMode TLM; LocTy UnnamedAddrLoc; + LocTy IsExternallyInitializedLoc; LocTy TyLoc; Type *Ty = 0; @@ -652,6 +715,9 @@ bool LLParser::ParseGlobal(const std::string &Name, LocTy NameLoc, ParseOptionalAddrSpace(AddrSpace) || ParseOptionalToken(lltok::kw_unnamed_addr, UnnamedAddr, &UnnamedAddrLoc) || + ParseOptionalToken(lltok::kw_externally_initialized, + IsExternallyInitialized, + &IsExternallyInitializedLoc) || ParseGlobalType(IsConstant) || ParseType(Ty, TyLoc)) return true; @@ -709,6 +775,7 @@ bool LLParser::ParseGlobal(const std::string &Name, LocTy NameLoc, GV->setConstant(IsConstant); GV->setLinkage((GlobalValue::LinkageTypes)Linkage); GV->setVisibility((GlobalValue::VisibilityTypes)Visibility); + GV->setExternallyInitialized(IsExternallyInitialized); GV->setThreadLocalMode(TLM); GV->setUnnamedAddr(UnnamedAddr); @@ -733,6 +800,159 @@ bool LLParser::ParseGlobal(const std::string &Name, LocTy NameLoc, return false; } +/// ParseUnnamedAttrGrp +/// ::= 'attributes' AttrGrpID '=' '{' AttrValPair+ '}' +bool LLParser::ParseUnnamedAttrGrp() { + assert(Lex.getKind() == lltok::kw_attributes); + LocTy AttrGrpLoc = Lex.getLoc(); + Lex.Lex(); + + assert(Lex.getKind() == lltok::AttrGrpID); + unsigned VarID = Lex.getUIntVal(); + std::vector<unsigned> unused; + LocTy NoBuiltinLoc; + Lex.Lex(); + + if (ParseToken(lltok::equal, "expected '=' here") || + ParseToken(lltok::lbrace, "expected '{' here") || + ParseFnAttributeValuePairs(NumberedAttrBuilders[VarID], unused, true, + NoBuiltinLoc) || + ParseToken(lltok::rbrace, "expected end of attribute group")) + return true; + + if (!NumberedAttrBuilders[VarID].hasAttributes()) + return Error(AttrGrpLoc, "attribute group has no attributes"); + + return false; +} + +/// ParseFnAttributeValuePairs +/// ::= <attr> | <attr> '=' <value> +bool LLParser::ParseFnAttributeValuePairs(AttrBuilder &B, + 
std::vector<unsigned> &FwdRefAttrGrps, + bool inAttrGrp, LocTy &NoBuiltinLoc) { + bool HaveError = false; + + B.clear(); + + while (true) { + lltok::Kind Token = Lex.getKind(); + if (Token == lltok::kw_nobuiltin) + NoBuiltinLoc = Lex.getLoc(); + switch (Token) { + default: + if (!inAttrGrp) return HaveError; + return Error(Lex.getLoc(), "unterminated attribute group"); + case lltok::rbrace: + // Finished. + return false; + + case lltok::AttrGrpID: { + // Allow a function to reference an attribute group: + // + // define void @foo() #1 { ... } + if (inAttrGrp) + HaveError |= + Error(Lex.getLoc(), + "cannot have an attribute group reference in an attribute group"); + + unsigned AttrGrpNum = Lex.getUIntVal(); + if (inAttrGrp) break; + + // Save the reference to the attribute group. We'll fill it in later. + FwdRefAttrGrps.push_back(AttrGrpNum); + break; + } + // Target-dependent attributes: + case lltok::StringConstant: { + std::string Attr = Lex.getStrVal(); + Lex.Lex(); + std::string Val; + if (EatIfPresent(lltok::equal) && + ParseStringConstant(Val)) + return true; + + B.addAttribute(Attr, Val); + continue; + } + + // Target-independent attributes: + case lltok::kw_align: { + // As a hack, we allow "align 2" on functions as a synonym for "alignstack + // 2". + unsigned Alignment; + if (inAttrGrp) { + Lex.Lex(); + if (ParseToken(lltok::equal, "expected '=' here") || + ParseUInt32(Alignment)) + return true; + } else { + if (ParseOptionalAlignment(Alignment)) + return true; + } + B.addAlignmentAttr(Alignment); + continue; + } + case lltok::kw_alignstack: { + unsigned Alignment; + if (inAttrGrp) { + Lex.Lex(); + if (ParseToken(lltok::equal, "expected '=' here") || + ParseUInt32(Alignment)) + return true; + } else { + if (ParseOptionalStackAlignment(Alignment)) + return true; + } + B.addStackAlignmentAttr(Alignment); + continue; + } + case lltok::kw_alwaysinline: B.addAttribute(Attribute::AlwaysInline); break; + case lltok::kw_inlinehint: B.addAttribute(Attribute::InlineHint); break; + case lltok::kw_minsize: B.addAttribute(Attribute::MinSize); break; + case lltok::kw_naked: B.addAttribute(Attribute::Naked); break; + case lltok::kw_nobuiltin: B.addAttribute(Attribute::NoBuiltin); break; + case lltok::kw_noduplicate: B.addAttribute(Attribute::NoDuplicate); break; + case lltok::kw_noimplicitfloat: B.addAttribute(Attribute::NoImplicitFloat); break; + case lltok::kw_noinline: B.addAttribute(Attribute::NoInline); break; + case lltok::kw_nonlazybind: B.addAttribute(Attribute::NonLazyBind); break; + case lltok::kw_noredzone: B.addAttribute(Attribute::NoRedZone); break; + case lltok::kw_noreturn: B.addAttribute(Attribute::NoReturn); break; + case lltok::kw_nounwind: B.addAttribute(Attribute::NoUnwind); break; + case lltok::kw_optsize: B.addAttribute(Attribute::OptimizeForSize); break; + case lltok::kw_readnone: B.addAttribute(Attribute::ReadNone); break; + case lltok::kw_readonly: B.addAttribute(Attribute::ReadOnly); break; + case lltok::kw_returns_twice: B.addAttribute(Attribute::ReturnsTwice); break; + case lltok::kw_ssp: B.addAttribute(Attribute::StackProtect); break; + case lltok::kw_sspreq: B.addAttribute(Attribute::StackProtectReq); break; + case lltok::kw_sspstrong: B.addAttribute(Attribute::StackProtectStrong); break; + case lltok::kw_sanitize_address: B.addAttribute(Attribute::SanitizeAddress); break; + case lltok::kw_sanitize_thread: B.addAttribute(Attribute::SanitizeThread); break; + case lltok::kw_sanitize_memory: B.addAttribute(Attribute::SanitizeMemory); break; + case lltok::kw_uwtable: 
B.addAttribute(Attribute::UWTable); break; + + // Error handling. + case lltok::kw_inreg: + case lltok::kw_signext: + case lltok::kw_zeroext: + HaveError |= + Error(Lex.getLoc(), + "invalid use of attribute on a function"); + break; + case lltok::kw_byval: + case lltok::kw_nest: + case lltok::kw_noalias: + case lltok::kw_nocapture: + case lltok::kw_sret: + HaveError |= + Error(Lex.getLoc(), + "invalid use of parameter-only attribute on a function"); + break; + } + + Lex.Lex(); + } +} //===----------------------------------------------------------------------===// // GlobalValue Reference/Resolution Routines. @@ -912,71 +1132,6 @@ bool LLParser::ParseOptionalAddrSpace(unsigned &AddrSpace) { ParseToken(lltok::rparen, "expected ')' in address space"); } -/// ParseOptionalFuncAttrs - Parse a potentially empty list of function attributes. -bool LLParser::ParseOptionalFuncAttrs(AttrBuilder &B) { - bool HaveError = false; - - B.clear(); - - while (1) { - lltok::Kind Token = Lex.getKind(); - switch (Token) { - default: // End of attributes. - return HaveError; - case lltok::kw_alignstack: { - unsigned Alignment; - if (ParseOptionalStackAlignment(Alignment)) - return true; - B.addStackAlignmentAttr(Alignment); - continue; - } - case lltok::kw_align: { - // As a hack, we allow "align 2" on functions as a synonym for "alignstack - // 2". - unsigned Alignment; - if (ParseOptionalAlignment(Alignment)) - return true; - B.addAlignmentAttr(Alignment); - continue; - } - case lltok::kw_address_safety: B.addAttribute(Attribute::AddressSafety); break; - case lltok::kw_alwaysinline: B.addAttribute(Attribute::AlwaysInline); break; - case lltok::kw_inlinehint: B.addAttribute(Attribute::InlineHint); break; - case lltok::kw_minsize: B.addAttribute(Attribute::MinSize); break; - case lltok::kw_naked: B.addAttribute(Attribute::Naked); break; - case lltok::kw_noinline: B.addAttribute(Attribute::NoInline); break; - case lltok::kw_nonlazybind: B.addAttribute(Attribute::NonLazyBind); break; - case lltok::kw_noredzone: B.addAttribute(Attribute::NoRedZone); break; - case lltok::kw_noimplicitfloat: B.addAttribute(Attribute::NoImplicitFloat); break; - case lltok::kw_noreturn: B.addAttribute(Attribute::NoReturn); break; - case lltok::kw_nounwind: B.addAttribute(Attribute::NoUnwind); break; - case lltok::kw_optsize: B.addAttribute(Attribute::OptimizeForSize); break; - case lltok::kw_readnone: B.addAttribute(Attribute::ReadNone); break; - case lltok::kw_readonly: B.addAttribute(Attribute::ReadOnly); break; - case lltok::kw_returns_twice: B.addAttribute(Attribute::ReturnsTwice); break; - case lltok::kw_ssp: B.addAttribute(Attribute::StackProtect); break; - case lltok::kw_sspreq: B.addAttribute(Attribute::StackProtectReq); break; - case lltok::kw_uwtable: B.addAttribute(Attribute::UWTable); break; - case lltok::kw_noduplicate: B.addAttribute(Attribute::NoDuplicate); break; - - // Error handling. - case lltok::kw_zeroext: - case lltok::kw_signext: - case lltok::kw_inreg: - HaveError |= Error(Lex.getLoc(), "invalid use of attribute on a function"); - break; - case lltok::kw_sret: case lltok::kw_noalias: - case lltok::kw_nocapture: case lltok::kw_byval: - case lltok::kw_nest: - HaveError |= - Error(Lex.getLoc(), "invalid use of parameter-only attribute on a function"); - break; - } - - Lex.Lex(); - } -} - /// ParseOptionalParamAttrs - Parse a potentially empty list of parameter attributes. 
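// For reference, ParseFnAttributeValuePairs above accepts three shapes
// inside an attribute group, so input along these lines parses (a sketch,
// not from the test suite):
//
//   attributes #1 = { nounwind alignstack=4 "cpu"="cortex-a8" }
//
// i.e. plain attribute keywords, align/alignstack with an '='-separated
// value, and arbitrary target-dependent "key"="value" string pairs. The
// parameter and return parsers that follow deliberately reject the
// function-only keywords.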
bool LLParser::ParseOptionalParamAttrs(AttrBuilder &B) { bool HaveError = false; @@ -1004,16 +1159,17 @@ bool LLParser::ParseOptionalParamAttrs(AttrBuilder &B) { case lltok::kw_sret: B.addAttribute(Attribute::StructRet); break; case lltok::kw_zeroext: B.addAttribute(Attribute::ZExt); break; - case lltok::kw_noreturn: case lltok::kw_nounwind: - case lltok::kw_uwtable: case lltok::kw_returns_twice: - case lltok::kw_noinline: case lltok::kw_readnone: - case lltok::kw_readonly: case lltok::kw_inlinehint: - case lltok::kw_alwaysinline: case lltok::kw_optsize: - case lltok::kw_ssp: case lltok::kw_sspreq: - case lltok::kw_noredzone: case lltok::kw_noimplicitfloat: - case lltok::kw_naked: case lltok::kw_nonlazybind: - case lltok::kw_address_safety: case lltok::kw_minsize: - case lltok::kw_alignstack: + case lltok::kw_alignstack: case lltok::kw_nounwind: + case lltok::kw_alwaysinline: case lltok::kw_optsize: + case lltok::kw_inlinehint: case lltok::kw_readnone: + case lltok::kw_minsize: case lltok::kw_readonly: + case lltok::kw_naked: case lltok::kw_returns_twice: + case lltok::kw_nobuiltin: case lltok::kw_sanitize_address: + case lltok::kw_noimplicitfloat: case lltok::kw_sanitize_memory: + case lltok::kw_noinline: case lltok::kw_sanitize_thread: + case lltok::kw_nonlazybind: case lltok::kw_ssp: + case lltok::kw_noredzone: case lltok::kw_sspreq: + case lltok::kw_noreturn: case lltok::kw_uwtable: HaveError |= Error(Lex.getLoc(), "invalid use of function-only attribute"); break; } @@ -1044,17 +1200,19 @@ bool LLParser::ParseOptionalReturnAttrs(AttrBuilder &B) { HaveError |= Error(Lex.getLoc(), "invalid use of parameter-only attribute"); break; - case lltok::kw_noreturn: case lltok::kw_nounwind: - case lltok::kw_uwtable: case lltok::kw_returns_twice: - case lltok::kw_noinline: case lltok::kw_readnone: - case lltok::kw_readonly: case lltok::kw_inlinehint: - case lltok::kw_alwaysinline: case lltok::kw_optsize: - case lltok::kw_ssp: case lltok::kw_sspreq: - case lltok::kw_noredzone: case lltok::kw_noimplicitfloat: - case lltok::kw_naked: case lltok::kw_nonlazybind: - case lltok::kw_address_safety: case lltok::kw_minsize: - case lltok::kw_alignstack: case lltok::kw_align: - case lltok::kw_noduplicate: + case lltok::kw_align: case lltok::kw_noreturn: + case lltok::kw_alignstack: case lltok::kw_nounwind: + case lltok::kw_alwaysinline: case lltok::kw_optsize: + case lltok::kw_inlinehint: case lltok::kw_readnone: + case lltok::kw_minsize: case lltok::kw_readonly: + case lltok::kw_naked: case lltok::kw_returns_twice: + case lltok::kw_nobuiltin: case lltok::kw_sanitize_address: + case lltok::kw_noduplicate: case lltok::kw_sanitize_memory: + case lltok::kw_noimplicitfloat: case lltok::kw_sanitize_thread: + case lltok::kw_noinline: case lltok::kw_ssp: + case lltok::kw_nonlazybind: case lltok::kw_sspreq: + case lltok::kw_noredzone: case lltok::kw_sspstrong: + case lltok::kw_uwtable: HaveError |= Error(Lex.getLoc(), "invalid use of function-only attribute"); break; } @@ -1471,6 +1629,7 @@ bool LLParser::ParseParameterList(SmallVectorImpl<ParamInfo> &ArgList, if (ParseToken(lltok::lparen, "expected '(' in call")) return true; + unsigned AttrIndex = 1; while (Lex.getKind() != lltok::rparen) { // If this isn't the first argument, we need a comma. if (!ArgList.empty() && @@ -1488,8 +1647,9 @@ bool LLParser::ParseParameterList(SmallVectorImpl<ParamInfo> &ArgList, // Otherwise, handle normal operands. 
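// The indexing convention threaded through this rewrite: attribute slots are
// 1-based for parameters (slot i+1 describes argument i), with
// AttributeSet::ReturnIndex (0) reserved for the return value and
// AttributeSet::FunctionIndex (~0U) for function-level attributes. That is
// why AttrIndex starts at 1 below and why callers test hasAttributes(i + 1).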
if (ParseOptionalParamAttrs(ArgAttrs) || ParseValue(ArgTy, V, PFS)) return true; - ArgList.push_back(ParamInfo(ArgLoc, V, Attribute::get(V->getContext(), - ArgAttrs))); + ArgList.push_back(ParamInfo(ArgLoc, V, AttributeSet::get(V->getContext(), + AttrIndex++, + ArgAttrs))); } Lex.Lex(); // Lex the ')'. @@ -1538,9 +1698,10 @@ bool LLParser::ParseArgumentList(SmallVectorImpl<ArgInfo> &ArgList, if (!FunctionType::isValidArgumentType(ArgTy)) return Error(TypeLoc, "invalid type for function argument"); + unsigned AttrIndex = 1; ArgList.push_back(ArgInfo(TypeLoc, ArgTy, - Attribute::get(ArgTy->getContext(), - Attrs), Name)); + AttributeSet::get(ArgTy->getContext(), + AttrIndex++, Attrs), Name)); while (EatIfPresent(lltok::comma)) { // Handle ... at end of arg list. @@ -1567,7 +1728,8 @@ bool LLParser::ParseArgumentList(SmallVectorImpl<ArgInfo> &ArgList, return Error(TypeLoc, "invalid type for function argument"); ArgList.push_back(ArgInfo(TypeLoc, ArgTy, - Attribute::get(ArgTy->getContext(), Attrs), + AttributeSet::get(ArgTy->getContext(), + AttrIndex++, Attrs), Name)); } } @@ -1592,7 +1754,7 @@ bool LLParser::ParseFunctionType(Type *&Result) { for (unsigned i = 0, e = ArgList.size(); i != e; ++i) { if (!ArgList[i].Name.empty()) return Error(ArgList[i].Loc, "argument name invalid in function type"); - if (ArgList[i].Attrs.hasAttributes()) + if (ArgList[i].Attrs.hasAttributes(i + 1)) return Error(ArgList[i].Loc, "argument attributes invalid in function type"); } @@ -2156,7 +2318,8 @@ bool LLParser::ParseValID(ValID &ID, PerFunctionState *PFS) { return false; case lltok::kw_asm: { - // ValID ::= 'asm' SideEffect? AlignStack? STRINGCONSTANT ',' STRINGCONSTANT + // ValID ::= 'asm' SideEffect? AlignStack? IntelDialect? STRINGCONSTANT ',' + // STRINGCONSTANT bool HasSideEffect, AlignStack, AsmDialect; Lex.Lex(); if (ParseOptionalToken(lltok::kw_sideeffect, HasSideEffect) || @@ -2786,6 +2949,8 @@ bool LLParser::ParseFunctionHeader(Function *&Fn, bool isDefine) { SmallVector<ArgInfo, 8> ArgList; bool isVarArg; AttrBuilder FuncAttrs; + std::vector<unsigned> FwdRefAttrGrps; + LocTy NoBuiltinLoc; std::string Section; unsigned Alignment; std::string GC; @@ -2795,7 +2960,8 @@ bool LLParser::ParseFunctionHeader(Function *&Fn, bool isDefine) { if (ParseArgumentList(ArgList, isVarArg) || ParseOptionalToken(lltok::kw_unnamed_addr, UnnamedAddr, &UnnamedAddrLoc) || - ParseOptionalFuncAttrs(FuncAttrs) || + ParseFnAttributeValuePairs(FuncAttrs, FwdRefAttrGrps, false, + NoBuiltinLoc) || (EatIfPresent(lltok::kw_section) && ParseStringConstant(Section)) || ParseOptionalAlignment(Alignment) || @@ -2803,6 +2969,9 @@ bool LLParser::ParseFunctionHeader(Function *&Fn, bool isDefine) { ParseStringConstant(GC))) return true; + if (FuncAttrs.contains(Attribute::NoBuiltin)) + return Error(NoBuiltinLoc, "'nobuiltin' attribute not valid on function"); + // If the alignment was parsed as an attribute, move to the alignment field. if (FuncAttrs.hasAlignmentAttr()) { Alignment = FuncAttrs.getAlignment(); @@ -2812,25 +2981,25 @@ bool LLParser::ParseFunctionHeader(Function *&Fn, bool isDefine) { // Okay, if we got here, the function is syntactically valid. Convert types // and do semantic checks. 
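// Note the asymmetry introduced above: 'nobuiltin' is collected by the
// shared attribute parser but rejected here on function definitions, the
// intent apparently being that it is only meaningful at call sites, where
// ParseCall and ParseInvoke thread the same NoBuiltinLoc through without
// performing this check.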
std::vector<Type*> ParamTypeList; - SmallVector<AttributeWithIndex, 8> Attrs; + SmallVector<AttributeSet, 8> Attrs; if (RetAttrs.hasAttributes()) - Attrs.push_back( - AttributeWithIndex::get(AttributeSet::ReturnIndex, - Attribute::get(RetType->getContext(), - RetAttrs))); + Attrs.push_back(AttributeSet::get(RetType->getContext(), + AttributeSet::ReturnIndex, + RetAttrs)); for (unsigned i = 0, e = ArgList.size(); i != e; ++i) { ParamTypeList.push_back(ArgList[i].Ty); - if (ArgList[i].Attrs.hasAttributes()) - Attrs.push_back(AttributeWithIndex::get(i+1, ArgList[i].Attrs)); + if (ArgList[i].Attrs.hasAttributes(i + 1)) { + AttrBuilder B(ArgList[i].Attrs, i + 1); + Attrs.push_back(AttributeSet::get(RetType->getContext(), i + 1, B)); + } } if (FuncAttrs.hasAttributes()) - Attrs.push_back( - AttributeWithIndex::get(AttributeSet::FunctionIndex, - Attribute::get(RetType->getContext(), - FuncAttrs))); + Attrs.push_back(AttributeSet::get(RetType->getContext(), + AttributeSet::FunctionIndex, + FuncAttrs)); AttributeSet PAL = AttributeSet::get(Context, Attrs); @@ -2895,6 +3064,7 @@ bool LLParser::ParseFunctionHeader(Function *&Fn, bool isDefine) { Fn->setAlignment(Alignment); Fn->setSection(Section); if (!GC.empty()) Fn->setGC(GC.c_str()); + ForwardRefAttrGroups[Fn] = FwdRefAttrGrps; // Add all of the arguments we parsed to the function. Function::arg_iterator ArgIt = Fn->arg_begin(); @@ -3314,6 +3484,8 @@ bool LLParser::ParseIndirectBr(Instruction *&Inst, PerFunctionState &PFS) { bool LLParser::ParseInvoke(Instruction *&Inst, PerFunctionState &PFS) { LocTy CallLoc = Lex.getLoc(); AttrBuilder RetAttrs, FnAttrs; + std::vector<unsigned> FwdRefAttrGrps; + LocTy NoBuiltinLoc; CallingConv::ID CC; Type *RetType = 0; LocTy RetTypeLoc; @@ -3326,7 +3498,8 @@ bool LLParser::ParseInvoke(Instruction *&Inst, PerFunctionState &PFS) { ParseType(RetType, RetTypeLoc, true /*void allowed*/) || ParseValID(CalleeID) || ParseParameterList(ArgList, PFS) || - ParseOptionalFuncAttrs(FnAttrs) || + ParseFnAttributeValuePairs(FnAttrs, FwdRefAttrGrps, false, + NoBuiltinLoc) || ParseToken(lltok::kw_to, "expected 'to' in invoke") || ParseTypeAndBasicBlock(NormalBB, PFS) || ParseToken(lltok::kw_unwind, "expected 'unwind' in invoke") || @@ -3357,12 +3530,11 @@ bool LLParser::ParseInvoke(Instruction *&Inst, PerFunctionState &PFS) { if (ConvertValIDToValue(PFTy, CalleeID, Callee, &PFS)) return true; // Set up the Attribute for the function. 
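// The same mechanical migration as in ParseFunctionHeader repeats below for
// invoke and then for call: the retired AttributeWithIndex pairs become
// standalone per-slot AttributeSets, and AttributeSet::get(Context, Attrs)
// folds the collected slots into the final attribute list attached to the
// instruction.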
- SmallVector<AttributeWithIndex, 8> Attrs; + SmallVector<AttributeSet, 8> Attrs; if (RetAttrs.hasAttributes()) - Attrs.push_back( - AttributeWithIndex::get(AttributeSet::ReturnIndex, - Attribute::get(Callee->getContext(), - RetAttrs))); + Attrs.push_back(AttributeSet::get(RetType->getContext(), + AttributeSet::ReturnIndex, + RetAttrs)); SmallVector<Value*, 8> Args; @@ -3382,18 +3554,19 @@ bool LLParser::ParseInvoke(Instruction *&Inst, PerFunctionState &PFS) { return Error(ArgList[i].Loc, "argument is not of expected type '" + getTypeString(ExpectedTy) + "'"); Args.push_back(ArgList[i].V); - if (ArgList[i].Attrs.hasAttributes()) - Attrs.push_back(AttributeWithIndex::get(i+1, ArgList[i].Attrs)); + if (ArgList[i].Attrs.hasAttributes(i + 1)) { + AttrBuilder B(ArgList[i].Attrs, i + 1); + Attrs.push_back(AttributeSet::get(RetType->getContext(), i + 1, B)); + } } if (I != E) return Error(CallLoc, "not enough parameters specified for call"); if (FnAttrs.hasAttributes()) - Attrs.push_back( - AttributeWithIndex::get(AttributeSet::FunctionIndex, - Attribute::get(Callee->getContext(), - FnAttrs))); + Attrs.push_back(AttributeSet::get(RetType->getContext(), + AttributeSet::FunctionIndex, + FnAttrs)); // Finish off the Attribute and check them AttributeSet PAL = AttributeSet::get(Context, Attrs); @@ -3401,6 +3574,7 @@ bool LLParser::ParseInvoke(Instruction *&Inst, PerFunctionState &PFS) { InvokeInst *II = InvokeInst::Create(Callee, NormalBB, UnwindBB, Args); II->setCallingConv(CC); II->setAttributes(PAL); + ForwardRefAttrGroups[II] = FwdRefAttrGrps; Inst = II; return false; } @@ -3719,6 +3893,8 @@ bool LLParser::ParseLandingPad(Instruction *&Inst, PerFunctionState &PFS) { bool LLParser::ParseCall(Instruction *&Inst, PerFunctionState &PFS, bool isTail) { AttrBuilder RetAttrs, FnAttrs; + std::vector<unsigned> FwdRefAttrGrps; + LocTy NoBuiltinLoc; CallingConv::ID CC; Type *RetType = 0; LocTy RetTypeLoc; @@ -3732,7 +3908,8 @@ bool LLParser::ParseCall(Instruction *&Inst, PerFunctionState &PFS, ParseType(RetType, RetTypeLoc, true /*void allowed*/) || ParseValID(CalleeID) || ParseParameterList(ArgList, PFS) || - ParseOptionalFuncAttrs(FnAttrs)) + ParseFnAttributeValuePairs(FnAttrs, FwdRefAttrGrps, false, + NoBuiltinLoc)) return true; // If RetType is a non-function pointer type, then this is the short syntax @@ -3759,12 +3936,11 @@ bool LLParser::ParseCall(Instruction *&Inst, PerFunctionState &PFS, if (ConvertValIDToValue(PFTy, CalleeID, Callee, &PFS)) return true; // Set up the Attribute for the function. 
- SmallVector<AttributeWithIndex, 8> Attrs; + SmallVector<AttributeSet, 8> Attrs; if (RetAttrs.hasAttributes()) - Attrs.push_back( - AttributeWithIndex::get(AttributeSet::ReturnIndex, - Attribute::get(Callee->getContext(), - RetAttrs))); + Attrs.push_back(AttributeSet::get(RetType->getContext(), + AttributeSet::ReturnIndex, + RetAttrs)); SmallVector<Value*, 8> Args; @@ -3784,18 +3960,19 @@ bool LLParser::ParseCall(Instruction *&Inst, PerFunctionState &PFS, return Error(ArgList[i].Loc, "argument is not of expected type '" + getTypeString(ExpectedTy) + "'"); Args.push_back(ArgList[i].V); - if (ArgList[i].Attrs.hasAttributes()) - Attrs.push_back(AttributeWithIndex::get(i+1, ArgList[i].Attrs)); + if (ArgList[i].Attrs.hasAttributes(i + 1)) { + AttrBuilder B(ArgList[i].Attrs, i + 1); + Attrs.push_back(AttributeSet::get(RetType->getContext(), i + 1, B)); + } } if (I != E) return Error(CallLoc, "not enough parameters specified for call"); if (FnAttrs.hasAttributes()) - Attrs.push_back( - AttributeWithIndex::get(AttributeSet::FunctionIndex, - Attribute::get(Callee->getContext(), - FnAttrs))); + Attrs.push_back(AttributeSet::get(RetType->getContext(), + AttributeSet::FunctionIndex, + FnAttrs)); // Finish off the Attribute and check them AttributeSet PAL = AttributeSet::get(Context, Attrs); @@ -3804,6 +3981,7 @@ bool LLParser::ParseCall(Instruction *&Inst, PerFunctionState &PFS, CI->setTailCall(isTail); CI->setCallingConv(CC); CI->setAttributes(PAL); + ForwardRefAttrGroups[CI] = FwdRefAttrGrps; Inst = CI; return false; } diff --git a/lib/AsmParser/LLParser.h b/lib/AsmParser/LLParser.h index f255897..1f2879e 100644 --- a/lib/AsmParser/LLParser.h +++ b/lib/AsmParser/LLParser.h @@ -125,6 +125,10 @@ namespace llvm { std::map<ValID, std::vector<std::pair<ValID, GlobalValue*> > > ForwardRefBlockAddresses; + // Attribute builder reference information. + std::map<Value*, std::vector<unsigned> > ForwardRefAttrGroups; + std::map<unsigned, AttrBuilder> NumberedAttrBuilders; + public: LLParser(MemoryBuffer *F, SourceMgr &SM, SMDiagnostic &Err, Module *m) : Context(m->getContext()), Lex(F, SM, Err, m->getContext()), @@ -191,7 +195,6 @@ namespace llvm { bool ParseTLSModel(GlobalVariable::ThreadLocalMode &TLM); bool ParseOptionalThreadLocal(GlobalVariable::ThreadLocalMode &TLM); bool ParseOptionalAddrSpace(unsigned &AddrSpace); - bool ParseOptionalFuncAttrs(AttrBuilder &B); bool ParseOptionalParamAttrs(AttrBuilder &B); bool ParseOptionalReturnAttrs(AttrBuilder &B); bool ParseOptionalLinkage(unsigned &Linkage, bool &HasLinkage); @@ -236,6 +239,10 @@ namespace llvm { bool ParseMDString(MDString *&Result); bool ParseMDNodeID(MDNode *&Result); bool ParseMDNodeID(MDNode *&Result, unsigned &SlotNo); + bool ParseUnnamedAttrGrp(); + bool ParseFnAttributeValuePairs(AttrBuilder &B, + std::vector<unsigned> &FwdRefAttrGrps, + bool inAttrGrp, LocTy &NoBuiltinLoc); // Type Parsing. 
bool ParseType(Type *&Result, bool AllowVoid = false); @@ -326,8 +333,8 @@ namespace llvm { struct ParamInfo { LocTy Loc; Value *V; - Attribute Attrs; - ParamInfo(LocTy loc, Value *v, Attribute attrs) + AttributeSet Attrs; + ParamInfo(LocTy loc, Value *v, AttributeSet attrs) : Loc(loc), V(v), Attrs(attrs) {} }; bool ParseParameterList(SmallVectorImpl<ParamInfo> &ArgList, @@ -347,9 +354,9 @@ namespace llvm { struct ArgInfo { LocTy Loc; Type *Ty; - Attribute Attrs; + AttributeSet Attrs; std::string Name; - ArgInfo(LocTy L, Type *ty, Attribute Attr, const std::string &N) + ArgInfo(LocTy L, Type *ty, AttributeSet Attr, const std::string &N) : Loc(L), Ty(ty), Attrs(Attr), Name(N) {} }; bool ParseArgumentList(SmallVectorImpl<ArgInfo> &ArgList, bool &isVarArg); diff --git a/lib/AsmParser/LLToken.h b/lib/AsmParser/LLToken.h index 5b4d415..cd25ba3 100644 --- a/lib/AsmParser/LLToken.h +++ b/lib/AsmParser/LLToken.h @@ -30,6 +30,7 @@ namespace lltok { lparen, rparen, // ( ) backslash, // \ (not /) exclaim, // ! + hash, // # kw_x, kw_true, kw_false, @@ -44,6 +45,7 @@ namespace lltok { kw_dllimport, kw_dllexport, kw_common, kw_available_externally, kw_default, kw_hidden, kw_protected, kw_unnamed_addr, + kw_externally_initialized, kw_extern_weak, kw_external, kw_thread_local, kw_localdynamic, kw_initialexec, kw_localexec, @@ -89,34 +91,39 @@ namespace lltok { kw_ptx_kernel, kw_ptx_device, kw_spir_kernel, kw_spir_func, - kw_signext, - kw_zeroext, + // Attributes: + kw_attributes, + kw_alwaysinline, + kw_sanitize_address, + kw_byval, + kw_inlinehint, kw_inreg, - kw_sret, - kw_nounwind, - kw_noreturn, + kw_minsize, + kw_naked, + kw_nest, kw_noalias, + kw_nobuiltin, kw_nocapture, - kw_byval, - kw_nest, + kw_noduplicate, + kw_noimplicitfloat, + kw_noinline, + kw_nonlazybind, + kw_noredzone, + kw_noreturn, + kw_nounwind, + kw_optsize, kw_readnone, kw_readonly, - kw_uwtable, kw_returns_twice, - - kw_inlinehint, - kw_noinline, - kw_alwaysinline, - kw_optsize, + kw_signext, kw_ssp, kw_sspreq, - kw_noredzone, - kw_noimplicitfloat, - kw_naked, - kw_nonlazybind, - kw_address_safety, - kw_minsize, - kw_noduplicate, + kw_sspstrong, + kw_sret, + kw_sanitize_thread, + kw_sanitize_memory, + kw_uwtable, + kw_zeroext, kw_type, kw_opaque, @@ -153,6 +160,7 @@ namespace lltok { // Unsigned Valued tokens (UIntVal). GlobalID, // @42 LocalVarID, // %42 + AttrGrpID, // #42 // String valued tokens (StrVal). LabelStr, // foo: diff --git a/lib/Bitcode/Reader/BitcodeReader.cpp b/lib/Bitcode/Reader/BitcodeReader.cpp index 59cda22..f348843 100644 --- a/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/lib/Bitcode/Reader/BitcodeReader.cpp @@ -6,10 +6,6 @@ // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// -// -// This header defines the BitcodeReader class. -// -//===----------------------------------------------------------------------===// #include "llvm/Bitcode/ReaderWriter.h" #include "BitcodeReader.h" @@ -432,6 +428,26 @@ Type *BitcodeReader::getTypeByID(unsigned ID) { // Functions for parsing blocks from the bitcode file //===----------------------------------------------------------------------===// + +/// \brief This fills an AttrBuilder object with the LLVM attributes that have +/// been decoded from the given integer. This function must stay in sync with +/// 'encodeLLVMAttributesForBitcode'. +static void decodeLLVMAttributesForBitcode(AttrBuilder &B, + uint64_t EncodedAttrs) { + // FIXME: Remove in 4.0. 
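decodeLLVMAttributesForBitcode, whose body opens the next hunk, unpacks the legacy single-word attribute encoding: bits 0-15 carry the low attribute flags, bits 16-31 the raw alignment value, and bits 32-51 the remaining flags, which the in-memory bitmask expects 11 positions lower. The arithmetic, isolated as a worked sketch:

#include <cassert>
#include <stdint.h>

// Legacy (pre-attribute-group) layout, per the decoder below:
//   bits  0-15: low attribute flags, copied through unchanged
//   bits 16-31: alignment, stored as a raw power-of-two value
//   bits 32-51: high attribute flags, stored shifted up by 11
static uint64_t legacyRawMask(uint64_t EncodedAttrs) {
  unsigned Alignment = (EncodedAttrs >> 16) & 0xffff;
  assert((!Alignment || (Alignment & (Alignment - 1)) == 0) &&
         "Alignment must be a power of two.");
  (void)Alignment; // handled separately via addAlignmentAttr()
  return ((EncodedAttrs & (0xfffffULL << 32)) >> 11) |
         (EncodedAttrs & 0xffff);
}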
+ + // The alignment is stored as a 16-bit raw value from bits 31--16. We shift + // the bits above 31 down by 11 bits. + unsigned Alignment = (EncodedAttrs & (0xffffULL << 16)) >> 16; + assert((!Alignment || isPowerOf2_32(Alignment)) && + "Alignment must be a power of two."); + + if (Alignment) + B.addAlignmentAttr(Alignment); + B.addRawValue(((EncodedAttrs & (0xfffffULL << 32)) >> 11) | + (EncodedAttrs & 0xffff)); +} + bool BitcodeReader::ParseAttributeBlock() { if (Stream.EnterSubBlock(bitc::PARAMATTR_BLOCK_ID)) return Error("Malformed block record"); @@ -441,51 +457,46 @@ bool BitcodeReader::ParseAttributeBlock() { SmallVector<uint64_t, 64> Record; - SmallVector<AttributeWithIndex, 8> Attrs; + SmallVector<AttributeSet, 8> Attrs; // Read all the records. while (1) { - unsigned Code = Stream.ReadCode(); - if (Code == bitc::END_BLOCK) { - if (Stream.ReadBlockEnd()) - return Error("Error at end of PARAMATTR block"); - return false; - } + BitstreamEntry Entry = Stream.advanceSkippingSubblocks(); - if (Code == bitc::ENTER_SUBBLOCK) { - // No known subblocks, always skip them. - Stream.ReadSubBlockID(); - if (Stream.SkipBlock()) - return Error("Malformed block record"); - continue; - } - - if (Code == bitc::DEFINE_ABBREV) { - Stream.ReadAbbrevRecord(); - continue; + switch (Entry.Kind) { + case BitstreamEntry::SubBlock: // Handled for us already. + case BitstreamEntry::Error: + return Error("Error at end of PARAMATTR block"); + case BitstreamEntry::EndBlock: + return false; + case BitstreamEntry::Record: + // The interesting case. + break; } // Read a record. Record.clear(); - switch (Stream.ReadRecord(Code, Record)) { + switch (Stream.readRecord(Entry.ID, Record)) { default: // Default behavior: ignore. break; - case bitc::PARAMATTR_CODE_ENTRY: { // ENTRY: [paramidx0, attr0, ...] + case bitc::PARAMATTR_CODE_ENTRY_OLD: { // ENTRY: [paramidx0, attr0, ...] + // FIXME: Remove in 4.0. if (Record.size() & 1) return Error("Invalid ENTRY record"); for (unsigned i = 0, e = Record.size(); i != e; i += 2) { - Attribute ReconstitutedAttr = - Attribute::decodeLLVMAttributesForBitcode(Context, Record[i+1]); - Record[i+1] = ReconstitutedAttr.getBitMask(); + AttrBuilder B; + decodeLLVMAttributesForBitcode(B, Record[i+1]); + Attrs.push_back(AttributeSet::get(Context, Record[i], B)); } - for (unsigned i = 0, e = Record.size(); i != e; i += 2) { - AttrBuilder B(Record[i+1]); - if (B.hasAttributes()) - Attrs.push_back(AttributeWithIndex::get(Record[i], - Attribute::get(Context, B))); - } + MAttributes.push_back(AttributeSet::get(Context, Attrs)); + Attrs.clear(); + break; + } + case bitc::PARAMATTR_CODE_ENTRY: { // ENTRY: [attrgrp0, attrgrp1, ...] + for (unsigned i = 0, e = Record.size(); i != e; ++i) + Attrs.push_back(MAttributeGroups[Record[i]]); MAttributes.push_back(AttributeSet::get(Context, Attrs)); Attrs.clear(); @@ -495,6 +506,81 @@ bool BitcodeReader::ParseAttributeBlock() { } } +bool BitcodeReader::ParseAttributeGroupBlock() { + if (Stream.EnterSubBlock(bitc::PARAMATTR_GROUP_BLOCK_ID)) + return Error("Malformed block record"); + + if (!MAttributeGroups.empty()) + return Error("Multiple PARAMATTR_GROUP blocks found!"); + + SmallVector<uint64_t, 64> Record; + + // Read all the records. + while (1) { + BitstreamEntry Entry = Stream.advanceSkippingSubblocks(); + + switch (Entry.Kind) { + case BitstreamEntry::SubBlock: // Handled for us already. 
+ case BitstreamEntry::Error: + return Error("Error at end of PARAMATTR_GROUP block"); + case BitstreamEntry::EndBlock: + return false; + case BitstreamEntry::Record: + // The interesting case. + break; + } + + // Read a record. + Record.clear(); + switch (Stream.readRecord(Entry.ID, Record)) { + default: // Default behavior: ignore. + break; + case bitc::PARAMATTR_GRP_CODE_ENTRY: { // ENTRY: [grpid, idx, a0, a1, ...] + if (Record.size() < 3) + return Error("Invalid ENTRY record"); + + uint64_t GrpID = Record[0]; + uint64_t Idx = Record[1]; // Index of the object this attribute refers to. + + AttrBuilder B; + for (unsigned i = 2, e = Record.size(); i != e; ++i) { + if (Record[i] == 0) { // Enum attribute + B.addAttribute(Attribute::AttrKind(Record[++i])); + } else if (Record[i] == 1) { // Align attribute + if (Attribute::AttrKind(Record[++i]) == Attribute::Alignment) + B.addAlignmentAttr(Record[++i]); + else + B.addStackAlignmentAttr(Record[++i]); + } else { // String attribute + assert((Record[i] == 3 || Record[i] == 4) && + "Invalid attribute group entry"); + bool HasValue = (Record[i++] == 4); + SmallString<64> KindStr; + SmallString<64> ValStr; + + while (Record[i] != 0 && i != e) + KindStr += Record[i++]; + assert(Record[i] == 0 && "Kind string not null terminated"); + + if (HasValue) { + // Has a value associated with it. + ++i; // Skip the '0' that terminates the "kind" string. + while (Record[i] != 0 && i != e) + ValStr += Record[i++]; + assert(Record[i] == 0 && "Value string not null terminated"); + } + + B.addAttribute(KindStr.str(), ValStr.str()); + } + } + + MAttributeGroups[GrpID] = AttributeSet::get(Context, Idx, B); + break; + } + } + } +} + bool BitcodeReader::ParseTypeTable() { if (Stream.EnterSubBlock(bitc::TYPE_BLOCK_ID_NEW)) return Error("Malformed block record"); @@ -513,32 +599,26 @@ bool BitcodeReader::ParseTypeTableBody() { // Read all the records for this type table. while (1) { - unsigned Code = Stream.ReadCode(); - if (Code == bitc::END_BLOCK) { + BitstreamEntry Entry = Stream.advanceSkippingSubblocks(); + + switch (Entry.Kind) { + case BitstreamEntry::SubBlock: // Handled for us already. + case BitstreamEntry::Error: + Error("Error in the type table block"); + return true; + case BitstreamEntry::EndBlock: if (NumRecords != TypeList.size()) return Error("Invalid type forward reference in TYPE_BLOCK"); - if (Stream.ReadBlockEnd()) - return Error("Error at end of type table block"); return false; - } - - if (Code == bitc::ENTER_SUBBLOCK) { - // No known subblocks, always skip them. - Stream.ReadSubBlockID(); - if (Stream.SkipBlock()) - return Error("Malformed block record"); - continue; - } - - if (Code == bitc::DEFINE_ABBREV) { - Stream.ReadAbbrevRecord(); - continue; + case BitstreamEntry::Record: + // The interesting case. + break; } // Read a record. Record.clear(); Type *ResultTy = 0; - switch (Stream.ReadRecord(Code, Record)) { + switch (Stream.readRecord(Entry.ID, Record)) { default: return Error("unknown type in type table"); case bitc::TYPE_CODE_NUMENTRY: // TYPE_CODE_NUMENTRY: [numentries] // TYPE_CODE_NUMENTRY contains a count of the number of types in the @@ -736,28 +816,22 @@ bool BitcodeReader::ParseValueSymbolTable() { // Read all the records for this value table. 
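Flattened out, one PARAMATTR_GRP_CODE_ENTRY record as consumed by the loop above looks like the array below. The group number and attributes are hypothetical; only the tag scheme (0 = enum, 1 = align/alignstack, 3 = string kind only, 4 = string kind plus value, strings stored byte-wise and null-terminated) comes from the code:

#include "llvm/IR/Attributes.h"
#include <stdint.h>
using namespace llvm;

// A made-up on-disk record for:  attributes #1 = { nounwind "foo"="bar" }
// applied at the function slot (AttributeSet::FunctionIndex, i.e. ~0U).
static const uint64_t ExampleGrpEntry[] = {
  1,                           // Record[0]: GrpID, the #N of the group
  AttributeSet::FunctionIndex, // Record[1]: Idx, the slot it decorates
  0, Attribute::NoUnwind,      // tag 0: enum attribute
  4, 'f', 'o', 'o', 0,         // tag 4: string kind, null-terminated,
  'b', 'a', 'r', 0             //        then its value, null-terminated
};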
SmallString<128> ValueName; while (1) { - unsigned Code = Stream.ReadCode(); - if (Code == bitc::END_BLOCK) { - if (Stream.ReadBlockEnd()) - return Error("Error at end of value symbol table block"); - return false; - } - if (Code == bitc::ENTER_SUBBLOCK) { - // No known subblocks, always skip them. - Stream.ReadSubBlockID(); - if (Stream.SkipBlock()) - return Error("Malformed block record"); - continue; - } + BitstreamEntry Entry = Stream.advanceSkippingSubblocks(); - if (Code == bitc::DEFINE_ABBREV) { - Stream.ReadAbbrevRecord(); - continue; + switch (Entry.Kind) { + case BitstreamEntry::SubBlock: // Handled for us already. + case BitstreamEntry::Error: + return Error("malformed value symbol table block"); + case BitstreamEntry::EndBlock: + return false; + case BitstreamEntry::Record: + // The interesting case. + break; } // Read a record. Record.clear(); - switch (Stream.ReadRecord(Code, Record)) { + switch (Stream.readRecord(Entry.ID, Record)) { default: // Default behavior: unknown type. break; case bitc::VST_CODE_ENTRY: { // VST_ENTRY: [valueid, namechar x N] @@ -797,41 +871,35 @@ bool BitcodeReader::ParseMetadata() { // Read all the records. while (1) { - unsigned Code = Stream.ReadCode(); - if (Code == bitc::END_BLOCK) { - if (Stream.ReadBlockEnd()) - return Error("Error at end of PARAMATTR block"); - return false; - } + BitstreamEntry Entry = Stream.advanceSkippingSubblocks(); - if (Code == bitc::ENTER_SUBBLOCK) { - // No known subblocks, always skip them. - Stream.ReadSubBlockID(); - if (Stream.SkipBlock()) - return Error("Malformed block record"); - continue; - } - - if (Code == bitc::DEFINE_ABBREV) { - Stream.ReadAbbrevRecord(); - continue; + switch (Entry.Kind) { + case BitstreamEntry::SubBlock: // Handled for us already. + case BitstreamEntry::Error: + Error("malformed metadata block"); + return true; + case BitstreamEntry::EndBlock: + return false; + case BitstreamEntry::Record: + // The interesting case. + break; } bool IsFunctionLocal = false; // Read a record. Record.clear(); - Code = Stream.ReadRecord(Code, Record); + unsigned Code = Stream.readRecord(Entry.ID, Record); switch (Code) { default: // Default behavior: ignore. break; case bitc::METADATA_NAME: { - // Read named of the named metadata. + // Read name of the named metadata. SmallString<8> Name(Record.begin(), Record.end()); Record.clear(); Code = Stream.ReadCode(); // METADATA_NAME is always followed by METADATA_NAMED_NODE. - unsigned NextBitCode = Stream.ReadRecord(Code, Record); + unsigned NextBitCode = Stream.readRecord(Code, Record); assert(NextBitCode == bitc::METADATA_NAMED_NODE); (void)NextBitCode; // Read named metadata elements. @@ -958,27 +1026,29 @@ bool BitcodeReader::ParseConstants() { Type *CurTy = Type::getInt32Ty(Context); unsigned NextCstNo = ValueList.size(); while (1) { - unsigned Code = Stream.ReadCode(); - if (Code == bitc::END_BLOCK) + BitstreamEntry Entry = Stream.advanceSkippingSubblocks(); + + switch (Entry.Kind) { + case BitstreamEntry::SubBlock: // Handled for us already. + case BitstreamEntry::Error: + return Error("malformed block record in AST file"); + case BitstreamEntry::EndBlock: + if (NextCstNo != ValueList.size()) + return Error("Invalid constant reference!"); + + // Once all the constants have been read, go through and resolve forward + // references. + ValueList.ResolveConstantForwardRefs(); + return false; + case BitstreamEntry::Record: + // The interesting case. break; - - if (Code == bitc::ENTER_SUBBLOCK) { - // No known subblocks, always skip them. 
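ParseValueSymbolTable and ParseMetadata above are two instances of the mechanical refactor applied throughout this file: the hand-rolled ReadCode()/END_BLOCK/ENTER_SUBBLOCK/DEFINE_ABBREV loop collapses into a switch over a BitstreamEntry, with abbrev definitions and unknown subblocks handled inside advanceSkippingSubblocks(). The skeleton, isolated (Stream and Record are the enclosing reader's members; the record cases are placeholders):

// The post-patch reader idiom inside any Parse* method:
while (1) {
  BitstreamEntry Entry = Stream.advanceSkippingSubblocks();

  switch (Entry.Kind) {
  case BitstreamEntry::SubBlock: // already skipped for us
  case BitstreamEntry::Error:
    return Error("malformed block");
  case BitstreamEntry::EndBlock:
    return false;                // clean exit from the block
  case BitstreamEntry::Record:
    break;                       // the interesting case
  }

  Record.clear();
  switch (Stream.readRecord(Entry.ID, Record)) {
  default:
    break;                       // ignore unknown record codes
  // case bitc::SOME_CODE: decode Record here; break;
  }
}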
- Stream.ReadSubBlockID(); - if (Stream.SkipBlock()) - return Error("Malformed block record"); - continue; - } - - if (Code == bitc::DEFINE_ABBREV) { - Stream.ReadAbbrevRecord(); - continue; } // Read a record. Record.clear(); Value *V = 0; - unsigned BitCode = Stream.ReadRecord(Code, Record); + unsigned BitCode = Stream.readRecord(Entry.ID, Record); switch (BitCode) { default: // Default behavior: unknown constant case bitc::CST_CODE_UNDEF: // UNDEF @@ -1013,21 +1083,27 @@ bool BitcodeReader::ParseConstants() { if (Record.empty()) return Error("Invalid FLOAT record"); if (CurTy->isHalfTy()) - V = ConstantFP::get(Context, APFloat(APInt(16, (uint16_t)Record[0]))); + V = ConstantFP::get(Context, APFloat(APFloat::IEEEhalf, + APInt(16, (uint16_t)Record[0]))); else if (CurTy->isFloatTy()) - V = ConstantFP::get(Context, APFloat(APInt(32, (uint32_t)Record[0]))); + V = ConstantFP::get(Context, APFloat(APFloat::IEEEsingle, + APInt(32, (uint32_t)Record[0]))); else if (CurTy->isDoubleTy()) - V = ConstantFP::get(Context, APFloat(APInt(64, Record[0]))); + V = ConstantFP::get(Context, APFloat(APFloat::IEEEdouble, + APInt(64, Record[0]))); else if (CurTy->isX86_FP80Ty()) { // Bits are not stored the same way as a normal i80 APInt, compensate. uint64_t Rearrange[2]; Rearrange[0] = (Record[1] & 0xffffLL) | (Record[0] << 16); Rearrange[1] = Record[0] >> 48; - V = ConstantFP::get(Context, APFloat(APInt(80, Rearrange))); + V = ConstantFP::get(Context, APFloat(APFloat::x87DoubleExtended, + APInt(80, Rearrange))); } else if (CurTy->isFP128Ty()) - V = ConstantFP::get(Context, APFloat(APInt(128, Record), true)); + V = ConstantFP::get(Context, APFloat(APFloat::IEEEquad, + APInt(128, Record))); else if (CurTy->isPPC_FP128Ty()) - V = ConstantFP::get(Context, APFloat(APInt(128, Record))); + V = ConstantFP::get(Context, APFloat(APFloat::PPCDoubleDouble, + APInt(128, Record))); else V = UndefValue::get(CurTy); break; @@ -1333,17 +1409,6 @@ bool BitcodeReader::ParseConstants() { ValueList.AssignValue(V, NextCstNo); ++NextCstNo; } - - if (NextCstNo != ValueList.size()) - return Error("Invalid constant reference!"); - - if (Stream.ReadBlockEnd()) - return Error("Error at end of constants block"); - - // Once all the constants have been read, go through and resolve forward - // references. - ValueList.ResolveConstantForwardRefs(); - return false; } bool BitcodeReader::ParseUseLists() { @@ -1354,29 +1419,22 @@ bool BitcodeReader::ParseUseLists() { // Read all the records. while (1) { - unsigned Code = Stream.ReadCode(); - if (Code == bitc::END_BLOCK) { - if (Stream.ReadBlockEnd()) - return Error("Error at end of use-list table block"); - return false; - } - - if (Code == bitc::ENTER_SUBBLOCK) { - // No known subblocks, always skip them. - Stream.ReadSubBlockID(); - if (Stream.SkipBlock()) - return Error("Malformed block record"); - continue; - } + BitstreamEntry Entry = Stream.advanceSkippingSubblocks(); - if (Code == bitc::DEFINE_ABBREV) { - Stream.ReadAbbrevRecord(); - continue; + switch (Entry.Kind) { + case BitstreamEntry::SubBlock: // Handled for us already. + case BitstreamEntry::Error: + return Error("malformed use list block"); + case BitstreamEntry::EndBlock: + return false; + case BitstreamEntry::Record: + // The interesting case. + break; } // Read a use list record. Record.clear(); - switch (Stream.ReadRecord(Code, Record)) { + switch (Stream.readRecord(Entry.ID, Record)) { default: // Default behavior: unknown type. break; case bitc::USELIST_CODE_ENTRY: { // USELIST_CODE_ENTRY: TBD. 
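The ParseConstants hunk above also switches every ConstantFP construction to pass explicit float semantics next to the APInt. The motivation is that bit width alone is ambiguous; the removed code even needed a trailing bool just to tell fp128 from ppc_fp128 at 128 bits. A short sketch of the ambiguity:

#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
using namespace llvm;

// Same 128-bit payload, two different values once semantics are attached.
void semanticsDemo(const uint64_t Words[2]) {
  APInt Bits(128, ArrayRef<uint64_t>(Words, 2));
  APFloat AsQuad(APFloat::IEEEquad, Bits);       // fp128
  APFloat AsPPC(APFloat::PPCDoubleDouble, Bits); // ppc_fp128
}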
@@ -1448,17 +1506,18 @@ bool BitcodeReader::ParseModule(bool Resume) { std::vector<std::string> GCTable; // Read all the records for this module. - while (!Stream.AtEndOfStream()) { - unsigned Code = Stream.ReadCode(); - if (Code == bitc::END_BLOCK) { - if (Stream.ReadBlockEnd()) - return Error("Error at end of module block"); + while (1) { + BitstreamEntry Entry = Stream.advance(); + switch (Entry.Kind) { + case BitstreamEntry::Error: + Error("malformed module block"); + return true; + case BitstreamEntry::EndBlock: return GlobalCleanup(); - } - if (Code == bitc::ENTER_SUBBLOCK) { - switch (Stream.ReadSubBlockID()) { + case BitstreamEntry::SubBlock: + switch (Entry.ID) { default: // Skip unknown content. if (Stream.SkipBlock()) return Error("Malformed block record"); @@ -1471,6 +1530,10 @@ bool BitcodeReader::ParseModule(bool Resume) { if (ParseAttributeBlock()) return true; break; + case bitc::PARAMATTR_GROUP_BLOCK_ID: + if (ParseAttributeGroupBlock()) + return true; + break; case bitc::TYPE_BLOCK_ID_NEW: if (ParseTypeTable()) return true; @@ -1517,15 +1580,15 @@ bool BitcodeReader::ParseModule(bool Resume) { break; } continue; - } - if (Code == bitc::DEFINE_ABBREV) { - Stream.ReadAbbrevRecord(); - continue; + case BitstreamEntry::Record: + // The interesting case. + break; } + // Read a record. - switch (Stream.ReadRecord(Code, Record)) { + switch (Stream.readRecord(Entry.ID, Record)) { default: break; // Default behavior, ignore unknown content. case bitc::MODULE_CODE_VERSION: { // VERSION: [version#] if (Record.size() < 1) @@ -1620,9 +1683,13 @@ bool BitcodeReader::ParseModule(bool Resume) { if (Record.size() > 8) UnnamedAddr = Record[8]; + bool ExternallyInitialized = false; + if (Record.size() > 9) + ExternallyInitialized = Record[9]; + GlobalVariable *NewGV = new GlobalVariable(*TheModule, Ty, isConstant, Linkage, 0, "", 0, - TLM, AddressSpace); + TLM, AddressSpace, ExternallyInitialized); NewGV->setAlignment(Alignment); if (!Section.empty()) NewGV->setSection(Section); @@ -1713,8 +1780,6 @@ bool BitcodeReader::ParseModule(bool Resume) { } Record.clear(); } - - return Error("Premature end of bitstream"); } bool BitcodeReader::ParseBitcodeInto(Module *M) { @@ -1733,47 +1798,55 @@ bool BitcodeReader::ParseBitcodeInto(Module *M) { // We expect a number of well-defined blocks, though we don't necessarily // need to understand them all. - while (!Stream.AtEndOfStream()) { - unsigned Code = Stream.ReadCode(); + while (1) { + if (Stream.AtEndOfStream()) + return false; + + BitstreamEntry Entry = + Stream.advance(BitstreamCursor::AF_DontAutoprocessAbbrevs); + + switch (Entry.Kind) { + case BitstreamEntry::Error: + Error("malformed module file"); + return true; + case BitstreamEntry::EndBlock: + return false; - if (Code != bitc::ENTER_SUBBLOCK) { + case BitstreamEntry::SubBlock: + switch (Entry.ID) { + case bitc::BLOCKINFO_BLOCK_ID: + if (Stream.ReadBlockInfoBlock()) + return Error("Malformed BlockInfoBlock"); + break; + case bitc::MODULE_BLOCK_ID: + // Reject multiple MODULE_BLOCK's in a single bitstream. + if (TheModule) + return Error("Multiple MODULE_BLOCKs in same stream"); + TheModule = M; + if (ParseModule(false)) + return true; + if (LazyStreamer) return false; + break; + default: + if (Stream.SkipBlock()) + return Error("Malformed block record"); + break; + } + continue; + case BitstreamEntry::Record: + // There should be no records in the top-level of blocks. 
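The MODULE_CODE_GLOBALVAR case above reads the new optional Record[9] flag and forwards it to the widened GlobalVariable constructor; the matching textual syntax is the kw_externally_initialized token added in LLToken.h earlier. A sketch of the same constructor call from API-level code, with the argument order exactly as in this hunk:

#include "llvm/IR/Constants.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
using namespace llvm;

// Equivalent of "@x = externally_initialized global i32 0".
GlobalVariable *makeExtInitGlobal(Module &M) {
  Type *I32 = Type::getInt32Ty(M.getContext());
  return new GlobalVariable(M, I32, /*isConstant=*/false,
                            GlobalValue::ExternalLinkage,
                            ConstantInt::get(I32, 0), "x",
                            /*InsertBefore=*/0,
                            GlobalVariable::NotThreadLocal,
                            /*AddressSpace=*/0,
                            /*isExternallyInitialized=*/true);
}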
- // The ranlib in xcode 4 will align archive members by appending newlines + // The ranlib in Xcode 4 will align archive members by appending newlines // to the end of them. If this file size is a multiple of 4 but not 8, we // have to read and ignore these final 4 bytes :-( - if (Stream.GetAbbrevIDWidth() == 2 && Code == 2 && + if (Stream.getAbbrevIDWidth() == 2 && Entry.ID == 2 && Stream.Read(6) == 2 && Stream.Read(24) == 0xa0a0a && Stream.AtEndOfStream()) return false; return Error("Invalid record at top-level"); } - - unsigned BlockID = Stream.ReadSubBlockID(); - - // We only know the MODULE subblock ID. - switch (BlockID) { - case bitc::BLOCKINFO_BLOCK_ID: - if (Stream.ReadBlockInfoBlock()) - return Error("Malformed BlockInfoBlock"); - break; - case bitc::MODULE_BLOCK_ID: - // Reject multiple MODULE_BLOCK's in a single bitstream. - if (TheModule) - return Error("Multiple MODULE_BLOCKs in same stream"); - TheModule = M; - if (ParseModule(false)) - return true; - if (LazyStreamer) return false; - break; - default: - if (Stream.SkipBlock()) - return Error("Malformed block record"); - break; - } } - - return false; } bool BitcodeReader::ParseModuleTriple(std::string &Triple) { @@ -1783,32 +1856,22 @@ bool BitcodeReader::ParseModuleTriple(std::string &Triple) { SmallVector<uint64_t, 64> Record; // Read all the records for this module. - while (!Stream.AtEndOfStream()) { - unsigned Code = Stream.ReadCode(); - if (Code == bitc::END_BLOCK) { - if (Stream.ReadBlockEnd()) - return Error("Error at end of module block"); + while (1) { + BitstreamEntry Entry = Stream.advanceSkippingSubblocks(); + switch (Entry.Kind) { + case BitstreamEntry::SubBlock: // Handled for us already. + case BitstreamEntry::Error: + return Error("malformed module block"); + case BitstreamEntry::EndBlock: return false; - } - - if (Code == bitc::ENTER_SUBBLOCK) { - switch (Stream.ReadSubBlockID()) { - default: // Skip unknown content. - if (Stream.SkipBlock()) - return Error("Malformed block record"); - break; - } - continue; - } - - if (Code == bitc::DEFINE_ABBREV) { - Stream.ReadAbbrevRecord(); - continue; + case BitstreamEntry::Record: + // The interesting case. + break; } // Read a record. - switch (Stream.ReadRecord(Code, Record)) { + switch (Stream.readRecord(Entry.ID, Record)) { default: break; // Default behavior, ignore unknown content. case bitc::MODULE_CODE_TRIPLE: { // TRIPLE: [strchr x N] std::string S; @@ -1820,8 +1883,6 @@ bool BitcodeReader::ParseModuleTriple(std::string &Triple) { } Record.clear(); } - - return Error("Premature end of bitstream"); } bool BitcodeReader::ParseTriple(std::string &Triple) { @@ -1838,28 +1899,32 @@ bool BitcodeReader::ParseTriple(std::string &Triple) { // We expect a number of well-defined blocks, though we don't necessarily // need to understand them all. - while (!Stream.AtEndOfStream()) { - unsigned Code = Stream.ReadCode(); + while (1) { + BitstreamEntry Entry = Stream.advance(); - if (Code != bitc::ENTER_SUBBLOCK) - return Error("Invalid record at top-level"); + switch (Entry.Kind) { + case BitstreamEntry::Error: + Error("malformed module file"); + return true; + case BitstreamEntry::EndBlock: + return false; - unsigned BlockID = Stream.ReadSubBlockID(); + case BitstreamEntry::SubBlock: + if (Entry.ID == bitc::MODULE_BLOCK_ID) + return ParseModuleTriple(Triple); - // We only know the MODULE subblock ID. - switch (BlockID) { - case bitc::MODULE_BLOCK_ID: - if (ParseModuleTriple(Triple)) + // Ignore other sub-blocks. 
+ if (Stream.SkipBlock()) { + Error("malformed block record in AST file"); return true; - break; - default: - if (Stream.SkipBlock()) - return Error("Malformed block record"); - break; + } + continue; + + case BitstreamEntry::Record: + Stream.skipRecord(Entry.ID); + continue; } } - - return false; } /// ParseMetadataAttachment - Parse metadata attachments. @@ -1868,20 +1933,23 @@ bool BitcodeReader::ParseMetadataAttachment() { return Error("Malformed block record"); SmallVector<uint64_t, 64> Record; - while(1) { - unsigned Code = Stream.ReadCode(); - if (Code == bitc::END_BLOCK) { - if (Stream.ReadBlockEnd()) - return Error("Error at end of PARAMATTR block"); + while (1) { + BitstreamEntry Entry = Stream.advanceSkippingSubblocks(); + + switch (Entry.Kind) { + case BitstreamEntry::SubBlock: // Handled for us already. + case BitstreamEntry::Error: + return Error("malformed metadata block"); + case BitstreamEntry::EndBlock: + return false; + case BitstreamEntry::Record: + // The interesting case. break; } - if (Code == bitc::DEFINE_ABBREV) { - Stream.ReadAbbrevRecord(); - continue; - } + // Read a metadata attachment record. Record.clear(); - switch (Stream.ReadRecord(Code, Record)) { + switch (Stream.readRecord(Entry.ID, Record)) { default: // Default behavior: ignore. break; case bitc::METADATA_ATTACHMENT: { @@ -1902,7 +1970,6 @@ bool BitcodeReader::ParseMetadataAttachment() { } } } - return false; } /// ParseFunctionBody - Lazily parse the specified function body block. @@ -1927,15 +1994,16 @@ bool BitcodeReader::ParseFunctionBody(Function *F) { // Read all the records. SmallVector<uint64_t, 64> Record; while (1) { - unsigned Code = Stream.ReadCode(); - if (Code == bitc::END_BLOCK) { - if (Stream.ReadBlockEnd()) - return Error("Error at end of function block"); - break; - } + BitstreamEntry Entry = Stream.advance(); - if (Code == bitc::ENTER_SUBBLOCK) { - switch (Stream.ReadSubBlockID()) { + switch (Entry.Kind) { + case BitstreamEntry::Error: + return Error("Bitcode error in function block"); + case BitstreamEntry::EndBlock: + goto OutOfRecordLoop; + + case BitstreamEntry::SubBlock: + switch (Entry.ID) { default: // Skip unknown content. if (Stream.SkipBlock()) return Error("Malformed block record"); @@ -1955,17 +2023,16 @@ bool BitcodeReader::ParseFunctionBody(Function *F) { break; } continue; - } - if (Code == bitc::DEFINE_ABBREV) { - Stream.ReadAbbrevRecord(); - continue; + case BitstreamEntry::Record: + // The interesting case. + break; } // Read a record. Record.clear(); Instruction *I = 0; - unsigned BitCode = Stream.ReadRecord(Code, Record); + unsigned BitCode = Stream.readRecord(Entry.ID, Record); switch (BitCode) { default: // Default behavior: reject return Error("Unknown instruction"); @@ -2742,6 +2809,8 @@ bool BitcodeReader::ParseFunctionBody(Function *F) { ValueList.AssignValue(I, NextValueNo++); } +OutOfRecordLoop: + // Check the function list for unresolved values. if (Argument *A = dyn_cast<Argument>(ValueList.back())) { if (A->getParent() == 0) { diff --git a/lib/Bitcode/Reader/BitcodeReader.h b/lib/Bitcode/Reader/BitcodeReader.h index 3347418..28674eb 100644 --- a/lib/Bitcode/Reader/BitcodeReader.h +++ b/lib/Bitcode/Reader/BitcodeReader.h @@ -148,6 +148,9 @@ class BitcodeReader : public GVMaterializer { /// are off by one. std::vector<AttributeSet> MAttributes; + /// \brief The set of attribute groups. + std::map<unsigned, AttributeSet> MAttributeGroups; + /// FunctionBBs - While parsing a function body, this is a list of the basic /// blocks for the function. 
std::vector<BasicBlock*> FunctionBBs; @@ -320,6 +323,7 @@ private: bool ParseModule(bool Resume); bool ParseAttributeBlock(); + bool ParseAttributeGroupBlock(); bool ParseTypeTable(); bool ParseTypeTableBody(); diff --git a/lib/Bitcode/Reader/BitstreamReader.cpp b/lib/Bitcode/Reader/BitstreamReader.cpp new file mode 100644 index 0000000..942346b --- /dev/null +++ b/lib/Bitcode/Reader/BitstreamReader.cpp @@ -0,0 +1,371 @@ +//===- BitstreamReader.cpp - BitstreamReader implementation ---------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Bitcode/BitstreamReader.h" + +using namespace llvm; + +//===----------------------------------------------------------------------===// +// BitstreamCursor implementation +//===----------------------------------------------------------------------===// + +void BitstreamCursor::operator=(const BitstreamCursor &RHS) { + freeState(); + + BitStream = RHS.BitStream; + NextChar = RHS.NextChar; + CurWord = RHS.CurWord; + BitsInCurWord = RHS.BitsInCurWord; + CurCodeSize = RHS.CurCodeSize; + + // Copy abbreviations, and bump ref counts. + CurAbbrevs = RHS.CurAbbrevs; + for (size_t i = 0, e = CurAbbrevs.size(); i != e; ++i) + CurAbbrevs[i]->addRef(); + + // Copy block scope and bump ref counts. + BlockScope = RHS.BlockScope; + for (size_t S = 0, e = BlockScope.size(); S != e; ++S) { + std::vector<BitCodeAbbrev*> &Abbrevs = BlockScope[S].PrevAbbrevs; + for (size_t i = 0, e = Abbrevs.size(); i != e; ++i) + Abbrevs[i]->addRef(); + } +} + +void BitstreamCursor::freeState() { + // Free all the Abbrevs. + for (size_t i = 0, e = CurAbbrevs.size(); i != e; ++i) + CurAbbrevs[i]->dropRef(); + CurAbbrevs.clear(); + + // Free all the Abbrevs in the block scope. + for (size_t S = 0, e = BlockScope.size(); S != e; ++S) { + std::vector<BitCodeAbbrev*> &Abbrevs = BlockScope[S].PrevAbbrevs; + for (size_t i = 0, e = Abbrevs.size(); i != e; ++i) + Abbrevs[i]->dropRef(); + } + BlockScope.clear(); +} + +/// EnterSubBlock - Having read the ENTER_SUBBLOCK abbrevid, enter +/// the block, and return true if the block has an error. +bool BitstreamCursor::EnterSubBlock(unsigned BlockID, unsigned *NumWordsP) { + // Save the current block's state on BlockScope. + BlockScope.push_back(Block(CurCodeSize)); + BlockScope.back().PrevAbbrevs.swap(CurAbbrevs); + + // Add the abbrevs specific to this block to the CurAbbrevs list. + if (const BitstreamReader::BlockInfo *Info = + BitStream->getBlockInfo(BlockID)) { + for (size_t i = 0, e = Info->Abbrevs.size(); i != e; ++i) { + CurAbbrevs.push_back(Info->Abbrevs[i]); + CurAbbrevs.back()->addRef(); + } + } + + // Get the codesize of this block. + CurCodeSize = ReadVBR(bitc::CodeLenWidth); + SkipToFourByteBoundary(); + unsigned NumWords = Read(bitc::BlockSizeWidth); + if (NumWordsP) *NumWordsP = NumWords; + + // Validate that this block is sane. + if (CurCodeSize == 0 || AtEndOfStream()) + return true; + + return false; +} + +void BitstreamCursor::readAbbreviatedLiteral(const BitCodeAbbrevOp &Op, + SmallVectorImpl<uint64_t> &Vals) { + assert(Op.isLiteral() && "Not a literal"); + // If the abbrev specifies the literal value to use, use it. 
+ Vals.push_back(Op.getLiteralValue()); +} + +void BitstreamCursor::readAbbreviatedField(const BitCodeAbbrevOp &Op, + SmallVectorImpl<uint64_t> &Vals) { + assert(!Op.isLiteral() && "Use ReadAbbreviatedLiteral for literals!"); + + // Decode the value as we are commanded. + switch (Op.getEncoding()) { + case BitCodeAbbrevOp::Array: + case BitCodeAbbrevOp::Blob: + assert(0 && "Should not reach here"); + case BitCodeAbbrevOp::Fixed: + Vals.push_back(Read((unsigned)Op.getEncodingData())); + break; + case BitCodeAbbrevOp::VBR: + Vals.push_back(ReadVBR64((unsigned)Op.getEncodingData())); + break; + case BitCodeAbbrevOp::Char6: + Vals.push_back(BitCodeAbbrevOp::DecodeChar6(Read(6))); + break; + } +} + +void BitstreamCursor::skipAbbreviatedField(const BitCodeAbbrevOp &Op) { + assert(!Op.isLiteral() && "Use ReadAbbreviatedLiteral for literals!"); + + // Decode the value as we are commanded. + switch (Op.getEncoding()) { + case BitCodeAbbrevOp::Array: + case BitCodeAbbrevOp::Blob: + assert(0 && "Should not reach here"); + case BitCodeAbbrevOp::Fixed: + (void)Read((unsigned)Op.getEncodingData()); + break; + case BitCodeAbbrevOp::VBR: + (void)ReadVBR64((unsigned)Op.getEncodingData()); + break; + case BitCodeAbbrevOp::Char6: + (void)Read(6); + break; + } +} + + + +/// skipRecord - Read the current record and discard it. +void BitstreamCursor::skipRecord(unsigned AbbrevID) { + // Skip unabbreviated records by reading past their entries. + if (AbbrevID == bitc::UNABBREV_RECORD) { + unsigned Code = ReadVBR(6); + (void)Code; + unsigned NumElts = ReadVBR(6); + for (unsigned i = 0; i != NumElts; ++i) + (void)ReadVBR64(6); + return; + } + + const BitCodeAbbrev *Abbv = getAbbrev(AbbrevID); + + for (unsigned i = 0, e = Abbv->getNumOperandInfos(); i != e; ++i) { + const BitCodeAbbrevOp &Op = Abbv->getOperandInfo(i); + if (Op.isLiteral()) + continue; + + if (Op.getEncoding() != BitCodeAbbrevOp::Array && + Op.getEncoding() != BitCodeAbbrevOp::Blob) { + skipAbbreviatedField(Op); + continue; + } + + if (Op.getEncoding() == BitCodeAbbrevOp::Array) { + // Array case. Read the number of elements as a vbr6. + unsigned NumElts = ReadVBR(6); + + // Get the element encoding. + assert(i+2 == e && "array op not second to last?"); + const BitCodeAbbrevOp &EltEnc = Abbv->getOperandInfo(++i); + + // Read all the elements. + for (; NumElts; --NumElts) + skipAbbreviatedField(EltEnc); + continue; + } + + assert(Op.getEncoding() == BitCodeAbbrevOp::Blob); + // Blob case. Read the number of bytes as a vbr6. + unsigned NumElts = ReadVBR(6); + SkipToFourByteBoundary(); // 32-bit alignment + + // Figure out where the end of this blob will be including tail padding. + size_t NewEnd = GetCurrentBitNo()+((NumElts+3)&~3)*8; + + // If this would read off the end of the bitcode file, just set the + // record to empty and return. + if (!canSkipToPos(NewEnd/8)) { + NextChar = BitStream->getBitcodeBytes().getExtent(); + break; + } + + // Skip over the blob. 
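skipRecord here and readRecord in the next hunk end a blob the same way: blob payloads are 32-bit aligned and tail-padded, so the cursor jumps to GetCurrentBitNo() + ((NumElts+3) & ~3) * 8. The rounding, isolated:

#include <stddef.h>

// Blob payloads are padded to a 4-byte boundary, so a 5-byte blob occupies
// 8 bytes on disk; the skip distance for NumElts payload bytes is:
static size_t blobBitsOnDisk(unsigned NumElts) {
  return ((NumElts + 3) & ~3u) * 8; // 5 -> 64 bits, 8 -> 64 bits
}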
+ JumpToBit(NewEnd); + } +} + +unsigned BitstreamCursor::readRecord(unsigned AbbrevID, + SmallVectorImpl<uint64_t> &Vals, + StringRef *Blob) { + if (AbbrevID == bitc::UNABBREV_RECORD) { + unsigned Code = ReadVBR(6); + unsigned NumElts = ReadVBR(6); + for (unsigned i = 0; i != NumElts; ++i) + Vals.push_back(ReadVBR64(6)); + return Code; + } + + const BitCodeAbbrev *Abbv = getAbbrev(AbbrevID); + + for (unsigned i = 0, e = Abbv->getNumOperandInfos(); i != e; ++i) { + const BitCodeAbbrevOp &Op = Abbv->getOperandInfo(i); + if (Op.isLiteral()) { + readAbbreviatedLiteral(Op, Vals); + continue; + } + + if (Op.getEncoding() != BitCodeAbbrevOp::Array && + Op.getEncoding() != BitCodeAbbrevOp::Blob) { + readAbbreviatedField(Op, Vals); + continue; + } + + if (Op.getEncoding() == BitCodeAbbrevOp::Array) { + // Array case. Read the number of elements as a vbr6. + unsigned NumElts = ReadVBR(6); + + // Get the element encoding. + assert(i+2 == e && "array op not second to last?"); + const BitCodeAbbrevOp &EltEnc = Abbv->getOperandInfo(++i); + + // Read all the elements. + for (; NumElts; --NumElts) + readAbbreviatedField(EltEnc, Vals); + continue; + } + + assert(Op.getEncoding() == BitCodeAbbrevOp::Blob); + // Blob case. Read the number of bytes as a vbr6. + unsigned NumElts = ReadVBR(6); + SkipToFourByteBoundary(); // 32-bit alignment + + // Figure out where the end of this blob will be including tail padding. + size_t CurBitPos = GetCurrentBitNo(); + size_t NewEnd = CurBitPos+((NumElts+3)&~3)*8; + + // If this would read off the end of the bitcode file, just set the + // record to empty and return. + if (!canSkipToPos(NewEnd/8)) { + Vals.append(NumElts, 0); + NextChar = BitStream->getBitcodeBytes().getExtent(); + break; + } + + // Otherwise, inform the streamer that we need these bytes in memory. + const char *Ptr = (const char*) + BitStream->getBitcodeBytes().getPointer(CurBitPos/8, NumElts); + + // If we can return a reference to the data, do so to avoid copying it. + if (Blob) { + *Blob = StringRef(Ptr, NumElts); + } else { + // Otherwise, unpack into Vals with zero extension. + for (; NumElts; --NumElts) + Vals.push_back((unsigned char)*Ptr++); + } + // Skip over tail padding. + JumpToBit(NewEnd); + } + + unsigned Code = (unsigned)Vals[0]; + Vals.erase(Vals.begin()); + return Code; +} + + +void BitstreamCursor::ReadAbbrevRecord() { + BitCodeAbbrev *Abbv = new BitCodeAbbrev(); + unsigned NumOpInfo = ReadVBR(5); + for (unsigned i = 0; i != NumOpInfo; ++i) { + bool IsLiteral = Read(1) ? true : false; + if (IsLiteral) { + Abbv->Add(BitCodeAbbrevOp(ReadVBR64(8))); + continue; + } + + BitCodeAbbrevOp::Encoding E = (BitCodeAbbrevOp::Encoding)Read(3); + if (BitCodeAbbrevOp::hasEncodingData(E)) { + unsigned Data = ReadVBR64(5); + + // As a special case, handle fixed(0) (i.e., a fixed field with zero bits) + // and vbr(0) as a literal zero. This is decoded the same way, and avoids + // a slow path in Read() to have to handle reading zero bits. + if ((E == BitCodeAbbrevOp::Fixed || E == BitCodeAbbrevOp::VBR) && + Data == 0) { + Abbv->Add(BitCodeAbbrevOp(0)); + continue; + } + + Abbv->Add(BitCodeAbbrevOp(E, Data)); + } else + Abbv->Add(BitCodeAbbrevOp(E)); + } + CurAbbrevs.push_back(Abbv); +} + +bool BitstreamCursor::ReadBlockInfoBlock() { + // If this is the second stream to get to the block info block, skip it. 
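readRecord's unabbreviated path above is built entirely on VBR6 fields: each 6-bit chunk contributes its low 5 bits of payload, and the chunk's top bit says whether another chunk follows. A minimal decoder sketch of that scheme (readChunk stands in for pulling Width bits off the stream):

#include <stdint.h>

static uint64_t decodeVBR(unsigned Width, uint64_t (*readChunk)()) {
  uint64_t Piece, Result = 0;
  unsigned Shift = 0;
  do {
    Piece = readChunk();
    Result |= (Piece & ((1ULL << (Width - 1)) - 1)) << Shift;
    Shift += Width - 1;
  } while (Piece & (1ULL << (Width - 1))); // continuation bit still set
  return Result; // e.g. VBR6 chunks 0x2A then 0x01 decode to 42
}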
+ if (BitStream->hasBlockInfoRecords()) + return SkipBlock(); + + if (EnterSubBlock(bitc::BLOCKINFO_BLOCK_ID)) return true; + + SmallVector<uint64_t, 64> Record; + BitstreamReader::BlockInfo *CurBlockInfo = 0; + + // Read all the records for this module. + while (1) { + BitstreamEntry Entry = advanceSkippingSubblocks(AF_DontAutoprocessAbbrevs); + + switch (Entry.Kind) { + case llvm::BitstreamEntry::SubBlock: // Handled for us already. + case llvm::BitstreamEntry::Error: + return true; + case llvm::BitstreamEntry::EndBlock: + return false; + case llvm::BitstreamEntry::Record: + // The interesting case. + break; + } + + // Read abbrev records, associate them with CurBID. + if (Entry.ID == bitc::DEFINE_ABBREV) { + if (!CurBlockInfo) return true; + ReadAbbrevRecord(); + + // ReadAbbrevRecord installs the abbrev in CurAbbrevs. Move it to the + // appropriate BlockInfo. + BitCodeAbbrev *Abbv = CurAbbrevs.back(); + CurAbbrevs.pop_back(); + CurBlockInfo->Abbrevs.push_back(Abbv); + continue; + } + + // Read a record. + Record.clear(); + switch (readRecord(Entry.ID, Record)) { + default: break; // Default behavior, ignore unknown content. + case bitc::BLOCKINFO_CODE_SETBID: + if (Record.size() < 1) return true; + CurBlockInfo = &BitStream->getOrCreateBlockInfo((unsigned)Record[0]); + break; + case bitc::BLOCKINFO_CODE_BLOCKNAME: { + if (!CurBlockInfo) return true; + if (BitStream->isIgnoringBlockInfoNames()) break; // Ignore name. + std::string Name; + for (unsigned i = 0, e = Record.size(); i != e; ++i) + Name += (char)Record[i]; + CurBlockInfo->Name = Name; + break; + } + case bitc::BLOCKINFO_CODE_SETRECORDNAME: { + if (!CurBlockInfo) return true; + if (BitStream->isIgnoringBlockInfoNames()) break; // Ignore name. + std::string Name; + for (unsigned i = 1, e = Record.size(); i != e; ++i) + Name += (char)Record[i]; + CurBlockInfo->RecordNames.push_back(std::make_pair((unsigned)Record[0], + Name)); + break; + } + } + } +} + diff --git a/lib/Bitcode/Reader/CMakeLists.txt b/lib/Bitcode/Reader/CMakeLists.txt index dfe7e10..f614c9f 100644 --- a/lib/Bitcode/Reader/CMakeLists.txt +++ b/lib/Bitcode/Reader/CMakeLists.txt @@ -1,6 +1,7 @@ add_llvm_library(LLVMBitReader BitReader.cpp BitcodeReader.cpp + BitstreamReader.cpp ) add_dependencies(LLVMBitReader intrinsics_gen) diff --git a/lib/Bitcode/Writer/BitcodeWriter.cpp b/lib/Bitcode/Writer/BitcodeWriter.cpp index 4ee762e..1b73f23 100644 --- a/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -161,7 +161,54 @@ static void WriteStringRecord(unsigned Code, StringRef Str, Stream.EmitRecord(Code, Vals, AbbrevToUse); } -// Emit information about parameter attributes. 
+static void WriteAttributeGroupTable(const ValueEnumerator &VE, + BitstreamWriter &Stream) { + const std::vector<AttributeSet> &AttrGrps = VE.getAttributeGroups(); + if (AttrGrps.empty()) return; + + Stream.EnterSubblock(bitc::PARAMATTR_GROUP_BLOCK_ID, 3); + + SmallVector<uint64_t, 64> Record; + for (unsigned i = 0, e = AttrGrps.size(); i != e; ++i) { + AttributeSet AS = AttrGrps[i]; + for (unsigned i = 0, e = AS.getNumSlots(); i != e; ++i) { + AttributeSet A = AS.getSlotAttributes(i); + + Record.push_back(VE.getAttributeGroupID(A)); + Record.push_back(AS.getSlotIndex(i)); + + for (AttributeSet::iterator I = AS.begin(0), E = AS.end(0); + I != E; ++I) { + Attribute Attr = *I; + if (Attr.isEnumAttribute()) { + Record.push_back(0); + Record.push_back(Attr.getKindAsEnum()); + } else if (Attr.isAlignAttribute()) { + Record.push_back(1); + Record.push_back(Attr.getKindAsEnum()); + Record.push_back(Attr.getValueAsInt()); + } else { + StringRef Kind = Attr.getKindAsString(); + StringRef Val = Attr.getValueAsString(); + + Record.push_back(Val.empty() ? 3 : 4); + Record.append(Kind.begin(), Kind.end()); + Record.push_back(0); + if (!Val.empty()) { + Record.append(Val.begin(), Val.end()); + Record.push_back(0); + } + } + } + + Stream.EmitRecord(bitc::PARAMATTR_GRP_CODE_ENTRY, Record); + Record.clear(); + } + } + + Stream.ExitBlock(); +} + static void WriteAttributeTable(const ValueEnumerator &VE, BitstreamWriter &Stream) { const std::vector<AttributeSet> &Attrs = VE.getAttributes(); @@ -172,11 +219,8 @@ static void WriteAttributeTable(const ValueEnumerator &VE, SmallVector<uint64_t, 64> Record; for (unsigned i = 0, e = Attrs.size(); i != e; ++i) { const AttributeSet &A = Attrs[i]; - for (unsigned i = 0, e = A.getNumSlots(); i != e; ++i) { - const AttributeWithIndex &PAWI = A.getSlot(i); - Record.push_back(PAWI.Index); - Record.push_back(Attribute::encodeLLVMAttributesForBitcode(PAWI.Attrs)); - } + for (unsigned i = 0, e = A.getNumSlots(); i != e; ++i) + Record.push_back(VE.getAttributeGroupID(A.getSlotAttributes(i))); Stream.EmitRecord(bitc::PARAMATTR_CODE_ENTRY, Record); Record.clear(); @@ -490,10 +534,11 @@ static void WriteModuleInfo(const Module *M, const ValueEnumerator &VE, Vals.push_back(GV->hasSection() ? 
SectionMap[GV->getSection()] : 0); if (GV->isThreadLocal() || GV->getVisibility() != GlobalValue::DefaultVisibility || - GV->hasUnnamedAddr()) { + GV->hasUnnamedAddr() || GV->isExternallyInitialized()) { Vals.push_back(getEncodedVisibility(GV)); Vals.push_back(getEncodedThreadLocalMode(GV)); Vals.push_back(GV->hasUnnamedAddr()); + Vals.push_back(GV->isExternallyInitialized()); } else { AbbrevToUse = SimpleGVarAbbrev; } @@ -1174,7 +1219,7 @@ static void WriteInstruction(const Instruction &I, unsigned InstID, case Instruction::Br: { Code = bitc::FUNC_CODE_INST_BR; - BranchInst &II = cast<BranchInst>(I); + const BranchInst &II = cast<BranchInst>(I); Vals.push_back(VE.getValueID(II.getSuccessor(0))); if (II.isConditional()) { Vals.push_back(VE.getValueID(II.getSuccessor(1))); @@ -1189,7 +1234,7 @@ static void WriteInstruction(const Instruction &I, unsigned InstID, SmallVector<uint64_t, 128> Vals64; Code = bitc::FUNC_CODE_INST_SWITCH; - SwitchInst &SI = cast<SwitchInst>(I); + const SwitchInst &SI = cast<SwitchInst>(I); uint32_t SwitchRecordHeader = SI.hash() | (SWITCH_INST_MAGIC << 16); Vals64.push_back(SwitchRecordHeader); @@ -1198,9 +1243,9 @@ static void WriteInstruction(const Instruction &I, unsigned InstID, pushValue64(SI.getCondition(), InstID, Vals64, VE); Vals64.push_back(VE.getValueID(SI.getDefaultDest())); Vals64.push_back(SI.getNumCases()); - for (SwitchInst::CaseIt i = SI.case_begin(), e = SI.case_end(); + for (SwitchInst::ConstCaseIt i = SI.case_begin(), e = SI.case_end(); i != e; ++i) { - IntegersSubset& CaseRanges = i.getCaseValueEx(); + const IntegersSubset& CaseRanges = i.getCaseValueEx(); unsigned Code, Abbrev; // will unused. if (CaseRanges.isSingleNumber()) { @@ -1829,6 +1874,9 @@ static void WriteModule(const Module *M, BitstreamWriter &Stream) { // Emit blockinfo, which defines the standard abbreviations etc. WriteBlockInfo(VE, Stream); + // Emit information about attribute groups. + WriteAttributeGroupTable(VE, Stream); + // Emit information about parameter attributes. WriteAttributeTable(VE, Stream); diff --git a/lib/Bitcode/Writer/ValueEnumerator.cpp b/lib/Bitcode/Writer/ValueEnumerator.cpp index b2f7875..4f19dd0 100644 --- a/lib/Bitcode/Writer/ValueEnumerator.cpp +++ b/lib/Bitcode/Writer/ValueEnumerator.cpp @@ -418,15 +418,26 @@ void ValueEnumerator::EnumerateOperandType(const Value *V) { EnumerateMetadata(V); } -void ValueEnumerator::EnumerateAttributes(const AttributeSet &PAL) { +void ValueEnumerator::EnumerateAttributes(AttributeSet PAL) { if (PAL.isEmpty()) return; // null is always 0. + // Do a lookup. - unsigned &Entry = AttributeMap[PAL.getRawPointer()]; + unsigned &Entry = AttributeMap[PAL]; if (Entry == 0) { // Never saw this before, add it. Attribute.push_back(PAL); Entry = Attribute.size(); } + + // Do lookups for all attribute groups. 
+ for (unsigned i = 0, e = PAL.getNumSlots(); i != e; ++i) { + AttributeSet AS = PAL.getSlotAttributes(i); + unsigned &Entry = AttributeGroupMap[AS]; + if (Entry == 0) { + AttributeGroups.push_back(AS); + Entry = AttributeGroups.size(); + } + } } void ValueEnumerator::incorporateFunction(const Function &F) { diff --git a/lib/Bitcode/Writer/ValueEnumerator.h b/lib/Bitcode/Writer/ValueEnumerator.h index 2d3d570..0af6164 100644 --- a/lib/Bitcode/Writer/ValueEnumerator.h +++ b/lib/Bitcode/Writer/ValueEnumerator.h @@ -52,7 +52,11 @@ private: SmallVector<const MDNode *, 8> FunctionLocalMDs; ValueMapType MDValueMap; - typedef DenseMap<void*, unsigned> AttributeMapType; + typedef DenseMap<AttributeSet, unsigned> AttributeGroupMapType; + AttributeGroupMapType AttributeGroupMap; + std::vector<AttributeSet> AttributeGroups; + + typedef DenseMap<AttributeSet, unsigned> AttributeMapType; AttributeMapType AttributeMap; std::vector<AttributeSet> Attribute; @@ -98,13 +102,20 @@ public: unsigned getInstructionID(const Instruction *I) const; void setInstructionID(const Instruction *I); - unsigned getAttributeID(const AttributeSet &PAL) const { + unsigned getAttributeID(AttributeSet PAL) const { if (PAL.isEmpty()) return 0; // Null maps to zero. - AttributeMapType::const_iterator I = AttributeMap.find(PAL.getRawPointer()); + AttributeMapType::const_iterator I = AttributeMap.find(PAL); assert(I != AttributeMap.end() && "Attribute not in ValueEnumerator!"); return I->second; } + unsigned getAttributeGroupID(AttributeSet PAL) const { + if (PAL.isEmpty()) return 0; // Null maps to zero. + AttributeGroupMapType::const_iterator I = AttributeGroupMap.find(PAL); + assert(I != AttributeGroupMap.end() && "Attribute not in ValueEnumerator!"); + return I->second; + } + /// getFunctionConstantRange - Return the range of values that corresponds to /// function-local constants. void getFunctionConstantRange(unsigned &Start, unsigned &End) const { @@ -124,6 +135,9 @@ public: const std::vector<AttributeSet> &getAttributes() const { return Attribute; } + const std::vector<AttributeSet> &getAttributeGroups() const { + return AttributeGroups; + } /// getGlobalBasicBlockID - This returns the function-specific ID for the /// specified basic block. This is relatively expensive information, so it @@ -146,7 +160,7 @@ private: void EnumerateValue(const Value *V); void EnumerateType(Type *T); void EnumerateOperandType(const Value *V); - void EnumerateAttributes(const AttributeSet &PAL); + void EnumerateAttributes(AttributeSet PAL); void EnumerateValueSymbolTable(const ValueSymbolTable &ST); void EnumerateNamedMetadata(const Module *M); diff --git a/lib/CodeGen/AggressiveAntiDepBreaker.cpp b/lib/CodeGen/AggressiveAntiDepBreaker.cpp index 152d9fa..c50f8b5 100644 --- a/lib/CodeGen/AggressiveAntiDepBreaker.cpp +++ b/lib/CodeGen/AggressiveAntiDepBreaker.cpp @@ -151,23 +151,7 @@ void AggressiveAntiDepBreaker::StartBlock(MachineBasicBlock *BB) { std::vector<unsigned> &KillIndices = State->GetKillIndices(); std::vector<unsigned> &DefIndices = State->GetDefIndices(); - // Determine the live-out physregs for this block. - if (IsReturnBlock) { - // In a return block, examine the function live-out regs. 
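The ValueEnumerator hunks above change the map key from AttributeSet's raw pointer to the uniqued AttributeSet itself (which needs only a DenseMapInfo specialization, since the sets are immutable and pointer-identified), and add a parallel table for attribute groups. Both tables use the same 1-based interning idiom so that ID 0 can keep meaning "no attributes"; in generic form:

#include "llvm/ADT/DenseMap.h"
#include <vector>

// Intern Val, handing out stable 1-based IDs; 0 stays reserved for "none",
// matching getAttributeID()/getAttributeGroupID() above.
template <typename T>
unsigned internID(llvm::DenseMap<T, unsigned> &Map, std::vector<T> &Table,
                  T Val) {
  unsigned &Entry = Map[Val];
  if (Entry == 0) {       // first sighting: append and assign the next ID
    Table.push_back(Val);
    Entry = Table.size(); // IDs start at 1
  }
  return Entry;
}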
- for (MachineRegisterInfo::liveout_iterator I = MRI.liveout_begin(), - E = MRI.liveout_end(); I != E; ++I) { - for (MCRegAliasIterator AI(*I, TRI, true); AI.isValid(); ++AI) { - unsigned Reg = *AI; - State->UnionGroups(Reg, 0); - KillIndices[Reg] = BB->size(); - DefIndices[Reg] = ~0u; - } - } - } - - // In a non-return block, examine the live-in regs of all successors. - // Note a return block can have successors if the return instruction is - // predicated. + // Examine the live-in regs of all successors. for (MachineBasicBlock::succ_iterator SI = BB->succ_begin(), SE = BB->succ_end(); SI != SE; ++SI) for (MachineBasicBlock::livein_iterator I = (*SI)->livein_begin(), diff --git a/lib/CodeGen/AllocationOrder.cpp b/lib/CodeGen/AllocationOrder.cpp index 94754a0..3fa1f8f 100644 --- a/lib/CodeGen/AllocationOrder.cpp +++ b/lib/CodeGen/AllocationOrder.cpp @@ -22,7 +22,6 @@ #include "llvm/CodeGen/VirtRegMap.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetMachine.h" using namespace llvm; @@ -45,4 +44,9 @@ AllocationOrder::AllocationOrder(unsigned VirtReg, dbgs() << '\n'; } }); +#ifndef NDEBUG + for (unsigned I = 0, E = Hints.size(); I != E; ++I) + assert(std::find(Order.begin(), Order.end(), Hints[I]) != Order.end() && + "Target hint is outside allocation order."); +#endif } diff --git a/lib/CodeGen/AllocationOrder.h b/lib/CodeGen/AllocationOrder.h index a5293f6..aed461a 100644 --- a/lib/CodeGen/AllocationOrder.h +++ b/lib/CodeGen/AllocationOrder.h @@ -39,6 +39,9 @@ public: const VirtRegMap &VRM, const RegisterClassInfo &RegClassInfo); + /// Get the allocation order without reordered hints. + ArrayRef<MCPhysReg> getOrder() const { return Order; } + /// Return the next physical register in the allocation order, or 0. /// It is safe to call next() again after it returned 0, it will keep /// returning 0 until rewind() is called. @@ -53,6 +56,18 @@ public: return 0; } + /// As next(), but allow duplicates to be returned, and stop before the + /// Limit'th register in the RegisterClassInfo allocation order. + /// + /// This can produce more than Limit registers if there are hints. + unsigned nextWithDups(unsigned Limit) { + if (Pos < 0) + return Hints.end()[Pos++]; + if (Pos < int(Limit)) + return Order[Pos++]; + return 0; + } + /// Start over from the beginning. void rewind() { Pos = -int(Hints.size()); } diff --git a/lib/CodeGen/Analysis.cpp b/lib/CodeGen/Analysis.cpp index aaba144..c7abf7a 100644 --- a/lib/CodeGen/Analysis.cpp +++ b/lib/CodeGen/Analysis.cpp @@ -265,8 +265,7 @@ static const Value *getNoopInput(const Value *V, const TargetLowering &TLI) { /// between it and the return. /// /// This function only tests target-independent requirements. -bool llvm::isInTailCallPosition(ImmutableCallSite CS, Attribute CalleeRetAttr, - const TargetLowering &TLI) { +bool llvm::isInTailCallPosition(ImmutableCallSite CS,const TargetLowering &TLI){ const Instruction *I = CS.getInstruction(); const BasicBlock *ExitBB = I->getParent(); const TerminatorInst *Term = ExitBB->getTerminator(); @@ -312,14 +311,16 @@ bool llvm::isInTailCallPosition(ImmutableCallSite CS, Attribute CalleeRetAttr, // Conservatively require the attributes of the call to match those of // the return. Ignore noalias because it doesn't affect the call sequence. 
const Function *F = ExitBB->getParent(); - Attribute CallerRetAttr = F->getAttributes().getRetAttributes(); - if (AttrBuilder(CalleeRetAttr).removeAttribute(Attribute::NoAlias) != - AttrBuilder(CallerRetAttr).removeAttribute(Attribute::NoAlias)) + AttributeSet CallerAttrs = F->getAttributes(); + if (AttrBuilder(CallerAttrs, AttributeSet::ReturnIndex). + removeAttribute(Attribute::NoAlias) != + AttrBuilder(CallerAttrs, AttributeSet::ReturnIndex). + removeAttribute(Attribute::NoAlias)) return false; // It's not safe to eliminate the sign / zero extension of the return value. - if (CallerRetAttr.hasAttribute(Attribute::ZExt) || - CallerRetAttr.hasAttribute(Attribute::SExt)) + if (CallerAttrs.hasAttribute(AttributeSet::ReturnIndex, Attribute::ZExt) || + CallerAttrs.hasAttribute(AttributeSet::ReturnIndex, Attribute::SExt)) return false; // Otherwise, make sure the unmodified return value of I is the return value. diff --git a/lib/CodeGen/AsmPrinter/ARMException.cpp b/lib/CodeGen/AsmPrinter/ARMException.cpp index 1728331..188047d 100644 --- a/lib/CodeGen/AsmPrinter/ARMException.cpp +++ b/lib/CodeGen/AsmPrinter/ARMException.cpp @@ -32,12 +32,11 @@ #include "llvm/Support/FormattedStream.h" #include "llvm/Target/Mangler.h" #include "llvm/Target/TargetFrameLowering.h" -#include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" #include "llvm/Target/TargetRegisterInfo.h" using namespace llvm; -cl::opt<bool> +static cl::opt<bool> EnableARMEHABIDescriptors("arm-enable-ehabi-descriptors", cl::Hidden, cl::desc("Generate ARM EHABI tables with unwinding descriptors"), cl::init(false)); diff --git a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index cbcc290..aa9d43b 100644 --- a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -391,9 +391,9 @@ void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) { // - pointer to mangled symbol above with initializer unsigned PtrSize = TD->getPointerSizeInBits()/8; OutStreamer.EmitSymbolValue(GetExternalSymbolSymbol("_tlv_bootstrap"), - PtrSize, 0); - OutStreamer.EmitIntValue(0, PtrSize, 0); - OutStreamer.EmitSymbolValue(MangSym, PtrSize, 0); + PtrSize); + OutStreamer.EmitIntValue(0, PtrSize); + OutStreamer.EmitSymbolValue(MangSym, PtrSize); OutStreamer.AddBlankLine(); return; @@ -1040,7 +1040,7 @@ void AsmPrinter::EmitConstantPool() { // Emit inter-object padding for alignment. unsigned AlignMask = CPE.getAlignment() - 1; unsigned NewOffset = (Offset + AlignMask) & ~AlignMask; - OutStreamer.EmitFill(NewOffset - Offset, 0/*fillval*/, 0/*addrspace*/); + OutStreamer.EmitZeros(NewOffset - Offset); Type *Ty = CPE.getType(); Offset = NewOffset + TM.getDataLayout()->getTypeAllocSize(Ty); @@ -1203,7 +1203,7 @@ void AsmPrinter::EmitJumpTableEntry(const MachineJumpTableInfo *MJTI, assert(Value && "Unknown entry kind!"); unsigned EntrySize = MJTI->getEntrySize(*TM.getDataLayout()); - OutStreamer.EmitValue(Value, EntrySize, /*addrspace*/0); + OutStreamer.EmitValue(Value, EntrySize); } @@ -1326,19 +1326,19 @@ void AsmPrinter::EmitXXStructorList(const Constant *List, bool isCtor) { /// EmitInt8 - Emit a byte directive and value. /// void AsmPrinter::EmitInt8(int Value) const { - OutStreamer.EmitIntValue(Value, 1, 0/*addrspace*/); + OutStreamer.EmitIntValue(Value, 1); } /// EmitInt16 - Emit a short directive and value. 
/// void AsmPrinter::EmitInt16(int Value) const { - OutStreamer.EmitIntValue(Value, 2, 0/*addrspace*/); + OutStreamer.EmitIntValue(Value, 2); } /// EmitInt32 - Emit a long directive and value. /// void AsmPrinter::EmitInt32(int Value) const { - OutStreamer.EmitIntValue(Value, 4, 0/*addrspace*/); + OutStreamer.EmitIntValue(Value, 4); } /// EmitLabelDifference - Emit something like ".long Hi-Lo" where the size @@ -1353,14 +1353,14 @@ void AsmPrinter::EmitLabelDifference(const MCSymbol *Hi, const MCSymbol *Lo, OutContext); if (!MAI->hasSetDirective()) { - OutStreamer.EmitValue(Diff, Size, 0/*AddrSpace*/); + OutStreamer.EmitValue(Diff, Size); return; } // Otherwise, emit with .set (aka assignment). MCSymbol *SetLabel = GetTempSymbol("set", SetCounter++); OutStreamer.EmitAssignment(SetLabel, Diff); - OutStreamer.EmitSymbolValue(SetLabel, Size, 0/*AddrSpace*/); + OutStreamer.EmitSymbolValue(SetLabel, Size); } /// EmitLabelOffsetDifference - Emit something like ".long Hi+Offset-Lo" @@ -1384,12 +1384,12 @@ void AsmPrinter::EmitLabelOffsetDifference(const MCSymbol *Hi, uint64_t Offset, OutContext); if (!MAI->hasSetDirective()) - OutStreamer.EmitValue(Diff, 4, 0/*AddrSpace*/); + OutStreamer.EmitValue(Diff, 4); else { // Otherwise, emit with .set (aka assignment). MCSymbol *SetLabel = GetTempSymbol("set", SetCounter++); OutStreamer.EmitAssignment(SetLabel, Diff); - OutStreamer.EmitSymbolValue(SetLabel, 4, 0/*AddrSpace*/); + OutStreamer.EmitSymbolValue(SetLabel, 4); } } @@ -1407,7 +1407,7 @@ void AsmPrinter::EmitLabelPlusOffset(const MCSymbol *Label, uint64_t Offset, MCConstantExpr::Create(Offset, OutContext), OutContext); - OutStreamer.EmitValue(Expr, Size, 0/*AddrSpace*/); + OutStreamer.EmitValue(Expr, Size); } @@ -1746,90 +1746,48 @@ static void emitGlobalConstantStruct(const ConstantStruct *CS, static void emitGlobalConstantFP(const ConstantFP *CFP, unsigned AddrSpace, AsmPrinter &AP) { - if (CFP->getType()->isHalfTy()) { - if (AP.isVerbose()) { - SmallString<10> Str; - CFP->getValueAPF().toString(Str); - AP.OutStreamer.GetCommentOS() << "half " << Str << '\n'; - } - uint64_t Val = CFP->getValueAPF().bitcastToAPInt().getZExtValue(); - AP.OutStreamer.EmitIntValue(Val, 2, AddrSpace); - return; - } - - if (CFP->getType()->isFloatTy()) { - if (AP.isVerbose()) { - float Val = CFP->getValueAPF().convertToFloat(); - uint64_t IntVal = CFP->getValueAPF().bitcastToAPInt().getZExtValue(); - AP.OutStreamer.GetCommentOS() << "float " << Val << '\n' - << " (" << format("0x%x", IntVal) << ")\n"; - } - uint64_t Val = CFP->getValueAPF().bitcastToAPInt().getZExtValue(); - AP.OutStreamer.EmitIntValue(Val, 4, AddrSpace); - return; - } + APInt API = CFP->getValueAPF().bitcastToAPInt(); - // FP Constants are printed as integer constants to avoid losing - // precision. - if (CFP->getType()->isDoubleTy()) { - if (AP.isVerbose()) { - double Val = CFP->getValueAPF().convertToDouble(); - uint64_t IntVal = CFP->getValueAPF().bitcastToAPInt().getZExtValue(); - AP.OutStreamer.GetCommentOS() << "double " << Val << '\n' - << " (" << format("0x%lx", IntVal) << ")\n"; - } + // First print a comment with what we think the original floating-point value + // should have been. 
+ if (AP.isVerbose()) { + SmallString<8> StrVal; + CFP->getValueAPF().toString(StrVal); - uint64_t Val = CFP->getValueAPF().bitcastToAPInt().getZExtValue(); - AP.OutStreamer.EmitIntValue(Val, 8, AddrSpace); - return; + CFP->getType()->print(AP.OutStreamer.GetCommentOS()); + AP.OutStreamer.GetCommentOS() << ' ' << StrVal << '\n'; } - if (CFP->getType()->isX86_FP80Ty() || CFP->getType()->isFP128Ty()) { - // all long double variants are printed as hex - // API needed to prevent premature destruction - APInt API = CFP->getValueAPF().bitcastToAPInt(); - const uint64_t *p = API.getRawData(); - if (AP.isVerbose()) { - // Convert to double so we can print the approximate val as a comment. - SmallString<8> StrVal; - CFP->getValueAPF().toString(StrVal); + // Now iterate through the APInt chunks, emitting them in endian-correct + // order, possibly with a smaller chunk at beginning/end (e.g. for x87 80-bit + // floats). + unsigned NumBytes = API.getBitWidth() / 8; + unsigned TrailingBytes = NumBytes % sizeof(uint64_t); + const uint64_t *p = API.getRawData(); - const char *TyNote = CFP->getType()->isFP128Ty() ? "fp128 " : "x86_fp80 "; - AP.OutStreamer.GetCommentOS() << TyNote << StrVal << '\n'; - } + // PPC's long double has odd notions of endianness compared to how LLVM + // handles it: p[0] goes first for *big* endian on PPC. + if (AP.TM.getDataLayout()->isBigEndian() != CFP->getType()->isPPC_FP128Ty()) { + int Chunk = API.getNumWords() - 1; - // The 80-bit type is made of a 64-bit and 16-bit value, the 128-bit has 2 - // 64-bit words. - uint32_t TrailingSize = CFP->getType()->isFP128Ty() ? 8 : 2; + if (TrailingBytes) + AP.OutStreamer.EmitIntValue(p[Chunk--], TrailingBytes, AddrSpace); - if (AP.TM.getDataLayout()->isBigEndian()) { - AP.OutStreamer.EmitIntValue(p[1], TrailingSize, AddrSpace); - AP.OutStreamer.EmitIntValue(p[0], 8, AddrSpace); - } else { - AP.OutStreamer.EmitIntValue(p[0], 8, AddrSpace); - AP.OutStreamer.EmitIntValue(p[1], TrailingSize, AddrSpace); - } + for (; Chunk >= 0; --Chunk) + AP.OutStreamer.EmitIntValue(p[Chunk], sizeof(uint64_t), AddrSpace); + } else { + unsigned Chunk; + for (Chunk = 0; Chunk < NumBytes / sizeof(uint64_t); ++Chunk) + AP.OutStreamer.EmitIntValue(p[Chunk], sizeof(uint64_t), AddrSpace); - // Emit the tail padding for the long double. - const DataLayout &TD = *AP.TM.getDataLayout(); - AP.OutStreamer.EmitZeros(TD.getTypeAllocSize(CFP->getType()) - - TD.getTypeStoreSize(CFP->getType()), AddrSpace); - return; + if (TrailingBytes) + AP.OutStreamer.EmitIntValue(p[Chunk], TrailingBytes, AddrSpace); } - assert(CFP->getType()->isPPC_FP128Ty() && - "Floating point constant type not handled"); - // All long double variants are printed as hex - // API needed to prevent premature destruction. - APInt API = CFP->getValueAPF().bitcastToAPInt(); - const uint64_t *p = API.getRawData(); - if (AP.TM.getDataLayout()->isBigEndian()) { - AP.OutStreamer.EmitIntValue(p[0], 8, AddrSpace); - AP.OutStreamer.EmitIntValue(p[1], 8, AddrSpace); - } else { - AP.OutStreamer.EmitIntValue(p[1], 8, AddrSpace); - AP.OutStreamer.EmitIntValue(p[0], 8, AddrSpace); - } + // Emit the tail padding for the long double. 
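// Worked example of the loop above (illustration, not part of the patch):
// an x86_fp80 constant is an 80-bit APInt, so NumBytes is 10 and
// TrailingBytes is 2. On a little-endian target the else-branch emits p[0]
// (the 64-bit significand) as one full chunk and then the low two bytes of
// p[1] (sign and exponent); big-endian targets take the first branch and
// emit the two trailing bytes first (with PPC's fp128 deliberately flipped,
// as noted above). The tail padding below then covers
// getTypeAllocSize() - getTypeStoreSize(), e.g. 6 zero bytes when the
// 10 stored bytes of an x86_fp80 sit in 16 allocated bytes.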
+ const DataLayout &TD = *AP.TM.getDataLayout(); + AP.OutStreamer.EmitZeros(TD.getTypeAllocSize(CFP->getType()) - + TD.getTypeStoreSize(CFP->getType()), AddrSpace); } static void emitGlobalConstantLargeInt(const ConstantInt *CI, diff --git a/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp b/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp index ece92d8..156acac 100644 --- a/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp +++ b/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp @@ -46,7 +46,7 @@ void AsmPrinter::EmitULEB128(unsigned Value, const char *Desc, if (isVerbose() && Desc) OutStreamer.AddComment(Desc); - OutStreamer.EmitULEB128IntValue(Value, 0/*addrspace*/, PadTo); + OutStreamer.EmitULEB128IntValue(Value, PadTo); } /// EmitCFAByte - Emit a .byte 42 directive for a DW_CFA_xxx value. @@ -58,7 +58,7 @@ void AsmPrinter::EmitCFAByte(unsigned Val) const { else OutStreamer.AddComment(dwarf::CallFrameString(Val)); } - OutStreamer.EmitIntValue(Val, 1, 0/*addrspace*/); + OutStreamer.EmitIntValue(Val, 1); } static const char *DecodeDWARFEncoding(unsigned Encoding) { @@ -102,7 +102,7 @@ void AsmPrinter::EmitEncodingByte(unsigned Val, const char *Desc) const { DecodeDWARFEncoding(Val)); } - OutStreamer.EmitIntValue(Val, 1, 0/*addrspace*/); + OutStreamer.EmitIntValue(Val, 1); } /// GetSizeOfEncodedValue - Return the size of the encoding in bytes. @@ -126,9 +126,9 @@ void AsmPrinter::EmitTTypeReference(const GlobalValue *GV, const MCExpr *Exp = TLOF.getTTypeGlobalReference(GV, Mang, MMI, Encoding, OutStreamer); - OutStreamer.EmitValue(Exp, GetSizeOfEncodedValue(Encoding), /*addrspace*/0); + OutStreamer.EmitValue(Exp, GetSizeOfEncodedValue(Encoding)); } else - OutStreamer.EmitIntValue(0, GetSizeOfEncodedValue(Encoding), 0); + OutStreamer.EmitIntValue(0, GetSizeOfEncodedValue(Encoding)); } /// EmitSectionOffset - Emit the 4-byte offset of Label from the start of its @@ -157,7 +157,7 @@ void AsmPrinter::EmitSectionOffset(const MCSymbol *Label, // If the section in question will end up with an address of 0 anyway, we can // just emit an absolute reference to save a relocation. if (Section.isBaseAddressKnownZero()) { - OutStreamer.EmitSymbolValue(Label, 4, 0/*AddrSpace*/); + OutStreamer.EmitSymbolValue(Label, 4); return; } diff --git a/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp b/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp index 4890627..8b9a884 100644 --- a/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp +++ b/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp @@ -38,7 +38,7 @@ using namespace llvm; namespace { struct SrcMgrDiagInfo { const MDNode *LocInfo; - LLVMContext::DiagHandlerTy DiagHandler; + LLVMContext::InlineAsmDiagHandlerTy DiagHandler; void *DiagContext; }; } @@ -89,15 +89,15 @@ void AsmPrinter::EmitInlineAsm(StringRef Str, const MDNode *LocMDNode, SourceMgr SrcMgr; SrcMgrDiagInfo DiagInfo; - // If the current LLVMContext has a diagnostic handler, set it in SourceMgr. + // If the current LLVMContext has an inline asm handler, set it in SourceMgr. LLVMContext &LLVMCtx = MMI->getModule()->getContext(); bool HasDiagHandler = false; - if (LLVMCtx.getDiagnosticHandler() != 0) { + if (LLVMCtx.getInlineAsmDiagnosticHandler() != 0) { // If the source manager has an issue, we arrange for srcMgrDiagHandler // to be invoked, getting DiagInfo passed into it. 
DiagInfo.LocInfo = LocMDNode; - DiagInfo.DiagHandler = LLVMCtx.getDiagnosticHandler(); - DiagInfo.DiagContext = LLVMCtx.getDiagnosticContext(); + DiagInfo.DiagHandler = LLVMCtx.getInlineAsmDiagnosticHandler(); + DiagInfo.DiagContext = LLVMCtx.getInlineAsmDiagnosticContext(); SrcMgr.setDiagHandler(srcMgrDiagHandler, &DiagInfo); HasDiagHandler = true; } diff --git a/lib/CodeGen/AsmPrinter/DIE.cpp b/lib/CodeGen/AsmPrinter/DIE.cpp index a913ca4..4ded281 100644 --- a/lib/CodeGen/AsmPrinter/DIE.cpp +++ b/lib/CodeGen/AsmPrinter/DIE.cpp @@ -193,18 +193,20 @@ void DIEInteger::EmitValue(AsmPrinter *Asm, unsigned Form) const { case dwarf::DW_FORM_data1: Size = 1; break; case dwarf::DW_FORM_ref2: // Fall thru case dwarf::DW_FORM_data2: Size = 2; break; + case dwarf::DW_FORM_sec_offset: // Fall thru case dwarf::DW_FORM_ref4: // Fall thru case dwarf::DW_FORM_data4: Size = 4; break; case dwarf::DW_FORM_ref8: // Fall thru case dwarf::DW_FORM_data8: Size = 8; break; case dwarf::DW_FORM_GNU_str_index: Asm->EmitULEB128(Integer); return; + case dwarf::DW_FORM_GNU_addr_index: Asm->EmitULEB128(Integer); return; case dwarf::DW_FORM_udata: Asm->EmitULEB128(Integer); return; case dwarf::DW_FORM_sdata: Asm->EmitSLEB128(Integer); return; case dwarf::DW_FORM_addr: Size = Asm->getDataLayout().getPointerSize(); break; default: llvm_unreachable("DIE Value form not supported yet"); } - Asm->OutStreamer.EmitIntValue(Integer, Size, 0/*addrspace*/); + Asm->OutStreamer.EmitIntValue(Integer, Size); } /// SizeOf - Determine size of integer value in bytes. @@ -217,11 +219,13 @@ unsigned DIEInteger::SizeOf(AsmPrinter *AP, unsigned Form) const { case dwarf::DW_FORM_data1: return sizeof(int8_t); case dwarf::DW_FORM_ref2: // Fall thru case dwarf::DW_FORM_data2: return sizeof(int16_t); + case dwarf::DW_FORM_sec_offset: // Fall thru case dwarf::DW_FORM_ref4: // Fall thru case dwarf::DW_FORM_data4: return sizeof(int32_t); case dwarf::DW_FORM_ref8: // Fall thru case dwarf::DW_FORM_data8: return sizeof(int64_t); case dwarf::DW_FORM_GNU_str_index: return MCAsmInfo::getULEB128Size(Integer); + case dwarf::DW_FORM_GNU_addr_index: return MCAsmInfo::getULEB128Size(Integer); case dwarf::DW_FORM_udata: return MCAsmInfo::getULEB128Size(Integer); case dwarf::DW_FORM_sdata: return MCAsmInfo::getSLEB128Size(Integer); case dwarf::DW_FORM_addr: return AP->getDataLayout().getPointerSize(); @@ -243,13 +247,14 @@ void DIEInteger::print(raw_ostream &O) { /// EmitValue - Emit label value. /// void DIELabel::EmitValue(AsmPrinter *AP, unsigned Form) const { - AP->OutStreamer.EmitSymbolValue(Label, SizeOf(AP, Form), 0/*AddrSpace*/); + AP->OutStreamer.EmitSymbolValue(Label, SizeOf(AP, Form)); } /// SizeOf - Determine size of label value in bytes. 
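/// (In 32-bit DWARF, DW_FORM_sec_offset and DW_FORM_strp are 4-byte section
/// offsets; the 8-byte 64-bit DWARF variant is not handled here, matching
/// the "FIXME: DWARF64 is 8" note elsewhere in this patch.)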
/// unsigned DIELabel::SizeOf(AsmPrinter *AP, unsigned Form) const { if (Form == dwarf::DW_FORM_data4) return 4; + if (Form == dwarf::DW_FORM_sec_offset) return 4; if (Form == dwarf::DW_FORM_strp) return 4; return AP->getDataLayout().getPointerSize(); } diff --git a/lib/CodeGen/AsmPrinter/DwarfAccelTable.cpp b/lib/CodeGen/AsmPrinter/DwarfAccelTable.cpp index c193999..f58ec9b 100644 --- a/lib/CodeGen/AsmPrinter/DwarfAccelTable.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfAccelTable.cpp @@ -173,7 +173,7 @@ void DwarfAccelTable::EmitOffsets(AsmPrinter *Asm, MCSymbol *SecBegin) { MCBinaryExpr::CreateSub(MCSymbolRefExpr::Create((*HI)->Sym, Context), MCSymbolRefExpr::Create(SecBegin, Context), Context); - Asm->OutStreamer.EmitValue(Sub, sizeof(uint32_t), 0); + Asm->OutStreamer.EmitValue(Sub, sizeof(uint32_t)); } } } diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp index 21cceaf..93b00fb 100644 --- a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp @@ -170,6 +170,42 @@ void CompileUnit::addLabel(DIE *Die, unsigned Attribute, unsigned Form, Die->addValue(Attribute, Form, Value); } +/// addLabelAddress - Add a dwarf label attribute data and value using +/// DW_FORM_addr or DW_FORM_GNU_addr_index. +/// +void CompileUnit::addLabelAddress(DIE *Die, unsigned Attribute, + MCSymbol *Label) { + if (!DD->useSplitDwarf()) { + if (Label != NULL) { + DIEValue *Value = new (DIEValueAllocator) DIELabel(Label); + Die->addValue(Attribute, dwarf::DW_FORM_addr, Value); + } else { + DIEValue *Value = new (DIEValueAllocator) DIEInteger(0); + Die->addValue(Attribute, dwarf::DW_FORM_addr, Value); + } + } else { + unsigned idx = DU->getAddrPoolIndex(Label); + DIEValue *Value = new (DIEValueAllocator) DIEInteger(idx); + Die->addValue(Attribute, dwarf::DW_FORM_GNU_addr_index, Value); + } +} + +/// addOpAddress - Add a dwarf op address data and value using the +/// form given and an op of either DW_FORM_addr or DW_FORM_GNU_addr_index. +/// +void CompileUnit::addOpAddress(DIE *Die, MCSymbol *Sym) { + + if (!DD->useSplitDwarf()) { + addUInt(Die, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_addr); + addLabel(Die, 0, dwarf::DW_FORM_udata, Sym); + } else { + unsigned idx = DU->getAddrPoolIndex(Sym); + DIEValue *Value = new (DIEValueAllocator) DIEInteger(idx); + addUInt(Die, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_GNU_addr_index); + Die->addValue(0, dwarf::DW_FORM_GNU_addr_index, Value); + } +} + /// addDelta - Add a label delta attribute data and value. /// void CompileUnit::addDelta(DIE *Die, unsigned Attribute, unsigned Form, @@ -592,10 +628,21 @@ bool CompileUnit::addConstantFPValue(DIE *Die, const MachineOperand &MO) { return true; } +/// addConstantFPValue - Add constant value entry in variable DIE. +bool CompileUnit::addConstantFPValue(DIE *Die, const ConstantFP *CFP) { + return addConstantValue(Die, CFP->getValueAPF().bitcastToAPInt(), false); +} + /// addConstantValue - Add constant value entry in variable DIE. bool CompileUnit::addConstantValue(DIE *Die, const ConstantInt *CI, bool Unsigned) { - unsigned CIBitWidth = CI->getBitWidth(); + return addConstantValue(Die, CI->getValue(), Unsigned); +} + +// addConstantValue - Add constant value entry in variable DIE. 
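// For example, a value that fits in 64 bits is attached directly, using the
// smallest fixed-size form that holds it (DW_FORM_data1/2/4/8) or a
// signed/unsigned LEB128 form; a 128-bit constant instead becomes a
// DW_FORM_block of its 16 raw bytes, as the body below shows.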
+bool CompileUnit::addConstantValue(DIE *Die, const APInt &Val, + bool Unsigned) { + unsigned CIBitWidth = Val.getBitWidth(); if (CIBitWidth <= 64) { unsigned form = 0; switch (CIBitWidth) { @@ -607,16 +654,15 @@ bool CompileUnit::addConstantValue(DIE *Die, const ConstantInt *CI, form = Unsigned ? dwarf::DW_FORM_udata : dwarf::DW_FORM_sdata; } if (Unsigned) - addUInt(Die, dwarf::DW_AT_const_value, form, CI->getZExtValue()); + addUInt(Die, dwarf::DW_AT_const_value, form, Val.getZExtValue()); else - addSInt(Die, dwarf::DW_AT_const_value, form, CI->getSExtValue()); + addSInt(Die, dwarf::DW_AT_const_value, form, Val.getSExtValue()); return true; } DIEBlock *Block = new (DIEValueAllocator) DIEBlock(); // Get the raw data form of the large APInt. - const APInt Val = CI->getValue(); const uint64_t *Ptr64 = Val.getRawData(); int NumBytes = Val.getBitWidth() / 8; // 8 bits per byte. @@ -650,18 +696,21 @@ void CompileUnit::addTemplateParams(DIE &Buffer, DIArray TParams) { } } +/// getOrCreateContextDIE - Get context owner's DIE. +DIE *CompileUnit::getOrCreateContextDIE(DIDescriptor Context) { + if (Context.isType()) + return getOrCreateTypeDIE(DIType(Context)); + else if (Context.isNameSpace()) + return getOrCreateNameSpace(DINameSpace(Context)); + else if (Context.isSubprogram()) + return getOrCreateSubprogramDIE(DISubprogram(Context)); + else + return getDIE(Context); +} + /// addToContextOwner - Add Die into the list of its context owner's children. void CompileUnit::addToContextOwner(DIE *Die, DIDescriptor Context) { - if (Context.isType()) { - DIE *ContextDIE = getOrCreateTypeDIE(DIType(Context)); - ContextDIE->addChild(Die); - } else if (Context.isNameSpace()) { - DIE *ContextDIE = getOrCreateNameSpace(DINameSpace(Context)); - ContextDIE->addChild(Die); - } else if (Context.isSubprogram()) { - DIE *ContextDIE = getOrCreateSubprogramDIE(DISubprogram(Context)); - ContextDIE->addChild(Die); - } else if (DIE *ContextDIE = getDIE(Context)) + if (DIE *ContextDIE = getOrCreateContextDIE(Context)) ContextDIE->addChild(Die); else addDie(Die); @@ -864,6 +913,8 @@ void CompileUnit::constructTypeDIE(DIE &Buffer, DICompositeType CTy) { } else { DIE *Arg = new DIE(dwarf::DW_TAG_formal_parameter); addType(Arg, DIType(Ty)); + if (DIType(Ty).isArtificial()) + addFlag(Arg, dwarf::DW_AT_artificial); Buffer.addChild(Arg); } } @@ -905,22 +956,15 @@ void CompileUnit::constructTypeDIE(DIE &Buffer, DICompositeType CTy) { dwarf::DW_ACCESS_public); if (SP.isExplicit()) addFlag(ElemDie, dwarf::DW_AT_explicit); - } - else if (Element.isVariable()) { - DIVariable DV(Element); - ElemDie = new DIE(dwarf::DW_TAG_variable); - addString(ElemDie, dwarf::DW_AT_name, DV.getName()); - addType(ElemDie, DV.getType()); - addFlag(ElemDie, dwarf::DW_AT_declaration); - addFlag(ElemDie, dwarf::DW_AT_external); - addSourceLine(ElemDie, DV); } else if (Element.isDerivedType()) { DIDerivedType DDTy(Element); if (DDTy.getTag() == dwarf::DW_TAG_friend) { ElemDie = new DIE(dwarf::DW_TAG_friend); addType(ElemDie, DDTy.getTypeDerivedFrom(), dwarf::DW_AT_friend); - } else - ElemDie = createMemberDIE(DIDerivedType(Element)); + } else if (DDTy.isStaticMember()) + ElemDie = createStaticMemberDIE(DDTy); + else + ElemDie = createMemberDIE(DDTy); } else if (Element.isObjCProperty()) { DIObjCProperty Property(Element); ElemDie = new DIE(Property.getTag()); @@ -1236,39 +1280,56 @@ void CompileUnit::createGlobalVariableDIE(const MDNode *N) { if (!GV.Verify()) return; - DIE *VariableDIE = new DIE(GV.getTag()); - // Add to map. 
- insertDIE(N, VariableDIE); - - // Add name. - addString(VariableDIE, dwarf::DW_AT_name, GV.getDisplayName()); - StringRef LinkageName = GV.getLinkageName(); - bool isGlobalVariable = GV.getGlobal() != NULL; - if (!LinkageName.empty() && isGlobalVariable) - addString(VariableDIE, dwarf::DW_AT_MIPS_linkage_name, - getRealLinkageName(LinkageName)); - // Add type. + DIDescriptor GVContext = GV.getContext(); DIType GTy = GV.getType(); - addType(VariableDIE, GTy); - // Add scoping info. - if (!GV.isLocalToUnit()) - addFlag(VariableDIE, dwarf::DW_AT_external); + // If this is a static data member definition, some attributes belong + // to the declaration DIE. + DIE *VariableDIE = NULL; + bool IsStaticMember = false; + DIDerivedType SDMDecl = GV.getStaticDataMemberDeclaration(); + if (SDMDecl.Verify()) { + assert(SDMDecl.isStaticMember() && "Expected static member decl"); + // We need the declaration DIE that is in the static member's class. + // But that class might not exist in the DWARF yet. + // Creating the class will create the static member decl DIE. + getOrCreateContextDIE(SDMDecl.getContext()); + VariableDIE = getDIE(SDMDecl); + assert(VariableDIE && "Static member decl has no context?"); + IsStaticMember = true; + } + + // If this is not a static data member definition, create the variable + // DIE and add the initial set of attributes to it. + if (!VariableDIE) { + VariableDIE = new DIE(GV.getTag()); + // Add to map. + insertDIE(N, VariableDIE); + + // Add name and type. + addString(VariableDIE, dwarf::DW_AT_name, GV.getDisplayName()); + addType(VariableDIE, GTy); + + // Add scoping info. + if (!GV.isLocalToUnit()) { + addFlag(VariableDIE, dwarf::DW_AT_external); + addGlobalName(GV.getName(), VariableDIE); + } + + // Add line number info. + addSourceLine(VariableDIE, GV); + // Add to context owner. + addToContextOwner(VariableDIE, GVContext); + } - // Add line number info. - addSourceLine(VariableDIE, GV); - // Add to context owner. - DIDescriptor GVContext = GV.getContext(); - addToContextOwner(VariableDIE, GVContext); // Add location. bool addToAccelTable = false; DIE *VariableSpecDIE = NULL; + bool isGlobalVariable = GV.getGlobal() != NULL; if (isGlobalVariable) { addToAccelTable = true; DIEBlock *Block = new (DIEValueAllocator) DIEBlock(); - addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_addr); - addLabel(Block, 0, dwarf::DW_FORM_udata, - Asm->Mang->getSymbol(GV.getGlobal())); + addOpAddress(Block, Asm->Mang->getSymbol(GV.getGlobal())); // Do not create specification DIE if context is either compile unit // or a subprogram. if (GVContext && GV.isDefinition() && !GVContext.isCompileUnit() && @@ -1278,22 +1339,31 @@ void CompileUnit::createGlobalVariableDIE(const MDNode *N) { addDIEEntry(VariableSpecDIE, dwarf::DW_AT_specification, dwarf::DW_FORM_ref4, VariableDIE); addBlock(VariableSpecDIE, dwarf::DW_AT_location, 0, Block); - addFlag(VariableDIE, dwarf::DW_AT_declaration); + // A static member's declaration is already flagged as such. + if (!SDMDecl.Verify()) + addFlag(VariableDIE, dwarf::DW_AT_declaration); addDie(VariableSpecDIE); } else { addBlock(VariableDIE, dwarf::DW_AT_location, 0, Block); } + // Add linkage name. 
+    StringRef LinkageName = GV.getLinkageName();
+    if (!LinkageName.empty() && isGlobalVariable)
+      addString(VariableDIE, dwarf::DW_AT_MIPS_linkage_name,
+                getRealLinkageName(LinkageName));
   } else if (const ConstantInt *CI =
-             dyn_cast_or_null<ConstantInt>(GV.getConstant()))
-    addConstantValue(VariableDIE, CI, GTy.isUnsignedDIType());
-  else if (const ConstantExpr *CE = getMergedGlobalExpr(N->getOperand(11))) {
+             dyn_cast_or_null<ConstantInt>(GV.getConstant())) {
+    // AT_const_value was added when the static member was created. To avoid
+    // emitting AT_const_value multiple times, we only add AT_const_value when
+    // it is not a static member.
+    if (!IsStaticMember)
+      addConstantValue(VariableDIE, CI, GTy.isUnsignedDIType());
+  } else if (const ConstantExpr *CE = getMergedGlobalExpr(N->getOperand(11))) {
     addToAccelTable = true;
     // GV is a merged global.
     DIEBlock *Block = new (DIEValueAllocator) DIEBlock();
     Value *Ptr = CE->getOperand(0);
-    addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_addr);
-    addLabel(Block, 0, dwarf::DW_FORM_udata,
-             Asm->Mang->getSymbol(cast<GlobalValue>(Ptr)));
+    addOpAddress(Block, Asm->Mang->getSymbol(cast<GlobalValue>(Ptr)));
     addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_constu);
     SmallVector<Value*, 3> Idx(CE->op_begin()+1, CE->op_end());
     addUInt(Block, 0, dwarf::DW_FORM_udata,
@@ -1618,3 +1688,38 @@ DIE *CompileUnit::createMemberDIE(DIDerivedType DT) {
   }
   return MemberDie;
 }
+
+/// createStaticMemberDIE - Create new DIE for C++ static member.
+DIE *CompileUnit::createStaticMemberDIE(const DIDerivedType DT) {
+  if (!DT.Verify())
+    return NULL;
+
+  DIE *StaticMemberDIE = new DIE(DT.getTag());
+  DIType Ty = DT.getTypeDerivedFrom();
+
+  addString(StaticMemberDIE, dwarf::DW_AT_name, DT.getName());
+  addType(StaticMemberDIE, Ty);
+  addSourceLine(StaticMemberDIE, DT);
+  addFlag(StaticMemberDIE, dwarf::DW_AT_external);
+  addFlag(StaticMemberDIE, dwarf::DW_AT_declaration);
+
+  // FIXME: We could omit private if the parent is a class_type, and
+  // public if the parent is something else.
+  if (DT.isProtected())
+    addUInt(StaticMemberDIE, dwarf::DW_AT_accessibility, dwarf::DW_FORM_data1,
+            dwarf::DW_ACCESS_protected);
+  else if (DT.isPrivate())
+    addUInt(StaticMemberDIE, dwarf::DW_AT_accessibility, dwarf::DW_FORM_data1,
+            dwarf::DW_ACCESS_private);
+  else
+    addUInt(StaticMemberDIE, dwarf::DW_AT_accessibility, dwarf::DW_FORM_data1,
+            dwarf::DW_ACCESS_public);
+
+  if (const ConstantInt *CI = dyn_cast_or_null<ConstantInt>(DT.getConstant()))
+    addConstantValue(StaticMemberDIE, CI, Ty.isUnsignedDIType());
+  if (const ConstantFP *CFP = dyn_cast_or_null<ConstantFP>(DT.getConstant()))
+    addConstantFPValue(StaticMemberDIE, CFP);
+
+  insertDIE(DT, StaticMemberDIE);
+  return StaticMemberDIE;
+}
diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
index f210dcc..77bf6a9 100644
--- a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
+++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
@@ -27,6 +27,7 @@ class DwarfUnits;
 class MachineLocation;
 class MachineOperand;
 class ConstantInt;
+class ConstantFP;
 class DbgVariable;
 
 //===----------------------------------------------------------------------===//
@@ -63,6 +64,10 @@ class CompileUnit {
   /// descriptors to debug information entries using a DIEEntry proxy.
   DenseMap<const MDNode *, DIEEntry *> MDNodeToDIEEntryMap;
 
+  /// GlobalNames - A map of globally visible named entities for this unit.
+  ///
+  StringMap<DIE*> GlobalNames;
+
   /// GlobalTypes - A map of globally visible types for this unit.
/// StringMap<DIE*> GlobalTypes; @@ -86,6 +91,9 @@ class CompileUnit { /// DWARF version doesn't handle the language, return -1. int64_t getDefaultLowerBound() const; + /// getOrCreateContextDIE - Get context owner's DIE. + DIE *getOrCreateContextDIE(DIDescriptor Context); + public: CompileUnit(unsigned UID, unsigned L, DIE *D, AsmPrinter *A, DwarfDebug *DW, DwarfUnits *); @@ -95,6 +103,7 @@ public: unsigned getUniqueID() const { return UniqueID; } unsigned getLanguage() const { return Language; } DIE* getCUDie() const { return CUDie.get(); } + const StringMap<DIE*> &getGlobalNames() const { return GlobalNames; } const StringMap<DIE*> &getGlobalTypes() const { return GlobalTypes; } const StringMap<std::vector<DIE*> > &getAccelNames() const { @@ -115,6 +124,10 @@ public: /// bool hasContent() const { return !CUDie->getChildren().empty(); } + /// addGlobalName - Add a new global entity to the compile unit. + /// + void addGlobalName(StringRef Name, DIE *Die) { GlobalNames[Name] = Die; } + /// addGlobalType - Add a new global type to the compile unit. /// void addGlobalType(DIType Ty); @@ -207,6 +220,16 @@ public: void addLabel(DIE *Die, unsigned Attribute, unsigned Form, const MCSymbol *Label); + /// addLabelAddress - Add a dwarf label attribute data and value using + /// either DW_FORM_addr or DW_FORM_GNU_addr_index. + /// + void addLabelAddress(DIE *Die, unsigned Attribute, MCSymbol *Label); + + /// addOpAddress - Add a dwarf op address data and value using the + /// form given and an op of either DW_FORM_addr or DW_FORM_GNU_addr_index. + /// + void addOpAddress(DIE *Die, MCSymbol *Label); + /// addDelta - Add a label delta attribute data and value. /// void addDelta(DIE *Die, unsigned Attribute, unsigned Form, @@ -237,9 +260,11 @@ public: /// addConstantValue - Add constant value entry in variable DIE. bool addConstantValue(DIE *Die, const MachineOperand &MO, DIType Ty); bool addConstantValue(DIE *Die, const ConstantInt *CI, bool Unsigned); + bool addConstantValue(DIE *Die, const APInt &Val, bool Unsigned); /// addConstantFPValue - Add constant value entry in variable DIE. bool addConstantFPValue(DIE *Die, const MachineOperand &MO); + bool addConstantFPValue(DIE *Die, const ConstantFP *CFP); /// addTemplateParams - Add template parameters in buffer. void addTemplateParams(DIE &Buffer, DIArray TParams); @@ -339,6 +364,9 @@ public: /// createMemberDIE - Create new member DIE. DIE *createMemberDIE(DIDerivedType DT); + /// createStaticMemberDIE - Create new static data member DIE. + DIE *createStaticMemberDIE(DIDerivedType DT); + private: // DIEValueAllocator - All DIEValues are allocated through this allocator. 
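To make the static-data-member support in the two DwarfCompileUnit files above concrete, here is a hedged illustration (invented source; the exact DWARF depends on the frontend and target):

    // example.cpp - a C++ static data member with a constant initializer.
    struct Widget {
      static const int Kind = 7;   // in-class declaration
    };
    const int Widget::Kind;        // out-of-class definition

createStaticMemberDIE() emits the in-class declaration as a DW_TAG_member child of Widget's type DIE carrying DW_AT_declaration, DW_AT_external, an accessibility code and, because the initializer is constant, DW_AT_const_value(7). createGlobalVariableDIE() then locates that declaration via getStaticDataMemberDeclaration() and hangs the definition's DW_AT_specification (plus DW_AT_location, when the variable is emitted) off it instead of duplicating the name, type and constant.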
diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp index 66a5a6d..87659ef 100644 --- a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp @@ -54,6 +54,10 @@ static cl::opt<bool> UnknownLocations("use-unknown-locations", cl::Hidden, cl::desc("Make an absence of debug location information explicit."), cl::init(false)); +static cl::opt<bool> GenerateDwarfPubNamesSection("generate-dwarf-pubnames", + cl::Hidden, cl::ZeroOrMore, cl::init(false), + cl::desc("Generate DWARF pubnames section")); + namespace { enum DefaultOnOff { Default, Enable, Disable @@ -159,14 +163,13 @@ DwarfDebug::DwarfDebug(AsmPrinter *A, Module *M) PrevLabel(NULL), GlobalCUIndexCount(0), InfoHolder(A, &AbbreviationsSet, &Abbreviations, "info_string", DIEValueAllocator), - SkeletonCU(0), SkeletonAbbrevSet(InitAbbreviationsSetSize), SkeletonHolder(A, &SkeletonAbbrevSet, &SkeletonAbbrevs, "skel_string", DIEValueAllocator) { DwarfInfoSectionSym = DwarfAbbrevSectionSym = 0; DwarfStrSectionSym = TextSectionSym = 0; - DwarfDebugRangeSectionSym = DwarfDebugLocSectionSym = 0; + DwarfDebugRangeSectionSym = DwarfDebugLocSectionSym = DwarfLineSectionSym = 0; DwarfAbbrevDWOSectionSym = DwarfStrDWOSectionSym = 0; FunctionBeginSym = FunctionEndSym = 0; @@ -237,6 +240,15 @@ unsigned DwarfUnits::getStringPoolIndex(StringRef Str) { return Entry.second; } +unsigned DwarfUnits::getAddrPoolIndex(MCSymbol *Sym) { + std::pair<MCSymbol*, unsigned> &Entry = AddressPool[Sym]; + if (Entry.first) return Entry.second; + + Entry.second = NextAddrPoolNumber++; + Entry.first = Sym; + return Entry.second; +} + // Define a unique number for the abbreviation. // void DwarfUnits::assignAbbrevNumber(DIEAbbrev &Abbrev) { @@ -384,10 +396,12 @@ DIE *DwarfDebug::updateSubprogramScopeDIE(CompileUnit *SPCU, } } - SPCU->addLabel(SPDie, dwarf::DW_AT_low_pc, dwarf::DW_FORM_addr, - Asm->GetTempSymbol("func_begin", Asm->getFunctionNumber())); - SPCU->addLabel(SPDie, dwarf::DW_AT_high_pc, dwarf::DW_FORM_addr, - Asm->GetTempSymbol("func_end", Asm->getFunctionNumber())); + SPCU->addLabelAddress(SPDie, dwarf::DW_AT_low_pc, + Asm->GetTempSymbol("func_begin", + Asm->getFunctionNumber())); + SPCU->addLabelAddress(SPDie, dwarf::DW_AT_high_pc, + Asm->GetTempSymbol("func_end", + Asm->getFunctionNumber())); const TargetRegisterInfo *RI = Asm->TM.getRegisterInfo(); MachineLocation Location(RI->getFrameRegister(*Asm->MF)); SPCU->addAddress(SPDie, dwarf::DW_AT_frame_base, Location); @@ -429,16 +443,16 @@ DIE *DwarfDebug::constructLexicalScopeDIE(CompileUnit *TheCU, return ScopeDIE; } - const MCSymbol *Start = getLabelBeforeInsn(RI->first); - const MCSymbol *End = getLabelAfterInsn(RI->second); + MCSymbol *Start = getLabelBeforeInsn(RI->first); + MCSymbol *End = getLabelAfterInsn(RI->second); if (End == 0) return 0; assert(Start->isDefined() && "Invalid starting label for an inlined scope!"); assert(End->isDefined() && "Invalid end label for an inlined scope!"); - TheCU->addLabel(ScopeDIE, dwarf::DW_AT_low_pc, dwarf::DW_FORM_addr, Start); - TheCU->addLabel(ScopeDIE, dwarf::DW_AT_high_pc, dwarf::DW_FORM_addr, End); + TheCU->addLabelAddress(ScopeDIE, dwarf::DW_AT_low_pc, Start); + TheCU->addLabelAddress(ScopeDIE, dwarf::DW_AT_high_pc, End); return ScopeDIE; } @@ -462,8 +476,8 @@ DIE *DwarfDebug::constructInlinedScopeDIE(CompileUnit *TheCU, } SmallVector<InsnRange, 4>::const_iterator RI = Ranges.begin(); - const MCSymbol *StartLabel = getLabelBeforeInsn(RI->first); - const MCSymbol *EndLabel = 
getLabelAfterInsn(RI->second);
+  MCSymbol *StartLabel = getLabelBeforeInsn(RI->first);
+  MCSymbol *EndLabel = getLabelAfterInsn(RI->second);
 
   if (StartLabel == 0 || EndLabel == 0) {
     llvm_unreachable("Unexpected Start and End labels for an inlined scope!");
@@ -492,10 +506,8 @@ DIE *DwarfDebug::constructInlinedScopeDIE(CompileUnit *TheCU,
       DebugRangeSymbols.push_back(NULL);
       DebugRangeSymbols.push_back(NULL);
     } else {
-      TheCU->addLabel(ScopeDIE, dwarf::DW_AT_low_pc, dwarf::DW_FORM_addr,
-                      StartLabel);
-      TheCU->addLabel(ScopeDIE, dwarf::DW_AT_high_pc, dwarf::DW_FORM_addr,
-                      EndLabel);
+      TheCU->addLabelAddress(ScopeDIE, dwarf::DW_AT_low_pc, StartLabel);
+      TheCU->addLabelAddress(ScopeDIE, dwarf::DW_AT_high_pc, EndLabel);
     }
 
     InlinedSubprogramDIEs.insert(OriginDIE);
@@ -531,6 +543,12 @@ DIE *DwarfDebug::constructScopeDIE(CompileUnit *TheCU, LexicalScope *Scope) {
   if (!Scope || !Scope->getScopeNode())
     return NULL;
 
+  DIScope DS(Scope->getScopeNode());
+  // Early return to avoid creating dangling variable|scope DIEs.
+  if (!Scope->getInlinedAt() && DS.isSubprogram() && Scope->isAbstractScope() &&
+      !TheCU->getDIE(DS))
+    return NULL;
+
   SmallVector<DIE *, 8> Children;
   DIE *ObjectPointer = NULL;
 
@@ -556,7 +574,6 @@ DIE *DwarfDebug::constructScopeDIE(CompileUnit *TheCU, LexicalScope *Scope) {
   for (unsigned j = 0, M = Scopes.size(); j < M; ++j)
     if (DIE *Nested = constructScopeDIE(TheCU, Scopes[j]))
       Children.push_back(Nested);
-  DIScope DS(Scope->getScopeNode());
   DIE *ScopeDIE = NULL;
   if (Scope->getInlinedAt())
     ScopeDIE = constructInlinedScopeDIE(TheCU, Scope);
@@ -646,15 +663,28 @@ CompileUnit *DwarfDebug::constructCompileUnit(const MDNode *N) {
                                        DIUnit.getLanguage());
   NewCU->addString(Die, dwarf::DW_AT_name, FN);
   // 2.17.1 requires that we use DW_AT_low_pc for a single entry point
-  // into an entity.
-  NewCU->addUInt(Die, dwarf::DW_AT_low_pc, dwarf::DW_FORM_addr, 0);
+  // into an entity. We're using 0 (or a NULL label) for this.
+  NewCU->addLabelAddress(Die, dwarf::DW_AT_low_pc, NULL);
+
+  // Define start line table label for each Compile Unit.
+  MCSymbol *LineTableStartSym = Asm->GetTempSymbol("line_table_start",
+                                                   NewCU->getUniqueID());
+  Asm->OutStreamer.getContext().setMCLineTableSymbol(LineTableStartSym,
+                                                     NewCU->getUniqueID());
+
   // DW_AT_stmt_list is an offset of line number information for this
   // compile unit in debug_line section.
+  // The line table entries are not always emitted in assembly, so it
+  // is not okay to use line_table_start here.
   if (Asm->MAI->doesDwarfUseRelocationsAcrossSections())
     NewCU->addLabel(Die, dwarf::DW_AT_stmt_list, dwarf::DW_FORM_data4,
-                    Asm->GetTempSymbol("section_line"));
-  else
+                    NewCU->getUniqueID() == 0 ?
+                    Asm->GetTempSymbol("section_line") : LineTableStartSym);
+  else if (NewCU->getUniqueID() == 0)
     NewCU->addUInt(Die, dwarf::DW_AT_stmt_list, dwarf::DW_FORM_data4, 0);
+  else
+    NewCU->addDelta(Die, dwarf::DW_AT_stmt_list, dwarf::DW_FORM_data4,
+                    LineTableStartSym, DwarfLineSectionSym);
 
   if (!CompilationDir.empty())
     NewCU->addString(Die, dwarf::DW_AT_comp_dir, CompilationDir);
@@ -671,8 +701,13 @@ CompileUnit *DwarfDebug::constructCompileUnit(const MDNode *N) {
 
   if (!FirstCU)
     FirstCU = NewCU;
-  if (useSplitDwarf() && !SkeletonCU)
-    SkeletonCU = constructSkeletonCU(N);
+
+  if (useSplitDwarf()) {
+    // This should be a unique identifier when we want to build .dwp files.
+    NewCU->addUInt(Die, dwarf::DW_AT_GNU_dwo_id, dwarf::DW_FORM_data8, 0);
+    // Now construct the skeleton CU associated.
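// (The skeleton CU stays in the main object file and carries just enough
// for a debugger to find and verify the split-out payload: DW_AT_GNU_dwo_name,
// a matching DW_AT_GNU_dwo_id, DW_AT_GNU_addr_base and stmt_list/comp_dir
// anchors; see constructSkeletonCU further down.)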
+ constructSkeletonCU(N); + } InfoHolder.addUnit(NewCU); @@ -702,7 +737,9 @@ void DwarfDebug::constructSubprogramDIE(CompileUnit *TheCU, // Add to context owner. TheCU->addToContextOwner(SubprogramDie, SP.getContext()); - return; + // Expose as global, if requested. + if (GenerateDwarfPubNamesSection) + TheCU->addGlobalName(SP.getName(), SubprogramDie); } // Collect debug info from named mdnodes such as llvm.dbg.enum and llvm.dbg.ty. @@ -754,6 +791,9 @@ bool DwarfDebug::collectLegacyDebugInfo(const Module *M) { } if (!HasDebugInfo) return false; + // Emit initial sections so we can refer to them later. + emitSectionLabels(); + // Create all the compile unit DIEs. for (DebugInfoFinder::iterator I = DbgFinder.compile_unit_begin(), E = DbgFinder.compile_unit_end(); I != E; ++I) @@ -791,6 +831,9 @@ void DwarfDebug::beginModule() { // module using debug info finder to collect debug info. NamedMDNode *CU_Nodes = M->getNamedMetadata("llvm.dbg.cu"); if (CU_Nodes) { + // Emit initial sections so we can reference labels later. + emitSectionLabels(); + for (unsigned i = 0, e = CU_Nodes->getNumOperands(); i != e; ++i) { DICompileUnit CUNode(CU_Nodes->getOperand(i)); CompileUnit *CU = constructCompileUnit(CUNode); @@ -923,9 +966,6 @@ void DwarfDebug::endModule() { // Finalize the debug info for the module. finalizeModuleInfo(); - // Emit initial sections. - emitSectionLabels(); - if (!useSplitDwarf()) { // Emit all the DIEs into a debug info section. emitDebugInfo(); @@ -975,6 +1015,9 @@ void DwarfDebug::endModule() { // Emit info into a debug macinfo section. emitDebugMacInfo(); + // Emit DWO addresses. + InfoHolder.emitAddresses(Asm->getObjFileLowering().getDwarfAddrSection()); + // Emit inline info. // TODO: When we don't need the option anymore we // can remove all of the code that this section @@ -991,6 +1034,10 @@ void DwarfDebug::endModule() { emitAccelTypes(); } + // Emit info into a debug pubnames section, if requested. + if (GenerateDwarfPubNamesSection) + emitDebugPubnames(); + // Emit info into a debug pubtypes section. // TODO: When we don't need the option anymore we can // remove all of the code that adds to the table. @@ -1008,11 +1055,12 @@ void DwarfDebug::endModule() { E = CUMap.end(); I != E; ++I) delete I->second; - delete SkeletonCU; + for (SmallVector<CompileUnit *, 1>::iterator I = SkeletonCUs.begin(), + E = SkeletonCUs.end(); I != E; ++I) + delete *I; // Reset these for the next Module if we have one. FirstCU = NULL; - SkeletonCU = NULL; } // Find abstract variable, if any, associated with Var. @@ -1179,7 +1227,7 @@ DwarfDebug::collectVariableInfo(const MachineFunction *MF, continue; } - // handle multiple DBG_VALUE instructions describing one variable. + // Handle multiple DBG_VALUE instructions describing one variable. RegVar->setDotDebugLocOffset(DotDebugLocEntries.size()); for (SmallVectorImpl<const MachineInstr*>::const_iterator @@ -1234,14 +1282,14 @@ DwarfDebug::collectVariableInfo(const MachineFunction *MF, } // Return Label preceding the instruction. -const MCSymbol *DwarfDebug::getLabelBeforeInsn(const MachineInstr *MI) { +MCSymbol *DwarfDebug::getLabelBeforeInsn(const MachineInstr *MI) { MCSymbol *Label = LabelsBeforeInsn.lookup(MI); assert(Label && "Didn't insert label before instruction"); return Label; } // Return Label immediately following the instruction. 
-const MCSymbol *DwarfDebug::getLabelAfterInsn(const MachineInstr *MI) { +MCSymbol *DwarfDebug::getLabelAfterInsn(const MachineInstr *MI) { return LabelsAfterInsn.lookup(MI); } @@ -1377,6 +1425,13 @@ void DwarfDebug::beginFunction(const MachineFunction *MF) { if (LScopes.empty()) return; identifyScopeMarkers(); + // Set DwarfCompileUnitID in MCContext to the Compile Unit this function + // belongs to. + LexicalScope *FnScope = LScopes.getCurrentFunctionScope(); + CompileUnit *TheCU = SPMap.lookup(FnScope->getScopeNode()); + assert(TheCU && "Unable to find compile unit!"); + Asm->OutStreamer.getContext().setDwarfCompileUnitID(TheCU->getUniqueID()); + FunctionBeginSym = Asm->GetTempSymbol("func_begin", Asm->getFunctionNumber()); // Assumes in correct section after the entry point. @@ -1561,6 +1616,8 @@ void DwarfDebug::endFunction(const MachineFunction *MF) { Asm->getFunctionNumber()); // Assumes in correct section after the entry point. Asm->OutStreamer.EmitLabel(FunctionEndSym); + // Set DwarfCompileUnitID in MCContext to default value. + Asm->OutStreamer.getContext().setDwarfCompileUnitID(0); SmallPtrSet<const MDNode *, 16> ProcessedVars; collectVariableInfo(MF, ProcessedVars); @@ -1734,8 +1791,11 @@ void DwarfDebug::emitSectionLabels() { if (const MCSection *MacroInfo = TLOF.getDwarfMacroInfoSection()) emitSectionSym(Asm, MacroInfo); - emitSectionSym(Asm, TLOF.getDwarfLineSection(), "section_line"); + DwarfLineSectionSym = + emitSectionSym(Asm, TLOF.getDwarfLineSection(), "section_line"); emitSectionSym(Asm, TLOF.getDwarfLocSection()); + if (GenerateDwarfPubNamesSection) + emitSectionSym(Asm, TLOF.getDwarfPubNamesSection()); emitSectionSym(Asm, TLOF.getDwarfPubTypesSection()); DwarfStrSectionSym = emitSectionSym(Asm, TLOF.getDwarfStrSection(), "info_string"); @@ -1942,8 +2002,7 @@ void DwarfDebug::emitEndOfLineMatrix(unsigned SectionEnd) { Asm->OutStreamer.AddComment("Section end label"); Asm->OutStreamer.EmitSymbolValue(Asm->GetTempSymbol("section_end",SectionEnd), - Asm->getDataLayout().getPointerSize(), - 0/*AddrSpace*/); + Asm->getDataLayout().getPointerSize()); // Mark end of matrix. Asm->OutStreamer.AddComment("DW_LNE_end_sequence"); @@ -2072,6 +2131,61 @@ void DwarfDebug::emitAccelTypes() { AT.Emit(Asm, SectionBegin, &InfoHolder); } +/// emitDebugPubnames - Emit visible names into a debug pubnames section. +/// +void DwarfDebug::emitDebugPubnames() { + const MCSection *ISec = Asm->getObjFileLowering().getDwarfInfoSection(); + + typedef DenseMap<const MDNode*, CompileUnit*> CUMapType; + for (CUMapType::iterator I = CUMap.begin(), E = CUMap.end(); I != E; ++I) { + CompileUnit *TheCU = I->second; + unsigned ID = TheCU->getUniqueID(); + + if (TheCU->getGlobalNames().empty()) + continue; + + // Start the dwarf pubnames section. 
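// The output that follows has the standard .debug_pubnames shape: a length
// field and DWARF version, the offset and size of this CU's .debug_info
// contribution, then one (DIE offset, name) pair per global entity,
// terminated by a 4-byte zero end mark.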
+    Asm->OutStreamer.SwitchSection(
+      Asm->getObjFileLowering().getDwarfPubNamesSection());
+
+    Asm->OutStreamer.AddComment("Length of Public Names Info");
+    Asm->EmitLabelDifference(Asm->GetTempSymbol("pubnames_end", ID),
+                             Asm->GetTempSymbol("pubnames_begin", ID), 4);
+
+    Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("pubnames_begin", ID));
+
+    Asm->OutStreamer.AddComment("DWARF Version");
+    Asm->EmitInt16(dwarf::DWARF_VERSION);
+
+    Asm->OutStreamer.AddComment("Offset of Compilation Unit Info");
+    Asm->EmitSectionOffset(Asm->GetTempSymbol(ISec->getLabelBeginName(), ID),
+                           DwarfInfoSectionSym);
+
+    Asm->OutStreamer.AddComment("Compilation Unit Length");
+    Asm->EmitLabelDifference(Asm->GetTempSymbol(ISec->getLabelEndName(), ID),
+                             Asm->GetTempSymbol(ISec->getLabelBeginName(), ID),
+                             4);
+
+    const StringMap<DIE*> &Globals = TheCU->getGlobalNames();
+    for (StringMap<DIE*>::const_iterator
+           GI = Globals.begin(), GE = Globals.end(); GI != GE; ++GI) {
+      const char *Name = GI->getKeyData();
+      const DIE *Entity = GI->second;
+
+      Asm->OutStreamer.AddComment("DIE offset");
+      Asm->EmitInt32(Entity->getOffset());
+
+      if (Asm->isVerbose())
+        Asm->OutStreamer.AddComment("External Name");
+      Asm->OutStreamer.EmitBytes(StringRef(Name, strlen(Name)+1), 0);
+    }
+
+    Asm->OutStreamer.AddComment("End Mark");
+    Asm->EmitInt32(0);
+    Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("pubnames_end", ID));
+  }
+}
+
 void DwarfDebug::emitDebugPubTypes() {
   for (DenseMap<const MDNode *, CompileUnit *>::iterator I = CUMap.begin(),
          E = CUMap.end(); I != E; ++I) {
@@ -2114,7 +2228,7 @@ void DwarfDebug::emitDebugPubTypes() {
       if (Asm->isVerbose())
         Asm->OutStreamer.AddComment("External Name");
       // Emit the name with a terminating null byte.
-      Asm->OutStreamer.EmitBytes(StringRef(Name, GI->getKeyLength()+1), 0);
+      Asm->OutStreamer.EmitBytes(StringRef(Name, GI->getKeyLength()+1));
     }
 
     Asm->OutStreamer.AddComment("End Mark");
@@ -2152,22 +2266,53 @@ void DwarfUnits::emitStrings(const MCSection *StrSection,
 
     // Emit the string itself with a terminating null byte.
     Asm->OutStreamer.EmitBytes(StringRef(Entries[i].second->getKeyData(),
-                                         Entries[i].second->getKeyLength()+1),
-                               0/*addrspace*/);
+                                         Entries[i].second->getKeyLength()+1));
   }
 
   // If we've got an offset section go ahead and emit that now as well.
   if (OffsetSection) {
     Asm->OutStreamer.SwitchSection(OffsetSection);
     unsigned offset = 0;
-    unsigned size = 4;
+    unsigned size = 4; // FIXME: DWARF64 is 8.
     for (unsigned i = 0, e = Entries.size(); i != e; ++i) {
-      Asm->OutStreamer.EmitIntValue(offset, size, 0);
+      Asm->OutStreamer.EmitIntValue(offset, size);
       offset += Entries[i].second->getKeyLength() + 1;
     }
   }
 }
 
+// Emit addresses into an address section.
+void DwarfUnits::emitAddresses(const MCSection *AddrSection) {
+
+  if (AddressPool.empty()) return;
+
+  // Start the dwarf addr section.
+  Asm->OutStreamer.SwitchSection(AddrSection);
+
+  // Get all of the address pool entries and put them in an array by their ID
+  // so we can sort them.
+  SmallVector<std::pair<unsigned,
+                        std::pair<MCSymbol*, unsigned>* >, 64> Entries;
+
+  for (DenseMap<MCSymbol*, std::pair<MCSymbol*, unsigned> >::iterator
+         I = AddressPool.begin(), E = AddressPool.end();
+       I != E; ++I)
+    Entries.push_back(std::make_pair(I->second.second, &(I->second)));
+
+  array_pod_sort(Entries.begin(), Entries.end());
+
+  for (unsigned i = 0, e = Entries.size(); i != e; ++i) {
+    // Emit a label for reference from debug information entries.
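// Slot i of the emitted table has to line up with the index handed out by
// getAddrPoolIndex(), which is why the entries were sorted by index above;
// DW_FORM_GNU_addr_index attributes refer to these slots by position.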
+    MCSymbol *Sym = Entries[i].second->first;
+    if (Sym)
+      Asm->EmitLabelReference(Entries[i].second->first,
+                              Asm->getDataLayout().getPointerSize());
+    else
+      Asm->OutStreamer.EmitIntValue(0, Asm->getDataLayout().getPointerSize());
+  }
+
+}
+
 // Emit visible names into a debug str section.
 void DwarfDebug::emitDebugStr() {
   DwarfUnits &Holder = useSplitDwarf() ? SkeletonHolder : InfoHolder;
@@ -2199,12 +2344,12 @@ void DwarfDebug::emitDebugLoc() {
     DotDebugLocEntry &Entry = *I;
     if (Entry.isMerged()) continue;
     if (Entry.isEmpty()) {
-      Asm->OutStreamer.EmitIntValue(0, Size, /*addrspace*/0);
-      Asm->OutStreamer.EmitIntValue(0, Size, /*addrspace*/0);
+      Asm->OutStreamer.EmitIntValue(0, Size);
+      Asm->OutStreamer.EmitIntValue(0, Size);
       Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("debug_loc", index));
     } else {
-      Asm->OutStreamer.EmitSymbolValue(Entry.Begin, Size, 0);
-      Asm->OutStreamer.EmitSymbolValue(Entry.End, Size, 0);
+      Asm->OutStreamer.EmitSymbolValue(Entry.Begin, Size);
+      Asm->OutStreamer.EmitSymbolValue(Entry.End, Size);
       DIVariable DV(Entry.Variable);
       Asm->OutStreamer.AddComment("Loc expr size");
       MCSymbol *begin = Asm->OutStreamer.getContext().CreateTempSymbol();
@@ -2290,9 +2435,9 @@ void DwarfDebug::emitDebugRanges() {
          I = DebugRangeSymbols.begin(), E = DebugRangeSymbols.end();
        I != E; ++I) {
     if (*I)
-      Asm->OutStreamer.EmitSymbolValue(const_cast<MCSymbol*>(*I), Size, 0);
+      Asm->OutStreamer.EmitSymbolValue(const_cast<MCSymbol*>(*I), Size);
     else
-      Asm->OutStreamer.EmitIntValue(0, Size, /*addrspace*/0);
+      Asm->OutStreamer.EmitIntValue(0, Size);
   }
 }
 
@@ -2376,7 +2521,7 @@ void DwarfDebug::emitDebugInlineInfo() {
       if (Asm->isVerbose()) Asm->OutStreamer.AddComment("low_pc");
       Asm->OutStreamer.EmitSymbolValue(LI->first,
-                                       Asm->getDataLayout().getPointerSize(),0);
+                                       Asm->getDataLayout().getPointerSize());
     }
   }
 
@@ -2391,68 +2536,44 @@
 // DW_AT_low_pc and DW_AT_high_pc are not used, and vice versa.
 CompileUnit *DwarfDebug::constructSkeletonCU(const MDNode *N) {
   DICompileUnit DIUnit(N);
-  StringRef FN = DIUnit.getFilename();
   CompilationDir = DIUnit.getDirectory();
 
   DIE *Die = new DIE(dwarf::DW_TAG_compile_unit);
   CompileUnit *NewCU = new CompileUnit(GlobalCUIndexCount++,
                                        DIUnit.getLanguage(), Die, Asm,
                                        this, &SkeletonHolder);
-  // FIXME: This should be the .dwo file.
-  NewCU->addLocalString(Die, dwarf::DW_AT_GNU_dwo_name, FN);
-  // FIXME: We also need DW_AT_addr_base and DW_AT_dwo_id.
+  NewCU->addLocalString(Die, dwarf::DW_AT_GNU_dwo_name,
+                        DIUnit.getSplitDebugFilename());
+
+  // This should be a unique identifier when we want to build .dwp files.
+  NewCU->addUInt(Die, dwarf::DW_AT_GNU_dwo_id, dwarf::DW_FORM_data8, 0);
+
+  // FIXME: The addr base should be relative for each compile unit, however,
+  // this one is going to be 0 anyhow.
+  NewCU->addUInt(Die, dwarf::DW_AT_GNU_addr_base, dwarf::DW_FORM_sec_offset, 0);
 
   // 2.17.1 requires that we use DW_AT_low_pc for a single entry point
-  // into an entity.
+  // into an entity. We're using 0, or a NULL label for this.
   NewCU->addUInt(Die, dwarf::DW_AT_low_pc, dwarf::DW_FORM_addr, 0);
+
   // DW_AT_stmt_list is an offset of line number information for this
   // compile unit in debug_line section.
if (Asm->MAI->doesDwarfUseRelocationsAcrossSections()) - NewCU->addLabel(Die, dwarf::DW_AT_stmt_list, dwarf::DW_FORM_data4, - Asm->GetTempSymbol("section_line")); + NewCU->addLabel(Die, dwarf::DW_AT_stmt_list, dwarf::DW_FORM_sec_offset, + DwarfLineSectionSym); else - NewCU->addUInt(Die, dwarf::DW_AT_stmt_list, dwarf::DW_FORM_data4, 0); + NewCU->addUInt(Die, dwarf::DW_AT_stmt_list, dwarf::DW_FORM_sec_offset, 0); if (!CompilationDir.empty()) NewCU->addLocalString(Die, dwarf::DW_AT_comp_dir, CompilationDir); SkeletonHolder.addUnit(NewCU); + SkeletonCUs.push_back(NewCU); return NewCU; } -void DwarfDebug::emitSkeletonCU(const MCSection *Section) { - Asm->OutStreamer.SwitchSection(Section); - DIE *Die = SkeletonCU->getCUDie(); - - // Emit the compile units header. - Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol(Section->getLabelBeginName(), - SkeletonCU->getUniqueID())); - - // Emit size of content not including length itself - unsigned ContentSize = Die->getSize() + - sizeof(int16_t) + // DWARF version number - sizeof(int32_t) + // Offset Into Abbrev. Section - sizeof(int8_t); // Pointer Size (in bytes) - - Asm->OutStreamer.AddComment("Length of Compilation Unit Info"); - Asm->EmitInt32(ContentSize); - Asm->OutStreamer.AddComment("DWARF version number"); - Asm->EmitInt16(dwarf::DWARF_VERSION); - Asm->OutStreamer.AddComment("Offset Into Abbrev. Section"); - - const MCSection *ASec = Asm->getObjFileLowering().getDwarfAbbrevSection(); - Asm->EmitSectionOffset(Asm->GetTempSymbol(ASec->getLabelBeginName()), - DwarfAbbrevSectionSym); - Asm->OutStreamer.AddComment("Address Size (in bytes)"); - Asm->EmitInt8(Asm->getDataLayout().getPointerSize()); - - emitDIE(Die, &SkeletonAbbrevs); - Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol(Section->getLabelEndName(), - SkeletonCU->getUniqueID())); -} - void DwarfDebug::emitSkeletonAbbrevs(const MCSection *Section) { assert(useSplitDwarf() && "No split dwarf debug info?"); emitAbbrevs(Section, &SkeletonAbbrevs); diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.h b/lib/CodeGen/AsmPrinter/DwarfDebug.h index 1e471f7..7b56815 100644 --- a/lib/CodeGen/AsmPrinter/DwarfDebug.h +++ b/lib/CodeGen/AsmPrinter/DwarfDebug.h @@ -195,6 +195,10 @@ public: typedef StringMap<std::pair<MCSymbol*, unsigned>, BumpPtrAllocator&> StrPool; +// A Symbol->pair<Symbol, unsigned> mapping of addresses used by indirect +// references. +typedef DenseMap<MCSymbol *, std::pair<MCSymbol *, unsigned> > AddrPool; + /// \brief Collects and handles information specific to a particular /// collection of units. class DwarfUnits { @@ -215,12 +219,17 @@ class DwarfUnits { unsigned NextStringPoolNumber; std::string StringPref; + // Collection of addresses for this unit and assorted labels. + AddrPool AddressPool; + unsigned NextAddrPoolNumber; + public: DwarfUnits(AsmPrinter *AP, FoldingSet<DIEAbbrev> *AS, std::vector<DIEAbbrev *> *A, const char *Pref, BumpPtrAllocator &DA) : Asm(AP), AbbreviationsSet(AS), Abbreviations(A), - StringPool(DA), NextStringPoolNumber(0), StringPref(Pref) {} + StringPool(DA), NextStringPoolNumber(0), StringPref(Pref), + AddressPool(), NextAddrPoolNumber(0) {} /// \brief Compute the size and offset of a DIE given an incoming Offset. unsigned computeSizeAndOffset(DIE *Die, unsigned Offset); @@ -242,6 +251,9 @@ public: /// \brief Emit all of the strings to the section given. void emitStrings(const MCSection *, const MCSection *, const MCSymbol *); + /// \brief Emit all of the addresses to the section given. 
+ void emitAddresses(const MCSection *); + /// \brief Returns the entry into the start of the pool. MCSymbol *getStringPoolSym(); @@ -255,6 +267,13 @@ public: /// \brief Returns the string pool. StrPool *getStringPool() { return &StringPool; } + + /// \brief Returns the index into the address pool with the given + /// label/symbol. + unsigned getAddrPoolIndex(MCSymbol *); + + /// \brief Returns the address pool. + AddrPool *getAddrPool() { return &AddressPool; } }; /// \brief Collects and handles dwarf debug information. @@ -367,7 +386,7 @@ class DwarfDebug { // section offsets and are created by EmitSectionLabels. MCSymbol *DwarfInfoSectionSym, *DwarfAbbrevSectionSym; MCSymbol *DwarfStrSectionSym, *TextSectionSym, *DwarfDebugRangeSectionSym; - MCSymbol *DwarfDebugLocSectionSym; + MCSymbol *DwarfDebugLocSectionSym, *DwarfLineSectionSym; MCSymbol *FunctionBeginSym, *FunctionEndSym; MCSymbol *DwarfAbbrevDWOSectionSym, *DwarfStrDWOSectionSym; @@ -396,8 +415,8 @@ class DwarfDebug { // original object file, rather than things that are meant // to be in the .dwo sections. - // The CU left in the original object file for separated debug info. - CompileUnit *SkeletonCU; + // The CUs left in the original object file for separated debug info. + SmallVector<CompileUnit *, 1> SkeletonCUs; // Used to uniquely define abbreviations for the skeleton emission. FoldingSet<DIEAbbrev> SkeletonAbbrevSet; @@ -481,6 +500,9 @@ private: /// \brief Emit type dies into a hashed accelerator table. void emitAccelTypes(); + /// \brief Emit visible names into a debug pubnames section. + void emitDebugPubnames(); + /// \brief Emit visible types into a debug pubtypes section. void emitDebugPubTypes(); @@ -508,9 +530,6 @@ private: /// section. CompileUnit *constructSkeletonCU(const MDNode *); - /// \brief Emit the local split debug info section. - void emitSkeletonCU(const MCSection *); - /// \brief Emit the local split abbreviations. void emitSkeletonAbbrevs(const MCSection *); @@ -560,7 +579,7 @@ private: } /// \brief Return Label preceding the instruction. - const MCSymbol *getLabelBeforeInsn(const MachineInstr *MI); + MCSymbol *getLabelBeforeInsn(const MachineInstr *MI); /// \brief Ensure that a label will be emitted after MI. void requestLabelAfterInsn(const MachineInstr *MI) { @@ -568,7 +587,7 @@ private: } /// \brief Return Label immediately following the instruction. 
- const MCSymbol *getLabelAfterInsn(const MachineInstr *MI); + MCSymbol *getLabelAfterInsn(const MachineInstr *MI); public: //===--------------------------------------------------------------------===// diff --git a/lib/CodeGen/AsmPrinter/DwarfException.cpp b/lib/CodeGen/AsmPrinter/DwarfException.cpp index 975bb94..7133458 100644 --- a/lib/CodeGen/AsmPrinter/DwarfException.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfException.cpp @@ -33,7 +33,6 @@ #include "llvm/Target/Mangler.h" #include "llvm/Target/TargetFrameLowering.h" #include "llvm/Target/TargetLoweringObjectFile.h" -#include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" #include "llvm/Target/TargetRegisterInfo.h" using namespace llvm; @@ -608,7 +607,7 @@ void DwarfException::EmitExceptionTable() { if (!S.PadLabel) { if (VerboseAsm) Asm->OutStreamer.AddComment(" has no landing pad"); - Asm->OutStreamer.EmitIntValue(0, 4/*size*/, 0/*addrspace*/); + Asm->OutStreamer.EmitIntValue(0, 4/*size*/); } else { if (VerboseAsm) Asm->OutStreamer.AddComment(Twine(" jumps to ") + diff --git a/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp b/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp index b802853..98177c0 100644 --- a/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp +++ b/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp @@ -100,7 +100,7 @@ void OcamlGCMetadataPrinter::finishAssembly(AsmPrinter &AP) { EmitCamlGlobal(getModule(), AP, "data_end"); // FIXME: Why does ocaml emit this?? - AP.OutStreamer.EmitIntValue(0, IntPtrSize, 0); + AP.OutStreamer.EmitIntValue(0, IntPtrSize); AP.OutStreamer.SwitchSection(AP.getObjFileLowering().getDataSection()); EmitCamlGlobal(getModule(), AP, "frametable"); @@ -145,7 +145,7 @@ void OcamlGCMetadataPrinter::finishAssembly(AsmPrinter &AP) { "Live root count "+Twine(LiveCount)+" >= 65536."); } - AP.OutStreamer.EmitSymbolValue(J->Label, IntPtrSize, 0); + AP.OutStreamer.EmitSymbolValue(J->Label, IntPtrSize); AP.EmitInt16(FrameSize); AP.EmitInt16(LiveCount); diff --git a/lib/CodeGen/AsmPrinter/Win64Exception.cpp b/lib/CodeGen/AsmPrinter/Win64Exception.cpp index cb25674..1561012 100644 --- a/lib/CodeGen/AsmPrinter/Win64Exception.cpp +++ b/lib/CodeGen/AsmPrinter/Win64Exception.cpp @@ -33,7 +33,6 @@ #include "llvm/Target/Mangler.h" #include "llvm/Target/TargetFrameLowering.h" #include "llvm/Target/TargetLoweringObjectFile.h" -#include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" #include "llvm/Target/TargetRegisterInfo.h" using namespace llvm; diff --git a/lib/CodeGen/BasicTargetTransformInfo.cpp b/lib/CodeGen/BasicTargetTransformInfo.cpp index c27e081..e8b5b4f 100644 --- a/lib/CodeGen/BasicTargetTransformInfo.cpp +++ b/lib/CodeGen/BasicTargetTransformInfo.cpp @@ -26,7 +26,7 @@ using namespace llvm; namespace { class BasicTTI : public ImmutablePass, public TargetTransformInfo { - const TargetLowering *TLI; + const TargetLoweringBase *TLI; /// Estimate the overhead of scalarizing an instruction. Insert and Extract /// are set if the result needs to be inserted and/or extracted from vectors. 
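Among the BasicTTI changes in the hunks below, the functional one is in getCastInstrCost(): casts whose source and destination legalize to the same machine type are now modelled as free before any scalar/vector special-casing, and anything legal-or-promoted costs a single instruction. A standalone sketch of those fast paths (invented helper types and placeholder costs, not the LLVM API):

    #include <cstdio>

    // Stand-in for TLI->getTypeLegalizationCost() results: legalization
    // step count and legalized width in bits.
    struct LegalCost { unsigned First; unsigned Bits; };

    enum Cast { BitCast, Trunc, ZExt };

    unsigned castCost(Cast Op, LegalCost Src, LegalCost Dst,
                      bool TruncIsFree, bool ZExtIsFree, bool LegalOrPromote) {
      // No-op conversion: both sides legalize to the same-sized type.
      if (Src.First == Dst.First && Src.Bits == Dst.Bits &&
          (Op == BitCast || Op == Trunc))
        return 0;
      if (Op == Trunc && TruncIsFree)
        return 0;
      if (Op == ZExt && ZExtIsFree)
        return 0;
      if (LegalOrPromote)
        return 1;                  // one machine instruction
      return 4;                    // placeholder "needs expansion" cost
    }

    int main() {
      LegalCost I64 = {1, 64}, I32 = {1, 32};
      // e.g. trunc i64 -> i32 on a 64-bit target that truncates for free:
      printf("cost = %u\n", castCost(Trunc, I64, I32, true, false, true));
    }

With the no-op check hoisted ahead of the scalar and vector special cases, a truncate the target folds away is costed at 0 instead of falling through to the generic handling.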
@@ -37,7 +37,7 @@ public: llvm_unreachable("This pass cannot be directly constructed"); } - BasicTTI(const TargetLowering *TLI) : ImmutablePass(ID), TLI(TLI) { + BasicTTI(const TargetLoweringBase *TLI) : ImmutablePass(ID), TLI(TLI) { initializeBasicTTIPass(*PassRegistry::getPassRegistry()); } @@ -83,6 +83,8 @@ public: /// @{ virtual unsigned getNumberOfRegisters(bool Vector) const; + virtual unsigned getMaximumUnrollFactor() const; + virtual unsigned getRegisterBitWidth(bool Vector) const; virtual unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty) const; virtual unsigned getShuffleCost(ShuffleKind Kind, Type *Tp, int Index, Type *SubTp) const; @@ -99,6 +101,7 @@ public: virtual unsigned getIntrinsicInstrCost(Intrinsic::ID, Type *RetTy, ArrayRef<Type*> Tys) const; virtual unsigned getNumberOfParts(Type *Tp) const; + virtual unsigned getAddressComputationCost(Type *Ty) const; /// @} }; @@ -110,7 +113,7 @@ INITIALIZE_AG_PASS(BasicTTI, TargetTransformInfo, "basictti", char BasicTTI::ID = 0; ImmutablePass * -llvm::createBasicTargetTransformInfoPass(const TargetLowering *TLI) { +llvm::createBasicTargetTransformInfoPass(const TargetLoweringBase *TLI) { return new BasicTTI(TLI); } @@ -126,7 +129,7 @@ bool BasicTTI::isLegalICmpImmediate(int64_t imm) const { bool BasicTTI::isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale) const { - TargetLowering::AddrMode AM; + TargetLoweringBase::AddrMode AM; AM.BaseGV = BaseGV; AM.BaseOffs = BaseOffset; AM.HasBaseReg = HasBaseReg; @@ -182,6 +185,14 @@ unsigned BasicTTI::getNumberOfRegisters(bool Vector) const { return 1; } +unsigned BasicTTI::getRegisterBitWidth(bool Vector) const { + return 32; +} + +unsigned BasicTTI::getMaximumUnrollFactor() const { + return 1; +} + unsigned BasicTTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty) const { // Check if any of the operands are vector operands. int ISD = TLI->InstructionOpcodeToISD(Opcode); @@ -231,6 +242,27 @@ unsigned BasicTTI::getCastInstrCost(unsigned Opcode, Type *Dst, std::pair<unsigned, MVT> SrcLT = TLI->getTypeLegalizationCost(Src); std::pair<unsigned, MVT> DstLT = TLI->getTypeLegalizationCost(Dst); + // Check for NOOP conversions. + if (SrcLT.first == DstLT.first && + SrcLT.second.getSizeInBits() == DstLT.second.getSizeInBits()) { + + // Bitcast between types that are legalized to the same type are free. + if (Opcode == Instruction::BitCast || Opcode == Instruction::Trunc) + return 0; + } + + if (Opcode == Instruction::Trunc && + TLI->isTruncateFree(SrcLT.second, DstLT.second)) + return 0; + + if (Opcode == Instruction::ZExt && + TLI->isZExtFree(SrcLT.second, DstLT.second)) + return 0; + + // If the cast is marked as legal (or promote) then assume low cost. + if (TLI->isOperationLegalOrPromote(ISD, DstLT.second)) + return 1; + // Handle scalar conversions. if (!Src->isVectorTy() && !Dst->isVectorTy()) { @@ -238,14 +270,6 @@ unsigned BasicTTI::getCastInstrCost(unsigned Opcode, Type *Dst, if (Opcode == Instruction::BitCast) return 0; - if (Opcode == Instruction::Trunc && - TLI->isTruncateFree(SrcLT.second, DstLT.second)) - return 0; - - if (Opcode == Instruction::ZExt && - TLI->isZExtFree(SrcLT.second, DstLT.second)) - return 0; - // Just check the op cost. If the operation is legal then assume it costs 1. 
if (!TLI->isOperationExpand(ISD, DstLT.second)) return 1; @@ -261,10 +285,6 @@ unsigned BasicTTI::getCastInstrCost(unsigned Opcode, Type *Dst, if (SrcLT.first == DstLT.first && SrcLT.second.getSizeInBits() == DstLT.second.getSizeInBits()) { - // Bitcast between types that are legalized to the same type are free. - if (Opcode == Instruction::BitCast || Opcode == Instruction::Trunc) - return 0; - // Assume that Zext is done using AND. if (Opcode == Instruction::ZExt) return 1; @@ -381,3 +401,7 @@ unsigned BasicTTI::getNumberOfParts(Type *Tp) const { std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp); return LT.first; } + +unsigned BasicTTI::getAddressComputationCost(Type *Ty) const { + return 0; +} diff --git a/lib/CodeGen/CMakeLists.txt b/lib/CodeGen/CMakeLists.txt index d5f3932..ddc7ada 100644 --- a/lib/CodeGen/CMakeLists.txt +++ b/lib/CodeGen/CMakeLists.txt @@ -9,8 +9,8 @@ add_llvm_library(LLVMCodeGen CodeGen.cpp CodePlacementOpt.cpp CriticalAntiDepBreaker.cpp - DeadMachineInstructionElim.cpp DFAPacketizer.cpp + DeadMachineInstructionElim.cpp DwarfEHPrepare.cpp EarlyIfConversion.cpp EdgeBundles.cpp @@ -32,21 +32,20 @@ add_llvm_library(LLVMCodeGen LiveInterval.cpp LiveIntervalAnalysis.cpp LiveIntervalUnion.cpp + LiveRangeCalc.cpp + LiveRangeEdit.cpp LiveRegMatrix.cpp LiveStackAnalysis.cpp LiveVariables.cpp - LiveRangeCalc.cpp - LiveRangeEdit.cpp LocalStackSlotAllocation.cpp MachineBasicBlock.cpp MachineBlockFrequencyInfo.cpp MachineBlockPlacement.cpp MachineBranchProbabilityInfo.cpp + MachineCSE.cpp MachineCodeEmitter.cpp MachineCopyPropagation.cpp - MachineCSE.cpp MachineDominators.cpp - MachinePostDominators.cpp MachineFunction.cpp MachineFunctionAnalysis.cpp MachineFunctionPass.cpp @@ -58,6 +57,7 @@ add_llvm_library(LLVMCodeGen MachineModuleInfo.cpp MachineModuleInfoImpls.cpp MachinePassRegistry.cpp + MachinePostDominators.cpp MachineRegisterInfo.cpp MachineSSAUpdater.cpp MachineScheduler.cpp @@ -91,16 +91,17 @@ add_llvm_library(LLVMCodeGen ShrinkWrapping.cpp SjLjEHPrepare.cpp SlotIndexes.cpp - Spiller.cpp SpillPlacement.cpp + Spiller.cpp SplitKit.cpp + StackColoring.cpp StackProtector.cpp StackSlotColoring.cpp - StackColoring.cpp StrongPHIElimination.cpp TailDuplication.cpp TargetFrameLoweringImpl.cpp TargetInstrInfo.cpp + TargetLoweringBase.cpp TargetLoweringObjectFileImpl.cpp TargetOptionsImpl.cpp TargetRegisterInfo.cpp diff --git a/lib/CodeGen/CriticalAntiDepBreaker.cpp b/lib/CodeGen/CriticalAntiDepBreaker.cpp index 48105d9..0eb74a4 100644 --- a/lib/CodeGen/CriticalAntiDepBreaker.cpp +++ b/lib/CodeGen/CriticalAntiDepBreaker.cpp @@ -57,23 +57,7 @@ void CriticalAntiDepBreaker::StartBlock(MachineBasicBlock *BB) { bool IsReturnBlock = (BBSize != 0 && BB->back().isReturn()); - // Determine the live-out physregs for this block. - if (IsReturnBlock) { - // In a return block, examine the function live-out regs. - for (MachineRegisterInfo::liveout_iterator I = MRI.liveout_begin(), - E = MRI.liveout_end(); I != E; ++I) { - for (MCRegAliasIterator AI(*I, TRI, true); AI.isValid(); ++AI) { - unsigned Reg = *AI; - Classes[Reg] = reinterpret_cast<TargetRegisterClass *>(-1); - KillIndices[Reg] = BBSize; - DefIndices[Reg] = ~0u; - } - } - } - - // In a non-return block, examine the live-in regs of all successors. - // Note a return block can have successors if the return instruction is - // predicated. + // Examine the live-in regs of all successors. 
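// (The special case that walked MachineRegisterInfo's function live-out
// list for return blocks is deleted here; successor live-ins are now the
// only source. A return block can still have successors when the return
// instruction is predicated.)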
for (MachineBasicBlock::succ_iterator SI = BB->succ_begin(), SE = BB->succ_end(); SI != SE; ++SI) for (MachineBasicBlock::livein_iterator I = (*SI)->livein_begin(), @@ -371,12 +355,13 @@ CriticalAntiDepBreaker::isNewRegClobberedByRefs(RegRefIter RegRefBegin, return false; } -unsigned -CriticalAntiDepBreaker::findSuitableFreeRegister(RegRefIter RegRefBegin, - RegRefIter RegRefEnd, - unsigned AntiDepReg, - unsigned LastNewReg, - const TargetRegisterClass *RC) +unsigned CriticalAntiDepBreaker:: +findSuitableFreeRegister(RegRefIter RegRefBegin, + RegRefIter RegRefEnd, + unsigned AntiDepReg, + unsigned LastNewReg, + const TargetRegisterClass *RC, + SmallVector<unsigned, 2> &Forbid) { ArrayRef<MCPhysReg> Order = RegClassInfo.getOrder(RC); for (unsigned i = 0; i != Order.size(); ++i) { @@ -401,6 +386,15 @@ CriticalAntiDepBreaker::findSuitableFreeRegister(RegRefIter RegRefBegin, Classes[NewReg] == reinterpret_cast<TargetRegisterClass *>(-1) || KillIndices[AntiDepReg] > DefIndices[NewReg]) continue; + // If NewReg overlaps any of the forbidden registers, we can't use it. + bool Forbidden = false; + for (SmallVector<unsigned, 2>::iterator it = Forbid.begin(), + ite = Forbid.end(); it != ite; ++it) + if (TRI->regsOverlap(NewReg, *it)) { + Forbidden = true; + break; + } + if (Forbidden) continue; return NewReg; } @@ -564,6 +558,8 @@ BreakAntiDependencies(const std::vector<SUnit>& SUnits, PrescanInstruction(MI); + SmallVector<unsigned, 2> ForbidRegs; + // If MI's defs have a special allocation requirement, don't allow // any def registers to be changed. Also assume all registers // defined in a call must not be changed (ABI). @@ -574,7 +570,9 @@ BreakAntiDependencies(const std::vector<SUnit>& SUnits, AntiDepReg = 0; else if (AntiDepReg) { // If this instruction has a use of AntiDepReg, breaking it - // is invalid. + // is invalid. If the instruction defines other registers, + // save a list of them so that we don't pick a new register + // that overlaps any of them. for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { MachineOperand &MO = MI->getOperand(i); if (!MO.isReg()) continue; @@ -584,6 +582,8 @@ BreakAntiDependencies(const std::vector<SUnit>& SUnits, AntiDepReg = 0; break; } + if (MO.isDef() && Reg != AntiDepReg) + ForbidRegs.push_back(Reg); } } @@ -606,7 +606,7 @@ BreakAntiDependencies(const std::vector<SUnit>& SUnits, if (unsigned NewReg = findSuitableFreeRegister(Range.first, Range.second, AntiDepReg, LastNewReg[AntiDepReg], - RC)) { + RC, ForbidRegs)) { DEBUG(dbgs() << "Breaking anti-dependence edge on " << TRI->getName(AntiDepReg) << " with " << RegRefs.count(AntiDepReg) << " references" diff --git a/lib/CodeGen/CriticalAntiDepBreaker.h b/lib/CodeGen/CriticalAntiDepBreaker.h index 8fb2b0e..df13dd3 100644 --- a/lib/CodeGen/CriticalAntiDepBreaker.h +++ b/lib/CodeGen/CriticalAntiDepBreaker.h @@ -102,7 +102,8 @@ class TargetRegisterInfo; RegRefIter RegRefEnd, unsigned AntiDepReg, unsigned LastNewReg, - const TargetRegisterClass *RC); + const TargetRegisterClass *RC, + SmallVector<unsigned, 2> &Forbid); }; } diff --git a/lib/CodeGen/DeadMachineInstructionElim.cpp b/lib/CodeGen/DeadMachineInstructionElim.cpp index a526d24..a54217f 100644 --- a/lib/CodeGen/DeadMachineInstructionElim.cpp +++ b/lib/CodeGen/DeadMachineInstructionElim.cpp @@ -99,15 +99,6 @@ bool DeadMachineInstructionElim::runOnMachineFunction(MachineFunction &MF) { // Start out assuming that reserved registers are live out of this block. 
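// (Reserved registers such as the stack pointer never appear in live-in
// lists, so they are conservatively assumed live throughout the block.)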
LivePhysRegs = MRI->getReservedRegs(); - // Also add any explicit live-out physregs for this block. - if (!MBB->empty() && MBB->back().isReturn()) - for (MachineRegisterInfo::liveout_iterator LOI = MRI->liveout_begin(), - LOE = MRI->liveout_end(); LOI != LOE; ++LOI) { - unsigned Reg = *LOI; - if (TargetRegisterInfo::isPhysicalRegister(Reg)) - LivePhysRegs.set(Reg); - } - // Add live-ins from successors to LivePhysRegs. Normally, physregs are not // live across blocks, but some targets (x86) can have flags live out of a // block. diff --git a/lib/CodeGen/DwarfEHPrepare.cpp b/lib/CodeGen/DwarfEHPrepare.cpp index 4cafa96..f27ec77 100644 --- a/lib/CodeGen/DwarfEHPrepare.cpp +++ b/lib/CodeGen/DwarfEHPrepare.cpp @@ -33,7 +33,7 @@ STATISTIC(NumResumesLowered, "Number of resume calls lowered"); namespace { class DwarfEHPrepare : public FunctionPass { const TargetMachine *TM; - const TargetLowering *TLI; + const TargetLoweringBase *TLI; // RewindFunction - _Unwind_Resume or the target equivalent. Constant *RewindFunction; diff --git a/lib/CodeGen/EarlyIfConversion.cpp b/lib/CodeGen/EarlyIfConversion.cpp index f332925..fac207e 100644 --- a/lib/CodeGen/EarlyIfConversion.cpp +++ b/lib/CodeGen/EarlyIfConversion.cpp @@ -17,8 +17,6 @@ //===----------------------------------------------------------------------===// #define DEBUG_TYPE "early-ifcvt" -#include "llvm/CodeGen/Passes.h" -#include "MachineTraceMetrics.h" #include "llvm/ADT/BitVector.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/SetVector.h" @@ -31,6 +29,8 @@ #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineTraceMetrics.h" +#include "llvm/CodeGen/Passes.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" diff --git a/lib/CodeGen/ExpandPostRAPseudos.cpp b/lib/CodeGen/ExpandPostRAPseudos.cpp index 0b9e83d..1611db8 100644 --- a/lib/CodeGen/ExpandPostRAPseudos.cpp +++ b/lib/CodeGen/ExpandPostRAPseudos.cpp @@ -49,8 +49,6 @@ private: bool LowerSubregToReg(MachineInstr *MI); bool LowerCopy(MachineInstr *MI); - void TransferDeadFlag(MachineInstr *MI, unsigned DstReg, - const TargetRegisterInfo *TRI); void TransferImplicitDefs(MachineInstr *MI); }; } // end anonymous namespace @@ -61,21 +59,6 @@ char &llvm::ExpandPostRAPseudosID = ExpandPostRA::ID; INITIALIZE_PASS(ExpandPostRA, "postrapseudos", "Post-RA pseudo instruction expansion pass", false, false) -/// TransferDeadFlag - MI is a pseudo-instruction with DstReg dead, -/// and the lowered replacement instructions immediately precede it. -/// Mark the replacement instructions with the dead flag. -void -ExpandPostRA::TransferDeadFlag(MachineInstr *MI, unsigned DstReg, - const TargetRegisterInfo *TRI) { - for (MachineBasicBlock::iterator MII = - prior(MachineBasicBlock::iterator(MI)); ; --MII) { - if (MII->addRegisterDead(DstReg, TRI)) - break; - assert(MII != MI->getParent()->begin() && - "copyPhysReg output doesn't reference destination register!"); - } -} - /// TransferImplicitDefs - MI is a pseudo-instruction, and the lowered /// replacement instructions immediately precede it. Copy any implicit-def /// operands from MI to the replacement instruction.
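The hunks that follow replace the deleted TransferDeadFlag bookkeeping with a simpler rule: a pseudo whose defs are all dead is not expanded at all, but degraded to a KILL so that only its liveness side effects remain. A condensed sketch of that early-out (the helper name is hypothetical; the calls are the same ones the patch itself adds):

    #include "llvm/CodeGen/MachineInstr.h"
    #include "llvm/Target/TargetInstrInfo.h"
    #include "llvm/Target/TargetOpcodes.h"
    using namespace llvm;

    // Condensed form of the check both lowerings now perform first.
    static bool lowerIfFullyDead(MachineInstr *MI, const TargetInstrInfo *TII) {
      if (!MI->allDefsAreDead())
        return false;
      // Nothing reads the result; keep the instruction only for its effect
      // on liveness by rewriting it into a KILL.
      MI->setDesc(TII->get(TargetOpcode::KILL));
      return true;
    }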
@@ -114,6 +97,12 @@ bool ExpandPostRA::LowerSubregToReg(MachineInstr *MI) { DEBUG(dbgs() << "subreg: CONVERTING: " << *MI); + if (MI->allDefsAreDead()) { + MI->setDesc(TII->get(TargetOpcode::KILL)); + DEBUG(dbgs() << "subreg: replaced by: " << *MI); + return true; + } + if (DstSubReg == InsReg) { // No need to insert an identity copy instruction. // Watch out for case like this: @@ -135,10 +124,6 @@ bool ExpandPostRA::LowerSubregToReg(MachineInstr *MI) { MachineBasicBlock::iterator CopyMI = MI; --CopyMI; CopyMI->addRegisterDefined(DstReg); - - // Transfer the kill/dead flags, if needed. - if (MI->getOperand(0).isDead()) - TransferDeadFlag(MI, DstSubReg, TRI); DEBUG(dbgs() << "subreg: " << *CopyMI); } @@ -148,6 +133,14 @@ bool ExpandPostRA::LowerSubregToReg(MachineInstr *MI) { } bool ExpandPostRA::LowerCopy(MachineInstr *MI) { + + if (MI->allDefsAreDead()) { + DEBUG(dbgs() << "dead copy: " << *MI); + MI->setDesc(TII->get(TargetOpcode::KILL)); + DEBUG(dbgs() << "replaced by: " << *MI); + return true; + } + MachineOperand &DstMO = MI->getOperand(0); MachineOperand &SrcMO = MI->getOperand(1); @@ -155,7 +148,7 @@ bool ExpandPostRA::LowerCopy(MachineInstr *MI) { DEBUG(dbgs() << "identity copy: " << *MI); // No need to insert an identity copy instruction, but replace with a KILL // if liveness is changed. - if (DstMO.isDead() || SrcMO.isUndef() || MI->getNumOperands() > 2) { + if (SrcMO.isUndef() || MI->getNumOperands() > 2) { // We must make sure the super-register gets killed. Replace the // instruction with KILL. MI->setDesc(TII->get(TargetOpcode::KILL)); @@ -171,8 +164,6 @@ bool ExpandPostRA::LowerCopy(MachineInstr *MI) { TII->copyPhysReg(*MI->getParent(), MI, MI->getDebugLoc(), DstMO.getReg(), SrcMO.getReg(), SrcMO.isKill()); - if (DstMO.isDead()) - TransferDeadFlag(MI, DstMO.getReg(), TRI); if (MI->getNumOperands() > 2) TransferImplicitDefs(MI); DEBUG({ diff --git a/lib/CodeGen/GCMetadata.cpp b/lib/CodeGen/GCMetadata.cpp index a6a06e4..ef5247c 100644 --- a/lib/CodeGen/GCMetadata.cpp +++ b/lib/CodeGen/GCMetadata.cpp @@ -37,21 +37,9 @@ namespace { void getAnalysisUsage(AnalysisUsage &AU) const; bool runOnFunction(Function &F); - }; - - class Deleter : public FunctionPass { - static char ID; - - public: - Deleter(); - - const char *getPassName() const; - void getAnalysisUsage(AnalysisUsage &AU) const; - - bool runOnFunction(Function &F); bool doFinalization(Module &M); }; - + } INITIALIZE_PASS(GCModuleInfo, "collector-metadata", @@ -182,32 +170,9 @@ bool Printer::runOnFunction(Function &F) { return false; } -// ----------------------------------------------------------------------------- - -char Deleter::ID = 0; - -FunctionPass *llvm::createGCInfoDeleter() { - return new Deleter(); -} - -Deleter::Deleter() : FunctionPass(ID) {} - -const char *Deleter::getPassName() const { - return "Delete Garbage Collector Information"; -} - -void Deleter::getAnalysisUsage(AnalysisUsage &AU) const { - AU.setPreservesAll(); - AU.addRequired<GCModuleInfo>(); -} - -bool Deleter::runOnFunction(Function &MF) { - return false; -} - -bool Deleter::doFinalization(Module &M) { +bool Printer::doFinalization(Module &M) { GCModuleInfo *GMI = getAnalysisIfAvailable<GCModuleInfo>(); - assert(GMI && "Deleter didn't require GCModuleInfo?!"); + assert(GMI && "Printer didn't require GCModuleInfo?!"); GMI->clear(); return false; } diff --git a/lib/CodeGen/IfConversion.cpp b/lib/CodeGen/IfConversion.cpp index 8906991..9958d7d 100644 --- a/lib/CodeGen/IfConversion.cpp +++ b/lib/CodeGen/IfConversion.cpp @@ -151,7 +151,7 @@
namespace { /// basic block number. std::vector<BBInfo> BBAnalysis; - const TargetLowering *TLI; + const TargetLoweringBase *TLI; const TargetInstrInfo *TII; const TargetRegisterInfo *TRI; const InstrItineraryData *InstrItins; @@ -1557,7 +1557,7 @@ void IfConverter::MergeBlocks(BBInfo &ToBBI, BBInfo &FromBBI, bool AddEdges) { if (Succ == FallThrough) continue; FromBBI.BB->removeSuccessor(Succ); - if (AddEdges) + if (AddEdges && !ToBBI.BB->isSuccessor(Succ)) ToBBI.BB->addSuccessor(Succ); } diff --git a/lib/CodeGen/IntrinsicLowering.cpp b/lib/CodeGen/IntrinsicLowering.cpp index 16e7968..07f0ccf 100644 --- a/lib/CodeGen/IntrinsicLowering.cpp +++ b/lib/CodeGen/IntrinsicLowering.cpp @@ -413,30 +413,22 @@ void IntrinsicLowering::LowerIntrinsicCall(CallInst *CI) { } case Intrinsic::stacksave: + case Intrinsic::stackrestore: { if (!Warned) - Context.emitWarning("this target does not support the " - "llvm.stacksave intrinsic"); - Warned = true; - CI->replaceAllUsesWith(Constant::getNullValue(CI->getType())); - break; - - case Intrinsic::stackrestore: - if (!Warned) - Context.emitWarning("this target does not support the " - "llvm.stackrestore intrinsic"); + errs() << "WARNING: this target does not support the llvm.stack" + << (Callee->getIntrinsicID() == Intrinsic::stacksave ? + "save" : "restore") << " intrinsic.\n"; Warned = true; + if (Callee->getIntrinsicID() == Intrinsic::stacksave) + CI->replaceAllUsesWith(Constant::getNullValue(CI->getType())); break; + } case Intrinsic::returnaddress: - Context.emitWarning("this target does not support the " - "llvm.returnaddress intrinsic"); - CI->replaceAllUsesWith(ConstantPointerNull::get( - cast<PointerType>(CI->getType()))); - break; - case Intrinsic::frameaddress: - Context.emitWarning("this target does not support the " - "llvm.frameaddress intrinsic"); + errs() << "WARNING: this target does not support the llvm." + << (Callee->getIntrinsicID() == Intrinsic::returnaddress ? + "return" : "frame") << "address intrinsic.\n"; CI->replaceAllUsesWith(ConstantPointerNull::get( cast<PointerType>(CI->getType()))); break; @@ -446,12 +438,12 @@ void IntrinsicLowering::LowerIntrinsicCall(CallInst *CI) { case Intrinsic::pcmarker: break; // Simply strip out pcmarker on unsupported architectures - case Intrinsic::readcyclecounter: - Context.emitWarning("this target does not support the " - "llvm.readcyclecounter intrinsic; " - "it is being lowered to a constant 0"); + case Intrinsic::readcyclecounter: { + errs() << "WARNING: this target does not support the llvm.readcyclecoun" + << "ter intrinsic. 
It is being lowered to a constant 0\n"; CI->replaceAllUsesWith(ConstantInt::get(Type::getInt64Ty(Context), 0)); break; + } case Intrinsic::dbg_declare: break; // Simply strip out debugging intrinsics diff --git a/lib/CodeGen/LLVMBuild.txt b/lib/CodeGen/LLVMBuild.txt index fee0347..81ef1aa 100644 --- a/lib/CodeGen/LLVMBuild.txt +++ b/lib/CodeGen/LLVMBuild.txt @@ -22,4 +22,4 @@ subdirectories = AsmPrinter SelectionDAG type = Library name = CodeGen parent = Libraries -required_libraries = Analysis Core MC Scalar Support Target TransformUtils +required_libraries = Analysis Core MC Scalar Support Target TransformUtils ObjCARC diff --git a/lib/CodeGen/LLVMTargetMachine.cpp b/lib/CodeGen/LLVMTargetMachine.cpp index 12cd2d1..1a09837 100644 --- a/lib/CodeGen/LLVMTargetMachine.cpp +++ b/lib/CodeGen/LLVMTargetMachine.cpp @@ -226,7 +226,6 @@ bool LLVMTargetMachine::addPassesToEmitFile(PassManagerBase &PM, PM.add(Printer); - PM.add(createGCInfoDeleter()); return false; } @@ -245,7 +244,6 @@ bool LLVMTargetMachine::addPassesToEmitMachineCode(PassManagerBase &PM, return true; addCodeEmitter(PM, JCE); - PM.add(createGCInfoDeleter()); return false; // success! } diff --git a/lib/CodeGen/LexicalScopes.cpp b/lib/CodeGen/LexicalScopes.cpp index 3c01d91..8172154 100644 --- a/lib/CodeGen/LexicalScopes.cpp +++ b/lib/CodeGen/LexicalScopes.cpp @@ -314,24 +314,22 @@ bool LexicalScopes::dominates(DebugLoc DL, MachineBasicBlock *MBB) { void LexicalScope::anchor() { } /// dump - Print data structures. -void LexicalScope::dump() const { +void LexicalScope::dump(unsigned Indent) const { #ifndef NDEBUG raw_ostream &err = dbgs(); - err.indent(IndentLevel); + err.indent(Indent); err << "DFSIn: " << DFSIn << " DFSOut: " << DFSOut << "\n"; const MDNode *N = Desc; + err.indent(Indent); N->dump(); if (AbstractScope) - err << "Abstract Scope\n"; + err << std::string(Indent, ' ') << "Abstract Scope\n"; - IndentLevel += 2; if (!Children.empty()) - err << "Children ...\n"; + err << std::string(Indent + 2, ' ') << "Children ...\n"; for (unsigned i = 0, e = Children.size(); i != e; ++i) if (Children[i] != this) - Children[i]->dump(); - - IndentLevel -= 2; + Children[i]->dump(Indent + 2); #endif } diff --git a/lib/CodeGen/LiveDebugVariables.cpp b/lib/CodeGen/LiveDebugVariables.cpp index 786f353..0b117ac 100644 --- a/lib/CodeGen/LiveDebugVariables.cpp +++ b/lib/CodeGen/LiveDebugVariables.cpp @@ -247,10 +247,6 @@ public: LiveIntervals &LIS, MachineDominatorTree &MDT, UserValueScopes &UVS); - /// renameRegister - Update locations to rewrite OldReg as NewReg:SubIdx. - void renameRegister(unsigned OldReg, unsigned NewReg, unsigned SubIdx, - const TargetRegisterInfo *TRI); - /// splitRegister - Replace OldReg ranges with NewRegs ranges where NewRegs is /// live. Returns true if any changes were made. bool splitRegister(unsigned OldLocNo, ArrayRef<LiveInterval*> NewRegs); @@ -259,7 +255,7 @@ public: /// provided virtual register map. void rewriteLocations(VirtRegMap &VRM, const TargetRegisterInfo &TRI); - /// emitDebugVariables - Recreate DBG_VALUE instruction from data structures. + /// emitDebugValues - Recreate DBG_VALUE instruction from data structures. void emitDebugValues(VirtRegMap *VRM, LiveIntervals &LIS, const TargetInstrInfo &TRI); @@ -286,6 +282,11 @@ class LDVImpl { MachineDominatorTree *MDT; const TargetRegisterInfo *TRI; + /// Whether emitDebugValues is called. + bool EmitDone; + /// Whether the machine function is modified during the pass. + bool ModifiedMF; + /// userValues - All allocated UserValue instances. 
SmallVector<UserValue*, 8> userValues; @@ -320,27 +321,30 @@ class LDVImpl { void computeIntervals(); public: - LDVImpl(LiveDebugVariables *ps) : pass(*ps) {} + LDVImpl(LiveDebugVariables *ps) : pass(*ps), EmitDone(false), + ModifiedMF(false) {} bool runOnMachineFunction(MachineFunction &mf); - /// clear - Relase all memory. + /// clear - Release all memory. void clear() { DeleteContainerPointers(userValues); userValues.clear(); virtRegToEqClass.clear(); userVarMap.clear(); + // Make sure we call emitDebugValues if the machine function was modified. + assert((!ModifiedMF || EmitDone) && + "Dbg values are not emitted in LDV"); + EmitDone = false; + ModifiedMF = false; } /// mapVirtReg - Map virtual register to an equivalence class. void mapVirtReg(unsigned VirtReg, UserValue *EC); - /// renameRegister - Replace all references to OldReg with NewReg:SubIdx. - void renameRegister(unsigned OldReg, unsigned NewReg, unsigned SubIdx); - /// splitRegister - Replace all references to OldReg with NewRegs. void splitRegister(unsigned OldReg, ArrayRef<LiveInterval*> NewRegs); - /// emitDebugVariables - Recreate DBG_VALUE instruction from data structures. + /// emitDebugValues - Recreate DBG_VALUE instruction from data structures. void emitDebugValues(VirtRegMap *VRM); void print(raw_ostream&); @@ -693,6 +697,7 @@ bool LDVImpl::runOnMachineFunction(MachineFunction &mf) { computeIntervals(); DEBUG(print(dbgs())); LS.releaseMemory(); + ModifiedMF = Changed; return Changed; } @@ -714,45 +719,6 @@ LiveDebugVariables::~LiveDebugVariables() { delete static_cast<LDVImpl*>(pImpl); } -void UserValue:: -renameRegister(unsigned OldReg, unsigned NewReg, unsigned SubIdx, - const TargetRegisterInfo *TRI) { - for (unsigned i = locations.size(); i; --i) { - unsigned LocNo = i - 1; - MachineOperand &Loc = locations[LocNo]; - if (!Loc.isReg() || Loc.getReg() != OldReg) - continue; - if (TargetRegisterInfo::isPhysicalRegister(NewReg)) - Loc.substPhysReg(NewReg, *TRI); - else - Loc.substVirtReg(NewReg, SubIdx, *TRI); - coalesceLocation(LocNo); - } -} - -void LDVImpl:: -renameRegister(unsigned OldReg, unsigned NewReg, unsigned SubIdx) { - UserValue *UV = lookupVirtReg(OldReg); - if (!UV) - return; - - if (TargetRegisterInfo::isVirtualRegister(NewReg)) - mapVirtReg(NewReg, UV); - if (OldReg != NewReg) - virtRegToEqClass.erase(OldReg); - - do { - UV->renameRegister(OldReg, NewReg, SubIdx, TRI); - UV = UV->getNext(); - } while (UV); -} - -void LiveDebugVariables:: -renameRegister(unsigned OldReg, unsigned NewReg, unsigned SubIdx) { - if (pImpl) - static_cast<LDVImpl*>(pImpl)->renameRegister(OldReg, NewReg, SubIdx); -} - //===----------------------------------------------------------------------===// // Live Range Splitting //===----------------------------------------------------------------------===// @@ -1011,6 +977,7 @@ void LDVImpl::emitDebugValues(VirtRegMap *VRM) { userValues[i]->rewriteLocations(*VRM, *TRI); userValues[i]->emitDebugValues(VRM, *LIS, *TII); } + EmitDone = true; } void LiveDebugVariables::emitDebugValues(VirtRegMap *VRM) { diff --git a/lib/CodeGen/LiveInterval.cpp b/lib/CodeGen/LiveInterval.cpp index 68f4b16..dccd847 100644 --- a/lib/CodeGen/LiveInterval.cpp +++ b/lib/CodeGen/LiveInterval.cpp @@ -440,7 +440,7 @@ void LiveInterval::join(LiveInterval &Other, iterator OutIt = begin(); OutIt->valno = NewVNInfo[LHSValNoAssignments[OutIt->valno->id]]; - for (iterator I = next(OutIt), E = end(); I != E; ++I) { + for (iterator I = llvm::next(OutIt), E = end(); I != E; ++I) { VNInfo* nextValNo = 
NewVNInfo[LHSValNoAssignments[I->valno->id]]; assert(nextValNo != 0 && "Huh?"); @@ -464,10 +464,12 @@ void LiveInterval::join(LiveInterval &Other, ranges.erase(OutIt, end()); } - // Remember assignements because val# ids are changing. - SmallVector<unsigned, 16> OtherAssignments; + // Rewrite Other values before changing the VNInfo ids. + // This can leave Other in an invalid state because we're not coalescing + // touching segments that now have identical values. That's OK since Other is + // not supposed to be valid after calling join(); for (iterator I = Other.begin(), E = Other.end(); I != E; ++I) - OtherAssignments.push_back(RHSValNoAssignments[I->valno->id]); + I->valno = NewVNInfo[RHSValNoAssignments[I->valno->id]]; // Update val# info. Renumber them and make sure they all belong to this // LiveInterval now. Also remove dead val#'s. @@ -486,148 +488,9 @@ void LiveInterval::join(LiveInterval &Other, valnos.resize(NumNewVals); // shrinkify // Okay, now insert the RHS live ranges into the LHS. - unsigned RangeNo = 0; - for (iterator I = Other.begin(), E = Other.end(); I != E; ++I, ++RangeNo) { - // Map the valno in the other live range to the current live range. - I->valno = NewVNInfo[OtherAssignments[RangeNo]]; - assert(I->valno && "Adding a dead range?"); - } - mergeIntervalRanges(Other); - - verify(); -} - -/// \brief Helper function for merging in another LiveInterval's ranges. -/// -/// This is a helper routine implementing an efficient merge of another -/// LiveIntervals ranges into the current interval. -/// -/// \param LHSValNo If non-NULL, set as the new value number for every range -/// from RHS which is merged into the LHS. -/// \param RHSValNo If non-NULL, then only ranges in RHS whose original value -/// number maches this value number will be merged into LHS. -void LiveInterval::mergeIntervalRanges(const LiveInterval &RHS, - VNInfo *LHSValNo, - const VNInfo *RHSValNo) { - if (RHS.empty()) - return; - - // Ensure we're starting with a valid range. Note that we don't verify RHS - // because it may have had its value numbers adjusted in preparation for - // merging. - verify(); - - // The strategy for merging these efficiently is as follows: - // - // 1) Find the beginning of the impacted ranges in the LHS. - // 2) Create a new, merged sub-squence of ranges merging from the position in - // #1 until either LHS or RHS is exhausted. Any part of LHS between RHS - // entries being merged will be copied into this new range. - // 3) Replace the relevant section in LHS with these newly merged ranges. - // 4) Append any remaning ranges from RHS if LHS is exhausted in #2. - // - // We don't follow the typical in-place merge strategy for sorted ranges of - // appending the new ranges to the back and then using std::inplace_merge - // because one step of the merge can both mutate the original elements and - // remove elements from the original. Essentially, because the merge includes - // collapsing overlapping ranges, a more complex approach is required. - - // We do an initial binary search to optimize for a common pattern: a large - // LHS, and a very small RHS. - const_iterator RI = RHS.begin(), RE = RHS.end(); - iterator LE = end(), LI = std::upper_bound(begin(), LE, *RI); - - // Merge into NewRanges until one of the ranges is exhausted. - SmallVector<LiveRange, 4> NewRanges; - - // Keep track of where to begin the replacement. - iterator ReplaceI = LI; - - // If there are preceding ranges in the LHS, put the last one into NewRanges - // so we can optionally extend it. 
Adjust the replacement point accordingly. - if (LI != begin()) { - ReplaceI = llvm::prior(LI); - NewRanges.push_back(*ReplaceI); - } - - // Now loop over the mergable portions of both LHS and RHS, merging into - // NewRanges. - while (LI != LE && RI != RE) { - // Skip incoming ranges with the wrong value. - if (RHSValNo && RI->valno != RHSValNo) { - ++RI; - continue; - } - - // Select the first range. We pick the earliest start point, and then the - // largest range. - LiveRange R = *LI; - if (*RI < R) { - R = *RI; - ++RI; - if (LHSValNo) - R.valno = LHSValNo; - } else { - ++LI; - } - - if (NewRanges.empty()) { - NewRanges.push_back(R); - continue; - } - - LiveRange &LastR = NewRanges.back(); - if (R.valno == LastR.valno) { - // Try to merge this range into the last one. - if (R.start <= LastR.end) { - LastR.end = std::max(LastR.end, R.end); - continue; - } - } else { - // We can't merge ranges across a value number. - assert(R.start >= LastR.end && - "Cannot overlap two LiveRanges with differing ValID's"); - } - - // If all else fails, just append the range. - NewRanges.push_back(R); - } - assert(RI == RE || LI == LE); - - // Check for being able to merge into the trailing sequence of ranges on the LHS. - if (!NewRanges.empty()) - for (; LI != LE && (LI->valno == NewRanges.back().valno && - LI->start <= NewRanges.back().end); - ++LI) - NewRanges.back().end = std::max(NewRanges.back().end, LI->end); - - // Replace the ranges in the LHS with the newly merged ones. It would be - // really nice if there were a move-supporting 'replace' directly in - // SmallVector, but as there is not, we pay the price of copies to avoid - // wasted memory allocations. - SmallVectorImpl<LiveRange>::iterator NRI = NewRanges.begin(), - NRE = NewRanges.end(); - for (; ReplaceI != LI && NRI != NRE; ++ReplaceI, ++NRI) - *ReplaceI = *NRI; - if (NRI == NRE) - ranges.erase(ReplaceI, LI); - else - ranges.insert(LI, NRI, NRE); - - // And finally insert any trailing end of RHS (if we have one). - for (; RI != RE; ++RI) { - LiveRange R = *RI; - if (LHSValNo) - R.valno = LHSValNo; - if (!ranges.empty() && - ranges.back().valno == R.valno && R.start <= ranges.back().end) - ranges.back().end = std::max(ranges.back().end, R.end); - else - ranges.push_back(R); - } - - // Ensure we finished with a valid new sequence of ranges. - verify(); + LiveRangeUpdater Updater(this); + for (iterator I = Other.begin(), E = Other.end(); I != E; ++I) + Updater.add(*I); } /// MergeRangesInAsValue - Merge all of the intervals in RHS into this live @@ -636,7 +499,9 @@ void LiveInterval::mergeIntervalRanges(const LiveInterval &RHS, /// the overlapping LiveRanges have the specified value number. 
void LiveInterval::MergeRangesInAsValue(const LiveInterval &RHS, VNInfo *LHSValNo) { - mergeIntervalRanges(RHS, LHSValNo); + LiveRangeUpdater Updater(this); + for (const_iterator I = RHS.begin(), E = RHS.end(); I != E; ++I) + Updater.add(I->start, I->end, LHSValNo); } /// MergeValueInAsValue - Merge all of the live ranges of a specific val# @@ -647,7 +512,10 @@ void LiveInterval::MergeRangesInAsValue(const LiveInterval &RHS, void LiveInterval::MergeValueInAsValue(const LiveInterval &RHS, const VNInfo *RHSValNo, VNInfo *LHSValNo) { - mergeIntervalRanges(RHS, LHSValNo, RHSValNo); + LiveRangeUpdater Updater(this); + for (const_iterator I = RHS.begin(), E = RHS.end(); I != E; ++I) + if (I->valno == RHSValNo) + Updater.add(I->start, I->end, LHSValNo); } /// MergeValueNumberInto - This method is called when two value numbers @@ -785,6 +653,206 @@ void LiveRange::print(raw_ostream &os) const { os << *this; } +//===----------------------------------------------------------------------===// +// LiveRangeUpdater class +//===----------------------------------------------------------------------===// +// +// The LiveRangeUpdater class always maintains these invariants: +// +// - When LastStart is invalid, Spills is empty and the iterators are invalid. +// This is the initial state, and the state created by flush(). +// In this state, isDirty() returns false. +// +// Otherwise, segments are kept in three separate areas: +// +// 1. [begin; WriteI) at the front of LI. +// 2. [ReadI; end) at the back of LI. +// 3. Spills. +// +// - LI.begin() <= WriteI <= ReadI <= LI.end(). +// - Segments in all three areas are fully ordered and coalesced. +// - Segments in area 1 precede and can't coalesce with segments in area 2. +// - Segments in Spills precede and can't coalesce with segments in area 2. +// - No coalescing is possible between segments in Spills and segments in area +// 1, and there are no overlapping segments. +// +// The segments in Spills are not ordered with respect to the segments in area +// 1. They need to be merged. +// +// When they exist, Spills.back().start <= LastStart, +// and WriteI[-1].start <= LastStart. + +void LiveRangeUpdater::print(raw_ostream &OS) const { + if (!isDirty()) { + if (LI) + OS << "Clean " << PrintReg(LI->reg) << " updater: " << *LI << '\n'; + else + OS << "Null updater.\n"; + return; + } + assert(LI && "Can't have null LI in dirty updater."); + OS << PrintReg(LI->reg) << " updater with gap = " << (ReadI - WriteI) + << ", last start = " << LastStart + << ":\n Area 1:"; + for (LiveInterval::const_iterator I = LI->begin(); I != WriteI; ++I) + OS << ' ' << *I; + OS << "\n Spills:"; + for (unsigned I = 0, E = Spills.size(); I != E; ++I) + OS << ' ' << Spills[I]; + OS << "\n Area 2:"; + for (LiveInterval::const_iterator I = ReadI, E = LI->end(); I != E; ++I) + OS << ' ' << *I; + OS << '\n'; +} + +void LiveRangeUpdater::dump() const +{ + print(errs()); +} + +// Determine if A and B should be coalesced. +static inline bool coalescable(const LiveRange &A, const LiveRange &B) { + assert(A.start <= B.start && "Unordered live ranges."); + if (A.end == B.start) + return A.valno == B.valno; + if (A.end < B.start) + return false; + assert(A.valno == B.valno && "Cannot overlap different values"); + return true; +} + +void LiveRangeUpdater::add(LiveRange Seg) { + assert(LI && "Cannot add to a null destination"); + + // Flush the state if Start moves backwards.
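// (add() expects segments to arrive with non-decreasing start points; a
// start that moves backwards would break the area invariants above, so the
// updater is flushed back to its clean initial state first.)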
+ if (!LastStart.isValid() || LastStart > Seg.start) { + if (isDirty()) + flush(); + // This brings us to an uninitialized state. Reinitialize. + assert(Spills.empty() && "Leftover spilled segments"); + WriteI = ReadI = LI->begin(); + } + + // Remember start for next time. + LastStart = Seg.start; + + // Advance ReadI until it ends after Seg.start. + LiveInterval::iterator E = LI->end(); + if (ReadI != E && ReadI->end <= Seg.start) { + // First try to close the gap between WriteI and ReadI with spills. + if (ReadI != WriteI) + mergeSpills(); + // Then advance ReadI. + if (ReadI == WriteI) + ReadI = WriteI = LI->find(Seg.start); + else + while (ReadI != E && ReadI->end <= Seg.start) + *WriteI++ = *ReadI++; + } + + assert(ReadI == E || ReadI->end > Seg.start); + + // Check if the ReadI segment begins early. + if (ReadI != E && ReadI->start <= Seg.start) { + assert(ReadI->valno == Seg.valno && "Cannot overlap different values"); + // Bail if Seg is completely contained in ReadI. + if (ReadI->end >= Seg.end) + return; + // Coalesce into Seg. + Seg.start = ReadI->start; + ++ReadI; + } + + // Coalesce as much as possible from ReadI into Seg. + while (ReadI != E && coalescable(Seg, *ReadI)) { + Seg.end = std::max(Seg.end, ReadI->end); + ++ReadI; + } + + // Try coalescing Spills.back() into Seg. + if (!Spills.empty() && coalescable(Spills.back(), Seg)) { + Seg.start = Spills.back().start; + Seg.end = std::max(Spills.back().end, Seg.end); + Spills.pop_back(); + } + + // Try coalescing Seg into WriteI[-1]. + if (WriteI != LI->begin() && coalescable(WriteI[-1], Seg)) { + WriteI[-1].end = std::max(WriteI[-1].end, Seg.end); + return; + } + + // Seg doesn't coalesce with anything, and needs to be inserted somewhere. + if (WriteI != ReadI) { + *WriteI++ = Seg; + return; + } + + // Finally, append to LI or Spills. + if (WriteI == E) { + LI->ranges.push_back(Seg); + WriteI = ReadI = LI->ranges.end(); + } else + Spills.push_back(Seg); +} + +// Merge as many spilled segments as possible into the gap between WriteI +// and ReadI. Advance WriteI to reflect the inserted instructions. +void LiveRangeUpdater::mergeSpills() { + // Perform a backwards merge of Spills and [SpillI;WriteI). + size_t GapSize = ReadI - WriteI; + size_t NumMoved = std::min(Spills.size(), GapSize); + LiveInterval::iterator Src = WriteI; + LiveInterval::iterator Dst = Src + NumMoved; + LiveInterval::iterator SpillSrc = Spills.end(); + LiveInterval::iterator B = LI->begin(); + + // This is the new WriteI position after merging spills. + WriteI = Dst; + + // Now merge Src and Spills backwards. + while (Src != Dst) { + if (Src != B && Src[-1].start > SpillSrc[-1].start) + *--Dst = *--Src; + else + *--Dst = *--SpillSrc; + } + assert(NumMoved == size_t(Spills.end() - SpillSrc)); + Spills.erase(SpillSrc, Spills.end()); +} + +void LiveRangeUpdater::flush() { + if (!isDirty()) + return; + // Clear the dirty state. + LastStart = SlotIndex(); + + assert(LI && "Cannot add to a null destination"); + + // Nothing to merge? + if (Spills.empty()) { + LI->ranges.erase(WriteI, ReadI); + LI->verify(); + return; + } + + // Resize the WriteI - ReadI gap to match Spills. + size_t GapSize = ReadI - WriteI; + if (GapSize < Spills.size()) { + // The gap is too small. Make some room. + size_t WritePos = WriteI - LI->begin(); + LI->ranges.insert(ReadI, Spills.size() - GapSize, LiveRange()); + // This also invalidated ReadI, but it is recomputed below. + WriteI = LI->ranges.begin() + WritePos; + } else { + // Shrink the gap if necessary. 
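// (Walk-through under an assumed state: with two spilled segments and a
// four-slot gap, the spills only need [WriteI, WriteI+2), so the surplus
// [WriteI+2, ReadI) is erased before mergeSpills() runs.)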
+ LI->ranges.erase(WriteI + Spills.size(), ReadI); + } + ReadI = WriteI + Spills.size(); + mergeSpills(); + LI->verify(); +} + unsigned ConnectedVNInfoEqClasses::Classify(const LiveInterval *LI) { // Create initial equivalence classes. EqClass.clear(); diff --git a/lib/CodeGen/LiveIntervalAnalysis.cpp b/lib/CodeGen/LiveIntervalAnalysis.cpp index 4198457..22b35d5 100644 --- a/lib/CodeGen/LiveIntervalAnalysis.cpp +++ b/lib/CodeGen/LiveIntervalAnalysis.cpp @@ -40,11 +40,6 @@ #include <limits> using namespace llvm; -// Switch to the new experimental algorithm for computing live intervals. -static cl::opt<bool> -NewLiveIntervals("new-live-intervals", cl::Hidden, - cl::desc("Use new algorithm forcomputing live intervals")); - char LiveIntervals::ID = 0; char &llvm::LiveIntervalsID = LiveIntervals::ID; INITIALIZE_PASS_BEGIN(LiveIntervals, "liveintervals", @@ -60,6 +55,9 @@ void LiveIntervals::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesCFG(); AU.addRequired<AliasAnalysis>(); AU.addPreserved<AliasAnalysis>(); + // LiveVariables isn't really required by this analysis, it is only required + // here to make sure it is live during TwoAddressInstructionPass and + // PHIElimination. This is temporary. AU.addRequired<LiveVariables>(); AU.addPreserved<LiveVariables>(); AU.addPreservedID(MachineLoopInfoID); @@ -105,7 +103,6 @@ bool LiveIntervals::runOnMachineFunction(MachineFunction &fn) { TRI = TM->getRegisterInfo(); TII = TM->getInstrInfo(); AA = &getAnalysis<AliasAnalysis>(); - LV = &getAnalysis<LiveVariables>(); Indexes = &getAnalysis<SlotIndexes>(); DomTree = &getAnalysis<MachineDominatorTree>(); if (!LRCalc) @@ -114,16 +111,8 @@ bool LiveIntervals::runOnMachineFunction(MachineFunction &fn) { // Allocate space for all virtual registers. VirtRegIntervals.resize(MRI->getNumVirtRegs()); - if (NewLiveIntervals) { - // This is the new way of computing live intervals. - // It is independent of LiveVariables, and it can run at any time. - computeVirtRegs(); - computeRegMasks(); - } else { - // This is the old way of computing live intervals. - // It depends on LiveVariables. - computeIntervals(); - } + computeVirtRegs(); + computeRegMasks(); computeLiveInRegUnits(); DEBUG(dump()); @@ -165,298 +154,6 @@ void LiveIntervals::dumpInstrs() const { } #endif -static -bool MultipleDefsBySameMI(const MachineInstr &MI, unsigned MOIdx) { - unsigned Reg = MI.getOperand(MOIdx).getReg(); - for (unsigned i = MOIdx+1, e = MI.getNumOperands(); i < e; ++i) { - const MachineOperand &MO = MI.getOperand(i); - if (!MO.isReg()) - continue; - if (MO.getReg() == Reg && MO.isDef()) { - assert(MI.getOperand(MOIdx).getSubReg() != MO.getSubReg() && - MI.getOperand(MOIdx).getSubReg() && - (MO.getSubReg() || MO.isImplicit())); - return true; - } - } - return false; -} - -/// isPartialRedef - Return true if the specified def at the specific index is -/// partially re-defining the specified live interval. A common case of this is -/// a definition of the sub-register. 
-bool LiveIntervals::isPartialRedef(SlotIndex MIIdx, MachineOperand &MO, - LiveInterval &interval) { - if (!MO.getSubReg() || MO.isEarlyClobber()) - return false; - - SlotIndex RedefIndex = MIIdx.getRegSlot(); - const LiveRange *OldLR = - interval.getLiveRangeContaining(RedefIndex.getRegSlot(true)); - MachineInstr *DefMI = getInstructionFromIndex(OldLR->valno->def); - if (DefMI != 0) { - return DefMI->findRegisterDefOperandIdx(interval.reg) != -1; - } - return false; -} - -void LiveIntervals::handleVirtualRegisterDef(MachineBasicBlock *mbb, - MachineBasicBlock::iterator mi, - SlotIndex MIIdx, - MachineOperand& MO, - unsigned MOIdx, - LiveInterval &interval) { - DEBUG(dbgs() << "\t\tregister: " << PrintReg(interval.reg, TRI)); - - // Virtual registers may be defined multiple times (due to phi - // elimination and 2-addr elimination). Much of what we do only has to be - // done once for the vreg. We use an empty interval to detect the first - // time we see a vreg. - LiveVariables::VarInfo& vi = LV->getVarInfo(interval.reg); - if (interval.empty()) { - // Get the Idx of the defining instructions. - SlotIndex defIndex = MIIdx.getRegSlot(MO.isEarlyClobber()); - - // Make sure the first definition is not a partial redefinition. - assert(!MO.readsReg() && "First def cannot also read virtual register " - "missing <undef> flag?"); - - VNInfo *ValNo = interval.getNextValue(defIndex, VNInfoAllocator); - assert(ValNo->id == 0 && "First value in interval is not 0?"); - - // Loop over all of the blocks that the vreg is defined in. There are - // two cases we have to handle here. The most common case is a vreg - // whose lifetime is contained within a basic block. In this case there - // will be a single kill, in MBB, which comes after the definition. - if (vi.Kills.size() == 1 && vi.Kills[0]->getParent() == mbb) { - // FIXME: what about dead vars? - SlotIndex killIdx; - if (vi.Kills[0] != mi) - killIdx = getInstructionIndex(vi.Kills[0]).getRegSlot(); - else - killIdx = defIndex.getDeadSlot(); - - // If the kill happens after the definition, we have an intra-block - // live range. - if (killIdx > defIndex) { - assert(vi.AliveBlocks.empty() && - "Shouldn't be alive across any blocks!"); - LiveRange LR(defIndex, killIdx, ValNo); - interval.addRange(LR); - DEBUG(dbgs() << " +" << LR << "\n"); - return; - } - } - - // The other case we handle is when a virtual register lives to the end - // of the defining block, potentially live across some blocks, then is - // live into some number of blocks, but gets killed. Start by adding a - // range that goes from this definition to the end of the defining block. - LiveRange NewLR(defIndex, getMBBEndIdx(mbb), ValNo); - DEBUG(dbgs() << " +" << NewLR); - interval.addRange(NewLR); - - bool PHIJoin = LV->isPHIJoin(interval.reg); - - if (PHIJoin) { - // A phi join register is killed at the end of the MBB and revived as a - // new valno in the killing blocks. - assert(vi.AliveBlocks.empty() && "Phi join can't pass through blocks"); - DEBUG(dbgs() << " phi-join"); - } else { - // Iterate over all of the blocks that the variable is completely - // live in, adding [insrtIndex(begin), instrIndex(end)+4) to the - // live interval. 
- for (SparseBitVector<>::iterator I = vi.AliveBlocks.begin(), - E = vi.AliveBlocks.end(); I != E; ++I) { - MachineBasicBlock *aliveBlock = MF->getBlockNumbered(*I); - LiveRange LR(getMBBStartIdx(aliveBlock), getMBBEndIdx(aliveBlock), - ValNo); - interval.addRange(LR); - DEBUG(dbgs() << " +" << LR); - } - } - - // Finally, this virtual register is live from the start of any killing - // block to the 'use' slot of the killing instruction. - for (unsigned i = 0, e = vi.Kills.size(); i != e; ++i) { - MachineInstr *Kill = vi.Kills[i]; - SlotIndex Start = getMBBStartIdx(Kill->getParent()); - SlotIndex killIdx = getInstructionIndex(Kill).getRegSlot(); - - // Create interval with one of a NEW value number. Note that this value - // number isn't actually defined by an instruction, weird huh? :) - if (PHIJoin) { - assert(getInstructionFromIndex(Start) == 0 && - "PHI def index points at actual instruction."); - ValNo = interval.getNextValue(Start, VNInfoAllocator); - } - LiveRange LR(Start, killIdx, ValNo); - interval.addRange(LR); - DEBUG(dbgs() << " +" << LR); - } - - } else { - if (MultipleDefsBySameMI(*mi, MOIdx)) - // Multiple defs of the same virtual register by the same instruction. - // e.g. %reg1031:5<def>, %reg1031:6<def> = VLD1q16 %reg1024<kill>, ... - // This is likely due to elimination of REG_SEQUENCE instructions. Return - // here since there is nothing to do. - return; - - // If this is the second time we see a virtual register definition, it - // must be due to phi elimination or two addr elimination. If this is - // the result of two address elimination, then the vreg is one of the - // def-and-use register operand. - - // It may also be partial redef like this: - // 80 %reg1041:6<def> = VSHRNv4i16 %reg1034<kill>, 12, pred:14, pred:%reg0 - // 120 %reg1041:5<def> = VSHRNv4i16 %reg1039<kill>, 12, pred:14, pred:%reg0 - bool PartReDef = isPartialRedef(MIIdx, MO, interval); - if (PartReDef || mi->isRegTiedToUseOperand(MOIdx)) { - // If this is a two-address definition, then we have already processed - // the live range. The only problem is that we didn't realize there - // are actually two values in the live interval. Because of this we - // need to take the LiveRegion that defines this register and split it - // into two values. - SlotIndex RedefIndex = MIIdx.getRegSlot(MO.isEarlyClobber()); - - const LiveRange *OldLR = - interval.getLiveRangeContaining(RedefIndex.getRegSlot(true)); - VNInfo *OldValNo = OldLR->valno; - SlotIndex DefIndex = OldValNo->def.getRegSlot(); - - // Delete the previous value, which should be short and continuous, - // because the 2-addr copy must be in the same MBB as the redef. - interval.removeRange(DefIndex, RedefIndex); - - // The new value number (#1) is defined by the instruction we claimed - // defined value #0. - VNInfo *ValNo = interval.createValueCopy(OldValNo, VNInfoAllocator); - - // Value#0 is now defined by the 2-addr instruction. - OldValNo->def = RedefIndex; - - // Add the new live interval which replaces the range for the input copy. - LiveRange LR(DefIndex, RedefIndex, ValNo); - DEBUG(dbgs() << " replace range with " << LR); - interval.addRange(LR); - - // If this redefinition is dead, we need to add a dummy unit live - // range covering the def slot. 
- if (MO.isDead()) - interval.addRange(LiveRange(RedefIndex, RedefIndex.getDeadSlot(), - OldValNo)); - - DEBUG(dbgs() << " RESULT: " << interval); - } else if (LV->isPHIJoin(interval.reg)) { - // In the case of PHI elimination, each variable definition is only - // live until the end of the block. We've already taken care of the - // rest of the live range. - - SlotIndex defIndex = MIIdx.getRegSlot(); - if (MO.isEarlyClobber()) - defIndex = MIIdx.getRegSlot(true); - - VNInfo *ValNo = interval.getNextValue(defIndex, VNInfoAllocator); - - SlotIndex killIndex = getMBBEndIdx(mbb); - LiveRange LR(defIndex, killIndex, ValNo); - interval.addRange(LR); - DEBUG(dbgs() << " phi-join +" << LR); - } else { - llvm_unreachable("Multiply defined register"); - } - } - - DEBUG(dbgs() << '\n'); -} - -void LiveIntervals::handleRegisterDef(MachineBasicBlock *MBB, - MachineBasicBlock::iterator MI, - SlotIndex MIIdx, - MachineOperand& MO, - unsigned MOIdx) { - if (TargetRegisterInfo::isVirtualRegister(MO.getReg())) - handleVirtualRegisterDef(MBB, MI, MIIdx, MO, MOIdx, - getOrCreateInterval(MO.getReg())); -} - -/// computeIntervals - computes the live intervals for virtual -/// registers. for some ordering of the machine instructions [1,N] a -/// live interval is an interval [i, j) where 1 <= i <= j < N for -/// which a variable is live -void LiveIntervals::computeIntervals() { - DEBUG(dbgs() << "********** COMPUTING LIVE INTERVALS **********\n" - << "********** Function: " << MF->getName() << '\n'); - - RegMaskBlocks.resize(MF->getNumBlockIDs()); - - SmallVector<unsigned, 8> UndefUses; - for (MachineFunction::iterator MBBI = MF->begin(), E = MF->end(); - MBBI != E; ++MBBI) { - MachineBasicBlock *MBB = MBBI; - RegMaskBlocks[MBB->getNumber()].first = RegMaskSlots.size(); - - if (MBB->empty()) - continue; - - // Track the index of the current machine instr. - SlotIndex MIIndex = getMBBStartIdx(MBB); - DEBUG(dbgs() << "BB#" << MBB->getNumber() - << ":\t\t# derived from " << MBB->getName() << "\n"); - - // Skip over empty initial indices. - if (getInstructionFromIndex(MIIndex) == 0) - MIIndex = Indexes->getNextNonNullIndex(MIIndex); - - for (MachineBasicBlock::iterator MI = MBB->begin(), miEnd = MBB->end(); - MI != miEnd; ++MI) { - DEBUG(dbgs() << MIIndex << "\t" << *MI); - if (MI->isDebugValue()) - continue; - assert(Indexes->getInstructionFromIndex(MIIndex) == MI && - "Lost SlotIndex synchronization"); - - // Handle defs. - for (int i = MI->getNumOperands() - 1; i >= 0; --i) { - MachineOperand &MO = MI->getOperand(i); - - // Collect register masks. - if (MO.isRegMask()) { - RegMaskSlots.push_back(MIIndex.getRegSlot()); - RegMaskBits.push_back(MO.getRegMask()); - continue; - } - - if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg())) - continue; - - // handle register defs - build intervals - if (MO.isDef()) - handleRegisterDef(MBB, MI, MIIndex, MO, i); - else if (MO.isUndef()) - UndefUses.push_back(MO.getReg()); - } - - // Move to the next instr slot. - MIIndex = Indexes->getNextNonNullIndex(MIIndex); - } - - // Compute the number of register mask instructions in this block. - std::pair<unsigned, unsigned> &RMB = RegMaskBlocks[MBB->getNumber()]; - RMB.second = RegMaskSlots.size() - RMB.first; - } - - // Create empty intervals for registers defined by implicit_def's (except - // for those implicit_def that define values which are liveout of their - // blocks. 
- for (unsigned i = 0, e = UndefUses.size(); i != e; ++i) { - unsigned UndefReg = UndefUses[i]; - (void)getOrCreateInterval(UndefReg); - } -} - LiveInterval* LiveIntervals::createInterval(unsigned reg) { float Weight = TargetRegisterInfo::isPhysicalRegister(reg) ? HUGE_VALF : 0.0F; return new LiveInterval(reg, Weight); @@ -1335,3 +1032,129 @@ void LiveIntervals::handleMoveIntoBundle(MachineInstr* MI, HMEditor HME(*this, *MRI, *TRI, OldIndex, NewIndex, UpdateFlags); HME.updateAllRanges(MI); } + +void +LiveIntervals::repairIntervalsInRange(MachineBasicBlock *MBB, + MachineBasicBlock::iterator Begin, + MachineBasicBlock::iterator End, + ArrayRef<unsigned> OrigRegs) { + // Find anchor points, which are at the beginning/end of blocks or at + // instructions that already have indexes. + while (Begin != MBB->begin() && !Indexes->hasIndex(Begin)) + --Begin; + while (End != MBB->end() && !Indexes->hasIndex(End)) + ++End; + + SlotIndex endIdx; + if (End == MBB->end()) + endIdx = getMBBEndIdx(MBB).getPrevSlot(); + else + endIdx = getInstructionIndex(End); + + Indexes->repairIndexesInRange(MBB, Begin, End); + + for (MachineBasicBlock::iterator I = End; I != Begin;) { + --I; + MachineInstr *MI = I; + if (MI->isDebugValue()) + continue; + for (MachineInstr::const_mop_iterator MOI = MI->operands_begin(), + MOE = MI->operands_end(); MOI != MOE; ++MOI) { + if (MOI->isReg() && + TargetRegisterInfo::isVirtualRegister(MOI->getReg()) && + !hasInterval(MOI->getReg())) { + LiveInterval &LI = getOrCreateInterval(MOI->getReg()); + computeVirtRegInterval(&LI); + } + } + } + + for (unsigned i = 0, e = OrigRegs.size(); i != e; ++i) { + unsigned Reg = OrigRegs[i]; + if (!TargetRegisterInfo::isVirtualRegister(Reg)) + continue; + + LiveInterval &LI = getInterval(Reg); + // FIXME: Should we support undefs that gain defs? + if (!LI.hasAtLeastOneValue()) + continue; + + LiveInterval::iterator LII = LI.find(endIdx); + SlotIndex lastUseIdx; + if (LII != LI.end() && LII->start < endIdx) + lastUseIdx = LII->end; + else + --LII; + + for (MachineBasicBlock::iterator I = End; I != Begin;) { + --I; + MachineInstr *MI = I; + if (MI->isDebugValue()) + continue; + + SlotIndex instrIdx = getInstructionIndex(MI); + bool isStartValid = getInstructionFromIndex(LII->start); + bool isEndValid = getInstructionFromIndex(LII->end); + + // FIXME: This doesn't currently handle early-clobber or multiple removed + // defs inside of the region to repair. + for (MachineInstr::mop_iterator OI = MI->operands_begin(), + OE = MI->operands_end(); OI != OE; ++OI) { + const MachineOperand &MO = *OI; + if (!MO.isReg() || MO.getReg() != Reg) + continue; + + if (MO.isDef()) { + if (!isStartValid) { + if (LII->end.isDead()) { + SlotIndex prevStart; + if (LII != LI.begin()) + prevStart = llvm::prior(LII)->start; + + // FIXME: This could be more efficient if there was a removeRange + // method that returned an iterator. 
+ LI.removeRange(*LII, true); + if (prevStart.isValid()) + LII = LI.find(prevStart); + else + LII = LI.begin(); + } else { + LII->start = instrIdx.getRegSlot(); + LII->valno->def = instrIdx.getRegSlot(); + if (MO.getSubReg() && !MO.isUndef()) + lastUseIdx = instrIdx.getRegSlot(); + else + lastUseIdx = SlotIndex(); + continue; + } + } + + if (!lastUseIdx.isValid()) { + VNInfo *VNI = LI.getNextValue(instrIdx.getRegSlot(), + VNInfoAllocator); + LiveRange LR(instrIdx.getRegSlot(), instrIdx.getDeadSlot(), VNI); + LII = LI.addRange(LR); + } else if (LII->start != instrIdx.getRegSlot()) { + VNInfo *VNI = LI.getNextValue(instrIdx.getRegSlot(), + VNInfoAllocator); + LiveRange LR(instrIdx.getRegSlot(), lastUseIdx, VNI); + LII = LI.addRange(LR); + } + + if (MO.getSubReg() && !MO.isUndef()) + lastUseIdx = instrIdx.getRegSlot(); + else + lastUseIdx = SlotIndex(); + } else if (MO.isUse()) { + // FIXME: This should probably be handled outside of this branch, + // either as part of the def case (for defs inside of the region) or + // after the loop over the region. + if (!isEndValid && !LII->end.isBlock()) + LII->end = instrIdx.getRegSlot(); + if (!lastUseIdx.isValid()) + lastUseIdx = instrIdx.getRegSlot(); + } + } + } + } +} diff --git a/lib/CodeGen/LiveRangeCalc.cpp b/lib/CodeGen/LiveRangeCalc.cpp index c3ff4f1..dede490 100644 --- a/lib/CodeGen/LiveRangeCalc.cpp +++ b/lib/CodeGen/LiveRangeCalc.cpp @@ -18,10 +18,11 @@ using namespace llvm; -void LiveRangeCalc::reset(const MachineFunction *MF, +void LiveRangeCalc::reset(const MachineFunction *mf, SlotIndexes *SI, MachineDominatorTree *MDT, VNInfo::Allocator *VNIA) { + MF = mf; MRI = &MF->getRegInfo(); Indexes = SI; DomTree = MDT; @@ -104,28 +105,28 @@ void LiveRangeCalc::extendToUses(LiveInterval *LI, unsigned Reg) { // Transfer information from the LiveIn vector to the live ranges. -void LiveRangeCalc::updateLiveIns(VNInfo *OverrideVNI) { +void LiveRangeCalc::updateLiveIns() { + LiveRangeUpdater Updater; for (SmallVectorImpl<LiveInBlock>::iterator I = LiveIn.begin(), E = LiveIn.end(); I != E; ++I) { if (!I->DomNode) continue; MachineBasicBlock *MBB = I->DomNode->getBlock(); - - VNInfo *VNI = OverrideVNI ? OverrideVNI : I->Value; - assert(VNI && "No live-in value found"); - + assert(I->Value && "No live-in value found"); SlotIndex Start, End; tie(Start, End) = Indexes->getMBBRange(MBB); if (I->Kill.isValid()) - I->LI->addRange(LiveRange(Start, I->Kill, VNI)); + // Value is killed inside this block. + End = I->Kill; else { - I->LI->addRange(LiveRange(Start, End, VNI)); - // The value is live-through, update LiveOut as well. Defer the Domtree - // lookup until it is needed. + // The value is live-through, update LiveOut as well. + // Defer the Domtree lookup until it is needed. assert(Seen.test(MBB->getNumber())); - LiveOut[MBB] = LiveOutPair(VNI, (MachineDomTreeNode *)0); + LiveOut[MBB] = LiveOutPair(I->Value, (MachineDomTreeNode *)0); } + Updater.setDest(I->LI); + Updater.add(Start, End, I->Value); } LiveIn.clear(); } @@ -150,13 +151,11 @@ void LiveRangeCalc::extend(LiveInterval *LI, // multiple values, and we may need to create even more phi-defs to preserve // VNInfo SSA form. Perform a search for all predecessor blocks where we // know the dominating VNInfo. - VNInfo *VNI = findReachingDefs(LI, KillMBB, Kill, PhysReg); + if (findReachingDefs(LI, KillMBB, Kill, PhysReg)) + return; // When there were multiple different values, we may need new PHIs. 
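// (calculateValues() below performs the SSA update for those new PHIs and
// then materializes the live-in segments collected in the LiveIn array.)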
-  if (!VNI)
-    updateSSA();
-
-  updateLiveIns(VNI);
+  calculateValues();
 }
@@ -167,16 +166,18 @@
 void LiveRangeCalc::calculateValues() {
   assert(Indexes && "Missing SlotIndexes");
   assert(DomTree && "Missing dominator tree");
   updateSSA();
-  updateLiveIns(0);
+  updateLiveIns();
 }
 
 
-VNInfo *LiveRangeCalc::findReachingDefs(LiveInterval *LI,
-                                        MachineBasicBlock *KillMBB,
-                                        SlotIndex Kill,
-                                        unsigned PhysReg) {
-  // Blocks where LI should be live-in.
-  SmallVector<MachineBasicBlock*, 16> WorkList(1, KillMBB);
+bool LiveRangeCalc::findReachingDefs(LiveInterval *LI,
+                                     MachineBasicBlock *KillMBB,
+                                     SlotIndex Kill,
+                                     unsigned PhysReg) {
+  unsigned KillMBBNum = KillMBB->getNumber();
+
+  // Block numbers where LI should be live-in.
+  SmallVector<unsigned, 16> WorkList(1, KillMBBNum);
 
   // Remember if we have seen more than one value.
   bool UniqueVNI = true;
@@ -184,7 +185,7 @@ VNInfo *LiveRangeCalc::findReachingDefs(LiveInterval *LI,
 
   // Using Seen as a visited set, perform a BFS for all reaching defs.
   for (unsigned i = 0; i != WorkList.size(); ++i) {
-    MachineBasicBlock *MBB = WorkList[i];
+    MachineBasicBlock *MBB = MF->getBlockNumbered(WorkList[i]);
 
 #ifndef NDEBUG
     if (MBB->pred_empty()) {
@@ -231,25 +232,50 @@ VNInfo *LiveRangeCalc::findReachingDefs(LiveInterval *LI,
 
       // No, we need a live-in value for Pred as well
       if (Pred != KillMBB)
-        WorkList.push_back(Pred);
+        WorkList.push_back(Pred->getNumber());
       else
         // Loopback to KillMBB, so value is really live through.
         Kill = SlotIndex();
     }
   }
 
-  // Transfer WorkList to LiveInBlocks in reverse order.
-  // This ordering works best with updateSSA().
   LiveIn.clear();
-  LiveIn.reserve(WorkList.size());
-  while(!WorkList.empty())
-    addLiveInBlock(LI, DomTree->getNode(WorkList.pop_back_val()));
-  // The kill block may not be live-through.
-  assert(LiveIn.back().DomNode->getBlock() == KillMBB);
-  LiveIn.back().Kill = Kill;
 
+  // Both updateSSA() and LiveRangeUpdater benefit from ordered blocks, but
+  // neither requires it. Skip the sorting overhead for small updates.
+  if (WorkList.size() > 4)
+    array_pod_sort(WorkList.begin(), WorkList.end());
+
+  // If a unique reaching def was found, blit in the live ranges immediately.
+  if (UniqueVNI) {
+    LiveRangeUpdater Updater(LI);
+    for (SmallVectorImpl<unsigned>::const_iterator
+         I = WorkList.begin(), E = WorkList.end(); I != E; ++I) {
+      SlotIndex Start, End;
+      tie(Start, End) = Indexes->getMBBRange(*I);
+      // Trim the live range in KillMBB.
+      if (*I == KillMBBNum && Kill.isValid())
+        End = Kill;
+      else
+        LiveOut[MF->getBlockNumbered(*I)] =
+          LiveOutPair(TheVNI, (MachineDomTreeNode *)0);
+      Updater.add(Start, End, TheVNI);
+    }
+    return true;
+  }
+
+  // Multiple values were found, so transfer the work list to the LiveIn array
+  // where updateSSA() will use it as a work list.
+  LiveIn.reserve(WorkList.size());
+  for (SmallVectorImpl<unsigned>::const_iterator
+       I = WorkList.begin(), E = WorkList.end(); I != E; ++I) {
+    MachineBasicBlock *MBB = MF->getBlockNumbered(*I);
+    addLiveInBlock(LI, DomTree->getNode(MBB));
+    if (MBB == KillMBB)
+      LiveIn.back().Kill = Kill;
+  }
 
-  return UniqueVNI ? TheVNI : 0;
+  return false;
 }
diff --git a/lib/CodeGen/LiveRangeCalc.h b/lib/CodeGen/LiveRangeCalc.h
index 909829b..57cab7b 100644
--- a/lib/CodeGen/LiveRangeCalc.h
+++ b/lib/CodeGen/LiveRangeCalc.h
@@ -34,6 +34,7 @@ template <class NodeT> class DomTreeNodeBase;
 typedef DomTreeNodeBase<MachineBasicBlock> MachineDomTreeNode;
 
 class LiveRangeCalc {
+  const MachineFunction *MF;
   const MachineRegisterInfo *MRI;
   SlotIndexes *Indexes;
   MachineDominatorTree *DomTree;
@@ -100,17 +101,20 @@ class LiveRangeCalc {
   /// used to add entries directly.
   SmallVector<LiveInBlock, 16> LiveIn;
 
-  /// findReachingDefs - Assuming that LI is live-in to KillMBB and killed at
-  /// Kill, search for values that can reach KillMBB. All blocks that need LI
-  /// to be live-in are added to LiveIn. If a unique reaching def is found,
-  /// its value is returned, if Kill is jointly dominated by multiple values,
-  /// NULL is returned.
+  /// Assuming that LI is live-in to KillMBB and killed at Kill, find the set
+  /// of defs that can reach it.
+  ///
+  /// If only one def can reach Kill, all paths from the def to kill are added
+  /// to LI, and the function returns true.
+  ///
+  /// If multiple values can reach Kill, the blocks that need LI to be live in
+  /// are added to the LiveIn array, and the function returns false.
   ///
   /// PhysReg, when set, is used to verify live-in lists on basic blocks.
-  VNInfo *findReachingDefs(LiveInterval *LI,
-                           MachineBasicBlock *KillMBB,
-                           SlotIndex Kill,
-                           unsigned PhysReg);
+  bool findReachingDefs(LiveInterval *LI,
+                        MachineBasicBlock *KillMBB,
+                        SlotIndex Kill,
+                        unsigned PhysReg);
 
   /// updateSSA - Compute the values that will be live in to all requested
   /// blocks in LiveIn. Create PHI-def values as required to preserve SSA form.
@@ -119,12 +123,11 @@ class LiveRangeCalc {
   /// blocks. No values are read from the live ranges.
   void updateSSA();
 
-  /// updateLiveIns - Add liveness as specified in the LiveIn vector, using VNI
-  /// as a wildcard value for LiveIn entries without a value.
-  void updateLiveIns(VNInfo *VNI);
+  /// Add liveness as specified in the LiveIn vector.
+  void updateLiveIns();
 
 public:
-  LiveRangeCalc() : MRI(0), Indexes(0), DomTree(0), Alloc(0) {}
+  LiveRangeCalc() : MF(0), MRI(0), Indexes(0), DomTree(0), Alloc(0) {}
 
   //===--------------------------------------------------------------------===//
   // High-level interface.
diff --git a/lib/CodeGen/LiveVariables.cpp b/lib/CodeGen/LiveVariables.cpp
index f81ad1c..789eddc 100644
--- a/lib/CodeGen/LiveVariables.cpp
+++ b/lib/CodeGen/LiveVariables.cpp
@@ -619,29 +619,6 @@ bool LiveVariables::runOnMachineFunction(MachineFunction &mf) {
                                                            MBB);
     }
 
-    // Finally, if the last instruction in the block is a return, make sure to
-    // mark it as using all of the live-out values in the function.
-    // Things marked both call and return are tail calls; do not do this for
-    // them. The tail callee need not take the same registers as input
-    // that it produces as output, and there are dependencies for its input
-    // registers elsewhere.
-    if (!MBB->empty() && MBB->back().isReturn()
-        && !MBB->back().isCall()) {
-      MachineInstr *Ret = &MBB->back();
-
-      for (MachineRegisterInfo::liveout_iterator
-           I = MF->getRegInfo().liveout_begin(),
-           E = MF->getRegInfo().liveout_end(); I != E; ++I) {
-        assert(TargetRegisterInfo::isPhysicalRegister(*I) &&
-               "Cannot have a live-out virtual register!");
-        HandlePhysRegUse(*I, Ret);
-
-        // Add live-out registers as implicit uses.
-        if (!Ret->readsRegister(*I))
-          Ret->addOperand(MachineOperand::CreateReg(*I, false, true));
-      }
-    }
-
     // MachineCSE may CSE instructions which write to non-allocatable physical
     // registers across MBBs. Remember if any reserved register is liveout.
     SmallSet<unsigned, 4> LiveOuts;
diff --git a/lib/CodeGen/MachineBasicBlock.cpp b/lib/CodeGen/MachineBasicBlock.cpp
index 7e9fb20..898e165 100644
--- a/lib/CodeGen/MachineBasicBlock.cpp
+++ b/lib/CodeGen/MachineBasicBlock.cpp
@@ -15,10 +15,12 @@
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/Assembly/Writer.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
 #include "llvm/CodeGen/LiveVariables.h"
 #include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/SlotIndexes.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/DataLayout.h"
@@ -663,6 +665,13 @@ MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ, Pass *P) {
         << " -- BB#" << NMBB->getNumber()
         << " -- BB#" << Succ->getNumber() << '\n');
 
+  LiveIntervals *LIS = P->getAnalysisIfAvailable<LiveIntervals>();
+  SlotIndexes *Indexes = P->getAnalysisIfAvailable<SlotIndexes>();
+  if (LIS)
+    LIS->insertMBBInMaps(NMBB);
+  else if (Indexes)
+    Indexes->insertMBBInMaps(NMBB);
+
   // On some targets like Mips, branches may kill virtual registers. Make sure
   // that LiveVariables is properly updated after updateTerminator replaces the
   // terminators.
@@ -689,14 +698,67 @@ MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ, Pass *P) {
     }
   }
 
+  SmallVector<unsigned, 4> UsedRegs;
+  if (LIS) {
+    for (instr_iterator I = getFirstInstrTerminator(), E = instr_end();
+         I != E; ++I) {
+      MachineInstr *MI = I;
+
+      for (MachineInstr::mop_iterator OI = MI->operands_begin(),
+           OE = MI->operands_end(); OI != OE; ++OI) {
+        if (!OI->isReg() || OI->getReg() == 0)
+          continue;
+
+        unsigned Reg = OI->getReg();
+        if (std::find(UsedRegs.begin(), UsedRegs.end(), Reg) == UsedRegs.end())
+          UsedRegs.push_back(Reg);
+      }
+    }
+  }
+
   ReplaceUsesOfBlockWith(Succ, NMBB);
+
+  // If updateTerminator() removes instructions, we need to remove them from
+  // SlotIndexes.
+  SmallVector<MachineInstr*, 4> Terminators;
+  if (Indexes) {
+    for (instr_iterator I = getFirstInstrTerminator(), E = instr_end();
+         I != E; ++I)
+      Terminators.push_back(I);
+  }
+
   updateTerminator();
 
+  if (Indexes) {
+    SmallVector<MachineInstr*, 4> NewTerminators;
+    for (instr_iterator I = getFirstInstrTerminator(), E = instr_end();
+         I != E; ++I)
+      NewTerminators.push_back(I);
+
+    for (SmallVectorImpl<MachineInstr*>::iterator I = Terminators.begin(),
+        E = Terminators.end(); I != E; ++I) {
+      if (std::find(NewTerminators.begin(), NewTerminators.end(), *I) ==
+          NewTerminators.end())
+        Indexes->removeMachineInstrFromMaps(*I);
+    }
+  }
+
   // Insert unconditional "jump Succ" instruction in NMBB if necessary.
   NMBB->addSuccessor(Succ);
   if (!NMBB->isLayoutSuccessor(Succ)) {
     Cond.clear();
     MF->getTarget().getInstrInfo()->InsertBranch(*NMBB, Succ, NULL, Cond, dl);
+
+    if (Indexes) {
+      for (instr_iterator I = NMBB->instr_begin(), E = NMBB->instr_end();
+           I != E; ++I) {
+        // Some instructions may have been moved to NMBB by updateTerminator(),
+        // so we first remove any instruction that already has an index.
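// For illustration, the "remove any stale index, then reinsert" step above can
// be modeled with an ordinary map from instructions to slot numbers. This is a
// toy under stated assumptions -- Instr and SlotMap are invented stand-ins for
// MachineInstr* and SlotIndexes -- showing why the removal makes reinsertion
// safe for instructions that updateTerminator() may or may not have moved.

#include <cassert>
#include <map>

typedef int Instr; // stand-in for MachineInstr*

struct SlotMap {
  std::map<Instr, unsigned> Slots;
  unsigned NextSlot;
  SlotMap() : NextSlot(0) {}

  bool hasIndex(Instr I) const { return Slots.count(I) != 0; }
  void removeFromMaps(Instr I) { Slots.erase(I); }
  void insertInMaps(Instr I) {
    // The real structure also maps slots back to instructions, so inserting
    // an already-indexed instruction would leave a dangling reverse entry.
    assert(!hasIndex(I) && "instruction already indexed");
    Slots[I] = NextSlot++;
  }
};

// Safe whether or not I already had an index.
inline void reindexMovedInstr(SlotMap &M, Instr I) {
  if (M.hasIndex(I))
    M.removeFromMaps(I);
  M.insertInMaps(I);
}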
+        if (Indexes->hasIndex(I))
+          Indexes->removeMachineInstrFromMaps(I);
+        Indexes->insertMachineInstrInMaps(I);
+      }
+    }
   }
 
   // Fix PHI nodes in Succ so they refer to NMBB instead of this
@@ -731,6 +793,67 @@ MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ, Pass *P) {
       LV->addNewBlock(NMBB, this, Succ);
   }
 
+  if (LIS) {
+    // After splitting the edge and updating SlotIndexes, live intervals may be
+    // in one of two situations, depending on whether this block was the last in
+    // the function. If the original block was the last in the function, all live
+    // intervals will end prior to the beginning of the new split block. If the
+    // original block was not at the end of the function, all live intervals will
+    // extend to the end of the new split block.
+
+    bool isLastMBB =
+      llvm::next(MachineFunction::iterator(NMBB)) == getParent()->end();
+
+    SlotIndex StartIndex = Indexes->getMBBEndIdx(this);
+    SlotIndex PrevIndex = StartIndex.getPrevSlot();
+    SlotIndex EndIndex = Indexes->getMBBEndIdx(NMBB);
+
+    // Find the registers used from NMBB in PHIs in Succ.
+    SmallSet<unsigned, 8> PHISrcRegs;
+    for (MachineBasicBlock::instr_iterator
+         I = Succ->instr_begin(), E = Succ->instr_end();
+         I != E && I->isPHI(); ++I) {
+      for (unsigned ni = 1, ne = I->getNumOperands(); ni != ne; ni += 2) {
+        if (I->getOperand(ni+1).getMBB() == NMBB) {
+          MachineOperand &MO = I->getOperand(ni);
+          unsigned Reg = MO.getReg();
+          PHISrcRegs.insert(Reg);
+          if (MO.isUndef())
+            continue;
+
+          LiveInterval &LI = LIS->getInterval(Reg);
+          VNInfo *VNI = LI.getVNInfoAt(PrevIndex);
+          assert(VNI && "PHI sources should be live out of their predecessors.");
+          LI.addRange(LiveRange(StartIndex, EndIndex, VNI));
+        }
+      }
+    }
+
+    MachineRegisterInfo *MRI = &getParent()->getRegInfo();
+    for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
+      unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
+      if (PHISrcRegs.count(Reg) || !LIS->hasInterval(Reg))
+        continue;
+
+      LiveInterval &LI = LIS->getInterval(Reg);
+      if (!LI.liveAt(PrevIndex))
+        continue;
+
+      bool isLiveOut = LI.liveAt(LIS->getMBBStartIdx(Succ));
+      if (isLiveOut && isLastMBB) {
+        VNInfo *VNI = LI.getVNInfoAt(PrevIndex);
+        assert(VNI && "LiveInterval should have VNInfo where it is live.");
+        LI.addRange(LiveRange(StartIndex, EndIndex, VNI));
+      } else if (!isLiveOut && !isLastMBB) {
+        LI.removeRange(StartIndex, EndIndex);
+      }
+    }
+
+    // Update all intervals for registers whose uses may have been modified by
+    // updateTerminator().
+    LIS->repairIntervalsInRange(this, getFirstTerminator(), end(), UsedRegs);
+  }
+
   if (MachineDominatorTree *MDT =
       P->getAnalysisIfAvailable<MachineDominatorTree>()) {
     // Update dominator information.
diff --git a/lib/CodeGen/MachineBlockPlacement.cpp b/lib/CodeGen/MachineBlockPlacement.cpp
index 07a3e03..3b09c6b 100644
--- a/lib/CodeGen/MachineBlockPlacement.cpp
+++ b/lib/CodeGen/MachineBlockPlacement.cpp
@@ -171,7 +171,7 @@ class MachineBlockPlacement : public MachineFunctionPass {
   const TargetInstrInfo *TII;
 
   /// \brief A handle to the target's lowering info.
-  const TargetLowering *TLI;
+  const TargetLoweringBase *TLI;
 
   /// \brief Allocator and owner of BlockChain structures.
   ///
diff --git a/lib/CodeGen/MachineFunction.cpp b/lib/CodeGen/MachineFunction.cpp
index 9647e83..5e04f2d 100644
--- a/lib/CodeGen/MachineFunction.cpp
+++ b/lib/CodeGen/MachineFunction.cpp
@@ -346,13 +346,6 @@ void MachineFunction::print(raw_ostream &OS, SlotIndexes *Indexes) const {
     }
     OS << '\n';
   }
-  if (RegInfo && !RegInfo->liveout_empty()) {
-    OS << "Function Live Outs:";
-    for (MachineRegisterInfo::liveout_iterator
-         I = RegInfo->liveout_begin(), E = RegInfo->liveout_end(); I != E; ++I)
-      OS << ' ' << PrintReg(*I, TRI);
-    OS << '\n';
-  }
 
   for (const_iterator BB = begin(), E = end(); BB != E; ++BB) {
     OS << '\n';
diff --git a/lib/CodeGen/MachineInstr.cpp b/lib/CodeGen/MachineInstr.cpp
index 29d8866..32d0668 100644
--- a/lib/CodeGen/MachineInstr.cpp
+++ b/lib/CodeGen/MachineInstr.cpp
@@ -752,20 +752,19 @@ void MachineInstr::addMemOperand(MachineFunction &MF,
 }
 
 bool MachineInstr::hasPropertyInBundle(unsigned Mask, QueryType Type) const {
-  const MachineBasicBlock *MBB = getParent();
-  MachineBasicBlock::const_instr_iterator MII = *this; ++MII;
-  while (MII != MBB->end() && MII->isInsideBundle()) {
+  assert(!isBundledWithPred() && "Must be called on bundle header");
+  for (MachineBasicBlock::const_instr_iterator MII = this;; ++MII) {
     if (MII->getDesc().getFlags() & Mask) {
       if (Type == AnyInBundle)
         return true;
     } else {
-      if (Type == AllInBundle)
+      if (Type == AllInBundle && !MII->isBundle())
         return false;
     }
-    ++MII;
+    // This was the last instruction in the bundle.
+    if (!MII->isBundledWithSucc())
+      return Type == AllInBundle;
   }
-
-  return Type == AllInBundle;
 }
 
 bool MachineInstr::isIdenticalTo(const MachineInstr *Other,
@@ -897,7 +896,7 @@ void MachineInstr::unbundleFromSucc() {
   assert(isBundledWithSucc() && "MI isn't bundled with its successor");
   clearFlag(BundledSucc);
   MachineBasicBlock::instr_iterator Succ = this;
-  --Succ;
+  ++Succ;
   assert(Succ->isBundledWithPred() && "Inconsistent bundle flags");
   Succ->clearFlag(BundledPred);
 }
@@ -982,18 +981,13 @@ MachineInstr::getRegClassConstraint(unsigned OpIdx,
   return NULL;
 }
 
-/// getBundleSize - Return the number of instructions inside the MI bundle.
+/// Return the number of instructions inside the MI bundle, not counting the
+/// header instruction.
 unsigned MachineInstr::getBundleSize() const {
-  assert(isBundle() && "Expecting a bundle");
-
-  const MachineBasicBlock *MBB = getParent();
-  MachineBasicBlock::const_instr_iterator I = *this, E = MBB->instr_end();
+  MachineBasicBlock::const_instr_iterator I = this;
   unsigned Size = 0;
-  while ((++I != E) && I->isInsideBundle()) {
-    ++Size;
-  }
-  assert(Size > 1 && "Malformed bundle");
-
+  while (I->isBundledWithSucc())
+    ++Size, ++I;
   return Size;
 }
 
@@ -1434,7 +1428,8 @@ static void printDebugLoc(DebugLoc DL, const MachineFunction *MF,
   }
 }
 
-void MachineInstr::print(raw_ostream &OS, const TargetMachine *TM) const {
+void MachineInstr::print(raw_ostream &OS, const TargetMachine *TM,
+                         bool SkipOpers) const {
   // We can be a bit tidier if we know the TargetMachine and/or MachineFunction.
   const MachineFunction *MF = 0;
   const MachineRegisterInfo *MRI = 0;
@@ -1471,6 +1466,9 @@ void MachineInstr::print(raw_ostream &OS, const TargetMachine *TM) const {
   else
     OS << "UNKNOWN";
 
+  if (SkipOpers)
+    return;
+
   // Print the rest of the operands.
   bool OmittedAnyCallClobbers = false;
   bool FirstOp = true;
 
@@ -1482,10 +1480,14 @@ void MachineInstr::print(raw_ostream &OS, const TargetMachine *TM) const {
     OS << " ";
     getOperand(InlineAsm::MIOp_AsmString).print(OS, TM);
 
-    // Print HasSideEffects, IsAlignStack
+    // Print HasSideEffects, MayLoad, MayStore, IsAlignStack
     unsigned ExtraInfo = getOperand(InlineAsm::MIOp_ExtraInfo).getImm();
     if (ExtraInfo & InlineAsm::Extra_HasSideEffects)
       OS << " [sideeffect]";
+    if (ExtraInfo & InlineAsm::Extra_MayLoad)
+      OS << " [mayload]";
+    if (ExtraInfo & InlineAsm::Extra_MayStore)
+      OS << " [maystore]";
     if (ExtraInfo & InlineAsm::Extra_IsAlignStack)
       OS << " [alignstack]";
     if (getInlineAsmDialect() == InlineAsm::AD_ATT)
@@ -1513,12 +1515,12 @@ void MachineInstr::print(raw_ostream &OS, const TargetMachine *TM) const {
       unsigned Reg = MO.getReg();
       if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
         const MachineRegisterInfo &MRI = MF->getRegInfo();
-        if (MRI.use_empty(Reg) && !MRI.isLiveOut(Reg)) {
+        if (MRI.use_empty(Reg)) {
           bool HasAliasLive = false;
           for (MCRegAliasIterator AI(Reg, TM->getRegisterInfo(), true);
                AI.isValid(); ++AI) {
             unsigned AliasReg = *AI;
-            if (!MRI.use_empty(AliasReg) || MRI.isLiveOut(AliasReg)) {
+            if (!MRI.use_empty(AliasReg)) {
               HasAliasLive = true;
               break;
             }
@@ -1590,7 +1592,8 @@ void MachineInstr::print(raw_ostream &OS, const TargetMachine *TM) const {
   }
 
   bool HaveSemi = false;
-  if (Flags) {
+  const unsigned PrintableFlags = FrameSetup;
+  if (Flags & PrintableFlags) {
     if (!HaveSemi) OS << ";"; HaveSemi = true;
     OS << " flags: ";
diff --git a/lib/CodeGen/MachineLICM.cpp b/lib/CodeGen/MachineLICM.cpp
index 760cf8a..ed3ed4d 100644
--- a/lib/CodeGen/MachineLICM.cpp
+++ b/lib/CodeGen/MachineLICM.cpp
@@ -62,7 +62,7 @@ namespace {
   class MachineLICM : public MachineFunctionPass {
     const TargetMachine *TM;
     const TargetInstrInfo *TII;
-    const TargetLowering *TLI;
+    const TargetLoweringBase *TLI;
     const TargetRegisterInfo *TRI;
     const MachineFrameInfo *MFI;
     MachineRegisterInfo *MRI;
diff --git a/lib/CodeGen/MachineRegisterInfo.cpp b/lib/CodeGen/MachineRegisterInfo.cpp
index 21877e5..a777f52 100644
--- a/lib/CodeGen/MachineRegisterInfo.cpp
+++ b/lib/CodeGen/MachineRegisterInfo.cpp
@@ -283,13 +283,6 @@ bool MachineRegisterInfo::isLiveIn(unsigned Reg) const {
   return false;
 }
 
-bool MachineRegisterInfo::isLiveOut(unsigned Reg) const {
-  for (liveout_iterator I = liveout_begin(), E = liveout_end(); I != E; ++I)
-    if (*I == Reg)
-      return true;
-  return false;
-}
-
 /// getLiveInPhysReg - If VReg is a live-in virtual register, return the
 /// corresponding live-in physical register.
 unsigned MachineRegisterInfo::getLiveInPhysReg(unsigned VReg) const {
diff --git a/lib/CodeGen/MachineScheduler.cpp b/lib/CodeGen/MachineScheduler.cpp
index 54a7851..a93d070 100644
--- a/lib/CodeGen/MachineScheduler.cpp
+++ b/lib/CodeGen/MachineScheduler.cpp
@@ -26,6 +26,7 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/GraphWriter.h"
 #include "llvm/Support/raw_ostream.h"
 #include <queue>
 
@@ -48,15 +49,6 @@ static cl::opt<unsigned> MISchedCutoff("misched-cutoff", cl::Hidden,
 static bool ViewMISchedDAGs = false;
 #endif // NDEBUG
 
-// Threshold to very roughly model an out-of-order processor's instruction
-// buffers. If the actual value of this threshold matters much in practice, then
-// it can be specified by the machine model. For now, it's an experimental
-// tuning knob to determine when and if it matters.
-static cl::opt<unsigned> ILPWindow("ilp-window", cl::Hidden,
-  cl::desc("Allow expected latency to exceed the critical path by N cycles "
-           "before attempting to balance ILP"),
-  cl::init(10U));
-
 // Experimental heuristics
 static cl::opt<bool> EnableLoadCluster("misched-cluster", cl::Hidden,
   cl::desc("Enable load clustering."), cl::init(true));
@@ -65,6 +57,9 @@ static cl::opt<bool> EnableLoadCluster("misched-cluster", cl::Hidden,
 static cl::opt<bool> EnableMacroFusion("misched-fusion", cl::Hidden,
   cl::desc("Enable scheduling for macro fusion."), cl::init(true));
 
+// DAG subtrees must have at least this many nodes.
+static const unsigned MinSubtreeSize = 8;
+
 //===----------------------------------------------------------------------===//
 // Machine Instruction Scheduling Pass and Registry
 //===----------------------------------------------------------------------===//
@@ -268,7 +263,8 @@ bool MachineScheduler::runOnMachineFunction(MachineFunction &mf) {
       }
       DEBUG(dbgs() << "********** MI Scheduling **********\n");
       DEBUG(dbgs() << MF->getName()
-            << ":BB#" << MBB->getNumber() << "\n  From: " << *I << "    To: ";
+            << ":BB#" << MBB->getNumber() << " " << MBB->getName()
+            << "\n  From: " << *I << "    To: ";
             if (RegionEnd != MBB->end()) dbgs() << *RegionEnd;
             else dbgs() << "End";
             dbgs() << " Remaining: " << RemainingInstrs << "\n");
@@ -310,6 +306,12 @@ void ReadyQueue::dump() {
 // preservation.
 //===----------------------------------------------------------------------===//
 
+ScheduleDAGMI::~ScheduleDAGMI() {
+  delete DFSResult;
+  DeleteContainerPointers(Mutations);
+  delete SchedImpl;
+}
+
 bool ScheduleDAGMI::addEdge(SUnit *SuccSU, const SDep &PredDep) {
   if (SuccSU != &ExitSU) {
     // Do not use WillCreateCycle, it assumes SD scheduling.
@@ -465,7 +467,8 @@ void ScheduleDAGMI::initRegPressure() {
   // Cache the list of excess pressure sets in this region. This will also track
   // the max pressure in the scheduled code for these sets.
   RegionCriticalPSets.clear();
-  std::vector<unsigned> RegionPressure = RPTracker.getPressure().MaxSetPressure;
+  const std::vector<unsigned> &RegionPressure =
+    RPTracker.getPressure().MaxSetPressure;
   for (unsigned i = 0, e = RegionPressure.size(); i < e; ++i) {
     unsigned Limit = TRI->getRegPressureSetLimit(i);
     DEBUG(dbgs() << TRI->getRegPressureSetName(i)
@@ -484,7 +487,7 @@ void ScheduleDAGMI::initRegPressure() {
 // FIXME: When the pressure tracker deals in pressure differences then we won't
 // iterate over all RegionCriticalPSets[i].
 void ScheduleDAGMI::
-updateScheduledPressure(std::vector<unsigned> NewMaxPressure) {
+updateScheduledPressure(const std::vector<unsigned> &NewMaxPressure) {
   for (unsigned i = 0, e = RegionCriticalPSets.size(); i < e; ++i) {
     unsigned ID = RegionCriticalPSets[i].PSetID;
     int &MaxUnits = RegionCriticalPSets[i].UnitIncrease;
@@ -510,12 +513,19 @@ void ScheduleDAGMI::schedule() {
 
   postprocessDAG();
 
+  SmallVector<SUnit*, 8> TopRoots, BotRoots;
+  findRootsAndBiasEdges(TopRoots, BotRoots);
+
+  // Initialize the strategy before modifying the DAG.
+  // This may initialize a DFSResult to be used for queue priority.
+  SchedImpl->initialize(this);
+
   DEBUG(for (unsigned su = 0, e = SUnits.size(); su != e; ++su)
           SUnits[su].dumpAll(this));
-
   if (ViewMISchedDAGs) viewGraph();
 
-  initQueues();
+  // Initialize ready queues now that the DAG and priority data are finalized.
+  initQueues(TopRoots, BotRoots);
 
   bool IsTopNode = false;
   while (SUnit *SU = SchedImpl->pickNode(IsTopNode)) {
@@ -550,7 +560,6 @@ void ScheduleDAGMI::buildDAGWithRegPressure() {
 
   // Build the DAG, and compute current register pressure.
   buildSchedGraph(AA, &RPTracker);
-  if (ViewMISchedDAGs) viewGraph();
 
   // Initialize top/bottom trackers after computing region pressure.
   initRegPressure();
@@ -563,39 +572,56 @@ void ScheduleDAGMI::postprocessDAG() {
   }
 }
 
-// Release all DAG roots for scheduling.
-//
-// Nodes with unreleased weak edges can still be roots.
-void ScheduleDAGMI::releaseRoots() {
-  SmallVector<SUnit*, 16> BotRoots;
+void ScheduleDAGMI::computeDFSResult() {
+  if (!DFSResult)
+    DFSResult = new SchedDFSResult(/*BottomU*/true, MinSubtreeSize);
+  DFSResult->clear();
+  ScheduledTrees.clear();
+  DFSResult->resize(SUnits.size());
+  DFSResult->compute(SUnits);
+  ScheduledTrees.resize(DFSResult->getNumSubtrees());
+}
 
+void ScheduleDAGMI::findRootsAndBiasEdges(SmallVectorImpl<SUnit*> &TopRoots,
+                                          SmallVectorImpl<SUnit*> &BotRoots) {
   for (std::vector<SUnit>::iterator
          I = SUnits.begin(), E = SUnits.end(); I != E; ++I) {
     SUnit *SU = &(*I);
+    assert(!SU->isBoundaryNode() && "Boundary node should not be in SUnits");
+
+    // Order predecessors so DFSResult follows the critical path.
+    SU->biasCriticalPath();
+
     // A SUnit is ready to top schedule if it has no predecessors.
-    if (!I->NumPredsLeft && SU != &EntrySU)
-      SchedImpl->releaseTopNode(SU);
+    if (!I->NumPredsLeft)
+      TopRoots.push_back(SU);
     // A SUnit is ready to bottom schedule if it has no successors.
-    if (!I->NumSuccsLeft && SU != &ExitSU)
+    if (!I->NumSuccsLeft)
       BotRoots.push_back(SU);
   }
-  // Release bottom roots in reverse order so the higher priority nodes appear
-  // first. This is more natural and slightly more efficient.
-  for (SmallVectorImpl<SUnit*>::const_reverse_iterator
-         I = BotRoots.rbegin(), E = BotRoots.rend(); I != E; ++I)
-    SchedImpl->releaseBottomNode(*I);
+  ExitSU.biasCriticalPath();
 }
 
 /// Identify DAG roots and setup scheduler queues.
-void ScheduleDAGMI::initQueues() {
+void ScheduleDAGMI::initQueues(ArrayRef<SUnit*> TopRoots,
+                               ArrayRef<SUnit*> BotRoots) {
   NextClusterSucc = NULL;
   NextClusterPred = NULL;
 
-  // Initialize the strategy before modifying the DAG.
-  SchedImpl->initialize(this);
-
   // Release all DAG roots for scheduling, not including EntrySU/ExitSU.
-  releaseRoots();
+  //
+  // Nodes with unreleased weak edges can still be roots.
+  // Release top roots in forward order.
+  for (SmallVectorImpl<SUnit*>::const_iterator
+         I = TopRoots.begin(), E = TopRoots.end(); I != E; ++I) {
+    SchedImpl->releaseTopNode(*I);
+  }
+  // Release bottom roots in reverse order so the higher priority nodes appear
+  // first. This is more natural and slightly more efficient.
+  for (SmallVectorImpl<SUnit*>::const_reverse_iterator
+         I = BotRoots.rbegin(), E = BotRoots.rend(); I != E; ++I) {
+    SchedImpl->releaseBottomNode(*I);
+  }
 
   releaseSuccessors(&EntrySU);
   releasePredecessors(&ExitSU);
@@ -660,6 +686,15 @@ void ScheduleDAGMI::updateQueues(SUnit *SU, bool IsTopNode) {
 
   SU->isScheduled = true;
 
+  if (DFSResult) {
+    unsigned SubtreeID = DFSResult->getSubtreeID(SU);
+    if (!ScheduledTrees.test(SubtreeID)) {
+      ScheduledTrees.set(SubtreeID);
+      DFSResult->scheduleTree(SubtreeID);
+      SchedImpl->scheduleTree(SubtreeID);
+    }
+  }
+
   // Notify the scheduling strategy after updating the DAG.
   SchedImpl->schedNode(SU, IsTopNode);
 }
@@ -1019,6 +1054,9 @@ public:
 #endif
 
   void reset() {
+    // A new HazardRec is created for each DAG and owned by SchedBoundary.
+    delete HazardRec;
+
     Available.clear();
     Pending.clear();
     CheckPending = false;
@@ -1044,7 +1082,8 @@ public:
     /// PendingFlag set.
     SchedBoundary(unsigned ID, const Twine &Name):
       DAG(0), SchedModel(0), Rem(0), Available(ID, Name+".A"),
-      Pending(ID << ConvergingScheduler::LogMaxQID, Name+".P") {
+      Pending(ID << ConvergingScheduler::LogMaxQID, Name+".P"),
+      HazardRec(0) {
       reset();
     }
 
@@ -1188,10 +1227,13 @@ void ConvergingScheduler::initialize(ScheduleDAGMI *dag) {
   DAG = dag;
   SchedModel = DAG->getSchedModel();
   TRI = DAG->TRI;
+
   Rem.init(DAG, SchedModel);
   Top.init(DAG, SchedModel, &Rem);
   Bot.init(DAG, SchedModel, &Rem);
 
+  DAG->computeDFSResult();
+
   // Initialize resource counts.
 
   // Initialize the HazardRecognizers. If itineraries don't exist, are empty, or
@@ -1297,7 +1339,8 @@ void ConvergingScheduler::SchedBoundary::setLatencyPolicy(CandPolicy &Policy) {
     if (L > RemLatency)
       RemLatency = L;
   }
-  if (RemLatency + ExpectedLatency >= Rem->CriticalPath + ILPWindow
+  unsigned CriticalPathLimit = Rem->CriticalPath + SchedModel->getILPWindow();
+  if (RemLatency + ExpectedLatency >= CriticalPathLimit
       && RemLatency > Rem->getMaxRemainingCount(SchedModel)) {
     Policy.ReduceLatency = true;
     DEBUG(dbgs() << "Increase ILP: " << Available.getName() << '\n');
@@ -1541,8 +1584,8 @@ void ConvergingScheduler::checkResourceLimits(
     ConvergingScheduler::SchedCandidate &BotCand) {
 
   // Set ReduceLatency to true if needed.
-  Bot.setLatencyPolicy(TopCand.Policy);
-  Top.setLatencyPolicy(BotCand.Policy);
+  Bot.setLatencyPolicy(BotCand.Policy);
+  Top.setLatencyPolicy(TopCand.Policy);
 
   // Handle resource-limited regions.
   if (Top.IsResourceLimited && Bot.IsResourceLimited
@@ -2060,12 +2103,11 @@ ConvergingSchedRegistry("converge", "Standard converging scheduler.",
 namespace {
 /// \brief Order nodes by the ILP metric.
 struct ILPOrder {
-  SchedDFSResult *DFSResult;
-  BitVector *ScheduledTrees;
+  const SchedDFSResult *DFSResult;
+  const BitVector *ScheduledTrees;
   bool MaximizeILP;
 
-  ILPOrder(SchedDFSResult *dfs, BitVector *schedtrees, bool MaxILP)
-    : DFSResult(dfs), ScheduledTrees(schedtrees), MaximizeILP(MaxILP) {}
+  ILPOrder(bool MaxILP): DFSResult(0), ScheduledTrees(0), MaximizeILP(MaxILP) {}
 
   /// \brief Apply a less-than relation on node priority.
   ///
@@ -2103,26 +2145,22 @@ class ILPScheduler : public MachineSchedStrategy {
   /// (a motivating test case must be found).
   static const unsigned SubtreeLimit = 16;
 
-  SchedDFSResult DFSResult;
-  BitVector ScheduledTrees;
+  ScheduleDAGMI *DAG;
   ILPOrder Cmp;
 
   std::vector<SUnit*> ReadyQ;
 
 public:
-  ILPScheduler(bool MaximizeILP)
-  : DFSResult(/*BottomUp=*/true, SubtreeLimit),
-    Cmp(&DFSResult, &ScheduledTrees, MaximizeILP) {}
+  ILPScheduler(bool MaximizeILP): DAG(0), Cmp(MaximizeILP) {}
 
-  virtual void initialize(ScheduleDAGMI *DAG) {
+  virtual void initialize(ScheduleDAGMI *dag) {
+    DAG = dag;
+    DAG->computeDFSResult();
+    Cmp.DFSResult = DAG->getDFSResult();
+    Cmp.ScheduledTrees = &DAG->getScheduledTrees();
    ReadyQ.clear();
-    DFSResult.clear();
-    DFSResult.resize(DAG->SUnits.size());
-    ScheduledTrees.clear();
   }
 
   virtual void registerRoots() {
-    DFSResult.compute(ReadyQ);
-    ScheduledTrees.resize(DFSResult.getNumSubtrees());
     // Restore the heap in ReadyQ with the updated DFS results.
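// For illustration, the pattern the ILP scheduler relies on below: ReadyQ is a
// binary heap whose comparison keys (the DFS results) can change after the
// heap is built, so the code rebuilds it with std::make_heap rather than trust
// a stale ordering. A self-contained toy, with a plain priority array standing
// in for SchedDFSResult:

#include <algorithm>
#include <vector>

struct ByPriority {
  const std::vector<int> *Priority;
  explicit ByPriority(const std::vector<int> *P) : Priority(P) {}
  bool operator()(int A, int B) const {
    return (*Priority)[A] < (*Priority)[B]; // max-heap on priority
  }
};

inline int popBest(std::vector<int> &ReadyQ, const ByPriority &Cmp) {
  std::pop_heap(ReadyQ.begin(), ReadyQ.end(), Cmp);
  int Best = ReadyQ.back();
  ReadyQ.pop_back();
  return Best;
}

// Call whenever the priorities mutate underneath the heap; the O(n) rebuild
// restores the heap invariant in one shot, which is cheaper than re-pushing
// every element.
inline void onPrioritiesChanged(std::vector<int> &ReadyQ,
                                const ByPriority &Cmp) {
  std::make_heap(ReadyQ.begin(), ReadyQ.end(), Cmp);
}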
     std::make_heap(ReadyQ.begin(), ReadyQ.end(), Cmp);
   }
 
@@ -2139,21 +2177,22 @@ public:
     IsTopNode = false;
     DEBUG(dbgs() << "*** Scheduling " << "SU(" << SU->NodeNum << "): "
           << *SU->getInstr()
-          << " ILP: " << DFSResult.getILP(SU)
-          << " Tree: " << DFSResult.getSubtreeID(SU) << " @"
-          << DFSResult.getSubtreeLevel(DFSResult.getSubtreeID(SU))<< '\n');
+          << " ILP: " << DAG->getDFSResult()->getILP(SU)
+          << " Tree: " << DAG->getDFSResult()->getSubtreeID(SU) << " @"
+          << DAG->getDFSResult()->getSubtreeLevel(
+            DAG->getDFSResult()->getSubtreeID(SU)) << '\n');
     return SU;
   }
 
+  /// \brief Scheduler callback to notify that a new subtree is scheduled.
+  virtual void scheduleTree(unsigned SubtreeID) {
+    std::make_heap(ReadyQ.begin(), ReadyQ.end(), Cmp);
+  }
+
   /// Callback after a node is scheduled. Mark a newly scheduled tree, notify
   /// DFSResults, and resort the priority Q.
   virtual void schedNode(SUnit *SU, bool IsTopNode) {
     assert(!IsTopNode && "SchedDFSResult needs bottom-up");
-    if (!ScheduledTrees.test(DFSResult.getSubtreeID(SU))) {
-      ScheduledTrees.set(DFSResult.getSubtreeID(SU));
-      DFSResult.scheduleTree(DFSResult.getSubtreeID(SU));
-      std::make_heap(ReadyQ.begin(), ReadyQ.end(), Cmp);
-    }
   }
 
   virtual void releaseTopNode(SUnit *) { /*only called for top roots*/ }
@@ -2264,3 +2303,90 @@ static MachineSchedRegistry ShufflerRegistry(
   "shuffle", "Shuffle machine instructions alternating directions",
   createInstructionShuffler);
 #endif // !NDEBUG
+
+//===----------------------------------------------------------------------===//
+// GraphWriter support for ScheduleDAGMI.
+//===----------------------------------------------------------------------===//
+
+#ifndef NDEBUG
+namespace llvm {
+
+template<> struct GraphTraits<
+  ScheduleDAGMI*> : public GraphTraits<ScheduleDAG*> {};
+
+template<>
+struct DOTGraphTraits<ScheduleDAGMI*> : public DefaultDOTGraphTraits {
+
+  DOTGraphTraits (bool isSimple=false) : DefaultDOTGraphTraits(isSimple) {}
+
+  static std::string getGraphName(const ScheduleDAG *G) {
+    return G->MF.getName();
+  }
+
+  static bool renderGraphFromBottomUp() {
+    return true;
+  }
+
+  static bool isNodeHidden(const SUnit *Node) {
+    return (Node->NumPreds > 10 || Node->NumSuccs > 10);
+  }
+
+  static bool hasNodeAddressLabel(const SUnit *Node,
+                                  const ScheduleDAG *Graph) {
+    return false;
+  }
+
+  /// If you want to override the dot attributes printed for a particular
+  /// edge, override this method.
+  static std::string getEdgeAttributes(const SUnit *Node,
+                                       SUnitIterator EI,
+                                       const ScheduleDAG *Graph) {
+    if (EI.isArtificialDep())
+      return "color=cyan,style=dashed";
+    if (EI.isCtrlDep())
+      return "color=blue,style=dashed";
+    return "";
+  }
+
+  static std::string getNodeLabel(const SUnit *SU, const ScheduleDAG *G) {
+    std::string Str;
+    raw_string_ostream SS(Str);
+    SS << "SU(" << SU->NodeNum << ')';
+    return SS.str();
+  }
+  static std::string getNodeDescription(const SUnit *SU, const ScheduleDAG *G) {
+    return G->getGraphNodeLabel(SU);
+  }
+
+  static std::string getNodeAttributes(const SUnit *N,
+                                       const ScheduleDAG *Graph) {
+    std::string Str("shape=Mrecord");
+    const SchedDFSResult *DFS =
+      static_cast<const ScheduleDAGMI*>(Graph)->getDFSResult();
+    if (DFS) {
+      Str += ",style=filled,fillcolor=\"#";
+      Str += DOT::getColorString(DFS->getSubtreeID(N));
+      Str += '"';
+    }
+    return Str;
+  }
+};
+} // namespace llvm
+#endif // NDEBUG
+
+/// viewGraph - Pop up a ghostview window with the reachable parts of the DAG
+/// rendered using 'dot'.
+///
+void ScheduleDAGMI::viewGraph(const Twine &Name, const Twine &Title) {
+#ifndef NDEBUG
+  ViewGraph(this, Name, false, Title);
+#else
+  errs() << "ScheduleDAGMI::viewGraph is only available in debug builds on "
+         << "systems with Graphviz or gv!\n";
+#endif // NDEBUG
+}
+
+/// Out-of-line implementation with no arguments is handy for gdb.
+void ScheduleDAGMI::viewGraph() {
+  viewGraph(getDAGName(), "Scheduling-Units Graph for " + getDAGName());
+}
diff --git a/lib/CodeGen/MachineTraceMetrics.cpp b/lib/CodeGen/MachineTraceMetrics.cpp
index 685ccab..f77a7b1 100644
--- a/lib/CodeGen/MachineTraceMetrics.cpp
+++ b/lib/CodeGen/MachineTraceMetrics.cpp
@@ -8,7 +8,7 @@
 //===----------------------------------------------------------------------===//
 
 #define DEBUG_TYPE "machine-trace-metrics"
-#include "MachineTraceMetrics.h"
+#include "llvm/CodeGen/MachineTraceMetrics.h"
 #include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/ADT/SparseSet.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
diff --git a/lib/CodeGen/PHIElimination.cpp b/lib/CodeGen/PHIElimination.cpp
index 4daa211..5584708 100644
--- a/lib/CodeGen/PHIElimination.cpp
+++ b/lib/CodeGen/PHIElimination.cpp
@@ -19,6 +19,7 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
 #include "llvm/CodeGen/LiveVariables.h"
 #include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineInstr.h"
@@ -39,9 +40,16 @@ DisableEdgeSplitting("disable-phi-elim-edge-splitting", cl::init(false),
                      cl::Hidden, cl::desc("Disable critical edge splitting "
                                           "during PHI elimination"));
 
+static cl::opt<bool>
+SplitAllCriticalEdges("phi-elim-split-all-critical-edges", cl::init(false),
+                      cl::Hidden, cl::desc("Split all critical edges during "
+                                           "PHI elimination"));
+
 namespace {
   class PHIElimination : public MachineFunctionPass {
     MachineRegisterInfo *MRI; // Machine register information
+    LiveVariables *LV;
+    LiveIntervals *LIS;
 
   public:
     static char ID; // Pass identification, replacement for typeid
@@ -57,8 +65,8 @@ namespace {
     /// in predecessor basic blocks.
     ///
     bool EliminatePHINodes(MachineFunction &MF, MachineBasicBlock &MBB);
-    void LowerAtomicPHINode(MachineBasicBlock &MBB,
-                            MachineBasicBlock::iterator AfterPHIsIt);
+    void LowerPHINode(MachineBasicBlock &MBB,
+                      MachineBasicBlock::iterator AfterPHIsIt);
 
     /// analyzePHINodes - Gather information about the PHI nodes in
     /// here. In particular, we want to map the number of uses of a virtual
@@ -70,7 +78,12 @@ namespace {
     /// Split critical edges where necessary for good coalescer performance.
     bool SplitPHIEdges(MachineFunction &MF, MachineBasicBlock &MBB,
-                       LiveVariables &LV, MachineLoopInfo *MLI);
+                       MachineLoopInfo *MLI);
+
+    // These functions are temporary abstractions around LiveVariables and
+    // LiveIntervals, so they can go away when LiveVariables does.
+    bool isLiveIn(unsigned Reg, MachineBasicBlock *MBB);
+    bool isLiveOutPastPHIs(unsigned Reg, MachineBasicBlock *MBB);
 
     typedef std::pair<unsigned, unsigned> BBVRegPair;
     typedef DenseMap<BBVRegPair, unsigned> VRegPHIUse;
@@ -87,7 +100,7 @@ namespace {
   };
 }
 
-STATISTIC(NumAtomic, "Number of atomic phis lowered");
+STATISTIC(NumLowered, "Number of phis lowered");
 STATISTIC(NumCriticalEdgesSplit, "Number of critical edges split");
 STATISTIC(NumReused, "Number of reused lowered phis");
 
@@ -103,6 +116,8 @@ INITIALIZE_PASS_END(PHIElimination, "phi-node-elimination",
 
 void PHIElimination::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.addPreserved<LiveVariables>();
+  AU.addPreserved<SlotIndexes>();
+  AU.addPreserved<LiveIntervals>();
   AU.addPreserved<MachineDominatorTree>();
   AU.addPreserved<MachineLoopInfo>();
   MachineFunctionPass::getAnalysisUsage(AU);
@@ -110,19 +125,20 @@ void PHIElimination::getAnalysisUsage(AnalysisUsage &AU) const {
 
 bool PHIElimination::runOnMachineFunction(MachineFunction &MF) {
   MRI = &MF.getRegInfo();
+  LV = getAnalysisIfAvailable<LiveVariables>();
+  LIS = getAnalysisIfAvailable<LiveIntervals>();
 
   bool Changed = false;
 
   // This pass takes the function out of SSA form.
   MRI->leaveSSA();
 
-  // Split critical edges to help the coalescer
-  if (!DisableEdgeSplitting) {
-    if (LiveVariables *LV = getAnalysisIfAvailable<LiveVariables>()) {
-      MachineLoopInfo *MLI = getAnalysisIfAvailable<MachineLoopInfo>();
-      for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I)
-        Changed |= SplitPHIEdges(MF, *I, *LV, MLI);
-    }
+  // Split critical edges to help the coalescer. This does not yet support
+  // updating LiveIntervals, so we disable it.
+  if (!DisableEdgeSplitting && (LV || LIS)) {
+    MachineLoopInfo *MLI = getAnalysisIfAvailable<MachineLoopInfo>();
+    for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I)
+      Changed |= SplitPHIEdges(MF, *I, MLI);
   }
 
   // Populate VRegPHIUseCount
@@ -137,14 +153,20 @@ bool PHIElimination::runOnMachineFunction(MachineFunction &MF) {
        E = ImpDefs.end(); I != E; ++I) {
     MachineInstr *DefMI = *I;
     unsigned DefReg = DefMI->getOperand(0).getReg();
-    if (MRI->use_nodbg_empty(DefReg))
+    if (MRI->use_nodbg_empty(DefReg)) {
+      if (LIS)
+        LIS->RemoveMachineInstrFromMaps(DefMI);
       DefMI->eraseFromParent();
+    }
   }
 
   // Clean up the lowered PHI instructions.
   for (LoweredPHIMap::iterator I = LoweredPHIs.begin(), E = LoweredPHIs.end();
-       I != E; ++I)
+       I != E; ++I) {
+    if (LIS)
+      LIS->RemoveMachineInstrFromMaps(I->first);
     MF.DeleteMachineInstr(I->first);
+  }
 
   LoweredPHIs.clear();
   ImpDefs.clear();
@@ -166,7 +188,7 @@ bool PHIElimination::EliminatePHINodes(MachineFunction &MF,
   MachineBasicBlock::iterator AfterPHIsIt = MBB.SkipPHIsAndLabels(MBB.begin());
 
   while (MBB.front().isPHI())
-    LowerAtomicPHINode(MBB, AfterPHIsIt);
+    LowerPHINode(MBB, AfterPHIsIt);
 
   return true;
 }
@@ -193,15 +215,11 @@ static bool isSourceDefinedByImplicitDef(const MachineInstr *MPhi,
 }
 
 
-/// LowerAtomicPHINode - Lower the PHI node at the top of the specified block,
-/// under the assumption that it needs to be lowered in a way that supports
-/// atomic execution of PHIs. This lowering method is always correct all of the
-/// time.
+/// LowerPHINode - Lower the PHI node at the top of the specified block,
 ///
-void PHIElimination::LowerAtomicPHINode(
-    MachineBasicBlock &MBB,
-    MachineBasicBlock::iterator AfterPHIsIt) {
-  ++NumAtomic;
+void PHIElimination::LowerPHINode(MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator AfterPHIsIt) {
+  ++NumLowered;
 
   // Unlink the PHI node from the basic block, but don't delete the PHI yet.
   MachineInstr *MPhi = MBB.remove(MBB.begin());
 
@@ -244,7 +262,6 @@ void PHIElimination::LowerAtomicPHINode(
   }
 
   // Update live variable information if there is any.
-  LiveVariables *LV = getAnalysisIfAvailable<LiveVariables>();
   if (LV) {
     MachineInstr *PHICopy = prior(AfterPHIsIt);
 
@@ -283,6 +300,48 @@ void PHIElimination::LowerAtomicPHINode(
     }
   }
 
+  // Update LiveIntervals for the new copy or implicit def.
+  if (LIS) {
+    MachineInstr *NewInstr = prior(AfterPHIsIt);
+    SlotIndex DestCopyIndex = LIS->InsertMachineInstrInMaps(NewInstr);
+
+    SlotIndex MBBStartIndex = LIS->getMBBStartIdx(&MBB);
+    if (IncomingReg) {
+      // Add the region from the beginning of MBB to the copy instruction to
+      // IncomingReg's live interval.
+      LiveInterval &IncomingLI = LIS->getOrCreateInterval(IncomingReg);
+      VNInfo *IncomingVNI = IncomingLI.getVNInfoAt(MBBStartIndex);
+      if (!IncomingVNI)
+        IncomingVNI = IncomingLI.getNextValue(MBBStartIndex,
+                                              LIS->getVNInfoAllocator());
+      IncomingLI.addRange(LiveRange(MBBStartIndex,
+                                    DestCopyIndex.getRegSlot(),
+                                    IncomingVNI));
+    }
+
+    LiveInterval &DestLI = LIS->getInterval(DestReg);
+    assert(DestLI.begin() != DestLI.end() &&
+           "PHIs should have nonempty LiveIntervals.");
+    if (DestLI.endIndex().isDead()) {
+      // A dead PHI's live range begins and ends at the start of the MBB, but
+      // the lowered copy, which will still be dead, needs to begin and end at
+      // the copy instruction.
+      VNInfo *OrigDestVNI = DestLI.getVNInfoAt(MBBStartIndex);
+      assert(OrigDestVNI && "PHI destination should be live at block entry.");
+      DestLI.removeRange(MBBStartIndex, MBBStartIndex.getDeadSlot());
+      DestLI.createDeadDef(DestCopyIndex.getRegSlot(),
+                           LIS->getVNInfoAllocator());
+      DestLI.removeValNo(OrigDestVNI);
+    } else {
+      // Otherwise, remove the region from the beginning of MBB to the copy
+      // instruction from DestReg's live interval.
+      DestLI.removeRange(MBBStartIndex, DestCopyIndex.getRegSlot());
+      VNInfo *DestVNI = DestLI.getVNInfoAt(DestCopyIndex.getRegSlot());
+      assert(DestVNI && "PHI destination should be live at its definition.");
+      DestVNI->def = DestCopyIndex.getRegSlot();
+    }
+  }
+
   // Adjust the VRegPHIUseCount map to account for the removal of this PHI node.
   for (unsigned i = 1; i != MPhi->getNumOperands(); i += 2)
     --VRegPHIUseCount[BBVRegPair(MPhi->getOperand(i+1).getMBB()->getNumber(),
@@ -315,45 +374,44 @@ void PHIElimination::LowerAtomicPHINode(
       findPHICopyInsertPoint(&opBlock, &MBB, SrcReg);
 
     // Insert the copy.
+    MachineInstr *NewSrcInstr = 0;
     if (!reusedIncoming && IncomingReg) {
       if (SrcUndef) {
         // The source register is undefined, so there is no need for a real
         // COPY, but we still need to ensure joint dominance by defs.
         // Insert an IMPLICIT_DEF instruction.
-        BuildMI(opBlock, InsertPos, MPhi->getDebugLoc(),
-                TII->get(TargetOpcode::IMPLICIT_DEF), IncomingReg);
+        NewSrcInstr = BuildMI(opBlock, InsertPos, MPhi->getDebugLoc(),
+                              TII->get(TargetOpcode::IMPLICIT_DEF),
+                              IncomingReg);
 
         // Clean up the old implicit-def, if there even was one.
        if (MachineInstr *DefMI = MRI->getVRegDef(SrcReg))
          if (DefMI->isImplicitDef())
            ImpDefs.insert(DefMI);
      } else {
-        BuildMI(opBlock, InsertPos, MPhi->getDebugLoc(),
-                TII->get(TargetOpcode::COPY), IncomingReg)
-          .addReg(SrcReg, 0, SrcSubReg);
+        NewSrcInstr = BuildMI(opBlock, InsertPos, MPhi->getDebugLoc(),
+                              TII->get(TargetOpcode::COPY), IncomingReg)
+                        .addReg(SrcReg, 0, SrcSubReg);
      }
    }
 
-    // Now update live variable information if we have it. Otherwise we're done
-    if (SrcUndef || !LV) continue;
+    // We only need to update the LiveVariables kill of SrcReg if this was the
+    // last PHI use of SrcReg to be lowered on this CFG edge and it is not live
+    // out of the predecessor. We can also ignore undef sources.
+    if (LV && !SrcUndef &&
+        !VRegPHIUseCount[BBVRegPair(opBlock.getNumber(), SrcReg)] &&
+        !LV->isLiveOut(SrcReg, opBlock)) {
+      // We want to be able to insert a kill of the register if this PHI (aka,
+      // the copy we just inserted) is the last use of the source value. Live
+      // variable analysis conservatively handles this by saying that the value
+      // is live until the end of the block the PHI entry lives in. If the value
+      // really is dead at the PHI copy, there will be no successor blocks which
+      // have the value live-in.
+
+      // Okay, if we now know that the value is not live out of the block, we
+      // can add a kill marker in this block saying that it kills the incoming
+      // value!
 
-    // We want to be able to insert a kill of the register if this PHI (aka, the
-    // copy we just inserted) is the last use of the source value. Live
-    // variable analysis conservatively handles this by saying that the value is
-    // live until the end of the block the PHI entry lives in. If the value
-    // really is dead at the PHI copy, there will be no successor blocks which
-    // have the value live-in.
-
-    // Also check to see if this register is in use by another PHI node which
-    // has not yet been eliminated. If so, it will be killed at an appropriate
-    // point later.
-
-    // Is it used by any PHI instructions in this block?
-    bool ValueIsUsed = VRegPHIUseCount[BBVRegPair(opBlock.getNumber(), SrcReg)];
-
-    // Okay, if we now know that the value is not live out of the block, we can
-    // add a kill marker in this block saying that it kills the incoming value!
-    if (!ValueIsUsed && !LV->isLiveOut(SrcReg, opBlock)) {
       // In our final twist, we have to decide which instruction kills the
       // register. In most cases this is the copy, however, terminator
       // instructions at the end of the block may also use the value. In this
@@ -394,11 +452,74 @@ void PHIElimination::LowerAtomicPHINode(
       unsigned opBlockNum = opBlock.getNumber();
       LV->getVarInfo(SrcReg).AliveBlocks.reset(opBlockNum);
     }
+
+    if (LIS) {
+      if (NewSrcInstr) {
+        LIS->InsertMachineInstrInMaps(NewSrcInstr);
+        LIS->addLiveRangeToEndOfBlock(IncomingReg, NewSrcInstr);
+      }
+
+      if (!SrcUndef &&
+          !VRegPHIUseCount[BBVRegPair(opBlock.getNumber(), SrcReg)]) {
+        LiveInterval &SrcLI = LIS->getInterval(SrcReg);
+
+        bool isLiveOut = false;
+        for (MachineBasicBlock::succ_iterator SI = opBlock.succ_begin(),
+             SE = opBlock.succ_end(); SI != SE; ++SI) {
+          SlotIndex startIdx = LIS->getMBBStartIdx(*SI);
+          VNInfo *VNI = SrcLI.getVNInfoAt(startIdx);
+
+          // Definitions by other PHIs are not truly live-in for our purposes.
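// For illustration, the test applied below: a value is only live out along
// this edge if the live-in value at the successor was defined strictly before
// the successor begins; a value defined exactly at the block start is the
// successor's own PHI-def, which consumes rather than continues the source.
// A toy with invented Value/Block types (not the LiveIntervals API):

struct Value {
  unsigned DefIdx; // slot index where the value is defined
};

struct Block {
  unsigned StartIdx;        // slot index of the block entry
  const Value *LiveInValue; // value live at StartIdx, if any
};

// True if the predecessor's value genuinely flows into Succ, as opposed to
// being replaced by a PHI that defines a fresh value at Succ's entry.
inline bool isLiveOutInto(const Block &Succ) {
  const Value *V = Succ.LiveInValue;
  return V && V->DefIdx != Succ.StartIdx;
}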
+          if (VNI && VNI->def != startIdx) {
+            isLiveOut = true;
+            break;
+          }
+        }
+
+        if (!isLiveOut) {
+          MachineBasicBlock::iterator KillInst = opBlock.end();
+          MachineBasicBlock::iterator FirstTerm = opBlock.getFirstTerminator();
+          for (MachineBasicBlock::iterator Term = FirstTerm;
+               Term != opBlock.end(); ++Term) {
+            if (Term->readsRegister(SrcReg))
+              KillInst = Term;
+          }
+
+          if (KillInst == opBlock.end()) {
+            // No terminator uses the register.
+
+            if (reusedIncoming || !IncomingReg) {
+              // We may have to rewind a bit if we didn't just insert a copy.
+              KillInst = FirstTerm;
+              while (KillInst != opBlock.begin()) {
+                --KillInst;
+                if (KillInst->isDebugValue())
+                  continue;
+                if (KillInst->readsRegister(SrcReg))
+                  break;
+              }
+            } else {
+              // We just inserted this copy.
+              KillInst = prior(InsertPos);
+            }
+          }
+          assert(KillInst->readsRegister(SrcReg) &&
+                 "Cannot find kill instruction");
+
+          SlotIndex LastUseIndex = LIS->getInstructionIndex(KillInst);
+          SrcLI.removeRange(LastUseIndex.getRegSlot(),
+                            LIS->getMBBEndIdx(&opBlock));
+        }
+      }
+    }
   }
 
   // Really delete the PHI instruction now, if it is not in the LoweredPHIs map.
-  if (reusedIncoming || !IncomingReg)
+  if (reusedIncoming || !IncomingReg) {
+    if (LIS)
+      LIS->RemoveMachineInstrFromMaps(MPhi);
     MF.DeleteMachineInstr(MPhi);
+  }
 }
 
 /// analyzePHINodes - Gather information about the PHI nodes in here. In
@@ -418,7 +539,6 @@ void PHIElimination::analyzePHINodes(const MachineFunction& MF) {
 
 bool PHIElimination::SplitPHIEdges(MachineFunction &MF,
                                    MachineBasicBlock &MBB,
-                                   LiveVariables &LV,
                                    MachineLoopInfo *MLI) {
   if (MBB.empty() || !MBB.front().isPHI() || MBB.isLandingPad())
     return false;   // Quick exit for basic blocks without PHIs.
@@ -438,10 +558,10 @@ bool PHIElimination::SplitPHIEdges(MachineFunction &MF,
 
       // Avoid splitting backedges of loops. It would introduce small
      // out-of-line blocks into the loop which is very bad for code placement.
-      if (PreMBB == &MBB)
+      if (PreMBB == &MBB && !SplitAllCriticalEdges)
        continue;
      const MachineLoop *PreLoop = MLI ? MLI->getLoopFor(PreMBB) : 0;
-      if (IsLoopHeader && PreLoop == CurLoop)
+      if (IsLoopHeader && PreLoop == CurLoop && !SplitAllCriticalEdges)
        continue;
 
      // LV doesn't consider a phi use live-out, so isLiveOut only returns true
@@ -450,7 +570,7 @@ bool PHIElimination::SplitPHIEdges(MachineFunction &MF,
       // there is a risk it may not be coalesced away.
       //
       // If the copy would be a kill, there is no need to split the edge.
-      if (!LV.isLiveOut(Reg, *PreMBB))
+      if (!isLiveOutPastPHIs(Reg, PreMBB) && !SplitAllCriticalEdges)
         continue;
 
       DEBUG(dbgs() << PrintReg(Reg) << " live-out before critical edge BB#"
@@ -465,7 +585,7 @@ bool PHIElimination::SplitPHIEdges(MachineFunction &MF,
       // is likely to be left after coalescing. If we are looking at a loop
       // exiting edge, split it so we won't insert code in the loop, otherwise
       // don't bother.
-      bool ShouldSplit = !LV.isLiveIn(Reg, MBB);
+      bool ShouldSplit = !isLiveIn(Reg, &MBB) || SplitAllCriticalEdges;
 
       // Check for a loop exiting edge.
       if (!ShouldSplit && CurLoop != PreLoop) {
@@ -492,3 +612,33 @@ bool PHIElimination::SplitPHIEdges(MachineFunction &MF,
   }
   return Changed;
 }
+
+bool PHIElimination::isLiveIn(unsigned Reg, MachineBasicBlock *MBB) {
+  assert((LV || LIS) &&
+         "isLiveIn() requires either LiveVariables or LiveIntervals");
+  if (LIS)
+    return LIS->isLiveInToMBB(LIS->getInterval(Reg), MBB);
+  else
+    return LV->isLiveIn(Reg, *MBB);
+}
+
+bool PHIElimination::isLiveOutPastPHIs(unsigned Reg, MachineBasicBlock *MBB) {
+  assert((LV || LIS) &&
+         "isLiveOutPastPHIs() requires either LiveVariables or LiveIntervals");
+  // LiveVariables considers uses in PHIs to be in the predecessor basic block,
+  // so that a register used only in a PHI is not live out of the block. In
+  // contrast, LiveIntervals considers uses in PHIs to be on the edge rather than
+  // in the predecessor basic block, so that a register used only in a PHI is live
+  // out of the block.
+  if (LIS) {
+    const LiveInterval &LI = LIS->getInterval(Reg);
+    for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(),
+         SE = MBB->succ_end(); SI != SE; ++SI) {
+      if (LI.liveAt(LIS->getMBBStartIdx(*SI)))
+        return true;
+    }
+    return false;
+  } else {
+    return LV->isLiveOut(Reg, *MBB);
+  }
+}
diff --git a/lib/CodeGen/Passes.cpp b/lib/CodeGen/Passes.cpp
index 2a135bc..b79f9f9 100644
--- a/lib/CodeGen/Passes.cpp
+++ b/lib/CodeGen/Passes.cpp
@@ -89,7 +89,7 @@ PrintMachineInstrs("print-machineinstrs", cl::ValueOptional,
                    cl::desc("Print machine instrs"),
                    cl::value_desc("pass-name"), cl::init("option-unspecified"));
 
-// Experimental option to run live inteerval analysis early.
+// Experimental option to run live interval analysis early.
 static cl::opt<bool> EarlyLiveIntervals("early-live-intervals", cl::Hidden,
     cl::desc("Run live interval analysis earlier in the pipeline"));
 
@@ -238,9 +238,6 @@ TargetPassConfig::TargetPassConfig(TargetMachine *tm, PassManagerBase &pm)
   substitutePass(&EarlyTailDuplicateID, &TailDuplicateID);
   substitutePass(&PostRAMachineLICMID, &MachineLICMID);
 
-  // Disable early if-conversion. Targets that are ready can enable it.
-  disablePass(&EarlyIfConverterID);
-
   // Temporarily disable experimental passes.
   const TargetSubtargetInfo &ST = TM->getSubtarget<TargetSubtargetInfo>();
   if (!ST.enableMachineScheduler())
@@ -551,7 +548,12 @@ void TargetPassConfig::addMachineSSAOptimization() {
   addPass(&DeadMachineInstructionElimID);
   printAndVerify("After codegen DCE pass");
 
-  addPass(&EarlyIfConverterID);
+  // Allow targets to insert passes that improve instruction level parallelism,
+  // like if-conversion. Such passes will typically need dominator trees and
+  // loop info, just like LICM and CSE below.
+  if (addILPOpts())
+    printAndVerify("After ILP optimizations");
+
   addPass(&MachineLICMID);
   addPass(&MachineCSEID);
   addPass(&MachineSinkingID);
diff --git a/lib/CodeGen/PostRASchedulerList.cpp b/lib/CodeGen/PostRASchedulerList.cpp
index 488f1d4..53fe273 100644
--- a/lib/CodeGen/PostRASchedulerList.cpp
+++ b/lib/CodeGen/PostRASchedulerList.cpp
@@ -418,11 +418,11 @@ void SchedulePostRATDList::StartBlockForKills(MachineBasicBlock *BB) {
   // Start with no live registers.
   LiveRegs.reset();
 
-  // Determine the live-out physregs for this block.
-  if (!BB->empty() && BB->back().isReturn()) {
-    // In a return block, examine the function live-out regs.
-    for (MachineRegisterInfo::liveout_iterator I = MRI.liveout_begin(),
-         E = MRI.liveout_end(); I != E; ++I) {
+  // Examine the live-in regs of all successors.
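// For illustration, the invariant this change adopts: a block's live-out set
// is always the union of its successors' live-in sets, and a return block
// simply has no successors, so it contributes an empty set instead of the old
// special-cased function live-out list. A self-contained sketch (Block is an
// invented stand-in for MachineBasicBlock, registers are plain numbers):

#include <cstddef>
#include <set>
#include <vector>

struct Block {
  std::vector<const Block*> Succs;
  std::set<unsigned> LiveIns; // physical register numbers
};

inline std::set<unsigned> computeLiveOuts(const Block &BB) {
  std::set<unsigned> LiveRegs;
  for (std::size_t s = 0; s != BB.Succs.size(); ++s)
    LiveRegs.insert(BB.Succs[s]->LiveIns.begin(),
                    BB.Succs[s]->LiveIns.end());
  return LiveRegs;
}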
+  for (MachineBasicBlock::succ_iterator SI = BB->succ_begin(),
+       SE = BB->succ_end(); SI != SE; ++SI) {
+    for (MachineBasicBlock::livein_iterator I = (*SI)->livein_begin(),
+         E = (*SI)->livein_end(); I != E; ++I) {
       unsigned Reg = *I;
       LiveRegs.set(Reg);
       // Repeat, for all subregs.
@@ -430,20 +430,6 @@ void SchedulePostRATDList::StartBlockForKills(MachineBasicBlock *BB) {
         LiveRegs.set(*SubRegs);
     }
   }
-  else {
-    // In a non-return block, examine the live-in regs of all successors.
-    for (MachineBasicBlock::succ_iterator SI = BB->succ_begin(),
-         SE = BB->succ_end(); SI != SE; ++SI) {
-      for (MachineBasicBlock::livein_iterator I = (*SI)->livein_begin(),
-           E = (*SI)->livein_end(); I != E; ++I) {
-        unsigned Reg = *I;
-        LiveRegs.set(Reg);
-        // Repeat, for all subregs.
-        for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs)
-          LiveRegs.set(*SubRegs);
-      }
-    }
-  }
 }
 
 bool SchedulePostRATDList::ToggleKillFlag(MachineInstr *MI,
diff --git a/lib/CodeGen/PrologEpilogInserter.cpp b/lib/CodeGen/PrologEpilogInserter.cpp
index 1d0e71e..b18d52d 100644
--- a/lib/CodeGen/PrologEpilogInserter.cpp
+++ b/lib/CodeGen/PrologEpilogInserter.cpp
@@ -139,7 +139,6 @@ bool PEI::runOnMachineFunction(MachineFunction &Fn) {
 /// variables for the function's frame information and eliminate call frame
 /// pseudo instructions.
 void PEI::calculateCallsInformation(MachineFunction &Fn) {
-  const TargetRegisterInfo *RegInfo = Fn.getTarget().getRegisterInfo();
   const TargetInstrInfo &TII = *Fn.getTarget().getInstrInfo();
   const TargetFrameLowering *TFI = Fn.getTarget().getFrameLowering();
   MachineFrameInfo *MFI = Fn.getFrameInfo();
@@ -186,7 +185,7 @@ void PEI::calculateCallsInformation(MachineFunction &Fn) {
       // here. The sub/add sp instruction pairs are still inserted, but we don't
       // need to track the SP adjustment for frame index elimination.
       if (TFI->canSimplifyCallFramePseudos(Fn))
-        RegInfo->eliminateCallFramePseudoInstr(Fn, *I->getParent(), I);
+        TFI->eliminateCallFramePseudoInstr(Fn, *I->getParent(), I);
     }
   }
 
@@ -693,6 +692,14 @@ void PEI::insertPrologEpilogCode(MachineFunction &Fn) {
   // space in small chunks instead of one large contiguous block.
   if (Fn.getTarget().Options.EnableSegmentedStacks)
     TFI.adjustForSegmentedStacks(Fn);
+
+  // Emit additional code that is required to explicitly handle the stack in
+  // HiPE native code (if needed) when loaded in the Erlang/OTP runtime. The
+  // approach is rather similar to that of Segmented Stacks, but it uses a
+  // different conditional check and another BIF for allocating more stack
+  // space.
+  if (Fn.getFunction()->getCallingConv() == CallingConv::HiPE)
+    TFI.adjustForHiPEPrologue(Fn);
 }
 
 /// replaceFrameIndices - Replace all MO_FrameIndex operands with physical
@@ -739,7 +746,7 @@ void PEI::replaceFrameIndices(MachineFunction &Fn) {
         MachineBasicBlock::iterator PrevI = BB->end();
         if (I != BB->begin()) PrevI = prior(I);
-        TRI.eliminateCallFramePseudoInstr(Fn, *BB, I);
+        TFI->eliminateCallFramePseudoInstr(Fn, *BB, I);
 
         // Visit the instructions created by eliminateCallFramePseudoInstr().
         if (PrevI == BB->end())
@@ -751,34 +758,36 @@ void PEI::replaceFrameIndices(MachineFunction &Fn) {
       MachineInstr *MI = I;
       bool DoIncr = true;
-      for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i)
-        if (MI->getOperand(i).isFI()) {
-          // Some instructions (e.g. inline asm instructions) can have
-          // multiple frame indices and/or cause eliminateFrameIndex
-          // to insert more than one instruction. We need the register
-          // scavenger to go through all of these instructions so that
-          // it can update its register information. We keep the
-          // iterator at the point before insertion so that we can
-          // revisit them in full.
-          bool AtBeginning = (I == BB->begin());
-          if (!AtBeginning) --I;
-
-          // If this instruction has a FrameIndex operand, we need to
-          // use that target machine register info object to eliminate
-          // it.
-          TRI.eliminateFrameIndex(MI, SPAdj,
-                                  FrameIndexVirtualScavenging ? NULL : RS);
-
-          // Reset the iterator if we were at the beginning of the BB.
-          if (AtBeginning) {
-            I = BB->begin();
-            DoIncr = false;
-          }
+      for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+        if (!MI->getOperand(i).isFI())
+          continue;
 
-          MI = 0;
-          break;
+        // Some instructions (e.g. inline asm instructions) can have
+        // multiple frame indices and/or cause eliminateFrameIndex
+        // to insert more than one instruction. We need the register
+        // scavenger to go through all of these instructions so that
+        // it can update its register information. We keep the
+        // iterator at the point before insertion so that we can
+        // revisit them in full.
+        bool AtBeginning = (I == BB->begin());
+        if (!AtBeginning) --I;
+
+        // If this instruction has a FrameIndex operand, we need to
+        // use that target machine register info object to eliminate
+        // it.
+        TRI.eliminateFrameIndex(MI, SPAdj, i,
+                                FrameIndexVirtualScavenging ? NULL : RS);
+
+        // Reset the iterator if we were at the beginning of the BB.
+        if (AtBeginning) {
+          I = BB->begin();
+          DoIncr = false;
        }
 
+        MI = 0;
+        break;
+      }
+
      if (DoIncr && I != BB->end()) ++I;
 
      // Update register states.
diff --git a/lib/CodeGen/RegAllocFast.cpp b/lib/CodeGen/RegAllocFast.cpp
index 8d849dc..bb9c05c 100644
--- a/lib/CodeGen/RegAllocFast.cpp
+++ b/lib/CodeGen/RegAllocFast.cpp
@@ -113,12 +113,27 @@ namespace {
     // PhysRegState - One of the RegState enums, or a virtreg.
     std::vector<unsigned> PhysRegState;
 
+    // Set of register units.
     typedef SparseSet<unsigned> UsedInInstrSet;
 
-    // UsedInInstr - Set of physregs that are used in the current instruction,
-    // and so cannot be allocated.
+    // Set of register units that are used in the current instruction, and so
+    // cannot be allocated.
     UsedInInstrSet UsedInInstr;
 
+    // Mark a physreg as used in this instruction.
+    void markRegUsedInInstr(unsigned PhysReg) {
+      for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units)
+        UsedInInstr.insert(*Units);
+    }
+
+    // Check if a physreg or any of its aliases are used in this instruction.
+    bool isRegUsedInInstr(unsigned PhysReg) const {
+      for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units)
+        if (UsedInInstr.count(*Units))
+          return true;
+      return false;
+    }
+
     // SkippedInstrs - Descriptors of instructions whose clobber list was
     // ignored because all registers were spilled. It is still necessary to
     // mark all the clobbered registers as used by the function.
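// For illustration, the register-unit idea behind markRegUsedInInstr() and
// isRegUsedInInstr() above: every physical register is a set of leaf "units",
// and two registers alias exactly when their unit sets intersect (on X86, AL
// and AX share a unit, for instance). Tracking used units therefore answers
// alias queries without walking alias lists. A toy with an invented unit
// table in place of MCRegUnitIterator:

#include <cstddef>
#include <set>
#include <vector>

typedef std::vector<std::vector<unsigned> > RegUnitTable; // reg -> its units

struct UsedUnits {
  std::set<unsigned> Units;

  void markRegUsed(const RegUnitTable &T, unsigned Reg) {
    Units.insert(T[Reg].begin(), T[Reg].end());
  }

  // True if Reg or any register aliasing Reg was marked: aliasing registers
  // always share at least one unit.
  bool isRegUsed(const RegUnitTable &T, unsigned Reg) const {
    for (std::size_t i = 0; i != T[Reg].size(); ++i)
      if (Units.count(T[Reg][i]))
        return true;
    return false;
  }
};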
@@ -177,7 +192,6 @@ namespace { unsigned VirtReg, unsigned Hint); void spillAll(MachineBasicBlock::iterator MI); bool setPhysReg(MachineInstr *MI, unsigned OpNum, unsigned PhysReg); - void addRetOperands(MachineBasicBlock *MBB); }; char RAFast::ID = 0; } @@ -334,7 +348,7 @@ void RAFast::usePhysReg(MachineOperand &MO) { unsigned PhysReg = MO.getReg(); assert(TargetRegisterInfo::isPhysicalRegister(PhysReg) && "Bad usePhysReg operand"); - + markRegUsedInInstr(PhysReg); switch (PhysRegState[PhysReg]) { case regDisabled: break; @@ -342,7 +356,6 @@ void RAFast::usePhysReg(MachineOperand &MO) { PhysRegState[PhysReg] = regFree; // Fall through case regFree: - UsedInInstr.insert(PhysReg); MO.setIsKill(); return; default: @@ -362,13 +375,11 @@ void RAFast::usePhysReg(MachineOperand &MO) { "Instruction is not using a subregister of a reserved register"); // Leave the superregister in the working set. PhysRegState[Alias] = regFree; - UsedInInstr.insert(Alias); MO.getParent()->addRegisterKilled(Alias, TRI, true); return; case regFree: if (TRI->isSuperRegister(PhysReg, Alias)) { // Leave the superregister in the working set. - UsedInInstr.insert(Alias); MO.getParent()->addRegisterKilled(Alias, TRI, true); return; } @@ -382,7 +393,6 @@ void RAFast::usePhysReg(MachineOperand &MO) { // All aliases are disabled, bring register into working set. PhysRegState[PhysReg] = regFree; - UsedInInstr.insert(PhysReg); MO.setIsKill(); } @@ -391,7 +401,7 @@ void RAFast::usePhysReg(MachineOperand &MO) { /// reserved instead of allocated. void RAFast::definePhysReg(MachineInstr *MI, unsigned PhysReg, RegState NewState) { - UsedInInstr.insert(PhysReg); + markRegUsedInInstr(PhysReg); switch (unsigned VirtReg = PhysRegState[PhysReg]) { case regDisabled: break; @@ -431,7 +441,7 @@ void RAFast::definePhysReg(MachineInstr *MI, unsigned PhysReg, // can be allocated directly. // Returns spillImpossible when PhysReg or an alias can't be spilled. unsigned RAFast::calcSpillCost(unsigned PhysReg) const { - if (UsedInInstr.count(PhysReg)) { + if (isRegUsedInInstr(PhysReg)) { DEBUG(dbgs() << PrintReg(PhysReg, TRI) << " is already used in instr.\n"); return spillImpossible; } @@ -456,8 +466,6 @@ unsigned RAFast::calcSpillCost(unsigned PhysReg) const { unsigned Cost = 0; for (MCRegAliasIterator AI(PhysReg, TRI, false); AI.isValid(); ++AI) { unsigned Alias = *AI; - if (UsedInInstr.count(Alias)) - return spillImpossible; switch (unsigned VirtReg = PhysRegState[Alias]) { case regDisabled: break; @@ -532,7 +540,7 @@ RAFast::LiveRegMap::iterator RAFast::allocVirtReg(MachineInstr *MI, // First try to find a completely free register. 
for (ArrayRef<MCPhysReg>::iterator I = AO.begin(), E = AO.end(); I != E; ++I){ unsigned PhysReg = *I; - if (PhysRegState[PhysReg] == regFree && !UsedInInstr.count(PhysReg)) { + if (PhysRegState[PhysReg] == regFree && !isRegUsedInInstr(PhysReg)) { assignVirtToPhysReg(*LRI, PhysReg); return LRI; } @@ -598,7 +606,7 @@ RAFast::defineVirtReg(MachineInstr *MI, unsigned OpNum, LRI->LastUse = MI; LRI->LastOpNum = OpNum; LRI->Dirty = true; - UsedInInstr.insert(LRI->PhysReg); + markRegUsedInInstr(LRI->PhysReg); return LRI; } @@ -648,7 +656,7 @@ RAFast::reloadVirtReg(MachineInstr *MI, unsigned OpNum, assert(LRI->PhysReg && "Register not assigned"); LRI->LastUse = MI; LRI->LastOpNum = OpNum; - UsedInInstr.insert(LRI->PhysReg); + markRegUsedInInstr(LRI->PhysReg); return LRI; } @@ -709,8 +717,8 @@ void RAFast::handleThroughOperands(MachineInstr *MI, if (!MO.isReg() || !MO.isDef()) continue; unsigned Reg = MO.getReg(); if (!Reg || !TargetRegisterInfo::isPhysicalRegister(Reg)) continue; + markRegUsedInInstr(Reg); for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) { - UsedInInstr.insert(*AI); if (ThroughRegs.count(PhysRegState[*AI])) definePhysReg(MI, *AI, regFree); } @@ -766,65 +774,12 @@ void RAFast::handleThroughOperands(MachineInstr *MI, if (!Reg || !TargetRegisterInfo::isPhysicalRegister(Reg)) continue; DEBUG(dbgs() << "\tSetting " << PrintReg(Reg, TRI) << " as used in instr\n"); - UsedInInstr.insert(Reg); + markRegUsedInInstr(Reg); } // Also mark PartialDefs as used to avoid reallocation. for (unsigned i = 0, e = PartialDefs.size(); i != e; ++i) - UsedInInstr.insert(PartialDefs[i]); -} - -/// addRetOperand - ensure that a return instruction has an operand for each -/// value live out of the function. -/// -/// Things marked both call and return are tail calls; do not do this for them. -/// The tail callee need not take the same registers as input that it produces -/// as output, and there are dependencies for its input registers elsewhere. -/// -/// FIXME: This should be done as part of instruction selection, and this helper -/// should be deleted. Until then, we use custom logic here to create the proper -/// operand under all circumstances. We can't use addRegisterKilled because that -/// doesn't make sense for undefined values. We can't simply avoid calling it -/// for undefined values, because we must ensure that the operand always exists. -void RAFast::addRetOperands(MachineBasicBlock *MBB) { - if (MBB->empty() || !MBB->back().isReturn() || MBB->back().isCall()) - return; - - MachineInstr *MI = &MBB->back(); - - for (MachineRegisterInfo::liveout_iterator - I = MBB->getParent()->getRegInfo().liveout_begin(), - E = MBB->getParent()->getRegInfo().liveout_end(); I != E; ++I) { - unsigned Reg = *I; - assert(TargetRegisterInfo::isPhysicalRegister(Reg) && - "Cannot have a live-out virtual register."); - - bool hasDef = PhysRegState[Reg] == regReserved; - - // Check if this register already has an operand. - bool Found = false; - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - MachineOperand &MO = MI->getOperand(i); - if (!MO.isReg() || !MO.isUse()) - continue; - - unsigned OperReg = MO.getReg(); - if (!TargetRegisterInfo::isPhysicalRegister(OperReg)) - continue; - - if (OperReg == Reg || TRI->isSuperRegister(OperReg, Reg)) { - // If the ret already has an operand for this physreg or a superset, - // don't duplicate it. Set the kill flag if the value is defined. 
- if (hasDef && !MO.isKill()) - MO.setIsKill(); - Found = true; - break; - } - } - if (!Found) - MachineInstrBuilder(*MF, MI) - .addReg(Reg, llvm::RegState::Implicit | getKillRegState(hasDef)); - } + markRegUsedInInstr(PartialDefs[i]); } void RAFast::AllocateBasicBlock() { @@ -1023,7 +978,7 @@ void RAFast::AllocateBasicBlock() { for (UsedInInstrSet::iterator I = UsedInInstr.begin(), E = UsedInInstr.end(); I != E; ++I) - MRI->setPhysRegUsed(*I); + MRI->setRegUnitUsed(*I); // Track registers defined by instruction - early clobbers and tied uses at // this point. @@ -1036,8 +991,7 @@ void RAFast::AllocateBasicBlock() { if (!Reg || !TargetRegisterInfo::isPhysicalRegister(Reg)) continue; // Look for physreg defs and tied uses. if (!MO.isDef() && !MI->isRegTiedToDefOperand(i)) continue; - for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) - UsedInInstr.insert(*AI); + markRegUsedInInstr(Reg); } } @@ -1089,7 +1043,7 @@ void RAFast::AllocateBasicBlock() { for (UsedInInstrSet::iterator I = UsedInInstr.begin(), E = UsedInInstr.end(); I != E; ++I) - MRI->setPhysRegUsed(*I); + MRI->setRegUnitUsed(*I); if (CopyDst && CopyDst == CopySrc && CopyDstSub == CopySrcSub) { DEBUG(dbgs() << "-- coalescing: " << *MI); @@ -1109,9 +1063,6 @@ void RAFast::AllocateBasicBlock() { MBB->erase(Coalesced[i]); NumCopies += Coalesced.size(); - // addRetOperands must run after we've seen all defs in this block. - addRetOperands(MBB); - DEBUG(MBB->dump()); } @@ -1128,7 +1079,7 @@ bool RAFast::runOnMachineFunction(MachineFunction &Fn) { MRI->freezeReservedRegs(Fn); RegClassInfo.runOnMachineFunction(Fn); UsedInInstr.clear(); - UsedInInstr.setUniverse(TRI->getNumRegs()); + UsedInInstr.setUniverse(TRI->getNumRegUnits()); assert(!MRI->isSSA() && "regalloc requires leaving SSA"); diff --git a/lib/CodeGen/RegAllocGreedy.cpp b/lib/CodeGen/RegAllocGreedy.cpp index 1884452..6344a73 100644 --- a/lib/CodeGen/RegAllocGreedy.cpp +++ b/lib/CodeGen/RegAllocGreedy.cpp @@ -632,16 +632,33 @@ unsigned RAGreedy::tryEvict(LiveInterval &VirtReg, // Keep track of the cheapest interference seen so far. EvictionCost BestCost(~0u); unsigned BestPhys = 0; + unsigned OrderLimit = Order.getOrder().size(); // When we are just looking for a reduced cost per use, don't break any // hints, and only evict smaller spill weights. if (CostPerUseLimit < ~0u) { BestCost.BrokenHints = 0; BestCost.MaxWeight = VirtReg.weight; + + // Check if any registers in RC are below CostPerUseLimit. + const TargetRegisterClass *RC = MRI->getRegClass(VirtReg.reg); + unsigned MinCost = RegClassInfo.getMinCost(RC); + if (MinCost >= CostPerUseLimit) { + DEBUG(dbgs() << RC->getName() << " minimum cost = " << MinCost + << ", no cheaper registers to be found.\n"); + return 0; + } + + // It is normal for register classes to have a long tail of registers with + // the same cost. We don't need to look at them if they're too expensive. + if (TRI->getCostPerUse(Order.getOrder().back()) >= CostPerUseLimit) { + OrderLimit = RegClassInfo.getLastCostChange(RC); + DEBUG(dbgs() << "Only trying the first " << OrderLimit << " regs.\n"); + } } Order.rewind(); - while (unsigned PhysReg = Order.next()) { + while (unsigned PhysReg = Order.nextWithDups(OrderLimit)) { if (TRI->getCostPerUse(PhysReg) >= CostPerUseLimit) continue; // The first use of a callee-saved register in a function has cost 1.
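The pruning in tryEvict above leans on two per-class summaries that RegisterClassInfo::compute() caches in the next file: the minimum cost-per-use in the allocation order, and the index of the last point at which the cost changes. A hedged standalone sketch of the same scan, with invented register numbers and costs:

#include <cstdio>
#include <vector>

struct RegCost { unsigned Reg, Cost; };

int main() {
  // Invented allocation order: registers grouped by ascending cost-per-use,
  // the shape the updated RegisterClassInfo::compute() produces.
  std::vector<RegCost> Order = { {1, 0}, {2, 0}, {3, 1}, {4, 1}, {5, 1} };

  // The two cached summaries: cheapest cost, and index of last cost change.
  unsigned MinCost = ~0u, LastCost = ~0u, LastCostChange = 0;
  for (unsigned I = 0, E = Order.size(); I != E; ++I) {
    if (Order[I].Cost < MinCost) MinCost = Order[I].Cost;
    if (Order[I].Cost != LastCost) LastCostChange = I;
    LastCost = Order[I].Cost;
  }

  const unsigned CostPerUseLimit = 1;
  if (MinCost >= CostPerUseLimit) {
    std::puts("no cheaper registers to be found");
    return 0;
  }
  // The tail is one long run of equally expensive registers; if even the
  // last register is too costly, the scan can stop at the last cost change.
  unsigned OrderLimit = Order.size();
  if (Order.back().Cost >= CostPerUseLimit)
    OrderLimit = LastCostChange;
  for (unsigned I = 0; I != OrderLimit; ++I)
    std::printf("candidate r%u (cost %u)\n", Order[I].Reg, Order[I].Cost);
  return 0;
}

With the costs above only r1 and r2 are tried, which is exactly the saving getLastCostChange() buys: the expensive tail is never visited.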
diff --git a/lib/CodeGen/RegisterClassInfo.cpp b/lib/CodeGen/RegisterClassInfo.cpp index 078a0df..87382d8 100644 --- a/lib/CodeGen/RegisterClassInfo.cpp +++ b/lib/CodeGen/RegisterClassInfo.cpp @@ -83,6 +83,9 @@ void RegisterClassInfo::compute(const TargetRegisterClass *RC) const { unsigned N = 0; SmallVector<MCPhysReg, 16> CSRAlias; + unsigned MinCost = 0xff; + unsigned LastCost = ~0u; + unsigned LastCostChange = 0; // FIXME: Once targets reserve registers instead of removing them from the // allocation order, we can simply use begin/end here. @@ -92,17 +95,31 @@ void RegisterClassInfo::compute(const TargetRegisterClass *RC) const { // Remove reserved registers from the allocation order. if (Reserved.test(PhysReg)) continue; + unsigned Cost = TRI->getCostPerUse(PhysReg); + MinCost = std::min(MinCost, Cost); + if (CSRNum[PhysReg]) // PhysReg aliases a CSR, save it for later. CSRAlias.push_back(PhysReg); - else + else { + if (Cost != LastCost) + LastCostChange = N; RCI.Order[N++] = PhysReg; + LastCost = Cost; + } } RCI.NumRegs = N + CSRAlias.size(); assert (RCI.NumRegs <= NumRegs && "Allocation order larger than regclass"); // CSR aliases go after the volatile registers, preserve the target's order. - std::copy(CSRAlias.begin(), CSRAlias.end(), &RCI.Order[N]); + for (unsigned i = 0, e = CSRAlias.size(); i != e; ++i) { + unsigned PhysReg = CSRAlias[i]; + unsigned Cost = TRI->getCostPerUse(PhysReg); + if (Cost != LastCost) + LastCostChange = N; + RCI.Order[N++] = PhysReg; + LastCost = Cost; + } // Register allocator stress test. Clip register class to N registers. if (StressRA && RCI.NumRegs > StressRA) @@ -113,6 +130,9 @@ void RegisterClassInfo::compute(const TargetRegisterClass *RC) const { if (Super != RC && getNumAllocatableRegs(Super) > RCI.NumRegs) RCI.ProperSubClass = true; + RCI.MinCost = uint8_t(MinCost); + RCI.LastCostChange = LastCostChange; + DEBUG({ dbgs() << "AllocationOrder(" << RC->getName() << ") = ["; for (unsigned I = 0; I != RCI.NumRegs; ++I) diff --git a/lib/CodeGen/RegisterCoalescer.cpp b/lib/CodeGen/RegisterCoalescer.cpp index 36d8101..e2488ad 100644 --- a/lib/CodeGen/RegisterCoalescer.cpp +++ b/lib/CodeGen/RegisterCoalescer.cpp @@ -15,7 +15,6 @@ #define DEBUG_TYPE "regalloc" #include "RegisterCoalescer.h" -#include "LiveDebugVariables.h" #include "llvm/ADT/OwningPtr.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallSet.h" @@ -84,7 +83,6 @@ namespace { const TargetRegisterInfo* TRI; const TargetInstrInfo* TII; LiveIntervals *LIS; - LiveDebugVariables *LDV; const MachineLoopInfo* Loops; AliasAnalysis *AA; RegisterClassInfo RegClassInfo; @@ -169,8 +167,7 @@ namespace { /// reMaterializeTrivialDef - If the source of a copy is defined by a /// trivial computation, replace the copy by rematerializing the definition. - bool reMaterializeTrivialDef(LiveInterval &SrcInt, unsigned DstReg, - MachineInstr *CopyMI); + bool reMaterializeTrivialDef(CoalescerPair &CP, MachineInstr *CopyMI); /// canJoinPhys - Return true if a physreg copy should be joined.
bool canJoinPhys(const CoalescerPair &CP); @@ -208,7 +205,6 @@ char &llvm::RegisterCoalescerID = RegisterCoalescer::ID; INITIALIZE_PASS_BEGIN(RegisterCoalescer, "simple-register-coalescing", "Simple Register Coalescing", false, false) INITIALIZE_PASS_DEPENDENCY(LiveIntervals) -INITIALIZE_PASS_DEPENDENCY(LiveDebugVariables) INITIALIZE_PASS_DEPENDENCY(SlotIndexes) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) INITIALIZE_AG_DEPENDENCY(AliasAnalysis) @@ -394,8 +390,6 @@ void RegisterCoalescer::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired<AliasAnalysis>(); AU.addRequired<LiveIntervals>(); AU.addPreserved<LiveIntervals>(); - AU.addRequired<LiveDebugVariables>(); - AU.addPreserved<LiveDebugVariables>(); AU.addPreserved<SlotIndexes>(); AU.addRequired<MachineLoopInfo>(); AU.addPreserved<MachineLoopInfo>(); @@ -737,9 +731,14 @@ bool RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP, /// reMaterializeTrivialDef - If the source of a copy is defined by a trivial /// computation, replace the copy by rematerializing the definition. -bool RegisterCoalescer::reMaterializeTrivialDef(LiveInterval &SrcInt, - unsigned DstReg, +bool RegisterCoalescer::reMaterializeTrivialDef(CoalescerPair &CP, MachineInstr *CopyMI) { + unsigned SrcReg = CP.isFlipped() ? CP.getDstReg() : CP.getSrcReg(); + unsigned DstReg = CP.isFlipped() ? CP.getSrcReg() : CP.getDstReg(); + if (TargetRegisterInfo::isPhysicalRegister(SrcReg)) + return false; + + LiveInterval &SrcInt = LIS->getInterval(SrcReg); SlotIndex CopyIdx = LIS->getInstructionIndex(CopyMI).getRegSlot(true); LiveInterval::iterator SrcLR = SrcInt.FindLiveRangeContaining(CopyIdx); assert(SrcLR != SrcInt.end() && "Live range not found!"); @@ -760,13 +759,17 @@ bool RegisterCoalescer::reMaterializeTrivialDef(LiveInterval &SrcInt, const MCInstrDesc &MCID = DefMI->getDesc(); if (MCID.getNumDefs() != 1) return false; + // Only support subregister destinations when the def is read-undef. + MachineOperand &DstOperand = CopyMI->getOperand(0); + if (DstOperand.getSubReg() && !DstOperand.isUndef()) + return false; if (!DefMI->isImplicitDef()) { // Make sure the copy destination register class fits the instruction // definition register class. The mismatch can happen as a result of earlier // extract_subreg, insert_subreg, subreg_to_reg coalescing. const TargetRegisterClass *RC = TII->getRegClass(MCID, 0, TRI, *MF); if (TargetRegisterInfo::isVirtualRegister(DstReg)) { - if (MRI->getRegClass(DstReg) != RC) + if (!MRI->constrainRegClass(DstReg, RC)) return false; } else if (!RC->contains(DstReg)) return false; @@ -778,6 +781,12 @@ bool RegisterCoalescer::reMaterializeTrivialDef(LiveInterval &SrcInt, TII->reMaterialize(*MBB, MII, DstReg, 0, DefMI, *TRI); MachineInstr *NewMI = prior(MII); + // The original DefMI may have been a subregister def, but the full register + // class of its destination matches the destination of CopyMI, and CopyMI is + // either a full register def or is read-undef. Therefore we can clear the + // subregister index on the rematerialized instruction. + NewMI->getOperand(0).setSubReg(0); + // NewMI may have dead implicit defs (E.g. EFLAGS for MOV<bits>r0 on X86). // We need to remember these so we can add intervals once we insert // NewMI into SlotIndexes. @@ -883,9 +892,6 @@ void RegisterCoalescer::updateRegDefsUses(unsigned SrcReg, bool DstIsPhys = TargetRegisterInfo::isPhysicalRegister(DstReg); LiveInterval *DstInt = DstIsPhys ? 0 : &LIS->getInterval(DstReg); - // Update LiveDebugVariables.
- LDV->renameRegister(SrcReg, DstReg, SubIdx); - SmallPtrSet<MachineInstr*, 8> Visited; for (MachineRegisterInfo::reg_iterator I = MRI->reg_begin(SrcReg); MachineInstr *UseMI = I.skipInstruction();) { @@ -1010,9 +1016,7 @@ bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) { if (!canJoinPhys(CP)) { // Before giving up coalescing, if definition of source is defined by // trivial computation, try rematerializing it. - if (!CP.isFlipped() && - reMaterializeTrivialDef(LIS->getInterval(CP.getSrcReg()), - CP.getDstReg(), CopyMI)) + if (reMaterializeTrivialDef(CP, CopyMI)) return true; return false; } @@ -1045,9 +1049,7 @@ bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) { // If definition of source is defined by trivial computation, try // rematerializing it. - if (!CP.isFlipped() && - reMaterializeTrivialDef(LIS->getInterval(CP.getSrcReg()), - CP.getDstReg(), CopyMI)) + if (reMaterializeTrivialDef(CP, CopyMI)) return true; // If we can eliminate the copy without merging the live ranges, do so now. @@ -2136,7 +2138,6 @@ bool RegisterCoalescer::runOnMachineFunction(MachineFunction &fn) { TRI = TM->getRegisterInfo(); TII = TM->getInstrInfo(); LIS = &getAnalysis<LiveIntervals>(); - LDV = &getAnalysis<LiveDebugVariables>(); AA = &getAnalysis<AliasAnalysis>(); Loops = &getAnalysis<MachineLoopInfo>(); @@ -2182,7 +2183,6 @@ bool RegisterCoalescer::runOnMachineFunction(MachineFunction &fn) { } DEBUG(dump()); - DEBUG(LDV->dump()); if (VerifyCoalescing) MF->verify(this, "After register coalescing"); return true; diff --git a/lib/CodeGen/RegisterPressure.cpp b/lib/CodeGen/RegisterPressure.cpp index 62e95aa..97f22e1 100644 --- a/lib/CodeGen/RegisterPressure.cpp +++ b/lib/CodeGen/RegisterPressure.cpp @@ -357,15 +357,14 @@ protected: /// Collect physical and virtual register operands. static void collectOperands(const MachineInstr *MI, RegisterOperands &RegOpers) { - for(ConstMIBundleOperands OperI(MI); OperI.isValid(); ++OperI) + for (ConstMIBundleOperands OperI(MI); OperI.isValid(); ++OperI) RegOpers.collect(*OperI); // Remove redundant physreg dead defs. - for (unsigned i = RegOpers.DeadDefs.size(); i > 0; --i) { - unsigned Reg = RegOpers.DeadDefs[i-1]; - if (containsReg(RegOpers.Defs, Reg)) - RegOpers.DeadDefs.erase(&RegOpers.DeadDefs[i-1]); - } + SmallVectorImpl<unsigned>::iterator I = + std::remove_if(RegOpers.DeadDefs.begin(), RegOpers.DeadDefs.end(), + std::bind1st(std::ptr_fun(containsReg), RegOpers.Defs)); + RegOpers.DeadDefs.erase(I, RegOpers.DeadDefs.end()); } /// Force liveness of registers. 
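The collectOperands() change above is the classic erase-remove idiom; the committed code reaches for std::bind1st/std::ptr_fun since LLVM still built as C++03 at the time. A sketch of the same transformation in later C++, with a lambda in place of the binder (containsReg here is a stand-in with the obvious meaning):

#include <algorithm>
#include <cassert>
#include <vector>

// Stand-in for the containsReg helper: linear membership test.
static bool containsReg(const std::vector<unsigned> &Regs, unsigned Reg) {
  return std::find(Regs.begin(), Regs.end(), Reg) != Regs.end();
}

int main() {
  std::vector<unsigned> Defs     = {10, 12};
  std::vector<unsigned> DeadDefs = {10, 11, 12, 13};

  // remove_if partitions the survivors to the front and returns the new
  // logical end; one erase() call then drops the tail, replacing the old
  // backwards loop that erased elements one at a time.
  DeadDefs.erase(std::remove_if(DeadDefs.begin(), DeadDefs.end(),
                                [&](unsigned R) {
                                  return containsReg(Defs, R);
                                }),
                 DeadDefs.end());

  assert((DeadDefs == std::vector<unsigned>{11, 13}));
  return 0;
}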
diff --git a/lib/CodeGen/RegisterScavenging.cpp b/lib/CodeGen/RegisterScavenging.cpp index 88f67da..6da901f 100644 --- a/lib/CodeGen/RegisterScavenging.cpp +++ b/lib/CodeGen/RegisterScavenging.cpp @@ -316,6 +316,16 @@ unsigned RegScavenger::findSurvivorReg(MachineBasicBlock::iterator StartMI, return Survivor; } +static unsigned getFrameIndexOperandNum(MachineInstr *MI) { + unsigned i = 0; + while (!MI->getOperand(i).isFI()) { + ++i; + assert(i < MI->getNumOperands() && + "Instr doesn't have FrameIndex operand!"); + } + return i; +} + unsigned RegScavenger::scavengeRegister(const TargetRegisterClass *RC, MachineBasicBlock::iterator I, int SPAdj) { @@ -364,12 +374,16 @@ unsigned RegScavenger::scavengeRegister(const TargetRegisterClass *RC, "Cannot scavenge register without an emergency spill slot!"); TII->storeRegToStackSlot(*MBB, I, SReg, true, ScavengingFrameIndex, RC,TRI); MachineBasicBlock::iterator II = prior(I); - TRI->eliminateFrameIndex(II, SPAdj, this); + + unsigned FIOperandNum = getFrameIndexOperandNum(II); + TRI->eliminateFrameIndex(II, SPAdj, FIOperandNum, this); // Restore the scavenged register before its use (or first terminator). TII->loadRegFromStackSlot(*MBB, UseMI, SReg, ScavengingFrameIndex, RC, TRI); II = prior(UseMI); - TRI->eliminateFrameIndex(II, SPAdj, this); + + FIOperandNum = getFrameIndexOperandNum(II); + TRI->eliminateFrameIndex(II, SPAdj, FIOperandNum, this); } ScavengeRestore = prior(UseMI); diff --git a/lib/CodeGen/ScheduleDAG.cpp b/lib/CodeGen/ScheduleDAG.cpp index e639c55..45b4f68 100644 --- a/lib/CodeGen/ScheduleDAG.cpp +++ b/lib/CodeGen/ScheduleDAG.cpp @@ -135,20 +135,14 @@ void SUnit::removePred(const SDep &D) { for (SmallVector<SDep, 4>::iterator I = Preds.begin(), E = Preds.end(); I != E; ++I) if (*I == D) { - bool FoundSucc = false; // Find the corresponding successor in N. SDep P = D; P.setSUnit(this); SUnit *N = D.getSUnit(); - for (SmallVector<SDep, 4>::iterator II = N->Succs.begin(), - EE = N->Succs.end(); II != EE; ++II) - if (*II == P) { - FoundSucc = true; - N->Succs.erase(II); - break; - } - assert(FoundSucc && "Mismatching preds / succs lists!"); - (void)FoundSucc; + SmallVectorImpl<SDep>::iterator Succ = std::find(N->Succs.begin(), + N->Succs.end(), P); + assert(Succ != N->Succs.end() && "Mismatching preds / succs lists!"); + N->Succs.erase(Succ); Preds.erase(I); // Update the bookkeeping. if (P.getKind() == SDep::Data) { @@ -301,6 +295,21 @@ void SUnit::ComputeHeight() { } while (!WorkList.empty()); } +void SUnit::biasCriticalPath() { + if (NumPreds < 2) + return; + + SUnit::pred_iterator BestI = Preds.begin(); + unsigned MaxDepth = BestI->getSUnit()->getDepth(); + for (SUnit::pred_iterator + I = llvm::next(BestI), E = Preds.end(); I != E; ++I) { + if (I->getKind() == SDep::Data && I->getSUnit()->getDepth() > MaxDepth) + BestI = I; + } + if (BestI != Preds.begin()) + std::swap(*Preds.begin(), *BestI); +} + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// SUnit - Scheduling unit. It's a wrapper around either a single SDNode or /// a group of nodes flagged together. diff --git a/lib/CodeGen/ScheduleDAGInstrs.cpp b/lib/CodeGen/ScheduleDAGInstrs.cpp index 662fc0e..71e7a21 100644 --- a/lib/CodeGen/ScheduleDAGInstrs.cpp +++ b/lib/CodeGen/ScheduleDAGInstrs.cpp @@ -168,20 +168,6 @@ void ScheduleDAGInstrs::finishBlock() { BB = 0; } -/// Initialize the map with the number of registers. -void Reg2SUnitsMap::setRegLimit(unsigned Limit) { - PhysRegSet.setUniverse(Limit); - SUnits.resize(Limit); -} - -/// Clear the map without deallocating storage.
-void Reg2SUnitsMap::clear() { - for (const_iterator I = reg_begin(), E = reg_end(); I != E; ++I) { - SUnits[*I].clear(); - } - PhysRegSet.clear(); -} - /// Initialize the DAG and common scheduler state for the current scheduling /// region. This does not actually create the DAG, only clears it. The /// scheduling driver may call BuildSchedGraph multiple times per scheduling @@ -228,7 +214,7 @@ void ScheduleDAGInstrs::addSchedBarrierDeps() { if (Reg == 0) continue; if (TRI->isPhysicalRegister(Reg)) - Uses[Reg].push_back(PhysRegSUOper(&ExitSU, -1)); + Uses.insert(PhysRegSUOper(&ExitSU, -1, Reg)); else { assert(!IsPostRA && "Virtual register encountered after regalloc."); if (MO.readsReg()) // ignore undef operands @@ -245,7 +231,7 @@ void ScheduleDAGInstrs::addSchedBarrierDeps() { E = (*SI)->livein_end(); I != E; ++I) { unsigned Reg = *I; if (!Uses.contains(Reg)) - Uses[Reg].push_back(PhysRegSUOper(&ExitSU, -1)); + Uses.insert(PhysRegSUOper(&ExitSU, -1, Reg)); } } } @@ -263,15 +249,14 @@ void ScheduleDAGInstrs::addPhysRegDataDeps(SUnit *SU, unsigned OperIdx) { Alias.isValid(); ++Alias) { if (!Uses.contains(*Alias)) continue; - std::vector<PhysRegSUOper> &UseList = Uses[*Alias]; - for (unsigned i = 0, e = UseList.size(); i != e; ++i) { - SUnit *UseSU = UseList[i].SU; + for (Reg2SUnitsMap::iterator I = Uses.find(*Alias); I != Uses.end(); ++I) { + SUnit *UseSU = I->SU; if (UseSU == SU) continue; // Adjust the dependence latency using operand def/use information, // then allow the target to perform its own adjustments. - int UseOp = UseList[i].OpIdx; + int UseOp = I->OpIdx; MachineInstr *RegUse = 0; SDep Dep; if (UseOp < 0) @@ -311,9 +296,8 @@ void ScheduleDAGInstrs::addPhysRegDeps(SUnit *SU, unsigned OperIdx) { Alias.isValid(); ++Alias) { if (!Defs.contains(*Alias)) continue; - std::vector<PhysRegSUOper> &DefList = Defs[*Alias]; - for (unsigned i = 0, e = DefList.size(); i != e; ++i) { - SUnit *DefSU = DefList[i].SU; + for (Reg2SUnitsMap::iterator I = Defs.find(*Alias); I != Defs.end(); ++I) { + SUnit *DefSU = I->SU; if (DefSU == &ExitSU) continue; if (DefSU != SU && @@ -337,33 +321,37 @@ void ScheduleDAGInstrs::addPhysRegDeps(SUnit *SU, unsigned OperIdx) { // Either insert a new Reg2SUnits entry with an empty SUnits list, or // retrieve the existing SUnits list for this register's uses. // Push this SUnit on the use list. - Uses[MO.getReg()].push_back(PhysRegSUOper(SU, OperIdx)); + Uses.insert(PhysRegSUOper(SU, OperIdx, MO.getReg())); } else { addPhysRegDataDeps(SU, OperIdx); - - // Either insert a new Reg2SUnits entry with an empty SUnits list, or - // retrieve the existing SUnits list for this register's defs. - std::vector<PhysRegSUOper> &DefList = Defs[MO.getReg()]; + unsigned Reg = MO.getReg(); // clear this register's use list - if (Uses.contains(MO.getReg())) - Uses[MO.getReg()].clear(); - - if (!MO.isDead()) - DefList.clear(); - - // Calls will not be reordered because of chain dependencies (see - // below). Since call operands are dead, calls may continue to be added - // to the DefList making dependence checking quadratic in the size of - // the block. Instead, we leave only one call at the back of the - // DefList. - if (SU->isCall) { - while (!DefList.empty() && DefList.back().SU->isCall) - DefList.pop_back(); + if (Uses.contains(Reg)) + Uses.eraseAll(Reg); + + if (!MO.isDead()) { + Defs.eraseAll(Reg); + } else if (SU->isCall) { + // Calls will not be reordered because of chain dependencies (see + // below). 
Since call operands are dead, calls may continue to be added + // to the DefList making dependence checking quadratic in the size of + // the block. Instead, we leave only one call at the back of the + // DefList. + Reg2SUnitsMap::RangePair P = Defs.equal_range(Reg); + Reg2SUnitsMap::iterator B = P.first; + Reg2SUnitsMap::iterator I = P.second; + for (bool isBegin = I == B; !isBegin; /* empty */) { + isBegin = (--I) == B; + if (!I->SU->isCall) + break; + I = Defs.erase(I); + } } + // Defs are pushed in the order they are visited and never reordered. - DefList.push_back(PhysRegSUOper(SU, OperIdx)); + Defs.insert(PhysRegSUOper(SU, OperIdx, Reg)); } } @@ -726,8 +714,8 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA, assert(Defs.empty() && Uses.empty() && "Only BuildGraph should update Defs/Uses"); - Defs.setRegLimit(TRI->getNumRegs()); - Uses.setRegLimit(TRI->getNumRegs()); + Defs.setUniverse(TRI->getNumRegs()); + Uses.setUniverse(TRI->getNumRegs()); assert(VRegDefs.empty() && "Only BuildSchedGraph may access VRegDefs"); // FIXME: Allow SparseSet to reserve space for the creation of virtual @@ -758,7 +746,7 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA, assert(RPTracker->getPos() == prior(MII) && "RPTracker can't find MI"); } - assert((!MI->isTerminator() || CanHandleTerminators) && !MI->isLabel() && + assert((CanHandleTerminators || (!MI->isTerminator() && !MI->isLabel())) && "Cannot schedule terminators or labels!"); SUnit *SU = MISUnitMap[MI]; @@ -1006,7 +994,7 @@ std::string ScheduleDAGInstrs::getGraphNodeLabel(const SUnit *SU) const { else if (SU == &ExitSU) oss << "<exit>"; else - SU->getInstr()->print(oss); + SU->getInstr()->print(oss, &TM, /*SkipOpers=*/true); return oss.str(); } @@ -1030,58 +1018,95 @@ class SchedDFSImpl { /// List PredSU, SuccSU pairs that represent data edges between subtrees. std::vector<std::pair<const SUnit*, const SUnit*> > ConnectionPairs; + struct RootData { + unsigned NodeID; + unsigned ParentNodeID; // Parent node (member of the parent subtree). + unsigned SubInstrCount; // Instr count in this tree only, not children. + + RootData(unsigned id): NodeID(id), + ParentNodeID(SchedDFSResult::InvalidSubtreeID), + SubInstrCount(0) {} + + unsigned getSparseSetIndex() const { return NodeID; } + }; + + SparseSet<RootData> RootSet; + public: - SchedDFSImpl(SchedDFSResult &r): R(r), SubtreeClasses(R.DFSData.size()) {} + SchedDFSImpl(SchedDFSResult &r): R(r), SubtreeClasses(R.DFSNodeData.size()) { + RootSet.setUniverse(R.DFSNodeData.size()); + } - /// SubtreID is initialized to zero, set to itself to flag the root of a - /// subtree, set to the parent to indicate an interior node, - /// then set to a representative subtree ID during finalization. + /// Return true if this node has been visited by the DFS traversal. + /// + /// During visitPostorderNode the Node's SubtreeID is assigned to the Node + /// ID. Later, SubtreeID is updated but remains valid. bool isVisited(const SUnit *SU) const { - return R.DFSData[SU->NodeNum].SubtreeID; + return R.DFSNodeData[SU->NodeNum].SubtreeID + != SchedDFSResult::InvalidSubtreeID; } /// Initialize this node's instruction count. We don't need to flag the node /// visited until visitPostorder because the DAG cannot have cycles. void visitPreorder(const SUnit *SU) { - R.DFSData[SU->NodeNum].InstrCount = SU->getInstr()->isTransient() ? 0 : 1; + R.DFSNodeData[SU->NodeNum].InstrCount = + SU->getInstr()->isTransient() ? 0 : 1; } - /// Mark this node as either the root of a subtree or an interior - /// node.
Increment the parent node's instruction count. - void visitPostorder(const SUnit *SU, const SDep *PredDep, const SUnit *Parent) { - R.DFSData[SU->NodeNum].SubtreeID = SU->NodeNum; - - // Join the child to its parent if they are connected via data dependence - // and do not exceed the limit. - if (!Parent || PredDep->getKind() != SDep::Data) - return; - - unsigned PredCnt = R.DFSData[SU->NodeNum].InstrCount; - if (PredCnt > R.SubtreeLimit) - return; - - R.DFSData[SU->NodeNum].SubtreeID = Parent->NodeNum; + /// Called once for each node after all predecessors are visited. Revisit this + /// node's predecessors and potentially join them now that we know the ILP of + /// the other predecessors. + void visitPostorderNode(const SUnit *SU) { + // Mark this node as the root of a subtree. It may be joined with its + // successors later. + R.DFSNodeData[SU->NodeNum].SubtreeID = SU->NodeNum; + RootData RData(SU->NodeNum); + RData.SubInstrCount = SU->getInstr()->isTransient() ? 0 : 1; + + // If any predecessors are still in their own subtree, they either cannot be + // joined or are large enough to remain separate. If this parent node's + // total instruction count is not greater than a child subtree by at least + // the subtree limit, then try to join it now since splitting subtrees is + // only useful if multiple high-pressure paths are possible. + unsigned InstrCount = R.DFSNodeData[SU->NodeNum].InstrCount; + for (SUnit::const_pred_iterator + PI = SU->Preds.begin(), PE = SU->Preds.end(); PI != PE; ++PI) { + if (PI->getKind() != SDep::Data) + continue; + unsigned PredNum = PI->getSUnit()->NodeNum; + if ((InstrCount - R.DFSNodeData[PredNum].InstrCount) < R.SubtreeLimit) + joinPredSubtree(*PI, SU, /*CheckLimit=*/false); + + // Either link or merge the TreeData entry from the child to the parent. + if (R.DFSNodeData[PredNum].SubtreeID == PredNum) { + // If the predecessor's parent is invalid, this is a tree edge and the + // current node is the parent. + if (RootSet[PredNum].ParentNodeID == SchedDFSResult::InvalidSubtreeID) + RootSet[PredNum].ParentNodeID = SU->NodeNum; + } + else if (RootSet.count(PredNum)) { + // The predecessor is not a root, but is still in the root set. This + // must be the new parent that it was just joined to. Note that + // RootSet[PredNum].ParentNodeID may either be invalid or may still be + // set to the original parent. + RData.SubInstrCount += RootSet[PredNum].SubInstrCount; + RootSet.erase(PredNum); + } + } + RootSet[SU->NodeNum] = RData; + } - // Add the recently finished predecessor's bottom-up descendent count. - R.DFSData[Parent->NodeNum].InstrCount += PredCnt; - SubtreeClasses.join(Parent->NodeNum, SU->NodeNum); + /// Called once for each tree edge after calling visitPostorderNode on the + /// predecessor. Increment the parent node's instruction count and + /// preemptively join this subtree to its parent's if it is small enough. + void visitPostorderEdge(const SDep &PredDep, const SUnit *Succ) { + R.DFSNodeData[Succ->NodeNum].InstrCount + += R.DFSNodeData[PredDep.getSUnit()->NodeNum].InstrCount; + joinPredSubtree(PredDep, Succ); } - /// Determine whether the DFS cross edge should be considered a subtree edge - /// or a connection between subtrees. - void visitCross(const SDep &PredDep, const SUnit *Succ) { - if (PredDep.getKind() == SDep::Data) { - // If this is a cross edge to a root, join the subtrees. This happens when - // the root was first reached by a non-data dependence.
- unsigned NodeNum = PredDep.getSUnit()->NodeNum; - unsigned PredCnt = R.DFSData[NodeNum].InstrCount; - if (R.DFSData[NodeNum].SubtreeID == NodeNum && PredCnt < R.SubtreeLimit) { - R.DFSData[NodeNum].SubtreeID = Succ->NodeNum; - R.DFSData[Succ->NodeNum].InstrCount += PredCnt; - SubtreeClasses.join(Succ->NodeNum, NodeNum); - return; - } - } + /// Add a connection for cross edges. + void visitCrossEdge(const SDep &PredDep, const SUnit *Succ) { ConnectionPairs.push_back(std::make_pair(PredDep.getSUnit(), Succ)); } @@ -1089,13 +1114,27 @@ public: /// between trees. void finalize() { SubtreeClasses.compress(); + R.DFSTreeData.resize(SubtreeClasses.getNumClasses()); + assert(SubtreeClasses.getNumClasses() == RootSet.size() + && "number of roots should match trees"); + for (SparseSet<RootData>::const_iterator + RI = RootSet.begin(), RE = RootSet.end(); RI != RE; ++RI) { + unsigned TreeID = SubtreeClasses[RI->NodeID]; + if (RI->ParentNodeID != SchedDFSResult::InvalidSubtreeID) + R.DFSTreeData[TreeID].ParentTreeID = SubtreeClasses[RI->ParentNodeID]; + R.DFSTreeData[TreeID].SubInstrCount = RI->SubInstrCount; + // Note that SubInstrCount may be greater than InstrCount if we joined + // subtrees across a cross edge. InstrCount will be attributed to the + // original parent, while SubInstrCount will be attributed to the joined + // parent. + } R.SubtreeConnections.resize(SubtreeClasses.getNumClasses()); R.SubtreeConnectLevels.resize(SubtreeClasses.getNumClasses()); DEBUG(dbgs() << R.getNumSubtrees() << " subtrees:\n"); - for (unsigned Idx = 0, End = R.DFSData.size(); Idx != End; ++Idx) { - R.DFSData[Idx].SubtreeID = SubtreeClasses[Idx]; + for (unsigned Idx = 0, End = R.DFSNodeData.size(); Idx != End; ++Idx) { + R.DFSNodeData[Idx].SubtreeID = SubtreeClasses[Idx]; DEBUG(dbgs() << " SU(" << Idx << ") in tree " - << R.DFSData[Idx].SubtreeID << '\n'); + << R.DFSNodeData[Idx].SubtreeID << '\n'); } for (std::vector<std::pair<const SUnit*, const SUnit*> >::const_iterator I = ConnectionPairs.begin(), E = ConnectionPairs.end(); @@ -1111,21 +1150,53 @@ public: } protected: + /// Join the predecessor subtree with the successor that is its DFS + /// parent. Apply some heuristics before joining. + bool joinPredSubtree(const SDep &PredDep, const SUnit *Succ, + bool CheckLimit = true) { + assert(PredDep.getKind() == SDep::Data && "Subtrees are for data edges"); + + // Check if the predecessor is already joined. + const SUnit *PredSU = PredDep.getSUnit(); + unsigned PredNum = PredSU->NodeNum; + if (R.DFSNodeData[PredNum].SubtreeID != PredNum) + return false; + + // Four is the magic number of successors before a node is considered a + // pinch point. + unsigned NumDataSucs = 0; + for (SUnit::const_succ_iterator SI = PredSU->Succs.begin(), + SE = PredSU->Succs.end(); SI != SE; ++SI) { + if (SI->getKind() == SDep::Data) { + if (++NumDataSucs >= 4) + return false; + } + } + if (CheckLimit && R.DFSNodeData[PredNum].InstrCount > R.SubtreeLimit) + return false; + R.DFSNodeData[PredNum].SubtreeID = Succ->NodeNum; + SubtreeClasses.join(Succ->NodeNum, PredNum); + return true; + } + /// Called by finalize() to record a connection between trees. 
void addConnection(unsigned FromTree, unsigned ToTree, unsigned Depth) { if (!Depth) return; - SmallVectorImpl<SchedDFSResult::Connection> &Connections = - R.SubtreeConnections[FromTree]; - for (SmallVectorImpl<SchedDFSResult::Connection>::iterator - I = Connections.begin(), E = Connections.end(); I != E; ++I) { - if (I->TreeID == ToTree) { - I->Level = std::max(I->Level, Depth); - return; + do { + SmallVectorImpl<SchedDFSResult::Connection> &Connections = + R.SubtreeConnections[FromTree]; + for (SmallVectorImpl<SchedDFSResult::Connection>::iterator + I = Connections.begin(), E = Connections.end(); I != E; ++I) { + if (I->TreeID == ToTree) { + I->Level = std::max(I->Level, Depth); + return; + } } - } - Connections.push_back(SchedDFSResult::Connection(ToTree, Depth)); + Connections.push_back(SchedDFSResult::Connection(ToTree, Depth)); + FromTree = R.DFSTreeData[FromTree].ParentTreeID; + } while (FromTree != SchedDFSResult::InvalidSubtreeID); } }; } // namespace llvm @@ -1157,28 +1228,44 @@ public: }; } // anonymous +static bool hasDataSucc(const SUnit *SU) { + for (SUnit::const_succ_iterator + SI = SU->Succs.begin(), SE = SU->Succs.end(); SI != SE; ++SI) { + if (SI->getKind() == SDep::Data && !SI->getSUnit()->isBoundaryNode()) + return true; + } + return false; +} + /// Compute an ILP metric for all nodes in the subDAG reachable via depth-first /// search from this root. -void SchedDFSResult::compute(ArrayRef<SUnit *> Roots) { +void SchedDFSResult::compute(ArrayRef<SUnit> SUnits) { if (!IsBottomUp) llvm_unreachable("Top-down ILP metric is unimplemented"); SchedDFSImpl Impl(*this); - for (ArrayRef<const SUnit*>::const_iterator - RootI = Roots.begin(), RootE = Roots.end(); RootI != RootE; ++RootI) { + for (ArrayRef<SUnit>::const_iterator - SI = SUnits.begin(), SE = SUnits.end(); SI != SE; ++SI) { + const SUnit *SU = &*SI; + if (Impl.isVisited(SU) || hasDataSucc(SU)) + continue; + SchedDAGReverseDFS DFS; - Impl.visitPreorder(*RootI); - DFS.follow(*RootI); + Impl.visitPreorder(SU); + DFS.follow(SU); for (;;) { // Traverse the leftmost path as far as possible. while (DFS.getPred() != DFS.getPredEnd()) { const SDep &PredDep = *DFS.getPred(); DFS.advance(); - // If the pred is already valid, skip it. We may preorder visit a node - // with InstrCount==0 more than once, but it won't affect heuristics - // because we don't care about cross edges to leaf copies. + // Ignore non-data edges. + if (PredDep.getKind() != SDep::Data + || PredDep.getSUnit()->isBoundaryNode()) { + continue; + } + // An already visited edge is a cross edge, assuming an acyclic DAG. if (Impl.isVisited(PredDep.getSUnit())) { - Impl.visitCross(PredDep, DFS.getCurr()); + Impl.visitCrossEdge(PredDep, DFS.getCurr()); continue; } Impl.visitPreorder(PredDep.getSUnit()); @@ -1187,7 +1274,9 @@ void SchedDFSResult::compute(ArrayRef<SUnit *> Roots) { // Visit the top of the stack in postorder and backtrack. const SUnit *Child = DFS.getCurr(); const SDep *PredDep = DFS.backtrack(); - Impl.visitPostorder(Child, PredDep, PredDep ?
DFS.getCurr() : 0); + Impl.visitPostorderNode(Child); + if (PredDep) + Impl.visitPostorderEdge(*PredDep, DFS.getCurr()); if (DFS.isComplete()) break; } diff --git a/lib/CodeGen/ScheduleDAGPrinter.cpp b/lib/CodeGen/ScheduleDAGPrinter.cpp index 6c50913..8ddb3e8 100644 --- a/lib/CodeGen/ScheduleDAGPrinter.cpp +++ b/lib/CodeGen/ScheduleDAGPrinter.cpp @@ -41,6 +41,10 @@ namespace llvm { return true; } + static bool isNodeHidden(const SUnit *Node) { + return (Node->NumPreds > 10 || Node->NumSuccs > 10); + } + static bool hasNodeAddressLabel(const SUnit *Node, const ScheduleDAG *Graph) { return true; diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index ff00d0d..ec52d7e 100644 --- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -2634,7 +2634,9 @@ SDValue DAGCombiner::visitAND(SDNode *N) { ISD::CondCode Result = ISD::getSetCCAndOperation(Op0, Op1, isInteger); if (Result != ISD::SETCC_INVALID && (!LegalOperations || - TLI.isCondCodeLegal(Result, LL.getSimpleValueType()))) + (TLI.isCondCodeLegal(Result, LL.getSimpleValueType()) && + TLI.isOperationLegal(ISD::SETCC, + TLI.getSetCCResultType(N0.getSimpleValueType()))))) return DAG.getSetCC(N->getDebugLoc(), N0.getValueType(), LL, LR, Result); } @@ -3144,7 +3146,9 @@ SDValue DAGCombiner::visitOR(SDNode *N) { ISD::CondCode Result = ISD::getSetCCOrOperation(Op0, Op1, isInteger); if (Result != ISD::SETCC_INVALID && (!LegalOperations || - TLI.isCondCodeLegal(Result, LL.getSimpleValueType()))) + (TLI.isCondCodeLegal(Result, LL.getSimpleValueType()) && + TLI.isOperationLegal(ISD::SETCC, + TLI.getSetCCResultType(N0.getValueType()))))) return DAG.getSetCC(N->getDebugLoc(), N0.getValueType(), LL, LR, Result); } @@ -5100,16 +5104,26 @@ SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) { // If we haven't found a load, we can't narrow it. Don't transform one with // multiple uses, this would require adding a new load. - if (!isa<LoadSDNode>(N0) || !N0.hasOneUse() || - // Don't change the width of a volatile load. - cast<LoadSDNode>(N0)->isVolatile()) + if (!isa<LoadSDNode>(N0) || !N0.hasOneUse()) + return SDValue(); + + // Don't change the width of a volatile load. + LoadSDNode *LN0 = cast<LoadSDNode>(N0); + if (LN0->isVolatile()) return SDValue(); // Verify that we are actually reducing a load width here. - if (cast<LoadSDNode>(N0)->getMemoryVT().getSizeInBits() < EVTBits) + if (LN0->getMemoryVT().getSizeInBits() < EVTBits) + return SDValue(); + + // For the transform to be legal, the load must produce only two values + // (the value loaded and the chain). Don't transform a pre-increment + // load, for example, which produces an extra value. Otherwise the + // transformation is not equivalent, and the downstream logic to replace + // uses gets things wrong. 
+ if (LN0->getNumValues() > 2) return SDValue(); - LoadSDNode *LN0 = cast<LoadSDNode>(N0); EVT PtrType = N0.getOperand(1).getValueType(); if (PtrType == MVT::Untyped || PtrType.isExtended()) @@ -5153,8 +5167,15 @@ SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) { EVT ShImmTy = getShiftAmountTy(Result.getValueType()); if (!isUIntN(ShImmTy.getSizeInBits(), ShLeftAmt)) ShImmTy = VT; - Result = DAG.getNode(ISD::SHL, N0.getDebugLoc(), VT, - Result, DAG.getConstant(ShLeftAmt, ShImmTy)); + // If the shift amount is as large as the result size (but, presumably, + // no larger than the source) then the useful bits of the result are + // zero; we can't simply return the shortened shift, because the result + // of that operation is undefined. + if (ShLeftAmt >= VT.getSizeInBits()) + Result = DAG.getConstant(0, VT); + else + Result = DAG.getNode(ISD::SHL, N0.getDebugLoc(), VT, + Result, DAG.getConstant(ShLeftAmt, ShImmTy)); } // Return the new loaded value. @@ -5340,6 +5361,38 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) { } } + // Fold a series of buildvector, bitcast, and truncate if possible. + // For example fold + // (2xi32 trunc (bitcast ((4xi32)buildvector x, x, y, y) 2xi64)) to + // (2xi32 (buildvector x, y)). + if (Level == AfterLegalizeVectorOps && VT.isVector() && + N0.getOpcode() == ISD::BITCAST && N0.hasOneUse() && + N0.getOperand(0).getOpcode() == ISD::BUILD_VECTOR && + N0.getOperand(0).hasOneUse()) { + + SDValue BuildVect = N0.getOperand(0); + EVT BuildVectEltTy = BuildVect.getValueType().getVectorElementType(); + EVT TruncVecEltTy = VT.getVectorElementType(); + + // Check that the element types match. + if (BuildVectEltTy == TruncVecEltTy) { + // Now we only need to compute the offset of the truncated elements. + unsigned BuildVecNumElts = BuildVect.getNumOperands(); + unsigned TruncVecNumElts = VT.getVectorNumElements(); + unsigned TruncEltOffset = BuildVecNumElts / TruncVecNumElts; + + assert((BuildVecNumElts % TruncVecNumElts) == 0 && + "Invalid number of elements"); + + SmallVector<SDValue, 8> Opnds; + for (unsigned i = 0, e = BuildVecNumElts; i != e; i += TruncEltOffset) + Opnds.push_back(BuildVect.getOperand(i)); + + return DAG.getNode(ISD::BUILD_VECTOR, N->getDebugLoc(), VT, &Opnds[0], + Opnds.size()); + } + } + // See if we can simplify the input to this truncate through knowledge that // only the low bits are being used. 
// For example "trunc (or (shl x, 8), y)" // -> trunc y @@ -5822,13 +5875,6 @@ SDValue DAGCombiner::visitFADD(SDNode *N) { N1, NewCFP); } - // (fadd (fadd x, x), x) -> (fmul 3.0, x) - if (!CFP00 && !CFP01 && N0.getOperand(0) == N0.getOperand(1) && - N0.getOperand(0) == N1) { - return DAG.getNode(ISD::FMUL, N->getDebugLoc(), VT, - N1, DAG.getConstantFP(3.0, VT)); - } - // (fadd (fmul c, x), (fadd x, x)) -> (fmul c+2, x) if (CFP00 && !CFP01 && N1.getOpcode() == ISD::FADD && N1.getOperand(0) == N1.getOperand(1) && @@ -5874,12 +5920,6 @@ SDValue DAGCombiner::visitFADD(SDNode *N) { N0, NewCFP); } - // (fadd x, (fadd x, x)) -> (fmul 3.0, x) - if (!CFP10 && !CFP11 && N1.getOperand(0) == N1.getOperand(1) && - N1.getOperand(0) == N0) { - return DAG.getNode(ISD::FMUL, N->getDebugLoc(), VT, - N0, DAG.getConstantFP(3.0, VT)); - } // (fadd (fadd x, x), (fmul c, x)) -> (fmul c+2, x) if (CFP10 && !CFP11 && N1.getOpcode() == ISD::FADD && @@ -5904,6 +5944,26 @@ SDValue DAGCombiner::visitFADD(SDNode *N) { } } + if (N0.getOpcode() == ISD::FADD) { + ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(N0.getOperand(0)); + // (fadd (fadd x, x), x) -> (fmul 3.0, x) + if (!CFP && N0.getOperand(0) == N0.getOperand(1) && + (N0.getOperand(0) == N1)) { + return DAG.getNode(ISD::FMUL, N->getDebugLoc(), VT, + N1, DAG.getConstantFP(3.0, VT)); + } + } + + if (N1.getOpcode() == ISD::FADD) { + ConstantFPSDNode *CFP10 = dyn_cast<ConstantFPSDNode>(N1.getOperand(0)); + // (fadd x, (fadd x, x)) -> (fmul 3.0, x) + if (!CFP10 && N1.getOperand(0) == N1.getOperand(1) && + N1.getOperand(0) == N0) { + return DAG.getNode(ISD::FMUL, N->getDebugLoc(), VT, + N0, DAG.getConstantFP(3.0, VT)); + } + } + // (fadd (fadd x, x), (fadd x, x)) -> (fmul 4.0, x) if (N0.getOpcode() == ISD::FADD && N1.getOpcode() == ISD::FADD && N0.getOperand(0) == N0.getOperand(1) && @@ -6735,18 +6795,24 @@ SDValue DAGCombiner::visitBRCOND(SDNode *N) { if (Op0.getOpcode() == Op1.getOpcode()) { // Avoid missing important xor optimizations. SDValue Tmp = visitXOR(TheXor); - if (Tmp.getNode() && Tmp.getNode() != TheXor) { - DEBUG(dbgs() << "\nReplacing.8 "; - TheXor->dump(&DAG); - dbgs() << "\nWith: "; - Tmp.getNode()->dump(&DAG); - dbgs() << '\n'); - WorkListRemover DeadNodes(*this); - DAG.ReplaceAllUsesOfValueWith(N1, Tmp); - removeFromWorkList(TheXor); - DAG.DeleteNode(TheXor); - return DAG.getNode(ISD::BRCOND, N->getDebugLoc(), - MVT::Other, Chain, Tmp, N2); + if (Tmp.getNode()) { + if (Tmp.getNode() != TheXor) { + DEBUG(dbgs() << "\nReplacing.8 "; + TheXor->dump(&DAG); + dbgs() << "\nWith: "; + Tmp.getNode()->dump(&DAG); + dbgs() << '\n'); + WorkListRemover DeadNodes(*this); + DAG.ReplaceAllUsesOfValueWith(N1, Tmp); + removeFromWorkList(TheXor); + DAG.DeleteNode(TheXor); + return DAG.getNode(ISD::BRCOND, N->getDebugLoc(), + MVT::Other, Chain, Tmp, N2); + } + + // visitXOR has changed XOR's operands. + Op0 = TheXor->getOperand(0); + Op1 = TheXor->getOperand(1); } } @@ -6894,6 +6960,16 @@ bool DAGCombiner::CombineToPreIndexedLoadStore(SDNode *N) { ISD::MemIndexedMode AM = ISD::UNINDEXED; if (!TLI.getPreIndexedAddressParts(N, BasePtr, Offset, AM, DAG)) return false; + + // Backends without true r+i pre-indexed forms may need to pass a + // constant base with a variable offset so that constant coercion + // will work with the patterns in canonical form. + bool Swapped = false; + if (isa<ConstantSDNode>(BasePtr)) { + std::swap(BasePtr, Offset); + Swapped = true; + } + + // Don't create an indexed load / store with zero offset.
if (isa<ConstantSDNode>(Offset) && cast<ConstantSDNode>(Offset)->isNullValue()) @@ -6919,6 +6995,48 @@ bool DAGCombiner::CombineToPreIndexedLoadStore(SDNode *N) { return false; } + // If the offset is a constant, there may be other adds of constants that + // can be folded with this one. We should do this to avoid having to keep + // a copy of the original base pointer. + SmallVector<SDNode *, 16> OtherUses; + if (isa<ConstantSDNode>(Offset)) + for (SDNode::use_iterator I = BasePtr.getNode()->use_begin(), + E = BasePtr.getNode()->use_end(); I != E; ++I) { + SDNode *Use = *I; + if (Use == Ptr.getNode()) + continue; + + if (Use->isPredecessorOf(N)) + continue; + + if (Use->getOpcode() != ISD::ADD && Use->getOpcode() != ISD::SUB) { + OtherUses.clear(); + break; + } + + SDValue Op0 = Use->getOperand(0), Op1 = Use->getOperand(1); + if (Op1.getNode() == BasePtr.getNode()) + std::swap(Op0, Op1); + assert(Op0.getNode() == BasePtr.getNode() && + "Use of ADD/SUB but not an operand"); + + if (!isa<ConstantSDNode>(Op1)) { + OtherUses.clear(); + break; + } + + // FIXME: In some cases, we can be smarter about this. + if (Op1.getValueType() != Offset.getValueType()) { + OtherUses.clear(); + break; + } + + OtherUses.push_back(Use); + } + + if (Swapped) + std::swap(BasePtr, Offset); + // Now check for #3 and #4. bool RealUse = false; @@ -6968,6 +7086,43 @@ bool DAGCombiner::CombineToPreIndexedLoadStore(SDNode *N) { // Finally, since the node is now dead, remove it from the graph. DAG.DeleteNode(N); + if (Swapped) + std::swap(BasePtr, Offset); + + // Replace other uses of BasePtr that can be updated to use Ptr + for (unsigned i = 0, e = OtherUses.size(); i != e; ++i) { + unsigned OffsetIdx = 1; + if (OtherUses[i]->getOperand(OffsetIdx).getNode() == BasePtr.getNode()) + OffsetIdx = 0; + assert(OtherUses[i]->getOperand(!OffsetIdx).getNode() == + BasePtr.getNode() && "Expected BasePtr operand"); + + APInt OV = + cast<ConstantSDNode>(Offset)->getAPIntValue(); + if (AM == ISD::PRE_DEC) + OV = -OV; + + ConstantSDNode *CN = + cast<ConstantSDNode>(OtherUses[i]->getOperand(OffsetIdx)); + APInt CNV = CN->getAPIntValue(); + if (OtherUses[i]->getOpcode() == ISD::SUB && OffsetIdx == 1) + CNV += OV; + else + CNV -= OV; + + SDValue NewOp1 = Result.getValue(isLoad ? 1 : 0); + SDValue NewOp2 = DAG.getConstant(CNV, CN->getValueType(0)); + if (OffsetIdx == 0) + std::swap(NewOp1, NewOp2); + + SDValue NewUse = DAG.getNode(OtherUses[i]->getOpcode(), + OtherUses[i]->getDebugLoc(), + OtherUses[i]->getValueType(0), NewOp1, NewOp2); + DAG.ReplaceAllUsesOfValueWith(SDValue(OtherUses[i], 0), NewUse); + removeFromWorkList(OtherUses[i]); + DAG.DeleteNode(OtherUses[i]); + } + // Replace the uses of Ptr with uses of the updated base value. DAG.ReplaceAllUsesOfValueWith(Ptr, Result.getValue(isLoad ? 1 : 0)); removeFromWorkList(Ptr.getNode()); @@ -7176,12 +7331,15 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) { // Try to infer better alignment information than the load already has. 
if (OptLevel != CodeGenOpt::None && LD->isUnindexed()) { if (unsigned Align = DAG.InferPtrAlignment(Ptr)) { - if (Align > LD->getAlignment()) - return DAG.getExtLoad(LD->getExtensionType(), N->getDebugLoc(), + if (Align > LD->getMemOperand()->getBaseAlignment()) { + SDValue NewLoad = + DAG.getExtLoad(LD->getExtensionType(), N->getDebugLoc(), LD->getValueType(0), Chain, Ptr, LD->getPointerInfo(), LD->getMemoryVT(), LD->isVolatile(), LD->isNonTemporal(), Align); + return CombineTo(N, NewLoad, SDValue(NewLoad.getNode(), 1), true); + } } } @@ -7576,6 +7734,8 @@ struct ConsecutiveMemoryChainSorter { bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) { EVT MemVT = St->getMemoryVT(); int64_t ElementSizeBytes = MemVT.getSizeInBits()/8; + bool NoVectors = DAG.getMachineFunction().getFunction()->getAttributes(). + hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat); // Don't merge vectors into wider inputs. if (MemVT.isVector() || !MemVT.isSimple()) @@ -7751,16 +7911,14 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) { // We only use vectors if the constant is known to be zero and the // function is not marked with the noimplicitfloat attribute. - if (NonZero || (DAG.getMachineFunction().getFunction()->getAttributes(). - hasAttribute(AttributeSet::FunctionIndex, - Attribute::NoImplicitFloat))) + if (NonZero || NoVectors) LastLegalVectorType = 0; // Check if we found a legal integer type to store. if (LastLegalType == 0 && LastLegalVectorType == 0) return false; - bool UseVector = LastLegalVectorType > LastLegalType; + bool UseVector = (LastLegalVectorType > LastLegalType) && !NoVectors; unsigned NumElem = UseVector ? LastLegalVectorType : LastLegalType; // Make sure we have something to merge. @@ -7913,7 +8071,7 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) { // All loads must share the same chain. if (LoadNodes[i].MemNode->getChain() != FirstChain) break; - + int64_t CurrAddress = LoadNodes[i].OffsetFromBase; if (CurrAddress - StartAddress != (ElementSizeBytes * i)) break; @@ -7933,7 +8091,7 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) { // Only use vector types if the vector type is larger than the integer type. // If they are the same, use integers. - bool UseVectorTy = LastLegalVectorType > LastLegalIntegerType; + bool UseVectorTy = LastLegalVectorType > LastLegalIntegerType && !NoVectors; unsigned LastLegalType = std::max(LastLegalVectorType, LastLegalIntegerType); // We add +1 here because the LastXXX variables refer to location while @@ -9104,11 +9262,6 @@ SDValue DAGCombiner::XformToShuffleWithZero(SDNode *N) { /// SimplifyVBinOp - Visit a binary vector operation, like ADD. SDValue DAGCombiner::SimplifyVBinOp(SDNode *N) { - // After legalize, the target may be depending on adds and other - // binary ops to provide legal ways to construct constants or other - // things. Simplifying them may result in a loss of legality. - if (LegalOperations) return SDValue(); - assert(N->getValueType(0).isVector() && "SimplifyVBinOp only works on vectors!"); @@ -9178,11 +9331,6 @@ SDValue DAGCombiner::SimplifyVBinOp(SDNode *N) { /// SimplifyVUnaryOp - Visit a unary vector operation, like FABS/FNEG. SDValue DAGCombiner::SimplifyVUnaryOp(SDNode *N) { - // After legalize, the target may be depending on adds and other - // binary ops to provide legal ways to construct constants or other - // things. Simplifying them may result in a loss of legality.
- if (LegalOperations) return SDValue(); - assert(N->getValueType(0).isVector() && "SimplifyVUnaryOp only works on vectors!"); @@ -9285,7 +9433,9 @@ bool DAGCombiner::SimplifySelectOps(SDNode *TheSelect, SDValue LHS, // src value info, don't do the transformation if the memory // locations are not in the default address space. LLD->getPointerInfo().getAddrSpace() != 0 || - RLD->getPointerInfo().getAddrSpace() != 0) + RLD->getPointerInfo().getAddrSpace() != 0 || + !TLI.isOperationLegalOrCustom(TheSelect->getOpcode(), + LLD->getBasePtr().getValueType())) return false; // Check that the select condition doesn't reach either load. If so, diff --git a/lib/CodeGen/SelectionDAG/FastISel.cpp b/lib/CodeGen/SelectionDAG/FastISel.cpp index 0d90a07..ff9b2ba 100644 --- a/lib/CodeGen/SelectionDAG/FastISel.cpp +++ b/lib/CodeGen/SelectionDAG/FastISel.cpp @@ -87,6 +87,27 @@ void FastISel::startNewBlock() { LastLocalValue = EmitStartPt; } +bool FastISel::LowerArguments() { + if (!FuncInfo.CanLowerReturn) + // Fall back to SDISel argument lowering code to deal with sret pointer + // parameter. + return false; + + if (!FastLowerArguments()) + return false; + + // Enter non-dead arguments into ValueMap for uses in non-entry BBs. + for (Function::const_arg_iterator I = FuncInfo.Fn->arg_begin(), + E = FuncInfo.Fn->arg_end(); I != E; ++I) { + if (!I->use_empty()) { + DenseMap<const Value *, unsigned>::iterator VI = LocalValueMap.find(I); + assert(VI != LocalValueMap.end() && "Missed an argument?"); + FuncInfo.ValueMap[I] = VI->second; + } + } + return true; +} + void FastISel::flushLocalValueMap() { LocalValueMap.clear(); LastLocalValue = EmitStartPt; @@ -684,7 +705,7 @@ bool FastISel::SelectCall(const User *I) { // all the values which have already been materialized, // appear after the call. It also makes sense to skip intrinsics // since they tend to be inlined. - if (!isa<IntrinsicInst>(F)) + if (!isa<IntrinsicInst>(Call)) flushLocalValueMap(); // An arbitrary call. Bail. @@ -801,7 +822,7 @@ FastISel::SelectInstruction(const Instruction *I) { } // First, try doing target-independent selection. - if (SelectOperator(I, I->getOpcode())) { + if (!SkipTargetIndependentFastISel() && SelectOperator(I, I->getOpcode())) { ++NumFastIselSuccessIndependent; DL = DebugLoc(); return true; } @@ -836,7 +857,8 @@ FastISel::SelectInstruction(const Instruction *I) { void FastISel::FastEmitBranch(MachineBasicBlock *MSucc, DebugLoc DL) { - if (FuncInfo.MBB->getBasicBlock()->size() > 1 && FuncInfo.MBB->isLayoutSuccessor(MSucc)) { + if (FuncInfo.MBB->getBasicBlock()->size() > 1 && + FuncInfo.MBB->isLayoutSuccessor(MSucc)) { // For more accurate line information if this is the only instruction // in the block then emit it, otherwise we have the unconditional // fall-through case, which needs no instructions.
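The LowerArguments() hook added above follows FastISel's usual protocol: any path that cannot be handled returns false and the caller drops back to full SelectionDAG lowering, with FastLowerArguments() defaulting to a refusal that targets may override (its default definition appears a little further down). A toy sketch of that contract; the names echo the patch, but the free functions here are invented:

#include <cstdio>

typedef bool (*FastLowerArgsFn)();

static bool defaultFastLowerArguments() { return false; } // base class: punt
static bool targetFastLowerArguments() { return true; }   // an opted-in target

static bool lowerArguments(bool CanLowerReturn, FastLowerArgsFn Hook) {
  if (!CanLowerReturn)
    return false; // e.g. an sret pointer parameter: let SDISel lower it
  return Hook();
}

int main() {
  if (!lowerArguments(true, defaultFastLowerArguments))
    std::puts("falling back to SelectionDAG argument lowering");
  if (lowerArguments(true, targetFastLowerArguments))
    std::puts("arguments lowered on the fast path");
  return 0;
}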
@@ -1067,6 +1089,10 @@ FastISel::FastISel(FunctionLoweringInfo &funcInfo, FastISel::~FastISel() {} +bool FastISel::FastLowerArguments() { + return false; +} + unsigned FastISel::FastEmit_(MVT, MVT, unsigned) { return 0; diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index 5eaf67e..f085e44 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -12,9 +12,9 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/SelectionDAG.h" -#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Triple.h" #include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineJumpTableInfo.h" @@ -102,7 +102,8 @@ private: SDNode *Node, bool isSigned); SDValue ExpandFPLibCall(SDNode *Node, RTLIB::Libcall Call_F32, RTLIB::Libcall Call_F64, RTLIB::Libcall Call_F80, - RTLIB::Libcall Call_F128, RTLIB::Libcall Call_PPCF128); + RTLIB::Libcall Call_F128, + RTLIB::Libcall Call_PPCF128); SDValue ExpandIntLibCall(SDNode *Node, bool isSigned, RTLIB::Libcall Call_I8, RTLIB::Libcall Call_I16, @@ -110,6 +111,7 @@ private: RTLIB::Libcall Call_I64, RTLIB::Libcall Call_I128); void ExpandDivRemLibCall(SDNode *Node, SmallVectorImpl<SDValue> &Results); + void ExpandSinCosLibCall(SDNode *Node, SmallVectorImpl<SDValue> &Results); SDValue EmitStackConvert(SDValue SrcOp, EVT SlotVT, EVT DestVT, DebugLoc dl); SDValue ExpandBUILD_VECTOR(SDNode *Node); @@ -1841,26 +1843,6 @@ SDValue SelectionDAGLegalize::ExpandBUILD_VECTOR(SDNode *Node) { return ExpandVectorBuildThroughStack(Node); } -static bool isInTailCallPosition(SelectionDAG &DAG, SDNode *Node, - SDValue &Chain, const TargetLowering &TLI) { - const Function *F = DAG.getMachineFunction().getFunction(); - - // Conservatively require the attributes of the call to match those of - // the return. Ignore noalias because it doesn't affect the call sequence. - Attribute CallerRetAttr = F->getAttributes().getRetAttributes(); - if (AttrBuilder(CallerRetAttr) - .removeAttribute(Attribute::NoAlias).hasAttributes()) - return false; - - // It's not safe to eliminate the sign / zero extension of the return value. - if (CallerRetAttr.hasAttribute(Attribute::ZExt) || - CallerRetAttr.hasAttribute(Attribute::SExt)) - return false; - - // Check if the only use is a function return node. - return TLI.isUsedByReturnOnly(Node, Chain); -} - // ExpandLibCall - Expand a node into a call to a libcall. If the result value // does not fit into a register, return the lo part and set the hi part to the // by-reg argument. If it does fit into a single register, return the result @@ -1891,7 +1873,7 @@ SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, SDNode *Node, // isTailCall may be true since the callee does not reference caller stack // frame. Check if it's in the right position. SDValue TCChain = InChain; - bool isTailCall = isInTailCallPosition(DAG, Node, TCChain, TLI); + bool isTailCall = TLI.isInTailCallPosition(DAG, Node, TCChain); if (isTailCall) InChain = TCChain; @@ -2115,6 +2097,120 @@ SelectionDAGLegalize::ExpandDivRemLibCall(SDNode *Node, Results.push_back(Rem); } +/// isSinCosLibcallAvailable - Return true if sincos libcall is available. 
+static bool isSinCosLibcallAvailable(SDNode *Node, const TargetLowering &TLI) { + RTLIB::Libcall LC; + switch (Node->getValueType(0).getSimpleVT().SimpleTy) { + default: llvm_unreachable("Unexpected request for libcall!"); + case MVT::f32: LC = RTLIB::SINCOS_F32; break; + case MVT::f64: LC = RTLIB::SINCOS_F64; break; + case MVT::f80: LC = RTLIB::SINCOS_F80; break; + case MVT::f128: LC = RTLIB::SINCOS_F128; break; + case MVT::ppcf128: LC = RTLIB::SINCOS_PPCF128; break; + } + return TLI.getLibcallName(LC) != 0; +} + +/// canCombineSinCosLibcall - Return true if sincos libcall is available and +/// can be used to combine sin and cos. +static bool canCombineSinCosLibcall(SDNode *Node, const TargetLowering &TLI, + const TargetMachine &TM) { + if (!isSinCosLibcallAvailable(Node, TLI)) + return false; + // GNU sin/cos functions set errno while sincos does not. Therefore + // combining sin and cos is only safe if unsafe-fpmath is enabled. + bool isGNU = Triple(TM.getTargetTriple()).getEnvironment() == Triple::GNU; + if (isGNU && !TM.Options.UnsafeFPMath) + return false; + return true; +} + +/// useSinCos - Only issue sincos libcall if both sin and cos are +/// needed. +static bool useSinCos(SDNode *Node) { + unsigned OtherOpcode = Node->getOpcode() == ISD::FSIN + ? ISD::FCOS : ISD::FSIN; + + SDValue Op0 = Node->getOperand(0); + for (SDNode::use_iterator UI = Op0.getNode()->use_begin(), + UE = Op0.getNode()->use_end(); UI != UE; ++UI) { + SDNode *User = *UI; + if (User == Node) + continue; + // The other user might have been turned into sincos already. + if (User->getOpcode() == OtherOpcode || User->getOpcode() == ISD::FSINCOS) + return true; + } + return false; +} + +/// ExpandSinCosLibCall - Issue libcalls to sincos to compute sin / cos +/// pairs. +void +SelectionDAGLegalize::ExpandSinCosLibCall(SDNode *Node, + SmallVectorImpl<SDValue> &Results) { + RTLIB::Libcall LC; + switch (Node->getValueType(0).getSimpleVT().SimpleTy) { + default: llvm_unreachable("Unexpected request for libcall!"); + case MVT::f32: LC = RTLIB::SINCOS_F32; break; + case MVT::f64: LC = RTLIB::SINCOS_F64; break; + case MVT::f80: LC = RTLIB::SINCOS_F80; break; + case MVT::f128: LC = RTLIB::SINCOS_F128; break; + case MVT::ppcf128: LC = RTLIB::SINCOS_PPCF128; break; + } + + // The input chain to this libcall is the entry node of the function. + // Legalizing the call will automatically add the previous call to the + // dependence. + SDValue InChain = DAG.getEntryNode(); + + EVT RetVT = Node->getValueType(0); + Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext()); + + TargetLowering::ArgListTy Args; + TargetLowering::ArgListEntry Entry; + + // Pass the argument. + Entry.Node = Node->getOperand(0); + Entry.Ty = RetTy; + Entry.isSExt = false; + Entry.isZExt = false; + Args.push_back(Entry); + + // Pass the return address of sin. + SDValue SinPtr = DAG.CreateStackTemporary(RetVT); + Entry.Node = SinPtr; + Entry.Ty = RetTy->getPointerTo(); + Entry.isSExt = false; + Entry.isZExt = false; + Args.push_back(Entry); + + // Also pass the return address of the cos. 
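For context on the argument setup here: the expansion targets a sincos-style entry point, e.g. GNU libm's sincos, which returns void and writes both results through pointer arguments, so the legalizer allocates two stack temporaries and loads the sin and cos values back out after the call. The callee-side contract in scalar form (sincos is a glibc extension, so this sketch assumes a GNU toolchain):

    #include <cmath>
    #include <cstdio>

    int main() {
      double s, c;
      // One call produces both results via out-pointers; this is what the
      // two stack slots in the expansion are standing in for.
      ::sincos(1.0, &s, &c);
      std::printf("sin=%f cos=%f\n", s, c);
    }

The two DAG.getLoad calls at the end of ExpandSinCosLibCall read those slots back as the two FSINCOS results.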
+ SDValue CosPtr = DAG.CreateStackTemporary(RetVT); + Entry.Node = CosPtr; + Entry.Ty = RetTy->getPointerTo(); + Entry.isSExt = false; + Entry.isZExt = false; + Args.push_back(Entry); + + SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC), + TLI.getPointerTy()); + + DebugLoc dl = Node->getDebugLoc(); + TargetLowering:: + CallLoweringInfo CLI(InChain, Type::getVoidTy(*DAG.getContext()), + false, false, false, false, + 0, TLI.getLibcallCallingConv(LC), /*isTailCall=*/false, + /*doesNotReturn=*/false, /*isReturnValueUsed=*/true, + Callee, Args, DAG, dl); + std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI); + + Results.push_back(DAG.getLoad(RetVT, dl, CallInfo.second, SinPtr, + MachinePointerInfo(), false, false, false, 0)); + Results.push_back(DAG.getLoad(RetVT, dl, CallInfo.second, CosPtr, + MachinePointerInfo(), false, false, false, 0)); +} + /// ExpandLegalINT_TO_FP - This function is responsible for legalizing a /// INT_TO_FP operation of the specified operand when the target requests that /// we expand it. At this point, we know that the result and operand types are @@ -2443,18 +2539,6 @@ SDValue SelectionDAGLegalize::ExpandBSWAP(SDValue Op, DebugLoc dl) { } } -/// SplatByte - Distribute ByteVal over NumBits bits. -// FIXME: Move this helper to a common place. -static APInt SplatByte(unsigned NumBits, uint8_t ByteVal) { - APInt Val = APInt(NumBits, ByteVal); - unsigned Shift = 8; - for (unsigned i = NumBits; i > 8; i >>= 1) { - Val = (Val << Shift) | Val; - Shift <<= 1; - } - return Val; -} - /// ExpandBitCount - Expand the specified bitcount instruction into operations. /// SDValue SelectionDAGLegalize::ExpandBitCount(unsigned Opc, SDValue Op, @@ -2472,10 +2556,10 @@ SDValue SelectionDAGLegalize::ExpandBitCount(unsigned Opc, SDValue Op, // This is the "best" algorithm from // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel - SDValue Mask55 = DAG.getConstant(SplatByte(Len, 0x55), VT); - SDValue Mask33 = DAG.getConstant(SplatByte(Len, 0x33), VT); - SDValue Mask0F = DAG.getConstant(SplatByte(Len, 0x0F), VT); - SDValue Mask01 = DAG.getConstant(SplatByte(Len, 0x01), VT); + SDValue Mask55 = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x55)), VT); + SDValue Mask33 = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x33)), VT); + SDValue Mask0F = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x0F)), VT); + SDValue Mask01 = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x01)), VT); // v = v - ((v >> 1) & 0x55555555...) Op = DAG.getNode(ISD::SUB, dl, VT, Op, @@ -2825,7 +2909,8 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) { SDValue True, False; EVT VT = Node->getOperand(0).getValueType(); EVT NVT = Node->getValueType(0); - APFloat apf(APInt::getNullValue(VT.getSizeInBits())); + APFloat apf(DAG.EVTToAPFloatSemantics(VT), + APInt::getNullValue(VT.getSizeInBits())); APInt x = APInt::getSignBit(NVT.getSizeInBits()); (void)apf.convertFromAPInt(x, false, APFloat::rmNearestTiesToEven); Tmp1 = DAG.getConstantFP(apf, VT); @@ -3060,14 +3145,33 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) { RTLIB::SQRT_PPCF128)); break; case ISD::FSIN: - Results.push_back(ExpandFPLibCall(Node, RTLIB::SIN_F32, RTLIB::SIN_F64, - RTLIB::SIN_F80, RTLIB::SIN_F128, - RTLIB::SIN_PPCF128)); + case ISD::FCOS: { + EVT VT = Node->getValueType(0); + bool isSIN = Node->getOpcode() == ISD::FSIN; + // Turn fsin / fcos into ISD::FSINCOS node if there are a pair of fsin / + // fcos which share the same operand and both are used. 
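An aside on the ExpandBitCount hunk above: the four constants are byte splats (0x55..., 0x33..., 0x0F..., 0x01...), which is exactly what the new APInt::getSplat(Len, APInt(8, K)) spelling builds, replacing the local SplatByte helper. The scalar form of the referenced bithack, written out for a fixed 32-bit width purely as an illustration (this is not LLVM code):

    #include <cstdint>
    #include <cstdio>

    static unsigned popcount32(uint32_t v) {
      v = v - ((v >> 1) & 0x55555555u);                 // 2-bit partial sums
      v = (v & 0x33333333u) + ((v >> 2) & 0x33333333u); // 4-bit partial sums
      v = (v + (v >> 4)) & 0x0F0F0F0Fu;                 // 8-bit partial sums
      return (v * 0x01010101u) >> 24;                   // add the four bytes
    }

    int main() {
      std::printf("%u\n", popcount32(0xDEADBEEFu)); // prints 24
    }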
+ if ((TLI.isOperationLegalOrCustom(ISD::FSINCOS, VT) || + canCombineSinCosLibcall(Node, TLI, TM)) + && useSinCos(Node)) { + SDVTList VTs = DAG.getVTList(VT, VT); + Tmp1 = DAG.getNode(ISD::FSINCOS, dl, VTs, Node->getOperand(0)); + if (!isSIN) + Tmp1 = Tmp1.getValue(1); + Results.push_back(Tmp1); + } else if (isSIN) { + Results.push_back(ExpandFPLibCall(Node, RTLIB::SIN_F32, RTLIB::SIN_F64, + RTLIB::SIN_F80, RTLIB::SIN_F128, + RTLIB::SIN_PPCF128)); + } else { + Results.push_back(ExpandFPLibCall(Node, RTLIB::COS_F32, RTLIB::COS_F64, + RTLIB::COS_F80, RTLIB::COS_F128, + RTLIB::COS_PPCF128)); + } break; - case ISD::FCOS: - Results.push_back(ExpandFPLibCall(Node, RTLIB::COS_F32, RTLIB::COS_F64, - RTLIB::COS_F80, RTLIB::COS_F128, - RTLIB::COS_PPCF128)); + } + case ISD::FSINCOS: + // Expand into sincos libcall. + ExpandSinCosLibCall(Node, Results); break; case ISD::FLOG: Results.push_back(ExpandFPLibCall(Node, RTLIB::LOG_F32, RTLIB::LOG_F64, @@ -3200,7 +3304,6 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) { case ISD::UREM: case ISD::SREM: { EVT VT = Node->getValueType(0); - SDVTList VTs = DAG.getVTList(VT, VT); bool isSigned = Node->getOpcode() == ISD::SREM; unsigned DivOpc = isSigned ? ISD::SDIV : ISD::UDIV; unsigned DivRemOpc = isSigned ? ISD::SDIVREM : ISD::UDIVREM; @@ -3211,6 +3314,7 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) { // If div is legal, it's better to do the normal expansion !TLI.isOperationLegalOrCustom(DivOpc, Node->getValueType(0)) && useDivRem(Node, isSigned, false))) { + SDVTList VTs = DAG.getVTList(VT, VT); Tmp1 = DAG.getNode(DivRemOpc, dl, VTs, Tmp2, Tmp3).getValue(1); } else if (TLI.isOperationLegalOrCustom(DivOpc, VT)) { // X % Y -> X-X/Y*Y diff --git a/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp index 92dc5a9..1ee2192 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp @@ -152,23 +152,23 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FADD(SDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue Ops[2] = { GetSoftenedFloat(N->getOperand(0)), GetSoftenedFloat(N->getOperand(1)) }; - return MakeLibCall(GetFPLibCall(N->getValueType(0), - RTLIB::ADD_F32, - RTLIB::ADD_F64, - RTLIB::ADD_F80, - RTLIB::ADD_PPCF128), - NVT, Ops, 2, false, N->getDebugLoc()); + return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), + RTLIB::ADD_F32, + RTLIB::ADD_F64, + RTLIB::ADD_F80, + RTLIB::ADD_PPCF128), + NVT, Ops, 2, false, N->getDebugLoc()); } SDValue DAGTypeLegalizer::SoftenFloatRes_FCEIL(SDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue Op = GetSoftenedFloat(N->getOperand(0)); - return MakeLibCall(GetFPLibCall(N->getValueType(0), - RTLIB::CEIL_F32, - RTLIB::CEIL_F64, - RTLIB::CEIL_F80, - RTLIB::CEIL_PPCF128), - NVT, &Op, 1, false, N->getDebugLoc()); + return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), + RTLIB::CEIL_F32, + RTLIB::CEIL_F64, + RTLIB::CEIL_F80, + RTLIB::CEIL_PPCF128), + NVT, &Op, 1, false, N->getDebugLoc()); } SDValue DAGTypeLegalizer::SoftenFloatRes_FCOPYSIGN(SDNode *N) { @@ -216,90 +216,90 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FCOPYSIGN(SDNode *N) { SDValue DAGTypeLegalizer::SoftenFloatRes_FCOS(SDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue Op = GetSoftenedFloat(N->getOperand(0)); - return MakeLibCall(GetFPLibCall(N->getValueType(0), - RTLIB::COS_F32, - RTLIB::COS_F64, - RTLIB::COS_F80, - 
RTLIB::COS_PPCF128), - NVT, &Op, 1, false, N->getDebugLoc()); + return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), + RTLIB::COS_F32, + RTLIB::COS_F64, + RTLIB::COS_F80, + RTLIB::COS_PPCF128), + NVT, &Op, 1, false, N->getDebugLoc()); } SDValue DAGTypeLegalizer::SoftenFloatRes_FDIV(SDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue Ops[2] = { GetSoftenedFloat(N->getOperand(0)), GetSoftenedFloat(N->getOperand(1)) }; - return MakeLibCall(GetFPLibCall(N->getValueType(0), - RTLIB::DIV_F32, - RTLIB::DIV_F64, - RTLIB::DIV_F80, - RTLIB::DIV_PPCF128), - NVT, Ops, 2, false, N->getDebugLoc()); + return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), + RTLIB::DIV_F32, + RTLIB::DIV_F64, + RTLIB::DIV_F80, + RTLIB::DIV_PPCF128), + NVT, Ops, 2, false, N->getDebugLoc()); } SDValue DAGTypeLegalizer::SoftenFloatRes_FEXP(SDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue Op = GetSoftenedFloat(N->getOperand(0)); - return MakeLibCall(GetFPLibCall(N->getValueType(0), - RTLIB::EXP_F32, - RTLIB::EXP_F64, - RTLIB::EXP_F80, - RTLIB::EXP_PPCF128), - NVT, &Op, 1, false, N->getDebugLoc()); + return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), + RTLIB::EXP_F32, + RTLIB::EXP_F64, + RTLIB::EXP_F80, + RTLIB::EXP_PPCF128), + NVT, &Op, 1, false, N->getDebugLoc()); } SDValue DAGTypeLegalizer::SoftenFloatRes_FEXP2(SDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue Op = GetSoftenedFloat(N->getOperand(0)); - return MakeLibCall(GetFPLibCall(N->getValueType(0), - RTLIB::EXP2_F32, - RTLIB::EXP2_F64, - RTLIB::EXP2_F80, - RTLIB::EXP2_PPCF128), - NVT, &Op, 1, false, N->getDebugLoc()); + return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), + RTLIB::EXP2_F32, + RTLIB::EXP2_F64, + RTLIB::EXP2_F80, + RTLIB::EXP2_PPCF128), + NVT, &Op, 1, false, N->getDebugLoc()); } SDValue DAGTypeLegalizer::SoftenFloatRes_FFLOOR(SDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue Op = GetSoftenedFloat(N->getOperand(0)); - return MakeLibCall(GetFPLibCall(N->getValueType(0), - RTLIB::FLOOR_F32, - RTLIB::FLOOR_F64, - RTLIB::FLOOR_F80, - RTLIB::FLOOR_PPCF128), - NVT, &Op, 1, false, N->getDebugLoc()); + return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), + RTLIB::FLOOR_F32, + RTLIB::FLOOR_F64, + RTLIB::FLOOR_F80, + RTLIB::FLOOR_PPCF128), + NVT, &Op, 1, false, N->getDebugLoc()); } SDValue DAGTypeLegalizer::SoftenFloatRes_FLOG(SDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue Op = GetSoftenedFloat(N->getOperand(0)); - return MakeLibCall(GetFPLibCall(N->getValueType(0), - RTLIB::LOG_F32, - RTLIB::LOG_F64, - RTLIB::LOG_F80, - RTLIB::LOG_PPCF128), - NVT, &Op, 1, false, N->getDebugLoc()); + return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), + RTLIB::LOG_F32, + RTLIB::LOG_F64, + RTLIB::LOG_F80, + RTLIB::LOG_PPCF128), + NVT, &Op, 1, false, N->getDebugLoc()); } SDValue DAGTypeLegalizer::SoftenFloatRes_FLOG2(SDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue Op = GetSoftenedFloat(N->getOperand(0)); - return MakeLibCall(GetFPLibCall(N->getValueType(0), - RTLIB::LOG2_F32, - RTLIB::LOG2_F64, - RTLIB::LOG2_F80, - RTLIB::LOG2_PPCF128), - NVT, &Op, 1, false, N->getDebugLoc()); + return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), + RTLIB::LOG2_F32, + RTLIB::LOG2_F64, + RTLIB::LOG2_F80, + RTLIB::LOG2_PPCF128), + NVT, &Op, 1, false, N->getDebugLoc()); } 
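All of the SoftenFloatRes_* changes in this file are the same mechanical move: the local MakeLibCall helper becomes TLI.makeLibCall so targets can reuse it. What "softening" itself means is easy to show in scalar terms: on an FPU-less target an f32 value lives in an i32 register, and every FP operation becomes a libcall on those bits. A rough sketch, where addf32_bits is a made-up stand-in for a soft-float routine such as compiler-rt's __addsf3, with the host FPU playing oracle:

    #include <cstdint>
    #include <cstring>
    #include <cstdio>

    // GetSoftenedFloat, conceptually: the f32 value travels as an i32.
    static uint32_t softenF32(float f)    { uint32_t i; std::memcpy(&i, &f, 4); return i; }
    static float    hardenF32(uint32_t i) { float f;    std::memcpy(&f, &i, 4); return f; }

    // Stand-in for the emitted libcall: arguments and result are i32 bits.
    static uint32_t addf32_bits(uint32_t a, uint32_t b) {
      return softenF32(hardenF32(a) + hardenF32(b));
    }

    int main() {
      uint32_t z = addf32_bits(softenF32(1.5f), softenF32(2.25f));
      std::printf("%f\n", hardenF32(z)); // 3.750000
    }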
SDValue DAGTypeLegalizer::SoftenFloatRes_FLOG10(SDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue Op = GetSoftenedFloat(N->getOperand(0)); - return MakeLibCall(GetFPLibCall(N->getValueType(0), - RTLIB::LOG10_F32, - RTLIB::LOG10_F64, - RTLIB::LOG10_F80, - RTLIB::LOG10_PPCF128), - NVT, &Op, 1, false, N->getDebugLoc()); + return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), + RTLIB::LOG10_F32, + RTLIB::LOG10_F64, + RTLIB::LOG10_F80, + RTLIB::LOG10_PPCF128), + NVT, &Op, 1, false, N->getDebugLoc()); } SDValue DAGTypeLegalizer::SoftenFloatRes_FMA(SDNode *N) { @@ -307,35 +307,35 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FMA(SDNode *N) { SDValue Ops[3] = { GetSoftenedFloat(N->getOperand(0)), GetSoftenedFloat(N->getOperand(1)), GetSoftenedFloat(N->getOperand(2)) }; - return MakeLibCall(GetFPLibCall(N->getValueType(0), - RTLIB::FMA_F32, - RTLIB::FMA_F64, - RTLIB::FMA_F80, - RTLIB::FMA_PPCF128), - NVT, Ops, 3, false, N->getDebugLoc()); + return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), + RTLIB::FMA_F32, + RTLIB::FMA_F64, + RTLIB::FMA_F80, + RTLIB::FMA_PPCF128), + NVT, Ops, 3, false, N->getDebugLoc()); } SDValue DAGTypeLegalizer::SoftenFloatRes_FMUL(SDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue Ops[2] = { GetSoftenedFloat(N->getOperand(0)), GetSoftenedFloat(N->getOperand(1)) }; - return MakeLibCall(GetFPLibCall(N->getValueType(0), - RTLIB::MUL_F32, - RTLIB::MUL_F64, - RTLIB::MUL_F80, - RTLIB::MUL_PPCF128), - NVT, Ops, 2, false, N->getDebugLoc()); + return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), + RTLIB::MUL_F32, + RTLIB::MUL_F64, + RTLIB::MUL_F80, + RTLIB::MUL_PPCF128), + NVT, Ops, 2, false, N->getDebugLoc()); } SDValue DAGTypeLegalizer::SoftenFloatRes_FNEARBYINT(SDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue Op = GetSoftenedFloat(N->getOperand(0)); - return MakeLibCall(GetFPLibCall(N->getValueType(0), - RTLIB::NEARBYINT_F32, - RTLIB::NEARBYINT_F64, - RTLIB::NEARBYINT_F80, - RTLIB::NEARBYINT_PPCF128), - NVT, &Op, 1, false, N->getDebugLoc()); + return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), + RTLIB::NEARBYINT_F32, + RTLIB::NEARBYINT_F64, + RTLIB::NEARBYINT_F80, + RTLIB::NEARBYINT_PPCF128), + NVT, &Op, 1, false, N->getDebugLoc()); } SDValue DAGTypeLegalizer::SoftenFloatRes_FNEG(SDNode *N) { @@ -343,12 +343,12 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FNEG(SDNode *N) { // Expand Y = FNEG(X) -> Y = SUB -0.0, X SDValue Ops[2] = { DAG.getConstantFP(-0.0, N->getValueType(0)), GetSoftenedFloat(N->getOperand(0)) }; - return MakeLibCall(GetFPLibCall(N->getValueType(0), - RTLIB::SUB_F32, - RTLIB::SUB_F64, - RTLIB::SUB_F80, - RTLIB::SUB_PPCF128), - NVT, Ops, 2, false, N->getDebugLoc()); + return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), + RTLIB::SUB_F32, + RTLIB::SUB_F64, + RTLIB::SUB_F80, + RTLIB::SUB_PPCF128), + NVT, Ops, 2, false, N->getDebugLoc()); } SDValue DAGTypeLegalizer::SoftenFloatRes_FP_EXTEND(SDNode *N) { @@ -356,7 +356,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FP_EXTEND(SDNode *N) { SDValue Op = N->getOperand(0); RTLIB::Libcall LC = RTLIB::getFPEXT(Op.getValueType(), N->getValueType(0)); assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_EXTEND!"); - return MakeLibCall(LC, NVT, &Op, 1, false, N->getDebugLoc()); + return TLI.makeLibCall(DAG, LC, NVT, &Op, 1, false, N->getDebugLoc()); } // FIXME: Should we just use 'normal' FP_EXTEND / FP_TRUNC instead of special @@ -364,8 +364,8 @@ 
SDValue DAGTypeLegalizer::SoftenFloatRes_FP_EXTEND(SDNode *N) { SDValue DAGTypeLegalizer::SoftenFloatRes_FP16_TO_FP32(SDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue Op = N->getOperand(0); - return MakeLibCall(RTLIB::FPEXT_F16_F32, NVT, &Op, 1, false, - N->getDebugLoc()); + return TLI.makeLibCall(DAG, RTLIB::FPEXT_F16_F32, NVT, &Op, 1, false, + N->getDebugLoc()); } SDValue DAGTypeLegalizer::SoftenFloatRes_FP_ROUND(SDNode *N) { @@ -373,19 +373,19 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FP_ROUND(SDNode *N) { SDValue Op = N->getOperand(0); RTLIB::Libcall LC = RTLIB::getFPROUND(Op.getValueType(), N->getValueType(0)); assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_ROUND!"); - return MakeLibCall(LC, NVT, &Op, 1, false, N->getDebugLoc()); + return TLI.makeLibCall(DAG, LC, NVT, &Op, 1, false, N->getDebugLoc()); } SDValue DAGTypeLegalizer::SoftenFloatRes_FPOW(SDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue Ops[2] = { GetSoftenedFloat(N->getOperand(0)), GetSoftenedFloat(N->getOperand(1)) }; - return MakeLibCall(GetFPLibCall(N->getValueType(0), - RTLIB::POW_F32, - RTLIB::POW_F64, - RTLIB::POW_F80, - RTLIB::POW_PPCF128), - NVT, Ops, 2, false, N->getDebugLoc()); + return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), + RTLIB::POW_F32, + RTLIB::POW_F64, + RTLIB::POW_F80, + RTLIB::POW_PPCF128), + NVT, Ops, 2, false, N->getDebugLoc()); } SDValue DAGTypeLegalizer::SoftenFloatRes_FPOWI(SDNode *N) { @@ -393,80 +393,80 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FPOWI(SDNode *N) { "Unsupported power type!"); EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue Ops[2] = { GetSoftenedFloat(N->getOperand(0)), N->getOperand(1) }; - return MakeLibCall(GetFPLibCall(N->getValueType(0), - RTLIB::POWI_F32, - RTLIB::POWI_F64, - RTLIB::POWI_F80, - RTLIB::POWI_PPCF128), - NVT, Ops, 2, false, N->getDebugLoc()); + return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), + RTLIB::POWI_F32, + RTLIB::POWI_F64, + RTLIB::POWI_F80, + RTLIB::POWI_PPCF128), + NVT, Ops, 2, false, N->getDebugLoc()); } SDValue DAGTypeLegalizer::SoftenFloatRes_FREM(SDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue Ops[2] = { GetSoftenedFloat(N->getOperand(0)), GetSoftenedFloat(N->getOperand(1)) }; - return MakeLibCall(GetFPLibCall(N->getValueType(0), - RTLIB::REM_F32, - RTLIB::REM_F64, - RTLIB::REM_F80, - RTLIB::REM_PPCF128), - NVT, Ops, 2, false, N->getDebugLoc()); + return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), + RTLIB::REM_F32, + RTLIB::REM_F64, + RTLIB::REM_F80, + RTLIB::REM_PPCF128), + NVT, Ops, 2, false, N->getDebugLoc()); } SDValue DAGTypeLegalizer::SoftenFloatRes_FRINT(SDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue Op = GetSoftenedFloat(N->getOperand(0)); - return MakeLibCall(GetFPLibCall(N->getValueType(0), - RTLIB::RINT_F32, - RTLIB::RINT_F64, - RTLIB::RINT_F80, - RTLIB::RINT_PPCF128), - NVT, &Op, 1, false, N->getDebugLoc()); + return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), + RTLIB::RINT_F32, + RTLIB::RINT_F64, + RTLIB::RINT_F80, + RTLIB::RINT_PPCF128), + NVT, &Op, 1, false, N->getDebugLoc()); } SDValue DAGTypeLegalizer::SoftenFloatRes_FSIN(SDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue Op = GetSoftenedFloat(N->getOperand(0)); - return MakeLibCall(GetFPLibCall(N->getValueType(0), - RTLIB::SIN_F32, - RTLIB::SIN_F64, - 
RTLIB::SIN_F80, - RTLIB::SIN_PPCF128), - NVT, &Op, 1, false, N->getDebugLoc()); + return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), + RTLIB::SIN_F32, + RTLIB::SIN_F64, + RTLIB::SIN_F80, + RTLIB::SIN_PPCF128), + NVT, &Op, 1, false, N->getDebugLoc()); } SDValue DAGTypeLegalizer::SoftenFloatRes_FSQRT(SDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue Op = GetSoftenedFloat(N->getOperand(0)); - return MakeLibCall(GetFPLibCall(N->getValueType(0), - RTLIB::SQRT_F32, - RTLIB::SQRT_F64, - RTLIB::SQRT_F80, - RTLIB::SQRT_PPCF128), - NVT, &Op, 1, false, N->getDebugLoc()); + return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), + RTLIB::SQRT_F32, + RTLIB::SQRT_F64, + RTLIB::SQRT_F80, + RTLIB::SQRT_PPCF128), + NVT, &Op, 1, false, N->getDebugLoc()); } SDValue DAGTypeLegalizer::SoftenFloatRes_FSUB(SDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue Ops[2] = { GetSoftenedFloat(N->getOperand(0)), GetSoftenedFloat(N->getOperand(1)) }; - return MakeLibCall(GetFPLibCall(N->getValueType(0), - RTLIB::SUB_F32, - RTLIB::SUB_F64, - RTLIB::SUB_F80, - RTLIB::SUB_PPCF128), - NVT, Ops, 2, false, N->getDebugLoc()); + return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), + RTLIB::SUB_F32, + RTLIB::SUB_F64, + RTLIB::SUB_F80, + RTLIB::SUB_PPCF128), + NVT, Ops, 2, false, N->getDebugLoc()); } SDValue DAGTypeLegalizer::SoftenFloatRes_FTRUNC(SDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDValue Op = GetSoftenedFloat(N->getOperand(0)); - return MakeLibCall(GetFPLibCall(N->getValueType(0), - RTLIB::TRUNC_F32, - RTLIB::TRUNC_F64, - RTLIB::TRUNC_F80, - RTLIB::TRUNC_PPCF128), - NVT, &Op, 1, false, N->getDebugLoc()); + return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), + RTLIB::TRUNC_F32, + RTLIB::TRUNC_F64, + RTLIB::TRUNC_F80, + RTLIB::TRUNC_PPCF128), + NVT, &Op, 1, false, N->getDebugLoc()); } SDValue DAGTypeLegalizer::SoftenFloatRes_LOAD(SDNode *N) { @@ -559,8 +559,9 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_XINT_TO_FP(SDNode *N) { // Sign/zero extend the argument if the libcall takes a larger type. SDValue Op = DAG.getNode(Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, dl, NVT, N->getOperand(0)); - return MakeLibCall(LC, TLI.getTypeToTransformTo(*DAG.getContext(), RVT), - &Op, 1, false, dl); + return TLI.makeLibCall(DAG, LC, + TLI.getTypeToTransformTo(*DAG.getContext(), RVT), + &Op, 1, false, dl); } @@ -607,92 +608,6 @@ bool DAGTypeLegalizer::SoftenFloatOperand(SDNode *N, unsigned OpNo) { return false; } -/// SoftenSetCCOperands - Soften the operands of a comparison. This code is -/// shared among BR_CC, SELECT_CC, and SETCC handlers. -void DAGTypeLegalizer::SoftenSetCCOperands(SDValue &NewLHS, SDValue &NewRHS, - ISD::CondCode &CCCode, DebugLoc dl) { - SDValue LHSInt = GetSoftenedFloat(NewLHS); - SDValue RHSInt = GetSoftenedFloat(NewRHS); - EVT VT = NewLHS.getValueType(); - - assert((VT == MVT::f32 || VT == MVT::f64) && "Unsupported setcc type!"); - - // Expand into one or more soft-fp libcall(s). - RTLIB::Libcall LC1 = RTLIB::UNKNOWN_LIBCALL, LC2 = RTLIB::UNKNOWN_LIBCALL; - switch (CCCode) { - case ISD::SETEQ: - case ISD::SETOEQ: - LC1 = (VT == MVT::f32) ? RTLIB::OEQ_F32 : RTLIB::OEQ_F64; - break; - case ISD::SETNE: - case ISD::SETUNE: - LC1 = (VT == MVT::f32) ? RTLIB::UNE_F32 : RTLIB::UNE_F64; - break; - case ISD::SETGE: - case ISD::SETOGE: - LC1 = (VT == MVT::f32) ? 
RTLIB::OGE_F32 : RTLIB::OGE_F64; - break; - case ISD::SETLT: - case ISD::SETOLT: - LC1 = (VT == MVT::f32) ? RTLIB::OLT_F32 : RTLIB::OLT_F64; - break; - case ISD::SETLE: - case ISD::SETOLE: - LC1 = (VT == MVT::f32) ? RTLIB::OLE_F32 : RTLIB::OLE_F64; - break; - case ISD::SETGT: - case ISD::SETOGT: - LC1 = (VT == MVT::f32) ? RTLIB::OGT_F32 : RTLIB::OGT_F64; - break; - case ISD::SETUO: - LC1 = (VT == MVT::f32) ? RTLIB::UO_F32 : RTLIB::UO_F64; - break; - case ISD::SETO: - LC1 = (VT == MVT::f32) ? RTLIB::O_F32 : RTLIB::O_F64; - break; - default: - LC1 = (VT == MVT::f32) ? RTLIB::UO_F32 : RTLIB::UO_F64; - switch (CCCode) { - case ISD::SETONE: - // SETONE = SETOLT | SETOGT - LC1 = (VT == MVT::f32) ? RTLIB::OLT_F32 : RTLIB::OLT_F64; - // Fallthrough - case ISD::SETUGT: - LC2 = (VT == MVT::f32) ? RTLIB::OGT_F32 : RTLIB::OGT_F64; - break; - case ISD::SETUGE: - LC2 = (VT == MVT::f32) ? RTLIB::OGE_F32 : RTLIB::OGE_F64; - break; - case ISD::SETULT: - LC2 = (VT == MVT::f32) ? RTLIB::OLT_F32 : RTLIB::OLT_F64; - break; - case ISD::SETULE: - LC2 = (VT == MVT::f32) ? RTLIB::OLE_F32 : RTLIB::OLE_F64; - break; - case ISD::SETUEQ: - LC2 = (VT == MVT::f32) ? RTLIB::OEQ_F32 : RTLIB::OEQ_F64; - break; - default: llvm_unreachable("Do not know how to soften this setcc!"); - } - } - - // Use the target specific return value for comparions lib calls. - EVT RetVT = TLI.getCmpLibcallReturnType(); - SDValue Ops[2] = { LHSInt, RHSInt }; - NewLHS = MakeLibCall(LC1, RetVT, Ops, 2, false/*sign irrelevant*/, dl); - NewRHS = DAG.getConstant(0, RetVT); - CCCode = TLI.getCmpLibcallCC(LC1); - if (LC2 != RTLIB::UNKNOWN_LIBCALL) { - SDValue Tmp = DAG.getNode(ISD::SETCC, dl, TLI.getSetCCResultType(RetVT), - NewLHS, NewRHS, DAG.getCondCode(CCCode)); - NewLHS = MakeLibCall(LC2, RetVT, Ops, 2, false/*sign irrelevant*/, dl); - NewLHS = DAG.getNode(ISD::SETCC, dl, TLI.getSetCCResultType(RetVT), NewLHS, - NewRHS, DAG.getCondCode(TLI.getCmpLibcallCC(LC2))); - NewLHS = DAG.getNode(ISD::OR, dl, Tmp.getValueType(), Tmp, NewLHS); - NewRHS = SDValue(); - } -} - SDValue DAGTypeLegalizer::SoftenFloatOp_BITCAST(SDNode *N) { return DAG.getNode(ISD::BITCAST, N->getDebugLoc(), N->getValueType(0), GetSoftenedFloat(N->getOperand(0))); @@ -706,15 +621,19 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_FP_ROUND(SDNode *N) { assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_ROUND libcall"); SDValue Op = GetSoftenedFloat(N->getOperand(0)); - return MakeLibCall(LC, RVT, &Op, 1, false, N->getDebugLoc()); + return TLI.makeLibCall(DAG, LC, RVT, &Op, 1, false, N->getDebugLoc()); } SDValue DAGTypeLegalizer::SoftenFloatOp_BR_CC(SDNode *N) { SDValue NewLHS = N->getOperand(2), NewRHS = N->getOperand(3); ISD::CondCode CCCode = cast<CondCodeSDNode>(N->getOperand(1))->get(); - SoftenSetCCOperands(NewLHS, NewRHS, CCCode, N->getDebugLoc()); - // If SoftenSetCCOperands returned a scalar, we need to compare the result + EVT VT = NewLHS.getValueType(); + NewLHS = GetSoftenedFloat(NewLHS); + NewRHS = GetSoftenedFloat(NewRHS); + TLI.softenSetCCOperands(DAG, VT, NewLHS, NewRHS, CCCode, N->getDebugLoc()); + + // If softenSetCCOperands returned a scalar, we need to compare the result // against zero to select between true and false values. 
if (NewRHS.getNode() == 0) { NewRHS = DAG.getConstant(0, NewLHS.getValueType()); @@ -733,7 +652,7 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_FP_TO_SINT(SDNode *N) { RTLIB::Libcall LC = RTLIB::getFPTOSINT(N->getOperand(0).getValueType(), RVT); assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_TO_SINT!"); SDValue Op = GetSoftenedFloat(N->getOperand(0)); - return MakeLibCall(LC, RVT, &Op, 1, false, N->getDebugLoc()); + return TLI.makeLibCall(DAG, LC, RVT, &Op, 1, false, N->getDebugLoc()); } SDValue DAGTypeLegalizer::SoftenFloatOp_FP_TO_UINT(SDNode *N) { @@ -741,22 +660,26 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_FP_TO_UINT(SDNode *N) { RTLIB::Libcall LC = RTLIB::getFPTOUINT(N->getOperand(0).getValueType(), RVT); assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_TO_UINT!"); SDValue Op = GetSoftenedFloat(N->getOperand(0)); - return MakeLibCall(LC, RVT, &Op, 1, false, N->getDebugLoc()); + return TLI.makeLibCall(DAG, LC, RVT, &Op, 1, false, N->getDebugLoc()); } SDValue DAGTypeLegalizer::SoftenFloatOp_FP32_TO_FP16(SDNode *N) { EVT RVT = N->getValueType(0); RTLIB::Libcall LC = RTLIB::FPROUND_F32_F16; SDValue Op = GetSoftenedFloat(N->getOperand(0)); - return MakeLibCall(LC, RVT, &Op, 1, false, N->getDebugLoc()); + return TLI.makeLibCall(DAG, LC, RVT, &Op, 1, false, N->getDebugLoc()); } SDValue DAGTypeLegalizer::SoftenFloatOp_SELECT_CC(SDNode *N) { SDValue NewLHS = N->getOperand(0), NewRHS = N->getOperand(1); ISD::CondCode CCCode = cast<CondCodeSDNode>(N->getOperand(4))->get(); - SoftenSetCCOperands(NewLHS, NewRHS, CCCode, N->getDebugLoc()); - // If SoftenSetCCOperands returned a scalar, we need to compare the result + EVT VT = NewLHS.getValueType(); + NewLHS = GetSoftenedFloat(NewLHS); + NewRHS = GetSoftenedFloat(NewRHS); + TLI.softenSetCCOperands(DAG, VT, NewLHS, NewRHS, CCCode, N->getDebugLoc()); + + // If softenSetCCOperands returned a scalar, we need to compare the result // against zero to select between true and false values. if (NewRHS.getNode() == 0) { NewRHS = DAG.getConstant(0, NewLHS.getValueType()); @@ -773,9 +696,13 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_SELECT_CC(SDNode *N) { SDValue DAGTypeLegalizer::SoftenFloatOp_SETCC(SDNode *N) { SDValue NewLHS = N->getOperand(0), NewRHS = N->getOperand(1); ISD::CondCode CCCode = cast<CondCodeSDNode>(N->getOperand(2))->get(); - SoftenSetCCOperands(NewLHS, NewRHS, CCCode, N->getDebugLoc()); - // If SoftenSetCCOperands returned a scalar, use it. + EVT VT = NewLHS.getValueType(); + NewLHS = GetSoftenedFloat(NewLHS); + NewRHS = GetSoftenedFloat(NewRHS); + TLI.softenSetCCOperands(DAG, VT, NewLHS, NewRHS, CCCode, N->getDebugLoc()); + + // If softenSetCCOperands returned a scalar, use it. 
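A gloss on the predicate table encoded by the removed SoftenSetCCOperands (and by its replacement, TLI.softenSetCCOperands): most FP conditions map onto a single comparison libcall, but SETONE has no direct equivalent, so it is expanded as SETOLT | SETOGT, two calls OR'ed together. The identity is easy to sanity-check in scalar C++, where ordinary comparisons model the ordered libcalls, including the NaN behavior:

    #include <cmath>
    #include <cstdio>

    // "one" = ordered and not equal: both operands non-NaN and different.
    static bool setone(float a, float b) { return (a < b) || (a > b); }

    int main() {
      float nan = std::nanf("");
      std::printf("%d %d %d\n",
                  setone(1.0f, 2.0f),  // 1: ordered, unequal
                  setone(1.0f, 1.0f),  // 0: equal
                  setone(1.0f, nan));  // 0: unordered, unlike SETUNE
    }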
if (NewRHS.getNode() == 0) { assert(NewLHS.getValueType() == N->getValueType(0) && "Unexpected setcc expansion!"); @@ -886,9 +813,11 @@ void DAGTypeLegalizer::ExpandFloatRes_ConstantFP(SDNode *N, SDValue &Lo, assert(NVT.getSizeInBits() == integerPartWidth && "Do not know how to expand this float constant!"); APInt C = cast<ConstantFPSDNode>(N)->getValueAPF().bitcastToAPInt(); - Lo = DAG.getConstantFP(APFloat(APInt(integerPartWidth, C.getRawData()[1])), + Lo = DAG.getConstantFP(APFloat(DAG.EVTToAPFloatSemantics(NVT), + APInt(integerPartWidth, C.getRawData()[1])), NVT); - Hi = DAG.getConstantFP(APFloat(APInt(integerPartWidth, C.getRawData()[0])), + Hi = DAG.getConstantFP(APFloat(DAG.EVTToAPFloatSemantics(NVT), + APInt(integerPartWidth, C.getRawData()[0])), NVT); } @@ -947,13 +876,13 @@ void DAGTypeLegalizer::ExpandFloatRes_FCOS(SDNode *N, void DAGTypeLegalizer::ExpandFloatRes_FDIV(SDNode *N, SDValue &Lo, SDValue &Hi) { SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) }; - SDValue Call = MakeLibCall(GetFPLibCall(N->getValueType(0), - RTLIB::DIV_F32, - RTLIB::DIV_F64, - RTLIB::DIV_F80, - RTLIB::DIV_PPCF128), - N->getValueType(0), Ops, 2, false, - N->getDebugLoc()); + SDValue Call = TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), + RTLIB::DIV_F32, + RTLIB::DIV_F64, + RTLIB::DIV_F80, + RTLIB::DIV_PPCF128), + N->getValueType(0), Ops, 2, false, + N->getDebugLoc()); GetPairElements(Call, Lo, Hi); } @@ -1014,26 +943,26 @@ void DAGTypeLegalizer::ExpandFloatRes_FLOG10(SDNode *N, void DAGTypeLegalizer::ExpandFloatRes_FMA(SDNode *N, SDValue &Lo, SDValue &Hi) { SDValue Ops[3] = { N->getOperand(0), N->getOperand(1), N->getOperand(2) }; - SDValue Call = MakeLibCall(GetFPLibCall(N->getValueType(0), - RTLIB::FMA_F32, - RTLIB::FMA_F64, - RTLIB::FMA_F80, - RTLIB::FMA_PPCF128), - N->getValueType(0), Ops, 3, false, - N->getDebugLoc()); + SDValue Call = TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), + RTLIB::FMA_F32, + RTLIB::FMA_F64, + RTLIB::FMA_F80, + RTLIB::FMA_PPCF128), + N->getValueType(0), Ops, 3, false, + N->getDebugLoc()); GetPairElements(Call, Lo, Hi); } void DAGTypeLegalizer::ExpandFloatRes_FMUL(SDNode *N, SDValue &Lo, SDValue &Hi) { SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) }; - SDValue Call = MakeLibCall(GetFPLibCall(N->getValueType(0), - RTLIB::MUL_F32, - RTLIB::MUL_F64, - RTLIB::MUL_F80, - RTLIB::MUL_PPCF128), - N->getValueType(0), Ops, 2, false, - N->getDebugLoc()); + SDValue Call = TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), + RTLIB::MUL_F32, + RTLIB::MUL_F64, + RTLIB::MUL_F80, + RTLIB::MUL_PPCF128), + N->getValueType(0), Ops, 2, false, + N->getDebugLoc()); GetPairElements(Call, Lo, Hi); } @@ -1060,7 +989,8 @@ void DAGTypeLegalizer::ExpandFloatRes_FP_EXTEND(SDNode *N, SDValue &Lo, SDValue &Hi) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); Hi = DAG.getNode(ISD::FP_EXTEND, N->getDebugLoc(), NVT, N->getOperand(0)); - Lo = DAG.getConstantFP(APFloat(APInt(NVT.getSizeInBits(), 0)), NVT); + Lo = DAG.getConstantFP(APFloat(DAG.EVTToAPFloatSemantics(NVT), + APInt(NVT.getSizeInBits(), 0)), NVT); } void DAGTypeLegalizer::ExpandFloatRes_FPOW(SDNode *N, @@ -1111,13 +1041,13 @@ void DAGTypeLegalizer::ExpandFloatRes_FSQRT(SDNode *N, void DAGTypeLegalizer::ExpandFloatRes_FSUB(SDNode *N, SDValue &Lo, SDValue &Hi) { SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) }; - SDValue Call = MakeLibCall(GetFPLibCall(N->getValueType(0), - RTLIB::SUB_F32, - RTLIB::SUB_F64, - RTLIB::SUB_F80, - RTLIB::SUB_PPCF128), - N->getValueType(0), Ops, 2, false, 
- N->getDebugLoc()); + SDValue Call = TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), + RTLIB::SUB_F32, + RTLIB::SUB_F64, + RTLIB::SUB_F80, + RTLIB::SUB_PPCF128), + N->getValueType(0), Ops, 2, false, + N->getDebugLoc()); GetPairElements(Call, Lo, Hi); } @@ -1155,7 +1085,8 @@ void DAGTypeLegalizer::ExpandFloatRes_LOAD(SDNode *N, SDValue &Lo, Chain = Hi.getValue(1); // The low part is zero. - Lo = DAG.getConstantFP(APFloat(APInt(NVT.getSizeInBits(), 0)), NVT); + Lo = DAG.getConstantFP(APFloat(DAG.EVTToAPFloatSemantics(NVT), + APInt(NVT.getSizeInBits(), 0)), NVT); // Modified the chain - switch anything that used the old chain to use the // new one. @@ -1179,7 +1110,8 @@ void DAGTypeLegalizer::ExpandFloatRes_XINT_TO_FP(SDNode *N, SDValue &Lo, // The integer can be represented exactly in an f64. Src = DAG.getNode(isSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, dl, MVT::i32, Src); - Lo = DAG.getConstantFP(APFloat(APInt(NVT.getSizeInBits(), 0)), NVT); + Lo = DAG.getConstantFP(APFloat(DAG.EVTToAPFloatSemantics(NVT), + APInt(NVT.getSizeInBits(), 0)), NVT); Hi = DAG.getNode(ISD::SINT_TO_FP, dl, NVT, Src); } else { RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL; @@ -1193,7 +1125,7 @@ void DAGTypeLegalizer::ExpandFloatRes_XINT_TO_FP(SDNode *N, SDValue &Lo, } assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported XINT_TO_FP!"); - Hi = MakeLibCall(LC, VT, &Src, 1, true, dl); + Hi = TLI.makeLibCall(DAG, LC, VT, &Src, 1, true, dl); GetPairElements(Hi, Lo, Hi); } @@ -1225,7 +1157,8 @@ void DAGTypeLegalizer::ExpandFloatRes_XINT_TO_FP(SDNode *N, SDValue &Lo, } Lo = DAG.getNode(ISD::FADD, dl, VT, Hi, - DAG.getConstantFP(APFloat(APInt(128, Parts)), + DAG.getConstantFP(APFloat(APFloat::PPCDoubleDouble, + APInt(128, Parts)), MVT::ppcf128)); Lo = DAG.getNode(ISD::SELECT_CC, dl, VT, Src, DAG.getConstant(0, SrcVT), Lo, Hi, DAG.getCondCode(ISD::SETLT)); @@ -1364,7 +1297,7 @@ SDValue DAGTypeLegalizer::ExpandFloatOp_FP_TO_SINT(SDNode *N) { RTLIB::Libcall LC = RTLIB::getFPTOSINT(N->getOperand(0).getValueType(), RVT); assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_TO_SINT!"); - return MakeLibCall(LC, RVT, &N->getOperand(0), 1, false, dl); + return TLI.makeLibCall(DAG, LC, RVT, &N->getOperand(0), 1, false, dl); } SDValue DAGTypeLegalizer::ExpandFloatOp_FP_TO_UINT(SDNode *N) { @@ -1377,7 +1310,7 @@ SDValue DAGTypeLegalizer::ExpandFloatOp_FP_TO_UINT(SDNode *N) { assert(N->getOperand(0).getValueType() == MVT::ppcf128 && "Logic only correct for ppcf128!"); const uint64_t TwoE31[] = {0x41e0000000000000LL, 0}; - APFloat APF = APFloat(APInt(128, TwoE31)); + APFloat APF = APFloat(APFloat::PPCDoubleDouble, APInt(128, TwoE31)); SDValue Tmp = DAG.getConstantFP(APF, MVT::ppcf128); // X>=2^31 ? (int)(X-2^31)+0x80000000 : (int)X // FIXME: generated code sucks. 
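The ppcf128 FP_TO_UINT path above preserves a classic trick worth a scalar gloss: when only a signed conversion is available, inputs at or above 2^31 are rebased by subtracting 2^31 before converting, and 0x80000000 is added back as an integer afterwards; the TwoE31 constant is that threshold. In plain C++ (2^31 = 2147483648):

    #include <cstdint>
    #include <cstdio>

    static uint32_t fptoui_via_sint(double x) {
      const double TwoE31 = 2147483648.0;
      if (x >= TwoE31) // rebase so the signed conversion cannot overflow
        return (uint32_t)(int32_t)(x - TwoE31) + 0x80000000u;
      return (uint32_t)(int32_t)x;
    }

    int main() {
      std::printf("%u %u\n", fptoui_via_sint(3.0),
                  fptoui_via_sint(4000000000.0)); // 3 4000000000
    }

The bias works because adding 0x80000000 in unsigned arithmetic exactly undoes the subtraction of 2^31 performed in the FP domain.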
@@ -1396,7 +1329,8 @@ SDValue DAGTypeLegalizer::ExpandFloatOp_FP_TO_UINT(SDNode *N) { RTLIB::Libcall LC = RTLIB::getFPTOUINT(N->getOperand(0).getValueType(), RVT); assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_TO_UINT!"); - return MakeLibCall(LC, N->getValueType(0), &N->getOperand(0), 1, false, dl); + return TLI.makeLibCall(DAG, LC, N->getValueType(0), &N->getOperand(0), 1, + false, dl); } SDValue DAGTypeLegalizer::ExpandFloatOp_SELECT_CC(SDNode *N) { diff --git a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index 5e33ef1..182b7f3 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -1767,7 +1767,8 @@ void DAGTypeLegalizer::ExpandIntRes_FP_TO_SINT(SDNode *N, SDValue &Lo, SDValue Op = N->getOperand(0); RTLIB::Libcall LC = RTLIB::getFPTOSINT(Op.getValueType(), VT); assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected fp-to-sint conversion!"); - SplitInteger(MakeLibCall(LC, VT, &Op, 1, true/*irrelevant*/, dl), Lo, Hi); + SplitInteger(TLI.makeLibCall(DAG, LC, VT, &Op, 1, true/*irrelevant*/, dl), + Lo, Hi); } void DAGTypeLegalizer::ExpandIntRes_FP_TO_UINT(SDNode *N, SDValue &Lo, @@ -1777,7 +1778,8 @@ void DAGTypeLegalizer::ExpandIntRes_FP_TO_UINT(SDNode *N, SDValue &Lo, SDValue Op = N->getOperand(0); RTLIB::Libcall LC = RTLIB::getFPTOUINT(Op.getValueType(), VT); assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected fp-to-uint conversion!"); - SplitInteger(MakeLibCall(LC, VT, &Op, 1, false/*irrelevant*/, dl), Lo, Hi); + SplitInteger(TLI.makeLibCall(DAG, LC, VT, &Op, 1, false/*irrelevant*/, dl), + Lo, Hi); } void DAGTypeLegalizer::ExpandIntRes_LOAD(LoadSDNode *N, @@ -1992,7 +1994,8 @@ void DAGTypeLegalizer::ExpandIntRes_MUL(SDNode *N, assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported MUL!"); SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) }; - SplitInteger(MakeLibCall(LC, VT, Ops, 2, true/*irrelevant*/, dl), Lo, Hi); + SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, 2, true/*irrelevant*/, dl), + Lo, Hi); } void DAGTypeLegalizer::ExpandIntRes_SADDSUBO(SDNode *Node, @@ -2054,7 +2057,7 @@ void DAGTypeLegalizer::ExpandIntRes_SDIV(SDNode *N, assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported SDIV!"); SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) }; - SplitInteger(MakeLibCall(LC, VT, Ops, 2, true, dl), Lo, Hi); + SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, 2, true, dl), Lo, Hi); } void DAGTypeLegalizer::ExpandIntRes_Shift(SDNode *N, @@ -2092,9 +2095,19 @@ void DAGTypeLegalizer::ExpandIntRes_Shift(SDNode *N, // Expand the subcomponents. SDValue LHSL, LHSH; GetExpandedInteger(N->getOperand(0), LHSL, LHSH); - - SDValue Ops[] = { LHSL, LHSH, N->getOperand(1) }; EVT VT = LHSL.getValueType(); + + // If the shift amount operand is coming from a vector legalization it may + // have an illegal type. Fix that first by casting the operand, otherwise + // the new SHL_PARTS operation would need further legalization. 
+ SDValue ShiftOp = N->getOperand(1); + MVT ShiftTy = TLI.getShiftAmountTy(VT); + assert(ShiftTy.getSizeInBits() >= Log2_32_Ceil(VT.getSizeInBits()) && + "ShiftAmountTy is too small to cover the range of this type!"); + if (ShiftOp.getValueType() != ShiftTy) + ShiftOp = DAG.getZExtOrTrunc(ShiftOp, dl, ShiftTy); + + SDValue Ops[] = { LHSL, LHSH, ShiftOp }; Lo = DAG.getNode(PartsOpc, dl, DAG.getVTList(VT, VT), Ops, 3); Hi = Lo.getValue(1); return; @@ -2138,7 +2151,7 @@ void DAGTypeLegalizer::ExpandIntRes_Shift(SDNode *N, if (LC != RTLIB::UNKNOWN_LIBCALL && TLI.getLibcallName(LC)) { SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) }; - SplitInteger(MakeLibCall(LC, VT, Ops, 2, isSigned, dl), Lo, Hi); + SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, 2, isSigned, dl), Lo, Hi); return; } @@ -2221,7 +2234,7 @@ void DAGTypeLegalizer::ExpandIntRes_SREM(SDNode *N, assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported SREM!"); SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) }; - SplitInteger(MakeLibCall(LC, VT, Ops, 2, true, dl), Lo, Hi); + SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, 2, true, dl), Lo, Hi); } void DAGTypeLegalizer::ExpandIntRes_TRUNCATE(SDNode *N, @@ -2361,7 +2374,7 @@ void DAGTypeLegalizer::ExpandIntRes_UDIV(SDNode *N, assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported UDIV!"); SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) }; - SplitInteger(MakeLibCall(LC, VT, Ops, 2, false, dl), Lo, Hi); + SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, 2, false, dl), Lo, Hi); } void DAGTypeLegalizer::ExpandIntRes_UREM(SDNode *N, @@ -2381,7 +2394,7 @@ void DAGTypeLegalizer::ExpandIntRes_UREM(SDNode *N, assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported UREM!"); SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) }; - SplitInteger(MakeLibCall(LC, VT, Ops, 2, false, dl), Lo, Hi); + SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, 2, false, dl), Lo, Hi); } void DAGTypeLegalizer::ExpandIntRes_ZERO_EXTEND(SDNode *N, @@ -2668,7 +2681,7 @@ SDValue DAGTypeLegalizer::ExpandIntOp_SINT_TO_FP(SDNode *N) { RTLIB::Libcall LC = RTLIB::getSINTTOFP(Op.getValueType(), DstVT); assert(LC != RTLIB::UNKNOWN_LIBCALL && "Don't know how to expand this SINT_TO_FP!"); - return MakeLibCall(LC, DstVT, &Op, 1, true, N->getDebugLoc()); + return TLI.makeLibCall(DAG, LC, DstVT, &Op, 1, true, N->getDebugLoc()); } SDValue DAGTypeLegalizer::ExpandIntOp_STORE(StoreSDNode *N, unsigned OpNo) { @@ -2764,17 +2777,6 @@ SDValue DAGTypeLegalizer::ExpandIntOp_TRUNCATE(SDNode *N) { return DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), N->getValueType(0), InL); } -static const fltSemantics *EVTToAPFloatSemantics(EVT VT) { - switch (VT.getSimpleVT().SimpleTy) { - default: llvm_unreachable("Unknown FP format"); - case MVT::f32: return &APFloat::IEEEsingle; - case MVT::f64: return &APFloat::IEEEdouble; - case MVT::f80: return &APFloat::x87DoubleExtended; - case MVT::f128: return &APFloat::IEEEquad; - case MVT::ppcf128: return &APFloat::PPCDoubleDouble; - } -} - SDValue DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP(SDNode *N) { SDValue Op = N->getOperand(0); EVT SrcVT = Op.getValueType(); @@ -2784,8 +2786,8 @@ SDValue DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP(SDNode *N) { // The following optimization is valid only if every value in SrcVT (when // treated as signed) is representable in DstVT. Check that the mantissa // size of DstVT is >= than the number of bits in SrcVT -1. 
- const fltSemantics *sem = EVTToAPFloatSemantics(DstVT); - if (APFloat::semanticsPrecision(*sem) >= SrcVT.getSizeInBits()-1 && + const fltSemantics &sem = DAG.EVTToAPFloatSemantics(DstVT); + if (APFloat::semanticsPrecision(sem) >= SrcVT.getSizeInBits()-1 && TLI.getOperationAction(ISD::SINT_TO_FP, SrcVT) == TargetLowering::Custom){ // Do a signed conversion then adjust the result. SDValue SignedConv = DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Op); @@ -2846,7 +2848,7 @@ SDValue DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP(SDNode *N) { RTLIB::Libcall LC = RTLIB::getUINTTOFP(SrcVT, DstVT); assert(LC != RTLIB::UNKNOWN_LIBCALL && "Don't know how to expand this UINT_TO_FP!"); - return MakeLibCall(LC, DstVT, &Op, 1, true, dl); + return TLI.makeLibCall(DAG, LC, DstVT, &Op, 1, true, dl); } SDValue DAGTypeLegalizer::ExpandIntOp_ATOMIC_STORE(SDNode *N) { diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp index 6aea2d8..e26d165 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp @@ -1020,50 +1020,20 @@ SDValue DAGTypeLegalizer::LibCallify(RTLIB::Libcall LC, SDNode *N, unsigned NumOps = N->getNumOperands(); DebugLoc dl = N->getDebugLoc(); if (NumOps == 0) { - return MakeLibCall(LC, N->getValueType(0), 0, 0, isSigned, dl); + return TLI.makeLibCall(DAG, LC, N->getValueType(0), 0, 0, isSigned, dl); } else if (NumOps == 1) { SDValue Op = N->getOperand(0); - return MakeLibCall(LC, N->getValueType(0), &Op, 1, isSigned, dl); + return TLI.makeLibCall(DAG, LC, N->getValueType(0), &Op, 1, isSigned, dl); } else if (NumOps == 2) { SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) }; - return MakeLibCall(LC, N->getValueType(0), Ops, 2, isSigned, dl); + return TLI.makeLibCall(DAG, LC, N->getValueType(0), Ops, 2, isSigned, dl); } SmallVector<SDValue, 8> Ops(NumOps); for (unsigned i = 0; i < NumOps; ++i) Ops[i] = N->getOperand(i); - return MakeLibCall(LC, N->getValueType(0), &Ops[0], NumOps, isSigned, dl); -} - -/// MakeLibCall - Generate a libcall taking the given operands as arguments and -/// returning a result of type RetVT. -SDValue DAGTypeLegalizer::MakeLibCall(RTLIB::Libcall LC, EVT RetVT, - const SDValue *Ops, unsigned NumOps, - bool isSigned, DebugLoc dl) { - TargetLowering::ArgListTy Args; - Args.reserve(NumOps); - - TargetLowering::ArgListEntry Entry; - for (unsigned i = 0; i != NumOps; ++i) { - Entry.Node = Ops[i]; - Entry.Ty = Entry.Node.getValueType().getTypeForEVT(*DAG.getContext()); - Entry.isSExt = isSigned; - Entry.isZExt = !isSigned; - Args.push_back(Entry); - } - SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC), - TLI.getPointerTy()); - - Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext()); - TargetLowering:: - CallLoweringInfo CLI(DAG.getEntryNode(), RetTy, isSigned, !isSigned, false, - false, 0, TLI.getLibcallCallingConv(LC), - /*isTailCall=*/false, - /*doesNotReturn=*/false, /*isReturnValueUsed=*/true, - Callee, Args, DAG, dl); - std::pair<SDValue,SDValue> CallInfo = TLI.LowerCallTo(CLI); - - return CallInfo.first; + return TLI.makeLibCall(DAG, LC, N->getValueType(0), + &Ops[0], NumOps, isSigned, dl); } // ExpandChainLibCall - Expand a node into a call to a libcall. 
Similar to diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/lib/CodeGen/SelectionDAG/LegalizeTypes.h index 8c53ba3..7de42ea 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -80,35 +80,35 @@ private: /// PromotedIntegers - For integer nodes that are below legal width, this map /// indicates what promoted value to use. - DenseMap<SDValue, SDValue> PromotedIntegers; + SmallDenseMap<SDValue, SDValue, 8> PromotedIntegers; /// ExpandedIntegers - For integer nodes that need to be expanded this map /// indicates which operands are the expanded version of the input. - DenseMap<SDValue, std::pair<SDValue, SDValue> > ExpandedIntegers; + SmallDenseMap<SDValue, std::pair<SDValue, SDValue>, 8> ExpandedIntegers; /// SoftenedFloats - For floating point nodes converted to integers of /// the same size, this map indicates the converted value to use. - DenseMap<SDValue, SDValue> SoftenedFloats; + SmallDenseMap<SDValue, SDValue, 8> SoftenedFloats; /// ExpandedFloats - For float nodes that need to be expanded this map /// indicates which operands are the expanded version of the input. - DenseMap<SDValue, std::pair<SDValue, SDValue> > ExpandedFloats; + SmallDenseMap<SDValue, std::pair<SDValue, SDValue>, 8> ExpandedFloats; /// ScalarizedVectors - For nodes that are <1 x ty>, this map indicates the /// scalar value of type 'ty' to use. - DenseMap<SDValue, SDValue> ScalarizedVectors; + SmallDenseMap<SDValue, SDValue, 8> ScalarizedVectors; /// SplitVectors - For nodes that need to be split this map indicates /// which operands are the expanded version of the input. - DenseMap<SDValue, std::pair<SDValue, SDValue> > SplitVectors; + SmallDenseMap<SDValue, std::pair<SDValue, SDValue>, 8> SplitVectors; /// WidenedVectors - For vector nodes that need to be widened, indicates /// the widened value to use. - DenseMap<SDValue, SDValue> WidenedVectors; + SmallDenseMap<SDValue, SDValue, 8> WidenedVectors; /// ReplacedValues - For values that have been replaced with another, /// indicates the replacement value to use. - DenseMap<SDValue, SDValue> ReplacedValues; + SmallDenseMap<SDValue, SDValue, 8> ReplacedValues; /// Worklist - This defines a worklist of nodes to process. 
In order to be /// pushed onto this worklist, all operands of a node must have already been @@ -159,9 +159,6 @@ private: SDValue GetVectorElementPointer(SDValue VecPtr, EVT EltVT, SDValue Index); SDValue JoinIntegers(SDValue Lo, SDValue Hi); SDValue LibCallify(RTLIB::Libcall LC, SDNode *N, bool isSigned); - SDValue MakeLibCall(RTLIB::Libcall LC, EVT RetVT, - const SDValue *Ops, unsigned NumOps, bool isSigned, - DebugLoc dl); std::pair<SDValue, SDValue> ExpandChainLibCall(RTLIB::Libcall LC, SDNode *Node, bool isSigned); @@ -433,9 +430,6 @@ private: SDValue SoftenFloatOp_SETCC(SDNode *N); SDValue SoftenFloatOp_STORE(SDNode *N, unsigned OpNo); - void SoftenSetCCOperands(SDValue &NewLHS, SDValue &NewRHS, - ISD::CondCode &CCCode, DebugLoc dl); - //===--------------------------------------------------------------------===// // Float Expansion Support: LegalizeFloatTypes.cpp //===--------------------------------------------------------------------===// diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index de6bbe3..c6e066e 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -40,7 +40,7 @@ class VectorLegalizer { /// LegalizedNodes - For nodes that are of legal width, and that have more /// than one use, this map indicates what regularized operand to use. This /// allows us to avoid legalizing the same thing more than once. - DenseMap<SDValue, SDValue> LegalizedNodes; + SmallDenseMap<SDValue, SDValue, 64> LegalizedNodes; // Adds a node to the translation cache void AddLegalizedOperand(SDValue From, SDValue To) { @@ -61,6 +61,8 @@ class VectorLegalizer { // Implements expansion for UINT_TO_FLOAT; falls back to UnrollVectorOp if // SINT_TO_FLOAT and SHR on vectors isn't legal. SDValue ExpandUINT_TO_FLOAT(SDValue Op); + // Implement expansion for SIGN_EXTEND_INREG using SRL and SRA. + SDValue ExpandSEXTINREG(SDValue Op); // Implement vselect in terms of XOR, AND, OR when blend is not supported // by the target. SDValue ExpandVSELECT(SDValue Op); @@ -83,6 +85,25 @@ class VectorLegalizer { }; bool VectorLegalizer::Run() { + // Before we start legalizing vector nodes, check if there are any vectors. + bool HasVectors = false; + for (SelectionDAG::allnodes_iterator I = DAG.allnodes_begin(), + E = prior(DAG.allnodes_end()); I != llvm::next(E); ++I) { + // Check if the values of the nodes contain vectors. We don't need to check + // the operands because we are going to check their values at some point. + for (SDNode::value_iterator J = I->value_begin(), E = I->value_end(); + J != E; ++J) + HasVectors |= J->isVector(); + + // If we found a vector node we can start the legalization. + if (HasVectors) + break; + } + + // If this basic block has no vectors then no need to legalize vectors. + if (!HasVectors) + return false; + // The legalize process is inherently a bottom-up recursive process (users // legalize their uses before themselves). Given infinite stack space, we // could just start legalizing on the root and traverse the whole graph. 
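The HasVectors pre-scan added to VectorLegalizer::Run above is purely a compile-time guard: most basic blocks contain no vector values at all, and one cheap pass over the nodes proves that before the expensive bottom-up walk starts. The shape of the guard, with hypothetical stand-in types:

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    struct Node { bool producesVector; };

    static bool legalizeVectors(const std::vector<Node> &block) {
      // Cheap scan first; bail out before any per-node legalization work.
      if (std::none_of(block.begin(), block.end(),
                       [](const Node &n) { return n.producesVector; }))
        return false; // nothing to do, DAG unchanged
      // ... the real bottom-up legalization would run here ...
      return true;
    }

    int main() {
      std::printf("%d\n", legalizeVectors({{false}, {false}, {false}})); // 0
    }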
@@ -262,7 +283,9 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
       // FALL THROUGH
     }
   case TargetLowering::Expand:
-    if (Node->getOpcode() == ISD::VSELECT)
+    if (Node->getOpcode() == ISD::SIGN_EXTEND_INREG)
+      Result = ExpandSEXTINREG(Op);
+    else if (Node->getOpcode() == ISD::VSELECT)
       Result = ExpandVSELECT(Op);
     else if (Node->getOpcode() == ISD::SELECT)
       Result = ExpandSELECT(Op);
@@ -359,30 +382,135 @@ SDValue VectorLegalizer::ExpandLoad(SDValue Op) {
   EVT SrcVT = LD->getMemoryVT();
   ISD::LoadExtType ExtType = LD->getExtensionType();

-  SmallVector<SDValue, 8> LoadVals;
+  SmallVector<SDValue, 8> Vals;
   SmallVector<SDValue, 8> LoadChains;
   unsigned NumElem = SrcVT.getVectorNumElements();
-  unsigned Stride = SrcVT.getScalarType().getSizeInBits()/8;
-  for (unsigned Idx=0; Idx<NumElem; Idx++) {
-    SDValue ScalarLoad = DAG.getExtLoad(ExtType, dl,
-              Op.getNode()->getValueType(0).getScalarType(),
-              Chain, BasePTR, LD->getPointerInfo().getWithOffset(Idx * Stride),
-              SrcVT.getScalarType(),
-              LD->isVolatile(), LD->isNonTemporal(),
-              LD->getAlignment());
+  EVT SrcEltVT = SrcVT.getScalarType();
+  EVT DstEltVT = Op.getNode()->getValueType(0).getScalarType();
+
+  if (SrcVT.getVectorNumElements() > 1 && !SrcEltVT.isByteSized()) {
+    // When the elements in a vector are not byte-addressable, we cannot
+    // directly load each element by advancing the pointer, which can only
+    // address bytes. Instead, we load all the significant words, mask bits
+    // off, and concatenate them to form each element. Finally, they are
+    // extended to the destination scalar type to build the destination
+    // vector.
+    EVT WideVT = TLI.getPointerTy();
+
+    assert(WideVT.isRound() &&
+           "Cannot handle the case where the widest integer is not a power"
+           " of 2.");
+    assert(WideVT.bitsGE(SrcEltVT) &&
+           "Type is not legalized?");
+
+    unsigned WideBytes = WideVT.getStoreSize();
+    unsigned Offset = 0;
+    unsigned RemainingBytes = SrcVT.getStoreSize();
+    SmallVector<SDValue, 8> LoadVals;
+
+    while (RemainingBytes > 0) {
+      SDValue ScalarLoad;
+      unsigned LoadBytes = WideBytes;
+
+      if (RemainingBytes >= LoadBytes) {
+        ScalarLoad = DAG.getLoad(WideVT, dl, Chain, BasePTR,
+                                 LD->getPointerInfo().getWithOffset(Offset),
+                                 LD->isVolatile(), LD->isNonTemporal(),
+                                 LD->isInvariant(), LD->getAlignment());
+      } else {
+        EVT LoadVT = WideVT;
+        while (RemainingBytes < LoadBytes) {
+          LoadBytes >>= 1; // Reduce the load size by half.
+          LoadVT = EVT::getIntegerVT(*DAG.getContext(), LoadBytes << 3);
+        }
+        ScalarLoad = DAG.getExtLoad(ISD::EXTLOAD, dl, WideVT, Chain, BasePTR,
                                    LD->getPointerInfo().getWithOffset(Offset),
+                                    LoadVT, LD->isVolatile(),
+                                    LD->isNonTemporal(), LD->getAlignment());
+      }
-    BasePTR = DAG.getNode(ISD::ADD, dl, BasePTR.getValueType(), BasePTR,
-                       DAG.getIntPtrConstant(Stride));
+      RemainingBytes -= LoadBytes;
+      Offset += LoadBytes;
+      BasePTR = DAG.getNode(ISD::ADD, dl, BasePTR.getValueType(), BasePTR,
+                            DAG.getIntPtrConstant(LoadBytes));
+
+      LoadVals.push_back(ScalarLoad.getValue(0));
+      LoadChains.push_back(ScalarLoad.getValue(1));
+    }
+
+    // Extract bits, pack and extend/trunc them into destination type.
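The extraction step that follows is the core of the new non-byte-sized path: each SrcEltBits-wide element is cut out of a loaded wide word by a shift and a mask, and an element straddling a word boundary is stitched together from two pieces. In scalar miniature, unpacking 3-bit elements from one 16-bit word (the widths here are chosen only for the example):

    #include <cstdint>
    #include <cstdio>

    int main() {
      const unsigned EltBits = 3;
      const uint16_t Mask = (1u << EltBits) - 1; // SrcEltBitMask analogue
      uint16_t Wide = 0x1ACB;                    // packed 3-bit elements
      for (unsigned i = 0; i < 5; ++i) {         // five elements fit in 15 bits
        unsigned elt = (Wide >> (i * EltBits)) & Mask; // SRL then AND
        std::printf("%u ", elt);
      }
      std::printf("\n"); // 3 1 3 5 1
    }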
+ unsigned SrcEltBits = SrcEltVT.getSizeInBits(); + SDValue SrcEltBitMask = DAG.getConstant((1U << SrcEltBits) - 1, WideVT); + + unsigned BitOffset = 0; + unsigned WideIdx = 0; + unsigned WideBits = WideVT.getSizeInBits(); + + for (unsigned Idx = 0; Idx != NumElem; ++Idx) { + SDValue Lo, Hi, ShAmt; + + if (BitOffset < WideBits) { + ShAmt = DAG.getConstant(BitOffset, TLI.getShiftAmountTy(WideVT)); + Lo = DAG.getNode(ISD::SRL, dl, WideVT, LoadVals[WideIdx], ShAmt); + Lo = DAG.getNode(ISD::AND, dl, WideVT, Lo, SrcEltBitMask); + } - LoadVals.push_back(ScalarLoad.getValue(0)); - LoadChains.push_back(ScalarLoad.getValue(1)); + BitOffset += SrcEltBits; + if (BitOffset >= WideBits) { + WideIdx++; + Offset -= WideBits; + if (Offset > 0) { + ShAmt = DAG.getConstant(SrcEltBits - Offset, + TLI.getShiftAmountTy(WideVT)); + Hi = DAG.getNode(ISD::SHL, dl, WideVT, LoadVals[WideIdx], ShAmt); + Hi = DAG.getNode(ISD::AND, dl, WideVT, Hi, SrcEltBitMask); + } + } + + if (Hi.getNode()) + Lo = DAG.getNode(ISD::OR, dl, WideVT, Lo, Hi); + + switch (ExtType) { + default: llvm_unreachable("Unknown extended-load op!"); + case ISD::EXTLOAD: + Lo = DAG.getAnyExtOrTrunc(Lo, dl, DstEltVT); + break; + case ISD::ZEXTLOAD: + Lo = DAG.getZExtOrTrunc(Lo, dl, DstEltVT); + break; + case ISD::SEXTLOAD: + ShAmt = DAG.getConstant(WideBits - SrcEltBits, + TLI.getShiftAmountTy(WideVT)); + Lo = DAG.getNode(ISD::SHL, dl, WideVT, Lo, ShAmt); + Lo = DAG.getNode(ISD::SRA, dl, WideVT, Lo, ShAmt); + Lo = DAG.getSExtOrTrunc(Lo, dl, DstEltVT); + break; + } + Vals.push_back(Lo); + } + } else { + unsigned Stride = SrcVT.getScalarType().getSizeInBits()/8; + + for (unsigned Idx=0; Idx<NumElem; Idx++) { + SDValue ScalarLoad = DAG.getExtLoad(ExtType, dl, + Op.getNode()->getValueType(0).getScalarType(), + Chain, BasePTR, LD->getPointerInfo().getWithOffset(Idx * Stride), + SrcVT.getScalarType(), + LD->isVolatile(), LD->isNonTemporal(), + LD->getAlignment()); + + BasePTR = DAG.getNode(ISD::ADD, dl, BasePTR.getValueType(), BasePTR, + DAG.getIntPtrConstant(Stride)); + + Vals.push_back(ScalarLoad.getValue(0)); + LoadChains.push_back(ScalarLoad.getValue(1)); + } } SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &LoadChains[0], LoadChains.size()); SDValue Value = DAG.getNode(ISD::BUILD_VECTOR, dl, - Op.getNode()->getValueType(0), &LoadVals[0], LoadVals.size()); + Op.getNode()->getValueType(0), &Vals[0], Vals.size()); AddLegalizedOperand(Op.getValue(0), Value); AddLegalizedOperand(Op.getValue(1), NewChain); @@ -501,6 +629,26 @@ SDValue VectorLegalizer::ExpandSELECT(SDValue Op) { return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Val); } +SDValue VectorLegalizer::ExpandSEXTINREG(SDValue Op) { + EVT VT = Op.getValueType(); + + // Make sure that the SRA and SHL instructions are available. + if (TLI.getOperationAction(ISD::SRA, VT) == TargetLowering::Expand || + TLI.getOperationAction(ISD::SHL, VT) == TargetLowering::Expand) + return DAG.UnrollVectorOp(Op.getNode()); + + DebugLoc DL = Op.getDebugLoc(); + EVT OrigTy = cast<VTSDNode>(Op->getOperand(1))->getVT(); + + unsigned BW = VT.getScalarType().getSizeInBits(); + unsigned OrigBW = OrigTy.getScalarType().getSizeInBits(); + SDValue ShiftSz = DAG.getConstant(BW - OrigBW, VT); + + Op = Op.getOperand(0); + Op = DAG.getNode(ISD::SHL, DL, VT, Op, ShiftSz); + return DAG.getNode(ISD::SRA, DL, VT, Op, ShiftSz); +} + SDValue VectorLegalizer::ExpandVSELECT(SDValue Op) { // Implement VSELECT in terms of XOR, AND, OR // on platforms which do not support blend natively. 
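ExpandSEXTINREG above needs nothing but a vector SHL and SRA because in-register sign extension is just the two-shift identity. The scalar version, sign-extending the low 8 bits of a 32-bit lane (this relies on arithmetic right shift of signed values, which mainstream compilers provide):

    #include <cstdint>
    #include <cstdio>

    static int32_t sext_inreg8(int32_t v) {
      const int Sh = 32 - 8;                     // BW - OrigBW, as above
      return (int32_t)((uint32_t)v << Sh) >> Sh; // SHL, then arithmetic SRA
    }

    int main() {
      std::printf("%d %d\n", sext_inreg8(0x80), sext_inreg8(0x7F)); // -128 127
    }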
diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp index 31b9bf3..addfccb 100644 --- a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp +++ b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp @@ -21,6 +21,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/ScheduleHazardRecognizer.h" #include "llvm/CodeGen/SelectionDAGISel.h" #include "llvm/IR/DataLayout.h" @@ -142,6 +143,12 @@ private: std::vector<SUnit*> LiveRegDefs; std::vector<SUnit*> LiveRegGens; + // Collect interferences between physical register use/defs. + // Each interference is an SUnit and set of physical registers. + SmallVector<SUnit*, 4> Interferences; + typedef DenseMap<SUnit*, SmallVector<unsigned, 4> > LRegsMapT; + LRegsMapT LRegsMap; + /// Topo - A topological ordering for SUnits which permits fast IsReachable /// and similar queries. ScheduleDAGTopologicalSort Topo; @@ -225,6 +232,8 @@ private: SmallVector<SUnit*, 2>&); bool DelayForLiveRegsBottomUp(SUnit*, SmallVector<unsigned, 4>&); + void releaseInterferences(unsigned Reg = 0); + SUnit *PickNodeToScheduleBottomUp(); void ListScheduleBottomUp(); @@ -274,8 +283,17 @@ static void GetCostForDef(const ScheduleDAGSDNodes::RegDefIter &RegDefPos, // the expansion of custom DAG-to-DAG patterns. if (VT == MVT::Untyped) { const SDNode *Node = RegDefPos.GetNode(); - unsigned Opcode = Node->getMachineOpcode(); + // Special handling for CopyFromReg of untyped values. + if (!Node->isMachineOpcode() && Node->getOpcode() == ISD::CopyFromReg) { + unsigned Reg = cast<RegisterSDNode>(Node->getOperand(1))->getReg(); + const TargetRegisterClass *RC = MF.getRegInfo().getRegClass(Reg); + RegClass = RC->getID(); + Cost = 1; + return; + } + + unsigned Opcode = Node->getMachineOpcode(); if (Opcode == TargetOpcode::REG_SEQUENCE) { unsigned DstRCIdx = cast<ConstantSDNode>(Node->getOperand(0))->getZExtValue(); const TargetRegisterClass *RC = TRI->getRegClass(DstRCIdx); @@ -312,6 +330,7 @@ void ScheduleDAGRRList::Schedule() { LiveRegDefs.resize(TRI->getNumRegs() + 1, NULL); LiveRegGens.resize(TRI->getNumRegs() + 1, NULL); CallSeqEndForStart.clear(); + assert(Interferences.empty() && LRegsMap.empty() && "stale Interferences"); // Build the scheduling graph. BuildSchedGraph(NULL); @@ -725,6 +744,7 @@ void ScheduleDAGRRList::ScheduleNodeBottomUp(SUnit *SU) { --NumLiveRegs; LiveRegDefs[I->getReg()] = NULL; LiveRegGens[I->getReg()] = NULL; + releaseInterferences(I->getReg()); } } // Release the special call resource dependence, if this is the beginning @@ -739,6 +759,7 @@ void ScheduleDAGRRList::ScheduleNodeBottomUp(SUnit *SU) { --NumLiveRegs; LiveRegDefs[CallResource] = NULL; LiveRegGens[CallResource] = NULL; + releaseInterferences(CallResource); } } @@ -794,6 +815,7 @@ void ScheduleDAGRRList::UnscheduleNodeBottomUp(SUnit *SU) { --NumLiveRegs; LiveRegDefs[I->getReg()] = NULL; LiveRegGens[I->getReg()] = NULL; + releaseInterferences(I->getReg()); } } @@ -821,6 +843,7 @@ void ScheduleDAGRRList::UnscheduleNodeBottomUp(SUnit *SU) { --NumLiveRegs; LiveRegDefs[CallResource] = NULL; LiveRegGens[CallResource] = NULL; + releaseInterferences(CallResource); } } @@ -1305,34 +1328,58 @@ DelayForLiveRegsBottomUp(SUnit *SU, SmallVector<unsigned, 4> &LRegs) { return !LRegs.empty(); } +void ScheduleDAGRRList::releaseInterferences(unsigned Reg) { + // Add the nodes that aren't ready back onto the available list. 
+ for (unsigned i = Interferences.size(); i > 0; --i) { + SUnit *SU = Interferences[i-1]; + LRegsMapT::iterator LRegsPos = LRegsMap.find(SU); + if (Reg) { + SmallVector<unsigned, 4> &LRegs = LRegsPos->second; + if (std::find(LRegs.begin(), LRegs.end(), Reg) == LRegs.end()) + continue; + } + SU->isPending = false; + // The interfering node may no longer be available due to backtracking. + // Furthermore, it may have been made available again, in which case it is + // now already in the AvailableQueue. + if (SU->isAvailable && !SU->NodeQueueId) { + DEBUG(dbgs() << " Repushing SU #" << SU->NodeNum << '\n'); + AvailableQueue->push(SU); + } + if (i < Interferences.size()) + Interferences[i-1] = Interferences.back(); + Interferences.pop_back(); + LRegsMap.erase(LRegsPos); + } +} + /// Return a node that can be scheduled in this cycle. Requirements: /// (1) Ready: latency has been satisfied /// (2) No Hazards: resources are available /// (3) No Interferences: may unschedule to break register interferences. SUnit *ScheduleDAGRRList::PickNodeToScheduleBottomUp() { - SmallVector<SUnit*, 4> Interferences; - DenseMap<SUnit*, SmallVector<unsigned, 4> > LRegsMap; - - SUnit *CurSU = AvailableQueue->pop(); + SUnit *CurSU = AvailableQueue->empty() ? 0 : AvailableQueue->pop(); while (CurSU) { SmallVector<unsigned, 4> LRegs; if (!DelayForLiveRegsBottomUp(CurSU, LRegs)) break; - LRegsMap.insert(std::make_pair(CurSU, LRegs)); - - CurSU->isPending = true; // This SU is not in AvailableQueue right now. - Interferences.push_back(CurSU); + DEBUG(dbgs() << " Interfering reg " << TRI->getName(LRegs[0]) + << " SU #" << CurSU->NodeNum << '\n'); + std::pair<LRegsMapT::iterator, bool> LRegsPair = + LRegsMap.insert(std::make_pair(CurSU, LRegs)); + if (LRegsPair.second) { + CurSU->isPending = true; // This SU is not in AvailableQueue right now. + Interferences.push_back(CurSU); + } + else { + assert(CurSU->isPending && "Interferences are pending"); + // Update the interference with current live regs. + LRegsPair.first->second = LRegs; + } CurSU = AvailableQueue->pop(); } - if (CurSU) { - // Add the nodes that aren't ready back onto the available list. - for (unsigned i = 0, e = Interferences.size(); i != e; ++i) { - Interferences[i]->isPending = false; - assert(Interferences[i]->isAvailable && "must still be available"); - AvailableQueue->push(Interferences[i]); - } + if (CurSU) return CurSU; - } // All candidates are delayed due to live physical reg dependencies. // Try backtracking, code duplication, or inserting cross class copies @@ -1353,6 +1400,7 @@ SUnit *ScheduleDAGRRList::PickNodeToScheduleBottomUp() { } } if (!WillCreateCycle(TrySU, BtSU)) { + // BacktrackBottomUp mutates Interferences! BacktrackBottomUp(TrySU, BtSU); // Force the current node to be scheduled before the node that @@ -1362,19 +1410,19 @@ SUnit *ScheduleDAGRRList::PickNodeToScheduleBottomUp() { if (!BtSU->isPending) AvailableQueue->remove(BtSU); } + DEBUG(dbgs() << "ARTIFICIAL edge from SU(" << BtSU->NodeNum << ") to SU(" + << TrySU->NodeNum << ")\n"); AddPred(TrySU, SDep(BtSU, SDep::Artificial)); // If one or more successors has been unscheduled, then the current - // node is no longer avaialable. Schedule a successor that's now - // available instead. - if (!TrySU->isAvailable) { + // node is no longer available. + if (!TrySU->isAvailable) CurSU = AvailableQueue->pop(); - } else { + AvailableQueue->remove(TrySU); CurSU = TrySU; - TrySU->isPending = false; - Interferences.erase(Interferences.begin()+i); } + // Interferences has been mutated.
We must break. break; } } @@ -1425,17 +1473,7 @@ SUnit *ScheduleDAGRRList::PickNodeToScheduleBottomUp() { TrySU->isAvailable = false; CurSU = NewDef; } - assert(CurSU && "Unable to resolve live physical register dependencies!"); - - // Add the nodes that aren't ready back onto the available list. - for (unsigned i = 0, e = Interferences.size(); i != e; ++i) { - Interferences[i]->isPending = false; - // May no longer be available due to backtracking. - if (Interferences[i]->isAvailable) { - AvailableQueue->push(Interferences[i]); - } - } return CurSU; } @@ -1456,7 +1494,7 @@ void ScheduleDAGRRList::ListScheduleBottomUp() { // While Available queue is not empty, grab the node with the highest // priority. If it is not ready put it back. Schedule the node. Sequence.reserve(SUnits.size()); - while (!AvailableQueue->empty()) { + while (!AvailableQueue->empty() || !Interferences.empty()) { DEBUG(dbgs() << "\nExamining Available:\n"; AvailableQueue->dump(this)); diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h index 76067a1..2ff37e0 100644 --- a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h +++ b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h @@ -15,8 +15,8 @@ #ifndef SCHEDULEDAGSDNODES_H #define SCHEDULEDAGSDNODES_H +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/ScheduleDAG.h" -#include "llvm/CodeGen/SelectionDAG.h" namespace llvm { /// ScheduleDAGSDNodes - A ScheduleDAG for scheduling SDNode-based DAGs. diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 6c29c67..db8ae6e 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -60,18 +60,6 @@ static SDVTList makeVTList(const EVT *VTs, unsigned NumVTs) { return Res; } -static const fltSemantics *EVTToAPFloatSemantics(EVT VT) { - switch (VT.getSimpleVT().SimpleTy) { - default: llvm_unreachable("Unknown FP format"); - case MVT::f16: return &APFloat::IEEEhalf; - case MVT::f32: return &APFloat::IEEEsingle; - case MVT::f64: return &APFloat::IEEEdouble; - case MVT::f80: return &APFloat::x87DoubleExtended; - case MVT::f128: return &APFloat::IEEEquad; - case MVT::ppcf128: return &APFloat::PPCDoubleDouble; - } -} - // Default null implementations of the callbacks. void SelectionDAG::DAGUpdateListener::NodeDeleted(SDNode*, SDNode*) {} void SelectionDAG::DAGUpdateListener::NodeUpdated(SDNode*) {} @@ -95,7 +83,8 @@ bool ConstantFPSDNode::isValueValidForType(EVT VT, // convert modifies in place, so make a copy. 
APFloat Val2 = APFloat(Val); bool losesInfo; - (void) Val2.convert(*EVTToAPFloatSemantics(VT), APFloat::rmNearestTiesToEven, + (void) Val2.convert(SelectionDAG::EVTToAPFloatSemantics(VT), + APFloat::rmNearestTiesToEven, &losesInfo); return !losesInfo; } @@ -1081,7 +1070,7 @@ SDValue SelectionDAG::getConstantFP(double Val, EVT VT, bool isTarget) { EltVT==MVT::f16) { bool ignored; APFloat apf = APFloat(Val); - apf.convert(*EVTToAPFloatSemantics(EltVT), APFloat::rmNearestTiesToEven, + apf.convert(EVTToAPFloatSemantics(EltVT), APFloat::rmNearestTiesToEven, &ignored); return getConstantFP(apf, VT, isTarget); } else @@ -2442,7 +2431,8 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, return getConstant(Val.zextOrTrunc(VT.getSizeInBits()), VT); case ISD::UINT_TO_FP: case ISD::SINT_TO_FP: { - APFloat apf(APInt::getNullValue(VT.getSizeInBits())); + APFloat apf(EVTToAPFloatSemantics(VT), + APInt::getNullValue(VT.getSizeInBits())); (void)apf.convertFromAPInt(Val, Opcode==ISD::SINT_TO_FP, APFloat::rmNearestTiesToEven); @@ -2450,9 +2440,9 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, } case ISD::BITCAST: if (VT == MVT::f32 && C->getValueType(0) == MVT::i32) - return getConstantFP(APFloat(Val), VT); + return getConstantFP(APFloat(APFloat::IEEEsingle, Val), VT); else if (VT == MVT::f64 && C->getValueType(0) == MVT::i64) - return getConstantFP(APFloat(Val), VT); + return getConstantFP(APFloat(APFloat::IEEEdouble, Val), VT); break; case ISD::BSWAP: return getConstant(Val.byteSwap(), VT); @@ -2499,7 +2489,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, bool ignored; // This can return overflow, underflow, or inexact; we don't care. // FIXME need to be more flexible about rounding mode. - (void)V.convert(*EVTToAPFloatSemantics(VT), + (void)V.convert(EVTToAPFloatSemantics(VT), APFloat::rmNearestTiesToEven, &ignored); return getConstantFP(V, VT); } @@ -2690,44 +2680,117 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, return SDValue(N, 0); } -SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, - EVT VT, - ConstantSDNode *Cst1, - ConstantSDNode *Cst2) { - const APInt &C1 = Cst1->getAPIntValue(), &C2 = Cst2->getAPIntValue(); +SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, EVT VT, + SDNode *Cst1, SDNode *Cst2) { + SmallVector<std::pair<ConstantSDNode *, ConstantSDNode *>, 4> Inputs; + SmallVector<SDValue, 4> Outputs; + EVT SVT = VT.getScalarType(); - switch (Opcode) { - case ISD::ADD: return getConstant(C1 + C2, VT); - case ISD::SUB: return getConstant(C1 - C2, VT); - case ISD::MUL: return getConstant(C1 * C2, VT); - case ISD::UDIV: - if (C2.getBoolValue()) return getConstant(C1.udiv(C2), VT); - break; - case ISD::UREM: - if (C2.getBoolValue()) return getConstant(C1.urem(C2), VT); - break; - case ISD::SDIV: - if (C2.getBoolValue()) return getConstant(C1.sdiv(C2), VT); - break; - case ISD::SREM: - if (C2.getBoolValue()) return getConstant(C1.srem(C2), VT); - break; - case ISD::AND: return getConstant(C1 & C2, VT); - case ISD::OR: return getConstant(C1 | C2, VT); - case ISD::XOR: return getConstant(C1 ^ C2, VT); - case ISD::SHL: return getConstant(C1 << C2, VT); - case ISD::SRL: return getConstant(C1.lshr(C2), VT); - case ISD::SRA: return getConstant(C1.ashr(C2), VT); - case ISD::ROTL: return getConstant(C1.rotl(C2), VT); - case ISD::ROTR: return getConstant(C1.rotr(C2), VT); - default: break; + ConstantSDNode *Scalar1 = dyn_cast<ConstantSDNode>(Cst1); + ConstantSDNode *Scalar2 = dyn_cast<ConstantSDNode>(Cst2); + if (Scalar1 && 
Scalar2) { + // Scalar instruction. + Inputs.push_back(std::make_pair(Scalar1, Scalar2)); + } else { + // For vectors extract each constant element into Inputs so we can constant + // fold them individually. + BuildVectorSDNode *BV1 = dyn_cast<BuildVectorSDNode>(Cst1); + BuildVectorSDNode *BV2 = dyn_cast<BuildVectorSDNode>(Cst2); + if (!BV1 || !BV2) + return SDValue(); + + assert(BV1->getNumOperands() == BV2->getNumOperands() && "Out of sync!"); + + for (unsigned I = 0, E = BV1->getNumOperands(); I != E; ++I) { + ConstantSDNode *V1 = dyn_cast<ConstantSDNode>(BV1->getOperand(I)); + ConstantSDNode *V2 = dyn_cast<ConstantSDNode>(BV2->getOperand(I)); + if (!V1 || !V2) // Not a constant, bail. + return SDValue(); + + // Avoid BUILD_VECTOR nodes that perform implicit truncation. + // FIXME: This is valid and could be handled by truncating the APInts. + if (V1->getValueType(0) != SVT || V2->getValueType(0) != SVT) + return SDValue(); + + Inputs.push_back(std::make_pair(V1, V2)); + } } - return SDValue(); + // We have a number of constant values, constant fold them element by element. + for (unsigned I = 0, E = Inputs.size(); I != E; ++I) { + const APInt &C1 = Inputs[I].first->getAPIntValue(); + const APInt &C2 = Inputs[I].second->getAPIntValue(); + + switch (Opcode) { + case ISD::ADD: + Outputs.push_back(getConstant(C1 + C2, SVT)); + break; + case ISD::SUB: + Outputs.push_back(getConstant(C1 - C2, SVT)); + break; + case ISD::MUL: + Outputs.push_back(getConstant(C1 * C2, SVT)); + break; + case ISD::UDIV: + if (!C2.getBoolValue()) + return SDValue(); + Outputs.push_back(getConstant(C1.udiv(C2), SVT)); + break; + case ISD::UREM: + if (!C2.getBoolValue()) + return SDValue(); + Outputs.push_back(getConstant(C1.urem(C2), SVT)); + break; + case ISD::SDIV: + if (!C2.getBoolValue()) + return SDValue(); + Outputs.push_back(getConstant(C1.sdiv(C2), SVT)); + break; + case ISD::SREM: + if (!C2.getBoolValue()) + return SDValue(); + Outputs.push_back(getConstant(C1.srem(C2), SVT)); + break; + case ISD::AND: + Outputs.push_back(getConstant(C1 & C2, SVT)); + break; + case ISD::OR: + Outputs.push_back(getConstant(C1 | C2, SVT)); + break; + case ISD::XOR: + Outputs.push_back(getConstant(C1 ^ C2, SVT)); + break; + case ISD::SHL: + Outputs.push_back(getConstant(C1 << C2, SVT)); + break; + case ISD::SRL: + Outputs.push_back(getConstant(C1.lshr(C2), SVT)); + break; + case ISD::SRA: + Outputs.push_back(getConstant(C1.ashr(C2), SVT)); + break; + case ISD::ROTL: + Outputs.push_back(getConstant(C1.rotl(C2), SVT)); + break; + case ISD::ROTR: + Outputs.push_back(getConstant(C1.rotr(C2), SVT)); + break; + default: + return SDValue(); + } + } + + // Handle the scalar case first. + if (Outputs.size() == 1) + return Outputs.back(); + + // Otherwise build a big vector out of the scalar elements we generated. 
+ return getNode(ISD::BUILD_VECTOR, DebugLoc(), VT, Outputs.data(), + Outputs.size()); } -SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, EVT VT, - SDValue N1, SDValue N2) { +SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, EVT VT, SDValue N1, + SDValue N2) { ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1.getNode()); ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(N2.getNode()); switch (Opcode) { @@ -3023,16 +3086,14 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, EVT VT, } } - if (N1C) { - if (N2C) { - SDValue SV = FoldConstantArithmetic(Opcode, VT, N1C, N2C); - if (SV.getNode()) return SV; - } else { // Cannonicalize constant to RHS if commutative - if (isCommutativeBinOp(Opcode)) { - std::swap(N1C, N2C); - std::swap(N1, N2); - } - } + // Perform trivial constant folding. + SDValue SV = FoldConstantArithmetic(Opcode, VT, N1.getNode(), N2.getNode()); + if (SV.getNode()) return SV; + + // Canonicalize constant to RHS if commutative. + if (N1C && !N2C && isCommutativeBinOp(Opcode)) { + std::swap(N1C, N2C); + std::swap(N1, N2); } // Constant fold FP operations. @@ -3040,7 +3101,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, EVT VT, ConstantFPSDNode *N2CFP = dyn_cast<ConstantFPSDNode>(N2.getNode()); if (N1CFP) { if (!N2CFP && isCommutativeBinOp(Opcode)) { - // Cannonicalize constant to RHS if commutative + // Canonicalize constant to RHS if commutative. std::swap(N1CFP, N2CFP); std::swap(N1, N2); } else if (N2CFP) { @@ -3084,7 +3145,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, EVT VT, bool ignored; // This can return overflow, underflow, or inexact; we don't care. // FIXME need to be more flexible about rounding mode. - (void)V.convert(*EVTToAPFloatSemantics(VT), + (void)V.convert(EVTToAPFloatSemantics(VT), APFloat::rmNearestTiesToEven, &ignored); return getConstantFP(V, VT); } @@ -3316,17 +3377,6 @@ SDValue SelectionDAG::getStackArgumentTokenFactor(SDValue Chain) { &ArgChains[0], ArgChains.size()); } -/// SplatByte - Distribute ByteVal over NumBits bits. -static APInt SplatByte(unsigned NumBits, uint8_t ByteVal) { - APInt Val = APInt(NumBits, ByteVal); - unsigned Shift = 8; - for (unsigned i = NumBits; i > 8; i >>= 1) { - Val = (Val << Shift) | Val; - Shift <<= 1; - } - return Val; -} - /// getMemsetValue - Vectorized representation of the memset value /// operand. static SDValue getMemsetValue(SDValue Value, EVT VT, SelectionDAG &DAG, @@ -3335,17 +3385,18 @@ static SDValue getMemsetValue(SDValue Value, EVT VT, SelectionDAG &DAG, unsigned NumBits = VT.getScalarType().getSizeInBits(); if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Value)) { - APInt Val = SplatByte(NumBits, C->getZExtValue() & 255); + assert(C->getAPIntValue().getBitWidth() == 8); + APInt Val = APInt::getSplat(NumBits, C->getAPIntValue()); if (VT.isInteger()) return DAG.getConstant(Val, VT); - return DAG.getConstantFP(APFloat(Val), VT); + return DAG.getConstantFP(APFloat(DAG.EVTToAPFloatSemantics(VT), Val), VT); } Value = DAG.getNode(ISD::ZERO_EXTEND, dl, VT, Value); if (NumBits > 8) { // Use a multiplication with 0x010101... to extend the input to the // required length. 
- APInt Magic = SplatByte(NumBits, 0x01); + APInt Magic = APInt::getSplat(NumBits, APInt(8, 0x01)); Value = DAG.getNode(ISD::MUL, dl, VT, Value, DAG.getConstant(Magic, VT)); } @@ -3374,10 +3425,11 @@ static SDValue getMemsetStringVal(EVT VT, DebugLoc dl, SelectionDAG &DAG, } assert(!VT.isVector() && "Can't handle vector type here!"); - unsigned NumVTBytes = VT.getSizeInBits() / 8; + unsigned NumVTBits = VT.getSizeInBits(); + unsigned NumVTBytes = NumVTBits / 8; unsigned NumBytes = std::min(NumVTBytes, unsigned(Str.size())); - APInt Val(NumBytes*8, 0); + APInt Val(NumVTBits, 0); if (TLI.isLittleEndian()) { for (unsigned i = 0; i != NumBytes; ++i) Val |= (uint64_t)(unsigned char)Str[i] << i*8; @@ -3570,6 +3622,15 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, DebugLoc dl, if (DstAlignCanChange) { Type *Ty = MemOps[0].getTypeForEVT(*DAG.getContext()); unsigned NewAlign = (unsigned) TLI.getDataLayout()->getABITypeAlignment(Ty); + + // Don't promote to an alignment that would require dynamic stack + // realignment. + const TargetRegisterInfo *TRI = MF.getTarget().getRegisterInfo(); + if (!TRI->needsStackRealignment(MF)) + while (NewAlign > Align && + TLI.getDataLayout()->exceedsNaturalStackAlignment(NewAlign)) + NewAlign /= 2; + if (NewAlign > Align) { // Give the stack frame object a larger alignment if needed. if (MFI->getObjectAlignment(FI->getIndex()) < NewAlign) @@ -3806,6 +3867,7 @@ SDValue SelectionDAG::getMemcpy(SDValue Chain, DebugLoc dl, SDValue Dst, unsigned Align, bool isVol, bool AlwaysInline, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) { + assert(Align && "The SDAG layer expects explicit alignment and reserves 0"); // Check to see if we should lower the memcpy to loads and stores first. // For cases within the target-specified limits, this is the best choice. @@ -3873,6 +3935,7 @@ SDValue SelectionDAG::getMemmove(SDValue Chain, DebugLoc dl, SDValue Dst, unsigned Align, bool isVol, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) { + assert(Align && "The SDAG layer expects explicit alignment and reserves 0"); // Check to see if we should lower the memmove to loads and stores first. // For cases within the target-specified limits, this is the best choice. @@ -3927,6 +3990,7 @@ SDValue SelectionDAG::getMemset(SDValue Chain, DebugLoc dl, SDValue Dst, SDValue Src, SDValue Size, unsigned Align, bool isVol, MachinePointerInfo DstPtrInfo) { + assert(Align && "The SDAG layer expects explicit alignment and reserves 0"); // Check to see if we should lower the memset to stores first. // For cases within the target-specified limits, this is the best choice. 
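The getMemsetValue hunk above replaces the file-local SplatByte helper with APInt::getSplat for constant bytes, and keeps the multiply-by-0x01010101... "Magic" expansion for non-constant values. A standalone sketch of that multiplication trick, illustrative only and not part of the patch:

#include <cstdint>

// Replicate one byte across a 32-bit word by multiplying with 0x01010101,
// the same Magic constant getMemsetValue builds for a 32-bit memset value.
static uint32_t splatByte32(uint8_t V) {
  return (uint32_t)V * 0x01010101u; // every byte of the product equals V
}
// splatByte32(0xAB) == 0xABABABAB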
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 8c22db3..b8ab2a9 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -15,9 +15,9 @@ #include "SelectionDAGBuilder.h" #include "SDNodeDbgValue.h" #include "llvm/ADT/BitVector.h" -#include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/SmallSet.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/Analysis.h" @@ -3519,7 +3519,7 @@ void SelectionDAGBuilder::visitAtomicLoad(const LoadInst &I) { EVT VT = TLI.getValueType(I.getType()); - if (I.getAlignment() * 8 < VT.getSizeInBits()) + if (I.getAlignment() < VT.getSizeInBits() / 8) report_fatal_error("Cannot generate unaligned atomic load"); SDValue L = @@ -3549,7 +3549,7 @@ void SelectionDAGBuilder::visitAtomicStore(const StoreInst &I) { EVT VT = TLI.getValueType(I.getValueOperand()->getType()); - if (I.getAlignment() * 8 < VT.getSizeInBits()) + if (I.getAlignment() < VT.getSizeInBits() / 8) report_fatal_error("Cannot generate unaligned atomic store"); if (TLI.getInsertFencesForAtomic()) @@ -3663,7 +3663,7 @@ void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I, /// /// Op = (Op & 0x007fffff) | 0x3f800000; /// -/// where Op is the hexidecimal representation of floating point value. +/// where Op is the hexadecimal representation of floating point value. static SDValue GetSignificand(SelectionDAG &DAG, SDValue Op, DebugLoc dl) { SDValue t1 = DAG.getNode(ISD::AND, dl, MVT::i32, Op, @@ -3677,7 +3677,7 @@ GetSignificand(SelectionDAG &DAG, SDValue Op, DebugLoc dl) { /// /// (float)(int)(((Op & 0x7f800000) >> 23) - 127); /// -/// where Op is the hexidecimal representation of floating point value. +/// where Op is the hexadecimal representation of floating point value. static SDValue GetExponent(SelectionDAG &DAG, SDValue Op, const TargetLowering &TLI, DebugLoc dl) { @@ -3693,7 +3693,8 @@ GetExponent(SelectionDAG &DAG, SDValue Op, const TargetLowering &TLI, /// getF32Constant - Get 32-bit floating point constant. static SDValue getF32Constant(SelectionDAG &DAG, unsigned Flt) { - return DAG.getConstantFP(APFloat(APInt(32, Flt)), MVT::f32); + return DAG.getConstantFP(APFloat(APFloat::IEEEsingle, APInt(32, Flt)), + MVT::f32); } /// expandExp - Lower an exp intrinsic. Handles the special sequences for @@ -4466,6 +4467,8 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { SDValue Op2 = getValue(I.getArgOperand(1)); SDValue Op3 = getValue(I.getArgOperand(2)); unsigned Align = cast<ConstantInt>(I.getArgOperand(3))->getZExtValue(); + if (!Align) + Align = 1; // @llvm.memcpy defines 0 and 1 to both mean no alignment. bool isVol = cast<ConstantInt>(I.getArgOperand(4))->getZExtValue(); DAG.setRoot(DAG.getMemcpy(getRoot(), dl, Op1, Op2, Op3, Align, isVol, false, MachinePointerInfo(I.getArgOperand(0)), @@ -4482,6 +4485,8 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { SDValue Op2 = getValue(I.getArgOperand(1)); SDValue Op3 = getValue(I.getArgOperand(2)); unsigned Align = cast<ConstantInt>(I.getArgOperand(3))->getZExtValue(); + if (!Align) + Align = 1; // @llvm.memset defines 0 and 1 to both mean no alignment. 
bool isVol = cast<ConstantInt>(I.getArgOperand(4))->getZExtValue(); DAG.setRoot(DAG.getMemset(getRoot(), dl, Op1, Op2, Op3, Align, isVol, MachinePointerInfo(I.getArgOperand(0)))); @@ -4499,6 +4504,8 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { SDValue Op2 = getValue(I.getArgOperand(1)); SDValue Op3 = getValue(I.getArgOperand(2)); unsigned Align = cast<ConstantInt>(I.getArgOperand(3))->getZExtValue(); + if (!Align) + Align = 1; // @llvm.memmove defines 0 and 1 to both mean no alignment. bool isVol = cast<ConstantInt>(I.getArgOperand(4))->getZExtValue(); DAG.setRoot(DAG.getMemmove(getRoot(), dl, Op1, Op2, Op3, Align, isVol, MachinePointerInfo(I.getArgOperand(0)), @@ -5168,6 +5175,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { Res = DAG.getNode(Opcode, dl, MVT::Other, Ops, 2); DAG.setRoot(Res); } + return 0; } case Intrinsic::invariant_start: // Discard region information. @@ -5276,8 +5284,7 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee, // Check if target-independent constraints permit a tail call here. // Target-dependent constraints are checked within TLI.LowerCallTo. - if (isTailCall && - !isInTailCallPosition(CS, CS.getAttributes().getRetAttributes(), TLI)) + if (isTailCall && !isInTailCallPosition(CS, TLI)) isTailCall = false; TargetLowering:: @@ -5947,6 +5954,10 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) { // Compute the constraint code and ConstraintType to use. TLI.ComputeConstraintToUse(OpInfo, OpInfo.CallOperand, &DAG); + if (OpInfo.ConstraintType == TargetLowering::C_Memory && + OpInfo.Type == InlineAsm::isClobber) + continue; + // If this is a memory input, and if the operand is not indirect, do what we // need to to provide an address for the memory input. if (OpInfo.ConstraintType == TargetLowering::C_Memory && @@ -6050,6 +6061,8 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) { ExtraInfo |= InlineAsm::Extra_MayLoad; else if (OpInfo.Type == InlineAsm::isOutput) ExtraInfo |= InlineAsm::Extra_MayStore; + else if (OpInfo.Type == InlineAsm::isClobber) + ExtraInfo |= (InlineAsm::Extra_MayLoad | InlineAsm::Extra_MayStore); } } @@ -6584,10 +6597,6 @@ void SelectionDAGISel::LowerArguments(const BasicBlock *LLVMBB) { const DataLayout *TD = TLI.getDataLayout(); SmallVector<ISD::InputArg, 16> Ins; - // Check whether the function can return without sret-demotion. - SmallVector<ISD::OutputArg, 4> Outs; - GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI); - if (!FuncInfo->CanLowerReturn) { // Put in an sret pointer parameter before all the other parameters. 
SmallVector<EVT, 1> ValueVTs; diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index 5701b13..3b5823b 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -140,6 +140,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::FSQRT: return "fsqrt"; case ISD::FSIN: return "fsin"; case ISD::FCOS: return "fcos"; + case ISD::FSINCOS: return "fsincos"; case ISD::FTRUNC: return "ftrunc"; case ISD::FFLOOR: return "ffloor"; case ISD::FCEIL: return "fceil"; diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp index d4e9a50..acae58c 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -144,7 +144,12 @@ EnableFastISelVerbose("fast-isel-verbose", cl::Hidden, "instruction selector")); static cl::opt<bool> EnableFastISelAbort("fast-isel-abort", cl::Hidden, - cl::desc("Enable abort calls when \"fast\" instruction fails")); + cl::desc("Enable abort calls when \"fast\" instruction selection " + "fails to lower an instruction")); +static cl::opt<bool> +EnableFastISelAbortArgs("fast-isel-abort-args", cl::Hidden, + cl::desc("Enable abort calls when \"fast\" instruction selection " + "fails to lower a formal argument")); static cl::opt<bool> UseMBPI("use-mbpi", @@ -354,6 +359,10 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) { TTI = getAnalysisIfAvailable<TargetTransformInfo>(); GFI = Fn.hasGC() ? &getAnalysis<GCModuleInfo>().getFunctionInfo(Fn) : 0; + TargetSubtargetInfo &ST = + const_cast<TargetSubtargetInfo&>(TM.getSubtarget<TargetSubtargetInfo>()); + ST.resetSubtargetFeatures(MF); + DEBUG(dbgs() << "\n\n\n=== " << Fn.getName() << "\n"); SplitCriticalSideEffectEdges(const_cast<Function&>(Fn), this); @@ -368,6 +377,7 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) { SDB->init(GFI, *AA, LibInfo); + MF->setHasMSInlineAsm(false); SelectAllBasicBlocks(Fn); // If the first basic block in the function has live ins that need to be @@ -438,24 +448,26 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) { // Determine if there are any calls in this machine function. MachineFrameInfo *MFI = MF->getFrameInfo(); - if (!MFI->hasCalls()) { - for (MachineFunction::const_iterator - I = MF->begin(), E = MF->end(); I != E; ++I) { - const MachineBasicBlock *MBB = I; - for (MachineBasicBlock::const_iterator - II = MBB->begin(), IE = MBB->end(); II != IE; ++II) { - const MCInstrDesc &MCID = TM.getInstrInfo()->get(II->getOpcode()); - - if ((MCID.isCall() && !MCID.isReturn()) || - II->isStackAligningInlineAsm()) { - MFI->setHasCalls(true); - goto done; - } + for (MachineFunction::const_iterator I = MF->begin(), E = MF->end(); I != E; + ++I) { + + if (MFI->hasCalls() && MF->hasMSInlineAsm()) + break; + + const MachineBasicBlock *MBB = I; + for (MachineBasicBlock::const_iterator II = MBB->begin(), IE = MBB->end(); + II != IE; ++II) { + const MCInstrDesc &MCID = TM.getInstrInfo()->get(II->getOpcode()); + if ((MCID.isCall() && !MCID.isReturn()) || + II->isStackAligningInlineAsm()) { + MFI->setHasCalls(true); + } + if (II->isMSInlineAsm()) { + MF->setHasMSInlineAsm(true); } } } - done: // Determine if there is a call to setjmp in the machine function. 
MF->setExposesReturnsTwice(Fn.callsFunctionThatReturnsTwice()); @@ -1032,10 +1044,6 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) { if (FuncInfo->MBB->isLandingPad()) PrepareEHLandingPad(); - // Lower any arguments needed in this block if this is the entry block. - if (LLVMBB == &Fn.getEntryBlock()) - LowerArguments(LLVMBB); - // Before doing SelectionDAG ISel, see if FastISel has been requested. if (FastIS) { FastIS->startNewBlock(); @@ -1043,9 +1051,21 @@ // Emit code for any incoming arguments. This must happen before // beginning FastISel on the entry block. if (LLVMBB == &Fn.getEntryBlock()) { - CurDAG->setRoot(SDB->getControlRoot()); - SDB->clear(); - CodeGenAndEmitDAG(); + // Lower any arguments needed in this block if this is the entry block. + if (!FastIS->LowerArguments()) { + + if (EnableFastISelAbortArgs) + // The "fast" selector couldn't lower these arguments. For the + // purpose of debugging, just abort. + llvm_unreachable("FastISel didn't lower all arguments"); + + // Call target-independent SDISel argument lowering code if the target + // specific routine is not successful. + LowerArguments(LLVMBB); + CurDAG->setRoot(SDB->getControlRoot()); + SDB->clear(); + CodeGenAndEmitDAG(); + } // If we inserted any instructions at the beginning, make a note of // where they are, so we can be sure to emit subsequent instructions @@ -1156,6 +1176,10 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) { } FastIS->recomputeInsertPt(); + } else { + // Lower any arguments needed in this block if this is the entry block. + if (LLVMBB == &Fn.getEntryBlock()) + LowerArguments(LLVMBB); } if (Begin != BI) diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 34f3bc9..f5fc66c 100644 --- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -33,1114 +33,168 @@ #include <cctype> using namespace llvm; -/// InitLibcallNames - Set default libcall names.
-/// -static void InitLibcallNames(const char **Names) { - Names[RTLIB::SHL_I16] = "__ashlhi3"; - Names[RTLIB::SHL_I32] = "__ashlsi3"; - Names[RTLIB::SHL_I64] = "__ashldi3"; - Names[RTLIB::SHL_I128] = "__ashlti3"; - Names[RTLIB::SRL_I16] = "__lshrhi3"; - Names[RTLIB::SRL_I32] = "__lshrsi3"; - Names[RTLIB::SRL_I64] = "__lshrdi3"; - Names[RTLIB::SRL_I128] = "__lshrti3"; - Names[RTLIB::SRA_I16] = "__ashrhi3"; - Names[RTLIB::SRA_I32] = "__ashrsi3"; - Names[RTLIB::SRA_I64] = "__ashrdi3"; - Names[RTLIB::SRA_I128] = "__ashrti3"; - Names[RTLIB::MUL_I8] = "__mulqi3"; - Names[RTLIB::MUL_I16] = "__mulhi3"; - Names[RTLIB::MUL_I32] = "__mulsi3"; - Names[RTLIB::MUL_I64] = "__muldi3"; - Names[RTLIB::MUL_I128] = "__multi3"; - Names[RTLIB::MULO_I32] = "__mulosi4"; - Names[RTLIB::MULO_I64] = "__mulodi4"; - Names[RTLIB::MULO_I128] = "__muloti4"; - Names[RTLIB::SDIV_I8] = "__divqi3"; - Names[RTLIB::SDIV_I16] = "__divhi3"; - Names[RTLIB::SDIV_I32] = "__divsi3"; - Names[RTLIB::SDIV_I64] = "__divdi3"; - Names[RTLIB::SDIV_I128] = "__divti3"; - Names[RTLIB::UDIV_I8] = "__udivqi3"; - Names[RTLIB::UDIV_I16] = "__udivhi3"; - Names[RTLIB::UDIV_I32] = "__udivsi3"; - Names[RTLIB::UDIV_I64] = "__udivdi3"; - Names[RTLIB::UDIV_I128] = "__udivti3"; - Names[RTLIB::SREM_I8] = "__modqi3"; - Names[RTLIB::SREM_I16] = "__modhi3"; - Names[RTLIB::SREM_I32] = "__modsi3"; - Names[RTLIB::SREM_I64] = "__moddi3"; - Names[RTLIB::SREM_I128] = "__modti3"; - Names[RTLIB::UREM_I8] = "__umodqi3"; - Names[RTLIB::UREM_I16] = "__umodhi3"; - Names[RTLIB::UREM_I32] = "__umodsi3"; - Names[RTLIB::UREM_I64] = "__umoddi3"; - Names[RTLIB::UREM_I128] = "__umodti3"; - - // These are generally not available. - Names[RTLIB::SDIVREM_I8] = 0; - Names[RTLIB::SDIVREM_I16] = 0; - Names[RTLIB::SDIVREM_I32] = 0; - Names[RTLIB::SDIVREM_I64] = 0; - Names[RTLIB::SDIVREM_I128] = 0; - Names[RTLIB::UDIVREM_I8] = 0; - Names[RTLIB::UDIVREM_I16] = 0; - Names[RTLIB::UDIVREM_I32] = 0; - Names[RTLIB::UDIVREM_I64] = 0; - Names[RTLIB::UDIVREM_I128] = 0; - - Names[RTLIB::NEG_I32] = "__negsi2"; - Names[RTLIB::NEG_I64] = "__negdi2"; - Names[RTLIB::ADD_F32] = "__addsf3"; - Names[RTLIB::ADD_F64] = "__adddf3"; - Names[RTLIB::ADD_F80] = "__addxf3"; - Names[RTLIB::ADD_F128] = "__addtf3"; - Names[RTLIB::ADD_PPCF128] = "__gcc_qadd"; - Names[RTLIB::SUB_F32] = "__subsf3"; - Names[RTLIB::SUB_F64] = "__subdf3"; - Names[RTLIB::SUB_F80] = "__subxf3"; - Names[RTLIB::SUB_F128] = "__subtf3"; - Names[RTLIB::SUB_PPCF128] = "__gcc_qsub"; - Names[RTLIB::MUL_F32] = "__mulsf3"; - Names[RTLIB::MUL_F64] = "__muldf3"; - Names[RTLIB::MUL_F80] = "__mulxf3"; - Names[RTLIB::MUL_F128] = "__multf3"; - Names[RTLIB::MUL_PPCF128] = "__gcc_qmul"; - Names[RTLIB::DIV_F32] = "__divsf3"; - Names[RTLIB::DIV_F64] = "__divdf3"; - Names[RTLIB::DIV_F80] = "__divxf3"; - Names[RTLIB::DIV_F128] = "__divtf3"; - Names[RTLIB::DIV_PPCF128] = "__gcc_qdiv"; - Names[RTLIB::REM_F32] = "fmodf"; - Names[RTLIB::REM_F64] = "fmod"; - Names[RTLIB::REM_F80] = "fmodl"; - Names[RTLIB::REM_F128] = "fmodl"; - Names[RTLIB::REM_PPCF128] = "fmodl"; - Names[RTLIB::FMA_F32] = "fmaf"; - Names[RTLIB::FMA_F64] = "fma"; - Names[RTLIB::FMA_F80] = "fmal"; - Names[RTLIB::FMA_F128] = "fmal"; - Names[RTLIB::FMA_PPCF128] = "fmal"; - Names[RTLIB::POWI_F32] = "__powisf2"; - Names[RTLIB::POWI_F64] = "__powidf2"; - Names[RTLIB::POWI_F80] = "__powixf2"; - Names[RTLIB::POWI_F128] = "__powitf2"; - Names[RTLIB::POWI_PPCF128] = "__powitf2"; - Names[RTLIB::SQRT_F32] = "sqrtf"; - Names[RTLIB::SQRT_F64] = "sqrt"; - Names[RTLIB::SQRT_F80] = "sqrtl"; - 
Names[RTLIB::SQRT_F128] = "sqrtl"; - Names[RTLIB::SQRT_PPCF128] = "sqrtl"; - Names[RTLIB::LOG_F32] = "logf"; - Names[RTLIB::LOG_F64] = "log"; - Names[RTLIB::LOG_F80] = "logl"; - Names[RTLIB::LOG_F128] = "logl"; - Names[RTLIB::LOG_PPCF128] = "logl"; - Names[RTLIB::LOG2_F32] = "log2f"; - Names[RTLIB::LOG2_F64] = "log2"; - Names[RTLIB::LOG2_F80] = "log2l"; - Names[RTLIB::LOG2_F128] = "log2l"; - Names[RTLIB::LOG2_PPCF128] = "log2l"; - Names[RTLIB::LOG10_F32] = "log10f"; - Names[RTLIB::LOG10_F64] = "log10"; - Names[RTLIB::LOG10_F80] = "log10l"; - Names[RTLIB::LOG10_F128] = "log10l"; - Names[RTLIB::LOG10_PPCF128] = "log10l"; - Names[RTLIB::EXP_F32] = "expf"; - Names[RTLIB::EXP_F64] = "exp"; - Names[RTLIB::EXP_F80] = "expl"; - Names[RTLIB::EXP_F128] = "expl"; - Names[RTLIB::EXP_PPCF128] = "expl"; - Names[RTLIB::EXP2_F32] = "exp2f"; - Names[RTLIB::EXP2_F64] = "exp2"; - Names[RTLIB::EXP2_F80] = "exp2l"; - Names[RTLIB::EXP2_F128] = "exp2l"; - Names[RTLIB::EXP2_PPCF128] = "exp2l"; - Names[RTLIB::SIN_F32] = "sinf"; - Names[RTLIB::SIN_F64] = "sin"; - Names[RTLIB::SIN_F80] = "sinl"; - Names[RTLIB::SIN_F128] = "sinl"; - Names[RTLIB::SIN_PPCF128] = "sinl"; - Names[RTLIB::COS_F32] = "cosf"; - Names[RTLIB::COS_F64] = "cos"; - Names[RTLIB::COS_F80] = "cosl"; - Names[RTLIB::COS_F128] = "cosl"; - Names[RTLIB::COS_PPCF128] = "cosl"; - Names[RTLIB::POW_F32] = "powf"; - Names[RTLIB::POW_F64] = "pow"; - Names[RTLIB::POW_F80] = "powl"; - Names[RTLIB::POW_F128] = "powl"; - Names[RTLIB::POW_PPCF128] = "powl"; - Names[RTLIB::CEIL_F32] = "ceilf"; - Names[RTLIB::CEIL_F64] = "ceil"; - Names[RTLIB::CEIL_F80] = "ceill"; - Names[RTLIB::CEIL_F128] = "ceill"; - Names[RTLIB::CEIL_PPCF128] = "ceill"; - Names[RTLIB::TRUNC_F32] = "truncf"; - Names[RTLIB::TRUNC_F64] = "trunc"; - Names[RTLIB::TRUNC_F80] = "truncl"; - Names[RTLIB::TRUNC_F128] = "truncl"; - Names[RTLIB::TRUNC_PPCF128] = "truncl"; - Names[RTLIB::RINT_F32] = "rintf"; - Names[RTLIB::RINT_F64] = "rint"; - Names[RTLIB::RINT_F80] = "rintl"; - Names[RTLIB::RINT_F128] = "rintl"; - Names[RTLIB::RINT_PPCF128] = "rintl"; - Names[RTLIB::NEARBYINT_F32] = "nearbyintf"; - Names[RTLIB::NEARBYINT_F64] = "nearbyint"; - Names[RTLIB::NEARBYINT_F80] = "nearbyintl"; - Names[RTLIB::NEARBYINT_F128] = "nearbyintl"; - Names[RTLIB::NEARBYINT_PPCF128] = "nearbyintl"; - Names[RTLIB::FLOOR_F32] = "floorf"; - Names[RTLIB::FLOOR_F64] = "floor"; - Names[RTLIB::FLOOR_F80] = "floorl"; - Names[RTLIB::FLOOR_F128] = "floorl"; - Names[RTLIB::FLOOR_PPCF128] = "floorl"; - Names[RTLIB::COPYSIGN_F32] = "copysignf"; - Names[RTLIB::COPYSIGN_F64] = "copysign"; - Names[RTLIB::COPYSIGN_F80] = "copysignl"; - Names[RTLIB::COPYSIGN_F128] = "copysignl"; - Names[RTLIB::COPYSIGN_PPCF128] = "copysignl"; - Names[RTLIB::FPEXT_F64_F128] = "__extenddftf2"; - Names[RTLIB::FPEXT_F32_F128] = "__extendsftf2"; - Names[RTLIB::FPEXT_F32_F64] = "__extendsfdf2"; - Names[RTLIB::FPEXT_F16_F32] = "__gnu_h2f_ieee"; - Names[RTLIB::FPROUND_F32_F16] = "__gnu_f2h_ieee"; - Names[RTLIB::FPROUND_F64_F32] = "__truncdfsf2"; - Names[RTLIB::FPROUND_F80_F32] = "__truncxfsf2"; - Names[RTLIB::FPROUND_F128_F32] = "__trunctfsf2"; - Names[RTLIB::FPROUND_PPCF128_F32] = "__trunctfsf2"; - Names[RTLIB::FPROUND_F80_F64] = "__truncxfdf2"; - Names[RTLIB::FPROUND_F128_F64] = "__trunctfdf2"; - Names[RTLIB::FPROUND_PPCF128_F64] = "__trunctfdf2"; - Names[RTLIB::FPTOSINT_F32_I8] = "__fixsfqi"; - Names[RTLIB::FPTOSINT_F32_I16] = "__fixsfhi"; - Names[RTLIB::FPTOSINT_F32_I32] = "__fixsfsi"; - Names[RTLIB::FPTOSINT_F32_I64] = "__fixsfdi"; - 
Names[RTLIB::FPTOSINT_F32_I128] = "__fixsfti"; - Names[RTLIB::FPTOSINT_F64_I8] = "__fixdfqi"; - Names[RTLIB::FPTOSINT_F64_I16] = "__fixdfhi"; - Names[RTLIB::FPTOSINT_F64_I32] = "__fixdfsi"; - Names[RTLIB::FPTOSINT_F64_I64] = "__fixdfdi"; - Names[RTLIB::FPTOSINT_F64_I128] = "__fixdfti"; - Names[RTLIB::FPTOSINT_F80_I32] = "__fixxfsi"; - Names[RTLIB::FPTOSINT_F80_I64] = "__fixxfdi"; - Names[RTLIB::FPTOSINT_F80_I128] = "__fixxfti"; - Names[RTLIB::FPTOSINT_F128_I32] = "__fixtfsi"; - Names[RTLIB::FPTOSINT_F128_I64] = "__fixtfdi"; - Names[RTLIB::FPTOSINT_F128_I128] = "__fixtfti"; - Names[RTLIB::FPTOSINT_PPCF128_I32] = "__fixtfsi"; - Names[RTLIB::FPTOSINT_PPCF128_I64] = "__fixtfdi"; - Names[RTLIB::FPTOSINT_PPCF128_I128] = "__fixtfti"; - Names[RTLIB::FPTOUINT_F32_I8] = "__fixunssfqi"; - Names[RTLIB::FPTOUINT_F32_I16] = "__fixunssfhi"; - Names[RTLIB::FPTOUINT_F32_I32] = "__fixunssfsi"; - Names[RTLIB::FPTOUINT_F32_I64] = "__fixunssfdi"; - Names[RTLIB::FPTOUINT_F32_I128] = "__fixunssfti"; - Names[RTLIB::FPTOUINT_F64_I8] = "__fixunsdfqi"; - Names[RTLIB::FPTOUINT_F64_I16] = "__fixunsdfhi"; - Names[RTLIB::FPTOUINT_F64_I32] = "__fixunsdfsi"; - Names[RTLIB::FPTOUINT_F64_I64] = "__fixunsdfdi"; - Names[RTLIB::FPTOUINT_F64_I128] = "__fixunsdfti"; - Names[RTLIB::FPTOUINT_F80_I32] = "__fixunsxfsi"; - Names[RTLIB::FPTOUINT_F80_I64] = "__fixunsxfdi"; - Names[RTLIB::FPTOUINT_F80_I128] = "__fixunsxfti"; - Names[RTLIB::FPTOUINT_F128_I32] = "__fixunstfsi"; - Names[RTLIB::FPTOUINT_F128_I64] = "__fixunstfdi"; - Names[RTLIB::FPTOUINT_F128_I128] = "__fixunstfti"; - Names[RTLIB::FPTOUINT_PPCF128_I32] = "__fixunstfsi"; - Names[RTLIB::FPTOUINT_PPCF128_I64] = "__fixunstfdi"; - Names[RTLIB::FPTOUINT_PPCF128_I128] = "__fixunstfti"; - Names[RTLIB::SINTTOFP_I32_F32] = "__floatsisf"; - Names[RTLIB::SINTTOFP_I32_F64] = "__floatsidf"; - Names[RTLIB::SINTTOFP_I32_F80] = "__floatsixf"; - Names[RTLIB::SINTTOFP_I32_F128] = "__floatsitf"; - Names[RTLIB::SINTTOFP_I32_PPCF128] = "__floatsitf"; - Names[RTLIB::SINTTOFP_I64_F32] = "__floatdisf"; - Names[RTLIB::SINTTOFP_I64_F64] = "__floatdidf"; - Names[RTLIB::SINTTOFP_I64_F80] = "__floatdixf"; - Names[RTLIB::SINTTOFP_I64_F128] = "__floatditf"; - Names[RTLIB::SINTTOFP_I64_PPCF128] = "__floatditf"; - Names[RTLIB::SINTTOFP_I128_F32] = "__floattisf"; - Names[RTLIB::SINTTOFP_I128_F64] = "__floattidf"; - Names[RTLIB::SINTTOFP_I128_F80] = "__floattixf"; - Names[RTLIB::SINTTOFP_I128_F128] = "__floattitf"; - Names[RTLIB::SINTTOFP_I128_PPCF128] = "__floattitf"; - Names[RTLIB::UINTTOFP_I32_F32] = "__floatunsisf"; - Names[RTLIB::UINTTOFP_I32_F64] = "__floatunsidf"; - Names[RTLIB::UINTTOFP_I32_F80] = "__floatunsixf"; - Names[RTLIB::UINTTOFP_I32_F128] = "__floatunsitf"; - Names[RTLIB::UINTTOFP_I32_PPCF128] = "__floatunsitf"; - Names[RTLIB::UINTTOFP_I64_F32] = "__floatundisf"; - Names[RTLIB::UINTTOFP_I64_F64] = "__floatundidf"; - Names[RTLIB::UINTTOFP_I64_F80] = "__floatundixf"; - Names[RTLIB::UINTTOFP_I64_F128] = "__floatunditf"; - Names[RTLIB::UINTTOFP_I64_PPCF128] = "__floatunditf"; - Names[RTLIB::UINTTOFP_I128_F32] = "__floatuntisf"; - Names[RTLIB::UINTTOFP_I128_F64] = "__floatuntidf"; - Names[RTLIB::UINTTOFP_I128_F80] = "__floatuntixf"; - Names[RTLIB::UINTTOFP_I128_F128] = "__floatuntitf"; - Names[RTLIB::UINTTOFP_I128_PPCF128] = "__floatuntitf"; - Names[RTLIB::OEQ_F32] = "__eqsf2"; - Names[RTLIB::OEQ_F64] = "__eqdf2"; - Names[RTLIB::OEQ_F128] = "__eqtf2"; - Names[RTLIB::UNE_F32] = "__nesf2"; - Names[RTLIB::UNE_F64] = "__nedf2"; - Names[RTLIB::UNE_F128] = "__netf2"; - Names[RTLIB::OGE_F32] = "__gesf2"; 
- Names[RTLIB::OGE_F64] = "__gedf2"; - Names[RTLIB::OGE_F128] = "__getf2"; - Names[RTLIB::OLT_F32] = "__ltsf2"; - Names[RTLIB::OLT_F64] = "__ltdf2"; - Names[RTLIB::OLT_F128] = "__lttf2"; - Names[RTLIB::OLE_F32] = "__lesf2"; - Names[RTLIB::OLE_F64] = "__ledf2"; - Names[RTLIB::OLE_F128] = "__letf2"; - Names[RTLIB::OGT_F32] = "__gtsf2"; - Names[RTLIB::OGT_F64] = "__gtdf2"; - Names[RTLIB::OGT_F128] = "__gttf2"; - Names[RTLIB::UO_F32] = "__unordsf2"; - Names[RTLIB::UO_F64] = "__unorddf2"; - Names[RTLIB::UO_F128] = "__unordtf2"; - Names[RTLIB::O_F32] = "__unordsf2"; - Names[RTLIB::O_F64] = "__unorddf2"; - Names[RTLIB::O_F128] = "__unordtf2"; - Names[RTLIB::MEMCPY] = "memcpy"; - Names[RTLIB::MEMMOVE] = "memmove"; - Names[RTLIB::MEMSET] = "memset"; - Names[RTLIB::UNWIND_RESUME] = "_Unwind_Resume"; - Names[RTLIB::SYNC_VAL_COMPARE_AND_SWAP_1] = "__sync_val_compare_and_swap_1"; - Names[RTLIB::SYNC_VAL_COMPARE_AND_SWAP_2] = "__sync_val_compare_and_swap_2"; - Names[RTLIB::SYNC_VAL_COMPARE_AND_SWAP_4] = "__sync_val_compare_and_swap_4"; - Names[RTLIB::SYNC_VAL_COMPARE_AND_SWAP_8] = "__sync_val_compare_and_swap_8"; - Names[RTLIB::SYNC_LOCK_TEST_AND_SET_1] = "__sync_lock_test_and_set_1"; - Names[RTLIB::SYNC_LOCK_TEST_AND_SET_2] = "__sync_lock_test_and_set_2"; - Names[RTLIB::SYNC_LOCK_TEST_AND_SET_4] = "__sync_lock_test_and_set_4"; - Names[RTLIB::SYNC_LOCK_TEST_AND_SET_8] = "__sync_lock_test_and_set_8"; - Names[RTLIB::SYNC_FETCH_AND_ADD_1] = "__sync_fetch_and_add_1"; - Names[RTLIB::SYNC_FETCH_AND_ADD_2] = "__sync_fetch_and_add_2"; - Names[RTLIB::SYNC_FETCH_AND_ADD_4] = "__sync_fetch_and_add_4"; - Names[RTLIB::SYNC_FETCH_AND_ADD_8] = "__sync_fetch_and_add_8"; - Names[RTLIB::SYNC_FETCH_AND_SUB_1] = "__sync_fetch_and_sub_1"; - Names[RTLIB::SYNC_FETCH_AND_SUB_2] = "__sync_fetch_and_sub_2"; - Names[RTLIB::SYNC_FETCH_AND_SUB_4] = "__sync_fetch_and_sub_4"; - Names[RTLIB::SYNC_FETCH_AND_SUB_8] = "__sync_fetch_and_sub_8"; - Names[RTLIB::SYNC_FETCH_AND_AND_1] = "__sync_fetch_and_and_1"; - Names[RTLIB::SYNC_FETCH_AND_AND_2] = "__sync_fetch_and_and_2"; - Names[RTLIB::SYNC_FETCH_AND_AND_4] = "__sync_fetch_and_and_4"; - Names[RTLIB::SYNC_FETCH_AND_AND_8] = "__sync_fetch_and_and_8"; - Names[RTLIB::SYNC_FETCH_AND_OR_1] = "__sync_fetch_and_or_1"; - Names[RTLIB::SYNC_FETCH_AND_OR_2] = "__sync_fetch_and_or_2"; - Names[RTLIB::SYNC_FETCH_AND_OR_4] = "__sync_fetch_and_or_4"; - Names[RTLIB::SYNC_FETCH_AND_OR_8] = "__sync_fetch_and_or_8"; - Names[RTLIB::SYNC_FETCH_AND_XOR_1] = "__sync_fetch_and_xor_1"; - Names[RTLIB::SYNC_FETCH_AND_XOR_2] = "__sync_fetch_and_xor_2"; - Names[RTLIB::SYNC_FETCH_AND_XOR_4] = "__sync_fetch_and_xor_4"; - Names[RTLIB::SYNC_FETCH_AND_XOR_8] = "__sync_fetch_and_xor_8"; - Names[RTLIB::SYNC_FETCH_AND_NAND_1] = "__sync_fetch_and_nand_1"; - Names[RTLIB::SYNC_FETCH_AND_NAND_2] = "__sync_fetch_and_nand_2"; - Names[RTLIB::SYNC_FETCH_AND_NAND_4] = "__sync_fetch_and_nand_4"; - Names[RTLIB::SYNC_FETCH_AND_NAND_8] = "__sync_fetch_and_nand_8"; -} - -/// InitLibcallCallingConvs - Set default libcall CallingConvs. -/// -static void InitLibcallCallingConvs(CallingConv::ID *CCs) { - for (int i = 0; i < RTLIB::UNKNOWN_LIBCALL; ++i) { - CCs[i] = CallingConv::C; - } -} - -/// getFPEXT - Return the FPEXT_*_* value for the given types, or -/// UNKNOWN_LIBCALL if there is none. 
-RTLIB::Libcall RTLIB::getFPEXT(EVT OpVT, EVT RetVT) { - if (OpVT == MVT::f32) { - if (RetVT == MVT::f64) - return FPEXT_F32_F64; - if (RetVT == MVT::f128) - return FPEXT_F32_F128; - } else if (OpVT == MVT::f64) { - if (RetVT == MVT::f128) - return FPEXT_F64_F128; - } - - return UNKNOWN_LIBCALL; -} - -/// getFPROUND - Return the FPROUND_*_* value for the given types, or -/// UNKNOWN_LIBCALL if there is none. -RTLIB::Libcall RTLIB::getFPROUND(EVT OpVT, EVT RetVT) { - if (RetVT == MVT::f32) { - if (OpVT == MVT::f64) - return FPROUND_F64_F32; - if (OpVT == MVT::f80) - return FPROUND_F80_F32; - if (OpVT == MVT::f128) - return FPROUND_F128_F32; - if (OpVT == MVT::ppcf128) - return FPROUND_PPCF128_F32; - } else if (RetVT == MVT::f64) { - if (OpVT == MVT::f80) - return FPROUND_F80_F64; - if (OpVT == MVT::f128) - return FPROUND_F128_F64; - if (OpVT == MVT::ppcf128) - return FPROUND_PPCF128_F64; - } - - return UNKNOWN_LIBCALL; -} - -/// getFPTOSINT - Return the FPTOSINT_*_* value for the given types, or -/// UNKNOWN_LIBCALL if there is none. -RTLIB::Libcall RTLIB::getFPTOSINT(EVT OpVT, EVT RetVT) { - if (OpVT == MVT::f32) { - if (RetVT == MVT::i8) - return FPTOSINT_F32_I8; - if (RetVT == MVT::i16) - return FPTOSINT_F32_I16; - if (RetVT == MVT::i32) - return FPTOSINT_F32_I32; - if (RetVT == MVT::i64) - return FPTOSINT_F32_I64; - if (RetVT == MVT::i128) - return FPTOSINT_F32_I128; - } else if (OpVT == MVT::f64) { - if (RetVT == MVT::i8) - return FPTOSINT_F64_I8; - if (RetVT == MVT::i16) - return FPTOSINT_F64_I16; - if (RetVT == MVT::i32) - return FPTOSINT_F64_I32; - if (RetVT == MVT::i64) - return FPTOSINT_F64_I64; - if (RetVT == MVT::i128) - return FPTOSINT_F64_I128; - } else if (OpVT == MVT::f80) { - if (RetVT == MVT::i32) - return FPTOSINT_F80_I32; - if (RetVT == MVT::i64) - return FPTOSINT_F80_I64; - if (RetVT == MVT::i128) - return FPTOSINT_F80_I128; - } else if (OpVT == MVT::f128) { - if (RetVT == MVT::i32) - return FPTOSINT_F128_I32; - if (RetVT == MVT::i64) - return FPTOSINT_F128_I64; - if (RetVT == MVT::i128) - return FPTOSINT_F128_I128; - } else if (OpVT == MVT::ppcf128) { - if (RetVT == MVT::i32) - return FPTOSINT_PPCF128_I32; - if (RetVT == MVT::i64) - return FPTOSINT_PPCF128_I64; - if (RetVT == MVT::i128) - return FPTOSINT_PPCF128_I128; - } - return UNKNOWN_LIBCALL; -} - -/// getFPTOUINT - Return the FPTOUINT_*_* value for the given types, or -/// UNKNOWN_LIBCALL if there is none. 
-RTLIB::Libcall RTLIB::getFPTOUINT(EVT OpVT, EVT RetVT) { - if (OpVT == MVT::f32) { - if (RetVT == MVT::i8) - return FPTOUINT_F32_I8; - if (RetVT == MVT::i16) - return FPTOUINT_F32_I16; - if (RetVT == MVT::i32) - return FPTOUINT_F32_I32; - if (RetVT == MVT::i64) - return FPTOUINT_F32_I64; - if (RetVT == MVT::i128) - return FPTOUINT_F32_I128; - } else if (OpVT == MVT::f64) { - if (RetVT == MVT::i8) - return FPTOUINT_F64_I8; - if (RetVT == MVT::i16) - return FPTOUINT_F64_I16; - if (RetVT == MVT::i32) - return FPTOUINT_F64_I32; - if (RetVT == MVT::i64) - return FPTOUINT_F64_I64; - if (RetVT == MVT::i128) - return FPTOUINT_F64_I128; - } else if (OpVT == MVT::f80) { - if (RetVT == MVT::i32) - return FPTOUINT_F80_I32; - if (RetVT == MVT::i64) - return FPTOUINT_F80_I64; - if (RetVT == MVT::i128) - return FPTOUINT_F80_I128; - } else if (OpVT == MVT::f128) { - if (RetVT == MVT::i32) - return FPTOUINT_F128_I32; - if (RetVT == MVT::i64) - return FPTOUINT_F128_I64; - if (RetVT == MVT::i128) - return FPTOUINT_F128_I128; - } else if (OpVT == MVT::ppcf128) { - if (RetVT == MVT::i32) - return FPTOUINT_PPCF128_I32; - if (RetVT == MVT::i64) - return FPTOUINT_PPCF128_I64; - if (RetVT == MVT::i128) - return FPTOUINT_PPCF128_I128; - } - return UNKNOWN_LIBCALL; -} - -/// getSINTTOFP - Return the SINTTOFP_*_* value for the given types, or -/// UNKNOWN_LIBCALL if there is none. -RTLIB::Libcall RTLIB::getSINTTOFP(EVT OpVT, EVT RetVT) { - if (OpVT == MVT::i32) { - if (RetVT == MVT::f32) - return SINTTOFP_I32_F32; - if (RetVT == MVT::f64) - return SINTTOFP_I32_F64; - if (RetVT == MVT::f80) - return SINTTOFP_I32_F80; - if (RetVT == MVT::f128) - return SINTTOFP_I32_F128; - if (RetVT == MVT::ppcf128) - return SINTTOFP_I32_PPCF128; - } else if (OpVT == MVT::i64) { - if (RetVT == MVT::f32) - return SINTTOFP_I64_F32; - if (RetVT == MVT::f64) - return SINTTOFP_I64_F64; - if (RetVT == MVT::f80) - return SINTTOFP_I64_F80; - if (RetVT == MVT::f128) - return SINTTOFP_I64_F128; - if (RetVT == MVT::ppcf128) - return SINTTOFP_I64_PPCF128; - } else if (OpVT == MVT::i128) { - if (RetVT == MVT::f32) - return SINTTOFP_I128_F32; - if (RetVT == MVT::f64) - return SINTTOFP_I128_F64; - if (RetVT == MVT::f80) - return SINTTOFP_I128_F80; - if (RetVT == MVT::f128) - return SINTTOFP_I128_F128; - if (RetVT == MVT::ppcf128) - return SINTTOFP_I128_PPCF128; - } - return UNKNOWN_LIBCALL; -} - -/// getUINTTOFP - Return the UINTTOFP_*_* value for the given types, or -/// UNKNOWN_LIBCALL if there is none. -RTLIB::Libcall RTLIB::getUINTTOFP(EVT OpVT, EVT RetVT) { - if (OpVT == MVT::i32) { - if (RetVT == MVT::f32) - return UINTTOFP_I32_F32; - if (RetVT == MVT::f64) - return UINTTOFP_I32_F64; - if (RetVT == MVT::f80) - return UINTTOFP_I32_F80; - if (RetVT == MVT::f128) - return UINTTOFP_I32_F128; - if (RetVT == MVT::ppcf128) - return UINTTOFP_I32_PPCF128; - } else if (OpVT == MVT::i64) { - if (RetVT == MVT::f32) - return UINTTOFP_I64_F32; - if (RetVT == MVT::f64) - return UINTTOFP_I64_F64; - if (RetVT == MVT::f80) - return UINTTOFP_I64_F80; - if (RetVT == MVT::f128) - return UINTTOFP_I64_F128; - if (RetVT == MVT::ppcf128) - return UINTTOFP_I64_PPCF128; - } else if (OpVT == MVT::i128) { - if (RetVT == MVT::f32) - return UINTTOFP_I128_F32; - if (RetVT == MVT::f64) - return UINTTOFP_I128_F64; - if (RetVT == MVT::f80) - return UINTTOFP_I128_F80; - if (RetVT == MVT::f128) - return UINTTOFP_I128_F128; - if (RetVT == MVT::ppcf128) - return UINTTOFP_I128_PPCF128; - } - return UNKNOWN_LIBCALL; -} - -/// InitCmpLibcallCCs - Set default comparison libcall CC. 
-/// -static void InitCmpLibcallCCs(ISD::CondCode *CCs) { - memset(CCs, ISD::SETCC_INVALID, sizeof(ISD::CondCode)*RTLIB::UNKNOWN_LIBCALL); - CCs[RTLIB::OEQ_F32] = ISD::SETEQ; - CCs[RTLIB::OEQ_F64] = ISD::SETEQ; - CCs[RTLIB::OEQ_F128] = ISD::SETEQ; - CCs[RTLIB::UNE_F32] = ISD::SETNE; - CCs[RTLIB::UNE_F64] = ISD::SETNE; - CCs[RTLIB::UNE_F128] = ISD::SETNE; - CCs[RTLIB::OGE_F32] = ISD::SETGE; - CCs[RTLIB::OGE_F64] = ISD::SETGE; - CCs[RTLIB::OGE_F128] = ISD::SETGE; - CCs[RTLIB::OLT_F32] = ISD::SETLT; - CCs[RTLIB::OLT_F64] = ISD::SETLT; - CCs[RTLIB::OLT_F128] = ISD::SETLT; - CCs[RTLIB::OLE_F32] = ISD::SETLE; - CCs[RTLIB::OLE_F64] = ISD::SETLE; - CCs[RTLIB::OLE_F128] = ISD::SETLE; - CCs[RTLIB::OGT_F32] = ISD::SETGT; - CCs[RTLIB::OGT_F64] = ISD::SETGT; - CCs[RTLIB::OGT_F128] = ISD::SETGT; - CCs[RTLIB::UO_F32] = ISD::SETNE; - CCs[RTLIB::UO_F64] = ISD::SETNE; - CCs[RTLIB::UO_F128] = ISD::SETNE; - CCs[RTLIB::O_F32] = ISD::SETEQ; - CCs[RTLIB::O_F64] = ISD::SETEQ; - CCs[RTLIB::O_F128] = ISD::SETEQ; -} - /// NOTE: The constructor takes ownership of TLOF. TargetLowering::TargetLowering(const TargetMachine &tm, const TargetLoweringObjectFile *tlof) - : TM(tm), TD(TM.getDataLayout()), TLOF(*tlof) { - // All operations default to being supported. - memset(OpActions, 0, sizeof(OpActions)); - memset(LoadExtActions, 0, sizeof(LoadExtActions)); - memset(TruncStoreActions, 0, sizeof(TruncStoreActions)); - memset(IndexedModeActions, 0, sizeof(IndexedModeActions)); - memset(CondCodeActions, 0, sizeof(CondCodeActions)); - - // Set default actions for various operations. - for (unsigned VT = 0; VT != (unsigned)MVT::LAST_VALUETYPE; ++VT) { - // Default all indexed load / store to expand. - for (unsigned IM = (unsigned)ISD::PRE_INC; - IM != (unsigned)ISD::LAST_INDEXED_MODE; ++IM) { - setIndexedLoadAction(IM, (MVT::SimpleValueType)VT, Expand); - setIndexedStoreAction(IM, (MVT::SimpleValueType)VT, Expand); - } - - // These operations default to expand. - setOperationAction(ISD::FGETSIGN, (MVT::SimpleValueType)VT, Expand); - setOperationAction(ISD::CONCAT_VECTORS, (MVT::SimpleValueType)VT, Expand); - } + : TargetLoweringBase(tm, tlof) {} - // Most targets ignore the @llvm.prefetch intrinsic. - setOperationAction(ISD::PREFETCH, MVT::Other, Expand); - - // ConstantFP nodes default to expand. Targets can either change this to - // Legal, in which case all fp constants are legal, or use isFPImmLegal() - // to optimize expansions for certain constants. - setOperationAction(ISD::ConstantFP, MVT::f16, Expand); - setOperationAction(ISD::ConstantFP, MVT::f32, Expand); - setOperationAction(ISD::ConstantFP, MVT::f64, Expand); - setOperationAction(ISD::ConstantFP, MVT::f80, Expand); - setOperationAction(ISD::ConstantFP, MVT::f128, Expand); - - // These library functions default to expand. 
- setOperationAction(ISD::FLOG , MVT::f16, Expand); - setOperationAction(ISD::FLOG2, MVT::f16, Expand); - setOperationAction(ISD::FLOG10, MVT::f16, Expand); - setOperationAction(ISD::FEXP , MVT::f16, Expand); - setOperationAction(ISD::FEXP2, MVT::f16, Expand); - setOperationAction(ISD::FFLOOR, MVT::f16, Expand); - setOperationAction(ISD::FNEARBYINT, MVT::f16, Expand); - setOperationAction(ISD::FCEIL, MVT::f16, Expand); - setOperationAction(ISD::FRINT, MVT::f16, Expand); - setOperationAction(ISD::FTRUNC, MVT::f16, Expand); - setOperationAction(ISD::FLOG , MVT::f32, Expand); - setOperationAction(ISD::FLOG2, MVT::f32, Expand); - setOperationAction(ISD::FLOG10, MVT::f32, Expand); - setOperationAction(ISD::FEXP , MVT::f32, Expand); - setOperationAction(ISD::FEXP2, MVT::f32, Expand); - setOperationAction(ISD::FFLOOR, MVT::f32, Expand); - setOperationAction(ISD::FNEARBYINT, MVT::f32, Expand); - setOperationAction(ISD::FCEIL, MVT::f32, Expand); - setOperationAction(ISD::FRINT, MVT::f32, Expand); - setOperationAction(ISD::FTRUNC, MVT::f32, Expand); - setOperationAction(ISD::FLOG , MVT::f64, Expand); - setOperationAction(ISD::FLOG2, MVT::f64, Expand); - setOperationAction(ISD::FLOG10, MVT::f64, Expand); - setOperationAction(ISD::FEXP , MVT::f64, Expand); - setOperationAction(ISD::FEXP2, MVT::f64, Expand); - setOperationAction(ISD::FFLOOR, MVT::f64, Expand); - setOperationAction(ISD::FNEARBYINT, MVT::f64, Expand); - setOperationAction(ISD::FCEIL, MVT::f64, Expand); - setOperationAction(ISD::FRINT, MVT::f64, Expand); - setOperationAction(ISD::FTRUNC, MVT::f64, Expand); - setOperationAction(ISD::FLOG , MVT::f128, Expand); - setOperationAction(ISD::FLOG2, MVT::f128, Expand); - setOperationAction(ISD::FLOG10, MVT::f128, Expand); - setOperationAction(ISD::FEXP , MVT::f128, Expand); - setOperationAction(ISD::FEXP2, MVT::f128, Expand); - setOperationAction(ISD::FFLOOR, MVT::f128, Expand); - setOperationAction(ISD::FNEARBYINT, MVT::f128, Expand); - setOperationAction(ISD::FCEIL, MVT::f128, Expand); - setOperationAction(ISD::FRINT, MVT::f128, Expand); - setOperationAction(ISD::FTRUNC, MVT::f128, Expand); - - // Default ISD::TRAP to expand (which turns it into abort). - setOperationAction(ISD::TRAP, MVT::Other, Expand); - - // On most systems, DEBUGTRAP and TRAP have no difference. The "Expand" - // here is to inform DAG Legalizer to replace DEBUGTRAP with TRAP. 
- // - setOperationAction(ISD::DEBUGTRAP, MVT::Other, Expand); - - IsLittleEndian = TD->isLittleEndian(); - PointerTy = MVT::getIntegerVT(8*TD->getPointerSize(0)); - memset(RegClassForVT, 0,MVT::LAST_VALUETYPE*sizeof(TargetRegisterClass*)); - memset(TargetDAGCombineArray, 0, array_lengthof(TargetDAGCombineArray)); - maxStoresPerMemset = maxStoresPerMemcpy = maxStoresPerMemmove = 8; - maxStoresPerMemsetOptSize = maxStoresPerMemcpyOptSize - = maxStoresPerMemmoveOptSize = 4; - benefitFromCodePlacementOpt = false; - UseUnderscoreSetJmp = false; - UseUnderscoreLongJmp = false; - SelectIsExpensive = false; - IntDivIsCheap = false; - Pow2DivIsCheap = false; - JumpIsExpensive = false; - predictableSelectIsExpensive = false; - StackPointerRegisterToSaveRestore = 0; - ExceptionPointerRegister = 0; - ExceptionSelectorRegister = 0; - BooleanContents = UndefinedBooleanContent; - BooleanVectorContents = UndefinedBooleanContent; - SchedPreferenceInfo = Sched::ILP; - JumpBufSize = 0; - JumpBufAlignment = 0; - MinFunctionAlignment = 0; - PrefFunctionAlignment = 0; - PrefLoopAlignment = 0; - MinStackArgumentAlignment = 1; - ShouldFoldAtomicFences = false; - InsertFencesForAtomic = false; - SupportJumpTables = true; - MinimumJumpTableEntries = 4; - - InitLibcallNames(LibcallRoutineNames); - InitCmpLibcallCCs(CmpLibcallCCs); - InitLibcallCallingConvs(LibcallCallingConvs); -} - -TargetLowering::~TargetLowering() { - delete &TLOF; -} - -MVT TargetLowering::getShiftAmountTy(EVT LHSTy) const { - return MVT::getIntegerVT(8*TD->getPointerSize(0)); +const char *TargetLowering::getTargetNodeName(unsigned Opcode) const { + return NULL; } -/// canOpTrap - Returns true if the operation can trap for the value type. -/// VT must be a legal type. -bool TargetLowering::canOpTrap(unsigned Op, EVT VT) const { - assert(isTypeLegal(VT)); - switch (Op) { - default: +/// Check whether a given call node is in tail position within its function. If +/// so, it sets Chain to the input chain of the tail call. +bool TargetLowering::isInTailCallPosition(SelectionDAG &DAG, SDNode *Node, + SDValue &Chain) const { + const Function *F = DAG.getMachineFunction().getFunction(); + + // Conservatively require the attributes of the call to match those of + // the return. Ignore noalias because it doesn't affect the call sequence. + AttributeSet CallerAttrs = F->getAttributes(); + if (AttrBuilder(CallerAttrs, AttributeSet::ReturnIndex) + .removeAttribute(Attribute::NoAlias).hasAttributes()) return false; - case ISD::FDIV: - case ISD::FREM: - case ISD::SDIV: - case ISD::UDIV: - case ISD::SREM: - case ISD::UREM: - return true; - } -} + // It's not safe to eliminate the sign / zero extension of the return value. + if (CallerAttrs.hasAttribute(AttributeSet::ReturnIndex, Attribute::ZExt) || + CallerAttrs.hasAttribute(AttributeSet::ReturnIndex, Attribute::SExt)) + return false; -static unsigned getVectorTypeBreakdownMVT(MVT VT, MVT &IntermediateVT, - unsigned &NumIntermediates, - MVT &RegisterVT, - TargetLowering *TLI) { - // Figure out the right, legal destination reg to copy into. - unsigned NumElts = VT.getVectorNumElements(); - MVT EltTy = VT.getVectorElementType(); - - unsigned NumVectorRegs = 1; - - // FIXME: We don't support non-power-of-2-sized vectors for now. Ideally we - // could break down into LHS/RHS like LegalizeDAG does. - if (!isPowerOf2_32(NumElts)) { - NumVectorRegs = NumElts; - NumElts = 1; - } - - // Divide the input until we get to a supported size. This will always - // end with a scalar if the target doesn't support vectors. 
- while (NumElts > 1 && !TLI->isTypeLegal(MVT::getVectorVT(EltTy, NumElts))) { - NumElts >>= 1; - NumVectorRegs <<= 1; - } - - NumIntermediates = NumVectorRegs; - - MVT NewVT = MVT::getVectorVT(EltTy, NumElts); - if (!TLI->isTypeLegal(NewVT)) - NewVT = EltTy; - IntermediateVT = NewVT; - - unsigned NewVTSize = NewVT.getSizeInBits(); - - // Convert sizes such as i33 to i64. - if (!isPowerOf2_32(NewVTSize)) - NewVTSize = NextPowerOf2(NewVTSize); - - MVT DestVT = TLI->getRegisterType(NewVT); - RegisterVT = DestVT; - if (EVT(DestVT).bitsLT(NewVT)) // Value is expanded, e.g. i64 -> i16. - return NumVectorRegs*(NewVTSize/DestVT.getSizeInBits()); - - // Otherwise, promotion or legal types use the same number of registers as - // the vector decimated to the appropriate level. - return NumVectorRegs; -} - -/// isLegalRC - Return true if the value types that can be represented by the -/// specified register class are all legal. -bool TargetLowering::isLegalRC(const TargetRegisterClass *RC) const { - for (TargetRegisterClass::vt_iterator I = RC->vt_begin(), E = RC->vt_end(); - I != E; ++I) { - if (isTypeLegal(*I)) - return true; - } - return false; -} - -/// findRepresentativeClass - Return the largest legal super-reg register class -/// of the register class for the specified type and its associated "cost". -std::pair<const TargetRegisterClass*, uint8_t> -TargetLowering::findRepresentativeClass(MVT VT) const { - const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); - const TargetRegisterClass *RC = RegClassForVT[VT.SimpleTy]; - if (!RC) - return std::make_pair(RC, 0); - - // Compute the set of all super-register classes. - BitVector SuperRegRC(TRI->getNumRegClasses()); - for (SuperRegClassIterator RCI(RC, TRI); RCI.isValid(); ++RCI) - SuperRegRC.setBitsInMask(RCI.getMask()); - - // Find the first legal register class with the largest spill size. - const TargetRegisterClass *BestRC = RC; - for (int i = SuperRegRC.find_first(); i >= 0; i = SuperRegRC.find_next(i)) { - const TargetRegisterClass *SuperRC = TRI->getRegClass(i); - // We want the largest possible spill size. - if (SuperRC->getSize() <= BestRC->getSize()) - continue; - if (!isLegalRC(SuperRC)) - continue; - BestRC = SuperRC; - } - return std::make_pair(BestRC, 1); -} - -/// computeRegisterProperties - Once all of the register classes are added, -/// this allows us to compute derived properties we expose. -void TargetLowering::computeRegisterProperties() { - assert(MVT::LAST_VALUETYPE <= MVT::MAX_ALLOWED_VALUETYPE && - "Too many value types for ValueTypeActions to hold!"); - - // Everything defaults to needing one register. - for (unsigned i = 0; i != MVT::LAST_VALUETYPE; ++i) { - NumRegistersForVT[i] = 1; - RegisterTypeForVT[i] = TransformToType[i] = (MVT::SimpleValueType)i; - } - // ...except isVoid, which doesn't need any registers. - NumRegistersForVT[MVT::isVoid] = 0; - - // Find the largest integer register class. - unsigned LargestIntReg = MVT::LAST_INTEGER_VALUETYPE; - for (; RegClassForVT[LargestIntReg] == 0; --LargestIntReg) - assert(LargestIntReg != MVT::i1 && "No integer registers defined!"); - - // Every integer value type larger than this largest register takes twice as - // many registers to represent as the previous ValueType. 
- for (unsigned ExpandedReg = LargestIntReg + 1; - ExpandedReg <= MVT::LAST_INTEGER_VALUETYPE; ++ExpandedReg) { - NumRegistersForVT[ExpandedReg] = 2*NumRegistersForVT[ExpandedReg-1]; - RegisterTypeForVT[ExpandedReg] = (MVT::SimpleValueType)LargestIntReg; - TransformToType[ExpandedReg] = (MVT::SimpleValueType)(ExpandedReg - 1); - ValueTypeActions.setTypeAction((MVT::SimpleValueType)ExpandedReg, - TypeExpandInteger); - } - - // Inspect all of the ValueType's smaller than the largest integer - // register to see which ones need promotion. - unsigned LegalIntReg = LargestIntReg; - for (unsigned IntReg = LargestIntReg - 1; - IntReg >= (unsigned)MVT::i1; --IntReg) { - MVT IVT = (MVT::SimpleValueType)IntReg; - if (isTypeLegal(IVT)) { - LegalIntReg = IntReg; - } else { - RegisterTypeForVT[IntReg] = TransformToType[IntReg] = - (const MVT::SimpleValueType)LegalIntReg; - ValueTypeActions.setTypeAction(IVT, TypePromoteInteger); - } - } - - // ppcf128 type is really two f64's. - if (!isTypeLegal(MVT::ppcf128)) { - NumRegistersForVT[MVT::ppcf128] = 2*NumRegistersForVT[MVT::f64]; - RegisterTypeForVT[MVT::ppcf128] = MVT::f64; - TransformToType[MVT::ppcf128] = MVT::f64; - ValueTypeActions.setTypeAction(MVT::ppcf128, TypeExpandFloat); - } - - // Decide how to handle f64. If the target does not have native f64 support, - // expand it to i64 and we will be generating soft float library calls. - if (!isTypeLegal(MVT::f64)) { - NumRegistersForVT[MVT::f64] = NumRegistersForVT[MVT::i64]; - RegisterTypeForVT[MVT::f64] = RegisterTypeForVT[MVT::i64]; - TransformToType[MVT::f64] = MVT::i64; - ValueTypeActions.setTypeAction(MVT::f64, TypeSoftenFloat); - } - - // Decide how to handle f32. If the target does not have native support for - // f32, promote it to f64 if it is legal. Otherwise, expand it to i32. - if (!isTypeLegal(MVT::f32)) { - if (isTypeLegal(MVT::f64)) { - NumRegistersForVT[MVT::f32] = NumRegistersForVT[MVT::f64]; - RegisterTypeForVT[MVT::f32] = RegisterTypeForVT[MVT::f64]; - TransformToType[MVT::f32] = MVT::f64; - ValueTypeActions.setTypeAction(MVT::f32, TypePromoteInteger); - } else { - NumRegistersForVT[MVT::f32] = NumRegistersForVT[MVT::i32]; - RegisterTypeForVT[MVT::f32] = RegisterTypeForVT[MVT::i32]; - TransformToType[MVT::f32] = MVT::i32; - ValueTypeActions.setTypeAction(MVT::f32, TypeSoftenFloat); - } - } - - // Loop over all of the vector value types to see which need transformations. - for (unsigned i = MVT::FIRST_VECTOR_VALUETYPE; - i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++i) { - MVT VT = (MVT::SimpleValueType)i; - if (isTypeLegal(VT)) continue; - - // Determine if there is a legal wider type. If so, we should promote to - // that wider vector type. - MVT EltVT = VT.getVectorElementType(); - unsigned NElts = VT.getVectorNumElements(); - if (NElts != 1 && !shouldSplitVectorElementType(EltVT)) { - bool IsLegalWiderType = false; - // First try to promote the elements of integer vectors. If no legal - // promotion was found, fallback to the widen-vector method. - for (unsigned nVT = i+1; nVT <= MVT::LAST_VECTOR_VALUETYPE; ++nVT) { - MVT SVT = (MVT::SimpleValueType)nVT; - // Promote vectors of integers to vectors with the same number - // of elements, with a wider element type. 
- if (SVT.getVectorElementType().getSizeInBits() > EltVT.getSizeInBits() - && SVT.getVectorNumElements() == NElts && - isTypeLegal(SVT) && SVT.getScalarType().isInteger()) { - TransformToType[i] = SVT; - RegisterTypeForVT[i] = SVT; - NumRegistersForVT[i] = 1; - ValueTypeActions.setTypeAction(VT, TypePromoteInteger); - IsLegalWiderType = true; - break; - } - } - - if (IsLegalWiderType) continue; - - // Try to widen the vector. - for (unsigned nVT = i+1; nVT <= MVT::LAST_VECTOR_VALUETYPE; ++nVT) { - MVT SVT = (MVT::SimpleValueType)nVT; - if (SVT.getVectorElementType() == EltVT && - SVT.getVectorNumElements() > NElts && - isTypeLegal(SVT)) { - TransformToType[i] = SVT; - RegisterTypeForVT[i] = SVT; - NumRegistersForVT[i] = 1; - ValueTypeActions.setTypeAction(VT, TypeWidenVector); - IsLegalWiderType = true; - break; - } - } - if (IsLegalWiderType) continue; - } - - MVT IntermediateVT; - MVT RegisterVT; - unsigned NumIntermediates; - NumRegistersForVT[i] = - getVectorTypeBreakdownMVT(VT, IntermediateVT, NumIntermediates, - RegisterVT, this); - RegisterTypeForVT[i] = RegisterVT; - - MVT NVT = VT.getPow2VectorType(); - if (NVT == VT) { - // Type is already a power of 2. The default action is to split. - TransformToType[i] = MVT::Other; - unsigned NumElts = VT.getVectorNumElements(); - ValueTypeActions.setTypeAction(VT, - NumElts > 1 ? TypeSplitVector : TypeScalarizeVector); - } else { - TransformToType[i] = NVT; - ValueTypeActions.setTypeAction(VT, TypeWidenVector); - } - } - - // Determine the 'representative' register class for each value type. - // An representative register class is the largest (meaning one which is - // not a sub-register class / subreg register class) legal register class for - // a group of value types. For example, on i386, i8, i16, and i32 - // representative would be GR32; while on x86_64 it's GR64. - for (unsigned i = 0; i != MVT::LAST_VALUETYPE; ++i) { - const TargetRegisterClass* RRC; - uint8_t Cost; - tie(RRC, Cost) = findRepresentativeClass((MVT::SimpleValueType)i); - RepRegClassForVT[i] = RRC; - RepRegClassCostForVT[i] = Cost; - } + // Check if the only use is a function return node. + return isUsedByReturnOnly(Node, Chain); } -const char *TargetLowering::getTargetNodeName(unsigned Opcode) const { - return NULL; -} -EVT TargetLowering::getSetCCResultType(EVT VT) const { - assert(!VT.isVector() && "No default SetCC type for vectors!"); - return getPointerTy(0).SimpleTy; +/// Generate a libcall taking the given operands as arguments and returning a +/// result of type RetVT. 
+SDValue TargetLowering::makeLibCall(SelectionDAG &DAG, + RTLIB::Libcall LC, EVT RetVT, + const SDValue *Ops, unsigned NumOps, + bool isSigned, DebugLoc dl) const { + TargetLowering::ArgListTy Args; + Args.reserve(NumOps); + + TargetLowering::ArgListEntry Entry; + for (unsigned i = 0; i != NumOps; ++i) { + Entry.Node = Ops[i]; + Entry.Ty = Entry.Node.getValueType().getTypeForEVT(*DAG.getContext()); + Entry.isSExt = isSigned; + Entry.isZExt = !isSigned; + Args.push_back(Entry); + } + SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC), getPointerTy()); + + Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext()); + TargetLowering:: + CallLoweringInfo CLI(DAG.getEntryNode(), RetTy, isSigned, !isSigned, false, + false, 0, getLibcallCallingConv(LC), + /*isTailCall=*/false, + /*doesNotReturn=*/false, /*isReturnValueUsed=*/true, + Callee, Args, DAG, dl); + std::pair<SDValue,SDValue> CallInfo = LowerCallTo(CLI); + + return CallInfo.first; } -MVT::SimpleValueType TargetLowering::getCmpLibcallReturnType() const { - return MVT::i32; // return the default value -} -/// getVectorTypeBreakdown - Vector types are broken down into some number of -/// legal first class types. For example, MVT::v8f32 maps to 2 MVT::v4f32 -/// with Altivec or SSE1, or 8 promoted MVT::f64 values with the X86 FP stack. -/// Similarly, MVT::v2i64 turns into 4 MVT::i32 values with both PPC and X86. -/// -/// This method returns the number of registers needed, and the VT for each -/// register. It also returns the VT and quantity of the intermediate values -/// before they are promoted/expanded. -/// -unsigned TargetLowering::getVectorTypeBreakdown(LLVMContext &Context, EVT VT, - EVT &IntermediateVT, - unsigned &NumIntermediates, - MVT &RegisterVT) const { - unsigned NumElts = VT.getVectorNumElements(); - - // If there is a wider vector type with the same element type as this one, - // or a promoted vector type that has the same number of elements which - // are wider, then we should convert to that legal vector type. - // This handles things like <2 x float> -> <4 x float> and - // <4 x i1> -> <4 x i32>. - LegalizeTypeAction TA = getTypeAction(Context, VT); - if (NumElts != 1 && (TA == TypeWidenVector || TA == TypePromoteInteger)) { - EVT RegisterEVT = getTypeToTransformTo(Context, VT); - if (isTypeLegal(RegisterEVT)) { - IntermediateVT = RegisterEVT; - RegisterVT = RegisterEVT.getSimpleVT(); - NumIntermediates = 1; - return 1; +/// SoftenSetCCOperands - Soften the operands of a comparison. This code is +/// shared among BR_CC, SELECT_CC, and SETCC handlers. +void TargetLowering::softenSetCCOperands(SelectionDAG &DAG, EVT VT, + SDValue &NewLHS, SDValue &NewRHS, + ISD::CondCode &CCCode, + DebugLoc dl) const { + assert((VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f128) + && "Unsupported setcc type!"); + + // Expand into one or more soft-fp libcall(s). + RTLIB::Libcall LC1 = RTLIB::UNKNOWN_LIBCALL, LC2 = RTLIB::UNKNOWN_LIBCALL; + switch (CCCode) { + case ISD::SETEQ: + case ISD::SETOEQ: + LC1 = (VT == MVT::f32) ? RTLIB::OEQ_F32 : + (VT == MVT::f64) ? RTLIB::OEQ_F64 : RTLIB::OEQ_F128; + break; + case ISD::SETNE: + case ISD::SETUNE: + LC1 = (VT == MVT::f32) ? RTLIB::UNE_F32 : + (VT == MVT::f64) ? RTLIB::UNE_F64 : RTLIB::UNE_F128; + break; + case ISD::SETGE: + case ISD::SETOGE: + LC1 = (VT == MVT::f32) ? RTLIB::OGE_F32 : + (VT == MVT::f64) ? RTLIB::OGE_F64 : RTLIB::OGE_F128; + break; + case ISD::SETLT: + case ISD::SETOLT: + LC1 = (VT == MVT::f32) ? RTLIB::OLT_F32 : + (VT == MVT::f64) ? 
RTLIB::OLT_F64 : RTLIB::OLT_F128; + break; + case ISD::SETLE: + case ISD::SETOLE: + LC1 = (VT == MVT::f32) ? RTLIB::OLE_F32 : + (VT == MVT::f64) ? RTLIB::OLE_F64 : RTLIB::OLE_F128; + break; + case ISD::SETGT: + case ISD::SETOGT: + LC1 = (VT == MVT::f32) ? RTLIB::OGT_F32 : + (VT == MVT::f64) ? RTLIB::OGT_F64 : RTLIB::OGT_F128; + break; + case ISD::SETUO: + LC1 = (VT == MVT::f32) ? RTLIB::UO_F32 : + (VT == MVT::f64) ? RTLIB::UO_F64 : RTLIB::UO_F128; + break; + case ISD::SETO: + LC1 = (VT == MVT::f32) ? RTLIB::O_F32 : + (VT == MVT::f64) ? RTLIB::O_F64 : RTLIB::O_F128; + break; + default: + LC1 = (VT == MVT::f32) ? RTLIB::UO_F32 : + (VT == MVT::f64) ? RTLIB::UO_F64 : RTLIB::UO_F128; + switch (CCCode) { + case ISD::SETONE: + // SETONE = SETOLT | SETOGT + LC1 = (VT == MVT::f32) ? RTLIB::OLT_F32 : + (VT == MVT::f64) ? RTLIB::OLT_F64 : RTLIB::OLT_F128; + // Fallthrough + case ISD::SETUGT: + LC2 = (VT == MVT::f32) ? RTLIB::OGT_F32 : + (VT == MVT::f64) ? RTLIB::OGT_F64 : RTLIB::OGT_F128; + break; + case ISD::SETUGE: + LC2 = (VT == MVT::f32) ? RTLIB::OGE_F32 : + (VT == MVT::f64) ? RTLIB::OGE_F64 : RTLIB::OGE_F128; + break; + case ISD::SETULT: + LC2 = (VT == MVT::f32) ? RTLIB::OLT_F32 : + (VT == MVT::f64) ? RTLIB::OLT_F64 : RTLIB::OLT_F128; + break; + case ISD::SETULE: + LC2 = (VT == MVT::f32) ? RTLIB::OLE_F32 : + (VT == MVT::f64) ? RTLIB::OLE_F64 : RTLIB::OLE_F128; + break; + case ISD::SETUEQ: + LC2 = (VT == MVT::f32) ? RTLIB::OEQ_F32 : + (VT == MVT::f64) ? RTLIB::OEQ_F64 : RTLIB::OEQ_F128; + break; + default: llvm_unreachable("Do not know how to soften this setcc!"); } } - // Figure out the right, legal destination reg to copy into. - EVT EltTy = VT.getVectorElementType(); - - unsigned NumVectorRegs = 1; - - // FIXME: We don't support non-power-of-2-sized vectors for now. Ideally we - // could break down into LHS/RHS like LegalizeDAG does. - if (!isPowerOf2_32(NumElts)) { - NumVectorRegs = NumElts; - NumElts = 1; + // Use the target specific return value for comparisons lib calls. + EVT RetVT = getCmpLibcallReturnType(); + SDValue Ops[2] = { NewLHS, NewRHS }; + NewLHS = makeLibCall(DAG, LC1, RetVT, Ops, 2, false/*sign irrelevant*/, dl); + NewRHS = DAG.getConstant(0, RetVT); + CCCode = getCmpLibcallCC(LC1); + if (LC2 != RTLIB::UNKNOWN_LIBCALL) { + SDValue Tmp = DAG.getNode(ISD::SETCC, dl, getSetCCResultType(RetVT), + NewLHS, NewRHS, DAG.getCondCode(CCCode)); + NewLHS = makeLibCall(DAG, LC2, RetVT, Ops, 2, false/*sign irrelevant*/, dl); + NewLHS = DAG.getNode(ISD::SETCC, dl, getSetCCResultType(RetVT), NewLHS, + NewRHS, DAG.getCondCode(getCmpLibcallCC(LC2))); + NewLHS = DAG.getNode(ISD::OR, dl, Tmp.getValueType(), Tmp, NewLHS); + NewRHS = SDValue(); } - - // Divide the input until we get to a supported size. This will always - // end with a scalar if the target doesn't support vectors. - while (NumElts > 1 && !isTypeLegal( - EVT::getVectorVT(Context, EltTy, NumElts))) { - NumElts >>= 1; - NumVectorRegs <<= 1; - } - - NumIntermediates = NumVectorRegs; - - EVT NewVT = EVT::getVectorVT(Context, EltTy, NumElts); - if (!isTypeLegal(NewVT)) - NewVT = EltTy; - IntermediateVT = NewVT; - - MVT DestVT = getRegisterType(Context, NewVT); - RegisterVT = DestVT; - unsigned NewVTSize = NewVT.getSizeInBits(); - - // Convert sizes such as i33 to i64. - if (!isPowerOf2_32(NewVTSize)) - NewVTSize = NextPowerOf2(NewVTSize); - - if (EVT(DestVT).bitsLT(NewVT)) // Value is expanded, e.g. i64 -> i16.
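To make the two-libcall path above concrete: for ISD::SETUEQ ("unordered or equal") on f32, LC1 is RTLIB::UO_F32 and LC2 is RTLIB::OEQ_F32, and the two comparison results are joined with ISD::OR. A minimal sketch of the equivalent runtime behavior, assuming the libgcc/compiler-rt helpers that the libcall name table later in this patch assigns to those entries (__unordsf2 returns nonzero iff an operand is NaN; __eqsf2 returns zero iff neither operand is NaN and they compare equal):

// Sketch only: mirrors what the soft-float expansion of SETUEQ computes.
extern "C" int __unordsf2(float, float); // RTLIB::UO_F32, tested with SETNE 0
extern "C" int __eqsf2(float, float);    // RTLIB::OEQ_F32, tested with SETEQ 0

bool setueq_f32(float A, float B) {
  bool Unordered = __unordsf2(A, B) != 0; // first libcall (LC1)
  bool Equal     = __eqsf2(A, B) == 0;    // second libcall (LC2)
  return Unordered || Equal;              // joined with ISD::OR
}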
- return NumVectorRegs*(NewVTSize/DestVT.getSizeInBits()); - - // Otherwise, promotion or legal types use the same number of registers as - // the vector decimated to the appropriate level. - return NumVectorRegs; -} - -/// Get the EVTs and ArgFlags collections that represent the legalized return -/// type of the given function. This does not require a DAG or a return value, -/// and is suitable for use before any DAGs for the function are constructed. -/// TODO: Move this out of TargetLowering.cpp. -void llvm::GetReturnInfo(Type* ReturnType, AttributeSet attr, - SmallVectorImpl<ISD::OutputArg> &Outs, - const TargetLowering &TLI) { - SmallVector<EVT, 4> ValueVTs; - ComputeValueVTs(TLI, ReturnType, ValueVTs); - unsigned NumValues = ValueVTs.size(); - if (NumValues == 0) return; - - for (unsigned j = 0, f = NumValues; j != f; ++j) { - EVT VT = ValueVTs[j]; - ISD::NodeType ExtendKind = ISD::ANY_EXTEND; - - if (attr.hasAttribute(AttributeSet::ReturnIndex, Attribute::SExt)) - ExtendKind = ISD::SIGN_EXTEND; - else if (attr.hasAttribute(AttributeSet::ReturnIndex, Attribute::ZExt)) - ExtendKind = ISD::ZERO_EXTEND; - - // FIXME: C calling convention requires the return type to be promoted to - // at least 32-bit. But this is not necessary for non-C calling - // conventions. The frontend should mark functions whose return values - // require promoting with signext or zeroext attributes. - if (ExtendKind != ISD::ANY_EXTEND && VT.isInteger()) { - MVT MinVT = TLI.getRegisterType(ReturnType->getContext(), MVT::i32); - if (VT.bitsLT(MinVT)) - VT = MinVT; - } - - unsigned NumParts = TLI.getNumRegisters(ReturnType->getContext(), VT); - MVT PartVT = TLI.getRegisterType(ReturnType->getContext(), VT); - - // 'inreg' on function refers to return value - ISD::ArgFlagsTy Flags = ISD::ArgFlagsTy(); - if (attr.hasAttribute(AttributeSet::ReturnIndex, Attribute::InReg)) - Flags.setInReg(); - - // Propagate extension type if any - if (attr.hasAttribute(AttributeSet::ReturnIndex, Attribute::SExt)) - Flags.setSExt(); - else if (attr.hasAttribute(AttributeSet::ReturnIndex, Attribute::ZExt)) - Flags.setZExt(); - - for (unsigned i = 0; i < NumParts; ++i) - Outs.push_back(ISD::OutputArg(Flags, PartVT, /*isFixed=*/true, 0, 0)); - } -} - -/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate -/// function arguments in the caller parameter area. This is the actual -/// alignment, not its logarithm. 
-unsigned TargetLowering::getByValTypeAlignment(Type *Ty) const { - return TD->getCallFrameTypeAlignment(Ty); } /// getJumpTableEncoding - Return the entry encoding for a jump table in the @@ -1199,103 +253,6 @@ TargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { } //===----------------------------------------------------------------------===// -// TargetTransformInfo Helpers -//===----------------------------------------------------------------------===// - -int TargetLowering::InstructionOpcodeToISD(unsigned Opcode) const { - enum InstructionOpcodes { -#define HANDLE_INST(NUM, OPCODE, CLASS) OPCODE = NUM, -#define LAST_OTHER_INST(NUM) InstructionOpcodesCount = NUM -#include "llvm/IR/Instruction.def" - }; - switch (static_cast<InstructionOpcodes>(Opcode)) { - case Ret: return 0; - case Br: return 0; - case Switch: return 0; - case IndirectBr: return 0; - case Invoke: return 0; - case Resume: return 0; - case Unreachable: return 0; - case Add: return ISD::ADD; - case FAdd: return ISD::FADD; - case Sub: return ISD::SUB; - case FSub: return ISD::FSUB; - case Mul: return ISD::MUL; - case FMul: return ISD::FMUL; - case UDiv: return ISD::UDIV; - case SDiv: return ISD::UDIV; - case FDiv: return ISD::FDIV; - case URem: return ISD::UREM; - case SRem: return ISD::SREM; - case FRem: return ISD::FREM; - case Shl: return ISD::SHL; - case LShr: return ISD::SRL; - case AShr: return ISD::SRA; - case And: return ISD::AND; - case Or: return ISD::OR; - case Xor: return ISD::XOR; - case Alloca: return 0; - case Load: return ISD::LOAD; - case Store: return ISD::STORE; - case GetElementPtr: return 0; - case Fence: return 0; - case AtomicCmpXchg: return 0; - case AtomicRMW: return 0; - case Trunc: return ISD::TRUNCATE; - case ZExt: return ISD::ZERO_EXTEND; - case SExt: return ISD::SIGN_EXTEND; - case FPToUI: return ISD::FP_TO_UINT; - case FPToSI: return ISD::FP_TO_SINT; - case UIToFP: return ISD::UINT_TO_FP; - case SIToFP: return ISD::SINT_TO_FP; - case FPTrunc: return ISD::FP_ROUND; - case FPExt: return ISD::FP_EXTEND; - case PtrToInt: return ISD::BITCAST; - case IntToPtr: return ISD::BITCAST; - case BitCast: return ISD::BITCAST; - case ICmp: return ISD::SETCC; - case FCmp: return ISD::SETCC; - case PHI: return 0; - case Call: return 0; - case Select: return ISD::SELECT; - case UserOp1: return 0; - case UserOp2: return 0; - case VAArg: return 0; - case ExtractElement: return ISD::EXTRACT_VECTOR_ELT; - case InsertElement: return ISD::INSERT_VECTOR_ELT; - case ShuffleVector: return ISD::VECTOR_SHUFFLE; - case ExtractValue: return ISD::MERGE_VALUES; - case InsertValue: return ISD::MERGE_VALUES; - case LandingPad: return 0; - } - - llvm_unreachable("Unknown instruction type encountered!"); -} - -std::pair<unsigned, MVT> -TargetLowering::getTypeLegalizationCost(Type *Ty) const { - LLVMContext &C = Ty->getContext(); - EVT MTy = getValueType(Ty); - - unsigned Cost = 1; - // We keep legalizing the type until we find a legal kind. We assume that - // the only operation that costs anything is the split. After splitting - // we need to handle two types. - while (true) { - LegalizeKind LK = getTypeConversion(C, MTy); - - if (LK.first == TypeLegal) - return std::make_pair(Cost, MTy.getSimpleVT()); - - if (LK.first == TypeSplitVector || LK.first == TypeExpandInteger) - Cost *= 2; - - // Keep legalizing the type. 
- MTy = LK.second; - } -} - -//===----------------------------------------------------------------------===// // Optimization Methods //===----------------------------------------------------------------------===// @@ -2239,7 +1196,7 @@ TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, APInt newMask = APInt::getLowBitsSet(maskWidth, width); for (unsigned offset=0; offset<origWidth/width; offset++) { if ((newMask & Mask) == Mask) { - if (!TD->isLittleEndian()) + if (!getDataLayout()->isLittleEndian()) bestOffset = (origWidth/width - offset - 1) * (width/8); else bestOffset = (uint64_t)offset * (width/8); @@ -2913,7 +1870,9 @@ PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { TargetLowering::ConstraintType TargetLowering::getConstraintType(const std::string &Constraint) const { - if (Constraint.size() == 1) { + unsigned S = Constraint.size(); + + if (S == 1) { switch (Constraint[0]) { default: break; case 'r': return C_RegisterClass; @@ -2942,9 +1901,11 @@ TargetLowering::getConstraintType(const std::string &Constraint) const { } } - if (Constraint.size() > 1 && Constraint[0] == '{' && - Constraint[Constraint.size()-1] == '}') + if (S > 1 && Constraint[0] == '{' && Constraint[S-1] == '}') { + if (S == 8 && !Constraint.compare(1, 6, "memory", 6)) // "{memory}" + return C_Memory; return C_Register; + } return C_Unknown; } @@ -3040,7 +2001,7 @@ getRegForInlineAsmConstraint(const std::string &Constraint, std::make_pair(0u, static_cast<const TargetRegisterClass*>(0)); // Figure out which register class contains this reg. - const TargetRegisterInfo *RI = TM.getRegisterInfo(); + const TargetRegisterInfo *RI = getTargetMachine().getRegisterInfo(); for (TargetRegisterInfo::regclass_iterator RCI = RI->regclass_begin(), E = RI->regclass_end(); RCI != E; ++RCI) { const TargetRegisterClass *RC = *RCI; @@ -3077,7 +2038,7 @@ getRegForInlineAsmConstraint(const std::string &Constraint, /// a matching constraint like "4". bool TargetLowering::AsmOperandInfo::isMatchingInputConstraint() const { assert(!ConstraintCode.empty() && "No known constraint!"); - return isdigit(ConstraintCode[0]); + return isdigit(static_cast<unsigned char>(ConstraintCode[0])); } /// getMatchedOperand - If this is an input matching constraint, this method @@ -3164,7 +2125,7 @@ TargetLowering::AsmOperandInfoVector TargetLowering::ParseConstraints( // If OpTy is not a single value, it may be a struct/union that we // can tile with integers. if (!OpTy->isSingleValueType() && OpTy->isSized()) { - unsigned BitSize = TD->getTypeSizeInBits(OpTy); + unsigned BitSize = getDataLayout()->getTypeSizeInBits(OpTy); switch (BitSize) { default: break; case 1: @@ -3179,7 +2140,7 @@ TargetLowering::AsmOperandInfoVector TargetLowering::ParseConstraints( } } else if (PointerType *PT = dyn_cast<PointerType>(OpTy)) { OpInfo.ConstraintVT = MVT::getIntegerVT( - 8*TD->getPointerSize(PT->getAddressSpace())); + 8*getDataLayout()->getPointerSize(PT->getAddressSpace())); } else { OpInfo.ConstraintVT = MVT::getVT(OpTy, true); } @@ -3474,44 +2435,6 @@ void TargetLowering::ComputeConstraintToUse(AsmOperandInfo &OpInfo, } } -//===----------------------------------------------------------------------===// -// Loop Strength Reduction hooks -//===----------------------------------------------------------------------===// - -/// isLegalAddressingMode - Return true if the addressing mode represented -/// by AM is legal for this target, for a load/store of the specified type. 
-bool TargetLowering::isLegalAddressingMode(const AddrMode &AM, - Type *Ty) const { - // The default implementation of this implements a conservative RISCy, r+r and - // r+i addr mode. - - // Allows a sign-extended 16-bit immediate field. - if (AM.BaseOffs <= -(1LL << 16) || AM.BaseOffs >= (1LL << 16)-1) - return false; - - // No global is ever allowed as a base. - if (AM.BaseGV) - return false; - - // Only support r+r, - switch (AM.Scale) { - case 0: // "r+i" or just "i", depending on HasBaseReg. - break; - case 1: - if (AM.HasBaseReg && AM.BaseOffs) // "r+r+i" is not allowed. - return false; - // Otherwise we have r+r or r+i. - break; - case 2: - if (AM.HasBaseReg || AM.BaseOffs) // 2*r+r or 2*r+i is not allowed. - return false; - // Allow 2*r as r+r. - break; - } - - return true; -} - /// BuildExactSDIV - Given an exact SDIV by a constant, create a multiplication /// with the multiplicative inverse of the constant. SDValue TargetLowering::BuildExactSDIV(SDValue Op1, SDValue Op2, DebugLoc dl, diff --git a/lib/CodeGen/SjLjEHPrepare.cpp b/lib/CodeGen/SjLjEHPrepare.cpp index 09e923c..b58bb85 100644 --- a/lib/CodeGen/SjLjEHPrepare.cpp +++ b/lib/CodeGen/SjLjEHPrepare.cpp @@ -43,7 +43,7 @@ STATISTIC(NumSpilled, "Number of registers live across unwind edges"); namespace { class SjLjEHPrepare : public FunctionPass { - const TargetLowering *TLI; + const TargetLoweringBase *TLI; Type *FunctionContextTy; Constant *RegisterFn; Constant *UnregisterFn; @@ -58,7 +58,7 @@ namespace { AllocaInst *FuncCtx; public: static char ID; // Pass identification, replacement for typeid - explicit SjLjEHPrepare(const TargetLowering *tli = NULL) + explicit SjLjEHPrepare(const TargetLoweringBase *tli = NULL) : FunctionPass(ID), TLI(tli) { } bool doInitialization(Module &M); bool runOnFunction(Function &F); @@ -82,7 +82,7 @@ namespace { char SjLjEHPrepare::ID = 0; // Public Interface To the SjLjEHPrepare pass. -FunctionPass *llvm::createSjLjEHPreparePass(const TargetLowering *TLI) { +FunctionPass *llvm::createSjLjEHPreparePass(const TargetLoweringBase *TLI) { return new SjLjEHPrepare(TLI); } // doInitialization - Set up declarations and types needed to process diff --git a/lib/CodeGen/SlotIndexes.cpp b/lib/CodeGen/SlotIndexes.cpp index 95faafa..20049a8 100644 --- a/lib/CodeGen/SlotIndexes.cpp +++ b/lib/CodeGen/SlotIndexes.cpp @@ -142,6 +142,76 @@ void SlotIndexes::renumberIndexes(IndexList::iterator curItr) { ++NumLocalRenum; } +// Repair indexes after adding and removing instructions. +void SlotIndexes::repairIndexesInRange(MachineBasicBlock *MBB, + MachineBasicBlock::iterator Begin, + MachineBasicBlock::iterator End) { + // FIXME: Is this really necessary? The only caller repairIntervalsForRange() + // does the same thing. + // Find anchor points, which are at the beginning/end of blocks or at + // instructions that already have indexes. + while (Begin != MBB->begin() && !hasIndex(Begin)) + --Begin; + while (End != MBB->end() && !hasIndex(End)) + ++End; + + bool includeStart = (Begin == MBB->begin()); + SlotIndex startIdx; + if (includeStart) + startIdx = getMBBStartIdx(MBB); + else + startIdx = getInstructionIndex(Begin); + + SlotIndex endIdx; + if (End == MBB->end()) + endIdx = getMBBEndIdx(MBB); + else + endIdx = getInstructionIndex(End); + + // FIXME: Conceptually, this code is implementing an iterator on MBB that + // optionally includes an additional position prior to MBB->begin(), indicated + // by the includeStart flag.
This is done so that we can iterate MIs in a MBB + // in parallel with SlotIndexes, but there should be a better way to do this. + IndexList::iterator ListB = startIdx.listEntry(); + IndexList::iterator ListI = endIdx.listEntry(); + MachineBasicBlock::iterator MBBI = End; + bool pastStart = false; + while (ListI != ListB || MBBI != Begin || (includeStart && !pastStart)) { + assert(ListI->getIndex() >= startIdx.getIndex() && + (includeStart || !pastStart) && + "Decremented past the beginning of region to repair."); + + MachineInstr *SlotMI = ListI->getInstr(); + MachineInstr *MI = (MBBI != MBB->end() && !pastStart) ? MBBI : 0; + bool MBBIAtBegin = MBBI == Begin && (!includeStart || pastStart); + + if (SlotMI == MI && !MBBIAtBegin) { + --ListI; + if (MBBI != Begin) + --MBBI; + else + pastStart = true; + } else if (MI && mi2iMap.find(MI) == mi2iMap.end()) { + if (MBBI != Begin) + --MBBI; + else + pastStart = true; + } else { + --ListI; + if (SlotMI) + removeMachineInstrFromMaps(SlotMI); + } + } + + // In theory this could be combined with the previous loop, but it is tricky + // to update the IndexList while we are iterating it. + for (MachineBasicBlock::iterator I = End; I != Begin;) { + --I; + MachineInstr *MI = I; + if (!MI->isDebugValue() && mi2iMap.find(MI) == mi2iMap.end()) + insertMachineInstrInMaps(MI); + } +} #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void SlotIndexes::dump() const { diff --git a/lib/CodeGen/StackColoring.cpp b/lib/CodeGen/StackColoring.cpp index 42502eb..ec44b8c 100644 --- a/lib/CodeGen/StackColoring.cpp +++ b/lib/CodeGen/StackColoring.cpp @@ -23,7 +23,6 @@ #define DEBUG_TYPE "stackcoloring" #include "llvm/CodeGen/Passes.h" -#include "MachineTraceMetrics.h" #include "llvm/ADT/BitVector.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/PostOrderIterator.h" @@ -103,12 +102,13 @@ class StackColoring : public MachineFunctionPass { }; /// Maps active slots (per bit) for each basic block. - DenseMap<MachineBasicBlock*, BlockLifetimeInfo> BlockLiveness; + typedef DenseMap<const MachineBasicBlock*, BlockLifetimeInfo> LivenessMap; + LivenessMap BlockLiveness; /// Maps serial numbers to basic blocks. - DenseMap<MachineBasicBlock*, int> BasicBlocks; + DenseMap<const MachineBasicBlock*, int> BasicBlocks; /// Maps basic blocks to a serial number. - SmallVector<MachineBasicBlock*, 8> BasicBlockNumbering; + SmallVector<const MachineBasicBlock*, 8> BasicBlockNumbering; /// Maps liveness intervals for each slot. SmallVector<LiveInterval*, 16> Intervals; @@ -145,7 +145,7 @@ public: private: /// Debug. - void dump(); + void dump() const; /// Removes all of the lifetime marker instructions from the function. /// \returns true if any markers were removed. 
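Before moving on, a hypothetical caller's view of the repairIndexesInRange entry point added to SlotIndexes above: a pass performs local instruction surgery and then asks for just that range to be renumbered. Everything here except repairIndexesInRange itself is an illustrative name:

// Sketch only: repair slot indexes after editing a range of instructions.
// (Assumes the usual llvm/CodeGen/SlotIndexes.h include.)
void editAndRepair(llvm::SlotIndexes &Indexes, llvm::MachineBasicBlock *MBB,
                   llvm::MachineBasicBlock::iterator Begin,
                   llvm::MachineBasicBlock::iterator End) {
  // ... insert and/or erase MachineInstrs within [Begin, End) here ...
  Indexes.repairIndexesInRange(MBB, Begin, End); // renumber only this range
}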
@@ -200,31 +200,35 @@ void StackColoring::getAnalysisUsage(AnalysisUsage &AU) const { MachineFunctionPass::getAnalysisUsage(AU); } -void StackColoring::dump() { +void StackColoring::dump() const { for (df_iterator<MachineFunction*> FI = df_begin(MF), FE = df_end(MF); FI != FE; ++FI) { - unsigned Num = BasicBlocks[*FI]; - DEBUG(dbgs()<<"Inspecting block #"<<Num<<" ["<<FI->getName()<<"]\n"); - Num = 0; + DEBUG(dbgs()<<"Inspecting block #"<<BasicBlocks.lookup(*FI)<< + " ["<<FI->getName()<<"]\n"); + + LivenessMap::const_iterator BI = BlockLiveness.find(*FI); + assert(BI != BlockLiveness.end() && "Block not found"); + const BlockLifetimeInfo &BlockInfo = BI->second; + DEBUG(dbgs()<<"BEGIN : {"); - for (unsigned i=0; i < BlockLiveness[*FI].Begin.size(); ++i) - DEBUG(dbgs()<<BlockLiveness[*FI].Begin.test(i)<<" "); + for (unsigned i=0; i < BlockInfo.Begin.size(); ++i) + DEBUG(dbgs()<<BlockInfo.Begin.test(i)<<" "); DEBUG(dbgs()<<"}\n"); DEBUG(dbgs()<<"END : {"); - for (unsigned i=0; i < BlockLiveness[*FI].End.size(); ++i) - DEBUG(dbgs()<<BlockLiveness[*FI].End.test(i)<<" "); + for (unsigned i=0; i < BlockInfo.End.size(); ++i) + DEBUG(dbgs()<<BlockInfo.End.test(i)<<" "); DEBUG(dbgs()<<"}\n"); DEBUG(dbgs()<<"LIVE_IN: {"); - for (unsigned i=0; i < BlockLiveness[*FI].LiveIn.size(); ++i) - DEBUG(dbgs()<<BlockLiveness[*FI].LiveIn.test(i)<<" "); + for (unsigned i=0; i < BlockInfo.LiveIn.size(); ++i) + DEBUG(dbgs()<<BlockInfo.LiveIn.test(i)<<" "); DEBUG(dbgs()<<"}\n"); DEBUG(dbgs()<<"LIVEOUT: {"); - for (unsigned i=0; i < BlockLiveness[*FI].LiveOut.size(); ++i) - DEBUG(dbgs()<<BlockLiveness[*FI].LiveOut.test(i)<<" "); + for (unsigned i=0; i < BlockInfo.LiveOut.size(); ++i) + DEBUG(dbgs()<<BlockInfo.LiveOut.test(i)<<" "); DEBUG(dbgs()<<"}\n"); } } @@ -242,8 +246,11 @@ unsigned StackColoring::collectMarkers(unsigned NumSlot) { BasicBlocks[*FI] = BasicBlockNumbering.size(); BasicBlockNumbering.push_back(*FI); - BlockLiveness[*FI].Begin.resize(NumSlot); - BlockLiveness[*FI].End.resize(NumSlot); + // Keep a reference to avoid repeated lookups. + BlockLifetimeInfo &BlockInfo = BlockLiveness[*FI]; + + BlockInfo.Begin.resize(NumSlot); + BlockInfo.End.resize(NumSlot); for (MachineBasicBlock::iterator BI = (*FI)->begin(), BE = (*FI)->end(); BI != BE; ++BI) { @@ -255,7 +262,7 @@ unsigned StackColoring::collectMarkers(unsigned NumSlot) { Markers.push_back(BI); bool IsStart = BI->getOpcode() == TargetOpcode::LIFETIME_START; - MachineOperand &MI = BI->getOperand(0); + const MachineOperand &MI = BI->getOperand(0); unsigned Slot = MI.getIndex(); MarkersFound++; @@ -267,15 +274,15 @@ unsigned StackColoring::collectMarkers(unsigned NumSlot) { } if (IsStart) { - BlockLiveness[*FI].Begin.set(Slot); + BlockInfo.Begin.set(Slot); } else { - if (BlockLiveness[*FI].Begin.test(Slot)) { + if (BlockInfo.Begin.test(Slot)) { // Allocas that start and end within a single block are handled // specially when computing the LiveIntervals to avoid pessimizing // the liveness propagation. - BlockLiveness[*FI].Begin.reset(Slot); + BlockInfo.Begin.reset(Slot); } else { - BlockLiveness[*FI].End.set(Slot); + BlockInfo.End.set(Slot); } } } @@ -292,47 +299,58 @@ void StackColoring::calculateLocalLiveness() { // formulation, and END is equivalent to GEN. The result of this computation // is a map from blocks to bitvectors where the bitvectors represent which // allocas are live in/out of that block. 
- SmallPtrSet<MachineBasicBlock*, 8> BBSet(BasicBlockNumbering.begin(), - BasicBlockNumbering.end()); + SmallPtrSet<const MachineBasicBlock*, 8> BBSet(BasicBlockNumbering.begin(), + BasicBlockNumbering.end()); unsigned NumSSMIters = 0; bool changed = true; while (changed) { changed = false; ++NumSSMIters; - SmallPtrSet<MachineBasicBlock*, 8> NextBBSet; + SmallPtrSet<const MachineBasicBlock*, 8> NextBBSet; - for (SmallVector<MachineBasicBlock*, 8>::iterator + for (SmallVector<const MachineBasicBlock*, 8>::iterator PI = BasicBlockNumbering.begin(), PE = BasicBlockNumbering.end(); PI != PE; ++PI) { - MachineBasicBlock *BB = *PI; + const MachineBasicBlock *BB = *PI; if (!BBSet.count(BB)) continue; + // Use an iterator to avoid repeated lookups. + LivenessMap::iterator BI = BlockLiveness.find(BB); + assert(BI != BlockLiveness.end() && "Block not found"); + BlockLifetimeInfo &BlockInfo = BI->second; + BitVector LocalLiveIn; BitVector LocalLiveOut; // Forward propagation from begins to ends. - for (MachineBasicBlock::pred_iterator PI = BB->pred_begin(), - PE = BB->pred_end(); PI != PE; ++PI) - LocalLiveIn |= BlockLiveness[*PI].LiveOut; - LocalLiveIn |= BlockLiveness[BB].End; - LocalLiveIn.reset(BlockLiveness[BB].Begin); + for (MachineBasicBlock::const_pred_iterator PI = BB->pred_begin(), + PE = BB->pred_end(); PI != PE; ++PI) { + LivenessMap::const_iterator I = BlockLiveness.find(*PI); + assert(I != BlockLiveness.end() && "Predecessor not found"); + LocalLiveIn |= I->second.LiveOut; + } + LocalLiveIn |= BlockInfo.End; + LocalLiveIn.reset(BlockInfo.Begin); // Reverse propagation from ends to begins. - for (MachineBasicBlock::succ_iterator SI = BB->succ_begin(), - SE = BB->succ_end(); SI != SE; ++SI) - LocalLiveOut |= BlockLiveness[*SI].LiveIn; - LocalLiveOut |= BlockLiveness[BB].Begin; - LocalLiveOut.reset(BlockLiveness[BB].End); + for (MachineBasicBlock::const_succ_iterator SI = BB->succ_begin(), + SE = BB->succ_end(); SI != SE; ++SI) { + LivenessMap::const_iterator I = BlockLiveness.find(*SI); + assert(I != BlockLiveness.end() && "Successor not found"); + LocalLiveOut |= I->second.LiveIn; + } + LocalLiveOut |= BlockInfo.Begin; + LocalLiveOut.reset(BlockInfo.End); LocalLiveIn |= LocalLiveOut; LocalLiveOut |= LocalLiveIn; // After adopting the live bits, we need to turn off the bits which // are deactivated in this block. - LocalLiveOut.reset(BlockLiveness[BB].End); - LocalLiveIn.reset(BlockLiveness[BB].Begin); + LocalLiveOut.reset(BlockInfo.End); + LocalLiveIn.reset(BlockInfo.Begin); // If we have both BEGIN and END markers in the same basic block then // we know that the BEGIN marker comes after the END, because we already @@ -341,25 +359,25 @@ void StackColoring::calculateLocalLiveness() { // Want to enable the LIVE_IN and LIVE_OUT of slots that have both // BEGIN and END because it means that the value lives before and after // this basic block.
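Viewed abstractly, the propagation above is a standard bit-vector data-flow fixed point in which a slot's lifetime END acts as GEN and its BEGIN acts as KILL. A simplified stand-alone sketch of that iteration follows; std::bitset stands in for LLVM's BitVector, and the real pass additionally merges the two sets and re-enables slots whose END precedes their BEGIN within the same block:

#include <bitset>
#include <vector>

const unsigned NumSlots = 4; // illustrative slot count

struct Block {
  std::vector<int> Preds, Succs;         // CFG edges by block index
  std::bitset<NumSlots> Begin, End;      // lifetime markers seen in the block
  std::bitset<NumSlots> LiveIn, LiveOut; // solution, initially empty
};

// LiveIn  = (union of pred LiveOut | End)   & ~Begin
// LiveOut = (union of succ LiveIn  | Begin) & ~End
void solve(std::vector<Block> &Blocks) {
  for (bool Changed = true; Changed;) {
    Changed = false;
    for (Block &B : Blocks) {
      std::bitset<NumSlots> In, Out;
      for (int P : B.Preds) In |= Blocks[P].LiveOut;
      In |= B.End;
      In &= ~B.Begin;
      for (int S : B.Succs) Out |= Blocks[S].LiveIn;
      Out |= B.Begin;
      Out &= ~B.End;
      if ((B.LiveIn | In) != B.LiveIn)    { B.LiveIn |= In;   Changed = true; }
      if ((B.LiveOut | Out) != B.LiveOut) { B.LiveOut |= Out; Changed = true; }
    }
  }
}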
- BitVector LocalEndBegin = BlockLiveness[BB].End; - LocalEndBegin &= BlockLiveness[BB].Begin; + BitVector LocalEndBegin = BlockInfo.End; + LocalEndBegin &= BlockInfo.Begin; LocalLiveIn |= LocalEndBegin; LocalLiveOut |= LocalEndBegin; - if (LocalLiveIn.test(BlockLiveness[BB].LiveIn)) { + if (LocalLiveIn.test(BlockInfo.LiveIn)) { changed = true; - BlockLiveness[BB].LiveIn |= LocalLiveIn; + BlockInfo.LiveIn |= LocalLiveIn; - for (MachineBasicBlock::pred_iterator PI = BB->pred_begin(), + for (MachineBasicBlock::const_pred_iterator PI = BB->pred_begin(), PE = BB->pred_end(); PI != PE; ++PI) NextBBSet.insert(*PI); } - if (LocalLiveOut.test(BlockLiveness[BB].LiveOut)) { + if (LocalLiveOut.test(BlockInfo.LiveOut)) { changed = true; - BlockLiveness[BB].LiveOut |= LocalLiveOut; + BlockInfo.LiveOut |= LocalLiveOut; - for (MachineBasicBlock::succ_iterator SI = BB->succ_begin(), + for (MachineBasicBlock::const_succ_iterator SI = BB->succ_begin(), SE = BB->succ_end(); SI != SE; ++SI) NextBBSet.insert(*SI); } @@ -383,9 +401,9 @@ void StackColoring::calculateLiveIntervals(unsigned NumSlots) { Finishes.resize(NumSlots); // Create the interval for the basic blocks with lifetime markers in them. - for (SmallVector<MachineInstr*, 8>::iterator it = Markers.begin(), + for (SmallVectorImpl<MachineInstr*>::const_iterator it = Markers.begin(), e = Markers.end(); it != e; ++it) { - MachineInstr *MI = *it; + const MachineInstr *MI = *it; if (MI->getParent() != MBB) continue; @@ -394,7 +412,7 @@ void StackColoring::calculateLiveIntervals(unsigned NumSlots) { "Invalid Lifetime marker"); bool IsStart = MI->getOpcode() == TargetOpcode::LIFETIME_START; - MachineOperand &Mo = MI->getOperand(0); + const MachineOperand &Mo = MI->getOperand(0); int Slot = Mo.getIndex(); assert(Slot >= 0 && "Invalid slot"); @@ -481,7 +499,7 @@ void StackColoring::remapInstructions(DenseMap<int, int> &SlotRemap) { // Keep a list of *allocas* which need to be remapped. DenseMap<const AllocaInst*, const AllocaInst*> Allocas; - for (DenseMap<int, int>::iterator it = SlotRemap.begin(), + for (DenseMap<int, int>::const_iterator it = SlotRemap.begin(), e = SlotRemap.end(); it != e; ++it) { const AllocaInst *From = MFI->getObjectAllocation(it->first); const AllocaInst *To = MFI->getObjectAllocation(it->second); @@ -576,8 +594,8 @@ void StackColoring::remapInstructions(DenseMap<int, int> &SlotRemap) { } void StackColoring::removeInvalidSlotRanges() { - MachineFunction::iterator BB, BBE; - MachineBasicBlock::iterator I, IE; + MachineFunction::const_iterator BB, BBE; + MachineBasicBlock::const_iterator I, IE; for (BB = MF->begin(), BBE = MF->end(); BB != BBE; ++BB) for (I = BB->begin(), IE = BB->end(); I != IE; ++I) { @@ -596,7 +614,7 @@ void StackColoring::removeInvalidSlotRanges() { // Check all of the machine operands. 
for (unsigned i = 0 ; i < I->getNumOperands(); ++i) { - MachineOperand &MO = I->getOperand(i); + const MachineOperand &MO = I->getOperand(i); if (!MO.isFI()) continue; diff --git a/lib/CodeGen/StackProtector.cpp b/lib/CodeGen/StackProtector.cpp index 665388b..f3be37c 100644 --- a/lib/CodeGen/StackProtector.cpp +++ b/lib/CodeGen/StackProtector.cpp @@ -16,6 +16,8 @@ #define DEBUG_TYPE "stack-protector" #include "llvm/CodeGen/Passes.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/Statistic.h" #include "llvm/ADT/Triple.h" #include "llvm/Analysis/Dominators.h" #include "llvm/IR/Attributes.h" @@ -32,17 +34,27 @@ #include "llvm/Target/TargetOptions.h" using namespace llvm; +STATISTIC(NumFunProtected, "Number of functions protected"); +STATISTIC(NumAddrTaken, "Number of local variables that have their address" + " taken."); + namespace { class StackProtector : public FunctionPass { /// TLI - Keep a pointer to a TargetLowering to consult for determining /// target type sizes. - const TargetLowering *TLI; + const TargetLoweringBase *TLI; Function *F; Module *M; DominatorTree *DT; + /// VisitedPHIs - The set of PHI nodes visited when determining + /// if a variable's address has been taken. This set + /// is maintained to ensure we don't visit the same PHI node multiple + /// times. + SmallPtrSet<const PHINode*, 16> VisitedPHIs; + /// InsertStackProtectors - Insert code into the prologue and epilogue of /// the function. /// @@ -58,17 +70,21 @@ namespace { /// ContainsProtectableArray - Check whether the type either is an array or /// contains an array of sufficient size so that we need stack protectors /// for it. - bool ContainsProtectableArray(Type *Ty, bool InStruct = false) const; + bool ContainsProtectableArray(Type *Ty, bool Strong = false, + bool InStruct = false) const; + + /// \brief Check whether a stack allocation has its address taken. + bool HasAddressTaken(const Instruction *AI); /// RequiresStackProtector - Check whether or not this function needs a /// stack protector based upon the stack protector level. - bool RequiresStackProtector() const; + bool RequiresStackProtector(); public: static char ID; // Pass identification, replacement for typeid. StackProtector() : FunctionPass(ID), TLI(0) { initializeStackProtectorPass(*PassRegistry::getPassRegistry()); } - StackProtector(const TargetLowering *tli) + StackProtector(const TargetLoweringBase *tli) : FunctionPass(ID), TLI(tli) { initializeStackProtectorPass(*PassRegistry::getPassRegistry()); } @@ -85,7 +101,7 @@ char StackProtector::ID = 0; INITIALIZE_PASS(StackProtector, "stack-protector", "Insert stack protectors", false, false) -FunctionPass *llvm::createStackProtectorPass(const TargetLowering *tli) { +FunctionPass *llvm::createStackProtectorPass(const TargetLoweringBase *tli) { return new StackProtector(tli); } @@ -96,15 +112,21 @@ bool StackProtector::runOnFunction(Function &Fn) { if (!RequiresStackProtector()) return false; + ++NumFunProtected; return InsertStackProtectors(); } /// ContainsProtectableArray - Check whether the type either is an array or /// contains a char array of sufficient size so that we need stack protectors /// for it.
-bool StackProtector::ContainsProtectableArray(Type *Ty, bool InStruct) const { +bool StackProtector::ContainsProtectableArray(Type *Ty, bool Strong, + bool InStruct) const { if (!Ty) return false; if (ArrayType *AT = dyn_cast<ArrayType>(Ty)) { + // In strong mode, any array, regardless of type and size, triggers a + // protector. + if (Strong) + return true; const TargetMachine &TM = TLI->getTargetMachine(); if (!AT->getElementType()->isIntegerTy(8)) { Triple Trip(TM.getTargetTriple()); @@ -126,39 +148,103 @@ bool StackProtector::ContainsProtectableArray(Type *Ty, bool InStruct) const { for (StructType::element_iterator I = ST->element_begin(), E = ST->element_end(); I != E; ++I) - if (ContainsProtectableArray(*I, true)) + if (ContainsProtectableArray(*I, Strong, true)) return true; return false; } -/// RequiresStackProtector - Check whether or not this function needs a stack -/// protector based upon the stack protector level. The heuristic we use is to -/// add a guard variable to functions that call alloca, and functions with -/// buffers larger than SSPBufferSize bytes. -bool StackProtector::RequiresStackProtector() const { +bool StackProtector::HasAddressTaken(const Instruction *AI) { + for (Value::const_use_iterator UI = AI->use_begin(), UE = AI->use_end(); + UI != UE; ++UI) { + const User *U = *UI; + if (const StoreInst *SI = dyn_cast<StoreInst>(U)) { + if (AI == SI->getValueOperand()) + return true; + } else if (const PtrToIntInst *SI = dyn_cast<PtrToIntInst>(U)) { + if (AI == SI->getOperand(0)) + return true; + } else if (isa<CallInst>(U)) { + return true; + } else if (isa<InvokeInst>(U)) { + return true; + } else if (const SelectInst *SI = dyn_cast<SelectInst>(U)) { + if (HasAddressTaken(SI)) + return true; + } else if (const PHINode *PN = dyn_cast<PHINode>(U)) { + // Keep track of what PHI nodes we have already visited to ensure + // they are only visited once. + if (VisitedPHIs.insert(PN)) + if (HasAddressTaken(PN)) + return true; + } else if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(U)) { + if (HasAddressTaken(GEP)) + return true; + } else if (const BitCastInst *BI = dyn_cast<BitCastInst>(U)) { + if (HasAddressTaken(BI)) + return true; + } + } + return false; +} + +/// \brief Check whether or not this function needs a stack protector based +/// upon the stack protector level. +/// +/// We use two heuristics: a standard one (ssp) and a strong one (sspstrong). +/// The standard heuristic will add a guard variable to functions that +/// call alloca with either a variable size or a size >= SSPBufferSize, +/// functions with character buffers larger than SSPBufferSize, and functions +/// with aggregates containing character buffers larger than SSPBufferSize. The +/// strong heuristic will add a guard variable to functions that call alloca +/// regardless of size, functions with any buffer regardless of type and size, +/// functions with aggregates that contain any buffer regardless of type and +/// size, and functions that contain stack-based variables that have had their +/// address taken.
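As a concrete illustration of the two heuristics just documented (the implementation follows below), consider which locals trip each mode. The numbers assume the common default SSPBufferSize of 8; the threshold is a TargetOptions knob, so treat them as illustrative, and note that the element-type check above carves out platform-specific exceptions for non-char arrays:

// char array >= SSPBufferSize: protected under both ssp and sspstrong.
void standard_and_strong() {
  char Buf[16];
  (void)Buf;
}

// Caught only by sspstrong on typical targets: an address-taken scalar
// and a small non-char array are invisible to the standard heuristic.
void strong_only(int **Out) {
  int Local = 0;
  *Out = &Local;  // address taken: HasAddressTaken sees the store of &Local
  short Small[2]; // any array triggers the strong heuristic
  (void)Small;
}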
+bool StackProtector::RequiresStackProtector() { + bool Strong = false; if (F->getAttributes().hasAttribute(AttributeSet::FunctionIndex, Attribute::StackProtectReq)) return true; - - if (!F->getAttributes().hasAttribute(AttributeSet::FunctionIndex, - Attribute::StackProtect)) + else if (F->getAttributes().hasAttribute(AttributeSet::FunctionIndex, + Attribute::StackProtectStrong)) + Strong = true; + else if (!F->getAttributes().hasAttribute(AttributeSet::FunctionIndex, + Attribute::StackProtect)) return false; for (Function::iterator I = F->begin(), E = F->end(); I != E; ++I) { BasicBlock *BB = I; for (BasicBlock::iterator - II = BB->begin(), IE = BB->end(); II != IE; ++II) + II = BB->begin(), IE = BB->end(); II != IE; ++II) { if (AllocaInst *AI = dyn_cast<AllocaInst>(II)) { - if (AI->isArrayAllocation()) - // This is a call to alloca with a variable size. Emit stack - // protectors. + if (AI->isArrayAllocation()) { + // SSP-Strong: Enable protectors for any call to alloca, regardless + // of size. + if (Strong) + return true; + + if (const ConstantInt *CI = + dyn_cast<ConstantInt>(AI->getArraySize())) { + unsigned BufferSize = TLI->getTargetMachine().Options.SSPBufferSize; + if (CI->getLimitedValue(BufferSize) >= BufferSize) + // A call to alloca with size >= SSPBufferSize requires + // stack protectors. + return true; + } else // A call to alloca with a variable size requires protectors. + return true; + } + + if (ContainsProtectableArray(AI->getAllocatedType(), Strong)) return true; - if (ContainsProtectableArray(AI->getAllocatedType())) + if (Strong && HasAddressTaken(AI)) { + ++NumAddrTaken; return true; + } } + } } return false; diff --git a/lib/CodeGen/TargetInstrInfo.cpp b/lib/CodeGen/TargetInstrInfo.cpp index d5fbf14..20eb918 100644 --- a/lib/CodeGen/TargetInstrInfo.cpp +++ b/lib/CodeGen/TargetInstrInfo.cpp @@ -80,7 +80,7 @@ unsigned TargetInstrInfo::getInlineAsmLength(const char *Str, if (*Str == '\n' || strncmp(Str, MAI.getSeparatorString(), strlen(MAI.getSeparatorString())) == 0) atInsnStart = true; - if (atInsnStart && !std::isspace(*Str)) { + if (atInsnStart && !std::isspace(static_cast<unsigned char>(*Str))) { Length += MAI.getMaxInstLength(); atInsnStart = false; } diff --git a/lib/CodeGen/TargetLoweringBase.cpp b/lib/CodeGen/TargetLoweringBase.cpp new file mode 100644 index 0000000..2a02f6a --- /dev/null +++ b/lib/CodeGen/TargetLoweringBase.cpp @@ -0,0 +1,1290 @@ +//===-- TargetLoweringBase.cpp - Implement the TargetLoweringBase class ---===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This implements the TargetLoweringBase class. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Target/TargetLowering.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/Triple.h" +#include "llvm/CodeGen/Analysis.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineJumpTableInfo.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Target/TargetLoweringObjectFile.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include <cctype> +using namespace llvm; + +/// InitLibcallNames - Set default libcall names. +/// +static void InitLibcallNames(const char **Names, const TargetMachine &TM) { + Names[RTLIB::SHL_I16] = "__ashlhi3"; + Names[RTLIB::SHL_I32] = "__ashlsi3"; + Names[RTLIB::SHL_I64] = "__ashldi3"; + Names[RTLIB::SHL_I128] = "__ashlti3"; + Names[RTLIB::SRL_I16] = "__lshrhi3"; + Names[RTLIB::SRL_I32] = "__lshrsi3"; + Names[RTLIB::SRL_I64] = "__lshrdi3"; + Names[RTLIB::SRL_I128] = "__lshrti3"; + Names[RTLIB::SRA_I16] = "__ashrhi3"; + Names[RTLIB::SRA_I32] = "__ashrsi3"; + Names[RTLIB::SRA_I64] = "__ashrdi3"; + Names[RTLIB::SRA_I128] = "__ashrti3"; + Names[RTLIB::MUL_I8] = "__mulqi3"; + Names[RTLIB::MUL_I16] = "__mulhi3"; + Names[RTLIB::MUL_I32] = "__mulsi3"; + Names[RTLIB::MUL_I64] = "__muldi3"; + Names[RTLIB::MUL_I128] = "__multi3"; + Names[RTLIB::MULO_I32] = "__mulosi4"; + Names[RTLIB::MULO_I64] = "__mulodi4"; + Names[RTLIB::MULO_I128] = "__muloti4"; + Names[RTLIB::SDIV_I8] = "__divqi3"; + Names[RTLIB::SDIV_I16] = "__divhi3"; + Names[RTLIB::SDIV_I32] = "__divsi3"; + Names[RTLIB::SDIV_I64] = "__divdi3"; + Names[RTLIB::SDIV_I128] = "__divti3"; + Names[RTLIB::UDIV_I8] = "__udivqi3"; + Names[RTLIB::UDIV_I16] = "__udivhi3"; + Names[RTLIB::UDIV_I32] = "__udivsi3"; + Names[RTLIB::UDIV_I64] = "__udivdi3"; + Names[RTLIB::UDIV_I128] = "__udivti3"; + Names[RTLIB::SREM_I8] = "__modqi3"; + Names[RTLIB::SREM_I16] = "__modhi3"; + Names[RTLIB::SREM_I32] = "__modsi3"; + Names[RTLIB::SREM_I64] = "__moddi3"; + Names[RTLIB::SREM_I128] = "__modti3"; + Names[RTLIB::UREM_I8] = "__umodqi3"; + Names[RTLIB::UREM_I16] = "__umodhi3"; + Names[RTLIB::UREM_I32] = "__umodsi3"; + Names[RTLIB::UREM_I64] = "__umoddi3"; + Names[RTLIB::UREM_I128] = "__umodti3"; + + // These are generally not available. 
+ Names[RTLIB::SDIVREM_I8] = 0; + Names[RTLIB::SDIVREM_I16] = 0; + Names[RTLIB::SDIVREM_I32] = 0; + Names[RTLIB::SDIVREM_I64] = 0; + Names[RTLIB::SDIVREM_I128] = 0; + Names[RTLIB::UDIVREM_I8] = 0; + Names[RTLIB::UDIVREM_I16] = 0; + Names[RTLIB::UDIVREM_I32] = 0; + Names[RTLIB::UDIVREM_I64] = 0; + Names[RTLIB::UDIVREM_I128] = 0; + + Names[RTLIB::NEG_I32] = "__negsi2"; + Names[RTLIB::NEG_I64] = "__negdi2"; + Names[RTLIB::ADD_F32] = "__addsf3"; + Names[RTLIB::ADD_F64] = "__adddf3"; + Names[RTLIB::ADD_F80] = "__addxf3"; + Names[RTLIB::ADD_F128] = "__addtf3"; + Names[RTLIB::ADD_PPCF128] = "__gcc_qadd"; + Names[RTLIB::SUB_F32] = "__subsf3"; + Names[RTLIB::SUB_F64] = "__subdf3"; + Names[RTLIB::SUB_F80] = "__subxf3"; + Names[RTLIB::SUB_F128] = "__subtf3"; + Names[RTLIB::SUB_PPCF128] = "__gcc_qsub"; + Names[RTLIB::MUL_F32] = "__mulsf3"; + Names[RTLIB::MUL_F64] = "__muldf3"; + Names[RTLIB::MUL_F80] = "__mulxf3"; + Names[RTLIB::MUL_F128] = "__multf3"; + Names[RTLIB::MUL_PPCF128] = "__gcc_qmul"; + Names[RTLIB::DIV_F32] = "__divsf3"; + Names[RTLIB::DIV_F64] = "__divdf3"; + Names[RTLIB::DIV_F80] = "__divxf3"; + Names[RTLIB::DIV_F128] = "__divtf3"; + Names[RTLIB::DIV_PPCF128] = "__gcc_qdiv"; + Names[RTLIB::REM_F32] = "fmodf"; + Names[RTLIB::REM_F64] = "fmod"; + Names[RTLIB::REM_F80] = "fmodl"; + Names[RTLIB::REM_F128] = "fmodl"; + Names[RTLIB::REM_PPCF128] = "fmodl"; + Names[RTLIB::FMA_F32] = "fmaf"; + Names[RTLIB::FMA_F64] = "fma"; + Names[RTLIB::FMA_F80] = "fmal"; + Names[RTLIB::FMA_F128] = "fmal"; + Names[RTLIB::FMA_PPCF128] = "fmal"; + Names[RTLIB::POWI_F32] = "__powisf2"; + Names[RTLIB::POWI_F64] = "__powidf2"; + Names[RTLIB::POWI_F80] = "__powixf2"; + Names[RTLIB::POWI_F128] = "__powitf2"; + Names[RTLIB::POWI_PPCF128] = "__powitf2"; + Names[RTLIB::SQRT_F32] = "sqrtf"; + Names[RTLIB::SQRT_F64] = "sqrt"; + Names[RTLIB::SQRT_F80] = "sqrtl"; + Names[RTLIB::SQRT_F128] = "sqrtl"; + Names[RTLIB::SQRT_PPCF128] = "sqrtl"; + Names[RTLIB::LOG_F32] = "logf"; + Names[RTLIB::LOG_F64] = "log"; + Names[RTLIB::LOG_F80] = "logl"; + Names[RTLIB::LOG_F128] = "logl"; + Names[RTLIB::LOG_PPCF128] = "logl"; + Names[RTLIB::LOG2_F32] = "log2f"; + Names[RTLIB::LOG2_F64] = "log2"; + Names[RTLIB::LOG2_F80] = "log2l"; + Names[RTLIB::LOG2_F128] = "log2l"; + Names[RTLIB::LOG2_PPCF128] = "log2l"; + Names[RTLIB::LOG10_F32] = "log10f"; + Names[RTLIB::LOG10_F64] = "log10"; + Names[RTLIB::LOG10_F80] = "log10l"; + Names[RTLIB::LOG10_F128] = "log10l"; + Names[RTLIB::LOG10_PPCF128] = "log10l"; + Names[RTLIB::EXP_F32] = "expf"; + Names[RTLIB::EXP_F64] = "exp"; + Names[RTLIB::EXP_F80] = "expl"; + Names[RTLIB::EXP_F128] = "expl"; + Names[RTLIB::EXP_PPCF128] = "expl"; + Names[RTLIB::EXP2_F32] = "exp2f"; + Names[RTLIB::EXP2_F64] = "exp2"; + Names[RTLIB::EXP2_F80] = "exp2l"; + Names[RTLIB::EXP2_F128] = "exp2l"; + Names[RTLIB::EXP2_PPCF128] = "exp2l"; + Names[RTLIB::SIN_F32] = "sinf"; + Names[RTLIB::SIN_F64] = "sin"; + Names[RTLIB::SIN_F80] = "sinl"; + Names[RTLIB::SIN_F128] = "sinl"; + Names[RTLIB::SIN_PPCF128] = "sinl"; + Names[RTLIB::COS_F32] = "cosf"; + Names[RTLIB::COS_F64] = "cos"; + Names[RTLIB::COS_F80] = "cosl"; + Names[RTLIB::COS_F128] = "cosl"; + Names[RTLIB::COS_PPCF128] = "cosl"; + Names[RTLIB::POW_F32] = "powf"; + Names[RTLIB::POW_F64] = "pow"; + Names[RTLIB::POW_F80] = "powl"; + Names[RTLIB::POW_F128] = "powl"; + Names[RTLIB::POW_PPCF128] = "powl"; + Names[RTLIB::CEIL_F32] = "ceilf"; + Names[RTLIB::CEIL_F64] = "ceil"; + Names[RTLIB::CEIL_F80] = "ceill"; + Names[RTLIB::CEIL_F128] = "ceill"; + Names[RTLIB::CEIL_PPCF128] = "ceill"; 
+  Names[RTLIB::TRUNC_F32] = "truncf";
+  Names[RTLIB::TRUNC_F64] = "trunc";
+  Names[RTLIB::TRUNC_F80] = "truncl";
+  Names[RTLIB::TRUNC_F128] = "truncl";
+  Names[RTLIB::TRUNC_PPCF128] = "truncl";
+  Names[RTLIB::RINT_F32] = "rintf";
+  Names[RTLIB::RINT_F64] = "rint";
+  Names[RTLIB::RINT_F80] = "rintl";
+  Names[RTLIB::RINT_F128] = "rintl";
+  Names[RTLIB::RINT_PPCF128] = "rintl";
+  Names[RTLIB::NEARBYINT_F32] = "nearbyintf";
+  Names[RTLIB::NEARBYINT_F64] = "nearbyint";
+  Names[RTLIB::NEARBYINT_F80] = "nearbyintl";
+  Names[RTLIB::NEARBYINT_F128] = "nearbyintl";
+  Names[RTLIB::NEARBYINT_PPCF128] = "nearbyintl";
+  Names[RTLIB::FLOOR_F32] = "floorf";
+  Names[RTLIB::FLOOR_F64] = "floor";
+  Names[RTLIB::FLOOR_F80] = "floorl";
+  Names[RTLIB::FLOOR_F128] = "floorl";
+  Names[RTLIB::FLOOR_PPCF128] = "floorl";
+  Names[RTLIB::COPYSIGN_F32] = "copysignf";
+  Names[RTLIB::COPYSIGN_F64] = "copysign";
+  Names[RTLIB::COPYSIGN_F80] = "copysignl";
+  Names[RTLIB::COPYSIGN_F128] = "copysignl";
+  Names[RTLIB::COPYSIGN_PPCF128] = "copysignl";
+  Names[RTLIB::FPEXT_F64_F128] = "__extenddftf2";
+  Names[RTLIB::FPEXT_F32_F128] = "__extendsftf2";
+  Names[RTLIB::FPEXT_F32_F64] = "__extendsfdf2";
+  Names[RTLIB::FPEXT_F16_F32] = "__gnu_h2f_ieee";
+  Names[RTLIB::FPROUND_F32_F16] = "__gnu_f2h_ieee";
+  Names[RTLIB::FPROUND_F64_F32] = "__truncdfsf2";
+  Names[RTLIB::FPROUND_F80_F32] = "__truncxfsf2";
+  Names[RTLIB::FPROUND_F128_F32] = "__trunctfsf2";
+  Names[RTLIB::FPROUND_PPCF128_F32] = "__trunctfsf2";
+  Names[RTLIB::FPROUND_F80_F64] = "__truncxfdf2";
+  Names[RTLIB::FPROUND_F128_F64] = "__trunctfdf2";
+  Names[RTLIB::FPROUND_PPCF128_F64] = "__trunctfdf2";
+  Names[RTLIB::FPTOSINT_F32_I8] = "__fixsfqi";
+  Names[RTLIB::FPTOSINT_F32_I16] = "__fixsfhi";
+  Names[RTLIB::FPTOSINT_F32_I32] = "__fixsfsi";
+  Names[RTLIB::FPTOSINT_F32_I64] = "__fixsfdi";
+  Names[RTLIB::FPTOSINT_F32_I128] = "__fixsfti";
+  Names[RTLIB::FPTOSINT_F64_I8] = "__fixdfqi";
+  Names[RTLIB::FPTOSINT_F64_I16] = "__fixdfhi";
+  Names[RTLIB::FPTOSINT_F64_I32] = "__fixdfsi";
+  Names[RTLIB::FPTOSINT_F64_I64] = "__fixdfdi";
+  Names[RTLIB::FPTOSINT_F64_I128] = "__fixdfti";
+  Names[RTLIB::FPTOSINT_F80_I32] = "__fixxfsi";
+  Names[RTLIB::FPTOSINT_F80_I64] = "__fixxfdi";
+  Names[RTLIB::FPTOSINT_F80_I128] = "__fixxfti";
+  Names[RTLIB::FPTOSINT_F128_I32] = "__fixtfsi";
+  Names[RTLIB::FPTOSINT_F128_I64] = "__fixtfdi";
+  Names[RTLIB::FPTOSINT_F128_I128] = "__fixtfti";
+  Names[RTLIB::FPTOSINT_PPCF128_I32] = "__fixtfsi";
+  Names[RTLIB::FPTOSINT_PPCF128_I64] = "__fixtfdi";
+  Names[RTLIB::FPTOSINT_PPCF128_I128] = "__fixtfti";
+  Names[RTLIB::FPTOUINT_F32_I8] = "__fixunssfqi";
+  Names[RTLIB::FPTOUINT_F32_I16] = "__fixunssfhi";
+  Names[RTLIB::FPTOUINT_F32_I32] = "__fixunssfsi";
+  Names[RTLIB::FPTOUINT_F32_I64] = "__fixunssfdi";
+  Names[RTLIB::FPTOUINT_F32_I128] = "__fixunssfti";
+  Names[RTLIB::FPTOUINT_F64_I8] = "__fixunsdfqi";
+  Names[RTLIB::FPTOUINT_F64_I16] = "__fixunsdfhi";
+  Names[RTLIB::FPTOUINT_F64_I32] = "__fixunsdfsi";
+  Names[RTLIB::FPTOUINT_F64_I64] = "__fixunsdfdi";
+  Names[RTLIB::FPTOUINT_F64_I128] = "__fixunsdfti";
+  Names[RTLIB::FPTOUINT_F80_I32] = "__fixunsxfsi";
+  Names[RTLIB::FPTOUINT_F80_I64] = "__fixunsxfdi";
+  Names[RTLIB::FPTOUINT_F80_I128] = "__fixunsxfti";
+  Names[RTLIB::FPTOUINT_F128_I32] = "__fixunstfsi";
+  Names[RTLIB::FPTOUINT_F128_I64] = "__fixunstfdi";
+  Names[RTLIB::FPTOUINT_F128_I128] = "__fixunstfti";
+  Names[RTLIB::FPTOUINT_PPCF128_I32] = "__fixunstfsi";
+  Names[RTLIB::FPTOUINT_PPCF128_I64] = "__fixunstfdi";
+  Names[RTLIB::FPTOUINT_PPCF128_I128] = "__fixunstfti";
+  Names[RTLIB::SINTTOFP_I32_F32] = "__floatsisf";
+  Names[RTLIB::SINTTOFP_I32_F64] = "__floatsidf";
+  Names[RTLIB::SINTTOFP_I32_F80] = "__floatsixf";
+  Names[RTLIB::SINTTOFP_I32_F128] = "__floatsitf";
+  Names[RTLIB::SINTTOFP_I32_PPCF128] = "__floatsitf";
+  Names[RTLIB::SINTTOFP_I64_F32] = "__floatdisf";
+  Names[RTLIB::SINTTOFP_I64_F64] = "__floatdidf";
+  Names[RTLIB::SINTTOFP_I64_F80] = "__floatdixf";
+  Names[RTLIB::SINTTOFP_I64_F128] = "__floatditf";
+  Names[RTLIB::SINTTOFP_I64_PPCF128] = "__floatditf";
+  Names[RTLIB::SINTTOFP_I128_F32] = "__floattisf";
+  Names[RTLIB::SINTTOFP_I128_F64] = "__floattidf";
+  Names[RTLIB::SINTTOFP_I128_F80] = "__floattixf";
+  Names[RTLIB::SINTTOFP_I128_F128] = "__floattitf";
+  Names[RTLIB::SINTTOFP_I128_PPCF128] = "__floattitf";
+  Names[RTLIB::UINTTOFP_I32_F32] = "__floatunsisf";
+  Names[RTLIB::UINTTOFP_I32_F64] = "__floatunsidf";
+  Names[RTLIB::UINTTOFP_I32_F80] = "__floatunsixf";
+  Names[RTLIB::UINTTOFP_I32_F128] = "__floatunsitf";
+  Names[RTLIB::UINTTOFP_I32_PPCF128] = "__floatunsitf";
+  Names[RTLIB::UINTTOFP_I64_F32] = "__floatundisf";
+  Names[RTLIB::UINTTOFP_I64_F64] = "__floatundidf";
+  Names[RTLIB::UINTTOFP_I64_F80] = "__floatundixf";
+  Names[RTLIB::UINTTOFP_I64_F128] = "__floatunditf";
+  Names[RTLIB::UINTTOFP_I64_PPCF128] = "__floatunditf";
+  Names[RTLIB::UINTTOFP_I128_F32] = "__floatuntisf";
+  Names[RTLIB::UINTTOFP_I128_F64] = "__floatuntidf";
+  Names[RTLIB::UINTTOFP_I128_F80] = "__floatuntixf";
+  Names[RTLIB::UINTTOFP_I128_F128] = "__floatuntitf";
+  Names[RTLIB::UINTTOFP_I128_PPCF128] = "__floatuntitf";
+  Names[RTLIB::OEQ_F32] = "__eqsf2";
+  Names[RTLIB::OEQ_F64] = "__eqdf2";
+  Names[RTLIB::OEQ_F128] = "__eqtf2";
+  Names[RTLIB::UNE_F32] = "__nesf2";
+  Names[RTLIB::UNE_F64] = "__nedf2";
+  Names[RTLIB::UNE_F128] = "__netf2";
+  Names[RTLIB::OGE_F32] = "__gesf2";
+  Names[RTLIB::OGE_F64] = "__gedf2";
+  Names[RTLIB::OGE_F128] = "__getf2";
+  Names[RTLIB::OLT_F32] = "__ltsf2";
+  Names[RTLIB::OLT_F64] = "__ltdf2";
+  Names[RTLIB::OLT_F128] = "__lttf2";
+  Names[RTLIB::OLE_F32] = "__lesf2";
+  Names[RTLIB::OLE_F64] = "__ledf2";
+  Names[RTLIB::OLE_F128] = "__letf2";
+  Names[RTLIB::OGT_F32] = "__gtsf2";
+  Names[RTLIB::OGT_F64] = "__gtdf2";
+  Names[RTLIB::OGT_F128] = "__gttf2";
+  Names[RTLIB::UO_F32] = "__unordsf2";
+  Names[RTLIB::UO_F64] = "__unorddf2";
+  Names[RTLIB::UO_F128] = "__unordtf2";
+  Names[RTLIB::O_F32] = "__unordsf2";
+  Names[RTLIB::O_F64] = "__unorddf2";
+  Names[RTLIB::O_F128] = "__unordtf2";
+  Names[RTLIB::MEMCPY] = "memcpy";
+  Names[RTLIB::MEMMOVE] = "memmove";
+  Names[RTLIB::MEMSET] = "memset";
+  Names[RTLIB::UNWIND_RESUME] = "_Unwind_Resume";
+  Names[RTLIB::SYNC_VAL_COMPARE_AND_SWAP_1] = "__sync_val_compare_and_swap_1";
+  Names[RTLIB::SYNC_VAL_COMPARE_AND_SWAP_2] = "__sync_val_compare_and_swap_2";
+  Names[RTLIB::SYNC_VAL_COMPARE_AND_SWAP_4] = "__sync_val_compare_and_swap_4";
+  Names[RTLIB::SYNC_VAL_COMPARE_AND_SWAP_8] = "__sync_val_compare_and_swap_8";
+  Names[RTLIB::SYNC_LOCK_TEST_AND_SET_1] = "__sync_lock_test_and_set_1";
+  Names[RTLIB::SYNC_LOCK_TEST_AND_SET_2] = "__sync_lock_test_and_set_2";
+  Names[RTLIB::SYNC_LOCK_TEST_AND_SET_4] = "__sync_lock_test_and_set_4";
+  Names[RTLIB::SYNC_LOCK_TEST_AND_SET_8] = "__sync_lock_test_and_set_8";
+  Names[RTLIB::SYNC_FETCH_AND_ADD_1] = "__sync_fetch_and_add_1";
+  Names[RTLIB::SYNC_FETCH_AND_ADD_2] = "__sync_fetch_and_add_2";
+  Names[RTLIB::SYNC_FETCH_AND_ADD_4] = "__sync_fetch_and_add_4";
+  Names[RTLIB::SYNC_FETCH_AND_ADD_8] = "__sync_fetch_and_add_8";
+  Names[RTLIB::SYNC_FETCH_AND_SUB_1] = "__sync_fetch_and_sub_1";
+  Names[RTLIB::SYNC_FETCH_AND_SUB_2] = "__sync_fetch_and_sub_2";
+  Names[RTLIB::SYNC_FETCH_AND_SUB_4] = "__sync_fetch_and_sub_4";
+  Names[RTLIB::SYNC_FETCH_AND_SUB_8] = "__sync_fetch_and_sub_8";
+  Names[RTLIB::SYNC_FETCH_AND_AND_1] = "__sync_fetch_and_and_1";
+  Names[RTLIB::SYNC_FETCH_AND_AND_2] = "__sync_fetch_and_and_2";
+  Names[RTLIB::SYNC_FETCH_AND_AND_4] = "__sync_fetch_and_and_4";
+  Names[RTLIB::SYNC_FETCH_AND_AND_8] = "__sync_fetch_and_and_8";
+  Names[RTLIB::SYNC_FETCH_AND_OR_1] = "__sync_fetch_and_or_1";
+  Names[RTLIB::SYNC_FETCH_AND_OR_2] = "__sync_fetch_and_or_2";
+  Names[RTLIB::SYNC_FETCH_AND_OR_4] = "__sync_fetch_and_or_4";
+  Names[RTLIB::SYNC_FETCH_AND_OR_8] = "__sync_fetch_and_or_8";
+  Names[RTLIB::SYNC_FETCH_AND_XOR_1] = "__sync_fetch_and_xor_1";
+  Names[RTLIB::SYNC_FETCH_AND_XOR_2] = "__sync_fetch_and_xor_2";
+  Names[RTLIB::SYNC_FETCH_AND_XOR_4] = "__sync_fetch_and_xor_4";
+  Names[RTLIB::SYNC_FETCH_AND_XOR_8] = "__sync_fetch_and_xor_8";
+  Names[RTLIB::SYNC_FETCH_AND_NAND_1] = "__sync_fetch_and_nand_1";
+  Names[RTLIB::SYNC_FETCH_AND_NAND_2] = "__sync_fetch_and_nand_2";
+  Names[RTLIB::SYNC_FETCH_AND_NAND_4] = "__sync_fetch_and_nand_4";
+  Names[RTLIB::SYNC_FETCH_AND_NAND_8] = "__sync_fetch_and_nand_8";
+
+  if (Triple(TM.getTargetTriple()).getEnvironment() == Triple::GNU) {
+    Names[RTLIB::SINCOS_F32] = "sincosf";
+    Names[RTLIB::SINCOS_F64] = "sincos";
+    Names[RTLIB::SINCOS_F80] = "sincosl";
+    Names[RTLIB::SINCOS_F128] = "sincosl";
+    Names[RTLIB::SINCOS_PPCF128] = "sincosl";
+  } else {
+    // These are generally not available.
+    Names[RTLIB::SINCOS_F32] = 0;
+    Names[RTLIB::SINCOS_F64] = 0;
+    Names[RTLIB::SINCOS_F80] = 0;
+    Names[RTLIB::SINCOS_F128] = 0;
+    Names[RTLIB::SINCOS_PPCF128] = 0;
+  }
+}
+
+/// InitLibcallCallingConvs - Set default libcall CallingConvs.
+///
+static void InitLibcallCallingConvs(CallingConv::ID *CCs) {
+  for (int i = 0; i < RTLIB::UNKNOWN_LIBCALL; ++i) {
+    CCs[i] = CallingConv::C;
+  }
+}
+
+/// getFPEXT - Return the FPEXT_*_* value for the given types, or
+/// UNKNOWN_LIBCALL if there is none.
+RTLIB::Libcall RTLIB::getFPEXT(EVT OpVT, EVT RetVT) {
+  if (OpVT == MVT::f32) {
+    if (RetVT == MVT::f64)
+      return FPEXT_F32_F64;
+    if (RetVT == MVT::f128)
+      return FPEXT_F32_F128;
+  } else if (OpVT == MVT::f64) {
+    if (RetVT == MVT::f128)
+      return FPEXT_F64_F128;
+  }
+
+  return UNKNOWN_LIBCALL;
+}
+
+/// getFPROUND - Return the FPROUND_*_* value for the given types, or
+/// UNKNOWN_LIBCALL if there is none.
+RTLIB::Libcall RTLIB::getFPROUND(EVT OpVT, EVT RetVT) {
+  if (RetVT == MVT::f32) {
+    if (OpVT == MVT::f64)
+      return FPROUND_F64_F32;
+    if (OpVT == MVT::f80)
+      return FPROUND_F80_F32;
+    if (OpVT == MVT::f128)
+      return FPROUND_F128_F32;
+    if (OpVT == MVT::ppcf128)
+      return FPROUND_PPCF128_F32;
+  } else if (RetVT == MVT::f64) {
+    if (OpVT == MVT::f80)
+      return FPROUND_F80_F64;
+    if (OpVT == MVT::f128)
+      return FPROUND_F128_F64;
+    if (OpVT == MVT::ppcf128)
+      return FPROUND_PPCF128_F64;
+  }
+
+  return UNKNOWN_LIBCALL;
+}
+
+RTLIB::Libcall RTLIB::getFPTOSINT(EVT OpVT, EVT RetVT) { + if (OpVT == MVT::f32) { + if (RetVT == MVT::i8) + return FPTOSINT_F32_I8; + if (RetVT == MVT::i16) + return FPTOSINT_F32_I16; + if (RetVT == MVT::i32) + return FPTOSINT_F32_I32; + if (RetVT == MVT::i64) + return FPTOSINT_F32_I64; + if (RetVT == MVT::i128) + return FPTOSINT_F32_I128; + } else if (OpVT == MVT::f64) { + if (RetVT == MVT::i8) + return FPTOSINT_F64_I8; + if (RetVT == MVT::i16) + return FPTOSINT_F64_I16; + if (RetVT == MVT::i32) + return FPTOSINT_F64_I32; + if (RetVT == MVT::i64) + return FPTOSINT_F64_I64; + if (RetVT == MVT::i128) + return FPTOSINT_F64_I128; + } else if (OpVT == MVT::f80) { + if (RetVT == MVT::i32) + return FPTOSINT_F80_I32; + if (RetVT == MVT::i64) + return FPTOSINT_F80_I64; + if (RetVT == MVT::i128) + return FPTOSINT_F80_I128; + } else if (OpVT == MVT::f128) { + if (RetVT == MVT::i32) + return FPTOSINT_F128_I32; + if (RetVT == MVT::i64) + return FPTOSINT_F128_I64; + if (RetVT == MVT::i128) + return FPTOSINT_F128_I128; + } else if (OpVT == MVT::ppcf128) { + if (RetVT == MVT::i32) + return FPTOSINT_PPCF128_I32; + if (RetVT == MVT::i64) + return FPTOSINT_PPCF128_I64; + if (RetVT == MVT::i128) + return FPTOSINT_PPCF128_I128; + } + return UNKNOWN_LIBCALL; +} + +/// getFPTOUINT - Return the FPTOUINT_*_* value for the given types, or +/// UNKNOWN_LIBCALL if there is none. +RTLIB::Libcall RTLIB::getFPTOUINT(EVT OpVT, EVT RetVT) { + if (OpVT == MVT::f32) { + if (RetVT == MVT::i8) + return FPTOUINT_F32_I8; + if (RetVT == MVT::i16) + return FPTOUINT_F32_I16; + if (RetVT == MVT::i32) + return FPTOUINT_F32_I32; + if (RetVT == MVT::i64) + return FPTOUINT_F32_I64; + if (RetVT == MVT::i128) + return FPTOUINT_F32_I128; + } else if (OpVT == MVT::f64) { + if (RetVT == MVT::i8) + return FPTOUINT_F64_I8; + if (RetVT == MVT::i16) + return FPTOUINT_F64_I16; + if (RetVT == MVT::i32) + return FPTOUINT_F64_I32; + if (RetVT == MVT::i64) + return FPTOUINT_F64_I64; + if (RetVT == MVT::i128) + return FPTOUINT_F64_I128; + } else if (OpVT == MVT::f80) { + if (RetVT == MVT::i32) + return FPTOUINT_F80_I32; + if (RetVT == MVT::i64) + return FPTOUINT_F80_I64; + if (RetVT == MVT::i128) + return FPTOUINT_F80_I128; + } else if (OpVT == MVT::f128) { + if (RetVT == MVT::i32) + return FPTOUINT_F128_I32; + if (RetVT == MVT::i64) + return FPTOUINT_F128_I64; + if (RetVT == MVT::i128) + return FPTOUINT_F128_I128; + } else if (OpVT == MVT::ppcf128) { + if (RetVT == MVT::i32) + return FPTOUINT_PPCF128_I32; + if (RetVT == MVT::i64) + return FPTOUINT_PPCF128_I64; + if (RetVT == MVT::i128) + return FPTOUINT_PPCF128_I128; + } + return UNKNOWN_LIBCALL; +} + +/// getSINTTOFP - Return the SINTTOFP_*_* value for the given types, or +/// UNKNOWN_LIBCALL if there is none. 
+RTLIB::Libcall RTLIB::getSINTTOFP(EVT OpVT, EVT RetVT) { + if (OpVT == MVT::i32) { + if (RetVT == MVT::f32) + return SINTTOFP_I32_F32; + if (RetVT == MVT::f64) + return SINTTOFP_I32_F64; + if (RetVT == MVT::f80) + return SINTTOFP_I32_F80; + if (RetVT == MVT::f128) + return SINTTOFP_I32_F128; + if (RetVT == MVT::ppcf128) + return SINTTOFP_I32_PPCF128; + } else if (OpVT == MVT::i64) { + if (RetVT == MVT::f32) + return SINTTOFP_I64_F32; + if (RetVT == MVT::f64) + return SINTTOFP_I64_F64; + if (RetVT == MVT::f80) + return SINTTOFP_I64_F80; + if (RetVT == MVT::f128) + return SINTTOFP_I64_F128; + if (RetVT == MVT::ppcf128) + return SINTTOFP_I64_PPCF128; + } else if (OpVT == MVT::i128) { + if (RetVT == MVT::f32) + return SINTTOFP_I128_F32; + if (RetVT == MVT::f64) + return SINTTOFP_I128_F64; + if (RetVT == MVT::f80) + return SINTTOFP_I128_F80; + if (RetVT == MVT::f128) + return SINTTOFP_I128_F128; + if (RetVT == MVT::ppcf128) + return SINTTOFP_I128_PPCF128; + } + return UNKNOWN_LIBCALL; +} + +/// getUINTTOFP - Return the UINTTOFP_*_* value for the given types, or +/// UNKNOWN_LIBCALL if there is none. +RTLIB::Libcall RTLIB::getUINTTOFP(EVT OpVT, EVT RetVT) { + if (OpVT == MVT::i32) { + if (RetVT == MVT::f32) + return UINTTOFP_I32_F32; + if (RetVT == MVT::f64) + return UINTTOFP_I32_F64; + if (RetVT == MVT::f80) + return UINTTOFP_I32_F80; + if (RetVT == MVT::f128) + return UINTTOFP_I32_F128; + if (RetVT == MVT::ppcf128) + return UINTTOFP_I32_PPCF128; + } else if (OpVT == MVT::i64) { + if (RetVT == MVT::f32) + return UINTTOFP_I64_F32; + if (RetVT == MVT::f64) + return UINTTOFP_I64_F64; + if (RetVT == MVT::f80) + return UINTTOFP_I64_F80; + if (RetVT == MVT::f128) + return UINTTOFP_I64_F128; + if (RetVT == MVT::ppcf128) + return UINTTOFP_I64_PPCF128; + } else if (OpVT == MVT::i128) { + if (RetVT == MVT::f32) + return UINTTOFP_I128_F32; + if (RetVT == MVT::f64) + return UINTTOFP_I128_F64; + if (RetVT == MVT::f80) + return UINTTOFP_I128_F80; + if (RetVT == MVT::f128) + return UINTTOFP_I128_F128; + if (RetVT == MVT::ppcf128) + return UINTTOFP_I128_PPCF128; + } + return UNKNOWN_LIBCALL; +} + +/// InitCmpLibcallCCs - Set default comparison libcall CC. +/// +static void InitCmpLibcallCCs(ISD::CondCode *CCs) { + memset(CCs, ISD::SETCC_INVALID, sizeof(ISD::CondCode)*RTLIB::UNKNOWN_LIBCALL); + CCs[RTLIB::OEQ_F32] = ISD::SETEQ; + CCs[RTLIB::OEQ_F64] = ISD::SETEQ; + CCs[RTLIB::OEQ_F128] = ISD::SETEQ; + CCs[RTLIB::UNE_F32] = ISD::SETNE; + CCs[RTLIB::UNE_F64] = ISD::SETNE; + CCs[RTLIB::UNE_F128] = ISD::SETNE; + CCs[RTLIB::OGE_F32] = ISD::SETGE; + CCs[RTLIB::OGE_F64] = ISD::SETGE; + CCs[RTLIB::OGE_F128] = ISD::SETGE; + CCs[RTLIB::OLT_F32] = ISD::SETLT; + CCs[RTLIB::OLT_F64] = ISD::SETLT; + CCs[RTLIB::OLT_F128] = ISD::SETLT; + CCs[RTLIB::OLE_F32] = ISD::SETLE; + CCs[RTLIB::OLE_F64] = ISD::SETLE; + CCs[RTLIB::OLE_F128] = ISD::SETLE; + CCs[RTLIB::OGT_F32] = ISD::SETGT; + CCs[RTLIB::OGT_F64] = ISD::SETGT; + CCs[RTLIB::OGT_F128] = ISD::SETGT; + CCs[RTLIB::UO_F32] = ISD::SETNE; + CCs[RTLIB::UO_F64] = ISD::SETNE; + CCs[RTLIB::UO_F128] = ISD::SETNE; + CCs[RTLIB::O_F32] = ISD::SETEQ; + CCs[RTLIB::O_F64] = ISD::SETEQ; + CCs[RTLIB::O_F128] = ISD::SETEQ; +} + +/// NOTE: The constructor takes ownership of TLOF. +TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm, + const TargetLoweringObjectFile *tlof) + : TM(tm), TD(TM.getDataLayout()), TLOF(*tlof) { + // All operations default to being supported. 
+ memset(OpActions, 0, sizeof(OpActions)); + memset(LoadExtActions, 0, sizeof(LoadExtActions)); + memset(TruncStoreActions, 0, sizeof(TruncStoreActions)); + memset(IndexedModeActions, 0, sizeof(IndexedModeActions)); + memset(CondCodeActions, 0, sizeof(CondCodeActions)); + + // Set default actions for various operations. + for (unsigned VT = 0; VT != (unsigned)MVT::LAST_VALUETYPE; ++VT) { + // Default all indexed load / store to expand. + for (unsigned IM = (unsigned)ISD::PRE_INC; + IM != (unsigned)ISD::LAST_INDEXED_MODE; ++IM) { + setIndexedLoadAction(IM, (MVT::SimpleValueType)VT, Expand); + setIndexedStoreAction(IM, (MVT::SimpleValueType)VT, Expand); + } + + // These operations default to expand. + setOperationAction(ISD::FGETSIGN, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::CONCAT_VECTORS, (MVT::SimpleValueType)VT, Expand); + } + + // Most targets ignore the @llvm.prefetch intrinsic. + setOperationAction(ISD::PREFETCH, MVT::Other, Expand); + + // ConstantFP nodes default to expand. Targets can either change this to + // Legal, in which case all fp constants are legal, or use isFPImmLegal() + // to optimize expansions for certain constants. + setOperationAction(ISD::ConstantFP, MVT::f16, Expand); + setOperationAction(ISD::ConstantFP, MVT::f32, Expand); + setOperationAction(ISD::ConstantFP, MVT::f64, Expand); + setOperationAction(ISD::ConstantFP, MVT::f80, Expand); + setOperationAction(ISD::ConstantFP, MVT::f128, Expand); + + // These library functions default to expand. + setOperationAction(ISD::FLOG , MVT::f16, Expand); + setOperationAction(ISD::FLOG2, MVT::f16, Expand); + setOperationAction(ISD::FLOG10, MVT::f16, Expand); + setOperationAction(ISD::FEXP , MVT::f16, Expand); + setOperationAction(ISD::FEXP2, MVT::f16, Expand); + setOperationAction(ISD::FFLOOR, MVT::f16, Expand); + setOperationAction(ISD::FNEARBYINT, MVT::f16, Expand); + setOperationAction(ISD::FCEIL, MVT::f16, Expand); + setOperationAction(ISD::FRINT, MVT::f16, Expand); + setOperationAction(ISD::FTRUNC, MVT::f16, Expand); + setOperationAction(ISD::FLOG , MVT::f32, Expand); + setOperationAction(ISD::FLOG2, MVT::f32, Expand); + setOperationAction(ISD::FLOG10, MVT::f32, Expand); + setOperationAction(ISD::FEXP , MVT::f32, Expand); + setOperationAction(ISD::FEXP2, MVT::f32, Expand); + setOperationAction(ISD::FFLOOR, MVT::f32, Expand); + setOperationAction(ISD::FNEARBYINT, MVT::f32, Expand); + setOperationAction(ISD::FCEIL, MVT::f32, Expand); + setOperationAction(ISD::FRINT, MVT::f32, Expand); + setOperationAction(ISD::FTRUNC, MVT::f32, Expand); + setOperationAction(ISD::FLOG , MVT::f64, Expand); + setOperationAction(ISD::FLOG2, MVT::f64, Expand); + setOperationAction(ISD::FLOG10, MVT::f64, Expand); + setOperationAction(ISD::FEXP , MVT::f64, Expand); + setOperationAction(ISD::FEXP2, MVT::f64, Expand); + setOperationAction(ISD::FFLOOR, MVT::f64, Expand); + setOperationAction(ISD::FNEARBYINT, MVT::f64, Expand); + setOperationAction(ISD::FCEIL, MVT::f64, Expand); + setOperationAction(ISD::FRINT, MVT::f64, Expand); + setOperationAction(ISD::FTRUNC, MVT::f64, Expand); + setOperationAction(ISD::FLOG , MVT::f128, Expand); + setOperationAction(ISD::FLOG2, MVT::f128, Expand); + setOperationAction(ISD::FLOG10, MVT::f128, Expand); + setOperationAction(ISD::FEXP , MVT::f128, Expand); + setOperationAction(ISD::FEXP2, MVT::f128, Expand); + setOperationAction(ISD::FFLOOR, MVT::f128, Expand); + setOperationAction(ISD::FNEARBYINT, MVT::f128, Expand); + setOperationAction(ISD::FCEIL, MVT::f128, Expand); + 
setOperationAction(ISD::FRINT, MVT::f128, Expand); + setOperationAction(ISD::FTRUNC, MVT::f128, Expand); + + // Default ISD::TRAP to expand (which turns it into abort). + setOperationAction(ISD::TRAP, MVT::Other, Expand); + + // On most systems, DEBUGTRAP and TRAP have no difference. The "Expand" + // here is to inform DAG Legalizer to replace DEBUGTRAP with TRAP. + // + setOperationAction(ISD::DEBUGTRAP, MVT::Other, Expand); + + IsLittleEndian = TD->isLittleEndian(); + PointerTy = MVT::getIntegerVT(8*TD->getPointerSize(0)); + memset(RegClassForVT, 0,MVT::LAST_VALUETYPE*sizeof(TargetRegisterClass*)); + memset(TargetDAGCombineArray, 0, array_lengthof(TargetDAGCombineArray)); + MaxStoresPerMemset = MaxStoresPerMemcpy = MaxStoresPerMemmove = 8; + MaxStoresPerMemsetOptSize = MaxStoresPerMemcpyOptSize + = MaxStoresPerMemmoveOptSize = 4; + BenefitFromCodePlacementOpt = false; + UseUnderscoreSetJmp = false; + UseUnderscoreLongJmp = false; + SelectIsExpensive = false; + IntDivIsCheap = false; + Pow2DivIsCheap = false; + JumpIsExpensive = false; + PredictableSelectIsExpensive = false; + StackPointerRegisterToSaveRestore = 0; + ExceptionPointerRegister = 0; + ExceptionSelectorRegister = 0; + BooleanContents = UndefinedBooleanContent; + BooleanVectorContents = UndefinedBooleanContent; + SchedPreferenceInfo = Sched::ILP; + JumpBufSize = 0; + JumpBufAlignment = 0; + MinFunctionAlignment = 0; + PrefFunctionAlignment = 0; + PrefLoopAlignment = 0; + MinStackArgumentAlignment = 1; + ShouldFoldAtomicFences = false; + InsertFencesForAtomic = false; + SupportJumpTables = true; + MinimumJumpTableEntries = 4; + + InitLibcallNames(LibcallRoutineNames, TM); + InitCmpLibcallCCs(CmpLibcallCCs); + InitLibcallCallingConvs(LibcallCallingConvs); +} + +TargetLoweringBase::~TargetLoweringBase() { + delete &TLOF; +} + +MVT TargetLoweringBase::getShiftAmountTy(EVT LHSTy) const { + return MVT::getIntegerVT(8*TD->getPointerSize(0)); +} + +/// canOpTrap - Returns true if the operation can trap for the value type. +/// VT must be a legal type. +bool TargetLoweringBase::canOpTrap(unsigned Op, EVT VT) const { + assert(isTypeLegal(VT)); + switch (Op) { + default: + return false; + case ISD::FDIV: + case ISD::FREM: + case ISD::SDIV: + case ISD::UDIV: + case ISD::SREM: + case ISD::UREM: + return true; + } +} + + +static unsigned getVectorTypeBreakdownMVT(MVT VT, MVT &IntermediateVT, + unsigned &NumIntermediates, + MVT &RegisterVT, + TargetLoweringBase *TLI) { + // Figure out the right, legal destination reg to copy into. + unsigned NumElts = VT.getVectorNumElements(); + MVT EltTy = VT.getVectorElementType(); + + unsigned NumVectorRegs = 1; + + // FIXME: We don't support non-power-of-2-sized vectors for now. Ideally we + // could break down into LHS/RHS like LegalizeDAG does. + if (!isPowerOf2_32(NumElts)) { + NumVectorRegs = NumElts; + NumElts = 1; + } + + // Divide the input until we get to a supported size. This will always + // end with a scalar if the target doesn't support vectors. + while (NumElts > 1 && !TLI->isTypeLegal(MVT::getVectorVT(EltTy, NumElts))) { + NumElts >>= 1; + NumVectorRegs <<= 1; + } + + NumIntermediates = NumVectorRegs; + + MVT NewVT = MVT::getVectorVT(EltTy, NumElts); + if (!TLI->isTypeLegal(NewVT)) + NewVT = EltTy; + IntermediateVT = NewVT; + + unsigned NewVTSize = NewVT.getSizeInBits(); + + // Convert sizes such as i33 to i64. 
+ if (!isPowerOf2_32(NewVTSize)) + NewVTSize = NextPowerOf2(NewVTSize); + + MVT DestVT = TLI->getRegisterType(NewVT); + RegisterVT = DestVT; + if (EVT(DestVT).bitsLT(NewVT)) // Value is expanded, e.g. i64 -> i16. + return NumVectorRegs*(NewVTSize/DestVT.getSizeInBits()); + + // Otherwise, promotion or legal types use the same number of registers as + // the vector decimated to the appropriate level. + return NumVectorRegs; +} + +/// isLegalRC - Return true if the value types that can be represented by the +/// specified register class are all legal. +bool TargetLoweringBase::isLegalRC(const TargetRegisterClass *RC) const { + for (TargetRegisterClass::vt_iterator I = RC->vt_begin(), E = RC->vt_end(); + I != E; ++I) { + if (isTypeLegal(*I)) + return true; + } + return false; +} + +/// findRepresentativeClass - Return the largest legal super-reg register class +/// of the register class for the specified type and its associated "cost". +std::pair<const TargetRegisterClass*, uint8_t> +TargetLoweringBase::findRepresentativeClass(MVT VT) const { + const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); + const TargetRegisterClass *RC = RegClassForVT[VT.SimpleTy]; + if (!RC) + return std::make_pair(RC, 0); + + // Compute the set of all super-register classes. + BitVector SuperRegRC(TRI->getNumRegClasses()); + for (SuperRegClassIterator RCI(RC, TRI); RCI.isValid(); ++RCI) + SuperRegRC.setBitsInMask(RCI.getMask()); + + // Find the first legal register class with the largest spill size. + const TargetRegisterClass *BestRC = RC; + for (int i = SuperRegRC.find_first(); i >= 0; i = SuperRegRC.find_next(i)) { + const TargetRegisterClass *SuperRC = TRI->getRegClass(i); + // We want the largest possible spill size. + if (SuperRC->getSize() <= BestRC->getSize()) + continue; + if (!isLegalRC(SuperRC)) + continue; + BestRC = SuperRC; + } + return std::make_pair(BestRC, 1); +} + +/// computeRegisterProperties - Once all of the register classes are added, +/// this allows us to compute derived properties we expose. +void TargetLoweringBase::computeRegisterProperties() { + assert(MVT::LAST_VALUETYPE <= MVT::MAX_ALLOWED_VALUETYPE && + "Too many value types for ValueTypeActions to hold!"); + + // Everything defaults to needing one register. + for (unsigned i = 0; i != MVT::LAST_VALUETYPE; ++i) { + NumRegistersForVT[i] = 1; + RegisterTypeForVT[i] = TransformToType[i] = (MVT::SimpleValueType)i; + } + // ...except isVoid, which doesn't need any registers. + NumRegistersForVT[MVT::isVoid] = 0; + + // Find the largest integer register class. + unsigned LargestIntReg = MVT::LAST_INTEGER_VALUETYPE; + for (; RegClassForVT[LargestIntReg] == 0; --LargestIntReg) + assert(LargestIntReg != MVT::i1 && "No integer registers defined!"); + + // Every integer value type larger than this largest register takes twice as + // many registers to represent as the previous ValueType. + for (unsigned ExpandedReg = LargestIntReg + 1; + ExpandedReg <= MVT::LAST_INTEGER_VALUETYPE; ++ExpandedReg) { + NumRegistersForVT[ExpandedReg] = 2*NumRegistersForVT[ExpandedReg-1]; + RegisterTypeForVT[ExpandedReg] = (MVT::SimpleValueType)LargestIntReg; + TransformToType[ExpandedReg] = (MVT::SimpleValueType)(ExpandedReg - 1); + ValueTypeActions.setTypeAction((MVT::SimpleValueType)ExpandedReg, + TypeExpandInteger); + } + + // Inspect all of the ValueType's smaller than the largest integer + // register to see which ones need promotion. 
+ unsigned LegalIntReg = LargestIntReg; + for (unsigned IntReg = LargestIntReg - 1; + IntReg >= (unsigned)MVT::i1; --IntReg) { + MVT IVT = (MVT::SimpleValueType)IntReg; + if (isTypeLegal(IVT)) { + LegalIntReg = IntReg; + } else { + RegisterTypeForVT[IntReg] = TransformToType[IntReg] = + (const MVT::SimpleValueType)LegalIntReg; + ValueTypeActions.setTypeAction(IVT, TypePromoteInteger); + } + } + + // ppcf128 type is really two f64's. + if (!isTypeLegal(MVT::ppcf128)) { + NumRegistersForVT[MVT::ppcf128] = 2*NumRegistersForVT[MVT::f64]; + RegisterTypeForVT[MVT::ppcf128] = MVT::f64; + TransformToType[MVT::ppcf128] = MVT::f64; + ValueTypeActions.setTypeAction(MVT::ppcf128, TypeExpandFloat); + } + + // Decide how to handle f64. If the target does not have native f64 support, + // expand it to i64 and we will be generating soft float library calls. + if (!isTypeLegal(MVT::f64)) { + NumRegistersForVT[MVT::f64] = NumRegistersForVT[MVT::i64]; + RegisterTypeForVT[MVT::f64] = RegisterTypeForVT[MVT::i64]; + TransformToType[MVT::f64] = MVT::i64; + ValueTypeActions.setTypeAction(MVT::f64, TypeSoftenFloat); + } + + // Decide how to handle f32. If the target does not have native support for + // f32, promote it to f64 if it is legal. Otherwise, expand it to i32. + if (!isTypeLegal(MVT::f32)) { + if (isTypeLegal(MVT::f64)) { + NumRegistersForVT[MVT::f32] = NumRegistersForVT[MVT::f64]; + RegisterTypeForVT[MVT::f32] = RegisterTypeForVT[MVT::f64]; + TransformToType[MVT::f32] = MVT::f64; + ValueTypeActions.setTypeAction(MVT::f32, TypePromoteInteger); + } else { + NumRegistersForVT[MVT::f32] = NumRegistersForVT[MVT::i32]; + RegisterTypeForVT[MVT::f32] = RegisterTypeForVT[MVT::i32]; + TransformToType[MVT::f32] = MVT::i32; + ValueTypeActions.setTypeAction(MVT::f32, TypeSoftenFloat); + } + } + + // Loop over all of the vector value types to see which need transformations. + for (unsigned i = MVT::FIRST_VECTOR_VALUETYPE; + i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++i) { + MVT VT = (MVT::SimpleValueType)i; + if (isTypeLegal(VT)) continue; + + // Determine if there is a legal wider type. If so, we should promote to + // that wider vector type. + MVT EltVT = VT.getVectorElementType(); + unsigned NElts = VT.getVectorNumElements(); + if (NElts != 1 && !shouldSplitVectorElementType(EltVT)) { + bool IsLegalWiderType = false; + // First try to promote the elements of integer vectors. If no legal + // promotion was found, fallback to the widen-vector method. + for (unsigned nVT = i+1; nVT <= MVT::LAST_VECTOR_VALUETYPE; ++nVT) { + MVT SVT = (MVT::SimpleValueType)nVT; + // Promote vectors of integers to vectors with the same number + // of elements, with a wider element type. + if (SVT.getVectorElementType().getSizeInBits() > EltVT.getSizeInBits() + && SVT.getVectorNumElements() == NElts && + isTypeLegal(SVT) && SVT.getScalarType().isInteger()) { + TransformToType[i] = SVT; + RegisterTypeForVT[i] = SVT; + NumRegistersForVT[i] = 1; + ValueTypeActions.setTypeAction(VT, TypePromoteInteger); + IsLegalWiderType = true; + break; + } + } + + if (IsLegalWiderType) continue; + + // Try to widen the vector. 
+ for (unsigned nVT = i+1; nVT <= MVT::LAST_VECTOR_VALUETYPE; ++nVT) { + MVT SVT = (MVT::SimpleValueType)nVT; + if (SVT.getVectorElementType() == EltVT && + SVT.getVectorNumElements() > NElts && + isTypeLegal(SVT)) { + TransformToType[i] = SVT; + RegisterTypeForVT[i] = SVT; + NumRegistersForVT[i] = 1; + ValueTypeActions.setTypeAction(VT, TypeWidenVector); + IsLegalWiderType = true; + break; + } + } + if (IsLegalWiderType) continue; + } + + MVT IntermediateVT; + MVT RegisterVT; + unsigned NumIntermediates; + NumRegistersForVT[i] = + getVectorTypeBreakdownMVT(VT, IntermediateVT, NumIntermediates, + RegisterVT, this); + RegisterTypeForVT[i] = RegisterVT; + + MVT NVT = VT.getPow2VectorType(); + if (NVT == VT) { + // Type is already a power of 2. The default action is to split. + TransformToType[i] = MVT::Other; + unsigned NumElts = VT.getVectorNumElements(); + ValueTypeActions.setTypeAction(VT, + NumElts > 1 ? TypeSplitVector : TypeScalarizeVector); + } else { + TransformToType[i] = NVT; + ValueTypeActions.setTypeAction(VT, TypeWidenVector); + } + } + + // Determine the 'representative' register class for each value type. + // An representative register class is the largest (meaning one which is + // not a sub-register class / subreg register class) legal register class for + // a group of value types. For example, on i386, i8, i16, and i32 + // representative would be GR32; while on x86_64 it's GR64. + for (unsigned i = 0; i != MVT::LAST_VALUETYPE; ++i) { + const TargetRegisterClass* RRC; + uint8_t Cost; + tie(RRC, Cost) = findRepresentativeClass((MVT::SimpleValueType)i); + RepRegClassForVT[i] = RRC; + RepRegClassCostForVT[i] = Cost; + } +} + +EVT TargetLoweringBase::getSetCCResultType(EVT VT) const { + assert(!VT.isVector() && "No default SetCC type for vectors!"); + return getPointerTy(0).SimpleTy; +} + +MVT::SimpleValueType TargetLoweringBase::getCmpLibcallReturnType() const { + return MVT::i32; // return the default value +} + +/// getVectorTypeBreakdown - Vector types are broken down into some number of +/// legal first class types. For example, MVT::v8f32 maps to 2 MVT::v4f32 +/// with Altivec or SSE1, or 8 promoted MVT::f64 values with the X86 FP stack. +/// Similarly, MVT::v2i64 turns into 4 MVT::i32 values with both PPC and X86. +/// +/// This method returns the number of registers needed, and the VT for each +/// register. It also returns the VT and quantity of the intermediate values +/// before they are promoted/expanded. +/// +unsigned TargetLoweringBase::getVectorTypeBreakdown(LLVMContext &Context, EVT VT, + EVT &IntermediateVT, + unsigned &NumIntermediates, + MVT &RegisterVT) const { + unsigned NumElts = VT.getVectorNumElements(); + + // If there is a wider vector type with the same element type as this one, + // or a promoted vector type that has the same number of elements which + // are wider, then we should convert to that legal vector type. + // This handles things like <2 x float> -> <4 x float> and + // <4 x i1> -> <4 x i32>. + LegalizeTypeAction TA = getTypeAction(Context, VT); + if (NumElts != 1 && (TA == TypeWidenVector || TA == TypePromoteInteger)) { + EVT RegisterEVT = getTypeToTransformTo(Context, VT); + if (isTypeLegal(RegisterEVT)) { + IntermediateVT = RegisterEVT; + RegisterVT = RegisterEVT.getSimpleVT(); + NumIntermediates = 1; + return 1; + } + } + + // Figure out the right, legal destination reg to copy into. + EVT EltTy = VT.getVectorElementType(); + + unsigned NumVectorRegs = 1; + + // FIXME: We don't support non-power-of-2-sized vectors for now. 
Ideally we + // could break down into LHS/RHS like LegalizeDAG does. + if (!isPowerOf2_32(NumElts)) { + NumVectorRegs = NumElts; + NumElts = 1; + } + + // Divide the input until we get to a supported size. This will always + // end with a scalar if the target doesn't support vectors. + while (NumElts > 1 && !isTypeLegal( + EVT::getVectorVT(Context, EltTy, NumElts))) { + NumElts >>= 1; + NumVectorRegs <<= 1; + } + + NumIntermediates = NumVectorRegs; + + EVT NewVT = EVT::getVectorVT(Context, EltTy, NumElts); + if (!isTypeLegal(NewVT)) + NewVT = EltTy; + IntermediateVT = NewVT; + + MVT DestVT = getRegisterType(Context, NewVT); + RegisterVT = DestVT; + unsigned NewVTSize = NewVT.getSizeInBits(); + + // Convert sizes such as i33 to i64. + if (!isPowerOf2_32(NewVTSize)) + NewVTSize = NextPowerOf2(NewVTSize); + + if (EVT(DestVT).bitsLT(NewVT)) // Value is expanded, e.g. i64 -> i16. + return NumVectorRegs*(NewVTSize/DestVT.getSizeInBits()); + + // Otherwise, promotion or legal types use the same number of registers as + // the vector decimated to the appropriate level. + return NumVectorRegs; +} + +/// Get the EVTs and ArgFlags collections that represent the legalized return +/// type of the given function. This does not require a DAG or a return value, +/// and is suitable for use before any DAGs for the function are constructed. +/// TODO: Move this out of TargetLowering.cpp. +void llvm::GetReturnInfo(Type* ReturnType, AttributeSet attr, + SmallVectorImpl<ISD::OutputArg> &Outs, + const TargetLowering &TLI) { + SmallVector<EVT, 4> ValueVTs; + ComputeValueVTs(TLI, ReturnType, ValueVTs); + unsigned NumValues = ValueVTs.size(); + if (NumValues == 0) return; + + for (unsigned j = 0, f = NumValues; j != f; ++j) { + EVT VT = ValueVTs[j]; + ISD::NodeType ExtendKind = ISD::ANY_EXTEND; + + if (attr.hasAttribute(AttributeSet::ReturnIndex, Attribute::SExt)) + ExtendKind = ISD::SIGN_EXTEND; + else if (attr.hasAttribute(AttributeSet::ReturnIndex, Attribute::ZExt)) + ExtendKind = ISD::ZERO_EXTEND; + + // FIXME: C calling convention requires the return type to be promoted to + // at least 32-bit. But this is not necessary for non-C calling + // conventions. The frontend should mark functions whose return values + // require promoting with signext or zeroext attributes. + if (ExtendKind != ISD::ANY_EXTEND && VT.isInteger()) { + MVT MinVT = TLI.getRegisterType(ReturnType->getContext(), MVT::i32); + if (VT.bitsLT(MinVT)) + VT = MinVT; + } + + unsigned NumParts = TLI.getNumRegisters(ReturnType->getContext(), VT); + MVT PartVT = TLI.getRegisterType(ReturnType->getContext(), VT); + + // 'inreg' on function refers to return value + ISD::ArgFlagsTy Flags = ISD::ArgFlagsTy(); + if (attr.hasAttribute(AttributeSet::ReturnIndex, Attribute::InReg)) + Flags.setInReg(); + + // Propagate extension type if any + if (attr.hasAttribute(AttributeSet::ReturnIndex, Attribute::SExt)) + Flags.setSExt(); + else if (attr.hasAttribute(AttributeSet::ReturnIndex, Attribute::ZExt)) + Flags.setZExt(); + + for (unsigned i = 0; i < NumParts; ++i) + Outs.push_back(ISD::OutputArg(Flags, PartVT, /*isFixed=*/true, 0, 0)); + } +} + +/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate +/// function arguments in the caller parameter area. This is the actual +/// alignment, not its logarithm. 
+unsigned TargetLoweringBase::getByValTypeAlignment(Type *Ty) const {
+  return TD->getCallFrameTypeAlignment(Ty);
+}
+
+//===----------------------------------------------------------------------===//
+// TargetTransformInfo Helpers
+//===----------------------------------------------------------------------===//
+
+int TargetLoweringBase::InstructionOpcodeToISD(unsigned Opcode) const {
+  enum InstructionOpcodes {
+#define HANDLE_INST(NUM, OPCODE, CLASS) OPCODE = NUM,
+#define LAST_OTHER_INST(NUM) InstructionOpcodesCount = NUM
+#include "llvm/IR/Instruction.def"
+  };
+  switch (static_cast<InstructionOpcodes>(Opcode)) {
+  case Ret: return 0;
+  case Br: return 0;
+  case Switch: return 0;
+  case IndirectBr: return 0;
+  case Invoke: return 0;
+  case Resume: return 0;
+  case Unreachable: return 0;
+  case Add: return ISD::ADD;
+  case FAdd: return ISD::FADD;
+  case Sub: return ISD::SUB;
+  case FSub: return ISD::FSUB;
+  case Mul: return ISD::MUL;
+  case FMul: return ISD::FMUL;
+  case UDiv: return ISD::UDIV;
+  case SDiv: return ISD::SDIV;
+  case FDiv: return ISD::FDIV;
+  case URem: return ISD::UREM;
+  case SRem: return ISD::SREM;
+  case FRem: return ISD::FREM;
+  case Shl: return ISD::SHL;
+  case LShr: return ISD::SRL;
+  case AShr: return ISD::SRA;
+  case And: return ISD::AND;
+  case Or: return ISD::OR;
+  case Xor: return ISD::XOR;
+  case Alloca: return 0;
+  case Load: return ISD::LOAD;
+  case Store: return ISD::STORE;
+  case GetElementPtr: return 0;
+  case Fence: return 0;
+  case AtomicCmpXchg: return 0;
+  case AtomicRMW: return 0;
+  case Trunc: return ISD::TRUNCATE;
+  case ZExt: return ISD::ZERO_EXTEND;
+  case SExt: return ISD::SIGN_EXTEND;
+  case FPToUI: return ISD::FP_TO_UINT;
+  case FPToSI: return ISD::FP_TO_SINT;
+  case UIToFP: return ISD::UINT_TO_FP;
+  case SIToFP: return ISD::SINT_TO_FP;
+  case FPTrunc: return ISD::FP_ROUND;
+  case FPExt: return ISD::FP_EXTEND;
+  case PtrToInt: return ISD::BITCAST;
+  case IntToPtr: return ISD::BITCAST;
+  case BitCast: return ISD::BITCAST;
+  case ICmp: return ISD::SETCC;
+  case FCmp: return ISD::SETCC;
+  case PHI: return 0;
+  case Call: return 0;
+  case Select: return ISD::SELECT;
+  case UserOp1: return 0;
+  case UserOp2: return 0;
+  case VAArg: return 0;
+  case ExtractElement: return ISD::EXTRACT_VECTOR_ELT;
+  case InsertElement: return ISD::INSERT_VECTOR_ELT;
+  case ShuffleVector: return ISD::VECTOR_SHUFFLE;
+  case ExtractValue: return ISD::MERGE_VALUES;
+  case InsertValue: return ISD::MERGE_VALUES;
+  case LandingPad: return 0;
+  }
+
+  llvm_unreachable("Unknown instruction type encountered!");
+}
+
+std::pair<unsigned, MVT>
+TargetLoweringBase::getTypeLegalizationCost(Type *Ty) const {
+  LLVMContext &C = Ty->getContext();
+  EVT MTy = getValueType(Ty);
+
+  unsigned Cost = 1;
+  // We keep legalizing the type until we find a legal kind. We assume that
+  // the only operation that costs anything is the split. After splitting
+  // we need to handle two types.
+  while (true) {
+    LegalizeKind LK = getTypeConversion(C, MTy);
+
+    if (LK.first == TypeLegal)
+      return std::make_pair(Cost, MTy.getSimpleVT());
+
+    if (LK.first == TypeSplitVector || LK.first == TypeExpandInteger)
+      Cost *= 2;
+
+    // Keep legalizing the type.
+ MTy = LK.second; + } +} + +//===----------------------------------------------------------------------===// +// Loop Strength Reduction hooks +//===----------------------------------------------------------------------===// + +/// isLegalAddressingMode - Return true if the addressing mode represented +/// by AM is legal for this target, for a load/store of the specified type. +bool TargetLoweringBase::isLegalAddressingMode(const AddrMode &AM, + Type *Ty) const { + // The default implementation of this implements a conservative RISCy, r+r and + // r+i addr mode. + + // Allows a sign-extended 16-bit immediate field. + if (AM.BaseOffs <= -(1LL << 16) || AM.BaseOffs >= (1LL << 16)-1) + return false; + + // No global is ever allowed as a base. + if (AM.BaseGV) + return false; + + // Only support r+r, + switch (AM.Scale) { + case 0: // "r+i" or just "i", depending on HasBaseReg. + break; + case 1: + if (AM.HasBaseReg && AM.BaseOffs) // "r+r+i" is not allowed. + return false; + // Otherwise we have r+r or r+i. + break; + case 2: + if (AM.HasBaseReg || AM.BaseOffs) // 2*r+r or 2*r+i is not allowed. + return false; + // Allow 2*r as r+r. + break; + } + + return true; +} diff --git a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp index 76c2546..1170bf2 100644 --- a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp +++ b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp @@ -406,14 +406,14 @@ TargetLoweringObjectFileELF::InitializeELF(bool UseInitArray_) { // MachO //===----------------------------------------------------------------------===// -/// emitModuleFlags - Emit the module flags that specify the garbage collection -/// information. +/// emitModuleFlags - Perform code emission for module flags. void TargetLoweringObjectFileMachO:: emitModuleFlags(MCStreamer &Streamer, ArrayRef<Module::ModuleFlagEntry> ModuleFlags, Mangler *Mang, const TargetMachine &TM) const { unsigned VersionVal = 0; unsigned ImageInfoFlags = 0; + MDNode *LinkerOptions = 0; StringRef SectionVal; for (ArrayRef<Module::ModuleFlagEntry>::iterator @@ -427,14 +427,33 @@ emitModuleFlags(MCStreamer &Streamer, StringRef Key = MFE.Key->getString(); Value *Val = MFE.Val; - if (Key == "Objective-C Image Info Version") + if (Key == "Objective-C Image Info Version") { VersionVal = cast<ConstantInt>(Val)->getZExtValue(); - else if (Key == "Objective-C Garbage Collection" || - Key == "Objective-C GC Only" || - Key == "Objective-C Is Simulated") + } else if (Key == "Objective-C Garbage Collection" || + Key == "Objective-C GC Only" || + Key == "Objective-C Is Simulated") { ImageInfoFlags |= cast<ConstantInt>(Val)->getZExtValue(); - else if (Key == "Objective-C Image Info Section") + } else if (Key == "Objective-C Image Info Section") { SectionVal = cast<MDString>(Val)->getString(); + } else if (Key == "Linker Options") { + LinkerOptions = cast<MDNode>(Val); + } + } + + // Emit the linker options if present. + if (LinkerOptions) { + for (unsigned i = 0, e = LinkerOptions->getNumOperands(); i != e; ++i) { + MDNode *MDOptions = cast<MDNode>(LinkerOptions->getOperand(i)); + SmallVector<std::string, 4> StrOptions; + + // Convert to strings. + for (unsigned ii = 0, ie = MDOptions->getNumOperands(); ii != ie; ++ii) { + MDString *MDOption = cast<MDString>(MDOptions->getOperand(ii)); + StrOptions.push_back(MDOption->getString()); + } + + Streamer.EmitLinkerOptions(StrOptions); + } } // The section is mandatory. If we don't have it, then we don't have GC info. 
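The MachO change above teaches emitModuleFlags to forward a "Linker Options" module flag to MCStreamer::EmitLinkerOptions. For reference, a hedged sketch of the producer side: the two-level metadata shape the new loop expects, where each operand of the flag is itself an MDNode whose MDString operands form one option group. The helper name and the AppendUnique flag behavior are assumptions based on the module-flag API of the same era, not part of this patch.

  #include "llvm/ADT/ArrayRef.h"
  #include "llvm/ADT/SmallVector.h"
  #include "llvm/IR/LLVMContext.h"
  #include "llvm/IR/Metadata.h"
  #include "llvm/IR/Module.h"
  using namespace llvm;

  // Append one option group, e.g. {"-framework", "Cocoa"} or {"-lz"}, to the
  // module-level "Linker Options" flag consumed by emitModuleFlags above.
  static void addLinkerOption(Module &M, ArrayRef<StringRef> Option) {
    LLVMContext &Ctx = M.getContext();
    SmallVector<Value *, 4> Args;
    for (unsigned i = 0, e = Option.size(); i != e; ++i)
      Args.push_back(MDString::get(Ctx, Option[i]));
    Value *Group[] = { MDNode::get(Ctx, Args) };
    M.addModuleFlag(Module::AppendUnique, "Linker Options",
                    MDNode::get(Ctx, Group));
  }

Each call contributes one inner MDNode; the emission loop above then converts every MDString in a group to a string and hands the group to the streamer.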
diff --git a/lib/CodeGen/TargetRegisterInfo.cpp b/lib/CodeGen/TargetRegisterInfo.cpp index 9b776d1..84b4bfc 100644 --- a/lib/CodeGen/TargetRegisterInfo.cpp +++ b/lib/CodeGen/TargetRegisterInfo.cpp @@ -17,7 +17,6 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/VirtRegMap.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetMachine.h" using namespace llvm; diff --git a/lib/CodeGen/TwoAddressInstructionPass.cpp b/lib/CodeGen/TwoAddressInstructionPass.cpp index 8e6f809..26c5fe4 100644 --- a/lib/CodeGen/TwoAddressInstructionPass.cpp +++ b/lib/CodeGen/TwoAddressInstructionPass.cpp @@ -67,7 +67,6 @@ class TwoAddressInstructionPass : public MachineFunctionPass { const InstrItineraryData *InstrItins; MachineRegisterInfo *MRI; LiveVariables *LV; - SlotIndexes *Indexes; LiveIntervals *LIS; AliasAnalysis *AA; CodeGenOpt::Level OptLevel; @@ -121,7 +120,7 @@ class TwoAddressInstructionPass : public MachineFunctionPass { bool tryInstructionTransform(MachineBasicBlock::iterator &mi, MachineBasicBlock::iterator &nmi, unsigned SrcIdx, unsigned DstIdx, - unsigned Dist); + unsigned Dist, bool shouldOnlyCommute); void scanUses(unsigned DstReg); @@ -164,6 +163,8 @@ INITIALIZE_PASS_END(TwoAddressInstructionPass, "twoaddressinstruction", char &llvm::TwoAddressInstructionPassID = TwoAddressInstructionPass::ID; +static bool isPlainlyKilled(MachineInstr *MI, unsigned Reg, LiveIntervals *LIS); + /// sink3AddrInstruction - A two-address instruction has been converted to a /// three-address instruction to avoid clobbering a register. Try to sink it /// past the instruction that would kill the above mentioned register to reduce @@ -205,14 +206,29 @@ sink3AddrInstruction(MachineInstr *MI, unsigned SavedReg, // Find the instruction that kills SavedReg. MachineInstr *KillMI = NULL; - for (MachineRegisterInfo::use_nodbg_iterator - UI = MRI->use_nodbg_begin(SavedReg), - UE = MRI->use_nodbg_end(); UI != UE; ++UI) { - MachineOperand &UseMO = UI.getOperand(); - if (!UseMO.isKill()) - continue; - KillMI = UseMO.getParent(); - break; + if (LIS) { + LiveInterval &LI = LIS->getInterval(SavedReg); + assert(LI.end() != LI.begin() && + "Reg should not have empty live interval."); + + SlotIndex MBBEndIdx = LIS->getMBBEndIdx(MBB).getPrevSlot(); + LiveInterval::const_iterator I = LI.find(MBBEndIdx); + if (I != LI.end() && I->start < MBBEndIdx) + return false; + + --I; + KillMI = LIS->getInstructionFromIndex(I->end); + } + if (!KillMI) { + for (MachineRegisterInfo::use_nodbg_iterator + UI = MRI->use_nodbg_begin(SavedReg), + UE = MRI->use_nodbg_end(); UI != UE; ++UI) { + MachineOperand &UseMO = UI.getOperand(); + if (!UseMO.isKill()) + continue; + KillMI = UseMO.getParent(); + break; + } } // If we find the instruction that kills SavedReg, and it is in an @@ -251,7 +267,7 @@ sink3AddrInstruction(MachineInstr *MI, unsigned SavedReg, if (DefReg == MOReg) return false; - if (MO.isKill()) { + if (MO.isKill() || (LIS && isPlainlyKilled(OtherMI, MOReg, LIS))) { if (OtherMI == KillMI && MOReg == SavedReg) // Save the operand that kills the register. We want to unset the kill // marker if we can sink MI past it. @@ -264,13 +280,15 @@ sink3AddrInstruction(MachineInstr *MI, unsigned SavedReg, } assert(KillMO && "Didn't find kill"); - // Update kill and LV information. - KillMO->setIsKill(false); - KillMO = MI->findRegisterUseOperand(SavedReg, false, TRI); - KillMO->setIsKill(true); + if (!LIS) { + // Update kill and LV information. 
+ KillMO->setIsKill(false); + KillMO = MI->findRegisterUseOperand(SavedReg, false, TRI); + KillMO->setIsKill(true); - if (LV) - LV->replaceKillInstruction(SavedReg, KillMI, MI); + if (LV) + LV->replaceKillInstruction(SavedReg, KillMI, MI); + } // Move instruction to its destination. MBB->remove(MI); @@ -331,6 +349,33 @@ static bool isCopyToReg(MachineInstr &MI, const TargetInstrInfo *TII, return true; } +/// isPLainlyKilled - Test if the given register value, which is used by the +// given instruction, is killed by the given instruction. +static bool isPlainlyKilled(MachineInstr *MI, unsigned Reg, + LiveIntervals *LIS) { + if (LIS && TargetRegisterInfo::isVirtualRegister(Reg) && + !LIS->isNotInMIMap(MI)) { + // FIXME: Sometimes tryInstructionTransform() will add instructions and + // test whether they can be folded before keeping them. In this case it + // sets a kill before recursively calling tryInstructionTransform() again. + // If there is no interval available, we assume that this instruction is + // one of those. A kill flag is manually inserted on the operand so the + // check below will handle it. + LiveInterval &LI = LIS->getInterval(Reg); + // This is to match the kill flag version where undefs don't have kill + // flags. + if (!LI.hasAtLeastOneValue()) + return false; + + SlotIndex useIdx = LIS->getInstructionIndex(MI); + LiveInterval::const_iterator I = LI.find(useIdx); + assert(I != LI.end() && "Reg must be live-in to use."); + return !I->end.isBlock() && SlotIndex::isSameInstr(I->end, useIdx); + } + + return MI->killsRegister(Reg); +} + /// isKilled - Test if the given register value, which is used by the given /// instruction, is killed by the given instruction. This looks through /// coalescable copies to see if the original value is potentially not killed. @@ -346,12 +391,20 @@ static bool isCopyToReg(MachineInstr &MI, const TargetInstrInfo *TII, /// normal heuristics commute the (two-address) add, which lets /// coalescing eliminate the extra copy. /// +/// If allowFalsePositives is true then likely kills are treated as kills even +/// if it can't be proven that they are kills. static bool isKilled(MachineInstr &MI, unsigned Reg, const MachineRegisterInfo *MRI, - const TargetInstrInfo *TII) { + const TargetInstrInfo *TII, + LiveIntervals *LIS, + bool allowFalsePositives) { MachineInstr *DefMI = &MI; for (;;) { - if (!DefMI->killsRegister(Reg)) + // All uses of physical registers are likely to be kills. + if (TargetRegisterInfo::isPhysicalRegister(Reg) && + (allowFalsePositives || MRI->hasOneUse(Reg))) + return true; + if (!isPlainlyKilled(DefMI, Reg, LIS)) return false; if (TargetRegisterInfo::isPhysicalRegister(Reg)) return true; @@ -472,7 +525,7 @@ isProfitableToCommute(unsigned regA, unsigned regB, unsigned regC, // insert => %reg1030<def> = MOV8rr %reg1029 // %reg1030<def> = ADD8rr %reg1029<kill>, %reg1028<kill>, %EFLAGS<imp-def,dead> - if (!MI->killsRegister(regC)) + if (!isPlainlyKilled(MI, regC, LIS)) return false; // Ok, we have something like: @@ -528,19 +581,9 @@ commuteInstruction(MachineBasicBlock::iterator &mi, } DEBUG(dbgs() << "2addr: COMMUTED TO: " << *NewMI); - // If the instruction changed to commute it, update livevar. - if (NewMI != MI) { - if (LV) - // Update live variables - LV->replaceKillInstruction(RegC, MI, NewMI); - if (Indexes) - Indexes->replaceMachineInstrInMaps(MI, NewMI); - - MBB->insert(mi, NewMI); // Insert the new inst - MBB->erase(mi); // Nuke the old inst. 
- mi = NewMI; - DistanceMap.insert(std::make_pair(NewMI, Dist)); - } + assert(NewMI == MI && + "TargetInstrInfo::commuteInstruction() should not return a new " + "instruction unless it was requested."); // Update source register map. unsigned FromRegC = getMappedReg(RegC, SrcRegMap); @@ -587,8 +630,8 @@ TwoAddressInstructionPass::convertInstTo3Addr(MachineBasicBlock::iterator &mi, DEBUG(dbgs() << "2addr: TO 3-ADDR: " << *NewMI); bool Sunk = false; - if (Indexes) - Indexes->replaceMachineInstrInMaps(mi, NewMI); + if (LIS) + LIS->ReplaceMachineInstrInMaps(mi, NewMI); if (NewMI->findRegisterUseOperand(RegB, false, TRI)) // FIXME: Temporary workaround. If the new instruction doesn't @@ -700,9 +743,9 @@ bool TwoAddressInstructionPass:: rescheduleMIBelowKill(MachineBasicBlock::iterator &mi, MachineBasicBlock::iterator &nmi, unsigned Reg) { - // Bail immediately if we don't have LV available. We use it to find kills - // efficiently. - if (!LV) + // Bail immediately if we don't have LV or LIS available. We use them to find + // kills efficiently. + if (!LV && !LIS) return false; MachineInstr *MI = &*mi; @@ -711,7 +754,22 @@ rescheduleMIBelowKill(MachineBasicBlock::iterator &mi, // Must be created from unfolded load. Don't waste time trying this. return false; - MachineInstr *KillMI = LV->getVarInfo(Reg).findKill(MBB); + MachineInstr *KillMI = 0; + if (LIS) { + LiveInterval &LI = LIS->getInterval(Reg); + assert(LI.end() != LI.begin() && + "Reg should not have empty live interval."); + + SlotIndex MBBEndIdx = LIS->getMBBEndIdx(MBB).getPrevSlot(); + LiveInterval::const_iterator I = LI.find(MBBEndIdx); + if (I != LI.end() && I->start < MBBEndIdx) + return false; + + --I; + KillMI = LIS->getInstructionFromIndex(I->end); + } else { + KillMI = LV->getVarInfo(Reg).findKill(MBB); + } if (!KillMI || MI == KillMI || KillMI->isCopy() || KillMI->isCopyLike()) // Don't mess with copies, they may be coalesced later. return false; @@ -747,24 +805,27 @@ rescheduleMIBelowKill(MachineBasicBlock::iterator &mi, Defs.insert(MOReg); else { Uses.insert(MOReg); - if (MO.isKill() && MOReg != Reg) + if (MOReg != Reg && (MO.isKill() || + (LIS && isPlainlyKilled(MI, MOReg, LIS)))) Kills.insert(MOReg); } } // Move the copies connected to MI down as well. - MachineBasicBlock::iterator From = MI; - MachineBasicBlock::iterator To = llvm::next(From); - while (To->isCopy() && Defs.count(To->getOperand(1).getReg())) { - Defs.insert(To->getOperand(0).getReg()); - ++To; + MachineBasicBlock::iterator Begin = MI; + MachineBasicBlock::iterator AfterMI = llvm::next(Begin); + + MachineBasicBlock::iterator End = AfterMI; + while (End->isCopy() && Defs.count(End->getOperand(1).getReg())) { + Defs.insert(End->getOperand(0).getReg()); + ++End; } // Check if the reschedule will not break depedencies. unsigned NumVisited = 0; MachineBasicBlock::iterator KillPos = KillMI; ++KillPos; - for (MachineBasicBlock::iterator I = To; I != KillPos; ++I) { + for (MachineBasicBlock::iterator I = End; I != KillPos; ++I) { MachineInstr *OtherMI = I; // DBG_VALUE cannot be counted against the limit. if (OtherMI->isDebugValue()) @@ -795,11 +856,13 @@ rescheduleMIBelowKill(MachineBasicBlock::iterator &mi, } else { if (Defs.count(MOReg)) return false; + bool isKill = MO.isKill() || + (LIS && isPlainlyKilled(OtherMI, MOReg, LIS)); if (MOReg != Reg && - ((MO.isKill() && Uses.count(MOReg)) || Kills.count(MOReg))) + ((isKill && Uses.count(MOReg)) || Kills.count(MOReg))) // Don't want to extend other live ranges and update kills. 
return false; - if (MOReg == Reg && !MO.isKill()) + if (MOReg == Reg && !isKill) // We can't schedule across a use of the register in question. return false; // Ensure that if this is register in question, its the kill we expect. @@ -810,19 +873,35 @@ rescheduleMIBelowKill(MachineBasicBlock::iterator &mi, } // Move debug info as well. - while (From != MBB->begin() && llvm::prior(From)->isDebugValue()) - --From; + while (Begin != MBB->begin() && llvm::prior(Begin)->isDebugValue()) + --Begin; + + nmi = End; + MachineBasicBlock::iterator InsertPos = KillPos; + if (LIS) { + // We have to move the copies first so that the MBB is still well-formed + // when calling handleMove(). + for (MachineBasicBlock::iterator MBBI = AfterMI; MBBI != End;) { + MachineInstr *CopyMI = MBBI; + ++MBBI; + MBB->splice(InsertPos, MBB, CopyMI); + LIS->handleMove(CopyMI); + InsertPos = CopyMI; + } + End = llvm::next(MachineBasicBlock::iterator(MI)); + } // Copies following MI may have been moved as well. - nmi = To; - MBB->splice(KillPos, MBB, From, To); + MBB->splice(InsertPos, MBB, Begin, End); DistanceMap.erase(DI); // Update live variables - LV->removeVirtualRegisterKilled(Reg, KillMI); - LV->addVirtualRegisterKilled(Reg, MI); - if (LIS) + if (LIS) { LIS->handleMove(MI); + } else { + LV->removeVirtualRegisterKilled(Reg, KillMI); + LV->addVirtualRegisterKilled(Reg, MI); + } DEBUG(dbgs() << "\trescheduled below kill: " << *KillMI); return true; @@ -858,9 +937,9 @@ bool TwoAddressInstructionPass:: rescheduleKillAboveMI(MachineBasicBlock::iterator &mi, MachineBasicBlock::iterator &nmi, unsigned Reg) { - // Bail immediately if we don't have LV available. We use it to find kills - // efficiently. - if (!LV) + // Bail immediately if we don't have LV or LIS available. We use them to find + // kills efficiently. + if (!LV && !LIS) return false; MachineInstr *MI = &*mi; @@ -869,7 +948,22 @@ rescheduleKillAboveMI(MachineBasicBlock::iterator &mi, // Must be created from unfolded load. Don't waste time trying this. return false; - MachineInstr *KillMI = LV->getVarInfo(Reg).findKill(MBB); + MachineInstr *KillMI = 0; + if (LIS) { + LiveInterval &LI = LIS->getInterval(Reg); + assert(LI.end() != LI.begin() && + "Reg should not have empty live interval."); + + SlotIndex MBBEndIdx = LIS->getMBBEndIdx(MBB).getPrevSlot(); + LiveInterval::const_iterator I = LI.find(MBBEndIdx); + if (I != LI.end() && I->start < MBBEndIdx) + return false; + + --I; + KillMI = LIS->getInstructionFromIndex(I->end); + } else { + KillMI = LV->getVarInfo(Reg).findKill(MBB); + } if (!KillMI || MI == KillMI || KillMI->isCopy() || KillMI->isCopyLike()) // Don't mess with copies, they may be coalesced later. return false; @@ -896,10 +990,11 @@ rescheduleKillAboveMI(MachineBasicBlock::iterator &mi, continue; if (isDefTooClose(MOReg, DI->second, MI)) return false; - if (MOReg == Reg && !MO.isKill()) + bool isKill = MO.isKill() || (LIS && isPlainlyKilled(KillMI, MOReg, LIS)); + if (MOReg == Reg && !isKill) return false; Uses.insert(MOReg); - if (MO.isKill() && MOReg != Reg) + if (isKill && MOReg != Reg) Kills.insert(MOReg); } else if (TargetRegisterInfo::isPhysicalRegister(MOReg)) { Defs.insert(MOReg); @@ -939,7 +1034,8 @@ rescheduleKillAboveMI(MachineBasicBlock::iterator &mi, if (Kills.count(MOReg)) // Don't want to extend other live ranges and update kills. 
return false; - if (OtherMI != MI && MOReg == Reg && !MO.isKill()) + if (OtherMI != MI && MOReg == Reg && + !(MO.isKill() || (LIS && isPlainlyKilled(OtherMI, MOReg, LIS)))) // We can't schedule across a use of the register in question. return false; } else { @@ -973,10 +1069,12 @@ rescheduleKillAboveMI(MachineBasicBlock::iterator &mi, DistanceMap.erase(DI); // Update live variables - LV->removeVirtualRegisterKilled(Reg, KillMI); - LV->addVirtualRegisterKilled(Reg, MI); - if (LIS) + if (LIS) { LIS->handleMove(KillMI); + } else { + LV->removeVirtualRegisterKilled(Reg, KillMI); + LV->addVirtualRegisterKilled(Reg, MI); + } DEBUG(dbgs() << "\trescheduled kill: " << *KillMI); return true; @@ -987,11 +1085,13 @@ rescheduleKillAboveMI(MachineBasicBlock::iterator &mi, /// either eliminate the tied operands or improve the opportunities for /// coalescing away the register copy. Returns true if no copy needs to be /// inserted to untie mi's operands (either because they were untied, or -/// because mi was rescheduled, and will be visited again later). +/// because mi was rescheduled, and will be visited again later). If the +/// shouldOnlyCommute flag is true, only instruction commutation is attempted. bool TwoAddressInstructionPass:: tryInstructionTransform(MachineBasicBlock::iterator &mi, MachineBasicBlock::iterator &nmi, - unsigned SrcIdx, unsigned DstIdx, unsigned Dist) { + unsigned SrcIdx, unsigned DstIdx, + unsigned Dist, bool shouldOnlyCommute) { if (OptLevel == CodeGenOpt::None) return false; @@ -1001,7 +1101,7 @@ tryInstructionTransform(MachineBasicBlock::iterator &mi, assert(TargetRegisterInfo::isVirtualRegister(regB) && "cannot make instruction into two-address form"); - bool regBKilled = isKilled(MI, regB, MRI, TII); + bool regBKilled = isKilled(MI, regB, MRI, TII, LIS, true); if (TargetRegisterInfo::isVirtualRegister(regA)) scanUses(regA); @@ -1021,7 +1121,7 @@ tryInstructionTransform(MachineBasicBlock::iterator &mi, if (regCIdx != ~0U) { regC = MI.getOperand(regCIdx).getReg(); - if (!regBKilled && isKilled(MI, regC, MRI, TII)) + if (!regBKilled && isKilled(MI, regC, MRI, TII, LIS, false)) // If C dies but B does not, swap the B and C operands. // This makes the live ranges of A and C joinable. TryCommute = true; @@ -1040,6 +1140,9 @@ tryInstructionTransform(MachineBasicBlock::iterator &mi, return false; } + if (shouldOnlyCommute) + return false; + // If there is one more use of regB later in the same MBB, consider // re-schedule this MI below it. if (rescheduleMIBelowKill(mi, nmi, regB)) { @@ -1115,10 +1218,12 @@ tryInstructionTransform(MachineBasicBlock::iterator &mi, unsigned NewDstIdx = NewMIs[1]->findRegisterDefOperandIdx(regA); unsigned NewSrcIdx = NewMIs[1]->findRegisterUseOperandIdx(regB); MachineBasicBlock::iterator NewMI = NewMIs[1]; - bool TransformSuccess = - tryInstructionTransform(NewMI, mi, NewSrcIdx, NewDstIdx, Dist); - if (TransformSuccess || - NewMIs[1]->getOperand(NewSrcIdx).isKill()) { + bool TransformResult = + tryInstructionTransform(NewMI, mi, NewSrcIdx, NewDstIdx, Dist, true); + (void)TransformResult; + assert(!TransformResult && + "tryInstructionTransform() should return false."); + if (NewMIs[1]->getOperand(NewSrcIdx).isKill()) { // Success, or at least we made an improvement. Keep the unfolded // instructions and discard the original. 
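
An aside on the kill search that both rescheduleMIBelowKill and rescheduleKillAboveMI now perform: pulled out into a standalone helper it is easier to follow. This is a sketch, not code from the patch; the name findKillViaLIS is hypothetical, but every call in the body appears in the hunks above.

    // Locate the instruction that kills Reg inside MBB using LiveIntervals.
    // Returns 0 when Reg is live-out of the block, i.e. has no in-block kill.
    static MachineInstr *findKillViaLIS(LiveIntervals *LIS, unsigned Reg,
                                        MachineBasicBlock *MBB) {
      LiveInterval &LI = LIS->getInterval(Reg);
      assert(LI.end() != LI.begin() && "Reg should not have empty live interval.");
      // The slot just before the block boundary; a segment that covers it but
      // starts earlier extends past the block, so Reg is live-out.
      SlotIndex MBBEndIdx = LIS->getMBBEndIdx(MBB).getPrevSlot();
      LiveInterval::const_iterator I = LI.find(MBBEndIdx);
      if (I != LI.end() && I->start < MBBEndIdx)
        return 0;
      // Otherwise the previous segment is the last one inside the block, and
      // its end slot indexes the killing instruction.
      --I;
      return LIS->getInstructionFromIndex(I->end);
    }

Both routines bail out in the live-out case, since extending other live ranges is exactly what they are trying to avoid.
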
if (LV) { @@ -1149,10 +1254,26 @@ tryInstructionTransform(MachineBasicBlock::iterator &mi, } LV->addVirtualRegisterKilled(Reg, NewMIs[1]); } + + SmallVector<unsigned, 4> OrigRegs; + if (LIS) { + for (MachineInstr::const_mop_iterator MOI = MI.operands_begin(), + MOE = MI.operands_end(); MOI != MOE; ++MOI) { + if (MOI->isReg()) + OrigRegs.push_back(MOI->getReg()); + } + } + MI.eraseFromParent(); + + // Update LiveIntervals. + if (LIS) { + MachineBasicBlock::iterator Begin(NewMIs[0]); + MachineBasicBlock::iterator End(NewMIs[1]); + LIS->repairIntervalsInRange(MBB, Begin, End, OrigRegs); + } + mi = NewMIs[1]; - if (TransformSuccess) - return true; } else { // Transforming didn't eliminate the tie and didn't lead to an // improvement. Clean up the unfolded instructions and keep the @@ -1215,9 +1336,15 @@ TwoAddressInstructionPass::processTiedPairs(MachineInstr *MI, TiedPairList &TiedPairs, unsigned &Dist) { bool IsEarlyClobber = false; + for (unsigned tpi = 0, tpe = TiedPairs.size(); tpi != tpe; ++tpi) { + const MachineOperand &DstMO = MI->getOperand(TiedPairs[tpi].second); + IsEarlyClobber |= DstMO.isEarlyClobber(); + } + bool RemovedKillFlag = false; bool AllUsesCopied = true; unsigned LastCopiedReg = 0; + SlotIndex LastCopyIdx; unsigned RegB = 0; for (unsigned tpi = 0, tpe = TiedPairs.size(); tpi != tpe; ++tpi) { unsigned SrcIdx = TiedPairs[tpi].first; @@ -1225,7 +1352,6 @@ TwoAddressInstructionPass::processTiedPairs(MachineInstr *MI, const MachineOperand &DstMO = MI->getOperand(DstIdx); unsigned RegA = DstMO.getReg(); - IsEarlyClobber |= DstMO.isEarlyClobber(); // Grab RegB from the instruction because it may have changed if the // instruction was commuted. @@ -1263,9 +1389,17 @@ TwoAddressInstructionPass::processTiedPairs(MachineInstr *MI, DistanceMap.insert(std::make_pair(PrevMI, Dist)); DistanceMap[MI] = ++Dist; - SlotIndex CopyIdx; - if (Indexes) - CopyIdx = Indexes->insertMachineInstrInMaps(PrevMI).getRegSlot(); + if (LIS) { + LastCopyIdx = LIS->InsertMachineInstrInMaps(PrevMI).getRegSlot(); + + if (TargetRegisterInfo::isVirtualRegister(RegA)) { + LiveInterval &LI = LIS->getInterval(RegA); + VNInfo *VNI = LI.getNextValue(LastCopyIdx, LIS->getVNInfoAllocator()); + SlotIndex endIdx = + LIS->getInstructionIndex(MI).getRegSlot(IsEarlyClobber); + LI.addRange(LiveRange(LastCopyIdx, endIdx, VNI)); + } + } DEBUG(dbgs() << "\t\tprepend:\t" << *PrevMI); @@ -1311,6 +1445,18 @@ TwoAddressInstructionPass::processTiedPairs(MachineInstr *MI, LV->addVirtualRegisterKilled(RegB, PrevMI); } + // Update LiveIntervals. 
+    if (LIS) {
+      LiveInterval &LI = LIS->getInterval(RegB);
+      SlotIndex MIIdx = LIS->getInstructionIndex(MI);
+      LiveInterval::const_iterator I = LI.find(MIIdx);
+      assert(I != LI.end() && "RegB must be live-in to use.");
+
+      SlotIndex UseIdx = MIIdx.getRegSlot(IsEarlyClobber);
+      if (I->end == UseIdx)
+        LI.removeRange(LastCopyIdx, UseIdx);
+    }
+
   } else if (RemovedKillFlag) {
     // Some tied uses of regB matched their destination registers, so
     // regB is still used in this instruction, but a kill flag was
@@ -1335,7 +1481,6 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &Func) {
   TII = TM.getInstrInfo();
   TRI = TM.getRegisterInfo();
   InstrItins = TM.getInstrItineraryData();
-  Indexes = getAnalysisIfAvailable<SlotIndexes>();
   LV = getAnalysisIfAvailable<LiveVariables>();
   LIS = getAnalysisIfAvailable<LiveIntervals>();
   AA = &getAnalysis<AliasAnalysis>();
@@ -1399,7 +1544,7 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &Func) {
         unsigned SrcReg = mi->getOperand(SrcIdx).getReg();
         unsigned DstReg = mi->getOperand(DstIdx).getReg();
         if (SrcReg != DstReg &&
-            tryInstructionTransform(mi, nmi, SrcIdx, DstIdx, Dist)) {
+            tryInstructionTransform(mi, nmi, SrcIdx, DstIdx, Dist, false)) {
           // The tied operands have been eliminated or shifted further down the
           // block to ease elimination. Continue processing with 'nmi'.
           TiedOperands.clear();
@@ -1437,6 +1582,9 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &Func) {
     }
   }

+  if (LIS)
+    MF->verify(this, "After two-address instruction pass");
+
   return MadeChange;
 }

@@ -1462,6 +1610,13 @@ eliminateRegSequence(MachineBasicBlock::iterator &MBBI) {
     llvm_unreachable(0);
   }

+  SmallVector<unsigned, 4> OrigRegs;
+  if (LIS) {
+    OrigRegs.push_back(MI->getOperand(0).getReg());
+    for (unsigned i = 1, e = MI->getNumOperands(); i < e; i += 2)
+      OrigRegs.push_back(MI->getOperand(i).getReg());
+  }
+
   bool DefEmitted = false;
   for (unsigned i = 1, e = MI->getNumOperands(); i < e; i += 2) {
     MachineOperand &UseMO = MI->getOperand(i);
@@ -1505,6 +1660,9 @@ eliminateRegSequence(MachineBasicBlock::iterator &MBBI) {
     DEBUG(dbgs() << "Inserted: " << *CopyMI);
   }

+  MachineBasicBlock::iterator EndMBBI =
+      llvm::next(MachineBasicBlock::iterator(MI));
+
   if (!DefEmitted) {
     DEBUG(dbgs() << "Turned: " << *MI << " into an IMPLICIT_DEF");
     MI->setDesc(TII->get(TargetOpcode::IMPLICIT_DEF));
@@ -1514,4 +1672,8 @@ eliminateRegSequence(MachineBasicBlock::iterator &MBBI) {
     DEBUG(dbgs() << "Eliminated: " << *MI);
     MI->eraseFromParent();
   }
+
+  // Update LiveIntervals.
+ if (LIS) + LIS->repairIntervalsInRange(MBB, MBBI, EndMBBI, OrigRegs); } diff --git a/lib/DebugInfo/CMakeLists.txt b/lib/DebugInfo/CMakeLists.txt index 1e9e509..e97455a 100644 --- a/lib/DebugInfo/CMakeLists.txt +++ b/lib/DebugInfo/CMakeLists.txt @@ -6,6 +6,7 @@ add_llvm_library(LLVMDebugInfo DWARFDebugAbbrev.cpp DWARFDebugArangeSet.cpp DWARFDebugAranges.cpp + DWARFDebugFrame.cpp DWARFDebugInfoEntry.cpp DWARFDebugLine.cpp DWARFDebugRangeList.cpp diff --git a/lib/DebugInfo/DWARFCompileUnit.h b/lib/DebugInfo/DWARFCompileUnit.h index c58664f..2a74605 100644 --- a/lib/DebugInfo/DWARFCompileUnit.h +++ b/lib/DebugInfo/DWARFCompileUnit.h @@ -13,6 +13,7 @@ #include "DWARFDebugAbbrev.h" #include "DWARFDebugInfoEntry.h" #include "DWARFDebugRangeList.h" +#include "DWARFRelocMap.h" #include <vector> namespace llvm { @@ -20,7 +21,6 @@ namespace llvm { class DWARFDebugAbbrev; class StringRef; class raw_ostream; -typedef DenseMap<uint64_t, std::pair<uint8_t, int64_t> > RelocAddrMap; class DWARFCompileUnit { const DWARFDebugAbbrev *Abbrev; @@ -29,6 +29,7 @@ class DWARFCompileUnit { StringRef RangeSection; StringRef StringSection; StringRef StringOffsetSection; + StringRef AddrOffsetSection; const RelocAddrMap *RelocMap; bool isLittleEndian; @@ -43,16 +44,17 @@ class DWARFCompileUnit { public: DWARFCompileUnit(const DWARFDebugAbbrev *DA, StringRef IS, StringRef AS, - StringRef RS, StringRef SS, StringRef SOS, + StringRef RS, StringRef SS, StringRef SOS, StringRef AOS, const RelocAddrMap *M, bool LE) : Abbrev(DA), InfoSection(IS), AbbrevSection(AS), RangeSection(RS), StringSection(SS), StringOffsetSection(SOS), - RelocMap(M), isLittleEndian(LE) { + AddrOffsetSection(AOS), RelocMap(M), isLittleEndian(LE) { clear(); } StringRef getStringSection() const { return StringSection; } StringRef getStringOffsetSection() const { return StringOffsetSection; } + StringRef getAddrOffsetSection() const { return AddrOffsetSection; } const RelocAddrMap *getRelocMap() const { return RelocMap; } DataExtractor getDebugInfoExtractor() const; diff --git a/lib/DebugInfo/DWARFContext.cpp b/lib/DebugInfo/DWARFContext.cpp index d6d9fcf..9e19310 100644 --- a/lib/DebugInfo/DWARFContext.cpp +++ b/lib/DebugInfo/DWARFContext.cpp @@ -19,72 +19,123 @@ using namespace dwarf; typedef DWARFDebugLine::LineTable DWARFLineTable; -void DWARFContext::dump(raw_ostream &OS) { - OS << ".debug_abbrev contents:\n"; - getDebugAbbrev()->dump(OS); +void DWARFContext::dump(raw_ostream &OS, DIDumpType DumpType) { + if (DumpType == DIDT_All || DumpType == DIDT_Abbrev) { + OS << ".debug_abbrev contents:\n"; + getDebugAbbrev()->dump(OS); + } - OS << "\n.debug_info contents:\n"; - for (unsigned i = 0, e = getNumCompileUnits(); i != e; ++i) - getCompileUnitAtIndex(i)->dump(OS); + if (DumpType == DIDT_All || DumpType == DIDT_Info) { + OS << "\n.debug_info contents:\n"; + for (unsigned i = 0, e = getNumCompileUnits(); i != e; ++i) + getCompileUnitAtIndex(i)->dump(OS); + } + + if (DumpType == DIDT_All || DumpType == DIDT_Frames) { + OS << "\n.debug_frame contents:\n"; + getDebugFrame()->dump(OS); + } - OS << "\n.debug_aranges contents:\n"; - DataExtractor arangesData(getARangeSection(), isLittleEndian(), 0); uint32_t offset = 0; - DWARFDebugArangeSet set; - while (set.extract(arangesData, &offset)) - set.dump(OS); + if (DumpType == DIDT_All || DumpType == DIDT_Aranges) { + OS << "\n.debug_aranges contents:\n"; + DataExtractor arangesData(getARangeSection(), isLittleEndian(), 0); + DWARFDebugArangeSet set; + while (set.extract(arangesData, &offset)) + set.dump(OS); 
+ } uint8_t savedAddressByteSize = 0; - OS << "\n.debug_line contents:\n"; - for (unsigned i = 0, e = getNumCompileUnits(); i != e; ++i) { - DWARFCompileUnit *cu = getCompileUnitAtIndex(i); - savedAddressByteSize = cu->getAddressByteSize(); - unsigned stmtOffset = - cu->getCompileUnitDIE()->getAttributeValueAsUnsigned(cu, DW_AT_stmt_list, - -1U); - if (stmtOffset != -1U) { - DataExtractor lineData(getLineSection(), isLittleEndian(), + if (DumpType == DIDT_All || DumpType == DIDT_Line) { + OS << "\n.debug_line contents:\n"; + for (unsigned i = 0, e = getNumCompileUnits(); i != e; ++i) { + DWARFCompileUnit *cu = getCompileUnitAtIndex(i); + savedAddressByteSize = cu->getAddressByteSize(); + unsigned stmtOffset = + cu->getCompileUnitDIE()->getAttributeValueAsUnsigned(cu, DW_AT_stmt_list, + -1U); + if (stmtOffset != -1U) { + DataExtractor lineData(getLineSection(), isLittleEndian(), + savedAddressByteSize); + DWARFDebugLine::DumpingState state(OS); + DWARFDebugLine::parseStatementTable(lineData, &lineRelocMap(), &stmtOffset, state); + } + } + } + + if (DumpType == DIDT_All || DumpType == DIDT_Str) { + OS << "\n.debug_str contents:\n"; + DataExtractor strData(getStringSection(), isLittleEndian(), 0); + offset = 0; + uint32_t strOffset = 0; + while (const char *s = strData.getCStr(&offset)) { + OS << format("0x%8.8x: \"%s\"\n", strOffset, s); + strOffset = offset; + } + } + + if (DumpType == DIDT_All || DumpType == DIDT_Ranges) { + OS << "\n.debug_ranges contents:\n"; + // In fact, different compile units may have different address byte + // sizes, but for simplicity we just use the address byte size of the last + // compile unit (there is no easy and fast way to associate address range + // list and the compile unit it describes). + DataExtractor rangesData(getRangeSection(), isLittleEndian(), savedAddressByteSize); - DWARFDebugLine::DumpingState state(OS); - DWARFDebugLine::parseStatementTable(lineData, &stmtOffset, state); + offset = 0; + DWARFDebugRangeList rangeList; + while (rangeList.extract(rangesData, &offset)) + rangeList.dump(OS); + } + + if (DumpType == DIDT_All || DumpType == DIDT_Pubnames) { + OS << "\n.debug_pubnames contents:\n"; + DataExtractor pubNames(getPubNamesSection(), isLittleEndian(), 0); + offset = 0; + OS << "Length: " << pubNames.getU32(&offset) << "\n"; + OS << "Version: " << pubNames.getU16(&offset) << "\n"; + OS << "Offset in .debug_info: " << pubNames.getU32(&offset) << "\n"; + OS << "Size: " << pubNames.getU32(&offset) << "\n"; + OS << "\n Offset Name\n"; + while (offset < getPubNamesSection().size()) { + uint32_t n = pubNames.getU32(&offset); + if (n == 0) + break; + OS << format("%8x ", n); + OS << pubNames.getCStr(&offset) << "\n"; } } - OS << "\n.debug_str contents:\n"; - DataExtractor strData(getStringSection(), isLittleEndian(), 0); - offset = 0; - uint32_t strOffset = 0; - while (const char *s = strData.getCStr(&offset)) { - OS << format("0x%8.8x: \"%s\"\n", strOffset, s); - strOffset = offset; + if (DumpType == DIDT_All || DumpType == DIDT_AbbrevDwo) { + OS << "\n.debug_abbrev.dwo contents:\n"; + getDebugAbbrevDWO()->dump(OS); + } + + if (DumpType == DIDT_All || DumpType == DIDT_InfoDwo) { + OS << "\n.debug_info.dwo contents:\n"; + for (unsigned i = 0, e = getNumDWOCompileUnits(); i != e; ++i) + getDWOCompileUnitAtIndex(i)->dump(OS); } - OS << "\n.debug_ranges contents:\n"; - // In fact, different compile units may have different address byte - // sizes, but for simplicity we just use the address byte size of the last - // compile unit (there is no easy 
and fast way to associate address range - // list and the compile unit it describes). - DataExtractor rangesData(getRangeSection(), isLittleEndian(), - savedAddressByteSize); - offset = 0; - DWARFDebugRangeList rangeList; - while (rangeList.extract(rangesData, &offset)) - rangeList.dump(OS); - - OS << "\n.debug_abbrev.dwo contents:\n"; - getDebugAbbrevDWO()->dump(OS); - - OS << "\n.debug_info.dwo contents:\n"; - for (unsigned i = 0, e = getNumDWOCompileUnits(); i != e; ++i) - getDWOCompileUnitAtIndex(i)->dump(OS); - - OS << "\n.debug_str.dwo contents:\n"; - DataExtractor strDWOData(getStringDWOSection(), isLittleEndian(), 0); - offset = 0; - uint32_t strDWOOffset = 0; - while (const char *s = strDWOData.getCStr(&offset)) { - OS << format("0x%8.8x: \"%s\"\n", strDWOOffset, s); - strDWOOffset = offset; + if (DumpType == DIDT_All || DumpType == DIDT_StrDwo) { + OS << "\n.debug_str.dwo contents:\n"; + DataExtractor strDWOData(getStringDWOSection(), isLittleEndian(), 0); + offset = 0; + uint32_t strDWOOffset = 0; + while (const char *s = strDWOData.getCStr(&offset)) { + OS << format("0x%8.8x: \"%s\"\n", strDWOOffset, s); + strDWOOffset = offset; + } + } + + if (DumpType == DIDT_All || DumpType == DIDT_StrOffsetsDwo) { + OS << "\n.debug_str_offsets.dwo contents:\n"; + DataExtractor strOffsetExt(getStringOffsetDWOSection(), isLittleEndian(), 0); + offset = 0; + while (offset < getStringOffsetDWOSection().size()) { + OS << format("0x%8.8x: ", offset); + OS << format("%8.8x\n", strOffsetExt.getU32(&offset)); + } } } @@ -124,10 +175,30 @@ const DWARFDebugAranges *DWARFContext::getDebugAranges() { return Aranges.get(); } +const DWARFDebugFrame *DWARFContext::getDebugFrame() { + if (DebugFrame) + return DebugFrame.get(); + + // There's a "bug" in the DWARFv3 standard with respect to the target address + // size within debug frame sections. While DWARF is supposed to be independent + // of its container, FDEs have fields with size being "target address size", + // which isn't specified in DWARF in general. It's only specified for CUs, but + // .eh_frame can appear without a .debug_info section. Follow the example of + // other tools (libdwarf) and extract this from the container (ObjectFile + // provides this information). 
This problem is fixed in DWARFv4 + // See this dwarf-discuss discussion for more details: + // http://lists.dwarfstd.org/htdig.cgi/dwarf-discuss-dwarfstd.org/2011-December/001173.html + DataExtractor debugFrameData(getDebugFrameSection(), isLittleEndian(), + getAddressSize()); + DebugFrame.reset(new DWARFDebugFrame()); + DebugFrame->parse(debugFrameData); + return DebugFrame.get(); +} + const DWARFLineTable * DWARFContext::getLineTableForCompileUnit(DWARFCompileUnit *cu) { if (!Line) - Line.reset(new DWARFDebugLine()); + Line.reset(new DWARFDebugLine(&lineRelocMap())); unsigned stmtOffset = cu->getCompileUnitDIE()->getAttributeValueAsUnsigned(cu, DW_AT_stmt_list, @@ -152,7 +223,8 @@ void DWARFContext::parseCompileUnits() { while (DIData.isValidOffset(offset)) { CUs.push_back(DWARFCompileUnit(getDebugAbbrev(), getInfoSection(), getAbbrevSection(), getRangeSection(), - getStringSection(), "", + getStringSection(), StringRef(), + getAddrSection(), &infoRelocMap(), isLittleEndian())); if (!CUs.back().extract(DIData, &offset)) { @@ -174,6 +246,7 @@ void DWARFContext::parseDWOCompileUnits() { getRangeDWOSection(), getStringDWOSection(), getStringOffsetDWOSection(), + getAddrSection(), &infoDWORelocMap(), isLittleEndian())); if (!DWOCUs.back().extract(DIData, &offset)) { @@ -295,6 +368,64 @@ DILineInfo DWARFContext::getLineInfoForAddress(uint64_t Address, Line, Column); } +DILineInfoTable DWARFContext::getLineInfoForAddressRange(uint64_t Address, + uint64_t Size, + DILineInfoSpecifier Specifier) { + DILineInfoTable Lines; + DWARFCompileUnit *CU = getCompileUnitForAddress(Address); + if (!CU) + return Lines; + + std::string FunctionName = "<invalid>"; + if (Specifier.needs(DILineInfoSpecifier::FunctionName)) { + // The address may correspond to instruction in some inlined function, + // so we have to build the chain of inlined functions and take the + // name of the topmost function in it. + const DWARFDebugInfoEntryMinimal::InlinedChain &InlinedChain = + CU->getInlinedChainForAddress(Address); + if (InlinedChain.size() > 0) { + const DWARFDebugInfoEntryMinimal &TopFunctionDIE = InlinedChain[0]; + if (const char *Name = TopFunctionDIE.getSubroutineName(CU)) + FunctionName = Name; + } + } + + StringRef FuncNameRef = StringRef(FunctionName); + + // If the Specifier says we don't need FileLineInfo, just + // return the top-most function at the starting address. + if (!Specifier.needs(DILineInfoSpecifier::FileLineInfo)) { + Lines.push_back(std::make_pair(Address, + DILineInfo(StringRef("<invalid>"), + FuncNameRef, 0, 0))); + return Lines; + } + + const DWARFLineTable *LineTable = getLineTableForCompileUnit(CU); + const bool NeedsAbsoluteFilePath = + Specifier.needs(DILineInfoSpecifier::AbsoluteFilePath); + + // Get the index of row we're looking for in the line table. + std::vector<uint32_t> RowVector; + if (!LineTable->lookupAddressRange(Address, Size, RowVector)) + return Lines; + + uint32_t NumRows = RowVector.size(); + for (uint32_t i = 0; i < NumRows; ++i) { + uint32_t RowIndex = RowVector[i]; + // Take file number and line/column from the row. 
+ const DWARFDebugLine::Row &Row = LineTable->Rows[RowIndex]; + std::string FileName = "<invalid>"; + getFileNameForCompileUnit(CU, LineTable, Row.File, + NeedsAbsoluteFilePath, FileName); + Lines.push_back(std::make_pair(Row.Address, + DILineInfo(StringRef(FileName), + FuncNameRef, Row.Line, Row.Column))); + } + + return Lines; +} + DIInliningInfo DWARFContext::getInliningInfoForAddress(uint64_t Address, DILineInfoSpecifier Specifier) { DWARFCompileUnit *CU = getCompileUnitForAddress(Address); @@ -352,7 +483,8 @@ DIInliningInfo DWARFContext::getInliningInfoForAddress(uint64_t Address, } DWARFContextInMemory::DWARFContextInMemory(object::ObjectFile *Obj) : - IsLittleEndian(true /* FIXME */) { + IsLittleEndian(Obj->isLittleEndian()), + AddressSize(Obj->getBytesInAddress()) { error_code ec; for (object::section_iterator i = Obj->begin_sections(), e = Obj->end_sections(); @@ -371,6 +503,8 @@ DWARFContextInMemory::DWARFContextInMemory(object::ObjectFile *Obj) : LineSection = data; else if (name == "debug_aranges") ARangeSection = data; + else if (name == "debug_frame") + DebugFrameSection = data; else if (name == "debug_str") StringSection = data; else if (name == "debug_ranges") { @@ -378,6 +512,8 @@ DWARFContextInMemory::DWARFContextInMemory(object::ObjectFile *Obj) : RangeDWOSection = data; RangeSection = data; } + else if (name == "debug_pubnames") + PubNamesSection = data; else if (name == "debug_info.dwo") InfoDWOSection = data; else if (name == "debug_abbrev.dwo") @@ -386,16 +522,21 @@ DWARFContextInMemory::DWARFContextInMemory(object::ObjectFile *Obj) : StringDWOSection = data; else if (name == "debug_str_offsets.dwo") StringOffsetDWOSection = data; + else if (name == "debug_addr") + AddrSection = data; // Any more debug info sections go here. else continue; - // TODO: For now only handle relocations for the debug_info section. + // TODO: Add support for relocations in other sections as needed. + // Record relocations for the debug_info and debug_line sections. RelocAddrMap *Map; if (name == "debug_info") Map = &InfoRelocMap; else if (name == "debug_info.dwo") Map = &InfoDWORelocMap; + else if (name == "debug_line") + Map = &LineRelocMap; else continue; @@ -409,10 +550,17 @@ DWARFContextInMemory::DWARFContextInMemory(object::ObjectFile *Obj) : reloc_i->getAddress(Address); uint64_t Type; reloc_i->getType(Type); + uint64_t SymAddr = 0; + // ELF relocations may need the symbol address + if (Obj->isELF()) { + object::SymbolRef Sym; + reloc_i->getSymbol(Sym); + Sym.getAddress(SymAddr); + } object::RelocVisitor V(Obj->getFileFormatName()); // The section address is always 0 for debug sections. 
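
For reference, here is how the relocation maps built here get consumed: the .debug_line parser changed later in this patch looks up the current section offset and, on a hit, adds the stored addend to the extracted address. A minimal sketch; the helper name readRelocatedAddress is hypothetical, the types are those defined in DWARFRelocMap.h below.

    // Read an address at *Offset, applying a relocation from Map if one was
    // recorded there. Mirrors the DW_LNE_set_address handling in this patch.
    static uint64_t readRelocatedAddress(const RelocAddrMap *Map,
                                         DataExtractor Data, uint32_t *Offset) {
      RelocAddrMap::const_iterator AI = Map->find(*Offset);
      if (AI != Map->end()) {
        const std::pair<uint8_t, int64_t> &R = AI->second; // (width, addend)
        return Data.getAddress(Offset) + R.second;
      }
      return Data.getAddress(Offset);
    }
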
- object::RelocToApply R(V.visit(Type, *reloc_i)); + object::RelocToApply R(V.visit(Type, *reloc_i, 0, SymAddr)); if (V.error()) { SmallString<32> Name; error_code ec(reloc_i->getTypeName(Name)); diff --git a/lib/DebugInfo/DWARFContext.h b/lib/DebugInfo/DWARFContext.h index 687ff93..37b2729 100644 --- a/lib/DebugInfo/DWARFContext.h +++ b/lib/DebugInfo/DWARFContext.h @@ -12,6 +12,7 @@ #include "DWARFCompileUnit.h" #include "DWARFDebugAranges.h" +#include "DWARFDebugFrame.h" #include "DWARFDebugLine.h" #include "DWARFDebugRangeList.h" #include "llvm/ADT/OwningPtr.h" @@ -29,6 +30,7 @@ class DWARFContext : public DIContext { OwningPtr<DWARFDebugAbbrev> Abbrev; OwningPtr<DWARFDebugAranges> Aranges; OwningPtr<DWARFDebugLine> Line; + OwningPtr<DWARFDebugFrame> DebugFrame; SmallVector<DWARFCompileUnit, 1> DWOCUs; OwningPtr<DWARFDebugAbbrev> AbbrevDWO; @@ -45,7 +47,7 @@ class DWARFContext : public DIContext { public: DWARFContext() {} - virtual void dump(raw_ostream &OS); + virtual void dump(raw_ostream &OS, DIDumpType DumpType = DIDT_All); /// Get the number of compile units in this context. unsigned getNumCompileUnits() { @@ -84,23 +86,32 @@ public: /// Get a pointer to the parsed DebugAranges object. const DWARFDebugAranges *getDebugAranges(); + /// Get a pointer to the parsed frame information object. + const DWARFDebugFrame *getDebugFrame(); + /// Get a pointer to a parsed line table corresponding to a compile unit. const DWARFDebugLine::LineTable * getLineTableForCompileUnit(DWARFCompileUnit *cu); virtual DILineInfo getLineInfoForAddress(uint64_t Address, DILineInfoSpecifier Specifier = DILineInfoSpecifier()); + virtual DILineInfoTable getLineInfoForAddressRange(uint64_t Address, + uint64_t Size, DILineInfoSpecifier Specifier = DILineInfoSpecifier()); virtual DIInliningInfo getInliningInfoForAddress(uint64_t Address, DILineInfoSpecifier Specifier = DILineInfoSpecifier()); virtual bool isLittleEndian() const = 0; + virtual uint8_t getAddressSize() const = 0; virtual const RelocAddrMap &infoRelocMap() const = 0; + virtual const RelocAddrMap &lineRelocMap() const = 0; virtual StringRef getInfoSection() = 0; virtual StringRef getAbbrevSection() = 0; virtual StringRef getARangeSection() = 0; + virtual StringRef getDebugFrameSection() = 0; virtual StringRef getLineSection() = 0; virtual StringRef getStringSection() = 0; virtual StringRef getRangeSection() = 0; + virtual StringRef getPubNamesSection() = 0; // Sections for DWARF5 split dwarf proposal. virtual StringRef getInfoDWOSection() = 0; @@ -108,6 +119,7 @@ public: virtual StringRef getStringDWOSection() = 0; virtual StringRef getStringOffsetDWOSection() = 0; virtual StringRef getRangeDWOSection() = 0; + virtual StringRef getAddrSection() = 0; virtual const RelocAddrMap &infoDWORelocMap() const = 0; static bool isSupportedVersion(unsigned version) { @@ -128,13 +140,17 @@ private: class DWARFContextInMemory : public DWARFContext { virtual void anchor(); bool IsLittleEndian; + uint8_t AddressSize; RelocAddrMap InfoRelocMap; + RelocAddrMap LineRelocMap; StringRef InfoSection; StringRef AbbrevSection; StringRef ARangeSection; + StringRef DebugFrameSection; StringRef LineSection; StringRef StringSection; StringRef RangeSection; + StringRef PubNamesSection; // Sections for DWARF5 split dwarf proposal. 
RelocAddrMap InfoDWORelocMap; @@ -143,17 +159,22 @@ class DWARFContextInMemory : public DWARFContext { StringRef StringDWOSection; StringRef StringOffsetDWOSection; StringRef RangeDWOSection; + StringRef AddrSection; public: DWARFContextInMemory(object::ObjectFile *); virtual bool isLittleEndian() const { return IsLittleEndian; } + virtual uint8_t getAddressSize() const { return AddressSize; } virtual const RelocAddrMap &infoRelocMap() const { return InfoRelocMap; } + virtual const RelocAddrMap &lineRelocMap() const { return LineRelocMap; } virtual StringRef getInfoSection() { return InfoSection; } virtual StringRef getAbbrevSection() { return AbbrevSection; } virtual StringRef getARangeSection() { return ARangeSection; } + virtual StringRef getDebugFrameSection() { return DebugFrameSection; } virtual StringRef getLineSection() { return LineSection; } virtual StringRef getStringSection() { return StringSection; } virtual StringRef getRangeSection() { return RangeSection; } + virtual StringRef getPubNamesSection() { return PubNamesSection; } // Sections for DWARF5 split dwarf proposal. virtual StringRef getInfoDWOSection() { return InfoDWOSection; } @@ -163,6 +184,9 @@ public: return StringOffsetDWOSection; } virtual StringRef getRangeDWOSection() { return RangeDWOSection; } + virtual StringRef getAddrSection() { + return AddrSection; + } virtual const RelocAddrMap &infoDWORelocMap() const { return InfoDWORelocMap; } diff --git a/lib/DebugInfo/DWARFDebugFrame.cpp b/lib/DebugInfo/DWARFDebugFrame.cpp new file mode 100644 index 0000000..3efe6a1 --- /dev/null +++ b/lib/DebugInfo/DWARFDebugFrame.cpp @@ -0,0 +1,391 @@ +//===-- DWARFDebugFrame.h - Parsing of .debug_frame -------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "DWARFDebugFrame.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/Support/DataTypes.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/Dwarf.h" +#include "llvm/Support/Format.h" +#include "llvm/Support/raw_ostream.h" +#include <string> +#include <vector> + +using namespace llvm; +using namespace dwarf; + + +/// \brief Abstract frame entry defining the common interface concrete +/// entries implement. +class llvm::FrameEntry { +public: + enum FrameKind {FK_CIE, FK_FDE}; + FrameEntry(FrameKind K, DataExtractor D, uint64_t Offset, uint64_t Length) + : Kind(K), Data(D), Offset(Offset), Length(Length) {} + + virtual ~FrameEntry() { + } + + FrameKind getKind() const { return Kind; } + virtual uint64_t getOffset() const { return Offset; } + + /// \brief Parse and store a sequence of CFI instructions from our data + /// stream, starting at *Offset and ending at EndOffset. If everything + /// goes well, *Offset should be equal to EndOffset when this method + /// returns. Otherwise, an error occurred. + virtual void parseInstructions(uint32_t *Offset, uint32_t EndOffset); + + /// \brief Dump the entry header to the given output stream. + virtual void dumpHeader(raw_ostream &OS) const = 0; + + /// \brief Dump the entry's instructions to the given output stream. + virtual void dumpInstructions(raw_ostream &OS) const; + +protected: + const FrameKind Kind; + + /// \brief The data stream holding the section from which the entry was + /// parsed. + DataExtractor Data; + + /// \brief Offset of this entry in the section. 
+ uint64_t Offset; + + /// \brief Entry length as specified in DWARF. + uint64_t Length; + + /// An entry may contain CFI instructions. An instruction consists of an + /// opcode and an optional sequence of operands. + typedef std::vector<uint64_t> Operands; + struct Instruction { + Instruction(uint8_t Opcode) + : Opcode(Opcode) + {} + + uint8_t Opcode; + Operands Ops; + }; + + std::vector<Instruction> Instructions; + + /// Convenience methods to add a new instruction with the given opcode and + /// operands to the Instructions vector. + void addInstruction(uint8_t Opcode) { + Instructions.push_back(Instruction(Opcode)); + } + + void addInstruction(uint8_t Opcode, uint64_t Operand1) { + Instructions.push_back(Instruction(Opcode)); + Instructions.back().Ops.push_back(Operand1); + } + + void addInstruction(uint8_t Opcode, uint64_t Operand1, uint64_t Operand2) { + Instructions.push_back(Instruction(Opcode)); + Instructions.back().Ops.push_back(Operand1); + Instructions.back().Ops.push_back(Operand2); + } +}; + + +// See DWARF standard v3, section 7.23 +const uint8_t DWARF_CFI_PRIMARY_OPCODE_MASK = 0xc0; +const uint8_t DWARF_CFI_PRIMARY_OPERAND_MASK = 0x3f; + + +void FrameEntry::parseInstructions(uint32_t *Offset, uint32_t EndOffset) { + while (*Offset < EndOffset) { + uint8_t Opcode = Data.getU8(Offset); + // Some instructions have a primary opcode encoded in the top bits. + uint8_t Primary = Opcode & DWARF_CFI_PRIMARY_OPCODE_MASK; + + if (Primary) { + // If it's a primary opcode, the first operand is encoded in the bottom + // bits of the opcode itself. + uint64_t Op1 = Opcode & DWARF_CFI_PRIMARY_OPERAND_MASK; + switch (Primary) { + default: llvm_unreachable("Impossible primary CFI opcode"); + case DW_CFA_advance_loc: + case DW_CFA_restore: + addInstruction(Primary, Op1); + break; + case DW_CFA_offset: + addInstruction(Primary, Op1, Data.getULEB128(Offset)); + break; + } + } else { + // Extended opcode - its value is Opcode itself. 
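
The two masks above encode the CFI byte layout the parser relies on: a nonzero value in the top two bits is a primary opcode with its first operand packed into the low six bits, while a zero top field means an extended opcode whose operands follow in the stream. A small illustrative sketch of that split (DecodedCFI and decodeCFIByte are invented names):

    // Split one CFI byte into its primary/extended interpretation.
    struct DecodedCFI { uint8_t Opcode; uint8_t EmbeddedOperand; bool IsPrimary; };

    static DecodedCFI decodeCFIByte(uint8_t Byte) {
      DecodedCFI D;
      D.IsPrimary = (Byte & DWARF_CFI_PRIMARY_OPCODE_MASK) != 0;
      D.Opcode = D.IsPrimary ? (Byte & DWARF_CFI_PRIMARY_OPCODE_MASK) : Byte;
      D.EmbeddedOperand = D.IsPrimary ? (Byte & DWARF_CFI_PRIMARY_OPERAND_MASK) : 0;
      return D;
    }
    // Example: 0x45 decodes as DW_CFA_advance_loc (0x40) with a delta of 5.
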
+ switch (Opcode) { + default: llvm_unreachable("Invalid extended CFI opcode"); + case DW_CFA_nop: + case DW_CFA_remember_state: + case DW_CFA_restore_state: + // No operands + addInstruction(Opcode); + break; + case DW_CFA_set_loc: + // Operands: Address + addInstruction(Opcode, Data.getAddress(Offset)); + break; + case DW_CFA_advance_loc1: + // Operands: 1-byte delta + addInstruction(Opcode, Data.getU8(Offset)); + break; + case DW_CFA_advance_loc2: + // Operands: 2-byte delta + addInstruction(Opcode, Data.getU16(Offset)); + break; + case DW_CFA_advance_loc4: + // Operands: 4-byte delta + addInstruction(Opcode, Data.getU32(Offset)); + break; + case DW_CFA_restore_extended: + case DW_CFA_undefined: + case DW_CFA_same_value: + case DW_CFA_def_cfa_register: + case DW_CFA_def_cfa_offset: + // Operands: ULEB128 + addInstruction(Opcode, Data.getULEB128(Offset)); + break; + case DW_CFA_def_cfa_offset_sf: + // Operands: SLEB128 + addInstruction(Opcode, Data.getSLEB128(Offset)); + break; + case DW_CFA_offset_extended: + case DW_CFA_register: + case DW_CFA_def_cfa: + case DW_CFA_val_offset: + // Operands: ULEB128, ULEB128 + addInstruction(Opcode, Data.getULEB128(Offset), + Data.getULEB128(Offset)); + break; + case DW_CFA_offset_extended_sf: + case DW_CFA_def_cfa_sf: + case DW_CFA_val_offset_sf: + // Operands: ULEB128, SLEB128 + addInstruction(Opcode, Data.getULEB128(Offset), + Data.getSLEB128(Offset)); + break; + case DW_CFA_def_cfa_expression: + case DW_CFA_expression: + case DW_CFA_val_expression: + // TODO: implement this + report_fatal_error("Values with expressions not implemented yet!"); + } + } + } +} + + +void FrameEntry::dumpInstructions(raw_ostream &OS) const { + // TODO: at the moment only instruction names are dumped. Expand this to + // dump operands as well. + for (std::vector<Instruction>::const_iterator I = Instructions.begin(), + E = Instructions.end(); + I != E; ++I) { + uint8_t Opcode = I->Opcode; + if (Opcode & DWARF_CFI_PRIMARY_OPCODE_MASK) + Opcode &= DWARF_CFI_PRIMARY_OPCODE_MASK; + OS << " " << CallFrameString(Opcode) << ":\n"; + } +} + + +namespace { +/// \brief DWARF Common Information Entry (CIE) +class CIE : public FrameEntry { +public: + // CIEs (and FDEs) are simply container classes, so the only sensible way to + // create them is by providing the full parsed contents in the constructor. 
+  CIE(DataExtractor D, uint64_t Offset, uint64_t Length, uint8_t Version,
+      SmallString<8> Augmentation, uint64_t CodeAlignmentFactor,
+      int64_t DataAlignmentFactor, uint64_t ReturnAddressRegister)
+    : FrameEntry(FK_CIE, D, Offset, Length), Version(Version),
+      Augmentation(Augmentation), CodeAlignmentFactor(CodeAlignmentFactor),
+      DataAlignmentFactor(DataAlignmentFactor),
+      ReturnAddressRegister(ReturnAddressRegister) {}
+
+  ~CIE() {
+  }
+
+  void dumpHeader(raw_ostream &OS) const {
+    OS << format("%08x %08x %08x CIE",
+                 (uint32_t)Offset, (uint32_t)Length, DW_CIE_ID)
+       << "\n";
+    OS << format("  Version:               %d\n", Version);
+    OS << "  Augmentation:          \"" << Augmentation << "\"\n";
+    OS << format("  Code alignment factor: %u\n",
+                 (uint32_t)CodeAlignmentFactor);
+    OS << format("  Data alignment factor: %d\n",
+                 (int32_t)DataAlignmentFactor);
+    OS << format("  Return address column: %d\n",
+                 (int32_t)ReturnAddressRegister);
+    OS << "\n";
+  }
+
+  static bool classof(const FrameEntry *FE) {
+    return FE->getKind() == FK_CIE;
+  }
+
+private:
+  /// The following fields are defined in section 6.4.1 of the DWARF standard v3
+  uint8_t Version;
+  SmallString<8> Augmentation;
+  uint64_t CodeAlignmentFactor;
+  int64_t DataAlignmentFactor;
+  uint64_t ReturnAddressRegister;
+};
+
+
+/// \brief DWARF Frame Description Entry (FDE)
+class FDE : public FrameEntry {
+public:
+  // Each FDE has a CIE it's "linked to". Our FDE is constructed with
+  // an offset to the CIE (provided by parsing the FDE header). The CIE itself
+  // is obtained lazily once it's actually required.
+  FDE(DataExtractor D, uint64_t Offset, uint64_t Length,
+      int64_t LinkedCIEOffset, uint64_t InitialLocation, uint64_t AddressRange)
+    : FrameEntry(FK_FDE, D, Offset, Length), LinkedCIEOffset(LinkedCIEOffset),
+      InitialLocation(InitialLocation), AddressRange(AddressRange),
+      LinkedCIE(NULL) {}
+
+  ~FDE() {
+  }
+
+  void dumpHeader(raw_ostream &OS) const {
+    OS << format("%08x %08x %08x FDE ",
+                 (uint32_t)Offset, (uint32_t)Length, (int32_t)LinkedCIEOffset);
+    OS << format("cie=%08x pc=%08x...%08x\n",
+                 (int32_t)LinkedCIEOffset,
+                 (uint32_t)InitialLocation,
+                 (uint32_t)InitialLocation + (uint32_t)AddressRange);
+    if (LinkedCIE) {
+      OS << format("%p\n", LinkedCIE);
+    }
+  }
+
+  static bool classof(const FrameEntry *FE) {
+    return FE->getKind() == FK_FDE;
+  }
+private:
+
+  /// The following fields are defined in section 6.4.1 of the DWARF standard v3
+  uint64_t LinkedCIEOffset;
+  uint64_t InitialLocation;
+  uint64_t AddressRange;
+  CIE *LinkedCIE;
+};
+} // end anonymous namespace
+
+
+DWARFDebugFrame::DWARFDebugFrame() {
+}
+
+
+DWARFDebugFrame::~DWARFDebugFrame() {
+  for (EntryVector::iterator I = Entries.begin(), E = Entries.end();
+       I != E; ++I) {
+    delete *I;
+  }
+}
+
+
+static void LLVM_ATTRIBUTE_UNUSED dumpDataAux(DataExtractor Data,
+                                              uint32_t Offset, int Length) {
+  errs() << "DUMP: ";
+  for (int i = 0; i < Length; ++i) {
+    uint8_t c = Data.getU8(&Offset);
+    errs().write_hex(c); errs() << " ";
+  }
+  errs() << "\n";
+}
+
+
+void DWARFDebugFrame::parse(DataExtractor Data) {
+  uint32_t Offset = 0;
+
+  while (Data.isValidOffset(Offset)) {
+    uint32_t StartOffset = Offset;
+
+    bool IsDWARF64 = false;
+    uint64_t Length = Data.getU32(&Offset);
+    uint64_t Id;
+
+    if (Length == UINT32_MAX) {
+      // DWARF-64 is distinguished by the first 32 bits of the initial length
+      // field being 0xffffffff. Then, the next 64 bits are the actual entry
+      // length.
+      IsDWARF64 = true;
+      Length = Data.getU64(&Offset);
+    }
+
+    // At this point, Offset points to the next field after Length.
+    // Length is the structure size excluding itself. Compute an offset one
+    // past the end of the structure (needed to know how many instructions to
+    // read).
+    // TODO: For honest DWARF64 support, DataExtractor will have to treat
+    //       offset_ptr as uint64_t*
+    uint32_t EndStructureOffset = Offset + static_cast<uint32_t>(Length);
+
+    // The Id field's size depends on the DWARF format
+    Id = Data.getUnsigned(&Offset, IsDWARF64 ? 8 : 4);
+    bool IsCIE = ((IsDWARF64 && Id == DW64_CIE_ID) || Id == DW_CIE_ID);
+
+    FrameEntry *Entry = 0;
+    if (IsCIE) {
+      // Note: this is specifically DWARFv3 CIE header structure. It was
+      // changed in DWARFv4. We currently don't support reading DWARFv4
+      // here because LLVM itself does not emit it (and LLDB doesn't
+      // support it either).
+      uint8_t Version = Data.getU8(&Offset);
+      const char *Augmentation = Data.getCStr(&Offset);
+      uint64_t CodeAlignmentFactor = Data.getULEB128(&Offset);
+      int64_t DataAlignmentFactor = Data.getSLEB128(&Offset);
+      uint64_t ReturnAddressRegister = Data.getULEB128(&Offset);
+
+      Entry = new CIE(Data, StartOffset, Length, Version,
+                      StringRef(Augmentation), CodeAlignmentFactor,
+                      DataAlignmentFactor, ReturnAddressRegister);
+    } else {
+      // FDE
+      uint64_t CIEPointer = Id;
+      uint64_t InitialLocation = Data.getAddress(&Offset);
+      uint64_t AddressRange = Data.getAddress(&Offset);
+
+      Entry = new FDE(Data, StartOffset, Length, CIEPointer,
+                      InitialLocation, AddressRange);
+    }
+
+    assert(Entry && "Expected Entry to be populated with CIE or FDE");
+    Entry->parseInstructions(&Offset, EndStructureOffset);
+
+    if (Offset == EndStructureOffset) {
+      // Entry instructions parsed successfully.
+      Entries.push_back(Entry);
+    } else {
+      std::string Str;
+      raw_string_ostream OS(Str);
+      OS << format("Parsing entry instructions at %lx failed",
+                   Entry->getOffset());
+      report_fatal_error(Str);
+    }
+  }
+}
+
+
+void DWARFDebugFrame::dump(raw_ostream &OS) const {
+  OS << "\n";
+  for (EntryVector::const_iterator I = Entries.begin(), E = Entries.end();
+       I != E; ++I) {
+    FrameEntry *Entry = *I;
+    Entry->dumpHeader(OS);
+    Entry->dumpInstructions(OS);
+    OS << "\n";
+  }
+}
+
diff --git a/lib/DebugInfo/DWARFDebugFrame.h b/lib/DebugInfo/DWARFDebugFrame.h
new file mode 100644
index 0000000..48b8d63
--- /dev/null
+++ b/lib/DebugInfo/DWARFDebugFrame.h
@@ -0,0 +1,46 @@
+//===-- DWARFDebugFrame.h - Parsing of .debug_frame -------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_DWARFDEBUGFRAME_H
+#define LLVM_DEBUGINFO_DWARFDEBUGFRAME_H
+
+#include "llvm/Support/DataExtractor.h"
+#include "llvm/Support/raw_ostream.h"
+#include <vector>
+
+
+namespace llvm {
+
+class FrameEntry;
+
+
+/// \brief A parsed .debug_frame section
+///
+class DWARFDebugFrame {
+public:
+  DWARFDebugFrame();
+  ~DWARFDebugFrame();
+
+  /// \brief Dump the section data into the given stream.
+  void dump(raw_ostream &OS) const;
+
+  /// \brief Parse the section from raw data.
+  /// data is assumed to be pointing to the beginning of the section.
+  void parse(DataExtractor Data);
+
+private:
+  typedef std::vector<FrameEntry *> EntryVector;
+  EntryVector Entries;
+};
+
+
+} // namespace llvm
+
+#endif
+
diff --git a/lib/DebugInfo/DWARFDebugInfoEntry.cpp b/lib/DebugInfo/DWARFDebugInfoEntry.cpp
index bb11850..02b15d6 100644
--- a/lib/DebugInfo/DWARFDebugInfoEntry.cpp
+++ b/lib/DebugInfo/DWARFDebugInfoEntry.cpp
@@ -203,11 +203,9 @@ bool DWARFDebugInfoEntryMinimal::extractFast(const DWARFCompileUnit *cu,
         form = debug_info_data.getULEB128(&offset);
         break;

+      // FIXME: 64-bit for DWARF64
       case DW_FORM_sec_offset:
-        if (cu->getAddressByteSize() == 4)
-          debug_info_data.getU32(offset_ptr);
-        else
-          debug_info_data.getU64(offset_ptr);
+        debug_info_data.getU32(offset_ptr);
         break;

       default:
@@ -344,11 +342,9 @@ DWARFDebugInfoEntryMinimal::extract(const DWARFCompileUnit *cu,
           form_is_indirect = true;
           break;

+        // FIXME: 64-bit for DWARF64.
         case DW_FORM_sec_offset:
-          if (cu->getAddressByteSize() == 4)
-            debug_info_data.getU32(offset_ptr);
-          else
-            debug_info_data.getU64(offset_ptr);
+          debug_info_data.getU32(offset_ptr);
           break;

         default:
diff --git a/lib/DebugInfo/DWARFDebugLine.cpp b/lib/DebugInfo/DWARFDebugLine.cpp
index 267364a..192381c 100644
--- a/lib/DebugInfo/DWARFDebugLine.cpp
+++ b/lib/DebugInfo/DWARFDebugLine.cpp
@@ -155,7 +155,7 @@ DWARFDebugLine::getOrParseLineTable(DataExtractor debug_line_data,
   if (pos.second) {
     // Parse and cache the line table at this offset.
     State state;
-    if (!parseStatementTable(debug_line_data, &offset, state))
+    if (!parseStatementTable(debug_line_data, RelocMap, &offset, state))
       return 0;
     pos.first->second = state;
   }
@@ -219,7 +219,8 @@ DWARFDebugLine::parsePrologue(DataExtractor debug_line_data,
}

bool
-DWARFDebugLine::parseStatementTable(DataExtractor debug_line_data,
+DWARFDebugLine::parseStatementTable(DataExtractor debug_line_data,
+                                    const RelocAddrMap *RMap,
                                     uint32_t *offset_ptr, State &state) {
   const uint32_t debug_line_offset = *offset_ptr;
@@ -268,7 +269,15 @@ DWARFDebugLine::parseStatementTable(DataExtractor debug_line_data,
         // relocatable address. All of the other statement program opcodes
         // that affect the address register add a delta to it. This instruction
         // stores a relocatable value into it instead.
-        state.Address = debug_line_data.getAddress(offset_ptr);
+        {
+          // If this address is in our relocation map, apply the relocation.
+          RelocAddrMap::const_iterator AI = RMap->find(*offset_ptr);
+          if (AI != RMap->end()) {
+            const std::pair<uint8_t, int64_t> &R = AI->second;
+            state.Address = debug_line_data.getAddress(offset_ptr) + R.second;
+          } else
+            state.Address = debug_line_data.getAddress(offset_ptr);
+        }
         break;

       case DW_LNE_define_file:
@@ -516,6 +525,83 @@ DWARFDebugLine::LineTable::lookupAddress(uint64_t address) const {
}

bool
+DWARFDebugLine::LineTable::lookupAddressRange(uint64_t address,
+                                              uint64_t size,
+                                              std::vector<uint32_t>& result) const {
+  if (Sequences.empty())
+    return false;
+  uint64_t end_addr = address + size;
+  // First, find an instruction sequence containing the given address.
+ DWARFDebugLine::Sequence sequence; + sequence.LowPC = address; + SequenceIter first_seq = Sequences.begin(); + SequenceIter last_seq = Sequences.end(); + SequenceIter seq_pos = std::lower_bound(first_seq, last_seq, sequence, + DWARFDebugLine::Sequence::orderByLowPC); + if (seq_pos == last_seq || seq_pos->LowPC != address) { + if (seq_pos == first_seq) + return false; + seq_pos--; + } + if (!seq_pos->containsPC(address)) + return false; + + SequenceIter start_pos = seq_pos; + + // Add the rows from the first sequence to the vector, starting with the + // index we just calculated + + while (seq_pos != last_seq && seq_pos->LowPC < end_addr) { + DWARFDebugLine::Sequence cur_seq = *seq_pos; + uint32_t first_row_index; + uint32_t last_row_index; + if (seq_pos == start_pos) { + // For the first sequence, we need to find which row in the sequence is the + // first in our range. Rows are stored in a vector, so we may use + // arithmetical operations with iterators. + DWARFDebugLine::Row row; + row.Address = address; + RowIter first_row = Rows.begin() + cur_seq.FirstRowIndex; + RowIter last_row = Rows.begin() + cur_seq.LastRowIndex; + RowIter row_pos = std::upper_bound(first_row, last_row, row, + DWARFDebugLine::Row::orderByAddress); + // The 'row_pos' iterator references the first row that is greater than + // our start address. Unless that's the first row, we want to start at + // the row before that. + first_row_index = cur_seq.FirstRowIndex + (row_pos - first_row); + if (row_pos != first_row) + --first_row_index; + } else + first_row_index = cur_seq.FirstRowIndex; + + // For the last sequence in our range, we need to figure out the last row in + // range. For all other sequences we can go to the end of the sequence. + if (cur_seq.HighPC > end_addr) { + DWARFDebugLine::Row row; + row.Address = end_addr; + RowIter first_row = Rows.begin() + cur_seq.FirstRowIndex; + RowIter last_row = Rows.begin() + cur_seq.LastRowIndex; + RowIter row_pos = std::upper_bound(first_row, last_row, row, + DWARFDebugLine::Row::orderByAddress); + // The 'row_pos' iterator references the first row that is greater than + // our end address. The row before that is the last row we want. + last_row_index = cur_seq.FirstRowIndex + (row_pos - first_row) - 1; + } else + // Contrary to what you might expect, DWARFDebugLine::SequenceLastRowIndex + // isn't a valid index within the current sequence. It's that plus one. + last_row_index = cur_seq.LastRowIndex - 1; + + for (uint32_t i = first_row_index; i <= last_row_index; ++i) { + result.push_back(i); + } + + ++seq_pos; + } + + return true; +} + +bool DWARFDebugLine::LineTable::getFileNameByIndex(uint64_t FileIndex, bool NeedsAbsoluteFilePath, std::string &Result) const { diff --git a/lib/DebugInfo/DWARFDebugLine.h b/lib/DebugInfo/DWARFDebugLine.h index 586dd7e..2990756 100644 --- a/lib/DebugInfo/DWARFDebugLine.h +++ b/lib/DebugInfo/DWARFDebugLine.h @@ -10,6 +10,7 @@ #ifndef LLVM_DEBUGINFO_DWARFDEBUGLINE_H #define LLVM_DEBUGINFO_DWARFDEBUGLINE_H +#include "DWARFRelocMap.h" #include "llvm/Support/DataExtractor.h" #include <map> #include <string> @@ -21,6 +22,7 @@ class raw_ostream; class DWARFDebugLine { public: + DWARFDebugLine(const RelocAddrMap* LineInfoRelocMap) : RelocMap(LineInfoRelocMap) {} struct FileNameEntry { FileNameEntry() : Name(0), DirIdx(0), ModTime(0), Length(0) {} @@ -176,6 +178,10 @@ public: // or -1 if there is no such row. 
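
From the caller's side, the lookupAddressRange() implementation above fills a vector of indices into Rows rather than returning rows directly. A hedged usage sketch, assuming FuncAddr and FuncSize describe a function and LineTable came from getLineTableForCompileUnit():

    std::vector<uint32_t> RowVector;
    if (LineTable->lookupAddressRange(FuncAddr, FuncSize, RowVector)) {
      for (size_t i = 0, e = RowVector.size(); i != e; ++i) {
        const DWARFDebugLine::Row &Row = LineTable->Rows[RowVector[i]];
        // Row.Address, Row.File, Row.Line and Row.Column describe one covered
        // address, exactly as getLineInfoForAddressRange() consumes them above.
      }
    }
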
uint32_t lookupAddress(uint64_t address) const; + bool lookupAddressRange(uint64_t address, + uint64_t size, + std::vector<uint32_t>& result) const; + // Extracts filename by its index in filename table in prologue. // Returns true on success. bool getFileNameByIndex(uint64_t FileIndex, @@ -227,6 +233,7 @@ public: Prologue *prologue); /// Parse a single line table (prologue and all rows). static bool parseStatementTable(DataExtractor debug_line_data, + const RelocAddrMap *RMap, uint32_t *offset_ptr, State &state); const LineTable *getLineTable(uint32_t offset) const; @@ -238,6 +245,7 @@ private: typedef LineTableMapTy::iterator LineTableIter; typedef LineTableMapTy::const_iterator LineTableConstIter; + const RelocAddrMap *RelocMap; LineTableMapTy LineTableMap; }; diff --git a/lib/DebugInfo/DWARFFormValue.cpp b/lib/DebugInfo/DWARFFormValue.cpp index 14c6804..9f807aa 100644 --- a/lib/DebugInfo/DWARFFormValue.cpp +++ b/lib/DebugInfo/DWARFFormValue.cpp @@ -72,7 +72,7 @@ static const uint8_t form_sizes_addr8[] = { 8, // 0x14 DW_FORM_ref8 0, // 0x15 DW_FORM_ref_udata 0, // 0x16 DW_FORM_indirect - 8, // 0x17 DW_FORM_sec_offset + 4, // 0x17 DW_FORM_sec_offset 0, // 0x18 DW_FORM_exprloc 0, // 0x19 DW_FORM_flag_present 8, // 0x20 DW_FORM_ref_sig8 @@ -173,10 +173,8 @@ DWARFFormValue::extractValue(DataExtractor data, uint32_t *offset_ptr, indirect = true; break; case DW_FORM_sec_offset: - if (cu->getAddressByteSize() == 4) - Value.uval = data.getU32(offset_ptr); - else - Value.uval = data.getU64(offset_ptr); + // FIXME: This is 64-bit for DWARF64. + Value.uval = data.getU32(offset_ptr); break; case DW_FORM_flag_present: Value.uval = 1; @@ -301,12 +299,9 @@ DWARFFormValue::skipValue(uint16_t form, DataExtractor debug_info_data, form = debug_info_data.getULEB128(offset_ptr); break; - // 4 for DWARF32, 8 for DWARF64. + // FIXME: 4 for DWARF32, 8 for DWARF64. case DW_FORM_sec_offset: - if (cu->getAddressByteSize() == 4) - *offset_ptr += 4; - else - *offset_ptr += 8; + *offset_ptr += 4; return true; default: @@ -325,6 +320,16 @@ DWARFFormValue::dump(raw_ostream &OS, const DWARFCompileUnit *cu) const { switch (Form) { case DW_FORM_addr: OS << format("0x%016" PRIx64, uvalue); break; + case DW_FORM_GNU_addr_index: { + StringRef AddrOffsetSec = cu->getAddrOffsetSection(); + OS << format(" indexed (%8.8x) address = ", (uint32_t)uvalue); + if (AddrOffsetSec.size() != 0) { + DataExtractor DA(AddrOffsetSec, true, cu->getAddressByteSize()); + OS << format("0x%016" PRIx64, getIndirectAddress(&DA, cu)); + } else + OS << "<no .debug_addr section>"; + break; + } case DW_FORM_flag_present: OS << "true"; break; case DW_FORM_flag: case DW_FORM_data1: OS << format("0x%02x", (uint8_t)uvalue); break; @@ -419,11 +424,9 @@ DWARFFormValue::dump(raw_ostream &OS, const DWARFCompileUnit *cu) const { OS << "DW_FORM_indirect"; break; + // Should be formatted to 64-bit for DWARF64. 
case DW_FORM_sec_offset: - if (cu->getAddressByteSize() == 4) - OS << format("0x%08x", (uint32_t)uvalue); - else - OS << format("0x%016" PRIx64, uvalue); + OS << format("0x%08x", (uint32_t)uvalue); break; default: @@ -452,10 +455,19 @@ DWARFFormValue::getIndirectCString(const DataExtractor *DS, if (!DS || !DSO) return NULL; uint32_t offset = Value.uval * 4; - uint32_t soffset = DSO->getULEB128(&offset); + uint32_t soffset = DSO->getU32(&offset); return DS->getCStr(&soffset); } +uint64_t +DWARFFormValue::getIndirectAddress(const DataExtractor *DA, + const DWARFCompileUnit *cu) const { + if (!DA) return 0; + + uint32_t offset = Value.uval * cu->getAddressByteSize(); + return DA->getAddress(&offset); +} + uint64_t DWARFFormValue::getReference(const DWARFCompileUnit *cu) const { uint64_t die_offset = Value.uval; switch (Form) { diff --git a/lib/DebugInfo/DWARFFormValue.h b/lib/DebugInfo/DWARFFormValue.h index 7768c18..b863001 100644 --- a/lib/DebugInfo/DWARFFormValue.h +++ b/lib/DebugInfo/DWARFFormValue.h @@ -66,6 +66,8 @@ public: const char *getAsCString(const DataExtractor *debug_str_data_ptr) const; const char *getIndirectCString(const DataExtractor *, const DataExtractor *) const; + uint64_t getIndirectAddress(const DataExtractor *, + const DWARFCompileUnit *) const; bool skipValue(DataExtractor debug_info_data, uint32_t *offset_ptr, const DWARFCompileUnit *cu) const; static bool skipValue(uint16_t form, DataExtractor debug_info_data, diff --git a/lib/DebugInfo/DWARFRelocMap.h b/lib/DebugInfo/DWARFRelocMap.h new file mode 100644 index 0000000..6929e36 --- /dev/null +++ b/lib/DebugInfo/DWARFRelocMap.h @@ -0,0 +1,22 @@ +//===-- DWARFRelocMap.h -----------------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_DEBUGINFO_DWARFRELOCMAP_H +#define LLVM_DEBUGINFO_DWARFRELOCMAP_H + +#include "llvm/ADT/DenseMap.h" + +namespace llvm { + +typedef DenseMap<uint64_t, std::pair<uint8_t, int64_t> > RelocAddrMap; + +} // namespace llvm + +#endif // LLVM_DEBUGINFO_DWARFRELOCMAP_H + diff --git a/lib/ExecutionEngine/ExecutionEngine.cpp b/lib/ExecutionEngine/ExecutionEngine.cpp index ef5f589..3d59d25 100644 --- a/lib/ExecutionEngine/ExecutionEngine.cpp +++ b/lib/ExecutionEngine/ExecutionEngine.cpp @@ -632,7 +632,7 @@ GenericValue ExecutionEngine::getConstantValue(const Constant *C) { else if (Op0->getType()->isDoubleTy()) GV.IntVal = APIntOps::RoundDoubleToAPInt(GV.DoubleVal, BitWidth); else if (Op0->getType()->isX86_FP80Ty()) { - APFloat apf = APFloat(GV.IntVal); + APFloat apf = APFloat(APFloat::x87DoubleExtended, GV.IntVal); uint64_t v; bool ignored; (void)apf.convertToInteger(&v, BitWidth, @@ -751,27 +751,32 @@ GenericValue ExecutionEngine::getConstantValue(const Constant *C) { case Type::X86_FP80TyID: case Type::PPC_FP128TyID: case Type::FP128TyID: { - APFloat apfLHS = APFloat(LHS.IntVal); + const fltSemantics &Sem = CE->getOperand(0)->getType()->getFltSemantics(); + APFloat apfLHS = APFloat(Sem, LHS.IntVal); switch (CE->getOpcode()) { default: llvm_unreachable("Invalid long double opcode"); case Instruction::FAdd: - apfLHS.add(APFloat(RHS.IntVal), APFloat::rmNearestTiesToEven); + apfLHS.add(APFloat(Sem, RHS.IntVal), APFloat::rmNearestTiesToEven); GV.IntVal = apfLHS.bitcastToAPInt(); break; case Instruction::FSub: - apfLHS.subtract(APFloat(RHS.IntVal), APFloat::rmNearestTiesToEven); + apfLHS.subtract(APFloat(Sem, RHS.IntVal), + APFloat::rmNearestTiesToEven); GV.IntVal = apfLHS.bitcastToAPInt(); break; case Instruction::FMul: - apfLHS.multiply(APFloat(RHS.IntVal), APFloat::rmNearestTiesToEven); + apfLHS.multiply(APFloat(Sem, RHS.IntVal), + APFloat::rmNearestTiesToEven); GV.IntVal = apfLHS.bitcastToAPInt(); break; case Instruction::FDiv: - apfLHS.divide(APFloat(RHS.IntVal), APFloat::rmNearestTiesToEven); + apfLHS.divide(APFloat(Sem, RHS.IntVal), + APFloat::rmNearestTiesToEven); GV.IntVal = apfLHS.bitcastToAPInt(); break; case Instruction::FRem: - apfLHS.mod(APFloat(RHS.IntVal), APFloat::rmNearestTiesToEven); + apfLHS.mod(APFloat(Sem, RHS.IntVal), + APFloat::rmNearestTiesToEven); GV.IntVal = apfLHS.bitcastToAPInt(); break; } @@ -893,7 +898,8 @@ void ExecutionEngine::StoreValueToMemory(const GenericValue &Val, /// from Src into IntVal, which is assumed to be wide enough and to hold zero. static void LoadIntFromMemory(APInt &IntVal, uint8_t *Src, unsigned LoadBytes) { assert((IntVal.getBitWidth()+7)/8 >= LoadBytes && "Integer too small!"); - uint8_t *Dst = (uint8_t *)IntVal.getRawData(); + uint8_t *Dst = reinterpret_cast<uint8_t *>( + const_cast<uint64_t *>(IntVal.getRawData())); if (sys::isLittleEndianHost()) // Little-endian host - the destination must be ordered from LSB to MSB. 
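
The ExecutionEngine and JIT hunks in this area all make the same mechanical change: APFloat construction from an APInt now names the fltSemantics explicitly, because a bit pattern alone is ambiguous (IEEE quad and PPC double-double are both 128 bits wide, for instance). A minimal sketch of the new pattern; Ty, Bits and RHSBits are assumed inputs:

    // Rebuild two long-double values from raw bits and add them. Before this
    // patch, the one-argument APFloat(Bits) constructor had to guess the format.
    static APFloat addWithSemantics(Type *Ty, const APInt &Bits,
                                    const APInt &RHSBits) {
      const fltSemantics &Sem = Ty->getFltSemantics();
      APFloat LHS(Sem, Bits);
      LHS.add(APFloat(Sem, RHSBits), APFloat::rmNearestTiesToEven);
      return LHS;
    }
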
diff --git a/lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp b/lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp index 0f99596..7dc295f 100644 --- a/lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp +++ b/lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp @@ -22,7 +22,9 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/OwningPtr.h" #include "llvm/CodeGen/MachineFunction.h" +#include "llvm/DebugInfo/DIContext.h" #include "llvm/ExecutionEngine/ObjectImage.h" +#include "llvm/Object/ObjectFile.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Support/Errno.h" @@ -78,6 +80,17 @@ static LineNumberInfo LineStartToIntelJITFormat( return Result; } +static LineNumberInfo DILineInfoToIntelJITFormat(uintptr_t StartAddress, + uintptr_t Address, + DILineInfo Line) { + LineNumberInfo Result; + + Result.Offset = Address - StartAddress; + Result.LineNumber = Line.getLine(); + + return Result; +} + static iJIT_Method_Load FunctionDescToIntelJITFormat( IntelJITEventsWrapper& Wrapper, const char* FnName, @@ -177,6 +190,7 @@ void IntelJITEventListener::NotifyFreeingMachineCode(void *FnStart) { void IntelJITEventListener::NotifyObjectEmitted(const ObjectImage &Obj) { // Get the address of the object image for use as a unique identifier const void* ObjData = Obj.getData().data(); + DIContext* Context = DIContext::getDWARFContext(Obj.getObjectFile()); MethodAddressVector Functions; // Use symbol info to iterate functions in the object. @@ -185,12 +199,15 @@ void IntelJITEventListener::NotifyObjectEmitted(const ObjectImage &Obj) { E = Obj.end_symbols(); I != E && !ec; I.increment(ec)) { + std::vector<LineNumberInfo> LineInfo; + std::string SourceFileName; + object::SymbolRef::Type SymType; if (I->getType(SymType)) continue; if (SymType == object::SymbolRef::ST_Function) { - StringRef Name; - uint64_t Addr; - uint64_t Size; + StringRef Name; + uint64_t Addr; + uint64_t Size; if (I->getName(Name)) continue; if (I->getAddress(Addr)) continue; if (I->getSize(Size)) continue; @@ -203,11 +220,30 @@ void IntelJITEventListener::NotifyObjectEmitted(const ObjectImage &Obj) { Name.data(), Addr, Size); - - // FIXME: Try to find line info for this function in the DWARF sections. 
- FunctionMessage.source_file_name = 0; - FunctionMessage.line_number_size = 0; - FunctionMessage.line_number_table = 0; + if (Context) { + DILineInfoTable Lines = Context->getLineInfoForAddressRange(Addr, Size); + DILineInfoTable::iterator Begin = Lines.begin(); + DILineInfoTable::iterator End = Lines.end(); + for (DILineInfoTable::iterator It = Begin; It != End; ++It) { + LineInfo.push_back(DILineInfoToIntelJITFormat((uintptr_t)Addr, + It->first, + It->second)); + } + if (LineInfo.size() == 0) { + FunctionMessage.source_file_name = 0; + FunctionMessage.line_number_size = 0; + FunctionMessage.line_number_table = 0; + } else { + SourceFileName = Lines.front().second.getFileName(); + FunctionMessage.source_file_name = (char *)SourceFileName.c_str(); + FunctionMessage.line_number_size = LineInfo.size(); + FunctionMessage.line_number_table = &*LineInfo.begin(); + } + } else { + FunctionMessage.source_file_name = 0; + FunctionMessage.line_number_size = 0; + FunctionMessage.line_number_table = 0; + } Wrapper->iJIT_NotifyEvent(iJVM_EVENT_TYPE_METHOD_LOAD_FINISHED, &FunctionMessage); diff --git a/lib/ExecutionEngine/Interpreter/Execution.cpp b/lib/ExecutionEngine/Interpreter/Execution.cpp index 431744a..ec4f7f6 100644 --- a/lib/ExecutionEngine/Interpreter/Execution.cpp +++ b/lib/ExecutionEngine/Interpreter/Execution.cpp @@ -1169,10 +1169,12 @@ void Interpreter::visitVAArgInst(VAArgInst &I) { .VarArgs[VAList.UIntPairVal.second]; Type *Ty = I.getType(); switch (Ty->getTypeID()) { - case Type::IntegerTyID: Dest.IntVal = Src.IntVal; - IMPLEMENT_VAARG(Pointer); - IMPLEMENT_VAARG(Float); - IMPLEMENT_VAARG(Double); + case Type::IntegerTyID: + Dest.IntVal = Src.IntVal; + break; + IMPLEMENT_VAARG(Pointer); + IMPLEMENT_VAARG(Float); + IMPLEMENT_VAARG(Double); default: dbgs() << "Unhandled dest type for vaarg instruction: " << *Ty << "\n"; llvm_unreachable(0); diff --git a/lib/ExecutionEngine/JIT/JIT.cpp b/lib/ExecutionEngine/JIT/JIT.cpp index 103c0c0..53ea0a2 100644 --- a/lib/ExecutionEngine/JIT/JIT.cpp +++ b/lib/ExecutionEngine/JIT/JIT.cpp @@ -522,7 +522,8 @@ GenericValue JIT::runFunction(Function *F, case Type::PPC_FP128TyID: case Type::X86_FP80TyID: case Type::FP128TyID: - C = ConstantFP::get(F->getContext(), APFloat(AV.IntVal)); + C = ConstantFP::get(F->getContext(), APFloat(ArgTy->getFltSemantics(), + AV.IntVal)); break; case Type::PointerTyID: void *ArgPtr = GVTOP(AV); diff --git a/lib/ExecutionEngine/JIT/JITEmitter.cpp b/lib/ExecutionEngine/JIT/JITEmitter.cpp index 6fd4df4..c273876 100644 --- a/lib/ExecutionEngine/JIT/JITEmitter.cpp +++ b/lib/ExecutionEngine/JIT/JITEmitter.cpp @@ -969,14 +969,24 @@ bool JITEmitter::finishFunction(MachineFunction &F) { SavedBufferBegin = BufferBegin; SavedBufferEnd = BufferEnd; SavedCurBufferPtr = CurBufferPtr; - - BufferBegin = CurBufferPtr = MemMgr->startExceptionTable(F.getFunction(), - ActualSize); - BufferEnd = BufferBegin+ActualSize; - EmittedFunctions[F.getFunction()].ExceptionTable = BufferBegin; - uint8_t *EhStart; - uint8_t *FrameRegister = DE->EmitDwarfTable(F, *this, FnStart, FnEnd, - EhStart); + uint8_t *FrameRegister; + + while (true) { + BufferBegin = CurBufferPtr = MemMgr->startExceptionTable(F.getFunction(), + ActualSize); + BufferEnd = BufferBegin+ActualSize; + EmittedFunctions[F.getFunction()].ExceptionTable = BufferBegin; + uint8_t *EhStart; + FrameRegister = DE->EmitDwarfTable(F, *this, FnStart, FnEnd, EhStart); + + // If the buffer was large enough to hold the table then we are done. 
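The loop being introduced in JITEmitter here (its exit check follows just below) retries DWARF table emission whenever the memory manager's buffer fills: the partial table is discarded and re-emitted into twice the space. The same grow-and-retry idiom in isolation, as a self-contained sketch; tryEmit is a hypothetical stand-in for EmitDwarfTable:

#include <cstdlib>
#include <stdint.h>

// tryEmit writes into [Begin, End) and returns false when it ran out
// of room -- the analogue of CurBufferPtr reaching BufferEnd here.
static uint8_t *emitWithRetry(bool (*tryEmit)(uint8_t *, uint8_t *),
                              std::size_t Size) {
  for (;;) {
    uint8_t *Buf = static_cast<uint8_t *>(std::malloc(Size));
    if (!Buf)
      return 0;          // allocation failure: give up
    if (tryEmit(Buf, Buf + Size))
      return Buf;        // the buffer was large enough; done
    std::free(Buf);      // mirrors deallocateExceptionTable()
    Size *= 2;           // retry with twice as much space
  }
}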
+ if (CurBufferPtr != BufferEnd) + break; + + // Try again with twice as much space. + ActualSize = (CurBufferPtr - BufferBegin) * 2; + MemMgr->deallocateExceptionTable(BufferBegin); + } MemMgr->endExceptionTable(F.getFunction(), BufferBegin, CurBufferPtr, FrameRegister); BufferBegin = SavedBufferBegin; diff --git a/lib/ExecutionEngine/JIT/JITMemoryManager.cpp b/lib/ExecutionEngine/JIT/JITMemoryManager.cpp index 353bebf..66aeb77 100644 --- a/lib/ExecutionEngine/JIT/JITMemoryManager.cpp +++ b/lib/ExecutionEngine/JIT/JITMemoryManager.cpp @@ -72,15 +72,20 @@ namespace { /// getBlockAfter - Return the memory block immediately after this one. /// MemoryRangeHeader &getBlockAfter() const { - return *(MemoryRangeHeader*)((char*)this+BlockSize); + return *reinterpret_cast<MemoryRangeHeader *>( + reinterpret_cast<char*>( + const_cast<MemoryRangeHeader *>(this))+BlockSize); } /// getFreeBlockBefore - If the block before this one is free, return it, /// otherwise return null. FreeRangeHeader *getFreeBlockBefore() const { if (PrevAllocated) return 0; - intptr_t PrevSize = ((intptr_t *)this)[-1]; - return (FreeRangeHeader*)((char*)this-PrevSize); + intptr_t PrevSize = reinterpret_cast<intptr_t *>( + const_cast<MemoryRangeHeader *>(this))[-1]; + return reinterpret_cast<FreeRangeHeader *>( + reinterpret_cast<char*>( + const_cast<MemoryRangeHeader *>(this))-PrevSize); } /// FreeBlock - Turn an allocated block into a free block, adjusting diff --git a/lib/ExecutionEngine/OProfileJIT/OProfileWrapper.cpp b/lib/ExecutionEngine/OProfileJIT/OProfileWrapper.cpp index d67f537..7c0d395 100644 --- a/lib/ExecutionEngine/OProfileJIT/OProfileWrapper.cpp +++ b/lib/ExecutionEngine/OProfileJIT/OProfileWrapper.cpp @@ -29,6 +29,7 @@ #include <dirent.h> #include <sys/stat.h> #include <fcntl.h> +#include <unistd.h> namespace { diff --git a/lib/ExecutionEngine/RuntimeDyld/ObjectImageCommon.h b/lib/ExecutionEngine/RuntimeDyld/ObjectImageCommon.h index 28b4f0f..89350cc 100644 --- a/lib/ExecutionEngine/RuntimeDyld/ObjectImageCommon.h +++ b/lib/ExecutionEngine/RuntimeDyld/ObjectImageCommon.h @@ -58,6 +58,8 @@ public: virtual StringRef getData() const { return ObjFile->getData(); } + virtual object::ObjectFile* getObjectFile() const { return ObjFile; } + // Subclasses can override these methods to update the image with loaded // addresses for sections and common symbols virtual void updateSectionAddress(const object::SectionRef &Sec, diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp index c5b807b..409b25f 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp @@ -432,14 +432,20 @@ void RuntimeDyldImpl::resolveExternalSymbols() { RelocationList &Relocs = i->second; SymbolTableMap::const_iterator Loc = GlobalSymbolTable.find(Name); if (Loc == GlobalSymbolTable.end()) { - // This is an external symbol, try to get it address from - // MemoryManager. - uint8_t *Addr = (uint8_t*) MemMgr->getPointerToNamedFunction(Name.data(), + if (Name.size() == 0) { + // This is an absolute symbol, use an address of zero. + DEBUG(dbgs() << "Resolving absolute relocations." << "\n"); + resolveRelocationList(Relocs, 0); + } else { + // This is an external symbol, try to get its address from + // MemoryManager. 
+ uint8_t *Addr = (uint8_t*) MemMgr->getPointerToNamedFunction(Name.data(), true); - DEBUG(dbgs() << "Resolving relocations Name: " << Name - << "\t" << format("%p", Addr) - << "\n"); - resolveRelocationList(Relocs, (uintptr_t)Addr); + DEBUG(dbgs() << "Resolving relocations Name: " << Name + << "\t" << format("%p", Addr) + << "\n"); + resolveRelocationList(Relocs, (uintptr_t)Addr); + } } else { report_fatal_error("Expected external symbol"); } diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp index 0a68f4e..b8537b1 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp @@ -28,8 +28,6 @@ using namespace llvm; using namespace llvm::object; -using support::endianness; - namespace { static inline @@ -40,22 +38,22 @@ error_code check(error_code Err) { return Err; } -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> +template<class ELFT> class DyldELFObject - : public ELFObjectFile<target_endianness, max_alignment, is64Bits> { - LLVM_ELF_IMPORT_TYPES(target_endianness, max_alignment, is64Bits) + : public ELFObjectFile<ELFT> { + LLVM_ELF_IMPORT_TYPES(ELFT) - typedef Elf_Shdr_Impl<target_endianness, max_alignment, is64Bits> Elf_Shdr; - typedef Elf_Sym_Impl<target_endianness, max_alignment, is64Bits> Elf_Sym; + typedef Elf_Shdr_Impl<ELFT> Elf_Shdr; + typedef Elf_Sym_Impl<ELFT> Elf_Sym; typedef - Elf_Rel_Impl<target_endianness, max_alignment, is64Bits, false> Elf_Rel; + Elf_Rel_Impl<ELFT, false> Elf_Rel; typedef - Elf_Rel_Impl<target_endianness, max_alignment, is64Bits, true> Elf_Rela; + Elf_Rel_Impl<ELFT, true> Elf_Rela; - typedef Elf_Ehdr_Impl<target_endianness, max_alignment, is64Bits> Elf_Ehdr; + typedef Elf_Ehdr_Impl<ELFT> Elf_Ehdr; typedef typename ELFDataTypeTypedefHelper< - target_endianness, max_alignment, is64Bits>::value_type addr_type; + ELFT>::value_type addr_type; public: DyldELFObject(MemoryBuffer *Wrapper, error_code &ec); @@ -65,25 +63,25 @@ public: // Methods for type inquiry through isa, cast and dyn_cast static inline bool classof(const Binary *v) { - return (isa<ELFObjectFile<target_endianness, max_alignment, is64Bits> >(v) + return (isa<ELFObjectFile<ELFT> >(v) && classof(cast<ELFObjectFile - <target_endianness, max_alignment, is64Bits> >(v))); + <ELFT> >(v))); } static inline bool classof( - const ELFObjectFile<target_endianness, max_alignment, is64Bits> *v) { + const ELFObjectFile<ELFT> *v) { return v->isDyldType(); } }; -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> +template<class ELFT> class ELFObjectImage : public ObjectImageCommon { protected: - DyldELFObject<target_endianness, max_alignment, is64Bits> *DyldObj; + DyldELFObject<ELFT> *DyldObj; bool Registered; public: ELFObjectImage(ObjectBuffer *Input, - DyldELFObject<target_endianness, max_alignment, is64Bits> *Obj) + DyldELFObject<ELFT> *Obj) : ObjectImageCommon(Input, Obj), DyldObj(Obj), Registered(false) {} @@ -119,16 +117,15 @@ class ELFObjectImage : public ObjectImageCommon { // The MemoryBuffer passed into this constructor is just a wrapper around the // actual memory. Ultimately, the Binary parent class will take ownership of // this MemoryBuffer object but not the underlying memory. 
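The refactoring running through RuntimeDyldELF replaces the repeated <endianness, max_alignment, is64Bits> parameter triple with a single ELFT tag type, as the constructor definitions just below show. The underlying C++ technique, reduced to a standalone sketch with hypothetical Mini* names:

#include <cstddef>
#include <stdint.h>

// WordType picks a machine word width from a boolean flag.
template <bool Is64> struct WordType       { typedef uint32_t type; };
template <>          struct WordType<true> { typedef uint64_t type; };

// MiniELFType is a hypothetical stand-in for llvm::object::ELFType:
// one tag type carrying endianness, max alignment, and word size.
template <bool IsLittle, std::size_t MaxAlign, bool Is64>
struct MiniELFType {
  static const bool isLittleEndian = IsLittle;
  static const std::size_t maxAlignment = MaxAlign;
  static const bool is64Bits = Is64;
};

// Consumers take a single ELFT parameter, so adding a property to the
// tag no longer ripples through every template signature.
template <class ELFT> struct MiniHeader {
  typename WordType<ELFT::is64Bits>::type entryAddress;
};

typedef MiniHeader<MiniELFType<true, 8, true> > LE64Header;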
-template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -DyldELFObject<target_endianness, max_alignment, is64Bits> - ::DyldELFObject(MemoryBuffer *Wrapper, error_code &ec) - : ELFObjectFile<target_endianness, max_alignment, is64Bits>(Wrapper, ec) { +template<class ELFT> +DyldELFObject<ELFT>::DyldELFObject(MemoryBuffer *Wrapper, error_code &ec) + : ELFObjectFile<ELFT>(Wrapper, ec) { this->isDyldELFObject = true; } -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> -void DyldELFObject<target_endianness, max_alignment, is64Bits> - ::updateSectionAddress(const SectionRef &Sec, uint64_t Addr) { +template<class ELFT> +void DyldELFObject<ELFT>::updateSectionAddress(const SectionRef &Sec, + uint64_t Addr) { DataRefImpl ShdrRef = Sec.getRawDataRefImpl(); Elf_Shdr *shdr = const_cast<Elf_Shdr*>( reinterpret_cast<const Elf_Shdr *>(ShdrRef.p)); @@ -138,13 +135,12 @@ void DyldELFObject<target_endianness, max_alignment, is64Bits> shdr->sh_addr = static_cast<addr_type>(Addr); } -template<endianness target_endianness, std::size_t max_align, bool is64Bits> -void DyldELFObject<target_endianness, max_align, is64Bits> - ::updateSymbolAddress(const SymbolRef &SymRef, uint64_t Addr){ +template<class ELFT> +void DyldELFObject<ELFT>::updateSymbolAddress(const SymbolRef &SymRef, + uint64_t Addr) { Elf_Sym *sym = const_cast<Elf_Sym*>( - ELFObjectFile<target_endianness, max_align, is64Bits> - ::getSymbol(SymRef.getRawDataRefImpl())); + ELFObjectFile<ELFT>::getSymbol(SymRef.getRawDataRefImpl())); // This assumes the address passed in matches the target address bitness // The template-based type cast handles everything else. @@ -164,24 +160,28 @@ ObjectImage *RuntimeDyldELF::createObjectImage(ObjectBuffer *Buffer) { error_code ec; if (Ident.first == ELF::ELFCLASS32 && Ident.second == ELF::ELFDATA2LSB) { - DyldELFObject<support::little, 4, false> *Obj = - new DyldELFObject<support::little, 4, false>(Buffer->getMemBuffer(), ec); - return new ELFObjectImage<support::little, 4, false>(Buffer, Obj); + DyldELFObject<ELFType<support::little, 4, false> > *Obj = + new DyldELFObject<ELFType<support::little, 4, false> >( + Buffer->getMemBuffer(), ec); + return new ELFObjectImage<ELFType<support::little, 4, false> >(Buffer, Obj); } else if (Ident.first == ELF::ELFCLASS32 && Ident.second == ELF::ELFDATA2MSB) { - DyldELFObject<support::big, 4, false> *Obj = - new DyldELFObject<support::big, 4, false>(Buffer->getMemBuffer(), ec); - return new ELFObjectImage<support::big, 4, false>(Buffer, Obj); + DyldELFObject<ELFType<support::big, 4, false> > *Obj = + new DyldELFObject<ELFType<support::big, 4, false> >( + Buffer->getMemBuffer(), ec); + return new ELFObjectImage<ELFType<support::big, 4, false> >(Buffer, Obj); } else if (Ident.first == ELF::ELFCLASS64 && Ident.second == ELF::ELFDATA2MSB) { - DyldELFObject<support::big, 8, true> *Obj = - new DyldELFObject<support::big, 8, true>(Buffer->getMemBuffer(), ec); - return new ELFObjectImage<support::big, 8, true>(Buffer, Obj); + DyldELFObject<ELFType<support::big, 8, true> > *Obj = + new DyldELFObject<ELFType<support::big, 8, true> >( + Buffer->getMemBuffer(), ec); + return new ELFObjectImage<ELFType<support::big, 8, true> >(Buffer, Obj); } else if (Ident.first == ELF::ELFCLASS64 && Ident.second == ELF::ELFDATA2LSB) { - DyldELFObject<support::little, 8, true> *Obj = - new DyldELFObject<support::little, 8, true>(Buffer->getMemBuffer(), ec); - return new ELFObjectImage<support::little, 8, true>(Buffer, Obj); + 
DyldELFObject<ELFType<support::little, 8, true> > *Obj = + new DyldELFObject<ELFType<support::little, 8, true> >( + Buffer->getMemBuffer(), ec); + return new ELFObjectImage<ELFType<support::little, 8, true> >(Buffer, Obj); } else llvm_unreachable("Unexpected ELF format"); @@ -523,7 +523,7 @@ void RuntimeDyldELF::resolvePPC64Relocation(const SectionEntry &Section, case ELF::R_PPC64_ADDR32 : { int32_t Result = static_cast<int32_t>(Value + Addend); if (SignExtend32<32>(Result) != Result) - llvm_unreachable("Relocation R_PPC64_REL32 overflow"); + llvm_unreachable("Relocation R_PPC64_ADDR32 overflow"); writeInt32BE(LocalAddress, Result); } break; case ELF::R_PPC64_REL24 : { @@ -534,6 +534,13 @@ void RuntimeDyldELF::resolvePPC64Relocation(const SectionEntry &Section, // Generates a 'bl <address>' instruction writeInt32BE(LocalAddress, 0x48000001 | (delta & 0x03FFFFFC)); } break; + case ELF::R_PPC64_REL32 : { + uint64_t FinalAddress = (Section.LoadAddress + Offset); + int32_t delta = static_cast<int32_t>(Value - FinalAddress + Addend); + if (SignExtend32<32>(delta) != delta) + llvm_unreachable("Relocation R_PPC64_REL32 overflow"); + writeInt32BE(LocalAddress, delta); + } break; case ELF::R_PPC64_ADDR64 : writeInt64BE(LocalAddress, Value + Addend); break; diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp index d2310b5..bcc3df1 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp @@ -96,6 +96,7 @@ bool RuntimeDyldMachO::resolveI386Relocation(uint8_t *LocalAddress, *p++ = (uint8_t)(ValueToWrite & 0xff); ValueToWrite >>= 8; } + return false; } case macho::RIT_Difference: case macho::RIT_Generic_LocalDifference: diff --git a/lib/ExecutionEngine/TargetSelect.cpp b/lib/ExecutionEngine/TargetSelect.cpp index 3c4da75..ca4330f 100644 --- a/lib/ExecutionEngine/TargetSelect.cpp +++ b/lib/ExecutionEngine/TargetSelect.cpp @@ -32,18 +32,7 @@ TargetMachine *EngineBuilder::selectTarget() { // must use the host architecture. if (UseMCJIT && WhichEngine != EngineKind::Interpreter && M) TT.setTriple(M->getTargetTriple()); - else { - TT.setTriple(LLVM_HOSTTRIPLE); -#if defined(__APPLE__) -#if defined(__LP64__) - if (TT.isArch32Bit()) - TT = TT.get64BitArchVariant(); -#else - if (TT.isArch64Bit()) - TT = TT.get32BitArchVariant(); -#endif -#endif // APPLE - } + return selectTarget(TT, MArch, MCPU, MAttrs); } @@ -55,7 +44,7 @@ TargetMachine *EngineBuilder::selectTarget(const Triple &TargetTriple, const SmallVectorImpl<std::string>& MAttrs) { Triple TheTriple(TargetTriple); if (TheTriple.getTriple().empty()) - TheTriple.setTriple(sys::getDefaultTargetTriple()); + TheTriple.setTriple(sys::getProcessTriple()); // Adjust the triple to match what the user requested. 
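The TargetSelect change above drops the hand-rolled LP64 adjustment of LLVM_HOSTTRIPLE in favor of sys::getProcessTriple(), which already describes the running process (for example a 32-bit process on a 64-bit kernel) rather than the configured default target. A small sketch of the resulting fallback, assuming getProcessTriple() is declared in llvm/Support/Host.h; pickTriple is hypothetical:

#include "llvm/ADT/Triple.h"
#include "llvm/Support/Host.h"

static llvm::Triple pickTriple(llvm::StringRef ModuleTriple) {
  llvm::Triple T(ModuleTriple);
  if (T.getTriple().empty())
    // Fall back to the triple of the process we are running in, as
    // selectTarget() now does for MCJIT.
    T.setTriple(llvm::sys::getProcessTriple());
  return T;
}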
const Target *TheTarget = 0; diff --git a/lib/IR/AsmWriter.cpp b/lib/IR/AsmWriter.cpp index 1c46a94..9954a29 100644 --- a/lib/IR/AsmWriter.cpp +++ b/lib/IR/AsmWriter.cpp @@ -66,22 +66,21 @@ static const Module *getModuleFromVal(const Value *V) { return 0; } -static void PrintCallingConv(unsigned cc, raw_ostream &Out) -{ +static void PrintCallingConv(unsigned cc, raw_ostream &Out) { switch (cc) { - case CallingConv::Fast: Out << "fastcc"; break; - case CallingConv::Cold: Out << "coldcc"; break; - case CallingConv::X86_StdCall: Out << "x86_stdcallcc"; break; - case CallingConv::X86_FastCall: Out << "x86_fastcallcc"; break; - case CallingConv::X86_ThisCall: Out << "x86_thiscallcc"; break; - case CallingConv::Intel_OCL_BI: Out << "intel_ocl_bicc"; break; - case CallingConv::ARM_APCS: Out << "arm_apcscc"; break; - case CallingConv::ARM_AAPCS: Out << "arm_aapcscc"; break; - case CallingConv::ARM_AAPCS_VFP:Out << "arm_aapcs_vfpcc"; break; - case CallingConv::MSP430_INTR: Out << "msp430_intrcc"; break; - case CallingConv::PTX_Kernel: Out << "ptx_kernel"; break; - case CallingConv::PTX_Device: Out << "ptx_device"; break; - default: Out << "cc" << cc; break; + default: Out << "cc" << cc; break; + case CallingConv::Fast: Out << "fastcc"; break; + case CallingConv::Cold: Out << "coldcc"; break; + case CallingConv::X86_StdCall: Out << "x86_stdcallcc"; break; + case CallingConv::X86_FastCall: Out << "x86_fastcallcc"; break; + case CallingConv::X86_ThisCall: Out << "x86_thiscallcc"; break; + case CallingConv::Intel_OCL_BI: Out << "intel_ocl_bicc"; break; + case CallingConv::ARM_APCS: Out << "arm_apcscc"; break; + case CallingConv::ARM_AAPCS: Out << "arm_aapcscc"; break; + case CallingConv::ARM_AAPCS_VFP: Out << "arm_aapcs_vfpcc"; break; + case CallingConv::MSP430_INTR: Out << "msp430_intrcc"; break; + case CallingConv::PTX_Kernel: Out << "ptx_kernel"; break; + case CallingConv::PTX_Device: Out << "ptx_device"; break; } } @@ -117,7 +116,7 @@ static void PrintLLVMName(raw_ostream &OS, StringRef Name, PrefixType Prefix) { } // Scan the name to see if it needs quotes first. - bool NeedsQuotes = isdigit(Name[0]); + bool NeedsQuotes = isdigit(static_cast<unsigned char>(Name[0])); if (!NeedsQuotes) { for (unsigned i = 0, e = Name.size(); i != e; ++i) { // By making this unsigned, the value passed in to isalnum will always be @@ -125,7 +124,8 @@ static void PrintLLVMName(raw_ostream &OS, StringRef Name, PrefixType Prefix) { // its implementation will assert. This situation can arise when dealing // with UTF-8 multibyte characters. unsigned char C = Name[i]; - if (!isalnum(C) && C != '-' && C != '.' && C != '_') { + if (!isalnum(static_cast<unsigned char>(C)) && C != '-' && C != '.' && + C != '_') { NeedsQuotes = true; break; } @@ -347,6 +347,10 @@ private: /// mdnMap - Map for MDNodes. DenseMap<const MDNode*, unsigned> mdnMap; unsigned mdnNext; + + /// asMap - The slot map for attribute sets. + DenseMap<AttributeSet, unsigned> asMap; + unsigned asNext; public: /// Construct from a module explicit SlotTracker(const Module *M); @@ -358,6 +362,7 @@ public: int getLocalSlot(const Value *V); int getGlobalSlot(const GlobalValue *V); int getMetadataSlot(const MDNode *N); + int getAttributeGroupSlot(AttributeSet AS); /// If you'd like to deal with a function instead of just a module, use /// this method to get its data into the SlotTracker. @@ -378,6 +383,13 @@ public: unsigned mdn_size() const { return mdnMap.size(); } bool mdn_empty() const { return mdnMap.empty(); } + /// AttributeSet map iterators. 
+ typedef DenseMap<AttributeSet, unsigned>::iterator as_iterator; + as_iterator as_begin() { return asMap.begin(); } + as_iterator as_end() { return asMap.end(); } + unsigned as_size() const { return asMap.size(); } + bool as_empty() const { return asMap.empty(); } + /// This function does the actual initialization. inline void initialize(); @@ -392,6 +404,9 @@ private: /// CreateFunctionSlot - Insert the specified Value* into the slot table. void CreateFunctionSlot(const Value *V); + /// \brief Insert the specified AttributeSet into the slot table. + void CreateAttributeSetSlot(AttributeSet AS); + /// Add all of the module level global variables (and their initializers) /// and function declarations, but not the contents of those functions. void processModule(); @@ -446,14 +461,14 @@ static SlotTracker *createSlotTracker(const Value *V) { // to be added to the slot table. SlotTracker::SlotTracker(const Module *M) : TheModule(M), TheFunction(0), FunctionProcessed(false), - mNext(0), fNext(0), mdnNext(0) { + mNext(0), fNext(0), mdnNext(0), asNext(0) { } // Function level constructor. Causes the contents of the Module and the one // function provided to be added to the slot table. SlotTracker::SlotTracker(const Function *F) : TheModule(F ? F->getParent() : 0), TheFunction(F), FunctionProcessed(false), - mNext(0), fNext(0), mdnNext(0) { + mNext(0), fNext(0), mdnNext(0), asNext(0) { } inline void SlotTracker::initialize() { @@ -487,12 +502,19 @@ void SlotTracker::processModule() { CreateMetadataSlot(NMD->getOperand(i)); } - // Add all the unnamed functions to the table. for (Module::const_iterator I = TheModule->begin(), E = TheModule->end(); - I != E; ++I) + I != E; ++I) { if (!I->hasName()) + // Add all the unnamed functions to the table. CreateModuleSlot(I); + // Add all the function attributes to the table. + // FIXME: Add attributes of other objects? + AttributeSet FnAttrs = I->getAttributes().getFnAttributes(); + if (FnAttrs.hasAttributes(AttributeSet::FunctionIndex)) + CreateAttributeSetSlot(FnAttrs); + } + ST_DEBUG("end processModule!\n"); } @@ -531,6 +553,16 @@ void SlotTracker::processFunction() { for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) if (MDNode *N = dyn_cast_or_null<MDNode>(I->getOperand(i))) CreateMetadataSlot(N); + + // Add all the call attributes to the table. + AttributeSet Attrs = CI->getAttributes().getFnAttributes(); + if (Attrs.hasAttributes(AttributeSet::FunctionIndex)) + CreateAttributeSetSlot(Attrs); + } else if (const InvokeInst *II = dyn_cast<InvokeInst>(I)) { + // Add all the call attributes to the table. + AttributeSet Attrs = II->getAttributes().getFnAttributes(); + if (Attrs.hasAttributes(AttributeSet::FunctionIndex)) + CreateAttributeSetSlot(Attrs); } // Process metadata attached with this instruction. @@ -589,6 +621,14 @@ int SlotTracker::getLocalSlot(const Value *V) { return FI == fMap.end() ? -1 : (int)FI->second; } +int SlotTracker::getAttributeGroupSlot(AttributeSet AS) { + // Check for uninitialized state and do lazy initialization. + initialize(); + + // Find the AttributeSet in the module map. + as_iterator AI = asMap.find(AS); + return AI == asMap.end() ? -1 : (int)AI->second; +} /// CreateModuleSlot - Insert the specified GlobalValue* into the slot table. 
void SlotTracker::CreateModuleSlot(const GlobalValue *V) { @@ -640,6 +680,18 @@ void SlotTracker::CreateMetadataSlot(const MDNode *N) { CreateMetadataSlot(Op); } +void SlotTracker::CreateAttributeSetSlot(AttributeSet AS) { + assert(AS.hasAttributes(AttributeSet::FunctionIndex) && + "Doesn't need a slot!"); + + as_iterator I = asMap.find(AS); + if (I != asMap.end()) + return; + + unsigned DestSlot = asNext++; + asMap[AS] = DestSlot; +} + //===----------------------------------------------------------------------===// // AsmWriter Implementation //===----------------------------------------------------------------------===// @@ -1201,6 +1253,7 @@ public: void writeAtomic(AtomicOrdering Ordering, SynchronizationScope SynchScope); void writeAllMDNodes(); + void writeAllAttributeGroups(); void printTypeIdentities(); void printGlobal(const GlobalVariable *GV); @@ -1268,6 +1321,8 @@ void AssemblyWriter::writeParamOperand(const Value *Operand, } void AssemblyWriter::printModule(const Module *M) { + Machine.initialize(); + if (!M->getModuleIdentifier().empty() && // Don't print the ID if it will start a new line (which would // require a comment char before it). @@ -1322,6 +1377,12 @@ void AssemblyWriter::printModule(const Module *M) { for (Module::const_iterator I = M->begin(), E = M->end(); I != E; ++I) printFunction(I); + // Output all attribute groups. + if (!Machine.as_empty()) { + Out << '\n'; + writeAllAttributeGroups(); + } + // Output named metadata. if (!M->named_metadata_empty()) Out << '\n'; @@ -1342,14 +1403,16 @@ void AssemblyWriter::printNamedMDNode(const NamedMDNode *NMD) { if (Name.empty()) { Out << "<empty name> "; } else { - if (isalpha(Name[0]) || Name[0] == '-' || Name[0] == '$' || + if (isalpha(static_cast<unsigned char>(Name[0])) || + Name[0] == '-' || Name[0] == '$' || Name[0] == '.' || Name[0] == '_') Out << Name[0]; else Out << '\\' << hexdigit(Name[0] >> 4) << hexdigit(Name[0] & 0x0F); for (unsigned i = 1, e = Name.size(); i != e; ++i) { unsigned char C = Name[i]; - if (isalnum(C) || C == '-' || C == '$' || C == '.' || C == '_') + if (isalnum(static_cast<unsigned char>(C)) || C == '-' || C == '$' || + C == '.' || C == '_') Out << C; else Out << '\\' << hexdigit(C >> 4) << hexdigit(C & 0x0F); @@ -1443,6 +1506,7 @@ void AssemblyWriter::printGlobal(const GlobalVariable *GV) { if (unsigned AddressSpace = GV->getType()->getAddressSpace()) Out << "addrspace(" << AddressSpace << ") "; if (GV->hasUnnamedAddr()) Out << "unnamed_addr "; + if (GV->isExternallyInitialized()) Out << "externally_initialized "; Out << (GV->isConstant() ? 
"constant " : "global "); TypePrinter.print(GV->getType()->getElementType(), Out); @@ -1557,9 +1621,8 @@ void AssemblyWriter::printFunction(const Function *F) { FunctionType *FT = F->getFunctionType(); const AttributeSet &Attrs = F->getAttributes(); - Attribute RetAttrs = Attrs.getRetAttributes(); - if (RetAttrs.hasAttributes()) - Out << Attrs.getRetAttributes().getAsString() << ' '; + if (Attrs.hasAttributes(AttributeSet::ReturnIndex)) + Out << Attrs.getAsString(AttributeSet::ReturnIndex) << ' '; TypePrinter.print(F->getReturnType(), Out); Out << ' '; WriteAsOperandInternal(Out, F, &TypePrinter, &Machine, F->getParent()); @@ -1601,7 +1664,7 @@ void AssemblyWriter::printFunction(const Function *F) { if (F->hasUnnamedAddr()) Out << " unnamed_addr"; if (Attrs.hasAttributes(AttributeSet::FunctionIndex)) - Out << ' ' << Attrs.getAsString(AttributeSet::FunctionIndex); + Out << " #" << Machine.getAttributeGroupSlot(Attrs.getFnAttributes()); if (F->hasSection()) { Out << " section \""; PrintEscapedString(F->getSection(), Out); @@ -1758,7 +1821,7 @@ void AssemblyWriter::printInstruction(const Instruction &I) { // Special case conditional branches to swizzle the condition out to the front if (isa<BranchInst>(I) && cast<BranchInst>(I).isConditional()) { - BranchInst &BI(cast<BranchInst>(I)); + const BranchInst &BI(cast<BranchInst>(I)); Out << ' '; writeOperand(BI.getCondition(), true); Out << ", "; @@ -1767,14 +1830,14 @@ void AssemblyWriter::printInstruction(const Instruction &I) { writeOperand(BI.getSuccessor(1), true); } else if (isa<SwitchInst>(I)) { - SwitchInst& SI(cast<SwitchInst>(I)); + const SwitchInst& SI(cast<SwitchInst>(I)); // Special case switch instruction to get formatting nice and correct. Out << ' '; writeOperand(SI.getCondition(), true); Out << ", "; writeOperand(SI.getDefaultDest(), true); Out << " ["; - for (SwitchInst::CaseIt i = SI.case_begin(), e = SI.case_end(); + for (SwitchInst::ConstCaseIt i = SI.case_begin(), e = SI.case_end(); i != e; ++i) { Out << "\n "; writeOperand(i.getCaseValue(), true); @@ -1849,8 +1912,8 @@ void AssemblyWriter::printInstruction(const Instruction &I) { Type *RetTy = FTy->getReturnType(); const AttributeSet &PAL = CI->getAttributes(); - if (PAL.getRetAttributes().hasAttributes()) - Out << ' ' << PAL.getRetAttributes().getAsString(); + if (PAL.hasAttributes(AttributeSet::ReturnIndex)) + Out << ' ' << PAL.getAsString(AttributeSet::ReturnIndex); // If possible, print out the short form of the call instruction. We can // only do this if the first argument is a pointer to a nonvararg function, @@ -1874,7 +1937,7 @@ void AssemblyWriter::printInstruction(const Instruction &I) { } Out << ')'; if (PAL.hasAttributes(AttributeSet::FunctionIndex)) - Out << ' ' << PAL.getAsString(AttributeSet::FunctionIndex); + Out << " #" << Machine.getAttributeGroupSlot(PAL.getFnAttributes()); } else if (const InvokeInst *II = dyn_cast<InvokeInst>(&I)) { Operand = II->getCalledValue(); PointerType *PTy = cast<PointerType>(Operand->getType()); @@ -1888,8 +1951,8 @@ void AssemblyWriter::printInstruction(const Instruction &I) { PrintCallingConv(II->getCallingConv(), Out); } - if (PAL.getRetAttributes().hasAttributes()) - Out << ' ' << PAL.getRetAttributes().getAsString(); + if (PAL.hasAttributes(AttributeSet::ReturnIndex)) + Out << ' ' << PAL.getAsString(AttributeSet::ReturnIndex); // If possible, print out the short form of the invoke instruction. 
We can
  // only do this if the first argument is a pointer to a nonvararg function,
@@ -1914,7 +1977,7 @@ void AssemblyWriter::printInstruction(const Instruction &I) {
     Out << ')';
     if (PAL.hasAttributes(AttributeSet::FunctionIndex))
-      Out << ' ' << PAL.getAsString(AttributeSet::FunctionIndex);
+      Out << " #" << Machine.getAttributeGroupSlot(PAL.getFnAttributes());

     Out << "\n          to ";
     writeOperand(II->getNormalDest(), true);
@@ -1923,7 +1986,7 @@
   } else if (const AllocaInst *AI = dyn_cast<AllocaInst>(&I)) {
     Out << ' ';
-    TypePrinter.print(AI->getType()->getElementType(), Out);
+    TypePrinter.print(AI->getAllocatedType(), Out);
     if (!AI->getArraySize() || AI->isArrayAllocation()) {
       Out << ", ";
       writeOperand(AI->getArraySize(), true);
@@ -2063,6 +2126,20 @@ void AssemblyWriter::printMDNodeBody(const MDNode *Node) {
   Out << "\n";
 }

+void AssemblyWriter::writeAllAttributeGroups() {
+  std::vector<std::pair<AttributeSet, unsigned> > asVec;
+  asVec.resize(Machine.as_size());
+
+  for (SlotTracker::as_iterator I = Machine.as_begin(), E = Machine.as_end();
+       I != E; ++I)
+    asVec[I->second] = *I;
+
+  for (std::vector<std::pair<AttributeSet, unsigned> >::iterator
+         I = asVec.begin(), E = asVec.end(); I != E; ++I)
+    Out << "attributes #" << I->second << " = { "
+        << I->first.getAsString(AttributeSet::FunctionIndex, true) << " }\n";
+}
+
 //===----------------------------------------------------------------------===//
 // External Interface declarations
 //===----------------------------------------------------------------------===//
diff --git a/lib/IR/AttributeImpl.h b/lib/IR/AttributeImpl.h
index 7bb666a..cb2c55c 100644
--- a/lib/IR/AttributeImpl.h
+++ b/lib/IR/AttributeImpl.h
@@ -18,6 +18,7 @@

 #include "llvm/ADT/FoldingSet.h"
 #include "llvm/IR/Attributes.h"
+#include <string>

 namespace llvm {

@@ -26,81 +27,250 @@ class LLVMContext;
 //===----------------------------------------------------------------------===//
 /// \class
+/// \brief A set of classes that contain the kind and (optional) value of the
+/// attribute object. There are three main categories: enum attribute entries,
+/// represented by Attribute::AttrKind; alignment attribute entries; and string
+/// attribute entries, which are for target-dependent attributes.
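The entry classes that follow use LLVM-style RTTI: the base class stores a kind tag (KindID below) and every subclass defines classof, so isa<> and dyn_cast<> can dispatch without C++ RTTI. The scheme in miniature, with hypothetical Shape/Circle names:

#include "llvm/Support/Casting.h"

// Minimal LLVM-RTTI sketch: the base carries a kind tag, and
// classof() lets isa<>/dyn_cast<> test it -- the same shape as the
// KindID/classof pattern in the classes below.
class Shape {
public:
  enum ShapeKind { SK_Circle, SK_Square };
  Shape(ShapeKind K) : Kind(K) {}
  ShapeKind getKind() const { return Kind; }
private:
  const ShapeKind Kind;
};

class Circle : public Shape {
public:
  Circle() : Shape(SK_Circle) {}
  static bool classof(const Shape *S) { return S->getKind() == SK_Circle; }
};

// Usage: llvm::dyn_cast<Circle>(S) returns null unless S is a Circle.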
+class AttributeEntry { + unsigned char KindID; +protected: + enum AttrEntryKind { + EnumAttrEntry, + AlignAttrEntry, + StringAttrEntry + }; +public: + AttributeEntry(AttrEntryKind Kind) + : KindID(Kind) {} + virtual ~AttributeEntry() {} + + unsigned getKindID() const { return KindID; } + + static inline bool classof(const AttributeEntry *) { return true; } +}; + +class EnumAttributeEntry : public AttributeEntry { + Attribute::AttrKind Kind; +public: + EnumAttributeEntry(Attribute::AttrKind Kind) + : AttributeEntry(EnumAttrEntry), Kind(Kind) {} + + Attribute::AttrKind getEnumKind() const { return Kind; } + + static inline bool classof(const AttributeEntry *AE) { + return AE->getKindID() == EnumAttrEntry; + } + static inline bool classof(const EnumAttributeEntry *) { return true; } +}; + +class AlignAttributeEntry : public AttributeEntry { + Attribute::AttrKind Kind; + unsigned Align; +public: + AlignAttributeEntry(Attribute::AttrKind Kind, unsigned Align) + : AttributeEntry(AlignAttrEntry), Kind(Kind), Align(Align) {} + + Attribute::AttrKind getEnumKind() const { return Kind; } + unsigned getAlignment() const { return Align; } + + static inline bool classof(const AttributeEntry *AE) { + return AE->getKindID() == AlignAttrEntry; + } + static inline bool classof(const AlignAttributeEntry *) { return true; } +}; + +class StringAttributeEntry : public AttributeEntry { + std::string Kind; + std::string Val; +public: + StringAttributeEntry(StringRef Kind, StringRef Val = StringRef()) + : AttributeEntry(StringAttrEntry), Kind(Kind), Val(Val) {} + + StringRef getStringKind() const { return Kind; } + StringRef getStringValue() const { return Val; } + + static inline bool classof(const AttributeEntry *AE) { + return AE->getKindID() == StringAttrEntry; + } + static inline bool classof(const StringAttributeEntry *) { return true; } +}; + +//===----------------------------------------------------------------------===// +/// \class /// \brief This class represents a single, uniqued attribute. That attribute /// could be a single enum, a tuple, or a string. class AttributeImpl : public FoldingSetNode { - LLVMContext &Context; - Constant *Data; - SmallVector<Constant*, 0> Vals; + LLVMContext &Context; ///< Global context for uniquing objects + + AttributeEntry *Entry; ///< Holds the kind and value of the attribute + + // AttributesImpl is uniqued, these should not be publicly available. 
+ void operator=(const AttributeImpl &) LLVM_DELETED_FUNCTION; + AttributeImpl(const AttributeImpl &) LLVM_DELETED_FUNCTION; public: - explicit AttributeImpl(LLVMContext &C, uint64_t data); - explicit AttributeImpl(LLVMContext &C, Attribute::AttrKind data); - AttributeImpl(LLVMContext &C, Attribute::AttrKind data, - ArrayRef<Constant*> values); - AttributeImpl(LLVMContext &C, StringRef data); + AttributeImpl(LLVMContext &C, Attribute::AttrKind Kind); + AttributeImpl(LLVMContext &C, Attribute::AttrKind Kind, unsigned Align); + AttributeImpl(LLVMContext &C, StringRef Kind, StringRef Val = StringRef()); + ~AttributeImpl(); - ArrayRef<Constant*> getValues() const { return Vals; } + LLVMContext &getContext() { return Context; } + + bool isEnumAttribute() const; + bool isAlignAttribute() const; + bool isStringAttribute() const; bool hasAttribute(Attribute::AttrKind A) const; + bool hasAttribute(StringRef Kind) const; + + Attribute::AttrKind getKindAsEnum() const; + uint64_t getValueAsInt() const; - bool hasAttributes() const; + StringRef getKindAsString() const; + StringRef getValueAsString() const; - uint64_t getAlignment() const; - void setAlignment(unsigned Align); + /// \brief Used when sorting the attributes. + bool operator<(const AttributeImpl &AI) const; - uint64_t getStackAlignment() const; - void setStackAlignment(unsigned Align); + void Profile(FoldingSetNodeID &ID) const { + if (isEnumAttribute()) + Profile(ID, getKindAsEnum(), 0); + else if (isAlignAttribute()) + Profile(ID, getKindAsEnum(), getValueAsInt()); + else + Profile(ID, getKindAsString(), getValueAsString()); + } + static void Profile(FoldingSetNodeID &ID, Attribute::AttrKind Kind, + uint64_t Val) { + ID.AddInteger(Kind); + if (Val) ID.AddInteger(Val); + } + static void Profile(FoldingSetNodeID &ID, StringRef Kind, StringRef Values) { + ID.AddString(Kind); + ID.AddString(Values); + } + + // FIXME: Remove this! + static uint64_t getAttrMask(Attribute::AttrKind Val); +}; + +//===----------------------------------------------------------------------===// +/// \class +/// \brief This class represents a group of attributes that apply to one +/// element: function, return type, or parameter. +class AttributeSetNode : public FoldingSetNode { + SmallVector<Attribute, 4> AttrList; - bool operator==(Attribute::AttrKind Kind) const; - bool operator!=(Attribute::AttrKind Kind) const; + AttributeSetNode(ArrayRef<Attribute> Attrs) + : AttrList(Attrs.begin(), Attrs.end()) {} - bool operator==(StringRef Kind) const; - bool operator!=(StringRef Kind) const; + // AttributesSetNode is uniqued, these should not be publicly available. + void operator=(const AttributeSetNode &) LLVM_DELETED_FUNCTION; + AttributeSetNode(const AttributeSetNode &) LLVM_DELETED_FUNCTION; +public: + static AttributeSetNode *get(LLVMContext &C, ArrayRef<Attribute> Attrs); - uint64_t getBitMask() const; // FIXME: Remove. 
+ bool hasAttribute(Attribute::AttrKind Kind) const; + bool hasAttribute(StringRef Kind) const; + bool hasAttributes() const { return !AttrList.empty(); } - static uint64_t getAttrMask(Attribute::AttrKind Val); + Attribute getAttribute(Attribute::AttrKind Kind) const; + Attribute getAttribute(StringRef Kind) const; + + unsigned getAlignment() const; + unsigned getStackAlignment() const; + std::string getAsString(bool InAttrGrp) const; + + typedef SmallVectorImpl<Attribute>::iterator iterator; + typedef SmallVectorImpl<Attribute>::const_iterator const_iterator; + + iterator begin() { return AttrList.begin(); } + iterator end() { return AttrList.end(); } + + const_iterator begin() const { return AttrList.begin(); } + const_iterator end() const { return AttrList.end(); } void Profile(FoldingSetNodeID &ID) const { - Profile(ID, Data, Vals); + Profile(ID, AttrList); } - static void Profile(FoldingSetNodeID &ID, Constant *Data, - ArrayRef<Constant*> Vals) { - ID.AddPointer(Data); - for (ArrayRef<Constant*>::iterator I = Vals.begin(), E = Vals.end(); - I != E; ++I) - ID.AddPointer(*I); + static void Profile(FoldingSetNodeID &ID, ArrayRef<Attribute> AttrList) { + for (unsigned I = 0, E = AttrList.size(); I != E; ++I) + AttrList[I].Profile(ID); } }; //===----------------------------------------------------------------------===// /// \class -/// \brief This class represents a set of attributes. +/// \brief This class represents a set of attributes that apply to the function, +/// return type, and parameters. class AttributeSetImpl : public FoldingSetNode { + friend class AttributeSet; + LLVMContext &Context; - SmallVector<AttributeWithIndex, 4> AttrList; + + typedef std::pair<unsigned, AttributeSetNode*> IndexAttrPair; + SmallVector<IndexAttrPair, 4> AttrNodes; // AttributesSet is uniqued, these should not be publicly available. void operator=(const AttributeSetImpl &) LLVM_DELETED_FUNCTION; AttributeSetImpl(const AttributeSetImpl &) LLVM_DELETED_FUNCTION; public: - AttributeSetImpl(LLVMContext &C, ArrayRef<AttributeWithIndex> attrs) - : Context(C), AttrList(attrs.begin(), attrs.end()) {} + AttributeSetImpl(LLVMContext &C, + ArrayRef<std::pair<unsigned, AttributeSetNode*> > attrs) + : Context(C), AttrNodes(attrs.begin(), attrs.end()) {} + /// \brief Get the context that created this AttributeSetImpl. LLVMContext &getContext() { return Context; } - ArrayRef<AttributeWithIndex> getAttributes() const { return AttrList; } - unsigned getNumAttributes() const { return AttrList.size(); } + + /// \brief Return the number of attributes this AttributeSet contains. + unsigned getNumAttributes() const { return AttrNodes.size(); } + + /// \brief Get the index of the given "slot" in the AttrNodes list. This index + /// is the index of the return, parameter, or function object that the + /// attributes are applied to, not the index into the AttrNodes list where the + /// attributes reside. + uint64_t getSlotIndex(unsigned Slot) const { + return AttrNodes[Slot].first; + } + + /// \brief Retrieve the attributes for the given "slot" in the AttrNode list. + /// \p Slot is an index into the AttrNodes list, not the index of the return / + /// parameter/ function which the attributes apply to. + AttributeSet getSlotAttributes(unsigned Slot) const { + return AttributeSet::get(Context, AttrNodes[Slot]); + } + + /// \brief Retrieve the attribute set node for the given "slot" in the + /// AttrNode list. 
+ AttributeSetNode *getSlotNode(unsigned Slot) const { + return AttrNodes[Slot].second; + } + + typedef AttributeSetNode::iterator iterator; + typedef AttributeSetNode::const_iterator const_iterator; + + iterator begin(unsigned Idx) + { return AttrNodes[Idx].second->begin(); } + iterator end(unsigned Idx) + { return AttrNodes[Idx].second->end(); } + + const_iterator begin(unsigned Idx) const + { return AttrNodes[Idx].second->begin(); } + const_iterator end(unsigned Idx) const + { return AttrNodes[Idx].second->end(); } void Profile(FoldingSetNodeID &ID) const { - Profile(ID, AttrList); + Profile(ID, AttrNodes); } static void Profile(FoldingSetNodeID &ID, - ArrayRef<AttributeWithIndex> AttrList){ - for (unsigned i = 0, e = AttrList.size(); i != e; ++i) { - ID.AddInteger(AttrList[i].Index); - ID.AddInteger(AttrList[i].Attrs.getBitMask()); + ArrayRef<std::pair<unsigned, AttributeSetNode*> > Nodes) { + for (unsigned i = 0, e = Nodes.size(); i != e; ++i) { + ID.AddInteger(Nodes[i].first); + ID.AddPointer(Nodes[i].second); } } + + // FIXME: This atrocity is temporary. + uint64_t Raw(uint64_t Index) const; }; } // end llvm namespace diff --git a/lib/IR/Attributes.cpp b/lib/IR/Attributes.cpp index bef7a6c..6eb51f0 100644 --- a/lib/IR/Attributes.cpp +++ b/lib/IR/Attributes.cpp @@ -1,4 +1,4 @@ -//===-- Attribute.cpp - Implement AttributesList -------------------------===// +//===-- Attributes.cpp - Implement AttributesList -------------------------===// // // The LLVM Compiler Infrastructure // @@ -7,7 +7,8 @@ // //===----------------------------------------------------------------------===// // -// This file implements the Attribute, AttributeImpl, AttrBuilder, +// \file +// \brief This file implements the Attribute, AttributeImpl, AttrBuilder, // AttributeSetImpl, and AttributeSet classes. // //===----------------------------------------------------------------------===// @@ -15,7 +16,6 @@ #include "llvm/IR/Attributes.h" #include "AttributeImpl.h" #include "LLVMContextImpl.h" -#include "llvm/ADT/FoldingSet.h" #include "llvm/ADT/StringExtras.h" #include "llvm/IR/Type.h" #include "llvm/Support/Atomic.h" @@ -23,30 +23,41 @@ #include "llvm/Support/ManagedStatic.h" #include "llvm/Support/Mutex.h" #include "llvm/Support/raw_ostream.h" +#include <algorithm> using namespace llvm; //===----------------------------------------------------------------------===// -// Attribute Implementation +// Attribute Construction Methods //===----------------------------------------------------------------------===// -Attribute Attribute::get(LLVMContext &Context, ArrayRef<AttrKind> Vals) { - AttrBuilder B; - for (ArrayRef<AttrKind>::iterator I = Vals.begin(), E = Vals.end(); - I != E; ++I) - B.addAttribute(*I); - return Attribute::get(Context, B); -} +Attribute Attribute::get(LLVMContext &Context, Attribute::AttrKind Kind, + uint64_t Val) { + LLVMContextImpl *pImpl = Context.pImpl; + FoldingSetNodeID ID; + ID.AddInteger(Kind); + if (Val) ID.AddInteger(Val); -Attribute Attribute::get(LLVMContext &Context, AttrBuilder &B) { - // If there are no attributes, return an empty Attribute class. - if (!B.hasAttributes()) - return Attribute(); + void *InsertPoint; + AttributeImpl *PA = pImpl->AttrsSet.FindNodeOrInsertPos(ID, InsertPoint); - // Otherwise, build a key to look up the existing attributes. + if (!PA) { + // If we didn't find any existing attributes of the same shape then create a + // new one and insert it. + PA = !Val ? 
+ new AttributeImpl(Context, Kind) : + new AttributeImpl(Context, Kind, Val); + pImpl->AttrsSet.InsertNode(PA, InsertPoint); + } + + // Return the Attribute that we found or created. + return Attribute(PA); +} + +Attribute Attribute::get(LLVMContext &Context, StringRef Kind, StringRef Val) { LLVMContextImpl *pImpl = Context.pImpl; FoldingSetNodeID ID; - // FIXME: Don't look up ConstantInts here. - ID.AddPointer(ConstantInt::get(Type::getInt64Ty(Context), B.getBitMask())); + ID.AddString(Kind); + if (!Val.empty()) ID.AddString(Val); void *InsertPoint; AttributeImpl *PA = pImpl->AttrsSet.FindNodeOrInsertPos(ID, InsertPoint); @@ -54,401 +65,298 @@ Attribute Attribute::get(LLVMContext &Context, AttrBuilder &B) { if (!PA) { // If we didn't find any existing attributes of the same shape then create a // new one and insert it. - PA = new AttributeImpl(Context, B.getBitMask()); + PA = new AttributeImpl(Context, Kind, Val); pImpl->AttrsSet.InsertNode(PA, InsertPoint); } - // Return the AttributesList that we found or created. + // Return the Attribute that we found or created. return Attribute(PA); } -bool Attribute::hasAttribute(AttrKind Val) const { - return pImpl && pImpl->hasAttribute(Val); +Attribute Attribute::getWithAlignment(LLVMContext &Context, uint64_t Align) { + assert(isPowerOf2_32(Align) && "Alignment must be a power of two."); + assert(Align <= 0x40000000 && "Alignment too large."); + return get(Context, Alignment, Align); } -bool Attribute::hasAttributes() const { - return pImpl && pImpl->hasAttributes(); +Attribute Attribute::getWithStackAlignment(LLVMContext &Context, + uint64_t Align) { + assert(isPowerOf2_32(Align) && "Alignment must be a power of two."); + assert(Align <= 0x100 && "Alignment too large."); + return get(Context, StackAlignment, Align); } -/// This returns the alignment field of an attribute as a byte alignment value. -unsigned Attribute::getAlignment() const { - if (!hasAttribute(Attribute::Alignment)) - return 0; - return 1U << ((pImpl->getAlignment() >> 16) - 1); +//===----------------------------------------------------------------------===// +// Attribute Accessor Methods +//===----------------------------------------------------------------------===// + +bool Attribute::isEnumAttribute() const { + return pImpl && pImpl->isEnumAttribute(); } -void Attribute::setAlignment(unsigned Align) { - assert(hasAttribute(Attribute::Alignment) && - "Trying to set the alignment on a non-alignment attribute!"); - pImpl->setAlignment(Align); +bool Attribute::isAlignAttribute() const { + return pImpl && pImpl->isAlignAttribute(); } -/// This returns the stack alignment field of an attribute as a byte alignment -/// value. -unsigned Attribute::getStackAlignment() const { - if (!hasAttribute(Attribute::StackAlignment)) - return 0; - return 1U << ((pImpl->getStackAlignment() >> 26) - 1); +bool Attribute::isStringAttribute() const { + return pImpl && pImpl->isStringAttribute(); } -void Attribute::setStackAlignment(unsigned Align) { - assert(hasAttribute(Attribute::StackAlignment) && - "Trying to set the stack alignment on a non-alignment attribute!"); - pImpl->setStackAlignment(Align); +Attribute::AttrKind Attribute::getKindAsEnum() const { + assert((isEnumAttribute() || isAlignAttribute()) && + "Invalid attribute type to get the kind as an enum!"); + return pImpl ? 
pImpl->getKindAsEnum() : None; } -bool Attribute::operator==(AttrKind K) const { - return pImpl && *pImpl == K; +uint64_t Attribute::getValueAsInt() const { + assert(isAlignAttribute() && + "Expected the attribute to be an alignment attribute!"); + return pImpl ? pImpl->getValueAsInt() : 0; } -bool Attribute::operator!=(AttrKind K) const { - return !(*this == K); + +StringRef Attribute::getKindAsString() const { + assert(isStringAttribute() && + "Invalid attribute type to get the kind as a string!"); + return pImpl ? pImpl->getKindAsString() : StringRef(); } -uint64_t Attribute::getBitMask() const { - return pImpl ? pImpl->getBitMask() : 0; +StringRef Attribute::getValueAsString() const { + assert(isStringAttribute() && + "Invalid attribute type to get the value as a string!"); + return pImpl ? pImpl->getValueAsString() : StringRef(); } -Attribute Attribute::typeIncompatible(Type *Ty) { - AttrBuilder Incompatible; +bool Attribute::hasAttribute(AttrKind Kind) const { + return (pImpl && pImpl->hasAttribute(Kind)) || (!pImpl && Kind == None); +} - if (!Ty->isIntegerTy()) - // Attribute that only apply to integers. - Incompatible.addAttribute(Attribute::SExt) - .addAttribute(Attribute::ZExt); +bool Attribute::hasAttribute(StringRef Kind) const { + if (!isStringAttribute()) return false; + return pImpl && pImpl->hasAttribute(Kind); +} - if (!Ty->isPointerTy()) - // Attribute that only apply to pointers. - Incompatible.addAttribute(Attribute::ByVal) - .addAttribute(Attribute::Nest) - .addAttribute(Attribute::NoAlias) - .addAttribute(Attribute::NoCapture) - .addAttribute(Attribute::StructRet); +/// This returns the alignment field of an attribute as a byte alignment value. +unsigned Attribute::getAlignment() const { + assert(hasAttribute(Attribute::Alignment) && + "Trying to get alignment from non-alignment attribute!"); + return pImpl->getValueAsInt(); +} - return Attribute::get(Ty->getContext(), Incompatible); -} - -/// encodeLLVMAttributesForBitcode - This returns an integer containing an -/// encoding of all the LLVM attributes found in the given attribute bitset. -/// Any change to this encoding is a breaking change to bitcode compatibility. -uint64_t Attribute::encodeLLVMAttributesForBitcode(Attribute Attrs) { - // FIXME: It doesn't make sense to store the alignment information as an - // expanded out value, we should store it as a log2 value. However, we can't - // just change that here without breaking bitcode compatibility. If this ever - // becomes a problem in practice, we should introduce new tag numbers in the - // bitcode file and have those tags use a more efficiently encoded alignment - // field. - - // Store the alignment in the bitcode as a 16-bit raw value instead of a 5-bit - // log2 encoded value. Shift the bits above the alignment up by 11 bits. - uint64_t EncodedAttrs = Attrs.getBitMask() & 0xffff; - if (Attrs.hasAttribute(Attribute::Alignment)) - EncodedAttrs |= Attrs.getAlignment() << 16; - EncodedAttrs |= (Attrs.getBitMask() & (0xffffULL << 21)) << 11; - return EncodedAttrs; -} - -/// decodeLLVMAttributesForBitcode - This returns an attribute bitset containing -/// the LLVM attributes that have been decoded from the given integer. This -/// function must stay in sync with 'encodeLLVMAttributesForBitcode'. -Attribute Attribute::decodeLLVMAttributesForBitcode(LLVMContext &C, - uint64_t EncodedAttrs) { - // The alignment is stored as a 16-bit raw value from bits 31--16. We shift - // the bits above 31 down by 11 bits. 
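The encode/decode pair being deleted here and just below packed a 64-bit attribute mask plus alignment into one word: flag bits 0-15 stay put, the raw byte alignment occupies bits 16-31, and flag bits 21 and up are shifted 11 positions higher so bit 21 lands at bit 32. That arithmetic as a standalone sketch, with hypothetical function names:

#include <stdint.h>

// Layout: [63..32: flag bits >= 21][31..16: raw alignment][15..0: flags].
static uint64_t encodeLegacyAttrs(uint64_t Mask, uint64_t Align) {
  uint64_t Enc = Mask & 0xffff;             // low flag bits unchanged
  Enc |= Align << 16;                       // raw (not log2) alignment
  Enc |= (Mask & (0xffffULL << 21)) << 11;  // bit 21 + 11 = bit 32
  return Enc;
}

static uint64_t decodeLegacyAlign(uint64_t Enc) {
  return (Enc & (0xffffULL << 16)) >> 16;   // recover bits 31..16
}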
- unsigned Alignment = (EncodedAttrs & (0xffffULL << 16)) >> 16; - assert((!Alignment || isPowerOf2_32(Alignment)) && - "Alignment must be a power of two."); - - AttrBuilder B(EncodedAttrs & 0xffff); - if (Alignment) - B.addAlignmentAttr(Alignment); - B.addRawValue((EncodedAttrs & (0xffffULL << 32)) >> 11); - return Attribute::get(C, B); -} - -std::string Attribute::getAsString() const { - std::string Result; - if (hasAttribute(Attribute::ZExt)) - Result += "zeroext "; - if (hasAttribute(Attribute::SExt)) - Result += "signext "; - if (hasAttribute(Attribute::NoReturn)) - Result += "noreturn "; - if (hasAttribute(Attribute::NoUnwind)) - Result += "nounwind "; - if (hasAttribute(Attribute::UWTable)) - Result += "uwtable "; - if (hasAttribute(Attribute::ReturnsTwice)) - Result += "returns_twice "; +/// This returns the stack alignment field of an attribute as a byte alignment +/// value. +unsigned Attribute::getStackAlignment() const { + assert(hasAttribute(Attribute::StackAlignment) && + "Trying to get alignment from non-alignment attribute!"); + return pImpl->getValueAsInt(); +} + +std::string Attribute::getAsString(bool InAttrGrp) const { + if (!pImpl) return ""; + + if (hasAttribute(Attribute::SanitizeAddress)) + return "sanitize_address"; + if (hasAttribute(Attribute::AlwaysInline)) + return "alwaysinline"; + if (hasAttribute(Attribute::ByVal)) + return "byval"; + if (hasAttribute(Attribute::InlineHint)) + return "inlinehint"; if (hasAttribute(Attribute::InReg)) - Result += "inreg "; + return "inreg"; + if (hasAttribute(Attribute::MinSize)) + return "minsize"; + if (hasAttribute(Attribute::Naked)) + return "naked"; + if (hasAttribute(Attribute::Nest)) + return "nest"; if (hasAttribute(Attribute::NoAlias)) - Result += "noalias "; + return "noalias"; + if (hasAttribute(Attribute::NoBuiltin)) + return "nobuiltin"; if (hasAttribute(Attribute::NoCapture)) - Result += "nocapture "; - if (hasAttribute(Attribute::StructRet)) - Result += "sret "; - if (hasAttribute(Attribute::ByVal)) - Result += "byval "; - if (hasAttribute(Attribute::Nest)) - Result += "nest "; + return "nocapture"; + if (hasAttribute(Attribute::NoDuplicate)) + return "noduplicate"; + if (hasAttribute(Attribute::NoImplicitFloat)) + return "noimplicitfloat"; + if (hasAttribute(Attribute::NoInline)) + return "noinline"; + if (hasAttribute(Attribute::NonLazyBind)) + return "nonlazybind"; + if (hasAttribute(Attribute::NoRedZone)) + return "noredzone"; + if (hasAttribute(Attribute::NoReturn)) + return "noreturn"; + if (hasAttribute(Attribute::NoUnwind)) + return "nounwind"; + if (hasAttribute(Attribute::OptimizeForSize)) + return "optsize"; if (hasAttribute(Attribute::ReadNone)) - Result += "readnone "; + return "readnone"; if (hasAttribute(Attribute::ReadOnly)) - Result += "readonly "; - if (hasAttribute(Attribute::OptimizeForSize)) - Result += "optsize "; - if (hasAttribute(Attribute::NoInline)) - Result += "noinline "; - if (hasAttribute(Attribute::InlineHint)) - Result += "inlinehint "; - if (hasAttribute(Attribute::AlwaysInline)) - Result += "alwaysinline "; + return "readonly"; + if (hasAttribute(Attribute::ReturnsTwice)) + return "returns_twice"; + if (hasAttribute(Attribute::SExt)) + return "signext"; if (hasAttribute(Attribute::StackProtect)) - Result += "ssp "; + return "ssp"; if (hasAttribute(Attribute::StackProtectReq)) - Result += "sspreq "; - if (hasAttribute(Attribute::NoRedZone)) - Result += "noredzone "; - if (hasAttribute(Attribute::NoImplicitFloat)) - Result += "noimplicitfloat "; - if 
(hasAttribute(Attribute::Naked)) - Result += "naked "; - if (hasAttribute(Attribute::NonLazyBind)) - Result += "nonlazybind "; - if (hasAttribute(Attribute::AddressSafety)) - Result += "address_safety "; - if (hasAttribute(Attribute::MinSize)) - Result += "minsize "; - if (hasAttribute(Attribute::StackAlignment)) { - Result += "alignstack("; - Result += utostr(getStackAlignment()); - Result += ") "; - } + return "sspreq"; + if (hasAttribute(Attribute::StackProtectStrong)) + return "sspstrong"; + if (hasAttribute(Attribute::StructRet)) + return "sret"; + if (hasAttribute(Attribute::SanitizeThread)) + return "sanitize_thread"; + if (hasAttribute(Attribute::SanitizeMemory)) + return "sanitize_memory"; + if (hasAttribute(Attribute::UWTable)) + return "uwtable"; + if (hasAttribute(Attribute::ZExt)) + return "zeroext"; + + // FIXME: These should be output like this: + // + // align=4 + // alignstack=8 + // if (hasAttribute(Attribute::Alignment)) { - Result += "align "; - Result += utostr(getAlignment()); - Result += " "; + std::string Result; + Result += "align"; + Result += (InAttrGrp) ? "=" : " "; + Result += utostr(getValueAsInt()); + return Result; } - if (hasAttribute(Attribute::NoDuplicate)) - Result += "noduplicate "; - // Trim the trailing space. - assert(!Result.empty() && "Unknown attribute!"); - Result.erase(Result.end()-1); - return Result; -} - -//===----------------------------------------------------------------------===// -// AttrBuilder Method Implementations -//===----------------------------------------------------------------------===// - -AttrBuilder::AttrBuilder(AttributeSet AS, unsigned Idx) - : Alignment(0), StackAlignment(0) { - AttributeSetImpl *pImpl = AS.AttrList; - if (!pImpl) return; - ArrayRef<AttributeWithIndex> AttrList = pImpl->getAttributes(); - const AttributeWithIndex *AWI = 0; - for (unsigned I = 0, E = AttrList.size(); I != E; ++I) - if (AttrList[I].Index == Idx) { - AWI = &AttrList[I]; - break; + if (hasAttribute(Attribute::StackAlignment)) { + std::string Result; + Result += "alignstack"; + if (InAttrGrp) { + Result += "="; + Result += utostr(getValueAsInt()); + } else { + Result += "("; + Result += utostr(getValueAsInt()); + Result += ")"; } + return Result; + } - assert(AWI && "Cannot find index in attribute set!"); - - /// FIXME: This will be modified in the future. 
Basically, the - /// AttributeWithIndex class will contain the + // Convert target-dependent attributes to strings of the form: + // + // "kind" + // "kind" = "value" + // + if (isStringAttribute()) { + std::string Result; + Result += '\"' + getKindAsString().str() + '"'; -} + StringRef Val = pImpl->getValueAsString(); + if (Val.empty()) return Result; -void AttrBuilder::clear() { - Attrs.clear(); - Alignment = StackAlignment = 0; -} + Result += "=\"" + Val.str() + '"'; + return Result; + } -AttrBuilder &AttrBuilder::addAttribute(Attribute::AttrKind Val) { - Attrs.insert(Val); - return *this; + llvm_unreachable("Unknown attribute"); } -AttrBuilder &AttrBuilder::removeAttribute(Attribute::AttrKind Val) { - Attrs.erase(Val); - if (Val == Attribute::Alignment) - Alignment = 0; - else if (Val == Attribute::StackAlignment) - StackAlignment = 0; - - return *this; +bool Attribute::operator<(Attribute A) const { + if (!pImpl && !A.pImpl) return false; + if (!pImpl) return true; + if (!A.pImpl) return false; + return *pImpl < *A.pImpl; } -AttrBuilder &AttrBuilder::addAlignmentAttr(unsigned Align) { - if (Align == 0) return *this; - - assert(isPowerOf2_32(Align) && "Alignment must be a power of two."); - assert(Align <= 0x40000000 && "Alignment too large."); - - Attrs.insert(Attribute::Alignment); - Alignment = Align; - return *this; -} - -AttrBuilder &AttrBuilder::addStackAlignmentAttr(unsigned Align) { - // Default alignment, allow the target to define how to align it. - if (Align == 0) return *this; +//===----------------------------------------------------------------------===// +// AttributeImpl Definition +//===----------------------------------------------------------------------===// - assert(isPowerOf2_32(Align) && "Alignment must be a power of two."); - assert(Align <= 0x100 && "Alignment too large."); +AttributeImpl::AttributeImpl(LLVMContext &C, Attribute::AttrKind Kind) + : Context(C), Entry(new EnumAttributeEntry(Kind)) {} - Attrs.insert(Attribute::StackAlignment); - StackAlignment = Align; - return *this; +AttributeImpl::AttributeImpl(LLVMContext &C, Attribute::AttrKind Kind, + unsigned Align) + : Context(C) { + assert((Kind == Attribute::Alignment || Kind == Attribute::StackAlignment) && + "Wrong kind for alignment attribute!"); + Entry = new AlignAttributeEntry(Kind, Align); } -AttrBuilder &AttrBuilder::addRawValue(uint64_t Val) { - for (Attribute::AttrKind I = Attribute::None; I != Attribute::EndAttrKinds; - I = Attribute::AttrKind(I + 1)) { - if (uint64_t A = (Val & AttributeImpl::getAttrMask(I))) { - Attrs.insert(I); - - if (I == Attribute::Alignment) - Alignment = 1ULL << ((A >> 16) - 1); - else if (I == Attribute::StackAlignment) - StackAlignment = 1ULL << ((A >> 26)-1); - } - } +AttributeImpl::AttributeImpl(LLVMContext &C, StringRef Kind, StringRef Val) + : Context(C), Entry(new StringAttributeEntry(Kind, Val)) {} - return *this; +AttributeImpl::~AttributeImpl() { + delete Entry; } -AttrBuilder &AttrBuilder::addAttributes(const Attribute &A) { - uint64_t Mask = A.getBitMask(); - - for (Attribute::AttrKind I = Attribute::None; I != Attribute::EndAttrKinds; - I = Attribute::AttrKind(I + 1)) { - if (uint64_t A = (Mask & AttributeImpl::getAttrMask(I))) { - Attrs.insert(I); - - if (I == Attribute::Alignment) - Alignment = 1ULL << ((A >> 16) - 1); - else if (I == Attribute::StackAlignment) - StackAlignment = 1ULL << ((A >> 26)-1); - } - } - - return *this; +bool AttributeImpl::isEnumAttribute() const { + return isa<EnumAttributeEntry>(Entry); } -AttrBuilder 
&AttrBuilder::removeAttributes(const Attribute &A){ - uint64_t Mask = A.getBitMask(); - - for (Attribute::AttrKind I = Attribute::None; I != Attribute::EndAttrKinds; - I = Attribute::AttrKind(I + 1)) { - if (Mask & AttributeImpl::getAttrMask(I)) { - Attrs.erase(I); - - if (I == Attribute::Alignment) - Alignment = 0; - else if (I == Attribute::StackAlignment) - StackAlignment = 0; - } - } - - return *this; +bool AttributeImpl::isAlignAttribute() const { + return isa<AlignAttributeEntry>(Entry); } -bool AttrBuilder::contains(Attribute::AttrKind A) const { - return Attrs.count(A); +bool AttributeImpl::isStringAttribute() const { + return isa<StringAttributeEntry>(Entry); } -bool AttrBuilder::hasAttributes() const { - return !Attrs.empty(); +bool AttributeImpl::hasAttribute(Attribute::AttrKind A) const { + if (isStringAttribute()) return false; + return getKindAsEnum() == A; } -bool AttrBuilder::hasAttributes(const Attribute &A) const { - return getBitMask() & A.getBitMask(); +bool AttributeImpl::hasAttribute(StringRef Kind) const { + if (!isStringAttribute()) return false; + return getKindAsString() == Kind; } -bool AttrBuilder::hasAlignmentAttr() const { - return Alignment != 0; +Attribute::AttrKind AttributeImpl::getKindAsEnum() const { + if (EnumAttributeEntry *E = dyn_cast<EnumAttributeEntry>(Entry)) + return E->getEnumKind(); + return cast<AlignAttributeEntry>(Entry)->getEnumKind(); } -uint64_t AttrBuilder::getBitMask() const { - uint64_t Mask = 0; - - for (DenseSet<Attribute::AttrKind>::const_iterator I = Attrs.begin(), - E = Attrs.end(); I != E; ++I) { - Attribute::AttrKind Kind = *I; - - if (Kind == Attribute::Alignment) - Mask |= (Log2_32(Alignment) + 1) << 16; - else if (Kind == Attribute::StackAlignment) - Mask |= (Log2_32(StackAlignment) + 1) << 26; - else - Mask |= AttributeImpl::getAttrMask(Kind); - } - - return Mask; +uint64_t AttributeImpl::getValueAsInt() const { + return cast<AlignAttributeEntry>(Entry)->getAlignment(); } -bool AttrBuilder::operator==(const AttrBuilder &B) { - SmallVector<Attribute::AttrKind, 8> This(Attrs.begin(), Attrs.end()); - SmallVector<Attribute::AttrKind, 8> That(B.Attrs.begin(), B.Attrs.end()); - return This == That; +StringRef AttributeImpl::getKindAsString() const { + return cast<StringAttributeEntry>(Entry)->getStringKind(); } -//===----------------------------------------------------------------------===// -// AttributeImpl Definition -//===----------------------------------------------------------------------===// - -AttributeImpl::AttributeImpl(LLVMContext &C, uint64_t data) - : Context(C) { - Data = ConstantInt::get(Type::getInt64Ty(C), data); -} -AttributeImpl::AttributeImpl(LLVMContext &C, Attribute::AttrKind data) - : Context(C) { - Data = ConstantInt::get(Type::getInt64Ty(C), data); -} -AttributeImpl::AttributeImpl(LLVMContext &C, Attribute::AttrKind data, - ArrayRef<Constant*> values) - : Context(C) { - Data = ConstantInt::get(Type::getInt64Ty(C), data); - Vals.reserve(values.size()); - Vals.append(values.begin(), values.end()); -} -AttributeImpl::AttributeImpl(LLVMContext &C, StringRef data) - : Context(C) { - Data = ConstantDataArray::getString(C, data); +StringRef AttributeImpl::getValueAsString() const { + return cast<StringAttributeEntry>(Entry)->getStringValue(); } -bool AttributeImpl::operator==(Attribute::AttrKind Kind) const { - if (ConstantInt *CI = dyn_cast<ConstantInt>(Data)) - return CI->getZExtValue() == Kind; - return false; -} -bool AttributeImpl::operator!=(Attribute::AttrKind Kind) const { - return !(*this == Kind); -} 
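// ---------------------------------------------------------------------------
// [Editor's aside -- an illustrative sketch, not part of the patch.] The new
// AttributeImpl stores one of three entry kinds (enum, align, string), and the
// public Attribute class exposes them through the factory functions used
// throughout this patch. A minimal usage sketch, assuming the post-patch API
// in include/llvm/IR/Attributes.h; the string key "my-feature" is a
// hypothetical target-dependent attribute, used here only for illustration.
#include "llvm/IR/Attributes.h"
#include "llvm/IR/LLVMContext.h"
using namespace llvm;

void attributeKindsSketch() {
  LLVMContext Ctx;
  // Enum entry: a bare kind with no payload.
  Attribute NU = Attribute::get(Ctx, Attribute::NoUnwind);
  // Align entry: an enum kind carrying an integer value.
  Attribute A16 = Attribute::getWithAlignment(Ctx, 16);
  // String entry: a target-dependent "kind" = "value" pair.
  Attribute F = Attribute::get(Ctx, "my-feature", "on");
  // Per getAsString() above, these render as: nounwind, align 16 (or align=16
  // inside an attribute group), and "my-feature"="on".
  (void)NU; (void)A16; (void)F;
}
// ---------------------------------------------------------------------------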
+bool AttributeImpl::operator<(const AttributeImpl &AI) const { + // This sorts the attributes with Attribute::AttrKinds coming first (sorted + // relative to their enum value) and then strings. + if (isEnumAttribute()) { + if (AI.isEnumAttribute()) return getKindAsEnum() < AI.getKindAsEnum(); + if (AI.isAlignAttribute()) return true; + if (AI.isStringAttribute()) return true; + } -bool AttributeImpl::operator==(StringRef Kind) const { - if (ConstantDataArray *CDA = dyn_cast<ConstantDataArray>(Data)) - if (CDA->isString()) - return CDA->getAsString() == Kind; - return false; -} -bool AttributeImpl::operator!=(StringRef Kind) const { - return !(*this == Kind); -} + if (isAlignAttribute()) { + if (AI.isEnumAttribute()) return false; + if (AI.isAlignAttribute()) return getValueAsInt() < AI.getValueAsInt(); + if (AI.isStringAttribute()) return true; + } -uint64_t AttributeImpl::getBitMask() const { - // FIXME: Remove this. - return cast<ConstantInt>(Data)->getZExtValue(); + if (AI.isEnumAttribute()) return false; + if (AI.isAlignAttribute()) return false; + if (getKindAsString() == AI.getKindAsString()) + return getValueAsString() < AI.getValueAsString(); + return getKindAsString() < AI.getKindAsString(); } uint64_t AttributeImpl::getAttrMask(Attribute::AttrKind Val) { + // FIXME: Remove this. switch (Val) { case Attribute::EndAttrKinds: - case Attribute::AttrKindEmptyKey: - case Attribute::AttrKindTombstoneKey: llvm_unreachable("Synthetic enumerators which should never get here"); case Attribute::None: return 0; @@ -478,57 +386,150 @@ uint64_t AttributeImpl::getAttrMask(Attribute::AttrKind Val) { case Attribute::ReturnsTwice: return 1 << 29; case Attribute::UWTable: return 1 << 30; case Attribute::NonLazyBind: return 1U << 31; - case Attribute::AddressSafety: return 1ULL << 32; + case Attribute::SanitizeAddress: return 1ULL << 32; case Attribute::MinSize: return 1ULL << 33; case Attribute::NoDuplicate: return 1ULL << 34; + case Attribute::StackProtectStrong: return 1ULL << 35; + case Attribute::SanitizeThread: return 1ULL << 36; + case Attribute::SanitizeMemory: return 1ULL << 37; + case Attribute::NoBuiltin: return 1ULL << 38; } llvm_unreachable("Unsupported attribute type"); } -bool AttributeImpl::hasAttribute(Attribute::AttrKind A) const { - return (getBitMask() & getAttrMask(A)) != 0; +//===----------------------------------------------------------------------===// +// AttributeSetNode Definition +//===----------------------------------------------------------------------===// + +AttributeSetNode *AttributeSetNode::get(LLVMContext &C, + ArrayRef<Attribute> Attrs) { + if (Attrs.empty()) + return 0; + + // Otherwise, build a key to look up the existing attributes. + LLVMContextImpl *pImpl = C.pImpl; + FoldingSetNodeID ID; + + SmallVector<Attribute, 8> SortedAttrs(Attrs.begin(), Attrs.end()); + array_pod_sort(SortedAttrs.begin(), SortedAttrs.end()); + + for (SmallVectorImpl<Attribute>::iterator I = SortedAttrs.begin(), + E = SortedAttrs.end(); I != E; ++I) + I->Profile(ID); + + void *InsertPoint; + AttributeSetNode *PA = + pImpl->AttrsSetNodes.FindNodeOrInsertPos(ID, InsertPoint); + + // If we didn't find any existing attributes of the same shape then create a + // new one and insert it. + if (!PA) { + PA = new AttributeSetNode(SortedAttrs); + pImpl->AttrsSetNodes.InsertNode(PA, InsertPoint); + } + + // Return the AttributeSetNode that we found or created.
+ return PA; +} + +bool AttributeSetNode::hasAttribute(Attribute::AttrKind Kind) const { + for (SmallVectorImpl<Attribute>::const_iterator I = AttrList.begin(), + E = AttrList.end(); I != E; ++I) + if (I->hasAttribute(Kind)) + return true; + return false; +} + +bool AttributeSetNode::hasAttribute(StringRef Kind) const { + for (SmallVectorImpl<Attribute>::const_iterator I = AttrList.begin(), + E = AttrList.end(); I != E; ++I) + if (I->hasAttribute(Kind)) + return true; + return false; } -bool AttributeImpl::hasAttributes() const { - return getBitMask() != 0; +Attribute AttributeSetNode::getAttribute(Attribute::AttrKind Kind) const { + for (SmallVectorImpl<Attribute>::const_iterator I = AttrList.begin(), + E = AttrList.end(); I != E; ++I) + if (I->hasAttribute(Kind)) + return *I; + return Attribute(); } -uint64_t AttributeImpl::getAlignment() const { - return getBitMask() & getAttrMask(Attribute::Alignment); +Attribute AttributeSetNode::getAttribute(StringRef Kind) const { + for (SmallVectorImpl<Attribute>::const_iterator I = AttrList.begin(), + E = AttrList.end(); I != E; ++I) + if (I->hasAttribute(Kind)) + return *I; + return Attribute(); } -void AttributeImpl::setAlignment(unsigned Align) { - Vals.push_back(ConstantInt::get(Type::getInt64Ty(Context), Align)); +unsigned AttributeSetNode::getAlignment() const { + for (SmallVectorImpl<Attribute>::const_iterator I = AttrList.begin(), + E = AttrList.end(); I != E; ++I) + if (I->hasAttribute(Attribute::Alignment)) + return I->getAlignment(); + return 0; } -uint64_t AttributeImpl::getStackAlignment() const { - return getBitMask() & getAttrMask(Attribute::StackAlignment); +unsigned AttributeSetNode::getStackAlignment() const { + for (SmallVectorImpl<Attribute>::const_iterator I = AttrList.begin(), + E = AttrList.end(); I != E; ++I) + if (I->hasAttribute(Attribute::StackAlignment)) + return I->getStackAlignment(); + return 0; } -void AttributeImpl::setStackAlignment(unsigned Align) { - Vals.push_back(ConstantInt::get(Type::getInt64Ty(Context), Align)); +std::string AttributeSetNode::getAsString(bool InAttrGrp) const { + std::string Str = ""; + for (SmallVectorImpl<Attribute>::const_iterator I = AttrList.begin(), + E = AttrList.end(); I != E; ) { + Str += I->getAsString(InAttrGrp); + if (++I != E) Str += " "; + } + return Str; } //===----------------------------------------------------------------------===// // AttributeSetImpl Definition //===----------------------------------------------------------------------===// -AttributeSet AttributeSet::get(LLVMContext &C, - ArrayRef<AttributeWithIndex> Attrs) { - // If there are no attributes then return a null AttributesList pointer. - if (Attrs.empty()) - return AttributeSet(); +uint64_t AttributeSetImpl::Raw(uint64_t Index) const { + for (unsigned I = 0, E = getNumAttributes(); I != E; ++I) { + if (getSlotIndex(I) != Index) continue; + const AttributeSetNode *ASN = AttrNodes[I].second; + uint64_t Mask = 0; -#ifndef NDEBUG - for (unsigned i = 0, e = Attrs.size(); i != e; ++i) { - assert(Attrs[i].Attrs.hasAttributes() && - "Pointless attribute!"); - assert((!i || Attrs[i-1].Index < Attrs[i].Index) && - "Misordered AttributesList!"); + for (AttributeSetNode::const_iterator II = ASN->begin(), + IE = ASN->end(); II != IE; ++II) { + Attribute Attr = *II; + + // This cannot handle string attributes. 
+ if (Attr.isStringAttribute()) continue; + + Attribute::AttrKind Kind = Attr.getKindAsEnum(); + + if (Kind == Attribute::Alignment) + Mask |= (Log2_32(ASN->getAlignment()) + 1) << 16; + else if (Kind == Attribute::StackAlignment) + Mask |= (Log2_32(ASN->getStackAlignment()) + 1) << 26; + else + Mask |= AttributeImpl::getAttrMask(Kind); + } + + return Mask; } -#endif - // Otherwise, build a key to look up the existing attributes. + return 0; +} + +//===----------------------------------------------------------------------===// +// AttributeSet Construction and Mutation Methods +//===----------------------------------------------------------------------===// + +AttributeSet +AttributeSet::getImpl(LLVMContext &C, + ArrayRef<std::pair<unsigned, AttributeSetNode*> > Attrs) { LLVMContextImpl *pImpl = C.pImpl; FoldingSetNodeID ID; AttributeSetImpl::Profile(ID, Attrs); @@ -547,185 +548,624 @@ AttributeSet AttributeSet::get(LLVMContext &C, return AttributeSet(PA); } +AttributeSet AttributeSet::get(LLVMContext &C, + ArrayRef<std::pair<unsigned, Attribute> > Attrs){ + // If there are no attributes then return a null AttributesList pointer. + if (Attrs.empty()) + return AttributeSet(); + +#ifndef NDEBUG + for (unsigned i = 0, e = Attrs.size(); i != e; ++i) { + assert((!i || Attrs[i-1].first <= Attrs[i].first) && + "Misordered Attributes list!"); + assert(!Attrs[i].second.hasAttribute(Attribute::None) && + "Pointless attribute!"); + } +#endif + + // Create a vector of (unsigned, AttributeSetNode*) pairs from the attributes + // list. + SmallVector<std::pair<unsigned, AttributeSetNode*>, 8> AttrPairVec; + for (ArrayRef<std::pair<unsigned, Attribute> >::iterator I = Attrs.begin(), + E = Attrs.end(); I != E; ) { + unsigned Index = I->first; + SmallVector<Attribute, 4> AttrVec; + while (I != E && I->first == Index) { + AttrVec.push_back(I->second); + ++I; + } + + AttrPairVec.push_back(std::make_pair(Index, + AttributeSetNode::get(C, AttrVec))); + } + + return getImpl(C, AttrPairVec); +} + +AttributeSet AttributeSet::get(LLVMContext &C, + ArrayRef<std::pair<unsigned, + AttributeSetNode*> > Attrs) { + // If there are no attributes then return a null AttributesList pointer. + if (Attrs.empty()) + return AttributeSet(); + + return getImpl(C, Attrs); +} + AttributeSet AttributeSet::get(LLVMContext &C, unsigned Idx, AttrBuilder &B) { - SmallVector<AttributeWithIndex, 8> Attrs; - for (AttrBuilder::iterator I = B.begin(), E = B.end(); I != E; ++I) { - Attribute::AttrKind Kind = *I; - Attribute A = Attribute::get(C, Kind); + if (!B.hasAttributes()) + return AttributeSet(); + + // Add target-independent attributes. + SmallVector<std::pair<unsigned, Attribute>, 8> Attrs; + for (Attribute::AttrKind Kind = Attribute::None; + Kind != Attribute::EndAttrKinds; Kind = Attribute::AttrKind(Kind + 1)) { + if (!B.contains(Kind)) + continue; if (Kind == Attribute::Alignment) - A.setAlignment(B.getAlignment()); + Attrs.push_back(std::make_pair(Idx, Attribute:: + getWithAlignment(C, B.getAlignment()))); else if (Kind == Attribute::StackAlignment) - A.setStackAlignment(B.getStackAlignment()); - - Attrs.push_back(AttributeWithIndex::get(Idx, A)); + Attrs.push_back(std::make_pair(Idx, Attribute:: + getWithStackAlignment(C, B.getStackAlignment()))); + else + Attrs.push_back(std::make_pair(Idx, Attribute::get(C, Kind))); } + // Add target-dependent (string) attributes.
+ for (AttrBuilder::td_iterator I = B.td_begin(), E = B.td_end(); + I != E; ++I) + Attrs.push_back(std::make_pair(Idx, Attribute::get(C, I->first,I->second))); + + return get(C, Attrs); +} + +AttributeSet AttributeSet::get(LLVMContext &C, unsigned Idx, + ArrayRef<Attribute::AttrKind> Kind) { + SmallVector<std::pair<unsigned, Attribute>, 8> Attrs; + for (ArrayRef<Attribute::AttrKind>::iterator I = Kind.begin(), + E = Kind.end(); I != E; ++I) + Attrs.push_back(std::make_pair(Idx, Attribute::get(C, *I))); return get(C, Attrs); } +AttributeSet AttributeSet::get(LLVMContext &C, ArrayRef<AttributeSet> Attrs) { + if (Attrs.empty()) return AttributeSet(); + + SmallVector<std::pair<unsigned, AttributeSetNode*>, 8> AttrNodeVec; + for (unsigned I = 0, E = Attrs.size(); I != E; ++I) { + AttributeSet AS = Attrs[I]; + if (!AS.pImpl) continue; + AttrNodeVec.append(AS.pImpl->AttrNodes.begin(), AS.pImpl->AttrNodes.end()); + } + + return getImpl(C, AttrNodeVec); +} + +AttributeSet AttributeSet::addAttribute(LLVMContext &C, unsigned Idx, + Attribute::AttrKind Attr) const { + return addAttributes(C, Idx, AttributeSet::get(C, Idx, Attr)); +} + +AttributeSet AttributeSet::addAttributes(LLVMContext &C, unsigned Idx, + AttributeSet Attrs) const { + if (!pImpl) return Attrs; + if (!Attrs.pImpl) return *this; + +#ifndef NDEBUG + // FIXME it is not obvious how this should work for alignment. For now, say + // we can't change a known alignment. + unsigned OldAlign = getParamAlignment(Idx); + unsigned NewAlign = Attrs.getParamAlignment(Idx); + assert((!OldAlign || !NewAlign || OldAlign == NewAlign) && + "Attempt to change alignment!"); +#endif + + // Add the attribute slots before the one we're trying to add. + SmallVector<AttributeSet, 4> AttrSet; + uint64_t NumAttrs = pImpl->getNumAttributes(); + AttributeSet AS; + uint64_t LastIndex = 0; + for (unsigned I = 0, E = NumAttrs; I != E; ++I) { + if (getSlotIndex(I) >= Idx) { + if (getSlotIndex(I) == Idx) AS = getSlotAttributes(LastIndex++); + break; + } + LastIndex = I + 1; + AttrSet.push_back(getSlotAttributes(I)); + } + + // Now add the attribute into the correct slot. There may already be an + // AttributeSet there. + AttrBuilder B(AS, Idx); + + for (unsigned I = 0, E = Attrs.pImpl->getNumAttributes(); I != E; ++I) + if (Attrs.getSlotIndex(I) == Idx) { + for (AttributeSetImpl::const_iterator II = Attrs.pImpl->begin(I), + IE = Attrs.pImpl->end(I); II != IE; ++II) + B.addAttribute(*II); + break; + } + + AttrSet.push_back(AttributeSet::get(C, Idx, B)); + + // Add the remaining attribute slots. + for (unsigned I = LastIndex, E = NumAttrs; I < E; ++I) + AttrSet.push_back(getSlotAttributes(I)); + + return get(C, AttrSet); +} + +AttributeSet AttributeSet::removeAttribute(LLVMContext &C, unsigned Idx, + Attribute::AttrKind Attr) const { + return removeAttributes(C, Idx, AttributeSet::get(C, Idx, Attr)); +} + +AttributeSet AttributeSet::removeAttributes(LLVMContext &C, unsigned Idx, + AttributeSet Attrs) const { + if (!pImpl) return AttributeSet(); + if (!Attrs.pImpl) return *this; + +#ifndef NDEBUG + // FIXME it is not obvious how this should work for alignment. + // For now, say we can't pass in alignment, which no current use does. + assert(!Attrs.hasAttribute(Idx, Attribute::Alignment) && + "Attempt to change alignment!"); +#endif + + // Add the attribute slots before the one we're trying to add. 
+ SmallVector<AttributeSet, 4> AttrSet; + uint64_t NumAttrs = pImpl->getNumAttributes(); + AttributeSet AS; + uint64_t LastIndex = 0; + for (unsigned I = 0, E = NumAttrs; I != E; ++I) { + if (getSlotIndex(I) >= Idx) { + if (getSlotIndex(I) == Idx) AS = getSlotAttributes(LastIndex++); + break; + } + LastIndex = I + 1; + AttrSet.push_back(getSlotAttributes(I)); + } + + // Now remove the attribute from the correct slot. There may already be an + // AttributeSet there. + AttrBuilder B(AS, Idx); + + for (unsigned I = 0, E = Attrs.pImpl->getNumAttributes(); I != E; ++I) + if (Attrs.getSlotIndex(I) == Idx) { + B.removeAttributes(Attrs.pImpl->getSlotAttributes(I), Idx); + break; + } + + AttrSet.push_back(AttributeSet::get(C, Idx, B)); + + // Add the remaining attribute slots. + for (unsigned I = LastIndex, E = NumAttrs; I < E; ++I) + AttrSet.push_back(getSlotAttributes(I)); + + return get(C, AttrSet); +} + //===----------------------------------------------------------------------===// -// AttributeSet Method Implementations +// AttributeSet Accessor Methods //===----------------------------------------------------------------------===// -const AttributeSet &AttributeSet::operator=(const AttributeSet &RHS) { - AttrList = RHS.AttrList; - return *this; +LLVMContext &AttributeSet::getContext() const { + return pImpl->getContext(); } -/// getNumSlots - Return the number of slots used in this attribute list. -/// This is the number of arguments that have an attribute set on them -/// (including the function itself). -unsigned AttributeSet::getNumSlots() const { - return AttrList ? AttrList->getNumAttributes() : 0; +AttributeSet AttributeSet::getParamAttributes(unsigned Idx) const { + return pImpl && hasAttributes(Idx) ? + AttributeSet::get(pImpl->getContext(), + ArrayRef<std::pair<unsigned, AttributeSetNode*> >( + std::make_pair(Idx, getAttributes(Idx)))) : + AttributeSet(); } -/// getSlot - Return the AttributeWithIndex at the specified slot. This -/// holds a number plus a set of attributes. -const AttributeWithIndex &AttributeSet::getSlot(unsigned Slot) const { - assert(AttrList && Slot < AttrList->getNumAttributes() && - "Slot # out of range!"); - return AttrList->getAttributes()[Slot]; +AttributeSet AttributeSet::getRetAttributes() const { + return pImpl && hasAttributes(ReturnIndex) ? + AttributeSet::get(pImpl->getContext(), + ArrayRef<std::pair<unsigned, AttributeSetNode*> >( + std::make_pair(ReturnIndex, + getAttributes(ReturnIndex)))) : + AttributeSet(); +} + +AttributeSet AttributeSet::getFnAttributes() const { + return pImpl && hasAttributes(FunctionIndex) ? + AttributeSet::get(pImpl->getContext(), + ArrayRef<std::pair<unsigned, AttributeSetNode*> >( + std::make_pair(FunctionIndex, + getAttributes(FunctionIndex)))) : + AttributeSet(); } bool AttributeSet::hasAttribute(unsigned Index, Attribute::AttrKind Kind) const{ - return getAttributes(Index).hasAttribute(Kind); + AttributeSetNode *ASN = getAttributes(Index); + return ASN ? ASN->hasAttribute(Kind) : false; +} + +bool AttributeSet::hasAttribute(unsigned Index, StringRef Kind) const { + AttributeSetNode *ASN = getAttributes(Index); + return ASN ? ASN->hasAttribute(Kind) : false; } bool AttributeSet::hasAttributes(unsigned Index) const { - return getAttributes(Index).hasAttributes(); + AttributeSetNode *ASN = getAttributes(Index); + return ASN ? ASN->hasAttributes() : false; +} + +/// \brief Return true if the specified attribute is set for at least one +/// parameter or for the return value. 
+bool AttributeSet::hasAttrSomewhere(Attribute::AttrKind Attr) const { + if (pImpl == 0) return false; + + for (unsigned I = 0, E = pImpl->getNumAttributes(); I != E; ++I) + for (AttributeSetImpl::const_iterator II = pImpl->begin(I), + IE = pImpl->end(I); II != IE; ++II) + if (II->hasAttribute(Attr)) + return true; + + return false; } -std::string AttributeSet::getAsString(unsigned Index) const { - return getAttributes(Index).getAsString(); +Attribute AttributeSet::getAttribute(unsigned Index, + Attribute::AttrKind Kind) const { + AttributeSetNode *ASN = getAttributes(Index); + return ASN ? ASN->getAttribute(Kind) : Attribute(); +} + +Attribute AttributeSet::getAttribute(unsigned Index, + StringRef Kind) const { + AttributeSetNode *ASN = getAttributes(Index); + return ASN ? ASN->getAttribute(Kind) : Attribute(); +} + +unsigned AttributeSet::getParamAlignment(unsigned Index) const { + AttributeSetNode *ASN = getAttributes(Index); + return ASN ? ASN->getAlignment() : 0; } unsigned AttributeSet::getStackAlignment(unsigned Index) const { - return getAttributes(Index).getStackAlignment(); + AttributeSetNode *ASN = getAttributes(Index); + return ASN ? ASN->getStackAlignment() : 0; +} + +std::string AttributeSet::getAsString(unsigned Index, + bool InAttrGrp) const { + AttributeSetNode *ASN = getAttributes(Index); + return ASN ? ASN->getAsString(InAttrGrp) : std::string(""); +} + +/// \brief The attributes for the specified index are returned. +AttributeSetNode *AttributeSet::getAttributes(unsigned Idx) const { + if (!pImpl) return 0; + + // Loop through to find the attribute node we want. + for (unsigned I = 0, E = pImpl->getNumAttributes(); I != E; ++I) + if (pImpl->getSlotIndex(I) == Idx) + return pImpl->getSlotNode(I); + + return 0; +} + +AttributeSet::iterator AttributeSet::begin(unsigned Idx) const { + if (!pImpl) + return ArrayRef<Attribute>().begin(); + return pImpl->begin(Idx); +} + +AttributeSet::iterator AttributeSet::end(unsigned Idx) const { + if (!pImpl) + return ArrayRef<Attribute>().end(); + return pImpl->end(Idx); +} + +//===----------------------------------------------------------------------===// +// AttributeSet Introspection Methods +//===----------------------------------------------------------------------===// + +/// \brief Return the number of slots used in this attribute list. This is the +/// number of arguments that have an attribute set on them (including the +/// function itself). +unsigned AttributeSet::getNumSlots() const { + return pImpl ? pImpl->getNumAttributes() : 0; +} + +uint64_t AttributeSet::getSlotIndex(unsigned Slot) const { + assert(pImpl && Slot < pImpl->getNumAttributes() && + "Slot # out of range!"); + return pImpl->getSlotIndex(Slot); +} + +AttributeSet AttributeSet::getSlotAttributes(unsigned Slot) const { + assert(pImpl && Slot < pImpl->getNumAttributes() && + "Slot # out of range!"); + return pImpl->getSlotAttributes(Slot); } -uint64_t AttributeSet::getBitMask(unsigned Index) const { +uint64_t AttributeSet::Raw(unsigned Index) const { // FIXME: Remove this. - return getAttributes(Index).getBitMask(); + return pImpl ? pImpl->Raw(Index) : 0; } -/// getAttributes - The attributes for the specified index are returned. -/// Attributes for the result are denoted with Idx = 0. Function attributes are -/// denoted with Idx = ~0. 
-Attribute AttributeSet::getAttributes(unsigned Idx) const { - if (AttrList == 0) return Attribute(); +void AttributeSet::dump() const { + dbgs() << "PAL[\n"; - ArrayRef<AttributeWithIndex> Attrs = AttrList->getAttributes(); - for (unsigned i = 0, e = Attrs.size(); i != e && Attrs[i].Index <= Idx; ++i) - if (Attrs[i].Index == Idx) - return Attrs[i].Attrs; + for (unsigned i = 0, e = getNumSlots(); i < e; ++i) { + uint64_t Index = getSlotIndex(i); + dbgs() << " { "; + if (Index == ~0U) + dbgs() << "~0U"; + else + dbgs() << Index; + dbgs() << " => " << getAsString(Index) << " }\n"; + } - return Attribute(); + dbgs() << "]\n"; } -/// hasAttrSomewhere - Return true if the specified attribute is set for at -/// least one parameter or for the return value. -bool AttributeSet::hasAttrSomewhere(Attribute::AttrKind Attr) const { - if (AttrList == 0) return false; +//===----------------------------------------------------------------------===// +// AttrBuilder Method Implementations +//===----------------------------------------------------------------------===// - ArrayRef<AttributeWithIndex> Attrs = AttrList->getAttributes(); - for (unsigned i = 0, e = Attrs.size(); i != e; ++i) - if (Attrs[i].Attrs.hasAttribute(Attr)) - return true; +AttrBuilder::AttrBuilder(AttributeSet AS, unsigned Idx) + : Attrs(0), Alignment(0), StackAlignment(0) { + AttributeSetImpl *pImpl = AS.pImpl; + if (!pImpl) return; - return false; + for (unsigned I = 0, E = pImpl->getNumAttributes(); I != E; ++I) { + if (pImpl->getSlotIndex(I) != Idx) continue; + + for (AttributeSetImpl::const_iterator II = pImpl->begin(I), + IE = pImpl->end(I); II != IE; ++II) + addAttribute(*II); + + break; + } } -AttributeSet AttributeSet::addAttr(LLVMContext &C, unsigned Idx, - Attribute Attrs) const { - Attribute OldAttrs = getAttributes(Idx); -#ifndef NDEBUG - // FIXME it is not obvious how this should work for alignment. - // For now, say we can't change a known alignment. - unsigned OldAlign = OldAttrs.getAlignment(); - unsigned NewAlign = Attrs.getAlignment(); - assert((!OldAlign || !NewAlign || OldAlign == NewAlign) && - "Attempt to change alignment!"); -#endif +void AttrBuilder::clear() { + Attrs.reset(); + Alignment = StackAlignment = 0; +} - AttrBuilder NewAttrs = - AttrBuilder(OldAttrs).addAttributes(Attrs); - if (NewAttrs == AttrBuilder(OldAttrs)) +AttrBuilder &AttrBuilder::addAttribute(Attribute::AttrKind Val) { + assert((unsigned)Val < Attribute::EndAttrKinds && "Attribute out of range!"); + assert(Val != Attribute::Alignment && Val != Attribute::StackAlignment && + "Adding alignment attribute without adding alignment value!"); + Attrs[Val] = true; + return *this; +} + +AttrBuilder &AttrBuilder::addAttribute(Attribute Attr) { + if (Attr.isStringAttribute()) { + addAttribute(Attr.getKindAsString(), Attr.getValueAsString()); return *this; + } + + Attribute::AttrKind Kind = Attr.getKindAsEnum(); + Attrs[Kind] = true; + + if (Kind == Attribute::Alignment) + Alignment = Attr.getAlignment(); + else if (Kind == Attribute::StackAlignment) + StackAlignment = Attr.getStackAlignment(); + return *this; +} + +AttrBuilder &AttrBuilder::addAttribute(StringRef A, StringRef V) { + TargetDepAttrs[A] = V; + return *this; +} - SmallVector<AttributeWithIndex, 8> NewAttrList; - if (AttrList == 0) - NewAttrList.push_back(AttributeWithIndex::get(Idx, Attrs)); - else { - ArrayRef<AttributeWithIndex> OldAttrList = AttrList->getAttributes(); - unsigned i = 0, e = OldAttrList.size(); - // Copy attributes for arguments before this one. 
- for (; i != e && OldAttrList[i].Index < Idx; ++i) - NewAttrList.push_back(OldAttrList[i]); - - // If there are attributes already at this index, merge them in. - if (i != e && OldAttrList[i].Index == Idx) { - Attrs = - Attribute::get(C, AttrBuilder(Attrs). - addAttributes(OldAttrList[i].Attrs)); - ++i; +AttrBuilder &AttrBuilder::removeAttribute(Attribute::AttrKind Val) { + assert((unsigned)Val < Attribute::EndAttrKinds && "Attribute out of range!"); + Attrs[Val] = false; + + if (Val == Attribute::Alignment) + Alignment = 0; + else if (Val == Attribute::StackAlignment) + StackAlignment = 0; + + return *this; +} + +AttrBuilder &AttrBuilder::removeAttributes(AttributeSet A, uint64_t Index) { + unsigned Idx = ~0U; + for (unsigned I = 0, E = A.getNumSlots(); I != E; ++I) + if (A.getSlotIndex(I) == Index) { + Idx = I; + break; } - NewAttrList.push_back(AttributeWithIndex::get(Idx, Attrs)); + assert(Idx != ~0U && "Couldn't find index in AttributeSet!"); + + for (AttributeSet::iterator I = A.begin(Idx), E = A.end(Idx); I != E; ++I) { + Attribute Attr = *I; + if (Attr.isEnumAttribute() || Attr.isAlignAttribute()) { + Attribute::AttrKind Kind = I->getKindAsEnum(); + Attrs[Kind] = false; - // Copy attributes for arguments after this one. - NewAttrList.insert(NewAttrList.end(), - OldAttrList.begin()+i, OldAttrList.end()); + if (Kind == Attribute::Alignment) + Alignment = 0; + else if (Kind == Attribute::StackAlignment) + StackAlignment = 0; + } else { + assert(Attr.isStringAttribute() && "Invalid attribute type!"); + std::map<std::string, std::string>::iterator + Iter = TargetDepAttrs.find(Attr.getKindAsString()); + if (Iter != TargetDepAttrs.end()) + TargetDepAttrs.erase(Iter); + } } - return get(C, NewAttrList); + return *this; } -AttributeSet AttributeSet::removeAttr(LLVMContext &C, unsigned Idx, - Attribute Attrs) const { -#ifndef NDEBUG - // FIXME it is not obvious how this should work for alignment. - // For now, say we can't pass in alignment, which no current use does. - assert(!Attrs.hasAttribute(Attribute::Alignment) && - "Attempt to exclude alignment!"); -#endif - if (AttrList == 0) return AttributeSet(); +AttrBuilder &AttrBuilder::removeAttribute(StringRef A) { + std::map<std::string, std::string>::iterator I = TargetDepAttrs.find(A); + if (I != TargetDepAttrs.end()) + TargetDepAttrs.erase(I); + return *this; +} - Attribute OldAttrs = getAttributes(Idx); - AttrBuilder NewAttrs = - AttrBuilder(OldAttrs).removeAttributes(Attrs); - if (NewAttrs == AttrBuilder(OldAttrs)) - return *this; +AttrBuilder &AttrBuilder::addAlignmentAttr(unsigned Align) { + if (Align == 0) return *this; + + assert(isPowerOf2_32(Align) && "Alignment must be a power of two."); + assert(Align <= 0x40000000 && "Alignment too large."); + + Attrs[Attribute::Alignment] = true; + Alignment = Align; + return *this; +} + +AttrBuilder &AttrBuilder::addStackAlignmentAttr(unsigned Align) { + // Default alignment, allow the target to define how to align it. + if (Align == 0) return *this; - SmallVector<AttributeWithIndex, 8> NewAttrList; - ArrayRef<AttributeWithIndex> OldAttrList = AttrList->getAttributes(); - unsigned i = 0, e = OldAttrList.size(); + assert(isPowerOf2_32(Align) && "Alignment must be a power of two."); + assert(Align <= 0x100 && "Alignment too large."); - // Copy attributes for arguments before this one. 
- for (; i != e && OldAttrList[i].Index < Idx; ++i) - NewAttrList.push_back(OldAttrList[i]); + Attrs[Attribute::StackAlignment] = true; + StackAlignment = Align; + return *this; +} - // If there are attributes already at this index, merge them in. - assert(OldAttrList[i].Index == Idx && "Attribute isn't set?"); - Attrs = Attribute::get(C, AttrBuilder(OldAttrList[i].Attrs). - removeAttributes(Attrs)); - ++i; - if (Attrs.hasAttributes()) // If any attributes left for this param, add them. - NewAttrList.push_back(AttributeWithIndex::get(Idx, Attrs)); +AttrBuilder &AttrBuilder::merge(const AttrBuilder &B) { + // FIXME: What if both have alignments, but they don't match?! + if (!Alignment) + Alignment = B.Alignment; - // Copy attributes for arguments after this one. - NewAttrList.insert(NewAttrList.end(), - OldAttrList.begin()+i, OldAttrList.end()); + if (!StackAlignment) + StackAlignment = B.StackAlignment; - return get(C, NewAttrList); + Attrs |= B.Attrs; + + for (td_const_iterator I = B.TargetDepAttrs.begin(), + E = B.TargetDepAttrs.end(); I != E; ++I) + TargetDepAttrs[I->first] = I->second; + + return *this; } -void AttributeSet::dump() const { - dbgs() << "PAL[ "; - for (unsigned i = 0; i < getNumSlots(); ++i) { - const AttributeWithIndex &PAWI = getSlot(i); - dbgs() << "{ " << PAWI.Index << ", " << PAWI.Attrs.getAsString() << " } "; +bool AttrBuilder::contains(StringRef A) const { + return TargetDepAttrs.find(A) != TargetDepAttrs.end(); +} + +bool AttrBuilder::hasAttributes() const { + return !Attrs.none() || !TargetDepAttrs.empty(); +} + +bool AttrBuilder::hasAttributes(AttributeSet A, uint64_t Index) const { + unsigned Idx = ~0U; + for (unsigned I = 0, E = A.getNumSlots(); I != E; ++I) + if (A.getSlotIndex(I) == Index) { + Idx = I; + break; + } + + assert(Idx != ~0U && "Couldn't find the index!"); + + for (AttributeSet::iterator I = A.begin(Idx), E = A.end(Idx); + I != E; ++I) { + Attribute Attr = *I; + if (Attr.isEnumAttribute() || Attr.isAlignAttribute()) { + if (Attrs[I->getKindAsEnum()]) + return true; + } else { + assert(Attr.isStringAttribute() && "Invalid attribute kind!"); + return TargetDepAttrs.find(Attr.getKindAsString())!=TargetDepAttrs.end(); + } } - dbgs() << "]\n"; + return false; +} + +bool AttrBuilder::hasAlignmentAttr() const { + return Alignment != 0; +} + +bool AttrBuilder::operator==(const AttrBuilder &B) { + if (Attrs != B.Attrs) + return false; + + for (td_const_iterator I = TargetDepAttrs.begin(), + E = TargetDepAttrs.end(); I != E; ++I) + if (B.TargetDepAttrs.find(I->first) == B.TargetDepAttrs.end()) + return false; + + return Alignment == B.Alignment && StackAlignment == B.StackAlignment; +} + +void AttrBuilder::removeFunctionOnlyAttrs() { + removeAttribute(Attribute::NoReturn) + .removeAttribute(Attribute::NoUnwind) + .removeAttribute(Attribute::ReadNone) + .removeAttribute(Attribute::ReadOnly) + .removeAttribute(Attribute::NoInline) + .removeAttribute(Attribute::AlwaysInline) + .removeAttribute(Attribute::OptimizeForSize) + .removeAttribute(Attribute::StackProtect) + .removeAttribute(Attribute::StackProtectReq) + .removeAttribute(Attribute::StackProtectStrong) + .removeAttribute(Attribute::NoRedZone) + .removeAttribute(Attribute::NoImplicitFloat) + .removeAttribute(Attribute::Naked) + .removeAttribute(Attribute::InlineHint) + .removeAttribute(Attribute::StackAlignment) + .removeAttribute(Attribute::UWTable) + .removeAttribute(Attribute::NonLazyBind) + .removeAttribute(Attribute::ReturnsTwice) + .removeAttribute(Attribute::SanitizeAddress) + 
.removeAttribute(Attribute::SanitizeThread) + .removeAttribute(Attribute::SanitizeMemory) + .removeAttribute(Attribute::MinSize) + .removeAttribute(Attribute::NoDuplicate) + .removeAttribute(Attribute::NoBuiltin); +} + +AttrBuilder &AttrBuilder::addRawValue(uint64_t Val) { + // FIXME: Remove this in 4.0. + if (!Val) return *this; + + for (Attribute::AttrKind I = Attribute::None; I != Attribute::EndAttrKinds; + I = Attribute::AttrKind(I + 1)) { + if (uint64_t A = (Val & AttributeImpl::getAttrMask(I))) { + Attrs[I] = true; + + if (I == Attribute::Alignment) + Alignment = 1ULL << ((A >> 16) - 1); + else if (I == Attribute::StackAlignment) + StackAlignment = 1ULL << ((A >> 26)-1); + } + } + + return *this; +} + +//===----------------------------------------------------------------------===// +// AttributeFuncs Function Definitions +//===----------------------------------------------------------------------===// + +/// \brief Which attributes cannot be applied to a type. +AttributeSet AttributeFuncs::typeIncompatible(Type *Ty, uint64_t Index) { + AttrBuilder Incompatible; + + if (!Ty->isIntegerTy()) + // Attributes that only apply to integers. + Incompatible.addAttribute(Attribute::SExt) + .addAttribute(Attribute::ZExt); + + if (!Ty->isPointerTy()) + // Attributes that only apply to pointers. + Incompatible.addAttribute(Attribute::ByVal) + .addAttribute(Attribute::Nest) + .addAttribute(Attribute::NoAlias) + .addAttribute(Attribute::NoCapture) + .addAttribute(Attribute::StructRet); + + return AttributeSet::get(Ty->getContext(), Index, Incompatible); } diff --git a/lib/IR/ConstantFold.cpp b/lib/IR/ConstantFold.cpp index 0ffb24e..a5a9d9f 100644 --- a/lib/IR/ConstantFold.cpp +++ b/lib/IR/ConstantFold.cpp @@ -168,8 +168,8 @@ static Constant *FoldBitCast(Constant *V, Type *DestTy) { if (DestTy->isFloatingPointTy()) return ConstantFP::get(DestTy->getContext(), - APFloat(CI->getValue(), - !DestTy->isPPC_FP128Ty())); + APFloat(DestTy->getFltSemantics(), + CI->getValue())); // Otherwise, can't fold this (vector?) return 0; @@ -647,8 +647,8 @@ Constant *llvm::ConstantFoldCastInstruction(unsigned opc, Constant *V, case Instruction::SIToFP: if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) { APInt api = CI->getValue(); - APFloat apf(APInt::getNullValue(DestTy->getPrimitiveSizeInBits()), - !DestTy->isPPC_FP128Ty() /* isEEEE */); + APFloat apf(DestTy->getFltSemantics(), + APInt::getNullValue(DestTy->getPrimitiveSizeInBits())); (void)apf.convertFromAPInt(api, opc==Instruction::SIToFP, APFloat::rmNearestTiesToEven); @@ -846,8 +846,8 @@ Constant *llvm::ConstantFoldInsertValueInstruction(Constant *Agg, else if (ArrayType *AT = dyn_cast<ArrayType>(Agg->getType())) NumElts = AT->getNumElements(); else - NumElts = AT->getVectorNumElements(); - + NumElts = Agg->getType()->getVectorNumElements(); + SmallVector<Constant*, 32> Result; for (unsigned i = 0; i != NumElts; ++i) { Constant *C = Agg->getAggregateElement(i); @@ -1495,9 +1495,8 @@ static ICmpInst::Predicate evaluateICmpRelation(Constant *V1, Constant *V2, "Surprising getelementptr!"); return isSigned ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT; } else { - // If they are different globals, we don't know what the value is, - // but they can't be equal. - return ICmpInst::ICMP_NE; + // If they are different globals, we don't know what the value is.
+ return ICmpInst::BAD_ICMP_PREDICATE; } } } else { @@ -1510,10 +1509,10 @@ static ICmpInst::Predicate evaluateICmpRelation(Constant *V1, Constant *V2, default: break; case Instruction::GetElementPtr: // By far the most common case to handle is when the base pointers are - // obviously to the same or different globals. + // obviously to the same global. if (isa<GlobalValue>(CE1Op0) && isa<GlobalValue>(CE2Op0)) { - if (CE1Op0 != CE2Op0) // Don't know relative ordering, but not equal - return ICmpInst::ICMP_NE; + if (CE1Op0 != CE2Op0) // Don't know relative ordering. + return ICmpInst::BAD_ICMP_PREDICATE; // Ok, we know that both getelementptr instructions are based on the // same global. From this, we can precisely determine the relative // ordering of the resultant pointers. diff --git a/lib/IR/Constants.cpp b/lib/IR/Constants.cpp index 4b58599..8093a09 100644 --- a/lib/IR/Constants.cpp +++ b/lib/IR/Constants.cpp @@ -51,6 +51,17 @@ bool Constant::isNegativeZeroValue() const { return isNullValue(); } +// Return true iff this constant is positive zero (floating point), negative +// zero (floating point), or a null value. +bool Constant::isZeroValue() const { + // Floating point values have an explicit -0.0 value. + if (const ConstantFP *CFP = dyn_cast<ConstantFP>(this)) + return CFP->isZero(); + + // Otherwise, just use +0.0. + return isNullValue(); +} + bool Constant::isNullValue() const { // 0 is null. if (const ConstantInt *CI = dyn_cast<ConstantInt>(this)) @@ -108,7 +119,8 @@ Constant *Constant::getNullValue(Type *Ty) { APFloat::getZero(APFloat::IEEEquad)); case Type::PPC_FP128TyID: return ConstantFP::get(Ty->getContext(), - APFloat(APInt::getNullValue(128))); + APFloat(APFloat::PPCDoubleDouble, + APInt::getNullValue(128))); case Type::PointerTyID: return ConstantPointerNull::get(cast<PointerType>(Ty)); case Type::StructTyID: @@ -1454,10 +1466,11 @@ Constant *ConstantExpr::getTruncOrBitCast(Constant *C, Type *Ty) { } Constant *ConstantExpr::getPointerCast(Constant *S, Type *Ty) { - assert(S->getType()->isPointerTy() && "Invalid cast"); - assert((Ty->isIntegerTy() || Ty->isPointerTy()) && "Invalid cast"); + assert(S->getType()->isPtrOrPtrVectorTy() && "Invalid cast"); + assert((Ty->isIntOrIntVectorTy() || Ty->isPtrOrPtrVectorTy()) && + "Invalid cast"); - if (Ty->isIntegerTy()) + if (Ty->isIntOrIntVectorTy()) return getPtrToInt(S, Ty); return getBitCast(S, Ty); } diff --git a/lib/IR/Core.cpp b/lib/IR/Core.cpp index 2024ac9..983b49c 100644 --- a/lib/IR/Core.cpp +++ b/lib/IR/Core.cpp @@ -26,9 +26,11 @@ #include "llvm/Support/CallSite.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/ManagedStatic.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Support/system_error.h" +#include "llvm/Support/Threading.h" #include <cassert> #include <cstdlib> #include <cstring> @@ -39,6 +41,7 @@ void llvm::initializeCore(PassRegistry &Registry) { initializeDominatorTreePass(Registry); initializePrintModulePassPass(Registry); initializePrintFunctionPassPass(Registry); + initializePrintBasicBlockPassPass(Registry); initializeVerifierPass(Registry); initializePreVerifierPass(Registry); } @@ -47,6 +50,10 @@ void LLVMInitializeCore(LLVMPassRegistryRef R) { initializeCore(*unwrap(R)); } +void LLVMShutdown() { + llvm_shutdown(); +} + /*===-- Error handling ----------------------------------------------------===*/ void LLVMDisposeMessage(char *Message) { @@ -1383,8 +1390,9 @@ void LLVMAddFunctionAttr(LLVMValueRef Fn, 
LLVMAttribute PA) { const AttributeSet PAL = Func->getAttributes(); AttrBuilder B(PA); const AttributeSet PALnew = - PAL.addAttr(Func->getContext(), AttributeSet::FunctionIndex, - Attribute::get(Func->getContext(), B)); + PAL.addAttributes(Func->getContext(), AttributeSet::FunctionIndex, + AttributeSet::get(Func->getContext(), + AttributeSet::FunctionIndex, B)); Func->setAttributes(PALnew); } @@ -1393,15 +1401,16 @@ void LLVMRemoveFunctionAttr(LLVMValueRef Fn, LLVMAttribute PA) { const AttributeSet PAL = Func->getAttributes(); AttrBuilder B(PA); const AttributeSet PALnew = - PAL.removeAttr(Func->getContext(), AttributeSet::FunctionIndex, - Attribute::get(Func->getContext(), B)); + PAL.removeAttributes(Func->getContext(), AttributeSet::FunctionIndex, + AttributeSet::get(Func->getContext(), + AttributeSet::FunctionIndex, B)); Func->setAttributes(PALnew); } LLVMAttribute LLVMGetFunctionAttr(LLVMValueRef Fn) { Function *Func = unwrap<Function>(Fn); const AttributeSet PAL = Func->getAttributes(); - return (LLVMAttribute)PAL.getBitMask(AttributeSet::FunctionIndex); + return (LLVMAttribute)PAL.Raw(AttributeSet::FunctionIndex); } /*--.. Operations on parameters ............................................--*/ @@ -1465,27 +1474,27 @@ LLVMValueRef LLVMGetPreviousParam(LLVMValueRef Arg) { void LLVMAddAttribute(LLVMValueRef Arg, LLVMAttribute PA) { Argument *A = unwrap<Argument>(Arg); AttrBuilder B(PA); - A->addAttr(Attribute::get(A->getContext(), B)); + A->addAttr(AttributeSet::get(A->getContext(), A->getArgNo() + 1, B)); } void LLVMRemoveAttribute(LLVMValueRef Arg, LLVMAttribute PA) { Argument *A = unwrap<Argument>(Arg); AttrBuilder B(PA); - A->removeAttr(Attribute::get(A->getContext(), B)); + A->removeAttr(AttributeSet::get(A->getContext(), A->getArgNo() + 1, B)); } LLVMAttribute LLVMGetAttribute(LLVMValueRef Arg) { Argument *A = unwrap<Argument>(Arg); return (LLVMAttribute)A->getParent()->getAttributes(). - getBitMask(A->getArgNo()+1); + Raw(A->getArgNo()+1); } void LLVMSetParamAlignment(LLVMValueRef Arg, unsigned align) { + Argument *A = unwrap<Argument>(Arg); AttrBuilder B; B.addAlignmentAttr(align); - unwrap<Argument>(Arg)->addAttr(Attribute:: - get(unwrap<Argument>(Arg)->getContext(), B)); + A->addAttr(AttributeSet::get(A->getContext(),A->getArgNo() + 1, B)); } /*--.. 
Operations on basic blocks ..........................................--*/ @@ -1676,17 +1685,19 @@ void LLVMAddInstrAttribute(LLVMValueRef Instr, unsigned index, CallSite Call = CallSite(unwrap<Instruction>(Instr)); AttrBuilder B(PA); Call.setAttributes( - Call.getAttributes().addAttr(Call->getContext(), index, - Attribute::get(Call->getContext(), B))); + Call.getAttributes().addAttributes(Call->getContext(), index, + AttributeSet::get(Call->getContext(), + index, B))); } void LLVMRemoveInstrAttribute(LLVMValueRef Instr, unsigned index, LLVMAttribute PA) { CallSite Call = CallSite(unwrap<Instruction>(Instr)); AttrBuilder B(PA); - Call.setAttributes( - Call.getAttributes().removeAttr(Call->getContext(), index, - Attribute::get(Call->getContext(), B))); + Call.setAttributes(Call.getAttributes() + .removeAttributes(Call->getContext(), index, + AttributeSet::get(Call->getContext(), + index, B))); } void LLVMSetInstrParamAlignment(LLVMValueRef Instr, unsigned index, @@ -1694,8 +1705,10 @@ void LLVMSetInstrParamAlignment(LLVMValueRef Instr, unsigned index, CallSite Call = CallSite(unwrap<Instruction>(Instr)); AttrBuilder B; B.addAlignmentAttr(align); - Call.setAttributes(Call.getAttributes().addAttr(Call->getContext(), index, - Attribute::get(Call->getContext(), B))); + Call.setAttributes(Call.getAttributes() + .addAttributes(Call->getContext(), index, + AttributeSet::get(Call->getContext(), + index, B))); } /*--.. Operations on call instructions (only) ..............................--*/ @@ -2362,6 +2375,29 @@ LLVMBool LLVMCreateMemoryBufferWithSTDIN(LLVMMemoryBufferRef *OutMemBuf, return 1; } +LLVMMemoryBufferRef LLVMCreateMemoryBufferWithMemoryRange( + const char *InputData, + size_t InputDataLength, + const char *BufferName, + LLVMBool RequiresNullTerminator) { + + return wrap(MemoryBuffer::getMemBuffer( + StringRef(InputData, InputDataLength), + StringRef(BufferName), + RequiresNullTerminator)); +} + +LLVMMemoryBufferRef LLVMCreateMemoryBufferWithMemoryRangeCopy( + const char *InputData, + size_t InputDataLength, + const char *BufferName) { + + return wrap(MemoryBuffer::getMemBufferCopy( + StringRef(InputData, InputDataLength), + StringRef(BufferName))); +} + + void LLVMDisposeMemoryBuffer(LLVMMemoryBufferRef MemBuf) { delete unwrap(MemBuf); } @@ -2406,3 +2442,17 @@ LLVMBool LLVMFinalizeFunctionPassManager(LLVMPassManagerRef FPM) { void LLVMDisposePassManager(LLVMPassManagerRef PM) { delete unwrap(PM); } + +/*===-- Threading ------------------------------------------------------===*/ + +LLVMBool LLVMStartMultithreaded() { + return llvm_start_multithreaded(); +} + +void LLVMStopMultithreaded() { + llvm_stop_multithreaded(); +} + +LLVMBool LLVMIsMultithreaded() { + return llvm_is_multithreaded(); +} diff --git a/lib/IR/DIBuilder.cpp b/lib/IR/DIBuilder.cpp index bd7f0e3..f31e531 100644 --- a/lib/IR/DIBuilder.cpp +++ b/lib/IR/DIBuilder.cpp @@ -76,7 +76,7 @@ static MDNode *getNonCompileUnitScope(MDNode *N) { void DIBuilder::createCompileUnit(unsigned Lang, StringRef Filename, StringRef Directory, StringRef Producer, bool isOptimized, StringRef Flags, - unsigned RunTimeVer) { + unsigned RunTimeVer, StringRef SplitName) { assert(((Lang <= dwarf::DW_LANG_Python && Lang >= dwarf::DW_LANG_C89) || (Lang <= dwarf::DW_LANG_hi_user && Lang >= dwarf::DW_LANG_lo_user)) && "Invalid Language tag"); @@ -84,20 +84,12 @@ void DIBuilder::createCompileUnit(unsigned Lang, StringRef Filename, "Unable to create compile unit without filename"); Value *TElts[] = { GetTagConstant(VMContext, DW_TAG_base_type) }; 
TempEnumTypes = MDNode::getTemporary(VMContext, TElts); - Value *THElts[] = { TempEnumTypes }; - MDNode *EnumHolder = MDNode::get(VMContext, THElts); TempRetainTypes = MDNode::getTemporary(VMContext, TElts); - Value *TRElts[] = { TempRetainTypes }; - MDNode *RetainHolder = MDNode::get(VMContext, TRElts); TempSubprograms = MDNode::getTemporary(VMContext, TElts); - Value *TSElts[] = { TempSubprograms }; - MDNode *SPHolder = MDNode::get(VMContext, TSElts); TempGVs = MDNode::getTemporary(VMContext, TElts); - Value *TVElts[] = { TempGVs }; - MDNode *GVHolder = MDNode::get(VMContext, TVElts); Value *Elts[] = { GetTagConstant(VMContext, dwarf::DW_TAG_compile_unit), @@ -106,15 +98,16 @@ void DIBuilder::createCompileUnit(unsigned Lang, StringRef Filename, MDString::get(VMContext, Filename), MDString::get(VMContext, Directory), MDString::get(VMContext, Producer), - // Deprecate isMain field. + // isMain field can be removed when we remove the legacy debug info. ConstantInt::get(Type::getInt1Ty(VMContext), true), // isMain ConstantInt::get(Type::getInt1Ty(VMContext), isOptimized), MDString::get(VMContext, Flags), ConstantInt::get(Type::getInt32Ty(VMContext), RunTimeVer), - EnumHolder, - RetainHolder, - SPHolder, - GVHolder + TempEnumTypes, + TempRetainTypes, + TempSubprograms, + TempGVs, + MDString::get(VMContext, SplitName) }; TheCU = DICompileUnit(MDNode::get(VMContext, Elts)); @@ -170,9 +163,9 @@ DIType DIBuilder::createNullPtrType(StringRef Name) { /// createBasicType - Create debugging information entry for a basic /// type, e.g 'char'. -DIType DIBuilder::createBasicType(StringRef Name, uint64_t SizeInBits, - uint64_t AlignInBits, - unsigned Encoding) { +DIBasicType +DIBuilder::createBasicType(StringRef Name, uint64_t SizeInBits, + uint64_t AlignInBits, unsigned Encoding) { assert(!Name.empty() && "Unable to create type without name"); // Basic types are encoded in DIBasicType format. Line number, filename, // offset and flags are always empty here. @@ -188,12 +181,12 @@ DIType DIBuilder::createBasicType(StringRef Name, uint64_t SizeInBits, ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Flags; ConstantInt::get(Type::getInt32Ty(VMContext), Encoding) }; - return DIType(MDNode::get(VMContext, Elts)); + return DIBasicType(MDNode::get(VMContext, Elts)); } /// createQualifiedType - Create debugging information entry for a qualified /// type, e.g. 'const int'. -DIType DIBuilder::createQualifiedType(unsigned Tag, DIType FromTy) { +DIDerivedType DIBuilder::createQualifiedType(unsigned Tag, DIType FromTy) { // Qualified types are encoded in DIDerivedType format. Value *Elts[] = { GetTagConstant(VMContext, Tag), @@ -207,12 +200,13 @@ DIType DIBuilder::createQualifiedType(unsigned Tag, DIType FromTy) { ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Flags FromTy }; - return DIType(MDNode::get(VMContext, Elts)); + return DIDerivedType(MDNode::get(VMContext, Elts)); } /// createPointerType - Create debugging information entry for a pointer. -DIType DIBuilder::createPointerType(DIType PointeeTy, uint64_t SizeInBits, - uint64_t AlignInBits, StringRef Name) { +DIDerivedType +DIBuilder::createPointerType(DIType PointeeTy, uint64_t SizeInBits, + uint64_t AlignInBits, StringRef Name) { // Pointer types are encoded in DIDerivedType format. 
Value *Elts[] = { GetTagConstant(VMContext, dwarf::DW_TAG_pointer_type), @@ -226,10 +220,10 @@ DIType DIBuilder::createPointerType(DIType PointeeTy, uint64_t SizeInBits, ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Flags PointeeTy }; - return DIType(MDNode::get(VMContext, Elts)); + return DIDerivedType(MDNode::get(VMContext, Elts)); } -DIType DIBuilder::createMemberPointerType(DIType PointeeTy, DIType Base) { +DIDerivedType DIBuilder::createMemberPointerType(DIType PointeeTy, DIType Base) { // Pointer types are encoded in DIDerivedType format. Value *Elts[] = { GetTagConstant(VMContext, dwarf::DW_TAG_ptr_to_member_type), @@ -244,12 +238,12 @@ DIType DIBuilder::createMemberPointerType(DIType PointeeTy, DIType Base) { PointeeTy, Base }; - return DIType(MDNode::get(VMContext, Elts)); + return DIDerivedType(MDNode::get(VMContext, Elts)); } /// createReferenceType - Create debugging information entry for a reference /// type. -DIType DIBuilder::createReferenceType(unsigned Tag, DIType RTy) { +DIDerivedType DIBuilder::createReferenceType(unsigned Tag, DIType RTy) { assert(RTy.Verify() && "Unable to create reference type"); // References are encoded in DIDerivedType format. Value *Elts[] = { @@ -264,12 +258,12 @@ DIType DIBuilder::createReferenceType(unsigned Tag, DIType RTy) { ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Flags RTy }; - return DIType(MDNode::get(VMContext, Elts)); + return DIDerivedType(MDNode::get(VMContext, Elts)); } /// createTypedef - Create debugging information entry for a typedef. -DIType DIBuilder::createTypedef(DIType Ty, StringRef Name, DIFile File, - unsigned LineNo, DIDescriptor Context) { +DIDerivedType DIBuilder::createTypedef(DIType Ty, StringRef Name, DIFile File, + unsigned LineNo, DIDescriptor Context) { // typedefs are encoded in DIDerivedType format. assert(Ty.Verify() && "Invalid typedef type!"); Value *Elts[] = { @@ -284,7 +278,7 @@ DIType DIBuilder::createTypedef(DIType Ty, StringRef Name, DIFile File, ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Flags Ty }; - return DIType(MDNode::get(VMContext, Elts)); + return DIDerivedType(MDNode::get(VMContext, Elts)); } /// createFriend - Create debugging information entry for a 'friend'. @@ -309,8 +303,8 @@ DIType DIBuilder::createFriend(DIType Ty, DIType FriendTy) { /// createInheritance - Create debugging information entry to establish /// inheritance relationship between two types. -DIType DIBuilder::createInheritance(DIType Ty, DIType BaseTy, - uint64_t BaseOffset, unsigned Flags) { +DIDerivedType DIBuilder::createInheritance( + DIType Ty, DIType BaseTy, uint64_t BaseOffset, unsigned Flags) { assert(Ty.Verify() && "Unable to create inheritance"); // TAG_inheritance is encoded in DIDerivedType format. Value *Elts[] = { @@ -325,15 +319,14 @@ DIType DIBuilder::createInheritance(DIType Ty, DIType BaseTy, ConstantInt::get(Type::getInt32Ty(VMContext), Flags), BaseTy }; - return DIType(MDNode::get(VMContext, Elts)); + return DIDerivedType(MDNode::get(VMContext, Elts)); } /// createMemberType - Create debugging information entry for a member. 
-DIType DIBuilder::createMemberType(DIDescriptor Scope, StringRef Name, - DIFile File, unsigned LineNumber, - uint64_t SizeInBits, uint64_t AlignInBits, - uint64_t OffsetInBits, unsigned Flags, - DIType Ty) { +DIDerivedType DIBuilder::createMemberType( + DIDescriptor Scope, StringRef Name, DIFile File, unsigned LineNumber, + uint64_t SizeInBits, uint64_t AlignInBits, uint64_t OffsetInBits, + unsigned Flags, DIType Ty) { // TAG_member is encoded in DIDerivedType format. Value *Elts[] = { GetTagConstant(VMContext, dwarf::DW_TAG_member), @@ -347,6 +340,30 @@ DIType DIBuilder::createMemberType(DIDescriptor Scope, StringRef Name, ConstantInt::get(Type::getInt32Ty(VMContext), Flags), Ty }; + return DIDerivedType(MDNode::get(VMContext, Elts)); +} + +/// createStaticMemberType - Create debugging information entry for a +/// C++ static data member. +DIType DIBuilder::createStaticMemberType(DIDescriptor Scope, StringRef Name, + DIFile File, unsigned LineNumber, + DIType Ty, unsigned Flags, + llvm::Value *Val) { + // TAG_member is encoded in DIDerivedType format. + Flags |= DIDescriptor::FlagStaticMember; + Value *Elts[] = { + GetTagConstant(VMContext, dwarf::DW_TAG_member), + getNonCompileUnitScope(Scope), + MDString::get(VMContext, Name), + File, + ConstantInt::get(Type::getInt32Ty(VMContext), LineNumber), + ConstantInt::get(Type::getInt64Ty(VMContext), 0/*SizeInBits*/), + ConstantInt::get(Type::getInt64Ty(VMContext), 0/*AlignInBits*/), + ConstantInt::get(Type::getInt64Ty(VMContext), 0/*OffsetInBits*/), + ConstantInt::get(Type::getInt32Ty(VMContext), Flags), + Ty, + Val + }; return DIType(MDNode::get(VMContext, Elts)); } @@ -491,11 +508,15 @@ DIType DIBuilder::createClassType(DIDescriptor Context, StringRef Name, } /// createStructType - Create debugging information entry for a struct. -DIType DIBuilder::createStructType(DIDescriptor Context, StringRef Name, - DIFile File, unsigned LineNumber, - uint64_t SizeInBits, uint64_t AlignInBits, - unsigned Flags, DIArray Elements, - unsigned RunTimeLang) { +DICompositeType DIBuilder::createStructType(DIDescriptor Context, + StringRef Name, DIFile File, + unsigned LineNumber, + uint64_t SizeInBits, + uint64_t AlignInBits, + unsigned Flags, DIType DerivedFrom, + DIArray Elements, + unsigned RunTimeLang, + MDNode *VTableHolder) { // TAG_structure_type is encoded in DICompositeType format. Value *Elts[] = { GetTagConstant(VMContext, dwarf::DW_TAG_structure_type), @@ -507,21 +528,20 @@ DIType DIBuilder::createStructType(DIDescriptor Context, StringRef Name, ConstantInt::get(Type::getInt64Ty(VMContext), AlignInBits), ConstantInt::get(Type::getInt32Ty(VMContext), 0), ConstantInt::get(Type::getInt32Ty(VMContext), Flags), - NULL, + DerivedFrom, Elements, ConstantInt::get(Type::getInt32Ty(VMContext), RunTimeLang), - ConstantInt::get(Type::getInt32Ty(VMContext), 0), - ConstantInt::get(Type::getInt32Ty(VMContext), 0), + VTableHolder, + NULL, }; - return DIType(MDNode::get(VMContext, Elts)); + return DICompositeType(MDNode::get(VMContext, Elts)); } /// createUnionType - Create debugging information entry for an union. 
-DIType DIBuilder::createUnionType(DIDescriptor Scope, StringRef Name, - DIFile File, - unsigned LineNumber, uint64_t SizeInBits, - uint64_t AlignInBits, unsigned Flags, - DIArray Elements, unsigned RunTimeLang) { +DICompositeType DIBuilder::createUnionType( + DIDescriptor Scope, StringRef Name, DIFile File, unsigned LineNumber, + uint64_t SizeInBits, uint64_t AlignInBits, unsigned Flags, DIArray Elements, + unsigned RunTimeLang) { // TAG_union_type is encoded in DICompositeType format. Value *Elts[] = { GetTagConstant(VMContext, dwarf::DW_TAG_union_type), @@ -538,11 +558,12 @@ DIType DIBuilder::createUnionType(DIDescriptor Scope, StringRef Name, ConstantInt::get(Type::getInt32Ty(VMContext), RunTimeLang), Constant::getNullValue(Type::getInt32Ty(VMContext)) }; - return DIType(MDNode::get(VMContext, Elts)); + return DICompositeType(MDNode::get(VMContext, Elts)); } /// createSubroutineType - Create subroutine type. -DIType DIBuilder::createSubroutineType(DIFile File, DIArray ParameterTypes) { +DICompositeType +DIBuilder::createSubroutineType(DIFile File, DIArray ParameterTypes) { // TAG_subroutine_type is encoded in DICompositeType format. Value *Elts[] = { GetTagConstant(VMContext, dwarf::DW_TAG_subroutine_type), @@ -559,17 +580,15 @@ DIType DIBuilder::createSubroutineType(DIFile File, DIArray ParameterTypes) { ConstantInt::get(Type::getInt32Ty(VMContext), 0), Constant::getNullValue(Type::getInt32Ty(VMContext)) }; - return DIType(MDNode::get(VMContext, Elts)); + return DICompositeType(MDNode::get(VMContext, Elts)); } /// createEnumerationType - Create debugging information entry for an /// enumeration. -DIType DIBuilder::createEnumerationType(DIDescriptor Scope, StringRef Name, - DIFile File, unsigned LineNumber, - uint64_t SizeInBits, - uint64_t AlignInBits, - DIArray Elements, - DIType ClassType) { +DICompositeType DIBuilder::createEnumerationType( + DIDescriptor Scope, StringRef Name, DIFile File, unsigned LineNumber, + uint64_t SizeInBits, uint64_t AlignInBits, DIArray Elements, + DIType ClassType) { // TAG_enumeration_type is encoded in DICompositeType format. Value *Elts[] = { GetTagConstant(VMContext, dwarf::DW_TAG_enumeration_type), @@ -588,12 +607,12 @@ DIType DIBuilder::createEnumerationType(DIDescriptor Scope, StringRef Name, }; MDNode *Node = MDNode::get(VMContext, Elts); AllEnumTypes.push_back(Node); - return DIType(Node); + return DICompositeType(Node); } /// createArrayType - Create debugging information entry for an array. -DIType DIBuilder::createArrayType(uint64_t Size, uint64_t AlignInBits, - DIType Ty, DIArray Subscripts) { +DICompositeType DIBuilder::createArrayType(uint64_t Size, uint64_t AlignInBits, + DIType Ty, DIArray Subscripts) { // TAG_array_type is encoded in DICompositeType format. Value *Elts[] = { GetTagConstant(VMContext, dwarf::DW_TAG_array_type), @@ -610,7 +629,7 @@ DIType DIBuilder::createArrayType(uint64_t Size, uint64_t AlignInBits, ConstantInt::get(Type::getInt32Ty(VMContext), 0), Constant::getNullValue(Type::getInt32Ty(VMContext)) }; - return DIType(MDNode::get(VMContext, Elts)); + return DICompositeType(MDNode::get(VMContext, Elts)); } /// createVectorType - Create debugging information entry for a vector. 
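// ---------------------------------------------------------------------------
// [Editor's aside -- an illustrative sketch, not part of the patch.] The
// DIBuilder changes above narrow many return types from the generic DIType to
// DIBasicType/DIDerivedType/DICompositeType, so callers can keep the specific
// class. A minimal sketch, assuming the post-patch signatures in
// include/llvm/DIBuilder.h; the file and producer names are placeholders.
#include "llvm/DIBuilder.h"
#include "llvm/DebugInfo.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/Dwarf.h"
using namespace llvm;

void debugTypeSketch(Module &M) {
  DIBuilder DIB(M);
  DIB.createCompileUnit(dwarf::DW_LANG_C99, "t.c", "/tmp", "clang",
                        /*isOptimized=*/false, /*Flags=*/"", /*RunTimeVer=*/0,
                        /*SplitName=*/"");
  // createBasicType now returns DIBasicType rather than DIType...
  DIBasicType IntTy = DIB.createBasicType("int", 32, 32, dwarf::DW_ATE_signed);
  // ...and createPointerType returns DIDerivedType.
  DIDerivedType PtrTy = DIB.createPointerType(IntTy, 64, 64, "int*");
  (void)PtrTy;
  DIB.finalize();
}
// ---------------------------------------------------------------------------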
@@ -787,7 +806,8 @@ createGlobalVariable(StringRef Name, DIFile F, unsigned LineNumber, Ty, ConstantInt::get(Type::getInt32Ty(VMContext), isLocalToUnit), ConstantInt::get(Type::getInt32Ty(VMContext), 1), /* isDefinition*/ - Val + Val, + DIDescriptor() }; MDNode *Node = MDNode::get(VMContext, Elts); AllGVs.push_back(Node); @@ -799,7 +819,7 @@ createGlobalVariable(StringRef Name, DIFile F, unsigned LineNumber, DIGlobalVariable DIBuilder:: createStaticVariable(DIDescriptor Context, StringRef Name, StringRef LinkageName, DIFile F, unsigned LineNumber, - DIType Ty, bool isLocalToUnit, Value *Val) { + DIType Ty, bool isLocalToUnit, Value *Val, MDNode *Decl) { Value *Elts[] = { GetTagConstant(VMContext, dwarf::DW_TAG_variable), Constant::getNullValue(Type::getInt32Ty(VMContext)), @@ -812,7 +832,8 @@ createStaticVariable(DIDescriptor Context, StringRef Name, Ty, ConstantInt::get(Type::getInt32Ty(VMContext), isLocalToUnit), ConstantInt::get(Type::getInt32Ty(VMContext), 1), /* isDefinition*/ - Val + Val, + DIDescriptor(Decl) }; MDNode *Node = MDNode::get(VMContext, Elts); AllGVs.push_back(Node); @@ -882,10 +903,6 @@ DISubprogram DIBuilder::createFunction(DIDescriptor Context, MDNode *TParams, MDNode *Decl) { Value *TElts[] = { GetTagConstant(VMContext, DW_TAG_base_type) }; - MDNode *Temp = MDNode::getTemporary(VMContext, TElts); - Value *TVElts[] = { Temp }; - MDNode *THolder = MDNode::get(VMContext, TVElts); - Value *Elts[] = { GetTagConstant(VMContext, dwarf::DW_TAG_subprogram), Constant::getNullValue(Type::getInt32Ty(VMContext)), @@ -906,13 +923,14 @@ DISubprogram DIBuilder::createFunction(DIDescriptor Context, Fn, TParams, Decl, - THolder, + MDNode::getTemporary(VMContext, TElts), ConstantInt::get(Type::getInt32Ty(VMContext), ScopeLine) }; MDNode *Node = MDNode::get(VMContext, Elts); // Create a named metadata so that we do not lose this mdnode. - AllSubprograms.push_back(Node); + if (isDefinition) + AllSubprograms.push_back(Node); return DISubprogram(Node); } @@ -931,10 +949,6 @@ DISubprogram DIBuilder::createMethod(DIDescriptor Context, Function *Fn, MDNode *TParam) { Value *TElts[] = { GetTagConstant(VMContext, DW_TAG_base_type) }; - MDNode *Temp = MDNode::getTemporary(VMContext, TElts); - Value *TVElts[] = { Temp }; - MDNode *THolder = MDNode::get(VMContext, TVElts); - Value *Elts[] = { GetTagConstant(VMContext, dwarf::DW_TAG_subprogram), Constant::getNullValue(Type::getInt32Ty(VMContext)), @@ -955,11 +969,13 @@ DISubprogram DIBuilder::createMethod(DIDescriptor Context, Fn, TParam, Constant::getNullValue(Type::getInt32Ty(VMContext)), - THolder, + MDNode::getTemporary(VMContext, TElts), // FIXME: Do we want to use different scope/lines? ConstantInt::get(Type::getInt32Ty(VMContext), LineNo) }; MDNode *Node = MDNode::get(VMContext, Elts); + if (isDefinition) + AllSubprograms.push_back(Node); return DISubprogram(Node); } diff --git a/lib/IR/DataLayout.cpp b/lib/IR/DataLayout.cpp index b159af6..f09de3a 100644 --- a/lib/IR/DataLayout.cpp +++ b/lib/IR/DataLayout.cpp @@ -126,7 +126,7 @@ DataLayout::InvalidAlignmentElem = LayoutAlignElem::get(INVALID_ALIGN, 0, 0, 0); PointerAlignElem PointerAlignElem::get(uint32_t addr_space, unsigned abi_align, - unsigned pref_align, uint32_t bit_width) { + unsigned pref_align, uint32_t bit_width) { assert(abi_align <= pref_align && "Preferred alignment worse than ABI!"); PointerAlignElem retval; retval.AddressSpace = addr_space; @@ -309,7 +309,7 @@ void DataLayout::parseSpecifier(StringRef Desc) { /// used. 
DataLayout::DataLayout() : ImmutablePass(ID) { report_fatal_error("Bad DataLayout ctor used. " - "Tool did not specify a DataLayout to use?"); + "Tool did not specify a DataLayout to use?"); } DataLayout::DataLayout(const Module *M) @@ -371,7 +371,7 @@ unsigned DataLayout::getAlignmentInfo(AlignTypeEnum AlignType, // The "best match" for integers is the smallest size that is larger than // the BitWidth requested. if (Alignments[i].TypeBitWidth > BitWidth && (BestMatchIdx == -1 || - Alignments[i].TypeBitWidth < Alignments[BestMatchIdx].TypeBitWidth)) + Alignments[i].TypeBitWidth < Alignments[BestMatchIdx].TypeBitWidth)) BestMatchIdx = i; // However, if there isn't one that's larger, then we must use the // largest one we have (see below) @@ -512,7 +512,7 @@ uint64_t DataLayout::getTypeSizeInBits(Type *Ty) const { case Type::PointerTyID: { unsigned AS = dyn_cast<PointerType>(Ty)->getAddressSpace(); return getPointerSizeInBits(AS); - } + } case Type::ArrayTyID: { ArrayType *ATy = cast<ArrayType>(Ty); return getTypeAllocSizeInBits(ATy->getElementType())*ATy->getNumElements(); diff --git a/lib/IR/DebugInfo.cpp b/lib/IR/DebugInfo.cpp index 7083495..a59fdcd 100644 --- a/lib/IR/DebugInfo.cpp +++ b/lib/IR/DebugInfo.cpp @@ -383,7 +383,8 @@ bool DIType::isUnsignedDIType() { if (BTy.Verify()) { unsigned Encoding = BTy.getEncoding(); if (Encoding == dwarf::DW_ATE_unsigned || - Encoding == dwarf::DW_ATE_unsigned_char) + Encoding == dwarf::DW_ATE_unsigned_char || + Encoding == dwarf::DW_ATE_boolean) return true; } return false; @@ -592,17 +593,14 @@ unsigned DISubprogram::isOptimized() const { MDNode *DISubprogram::getVariablesNodes() const { if (!DbgNode || DbgNode->getNumOperands() <= 19) return NULL; - if (MDNode *Temp = dyn_cast_or_null<MDNode>(DbgNode->getOperand(19))) - return dyn_cast_or_null<MDNode>(Temp->getOperand(0)); - return NULL; + return dyn_cast_or_null<MDNode>(DbgNode->getOperand(19)); } DIArray DISubprogram::getVariables() const { if (!DbgNode || DbgNode->getNumOperands() <= 19) return DIArray(); if (MDNode *T = dyn_cast_or_null<MDNode>(DbgNode->getOperand(19))) - if (MDNode *A = dyn_cast_or_null<MDNode>(T->getOperand(0))) - return DIArray(A); + return DIArray(T); return DIArray(); } @@ -651,8 +649,7 @@ DIArray DICompileUnit::getEnumTypes() const { return DIArray(); if (MDNode *N = dyn_cast_or_null<MDNode>(DbgNode->getOperand(10))) - if (MDNode *A = dyn_cast_or_null<MDNode>(N->getOperand(0))) - return DIArray(A); + return DIArray(N); return DIArray(); } @@ -661,8 +658,7 @@ DIArray DICompileUnit::getRetainedTypes() const { return DIArray(); if (MDNode *N = dyn_cast_or_null<MDNode>(DbgNode->getOperand(11))) - if (MDNode *A = dyn_cast_or_null<MDNode>(N->getOperand(0))) - return DIArray(A); + return DIArray(N); return DIArray(); } @@ -671,9 +667,7 @@ DIArray DICompileUnit::getSubprograms() const { return DIArray(); if (MDNode *N = dyn_cast_or_null<MDNode>(DbgNode->getOperand(12))) - if (N->getNumOperands() > 0) - if (MDNode *A = dyn_cast_or_null<MDNode>(N->getOperand(0))) - return DIArray(A); + return DIArray(N); return DIArray(); } @@ -683,8 +677,7 @@ DIArray DICompileUnit::getGlobalVariables() const { return DIArray(); if (MDNode *N = dyn_cast_or_null<MDNode>(DbgNode->getOperand(13))) - if (MDNode *A = dyn_cast_or_null<MDNode>(N->getOperand(0))) - return DIArray(A); + return DIArray(N); return DIArray(); } @@ -1062,8 +1055,8 @@ void DIScope::printInternal(raw_ostream &OS) const { void DICompileUnit::printInternal(raw_ostream &OS) const { DIScope::printInternal(OS); - if (unsigned Lang = 
getLanguage()) - OS << " [" << dwarf::LanguageString(Lang) << ']'; + if (const char *Lang = dwarf::LanguageString(getLanguage())) + OS << " [" << Lang << ']'; } void DIEnumerator::printInternal(raw_ostream &OS) const { @@ -1101,6 +1094,8 @@ void DIType::printInternal(raw_ostream &OS) const { OS << " [fwd]"; if (isVector()) OS << " [vector]"; + if (isStaticMember()) + OS << " [static]"; } void DIDerivedType::printInternal(raw_ostream &OS) const { diff --git a/lib/IR/Function.cpp b/lib/IR/Function.cpp index 317017a..5c444d2 100644 --- a/lib/IR/Function.cpp +++ b/lib/IR/Function.cpp @@ -123,17 +123,26 @@ bool Argument::hasStructRetAttr() const { hasAttribute(1, Attribute::StructRet); } -/// addAttr - Add a Attribute to an argument -void Argument::addAttr(Attribute attr) { - getParent()->addAttribute(getArgNo() + 1, attr); +/// addAttr - Add attributes to an argument. +void Argument::addAttr(AttributeSet AS) { + assert(AS.getNumSlots() <= 1 && + "Trying to add more than one attribute set to an argument!"); + AttrBuilder B(AS, AS.getSlotIndex(0)); + getParent()->addAttributes(getArgNo() + 1, + AttributeSet::get(Parent->getContext(), + getArgNo() + 1, B)); +} + +/// removeAttr - Remove attributes from an argument. +void Argument::removeAttr(AttributeSet AS) { + assert(AS.getNumSlots() <= 1 && + "Trying to remove more than one attribute set from an argument!"); + AttrBuilder B(AS, AS.getSlotIndex(0)); + getParent()->removeAttributes(getArgNo() + 1, + AttributeSet::get(Parent->getContext(), + getArgNo() + 1, B)); } -/// removeAttr - Remove a Attribute from an argument -void Argument::removeAttr(Attribute attr) { - getParent()->removeAttribute(getArgNo() + 1, attr); -} - - //===----------------------------------------------------------------------===// // Helper Methods in Function //===----------------------------------------------------------------------===// @@ -248,15 +257,21 @@ void Function::dropAllReferences() { BasicBlocks.begin()->eraseFromParent(); } -void Function::addAttribute(unsigned i, Attribute attr) { +void Function::addAttribute(unsigned i, Attribute::AttrKind attr) { + AttributeSet PAL = getAttributes(); + PAL = PAL.addAttribute(getContext(), i, attr); + setAttributes(PAL); +} + +void Function::addAttributes(unsigned i, AttributeSet attrs) { AttributeSet PAL = getAttributes(); - PAL = PAL.addAttr(getContext(), i, attr); + PAL = PAL.addAttributes(getContext(), i, attrs); setAttributes(PAL); } -void Function::removeAttribute(unsigned i, Attribute attr) { +void Function::removeAttributes(unsigned i, AttributeSet attrs) { AttributeSet PAL = getAttributes(); - PAL = PAL.removeAttr(getContext(), i, attr); + PAL = PAL.removeAttributes(getContext(), i, attrs); setAttributes(PAL); } @@ -372,27 +387,28 @@ enum IIT_Info { IIT_I16 = 3, IIT_I32 = 4, IIT_I64 = 5, - IIT_F32 = 6, - IIT_F64 = 7, - IIT_V2 = 8, - IIT_V4 = 9, - IIT_V8 = 10, - IIT_V16 = 11, - IIT_V32 = 12, - IIT_MMX = 13, + IIT_F16 = 6, + IIT_F32 = 7, + IIT_F64 = 8, + IIT_V2 = 9, + IIT_V4 = 10, + IIT_V8 = 11, + IIT_V16 = 12, + IIT_V32 = 13, IIT_PTR = 14, IIT_ARG = 15, // Values from 16+ are only encodable with the inefficient encoding. 
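// (Renumbering note: IIT_F16 claims a slot in the compactly-encodable
// 0-15 range, so IIT_MMX no longer fits and moves into the long-encoding
// group below; everything from IIT_METADATA onward shifts up by one. The
// emitter that produces these fixed encodings must agree with the decoder,
// which is why DecodeIITType further down gains a matching IIT_F16 case.)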
- IIT_METADATA = 16, - IIT_EMPTYSTRUCT = 17, - IIT_STRUCT2 = 18, - IIT_STRUCT3 = 19, - IIT_STRUCT4 = 20, - IIT_STRUCT5 = 21, - IIT_EXTEND_VEC_ARG = 22, - IIT_TRUNC_VEC_ARG = 23, - IIT_ANYPTR = 24 + IIT_MMX = 16, + IIT_METADATA = 17, + IIT_EMPTYSTRUCT = 18, + IIT_STRUCT2 = 19, + IIT_STRUCT3 = 20, + IIT_STRUCT4 = 21, + IIT_STRUCT5 = 22, + IIT_EXTEND_VEC_ARG = 23, + IIT_TRUNC_VEC_ARG = 24, + IIT_ANYPTR = 25 }; @@ -412,6 +428,9 @@ static void DecodeIITType(unsigned &NextElt, ArrayRef<unsigned char> Infos, case IIT_METADATA: OutputTable.push_back(IITDescriptor::get(IITDescriptor::Metadata, 0)); return; + case IIT_F16: + OutputTable.push_back(IITDescriptor::get(IITDescriptor::Half, 0)); + return; case IIT_F32: OutputTable.push_back(IITDescriptor::get(IITDescriptor::Float, 0)); return; @@ -546,6 +565,7 @@ static Type *DecodeFixedType(ArrayRef<Intrinsic::IITDescriptor> &Infos, case IITDescriptor::Void: return Type::getVoidTy(Context); case IITDescriptor::MMX: return Type::getX86_MMXTy(Context); case IITDescriptor::Metadata: return Type::getMetadataTy(Context); + case IITDescriptor::Half: return Type::getHalfTy(Context); case IITDescriptor::Float: return Type::getFloatTy(Context); case IITDescriptor::Double: return Type::getDoubleTy(Context); diff --git a/lib/IR/Globals.cpp b/lib/IR/Globals.cpp index 2e52aa3..6d547f3 100644 --- a/lib/IR/Globals.cpp +++ b/lib/IR/Globals.cpp @@ -82,13 +82,16 @@ bool GlobalValue::isDeclaration() const { //===----------------------------------------------------------------------===// GlobalVariable::GlobalVariable(Type *Ty, bool constant, LinkageTypes Link, - Constant *InitVal, const Twine &Name, - ThreadLocalMode TLMode, unsigned AddressSpace) + Constant *InitVal, + const Twine &Name, ThreadLocalMode TLMode, + unsigned AddressSpace, + bool isExternallyInitialized) : GlobalValue(PointerType::get(Ty, AddressSpace), Value::GlobalVariableVal, OperandTraits<GlobalVariable>::op_begin(this), InitVal != 0, Link, Name), - isConstantGlobal(constant), threadLocalMode(TLMode) { + isConstantGlobal(constant), threadLocalMode(TLMode), + isExternallyInitializedConstant(isExternallyInitialized) { if (InitVal) { assert(InitVal->getType() == Ty && "Initializer should be the same type as the GlobalVariable!"); @@ -102,12 +105,14 @@ GlobalVariable::GlobalVariable(Module &M, Type *Ty, bool constant, LinkageTypes Link, Constant *InitVal, const Twine &Name, GlobalVariable *Before, ThreadLocalMode TLMode, - unsigned AddressSpace) + unsigned AddressSpace, + bool isExternallyInitialized) : GlobalValue(PointerType::get(Ty, AddressSpace), Value::GlobalVariableVal, OperandTraits<GlobalVariable>::op_begin(this), InitVal != 0, Link, Name), - isConstantGlobal(constant), threadLocalMode(TLMode) { + isConstantGlobal(constant), threadLocalMode(TLMode), + isExternallyInitializedConstant(isExternallyInitialized) { if (InitVal) { assert(InitVal->getType() == Ty && "Initializer should be the same type as the GlobalVariable!"); diff --git a/lib/IR/InlineAsm.cpp b/lib/IR/InlineAsm.cpp index 10d281b..9f2a9fe 100644 --- a/lib/IR/InlineAsm.cpp +++ b/lib/IR/InlineAsm.cpp @@ -151,10 +151,10 @@ bool InlineAsm::ConstraintInfo::Parse(StringRef Str, if (ConstraintEnd == E) return true; // "{foo" pCodes->push_back(std::string(I, ConstraintEnd+1)); I = ConstraintEnd+1; - } else if (isdigit(*I)) { // Matching Constraint + } else if (isdigit(static_cast<unsigned char>(*I))) { // Matching Constraint // Maximal munch numbers. 
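// (The unsigned char cast is not cosmetic: passing a negative plain-char
// value to isdigit() is undefined behavior, so constraint strings holding
// bytes >= 0x80 could misbehave on platforms where char is signed. The same
// hardening is applied to the scan loop below and to the isalpha/isalnum
// calls in LLVMContext's isValidName later in this patch.)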
StringRef::iterator NumStart = I; - while (I != E && isdigit(*I)) + while (I != E && isdigit(static_cast<unsigned char>(*I))) ++I; pCodes->push_back(std::string(NumStart, I)); unsigned N = atoi(pCodes->back().c_str()); diff --git a/lib/IR/Instruction.cpp b/lib/IR/Instruction.cpp index 42df5d7..2b5a0b3 100644 --- a/lib/IR/Instruction.cpp +++ b/lib/IR/Instruction.cpp @@ -455,14 +455,18 @@ bool Instruction::mayWriteToMemory() const { } } -/// mayThrow - Return true if this instruction may throw an exception. -/// bool Instruction::mayThrow() const { if (const CallInst *CI = dyn_cast<CallInst>(this)) return !CI->doesNotThrow(); return isa<ResumeInst>(this); } +bool Instruction::mayReturn() const { + if (const CallInst *CI = dyn_cast<CallInst>(this)) + return !CI->doesNotReturn(); + return true; +} + /// isAssociative - Return true if the instruction is associative: /// /// Associative operators satisfy: x op (y op z) === (x op y) op z diff --git a/lib/IR/Instructions.cpp b/lib/IR/Instructions.cpp index 1b5d004..8a0a465 100644 --- a/lib/IR/Instructions.cpp +++ b/lib/IR/Instructions.cpp @@ -333,13 +333,19 @@ CallInst::CallInst(const CallInst &CI) void CallInst::addAttribute(unsigned i, Attribute attr) { AttributeSet PAL = getAttributes(); - PAL = PAL.addAttr(getContext(), i, attr); + AttrBuilder B(attr); + LLVMContext &Context = getContext(); + PAL = PAL.addAttributes(Context, i, + AttributeSet::get(Context, i, B)); setAttributes(PAL); } void CallInst::removeAttribute(unsigned i, Attribute attr) { AttributeSet PAL = getAttributes(); - PAL = PAL.removeAttr(getContext(), i, attr); + AttrBuilder B(attr); + LLVMContext &Context = getContext(); + PAL = PAL.removeAttributes(Context, i, + AttributeSet::get(Context, i, B)); setAttributes(PAL); } @@ -589,13 +595,17 @@ bool InvokeInst::paramHasAttr(unsigned i, Attribute::AttrKind A) const { void InvokeInst::addAttribute(unsigned i, Attribute attr) { AttributeSet PAL = getAttributes(); - PAL = PAL.addAttr(getContext(), i, attr); + AttrBuilder B(attr); + PAL = PAL.addAttributes(getContext(), i, + AttributeSet::get(getContext(), i, B)); setAttributes(PAL); } void InvokeInst::removeAttribute(unsigned i, Attribute attr) { AttributeSet PAL = getAttributes(); - PAL = PAL.removeAttr(getContext(), i, attr); + AttrBuilder B(attr); + PAL = PAL.removeAttributes(getContext(), i, + AttributeSet::get(getContext(), i, B)); setAttributes(PAL); } @@ -1926,11 +1936,14 @@ bool BinaryOperator::isNeg(const Value *V) { return false; } -bool BinaryOperator::isFNeg(const Value *V) { +bool BinaryOperator::isFNeg(const Value *V, bool IgnoreZeroSign) { if (const BinaryOperator *Bop = dyn_cast<BinaryOperator>(V)) if (Bop->getOpcode() == Instruction::FSub) - if (Constant* C = dyn_cast<Constant>(Bop->getOperand(0))) - return C->isNegativeZeroValue(); + if (Constant* C = dyn_cast<Constant>(Bop->getOperand(0))) { + if (!IgnoreZeroSign) + IgnoreZeroSign = cast<Instruction>(V)->hasNoSignedZeros(); + return !IgnoreZeroSign ? 
C->isNegativeZeroValue() : C->isZeroValue(); + } return false; } @@ -2383,11 +2396,11 @@ CastInst *CastInst::CreatePointerCast(Value *S, Type *Ty, CastInst *CastInst::CreatePointerCast(Value *S, Type *Ty, const Twine &Name, Instruction *InsertBefore) { - assert(S->getType()->isPointerTy() && "Invalid cast"); - assert((Ty->isIntegerTy() || Ty->isPointerTy()) && + assert(S->getType()->isPtrOrPtrVectorTy() && "Invalid cast"); + assert((Ty->isIntOrIntVectorTy() || Ty->isPtrOrPtrVectorTy()) && "Invalid cast"); - if (Ty->isIntegerTy()) + if (Ty->isIntOrIntVectorTy()) return Create(Instruction::PtrToInt, S, Ty, Name, InsertBefore); return Create(Instruction::BitCast, S, Ty, Name, InsertBefore); } @@ -2621,6 +2634,11 @@ CastInst::castIsValid(Instruction::CastOps op, Value *S, Type *DstTy) { // Check for type sanity on the arguments Type *SrcTy = S->getType(); + + // If this is a cast to the same type then it's trivially true. + if (SrcTy == DstTy) + return true; + if (!SrcTy->isFirstClassType() || !DstTy->isFirstClassType() || SrcTy->isAggregateType() || DstTy->isAggregateType()) return false; diff --git a/lib/IR/LLVMContext.cpp b/lib/IR/LLVMContext.cpp index 282779c..b73cd03 100644 --- a/lib/IR/LLVMContext.cpp +++ b/lib/IR/LLVMContext.cpp @@ -73,55 +73,43 @@ void LLVMContext::removeModule(Module *M) { // Recoverable Backend Errors //===----------------------------------------------------------------------===// -void LLVMContext::setDiagnosticHandler(DiagHandlerTy DiagHandler, - void *DiagContext) { - pImpl->DiagHandler = DiagHandler; - pImpl->DiagContext = DiagContext; +void LLVMContext:: +setInlineAsmDiagnosticHandler(InlineAsmDiagHandlerTy DiagHandler, + void *DiagContext) { + pImpl->InlineAsmDiagHandler = DiagHandler; + pImpl->InlineAsmDiagContext = DiagContext; } -/// getDiagnosticHandler - Return the diagnostic handler set by -/// setDiagnosticHandler. -LLVMContext::DiagHandlerTy LLVMContext::getDiagnosticHandler() const { - return pImpl->DiagHandler; +/// getInlineAsmDiagnosticHandler - Return the diagnostic handler set by +/// setInlineAsmDiagnosticHandler. +LLVMContext::InlineAsmDiagHandlerTy +LLVMContext::getInlineAsmDiagnosticHandler() const { + return pImpl->InlineAsmDiagHandler; } -/// getDiagnosticContext - Return the diagnostic context set by -/// setDiagnosticHandler. -void *LLVMContext::getDiagnosticContext() const { - return pImpl->DiagContext; +/// getInlineAsmDiagnosticContext - Return the diagnostic context set by +/// setInlineAsmDiagnosticHandler. 
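// Editorial sketch: registering the renamed hook. The handler signature
// follows the InlineAsmDiagHandlerTy typedef this patch introduces; the
// body and all names are illustrative, and SMDiagnostic/errs() are assumed
// from the existing Support headers.
static void exampleInlineAsmDiagHandler(const SMDiagnostic &D, void *Context,
                                        unsigned LocCookie) {
  (void)Context;   // Opaque pointer passed through from registration.
  (void)LocCookie; // !srcloc cookie of the offending inline-asm call.
  D.print(/*ProgName=*/0, errs());
}
// A frontend would then install it once per context:
//   Ctx.setInlineAsmDiagnosticHandler(exampleInlineAsmDiagHandler, 0);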
+void *LLVMContext::getInlineAsmDiagnosticContext() const { + return pImpl->InlineAsmDiagContext; } void LLVMContext::emitError(const Twine &ErrorStr) { emitError(0U, ErrorStr); } -void LLVMContext::emitWarning(const Twine &ErrorStr) { - emitWarning(0U, ErrorStr); -} - -static unsigned getSrcLocation(const Instruction *I) { +void LLVMContext::emitError(const Instruction *I, const Twine &ErrorStr) { unsigned LocCookie = 0; if (const MDNode *SrcLoc = I->getMetadata("srcloc")) { if (SrcLoc->getNumOperands() != 0) if (const ConstantInt *CI = dyn_cast<ConstantInt>(SrcLoc->getOperand(0))) LocCookie = CI->getZExtValue(); } - return LocCookie; -} - -void LLVMContext::emitError(const Instruction *I, const Twine &ErrorStr) { - unsigned LocCookie = getSrcLocation(I); return emitError(LocCookie, ErrorStr); } -void LLVMContext::emitWarning(const Instruction *I, const Twine &ErrorStr) { - unsigned LocCookie = getSrcLocation(I); - return emitWarning(LocCookie, ErrorStr); -} - void LLVMContext::emitError(unsigned LocCookie, const Twine &ErrorStr) { // If there is no error handler installed, just print the error and exit. - if (pImpl->DiagHandler == 0) { + if (pImpl->InlineAsmDiagHandler == 0) { errs() << "error: " << ErrorStr << "\n"; exit(1); } @@ -129,20 +117,7 @@ void LLVMContext::emitError(unsigned LocCookie, const Twine &ErrorStr) { // If we do have an error handler, we can report the error and keep going. SMDiagnostic Diag("", SourceMgr::DK_Error, ErrorStr.str()); - pImpl->DiagHandler(Diag, pImpl->DiagContext, LocCookie); -} - -void LLVMContext::emitWarning(unsigned LocCookie, const Twine &ErrorStr) { - // If there is no handler installed, just print the warning. - if (pImpl->DiagHandler == 0) { - errs() << "warning: " << ErrorStr << "\n"; - return; - } - - // If we do have a handler, we can report the warning. - SMDiagnostic Diag("", SourceMgr::DK_Warning, ErrorStr.str()); - - pImpl->DiagHandler(Diag, pImpl->DiagContext, LocCookie); + pImpl->InlineAsmDiagHandler(Diag, pImpl->InlineAsmDiagContext, LocCookie); } //===----------------------------------------------------------------------===// @@ -155,12 +130,13 @@ static bool isValidName(StringRef MDName) { if (MDName.empty()) return false; - if (!std::isalpha(MDName[0])) + if (!std::isalpha(static_cast<unsigned char>(MDName[0]))) return false; for (StringRef::iterator I = MDName.begin() + 1, E = MDName.end(); I != E; ++I) { - if (!std::isalnum(*I) && *I != '_' && *I != '-' && *I != '.') + if (!std::isalnum(static_cast<unsigned char>(*I)) && *I != '_' && + *I != '-' && *I != '.') return false; } return true; diff --git a/lib/IR/LLVMContextImpl.cpp b/lib/IR/LLVMContextImpl.cpp index 8fc9379..6a6a4d6 100644 --- a/lib/IR/LLVMContextImpl.cpp +++ b/lib/IR/LLVMContextImpl.cpp @@ -35,8 +35,8 @@ LLVMContextImpl::LLVMContextImpl(LLVMContext &C) Int16Ty(C, 16), Int32Ty(C, 32), Int64Ty(C, 64) { - DiagHandler = 0; - DiagContext = 0; + InlineAsmDiagHandler = 0; + InlineAsmDiagContext = 0; NamedStructTypesUniqueID = 0; } @@ -109,6 +109,13 @@ LLVMContextImpl::~LLVMContextImpl() { delete &*Elem; } + // Destroy attribute node lists. + for (FoldingSetIterator<AttributeSetNode> I = AttrsSetNodes.begin(), + E = AttrsSetNodes.end(); I != E; ) { + FoldingSetIterator<AttributeSetNode> Elem = I++; + delete &*Elem; + } + // Destroy MDNodes. ~MDNode can move and remove nodes between the MDNodeSet // and the NonUniquedMDNodes sets, so copy the values out first. 
SmallVector<MDNode*, 8> MDNodes; diff --git a/lib/IR/LLVMContextImpl.h b/lib/IR/LLVMContextImpl.h index 30fd666..7353dc0 100644 --- a/lib/IR/LLVMContextImpl.h +++ b/lib/IR/LLVMContextImpl.h @@ -236,8 +236,8 @@ public: /// will be automatically deleted if this context is deleted. SmallPtrSet<Module*, 4> OwnedModules; - LLVMContext::DiagHandlerTy DiagHandler; - void *DiagContext; + LLVMContext::InlineAsmDiagHandlerTy InlineAsmDiagHandler; + void *InlineAsmDiagContext; typedef DenseMap<DenseMapAPIntKeyInfo::KeyTy, ConstantInt*, DenseMapAPIntKeyInfo> IntMapTy; @@ -249,6 +249,7 @@ public: FoldingSet<AttributeImpl> AttrsSet; FoldingSet<AttributeSetImpl> AttrsLists; + FoldingSet<AttributeSetNode> AttrsSetNodes; StringMap<Value*> MDStringCache; diff --git a/lib/IR/Pass.cpp b/lib/IR/Pass.cpp index ec448e6..7fc4828 100644 --- a/lib/IR/Pass.cpp +++ b/lib/IR/Pass.cpp @@ -143,8 +143,7 @@ PassManagerType FunctionPass::getPotentialPassManagerType() const { Pass *BasicBlockPass::createPrinterPass(raw_ostream &O, const std::string &Banner) const { - - llvm_unreachable("BasicBlockPass printing unsupported."); + return createPrintBasicBlockPass(&O, false, Banner); } bool BasicBlockPass::doInitialization(Function &) { diff --git a/lib/IR/PassManager.cpp b/lib/IR/PassManager.cpp index 4f7984e..5a8df70 100644 --- a/lib/IR/PassManager.cpp +++ b/lib/IR/PassManager.cpp @@ -626,8 +626,7 @@ void PMTopLevelManager::schedulePass(Pass *P) { Pass *AnalysisPass2 = findAnalysisPass(*I2); if (AnalysisPass2) { dbgs() << "\t" << AnalysisPass2->getPassName() << "\n"; - } - else { + } else { dbgs() << "\t" << "Error: Required pass not found! Possible causes:" << "\n"; dbgs() << "\t\t" << "- Pass misconfiguration (e.g.: missing macros)" << "\n"; dbgs() << "\t\t" << "- Corruption of the global PassRegistry" << "\n"; @@ -648,8 +647,7 @@ void PMTopLevelManager::schedulePass(Pass *P) { // Recheck analysis passes to ensure that required analyses that // are already checked are still available. checkAnalysis = true; - } - else + } else // Do not schedule this analysis. Lower level analsyis // passes are run on the fly. delete AnalysisPass; @@ -876,9 +874,9 @@ void PMDataManager::removeNotPreservedAnalysis(Pass *P) { return; const AnalysisUsage::VectorType &PreservedSet = AnUsage->getPreservedSet(); - for (std::map<AnalysisID, Pass*>::iterator I = AvailableAnalysis.begin(), + for (DenseMap<AnalysisID, Pass*>::iterator I = AvailableAnalysis.begin(), E = AvailableAnalysis.end(); I != E; ) { - std::map<AnalysisID, Pass*>::iterator Info = I++; + DenseMap<AnalysisID, Pass*>::iterator Info = I++; if (Info->second->getAsImmutablePass() == 0 && std::find(PreservedSet.begin(), PreservedSet.end(), Info->first) == PreservedSet.end()) { @@ -899,10 +897,10 @@ void PMDataManager::removeNotPreservedAnalysis(Pass *P) { if (!InheritedAnalysis[Index]) continue; - for (std::map<AnalysisID, Pass*>::iterator + for (DenseMap<AnalysisID, Pass*>::iterator I = InheritedAnalysis[Index]->begin(), E = InheritedAnalysis[Index]->end(); I != E; ) { - std::map<AnalysisID, Pass *>::iterator Info = I++; + DenseMap<AnalysisID, Pass *>::iterator Info = I++; if (Info->second->getAsImmutablePass() == 0 && std::find(PreservedSet.begin(), PreservedSet.end(), Info->first) == PreservedSet.end()) { @@ -962,7 +960,7 @@ void PMDataManager::freePass(Pass *P, StringRef Msg, // listed as the available implementation. 
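// (Container note: AvailableAnalysis and the InheritedAnalysis maps move
// from std::map to DenseMap throughout this file. AnalysisID keys are
// opaque pointers with no useful ordering, and these maps sit on the hot
// findAnalysisPass/removeNotPreservedAnalysis paths, so hashing is the
// better fit; the increment-then-erase idiom used above remains valid.)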
const std::vector<const PassInfo*> &II = PInf->getInterfacesImplemented(); for (unsigned i = 0, e = II.size(); i != e; ++i) { - std::map<AnalysisID, Pass*>::iterator Pos = + DenseMap<AnalysisID, Pass*>::iterator Pos = AvailableAnalysis.find(II[i]->getTypeInfo()); if (Pos != AvailableAnalysis.end() && Pos->second == P) AvailableAnalysis.erase(Pos); @@ -1102,7 +1100,7 @@ void PMDataManager::initializeAnalysisImpl(Pass *P) { Pass *PMDataManager::findAnalysisPass(AnalysisID AID, bool SearchParent) { // Check if AvailableAnalysis map has one entry. - std::map<AnalysisID, Pass*>::const_iterator I = AvailableAnalysis.find(AID); + DenseMap<AnalysisID, Pass*>::const_iterator I = AvailableAnalysis.find(AID); if (I != AvailableAnalysis.end()) return I->second; @@ -1797,8 +1795,7 @@ void PMStack::push(PMDataManager *PM) { TPM->addIndirectPassManager(PM); PM->setTopLevelManager(TPM); PM->setDepth(this->top()->getDepth()+1); - } - else { + } else { assert((PM->getPassManagerType() == PMT_ModulePassManager || PM->getPassManagerType() == PMT_FunctionPassManager) && "pushing bad pass manager to PMStack"); diff --git a/lib/IR/PrintModulePass.cpp b/lib/IR/PrintModulePass.cpp index e4e9939..5026bc2 100644 --- a/lib/IR/PrintModulePass.cpp +++ b/lib/IR/PrintModulePass.cpp @@ -73,6 +73,31 @@ namespace { AU.setPreservesAll(); } }; + + class PrintBasicBlockPass : public BasicBlockPass { + std::string Banner; + raw_ostream *Out; // raw_ostream to print on + bool DeleteStream; // Delete the ostream in our dtor? + public: + static char ID; + PrintBasicBlockPass() : BasicBlockPass(ID), Out(&dbgs()), + DeleteStream(false) {} + PrintBasicBlockPass(const std::string &B, raw_ostream *o, bool DS) + : BasicBlockPass(ID), Banner(B), Out(o), DeleteStream(DS) {} + + ~PrintBasicBlockPass() { + if (DeleteStream) delete Out; + } + + bool runOnBasicBlock(BasicBlock &BB) { + (*Out) << Banner << BB; + return false; + } + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesAll(); + } + }; } char PrintModulePass::ID = 0; @@ -81,6 +106,9 @@ INITIALIZE_PASS(PrintModulePass, "print-module", char PrintFunctionPass::ID = 0; INITIALIZE_PASS(PrintFunctionPass, "print-function", "Print function to stderr", false, false) +char PrintBasicBlockPass::ID = 0; +INITIALIZE_PASS(PrintBasicBlockPass, "print-bb", + "Print BB to stderr", false, false) /// createPrintModulePass - Create and return a pass that writes the /// module to the specified raw_ostream. @@ -98,3 +126,11 @@ FunctionPass *llvm::createPrintFunctionPass(const std::string &Banner, return new PrintFunctionPass(Banner, OS, DeleteStream); } +/// createPrintBasicBlockPass - Create and return a pass that writes the +/// BB to the specified raw_ostream. +BasicBlockPass *llvm::createPrintBasicBlockPass(llvm::raw_ostream *OS, + bool DeleteStream, + const std::string &Banner) { + return new PrintBasicBlockPass(Banner, OS, DeleteStream); +} + diff --git a/lib/IR/Use.cpp b/lib/IR/Use.cpp index 481cbab..1d343e8 100644 --- a/lib/IR/Use.cpp +++ b/lib/IR/Use.cpp @@ -139,7 +139,7 @@ User *Use::getUser() const { const UserRef *ref = reinterpret_cast<const UserRef*>(End); return ref->getInt() ? 
ref->getPointer() - : (User*)End; + : reinterpret_cast<User*>(const_cast<Use*>(End)); } } // End llvm namespace diff --git a/lib/IR/Verifier.cpp b/lib/IR/Verifier.cpp index ee18f77..8bfbb32 100644 --- a/lib/IR/Verifier.cpp +++ b/lib/IR/Verifier.cpp @@ -200,6 +200,8 @@ namespace { E = M.named_metadata_end(); I != E; ++I) visitNamedMDNode(*I); + visitModuleFlags(M); + // If the module is broken, abort at this time. return abortIfBroken(); } @@ -240,6 +242,9 @@ namespace { void visitGlobalAlias(GlobalAlias &GA); void visitNamedMDNode(NamedMDNode &NMD); void visitMDNode(MDNode &MD, Function *F); + void visitModuleFlags(Module &M); + void visitModuleFlag(MDNode *Op, DenseMap<MDString*, MDNode*> &SeenIDs, + SmallVectorImpl<MDNode*> &Requirements); void visitFunction(Function &F); void visitBasicBlock(BasicBlock &BB); using InstVisitor<Verifier>::visit; @@ -296,7 +301,7 @@ namespace { bool VerifyIntrinsicType(Type *Ty, ArrayRef<Intrinsic::IITDescriptor> &Infos, SmallVectorImpl<Type*> &ArgTys); - void VerifyParameterAttrs(Attribute Attrs, Type *Ty, + void VerifyParameterAttrs(AttributeSet Attrs, uint64_t Idx, Type *Ty, bool isReturnValue, const Value *V); void VerifyFunctionAttrs(FunctionType *FT, const AttributeSet &Attrs, const Value *V); @@ -521,83 +526,186 @@ void Verifier::visitMDNode(MDNode &MD, Function *F) { } } +void Verifier::visitModuleFlags(Module &M) { + const NamedMDNode *Flags = M.getModuleFlagsMetadata(); + if (!Flags) return; + + // Scan each flag, and track the flags and requirements. + DenseMap<MDString*, MDNode*> SeenIDs; + SmallVector<MDNode*, 16> Requirements; + for (unsigned I = 0, E = Flags->getNumOperands(); I != E; ++I) { + visitModuleFlag(Flags->getOperand(I), SeenIDs, Requirements); + } + + // Validate that the requirements in the module are valid. + for (unsigned I = 0, E = Requirements.size(); I != E; ++I) { + MDNode *Requirement = Requirements[I]; + MDString *Flag = cast<MDString>(Requirement->getOperand(0)); + Value *ReqValue = Requirement->getOperand(1); + + MDNode *Op = SeenIDs.lookup(Flag); + if (!Op) { + CheckFailed("invalid requirement on flag, flag is not present in module", + Flag); + continue; + } + + if (Op->getOperand(2) != ReqValue) { + CheckFailed(("invalid requirement on flag, " + "flag does not have the required value"), + Flag); + continue; + } + } +} + +void Verifier::visitModuleFlag(MDNode *Op, DenseMap<MDString*, MDNode*>&SeenIDs, + SmallVectorImpl<MDNode*> &Requirements) { + // Each module flag should have three arguments, the merge behavior (a + // constant int), the flag ID (an MDString), and the value. + Assert1(Op->getNumOperands() == 3, + "incorrect number of operands in module flag", Op); + ConstantInt *Behavior = dyn_cast<ConstantInt>(Op->getOperand(0)); + MDString *ID = dyn_cast<MDString>(Op->getOperand(1)); + Assert1(Behavior, + "invalid behavior operand in module flag (expected constant integer)", + Op->getOperand(0)); + unsigned BehaviorValue = Behavior->getZExtValue(); + Assert1(ID, + "invalid ID operand in module flag (expected metadata string)", + Op->getOperand(1)); + + // Sanity check the values for behaviors with additional requirements. + switch (BehaviorValue) { + default: + Assert1(false, + "invalid behavior operand in module flag (unexpected constant)", + Op->getOperand(0)); + break; + + case Module::Error: + case Module::Warning: + case Module::Override: + // These behavior types accept any value. 
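// For reference, well-formed flags as IR (behavior codes: 1 = Error,
// 2 = Warning, 3 = Require, 4 = Override, 5 = Append, 6 = AppendUnique):
//   !0 = metadata !{i32 1, metadata !"foo", i32 37}
//   !1 = metadata !{i32 3, metadata !"bar",
//                   metadata !{metadata !"foo", i32 37}}
//   !llvm.module.flags = !{!0, !1}
// The 'require' flag !1 is satisfied only because !0 gives "foo" the
// value 37; otherwise the walk over Requirements above reports a failure.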
+ break; + + case Module::Require: { + // The value should itself be an MDNode with two operands, a flag ID (an + // MDString), and a value. + MDNode *Value = dyn_cast<MDNode>(Op->getOperand(2)); + Assert1(Value && Value->getNumOperands() == 2, + "invalid value for 'require' module flag (expected metadata pair)", + Op->getOperand(2)); + Assert1(isa<MDString>(Value->getOperand(0)), + ("invalid value for 'require' module flag " + "(first value operand should be a string)"), + Value->getOperand(0)); + + // Append it to the list of requirements, to check once all module flags are + // scanned. + Requirements.push_back(Value); + break; + } + + case Module::Append: + case Module::AppendUnique: { + // These behavior types require the operand be an MDNode. + Assert1(isa<MDNode>(Op->getOperand(2)), + "invalid value for 'append'-type module flag " + "(expected a metadata node)", Op->getOperand(2)); + break; + } + } + + // Unless this is a "requires" flag, check the ID is unique. + if (BehaviorValue != Module::Require) { + bool Inserted = SeenIDs.insert(std::make_pair(ID, Op)).second; + Assert1(Inserted, + "module flag identifiers must be unique (or of 'require' type)", + ID); + } +} + // VerifyParameterAttrs - Check the given attributes for an argument or return // value of the specified type. The value V is printed in error messages. -void Verifier::VerifyParameterAttrs(Attribute Attrs, Type *Ty, +void Verifier::VerifyParameterAttrs(AttributeSet Attrs, uint64_t Idx, Type *Ty, bool isReturnValue, const Value *V) { - if (!Attrs.hasAttributes()) + if (!Attrs.hasAttributes(Idx)) return; - Assert1(!Attrs.hasAttribute(Attribute::NoReturn) && - !Attrs.hasAttribute(Attribute::NoUnwind) && - !Attrs.hasAttribute(Attribute::ReadNone) && - !Attrs.hasAttribute(Attribute::ReadOnly) && - !Attrs.hasAttribute(Attribute::NoInline) && - !Attrs.hasAttribute(Attribute::AlwaysInline) && - !Attrs.hasAttribute(Attribute::OptimizeForSize) && - !Attrs.hasAttribute(Attribute::StackProtect) && - !Attrs.hasAttribute(Attribute::StackProtectReq) && - !Attrs.hasAttribute(Attribute::NoRedZone) && - !Attrs.hasAttribute(Attribute::NoImplicitFloat) && - !Attrs.hasAttribute(Attribute::Naked) && - !Attrs.hasAttribute(Attribute::InlineHint) && - !Attrs.hasAttribute(Attribute::StackAlignment) && - !Attrs.hasAttribute(Attribute::UWTable) && - !Attrs.hasAttribute(Attribute::NonLazyBind) && - !Attrs.hasAttribute(Attribute::ReturnsTwice) && - !Attrs.hasAttribute(Attribute::AddressSafety) && - !Attrs.hasAttribute(Attribute::MinSize), - "Some attributes in '" + Attrs.getAsString() + + Assert1(!Attrs.hasAttribute(Idx, Attribute::NoReturn) && + !Attrs.hasAttribute(Idx, Attribute::NoUnwind) && + !Attrs.hasAttribute(Idx, Attribute::ReadNone) && + !Attrs.hasAttribute(Idx, Attribute::ReadOnly) && + !Attrs.hasAttribute(Idx, Attribute::NoInline) && + !Attrs.hasAttribute(Idx, Attribute::AlwaysInline) && + !Attrs.hasAttribute(Idx, Attribute::OptimizeForSize) && + !Attrs.hasAttribute(Idx, Attribute::StackProtect) && + !Attrs.hasAttribute(Idx, Attribute::StackProtectReq) && + !Attrs.hasAttribute(Idx, Attribute::NoRedZone) && + !Attrs.hasAttribute(Idx, Attribute::NoImplicitFloat) && + !Attrs.hasAttribute(Idx, Attribute::Naked) && + !Attrs.hasAttribute(Idx, Attribute::InlineHint) && + !Attrs.hasAttribute(Idx, Attribute::StackAlignment) && + !Attrs.hasAttribute(Idx, Attribute::UWTable) && + !Attrs.hasAttribute(Idx, Attribute::NonLazyBind) && + !Attrs.hasAttribute(Idx, Attribute::ReturnsTwice) && + !Attrs.hasAttribute(Idx, Attribute::SanitizeAddress) && + 
!Attrs.hasAttribute(Idx, Attribute::SanitizeThread) && + !Attrs.hasAttribute(Idx, Attribute::SanitizeMemory) && + !Attrs.hasAttribute(Idx, Attribute::MinSize) && + !Attrs.hasAttribute(Idx, Attribute::NoBuiltin), + "Some attributes in '" + Attrs.getAsString(Idx) + "' only apply to functions!", V); if (isReturnValue) - Assert1(!Attrs.hasAttribute(Attribute::ByVal) && - !Attrs.hasAttribute(Attribute::Nest) && - !Attrs.hasAttribute(Attribute::StructRet) && - !Attrs.hasAttribute(Attribute::NoCapture), + Assert1(!Attrs.hasAttribute(Idx, Attribute::ByVal) && + !Attrs.hasAttribute(Idx, Attribute::Nest) && + !Attrs.hasAttribute(Idx, Attribute::StructRet) && + !Attrs.hasAttribute(Idx, Attribute::NoCapture), "Attribute 'byval', 'nest', 'sret', and 'nocapture' " "do not apply to return values!", V); // Check for mutually incompatible attributes. - Assert1(!((Attrs.hasAttribute(Attribute::ByVal) && - Attrs.hasAttribute(Attribute::Nest)) || - (Attrs.hasAttribute(Attribute::ByVal) && - Attrs.hasAttribute(Attribute::StructRet)) || - (Attrs.hasAttribute(Attribute::Nest) && - Attrs.hasAttribute(Attribute::StructRet))), "Attributes " + Assert1(!((Attrs.hasAttribute(Idx, Attribute::ByVal) && + Attrs.hasAttribute(Idx, Attribute::Nest)) || + (Attrs.hasAttribute(Idx, Attribute::ByVal) && + Attrs.hasAttribute(Idx, Attribute::StructRet)) || + (Attrs.hasAttribute(Idx, Attribute::Nest) && + Attrs.hasAttribute(Idx, Attribute::StructRet))), "Attributes " "'byval, nest, and sret' are incompatible!", V); - Assert1(!((Attrs.hasAttribute(Attribute::ByVal) && - Attrs.hasAttribute(Attribute::Nest)) || - (Attrs.hasAttribute(Attribute::ByVal) && - Attrs.hasAttribute(Attribute::InReg)) || - (Attrs.hasAttribute(Attribute::Nest) && - Attrs.hasAttribute(Attribute::InReg))), "Attributes " + Assert1(!((Attrs.hasAttribute(Idx, Attribute::ByVal) && + Attrs.hasAttribute(Idx, Attribute::Nest)) || + (Attrs.hasAttribute(Idx, Attribute::ByVal) && + Attrs.hasAttribute(Idx, Attribute::InReg)) || + (Attrs.hasAttribute(Idx, Attribute::Nest) && + Attrs.hasAttribute(Idx, Attribute::InReg))), "Attributes " "'byval, nest, and inreg' are incompatible!", V); - Assert1(!(Attrs.hasAttribute(Attribute::ZExt) && - Attrs.hasAttribute(Attribute::SExt)), "Attributes " + Assert1(!(Attrs.hasAttribute(Idx, Attribute::ZExt) && + Attrs.hasAttribute(Idx, Attribute::SExt)), "Attributes " "'zeroext and signext' are incompatible!", V); - Assert1(!(Attrs.hasAttribute(Attribute::ReadNone) && - Attrs.hasAttribute(Attribute::ReadOnly)), "Attributes " + Assert1(!(Attrs.hasAttribute(Idx, Attribute::ReadNone) && + Attrs.hasAttribute(Idx, Attribute::ReadOnly)), "Attributes " "'readnone and readonly' are incompatible!", V); - Assert1(!(Attrs.hasAttribute(Attribute::NoInline) && - Attrs.hasAttribute(Attribute::AlwaysInline)), "Attributes " + Assert1(!(Attrs.hasAttribute(Idx, Attribute::NoInline) && + Attrs.hasAttribute(Idx, Attribute::AlwaysInline)), "Attributes " "'noinline and alwaysinline' are incompatible!", V); - Assert1(!AttrBuilder(Attrs). - hasAttributes(Attribute::typeIncompatible(Ty)), + Assert1(!AttrBuilder(Attrs, Idx). 
+ hasAttributes(AttributeFuncs::typeIncompatible(Ty, Idx), Idx), "Wrong types for attribute: " + - Attribute::typeIncompatible(Ty).getAsString(), V); + AttributeFuncs::typeIncompatible(Ty, Idx).getAsString(Idx), V); if (PointerType *PTy = dyn_cast<PointerType>(Ty)) - Assert1(!Attrs.hasAttribute(Attribute::ByVal) || + Assert1(!Attrs.hasAttribute(Idx, Attribute::ByVal) || PTy->getElementType()->isSized(), "Attribute 'byval' does not support unsized types!", V); else - Assert1(!Attrs.hasAttribute(Attribute::ByVal), + Assert1(!Attrs.hasAttribute(Idx, Attribute::ByVal), "Attribute 'byval' only applies to parameters with pointer type!", V); } @@ -613,75 +721,97 @@ void Verifier::VerifyFunctionAttrs(FunctionType *FT, bool SawNest = false; for (unsigned i = 0, e = Attrs.getNumSlots(); i != e; ++i) { - const AttributeWithIndex &Attr = Attrs.getSlot(i); + unsigned Index = Attrs.getSlotIndex(i); Type *Ty; - if (Attr.Index == 0) + if (Index == 0) Ty = FT->getReturnType(); - else if (Attr.Index-1 < FT->getNumParams()) - Ty = FT->getParamType(Attr.Index-1); + else if (Index-1 < FT->getNumParams()) + Ty = FT->getParamType(Index-1); else break; // VarArgs attributes, verified elsewhere. - VerifyParameterAttrs(Attr.Attrs, Ty, Attr.Index == 0, V); + VerifyParameterAttrs(Attrs, Index, Ty, Index == 0, V); - if (Attr.Attrs.hasAttribute(Attribute::Nest)) { + if (Attrs.hasAttribute(i, Attribute::Nest)) { Assert1(!SawNest, "More than one parameter has attribute nest!", V); SawNest = true; } - if (Attr.Attrs.hasAttribute(Attribute::StructRet)) - Assert1(Attr.Index == 1, "Attribute sret is not on first parameter!", V); + if (Attrs.hasAttribute(Index, Attribute::StructRet)) + Assert1(Index == 1, "Attribute sret is not on first parameter!", V); } - Attribute FAttrs = Attrs.getFnAttributes(); - AttrBuilder NotFn(FAttrs); + if (!Attrs.hasAttributes(AttributeSet::FunctionIndex)) + return; + + AttrBuilder NotFn(Attrs, AttributeSet::FunctionIndex); NotFn.removeFunctionOnlyAttrs(); - Assert1(!NotFn.hasAttributes(), "Attribute '" + - Attribute::get(V->getContext(), NotFn).getAsString() + + Assert1(NotFn.empty(), "Attributes '" + + AttributeSet::get(V->getContext(), + AttributeSet::FunctionIndex, + NotFn).getAsString(AttributeSet::FunctionIndex) + "' do not apply to the function!", V); // Check for mutually incompatible attributes. 
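// (After this rewrite every query names an explicit attribute slot:
// AttributeSet::ReturnIndex (0) for the return value, 1..N for parameters,
// and AttributeSet::FunctionIndex (~0U) for function-level attributes, e.g.
//   Attrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::NoReturn)
// instead of querying a flattened Attribute value as before.)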
- Assert1(!((FAttrs.hasAttribute(Attribute::ByVal) && - FAttrs.hasAttribute(Attribute::Nest)) || - (FAttrs.hasAttribute(Attribute::ByVal) && - FAttrs.hasAttribute(Attribute::StructRet)) || - (FAttrs.hasAttribute(Attribute::Nest) && - FAttrs.hasAttribute(Attribute::StructRet))), "Attributes " - "'byval, nest, and sret' are incompatible!", V); - - Assert1(!((FAttrs.hasAttribute(Attribute::ByVal) && - FAttrs.hasAttribute(Attribute::Nest)) || - (FAttrs.hasAttribute(Attribute::ByVal) && - FAttrs.hasAttribute(Attribute::InReg)) || - (FAttrs.hasAttribute(Attribute::Nest) && - FAttrs.hasAttribute(Attribute::InReg))), "Attributes " - "'byval, nest, and inreg' are incompatible!", V); - - Assert1(!(FAttrs.hasAttribute(Attribute::ZExt) && - FAttrs.hasAttribute(Attribute::SExt)), "Attributes " - "'zeroext and signext' are incompatible!", V); - - Assert1(!(FAttrs.hasAttribute(Attribute::ReadNone) && - FAttrs.hasAttribute(Attribute::ReadOnly)), "Attributes " - "'readnone and readonly' are incompatible!", V); - - Assert1(!(FAttrs.hasAttribute(Attribute::NoInline) && - FAttrs.hasAttribute(Attribute::AlwaysInline)), "Attributes " - "'noinline and alwaysinline' are incompatible!", V); + Assert1(!((Attrs.hasAttribute(AttributeSet::FunctionIndex, + Attribute::ByVal) && + Attrs.hasAttribute(AttributeSet::FunctionIndex, + Attribute::Nest)) || + (Attrs.hasAttribute(AttributeSet::FunctionIndex, + Attribute::ByVal) && + Attrs.hasAttribute(AttributeSet::FunctionIndex, + Attribute::StructRet)) || + (Attrs.hasAttribute(AttributeSet::FunctionIndex, + Attribute::Nest) && + Attrs.hasAttribute(AttributeSet::FunctionIndex, + Attribute::StructRet))), + "Attributes 'byval, nest, and sret' are incompatible!", V); + + Assert1(!((Attrs.hasAttribute(AttributeSet::FunctionIndex, + Attribute::ByVal) && + Attrs.hasAttribute(AttributeSet::FunctionIndex, + Attribute::Nest)) || + (Attrs.hasAttribute(AttributeSet::FunctionIndex, + Attribute::ByVal) && + Attrs.hasAttribute(AttributeSet::FunctionIndex, + Attribute::InReg)) || + (Attrs.hasAttribute(AttributeSet::FunctionIndex, + Attribute::Nest) && + Attrs.hasAttribute(AttributeSet::FunctionIndex, + Attribute::InReg))), + "Attributes 'byval, nest, and inreg' are incompatible!", V); + + Assert1(!(Attrs.hasAttribute(AttributeSet::FunctionIndex, + Attribute::ZExt) && + Attrs.hasAttribute(AttributeSet::FunctionIndex, + Attribute::SExt)), + "Attributes 'zeroext and signext' are incompatible!", V); + + Assert1(!(Attrs.hasAttribute(AttributeSet::FunctionIndex, + Attribute::ReadNone) && + Attrs.hasAttribute(AttributeSet::FunctionIndex, + Attribute::ReadOnly)), + "Attributes 'readnone and readonly' are incompatible!", V); + + Assert1(!(Attrs.hasAttribute(AttributeSet::FunctionIndex, + Attribute::NoInline) && + Attrs.hasAttribute(AttributeSet::FunctionIndex, + Attribute::AlwaysInline)), + "Attributes 'noinline and alwaysinline' are incompatible!", V); } static bool VerifyAttributeCount(const AttributeSet &Attrs, unsigned Params) { - if (Attrs.isEmpty()) + if (Attrs.getNumSlots() == 0) return true; unsigned LastSlot = Attrs.getNumSlots() - 1; - unsigned LastIndex = Attrs.getSlot(LastSlot).Index; + unsigned LastIndex = Attrs.getSlotIndex(LastSlot); if (LastIndex <= Params - || (LastIndex == (unsigned)~0 - && (LastSlot == 0 || Attrs.getSlot(LastSlot - 1).Index <= Params))) + || (LastIndex == AttributeSet::FunctionIndex + && (LastSlot == 0 || Attrs.getSlotIndex(LastSlot - 1) <= Params))) return true; - + return false; } @@ -1231,11 +1361,10 @@ void Verifier::VerifyCallSite(CallSite CS) { if 
(FTy->isVarArg()) // Check attributes on the varargs part. for (unsigned Idx = 1 + FTy->getNumParams(); Idx <= CS.arg_size(); ++Idx) { - Attribute Attr = Attrs.getParamAttributes(Idx); - - VerifyParameterAttrs(Attr, CS.getArgument(Idx-1)->getType(), false, I); + VerifyParameterAttrs(Attrs, Idx, CS.getArgument(Idx-1)->getType(), + false, I); - Assert1(!Attr.hasAttribute(Attribute::StructRet), + Assert1(!Attrs.hasAttribute(Idx, Attribute::StructRet), "Attribute 'sret' cannot be used for vararg call arguments!", I); } @@ -1800,6 +1929,7 @@ bool Verifier::VerifyIntrinsicType(Type *Ty, case IITDescriptor::Void: return !Ty->isVoidTy(); case IITDescriptor::MMX: return !Ty->isX86_MMXTy(); case IITDescriptor::Metadata: return !Ty->isMetadataTy(); + case IITDescriptor::Half: return !Ty->isHalfTy(); case IITDescriptor::Float: return !Ty->isFloatTy(); case IITDescriptor::Double: return !Ty->isDoubleTy(); case IITDescriptor::Integer: return !Ty->isIntegerTy(D.Integer_Width); diff --git a/lib/Linker/CMakeLists.txt b/lib/Linker/CMakeLists.txt index 0b6d2f4..28f1262 100644 --- a/lib/Linker/CMakeLists.txt +++ b/lib/Linker/CMakeLists.txt @@ -1,6 +1,4 @@ add_llvm_library(LLVMLinker - LinkArchives.cpp - LinkItems.cpp LinkModules.cpp Linker.cpp ) diff --git a/lib/Linker/LLVMBuild.txt b/lib/Linker/LLVMBuild.txt index 2b4c232..0bb26d0 100644 --- a/lib/Linker/LLVMBuild.txt +++ b/lib/Linker/LLVMBuild.txt @@ -19,4 +19,4 @@ type = Library name = Linker parent = Libraries -required_libraries = Archive BitReader Core Support TransformUtils +required_libraries = Core Support TransformUtils diff --git a/lib/Linker/LinkArchives.cpp b/lib/Linker/LinkArchives.cpp deleted file mode 100644 index a35991c..0000000 --- a/lib/Linker/LinkArchives.cpp +++ /dev/null @@ -1,197 +0,0 @@ -//===- lib/Linker/LinkArchives.cpp - Link LLVM objects and libraries ------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains routines to handle linking together LLVM bitcode files, -// and to handle annoying things like static libraries. -// -//===----------------------------------------------------------------------===// - -#include "llvm/Linker.h" -#include "llvm/ADT/SetOperations.h" -#include "llvm/Bitcode/Archive.h" -#include "llvm/IR/Module.h" -#include <memory> -#include <set> -using namespace llvm; - -/// GetAllUndefinedSymbols - calculates the set of undefined symbols that still -/// exist in an LLVM module. This is a bit tricky because there may be two -/// symbols with the same name but different LLVM types that will be resolved to -/// each other but aren't currently (thus we need to treat it as resolved). -/// -/// Inputs: -/// M - The module in which to find undefined symbols. -/// -/// Outputs: -/// UndefinedSymbols - A set of C++ strings containing the name of all -/// undefined symbols. -/// -static void -GetAllUndefinedSymbols(Module *M, std::set<std::string> &UndefinedSymbols) { - std::set<std::string> DefinedSymbols; - UndefinedSymbols.clear(); - - // If the program doesn't define a main, try pulling one in from a .a file. - // This is needed for programs where the main function is defined in an - // archive, such f2c'd programs. 
- Function *Main = M->getFunction("main"); - if (Main == 0 || Main->isDeclaration()) - UndefinedSymbols.insert("main"); - - for (Module::iterator I = M->begin(), E = M->end(); I != E; ++I) - if (I->hasName()) { - if (I->isDeclaration()) - UndefinedSymbols.insert(I->getName()); - else if (!I->hasLocalLinkage()) { - assert(!I->hasDLLImportLinkage() - && "Found dllimported non-external symbol!"); - DefinedSymbols.insert(I->getName()); - } - } - - for (Module::global_iterator I = M->global_begin(), E = M->global_end(); - I != E; ++I) - if (I->hasName()) { - if (I->isDeclaration()) - UndefinedSymbols.insert(I->getName()); - else if (!I->hasLocalLinkage()) { - assert(!I->hasDLLImportLinkage() - && "Found dllimported non-external symbol!"); - DefinedSymbols.insert(I->getName()); - } - } - - for (Module::alias_iterator I = M->alias_begin(), E = M->alias_end(); - I != E; ++I) - if (I->hasName()) - DefinedSymbols.insert(I->getName()); - - // Prune out any defined symbols from the undefined symbols set... - for (std::set<std::string>::iterator I = UndefinedSymbols.begin(); - I != UndefinedSymbols.end(); ) - if (DefinedSymbols.count(*I)) - UndefinedSymbols.erase(I++); // This symbol really is defined! - else - ++I; // Keep this symbol in the undefined symbols list -} - -/// LinkInArchive - opens an archive library and link in all objects which -/// provide symbols that are currently undefined. -/// -/// Inputs: -/// Filename - The pathname of the archive. -/// -/// Return Value: -/// TRUE - An error occurred. -/// FALSE - No errors. -bool -Linker::LinkInArchive(const sys::Path &Filename, bool &is_native) { - // Make sure this is an archive file we're dealing with - if (!Filename.isArchive()) - return error("File '" + Filename.str() + "' is not an archive."); - - // Open the archive file - verbose("Linking archive file '" + Filename.str() + "'"); - - // Find all of the symbols currently undefined in the bitcode program. - // If all the symbols are defined, the program is complete, and there is - // no reason to link in any archive files. - std::set<std::string> UndefinedSymbols; - GetAllUndefinedSymbols(Composite, UndefinedSymbols); - - if (UndefinedSymbols.empty()) { - verbose("No symbols undefined, skipping library '" + Filename.str() + "'"); - return false; // No need to link anything in! - } - - std::string ErrMsg; - std::auto_ptr<Archive> AutoArch ( - Archive::OpenAndLoadSymbols(Filename, Context, &ErrMsg)); - - Archive* arch = AutoArch.get(); - - if (!arch) - return error("Cannot read archive '" + Filename.str() + - "': " + ErrMsg); - if (!arch->isBitcodeArchive()) { - is_native = true; - return false; - } - is_native = false; - - // Save a set of symbols that are not defined by the archive. Since we're - // entering a loop, there's no point searching for these multiple times. This - // variable is used to "set_subtract" from the set of undefined symbols. - std::set<std::string> NotDefinedByArchive; - - // Save the current set of undefined symbols, because we may have to make - // multiple passes over the archive: - std::set<std::string> CurrentlyUndefinedSymbols; - - do { - CurrentlyUndefinedSymbols = UndefinedSymbols; - - // Find the modules we need to link into the target module. Note that arch - // keeps ownership of these modules and may return the same Module* from a - // subsequent call. 
- SmallVector<Module*, 16> Modules; - if (!arch->findModulesDefiningSymbols(UndefinedSymbols, Modules, &ErrMsg)) - return error("Cannot find symbols in '" + Filename.str() + - "': " + ErrMsg); - - // If we didn't find any more modules to link this time, we are done - // searching this archive. - if (Modules.empty()) - break; - - // Any symbols remaining in UndefinedSymbols after - // findModulesDefiningSymbols are ones that the archive does not define. So - // we add them to the NotDefinedByArchive variable now. - NotDefinedByArchive.insert(UndefinedSymbols.begin(), - UndefinedSymbols.end()); - - // Loop over all the Modules that we got back from the archive - for (SmallVectorImpl<Module*>::iterator I=Modules.begin(), E=Modules.end(); - I != E; ++I) { - - // Get the module we must link in. - std::string moduleErrorMsg; - Module* aModule = *I; - if (aModule != NULL) { - if (aModule->MaterializeAll(&moduleErrorMsg)) - return error("Could not load a module: " + moduleErrorMsg); - - verbose(" Linking in module: " + aModule->getModuleIdentifier()); - - // Link it in - if (LinkInModule(aModule, &moduleErrorMsg)) - return error("Cannot link in module '" + - aModule->getModuleIdentifier() + "': " + moduleErrorMsg); - } - } - - // Get the undefined symbols from the aggregate module. This recomputes the - // symbols we still need after the new modules have been linked in. - GetAllUndefinedSymbols(Composite, UndefinedSymbols); - - // At this point we have two sets of undefined symbols: UndefinedSymbols - // which holds the undefined symbols from all the modules, and - // NotDefinedByArchive which holds symbols we know the archive doesn't - // define. There's no point searching for symbols that we won't find in the - // archive so we subtract these sets. - set_subtract(UndefinedSymbols, NotDefinedByArchive); - - // If there's no symbols left, no point in continuing to search the - // archive. - if (UndefinedSymbols.empty()) - break; - } while (CurrentlyUndefinedSymbols != UndefinedSymbols); - - return false; -} diff --git a/lib/Linker/LinkItems.cpp b/lib/Linker/LinkItems.cpp deleted file mode 100644 index 8c6ed42..0000000 --- a/lib/Linker/LinkItems.cpp +++ /dev/null @@ -1,216 +0,0 @@ -//===- lib/Linker/LinkItems.cpp - Link LLVM objects and libraries ---------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains routines to handle linking together LLVM bitcode files, -// and to handle annoying things like static libraries. -// -//===----------------------------------------------------------------------===// - -#include "llvm/Linker.h" -#include "llvm/Bitcode/ReaderWriter.h" -#include "llvm/IR/Module.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/MemoryBuffer.h" -#include "llvm/Support/Path.h" -#include "llvm/Support/system_error.h" -using namespace llvm; - -// LinkItems - This function is the main entry point into linking. It takes a -// list of LinkItem which indicates the order the files should be linked and -// how each file should be treated (plain file or with library search). The -// function only links bitcode and produces a result list of items that are -// native objects. -bool -Linker::LinkInItems(const ItemList& Items, ItemList& NativeItems) { - // Clear the NativeItems just in case - NativeItems.clear(); - - // For each linkage item ... 
- for (ItemList::const_iterator I = Items.begin(), E = Items.end(); - I != E; ++I) { - if (I->second) { - // Link in the library suggested. - bool is_native = false; - if (LinkInLibrary(I->first, is_native)) - return true; - if (is_native) - NativeItems.push_back(*I); - } else { - // Link in the file suggested - bool is_native = false; - if (LinkInFile(sys::Path(I->first), is_native)) - return true; - if (is_native) - NativeItems.push_back(*I); - } - } - - return false; -} - - -/// LinkInLibrary - links one library into the HeadModule. -/// -bool Linker::LinkInLibrary(StringRef Lib, bool& is_native) { - is_native = false; - // Determine where this library lives. - sys::Path Pathname = FindLib(Lib); - if (Pathname.isEmpty()) - return error("Cannot find library '" + Lib.str() + "'"); - - // If its an archive, try to link it in - std::string Magic; - Pathname.getMagicNumber(Magic, 64); - switch (sys::IdentifyFileType(Magic.c_str(), 64)) { - default: llvm_unreachable("Bad file type identification"); - case sys::Unknown_FileType: - return warning("Supposed library '" + Lib.str() + "' isn't a library."); - - case sys::Bitcode_FileType: - // LLVM ".so" file. - if (LinkInFile(Pathname, is_native)) - return true; - break; - - case sys::Archive_FileType: - if (LinkInArchive(Pathname, is_native)) - return error("Cannot link archive '" + Pathname.str() + "'"); - break; - - case sys::ELF_Relocatable_FileType: - case sys::ELF_SharedObject_FileType: - case sys::Mach_O_Object_FileType: - case sys::Mach_O_FixedVirtualMemorySharedLib_FileType: - case sys::Mach_O_DynamicallyLinkedSharedLib_FileType: - case sys::Mach_O_DynamicallyLinkedSharedLibStub_FileType: - case sys::COFF_FileType: - is_native = true; - break; - } - return false; -} - -/// LinkLibraries - takes the specified library files and links them into the -/// main bitcode object file. -/// -/// Inputs: -/// Libraries - The list of libraries to link into the module. -/// -/// Return value: -/// FALSE - No error. -/// TRUE - Error. -/// -bool Linker::LinkInLibraries(const std::vector<std::string> &Libraries) { - - // Process the set of libraries we've been provided. - bool is_native = false; - for (unsigned i = 0; i < Libraries.size(); ++i) - if (LinkInLibrary(Libraries[i], is_native)) - return true; - - return false; -} - -/// LinkInFile - opens a bitcode file and links in all objects which -/// provide symbols that are currently undefined. -/// -/// Inputs: -/// File - The pathname of the bitcode file. -/// -/// Outputs: -/// ErrorMessage - A C++ string detailing what error occurred, if any. -/// -/// Return Value: -/// TRUE - An error occurred. -/// FALSE - No errors. -/// -bool Linker::LinkInFile(const sys::Path &File, bool &is_native) { - is_native = false; - - // Check for a file of name "-", which means "read standard input" - if (File.str() == "-") { - std::auto_ptr<Module> M; - OwningPtr<MemoryBuffer> Buffer; - error_code ec; - if (!(ec = MemoryBuffer::getSTDIN(Buffer))) { - if (!Buffer->getBufferSize()) { - Error = "standard input is empty"; - } else { - M.reset(ParseBitcodeFile(Buffer.get(), Context, &Error)); - if (M.get()) - if (!LinkInModule(M.get(), &Error)) - return false; - } - } - return error("Cannot link stdin: " + ec.message()); - } - - // Determine what variety of file it is. 
- std::string Magic; - if (!File.getMagicNumber(Magic, 64)) - return error("Cannot find linker input '" + File.str() + "'"); - - switch (sys::IdentifyFileType(Magic.c_str(), 64)) { - default: llvm_unreachable("Bad file type identification"); - case sys::Unknown_FileType: - return warning("Ignoring file '" + File.str() + - "' because does not contain bitcode."); - - case sys::Archive_FileType: - // A user may specify an ar archive without -l, perhaps because it - // is not installed as a library. Detect that and link the archive. - if (LinkInArchive(File, is_native)) - return true; - break; - - case sys::Bitcode_FileType: { - verbose("Linking bitcode file '" + File.str() + "'"); - std::auto_ptr<Module> M(LoadObject(File)); - if (M.get() == 0) - return error("Cannot load file '" + File.str() + "': " + Error); - if (LinkInModule(M.get(), &Error)) - return error("Cannot link file '" + File.str() + "': " + Error); - - verbose("Linked in file '" + File.str() + "'"); - break; - } - - case sys::ELF_Relocatable_FileType: - case sys::ELF_SharedObject_FileType: - case sys::Mach_O_Object_FileType: - case sys::Mach_O_FixedVirtualMemorySharedLib_FileType: - case sys::Mach_O_DynamicallyLinkedSharedLib_FileType: - case sys::Mach_O_DynamicallyLinkedSharedLibStub_FileType: - case sys::COFF_FileType: - is_native = true; - break; - } - return false; -} - -/// LinkFiles - takes a module and a list of files and links them all together. -/// It locates the file either in the current directory, as its absolute -/// or relative pathname, or as a file somewhere in LLVM_LIB_SEARCH_PATH. -/// -/// Inputs: -/// Files - A vector of sys::Path indicating the LLVM bitcode filenames -/// to be linked. The names can refer to a mixture of pure LLVM -/// bitcode files and archive (ar) formatted files. -/// -/// Return value: -/// FALSE - No errors. -/// TRUE - Some error occurred. -/// -bool Linker::LinkInFiles(const std::vector<sys::Path> &Files) { - bool is_native; - for (unsigned i = 0; i < Files.size(); ++i) - if (LinkInFile(Files[i], is_native)) - return true; - return false; -} diff --git a/lib/Linker/LinkModules.cpp b/lib/Linker/LinkModules.cpp index e973919..c358a0a 100644 --- a/lib/Linker/LinkModules.cpp +++ b/lib/Linker/LinkModules.cpp @@ -180,7 +180,7 @@ bool TypeMapTy::areTypesIsomorphic(Type *DstTy, Type *SrcTy) { if (DATy->getNumElements() != cast<ArrayType>(SrcTy)->getNumElements()) return false; } else if (VectorType *DVTy = dyn_cast<VectorType>(DstTy)) { - if (DVTy->getNumElements() != cast<ArrayType>(SrcTy)->getNumElements()) + if (DVTy->getNumElements() != cast<VectorType>(SrcTy)->getNumElements()) return false; } @@ -421,13 +421,6 @@ namespace { } void computeTypeMapping(); - bool categorizeModuleFlagNodes(const NamedMDNode *ModFlags, - DenseMap<MDString*, MDNode*> &ErrorNode, - DenseMap<MDString*, MDNode*> &WarningNode, - DenseMap<MDString*, MDNode*> &OverrideNode, - DenseMap<MDString*, - SmallSetVector<MDNode*, 8> > &RequireNodes, - SmallSetVector<MDString*, 16> &SeenIDs); bool linkAppendingVarProto(GlobalVariable *DstGV, GlobalVariable *SrcGV); bool linkGlobalProto(GlobalVariable *SrcGV); @@ -613,7 +606,8 @@ void ModuleLinker::computeTypeMapping() { // Check to see if there is a dot in the name followed by a digit. size_t DotPos = ST->getName().rfind('.'); if (DotPos == 0 || DotPos == StringRef::npos || - ST->getName().back() == '.' || !isdigit(ST->getName()[DotPos+1])) + ST->getName().back() == '.' 
|| + !isdigit(static_cast<unsigned char>(ST->getName()[DotPos+1]))) continue; // Check to see if the destination module has a struct with the prefix name. @@ -987,76 +981,16 @@ void ModuleLinker::linkNamedMDNodes() { } } -/// categorizeModuleFlagNodes - Categorize the module flags according to their -/// type: Error, Warning, Override, and Require. -bool ModuleLinker:: -categorizeModuleFlagNodes(const NamedMDNode *ModFlags, - DenseMap<MDString*, MDNode*> &ErrorNode, - DenseMap<MDString*, MDNode*> &WarningNode, - DenseMap<MDString*, MDNode*> &OverrideNode, - DenseMap<MDString*, - SmallSetVector<MDNode*, 8> > &RequireNodes, - SmallSetVector<MDString*, 16> &SeenIDs) { - bool HasErr = false; - - for (unsigned I = 0, E = ModFlags->getNumOperands(); I != E; ++I) { - MDNode *Op = ModFlags->getOperand(I); - assert(Op->getNumOperands() == 3 && "Invalid module flag metadata!"); - assert(isa<ConstantInt>(Op->getOperand(0)) && - "Module flag's first operand must be an integer!"); - assert(isa<MDString>(Op->getOperand(1)) && - "Module flag's second operand must be an MDString!"); - - ConstantInt *Behavior = cast<ConstantInt>(Op->getOperand(0)); - MDString *ID = cast<MDString>(Op->getOperand(1)); - Value *Val = Op->getOperand(2); - switch (Behavior->getZExtValue()) { - default: - assert(false && "Invalid behavior in module flag metadata!"); - break; - case Module::Error: { - MDNode *&ErrNode = ErrorNode[ID]; - if (!ErrNode) ErrNode = Op; - if (ErrNode->getOperand(2) != Val) - HasErr = emitError("linking module flags '" + ID->getString() + - "': IDs have conflicting values"); - break; - } - case Module::Warning: { - MDNode *&WarnNode = WarningNode[ID]; - if (!WarnNode) WarnNode = Op; - if (WarnNode->getOperand(2) != Val) - errs() << "WARNING: linking module flags '" << ID->getString() - << "': IDs have conflicting values"; - break; - } - case Module::Require: RequireNodes[ID].insert(Op); break; - case Module::Override: { - MDNode *&OvrNode = OverrideNode[ID]; - if (!OvrNode) OvrNode = Op; - if (OvrNode->getOperand(2) != Val) - HasErr = emitError("linking module flags '" + ID->getString() + - "': IDs have conflicting override values"); - break; - } - } - - SeenIDs.insert(ID); - } - - return HasErr; -} - /// linkModuleFlagsMetadata - Merge the linker flags in Src into the Dest /// module. bool ModuleLinker::linkModuleFlagsMetadata() { + // If the source module has no module flags, we are done. const NamedMDNode *SrcModFlags = SrcM->getModuleFlagsMetadata(); if (!SrcModFlags) return false; - NamedMDNode *DstModFlags = DstM->getOrInsertModuleFlagsMetadata(); - // If the destination module doesn't have module flags yet, then just copy // over the source module's flags. + NamedMDNode *DstModFlags = DstM->getOrInsertModuleFlagsMetadata(); if (DstModFlags->getNumOperands() == 0) { for (unsigned I = 0, E = SrcModFlags->getNumOperands(); I != E; ++I) DstModFlags->addOperand(SrcModFlags->getOperand(I)); @@ -1064,89 +998,137 @@ bool ModuleLinker::linkModuleFlagsMetadata() { return false; } - bool HasErr = false; + // First build a map of the existing module flags and requirements. + DenseMap<MDString*, MDNode*> Flags; + SmallSetVector<MDNode*, 16> Requirements; + for (unsigned I = 0, E = DstModFlags->getNumOperands(); I != E; ++I) { + MDNode *Op = DstModFlags->getOperand(I); + ConstantInt *Behavior = cast<ConstantInt>(Op->getOperand(0)); + MDString *ID = cast<MDString>(Op->getOperand(1)); - // Otherwise, we have to merge them based on their behaviors. 
First, - // categorize all of the nodes in the modules' module flags. If an error or - // warning occurs, then emit the appropriate message(s). - DenseMap<MDString*, MDNode*> ErrorNode; - DenseMap<MDString*, MDNode*> WarningNode; - DenseMap<MDString*, MDNode*> OverrideNode; - DenseMap<MDString*, SmallSetVector<MDNode*, 8> > RequireNodes; - SmallSetVector<MDString*, 16> SeenIDs; - - HasErr |= categorizeModuleFlagNodes(SrcModFlags, ErrorNode, WarningNode, - OverrideNode, RequireNodes, SeenIDs); - HasErr |= categorizeModuleFlagNodes(DstModFlags, ErrorNode, WarningNode, - OverrideNode, RequireNodes, SeenIDs); - - // Check that there isn't both an error and warning node for a flag. - for (SmallSetVector<MDString*, 16>::iterator - I = SeenIDs.begin(), E = SeenIDs.end(); I != E; ++I) { - MDString *ID = *I; - if (ErrorNode[ID] && WarningNode[ID]) - HasErr = emitError("linking module flags '" + ID->getString() + - "': IDs have conflicting behaviors"); + if (Behavior->getZExtValue() == Module::Require) { + Requirements.insert(cast<MDNode>(Op->getOperand(2))); + } else { + Flags[ID] = Op; + } } - // Early exit if we had an error. - if (HasErr) return true; - - // Get the destination's module flags ready for new operands. - DstModFlags->dropAllReferences(); - - // Add all of the module flags to the destination module. - DenseMap<MDString*, SmallVector<MDNode*, 4> > AddedNodes; - for (SmallSetVector<MDString*, 16>::iterator - I = SeenIDs.begin(), E = SeenIDs.end(); I != E; ++I) { - MDString *ID = *I; - if (OverrideNode[ID]) { - DstModFlags->addOperand(OverrideNode[ID]); - AddedNodes[ID].push_back(OverrideNode[ID]); - } else if (ErrorNode[ID]) { - DstModFlags->addOperand(ErrorNode[ID]); - AddedNodes[ID].push_back(ErrorNode[ID]); - } else if (WarningNode[ID]) { - DstModFlags->addOperand(WarningNode[ID]); - AddedNodes[ID].push_back(WarningNode[ID]); + // Merge in the flags from the source module, and also collect its set of + // requirements. + bool HasErr = false; + for (unsigned I = 0, E = SrcModFlags->getNumOperands(); I != E; ++I) { + MDNode *SrcOp = SrcModFlags->getOperand(I); + ConstantInt *SrcBehavior = cast<ConstantInt>(SrcOp->getOperand(0)); + MDString *ID = cast<MDString>(SrcOp->getOperand(1)); + MDNode *DstOp = Flags.lookup(ID); + unsigned SrcBehaviorValue = SrcBehavior->getZExtValue(); + + // If this is a requirement, add it and continue. + if (SrcBehaviorValue == Module::Require) { + // If the destination module does not already have this requirement, add + // it. + if (Requirements.insert(cast<MDNode>(SrcOp->getOperand(2)))) { + DstModFlags->addOperand(SrcOp); + } + continue; + } + + // If there is no existing flag with this ID, just add it. + if (!DstOp) { + Flags[ID] = SrcOp; + DstModFlags->addOperand(SrcOp); + continue; } - for (SmallSetVector<MDNode*, 8>::iterator - II = RequireNodes[ID].begin(), IE = RequireNodes[ID].end(); - II != IE; ++II) - DstModFlags->addOperand(*II); - } + // Otherwise, perform a merge. + ConstantInt *DstBehavior = cast<ConstantInt>(DstOp->getOperand(0)); + unsigned DstBehaviorValue = DstBehavior->getZExtValue(); + + // If either flag has override behavior, handle it first. + if (DstBehaviorValue == Module::Override) { + // Diagnose inconsistent flags which both have override behavior. 
+ if (SrcBehaviorValue == Module::Override && + SrcOp->getOperand(2) != DstOp->getOperand(2)) { + HasErr |= emitError("linking module flags '" + ID->getString() + + "': IDs have conflicting override values"); + } + continue; + } else if (SrcBehaviorValue == Module::Override) { + // Update the destination flag to that of the source. + DstOp->replaceOperandWith(0, SrcBehavior); + DstOp->replaceOperandWith(2, SrcOp->getOperand(2)); + continue; + } - // Now check that all of the requirements have been satisfied. - for (SmallSetVector<MDString*, 16>::iterator - I = SeenIDs.begin(), E = SeenIDs.end(); I != E; ++I) { - MDString *ID = *I; - SmallSetVector<MDNode*, 8> &Set = RequireNodes[ID]; - - for (SmallSetVector<MDNode*, 8>::iterator - II = Set.begin(), IE = Set.end(); II != IE; ++II) { - MDNode *Node = *II; - assert(isa<MDNode>(Node->getOperand(2)) && - "Module flag's third operand must be an MDNode!"); - MDNode *Val = cast<MDNode>(Node->getOperand(2)); - - MDString *ReqID = cast<MDString>(Val->getOperand(0)); - Value *ReqVal = Val->getOperand(1); - - bool HasValue = false; - for (SmallVectorImpl<MDNode*>::iterator - RI = AddedNodes[ReqID].begin(), RE = AddedNodes[ReqID].end(); - RI != RE; ++RI) { - MDNode *ReqNode = *RI; - if (ReqNode->getOperand(2) == ReqVal) { - HasValue = true; - break; - } + // Diagnose inconsistent merge behavior types. + if (SrcBehaviorValue != DstBehaviorValue) { + HasErr |= emitError("linking module flags '" + ID->getString() + + "': IDs have conflicting behaviors"); + continue; + } + + // Perform the merge for standard behavior types. + switch (SrcBehaviorValue) { + case Module::Require: + case Module::Override: assert(0 && "not possible"); break; + case Module::Error: { + // Emit an error if the values differ. + if (SrcOp->getOperand(2) != DstOp->getOperand(2)) { + HasErr |= emitError("linking module flags '" + ID->getString() + + "': IDs have conflicting values"); } + continue; + } + case Module::Warning: { + // Emit a warning if the values differ. + if (SrcOp->getOperand(2) != DstOp->getOperand(2)) { + errs() << "WARNING: linking module flags '" << ID->getString() + << "': IDs have conflicting values"; + } + continue; + } + case Module::Append: { + MDNode *DstValue = cast<MDNode>(DstOp->getOperand(2)); + MDNode *SrcValue = cast<MDNode>(SrcOp->getOperand(2)); + unsigned NumOps = DstValue->getNumOperands() + SrcValue->getNumOperands(); + Value **VP, **Values = VP = new Value*[NumOps]; + for (unsigned i = 0, e = DstValue->getNumOperands(); i != e; ++i, ++VP) + *VP = DstValue->getOperand(i); + for (unsigned i = 0, e = SrcValue->getNumOperands(); i != e; ++i, ++VP) + *VP = SrcValue->getOperand(i); + DstOp->replaceOperandWith(2, MDNode::get(DstM->getContext(), + ArrayRef<Value*>(Values, + NumOps))); + delete[] Values; + break; + } + case Module::AppendUnique: { + SmallSetVector<Value*, 16> Elts; + MDNode *DstValue = cast<MDNode>(DstOp->getOperand(2)); + MDNode *SrcValue = cast<MDNode>(SrcOp->getOperand(2)); + for (unsigned i = 0, e = DstValue->getNumOperands(); i != e; ++i) + Elts.insert(DstValue->getOperand(i)); + for (unsigned i = 0, e = SrcValue->getNumOperands(); i != e; ++i) + Elts.insert(SrcValue->getOperand(i)); + DstOp->replaceOperandWith(2, MDNode::get(DstM->getContext(), + ArrayRef<Value*>(Elts.begin(), + Elts.end()))); + break; + } + } + } - if (!HasValue) - HasErr = emitError("linking module flags '" + ReqID->getString() + - "': does not have the required value"); + // Check all of the requirements. 
+ for (unsigned I = 0, E = Requirements.size(); I != E; ++I) { + MDNode *Requirement = Requirements[I]; + MDString *Flag = cast<MDString>(Requirement->getOperand(0)); + Value *ReqValue = Requirement->getOperand(1); + + MDNode *Op = Flags[Flag]; + if (!Op || Op->getOperand(2) != ReqValue) { + HasErr |= emitError("linking module flags '" + Flag->getString() + + "': does not have the required value"); + continue; } } diff --git a/lib/Linker/Linker.cpp b/lib/Linker/Linker.cpp index a30363d..c8ea8ff 100644 --- a/lib/Linker/Linker.cpp +++ b/lib/Linker/Linker.cpp @@ -89,93 +89,3 @@ Linker::releaseModule() { Flags = 0; return result; } - -// LoadObject - Read in and parse the bitcode file named by FN and return the -// module it contains (wrapped in an auto_ptr), or auto_ptr<Module>() and set -// Error if an error occurs. -std::auto_ptr<Module> -Linker::LoadObject(const sys::Path &FN) { - std::string ParseErrorMessage; - Module *Result = 0; - - OwningPtr<MemoryBuffer> Buffer; - if (error_code ec = MemoryBuffer::getFileOrSTDIN(FN.c_str(), Buffer)) - ParseErrorMessage = "Error reading file '" + FN.str() + "'" + ": " - + ec.message(); - else - Result = ParseBitcodeFile(Buffer.get(), Context, &ParseErrorMessage); - - if (Result) - return std::auto_ptr<Module>(Result); - Error = "Bitcode file '" + FN.str() + "' could not be loaded"; - if (ParseErrorMessage.size()) - Error += ": " + ParseErrorMessage; - return std::auto_ptr<Module>(); -} - -// IsLibrary - Determine if "Name" is a library in "Directory". Return -// a non-empty sys::Path if its found, an empty one otherwise. -static inline sys::Path IsLibrary(StringRef Name, - const sys::Path &Directory) { - - sys::Path FullPath(Directory); - - // Try the libX.a form - FullPath.appendComponent(("lib" + Name).str()); - FullPath.appendSuffix("a"); - if (FullPath.isArchive()) - return FullPath; - - // Try the libX.bca form - FullPath.eraseSuffix(); - FullPath.appendSuffix("bca"); - if (FullPath.isArchive()) - return FullPath; - - // Try the libX.so (or .dylib) form - FullPath.eraseSuffix(); - FullPath.appendSuffix(sys::Path::GetDLLSuffix()); - if (FullPath.isDynamicLibrary()) // Native shared library? - return FullPath; - if (FullPath.isBitcodeFile()) // .so file containing bitcode? - return FullPath; - - // Try libX form, to make it possible to add dependency on the - // specific version of .so, like liblzma.so.1.0.0 - FullPath.eraseSuffix(); - if (FullPath.isDynamicLibrary()) // Native shared library? - return FullPath; - if (FullPath.isBitcodeFile()) // .so file containing bitcode? - return FullPath; - - // Not found .. fall through - - // Indicate that the library was not found in the directory. - FullPath.clear(); - return FullPath; -} - -/// FindLib - Try to convert Filename into the name of a file that we can open, -/// if it does not already name a file we can open, by first trying to open -/// Filename, then libFilename.[suffix] for each of a set of several common -/// library suffixes, in each of the directories in LibPaths. Returns an empty -/// Path if no matching file can be found. -/// -sys::Path -Linker::FindLib(StringRef Filename) { - // Determine if the pathname can be found as it stands. - sys::Path FilePath(Filename); - if (FilePath.canRead() && - (FilePath.isArchive() || FilePath.isDynamicLibrary())) - return FilePath; - - // Iterate over the directories in Paths to see if we can find the library - // there. 
- for (unsigned Index = 0; Index != LibPaths.size(); ++Index) { - sys::Path Directory(LibPaths[Index]); - sys::Path FullPath = IsLibrary(Filename, Directory); - if (!FullPath.isEmpty()) - return FullPath; - } - return sys::Path(); -} diff --git a/lib/MC/ELFObjectWriter.cpp b/lib/MC/ELFObjectWriter.cpp index bfe1709..0b97f27 100644 --- a/lib/MC/ELFObjectWriter.cpp +++ b/lib/MC/ELFObjectWriter.cpp @@ -135,16 +135,14 @@ class ELFObjectWriter : public MCObjectWriter { const MCSymbol *undefinedExplicitRelSym(const MCValue &Target, const MCFixup &Fixup, bool IsPCRel) const { - return TargetObjectWriter->undefinedExplicitRelSym(Target, Fixup, IsPCRel); + return TargetObjectWriter->undefinedExplicitRelSym(Target, Fixup, + IsPCRel); } bool is64Bit() const { return TargetObjectWriter->is64Bit(); } bool hasRelocationAddend() const { return TargetObjectWriter->hasRelocationAddend(); } - unsigned getEFlags() const { - return TargetObjectWriter->getEFlags(); - } unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup, bool IsPCRel, bool IsRelocWithSymbol, int64_t Addend) const { @@ -152,7 +150,6 @@ class ELFObjectWriter : public MCObjectWriter { IsRelocWithSymbol, Addend); } - public: ELFObjectWriter(MCELFObjectTargetWriter *MOTW, raw_ostream &_OS, bool IsLittleEndian) @@ -233,7 +230,8 @@ class ELFObjectWriter : public MCObjectWriter { F.getContents().append(&buf[0], &buf[8]); } - void WriteHeader(uint64_t SectionDataSize, + void WriteHeader(const MCAssembler &Asm, + uint64_t SectionDataSize, unsigned NumberOfSections); void WriteSymbolEntry(MCDataFragment *SymtabF, @@ -373,7 +371,8 @@ ELFObjectWriter::~ELFObjectWriter() {} // Emit the ELF header. -void ELFObjectWriter::WriteHeader(uint64_t SectionDataSize, +void ELFObjectWriter::WriteHeader(const MCAssembler &Asm, + uint64_t SectionDataSize, unsigned NumberOfSections) { // ELF Header // ---------- @@ -411,7 +410,7 @@ void ELFObjectWriter::WriteHeader(uint64_t SectionDataSize, sizeof(ELF::Elf32_Ehdr))); // e_shoff = sec hdr table off in bytes // e_flags = whatever the target wants - Write32(getEFlags()); + Write32(Asm.getELFHeaderEFlags()); // e_ehsize = ELF header size Write16(is64Bit() ? sizeof(ELF::Elf64_Ehdr) : sizeof(ELF::Elf32_Ehdr)); @@ -547,12 +546,17 @@ void ELFObjectWriter::WriteSymbol(MCDataFragment *SymtabF, bool IsReserved = Data.isCommon() || Data.getSymbol().isAbsolute() || Data.getSymbol().isVariable(); + // Binding and Type share the same byte as upper and lower nibbles uint8_t Binding = MCELF::GetBinding(OrigData); - uint8_t Visibility = MCELF::GetVisibility(OrigData); uint8_t Type = MCELF::GetType(Data); - uint8_t Info = (Binding << ELF_STB_Shift) | (Type << ELF_STT_Shift); - uint8_t Other = Visibility; + + // Other and Visibility share the same byte, with Visibility using the lower + // 2 bits + uint8_t Visibility = MCELF::GetVisibility(OrigData); + uint8_t Other = MCELF::getOther(OrigData) << + (ELF_Other_Shift - ELF_STV_Shift); + Other |= Visibility; uint64_t Value = SymbolValue(Data, Layout); uint64_t Size = 0; @@ -865,7 +869,7 @@ void ELFObjectWriter::ComputeSymbolTable(MCAssembler &Asm, // FIXME: Is this the correct place to do this? // FIXME: Why is an undefined reference to _GLOBAL_OFFSET_TABLE_ needed?
if (NeedsGOT) { - llvm::StringRef Name = "_GLOBAL_OFFSET_TABLE_"; + StringRef Name = "_GLOBAL_OFFSET_TABLE_"; MCSymbol *Sym = Asm.getContext().GetOrCreateSymbol(Name); MCSymbolData &Data = Asm.getOrCreateSymbolData(*Sym); Data.setExternal(true); @@ -1319,6 +1323,8 @@ void ELFObjectWriter::WriteSection(MCAssembler &Asm, case ELF::SHT_FINI_ARRAY: case ELF::SHT_PREINIT_ARRAY: case ELF::SHT_X86_64_UNWIND: + case ELF::SHT_MIPS_REGINFO: + case ELF::SHT_MIPS_OPTIONS: // Nothing to do. break; @@ -1332,6 +1338,24 @@ void ELFObjectWriter::WriteSection(MCAssembler &Asm, break; } + if (TargetObjectWriter->getEMachine() == ELF::EM_ARM && + Section.getType() == ELF::SHT_ARM_EXIDX) { + StringRef SecName(Section.getSectionName()); + if (SecName == ".ARM.exidx") { + sh_link = SectionIndexMap.lookup( + Asm.getContext().getELFSection(".text", + ELF::SHT_PROGBITS, + ELF::SHF_EXECINSTR | ELF::SHF_ALLOC, + SectionKind::getText())); + } else if (SecName.startswith(".ARM.exidx")) { + sh_link = SectionIndexMap.lookup( + Asm.getContext().getELFSection(SecName.substr(sizeof(".ARM.exidx") - 1), + ELF::SHT_PROGBITS, + ELF::SHF_EXECINSTR | ELF::SHF_ALLOC, + SectionKind::getText())); + } + } + WriteSecHdrEntry(SectionStringTableIndex[&Section], Section.getType(), Section.getFlags(), 0, Offset, Size, sh_link, sh_info, Alignment, Section.getEntrySize()); @@ -1532,7 +1556,7 @@ void ELFObjectWriter::WriteObject(MCAssembler &Asm, } // Write out the ELF header ... - WriteHeader(SectionHeaderOffset, NumSections + 1); + WriteHeader(Asm, SectionHeaderOffset, NumSections + 1); // ... then the regular sections ... // + because of .shstrtab diff --git a/lib/MC/MCAsmInfo.cpp b/lib/MC/MCAsmInfo.cpp index a6fa658..51bb435 100644 --- a/lib/MC/MCAsmInfo.cpp +++ b/lib/MC/MCAsmInfo.cpp @@ -24,6 +24,8 @@ using namespace llvm; MCAsmInfo::MCAsmInfo() { PointerSize = 4; + CalleeSaveStackSlotSize = 4; + IsLittleEndian = true; StackGrowsUp = false; HasSubsectionsViaSymbols = false; diff --git a/lib/MC/MCAsmStreamer.cpp b/lib/MC/MCAsmStreamer.cpp index e234dfe..7eb7202 100644 --- a/lib/MC/MCAsmStreamer.cpp +++ b/lib/MC/MCAsmStreamer.cpp @@ -71,7 +71,7 @@ public: MCInstPrinter *printer, MCCodeEmitter *emitter, MCAsmBackend *asmbackend, bool showInst) - : MCStreamer(Context), OS(os), MAI(Context.getAsmInfo()), + : MCStreamer(SK_AsmStreamer, Context), OS(os), MAI(Context.getAsmInfo()), InstPrinter(printer), Emitter(emitter), AsmBackend(asmbackend), CommentStream(CommentToEmit), IsVerboseAsm(isVerboseAsm), ShowInst(showInst), UseLoc(useLoc), UseCFI(useCFI), @@ -127,11 +127,16 @@ public: virtual void ChangeSection(const MCSection *Section); virtual void InitSections() { + InitToTextSection(); + } + + virtual void InitToTextSection() { // FIXME, this is MachO specific, but the testsuite // expects this. 
- SwitchSection(getContext().getMachOSection("__TEXT", "__text", - MCSectionMachO::S_ATTR_PURE_INSTRUCTIONS, - 0, SectionKind::getText())); + SwitchSection(getContext().getMachOSection( + "__TEXT", "__text", + MCSectionMachO::S_ATTR_PURE_INSTRUCTIONS, + 0, SectionKind::getText())); } virtual void EmitLabel(MCSymbol *Symbol); @@ -140,6 +145,7 @@ public: virtual void EmitEHSymAttributes(const MCSymbol *Symbol, MCSymbol *EHSymbol); virtual void EmitAssemblerFlag(MCAssemblerFlag Flag); + virtual void EmitLinkerOptions(ArrayRef<std::string> Options); virtual void EmitDataRegion(MCDataRegionType Kind); virtual void EmitThumbFunc(MCSymbol *Func); @@ -271,6 +277,10 @@ public: virtual void FinishImpl(); /// @} + + static bool classof(const MCStreamer *S) { + return S->getKind() == SK_AsmStreamer; + } }; } // end anonymous namespace. @@ -370,6 +380,16 @@ void MCAsmStreamer::EmitAssemblerFlag(MCAssemblerFlag Flag) { EmitEOL(); } +void MCAsmStreamer::EmitLinkerOptions(ArrayRef<std::string> Options) { + assert(!Options.empty() && "At least one option is required!"); + OS << "\t.linker_option \"" << Options[0] << '"'; + for (ArrayRef<std::string>::iterator it = Options.begin() + 1, + ie = Options.end(); it != ie; ++it) { + OS << ", " << '"' << *it << '"'; + } + OS << "\n"; +} + void MCAsmStreamer::EmitDataRegion(MCDataRegionType Kind) { MCContext &Ctx = getContext(); const MCAsmInfo &MAI = Ctx.getAsmInfo(); diff --git a/lib/MC/MCAssembler.cpp b/lib/MC/MCAssembler.cpp index 46a0089..208a4d5 100644 --- a/lib/MC/MCAssembler.cpp +++ b/lib/MC/MCAssembler.cpp @@ -38,6 +38,8 @@ STATISTIC(EmittedRelaxableFragments, "Number of emitted assembler fragments - relaxable"); STATISTIC(EmittedDataFragments, "Number of emitted assembler fragments - data"); +STATISTIC(EmittedCompactEncodedInstFragments, + "Number of emitted assembler fragments - compact encoded inst"); STATISTIC(EmittedAlignFragments, "Number of emitted assembler fragments - align"); STATISTIC(EmittedFillFragments, @@ -80,14 +82,15 @@ bool MCAsmLayout::isFragmentValid(const MCFragment *F) const { return F->getLayoutOrder() <= LastValid->getLayoutOrder(); } -void MCAsmLayout::invalidateFragmentsAfter(MCFragment *F) { +void MCAsmLayout::invalidateFragmentsFrom(MCFragment *F) { // If this fragment wasn't already valid, we don't need to do anything. if (!isFragmentValid(F)) return; - // Otherwise, reset the last valid fragment to this fragment. + // Otherwise, reset the last valid fragment to the previous fragment + // (if this is the first fragment, it will be NULL). const MCSectionData &SD = *F->getParent(); - LastValidFragment[&SD] = F; + LastValidFragment[&SD] = F->getPrevNode(); } void MCAsmLayout::ensureValid(const MCFragment *F) const { @@ -163,14 +166,14 @@ uint64_t MCAsmLayout::getSectionFileSize(const MCSectionData *SD) const { uint64_t MCAsmLayout::computeBundlePadding(const MCFragment *F, uint64_t FOffset, uint64_t FSize) { uint64_t BundleSize = Assembler.getBundleAlignSize(); - assert(BundleSize > 0 && + assert(BundleSize > 0 && "computeBundlePadding should only be called if bundling is enabled"); uint64_t BundleMask = BundleSize - 1; uint64_t OffsetInBundle = FOffset & BundleMask; uint64_t EndOfFragment = OffsetInBundle + FSize; // There are two kinds of bundling restrictions: - // + // // 1) For alignToBundleEnd(), add padding to ensure that the fragment will // *end* on a bundle boundary. // 2) Otherwise, check if the fragment would cross a bundle boundary. 
If it @@ -223,6 +226,11 @@ MCEncodedFragment::~MCEncodedFragment() { /* *** */ +MCEncodedFragmentWithFixups::~MCEncodedFragmentWithFixups() { +} + +/* *** */ + MCSectionData::MCSectionData() : Section(0) {} MCSectionData::MCSectionData(const MCSection &_Section, MCAssembler *A) @@ -258,7 +266,7 @@ MCAssembler::MCAssembler(MCContext &Context_, MCAsmBackend &Backend_, raw_ostream &OS_) : Context(Context_), Backend(Backend_), Emitter(Emitter_), Writer(&Writer_), OS(OS_), BundleAlignSize(0), RelaxAll(false), NoExecStack(false), - SubsectionsViaSymbols(false) { + SubsectionsViaSymbols(false), ELFHeaderEFlags(0) { } MCAssembler::~MCAssembler() { @@ -280,6 +288,7 @@ void MCAssembler::reset() { RelaxAll = false; NoExecStack = false; SubsectionsViaSymbols = false; + ELFHeaderEFlags = 0; // reset objects owned by us getBackend().reset(); @@ -394,6 +403,7 @@ uint64_t MCAssembler::computeFragmentSize(const MCAsmLayout &Layout, switch (F.getKind()) { case MCFragment::FT_Data: case MCFragment::FT_Relaxable: + case MCFragment::FT_CompactEncodedInst: return cast<MCEncodedFragment>(F).getContents().size(); case MCFragment::FT_Fill: return cast<MCFillFragment>(F).getSize(); @@ -417,7 +427,7 @@ uint64_t MCAssembler::computeFragmentSize(const MCAsmLayout &Layout, } case MCFragment::FT_Org: { - MCOrgFragment &OF = cast<MCOrgFragment>(F); + const MCOrgFragment &OF = cast<MCOrgFragment>(F); int64_t TargetLocation; if (!OF.getOffset().EvaluateAsAbsolute(TargetLocation, Layout)) report_fatal_error("expected assembly-time absolute expression"); @@ -464,7 +474,7 @@ void MCAsmLayout::layoutFragment(MCFragment *F) { // // // BundlePadding - // ||| + // ||| // ------------------------------------- // Prev |##########| F | // ------------------------------------- @@ -494,7 +504,7 @@ void MCAsmLayout::layoutFragment(MCFragment *F) { /// \brief Write the contents of a fragment to the given object writer. Expects /// a MCEncodedFragment. static void writeFragmentContents(const MCFragment &F, MCObjectWriter *OW) { - MCEncodedFragment &EF = cast<MCEncodedFragment>(F); + const MCEncodedFragment &EF = cast<MCEncodedFragment>(F); OW->WriteBytes(EF.getContents()); } @@ -503,6 +513,9 @@ static void writeFragment(const MCAssembler &Asm, const MCAsmLayout &Layout, const MCFragment &F) { MCObjectWriter *OW = &Asm.getWriter(); + // FIXME: Embed in fragments instead? + uint64_t FragmentSize = Asm.computeFragmentSize(Layout, F); + // Should NOP padding be written out before this fragment? unsigned BundlePadding = F.getBundlePadding(); if (BundlePadding > 0) { @@ -511,6 +524,22 @@ static void writeFragment(const MCAssembler &Asm, const MCAsmLayout &Layout, assert(F.hasInstructions() && "Writing bundle padding for a fragment without instructions"); + unsigned TotalLength = BundlePadding + static_cast<unsigned>(FragmentSize); + if (F.alignToBundleEnd() && TotalLength > Asm.getBundleAlignSize()) { + // If the padding itself crosses a bundle boundary, it must be emitted + // in 2 pieces, since even nop instructions must not cross boundaries. 
+ // v--------------v <- BundleAlignSize + // v---------v <- BundlePadding + // ---------------------------- + // | Prev |####|####| F | + // ---------------------------- + // ^-------------------^ <- TotalLength + unsigned DistanceToBoundary = TotalLength - Asm.getBundleAlignSize(); + if (!Asm.getBackend().writeNopData(DistanceToBoundary, OW)) + report_fatal_error("unable to write NOP sequence of " + + Twine(DistanceToBoundary) + " bytes"); + BundlePadding -= DistanceToBoundary; + } if (!Asm.getBackend().writeNopData(BundlePadding, OW)) report_fatal_error("unable to write NOP sequence of " + Twine(BundlePadding) + " bytes"); @@ -523,12 +552,10 @@ static void writeFragment(const MCAssembler &Asm, const MCAsmLayout &Layout, ++stats::EmittedFragments; - // FIXME: Embed in fragments instead? - uint64_t FragmentSize = Asm.computeFragmentSize(Layout, F); switch (F.getKind()) { case MCFragment::FT_Align: { ++stats::EmittedAlignFragments; - MCAlignFragment &AF = cast<MCAlignFragment>(F); + const MCAlignFragment &AF = cast<MCAlignFragment>(F); uint64_t Count = FragmentSize / AF.getValueSize(); assert(AF.getValueSize() && "Invalid virtual align in concrete fragment!"); @@ -576,9 +603,14 @@ static void writeFragment(const MCAssembler &Asm, const MCAsmLayout &Layout, writeFragmentContents(F, OW); break; + case MCFragment::FT_CompactEncodedInst: + ++stats::EmittedCompactEncodedInstFragments; + writeFragmentContents(F, OW); + break; + case MCFragment::FT_Fill: { ++stats::EmittedFillFragments; - MCFillFragment &FF = cast<MCFillFragment>(F); + const MCFillFragment &FF = cast<MCFillFragment>(F); assert(FF.getValueSize() && "Invalid virtual align in concrete fragment!"); @@ -595,14 +627,14 @@ static void writeFragment(const MCAssembler &Asm, const MCAsmLayout &Layout, } case MCFragment::FT_LEB: { - MCLEBFragment &LF = cast<MCLEBFragment>(F); + const MCLEBFragment &LF = cast<MCLEBFragment>(F); OW->WriteBytes(LF.getContents().str()); break; } case MCFragment::FT_Org: { ++stats::EmittedOrgFragments; - MCOrgFragment &OF = cast<MCOrgFragment>(F); + const MCOrgFragment &OF = cast<MCOrgFragment>(F); for (uint64_t i = 0, e = FragmentSize; i != e; ++i) OW->Write8(uint8_t(OF.getValue())); @@ -641,7 +673,7 @@ void MCAssembler::writeSectionData(const MCSectionData *SD, // Check that we aren't trying to write a non-zero contents (or fixups) // into a virtual section. This is to support clients which use standard // directives to fill the contents of virtual sections. 
- MCDataFragment &DF = cast<MCDataFragment>(*it); + const MCDataFragment &DF = cast<MCDataFragment>(*it); assert(DF.fixup_begin() == DF.fixup_end() && "Cannot have fixups in virtual section!"); for (unsigned i = 0, e = DF.getContents().size(); i != e; ++i) @@ -748,9 +780,10 @@ void MCAssembler::Finish() { for (MCAssembler::iterator it = begin(), ie = end(); it != ie; ++it) { for (MCSectionData::iterator it2 = it->begin(), ie2 = it->end(); it2 != ie2; ++it2) { - MCEncodedFragment *F = dyn_cast<MCEncodedFragment>(it2); + MCEncodedFragmentWithFixups *F = + dyn_cast<MCEncodedFragmentWithFixups>(it2); if (F) { - for (MCEncodedFragment::fixup_iterator it3 = F->fixup_begin(), + for (MCEncodedFragmentWithFixups::fixup_iterator it3 = F->fixup_begin(), ie3 = F->fixup_end(); it3 != ie3; ++it3) { MCFixup &Fixup = *it3; uint64_t FixedValue = handleFixup(Layout, *F, Fixup); @@ -913,7 +946,7 @@ bool MCAssembler::layoutSectionOnce(MCAsmLayout &Layout, MCSectionData &SD) { FirstRelaxedFragment = I; } if (FirstRelaxedFragment) { - Layout.invalidateFragmentsAfter(FirstRelaxedFragment); + Layout.invalidateFragmentsFrom(FirstRelaxedFragment); return true; } return false; @@ -960,6 +993,8 @@ void MCFragment::dump() { switch (getKind()) { case MCFragment::FT_Align: OS << "MCAlignFragment"; break; case MCFragment::FT_Data: OS << "MCDataFragment"; break; + case MCFragment::FT_CompactEncodedInst: + OS << "MCCompactEncodedInstFragment"; break; case MCFragment::FT_Fill: OS << "MCFillFragment"; break; case MCFragment::FT_Relaxable: OS << "MCRelaxableFragment"; break; case MCFragment::FT_Org: OS << "MCOrgFragment"; break; @@ -971,7 +1006,7 @@ void MCFragment::dump() { OS << "<MCFragment " << (void*) this << " LayoutOrder:" << LayoutOrder << " Offset:" << Offset << " HasInstructions:" << hasInstructions() - << " BundlePadding:" << getBundlePadding() << ">"; + << " BundlePadding:" << static_cast<unsigned>(getBundlePadding()) << ">"; switch (getKind()) { case MCFragment::FT_Align: { @@ -1007,6 +1042,19 @@ void MCFragment::dump() { } break; } + case MCFragment::FT_CompactEncodedInst: { + const MCCompactEncodedInstFragment *CEIF = + cast<MCCompactEncodedInstFragment>(this); + OS << "\n "; + OS << " Contents:["; + const SmallVectorImpl<char> &Contents = CEIF->getContents(); + for (unsigned i = 0, e = Contents.size(); i != e; ++i) { + if (i) OS << ","; + OS << hexdigit((Contents[i] >> 4) & 0xF) << hexdigit(Contents[i] & 0xF); + } + OS << "] (" << Contents.size() << " bytes)"; + break; + } case MCFragment::FT_Fill: { const MCFillFragment *FF = cast<MCFillFragment>(this); OS << " Value:" << FF->getValue() << " ValueSize:" << FF->getValueSize() @@ -1100,7 +1148,9 @@ void MCAssembler::dump() { // anchors for MC*Fragment vtables void MCEncodedFragment::anchor() { } +void MCEncodedFragmentWithFixups::anchor() { } void MCDataFragment::anchor() { } +void MCCompactEncodedInstFragment::anchor() { } void MCRelaxableFragment::anchor() { } void MCAlignFragment::anchor() { } void MCFillFragment::anchor() { } diff --git a/lib/MC/MCContext.cpp b/lib/MC/MCContext.cpp index aa52b49..1a7df60 100644 --- a/lib/MC/MCContext.cpp +++ b/lib/MC/MCContext.cpp @@ -40,7 +40,7 @@ MCContext::MCContext(const MCAsmInfo &mai, const MCRegisterInfo &mri, CompilationDir(llvm::sys::Path::GetCurrentDirectory().str()), CurrentDwarfLoc(0,0,0,DWARF2_FLAG_IS_STMT,0,0), DwarfLocSeen(false), GenDwarfForAssembly(false), GenDwarfFileNumber(0), - AllowTemporaryLabels(true), AutoReset(DoAutoReset) { + AllowTemporaryLabels(true), DwarfCompileUnitID(0), AutoReset(DoAutoReset) 
{ MachOUniquingMap = 0; ELFUniquingMap = 0; @@ -83,6 +83,8 @@ void MCContext::reset() { DwarfDebugFlags = StringRef(); MCLineSections.clear(); MCLineSectionOrder.clear(); + DwarfCompileUnitID = 0; + MCLineTableSymbols.clear(); CurrentDwarfLoc = MCDwarfLoc(0,0,0,DWARF2_FLAG_IS_STMT,0,0); // If we have the MachO uniquing map, free it. diff --git a/lib/MC/MCDwarf.cpp b/lib/MC/MCDwarf.cpp index d53d2fc..fea057a 100644 --- a/lib/MC/MCDwarf.cpp +++ b/lib/MC/MCDwarf.cpp @@ -101,7 +101,8 @@ void MCLineEntry::Make(MCStreamer *MCOS, const MCSection *Section) { } // Add the line entry to this section's entries. - LineSection->addLineEntry(LineEntry); + LineSection->addLineEntry(LineEntry, + MCOS->getContext().getDwarfCompileUnitID()); } // @@ -131,7 +132,12 @@ static inline const MCExpr *MakeStartMinusEndExpr(const MCStreamer &MCOS, // static inline void EmitDwarfLineTable(MCStreamer *MCOS, const MCSection *Section, - const MCLineSection *LineSection) { + const MCLineSection *LineSection, + unsigned CUID) { + // Skip this LineSection if it contains no LineEntry for the given Compile Unit. + if (!LineSection->containEntriesForID(CUID)) + return; + unsigned FileNum = 1; unsigned LastLine = 1; unsigned Column = 0; @@ -141,8 +147,8 @@ // Loop through each MCLineEntry and encode the dwarf line number table. for (MCLineSection::const_iterator - it = LineSection->getMCLineEntries()->begin(), - ie = LineSection->getMCLineEntries()->end(); it != ie; ++it) { + it = LineSection->getMCLineEntries(CUID).begin(), + ie = LineSection->getMCLineEntries(CUID).end(); it != ie; ++it) { if (FileNum != it->getFileNum()) { FileNum = it->getFileNum(); @@ -215,9 +221,36 @@ const MCSymbol *MCDwarfFileTable::Emit(MCStreamer *MCOS) { // Switch to the section where the table will be emitted into. MCOS->SwitchSection(context.getObjectFileInfo()->getDwarfLineSection()); - // Create a symbol at the beginning of this section. - MCSymbol *LineStartSym = context.CreateTempSymbol(); - // Set the value of the symbol, as we are at the start of the section. + const DenseMap<unsigned, MCSymbol *> &MCLineTableSymbols = + MCOS->getContext().getMCLineTableSymbols(); + // CUID and MCLineTableSymbols are set in DwarfDebug; when DwarfDebug does + // not exist, CUID will be 0 and MCLineTableSymbols will be empty. + // Handle Compile Unit 0, whose line table start symbol is the section symbol. + const MCSymbol *LineStartSym = EmitCU(MCOS, 0); + // Handle the rest of the Compile Units. + for (unsigned Is = 1, Ie = MCLineTableSymbols.size(); Is < Ie; Is++) + EmitCU(MCOS, Is); + + // Now delete the MCLineSections that were created in MCLineEntry::Make() + // and used to emit the line table. + const DenseMap<const MCSection *, MCLineSection *> &MCLineSections = + MCOS->getContext().getMCLineSections(); + for (DenseMap<const MCSection *, MCLineSection *>::const_iterator it = + MCLineSections.begin(), ie = MCLineSections.end(); it != ie; + ++it) + delete it->second; + + return LineStartSym; +} + +const MCSymbol *MCDwarfFileTable::EmitCU(MCStreamer *MCOS, unsigned CUID) { + MCContext &context = MCOS->getContext(); + + // Create a symbol at the beginning of the line table. + MCSymbol *LineStartSym = MCOS->getContext().getMCLineTableSymbol(CUID); + if (!LineStartSym) + LineStartSym = context.CreateTempSymbol(); + // Set the value of the symbol, as we are at the start of the line table. MCOS->EmitLabel(LineStartSym); // Create a symbol for the end of the section (to be set when we get there).
@@ -239,8 +272,7 @@ const MCSymbol *MCDwarfFileTable::Emit(MCStreamer *MCOS) { // total length, the 2 bytes for the version, and these 4 bytes for the // length of the prologue. MCOS->EmitAbsValue(MakeStartMinusEndExpr(*MCOS, *LineStartSym, *ProEndSym, - (4 + 2 + 4)), - 4, 0); + (4 + 2 + 4)), 4, 0); // Parameters of the state machine, are next. MCOS->EmitIntValue(DWARF2_LINE_MIN_INSN_LENGTH, 1); @@ -269,8 +301,8 @@ const MCSymbol *MCDwarfFileTable::Emit(MCStreamer *MCOS) { const std::vector<StringRef> &MCDwarfDirs = context.getMCDwarfDirs(); for (unsigned i = 0; i < MCDwarfDirs.size(); i++) { - MCOS->EmitBytes(MCDwarfDirs[i], 0); // the DirectoryName - MCOS->EmitBytes(StringRef("\0", 1), 0); // the null term. of the string + MCOS->EmitBytes(MCDwarfDirs[i]); // the DirectoryName + MCOS->EmitBytes(StringRef("\0", 1)); // the null term. of the string } MCOS->EmitIntValue(0, 1); // Terminate the directory list @@ -278,8 +310,8 @@ const MCSymbol *MCDwarfFileTable::Emit(MCStreamer *MCOS) { const std::vector<MCDwarfFile *> &MCDwarfFiles = MCOS->getContext().getMCDwarfFiles(); for (unsigned i = 1; i < MCDwarfFiles.size(); i++) { - MCOS->EmitBytes(MCDwarfFiles[i]->getName(), 0); // FileName - MCOS->EmitBytes(StringRef("\0", 1), 0); // the null term. of the string + MCOS->EmitBytes(MCDwarfFiles[i]->getName()); // FileName + MCOS->EmitBytes(StringRef("\0", 1)); // the null term. of the string // the Directory num MCOS->EmitULEB128IntValue(MCDwarfFiles[i]->getDirIndex()); MCOS->EmitIntValue(0, 1); // last modification timestamp (always 0) @@ -301,11 +333,7 @@ const MCSymbol *MCDwarfFileTable::Emit(MCStreamer *MCOS) { ++it) { const MCSection *Sec = *it; const MCLineSection *Line = MCLineSections.lookup(Sec); - EmitDwarfLineTable(MCOS, Sec, Line); - - // Now delete the MCLineSections that were created in MCLineEntry::Make() - // and used to emit the line table. - delete Line; + EmitDwarfLineTable(MCOS, Sec, Line, CUID); } if (MCOS->getContext().getAsmInfo().getLinkerRequiresNonEmptyDwarfLines() @@ -342,7 +370,7 @@ void MCDwarfLineAddr::Emit(MCStreamer *MCOS, int64_t LineDelta, SmallString<256> Tmp; raw_svector_ostream OS(Tmp); MCDwarfLineAddr::Encode(LineDelta, AddrDelta, OS); - MCOS->EmitBytes(OS.str(), /*AddrSpace=*/0); + MCOS->EmitBytes(OS.str()); } /// Utility function to encode a Dwarf pair of LineDelta and AddrDeltas. @@ -618,29 +646,35 @@ static void EmitGenDwarfInfo(MCStreamer *MCOS, const std::vector<StringRef> &MCDwarfDirs = context.getMCDwarfDirs(); if (MCDwarfDirs.size() > 0) { - MCOS->EmitBytes(MCDwarfDirs[0], 0); - MCOS->EmitBytes("/", 0); + MCOS->EmitBytes(MCDwarfDirs[0]); + MCOS->EmitBytes("/"); } const std::vector<MCDwarfFile *> &MCDwarfFiles = MCOS->getContext().getMCDwarfFiles(); - MCOS->EmitBytes(MCDwarfFiles[1]->getName(), 0); + MCOS->EmitBytes(MCDwarfFiles[1]->getName()); MCOS->EmitIntValue(0, 1); // NULL byte to terminate the string. // AT_comp_dir, the working directory the assembly was done in. - MCOS->EmitBytes(context.getCompilationDir(), 0); + MCOS->EmitBytes(context.getCompilationDir()); MCOS->EmitIntValue(0, 1); // NULL byte to terminate the string. // AT_APPLE_flags, the command line arguments of the assembler tool. StringRef DwarfDebugFlags = context.getDwarfDebugFlags(); if (!DwarfDebugFlags.empty()){ - MCOS->EmitBytes(DwarfDebugFlags, 0); + MCOS->EmitBytes(DwarfDebugFlags); MCOS->EmitIntValue(0, 1); // NULL byte to terminate the string. } // AT_producer, the version of the assembler tool. 
- MCOS->EmitBytes(StringRef("llvm-mc (based on LLVM "), 0); - MCOS->EmitBytes(StringRef(PACKAGE_VERSION), 0); - MCOS->EmitBytes(StringRef(")"), 0); + StringRef DwarfDebugProducer = context.getDwarfDebugProducer(); + if (!DwarfDebugProducer.empty()){ + MCOS->EmitBytes(DwarfDebugProducer); + } + else { + MCOS->EmitBytes(StringRef("llvm-mc (based on LLVM ")); + MCOS->EmitBytes(StringRef(PACKAGE_VERSION)); + MCOS->EmitBytes(StringRef(")")); + } MCOS->EmitIntValue(0, 1); // NULL byte to terminate the string. // AT_language, a 4 byte value. We use DW_LANG_Mips_Assembler as the dwarf2 @@ -661,7 +695,7 @@ static void EmitGenDwarfInfo(MCStreamer *MCOS, MCOS->EmitULEB128IntValue(2); // AT_name, of the label without any leading underbar. - MCOS->EmitBytes(Entry->getName(), 0); + MCOS->EmitBytes(Entry->getName()); MCOS->EmitIntValue(0, 1); // NULL byte to terminate the string. // AT_decl_file, index into the file table. @@ -786,7 +820,7 @@ void MCGenDwarfLabelEntry::Make(MCSymbol *Symbol, MCStreamer *MCOS, static int getDataAlignmentFactor(MCStreamer &streamer) { MCContext &context = streamer.getContext(); const MCAsmInfo &asmInfo = context.getAsmInfo(); - int size = asmInfo.getPointerSize(); + int size = asmInfo.getCalleeSaveStackSlotSize(); if (asmInfo.isStackGrowthDirectionUp()) return size; else @@ -1071,7 +1105,7 @@ void FrameEmitterImpl::EmitCFIInstruction(MCStreamer &Streamer, } case MCCFIInstruction::OpEscape: if (VerboseAsm) Streamer.AddComment("Escape bytes"); - Streamer.EmitBytes(Instr.getValues(), 0); + Streamer.EmitBytes(Instr.getValues()); return; } llvm_unreachable("Unhandled case in switch"); @@ -1229,7 +1263,7 @@ const MCSymbol &FrameEmitterImpl::EmitCIE(MCStreamer &streamer, Augmentation += "R"; if (IsSignalFrame) Augmentation += "S"; - streamer.EmitBytes(Augmentation.str(), 0); + streamer.EmitBytes(Augmentation.str()); } streamer.EmitIntValue(0, 1); @@ -1493,7 +1527,7 @@ void MCDwarfFrameEmitter::EmitAdvanceLoc(MCStreamer &Streamer, SmallString<256> Tmp; raw_svector_ostream OS(Tmp); MCDwarfFrameEmitter::EncodeAdvanceLoc(AddrDelta, OS); - Streamer.EmitBytes(OS.str(), /*AddrSpace=*/0); + Streamer.EmitBytes(OS.str()); } void MCDwarfFrameEmitter::EncodeAdvanceLoc(uint64_t AddrDelta, diff --git a/lib/MC/MCELF.cpp b/lib/MC/MCELF.cpp index 4db2846..560cdbc 100644 --- a/lib/MC/MCELF.cpp +++ b/lib/MC/MCELF.cpp @@ -52,6 +52,8 @@ unsigned MCELF::GetType(const MCSymbolData &SD) { return Type; } +// Visibility is stored in the first two bits of st_other +// st_other values are stored in the second byte of get/setFlags void MCELF::SetVisibility(MCSymbolData &SD, unsigned Visibility) { assert(Visibility == ELF::STV_DEFAULT || Visibility == ELF::STV_INTERNAL || Visibility == ELF::STV_HIDDEN || Visibility == ELF::STV_PROTECTED); @@ -68,4 +70,17 @@ unsigned MCELF::GetVisibility(MCSymbolData &SD) { return Visibility; } +// Other is stored in the last six bits of st_other +// st_other values are stored in the second byte of get/setFlags +void MCELF::setOther(MCSymbolData &SD, unsigned Other) { + uint32_t OtherFlags = SD.getFlags() & ~(0x3f << ELF_Other_Shift); + SD.setFlags(OtherFlags | (Other << ELF_Other_Shift)); +} + +unsigned MCELF::getOther(MCSymbolData &SD) { + unsigned Other = + (SD.getFlags() & (0x3f << ELF_Other_Shift)) >> ELF_Other_Shift; + return Other; +} + } diff --git a/lib/MC/MCELFObjectTargetWriter.cpp b/lib/MC/MCELFObjectTargetWriter.cpp index 74cd042..4cac84d 100644 --- a/lib/MC/MCELFObjectTargetWriter.cpp +++ b/lib/MC/MCELFObjectTargetWriter.cpp @@ -24,11 +24,6 @@ 
MCELFObjectTargetWriter::MCELFObjectTargetWriter(bool Is64Bit_, IsN64(IsN64_){ } -/// Default e_flags = 0 -unsigned MCELFObjectTargetWriter::getEFlags() const { - return 0; -} - const MCSymbol *MCELFObjectTargetWriter::ExplicitRelSym(const MCAssembler &Asm, const MCValue &Target, const MCFragment &F, diff --git a/lib/MC/MCELFStreamer.cpp b/lib/MC/MCELFStreamer.cpp index b08fa41..c1428d8 100644 --- a/lib/MC/MCELFStreamer.cpp +++ b/lib/MC/MCELFStreamer.cpp @@ -65,6 +65,10 @@ inline void MCELFStreamer::SetSectionBss() { MCELFStreamer::~MCELFStreamer() { } +void MCELFStreamer::InitToTextSection() { + SetSectionText(); +} + void MCELFStreamer::InitSections() { // This emulates the same behavior of GNU as. This makes it easier // to compare the output as the major sections are in the same order. @@ -296,7 +300,9 @@ void MCELFStreamer::EmitFileDirective(StringRef Filename) { void MCELFStreamer::fixSymbolsInTLSFixups(const MCExpr *expr) { switch (expr->getKind()) { - case MCExpr::Target: llvm_unreachable("Can't handle target exprs yet!"); + case MCExpr::Target: + cast<MCTargetExpr>(expr)->fixELFSymbolsInTLSFixups(getAssembler()); + break; case MCExpr::Constant: break; @@ -328,6 +334,19 @@ void MCELFStreamer::fixSymbolsInTLSFixups(const MCExpr *expr) { case MCSymbolRefExpr::VK_Mips_GOTTPREL: case MCSymbolRefExpr::VK_Mips_TPREL_HI: case MCSymbolRefExpr::VK_Mips_TPREL_LO: + case MCSymbolRefExpr::VK_PPC_TPREL16_HA: + case MCSymbolRefExpr::VK_PPC_TPREL16_LO: + case MCSymbolRefExpr::VK_PPC_DTPREL16_HA: + case MCSymbolRefExpr::VK_PPC_DTPREL16_LO: + case MCSymbolRefExpr::VK_PPC_GOT_TPREL16_HA: + case MCSymbolRefExpr::VK_PPC_GOT_TPREL16_LO: + case MCSymbolRefExpr::VK_PPC_TLS: + case MCSymbolRefExpr::VK_PPC_GOT_TLSGD16_HA: + case MCSymbolRefExpr::VK_PPC_GOT_TLSGD16_LO: + case MCSymbolRefExpr::VK_PPC_TLSGD: + case MCSymbolRefExpr::VK_PPC_GOT_TLSLD16_HA: + case MCSymbolRefExpr::VK_PPC_GOT_TLSLD16_LO: + case MCSymbolRefExpr::VK_PPC_TLSLD: break; } MCSymbolData &SD = getAssembler().getOrCreateSymbolData(symRef.getSymbol()); @@ -367,8 +386,10 @@ void MCELFStreamer::EmitInstToData(const MCInst &Inst) { // data fragment). // // If bundling is enabled: - // - If we're not in a bundle-locked group, emit the instruction into a data - // fragment of its own. + // - If we're not in a bundle-locked group, emit the instruction into a + // fragment of its own. If there are no fixups registered for the + // instruction, emit a MCCompactEncodedInstFragment. Otherwise, emit a + // MCDataFragment. // - If we're in a bundle-locked group, append the instruction to the current // data fragment because we want all the instructions in a group to get into // the same fragment. Be careful not to do that for the first instruction in @@ -378,8 +399,17 @@ void MCELFStreamer::EmitInstToData(const MCInst &Inst) { if (Assembler.isBundlingEnabled()) { MCSectionData *SD = getCurrentSectionData(); if (SD->isBundleLocked() && !SD->isBundleGroupBeforeFirstInst()) - DF = getOrCreateDataFragment(); - else { + // If we are bundle-locked, we re-use the current fragment. + // The bundle-locking directive ensures this is a new data fragment. + DF = cast<MCDataFragment>(getCurrentFragment()); + else if (!SD->isBundleLocked() && Fixups.size() == 0) { + // Optimize memory usage by emitting the instruction to a + // MCCompactEncodedInstFragment when not in a bundle-locked group and + // there are no fixups registered. 
+ MCCompactEncodedInstFragment *CEIF = new MCCompactEncodedInstFragment(SD); + CEIF->getContents().append(Code.begin(), Code.end()); + return; + } else { DF = new MCDataFragment(SD); if (SD->getBundleLockState() == MCSectionData::BundleLockedAlignToEnd) { // If this is a new fragment created for a bundle-locked group, and the @@ -469,7 +499,7 @@ void MCELFStreamer::FinishImpl() { } void MCELFStreamer::EmitTCEntry(const MCSymbol &S) { // Creates a R_PPC64_TOC relocation - MCObjectStreamer::EmitSymbolValue(&S, 8, 0); + MCObjectStreamer::EmitSymbolValue(&S, 8); } MCStreamer *llvm::createELFStreamer(MCContext &Context, MCAsmBackend &MAB, @@ -487,6 +517,10 @@ void MCELFStreamer::EmitThumbFunc(MCSymbol *Func) { llvm_unreachable("Generic ELF doesn't support this directive"); } +MCSymbolData &MCELFStreamer::getOrCreateSymbolData(MCSymbol *Symbol) { + return getAssembler().getOrCreateSymbolData(*Symbol); +} + void MCELFStreamer::EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) { llvm_unreachable("ELF doesn't support this directive"); } diff --git a/lib/MC/MCMachOStreamer.cpp b/lib/MC/MCMachOStreamer.cpp index 82ccdd4..7d08d0e 100644 --- a/lib/MC/MCMachOStreamer.cpp +++ b/lib/MC/MCMachOStreamer.cpp @@ -34,19 +34,21 @@ private: void EmitDataRegion(DataRegionData::KindTy Kind); void EmitDataRegionEnd(); public: - MCMachOStreamer(MCContext &Context, MCAsmBackend &MAB, - raw_ostream &OS, MCCodeEmitter *Emitter) - : MCObjectStreamer(Context, MAB, OS, Emitter) {} + MCMachOStreamer(MCContext &Context, MCAsmBackend &MAB, raw_ostream &OS, + MCCodeEmitter *Emitter) + : MCObjectStreamer(SK_MachOStreamer, Context, MAB, OS, Emitter) {} /// @name MCStreamer Interface /// @{ virtual void InitSections(); + virtual void InitToTextSection(); virtual void EmitLabel(MCSymbol *Symbol); virtual void EmitDebugLabel(MCSymbol *Symbol); virtual void EmitEHSymAttributes(const MCSymbol *Symbol, MCSymbol *EHSymbol); virtual void EmitAssemblerFlag(MCAssemblerFlag Flag); + virtual void EmitLinkerOptions(ArrayRef<std::string> Options); virtual void EmitDataRegion(MCDataRegionType Kind); virtual void EmitThumbFunc(MCSymbol *Func); virtual void EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute); @@ -85,15 +87,23 @@ public: virtual void FinishImpl(); /// @} + + static bool classof(const MCStreamer *S) { + return S->getKind() == SK_MachOStreamer; + } }; } // end anonymous namespace. 
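The SK_MachOStreamer kind and classof() hook added above are part of threading a StreamerKind tag through the whole MCStreamer hierarchy so that isa<> and dyn_cast<> work on streamers without C++ RTTI. The following is a minimal, self-contained sketch of that pattern; the names are simplified stand-ins, not the real MC classes.

#include <cassert>

class Streamer {
public:
  enum StreamerKind { SK_AsmStreamer, SK_MachOStreamer, SK_NullStreamer };
  StreamerKind getKind() const { return Kind; }
protected:
  // Each concrete subclass stamps its kind exactly once, at construction.
  explicit Streamer(StreamerKind K) : Kind(K) {}
private:
  const StreamerKind Kind;
};

class MachOStreamer : public Streamer {
public:
  MachOStreamer() : Streamer(SK_MachOStreamer) {}
  // llvm::isa<>/dyn_cast<> consult this predicate to test the dynamic type.
  static bool classof(const Streamer *S) {
    return S->getKind() == SK_MachOStreamer;
  }
};

int main() {
  MachOStreamer M;
  Streamer *S = &M;
  assert(MachOStreamer::classof(S)); // kind-based type test, no vtable RTTI
  return 0;
}

This is also why the MCObjectStreamer and MCStreamer constructors in the hunks above grew a leading StreamerKind parameter: the tag has to be plumbed from the most-derived constructor down to the base.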
void MCMachOStreamer::InitSections() { - SwitchSection(getContext().getMachOSection("__TEXT", "__text", - MCSectionMachO::S_ATTR_PURE_INSTRUCTIONS, - 0, SectionKind::getText())); + InitToTextSection(); +} +void MCMachOStreamer::InitToTextSection() { + SwitchSection(getContext().getMachOSection( + "__TEXT", "__text", + MCSectionMachO::S_ATTR_PURE_INSTRUCTIONS, 0, + SectionKind::getText())); } void MCMachOStreamer::EmitEHSymAttributes(const MCSymbol *Symbol, @@ -173,6 +183,10 @@ void MCMachOStreamer::EmitAssemblerFlag(MCAssemblerFlag Flag) { } } +void MCMachOStreamer::EmitLinkerOptions(ArrayRef<std::string> Options) { + getAssembler().getLinkerOptions().push_back(Options); +} + void MCMachOStreamer::EmitDataRegion(MCDataRegionType Kind) { switch (Kind) { case MCDR_DataRegion: diff --git a/lib/MC/MCNullStreamer.cpp b/lib/MC/MCNullStreamer.cpp index 364c324..89f74c1 100644 --- a/lib/MC/MCNullStreamer.cpp +++ b/lib/MC/MCNullStreamer.cpp @@ -19,11 +19,14 @@ namespace { class MCNullStreamer : public MCStreamer { public: - MCNullStreamer(MCContext &Context) : MCStreamer(Context) {} + MCNullStreamer(MCContext &Context) : MCStreamer(SK_NullStreamer, Context) {} /// @name MCStreamer Interface /// @{ + virtual void InitToTextSection() { + } + virtual void InitSections() { } @@ -106,6 +109,11 @@ namespace { } /// @} + + static bool classof(const MCStreamer *S) { + return S->getKind() == SK_NullStreamer; + } + }; } diff --git a/lib/MC/MCObjectFileInfo.cpp b/lib/MC/MCObjectFileInfo.cpp index a46f7be..2e1a045 100644 --- a/lib/MC/MCObjectFileInfo.cpp +++ b/lib/MC/MCObjectFileInfo.cpp @@ -186,6 +186,10 @@ void MCObjectFileInfo::InitMachOMCObjectFileInfo(Triple T) { Ctx->getMachOSection("__DWARF", "__debug_frame", MCSectionMachO::S_ATTR_DEBUG, SectionKind::getMetadata()); + DwarfPubNamesSection = + Ctx->getMachOSection("__DWARF", "__debug_pubnames", + MCSectionMachO::S_ATTR_DEBUG, + SectionKind::getMetadata()); DwarfPubTypesSection = Ctx->getMachOSection("__DWARF", "__debug_pubtypes", MCSectionMachO::S_ATTR_DEBUG, @@ -256,17 +260,33 @@ void MCObjectFileInfo::InitELFMCObjectFileInfo(Triple T) { TTypeEncoding = (CMModel == CodeModel::Small) ? dwarf::DW_EH_PE_udata4 : dwarf::DW_EH_PE_absptr; } + } else if (T.getArch() == Triple::aarch64) { + FDECFIEncoding = dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4; + + // The small model guarantees static code/data size < 4GB, but not where it + // will be in memory. Most of these could end up >2GB away so even a signed + // pc-relative 32-bit address is insufficient, theoretically. + if (RelocM == Reloc::PIC_) { + PersonalityEncoding = dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel | + dwarf::DW_EH_PE_sdata8; + LSDAEncoding = dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata8; + FDEEncoding = dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4; + TTypeEncoding = dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel | + dwarf::DW_EH_PE_sdata8; + } else { + PersonalityEncoding = dwarf::DW_EH_PE_absptr; + LSDAEncoding = dwarf::DW_EH_PE_absptr; + FDEEncoding = dwarf::DW_EH_PE_udata4; + TTypeEncoding = dwarf::DW_EH_PE_absptr; + } } else if (T.getArch() == Triple::ppc64) { - PersonalityEncoding = dwarf::DW_EH_PE_udata8; - PersonalityEncoding |= (RelocM == Reloc::PIC_) - ? dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel - : dwarf::DW_EH_PE_absptr; - unsigned PICFlag = (RelocM == Reloc::PIC_) ? 
dwarf::DW_EH_PE_pcrel - : dwarf::DW_EH_PE_absptr; - FDECFIEncoding = PICFlag | dwarf::DW_EH_PE_sdata4; - LSDAEncoding = PICFlag | dwarf::DW_EH_PE_udata8; - FDEEncoding = PICFlag | dwarf::DW_EH_PE_sdata4; - TTypeEncoding = PICFlag | dwarf::DW_EH_PE_sdata4; + PersonalityEncoding = dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel | + dwarf::DW_EH_PE_udata8; + FDECFIEncoding = dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4; + LSDAEncoding = dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_udata8; + FDEEncoding = dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_udata8; + TTypeEncoding = dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel | + dwarf::DW_EH_PE_udata8; } // Solaris requires different flags for .eh_frame to seemingly every other @@ -384,6 +404,9 @@ void MCObjectFileInfo::InitELFMCObjectFileInfo(Triple T) { DwarfFrameSection = Ctx->getELFSection(".debug_frame", ELF::SHT_PROGBITS, 0, SectionKind::getMetadata()); + DwarfPubNamesSection = + Ctx->getELFSection(".debug_pubnames", ELF::SHT_PROGBITS, 0, + SectionKind::getMetadata()); DwarfPubTypesSection = Ctx->getELFSection(".debug_pubtypes", ELF::SHT_PROGBITS, 0, SectionKind::getMetadata()); @@ -440,6 +463,9 @@ void MCObjectFileInfo::InitELFMCObjectFileInfo(Triple T) { DwarfStrOffDWOSection = Ctx->getELFSection(".debug_str_offsets.dwo", ELF::SHT_PROGBITS, 0, SectionKind::getMetadata()); + DwarfAddrSection = + Ctx->getELFSection(".debug_addr", ELF::SHT_PROGBITS, 0, + SectionKind::getMetadata()); } @@ -524,6 +550,11 @@ void MCObjectFileInfo::InitCOFFMCObjectFileInfo(Triple T) { COFF::IMAGE_SCN_MEM_DISCARDABLE | COFF::IMAGE_SCN_MEM_READ, SectionKind::getMetadata()); + DwarfPubNamesSection = + Ctx->getCOFFSection(".debug_pubnames", + COFF::IMAGE_SCN_MEM_DISCARDABLE | + COFF::IMAGE_SCN_MEM_READ, + SectionKind::getMetadata()); DwarfPubTypesSection = Ctx->getCOFFSection(".debug_pubtypes", COFF::IMAGE_SCN_MEM_DISCARDABLE | diff --git a/lib/MC/MCObjectStreamer.cpp b/lib/MC/MCObjectStreamer.cpp index d205a8c..0d2ce83 100644 --- a/lib/MC/MCObjectStreamer.cpp +++ b/lib/MC/MCObjectStreamer.cpp @@ -20,22 +20,19 @@ #include "llvm/Support/ErrorHandling.h" using namespace llvm; -MCObjectStreamer::MCObjectStreamer(MCContext &Context, MCAsmBackend &TAB, - raw_ostream &OS, MCCodeEmitter *Emitter_) - : MCStreamer(Context), - Assembler(new MCAssembler(Context, TAB, - *Emitter_, *TAB.createObjectWriter(OS), - OS)), - CurSectionData(0) -{ -} - -MCObjectStreamer::MCObjectStreamer(MCContext &Context, MCAsmBackend &TAB, - raw_ostream &OS, MCCodeEmitter *Emitter_, +MCObjectStreamer::MCObjectStreamer(StreamerKind Kind, MCContext &Context, + MCAsmBackend &TAB, raw_ostream &OS, + MCCodeEmitter *Emitter_) + : MCStreamer(Kind, Context), + Assembler(new MCAssembler(Context, TAB, *Emitter_, + *TAB.createObjectWriter(OS), OS)), + CurSectionData(0) {} + +MCObjectStreamer::MCObjectStreamer(StreamerKind Kind, MCContext &Context, + MCAsmBackend &TAB, raw_ostream &OS, + MCCodeEmitter *Emitter_, MCAssembler *_Assembler) - : MCStreamer(Context), Assembler(_Assembler), CurSectionData(0) -{ -} + : MCStreamer(Kind, Context), Assembler(_Assembler), CurSectionData(0) {} MCObjectStreamer::~MCObjectStreamer() { delete &Assembler->getBackend(); @@ -62,7 +59,9 @@ MCFragment *MCObjectStreamer::getCurrentFragment() const { MCDataFragment *MCObjectStreamer::getOrCreateDataFragment() const { MCDataFragment *F = dyn_cast_or_null<MCDataFragment>(getCurrentFragment()); - if (!F) + // When bundling is enabled, we don't want to add data to a fragment that + // already has instructions (see MCELFStreamer::EmitInstToData 
for details) + if (!F || (Assembler->isBundlingEnabled() && F->hasInstructions())) F = new MCDataFragment(getCurrentSectionData()); return F; } @@ -227,8 +226,10 @@ void MCObjectStreamer::EmitInstToFragment(const MCInst &Inst) { IF->getContents().append(Code.begin(), Code.end()); } -const char *BundlingNotImplementedMsg = +#ifndef NDEBUG +static const char *BundlingNotImplementedMsg = "Aligned bundling is not implemented for this object format"; +#endif void MCObjectStreamer::EmitBundleAlignMode(unsigned AlignPow2) { llvm_unreachable(BundlingNotImplementedMsg); @@ -315,7 +316,7 @@ bool MCObjectStreamer::EmitValueToOffset(const MCExpr *Offset, if (!Delta->EvaluateAsAbsolute(Res, getAssembler())) return true; - EmitFill(Res, Value, 0); + EmitFill(Res, Value); return false; } diff --git a/lib/MC/MCParser/AsmLexer.cpp b/lib/MC/MCParser/AsmLexer.cpp index d0492fd..c1c594a 100644 --- a/lib/MC/MCParser/AsmLexer.cpp +++ b/lib/MC/MCParser/AsmLexer.cpp @@ -156,10 +156,36 @@ AsmToken AsmLexer::LexLineComment() { } static void SkipIgnoredIntegerSuffix(const char *&CurPtr) { - if (CurPtr[0] == 'L' && CurPtr[1] == 'L') - CurPtr += 2; - if (CurPtr[0] == 'U' && CurPtr[1] == 'L' && CurPtr[2] == 'L') - CurPtr += 3; + // Skip ULL, UL, U, L and LL suffixes. + if (CurPtr[0] == 'U') + ++CurPtr; + if (CurPtr[0] == 'L') + ++CurPtr; + if (CurPtr[0] == 'L') + ++CurPtr; +} + +// Look ahead to search for the first non-hex digit; if it's [hH], then we treat +// the integer as hexadecimal, possibly with leading zeroes. +static unsigned doLookAhead(const char *&CurPtr, unsigned DefaultRadix) { + const char *FirstHex = 0; + const char *LookAhead = CurPtr; + while (1) { + if (isdigit(*LookAhead)) { + ++LookAhead; + } else if (isxdigit(*LookAhead)) { + if (!FirstHex) + FirstHex = LookAhead; + ++LookAhead; + } else { + break; + } + } + bool isHex = *LookAhead == 'h' || *LookAhead == 'H'; + CurPtr = isHex || !FirstHex ? LookAhead : FirstHex; + if (isHex) + return 16; + return DefaultRadix; } /// LexDigit: First character is [0-9]. @@ -167,16 +193,15 @@ static void SkipIgnoredIntegerSuffix(const char *&CurPtr) { /// Forward/Backward Label: [0-9][fb] /// Binary integer: 0b[01]+ /// Octal integer: 0[0-7]+ -/// Hex integer: 0x[0-9a-fA-F]+ +/// Hex integer: 0x[0-9a-fA-F]+ or [0x]?[0-9][0-9a-fA-F]*[hH] /// Decimal integer: [1-9][0-9]* AsmToken AsmLexer::LexDigit() { // Decimal integer: [1-9][0-9]* if (CurPtr[-1] != '0' || CurPtr[0] == '.') { - while (isdigit(*CurPtr)) - ++CurPtr; - + unsigned Radix = doLookAhead(CurPtr, 10); + bool isHex = Radix == 16; // Check for floating point literals. - if (*CurPtr == '.' || *CurPtr == 'e') { + if (!isHex && (*CurPtr == '.' || *CurPtr == 'e')) { ++CurPtr; return LexFloatLiteral(); } @@ -184,17 +209,22 @@ AsmToken AsmLexer::LexDigit() { StringRef Result(TokStart, CurPtr - TokStart); long long Value; - if (Result.getAsInteger(10, Value)) { + if (Result.getAsInteger(Radix, Value)) { // Allow positive values that are too large to fit into a signed 64-bit // integer, but that do fit in an unsigned one, we just convert them over. unsigned long long UValue; - if (Result.getAsInteger(10, UValue)) - return ReturnError(TokStart, "invalid decimal number"); + if (Result.getAsInteger(Radix, UValue)) + return ReturnError(TokStart, !isHex ? "invalid decimal number" : + "invalid hexadecimal number"); Value = (long long)UValue; } - // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL - // suffixes on integer literals. + // Consume the [bB][hH].
+ if (Radix == 2 || Radix == 16) + ++CurPtr; + + // The darwin/x86 (and x86-64) assembler accepts and ignores type + // suffixes on integer literals. SkipIgnoredIntegerSuffix(CurPtr); return AsmToken(AsmToken::Integer, Result, Value); @@ -243,6 +273,10 @@ AsmToken AsmLexer::LexDigit() { if (StringRef(TokStart, CurPtr - TokStart).getAsInteger(0, Result)) return ReturnError(TokStart, "invalid hexadecimal number"); + // Consume the optional [hH]. + if (*CurPtr == 'h' || *CurPtr == 'H') + ++CurPtr; + // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL // suffixes on integer literals. SkipIgnoredIntegerSuffix(CurPtr); @@ -251,14 +285,18 @@ (int64_t)Result); } - // Must be an octal number, it starts with 0. - while (*CurPtr >= '0' && *CurPtr <= '9') - ++CurPtr; - - StringRef Result(TokStart, CurPtr - TokStart); + // Either octal or hexadecimal. long long Value; - if (Result.getAsInteger(8, Value)) - return ReturnError(TokStart, "invalid octal number"); + unsigned Radix = doLookAhead(CurPtr, 8); + bool isHex = Radix == 16; + StringRef Result(TokStart, CurPtr - TokStart); + if (Result.getAsInteger(Radix, Value)) + return ReturnError(TokStart, !isHex ? "invalid octal number" : + "invalid hexadecimal number"); + + // Consume the [hH]. + if (Radix == 16) + ++CurPtr; // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL // suffixes on integer literals. diff --git a/lib/MC/MCParser/AsmParser.cpp b/lib/MC/MCParser/AsmParser.cpp index 7c3fea5..6ab49ec 100644 --- a/lib/MC/MCParser/AsmParser.cpp +++ b/lib/MC/MCParser/AsmParser.cpp @@ -13,6 +13,7 @@ #include "llvm/ADT/APFloat.h" #include "llvm/ADT/SmallString.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringMap.h" #include "llvm/ADT/Twine.h" #include "llvm/MC/MCAsmInfo.h" @@ -50,27 +51,30 @@ MCAsmParserSemaCallback::~MCAsmParserSemaCallback() {} namespace { -/// \brief Helper class for tracking macro definitions. -typedef std::vector<AsmToken> MacroArgument; -typedef std::vector<MacroArgument> MacroArguments; -typedef std::pair<StringRef, MacroArgument> MacroParameter; -typedef std::vector<MacroParameter> MacroParameters; +/// \brief Helper types for tracking macro definitions. +typedef std::vector<AsmToken> MCAsmMacroArgument; +typedef std::vector<MCAsmMacroArgument> MCAsmMacroArguments; +typedef std::pair<StringRef, MCAsmMacroArgument> MCAsmMacroParameter; +typedef std::vector<MCAsmMacroParameter> MCAsmMacroParameters; -struct Macro { +struct MCAsmMacro { StringRef Name; StringRef Body; - MacroParameters Parameters; + MCAsmMacroParameters Parameters; public: - Macro(StringRef N, StringRef B, const MacroParameters &P) : + MCAsmMacro(StringRef N, StringRef B, const MCAsmMacroParameters &P) : Name(N), Body(B), Parameters(P) {} + + MCAsmMacro(const MCAsmMacro& Other) + : Name(Other.Name), Body(Other.Body), Parameters(Other.Parameters) {} }; /// \brief Helper class for storing information about an active macro /// instantiation. struct MacroInstantiation { /// The macro being instantiated. - const Macro *TheMacro; + const MCAsmMacro *TheMacro; /// The macro instantiation with substitutions. MemoryBuffer *Instantiation; @@ -85,11 +89,10 @@ struct MacroInstantiation { SMLoc ExitLoc; public: - MacroInstantiation(const Macro *M, SMLoc IL, int EB, SMLoc EL, + MacroInstantiation(const MCAsmMacro *M, SMLoc IL, int EB, SMLoc EL, MemoryBuffer *I); }; -//struct AsmRewrite; struct ParseStatementInfo { /// ParsedOperands - The parsed operands from the last parsed statement.
SmallVector<MCParsedAsmOperand*, 8> ParsedOperands; @@ -116,8 +119,6 @@ struct ParseStatementInfo { /// \brief The concrete assembly parser instance. class AsmParser : public MCAsmParser { - friend class GenericAsmParser; - AsmParser(const AsmParser &) LLVM_DELETED_FUNCTION; void operator=(const AsmParser &) LLVM_DELETED_FUNCTION; private: @@ -128,7 +129,6 @@ private: SourceMgr &SrcMgr; SourceMgr::DiagHandlerTy SavedDiagHandler; void *SavedDiagContext; - MCAsmParserExtension *GenericParser; MCAsmParserExtension *PlatformParser; /// This is the current buffer index we're lexing from as managed by the @@ -138,20 +138,19 @@ private: AsmCond TheCondState; std::vector<AsmCond> TheCondStack; - /// DirectiveMap - This is a table handlers for directives. Each handler is - /// invoked after the directive identifier is read and is responsible for - /// parsing and validating the rest of the directive. The handler is passed - /// in the directive name and the location of the directive keyword. - StringMap<std::pair<MCAsmParserExtension*, DirectiveHandler> > DirectiveMap; + /// ExtensionDirectiveMap - maps directive names to handler methods in parser + /// extensions. Extensions register themselves in this map by calling + /// addDirectiveHandler. + StringMap<ExtensionDirectiveHandler> ExtensionDirectiveMap; /// MacroMap - Map of currently defined macros. - StringMap<Macro*> MacroMap; + StringMap<MCAsmMacro*> MacroMap; /// ActiveMacros - Stack of active macro instantiations. std::vector<MacroInstantiation*> ActiveMacros; /// Boolean tracking whether macro substitution is enabled. - unsigned MacrosEnabled : 1; + unsigned MacrosEnabledFlag : 1; /// Flag tracking whether any errors have been encountered. unsigned HadError : 1; @@ -178,10 +177,9 @@ public: virtual bool Run(bool NoInitialTextSection, bool NoFinalize = false); - virtual void AddDirectiveHandler(MCAsmParserExtension *Object, - StringRef Directive, - DirectiveHandler Handler) { - DirectiveMap[Directive] = std::make_pair(Object, Handler); + virtual void addDirectiveHandler(StringRef Directive, + ExtensionDirectiveHandler Handler) { + ExtensionDirectiveMap[Directive] = Handler; } public: @@ -212,7 +210,7 @@ public: void setParsingInlineAsm(bool V) { ParsingInlineAsm = V; } bool isParsingInlineAsm() { return ParsingInlineAsm; } - bool ParseMSInlineAsm(void *AsmLoc, std::string &AsmString, + bool parseMSInlineAsm(void *AsmLoc, std::string &AsmString, unsigned &NumOutputs, unsigned &NumInputs, SmallVectorImpl<std::pair<void *,bool> > &OpDecls, SmallVectorImpl<std::string> &Constraints, @@ -221,27 +219,70 @@ public: const MCInstPrinter *IP, MCAsmParserSemaCallback &SI); - bool ParseExpression(const MCExpr *&Res); - virtual bool ParseExpression(const MCExpr *&Res, SMLoc &EndLoc); - virtual bool ParseParenExpression(const MCExpr *&Res, SMLoc &EndLoc); - virtual bool ParseAbsoluteExpression(int64_t &Res); + bool parseExpression(const MCExpr *&Res); + virtual bool parseExpression(const MCExpr *&Res, SMLoc &EndLoc); + virtual bool parseParenExpression(const MCExpr *&Res, SMLoc &EndLoc); + virtual bool parseAbsoluteExpression(int64_t &Res); + /// parseIdentifier - Parse an identifier or string (as a quoted identifier) + /// and set \p Res to the identifier contents. 
+ virtual bool parseIdentifier(StringRef &Res); + virtual void eatToEndOfStatement(); + + virtual void checkForValidSection(); /// } private: - void CheckForValidSection(); bool ParseStatement(ParseStatementInfo &Info); void EatToEndOfLine(); bool ParseCppHashLineFilenameComment(const SMLoc &L); - bool HandleMacroEntry(StringRef Name, SMLoc NameLoc, const Macro *M); + void CheckForBadMacro(SMLoc DirectiveLoc, StringRef Name, StringRef Body, + MCAsmMacroParameters Parameters); bool expandMacro(raw_svector_ostream &OS, StringRef Body, - const MacroParameters &Parameters, - const MacroArguments &A, + const MCAsmMacroParameters &Parameters, + const MCAsmMacroArguments &A, const SMLoc &L); + + /// \brief Are macros enabled in the parser? + bool MacrosEnabled() {return MacrosEnabledFlag;} + + /// \brief Control a flag in the parser that enables or disables macros. + void SetMacrosEnabled(bool Flag) {MacrosEnabledFlag = Flag;} + + /// \brief Lookup a previously defined macro. + /// \param Name Macro name. + /// \returns Pointer to macro. NULL if no such macro was defined. + const MCAsmMacro* LookupMacro(StringRef Name); + + /// \brief Define a new macro with the given name and information. + void DefineMacro(StringRef Name, const MCAsmMacro& Macro); + + /// \brief Undefine a macro. If no such macro was defined, it's a no-op. + void UndefineMacro(StringRef Name); + + /// \brief Are we inside a macro instantiation? + bool InsideMacroInstantiation() {return !ActiveMacros.empty();} + + /// \brief Handle entry to macro instantiation. + /// + /// \param M The macro. + /// \param NameLoc Instantiation location. + bool HandleMacroEntry(const MCAsmMacro *M, SMLoc NameLoc); + + /// \brief Handle exit from macro instantiation. void HandleMacroExit(); + /// \brief Extract AsmTokens for a macro argument. If the argument delimiter + /// is initially unknown, set it to AsmToken::Eof. It will be set to the + /// correct delimiter by the method. + bool ParseMacroArgument(MCAsmMacroArgument &MA, + AsmToken::TokenKind &ArgumentDelimiter); + + /// \brief Parse all macro arguments for a given macro. + bool ParseMacroArguments(const MCAsmMacro *M, MCAsmMacroArguments &A); + void PrintMacroInstantiations(); void PrintMessage(SMLoc Loc, SourceMgr::DiagKind Kind, const Twine &Msg, ArrayRef<SMRange> Ranges = ArrayRef<SMRange>()) const { @@ -263,16 +304,10 @@ private: /// location. void JumpToLoc(SMLoc Loc, int InBuffer=-1); - virtual void EatToEndOfStatement(); - - bool ParseMacroArgument(MacroArgument &MA, - AsmToken::TokenKind &ArgumentDelimiter); - bool ParseMacroArguments(const Macro *M, MacroArguments &A); - /// \brief Parse up to the end of statement and a return the contents from the /// current token until the end of the statement; the current token on exit /// will be either the EndOfStatement or EOF. - virtual StringRef ParseStringToEndOfStatement(); + virtual StringRef parseStringToEndOfStatement(); /// \brief Parse until the end of a statement or a comma is encountered, /// return the contents from the current token up to the end or comma. @@ -286,18 +321,43 @@ private: bool ParseParenExpr(const MCExpr *&Res, SMLoc &EndLoc); bool ParseBracketExpr(const MCExpr *&Res, SMLoc &EndLoc); - /// ParseIdentifier - Parse an identifier or string (as a quoted identifier) - /// and set \p Res to the identifier contents. - virtual bool ParseIdentifier(StringRef &Res); - - // Directive Parsing. 
+ bool ParseRegisterOrRegisterNumber(int64_t &Register, SMLoc DirectiveLoc); - // ".ascii", ".asciiz", ".string" + // Generic (target and platform independent) directive parsing. + enum DirectiveKind { + DK_NO_DIRECTIVE, // Placeholder + DK_SET, DK_EQU, DK_EQUIV, DK_ASCII, DK_ASCIZ, DK_STRING, DK_BYTE, DK_SHORT, + DK_VALUE, DK_2BYTE, DK_LONG, DK_INT, DK_4BYTE, DK_QUAD, DK_8BYTE, DK_SINGLE, + DK_FLOAT, DK_DOUBLE, DK_ALIGN, DK_ALIGN32, DK_BALIGN, DK_BALIGNW, + DK_BALIGNL, DK_P2ALIGN, DK_P2ALIGNW, DK_P2ALIGNL, DK_ORG, DK_FILL, DK_ENDR, + DK_BUNDLE_ALIGN_MODE, DK_BUNDLE_LOCK, DK_BUNDLE_UNLOCK, + DK_ZERO, DK_EXTERN, DK_GLOBL, DK_GLOBAL, DK_INDIRECT_SYMBOL, + DK_LAZY_REFERENCE, DK_NO_DEAD_STRIP, DK_SYMBOL_RESOLVER, DK_PRIVATE_EXTERN, + DK_REFERENCE, DK_WEAK_DEFINITION, DK_WEAK_REFERENCE, + DK_WEAK_DEF_CAN_BE_HIDDEN, DK_COMM, DK_COMMON, DK_LCOMM, DK_ABORT, + DK_INCLUDE, DK_INCBIN, DK_CODE16, DK_CODE16GCC, DK_REPT, DK_IRP, DK_IRPC, + DK_IF, DK_IFB, DK_IFNB, DK_IFC, DK_IFNC, DK_IFDEF, DK_IFNDEF, DK_IFNOTDEF, + DK_ELSEIF, DK_ELSE, DK_ENDIF, + DK_SPACE, DK_SKIP, DK_FILE, DK_LINE, DK_LOC, DK_STABS, + DK_CFI_SECTIONS, DK_CFI_STARTPROC, DK_CFI_ENDPROC, DK_CFI_DEF_CFA, + DK_CFI_DEF_CFA_OFFSET, DK_CFI_ADJUST_CFA_OFFSET, DK_CFI_DEF_CFA_REGISTER, + DK_CFI_OFFSET, DK_CFI_REL_OFFSET, DK_CFI_PERSONALITY, DK_CFI_LSDA, + DK_CFI_REMEMBER_STATE, DK_CFI_RESTORE_STATE, DK_CFI_SAME_VALUE, + DK_CFI_RESTORE, DK_CFI_ESCAPE, DK_CFI_SIGNAL_FRAME, DK_CFI_UNDEFINED, + DK_CFI_REGISTER, + DK_MACROS_ON, DK_MACROS_OFF, DK_MACRO, DK_ENDM, DK_ENDMACRO, DK_PURGEM, + DK_SLEB128, DK_ULEB128 + }; + + /// DirectiveKindMap - Maps directive name --> DirectiveKind enum, for + /// directives parsed by this class. + StringMap<DirectiveKind> DirectiveKindMap; + + // ".ascii", ".asciz", ".string" bool ParseDirectiveAscii(StringRef IDVal, bool ZeroTerminated); bool ParseDirectiveValue(unsigned Size); // ".byte", ".long", ... bool ParseDirectiveRealValue(const fltSemantics &); // ".single", ... 
bool ParseDirectiveFill(); // ".fill" - bool ParseDirectiveSpace(); // ".space" bool ParseDirectiveZero(); // ".zero" // ".set", ".equ", ".equiv" bool ParseDirectiveSet(StringRef IDVal, bool allow_redef); @@ -305,6 +365,38 @@ private: // ".align{,32}", ".p2align{,w,l}" bool ParseDirectiveAlign(bool IsPow2, unsigned ValueSize); + // ".file", ".line", ".loc", ".stabs" + bool ParseDirectiveFile(SMLoc DirectiveLoc); + bool ParseDirectiveLine(); + bool ParseDirectiveLoc(); + bool ParseDirectiveStabs(); + + // .cfi directives + bool ParseDirectiveCFIRegister(SMLoc DirectiveLoc); + bool ParseDirectiveCFISections(); + bool ParseDirectiveCFIStartProc(); + bool ParseDirectiveCFIEndProc(); + bool ParseDirectiveCFIDefCfaOffset(); + bool ParseDirectiveCFIDefCfa(SMLoc DirectiveLoc); + bool ParseDirectiveCFIAdjustCfaOffset(); + bool ParseDirectiveCFIDefCfaRegister(SMLoc DirectiveLoc); + bool ParseDirectiveCFIOffset(SMLoc DirectiveLoc); + bool ParseDirectiveCFIRelOffset(SMLoc DirectiveLoc); + bool ParseDirectiveCFIPersonalityOrLsda(bool IsPersonality); + bool ParseDirectiveCFIRememberState(); + bool ParseDirectiveCFIRestoreState(); + bool ParseDirectiveCFISameValue(SMLoc DirectiveLoc); + bool ParseDirectiveCFIRestore(SMLoc DirectiveLoc); + bool ParseDirectiveCFIEscape(); + bool ParseDirectiveCFISignalFrame(); + bool ParseDirectiveCFIUndefined(SMLoc DirectiveLoc); + + // macro directives + bool ParseDirectivePurgeMacro(SMLoc DirectiveLoc); + bool ParseDirectiveEndMacro(StringRef Directive); + bool ParseDirectiveMacro(SMLoc DirectiveLoc); + bool ParseDirectiveMacrosOnOff(StringRef Directive); + // ".bundle_align_mode" bool ParseDirectiveBundleAlignMode(); // ".bundle_lock" @@ -312,6 +404,12 @@ private: // ".bundle_unlock" bool ParseDirectiveBundleUnlock(); + // ".space", ".skip" + bool ParseDirectiveSpace(StringRef IDVal); + + // .sleb128 (Signed=true) and .uleb128 (Signed=false) + bool ParseDirectiveLEB128(bool Signed); + /// ParseDirectiveSymbolAttribute - Parse a directive like ".globl" which /// accepts a single symbol (which should be a label or an external). bool ParseDirectiveSymbolAttribute(MCSymbolAttr Attr); @@ -332,139 +430,29 @@ private: bool ParseDirectiveElseIf(SMLoc DirectiveLoc); // ".elseif" bool ParseDirectiveElse(SMLoc DirectiveLoc); // ".else" bool ParseDirectiveEndIf(SMLoc DirectiveLoc); // .endif - - /// ParseEscapedString - Parse the current token as a string which may include - /// escaped characters and return the string contents. - bool ParseEscapedString(std::string &Data); + virtual bool parseEscapedString(std::string &Data); const MCExpr *ApplyModifierToExpr(const MCExpr *E, MCSymbolRefExpr::VariantKind Variant); // Macro-like directives - Macro *ParseMacroLikeBody(SMLoc DirectiveLoc); - void InstantiateMacroLikeBody(Macro *M, SMLoc DirectiveLoc, + MCAsmMacro *ParseMacroLikeBody(SMLoc DirectiveLoc); + void InstantiateMacroLikeBody(MCAsmMacro *M, SMLoc DirectiveLoc, raw_svector_ostream &OS); bool ParseDirectiveRept(SMLoc DirectiveLoc); // ".rept" bool ParseDirectiveIrp(SMLoc DirectiveLoc); // ".irp" bool ParseDirectiveIrpc(SMLoc DirectiveLoc); // ".irpc" bool ParseDirectiveEndr(SMLoc DirectiveLoc); // ".endr" - // "_emit" - bool ParseDirectiveEmit(SMLoc DirectiveLoc, ParseStatementInfo &Info); -}; - -/// \brief Generic implementations of directive handling, etc. which is shared -/// (or the default, at least) for all assembler parser. 
-class GenericAsmParser : public MCAsmParserExtension { - template<bool (GenericAsmParser::*Handler)(StringRef, SMLoc)> - void AddDirectiveHandler(StringRef Directive) { - getParser().AddDirectiveHandler(this, Directive, - HandleDirective<GenericAsmParser, Handler>); - } -public: - GenericAsmParser() {} - - AsmParser &getParser() { - return (AsmParser&) this->MCAsmParserExtension::getParser(); - } - - virtual void Initialize(MCAsmParser &Parser) { - // Call the base implementation. - this->MCAsmParserExtension::Initialize(Parser); - - // Debugging directives. - AddDirectiveHandler<&GenericAsmParser::ParseDirectiveFile>(".file"); - AddDirectiveHandler<&GenericAsmParser::ParseDirectiveLine>(".line"); - AddDirectiveHandler<&GenericAsmParser::ParseDirectiveLoc>(".loc"); - AddDirectiveHandler<&GenericAsmParser::ParseDirectiveStabs>(".stabs"); - - // CFI directives. - AddDirectiveHandler<&GenericAsmParser::ParseDirectiveCFISections>( - ".cfi_sections"); - AddDirectiveHandler<&GenericAsmParser::ParseDirectiveCFIStartProc>( - ".cfi_startproc"); - AddDirectiveHandler<&GenericAsmParser::ParseDirectiveCFIEndProc>( - ".cfi_endproc"); - AddDirectiveHandler<&GenericAsmParser::ParseDirectiveCFIDefCfa>( - ".cfi_def_cfa"); - AddDirectiveHandler<&GenericAsmParser::ParseDirectiveCFIDefCfaOffset>( - ".cfi_def_cfa_offset"); - AddDirectiveHandler<&GenericAsmParser::ParseDirectiveCFIAdjustCfaOffset>( - ".cfi_adjust_cfa_offset"); - AddDirectiveHandler<&GenericAsmParser::ParseDirectiveCFIDefCfaRegister>( - ".cfi_def_cfa_register"); - AddDirectiveHandler<&GenericAsmParser::ParseDirectiveCFIOffset>( - ".cfi_offset"); - AddDirectiveHandler<&GenericAsmParser::ParseDirectiveCFIRelOffset>( - ".cfi_rel_offset"); - AddDirectiveHandler< - &GenericAsmParser::ParseDirectiveCFIPersonalityOrLsda>(".cfi_personality"); - AddDirectiveHandler< - &GenericAsmParser::ParseDirectiveCFIPersonalityOrLsda>(".cfi_lsda"); - AddDirectiveHandler< - &GenericAsmParser::ParseDirectiveCFIRememberState>(".cfi_remember_state"); - AddDirectiveHandler< - &GenericAsmParser::ParseDirectiveCFIRestoreState>(".cfi_restore_state"); - AddDirectiveHandler< - &GenericAsmParser::ParseDirectiveCFISameValue>(".cfi_same_value"); - AddDirectiveHandler< - &GenericAsmParser::ParseDirectiveCFIRestore>(".cfi_restore"); - AddDirectiveHandler< - &GenericAsmParser::ParseDirectiveCFIEscape>(".cfi_escape"); - AddDirectiveHandler< - &GenericAsmParser::ParseDirectiveCFISignalFrame>(".cfi_signal_frame"); - AddDirectiveHandler< - &GenericAsmParser::ParseDirectiveCFIUndefined>(".cfi_undefined"); - AddDirectiveHandler< - &GenericAsmParser::ParseDirectiveCFIRegister>(".cfi_register"); - - // Macro directives. 
- AddDirectiveHandler<&GenericAsmParser::ParseDirectiveMacrosOnOff>( - ".macros_on"); - AddDirectiveHandler<&GenericAsmParser::ParseDirectiveMacrosOnOff>( - ".macros_off"); - AddDirectiveHandler<&GenericAsmParser::ParseDirectiveMacro>(".macro"); - AddDirectiveHandler<&GenericAsmParser::ParseDirectiveEndMacro>(".endm"); - AddDirectiveHandler<&GenericAsmParser::ParseDirectiveEndMacro>(".endmacro"); - AddDirectiveHandler<&GenericAsmParser::ParseDirectivePurgeMacro>(".purgem"); - - AddDirectiveHandler<&GenericAsmParser::ParseDirectiveLEB128>(".sleb128"); - AddDirectiveHandler<&GenericAsmParser::ParseDirectiveLEB128>(".uleb128"); - } + // "_emit" or "__emit" + bool ParseDirectiveMSEmit(SMLoc DirectiveLoc, ParseStatementInfo &Info, + size_t Len); - bool ParseRegisterOrRegisterNumber(int64_t &Register, SMLoc DirectiveLoc); + // "align" + bool ParseDirectiveMSAlign(SMLoc DirectiveLoc, ParseStatementInfo &Info); - bool ParseDirectiveFile(StringRef, SMLoc DirectiveLoc); - bool ParseDirectiveLine(StringRef, SMLoc DirectiveLoc); - bool ParseDirectiveLoc(StringRef, SMLoc DirectiveLoc); - bool ParseDirectiveStabs(StringRef, SMLoc DirectiveLoc); - bool ParseDirectiveCFISections(StringRef, SMLoc DirectiveLoc); - bool ParseDirectiveCFIStartProc(StringRef, SMLoc DirectiveLoc); - bool ParseDirectiveCFIEndProc(StringRef, SMLoc DirectiveLoc); - bool ParseDirectiveCFIDefCfa(StringRef, SMLoc DirectiveLoc); - bool ParseDirectiveCFIDefCfaOffset(StringRef, SMLoc DirectiveLoc); - bool ParseDirectiveCFIAdjustCfaOffset(StringRef, SMLoc DirectiveLoc); - bool ParseDirectiveCFIDefCfaRegister(StringRef, SMLoc DirectiveLoc); - bool ParseDirectiveCFIOffset(StringRef, SMLoc DirectiveLoc); - bool ParseDirectiveCFIRelOffset(StringRef, SMLoc DirectiveLoc); - bool ParseDirectiveCFIPersonalityOrLsda(StringRef, SMLoc DirectiveLoc); - bool ParseDirectiveCFIRememberState(StringRef, SMLoc DirectiveLoc); - bool ParseDirectiveCFIRestoreState(StringRef, SMLoc DirectiveLoc); - bool ParseDirectiveCFISameValue(StringRef, SMLoc DirectiveLoc); - bool ParseDirectiveCFIRestore(StringRef, SMLoc DirectiveLoc); - bool ParseDirectiveCFIEscape(StringRef, SMLoc DirectiveLoc); - bool ParseDirectiveCFISignalFrame(StringRef, SMLoc DirectiveLoc); - bool ParseDirectiveCFIUndefined(StringRef, SMLoc DirectiveLoc); - bool ParseDirectiveCFIRegister(StringRef, SMLoc DirectiveLoc); - - bool ParseDirectiveMacrosOnOff(StringRef, SMLoc DirectiveLoc); - bool ParseDirectiveMacro(StringRef, SMLoc DirectiveLoc); - bool ParseDirectiveEndMacro(StringRef, SMLoc DirectiveLoc); - bool ParseDirectivePurgeMacro(StringRef, SMLoc DirectiveLoc); - - bool ParseDirectiveLEB128(StringRef, SMLoc); + void initializeDirectiveKindMap(); }; - } namespace llvm { @@ -480,8 +468,8 @@ enum { DEFAULT_ADDRSPACE = 0 }; AsmParser::AsmParser(SourceMgr &_SM, MCContext &_Ctx, MCStreamer &_Out, const MCAsmInfo &_MAI) : Lexer(_MAI), Ctx(_Ctx), Out(_Out), MAI(_MAI), SrcMgr(_SM), - GenericParser(new GenericAsmParser), PlatformParser(0), - CurBuffer(0), MacrosEnabled(true), CppHashLineNumber(0), + PlatformParser(0), + CurBuffer(0), MacrosEnabledFlag(true), CppHashLineNumber(0), AssemblerDialect(~0U), IsDarwin(false), ParsingInlineAsm(false) { // Save the old handler. SavedDiagHandler = SrcMgr.getDiagHandler(); @@ -490,9 +478,6 @@ AsmParser::AsmParser(SourceMgr &_SM, MCContext &_Ctx, SrcMgr.setDiagHandler(DiagHandler, this); Lexer.setBuffer(SrcMgr.getMemoryBuffer(CurBuffer)); - // Initialize the generic parser. - GenericParser->Initialize(*this); - // Initialize the platform / file format parser. 
// // FIXME: This is a hack, we need to (majorly) cleanup how these objects are @@ -508,18 +493,19 @@ AsmParser::AsmParser(SourceMgr &_SM, MCContext &_Ctx, PlatformParser = createELFAsmParser(); PlatformParser->Initialize(*this); } + + initializeDirectiveKindMap(); } AsmParser::~AsmParser() { assert(ActiveMacros.empty() && "Unexpected active macro instantiation!"); // Destroy any macros. - for (StringMap<Macro*>::iterator it = MacroMap.begin(), + for (StringMap<MCAsmMacro*>::iterator it = MacroMap.begin(), ie = MacroMap.end(); it != ie; ++it) delete it->getValue(); delete PlatformParser; - delete GenericParser; } void AsmParser::PrintMacroInstantiations() { @@ -632,7 +618,7 @@ bool AsmParser::Run(bool NoInitialTextSection, bool NoFinalize) { // We had an error, validate that one was emitted and recover by skipping to // the next line. assert(HadError && "Parse statement returned an error, but none emitted!"); - EatToEndOfStatement(); + eatToEndOfStatement(); } if (TheCondState.TheCond != StartingCondState.TheCond || @@ -679,18 +665,15 @@ bool AsmParser::Run(bool NoInitialTextSection, bool NoFinalize) { return HadError; } -void AsmParser::CheckForValidSection() { +void AsmParser::checkForValidSection() { if (!ParsingInlineAsm && !getStreamer().getCurrentSection()) { TokError("expected section directive before assembly directive"); - Out.SwitchSection(Ctx.getMachOSection( - "__TEXT", "__text", - MCSectionMachO::S_ATTR_PURE_INSTRUCTIONS, - 0, SectionKind::getText())); + Out.InitToTextSection(); } } -/// EatToEndOfStatement - Throw away the rest of the line for testing purposes. -void AsmParser::EatToEndOfStatement() { +/// eatToEndOfStatement - Throw away the rest of the line for testing purposes. +void AsmParser::eatToEndOfStatement() { while (Lexer.isNot(AsmToken::EndOfStatement) && Lexer.isNot(AsmToken::Eof)) Lex(); @@ -700,7 +683,7 @@ void AsmParser::EatToEndOfStatement() { Lex(); } -StringRef AsmParser::ParseStringToEndOfStatement() { +StringRef AsmParser::parseStringToEndOfStatement() { const char *Start = getTok().getLoc().getPointer(); while (Lexer.isNot(AsmToken::EndOfStatement) && @@ -729,7 +712,7 @@ StringRef AsmParser::ParseStringToComma() { /// parenexpr ::= expr) /// bool AsmParser::ParseParenExpr(const MCExpr *&Res, SMLoc &EndLoc) { - if (ParseExpression(Res)) return true; + if (parseExpression(Res)) return true; if (Lexer.isNot(AsmToken::RParen)) return TokError("expected ')' in parentheses expression"); EndLoc = Lexer.getTok().getEndLoc(); @@ -743,7 +726,7 @@ bool AsmParser::ParseParenExpr(const MCExpr *&Res, SMLoc &EndLoc) { /// bracketexpr ::= expr] /// bool AsmParser::ParseBracketExpr(const MCExpr *&Res, SMLoc &EndLoc) { - if (ParseExpression(Res)) return true; + if (parseExpression(Res)) return true; if (Lexer.isNot(AsmToken::RBrac)) return TokError("expected ']' in brackets expression"); EndLoc = Lexer.getTok().getEndLoc(); @@ -758,7 +741,9 @@ bool AsmParser::ParseBracketExpr(const MCExpr *&Res, SMLoc &EndLoc) { /// primaryexpr ::= '.' /// primaryexpr ::= ~,+,- primaryexpr bool AsmParser::ParsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) { - switch (Lexer.getKind()) { + SMLoc FirstTokenLoc = getLexer().getLoc(); + AsmToken::TokenKind FirstTokenKind = Lexer.getKind(); + switch (FirstTokenKind) { default: return TokError("unknown token in expression"); // If we have an error assume that we've already handled it. 
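The getOrCreateDataFragment() change in the MCObjectStreamer.cpp hunk near the top of this section encodes a single rule: once aligned bundling is enabled, plain data must not be appended to a fragment that already holds instructions (the patch's new comment points at MCELFStreamer::EmitInstToData for the rationale); a fresh data fragment is started instead. A minimal standalone sketch of that selection rule, using hypothetical stand-in types rather than the real MC fragment classes:

#include <memory>
#include <vector>

// Illustrative stand-ins for MCDataFragment / MCSectionData.
struct DataFragment {
  bool HasInstructions = false;
  std::vector<unsigned char> Contents;
};

struct Section {
  std::vector<std::unique_ptr<DataFragment>> Fragments;
  bool BundlingEnabled = false;

  DataFragment *getOrCreateDataFragment() {
    DataFragment *F = Fragments.empty() ? nullptr : Fragments.back().get();
    // Start a new fragment if there is none, or if bundling forbids mixing
    // data into an instruction-carrying fragment.
    if (!F || (BundlingEnabled && F->HasInstructions)) {
      Fragments.push_back(std::make_unique<DataFragment>());
      F = Fragments.back().get();
    }
    return F;
  }
};

int main() {
  Section S;
  S.BundlingEnabled = true;
  DataFragment *A = S.getOrCreateDataFragment();
  A->HasInstructions = true; // pretend an instruction was emitted into A
  DataFragment *B = S.getOrCreateDataFragment();
  return A == B ? 1 : 0; // with bundling on, B is a fresh fragment
}

With BundlingEnabled left false the pre-patch behavior is preserved: data keeps accumulating in the trailing fragment.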
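The AsmLexer.cpp hunk above teaches LexDigit() to accept suffix-style radix literals such as 100h or 0FFh by calling doLookAhead() before committing to a radix: scan forward over [0-9a-fA-F]; if the character that stops the scan is h or H, the whole run is one hex token (radix 16); otherwise rewind to the first digit that was only valid in hex and keep the default radix (10 on the decimal path, 8 on the leading-zero path). A self-contained sketch of that scheme, simplified from the patch (lookAheadRadix and the test driver are mine, not LLVM's):

#include <cctype>
#include <cstdio>

static unsigned lookAheadRadix(const char *&CurPtr, unsigned DefaultRadix) {
  const char *FirstHex = nullptr;
  const char *LookAhead = CurPtr;
  while (true) {
    if (std::isdigit((unsigned char)*LookAhead)) {
      ++LookAhead;
    } else if (std::isxdigit((unsigned char)*LookAhead)) {
      if (!FirstHex)
        FirstHex = LookAhead; // first digit that is only valid in hex
      ++LookAhead;
    } else {
      break;
    }
  }
  bool IsHex = *LookAhead == 'h' || *LookAhead == 'H';
  // Hex: keep the whole run (the caller then consumes the h/H).
  // Otherwise: any trailing hex-only digits were a false start; back up.
  CurPtr = (IsHex || !FirstHex) ? LookAhead : FirstHex;
  return IsHex ? 16 : DefaultRadix;
}

int main() {
  const char *Tests[] = {"123", "123h", "19af", "0FFH"};
  for (const char *T : Tests) {
    const char *P = T;
    unsigned Radix = lookAheadRadix(P, T[0] == '0' ? 8 : 10);
    std::printf("%-5s -> radix %2u, %d digit(s) in token\n", T, Radix,
                (int)(P - T));
  }
  return 0;
}

So 19af lexes as the decimal token 19 (the trailing af is left for the next token), while 19afH is consumed as a single hexadecimal token.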
@@ -774,8 +759,11 @@ bool AsmParser::ParsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) { case AsmToken::String: case AsmToken::Identifier: { StringRef Identifier; - if (ParseIdentifier(Identifier)) + if (parseIdentifier(Identifier)) { + if (FirstTokenKind == AsmToken::Dollar) + return Error(FirstTokenLoc, "invalid token in expression"); return true; + } EndLoc = SMLoc::getFromPointer(Identifier.end()); @@ -876,9 +864,9 @@ bool AsmParser::ParsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) { } } -bool AsmParser::ParseExpression(const MCExpr *&Res) { +bool AsmParser::parseExpression(const MCExpr *&Res) { SMLoc EndLoc; - return ParseExpression(Res, EndLoc); + return parseExpression(Res, EndLoc); } const MCExpr * @@ -929,7 +917,7 @@ AsmParser::ApplyModifierToExpr(const MCExpr *E, llvm_unreachable("Invalid expression kind!"); } -/// ParseExpression - Parse an expression and return it. +/// parseExpression - Parse an expression and return it. /// /// expr ::= expr &&,|| expr -> lowest. /// expr ::= expr |,^,&,! expr @@ -939,7 +927,7 @@ AsmParser::ApplyModifierToExpr(const MCExpr *E, /// expr ::= expr *,/,% expr -> highest. /// expr ::= primaryexpr /// -bool AsmParser::ParseExpression(const MCExpr *&Res, SMLoc &EndLoc) { +bool AsmParser::parseExpression(const MCExpr *&Res, SMLoc &EndLoc) { // Parse the expression. Res = 0; if (ParsePrimaryExpr(Res, EndLoc) || ParseBinOpRHS(1, Res, EndLoc)) @@ -977,17 +965,17 @@ bool AsmParser::ParseExpression(const MCExpr *&Res, SMLoc &EndLoc) { return false; } -bool AsmParser::ParseParenExpression(const MCExpr *&Res, SMLoc &EndLoc) { +bool AsmParser::parseParenExpression(const MCExpr *&Res, SMLoc &EndLoc) { Res = 0; return ParseParenExpr(Res, EndLoc) || ParseBinOpRHS(1, Res, EndLoc); } -bool AsmParser::ParseAbsoluteExpression(int64_t &Res) { +bool AsmParser::parseAbsoluteExpression(int64_t &Res) { const MCExpr *Expr; SMLoc StartLoc = Lexer.getLoc(); - if (ParseExpression(Expr)) + if (parseExpression(Expr)) return true; if (!Expr->EvaluateAsAbsolute(Res)) @@ -1134,8 +1122,7 @@ bool AsmParser::ParseStatement(ParseStatementInfo &Info) { if (!TheCondState.Ignore) return TokError("unexpected token at start of statement"); IDVal = ""; - } - else { + } else { IDVal = getTok().getString(); Lex(); // Consume the integer token to be used as an identifier token. if (Lexer.getKind() != AsmToken::Colon) { @@ -1143,46 +1130,54 @@ bool AsmParser::ParseStatement(ParseStatementInfo &Info) { return TokError("unexpected token at start of statement"); } } - } else if (Lexer.is(AsmToken::Dot)) { // Treat '.' as a valid identifier in this context. Lex(); IDVal = "."; - - } else if (ParseIdentifier(IDVal)) { + } else if (parseIdentifier(IDVal)) { if (!TheCondState.Ignore) return TokError("unexpected token at start of statement"); IDVal = ""; } - // Handle conditional assembly here before checking for skipping. We // have to do this so that .endif isn't skipped in a ".if 0" block for // example. 
- if (IDVal == ".if") - return ParseDirectiveIf(IDLoc); - if (IDVal == ".ifb") - return ParseDirectiveIfb(IDLoc, true); - if (IDVal == ".ifnb") - return ParseDirectiveIfb(IDLoc, false); - if (IDVal == ".ifc") - return ParseDirectiveIfc(IDLoc, true); - if (IDVal == ".ifnc") - return ParseDirectiveIfc(IDLoc, false); - if (IDVal == ".ifdef") - return ParseDirectiveIfdef(IDLoc, true); - if (IDVal == ".ifndef" || IDVal == ".ifnotdef") - return ParseDirectiveIfdef(IDLoc, false); - if (IDVal == ".elseif") - return ParseDirectiveElseIf(IDLoc); - if (IDVal == ".else") - return ParseDirectiveElse(IDLoc); - if (IDVal == ".endif") - return ParseDirectiveEndIf(IDLoc); - - // If we are in a ".if 0" block, ignore this statement. + StringMap<DirectiveKind>::const_iterator DirKindIt = + DirectiveKindMap.find(IDVal); + DirectiveKind DirKind = + (DirKindIt == DirectiveKindMap.end()) ? DK_NO_DIRECTIVE : + DirKindIt->getValue(); + switch (DirKind) { + default: + break; + case DK_IF: + return ParseDirectiveIf(IDLoc); + case DK_IFB: + return ParseDirectiveIfb(IDLoc, true); + case DK_IFNB: + return ParseDirectiveIfb(IDLoc, false); + case DK_IFC: + return ParseDirectiveIfc(IDLoc, true); + case DK_IFNC: + return ParseDirectiveIfc(IDLoc, false); + case DK_IFDEF: + return ParseDirectiveIfdef(IDLoc, true); + case DK_IFNDEF: + case DK_IFNOTDEF: + return ParseDirectiveIfdef(IDLoc, false); + case DK_ELSEIF: + return ParseDirectiveElseIf(IDLoc); + case DK_ELSE: + return ParseDirectiveElse(IDLoc); + case DK_ENDIF: + return ParseDirectiveEndIf(IDLoc); + } + + // Ignore the statement if in the middle of an inactive conditional + // (e.g. ".if 0"). if (TheCondState.Ignore) { - EatToEndOfStatement(); + eatToEndOfStatement(); return false; } @@ -1191,7 +1186,7 @@ bool AsmParser::ParseStatement(ParseStatementInfo &Info) { // See what kind of statement we have. switch (Lexer.getKind()) { case AsmToken::Colon: { - CheckForValidSection(); + checkForValidSection(); // identifier ':' -> Label. Lex(); @@ -1245,167 +1240,233 @@ } // If macros are enabled, check to see if this is a macro instantiation. - if (MacrosEnabled) - if (const Macro *M = MacroMap.lookup(IDVal)) - return HandleMacroEntry(IDVal, IDLoc, M); + if (MacrosEnabled()) + if (const MCAsmMacro *M = LookupMacro(IDVal)) { + return HandleMacroEntry(M, IDLoc); + } // Otherwise, we have a normal instruction or directive. + + // Directives start with "." if (IDVal[0] == '.' && IDVal != ".") { - - // Target hook for parsing target specific directives. + // There are several entities interested in parsing directives: + // + // 1. The target-specific assembly parser. Some directives are target + // specific or may potentially behave differently on certain targets. + // 2. Asm parser extensions. For example, platform-specific parsers + // (like the ELF parser) register themselves as extensions. + // 3. The generic directive parser implemented by this class. These are + // all the directives that behave in a target and platform independent + // manner, or at least have a default behavior that's shared between + // all targets and platforms. + + // First query the target-specific parser. It will return 'true' if it + // isn't interested in this directive.
if (!getTargetParser().ParseDirective(ID)) return false; - // Assembler features - if (IDVal == ".set" || IDVal == ".equ") - return ParseDirectiveSet(IDVal, true); - if (IDVal == ".equiv") - return ParseDirectiveSet(IDVal, false); - - // Data directives - - if (IDVal == ".ascii") - return ParseDirectiveAscii(IDVal, false); - if (IDVal == ".asciz" || IDVal == ".string") - return ParseDirectiveAscii(IDVal, true); - - if (IDVal == ".byte") - return ParseDirectiveValue(1); - if (IDVal == ".short") - return ParseDirectiveValue(2); - if (IDVal == ".value") - return ParseDirectiveValue(2); - if (IDVal == ".2byte") - return ParseDirectiveValue(2); - if (IDVal == ".long") - return ParseDirectiveValue(4); - if (IDVal == ".int") - return ParseDirectiveValue(4); - if (IDVal == ".4byte") - return ParseDirectiveValue(4); - if (IDVal == ".quad") - return ParseDirectiveValue(8); - if (IDVal == ".8byte") - return ParseDirectiveValue(8); - if (IDVal == ".single" || IDVal == ".float") - return ParseDirectiveRealValue(APFloat::IEEEsingle); - if (IDVal == ".double") - return ParseDirectiveRealValue(APFloat::IEEEdouble); - - if (IDVal == ".align") { - bool IsPow2 = !getContext().getAsmInfo().getAlignmentIsInBytes(); - return ParseDirectiveAlign(IsPow2, /*ExprSize=*/1); - } - if (IDVal == ".align32") { - bool IsPow2 = !getContext().getAsmInfo().getAlignmentIsInBytes(); - return ParseDirectiveAlign(IsPow2, /*ExprSize=*/4); - } - if (IDVal == ".balign") - return ParseDirectiveAlign(/*IsPow2=*/false, /*ExprSize=*/1); - if (IDVal == ".balignw") - return ParseDirectiveAlign(/*IsPow2=*/false, /*ExprSize=*/2); - if (IDVal == ".balignl") - return ParseDirectiveAlign(/*IsPow2=*/false, /*ExprSize=*/4); - if (IDVal == ".p2align") - return ParseDirectiveAlign(/*IsPow2=*/true, /*ExprSize=*/1); - if (IDVal == ".p2alignw") - return ParseDirectiveAlign(/*IsPow2=*/true, /*ExprSize=*/2); - if (IDVal == ".p2alignl") - return ParseDirectiveAlign(/*IsPow2=*/true, /*ExprSize=*/4); - - if (IDVal == ".bundle_align_mode") - return ParseDirectiveBundleAlignMode(); - if (IDVal == ".bundle_lock") - return ParseDirectiveBundleLock(); - if (IDVal == ".bundle_unlock") - return ParseDirectiveBundleUnlock(); - - if (IDVal == ".org") - return ParseDirectiveOrg(); - - if (IDVal == ".fill") - return ParseDirectiveFill(); - if (IDVal == ".space" || IDVal == ".skip") - return ParseDirectiveSpace(); - if (IDVal == ".zero") - return ParseDirectiveZero(); - - // Symbol attribute directives - - if (IDVal == ".extern") { - EatToEndOfStatement(); // .extern is the default, ignore it. 
- return false; - } - if (IDVal == ".globl" || IDVal == ".global") - return ParseDirectiveSymbolAttribute(MCSA_Global); - if (IDVal == ".indirect_symbol") - return ParseDirectiveSymbolAttribute(MCSA_IndirectSymbol); - if (IDVal == ".lazy_reference") - return ParseDirectiveSymbolAttribute(MCSA_LazyReference); - if (IDVal == ".no_dead_strip") - return ParseDirectiveSymbolAttribute(MCSA_NoDeadStrip); - if (IDVal == ".symbol_resolver") - return ParseDirectiveSymbolAttribute(MCSA_SymbolResolver); - if (IDVal == ".private_extern") - return ParseDirectiveSymbolAttribute(MCSA_PrivateExtern); - if (IDVal == ".reference") - return ParseDirectiveSymbolAttribute(MCSA_Reference); - if (IDVal == ".weak_definition") - return ParseDirectiveSymbolAttribute(MCSA_WeakDefinition); - if (IDVal == ".weak_reference") - return ParseDirectiveSymbolAttribute(MCSA_WeakReference); - if (IDVal == ".weak_def_can_be_hidden") - return ParseDirectiveSymbolAttribute(MCSA_WeakDefAutoPrivate); - - if (IDVal == ".comm" || IDVal == ".common") - return ParseDirectiveComm(/*IsLocal=*/false); - if (IDVal == ".lcomm") - return ParseDirectiveComm(/*IsLocal=*/true); - - if (IDVal == ".abort") - return ParseDirectiveAbort(); - if (IDVal == ".include") - return ParseDirectiveInclude(); - if (IDVal == ".incbin") - return ParseDirectiveIncbin(); - - if (IDVal == ".code16" || IDVal == ".code16gcc") - return TokError(Twine(IDVal) + " not supported yet"); - - // Macro-like directives - if (IDVal == ".rept") - return ParseDirectiveRept(IDLoc); - if (IDVal == ".irp") - return ParseDirectiveIrp(IDLoc); - if (IDVal == ".irpc") - return ParseDirectiveIrpc(IDLoc); - if (IDVal == ".endr") - return ParseDirectiveEndr(IDLoc); - - // Look up the handler in the handler table. + // Next, check the extension directive map to see if any extension has + // registered itself to parse this directive. std::pair<MCAsmParserExtension*, DirectiveHandler> Handler = - DirectiveMap.lookup(IDVal); + ExtensionDirectiveMap.lookup(IDVal); if (Handler.first) return (*Handler.second)(Handler.first, IDVal, IDLoc); + // Finally, if no one else is interested in this directive, it must be + // generic and familiar to this class.
+ switch (DirKind) { + default: + break; + case DK_SET: + case DK_EQU: + return ParseDirectiveSet(IDVal, true); + case DK_EQUIV: + return ParseDirectiveSet(IDVal, false); + case DK_ASCII: + return ParseDirectiveAscii(IDVal, false); + case DK_ASCIZ: + case DK_STRING: + return ParseDirectiveAscii(IDVal, true); + case DK_BYTE: + return ParseDirectiveValue(1); + case DK_SHORT: + case DK_VALUE: + case DK_2BYTE: + return ParseDirectiveValue(2); + case DK_LONG: + case DK_INT: + case DK_4BYTE: + return ParseDirectiveValue(4); + case DK_QUAD: + case DK_8BYTE: + return ParseDirectiveValue(8); + case DK_SINGLE: + case DK_FLOAT: + return ParseDirectiveRealValue(APFloat::IEEEsingle); + case DK_DOUBLE: + return ParseDirectiveRealValue(APFloat::IEEEdouble); + case DK_ALIGN: { + bool IsPow2 = !getContext().getAsmInfo().getAlignmentIsInBytes(); + return ParseDirectiveAlign(IsPow2, /*ExprSize=*/1); + } + case DK_ALIGN32: { + bool IsPow2 = !getContext().getAsmInfo().getAlignmentIsInBytes(); + return ParseDirectiveAlign(IsPow2, /*ExprSize=*/4); + } + case DK_BALIGN: + return ParseDirectiveAlign(/*IsPow2=*/false, /*ExprSize=*/1); + case DK_BALIGNW: + return ParseDirectiveAlign(/*IsPow2=*/false, /*ExprSize=*/2); + case DK_BALIGNL: + return ParseDirectiveAlign(/*IsPow2=*/false, /*ExprSize=*/4); + case DK_P2ALIGN: + return ParseDirectiveAlign(/*IsPow2=*/true, /*ExprSize=*/1); + case DK_P2ALIGNW: + return ParseDirectiveAlign(/*IsPow2=*/true, /*ExprSize=*/2); + case DK_P2ALIGNL: + return ParseDirectiveAlign(/*IsPow2=*/true, /*ExprSize=*/4); + case DK_ORG: + return ParseDirectiveOrg(); + case DK_FILL: + return ParseDirectiveFill(); + case DK_ZERO: + return ParseDirectiveZero(); + case DK_EXTERN: + eatToEndOfStatement(); // .extern is the default, ignore it. + return false; + case DK_GLOBL: + case DK_GLOBAL: + return ParseDirectiveSymbolAttribute(MCSA_Global); + case DK_INDIRECT_SYMBOL: + return ParseDirectiveSymbolAttribute(MCSA_IndirectSymbol); + case DK_LAZY_REFERENCE: + return ParseDirectiveSymbolAttribute(MCSA_LazyReference); + case DK_NO_DEAD_STRIP: + return ParseDirectiveSymbolAttribute(MCSA_NoDeadStrip); + case DK_SYMBOL_RESOLVER: + return ParseDirectiveSymbolAttribute(MCSA_SymbolResolver); + case DK_PRIVATE_EXTERN: + return ParseDirectiveSymbolAttribute(MCSA_PrivateExtern); + case DK_REFERENCE: + return ParseDirectiveSymbolAttribute(MCSA_Reference); + case DK_WEAK_DEFINITION: + return ParseDirectiveSymbolAttribute(MCSA_WeakDefinition); + case DK_WEAK_REFERENCE: + return ParseDirectiveSymbolAttribute(MCSA_WeakReference); + case DK_WEAK_DEF_CAN_BE_HIDDEN: + return ParseDirectiveSymbolAttribute(MCSA_WeakDefAutoPrivate); + case DK_COMM: + case DK_COMMON: + return ParseDirectiveComm(/*IsLocal=*/false); + case DK_LCOMM: + return ParseDirectiveComm(/*IsLocal=*/true); + case DK_ABORT: + return ParseDirectiveAbort(); + case DK_INCLUDE: + return ParseDirectiveInclude(); + case DK_INCBIN: + return ParseDirectiveIncbin(); + case DK_CODE16: + case DK_CODE16GCC: + return TokError(Twine(IDVal) + " not supported yet"); + case DK_REPT: + return ParseDirectiveRept(IDLoc); + case DK_IRP: + return ParseDirectiveIrp(IDLoc); + case DK_IRPC: + return ParseDirectiveIrpc(IDLoc); + case DK_ENDR: + return ParseDirectiveEndr(IDLoc); + case DK_BUNDLE_ALIGN_MODE: + return ParseDirectiveBundleAlignMode(); + case DK_BUNDLE_LOCK: + return ParseDirectiveBundleLock(); + case DK_BUNDLE_UNLOCK: + return ParseDirectiveBundleUnlock(); + case DK_SLEB128: + return ParseDirectiveLEB128(true); + case DK_ULEB128: + return ParseDirectiveLEB128(false); 
+ case DK_SPACE: + case DK_SKIP: + return ParseDirectiveSpace(IDVal); + case DK_FILE: + return ParseDirectiveFile(IDLoc); + case DK_LINE: + return ParseDirectiveLine(); + case DK_LOC: + return ParseDirectiveLoc(); + case DK_STABS: + return ParseDirectiveStabs(); + case DK_CFI_SECTIONS: + return ParseDirectiveCFISections(); + case DK_CFI_STARTPROC: + return ParseDirectiveCFIStartProc(); + case DK_CFI_ENDPROC: + return ParseDirectiveCFIEndProc(); + case DK_CFI_DEF_CFA: + return ParseDirectiveCFIDefCfa(IDLoc); + case DK_CFI_DEF_CFA_OFFSET: + return ParseDirectiveCFIDefCfaOffset(); + case DK_CFI_ADJUST_CFA_OFFSET: + return ParseDirectiveCFIAdjustCfaOffset(); + case DK_CFI_DEF_CFA_REGISTER: + return ParseDirectiveCFIDefCfaRegister(IDLoc); + case DK_CFI_OFFSET: + return ParseDirectiveCFIOffset(IDLoc); + case DK_CFI_REL_OFFSET: + return ParseDirectiveCFIRelOffset(IDLoc); + case DK_CFI_PERSONALITY: + return ParseDirectiveCFIPersonalityOrLsda(true); + case DK_CFI_LSDA: + return ParseDirectiveCFIPersonalityOrLsda(false); + case DK_CFI_REMEMBER_STATE: + return ParseDirectiveCFIRememberState(); + case DK_CFI_RESTORE_STATE: + return ParseDirectiveCFIRestoreState(); + case DK_CFI_SAME_VALUE: + return ParseDirectiveCFISameValue(IDLoc); + case DK_CFI_RESTORE: + return ParseDirectiveCFIRestore(IDLoc); + case DK_CFI_ESCAPE: + return ParseDirectiveCFIEscape(); + case DK_CFI_SIGNAL_FRAME: + return ParseDirectiveCFISignalFrame(); + case DK_CFI_UNDEFINED: + return ParseDirectiveCFIUndefined(IDLoc); + case DK_CFI_REGISTER: + return ParseDirectiveCFIRegister(IDLoc); + case DK_MACROS_ON: + case DK_MACROS_OFF: + return ParseDirectiveMacrosOnOff(IDVal); + case DK_MACRO: + return ParseDirectiveMacro(IDLoc); + case DK_ENDM: + case DK_ENDMACRO: + return ParseDirectiveEndMacro(IDVal); + case DK_PURGEM: + return ParseDirectivePurgeMacro(IDLoc); + } return Error(IDLoc, "unknown directive"); } - // _emit - if (ParsingInlineAsm && IDVal == "_emit") - return ParseDirectiveEmit(IDLoc, Info); + // __asm _emit or __asm __emit + if (ParsingInlineAsm && (IDVal == "_emit" || IDVal == "__emit" || + IDVal == "_EMIT" || IDVal == "__EMIT")) + return ParseDirectiveMSEmit(IDLoc, Info, IDVal.size()); - CheckForValidSection(); + // __asm align + if (ParsingInlineAsm && (IDVal == "align" || IDVal == "ALIGN")) + return ParseDirectiveMSAlign(IDLoc, Info); - // Canonicalize the opcode to lower case. - SmallString<128> OpcodeStr; - for (unsigned i = 0, e = IDVal.size(); i != e; ++i) - OpcodeStr.push_back(tolower(IDVal[i])); + checkForValidSection(); + // Canonicalize the opcode to lower case. + std::string OpcodeStr = IDVal.lower(); ParseInstructionInfo IInfo(Info.AsmRewrites); - bool HadError = getTargetParser().ParseInstruction(IInfo, OpcodeStr.str(), - IDLoc,Info.ParsedOperands); + bool HadError = getTargetParser().ParseInstruction(IInfo, OpcodeStr, + IDLoc, Info.ParsedOperands); Info.ParseError = HadError; // Dump the parsed representation, if requested. @@ -1429,22 +1490,22 @@ bool AsmParser::ParseStatement(ParseStatementInfo &Info) { if (!HadError && getContext().getGenDwarfForAssembly() && getContext().getGenDwarfSection() == getStreamer().getCurrentSection()) { - unsigned Line = SrcMgr.FindLineNumber(IDLoc, CurBuffer); + unsigned Line = SrcMgr.FindLineNumber(IDLoc, CurBuffer); - // If we previously parsed a cpp hash file line comment then make sure the - // current Dwarf File is for the CppHashFilename if not then emit the - // Dwarf File table for it and adjust the line number for the .loc. 
- const std::vector<MCDwarfFile *> &MCDwarfFiles = - getContext().getMCDwarfFiles(); - if (CppHashFilename.size() != 0) { - if(MCDwarfFiles[getContext().getGenDwarfFileNumber()]->getName() != + // If we previously parsed a cpp hash file line comment then make sure the + // current Dwarf File is for the CppHashFilename if not then emit the + // Dwarf File table for it and adjust the line number for the .loc. + const std::vector<MCDwarfFile *> &MCDwarfFiles = + getContext().getMCDwarfFiles(); + if (CppHashFilename.size() != 0) { + if (MCDwarfFiles[getContext().getGenDwarfFileNumber()]->getName() != CppHashFilename) - getStreamer().EmitDwarfFileDirective( - getContext().nextGenDwarfFileNumber(), StringRef(), CppHashFilename); + getStreamer().EmitDwarfFileDirective( + getContext().nextGenDwarfFileNumber(), StringRef(), CppHashFilename); unsigned CppHashLocLineNo = SrcMgr.FindLineNumber(CppHashLoc,CppHashBuf); Line = CppHashLineNumber - 1 + (Line - CppHashLocLineNo); - } + } getStreamer().EmitDwarfLocDirective(getContext().getGenDwarfFileNumber(), Line, 0, DWARF2_LINE_DEFAULT_IS_STMT ? @@ -1570,12 +1631,13 @@ void AsmParser::DiagHandler(const SMDiagnostic &Diag, void *Context) { // we can't do that. AsmLexer.cpp should probably be changed to handle // '@' as a special case when needed. static bool isIdentifierChar(char c) { - return isalnum(c) || c == '_' || c == '$' || c == '.'; + return isalnum(static_cast<unsigned char>(c)) || c == '_' || c == '$' || + c == '.'; } bool AsmParser::expandMacro(raw_svector_ostream &OS, StringRef Body, - const MacroParameters &Parameters, - const MacroArguments &A, + const MCAsmMacroParameters &Parameters, + const MCAsmMacroArguments &A, const SMLoc &L) { unsigned NParameters = Parameters.size(); if (NParameters != 0 && NParameters != A.size()) @@ -1594,7 +1656,8 @@ bool AsmParser::expandMacro(raw_svector_ostream &OS, StringRef Body, continue; char Next = Body[Pos + 1]; - if (Next == '$' || Next == 'n' || isdigit(Next)) + if (Next == '$' || Next == 'n' || + isdigit(static_cast<unsigned char>(Next))) break; } else { // This macro has parameters, look for \foo, \bar, etc. @@ -1630,7 +1693,7 @@ bool AsmParser::expandMacro(raw_svector_ostream &OS, StringRef Body, break; // Otherwise substitute with the token values, with spaces eliminated. - for (MacroArgument::const_iterator it = A[Index].begin(), + for (MCAsmMacroArgument::const_iterator it = A[Index].begin(), ie = A[Index].end(); it != ie; ++it) OS << it->getString(); break; @@ -1657,7 +1720,7 @@ bool AsmParser::expandMacro(raw_svector_ostream &OS, StringRef Body, Pos = I; } } else { - for (MacroArgument::const_iterator it = A[Index].begin(), + for (MCAsmMacroArgument::const_iterator it = A[Index].begin(), ie = A[Index].end(); it != ie; ++it) if (it->getKind() == AsmToken::String) OS << it->getStringContents(); @@ -1674,7 +1737,7 @@ bool AsmParser::expandMacro(raw_svector_ostream &OS, StringRef Body, return false; } -MacroInstantiation::MacroInstantiation(const Macro *M, SMLoc IL, +MacroInstantiation::MacroInstantiation(const MCAsmMacro *M, SMLoc IL, int EB, SMLoc EL, MemoryBuffer *I) : TheMacro(M), Instantiation(I), InstantiationLoc(IL), ExitBuffer(EB), @@ -1715,10 +1778,7 @@ static bool IsOperator(AsmToken::TokenKind kind) } } -/// ParseMacroArgument - Extract AsmTokens for a macro argument. 
-/// This is used for both default macro parameter values and the -/// arguments in macro invocations -bool AsmParser::ParseMacroArgument(MacroArgument &MA, +bool AsmParser::ParseMacroArgument(MCAsmMacroArgument &MA, AsmToken::TokenKind &ArgumentDelimiter) { unsigned ParenLevel = 0; unsigned AddTokens = 0; @@ -1794,7 +1854,7 @@ bool AsmParser::ParseMacroArgument(MacroArgument &MA, } // Parse the macro instantiation arguments. -bool AsmParser::ParseMacroArguments(const Macro *M, MacroArguments &A) { +bool AsmParser::ParseMacroArguments(const MCAsmMacro *M, MCAsmMacroArguments &A) { const unsigned NParameters = M ? M->Parameters.size() : 0; // Argument delimiter is initially unknown. It will be set by // ParseMacroArgument() @@ -1805,7 +1865,7 @@ bool AsmParser::ParseMacroArguments(const Macro *M, MacroArguments &A) { // - macros defined with parameters accept at most that many of them for (unsigned Parameter = 0; !NParameters || Parameter < NParameters; ++Parameter) { - MacroArgument MA; + MCAsmMacroArgument MA; if (ParseMacroArgument(MA, ArgumentDelimiter)) return true; @@ -1838,14 +1898,30 @@ bool AsmParser::ParseMacroArguments(const Macro *M, MacroArguments &A) { return TokError("Too many arguments"); } -bool AsmParser::HandleMacroEntry(StringRef Name, SMLoc NameLoc, - const Macro *M) { +const MCAsmMacro* AsmParser::LookupMacro(StringRef Name) { + StringMap<MCAsmMacro*>::iterator I = MacroMap.find(Name); + return (I == MacroMap.end()) ? NULL : I->getValue(); +} + +void AsmParser::DefineMacro(StringRef Name, const MCAsmMacro& Macro) { + MacroMap[Name] = new MCAsmMacro(Macro); +} + +void AsmParser::UndefineMacro(StringRef Name) { + StringMap<MCAsmMacro*>::iterator I = MacroMap.find(Name); + if (I != MacroMap.end()) { + delete I->getValue(); + MacroMap.erase(I); + } +} + +bool AsmParser::HandleMacroEntry(const MCAsmMacro *M, SMLoc NameLoc) { // Arbitrarily limit macro nesting depth, to match 'as'. We can eliminate // this, although we should protect against infinite loops. if (ActiveMacros.size() == 20) return TokError("macros cannot be nested more than 20 levels deep"); - MacroArguments A; + MCAsmMacroArguments A; if (ParseMacroArguments(M, A)) return true; @@ -1864,7 +1940,7 @@ bool AsmParser::HandleMacroEntry(StringRef Name, SMLoc NameLoc, if (expandMacro(OS, Body, M->Parameters, A, getTok().getLoc())) return true; - // We include the .endmacro in the buffer as our queue to exit the macro + // We include the .endmacro in the buffer as our cue to exit the macro // instantiation. OS << ".endmacro\n"; @@ -1926,7 +2002,7 @@ bool AsmParser::ParseAssignment(StringRef Name, bool allow_redef, SMLoc EqualLoc = Lexer.getLoc(); const MCExpr *Value; - if (ParseExpression(Value)) + if (parseExpression(Value)) return true; // Note: we don't count b as used in "a = b". This is to allow @@ -1983,10 +2059,10 @@ bool AsmParser::ParseAssignment(StringRef Name, bool allow_redef, return false; } -/// ParseIdentifier: +/// parseIdentifier: /// ::= identifier /// ::= string -bool AsmParser::ParseIdentifier(StringRef &Res) { +bool AsmParser::parseIdentifier(StringRef &Res) { // The assembler has relaxed rules for accepting identifiers, in particular we // allow things like '.globl $foo', which would normally be separate // tokens. 
At this level, we have already lexed so we cannot (currently) @@ -2029,7 +2105,7 @@ bool AsmParser::ParseIdentifier(StringRef &Res) { bool AsmParser::ParseDirectiveSet(StringRef IDVal, bool allow_redef) { StringRef Name; - if (ParseIdentifier(Name)) + if (parseIdentifier(Name)) return TokError("expected identifier after '" + Twine(IDVal) + "'"); if (getLexer().isNot(AsmToken::Comma)) @@ -2039,7 +2115,7 @@ bool AsmParser::ParseDirectiveSet(StringRef IDVal, bool allow_redef) { return ParseAssignment(Name, allow_redef, true); } -bool AsmParser::ParseEscapedString(std::string &Data) { +bool AsmParser::parseEscapedString(std::string &Data) { assert(getLexer().is(AsmToken::String) && "Unexpected current token!"); Data = ""; @@ -2101,14 +2177,14 @@ bool AsmParser::ParseEscapedString(std::string &Data) { /// ::= ( .ascii | .asciz | .string ) [ "string" ( , "string" )* ] bool AsmParser::ParseDirectiveAscii(StringRef IDVal, bool ZeroTerminated) { if (getLexer().isNot(AsmToken::EndOfStatement)) { - CheckForValidSection(); + checkForValidSection(); for (;;) { if (getLexer().isNot(AsmToken::String)) return TokError("expected string in '" + Twine(IDVal) + "' directive"); std::string Data; - if (ParseEscapedString(Data)) + if (parseEscapedString(Data)) return true; getStreamer().EmitBytes(Data, DEFAULT_ADDRSPACE); @@ -2134,12 +2210,12 @@ bool AsmParser::ParseDirectiveAscii(StringRef IDVal, bool ZeroTerminated) { /// ::= (.byte | .short | ... ) [ expression (, expression)* ] bool AsmParser::ParseDirectiveValue(unsigned Size) { if (getLexer().isNot(AsmToken::EndOfStatement)) { - CheckForValidSection(); + checkForValidSection(); for (;;) { const MCExpr *Value; SMLoc ExprLoc = getLexer().getLoc(); - if (ParseExpression(Value)) + if (parseExpression(Value)) return true; // Special case constant expressions to match code generator. @@ -2170,7 +2246,7 @@ bool AsmParser::ParseDirectiveValue(unsigned Size) { /// ::= (.single | .double) [ expression (, expression)* ] bool AsmParser::ParseDirectiveRealValue(const fltSemantics &Semantics) { if (getLexer().isNot(AsmToken::EndOfStatement)) { - CheckForValidSection(); + checkForValidSection(); for (;;) { // We don't truly support arithmetic on floating point expressions, so we @@ -2224,52 +2300,19 @@ bool AsmParser::ParseDirectiveRealValue(const fltSemantics &Semantics) { return false; } -/// ParseDirectiveSpace -/// ::= .space expression [ , expression ] -bool AsmParser::ParseDirectiveSpace() { - CheckForValidSection(); - - int64_t NumBytes; - if (ParseAbsoluteExpression(NumBytes)) - return true; - - int64_t FillExpr = 0; - if (getLexer().isNot(AsmToken::EndOfStatement)) { - if (getLexer().isNot(AsmToken::Comma)) - return TokError("unexpected token in '.space' directive"); - Lex(); - - if (ParseAbsoluteExpression(FillExpr)) - return true; - - if (getLexer().isNot(AsmToken::EndOfStatement)) - return TokError("unexpected token in '.space' directive"); - } - - Lex(); - - if (NumBytes <= 0) - return TokError("invalid number of bytes in '.space' directive"); - - // FIXME: Sometimes the fill expr is 'nop' if it isn't supplied, instead of 0. 
- getStreamer().EmitFill(NumBytes, FillExpr, DEFAULT_ADDRSPACE); - - return false; -} - /// ParseDirectiveZero /// ::= .zero expression bool AsmParser::ParseDirectiveZero() { - CheckForValidSection(); + checkForValidSection(); int64_t NumBytes; - if (ParseAbsoluteExpression(NumBytes)) + if (parseAbsoluteExpression(NumBytes)) return true; int64_t Val = 0; if (getLexer().is(AsmToken::Comma)) { Lex(); - if (ParseAbsoluteExpression(Val)) + if (parseAbsoluteExpression(Val)) return true; } @@ -2286,10 +2329,10 @@ bool AsmParser::ParseDirectiveZero() { /// ParseDirectiveFill /// ::= .fill expression , expression , expression bool AsmParser::ParseDirectiveFill() { - CheckForValidSection(); + checkForValidSection(); int64_t NumValues; - if (ParseAbsoluteExpression(NumValues)) + if (parseAbsoluteExpression(NumValues)) return true; if (getLexer().isNot(AsmToken::Comma)) @@ -2297,7 +2340,7 @@ bool AsmParser::ParseDirectiveFill() { Lex(); int64_t FillSize; - if (ParseAbsoluteExpression(FillSize)) + if (parseAbsoluteExpression(FillSize)) return true; if (getLexer().isNot(AsmToken::Comma)) @@ -2305,7 +2348,7 @@ bool AsmParser::ParseDirectiveFill() { Lex(); int64_t FillExpr; - if (ParseAbsoluteExpression(FillExpr)) + if (parseAbsoluteExpression(FillExpr)) return true; if (getLexer().isNot(AsmToken::EndOfStatement)) @@ -2325,11 +2368,11 @@ bool AsmParser::ParseDirectiveFill() { /// ParseDirectiveOrg /// ::= .org expression [ , expression ] bool AsmParser::ParseDirectiveOrg() { - CheckForValidSection(); + checkForValidSection(); const MCExpr *Offset; SMLoc Loc = getTok().getLoc(); - if (ParseExpression(Offset)) + if (parseExpression(Offset)) return true; // Parse optional fill expression. @@ -2339,7 +2382,7 @@ bool AsmParser::ParseDirectiveOrg() { return TokError("unexpected token in '.org' directive"); Lex(); - if (ParseAbsoluteExpression(FillExpr)) + if (parseAbsoluteExpression(FillExpr)) return true; if (getLexer().isNot(AsmToken::EndOfStatement)) @@ -2360,11 +2403,11 @@ bool AsmParser::ParseDirectiveOrg() { /// ParseDirectiveAlign /// ::= {.align, ...} expression [ , expression [ , expression ]] bool AsmParser::ParseDirectiveAlign(bool IsPow2, unsigned ValueSize) { - CheckForValidSection(); + checkForValidSection(); SMLoc AlignmentLoc = getLexer().getLoc(); int64_t Alignment; - if (ParseAbsoluteExpression(Alignment)) + if (parseAbsoluteExpression(Alignment)) return true; SMLoc MaxBytesLoc; @@ -2381,7 +2424,7 @@ bool AsmParser::ParseDirectiveAlign(bool IsPow2, unsigned ValueSize) { // .align 3,,4 if (getLexer().isNot(AsmToken::Comma)) { HasFillExpr = true; - if (ParseAbsoluteExpression(FillExpr)) + if (parseAbsoluteExpression(FillExpr)) return true; } @@ -2391,7 +2434,7 @@ bool AsmParser::ParseDirectiveAlign(bool IsPow2, unsigned ValueSize) { Lex(); MaxBytesLoc = getLexer().getLoc(); - if (ParseAbsoluteExpression(MaxBytesToFill)) + if (parseAbsoluteExpression(MaxBytesToFill)) return true; if (getLexer().isNot(AsmToken::EndOfStatement)) @@ -2413,6 +2456,10 @@ bool AsmParser::ParseDirectiveAlign(bool IsPow2, unsigned ValueSize) { } Alignment = 1ULL << Alignment; + } else { + // Reject alignments that aren't a power of two, for gas compatibility. + if (!isPowerOf2_64(Alignment)) + Error(AlignmentLoc, "alignment must be a power of 2"); } // Diagnose non-sensical max bytes to align. 
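The ParseStatement() rewrite above replaces two long chains of string comparisons with one lookup in DirectiveKindMap (a StringMap filled once by initializeDirectiveKindMap()) followed by a switch over the DirectiveKind enum. A reduced sketch of that dispatch shape, with a hypothetical three-entry table and std::map standing in for llvm::StringMap:

#include <cstdio>
#include <map>
#include <string>

enum DirectiveKind { DK_NO_DIRECTIVE, DK_BYTE, DK_LONG, DK_QUAD };

static const std::map<std::string, DirectiveKind> DirectiveKinds = {
    {".byte", DK_BYTE}, {".long", DK_LONG}, {".quad", DK_QUAD}};

// Mirrors the parser's convention: returning false means "no error".
static bool parseDirectiveValue(unsigned Size) {
  std::printf("emit %u-byte value(s)\n", Size);
  return false;
}

static bool parseDirective(const std::string &IDVal) {
  auto It = DirectiveKinds.find(IDVal);
  DirectiveKind Kind = (It == DirectiveKinds.end()) ? DK_NO_DIRECTIVE
                                                    : It->second;
  switch (Kind) {
  default:
    break;
  case DK_BYTE:
    return parseDirectiveValue(1);
  case DK_LONG:
    return parseDirectiveValue(4);
  case DK_QUAD:
    return parseDirectiveValue(8);
  }
  std::fprintf(stderr, "unknown directive %s\n", IDVal.c_str());
  return true;
}

int main() {
  parseDirective(".long");  // emit 4-byte value(s)
  parseDirective(".bogus"); // unknown directive
}

Beyond trading dozens of comparisons for one hash lookup, the enum makes directive aliases explicit as stacked case labels (DK_GLOBL/DK_GLOBAL, DK_COMM/DK_COMMON above).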
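The .align hunk directly above also starts rejecting byte alignments that are not powers of two, using isPowerOf2_64() from llvm/Support/MathExtras.h, for gas compatibility. That predicate is the usual bit trick; a self-contained equivalent:

#include <cassert>
#include <cstdint>

// A power of two has exactly one bit set, so clearing the lowest set bit
// (V & (V - 1)) must leave zero; the V != 0 test excludes zero itself.
static bool isPowerOf2(uint64_t V) { return V != 0 && (V & (V - 1)) == 0; }

int main() {
  assert(isPowerOf2(1) && isPowerOf2(8) && isPowerOf2(1ULL << 63));
  assert(!isPowerOf2(0) && !isPowerOf2(3) && !isPowerOf2(12));
  return 0;
}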
@@ -2445,437 +2492,10 @@ bool AsmParser::ParseDirectiveAlign(bool IsPow2, unsigned ValueSize) { return false; } - -/// ParseDirectiveBundleAlignMode -/// ::= {.bundle_align_mode} expression -bool AsmParser::ParseDirectiveBundleAlignMode() { - CheckForValidSection(); - - // Expect a single argument: an expression that evaluates to a constant - // in the inclusive range 0-30. - SMLoc ExprLoc = getLexer().getLoc(); - int64_t AlignSizePow2; - if (ParseAbsoluteExpression(AlignSizePow2)) - return true; - else if (getLexer().isNot(AsmToken::EndOfStatement)) - return TokError("unexpected token after expression in" - " '.bundle_align_mode' directive"); - else if (AlignSizePow2 < 0 || AlignSizePow2 > 30) - return Error(ExprLoc, - "invalid bundle alignment size (expected between 0 and 30)"); - - Lex(); - - // Because of AlignSizePow2's verified range we can safely truncate it to - // unsigned. - getStreamer().EmitBundleAlignMode(static_cast<unsigned>(AlignSizePow2)); - return false; -} - -/// ParseDirectiveBundleLock -/// ::= {.bundle_lock} [align_to_end] -bool AsmParser::ParseDirectiveBundleLock() { - CheckForValidSection(); - bool AlignToEnd = false; - - if (getLexer().isNot(AsmToken::EndOfStatement)) { - StringRef Option; - SMLoc Loc = getTok().getLoc(); - const char *kInvalidOptionError = - "invalid option for '.bundle_lock' directive"; - - if (ParseIdentifier(Option)) - return Error(Loc, kInvalidOptionError); - - if (Option != "align_to_end") - return Error(Loc, kInvalidOptionError); - else if (getLexer().isNot(AsmToken::EndOfStatement)) - return Error(Loc, - "unexpected token after '.bundle_lock' directive option"); - AlignToEnd = true; - } - - Lex(); - - getStreamer().EmitBundleLock(AlignToEnd); - return false; -} - -/// ParseDirectiveBundleLock -/// ::= {.bundle_lock} -bool AsmParser::ParseDirectiveBundleUnlock() { - CheckForValidSection(); - - if (getLexer().isNot(AsmToken::EndOfStatement)) - return TokError("unexpected token in '.bundle_unlock' directive"); - Lex(); - - getStreamer().EmitBundleUnlock(); - return false; -} - -/// ParseDirectiveSymbolAttribute -/// ::= { ".globl", ".weak", ... } [ identifier ( , identifier )* ] -bool AsmParser::ParseDirectiveSymbolAttribute(MCSymbolAttr Attr) { - if (getLexer().isNot(AsmToken::EndOfStatement)) { - for (;;) { - StringRef Name; - SMLoc Loc = getTok().getLoc(); - - if (ParseIdentifier(Name)) - return Error(Loc, "expected identifier in directive"); - - MCSymbol *Sym = getContext().GetOrCreateSymbol(Name); - - // Assembler local symbols don't make any sense here. Complain loudly. - if (Sym->isTemporary()) - return Error(Loc, "non-local symbol required in directive"); - - getStreamer().EmitSymbolAttribute(Sym, Attr); - - if (getLexer().is(AsmToken::EndOfStatement)) - break; - - if (getLexer().isNot(AsmToken::Comma)) - return TokError("unexpected token in directive"); - Lex(); - } - } - - Lex(); - return false; -} - -/// ParseDirectiveComm -/// ::= ( .comm | .lcomm ) identifier , size_expression [ , align_expression ] -bool AsmParser::ParseDirectiveComm(bool IsLocal) { - CheckForValidSection(); - - SMLoc IDLoc = getLexer().getLoc(); - StringRef Name; - if (ParseIdentifier(Name)) - return TokError("expected identifier in directive"); - - // Handle the identifier as the key symbol. 
- MCSymbol *Sym = getContext().GetOrCreateSymbol(Name); - - if (getLexer().isNot(AsmToken::Comma)) - return TokError("unexpected token in directive"); - Lex(); - - int64_t Size; - SMLoc SizeLoc = getLexer().getLoc(); - if (ParseAbsoluteExpression(Size)) - return true; - - int64_t Pow2Alignment = 0; - SMLoc Pow2AlignmentLoc; - if (getLexer().is(AsmToken::Comma)) { - Lex(); - Pow2AlignmentLoc = getLexer().getLoc(); - if (ParseAbsoluteExpression(Pow2Alignment)) - return true; - - LCOMM::LCOMMType LCOMM = Lexer.getMAI().getLCOMMDirectiveAlignmentType(); - if (IsLocal && LCOMM == LCOMM::NoAlignment) - return Error(Pow2AlignmentLoc, "alignment not supported on this target"); - - // If this target takes alignments in bytes (not log) validate and convert. - if ((!IsLocal && Lexer.getMAI().getCOMMDirectiveAlignmentIsInBytes()) || - (IsLocal && LCOMM == LCOMM::ByteAlignment)) { - if (!isPowerOf2_64(Pow2Alignment)) - return Error(Pow2AlignmentLoc, "alignment must be a power of 2"); - Pow2Alignment = Log2_64(Pow2Alignment); - } - } - - if (getLexer().isNot(AsmToken::EndOfStatement)) - return TokError("unexpected token in '.comm' or '.lcomm' directive"); - - Lex(); - - // NOTE: a size of zero for a .comm should create a undefined symbol - // but a size of .lcomm creates a bss symbol of size zero. - if (Size < 0) - return Error(SizeLoc, "invalid '.comm' or '.lcomm' directive size, can't " - "be less than zero"); - - // NOTE: The alignment in the directive is a power of 2 value, the assembler - // may internally end up wanting an alignment in bytes. - // FIXME: Diagnose overflow. - if (Pow2Alignment < 0) - return Error(Pow2AlignmentLoc, "invalid '.comm' or '.lcomm' directive " - "alignment, can't be less than zero"); - - if (!Sym->isUndefined()) - return Error(IDLoc, "invalid symbol redefinition"); - - // Create the Symbol as a common or local common with Size and Pow2Alignment - if (IsLocal) { - getStreamer().EmitLocalCommonSymbol(Sym, Size, 1 << Pow2Alignment); - return false; - } - - getStreamer().EmitCommonSymbol(Sym, Size, 1 << Pow2Alignment); - return false; -} - -/// ParseDirectiveAbort -/// ::= .abort [... message ...] -bool AsmParser::ParseDirectiveAbort() { - // FIXME: Use loc from directive. - SMLoc Loc = getLexer().getLoc(); - - StringRef Str = ParseStringToEndOfStatement(); - if (getLexer().isNot(AsmToken::EndOfStatement)) - return TokError("unexpected token in '.abort' directive"); - - Lex(); - - if (Str.empty()) - Error(Loc, ".abort detected. Assembly stopping."); - else - Error(Loc, ".abort '" + Str + "' detected. Assembly stopping."); - // FIXME: Actually abort assembly here. - - return false; -} - -/// ParseDirectiveInclude -/// ::= .include "filename" -bool AsmParser::ParseDirectiveInclude() { - if (getLexer().isNot(AsmToken::String)) - return TokError("expected string in '.include' directive"); - - std::string Filename = getTok().getString(); - SMLoc IncludeLoc = getLexer().getLoc(); - Lex(); - - if (getLexer().isNot(AsmToken::EndOfStatement)) - return TokError("unexpected token in '.include' directive"); - - // Strip the quotes. - Filename = Filename.substr(1, Filename.size()-2); - - // Attempt to switch the lexer to the included file before consuming the end - // of statement to avoid losing it when we switch. 
- if (EnterIncludeFile(Filename)) { - Error(IncludeLoc, "Could not find include file '" + Filename + "'"); - return true; - } - - return false; -} - -/// ParseDirectiveIncbin -/// ::= .incbin "filename" -bool AsmParser::ParseDirectiveIncbin() { - if (getLexer().isNot(AsmToken::String)) - return TokError("expected string in '.incbin' directive"); - - std::string Filename = getTok().getString(); - SMLoc IncbinLoc = getLexer().getLoc(); - Lex(); - - if (getLexer().isNot(AsmToken::EndOfStatement)) - return TokError("unexpected token in '.incbin' directive"); - - // Strip the quotes. - Filename = Filename.substr(1, Filename.size()-2); - - // Attempt to process the included file. - if (ProcessIncbinFile(Filename)) { - Error(IncbinLoc, "Could not find incbin file '" + Filename + "'"); - return true; - } - - return false; -} - -/// ParseDirectiveIf -/// ::= .if expression -bool AsmParser::ParseDirectiveIf(SMLoc DirectiveLoc) { - TheCondStack.push_back(TheCondState); - TheCondState.TheCond = AsmCond::IfCond; - if (TheCondState.Ignore) { - EatToEndOfStatement(); - } else { - int64_t ExprValue; - if (ParseAbsoluteExpression(ExprValue)) - return true; - - if (getLexer().isNot(AsmToken::EndOfStatement)) - return TokError("unexpected token in '.if' directive"); - - Lex(); - - TheCondState.CondMet = ExprValue; - TheCondState.Ignore = !TheCondState.CondMet; - } - - return false; -} - -/// ParseDirectiveIfb -/// ::= .ifb string -bool AsmParser::ParseDirectiveIfb(SMLoc DirectiveLoc, bool ExpectBlank) { - TheCondStack.push_back(TheCondState); - TheCondState.TheCond = AsmCond::IfCond; - - if (TheCondState.Ignore) { - EatToEndOfStatement(); - } else { - StringRef Str = ParseStringToEndOfStatement(); - - if (getLexer().isNot(AsmToken::EndOfStatement)) - return TokError("unexpected token in '.ifb' directive"); - - Lex(); - - TheCondState.CondMet = ExpectBlank == Str.empty(); - TheCondState.Ignore = !TheCondState.CondMet; - } - - return false; -} - -/// ParseDirectiveIfc -/// ::= .ifc string1, string2 -bool AsmParser::ParseDirectiveIfc(SMLoc DirectiveLoc, bool ExpectEqual) { - TheCondStack.push_back(TheCondState); - TheCondState.TheCond = AsmCond::IfCond; - - if (TheCondState.Ignore) { - EatToEndOfStatement(); - } else { - StringRef Str1 = ParseStringToComma(); - - if (getLexer().isNot(AsmToken::Comma)) - return TokError("unexpected token in '.ifc' directive"); - - Lex(); - - StringRef Str2 = ParseStringToEndOfStatement(); - - if (getLexer().isNot(AsmToken::EndOfStatement)) - return TokError("unexpected token in '.ifc' directive"); - - Lex(); - - TheCondState.CondMet = ExpectEqual == (Str1 == Str2); - TheCondState.Ignore = !TheCondState.CondMet; - } - - return false; -} - -/// ParseDirectiveIfdef -/// ::= .ifdef symbol -bool AsmParser::ParseDirectiveIfdef(SMLoc DirectiveLoc, bool expect_defined) { - StringRef Name; - TheCondStack.push_back(TheCondState); - TheCondState.TheCond = AsmCond::IfCond; - - if (TheCondState.Ignore) { - EatToEndOfStatement(); - } else { - if (ParseIdentifier(Name)) - return TokError("expected identifier after '.ifdef'"); - - Lex(); - - MCSymbol *Sym = getContext().LookupSymbol(Name); - - if (expect_defined) - TheCondState.CondMet = (Sym != NULL && !Sym->isUndefined()); - else - TheCondState.CondMet = (Sym == NULL || Sym->isUndefined()); - TheCondState.Ignore = !TheCondState.CondMet; - } - - return false; -} - -/// ParseDirectiveElseIf -/// ::= .elseif expression -bool AsmParser::ParseDirectiveElseIf(SMLoc DirectiveLoc) { - if (TheCondState.TheCond != AsmCond::IfCond && - 
TheCondState.TheCond != AsmCond::ElseIfCond) - Error(DirectiveLoc, "Encountered a .elseif that doesn't follow a .if or " - " an .elseif"); - TheCondState.TheCond = AsmCond::ElseIfCond; - - bool LastIgnoreState = false; - if (!TheCondStack.empty()) - LastIgnoreState = TheCondStack.back().Ignore; - if (LastIgnoreState || TheCondState.CondMet) { - TheCondState.Ignore = true; - EatToEndOfStatement(); - } - else { - int64_t ExprValue; - if (ParseAbsoluteExpression(ExprValue)) - return true; - - if (getLexer().isNot(AsmToken::EndOfStatement)) - return TokError("unexpected token in '.elseif' directive"); - - Lex(); - TheCondState.CondMet = ExprValue; - TheCondState.Ignore = !TheCondState.CondMet; - } - - return false; -} - -/// ParseDirectiveElse -/// ::= .else -bool AsmParser::ParseDirectiveElse(SMLoc DirectiveLoc) { - if (getLexer().isNot(AsmToken::EndOfStatement)) - return TokError("unexpected token in '.else' directive"); - - Lex(); - - if (TheCondState.TheCond != AsmCond::IfCond && - TheCondState.TheCond != AsmCond::ElseIfCond) - Error(DirectiveLoc, "Encountered a .else that doesn't follow a .if or an " - ".elseif"); - TheCondState.TheCond = AsmCond::ElseCond; - bool LastIgnoreState = false; - if (!TheCondStack.empty()) - LastIgnoreState = TheCondStack.back().Ignore; - if (LastIgnoreState || TheCondState.CondMet) - TheCondState.Ignore = true; - else - TheCondState.Ignore = false; - - return false; -} - -/// ParseDirectiveEndIf -/// ::= .endif -bool AsmParser::ParseDirectiveEndIf(SMLoc DirectiveLoc) { - if (getLexer().isNot(AsmToken::EndOfStatement)) - return TokError("unexpected token in '.endif' directive"); - - Lex(); - - if ((TheCondState.TheCond == AsmCond::NoCond) || - TheCondStack.empty()) - Error(DirectiveLoc, "Encountered a .endif that doesn't follow a .if or " - ".else"); - if (!TheCondStack.empty()) { - TheCondState = TheCondStack.back(); - TheCondStack.pop_back(); - } - - return false; -} - /// ParseDirectiveFile /// ::= .file [number] filename /// ::= .file number directory filename -bool GenericAsmParser::ParseDirectiveFile(StringRef, SMLoc DirectiveLoc) { +bool AsmParser::ParseDirectiveFile(SMLoc DirectiveLoc) { // FIXME: I'm not sure what this is. int64_t FileNumber = -1; SMLoc FileNumberLoc = getLexer().getLoc(); @@ -2927,7 +2547,7 @@ bool GenericAsmParser::ParseDirectiveFile(StringRef, SMLoc DirectiveLoc) { /// ParseDirectiveLine /// ::= .line [number] -bool GenericAsmParser::ParseDirectiveLine(StringRef, SMLoc DirectiveLoc) { +bool AsmParser::ParseDirectiveLine() { if (getLexer().isNot(AsmToken::EndOfStatement)) { if (getLexer().isNot(AsmToken::Integer)) return TokError("unexpected token in '.line' directive"); @@ -2945,7 +2565,6 @@ bool GenericAsmParser::ParseDirectiveLine(StringRef, SMLoc DirectiveLoc) { return false; } - /// ParseDirectiveLoc /// ::= .loc FileNumber [LineNumber] [ColumnPos] [basic_block] [prologue_end] /// [epilogue_begin] [is_stmt VALUE] [isa VALUE] @@ -2953,8 +2572,7 @@ bool GenericAsmParser::ParseDirectiveLine(StringRef, SMLoc DirectiveLoc) { /// a .file directive, the second number is the line number and optionally the /// third number is a column position (zero if not specified). The remaining /// optional items are .loc sub-directives. 
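/// For illustration (a sketch, GNU as syntax assumed), a directive matching
/// this grammar might be:
///   .loc 1 42 7 prologue_end is_stmt 1
/// i.e. file number 1, line 42, column 7, followed by two of the optional
/// sub-directives.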
-bool GenericAsmParser::ParseDirectiveLoc(StringRef, SMLoc DirectiveLoc) { - +bool AsmParser::ParseDirectiveLoc() { if (getLexer().isNot(AsmToken::Integer)) return TokError("unexpected token in '.loc' directive"); int64_t FileNumber = getTok().getIntVal(); @@ -2990,7 +2608,7 @@ bool GenericAsmParser::ParseDirectiveLoc(StringRef, SMLoc DirectiveLoc) { StringRef Name; SMLoc Loc = getTok().getLoc(); - if (getParser().ParseIdentifier(Name)) + if (parseIdentifier(Name)) return TokError("unexpected token in '.loc' directive"); if (Name == "basic_block") @@ -3002,7 +2620,7 @@ bool GenericAsmParser::ParseDirectiveLoc(StringRef, SMLoc DirectiveLoc) { else if (Name == "is_stmt") { Loc = getTok().getLoc(); const MCExpr *Value; - if (getParser().ParseExpression(Value)) + if (parseExpression(Value)) return true; // The expression must be the constant 0 or 1. if (const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(Value)) { @@ -3021,7 +2639,7 @@ bool GenericAsmParser::ParseDirectiveLoc(StringRef, SMLoc DirectiveLoc) { else if (Name == "isa") { Loc = getTok().getLoc(); const MCExpr *Value; - if (getParser().ParseExpression(Value)) + if (parseExpression(Value)) return true; // The expression must be a constant greater or equal to 0. if (const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(Value)) { @@ -3035,7 +2653,7 @@ bool GenericAsmParser::ParseDirectiveLoc(StringRef, SMLoc DirectiveLoc) { } } else if (Name == "discriminator") { - if (getParser().ParseAbsoluteExpression(Discriminator)) + if (parseAbsoluteExpression(Discriminator)) return true; } else { @@ -3055,20 +2673,18 @@ bool GenericAsmParser::ParseDirectiveLoc(StringRef, SMLoc DirectiveLoc) { /// ParseDirectiveStabs /// ::= .stabs string, number, number, number -bool GenericAsmParser::ParseDirectiveStabs(StringRef Directive, - SMLoc DirectiveLoc) { - return TokError("unsupported directive '" + Directive + "'"); +bool AsmParser::ParseDirectiveStabs() { + return TokError("unsupported directive '.stabs'"); } /// ParseDirectiveCFISections /// ::= .cfi_sections section [, section] -bool GenericAsmParser::ParseDirectiveCFISections(StringRef, - SMLoc DirectiveLoc) { +bool AsmParser::ParseDirectiveCFISections() { StringRef Name; bool EH = false; bool Debug = false; - if (getParser().ParseIdentifier(Name)) + if (parseIdentifier(Name)) return TokError("Expected an identifier"); if (Name == ".eh_frame") @@ -3079,7 +2695,7 @@ bool GenericAsmParser::ParseDirectiveCFISections(StringRef, if (getLexer().is(AsmToken::Comma)) { Lex(); - if (getParser().ParseIdentifier(Name)) + if (parseIdentifier(Name)) return TokError("Expected an identifier"); if (Name == ".eh_frame") @@ -3089,45 +2705,41 @@ bool GenericAsmParser::ParseDirectiveCFISections(StringRef, } getStreamer().EmitCFISections(EH, Debug); - return false; } /// ParseDirectiveCFIStartProc /// ::= .cfi_startproc -bool GenericAsmParser::ParseDirectiveCFIStartProc(StringRef, - SMLoc DirectiveLoc) { +bool AsmParser::ParseDirectiveCFIStartProc() { getStreamer().EmitCFIStartProc(); return false; } /// ParseDirectiveCFIEndProc /// ::= .cfi_endproc -bool GenericAsmParser::ParseDirectiveCFIEndProc(StringRef, SMLoc DirectiveLoc) { +bool AsmParser::ParseDirectiveCFIEndProc() { getStreamer().EmitCFIEndProc(); return false; } /// ParseRegisterOrRegisterNumber - parse register name or number. 
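/// For illustration (assuming an x86-64 target, where %rbp has DWARF register
/// number 6), both spellings below would reach this helper and mean the same
/// thing:
///   .cfi_offset %rbp, -16
///   .cfi_offset 6, -16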
-bool GenericAsmParser::ParseRegisterOrRegisterNumber(int64_t &Register, - SMLoc DirectiveLoc) { +bool AsmParser::ParseRegisterOrRegisterNumber(int64_t &Register, + SMLoc DirectiveLoc) { unsigned RegNo; if (getLexer().isNot(AsmToken::Integer)) { - if (getParser().getTargetParser().ParseRegister(RegNo, DirectiveLoc, - DirectiveLoc)) + if (getTargetParser().ParseRegister(RegNo, DirectiveLoc, DirectiveLoc)) return true; Register = getContext().getRegisterInfo().getDwarfRegNum(RegNo, true); } else - return getParser().ParseAbsoluteExpression(Register); + return parseAbsoluteExpression(Register); return false; } /// ParseDirectiveCFIDefCfa /// ::= .cfi_def_cfa register, offset -bool GenericAsmParser::ParseDirectiveCFIDefCfa(StringRef, - SMLoc DirectiveLoc) { +bool AsmParser::ParseDirectiveCFIDefCfa(SMLoc DirectiveLoc) { int64_t Register = 0; if (ParseRegisterOrRegisterNumber(Register, DirectiveLoc)) return true; @@ -3137,7 +2749,7 @@ bool GenericAsmParser::ParseDirectiveCFIDefCfa(StringRef, Lex(); int64_t Offset = 0; - if (getParser().ParseAbsoluteExpression(Offset)) + if (parseAbsoluteExpression(Offset)) return true; getStreamer().EmitCFIDefCfa(Register, Offset); @@ -3146,22 +2758,39 @@ bool GenericAsmParser::ParseDirectiveCFIDefCfa(StringRef, /// ParseDirectiveCFIDefCfaOffset /// ::= .cfi_def_cfa_offset offset -bool GenericAsmParser::ParseDirectiveCFIDefCfaOffset(StringRef, - SMLoc DirectiveLoc) { +bool AsmParser::ParseDirectiveCFIDefCfaOffset() { int64_t Offset = 0; - if (getParser().ParseAbsoluteExpression(Offset)) + if (parseAbsoluteExpression(Offset)) return true; getStreamer().EmitCFIDefCfaOffset(Offset); return false; } +/// ParseDirectiveCFIRegister +/// ::= .cfi_register register, register +bool AsmParser::ParseDirectiveCFIRegister(SMLoc DirectiveLoc) { + int64_t Register1 = 0; + if (ParseRegisterOrRegisterNumber(Register1, DirectiveLoc)) + return true; + + if (getLexer().isNot(AsmToken::Comma)) + return TokError("unexpected token in directive"); + Lex(); + + int64_t Register2 = 0; + if (ParseRegisterOrRegisterNumber(Register2, DirectiveLoc)) + return true; + + getStreamer().EmitCFIRegister(Register1, Register2); + return false; +} + /// ParseDirectiveCFIAdjustCfaOffset /// ::= .cfi_adjust_cfa_offset adjustment -bool GenericAsmParser::ParseDirectiveCFIAdjustCfaOffset(StringRef, - SMLoc DirectiveLoc) { +bool AsmParser::ParseDirectiveCFIAdjustCfaOffset() { int64_t Adjustment = 0; - if (getParser().ParseAbsoluteExpression(Adjustment)) + if (parseAbsoluteExpression(Adjustment)) return true; getStreamer().EmitCFIAdjustCfaOffset(Adjustment); @@ -3170,8 +2799,7 @@ bool GenericAsmParser::ParseDirectiveCFIAdjustCfaOffset(StringRef, /// ParseDirectiveCFIDefCfaRegister /// ::= .cfi_def_cfa_register register -bool GenericAsmParser::ParseDirectiveCFIDefCfaRegister(StringRef, - SMLoc DirectiveLoc) { +bool AsmParser::ParseDirectiveCFIDefCfaRegister(SMLoc DirectiveLoc) { int64_t Register = 0; if (ParseRegisterOrRegisterNumber(Register, DirectiveLoc)) return true; @@ -3182,7 +2810,7 @@ bool GenericAsmParser::ParseDirectiveCFIDefCfaRegister(StringRef, /// ParseDirectiveCFIOffset /// ::= .cfi_offset register, offset -bool GenericAsmParser::ParseDirectiveCFIOffset(StringRef, SMLoc DirectiveLoc) { +bool AsmParser::ParseDirectiveCFIOffset(SMLoc DirectiveLoc) { int64_t Register = 0; int64_t Offset = 0; @@ -3193,7 +2821,7 @@ bool GenericAsmParser::ParseDirectiveCFIOffset(StringRef, SMLoc DirectiveLoc) { return TokError("unexpected token in directive"); Lex(); - if 
(getParser().ParseAbsoluteExpression(Offset)) + if (parseAbsoluteExpression(Offset)) return true; getStreamer().EmitCFIOffset(Register, Offset); @@ -3202,8 +2830,7 @@ bool GenericAsmParser::ParseDirectiveCFIOffset(StringRef, SMLoc DirectiveLoc) { /// ParseDirectiveCFIRelOffset /// ::= .cfi_rel_offset register, offset -bool GenericAsmParser::ParseDirectiveCFIRelOffset(StringRef, - SMLoc DirectiveLoc) { +bool AsmParser::ParseDirectiveCFIRelOffset(SMLoc DirectiveLoc) { int64_t Register = 0; if (ParseRegisterOrRegisterNumber(Register, DirectiveLoc)) @@ -3214,7 +2841,7 @@ bool GenericAsmParser::ParseDirectiveCFIRelOffset(StringRef, Lex(); int64_t Offset = 0; - if (getParser().ParseAbsoluteExpression(Offset)) + if (parseAbsoluteExpression(Offset)) return true; getStreamer().EmitCFIRelOffset(Register, Offset); @@ -3244,12 +2871,12 @@ static bool isValidEncoding(int64_t Encoding) { } /// ParseDirectiveCFIPersonalityOrLsda +/// IsPersonality true for cfi_personality, false for cfi_lsda /// ::= .cfi_personality encoding, [symbol_name] /// ::= .cfi_lsda encoding, [symbol_name] -bool GenericAsmParser::ParseDirectiveCFIPersonalityOrLsda(StringRef IDVal, - SMLoc DirectiveLoc) { +bool AsmParser::ParseDirectiveCFIPersonalityOrLsda(bool IsPersonality) { int64_t Encoding = 0; - if (getParser().ParseAbsoluteExpression(Encoding)) + if (parseAbsoluteExpression(Encoding)) return true; if (Encoding == dwarf::DW_EH_PE_omit) return false; @@ -3262,70 +2889,61 @@ bool GenericAsmParser::ParseDirectiveCFIPersonalityOrLsda(StringRef IDVal, Lex(); StringRef Name; - if (getParser().ParseIdentifier(Name)) + if (parseIdentifier(Name)) return TokError("expected identifier in directive"); MCSymbol *Sym = getContext().GetOrCreateSymbol(Name); - if (IDVal == ".cfi_personality") + if (IsPersonality) getStreamer().EmitCFIPersonality(Sym, Encoding); - else { - assert(IDVal == ".cfi_lsda"); + else getStreamer().EmitCFILsda(Sym, Encoding); - } return false; } /// ParseDirectiveCFIRememberState /// ::= .cfi_remember_state -bool GenericAsmParser::ParseDirectiveCFIRememberState(StringRef IDVal, - SMLoc DirectiveLoc) { +bool AsmParser::ParseDirectiveCFIRememberState() { getStreamer().EmitCFIRememberState(); return false; } /// ParseDirectiveCFIRestoreState /// ::= .cfi_remember_state -bool GenericAsmParser::ParseDirectiveCFIRestoreState(StringRef IDVal, - SMLoc DirectiveLoc) { +bool AsmParser::ParseDirectiveCFIRestoreState() { getStreamer().EmitCFIRestoreState(); return false; } /// ParseDirectiveCFISameValue /// ::= .cfi_same_value register -bool GenericAsmParser::ParseDirectiveCFISameValue(StringRef IDVal, - SMLoc DirectiveLoc) { +bool AsmParser::ParseDirectiveCFISameValue(SMLoc DirectiveLoc) { int64_t Register = 0; if (ParseRegisterOrRegisterNumber(Register, DirectiveLoc)) return true; getStreamer().EmitCFISameValue(Register); - return false; } /// ParseDirectiveCFIRestore /// ::= .cfi_restore register -bool GenericAsmParser::ParseDirectiveCFIRestore(StringRef IDVal, - SMLoc DirectiveLoc) { +bool AsmParser::ParseDirectiveCFIRestore(SMLoc DirectiveLoc) { int64_t Register = 0; if (ParseRegisterOrRegisterNumber(Register, DirectiveLoc)) return true; getStreamer().EmitCFIRestore(Register); - return false; } /// ParseDirectiveCFIEscape /// ::= .cfi_escape expression[,...] 
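/// For illustration (a sketch; byte values assumed): each comma-separated
/// expression is truncated to a byte and emitted verbatim into the frame
/// information, e.g. two raw DW_CFA_nop opcodes:
///   .cfi_escape 0x00, 0x00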
-bool GenericAsmParser::ParseDirectiveCFIEscape(StringRef IDVal, - SMLoc DirectiveLoc) { +bool AsmParser::ParseDirectiveCFIEscape() { std::string Values; int64_t CurrValue; - if (getParser().ParseAbsoluteExpression(CurrValue)) + if (parseAbsoluteExpression(CurrValue)) return true; Values.push_back((uint8_t)CurrValue); @@ -3333,7 +2951,7 @@ bool GenericAsmParser::ParseDirectiveCFIEscape(StringRef IDVal, while (getLexer().is(AsmToken::Comma)) { Lex(); - if (getParser().ParseAbsoluteExpression(CurrValue)) + if (parseAbsoluteExpression(CurrValue)) return true; Values.push_back((uint8_t)CurrValue); @@ -3345,89 +2963,59 @@ bool GenericAsmParser::ParseDirectiveCFIEscape(StringRef IDVal, /// ParseDirectiveCFISignalFrame /// ::= .cfi_signal_frame -bool GenericAsmParser::ParseDirectiveCFISignalFrame(StringRef Directive, - SMLoc DirectiveLoc) { +bool AsmParser::ParseDirectiveCFISignalFrame() { if (getLexer().isNot(AsmToken::EndOfStatement)) return Error(getLexer().getLoc(), - "unexpected token in '" + Directive + "' directive"); + "unexpected token in '.cfi_signal_frame'"); getStreamer().EmitCFISignalFrame(); - return false; } /// ParseDirectiveCFIUndefined /// ::= .cfi_undefined register -bool GenericAsmParser::ParseDirectiveCFIUndefined(StringRef Directive, - SMLoc DirectiveLoc) { +bool AsmParser::ParseDirectiveCFIUndefined(SMLoc DirectiveLoc) { int64_t Register = 0; if (ParseRegisterOrRegisterNumber(Register, DirectiveLoc)) return true; getStreamer().EmitCFIUndefined(Register); - - return false; -} - -/// ParseDirectiveCFIRegister -/// ::= .cfi_register register, register -bool GenericAsmParser::ParseDirectiveCFIRegister(StringRef Directive, - SMLoc DirectiveLoc) { - int64_t Register1 = 0; - - if (ParseRegisterOrRegisterNumber(Register1, DirectiveLoc)) - return true; - - if (getLexer().isNot(AsmToken::Comma)) - return TokError("unexpected token in directive"); - Lex(); - - int64_t Register2 = 0; - - if (ParseRegisterOrRegisterNumber(Register2, DirectiveLoc)) - return true; - - getStreamer().EmitCFIRegister(Register1, Register2); - return false; } /// ParseDirectiveMacrosOnOff /// ::= .macros_on /// ::= .macros_off -bool GenericAsmParser::ParseDirectiveMacrosOnOff(StringRef Directive, - SMLoc DirectiveLoc) { +bool AsmParser::ParseDirectiveMacrosOnOff(StringRef Directive) { if (getLexer().isNot(AsmToken::EndOfStatement)) return Error(getLexer().getLoc(), "unexpected token in '" + Directive + "' directive"); - getParser().MacrosEnabled = Directive == ".macros_on"; - + SetMacrosEnabled(Directive == ".macros_on"); return false; } /// ParseDirectiveMacro /// ::= .macro name [parameters] -bool GenericAsmParser::ParseDirectiveMacro(StringRef Directive, - SMLoc DirectiveLoc) { +bool AsmParser::ParseDirectiveMacro(SMLoc DirectiveLoc) { StringRef Name; - if (getParser().ParseIdentifier(Name)) + if (parseIdentifier(Name)) return TokError("expected identifier in '.macro' directive"); - MacroParameters Parameters; + MCAsmMacroParameters Parameters; // Argument delimiter is initially unknown. 
It will be set by
// ParseMacroArgument()
 AsmToken::TokenKind ArgumentDelimiter = AsmToken::Eof;
 if (getLexer().isNot(AsmToken::EndOfStatement)) {
 for (;;) {
- MacroParameter Parameter;
- if (getParser().ParseIdentifier(Parameter.first))
+ MCAsmMacroParameter Parameter;
+ if (parseIdentifier(Parameter.first))
 return TokError("expected identifier in '.macro' directive");

 if (getLexer().is(AsmToken::Equal)) {
 Lex();
- if (getParser().ParseMacroArgument(Parameter.second, ArgumentDelimiter))
+ if (ParseMacroArgument(Parameter.second, ArgumentDelimiter))
 return true;
 }

@@ -3464,32 +3052,134 @@ bool GenericAsmParser::ParseDirectiveMacro(StringRef Directive,
 }

 // Otherwise, scan til the end of the statement.
- getParser().EatToEndOfStatement();
+ eatToEndOfStatement();
 }

- if (getParser().MacroMap.lookup(Name)) {
+ if (LookupMacro(Name)) {
 return Error(DirectiveLoc, "macro '" + Name + "' is already defined");
 }

 const char *BodyStart = StartToken.getLoc().getPointer();
 const char *BodyEnd = EndToken.getLoc().getPointer();
 StringRef Body = StringRef(BodyStart, BodyEnd - BodyStart);
- getParser().MacroMap[Name] = new Macro(Name, Body, Parameters);
+ CheckForBadMacro(DirectiveLoc, Name, Body, Parameters);
+ DefineMacro(Name, MCAsmMacro(Name, Body, Parameters));
 return false;
}

+/// CheckForBadMacro
+///
+/// With the support added for named parameters there may be code out there
+/// that is transitioning from positional parameters. In versions of gas that
+/// did not support named parameters they would be ignored on the macro
+/// definition. But to support both styles of parameters this is not possible,
+/// so if a macro definition has named parameters but does not use them, and
+/// has what appear to be positional parameters (strings like $1, $2, ... and
+/// $n), then issue a warning that the positional parameters found in the body
+/// will have no effect. The hope is that the developer will either remove the
+/// named parameters from the macro definition (so that the positional
+/// parameters get used, if that was what was intended) or change the macro to
+/// use the named parameters. It is possible this warning will trigger when
+/// none of the named parameters are used and strings like $1 are in fact
+/// simply meant to be passed through unchanged.
+void AsmParser::CheckForBadMacro(SMLoc DirectiveLoc, StringRef Name,
+ StringRef Body,
+ MCAsmMacroParameters Parameters) {
+ // If this macro is not defined with named parameters the warning we are
+ // checking for here doesn't apply.
+ unsigned NParameters = Parameters.size();
+ if (NParameters == 0)
+ return;
+
+ bool NamedParametersFound = false;
+ bool PositionalParametersFound = false;
+
+ // Look at the body of the macro for use of both the named parameters and
+ // what are likely to be positional parameters. This is what expandMacro()
+ // is doing when it finds the parameters in the body.
+ while (!Body.empty()) {
+ // Scan for the next possible parameter.
+ std::size_t End = Body.size(), Pos = 0;
+ for (; Pos != End; ++Pos) {
+ // Check for a substitution or escape.
+ // This macro is defined with parameters, look for \foo, \bar, etc.
+ if (Body[Pos] == '\\' && Pos + 1 != End)
+ break;
+
+ // This macro should have parameters, but look for $0, $1, ..., $n too.
+ if (Body[Pos] != '$' || Pos + 1 == End)
+ continue;
+ char Next = Body[Pos + 1];
+ if (Next == '$' || Next == 'n' ||
+ isdigit(static_cast<unsigned char>(Next)))
+ break;
+ }
+
+ // Check if we reached the end. 
+ if (Pos == End) + break; + + if (Body[Pos] == '$') { + switch (Body[Pos+1]) { + // $$ => $ + case '$': + break; + + // $n => number of arguments + case 'n': + PositionalParametersFound = true; + break; + + // $[0-9] => argument + default: { + PositionalParametersFound = true; + break; + } + } + Pos += 2; + } else { + unsigned I = Pos + 1; + while (isIdentifierChar(Body[I]) && I + 1 != End) + ++I; + + const char *Begin = Body.data() + Pos +1; + StringRef Argument(Begin, I - (Pos +1)); + unsigned Index = 0; + for (; Index < NParameters; ++Index) + if (Parameters[Index].first == Argument) + break; + + if (Index == NParameters) { + if (Body[Pos+1] == '(' && Body[Pos+2] == ')') + Pos += 3; + else { + Pos = I; + } + } else { + NamedParametersFound = true; + Pos += 1 + Argument.size(); + } + } + // Update the scan point. + Body = Body.substr(Pos); + } + + if (!NamedParametersFound && PositionalParametersFound) + Warning(DirectiveLoc, "macro defined with named parameters which are not " + "used in macro body, possible positional parameter " + "found in body which will have no effect"); +} + /// ParseDirectiveEndMacro /// ::= .endm /// ::= .endmacro -bool GenericAsmParser::ParseDirectiveEndMacro(StringRef Directive, - SMLoc DirectiveLoc) { +bool AsmParser::ParseDirectiveEndMacro(StringRef Directive) { if (getLexer().isNot(AsmToken::EndOfStatement)) return TokError("unexpected token in '" + Directive + "' directive"); // If we are inside a macro instantiation, terminate the current // instantiation. - if (!getParser().ActiveMacros.empty()) { - getParser().HandleMacroExit(); + if (InsideMacroInstantiation()) { + HandleMacroExit(); return false; } @@ -3501,37 +3191,136 @@ bool GenericAsmParser::ParseDirectiveEndMacro(StringRef Directive, /// ParseDirectivePurgeMacro /// ::= .purgem -bool GenericAsmParser::ParseDirectivePurgeMacro(StringRef Directive, - SMLoc DirectiveLoc) { +bool AsmParser::ParseDirectivePurgeMacro(SMLoc DirectiveLoc) { StringRef Name; - if (getParser().ParseIdentifier(Name)) + if (parseIdentifier(Name)) return TokError("expected identifier in '.purgem' directive"); if (getLexer().isNot(AsmToken::EndOfStatement)) return TokError("unexpected token in '.purgem' directive"); - StringMap<Macro*>::iterator I = getParser().MacroMap.find(Name); - if (I == getParser().MacroMap.end()) + if (!LookupMacro(Name)) return Error(DirectiveLoc, "macro '" + Name + "' is not defined"); - // Undefine the macro. - delete I->getValue(); - getParser().MacroMap.erase(I); + UndefineMacro(Name); + return false; +} + +/// ParseDirectiveBundleAlignMode +/// ::= {.bundle_align_mode} expression +bool AsmParser::ParseDirectiveBundleAlignMode() { + checkForValidSection(); + + // Expect a single argument: an expression that evaluates to a constant + // in the inclusive range 0-30. + SMLoc ExprLoc = getLexer().getLoc(); + int64_t AlignSizePow2; + if (parseAbsoluteExpression(AlignSizePow2)) + return true; + else if (getLexer().isNot(AsmToken::EndOfStatement)) + return TokError("unexpected token after expression in" + " '.bundle_align_mode' directive"); + else if (AlignSizePow2 < 0 || AlignSizePow2 > 30) + return Error(ExprLoc, + "invalid bundle alignment size (expected between 0 and 30)"); + + Lex(); + + // Because of AlignSizePow2's verified range we can safely truncate it to + // unsigned. 
+ getStreamer().EmitBundleAlignMode(static_cast<unsigned>(AlignSizePow2));
+ return false;
+}
+
+/// ParseDirectiveBundleLock
+/// ::= {.bundle_lock} [align_to_end]
+bool AsmParser::ParseDirectiveBundleLock() {
+ checkForValidSection();
+ bool AlignToEnd = false;
+
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ StringRef Option;
+ SMLoc Loc = getTok().getLoc();
+ const char *kInvalidOptionError =
+ "invalid option for '.bundle_lock' directive";
+
+ if (parseIdentifier(Option))
+ return Error(Loc, kInvalidOptionError);
+
+ if (Option != "align_to_end")
+ return Error(Loc, kInvalidOptionError);
+ else if (getLexer().isNot(AsmToken::EndOfStatement))
+ return Error(Loc,
+ "unexpected token after '.bundle_lock' directive option");
+ AlignToEnd = true;
+ }
+
+ Lex();
+
+ getStreamer().EmitBundleLock(AlignToEnd);
+ return false;
+}
+
+/// ParseDirectiveBundleUnlock
+/// ::= {.bundle_unlock}
+bool AsmParser::ParseDirectiveBundleUnlock() {
+ checkForValidSection();
+
+ if (getLexer().isNot(AsmToken::EndOfStatement))
+ return TokError("unexpected token in '.bundle_unlock' directive");
+ Lex();
+
+ getStreamer().EmitBundleUnlock();
 return false;
}

-bool GenericAsmParser::ParseDirectiveLEB128(StringRef DirName, SMLoc) {
- getParser().CheckForValidSection();
+/// ParseDirectiveSpace
+/// ::= (.skip | .space) expression [ , expression ]
+bool AsmParser::ParseDirectiveSpace(StringRef IDVal) {
+ checkForValidSection();
+
+ int64_t NumBytes;
+ if (parseAbsoluteExpression(NumBytes))
+ return true;
+
+ int64_t FillExpr = 0;
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ if (getLexer().isNot(AsmToken::Comma))
+ return TokError("unexpected token in '" + Twine(IDVal) + "' directive");
+ Lex();
+
+ if (parseAbsoluteExpression(FillExpr))
+ return true;
+
+ if (getLexer().isNot(AsmToken::EndOfStatement))
+ return TokError("unexpected token in '" + Twine(IDVal) + "' directive");
+ }
+
+ Lex();
+
+ if (NumBytes <= 0)
+ return TokError("invalid number of bytes in '" +
+ Twine(IDVal) + "' directive");
+
+ // FIXME: Sometimes the fill expr is 'nop' if it isn't supplied, instead of 0.
+ getStreamer().EmitFill(NumBytes, FillExpr, DEFAULT_ADDRSPACE);
+ return false;
+}
+
+/// ParseDirectiveLEB128
+/// ::= (.sleb128 | .uleb128) expression
+bool AsmParser::ParseDirectiveLEB128(bool Signed) {
+ checkForValidSection();
 const MCExpr *Value;
- if (getParser().ParseExpression(Value))
+ if (parseExpression(Value))
 return true;
 if (getLexer().isNot(AsmToken::EndOfStatement))
 return TokError("unexpected token in directive");
- if (DirName[1] == 's')
+ if (Signed)
 getStreamer().EmitSLEB128Value(Value);
 else
 getStreamer().EmitULEB128Value(Value);
@@ -3539,7 +3328,469 @@ bool GenericAsmParser::ParseDirectiveLEB128(StringRef DirName, SMLoc) {
 return false;
}

+/// ParseDirectiveSymbolAttribute
+/// ::= { ".globl", ".weak", ... } [ identifier ( , identifier )* ]
+bool AsmParser::ParseDirectiveSymbolAttribute(MCSymbolAttr Attr) {
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ for (;;) {
+ StringRef Name;
+ SMLoc Loc = getTok().getLoc();
+
+ if (parseIdentifier(Name))
+ return Error(Loc, "expected identifier in directive");
+
+ MCSymbol *Sym = getContext().GetOrCreateSymbol(Name);
+
+ // Assembler local symbols don't make any sense here. Complain loudly. 
+ if (Sym->isTemporary())
+ return Error(Loc, "non-local symbol required in directive");
+
+ getStreamer().EmitSymbolAttribute(Sym, Attr);
+
+ if (getLexer().is(AsmToken::EndOfStatement))
+ break;
+
+ if (getLexer().isNot(AsmToken::Comma))
+ return TokError("unexpected token in directive");
+ Lex();
+ }
+ }
+
+ Lex();
+ return false;
+}
+
+/// ParseDirectiveComm
+/// ::= ( .comm | .lcomm ) identifier , size_expression [ , align_expression ]
+bool AsmParser::ParseDirectiveComm(bool IsLocal) {
+ checkForValidSection();
+
+ SMLoc IDLoc = getLexer().getLoc();
+ StringRef Name;
+ if (parseIdentifier(Name))
+ return TokError("expected identifier in directive");
+
+ // Handle the identifier as the key symbol.
+ MCSymbol *Sym = getContext().GetOrCreateSymbol(Name);
+
+ if (getLexer().isNot(AsmToken::Comma))
+ return TokError("unexpected token in directive");
+ Lex();
+
+ int64_t Size;
+ SMLoc SizeLoc = getLexer().getLoc();
+ if (parseAbsoluteExpression(Size))
+ return true;
+
+ int64_t Pow2Alignment = 0;
+ SMLoc Pow2AlignmentLoc;
+ if (getLexer().is(AsmToken::Comma)) {
+ Lex();
+ Pow2AlignmentLoc = getLexer().getLoc();
+ if (parseAbsoluteExpression(Pow2Alignment))
+ return true;
+
+ LCOMM::LCOMMType LCOMM = Lexer.getMAI().getLCOMMDirectiveAlignmentType();
+ if (IsLocal && LCOMM == LCOMM::NoAlignment)
+ return Error(Pow2AlignmentLoc, "alignment not supported on this target");
+
+ // If this target takes alignments in bytes (not log) validate and convert.
+ if ((!IsLocal && Lexer.getMAI().getCOMMDirectiveAlignmentIsInBytes()) ||
+ (IsLocal && LCOMM == LCOMM::ByteAlignment)) {
+ if (!isPowerOf2_64(Pow2Alignment))
+ return Error(Pow2AlignmentLoc, "alignment must be a power of 2");
+ Pow2Alignment = Log2_64(Pow2Alignment);
+ }
+ }
+
+ if (getLexer().isNot(AsmToken::EndOfStatement))
+ return TokError("unexpected token in '.comm' or '.lcomm' directive");
+
+ Lex();
+
+ // NOTE: a size of zero for a .comm should create an undefined symbol,
+ // but a size of zero for a .lcomm creates a bss symbol of size zero.
+ if (Size < 0)
+ return Error(SizeLoc, "invalid '.comm' or '.lcomm' directive size, can't "
+ "be less than zero");
+
+ // NOTE: The alignment in the directive is a power of 2 value, the assembler
+ // may internally end up wanting an alignment in bytes.
+ // FIXME: Diagnose overflow.
+ if (Pow2Alignment < 0)
+ return Error(Pow2AlignmentLoc, "invalid '.comm' or '.lcomm' directive "
+ "alignment, can't be less than zero");
+
+ if (!Sym->isUndefined())
+ return Error(IDLoc, "invalid symbol redefinition");
+
+ // Create the Symbol as a common or local common with Size and Pow2Alignment.
+ if (IsLocal) {
+ getStreamer().EmitLocalCommonSymbol(Sym, Size, 1 << Pow2Alignment);
+ return false;
+ }
+
+ getStreamer().EmitCommonSymbol(Sym, Size, 1 << Pow2Alignment);
+ return false;
+}
+
+/// ParseDirectiveAbort
+/// ::= .abort [... message ...]
+bool AsmParser::ParseDirectiveAbort() {
+ // FIXME: Use loc from directive.
+ SMLoc Loc = getLexer().getLoc();
+
+ StringRef Str = parseStringToEndOfStatement();
+ if (getLexer().isNot(AsmToken::EndOfStatement))
+ return TokError("unexpected token in '.abort' directive");
+
+ Lex();
+
+ if (Str.empty())
+ Error(Loc, ".abort detected. Assembly stopping.");
+ else
+ Error(Loc, ".abort '" + Str + "' detected. Assembly stopping.");
+ // FIXME: Actually abort assembly here. 
+ + return false; +} + +/// ParseDirectiveInclude +/// ::= .include "filename" +bool AsmParser::ParseDirectiveInclude() { + if (getLexer().isNot(AsmToken::String)) + return TokError("expected string in '.include' directive"); + + std::string Filename = getTok().getString(); + SMLoc IncludeLoc = getLexer().getLoc(); + Lex(); + + if (getLexer().isNot(AsmToken::EndOfStatement)) + return TokError("unexpected token in '.include' directive"); + + // Strip the quotes. + Filename = Filename.substr(1, Filename.size()-2); + + // Attempt to switch the lexer to the included file before consuming the end + // of statement to avoid losing it when we switch. + if (EnterIncludeFile(Filename)) { + Error(IncludeLoc, "Could not find include file '" + Filename + "'"); + return true; + } + + return false; +} + +/// ParseDirectiveIncbin +/// ::= .incbin "filename" +bool AsmParser::ParseDirectiveIncbin() { + if (getLexer().isNot(AsmToken::String)) + return TokError("expected string in '.incbin' directive"); + + std::string Filename = getTok().getString(); + SMLoc IncbinLoc = getLexer().getLoc(); + Lex(); + + if (getLexer().isNot(AsmToken::EndOfStatement)) + return TokError("unexpected token in '.incbin' directive"); + + // Strip the quotes. + Filename = Filename.substr(1, Filename.size()-2); + + // Attempt to process the included file. + if (ProcessIncbinFile(Filename)) { + Error(IncbinLoc, "Could not find incbin file '" + Filename + "'"); + return true; + } + + return false; +} + +/// ParseDirectiveIf +/// ::= .if expression +bool AsmParser::ParseDirectiveIf(SMLoc DirectiveLoc) { + TheCondStack.push_back(TheCondState); + TheCondState.TheCond = AsmCond::IfCond; + if (TheCondState.Ignore) { + eatToEndOfStatement(); + } else { + int64_t ExprValue; + if (parseAbsoluteExpression(ExprValue)) + return true; + + if (getLexer().isNot(AsmToken::EndOfStatement)) + return TokError("unexpected token in '.if' directive"); + + Lex(); + + TheCondState.CondMet = ExprValue; + TheCondState.Ignore = !TheCondState.CondMet; + } + + return false; +} + +/// ParseDirectiveIfb +/// ::= .ifb string +bool AsmParser::ParseDirectiveIfb(SMLoc DirectiveLoc, bool ExpectBlank) { + TheCondStack.push_back(TheCondState); + TheCondState.TheCond = AsmCond::IfCond; + + if (TheCondState.Ignore) { + eatToEndOfStatement(); + } else { + StringRef Str = parseStringToEndOfStatement(); + + if (getLexer().isNot(AsmToken::EndOfStatement)) + return TokError("unexpected token in '.ifb' directive"); + + Lex(); + + TheCondState.CondMet = ExpectBlank == Str.empty(); + TheCondState.Ignore = !TheCondState.CondMet; + } + + return false; +} + +/// ParseDirectiveIfc +/// ::= .ifc string1, string2 +bool AsmParser::ParseDirectiveIfc(SMLoc DirectiveLoc, bool ExpectEqual) { + TheCondStack.push_back(TheCondState); + TheCondState.TheCond = AsmCond::IfCond; + + if (TheCondState.Ignore) { + eatToEndOfStatement(); + } else { + StringRef Str1 = ParseStringToComma(); + + if (getLexer().isNot(AsmToken::Comma)) + return TokError("unexpected token in '.ifc' directive"); + + Lex(); + + StringRef Str2 = parseStringToEndOfStatement(); + + if (getLexer().isNot(AsmToken::EndOfStatement)) + return TokError("unexpected token in '.ifc' directive"); + + Lex(); + + TheCondState.CondMet = ExpectEqual == (Str1 == Str2); + TheCondState.Ignore = !TheCondState.CondMet; + } + + return false; +} + +/// ParseDirectiveIfdef +/// ::= .ifdef symbol +bool AsmParser::ParseDirectiveIfdef(SMLoc DirectiveLoc, bool expect_defined) { + StringRef Name; + TheCondStack.push_back(TheCondState); + 
TheCondState.TheCond = AsmCond::IfCond; + + if (TheCondState.Ignore) { + eatToEndOfStatement(); + } else { + if (parseIdentifier(Name)) + return TokError("expected identifier after '.ifdef'"); + + Lex(); + + MCSymbol *Sym = getContext().LookupSymbol(Name); + + if (expect_defined) + TheCondState.CondMet = (Sym != NULL && !Sym->isUndefined()); + else + TheCondState.CondMet = (Sym == NULL || Sym->isUndefined()); + TheCondState.Ignore = !TheCondState.CondMet; + } + + return false; +} + +/// ParseDirectiveElseIf +/// ::= .elseif expression +bool AsmParser::ParseDirectiveElseIf(SMLoc DirectiveLoc) { + if (TheCondState.TheCond != AsmCond::IfCond && + TheCondState.TheCond != AsmCond::ElseIfCond) + Error(DirectiveLoc, "Encountered a .elseif that doesn't follow a .if or " + " an .elseif"); + TheCondState.TheCond = AsmCond::ElseIfCond; + + bool LastIgnoreState = false; + if (!TheCondStack.empty()) + LastIgnoreState = TheCondStack.back().Ignore; + if (LastIgnoreState || TheCondState.CondMet) { + TheCondState.Ignore = true; + eatToEndOfStatement(); + } + else { + int64_t ExprValue; + if (parseAbsoluteExpression(ExprValue)) + return true; + + if (getLexer().isNot(AsmToken::EndOfStatement)) + return TokError("unexpected token in '.elseif' directive"); + + Lex(); + TheCondState.CondMet = ExprValue; + TheCondState.Ignore = !TheCondState.CondMet; + } + + return false; +} + +/// ParseDirectiveElse +/// ::= .else +bool AsmParser::ParseDirectiveElse(SMLoc DirectiveLoc) { + if (getLexer().isNot(AsmToken::EndOfStatement)) + return TokError("unexpected token in '.else' directive"); + + Lex(); + + if (TheCondState.TheCond != AsmCond::IfCond && + TheCondState.TheCond != AsmCond::ElseIfCond) + Error(DirectiveLoc, "Encountered a .else that doesn't follow a .if or an " + ".elseif"); + TheCondState.TheCond = AsmCond::ElseCond; + bool LastIgnoreState = false; + if (!TheCondStack.empty()) + LastIgnoreState = TheCondStack.back().Ignore; + if (LastIgnoreState || TheCondState.CondMet) + TheCondState.Ignore = true; + else + TheCondState.Ignore = false; + + return false; +} + +/// ParseDirectiveEndIf +/// ::= .endif +bool AsmParser::ParseDirectiveEndIf(SMLoc DirectiveLoc) { + if (getLexer().isNot(AsmToken::EndOfStatement)) + return TokError("unexpected token in '.endif' directive"); + + Lex(); + + if ((TheCondState.TheCond == AsmCond::NoCond) || + TheCondStack.empty()) + Error(DirectiveLoc, "Encountered a .endif that doesn't follow a .if or " + ".else"); + if (!TheCondStack.empty()) { + TheCondState = TheCondStack.back(); + TheCondStack.pop_back(); + } + + return false; +} + +void AsmParser::initializeDirectiveKindMap() { + DirectiveKindMap[".set"] = DK_SET; + DirectiveKindMap[".equ"] = DK_EQU; + DirectiveKindMap[".equiv"] = DK_EQUIV; + DirectiveKindMap[".ascii"] = DK_ASCII; + DirectiveKindMap[".asciz"] = DK_ASCIZ; + DirectiveKindMap[".string"] = DK_STRING; + DirectiveKindMap[".byte"] = DK_BYTE; + DirectiveKindMap[".short"] = DK_SHORT; + DirectiveKindMap[".value"] = DK_VALUE; + DirectiveKindMap[".2byte"] = DK_2BYTE; + DirectiveKindMap[".long"] = DK_LONG; + DirectiveKindMap[".int"] = DK_INT; + DirectiveKindMap[".4byte"] = DK_4BYTE; + DirectiveKindMap[".quad"] = DK_QUAD; + DirectiveKindMap[".8byte"] = DK_8BYTE; + DirectiveKindMap[".single"] = DK_SINGLE; + DirectiveKindMap[".float"] = DK_FLOAT; + DirectiveKindMap[".double"] = DK_DOUBLE; + DirectiveKindMap[".align"] = DK_ALIGN; + DirectiveKindMap[".align32"] = DK_ALIGN32; + DirectiveKindMap[".balign"] = DK_BALIGN; + DirectiveKindMap[".balignw"] = DK_BALIGNW; + 
DirectiveKindMap[".balignl"] = DK_BALIGNL; + DirectiveKindMap[".p2align"] = DK_P2ALIGN; + DirectiveKindMap[".p2alignw"] = DK_P2ALIGNW; + DirectiveKindMap[".p2alignl"] = DK_P2ALIGNL; + DirectiveKindMap[".org"] = DK_ORG; + DirectiveKindMap[".fill"] = DK_FILL; + DirectiveKindMap[".zero"] = DK_ZERO; + DirectiveKindMap[".extern"] = DK_EXTERN; + DirectiveKindMap[".globl"] = DK_GLOBL; + DirectiveKindMap[".global"] = DK_GLOBAL; + DirectiveKindMap[".indirect_symbol"] = DK_INDIRECT_SYMBOL; + DirectiveKindMap[".lazy_reference"] = DK_LAZY_REFERENCE; + DirectiveKindMap[".no_dead_strip"] = DK_NO_DEAD_STRIP; + DirectiveKindMap[".symbol_resolver"] = DK_SYMBOL_RESOLVER; + DirectiveKindMap[".private_extern"] = DK_PRIVATE_EXTERN; + DirectiveKindMap[".reference"] = DK_REFERENCE; + DirectiveKindMap[".weak_definition"] = DK_WEAK_DEFINITION; + DirectiveKindMap[".weak_reference"] = DK_WEAK_REFERENCE; + DirectiveKindMap[".weak_def_can_be_hidden"] = DK_WEAK_DEF_CAN_BE_HIDDEN; + DirectiveKindMap[".comm"] = DK_COMM; + DirectiveKindMap[".common"] = DK_COMMON; + DirectiveKindMap[".lcomm"] = DK_LCOMM; + DirectiveKindMap[".abort"] = DK_ABORT; + DirectiveKindMap[".include"] = DK_INCLUDE; + DirectiveKindMap[".incbin"] = DK_INCBIN; + DirectiveKindMap[".code16"] = DK_CODE16; + DirectiveKindMap[".code16gcc"] = DK_CODE16GCC; + DirectiveKindMap[".rept"] = DK_REPT; + DirectiveKindMap[".irp"] = DK_IRP; + DirectiveKindMap[".irpc"] = DK_IRPC; + DirectiveKindMap[".endr"] = DK_ENDR; + DirectiveKindMap[".bundle_align_mode"] = DK_BUNDLE_ALIGN_MODE; + DirectiveKindMap[".bundle_lock"] = DK_BUNDLE_LOCK; + DirectiveKindMap[".bundle_unlock"] = DK_BUNDLE_UNLOCK; + DirectiveKindMap[".if"] = DK_IF; + DirectiveKindMap[".ifb"] = DK_IFB; + DirectiveKindMap[".ifnb"] = DK_IFNB; + DirectiveKindMap[".ifc"] = DK_IFC; + DirectiveKindMap[".ifnc"] = DK_IFNC; + DirectiveKindMap[".ifdef"] = DK_IFDEF; + DirectiveKindMap[".ifndef"] = DK_IFNDEF; + DirectiveKindMap[".ifnotdef"] = DK_IFNOTDEF; + DirectiveKindMap[".elseif"] = DK_ELSEIF; + DirectiveKindMap[".else"] = DK_ELSE; + DirectiveKindMap[".endif"] = DK_ENDIF; + DirectiveKindMap[".skip"] = DK_SKIP; + DirectiveKindMap[".space"] = DK_SPACE; + DirectiveKindMap[".file"] = DK_FILE; + DirectiveKindMap[".line"] = DK_LINE; + DirectiveKindMap[".loc"] = DK_LOC; + DirectiveKindMap[".stabs"] = DK_STABS; + DirectiveKindMap[".sleb128"] = DK_SLEB128; + DirectiveKindMap[".uleb128"] = DK_ULEB128; + DirectiveKindMap[".cfi_sections"] = DK_CFI_SECTIONS; + DirectiveKindMap[".cfi_startproc"] = DK_CFI_STARTPROC; + DirectiveKindMap[".cfi_endproc"] = DK_CFI_ENDPROC; + DirectiveKindMap[".cfi_def_cfa"] = DK_CFI_DEF_CFA; + DirectiveKindMap[".cfi_def_cfa_offset"] = DK_CFI_DEF_CFA_OFFSET; + DirectiveKindMap[".cfi_adjust_cfa_offset"] = DK_CFI_ADJUST_CFA_OFFSET; + DirectiveKindMap[".cfi_def_cfa_register"] = DK_CFI_DEF_CFA_REGISTER; + DirectiveKindMap[".cfi_offset"] = DK_CFI_OFFSET; + DirectiveKindMap[".cfi_rel_offset"] = DK_CFI_REL_OFFSET; + DirectiveKindMap[".cfi_personality"] = DK_CFI_PERSONALITY; + DirectiveKindMap[".cfi_lsda"] = DK_CFI_LSDA; + DirectiveKindMap[".cfi_remember_state"] = DK_CFI_REMEMBER_STATE; + DirectiveKindMap[".cfi_restore_state"] = DK_CFI_RESTORE_STATE; + DirectiveKindMap[".cfi_same_value"] = DK_CFI_SAME_VALUE; + DirectiveKindMap[".cfi_restore"] = DK_CFI_RESTORE; + DirectiveKindMap[".cfi_escape"] = DK_CFI_ESCAPE; + DirectiveKindMap[".cfi_signal_frame"] = DK_CFI_SIGNAL_FRAME; + DirectiveKindMap[".cfi_undefined"] = DK_CFI_UNDEFINED; + DirectiveKindMap[".cfi_register"] = DK_CFI_REGISTER; + DirectiveKindMap[".macros_on"] 
= DK_MACROS_ON; + DirectiveKindMap[".macros_off"] = DK_MACROS_OFF; + DirectiveKindMap[".macro"] = DK_MACRO; + DirectiveKindMap[".endm"] = DK_ENDM; + DirectiveKindMap[".endmacro"] = DK_ENDMACRO; + DirectiveKindMap[".purgem"] = DK_PURGEM; +} + + +MCAsmMacro *AsmParser::ParseMacroLikeBody(SMLoc DirectiveLoc) { AsmToken EndToken, StartToken = getTok(); unsigned NestLevel = 0; @@ -3571,7 +3822,7 @@ Macro *AsmParser::ParseMacroLikeBody(SMLoc DirectiveLoc) { } // Otherwise, scan till the end of the statement. - EatToEndOfStatement(); + eatToEndOfStatement(); } const char *BodyStart = StartToken.getLoc().getPointer(); @@ -3580,11 +3831,11 @@ Macro *AsmParser::ParseMacroLikeBody(SMLoc DirectiveLoc) { // We Are Anonymous. StringRef Name; - MacroParameters Parameters; - return new Macro(Name, Body, Parameters); + MCAsmMacroParameters Parameters; + return new MCAsmMacro(Name, Body, Parameters); } -void AsmParser::InstantiateMacroLikeBody(Macro *M, SMLoc DirectiveLoc, +void AsmParser::InstantiateMacroLikeBody(MCAsmMacro *M, SMLoc DirectiveLoc, raw_svector_ostream &OS) { OS << ".endr\n"; @@ -3607,7 +3858,7 @@ void AsmParser::InstantiateMacroLikeBody(Macro *M, SMLoc DirectiveLoc, bool AsmParser::ParseDirectiveRept(SMLoc DirectiveLoc) { int64_t Count; - if (ParseAbsoluteExpression(Count)) + if (parseAbsoluteExpression(Count)) return TokError("unexpected token in '.rept' directive"); if (Count < 0) @@ -3620,15 +3871,15 @@ bool AsmParser::ParseDirectiveRept(SMLoc DirectiveLoc) { Lex(); // Lex the rept definition. - Macro *M = ParseMacroLikeBody(DirectiveLoc); + MCAsmMacro *M = ParseMacroLikeBody(DirectiveLoc); if (!M) return true; // Macro instantiation is lexical, unfortunately. We construct a new buffer // to hold the macro body with substitutions. SmallString<256> Buf; - MacroParameters Parameters; - MacroArguments A; + MCAsmMacroParameters Parameters; + MCAsmMacroArguments A; raw_svector_ostream OS(Buf); while (Count--) { if (expandMacro(OS, M->Body, Parameters, A, getTok().getLoc())) @@ -3642,10 +3893,10 @@ bool AsmParser::ParseDirectiveRept(SMLoc DirectiveLoc) { /// ParseDirectiveIrp /// ::= .irp symbol,values bool AsmParser::ParseDirectiveIrp(SMLoc DirectiveLoc) { - MacroParameters Parameters; - MacroParameter Parameter; + MCAsmMacroParameters Parameters; + MCAsmMacroParameter Parameter; - if (ParseIdentifier(Parameter.first)) + if (parseIdentifier(Parameter.first)) return TokError("expected identifier in '.irp' directive"); Parameters.push_back(Parameter); @@ -3655,7 +3906,7 @@ bool AsmParser::ParseDirectiveIrp(SMLoc DirectiveLoc) { Lex(); - MacroArguments A; + MCAsmMacroArguments A; if (ParseMacroArguments(0, A)) return true; @@ -3663,7 +3914,7 @@ bool AsmParser::ParseDirectiveIrp(SMLoc DirectiveLoc) { Lex(); // Lex the irp definition. 
- Macro *M = ParseMacroLikeBody(DirectiveLoc); + MCAsmMacro *M = ParseMacroLikeBody(DirectiveLoc); if (!M) return true; @@ -3672,8 +3923,8 @@ bool AsmParser::ParseDirectiveIrp(SMLoc DirectiveLoc) { SmallString<256> Buf; raw_svector_ostream OS(Buf); - for (MacroArguments::iterator i = A.begin(), e = A.end(); i != e; ++i) { - MacroArguments Args; + for (MCAsmMacroArguments::iterator i = A.begin(), e = A.end(); i != e; ++i) { + MCAsmMacroArguments Args; Args.push_back(*i); if (expandMacro(OS, M->Body, Parameters, Args, getTok().getLoc())) @@ -3688,10 +3939,10 @@ bool AsmParser::ParseDirectiveIrp(SMLoc DirectiveLoc) { /// ParseDirectiveIrpc /// ::= .irpc symbol,values bool AsmParser::ParseDirectiveIrpc(SMLoc DirectiveLoc) { - MacroParameters Parameters; - MacroParameter Parameter; + MCAsmMacroParameters Parameters; + MCAsmMacroParameter Parameter; - if (ParseIdentifier(Parameter.first)) + if (parseIdentifier(Parameter.first)) return TokError("expected identifier in '.irpc' directive"); Parameters.push_back(Parameter); @@ -3701,7 +3952,7 @@ bool AsmParser::ParseDirectiveIrpc(SMLoc DirectiveLoc) { Lex(); - MacroArguments A; + MCAsmMacroArguments A; if (ParseMacroArguments(0, A)) return true; @@ -3712,7 +3963,7 @@ bool AsmParser::ParseDirectiveIrpc(SMLoc DirectiveLoc) { Lex(); // Lex the irpc definition. - Macro *M = ParseMacroLikeBody(DirectiveLoc); + MCAsmMacro *M = ParseMacroLikeBody(DirectiveLoc); if (!M) return true; @@ -3724,10 +3975,10 @@ bool AsmParser::ParseDirectiveIrpc(SMLoc DirectiveLoc) { StringRef Values = A.front().front().getString(); std::size_t I, End = Values.size(); for (I = 0; I < End; ++I) { - MacroArgument Arg; + MCAsmMacroArgument Arg; Arg.push_back(AsmToken(AsmToken::Identifier, Values.slice(I, I+1))); - MacroArguments Args; + MCAsmMacroArguments Args; Args.push_back(Arg); if (expandMacro(OS, M->Body, Parameters, Args, getTok().getLoc())) @@ -3751,10 +4002,11 @@ bool AsmParser::ParseDirectiveEndr(SMLoc DirectiveLoc) { return false; } -bool AsmParser::ParseDirectiveEmit(SMLoc IDLoc, ParseStatementInfo &Info) { +bool AsmParser::ParseDirectiveMSEmit(SMLoc IDLoc, ParseStatementInfo &Info, + size_t Len) { const MCExpr *Value; SMLoc ExprLoc = getLexer().getLoc(); - if (ParseExpression(Value)) + if (parseExpression(Value)) return true; const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(Value); if (!MCE) @@ -3763,27 +4015,71 @@ bool AsmParser::ParseDirectiveEmit(SMLoc IDLoc, ParseStatementInfo &Info) { if (!isUIntN(8, IntValue) && !isIntN(8, IntValue)) return Error(ExprLoc, "literal value out of range for directive"); - Info.AsmRewrites->push_back(AsmRewrite(AOK_Emit, IDLoc, 5)); + Info.AsmRewrites->push_back(AsmRewrite(AOK_Emit, IDLoc, Len)); return false; } -bool AsmParser::ParseMSInlineAsm(void *AsmLoc, std::string &AsmString, - unsigned &NumOutputs, unsigned &NumInputs, - SmallVectorImpl<std::pair<void *, bool> > &OpDecls, - SmallVectorImpl<std::string> &Constraints, - SmallVectorImpl<std::string> &Clobbers, - const MCInstrInfo *MII, - const MCInstPrinter *IP, - MCAsmParserSemaCallback &SI) { +bool AsmParser::ParseDirectiveMSAlign(SMLoc IDLoc, ParseStatementInfo &Info) { + const MCExpr *Value; + SMLoc ExprLoc = getLexer().getLoc(); + if (parseExpression(Value)) + return true; + const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(Value); + if (!MCE) + return Error(ExprLoc, "unexpected expression in align"); + uint64_t IntValue = MCE->getValue(); + if (!isPowerOf2_64(IntValue)) + return Error(ExprLoc, "literal value not a power of two greater then zero"); + + 
Info.AsmRewrites->push_back(AsmRewrite(AOK_Align, IDLoc, 5, + Log2_64(IntValue))); + return false; +} + +// We are comparing pointers, but the pointers are relative to a single string. +// Thus, this should always be deterministic. +static int RewritesSort(const void *A, const void *B) { + const AsmRewrite *AsmRewriteA = static_cast<const AsmRewrite *>(A); + const AsmRewrite *AsmRewriteB = static_cast<const AsmRewrite *>(B); + if (AsmRewriteA->Loc.getPointer() < AsmRewriteB->Loc.getPointer()) + return -1; + if (AsmRewriteB->Loc.getPointer() < AsmRewriteA->Loc.getPointer()) + return 1; + + // It's possible to have a SizeDirective rewrite and an Input/Output rewrite + // to the same location. Make sure the SizeDirective rewrite is performed + // first. This also ensures the sort algorithm is stable. + if (AsmRewriteA->Kind == AOK_SizeDirective) { + assert ((AsmRewriteB->Kind == AOK_Input || AsmRewriteB->Kind == AOK_Output) && + "Expected an Input/Output rewrite!"); + return -1; + } + if (AsmRewriteB->Kind == AOK_SizeDirective) { + assert ((AsmRewriteA->Kind == AOK_Input || AsmRewriteA->Kind == AOK_Output) && + "Expected an Input/Output rewrite!"); + return 1; + } + llvm_unreachable ("Unstable rewrite sort."); +} + +bool +AsmParser::parseMSInlineAsm(void *AsmLoc, std::string &AsmString, + unsigned &NumOutputs, unsigned &NumInputs, + SmallVectorImpl<std::pair<void *, bool> > &OpDecls, + SmallVectorImpl<std::string> &Constraints, + SmallVectorImpl<std::string> &Clobbers, + const MCInstrInfo *MII, + const MCInstPrinter *IP, + MCAsmParserSemaCallback &SI) { SmallVector<void *, 4> InputDecls; SmallVector<void *, 4> OutputDecls; - SmallVector<bool, 4> InputDeclsOffsetOf; - SmallVector<bool, 4> OutputDeclsOffsetOf; + SmallVector<bool, 4> InputDeclsAddressOf; + SmallVector<bool, 4> OutputDeclsAddressOf; SmallVector<std::string, 4> InputConstraints; SmallVector<std::string, 4> OutputConstraints; - std::set<std::string> ClobberRegs; + SmallVector<unsigned, 4> ClobberRegs; - SmallVector<struct AsmRewrite, 4> AsmStrRewrites; + SmallVector<AsmRewrite, 4> AsmStrRewrites; // Prime the lexer. Lex(); @@ -3799,64 +4095,60 @@ bool AsmParser::ParseMSInlineAsm(void *AsmLoc, std::string &AsmString, if (Info.ParseError) return true; - if (Info.Opcode != ~0U) { - const MCInstrDesc &Desc = MII->get(Info.Opcode); + if (Info.Opcode == ~0U) + continue; - // Build the list of clobbers, outputs and inputs. - for (unsigned i = 1, e = Info.ParsedOperands.size(); i != e; ++i) { - MCParsedAsmOperand *Operand = Info.ParsedOperands[i]; + const MCInstrDesc &Desc = MII->get(Info.Opcode); - // Immediate. - if (Operand->isImm()) { - if (Operand->needAsmRewrite()) - AsmStrRewrites.push_back(AsmRewrite(AOK_ImmPrefix, - Operand->getStartLoc())); - continue; - } + // Build the list of clobbers, outputs and inputs. + for (unsigned i = 1, e = Info.ParsedOperands.size(); i != e; ++i) { + MCParsedAsmOperand *Operand = Info.ParsedOperands[i]; - // Register operand. - if (Operand->isReg() && !Operand->isOffsetOf()) { - unsigned NumDefs = Desc.getNumDefs(); - // Clobber. - if (NumDefs && Operand->getMCOperandNum() < NumDefs) { - std::string Reg; - raw_string_ostream OS(Reg); - IP->printRegName(OS, Operand->getReg()); - ClobberRegs.insert(StringRef(OS.str())); - } - continue; - } + // Immediate. + if (Operand->isImm()) { + if (Operand->needAsmRewrite()) + AsmStrRewrites.push_back(AsmRewrite(AOK_ImmPrefix, - Operand->getStartLoc())); + continue; + } - // Expr/Input or Output.
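A note on RewritesSort, introduced just above: llvm::array_pod_sort lowers to qsort, which gives no ordering guarantee for elements that compare equal, so two rewrites at the same source location could come out in a different order from run to run. The comparator therefore orders by source pointer first and then breaks the one legal tie (an AOK_SizeDirective sharing a location with an Input/Output rewrite) explicitly. A reduced sketch of the same idea, with Pair as a stand-in for AsmRewrite and a generic rank as the tie-breaker, using the qsort-style comparator signature array_pod_sort accepts here:

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"

// Stand-in for AsmRewrite: a source location plus a tie-breaking rank.
struct Pair {
  const char *Loc;
  int Rank;
};

// qsort-style comparator: order by location, then by rank, so elements
// at equal locations still have one well-defined relative order.
static int compareLocThenRank(const void *A, const void *B) {
  const Pair *PA = static_cast<const Pair *>(A);
  const Pair *PB = static_cast<const Pair *>(B);
  if (PA->Loc != PB->Loc)
    return PA->Loc < PB->Loc ? -1 : 1;
  return PA->Rank - PB->Rank;
}

void sortDeterministically(llvm::SmallVectorImpl<Pair> &Rewrites) {
  llvm::array_pod_sort(Rewrites.begin(), Rewrites.end(), compareLocThenRank);
}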
- unsigned Size; - void *OpDecl = SI.LookupInlineAsmIdentifier(Operand->getName(), AsmLoc, - Size); - if (OpDecl) { - bool isOutput = (i == 1) && Desc.mayStore(); - if (!Operand->isOffsetOf() && Operand->needSizeDirective()) - AsmStrRewrites.push_back(AsmRewrite(AOK_SizeDirective, - Operand->getStartLoc(), - /*Len*/0, - Operand->getMemSize())); - if (isOutput) { - std::string Constraint = "="; - ++InputIdx; - OutputDecls.push_back(OpDecl); - OutputDeclsOffsetOf.push_back(Operand->isOffsetOf()); - Constraint += Operand->getConstraint().str(); - OutputConstraints.push_back(Constraint); - AsmStrRewrites.push_back(AsmRewrite(AOK_Output, - Operand->getStartLoc(), - Operand->getNameLen())); - } else { - InputDecls.push_back(OpDecl); - InputDeclsOffsetOf.push_back(Operand->isOffsetOf()); - InputConstraints.push_back(Operand->getConstraint().str()); - AsmStrRewrites.push_back(AsmRewrite(AOK_Input, - Operand->getStartLoc(), - Operand->getNameLen())); - } - } + // Register operand. + if (Operand->isReg() && !Operand->needAddressOf()) { + unsigned NumDefs = Desc.getNumDefs(); + // Clobber. + if (NumDefs && Operand->getMCOperandNum() < NumDefs) + ClobberRegs.push_back(Operand->getReg()); + continue; + } + + // Expr/Input or Output. + bool IsVarDecl; + unsigned Length, Size, Type; + void *OpDecl = SI.LookupInlineAsmIdentifier(Operand->getName(), AsmLoc, + Length, Size, Type, + IsVarDecl); + if (!OpDecl) + continue; + + bool isOutput = (i == 1) && Desc.mayStore(); + if (Operand->isMem() && Operand->needSizeDirective()) + AsmStrRewrites.push_back(AsmRewrite(AOK_SizeDirective, + Operand->getStartLoc(), /*Len*/0, + Operand->getMemSize())); + + if (isOutput) { + ++InputIdx; + OutputDecls.push_back(OpDecl); + OutputDeclsAddressOf.push_back(Operand->needAddressOf()); + OutputConstraints.push_back('=' + Operand->getConstraint().str()); + AsmStrRewrites.push_back(AsmRewrite(AOK_Output, Operand->getStartLoc(), + Operand->getNameLen())); + } else { + InputDecls.push_back(OpDecl); + InputDeclsAddressOf.push_back(Operand->needAddressOf()); + InputConstraints.push_back(Operand->getConstraint().str()); + AsmStrRewrites.push_back(AsmRewrite(AOK_Input, Operand->getStartLoc(), + Operand->getNameLen())); } } } @@ -3866,24 +4158,27 @@ bool AsmParser::ParseMSInlineAsm(void *AsmLoc, std::string &AsmString, NumInputs = InputDecls.size(); // Set the unique clobbers. - for (std::set<std::string>::iterator I = ClobberRegs.begin(), - E = ClobberRegs.end(); I != E; ++I) - Clobbers.push_back(*I); + array_pod_sort(ClobberRegs.begin(), ClobberRegs.end()); + ClobberRegs.erase(std::unique(ClobberRegs.begin(), ClobberRegs.end()), + ClobberRegs.end()); + Clobbers.assign(ClobberRegs.size(), std::string()); + for (unsigned I = 0, E = ClobberRegs.size(); I != E; ++I) { + raw_string_ostream OS(Clobbers[I]); + IP->printRegName(OS, ClobberRegs[I]); + } // Merge the various outputs and inputs. Outputs are expected first. if (NumOutputs || NumInputs) { unsigned NumExprs = NumOutputs + NumInputs; OpDecls.resize(NumExprs); Constraints.resize(NumExprs); - // FIXME: Constraints are hard coded to 'm', but we need an 'r' - // constraint for offsetof. This needs to be cleaned up! for (unsigned i = 0; i < NumOutputs; ++i) { - OpDecls[i] = std::make_pair(OutputDecls[i], OutputDeclsOffsetOf[i]); - Constraints[i] = OutputDeclsOffsetOf[i] ?
"=r" : OutputConstraints[i]; + OpDecls[i] = std::make_pair(OutputDecls[i], OutputDeclsAddressOf[i]); + Constraints[i] = OutputConstraints[i]; } for (unsigned i = 0, j = NumOutputs; i < NumInputs; ++i, ++j) { - OpDecls[j] = std::make_pair(InputDecls[i], InputDeclsOffsetOf[i]); - Constraints[j] = InputDeclsOffsetOf[i] ? "r" : InputConstraints[i]; + OpDecls[j] = std::make_pair(InputDecls[i], InputDeclsAddressOf[i]); + Constraints[j] = InputConstraints[i]; } } @@ -3892,10 +4187,14 @@ bool AsmParser::ParseMSInlineAsm(void *AsmLoc, std::string &AsmString, AsmRewriteKind PrevKind = AOK_Imm; raw_string_ostream OS(AsmStringIR); const char *Start = SrcMgr.getMemoryBuffer(0)->getBufferStart(); - for (SmallVectorImpl<struct AsmRewrite>::iterator - I = AsmStrRewrites.begin(), E = AsmStrRewrites.end(); I != E; ++I) { + array_pod_sort(AsmStrRewrites.begin(), AsmStrRewrites.end(), RewritesSort); + for (SmallVectorImpl<AsmRewrite>::iterator I = AsmStrRewrites.begin(), + E = AsmStrRewrites.end(); + I != E; ++I) { const char *Loc = (*I).Loc.getPointer(); + assert(Loc >= Start && "Expected Loc to be after Start!"); + unsigned AdditionalSkip = 0; AsmRewriteKind Kind = (*I).Kind; // Emit everything up to the immediate/expression. If the previous rewrite @@ -3914,22 +4213,19 @@ bool AsmParser::ParseMSInlineAsm(void *AsmLoc, std::string &AsmString, switch (Kind) { default: break; case AOK_Imm: - OS << Twine("$$"); - OS << (*I).Val; + OS << "$$" << (*I).Val; break; case AOK_ImmPrefix: - OS << Twine("$$"); + OS << "$$"; break; case AOK_Input: - OS << '$'; - OS << InputIdx++; + OS << '$' << InputIdx++; break; case AOK_Output: - OS << '$'; - OS << OutputIdx++; + OS << '$' << OutputIdx++; break; case AOK_SizeDirective: - switch((*I).Val) { + switch ((*I).Val) { default: break; case 8: OS << "byte ptr "; break; case 16: OS << "word ptr "; break; @@ -3943,6 +4239,15 @@ bool AsmParser::ParseMSInlineAsm(void *AsmLoc, std::string &AsmString, case AOK_Emit: OS << ".byte"; break; + case AOK_Align: { + unsigned Val = (*I).Val; + OS << ".align " << Val; + + // Skip the original immediate. + assert(Val < 10 && "Expected alignment less than 2^10."); + AdditionalSkip = (Val < 4) ? 2 : Val < 7 ? 3 : 4; + break; + } case AOK_DotOperator: OS << (*I).Val; break; @@ -3950,7 +4255,7 @@ bool AsmParser::ParseMSInlineAsm(void *AsmLoc, std::string &AsmString, // Skip the original expression. if (Kind != AOK_SizeDirective) - Start = Loc + (*I).Len; + Start = Loc + (*I).Len + AdditionalSkip; } // Emit the remainder of the asm string. diff --git a/lib/MC/MCParser/COFFAsmParser.cpp b/lib/MC/MCParser/COFFAsmParser.cpp index e7c564a..a50eab2 100644 --- a/lib/MC/MCParser/COFFAsmParser.cpp +++ b/lib/MC/MCParser/COFFAsmParser.cpp @@ -24,10 +24,11 @@ using namespace llvm; namespace { class COFFAsmParser : public MCAsmParserExtension { - template<bool (COFFAsmParser::*Handler)(StringRef, SMLoc)> - void AddDirectiveHandler(StringRef Directive) { - getParser().AddDirectiveHandler(this, Directive, - HandleDirective<COFFAsmParser, Handler>); + template<bool (COFFAsmParser::*HandlerMethod)(StringRef, SMLoc)> + void addDirectiveHandler(StringRef Directive) { + MCAsmParser::ExtensionDirectiveHandler Handler = std::make_pair( + this, HandleDirective<COFFAsmParser, HandlerMethod>); + getParser().addDirectiveHandler(Directive, Handler); } bool ParseSectionSwitch(StringRef Section, @@ -38,43 +39,43 @@ class COFFAsmParser : public MCAsmParserExtension { // Call the base implementation.
MCAsmParserExtension::Initialize(Parser); - AddDirectiveHandler<&COFFAsmParser::ParseSectionDirectiveText>(".text"); - AddDirectiveHandler<&COFFAsmParser::ParseSectionDirectiveData>(".data"); - AddDirectiveHandler<&COFFAsmParser::ParseSectionDirectiveBSS>(".bss"); - AddDirectiveHandler<&COFFAsmParser::ParseDirectiveDef>(".def"); - AddDirectiveHandler<&COFFAsmParser::ParseDirectiveScl>(".scl"); - AddDirectiveHandler<&COFFAsmParser::ParseDirectiveType>(".type"); - AddDirectiveHandler<&COFFAsmParser::ParseDirectiveEndef>(".endef"); - AddDirectiveHandler<&COFFAsmParser::ParseDirectiveSecRel32>(".secrel32"); + addDirectiveHandler<&COFFAsmParser::ParseSectionDirectiveText>(".text"); + addDirectiveHandler<&COFFAsmParser::ParseSectionDirectiveData>(".data"); + addDirectiveHandler<&COFFAsmParser::ParseSectionDirectiveBSS>(".bss"); + addDirectiveHandler<&COFFAsmParser::ParseDirectiveDef>(".def"); + addDirectiveHandler<&COFFAsmParser::ParseDirectiveScl>(".scl"); + addDirectiveHandler<&COFFAsmParser::ParseDirectiveType>(".type"); + addDirectiveHandler<&COFFAsmParser::ParseDirectiveEndef>(".endef"); + addDirectiveHandler<&COFFAsmParser::ParseDirectiveSecRel32>(".secrel32"); // Win64 EH directives. - AddDirectiveHandler<&COFFAsmParser::ParseSEHDirectiveStartProc>( + addDirectiveHandler<&COFFAsmParser::ParseSEHDirectiveStartProc>( ".seh_proc"); - AddDirectiveHandler<&COFFAsmParser::ParseSEHDirectiveEndProc>( + addDirectiveHandler<&COFFAsmParser::ParseSEHDirectiveEndProc>( ".seh_endproc"); - AddDirectiveHandler<&COFFAsmParser::ParseSEHDirectiveStartChained>( + addDirectiveHandler<&COFFAsmParser::ParseSEHDirectiveStartChained>( ".seh_startchained"); - AddDirectiveHandler<&COFFAsmParser::ParseSEHDirectiveEndChained>( + addDirectiveHandler<&COFFAsmParser::ParseSEHDirectiveEndChained>( ".seh_endchained"); - AddDirectiveHandler<&COFFAsmParser::ParseSEHDirectiveHandler>( + addDirectiveHandler<&COFFAsmParser::ParseSEHDirectiveHandler>( ".seh_handler"); - AddDirectiveHandler<&COFFAsmParser::ParseSEHDirectiveHandlerData>( + addDirectiveHandler<&COFFAsmParser::ParseSEHDirectiveHandlerData>( ".seh_handlerdata"); - AddDirectiveHandler<&COFFAsmParser::ParseSEHDirectivePushReg>( + addDirectiveHandler<&COFFAsmParser::ParseSEHDirectivePushReg>( ".seh_pushreg"); - AddDirectiveHandler<&COFFAsmParser::ParseSEHDirectiveSetFrame>( + addDirectiveHandler<&COFFAsmParser::ParseSEHDirectiveSetFrame>( ".seh_setframe"); - AddDirectiveHandler<&COFFAsmParser::ParseSEHDirectiveAllocStack>( + addDirectiveHandler<&COFFAsmParser::ParseSEHDirectiveAllocStack>( ".seh_stackalloc"); - AddDirectiveHandler<&COFFAsmParser::ParseSEHDirectiveSaveReg>( + addDirectiveHandler<&COFFAsmParser::ParseSEHDirectiveSaveReg>( ".seh_savereg"); - AddDirectiveHandler<&COFFAsmParser::ParseSEHDirectiveSaveXMM>( + addDirectiveHandler<&COFFAsmParser::ParseSEHDirectiveSaveXMM>( ".seh_savexmm"); - AddDirectiveHandler<&COFFAsmParser::ParseSEHDirectivePushFrame>( + addDirectiveHandler<&COFFAsmParser::ParseSEHDirectivePushFrame>( ".seh_pushframe"); - AddDirectiveHandler<&COFFAsmParser::ParseSEHDirectiveEndProlog>( + addDirectiveHandler<&COFFAsmParser::ParseSEHDirectiveEndProlog>( ".seh_endprologue"); - AddDirectiveHandler<&COFFAsmParser::ParseDirectiveSymbolAttribute>(".weak"); + addDirectiveHandler<&COFFAsmParser::ParseDirectiveSymbolAttribute>(".weak"); } bool ParseSectionDirectiveText(StringRef, SMLoc) { @@ -140,7 +141,7 @@ bool COFFAsmParser::ParseDirectiveSymbolAttribute(StringRef Directive, SMLoc) { for (;;) { StringRef Name; - if 
(getParser().ParseIdentifier(Name)) + if (getParser().parseIdentifier(Name)) return TokError("expected identifier in directive"); MCSymbol *Sym = getContext().GetOrCreateSymbol(Name); @@ -176,7 +177,7 @@ bool COFFAsmParser::ParseSectionSwitch(StringRef Section, bool COFFAsmParser::ParseDirectiveDef(StringRef, SMLoc) { StringRef SymbolName; - if (getParser().ParseIdentifier(SymbolName)) + if (getParser().parseIdentifier(SymbolName)) return TokError("expected identifier in directive"); MCSymbol *Sym = getContext().GetOrCreateSymbol(SymbolName); @@ -189,7 +190,7 @@ bool COFFAsmParser::ParseDirectiveDef(StringRef, SMLoc) { bool COFFAsmParser::ParseDirectiveScl(StringRef, SMLoc) { int64_t SymbolStorageClass; - if (getParser().ParseAbsoluteExpression(SymbolStorageClass)) + if (getParser().parseAbsoluteExpression(SymbolStorageClass)) return true; if (getLexer().isNot(AsmToken::EndOfStatement)) @@ -202,7 +203,7 @@ bool COFFAsmParser::ParseDirectiveScl(StringRef, SMLoc) { bool COFFAsmParser::ParseDirectiveType(StringRef, SMLoc) { int64_t Type; - if (getParser().ParseAbsoluteExpression(Type)) + if (getParser().parseAbsoluteExpression(Type)) return true; if (getLexer().isNot(AsmToken::EndOfStatement)) @@ -221,7 +222,7 @@ bool COFFAsmParser::ParseDirectiveEndef(StringRef, SMLoc) { bool COFFAsmParser::ParseDirectiveSecRel32(StringRef, SMLoc) { StringRef SymbolID; - if (getParser().ParseIdentifier(SymbolID)) + if (getParser().parseIdentifier(SymbolID)) return true; if (getLexer().isNot(AsmToken::EndOfStatement)) @@ -236,7 +237,7 @@ bool COFFAsmParser::ParseDirectiveSecRel32(StringRef, SMLoc) { bool COFFAsmParser::ParseSEHDirectiveStartProc(StringRef, SMLoc) { StringRef SymbolID; - if (getParser().ParseIdentifier(SymbolID)) + if (getParser().parseIdentifier(SymbolID)) return true; if (getLexer().isNot(AsmToken::EndOfStatement)) @@ -269,7 +270,7 @@ bool COFFAsmParser::ParseSEHDirectiveEndChained(StringRef, SMLoc) { bool COFFAsmParser::ParseSEHDirectiveHandler(StringRef, SMLoc) { StringRef SymbolID; - if (getParser().ParseIdentifier(SymbolID)) + if (getParser().parseIdentifier(SymbolID)) return true; if (getLexer().isNot(AsmToken::Comma)) @@ -322,7 +323,7 @@ bool COFFAsmParser::ParseSEHDirectiveSetFrame(StringRef, SMLoc L) { Lex(); SMLoc startLoc = getLexer().getLoc(); - if (getParser().ParseAbsoluteExpression(Off)) + if (getParser().parseAbsoluteExpression(Off)) return true; if (Off & 0x0F) @@ -339,7 +340,7 @@ bool COFFAsmParser::ParseSEHDirectiveSetFrame(StringRef, SMLoc L) { bool COFFAsmParser::ParseSEHDirectiveAllocStack(StringRef, SMLoc) { int64_t Size; SMLoc startLoc = getLexer().getLoc(); - if (getParser().ParseAbsoluteExpression(Size)) + if (getParser().parseAbsoluteExpression(Size)) return true; if (Size & 7) @@ -363,7 +364,7 @@ bool COFFAsmParser::ParseSEHDirectiveSaveReg(StringRef, SMLoc L) { Lex(); SMLoc startLoc = getLexer().getLoc(); - if (getParser().ParseAbsoluteExpression(Off)) + if (getParser().parseAbsoluteExpression(Off)) return true; if (Off & 7) @@ -390,7 +391,7 @@ bool COFFAsmParser::ParseSEHDirectiveSaveXMM(StringRef, SMLoc L) { Lex(); SMLoc startLoc = getLexer().getLoc(); - if (getParser().ParseAbsoluteExpression(Off)) + if (getParser().parseAbsoluteExpression(Off)) return true; if (getLexer().isNot(AsmToken::EndOfStatement)) @@ -411,7 +412,7 @@ bool COFFAsmParser::ParseSEHDirectivePushFrame(StringRef, SMLoc) { if (getLexer().is(AsmToken::At)) { SMLoc startLoc = getLexer().getLoc(); Lex(); - if (!getParser().ParseIdentifier(CodeID)) { + if (!getParser().parseIdentifier(CodeID)) { 
if (CodeID != "code") return Error(startLoc, "expected @code"); Code = true; @@ -438,7 +439,7 @@ bool COFFAsmParser::ParseAtUnwindOrAtExcept(bool &unwind, bool &except) { return TokError("a handler attribute must begin with '@'"); SMLoc startLoc = getLexer().getLoc(); Lex(); - if (getParser().ParseIdentifier(identifier)) + if (getParser().parseIdentifier(identifier)) return Error(startLoc, "expected @unwind or @except"); if (identifier == "unwind") unwind = true; @@ -479,7 +480,7 @@ bool COFFAsmParser::ParseSEHRegisterNumber(unsigned &RegNo) { } else { int64_t n; - if (getParser().ParseAbsoluteExpression(n)) + if (getParser().parseAbsoluteExpression(n)) return true; if (n > 15) return Error(startLoc, "register number is too high"); diff --git a/lib/MC/MCParser/DarwinAsmParser.cpp b/lib/MC/MCParser/DarwinAsmParser.cpp index 7b042df..6d6409f 100644 --- a/lib/MC/MCParser/DarwinAsmParser.cpp +++ b/lib/MC/MCParser/DarwinAsmParser.cpp @@ -26,10 +26,11 @@ namespace { /// \brief Implementation of directive handling which is shared across all /// Darwin targets. class DarwinAsmParser : public MCAsmParserExtension { - template<bool (DarwinAsmParser::*Handler)(StringRef, SMLoc)> - void AddDirectiveHandler(StringRef Directive) { - getParser().AddDirectiveHandler(this, Directive, - HandleDirective<DarwinAsmParser, Handler>); + template<bool (DarwinAsmParser::*HandlerMethod)(StringRef, SMLoc)> + void addDirectiveHandler(StringRef Directive) { + MCAsmParser::ExtensionDirectiveHandler Handler = std::make_pair( + this, HandleDirective<DarwinAsmParser, HandlerMethod>); + getParser().addDirectiveHandler(Directive, Handler); } bool ParseSectionSwitch(const char *Segment, const char *Section, @@ -43,77 +44,128 @@ public: // Call the base implementation. this->MCAsmParserExtension::Initialize(Parser); - AddDirectiveHandler<&DarwinAsmParser::ParseDirectiveDesc>(".desc"); - AddDirectiveHandler<&DarwinAsmParser::ParseDirectiveLsym>(".lsym"); - AddDirectiveHandler<&DarwinAsmParser::ParseDirectiveSubsectionsViaSymbols>( + addDirectiveHandler<&DarwinAsmParser::ParseDirectiveDesc>(".desc"); + addDirectiveHandler<&DarwinAsmParser::ParseDirectiveLsym>(".lsym"); + addDirectiveHandler<&DarwinAsmParser::ParseDirectiveSubsectionsViaSymbols>( ".subsections_via_symbols"); - AddDirectiveHandler<&DarwinAsmParser::ParseDirectiveDumpOrLoad>(".dump"); - AddDirectiveHandler<&DarwinAsmParser::ParseDirectiveDumpOrLoad>(".load"); - AddDirectiveHandler<&DarwinAsmParser::ParseDirectiveSection>(".section"); - AddDirectiveHandler<&DarwinAsmParser::ParseDirectivePushSection>(".pushsection"); - AddDirectiveHandler<&DarwinAsmParser::ParseDirectivePopSection>(".popsection"); - AddDirectiveHandler<&DarwinAsmParser::ParseDirectivePrevious>(".previous"); - AddDirectiveHandler<&DarwinAsmParser::ParseDirectiveSecureLogUnique>( + addDirectiveHandler<&DarwinAsmParser::ParseDirectiveDumpOrLoad>(".dump"); + addDirectiveHandler<&DarwinAsmParser::ParseDirectiveDumpOrLoad>(".load"); + addDirectiveHandler<&DarwinAsmParser::ParseDirectiveSection>(".section"); + addDirectiveHandler<&DarwinAsmParser::ParseDirectivePushSection>( + ".pushsection"); + addDirectiveHandler<&DarwinAsmParser::ParseDirectivePopSection>( + ".popsection"); + addDirectiveHandler<&DarwinAsmParser::ParseDirectivePrevious>(".previous"); + addDirectiveHandler<&DarwinAsmParser::ParseDirectiveSecureLogUnique>( ".secure_log_unique"); - AddDirectiveHandler<&DarwinAsmParser::ParseDirectiveSecureLogReset>( + addDirectiveHandler<&DarwinAsmParser::ParseDirectiveSecureLogReset>( 
".secure_log_reset"); - AddDirectiveHandler<&DarwinAsmParser::ParseDirectiveTBSS>(".tbss"); - AddDirectiveHandler<&DarwinAsmParser::ParseDirectiveZerofill>(".zerofill"); + addDirectiveHandler<&DarwinAsmParser::ParseDirectiveTBSS>(".tbss"); + addDirectiveHandler<&DarwinAsmParser::ParseDirectiveZerofill>(".zerofill"); - AddDirectiveHandler<&DarwinAsmParser::ParseDirectiveDataRegion>(".data_region"); - AddDirectiveHandler<&DarwinAsmParser::ParseDirectiveDataRegionEnd>(".end_data_region"); + addDirectiveHandler<&DarwinAsmParser::ParseDirectiveDataRegion>( + ".data_region"); + addDirectiveHandler<&DarwinAsmParser::ParseDirectiveDataRegionEnd>( + ".end_data_region"); // Special section directives. - AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveConst>(".const"); - AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveConstData>(".const_data"); - AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveConstructor>(".constructor"); - AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveCString>(".cstring"); - AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveData>(".data"); - AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveDestructor>(".destructor"); - AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveDyld>(".dyld"); - AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveFVMLibInit0>(".fvmlib_init0"); - AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveFVMLibInit1>(".fvmlib_init1"); - AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveLazySymbolPointers>(".lazy_symbol_pointer"); - AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveLiteral16>(".literal16"); - AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveLiteral4>(".literal4"); - AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveLiteral8>(".literal8"); - AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveModInitFunc>(".mod_init_func"); - AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveModTermFunc>(".mod_term_func"); - AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveNonLazySymbolPointers>(".non_lazy_symbol_pointer"); - AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveObjCCatClsMeth>(".objc_cat_cls_meth"); - AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveObjCCatInstMeth>(".objc_cat_inst_meth"); - AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveObjCCategory>(".objc_category"); - AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveObjCClass>(".objc_class"); - AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveObjCClassNames>(".objc_class_names"); - AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveObjCClassVars>(".objc_class_vars"); - AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveObjCClsMeth>(".objc_cls_meth"); - AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveObjCClsRefs>(".objc_cls_refs"); - AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveObjCInstMeth>(".objc_inst_meth"); - AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveObjCInstanceVars>(".objc_instance_vars"); - AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveObjCMessageRefs>(".objc_message_refs"); - AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveObjCMetaClass>(".objc_meta_class"); - AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveObjCMethVarNames>(".objc_meth_var_names"); - AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveObjCMethVarTypes>(".objc_meth_var_types"); - 
AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveObjCModuleInfo>(".objc_module_info"); - AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveObjCProtocol>(".objc_protocol"); - AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveObjCSelectorStrs>(".objc_selector_strs"); - AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveObjCStringObject>(".objc_string_object"); - AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveObjCSymbols>(".objc_symbols"); - AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectivePICSymbolStub>(".picsymbol_stub"); - AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveStaticConst>(".static_const"); - AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveStaticData>(".static_data"); - AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveSymbolStub>(".symbol_stub"); - AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveTData>(".tdata"); - AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveText>(".text"); - AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveThreadInitFunc>(".thread_init_func"); - AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveTLV>(".tlv"); - - AddDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveIdent>(".ident"); + addDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveConst>(".const"); + addDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveConstData>( + ".const_data"); + addDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveConstructor>( + ".constructor"); + addDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveCString>( + ".cstring"); + addDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveData>(".data"); + addDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveDestructor>( + ".destructor"); + addDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveDyld>(".dyld"); + addDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveFVMLibInit0>( + ".fvmlib_init0"); + addDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveFVMLibInit1>( + ".fvmlib_init1"); + addDirectiveHandler< + &DarwinAsmParser::ParseSectionDirectiveLazySymbolPointers>( + ".lazy_symbol_pointer"); + addDirectiveHandler<&DarwinAsmParser::ParseDirectiveLinkerOption>( + ".linker_option"); + addDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveLiteral16>( + ".literal16"); + addDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveLiteral4>( + ".literal4"); + addDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveLiteral8>( + ".literal8"); + addDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveModInitFunc>( + ".mod_init_func"); + addDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveModTermFunc>( + ".mod_term_func"); + addDirectiveHandler< + &DarwinAsmParser::ParseSectionDirectiveNonLazySymbolPointers>( + ".non_lazy_symbol_pointer"); + addDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveObjCCatClsMeth>( + ".objc_cat_cls_meth"); + addDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveObjCCatInstMeth>( + ".objc_cat_inst_meth"); + addDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveObjCCategory>( + ".objc_category"); + addDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveObjCClass>( + ".objc_class"); + addDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveObjCClassNames>( + ".objc_class_names"); + addDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveObjCClassVars>( + ".objc_class_vars"); + addDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveObjCClsMeth>( + ".objc_cls_meth"); + 
addDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveObjCClsRefs>( + ".objc_cls_refs"); + addDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveObjCInstMeth>( + ".objc_inst_meth"); + addDirectiveHandler< + &DarwinAsmParser::ParseSectionDirectiveObjCInstanceVars>( + ".objc_instance_vars"); + addDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveObjCMessageRefs>( + ".objc_message_refs"); + addDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveObjCMetaClass>( + ".objc_meta_class"); + addDirectiveHandler< + &DarwinAsmParser::ParseSectionDirectiveObjCMethVarNames>( + ".objc_meth_var_names"); + addDirectiveHandler< + &DarwinAsmParser::ParseSectionDirectiveObjCMethVarTypes>( + ".objc_meth_var_types"); + addDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveObjCModuleInfo>( + ".objc_module_info"); + addDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveObjCProtocol>( + ".objc_protocol"); + addDirectiveHandler< + &DarwinAsmParser::ParseSectionDirectiveObjCSelectorStrs>( + ".objc_selector_strs"); + addDirectiveHandler< + &DarwinAsmParser::ParseSectionDirectiveObjCStringObject>( + ".objc_string_object"); + addDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveObjCSymbols>( + ".objc_symbols"); + addDirectiveHandler<&DarwinAsmParser::ParseSectionDirectivePICSymbolStub>( + ".picsymbol_stub"); + addDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveStaticConst>( + ".static_const"); + addDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveStaticData>( + ".static_data"); + addDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveSymbolStub>( + ".symbol_stub"); + addDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveTData>(".tdata"); + addDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveText>(".text"); + addDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveThreadInitFunc>( + ".thread_init_func"); + addDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveTLV>(".tlv"); + + addDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveIdent>(".ident"); } bool ParseDirectiveDesc(StringRef, SMLoc); bool ParseDirectiveDumpOrLoad(StringRef, SMLoc); bool ParseDirectiveLsym(StringRef, SMLoc); + bool ParseDirectiveLinkerOption(StringRef, SMLoc); bool ParseDirectiveSection(StringRef, SMLoc); bool ParseDirectivePushSection(StringRef, SMLoc); bool ParseDirectivePopSection(StringRef, SMLoc); @@ -293,7 +345,7 @@ public: } bool ParseSectionDirectiveIdent(StringRef, SMLoc) { // Darwin silently ignores the .ident directive. - getParser().EatToEndOfStatement(); + getParser().eatToEndOfStatement(); return false; } bool ParseSectionDirectiveThreadInitFunc(StringRef, SMLoc) { @@ -338,7 +390,7 @@ bool DarwinAsmParser::ParseSectionSwitch(const char *Segment, /// ::= .desc identifier , expression bool DarwinAsmParser::ParseDirectiveDesc(StringRef, SMLoc) { StringRef Name; - if (getParser().ParseIdentifier(Name)) + if (getParser().parseIdentifier(Name)) return TokError("expected identifier in directive"); // Handle the identifier as the key symbol. 
@@ -349,7 +401,7 @@ bool DarwinAsmParser::ParseDirectiveDesc(StringRef, SMLoc) { Lex(); int64_t DescValue; - if (getParser().ParseAbsoluteExpression(DescValue)) + if (getParser().parseAbsoluteExpression(DescValue)) return true; if (getLexer().isNot(AsmToken::EndOfStatement)) @@ -386,11 +438,38 @@ bool DarwinAsmParser::ParseDirectiveDumpOrLoad(StringRef Directive, return Warning(IDLoc, "ignoring directive .load for now"); } +/// ParseDirectiveLinkerOption +/// ::= .linker_option "string" ( , "string" )* +bool DarwinAsmParser::ParseDirectiveLinkerOption(StringRef IDVal, SMLoc) { + SmallVector<std::string, 4> Args; + for (;;) { + if (getLexer().isNot(AsmToken::String)) + return TokError("expected string in '" + Twine(IDVal) + "' directive"); + + std::string Data; + if (getParser().parseEscapedString(Data)) + return true; + + Args.push_back(Data); + + Lex(); + if (getLexer().is(AsmToken::EndOfStatement)) + break; + + if (getLexer().isNot(AsmToken::Comma)) + return TokError("unexpected token in '" + Twine(IDVal) + "' directive"); + Lex(); + } + + getStreamer().EmitLinkerOptions(Args); + return false; +} + /// ParseDirectiveLsym /// ::= .lsym identifier , expression bool DarwinAsmParser::ParseDirectiveLsym(StringRef, SMLoc) { StringRef Name; - if (getParser().ParseIdentifier(Name)) + if (getParser().parseIdentifier(Name)) return TokError("expected identifier in directive"); // Handle the identifier as the key symbol. @@ -401,7 +480,7 @@ bool DarwinAsmParser::ParseDirectiveLsym(StringRef, SMLoc) { Lex(); const MCExpr *Value; - if (getParser().ParseExpression(Value)) + if (getParser().parseExpression(Value)) return true; if (getLexer().isNot(AsmToken::EndOfStatement)) @@ -422,7 +501,7 @@ bool DarwinAsmParser::ParseDirectiveSection(StringRef, SMLoc) { SMLoc Loc = getLexer().getLoc(); StringRef SectionName; - if (getParser().ParseIdentifier(SectionName)) + if (getParser().parseIdentifier(SectionName)) return Error(Loc, "expected identifier after '.section' directive"); // Verify there is a following comma. @@ -497,7 +576,7 @@ bool DarwinAsmParser::ParseDirectivePrevious(StringRef DirName, SMLoc) { /// ParseDirectiveSecureLogUnique /// ::= .secure_log_unique ... message ... bool DarwinAsmParser::ParseDirectiveSecureLogUnique(StringRef, SMLoc IDLoc) { - StringRef LogMessage = getParser().ParseStringToEndOfStatement(); + StringRef LogMessage = getParser().parseStringToEndOfStatement(); if (getLexer().isNot(AsmToken::EndOfStatement)) return TokError("unexpected token in '.secure_log_unique' directive"); @@ -565,7 +644,7 @@ bool DarwinAsmParser::ParseDirectiveSubsectionsViaSymbols(StringRef, SMLoc) { bool DarwinAsmParser::ParseDirectiveTBSS(StringRef, SMLoc) { SMLoc IDLoc = getLexer().getLoc(); StringRef Name; - if (getParser().ParseIdentifier(Name)) + if (getParser().parseIdentifier(Name)) return TokError("expected identifier in directive"); // Handle the identifier as the key symbol. 
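The new .linker_option directive parsed above gives assembly files a way to record auto-linking hints, e.g. a line such as .linker_option "-lz"; the strings collected by the loop travel through EmitLinkerOptions into a Mach-O load command that WriteLinkerOptionsLoadCommand (in the MachObjectWriter.cpp hunk later in this patch) serializes. The command's size is a fixed header plus each option string including its NUL terminator, rounded up to the pointer size. A sketch of that size computation, with HeaderSize standing in for sizeof(macho::LinkerOptionsLoadCommand):

#include <string>
#include <vector>

// Round Value up to the next multiple of Align (Align > 0).
static unsigned roundUpTo(unsigned Value, unsigned Align) {
  return (Value + Align - 1) / Align * Align;
}

// Mirrors ComputeLinkerOptionsLoadCommandSize: header, then each string
// including its NUL byte, padded to the pointer size of the target.
unsigned linkerOptionsCmdSize(const std::vector<std::string> &Options,
                              bool Is64Bit, unsigned HeaderSize) {
  unsigned Size = HeaderSize;
  for (unsigned i = 0, e = Options.size(); i != e; ++i)
    Size += Options[i].size() + 1;
  return roundUpTo(Size, Is64Bit ? 8 : 4);
}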
@@ -577,7 +656,7 @@ bool DarwinAsmParser::ParseDirectiveTBSS(StringRef, SMLoc) { int64_t Size; SMLoc SizeLoc = getLexer().getLoc(); - if (getParser().ParseAbsoluteExpression(Size)) + if (getParser().parseAbsoluteExpression(Size)) return true; int64_t Pow2Alignment = 0; @@ -585,7 +664,7 @@ bool DarwinAsmParser::ParseDirectiveTBSS(StringRef, SMLoc) { if (getLexer().is(AsmToken::Comma)) { Lex(); Pow2AlignmentLoc = getLexer().getLoc(); - if (getParser().ParseAbsoluteExpression(Pow2Alignment)) + if (getParser().parseAbsoluteExpression(Pow2Alignment)) return true; } @@ -620,7 +699,7 @@ bool DarwinAsmParser::ParseDirectiveTBSS(StringRef, SMLoc) { /// , align_expression ]] bool DarwinAsmParser::ParseDirectiveZerofill(StringRef, SMLoc) { StringRef Segment; - if (getParser().ParseIdentifier(Segment)) + if (getParser().parseIdentifier(Segment)) return TokError("expected segment name after '.zerofill' directive"); if (getLexer().isNot(AsmToken::Comma)) @@ -628,7 +707,7 @@ bool DarwinAsmParser::ParseDirectiveZerofill(StringRef, SMLoc) { Lex(); StringRef Section; - if (getParser().ParseIdentifier(Section)) + if (getParser().parseIdentifier(Section)) return TokError("expected section name after comma in '.zerofill' " "directive"); @@ -648,7 +727,7 @@ bool DarwinAsmParser::ParseDirectiveZerofill(StringRef, SMLoc) { SMLoc IDLoc = getLexer().getLoc(); StringRef IDStr; - if (getParser().ParseIdentifier(IDStr)) + if (getParser().parseIdentifier(IDStr)) return TokError("expected identifier in directive"); // handle the identifier as the key symbol. @@ -660,7 +739,7 @@ bool DarwinAsmParser::ParseDirectiveZerofill(StringRef, SMLoc) { int64_t Size; SMLoc SizeLoc = getLexer().getLoc(); - if (getParser().ParseAbsoluteExpression(Size)) + if (getParser().parseAbsoluteExpression(Size)) return true; int64_t Pow2Alignment = 0; @@ -668,7 +747,7 @@ bool DarwinAsmParser::ParseDirectiveZerofill(StringRef, SMLoc) { if (getLexer().is(AsmToken::Comma)) { Lex(); Pow2AlignmentLoc = getLexer().getLoc(); - if (getParser().ParseAbsoluteExpression(Pow2Alignment)) + if (getParser().parseAbsoluteExpression(Pow2Alignment)) return true; } @@ -712,7 +791,7 @@ bool DarwinAsmParser::ParseDirectiveDataRegion(StringRef, SMLoc) { } StringRef RegionType; SMLoc Loc = getParser().getTok().getLoc(); - if (getParser().ParseIdentifier(RegionType)) + if (getParser().parseIdentifier(RegionType)) return TokError("expected region type after '.data_region' directive"); int Kind = StringSwitch<int>(RegionType) .Case("jt8", MCDR_DataRegionJT8) diff --git a/lib/MC/MCParser/ELFAsmParser.cpp b/lib/MC/MCParser/ELFAsmParser.cpp index d55de1f..4c45e08 100644 --- a/lib/MC/MCParser/ELFAsmParser.cpp +++ b/lib/MC/MCParser/ELFAsmParser.cpp @@ -22,10 +22,12 @@ using namespace llvm; namespace { class ELFAsmParser : public MCAsmParserExtension { - template<bool (ELFAsmParser::*Handler)(StringRef, SMLoc)> - void AddDirectiveHandler(StringRef Directive) { - getParser().AddDirectiveHandler(this, Directive, - HandleDirective<ELFAsmParser, Handler>); + template<bool (ELFAsmParser::*HandlerMethod)(StringRef, SMLoc)> + void addDirectiveHandler(StringRef Directive) { + MCAsmParser::ExtensionDirectiveHandler Handler = std::make_pair( + this, HandleDirective<ELFAsmParser, HandlerMethod>); + + getParser().addDirectiveHandler(Directive, Handler); } bool ParseSectionSwitch(StringRef Section, unsigned Type, @@ -41,38 +43,38 @@ public: // Call the base implementation. 
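The addDirectiveHandler change repeated across the COFF and Darwin parsers above, and in the ELF registrations that follow, is one refactoring: a handler used to be registered as an (object, directive, function) triple, and is now a single ExtensionDirectiveHandler, i.e. a std::pair of the extension object and a plain function pointer manufactured at compile time from a member-function pointer. A generic sketch of that trampoline idiom, with all names (Ext, HandlerFn, Registry) illustrative rather than LLVM's:

#include <map>
#include <string>
#include <utility>

class Ext {
public:
  bool onFoo(const std::string &Arg) { return false; }
};

// A stored handler: the object plus a free function that knows how to
// call back into it with a uniform signature.
typedef bool (*HandlerFn)(Ext *, const std::string &);
typedef std::pair<Ext *, HandlerFn> DirectiveHandler;

// Turns a member-function pointer, fixed as a template argument, into
// an ordinary function pointer that can be stored in the pair.
template <bool (Ext::*Method)(const std::string &)>
bool trampoline(Ext *Target, const std::string &Arg) {
  return (Target->*Method)(Arg);
}

void registerHandlers(Ext &E,
                      std::map<std::string, DirectiveHandler> &Registry) {
  Registry[".foo"] = std::make_pair(&E, &trampoline<&Ext::onFoo>);
}

Packing the object into the handler value itself is what lets the core AsmParser dispatch to any extension without knowing which extension registered a given directive.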
this->MCAsmParserExtension::Initialize(Parser); - AddDirectiveHandler<&ELFAsmParser::ParseSectionDirectiveData>(".data"); - AddDirectiveHandler<&ELFAsmParser::ParseSectionDirectiveText>(".text"); - AddDirectiveHandler<&ELFAsmParser::ParseSectionDirectiveBSS>(".bss"); - AddDirectiveHandler<&ELFAsmParser::ParseSectionDirectiveRoData>(".rodata"); - AddDirectiveHandler<&ELFAsmParser::ParseSectionDirectiveTData>(".tdata"); - AddDirectiveHandler<&ELFAsmParser::ParseSectionDirectiveTBSS>(".tbss"); - AddDirectiveHandler< + addDirectiveHandler<&ELFAsmParser::ParseSectionDirectiveData>(".data"); + addDirectiveHandler<&ELFAsmParser::ParseSectionDirectiveText>(".text"); + addDirectiveHandler<&ELFAsmParser::ParseSectionDirectiveBSS>(".bss"); + addDirectiveHandler<&ELFAsmParser::ParseSectionDirectiveRoData>(".rodata"); + addDirectiveHandler<&ELFAsmParser::ParseSectionDirectiveTData>(".tdata"); + addDirectiveHandler<&ELFAsmParser::ParseSectionDirectiveTBSS>(".tbss"); + addDirectiveHandler< &ELFAsmParser::ParseSectionDirectiveDataRel>(".data.rel"); - AddDirectiveHandler< + addDirectiveHandler< &ELFAsmParser::ParseSectionDirectiveDataRelRo>(".data.rel.ro"); - AddDirectiveHandler< + addDirectiveHandler< &ELFAsmParser::ParseSectionDirectiveDataRelRoLocal>(".data.rel.ro.local"); - AddDirectiveHandler< + addDirectiveHandler< &ELFAsmParser::ParseSectionDirectiveEhFrame>(".eh_frame"); - AddDirectiveHandler<&ELFAsmParser::ParseDirectiveSection>(".section"); - AddDirectiveHandler< + addDirectiveHandler<&ELFAsmParser::ParseDirectiveSection>(".section"); + addDirectiveHandler< &ELFAsmParser::ParseDirectivePushSection>(".pushsection"); - AddDirectiveHandler<&ELFAsmParser::ParseDirectivePopSection>(".popsection"); - AddDirectiveHandler<&ELFAsmParser::ParseDirectiveSize>(".size"); - AddDirectiveHandler<&ELFAsmParser::ParseDirectivePrevious>(".previous"); - AddDirectiveHandler<&ELFAsmParser::ParseDirectiveType>(".type"); - AddDirectiveHandler<&ELFAsmParser::ParseDirectiveIdent>(".ident"); - AddDirectiveHandler<&ELFAsmParser::ParseDirectiveSymver>(".symver"); - AddDirectiveHandler<&ELFAsmParser::ParseDirectiveVersion>(".version"); - AddDirectiveHandler<&ELFAsmParser::ParseDirectiveWeakref>(".weakref"); - AddDirectiveHandler<&ELFAsmParser::ParseDirectiveSymbolAttribute>(".weak"); - AddDirectiveHandler<&ELFAsmParser::ParseDirectiveSymbolAttribute>(".local"); - AddDirectiveHandler< + addDirectiveHandler<&ELFAsmParser::ParseDirectivePopSection>(".popsection"); + addDirectiveHandler<&ELFAsmParser::ParseDirectiveSize>(".size"); + addDirectiveHandler<&ELFAsmParser::ParseDirectivePrevious>(".previous"); + addDirectiveHandler<&ELFAsmParser::ParseDirectiveType>(".type"); + addDirectiveHandler<&ELFAsmParser::ParseDirectiveIdent>(".ident"); + addDirectiveHandler<&ELFAsmParser::ParseDirectiveSymver>(".symver"); + addDirectiveHandler<&ELFAsmParser::ParseDirectiveVersion>(".version"); + addDirectiveHandler<&ELFAsmParser::ParseDirectiveWeakref>(".weakref"); + addDirectiveHandler<&ELFAsmParser::ParseDirectiveSymbolAttribute>(".weak"); + addDirectiveHandler<&ELFAsmParser::ParseDirectiveSymbolAttribute>(".local"); + addDirectiveHandler< &ELFAsmParser::ParseDirectiveSymbolAttribute>(".protected"); - AddDirectiveHandler< + addDirectiveHandler< &ELFAsmParser::ParseDirectiveSymbolAttribute>(".internal"); - AddDirectiveHandler< + addDirectiveHandler< &ELFAsmParser::ParseDirectiveSymbolAttribute>(".hidden"); } @@ -167,7 +169,7 @@ bool ELFAsmParser::ParseDirectiveSymbolAttribute(StringRef Directive, SMLoc) { for (;;) { StringRef Name; - if 
(getParser().ParseIdentifier(Name)) + if (getParser().parseIdentifier(Name)) return TokError("expected identifier in directive"); MCSymbol *Sym = getContext().GetOrCreateSymbol(Name); @@ -201,7 +203,7 @@ bool ELFAsmParser::ParseSectionSwitch(StringRef Section, unsigned Type, bool ELFAsmParser::ParseDirectiveSize(StringRef, SMLoc) { StringRef Name; - if (getParser().ParseIdentifier(Name)) + if (getParser().parseIdentifier(Name)) return TokError("expected identifier in directive"); MCSymbol *Sym = getContext().GetOrCreateSymbol(Name); @@ -210,7 +212,7 @@ bool ELFAsmParser::ParseDirectiveSize(StringRef, SMLoc) { Lex(); const MCExpr *Expr; - if (getParser().ParseExpression(Expr)) + if (getParser().parseExpression(Expr)) return true; if (getLexer().isNot(AsmToken::EndOfStatement)) @@ -222,7 +224,7 @@ bool ELFAsmParser::ParseDirectiveSize(StringRef, SMLoc) { bool ELFAsmParser::ParseSectionName(StringRef &SectionName) { // A section name can contain -, so we cannot just use - // ParseIdentifier. + // parseIdentifier. SMLoc FirstLoc = getLexer().getLoc(); unsigned Size = 0; @@ -375,14 +377,14 @@ bool ELFAsmParser::ParseDirectiveSection(StringRef, SMLoc) { return TokError("expected '@' or '%' before type"); Lex(); - if (getParser().ParseIdentifier(TypeName)) + if (getParser().parseIdentifier(TypeName)) return TokError("expected identifier in directive"); if (Mergeable) { if (getLexer().isNot(AsmToken::Comma)) return TokError("expected the entry size"); Lex(); - if (getParser().ParseAbsoluteExpression(Size)) + if (getParser().parseAbsoluteExpression(Size)) return true; if (Size <= 0) return TokError("entry size must be positive"); @@ -392,12 +394,12 @@ bool ELFAsmParser::ParseDirectiveSection(StringRef, SMLoc) { if (getLexer().isNot(AsmToken::Comma)) return TokError("expected group name"); Lex(); - if (getParser().ParseIdentifier(GroupName)) + if (getParser().parseIdentifier(GroupName)) return true; if (getLexer().is(AsmToken::Comma)) { Lex(); StringRef Linkage; - if (getParser().ParseIdentifier(Linkage)) + if (getParser().parseIdentifier(Linkage)) return true; if (Linkage != "comdat") return TokError("Linkage must be 'comdat'"); @@ -411,7 +413,16 @@ bool ELFAsmParser::ParseDirectiveSection(StringRef, SMLoc) { unsigned Type = ELF::SHT_PROGBITS; - if (!TypeName.empty()) { + if (TypeName.empty()) { + if (SectionName.startswith(".note")) + Type = ELF::SHT_NOTE; + else if (SectionName == ".init_array") + Type = ELF::SHT_INIT_ARRAY; + else if (SectionName == ".fini_array") + Type = ELF::SHT_FINI_ARRAY; + else if (SectionName == ".preinit_array") + Type = ELF::SHT_PREINIT_ARRAY; + } else { if (TypeName == "init_array") Type = ELF::SHT_INIT_ARRAY; else if (TypeName == "fini_array") @@ -450,7 +461,7 @@ bool ELFAsmParser::ParseDirectivePrevious(StringRef DirName, SMLoc) { /// ::= .type identifier , @attribute bool ELFAsmParser::ParseDirectiveType(StringRef, SMLoc) { StringRef Name; - if (getParser().ParseIdentifier(Name)) + if (getParser().parseIdentifier(Name)) return TokError("expected identifier in directive"); // Handle the identifier as the key symbol. 
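The .section hunk above also changes what happens when no explicit @type is given: instead of always defaulting to SHT_PROGBITS, well-known names now pick their conventional type (.note* sections become SHT_NOTE; .init_array, .fini_array, and .preinit_array get their array types). The same defaulting, condensed into a helper for illustration; the patch itself uses an explicit if/else chain, and StringSwitch is just a compact way to express it:

#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Support/ELF.h"

// Default ELF section type for a section declared without an explicit
// @type, following the name-based rules added in this patch.
static unsigned defaultSectionType(llvm::StringRef Name) {
  if (Name.startswith(".note"))
    return llvm::ELF::SHT_NOTE;
  return llvm::StringSwitch<unsigned>(Name)
      .Case(".init_array", llvm::ELF::SHT_INIT_ARRAY)
      .Case(".fini_array", llvm::ELF::SHT_FINI_ARRAY)
      .Case(".preinit_array", llvm::ELF::SHT_PREINIT_ARRAY)
      .Default(llvm::ELF::SHT_PROGBITS);
}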
@@ -468,7 +479,7 @@ bool ELFAsmParser::ParseDirectiveType(StringRef, SMLoc) { SMLoc TypeLoc; TypeLoc = getLexer().getLoc(); - if (getParser().ParseIdentifier(Type)) + if (getParser().parseIdentifier(Type)) return TokError("expected symbol type in directive"); MCSymbolAttr Attr = StringSwitch<MCSymbolAttr>(Type) @@ -517,7 +528,7 @@ bool ELFAsmParser::ParseDirectiveIdent(StringRef, SMLoc) { getStreamer().EmitIntValue(0, 1); SeenIdent = true; } - getStreamer().EmitBytes(Data, 0); + getStreamer().EmitBytes(Data); getStreamer().EmitIntValue(0, 1); getStreamer().PopSection(); return false; @@ -527,7 +538,7 @@ bool ELFAsmParser::ParseDirectiveIdent(StringRef, SMLoc) { /// ::= .symver foo, bar2@zed bool ELFAsmParser::ParseDirectiveSymver(StringRef, SMLoc) { StringRef Name; - if (getParser().ParseIdentifier(Name)) + if (getParser().parseIdentifier(Name)) return TokError("expected identifier in directive"); if (getLexer().isNot(AsmToken::Comma)) @@ -536,7 +547,7 @@ bool ELFAsmParser::ParseDirectiveSymver(StringRef, SMLoc) { Lex(); StringRef AliasName; - if (getParser().ParseIdentifier(AliasName)) + if (getParser().parseIdentifier(AliasName)) return TokError("expected identifier in directive"); if (AliasName.find('@') == StringRef::npos) @@ -569,7 +580,7 @@ bool ELFAsmParser::ParseDirectiveVersion(StringRef, SMLoc) { getStreamer().EmitIntValue(Data.size()+1, 4); // namesz. getStreamer().EmitIntValue(0, 4); // descsz = 0 (no description). getStreamer().EmitIntValue(1, 4); // type = NT_VERSION. - getStreamer().EmitBytes(Data, 0); // name. + getStreamer().EmitBytes(Data); // name. getStreamer().EmitIntValue(0, 1); // terminate the string. getStreamer().EmitValueToAlignment(4); // ensure 4 byte alignment. getStreamer().PopSection(); @@ -582,7 +593,7 @@ bool ELFAsmParser::ParseDirectiveWeakref(StringRef, SMLoc) { // FIXME: Share code with the other alias building directives. 
StringRef AliasName; - if (getParser().ParseIdentifier(AliasName)) + if (getParser().parseIdentifier(AliasName)) return TokError("expected identifier in directive"); if (getLexer().isNot(AsmToken::Comma)) @@ -591,7 +602,7 @@ bool ELFAsmParser::ParseDirectiveWeakref(StringRef, SMLoc) { Lex(); StringRef Name; - if (getParser().ParseIdentifier(Name)) + if (getParser().parseIdentifier(Name)) return TokError("expected identifier in directive"); MCSymbol *Alias = getContext().GetOrCreateSymbol(AliasName); diff --git a/lib/MC/MCParser/MCAsmParser.cpp b/lib/MC/MCParser/MCAsmParser.cpp index a8b00cd..6e1ebad 100644 --- a/lib/MC/MCParser/MCAsmParser.cpp +++ b/lib/MC/MCParser/MCAsmParser.cpp @@ -38,9 +38,9 @@ bool MCAsmParser::TokError(const Twine &Msg, ArrayRef<SMRange> Ranges) { return true; } -bool MCAsmParser::ParseExpression(const MCExpr *&Res) { +bool MCAsmParser::parseExpression(const MCExpr *&Res) { SMLoc L; - return ParseExpression(Res, L); + return parseExpression(Res, L); } void MCParsedAsmOperand::dump() const { diff --git a/lib/MC/MCPureStreamer.cpp b/lib/MC/MCPureStreamer.cpp index 97e5a69..573308a 100644 --- a/lib/MC/MCPureStreamer.cpp +++ b/lib/MC/MCPureStreamer.cpp @@ -28,14 +28,15 @@ private: virtual void EmitInstToData(const MCInst &Inst); public: - MCPureStreamer(MCContext &Context, MCAsmBackend &TAB, - raw_ostream &OS, MCCodeEmitter *Emitter) - : MCObjectStreamer(Context, TAB, OS, Emitter) {} + MCPureStreamer(MCContext &Context, MCAsmBackend &TAB, raw_ostream &OS, + MCCodeEmitter *Emitter) + : MCObjectStreamer(SK_PureStreamer, Context, TAB, OS, Emitter) {} /// @name MCStreamer Interface /// @{ virtual void InitSections(); + virtual void InitToTextSection(); virtual void EmitLabel(MCSymbol *Symbol); virtual void EmitDebugLabel(MCSymbol *Symbol); virtual void EmitZerofill(const MCSection *Section, MCSymbol *Symbol = 0, @@ -99,16 +100,23 @@ public: } /// @} + + static bool classof(const MCStreamer *S) { + return S->getKind() == SK_PureStreamer; + } }; } // end anonymous namespace. void MCPureStreamer::InitSections() { + InitToTextSection(); +} + +void MCPureStreamer::InitToTextSection() { // FIXME: To what!? SwitchSection(getContext().getMachOSection("__TEXT", "__text", MCSectionMachO::S_ATTR_PURE_INSTRUCTIONS, 0, SectionKind::getText())); - } void MCPureStreamer::EmitLabel(MCSymbol *Symbol) { diff --git a/lib/MC/MCSectionMachO.cpp b/lib/MC/MCSectionMachO.cpp index e771556..fc32315 100644 --- a/lib/MC/MCSectionMachO.cpp +++ b/lib/MC/MCSectionMachO.cpp @@ -165,9 +165,9 @@ bool MCSectionMachO::isVirtualSection() const { /// StripSpaces - This removes leading and trailing spaces from the StringRef.
static void StripSpaces(StringRef &Str) { - while (!Str.empty() && isspace(Str[0])) + while (!Str.empty() && isspace(static_cast<unsigned char>(Str[0]))) Str = Str.substr(1); - while (!Str.empty() && isspace(Str.back())) + while (!Str.empty() && isspace(static_cast<unsigned char>(Str.back()))) Str = Str.substr(0, Str.size()-1); } diff --git a/lib/MC/MCStreamer.cpp b/lib/MC/MCStreamer.cpp index 7dffc3e..9857f7b 100644 --- a/lib/MC/MCStreamer.cpp +++ b/lib/MC/MCStreamer.cpp @@ -21,10 +21,9 @@ #include <cstdlib> using namespace llvm; -MCStreamer::MCStreamer(MCContext &Ctx) - : Context(Ctx), EmitEHFrame(true), EmitDebugFrame(false), - CurrentW64UnwindInfo(0), LastSymbol(0), - AutoInitSections(false) { +MCStreamer::MCStreamer(StreamerKind Kind, MCContext &Ctx) + : Kind(Kind), Context(Ctx), EmitEHFrame(true), EmitDebugFrame(false), + CurrentW64UnwindInfo(0), LastSymbol(0), AutoInitSections(false) { const MCSection *section = NULL; SectionStack.push_back(std::make_pair(section, section)); } @@ -43,7 +42,7 @@ void MCStreamer::reset() { LastSymbol = 0; const MCSection *section = NULL; SectionStack.clear(); - SectionStack.push_back(std::make_pair(section, section)); + SectionStack.push_back(std::make_pair(section, section)); } const MCExpr *MCStreamer::BuildSymbolDiff(MCContext &Context, @@ -104,8 +103,8 @@ void MCStreamer::EmitIntValue(uint64_t Value, unsigned Size, /// EmitULEB128Value - Special case of EmitULEB128Value that avoids the /// client having to pass in a MCExpr for constant integers. -void MCStreamer::EmitULEB128IntValue(uint64_t Value, unsigned AddrSpace, - unsigned Padding) { +void MCStreamer::EmitULEB128IntValue(uint64_t Value, unsigned Padding, + unsigned AddrSpace) { SmallString<128> Tmp; raw_svector_ostream OSE(Tmp); encodeULEB128(Value, OSE, Padding); @@ -621,3 +620,8 @@ void MCStreamer::Finish() { FinishImpl(); } + +MCSymbolData &MCStreamer::getOrCreateSymbolData(MCSymbol *Symbol) { + report_fatal_error("Not supported!"); + return *(static_cast<MCSymbolData*> (NULL)); +} diff --git a/lib/MC/MachObjectWriter.cpp b/lib/MC/MachObjectWriter.cpp index 0098bea..a5ba3c3 100644 --- a/lib/MC/MachObjectWriter.cpp +++ b/lib/MC/MachObjectWriter.cpp @@ -376,6 +376,39 @@ void MachObjectWriter::WriteLinkeditLoadCommand(uint32_t Type, assert(OS.tell() - Start == macho::LinkeditLoadCommandSize); } +static unsigned ComputeLinkerOptionsLoadCommandSize( + const std::vector<std::string> &Options, bool is64Bit) +{ + unsigned Size = sizeof(macho::LinkerOptionsLoadCommand); + for (unsigned i = 0, e = Options.size(); i != e; ++i) + Size += Options[i].size() + 1; + return RoundUpToAlignment(Size, is64Bit ? 8 : 4); +} + +void MachObjectWriter::WriteLinkerOptionsLoadCommand( + const std::vector<std::string> &Options) +{ + unsigned Size = ComputeLinkerOptionsLoadCommandSize(Options, is64Bit()); + uint64_t Start = OS.tell(); + (void) Start; + + Write32(macho::LCT_LinkerOptions); + Write32(Size); + Write32(Options.size()); + uint64_t BytesWritten = sizeof(macho::LinkerOptionsLoadCommand); + for (unsigned i = 0, e = Options.size(); i != e; ++i) { + // Write each string, including the null byte. + const std::string &Option = Options[i]; + WriteBytes(Option.c_str(), Option.size() + 1); + BytesWritten += Option.size() + 1; + } + + // Pad to a multiple of the pointer size. + WriteBytes("", OffsetToAlignment(BytesWritten, is64Bit() ? 
8 : 4)); + + assert(OS.tell() - Start == Size); +} + void MachObjectWriter::RecordRelocation(const MCAssembler &Asm, const MCAsmLayout &Layout, @@ -693,6 +726,13 @@ void MachObjectWriter::WriteObject(MCAssembler &Asm, macho::SegmentLoadCommand64Size + NumSections * macho::Section64Size : macho::SegmentLoadCommand32Size + NumSections * macho::Section32Size; + // Add the data-in-code load command size, if used. + unsigned NumDataRegions = Asm.getDataRegions().size(); + if (NumDataRegions) { + ++NumLoadCommands; + LoadCommandsSize += macho::LinkeditLoadCommandSize; + } + // Add the symbol table load command sizes, if used. unsigned NumSymbols = LocalSymbolData.size() + ExternalSymbolData.size() + UndefinedSymbolData.size(); @@ -702,13 +742,15 @@ void MachObjectWriter::WriteObject(MCAssembler &Asm, macho::DysymtabLoadCommandSize); } - // Add the data-in-code load command size, if used. - unsigned NumDataRegions = Asm.getDataRegions().size(); - if (NumDataRegions) { + // Add the linker option load commands sizes. + const std::vector<std::vector<std::string> > &LinkerOptions = + Asm.getLinkerOptions(); + for (unsigned i = 0, e = LinkerOptions.size(); i != e; ++i) { ++NumLoadCommands; - LoadCommandsSize += macho::LinkeditLoadCommandSize; + LoadCommandsSize += ComputeLinkerOptionsLoadCommandSize(LinkerOptions[i], + is64Bit()); } - + // Compute the total size of the section data, as well as its file size and vm // size. uint64_t SectionDataStart = (is64Bit() ? macho::Header64Size : @@ -799,6 +841,11 @@ void MachObjectWriter::WriteObject(MCAssembler &Asm, IndirectSymbolOffset, NumIndirectSymbols); } + // Write the linker options load commands. + for (unsigned i = 0, e = LinkerOptions.size(); i != e; ++i) { + WriteLinkerOptionsLoadCommand(LinkerOptions[i]); + } + // Write the actual section data. for (MCAssembler::const_iterator it = Asm.begin(), ie = Asm.end(); it != ie; ++it) { diff --git a/lib/MC/WinCOFFObjectWriter.cpp b/lib/MC/WinCOFFObjectWriter.cpp index 01860c5..6dffed7 100644 --- a/lib/MC/WinCOFFObjectWriter.cpp +++ b/lib/MC/WinCOFFObjectWriter.cpp @@ -36,7 +36,7 @@ using namespace llvm; namespace { -typedef llvm::SmallString<COFF::NameSize> name; +typedef SmallString<COFF::NameSize> name; enum AuxiliaryType { ATFunctionDefinition, @@ -58,7 +58,7 @@ class COFFSymbol { public: COFF::symbol Data; - typedef llvm::SmallVector<AuxSymbol, 1> AuxiliarySymbols; + typedef SmallVector<AuxSymbol, 1> AuxiliarySymbols; name Name; int Index; @@ -69,7 +69,7 @@ public: MCSymbolData const *MCData; - COFFSymbol(llvm::StringRef name); + COFFSymbol(StringRef name); size_t size() const; void set_name_offset(uint32_t Offset); @@ -97,13 +97,13 @@ public: COFFSymbol *Symbol; relocations Relocations; - COFFSection(llvm::StringRef name); + COFFSection(StringRef name); static size_t size(); }; // This class holds the COFF string table. 
class StringTable { - typedef llvm::StringMap<size_t> map; + typedef StringMap<size_t> map; map Map; void update_length(); @@ -112,7 +112,7 @@ public: StringTable(); size_t size() const; - size_t insert(llvm::StringRef String); + size_t insert(StringRef String); }; class WinCOFFObjectWriter : public MCObjectWriter { @@ -144,10 +144,12 @@ public: COFFSection *createSection(StringRef Name); template <typename object_t, typename list_t> - object_t *createCOFFEntity(llvm::StringRef Name, list_t &List); + object_t *createCOFFEntity(StringRef Name, list_t &List); void DefineSection(MCSectionData const &SectionData); - void DefineSymbol(MCSymbolData const &SymbolData, MCAssembler &Assembler); + void DefineSymbol(MCSymbol const &Symbol, + MCSymbolData const &SymbolData, + MCAssembler &Assembler); void MakeSymbolReal(COFFSymbol &S, size_t Index); void MakeSectionReal(COFFSection &S, size_t Number); @@ -202,7 +204,7 @@ static inline void write_uint8_le(void *Data, uint8_t const &Value) { //------------------------------------------------------------------------------ // Symbol class implementation -COFFSymbol::COFFSymbol(llvm::StringRef name) +COFFSymbol::COFFSymbol(StringRef name) : Name(name.begin(), name.end()) , Other(NULL) , Section(NULL) @@ -254,7 +256,7 @@ bool COFFSymbol::should_keep() const { //------------------------------------------------------------------------------ // Section class implementation -COFFSection::COFFSection(llvm::StringRef name) +COFFSection::COFFSection(StringRef name) : Name(name) , MCData(NULL) , Symbol(NULL) { @@ -287,7 +289,7 @@ size_t StringTable::size() const { /// Add String to the table iff it is not already there. /// @returns the index into the string table where the string is now located. -size_t StringTable::insert(llvm::StringRef String) { +size_t StringTable::insert(StringRef String) { map::iterator i = Map.find(String); if (i != Map.end()) @@ -341,14 +343,14 @@ COFFSymbol *WinCOFFObjectWriter::GetOrCreateCOFFSymbol(const MCSymbol * Symbol){ return RetSymbol; } -COFFSection *WinCOFFObjectWriter::createSection(llvm::StringRef Name) { +COFFSection *WinCOFFObjectWriter::createSection(StringRef Name) { return createCOFFEntity<COFFSection>(Name, Sections); } /// A template used to lookup or create a symbol/section, and initialize it if /// needed. template <typename object_t, typename list_t> -object_t *WinCOFFObjectWriter::createCOFFEntity(llvm::StringRef Name, +object_t *WinCOFFObjectWriter::createCOFFEntity(StringRef Name, list_t &List) { object_t *Object = new object_t(Name); @@ -408,9 +410,10 @@ void WinCOFFObjectWriter::DefineSection(MCSectionData const &SectionData) { /// This function takes a section data object from the assembler /// and creates the associated COFF symbol staging object. 
-void WinCOFFObjectWriter::DefineSymbol(MCSymbolData const &SymbolData, +void WinCOFFObjectWriter::DefineSymbol(MCSymbol const &Symbol, + MCSymbolData const &SymbolData, MCAssembler &Assembler) { - COFFSymbol *coff_symbol = GetOrCreateCOFFSymbol(&SymbolData.getSymbol()); + COFFSymbol *coff_symbol = GetOrCreateCOFFSymbol(&Symbol); coff_symbol->Data.Type = (SymbolData.getFlags() & 0x0000FFFF) >> 0; coff_symbol->Data.StorageClass = (SymbolData.getFlags() & 0x00FF0000) >> 16; @@ -418,20 +421,17 @@ void WinCOFFObjectWriter::DefineSymbol(MCSymbolData const &SymbolData, if (SymbolData.getFlags() & COFF::SF_WeakExternal) { coff_symbol->Data.StorageClass = COFF::IMAGE_SYM_CLASS_WEAK_EXTERNAL; - if (SymbolData.getSymbol().isVariable()) { + if (Symbol.isVariable()) { coff_symbol->Data.StorageClass = COFF::IMAGE_SYM_CLASS_WEAK_EXTERNAL; - const MCExpr *Value = SymbolData.getSymbol().getVariableValue(); // FIXME: This assert message isn't very good. - assert(Value->getKind() == MCExpr::SymbolRef && + assert(Symbol.getVariableValue()->getKind() == MCExpr::SymbolRef && "Value must be a SymbolRef!"); - const MCSymbolRefExpr *SymbolRef = - static_cast<const MCSymbolRefExpr *>(Value); - coff_symbol->Other = GetOrCreateCOFFSymbol(&SymbolRef->getSymbol()); + coff_symbol->Other = GetOrCreateCOFFSymbol(&Symbol); } else { std::string WeakName = std::string(".weak.") - + SymbolData.getSymbol().getName().str() + + Symbol.getName().str() + ".default"; COFFSymbol *WeakDefault = createSymbol(WeakName); WeakDefault->Data.SectionNumber = COFF::IMAGE_SYM_ABSOLUTE; @@ -464,7 +464,7 @@ void WinCOFFObjectWriter::DefineSymbol(MCSymbolData const &SymbolData, // Bind internal COFF symbol to MC symbol. coff_symbol->MCData = &SymbolData; - SymbolMap[&SymbolData.getSymbol()] = coff_symbol; + SymbolMap[&Symbol] = coff_symbol; } /// making a section real involves assigning it a number and putting @@ -619,8 +619,11 @@ void WinCOFFObjectWriter::ExecutePostLayoutBinding(MCAssembler &Asm, for (MCAssembler::const_symbol_iterator i = Asm.symbol_begin(), e = Asm.symbol_end(); i != e; i++) { - if (ExportSymbol(*i, Asm)) - DefineSymbol(*i, Asm); + if (ExportSymbol(*i, Asm)) { + const MCSymbol &Alias = i->getSymbol(); + const MCSymbol &Symbol = Alias.AliasedSymbol(); + DefineSymbol(Alias, Asm.getSymbolData(Symbol), Asm); + } } } diff --git a/lib/MC/WinCOFFStreamer.cpp b/lib/MC/WinCOFFStreamer.cpp index 5489ef8..75f343c 100644 --- a/lib/MC/WinCOFFStreamer.cpp +++ b/lib/MC/WinCOFFStreamer.cpp @@ -50,11 +50,11 @@ public: // MCStreamer interface virtual void InitSections(); + virtual void InitToTextSection(); virtual void EmitLabel(MCSymbol *Symbol); virtual void EmitDebugLabel(MCSymbol *Symbol); virtual void EmitAssemblerFlag(MCAssemblerFlag Flag); virtual void EmitThumbFunc(MCSymbol *Func); - virtual void EmitAssignment(MCSymbol *Symbol, const MCExpr *Value); virtual void EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute); virtual void EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue); virtual void BeginCOFFSymbolDef(MCSymbol const *Symbol); @@ -75,6 +75,10 @@ public: virtual void EmitWin64EHHandlerData(); virtual void FinishImpl(); + static bool classof(const MCStreamer *S) { + return S->getKind() == SK_WinCOFFStreamer; + } + private: virtual void EmitInstToData(const MCInst &Inst) { MCDataFragment *DF = getOrCreateDataFragment(); @@ -128,13 +132,10 @@ private: }; } // end anonymous namespace.
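The classof hook added above, together with the SK_WinCOFFStreamer kind tag passed to the base-class constructor just below, is LLVM's usual hand-rolled RTTI: isa<> and dyn_cast<> dispatch on a stored kind value instead of C++ RTTI. A reduced sketch of the idiom, with hypothetical type names:

struct Streamer {
  enum StreamerKind { SK_Base, SK_WinCOFFStreamer };
  explicit Streamer(StreamerKind K) : Kind(K) {}
  StreamerKind getKind() const { return Kind; }
  virtual ~Streamer() {}
private:
  StreamerKind Kind;
};

struct WinCOFF : Streamer {
  WinCOFF() : Streamer(SK_WinCOFFStreamer) {}
  // llvm::isa<WinCOFF>(S) and llvm::dyn_cast<WinCOFF>(S) reduce to
  // calling this predicate on the stored kind tag.
  static bool classof(const Streamer *S) {
    return S->getKind() == SK_WinCOFFStreamer;
  }
};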
-WinCOFFStreamer::WinCOFFStreamer(MCContext &Context, - MCAsmBackend &MAB, - MCCodeEmitter &CE, - raw_ostream &OS) - : MCObjectStreamer(Context, MAB, OS, &CE) - , CurSymbol(NULL) { -} +WinCOFFStreamer::WinCOFFStreamer(MCContext &Context, MCAsmBackend &MAB, + MCCodeEmitter &CE, raw_ostream &OS) + : MCObjectStreamer(SK_WinCOFFStreamer, Context, MAB, OS, &CE), + CurSymbol(NULL) {} void WinCOFFStreamer::AddCommonSymbol(MCSymbol *Symbol, uint64_t Size, unsigned ByteAlignment, bool External) { @@ -173,6 +174,10 @@ void WinCOFFStreamer::AddCommonSymbol(MCSymbol *Symbol, uint64_t Size, // MCStreamer interface +void WinCOFFStreamer::InitToTextSection() { + SetSectionText(); +} + void WinCOFFStreamer::InitSections() { SetSectionText(); SetSectionData(); @@ -196,44 +201,6 @@ void WinCOFFStreamer::EmitThumbFunc(MCSymbol *Func) { llvm_unreachable("not implemented"); } -void WinCOFFStreamer::EmitAssignment(MCSymbol *Symbol, const MCExpr *Value) { - assert((Symbol->isInSection() - ? Symbol->getSection().getVariant() == MCSection::SV_COFF - : true) && "Got non COFF section in the COFF backend!"); - // FIXME: This is all very ugly and depressing. What needs to happen here - // depends on quite a few things that are all part of relaxation, which we - // don't really even do. - - if (Value->getKind() != MCExpr::SymbolRef) { - MCObjectStreamer::EmitAssignment(Symbol, Value); - } else { - // FIXME: This is a horrible way to do this :(. This should really be - // handled after we are done with the MC* objects and immediately before - // writing out the object file when we know exactly what the symbol should - // look like in the coff symbol table. I'm not doing that now because the - // COFF object writer doesn't have a clearly defined separation between MC - // data structures, the object writers data structures, and the raw, POD, - // data structures that get written to disk. - - // Copy over the aliased data. - MCSymbolData &SD = getAssembler().getOrCreateSymbolData(*Symbol); - const MCSymbolData &RealSD = getAssembler().getOrCreateSymbolData( - dyn_cast<const MCSymbolRefExpr>(Value)->getSymbol()); - - // FIXME: This is particularly nasty because it breaks as soon as any data - // members of MCSymbolData change. - SD.CommonAlign = RealSD.CommonAlign; - SD.CommonSize = RealSD.CommonSize; - SD.Flags = RealSD.Flags; - SD.Fragment = RealSD.Fragment; - SD.Index = RealSD.Index; - SD.IsExternal = RealSD.IsExternal; - SD.IsPrivateExtern = RealSD.IsPrivateExtern; - SD.Offset = RealSD.Offset; - SD.SymbolSize = RealSD.SymbolSize; - } -} - void WinCOFFStreamer::EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute) { assert(Symbol && "Symbol must be non-null!"); diff --git a/lib/Object/Archive.cpp b/lib/Object/Archive.cpp index dafcb72..0e13d05 100644 --- a/lib/Object/Archive.cpp +++ b/lib/Object/Archive.cpp @@ -14,7 +14,6 @@ #include "llvm/Object/Archive.h" #include "llvm/ADT/APInt.h" #include "llvm/Support/Endian.h" -#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MemoryBuffer.h" using namespace llvm; @@ -22,44 +21,6 @@ using namespace object; static const char *Magic = "!<arch>\n"; -namespace { -struct ArchiveMemberHeader { - char Name[16]; - char LastModified[12]; - char UID[6]; - char GID[6]; - char AccessMode[8]; - char Size[10]; ///< Size of data, not including header or padding. - char Terminator[2]; - - ///! Get the name without looking up long names. 
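The archive reader below parses the classic ar(1) member header, whose fields are fixed-width, space-padded ASCII decimal; that is why the new code rtrims spaces before calling getAsInteger. A standalone sketch of reading such a field (the helper is ours, not the patch's):

#include <cstddef>
#include <stdint.h>
#include <string>

// Parse a fixed-width, space-padded decimal field such as the 10-byte
// Size field of an ar member header. Returns false if the field is
// empty or contains a non-digit.
static bool parseDecimalField(const char *Field, std::size_t Width,
                              uint64_t &Out) {
  std::string S(Field, Width);
  std::string::size_type End = S.find_last_not_of(' ');
  if (End == std::string::npos)
    return false;                 // all spaces: empty field
  Out = 0;
  for (std::string::size_type I = 0; I <= End; ++I) {
    if (S[I] < '0' || S[I] > '9')
      return false;
    Out = Out * 10 + (uint64_t)(S[I] - '0');
  }
  return true;
}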
- StringRef getName() const { - char EndCond; - if (Name[0] == '/' || Name[0] == '#') - EndCond = ' '; - else - EndCond = '/'; - StringRef::size_type end = StringRef(Name, sizeof(Name)).find(EndCond); - if (end == StringRef::npos) - end = sizeof(Name); - assert(end <= sizeof(Name) && end > 0); - // Don't include the EndCond if there is one. - return StringRef(Name, end); - } - - uint64_t getSize() const { - APInt ret; - StringRef(Size, sizeof(Size)).getAsInteger(10, ret); - return ret.getZExtValue(); - } -}; -} - -static const ArchiveMemberHeader *ToHeader(const char *base) { - return reinterpret_cast<const ArchiveMemberHeader *>(base); -} - - static bool isInternalMember(const ArchiveMemberHeader &amh) { static const char *const internals[] = { "/", @@ -77,25 +38,6 @@ static bool isInternalMember(const ArchiveMemberHeader &amh) { void Archive::anchor() { } -Archive::Child Archive::Child::getNext() const { - size_t SpaceToSkip = sizeof(ArchiveMemberHeader) + - ToHeader(Data.data())->getSize(); - // If it's odd, add 1 to make it even. - if (SpaceToSkip & 1) - ++SpaceToSkip; - - const char *NextLoc = Data.data() + SpaceToSkip; - - // Check to see if this is past the end of the archive. - if (NextLoc >= Parent->Data->getBufferEnd()) - return Child(Parent, StringRef(0, 0)); - - size_t NextSize = sizeof(ArchiveMemberHeader) + - ToHeader(NextLoc)->getSize(); - - return Child(Parent, StringRef(NextLoc, NextSize)); -} - error_code Archive::Child::getName(StringRef &Result) const { StringRef name = ToHeader(Data.data())->getName(); // Check if it's a special name. @@ -110,11 +52,12 @@ error_code Archive::Child::getName(StringRef &Result) const { } // It's a long name. // Get the offset. - APInt offset; - name.substr(1).getAsInteger(10, offset); + std::size_t offset; + if (name.substr(1).rtrim(" ").getAsInteger(10, offset)) + llvm_unreachable("Long name offset is not an integer"); const char *addr = Parent->StringTable->Data.begin() + sizeof(ArchiveMemberHeader) - + offset.getZExtValue(); + + offset; // Verify it. if (Parent->StringTable == Parent->end_children() || addr < (Parent->StringTable->Data.begin() @@ -133,9 +76,10 @@ error_code Archive::Child::getName(StringRef &Result) const { } return object_error::success; } else if (name.startswith("#1/")) { - APInt name_size; - name.substr(3).getAsInteger(10, name_size); - Result = Data.substr(0, name_size.getZExtValue()); + uint64_t name_size; + if (name.substr(3).rtrim(" ").getAsInteger(10, name_size)) + llvm_unreachable("Long name length is not an integer"); + Result = Data.substr(sizeof(ArchiveMemberHeader), name_size); return object_error::success; } // It's a simple name. @@ -146,36 +90,12 @@ error_code Archive::Child::getName(StringRef &Result) const { return object_error::success; } -uint64_t Archive::Child::getSize() const { - uint64_t size = ToHeader(Data.data())->getSize(); - // Don't include attached name.
- StringRef name = ToHeader(Data.data())->getName(); - if (name.startswith("#1/")) { - APInt name_size; - name.substr(3).getAsInteger(10, name_size); - size -= name_size.getZExtValue(); - } - return size; -} - -MemoryBuffer *Archive::Child::getBuffer() const { - StringRef name; - if (getName(name)) return NULL; - int size = sizeof(ArchiveMemberHeader); - if (name.startswith("#1/")) { - APInt name_size; - name.substr(3).getAsInteger(10, name_size); - size += name_size.getZExtValue(); - } - return MemoryBuffer::getMemBuffer(Data.substr(size, getSize()), - name, - false); -} - error_code Archive::Child::getAsBinary(OwningPtr<Binary> &Result) const { OwningPtr<Binary> ret; - if (error_code ec = - createBinary(getBuffer(), ret)) + OwningPtr<MemoryBuffer> Buff; + if (error_code ec = getMemoryBuffer(Buff)) + return ec; + if (error_code ec = createBinary(Buff.take(), ret)) return ec; Result.swap(ret); return object_error::success; @@ -218,6 +138,10 @@ Archive::Archive(MemoryBuffer *source, error_code &ec) SymbolTable = i; StringTable = e; if (i != e) ++i; + if (i == e) { + ec = object_error::parse_failed; + return; + } if ((ec = i->getName(name))) return; if (name[0] != '/') { @@ -260,13 +184,12 @@ Archive::child_iterator Archive::end_children() const { } error_code Archive::Symbol::getName(StringRef &Result) const { - Result = - StringRef(Parent->SymbolTable->getBuffer()->getBufferStart() + StringIndex); + Result = StringRef(Parent->SymbolTable->getBuffer().begin() + StringIndex); return object_error::success; } error_code Archive::Symbol::getMember(child_iterator &Result) const { - const char *Buf = Parent->SymbolTable->getBuffer()->getBufferStart(); + const char *Buf = Parent->SymbolTable->getBuffer().begin(); const char *Offsets = Buf + 4; uint32_t Offset = 0; if (Parent->kind() == K_GNU) { @@ -316,13 +239,13 @@ Archive::Symbol Archive::Symbol::getNext() const { Symbol t(*this); // Go to one past next null. t.StringIndex = - Parent->SymbolTable->getBuffer()->getBuffer().find('\0', t.StringIndex) + 1; + Parent->SymbolTable->getBuffer().find('\0', t.StringIndex) + 1; ++t.SymbolIndex; return t; } Archive::symbol_iterator Archive::begin_symbols() const { - const char *buf = SymbolTable->getBuffer()->getBufferStart(); + const char *buf = SymbolTable->getBuffer().begin(); if (kind() == K_GNU) { uint32_t symbol_count = 0; symbol_count = *reinterpret_cast<const support::ubig32_t*>(buf); @@ -337,13 +260,12 @@ Archive::symbol_iterator Archive::begin_symbols() const { symbol_count = *reinterpret_cast<const support::ulittle32_t*>(buf); buf += 4 + (symbol_count * 2); // Skip indices. 
} - uint32_t string_start_offset = - buf - SymbolTable->getBuffer()->getBufferStart(); + uint32_t string_start_offset = buf - SymbolTable->getBuffer().begin(); return symbol_iterator(Symbol(this, 0, string_start_offset)); } Archive::symbol_iterator Archive::end_symbols() const { - const char *buf = SymbolTable->getBuffer()->getBufferStart(); + const char *buf = SymbolTable->getBuffer().begin(); uint32_t symbol_count = 0; if (kind() == K_GNU) { symbol_count = *reinterpret_cast<const support::ubig32_t*>(buf); diff --git a/lib/Object/COFFObjectFile.cpp b/lib/Object/COFFObjectFile.cpp index b60a2da..b0b84fc 100644 --- a/lib/Object/COFFObjectFile.cpp +++ b/lib/Object/COFFObjectFile.cpp @@ -269,7 +269,7 @@ error_code COFFObjectFile::getSymbolNMTypeChar(DataRefImpl Symb, } if (symb->StorageClass == COFF::IMAGE_SYM_CLASS_EXTERNAL) - ret = ::toupper(ret); + ret = ::toupper(static_cast<unsigned char>(ret)); Result = ret; return object_error::success; diff --git a/lib/Object/ELFObjectFile.cpp b/lib/Object/ELFObjectFile.cpp index 8d5ac63..62626d7 100644 --- a/lib/Object/ELFObjectFile.cpp +++ b/lib/Object/ELFObjectFile.cpp @@ -29,31 +29,43 @@ ObjectFile *ObjectFile::createELFObjectFile(MemoryBuffer *Object) { 1ULL << CountTrailingZeros_64(uintptr_t(Object->getBufferStart())); if (Ident.first == ELF::ELFCLASS32 && Ident.second == ELF::ELFDATA2LSB) +#if !LLVM_IS_UNALIGNED_ACCESS_FAST if (MaxAlignment >= 4) - return new ELFObjectFile<support::little, 4, false>(Object, ec); - else if (MaxAlignment >= 2) - return new ELFObjectFile<support::little, 2, false>(Object, ec); + return new ELFObjectFile<ELFType<support::little, 4, false> >(Object, ec); + else +#endif + if (MaxAlignment >= 2) + return new ELFObjectFile<ELFType<support::little, 2, false> >(Object, ec); else llvm_unreachable("Invalid alignment for ELF file!"); else if (Ident.first == ELF::ELFCLASS32 && Ident.second == ELF::ELFDATA2MSB) +#if !LLVM_IS_UNALIGNED_ACCESS_FAST if (MaxAlignment >= 4) - return new ELFObjectFile<support::big, 4, false>(Object, ec); - else if (MaxAlignment >= 2) - return new ELFObjectFile<support::big, 2, false>(Object, ec); + return new ELFObjectFile<ELFType<support::big, 4, false> >(Object, ec); + else +#endif + if (MaxAlignment >= 2) + return new ELFObjectFile<ELFType<support::big, 2, false> >(Object, ec); else llvm_unreachable("Invalid alignment for ELF file!"); else if (Ident.first == ELF::ELFCLASS64 && Ident.second == ELF::ELFDATA2MSB) +#if !LLVM_IS_UNALIGNED_ACCESS_FAST if (MaxAlignment >= 8) - return new ELFObjectFile<support::big, 8, true>(Object, ec); - else if (MaxAlignment >= 2) - return new ELFObjectFile<support::big, 2, true>(Object, ec); + return new ELFObjectFile<ELFType<support::big, 8, true> >(Object, ec); + else +#endif + if (MaxAlignment >= 2) + return new ELFObjectFile<ELFType<support::big, 2, true> >(Object, ec); else llvm_unreachable("Invalid alignment for ELF file!"); else if (Ident.first == ELF::ELFCLASS64 && Ident.second == ELF::ELFDATA2LSB) { +#if !LLVM_IS_UNALIGNED_ACCESS_FAST if (MaxAlignment >= 8) - return new ELFObjectFile<support::little, 8, true>(Object, ec); - else if (MaxAlignment >= 2) - return new ELFObjectFile<support::little, 2, true>(Object, ec); + return new ELFObjectFile<ELFType<support::little, 8, true> >(Object, ec); + else +#endif + if (MaxAlignment >= 2) + return new ELFObjectFile<ELFType<support::little, 2, true> >(Object, ec); else llvm_unreachable("Invalid alignment for ELF file!"); } diff --git a/lib/Object/MachOObject.cpp b/lib/Object/MachOObject.cpp index a64db1c..c9c341a 
100644 --- a/lib/Object/MachOObject.cpp +++ b/lib/Object/MachOObject.cpp @@ -44,7 +44,8 @@ static void ReadInMemoryStruct(const MachOObject &MOO, } // Check whether we can return a direct pointer. - struct_type *Ptr = (struct_type *) (Buffer.data() + Base); + struct_type *Ptr = reinterpret_cast<struct_type *>( + const_cast<char *>(Buffer.data() + Base)); if (!MOO.isSwappedEndian()) { Res = Ptr; return; @@ -258,6 +259,17 @@ void MachOObject::ReadLinkeditDataLoadCommand(const LoadCommandInfo &LCI, } template<> +void SwapStruct(macho::LinkerOptionsLoadCommand &Value) { + SwapValue(Value.Type); + SwapValue(Value.Size); + SwapValue(Value.Count); +} +void MachOObject::ReadLinkerOptionsLoadCommand(const LoadCommandInfo &LCI, + InMemoryStruct<macho::LinkerOptionsLoadCommand> &Res) const { + ReadInMemoryStruct(*this, Buffer->getBuffer(), LCI.Offset, Res); +} + +template<> void SwapStruct(macho::IndirectSymbolTableEntry &Value) { SwapValue(Value.Index); } diff --git a/lib/Object/MachOObjectFile.cpp b/lib/Object/MachOObjectFile.cpp index 0ad8893..eb1690e 100644 --- a/lib/Object/MachOObjectFile.cpp +++ b/lib/Object/MachOObjectFile.cpp @@ -273,7 +273,7 @@ error_code MachOObjectFile::getSymbolNMTypeChar(DataRefImpl DRI, } if (Flags & (macho::STF_External | macho::STF_PrivateExtern)) - Char = toupper(Char); + Char = toupper(static_cast<unsigned char>(Char)); Result = Char; return object_error::success; } @@ -1076,6 +1076,7 @@ error_code MachOObjectFile::getRelocationValueString(DataRefImpl Rel, printRelocationTargetName(RENext, fmt); fmt << "-"; printRelocationTargetName(RE, fmt); + break; } case macho::RIT_X86_64_TLV: printRelocationTargetName(RE, fmt); diff --git a/lib/Object/ObjectFile.cpp b/lib/Object/ObjectFile.cpp index b14df9a..860c87b 100644 --- a/lib/Object/ObjectFile.cpp +++ b/lib/Object/ObjectFile.cpp @@ -33,6 +33,8 @@ ObjectFile *ObjectFile::createObjectFile(MemoryBuffer *Object) { sys::LLVMFileType type = sys::IdentifyFileType(Object->getBufferStart(), static_cast<unsigned>(Object->getBufferSize())); switch (type) { + case sys::Unknown_FileType: + return 0; case sys::ELF_Relocatable_FileType: case sys::ELF_Executable_FileType: case sys::ELF_SharedObject_FileType: @@ -52,7 +54,7 @@ ObjectFile *ObjectFile::createObjectFile(MemoryBuffer *Object) { case sys::COFF_FileType: return createCOFFObjectFile(Object); default: - llvm_unreachable("Unknown Object File Type"); + llvm_unreachable("Unexpected Object File Type"); } } diff --git a/lib/Support/APFloat.cpp b/lib/Support/APFloat.cpp index 0e3c619..5b68fbb 100644 --- a/lib/Support/APFloat.cpp +++ b/lib/Support/APFloat.cpp @@ -16,6 +16,7 @@ #include "llvm/ADT/APSInt.h" #include "llvm/ADT/FoldingSet.h" #include "llvm/ADT/Hashing.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" @@ -101,26 +102,6 @@ decDigitValue(unsigned int c) return c - '0'; } -static unsigned int -hexDigitValue(unsigned int c) -{ - unsigned int r; - - r = c - '0'; - if (r <= 9) - return r; - - r = c - 'A'; - if (r <= 5) - return r + 10; - - r = c - 'a'; - if (r <= 5) - return r + 10; - - return -1U; -} - /* Return the value of a decimal exponent of the form [+-]ddddddd. @@ -1932,6 +1913,12 @@ APFloat::convert(const fltSemantics &toSemantics, *losesInfo = (fs != opOK); } else if (category == fcNaN) { *losesInfo = lostFraction != lfExactlyZero || X86SpecialNan; + + // For x87 extended precision, we want to make a NaN, not a special NaN if + // the input wasn't special either. 
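Some background for the change that follows: unlike IEEE single and double, the 80-bit x87 format stores the significand's leading integer bit explicitly, and a NaN pattern with that bit clear is a "pseudo-NaN" that the FPU treats as an invalid operand. Setting bit precision - 1 (precision is 64 for x87DoubleExtended), as the tcSetBit call below does, forces a well-formed NaN. In isolation:

#include <stdint.h>

// Force the explicit integer bit (bit 63) of an x87 extended-precision
// significand so the value encodes a valid NaN rather than a pseudo-NaN.
// This mirrors APInt::tcSetBit(significandParts(), precision - 1) below.
static uint64_t makeValidX87NaNSignificand(uint64_t Significand) {
  return Significand | (1ULL << 63);
}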
+ if (!X86SpecialNan && semantics == &APFloat::x87DoubleExtended) + APInt::tcSetBit(significandParts(), semantics->precision - 1); + // gcc forces the Quiet bit on, which means (float)(double)(float_sNan) // does not give you back the same bits. This is dubious, and we // don't currently do it. You're really supposed to get @@ -3032,7 +3019,7 @@ APFloat::initFromPPCDoubleDoubleAPInt(const APInt &api) // Unless we have a special case, add in second double. if (category == fcNormal) { - APFloat v(APInt(64, i2)); + APFloat v(IEEEdouble, APInt(64, i2)); fs = v.convert(PPCDoubleDouble, rmNearestTiesToEven, &losesInfo); assert(fs == opOK && !losesInfo); (void)fs; @@ -3185,27 +3172,43 @@ APFloat::initFromHalfAPInt(const APInt & api) /// isIEEE argument distinguishes between PPC128 and IEEE128 (not meaningful /// when the size is anything else). void -APFloat::initFromAPInt(const APInt& api, bool isIEEE) +APFloat::initFromAPInt(const fltSemantics* Sem, const APInt& api) { - if (api.getBitWidth() == 16) + if (Sem == &IEEEhalf) return initFromHalfAPInt(api); - else if (api.getBitWidth() == 32) + if (Sem == &IEEEsingle) return initFromFloatAPInt(api); - else if (api.getBitWidth()==64) + if (Sem == &IEEEdouble) return initFromDoubleAPInt(api); - else if (api.getBitWidth()==80) + if (Sem == &x87DoubleExtended) return initFromF80LongDoubleAPInt(api); - else if (api.getBitWidth()==128) - return (isIEEE ? - initFromQuadrupleAPInt(api) : initFromPPCDoubleDoubleAPInt(api)); - else - llvm_unreachable(0); + if (Sem == &IEEEquad) + return initFromQuadrupleAPInt(api); + if (Sem == &PPCDoubleDouble) + return initFromPPCDoubleDoubleAPInt(api); + + llvm_unreachable(0); } APFloat APFloat::getAllOnesValue(unsigned BitWidth, bool isIEEE) { - return APFloat(APInt::getAllOnesValue(BitWidth), isIEEE); + switch (BitWidth) { + case 16: + return APFloat(IEEEhalf, APInt::getAllOnesValue(BitWidth)); + case 32: + return APFloat(IEEEsingle, APInt::getAllOnesValue(BitWidth)); + case 64: + return APFloat(IEEEdouble, APInt::getAllOnesValue(BitWidth)); + case 80: + return APFloat(x87DoubleExtended, APInt::getAllOnesValue(BitWidth)); + case 128: + if (isIEEE) + return APFloat(IEEEquad, APInt::getAllOnesValue(BitWidth)); + return APFloat(PPCDoubleDouble, APInt::getAllOnesValue(BitWidth)); + default: + llvm_unreachable("Unknown floating bit width"); + } } APFloat APFloat::getLargest(const fltSemantics &Sem, bool Negative) { @@ -3263,16 +3266,16 @@ APFloat APFloat::getSmallestNormalized(const fltSemantics &Sem, bool Negative) { return Val; } -APFloat::APFloat(const APInt& api, bool isIEEE) { - initFromAPInt(api, isIEEE); +APFloat::APFloat(const fltSemantics &Sem, const APInt &API) { + initFromAPInt(&Sem, API); } APFloat::APFloat(float f) { - initFromAPInt(APInt::floatToBits(f)); + initFromAPInt(&IEEEsingle, APInt::floatToBits(f)); } APFloat::APFloat(double d) { - initFromAPInt(APInt::doubleToBits(d)); + initFromAPInt(&IEEEdouble, APInt::doubleToBits(d)); } namespace { @@ -3448,7 +3451,7 @@ void APFloat::toString(SmallVectorImpl<char> &Str, AdjustToPrecision(significand, exp, FormatPrecision); - llvm::SmallVector<char, 256> buffer; + SmallVector<char, 256> buffer; // Fill the buffer. 
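A note on the constructor migration above: a bit pattern alone can no longer pick its own semantics, because a 128-bit payload could mean either IEEEquad or PPCDoubleDouble, as the getAllOnesValue switch makes explicit. Plausible call-site usage, assuming the static fltSemantics members referenced in the patch:

#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
using namespace llvm;

static void example() {
  // Reinterpret a host double's bits under explicit IEEE-double semantics.
  APFloat D(APFloat::IEEEdouble, APInt::doubleToBits(1.5));
  // At 128 bits the width is ambiguous, so the caller must disambiguate.
  APFloat Q(APFloat::IEEEquad, APInt::getAllOnesValue(128));
  (void)D; (void)Q;
}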
unsigned precision = significand.getBitWidth(); diff --git a/lib/Support/APInt.cpp b/lib/Support/APInt.cpp index 61e503b..07cb057 100644 --- a/lib/Support/APInt.cpp +++ b/lib/Support/APInt.cpp @@ -1876,6 +1876,17 @@ APInt APInt::udiv(const APInt& RHS) const { return Quotient; } +APInt APInt::sdiv(const APInt &RHS) const { + if (isNegative()) { + if (RHS.isNegative()) + return (-(*this)).udiv(-RHS); + return -((-(*this)).udiv(RHS)); + } + if (RHS.isNegative()) + return -(this->udiv(-RHS)); + return this->udiv(RHS); +} + APInt APInt::urem(const APInt& RHS) const { assert(BitWidth == RHS.BitWidth && "Bit widths must be the same"); if (isSingleWord()) { @@ -1913,6 +1924,17 @@ APInt APInt::urem(const APInt& RHS) const { return Remainder; } +APInt APInt::srem(const APInt &RHS) const { + if (isNegative()) { + if (RHS.isNegative()) + return -((-(*this)).urem(-RHS)); + return -((-(*this)).urem(RHS)); + } + if (RHS.isNegative()) + return this->urem(-RHS); + return this->urem(RHS); +} + void APInt::udivrem(const APInt &LHS, const APInt &RHS, APInt &Quotient, APInt &Remainder) { // Get some size facts about the dividend and divisor @@ -1953,6 +1975,24 @@ void APInt::udivrem(const APInt &LHS, const APInt &RHS, divide(LHS, lhsWords, RHS, rhsWords, &Quotient, &Remainder); } +void APInt::sdivrem(const APInt &LHS, const APInt &RHS, + APInt &Quotient, APInt &Remainder) { + if (LHS.isNegative()) { + if (RHS.isNegative()) + APInt::udivrem(-LHS, -RHS, Quotient, Remainder); + else { + APInt::udivrem(-LHS, RHS, Quotient, Remainder); + Quotient = -Quotient; + } + Remainder = -Remainder; + } else if (RHS.isNegative()) { + APInt::udivrem(LHS, -RHS, Quotient, Remainder); + Quotient = -Quotient; + } else { + APInt::udivrem(LHS, RHS, Quotient, Remainder); + } +} + APInt APInt::sadd_ov(const APInt &RHS, bool &Overflow) const { APInt Res = *this+RHS; Overflow = isNonNegative() == RHS.isNonNegative() && diff --git a/lib/Support/Allocator.cpp b/lib/Support/Allocator.cpp index 28f4e64..3c4191b 100644 --- a/lib/Support/Allocator.cpp +++ b/lib/Support/Allocator.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Support/Allocator.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/DataTypes.h" #include "llvm/Support/Memory.h" #include "llvm/Support/Recycler.h" @@ -82,6 +83,7 @@ void BumpPtrAllocator::Reset() { CurSlab->NextPtr = 0; CurPtr = (char*)(CurSlab + 1); End = ((char*)CurSlab) + CurSlab->Size; + BytesAllocated = 0; } /// Allocate - Allocate space at the specified alignment. @@ -102,6 +104,10 @@ void *BumpPtrAllocator::Allocate(size_t Size, size_t Alignment) { // Check if we can hold it. if (Ptr + Size <= End) { CurPtr = Ptr + Size; + // Update the allocation point of this memory block in MemorySanitizer. + // Without this, MemorySanitizer messages for values originated from here + // will point to the allocation of the entire slab. 
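The signed division and remainder routines added to APInt.cpp above reduce everything to unsigned division: with truncation toward zero, the quotient takes the XOR of the operand signs and the remainder takes the sign of the dividend. A self-contained check of those identities against C's built-in semantics:

#include <assert.h>
#include <stdint.h>

// Verify the sign rules used by APInt::sdivrem: divide the magnitudes
// unsigned, negate the quotient iff the signs differ, and negate the
// remainder iff the dividend is negative. For example,
// checkSDivRem(-7, 2) passes with Q == -3 and R == -1.
// (Assumes B != 0 and A != INT64_MIN so every expression is defined.)
static void checkSDivRem(int64_t A, int64_t B) {
  uint64_t UA = A < 0 ? 0 - (uint64_t)A : (uint64_t)A;
  uint64_t UB = B < 0 ? 0 - (uint64_t)B : (uint64_t)B;
  int64_t Q = (int64_t)(UA / UB);
  int64_t R = (int64_t)(UA % UB);
  if ((A < 0) != (B < 0)) Q = -Q;
  if (A < 0) R = -R;
  assert(Q == A / B && R == A % B);
}

Returning to Allocator.cpp: the __msan_allocated_memory calls below record the true start of each allocation, so MemorySanitizer reports point at the Allocate call rather than at the enclosing slab.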
+ __msan_allocated_memory(Ptr, Size); return Ptr; } @@ -117,6 +123,7 @@ void *BumpPtrAllocator::Allocate(size_t Size, size_t Alignment) { Ptr = AlignPtr((char*)(NewSlab + 1), Alignment); assert((uintptr_t)Ptr + Size <= (uintptr_t)NewSlab + NewSlab->Size); + __msan_allocated_memory(Ptr, Size); return Ptr; } @@ -125,6 +132,7 @@ void *BumpPtrAllocator::Allocate(size_t Size, size_t Alignment) { Ptr = AlignPtr(CurPtr, Alignment); CurPtr = Ptr + Size; assert(CurPtr <= End && "Unable to allocate memory!"); + __msan_allocated_memory(Ptr, Size); return Ptr; } diff --git a/lib/Support/CMakeLists.txt b/lib/Support/CMakeLists.txt index f294a17..5ba69fc 100644 --- a/lib/Support/CMakeLists.txt +++ b/lib/Support/CMakeLists.txt @@ -8,6 +8,8 @@ add_llvm_library(LLVMSupport circular_raw_ostream.cpp CommandLine.cpp ConstantRange.cpp + ConvertUTF.c + ConvertUTFWrapper.cpp CrashRecoveryContext.cpp DataExtractor.cpp DataStream.cpp diff --git a/lib/Support/ConvertUTF.c b/lib/Support/ConvertUTF.c new file mode 100644 index 0000000..23f17ca --- /dev/null +++ b/lib/Support/ConvertUTF.c @@ -0,0 +1,571 @@ +/*===--- ConvertUTF.c - Universal Character Names conversions ---------------=== + * + * The LLVM Compiler Infrastructure + * + * This file is distributed under the University of Illinois Open Source + * License. See LICENSE.TXT for details. + * + *===------------------------------------------------------------------------=*/ +/* + * Copyright 2001-2004 Unicode, Inc. + * + * Disclaimer + * + * This source code is provided as is by Unicode, Inc. No claims are + * made as to fitness for any particular purpose. No warranties of any + * kind are expressed or implied. The recipient agrees to determine + * applicability of information provided. If this file has been + * purchased on magnetic or optical media from Unicode, Inc., the + * sole remedy for any claim will be exchange of defective media + * within 90 days of receipt. + * + * Limitations on Rights to Redistribute This Code + * + * Unicode, Inc. hereby grants the right to freely use the information + * supplied in this file in the creation of products supporting the + * Unicode Standard, and to make copies of this file in any form + * for internal or external distribution as long as this notice + * remains attached. + */ + +/* --------------------------------------------------------------------- + + Conversions between UTF32, UTF-16, and UTF-8. Source code file. + Author: Mark E. Davis, 1994. + Rev History: Rick McGowan, fixes & updates May 2001. + Sept 2001: fixed const & error conditions per + mods suggested by S. Parent & A. Lillich. + June 2002: Tim Dodd added detection and handling of incomplete + source sequences, enhanced error detection, added casts + to eliminate compiler warnings. + July 2003: slight mods to back out aggressive FFFE detection. + Jan 2004: updated switches in from-UTF8 conversions. + Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions. + + See the header file "ConvertUTF.h" for complete documentation. 
+ +------------------------------------------------------------------------ */ + + +#include "llvm/Support/ConvertUTF.h" +#ifdef CVTUTF_DEBUG +#include <stdio.h> +#endif + +static const int halfShift = 10; /* used for shifting by 10 bits */ + +static const UTF32 halfBase = 0x0010000UL; +static const UTF32 halfMask = 0x3FFUL; + +#define UNI_SUR_HIGH_START (UTF32)0xD800 +#define UNI_SUR_HIGH_END (UTF32)0xDBFF +#define UNI_SUR_LOW_START (UTF32)0xDC00 +#define UNI_SUR_LOW_END (UTF32)0xDFFF +#define false 0 +#define true 1 + +/* --------------------------------------------------------------------- */ + +/* + * Index into the table below with the first byte of a UTF-8 sequence to + * get the number of trailing bytes that are supposed to follow it. + * Note that *legal* UTF-8 values can't have 4 or 5-bytes. The table is + * left as-is for anyone who may want to do such conversion, which was + * allowed in earlier algorithms. + */ +static const char trailingBytesForUTF8[256] = { + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 +}; + +/* + * Magic values subtracted from a buffer value during UTF8 conversion. + * This table contains as many values as there might be trailing bytes + * in a UTF-8 sequence. + */ +static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, + 0x03C82080UL, 0xFA082080UL, 0x82082080UL }; + +/* + * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed + * into the first byte, depending on how many bytes follow. There are + * as many entries in this table as there are UTF-8 sequence types. + * (I.e., one byte sequence, two byte... etc.). Remember that sequences + * for *legal* UTF-8 will be 4 or fewer bytes total. + */ +static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }; + +/* --------------------------------------------------------------------- */ + +/* The interface converts a whole buffer to avoid function-call overhead. + * Constants have been gathered. Loops & conditionals have been removed as + * much as possible for efficiency, in favor of drop-through switches. + * (See "Note A" at the bottom of the file for equivalent code.) + * If your compiler supports it, the "isLegalUTF8" call can be turned + * into an inline function.
+ */ + + +/* --------------------------------------------------------------------- */ + +ConversionResult ConvertUTF32toUTF16 ( + const UTF32** sourceStart, const UTF32* sourceEnd, + UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) { + ConversionResult result = conversionOK; + const UTF32* source = *sourceStart; + UTF16* target = *targetStart; + while (source < sourceEnd) { + UTF32 ch; + if (target >= targetEnd) { + result = targetExhausted; break; + } + ch = *source++; + if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */ + /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */ + if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { + if (flags == strictConversion) { + --source; /* return to the illegal value itself */ + result = sourceIllegal; + break; + } else { + *target++ = UNI_REPLACEMENT_CHAR; + } + } else { + *target++ = (UTF16)ch; /* normal case */ + } + } else if (ch > UNI_MAX_LEGAL_UTF32) { + if (flags == strictConversion) { + result = sourceIllegal; + } else { + *target++ = UNI_REPLACEMENT_CHAR; + } + } else { + /* target is a character in range 0xFFFF - 0x10FFFF. */ + if (target + 1 >= targetEnd) { + --source; /* Back up source pointer! */ + result = targetExhausted; break; + } + ch -= halfBase; + *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START); + *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START); + } + } + *sourceStart = source; + *targetStart = target; + return result; +} + +/* --------------------------------------------------------------------- */ + +ConversionResult ConvertUTF16toUTF32 ( + const UTF16** sourceStart, const UTF16* sourceEnd, + UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) { + ConversionResult result = conversionOK; + const UTF16* source = *sourceStart; + UTF32* target = *targetStart; + UTF32 ch, ch2; + while (source < sourceEnd) { + const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */ + ch = *source++; + /* If we have a surrogate pair, convert to UTF32 first. */ + if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) { + /* If the 16 bits following the high surrogate are in the source buffer... */ + if (source < sourceEnd) { + ch2 = *source; + /* If it's a low surrogate, convert to UTF32. */ + if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) { + ch = ((ch - UNI_SUR_HIGH_START) << halfShift) + + (ch2 - UNI_SUR_LOW_START) + halfBase; + ++source; + } else if (flags == strictConversion) { /* it's an unpaired high surrogate */ + --source; /* return to the illegal value itself */ + result = sourceIllegal; + break; + } + } else { /* We don't have the 16 bits following the high surrogate. */ + --source; /* return to the high surrogate */ + result = sourceExhausted; + break; + } + } else if (flags == strictConversion) { + /* UTF-16 surrogate values are illegal in UTF-32 */ + if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) { + --source; /* return to the illegal value itself */ + result = sourceIllegal; + break; + } + } + if (target >= targetEnd) { + source = oldSource; /* Back up source pointer! 
*/ + result = targetExhausted; break; + } + *target++ = ch; + } + *sourceStart = source; + *targetStart = target; +#ifdef CVTUTF_DEBUG +if (result == sourceIllegal) { + fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2); + fflush(stderr); +} +#endif + return result; +} +ConversionResult ConvertUTF16toUTF8 ( + const UTF16** sourceStart, const UTF16* sourceEnd, + UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) { + ConversionResult result = conversionOK; + const UTF16* source = *sourceStart; + UTF8* target = *targetStart; + while (source < sourceEnd) { + UTF32 ch; + unsigned short bytesToWrite = 0; + const UTF32 byteMask = 0xBF; + const UTF32 byteMark = 0x80; + const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */ + ch = *source++; + /* If we have a surrogate pair, convert to UTF32 first. */ + if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) { + /* If the 16 bits following the high surrogate are in the source buffer... */ + if (source < sourceEnd) { + UTF32 ch2 = *source; + /* If it's a low surrogate, convert to UTF32. */ + if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) { + ch = ((ch - UNI_SUR_HIGH_START) << halfShift) + + (ch2 - UNI_SUR_LOW_START) + halfBase; + ++source; + } else if (flags == strictConversion) { /* it's an unpaired high surrogate */ + --source; /* return to the illegal value itself */ + result = sourceIllegal; + break; + } + } else { /* We don't have the 16 bits following the high surrogate. */ + --source; /* return to the high surrogate */ + result = sourceExhausted; + break; + } + } else if (flags == strictConversion) { + /* UTF-16 surrogate values are illegal in UTF-32 */ + if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) { + --source; /* return to the illegal value itself */ + result = sourceIllegal; + break; + } + } + /* Figure out how many bytes the result will require */ + if (ch < (UTF32)0x80) { bytesToWrite = 1; + } else if (ch < (UTF32)0x800) { bytesToWrite = 2; + } else if (ch < (UTF32)0x10000) { bytesToWrite = 3; + } else if (ch < (UTF32)0x110000) { bytesToWrite = 4; + } else { bytesToWrite = 3; + ch = UNI_REPLACEMENT_CHAR; + } + + target += bytesToWrite; + if (target > targetEnd) { + source = oldSource; /* Back up source pointer! */ + target -= bytesToWrite; result = targetExhausted; break; + } + switch (bytesToWrite) { /* note: everything falls through. 
*/ + case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; + case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; + case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; + case 1: *--target = (UTF8)(ch | firstByteMark[bytesToWrite]); + } + target += bytesToWrite; + } + *sourceStart = source; + *targetStart = target; + return result; +} + +/* --------------------------------------------------------------------- */ + +ConversionResult ConvertUTF32toUTF8 ( + const UTF32** sourceStart, const UTF32* sourceEnd, + UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) { + ConversionResult result = conversionOK; + const UTF32* source = *sourceStart; + UTF8* target = *targetStart; + while (source < sourceEnd) { + UTF32 ch; + unsigned short bytesToWrite = 0; + const UTF32 byteMask = 0xBF; + const UTF32 byteMark = 0x80; + ch = *source++; + if (flags == strictConversion ) { + /* UTF-16 surrogate values are illegal in UTF-32 */ + if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { + --source; /* return to the illegal value itself */ + result = sourceIllegal; + break; + } + } + /* + * Figure out how many bytes the result will require. Turn any + * illegally large UTF32 things (> Plane 17) into replacement chars. + */ + if (ch < (UTF32)0x80) { bytesToWrite = 1; + } else if (ch < (UTF32)0x800) { bytesToWrite = 2; + } else if (ch < (UTF32)0x10000) { bytesToWrite = 3; + } else if (ch <= UNI_MAX_LEGAL_UTF32) { bytesToWrite = 4; + } else { bytesToWrite = 3; + ch = UNI_REPLACEMENT_CHAR; + result = sourceIllegal; + } + + target += bytesToWrite; + if (target > targetEnd) { + --source; /* Back up source pointer! */ + target -= bytesToWrite; result = targetExhausted; break; + } + switch (bytesToWrite) { /* note: everything falls through. */ + case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; + case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; + case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; + case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]); + } + target += bytesToWrite; + } + *sourceStart = source; + *targetStart = target; + return result; +} + +/* --------------------------------------------------------------------- */ + +/* + * Utility routine to tell whether a sequence of bytes is legal UTF-8. + * This must be called with the length pre-determined by the first byte. + * If not calling this from ConvertUTF8to*, then the length can be set by: + * length = trailingBytesForUTF8[*source]+1; + * and the sequence is illegal right away if there aren't that many bytes + * available. + * If presented with a length > 4, this returns false. The Unicode + * definition of UTF-8 goes up to 4-byte sequences. + */ + +static Boolean isLegalUTF8(const UTF8 *source, int length) { + UTF8 a; + const UTF8 *srcptr = source+length; + switch (length) { + default: return false; + /* Everything else falls through when "true"... 
*/ + case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; + case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; + case 2: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; + + switch (*source) { + /* no fall-through in this inner switch */ + case 0xE0: if (a < 0xA0) return false; break; + case 0xED: if (a > 0x9F) return false; break; + case 0xF0: if (a < 0x90) return false; break; + case 0xF4: if (a > 0x8F) return false; break; + default: if (a < 0x80) return false; + } + + case 1: if (*source >= 0x80 && *source < 0xC2) return false; + } + if (*source > 0xF4) return false; + return true; +} + +/* --------------------------------------------------------------------- */ + +/* + * Exported function to return whether a UTF-8 sequence is legal or not. + * This is not used here; it's just exported. + */ +Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) { + int length = trailingBytesForUTF8[*source]+1; + if (length > sourceEnd - source) { + return false; + } + return isLegalUTF8(source, length); +} + +/* --------------------------------------------------------------------- */ + +/* + * Exported function to return the total number of bytes in a codepoint + * represented in UTF-8, given the value of the first byte. + */ +unsigned getNumBytesForUTF8(UTF8 first) { + return trailingBytesForUTF8[first] + 1; +} + +/* --------------------------------------------------------------------- */ + +/* + * Exported function to return whether a UTF-8 string is legal or not. + * This is not used here; it's just exported. + */ +Boolean isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd) { + while (*source != sourceEnd) { + int length = trailingBytesForUTF8[**source] + 1; + if (length > sourceEnd - *source || !isLegalUTF8(*source, length)) + return false; + *source += length; + } + return true; +} + +/* --------------------------------------------------------------------- */ + +ConversionResult ConvertUTF8toUTF16 ( + const UTF8** sourceStart, const UTF8* sourceEnd, + UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) { + ConversionResult result = conversionOK; + const UTF8* source = *sourceStart; + UTF16* target = *targetStart; + while (source < sourceEnd) { + UTF32 ch = 0; + unsigned short extraBytesToRead = trailingBytesForUTF8[*source]; + if (extraBytesToRead >= sourceEnd - source) { + result = sourceExhausted; break; + } + /* Do this check whether lenient or strict */ + if (!isLegalUTF8(source, extraBytesToRead+1)) { + result = sourceIllegal; + break; + } + /* + * The cases all fall through. See "Note A" below. + */ + switch (extraBytesToRead) { + case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */ + case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */ + case 3: ch += *source++; ch <<= 6; + case 2: ch += *source++; ch <<= 6; + case 1: ch += *source++; ch <<= 6; + case 0: ch += *source++; + } + ch -= offsetsFromUTF8[extraBytesToRead]; + + if (target >= targetEnd) { + source -= (extraBytesToRead+1); /* Back up source pointer! 
*/ + result = targetExhausted; break; + } + if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */ + /* UTF-16 surrogate values are illegal in UTF-32 */ + if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { + if (flags == strictConversion) { + source -= (extraBytesToRead+1); /* return to the illegal value itself */ + result = sourceIllegal; + break; + } else { + *target++ = UNI_REPLACEMENT_CHAR; + } + } else { + *target++ = (UTF16)ch; /* normal case */ + } + } else if (ch > UNI_MAX_UTF16) { + if (flags == strictConversion) { + result = sourceIllegal; + source -= (extraBytesToRead+1); /* return to the start */ + break; /* Bail out; shouldn't continue */ + } else { + *target++ = UNI_REPLACEMENT_CHAR; + } + } else { + /* target is a character in range 0xFFFF - 0x10FFFF. */ + if (target + 1 >= targetEnd) { + source -= (extraBytesToRead+1); /* Back up source pointer! */ + result = targetExhausted; break; + } + ch -= halfBase; + *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START); + *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START); + } + } + *sourceStart = source; + *targetStart = target; + return result; +} + +/* --------------------------------------------------------------------- */ + +ConversionResult ConvertUTF8toUTF32 ( + const UTF8** sourceStart, const UTF8* sourceEnd, + UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) { + ConversionResult result = conversionOK; + const UTF8* source = *sourceStart; + UTF32* target = *targetStart; + while (source < sourceEnd) { + UTF32 ch = 0; + unsigned short extraBytesToRead = trailingBytesForUTF8[*source]; + if (extraBytesToRead >= sourceEnd - source) { + result = sourceExhausted; break; + } + /* Do this check whether lenient or strict */ + if (!isLegalUTF8(source, extraBytesToRead+1)) { + result = sourceIllegal; + break; + } + /* + * The cases all fall through. See "Note A" below. + */ + switch (extraBytesToRead) { + case 5: ch += *source++; ch <<= 6; + case 4: ch += *source++; ch <<= 6; + case 3: ch += *source++; ch <<= 6; + case 2: ch += *source++; ch <<= 6; + case 1: ch += *source++; ch <<= 6; + case 0: ch += *source++; + } + ch -= offsetsFromUTF8[extraBytesToRead]; + + if (target >= targetEnd) { + source -= (extraBytesToRead+1); /* Back up the source pointer! */ + result = targetExhausted; break; + } + if (ch <= UNI_MAX_LEGAL_UTF32) { + /* + * UTF-16 surrogate values are illegal in UTF-32, and anything + * over Plane 17 (> 0x10FFFF) is illegal. + */ + if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { + if (flags == strictConversion) { + source -= (extraBytesToRead+1); /* return to the illegal value itself */ + result = sourceIllegal; + break; + } else { + *target++ = UNI_REPLACEMENT_CHAR; + } + } else { + *target++ = ch; + } + } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */ + result = sourceIllegal; + *target++ = UNI_REPLACEMENT_CHAR; + } + } + *sourceStart = source; + *targetStart = target; + return result; +} + +/* --------------------------------------------------------------------- + + Note A. + The fall-through switches in UTF-8 reading code save a + temp variable, some decrements & conditionals. The switches + are equivalent to the following loop: + { + int tmpBytesToRead = extraBytesToRead+1; + do { + ch += *source++; + --tmpBytesToRead; + if (tmpBytesToRead) ch <<= 6; + } while (tmpBytesToRead > 0); + } + In UTF-8 writing code, the switches on "bytesToWrite" are + similarly unrolled loops. 
+ + --------------------------------------------------------------------- */ diff --git a/lib/Support/ConvertUTFWrapper.cpp b/lib/Support/ConvertUTFWrapper.cpp new file mode 100644 index 0000000..458fbb0 --- /dev/null +++ b/lib/Support/ConvertUTFWrapper.cpp @@ -0,0 +1,76 @@ +//===-- ConvertUTFWrapper.cpp - Wrap ConvertUTF.h with clang data types -----=== +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Support/ConvertUTF.h" + +namespace llvm { + +bool ConvertUTF8toWide(unsigned WideCharWidth, llvm::StringRef Source, + char *&ResultPtr, const UTF8 *&ErrorPtr) { + assert(WideCharWidth == 1 || WideCharWidth == 2 || WideCharWidth == 4); + ConversionResult result = conversionOK; + // Copy the character span over. + if (WideCharWidth == 1) { + const UTF8 *Pos = reinterpret_cast<const UTF8*>(Source.begin()); + if (!isLegalUTF8String(&Pos, reinterpret_cast<const UTF8*>(Source.end()))) { + result = sourceIllegal; + ErrorPtr = Pos; + } else { + memcpy(ResultPtr, Source.data(), Source.size()); + ResultPtr += Source.size(); + } + } else if (WideCharWidth == 2) { + const UTF8 *sourceStart = (const UTF8*)Source.data(); + // FIXME: Make the type of the result buffer correct instead of + // using reinterpret_cast. + UTF16 *targetStart = reinterpret_cast<UTF16*>(ResultPtr); + ConversionFlags flags = strictConversion; + result = ConvertUTF8toUTF16( + &sourceStart, sourceStart + Source.size(), + &targetStart, targetStart + 2*Source.size(), flags); + if (result == conversionOK) + ResultPtr = reinterpret_cast<char*>(targetStart); + else + ErrorPtr = sourceStart; + } else if (WideCharWidth == 4) { + const UTF8 *sourceStart = (const UTF8*)Source.data(); + // FIXME: Make the type of the result buffer correct instead of + // using reinterpret_cast. + UTF32 *targetStart = reinterpret_cast<UTF32*>(ResultPtr); + ConversionFlags flags = strictConversion; + result = ConvertUTF8toUTF32( + &sourceStart, sourceStart + Source.size(), + &targetStart, targetStart + 4*Source.size(), flags); + if (result == conversionOK) + ResultPtr = reinterpret_cast<char*>(targetStart); + else + ErrorPtr = sourceStart; + } + assert((result != targetExhausted) + && "ConvertUTF8toUTFXX exhausted target buffer"); + return result == conversionOK; +} + +bool ConvertCodePointToUTF8(unsigned Source, char *&ResultPtr) { + const UTF32 *SourceStart = &Source; + const UTF32 *SourceEnd = SourceStart + 1; + UTF8 *TargetStart = reinterpret_cast<UTF8 *>(ResultPtr); + UTF8 *TargetEnd = TargetStart + 4; + ConversionResult CR = ConvertUTF32toUTF8(&SourceStart, SourceEnd, + &TargetStart, TargetEnd, + strictConversion); + if (CR != conversionOK) + return false; + + ResultPtr = reinterpret_cast<char*>(TargetStart); + return true; +} + +} // end namespace llvm + diff --git a/lib/Support/Debug.cpp b/lib/Support/Debug.cpp index 0c0f15e..d9cb8a9 100644 --- a/lib/Support/Debug.cpp +++ b/lib/Support/Debug.cpp @@ -44,7 +44,7 @@ Debug("debug", cl::desc("Enable debug output"), cl::Hidden, //until program termination. static cl::opt<unsigned> DebugBufferSize("debug-buffer-size", - cl::desc("Buffer the last N characters of debug output" + cl::desc("Buffer the last N characters of debug output " "until program termination. 
" "[default 0 -- immediate print-out]"), cl::Hidden, diff --git a/lib/Support/DeltaAlgorithm.cpp b/lib/Support/DeltaAlgorithm.cpp index f290668..9e52874 100644 --- a/lib/Support/DeltaAlgorithm.cpp +++ b/lib/Support/DeltaAlgorithm.cpp @@ -27,15 +27,13 @@ bool DeltaAlgorithm::GetTestResult(const changeset_ty &Changes) { void DeltaAlgorithm::Split(const changeset_ty &S, changesetlist_ty &Res) { // FIXME: Allow clients to provide heuristics for improved splitting. - // Get the iterator to the middle. - unsigned N = S.size() / 2; - changeset_ty::const_iterator middle(S.begin()); - std::advance(middle, N); - - // Create each vector using the middle as the split. - changeset_ty LHS(S.begin(), middle); - changeset_ty RHS(middle, S.end()); + // FIXME: This is really slow. + changeset_ty LHS, RHS; + unsigned idx = 0, N = S.size() / 2; + for (changeset_ty::const_iterator it = S.begin(), + ie = S.end(); it != ie; ++it, ++idx) + ((idx < N) ? LHS : RHS).insert(*it); if (!LHS.empty()) Res.push_back(LHS); if (!RHS.empty()) diff --git a/lib/Support/Dwarf.cpp b/lib/Support/Dwarf.cpp index 615efb8..0f91c11 100644 --- a/lib/Support/Dwarf.cpp +++ b/lib/Support/Dwarf.cpp @@ -688,6 +688,7 @@ const char *llvm::dwarf::MacinfoString(unsigned Encoding) { /// encodings. const char *llvm::dwarf::CallFrameString(unsigned Encoding) { switch (Encoding) { + case DW_CFA_nop: return "DW_CFA_nop"; case DW_CFA_advance_loc: return "DW_CFA_advance_loc"; case DW_CFA_offset: return "DW_CFA_offset"; case DW_CFA_restore: return "DW_CFA_restore"; diff --git a/lib/Support/DynamicLibrary.cpp b/lib/Support/DynamicLibrary.cpp index d40439a..f14cb45 100644 --- a/lib/Support/DynamicLibrary.cpp +++ b/lib/Support/DynamicLibrary.cpp @@ -46,7 +46,7 @@ void llvm::sys::DynamicLibrary::AddSymbol(StringRef symbolName, void *symbolValue) { SmartScopedLock<true> lock(getMutex()); if (ExplicitSymbols == 0) - ExplicitSymbols = new llvm::StringMap<void*>(); + ExplicitSymbols = new StringMap<void*>(); (*ExplicitSymbols)[symbolName] = symbolValue; } diff --git a/lib/Support/FileUtilities.cpp b/lib/Support/FileUtilities.cpp index fc8a2f3..4d7b239 100644 --- a/lib/Support/FileUtilities.cpp +++ b/lib/Support/FileUtilities.cpp @@ -87,9 +87,9 @@ static bool CompareNumbers(const char *&F1P, const char *&F2P, // If one of the positions is at a space and the other isn't, chomp up 'til // the end of the space. - while (isspace(*F1P) && F1P != F1End) + while (isspace(static_cast<unsigned char>(*F1P)) && F1P != F1End) ++F1P; - while (isspace(*F2P) && F2P != F2End) + while (isspace(static_cast<unsigned char>(*F2P)) && F2P != F2End) ++F2P; // If we stop on numbers, compare their difference. diff --git a/lib/Support/GraphWriter.cpp b/lib/Support/GraphWriter.cpp index 669c238..bff182f 100644 --- a/lib/Support/GraphWriter.cpp +++ b/lib/Support/GraphWriter.cpp @@ -53,6 +53,17 @@ std::string llvm::DOT::EscapeString(const std::string &Label) { return Str; } +/// \brief Get a color string for this node number. Simply round-robin selects +/// from a reasonable number of colors. +StringRef llvm::DOT::getColorString(unsigned ColorNumber) { + static const int NumColors = 20; + static const char* Colors[NumColors] = { + "aaaaaa", "aa0000", "00aa00", "aa5500", "0055ff", "aa00aa", "00aaaa", + "555555", "ff5555", "55ff55", "ffff55", "5555ff", "ff55ff", "55ffff", + "ffaaaa", "aaffaa", "ffffaa", "aaaaff", "ffaaff", "aaffff"}; + return Colors[ColorNumber % NumColors]; +} + // Execute the graph viewer. Return true if successful. 
static bool LLVM_ATTRIBUTE_UNUSED ExecGraphViewer(const sys::Path &ExecPath, std::vector<const char*> &args, diff --git a/lib/Support/Host.cpp b/lib/Support/Host.cpp index 5ad5308..b9bbcb9 100644 --- a/lib/Support/Host.cpp +++ b/lib/Support/Host.cpp @@ -15,6 +15,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSwitch.h" +#include "llvm/ADT/Triple.h" #include "llvm/Config/config.h" #include "llvm/Support/DataStream.h" #include "llvm/Support/Debug.h" @@ -330,7 +331,10 @@ std::string sys::getHostCPUName() { case 20: return "btver1"; case 21: - return "bdver1"; + if (Model <= 15) + return "bdver1"; + else if (Model <= 31) + return "bdver2"; default: return "generic"; } @@ -578,3 +582,14 @@ bool sys::getHostCPUFeatures(StringMap<bool> &Features){ return false; } #endif + +std::string sys::getProcessTriple() { + Triple PT(LLVM_HOSTTRIPLE); + + if (sizeof(void *) == 8 && PT.isArch32Bit()) + PT = PT.get64BitArchVariant(); + if (sizeof(void *) == 4 && PT.isArch64Bit()) + PT = PT.get32BitArchVariant(); + + return PT.str(); +} diff --git a/lib/Support/LockFileManager.cpp b/lib/Support/LockFileManager.cpp index 075d8a5..92d8b83 100644 --- a/lib/Support/LockFileManager.cpp +++ b/lib/Support/LockFileManager.cpp @@ -31,7 +31,7 @@ LockFileManager::readLockFile(StringRef LockFileName) { // to read, so we just return. bool Exists = false; if (sys::fs::exists(LockFileName, Exists) || !Exists) - return Optional<std::pair<std::string, int> >(); + return None; // Read the owning host and PID out of the lock file. If it appears that the // owning process is dead, the lock file is invalid. @@ -45,7 +45,7 @@ LockFileManager::readLockFile(StringRef LockFileName) { // Delete the lock file. It's invalid anyway. bool Existed; sys::fs::remove(LockFileName, Existed); - return Optional<std::pair<std::string, int> >(); + return None; } bool LockFileManager::processStillExecuting(StringRef Hostname, int PID) { @@ -64,6 +64,7 @@ bool LockFileManager::processStillExecuting(StringRef Hostname, int PID) { LockFileManager::LockFileManager(StringRef FileName) { + this->FileName = FileName; LockFileName = FileName; LockFileName += ".lock"; @@ -175,6 +176,7 @@ void LockFileManager::waitForUnlock() { #endif // Don't wait more than an hour for the file to appear. const unsigned MaxSeconds = 3600; + bool LockFileGone = false; do { // Sleep for the designated interval, to allow the owning process time to // finish up and remove the lock file. @@ -185,10 +187,18 @@ void LockFileManager::waitForUnlock() { #else nanosleep(&Interval, NULL); #endif - // If the file no longer exists, we're done. + // If the lock file no longer exists, wait for the actual file. bool Exists = false; - if (!sys::fs::exists(LockFileName.str(), Exists) && !Exists) - return; + if (!LockFileGone) { + if (!sys::fs::exists(LockFileName.str(), Exists) && !Exists) { + LockFileGone = true; + Exists = false; + } + } + if (LockFileGone) { + if (!sys::fs::exists(FileName.str(), Exists) && Exists) + return; + } if (!processStillExecuting((*Owner).first, (*Owner).second)) return; diff --git a/lib/Support/MemoryBuffer.cpp b/lib/Support/MemoryBuffer.cpp index 65b4332..691b6f5 100644 --- a/lib/Support/MemoryBuffer.cpp +++ b/lib/Support/MemoryBuffer.cpp @@ -33,8 +33,13 @@ #include <unistd.h> #else #include <io.h> -#ifndef S_ISFIFO -#define S_ISFIFO(x) (0) +// Simplistic definitions of these macros to allow files to be read with +// MapInFilePages.
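On Windows, <sys/stat.h> lacks these POSIX macros, so the fallbacks below treat every descriptor as a regular file and nothing as a block device. The check they feed into rests on one fact: only regular files and block devices report a st_size worth trusting for an up-front allocation; pipes, sockets, and character devices must be drained incrementally. A POSIX-side sketch of that decision (hypothetical helper mirroring the condition below):

#include <sys/stat.h>

// Decide from an fstat(2) result whether st_size can size the buffer
// up front, as MemoryBuffer::getOpenFile does below.
static bool canTrustFileSize(const struct stat &St) {
  return S_ISREG(St.st_mode) || S_ISBLK(St.st_mode);
}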
+#ifndef S_ISREG +#define S_ISREG(x) (1) +#endif +#ifndef S_ISBLK +#define S_ISBLK(x) (0) #endif #endif #include <fcntl.h> @@ -322,9 +327,10 @@ error_code MemoryBuffer::getOpenFile(int FD, const char *Filename, return error_code(errno, posix_category()); } - // If this is a named pipe, we can't trust the size. Create the memory + // If this is not a file or a block device (e.g. it's a named pipe + // or character device), we can't trust the size. Create the memory // buffer by copying off the stream. - if (S_ISFIFO(FileInfo.st_mode)) { + if (!S_ISREG(FileInfo.st_mode) && !S_ISBLK(FileInfo.st_mode)) { return getMemoryBufferForStream(FD, Filename, result); } diff --git a/lib/Support/PathV2.cpp b/lib/Support/PathV2.cpp index 98d7382..41add96 100644 --- a/lib/Support/PathV2.cpp +++ b/lib/Support/PathV2.cpp @@ -44,7 +44,8 @@ namespace { #ifdef LLVM_ON_WIN32 // C: - if (path.size() >= 2 && std::isalpha(path[0]) && path[1] == ':') + if (path.size() >= 2 && std::isalpha(static_cast<unsigned char>(path[0])) && + path[1] == ':') return path.substr(0, 2); #endif diff --git a/lib/Support/SourceMgr.cpp b/lib/Support/SourceMgr.cpp index 6540319..fac3cad 100644 --- a/lib/Support/SourceMgr.cpp +++ b/lib/Support/SourceMgr.cpp @@ -15,12 +15,16 @@ #include "llvm/Support/SourceMgr.h" #include "llvm/ADT/OwningPtr.h" +#include "llvm/ADT/SmallString.h" #include "llvm/ADT/Twine.h" +#include "llvm/Support/Locale.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Support/system_error.h" using namespace llvm; +static const size_t TabStop = 8; + namespace { struct LineNoCacheTy { int LastQueryBufferID; @@ -146,7 +150,8 @@ void SourceMgr::PrintIncludeStack(SMLoc IncludeLoc, raw_ostream &OS) const { /// prefixed to the message. SMDiagnostic SourceMgr::GetMessage(SMLoc Loc, SourceMgr::DiagKind Kind, const Twine &Msg, - ArrayRef<SMRange> Ranges) const { + ArrayRef<SMRange> Ranges, + ArrayRef<SMFixIt> FixIts) const { // First thing to do: find the current buffer containing the specified // location to pull out the source line. @@ -193,6 +198,7 @@ SMDiagnostic SourceMgr::GetMessage(SMLoc Loc, SourceMgr::DiagKind Kind, R.End = SMLoc::getFromPointer(LineEnd); // Translate from SMLoc ranges to column ranges. + // FIXME: Handle multibyte characters. ColRanges.push_back(std::make_pair(R.Start.getPointer()-LineStart, R.End.getPointer()-LineStart)); } @@ -202,13 +208,13 @@ SMDiagnostic SourceMgr::GetMessage(SMLoc Loc, SourceMgr::DiagKind Kind, return SMDiagnostic(*this, Loc, BufferID, LineAndCol.first, LineAndCol.second-1, Kind, Msg.str(), - LineStr, ColRanges); + LineStr, ColRanges, FixIts); } void SourceMgr::PrintMessage(SMLoc Loc, SourceMgr::DiagKind Kind, const Twine &Msg, ArrayRef<SMRange> Ranges, - bool ShowColors) const { - SMDiagnostic Diagnostic = GetMessage(Loc, Kind, Msg, Ranges); + ArrayRef<SMFixIt> FixIts, bool ShowColors) const { + SMDiagnostic Diagnostic = GetMessage(Loc, Kind, Msg, Ranges, FixIts); // Report the message with the diagnostic handler if present.
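GetMessage and PrintMessage now also thread fix-it hints through to the diagnostic. Hypothetical usage, assuming an SMFixIt can be constructed from the range to rewrite plus its replacement text (this patch only shows that a hint carries getRange() and getText()):

#include "llvm/Support/SourceMgr.h"
using namespace llvm;

// Suggest replacing the range [Start, End) with "==". SourceMgr renders
// the caret line and, underneath it, the fix-it insertion line.
static void reportBadAssign(SourceMgr &SM, SMLoc Start, SMLoc End) {
  SMFixIt Hint(SMRange(Start, End), "==");
  SM.PrintMessage(Start, SourceMgr::DK_Error, "use '==' for comparison",
                  ArrayRef<SMRange>(), Hint, false);
}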
if (DiagHandler) { @@ -231,15 +237,108 @@ void SourceMgr::PrintMessage(SMLoc Loc, SourceMgr::DiagKind Kind, // SMDiagnostic Implementation //===----------------------------------------------------------------------===// -SMDiagnostic::SMDiagnostic(const SourceMgr &sm, SMLoc L, const std::string &FN, +SMDiagnostic::SMDiagnostic(const SourceMgr &sm, SMLoc L, StringRef FN, int Line, int Col, SourceMgr::DiagKind Kind, - const std::string &Msg, - const std::string &LineStr, - ArrayRef<std::pair<unsigned,unsigned> > Ranges) + StringRef Msg, StringRef LineStr, + ArrayRef<std::pair<unsigned,unsigned> > Ranges, + ArrayRef<SMFixIt> Hints) : SM(&sm), Loc(L), Filename(FN), LineNo(Line), ColumnNo(Col), Kind(Kind), - Message(Msg), LineContents(LineStr), Ranges(Ranges.vec()) { + Message(Msg), LineContents(LineStr), Ranges(Ranges.vec()), + FixIts(Hints.begin(), Hints.end()) { + std::sort(FixIts.begin(), FixIts.end()); } +static void buildFixItLine(std::string &CaretLine, std::string &FixItLine, + ArrayRef<SMFixIt> FixIts, ArrayRef<char> SourceLine){ + if (FixIts.empty()) + return; + + const char *LineStart = SourceLine.begin(); + const char *LineEnd = SourceLine.end(); + + size_t PrevHintEndCol = 0; + + for (ArrayRef<SMFixIt>::iterator I = FixIts.begin(), E = FixIts.end(); + I != E; ++I) { + // If the fixit contains a newline or tab, ignore it. + if (I->getText().find_first_of("\n\r\t") != StringRef::npos) + continue; + + SMRange R = I->getRange(); + + // If the line doesn't contain any part of the range, then ignore it. + if (R.Start.getPointer() > LineEnd || R.End.getPointer() < LineStart) + continue; + + // Translate from SMLoc to column. + // Ignore pieces of the range that go onto other lines. + // FIXME: Handle multibyte characters in the source line. + unsigned FirstCol; + if (R.Start.getPointer() < LineStart) + FirstCol = 0; + else + FirstCol = R.Start.getPointer() - LineStart; + + // If we inserted a long previous hint, push this one forwards, and add + // an extra space to show that this is not part of the previous + // completion. This is sort of the best we can do when two hints appear + // to overlap. + // + // Note that if this hint is located immediately after the previous + // hint, no space will be added, since the location is more important. + unsigned HintCol = FirstCol; + if (HintCol < PrevHintEndCol) + HintCol = PrevHintEndCol + 1; + + // FIXME: This assertion is intended to catch unintended use of multibyte + // characters in fixits. If we decide to do this, we'll have to track + // separate byte widths for the source and fixit lines. + assert((size_t)llvm::sys::locale::columnWidth(I->getText()) == + I->getText().size()); + + // This relies on one byte per column in our fixit hints. + unsigned LastColumnModified = HintCol + I->getText().size(); + if (LastColumnModified > FixItLine.size()) + FixItLine.resize(LastColumnModified, ' '); + + std::copy(I->getText().begin(), I->getText().end(), + FixItLine.begin() + HintCol); + + PrevHintEndCol = LastColumnModified; + + // For replacements, mark the removal range with '~'. + // FIXME: Handle multibyte characters in the source line. + unsigned LastCol; + if (R.End.getPointer() >= LineEnd) + LastCol = LineEnd - LineStart; + else + LastCol = R.End.getPointer() - LineStart; + + std::fill(&CaretLine[FirstCol], &CaretLine[LastCol], '~'); + } +} + +static void printSourceLine(raw_ostream &S, StringRef LineContents) { + // Print out the source line one character at a time, so we can expand tabs. 
+ for (unsigned i = 0, e = LineContents.size(), OutCol = 0; i != e; ++i) { + if (LineContents[i] != '\t') { + S << LineContents[i]; + ++OutCol; + continue; + } + + // If we have a tab, emit at least one space, then round up to 8 columns. + do { + S << ' '; + ++OutCol; + } while ((OutCol % TabStop) != 0); + } + S << '\n'; +} + +static bool isNonASCII(char c) { + return c & 0x80; +} void SMDiagnostic::print(const char *ProgName, raw_ostream &S, bool ShowColors) const { @@ -297,43 +396,48 @@ void SMDiagnostic::print(const char *ProgName, raw_ostream &S, if (LineNo == -1 || ColumnNo == -1) return; + // FIXME: If there are multibyte or multi-column characters in the source, all + // our ranges will be wrong. To do this properly, we'll need a byte-to-column + // map like Clang's TextDiagnostic. For now, we'll just handle tabs by + // expanding them later, and bail out rather than show incorrect ranges and + // misaligned fixits for any other odd characters. + if (std::find_if(LineContents.begin(), LineContents.end(), isNonASCII) != + LineContents.end()) { + printSourceLine(S, LineContents); + return; + } + size_t NumColumns = LineContents.size(); + // Build the line with the caret and ranges. - std::string CaretLine(LineContents.size()+1, ' '); + std::string CaretLine(NumColumns+1, ' '); // Expand any ranges. for (unsigned r = 0, e = Ranges.size(); r != e; ++r) { std::pair<unsigned, unsigned> R = Ranges[r]; - for (unsigned i = R.first, - e = std::min(R.second, (unsigned)LineContents.size()); i != e; ++i) - CaretLine[i] = '~'; + std::fill(&CaretLine[R.first], + &CaretLine[std::min((size_t)R.second, CaretLine.size())], + '~'); } - + + // Add any fix-its. + // FIXME: Find the beginning of the line properly for multibyte characters. + std::string FixItInsertionLine; + buildFixItLine(CaretLine, FixItInsertionLine, FixIts, + makeArrayRef(Loc.getPointer() - ColumnNo, + LineContents.size())); + // Finally, plop on the caret. - if (unsigned(ColumnNo) <= LineContents.size()) + if (unsigned(ColumnNo) <= NumColumns) CaretLine[ColumnNo] = '^'; else - CaretLine[LineContents.size()] = '^'; + CaretLine[NumColumns] = '^'; // ... and remove trailing whitespace so the output doesn't wrap for it. We // know that the line isn't completely empty because it has the caret in it at // least. CaretLine.erase(CaretLine.find_last_not_of(' ')+1); - // Print out the source line one character at a time, so we can expand tabs. - for (unsigned i = 0, e = LineContents.size(), OutCol = 0; i != e; ++i) { - if (LineContents[i] != '\t') { - S << LineContents[i]; - ++OutCol; - continue; - } - - // If we have a tab, emit at least one space, then round up to 8 columns. - do { - S << ' '; - ++OutCol; - } while (OutCol & 7); - } - S << '\n'; + printSourceLine(S, LineContents); if (ShowColors) S.changeColor(raw_ostream::GREEN, true); @@ -350,11 +454,36 @@ void SMDiagnostic::print(const char *ProgName, raw_ostream &S, do { S << CaretLine[i]; ++OutCol; - } while (OutCol & 7); + } while ((OutCol % TabStop) != 0); } + S << '\n'; if (ShowColors) S.resetColor(); + + // Print out the replacement line, matching tabs in the source line. + if (FixItInsertionLine.empty()) + return; + for (size_t i = 0, e = FixItInsertionLine.size(), OutCol = 0; i != e; ++i) { + if (i >= LineContents.size() || LineContents[i] != '\t') { + S << FixItInsertionLine[i]; + ++OutCol; + continue; + } + + // Okay, we have a tab. Insert the appropriate number of characters. 
+ do { + S << FixItInsertionLine[i]; + // FIXME: This is trying not to break up replacements, but then to re-sync + // with the tabs between replacements. This will fail, though, if two + // fix-it replacements are exactly adjacent, or if a fix-it contains a + // space. Really we should be precomputing column widths, which we'll + // need anyway for multibyte chars. + if (FixItInsertionLine[i] != ' ') + ++i; + ++OutCol; + } while (((OutCol % TabStop) != 0) && i != e); + } S << '\n'; } diff --git a/lib/Support/TimeValue.cpp b/lib/Support/TimeValue.cpp index 1a0f7bc..bd8af17 100644 --- a/lib/Support/TimeValue.cpp +++ b/lib/Support/TimeValue.cpp @@ -17,11 +17,16 @@ namespace llvm { using namespace sys; +const TimeValue::SecondsType + TimeValue::PosixZeroTimeSeconds = -946684800; +const TimeValue::SecondsType + TimeValue::Win32ZeroTimeSeconds = -12591158400ULL; + const TimeValue TimeValue::MinTime = TimeValue ( INT64_MIN,0 ); const TimeValue TimeValue::MaxTime = TimeValue ( INT64_MAX,0 ); const TimeValue TimeValue::ZeroTime = TimeValue ( 0,0 ); -const TimeValue TimeValue::PosixZeroTime = TimeValue ( -946684800,0 ); -const TimeValue TimeValue::Win32ZeroTime = TimeValue ( -12591158400ULL,0 ); +const TimeValue TimeValue::PosixZeroTime = TimeValue ( PosixZeroTimeSeconds,0 ); +const TimeValue TimeValue::Win32ZeroTime = TimeValue ( Win32ZeroTimeSeconds,0 ); void TimeValue::normalize( void ) { diff --git a/lib/Support/Triple.cpp b/lib/Support/Triple.cpp index eefb96b..d2508ac 100644 --- a/lib/Support/Triple.cpp +++ b/lib/Support/Triple.cpp @@ -19,6 +19,7 @@ const char *Triple::getArchTypeName(ArchType Kind) { switch (Kind) { case UnknownArch: return "unknown"; + case aarch64: return "aarch64"; case arm: return "arm"; case hexagon: return "hexagon"; case mips: return "mips"; @@ -53,6 +54,8 @@ const char *Triple::getArchTypePrefix(ArchType Kind) { default: return 0; + case aarch64: return "aarch64"; + case arm: case thumb: return "arm"; @@ -140,6 +143,7 @@ const char *Triple::getEnvironmentTypeName(EnvironmentType Kind) { case GNU: return "gnu"; case GNUEABIHF: return "gnueabihf"; case GNUEABI: return "gnueabi"; + case GNUX32: return "gnux32"; case EABI: return "eabi"; case MachO: return "macho"; case Android: return "android"; @@ -151,6 +155,7 @@ const char *Triple::getEnvironmentTypeName(EnvironmentType Kind) { Triple::ArchType Triple::getArchTypeForLLVMName(StringRef Name) { return StringSwitch<Triple::ArchType>(Name) + .Case("aarch64", aarch64) .Case("arm", arm) .Case("mips", mips) .Case("mipsel", mipsel) @@ -214,6 +219,7 @@ static Triple::ArchType parseArch(StringRef ArchName) { .Case("powerpc", Triple::ppc) .Cases("powerpc64", "ppu", Triple::ppc64) .Case("mblaze", Triple::mblaze) + .Case("aarch64", Triple::aarch64) .Cases("arm", "xscale", Triple::arm) // FIXME: It would be good to replace these with explicit names for all the // various suffixes supported. 
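[Editor's note: the Triple.cpp hunks around this point teach llvm::Triple the new "aarch64" architecture, and Host.cpp earlier in this patch adds sys::getProcessTriple(), which snaps the compile-time host triple to the pointer width of the running process. A minimal sketch of both behaviours, assuming an LLVM tree with this patch applied; the main() driver and the sample triple strings are illustrative only:]

#include "llvm/ADT/Triple.h"
#include <cstdio>

int main() {
  // parseArch() and getArchTypeForLLVMName() now both accept "aarch64".
  llvm::Triple T("aarch64-none-linux-gnu");
  std::printf("arch: %s\n", llvm::Triple::getArchTypeName(T.getArch()));

  // The same width normalization getProcessTriple() performs above: a
  // 32-bit triple in a 64-bit process is widened, and vice versa.
  llvm::Triple PT("i386-unknown-linux-gnu");
  if (sizeof(void *) == 8 && PT.isArch32Bit())
    PT = PT.get64BitArchVariant();
  if (sizeof(void *) == 4 && PT.isArch64Bit())
    PT = PT.get32BitArchVariant();
  std::printf("process triple: %s\n", PT.str().c_str());
  return 0;
}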
@@ -284,6 +290,7 @@ static Triple::EnvironmentType parseEnvironment(StringRef EnvironmentName) { .StartsWith("eabi", Triple::EABI) .StartsWith("gnueabihf", Triple::GNUEABIHF) .StartsWith("gnueabi", Triple::GNUEABI) + .StartsWith("gnux32", Triple::GNUX32) .StartsWith("gnu", Triple::GNU) .StartsWith("macho", Triple::MachO) .StartsWith("android", Triple::Android) @@ -674,6 +681,7 @@ static unsigned getArchPointerBitWidth(llvm::Triple::ArchType Arch) { case llvm::Triple::spir: return 32; + case llvm::Triple::aarch64: case llvm::Triple::mips64: case llvm::Triple::mips64el: case llvm::Triple::nvptx64: @@ -702,6 +710,7 @@ Triple Triple::get32BitArchVariant() const { Triple T(*this); switch (getArch()) { case Triple::UnknownArch: + case Triple::aarch64: case Triple::msp430: T.setArch(UnknownArch); break; @@ -753,6 +762,7 @@ Triple Triple::get64BitArchVariant() const { T.setArch(UnknownArch); break; + case Triple::aarch64: case Triple::spir64: case Triple::mips64: case Triple::mips64el: diff --git a/lib/Support/Unix/Memory.inc b/lib/Support/Unix/Memory.inc index 40d6b3f..e00394e 100644 --- a/lib/Support/Unix/Memory.inc +++ b/lib/Support/Unix/Memory.inc @@ -51,7 +51,18 @@ int getPosixProtectionFlags(unsigned Flags) { llvm::sys::Memory::MF_EXEC: return PROT_READ | PROT_WRITE | PROT_EXEC; case llvm::sys::Memory::MF_EXEC: +#if defined(__FreeBSD__) + // On PowerPC, having an executable page that has no read permission + // can have unintended consequences. The function InvalidateInstruction- + // Cache uses instructions dcbf and icbi, both of which are treated by + // the processor as loads. If the page has no read permissions, + // executing these instructions will result in a segmentation fault. + // Somehow, this problem is not present on Linux, but it does happen + // on FreeBSD. + return PROT_READ | PROT_EXEC; +#else return PROT_EXEC; +#endif default: llvm_unreachable("Illegal memory protection flag specified!"); } diff --git a/lib/Support/Unix/PathV2.inc b/lib/Support/Unix/PathV2.inc index dc43b59..c343455 100644 --- a/lib/Support/Unix/PathV2.inc +++ b/lib/Support/Unix/PathV2.inc @@ -423,11 +423,12 @@ retry_random_path: rety_open_create: int RandomFD = ::open(RandomPath.c_str(), O_RDWR | O_CREAT | O_EXCL, mode); if (RandomFD == -1) { + int SavedErrno = errno; // If the file existed, try again, otherwise, error. - if (errno == errc::file_exists) + if (SavedErrno == errc::file_exists) goto retry_random_path; // If path prefix doesn't exist, try to create it. - if (errno == errc::no_such_file_or_directory && + if (SavedErrno == errc::no_such_file_or_directory && !exists(path::parent_path(RandomPath))) { StringRef p(RandomPath); SmallString<64> dir_to_create; @@ -442,13 +443,15 @@ rety_open_create: (*i)[1] == '/' && (*i)[2] != '/') return make_error_code(errc::no_such_file_or_directory); - if (::mkdir(dir_to_create.c_str(), 0700) == -1) + if (::mkdir(dir_to_create.c_str(), 0700) == -1 && + errno != errc::file_exists) return error_code(errno, system_category()); } } goto rety_open_create; } - return error_code(errno, system_category()); + + return error_code(SavedErrno, system_category()); } // Make the path absolute. diff --git a/lib/Support/Unix/Process.inc b/lib/Support/Unix/Process.inc index 1335b78..9a4454f 100644 --- a/lib/Support/Unix/Process.inc +++ b/lib/Support/Unix/Process.inc @@ -224,6 +224,8 @@ static unsigned getColumns(int FileID) { #if defined(HAVE_SYS_IOCTL_H) && defined(HAVE_TERMIOS_H) // Try to determine the width of the terminal. 
struct winsize ws; + // Zero-fill ws to avoid a false positive from MemorySanitizer. + memset(&ws, 0, sizeof(ws)); if (ioctl(FileID, TIOCGWINSZ, &ws) == 0) Columns = ws.ws_col; #endif diff --git a/lib/Support/Unix/Program.inc b/lib/Support/Unix/Program.inc index c384316..117151c 100644 --- a/lib/Support/Unix/Program.inc +++ b/lib/Support/Unix/Program.inc @@ -17,6 +17,7 @@ //===----------------------------------------------------------------------===// #include "Unix.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/FileSystem.h" #include <llvm/Config/config.h> #if HAVE_SYS_STAT_H @@ -164,12 +165,16 @@ static void SetMemoryLimits (unsigned size) setrlimit (RLIMIT_RSS, &r); #endif #ifdef RLIMIT_AS // e.g. NetBSD doesn't have it. + // Don't set virtual memory limit if built with any Sanitizer. They need 80Tb + // of virtual memory for shadow memory mapping. +#if !LLVM_MEMORY_SANITIZER_BUILD && !LLVM_ADDRESS_SANITIZER_BUILD // Virtual memory. getrlimit (RLIMIT_AS, &r); r.rlim_cur = limit; setrlimit (RLIMIT_AS, &r); #endif #endif +#endif } bool diff --git a/lib/Support/Unix/Signals.inc b/lib/Support/Unix/Signals.inc index 4a2496f..b8be623 100644 --- a/lib/Support/Unix/Signals.inc +++ b/lib/Support/Unix/Signals.inc @@ -47,17 +47,19 @@ static void (*InterruptFunction)() = 0; static std::vector<std::string> FilesToRemove; static std::vector<std::pair<void(*)(void*), void*> > CallBacksToRun; -// IntSigs - Signals that may interrupt the program at any time. +// IntSigs - Signals that represent requested termination. There's no bug +// or failure, or if there is, it's not our direct responsibility. For whatever +// reason, our continued execution is no longer desirable. static const int IntSigs[] = { - SIGHUP, SIGINT, SIGQUIT, SIGPIPE, SIGTERM, SIGUSR1, SIGUSR2 + SIGHUP, SIGINT, SIGPIPE, SIGTERM, SIGUSR1, SIGUSR2 }; static const int *const IntSigsEnd = IntSigs + sizeof(IntSigs) / sizeof(IntSigs[0]); -// KillSigs - Signals that are synchronous with the program that will cause it -// to die. +// KillSigs - Signals that represent that we have a bug, and our prompt +// termination has been ordered. static const int KillSigs[] = { - SIGILL, SIGTRAP, SIGABRT, SIGFPE, SIGBUS, SIGSEGV + SIGILL, SIGTRAP, SIGABRT, SIGFPE, SIGBUS, SIGSEGV, SIGQUIT #ifdef SIGSYS , SIGSYS #endif @@ -254,7 +256,7 @@ void llvm::sys::AddSignalHandler(void (*FnPtr)(void *), void *Cookie) { // // On glibc systems we have the 'backtrace' function, which works nicely, but // doesn't demangle symbols. -static void PrintStackTrace(void *) { +void llvm::sys::PrintStackTrace(FILE *FD) { #if defined(HAVE_BACKTRACE) && defined(ENABLE_BACKTRACES) static void* StackTrace[256]; // Use backtrace() to output a backtrace on Linux systems with glibc. 
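[Editor's note: the Process.inc hunk above zero-fills the winsize struct because MemorySanitizer may not see stores performed by the kernel inside ioctl(), so ws can still look uninitialized after a successful TIOCGWINSZ. A standalone sketch of the pattern, with no LLVM dependencies; the function name is illustrative:]

#include <cstring>
#include <sys/ioctl.h>

static unsigned terminalColumns(int FD) {
  struct winsize ws;
  // Zero-fill first so MemorySanitizer considers ws initialized even
  // though it is the kernel, not user code, that writes into it.
  std::memset(&ws, 0, sizeof(ws));
  if (ioctl(FD, TIOCGWINSZ, &ws) == 0)
    return ws.ws_col;
  return 0; // width unknown: not a terminal, or the ioctl failed
}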
@@ -278,26 +280,30 @@ static void PrintStackTrace(void *) { Dl_info dlinfo; dladdr(StackTrace[i], &dlinfo); - fprintf(stderr, "%-2d", i); + fprintf(FD, "%-2d", i); const char* name = strrchr(dlinfo.dli_fname, '/'); - if (name == NULL) fprintf(stderr, " %-*s", width, dlinfo.dli_fname); - else fprintf(stderr, " %-*s", width, name+1); + if (name == NULL) fprintf(FD, " %-*s", width, dlinfo.dli_fname); + else fprintf(FD, " %-*s", width, name+1); - fprintf(stderr, " %#0*lx", + fprintf(FD, " %#0*lx", (int)(sizeof(void*) * 2) + 2, (unsigned long)StackTrace[i]); if (dlinfo.dli_sname != NULL) { int res; - fputc(' ', stderr); + fputc(' ', FD); char* d = abi::__cxa_demangle(dlinfo.dli_sname, NULL, NULL, &res); - if (d == NULL) fputs(dlinfo.dli_sname, stderr); - else fputs(d, stderr); + if (d == NULL) fputs(dlinfo.dli_sname, FD); + else fputs(d, FD); free(d); - fprintf(stderr, " + %tu",(char*)StackTrace[i]-(char*)dlinfo.dli_saddr); + // FIXME: When we move to C++11, use %t length modifier. It's not in + // C++03 and causes gcc to issue warnings. Losing the upper 32 bits of + // the stack offset for a stack dump isn't likely to cause any problems. + fprintf(FD, " + %u",(unsigned)((char*)StackTrace[i]- + (char*)dlinfo.dli_saddr)); } - fputc('\n', stderr); + fputc('\n', FD); } #else backtrace_symbols_fd(StackTrace, depth, STDERR_FILENO); @@ -305,10 +311,14 @@ static void PrintStackTrace(void *) { #endif } +static void PrintStackTraceSignalHandler(void *) { + PrintStackTrace(stderr); +} + /// PrintStackTraceOnErrorSignal - When an error signal (such as SIGABRT or /// SIGSEGV) is delivered to the process, print a stack trace and then exit. void llvm::sys::PrintStackTraceOnErrorSignal() { - AddSignalHandler(PrintStackTrace, 0); + AddSignalHandler(PrintStackTraceSignalHandler, 0); #if defined(__APPLE__) && !defined(ANDROID) // Environment variable to disable any kind of crash dialog. diff --git a/lib/Support/Unix/TimeValue.inc b/lib/Support/Unix/TimeValue.inc index 5cf5a9d..df8558b 100644 --- a/lib/Support/Unix/TimeValue.inc +++ b/lib/Support/Unix/TimeValue.inc @@ -48,7 +48,8 @@ TimeValue TimeValue::now() { } return TimeValue( - static_cast<TimeValue::SecondsType>( the_time.tv_sec + PosixZeroTime.seconds_ ), + static_cast<TimeValue::SecondsType>( the_time.tv_sec + + PosixZeroTimeSeconds ), static_cast<TimeValue::NanoSecondsType>( the_time.tv_usec * NANOSECONDS_PER_MICROSECOND ) ); } diff --git a/lib/Support/Windows/Path.inc b/lib/Support/Windows/Path.inc index 98d8a18..f4898e6 100644 --- a/lib/Support/Windows/Path.inc +++ b/lib/Support/Windows/Path.inc @@ -82,7 +82,7 @@ Path::isValid() const { pos = path.rfind(':',len); size_t rootslash = 0; if (pos != std::string::npos) { - if (pos != 1 || !isalpha(path[0]) || len < 3) + if (pos != 1 || !isalpha(static_cast<unsigned char>(path[0])) || len < 3) return false; rootslash = 2; } diff --git a/lib/Support/Windows/Signals.inc b/lib/Support/Windows/Signals.inc index a969753..3dd6660 100644 --- a/lib/Support/Windows/Signals.inc +++ b/lib/Support/Windows/Signals.inc @@ -295,6 +295,10 @@ void sys::PrintStackTraceOnErrorSignal() { LeaveCriticalSection(&CriticalSection); } +void llvm::sys::PrintStackTrace(FILE *) { + // FIXME: Implement. 
+} + void sys::SetInterruptFunction(void (*IF)()) { RegisterHandler(); diff --git a/lib/Support/raw_ostream.cpp b/lib/Support/raw_ostream.cpp index 106864d..f71abd3 100644 --- a/lib/Support/raw_ostream.cpp +++ b/lib/Support/raw_ostream.cpp @@ -241,7 +241,8 @@ raw_ostream &raw_ostream::operator<<(double N) { if (cs == '+' || cs == '-') { int c1 = buf[len - 2]; int c0 = buf[len - 1]; - if (isdigit(c1) && isdigit(c0)) { + if (isdigit(static_cast<unsigned char>(c1)) && + isdigit(static_cast<unsigned char>(c0))) { // Trim leading '0': "...e+012" -> "...e+12\0" buf[len - 3] = c1; buf[len - 2] = c0; diff --git a/lib/TableGen/Main.cpp b/lib/TableGen/Main.cpp index d0ca756..e1cd623 100644 --- a/lib/TableGen/Main.cpp +++ b/lib/TableGen/Main.cpp @@ -64,11 +64,11 @@ static int createDependencyFile(const TGParser &Parser, const char *argv0) { return 1; } DepOut.os() << OutputFilename << ":"; - const std::vector<std::string> &Dependencies = Parser.getDependencies(); - for (std::vector<std::string>::const_iterator I = Dependencies.begin(), - E = Dependencies.end(); + const TGLexer::DependenciesMapTy &Dependencies = Parser.getDependencies(); + for (TGLexer::DependenciesMapTy::const_iterator I = Dependencies.begin(), + E = Dependencies.end(); I != E; ++I) { - DepOut.os() << " " << (*I); + DepOut.os() << " " << I->first; } DepOut.os() << "\n"; DepOut.keep(); diff --git a/lib/TableGen/Record.cpp b/lib/TableGen/Record.cpp index b1d3a5b..9ad2053 100644 --- a/lib/TableGen/Record.cpp +++ b/lib/TableGen/Record.cpp @@ -935,6 +935,7 @@ Init *BinOpInit::Fold(Record *CurRec, MultiClass *CurMultiClass) const { break; } + case ADD: case SHL: case SRA: case SRL: { @@ -945,6 +946,7 @@ Init *BinOpInit::Fold(Record *CurRec, MultiClass *CurMultiClass) const { int64_t Result; switch (getOpcode()) { default: llvm_unreachable("Bad opcode!"); + case ADD: Result = LHSv + RHSv; break; case SHL: Result = LHSv << RHSv; break; case SRA: Result = LHSv >> RHSv; break; case SRL: Result = (uint64_t)LHSv >> (uint64_t)RHSv; break; @@ -970,6 +972,7 @@ std::string BinOpInit::getAsString() const { std::string Result; switch (Opc) { case CONCAT: Result = "!con"; break; + case ADD: Result = "!add"; break; case SHL: Result = "!shl"; break; case SRA: Result = "!sra"; break; case SRL: Result = "!srl"; break; @@ -1522,11 +1525,9 @@ Init *FieldInit::resolveReferences(Record &R, const RecordVal *RV) const { return const_cast<FieldInit *>(this); } -void ProfileDagInit(FoldingSetNodeID &ID, - Init *V, - const std::string &VN, - ArrayRef<Init *> ArgRange, - ArrayRef<std::string> NameRange) { +static void ProfileDagInit(FoldingSetNodeID &ID, Init *V, const std::string &VN, + ArrayRef<Init *> ArgRange, + ArrayRef<std::string> NameRange) { ID.AddPointer(V); ID.AddString(VN); diff --git a/lib/TableGen/TGLexer.cpp b/lib/TableGen/TGLexer.cpp index d733f14..c6be4f8 100644 --- a/lib/TableGen/TGLexer.cpp +++ b/lib/TableGen/TGLexer.cpp @@ -309,7 +309,15 @@ bool TGLexer::LexInclude() { return true; } - Dependencies.push_back(IncludedFile); + DependenciesMapTy::const_iterator Found = Dependencies.find(IncludedFile); + if (Found != Dependencies.end()) { + PrintError(getLoc(), + "File '" + IncludedFile + "' has already been included."); + SrcMgr.PrintMessage(Found->second, SourceMgr::DK_Note, + "previously included here"); + return true; + } + Dependencies.insert(std::make_pair(IncludedFile, getLoc())); // Save the line number and lex buffer of the includer. 
CurBuf = SrcMgr.getMemoryBuffer(CurBuffer); CurPtr = CurBuf->getBufferStart(); @@ -462,6 +470,7 @@ tgtok::TokKind TGLexer::LexExclaim() { .Case("head", tgtok::XHead) .Case("tail", tgtok::XTail) .Case("con", tgtok::XConcat) + .Case("add", tgtok::XADD) .Case("shl", tgtok::XSHL) .Case("sra", tgtok::XSRA) .Case("srl", tgtok::XSRL) diff --git a/lib/TableGen/TGLexer.h b/lib/TableGen/TGLexer.h index e2e116b..d1bd70d 100644 --- a/lib/TableGen/TGLexer.h +++ b/lib/TableGen/TGLexer.h @@ -15,9 +15,10 @@ #define TGLEXER_H #include "llvm/Support/DataTypes.h" +#include "llvm/Support/SMLoc.h" #include <cassert> +#include <map> #include <string> -#include <vector> namespace llvm { class MemoryBuffer; @@ -46,7 +47,7 @@ namespace tgtok { MultiClass, String, // !keywords. - XConcat, XSRA, XSRL, XSHL, XStrConcat, XCast, XSubst, + XConcat, XADD, XSRA, XSRL, XSHL, XStrConcat, XCast, XSubst, XForEach, XHead, XTail, XEmpty, XIf, XEq, // Integer value. @@ -73,9 +74,13 @@ class TGLexer { /// CurBuffer - This is the current buffer index we're lexing from as managed /// by the SourceMgr object. int CurBuffer; + +public: + typedef std::map<std::string, SMLoc> DependenciesMapTy; +private: /// Dependencies - This is the list of all included files. - std::vector<std::string> Dependencies; - + DependenciesMapTy Dependencies; + public: TGLexer(SourceMgr &SrcMgr); ~TGLexer() {} @@ -84,7 +89,7 @@ public: return CurCode = LexToken(); } - const std::vector<std::string> &getDependencies() const { + const DependenciesMapTy &getDependencies() const { return Dependencies; } diff --git a/lib/TableGen/TGParser.cpp b/lib/TableGen/TGParser.cpp index 17f0abc..c4b48fe 100644 --- a/lib/TableGen/TGParser.cpp +++ b/lib/TableGen/TGParser.cpp @@ -26,7 +26,7 @@ using namespace llvm; namespace llvm { struct SubClassReference { - SMLoc RefLoc; + SMRange RefRange; Record *Rec; std::vector<Init*> TemplateArgs; SubClassReference() : Rec(0) {} @@ -35,7 +35,7 @@ struct SubClassReference { }; struct SubMultiClassReference { - SMLoc RefLoc; + SMRange RefRange; MultiClass *MC; std::vector<Init*> TemplateArgs; SubMultiClassReference() : MC(0) {} @@ -150,22 +150,23 @@ bool TGParser::AddSubClass(Record *CurRec, SubClassReference &SubClass) { // Add all of the values in the subclass into the current class. const std::vector<RecordVal> &Vals = SC->getValues(); for (unsigned i = 0, e = Vals.size(); i != e; ++i) - if (AddValue(CurRec, SubClass.RefLoc, Vals[i])) + if (AddValue(CurRec, SubClass.RefRange.Start, Vals[i])) return true; const std::vector<Init *> &TArgs = SC->getTemplateArgs(); // Ensure that an appropriate number of template arguments are specified. if (TArgs.size() < SubClass.TemplateArgs.size()) - return Error(SubClass.RefLoc, "More template args specified than expected"); + return Error(SubClass.RefRange.Start, + "More template args specified than expected"); // Loop over all of the template arguments, setting them to the specified // value or leaving them as the default if necessary. for (unsigned i = 0, e = TArgs.size(); i != e; ++i) { if (i < SubClass.TemplateArgs.size()) { // If a value is specified for this template arg, set it now. - if (SetValue(CurRec, SubClass.RefLoc, TArgs[i], std::vector<unsigned>(), - SubClass.TemplateArgs[i])) + if (SetValue(CurRec, SubClass.RefRange.Start, TArgs[i], + std::vector<unsigned>(), SubClass.TemplateArgs[i])) return true; // Resolve it next. 
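[Editor's note: the TGLexer change a few hunks back replaces the flat std::vector of dependencies with a map from filename to the SMLoc of its first inclusion, which is what lets a second include of the same file be reported as an error with a "previously included here" note. A reduced sketch of that check in plain C++; SMLoc stands in as a line number here, purely for illustration:]

#include <cstdio>
#include <map>
#include <string>

// filename -> line of first inclusion (TGLexer uses SMLoc instead of int)
typedef std::map<std::string, int> DependenciesMapTy;

static bool recordInclude(DependenciesMapTy &Deps, const std::string &File,
                          int Line) {
  DependenciesMapTy::const_iterator Found = Deps.find(File);
  if (Found != Deps.end()) {
    // Mirrors TGLexer::LexInclude: error at the new site, note at the first.
    std::fprintf(stderr, "error: File '%s' has already been included.\n",
                 File.c_str());
    std::fprintf(stderr, "note: previously included at line %d\n",
                 Found->second);
    return true; // failure, matching LexInclude's 'return true' path
  }
  Deps.insert(std::make_pair(File, Line));
  return false;
}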
@@ -175,7 +176,8 @@ bool TGParser::AddSubClass(Record *CurRec, SubClassReference &SubClass) { CurRec->removeValue(TArgs[i]); } else if (!CurRec->getValue(TArgs[i])->getValue()->isComplete()) { - return Error(SubClass.RefLoc,"Value not specified for template argument #" + return Error(SubClass.RefRange.Start, + "Value not specified for template argument #" + utostr(i) + " (" + TArgs[i]->getAsUnquotedString() + ") of subclass '" + SC->getNameInitAsString() + "'!"); } @@ -184,17 +186,18 @@ bool TGParser::AddSubClass(Record *CurRec, SubClassReference &SubClass) { // Since everything went well, we can now set the "superclass" list for the // current record. const std::vector<Record*> &SCs = SC->getSuperClasses(); + ArrayRef<SMRange> SCRanges = SC->getSuperClassRanges(); for (unsigned i = 0, e = SCs.size(); i != e; ++i) { if (CurRec->isSubClassOf(SCs[i])) - return Error(SubClass.RefLoc, + return Error(SubClass.RefRange.Start, "Already subclass of '" + SCs[i]->getName() + "'!\n"); - CurRec->addSuperClass(SCs[i]); + CurRec->addSuperClass(SCs[i], SCRanges[i]); } if (CurRec->isSubClassOf(SC)) - return Error(SubClass.RefLoc, + return Error(SubClass.RefRange.Start, "Already subclass of '" + SC->getName() + "'!\n"); - CurRec->addSuperClass(SC); + CurRec->addSuperClass(SC, SubClass.RefRange); return false; } @@ -211,7 +214,7 @@ bool TGParser::AddSubMultiClass(MultiClass *CurMC, // Add all of the values in the subclass into the current class. const std::vector<RecordVal> &SMCVals = SMC->Rec.getValues(); for (unsigned i = 0, e = SMCVals.size(); i != e; ++i) - if (AddValue(CurRec, SubMultiClass.RefLoc, SMCVals[i])) + if (AddValue(CurRec, SubMultiClass.RefRange.Start, SMCVals[i])) return true; int newDefStart = CurMC->DefPrototypes.size(); @@ -226,7 +229,7 @@ bool TGParser::AddSubMultiClass(MultiClass *CurMC, // Add all of the values in the superclass into the current def. for (unsigned i = 0, e = MCVals.size(); i != e; ++i) - if (AddValue(NewDef, SubMultiClass.RefLoc, MCVals[i])) + if (AddValue(NewDef, SubMultiClass.RefRange.Start, MCVals[i])) return true; CurMC->DefPrototypes.push_back(NewDef); @@ -237,7 +240,7 @@ bool TGParser::AddSubMultiClass(MultiClass *CurMC, // Ensure that an appropriate number of template arguments are // specified. if (SMCTArgs.size() < SubMultiClass.TemplateArgs.size()) - return Error(SubMultiClass.RefLoc, + return Error(SubMultiClass.RefRange.Start, "More template args specified than expected"); // Loop over all of the template arguments, setting them to the specified @@ -246,7 +249,7 @@ bool TGParser::AddSubMultiClass(MultiClass *CurMC, if (i < SubMultiClass.TemplateArgs.size()) { // If a value is specified for this template arg, set it in the // superclass now. 
- if (SetValue(CurRec, SubMultiClass.RefLoc, SMCTArgs[i], + if (SetValue(CurRec, SubMultiClass.RefRange.Start, SMCTArgs[i], std::vector<unsigned>(), SubMultiClass.TemplateArgs[i])) return true; @@ -266,7 +269,7 @@ bool TGParser::AddSubMultiClass(MultiClass *CurMC, ++j) { Record *Def = *j; - if (SetValue(Def, SubMultiClass.RefLoc, SMCTArgs[i], + if (SetValue(Def, SubMultiClass.RefRange.Start, SMCTArgs[i], std::vector<unsigned>(), SubMultiClass.TemplateArgs[i])) return true; @@ -278,7 +281,7 @@ bool TGParser::AddSubMultiClass(MultiClass *CurMC, Def->removeValue(SMCTArgs[i]); } } else if (!CurRec->getValue(SMCTArgs[i])->getValue()->isComplete()) { - return Error(SubMultiClass.RefLoc, + return Error(SubMultiClass.RefRange.Start, "Value not specified for template argument #" + utostr(i) + " (" + SMCTArgs[i]->getAsUnquotedString() + ") of subclass '" + SMC->Rec.getNameInitAsString() + "'!"); @@ -379,11 +382,12 @@ static bool isObjectStart(tgtok::TokKind K) { static std::string GetNewAnonymousName() { static unsigned AnonCounter = 0; - return "anonymous."+utostr(AnonCounter++); + unsigned Tmp = AnonCounter++; // MSVC2012 ICEs without this. + return "anonymous." + utostr(Tmp); } /// ParseObjectName - If an object name is specified, return it. Otherwise, -/// return an anonymous name. +/// return 0. /// ObjectName ::= Value [ '#' Value ]* /// ObjectName ::= /*empty*/ /// @@ -395,7 +399,7 @@ Init *TGParser::ParseObjectName(MultiClass *CurMultiClass) { // These are all of the tokens that can begin an object body. // Some of these can also begin values but we disallow those cases // because they are unlikely to be useful. - return StringInit::get(GetNewAnonymousName()); + return 0; default: break; } @@ -443,35 +447,18 @@ Record *TGParser::ParseClassID() { /// MultiClass *TGParser::ParseMultiClassID() { if (Lex.getCode() != tgtok::Id) { - TokError("expected name for ClassID"); + TokError("expected name for MultiClassID"); return 0; } MultiClass *Result = MultiClasses[Lex.getCurStrVal()]; if (Result == 0) - TokError("Couldn't find class '" + Lex.getCurStrVal() + "'"); - - Lex.Lex(); - return Result; -} - -Record *TGParser::ParseDefmID() { - if (Lex.getCode() != tgtok::Id) { - TokError("expected multiclass name"); - return 0; - } - - MultiClass *MC = MultiClasses[Lex.getCurStrVal()]; - if (MC == 0) { TokError("Couldn't find multiclass '" + Lex.getCurStrVal() + "'"); - return 0; - } Lex.Lex(); - return &MC->Rec; + return Result; } - /// ParseSubClassReference - Parse a reference to a subclass or to a templated /// subclass. This returns a SubClassRefTy with a null Record* on error. /// @@ -481,17 +468,21 @@ Record *TGParser::ParseDefmID() { SubClassReference TGParser:: ParseSubClassReference(Record *CurRec, bool isDefm) { SubClassReference Result; - Result.RefLoc = Lex.getLoc(); + Result.RefRange.Start = Lex.getLoc(); - if (isDefm) - Result.Rec = ParseDefmID(); - else + if (isDefm) { + if (MultiClass *MC = ParseMultiClassID()) + Result.Rec = &MC->Rec; + } else { Result.Rec = ParseClassID(); + } if (Result.Rec == 0) return Result; // If there is no template arg list, we're done. 
- if (Lex.getCode() != tgtok::less) + if (Lex.getCode() != tgtok::less) { + Result.RefRange.End = Lex.getLoc(); return Result; + } Lex.Lex(); // Eat the '<' if (Lex.getCode() == tgtok::greater) { @@ -512,6 +503,7 @@ ParseSubClassReference(Record *CurRec, bool isDefm) { return Result; } Lex.Lex(); + Result.RefRange.End = Lex.getLoc(); return Result; } @@ -526,14 +518,16 @@ ParseSubClassReference(Record *CurRec, bool isDefm) { SubMultiClassReference TGParser:: ParseSubMultiClassReference(MultiClass *CurMC) { SubMultiClassReference Result; - Result.RefLoc = Lex.getLoc(); + Result.RefRange.Start = Lex.getLoc(); Result.MC = ParseMultiClassID(); if (Result.MC == 0) return Result; // If there is no template arg list, we're done. - if (Lex.getCode() != tgtok::less) + if (Lex.getCode() != tgtok::less) { + Result.RefRange.End = Lex.getLoc(); return Result; + } Lex.Lex(); // Eat the '<' if (Lex.getCode() == tgtok::greater) { @@ -554,6 +548,7 @@ ParseSubMultiClassReference(MultiClass *CurMC) { return Result; } Lex.Lex(); + Result.RefRange.End = Lex.getLoc(); return Result; } @@ -918,6 +913,7 @@ Init *TGParser::ParseOperation(Record *CurRec) { } case tgtok::XConcat: + case tgtok::XADD: case tgtok::XSRA: case tgtok::XSRL: case tgtok::XSHL: @@ -933,6 +929,7 @@ Init *TGParser::ParseOperation(Record *CurRec) { switch (OpTok) { default: llvm_unreachable("Unhandled code!"); case tgtok::XConcat: Code = BinOpInit::CONCAT;Type = DagRecTy::get(); break; + case tgtok::XADD: Code = BinOpInit::ADD; Type = IntRecTy::get(); break; case tgtok::XSRA: Code = BinOpInit::SRA; Type = IntRecTy::get(); break; case tgtok::XSRL: Code = BinOpInit::SRL; Type = IntRecTy::get(); break; case tgtok::XSHL: Code = BinOpInit::SHL; Type = IntRecTy::get(); break; @@ -1148,6 +1145,7 @@ RecTy *TGParser::ParseOperatorType() { /// SimpleValue ::= '[' ValueList ']' /// SimpleValue ::= '(' IDValue DagArgList ')' /// SimpleValue ::= CONCATTOK '(' Value ',' Value ')' +/// SimpleValue ::= ADDTOK '(' Value ',' Value ')' /// SimpleValue ::= SHLTOK '(' Value ',' Value ')' /// SimpleValue ::= SRATOK '(' Value ',' Value ')' /// SimpleValue ::= SRLTOK '(' Value ',' Value ')' @@ -1214,14 +1212,16 @@ Init *TGParser::ParseSimpleValue(Record *CurRec, RecTy *ItemType, return 0; } Lex.Lex(); // eat the '>' + SMLoc EndLoc = Lex.getLoc(); // Create the new record, set it as CurRec temporarily. static unsigned AnonCounter = 0; Record *NewRec = new Record("anonymous.val."+utostr(AnonCounter++), NameLoc, - Records); + Records, + /*IsAnonymous=*/true); SubClassReference SCRef; - SCRef.RefLoc = NameLoc; + SCRef.RefRange = SMRange(NameLoc, EndLoc); SCRef.Rec = Class; SCRef.TemplateArgs = ValueList; // Add info about the subclass to NewRec. @@ -1401,6 +1401,7 @@ Init *TGParser::ParseSimpleValue(Record *CurRec, RecTy *ItemType, case tgtok::XEmpty: case tgtok::XCast: // Value ::= !unop '(' Value ')' case tgtok::XConcat: + case tgtok::XADD: case tgtok::XSRA: case tgtok::XSRL: case tgtok::XSHL: @@ -1876,6 +1877,17 @@ bool TGParser::ParseBody(Record *CurRec) { return false; } +/// \brief Apply the current let bindings to \a CurRec. +/// \returns true on error, false otherwise. +bool TGParser::ApplyLetStack(Record *CurRec) { + for (unsigned i = 0, e = LetStack.size(); i != e; ++i) + for (unsigned j = 0, e = LetStack[i].size(); j != e; ++j) + if (SetValue(CurRec, LetStack[i][j].Loc, LetStack[i][j].Name, + LetStack[i][j].Bits, LetStack[i][j].Value)) + return true; + return false; +} + /// ParseObjectBody - Parse the body of a def or class. 
This consists of an /// optional ClassList followed by a Body. CurRec is the current def or class /// that is being parsed. @@ -1906,12 +1918,8 @@ bool TGParser::ParseObjectBody(Record *CurRec) { } } - // Process any variables on the let stack. - for (unsigned i = 0, e = LetStack.size(); i != e; ++i) - for (unsigned j = 0, e = LetStack[i].size(); j != e; ++j) - if (SetValue(CurRec, LetStack[i][j].Loc, LetStack[i][j].Name, - LetStack[i][j].Bits, LetStack[i][j].Value)) - return true; + if (ApplyLetStack(CurRec)) + return true; return ParseBody(CurRec); } @@ -1927,7 +1935,13 @@ bool TGParser::ParseDef(MultiClass *CurMultiClass) { Lex.Lex(); // Eat the 'def' token. // Parse ObjectName and make a record for it. - Record *CurRec = new Record(ParseObjectName(CurMultiClass), DefLoc, Records); + Record *CurRec; + Init *Name = ParseObjectName(CurMultiClass); + if (Name) + CurRec = new Record(Name, DefLoc, Records); + else + CurRec = new Record(GetNewAnonymousName(), DefLoc, Records, + /*IsAnonymous=*/true); if (!CurMultiClass && Loops.empty()) { // Top-level def definition. @@ -2160,7 +2174,12 @@ bool TGParser::ParseTopLevelLet(MultiClass *CurMultiClass) { /// ParseMultiClass - Parse a multiclass definition. /// /// MultiClassInst ::= MULTICLASS ID TemplateArgList? -/// ':' BaseMultiClassList '{' MultiClassDef+ '}' +/// ':' BaseMultiClassList '{' MultiClassObject+ '}' +/// MultiClassObject ::= DefInst +/// MultiClassObject ::= MultiClassInst +/// MultiClassObject ::= DefMInst +/// MultiClassObject ::= LETCommand '{' ObjectList '}' +/// MultiClassObject ::= LETCommand Object /// bool TGParser::ParseMultiClass() { assert(Lex.getCode() == tgtok::MultiClass && "Unexpected token"); @@ -2242,7 +2261,7 @@ Record *TGParser:: InstantiateMulticlassDef(MultiClass &MC, Record *DefProto, Init *DefmPrefix, - SMLoc DefmPrefixLoc) { + SMRange DefmPrefixRange) { // We need to preserve DefProto so it can be reused for later // instantiations, so create a new Record to inherit from it. @@ -2251,8 +2270,11 @@ InstantiateMulticlassDef(MultiClass &MC, // name, substitute the prefix for #NAME#. Otherwise, use the defm name // as a prefix. - if (DefmPrefix == 0) + bool IsAnonymous = false; + if (DefmPrefix == 0) { DefmPrefix = StringInit::get(GetNewAnonymousName()); + IsAnonymous = true; + } Init *DefName = DefProto->getNameInit(); @@ -2269,21 +2291,21 @@ InstantiateMulticlassDef(MultiClass &MC, } // Make a trail of SMLocs from the multiclass instantiations. - SmallVector<SMLoc, 4> Locs(1, DefmPrefixLoc); + SmallVector<SMLoc, 4> Locs(1, DefmPrefixRange.Start); Locs.append(DefProto->getLoc().begin(), DefProto->getLoc().end()); - Record *CurRec = new Record(DefName, Locs, Records); + Record *CurRec = new Record(DefName, Locs, Records, IsAnonymous); SubClassReference Ref; - Ref.RefLoc = DefmPrefixLoc; + Ref.RefRange = DefmPrefixRange; Ref.Rec = DefProto; AddSubClass(CurRec, Ref); // Set the value for NAME. We don't resolve references to it 'til later, // though, so that uses in nested multiclass names don't get // confused. - if (SetValue(CurRec, Ref.RefLoc, "NAME", std::vector<unsigned>(), + if (SetValue(CurRec, Ref.RefRange.Start, "NAME", std::vector<unsigned>(), DefmPrefix)) { - Error(DefmPrefixLoc, "Could not resolve " + Error(DefmPrefixRange.Start, "Could not resolve " + CurRec->getNameInitAsString() + ":NAME to '" + DefmPrefix->getAsUnquotedString() + "'"); return 0; @@ -2314,7 +2336,7 @@ InstantiateMulticlassDef(MultiClass &MC, // Ensure redefinition doesn't happen. 
if (Records.getDef(CurRec->getNameInitAsString())) { - Error(DefmPrefixLoc, "def '" + CurRec->getNameInitAsString() + + Error(DefmPrefixRange.Start, "def '" + CurRec->getNameInitAsString() + "' already defined, instantiating defm with subdef '" + DefProto->getNameInitAsString() + "'"); return 0; @@ -2365,33 +2387,30 @@ bool TGParser::ResolveMulticlassDef(MultiClass &MC, Record *DefProto, SMLoc DefmPrefixLoc) { // If the mdef is inside a 'let' expression, add to each def. - for (unsigned i = 0, e = LetStack.size(); i != e; ++i) - for (unsigned j = 0, e = LetStack[i].size(); j != e; ++j) - if (SetValue(CurRec, LetStack[i][j].Loc, LetStack[i][j].Name, - LetStack[i][j].Bits, LetStack[i][j].Value)) - return Error(DefmPrefixLoc, "when instantiating this defm"); + if (ApplyLetStack(CurRec)) + return Error(DefmPrefixLoc, "when instantiating this defm"); // Don't create a top level definition for defm inside multiclasses, // instead, only update the prototypes and bind the template args // with the new created definition. - if (CurMultiClass) { - for (unsigned i = 0, e = CurMultiClass->DefPrototypes.size(); - i != e; ++i) - if (CurMultiClass->DefPrototypes[i]->getNameInit() - == CurRec->getNameInit()) - return Error(DefmPrefixLoc, "defm '" + CurRec->getNameInitAsString() + - "' already defined in this multiclass!"); - CurMultiClass->DefPrototypes.push_back(CurRec); + if (!CurMultiClass) + return false; + for (unsigned i = 0, e = CurMultiClass->DefPrototypes.size(); + i != e; ++i) + if (CurMultiClass->DefPrototypes[i]->getNameInit() + == CurRec->getNameInit()) + return Error(DefmPrefixLoc, "defm '" + CurRec->getNameInitAsString() + + "' already defined in this multiclass!"); + CurMultiClass->DefPrototypes.push_back(CurRec); - // Copy the template arguments for the multiclass into the new def. - const std::vector<Init *> &TA = - CurMultiClass->Rec.getTemplateArgs(); + // Copy the template arguments for the multiclass into the new def. + const std::vector<Init *> &TA = + CurMultiClass->Rec.getTemplateArgs(); - for (unsigned i = 0, e = TA.size(); i != e; ++i) { - const RecordVal *RV = CurMultiClass->Rec.getValue(TA[i]); - assert(RV && "Template arg doesn't exist?"); - CurRec->addValue(*RV); - } + for (unsigned i = 0, e = TA.size(); i != e; ++i) { + const RecordVal *RV = CurMultiClass->Rec.getValue(TA[i]); + assert(RV && "Template arg doesn't exist?"); + CurRec->addValue(*RV); } return false; @@ -2403,14 +2422,14 @@ bool TGParser::ResolveMulticlassDef(MultiClass &MC, /// bool TGParser::ParseDefm(MultiClass *CurMultiClass) { assert(Lex.getCode() == tgtok::Defm && "Unexpected token!"); - + SMLoc DefmLoc = Lex.getLoc(); Init *DefmPrefix = 0; if (Lex.Lex() == tgtok::Id) { // eat the defm. 
DefmPrefix = ParseObjectName(CurMultiClass); } - SMLoc DefmPrefixLoc = Lex.getLoc(); + SMLoc DefmPrefixEndLoc = Lex.getLoc(); if (Lex.getCode() != tgtok::colon) return TokError("expected ':' after defm identifier"); @@ -2446,15 +2465,17 @@ bool TGParser::ParseDefm(MultiClass *CurMultiClass) { for (unsigned i = 0, e = MC->DefPrototypes.size(); i != e; ++i) { Record *DefProto = MC->DefPrototypes[i]; - Record *CurRec = InstantiateMulticlassDef(*MC, DefProto, DefmPrefix, DefmPrefixLoc); + Record *CurRec = InstantiateMulticlassDef(*MC, DefProto, DefmPrefix, + SMRange(DefmLoc, + DefmPrefixEndLoc)); if (!CurRec) return true; - if (ResolveMulticlassDefArgs(*MC, CurRec, DefmPrefixLoc, SubClassLoc, + if (ResolveMulticlassDefArgs(*MC, CurRec, DefmLoc, SubClassLoc, TArgs, TemplateVals, true/*Delete args*/)) return Error(SubClassLoc, "could not instantiate def"); - if (ResolveMulticlassDef(*MC, CurRec, DefProto, DefmPrefixLoc)) + if (ResolveMulticlassDef(*MC, CurRec, DefProto, DefmLoc)) return Error(SubClassLoc, "could not instantiate def"); NewRecDefs.push_back(CurRec); @@ -2493,12 +2514,8 @@ bool TGParser::ParseDefm(MultiClass *CurMultiClass) { if (AddSubClass(CurRec, SubClass)) return true; - // Process any variables on the let stack. - for (unsigned i = 0, e = LetStack.size(); i != e; ++i) - for (unsigned j = 0, e = LetStack[i].size(); j != e; ++j) - if (SetValue(CurRec, LetStack[i][j].Loc, LetStack[i][j].Name, - LetStack[i][j].Bits, LetStack[i][j].Value)) - return true; + if (ApplyLetStack(CurRec)) + return true; } if (Lex.getCode() != tgtok::comma) break; diff --git a/lib/TableGen/TGParser.h b/lib/TableGen/TGParser.h index 0ea962b..044e3a0 100644 --- a/lib/TableGen/TGParser.h +++ b/lib/TableGen/TGParser.h @@ -96,7 +96,7 @@ public: bool TokError(const Twine &Msg) const { return Error(Lex.getLoc(), Msg); } - const std::vector<std::string> &getDependencies() const { + const TGLexer::DependenciesMapTy &getDependencies() const { return Lex.getDependencies(); } @@ -134,7 +134,7 @@ private: // Parser methods. Record *InstantiateMulticlassDef(MultiClass &MC, Record *DefProto, Init *DefmPrefix, - SMLoc DefmPrefixLoc); + SMRange DefmPrefixRange); bool ResolveMulticlassDefArgs(MultiClass &MC, Record *DefProto, SMLoc DefmPrefixLoc, @@ -183,7 +183,7 @@ private: // Parser methods. 
Init *ParseObjectName(MultiClass *CurMultiClass); Record *ParseClassID(); MultiClass *ParseMultiClassID(); - Record *ParseDefmID(); + bool ApplyLetStack(Record *CurRec); }; } // end namespace llvm diff --git a/lib/TableGen/TableGenBackend.cpp b/lib/TableGen/TableGenBackend.cpp index 7c8367a..79d5677 100644 --- a/lib/TableGen/TableGenBackend.cpp +++ b/lib/TableGen/TableGenBackend.cpp @@ -14,13 +14,20 @@ #include "llvm/ADT/Twine.h" #include "llvm/Support/raw_ostream.h" #include "llvm/TableGen/TableGenBackend.h" +#include <algorithm> + using namespace llvm; +const size_t MAX_LINE_LEN = 80U; + static void printLine(raw_ostream &OS, const Twine &Prefix, char Fill, StringRef Suffix) { - uint64_t Pos = OS.tell(); + size_t Pos = (size_t)OS.tell(); + assert((MAX_LINE_LEN - Prefix.str().size() - Suffix.size() > 0) && + "header line exceeds max limit"); OS << Prefix; - for (unsigned i = OS.tell() - Pos, e = 80 - Suffix.size(); i != e; ++i) + const size_t e = MAX_LINE_LEN - Suffix.size(); + for (size_t i = (size_t)OS.tell() - Pos; i < e; ++i) OS << Fill; OS << Suffix << '\n'; } @@ -28,10 +35,22 @@ static void printLine(raw_ostream &OS, const Twine &Prefix, char Fill, void llvm::emitSourceFileHeader(StringRef Desc, raw_ostream &OS) { printLine(OS, "/*===- TableGen'erated file ", '-', "*- C++ -*-===*\\"); printLine(OS, "|*", ' ', "*|"); - printLine(OS, "|* " + Desc, ' ', "*|"); - printLine(OS, "|*", ' ', "*|"); - printLine(OS, "|* Automatically generated file, do not edit!", ' ', "*|"); - printLine(OS, "|*", ' ', "*|"); + size_t Pos = 0U; + size_t PosE; + StringRef Prefix("|*"); + StringRef Suffix(" *|"); + do{ + size_t PSLen = Suffix.size() + Prefix.size(); + PosE = Pos + ((MAX_LINE_LEN > (Desc.size() - PSLen)) ? + Desc.size() : + MAX_LINE_LEN - PSLen); + printLine(OS, Prefix + Desc.slice(Pos, PosE), ' ', Suffix); + Pos = PosE; + } while(Pos < Desc.size()); + printLine(OS, Prefix, ' ', Suffix); + printLine(OS, Prefix + " Automatically generated file, do not edit!", ' ', + Suffix); + printLine(OS, Prefix, ' ', Suffix); printLine(OS, "\\*===", '-', "===*/"); OS << '\n'; } diff --git a/lib/Target/AArch64/AArch64.h b/lib/Target/AArch64/AArch64.h new file mode 100644 index 0000000..4de4faa --- /dev/null +++ b/lib/Target/AArch64/AArch64.h @@ -0,0 +1,42 @@ +//==-- AArch64.h - Top-level interface for AArch64 representation -*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the entry points for global functions defined in the LLVM +// AArch64 back-end. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TARGET_AARCH64_H +#define LLVM_TARGET_AARCH64_H + +#include "MCTargetDesc/AArch64MCTargetDesc.h" +#include "llvm/Target/TargetMachine.h" + +namespace llvm { + +class AArch64AsmPrinter; +class FunctionPass; +class AArch64TargetMachine; +class MachineInstr; +class MCInst; + +FunctionPass *createAArch64ISelDAG(AArch64TargetMachine &TM, + CodeGenOpt::Level OptLevel); + +FunctionPass *createAArch64CleanupLocalDynamicTLSPass(); + +FunctionPass *createAArch64BranchFixupPass(); + +void LowerAArch64MachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI, + AArch64AsmPrinter &AP); + + +} + +#endif diff --git a/lib/Target/AArch64/AArch64.td b/lib/Target/AArch64/AArch64.td new file mode 100644 index 0000000..e17052b --- /dev/null +++ b/lib/Target/AArch64/AArch64.td @@ -0,0 +1,70 @@ +//===- AArch64.td - Describe the AArch64 Target Machine -------*- tblgen -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This is the top level entry point for the AArch64 target. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Target-independent interfaces +//===----------------------------------------------------------------------===// + +include "llvm/Target/Target.td" + +//===----------------------------------------------------------------------===// +// AArch64 Subtarget features. +// + +def FeatureNEON : SubtargetFeature<"neon", "HasNEON", "true", + "Enable Advanced SIMD instructions">; + +def FeatureCrypto : SubtargetFeature<"crypto", "HasCrypto", "true", + "Enable cryptographic instructions">; + +//===----------------------------------------------------------------------===// +// AArch64 Processors +// + +include "AArch64Schedule.td" + +def : Processor<"generic", GenericItineraries, [FeatureNEON, FeatureCrypto]>; + +//===----------------------------------------------------------------------===// +// Register File Description +//===----------------------------------------------------------------------===// + +include "AArch64RegisterInfo.td" + +include "AArch64CallingConv.td" + +//===----------------------------------------------------------------------===// +// Instruction Descriptions +//===----------------------------------------------------------------------===// + +include "AArch64InstrInfo.td" + +def AArch64InstrInfo : InstrInfo; + +//===----------------------------------------------------------------------===// +// Assembly printer +//===----------------------------------------------------------------------===// + +def A64InstPrinter : AsmWriter { + string AsmWriterClassName = "InstPrinter"; + bit isMCAsmWriter = 1; +} + +//===----------------------------------------------------------------------===// +// Declare the target which we are implementing +//===----------------------------------------------------------------------===// + +def AArch64 : Target { + let InstructionSet = AArch64InstrInfo; + let AssemblyWriters = [A64InstPrinter]; +} diff --git a/lib/Target/AArch64/AArch64AsmPrinter.cpp b/lib/Target/AArch64/AArch64AsmPrinter.cpp new file mode 100644 index 0000000..47ebb82 --- /dev/null +++ b/lib/Target/AArch64/AArch64AsmPrinter.cpp @@ -0,0 +1,347 @@ +//===-- 
AArch64AsmPrinter.cpp - Print machine code to an AArch64 .s file --===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains a printer that converts from our internal representation +// of machine-dependent LLVM code to GAS-format AArch64 assembly language. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "asm-printer" +#include "AArch64AsmPrinter.h" +#include "InstPrinter/AArch64InstPrinter.h" +#include "llvm/DebugInfo.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/CodeGen/MachineModuleInfoImpls.h" +#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Target/Mangler.h" + +using namespace llvm; + +MachineLocation +AArch64AsmPrinter::getDebugValueLocation(const MachineInstr *MI) const { + // See emitFrameIndexDebugValue in InstrInfo for where this instruction is + // expected to be created. + assert(MI->getNumOperands() == 4 && MI->getOperand(0).isReg() + && MI->getOperand(1).isImm() && "unexpected custom DBG_VALUE"); + return MachineLocation(MI->getOperand(0).getReg(), + MI->getOperand(1).getImm()); +} + +/// Try to print a floating-point register as if it belonged to a specified +/// register-class. For example the inline asm operand modifier "b" requires its +/// argument to be printed as "bN". +static bool printModifiedFPRAsmOperand(const MachineOperand &MO, + const TargetRegisterInfo *TRI, + const TargetRegisterClass &RegClass, + raw_ostream &O) { + if (!MO.isReg()) + return true; + + for (MCRegAliasIterator AR(MO.getReg(), TRI, true); AR.isValid(); ++AR) { + if (RegClass.contains(*AR)) { + O << AArch64InstPrinter::getRegisterName(*AR); + return false; + } + } + return true; +} + +/// Implements the 'w' and 'x' inline asm operand modifiers, which print a GPR +/// with the obvious type and an immediate 0 as either wzr or xzr. +static bool printModifiedGPRAsmOperand(const MachineOperand &MO, + const TargetRegisterInfo *TRI, + const TargetRegisterClass &RegClass, + raw_ostream &O) { + char Prefix = &RegClass == &AArch64::GPR32RegClass ? 'w' : 'x'; + + if (MO.isImm() && MO.getImm() == 0) { + O << Prefix << "zr"; + return false; + } else if (MO.isReg()) { + if (MO.getReg() == AArch64::XSP || MO.getReg() == AArch64::WSP) { + O << (Prefix == 'x' ? "sp" : "wsp"); + return false; + } + + for (MCRegAliasIterator AR(MO.getReg(), TRI, true); AR.isValid(); ++AR) { + if (RegClass.contains(*AR)) { + O << AArch64InstPrinter::getRegisterName(*AR); + return false; + } + } + } + + return true; +} + +bool AArch64AsmPrinter::printSymbolicAddress(const MachineOperand &MO, + bool PrintImmediatePrefix, + StringRef Suffix, raw_ostream &O) { + StringRef Name; + StringRef Modifier; + switch (MO.getType()) { + default: + llvm_unreachable("Unexpected operand for symbolic address constraint"); + case MachineOperand::MO_GlobalAddress: + Name = Mang->getSymbol(MO.getGlobal())->getName(); + + // Global variables may be accessed either via a GOT or in various fun and + // interesting TLS-model specific ways. Set the prefix modifier as + // appropriate here. 
+ if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(MO.getGlobal())) { + Reloc::Model RelocM = TM.getRelocationModel(); + if (GV->isThreadLocal()) { + switch (TM.getTLSModel(GV)) { + case TLSModel::GeneralDynamic: + Modifier = "tlsdesc"; + break; + case TLSModel::LocalDynamic: + Modifier = "dtprel"; + break; + case TLSModel::InitialExec: + Modifier = "gottprel"; + break; + case TLSModel::LocalExec: + Modifier = "tprel"; + break; + } + } else if (Subtarget->GVIsIndirectSymbol(GV, RelocM)) { + Modifier = "got"; + } + } + break; + case MachineOperand::MO_BlockAddress: + Name = GetBlockAddressSymbol(MO.getBlockAddress())->getName(); + break; + case MachineOperand::MO_ExternalSymbol: + Name = MO.getSymbolName(); + break; + case MachineOperand::MO_ConstantPoolIndex: + Name = GetCPISymbol(MO.getIndex())->getName(); + break; + } + + // Some instructions (notably ADRP) don't take the # prefix for + // immediates. Only print it if asked to. + if (PrintImmediatePrefix) + O << '#'; + + // Only need the joining "_" if both the prefix and the suffix are + // non-null. This little block simply takes care of the four possible + // combinations involved there. + if (Modifier == "" && Suffix == "") + O << Name; + else if (Modifier == "" && Suffix != "") + O << ":" << Suffix << ':' << Name; + else if (Modifier != "" && Suffix == "") + O << ":" << Modifier << ':' << Name; + else + O << ":" << Modifier << '_' << Suffix << ':' << Name; + + return false; +} + +bool AArch64AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, + unsigned AsmVariant, + const char *ExtraCode, raw_ostream &O) { + const TargetRegisterInfo *TRI = MF->getTarget().getRegisterInfo(); + if (!ExtraCode || !ExtraCode[0]) { + // There's actually no operand modifier, which leads to a slightly eclectic + // set of behaviour which we have to handle here. + const MachineOperand &MO = MI->getOperand(OpNum); + switch (MO.getType()) { + default: + llvm_unreachable("Unexpected operand for inline assembly"); + case MachineOperand::MO_Register: + // GCC prints the unmodified operand of a 'w' constraint as the vector + // register. Technically, we could allocate the argument as a VPR128, but + // that leads to extremely dodgy copies being generated to get the data + // there. + if (printModifiedFPRAsmOperand(MO, TRI, AArch64::VPR128RegClass, O)) + O << AArch64InstPrinter::getRegisterName(MO.getReg()); + break; + case MachineOperand::MO_Immediate: + O << '#' << MO.getImm(); + break; + case MachineOperand::MO_FPImmediate: + assert(MO.getFPImm()->isExactlyValue(0.0) && "Only FP 0.0 expected"); + O << "#0.0"; + break; + case MachineOperand::MO_BlockAddress: + case MachineOperand::MO_ConstantPoolIndex: + case MachineOperand::MO_GlobalAddress: + case MachineOperand::MO_ExternalSymbol: + return printSymbolicAddress(MO, false, "", O); + } + return false; + } + + // We have a real modifier to handle. + switch(ExtraCode[0]) { + default: + // See if this is a generic operand + return AsmPrinter::PrintAsmOperand(MI, OpNum, AsmVariant, ExtraCode, O); + case 'c': // Don't print "#" before an immediate operand. + if (!MI->getOperand(OpNum).isImm()) + return true; + O << MI->getOperand(OpNum).getImm(); + return false; + case 'w': + // Output 32-bit general register operand, constant zero as wzr, or stack + // pointer as wsp. Ignored when used with other operand types.
+ return printModifiedGPRAsmOperand(MI->getOperand(OpNum), TRI, + AArch64::GPR32RegClass, O); + case 'x': + // Output 64-bit general register operand, constant zero as xzr, or stack + // pointer as sp. Ignored when used with other operand types. + return printModifiedGPRAsmOperand(MI->getOperand(OpNum), TRI, + AArch64::GPR64RegClass, O); + case 'H': + // Output higher numbered of a 64-bit general register pair + case 'Q': + // Output least significant register of a 64-bit general register pair + case 'R': + // Output most significant register of a 64-bit general register pair + + // FIXME note: these three operand modifiers will require, to some extent, + // adding a paired GPR64 register class. Initial investigation suggests that + // assertions are hit unless it has a type and is made legal for that type + // in ISelLowering. After that step is made, the number of modifications + // needed explodes (operation legality, calling conventions, stores, reg + // copies ...). + llvm_unreachable("FIXME: Unimplemented register pairs"); + case 'b': + // Output 8-bit FP/SIMD scalar register operand, prefixed with b. + return printModifiedFPRAsmOperand(MI->getOperand(OpNum), TRI, + AArch64::FPR8RegClass, O); + case 'h': + // Output 16-bit FP/SIMD scalar register operand, prefixed with h. + return printModifiedFPRAsmOperand(MI->getOperand(OpNum), TRI, + AArch64::FPR16RegClass, O); + case 's': + // Output 32-bit FP/SIMD scalar register operand, prefixed with s. + return printModifiedFPRAsmOperand(MI->getOperand(OpNum), TRI, + AArch64::FPR32RegClass, O); + case 'd': + // Output 64-bit FP/SIMD scalar register operand, prefixed with d. + return printModifiedFPRAsmOperand(MI->getOperand(OpNum), TRI, + AArch64::FPR64RegClass, O); + case 'q': + // Output 128-bit FP/SIMD scalar register operand, prefixed with q. + return printModifiedFPRAsmOperand(MI->getOperand(OpNum), TRI, + AArch64::FPR128RegClass, O); + case 'A': + // Output symbolic address with appropriate relocation modifier (also + // suitable for ADRP). + return printSymbolicAddress(MI->getOperand(OpNum), false, "", O); + case 'L': + // Output bits 11:0 of symbolic address with appropriate :lo12: relocation + // modifier. + return printSymbolicAddress(MI->getOperand(OpNum), true, "lo12", O); + case 'G': + // Output bits 23:12 of symbolic address with appropriate :hi12: relocation + // modifier (currently only for TLS local exec). + return printSymbolicAddress(MI->getOperand(OpNum), true, "hi12", O); + } + + +} + +bool AArch64AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, + unsigned OpNum, + unsigned AsmVariant, + const char *ExtraCode, + raw_ostream &O) { + // Currently both the memory constraints (m and Q) behave the same and amount + // to the address as a single register. In future, we may allow "m" to provide + // both a base and an offset. + const MachineOperand &MO = MI->getOperand(OpNum); + assert(MO.isReg() && "unexpected inline assembly memory operand"); + O << '[' << AArch64InstPrinter::getRegisterName(MO.getReg()) << ']'; + return false; +} + +void AArch64AsmPrinter::PrintDebugValueComment(const MachineInstr *MI, + raw_ostream &OS) { + unsigned NOps = MI->getNumOperands(); + assert(NOps==4); + OS << '\t' << MAI->getCommentString() << "DEBUG_VALUE: "; + // cast away const; DIetc do not take const operands for some reason. + DIVariable V(const_cast<MDNode *>(MI->getOperand(NOps-1).getMetadata())); + OS << V.getName(); + OS << " <- "; + // Frame address. Currently handles register +- offset only. 
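A short illustration of how the modifiers handled above surface in user code may help; the following is a minimal sketch (not part of this patch; the function names are invented) using GCC/Clang extended inline assembly, and it only compiles when targeting AArch64. Note that the 'w' template modifier exercised via %w below is distinct from the "w" constraint, which requests an FP/SIMD register:

#include <cstdint>

// Hypothetical user code exercising the template modifiers handled above.
double scalar_fadd(double a, double b) {
  double r;
  // 'd' prints each FP/SIMD operand as its d-register view: "fadd d0, d1, d2".
  asm("fadd %d0, %d1, %d2" : "=w"(r) : "w"(a), "w"(b));
  return r;
}

uint32_t add32(uint32_t a, uint32_t b) {
  uint32_t r;
  // 'w' prints a GPR as w0..w30; a constant-zero operand would print as wzr.
  asm("add %w0, %w1, %w2" : "=r"(r) : "r"(a), "r"(b));
  return r;
}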
+ assert(MI->getOperand(0).isReg() && MI->getOperand(1).isImm()); + OS << '[' << AArch64InstPrinter::getRegisterName(MI->getOperand(0).getReg()); + OS << '+' << MI->getOperand(1).getImm(); + OS << ']'; + OS << "+" << MI->getOperand(NOps - 2).getImm(); +} + + +#include "AArch64GenMCPseudoLowering.inc" + +void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) { + // Do any auto-generated pseudo lowerings. + if (emitPseudoExpansionLowering(OutStreamer, MI)) + return; + + switch (MI->getOpcode()) { + case AArch64::DBG_VALUE: { + if (isVerbose() && OutStreamer.hasRawTextSupport()) { + SmallString<128> TmpStr; + raw_svector_ostream OS(TmpStr); + PrintDebugValueComment(MI, OS); + OutStreamer.EmitRawText(StringRef(OS.str())); + } + return; + } + } + + MCInst TmpInst; + LowerAArch64MachineInstrToMCInst(MI, TmpInst, *this); + OutStreamer.EmitInstruction(TmpInst); +} + +void AArch64AsmPrinter::EmitEndOfAsmFile(Module &M) { + if (Subtarget->isTargetELF()) { + const TargetLoweringObjectFileELF &TLOFELF = + static_cast<const TargetLoweringObjectFileELF &>(getObjFileLowering()); + + MachineModuleInfoELF &MMIELF = MMI->getObjFileInfo<MachineModuleInfoELF>(); + + // Output stubs for external and common global variables. + MachineModuleInfoELF::SymbolListTy Stubs = MMIELF.GetGVStubList(); + if (!Stubs.empty()) { + OutStreamer.SwitchSection(TLOFELF.getDataRelSection()); + const DataLayout *TD = TM.getDataLayout(); + + for (unsigned i = 0, e = Stubs.size(); i != e; ++i) { + OutStreamer.EmitLabel(Stubs[i].first); + OutStreamer.EmitSymbolValue(Stubs[i].second.getPointer(), + TD->getPointerSize(0), 0); + } + Stubs.clear(); + } + } +} + +bool AArch64AsmPrinter::runOnMachineFunction(MachineFunction &MF) { + return AsmPrinter::runOnMachineFunction(MF); +} + +// Force static initialization. +extern "C" void LLVMInitializeAArch64AsmPrinter() { + RegisterAsmPrinter<AArch64AsmPrinter> X(TheAArch64Target); +} + diff --git a/lib/Target/AArch64/AArch64AsmPrinter.h b/lib/Target/AArch64/AArch64AsmPrinter.h new file mode 100644 index 0000000..af0c9fe --- /dev/null +++ b/lib/Target/AArch64/AArch64AsmPrinter.h @@ -0,0 +1,80 @@ +// AArch64AsmPrinter.h - Print machine code to an AArch64 .s file -*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the AArch64 assembly printer class. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_AARCH64ASMPRINTER_H +#define LLVM_AARCH64ASMPRINTER_H + +#include "AArch64.h" +#include "AArch64TargetMachine.h" +#include "llvm/CodeGen/AsmPrinter.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/Support/Compiler.h" + +namespace llvm { + +class MCOperand; + +class LLVM_LIBRARY_VISIBILITY AArch64AsmPrinter : public AsmPrinter { + + /// Subtarget - Keep a pointer to the AArch64Subtarget around so that we can + /// make the right decision when printing asm code for different targets. + const AArch64Subtarget *Subtarget; + + // emitPseudoExpansionLowering - tblgen'erated. 
+ bool emitPseudoExpansionLowering(MCStreamer &OutStreamer, + const MachineInstr *MI); + + public: + explicit AArch64AsmPrinter(TargetMachine &TM, MCStreamer &Streamer) + : AsmPrinter(TM, Streamer) { + Subtarget = &TM.getSubtarget<AArch64Subtarget>(); + } + + bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const; + + MCOperand lowerSymbolOperand(const MachineOperand &MO, + const MCSymbol *Sym) const; + + void EmitInstruction(const MachineInstr *MI); + void EmitEndOfAsmFile(Module &M); + + bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, + unsigned AsmVariant, const char *ExtraCode, + raw_ostream &O); + bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNum, + unsigned AsmVariant, const char *ExtraCode, + raw_ostream &O); + + void PrintDebugValueComment(const MachineInstr *MI, raw_ostream &OS); + + /// printSymbolicAddress - Given some kind of reasonably bare symbolic + /// reference, print out the appropriate asm string to represent it. If + /// appropriate, a relocation-specifier will be produced, composed of a + /// general class derived from the MO parameter and an instruction-specific + /// suffix, provided in Suffix. E.g. ":got_lo12:" if a Suffix of "lo12" is + /// given. + bool printSymbolicAddress(const MachineOperand &MO, + bool PrintImmediatePrefix, + StringRef Suffix, raw_ostream &O); + + MachineLocation getDebugValueLocation(const MachineInstr *MI) const; + + virtual const char *getPassName() const { + return "AArch64 Assembly Printer"; + } + + virtual bool runOnMachineFunction(MachineFunction &MF); +}; +} // end namespace llvm + +#endif diff --git a/lib/Target/AArch64/AArch64BranchFixupPass.cpp b/lib/Target/AArch64/AArch64BranchFixupPass.cpp new file mode 100644 index 0000000..71233ba --- /dev/null +++ b/lib/Target/AArch64/AArch64BranchFixupPass.cpp @@ -0,0 +1,600 @@ +//===-- AArch64BranchFixupPass.cpp - AArch64 branch fixup -----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains a pass that fixes AArch64 branches which have ended up out +// of range for their immediate operands. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "aarch64-branch-fixup" +#include "AArch64.h" +#include "AArch64InstrInfo.h" +#include "Utils/AArch64BaseInfo.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/Format.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/ADT/Statistic.h" +using namespace llvm; + +STATISTIC(NumSplit, "Number of uncond branches inserted"); +STATISTIC(NumCBrFixed, "Number of cond branches fixed"); + +/// Return the worst case padding that could result from unknown offset bits. +/// This does not include alignment padding caused by known offset bits. +/// +/// @param LogAlign log2(alignment) +/// @param KnownBits Number of known low offset bits. +static inline unsigned UnknownPadding(unsigned LogAlign, unsigned KnownBits) { + if (KnownBits < LogAlign) + return (1u << LogAlign) - (1u << KnownBits); + return 0; +} + +namespace { + /// Due to limited PC-relative displacements, conditional branches to distant + /// blocks may need converting into an unconditional equivalent. 
For example: + /// tbz w1, #0, far_away + /// becomes + /// tbnz w1, #0, skip + /// b far_away + /// skip: + class AArch64BranchFixup : public MachineFunctionPass { + /// Information about the offset and size of a single basic block. + struct BasicBlockInfo { + /// Distance from the beginning of the function to the beginning of this + /// basic block. + /// + /// Offsets are computed assuming worst case padding before an aligned + /// block. This means that subtracting basic block offsets always gives a + /// conservative estimate of the real distance which may be smaller. + /// + /// Because worst case padding is used, the computed offset of an aligned + /// block may not actually be aligned. + unsigned Offset; + + /// Size of the basic block in bytes. If the block contains inline + /// assembly, this is a worst case estimate. + /// + /// The size does not include any alignment padding whether from the + /// beginning of the block, or from an aligned jump table at the end. + unsigned Size; + + /// The number of low bits in Offset that are known to be exact. The + /// remaining bits of Offset are an upper bound. + uint8_t KnownBits; + + /// When non-zero, the block contains instructions (inline asm) of unknown + /// size. The real size may be smaller than Size bytes by a multiple of 1 + /// << Unalign. + uint8_t Unalign; + + BasicBlockInfo() : Offset(0), Size(0), KnownBits(0), Unalign(0) {} + + /// Compute the number of known offset bits internally to this block. + /// This number should be used to predict worst case padding when + /// splitting the block. + unsigned internalKnownBits() const { + unsigned Bits = Unalign ? Unalign : KnownBits; + // If the block size isn't a multiple of the known bits, assume the + // worst case padding. + if (Size & ((1u << Bits) - 1)) + Bits = CountTrailingZeros_32(Size); + return Bits; + } + + /// Compute the offset immediately following this block. If LogAlign is + /// specified, return the offset the successor block will get if it has + /// this alignment. + unsigned postOffset(unsigned LogAlign = 0) const { + unsigned PO = Offset + Size; + if (!LogAlign) + return PO; + // Add alignment padding from the terminator. + return PO + UnknownPadding(LogAlign, internalKnownBits()); + } + + /// Compute the number of known low bits of postOffset. If this block + /// contains inline asm, the number of known bits drops to the + /// instruction alignment. An aligned terminator may increase the number + /// of known bits. + /// If LogAlign is given, also consider the alignment of the next block. + unsigned postKnownBits(unsigned LogAlign = 0) const { + return std::max(LogAlign, internalKnownBits()); + } + }; + + std::vector<BasicBlockInfo> BBInfo; + + /// One per immediate branch: the branch instruction itself, the number of + /// offset bits available to its displacement, and whether it is + /// conditional. + struct ImmBranch { + MachineInstr *MI; + unsigned OffsetBits : 31; + bool IsCond : 1; + ImmBranch(MachineInstr *mi, unsigned offsetbits, bool cond) + : MI(mi), OffsetBits(offsetbits), IsCond(cond) {} + }; + + /// Keep track of all the immediate branch instructions.
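To make the worst-case padding arithmetic concrete, here is a standalone sketch (not part of this patch; the lower-case names are invented) of the same computation UnknownPadding performs, with two worked values:

#include <cassert>

// With only knownBits low offset bits known to be zero, reaching an alignment
// of 1 << logAlign can cost up to (1 << logAlign) - (1 << knownBits) bytes.
static unsigned unknownPadding(unsigned logAlign, unsigned knownBits) {
  if (knownBits < logAlign)
    return (1u << logAlign) - (1u << knownBits);
  return 0;
}

int main() {
  // Offset known only to be 4-byte aligned (2 known bits), block needs 16-byte
  // alignment: the conservative offset must allow for up to 12 bytes.
  assert(unknownPadding(4, 2) == 12);
  // Already at least as aligned as required: no padding can be needed.
  assert(unknownPadding(2, 4) == 0);
  return 0;
}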
+ /// + std::vector<ImmBranch> ImmBranches; + + MachineFunction *MF; + const AArch64InstrInfo *TII; + public: + static char ID; + AArch64BranchFixup() : MachineFunctionPass(ID) {} + + virtual bool runOnMachineFunction(MachineFunction &MF); + + virtual const char *getPassName() const { + return "AArch64 branch fixup pass"; + } + + private: + void initializeFunctionInfo(); + MachineBasicBlock *splitBlockBeforeInstr(MachineInstr *MI); + void adjustBBOffsetsAfter(MachineBasicBlock *BB); + bool isBBInRange(MachineInstr *MI, MachineBasicBlock *BB, + unsigned OffsetBits); + bool fixupImmediateBr(ImmBranch &Br); + bool fixupConditionalBr(ImmBranch &Br); + + void computeBlockSize(MachineBasicBlock *MBB); + unsigned getOffsetOf(MachineInstr *MI) const; + void dumpBBs(); + void verify(); + }; + char AArch64BranchFixup::ID = 0; +} + +/// check BBOffsets +void AArch64BranchFixup::verify() { +#ifndef NDEBUG + for (MachineFunction::iterator MBBI = MF->begin(), E = MF->end(); + MBBI != E; ++MBBI) { + MachineBasicBlock *MBB = MBBI; + unsigned MBBId = MBB->getNumber(); + assert(!MBBId || BBInfo[MBBId - 1].postOffset() <= BBInfo[MBBId].Offset); + } +#endif +} + +/// print block size and offset information - debugging +void AArch64BranchFixup::dumpBBs() { + DEBUG({ + for (unsigned J = 0, E = BBInfo.size(); J !=E; ++J) { + const BasicBlockInfo &BBI = BBInfo[J]; + dbgs() << format("%08x BB#%u\t", BBI.Offset, J) + << " kb=" << unsigned(BBI.KnownBits) + << " ua=" << unsigned(BBI.Unalign) + << format(" size=%#x\n", BBInfo[J].Size); + } + }); +} + +/// Returns an instance of the branch fixup pass. +FunctionPass *llvm::createAArch64BranchFixupPass() { + return new AArch64BranchFixup(); +} + +bool AArch64BranchFixup::runOnMachineFunction(MachineFunction &mf) { + MF = &mf; + DEBUG(dbgs() << "***** AArch64BranchFixup ******"); + TII = (const AArch64InstrInfo*)MF->getTarget().getInstrInfo(); + + // This pass invalidates liveness information when it splits basic blocks. + MF->getRegInfo().invalidateLiveness(); + + // Renumber all of the machine basic blocks in the function, guaranteeing that + // the numbers agree with the position of the block in the function. + MF->RenumberBlocks(); + + // Do the initial scan of the function, building up information about the + // sizes of each block and location of each immediate branch. + initializeFunctionInfo(); + + // Iteratively fix up branches until there is no change. + unsigned NoBRIters = 0; + bool MadeChange = false; + while (true) { + DEBUG(dbgs() << "Beginning iteration #" << NoBRIters << '\n'); + bool BRChange = false; + for (unsigned i = 0, e = ImmBranches.size(); i != e; ++i) + BRChange |= fixupImmediateBr(ImmBranches[i]); + if (BRChange && ++NoBRIters > 30) + report_fatal_error("Branch Fix Up pass failed to converge!"); + DEBUG(dumpBBs()); + + if (!BRChange) + break; + MadeChange = true; + } + + // After a while, this might be made debug-only, but it is not expensive. + verify(); + + DEBUG(dbgs() << '\n'; dumpBBs()); + + BBInfo.clear(); + ImmBranches.clear(); + + return MadeChange; +} + +/// Return true if the specified basic block can fallthrough into the block +/// immediately after it. +static bool BBHasFallthrough(MachineBasicBlock *MBB) { + // Get the next machine basic block in the function. + MachineFunction::iterator MBBI = MBB; + // Can't fall off end of function. 
+ if (llvm::next(MBBI) == MBB->getParent()->end()) + return false; + + MachineBasicBlock *NextBB = llvm::next(MBBI); + for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(), + E = MBB->succ_end(); I != E; ++I) + if (*I == NextBB) + return true; + + return false; +} + +/// Do the initial scan of the function, building up information about the sizes +/// of each block, and each immediate branch. +void AArch64BranchFixup::initializeFunctionInfo() { + BBInfo.clear(); + BBInfo.resize(MF->getNumBlockIDs()); + + // First thing, compute the size of all basic blocks, and see if the function + // has any inline assembly in it. If so, we have to be conservative about + // alignment assumptions, as we don't know for sure the size of any + // instructions in the inline assembly. + for (MachineFunction::iterator I = MF->begin(), E = MF->end(); I != E; ++I) + computeBlockSize(I); + + // The known bits of the entry block offset are determined by the function + // alignment. + BBInfo.front().KnownBits = MF->getAlignment(); + + // Compute block offsets and known bits. + adjustBBOffsetsAfter(MF->begin()); + + // Now go back through the instructions and build up our data structures. + for (MachineFunction::iterator MBBI = MF->begin(), E = MF->end(); + MBBI != E; ++MBBI) { + MachineBasicBlock &MBB = *MBBI; + + for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); + I != E; ++I) { + if (I->isDebugValue()) + continue; + + int Opc = I->getOpcode(); + if (I->isBranch()) { + bool IsCond = false; + + // The offsets encoded in instructions here scale by the instruction + // size (4 bytes), effectively increasing their range by 2 bits. + unsigned Bits = 0; + switch (Opc) { + default: + continue; // Ignore other JT branches + case AArch64::TBZxii: + case AArch64::TBZwii: + case AArch64::TBNZxii: + case AArch64::TBNZwii: + IsCond = true; + Bits = 14 + 2; + break; + case AArch64::Bcc: + case AArch64::CBZx: + case AArch64::CBZw: + case AArch64::CBNZx: + case AArch64::CBNZw: + IsCond = true; + Bits = 19 + 2; + break; + case AArch64::Bimm: + Bits = 26 + 2; + break; + } + + // Record this immediate branch. + ImmBranches.push_back(ImmBranch(I, Bits, IsCond)); + } + } + } +} + +/// Compute the size and some alignment information for MBB. This function +/// updates BBInfo directly. +void AArch64BranchFixup::computeBlockSize(MachineBasicBlock *MBB) { + BasicBlockInfo &BBI = BBInfo[MBB->getNumber()]; + BBI.Size = 0; + BBI.Unalign = 0; + + for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E; + ++I) { + BBI.Size += TII->getInstSizeInBytes(*I); + // For inline asm, GetInstSizeInBytes returns a conservative estimate. + // The actual size may be smaller, but still a multiple of the instr size. + if (I->isInlineAsm()) + BBI.Unalign = 2; + } +} + +/// Return the current offset of the specified machine instruction from the +/// start of the function. This offset changes as stuff is moved around inside +/// the function. +unsigned AArch64BranchFixup::getOffsetOf(MachineInstr *MI) const { + MachineBasicBlock *MBB = MI->getParent(); + + // The offset is composed of two things: the sum of the sizes of all MBB's + // before this instruction's block, and the offset from the start of the block + // it is in. + unsigned Offset = BBInfo[MBB->getNumber()].Offset; + + // Sum instructions before MI in MBB. 
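The Bits values recorded above already include the two implied zero bits, since branch immediates count instructions rather than bytes. As a standalone sketch of the byte ranges this buys (not part of this patch; offsetFits is an invented mirror of the isIntN test used later by isBBInRange):

#include <cassert>
#include <cstdint>

// A PC-relative branch with n usable offset bits (immediate bits plus the two
// implied zero bits) reaches [-2^(n-1), 2^(n-1) - 1] bytes.
static bool offsetFits(int64_t offsetBytes, unsigned n) {
  return offsetBytes >= -(INT64_C(1) << (n - 1)) &&
         offsetBytes < (INT64_C(1) << (n - 1));
}

int main() {
  // TBZ/TBNZ: 14 + 2 bits => roughly +/- 32KiB.
  assert(offsetFits(32764, 16) && !offsetFits(32772, 16));
  // Bcc/CBZ/CBNZ: 19 + 2 bits => roughly +/- 1MiB.
  assert(offsetFits((1 << 20) - 4, 21) && !offsetFits(1 << 20, 21));
  // Bimm: 26 + 2 bits => roughly +/- 128MiB.
  assert(offsetFits((1 << 27) - 4, 28) && !offsetFits(1 << 27, 28));
  return 0;
}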
+ for (MachineBasicBlock::iterator I = MBB->begin(); &*I != MI; ++I) { + assert(I != MBB->end() && "Didn't find MI in its own basic block?"); + Offset += TII->getInstSizeInBytes(*I); + } + return Offset; +} + +/// Split the basic block containing MI into two blocks, which are joined by +/// an unconditional branch. Update data structures and renumber blocks to +/// account for this change and returns the newly created block. +MachineBasicBlock * +AArch64BranchFixup::splitBlockBeforeInstr(MachineInstr *MI) { + MachineBasicBlock *OrigBB = MI->getParent(); + + // Create a new MBB for the code after the OrigBB. + MachineBasicBlock *NewBB = + MF->CreateMachineBasicBlock(OrigBB->getBasicBlock()); + MachineFunction::iterator MBBI = OrigBB; ++MBBI; + MF->insert(MBBI, NewBB); + + // Splice the instructions starting with MI over to NewBB. + NewBB->splice(NewBB->end(), OrigBB, MI, OrigBB->end()); + + // Add an unconditional branch from OrigBB to NewBB. + // Note the new unconditional branch is not being recorded. + // There doesn't seem to be meaningful DebugInfo available; this doesn't + // correspond to anything in the source. + BuildMI(OrigBB, DebugLoc(), TII->get(AArch64::Bimm)).addMBB(NewBB); + ++NumSplit; + + // Update the CFG. All succs of OrigBB are now succs of NewBB. + NewBB->transferSuccessors(OrigBB); + + // OrigBB branches to NewBB. + OrigBB->addSuccessor(NewBB); + + // Update internal data structures to account for the newly inserted MBB. + MF->RenumberBlocks(NewBB); + + // Insert an entry into BBInfo to align it properly with the (newly + // renumbered) block numbers. + BBInfo.insert(BBInfo.begin() + NewBB->getNumber(), BasicBlockInfo()); + + // Figure out how large the OrigBB is. As the first half of the original + // block, it cannot contain a tablejump. The size includes + // the new jump we added. (It should be possible to do this without + // recounting everything, but it's very confusing, and this is rarely + // executed.) + computeBlockSize(OrigBB); + + // Figure out how large the NewMBB is. As the second half of the original + // block, it may contain a tablejump. + computeBlockSize(NewBB); + + // All BBOffsets following these blocks must be modified. + adjustBBOffsetsAfter(OrigBB); + + return NewBB; +} + +void AArch64BranchFixup::adjustBBOffsetsAfter(MachineBasicBlock *BB) { + unsigned BBNum = BB->getNumber(); + for(unsigned i = BBNum + 1, e = MF->getNumBlockIDs(); i < e; ++i) { + // Get the offset and known bits at the end of the layout predecessor. + // Include the alignment of the current block. + unsigned LogAlign = MF->getBlockNumbered(i)->getAlignment(); + unsigned Offset = BBInfo[i - 1].postOffset(LogAlign); + unsigned KnownBits = BBInfo[i - 1].postKnownBits(LogAlign); + + // This is where block i begins. Stop if the offset is already correct, + // and we have updated 2 blocks. This is the maximum number of blocks + // changed before calling this function. + if (i > BBNum + 2 && + BBInfo[i].Offset == Offset && + BBInfo[i].KnownBits == KnownBits) + break; + + BBInfo[i].Offset = Offset; + BBInfo[i].KnownBits = KnownBits; + } +} + +/// Returns true if the distance between specific MI and specific BB can fit in +/// MI's displacement field. 
+bool AArch64BranchFixup::isBBInRange(MachineInstr *MI, + MachineBasicBlock *DestBB, + unsigned OffsetBits) { + int64_t BrOffset = getOffsetOf(MI); + int64_t DestOffset = BBInfo[DestBB->getNumber()].Offset; + + DEBUG(dbgs() << "Branch of destination BB#" << DestBB->getNumber() + << " from BB#" << MI->getParent()->getNumber() + << " bits available=" << OffsetBits + << " from " << getOffsetOf(MI) << " to " << DestOffset + << " offset " << int(DestOffset-BrOffset) << "\t" << *MI); + + return isIntN(OffsetBits, DestOffset - BrOffset); +} + +/// Fix up an immediate branch whose destination is too far away to fit in its +/// displacement field. +bool AArch64BranchFixup::fixupImmediateBr(ImmBranch &Br) { + MachineInstr *MI = Br.MI; + MachineBasicBlock *DestBB = 0; + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + if (MI->getOperand(i).isMBB()) { + DestBB = MI->getOperand(i).getMBB(); + break; + } + } + assert(DestBB && "Branch with no destination BB?"); + + // Check to see if the DestBB is already in-range. + if (isBBInRange(MI, DestBB, Br.OffsetBits)) + return false; + + assert(Br.IsCond && "Only conditional branches should need fixup"); + return fixupConditionalBr(Br); +} + +/// Fix up a conditional branch whose destination is too far away to fit in its +/// displacement field. It is converted to an inverse conditional branch + an +/// unconditional branch to the destination. +bool +AArch64BranchFixup::fixupConditionalBr(ImmBranch &Br) { + MachineInstr *MI = Br.MI; + MachineBasicBlock *MBB = MI->getParent(); + unsigned CondBrMBBOperand = 0; + + // The general idea is to add an unconditional branch to the destination and + // invert the conditional branch to jump over it. Complications occur around + // fallthrough and unreachable ends to the block. + // b.lt L1 + // => + // b.ge L2 + // b L1 + // L2: + + // First we invert the conditional branch, by creating a replacement if + // necessary. This if statement contains all the special handling of different + // branch types. + if (MI->getOpcode() == AArch64::Bcc) { + // The basic block is operand number 1 for Bcc + CondBrMBBOperand = 1; + + A64CC::CondCodes CC = (A64CC::CondCodes)MI->getOperand(0).getImm(); + CC = A64InvertCondCode(CC); + MI->getOperand(0).setImm(CC); + } else { + MachineInstrBuilder InvertedMI; + int InvertedOpcode; + switch (MI->getOpcode()) { + default: llvm_unreachable("Unknown branch type"); + case AArch64::TBZxii: InvertedOpcode = AArch64::TBNZxii; break; + case AArch64::TBZwii: InvertedOpcode = AArch64::TBNZwii; break; + case AArch64::TBNZxii: InvertedOpcode = AArch64::TBZxii; break; + case AArch64::TBNZwii: InvertedOpcode = AArch64::TBZwii; break; + case AArch64::CBZx: InvertedOpcode = AArch64::CBNZx; break; + case AArch64::CBZw: InvertedOpcode = AArch64::CBNZw; break; + case AArch64::CBNZx: InvertedOpcode = AArch64::CBZx; break; + case AArch64::CBNZw: InvertedOpcode = AArch64::CBZw; break; + } + + InvertedMI = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(InvertedOpcode)); + for (unsigned i = 0, e= MI->getNumOperands(); i != e; ++i) { + InvertedMI.addOperand(MI->getOperand(i)); + if (MI->getOperand(i).isMBB()) + CondBrMBBOperand = i; + } + + MI->eraseFromParent(); + MI = Br.MI = InvertedMI; + } + + // If the branch is at the end of its MBB and that has a fall-through block, + // direct the updated conditional branch to the fall-through + // block. Otherwise, split the MBB before the next instruction. 
+ MachineInstr *BMI = &MBB->back(); + bool NeedSplit = (BMI != MI) || !BBHasFallthrough(MBB); + + ++NumCBrFixed; + if (BMI != MI) { + if (llvm::next(MachineBasicBlock::iterator(MI)) == prior(MBB->end()) && + BMI->getOpcode() == AArch64::Bimm) { + // Last MI in the BB is an unconditional branch. We can swap destinations: + // b.eq L1 (temporarily b.ne L1 after first change) + // b L2 + // => + // b.ne L2 + // b L1 + MachineBasicBlock *NewDest = BMI->getOperand(0).getMBB(); + if (isBBInRange(MI, NewDest, Br.OffsetBits)) { + DEBUG(dbgs() << " Invert Bcc condition and swap its destination with " + << *BMI); + MachineBasicBlock *DestBB = MI->getOperand(CondBrMBBOperand).getMBB(); + BMI->getOperand(0).setMBB(DestBB); + MI->getOperand(CondBrMBBOperand).setMBB(NewDest); + return true; + } + } + } + + if (NeedSplit) { + MachineBasicBlock::iterator MBBI = MI; ++MBBI; + splitBlockBeforeInstr(MBBI); + // No need for the branch to the next block. We're adding an unconditional + // branch to the destination. + int delta = TII->getInstSizeInBytes(MBB->back()); + BBInfo[MBB->getNumber()].Size -= delta; + MBB->back().eraseFromParent(); + // BBInfo[SplitBB].Offset is wrong temporarily, fixed below + } + + // After splitting and removing the unconditional branch from the original BB, + // the structure is now: + // oldbb: + // [things] + // b.invertedCC L1 + // splitbb/fallthroughbb: + // [old b L2/real continuation] + // + // We now have to change the conditional branch to point to splitbb and add an + // unconditional branch after it to L1, giving the final structure: + // oldbb: + // [things] + // b.invertedCC splitbb + // b L1 + // splitbb/fallthroughbb: + // [old b L2/real continuation] + MachineBasicBlock *NextBB = llvm::next(MachineFunction::iterator(MBB)); + + DEBUG(dbgs() << " Insert B to BB#" + << MI->getOperand(CondBrMBBOperand).getMBB()->getNumber() + << " also invert condition and change dest. to BB#" + << NextBB->getNumber() << "\n"); + + // Insert a new unconditional branch and fixup the destination of the + // conditional one. Also update the ImmBranch as well as adding a new entry + // for the new branch. + BuildMI(MBB, DebugLoc(), TII->get(AArch64::Bimm)) + .addMBB(MI->getOperand(CondBrMBBOperand).getMBB()); + MI->getOperand(CondBrMBBOperand).setMBB(NextBB); + + BBInfo[MBB->getNumber()].Size += TII->getInstSizeInBytes(MBB->back()); + + // 26 bits written down in Bimm, specifying a multiple of 4. + unsigned OffsetBits = 26 + 2; + ImmBranches.push_back(ImmBranch(&MBB->back(), OffsetBits, false)); + + adjustBBOffsetsAfter(MBB); + return true; +} diff --git a/lib/Target/AArch64/AArch64CallingConv.td b/lib/Target/AArch64/AArch64CallingConv.td new file mode 100644 index 0000000..b880d83 --- /dev/null +++ b/lib/Target/AArch64/AArch64CallingConv.td @@ -0,0 +1,196 @@ +//==- AArch64CallingConv.td - Calling Conventions for AArch64 -*- tblgen -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// This describes the calling conventions for the AArch64 architecture. +//===----------------------------------------------------------------------===// + + +// The AArch64 Procedure Call Standard is unfortunately specified at a slightly +// higher level of abstraction than LLVM's target interface presents. In +// particular, it refers (like other ABIs, in fact) directly to +// structs.
However, generic LLVM code takes the liberty of lowering structure +// arguments to the component fields before we see them. +// +// As a result, the obvious direct map from LLVM IR to PCS concepts can't be +// implemented, so the goals of this calling convention are, in decreasing +// priority order: +// 1. Expose *some* way to express the concepts required to implement the +// generic PCS from a front-end. +// 2. Provide a sane ABI for pure LLVM. +// 3. Follow the generic PCS as closely as is naturally possible. +// +// The suggested front-end implementation of PCS features is: +// * Integer, float and vector arguments of all sizes which end up in +// registers are passed and returned via the natural LLVM type. +// * Structure arguments with size <= 16 bytes are passed and returned in +// registers as similar integer or composite types. For example: +// [1 x i64], [2 x i64] or [1 x i128] (if alignment 16 needed). +// * HFAs in registers follow rules similar to small structs: appropriate +// composite types. +// * Structure arguments with size > 16 bytes are passed via a pointer, +// handled completely by the front-end. +// * Structure return values > 16 bytes via an sret pointer argument. +// * Other stack-based arguments (not large structs) are passed using byval +// pointers. Padding arguments are added beforehand to guarantee a large +// struct doesn't later use integer registers. +// +// N.b. this means that it is the front-end's responsibility (if it cares about +// PCS compliance) to check whether enough registers are available for an +// argument when deciding how to pass it. + +class CCIfAlign<int Align, CCAction A>: + CCIf<"ArgFlags.getOrigAlign() == " # Align, A>; + +def CC_A64_APCS : CallingConv<[ + // SRet is an LLVM-specific concept, so it takes precedence over general ABI + // concerns. However, this rule will be used by C/C++ frontends to implement + // structure return. + CCIfSRet<CCAssignToReg<[X8]>>, + + // Put ByVal arguments directly on the stack. Minimum size and alignment of a + // slot is 64-bit. + CCIfByVal<CCPassByVal<8, 8>>, + + // Canonicalise the various types that live in different floating-point + // registers. This makes sense because the PCS does not distinguish Short + // Vectors and Floating-point types. + CCIfType<[v2i8], CCBitConvertToType<f16>>, + CCIfType<[v4i8, v2i16], CCBitConvertToType<f32>>, + CCIfType<[v8i8, v4i16, v2i32, v2f32], CCBitConvertToType<f64>>, + CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + CCBitConvertToType<f128>>, + + // PCS: "C.1: If the argument is a Half-, Single-, Double- or Quad- precision + // Floating-point or Short Vector Type and the NSRN is less than 8, then the + // argument is allocated to the least significant bits of register + // v[NSRN]. The NSRN is incremented by one. The argument has now been + // allocated." + CCIfType<[f16], CCAssignToReg<[B0, B1, B2, B3, B4, B5, B6, B7]>>, + CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7]>>, + CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>, + CCIfType<[f128], CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, + + // PCS: "C.2: If the argument is an HFA and there are sufficient unallocated + // SIMD and Floating-point registers (NSRN - number of elements < 8), then the + // argument is allocated to SIMD and Floating-point registers (with one + // register per element of the HFA). The NSRN is incremented by the number of + // registers used. The argument has now been allocated." + // + // N.b. 
As above, this rule is the responsibility of the front-end. + + // "C.3: If the argument is an HFA then the NSRN is set to 8 and the size of + // the argument is rounded up to the nearest multiple of 8 bytes." + // + // "C.4: If the argument is an HFA, a Quad-precision Floating-point or Short + // Vector Type then the NSAA is rounded up to the larger of 8 or the Natural + // Alignment of the Argument's type." + // + // It is expected that these will be satisfied by adding dummy arguments to + // the prototype. + + // PCS: "C.5: If the argument is a Half- or Single- precision Floating-point + // type then the size of the argument is set to 8 bytes. The effect is as if + // the argument had been copied to the least significant bits of a 64-bit + // register and the remaining bits filled with unspecified values." + CCIfType<[f16, f32], CCPromoteToType<f64>>, + + // PCS: "C.6: If the argument is an HFA, a Half-, Single-, Double- or Quad- + // precision Floating-point or Short Vector Type, then the argument is copied + // to memory at the adjusted NSAA. The NSAA is incremented by the size of the + // argument. The argument has now been allocated." + CCIfType<[f64], CCAssignToStack<8, 8>>, + CCIfType<[f128], CCAssignToStack<16, 16>>, + + // PCS: "C.7: If the argument is an Integral Type, the size of the argument is + // less than or equal to 8 bytes and the NGRN is less than 8, the argument is + // copied to the least significant bits of x[NGRN]. The NGRN is incremented by + // one. The argument has now been allocated." + + // First we implement C.8 and C.9 (128-bit types get even registers). i128 is + // represented as two i64s, the first one being split. If we delayed this + // operation C.8 would never be reached. + CCIfType<[i64], + CCIfSplit<CCAssignToRegWithShadow<[X0, X2, X4, X6], [X0, X1, X3, X5]>>>, + + // Note: the promotion also implements C.14. + CCIfType<[i8, i16, i32], CCPromoteToType<i64>>, + + // And now the real implementation of C.7 + CCIfType<[i64], CCAssignToReg<[X0, X1, X2, X3, X4, X5, X6, X7]>>, + + // PCS: "C.8: If the argument has an alignment of 16 then the NGRN is rounded + // up to the next even number." + // + // "C.9: If the argument is an Integral Type, the size of the argument is + // equal to 16 and the NGRN is less than 7, the argument is copied to x[NGRN] + // and x[NGRN+1], x[NGRN] shall contain the lower addressed double-word of the + // memory representation of the argument. The NGRN is incremented by two. The + // argument has now been allocated." + // + // Subtlety here: what if alignment is 16 but it is not an integral type? All + // floating-point types have been allocated already, which leaves composite + // types: this is why a front-end may need to produce i128 for a struct <= 16 + // bytes. + + // PCS: "C.10 If the argument is a Composite Type and the size in double-words + // of the argument is not more than 8 minus NGRN, then the argument is copied + // into consecutive general-purpose registers, starting at x[NGRN]. The + // argument is passed as though it had been loaded into the registers from a + // double-word aligned address with an appropriate sequence of LDR + // instructions loading consecutive registers from memory (the contents of any + // unused parts of the registers are unspecified by this standard). The NGRN + // is incremented by the number of registers used. The argument has now been + // allocated." + // + // Another one that's the responsibility of the front-end (sigh). + + // PCS: "C.11: The NGRN is set to 8." 
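Before the register-exhaustion rule below, it may help to see what the front-end responsibility for rule C.10 amounts to at the byte level. A minimal sketch (not part of this patch; the struct and function names are invented) of the "[2 x i64]" coercion suggested above:

#include <cstdint>
#include <cstring>

struct Pair { int64_t a; int32_t b; };  // 12 bytes of data, sizeof == 16

// Rule C.10 in byte terms: the argument travels as if its memory image had
// been loaded into two consecutive x-registers; unused bytes are unspecified
// by the PCS (zeroed here for determinism).
static void coerceToTwoXRegs(const Pair &p, uint64_t regs[2]) {
  regs[0] = regs[1] = 0;
  std::memcpy(regs, &p, sizeof(Pair));
}

int main() {
  Pair p = { 0x1122334455667788, 42 };
  uint64_t regs[2];
  coerceToTwoXRegs(p, regs);
  // On little-endian AArch64, x[NGRN] holds the lower-addressed double-word.
  return regs[0] == UINT64_C(0x1122334455667788) ? 0 : 1;
}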
+ CCCustom<"CC_AArch64NoMoreRegs">, + + // PCS: "C.12: The NSAA is rounded up to the larger of 8 or the Natural + // Alignment of the argument's type." + // + // PCS: "C.13: If the argument is a composite type then the argument is copied + // to memory at the adjusted NSAA. The NSAA is incremented by the size of the + // argument. The argument has now been allocated." + // + // Note that the effect of this corresponds to a memcpy rather than register + // stores so that the struct ends up correctly addressable at the adjusted + // NSAA. + + // PCS: "C.14: If the size of the argument is less than 8 bytes then the size + // of the argument is set to 8 bytes. The effect is as if the argument was + // copied to the least significant bits of a 64-bit register and the remaining + // bits filled with unspecified values." + // + // Integer types were widened above. Floating-point and composite types have + // already been allocated completely. Nothing to do. + + // PCS: "C.15: The argument is copied to memory at the adjusted NSAA. The NSAA + // is incremented by the size of the argument. The argument has now been + // allocated." + CCIfType<[i64], CCIfSplit<CCAssignToStack<8, 16>>>, + CCIfType<[i64], CCAssignToStack<8, 8>> + +]>; + +// According to the PCS, X19-X30 are callee-saved; however, only the low 64 bits +// of vector registers (8-15) are callee-saved. The order here is picked up +// by PrologEpilogInserter.cpp to allocate stack slots, starting from top of +// stack upon entry. This gives the customary layout of x30 at [sp-8], x29 at +// [sp-16], ... +def CSR_PCS : CalleeSavedRegs<(add (sequence "X%u", 30, 19), + (sequence "D%u", 15, 8))>; + + +// TLS descriptor calls are extremely restricted in their changes, to allow +// optimisations in the (hopefully) more common fast path where no real action +// is needed. They actually have to preserve all registers, except for the +// unavoidable X30 and the return register X0. +def TLSDesc : CalleeSavedRegs<(add (sequence "X%u", 29, 1), + (sequence "Q%u", 31, 0))>; diff --git a/lib/Target/AArch64/AArch64FrameLowering.cpp b/lib/Target/AArch64/AArch64FrameLowering.cpp new file mode 100644 index 0000000..cca6d12 --- /dev/null +++ b/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -0,0 +1,686 @@ +//===- AArch64FrameLowering.cpp - AArch64 Frame Information ---------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the AArch64 implementation of the TargetFrameLowering +// class. +// +//===----------------------------------------------------------------------===// + +#include "AArch64.h" +#include "AArch64FrameLowering.h" +#include "AArch64MachineFunctionInfo.h" +#include "AArch64InstrInfo.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineMemOperand.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RegisterScavenging.h" +#include "llvm/IR/Function.h" +#include "llvm/MC/MachineLocation.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" + +using namespace llvm; + +void AArch64FrameLowering::splitSPAdjustments(uint64_t Total, + uint64_t &Initial, + uint64_t &Residual) const { + // 0x1f0 here is a pessimistic (i.e.
realistic) boundary: x-register LDP + // instructions have a 7-bit signed immediate scaled by 8, giving a reach of + // 0x1f8, but stack adjustment should always be a multiple of 16. + if (Total <= 0x1f0) { + Initial = Total; + Residual = 0; + } else { + Initial = 0x1f0; + Residual = Total - Initial; + } +} + +void AArch64FrameLowering::emitPrologue(MachineFunction &MF) const { + AArch64MachineFunctionInfo *FuncInfo = + MF.getInfo<AArch64MachineFunctionInfo>(); + MachineBasicBlock &MBB = MF.front(); + MachineBasicBlock::iterator MBBI = MBB.begin(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); + DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc(); + + MachineModuleInfo &MMI = MF.getMMI(); + std::vector<MachineMove> &Moves = MMI.getFrameMoves(); + bool NeedsFrameMoves = MMI.hasDebugInfo() + || MF.getFunction()->needsUnwindTableEntry(); + + uint64_t NumInitialBytes, NumResidualBytes; + + // Currently we expect the stack to be laid out by + // sub sp, sp, #initial + // stp x29, x30, [sp, #offset] + // ... + // str xxx, [sp, #offset] + // sub sp, sp, #rest (possibly via extra instructions). + if (MFI->getCalleeSavedInfo().size()) { + // If there are callee-saved registers, we want to store them efficiently as + // a block, and virtual base assignment happens too early to do it for us so + // we adjust the stack in two phases: first just for callee-saved fiddling, + // then to allocate the rest of the frame. + splitSPAdjustments(MFI->getStackSize(), NumInitialBytes, NumResidualBytes); + } else { + // If there aren't any callee-saved registers, two-phase adjustment is + // inefficient. It's more efficient to adjust with NumInitialBytes too + // because when we're in a "callee pops argument space" situation, that pop + // must be tacked onto Initial for correctness. + NumInitialBytes = MFI->getStackSize(); + NumResidualBytes = 0; + } + + // Tell everyone else how much adjustment we're expecting them to use. In + // particular if an adjustment is required for a tail call the epilogue could + // have a different view of things. + FuncInfo->setInitialStackAdjust(NumInitialBytes); + + emitSPUpdate(MBB, MBBI, DL, TII, AArch64::X16, -NumInitialBytes, + MachineInstr::FrameSetup); + + if (NeedsFrameMoves && NumInitialBytes) { + // We emit this update even if the CFA is set from a frame pointer later so + // that the CFA is valid in the interim. + MCSymbol *SPLabel = MMI.getContext().CreateTempSymbol(); + BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::PROLOG_LABEL)) + .addSym(SPLabel); + + MachineLocation Dst(MachineLocation::VirtualFP); + MachineLocation Src(AArch64::XSP, NumInitialBytes); + Moves.push_back(MachineMove(SPLabel, Dst, Src)); + } + + // Otherwise we need to set the frame pointer and/or add a second stack + // adjustment. + + bool FPNeedsSetting = hasFP(MF); + for (; MBBI != MBB.end(); ++MBBI) { + // Note that this search makes strong assumptions about the operation used + // to store the frame-pointer: it must be "STP x29, x30, ...". This could + // change in future, but until then there's no point in implementing + // untestable more generic cases. 
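For reference, the two-phase split that splitSPAdjustments performs behaves like this standalone sketch (not part of this patch; the names are invented):

#include <cassert>
#include <cstdint>

// Keep the first SP adjustment within LDP/STP reach: a 7-bit signed immediate
// scaled by 8 reaches 0x1f8, capped to 0x1f0 as a multiple of 16.
static void splitSP(uint64_t total, uint64_t &initial, uint64_t &residual) {
  const uint64_t maxInitial = 0x1f0;
  if (total <= maxInitial) {
    initial = total;
    residual = 0;
  } else {
    initial = maxInitial;
    residual = total - maxInitial;
  }
}

int main() {
  uint64_t ini, res;
  splitSP(0x100, ini, res);
  assert(ini == 0x100 && res == 0);     // small frame: one adjustment suffices
  splitSP(0x210, ini, res);
  assert(ini == 0x1f0 && res == 0x20);  // large frame: two-phase adjustment
  return 0;
}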
+ if (FPNeedsSetting && MBBI->getOpcode() == AArch64::LSPair64_STR + && MBBI->getOperand(0).getReg() == AArch64::X29) { + int64_t X29FrameIdx = MBBI->getOperand(2).getIndex(); + FuncInfo->setFramePointerOffset(MFI->getObjectOffset(X29FrameIdx)); + + ++MBBI; + emitRegUpdate(MBB, MBBI, DL, TII, AArch64::X29, AArch64::XSP, + AArch64::X29, + NumInitialBytes + MFI->getObjectOffset(X29FrameIdx), + MachineInstr::FrameSetup); + + // The offset adjustment used when emitting debugging locations relative + // to whatever frame base is set. AArch64 uses the default frame base (FP + // or SP) and this adjusts the calculations to be correct. + MFI->setOffsetAdjustment(- MFI->getObjectOffset(X29FrameIdx) + - MFI->getStackSize()); + + if (NeedsFrameMoves) { + MCSymbol *FPLabel = MMI.getContext().CreateTempSymbol(); + BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::PROLOG_LABEL)) + .addSym(FPLabel); + MachineLocation Dst(MachineLocation::VirtualFP); + MachineLocation Src(AArch64::X29, -MFI->getObjectOffset(X29FrameIdx)); + Moves.push_back(MachineMove(FPLabel, Dst, Src)); + } + + FPNeedsSetting = false; + } + + if (!MBBI->getFlag(MachineInstr::FrameSetup)) + break; + } + + assert(!FPNeedsSetting && "Frame pointer couldn't be set"); + + emitSPUpdate(MBB, MBBI, DL, TII, AArch64::X16, -NumResidualBytes, + MachineInstr::FrameSetup); + + // Now we emit the rest of the frame setup information, if necessary: we've + // already noted the FP and initial SP moves so we're left with the prologue's + // final SP update and callee-saved register locations. + if (!NeedsFrameMoves) + return; + + // Reuse the label if appropriate, so create it in this outer scope. + MCSymbol *CSLabel = 0; + + // The rest of the stack adjustment + if (!hasFP(MF) && NumResidualBytes) { + CSLabel = MMI.getContext().CreateTempSymbol(); + BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::PROLOG_LABEL)) + .addSym(CSLabel); + + MachineLocation Dst(MachineLocation::VirtualFP); + MachineLocation Src(AArch64::XSP, NumResidualBytes + NumInitialBytes); + Moves.push_back(MachineMove(CSLabel, Dst, Src)); + } + + // And any callee-saved registers (it's fine to leave them to the end here, + // because the old values are still valid at this point). + const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo(); + if (CSI.size()) { + if (!CSLabel) { + CSLabel = MMI.getContext().CreateTempSymbol(); + BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::PROLOG_LABEL)) + .addSym(CSLabel); + } + + for (std::vector<CalleeSavedInfo>::const_iterator I = CSI.begin(), + E = CSI.end(); I != E; ++I) { + MachineLocation Dst(MachineLocation::VirtualFP, + MFI->getObjectOffset(I->getFrameIdx())); + MachineLocation Src(I->getReg()); + Moves.push_back(MachineMove(CSLabel, Dst, Src)); + } + } +} + +void +AArch64FrameLowering::emitEpilogue(MachineFunction &MF, + MachineBasicBlock &MBB) const { + AArch64MachineFunctionInfo *FuncInfo = + MF.getInfo<AArch64MachineFunctionInfo>(); + + MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); + DebugLoc DL = MBBI->getDebugLoc(); + const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); + MachineFrameInfo &MFI = *MF.getFrameInfo(); + unsigned RetOpcode = MBBI->getOpcode(); + + // Initial and residual are named for consistency with the prologue. Note that + // in the epilogue, the residual adjustment is executed first.
+ uint64_t NumInitialBytes = FuncInfo->getInitialStackAdjust(); + uint64_t NumResidualBytes = MFI.getStackSize() - NumInitialBytes; + uint64_t ArgumentPopSize = 0; + if (RetOpcode == AArch64::TC_RETURNdi || + RetOpcode == AArch64::TC_RETURNxi) { + MachineOperand &JumpTarget = MBBI->getOperand(0); + MachineOperand &StackAdjust = MBBI->getOperand(1); + + MachineInstrBuilder MIB; + if (RetOpcode == AArch64::TC_RETURNdi) { + MIB = BuildMI(MBB, MBBI, DL, TII.get(AArch64::TAIL_Bimm)); + if (JumpTarget.isGlobal()) { + MIB.addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset(), + JumpTarget.getTargetFlags()); + } else { + assert(JumpTarget.isSymbol() && "unexpected tail call destination"); + MIB.addExternalSymbol(JumpTarget.getSymbolName(), + JumpTarget.getTargetFlags()); + } + } else { + assert(RetOpcode == AArch64::TC_RETURNxi && JumpTarget.isReg() + && "Unexpected tail call"); + + MIB = BuildMI(MBB, MBBI, DL, TII.get(AArch64::TAIL_BRx)); + MIB.addReg(JumpTarget.getReg(), RegState::Kill); + } + + // Add the extra operands onto the new tail call instruction even though + // they're not used directly (so that liveness is tracked properly etc). + for (unsigned i = 2, e = MBBI->getNumOperands(); i != e; ++i) + MIB->addOperand(MBBI->getOperand(i)); + + + // Delete the pseudo instruction TC_RETURN. + MachineInstr *NewMI = prior(MBBI); + MBB.erase(MBBI); + MBBI = NewMI; + + // For a tail-call in a callee-pops-arguments environment, some or all of + // the stack may actually be in use for the call's arguments, this is + // calculated during LowerCall and consumed here... + ArgumentPopSize = StackAdjust.getImm(); + } else { + // ... otherwise the amount to pop is *all* of the argument space, + // conveniently stored in the MachineFunctionInfo by + // LowerFormalArguments. This will, of course, be zero for the C calling + // convention. + ArgumentPopSize = FuncInfo->getArgumentStackToRestore(); + } + + assert(NumInitialBytes % 16 == 0 && NumResidualBytes % 16 == 0 + && "refusing to adjust stack by misaligned amt"); + + // We may need to address callee-saved registers differently, so find out the + // bound on the frame indices. + const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo(); + int MinCSFI = 0; + int MaxCSFI = -1; + + if (CSI.size()) { + MinCSFI = CSI[0].getFrameIdx(); + MaxCSFI = CSI[CSI.size() - 1].getFrameIdx(); + } + + // The "residual" stack update comes first from this direction and guarantees + // that SP is NumInitialBytes below its value on function entry, either by a + // direct update or restoring it from the frame pointer. + if (NumInitialBytes + ArgumentPopSize != 0) { + emitSPUpdate(MBB, MBBI, DL, TII, AArch64::X16, + NumInitialBytes + ArgumentPopSize); + --MBBI; + } + + + // MBBI now points to the instruction just past the last callee-saved + // restoration (either RET/B if NumInitialBytes == 0, or the "ADD sp, sp" + // otherwise). + + // Now we need to find out where to put the bulk of the stack adjustment + MachineBasicBlock::iterator FirstEpilogue = MBBI; + while (MBBI != MBB.begin()) { + --MBBI; + + unsigned FrameOp; + for (FrameOp = 0; FrameOp < MBBI->getNumOperands(); ++FrameOp) { + if (MBBI->getOperand(FrameOp).isFI()) + break; + } + + // If this instruction doesn't have a frame index we've reached the end of + // the callee-save restoration. + if (FrameOp == MBBI->getNumOperands()) + break; + + // Likewise if it *is* a local reference, but not to a callee-saved object. 
+ int FrameIdx = MBBI->getOperand(FrameOp).getIndex(); + if (FrameIdx < MinCSFI || FrameIdx > MaxCSFI) + break; + + FirstEpilogue = MBBI; + } + + if (MF.getFrameInfo()->hasVarSizedObjects()) { + int64_t StaticFrameBase; + StaticFrameBase = -(NumInitialBytes + FuncInfo->getFramePointerOffset()); + emitRegUpdate(MBB, FirstEpilogue, DL, TII, + AArch64::XSP, AArch64::X29, AArch64::NoRegister, + StaticFrameBase); + } else { + emitSPUpdate(MBB, FirstEpilogue, DL,TII, AArch64::X16, NumResidualBytes); + } +} + +int64_t +AArch64FrameLowering::resolveFrameIndexReference(MachineFunction &MF, + int FrameIndex, + unsigned &FrameReg, + int SPAdj, + bool IsCalleeSaveOp) const { + AArch64MachineFunctionInfo *FuncInfo = + MF.getInfo<AArch64MachineFunctionInfo>(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + + int64_t TopOfFrameOffset = MFI->getObjectOffset(FrameIndex); + + assert(!(IsCalleeSaveOp && FuncInfo->getInitialStackAdjust() == 0) + && "callee-saved register in unexpected place"); + + // If the frame for this function is particularly large, we adjust the stack + // in two phases which means the callee-save related operations see a + // different (intermediate) stack size. + int64_t FrameRegPos; + if (IsCalleeSaveOp) { + FrameReg = AArch64::XSP; + FrameRegPos = -static_cast<int64_t>(FuncInfo->getInitialStackAdjust()); + } else if (useFPForAddressing(MF)) { + // Have to use the frame pointer since we have no idea where SP is. + FrameReg = AArch64::X29; + FrameRegPos = FuncInfo->getFramePointerOffset(); + } else { + FrameReg = AArch64::XSP; + FrameRegPos = -static_cast<int64_t>(MFI->getStackSize()) + SPAdj; + } + + return TopOfFrameOffset - FrameRegPos; +} + +/// Estimate and return the size of the frame. +static unsigned estimateStackSize(MachineFunction &MF) { + // FIXME: Make generic? Really consider after upstreaming. This code is now + // shared between PEI, ARM *and* here. + const MachineFrameInfo *MFI = MF.getFrameInfo(); + const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); + const TargetRegisterInfo *RegInfo = MF.getTarget().getRegisterInfo(); + unsigned MaxAlign = MFI->getMaxAlignment(); + int Offset = 0; + + // This code is very, very similar to PEI::calculateFrameObjectOffsets(). + // It really should be refactored to share code. Until then, changes + // should keep in mind that there's tight coupling between the two. + + for (int i = MFI->getObjectIndexBegin(); i != 0; ++i) { + int FixedOff = -MFI->getObjectOffset(i); + if (FixedOff > Offset) Offset = FixedOff; + } + for (unsigned i = 0, e = MFI->getObjectIndexEnd(); i != e; ++i) { + if (MFI->isDeadObjectIndex(i)) + continue; + Offset += MFI->getObjectSize(i); + unsigned Align = MFI->getObjectAlignment(i); + // Adjust to alignment boundary + Offset = (Offset+Align-1)/Align*Align; + + MaxAlign = std::max(Align, MaxAlign); + } + + if (MFI->adjustsStack() && TFI->hasReservedCallFrame(MF)) + Offset += MFI->getMaxCallFrameSize(); + + // Round up the size to a multiple of the alignment. If the function has + // any calls or alloca's, align to the target's StackAlignment value to + // ensure that the callee's frame or the alloca data is suitably aligned; + // otherwise, for leaf functions, align to the TransientStackAlignment + // value. 
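Both align-up idioms used by estimateStackSize (the divide/multiply form above and the mask form just below) compute the same rounding; a standalone sketch (not part of this patch):

#include <cassert>
#include <cstdint>

// Align-up via divide/multiply: valid for any non-zero alignment.
static uint64_t alignUpDiv(uint64_t offset, uint64_t align) {
  return (offset + align - 1) / align * align;
}

// Align-up via masking: requires align to be a power of two.
static uint64_t alignUpMask(uint64_t offset, uint64_t align) {
  uint64_t mask = align - 1;
  return (offset + mask) & ~mask;
}

int main() {
  assert(alignUpDiv(23, 8) == 24 && alignUpMask(23, 8) == 24);
  assert(alignUpDiv(32, 16) == 32 && alignUpMask(32, 16) == 32);
  return 0;
}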
+ unsigned StackAlign; + if (MFI->adjustsStack() || MFI->hasVarSizedObjects() || + (RegInfo->needsStackRealignment(MF) && MFI->getObjectIndexEnd() != 0)) + StackAlign = TFI->getStackAlignment(); + else + StackAlign = TFI->getTransientStackAlignment(); + + // If the frame pointer is eliminated, all frame offsets will be relative to + // SP not FP. Align to MaxAlign so this works. + StackAlign = std::max(StackAlign, MaxAlign); + unsigned AlignMask = StackAlign - 1; + Offset = (Offset + AlignMask) & ~uint64_t(AlignMask); + + return (unsigned)Offset; +} + +void +AArch64FrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, + RegScavenger *RS) const { + const AArch64RegisterInfo *RegInfo = + static_cast<const AArch64RegisterInfo *>(MF.getTarget().getRegisterInfo()); + MachineFrameInfo *MFI = MF.getFrameInfo(); + const AArch64InstrInfo &TII = + *static_cast<const AArch64InstrInfo *>(MF.getTarget().getInstrInfo()); + + if (hasFP(MF)) { + MF.getRegInfo().setPhysRegUsed(AArch64::X29); + MF.getRegInfo().setPhysRegUsed(AArch64::X30); + } + + // If addressing of local variables is going to be more complicated than + // shoving a base register and an offset into the instruction then we may well + // need to scavenge registers. We should either specifically add an + // callee-save register for this purpose or allocate an extra spill slot. + + bool BigStack = + (RS && estimateStackSize(MF) >= TII.estimateRSStackLimit(MF)) + || MFI->hasVarSizedObjects() // Access will be from X29: messes things up + || (MFI->adjustsStack() && !hasReservedCallFrame(MF)); + + if (!BigStack) + return; + + // We certainly need some slack space for the scavenger, preferably an extra + // register. + const uint16_t *CSRegs = RegInfo->getCalleeSavedRegs(); + uint16_t ExtraReg = AArch64::NoRegister; + + for (unsigned i = 0; CSRegs[i]; ++i) { + if (AArch64::GPR64RegClass.contains(CSRegs[i]) && + !MF.getRegInfo().isPhysRegUsed(CSRegs[i])) { + ExtraReg = CSRegs[i]; + break; + } + } + + if (ExtraReg != 0) { + MF.getRegInfo().setPhysRegUsed(ExtraReg); + } else { + // Create a stack slot for scavenging purposes. PrologEpilogInserter + // helpfully places it near either SP or FP for us to avoid + // infinitely-regression during scavenging. + const TargetRegisterClass *RC = &AArch64::GPR64RegClass; + RS->setScavengingFrameIndex(MFI->CreateStackObject(RC->getSize(), + RC->getAlignment(), + false)); + } +} + +bool AArch64FrameLowering::determinePrologueDeath(MachineBasicBlock &MBB, + unsigned Reg) const { + // If @llvm.returnaddress is called then it will refer to X30 by some means; + // the prologue store does not kill the register. + if (Reg == AArch64::X30) { + if (MBB.getParent()->getFrameInfo()->isReturnAddressTaken() + && MBB.getParent()->getRegInfo().isLiveIn(Reg)) + return false; + } + + // In all other cases, physical registers are dead after they've been saved + // but live at the beginning of the prologue block. + MBB.addLiveIn(Reg); + return true; +} + +void +AArch64FrameLowering::emitFrameMemOps(bool isPrologue, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + const std::vector<CalleeSavedInfo> &CSI, + const TargetRegisterInfo *TRI, + LoadStoreMethod PossClasses[], + unsigned NumClasses) const { + DebugLoc DL = MBB.findDebugLoc(MBBI); + MachineFunction &MF = *MBB.getParent(); + MachineFrameInfo &MFI = *MF.getFrameInfo(); + const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); + + // A certain amount of implicit contract is present here. 
The actual stack
+ // offsets haven't been allocated officially yet, so for strictly correct code
+ // we rely on the fact that the elements of CSI are allocated in order
+ // starting at SP, purely as dictated by size and alignment. In practice,
+ // since this function handles the only accesses to those slots, it's not
+ // quite so important.
+ //
+ // We have also ordered the Callee-saved register list in AArch64CallingConv
+ // so that the above scheme puts registers in order: in particular we want
+ // &X30 to be &X29+8 for an ABI-correct frame record (PCS 5.2.2).
+ for (unsigned i = 0, e = CSI.size(); i < e; ++i) {
+ unsigned Reg = CSI[i].getReg();
+
+ // First, we need to find out which register class the register belongs to
+ // so that we can use the correct load/store instructions.
+ unsigned ClassIdx;
+ for (ClassIdx = 0; ClassIdx < NumClasses; ++ClassIdx) {
+ if (PossClasses[ClassIdx].RegClass->contains(Reg))
+ break;
+ }
+ assert(ClassIdx != NumClasses
+ && "Asked to store register in unexpected class");
+ const TargetRegisterClass &TheClass = *PossClasses[ClassIdx].RegClass;
+
+ // Now we need to decide whether it's possible to emit a paired instruction:
+ // for this we want the next register to be in the same class.
+ MachineInstrBuilder NewMI;
+ bool Pair = false;
+ if (i + 1 < CSI.size() && TheClass.contains(CSI[i+1].getReg())) {
+ Pair = true;
+ unsigned StLow = 0, StHigh = 0;
+ if (isPrologue) {
+ // Most of these registers will be live-in to the MBB and killed by our
+ // store, though there are exceptions (see determinePrologueDeath).
+ StLow = getKillRegState(determinePrologueDeath(MBB, CSI[i+1].getReg()));
+ StHigh = getKillRegState(determinePrologueDeath(MBB, CSI[i].getReg()));
+ } else {
+ StLow = RegState::Define;
+ StHigh = RegState::Define;
+ }
+
+ NewMI = BuildMI(MBB, MBBI, DL, TII.get(PossClasses[ClassIdx].PairOpcode))
+ .addReg(CSI[i+1].getReg(), StLow)
+ .addReg(CSI[i].getReg(), StHigh);
+
+ // If it's a paired op, we've consumed two registers
+ ++i;
+ } else {
+ unsigned State;
+ if (isPrologue) {
+ State = getKillRegState(determinePrologueDeath(MBB, CSI[i].getReg()));
+ } else {
+ State = RegState::Define;
+ }
+
+ NewMI = BuildMI(MBB, MBBI, DL,
+ TII.get(PossClasses[ClassIdx].SingleOpcode))
+ .addReg(CSI[i].getReg(), State);
+ }
+
+ // Note that the FrameIdx refers to the second register in a pair: it will
+ // be allocated the smaller numeric address and so is the one an LDP/STP
+ // address must use.
+ int FrameIdx = CSI[i].getFrameIdx();
+ MachineMemOperand::MemOperandFlags Flags;
+ Flags = isPrologue ? MachineMemOperand::MOStore : MachineMemOperand::MOLoad;
+ MachineMemOperand *MMO =
+ MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FrameIdx),
+ Flags,
+ Pair ? TheClass.getSize() * 2 : TheClass.getSize(),
+ MFI.getObjectAlignment(FrameIdx));
+
+ NewMI.addFrameIndex(FrameIdx)
+ .addImm(0) // address-register offset
+ .addMemOperand(MMO);
+
+ if (isPrologue)
+ NewMI.setMIFlags(MachineInstr::FrameSetup);
+
+ // For aesthetic reasons, during an epilogue we want to emit complementary
+ // operations to the prologue, but in the opposite order. So we still
+ // iterate through the CalleeSavedInfo list in order, but we put the
+ // instructions successively earlier in the MBB.
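+ // Illustratively: if the prologue emitted "stp x19, x20" then "stp x21, x22",
+ // inserting each epilogue load before the previous one gives the
+ // complementary "ldp x21, x22" then "ldp x19, x20".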
+ if (!isPrologue)
+ --MBBI;
+ }
+}
+
+bool
+AArch64FrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI) const {
+ if (CSI.empty())
+ return false;
+
+ static LoadStoreMethod PossibleClasses[] = {
+ {&AArch64::GPR64RegClass, AArch64::LSPair64_STR, AArch64::LS64_STR},
+ {&AArch64::FPR64RegClass, AArch64::LSFPPair64_STR, AArch64::LSFP64_STR},
+ };
+ unsigned NumClasses = llvm::array_lengthof(PossibleClasses);
+
+ emitFrameMemOps(/* isPrologue = */ true, MBB, MBBI, CSI, TRI,
+ PossibleClasses, NumClasses);
+
+ return true;
+}
+
+bool
+AArch64FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI) const {
+ if (CSI.empty())
+ return false;
+
+ static LoadStoreMethod PossibleClasses[] = {
+ {&AArch64::GPR64RegClass, AArch64::LSPair64_LDR, AArch64::LS64_LDR},
+ {&AArch64::FPR64RegClass, AArch64::LSFPPair64_LDR, AArch64::LSFP64_LDR},
+ };
+ unsigned NumClasses = llvm::array_lengthof(PossibleClasses);
+
+ emitFrameMemOps(/* isPrologue = */ false, MBB, MBBI, CSI, TRI,
+ PossibleClasses, NumClasses);
+
+ return true;
+}
+
+bool
+AArch64FrameLowering::hasFP(const MachineFunction &MF) const {
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+ const TargetRegisterInfo *RI = MF.getTarget().getRegisterInfo();
+
+ // This is an ABI compliance decision. The AArch64 PCS gives various options
+ // for conformance, and even at the most stringent level more or less permits
+ // elimination for leaf functions because there's no loss of functionality
+ // (for debugging etc.).
+ if (MF.getTarget().Options.DisableFramePointerElim(MF) && MFI->hasCalls())
+ return true;
+
+ // The following are hard limits: incorrect code will be generated if we try
+ // to omit the frame.
+ return (RI->needsStackRealignment(MF) ||
+ MFI->hasVarSizedObjects() ||
+ MFI->isFrameAddressTaken());
+}
+
+bool
+AArch64FrameLowering::useFPForAddressing(const MachineFunction &MF) const {
+ return MF.getFrameInfo()->hasVarSizedObjects();
+}
+
+bool
+AArch64FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+
+ // Of the various reasons for having a frame pointer, it's actually only
+ // variable-sized objects that prevent reservation of a call frame.
+ return !(hasFP(MF) && MFI->hasVarSizedObjects());
+}
+
+void
+AArch64FrameLowering::eliminateCallFramePseudoInstr(
+ MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI) const {
+ const AArch64InstrInfo &TII =
+ *static_cast<const AArch64InstrInfo *>(MF.getTarget().getInstrInfo());
+ DebugLoc dl = MI->getDebugLoc();
+ int Opcode = MI->getOpcode();
+ bool IsDestroy = Opcode == TII.getCallFrameDestroyOpcode();
+ uint64_t CalleePopAmount = IsDestroy ? MI->getOperand(1).getImm() : 0;
+
+ if (!hasReservedCallFrame(MF)) {
+ unsigned Align = getStackAlignment();
+
+ int64_t Amount = MI->getOperand(0).getImm();
+ Amount = RoundUpToAlignment(Amount, Align);
+ if (!IsDestroy) Amount = -Amount;
+
+ // N.b. if CalleePopAmount is valid but zero (i.e. callee would pop, but it
+ // doesn't have to pop anything), then the first operand will be zero too so
+ // this adjustment is a no-op.
+ if (CalleePopAmount == 0) {
+ // FIXME: in-function stack adjustment for calls is limited to 12 bits
+ // because there's no guaranteed temporary register available.
Mostly call + // frames will be allocated at the start of a function so this is OK, but + // it is a limitation that needs dealing with. + assert(Amount > -0xfff && Amount < 0xfff && "call frame too large"); + emitSPUpdate(MBB, MI, dl, TII, AArch64::NoRegister, Amount); + } + } else if (CalleePopAmount != 0) { + // If the calling convention demands that the callee pops arguments from the + // stack, we want to add it back if we have a reserved call frame. + assert(CalleePopAmount < 0xfff && "call frame too large"); + emitSPUpdate(MBB, MI, dl, TII, AArch64::NoRegister, -CalleePopAmount); + } + + MBB.erase(MI); +} diff --git a/lib/Target/AArch64/AArch64FrameLowering.h b/lib/Target/AArch64/AArch64FrameLowering.h new file mode 100644 index 0000000..45ea0ec --- /dev/null +++ b/lib/Target/AArch64/AArch64FrameLowering.h @@ -0,0 +1,108 @@ +//==- AArch64FrameLowering.h - Define frame lowering for AArch64 -*- C++ -*--=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This class implements the AArch64-specific parts of the TargetFrameLowering +// class. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_AARCH64_FRAMEINFO_H +#define LLVM_AARCH64_FRAMEINFO_H + +#include "AArch64Subtarget.h" +#include "llvm/Target/TargetFrameLowering.h" + +namespace llvm { +class AArch64Subtarget; + +class AArch64FrameLowering : public TargetFrameLowering { +private: + // In order to unify the spilling and restoring of callee-saved registers into + // emitFrameMemOps, we need to be able to specify which instructions to use + // for the relevant memory operations on each register class. An array of the + // following struct is populated and passed in to achieve this. + struct LoadStoreMethod { + const TargetRegisterClass *RegClass; // E.g. GPR64RegClass + + // The preferred instruction. + unsigned PairOpcode; // E.g. LSPair64_STR + + // Sometimes only a single register can be handled at once. + unsigned SingleOpcode; // E.g. LS64_STR + }; +protected: + const AArch64Subtarget &STI; + +public: + explicit AArch64FrameLowering(const AArch64Subtarget &sti) + : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, 16, 0, 16), + STI(sti) { + } + + /// emitProlog/emitEpilog - These methods insert prolog and epilog code into + /// the function. + virtual void emitPrologue(MachineFunction &MF) const; + virtual void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const; + + /// Decides how much stack adjustment to perform in each phase of the prologue + /// and epilogue. 
+ void splitSPAdjustments(uint64_t Total, uint64_t &Initial,
+ uint64_t &Residual) const;
+
+ int64_t resolveFrameIndexReference(MachineFunction &MF, int FrameIndex,
+ unsigned &FrameReg, int SPAdj,
+ bool IsCalleeSaveOp) const;
+
+ virtual void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+ RegScavenger *RS) const;
+
+ virtual bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI) const;
+ virtual bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI) const;
+
+ void eliminateCallFramePseudoInstr(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI) const;
+
+ /// If the register is X30 (i.e. LR) and the return address is used in the
+ /// function then the callee-save store doesn't actually kill the register;
+ /// otherwise it does.
+ bool determinePrologueDeath(MachineBasicBlock &MBB, unsigned Reg) const;
+
+ /// This function emits the loads or stores required during prologue and
+ /// epilogue as efficiently as possible.
+ ///
+ /// The operations involved in setting up and tearing down the frame are
+ /// similar enough to warrant a shared function, particularly as discrepancies
+ /// between the two would be disastrous.
+ void emitFrameMemOps(bool isPrologue, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI,
+ LoadStoreMethod PossibleClasses[],
+ unsigned NumClasses) const;
+
+ virtual bool hasFP(const MachineFunction &MF) const;
+
+ virtual bool useFPForAddressing(const MachineFunction &MF) const;
+
+ /// On AArch64 the call frame can be reserved unless the function contains
+ /// variable-sized objects.
+ virtual bool hasReservedCallFrame(const MachineFunction &MF) const;
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
new file mode 100644
index 0000000..46b8221
--- /dev/null
+++ b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -0,0 +1,415 @@
+//===-- AArch64ISelDAGToDAG.cpp - A dag to dag inst selector for AArch64 --===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an instruction selector for the AArch64 target.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "aarch64-isel"
+#include "AArch64.h"
+#include "AArch64InstrInfo.h"
+#include "AArch64Subtarget.h"
+#include "AArch64TargetMachine.h"
+#include "Utils/AArch64BaseInfo.h"
+#include "llvm/ADT/APSInt.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+//===--------------------------------------------------------------------===//
+/// AArch64 specific code to select AArch64 machine instructions for
+/// SelectionDAG operations.
+///
+namespace {
+
+class AArch64DAGToDAGISel : public SelectionDAGISel {
+ AArch64TargetMachine &TM;
+ const AArch64InstrInfo *TII;
+
+ /// Keep a pointer to the AArch64Subtarget around so that we can
+ /// make the right decision when generating code for different targets.
+ const AArch64Subtarget *Subtarget; + +public: + explicit AArch64DAGToDAGISel(AArch64TargetMachine &tm, + CodeGenOpt::Level OptLevel) + : SelectionDAGISel(tm, OptLevel), TM(tm), + TII(static_cast<const AArch64InstrInfo*>(TM.getInstrInfo())), + Subtarget(&TM.getSubtarget<AArch64Subtarget>()) { + } + + virtual const char *getPassName() const { + return "AArch64 Instruction Selection"; + } + + // Include the pieces autogenerated from the target description. +#include "AArch64GenDAGISel.inc" + + template<unsigned MemSize> + bool SelectOffsetUImm12(SDValue N, SDValue &UImm12) { + const ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N); + if (!CN || CN->getZExtValue() % MemSize != 0 + || CN->getZExtValue() / MemSize > 0xfff) + return false; + + UImm12 = CurDAG->getTargetConstant(CN->getZExtValue() / MemSize, MVT::i64); + return true; + } + + template<unsigned RegWidth> + bool SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos) { + return SelectCVTFixedPosOperand(N, FixedPos, RegWidth); + } + + bool SelectFPZeroOperand(SDValue N, SDValue &Dummy); + + bool SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos, + unsigned RegWidth); + + bool SelectInlineAsmMemoryOperand(const SDValue &Op, + char ConstraintCode, + std::vector<SDValue> &OutOps); + + bool SelectLogicalImm(SDValue N, SDValue &Imm); + + template<unsigned RegWidth> + bool SelectTSTBOperand(SDValue N, SDValue &FixedPos) { + return SelectTSTBOperand(N, FixedPos, RegWidth); + } + + bool SelectTSTBOperand(SDValue N, SDValue &FixedPos, unsigned RegWidth); + + SDNode *TrySelectToMoveImm(SDNode *N); + SDNode *LowerToFPLitPool(SDNode *Node); + SDNode *SelectToLitPool(SDNode *N); + + SDNode* Select(SDNode*); +private: +}; +} + +bool +AArch64DAGToDAGISel::SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos, + unsigned RegWidth) { + const ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(N); + if (!CN) return false; + + // An FCVT[SU] instruction performs: convertToInt(Val * 2^fbits) where fbits + // is between 1 and 32 for a destination w-register, or 1 and 64 for an + // x-register. + // + // By this stage, we've detected (fp_to_[su]int (fmul Val, THIS_NODE)) so we + // want THIS_NODE to be 2^fbits. This is much easier to deal with using + // integers. + bool IsExact; + + // fbits is between 1 and 64 in the worst-case, which means the fmul + // could have 2^64 as an actual operand. Need 65 bits of precision. + APSInt IntVal(65, true); + CN->getValueAPF().convertToInteger(IntVal, APFloat::rmTowardZero, &IsExact); + + // N.b. isPowerOf2 also checks for > 0. + if (!IsExact || !IntVal.isPowerOf2()) return false; + unsigned FBits = IntVal.logBase2(); + + // Checks above should have guaranteed that we haven't lost information in + // finding FBits, but it must still be in range. + if (FBits == 0 || FBits > RegWidth) return false; + + FixedPos = CurDAG->getTargetConstant(64 - FBits, MVT::i32); + return true; +} + +bool +AArch64DAGToDAGISel::SelectInlineAsmMemoryOperand(const SDValue &Op, + char ConstraintCode, + std::vector<SDValue> &OutOps) { + switch (ConstraintCode) { + default: llvm_unreachable("Unrecognised AArch64 memory constraint"); + case 'm': + // FIXME: more freedom is actually permitted for 'm'. We can go + // hunting for a base and an offset if we want. Of course, since + // we don't really know how the operand is going to be used we're + // probably restricted to the load/store pair's simm7 as an offset + // range anyway. 
+ case 'Q':
+ OutOps.push_back(Op);
+ }
+
+ return false;
+}
+
+bool
+AArch64DAGToDAGISel::SelectFPZeroOperand(SDValue N, SDValue &Dummy) {
+ ConstantFPSDNode *Imm = dyn_cast<ConstantFPSDNode>(N);
+ if (!Imm || !Imm->getValueAPF().isPosZero())
+ return false;
+
+ // Doesn't actually carry any information, but keeps TableGen quiet.
+ Dummy = CurDAG->getTargetConstant(0, MVT::i32);
+ return true;
+}
+
+bool AArch64DAGToDAGISel::SelectLogicalImm(SDValue N, SDValue &Imm) {
+ uint32_t Bits;
+ uint32_t RegWidth = N.getValueType().getSizeInBits();
+
+ ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N);
+ if (!CN) return false;
+
+ if (!A64Imms::isLogicalImm(RegWidth, CN->getZExtValue(), Bits))
+ return false;
+
+ Imm = CurDAG->getTargetConstant(Bits, MVT::i32);
+ return true;
+}
+
+SDNode *AArch64DAGToDAGISel::TrySelectToMoveImm(SDNode *Node) {
+ SDNode *ResNode;
+ DebugLoc dl = Node->getDebugLoc();
+ EVT DestType = Node->getValueType(0);
+ unsigned DestWidth = DestType.getSizeInBits();
+
+ unsigned MOVOpcode;
+ EVT MOVType;
+ int UImm16, Shift;
+ uint32_t LogicalBits;
+
+ uint64_t BitPat = cast<ConstantSDNode>(Node)->getZExtValue();
+ if (A64Imms::isMOVZImm(DestWidth, BitPat, UImm16, Shift)) {
+ MOVType = DestType;
+ MOVOpcode = DestWidth == 64 ? AArch64::MOVZxii : AArch64::MOVZwii;
+ } else if (A64Imms::isMOVNImm(DestWidth, BitPat, UImm16, Shift)) {
+ MOVType = DestType;
+ MOVOpcode = DestWidth == 64 ? AArch64::MOVNxii : AArch64::MOVNwii;
+ } else if (DestWidth == 64 && A64Imms::isMOVNImm(32, BitPat, UImm16, Shift)) {
+ // To get something like 0x0000_0000_ffff_1234 into a 64-bit register we can
+ // use a 32-bit instruction: "movn w0, 0xedcb".
+ MOVType = MVT::i32;
+ MOVOpcode = AArch64::MOVNwii;
+ } else if (A64Imms::isLogicalImm(DestWidth, BitPat, LogicalBits)) {
+ MOVOpcode = DestWidth == 64 ? AArch64::ORRxxi : AArch64::ORRwwi;
+ uint16_t ZR = DestWidth == 64 ? AArch64::XZR : AArch64::WZR;
+
+ return CurDAG->getMachineNode(MOVOpcode, dl, DestType,
+ CurDAG->getRegister(ZR, DestType),
+ CurDAG->getTargetConstant(LogicalBits, MVT::i32));
+ } else {
+ // Can't handle it in one instruction. There's scope for permitting two (or
+ // more) instructions, but that'll need more thought.
+ return NULL;
+ }
+
+ ResNode = CurDAG->getMachineNode(MOVOpcode, dl, MOVType,
+ CurDAG->getTargetConstant(UImm16, MVT::i32),
+ CurDAG->getTargetConstant(Shift, MVT::i32));
+
+ if (MOVType != DestType) {
+ ResNode = CurDAG->getMachineNode(TargetOpcode::SUBREG_TO_REG, dl,
+ MVT::i64, MVT::i32, MVT::Other,
+ CurDAG->getTargetConstant(0, MVT::i64),
+ SDValue(ResNode, 0),
+ CurDAG->getTargetConstant(AArch64::sub_32, MVT::i32));
+ }
+
+ return ResNode;
+}
+
+SDNode *AArch64DAGToDAGISel::SelectToLitPool(SDNode *Node) {
+ DebugLoc DL = Node->getDebugLoc();
+ uint64_t UnsignedVal = cast<ConstantSDNode>(Node)->getZExtValue();
+ int64_t SignedVal = cast<ConstantSDNode>(Node)->getSExtValue();
+ EVT DestType = Node->getValueType(0);
+ EVT PtrVT = TLI.getPointerTy();
+
+ // Since we may end up loading a 64-bit constant from a 32-bit entry the
+ // constant in the pool may have a different type to the eventual node.
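+ // For example, the i64 constant 0x00000000fffffffe passes the unsigned check
+ // below and becomes a zero-extending load of an i32 entry, while
+ // 0xffffffff80000000 (INT32_MIN as an i64) takes the sign-extending path.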
+ ISD::LoadExtType Extension; + EVT MemType; + + assert((DestType == MVT::i64 || DestType == MVT::i32) + && "Only expect integer constants at the moment"); + + if (DestType == MVT::i32) { + Extension = ISD::NON_EXTLOAD; + MemType = MVT::i32; + } else if (UnsignedVal <= UINT32_MAX) { + Extension = ISD::ZEXTLOAD; + MemType = MVT::i32; + } else if (SignedVal >= INT32_MIN && SignedVal <= INT32_MAX) { + Extension = ISD::SEXTLOAD; + MemType = MVT::i32; + } else { + Extension = ISD::NON_EXTLOAD; + MemType = MVT::i64; + } + + Constant *CV = ConstantInt::get(Type::getIntNTy(*CurDAG->getContext(), + MemType.getSizeInBits()), + UnsignedVal); + SDValue PoolAddr; + unsigned Alignment = TLI.getDataLayout()->getABITypeAlignment(CV->getType()); + PoolAddr = CurDAG->getNode(AArch64ISD::WrapperSmall, DL, PtrVT, + CurDAG->getTargetConstantPool(CV, PtrVT, 0, 0, + AArch64II::MO_NO_FLAG), + CurDAG->getTargetConstantPool(CV, PtrVT, 0, 0, + AArch64II::MO_LO12), + CurDAG->getConstant(Alignment, MVT::i32)); + + return CurDAG->getExtLoad(Extension, DL, DestType, CurDAG->getEntryNode(), + PoolAddr, + MachinePointerInfo::getConstantPool(), MemType, + /* isVolatile = */ false, + /* isNonTemporal = */ false, + Alignment).getNode(); +} + +SDNode *AArch64DAGToDAGISel::LowerToFPLitPool(SDNode *Node) { + DebugLoc DL = Node->getDebugLoc(); + const ConstantFP *FV = cast<ConstantFPSDNode>(Node)->getConstantFPValue(); + EVT PtrVT = TLI.getPointerTy(); + EVT DestType = Node->getValueType(0); + + unsigned Alignment = TLI.getDataLayout()->getABITypeAlignment(FV->getType()); + SDValue PoolAddr; + + assert(TM.getCodeModel() == CodeModel::Small && + "Only small code model supported"); + PoolAddr = CurDAG->getNode(AArch64ISD::WrapperSmall, DL, PtrVT, + CurDAG->getTargetConstantPool(FV, PtrVT, 0, 0, + AArch64II::MO_NO_FLAG), + CurDAG->getTargetConstantPool(FV, PtrVT, 0, 0, + AArch64II::MO_LO12), + CurDAG->getConstant(Alignment, MVT::i32)); + + return CurDAG->getLoad(DestType, DL, CurDAG->getEntryNode(), PoolAddr, + MachinePointerInfo::getConstantPool(), + /* isVolatile = */ false, + /* isNonTemporal = */ false, + /* isInvariant = */ true, + Alignment).getNode(); +} + +bool +AArch64DAGToDAGISel::SelectTSTBOperand(SDValue N, SDValue &FixedPos, + unsigned RegWidth) { + const ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N); + if (!CN) return false; + + uint64_t Val = CN->getZExtValue(); + + if (!isPowerOf2_64(Val)) return false; + + unsigned TestedBit = Log2_64(Val); + // Checks above should have guaranteed that we haven't lost information in + // finding TestedBit, but it must still be in range. + if (TestedBit >= RegWidth) return false; + + FixedPos = CurDAG->getTargetConstant(TestedBit, MVT::i64); + return true; +} + +SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { + // Dump information about the Node being selected + DEBUG(dbgs() << "Selecting: "; Node->dump(CurDAG); dbgs() << "\n"); + + if (Node->isMachineOpcode()) { + DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << "\n"); + return NULL; + } + + switch (Node->getOpcode()) { + case ISD::FrameIndex: { + int FI = cast<FrameIndexSDNode>(Node)->getIndex(); + EVT PtrTy = TLI.getPointerTy(); + SDValue TFI = CurDAG->getTargetFrameIndex(FI, PtrTy); + return CurDAG->SelectNodeTo(Node, AArch64::ADDxxi_lsl0_s, PtrTy, + TFI, CurDAG->getTargetConstant(0, PtrTy)); + } + case ISD::ConstantPool: { + // Constant pools are fine, just create a Target entry. 
+ ConstantPoolSDNode *CN = cast<ConstantPoolSDNode>(Node);
+ const Constant *C = CN->getConstVal();
+ SDValue CP = CurDAG->getTargetConstantPool(C, CN->getValueType(0));
+
+ ReplaceUses(SDValue(Node, 0), CP);
+ return NULL;
+ }
+ case ISD::Constant: {
+ SDNode *ResNode = 0;
+ if (cast<ConstantSDNode>(Node)->getZExtValue() == 0) {
+ // XZR and WZR are probably even better than an actual move: most of the
+ // time they can be folded into another instruction with *no* cost.
+
+ EVT Ty = Node->getValueType(0);
+ assert((Ty == MVT::i32 || Ty == MVT::i64) && "unexpected type");
+ uint16_t Register = Ty == MVT::i32 ? AArch64::WZR : AArch64::XZR;
+ ResNode = CurDAG->getCopyFromReg(CurDAG->getEntryNode(),
+ Node->getDebugLoc(),
+ Register, Ty).getNode();
+ }
+
+ // Next best option is a move-immediate; see if we can do that.
+ if (!ResNode) {
+ ResNode = TrySelectToMoveImm(Node);
+ }
+
+ if (ResNode)
+ return ResNode;
+
+ // If even that fails we fall back to a lit-pool entry at the moment. Future
+ // tuning may change this to a sequence of MOVZ/MOVN/MOVK instructions.
+ ResNode = SelectToLitPool(Node);
+ assert(ResNode && "We need *some* way to materialise a constant");
+
+ // We want to continue selection at this point since the litpool access
+ // we generated uses generic nodes for simplicity.
+ ReplaceUses(SDValue(Node, 0), SDValue(ResNode, 0));
+ Node = ResNode;
+ break;
+ }
+ case ISD::ConstantFP: {
+ if (A64Imms::isFPImm(cast<ConstantFPSDNode>(Node)->getValueAPF())) {
+ // FMOV will take care of it from TableGen
+ break;
+ }
+
+ SDNode *ResNode = LowerToFPLitPool(Node);
+ ReplaceUses(SDValue(Node, 0), SDValue(ResNode, 0));
+
+ // We want to continue selection at this point since the litpool access
+ // we generated uses generic nodes for simplicity.
+ Node = ResNode;
+ break;
+ }
+ default:
+ break; // Let generic code handle it
+ }
+
+ SDNode *ResNode = SelectCode(Node);
+
+ DEBUG(dbgs() << "=> ";
+ if (ResNode == NULL || ResNode == Node)
+ Node->dump(CurDAG);
+ else
+ ResNode->dump(CurDAG);
+ dbgs() << "\n");
+
+ return ResNode;
+}
+
+/// This pass converts a legalized DAG into an AArch64-specific DAG, ready for
+/// instruction scheduling.
+FunctionPass *llvm::createAArch64ISelDAG(AArch64TargetMachine &TM,
+ CodeGenOpt::Level OptLevel) {
+ return new AArch64DAGToDAGISel(TM, OptLevel);
+}
diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp
new file mode 100644
index 0000000..cea7f91
--- /dev/null
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -0,0 +1,2976 @@
+//===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation -----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that AArch64 uses to lower LLVM code into a
+// selection DAG.
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "aarch64-isel" +#include "AArch64.h" +#include "AArch64ISelLowering.h" +#include "AArch64MachineFunctionInfo.h" +#include "AArch64TargetMachine.h" +#include "AArch64TargetObjectFile.h" +#include "Utils/AArch64BaseInfo.h" +#include "llvm/CodeGen/Analysis.h" +#include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" +#include "llvm/IR/CallingConv.h" + +using namespace llvm; + +static TargetLoweringObjectFile *createTLOF(AArch64TargetMachine &TM) { + const AArch64Subtarget *Subtarget = &TM.getSubtarget<AArch64Subtarget>(); + + if (Subtarget->isTargetLinux()) + return new AArch64LinuxTargetObjectFile(); + if (Subtarget->isTargetELF()) + return new TargetLoweringObjectFileELF(); + llvm_unreachable("unknown subtarget type"); +} + + +AArch64TargetLowering::AArch64TargetLowering(AArch64TargetMachine &TM) + : TargetLowering(TM, createTLOF(TM)), + Subtarget(&TM.getSubtarget<AArch64Subtarget>()), + RegInfo(TM.getRegisterInfo()), + Itins(TM.getInstrItineraryData()) { + + // SIMD compares set the entire lane's bits to 1 + setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); + + // Scalar register <-> type mapping + addRegisterClass(MVT::i32, &AArch64::GPR32RegClass); + addRegisterClass(MVT::i64, &AArch64::GPR64RegClass); + addRegisterClass(MVT::f16, &AArch64::FPR16RegClass); + addRegisterClass(MVT::f32, &AArch64::FPR32RegClass); + addRegisterClass(MVT::f64, &AArch64::FPR64RegClass); + addRegisterClass(MVT::f128, &AArch64::FPR128RegClass); + + computeRegisterProperties(); + + // Some atomic operations can be folded into load-acquire or store-release + // instructions on AArch64. It's marginally simpler to let LLVM expand + // everything out to a barrier and then recombine the (few) barriers we can. + setInsertFencesForAtomic(true); + setTargetDAGCombine(ISD::ATOMIC_FENCE); + setTargetDAGCombine(ISD::ATOMIC_STORE); + + // We combine OR nodes for bitfield and NEON BSL operations. + setTargetDAGCombine(ISD::OR); + + setTargetDAGCombine(ISD::AND); + setTargetDAGCombine(ISD::SRA); + + // AArch64 does not have i1 loads, or much of anything for i1 really. + setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote); + setLoadExtAction(ISD::ZEXTLOAD, MVT::i1, Promote); + setLoadExtAction(ISD::EXTLOAD, MVT::i1, Promote); + + setStackPointerRegisterToSaveRestore(AArch64::XSP); + setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand); + setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); + setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); + + // We'll lower globals to wrappers for selection. + setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); + setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom); + + // A64 instructions have the comparison predicate attached to the user of the + // result, but having a separate comparison is valuable for matching. 
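+ // (Roughly: an IR icmp feeding a branch ends up as a flag-setting SUBS
+ // followed by a B.cc reading NZCV, rather than one fused operation.)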
+ setOperationAction(ISD::BR_CC, MVT::i32, Custom); + setOperationAction(ISD::BR_CC, MVT::i64, Custom); + setOperationAction(ISD::BR_CC, MVT::f32, Custom); + setOperationAction(ISD::BR_CC, MVT::f64, Custom); + + setOperationAction(ISD::SELECT, MVT::i32, Custom); + setOperationAction(ISD::SELECT, MVT::i64, Custom); + setOperationAction(ISD::SELECT, MVT::f32, Custom); + setOperationAction(ISD::SELECT, MVT::f64, Custom); + + setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); + setOperationAction(ISD::SELECT_CC, MVT::i64, Custom); + setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); + setOperationAction(ISD::SELECT_CC, MVT::f64, Custom); + + setOperationAction(ISD::BRCOND, MVT::Other, Custom); + + setOperationAction(ISD::SETCC, MVT::i32, Custom); + setOperationAction(ISD::SETCC, MVT::i64, Custom); + setOperationAction(ISD::SETCC, MVT::f32, Custom); + setOperationAction(ISD::SETCC, MVT::f64, Custom); + + setOperationAction(ISD::BR_JT, MVT::Other, Expand); + setOperationAction(ISD::JumpTable, MVT::i32, Custom); + setOperationAction(ISD::JumpTable, MVT::i64, Custom); + + setOperationAction(ISD::VASTART, MVT::Other, Custom); + setOperationAction(ISD::VACOPY, MVT::Other, Custom); + setOperationAction(ISD::VAEND, MVT::Other, Expand); + setOperationAction(ISD::VAARG, MVT::Other, Expand); + + setOperationAction(ISD::BlockAddress, MVT::i64, Custom); + + setOperationAction(ISD::ROTL, MVT::i32, Expand); + setOperationAction(ISD::ROTL, MVT::i64, Expand); + + setOperationAction(ISD::UREM, MVT::i32, Expand); + setOperationAction(ISD::UREM, MVT::i64, Expand); + setOperationAction(ISD::UDIVREM, MVT::i32, Expand); + setOperationAction(ISD::UDIVREM, MVT::i64, Expand); + + setOperationAction(ISD::SREM, MVT::i32, Expand); + setOperationAction(ISD::SREM, MVT::i64, Expand); + setOperationAction(ISD::SDIVREM, MVT::i32, Expand); + setOperationAction(ISD::SDIVREM, MVT::i64, Expand); + + setOperationAction(ISD::CTPOP, MVT::i32, Expand); + setOperationAction(ISD::CTPOP, MVT::i64, Expand); + + // Legal floating-point operations. + setOperationAction(ISD::FABS, MVT::f32, Legal); + setOperationAction(ISD::FABS, MVT::f64, Legal); + + setOperationAction(ISD::FCEIL, MVT::f32, Legal); + setOperationAction(ISD::FCEIL, MVT::f64, Legal); + + setOperationAction(ISD::FFLOOR, MVT::f32, Legal); + setOperationAction(ISD::FFLOOR, MVT::f64, Legal); + + setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal); + setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal); + + setOperationAction(ISD::FNEG, MVT::f32, Legal); + setOperationAction(ISD::FNEG, MVT::f64, Legal); + + setOperationAction(ISD::FRINT, MVT::f32, Legal); + setOperationAction(ISD::FRINT, MVT::f64, Legal); + + setOperationAction(ISD::FSQRT, MVT::f32, Legal); + setOperationAction(ISD::FSQRT, MVT::f64, Legal); + + setOperationAction(ISD::FTRUNC, MVT::f32, Legal); + setOperationAction(ISD::FTRUNC, MVT::f64, Legal); + + setOperationAction(ISD::ConstantFP, MVT::f32, Legal); + setOperationAction(ISD::ConstantFP, MVT::f64, Legal); + setOperationAction(ISD::ConstantFP, MVT::f128, Legal); + + // Illegal floating-point operations. 
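+ // (Expand means there is no A64 instruction for these: LLVM falls back to a
+ // generic expansion or a libcall, e.g. cosf/cos for FCOS.)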
+ setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
+
+ setOperationAction(ISD::FCOS, MVT::f32, Expand);
+ setOperationAction(ISD::FCOS, MVT::f64, Expand);
+
+ setOperationAction(ISD::FEXP, MVT::f32, Expand);
+ setOperationAction(ISD::FEXP, MVT::f64, Expand);
+
+ setOperationAction(ISD::FEXP2, MVT::f32, Expand);
+ setOperationAction(ISD::FEXP2, MVT::f64, Expand);
+
+ setOperationAction(ISD::FLOG, MVT::f32, Expand);
+ setOperationAction(ISD::FLOG, MVT::f64, Expand);
+
+ setOperationAction(ISD::FLOG2, MVT::f32, Expand);
+ setOperationAction(ISD::FLOG2, MVT::f64, Expand);
+
+ setOperationAction(ISD::FLOG10, MVT::f32, Expand);
+ setOperationAction(ISD::FLOG10, MVT::f64, Expand);
+
+ setOperationAction(ISD::FPOW, MVT::f32, Expand);
+ setOperationAction(ISD::FPOW, MVT::f64, Expand);
+
+ setOperationAction(ISD::FPOWI, MVT::f32, Expand);
+ setOperationAction(ISD::FPOWI, MVT::f64, Expand);
+
+ setOperationAction(ISD::FREM, MVT::f32, Expand);
+ setOperationAction(ISD::FREM, MVT::f64, Expand);
+
+ setOperationAction(ISD::FSIN, MVT::f32, Expand);
+ setOperationAction(ISD::FSIN, MVT::f64, Expand);
+
+ // Virtually no operation on f128 is legal, but LLVM can't expand f128
+ // operations when there's a valid register class, so we need custom
+ // operations in most cases.
+ setOperationAction(ISD::FABS, MVT::f128, Expand);
+ setOperationAction(ISD::FADD, MVT::f128, Custom);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand);
+ setOperationAction(ISD::FCOS, MVT::f128, Expand);
+ setOperationAction(ISD::FDIV, MVT::f128, Custom);
+ setOperationAction(ISD::FMA, MVT::f128, Expand);
+ setOperationAction(ISD::FMUL, MVT::f128, Custom);
+ setOperationAction(ISD::FNEG, MVT::f128, Expand);
+ setOperationAction(ISD::FP_EXTEND, MVT::f128, Expand);
+ setOperationAction(ISD::FP_ROUND, MVT::f128, Expand);
+ setOperationAction(ISD::FPOW, MVT::f128, Expand);
+ setOperationAction(ISD::FREM, MVT::f128, Expand);
+ setOperationAction(ISD::FRINT, MVT::f128, Expand);
+ setOperationAction(ISD::FSIN, MVT::f128, Expand);
+ setOperationAction(ISD::FSQRT, MVT::f128, Expand);
+ setOperationAction(ISD::FSUB, MVT::f128, Custom);
+ setOperationAction(ISD::FTRUNC, MVT::f128, Expand);
+ setOperationAction(ISD::SETCC, MVT::f128, Custom);
+ setOperationAction(ISD::BR_CC, MVT::f128, Custom);
+ setOperationAction(ISD::SELECT, MVT::f128, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::f128, Custom);
+ setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
+
+ // Lowering for many of the conversions is actually specified by the non-f128
+ // type. The LowerXXX function will be trivial when f128 isn't involved.
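+ // (Roughly: fp_to_sint from f32 or f64 still selects to a single fcvtzs;
+ // only the f128 variants need real work, typically a libcall.)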
+ setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom); + setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); + setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom); + setOperationAction(ISD::FP_ROUND, MVT::f32, Custom); + setOperationAction(ISD::FP_ROUND, MVT::f64, Custom); + + // This prevents LLVM trying to compress double constants into a floating + // constant-pool entry and trying to load from there. It's of doubtful benefit + // for A64: we'd need LDR followed by FCVT, I believe. + setLoadExtAction(ISD::EXTLOAD, MVT::f64, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::f16, Expand); + + setTruncStoreAction(MVT::f128, MVT::f64, Expand); + setTruncStoreAction(MVT::f128, MVT::f32, Expand); + setTruncStoreAction(MVT::f128, MVT::f16, Expand); + setTruncStoreAction(MVT::f64, MVT::f32, Expand); + setTruncStoreAction(MVT::f64, MVT::f16, Expand); + setTruncStoreAction(MVT::f32, MVT::f16, Expand); + + setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand); + setOperationAction(ISD::EHSELECTION, MVT::i64, Expand); + + setExceptionPointerRegister(AArch64::X0); + setExceptionSelectorRegister(AArch64::X1); +} + +EVT AArch64TargetLowering::getSetCCResultType(EVT VT) const { + // It's reasonably important that this value matches the "natural" legal + // promotion from i1 for scalar types. Otherwise LegalizeTypes can get itself + // in a twist (e.g. inserting an any_extend which then becomes i64 -> i64). + if (!VT.isVector()) return MVT::i32; + return VT.changeVectorElementTypeToInteger(); +} + +static void getExclusiveOperation(unsigned Size, unsigned &ldrOpc, + unsigned &strOpc) { + switch (Size) { + default: llvm_unreachable("unsupported size for atomic binary op!"); + case 1: + ldrOpc = AArch64::LDXR_byte; + strOpc = AArch64::STXR_byte; + break; + case 2: + ldrOpc = AArch64::LDXR_hword; + strOpc = AArch64::STXR_hword; + break; + case 4: + ldrOpc = AArch64::LDXR_word; + strOpc = AArch64::STXR_word; + break; + case 8: + ldrOpc = AArch64::LDXR_dword; + strOpc = AArch64::STXR_dword; + break; + } +} + +MachineBasicBlock * +AArch64TargetLowering::emitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB, + unsigned Size, + unsigned BinOpcode) const { + // This also handles ATOMIC_SWAP, indicated by BinOpcode==0. 
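+ // (When BinOpcode is 0 no arithmetic is emitted below; scratch is simply the
+ // incoming value, so the ldxr/stxr loop degenerates to a plain exchange.)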
+ const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + + const BasicBlock *LLVM_BB = BB->getBasicBlock(); + MachineFunction *MF = BB->getParent(); + MachineFunction::iterator It = BB; + ++It; + + unsigned dest = MI->getOperand(0).getReg(); + unsigned ptr = MI->getOperand(1).getReg(); + unsigned incr = MI->getOperand(2).getReg(); + DebugLoc dl = MI->getDebugLoc(); + + MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); + + unsigned ldrOpc, strOpc; + getExclusiveOperation(Size, ldrOpc, strOpc); + + MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB); + MF->insert(It, loopMBB); + MF->insert(It, exitMBB); + + // Transfer the remainder of BB and its successor edges to exitMBB. + exitMBB->splice(exitMBB->begin(), BB, + llvm::next(MachineBasicBlock::iterator(MI)), + BB->end()); + exitMBB->transferSuccessorsAndUpdatePHIs(BB); + + const TargetRegisterClass *TRC + = Size == 8 ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass; + unsigned scratch = (!BinOpcode) ? incr : MRI.createVirtualRegister(TRC); + + // thisMBB: + // ... + // fallthrough --> loopMBB + BB->addSuccessor(loopMBB); + + // loopMBB: + // ldxr dest, ptr + // <binop> scratch, dest, incr + // stxr stxr_status, scratch, ptr + // cmp stxr_status, #0 + // b.ne loopMBB + // fallthrough --> exitMBB + BB = loopMBB; + BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr); + if (BinOpcode) { + // All arithmetic operations we'll be creating are designed to take an extra + // shift or extend operand, which we can conveniently set to zero. + + // Operand order needs to go the other way for NAND. + if (BinOpcode == AArch64::BICwww_lsl || BinOpcode == AArch64::BICxxx_lsl) + BuildMI(BB, dl, TII->get(BinOpcode), scratch) + .addReg(incr).addReg(dest).addImm(0); + else + BuildMI(BB, dl, TII->get(BinOpcode), scratch) + .addReg(dest).addReg(incr).addImm(0); + } + + // From the stxr, the register is GPR32; from the cmp it's GPR32wsp + unsigned stxr_status = MRI.createVirtualRegister(&AArch64::GPR32RegClass); + MRI.constrainRegClass(stxr_status, &AArch64::GPR32wspRegClass); + + BuildMI(BB, dl, TII->get(strOpc), stxr_status).addReg(scratch).addReg(ptr); + BuildMI(BB, dl, TII->get(AArch64::SUBwwi_lsl0_cmp)) + .addReg(stxr_status).addImm(0); + BuildMI(BB, dl, TII->get(AArch64::Bcc)) + .addImm(A64CC::NE).addMBB(loopMBB); + + BB->addSuccessor(loopMBB); + BB->addSuccessor(exitMBB); + + // exitMBB: + // ... + BB = exitMBB; + + MI->eraseFromParent(); // The instruction is gone now. 
+ + return BB; +} + +MachineBasicBlock * +AArch64TargetLowering::emitAtomicBinaryMinMax(MachineInstr *MI, + MachineBasicBlock *BB, + unsigned Size, + unsigned CmpOp, + A64CC::CondCodes Cond) const { + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + + const BasicBlock *LLVM_BB = BB->getBasicBlock(); + MachineFunction *MF = BB->getParent(); + MachineFunction::iterator It = BB; + ++It; + + unsigned dest = MI->getOperand(0).getReg(); + unsigned ptr = MI->getOperand(1).getReg(); + unsigned incr = MI->getOperand(2).getReg(); + unsigned oldval = dest; + DebugLoc dl = MI->getDebugLoc(); + + MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); + const TargetRegisterClass *TRC, *TRCsp; + if (Size == 8) { + TRC = &AArch64::GPR64RegClass; + TRCsp = &AArch64::GPR64xspRegClass; + } else { + TRC = &AArch64::GPR32RegClass; + TRCsp = &AArch64::GPR32wspRegClass; + } + + unsigned ldrOpc, strOpc; + getExclusiveOperation(Size, ldrOpc, strOpc); + + MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB); + MF->insert(It, loopMBB); + MF->insert(It, exitMBB); + + // Transfer the remainder of BB and its successor edges to exitMBB. + exitMBB->splice(exitMBB->begin(), BB, + llvm::next(MachineBasicBlock::iterator(MI)), + BB->end()); + exitMBB->transferSuccessorsAndUpdatePHIs(BB); + + unsigned scratch = MRI.createVirtualRegister(TRC); + MRI.constrainRegClass(scratch, TRCsp); + + // thisMBB: + // ... + // fallthrough --> loopMBB + BB->addSuccessor(loopMBB); + + // loopMBB: + // ldxr dest, ptr + // cmp incr, dest (, sign extend if necessary) + // csel scratch, dest, incr, cond + // stxr stxr_status, scratch, ptr + // cmp stxr_status, #0 + // b.ne loopMBB + // fallthrough --> exitMBB + BB = loopMBB; + BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr); + + // Build compare and cmov instructions. + MRI.constrainRegClass(incr, TRCsp); + BuildMI(BB, dl, TII->get(CmpOp)) + .addReg(incr).addReg(oldval).addImm(0); + + BuildMI(BB, dl, TII->get(Size == 8 ? AArch64::CSELxxxc : AArch64::CSELwwwc), + scratch) + .addReg(oldval).addReg(incr).addImm(Cond); + + unsigned stxr_status = MRI.createVirtualRegister(&AArch64::GPR32RegClass); + MRI.constrainRegClass(stxr_status, &AArch64::GPR32wspRegClass); + + BuildMI(BB, dl, TII->get(strOpc), stxr_status) + .addReg(scratch).addReg(ptr); + BuildMI(BB, dl, TII->get(AArch64::SUBwwi_lsl0_cmp)) + .addReg(stxr_status).addImm(0); + BuildMI(BB, dl, TII->get(AArch64::Bcc)) + .addImm(A64CC::NE).addMBB(loopMBB); + + BB->addSuccessor(loopMBB); + BB->addSuccessor(exitMBB); + + // exitMBB: + // ... + BB = exitMBB; + + MI->eraseFromParent(); // The instruction is gone now. + + return BB; +} + +MachineBasicBlock * +AArch64TargetLowering::emitAtomicCmpSwap(MachineInstr *MI, + MachineBasicBlock *BB, + unsigned Size) const { + unsigned dest = MI->getOperand(0).getReg(); + unsigned ptr = MI->getOperand(1).getReg(); + unsigned oldval = MI->getOperand(2).getReg(); + unsigned newval = MI->getOperand(3).getReg(); + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + DebugLoc dl = MI->getDebugLoc(); + + MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); + const TargetRegisterClass *TRCsp; + TRCsp = Size == 8 ? 
&AArch64::GPR64xspRegClass : &AArch64::GPR32wspRegClass;
+
+ unsigned ldrOpc, strOpc;
+ getExclusiveOperation(Size, ldrOpc, strOpc);
+
+ MachineFunction *MF = BB->getParent();
+ const BasicBlock *LLVM_BB = BB->getBasicBlock();
+ MachineFunction::iterator It = BB;
+ ++It; // insert the new blocks after the current block
+
+ MachineBasicBlock *loop1MBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *loop2MBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MF->insert(It, loop1MBB);
+ MF->insert(It, loop2MBB);
+ MF->insert(It, exitMBB);
+
+ // Transfer the remainder of BB and its successor edges to exitMBB.
+ exitMBB->splice(exitMBB->begin(), BB,
+ llvm::next(MachineBasicBlock::iterator(MI)),
+ BB->end());
+ exitMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+ // thisMBB:
+ // ...
+ // fallthrough --> loop1MBB
+ BB->addSuccessor(loop1MBB);
+
+ // loop1MBB:
+ // ldxr dest, [ptr]
+ // cmp dest, oldval
+ // b.ne exitMBB
+ BB = loop1MBB;
+ BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr);
+
+ unsigned CmpOp = Size == 8 ? AArch64::CMPxx_lsl : AArch64::CMPww_lsl;
+ MRI.constrainRegClass(dest, TRCsp);
+ BuildMI(BB, dl, TII->get(CmpOp))
+ .addReg(dest).addReg(oldval).addImm(0);
+ BuildMI(BB, dl, TII->get(AArch64::Bcc))
+ .addImm(A64CC::NE).addMBB(exitMBB);
+ BB->addSuccessor(loop2MBB);
+ BB->addSuccessor(exitMBB);
+
+ // loop2MBB:
+ // stxr stxr_status, newval, [ptr]
+ // cmp stxr_status, #0
+ // b.ne loop1MBB
+ BB = loop2MBB;
+ unsigned stxr_status = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
+ MRI.constrainRegClass(stxr_status, &AArch64::GPR32wspRegClass);
+
+ BuildMI(BB, dl, TII->get(strOpc), stxr_status).addReg(newval).addReg(ptr);
+ BuildMI(BB, dl, TII->get(AArch64::SUBwwi_lsl0_cmp))
+ .addReg(stxr_status).addImm(0);
+ BuildMI(BB, dl, TII->get(AArch64::Bcc))
+ .addImm(A64CC::NE).addMBB(loop1MBB);
+ BB->addSuccessor(loop1MBB);
+ BB->addSuccessor(exitMBB);
+
+ // exitMBB:
+ // ...
+ BB = exitMBB;
+
+ MI->eraseFromParent(); // The instruction is gone now.
+
+ return BB;
+}
+
+MachineBasicBlock *
+AArch64TargetLowering::EmitF128CSEL(MachineInstr *MI,
+ MachineBasicBlock *MBB) const {
+ // We materialise the F128CSEL pseudo-instruction using conditional branches
+ // and loads, giving an instruction sequence like:
+ // str q0, [sp]
+ // b.ne IfTrue
+ // b Finish
+ // IfTrue:
+ // str q1, [sp]
+ // Finish:
+ // ldr q0, [sp]
+ //
+ // Using virtual registers would probably not be beneficial since COPY
+ // instructions are expensive for f128 (there's no actual instruction to
+ // implement them).
+ //
+ // An alternative would be to do an integer-CSEL on some address. E.g.:
+ // mov x0, sp
+ // add x1, sp, #16
+ // str q0, [x0]
+ // str q1, [x1]
+ // csel x0, x0, x1, ne
+ // ldr q0, [x0]
+ //
+ // It's unclear which approach is actually optimal.
+ const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + MachineFunction *MF = MBB->getParent(); + const BasicBlock *LLVM_BB = MBB->getBasicBlock(); + DebugLoc DL = MI->getDebugLoc(); + MachineFunction::iterator It = MBB; + ++It; + + unsigned DestReg = MI->getOperand(0).getReg(); + unsigned IfTrueReg = MI->getOperand(1).getReg(); + unsigned IfFalseReg = MI->getOperand(2).getReg(); + unsigned CondCode = MI->getOperand(3).getImm(); + bool NZCVKilled = MI->getOperand(4).isKill(); + + MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB); + MF->insert(It, TrueBB); + MF->insert(It, EndBB); + + // Transfer rest of current basic-block to EndBB + EndBB->splice(EndBB->begin(), MBB, + llvm::next(MachineBasicBlock::iterator(MI)), + MBB->end()); + EndBB->transferSuccessorsAndUpdatePHIs(MBB); + + // We need somewhere to store the f128 value needed. + int ScratchFI = MF->getFrameInfo()->CreateSpillStackObject(16, 16); + + // [... start of incoming MBB ...] + // str qIFFALSE, [sp] + // b.cc IfTrue + // b Done + BuildMI(MBB, DL, TII->get(AArch64::LSFP128_STR)) + .addReg(IfFalseReg) + .addFrameIndex(ScratchFI) + .addImm(0); + BuildMI(MBB, DL, TII->get(AArch64::Bcc)) + .addImm(CondCode) + .addMBB(TrueBB); + BuildMI(MBB, DL, TII->get(AArch64::Bimm)) + .addMBB(EndBB); + MBB->addSuccessor(TrueBB); + MBB->addSuccessor(EndBB); + + // IfTrue: + // str qIFTRUE, [sp] + BuildMI(TrueBB, DL, TII->get(AArch64::LSFP128_STR)) + .addReg(IfTrueReg) + .addFrameIndex(ScratchFI) + .addImm(0); + + // Note: fallthrough. We can rely on LLVM adding a branch if it reorders the + // blocks. + TrueBB->addSuccessor(EndBB); + + // Done: + // ldr qDEST, [sp] + // [... rest of incoming MBB ...] + if (!NZCVKilled) + EndBB->addLiveIn(AArch64::NZCV); + MachineInstr *StartOfEnd = EndBB->begin(); + BuildMI(*EndBB, StartOfEnd, DL, TII->get(AArch64::LSFP128_LDR), DestReg) + .addFrameIndex(ScratchFI) + .addImm(0); + + MI->eraseFromParent(); + return EndBB; +} + +MachineBasicBlock * +AArch64TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, + MachineBasicBlock *MBB) const { + switch (MI->getOpcode()) { + default: llvm_unreachable("Unhandled instruction with custom inserter"); + case AArch64::F128CSEL: + return EmitF128CSEL(MI, MBB); + case AArch64::ATOMIC_LOAD_ADD_I8: + return emitAtomicBinary(MI, MBB, 1, AArch64::ADDwww_lsl); + case AArch64::ATOMIC_LOAD_ADD_I16: + return emitAtomicBinary(MI, MBB, 2, AArch64::ADDwww_lsl); + case AArch64::ATOMIC_LOAD_ADD_I32: + return emitAtomicBinary(MI, MBB, 4, AArch64::ADDwww_lsl); + case AArch64::ATOMIC_LOAD_ADD_I64: + return emitAtomicBinary(MI, MBB, 8, AArch64::ADDxxx_lsl); + + case AArch64::ATOMIC_LOAD_SUB_I8: + return emitAtomicBinary(MI, MBB, 1, AArch64::SUBwww_lsl); + case AArch64::ATOMIC_LOAD_SUB_I16: + return emitAtomicBinary(MI, MBB, 2, AArch64::SUBwww_lsl); + case AArch64::ATOMIC_LOAD_SUB_I32: + return emitAtomicBinary(MI, MBB, 4, AArch64::SUBwww_lsl); + case AArch64::ATOMIC_LOAD_SUB_I64: + return emitAtomicBinary(MI, MBB, 8, AArch64::SUBxxx_lsl); + + case AArch64::ATOMIC_LOAD_AND_I8: + return emitAtomicBinary(MI, MBB, 1, AArch64::ANDwww_lsl); + case AArch64::ATOMIC_LOAD_AND_I16: + return emitAtomicBinary(MI, MBB, 2, AArch64::ANDwww_lsl); + case AArch64::ATOMIC_LOAD_AND_I32: + return emitAtomicBinary(MI, MBB, 4, AArch64::ANDwww_lsl); + case AArch64::ATOMIC_LOAD_AND_I64: + return emitAtomicBinary(MI, MBB, 8, AArch64::ANDxxx_lsl); + + case AArch64::ATOMIC_LOAD_OR_I8: + return emitAtomicBinary(MI, MBB, 
1, AArch64::ORRwww_lsl); + case AArch64::ATOMIC_LOAD_OR_I16: + return emitAtomicBinary(MI, MBB, 2, AArch64::ORRwww_lsl); + case AArch64::ATOMIC_LOAD_OR_I32: + return emitAtomicBinary(MI, MBB, 4, AArch64::ORRwww_lsl); + case AArch64::ATOMIC_LOAD_OR_I64: + return emitAtomicBinary(MI, MBB, 8, AArch64::ORRxxx_lsl); + + case AArch64::ATOMIC_LOAD_XOR_I8: + return emitAtomicBinary(MI, MBB, 1, AArch64::EORwww_lsl); + case AArch64::ATOMIC_LOAD_XOR_I16: + return emitAtomicBinary(MI, MBB, 2, AArch64::EORwww_lsl); + case AArch64::ATOMIC_LOAD_XOR_I32: + return emitAtomicBinary(MI, MBB, 4, AArch64::EORwww_lsl); + case AArch64::ATOMIC_LOAD_XOR_I64: + return emitAtomicBinary(MI, MBB, 8, AArch64::EORxxx_lsl); + + case AArch64::ATOMIC_LOAD_NAND_I8: + return emitAtomicBinary(MI, MBB, 1, AArch64::BICwww_lsl); + case AArch64::ATOMIC_LOAD_NAND_I16: + return emitAtomicBinary(MI, MBB, 2, AArch64::BICwww_lsl); + case AArch64::ATOMIC_LOAD_NAND_I32: + return emitAtomicBinary(MI, MBB, 4, AArch64::BICwww_lsl); + case AArch64::ATOMIC_LOAD_NAND_I64: + return emitAtomicBinary(MI, MBB, 8, AArch64::BICxxx_lsl); + + case AArch64::ATOMIC_LOAD_MIN_I8: + return emitAtomicBinaryMinMax(MI, MBB, 1, AArch64::CMPww_sxtb, A64CC::GT); + case AArch64::ATOMIC_LOAD_MIN_I16: + return emitAtomicBinaryMinMax(MI, MBB, 2, AArch64::CMPww_sxth, A64CC::GT); + case AArch64::ATOMIC_LOAD_MIN_I32: + return emitAtomicBinaryMinMax(MI, MBB, 4, AArch64::CMPww_lsl, A64CC::GT); + case AArch64::ATOMIC_LOAD_MIN_I64: + return emitAtomicBinaryMinMax(MI, MBB, 8, AArch64::CMPxx_lsl, A64CC::GT); + + case AArch64::ATOMIC_LOAD_MAX_I8: + return emitAtomicBinaryMinMax(MI, MBB, 1, AArch64::CMPww_sxtb, A64CC::LT); + case AArch64::ATOMIC_LOAD_MAX_I16: + return emitAtomicBinaryMinMax(MI, MBB, 2, AArch64::CMPww_sxth, A64CC::LT); + case AArch64::ATOMIC_LOAD_MAX_I32: + return emitAtomicBinaryMinMax(MI, MBB, 4, AArch64::CMPww_lsl, A64CC::LT); + case AArch64::ATOMIC_LOAD_MAX_I64: + return emitAtomicBinaryMinMax(MI, MBB, 8, AArch64::CMPxx_lsl, A64CC::LT); + + case AArch64::ATOMIC_LOAD_UMIN_I8: + return emitAtomicBinaryMinMax(MI, MBB, 1, AArch64::CMPww_uxtb, A64CC::HI); + case AArch64::ATOMIC_LOAD_UMIN_I16: + return emitAtomicBinaryMinMax(MI, MBB, 2, AArch64::CMPww_uxth, A64CC::HI); + case AArch64::ATOMIC_LOAD_UMIN_I32: + return emitAtomicBinaryMinMax(MI, MBB, 4, AArch64::CMPww_lsl, A64CC::HI); + case AArch64::ATOMIC_LOAD_UMIN_I64: + return emitAtomicBinaryMinMax(MI, MBB, 8, AArch64::CMPxx_lsl, A64CC::HI); + + case AArch64::ATOMIC_LOAD_UMAX_I8: + return emitAtomicBinaryMinMax(MI, MBB, 1, AArch64::CMPww_uxtb, A64CC::LO); + case AArch64::ATOMIC_LOAD_UMAX_I16: + return emitAtomicBinaryMinMax(MI, MBB, 2, AArch64::CMPww_uxth, A64CC::LO); + case AArch64::ATOMIC_LOAD_UMAX_I32: + return emitAtomicBinaryMinMax(MI, MBB, 4, AArch64::CMPww_lsl, A64CC::LO); + case AArch64::ATOMIC_LOAD_UMAX_I64: + return emitAtomicBinaryMinMax(MI, MBB, 8, AArch64::CMPxx_lsl, A64CC::LO); + + case AArch64::ATOMIC_SWAP_I8: + return emitAtomicBinary(MI, MBB, 1, 0); + case AArch64::ATOMIC_SWAP_I16: + return emitAtomicBinary(MI, MBB, 2, 0); + case AArch64::ATOMIC_SWAP_I32: + return emitAtomicBinary(MI, MBB, 4, 0); + case AArch64::ATOMIC_SWAP_I64: + return emitAtomicBinary(MI, MBB, 8, 0); + + case AArch64::ATOMIC_CMP_SWAP_I8: + return emitAtomicCmpSwap(MI, MBB, 1); + case AArch64::ATOMIC_CMP_SWAP_I16: + return emitAtomicCmpSwap(MI, MBB, 2); + case AArch64::ATOMIC_CMP_SWAP_I32: + return emitAtomicCmpSwap(MI, MBB, 4); + case AArch64::ATOMIC_CMP_SWAP_I64: + return emitAtomicCmpSwap(MI, MBB, 8); + } +} + + +const 
char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { + switch (Opcode) { + case AArch64ISD::BR_CC: return "AArch64ISD::BR_CC"; + case AArch64ISD::Call: return "AArch64ISD::Call"; + case AArch64ISD::FPMOV: return "AArch64ISD::FPMOV"; + case AArch64ISD::GOTLoad: return "AArch64ISD::GOTLoad"; + case AArch64ISD::BFI: return "AArch64ISD::BFI"; + case AArch64ISD::EXTR: return "AArch64ISD::EXTR"; + case AArch64ISD::Ret: return "AArch64ISD::Ret"; + case AArch64ISD::SBFX: return "AArch64ISD::SBFX"; + case AArch64ISD::SELECT_CC: return "AArch64ISD::SELECT_CC"; + case AArch64ISD::SETCC: return "AArch64ISD::SETCC"; + case AArch64ISD::TC_RETURN: return "AArch64ISD::TC_RETURN"; + case AArch64ISD::THREAD_POINTER: return "AArch64ISD::THREAD_POINTER"; + case AArch64ISD::TLSDESCCALL: return "AArch64ISD::TLSDESCCALL"; + case AArch64ISD::WrapperSmall: return "AArch64ISD::WrapperSmall"; + + default: return NULL; + } +} + +static const uint16_t AArch64FPRArgRegs[] = { + AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3, + AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7 +}; +static const unsigned NumFPRArgRegs = llvm::array_lengthof(AArch64FPRArgRegs); + +static const uint16_t AArch64ArgRegs[] = { + AArch64::X0, AArch64::X1, AArch64::X2, AArch64::X3, + AArch64::X4, AArch64::X5, AArch64::X6, AArch64::X7 +}; +static const unsigned NumArgRegs = llvm::array_lengthof(AArch64ArgRegs); + +static bool CC_AArch64NoMoreRegs(unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, + ISD::ArgFlagsTy ArgFlags, CCState &State) { + // Mark all remaining general purpose registers as allocated. We don't + // backtrack: if (for example) an i128 gets put on the stack, no subsequent + // i64 will go in registers (C.11). + for (unsigned i = 0; i < NumArgRegs; ++i) + State.AllocateReg(AArch64ArgRegs[i]); + + return false; +} + +#include "AArch64GenCallingConv.inc" + +CCAssignFn *AArch64TargetLowering::CCAssignFnForNode(CallingConv::ID CC) const { + + switch(CC) { + default: llvm_unreachable("Unsupported calling convention"); + case CallingConv::Fast: + case CallingConv::C: + return CC_A64_APCS; + } +} + +void +AArch64TargetLowering::SaveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG, + DebugLoc DL, SDValue &Chain) const { + MachineFunction &MF = DAG.getMachineFunction(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + AArch64MachineFunctionInfo *FuncInfo + = MF.getInfo<AArch64MachineFunctionInfo>(); + + SmallVector<SDValue, 8> MemOps; + + unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(AArch64ArgRegs, + NumArgRegs); + unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(AArch64FPRArgRegs, + NumFPRArgRegs); + + unsigned GPRSaveSize = 8 * (NumArgRegs - FirstVariadicGPR); + int GPRIdx = 0; + if (GPRSaveSize != 0) { + GPRIdx = MFI->CreateStackObject(GPRSaveSize, 8, false); + + SDValue FIN = DAG.getFrameIndex(GPRIdx, getPointerTy()); + + for (unsigned i = FirstVariadicGPR; i < NumArgRegs; ++i) { + unsigned VReg = MF.addLiveIn(AArch64ArgRegs[i], &AArch64::GPR64RegClass); + SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64); + SDValue Store = DAG.getStore(Val.getValue(1), DL, Val, FIN, + MachinePointerInfo::getStack(i * 8), + false, false, 0); + MemOps.push_back(Store); + FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN, + DAG.getConstant(8, getPointerTy())); + } + } + + unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR); + int FPRIdx = 0; + if (FPRSaveSize != 0) { + FPRIdx = MFI->CreateStackObject(FPRSaveSize, 16, false); + + SDValue FIN = DAG.getFrameIndex(FPRIdx, 
getPointerTy()); + + for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) { + unsigned VReg = MF.addLiveIn(AArch64FPRArgRegs[i], + &AArch64::FPR128RegClass); + SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128); + SDValue Store = DAG.getStore(Val.getValue(1), DL, Val, FIN, + MachinePointerInfo::getStack(i * 16), + false, false, 0); + MemOps.push_back(Store); + FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN, + DAG.getConstant(16, getPointerTy())); + } + } + + int StackIdx = MFI->CreateFixedObject(8, CCInfo.getNextStackOffset(), true); + + FuncInfo->setVariadicStackIdx(StackIdx); + FuncInfo->setVariadicGPRIdx(GPRIdx); + FuncInfo->setVariadicGPRSize(GPRSaveSize); + FuncInfo->setVariadicFPRIdx(FPRIdx); + FuncInfo->setVariadicFPRSize(FPRSaveSize); + + if (!MemOps.empty()) { + Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, &MemOps[0], + MemOps.size()); + } +} + + +SDValue +AArch64TargetLowering::LowerFormalArguments(SDValue Chain, + CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, + DebugLoc dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const { + MachineFunction &MF = DAG.getMachineFunction(); + AArch64MachineFunctionInfo *FuncInfo + = MF.getInfo<AArch64MachineFunctionInfo>(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt; + + SmallVector<CCValAssign, 16> ArgLocs; + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), ArgLocs, *DAG.getContext()); + CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForNode(CallConv)); + + SmallVector<SDValue, 16> ArgValues; + + SDValue ArgValue; + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { + CCValAssign &VA = ArgLocs[i]; + ISD::ArgFlagsTy Flags = Ins[i].Flags; + + if (Flags.isByVal()) { + // Byval is used for small structs and HFAs in the PCS, but the system + // should work in a non-compliant manner for larger structs. 
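An aside on the calling-convention machinery used throughout this hunk: CC_AArch64NoMoreRegs above implements rule C.11 of the PCS. A toy model of its no-backtracking behaviour (ToyCCState is invented for illustration; the real CCState lives in LLVM's CallingConvLower.h):

    #include <array>
    #include <cstdio>

    // Sketch of C.11: once one argument has been assigned to the stack,
    // every remaining GPR is marked allocated, so later small arguments
    // cannot jump back into registers.
    struct ToyCCState {
      std::array<bool, 8> GPRUsed{}; // x0..x7
      void allocateAllGPRs() { GPRUsed.fill(true); }
      int firstFreeGPR() const {
        for (int i = 0; i != 8; ++i)
          if (!GPRUsed[i])
            return i;
        return -1; // everything else goes on the stack
      }
    };

    int main() {
      ToyCCState St;
      St.GPRUsed[0] = St.GPRUsed[1] = true; // x0/x1 hold earlier arguments
      St.allocateAllGPRs();                 // an i128 just spilled to the stack
      std::printf("first free GPR after spill: %d\n", St.firstFreeGPR()); // -1
      return 0;
    }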
+ EVT PtrTy = getPointerTy(); + int Size = Flags.getByValSize(); + unsigned NumRegs = (Size + 7) / 8; + + unsigned FrameIdx = MFI->CreateFixedObject(8 * NumRegs, + VA.getLocMemOffset(), + false); + SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrTy); + InVals.push_back(FrameIdxN); + + continue; + } else if (VA.isRegLoc()) { + MVT RegVT = VA.getLocVT(); + const TargetRegisterClass *RC = getRegClassFor(RegVT); + unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); + + ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); + } else { // VA.isRegLoc() + assert(VA.isMemLoc()); + + int FI = MFI->CreateFixedObject(VA.getLocVT().getSizeInBits()/8, + VA.getLocMemOffset(), true); + + SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); + ArgValue = DAG.getLoad(VA.getLocVT(), dl, Chain, FIN, + MachinePointerInfo::getFixedStack(FI), + false, false, false, 0); + + + } + + switch (VA.getLocInfo()) { + default: llvm_unreachable("Unknown loc info!"); + case CCValAssign::Full: break; + case CCValAssign::BCvt: + ArgValue = DAG.getNode(ISD::BITCAST,dl, VA.getValVT(), ArgValue); + break; + case CCValAssign::SExt: + case CCValAssign::ZExt: + case CCValAssign::AExt: { + unsigned DestSize = VA.getValVT().getSizeInBits(); + unsigned DestSubReg; + + switch (DestSize) { + case 8: DestSubReg = AArch64::sub_8; break; + case 16: DestSubReg = AArch64::sub_16; break; + case 32: DestSubReg = AArch64::sub_32; break; + case 64: DestSubReg = AArch64::sub_64; break; + default: llvm_unreachable("Unexpected argument promotion"); + } + + ArgValue = SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, + VA.getValVT(), ArgValue, + DAG.getTargetConstant(DestSubReg, MVT::i32)), + 0); + break; + } + } + + InVals.push_back(ArgValue); + } + + if (isVarArg) + SaveVarArgRegisters(CCInfo, DAG, dl, Chain); + + unsigned StackArgSize = CCInfo.getNextStackOffset(); + if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) { + // This is a non-standard ABI so by fiat I say we're allowed to make full + // use of the stack area to be popped, which must be aligned to 16 bytes in + // any case: + StackArgSize = RoundUpToAlignment(StackArgSize, 16); + + // If we're expected to restore the stack (e.g. fastcc) then we'll be adding + // a multiple of 16. + FuncInfo->setArgumentStackToRestore(StackArgSize); + + // This realignment carries over to the available bytes below. Our own + // callers will guarantee the space is free by giving an aligned value to + // CALLSEQ_START. + } + // Even if we're not expected to free up the space, it's useful to know how + // much is there while considering tail calls (because we can reuse it). + FuncInfo->setBytesInStackArgArea(StackArgSize); + + return Chain; +} + +SDValue +AArch64TargetLowering::LowerReturn(SDValue Chain, + CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + DebugLoc dl, SelectionDAG &DAG) const { + // CCValAssign - represent the assignment of the return value to a location. + SmallVector<CCValAssign, 16> RVLocs; + + // CCState - Info about the registers and stack slots. + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), RVLocs, *DAG.getContext()); + + // Analyze outgoing return values. 
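Before the return-value analysis continues below, the byval slot arithmetic from LowerFormalArguments above is worth a concrete check; a byval struct always occupies whole 8-byte stack slots:

    #include <cstdio>

    // The (Size + 7) / 8 rounding from the byval path, in isolation.
    int main() {
      for (int Size : {1, 8, 9, 24, 33})
        std::printf("%2d-byte byval -> %d slot(s)\n", Size, (Size + 7) / 8);
      return 0;
    }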
+ CCInfo.AnalyzeReturn(Outs, CCAssignFnForNode(CallConv)); + + SDValue Flag; + SmallVector<SDValue, 4> RetOps(1, Chain); + + for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { + // PCS: "If the type, T, of the result of a function is such that + // void func(T arg) would require that arg be passed as a value in a + // register (or set of registers) according to the rules in 5.4, then the + // result is returned in the same registers as would be used for such an + // argument. + // + // Otherwise, the caller shall reserve a block of memory of sufficient + // size and alignment to hold the result. The address of the memory block + // shall be passed as an additional argument to the function in x8." + // + // This is implemented in two places. The register-return values are dealt + // with here, more complex returns are passed as an sret parameter, which + // means we don't have to worry about it during actual return. + CCValAssign &VA = RVLocs[i]; + assert(VA.isRegLoc() && "Only register-returns should be created by PCS"); + + + SDValue Arg = OutVals[i]; + + // There's no convenient note in the ABI about this as there is for normal + // arguments, but it says return values are passed in the same registers as + // an argument would be. I believe that includes the comments about + // unspecified higher bits, putting the burden of widening on the *caller* + // for return values. + switch (VA.getLocInfo()) { + default: llvm_unreachable("Unknown loc info"); + case CCValAssign::Full: break; + case CCValAssign::SExt: + case CCValAssign::ZExt: + case CCValAssign::AExt: + // Floating-point values should only be extended when they're going into + // memory, which can't happen here so an integer extend is acceptable. + Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg); + break; + case CCValAssign::BCvt: + Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg); + break; + } + + Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag); + Flag = Chain.getValue(1); + RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); + } + + RetOps[0] = Chain; // Update chain. + + // Add the flag if we have it. 
+ if (Flag.getNode()) + RetOps.push_back(Flag); + + return DAG.getNode(AArch64ISD::Ret, dl, MVT::Other, + &RetOps[0], RetOps.size()); +} + +SDValue +AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, + SmallVectorImpl<SDValue> &InVals) const { + SelectionDAG &DAG = CLI.DAG; + DebugLoc &dl = CLI.DL; + SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs; + SmallVector<SDValue, 32> &OutVals = CLI.OutVals; + SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins; + SDValue Chain = CLI.Chain; + SDValue Callee = CLI.Callee; + bool &IsTailCall = CLI.IsTailCall; + CallingConv::ID CallConv = CLI.CallConv; + bool IsVarArg = CLI.IsVarArg; + + MachineFunction &MF = DAG.getMachineFunction(); + AArch64MachineFunctionInfo *FuncInfo + = MF.getInfo<AArch64MachineFunctionInfo>(); + bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt; + bool IsStructRet = !Outs.empty() && Outs[0].Flags.isSRet(); + bool IsSibCall = false; + + if (IsTailCall) { + IsTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, + IsVarArg, IsStructRet, MF.getFunction()->hasStructRetAttr(), + Outs, OutVals, Ins, DAG); + + // A sibling call is one where we're under the usual C ABI and not planning + // to change that but can still do a tail call: + if (!TailCallOpt && IsTailCall) + IsSibCall = true; + } + + SmallVector<CCValAssign, 16> ArgLocs; + CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), + getTargetMachine(), ArgLocs, *DAG.getContext()); + CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CallConv)); + + // On AArch64 (and all other architectures I'm aware of) the most this has to + // do is adjust the stack pointer. + unsigned NumBytes = RoundUpToAlignment(CCInfo.getNextStackOffset(), 16); + if (IsSibCall) { + // Since we're not changing the ABI to make this a tail call, the memory + // operands are already available in the caller's incoming argument space. + NumBytes = 0; + } + + // FPDiff is the byte offset of the call's argument area from the callee's. + // Stores to callee stack arguments will be placed in FixedStackSlots offset + // by this amount for a tail call. In a sibling call it must be 0 because the + // caller will deallocate the entire stack and the callee still expects its + // arguments to begin at SP+0. Completely unused for non-tail calls. + int FPDiff = 0; + + if (IsTailCall && !IsSibCall) { + unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea(); + + // FPDiff will be negative if this tail call requires more space than we + // would automatically have in our incoming argument space. Positive if we + // can actually shrink the stack. + FPDiff = NumReusableBytes - NumBytes; + + // The stack pointer must be 16-byte aligned at all times it's used for a + // memory operation, which in practice means at *all* times and in + // particular across call boundaries. Therefore our own arguments started at + // a 16-byte aligned SP and the delta applied for the tail call should + // satisfy the same constraint. 
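The FPDiff arithmetic just described, with concrete numbers: both areas are 16-byte aligned, so their difference is too. Positive means the tail call can shrink the stack; negative means it needs more than the caller received. A minimal sketch (the values are invented):

    #include <cassert>
    #include <cstdio>

    int main() {
      unsigned NumReusableBytes = 48; // caller's own incoming argument area
      unsigned NumBytes = 32;         // callee's argument area, already rounded
      int FPDiff = (int)NumReusableBytes - (int)NumBytes;
      assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
      std::printf("FPDiff = %d\n", FPDiff); // 16
      return 0;
    }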
+ assert(FPDiff % 16 == 0 && "unaligned stack on tail call"); + } + + if (!IsSibCall) + Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true)); + + SDValue StackPtr = DAG.getCopyFromReg(Chain, dl, AArch64::XSP, + getPointerTy()); + + SmallVector<SDValue, 8> MemOpChains; + SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; + + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { + CCValAssign &VA = ArgLocs[i]; + ISD::ArgFlagsTy Flags = Outs[i].Flags; + SDValue Arg = OutVals[i]; + + // Callee does the actual widening, so all extensions just use an implicit + // definition of the rest of the Loc. Aesthetically, this would be nicer as + // an ANY_EXTEND, but that isn't valid for floating-point types and this + // alternative works on integer types too. + switch (VA.getLocInfo()) { + default: llvm_unreachable("Unknown loc info!"); + case CCValAssign::Full: break; + case CCValAssign::SExt: + case CCValAssign::ZExt: + case CCValAssign::AExt: { + unsigned SrcSize = VA.getValVT().getSizeInBits(); + unsigned SrcSubReg; + + switch (SrcSize) { + case 8: SrcSubReg = AArch64::sub_8; break; + case 16: SrcSubReg = AArch64::sub_16; break; + case 32: SrcSubReg = AArch64::sub_32; break; + case 64: SrcSubReg = AArch64::sub_64; break; + default: llvm_unreachable("Unexpected argument promotion"); + } + + Arg = SDValue(DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, dl, + VA.getLocVT(), + DAG.getUNDEF(VA.getLocVT()), + Arg, + DAG.getTargetConstant(SrcSubReg, MVT::i32)), + 0); + + break; + } + case CCValAssign::BCvt: + Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg); + break; + } + + if (VA.isRegLoc()) { + // A normal register (sub-) argument. For now we just note it down because + // we want to copy things into registers as late as possible to avoid + // register-pressure (and possibly worse). + RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); + continue; + } + + assert(VA.isMemLoc() && "unexpected argument location"); + + SDValue DstAddr; + MachinePointerInfo DstInfo; + if (IsTailCall) { + uint32_t OpSize = Flags.isByVal() ? Flags.getByValSize() : + VA.getLocVT().getSizeInBits(); + OpSize = (OpSize + 7) / 8; + int32_t Offset = VA.getLocMemOffset() + FPDiff; + int FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true); + + DstAddr = DAG.getFrameIndex(FI, getPointerTy()); + DstInfo = MachinePointerInfo::getFixedStack(FI); + + // Make sure any stack arguments overlapping with where we're storing are + // loaded before this eventual operation. Otherwise they'll be clobbered. + Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI); + } else { + SDValue PtrOff = DAG.getIntPtrConstant(VA.getLocMemOffset()); + + DstAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff); + DstInfo = MachinePointerInfo::getStack(VA.getLocMemOffset()); + } + + if (Flags.isByVal()) { + SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i64); + SDValue Cpy = DAG.getMemcpy(Chain, dl, DstAddr, Arg, SizeNode, + Flags.getByValAlign(), + /*isVolatile = */ false, + /*alwaysInline = */ false, + DstInfo, MachinePointerInfo(0)); + MemOpChains.push_back(Cpy); + } else { + // Normal stack argument, put it where it's needed. + SDValue Store = DAG.getStore(Chain, dl, Arg, DstAddr, DstInfo, + false, false, 0); + MemOpChains.push_back(Store); + } + } + + // The loads and stores generated above shouldn't clash with each + // other. Combining them with this TokenFactor notes that fact for the rest of + // the backend. 
+ if (!MemOpChains.empty()) + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, + &MemOpChains[0], MemOpChains.size()); + + // Most of the rest of the instructions need to be glued together; we don't + // want assignments to actual registers used by a call to be rearranged by a + // well-meaning scheduler. + SDValue InFlag; + + for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { + Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, + RegsToPass[i].second, InFlag); + InFlag = Chain.getValue(1); + } + + // The linker is responsible for inserting veneers when necessary to put a + // function call destination in range, so we don't need to bother with a + // wrapper here. + if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { + const GlobalValue *GV = G->getGlobal(); + Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy()); + } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { + const char *Sym = S->getSymbol(); + Callee = DAG.getTargetExternalSymbol(Sym, getPointerTy()); + } + + // We don't usually want to end the call-sequence here because we would tidy + // the frame up *after* the call, however in the ABI-changing tail-call case + // we've carefully laid out the parameters so that when sp is reset they'll be + // in the correct location. + if (IsTailCall && !IsSibCall) { + Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true), + DAG.getIntPtrConstant(0, true), InFlag); + InFlag = Chain.getValue(1); + } + + // We produce the following DAG scheme for the actual call instruction: + // (AArch64Call Chain, Callee, reg1, ..., regn, preserveMask, inflag? + // + // Most arguments aren't going to be used and just keep the values live as + // far as LLVM is concerned. It's expected to be selected as simply "bl + // callee" (for a direct, non-tail call). + std::vector<SDValue> Ops; + Ops.push_back(Chain); + Ops.push_back(Callee); + + if (IsTailCall) { + // Each tail call may have to adjust the stack by a different amount, so + // this information must travel along with the operation for eventual + // consumption by emitEpilogue. + Ops.push_back(DAG.getTargetConstant(FPDiff, MVT::i32)); + } + + for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) + Ops.push_back(DAG.getRegister(RegsToPass[i].first, + RegsToPass[i].second.getValueType())); + + + // Add a register mask operand representing the call-preserved registers. This + // is used later in codegen to constrain register-allocation. + const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); + const uint32_t *Mask = TRI->getCallPreservedMask(CallConv); + assert(Mask && "Missing call preserved mask for calling convention"); + Ops.push_back(DAG.getRegisterMask(Mask)); + + // If we needed glue, put it in as the last argument. + if (InFlag.getNode()) + Ops.push_back(InFlag); + + SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); + + if (IsTailCall) { + return DAG.getNode(AArch64ISD::TC_RETURN, dl, NodeTys, &Ops[0], Ops.size()); + } + + Chain = DAG.getNode(AArch64ISD::Call, dl, NodeTys, &Ops[0], Ops.size()); + InFlag = Chain.getValue(1); + + // Now we can reclaim the stack, just as well do it before working out where + // our return value is. + if (!IsSibCall) { + uint64_t CalleePopBytes + = DoesCalleeRestoreStack(CallConv, TailCallOpt) ? 
NumBytes : 0; + + Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true), + DAG.getIntPtrConstant(CalleePopBytes, true), + InFlag); + InFlag = Chain.getValue(1); + } + + return LowerCallResult(Chain, InFlag, CallConv, + IsVarArg, Ins, dl, DAG, InVals); +} + +SDValue +AArch64TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, + CallingConv::ID CallConv, bool IsVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, + DebugLoc dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const { + // Assign locations to each value returned by this call. + SmallVector<CCValAssign, 16> RVLocs; + CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), + getTargetMachine(), RVLocs, *DAG.getContext()); + CCInfo.AnalyzeCallResult(Ins, CCAssignFnForNode(CallConv)); + + for (unsigned i = 0; i != RVLocs.size(); ++i) { + CCValAssign VA = RVLocs[i]; + + // Return values that are too big to fit into registers should use an sret + // pointer, so this can be a lot simpler than the main argument code. + assert(VA.isRegLoc() && "Memory locations not expected for call return"); + + SDValue Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(), + InFlag); + Chain = Val.getValue(1); + InFlag = Val.getValue(2); + + switch (VA.getLocInfo()) { + default: llvm_unreachable("Unknown loc info!"); + case CCValAssign::Full: break; + case CCValAssign::BCvt: + Val = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), Val); + break; + case CCValAssign::ZExt: + case CCValAssign::SExt: + case CCValAssign::AExt: + // Floating-point arguments only get extended/truncated if they're going + // in memory, so using the integer operation is acceptable here. + Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val); + break; + } + + InVals.push_back(Val); + } + + return Chain; +} + +bool +AArch64TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, + CallingConv::ID CalleeCC, + bool IsVarArg, + bool IsCalleeStructRet, + bool IsCallerStructRet, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + const SmallVectorImpl<ISD::InputArg> &Ins, + SelectionDAG& DAG) const { + + // For CallingConv::C this function knows whether the ABI needs + // changing. That's not true for other conventions so they will have to opt in + // manually. + if (!IsTailCallConvention(CalleeCC) && CalleeCC != CallingConv::C) + return false; + + const MachineFunction &MF = DAG.getMachineFunction(); + const Function *CallerF = MF.getFunction(); + CallingConv::ID CallerCC = CallerF->getCallingConv(); + bool CCMatch = CallerCC == CalleeCC; + + // Byval parameters hand the function a pointer directly into the stack area + // we want to reuse during a tail call. Working around this *is* possible (see + // X86) but less efficient and uglier in LowerCall. + for (Function::const_arg_iterator i = CallerF->arg_begin(), + e = CallerF->arg_end(); i != e; ++i) + if (i->hasByValAttr()) + return false; + + if (getTargetMachine().Options.GuaranteedTailCallOpt) { + if (IsTailCallConvention(CalleeCC) && CCMatch) + return true; + return false; + } + + // Now we search for cases where we can use a tail call without changing the + // ABI. Sibcall is used in some places (particularly gcc) to refer to this + // concept. + + // I want anyone implementing a new calling convention to think long and hard + // about this assert. 
+ assert((!IsVarArg || CalleeCC == CallingConv::C) + && "Unexpected variadic calling convention"); + + if (IsVarArg && !Outs.empty()) { + // At least two cases here: if caller is fastcc then we can't have any + // memory arguments (we'd be expected to clean up the stack afterwards). If + // caller is C then we could potentially use its argument area. + + // FIXME: for now we take the most conservative of these in both cases: + // disallow all variadic memory operands. + SmallVector<CCValAssign, 16> ArgLocs; + CCState CCInfo(CalleeCC, IsVarArg, DAG.getMachineFunction(), + getTargetMachine(), ArgLocs, *DAG.getContext()); + + CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CalleeCC)); + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) + if (!ArgLocs[i].isRegLoc()) + return false; + } + + // If the calling conventions do not match, then we'd better make sure the + // results are returned in the same way as what the caller expects. + if (!CCMatch) { + SmallVector<CCValAssign, 16> RVLocs1; + CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), + getTargetMachine(), RVLocs1, *DAG.getContext()); + CCInfo1.AnalyzeCallResult(Ins, CCAssignFnForNode(CalleeCC)); + + SmallVector<CCValAssign, 16> RVLocs2; + CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), + getTargetMachine(), RVLocs2, *DAG.getContext()); + CCInfo2.AnalyzeCallResult(Ins, CCAssignFnForNode(CallerCC)); + + if (RVLocs1.size() != RVLocs2.size()) + return false; + for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) { + if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc()) + return false; + if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo()) + return false; + if (RVLocs1[i].isRegLoc()) { + if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg()) + return false; + } else { + if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset()) + return false; + } + } + } + + // Nothing more to check if the callee is taking no arguments + if (Outs.empty()) + return true; + + SmallVector<CCValAssign, 16> ArgLocs; + CCState CCInfo(CalleeCC, IsVarArg, DAG.getMachineFunction(), + getTargetMachine(), ArgLocs, *DAG.getContext()); + + CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CalleeCC)); + + const AArch64MachineFunctionInfo *FuncInfo + = MF.getInfo<AArch64MachineFunctionInfo>(); + + // If the stack arguments for this call would fit into our own save area then + // the call can be made tail. + return CCInfo.getNextStackOffset() <= FuncInfo->getBytesInStackArgArea(); +} + +bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC, + bool TailCallOpt) const { + return CallCC == CallingConv::Fast && TailCallOpt; +} + +bool AArch64TargetLowering::IsTailCallConvention(CallingConv::ID CallCC) const { + return CallCC == CallingConv::Fast; +} + +SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain, + SelectionDAG &DAG, + MachineFrameInfo *MFI, + int ClobberedFI) const { + SmallVector<SDValue, 8> ArgChains; + int64_t FirstByte = MFI->getObjectOffset(ClobberedFI); + int64_t LastByte = FirstByte + MFI->getObjectSize(ClobberedFI) - 1; + + // Include the original chain at the beginning of the list. When this is + // used by target LowerCall hooks, this helps legalize find the + // CALLSEQ_BEGIN node. 
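Before addTokenForArgument's body continues below, two of the small ABI predicates defined just above, restated standalone (stackArgsFit and calleePopBytes are illustrative names, not LLVM APIs): a call whose outgoing stack arguments fit in our own incoming argument area can be a sibcall, and only fastcc under GuaranteedTailCallOpt makes the callee pop its own argument area.

    #include <cstdio>

    static bool stackArgsFit(unsigned NextStackOffset,
                             unsigned BytesInStackArgArea) {
      return NextStackOffset <= BytesInStackArgArea;
    }

    static unsigned calleePopBytes(bool IsFastCC, bool TailCallOpt,
                                   unsigned NumBytes) {
      return (IsFastCC && TailCallOpt) ? NumBytes : 0;
    }

    int main() {
      std::printf("fits: %d, overruns: %d\n",
                  stackArgsFit(16, 32), stackArgsFit(48, 32));     // 1, 0
      std::printf("C callee pops %u, fastcc+TCO pops %u\n",
                  calleePopBytes(false, true, 32),                 // 0
                  calleePopBytes(true, true, 32));                 // 32
      return 0;
    }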
+ ArgChains.push_back(Chain); + + // Add a chain value for each stack argument corresponding + for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(), + UE = DAG.getEntryNode().getNode()->use_end(); U != UE; ++U) + if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U)) + if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) + if (FI->getIndex() < 0) { + int64_t InFirstByte = MFI->getObjectOffset(FI->getIndex()); + int64_t InLastByte = InFirstByte; + InLastByte += MFI->getObjectSize(FI->getIndex()) - 1; + + if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) || + (FirstByte <= InFirstByte && InFirstByte <= LastByte)) + ArgChains.push_back(SDValue(L, 1)); + } + + // Build a tokenfactor for all the chains. + return DAG.getNode(ISD::TokenFactor, Chain.getDebugLoc(), MVT::Other, + &ArgChains[0], ArgChains.size()); +} + +static A64CC::CondCodes IntCCToA64CC(ISD::CondCode CC) { + switch (CC) { + case ISD::SETEQ: return A64CC::EQ; + case ISD::SETGT: return A64CC::GT; + case ISD::SETGE: return A64CC::GE; + case ISD::SETLT: return A64CC::LT; + case ISD::SETLE: return A64CC::LE; + case ISD::SETNE: return A64CC::NE; + case ISD::SETUGT: return A64CC::HI; + case ISD::SETUGE: return A64CC::HS; + case ISD::SETULT: return A64CC::LO; + case ISD::SETULE: return A64CC::LS; + default: llvm_unreachable("Unexpected condition code"); + } +} + +bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Val) const { + // icmp is implemented using adds/subs immediate, which take an unsigned + // 12-bit immediate, optionally shifted left by 12 bits. + + // Symmetric by using adds/subs + if (Val < 0) + Val = -Val; + + return (Val & ~0xfff) == 0 || (Val & ~0xfff000) == 0; +} + +SDValue AArch64TargetLowering::getSelectableIntSetCC(SDValue LHS, SDValue RHS, + ISD::CondCode CC, SDValue &A64cc, + SelectionDAG &DAG, DebugLoc &dl) const { + if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) { + int64_t C = 0; + EVT VT = RHSC->getValueType(0); + bool knownInvalid = false; + + // I'm not convinced the rest of LLVM handles these edge cases properly, but + // we can at least get it right. + if (isSignedIntSetCC(CC)) { + C = RHSC->getSExtValue(); + } else if (RHSC->getZExtValue() > INT64_MAX) { + // A 64-bit constant not representable by a signed 64-bit integer is far + // too big to fit into a SUBS immediate anyway. + knownInvalid = true; + } else { + C = RHSC->getZExtValue(); + } + + if (!knownInvalid && !isLegalICmpImmediate(C)) { + // Constant does not fit, try adjusting it by one? + switch (CC) { + default: break; + case ISD::SETLT: + case ISD::SETGE: + if (isLegalICmpImmediate(C-1)) { + CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT; + RHS = DAG.getConstant(C-1, VT); + } + break; + case ISD::SETULT: + case ISD::SETUGE: + if (isLegalICmpImmediate(C-1)) { + CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT; + RHS = DAG.getConstant(C-1, VT); + } + break; + case ISD::SETLE: + case ISD::SETGT: + if (isLegalICmpImmediate(C+1)) { + CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE; + RHS = DAG.getConstant(C+1, VT); + } + break; + case ISD::SETULE: + case ISD::SETUGT: + if (isLegalICmpImmediate(C+1)) { + CC = (CC == ISD::SETULE) ? 
ISD::SETULT : ISD::SETUGE; + RHS = DAG.getConstant(C+1, VT); + } + break; + } + } + } + + A64CC::CondCodes CondCode = IntCCToA64CC(CC); + A64cc = DAG.getConstant(CondCode, MVT::i32); + return DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, LHS, RHS, + DAG.getCondCode(CC)); +} + +static A64CC::CondCodes FPCCToA64CC(ISD::CondCode CC, + A64CC::CondCodes &Alternative) { + A64CC::CondCodes CondCode = A64CC::Invalid; + Alternative = A64CC::Invalid; + + switch (CC) { + default: llvm_unreachable("Unknown FP condition!"); + case ISD::SETEQ: + case ISD::SETOEQ: CondCode = A64CC::EQ; break; + case ISD::SETGT: + case ISD::SETOGT: CondCode = A64CC::GT; break; + case ISD::SETGE: + case ISD::SETOGE: CondCode = A64CC::GE; break; + case ISD::SETOLT: CondCode = A64CC::MI; break; + case ISD::SETOLE: CondCode = A64CC::LS; break; + case ISD::SETONE: CondCode = A64CC::MI; Alternative = A64CC::GT; break; + case ISD::SETO: CondCode = A64CC::VC; break; + case ISD::SETUO: CondCode = A64CC::VS; break; + case ISD::SETUEQ: CondCode = A64CC::EQ; Alternative = A64CC::VS; break; + case ISD::SETUGT: CondCode = A64CC::HI; break; + case ISD::SETUGE: CondCode = A64CC::PL; break; + case ISD::SETLT: + case ISD::SETULT: CondCode = A64CC::LT; break; + case ISD::SETLE: + case ISD::SETULE: CondCode = A64CC::LE; break; + case ISD::SETNE: + case ISD::SETUNE: CondCode = A64CC::NE; break; + } + return CondCode; +} + +SDValue +AArch64TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { + DebugLoc DL = Op.getDebugLoc(); + EVT PtrVT = getPointerTy(); + const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); + + assert(getTargetMachine().getCodeModel() == CodeModel::Small + && "Only small code model supported at the moment"); + + // The most efficient code is PC-relative anyway for the small memory model, + // so we don't need to worry about relocation model. + return DAG.getNode(AArch64ISD::WrapperSmall, DL, PtrVT, + DAG.getTargetBlockAddress(BA, PtrVT, 0, + AArch64II::MO_NO_FLAG), + DAG.getTargetBlockAddress(BA, PtrVT, 0, + AArch64II::MO_LO12), + DAG.getConstant(/*Alignment=*/ 4, MVT::i32)); +} + + +// (BRCOND chain, val, dest) +SDValue +AArch64TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { + DebugLoc dl = Op.getDebugLoc(); + SDValue Chain = Op.getOperand(0); + SDValue TheBit = Op.getOperand(1); + SDValue DestBB = Op.getOperand(2); + + // AArch64 BooleanContents is the default UndefinedBooleanContent, which means + // that as the consumer we are responsible for ignoring rubbish in higher + // bits. + TheBit = DAG.getNode(ISD::AND, dl, MVT::i32, TheBit, + DAG.getConstant(1, MVT::i32)); + + SDValue A64CMP = DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, TheBit, + DAG.getConstant(0, TheBit.getValueType()), + DAG.getCondCode(ISD::SETNE)); + + return DAG.getNode(AArch64ISD::BR_CC, dl, MVT::Other, Chain, + A64CMP, DAG.getConstant(A64CC::NE, MVT::i32), + DestBB); +} + +// (BR_CC chain, condcode, lhs, rhs, dest) +SDValue +AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { + DebugLoc dl = Op.getDebugLoc(); + SDValue Chain = Op.getOperand(0); + ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get(); + SDValue LHS = Op.getOperand(2); + SDValue RHS = Op.getOperand(3); + SDValue DestBB = Op.getOperand(4); + + if (LHS.getValueType() == MVT::f128) { + // f128 comparisons are lowered to runtime calls by a routine which sets + // LHS, RHS and CC appropriately for the rest of this function to continue. 
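Stepping back briefly to getSelectableIntSetCC above: both the 12-bit (optionally shifted) immediate test and the plus-or-minus-one constant adjustment can be checked standalone. A sketch with values chosen to show the boundary:

    #include <cassert>
    #include <cstdint>

    // Copy of the legality test above: a 12-bit unsigned value, optionally
    // shifted left by 12, with negatives handled by adds/subs symmetry.
    static bool isLegalICmpImmediate(int64_t Val) {
      if (Val < 0) Val = -Val;
      return (Val & ~0xfffLL) == 0 || (Val & ~0xfff000LL) == 0;
    }

    int main() {
      assert(isLegalICmpImmediate(4095));  // plain 12-bit immediate
      assert(isLegalICmpImmediate(4096));  // 1 << 12: the shifted form
      assert(!isLegalICmpImmediate(4097)); // needs bits from both halves

      // SETLT with an unencodable constant C becomes SETLE with C-1:
      int64_t C = 4097;
      assert(!isLegalICmpImmediate(C) && isLegalICmpImmediate(C - 1));
      for (int64_t x : {4095LL, 4096LL, 4097LL})
        assert((x < C) == (x <= C - 1)); // equivalent for integers
      return 0;
    }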
+ softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl); + + // If softenSetCCOperands returned a scalar, we need to compare the result + // against zero to select between true and false values. + if (RHS.getNode() == 0) { + RHS = DAG.getConstant(0, LHS.getValueType()); + CC = ISD::SETNE; + } + } + + if (LHS.getValueType().isInteger()) { + SDValue A64cc; + + // Integers are handled in a separate function because the combinations of + // immediates and tests can get hairy and we may want to fiddle things. + SDValue CmpOp = getSelectableIntSetCC(LHS, RHS, CC, A64cc, DAG, dl); + + return DAG.getNode(AArch64ISD::BR_CC, dl, MVT::Other, + Chain, CmpOp, A64cc, DestBB); + } + + // Note that some LLVM floating-point CondCodes can't be lowered to a single + // conditional branch, hence FPCCToA64CC can set a second test, where either + // passing is sufficient. + A64CC::CondCodes CondCode, Alternative = A64CC::Invalid; + CondCode = FPCCToA64CC(CC, Alternative); + SDValue A64cc = DAG.getConstant(CondCode, MVT::i32); + SDValue SetCC = DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, LHS, RHS, + DAG.getCondCode(CC)); + SDValue A64BR_CC = DAG.getNode(AArch64ISD::BR_CC, dl, MVT::Other, + Chain, SetCC, A64cc, DestBB); + + if (Alternative != A64CC::Invalid) { + A64cc = DAG.getConstant(Alternative, MVT::i32); + A64BR_CC = DAG.getNode(AArch64ISD::BR_CC, dl, MVT::Other, + A64BR_CC, SetCC, A64cc, DestBB); + + } + + return A64BR_CC; +} + +SDValue +AArch64TargetLowering::LowerF128ToCall(SDValue Op, SelectionDAG &DAG, + RTLIB::Libcall Call) const { + ArgListTy Args; + ArgListEntry Entry; + for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) { + EVT ArgVT = Op.getOperand(i).getValueType(); + Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); + Entry.Node = Op.getOperand(i); Entry.Ty = ArgTy; + Entry.isSExt = false; + Entry.isZExt = false; + Args.push_back(Entry); + } + SDValue Callee = DAG.getExternalSymbol(getLibcallName(Call), getPointerTy()); + + Type *RetTy = Op.getValueType().getTypeForEVT(*DAG.getContext()); + + // By default, the input chain to this libcall is the entry node of the + // function. If the libcall is going to be emitted as a tail call then + // isUsedByReturnOnly will change it to the right chain if the return + // node which is being folded has a non-entry input chain. + SDValue InChain = DAG.getEntryNode(); + + // isTailCall may be true since the callee does not reference caller stack + // frame. Check if it's in the right position. + SDValue TCChain = InChain; + bool isTailCall = isInTailCallPosition(DAG, Op.getNode(), TCChain); + if (isTailCall) + InChain = TCChain; + + TargetLowering:: + CallLoweringInfo CLI(InChain, RetTy, false, false, false, false, + 0, getLibcallCallingConv(Call), isTailCall, + /*doesNotReturn=*/false, /*isReturnValueUsed=*/true, + Callee, Args, DAG, Op->getDebugLoc()); + std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI); + + if (!CallInfo.second.getNode()) + // It's a tailcall, return the chain (which is the DAG root). 
+ return DAG.getRoot(); + + return CallInfo.first; +} + +SDValue +AArch64TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { + if (Op.getOperand(0).getValueType() != MVT::f128) { + // It's legal except when f128 is involved + return Op; + } + + RTLIB::Libcall LC; + LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType()); + + SDValue SrcVal = Op.getOperand(0); + return makeLibCall(DAG, LC, Op.getValueType(), &SrcVal, 1, + /*isSigned*/ false, Op.getDebugLoc()); +} + +SDValue +AArch64TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { + assert(Op.getValueType() == MVT::f128 && "Unexpected lowering"); + + RTLIB::Libcall LC; + LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType()); + + return LowerF128ToCall(Op, DAG, LC); +} + +SDValue +AArch64TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG, + bool IsSigned) const { + if (Op.getOperand(0).getValueType() != MVT::f128) { + // It's legal except when f128 is involved + return Op; + } + + RTLIB::Libcall LC; + if (IsSigned) + LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(), Op.getValueType()); + else + LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType()); + + return LowerF128ToCall(Op, DAG, LC); +} + +SDValue +AArch64TargetLowering::LowerGlobalAddressELF(SDValue Op, + SelectionDAG &DAG) const { + // TableGen doesn't have easy access to the CodeModel or RelocationModel, so + // we make that distinction here. + + // We support the small memory model for now. + assert(getTargetMachine().getCodeModel() == CodeModel::Small); + + EVT PtrVT = getPointerTy(); + DebugLoc dl = Op.getDebugLoc(); + const GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op); + const GlobalValue *GV = GN->getGlobal(); + unsigned Alignment = GV->getAlignment(); + Reloc::Model RelocM = getTargetMachine().getRelocationModel(); + + if (GV->isWeakForLinker() && RelocM == Reloc::Static) { + // Weak symbols can't use ADRP/ADD pair since they should evaluate to + // zero when undefined. In PIC mode the GOT can take care of this, but in + // absolute mode we use a constant pool load. + SDValue PoolAddr; + PoolAddr = DAG.getNode(AArch64ISD::WrapperSmall, dl, PtrVT, + DAG.getTargetConstantPool(GV, PtrVT, 0, 0, + AArch64II::MO_NO_FLAG), + DAG.getTargetConstantPool(GV, PtrVT, 0, 0, + AArch64II::MO_LO12), + DAG.getConstant(8, MVT::i32)); + return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), PoolAddr, + MachinePointerInfo::getConstantPool(), + /*isVolatile=*/ false, /*isNonTemporal=*/ true, + /*isInvariant=*/ true, 8); + } + + if (Alignment == 0) { + const PointerType *GVPtrTy = cast<PointerType>(GV->getType()); + if (GVPtrTy->getElementType()->isSized()) { + Alignment + = getDataLayout()->getABITypeAlignment(GVPtrTy->getElementType()); + } else { + // Be conservative if we can't guess, not that it really matters: + // functions and labels aren't valid for loads, and the methods used to + // actually calculate an address work with any alignment. + Alignment = 1; + } + } + + unsigned char HiFixup, LoFixup; + bool UseGOT = Subtarget->GVIsIndirectSymbol(GV, RelocM); + + if (UseGOT) { + HiFixup = AArch64II::MO_GOT; + LoFixup = AArch64II::MO_GOT_LO12; + Alignment = 8; + } else { + HiFixup = AArch64II::MO_NO_FLAG; + LoFixup = AArch64II::MO_LO12; + } + + // AArch64's small model demands the following sequence: + // ADRP x0, somewhere + // ADD x0, x0, #:lo12:somewhere ; (or LDR directly). 
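The arithmetic behind that ADRP/ADD sequence: ADRP materialises the symbol's 4KB page (as a PC-relative page delta resolved at link time), and the :lo12: fixup on the ADD or LDR supplies the bits within the page. A sketch of the decomposition only; the address is made up for illustration:

    #include <cstdint>
    #include <cstdio>

    int main() {
      uint64_t Sym  = 0x412345;           // hypothetical symbol address
      uint64_t Page = Sym & ~0xfffULL;    // what ADRP leaves in the register
      uint64_t Lo12 = Sym &  0xfffULL;    // what the :lo12: ADD contributes
      std::printf("%#llx + %#llx = %#llx\n", (unsigned long long)Page,
                  (unsigned long long)Lo12, (unsigned long long)(Page + Lo12));
      return 0;
    }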
+ SDValue GlobalRef = DAG.getNode(AArch64ISD::WrapperSmall, dl, PtrVT, + DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, + HiFixup), + DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, + LoFixup), + DAG.getConstant(Alignment, MVT::i32)); + + if (UseGOT) { + GlobalRef = DAG.getNode(AArch64ISD::GOTLoad, dl, PtrVT, DAG.getEntryNode(), + GlobalRef); + } + + if (GN->getOffset() != 0) + return DAG.getNode(ISD::ADD, dl, PtrVT, GlobalRef, + DAG.getConstant(GN->getOffset(), PtrVT)); + + return GlobalRef; +} + +SDValue AArch64TargetLowering::LowerTLSDescCall(SDValue SymAddr, + SDValue DescAddr, + DebugLoc DL, + SelectionDAG &DAG) const { + EVT PtrVT = getPointerTy(); + + // The function we need to call is simply the first entry in the GOT for this + // descriptor, load it in preparation. + SDValue Func, Chain; + Func = DAG.getNode(AArch64ISD::GOTLoad, DL, PtrVT, DAG.getEntryNode(), + DescAddr); + + // The function takes only one argument: the address of the descriptor itself + // in X0. + SDValue Glue; + Chain = DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::X0, DescAddr, Glue); + Glue = Chain.getValue(1); + + // Finally, there's a special calling-convention which means that the lookup + // must preserve all registers (except X0, obviously). + const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); + const AArch64RegisterInfo *A64RI + = static_cast<const AArch64RegisterInfo *>(TRI); + const uint32_t *Mask = A64RI->getTLSDescCallPreservedMask(); + + // We're now ready to populate the argument list, as with a normal call: + std::vector<SDValue> Ops; + Ops.push_back(Chain); + Ops.push_back(Func); + Ops.push_back(SymAddr); + Ops.push_back(DAG.getRegister(AArch64::X0, PtrVT)); + Ops.push_back(DAG.getRegisterMask(Mask)); + Ops.push_back(Glue); + + SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); + Chain = DAG.getNode(AArch64ISD::TLSDESCCALL, DL, NodeTys, &Ops[0], + Ops.size()); + Glue = Chain.getValue(1); + + // After the call, the offset from TPIDR_EL0 is in X0, copy it out and pass it + // back to the generic handling code. 
+ return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue); +} + +SDValue +AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op, + SelectionDAG &DAG) const { + assert(Subtarget->isTargetELF() && + "TLS not implemented for non-ELF targets"); + const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); + + TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal()); + + SDValue TPOff; + EVT PtrVT = getPointerTy(); + DebugLoc DL = Op.getDebugLoc(); + const GlobalValue *GV = GA->getGlobal(); + + SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT); + + if (Model == TLSModel::InitialExec) { + TPOff = DAG.getNode(AArch64ISD::WrapperSmall, DL, PtrVT, + DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, + AArch64II::MO_GOTTPREL), + DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, + AArch64II::MO_GOTTPREL_LO12), + DAG.getConstant(8, MVT::i32)); + TPOff = DAG.getNode(AArch64ISD::GOTLoad, DL, PtrVT, DAG.getEntryNode(), + TPOff); + } else if (Model == TLSModel::LocalExec) { + SDValue HiVar = DAG.getTargetGlobalAddress(GV, DL, MVT::i64, 0, + AArch64II::MO_TPREL_G1); + SDValue LoVar = DAG.getTargetGlobalAddress(GV, DL, MVT::i64, 0, + AArch64II::MO_TPREL_G0_NC); + + TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZxii, DL, PtrVT, HiVar, + DAG.getTargetConstant(0, MVT::i32)), 0); + TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKxii, DL, PtrVT, + TPOff, LoVar, + DAG.getTargetConstant(0, MVT::i32)), 0); + } else if (Model == TLSModel::GeneralDynamic) { + // Accesses used in this sequence go via the TLS descriptor which lives in + // the GOT. Prepare an address we can use to handle this. + SDValue HiDesc = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, + AArch64II::MO_TLSDESC); + SDValue LoDesc = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, + AArch64II::MO_TLSDESC_LO12); + SDValue DescAddr = DAG.getNode(AArch64ISD::WrapperSmall, DL, PtrVT, + HiDesc, LoDesc, + DAG.getConstant(8, MVT::i32)); + SDValue SymAddr = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0); + + TPOff = LowerTLSDescCall(SymAddr, DescAddr, DL, DAG); + } else if (Model == TLSModel::LocalDynamic) { + // Local-dynamic accesses proceed in two phases. A general-dynamic TLS + // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate + // the beginning of the module's TLS region, followed by a DTPREL offset + // calculation. + + // These accesses will need deduplicating if there's more than one. 
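As an aside, the MOVZ/MOVK pairs used by the local-exec path above (and by the DTPREL calculation below) compose a 32-bit offset sixteen bits at a time: :tprel_g1: provides bits [31:16] and :tprel_g0_nc: bits [15:0]. A sketch with invented halves:

    #include <cstdint>
    #include <cstdio>

    int main() {
      uint64_t G1 = 0x0001, G0 = 0x0008;
      uint64_t TPOff = G1 << 16; // movz xN, #:tprel_g1:var, lsl #16
      TPOff |= G0;               // movk xN, #:tprel_g0_nc:var
      std::printf("TP offset = %#llx\n", (unsigned long long)TPOff); // 0x10008
      return 0;
    }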
+ AArch64MachineFunctionInfo* MFI = DAG.getMachineFunction() + .getInfo<AArch64MachineFunctionInfo>(); + MFI->incNumLocalDynamicTLSAccesses(); + + + // Get the location of _TLS_MODULE_BASE_: + SDValue HiDesc = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT, + AArch64II::MO_TLSDESC); + SDValue LoDesc = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT, + AArch64II::MO_TLSDESC_LO12); + SDValue DescAddr = DAG.getNode(AArch64ISD::WrapperSmall, DL, PtrVT, + HiDesc, LoDesc, + DAG.getConstant(8, MVT::i32)); + SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT); + + ThreadBase = LowerTLSDescCall(SymAddr, DescAddr, DL, DAG); + + // Get the variable's offset from _TLS_MODULE_BASE_ + SDValue HiVar = DAG.getTargetGlobalAddress(GV, DL, MVT::i64, 0, + AArch64II::MO_DTPREL_G1); + SDValue LoVar = DAG.getTargetGlobalAddress(GV, DL, MVT::i64, 0, + AArch64II::MO_DTPREL_G0_NC); + + TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZxii, DL, PtrVT, HiVar, + DAG.getTargetConstant(0, MVT::i32)), 0); + TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKxii, DL, PtrVT, + TPOff, LoVar, + DAG.getTargetConstant(0, MVT::i32)), 0); + } else + llvm_unreachable("Unsupported TLS access model"); + + + return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff); +} + +SDValue +AArch64TargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG, + bool IsSigned) const { + if (Op.getValueType() != MVT::f128) { + // Legal for everything except f128. + return Op; + } + + RTLIB::Libcall LC; + if (IsSigned) + LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType()); + else + LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType()); + + return LowerF128ToCall(Op, DAG, LC); +} + + +SDValue +AArch64TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { + JumpTableSDNode *JT = cast<JumpTableSDNode>(Op); + DebugLoc dl = JT->getDebugLoc(); + + // When compiling PIC, jump tables get put in the code section so a static + // relocation-style is acceptable for both cases. + return DAG.getNode(AArch64ISD::WrapperSmall, dl, getPointerTy(), + DAG.getTargetJumpTable(JT->getIndex(), getPointerTy()), + DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(), + AArch64II::MO_LO12), + DAG.getConstant(1, MVT::i32)); +} + +// (SELECT_CC lhs, rhs, iftrue, iffalse, condcode) +SDValue +AArch64TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { + DebugLoc dl = Op.getDebugLoc(); + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + SDValue IfTrue = Op.getOperand(2); + SDValue IfFalse = Op.getOperand(3); + ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get(); + + if (LHS.getValueType() == MVT::f128) { + // f128 comparisons are lowered to libcalls, but slot in nicely here + // afterwards. + softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl); + + // If softenSetCCOperands returned a scalar, we need to compare the result + // against zero to select between true and false values. + if (RHS.getNode() == 0) { + RHS = DAG.getConstant(0, LHS.getValueType()); + CC = ISD::SETNE; + } + } + + if (LHS.getValueType().isInteger()) { + SDValue A64cc; + + // Integers are handled in a separate function because the combinations of + // immediates and tests can get hairy and we may want to fiddle things. 
+ SDValue CmpOp = getSelectableIntSetCC(LHS, RHS, CC, A64cc, DAG, dl); + + return DAG.getNode(AArch64ISD::SELECT_CC, dl, Op.getValueType(), + CmpOp, IfTrue, IfFalse, A64cc); + } + + // Note that some LLVM floating-point CondCodes can't be lowered to a single + // conditional branch, hence FPCCToA64CC can set a second test, where either + // passing is sufficient. + A64CC::CondCodes CondCode, Alternative = A64CC::Invalid; + CondCode = FPCCToA64CC(CC, Alternative); + SDValue A64cc = DAG.getConstant(CondCode, MVT::i32); + SDValue SetCC = DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, LHS, RHS, + DAG.getCondCode(CC)); + SDValue A64SELECT_CC = DAG.getNode(AArch64ISD::SELECT_CC, dl, + Op.getValueType(), + SetCC, IfTrue, IfFalse, A64cc); + + if (Alternative != A64CC::Invalid) { + A64cc = DAG.getConstant(Alternative, MVT::i32); + A64SELECT_CC = DAG.getNode(AArch64ISD::SELECT_CC, dl, Op.getValueType(), + SetCC, IfTrue, A64SELECT_CC, A64cc); + + } + + return A64SELECT_CC; +} + +// (SELECT testbit, iftrue, iffalse) +SDValue +AArch64TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { + DebugLoc dl = Op.getDebugLoc(); + SDValue TheBit = Op.getOperand(0); + SDValue IfTrue = Op.getOperand(1); + SDValue IfFalse = Op.getOperand(2); + + // AArch64 BooleanContents is the default UndefinedBooleanContent, which means + // that as the consumer we are responsible for ignoring rubbish in higher + // bits. + TheBit = DAG.getNode(ISD::AND, dl, MVT::i32, TheBit, + DAG.getConstant(1, MVT::i32)); + SDValue A64CMP = DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, TheBit, + DAG.getConstant(0, TheBit.getValueType()), + DAG.getCondCode(ISD::SETNE)); + + return DAG.getNode(AArch64ISD::SELECT_CC, dl, Op.getValueType(), + A64CMP, IfTrue, IfFalse, + DAG.getConstant(A64CC::NE, MVT::i32)); +} + +// (SETCC lhs, rhs, condcode) +SDValue +AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { + DebugLoc dl = Op.getDebugLoc(); + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); + EVT VT = Op.getValueType(); + + if (LHS.getValueType() == MVT::f128) { + // f128 comparisons will be lowered to libcalls giving a valid LHS and RHS + // for the rest of the function (some i32 or i64 values). + softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl); + + // If softenSetCCOperands returned a scalar, use it. + if (RHS.getNode() == 0) { + assert(LHS.getValueType() == Op.getValueType() && + "Unexpected setcc expansion!"); + return LHS; + } + } + + if (LHS.getValueType().isInteger()) { + SDValue A64cc; + + // Integers are handled in a separate function because the combinations of + // immediates and tests can get hairy and we may want to fiddle things. + SDValue CmpOp = getSelectableIntSetCC(LHS, RHS, CC, A64cc, DAG, dl); + + return DAG.getNode(AArch64ISD::SELECT_CC, dl, VT, + CmpOp, DAG.getConstant(1, VT), DAG.getConstant(0, VT), + A64cc); + } + + // Note that some LLVM floating-point CondCodes can't be lowered to a single + // conditional branch, hence FPCCToA64CC can set a second test, where either + // passing is sufficient. 
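The two-condition trick just described is easiest to see for SETUEQ ("unordered or equal"): after an FCMP, ordered equality is EQ and the unordered case shows up as VS, so the lowering issues a second select or branch on the Alternative code. A scalar sketch of why the disjunction is correct:

    #include <cmath>
    #include <cstdio>

    // SETUEQ needs two A64 condition codes: EQ for ordered equality,
    // VS for the unordered (NaN) case.
    static bool setueq(double a, double b) {
      bool EQ = (a == b);                       // ordered and equal
      bool VS = std::isnan(a) || std::isnan(b); // comparison was unordered
      return EQ || VS;
    }

    int main() {
      std::printf("%d %d %d\n", setueq(1.0, 1.0),  // 1
                                setueq(1.0, 2.0),  // 0
                                setueq(1.0, NAN)); // 1
      return 0;
    }

The same construction covers SETONE (MI with GT as the alternative) further up in FPCCToA64CC.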
+  A64CC::CondCodes CondCode, Alternative = A64CC::Invalid;
+  CondCode = FPCCToA64CC(CC, Alternative);
+  SDValue A64cc = DAG.getConstant(CondCode, MVT::i32);
+  SDValue CmpOp = DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, LHS, RHS,
+                              DAG.getCondCode(CC));
+  SDValue A64SELECT_CC = DAG.getNode(AArch64ISD::SELECT_CC, dl, VT,
+                                     CmpOp, DAG.getConstant(1, VT),
+                                     DAG.getConstant(0, VT), A64cc);
+
+  if (Alternative != A64CC::Invalid) {
+    A64cc = DAG.getConstant(Alternative, MVT::i32);
+    A64SELECT_CC = DAG.getNode(AArch64ISD::SELECT_CC, dl, VT, CmpOp,
+                               DAG.getConstant(1, VT), A64SELECT_CC, A64cc);
+  }
+
+  return A64SELECT_CC;
+}
+
+SDValue
+AArch64TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const {
+  const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
+  const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
+
+  // We have to make sure we copy the entire structure: 8+8+8+4+4 = 32 bytes
+  // rather than just 8.
+  return DAG.getMemcpy(Op.getOperand(0), Op.getDebugLoc(),
+                       Op.getOperand(1), Op.getOperand(2),
+                       DAG.getConstant(32, MVT::i32), 8, false, false,
+                       MachinePointerInfo(DestSV), MachinePointerInfo(SrcSV));
+}
+
+SDValue
+AArch64TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
+  // The layout of the va_list struct is specified in the AArch64 Procedure Call
+  // Standard, section B.3.
+  MachineFunction &MF = DAG.getMachineFunction();
+  AArch64MachineFunctionInfo *FuncInfo
+    = MF.getInfo<AArch64MachineFunctionInfo>();
+  DebugLoc DL = Op.getDebugLoc();
+
+  SDValue Chain = Op.getOperand(0);
+  SDValue VAList = Op.getOperand(1);
+  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+  SmallVector<SDValue, 4> MemOps;
+
+  // void *__stack at offset 0
+  SDValue Stack = DAG.getFrameIndex(FuncInfo->getVariadicStackIdx(),
+                                    getPointerTy());
+  MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList,
+                                MachinePointerInfo(SV), false, false, 0));
+
+  // void *__gr_top at offset 8
+  int GPRSize = FuncInfo->getVariadicGPRSize();
+  if (GPRSize > 0) {
+    SDValue GRTop, GRTopAddr;
+
+    GRTopAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList,
+                            DAG.getConstant(8, getPointerTy()));
+
+    GRTop = DAG.getFrameIndex(FuncInfo->getVariadicGPRIdx(), getPointerTy());
+    GRTop = DAG.getNode(ISD::ADD, DL, getPointerTy(), GRTop,
+                        DAG.getConstant(GPRSize, getPointerTy()));
+
+    MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr,
+                                  MachinePointerInfo(SV, 8),
+                                  false, false, 0));
+  }
+
+  // void *__vr_top at offset 16
+  int FPRSize = FuncInfo->getVariadicFPRSize();
+  if (FPRSize > 0) {
+    SDValue VRTop, VRTopAddr;
+    VRTopAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList,
+                            DAG.getConstant(16, getPointerTy()));
+
+    VRTop = DAG.getFrameIndex(FuncInfo->getVariadicFPRIdx(), getPointerTy());
+    VRTop = DAG.getNode(ISD::ADD, DL, getPointerTy(), VRTop,
+                        DAG.getConstant(FPRSize, getPointerTy()));
+
+    MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr,
+                                  MachinePointerInfo(SV, 16),
+                                  false, false, 0));
+  }
+
+  // int __gr_offs at offset 24
+  SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList,
+                                   DAG.getConstant(24, getPointerTy()));
+  MemOps.push_back(DAG.getStore(Chain, DL, DAG.getConstant(-GPRSize, MVT::i32),
+                                GROffsAddr, MachinePointerInfo(SV, 24),
+                                false, false, 0));
+
+  // int __vr_offs at offset 28
+  SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList,
+                                   DAG.getConstant(28, getPointerTy()));
+  MemOps.push_back(DAG.getStore(Chain, DL, DAG.getConstant(-FPRSize, MVT::i32),
+                                VROffsAddr, MachinePointerInfo(SV, 28),
+                                false, false, 0));
+
+  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, &MemOps[0],
+                     MemOps.size());
+}
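For reference, the va_list that LowerVASTART initialises, written as a plain struct (field names follow PCS section B.3, renamed here to avoid reserved identifiers). The five stores target offsets 0, 8, 16, 24 and 28, and the total size is why LowerVACOPY copies 32 bytes rather than a single pointer:

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    struct AAPCS64VaList {
      void    *stack;   // offset 0:  next stacked argument
      void    *gr_top;  // offset 8:  end of the GPR save area
      void    *vr_top;  // offset 16: end of the FPR/SIMD save area
      int32_t  gr_offs; // offset 24: negative offset to unused GPR slots
      int32_t  vr_offs; // offset 28: negative offset to unused FPR slots
    };

    int main() {
      static_assert(sizeof(AAPCS64VaList) == 32, "8+8+8+4+4 bytes");
      std::printf("vr_offs lives at offset %zu\n",
                  offsetof(AAPCS64VaList, vr_offs)); // 28
      return 0;
    }

The negative gr_offs/vr_offs values stored above let va_arg walk upwards towards gr_top/vr_top and detect exhaustion when the offset reaches zero.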
+
+SDValue
+AArch64TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
+  switch (Op.getOpcode()) {
+  default: llvm_unreachable("Don't know how to custom lower this!");
+  case ISD::FADD: return LowerF128ToCall(Op, DAG, RTLIB::ADD_F128);
+  case ISD::FSUB: return LowerF128ToCall(Op, DAG, RTLIB::SUB_F128);
+  case ISD::FMUL: return LowerF128ToCall(Op, DAG, RTLIB::MUL_F128);
+  case ISD::FDIV: return LowerF128ToCall(Op, DAG, RTLIB::DIV_F128);
+  case ISD::FP_TO_SINT: return LowerFP_TO_INT(Op, DAG, true);
+  case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG, false);
+  case ISD::SINT_TO_FP: return LowerINT_TO_FP(Op, DAG, true);
+  case ISD::UINT_TO_FP: return LowerINT_TO_FP(Op, DAG, false);
+  case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG);
+  case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
+
+  case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
+  case ISD::BRCOND: return LowerBRCOND(Op, DAG);
+  case ISD::BR_CC: return LowerBR_CC(Op, DAG);
+  case ISD::GlobalAddress: return LowerGlobalAddressELF(Op, DAG);
+  case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
+  case ISD::JumpTable: return LowerJumpTable(Op, DAG);
+  case ISD::SELECT: return LowerSELECT(Op, DAG);
+  case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
+  case ISD::SETCC: return LowerSETCC(Op, DAG);
+  case ISD::VACOPY: return LowerVACOPY(Op, DAG);
+  case ISD::VASTART: return LowerVASTART(Op, DAG);
+  }
+
+  return SDValue();
+}
+
+static SDValue PerformANDCombine(SDNode *N,
+                                 TargetLowering::DAGCombinerInfo &DCI) {
+  SelectionDAG &DAG = DCI.DAG;
+  DebugLoc DL = N->getDebugLoc();
+  EVT VT = N->getValueType(0);
+
+  // We're looking for an SRL/AND pair which together form a UBFX.
+
+  if (VT != MVT::i32 && VT != MVT::i64)
+    return SDValue();
+
+  if (!isa<ConstantSDNode>(N->getOperand(1)))
+    return SDValue();
+
+  uint64_t TruncMask = N->getConstantOperandVal(1);
+  if (!isMask_64(TruncMask))
+    return SDValue();
+
+  uint64_t Width = CountPopulation_64(TruncMask);
+  SDValue Shift = N->getOperand(0);
+
+  if (Shift.getOpcode() != ISD::SRL)
+    return SDValue();
+
+  if (!isa<ConstantSDNode>(Shift->getOperand(1)))
+    return SDValue();
+  uint64_t LSB = Shift->getConstantOperandVal(1);
+
+  if (LSB > VT.getSizeInBits() || Width > VT.getSizeInBits())
+    return SDValue();
+
+  return DAG.getNode(AArch64ISD::UBFX, DL, VT, Shift.getOperand(0),
+                     DAG.getConstant(LSB, MVT::i64),
+                     DAG.getConstant(LSB + Width - 1, MVT::i64));
+}
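What PerformANDCombine just matched, in scalar form: an AND of a right-shift by a mask of Width low bits is exactly a UBFX of Width bits starting at LSB. A small self-check (ubfx here is an illustrative model of the instruction's semantics, not LLVM code):

    #include <cassert>
    #include <cstdint>

    // Extract Width bits starting at LSB, zero-extended.
    static uint64_t ubfx(uint64_t X, unsigned LSB, unsigned Width) {
      uint64_t Mask = (Width == 64) ? ~0ULL : ((1ULL << Width) - 1);
      return (X >> LSB) & Mask;
    }

    int main() {
      // (x >> 8) & 0xff is ubfx(x, 8, 8), and so on.
      assert(ubfx(0xABCD1234u, 8, 8) == 0x12);
      assert(ubfx(0xABCD1234u, 16, 16) == 0xABCD);
      return 0;
    }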
+
+static SDValue PerformATOMIC_FENCECombine(SDNode *FenceNode,
+                                          TargetLowering::DAGCombinerInfo &DCI) {
+  // An atomic operation followed by an acquiring atomic fence can be reduced
+  // to an acquiring load. The atomic operation provides a convenient pointer
+  // to load from. If the original operation was a load anyway we can actually
+  // combine the two operations into an acquiring load.
+  SelectionDAG &DAG = DCI.DAG;
+  SDValue AtomicOp = FenceNode->getOperand(0);
+  AtomicSDNode *AtomicNode = dyn_cast<AtomicSDNode>(AtomicOp);
+
+  // A fence on its own can't be optimised
+  if (!AtomicNode)
+    return SDValue();
+
+  AtomicOrdering FenceOrder
+    = static_cast<AtomicOrdering>(FenceNode->getConstantOperandVal(1));
+  SynchronizationScope FenceScope
+    = static_cast<SynchronizationScope>(FenceNode->getConstantOperandVal(2));
+
+  if (FenceOrder != Acquire || FenceScope != AtomicNode->getSynchScope())
+    return SDValue();
+
+  // If the original operation was an ATOMIC_LOAD then we'll be replacing it,
+  // so the chain we use should be its input, otherwise we'll put our load
+  // after it so we use its output chain.
+  SDValue Chain = AtomicNode->getOpcode() == ISD::ATOMIC_LOAD ?
+    AtomicNode->getChain() : AtomicOp;
+
+  // We have an acquire fence with a handy atomic operation nearby, we can
+  // convert the fence into a load-acquire, discarding the result.
+  DebugLoc DL = FenceNode->getDebugLoc();
+  SDValue Op = DAG.getAtomic(ISD::ATOMIC_LOAD, DL, AtomicNode->getMemoryVT(),
+                             AtomicNode->getValueType(0),
+                             Chain,                  // Chain
+                             AtomicOp.getOperand(1), // Pointer
+                             AtomicNode->getMemOperand(), Acquire,
+                             FenceScope);
+
+  if (AtomicNode->getOpcode() == ISD::ATOMIC_LOAD)
+    DAG.ReplaceAllUsesWith(AtomicNode, Op.getNode());
+
+  return Op.getValue(1);
+}
+
+static SDValue PerformATOMIC_STORECombine(SDNode *N,
+                                          TargetLowering::DAGCombinerInfo &DCI) {
+  // A releasing atomic fence followed by an atomic store can be combined into
+  // a single store operation.
+  SelectionDAG &DAG = DCI.DAG;
+  AtomicSDNode *AtomicNode = cast<AtomicSDNode>(N);
+  SDValue FenceOp = AtomicNode->getOperand(0);
+
+  if (FenceOp.getOpcode() != ISD::ATOMIC_FENCE)
+    return SDValue();
+
+  AtomicOrdering FenceOrder
+    = static_cast<AtomicOrdering>(FenceOp->getConstantOperandVal(1));
+  SynchronizationScope FenceScope
+    = static_cast<SynchronizationScope>(FenceOp->getConstantOperandVal(2));
+
+  if (FenceOrder != Release || FenceScope != AtomicNode->getSynchScope())
+    return SDValue();
+
+  DebugLoc DL = AtomicNode->getDebugLoc();
+  return DAG.getAtomic(ISD::ATOMIC_STORE, DL, AtomicNode->getMemoryVT(),
+                       FenceOp.getOperand(0),     // Chain
+                       AtomicNode->getOperand(1), // Pointer
+                       AtomicNode->getOperand(2), // Value
+                       AtomicNode->getMemOperand(), Release,
+                       FenceScope);
+}
+
+/// For a true bitfield insert, the bits getting into that contiguous mask
+/// should come from the low part of an existing value: they must be formed
+/// from a compatible SHL operation (unless they're already low). This
+/// function checks that condition and returns the least-significant bit
+/// that's intended. If the operation is not a field preparation, -1 is
+/// returned.
+static int32_t getLSBForBFI(SelectionDAG &DAG, DebugLoc DL, EVT VT,
+                            SDValue &MaskedVal, uint64_t Mask) {
+  if (!isShiftedMask_64(Mask))
+    return -1;
+
+  // Now we need to alter MaskedVal so that it is an appropriate input for a
+  // BFI instruction. BFI will do a left-shift by LSB before applying the mask
+  // we've spotted, so in general we should pre-emptively "undo" that by
+  // making sure the incoming bits have had a right-shift applied to them.
+  //
+  // This right shift, however, will combine with existing left/right shifts.
+  // In the simplest case of a completely straight bitfield operation, it will
+  // be expected to completely cancel out with an existing SHL. More
+  // complicated cases (e.g. bitfield to bitfield copy) may still need a real
+  // shift before the BFI.
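The shift-then-mask behaviour that comment attributes to BFI, modelled directly (bfi is an illustrative helper, Width < 64 assumed for brevity): inserting Width bits of Src at LSB leaves the rest of Dst untouched, which is why MaskedVal may need a compensating right shift first.

    #include <cassert>
    #include <cstdint>

    // Insert the low Width bits of Src into Dst starting at bit LSB.
    static uint64_t bfi(uint64_t Dst, uint64_t Src,
                        unsigned LSB, unsigned Width) {
      uint64_t Mask = ((1ULL << Width) - 1) << LSB;
      return (Dst & ~Mask) | ((Src << LSB) & Mask);
    }

    int main() {
      // Put the 4-bit value 0xB into bits [11:8] of 0xFFFF00FF.
      assert(bfi(0xFFFF00FFu, 0xB, 8, 4) == 0xFFFF0BFFu);
      return 0;
    }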
+ + uint64_t LSB = CountTrailingZeros_64(Mask); + int64_t ShiftRightRequired = LSB; + if (MaskedVal.getOpcode() == ISD::SHL && + isa<ConstantSDNode>(MaskedVal.getOperand(1))) { + ShiftRightRequired -= MaskedVal.getConstantOperandVal(1); + MaskedVal = MaskedVal.getOperand(0); + } else if (MaskedVal.getOpcode() == ISD::SRL && + isa<ConstantSDNode>(MaskedVal.getOperand(1))) { + ShiftRightRequired += MaskedVal.getConstantOperandVal(1); + MaskedVal = MaskedVal.getOperand(0); + } + + if (ShiftRightRequired > 0) + MaskedVal = DAG.getNode(ISD::SRL, DL, VT, MaskedVal, + DAG.getConstant(ShiftRightRequired, MVT::i64)); + else if (ShiftRightRequired < 0) { + // We could actually end up with a residual left shift, for example with + // "struc.bitfield = val << 1". + MaskedVal = DAG.getNode(ISD::SHL, DL, VT, MaskedVal, + DAG.getConstant(-ShiftRightRequired, MVT::i64)); + } + + return LSB; +} + +/// Searches from N for an existing AArch64ISD::BFI node, possibly surrounded by +/// a mask and an extension. Returns true if a BFI was found and provides +/// information on its surroundings. +static bool findMaskedBFI(SDValue N, SDValue &BFI, uint64_t &Mask, + bool &Extended) { + Extended = false; + if (N.getOpcode() == ISD::ZERO_EXTEND) { + Extended = true; + N = N.getOperand(0); + } + + if (N.getOpcode() == ISD::AND && isa<ConstantSDNode>(N.getOperand(1))) { + Mask = N->getConstantOperandVal(1); + N = N.getOperand(0); + } else { + // Mask is the whole width. + Mask = -1ULL >> (64 - N.getValueType().getSizeInBits()); + } + + if (N.getOpcode() == AArch64ISD::BFI) { + BFI = N; + return true; + } + + return false; +} + +/// Try to combine a subtree (rooted at an OR) into a "masked BFI" node, which +/// is roughly equivalent to (and (BFI ...), mask). This form is used because it +/// can often be further combined with a larger mask. Ultimately, we want mask +/// to be 2^32-1 or 2^64-1 so the AND can be skipped. +static SDValue tryCombineToBFI(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + const AArch64Subtarget *Subtarget) { + SelectionDAG &DAG = DCI.DAG; + DebugLoc DL = N->getDebugLoc(); + EVT VT = N->getValueType(0); + + assert(N->getOpcode() == ISD::OR && "Unexpected root"); + + // We need the LHS to be (and SOMETHING, MASK). Find out what that mask is or + // abandon the effort. + SDValue LHS = N->getOperand(0); + if (LHS.getOpcode() != ISD::AND) + return SDValue(); + + uint64_t LHSMask; + if (isa<ConstantSDNode>(LHS.getOperand(1))) + LHSMask = LHS->getConstantOperandVal(1); + else + return SDValue(); + + // We also need the RHS to be (and SOMETHING, MASK). Find out what that mask + // is or abandon the effort. + SDValue RHS = N->getOperand(1); + if (RHS.getOpcode() != ISD::AND) + return SDValue(); + + uint64_t RHSMask; + if (isa<ConstantSDNode>(RHS.getOperand(1))) + RHSMask = RHS->getConstantOperandVal(1); + else + return SDValue(); + + // Can't do anything if the masks are incompatible. + if (LHSMask & RHSMask) + return SDValue(); + + // Now we need one of the masks to be a contiguous field. Without loss of + // generality that should be the RHS one. + SDValue Bitfield = LHS.getOperand(0); + if (getLSBForBFI(DAG, DL, VT, Bitfield, LHSMask) != -1) { + // We know that LHS is a candidate new value, and RHS isn't already a better + // one. + std::swap(LHS, RHS); + std::swap(LHSMask, RHSMask); + } + + // We've done our best to put the right operands in the right places, all we + // can do now is check whether a BFI exists. 
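+  // For example (a sketch of the intended transform, on i32):
+  //   (or (and X, 0xffff0fff), (and (shl Y, 12), 0xf000))
+  // has disjoint masks covering all 32 bits between them, so it becomes
+  //   (BFI X, Y, 12, 4)
+  // which should select to "bfi w0, w1, #12, #4".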
+ Bitfield = RHS.getOperand(0); + int32_t LSB = getLSBForBFI(DAG, DL, VT, Bitfield, RHSMask); + if (LSB == -1) + return SDValue(); + + uint32_t Width = CountPopulation_64(RHSMask); + assert(Width && "Expected non-zero bitfield width"); + + SDValue BFI = DAG.getNode(AArch64ISD::BFI, DL, VT, + LHS.getOperand(0), Bitfield, + DAG.getConstant(LSB, MVT::i64), + DAG.getConstant(Width, MVT::i64)); + + // Mask is trivial + if ((LHSMask | RHSMask) == (-1ULL >> (64 - VT.getSizeInBits()))) + return BFI; + + return DAG.getNode(ISD::AND, DL, VT, BFI, + DAG.getConstant(LHSMask | RHSMask, VT)); +} + +/// Search for the bitwise combining (with careful masks) of a MaskedBFI and its +/// original input. This is surprisingly common because SROA splits things up +/// into i8 chunks, so the originally detected MaskedBFI may actually only act +/// on the low (say) byte of a word. This is then orred into the rest of the +/// word afterwards. +/// +/// Basic input: (or (and OLDFIELD, MASK1), (MaskedBFI MASK2, OLDFIELD, ...)). +/// +/// If MASK1 and MASK2 are compatible, we can fold the whole thing into the +/// MaskedBFI. We can also deal with a certain amount of extend/truncate being +/// involved. +static SDValue tryCombineToLargerBFI(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + const AArch64Subtarget *Subtarget) { + SelectionDAG &DAG = DCI.DAG; + DebugLoc DL = N->getDebugLoc(); + EVT VT = N->getValueType(0); + + // First job is to hunt for a MaskedBFI on either the left or right. Swap + // operands if it's actually on the right. + SDValue BFI; + SDValue PossExtraMask; + uint64_t ExistingMask = 0; + bool Extended = false; + if (findMaskedBFI(N->getOperand(0), BFI, ExistingMask, Extended)) + PossExtraMask = N->getOperand(1); + else if (findMaskedBFI(N->getOperand(1), BFI, ExistingMask, Extended)) + PossExtraMask = N->getOperand(0); + else + return SDValue(); + + // We can only combine a BFI with another compatible mask. + if (PossExtraMask.getOpcode() != ISD::AND || + !isa<ConstantSDNode>(PossExtraMask.getOperand(1))) + return SDValue(); + + uint64_t ExtraMask = PossExtraMask->getConstantOperandVal(1); + + // Masks must be compatible. + if (ExtraMask & ExistingMask) + return SDValue(); + + SDValue OldBFIVal = BFI.getOperand(0); + SDValue NewBFIVal = BFI.getOperand(1); + if (Extended) { + // We skipped a ZERO_EXTEND above, so the input to the MaskedBFIs should be + // 32-bit and we'll be forming a 64-bit MaskedBFI. The MaskedBFI arguments + // need to be made compatible. + assert(VT == MVT::i64 && BFI.getValueType() == MVT::i32 + && "Invalid types for BFI"); + OldBFIVal = DAG.getNode(ISD::ANY_EXTEND, DL, VT, OldBFIVal); + NewBFIVal = DAG.getNode(ISD::ANY_EXTEND, DL, VT, NewBFIVal); + } + + // We need the MaskedBFI to be combined with a mask of the *same* value. + if (PossExtraMask.getOperand(0) != OldBFIVal) + return SDValue(); + + BFI = DAG.getNode(AArch64ISD::BFI, DL, VT, + OldBFIVal, NewBFIVal, + BFI.getOperand(2), BFI.getOperand(3)); + + // If the masking is trivial, we don't need to create it. + if ((ExtraMask | ExistingMask) == (-1ULL >> (64 - VT.getSizeInBits()))) + return BFI; + + return DAG.getNode(ISD::AND, DL, VT, BFI, + DAG.getConstant(ExtraMask | ExistingMask, VT)); +} + +/// An EXTR instruction is made up of two shifts, ORed together. This helper +/// searches for and classifies those shifts. 
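+/// For example, in (or (shl X, #8), (srl Y, #24)) on i32, the SHL half
+/// supplies the high bits and the SRL half the low bits; together they are
+/// roughly "extr w0, wX, wY, #24" (see tryCombineToEXTR below for the exact
+/// immediate rules).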
+static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount, + bool &FromHi) { + if (N.getOpcode() == ISD::SHL) + FromHi = false; + else if (N.getOpcode() == ISD::SRL) + FromHi = true; + else + return false; + + if (!isa<ConstantSDNode>(N.getOperand(1))) + return false; + + ShiftAmount = N->getConstantOperandVal(1); + Src = N->getOperand(0); + return true; +} + +/// EXTR instruction extracts a contiguous chunk of bits from two existing +/// registers viewed as a high/low pair. This function looks for the pattern: +/// (or (shl VAL1, #N), (srl VAL2, #RegWidth-N)) and replaces it with an +/// EXTR. Can't quite be done in TableGen because the two immediates aren't +/// independent. +static SDValue tryCombineToEXTR(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + SelectionDAG &DAG = DCI.DAG; + DebugLoc DL = N->getDebugLoc(); + EVT VT = N->getValueType(0); + + assert(N->getOpcode() == ISD::OR && "Unexpected root"); + + if (VT != MVT::i32 && VT != MVT::i64) + return SDValue(); + + SDValue LHS; + uint32_t ShiftLHS = 0; + bool LHSFromHi = 0; + if (!findEXTRHalf(N->getOperand(0), LHS, ShiftLHS, LHSFromHi)) + return SDValue(); + + SDValue RHS; + uint32_t ShiftRHS = 0; + bool RHSFromHi = 0; + if (!findEXTRHalf(N->getOperand(1), RHS, ShiftRHS, RHSFromHi)) + return SDValue(); + + // If they're both trying to come from the high part of the register, they're + // not really an EXTR. + if (LHSFromHi == RHSFromHi) + return SDValue(); + + if (ShiftLHS + ShiftRHS != VT.getSizeInBits()) + return SDValue(); + + if (LHSFromHi) { + std::swap(LHS, RHS); + std::swap(ShiftLHS, ShiftRHS); + } + + return DAG.getNode(AArch64ISD::EXTR, DL, VT, + LHS, RHS, + DAG.getConstant(ShiftRHS, MVT::i64)); +} + +/// Target-specific dag combine xforms for ISD::OR +static SDValue PerformORCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + const AArch64Subtarget *Subtarget) { + + SelectionDAG &DAG = DCI.DAG; + EVT VT = N->getValueType(0); + + if(!DAG.getTargetLoweringInfo().isTypeLegal(VT)) + return SDValue(); + + // Attempt to recognise bitfield-insert operations. + SDValue Res = tryCombineToBFI(N, DCI, Subtarget); + if (Res.getNode()) + return Res; + + // Attempt to combine an existing MaskedBFI operation into one with a larger + // mask. + Res = tryCombineToLargerBFI(N, DCI, Subtarget); + if (Res.getNode()) + return Res; + + Res = tryCombineToEXTR(N, DCI); + if (Res.getNode()) + return Res; + + return SDValue(); +} + +/// Target-specific dag combine xforms for ISD::SRA +static SDValue PerformSRACombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + + SelectionDAG &DAG = DCI.DAG; + DebugLoc DL = N->getDebugLoc(); + EVT VT = N->getValueType(0); + + // We're looking for an SRA/SHL pair which form an SBFX. 
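+  // For example (i32): (sra (shl X, 24), 27) sign-extends a 5-bit field of X
+  // based at bit 3, so it should become (SBFX X, 3, 7), i.e.
+  // "sbfx w0, w0, #3, #5".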
+ + if (VT != MVT::i32 && VT != MVT::i64) + return SDValue(); + + if (!isa<ConstantSDNode>(N->getOperand(1))) + return SDValue(); + + uint64_t ExtraSignBits = N->getConstantOperandVal(1); + SDValue Shift = N->getOperand(0); + + if (Shift.getOpcode() != ISD::SHL) + return SDValue(); + + if (!isa<ConstantSDNode>(Shift->getOperand(1))) + return SDValue(); + + uint64_t BitsOnLeft = Shift->getConstantOperandVal(1); + uint64_t Width = VT.getSizeInBits() - ExtraSignBits; + uint64_t LSB = VT.getSizeInBits() - Width - BitsOnLeft; + + if (LSB > VT.getSizeInBits() || Width > VT.getSizeInBits()) + return SDValue(); + + return DAG.getNode(AArch64ISD::SBFX, DL, VT, Shift.getOperand(0), + DAG.getConstant(LSB, MVT::i64), + DAG.getConstant(LSB + Width - 1, MVT::i64)); +} + + +SDValue +AArch64TargetLowering::PerformDAGCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + switch (N->getOpcode()) { + default: break; + case ISD::AND: return PerformANDCombine(N, DCI); + case ISD::ATOMIC_FENCE: return PerformATOMIC_FENCECombine(N, DCI); + case ISD::ATOMIC_STORE: return PerformATOMIC_STORECombine(N, DCI); + case ISD::OR: return PerformORCombine(N, DCI, Subtarget); + case ISD::SRA: return PerformSRACombine(N, DCI); + } + return SDValue(); +} + +AArch64TargetLowering::ConstraintType +AArch64TargetLowering::getConstraintType(const std::string &Constraint) const { + if (Constraint.size() == 1) { + switch (Constraint[0]) { + default: break; + case 'w': // An FP/SIMD vector register + return C_RegisterClass; + case 'I': // Constant that can be used with an ADD instruction + case 'J': // Constant that can be used with a SUB instruction + case 'K': // Constant that can be used with a 32-bit logical instruction + case 'L': // Constant that can be used with a 64-bit logical instruction + case 'M': // Constant that can be used as a 32-bit MOV immediate + case 'N': // Constant that can be used as a 64-bit MOV immediate + case 'Y': // Floating point constant zero + case 'Z': // Integer constant zero + return C_Other; + case 'Q': // A memory reference with base register and no offset + return C_Memory; + case 'S': // A symbolic address + return C_Other; + } + } + + // FIXME: Ump, Utf, Usa, Ush + // Ump: A memory address suitable for ldp/stp in SI, DI, SF and DF modes, + // whatever they may be + // Utf: A memory address suitable for ldp/stp in TF mode, whatever it may be + // Usa: An absolute symbolic address + // Ush: The high part (bits 32:12) of a pc-relative symbolic address + assert(Constraint != "Ump" && Constraint != "Utf" && Constraint != "Usa" + && Constraint != "Ush" && "Unimplemented constraints"); + + return TargetLowering::getConstraintType(Constraint); +} + +TargetLowering::ConstraintWeight +AArch64TargetLowering::getSingleConstraintMatchWeight(AsmOperandInfo &Info, + const char *Constraint) const { + + llvm_unreachable("Constraint weight unimplemented"); +} + +void +AArch64TargetLowering::LowerAsmOperandForConstraint(SDValue Op, + std::string &Constraint, + std::vector<SDValue> &Ops, + SelectionDAG &DAG) const { + SDValue Result(0, 0); + + // Only length 1 constraints are C_Other. + if (Constraint.size() != 1) return; + + // Only C_Other constraints get lowered like this. That means constants for us + // so return early if there's no hope the constraint can be lowered. 
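+  // For example, given the (illustrative) inline asm
+  //   asm("add %0, %1, %2" : "=r"(res) : "r"(a), "I"(4095));
+  // the 'I' operand arrives here as a ConstantSDNode; 4095 fits the 12-bit
+  // ADD immediate, so it becomes a TargetConstant below.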
+ + switch(Constraint[0]) { + default: break; + case 'I': case 'J': case 'K': case 'L': + case 'M': case 'N': case 'Z': { + ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op); + if (!C) + return; + + uint64_t CVal = C->getZExtValue(); + uint32_t Bits; + + switch (Constraint[0]) { + default: + // FIXME: 'M' and 'N' are MOV pseudo-insts -- unsupported in assembly. 'J' + // is a peculiarly useless SUB constraint. + llvm_unreachable("Unimplemented C_Other constraint"); + case 'I': + if (CVal <= 0xfff) + break; + return; + case 'K': + if (A64Imms::isLogicalImm(32, CVal, Bits)) + break; + return; + case 'L': + if (A64Imms::isLogicalImm(64, CVal, Bits)) + break; + return; + case 'Z': + if (CVal == 0) + break; + return; + } + + Result = DAG.getTargetConstant(CVal, Op.getValueType()); + break; + } + case 'S': { + // An absolute symbolic address or label reference. + if (const GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Op)) { + Result = DAG.getTargetGlobalAddress(GA->getGlobal(), Op.getDebugLoc(), + GA->getValueType(0)); + } else if (const BlockAddressSDNode *BA + = dyn_cast<BlockAddressSDNode>(Op)) { + Result = DAG.getTargetBlockAddress(BA->getBlockAddress(), + BA->getValueType(0)); + } else if (const ExternalSymbolSDNode *ES + = dyn_cast<ExternalSymbolSDNode>(Op)) { + Result = DAG.getTargetExternalSymbol(ES->getSymbol(), + ES->getValueType(0)); + } else + return; + break; + } + case 'Y': + if (const ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) { + if (CFP->isExactlyValue(0.0)) { + Result = DAG.getTargetConstantFP(0.0, CFP->getValueType(0)); + break; + } + } + return; + } + + if (Result.getNode()) { + Ops.push_back(Result); + return; + } + + // It's an unknown constraint for us. Let generic code have a go. + TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); +} + +std::pair<unsigned, const TargetRegisterClass*> +AArch64TargetLowering::getRegForInlineAsmConstraint( + const std::string &Constraint, + EVT VT) const { + if (Constraint.size() == 1) { + switch (Constraint[0]) { + case 'r': + if (VT.getSizeInBits() <= 32) + return std::make_pair(0U, &AArch64::GPR32RegClass); + else if (VT == MVT::i64) + return std::make_pair(0U, &AArch64::GPR64RegClass); + break; + case 'w': + if (VT == MVT::f16) + return std::make_pair(0U, &AArch64::FPR16RegClass); + else if (VT == MVT::f32) + return std::make_pair(0U, &AArch64::FPR32RegClass); + else if (VT == MVT::f64) + return std::make_pair(0U, &AArch64::FPR64RegClass); + else if (VT.getSizeInBits() == 64) + return std::make_pair(0U, &AArch64::VPR64RegClass); + else if (VT == MVT::f128) + return std::make_pair(0U, &AArch64::FPR128RegClass); + else if (VT.getSizeInBits() == 128) + return std::make_pair(0U, &AArch64::VPR128RegClass); + break; + } + } + + // Use the default implementation in TargetLowering to convert the register + // constraint into a member of a register class. + return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT); +} diff --git a/lib/Target/AArch64/AArch64ISelLowering.h b/lib/Target/AArch64/AArch64ISelLowering.h new file mode 100644 index 0000000..4960d28 --- /dev/null +++ b/lib/Target/AArch64/AArch64ISelLowering.h @@ -0,0 +1,247 @@ +//==-- AArch64ISelLowering.h - AArch64 DAG Lowering Interface ----*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that AArch64 uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_AARCH64_ISELLOWERING_H
+#define LLVM_TARGET_AARCH64_ISELLOWERING_H
+
+#include "Utils/AArch64BaseInfo.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/Target/TargetLowering.h"
+
+
+namespace llvm {
+namespace AArch64ISD {
+  enum NodeType {
+    // Start the numbering from where ISD NodeType finishes.
+    FIRST_NUMBER = ISD::BUILTIN_OP_END,
+
+    // This is a conditional branch which also notes the flag needed
+    // (eq/sgt/...). A64 puts this information on the branches rather than
+    // compares as LLVM does.
+    BR_CC,
+
+    // A node to be selected to an actual call operation: either BL or BLR in
+    // the absence of tail calls.
+    Call,
+
+    // Indicates a floating-point immediate which fits into the format
+    // required by the FMOV instructions. First (and only) operand is the
+    // 8-bit encoded value of that immediate.
+    FPMOV,
+
+    // Corresponds directly to an EXTR instruction. Operands are an LHS, an
+    // RHS and an LSB.
+    EXTR,
+
+    // Wraps a load from the GOT, which should always be performed with a
+    // 64-bit load instruction. This prevents the DAG combiner folding a
+    // truncate to form a smaller memory access.
+    GOTLoad,
+
+    // Performs a bitfield insert. Arguments are: the value being inserted
+    // into; the value being inserted; least significant bit changed; width
+    // of the field.
+    BFI,
+
+    // Simply a convenient node inserted during ISelLowering to represent
+    // procedure return. Will almost certainly be selected to "RET".
+    Ret,
+
+    /// Extracts a field of contiguous bits from the source and sign extends
+    /// them into a single register. Arguments are: source; immr; imms. Note
+    /// these are pre-encoded since DAG matching can't cope with combining LSB
+    /// and Width into these values itself.
+    SBFX,
+
+    /// This is an A64-ification of the standard LLVM SELECT_CC operation. The
+    /// main difference is that it only has the values and an A64 condition,
+    /// which will be produced by a setcc instruction.
+    SELECT_CC,
+
+    /// This serves most of the functions of the LLVM SETCC instruction, for
+    /// two purposes. First, it prevents optimisations from fiddling with the
+    /// compare after we've moved the CondCode information onto the SELECT_CC
+    /// or BR_CC instructions. Second, it gives a legal instruction for the
+    /// actual comparison.
+    ///
+    /// It keeps a record of the condition flags asked for because certain
+    /// instructions are only valid for a subset of condition codes.
+    SETCC,
+
+    // Designates a node which is a tail call: both a call and a return
+    // instruction as far as selection is concerned. It should be selected to
+    // an unconditional branch. Has the usual plethora of call operands, but:
+    // 1st is callee, 2nd is stack adjustment required immediately before
+    // branch.
+    TC_RETURN,
+
+    // Designates a call used to support the TLS descriptor ABI. The call
+    // itself will be indirect ("BLR xN") but a relocation-specifier
+    // (".tlsdesccall var") must be attached somehow during code generation.
+    // It takes two operands: the callee and the symbol to be relocated
+    // against.
+    TLSDESCCALL,
+
+    // Leaf node which will be lowered to an appropriate MRS to obtain the
+    // thread pointer: TPIDR_EL0.
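+    // In practice this should be selected to a single "mrs xN, tpidr_el0".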
+ THREAD_POINTER, + + /// Extracts a field of contiguous bits from the source and zero extends + /// them into a single register. Arguments are: source; immr; imms. Note + /// these are pre-encoded since DAG matching can't cope with combining LSB + /// and Width into these values itself. + UBFX, + + // Wraps an address which the ISelLowering phase has decided should be + // created using the small absolute memory model: i.e. adrp/add or + // adrp/mem-op. This exists to prevent bare TargetAddresses which may never + // get selected. + WrapperSmall + }; +} + + +class AArch64Subtarget; +class AArch64TargetMachine; + +class AArch64TargetLowering : public TargetLowering { +public: + explicit AArch64TargetLowering(AArch64TargetMachine &TM); + + const char *getTargetNodeName(unsigned Opcode) const; + + CCAssignFn *CCAssignFnForNode(CallingConv::ID CC) const; + + SDValue LowerFormalArguments(SDValue Chain, + CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, + DebugLoc dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const; + + SDValue LowerReturn(SDValue Chain, + CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + DebugLoc dl, SelectionDAG &DAG) const; + + SDValue LowerCall(CallLoweringInfo &CLI, + SmallVectorImpl<SDValue> &InVals) const; + + SDValue LowerCallResult(SDValue Chain, SDValue InFlag, + CallingConv::ID CallConv, bool IsVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, + DebugLoc dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const; + + void SaveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG, + DebugLoc DL, SDValue &Chain) const; + + + /// IsEligibleForTailCallOptimization - Check whether the call is eligible + /// for tail call optimization. Targets which want to do tail call + /// optimization should implement this function. + bool IsEligibleForTailCallOptimization(SDValue Callee, + CallingConv::ID CalleeCC, + bool IsVarArg, + bool IsCalleeStructRet, + bool IsCallerStructRet, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + const SmallVectorImpl<ISD::InputArg> &Ins, + SelectionDAG& DAG) const; + + /// Finds the incoming stack arguments which overlap the given fixed stack + /// object and incorporates their load into the current chain. This prevents + /// an upcoming store from clobbering the stack argument before it's used. 
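+  /// (This is mainly a tail-call concern: outgoing arguments for a tail call
+  /// are written into the caller's own incoming-argument area, so without
+  /// this a store could clobber an incoming value that is still to be read.)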
+ SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, + MachineFrameInfo *MFI, int ClobberedFI) const; + + EVT getSetCCResultType(EVT VT) const; + + bool DoesCalleeRestoreStack(CallingConv::ID CallCC, bool TailCallOpt) const; + + bool IsTailCallConvention(CallingConv::ID CallCC) const; + + SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const; + + bool isLegalICmpImmediate(int64_t Val) const; + SDValue getSelectableIntSetCC(SDValue LHS, SDValue RHS, ISD::CondCode CC, + SDValue &A64cc, SelectionDAG &DAG, DebugLoc &dl) const; + + virtual MachineBasicBlock * + EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *MBB) const; + + MachineBasicBlock * + emitAtomicBinary(MachineInstr *MI, MachineBasicBlock *MBB, + unsigned Size, unsigned Opcode) const; + + MachineBasicBlock * + emitAtomicBinaryMinMax(MachineInstr *MI, MachineBasicBlock *BB, + unsigned Size, unsigned CmpOp, + A64CC::CondCodes Cond) const; + MachineBasicBlock * + emitAtomicCmpSwap(MachineInstr *MI, MachineBasicBlock *BB, + unsigned Size) const; + + MachineBasicBlock * + EmitF128CSEL(MachineInstr *MI, MachineBasicBlock *MBB) const; + + SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerF128ToCall(SDValue Op, SelectionDAG &DAG, + RTLIB::Libcall Call) const; + SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG, bool IsSigned) const; + SDValue LowerGlobalAddressELF(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerTLSDescCall(SDValue SymAddr, SDValue DescAddr, DebugLoc DL, + SelectionDAG &DAG) const; + SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG, bool IsSigned) const; + SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVACOPY(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const; + + virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const; + + /// isFMAFasterThanMulAndAdd - Return true if an FMA operation is faster than + /// a pair of mul and add instructions. fmuladd intrinsics will be expanded to + /// FMAs when this method returns true (and FMAs are legal), otherwise fmuladd + /// is expanded to mul + add. 
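+  /// e.g. (f64 (fmuladd a, b, c)) should become a single
+  /// "fmadd d0, d1, d2, d0" rather than an fmul/fadd pair.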
+ virtual bool isFMAFasterThanMulAndAdd(EVT) const { return true; } + + ConstraintType getConstraintType(const std::string &Constraint) const; + + ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &Info, + const char *Constraint) const; + void LowerAsmOperandForConstraint(SDValue Op, + std::string &Constraint, + std::vector<SDValue> &Ops, + SelectionDAG &DAG) const; + + std::pair<unsigned, const TargetRegisterClass*> + getRegForInlineAsmConstraint(const std::string &Constraint, EVT VT) const; +private: + const AArch64Subtarget *Subtarget; + const TargetRegisterInfo *RegInfo; + const InstrItineraryData *Itins; +}; +} // namespace llvm + +#endif // LLVM_TARGET_AARCH64_ISELLOWERING_H diff --git a/lib/Target/AArch64/AArch64InstrFormats.td b/lib/Target/AArch64/AArch64InstrFormats.td new file mode 100644 index 0000000..cb93471 --- /dev/null +++ b/lib/Target/AArch64/AArch64InstrFormats.td @@ -0,0 +1,961 @@ +//===- AArch64InstrFormats.td - AArch64 Instruction Formats --*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// This file describes AArch64 instruction formats, down to the level of the +// instruction's overall class. +// ===----------------------------------------------------------------------===// + + +//===----------------------------------------------------------------------===// +// A64 Instruction Format Definitions. +//===----------------------------------------------------------------------===// + +// A64 is currently the only instruction set supported by the AArch64 +// architecture. +class A64Inst<dag outs, dag ins, string asmstr, list<dag> patterns, + InstrItinClass itin> + : Instruction { + // All A64 instructions are 32-bit. This field will be filled in + // gradually going down the hierarchy. + field bits<32> Inst; + + field bits<32> Unpredictable = 0; + // SoftFail is the generic name for this field, but we alias it so + // as to make it more obvious what it means in ARM-land. + field bits<32> SoftFail = Unpredictable; + + // LLVM-level model of the AArch64/A64 distinction. + let Namespace = "AArch64"; + let DecoderNamespace = "A64"; + let Size = 4; + + // Set the templated fields + let OutOperandList = outs; + let InOperandList = ins; + let AsmString = asmstr; + let Pattern = patterns; + let Itinerary = itin; +} + +class PseudoInst<dag outs, dag ins, list<dag> patterns> : Instruction { + let Namespace = "AArch64"; + + let OutOperandList = outs; + let InOperandList= ins; + let Pattern = patterns; + let isCodeGenOnly = 1; + let isPseudo = 1; +} + +// Represents a pseudo-instruction that represents a single A64 instruction for +// whatever reason, the eventual result will be a 32-bit real instruction. +class A64PseudoInst<dag outs, dag ins, list<dag> patterns> + : PseudoInst<outs, ins, patterns> { + let Size = 4; +} + +// As above, this will be a single A64 instruction, but we can actually give the +// expansion in TableGen. +class A64PseudoExpand<dag outs, dag ins, list<dag> patterns, dag Result> + : A64PseudoInst<outs, ins, patterns>, + PseudoInstExpansion<Result>; + + +// First, some common cross-hierarchy register formats. 
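+// These build up in layers: A64InstRd adds Rd in bits 4-0, A64InstRdn adds Rn
+// in 9-5 on top of that, and A64InstRdnm adds Rm in 20-16. The Rt-based
+// classes mirror this for loads and stores, with Rt2 in 14-10 for the
+// pair-wise instructions.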
+
+class A64InstRd<dag outs, dag ins, string asmstr,
+                list<dag> patterns, InstrItinClass itin>
+  : A64Inst<outs, ins, asmstr, patterns, itin> {
+  bits<5> Rd;
+
+  let Inst{4-0} = Rd;
+}
+
+class A64InstRt<dag outs, dag ins, string asmstr,
+                list<dag> patterns, InstrItinClass itin>
+  : A64Inst<outs, ins, asmstr, patterns, itin> {
+  bits<5> Rt;
+
+  let Inst{4-0} = Rt;
+}
+
+
+class A64InstRdn<dag outs, dag ins, string asmstr,
+                 list<dag> patterns, InstrItinClass itin>
+  : A64InstRd<outs, ins, asmstr, patterns, itin> {
+  // Inherit Rd
+  bits<5> Rn;
+
+  let Inst{9-5} = Rn;
+}
+
+class A64InstRtn<dag outs, dag ins, string asmstr,
+                 list<dag> patterns, InstrItinClass itin>
+  : A64InstRt<outs, ins, asmstr, patterns, itin> {
+  // Inherit Rt
+  bits<5> Rn;
+
+  let Inst{9-5} = Rn;
+}
+
+// Instructions taking Rt,Rt2,Rn
+class A64InstRtt2n<dag outs, dag ins, string asmstr,
+                   list<dag> patterns, InstrItinClass itin>
+  : A64InstRtn<outs, ins, asmstr, patterns, itin> {
+  bits<5> Rt2;
+
+  let Inst{14-10} = Rt2;
+}
+
+class A64InstRdnm<dag outs, dag ins, string asmstr,
+                  list<dag> patterns, InstrItinClass itin>
+  : A64InstRdn<outs, ins, asmstr, patterns, itin> {
+  bits<5> Rm;
+
+  let Inst{20-16} = Rm;
+}
+
+//===----------------------------------------------------------------------===//
+//
+// Actual A64 Instruction Formats
+//
+
+// Format for Add-subtract (extended register) instructions.
+class A64I_addsubext<bit sf, bit op, bit S, bits<2> opt, bits<3> option,
+                     dag outs, dag ins, string asmstr, list<dag> patterns,
+                     InstrItinClass itin>
+  : A64InstRdnm<outs, ins, asmstr, patterns, itin> {
+  bits<3> Imm3;
+
+  let Inst{31} = sf;
+  let Inst{30} = op;
+  let Inst{29} = S;
+  let Inst{28-24} = 0b01011;
+  let Inst{23-22} = opt;
+  let Inst{21} = 0b1;
+  // Rm inherited in 20-16
+  let Inst{15-13} = option;
+  let Inst{12-10} = Imm3;
+  // Rn inherited in 9-5
+  // Rd inherited in 4-0
+}
+
+// Format for Add-subtract (immediate) instructions.
+class A64I_addsubimm<bit sf, bit op, bit S, bits<2> shift,
+                     dag outs, dag ins, string asmstr,
+                     list<dag> patterns, InstrItinClass itin>
+  : A64InstRdn<outs, ins, asmstr, patterns, itin> {
+  bits<12> Imm12;
+
+  let Inst{31} = sf;
+  let Inst{30} = op;
+  let Inst{29} = S;
+  let Inst{28-24} = 0b10001;
+  let Inst{23-22} = shift;
+  let Inst{21-10} = Imm12;
+}
+
+// Format for Add-subtract (shifted register) instructions.
+class A64I_addsubshift<bit sf, bit op, bit S, bits<2> shift,
+                       dag outs, dag ins, string asmstr, list<dag> patterns,
+                       InstrItinClass itin>
+  : A64InstRdnm<outs, ins, asmstr, patterns, itin> {
+  bits<6> Imm6;
+
+  let Inst{31} = sf;
+  let Inst{30} = op;
+  let Inst{29} = S;
+  let Inst{28-24} = 0b01011;
+  let Inst{23-22} = shift;
+  let Inst{21} = 0b0;
+  // Rm inherited in 20-16
+  let Inst{15-10} = Imm6;
+  // Rn inherited in 9-5
+  // Rd inherited in 4-0
+}
+
+// Format for Add-subtract (with carry) instructions.
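+// This group covers ADC, ADCS, SBC and SBCS; for the architected forms the
+// opcode2 field should be all-zero.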
+class A64I_addsubcarry<bit sf, bit op, bit S, bits<6> opcode2, + dag outs, dag ins, string asmstr, list<dag> patterns, + InstrItinClass itin> + : A64InstRdnm<outs, ins, asmstr, patterns, itin> { + let Inst{31} = sf; + let Inst{30} = op; + let Inst{29} = S; + let Inst{28-21} = 0b11010000; + // Rm inherited in 20-16 + let Inst{15-10} = opcode2; + // Rn inherited in 9-5 + // Rd inherited in 4-0 +} + + +// Format for Bitfield instructions +class A64I_bitfield<bit sf, bits<2> opc, bit n, + dag outs, dag ins, string asmstr, + list<dag> patterns, InstrItinClass itin> + : A64InstRdn<outs, ins, asmstr, patterns, itin> { + bits<6> ImmR; + bits<6> ImmS; + + let Inst{31} = sf; + let Inst{30-29} = opc; + let Inst{28-23} = 0b100110; + let Inst{22} = n; + let Inst{21-16} = ImmR; + let Inst{15-10} = ImmS; + // Inherit Rn in 9-5 + // Inherit Rd in 4-0 +} + +// Format for compare and branch (immediate) instructions. +class A64I_cmpbr<bit sf, bit op, + dag outs, dag ins, string asmstr, + list<dag> patterns, InstrItinClass itin> + : A64InstRt<outs, ins, asmstr, patterns, itin> { + bits<19> Label; + + let Inst{31} = sf; + let Inst{30-25} = 0b011010; + let Inst{24} = op; + let Inst{23-5} = Label; + // Inherit Rt in 4-0 +} + +// Format for conditional branch (immediate) instructions. +class A64I_condbr<bit o1, bit o0, + dag outs, dag ins, string asmstr, + list<dag> patterns, InstrItinClass itin> + : A64Inst<outs, ins, asmstr, patterns, itin> { + bits<19> Label; + bits<4> Cond; + + let Inst{31-25} = 0b0101010; + let Inst{24} = o1; + let Inst{23-5} = Label; + let Inst{4} = o0; + let Inst{3-0} = Cond; +} + +// Format for conditional compare (immediate) instructions. +class A64I_condcmpimm<bit sf, bit op, bit o2, bit o3, bit s, + dag outs, dag ins, string asmstr, + list<dag> patterns, InstrItinClass itin> + : A64Inst<outs, ins, asmstr, patterns, itin> { + bits<5> Rn; + bits<5> UImm5; + bits<4> NZCVImm; + bits<4> Cond; + + let Inst{31} = sf; + let Inst{30} = op; + let Inst{29} = s; + let Inst{28-21} = 0b11010010; + let Inst{20-16} = UImm5; + let Inst{15-12} = Cond; + let Inst{11} = 0b1; + let Inst{10} = o2; + let Inst{9-5} = Rn; + let Inst{4} = o3; + let Inst{3-0} = NZCVImm; +} + +// Format for conditional compare (register) instructions. +class A64I_condcmpreg<bit sf, bit op, bit o2, bit o3, bit s, + dag outs, dag ins, string asmstr, + list<dag> patterns, InstrItinClass itin> + : A64Inst<outs, ins, asmstr, patterns, itin> { + bits<5> Rn; + bits<5> Rm; + bits<4> NZCVImm; + bits<4> Cond; + + + let Inst{31} = sf; + let Inst{30} = op; + let Inst{29} = s; + let Inst{28-21} = 0b11010010; + let Inst{20-16} = Rm; + let Inst{15-12} = Cond; + let Inst{11} = 0b0; + let Inst{10} = o2; + let Inst{9-5} = Rn; + let Inst{4} = o3; + let Inst{3-0} = NZCVImm; +} + +// Format for conditional select instructions. 
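+// The op/op2 pair distinguishes the four selects: CSEL (0, 00), CSINC
+// (0, 01), CSINV (1, 00) and CSNEG (1, 01).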
+class A64I_condsel<bit sf, bit op, bit s, bits<2> op2, + dag outs, dag ins, string asmstr, + list<dag> patterns, InstrItinClass itin> + : A64InstRdnm<outs, ins, asmstr, patterns, itin> { + bits<4> Cond; + + let Inst{31} = sf; + let Inst{30} = op; + let Inst{29} = s; + let Inst{28-21} = 0b11010100; + // Inherit Rm in 20-16 + let Inst{15-12} = Cond; + let Inst{11-10} = op2; + // Inherit Rn in 9-5 + // Inherit Rd in 4-0 +} + +// Format for data processing (1 source) instructions +class A64I_dp_1src<bit sf, bit S, bits<5> opcode2, bits<6> opcode, + string asmstr, dag outs, dag ins, + list<dag> patterns, InstrItinClass itin> + : A64InstRdn<outs, ins, asmstr, patterns, itin> { + let Inst{31} = sf; + let Inst{30} = 0b1; + let Inst{29} = S; + let Inst{28-21} = 0b11010110; + let Inst{20-16} = opcode2; + let Inst{15-10} = opcode; +} + +// Format for data processing (2 source) instructions +class A64I_dp_2src<bit sf, bits<6> opcode, bit S, + string asmstr, dag outs, dag ins, + list<dag> patterns, InstrItinClass itin> + : A64InstRdnm<outs, ins, asmstr, patterns, itin> { + let Inst{31} = sf; + let Inst{30} = 0b0; + let Inst{29} = S; + let Inst{28-21} = 0b11010110; + let Inst{15-10} = opcode; +} + +// Format for data-processing (3 source) instructions + +class A64I_dp3<bit sf, bits<6> opcode, + dag outs, dag ins, string asmstr, + list<dag> patterns, InstrItinClass itin> + : A64InstRdnm<outs, ins, asmstr, patterns, itin> { + bits<5> Ra; + + let Inst{31} = sf; + let Inst{30-29} = opcode{5-4}; + let Inst{28-24} = 0b11011; + let Inst{23-21} = opcode{3-1}; + // Inherits Rm in 20-16 + let Inst{15} = opcode{0}; + let Inst{14-10} = Ra; + // Inherits Rn in 9-5 + // Inherits Rd in 4-0 +} + +// Format for exception generation instructions +class A64I_exception<bits<3> opc, bits<3> op2, bits<2> ll, + dag outs, dag ins, string asmstr, + list<dag> patterns, InstrItinClass itin> + : A64Inst<outs, ins, asmstr, patterns, itin> { + bits<16> UImm16; + + let Inst{31-24} = 0b11010100; + let Inst{23-21} = opc; + let Inst{20-5} = UImm16; + let Inst{4-2} = op2; + let Inst{1-0} = ll; +} + +// Format for extract (immediate) instructions +class A64I_extract<bit sf, bits<3> op, bit n, + dag outs, dag ins, string asmstr, + list<dag> patterns, InstrItinClass itin> + : A64InstRdnm<outs, ins, asmstr, patterns, itin> { + bits<6> LSB; + + let Inst{31} = sf; + let Inst{30-29} = op{2-1}; + let Inst{28-23} = 0b100111; + let Inst{22} = n; + let Inst{21} = op{0}; + // Inherits Rm in bits 20-16 + let Inst{15-10} = LSB; + // Inherits Rn in 9-5 + // Inherits Rd in 4-0 +} + +// Format for floating-point compare instructions. +class A64I_fpcmp<bit m, bit s, bits<2> type, bits<2> op, bits<5> opcode2, + dag outs, dag ins, string asmstr, + list<dag> patterns, InstrItinClass itin> + : A64Inst<outs, ins, asmstr, patterns, itin> { + bits<5> Rn; + bits<5> Rm; + + let Inst{31} = m; + let Inst{30} = 0b0; + let Inst{29} = s; + let Inst{28-24} = 0b11110; + let Inst{23-22} = type; + let Inst{21} = 0b1; + let Inst{20-16} = Rm; + let Inst{15-14} = op; + let Inst{13-10} = 0b1000; + let Inst{9-5} = Rn; + let Inst{4-0} = opcode2; +} + +// Format for floating-point conditional compare instructions. 
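+// i.e. FCCMP (op = 0) and its signalling variant FCCMPE (op = 1).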
+class A64I_fpccmp<bit m, bit s, bits<2> type, bit op, + dag outs, dag ins, string asmstr, + list<dag> patterns, InstrItinClass itin> + : A64InstRdn<outs, ins, asmstr, patterns, itin> { + bits<5> Rn; + bits<5> Rm; + bits<4> NZCVImm; + bits<4> Cond; + + let Inst{31} = m; + let Inst{30} = 0b0; + let Inst{29} = s; + let Inst{28-24} = 0b11110; + let Inst{23-22} = type; + let Inst{21} = 0b1; + let Inst{20-16} = Rm; + let Inst{15-12} = Cond; + let Inst{11-10} = 0b01; + let Inst{9-5} = Rn; + let Inst{4} = op; + let Inst{3-0} = NZCVImm; +} + +// Format for floating-point conditional select instructions. +class A64I_fpcondsel<bit m, bit s, bits<2> type, + dag outs, dag ins, string asmstr, + list<dag> patterns, InstrItinClass itin> + : A64InstRdnm<outs, ins, asmstr, patterns, itin> { + bits<4> Cond; + + let Inst{31} = m; + let Inst{30} = 0b0; + let Inst{29} = s; + let Inst{28-24} = 0b11110; + let Inst{23-22} = type; + let Inst{21} = 0b1; + // Inherit Rm in 20-16 + let Inst{15-12} = Cond; + let Inst{11-10} = 0b11; + // Inherit Rn in 9-5 + // Inherit Rd in 4-0 +} + + +// Format for floating-point data-processing (1 source) instructions. +class A64I_fpdp1<bit m, bit s, bits<2> type, bits<6> opcode, + dag outs, dag ins, string asmstr, + list<dag> patterns, InstrItinClass itin> + : A64InstRdn<outs, ins, asmstr, patterns, itin> { + let Inst{31} = m; + let Inst{30} = 0b0; + let Inst{29} = s; + let Inst{28-24} = 0b11110; + let Inst{23-22} = type; + let Inst{21} = 0b1; + let Inst{20-15} = opcode; + let Inst{14-10} = 0b10000; + // Inherit Rn in 9-5 + // Inherit Rd in 4-0 +} + +// Format for floating-point data-processing (2 sources) instructions. +class A64I_fpdp2<bit m, bit s, bits<2> type, bits<4> opcode, + dag outs, dag ins, string asmstr, + list<dag> patterns, InstrItinClass itin> + : A64InstRdnm<outs, ins, asmstr, patterns, itin> { + let Inst{31} = m; + let Inst{30} = 0b0; + let Inst{29} = s; + let Inst{28-24} = 0b11110; + let Inst{23-22} = type; + let Inst{21} = 0b1; + // Inherit Rm in 20-16 + let Inst{15-12} = opcode; + let Inst{11-10} = 0b10; + // Inherit Rn in 9-5 + // Inherit Rd in 4-0 +} + +// Format for floating-point data-processing (3 sources) instructions. +class A64I_fpdp3<bit m, bit s, bits<2> type, bit o1, bit o0, + dag outs, dag ins, string asmstr, + list<dag> patterns, InstrItinClass itin> + : A64InstRdnm<outs, ins, asmstr, patterns, itin> { + bits<5> Ra; + + let Inst{31} = m; + let Inst{30} = 0b0; + let Inst{29} = s; + let Inst{28-24} = 0b11111; + let Inst{23-22} = type; + let Inst{21} = o1; + // Inherit Rm in 20-16 + let Inst{15} = o0; + let Inst{14-10} = Ra; + // Inherit Rn in 9-5 + // Inherit Rd in 4-0 +} + +// Format for floating-point <-> fixed-point conversion instructions. +class A64I_fpfixed<bit sf, bit s, bits<2> type, bits<2> mode, bits<3> opcode, + dag outs, dag ins, string asmstr, + list<dag> patterns, InstrItinClass itin> + : A64InstRdn<outs, ins, asmstr, patterns, itin> { + bits<6> Scale; + + let Inst{31} = sf; + let Inst{30} = 0b0; + let Inst{29} = s; + let Inst{28-24} = 0b11110; + let Inst{23-22} = type; + let Inst{21} = 0b0; + let Inst{20-19} = mode; + let Inst{18-16} = opcode; + let Inst{15-10} = Scale; + // Inherit Rn in 9-5 + // Inherit Rd in 4-0 +} + +// Format for floating-point <-> integer conversion instructions. 
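+// This group covers conversions such as FCVTZS/FCVTZU and SCVTF/UCVTF, plus
+// FMOV between general and FP registers; rmode/opcode select the operation
+// and its rounding behaviour.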
+class A64I_fpint<bit sf, bit s, bits<2> type, bits<2> rmode, bits<3> opcode, + dag outs, dag ins, string asmstr, + list<dag> patterns, InstrItinClass itin> + : A64InstRdn<outs, ins, asmstr, patterns, itin> { + let Inst{31} = sf; + let Inst{30} = 0b0; + let Inst{29} = s; + let Inst{28-24} = 0b11110; + let Inst{23-22} = type; + let Inst{21} = 0b1; + let Inst{20-19} = rmode; + let Inst{18-16} = opcode; + let Inst{15-10} = 0b000000; + // Inherit Rn in 9-5 + // Inherit Rd in 4-0 +} + + +// Format for floating-point immediate instructions. +class A64I_fpimm<bit m, bit s, bits<2> type, bits<5> imm5, + dag outs, dag ins, string asmstr, + list<dag> patterns, InstrItinClass itin> + : A64InstRd<outs, ins, asmstr, patterns, itin> { + bits<8> Imm8; + + let Inst{31} = m; + let Inst{30} = 0b0; + let Inst{29} = s; + let Inst{28-24} = 0b11110; + let Inst{23-22} = type; + let Inst{21} = 0b1; + let Inst{20-13} = Imm8; + let Inst{12-10} = 0b100; + let Inst{9-5} = imm5; + // Inherit Rd in 4-0 +} + +// Format for load-register (literal) instructions. +class A64I_LDRlit<bits<2> opc, bit v, + dag outs, dag ins, string asmstr, + list<dag> patterns, InstrItinClass itin> + : A64InstRt<outs, ins, asmstr, patterns, itin> { + bits<19> Imm19; + + let Inst{31-30} = opc; + let Inst{29-27} = 0b011; + let Inst{26} = v; + let Inst{25-24} = 0b00; + let Inst{23-5} = Imm19; + // Inherit Rt in 4-0 +} + +// Format for load-store exclusive instructions. +class A64I_LDSTex_tn<bits<2> size, bit o2, bit L, bit o1, bit o0, + dag outs, dag ins, string asmstr, + list <dag> patterns, InstrItinClass itin> + : A64InstRtn<outs, ins, asmstr, patterns, itin> { + let Inst{31-30} = size; + let Inst{29-24} = 0b001000; + let Inst{23} = o2; + let Inst{22} = L; + let Inst{21} = o1; + let Inst{15} = o0; +} + +class A64I_LDSTex_tt2n<bits<2> size, bit o2, bit L, bit o1, bit o0, + dag outs, dag ins, string asmstr, + list <dag> patterns, InstrItinClass itin>: + A64I_LDSTex_tn<size, o2, L, o1, o0, outs, ins, asmstr, patterns, itin>{ + bits<5> Rt2; + let Inst{14-10} = Rt2; +} + +class A64I_LDSTex_stn<bits<2> size, bit o2, bit L, bit o1, bit o0, + dag outs, dag ins, string asmstr, + list <dag> patterns, InstrItinClass itin>: + A64I_LDSTex_tn<size, o2, L, o1, o0, outs, ins, asmstr, patterns, itin>{ + bits<5> Rs; + let Inst{20-16} = Rs; +} + +class A64I_LDSTex_stt2n<bits<2> size, bit o2, bit L, bit o1, bit o0, + dag outs, dag ins, string asmstr, + list <dag> patterns, InstrItinClass itin>: + A64I_LDSTex_stn<size, o2, L, o1, o0, outs, ins, asmstr, patterns, itin>{ + bits<5> Rt2; + let Inst{14-10} = Rt2; +} + +// Format for load-store register (immediate post-indexed) instructions +class A64I_LSpostind<bits<2> size, bit v, bits<2> opc, + dag outs, dag ins, string asmstr, + list<dag> patterns, InstrItinClass itin> + : A64InstRtn<outs, ins, asmstr, patterns, itin> { + bits<9> SImm9; + + let Inst{31-30} = size; + let Inst{29-27} = 0b111; + let Inst{26} = v; + let Inst{25-24} = 0b00; + let Inst{23-22} = opc; + let Inst{21} = 0b0; + let Inst{20-12} = SImm9; + let Inst{11-10} = 0b01; + // Inherit Rn in 9-5 + // Inherit Rt in 4-0 +} + +// Format for load-store register (immediate pre-indexed) instructions +class A64I_LSpreind<bits<2> size, bit v, bits<2> opc, + dag outs, dag ins, string asmstr, + list<dag> patterns, InstrItinClass itin> + : A64InstRtn<outs, ins, asmstr, patterns, itin> { + bits<9> SImm9; + + + let Inst{31-30} = size; + let Inst{29-27} = 0b111; + let Inst{26} = v; + let Inst{25-24} = 0b00; + let Inst{23-22} = opc; + let Inst{21} = 0b0; + let 
Inst{20-12} = SImm9; + let Inst{11-10} = 0b11; + // Inherit Rn in 9-5 + // Inherit Rt in 4-0 +} + +// Format for load-store register (unprivileged) instructions +class A64I_LSunpriv<bits<2> size, bit v, bits<2> opc, + dag outs, dag ins, string asmstr, + list<dag> patterns, InstrItinClass itin> + : A64InstRtn<outs, ins, asmstr, patterns, itin> { + bits<9> SImm9; + + + let Inst{31-30} = size; + let Inst{29-27} = 0b111; + let Inst{26} = v; + let Inst{25-24} = 0b00; + let Inst{23-22} = opc; + let Inst{21} = 0b0; + let Inst{20-12} = SImm9; + let Inst{11-10} = 0b10; + // Inherit Rn in 9-5 + // Inherit Rt in 4-0 +} + +// Format for load-store (unscaled immediate) instructions. +class A64I_LSunalimm<bits<2> size, bit v, bits<2> opc, + dag outs, dag ins, string asmstr, + list<dag> patterns, InstrItinClass itin> + : A64InstRtn<outs, ins, asmstr, patterns, itin> { + bits<9> SImm9; + + let Inst{31-30} = size; + let Inst{29-27} = 0b111; + let Inst{26} = v; + let Inst{25-24} = 0b00; + let Inst{23-22} = opc; + let Inst{21} = 0b0; + let Inst{20-12} = SImm9; + let Inst{11-10} = 0b00; + // Inherit Rn in 9-5 + // Inherit Rt in 4-0 +} + + +// Format for load-store (unsigned immediate) instructions. +class A64I_LSunsigimm<bits<2> size, bit v, bits<2> opc, + dag outs, dag ins, string asmstr, + list<dag> patterns, InstrItinClass itin> + : A64InstRtn<outs, ins, asmstr, patterns, itin> { + bits<12> UImm12; + + let Inst{31-30} = size; + let Inst{29-27} = 0b111; + let Inst{26} = v; + let Inst{25-24} = 0b01; + let Inst{23-22} = opc; + let Inst{21-10} = UImm12; +} + +// Format for load-store register (register offset) instructions. +class A64I_LSregoff<bits<2> size, bit v, bits<2> opc, bit optionlo, + dag outs, dag ins, string asmstr, + list<dag> patterns, InstrItinClass itin> + : A64InstRtn<outs, ins, asmstr, patterns, itin> { + bits<5> Rm; + + // Complex operand selection needed for these instructions, so they + // need an "addr" field for encoding/decoding to be generated. 
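+  // For example "ldr w0, [x1, x2, lsl #2]" uses option 0b011 (LSL/UXTX), with
+  // S set so the index is scaled by the access size.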
+ bits<3> Ext; + // OptionHi = Ext{2-1} + // S = Ext{0} + + let Inst{31-30} = size; + let Inst{29-27} = 0b111; + let Inst{26} = v; + let Inst{25-24} = 0b00; + let Inst{23-22} = opc; + let Inst{21} = 0b1; + let Inst{20-16} = Rm; + let Inst{15-14} = Ext{2-1}; + let Inst{13} = optionlo; + let Inst{12} = Ext{0}; + let Inst{11-10} = 0b10; + // Inherits Rn in 9-5 + // Inherits Rt in 4-0 + + let AddedComplexity = 50; +} + +// Format for Load-store register pair (offset) instructions +class A64I_LSPoffset<bits<2> opc, bit v, bit l, + dag outs, dag ins, string asmstr, + list<dag> patterns, InstrItinClass itin> + : A64InstRtt2n<outs, ins, asmstr, patterns, itin> { + bits<7> SImm7; + + let Inst{31-30} = opc; + let Inst{29-27} = 0b101; + let Inst{26} = v; + let Inst{25-23} = 0b010; + let Inst{22} = l; + let Inst{21-15} = SImm7; + // Inherit Rt2 in 14-10 + // Inherit Rn in 9-5 + // Inherit Rt in 4-0 +} + +// Format for Load-store register pair (post-indexed) instructions +class A64I_LSPpostind<bits<2> opc, bit v, bit l, + dag outs, dag ins, string asmstr, + list<dag> patterns, InstrItinClass itin> + : A64InstRtt2n<outs, ins, asmstr, patterns, itin> { + bits<7> SImm7; + + let Inst{31-30} = opc; + let Inst{29-27} = 0b101; + let Inst{26} = v; + let Inst{25-23} = 0b001; + let Inst{22} = l; + let Inst{21-15} = SImm7; + // Inherit Rt2 in 14-10 + // Inherit Rn in 9-5 + // Inherit Rt in 4-0 +} + +// Format for Load-store register pair (pre-indexed) instructions +class A64I_LSPpreind<bits<2> opc, bit v, bit l, + dag outs, dag ins, string asmstr, + list<dag> patterns, InstrItinClass itin> + : A64InstRtt2n<outs, ins, asmstr, patterns, itin> { + bits<7> SImm7; + + let Inst{31-30} = opc; + let Inst{29-27} = 0b101; + let Inst{26} = v; + let Inst{25-23} = 0b011; + let Inst{22} = l; + let Inst{21-15} = SImm7; + // Inherit Rt2 in 14-10 + // Inherit Rn in 9-5 + // Inherit Rt in 4-0 +} + +// Format for Load-store non-temporal register pair (offset) instructions +class A64I_LSPnontemp<bits<2> opc, bit v, bit l, + dag outs, dag ins, string asmstr, + list<dag> patterns, InstrItinClass itin> + : A64InstRtt2n<outs, ins, asmstr, patterns, itin> { + bits<7> SImm7; + + let Inst{31-30} = opc; + let Inst{29-27} = 0b101; + let Inst{26} = v; + let Inst{25-23} = 0b000; + let Inst{22} = l; + let Inst{21-15} = SImm7; + // Inherit Rt2 in 14-10 + // Inherit Rn in 9-5 + // Inherit Rt in 4-0 +} + +// Format for Logical (immediate) instructions +class A64I_logicalimm<bit sf, bits<2> opc, + dag outs, dag ins, string asmstr, + list<dag> patterns, InstrItinClass itin> + : A64InstRdn<outs, ins, asmstr, patterns, itin> { + bit N; + bits<6> ImmR; + bits<6> ImmS; + + // N, ImmR and ImmS have no separate existence in any assembly syntax (or for + // selection), so we'll combine them into a single field here. 
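+  // For example "and x0, x1, #0xff" should encode with N = 1, ImmR = 0b000000
+  // and ImmS = 0b000111 (a 64-bit element containing eight consecutive ones).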
+ bits<13> Imm; + // N = Imm{12}; + // ImmR = Imm{11-6}; + // ImmS = Imm{5-0}; + + let Inst{31} = sf; + let Inst{30-29} = opc; + let Inst{28-23} = 0b100100; + let Inst{22} = Imm{12}; + let Inst{21-16} = Imm{11-6}; + let Inst{15-10} = Imm{5-0}; + // Rn inherited in 9-5 + // Rd inherited in 4-0 +} + +// Format for Logical (shifted register) instructions +class A64I_logicalshift<bit sf, bits<2> opc, bits<2> shift, bit N, + dag outs, dag ins, string asmstr, + list<dag> patterns, InstrItinClass itin> + : A64InstRdnm<outs, ins, asmstr, patterns, itin> { + bits<6> Imm6; + + let Inst{31} = sf; + let Inst{30-29} = opc; + let Inst{28-24} = 0b01010; + let Inst{23-22} = shift; + let Inst{21} = N; + // Rm inherited + let Inst{15-10} = Imm6; + // Rn inherited + // Rd inherited +} + +// Format for Move wide (immediate) +class A64I_movw<bit sf, bits<2> opc, + dag outs, dag ins, string asmstr, + list<dag> patterns, InstrItinClass itin> + : A64InstRd<outs, ins, asmstr, patterns, itin> { + bits<16> UImm16; + bits<2> Shift; // Called "hw" officially + + let Inst{31} = sf; + let Inst{30-29} = opc; + let Inst{28-23} = 0b100101; + let Inst{22-21} = Shift; + let Inst{20-5} = UImm16; + // Inherits Rd in 4-0 +} + +// Format for PC-relative addressing instructions, ADR and ADRP. +class A64I_PCADR<bit op, + dag outs, dag ins, string asmstr, + list<dag> patterns, InstrItinClass itin> + : A64InstRd<outs, ins, asmstr, patterns, itin> { + bits<21> Label; + + let Inst{31} = op; + let Inst{30-29} = Label{1-0}; + let Inst{28-24} = 0b10000; + let Inst{23-5} = Label{20-2}; +} + +// Format for system instructions +class A64I_system<bit l, + dag outs, dag ins, string asmstr, + list<dag> patterns, InstrItinClass itin> + : A64Inst<outs, ins, asmstr, patterns, itin> { + bits<2> Op0; + bits<3> Op1; + bits<4> CRn; + bits<4> CRm; + bits<3> Op2; + bits<5> Rt; + + let Inst{31-22} = 0b1101010100; + let Inst{21} = l; + let Inst{20-19} = Op0; + let Inst{18-16} = Op1; + let Inst{15-12} = CRn; + let Inst{11-8} = CRm; + let Inst{7-5} = Op2; + let Inst{4-0} = Rt; + + // These instructions can do horrible things. + let hasSideEffects = 1; +} + +// Format for unconditional branch (immediate) instructions +class A64I_Bimm<bit op, + dag outs, dag ins, string asmstr, + list<dag> patterns, InstrItinClass itin> + : A64Inst<outs, ins, asmstr, patterns, itin> { + // Doubly special in not even sharing register fields with other + // instructions, so we create our own Rn here. + bits<26> Label; + + let Inst{31} = op; + let Inst{30-26} = 0b00101; + let Inst{25-0} = Label; +} + +// Format for Test & branch (immediate) instructions +class A64I_TBimm<bit op, + dag outs, dag ins, string asmstr, + list<dag> patterns, InstrItinClass itin> + : A64InstRt<outs, ins, asmstr, patterns, itin> { + // Doubly special in not even sharing register fields with other + // instructions, so we create our own Rn here. + bits<6> Imm; + bits<14> Label; + + let Inst{31} = Imm{5}; + let Inst{30-25} = 0b011011; + let Inst{24} = op; + let Inst{23-19} = Imm{4-0}; + let Inst{18-5} = Label; + // Inherit Rt in 4-0 +} + +// Format for Unconditional branch (register) instructions, including +// RET. Shares no fields with instructions further up the hierarchy +// so top-level. +class A64I_Breg<bits<4> opc, bits<5> op2, bits<6> op3, bits<5> op4, + dag outs, dag ins, string asmstr, + list<dag> patterns, InstrItinClass itin> + : A64Inst<outs, ins, asmstr, patterns, itin> { + // Doubly special in not even sharing register fields with other + // instructions, so we create our own Rn here. 
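+// This covers BR, BLR and RET; for RET the assembler lets Rn default to x30.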
+ bits<5> Rn; + + let Inst{31-25} = 0b1101011; + let Inst{24-21} = opc; + let Inst{20-16} = op2; + let Inst{15-10} = op3; + let Inst{9-5} = Rn; + let Inst{4-0} = op4; +} + diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp new file mode 100644 index 0000000..7b93463 --- /dev/null +++ b/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -0,0 +1,822 @@ +//===- AArch64InstrInfo.cpp - AArch64 Instruction Information -------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the AArch64 implementation of the TargetInstrInfo class. +// +//===----------------------------------------------------------------------===// + +#include "AArch64.h" +#include "AArch64InstrInfo.h" +#include "AArch64MachineFunctionInfo.h" +#include "AArch64TargetMachine.h" +#include "MCTargetDesc/AArch64MCTargetDesc.h" +#include "Utils/AArch64BaseInfo.h" +#include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/Function.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/TargetRegistry.h" + +#include <algorithm> + +#define GET_INSTRINFO_CTOR +#include "AArch64GenInstrInfo.inc" + +using namespace llvm; + +AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI) + : AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP), + RI(*this, STI), Subtarget(STI) {} + +void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const { + unsigned Opc = 0; + unsigned ZeroReg = 0; + if (DestReg == AArch64::XSP || SrcReg == AArch64::XSP) { + // E.g. ADD xDst, xsp, #0 (, lsl #0) + BuildMI(MBB, I, DL, get(AArch64::ADDxxi_lsl0_s), DestReg) + .addReg(SrcReg) + .addImm(0); + return; + } else if (DestReg == AArch64::WSP || SrcReg == AArch64::WSP) { + // E.g. ADD wDST, wsp, #0 (, lsl #0) + BuildMI(MBB, I, DL, get(AArch64::ADDwwi_lsl0_s), DestReg) + .addReg(SrcReg) + .addImm(0); + return; + } else if (DestReg == AArch64::NZCV) { + assert(AArch64::GPR64RegClass.contains(SrcReg)); + // E.g. MSR NZCV, xDST + BuildMI(MBB, I, DL, get(AArch64::MSRix)) + .addImm(A64SysReg::NZCV) + .addReg(SrcReg); + } else if (SrcReg == AArch64::NZCV) { + assert(AArch64::GPR64RegClass.contains(DestReg)); + // E.g. 
MRS xDST, NZCV + BuildMI(MBB, I, DL, get(AArch64::MRSxi), DestReg) + .addImm(A64SysReg::NZCV); + } else if (AArch64::GPR64RegClass.contains(DestReg)) { + assert(AArch64::GPR64RegClass.contains(SrcReg)); + Opc = AArch64::ORRxxx_lsl; + ZeroReg = AArch64::XZR; + } else if (AArch64::GPR32RegClass.contains(DestReg)) { + assert(AArch64::GPR32RegClass.contains(SrcReg)); + Opc = AArch64::ORRwww_lsl; + ZeroReg = AArch64::WZR; + } else if (AArch64::FPR32RegClass.contains(DestReg)) { + assert(AArch64::FPR32RegClass.contains(SrcReg)); + BuildMI(MBB, I, DL, get(AArch64::FMOVss), DestReg) + .addReg(SrcReg); + return; + } else if (AArch64::FPR64RegClass.contains(DestReg)) { + assert(AArch64::FPR64RegClass.contains(SrcReg)); + BuildMI(MBB, I, DL, get(AArch64::FMOVdd), DestReg) + .addReg(SrcReg); + return; + } else if (AArch64::FPR128RegClass.contains(DestReg)) { + assert(AArch64::FPR128RegClass.contains(SrcReg)); + + // FIXME: there's no good way to do this, at least without NEON: + // + There's no single move instruction for q-registers + // + We can't create a spill slot and use normal STR/LDR because stack + // allocation has already happened + // + We can't go via X-registers with FMOV because register allocation has + // already happened. + // This may not be efficient, but at least it works. + BuildMI(MBB, I, DL, get(AArch64::LSFP128_PreInd_STR), AArch64::XSP) + .addReg(SrcReg) + .addReg(AArch64::XSP) + .addImm(0x1ff & -16); + + BuildMI(MBB, I, DL, get(AArch64::LSFP128_PostInd_LDR), DestReg) + .addReg(AArch64::XSP, RegState::Define) + .addReg(AArch64::XSP) + .addImm(16); + return; + } else { + llvm_unreachable("Unknown register class in copyPhysReg"); + } + + // E.g. ORR xDst, xzr, xSrc, lsl #0 + BuildMI(MBB, I, DL, get(Opc), DestReg) + .addReg(ZeroReg) + .addReg(SrcReg) + .addImm(0); +} + +MachineInstr * +AArch64InstrInfo::emitFrameIndexDebugValue(MachineFunction &MF, int FrameIx, + uint64_t Offset, const MDNode *MDPtr, + DebugLoc DL) const { + MachineInstrBuilder MIB = BuildMI(MF, DL, get(AArch64::DBG_VALUE)) + .addFrameIndex(FrameIx).addImm(0) + .addImm(Offset) + .addMetadata(MDPtr); + return &*MIB; +} + +/// Does the Opcode represent a conditional branch that we can remove and re-add +/// at the end of a basic block? +static bool isCondBranch(unsigned Opc) { + return Opc == AArch64::Bcc || Opc == AArch64::CBZw || Opc == AArch64::CBZx || + Opc == AArch64::CBNZw || Opc == AArch64::CBNZx || + Opc == AArch64::TBZwii || Opc == AArch64::TBZxii || + Opc == AArch64::TBNZwii || Opc == AArch64::TBNZxii; +} + +/// Takes apart a given conditional branch MachineInstr (see isCondBranch), +/// setting TBB to the destination basic block and populating the Cond vector +/// with data necessary to recreate the conditional branch at a later +/// date. First element will be the opcode, and subsequent ones define the +/// conditions being branched on in an instruction-specific manner. +static void classifyCondBranch(MachineInstr *I, MachineBasicBlock *&TBB, + SmallVectorImpl<MachineOperand> &Cond) { + switch(I->getOpcode()) { + case AArch64::Bcc: + case AArch64::CBZw: + case AArch64::CBZx: + case AArch64::CBNZw: + case AArch64::CBNZx: + // These instructions just have one predicate operand in position 0 (either + // a condition code or a register being compared). 
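+    // e.g. for "cbz w0, .LBB0_2", Cond ends up holding {CBZw, w0} and TBB is
+    // the .LBB0_2 block.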
+    Cond.push_back(MachineOperand::CreateImm(I->getOpcode()));
+    Cond.push_back(I->getOperand(0));
+    TBB = I->getOperand(1).getMBB();
+    return;
+  case AArch64::TBZwii:
+  case AArch64::TBZxii:
+  case AArch64::TBNZwii:
+  case AArch64::TBNZxii:
+    // These have two predicate operands: a register and a bit position.
+    Cond.push_back(MachineOperand::CreateImm(I->getOpcode()));
+    Cond.push_back(I->getOperand(0));
+    Cond.push_back(I->getOperand(1));
+    TBB = I->getOperand(2).getMBB();
+    return;
+  default:
+    llvm_unreachable("Unknown conditional branch to classify");
+  }
+}
+
+
+bool
+AArch64InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB,
+                                MachineBasicBlock *&FBB,
+                                SmallVectorImpl<MachineOperand> &Cond,
+                                bool AllowModify) const {
+  // If the block has no terminators, it just falls into the block after it.
+  MachineBasicBlock::iterator I = MBB.end();
+  if (I == MBB.begin())
+    return false;
+  --I;
+  while (I->isDebugValue()) {
+    if (I == MBB.begin())
+      return false;
+    --I;
+  }
+  if (!isUnpredicatedTerminator(I))
+    return false;
+
+  // Get the last instruction in the block.
+  MachineInstr *LastInst = I;
+
+  // If there is only one terminator instruction, process it.
+  unsigned LastOpc = LastInst->getOpcode();
+  if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) {
+    if (LastOpc == AArch64::Bimm) {
+      TBB = LastInst->getOperand(0).getMBB();
+      return false;
+    }
+    if (isCondBranch(LastOpc)) {
+      classifyCondBranch(LastInst, TBB, Cond);
+      return false;
+    }
+    return true;  // Can't handle indirect branch.
+  }
+
+  // Get the instruction before it if it is a terminator.
+  MachineInstr *SecondLastInst = I;
+  unsigned SecondLastOpc = SecondLastInst->getOpcode();
+
+  // If AllowModify is true and the block ends with two or more unconditional
+  // branches, delete all but the first unconditional branch.
+  if (AllowModify && LastOpc == AArch64::Bimm) {
+    while (SecondLastOpc == AArch64::Bimm) {
+      LastInst->eraseFromParent();
+      LastInst = SecondLastInst;
+      LastOpc = LastInst->getOpcode();
+      if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) {
+        // Return now; the only terminator is an unconditional branch.
+        TBB = LastInst->getOperand(0).getMBB();
+        return false;
+      } else {
+        SecondLastInst = I;
+        SecondLastOpc = SecondLastInst->getOpcode();
+      }
+    }
+  }
+
+  // If there are three terminators, we don't know what sort of block this is.
+  if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(--I))
+    return true;
+
+  // If the block ends with a B and a Bcc, handle it.
+  if (LastOpc == AArch64::Bimm) {
+    if (SecondLastOpc == AArch64::Bcc) {
+      TBB = SecondLastInst->getOperand(1).getMBB();
+      Cond.push_back(MachineOperand::CreateImm(AArch64::Bcc));
+      Cond.push_back(SecondLastInst->getOperand(0));
+      FBB = LastInst->getOperand(0).getMBB();
+      return false;
+    } else if (isCondBranch(SecondLastOpc)) {
+      classifyCondBranch(SecondLastInst, TBB, Cond);
+      FBB = LastInst->getOperand(0).getMBB();
+      return false;
+    }
+  }
+
+  // If the block ends with two unconditional branches, handle it. The second
+  // one is not executed, so remove it.
+  if (SecondLastOpc == AArch64::Bimm && LastOpc == AArch64::Bimm) {
+    TBB = SecondLastInst->getOperand(0).getMBB();
+    I = LastInst;
+    if (AllowModify)
+      I->eraseFromParent();
+    return false;
+  }
+
+  // Otherwise, can't handle this.
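+  // (For example, a block whose last two terminators are both conditional
+  // branches, or one whose final terminator is an indirect branch preceded
+  // by another terminator.)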
+ return true; +} + +bool AArch64InstrInfo::ReverseBranchCondition( + SmallVectorImpl<MachineOperand> &Cond) const { + switch (Cond[0].getImm()) { + case AArch64::Bcc: { + A64CC::CondCodes CC = static_cast<A64CC::CondCodes>(Cond[1].getImm()); + CC = A64InvertCondCode(CC); + Cond[1].setImm(CC); + return false; + } + case AArch64::CBZw: + Cond[0].setImm(AArch64::CBNZw); + return false; + case AArch64::CBZx: + Cond[0].setImm(AArch64::CBNZx); + return false; + case AArch64::CBNZw: + Cond[0].setImm(AArch64::CBZw); + return false; + case AArch64::CBNZx: + Cond[0].setImm(AArch64::CBZx); + return false; + case AArch64::TBZwii: + Cond[0].setImm(AArch64::TBNZwii); + return false; + case AArch64::TBZxii: + Cond[0].setImm(AArch64::TBNZxii); + return false; + case AArch64::TBNZwii: + Cond[0].setImm(AArch64::TBZwii); + return false; + case AArch64::TBNZxii: + Cond[0].setImm(AArch64::TBZxii); + return false; + default: + llvm_unreachable("Unknown branch type"); + } +} + + +unsigned +AArch64InstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, + MachineBasicBlock *FBB, + const SmallVectorImpl<MachineOperand> &Cond, + DebugLoc DL) const { + if (FBB == 0 && Cond.empty()) { + BuildMI(&MBB, DL, get(AArch64::Bimm)).addMBB(TBB); + return 1; + } else if (FBB == 0) { + MachineInstrBuilder MIB = BuildMI(&MBB, DL, get(Cond[0].getImm())); + for (int i = 1, e = Cond.size(); i != e; ++i) + MIB.addOperand(Cond[i]); + MIB.addMBB(TBB); + return 1; + } + + MachineInstrBuilder MIB = BuildMI(&MBB, DL, get(Cond[0].getImm())); + for (int i = 1, e = Cond.size(); i != e; ++i) + MIB.addOperand(Cond[i]); + MIB.addMBB(TBB); + + BuildMI(&MBB, DL, get(AArch64::Bimm)).addMBB(FBB); + return 2; +} + +unsigned AArch64InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { + MachineBasicBlock::iterator I = MBB.end(); + if (I == MBB.begin()) return 0; + --I; + while (I->isDebugValue()) { + if (I == MBB.begin()) + return 0; + --I; + } + if (I->getOpcode() != AArch64::Bimm && !isCondBranch(I->getOpcode())) + return 0; + + // Remove the branch. + I->eraseFromParent(); + + I = MBB.end(); + + if (I == MBB.begin()) return 1; + --I; + if (!isCondBranch(I->getOpcode())) + return 1; + + // Remove the branch. 
+  I->eraseFromParent();
+  return 2;
+}
+
+bool
+AArch64InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MBBI) const {
+  MachineInstr &MI = *MBBI;
+  MachineBasicBlock &MBB = *MI.getParent();
+
+  unsigned Opcode = MI.getOpcode();
+  switch (Opcode) {
+  case AArch64::TLSDESC_BLRx: {
+    MachineInstr *NewMI =
+      BuildMI(MBB, MBBI, MI.getDebugLoc(), get(AArch64::TLSDESCCALL))
+        .addOperand(MI.getOperand(1));
+    MI.setDesc(get(AArch64::BLRx));
+
+    llvm::finalizeBundle(MBB, NewMI, *++MBBI);
+    return true;
+  }
+  default:
+    return false;
+  }
+
+  return false;
+}
+
+void
+AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
+                                      MachineBasicBlock::iterator MBBI,
+                                      unsigned SrcReg, bool isKill,
+                                      int FrameIdx,
+                                      const TargetRegisterClass *RC,
+                                      const TargetRegisterInfo *TRI) const {
+  DebugLoc DL = MBB.findDebugLoc(MBBI);
+  MachineFunction &MF = *MBB.getParent();
+  MachineFrameInfo &MFI = *MF.getFrameInfo();
+  unsigned Align = MFI.getObjectAlignment(FrameIdx);
+
+  MachineMemOperand *MMO
+    = MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FrameIdx),
+                              MachineMemOperand::MOStore,
+                              MFI.getObjectSize(FrameIdx),
+                              Align);
+
+  unsigned StoreOp = 0;
+  if (RC->hasType(MVT::i64) || RC->hasType(MVT::i32)) {
+    switch(RC->getSize()) {
+    case 4: StoreOp = AArch64::LS32_STR; break;
+    case 8: StoreOp = AArch64::LS64_STR; break;
+    default:
+      llvm_unreachable("Unknown size for regclass");
+    }
+  } else {
+    assert((RC->hasType(MVT::f32) || RC->hasType(MVT::f64) ||
+            RC->hasType(MVT::f128))
+           && "Expected integer or floating type for store");
+    switch (RC->getSize()) {
+    case 4: StoreOp = AArch64::LSFP32_STR; break;
+    case 8: StoreOp = AArch64::LSFP64_STR; break;
+    case 16: StoreOp = AArch64::LSFP128_STR; break;
+    default:
+      llvm_unreachable("Unknown size for regclass");
+    }
+  }
+
+  MachineInstrBuilder NewMI = BuildMI(MBB, MBBI, DL, get(StoreOp));
+  NewMI.addReg(SrcReg, getKillRegState(isKill))
+    .addFrameIndex(FrameIdx)
+    .addImm(0)
+    .addMemOperand(MMO);
+
+}
+
+void
+AArch64InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+                                       MachineBasicBlock::iterator MBBI,
+                                       unsigned DestReg, int FrameIdx,
+                                       const TargetRegisterClass *RC,
+                                       const TargetRegisterInfo *TRI) const {
+  DebugLoc DL = MBB.findDebugLoc(MBBI);
+  MachineFunction &MF = *MBB.getParent();
+  MachineFrameInfo &MFI = *MF.getFrameInfo();
+  unsigned Align = MFI.getObjectAlignment(FrameIdx);
+
+  MachineMemOperand *MMO
+    = MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FrameIdx),
+                              MachineMemOperand::MOLoad,
+                              MFI.getObjectSize(FrameIdx),
+                              Align);
+
+  unsigned LoadOp = 0;
+  if (RC->hasType(MVT::i64) || RC->hasType(MVT::i32)) {
+    switch(RC->getSize()) {
+    case 4: LoadOp = AArch64::LS32_LDR; break;
+    case 8: LoadOp = AArch64::LS64_LDR; break;
+    default:
+      llvm_unreachable("Unknown size for regclass");
+    }
+  } else {
+    assert((RC->hasType(MVT::f32) || RC->hasType(MVT::f64)
+            || RC->hasType(MVT::f128))
+           && "Expected integer or floating type for load");
+    switch (RC->getSize()) {
+    case 4: LoadOp = AArch64::LSFP32_LDR; break;
+    case 8: LoadOp = AArch64::LSFP64_LDR; break;
+    case 16: LoadOp = AArch64::LSFP128_LDR; break;
+    default:
+      llvm_unreachable("Unknown size for regclass");
+    }
+  }
+
+  MachineInstrBuilder NewMI = BuildMI(MBB, MBBI, DL, get(LoadOp), DestReg);
+  NewMI.addFrameIndex(FrameIdx)
+       .addImm(0)
+       .addMemOperand(MMO);
+}
+
+unsigned AArch64InstrInfo::estimateRSStackLimit(MachineFunction &MF) const {
+  unsigned Limit = (1 << 16) - 1;
+  for (MachineFunction::iterator BB = MF.begin(),E = MF.end(); BB != E; ++BB) {
+    for 
(MachineBasicBlock::iterator I = BB->begin(), E = BB->end();
+         I != E; ++I) {
+      for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
+        if (!I->getOperand(i).isFI()) continue;
+
+        // When using ADDxxi_lsl0_s to get the address of a stack object, 0xfff
+        // is the largest offset guaranteed to fit in the immediate offset.
+        if (I->getOpcode() == AArch64::ADDxxi_lsl0_s) {
+          Limit = std::min(Limit, 0xfffu);
+          break;
+        }
+
+        int AccessScale, MinOffset, MaxOffset;
+        getAddressConstraints(*I, AccessScale, MinOffset, MaxOffset);
+        Limit = std::min(Limit, static_cast<unsigned>(MaxOffset));
+
+        break; // At most one FI per instruction
+      }
+    }
+  }
+
+  return Limit;
+}
+void AArch64InstrInfo::getAddressConstraints(const MachineInstr &MI,
+                                             int &AccessScale, int &MinOffset,
+                                             int &MaxOffset) const {
+  switch (MI.getOpcode()) {
+  default: llvm_unreachable("Unknown load/store kind");
+  case TargetOpcode::DBG_VALUE:
+    AccessScale = 1;
+    MinOffset = INT_MIN;
+    MaxOffset = INT_MAX;
+    return;
+  case AArch64::LS8_LDR: case AArch64::LS8_STR:
+  case AArch64::LSFP8_LDR: case AArch64::LSFP8_STR:
+  case AArch64::LDRSBw:
+  case AArch64::LDRSBx:
+    AccessScale = 1;
+    MinOffset = 0;
+    MaxOffset = 0xfff;
+    return;
+  case AArch64::LS16_LDR: case AArch64::LS16_STR:
+  case AArch64::LSFP16_LDR: case AArch64::LSFP16_STR:
+  case AArch64::LDRSHw:
+  case AArch64::LDRSHx:
+    AccessScale = 2;
+    MinOffset = 0;
+    MaxOffset = 0xfff * AccessScale;
+    return;
+  case AArch64::LS32_LDR: case AArch64::LS32_STR:
+  case AArch64::LSFP32_LDR: case AArch64::LSFP32_STR:
+  case AArch64::LDRSWx:
+  case AArch64::LDPSWx:
+    AccessScale = 4;
+    MinOffset = 0;
+    MaxOffset = 0xfff * AccessScale;
+    return;
+  case AArch64::LS64_LDR: case AArch64::LS64_STR:
+  case AArch64::LSFP64_LDR: case AArch64::LSFP64_STR:
+  case AArch64::PRFM:
+    AccessScale = 8;
+    MinOffset = 0;
+    MaxOffset = 0xfff * AccessScale;
+    return;
+  case AArch64::LSFP128_LDR: case AArch64::LSFP128_STR:
+    AccessScale = 16;
+    MinOffset = 0;
+    MaxOffset = 0xfff * AccessScale;
+    return;
+  case AArch64::LSPair32_LDR: case AArch64::LSPair32_STR:
+  case AArch64::LSFPPair32_LDR: case AArch64::LSFPPair32_STR:
+    AccessScale = 4;
+    MinOffset = -0x40 * AccessScale;
+    MaxOffset = 0x3f * AccessScale;
+    return;
+  case AArch64::LSPair64_LDR: case AArch64::LSPair64_STR:
+  case AArch64::LSFPPair64_LDR: case AArch64::LSFPPair64_STR:
+    AccessScale = 8;
+    MinOffset = -0x40 * AccessScale;
+    MaxOffset = 0x3f * AccessScale;
+    return;
+  case AArch64::LSFPPair128_LDR: case AArch64::LSFPPair128_STR:
+    AccessScale = 16;
+    MinOffset = -0x40 * AccessScale;
+    MaxOffset = 0x3f * AccessScale;
+    return;
+  }
+}
+
+unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
+  const MCInstrDesc &MCID = MI.getDesc();
+  const MachineBasicBlock &MBB = *MI.getParent();
+  const MachineFunction &MF = *MBB.getParent();
+  const MCAsmInfo &MAI = *MF.getTarget().getMCAsmInfo();
+
+  if (MCID.getSize())
+    return MCID.getSize();
+
+  if (MI.getOpcode() == AArch64::INLINEASM)
+    return getInlineAsmLength(MI.getOperand(0).getSymbolName(), MAI);
+
+  if (MI.isLabel())
+    return 0;
+
+  switch (MI.getOpcode()) {
+  case TargetOpcode::BUNDLE:
+    return getInstBundleLength(MI);
+  case TargetOpcode::IMPLICIT_DEF:
+  case TargetOpcode::KILL:
+  case TargetOpcode::PROLOG_LABEL:
+  case TargetOpcode::EH_LABEL:
+  case TargetOpcode::DBG_VALUE:
+    return 0;
+  case AArch64::TLSDESCCALL:
+    return 0;
+  default:
+    llvm_unreachable("Unknown instruction class");
+  }
+}
+
+unsigned AArch64InstrInfo::getInstBundleLength(const MachineInstr &MI) const 
{
+  unsigned Size = 0;
+  MachineBasicBlock::const_instr_iterator I = MI;
+  MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
+  while (++I != E && I->isInsideBundle()) {
+    assert(!I->isBundle() && "No nested bundle!");
+    Size += getInstSizeInBytes(*I);
+  }
+  return Size;
+}
+
+bool llvm::rewriteA64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
+                                unsigned FrameReg, int &Offset,
+                                const AArch64InstrInfo &TII) {
+  MachineBasicBlock &MBB = *MI.getParent();
+  MachineFunction &MF = *MBB.getParent();
+  MachineFrameInfo &MFI = *MF.getFrameInfo();
+
+  MFI.getObjectOffset(FrameRegIdx);
+  llvm_unreachable("Unimplemented rewriteFrameIndex");
+}
+
+void llvm::emitRegUpdate(MachineBasicBlock &MBB,
+                         MachineBasicBlock::iterator MBBI,
+                         DebugLoc dl, const TargetInstrInfo &TII,
+                         unsigned DstReg, unsigned SrcReg, unsigned ScratchReg,
+                         int64_t NumBytes, MachineInstr::MIFlag MIFlags) {
+  if (NumBytes == 0 && DstReg == SrcReg)
+    return;
+  else if (abs(NumBytes) & ~0xffffff) {
+    // Generically, we have to materialize the offset into a temporary register
+    // and add or subtract it. There are a couple of ways this could be done;
+    // for now we materialize |NumBytes| with a movz/movk sequence (e.g.
+    // 0x12345000 becomes movz xTMP, #0x5000; movk xTMP, #0x1234, lsl #16).
+    uint64_t Bits = static_cast<uint64_t>(abs(NumBytes));
+    BuildMI(MBB, MBBI, dl, TII.get(AArch64::MOVZxii), ScratchReg)
+      .addImm(0xffff & Bits).addImm(0)
+      .setMIFlags(MIFlags);
+
+    Bits >>= 16;
+    if (Bits & 0xffff) {
+      BuildMI(MBB, MBBI, dl, TII.get(AArch64::MOVKxii), ScratchReg)
+        .addReg(ScratchReg)
+        .addImm(0xffff & Bits).addImm(1)
+        .setMIFlags(MIFlags);
+    }
+
+    Bits >>= 16;
+    if (Bits & 0xffff) {
+      BuildMI(MBB, MBBI, dl, TII.get(AArch64::MOVKxii), ScratchReg)
+        .addReg(ScratchReg)
+        .addImm(0xffff & Bits).addImm(2)
+        .setMIFlags(MIFlags);
+    }
+
+    Bits >>= 16;
+    if (Bits & 0xffff) {
+      BuildMI(MBB, MBBI, dl, TII.get(AArch64::MOVKxii), ScratchReg)
+        .addReg(ScratchReg)
+        .addImm(0xffff & Bits).addImm(3)
+        .setMIFlags(MIFlags);
+    }
+
+    // ADD DST, SRC, xTMP (, lsl #0)
+    unsigned AddOp = NumBytes > 0 ? AArch64::ADDxxx_uxtx : AArch64::SUBxxx_uxtx;
+    BuildMI(MBB, MBBI, dl, TII.get(AddOp), DstReg)
+      .addReg(SrcReg, RegState::Kill)
+      .addReg(ScratchReg, RegState::Kill)
+      .addImm(0)
+      .setMIFlag(MIFlags);
+    return;
+  }
+
+  // Now we know that the adjustment can be done in at most two add/sub
+  // (immediate) instructions, which is always more efficient than a
+  // literal-pool load, or even a hypothetical movz/movk/add sequence.
+
+  // Decide whether we're doing addition or subtraction.
+  unsigned LowOp, HighOp;
+  if (NumBytes >= 0) {
+    LowOp = AArch64::ADDxxi_lsl0_s;
+    HighOp = AArch64::ADDxxi_lsl12_s;
+  } else {
+    LowOp = AArch64::SUBxxi_lsl0_s;
+    HighOp = AArch64::SUBxxi_lsl12_s;
+    NumBytes = abs(NumBytes);
+  }
+
+  // If we're here, at the very least a move needs to be produced, which just
+  // happens to be materializable by an ADD.
+  if ((NumBytes & 0xfff) || NumBytes == 0) {
+    BuildMI(MBB, MBBI, dl, TII.get(LowOp), DstReg)
+      .addReg(SrcReg, RegState::Kill)
+      .addImm(NumBytes & 0xfff)
+      .setMIFlag(MIFlags);
+
+    // Next update should use the register we've just defined.
+    SrcReg = DstReg;
+  }
+
+  if (NumBytes & 0xfff000) {
+    BuildMI(MBB, MBBI, dl, TII.get(HighOp), DstReg)
+      .addReg(SrcReg, RegState::Kill)
+      .addImm(NumBytes >> 12)
+      .setMIFlag(MIFlags);
+  }
+}
+
+void llvm::emitSPUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+                        DebugLoc dl, const TargetInstrInfo &TII,
+                        unsigned ScratchReg, int64_t NumBytes,
+                        MachineInstr::MIFlag MIFlags) {
+  emitRegUpdate(MBB, MI, dl, TII, AArch64::XSP, AArch64::XSP, AArch64::X16,
+                NumBytes, MIFlags);
+}
+
+
+namespace {
+  struct LDTLSCleanup : public MachineFunctionPass {
+    static char ID;
+    LDTLSCleanup() : MachineFunctionPass(ID) {}
+
+    virtual bool runOnMachineFunction(MachineFunction &MF) {
+      AArch64MachineFunctionInfo* MFI
+        = MF.getInfo<AArch64MachineFunctionInfo>();
+      if (MFI->getNumLocalDynamicTLSAccesses() < 2) {
+        // No point folding accesses if there aren't at least two.
+        return false;
+      }
+
+      MachineDominatorTree *DT = &getAnalysis<MachineDominatorTree>();
+      return VisitNode(DT->getRootNode(), 0);
+    }
+
+    // Visit the dominator subtree rooted at Node in pre-order.
+    // If TLSBaseAddrReg is non-null, then use that to replace any
+    // TLS_base_addr instructions. Otherwise, create the register
+    // when the first such instruction is seen, and then use it
+    // as we encounter more instructions.
+    bool VisitNode(MachineDomTreeNode *Node, unsigned TLSBaseAddrReg) {
+      MachineBasicBlock *BB = Node->getBlock();
+      bool Changed = false;
+
+      // Traverse the current block.
+      for (MachineBasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;
+           ++I) {
+        switch (I->getOpcode()) {
+        case AArch64::TLSDESC_BLRx:
+          // Make sure it's a local dynamic access.
+          if (!I->getOperand(1).isSymbol() ||
+              strcmp(I->getOperand(1).getSymbolName(), "_TLS_MODULE_BASE_"))
+            break;
+
+          if (TLSBaseAddrReg)
+            I = ReplaceTLSBaseAddrCall(I, TLSBaseAddrReg);
+          else
+            I = SetRegister(I, &TLSBaseAddrReg);
+          Changed = true;
+          break;
+        default:
+          break;
+        }
+      }
+
+      // Visit the children of this block in the dominator tree.
+      for (MachineDomTreeNode::iterator I = Node->begin(), E = Node->end();
+           I != E; ++I) {
+        Changed |= VisitNode(*I, TLSBaseAddrReg);
+      }
+
+      return Changed;
+    }
+
+    // Replace the TLS_base_addr instruction I with a copy from
+    // TLSBaseAddrReg, returning the new instruction.
+    MachineInstr *ReplaceTLSBaseAddrCall(MachineInstr *I,
+                                         unsigned TLSBaseAddrReg) {
+      MachineFunction *MF = I->getParent()->getParent();
+      const AArch64TargetMachine *TM =
+          static_cast<const AArch64TargetMachine *>(&MF->getTarget());
+      const AArch64InstrInfo *TII = TM->getInstrInfo();
+
+      // Insert a Copy from TLSBaseAddrReg to x0, which is where the rest of the
+      // code sequence assumes the address will be.
+      MachineInstr *Copy = BuildMI(*I->getParent(), I, I->getDebugLoc(),
+                                   TII->get(TargetOpcode::COPY),
+                                   AArch64::X0)
+                               .addReg(TLSBaseAddrReg);
+
+      // Erase the TLS_base_addr instruction.
+      I->eraseFromParent();
+
+      return Copy;
+    }
+
+    // Create a virtual register in *TLSBaseAddrReg, and populate it by
+    // inserting a copy instruction after I. Returns the new instruction.
+    MachineInstr *SetRegister(MachineInstr *I, unsigned *TLSBaseAddrReg) {
+      MachineFunction *MF = I->getParent()->getParent();
+      const AArch64TargetMachine *TM =
+          static_cast<const AArch64TargetMachine *>(&MF->getTarget());
+      const AArch64InstrInfo *TII = TM->getInstrInfo();
+
+      // Create a virtual register for the TLS base address.
+ MachineRegisterInfo &RegInfo = MF->getRegInfo(); + *TLSBaseAddrReg = RegInfo.createVirtualRegister(&AArch64::GPR64RegClass); + + // Insert a copy from X0 to TLSBaseAddrReg for later. + MachineInstr *Next = I->getNextNode(); + MachineInstr *Copy = BuildMI(*I->getParent(), Next, I->getDebugLoc(), + TII->get(TargetOpcode::COPY), + *TLSBaseAddrReg) + .addReg(AArch64::X0); + + return Copy; + } + + virtual const char *getPassName() const { + return "Local Dynamic TLS Access Clean-up"; + } + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesCFG(); + AU.addRequired<MachineDominatorTree>(); + MachineFunctionPass::getAnalysisUsage(AU); + } + }; +} + +char LDTLSCleanup::ID = 0; +FunctionPass* +llvm::createAArch64CleanupLocalDynamicTLSPass() { return new LDTLSCleanup(); } diff --git a/lib/Target/AArch64/AArch64InstrInfo.h b/lib/Target/AArch64/AArch64InstrInfo.h new file mode 100644 index 0000000..22a2ab4 --- /dev/null +++ b/lib/Target/AArch64/AArch64InstrInfo.h @@ -0,0 +1,112 @@ +//===- AArch64InstrInfo.h - AArch64 Instruction Information -----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the AArch64 implementation of the TargetInstrInfo class. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TARGET_AARCH64INSTRINFO_H +#define LLVM_TARGET_AARCH64INSTRINFO_H + +#include "llvm/Target/TargetInstrInfo.h" +#include "AArch64RegisterInfo.h" + +#define GET_INSTRINFO_HEADER +#include "AArch64GenInstrInfo.inc" + +namespace llvm { + +class AArch64Subtarget; + +class AArch64InstrInfo : public AArch64GenInstrInfo { + const AArch64RegisterInfo RI; + const AArch64Subtarget &Subtarget; +public: + explicit AArch64InstrInfo(const AArch64Subtarget &TM); + + /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As + /// such, whenever a client has an instance of instruction info, it should + /// always be able to get register info as well (through this method). 
+  ///
+  const TargetRegisterInfo &getRegisterInfo() const { return RI; }
+
+  const AArch64Subtarget &getSubTarget() const { return Subtarget; }
+
+  void copyPhysReg(MachineBasicBlock &MBB,
+                   MachineBasicBlock::iterator I, DebugLoc DL,
+                   unsigned DestReg, unsigned SrcReg,
+                   bool KillSrc) const;
+
+  MachineInstr *emitFrameIndexDebugValue(MachineFunction &MF, int FrameIx,
+                                         uint64_t Offset, const MDNode *MDPtr,
+                                         DebugLoc DL) const;
+
+  void storeRegToStackSlot(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MI,
+                           unsigned SrcReg, bool isKill, int FrameIndex,
+                           const TargetRegisterClass *RC,
+                           const TargetRegisterInfo *TRI) const;
+  void loadRegFromStackSlot(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator MBBI,
+                            unsigned DestReg, int FrameIdx,
+                            const TargetRegisterClass *RC,
+                            const TargetRegisterInfo *TRI) const;
+
+  bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+                     MachineBasicBlock *&FBB,
+                     SmallVectorImpl<MachineOperand> &Cond,
+                     bool AllowModify = false) const;
+  unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+                        MachineBasicBlock *FBB,
+                        const SmallVectorImpl<MachineOperand> &Cond,
+                        DebugLoc DL) const;
+  unsigned RemoveBranch(MachineBasicBlock &MBB) const;
+  bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const;
+
+  bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const;
+
+  /// Look through the instructions in this function and work out the largest
+  /// the stack frame can be while maintaining the ability to address local
+  /// slots with no complexities.
+  unsigned estimateRSStackLimit(MachineFunction &MF) const;
+
+  /// getAddressConstraints - For loads and stores (and PRFMs) taking an
+  /// immediate offset, this function determines the constraints required for
+  /// the immediate. It must satisfy:
+  ///    + MinOffset <= imm <= MaxOffset
+  ///    + imm % AccessScale == 0
+  void getAddressConstraints(const MachineInstr &MI, int &AccessScale,
+                             int &MinOffset, int &MaxOffset) const;
+
+
+  unsigned getInstSizeInBytes(const MachineInstr &MI) const;
+
+  unsigned getInstBundleLength(const MachineInstr &MI) const;
+
+};
+
+bool rewriteA64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
+                          unsigned FrameReg, int &Offset,
+                          const AArch64InstrInfo &TII);
+
+
+void emitRegUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+                   DebugLoc dl, const TargetInstrInfo &TII,
+                   unsigned DstReg, unsigned SrcReg, unsigned ScratchReg,
+                   int64_t NumBytes,
+                   MachineInstr::MIFlag MIFlags = MachineInstr::NoFlags);
+
+void emitSPUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+                  DebugLoc dl, const TargetInstrInfo &TII,
+                  unsigned ScratchReg, int64_t NumBytes,
+                  MachineInstr::MIFlag MIFlags = MachineInstr::NoFlags);
+
+}
+
+#endif
diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td
new file mode 100644
index 0000000..562a7f6
--- /dev/null
+++ b/lib/Target/AArch64/AArch64InstrInfo.td
@@ -0,0 +1,5109 @@
+//===----- AArch64InstrInfo.td - AArch64 Instruction Info ----*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the AArch64 scalar instructions in TableGen format.
+// +//===----------------------------------------------------------------------===// + +include "AArch64InstrFormats.td" + +//===----------------------------------------------------------------------===// +// Target-specific ISD nodes and profiles +//===----------------------------------------------------------------------===// + +def SDT_A64ret : SDTypeProfile<0, 0, []>; +def A64ret : SDNode<"AArch64ISD::Ret", SDT_A64ret, [SDNPHasChain, + SDNPOptInGlue, + SDNPVariadic]>; + +// (ins NZCV, Condition, Dest) +def SDT_A64br_cc : SDTypeProfile<0, 3, [SDTCisVT<0, i32>]>; +def A64br_cc : SDNode<"AArch64ISD::BR_CC", SDT_A64br_cc, [SDNPHasChain]>; + +// (outs Result), (ins NZCV, IfTrue, IfFalse, Condition) +def SDT_A64select_cc : SDTypeProfile<1, 4, [SDTCisVT<1, i32>, + SDTCisSameAs<0, 2>, + SDTCisSameAs<2, 3>]>; +def A64select_cc : SDNode<"AArch64ISD::SELECT_CC", SDT_A64select_cc>; + +// (outs NZCV), (ins LHS, RHS, Condition) +def SDT_A64setcc : SDTypeProfile<1, 3, [SDTCisVT<0, i32>, + SDTCisSameAs<1, 2>]>; +def A64setcc : SDNode<"AArch64ISD::SETCC", SDT_A64setcc>; + + +// (outs GPR64), (ins) +def A64threadpointer : SDNode<"AArch64ISD::THREAD_POINTER", SDTPtrLeaf>; + +// A64 compares don't care about the cond really (they set all flags) so a +// simple binary operator is useful. +def A64cmp : PatFrag<(ops node:$lhs, node:$rhs), + (A64setcc node:$lhs, node:$rhs, cond)>; + + +// When matching a notional (CMP op1, (sub 0, op2)), we'd like to use a CMN +// instruction on the grounds that "op1 - (-op2) == op1 + op2". However, the C +// and V flags can be set differently by this operation. It comes down to +// whether "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are +// then everything is fine. If not then the optimization is wrong. Thus general +// comparisons are only valid if op2 != 0. + +// So, finally, the only LLVM-native comparisons that don't mention C and V are +// SETEQ and SETNE. They're the only ones we can safely use CMN for in the +// absence of information about op2. +def equality_cond : PatLeaf<(cond), [{ + return N->get() == ISD::SETEQ || N->get() == ISD::SETNE; +}]>; + +def A64cmn : PatFrag<(ops node:$lhs, node:$rhs), + (A64setcc node:$lhs, (sub 0, node:$rhs), equality_cond)>; + +// There are two layers of indirection here, driven by the following +// considerations. +// + TableGen does not know CodeModel or Reloc so that decision should be +// made for a variable/address at ISelLowering. +// + The output of ISelLowering should be selectable (hence the Wrapper, +// rather than a bare target opcode) +def SDTAArch64Wrapper : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, + SDTCisSameAs<1, 2>, + SDTCisVT<3, i32>, + SDTCisPtrTy<0>]>; + +def A64WrapperSmall : SDNode<"AArch64ISD::WrapperSmall", SDTAArch64Wrapper>; + + +def SDTAArch64GOTLoad : SDTypeProfile<1, 1, [SDTCisPtrTy<0>, SDTCisPtrTy<1>]>; +def A64GOTLoad : SDNode<"AArch64ISD::GOTLoad", SDTAArch64GOTLoad, + [SDNPHasChain]>; + + +// (A64BFI LHS, RHS, LSB, Width) +def SDTA64BFI : SDTypeProfile<1, 4, [SDTCisSameAs<0, 1>, + SDTCisSameAs<1, 2>, + SDTCisVT<3, i64>, + SDTCisVT<4, i64>]>; + +def A64Bfi : SDNode<"AArch64ISD::BFI", SDTA64BFI>; + +// (A64EXTR HiReg, LoReg, LSB) +def SDTA64EXTR : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>, + SDTCisVT<3, i64>]>; +def A64Extr : SDNode<"AArch64ISD::EXTR", SDTA64EXTR>; + +// (A64[SU]BFX Field, ImmR, ImmS). +// +// Note that ImmR and ImmS are already encoded for the actual instructions. 
The
+// more natural LSB and Width mix together to form ImmR and ImmS, something
+// which TableGen can't handle.
+def SDTA64BFX : SDTypeProfile<1, 3, [SDTCisVT<2, i64>, SDTCisVT<3, i64>]>;
+def A64Sbfx : SDNode<"AArch64ISD::SBFX", SDTA64BFX>;
+
+def A64Ubfx : SDNode<"AArch64ISD::UBFX", SDTA64BFX>;
+
+//===----------------------------------------------------------------------===//
+// Call sequence pseudo-instructions
+//===----------------------------------------------------------------------===//
+
+
+def SDT_AArch64Call : SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>;
+def AArch64Call : SDNode<"AArch64ISD::Call", SDT_AArch64Call,
+                     [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, SDNPVariadic]>;
+
+def AArch64tcret : SDNode<"AArch64ISD::TC_RETURN", SDT_AArch64Call,
+                          [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+
+// The TLSDESCCALL node is a variant call which goes to an indirectly calculated
+// destination but needs a relocation against a fixed symbol. As such it has two
+// certain operands: the callee and the relocated variable.
+//
+// The TLS ABI only allows it to be selected to a BLR instruction (with
+// appropriate relocation).
+def SDTTLSDescCall : SDTypeProfile<0, -2, [SDTCisPtrTy<0>, SDTCisPtrTy<1>]>;
+
+def A64tlsdesc_blr : SDNode<"AArch64ISD::TLSDESCCALL", SDTTLSDescCall,
+                            [SDNPInGlue, SDNPOutGlue, SDNPHasChain,
+                             SDNPVariadic]>;
+
+
+def SDT_AArch64CallSeqStart : SDCallSeqStart<[ SDTCisPtrTy<0> ]>;
+def AArch64callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_AArch64CallSeqStart,
+                                  [SDNPHasChain, SDNPOutGlue]>;
+
+def SDT_AArch64CallSeqEnd : SDCallSeqEnd<[ SDTCisPtrTy<0>, SDTCisPtrTy<1> ]>;
+def AArch64callseq_end : SDNode<"ISD::CALLSEQ_END", SDT_AArch64CallSeqEnd,
+                                [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
+
+
+// These pseudo-instructions have special semantics by virtue of being passed to
+// the InstrInfo constructor. CALLSEQ_START/CALLSEQ_END are produced by
+// LowerCall to (in our case) tell the back-end about stack adjustments for
+// arguments passed on the stack. Here we select those markers to
+// pseudo-instructions which explicitly set the stack, and finally in the
+// RegisterInfo we convert them to a true stack adjustment.
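A rough sketch of that final conversion may help (hedged: this is illustrative code, not the in-tree implementation, which also has to deal with reserved call frames and stack alignment). The idea is that each pseudo's immediate becomes a real SP adjustment via the emitSPUpdate helper declared in AArch64InstrInfo.h above; the function name and structure here are assumptions for illustration only:

    // Sketch: turning the ADJCALLSTACKDOWN/UP pseudos defined below into
    // real stack-pointer arithmetic once frame layout is known.
    #include "AArch64InstrInfo.h"

    static void eliminateCallFramePseudo(llvm::MachineBasicBlock &MBB,
                                         llvm::MachineBasicBlock::iterator I,
                                         const llvm::TargetInstrInfo &TII) {
      llvm::MachineInstr &MI = *I;
      int64_t Amount = MI.getOperand(0).getImm();
      if (Amount != 0) {
        // ADJCALLSTACKDOWN grows the stack before the call (subtract from
        // sp); ADJCALLSTACKUP shrinks it again afterwards.
        bool IsDown = MI.getOpcode() == llvm::AArch64::ADJCALLSTACKDOWN;
        llvm::emitSPUpdate(MBB, I, MI.getDebugLoc(), TII, llvm::AArch64::X16,
                           IsDown ? -Amount : Amount);
      }
      MBB.erase(I); // the marker itself is not a real instruction
    }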
+let Defs = [XSP], Uses = [XSP] in { + def ADJCALLSTACKDOWN : PseudoInst<(outs), (ins i64imm:$amt), + [(AArch64callseq_start timm:$amt)]>; + + def ADJCALLSTACKUP : PseudoInst<(outs), (ins i64imm:$amt1, i64imm:$amt2), + [(AArch64callseq_end timm:$amt1, timm:$amt2)]>; +} + +//===----------------------------------------------------------------------===// +// Atomic operation pseudo-instructions +//===----------------------------------------------------------------------===// + +let usesCustomInserter = 1, Defs = [NZCV] in { +multiclass AtomicSizes<string opname> { + def _I8 : PseudoInst<(outs GPR32:$dst), (ins GPR64:$ptr, GPR32:$incr), + [(set GPR32:$dst, (!cast<SDNode>(opname # "_8") GPR64:$ptr, GPR32:$incr))]>; + def _I16 : PseudoInst<(outs GPR32:$dst), (ins GPR64:$ptr, GPR32:$incr), + [(set GPR32:$dst, (!cast<SDNode>(opname # "_16") GPR64:$ptr, GPR32:$incr))]>; + def _I32 : PseudoInst<(outs GPR32:$dst), (ins GPR64:$ptr, GPR32:$incr), + [(set GPR32:$dst, (!cast<SDNode>(opname # "_32") GPR64:$ptr, GPR32:$incr))]>; + def _I64 : PseudoInst<(outs GPR64:$dst), (ins GPR64:$ptr, GPR64:$incr), + [(set GPR64:$dst, (!cast<SDNode>(opname # "_64") GPR64:$ptr, GPR64:$incr))]>; +} +} + +defm ATOMIC_LOAD_ADD : AtomicSizes<"atomic_load_add">; +defm ATOMIC_LOAD_SUB : AtomicSizes<"atomic_load_sub">; +defm ATOMIC_LOAD_AND : AtomicSizes<"atomic_load_and">; +defm ATOMIC_LOAD_OR : AtomicSizes<"atomic_load_or">; +defm ATOMIC_LOAD_XOR : AtomicSizes<"atomic_load_xor">; +defm ATOMIC_LOAD_NAND : AtomicSizes<"atomic_load_nand">; +defm ATOMIC_LOAD_MIN : AtomicSizes<"atomic_load_min">; +defm ATOMIC_LOAD_MAX : AtomicSizes<"atomic_load_max">; +defm ATOMIC_LOAD_UMIN : AtomicSizes<"atomic_load_umin">; +defm ATOMIC_LOAD_UMAX : AtomicSizes<"atomic_load_umax">; +defm ATOMIC_SWAP : AtomicSizes<"atomic_swap">; + +let usesCustomInserter = 1, Defs = [NZCV] in { +def ATOMIC_CMP_SWAP_I8 + : PseudoInst<(outs GPR32:$dst), (ins GPR64:$ptr, GPR32:$old, GPR32:$new), + [(set GPR32:$dst, + (atomic_cmp_swap_8 GPR64:$ptr, GPR32:$old, GPR32:$new))]>; +def ATOMIC_CMP_SWAP_I16 + : PseudoInst<(outs GPR32:$dst), (ins GPR64:$ptr, GPR32:$old, GPR32:$new), + [(set GPR32:$dst, + (atomic_cmp_swap_16 GPR64:$ptr, GPR32:$old, GPR32:$new))]>; +def ATOMIC_CMP_SWAP_I32 + : PseudoInst<(outs GPR32:$dst), (ins GPR64:$ptr, GPR32:$old, GPR32:$new), + [(set GPR32:$dst, + (atomic_cmp_swap_32 GPR64:$ptr, GPR32:$old, GPR32:$new))]>; +def ATOMIC_CMP_SWAP_I64 + : PseudoInst<(outs GPR64:$dst), (ins GPR64:$ptr, GPR64:$old, GPR64:$new), + [(set GPR64:$dst, + (atomic_cmp_swap_64 GPR64:$ptr, GPR64:$old, GPR64:$new))]>; +} + +//===----------------------------------------------------------------------===// +// Add-subtract (extended register) instructions +//===----------------------------------------------------------------------===// +// Contains: ADD, ADDS, SUB, SUBS + aliases CMN, CMP + +// The RHS of these operations is conceptually a sign/zero-extended +// register, optionally shifted left by 1-4. The extension can be a +// NOP (e.g. "sxtx" sign-extending a 64-bit register to 64-bits) but +// must be specified with one exception: + +// If one of the registers is sp/wsp then LSL is an alias for UXTW in +// 32-bit instructions and UXTX in 64-bit versions, the shift amount +// is not optional in that case (but can explicitly be 0), and the +// entire suffix can be skipped (e.g. "add sp, x3, x2"). 
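To pin down the operand semantics just described, here is a minimal scalar model in C++ (a sketch; extendedOperand is a hypothetical helper invented for this illustration, not LLVM or architecture reference code):

    #include <cstdint>

    // Model of the add/sub (extended register) RHS: keep the low extBits of
    // rm, zero- or sign-extend, then shift left by 0-4. Illustration only.
    static uint64_t extendedOperand(uint64_t rm, unsigned extBits,
                                    bool isSigned, unsigned shift) {
      uint64_t mask = extBits >= 64 ? ~0ULL : (1ULL << extBits) - 1;
      uint64_t val = rm & mask;
      if (isSigned && extBits < 64 && (val >> (extBits - 1)) != 0)
        val |= ~mask; // sign-extend the truncated value
      return val << shift;
    }

    // "add x0, x1, x2, uxtb #2" computes x1 + extendedOperand(x2, 8, false, 2),
    // so with x2 == 0x1234 the RHS is 0x34 << 2 == 0xd0. The "add sp, x3, x2"
    // alias mentioned above is the extBits == 64, shift == 0 (uxtx) case.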
+ +multiclass extend_operands<string PREFIX, string Diag> { + def _asmoperand : AsmOperandClass { + let Name = PREFIX; + let RenderMethod = "addRegExtendOperands"; + let PredicateMethod = "isRegExtend<A64SE::" # PREFIX # ">"; + let DiagnosticType = "AddSubRegExtend" # Diag; + } + + def _operand : Operand<i64>, + ImmLeaf<i64, [{ return Imm >= 0 && Imm <= 4; }]> { + let PrintMethod = "printRegExtendOperand<A64SE::" # PREFIX # ">"; + let DecoderMethod = "DecodeRegExtendOperand"; + let ParserMatchClass = !cast<AsmOperandClass>(PREFIX # "_asmoperand"); + } +} + +defm UXTB : extend_operands<"UXTB", "Small">; +defm UXTH : extend_operands<"UXTH", "Small">; +defm UXTW : extend_operands<"UXTW", "Small">; +defm UXTX : extend_operands<"UXTX", "Large">; +defm SXTB : extend_operands<"SXTB", "Small">; +defm SXTH : extend_operands<"SXTH", "Small">; +defm SXTW : extend_operands<"SXTW", "Small">; +defm SXTX : extend_operands<"SXTX", "Large">; + +def LSL_extasmoperand : AsmOperandClass { + let Name = "RegExtendLSL"; + let RenderMethod = "addRegExtendOperands"; + let DiagnosticType = "AddSubRegExtendLarge"; +} + +def LSL_extoperand : Operand<i64> { + let ParserMatchClass = LSL_extasmoperand; +} + + +// The patterns for various sign-extensions are a little ugly and +// non-uniform because everything has already been promoted to the +// legal i64 and i32 types. We'll wrap the various variants up in a +// class for use later. +class extend_types { + dag uxtb; dag uxth; dag uxtw; dag uxtx; + dag sxtb; dag sxth; dag sxtw; dag sxtx; +} + +def extends_to_i64 : extend_types { + let uxtb = (and (anyext GPR32:$Rm), 255); + let uxth = (and (anyext GPR32:$Rm), 65535); + let uxtw = (zext GPR32:$Rm); + let uxtx = (i64 GPR64:$Rm); + + let sxtb = (sext_inreg (anyext GPR32:$Rm), i8); + let sxth = (sext_inreg (anyext GPR32:$Rm), i16); + let sxtw = (sext GPR32:$Rm); + let sxtx = (i64 GPR64:$Rm); +} + + +def extends_to_i32 : extend_types { + let uxtb = (and GPR32:$Rm, 255); + let uxth = (and GPR32:$Rm, 65535); + let uxtw = (i32 GPR32:$Rm); + let uxtx = (i32 GPR32:$Rm); + + let sxtb = (sext_inreg GPR32:$Rm, i8); + let sxth = (sext_inreg GPR32:$Rm, i16); + let sxtw = (i32 GPR32:$Rm); + let sxtx = (i32 GPR32:$Rm); +} + +// Now, six of the extensions supported are easy and uniform: if the source size +// is 32-bits or less, then Rm is always a 32-bit register. We'll instantiate +// those instructions in one block. + +// The uxtx/sxtx could potentially be merged in, but three facts dissuaded me: +// + It would break the naming scheme: either ADDxx_uxtx or ADDww_uxtx would +// be impossible. +// + Patterns are very different as well. +// + Passing different registers would be ugly (more fields in extend_types +// would probably be the best option). 
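For concreteness, the extends_to_i64 fragments defined above correspond to the following scalar computations (plain C++, written here only as a reading aid; each comment names the TableGen pattern the function mirrors):

    #include <cstdint>

    // Scalar equivalents of the extends_to_i64 dag fragments. Sketch only.
    static uint64_t uxtb64(uint32_t rm) { return rm & 255; }    // (and (anyext $Rm), 255)
    static uint64_t uxth64(uint32_t rm) { return rm & 65535; }  // (and (anyext $Rm), 65535)
    static uint64_t uxtw64(uint32_t rm) { return rm; }          // (zext $Rm)
    static int64_t sxtb64(uint32_t rm) { return (int8_t)rm; }   // (sext_inreg (anyext $Rm), i8)
    static int64_t sxth64(uint32_t rm) { return (int16_t)rm; }  // (sext_inreg (anyext $Rm), i16)
    static int64_t sxtw64(uint32_t rm) { return (int32_t)rm; }  // (sext $Rm)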
+
+multiclass addsub_exts<bit sf, bit op, bit S, string asmop,
+                       SDPatternOperator opfrag,
+                       dag outs, extend_types exts, RegisterClass GPRsp> {
+  def w_uxtb : A64I_addsubext<sf, op, S, 0b00, 0b000,
+                    outs,
+                    (ins GPRsp:$Rn, GPR32:$Rm, UXTB_operand:$Imm3),
+                    !strconcat(asmop, "$Rn, $Rm, $Imm3"),
+                    [(opfrag GPRsp:$Rn, (shl exts.uxtb, UXTB_operand:$Imm3))],
+                    NoItinerary>;
+  def w_uxth : A64I_addsubext<sf, op, S, 0b00, 0b001,
+                    outs,
+                    (ins GPRsp:$Rn, GPR32:$Rm, UXTH_operand:$Imm3),
+                    !strconcat(asmop, "$Rn, $Rm, $Imm3"),
+                    [(opfrag GPRsp:$Rn, (shl exts.uxth, UXTH_operand:$Imm3))],
+                    NoItinerary>;
+  def w_uxtw : A64I_addsubext<sf, op, S, 0b00, 0b010,
+                    outs,
+                    (ins GPRsp:$Rn, GPR32:$Rm, UXTW_operand:$Imm3),
+                    !strconcat(asmop, "$Rn, $Rm, $Imm3"),
+                    [(opfrag GPRsp:$Rn, (shl exts.uxtw, UXTW_operand:$Imm3))],
+                    NoItinerary>;
+
+  def w_sxtb : A64I_addsubext<sf, op, S, 0b00, 0b100,
+                    outs,
+                    (ins GPRsp:$Rn, GPR32:$Rm, SXTB_operand:$Imm3),
+                    !strconcat(asmop, "$Rn, $Rm, $Imm3"),
+                    [(opfrag GPRsp:$Rn, (shl exts.sxtb, SXTB_operand:$Imm3))],
+                    NoItinerary>;
+  def w_sxth : A64I_addsubext<sf, op, S, 0b00, 0b101,
+                    outs,
+                    (ins GPRsp:$Rn, GPR32:$Rm, SXTH_operand:$Imm3),
+                    !strconcat(asmop, "$Rn, $Rm, $Imm3"),
+                    [(opfrag GPRsp:$Rn, (shl exts.sxth, SXTH_operand:$Imm3))],
+                    NoItinerary>;
+  def w_sxtw : A64I_addsubext<sf, op, S, 0b00, 0b110,
+                    outs,
+                    (ins GPRsp:$Rn, GPR32:$Rm, SXTW_operand:$Imm3),
+                    !strconcat(asmop, "$Rn, $Rm, $Imm3"),
+                    [(opfrag GPRsp:$Rn, (shl exts.sxtw, SXTW_operand:$Imm3))],
+                    NoItinerary>;
+}
+
+// These two could be merged in with the above, but their patterns aren't really
+// necessary and the naming-scheme would necessarily break:
+multiclass addsub_xxtx<bit op, bit S, string asmop, SDPatternOperator opfrag,
+                       dag outs> {
+  def x_uxtx : A64I_addsubext<0b1, op, S, 0b00, 0b011,
+                   outs,
+                   (ins GPR64xsp:$Rn, GPR64:$Rm, UXTX_operand:$Imm3),
+                   !strconcat(asmop, "$Rn, $Rm, $Imm3"),
+                   [(opfrag GPR64xsp:$Rn, (shl GPR64:$Rm, UXTX_operand:$Imm3))],
+                   NoItinerary>;
+
+  def x_sxtx : A64I_addsubext<0b1, op, S, 0b00, 0b111,
+                   outs,
+                   (ins GPR64xsp:$Rn, GPR64:$Rm, SXTX_operand:$Imm3),
+                   !strconcat(asmop, "$Rn, $Rm, $Imm3"),
+                   [/* No Pattern: same as uxtx */],
+                   NoItinerary>;
+}
+
+multiclass addsub_wxtx<bit op, bit S, string asmop, dag outs> {
+  def w_uxtx : A64I_addsubext<0b0, op, S, 0b00, 0b011,
+                  outs,
+                  (ins GPR32wsp:$Rn, GPR32:$Rm, UXTX_operand:$Imm3),
+                  !strconcat(asmop, "$Rn, $Rm, $Imm3"),
+                  [/* No pattern: probably same as uxtw */],
+                  NoItinerary>;
+
+  def w_sxtx : A64I_addsubext<0b0, op, S, 0b00, 0b111,
+                  outs,
+                  (ins GPR32wsp:$Rn, GPR32:$Rm, SXTX_operand:$Imm3),
+                  !strconcat(asmop, "$Rn, $Rm, $Imm3"),
+                  [/* No Pattern: probably same as uxtw */],
+                  NoItinerary>;
+}
+
+class SetRD<RegisterClass RC, SDPatternOperator op>
+ : PatFrag<(ops node:$lhs, node:$rhs), (set RC:$Rd, (op node:$lhs, node:$rhs))>;
+class SetNZCV<SDPatternOperator op>
+  : PatFrag<(ops node:$lhs, node:$rhs), (set NZCV, (op node:$lhs, node:$rhs))>;
+
+defm ADDxx :addsub_exts<0b1, 0b0, 0b0, "add\t$Rd, ", SetRD<GPR64xsp, add>,
+                        (outs GPR64xsp:$Rd), extends_to_i64, GPR64xsp>,
+            addsub_xxtx<     0b0, 0b0, "add\t$Rd, ", SetRD<GPR64xsp, add>,
+                        (outs GPR64xsp:$Rd)>;
+defm ADDww :addsub_exts<0b0, 0b0, 0b0, "add\t$Rd, ", SetRD<GPR32wsp, add>,
+                        (outs GPR32wsp:$Rd), extends_to_i32, GPR32wsp>,
+            addsub_wxtx<     0b0, 0b0, "add\t$Rd, ",
+                        (outs GPR32wsp:$Rd)>;
+defm SUBxx :addsub_exts<0b1, 0b1, 0b0, "sub\t$Rd, ", SetRD<GPR64xsp, sub>,
+                        (outs GPR64xsp:$Rd), extends_to_i64, GPR64xsp>,
+            addsub_xxtx<     0b1, 0b0, "sub\t$Rd, ", SetRD<GPR64xsp, 
sub>, + (outs GPR64xsp:$Rd)>; +defm SUBww :addsub_exts<0b0, 0b1, 0b0, "sub\t$Rd, ", SetRD<GPR32wsp, sub>, + (outs GPR32wsp:$Rd), extends_to_i32, GPR32wsp>, + addsub_wxtx< 0b1, 0b0, "sub\t$Rd, ", + (outs GPR32wsp:$Rd)>; + +let Defs = [NZCV] in { +defm ADDSxx :addsub_exts<0b1, 0b0, 0b1, "adds\t$Rd, ", SetRD<GPR64, addc>, + (outs GPR64:$Rd), extends_to_i64, GPR64xsp>, + addsub_xxtx< 0b0, 0b1, "adds\t$Rd, ", SetRD<GPR64, addc>, + (outs GPR64:$Rd)>; +defm ADDSww :addsub_exts<0b0, 0b0, 0b1, "adds\t$Rd, ", SetRD<GPR32, addc>, + (outs GPR32:$Rd), extends_to_i32, GPR32wsp>, + addsub_wxtx< 0b0, 0b1, "adds\t$Rd, ", + (outs GPR32:$Rd)>; +defm SUBSxx :addsub_exts<0b1, 0b1, 0b1, "subs\t$Rd, ", SetRD<GPR64, subc>, + (outs GPR64:$Rd), extends_to_i64, GPR64xsp>, + addsub_xxtx< 0b1, 0b1, "subs\t$Rd, ", SetRD<GPR64, subc>, + (outs GPR64:$Rd)>; +defm SUBSww :addsub_exts<0b0, 0b1, 0b1, "subs\t$Rd, ", SetRD<GPR32, subc>, + (outs GPR32:$Rd), extends_to_i32, GPR32wsp>, + addsub_wxtx< 0b1, 0b1, "subs\t$Rd, ", + (outs GPR32:$Rd)>; + + +let Rd = 0b11111, isCompare = 1 in { +defm CMNx : addsub_exts<0b1, 0b0, 0b1, "cmn\t", SetNZCV<A64cmn>, + (outs), extends_to_i64, GPR64xsp>, + addsub_xxtx< 0b0, 0b1, "cmn\t", SetNZCV<A64cmn>, (outs)>; +defm CMNw : addsub_exts<0b0, 0b0, 0b1, "cmn\t", SetNZCV<A64cmn>, + (outs), extends_to_i32, GPR32wsp>, + addsub_wxtx< 0b0, 0b1, "cmn\t", (outs)>; +defm CMPx : addsub_exts<0b1, 0b1, 0b1, "cmp\t", SetNZCV<A64cmp>, + (outs), extends_to_i64, GPR64xsp>, + addsub_xxtx< 0b1, 0b1, "cmp\t", SetNZCV<A64cmp>, (outs)>; +defm CMPw : addsub_exts<0b0, 0b1, 0b1, "cmp\t", SetNZCV<A64cmp>, + (outs), extends_to_i32, GPR32wsp>, + addsub_wxtx< 0b1, 0b1, "cmp\t", (outs)>; +} +} + +// Now patterns for the operation without a shift being needed. No patterns are +// created for uxtx/sxtx since they're non-uniform and it's expected that +// add/sub (shifted register) will handle those cases anyway. +multiclass addsubext_noshift_patterns<string prefix, SDPatternOperator nodeop, + RegisterClass GPRsp, extend_types exts> { + def : Pat<(nodeop GPRsp:$Rn, exts.uxtb), + (!cast<Instruction>(prefix # "w_uxtb") GPRsp:$Rn, GPR32:$Rm, 0)>; + def : Pat<(nodeop GPRsp:$Rn, exts.uxth), + (!cast<Instruction>(prefix # "w_uxth") GPRsp:$Rn, GPR32:$Rm, 0)>; + def : Pat<(nodeop GPRsp:$Rn, exts.uxtw), + (!cast<Instruction>(prefix # "w_uxtw") GPRsp:$Rn, GPR32:$Rm, 0)>; + + def : Pat<(nodeop GPRsp:$Rn, exts.sxtb), + (!cast<Instruction>(prefix # "w_sxtb") GPRsp:$Rn, GPR32:$Rm, 0)>; + def : Pat<(nodeop GPRsp:$Rn, exts.sxth), + (!cast<Instruction>(prefix # "w_sxth") GPRsp:$Rn, GPR32:$Rm, 0)>; + def : Pat<(nodeop GPRsp:$Rn, exts.sxtw), + (!cast<Instruction>(prefix # "w_sxtw") GPRsp:$Rn, GPR32:$Rm, 0)>; +} + +defm : addsubext_noshift_patterns<"ADDxx", add, GPR64xsp, extends_to_i64>; +defm : addsubext_noshift_patterns<"ADDww", add, GPR32wsp, extends_to_i32>; +defm : addsubext_noshift_patterns<"SUBxx", sub, GPR64xsp, extends_to_i64>; +defm : addsubext_noshift_patterns<"SUBww", sub, GPR32wsp, extends_to_i32>; + +defm : addsubext_noshift_patterns<"CMNx", A64cmn, GPR64xsp, extends_to_i64>; +defm : addsubext_noshift_patterns<"CMNw", A64cmn, GPR32wsp, extends_to_i32>; +defm : addsubext_noshift_patterns<"CMPx", A64cmp, GPR64xsp, extends_to_i64>; +defm : addsubext_noshift_patterns<"CMPw", A64cmp, GPR32wsp, extends_to_i32>; + +// An extend of "lsl #imm" is valid if and only if one of Rn and Rd is +// sp/wsp. It is synonymous with uxtx/uxtw depending on the size of the +// operation. 
Also permitted in this case is complete omission of the argument,
+// which implies "lsl #0".
+multiclass lsl_aliases<string asmop, Instruction inst, RegisterClass GPR_Rd,
+                       RegisterClass GPR_Rn, RegisterClass GPR_Rm> {
+  def : InstAlias<!strconcat(asmop, " $Rd, $Rn, $Rm"),
+                  (inst GPR_Rd:$Rd, GPR_Rn:$Rn, GPR_Rm:$Rm, 0)>;
+
+  def : InstAlias<!strconcat(asmop, " $Rd, $Rn, $Rm, $LSL"),
+                  (inst GPR_Rd:$Rd, GPR_Rn:$Rn, GPR_Rm:$Rm, LSL_extoperand:$LSL)>;
+
+}
+
+defm : lsl_aliases<"add",  ADDxxx_uxtx,  Rxsp, GPR64xsp, GPR64>;
+defm : lsl_aliases<"add",  ADDxxx_uxtx,  GPR64xsp, Rxsp, GPR64>;
+defm : lsl_aliases<"add",  ADDwww_uxtw,  Rwsp, GPR32wsp, GPR32>;
+defm : lsl_aliases<"add",  ADDwww_uxtw,  GPR32wsp, Rwsp, GPR32>;
+defm : lsl_aliases<"sub",  SUBxxx_uxtx,  Rxsp, GPR64xsp, GPR64>;
+defm : lsl_aliases<"sub",  SUBxxx_uxtx,  GPR64xsp, Rxsp, GPR64>;
+defm : lsl_aliases<"sub",  SUBwww_uxtw,  Rwsp, GPR32wsp, GPR32>;
+defm : lsl_aliases<"sub",  SUBwww_uxtw,  GPR32wsp, Rwsp, GPR32>;
+
+// Rd cannot be sp for flag-setting variants so only half of the aliases are
+// needed.
+defm : lsl_aliases<"adds", ADDSxxx_uxtx, GPR64, Rxsp, GPR64>;
+defm : lsl_aliases<"adds", ADDSwww_uxtw, GPR32, Rwsp, GPR32>;
+defm : lsl_aliases<"subs", SUBSxxx_uxtx, GPR64, Rxsp, GPR64>;
+defm : lsl_aliases<"subs", SUBSwww_uxtw, GPR32, Rwsp, GPR32>;
+
+// CMP unfortunately has to be different because the instruction doesn't have a
+// dest register.
+multiclass cmp_lsl_aliases<string asmop, Instruction inst,
+                       RegisterClass GPR_Rn, RegisterClass GPR_Rm> {
+  def : InstAlias<!strconcat(asmop, " $Rn, $Rm"),
+                  (inst GPR_Rn:$Rn, GPR_Rm:$Rm, 0)>;
+
+  def : InstAlias<!strconcat(asmop, " $Rn, $Rm, $LSL"),
+                  (inst GPR_Rn:$Rn, GPR_Rm:$Rm, LSL_extoperand:$LSL)>;
+}
+
+defm : cmp_lsl_aliases<"cmp", CMPxx_uxtx, Rxsp, GPR64>;
+defm : cmp_lsl_aliases<"cmp", CMPww_uxtw, Rwsp, GPR32>;
+defm : cmp_lsl_aliases<"cmn", CMNxx_uxtx, Rxsp, GPR64>;
+defm : cmp_lsl_aliases<"cmn", CMNww_uxtw, Rwsp, GPR32>;
+
+//===----------------------------------------------------------------------===//
+// Add-subtract (immediate) instructions
+//===----------------------------------------------------------------------===//
+// Contains: ADD, ADDS, SUB, SUBS + aliases CMN, CMP, MOV
+
+// These instructions accept a 12-bit unsigned immediate, optionally shifted
+// left by 12 bits. Official assembly format specifies a 12 bit immediate with
+// one of "", "LSL #0", "LSL #12" supplementary operands.
+
+// There are surprisingly few ways to make this work with TableGen, so this
+// implementation has separate instructions for the "LSL #0" and "LSL #12"
+// variants.
+
+// If the MCInst retained a single combined immediate (which could be 0x123000,
+// for example) then both components (imm & shift) would have to be delegated to
+// a single assembly operand. This would entail a separate operand parser
+// (because the LSL would have to live in the same AArch64Operand as the
+// immediate to be accessible); assembly parsing is rather complex and
+// error-prone C++ code.
+//
+// By splitting the immediate, we can delegate handling this optional operand to
+// an InstAlias. Supporting functions to generate the correct MCInst are still
+// required, but these are essentially trivial and parsing can remain generic.
+//
+// Rejected plans with rationale:
+// ------------------------------
+//
+// In an ideal world you'd have two first-class immediate operands (in
+// InOperandList, specifying imm12 and shift). Unfortunately this is not
+// selectable by any means I could discover.
+// +// An Instruction with two MCOperands hidden behind a single entry in +// InOperandList (expanded by ComplexPatterns and MIOperandInfo) was functional, +// but required more C++ code to handle encoding/decoding. Parsing (the intended +// main beneficiary) ended up equally complex because of the optional nature of +// "LSL #0". +// +// Attempting to circumvent the need for a custom OperandParser above by giving +// InstAliases without the "lsl #0" failed. add/sub could be accommodated but +// the cmp/cmn aliases didn't use the MIOperandInfo to determine how operands +// should be parsed: there was no way to accommodate an "lsl #12". + +let ParserMethod = "ParseImmWithLSLOperand", + RenderMethod = "addImmWithLSLOperands" in { + // Derived PredicateMethod fields are different for each + def addsubimm_lsl0_asmoperand : AsmOperandClass { + let Name = "AddSubImmLSL0"; + // If an error is reported against this operand, instruction could also be a + // register variant. + let DiagnosticType = "AddSubSecondSource"; + } + + def addsubimm_lsl12_asmoperand : AsmOperandClass { + let Name = "AddSubImmLSL12"; + let DiagnosticType = "AddSubSecondSource"; + } +} + +def shr_12_XFORM : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(N->getSExtValue() >> 12, MVT::i32); +}]>; + +def shr_12_neg_XFORM : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant((-N->getSExtValue()) >> 12, MVT::i32); +}]>; + +def neg_XFORM : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(-N->getSExtValue(), MVT::i32); +}]>; + + +multiclass addsub_imm_operands<ValueType ty> { + let PrintMethod = "printAddSubImmLSL0Operand", + EncoderMethod = "getAddSubImmOpValue", + ParserMatchClass = addsubimm_lsl0_asmoperand in { + def _posimm_lsl0 : Operand<ty>, + ImmLeaf<ty, [{ return Imm >= 0 && (Imm & ~0xfff) == 0; }]>; + def _negimm_lsl0 : Operand<ty>, + ImmLeaf<ty, [{ return Imm < 0 && (-Imm & ~0xfff) == 0; }], + neg_XFORM>; + } + + let PrintMethod = "printAddSubImmLSL12Operand", + EncoderMethod = "getAddSubImmOpValue", + ParserMatchClass = addsubimm_lsl12_asmoperand in { + def _posimm_lsl12 : Operand<ty>, + ImmLeaf<ty, [{ return Imm >= 0 && (Imm & ~0xfff000) == 0; }], + shr_12_XFORM>; + + def _negimm_lsl12 : Operand<ty>, + ImmLeaf<ty, [{ return Imm < 0 && (-Imm & ~0xfff000) == 0; }], + shr_12_neg_XFORM>; + } +} + +// The add operands don't need any transformation +defm addsubimm_operand_i32 : addsub_imm_operands<i32>; +defm addsubimm_operand_i64 : addsub_imm_operands<i64>; + +multiclass addsubimm_varieties<string prefix, bit sf, bit op, bits<2> shift, + string asmop, string cmpasmop, + Operand imm_operand, Operand cmp_imm_operand, + RegisterClass GPR, RegisterClass GPRsp, + AArch64Reg ZR> { + // All registers for non-S variants allow SP + def _s : A64I_addsubimm<sf, op, 0b0, shift, + (outs GPRsp:$Rd), + (ins GPRsp:$Rn, imm_operand:$Imm12), + !strconcat(asmop, "\t$Rd, $Rn, $Imm12"), + [(set GPRsp:$Rd, + (add GPRsp:$Rn, imm_operand:$Imm12))], + NoItinerary>; + + + // S variants can read SP but would write to ZR + def _S : A64I_addsubimm<sf, op, 0b1, shift, + (outs GPR:$Rd), + (ins GPRsp:$Rn, imm_operand:$Imm12), + !strconcat(asmop, "s\t$Rd, $Rn, $Imm12"), + [(set GPR:$Rd, (addc GPRsp:$Rn, imm_operand:$Imm12))], + NoItinerary> { + let Defs = [NZCV]; + } + + // Note that the pattern here for ADDS is subtle. Canonically CMP + // a, b becomes SUBS a, b. If b < 0 then this is equivalent to + // ADDS a, (-b). This is not true in general. 
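The subtlety in that last note, like the CMN discussion further up, is about flags rather than values: negating the second operand preserves the sum or difference but not necessarily C and V, and op2 == 0 is the simplest counterexample. A self-contained check (the helpers model the architectural carry definition, C as the carry out of a + ~b + 1 for SUBS, and are invented for this illustration):

    #include <cassert>
    #include <cstdint>

    // AArch64 SUBS computes a + ~b + 1 and sets C to the carry out; ADDS sets
    // C to the carry out of a + b. Modeled in 32 bits; illustration only.
    static bool carryFromSub(uint32_t a, uint32_t b) {
      return (((uint64_t)a + (uint32_t)~b + 1) >> 32) != 0;
    }
    static bool carryFromAdd(uint32_t a, uint32_t b) {
      return (((uint64_t)a + b) >> 32) != 0;
    }

    int main() {
      uint32_t a = 42;
      assert(carryFromSub(a, 0));  // CMP a, #0 always sets C...
      assert(!carryFromAdd(a, 0)); // ...but CMN a, #0 (i.e. a + 0) clears it,
                                   // so the rewrite is only safe for op2 != 0.
      return 0;
    }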
+  def _cmp : A64I_addsubimm<sf, op, 0b1, shift,
+               (outs), (ins GPRsp:$Rn, imm_operand:$Imm12),
+               !strconcat(cmpasmop, " $Rn, $Imm12"),
+               [(set NZCV,
+                     (A64cmp GPRsp:$Rn, cmp_imm_operand:$Imm12))],
+               NoItinerary> {
+    let Rd = 0b11111;
+    let Defs = [NZCV];
+    let isCompare = 1;
+  }
+}
+
+
+multiclass addsubimm_shifts<string prefix, bit sf, bit op,
+         string asmop, string cmpasmop, string operand, string cmpoperand,
+         RegisterClass GPR, RegisterClass GPRsp, AArch64Reg ZR> {
+  defm _lsl0 : addsubimm_varieties<prefix # "_lsl0", sf, op, 0b00,
+                                   asmop, cmpasmop,
+                                   !cast<Operand>(operand # "_lsl0"),
+                                   !cast<Operand>(cmpoperand # "_lsl0"),
+                                   GPR, GPRsp, ZR>;
+
+  defm _lsl12 : addsubimm_varieties<prefix # "_lsl12", sf, op, 0b01,
+                                    asmop, cmpasmop,
+                                    !cast<Operand>(operand # "_lsl12"),
+                                    !cast<Operand>(cmpoperand # "_lsl12"),
+                                    GPR, GPRsp, ZR>;
+}
+
+defm ADDwwi : addsubimm_shifts<"ADDwi", 0b0, 0b0, "add", "cmn",
+                               "addsubimm_operand_i32_posimm",
+                               "addsubimm_operand_i32_negimm",
+                               GPR32, GPR32wsp, WZR>;
+defm ADDxxi : addsubimm_shifts<"ADDxi", 0b1, 0b0, "add", "cmn",
+                               "addsubimm_operand_i64_posimm",
+                               "addsubimm_operand_i64_negimm",
+                               GPR64, GPR64xsp, XZR>;
+defm SUBwwi : addsubimm_shifts<"SUBwi", 0b0, 0b1, "sub", "cmp",
+                               "addsubimm_operand_i32_negimm",
+                               "addsubimm_operand_i32_posimm",
+                               GPR32, GPR32wsp, WZR>;
+defm SUBxxi : addsubimm_shifts<"SUBxi", 0b1, 0b1, "sub", "cmp",
+                               "addsubimm_operand_i64_negimm",
+                               "addsubimm_operand_i64_posimm",
+                               GPR64, GPR64xsp, XZR>;
+
+multiclass MOVsp<RegisterClass GPRsp, RegisterClass SP, Instruction addop> {
+  def _fromsp : InstAlias<"mov $Rd, $Rn",
+                          (addop GPRsp:$Rd, SP:$Rn, 0),
+                          0b1>;
+
+  def _tosp : InstAlias<"mov $Rd, $Rn",
+                        (addop SP:$Rd, GPRsp:$Rn, 0),
+                        0b1>;
+}
+
+// Recall Rxsp is a RegisterClass containing *just* xsp.
+defm MOVxx : MOVsp<GPR64xsp, Rxsp, ADDxxi_lsl0_s>;
+defm MOVww : MOVsp<GPR32wsp, Rwsp, ADDwwi_lsl0_s>;
+
+//===----------------------------------------------------------------------===//
+// Add-subtract (shifted register) instructions
+//===----------------------------------------------------------------------===//
+// Contains: ADD, ADDS, SUB, SUBS + aliases CMN, CMP, NEG, NEGS
+
+//===-------------------------------
+// 1. The "shifted register" operands. Shared with logical insts.
+//===-------------------------------
+
+multiclass shift_operands<string prefix, string form> {
+  def _asmoperand_i32 : AsmOperandClass {
+    let Name = "Shift" # form # "i32";
+    let RenderMethod = "addShiftOperands";
+    let PredicateMethod = "isShift<A64SE::" # form # ", false>";
+    let DiagnosticType = "AddSubRegShift32";
+  }
+
+  // Note that the operand type is intentionally i64 because the DAGCombiner
+  // puts these into a canonical form.
+  def _i32 : Operand<i64>, ImmLeaf<i64, [{ return Imm >= 0 && Imm <= 31; }]> {
+    let ParserMatchClass
+          = !cast<AsmOperandClass>(prefix # "_asmoperand_i32");
+    let PrintMethod = "printShiftOperand<A64SE::" # form # ">";
+    let DecoderMethod = "Decode32BitShiftOperand";
+  }
+
+  def _asmoperand_i64 : AsmOperandClass {
+      let Name = "Shift" # form # "i64";
+      let RenderMethod = "addShiftOperands";
+      let PredicateMethod = "isShift<A64SE::" # form # ", true>";
+      let DiagnosticType = "AddSubRegShift64";
+  }
+
+  def _i64 : Operand<i64>, ImmLeaf<i64, [{ return Imm >= 0 && Imm <= 63; }]> {
+    let ParserMatchClass
+          = !cast<AsmOperandClass>(prefix # "_asmoperand_i64");
+    let PrintMethod = "printShiftOperand<A64SE::" # form # ">";
+  }
+}
+
+defm lsl_operand : shift_operands<"lsl_operand", "LSL">;
+defm lsr_operand : shift_operands<"lsr_operand", "LSR">;
+defm asr_operand : shift_operands<"asr_operand", "ASR">;
+
+// Not used for add/sub, but defined here for completeness. The "logical
+// (shifted register)" instructions *do* have an ROR variant.
+defm ror_operand : shift_operands<"ror_operand", "ROR">;
+
+//===-------------------------------
+// 2. The basic 3.5-operand ADD/SUB/ADDS/SUBS instructions.
+//===-------------------------------
+
+// N.b. the commutable parameter is just !N. It will be first against the wall
+// when the revolution comes.
+multiclass addsub_shifts<string prefix, bit sf, bit op, bit s, bit commutable,
+                         string asmop, SDPatternOperator opfrag, string sty,
+                         RegisterClass GPR, list<Register> defs> {
+  let isCommutable = commutable, Defs = defs in {
+  def _lsl : A64I_addsubshift<sf, op, s, 0b00,
+                       (outs GPR:$Rd),
+                       (ins GPR:$Rn, GPR:$Rm,
+                            !cast<Operand>("lsl_operand_" # sty):$Imm6),
+                       !strconcat(asmop, "\t$Rd, $Rn, $Rm, $Imm6"),
+                       [(set GPR:$Rd, (opfrag GPR:$Rn, (shl GPR:$Rm,
+                            !cast<Operand>("lsl_operand_" # sty):$Imm6))
+                       )],
+                       NoItinerary>;
+
+  def _lsr : A64I_addsubshift<sf, op, s, 0b01,
+                       (outs GPR:$Rd),
+                       (ins GPR:$Rn, GPR:$Rm,
+                            !cast<Operand>("lsr_operand_" # sty):$Imm6),
+                       !strconcat(asmop, "\t$Rd, $Rn, $Rm, $Imm6"),
+                       [(set GPR:$Rd, (opfrag GPR:$Rn, (srl GPR:$Rm,
+                            !cast<Operand>("lsr_operand_" # sty):$Imm6))
+                       )],
+                       NoItinerary>;
+
+  def _asr : A64I_addsubshift<sf, op, s, 0b10,
+                       (outs GPR:$Rd),
+                       (ins GPR:$Rn, GPR:$Rm,
+                            !cast<Operand>("asr_operand_" # sty):$Imm6),
+                       !strconcat(asmop, "\t$Rd, $Rn, $Rm, $Imm6"),
+                       [(set GPR:$Rd, (opfrag GPR:$Rn, (sra GPR:$Rm,
+                            !cast<Operand>("asr_operand_" # sty):$Imm6))
+                       )],
+                       NoItinerary>;
+  }
+
+  def _noshift
+      : InstAlias<!strconcat(asmop, " $Rd, $Rn, $Rm"),
+                 (!cast<Instruction>(prefix # "_lsl") GPR:$Rd, GPR:$Rn,
+                                                      GPR:$Rm, 0)>;
+
+  def : Pat<(opfrag GPR:$Rn, GPR:$Rm),
+            (!cast<Instruction>(prefix # "_lsl") GPR:$Rn, GPR:$Rm, 0)>;
+}
+
+multiclass addsub_sizes<string prefix, bit op, bit s, bit commutable,
+                         string asmop, SDPatternOperator opfrag,
+                         list<Register> defs> {
+  defm xxx : addsub_shifts<prefix # "xxx", 0b1, op, s,
+                           commutable, asmop, opfrag, "i64", GPR64, defs>;
+  defm www : addsub_shifts<prefix # "www", 0b0, op, s,
+                           commutable, asmop, opfrag, "i32", GPR32, defs>;
+}
+
+
+defm ADD : addsub_sizes<"ADD", 0b0, 0b0, 0b1, "add", add, []>;
+defm SUB : addsub_sizes<"SUB", 0b1, 0b0, 0b0, "sub", sub, []>;
+
+defm ADDS : addsub_sizes<"ADDS", 0b0, 0b1, 0b1, "adds", addc, [NZCV]>;
+defm SUBS : addsub_sizes<"SUBS", 0b1, 0b1, 0b0, "subs", subc, [NZCV]>;
+
+//===-------------------------------
+// 3. 
The NEG/NEGS aliases
+//===-------------------------------
+
+multiclass neg_alias<Instruction INST, RegisterClass GPR,
+                     Register ZR, Operand shift_operand, SDNode shiftop> {
+  def : InstAlias<"neg $Rd, $Rm, $Imm6",
+                  (INST GPR:$Rd, ZR, GPR:$Rm, shift_operand:$Imm6)>;
+
+  def : Pat<(sub 0, (shiftop GPR:$Rm, shift_operand:$Imm6)),
+            (INST ZR, GPR:$Rm, shift_operand:$Imm6)>;
+}
+
+defm : neg_alias<SUBwww_lsl, GPR32, WZR, lsl_operand_i32, shl>;
+defm : neg_alias<SUBwww_lsr, GPR32, WZR, lsr_operand_i32, srl>;
+defm : neg_alias<SUBwww_asr, GPR32, WZR, asr_operand_i32, sra>;
+def : InstAlias<"neg $Rd, $Rm", (SUBwww_lsl GPR32:$Rd, WZR, GPR32:$Rm, 0)>;
+def : Pat<(sub 0, GPR32:$Rm), (SUBwww_lsl WZR, GPR32:$Rm, 0)>;
+
+defm : neg_alias<SUBxxx_lsl, GPR64, XZR, lsl_operand_i64, shl>;
+defm : neg_alias<SUBxxx_lsr, GPR64, XZR, lsr_operand_i64, srl>;
+defm : neg_alias<SUBxxx_asr, GPR64, XZR, asr_operand_i64, sra>;
+def : InstAlias<"neg $Rd, $Rm", (SUBxxx_lsl GPR64:$Rd, XZR, GPR64:$Rm, 0)>;
+def : Pat<(sub 0, GPR64:$Rm), (SUBxxx_lsl XZR, GPR64:$Rm, 0)>;
+
+// NEGS doesn't get any patterns yet: defining multiple outputs means C++ has to
+// be involved.
+class negs_alias<Instruction INST, RegisterClass GPR,
+                 Register ZR, Operand shift_operand, SDNode shiftop>
+  : InstAlias<"negs $Rd, $Rm, $Imm6",
+              (INST GPR:$Rd, ZR, GPR:$Rm, shift_operand:$Imm6)>;
+
+def : negs_alias<SUBSwww_lsl, GPR32, WZR, lsl_operand_i32, shl>;
+def : negs_alias<SUBSwww_lsr, GPR32, WZR, lsr_operand_i32, srl>;
+def : negs_alias<SUBSwww_asr, GPR32, WZR, asr_operand_i32, sra>;
+def : InstAlias<"negs $Rd, $Rm", (SUBSwww_lsl GPR32:$Rd, WZR, GPR32:$Rm, 0)>;
+
+def : negs_alias<SUBSxxx_lsl, GPR64, XZR, lsl_operand_i64, shl>;
+def : negs_alias<SUBSxxx_lsr, GPR64, XZR, lsr_operand_i64, srl>;
+def : negs_alias<SUBSxxx_asr, GPR64, XZR, asr_operand_i64, sra>;
+def : InstAlias<"negs $Rd, $Rm", (SUBSxxx_lsl GPR64:$Rd, XZR, GPR64:$Rm, 0)>;
+
+//===-------------------------------
+// 4. 
The CMP/CMN aliases +//===------------------------------- + +multiclass cmp_shifts<string prefix, bit sf, bit op, bit commutable, + string asmop, SDPatternOperator opfrag, string sty, + RegisterClass GPR> { + let isCommutable = commutable, Rd = 0b11111, Defs = [NZCV] in { + def _lsl : A64I_addsubshift<sf, op, 0b1, 0b00, + (outs), + (ins GPR:$Rn, GPR:$Rm, + !cast<Operand>("lsl_operand_" # sty):$Imm6), + !strconcat(asmop, "\t$Rn, $Rm, $Imm6"), + [(set NZCV, (opfrag GPR:$Rn, (shl GPR:$Rm, + !cast<Operand>("lsl_operand_" # sty):$Imm6)) + )], + NoItinerary>; + + def _lsr : A64I_addsubshift<sf, op, 0b1, 0b01, + (outs), + (ins GPR:$Rn, GPR:$Rm, + !cast<Operand>("lsr_operand_" # sty):$Imm6), + !strconcat(asmop, "\t$Rn, $Rm, $Imm6"), + [(set NZCV, (opfrag GPR:$Rn, (srl GPR:$Rm, + !cast<Operand>("lsr_operand_" # sty):$Imm6)) + )], + NoItinerary>; + + def _asr : A64I_addsubshift<sf, op, 0b1, 0b10, + (outs), + (ins GPR:$Rn, GPR:$Rm, + !cast<Operand>("asr_operand_" # sty):$Imm6), + !strconcat(asmop, "\t$Rn, $Rm, $Imm6"), + [(set NZCV, (opfrag GPR:$Rn, (sra GPR:$Rm, + !cast<Operand>("asr_operand_" # sty):$Imm6)) + )], + NoItinerary>; + } + + def _noshift + : InstAlias<!strconcat(asmop, " $Rn, $Rm"), + (!cast<Instruction>(prefix # "_lsl") GPR:$Rn, GPR:$Rm, 0)>; + + def : Pat<(opfrag GPR:$Rn, GPR:$Rm), + (!cast<Instruction>(prefix # "_lsl") GPR:$Rn, GPR:$Rm, 0)>; +} + +defm CMPww : cmp_shifts<"CMPww", 0b0, 0b1, 0b0, "cmp", A64cmp, "i32", GPR32>; +defm CMPxx : cmp_shifts<"CMPxx", 0b1, 0b1, 0b0, "cmp", A64cmp, "i64", GPR64>; + +defm CMNww : cmp_shifts<"CMNww", 0b0, 0b0, 0b1, "cmn", A64cmn, "i32", GPR32>; +defm CMNxx : cmp_shifts<"CMNxx", 0b1, 0b0, 0b1, "cmn", A64cmn, "i64", GPR64>; + +//===----------------------------------------------------------------------===// +// Add-subtract (with carry) instructions +//===----------------------------------------------------------------------===// +// Contains: ADC, ADCS, SBC, SBCS + aliases NGC, NGCS + +multiclass A64I_addsubcarrySizes<bit op, bit s, string asmop> { + let Uses = [NZCV] in { + def www : A64I_addsubcarry<0b0, op, s, 0b000000, + (outs GPR32:$Rd), (ins GPR32:$Rn, GPR32:$Rm), + !strconcat(asmop, "\t$Rd, $Rn, $Rm"), + [], NoItinerary>; + + def xxx : A64I_addsubcarry<0b1, op, s, 0b000000, + (outs GPR64:$Rd), (ins GPR64:$Rn, GPR64:$Rm), + !strconcat(asmop, "\t$Rd, $Rn, $Rm"), + [], NoItinerary>; + } +} + +let isCommutable = 1 in { + defm ADC : A64I_addsubcarrySizes<0b0, 0b0, "adc">; +} + +defm SBC : A64I_addsubcarrySizes<0b1, 0b0, "sbc">; + +let Defs = [NZCV] in { + let isCommutable = 1 in { + defm ADCS : A64I_addsubcarrySizes<0b0, 0b1, "adcs">; + } + + defm SBCS : A64I_addsubcarrySizes<0b1, 0b1, "sbcs">; +} + +def : InstAlias<"ngc $Rd, $Rm", (SBCwww GPR32:$Rd, WZR, GPR32:$Rm)>; +def : InstAlias<"ngc $Rd, $Rm", (SBCxxx GPR64:$Rd, XZR, GPR64:$Rm)>; +def : InstAlias<"ngcs $Rd, $Rm", (SBCSwww GPR32:$Rd, WZR, GPR32:$Rm)>; +def : InstAlias<"ngcs $Rd, $Rm", (SBCSxxx GPR64:$Rd, XZR, GPR64:$Rm)>; + +// Note that adde and sube can form a chain longer than two (e.g. for 256-bit +// addition). So the flag-setting instructions are appropriate. 
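+// For example, an i128 addition of (x1:x0) and (x3:x2) would be expected to
+// lower to a chain like:
+//     adds x0, x0, x2    ; low halves, sets the carry
+//     adcs x1, x1, x3    ; high halves, consumes (and re-sets) the carry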
+def : Pat<(adde GPR32:$Rn, GPR32:$Rm), (ADCSwww GPR32:$Rn, GPR32:$Rm)>; +def : Pat<(adde GPR64:$Rn, GPR64:$Rm), (ADCSxxx GPR64:$Rn, GPR64:$Rm)>; +def : Pat<(sube GPR32:$Rn, GPR32:$Rm), (SBCSwww GPR32:$Rn, GPR32:$Rm)>; +def : Pat<(sube GPR64:$Rn, GPR64:$Rm), (SBCSxxx GPR64:$Rn, GPR64:$Rm)>; + +//===----------------------------------------------------------------------===// +// Bitfield +//===----------------------------------------------------------------------===// +// Contains: SBFM, BFM, UBFM, [SU]XT[BHW], ASR, LSR, LSL, SBFI[ZX], BFI, BFXIL, +// UBFIZ, UBFX + +// Because of the rather complicated nearly-overlapping aliases, the decoding of +// this range of instructions is handled manually. The architectural +// instructions are BFM, SBFM and UBFM but a disassembler should never produce +// these. +// +// In the end, the best option was to use BFM instructions for decoding under +// almost all circumstances, but to create aliasing *Instructions* for each of +// the canonical forms and specify a completely custom decoder which would +// substitute the correct MCInst as needed. +// +// This also simplifies instruction selection, parsing etc because the MCInsts +// have a shape that's closer to their use in code. + +//===------------------------------- +// 1. The architectural BFM instructions +//===------------------------------- + +def uimm5_asmoperand : AsmOperandClass { + let Name = "UImm5"; + let PredicateMethod = "isUImm<5>"; + let RenderMethod = "addImmOperands"; + let DiagnosticType = "UImm5"; +} + +def uimm6_asmoperand : AsmOperandClass { + let Name = "UImm6"; + let PredicateMethod = "isUImm<6>"; + let RenderMethod = "addImmOperands"; + let DiagnosticType = "UImm6"; +} + +def bitfield32_imm : Operand<i64>, + ImmLeaf<i64, [{ return Imm >= 0 && Imm < 32; }]> { + let ParserMatchClass = uimm5_asmoperand; + + let DecoderMethod = "DecodeBitfield32ImmOperand"; +} + + +def bitfield64_imm : Operand<i64>, + ImmLeaf<i64, [{ return Imm >= 0 && Imm < 64; }]> { + let ParserMatchClass = uimm6_asmoperand; + + // Default decoder works in 64-bit case: the 6-bit field can take any value. +} + +multiclass A64I_bitfieldSizes<bits<2> opc, string asmop> { + def wwii : A64I_bitfield<0b0, opc, 0b0, (outs GPR32:$Rd), + (ins GPR32:$Rn, bitfield32_imm:$ImmR, bitfield32_imm:$ImmS), + !strconcat(asmop, "\t$Rd, $Rn, $ImmR, $ImmS"), + [], NoItinerary> { + let DecoderMethod = "DecodeBitfieldInstruction"; + } + + def xxii : A64I_bitfield<0b1, opc, 0b1, (outs GPR64:$Rd), + (ins GPR64:$Rn, bitfield64_imm:$ImmR, bitfield64_imm:$ImmS), + !strconcat(asmop, "\t$Rd, $Rn, $ImmR, $ImmS"), + [], NoItinerary> { + let DecoderMethod = "DecodeBitfieldInstruction"; + } +} + +defm SBFM : A64I_bitfieldSizes<0b00, "sbfm">; +defm UBFM : A64I_bitfieldSizes<0b10, "ubfm">; + +// BFM instructions modify the destination register rather than defining it +// completely. +def BFMwwii : + A64I_bitfield<0b0, 0b01, 0b0, (outs GPR32:$Rd), + (ins GPR32:$src, GPR32:$Rn, bitfield32_imm:$ImmR, bitfield32_imm:$ImmS), + "bfm\t$Rd, $Rn, $ImmR, $ImmS", [], NoItinerary> { + let DecoderMethod = "DecodeBitfieldInstruction"; + let Constraints = "$src = $Rd"; +} + +def BFMxxii : + A64I_bitfield<0b1, 0b01, 0b1, (outs GPR64:$Rd), + (ins GPR64:$src, GPR64:$Rn, bitfield64_imm:$ImmR, bitfield64_imm:$ImmS), + "bfm\t$Rd, $Rn, $ImmR, $ImmS", [], NoItinerary> { + let DecoderMethod = "DecodeBitfieldInstruction"; + let Constraints = "$src = $Rd"; +} + + +//===------------------------------- +// 2. 
Extend aliases to 64-bit dest
+//===-------------------------------
+
+// Unfortunately the extensions that end up as 64-bits cannot be handled by an
+// instruction alias: their syntax is (for example) "SXTB x0, w0", which needs
+// to be mapped to "SBFM x0, x0, #0, 7" (changing the class of Rn). InstAlias
+// is not capable of such a map as far as I'm aware.
+
+// Note that these instructions are strictly more specific than the
+// BFM ones (in ImmR) so they can handle their own decoding.
+class A64I_bf_ext<bit sf, bits<2> opc, RegisterClass GPRDest, string asmop,
+                  bits<6> imms, dag pattern>
+  : A64I_bitfield<sf, opc, sf,
+                  (outs GPRDest:$Rd), (ins GPR32:$Rn),
+                  !strconcat(asmop, "\t$Rd, $Rn"),
+                  [(set GPRDest:$Rd, pattern)], NoItinerary> {
+  let ImmR = 0b000000;
+  let ImmS = imms;
+}
+
+// Signed extensions
+def SXTBxw : A64I_bf_ext<0b1, 0b00, GPR64, "sxtb", 7,
+                         (sext_inreg (anyext GPR32:$Rn), i8)>;
+def SXTBww : A64I_bf_ext<0b0, 0b00, GPR32, "sxtb", 7,
+                         (sext_inreg GPR32:$Rn, i8)>;
+def SXTHxw : A64I_bf_ext<0b1, 0b00, GPR64, "sxth", 15,
+                         (sext_inreg (anyext GPR32:$Rn), i16)>;
+def SXTHww : A64I_bf_ext<0b0, 0b00, GPR32, "sxth", 15,
+                         (sext_inreg GPR32:$Rn, i16)>;
+def SXTWxw : A64I_bf_ext<0b1, 0b00, GPR64, "sxtw", 31, (sext GPR32:$Rn)>;
+
+// Unsigned extensions
+def UXTBww : A64I_bf_ext<0b0, 0b10, GPR32, "uxtb", 7,
+                         (and GPR32:$Rn, 255)>;
+def UXTHww : A64I_bf_ext<0b0, 0b10, GPR32, "uxth", 15,
+                         (and GPR32:$Rn, 65535)>;
+
+// The 64-bit unsigned variants are not strictly architectural but recommended
+// for consistency.
+let isAsmParserOnly = 1 in {
+  def UXTBxw : A64I_bf_ext<0b0, 0b10, GPR64, "uxtb", 7,
+                           (and (anyext GPR32:$Rn), 255)>;
+  def UXTHxw : A64I_bf_ext<0b0, 0b10, GPR64, "uxth", 15,
+                           (and (anyext GPR32:$Rn), 65535)>;
+}
+
+// Extra patterns for when the source register is actually 64-bits
+// too. There's no architectural difference here; it's just LLVM
+// shenanigans. There's no need for equivalent zero-extension patterns
+// because they'll already be caught by logical (immediate) matching.
+def : Pat<(sext_inreg GPR64:$Rn, i8),
+          (SXTBxw (EXTRACT_SUBREG GPR64:$Rn, sub_32))>;
+def : Pat<(sext_inreg GPR64:$Rn, i16),
+          (SXTHxw (EXTRACT_SUBREG GPR64:$Rn, sub_32))>;
+def : Pat<(sext_inreg GPR64:$Rn, i32),
+          (SXTWxw (EXTRACT_SUBREG GPR64:$Rn, sub_32))>;
+
+
+//===-------------------------------
+// 3. Aliases for ASR and LSR (the simple shifts)
+//===-------------------------------
+
+// These also handle their own decoding because ImmS being set makes
+// them take precedence over BFM.
+multiclass A64I_shift<bits<2> opc, string asmop, SDNode opnode> {
+  def wwi : A64I_bitfield<0b0, opc, 0b0,
+                    (outs GPR32:$Rd), (ins GPR32:$Rn, bitfield32_imm:$ImmR),
+                    !strconcat(asmop, "\t$Rd, $Rn, $ImmR"),
+                    [(set GPR32:$Rd, (opnode GPR32:$Rn, bitfield32_imm:$ImmR))],
+                    NoItinerary> {
+    let ImmS = 31;
+  }
+
+  def xxi : A64I_bitfield<0b1, opc, 0b1,
+                    (outs GPR64:$Rd), (ins GPR64:$Rn, bitfield64_imm:$ImmR),
+                    !strconcat(asmop, "\t$Rd, $Rn, $ImmR"),
+                    [(set GPR64:$Rd, (opnode GPR64:$Rn, bitfield64_imm:$ImmR))],
+                    NoItinerary> {
+    let ImmS = 63;
+  }
+
+}
+
+defm ASR : A64I_shift<0b00, "asr", sra>;
+defm LSR : A64I_shift<0b10, "lsr", srl>;
+
+//===-------------------------------
+// 4. Aliases for LSL
+//===-------------------------------
+
+// Unfortunately LSL and subsequent aliases are much more complicated. We need
+// to be able to say certain output instruction fields depend in a complex
+// manner on combinations of input assembly fields.
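+// (For example, "lsl x0, x1, #3" is really "ubfm x0, x1, #61, #60": the single
+// assembly immediate feeds both ImmR = (64 - 3) % 64 and ImmS = 63 - 3.)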
+// +// MIOperandInfo *might* have been able to do it, but at the cost of +// significantly more C++ code. + +// N.b. contrary to usual practice these operands store the shift rather than +// the machine bits in an MCInst. The complexity overhead of consistency +// outweighed the benefits in this case (custom asmparser, printer and selection +// vs custom encoder). +def bitfield32_lsl_imm : Operand<i64>, + ImmLeaf<i64, [{ return Imm >= 0 && Imm <= 31; }]> { + let ParserMatchClass = uimm5_asmoperand; + let EncoderMethod = "getBitfield32LSLOpValue"; +} + +def bitfield64_lsl_imm : Operand<i64>, + ImmLeaf<i64, [{ return Imm >= 0 && Imm <= 63; }]> { + let ParserMatchClass = uimm6_asmoperand; + let EncoderMethod = "getBitfield64LSLOpValue"; +} + +class A64I_bitfield_lsl<bit sf, RegisterClass GPR, Operand operand> + : A64I_bitfield<sf, 0b10, sf, (outs GPR:$Rd), (ins GPR:$Rn, operand:$FullImm), + "lsl\t$Rd, $Rn, $FullImm", + [(set GPR:$Rd, (shl GPR:$Rn, operand:$FullImm))], + NoItinerary> { + bits<12> FullImm; + let ImmR = FullImm{5-0}; + let ImmS = FullImm{11-6}; + + // No disassembler allowed because it would overlap with BFM which does the + // actual work. + let isAsmParserOnly = 1; +} + +def LSLwwi : A64I_bitfield_lsl<0b0, GPR32, bitfield32_lsl_imm>; +def LSLxxi : A64I_bitfield_lsl<0b1, GPR64, bitfield64_lsl_imm>; + +//===------------------------------- +// 5. Aliases for bitfield extract instructions +//===------------------------------- + +def bfx32_width_asmoperand : AsmOperandClass { + let Name = "BFX32Width"; + let PredicateMethod = "isBitfieldWidth<32>"; + let RenderMethod = "addBFXWidthOperands"; + let DiagnosticType = "Width32"; +} + +def bfx32_width : Operand<i64>, ImmLeaf<i64, [{ return true; }]> { + let PrintMethod = "printBFXWidthOperand"; + let ParserMatchClass = bfx32_width_asmoperand; +} + +def bfx64_width_asmoperand : AsmOperandClass { + let Name = "BFX64Width"; + let PredicateMethod = "isBitfieldWidth<64>"; + let RenderMethod = "addBFXWidthOperands"; + let DiagnosticType = "Width64"; +} + +def bfx64_width : Operand<i64> { + let PrintMethod = "printBFXWidthOperand"; + let ParserMatchClass = bfx64_width_asmoperand; +} + + +multiclass A64I_bitfield_extract<bits<2> opc, string asmop, SDNode op> { + def wwii : A64I_bitfield<0b0, opc, 0b0, (outs GPR32:$Rd), + (ins GPR32:$Rn, bitfield32_imm:$ImmR, bfx32_width:$ImmS), + !strconcat(asmop, "\t$Rd, $Rn, $ImmR, $ImmS"), + [(set GPR32:$Rd, (op GPR32:$Rn, imm:$ImmR, imm:$ImmS))], + NoItinerary> { + // As above, no disassembler allowed. + let isAsmParserOnly = 1; + } + + def xxii : A64I_bitfield<0b1, opc, 0b1, (outs GPR64:$Rd), + (ins GPR64:$Rn, bitfield64_imm:$ImmR, bfx64_width:$ImmS), + !strconcat(asmop, "\t$Rd, $Rn, $ImmR, $ImmS"), + [(set GPR64:$Rd, (op GPR64:$Rn, imm:$ImmR, imm:$ImmS))], + NoItinerary> { + // As above, no disassembler allowed. + let isAsmParserOnly = 1; + } +} + +defm SBFX : A64I_bitfield_extract<0b00, "sbfx", A64Sbfx>; +defm UBFX : A64I_bitfield_extract<0b10, "ubfx", A64Ubfx>; + +// Again, variants based on BFM modify Rd so need it as an input too. +def BFXILwwii : A64I_bitfield<0b0, 0b01, 0b0, (outs GPR32:$Rd), + (ins GPR32:$src, GPR32:$Rn, bitfield32_imm:$ImmR, bfx32_width:$ImmS), + "bfxil\t$Rd, $Rn, $ImmR, $ImmS", [], NoItinerary> { + // As above, no disassembler allowed. 
+  let isAsmParserOnly = 1;
+  let Constraints = "$src = $Rd";
+}
+
+def BFXILxxii : A64I_bitfield<0b1, 0b01, 0b1, (outs GPR64:$Rd),
+                (ins GPR64:$src, GPR64:$Rn, bitfield64_imm:$ImmR, bfx64_width:$ImmS),
+                "bfxil\t$Rd, $Rn, $ImmR, $ImmS", [], NoItinerary> {
+  // As above, no disassembler allowed.
+  let isAsmParserOnly = 1;
+  let Constraints = "$src = $Rd";
+}
+
+// SBFX instructions can do a 1-instruction sign-extension of boolean values.
+def : Pat<(sext_inreg GPR64:$Rn, i1), (SBFXxxii GPR64:$Rn, 0, 0)>;
+def : Pat<(sext_inreg GPR32:$Rn, i1), (SBFXwwii GPR32:$Rn, 0, 0)>;
+def : Pat<(i64 (sext_inreg (anyext GPR32:$Rn), i1)),
+          (SBFXxxii (SUBREG_TO_REG (i64 0), GPR32:$Rn, sub_32), 0, 0)>;
+
+// UBFX makes sense as an implementation of a 64-bit zero-extension too. Could
+// use either 64-bit or 32-bit variant, but 32-bit might be more efficient.
+def : Pat<(zext GPR32:$Rn), (SUBREG_TO_REG (i64 0), (UBFXwwii GPR32:$Rn, 0, 31),
+                                           sub_32)>;
+
+//===-------------------------------
+// 6. Aliases for bitfield insert instructions
+//===-------------------------------
+
+def bfi32_lsb_asmoperand : AsmOperandClass {
+  let Name = "BFI32LSB";
+  let PredicateMethod = "isUImm<5>";
+  let RenderMethod = "addBFILSBOperands<32>";
+  let DiagnosticType = "UImm5";
+}
+
+def bfi32_lsb : Operand<i64>,
+                ImmLeaf<i64, [{ return Imm >= 0 && Imm <= 31; }]> {
+  let PrintMethod = "printBFILSBOperand<32>";
+  let ParserMatchClass = bfi32_lsb_asmoperand;
+}
+
+def bfi64_lsb_asmoperand : AsmOperandClass {
+  let Name = "BFI64LSB";
+  let PredicateMethod = "isUImm<6>";
+  let RenderMethod = "addBFILSBOperands<64>";
+  let DiagnosticType = "UImm6";
+}
+
+def bfi64_lsb : Operand<i64>,
+                ImmLeaf<i64, [{ return Imm >= 0 && Imm <= 63; }]> {
+  let PrintMethod = "printBFILSBOperand<64>";
+  let ParserMatchClass = bfi64_lsb_asmoperand;
+}
+
+// Width verification is performed during conversion, so the width operand can
+// be shared between the 32/64-bit cases. Separate print methods are still
+// needed though, because ImmS encodes "width - 1".
+def bfi32_width_asmoperand : AsmOperandClass {
+  let Name = "BFI32Width";
+  let PredicateMethod = "isBitfieldWidth<32>";
+  let RenderMethod = "addBFIWidthOperands";
+  let DiagnosticType = "Width32";
+}
+
+def bfi32_width : Operand<i64>,
+                  ImmLeaf<i64, [{ return Imm >= 1 && Imm <= 32; }]> {
+  let PrintMethod = "printBFIWidthOperand";
+  let ParserMatchClass = bfi32_width_asmoperand;
+}
+
+def bfi64_width_asmoperand : AsmOperandClass {
+  let Name = "BFI64Width";
+  let PredicateMethod = "isBitfieldWidth<64>";
+  let RenderMethod = "addBFIWidthOperands";
+  let DiagnosticType = "Width64";
+}
+
+def bfi64_width : Operand<i64>,
+                  ImmLeaf<i64, [{ return Imm >= 1 && Imm <= 64; }]> {
+  let PrintMethod = "printBFIWidthOperand";
+  let ParserMatchClass = bfi64_width_asmoperand;
+}
+
+multiclass A64I_bitfield_insert<bits<2> opc, string asmop> {
+  def wwii : A64I_bitfield<0b0, opc, 0b0, (outs GPR32:$Rd),
+                           (ins GPR32:$Rn, bfi32_lsb:$ImmR, bfi32_width:$ImmS),
+                           !strconcat(asmop, "\t$Rd, $Rn, $ImmR, $ImmS"),
+                           [], NoItinerary> {
+    // As above, no disassembler allowed.
+    let isAsmParserOnly = 1;
+  }
+
+  def xxii : A64I_bitfield<0b1, opc, 0b1, (outs GPR64:$Rd),
+                           (ins GPR64:$Rn, bfi64_lsb:$ImmR, bfi64_width:$ImmS),
+                           !strconcat(asmop, "\t$Rd, $Rn, $ImmR, $ImmS"),
+                           [], NoItinerary> {
+    // As above, no disassembler allowed.
+ let isAsmParserOnly = 1; + } +} + +defm SBFIZ : A64I_bitfield_insert<0b00, "sbfiz">; +defm UBFIZ : A64I_bitfield_insert<0b10, "ubfiz">; + + +def BFIwwii : A64I_bitfield<0b0, 0b01, 0b0, (outs GPR32:$Rd), + (ins GPR32:$src, GPR32:$Rn, bfi32_lsb:$ImmR, bfi32_width:$ImmS), + "bfi\t$Rd, $Rn, $ImmR, $ImmS", [], NoItinerary> { + // As above, no disassembler allowed. + let isAsmParserOnly = 1; + let Constraints = "$src = $Rd"; +} + +def BFIxxii : A64I_bitfield<0b1, 0b01, 0b1, (outs GPR64:$Rd), + (ins GPR64:$src, GPR64:$Rn, bfi64_lsb:$ImmR, bfi64_width:$ImmS), + "bfi\t$Rd, $Rn, $ImmR, $ImmS", [], NoItinerary> { + // As above, no disassembler allowed. + let isAsmParserOnly = 1; + let Constraints = "$src = $Rd"; +} + +//===----------------------------------------------------------------------===// +// Compare and branch (immediate) +//===----------------------------------------------------------------------===// +// Contains: CBZ, CBNZ + +class label_asmoperand<int width, int scale> : AsmOperandClass { + let Name = "Label" # width # "_" # scale; + let PredicateMethod = "isLabel<" # width # "," # scale # ">"; + let RenderMethod = "addLabelOperands<" # width # ", " # scale # ">"; + let DiagnosticType = "Label"; +} + +def label_wid19_scal4_asmoperand : label_asmoperand<19, 4>; + +// All conditional immediate branches are the same really: 19 signed bits scaled +// by the instruction-size (4). +def bcc_target : Operand<OtherVT> { + // This label is a 19-bit offset from PC, scaled by the instruction-width: 4. + let ParserMatchClass = label_wid19_scal4_asmoperand; + let PrintMethod = "printLabelOperand<19, 4>"; + let EncoderMethod = "getLabelOpValue<AArch64::fixup_a64_condbr>"; + let OperandType = "OPERAND_PCREL"; +} + +multiclass cmpbr_sizes<bit op, string asmop, ImmLeaf SETOP> { + let isBranch = 1, isTerminator = 1 in { + def x : A64I_cmpbr<0b1, op, + (outs), + (ins GPR64:$Rt, bcc_target:$Label), + !strconcat(asmop,"\t$Rt, $Label"), + [(A64br_cc (A64cmp GPR64:$Rt, 0), SETOP, bb:$Label)], + NoItinerary>; + + def w : A64I_cmpbr<0b0, op, + (outs), + (ins GPR32:$Rt, bcc_target:$Label), + !strconcat(asmop,"\t$Rt, $Label"), + [(A64br_cc (A64cmp GPR32:$Rt, 0), SETOP, bb:$Label)], + NoItinerary>; + } +} + +defm CBZ : cmpbr_sizes<0b0, "cbz", ImmLeaf<i32, [{ + return Imm == A64CC::EQ; +}]> >; +defm CBNZ : cmpbr_sizes<0b1, "cbnz", ImmLeaf<i32, [{ + return Imm == A64CC::NE; +}]> >; + +//===----------------------------------------------------------------------===// +// Conditional branch (immediate) instructions +//===----------------------------------------------------------------------===// +// Contains: B.cc + +def cond_code_asmoperand : AsmOperandClass { + let Name = "CondCode"; + let DiagnosticType = "CondCode"; +} + +def cond_code : Operand<i32>, ImmLeaf<i32, [{ + return Imm >= 0 && Imm <= 15; +}]> { + let PrintMethod = "printCondCodeOperand"; + let ParserMatchClass = cond_code_asmoperand; +} + +def Bcc : A64I_condbr<0b0, 0b0, (outs), + (ins cond_code:$Cond, bcc_target:$Label), + "b.$Cond $Label", [(A64br_cc NZCV, (i32 imm:$Cond), bb:$Label)], + NoItinerary> { + let Uses = [NZCV]; + let isBranch = 1; + let isTerminator = 1; +} + +//===----------------------------------------------------------------------===// +// Conditional compare (immediate) instructions +//===----------------------------------------------------------------------===// +// Contains: CCMN, CCMP + +def uimm4_asmoperand : AsmOperandClass { + let Name = "UImm4"; + let PredicateMethod = "isUImm<4>"; + let RenderMethod = "addImmOperands"; + let 
DiagnosticType = "UImm4"; +} + +def uimm4 : Operand<i32> { + let ParserMatchClass = uimm4_asmoperand; +} + +def uimm5 : Operand<i32> { + let ParserMatchClass = uimm5_asmoperand; +} + +// The only difference between this operand and the one for instructions like +// B.cc is that it's parsed manually. The other get parsed implicitly as part of +// the mnemonic handling. +def cond_code_op_asmoperand : AsmOperandClass { + let Name = "CondCodeOp"; + let RenderMethod = "addCondCodeOperands"; + let PredicateMethod = "isCondCode"; + let ParserMethod = "ParseCondCodeOperand"; + let DiagnosticType = "CondCode"; +} + +def cond_code_op : Operand<i32> { + let PrintMethod = "printCondCodeOperand"; + let ParserMatchClass = cond_code_op_asmoperand; +} + +class A64I_condcmpimmImpl<bit sf, bit op, RegisterClass GPR, string asmop> + : A64I_condcmpimm<sf, op, 0b0, 0b0, 0b1, (outs), + (ins GPR:$Rn, uimm5:$UImm5, uimm4:$NZCVImm, cond_code_op:$Cond), + !strconcat(asmop, "\t$Rn, $UImm5, $NZCVImm, $Cond"), + [], NoItinerary> { + let Defs = [NZCV]; +} + +def CCMNwi : A64I_condcmpimmImpl<0b0, 0b0, GPR32, "ccmn">; +def CCMNxi : A64I_condcmpimmImpl<0b1, 0b0, GPR64, "ccmn">; +def CCMPwi : A64I_condcmpimmImpl<0b0, 0b1, GPR32, "ccmp">; +def CCMPxi : A64I_condcmpimmImpl<0b1, 0b1, GPR64, "ccmp">; + +//===----------------------------------------------------------------------===// +// Conditional compare (register) instructions +//===----------------------------------------------------------------------===// +// Contains: CCMN, CCMP + +class A64I_condcmpregImpl<bit sf, bit op, RegisterClass GPR, string asmop> + : A64I_condcmpreg<sf, op, 0b0, 0b0, 0b1, + (outs), + (ins GPR:$Rn, GPR:$Rm, uimm4:$NZCVImm, cond_code_op:$Cond), + !strconcat(asmop, "\t$Rn, $Rm, $NZCVImm, $Cond"), + [], NoItinerary> { + let Defs = [NZCV]; +} + +def CCMNww : A64I_condcmpregImpl<0b0, 0b0, GPR32, "ccmn">; +def CCMNxx : A64I_condcmpregImpl<0b1, 0b0, GPR64, "ccmn">; +def CCMPww : A64I_condcmpregImpl<0b0, 0b1, GPR32, "ccmp">; +def CCMPxx : A64I_condcmpregImpl<0b1, 0b1, GPR64, "ccmp">; + +//===----------------------------------------------------------------------===// +// Conditional select instructions +//===----------------------------------------------------------------------===// +// Contains: CSEL, CSINC, CSINV, CSNEG + aliases CSET, CSETM, CINC, CINV, CNEG + +// Condition code which is encoded as the inversion (semantically rather than +// bitwise) in the instruction. +def inv_cond_code_op_asmoperand : AsmOperandClass { + let Name = "InvCondCodeOp"; + let RenderMethod = "addInvCondCodeOperands"; + let PredicateMethod = "isCondCode"; + let ParserMethod = "ParseCondCodeOperand"; + let DiagnosticType = "CondCode"; +} + +def inv_cond_code_op : Operand<i32> { + let ParserMatchClass = inv_cond_code_op_asmoperand; +} + +// Having a separate operand for the selectable use-case is debatable, but gives +// consistency with cond_code. 
+def inv_cond_XFORM : SDNodeXForm<imm, [{ + A64CC::CondCodes CC = static_cast<A64CC::CondCodes>(N->getZExtValue()); + return CurDAG->getTargetConstant(A64InvertCondCode(CC), MVT::i32); +}]>; + +def inv_cond_code + : ImmLeaf<i32, [{ return Imm >= 0 && Imm <= 15; }], inv_cond_XFORM>; + + +multiclass A64I_condselSizes<bit op, bits<2> op2, string asmop, + SDPatternOperator select> { + let Uses = [NZCV] in { + def wwwc : A64I_condsel<0b0, op, 0b0, op2, + (outs GPR32:$Rd), + (ins GPR32:$Rn, GPR32:$Rm, cond_code_op:$Cond), + !strconcat(asmop, "\t$Rd, $Rn, $Rm, $Cond"), + [(set GPR32:$Rd, (select GPR32:$Rn, GPR32:$Rm))], + NoItinerary>; + + + def xxxc : A64I_condsel<0b1, op, 0b0, op2, + (outs GPR64:$Rd), + (ins GPR64:$Rn, GPR64:$Rm, cond_code_op:$Cond), + !strconcat(asmop, "\t$Rd, $Rn, $Rm, $Cond"), + [(set GPR64:$Rd, (select GPR64:$Rn, GPR64:$Rm))], + NoItinerary>; + } +} + +def simple_select + : PatFrag<(ops node:$lhs, node:$rhs), + (A64select_cc NZCV, node:$lhs, node:$rhs, (i32 imm:$Cond))>; + +class complex_select<SDPatternOperator opnode> + : PatFrag<(ops node:$lhs, node:$rhs), + (A64select_cc NZCV, node:$lhs, (opnode node:$rhs), (i32 imm:$Cond))>; + + +defm CSEL : A64I_condselSizes<0b0, 0b00, "csel", simple_select>; +defm CSINC : A64I_condselSizes<0b0, 0b01, "csinc", + complex_select<PatFrag<(ops node:$val), + (add node:$val, 1)>>>; +defm CSINV : A64I_condselSizes<0b1, 0b00, "csinv", complex_select<not>>; +defm CSNEG : A64I_condselSizes<0b1, 0b01, "csneg", complex_select<ineg>>; + +// Now the instruction aliases, which fit nicely into LLVM's model: + +def : InstAlias<"cset $Rd, $Cond", + (CSINCwwwc GPR32:$Rd, WZR, WZR, inv_cond_code_op:$Cond)>; +def : InstAlias<"cset $Rd, $Cond", + (CSINCxxxc GPR64:$Rd, XZR, XZR, inv_cond_code_op:$Cond)>; +def : InstAlias<"csetm $Rd, $Cond", + (CSINVwwwc GPR32:$Rd, WZR, WZR, inv_cond_code_op:$Cond)>; +def : InstAlias<"csetm $Rd, $Cond", + (CSINVxxxc GPR64:$Rd, XZR, XZR, inv_cond_code_op:$Cond)>; +def : InstAlias<"cinc $Rd, $Rn, $Cond", + (CSINCwwwc GPR32:$Rd, GPR32:$Rn, GPR32:$Rn, inv_cond_code_op:$Cond)>; +def : InstAlias<"cinc $Rd, $Rn, $Cond", + (CSINCxxxc GPR64:$Rd, GPR64:$Rn, GPR64:$Rn, inv_cond_code_op:$Cond)>; +def : InstAlias<"cinv $Rd, $Rn, $Cond", + (CSINVwwwc GPR32:$Rd, GPR32:$Rn, GPR32:$Rn, inv_cond_code_op:$Cond)>; +def : InstAlias<"cinv $Rd, $Rn, $Cond", + (CSINVxxxc GPR64:$Rd, GPR64:$Rn, GPR64:$Rn, inv_cond_code_op:$Cond)>; +def : InstAlias<"cneg $Rd, $Rn, $Cond", + (CSNEGwwwc GPR32:$Rd, GPR32:$Rn, GPR32:$Rn, inv_cond_code_op:$Cond)>; +def : InstAlias<"cneg $Rd, $Rn, $Cond", + (CSNEGxxxc GPR64:$Rd, GPR64:$Rn, GPR64:$Rn, inv_cond_code_op:$Cond)>; + +// Finally some helper patterns. + +// For CSET (a.k.a. zero-extension of icmp) +def : Pat<(A64select_cc NZCV, 0, 1, cond_code:$Cond), + (CSINCwwwc WZR, WZR, cond_code:$Cond)>; +def : Pat<(A64select_cc NZCV, 1, 0, inv_cond_code:$Cond), + (CSINCwwwc WZR, WZR, inv_cond_code:$Cond)>; + +def : Pat<(A64select_cc NZCV, 0, 1, cond_code:$Cond), + (CSINCxxxc XZR, XZR, cond_code:$Cond)>; +def : Pat<(A64select_cc NZCV, 1, 0, inv_cond_code:$Cond), + (CSINCxxxc XZR, XZR, inv_cond_code:$Cond)>; + +// For CSETM (a.k.a. 
sign-extension of icmp)
+def : Pat<(A64select_cc NZCV, 0, -1, cond_code:$Cond),
+          (CSINVwwwc WZR, WZR, cond_code:$Cond)>;
+def : Pat<(A64select_cc NZCV, -1, 0, inv_cond_code:$Cond),
+          (CSINVwwwc WZR, WZR, inv_cond_code:$Cond)>;
+
+def : Pat<(A64select_cc NZCV, 0, -1, cond_code:$Cond),
+          (CSINVxxxc XZR, XZR, cond_code:$Cond)>;
+def : Pat<(A64select_cc NZCV, -1, 0, inv_cond_code:$Cond),
+          (CSINVxxxc XZR, XZR, inv_cond_code:$Cond)>;
+
+// CINC, CINV and CNEG get dealt with automatically, which leaves the issue of
+// commutativity. The instructions are too complex for isCommutable to be used,
+// so we have to create the patterns manually:
+
+// No commutable pattern for CSEL since the commuted version is isomorphic.
+
+// CSINC
+def : Pat<(A64select_cc NZCV, (add GPR32:$Rm, 1), GPR32:$Rn,
+          inv_cond_code:$Cond),
+          (CSINCwwwc GPR32:$Rn, GPR32:$Rm, inv_cond_code:$Cond)>;
+def : Pat<(A64select_cc NZCV, (add GPR64:$Rm, 1), GPR64:$Rn,
+          inv_cond_code:$Cond),
+          (CSINCxxxc GPR64:$Rn, GPR64:$Rm, inv_cond_code:$Cond)>;
+
+// CSINV
+def : Pat<(A64select_cc NZCV, (not GPR32:$Rm), GPR32:$Rn, inv_cond_code:$Cond),
+          (CSINVwwwc GPR32:$Rn, GPR32:$Rm, inv_cond_code:$Cond)>;
+def : Pat<(A64select_cc NZCV, (not GPR64:$Rm), GPR64:$Rn, inv_cond_code:$Cond),
+          (CSINVxxxc GPR64:$Rn, GPR64:$Rm, inv_cond_code:$Cond)>;
+
+// CSNEG
+def : Pat<(A64select_cc NZCV, (ineg GPR32:$Rm), GPR32:$Rn, inv_cond_code:$Cond),
+          (CSNEGwwwc GPR32:$Rn, GPR32:$Rm, inv_cond_code:$Cond)>;
+def : Pat<(A64select_cc NZCV, (ineg GPR64:$Rm), GPR64:$Rn, inv_cond_code:$Cond),
+          (CSNEGxxxc GPR64:$Rn, GPR64:$Rm, inv_cond_code:$Cond)>;
+
+//===----------------------------------------------------------------------===//
+// Data Processing (1 source) instructions
+//===----------------------------------------------------------------------===//
+// Contains: RBIT, REV16, REV, REV32, CLZ, CLS.
+
+// We define a unary operator which always fails. We will use this to
+// define unary operators that cannot be matched.
+ +class A64I_dp_1src_impl<bit sf, bits<6> opcode, string asmop, + list<dag> patterns, RegisterClass GPRrc, + InstrItinClass itin>: + A64I_dp_1src<sf, + 0, + 0b00000, + opcode, + !strconcat(asmop, "\t$Rd, $Rn"), + (outs GPRrc:$Rd), + (ins GPRrc:$Rn), + patterns, + itin>; + +multiclass A64I_dp_1src <bits<6> opcode, string asmop> { + let hasSideEffects = 0 in { + def ww : A64I_dp_1src_impl<0b0, opcode, asmop, [], GPR32, NoItinerary>; + def xx : A64I_dp_1src_impl<0b1, opcode, asmop, [], GPR64, NoItinerary>; + } +} + +defm RBIT : A64I_dp_1src<0b000000, "rbit">; +defm CLS : A64I_dp_1src<0b000101, "cls">; +defm CLZ : A64I_dp_1src<0b000100, "clz">; + +def : Pat<(ctlz GPR32:$Rn), (CLZww GPR32:$Rn)>; +def : Pat<(ctlz GPR64:$Rn), (CLZxx GPR64:$Rn)>; +def : Pat<(ctlz_zero_undef GPR32:$Rn), (CLZww GPR32:$Rn)>; +def : Pat<(ctlz_zero_undef GPR64:$Rn), (CLZxx GPR64:$Rn)>; + +def : Pat<(cttz GPR32:$Rn), (CLZww (RBITww GPR32:$Rn))>; +def : Pat<(cttz GPR64:$Rn), (CLZxx (RBITxx GPR64:$Rn))>; +def : Pat<(cttz_zero_undef GPR32:$Rn), (CLZww (RBITww GPR32:$Rn))>; +def : Pat<(cttz_zero_undef GPR64:$Rn), (CLZxx (RBITxx GPR64:$Rn))>; + + +def REVww : A64I_dp_1src_impl<0b0, 0b000010, "rev", + [(set GPR32:$Rd, (bswap GPR32:$Rn))], + GPR32, NoItinerary>; +def REVxx : A64I_dp_1src_impl<0b1, 0b000011, "rev", + [(set GPR64:$Rd, (bswap GPR64:$Rn))], + GPR64, NoItinerary>; +def REV32xx : A64I_dp_1src_impl<0b1, 0b000010, "rev32", + [(set GPR64:$Rd, (bswap (rotr GPR64:$Rn, (i64 32))))], + GPR64, NoItinerary>; +def REV16ww : A64I_dp_1src_impl<0b0, 0b000001, "rev16", + [(set GPR32:$Rd, (bswap (rotr GPR32:$Rn, (i64 16))))], + GPR32, + NoItinerary>; +def REV16xx : A64I_dp_1src_impl<0b1, 0b000001, "rev16", [], GPR64, NoItinerary>; + +//===----------------------------------------------------------------------===// +// Data Processing (2 sources) instructions +//===----------------------------------------------------------------------===// +// Contains: CRC32C?[BHWX], UDIV, SDIV, LSLV, LSRV, ASRV, RORV + aliases LSL, +// LSR, ASR, ROR + + +class dp_2src_impl<bit sf, bits<6> opcode, string asmop, list<dag> patterns, + RegisterClass GPRsp, + InstrItinClass itin>: + A64I_dp_2src<sf, + opcode, + 0, + !strconcat(asmop, "\t$Rd, $Rn, $Rm"), + (outs GPRsp:$Rd), + (ins GPRsp:$Rn, GPRsp:$Rm), + patterns, + itin>; + +multiclass dp_2src_crc<bit c, string asmop> { + def B_www : dp_2src_impl<0b0, {0, 1, 0, c, 0, 0}, + !strconcat(asmop, "b"), [], GPR32, NoItinerary>; + def H_www : dp_2src_impl<0b0, {0, 1, 0, c, 0, 1}, + !strconcat(asmop, "h"), [], GPR32, NoItinerary>; + def W_www : dp_2src_impl<0b0, {0, 1, 0, c, 1, 0}, + !strconcat(asmop, "w"), [], GPR32, NoItinerary>; + def X_wwx : A64I_dp_2src<0b1, {0, 1, 0, c, 1, 1}, 0b0, + !strconcat(asmop, "x\t$Rd, $Rn, $Rm"), + (outs GPR32:$Rd), (ins GPR32:$Rn, GPR64:$Rm), [], + NoItinerary>; +} + +multiclass dp_2src_zext <bits<6> opcode, string asmop, SDPatternOperator op> { + def www : dp_2src_impl<0b0, + opcode, + asmop, + [(set GPR32:$Rd, + (op GPR32:$Rn, (i64 (zext GPR32:$Rm))))], + GPR32, + NoItinerary>; + def xxx : dp_2src_impl<0b1, + opcode, + asmop, + [(set GPR64:$Rd, (op GPR64:$Rn, GPR64:$Rm))], + GPR64, + NoItinerary>; +} + + +multiclass dp_2src <bits<6> opcode, string asmop, SDPatternOperator op> { + def www : dp_2src_impl<0b0, + opcode, + asmop, + [(set GPR32:$Rd, (op GPR32:$Rn, GPR32:$Rm))], + GPR32, + NoItinerary>; + def xxx : dp_2src_impl<0b1, + opcode, + asmop, + [(set GPR64:$Rd, (op GPR64:$Rn, GPR64:$Rm))], + GPR64, + NoItinerary>; +} + +// Here we define the data processing 2 source 
instructions.
+defm CRC32 : dp_2src_crc<0b0, "crc32">;
+defm CRC32C : dp_2src_crc<0b1, "crc32c">;
+
+defm UDIV : dp_2src<0b000010, "udiv", udiv>;
+defm SDIV : dp_2src<0b000011, "sdiv", sdiv>;
+
+defm LSLV : dp_2src_zext<0b001000, "lsl", shl>;
+defm LSRV : dp_2src_zext<0b001001, "lsr", srl>;
+defm ASRV : dp_2src_zext<0b001010, "asr", sra>;
+defm RORV : dp_2src_zext<0b001011, "ror", rotr>;
+
+// Extra patterns for an incoming 64-bit value for a 32-bit
+// operation. Since the LLVM operations are undefined (as in C) if the
+// RHS is out of range, it's perfectly permissible to discard the high
+// bits of the GPR64.
+def : Pat<(shl GPR32:$Rn, GPR64:$Rm),
+          (LSLVwww GPR32:$Rn, (EXTRACT_SUBREG GPR64:$Rm, sub_32))>;
+def : Pat<(srl GPR32:$Rn, GPR64:$Rm),
+          (LSRVwww GPR32:$Rn, (EXTRACT_SUBREG GPR64:$Rm, sub_32))>;
+def : Pat<(sra GPR32:$Rn, GPR64:$Rm),
+          (ASRVwww GPR32:$Rn, (EXTRACT_SUBREG GPR64:$Rm, sub_32))>;
+def : Pat<(rotr GPR32:$Rn, GPR64:$Rm),
+          (RORVwww GPR32:$Rn, (EXTRACT_SUBREG GPR64:$Rm, sub_32))>;
+
+// Here we define the aliases for the data processing 2 source instructions.
+def LSL_mnemonic : MnemonicAlias<"lslv", "lsl">;
+def LSR_mnemonic : MnemonicAlias<"lsrv", "lsr">;
+def ASR_mnemonic : MnemonicAlias<"asrv", "asr">;
+def ROR_mnemonic : MnemonicAlias<"rorv", "ror">;
+
+//===----------------------------------------------------------------------===//
+// Data Processing (3 sources) instructions
+//===----------------------------------------------------------------------===//
+// Contains: MADD, MSUB, SMADDL, SMSUBL, SMULH, UMADDL, UMSUBL, UMULH
+//    + aliases MUL, MNEG, SMULL, SMNEGL, UMULL, UMNEGL
+
+class A64I_dp3_4operand<bit sf, bits<6> opcode, RegisterClass AccReg,
+                        RegisterClass SrcReg, string asmop, dag pattern>
+  : A64I_dp3<sf, opcode,
+             (outs AccReg:$Rd), (ins SrcReg:$Rn, SrcReg:$Rm, AccReg:$Ra),
+             !strconcat(asmop, "\t$Rd, $Rn, $Rm, $Ra"),
+             [(set AccReg:$Rd, pattern)], NoItinerary> {
+  RegisterClass AccGPR = AccReg;
+  RegisterClass SrcGPR = SrcReg;
+}
+
+def MADDwwww : A64I_dp3_4operand<0b0, 0b000000, GPR32, GPR32, "madd",
+                                 (add GPR32:$Ra, (mul GPR32:$Rn, GPR32:$Rm))>;
+def MADDxxxx : A64I_dp3_4operand<0b1, 0b000000, GPR64, GPR64, "madd",
+                                 (add GPR64:$Ra, (mul GPR64:$Rn, GPR64:$Rm))>;
+
+def MSUBwwww : A64I_dp3_4operand<0b0, 0b000001, GPR32, GPR32, "msub",
+                                 (sub GPR32:$Ra, (mul GPR32:$Rn, GPR32:$Rm))>;
+def MSUBxxxx : A64I_dp3_4operand<0b1, 0b000001, GPR64, GPR64, "msub",
+                                 (sub GPR64:$Ra, (mul GPR64:$Rn, GPR64:$Rm))>;
+
+def SMADDLxwwx : A64I_dp3_4operand<0b1, 0b000010, GPR64, GPR32, "smaddl",
+               (add GPR64:$Ra, (mul (i64 (sext GPR32:$Rn)), (sext GPR32:$Rm)))>;
+def SMSUBLxwwx : A64I_dp3_4operand<0b1, 0b000011, GPR64, GPR32, "smsubl",
+               (sub GPR64:$Ra, (mul (i64 (sext GPR32:$Rn)), (sext GPR32:$Rm)))>;
+
+def UMADDLxwwx : A64I_dp3_4operand<0b1, 0b001010, GPR64, GPR32, "umaddl",
+               (add GPR64:$Ra, (mul (i64 (zext GPR32:$Rn)), (zext GPR32:$Rm)))>;
+def UMSUBLxwwx : A64I_dp3_4operand<0b1, 0b001011, GPR64, GPR32, "umsubl",
+               (sub GPR64:$Ra, (mul (i64 (zext GPR32:$Rn)), (zext GPR32:$Rm)))>;
+
+let isCommutable = 1, PostEncoderMethod = "fixMulHigh" in {
+  def UMULHxxx : A64I_dp3<0b1, 0b001100, (outs GPR64:$Rd),
+                          (ins GPR64:$Rn, GPR64:$Rm),
+                          "umulh\t$Rd, $Rn, $Rm",
+                          [(set GPR64:$Rd, (mulhu GPR64:$Rn, GPR64:$Rm))],
+                          NoItinerary>;
+
+  def SMULHxxx : A64I_dp3<0b1, 0b000100, (outs GPR64:$Rd),
+                          (ins GPR64:$Rn, GPR64:$Rm),
+                          "smulh\t$Rd, $Rn, $Rm",
+                          [(set GPR64:$Rd, (mulhs GPR64:$Rn, GPR64:$Rm))],
+                          NoItinerary>;
+}
+
+multiclass A64I_dp3_3operand<string asmop, 
A64I_dp3_4operand INST, + Register ZR, dag pattern> { + def : InstAlias<asmop # " $Rd, $Rn, $Rm", + (INST INST.AccGPR:$Rd, INST.SrcGPR:$Rn, INST.SrcGPR:$Rm, ZR)>; + + def : Pat<pattern, (INST INST.SrcGPR:$Rn, INST.SrcGPR:$Rm, ZR)>; +} + +defm : A64I_dp3_3operand<"mul", MADDwwww, WZR, (mul GPR32:$Rn, GPR32:$Rm)>; +defm : A64I_dp3_3operand<"mul", MADDxxxx, XZR, (mul GPR64:$Rn, GPR64:$Rm)>; + +defm : A64I_dp3_3operand<"mneg", MSUBwwww, WZR, + (sub 0, (mul GPR32:$Rn, GPR32:$Rm))>; +defm : A64I_dp3_3operand<"mneg", MSUBxxxx, XZR, + (sub 0, (mul GPR64:$Rn, GPR64:$Rm))>; + +defm : A64I_dp3_3operand<"smull", SMADDLxwwx, XZR, + (mul (i64 (sext GPR32:$Rn)), (sext GPR32:$Rm))>; +defm : A64I_dp3_3operand<"smnegl", SMSUBLxwwx, XZR, + (sub 0, (mul (i64 (sext GPR32:$Rn)), (sext GPR32:$Rm)))>; + +defm : A64I_dp3_3operand<"umull", UMADDLxwwx, XZR, + (mul (i64 (zext GPR32:$Rn)), (zext GPR32:$Rm))>; +defm : A64I_dp3_3operand<"umnegl", UMSUBLxwwx, XZR, + (sub 0, (mul (i64 (zext GPR32:$Rn)), (zext GPR32:$Rm)))>; + + +//===----------------------------------------------------------------------===// +// Exception generation +//===----------------------------------------------------------------------===// +// Contains: SVC, HVC, SMC, BRK, HLT, DCPS1, DCPS2, DCPS3 + +def uimm16_asmoperand : AsmOperandClass { + let Name = "UImm16"; + let PredicateMethod = "isUImm<16>"; + let RenderMethod = "addImmOperands"; + let DiagnosticType = "UImm16"; +} + +def uimm16 : Operand<i32> { + let ParserMatchClass = uimm16_asmoperand; +} + +class A64I_exceptImpl<bits<3> opc, bits<2> ll, string asmop> + : A64I_exception<opc, 0b000, ll, (outs), (ins uimm16:$UImm16), + !strconcat(asmop, "\t$UImm16"), [], NoItinerary> { + let isBranch = 1; + let isTerminator = 1; +} + +def SVCi : A64I_exceptImpl<0b000, 0b01, "svc">; +def HVCi : A64I_exceptImpl<0b000, 0b10, "hvc">; +def SMCi : A64I_exceptImpl<0b000, 0b11, "smc">; +def BRKi : A64I_exceptImpl<0b001, 0b00, "brk">; +def HLTi : A64I_exceptImpl<0b010, 0b00, "hlt">; + +def DCPS1i : A64I_exceptImpl<0b101, 0b01, "dcps1">; +def DCPS2i : A64I_exceptImpl<0b101, 0b10, "dcps2">; +def DCPS3i : A64I_exceptImpl<0b101, 0b11, "dcps3">; + +// The immediate is optional for the DCPS instructions, defaulting to 0. 
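+// A plain "dcps1" should therefore assemble identically to "dcps1 #0".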
+def : InstAlias<"dcps1", (DCPS1i 0)>; +def : InstAlias<"dcps2", (DCPS2i 0)>; +def : InstAlias<"dcps3", (DCPS3i 0)>; + +//===----------------------------------------------------------------------===// +// Extract (immediate) +//===----------------------------------------------------------------------===// +// Contains: EXTR + alias ROR + +def EXTRwwwi : A64I_extract<0b0, 0b000, 0b0, + (outs GPR32:$Rd), + (ins GPR32:$Rn, GPR32:$Rm, bitfield32_imm:$LSB), + "extr\t$Rd, $Rn, $Rm, $LSB", + [(set GPR32:$Rd, + (A64Extr GPR32:$Rn, GPR32:$Rm, imm:$LSB))], + NoItinerary>; +def EXTRxxxi : A64I_extract<0b1, 0b000, 0b1, + (outs GPR64:$Rd), + (ins GPR64:$Rn, GPR64:$Rm, bitfield64_imm:$LSB), + "extr\t$Rd, $Rn, $Rm, $LSB", + [(set GPR64:$Rd, + (A64Extr GPR64:$Rn, GPR64:$Rm, imm:$LSB))], + NoItinerary>; + +def : InstAlias<"ror $Rd, $Rs, $LSB", + (EXTRwwwi GPR32:$Rd, GPR32:$Rs, GPR32:$Rs, bitfield32_imm:$LSB)>; +def : InstAlias<"ror $Rd, $Rs, $LSB", + (EXTRxxxi GPR64:$Rd, GPR64:$Rs, GPR64:$Rs, bitfield64_imm:$LSB)>; + +def : Pat<(rotr GPR32:$Rn, bitfield32_imm:$LSB), + (EXTRwwwi GPR32:$Rn, GPR32:$Rn, bitfield32_imm:$LSB)>; +def : Pat<(rotr GPR64:$Rn, bitfield64_imm:$LSB), + (EXTRxxxi GPR64:$Rn, GPR64:$Rn, bitfield64_imm:$LSB)>; + +//===----------------------------------------------------------------------===// +// Floating-point compare instructions +//===----------------------------------------------------------------------===// +// Contains: FCMP, FCMPE + +def fpzero_asmoperand : AsmOperandClass { + let Name = "FPZero"; + let ParserMethod = "ParseFPImmOperand"; + let DiagnosticType = "FPZero"; +} + +def fpz32 : Operand<f32>, + ComplexPattern<f32, 1, "SelectFPZeroOperand", [fpimm]> { + let ParserMatchClass = fpzero_asmoperand; + let PrintMethod = "printFPZeroOperand"; +} + +def fpz64 : Operand<f64>, + ComplexPattern<f64, 1, "SelectFPZeroOperand", [fpimm]> { + let ParserMatchClass = fpzero_asmoperand; + let PrintMethod = "printFPZeroOperand"; +} + +multiclass A64I_fpcmpSignal<bits<2> type, bit imm, dag ins, string asmop2, + dag pattern> { + def _quiet : A64I_fpcmp<0b0, 0b0, type, 0b00, {0b0, imm, 0b0, 0b0, 0b0}, + (outs), ins, !strconcat("fcmp\t$Rn, ", asmop2), + [pattern], NoItinerary> { + let Defs = [NZCV]; + } + + def _sig : A64I_fpcmp<0b0, 0b0, type, 0b00, {0b1, imm, 0b0, 0b0, 0b0}, + (outs), ins, !strconcat("fcmpe\t$Rn, ", asmop2), + [], NoItinerary> { + let Defs = [NZCV]; + } +} + +defm FCMPss : A64I_fpcmpSignal<0b00, 0b0, (ins FPR32:$Rn, FPR32:$Rm), "$Rm", + (set NZCV, (A64cmp (f32 FPR32:$Rn), FPR32:$Rm))>; +defm FCMPdd : A64I_fpcmpSignal<0b01, 0b0, (ins FPR64:$Rn, FPR64:$Rm), "$Rm", + (set NZCV, (A64cmp (f64 FPR64:$Rn), FPR64:$Rm))>; + +// What would be Rm should be written as 0, but anything is valid for +// disassembly so we can't set the bits +let PostEncoderMethod = "fixFCMPImm" in { + defm FCMPsi : A64I_fpcmpSignal<0b00, 0b1, (ins FPR32:$Rn, fpz32:$Imm), "$Imm", + (set NZCV, (A64cmp (f32 FPR32:$Rn), fpz32:$Imm))>; + + defm FCMPdi : A64I_fpcmpSignal<0b01, 0b1, (ins FPR64:$Rn, fpz64:$Imm), "$Imm", + (set NZCV, (A64cmp (f64 FPR64:$Rn), fpz64:$Imm))>; +} + + +//===----------------------------------------------------------------------===// +// Floating-point conditional compare instructions +//===----------------------------------------------------------------------===// +// Contains: FCCMP, FCCMPE + +class A64I_fpccmpImpl<bits<2> type, bit op, RegisterClass FPR, string asmop> + : A64I_fpccmp<0b0, 0b0, type, op, + (outs), + (ins FPR:$Rn, FPR:$Rm, uimm4:$NZCVImm, cond_code_op:$Cond), + 
!strconcat(asmop, "\t$Rn, $Rm, $NZCVImm, $Cond"),
+                [], NoItinerary> {
+  let Defs = [NZCV];
+}
+
+def FCCMPss : A64I_fpccmpImpl<0b00, 0b0, FPR32, "fccmp">;
+def FCCMPEss : A64I_fpccmpImpl<0b00, 0b1, FPR32, "fccmpe">;
+def FCCMPdd : A64I_fpccmpImpl<0b01, 0b0, FPR64, "fccmp">;
+def FCCMPEdd : A64I_fpccmpImpl<0b01, 0b1, FPR64, "fccmpe">;
+
+//===----------------------------------------------------------------------===//
+// Floating-point conditional select instructions
+//===----------------------------------------------------------------------===//
+// Contains: FCSEL
+
+let Uses = [NZCV] in {
+  def FCSELsssc : A64I_fpcondsel<0b0, 0b0, 0b00, (outs FPR32:$Rd),
+                                 (ins FPR32:$Rn, FPR32:$Rm, cond_code_op:$Cond),
+                                 "fcsel\t$Rd, $Rn, $Rm, $Cond",
+                                 [(set FPR32:$Rd,
+                                       (simple_select (f32 FPR32:$Rn),
+                                                      FPR32:$Rm))],
+                                 NoItinerary>;
+
+
+  def FCSELdddc : A64I_fpcondsel<0b0, 0b0, 0b01, (outs FPR64:$Rd),
+                                 (ins FPR64:$Rn, FPR64:$Rm, cond_code_op:$Cond),
+                                 "fcsel\t$Rd, $Rn, $Rm, $Cond",
+                                 [(set FPR64:$Rd,
+                                       (simple_select (f64 FPR64:$Rn),
+                                                      FPR64:$Rm))],
+                                 NoItinerary>;
+}
+
+//===----------------------------------------------------------------------===//
+// Floating-point data-processing (1 source)
+//===----------------------------------------------------------------------===//
+// Contains: FMOV, FABS, FNEG, FSQRT, FCVT, FRINT[NPMZAXI].
+
+def FPNoUnop : PatFrag<(ops node:$val), (fneg node:$val),
+                       [{ (void)N; return false; }]>;
+
+// First we do the fairly trivial bunch with uniform "OP s, s" and "OP d, d"
+// syntax. Default to no pattern because most are odd enough not to have one.
+multiclass A64I_fpdp1sizes<bits<6> opcode, string asmstr,
+                           SDPatternOperator opnode = FPNoUnop> {
+  def ss : A64I_fpdp1<0b0, 0b0, 0b00, opcode, (outs FPR32:$Rd), (ins FPR32:$Rn),
+                      !strconcat(asmstr, "\t$Rd, $Rn"),
+                      [(set (f32 FPR32:$Rd), (opnode FPR32:$Rn))],
+                      NoItinerary>;
+
+  def dd : A64I_fpdp1<0b0, 0b0, 0b01, opcode, (outs FPR64:$Rd), (ins FPR64:$Rn),
+                      !strconcat(asmstr, "\t$Rd, $Rn"),
+                      [(set (f64 FPR64:$Rd), (opnode FPR64:$Rn))],
+                      NoItinerary>;
+}
+
+defm FMOV : A64I_fpdp1sizes<0b000000, "fmov">;
+defm FABS : A64I_fpdp1sizes<0b000001, "fabs", fabs>;
+defm FNEG : A64I_fpdp1sizes<0b000010, "fneg", fneg>;
+defm FSQRT : A64I_fpdp1sizes<0b000011, "fsqrt", fsqrt>;
+
+defm FRINTN : A64I_fpdp1sizes<0b001000, "frintn">;
+defm FRINTP : A64I_fpdp1sizes<0b001001, "frintp", fceil>;
+defm FRINTM : A64I_fpdp1sizes<0b001010, "frintm", ffloor>;
+defm FRINTZ : A64I_fpdp1sizes<0b001011, "frintz", ftrunc>;
+defm FRINTA : A64I_fpdp1sizes<0b001100, "frinta">;
+defm FRINTX : A64I_fpdp1sizes<0b001110, "frintx", frint>;
+defm FRINTI : A64I_fpdp1sizes<0b001111, "frinti", fnearbyint>;
+
+// The FCVT instructions have different source and destination register-types,
+// but the fields are uniform everywhere a D-register (say) crops up. Package
+// this information in a Record.
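+// For example, "fcvt d0, s1" (f32 -> f64) needs an FPR64 destination and an
+// FPR32 source, while the type/opcode fields below only need the two-bit
+// encodings of "double" and "single" respectively.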
+class FCVTRegType<RegisterClass rc, bits<2> fld, ValueType vt> { + RegisterClass Class = rc; + ValueType VT = vt; + bit t1 = fld{1}; + bit t0 = fld{0}; +} + +def FCVT16 : FCVTRegType<FPR16, 0b11, f16>; +def FCVT32 : FCVTRegType<FPR32, 0b00, f32>; +def FCVT64 : FCVTRegType<FPR64, 0b01, f64>; + +class A64I_fpdp1_fcvt<FCVTRegType DestReg, FCVTRegType SrcReg, SDNode opnode> + : A64I_fpdp1<0b0, 0b0, {SrcReg.t1, SrcReg.t0}, + {0,0,0,1, DestReg.t1, DestReg.t0}, + (outs DestReg.Class:$Rd), (ins SrcReg.Class:$Rn), + "fcvt\t$Rd, $Rn", + [(set (DestReg.VT DestReg.Class:$Rd), + (opnode (SrcReg.VT SrcReg.Class:$Rn)))], NoItinerary>; + +def FCVTds : A64I_fpdp1_fcvt<FCVT64, FCVT32, fextend>; +def FCVThs : A64I_fpdp1_fcvt<FCVT16, FCVT32, fround>; +def FCVTsd : A64I_fpdp1_fcvt<FCVT32, FCVT64, fround>; +def FCVThd : A64I_fpdp1_fcvt<FCVT16, FCVT64, fround>; +def FCVTsh : A64I_fpdp1_fcvt<FCVT32, FCVT16, fextend>; +def FCVTdh : A64I_fpdp1_fcvt<FCVT64, FCVT16, fextend>; + + +//===----------------------------------------------------------------------===// +// Floating-point data-processing (2 sources) instructions +//===----------------------------------------------------------------------===// +// Contains: FMUL, FDIV, FADD, FSUB, FMAX, FMIN, FMAXNM, FMINNM, FNMUL + +def FPNoBinop : PatFrag<(ops node:$lhs, node:$rhs), (fadd node:$lhs, node:$rhs), + [{ (void)N; return false; }]>; + +multiclass A64I_fpdp2sizes<bits<4> opcode, string asmstr, + SDPatternOperator opnode> { + def sss : A64I_fpdp2<0b0, 0b0, 0b00, opcode, + (outs FPR32:$Rd), + (ins FPR32:$Rn, FPR32:$Rm), + !strconcat(asmstr, "\t$Rd, $Rn, $Rm"), + [(set (f32 FPR32:$Rd), (opnode FPR32:$Rn, FPR32:$Rm))], + NoItinerary>; + + def ddd : A64I_fpdp2<0b0, 0b0, 0b01, opcode, + (outs FPR64:$Rd), + (ins FPR64:$Rn, FPR64:$Rm), + !strconcat(asmstr, "\t$Rd, $Rn, $Rm"), + [(set (f64 FPR64:$Rd), (opnode FPR64:$Rn, FPR64:$Rm))], + NoItinerary>; +} + +let isCommutable = 1 in { + defm FMUL : A64I_fpdp2sizes<0b0000, "fmul", fmul>; + defm FADD : A64I_fpdp2sizes<0b0010, "fadd", fadd>; + + // No patterns for these. 
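+  // (LLVM has no obvious target-independent nodes with the NaN-propagation
+  // semantics of these max/min instructions, so for now they are only
+  // reachable from assembly.)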
+  defm FMAX : A64I_fpdp2sizes<0b0100, "fmax", FPNoBinop>;
+  defm FMIN : A64I_fpdp2sizes<0b0101, "fmin", FPNoBinop>;
+  defm FMAXNM : A64I_fpdp2sizes<0b0110, "fmaxnm", FPNoBinop>;
+  defm FMINNM : A64I_fpdp2sizes<0b0111, "fminnm", FPNoBinop>;
+
+  defm FNMUL : A64I_fpdp2sizes<0b1000, "fnmul",
+                               PatFrag<(ops node:$lhs, node:$rhs),
+                                       (fneg (fmul node:$lhs, node:$rhs))> >;
+}
+
+defm FDIV : A64I_fpdp2sizes<0b0001, "fdiv", fdiv>;
+defm FSUB : A64I_fpdp2sizes<0b0011, "fsub", fsub>;
+
+//===----------------------------------------------------------------------===//
+// Floating-point data-processing (3 sources) instructions
+//===----------------------------------------------------------------------===//
+// Contains: FMADD, FMSUB, FNMADD, FNMSUB
+
+def fmsub : PatFrag<(ops node:$Rn, node:$Rm, node:$Ra),
+                    (fma (fneg node:$Rn), node:$Rm, node:$Ra)>;
+def fnmadd : PatFrag<(ops node:$Rn, node:$Rm, node:$Ra),
+                     (fma node:$Rn, node:$Rm, (fneg node:$Ra))>;
+def fnmsub : PatFrag<(ops node:$Rn, node:$Rm, node:$Ra),
+                     (fma (fneg node:$Rn), node:$Rm, (fneg node:$Ra))>;
+
+class A64I_fpdp3Impl<string asmop, RegisterClass FPR, ValueType VT,
+                     bits<2> type, bit o1, bit o0, SDPatternOperator fmakind>
+  : A64I_fpdp3<0b0, 0b0, type, o1, o0, (outs FPR:$Rd),
+               (ins FPR:$Rn, FPR:$Rm, FPR:$Ra),
+               !strconcat(asmop,"\t$Rd, $Rn, $Rm, $Ra"),
+               [(set FPR:$Rd, (fmakind (VT FPR:$Rn), FPR:$Rm, FPR:$Ra))],
+               NoItinerary>;
+
+def FMADDssss : A64I_fpdp3Impl<"fmadd", FPR32, f32, 0b00, 0b0, 0b0, fma>;
+def FMSUBssss : A64I_fpdp3Impl<"fmsub", FPR32, f32, 0b00, 0b0, 0b1, fmsub>;
+def FNMADDssss : A64I_fpdp3Impl<"fnmadd", FPR32, f32, 0b00, 0b1, 0b0, fnmadd>;
+def FNMSUBssss : A64I_fpdp3Impl<"fnmsub", FPR32, f32, 0b00, 0b1, 0b1, fnmsub>;
+
+def FMADDdddd : A64I_fpdp3Impl<"fmadd", FPR64, f64, 0b01, 0b0, 0b0, fma>;
+def FMSUBdddd : A64I_fpdp3Impl<"fmsub", FPR64, f64, 0b01, 0b0, 0b1, fmsub>;
+def FNMADDdddd : A64I_fpdp3Impl<"fnmadd", FPR64, f64, 0b01, 0b1, 0b0, fnmadd>;
+def FNMSUBdddd : A64I_fpdp3Impl<"fnmsub", FPR64, f64, 0b01, 0b1, 0b1, fnmsub>;
+
+//===----------------------------------------------------------------------===//
+// Floating-point <-> fixed-point conversion instructions
+//===----------------------------------------------------------------------===//
+// Contains: FCVTZS, FCVTZU, SCVTF, UCVTF
+
+// #1-#32 allowed, encoded as "64 - <specified imm>".
+def fixedpos_asmoperand_i32 : AsmOperandClass {
+  let Name = "CVTFixedPos32";
+  let RenderMethod = "addCVTFixedPosOperands";
+  let PredicateMethod = "isCVTFixedPos<32>";
+  let DiagnosticType = "CVTFixedPos32";
+}
+
+// Also encoded as "64 - <specified imm>" but #1-#64 allowed.
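+// For example, "fcvtzs w0, s0, #16" should put 64 - 16 = 48 in the scale
+// field; #0 is disallowed because that would just be the plain integer
+// conversion.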
+def fixedpos_asmoperand_i64 : AsmOperandClass { + let Name = "CVTFixedPos64"; + let RenderMethod = "addCVTFixedPosOperands"; + let PredicateMethod = "isCVTFixedPos<64>"; + let DiagnosticType = "CVTFixedPos64"; +} + +// We need the cartesian product of f32/f64 i32/i64 operands for +// conversions: +// + Selection needs to use operands of correct floating type +// + Assembly parsing and decoding depend on integer width +class cvtfix_i32_op<ValueType FloatVT> + : Operand<FloatVT>, + ComplexPattern<FloatVT, 1, "SelectCVTFixedPosOperand<32>", [fpimm]> { + let ParserMatchClass = fixedpos_asmoperand_i32; + let DecoderMethod = "DecodeCVT32FixedPosOperand"; + let PrintMethod = "printCVTFixedPosOperand"; +} + +class cvtfix_i64_op<ValueType FloatVT> + : Operand<FloatVT>, + ComplexPattern<FloatVT, 1, "SelectCVTFixedPosOperand<64>", [fpimm]> { + let ParserMatchClass = fixedpos_asmoperand_i64; + let PrintMethod = "printCVTFixedPosOperand"; +} + +// Because of the proliferation of weird operands, it's not really +// worth going for a multiclass here. Oh well. + +class A64I_fptofix<bit sf, bits<2> type, bits<3> opcode, + RegisterClass GPR, RegisterClass FPR, Operand scale_op, + string asmop, SDNode cvtop> + : A64I_fpfixed<sf, 0b0, type, 0b11, opcode, + (outs GPR:$Rd), (ins FPR:$Rn, scale_op:$Scale), + !strconcat(asmop, "\t$Rd, $Rn, $Scale"), + [(set GPR:$Rd, (cvtop (fmul FPR:$Rn, scale_op:$Scale)))], + NoItinerary>; + +def FCVTZSwsi : A64I_fptofix<0b0, 0b00, 0b000, GPR32, FPR32, + cvtfix_i32_op<f32>, "fcvtzs", fp_to_sint>; +def FCVTZSxsi : A64I_fptofix<0b1, 0b00, 0b000, GPR64, FPR32, + cvtfix_i64_op<f32>, "fcvtzs", fp_to_sint>; +def FCVTZUwsi : A64I_fptofix<0b0, 0b00, 0b001, GPR32, FPR32, + cvtfix_i32_op<f32>, "fcvtzu", fp_to_uint>; +def FCVTZUxsi : A64I_fptofix<0b1, 0b00, 0b001, GPR64, FPR32, + cvtfix_i64_op<f32>, "fcvtzu", fp_to_uint>; + +def FCVTZSwdi : A64I_fptofix<0b0, 0b01, 0b000, GPR32, FPR64, + cvtfix_i32_op<f64>, "fcvtzs", fp_to_sint>; +def FCVTZSxdi : A64I_fptofix<0b1, 0b01, 0b000, GPR64, FPR64, + cvtfix_i64_op<f64>, "fcvtzs", fp_to_sint>; +def FCVTZUwdi : A64I_fptofix<0b0, 0b01, 0b001, GPR32, FPR64, + cvtfix_i32_op<f64>, "fcvtzu", fp_to_uint>; +def FCVTZUxdi : A64I_fptofix<0b1, 0b01, 0b001, GPR64, FPR64, + cvtfix_i64_op<f64>, "fcvtzu", fp_to_uint>; + + +class A64I_fixtofp<bit sf, bits<2> type, bits<3> opcode, + RegisterClass FPR, RegisterClass GPR, Operand scale_op, + string asmop, SDNode cvtop> + : A64I_fpfixed<sf, 0b0, type, 0b00, opcode, + (outs FPR:$Rd), (ins GPR:$Rn, scale_op:$Scale), + !strconcat(asmop, "\t$Rd, $Rn, $Scale"), + [(set FPR:$Rd, (fdiv (cvtop GPR:$Rn), scale_op:$Scale))], + NoItinerary>; + +def SCVTFswi : A64I_fixtofp<0b0, 0b00, 0b010, FPR32, GPR32, + cvtfix_i32_op<f32>, "scvtf", sint_to_fp>; +def SCVTFsxi : A64I_fixtofp<0b1, 0b00, 0b010, FPR32, GPR64, + cvtfix_i64_op<f32>, "scvtf", sint_to_fp>; +def UCVTFswi : A64I_fixtofp<0b0, 0b00, 0b011, FPR32, GPR32, + cvtfix_i32_op<f32>, "ucvtf", uint_to_fp>; +def UCVTFsxi : A64I_fixtofp<0b1, 0b00, 0b011, FPR32, GPR64, + cvtfix_i64_op<f32>, "ucvtf", uint_to_fp>; +def SCVTFdwi : A64I_fixtofp<0b0, 0b01, 0b010, FPR64, GPR32, + cvtfix_i32_op<f64>, "scvtf", sint_to_fp>; +def SCVTFdxi : A64I_fixtofp<0b1, 0b01, 0b010, FPR64, GPR64, + cvtfix_i64_op<f64>, "scvtf", sint_to_fp>; +def UCVTFdwi : A64I_fixtofp<0b0, 0b01, 0b011, FPR64, GPR32, + cvtfix_i32_op<f64>, "ucvtf", uint_to_fp>; +def UCVTFdxi : A64I_fixtofp<0b1, 0b01, 0b011, FPR64, GPR64, + cvtfix_i64_op<f64>, "ucvtf", uint_to_fp>; + 
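+// In IR terms, the fixed-point patterns above match shapes like
+//     (fp_to_sint (fmul x, (fpimm 2^fbits)))    for fcvtzs
+//     (fdiv (sint_to_fp x), (fpimm 2^fbits))    for scvtf
+// with the ComplexPatterns recognising the power-of-two constant.
+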
+//===----------------------------------------------------------------------===// +// Floating-point <-> integer conversion instructions +//===----------------------------------------------------------------------===// +// Contains: FCVTZS, FCVTZU, SCVTF, UCVTF + +class A64I_fpintI<bit sf, bits<2> type, bits<2> rmode, bits<3> opcode, + RegisterClass DestPR, RegisterClass SrcPR, string asmop> + : A64I_fpint<sf, 0b0, type, rmode, opcode, (outs DestPR:$Rd), (ins SrcPR:$Rn), + !strconcat(asmop, "\t$Rd, $Rn"), [], NoItinerary>; + +multiclass A64I_fptointRM<bits<2> rmode, bit o2, string asmop> { + def Sws : A64I_fpintI<0b0, 0b00, rmode, {o2, 0, 0}, + GPR32, FPR32, asmop # "s">; + def Sxs : A64I_fpintI<0b1, 0b00, rmode, {o2, 0, 0}, + GPR64, FPR32, asmop # "s">; + def Uws : A64I_fpintI<0b0, 0b00, rmode, {o2, 0, 1}, + GPR32, FPR32, asmop # "u">; + def Uxs : A64I_fpintI<0b1, 0b00, rmode, {o2, 0, 1}, + GPR64, FPR32, asmop # "u">; + + def Swd : A64I_fpintI<0b0, 0b01, rmode, {o2, 0, 0}, + GPR32, FPR64, asmop # "s">; + def Sxd : A64I_fpintI<0b1, 0b01, rmode, {o2, 0, 0}, + GPR64, FPR64, asmop # "s">; + def Uwd : A64I_fpintI<0b0, 0b01, rmode, {o2, 0, 1}, + GPR32, FPR64, asmop # "u">; + def Uxd : A64I_fpintI<0b1, 0b01, rmode, {o2, 0, 1}, + GPR64, FPR64, asmop # "u">; +} + +defm FCVTN : A64I_fptointRM<0b00, 0b0, "fcvtn">; +defm FCVTP : A64I_fptointRM<0b01, 0b0, "fcvtp">; +defm FCVTM : A64I_fptointRM<0b10, 0b0, "fcvtm">; +defm FCVTZ : A64I_fptointRM<0b11, 0b0, "fcvtz">; +defm FCVTA : A64I_fptointRM<0b00, 0b1, "fcvta">; + +def : Pat<(i32 (fp_to_sint FPR32:$Rn)), (FCVTZSws FPR32:$Rn)>; +def : Pat<(i64 (fp_to_sint FPR32:$Rn)), (FCVTZSxs FPR32:$Rn)>; +def : Pat<(i32 (fp_to_uint FPR32:$Rn)), (FCVTZUws FPR32:$Rn)>; +def : Pat<(i64 (fp_to_uint FPR32:$Rn)), (FCVTZUxs FPR32:$Rn)>; +def : Pat<(i32 (fp_to_sint (f64 FPR64:$Rn))), (FCVTZSwd FPR64:$Rn)>; +def : Pat<(i64 (fp_to_sint (f64 FPR64:$Rn))), (FCVTZSxd FPR64:$Rn)>; +def : Pat<(i32 (fp_to_uint (f64 FPR64:$Rn))), (FCVTZUwd FPR64:$Rn)>; +def : Pat<(i64 (fp_to_uint (f64 FPR64:$Rn))), (FCVTZUxd FPR64:$Rn)>; + +multiclass A64I_inttofp<bit o0, string asmop> { + def CVTFsw : A64I_fpintI<0b0, 0b00, 0b00, {0, 1, o0}, FPR32, GPR32, asmop>; + def CVTFsx : A64I_fpintI<0b1, 0b00, 0b00, {0, 1, o0}, FPR32, GPR64, asmop>; + def CVTFdw : A64I_fpintI<0b0, 0b01, 0b00, {0, 1, o0}, FPR64, GPR32, asmop>; + def CVTFdx : A64I_fpintI<0b1, 0b01, 0b00, {0, 1, o0}, FPR64, GPR64, asmop>; +} + +defm S : A64I_inttofp<0b0, "scvtf">; +defm U : A64I_inttofp<0b1, "ucvtf">; + +def : Pat<(f32 (sint_to_fp GPR32:$Rn)), (SCVTFsw GPR32:$Rn)>; +def : Pat<(f32 (sint_to_fp GPR64:$Rn)), (SCVTFsx GPR64:$Rn)>; +def : Pat<(f64 (sint_to_fp GPR32:$Rn)), (SCVTFdw GPR32:$Rn)>; +def : Pat<(f64 (sint_to_fp GPR64:$Rn)), (SCVTFdx GPR64:$Rn)>; +def : Pat<(f32 (uint_to_fp GPR32:$Rn)), (UCVTFsw GPR32:$Rn)>; +def : Pat<(f32 (uint_to_fp GPR64:$Rn)), (UCVTFsx GPR64:$Rn)>; +def : Pat<(f64 (uint_to_fp GPR32:$Rn)), (UCVTFdw GPR32:$Rn)>; +def : Pat<(f64 (uint_to_fp GPR64:$Rn)), (UCVTFdx GPR64:$Rn)>; + +def FMOVws : A64I_fpintI<0b0, 0b00, 0b00, 0b110, GPR32, FPR32, "fmov">; +def FMOVsw : A64I_fpintI<0b0, 0b00, 0b00, 0b111, FPR32, GPR32, "fmov">; +def FMOVxd : A64I_fpintI<0b1, 0b01, 0b00, 0b110, GPR64, FPR64, "fmov">; +def FMOVdx : A64I_fpintI<0b1, 0b01, 0b00, 0b111, FPR64, GPR64, "fmov">; + +def : Pat<(i32 (bitconvert (f32 FPR32:$Rn))), (FMOVws FPR32:$Rn)>; +def : Pat<(f32 (bitconvert (i32 GPR32:$Rn))), (FMOVsw GPR32:$Rn)>; +def : Pat<(i64 (bitconvert (f64 FPR64:$Rn))), (FMOVxd FPR64:$Rn)>; +def : Pat<(f64 (bitconvert (i64 
GPR64:$Rn))), (FMOVdx GPR64:$Rn)>;
+
+def lane1_asmoperand : AsmOperandClass {
+  let Name = "Lane1";
+  let RenderMethod = "addImmOperands";
+  let DiagnosticType = "Lane1";
+}
+
+def lane1 : Operand<i32> {
+  let ParserMatchClass = lane1_asmoperand;
+  let PrintMethod = "printBareImmOperand";
+}
+
+let DecoderMethod = "DecodeFMOVLaneInstruction" in {
+  def FMOVxv : A64I_fpint<0b1, 0b0, 0b10, 0b01, 0b110,
+                          (outs GPR64:$Rd), (ins VPR128:$Rn, lane1:$Lane),
+                          "fmov\t$Rd, $Rn.d[$Lane]", [], NoItinerary>;
+
+  def FMOVvx : A64I_fpint<0b1, 0b0, 0b10, 0b01, 0b111,
+                          (outs VPR128:$Rd), (ins GPR64:$Rn, lane1:$Lane),
+                          "fmov\t$Rd.d[$Lane], $Rn", [], NoItinerary>;
+}
+
+def : InstAlias<"fmov $Rd, $Rn.2d[$Lane]",
+                (FMOVxv GPR64:$Rd, VPR128:$Rn, lane1:$Lane), 0b0>;
+
+def : InstAlias<"fmov $Rd.2d[$Lane], $Rn",
+                (FMOVvx VPR128:$Rd, GPR64:$Rn, lane1:$Lane), 0b0>;
+
+//===----------------------------------------------------------------------===//
+// Floating-point immediate instructions
+//===----------------------------------------------------------------------===//
+// Contains: FMOV
+
+def fpimm_asmoperand : AsmOperandClass {
+  let Name = "FMOVImm";
+  let ParserMethod = "ParseFPImmOperand";
+  let DiagnosticType = "FPImm";
+}
+
+// The MCOperand for these instructions is the encoded 8-bit value.
+def SDXF_fpimm : SDNodeXForm<fpimm, [{
+  uint32_t Imm8;
+  A64Imms::isFPImm(N->getValueAPF(), Imm8);
+  return CurDAG->getTargetConstant(Imm8, MVT::i32);
+}]>;
+
+class fmov_operand<ValueType FT>
+  : Operand<i32>,
+    PatLeaf<(FT fpimm), [{ return A64Imms::isFPImm(N->getValueAPF()); }],
+            SDXF_fpimm> {
+  let PrintMethod = "printFPImmOperand";
+  let ParserMatchClass = fpimm_asmoperand;
+}
+
+def fmov32_operand : fmov_operand<f32>;
+def fmov64_operand : fmov_operand<f64>;
+
+class A64I_fpimm_impl<bits<2> type, RegisterClass Reg, ValueType VT,
+                      Operand fmov_operand>
+  : A64I_fpimm<0b0, 0b0, type, 0b00000,
+               (outs Reg:$Rd),
+               (ins fmov_operand:$Imm8),
+               "fmov\t$Rd, $Imm8",
+               [(set (VT Reg:$Rd), fmov_operand:$Imm8)],
+               NoItinerary>;
+
+def FMOVsi : A64I_fpimm_impl<0b00, FPR32, f32, fmov32_operand>;
+def FMOVdi : A64I_fpimm_impl<0b01, FPR64, f64, fmov64_operand>;
+
+//===----------------------------------------------------------------------===//
+// Load-register (literal) instructions
+//===----------------------------------------------------------------------===//
+// Contains: LDR, LDRSW, PRFM
+
+def ldrlit_label_asmoperand : AsmOperandClass {
+  let Name = "LoadLitLabel";
+  let RenderMethod = "addLabelOperands<19, 4>";
+  let DiagnosticType = "Label";
+}
+
+def ldrlit_label : Operand<i64> {
+  let EncoderMethod = "getLoadLitLabelOpValue";
+
+  // This label is a 19-bit offset from PC, scaled by the instruction-width: 4.
+  let PrintMethod = "printLabelOperand<19, 4>";
+  let ParserMatchClass = ldrlit_label_asmoperand;
+  let OperandType = "OPERAND_PCREL";
+}
+
+// Various instructions take an immediate value (which can always be used),
+// where some numbers have a symbolic name to make things easier. These operands
+// and the associated functions abstract away the differences.
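+// For example, "prfm pldl1keep, <label>" and "prfm #0, <label>" should be the
+// same instruction: pldl1keep is just the symbolic name for prefetch
+// operand 0.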
+multiclass namedimm<string prefix, string mapper> {
+  def _asmoperand : AsmOperandClass {
+    let Name = "NamedImm" # prefix;
+    let PredicateMethod = "isUImm";
+    let RenderMethod = "addImmOperands";
+    let ParserMethod = "ParseNamedImmOperand<" # mapper # ">";
+    let DiagnosticType = "NamedImm_" # prefix;
+  }
+
+  def _op : Operand<i32> {
+    let ParserMatchClass = !cast<AsmOperandClass>(prefix # "_asmoperand");
+    let PrintMethod = "printNamedImmOperand<" # mapper # ">";
+    let DecoderMethod = "DecodeNamedImmOperand<" # mapper # ">";
+  }
+}
+
+defm prefetch : namedimm<"prefetch", "A64PRFM::PRFMMapper">;
+
+class A64I_LDRlitSimple<bits<2> opc, bit v, RegisterClass OutReg,
+                        list<dag> patterns = []>
+  : A64I_LDRlit<opc, v, (outs OutReg:$Rt), (ins ldrlit_label:$Imm19),
+                "ldr\t$Rt, $Imm19", patterns, NoItinerary>;
+
+let mayLoad = 1 in {
+  def LDRw_lit : A64I_LDRlitSimple<0b00, 0b0, GPR32>;
+  def LDRx_lit : A64I_LDRlitSimple<0b01, 0b0, GPR64>;
+}
+
+def LDRs_lit : A64I_LDRlitSimple<0b00, 0b1, FPR32>;
+def LDRd_lit : A64I_LDRlitSimple<0b01, 0b1, FPR64>;
+
+let mayLoad = 1 in {
+  def LDRq_lit : A64I_LDRlitSimple<0b10, 0b1, FPR128>;
+
+  def LDRSWx_lit : A64I_LDRlit<0b10, 0b0,
+                               (outs GPR64:$Rt),
+                               (ins ldrlit_label:$Imm19),
+                               "ldrsw\t$Rt, $Imm19",
+                               [], NoItinerary>;
+
+  def PRFM_lit : A64I_LDRlit<0b11, 0b0,
+                             (outs), (ins prefetch_op:$Rt, ldrlit_label:$Imm19),
+                             "prfm\t$Rt, $Imm19",
+                             [], NoItinerary>;
+}
+
+//===----------------------------------------------------------------------===//
+// Load-store exclusive instructions
+//===----------------------------------------------------------------------===//
+// Contains: STXRB, STXRH, STXR, LDXRB, LDXRH, LDXR, STXP, LDXP, STLXRB,
+//           STLXRH, STLXR, LDAXRB, LDAXRH, LDAXR, STLXP, LDAXP, STLRB,
+//           STLRH, STLR, LDARB, LDARH, LDAR
+
+// Since these instructions have the undefined register bits set to 1 in
+// their canonical form, we need a post encoder method to set those bits
+// to 1 when encoding these instructions. We do this using the
+// fixLoadStoreExclusive function. This function has template parameters:
+//
+// fixLoadStoreExclusive<int hasRs, int hasRt2>
+//
+// hasRs indicates that the instruction uses the Rs field, so we won't set
+// it to 1 (and the same for Rt2). We don't need template parameters for
+// the other register fields since Rt and Rn are always used.
+
+// This operand parses a GPR64xsp register, followed by an optional immediate
+// #0.
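+// For example, both "ldxr w0, [x1]" and "ldxr w0, [x1, #0]" are intended to
+// parse to the same LDXR_word instruction defined below.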
+def GPR64xsp0_asmoperand : AsmOperandClass { + let Name = "GPR64xsp0"; + let PredicateMethod = "isWrappedReg"; + let RenderMethod = "addRegOperands"; + let ParserMethod = "ParseLSXAddressOperand"; + // Diagnostics are provided by ParserMethod +} + +def GPR64xsp0 : RegisterOperand<GPR64xsp> { + let ParserMatchClass = GPR64xsp0_asmoperand; +} + +//===---------------------------------- +// Store-exclusive (releasing & normal) +//===---------------------------------- + +class A64I_SRexs_impl<bits<2> size, bits<3> opcode, string asm, dag outs, + dag ins, list<dag> pat, + InstrItinClass itin> : + A64I_LDSTex_stn <size, + opcode{2}, 0, opcode{1}, opcode{0}, + outs, ins, + !strconcat(asm, "\t$Rs, $Rt, [$Rn]"), + pat, itin> { + let mayStore = 1; + let PostEncoderMethod = "fixLoadStoreExclusive<1,0>"; +} + +multiclass A64I_SRex<string asmstr, bits<3> opcode, string prefix> { + def _byte: A64I_SRexs_impl<0b00, opcode, !strconcat(asmstr, "b"), + (outs GPR32:$Rs), (ins GPR32:$Rt, GPR64xsp0:$Rn), + [], NoItinerary>; + + def _hword: A64I_SRexs_impl<0b01, opcode, !strconcat(asmstr, "h"), + (outs GPR32:$Rs), (ins GPR32:$Rt, GPR64xsp0:$Rn), + [],NoItinerary>; + + def _word: A64I_SRexs_impl<0b10, opcode, asmstr, + (outs GPR32:$Rs), (ins GPR32:$Rt, GPR64xsp0:$Rn), + [], NoItinerary>; + + def _dword: A64I_SRexs_impl<0b11, opcode, asmstr, + (outs GPR32:$Rs), (ins GPR64:$Rt, GPR64xsp0:$Rn), + [], NoItinerary>; +} + +defm STXR : A64I_SRex<"stxr", 0b000, "STXR">; +defm STLXR : A64I_SRex<"stlxr", 0b001, "STLXR">; + +//===---------------------------------- +// Loads +//===---------------------------------- + +class A64I_LRexs_impl<bits<2> size, bits<3> opcode, string asm, dag outs, + dag ins, list<dag> pat, + InstrItinClass itin> : + A64I_LDSTex_tn <size, + opcode{2}, 1, opcode{1}, opcode{0}, + outs, ins, + !strconcat(asm, "\t$Rt, [$Rn]"), + pat, itin> { + let mayLoad = 1; + let PostEncoderMethod = "fixLoadStoreExclusive<0,0>"; +} + +multiclass A64I_LRex<string asmstr, bits<3> opcode> { + def _byte: A64I_LRexs_impl<0b00, opcode, !strconcat(asmstr, "b"), + (outs GPR32:$Rt), (ins GPR64xsp0:$Rn), + [], NoItinerary>; + + def _hword: A64I_LRexs_impl<0b01, opcode, !strconcat(asmstr, "h"), + (outs GPR32:$Rt), (ins GPR64xsp0:$Rn), + [], NoItinerary>; + + def _word: A64I_LRexs_impl<0b10, opcode, asmstr, + (outs GPR32:$Rt), (ins GPR64xsp0:$Rn), + [], NoItinerary>; + + def _dword: A64I_LRexs_impl<0b11, opcode, asmstr, + (outs GPR64:$Rt), (ins GPR64xsp0:$Rn), + [], NoItinerary>; +} + +defm LDXR : A64I_LRex<"ldxr", 0b000>; +defm LDAXR : A64I_LRex<"ldaxr", 0b001>; +defm LDAR : A64I_LRex<"ldar", 0b101>; + +class acquiring_load<PatFrag base> + : PatFrag<(ops node:$ptr), (base node:$ptr), [{ + return cast<AtomicSDNode>(N)->getOrdering() == Acquire; +}]>; + +def atomic_load_acquire_8 : acquiring_load<atomic_load_8>; +def atomic_load_acquire_16 : acquiring_load<atomic_load_16>; +def atomic_load_acquire_32 : acquiring_load<atomic_load_32>; +def atomic_load_acquire_64 : acquiring_load<atomic_load_64>; + +def : Pat<(atomic_load_acquire_8 GPR64xsp:$Rn), (LDAR_byte GPR64xsp0:$Rn)>; +def : Pat<(atomic_load_acquire_16 GPR64xsp:$Rn), (LDAR_hword GPR64xsp0:$Rn)>; +def : Pat<(atomic_load_acquire_32 GPR64xsp:$Rn), (LDAR_word GPR64xsp0:$Rn)>; +def : Pat<(atomic_load_acquire_64 GPR64xsp:$Rn), (LDAR_dword GPR64xsp0:$Rn)>; + +//===---------------------------------- +// Store-release (no exclusivity) +//===---------------------------------- + +class A64I_SLexs_impl<bits<2> size, bits<3> opcode, string asm, dag outs, + dag ins, list<dag> pat, + 
InstrItinClass itin> :
+  A64I_LDSTex_tn <size,
+                  opcode{2}, 0, opcode{1}, opcode{0},
+                  outs, ins,
+                  !strconcat(asm, "\t$Rt, [$Rn]"),
+                  pat, itin> {
+  let mayStore = 1;
+  let PostEncoderMethod = "fixLoadStoreExclusive<0,0>";
+}
+
+class releasing_store<PatFrag base>
+  : PatFrag<(ops node:$ptr, node:$val), (base node:$ptr, node:$val), [{
+  return cast<AtomicSDNode>(N)->getOrdering() == Release;
+}]>;
+
+def atomic_store_release_8 : releasing_store<atomic_store_8>;
+def atomic_store_release_16 : releasing_store<atomic_store_16>;
+def atomic_store_release_32 : releasing_store<atomic_store_32>;
+def atomic_store_release_64 : releasing_store<atomic_store_64>;
+
+multiclass A64I_SLex<string asmstr, bits<3> opcode, string prefix> {
+  def _byte: A64I_SLexs_impl<0b00, opcode, !strconcat(asmstr, "b"),
+                            (outs), (ins GPR32:$Rt, GPR64xsp0:$Rn),
+                            [(atomic_store_release_8 GPR64xsp0:$Rn, GPR32:$Rt)],
+                            NoItinerary>;
+
+  def _hword: A64I_SLexs_impl<0b01, opcode, !strconcat(asmstr, "h"),
+                           (outs), (ins GPR32:$Rt, GPR64xsp0:$Rn),
+                           [(atomic_store_release_16 GPR64xsp0:$Rn, GPR32:$Rt)],
+                           NoItinerary>;
+
+  def _word: A64I_SLexs_impl<0b10, opcode, asmstr,
+                           (outs), (ins GPR32:$Rt, GPR64xsp0:$Rn),
+                           [(atomic_store_release_32 GPR64xsp0:$Rn, GPR32:$Rt)],
+                           NoItinerary>;
+
+  def _dword: A64I_SLexs_impl<0b11, opcode, asmstr,
+                           (outs), (ins GPR64:$Rt, GPR64xsp0:$Rn),
+                           [(atomic_store_release_64 GPR64xsp0:$Rn, GPR64:$Rt)],
+                           NoItinerary>;
+}
+
+defm STLR : A64I_SLex<"stlr", 0b101, "STLR">;
+
+//===----------------------------------
+// Store-exclusive pair (releasing & normal)
+//===----------------------------------
+
+class A64I_SPexs_impl<bits<2> size, bits<3> opcode, string asm, dag outs,
+                      dag ins, list<dag> pat,
+                      InstrItinClass itin> :
+  A64I_LDSTex_stt2n <size,
+                     opcode{2}, 0, opcode{1}, opcode{0},
+                     outs, ins,
+                     !strconcat(asm, "\t$Rs, $Rt, $Rt2, [$Rn]"),
+                     pat, itin> {
+  let mayStore = 1;
+}
+
+multiclass A64I_SPex<string asmstr, bits<3> opcode> {
+  def _word: A64I_SPexs_impl<0b10, opcode, asmstr, (outs),
+                            (ins GPR32:$Rs, GPR32:$Rt, GPR32:$Rt2,
+                                 GPR64xsp0:$Rn),
+                            [], NoItinerary>;
+
+  def _dword: A64I_SPexs_impl<0b11, opcode, asmstr, (outs),
+                             (ins GPR32:$Rs, GPR64:$Rt, GPR64:$Rt2,
+                                  GPR64xsp0:$Rn),
+                             [], NoItinerary>;
+}
+
+defm STXP  : A64I_SPex<"stxp", 0b010>;
+defm STLXP : A64I_SPex<"stlxp", 0b011>;
+
+//===----------------------------------
+// Load-exclusive pair (acquiring & normal)
+//===----------------------------------
+
+class A64I_LPexs_impl<bits<2> size, bits<3> opcode, string asm, dag outs,
+                      dag ins, list<dag> pat,
+                      InstrItinClass itin> :
+  A64I_LDSTex_tt2n <size,
+                    opcode{2}, 1, opcode{1}, opcode{0},
+                    outs, ins,
+                    !strconcat(asm, "\t$Rt, $Rt2, [$Rn]"),
+                    pat, itin> {
+  let mayLoad = 1;
+  let DecoderMethod = "DecodeLoadPairExclusiveInstruction";
+  let PostEncoderMethod = "fixLoadStoreExclusive<0,1>";
+}
+
+multiclass A64I_LPex<string asmstr, bits<3> opcode> {
+  def _word: A64I_LPexs_impl<0b10, opcode, asmstr,
+                             (outs GPR32:$Rt, GPR32:$Rt2),
+                             (ins GPR64xsp0:$Rn),
+                             [], NoItinerary>;
+
+  def _dword: A64I_LPexs_impl<0b11, opcode, asmstr,
+                              (outs GPR64:$Rt, GPR64:$Rt2),
+                              (ins GPR64xsp0:$Rn),
+                              [], NoItinerary>;
+}
+
+defm LDXP  : A64I_LPex<"ldxp", 0b010>;
+defm LDAXP : A64I_LPex<"ldaxp", 0b011>;
+
+//===----------------------------------------------------------------------===//
+// Load-store register (unscaled immediate) instructions
+//===----------------------------------------------------------------------===//
+// Contains: LDURB, LDURH, LDURSB, LDURSH, LDURSW, STUR, STURB, STURH and
PRFUM
+//
+// and
+//
+//===----------------------------------------------------------------------===//
+// Load-store register (register offset) instructions
+//===----------------------------------------------------------------------===//
+// Contains: LDRB, LDRH, LDRSB, LDRSH, LDRSW, STR, STRB, STRH and PRFM
+//
+// and
+//
+//===----------------------------------------------------------------------===//
+// Load-store register (unsigned immediate) instructions
+//===----------------------------------------------------------------------===//
+// Contains: LDRB, LDRH, LDRSB, LDRSH, LDRSW, STR, STRB, STRH and PRFM
+//
+// and
+//
+//===----------------------------------------------------------------------===//
+// Load-store register (immediate post-indexed) instructions
+//===----------------------------------------------------------------------===//
+// Contains: STRB, STRH, STR, LDRB, LDRH, LDR, LDRSB, LDRSH, LDRSW
+//
+// and
+//
+//===----------------------------------------------------------------------===//
+// Load-store register (immediate pre-indexed) instructions
+//===----------------------------------------------------------------------===//
+// Contains: STRB, STRH, STR, LDRB, LDRH, LDR, LDRSB, LDRSH, LDRSW
+
+// Note that patterns are much later on in a completely separate section (they
+// need ADRPxi to be defined).
+
+//===-------------------------------
+// 1. Various operands needed
+//===-------------------------------
+
+//===-------------------------------
+// 1.1 Unsigned 12-bit immediate operands
+//===-------------------------------
+// The addressing mode for these instructions consists of an unsigned 12-bit
+// immediate which is scaled by the size of the memory access.
+//
+// We represent this in the MC layer by two operands:
+//  1. A base register.
+//  2. A 12-bit immediate: not multiplied by access size, so "LDR x0,[x0,#8]"
+//     would have '1' in this field.
+// This means that separate functions are needed for converting representations
+// which *are* aware of the intended access size.
+
+// Anything that creates an MCInst (Decoding, selection and AsmParsing) has to
+// know the access size via some means. An isolated operand does not have this
+// information unless told from here, which means we need separate tablegen
+// Operands for each access size. This multiclass takes care of instantiating
+// the correct template functions in the rest of the backend.
+
+multiclass offsets_uimm12<int MemSize, string prefix> {
+  def uimm12_asmoperand : AsmOperandClass {
+    let Name = "OffsetUImm12_" # MemSize;
+    let PredicateMethod = "isOffsetUImm12<" # MemSize # ">";
+    let RenderMethod = "addOffsetUImm12Operands<" # MemSize # ">";
+    let DiagnosticType = "LoadStoreUImm12_" # MemSize;
+  }
+
+  // Pattern is really no more than an ImmLeaf, but predicated on MemSize which
+  // complicates things beyond TableGen's ken.
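+  // As a worked example of the scaling: for a doubleword access (MemSize 8),
+  // "ldr x0, [x1, #8]" is stored with a UImm12 of 1, and the largest
+  // encodable offset is 8 * 4095 = 32760 bytes.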
+  def uimm12 : Operand<i64>,
+               ComplexPattern<i64, 1, "SelectOffsetUImm12<" # MemSize # ">"> {
+    let ParserMatchClass
+      = !cast<AsmOperandClass>(prefix # uimm12_asmoperand);
+
+    let PrintMethod = "printOffsetUImm12Operand<" # MemSize # ">";
+    let EncoderMethod = "getOffsetUImm12OpValue<" # MemSize # ">";
+  }
+}
+
+defm byte_  : offsets_uimm12<1, "byte_">;
+defm hword_ : offsets_uimm12<2, "hword_">;
+defm word_  : offsets_uimm12<4, "word_">;
+defm dword_ : offsets_uimm12<8, "dword_">;
+defm qword_ : offsets_uimm12<16, "qword_">;
+
+//===-------------------------------
+// 1.2 Signed 9-bit immediate operands
+//===-------------------------------
+
+// The MCInst is expected to store the bit-wise encoding of the value,
+// which amounts to lopping off the extended sign bits.
+def SDXF_simm9 : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(N->getZExtValue() & 0x1ff, MVT::i32);
+}]>;
+
+def simm9_asmoperand : AsmOperandClass {
+  let Name = "SImm9";
+  let PredicateMethod = "isSImm<9>";
+  let RenderMethod = "addSImmOperands<9>";
+  let DiagnosticType = "LoadStoreSImm9";
+}
+
+def simm9 : Operand<i64>,
+            ImmLeaf<i64, [{ return Imm >= -0x100 && Imm <= 0xff; }],
+                    SDXF_simm9> {
+  let PrintMethod = "printOffsetSImm9Operand";
+  let ParserMatchClass = simm9_asmoperand;
+}
+
+
+//===-------------------------------
+// 1.3 Register offset extensions
+//===-------------------------------
+
+// The assembly-syntax for these addressing-modes is:
+//    [<Xn|SP>, <R><m> {, <extend> {<amount>}}]
+//
+// The essential semantics are:
+//     + <amount> is a shift: #<log(transfer size)> or #0
+//     + <R> can be W or X.
+//     + If <R> is W, <extend> can be UXTW or SXTW
+//     + If <R> is X, <extend> can be LSL or SXTX
+//
+// The trickiest of those constraints is that Rm can be either GPR32 or GPR64,
+// which will need separate instructions for LLVM type-consistency. We'll also
+// need separate operands, of course.
+multiclass regexts<int MemSize, int RmSize, RegisterClass GPR,
+                   string Rm, string prefix> {
+  def regext_asmoperand : AsmOperandClass {
+    let Name = "AddrRegExtend_" # MemSize # "_" # Rm;
+    let PredicateMethod = "isAddrRegExtend<" # MemSize # "," # RmSize # ">";
+    let RenderMethod = "addAddrRegExtendOperands<" # MemSize # ">";
+    let DiagnosticType = "LoadStoreExtend" # RmSize # "_" # MemSize;
+  }
+
+  def regext : Operand<i64> {
+    let PrintMethod
+      = "printAddrRegExtendOperand<" # MemSize # ", " # RmSize # ">";
+
+    let DecoderMethod = "DecodeAddrRegExtendOperand";
+    let ParserMatchClass
+      = !cast<AsmOperandClass>(prefix # regext_asmoperand);
+  }
+}
+
+multiclass regexts_wx<int MemSize, string prefix> {
+  // Rm is an X-register if LSL or SXTX are specified as the shift.
+  defm Xm_ : regexts<MemSize, 64, GPR64, "Xm", prefix # "Xm_">;
+
+  // Rm is a W-register if UXTW or SXTW are specified as the shift.
+  defm Wm_ : regexts<MemSize, 32, GPR32, "Wm", prefix # "Wm_">;
+}
+
+defm byte_  : regexts_wx<1, "byte_">;
+defm hword_ : regexts_wx<2, "hword_">;
+defm word_  : regexts_wx<4, "word_">;
+defm dword_ : regexts_wx<8, "dword_">;
+defm qword_ : regexts_wx<16, "qword_">;
+
+
+//===------------------------------
+// 2. The instructions themselves.
+//===------------------------------ + +// We have the following instructions to implement: +// | | B | H | W | X | +// |-----------------+-------+-------+-------+--------| +// | unsigned str | STRB | STRH | STR | STR | +// | unsigned ldr | LDRB | LDRH | LDR | LDR | +// | signed ldr to W | LDRSB | LDRSH | - | - | +// | signed ldr to X | LDRSB | LDRSH | LDRSW | (PRFM) | + +// This will instantiate the LDR/STR instructions you'd expect to use for an +// unsigned datatype (first two rows above) or floating-point register, which is +// reasonably uniform across all access sizes. + + +//===------------------------------ +// 2.1 Regular instructions +//===------------------------------ + +// This class covers the basic unsigned or irrelevantly-signed loads and stores, +// to general-purpose and floating-point registers. + +class AddrParams<string prefix> { + Operand uimm12 = !cast<Operand>(prefix # "_uimm12"); + + Operand regextWm = !cast<Operand>(prefix # "_Wm_regext"); + Operand regextXm = !cast<Operand>(prefix # "_Xm_regext"); +} + +def byte_addrparams : AddrParams<"byte">; +def hword_addrparams : AddrParams<"hword">; +def word_addrparams : AddrParams<"word">; +def dword_addrparams : AddrParams<"dword">; +def qword_addrparams : AddrParams<"qword">; + +multiclass A64I_LDRSTR_unsigned<string prefix, bits<2> size, bit v, + bit high_opc, string asmsuffix, + RegisterClass GPR, AddrParams params> { + // Unsigned immediate + def _STR : A64I_LSunsigimm<size, v, {high_opc, 0b0}, + (outs), (ins GPR:$Rt, GPR64xsp:$Rn, params.uimm12:$UImm12), + "str" # asmsuffix # "\t$Rt, [$Rn, $UImm12]", + [], NoItinerary> { + let mayStore = 1; + } + def : InstAlias<"str" # asmsuffix # " $Rt, [$Rn]", + (!cast<Instruction>(prefix # "_STR") GPR:$Rt, GPR64xsp:$Rn, 0)>; + + def _LDR : A64I_LSunsigimm<size, v, {high_opc, 0b1}, + (outs GPR:$Rt), (ins GPR64xsp:$Rn, params.uimm12:$UImm12), + "ldr" # asmsuffix # "\t$Rt, [$Rn, $UImm12]", + [], NoItinerary> { + let mayLoad = 1; + } + def : InstAlias<"ldr" # asmsuffix # " $Rt, [$Rn]", + (!cast<Instruction>(prefix # "_LDR") GPR:$Rt, GPR64xsp:$Rn, 0)>; + + // Register offset (four of these: load/store and Wm/Xm). 
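+  // For example (illustrative): "ldr w0, [x1, w2, uxtw #2]" uses a W-register
+  // index scaled by the access size (#2 since 1 << 2 == 4 bytes), while
+  // "ldr w0, [x1, x2]" uses an X-register index with an implicit lsl #0.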
+ let mayLoad = 1 in { + def _Wm_RegOffset_LDR : A64I_LSregoff<size, v, {high_opc, 0b1}, 0b0, + (outs GPR:$Rt), + (ins GPR64xsp:$Rn, GPR32:$Rm, params.regextWm:$Ext), + "ldr" # asmsuffix # "\t$Rt, [$Rn, $Rm, $Ext]", + [], NoItinerary>; + + def _Xm_RegOffset_LDR : A64I_LSregoff<size, v, {high_opc, 0b1}, 0b1, + (outs GPR:$Rt), + (ins GPR64xsp:$Rn, GPR64:$Rm, params.regextXm:$Ext), + "ldr" # asmsuffix # "\t$Rt, [$Rn, $Rm, $Ext]", + [], NoItinerary>; + } + def : InstAlias<"ldr" # asmsuffix # " $Rt, [$Rn, $Rm]", + (!cast<Instruction>(prefix # "_Xm_RegOffset_LDR") GPR:$Rt, GPR64xsp:$Rn, + GPR64:$Rm, 2)>; + + let mayStore = 1 in { + def _Wm_RegOffset_STR : A64I_LSregoff<size, v, {high_opc, 0b0}, 0b0, + (outs), (ins GPR:$Rt, GPR64xsp:$Rn, GPR32:$Rm, + params.regextWm:$Ext), + "str" # asmsuffix # "\t$Rt, [$Rn, $Rm, $Ext]", + [], NoItinerary>; + + def _Xm_RegOffset_STR : A64I_LSregoff<size, v, {high_opc, 0b0}, 0b1, + (outs), (ins GPR:$Rt, GPR64xsp:$Rn, GPR64:$Rm, + params.regextXm:$Ext), + "str" # asmsuffix # "\t$Rt, [$Rn, $Rm, $Ext]", + [], NoItinerary>; + } + def : InstAlias<"str" # asmsuffix # " $Rt, [$Rn, $Rm]", + (!cast<Instruction>(prefix # "_Xm_RegOffset_STR") GPR:$Rt, GPR64xsp:$Rn, + GPR64:$Rm, 2)>; + + // Unaligned immediate + def _STUR : A64I_LSunalimm<size, v, {high_opc, 0b0}, + (outs), (ins GPR:$Rt, GPR64xsp:$Rn, simm9:$SImm9), + "stur" # asmsuffix # "\t$Rt, [$Rn, $SImm9]", + [], NoItinerary> { + let mayStore = 1; + } + def : InstAlias<"stur" # asmsuffix # " $Rt, [$Rn]", + (!cast<Instruction>(prefix # "_STUR") GPR:$Rt, GPR64xsp:$Rn, 0)>; + + def _LDUR : A64I_LSunalimm<size, v, {high_opc, 0b1}, + (outs GPR:$Rt), (ins GPR64xsp:$Rn, simm9:$SImm9), + "ldur" # asmsuffix # "\t$Rt, [$Rn, $SImm9]", + [], NoItinerary> { + let mayLoad = 1; + } + def : InstAlias<"ldur" # asmsuffix # " $Rt, [$Rn]", + (!cast<Instruction>(prefix # "_LDUR") GPR:$Rt, GPR64xsp:$Rn, 0)>; + + // Post-indexed + def _PostInd_STR : A64I_LSpostind<size, v, {high_opc, 0b0}, + (outs GPR64xsp:$Rn_wb), + (ins GPR:$Rt, GPR64xsp:$Rn, simm9:$SImm9), + "str" # asmsuffix # "\t$Rt, [$Rn], $SImm9", + [], NoItinerary> { + let Constraints = "$Rn = $Rn_wb"; + let mayStore = 1; + + // Decoder only needed for unpredictability checking (FIXME). + let DecoderMethod = "DecodeSingleIndexedInstruction"; + } + + def _PostInd_LDR : A64I_LSpostind<size, v, {high_opc, 0b1}, + (outs GPR:$Rt, GPR64xsp:$Rn_wb), + (ins GPR64xsp:$Rn, simm9:$SImm9), + "ldr" # asmsuffix # "\t$Rt, [$Rn], $SImm9", + [], NoItinerary> { + let mayLoad = 1; + let Constraints = "$Rn = $Rn_wb"; + let DecoderMethod = "DecodeSingleIndexedInstruction"; + } + + // Pre-indexed + def _PreInd_STR : A64I_LSpreind<size, v, {high_opc, 0b0}, + (outs GPR64xsp:$Rn_wb), + (ins GPR:$Rt, GPR64xsp:$Rn, simm9:$SImm9), + "str" # asmsuffix # "\t$Rt, [$Rn, $SImm9]!", + [], NoItinerary> { + let Constraints = "$Rn = $Rn_wb"; + let mayStore = 1; + + // Decoder only needed for unpredictability checking (FIXME). 
+ let DecoderMethod = "DecodeSingleIndexedInstruction"; + } + + def _PreInd_LDR : A64I_LSpreind<size, v, {high_opc, 0b1}, + (outs GPR:$Rt, GPR64xsp:$Rn_wb), + (ins GPR64xsp:$Rn, simm9:$SImm9), + "ldr" # asmsuffix # "\t$Rt, [$Rn, $SImm9]!", + [], NoItinerary> { + let mayLoad = 1; + let Constraints = "$Rn = $Rn_wb"; + let DecoderMethod = "DecodeSingleIndexedInstruction"; + } + +} + +// STRB/LDRB: First define the instructions +defm LS8 + : A64I_LDRSTR_unsigned<"LS8", 0b00, 0b0, 0b0, "b", GPR32, byte_addrparams>; + +// STRH/LDRH +defm LS16 + : A64I_LDRSTR_unsigned<"LS16", 0b01, 0b0, 0b0, "h", GPR32, hword_addrparams>; + + +// STR/LDR to/from a W register +defm LS32 + : A64I_LDRSTR_unsigned<"LS32", 0b10, 0b0, 0b0, "", GPR32, word_addrparams>; + +// STR/LDR to/from an X register +defm LS64 + : A64I_LDRSTR_unsigned<"LS64", 0b11, 0b0, 0b0, "", GPR64, dword_addrparams>; + +// STR/LDR to/from a B register +defm LSFP8 + : A64I_LDRSTR_unsigned<"LSFP8", 0b00, 0b1, 0b0, "", FPR8, byte_addrparams>; + +// STR/LDR to/from an H register +defm LSFP16 + : A64I_LDRSTR_unsigned<"LSFP16", 0b01, 0b1, 0b0, "", FPR16, hword_addrparams>; + +// STR/LDR to/from an S register +defm LSFP32 + : A64I_LDRSTR_unsigned<"LSFP32", 0b10, 0b1, 0b0, "", FPR32, word_addrparams>; +// STR/LDR to/from a D register +defm LSFP64 + : A64I_LDRSTR_unsigned<"LSFP64", 0b11, 0b1, 0b0, "", FPR64, dword_addrparams>; +// STR/LDR to/from a Q register +defm LSFP128 + : A64I_LDRSTR_unsigned<"LSFP128", 0b00, 0b1, 0b1, "", FPR128, + qword_addrparams>; + +//===------------------------------ +// 2.3 Signed loads +//===------------------------------ + +// Byte and half-word signed loads can both go into either an X or a W register, +// so it's worth factoring out. Signed word loads don't fit because there is no +// W version. 
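+// For example, "ldrsb w0, [x1]" sign-extends a loaded byte to 32 bits while
+// "ldrsb x0, [x1]" extends the same byte to 64 bits; both come out of the
+// multiclass below.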
+multiclass A64I_LDR_signed<bits<2> size, string asmopcode, AddrParams params, + string prefix> { + // Unsigned offset + def w : A64I_LSunsigimm<size, 0b0, 0b11, + (outs GPR32:$Rt), + (ins GPR64xsp:$Rn, params.uimm12:$UImm12), + "ldrs" # asmopcode # "\t$Rt, [$Rn, $UImm12]", + [], NoItinerary> { + let mayLoad = 1; + } + def : InstAlias<"ldrs" # asmopcode # " $Rt, [$Rn]", + (!cast<Instruction>(prefix # w) GPR32:$Rt, GPR64xsp:$Rn, 0)>; + + def x : A64I_LSunsigimm<size, 0b0, 0b10, + (outs GPR64:$Rt), + (ins GPR64xsp:$Rn, params.uimm12:$UImm12), + "ldrs" # asmopcode # "\t$Rt, [$Rn, $UImm12]", + [], NoItinerary> { + let mayLoad = 1; + } + def : InstAlias<"ldrs" # asmopcode # " $Rt, [$Rn]", + (!cast<Instruction>(prefix # x) GPR64:$Rt, GPR64xsp:$Rn, 0)>; + + // Register offset + let mayLoad = 1 in { + def w_Wm_RegOffset : A64I_LSregoff<size, 0b0, 0b11, 0b0, + (outs GPR32:$Rt), + (ins GPR64xsp:$Rn, GPR32:$Rm, params.regextWm:$Ext), + "ldrs" # asmopcode # "\t$Rt, [$Rn, $Rm, $Ext]", + [], NoItinerary>; + + def w_Xm_RegOffset : A64I_LSregoff<size, 0b0, 0b11, 0b1, + (outs GPR32:$Rt), + (ins GPR64xsp:$Rn, GPR64:$Rm, params.regextXm:$Ext), + "ldrs" # asmopcode # "\t$Rt, [$Rn, $Rm, $Ext]", + [], NoItinerary>; + + def x_Wm_RegOffset : A64I_LSregoff<size, 0b0, 0b10, 0b0, + (outs GPR64:$Rt), + (ins GPR64xsp:$Rn, GPR32:$Rm, params.regextWm:$Ext), + "ldrs" # asmopcode # "\t$Rt, [$Rn, $Rm, $Ext]", + [], NoItinerary>; + + def x_Xm_RegOffset : A64I_LSregoff<size, 0b0, 0b10, 0b1, + (outs GPR64:$Rt), + (ins GPR64xsp:$Rn, GPR64:$Rm, params.regextXm:$Ext), + "ldrs" # asmopcode # "\t$Rt, [$Rn, $Rm, $Ext]", + [], NoItinerary>; + } + def : InstAlias<"ldrs" # asmopcode # " $Rt, [$Rn, $Rm]", + (!cast<Instruction>(prefix # "w_Xm_RegOffset") GPR32:$Rt, GPR64xsp:$Rn, + GPR64:$Rm, 2)>; + + def : InstAlias<"ldrs" # asmopcode # " $Rt, [$Rn, $Rm]", + (!cast<Instruction>(prefix # "x_Xm_RegOffset") GPR64:$Rt, GPR64xsp:$Rn, + GPR64:$Rm, 2)>; + + + let mayLoad = 1 in { + // Unaligned offset + def w_U : A64I_LSunalimm<size, 0b0, 0b11, + (outs GPR32:$Rt), + (ins GPR64xsp:$Rn, simm9:$SImm9), + "ldurs" # asmopcode # "\t$Rt, [$Rn, $SImm9]", + [], NoItinerary>; + + def x_U : A64I_LSunalimm<size, 0b0, 0b10, + (outs GPR64:$Rt), + (ins GPR64xsp:$Rn, simm9:$SImm9), + "ldurs" # asmopcode # "\t$Rt, [$Rn, $SImm9]", + [], NoItinerary>; + + + // Post-indexed + def w_PostInd : A64I_LSpostind<size, 0b0, 0b11, + (outs GPR32:$Rt, GPR64xsp:$Rn_wb), + (ins GPR64xsp:$Rn, simm9:$SImm9), + "ldrs" # asmopcode # "\t$Rt, [$Rn], $SImm9", + [], NoItinerary> { + let Constraints = "$Rn = $Rn_wb"; + let DecoderMethod = "DecodeSingleIndexedInstruction"; + } + + def x_PostInd : A64I_LSpostind<size, 0b0, 0b10, + (outs GPR64:$Rt, GPR64xsp:$Rn_wb), + (ins GPR64xsp:$Rn, simm9:$SImm9), + "ldrs" # asmopcode # "\t$Rt, [$Rn], $SImm9", + [], NoItinerary> { + let Constraints = "$Rn = $Rn_wb"; + let DecoderMethod = "DecodeSingleIndexedInstruction"; + } + + // Pre-indexed + def w_PreInd : A64I_LSpreind<size, 0b0, 0b11, + (outs GPR32:$Rt, GPR64xsp:$Rn_wb), + (ins GPR64xsp:$Rn, simm9:$SImm9), + "ldrs" # asmopcode # "\t$Rt, [$Rn, $SImm9]!", + [], NoItinerary> { + let Constraints = "$Rn = $Rn_wb"; + let DecoderMethod = "DecodeSingleIndexedInstruction"; + } + + def x_PreInd : A64I_LSpreind<size, 0b0, 0b10, + (outs GPR64:$Rt, GPR64xsp:$Rn_wb), + (ins GPR64xsp:$Rn, simm9:$SImm9), + "ldrs" # asmopcode # "\t$Rt, [$Rn, $SImm9]!", + [], NoItinerary> { + let Constraints = "$Rn = $Rn_wb"; + let DecoderMethod = "DecodeSingleIndexedInstruction"; + } + } // let mayLoad = 1 +} + +// LDRSB +defm 
LDRSB : A64I_LDR_signed<0b00, "b", byte_addrparams, "LDRSB">; +// LDRSH +defm LDRSH : A64I_LDR_signed<0b01, "h", hword_addrparams, "LDRSH">; + +// LDRSW: load a 32-bit register, sign-extending to 64-bits. +def LDRSWx + : A64I_LSunsigimm<0b10, 0b0, 0b10, + (outs GPR64:$Rt), + (ins GPR64xsp:$Rn, word_uimm12:$UImm12), + "ldrsw\t$Rt, [$Rn, $UImm12]", + [], NoItinerary> { + let mayLoad = 1; +} +def : InstAlias<"ldrsw $Rt, [$Rn]", (LDRSWx GPR64:$Rt, GPR64xsp:$Rn, 0)>; + +let mayLoad = 1 in { + def LDRSWx_Wm_RegOffset : A64I_LSregoff<0b10, 0b0, 0b10, 0b0, + (outs GPR64:$Rt), + (ins GPR64xsp:$Rn, GPR32:$Rm, word_Wm_regext:$Ext), + "ldrsw\t$Rt, [$Rn, $Rm, $Ext]", + [], NoItinerary>; + + def LDRSWx_Xm_RegOffset : A64I_LSregoff<0b10, 0b0, 0b10, 0b1, + (outs GPR64:$Rt), + (ins GPR64xsp:$Rn, GPR64:$Rm, word_Xm_regext:$Ext), + "ldrsw\t$Rt, [$Rn, $Rm, $Ext]", + [], NoItinerary>; +} +def : InstAlias<"ldrsw $Rt, [$Rn, $Rm]", + (LDRSWx_Xm_RegOffset GPR64:$Rt, GPR64xsp:$Rn, GPR64:$Rm, 2)>; + + +def LDURSWx + : A64I_LSunalimm<0b10, 0b0, 0b10, + (outs GPR64:$Rt), + (ins GPR64xsp:$Rn, simm9:$SImm9), + "ldursw\t$Rt, [$Rn, $SImm9]", + [], NoItinerary> { + let mayLoad = 1; +} +def : InstAlias<"ldursw $Rt, [$Rn]", (LDURSWx GPR64:$Rt, GPR64xsp:$Rn, 0)>; + +def LDRSWx_PostInd + : A64I_LSpostind<0b10, 0b0, 0b10, + (outs GPR64:$Rt, GPR64xsp:$Rn_wb), + (ins GPR64xsp:$Rn, simm9:$SImm9), + "ldrsw\t$Rt, [$Rn], $SImm9", + [], NoItinerary> { + let mayLoad = 1; + let Constraints = "$Rn = $Rn_wb"; + let DecoderMethod = "DecodeSingleIndexedInstruction"; +} + +def LDRSWx_PreInd : A64I_LSpreind<0b10, 0b0, 0b10, + (outs GPR64:$Rt, GPR64xsp:$Rn_wb), + (ins GPR64xsp:$Rn, simm9:$SImm9), + "ldrsw\t$Rt, [$Rn, $SImm9]!", + [], NoItinerary> { + let mayLoad = 1; + let Constraints = "$Rn = $Rn_wb"; + let DecoderMethod = "DecodeSingleIndexedInstruction"; +} + +//===------------------------------ +// 2.4 Prefetch operations +//===------------------------------ + +def PRFM : A64I_LSunsigimm<0b11, 0b0, 0b10, (outs), + (ins prefetch_op:$Rt, GPR64xsp:$Rn, dword_uimm12:$UImm12), + "prfm\t$Rt, [$Rn, $UImm12]", + [], NoItinerary> { + let mayLoad = 1; +} +def : InstAlias<"prfm $Rt, [$Rn]", + (PRFM prefetch_op:$Rt, GPR64xsp:$Rn, 0)>; + +let mayLoad = 1 in { + def PRFM_Wm_RegOffset : A64I_LSregoff<0b11, 0b0, 0b10, 0b0, (outs), + (ins prefetch_op:$Rt, GPR64xsp:$Rn, + GPR32:$Rm, dword_Wm_regext:$Ext), + "prfm\t$Rt, [$Rn, $Rm, $Ext]", + [], NoItinerary>; + def PRFM_Xm_RegOffset : A64I_LSregoff<0b11, 0b0, 0b10, 0b1, (outs), + (ins prefetch_op:$Rt, GPR64xsp:$Rn, + GPR64:$Rm, dword_Xm_regext:$Ext), + "prfm\t$Rt, [$Rn, $Rm, $Ext]", + [], NoItinerary>; +} + +def : InstAlias<"prfm $Rt, [$Rn, $Rm]", + (PRFM_Xm_RegOffset prefetch_op:$Rt, GPR64xsp:$Rn, + GPR64:$Rm, 2)>; + + +def PRFUM : A64I_LSunalimm<0b11, 0b0, 0b10, (outs), + (ins prefetch_op:$Rt, GPR64xsp:$Rn, simm9:$SImm9), + "prfum\t$Rt, [$Rn, $SImm9]", + [], NoItinerary> { + let mayLoad = 1; +} +def : InstAlias<"prfum $Rt, [$Rn]", + (PRFUM prefetch_op:$Rt, GPR64xsp:$Rn, 0)>; + +//===----------------------------------------------------------------------===// +// Load-store register (unprivileged) instructions +//===----------------------------------------------------------------------===// +// Contains: LDTRB, LDTRH, LDTRSB, LDTRSH, LDTRSW, STTR, STTRB and STTRH + +// These instructions very much mirror the "unscaled immediate" loads, but since +// there are no floating-point variants we need to split them out into their own +// section to avoid instantiation of "ldtr d0, [sp]" etc. 
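+// So, for example, "ldtr w0, [sp, #-4]" is accepted below, but no "ldtr s0"
+// or "ldtr d0" forms are ever created: the multiclass is only instantiated
+// with GPR register classes.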
+ +multiclass A64I_LDTRSTTR<bits<2> size, string asmsuffix, RegisterClass GPR, + string prefix> { + def _UnPriv_STR : A64I_LSunpriv<size, 0b0, 0b00, + (outs), (ins GPR:$Rt, GPR64xsp:$Rn, simm9:$SImm9), + "sttr" # asmsuffix # "\t$Rt, [$Rn, $SImm9]", + [], NoItinerary> { + let mayStore = 1; + } + + def : InstAlias<"sttr" # asmsuffix # " $Rt, [$Rn]", + (!cast<Instruction>(prefix # "_UnPriv_STR") GPR:$Rt, GPR64xsp:$Rn, 0)>; + + def _UnPriv_LDR : A64I_LSunpriv<size, 0b0, 0b01, + (outs GPR:$Rt), (ins GPR64xsp:$Rn, simm9:$SImm9), + "ldtr" # asmsuffix # "\t$Rt, [$Rn, $SImm9]", + [], NoItinerary> { + let mayLoad = 1; + } + + def : InstAlias<"ldtr" # asmsuffix # " $Rt, [$Rn]", + (!cast<Instruction>(prefix # "_UnPriv_LDR") GPR:$Rt, GPR64xsp:$Rn, 0)>; + +} + +// STTRB/LDTRB: First define the instructions +defm LS8 : A64I_LDTRSTTR<0b00, "b", GPR32, "LS8">; + +// STTRH/LDTRH +defm LS16 : A64I_LDTRSTTR<0b01, "h", GPR32, "LS16">; + +// STTR/LDTR to/from a W register +defm LS32 : A64I_LDTRSTTR<0b10, "", GPR32, "LS32">; + +// STTR/LDTR to/from an X register +defm LS64 : A64I_LDTRSTTR<0b11, "", GPR64, "LS64">; + +// Now a class for the signed instructions that can go to either 32 or 64 +// bits... +multiclass A64I_LDTR_signed<bits<2> size, string asmopcode, string prefix> { + let mayLoad = 1 in { + def w : A64I_LSunpriv<size, 0b0, 0b11, + (outs GPR32:$Rt), + (ins GPR64xsp:$Rn, simm9:$SImm9), + "ldtrs" # asmopcode # "\t$Rt, [$Rn, $SImm9]", + [], NoItinerary>; + + def x : A64I_LSunpriv<size, 0b0, 0b10, + (outs GPR64:$Rt), + (ins GPR64xsp:$Rn, simm9:$SImm9), + "ldtrs" # asmopcode # "\t$Rt, [$Rn, $SImm9]", + [], NoItinerary>; + } + + def : InstAlias<"ldtrs" # asmopcode # " $Rt, [$Rn]", + (!cast<Instruction>(prefix # "w") GPR32:$Rt, GPR64xsp:$Rn, 0)>; + + def : InstAlias<"ldtrs" # asmopcode # " $Rt, [$Rn]", + (!cast<Instruction>(prefix # "x") GPR64:$Rt, GPR64xsp:$Rn, 0)>; + +} + +// LDTRSB +defm LDTRSB : A64I_LDTR_signed<0b00, "b", "LDTRSB">; +// LDTRSH +defm LDTRSH : A64I_LDTR_signed<0b01, "h", "LDTRSH">; + +// And finally LDTRSW which only goes to 64 bits. +def LDTRSWx : A64I_LSunpriv<0b10, 0b0, 0b10, + (outs GPR64:$Rt), + (ins GPR64xsp:$Rn, simm9:$SImm9), + "ldtrsw\t$Rt, [$Rn, $SImm9]", + [], NoItinerary> { + let mayLoad = 1; +} +def : InstAlias<"ldtrsw $Rt, [$Rn]", (LDTRSWx GPR64:$Rt, GPR64xsp:$Rn, 0)>; + +//===----------------------------------------------------------------------===// +// Load-store register pair (offset) instructions +//===----------------------------------------------------------------------===// +// +// and +// +//===----------------------------------------------------------------------===// +// Load-store register pair (post-indexed) instructions +//===----------------------------------------------------------------------===// +// Contains: STP, LDP, LDPSW +// +// and +// +//===----------------------------------------------------------------------===// +// Load-store register pair (pre-indexed) instructions +//===----------------------------------------------------------------------===// +// Contains: STP, LDP, LDPSW +// +// and +// +//===----------------------------------------------------------------------===// +// Load-store non-temporal register pair (offset) instructions +//===----------------------------------------------------------------------===// +// Contains: STNP, LDNP + + +// Anything that creates an MCInst (Decoding, selection and AsmParsing) has to +// know the access size via some means. 
An isolated operand does not have this +// information unless told from here, which means we need separate tablegen +// Operands for each access size. This multiclass takes care of instantiating +// the correct template functions in the rest of the backend. + +multiclass offsets_simm7<string MemSize, string prefix> { + // The bare signed 7-bit immediate is used in post-indexed instructions, but + // because of the scaling performed a generic "simm7" operand isn't + // appropriate here either. + def simm7_asmoperand : AsmOperandClass { + let Name = "SImm7_Scaled" # MemSize; + let PredicateMethod = "isSImm7Scaled<" # MemSize # ">"; + let RenderMethod = "addSImm7ScaledOperands<" # MemSize # ">"; + let DiagnosticType = "LoadStoreSImm7_" # MemSize; + } + + def simm7 : Operand<i64> { + let PrintMethod = "printSImm7ScaledOperand<" # MemSize # ">"; + let ParserMatchClass = !cast<AsmOperandClass>(prefix # "simm7_asmoperand"); + } +} + +defm word_ : offsets_simm7<"4", "word_">; +defm dword_ : offsets_simm7<"8", "dword_">; +defm qword_ : offsets_simm7<"16", "qword_">; + +multiclass A64I_LSPsimple<bits<2> opc, bit v, RegisterClass SomeReg, + Operand simm7, string prefix> { + def _STR : A64I_LSPoffset<opc, v, 0b0, (outs), + (ins SomeReg:$Rt, SomeReg:$Rt2, GPR64xsp:$Rn, simm7:$SImm7), + "stp\t$Rt, $Rt2, [$Rn, $SImm7]", [], NoItinerary> { + let mayStore = 1; + let DecoderMethod = "DecodeLDSTPairInstruction"; + } + def : InstAlias<"stp $Rt, $Rt2, [$Rn]", + (!cast<Instruction>(prefix # "_STR") SomeReg:$Rt, + SomeReg:$Rt2, GPR64xsp:$Rn, 0)>; + + def _LDR : A64I_LSPoffset<opc, v, 0b1, + (outs SomeReg:$Rt, SomeReg:$Rt2), + (ins GPR64xsp:$Rn, simm7:$SImm7), + "ldp\t$Rt, $Rt2, [$Rn, $SImm7]", [], NoItinerary> { + let mayLoad = 1; + let DecoderMethod = "DecodeLDSTPairInstruction"; + } + def : InstAlias<"ldp $Rt, $Rt2, [$Rn]", + (!cast<Instruction>(prefix # "_LDR") SomeReg:$Rt, + SomeReg:$Rt2, GPR64xsp:$Rn, 0)>; + + def _PostInd_STR : A64I_LSPpostind<opc, v, 0b0, + (outs GPR64xsp:$Rn_wb), + (ins SomeReg:$Rt, SomeReg:$Rt2, + GPR64xsp:$Rn, + simm7:$SImm7), + "stp\t$Rt, $Rt2, [$Rn], $SImm7", + [], NoItinerary> { + let mayStore = 1; + let Constraints = "$Rn = $Rn_wb"; + + // Decoder only needed for unpredictability checking (FIXME). 
+ let DecoderMethod = "DecodeLDSTPairInstruction"; + } + + def _PostInd_LDR : A64I_LSPpostind<opc, v, 0b1, + (outs SomeReg:$Rt, SomeReg:$Rt2, GPR64xsp:$Rn_wb), + (ins GPR64xsp:$Rn, simm7:$SImm7), + "ldp\t$Rt, $Rt2, [$Rn], $SImm7", + [], NoItinerary> { + let mayLoad = 1; + let Constraints = "$Rn = $Rn_wb"; + let DecoderMethod = "DecodeLDSTPairInstruction"; + } + + def _PreInd_STR : A64I_LSPpreind<opc, v, 0b0, (outs GPR64xsp:$Rn_wb), + (ins SomeReg:$Rt, SomeReg:$Rt2, GPR64xsp:$Rn, simm7:$SImm7), + "stp\t$Rt, $Rt2, [$Rn, $SImm7]!", + [], NoItinerary> { + let mayStore = 1; + let Constraints = "$Rn = $Rn_wb"; + let DecoderMethod = "DecodeLDSTPairInstruction"; + } + + def _PreInd_LDR : A64I_LSPpreind<opc, v, 0b1, + (outs SomeReg:$Rt, SomeReg:$Rt2, GPR64xsp:$Rn_wb), + (ins GPR64xsp:$Rn, simm7:$SImm7), + "ldp\t$Rt, $Rt2, [$Rn, $SImm7]!", + [], NoItinerary> { + let mayLoad = 1; + let Constraints = "$Rn = $Rn_wb"; + let DecoderMethod = "DecodeLDSTPairInstruction"; + } + + def _NonTemp_STR : A64I_LSPnontemp<opc, v, 0b0, (outs), + (ins SomeReg:$Rt, SomeReg:$Rt2, GPR64xsp:$Rn, simm7:$SImm7), + "stnp\t$Rt, $Rt2, [$Rn, $SImm7]", [], NoItinerary> { + let mayStore = 1; + let DecoderMethod = "DecodeLDSTPairInstruction"; + } + def : InstAlias<"stnp $Rt, $Rt2, [$Rn]", + (!cast<Instruction>(prefix # "_NonTemp_STR") SomeReg:$Rt, + SomeReg:$Rt2, GPR64xsp:$Rn, 0)>; + + def _NonTemp_LDR : A64I_LSPnontemp<opc, v, 0b1, + (outs SomeReg:$Rt, SomeReg:$Rt2), + (ins GPR64xsp:$Rn, simm7:$SImm7), + "ldnp\t$Rt, $Rt2, [$Rn, $SImm7]", [], NoItinerary> { + let mayLoad = 1; + let DecoderMethod = "DecodeLDSTPairInstruction"; + } + def : InstAlias<"ldnp $Rt, $Rt2, [$Rn]", + (!cast<Instruction>(prefix # "_NonTemp_LDR") SomeReg:$Rt, + SomeReg:$Rt2, GPR64xsp:$Rn, 0)>; + +} + + +defm LSPair32 : A64I_LSPsimple<0b00, 0b0, GPR32, word_simm7, "LSPair32">; +defm LSPair64 : A64I_LSPsimple<0b10, 0b0, GPR64, dword_simm7, "LSPair64">; +defm LSFPPair32 : A64I_LSPsimple<0b00, 0b1, FPR32, word_simm7, "LSFPPair32">; +defm LSFPPair64 : A64I_LSPsimple<0b01, 0b1, FPR64, dword_simm7, "LSFPPair64">; +defm LSFPPair128 : A64I_LSPsimple<0b10, 0b1, FPR128, qword_simm7, + "LSFPPair128">; + + +def LDPSWx : A64I_LSPoffset<0b01, 0b0, 0b1, + (outs GPR64:$Rt, GPR64:$Rt2), + (ins GPR64xsp:$Rn, word_simm7:$SImm7), + "ldpsw\t$Rt, $Rt2, [$Rn, $SImm7]", [], NoItinerary> { + let mayLoad = 1; + let DecoderMethod = "DecodeLDSTPairInstruction"; +} +def : InstAlias<"ldpsw $Rt, $Rt2, [$Rn]", + (LDPSWx GPR64:$Rt, GPR64:$Rt2, GPR64xsp:$Rn, 0)>; + +def LDPSWx_PostInd : A64I_LSPpostind<0b01, 0b0, 0b1, + (outs GPR64:$Rt, GPR64:$Rt2, GPR64:$Rn_wb), + (ins GPR64xsp:$Rn, word_simm7:$SImm7), + "ldpsw\t$Rt, $Rt2, [$Rn], $SImm7", + [], NoItinerary> { + let mayLoad = 1; + let Constraints = "$Rn = $Rn_wb"; + let DecoderMethod = "DecodeLDSTPairInstruction"; +} + +def LDPSWx_PreInd : A64I_LSPpreind<0b01, 0b0, 0b1, + (outs GPR64:$Rt, GPR64:$Rt2, GPR64:$Rn_wb), + (ins GPR64xsp:$Rn, word_simm7:$SImm7), + "ldpsw\t$Rt, $Rt2, [$Rn, $SImm7]!", + [], NoItinerary> { + let mayLoad = 1; + let Constraints = "$Rn = $Rn_wb"; + let DecoderMethod = "DecodeLDSTPairInstruction"; +} + +//===----------------------------------------------------------------------===// +// Logical (immediate) instructions +//===----------------------------------------------------------------------===// +// Contains: AND, ORR, EOR, ANDS, + aliases TST, MOV + +multiclass logical_imm_operands<string prefix, string note, + int size, ValueType VT> { + def _asmoperand : AsmOperandClass { + let Name = "LogicalImm" # note # size; + 
let PredicateMethod = "isLogicalImm" # note # "<" # size # ">"; + let RenderMethod = "addLogicalImmOperands<" # size # ">"; + let DiagnosticType = "LogicalSecondSource"; + } + + def _operand + : Operand<VT>, ComplexPattern<VT, 1, "SelectLogicalImm", [imm]> { + let ParserMatchClass = !cast<AsmOperandClass>(prefix # "_asmoperand"); + let PrintMethod = "printLogicalImmOperand<" # size # ">"; + let DecoderMethod = "DecodeLogicalImmOperand<" # size # ">"; + } +} + +defm logical_imm32 : logical_imm_operands<"logical_imm32", "", 32, i32>; +defm logical_imm64 : logical_imm_operands<"logical_imm64", "", 64, i64>; + +// The mov versions only differ in assembly parsing, where they +// exclude values representable with either MOVZ or MOVN. +defm logical_imm32_mov + : logical_imm_operands<"logical_imm32_mov", "MOV", 32, i32>; +defm logical_imm64_mov + : logical_imm_operands<"logical_imm64_mov", "MOV", 64, i64>; + + +multiclass A64I_logimmSizes<bits<2> opc, string asmop, SDNode opnode> { + def wwi : A64I_logicalimm<0b0, opc, (outs GPR32wsp:$Rd), + (ins GPR32:$Rn, logical_imm32_operand:$Imm), + !strconcat(asmop, "\t$Rd, $Rn, $Imm"), + [(set GPR32wsp:$Rd, + (opnode GPR32:$Rn, logical_imm32_operand:$Imm))], + NoItinerary>; + + def xxi : A64I_logicalimm<0b1, opc, (outs GPR64xsp:$Rd), + (ins GPR64:$Rn, logical_imm64_operand:$Imm), + !strconcat(asmop, "\t$Rd, $Rn, $Imm"), + [(set GPR64xsp:$Rd, + (opnode GPR64:$Rn, logical_imm64_operand:$Imm))], + NoItinerary>; +} + +defm AND : A64I_logimmSizes<0b00, "and", and>; +defm ORR : A64I_logimmSizes<0b01, "orr", or>; +defm EOR : A64I_logimmSizes<0b10, "eor", xor>; + +let Defs = [NZCV] in { + def ANDSwwi : A64I_logicalimm<0b0, 0b11, (outs GPR32:$Rd), + (ins GPR32:$Rn, logical_imm32_operand:$Imm), + "ands\t$Rd, $Rn, $Imm", + [], NoItinerary>; + + def ANDSxxi : A64I_logicalimm<0b1, 0b11, (outs GPR64:$Rd), + (ins GPR64:$Rn, logical_imm64_operand:$Imm), + "ands\t$Rd, $Rn, $Imm", + [], NoItinerary>; +} + + +def : InstAlias<"tst $Rn, $Imm", + (ANDSwwi WZR, GPR32:$Rn, logical_imm32_operand:$Imm)>; +def : InstAlias<"tst $Rn, $Imm", + (ANDSxxi XZR, GPR64:$Rn, logical_imm64_operand:$Imm)>; +def : InstAlias<"mov $Rd, $Imm", + (ORRwwi GPR32wsp:$Rd, WZR, logical_imm32_mov_operand:$Imm)>; +def : InstAlias<"mov $Rd, $Imm", + (ORRxxi GPR64xsp:$Rd, XZR, logical_imm64_mov_operand:$Imm)>; + +//===----------------------------------------------------------------------===// +// Logical (shifted register) instructions +//===----------------------------------------------------------------------===// +// Contains: AND, BIC, ORR, ORN, EOR, EON, ANDS, BICS + aliases TST, MVN, MOV + +// Operand for optimizing (icmp (and LHS, RHS), 0, SomeCode). In theory "ANDS" +// behaves differently for unsigned comparisons, so we defensively only allow +// signed or n/a as the operand. In practice "unsigned greater than 0" is "not +// equal to 0" and LLVM gives us this. +def signed_cond : PatLeaf<(cond), [{ + return !isUnsignedIntSetCC(N->get()); +}]>; + + +// These instructions share their "shift" operands with add/sub (shifted +// register instructions). They are defined there. + +// N.b. the commutable parameter is just !N. It will be first against the wall +// when the revolution comes. 
+multiclass logical_shifts<string prefix, bit sf, bits<2> opc, + bit N, bit commutable, + string asmop, SDPatternOperator opfrag, string sty, + RegisterClass GPR, list<Register> defs> { + let isCommutable = commutable, Defs = defs in { + def _lsl : A64I_logicalshift<sf, opc, 0b00, N, + (outs GPR:$Rd), + (ins GPR:$Rn, GPR:$Rm, + !cast<Operand>("lsl_operand_" # sty):$Imm6), + !strconcat(asmop, "\t$Rd, $Rn, $Rm, $Imm6"), + [(set GPR:$Rd, (opfrag GPR:$Rn, (shl GPR:$Rm, + !cast<Operand>("lsl_operand_" # sty):$Imm6)) + )], + NoItinerary>; + + def _lsr : A64I_logicalshift<sf, opc, 0b01, N, + (outs GPR:$Rd), + (ins GPR:$Rn, GPR:$Rm, + !cast<Operand>("lsr_operand_" # sty):$Imm6), + !strconcat(asmop, "\t$Rd, $Rn, $Rm, $Imm6"), + [(set GPR:$Rd, (opfrag GPR:$Rn, (srl GPR:$Rm, + !cast<Operand>("lsr_operand_" # sty):$Imm6)) + )], + NoItinerary>; + + def _asr : A64I_logicalshift<sf, opc, 0b10, N, + (outs GPR:$Rd), + (ins GPR:$Rn, GPR:$Rm, + !cast<Operand>("asr_operand_" # sty):$Imm6), + !strconcat(asmop, "\t$Rd, $Rn, $Rm, $Imm6"), + [(set GPR:$Rd, (opfrag GPR:$Rn, (sra GPR:$Rm, + !cast<Operand>("asr_operand_" # sty):$Imm6)) + )], + NoItinerary>; + + def _ror : A64I_logicalshift<sf, opc, 0b11, N, + (outs GPR:$Rd), + (ins GPR:$Rn, GPR:$Rm, + !cast<Operand>("ror_operand_" # sty):$Imm6), + !strconcat(asmop, "\t$Rd, $Rn, $Rm, $Imm6"), + [(set GPR:$Rd, (opfrag GPR:$Rn, (rotr GPR:$Rm, + !cast<Operand>("ror_operand_" # sty):$Imm6)) + )], + NoItinerary>; + } + + def _noshift + : InstAlias<!strconcat(asmop, " $Rd, $Rn, $Rm"), + (!cast<Instruction>(prefix # "_lsl") GPR:$Rd, GPR:$Rn, + GPR:$Rm, 0)>; + + def : Pat<(opfrag GPR:$Rn, GPR:$Rm), + (!cast<Instruction>(prefix # "_lsl") GPR:$Rn, GPR:$Rm, 0)>; +} + +multiclass logical_sizes<string prefix, bits<2> opc, bit N, bit commutable, + string asmop, SDPatternOperator opfrag, + list<Register> defs> { + defm xxx : logical_shifts<prefix # "xxx", 0b1, opc, N, + commutable, asmop, opfrag, "i64", GPR64, defs>; + defm www : logical_shifts<prefix # "www", 0b0, opc, N, + commutable, asmop, opfrag, "i32", GPR32, defs>; +} + + +defm AND : logical_sizes<"AND", 0b00, 0b0, 0b1, "and", and, []>; +defm ORR : logical_sizes<"ORR", 0b01, 0b0, 0b1, "orr", or, []>; +defm EOR : logical_sizes<"EOR", 0b10, 0b0, 0b1, "eor", xor, []>; +defm ANDS : logical_sizes<"ANDS", 0b11, 0b0, 0b1, "ands", + PatFrag<(ops node:$lhs, node:$rhs), (and node:$lhs, node:$rhs), + [{ (void)N; return false; }]>, + [NZCV]>; + +defm BIC : logical_sizes<"BIC", 0b00, 0b1, 0b0, "bic", + PatFrag<(ops node:$lhs, node:$rhs), + (and node:$lhs, (not node:$rhs))>, []>; +defm ORN : logical_sizes<"ORN", 0b01, 0b1, 0b0, "orn", + PatFrag<(ops node:$lhs, node:$rhs), + (or node:$lhs, (not node:$rhs))>, []>; +defm EON : logical_sizes<"EON", 0b10, 0b1, 0b0, "eon", + PatFrag<(ops node:$lhs, node:$rhs), + (xor node:$lhs, (not node:$rhs))>, []>; +defm BICS : logical_sizes<"BICS", 0b11, 0b1, 0b0, "bics", + PatFrag<(ops node:$lhs, node:$rhs), + (and node:$lhs, (not node:$rhs)), + [{ (void)N; return false; }]>, + [NZCV]>; + +multiclass tst_shifts<string prefix, bit sf, string sty, RegisterClass GPR> { + let isCommutable = 1, Rd = 0b11111, Defs = [NZCV] in { + def _lsl : A64I_logicalshift<sf, 0b11, 0b00, 0b0, + (outs), + (ins GPR:$Rn, GPR:$Rm, + !cast<Operand>("lsl_operand_" # sty):$Imm6), + "tst\t$Rn, $Rm, $Imm6", + [(set NZCV, (A64setcc (and GPR:$Rn, (shl GPR:$Rm, + !cast<Operand>("lsl_operand_" # sty):$Imm6)), + 0, signed_cond))], + NoItinerary>; + + + def _lsr : A64I_logicalshift<sf, 0b11, 0b01, 0b0, + (outs), + (ins GPR:$Rn, GPR:$Rm, + 
!cast<Operand>("lsr_operand_" # sty):$Imm6), + "tst\t$Rn, $Rm, $Imm6", + [(set NZCV, (A64setcc (and GPR:$Rn, (srl GPR:$Rm, + !cast<Operand>("lsr_operand_" # sty):$Imm6)), + 0, signed_cond))], + NoItinerary>; + + def _asr : A64I_logicalshift<sf, 0b11, 0b10, 0b0, + (outs), + (ins GPR:$Rn, GPR:$Rm, + !cast<Operand>("asr_operand_" # sty):$Imm6), + "tst\t$Rn, $Rm, $Imm6", + [(set NZCV, (A64setcc (and GPR:$Rn, (sra GPR:$Rm, + !cast<Operand>("asr_operand_" # sty):$Imm6)), + 0, signed_cond))], + NoItinerary>; + + def _ror : A64I_logicalshift<sf, 0b11, 0b11, 0b0, + (outs), + (ins GPR:$Rn, GPR:$Rm, + !cast<Operand>("ror_operand_" # sty):$Imm6), + "tst\t$Rn, $Rm, $Imm6", + [(set NZCV, (A64setcc (and GPR:$Rn, (rotr GPR:$Rm, + !cast<Operand>("ror_operand_" # sty):$Imm6)), + 0, signed_cond))], + NoItinerary>; + } + + def _noshift : InstAlias<"tst $Rn, $Rm", + (!cast<Instruction>(prefix # "_lsl") GPR:$Rn, GPR:$Rm, 0)>; + + def : Pat<(A64setcc (and GPR:$Rn, GPR:$Rm), 0, signed_cond), + (!cast<Instruction>(prefix # "_lsl") GPR:$Rn, GPR:$Rm, 0)>; +} + +defm TSTxx : tst_shifts<"TSTxx", 0b1, "i64", GPR64>; +defm TSTww : tst_shifts<"TSTww", 0b0, "i32", GPR32>; + + +multiclass mvn_shifts<string prefix, bit sf, string sty, RegisterClass GPR> { + let isCommutable = 0, Rn = 0b11111 in { + def _lsl : A64I_logicalshift<sf, 0b01, 0b00, 0b1, + (outs GPR:$Rd), + (ins GPR:$Rm, + !cast<Operand>("lsl_operand_" # sty):$Imm6), + "mvn\t$Rd, $Rm, $Imm6", + [(set GPR:$Rd, (not (shl GPR:$Rm, + !cast<Operand>("lsl_operand_" # sty):$Imm6)))], + NoItinerary>; + + + def _lsr : A64I_logicalshift<sf, 0b01, 0b01, 0b1, + (outs GPR:$Rd), + (ins GPR:$Rm, + !cast<Operand>("lsr_operand_" # sty):$Imm6), + "mvn\t$Rd, $Rm, $Imm6", + [(set GPR:$Rd, (not (srl GPR:$Rm, + !cast<Operand>("lsr_operand_" # sty):$Imm6)))], + NoItinerary>; + + def _asr : A64I_logicalshift<sf, 0b01, 0b10, 0b1, + (outs GPR:$Rd), + (ins GPR:$Rm, + !cast<Operand>("asr_operand_" # sty):$Imm6), + "mvn\t$Rd, $Rm, $Imm6", + [(set GPR:$Rd, (not (sra GPR:$Rm, + !cast<Operand>("asr_operand_" # sty):$Imm6)))], + NoItinerary>; + + def _ror : A64I_logicalshift<sf, 0b01, 0b11, 0b1, + (outs GPR:$Rd), + (ins GPR:$Rm, + !cast<Operand>("ror_operand_" # sty):$Imm6), + "mvn\t$Rd, $Rm, $Imm6", + [(set GPR:$Rd, (not (rotr GPR:$Rm, + !cast<Operand>("lsl_operand_" # sty):$Imm6)))], + NoItinerary>; + } + + def _noshift : InstAlias<"mvn $Rn, $Rm", + (!cast<Instruction>(prefix # "_lsl") GPR:$Rn, GPR:$Rm, 0)>; + + def : Pat<(not GPR:$Rm), + (!cast<Instruction>(prefix # "_lsl") GPR:$Rm, 0)>; +} + +defm MVNxx : mvn_shifts<"MVNxx", 0b1, "i64", GPR64>; +defm MVNww : mvn_shifts<"MVNww", 0b0, "i32", GPR32>; + +def MOVxx :InstAlias<"mov $Rd, $Rm", (ORRxxx_lsl GPR64:$Rd, XZR, GPR64:$Rm, 0)>; +def MOVww :InstAlias<"mov $Rd, $Rm", (ORRwww_lsl GPR32:$Rd, WZR, GPR32:$Rm, 0)>; + +//===----------------------------------------------------------------------===// +// Move wide (immediate) instructions +//===----------------------------------------------------------------------===// +// Contains: MOVN, MOVZ, MOVK + MOV aliases + +// A wide variety of different relocations are needed for variants of these +// instructions, so it turns out that we need a different operand for all of +// them. 
+multiclass movw_operands<string prefix, string instname, int width> {
+  def _imm_asmoperand : AsmOperandClass {
+    let Name = instname # width # "Shifted" # shift;
+    let PredicateMethod = "is" # instname # width # "Imm";
+    let RenderMethod = "addMoveWideImmOperands";
+    let ParserMethod = "ParseImmWithLSLOperand";
+    let DiagnosticType = "MOVWUImm16";
+  }
+
+  def _imm : Operand<i32> {
+    let ParserMatchClass = !cast<AsmOperandClass>(prefix # "_imm_asmoperand");
+    let PrintMethod = "printMoveWideImmOperand";
+    let EncoderMethod = "getMoveWideImmOpValue";
+    let DecoderMethod = "DecodeMoveWideImmOperand<" # width # ">";
+
+    let MIOperandInfo = (ops uimm16:$UImm16, imm:$Shift);
+  }
+}
+
+defm movn32 : movw_operands<"movn32", "MOVN", 32>;
+defm movn64 : movw_operands<"movn64", "MOVN", 64>;
+defm movz32 : movw_operands<"movz32", "MOVZ", 32>;
+defm movz64 : movw_operands<"movz64", "MOVZ", 64>;
+defm movk32 : movw_operands<"movk32", "MOVK", 32>;
+defm movk64 : movw_operands<"movk64", "MOVK", 64>;
+
+multiclass A64I_movwSizes<bits<2> opc, string asmop, dag ins32bit,
+                          dag ins64bit> {
+
+  def wii : A64I_movw<0b0, opc, (outs GPR32:$Rd), ins32bit,
+                      !strconcat(asmop, "\t$Rd, $FullImm"),
+                      [], NoItinerary> {
+    bits<18> FullImm;
+    let UImm16 = FullImm{15-0};
+    let Shift = FullImm{17-16};
+  }
+
+  def xii : A64I_movw<0b1, opc, (outs GPR64:$Rd), ins64bit,
+                      !strconcat(asmop, "\t$Rd, $FullImm"),
+                      [], NoItinerary> {
+    bits<18> FullImm;
+    let UImm16 = FullImm{15-0};
+    let Shift = FullImm{17-16};
+  }
+}
+
+let isMoveImm = 1, isReMaterializable = 1,
+    isAsCheapAsAMove = 1, hasSideEffects = 0 in {
+  defm MOVN : A64I_movwSizes<0b00, "movn",
+                             (ins movn32_imm:$FullImm),
+                             (ins movn64_imm:$FullImm)>;
+
+  // Some relocations are able to convert between a MOVZ and a MOVN. If these
+  // are applied the instruction must be emitted with the corresponding bits as
+  // 0, which means a MOVZ needs to override that bit from the default.
+  let PostEncoderMethod = "fixMOVZ" in
+    defm MOVZ : A64I_movwSizes<0b10, "movz",
+                               (ins movz32_imm:$FullImm),
+                               (ins movz64_imm:$FullImm)>;
+}
+
+let Constraints = "$src = $Rd" in
+defm MOVK : A64I_movwSizes<0b11, "movk",
+                           (ins GPR32:$src, movk32_imm:$FullImm),
+                           (ins GPR64:$src, movk64_imm:$FullImm)>;
+
+
+// And now the "MOV" aliases. These also need their own operands because what
+// they accept is completely different to what the base instructions accept.
+multiclass movalias_operand<string prefix, string basename,
+                            string immpredicate, int width> {
+  def _asmoperand : AsmOperandClass {
+    let Name = basename # width # "MovAlias";
+    let PredicateMethod
+          = "isMoveWideMovAlias<" # width # ", A64Imms::" # immpredicate # ">";
+    let RenderMethod
+          = "addMoveWideMovAliasOperands<" # width # ", "
+                                           # "A64Imms::" # immpredicate # ">";
+  }
+
+  def _movimm : Operand<i32> {
+    let ParserMatchClass = !cast<AsmOperandClass>(prefix # "_asmoperand");
+
+    let MIOperandInfo = (ops uimm16:$UImm16, imm:$Shift);
+  }
+}
+
+defm movz32 : movalias_operand<"movz32", "MOVZ", "isMOVZImm", 32>;
+defm movz64 : movalias_operand<"movz64", "MOVZ", "isMOVZImm", 64>;
+defm movn32 : movalias_operand<"movn32", "MOVN", "isOnlyMOVNImm", 32>;
+defm movn64 : movalias_operand<"movn64", "MOVN", "isOnlyMOVNImm", 64>;
+
+// FIXME: these are officially canonical aliases, but TableGen is too limited
+// to print them at the moment. I believe in this case an "AliasPredicate"
+// method will need to be implemented to allow it, as well as the more
+// generally useful handling of non-register, non-constant operands.
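+// The intended behaviour, for example, is that "mov w0, #65536" matches
+// MOVZwii with UImm16 = 1 and Shift = 1, i.e. "movz w0, #1, lsl #16".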
+class movalias<Instruction INST, RegisterClass GPR, Operand operand> + : InstAlias<"mov $Rd, $FullImm", (INST GPR:$Rd, operand:$FullImm)>; + +def : movalias<MOVZwii, GPR32, movz32_movimm>; +def : movalias<MOVZxii, GPR64, movz64_movimm>; +def : movalias<MOVNwii, GPR32, movn32_movimm>; +def : movalias<MOVNxii, GPR64, movn64_movimm>; + +//===----------------------------------------------------------------------===// +// PC-relative addressing instructions +//===----------------------------------------------------------------------===// +// Contains: ADR, ADRP + +def adr_label : Operand<i64> { + let EncoderMethod = "getLabelOpValue<AArch64::fixup_a64_adr_prel>"; + + // This label is a 21-bit offset from PC, unscaled + let PrintMethod = "printLabelOperand<21, 1>"; + let ParserMatchClass = label_asmoperand<21, 1>; + let OperandType = "OPERAND_PCREL"; +} + +def adrp_label_asmoperand : AsmOperandClass { + let Name = "AdrpLabel"; + let RenderMethod = "addLabelOperands<21, 4096>"; + let DiagnosticType = "Label"; +} + +def adrp_label : Operand<i64> { + let EncoderMethod = "getAdrpLabelOpValue"; + + // This label is a 21-bit offset from PC, scaled by the page-size: 4096. + let PrintMethod = "printLabelOperand<21, 4096>"; + let ParserMatchClass = adrp_label_asmoperand; + let OperandType = "OPERAND_PCREL"; +} + +let hasSideEffects = 0 in { + def ADRxi : A64I_PCADR<0b0, (outs GPR64:$Rd), (ins adr_label:$Label), + "adr\t$Rd, $Label", [], NoItinerary>; + + def ADRPxi : A64I_PCADR<0b1, (outs GPR64:$Rd), (ins adrp_label:$Label), + "adrp\t$Rd, $Label", [], NoItinerary>; +} + +//===----------------------------------------------------------------------===// +// System instructions +//===----------------------------------------------------------------------===// +// Contains: HINT, CLREX, DSB, DMB, ISB, MSR, SYS, SYSL, MRS +// + aliases IC, DC, AT, TLBI, NOP, YIELD, WFE, WFI, SEV, SEVL + +// Op1 and Op2 fields are sometimes simple 3-bit unsigned immediate values. +def uimm3_asmoperand : AsmOperandClass { + let Name = "UImm3"; + let PredicateMethod = "isUImm<3>"; + let RenderMethod = "addImmOperands"; + let DiagnosticType = "UImm3"; +} + +def uimm3 : Operand<i32> { + let ParserMatchClass = uimm3_asmoperand; +} + +// The HINT alias can accept a simple unsigned 7-bit immediate. +def uimm7_asmoperand : AsmOperandClass { + let Name = "UImm7"; + let PredicateMethod = "isUImm<7>"; + let RenderMethod = "addImmOperands"; + let DiagnosticType = "UImm7"; +} + +def uimm7 : Operand<i32> { + let ParserMatchClass = uimm7_asmoperand; +} + +// Multiclass namedimm is defined with the prefetch operands. Most of these fit +// into the NamedImmMapper scheme well: they either accept a named operand or +// any immediate under a particular value (which may be 0, implying no immediate +// is allowed). +defm dbarrier : namedimm<"dbarrier", "A64DB::DBarrierMapper">; +defm isb : namedimm<"isb", "A64ISB::ISBMapper">; +defm ic : namedimm<"ic", "A64IC::ICMapper">; +defm dc : namedimm<"dc", "A64DC::DCMapper">; +defm at : namedimm<"at", "A64AT::ATMapper">; +defm tlbi : namedimm<"tlbi", "A64TLBI::TLBIMapper">; + +// However, MRS and MSR are more complicated for a few reasons: +// * There are ~1000 generic names S3_<op1>_<CRn>_<CRm>_<Op2> which have an +// implementation-defined effect +// * Most registers are shared, but some are read-only or write-only. +// * There is a variant of MSR which accepts the same register name (SPSel), +// but which would have a different encoding. 
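+//
+// For example, "msr spsel, x0" (register form) and "msr spsel, #1" (immediate
+// form) spell the register operand identically but need the MSRix and MSRii
+// encodings defined below, respectively.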
+
+// In principle these could be resolved with more complicated subclasses of
+// NamedImmMapper; however, that imposes an overhead on other "named
+// immediates", both in concrete terms (virtual tables) and in unnecessary
+// abstraction.
+
+// The solution adopted here is to take the MRS/MSR Mappers out of the usual
+// hierarchy (they're not derived from NamedImmMapper) and to add logic for
+// their special situation.
+def mrs_asmoperand : AsmOperandClass {
+  let Name = "MRS";
+  let ParserMethod = "ParseSysRegOperand";
+  let DiagnosticType = "MRS";
+}
+
+def mrs_op : Operand<i32> {
+  let ParserMatchClass = mrs_asmoperand;
+  let PrintMethod = "printMRSOperand";
+  let DecoderMethod = "DecodeMRSOperand";
+}
+
+def msr_asmoperand : AsmOperandClass {
+  let Name = "MSRWithReg";
+
+  // Note that SPSel is valid for both this and the pstate operands, but with
+  // different immediate encodings. This is why these operands provide a string
+  // AArch64Operand rather than an immediate. The overlap is small enough that
+  // it could be resolved with hackery now, but who can say in future?
+  let ParserMethod = "ParseSysRegOperand";
+  let DiagnosticType = "MSR";
+}
+
+def msr_op : Operand<i32> {
+  let ParserMatchClass = msr_asmoperand;
+  let PrintMethod = "printMSROperand";
+  let DecoderMethod = "DecodeMSROperand";
+}
+
+def pstate_asmoperand : AsmOperandClass {
+  let Name = "MSRPState";
+  // See comment above about parser.
+  let ParserMethod = "ParseSysRegOperand";
+  let DiagnosticType = "MSR";
+}
+
+def pstate_op : Operand<i32> {
+  let ParserMatchClass = pstate_asmoperand;
+  let PrintMethod = "printNamedImmOperand<A64PState::PStateMapper>";
+  let DecoderMethod = "DecodeNamedImmOperand<A64PState::PStateMapper>";
+}
+
+// When <CRn> is specified, an assembler should accept something like "C4", not
+// the usual "#4" immediate.
+def CRx_asmoperand : AsmOperandClass {
+  let Name = "CRx";
+  let PredicateMethod = "isUImm<4>";
+  let RenderMethod = "addImmOperands";
+  let ParserMethod = "ParseCRxOperand";
+  // Diagnostics are handled in all cases by ParseCRxOperand.
+}
+
+def CRx : Operand<i32> {
+  let ParserMatchClass = CRx_asmoperand;
+  let PrintMethod = "printCRxOperand";
+}
+
+
+// Finally, we can start defining the instructions.
+
+// HINT is straightforward, with a few aliases.
+def HINTi : A64I_system<0b0, (outs), (ins uimm7:$UImm7), "hint\t$UImm7",
+                        [], NoItinerary> {
+  bits<7> UImm7;
+  let CRm = UImm7{6-3};
+  let Op2 = UImm7{2-0};
+
+  let Op0 = 0b00;
+  let Op1 = 0b011;
+  let CRn = 0b0010;
+  let Rt = 0b11111;
+}
+
+def : InstAlias<"nop", (HINTi 0)>;
+def : InstAlias<"yield", (HINTi 1)>;
+def : InstAlias<"wfe", (HINTi 2)>;
+def : InstAlias<"wfi", (HINTi 3)>;
+def : InstAlias<"sev", (HINTi 4)>;
+def : InstAlias<"sevl", (HINTi 5)>;
+
+// Quite a few instructions then follow a similar pattern of fixing common
+// fields in the bitpattern, so we'll define a helper-class for them.
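+// For example, CLREX, DSB, DMB and ISB below differ only in their Op2 field
+// (and in the operand each accepts); everything else about their encodings is
+// fixed by this class.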
+class simple_sys<bits<2> op0, bits<3> op1, bits<4> crn, bits<3> op2,
+                 Operand operand, string asmop>
+  : A64I_system<0b0, (outs), (ins operand:$CRm), !strconcat(asmop, "\t$CRm"),
+                [], NoItinerary> {
+  let Op0 = op0;
+  let Op1 = op1;
+  let CRn = crn;
+  let Op2 = op2;
+  let Rt = 0b11111;
+}
+
+
+def CLREXi : simple_sys<0b00, 0b011, 0b0011, 0b010, uimm4, "clrex">;
+def DSBi : simple_sys<0b00, 0b011, 0b0011, 0b100, dbarrier_op, "dsb">;
+def DMBi : simple_sys<0b00, 0b011, 0b0011, 0b101, dbarrier_op, "dmb">;
+def ISBi : simple_sys<0b00, 0b011, 0b0011, 0b110, isb_op, "isb">;
+
+def : InstAlias<"clrex", (CLREXi 0b1111)>;
+def : InstAlias<"isb", (ISBi 0b1111)>;
+
+// (DMBi 0xb) is a "DMB ISH" instruction, appropriate for Linux SMP
+// configurations at least.
+def : Pat<(atomic_fence imm, imm), (DMBi 0xb)>;
+
+// Any SYS bitpattern can be represented with a complex and opaque "SYS"
+// instruction.
+def SYSiccix : A64I_system<0b0, (outs),
+                           (ins uimm3:$Op1, CRx:$CRn, CRx:$CRm,
+                                uimm3:$Op2, GPR64:$Rt),
+                           "sys\t$Op1, $CRn, $CRm, $Op2, $Rt",
+                           [], NoItinerary> {
+  let Op0 = 0b01;
+}
+
+// You can skip the Xt argument whether it makes sense or not for the generic
+// SYS instruction.
+def : InstAlias<"sys $Op1, $CRn, $CRm, $Op2",
+                (SYSiccix uimm3:$Op1, CRx:$CRn, CRx:$CRm, uimm3:$Op2, XZR)>;
+
+
+// But many have aliases (IC, DC, AT, TLBI), which obviously don't fit into the
+// generic SYS syntax above: they parse a named operand into the SysOp fields.
+class SYSalias<dag ins, string asmstring>
+  : A64I_system<0b0, (outs), ins, asmstring, [], NoItinerary> {
+  let isAsmParserOnly = 1;
+
+  bits<14> SysOp;
+  let Op0 = 0b01;
+  let Op1 = SysOp{13-11};
+  let CRn = SysOp{10-7};
+  let CRm = SysOp{6-3};
+  let Op2 = SysOp{2-0};
+}
+
+def ICix : SYSalias<(ins ic_op:$SysOp, GPR64:$Rt), "ic\t$SysOp, $Rt">;
+
+def ICi : SYSalias<(ins ic_op:$SysOp), "ic\t$SysOp"> {
+  let Rt = 0b11111;
+}
+
+def DCix : SYSalias<(ins dc_op:$SysOp, GPR64:$Rt), "dc\t$SysOp, $Rt">;
+def ATix : SYSalias<(ins at_op:$SysOp, GPR64:$Rt), "at\t$SysOp, $Rt">;
+
+def TLBIix : SYSalias<(ins tlbi_op:$SysOp, GPR64:$Rt), "tlbi\t$SysOp, $Rt">;
+
+def TLBIi : SYSalias<(ins tlbi_op:$SysOp), "tlbi\t$SysOp"> {
+  let Rt = 0b11111;
+}
+
+
+def SYSLxicci : A64I_system<0b1, (outs GPR64:$Rt),
+                            (ins uimm3:$Op1, CRx:$CRn, CRx:$CRm, uimm3:$Op2),
+                            "sysl\t$Rt, $Op1, $CRn, $CRm, $Op2",
+                            [], NoItinerary> {
+  let Op0 = 0b01;
+}
+
+// The instructions themselves are rather simple for MSR and MRS.
+def MSRix : A64I_system<0b0, (outs), (ins msr_op:$SysReg, GPR64:$Rt),
+                        "msr\t$SysReg, $Rt", [], NoItinerary> {
+  bits<16> SysReg;
+  let Op0 = SysReg{15-14};
+  let Op1 = SysReg{13-11};
+  let CRn = SysReg{10-7};
+  let CRm = SysReg{6-3};
+  let Op2 = SysReg{2-0};
+}
+
+def MRSxi : A64I_system<0b1, (outs GPR64:$Rt), (ins mrs_op:$SysReg),
+                        "mrs\t$Rt, $SysReg", [], NoItinerary> {
+  bits<16> SysReg;
+  let Op0 = SysReg{15-14};
+  let Op1 = SysReg{13-11};
+  let CRn = SysReg{10-7};
+  let CRm = SysReg{6-3};
+  let Op2 = SysReg{2-0};
+}
+
+def MSRii : A64I_system<0b0, (outs), (ins pstate_op:$PState, uimm4:$CRm),
+                        "msr\t$PState, $CRm", [], NoItinerary> {
+  bits<6> PState;
+
+  let Op0 = 0b00;
+  let Op1 = PState{5-3};
+  let CRn = 0b0100;
+  let Op2 = PState{2-0};
+  let Rt = 0b11111;
+}
+
+//===----------------------------------------------------------------------===//
+// Test & branch (immediate) instructions
+//===----------------------------------------------------------------------===//
+// Contains: TBZ, TBNZ
+
+// The bit to test is a simple unsigned 6-bit immediate in the X-register
+// versions.
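+//
+// As an illustrative sketch (not from the patch itself): "tbz x3, #6, lbl"
+// branches to lbl when bit 6 of x3 is clear, i.e. when (x3 & (1 << 6)) == 0,
+// which is exactly the (and GPR64:$Rt, tstb64_pat:$Imm) comparison against
+// zero matched by the patterns below.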
+def uimm6 : Operand<i64> { + let ParserMatchClass = uimm6_asmoperand; +} + +def label_wid14_scal4_asmoperand : label_asmoperand<14, 4>; + +def tbimm_target : Operand<OtherVT> { + let EncoderMethod = "getLabelOpValue<AArch64::fixup_a64_tstbr>"; + + // This label is a 14-bit offset from PC, scaled by the instruction-width: 4. + let PrintMethod = "printLabelOperand<14, 4>"; + let ParserMatchClass = label_wid14_scal4_asmoperand; + + let OperandType = "OPERAND_PCREL"; +} + +def A64eq : ImmLeaf<i32, [{ return Imm == A64CC::EQ; }]>; +def A64ne : ImmLeaf<i32, [{ return Imm == A64CC::NE; }]>; + +// These instructions correspond to patterns involving "and" with a power of +// two, which we need to be able to select. +def tstb64_pat : ComplexPattern<i64, 1, "SelectTSTBOperand<64>">; +def tstb32_pat : ComplexPattern<i32, 1, "SelectTSTBOperand<32>">; + +let isBranch = 1, isTerminator = 1 in { + def TBZxii : A64I_TBimm<0b0, (outs), + (ins GPR64:$Rt, uimm6:$Imm, tbimm_target:$Label), + "tbz\t$Rt, $Imm, $Label", + [(A64br_cc (A64cmp (and GPR64:$Rt, tstb64_pat:$Imm), 0), + A64eq, bb:$Label)], + NoItinerary>; + + def TBNZxii : A64I_TBimm<0b1, (outs), + (ins GPR64:$Rt, uimm6:$Imm, tbimm_target:$Label), + "tbnz\t$Rt, $Imm, $Label", + [(A64br_cc (A64cmp (and GPR64:$Rt, tstb64_pat:$Imm), 0), + A64ne, bb:$Label)], + NoItinerary>; + + + // Note, these instructions overlap with the above 64-bit patterns. This is + // intentional, "tbz x3, #1, somewhere" and "tbz w3, #1, somewhere" would both + // do the same thing and are both permitted assembly. They also both have + // sensible DAG patterns. + def TBZwii : A64I_TBimm<0b0, (outs), + (ins GPR32:$Rt, uimm5:$Imm, tbimm_target:$Label), + "tbz\t$Rt, $Imm, $Label", + [(A64br_cc (A64cmp (and GPR32:$Rt, tstb32_pat:$Imm), 0), + A64eq, bb:$Label)], + NoItinerary> { + let Imm{5} = 0b0; + } + + def TBNZwii : A64I_TBimm<0b1, (outs), + (ins GPR32:$Rt, uimm5:$Imm, tbimm_target:$Label), + "tbnz\t$Rt, $Imm, $Label", + [(A64br_cc (A64cmp (and GPR32:$Rt, tstb32_pat:$Imm), 0), + A64ne, bb:$Label)], + NoItinerary> { + let Imm{5} = 0b0; + } +} + +//===----------------------------------------------------------------------===// +// Unconditional branch (immediate) instructions +//===----------------------------------------------------------------------===// +// Contains: B, BL + +def label_wid26_scal4_asmoperand : label_asmoperand<26, 4>; + +def bimm_target : Operand<OtherVT> { + let EncoderMethod = "getLabelOpValue<AArch64::fixup_a64_uncondbr>"; + + // This label is a 26-bit offset from PC, scaled by the instruction-width: 4. + let PrintMethod = "printLabelOperand<26, 4>"; + let ParserMatchClass = label_wid26_scal4_asmoperand; + + let OperandType = "OPERAND_PCREL"; +} + +def blimm_target : Operand<i64> { + let EncoderMethod = "getLabelOpValue<AArch64::fixup_a64_call>"; + + // This label is a 26-bit offset from PC, scaled by the instruction-width: 4. 
+  let PrintMethod = "printLabelOperand<26, 4>";
+  let ParserMatchClass = label_wid26_scal4_asmoperand;
+
+  let OperandType = "OPERAND_PCREL";
+}
+
+class A64I_BimmImpl<bit op, string asmop, list<dag> patterns, Operand lbl_type>
+  : A64I_Bimm<op, (outs), (ins lbl_type:$Label),
+              !strconcat(asmop, "\t$Label"), patterns,
+              NoItinerary>;
+
+let isBranch = 1 in {
+  def Bimm : A64I_BimmImpl<0b0, "b", [(br bb:$Label)], bimm_target> {
+    let isTerminator = 1;
+    let isBarrier = 1;
+  }
+
+  def BLimm : A64I_BimmImpl<0b1, "bl",
+                            [(AArch64Call tglobaladdr:$Label)], blimm_target> {
+    let isCall = 1;
+    let Defs = [X30];
+  }
+}
+
+def : Pat<(AArch64Call texternalsym:$Label), (BLimm texternalsym:$Label)>;
+
+//===----------------------------------------------------------------------===//
+// Unconditional branch (register) instructions
+//===----------------------------------------------------------------------===//
+// Contains: BR, BLR, RET, ERET, DRPS.
+
+// Most of the notional opcode fields in the A64I_Breg format are fixed in A64
+// at the moment.
+class A64I_BregImpl<bits<4> opc,
+                    dag outs, dag ins, string asmstr, list<dag> patterns,
+                    InstrItinClass itin = NoItinerary>
+  : A64I_Breg<opc, 0b11111, 0b000000, 0b00000,
+              outs, ins, asmstr, patterns, itin> {
+  let isBranch = 1;
+  let isIndirectBranch = 1;
+}
+
+// Note that the helper class itself doesn't set isCall or isReturn: as far as
+// LLVM is concerned "ret" is just another jump unless it has been selected by
+// LLVM as the function's return, so those flags are set on the individual
+// definitions below.
+
+let isBranch = 1 in {
+  def BRx : A64I_BregImpl<0b0000, (outs), (ins GPR64:$Rn),
+                          "br\t$Rn", [(brind GPR64:$Rn)]> {
+    let isBarrier = 1;
+    let isTerminator = 1;
+  }
+
+  def BLRx : A64I_BregImpl<0b0001, (outs), (ins GPR64:$Rn),
+                           "blr\t$Rn", [(AArch64Call GPR64:$Rn)]> {
+    let isBarrier = 0;
+    let isCall = 1;
+    let Defs = [X30];
+  }
+
+  def RETx : A64I_BregImpl<0b0010, (outs), (ins GPR64:$Rn),
+                           "ret\t$Rn", []> {
+    let isBarrier = 1;
+    let isTerminator = 1;
+    let isReturn = 1;
+  }
+
+  // Create a separate pseudo-instruction for codegen to use so that we don't
+  // flag x30 as used in every function. It'll be restored before the RET by the
+  // epilogue if it's legitimately used.
+  def RET : A64PseudoExpand<(outs), (ins), [(A64ret)], (RETx (ops X30))> {
+    let isTerminator = 1;
+    let isBarrier = 1;
+    let isReturn = 1;
+  }
+
+  def ERET : A64I_BregImpl<0b0100, (outs), (ins), "eret", []> {
+    let Rn = 0b11111;
+    let isBarrier = 1;
+    let isTerminator = 1;
+    let isReturn = 1;
+  }
+
+  def DRPS : A64I_BregImpl<0b0101, (outs), (ins), "drps", []> {
+    let Rn = 0b11111;
+    let isBarrier = 1;
+  }
+}
+
+def RETAlias : InstAlias<"ret", (RETx X30)>;
+
+
+//===----------------------------------------------------------------------===//
+// Address generation patterns
+//===----------------------------------------------------------------------===//
+
+// Primary method of address generation for the small/absolute memory model is
+// an ADRP/ADD pair:
+//     ADRP x0, some_variable
+//     ADD x0, x0, #:lo12:some_variable
+//
+// The load/store elision of the ADD is accomplished when selecting
+// addressing-modes. This just mops up the cases where that doesn't work and we
+// really need an address in some register.
+
+// This wrapper applies a LO12 modifier to the address. Otherwise we could just
+// use the same address.
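+//
+// As a worked example (illustrative, not from the patch): if some_variable
+// ends up at address 0x12345678, the ADRP materialises the 4KB page base
+// 0x12345000 (a PC-relative, page-scaled offset) and the :lo12: ADD supplies
+// the remaining 0x678, since 0x12345678 = 0x12345000 + 0x678.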
+ +class ADRP_ADD<SDNode Wrapper, SDNode addrop> + : Pat<(Wrapper addrop:$Hi, addrop:$Lo12, (i32 imm)), + (ADDxxi_lsl0_s (ADRPxi addrop:$Hi), addrop:$Lo12)>; + +def : ADRP_ADD<A64WrapperSmall, tblockaddress>; +def : ADRP_ADD<A64WrapperSmall, texternalsym>; +def : ADRP_ADD<A64WrapperSmall, tglobaladdr>; +def : ADRP_ADD<A64WrapperSmall, tglobaltlsaddr>; +def : ADRP_ADD<A64WrapperSmall, tjumptable>; + +//===----------------------------------------------------------------------===// +// GOT access patterns +//===----------------------------------------------------------------------===// + +// FIXME: Wibble + +class GOTLoadSmall<SDNode addrfrag> + : Pat<(A64GOTLoad (A64WrapperSmall addrfrag:$Hi, addrfrag:$Lo12, 8)), + (LS64_LDR (ADRPxi addrfrag:$Hi), addrfrag:$Lo12)>; + +def : GOTLoadSmall<texternalsym>; +def : GOTLoadSmall<tglobaladdr>; +def : GOTLoadSmall<tglobaltlsaddr>; + +//===----------------------------------------------------------------------===// +// Tail call handling +//===----------------------------------------------------------------------===// + +let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [XSP] in { + def TC_RETURNdi + : PseudoInst<(outs), (ins i64imm:$dst, i32imm:$FPDiff), + [(AArch64tcret tglobaladdr:$dst, (i32 timm:$FPDiff))]>; + + def TC_RETURNxi + : PseudoInst<(outs), (ins tcGPR64:$dst, i32imm:$FPDiff), + [(AArch64tcret tcGPR64:$dst, (i32 timm:$FPDiff))]>; +} + +let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, + Uses = [XSP] in { + def TAIL_Bimm : A64PseudoExpand<(outs), (ins bimm_target:$Label), [], + (Bimm bimm_target:$Label)>; + + def TAIL_BRx : A64PseudoExpand<(outs), (ins tcGPR64:$Rd), [], + (BRx GPR64:$Rd)>; +} + + +def : Pat<(AArch64tcret texternalsym:$dst, (i32 timm:$FPDiff)), + (TC_RETURNdi texternalsym:$dst, imm:$FPDiff)>; + +//===----------------------------------------------------------------------===// +// Thread local storage +//===----------------------------------------------------------------------===// + +// This is a pseudo-instruction representing the ".tlsdesccall" directive in +// assembly. Its effect is to insert an R_AARCH64_TLSDESC_CALL relocation at the +// current location. It should always be immediately followed by a BLR +// instruction, and is intended solely for relaxation by the linker. + +def : Pat<(A64threadpointer), (MRSxi 0xde82)>; + +def TLSDESCCALL : PseudoInst<(outs), (ins i64imm:$Lbl), []> { + let hasSideEffects = 1; +} + +def TLSDESC_BLRx : PseudoInst<(outs), (ins GPR64:$Rn, i64imm:$Var), + [(A64tlsdesc_blr GPR64:$Rn, tglobaltlsaddr:$Var)]> { + let isCall = 1; + let Defs = [X30]; +} + +def : Pat<(A64tlsdesc_blr GPR64:$Rn, texternalsym:$Var), + (TLSDESC_BLRx GPR64:$Rn, texternalsym:$Var)>; + +//===----------------------------------------------------------------------===// +// Bitfield patterns +//===----------------------------------------------------------------------===// + +def bfi32_lsb_to_immr : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant((32 - N->getZExtValue()) % 32, MVT::i64); +}]>; + +def bfi64_lsb_to_immr : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant((64 - N->getZExtValue()) % 64, MVT::i64); +}]>; + +def bfi_width_to_imms : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(N->getZExtValue() - 1, MVT::i64); +}]>; + + +// The simpler patterns deal with cases where no AND mask is actually needed +// (either all bits are used or the low 32 bits are used). 
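+//
+// A worked example of the three transforms above (illustrative only): for
+// "bfi w0, w1, #8, #8" the DAG supplies LSB = 8 and Width = 8, so
+//   ImmR = (32 - 8) % 32 = 24
+//   ImmS = 8 - 1        = 7
+// which is exactly the ImmR/ImmS pair that BFIwwii encodes.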
+let AddedComplexity = 10 in { + +def : Pat<(A64Bfi GPR64:$src, GPR64:$Rn, imm:$ImmR, imm:$ImmS), + (BFIxxii GPR64:$src, GPR64:$Rn, + (bfi64_lsb_to_immr (i64 imm:$ImmR)), + (bfi_width_to_imms (i64 imm:$ImmS)))>; + +def : Pat<(A64Bfi GPR32:$src, GPR32:$Rn, imm:$ImmR, imm:$ImmS), + (BFIwwii GPR32:$src, GPR32:$Rn, + (bfi32_lsb_to_immr (i64 imm:$ImmR)), + (bfi_width_to_imms (i64 imm:$ImmS)))>; + + +def : Pat<(and (A64Bfi GPR64:$src, GPR64:$Rn, imm:$ImmR, imm:$ImmS), + (i64 4294967295)), + (SUBREG_TO_REG (i64 0), + (BFIwwii (EXTRACT_SUBREG GPR64:$src, sub_32), + (EXTRACT_SUBREG GPR64:$Rn, sub_32), + (bfi32_lsb_to_immr (i64 imm:$ImmR)), + (bfi_width_to_imms (i64 imm:$ImmS))), + sub_32)>; + +} + +//===----------------------------------------------------------------------===// +// Miscellaneous patterns +//===----------------------------------------------------------------------===// + +// Truncation from 64 to 32-bits just involves renaming your register. +def : Pat<(i32 (trunc (i64 GPR64:$val))), (EXTRACT_SUBREG GPR64:$val, sub_32)>; + +// Similarly, extension where we don't care about the high bits is +// just a rename. +def : Pat<(i64 (anyext (i32 GPR32:$val))), + (INSERT_SUBREG (IMPLICIT_DEF), GPR32:$val, sub_32)>; + +// SELECT instructions providing f128 types need to be handled by a +// pseudo-instruction since the eventual code will need to introduce basic +// blocks and control flow. +def F128CSEL : PseudoInst<(outs FPR128:$Rd), + (ins FPR128:$Rn, FPR128:$Rm, cond_code_op:$Cond), + [(set FPR128:$Rd, (simple_select (f128 FPR128:$Rn), + FPR128:$Rm))]> { + let Uses = [NZCV]; + let usesCustomInserter = 1; +} + +//===----------------------------------------------------------------------===// +// Load/store patterns +//===----------------------------------------------------------------------===// + +// There are lots of patterns here, because we need to allow at least three +// parameters to vary independently. +// 1. Instruction: "ldrb w9, [sp]", "ldrh w9, [sp]", ... +// 2. LLVM source: zextloadi8, anyextloadi8, ... +// 3. Address-generation: A64Wrapper, (add BASE, OFFSET), ... +// +// The biggest problem turns out to be the address-generation variable. At the +// point of instantiation we need to produce two DAGs, one for the pattern and +// one for the instruction. Doing this at the lowest level of classes doesn't +// work. +// +// Consider the simple uimm12 addressing mode, and the desire to match both (add +// GPR64xsp:$Rn, uimm12:$Offset) and GPR64xsp:$Rn, particularly on the +// instruction side. We'd need to insert either "GPR64xsp" and "uimm12" or +// "GPR64xsp" and "0" into an unknown dag. !subst is not capable of this +// operation, and PatFrags are for selection not output. +// +// As a result, the address-generation patterns are the final +// instantiations. However, we do still need to vary the operand for the address +// further down (At the point we're deciding A64WrapperSmall, we don't know +// the memory width of the operation). + +//===------------------------------ +// 1. Basic infrastructural defs +//===------------------------------ + +// First, some simple classes for !foreach and !subst to use: +class Decls { + dag pattern; +} + +def decls : Decls; +def ALIGN; +def INST; +def OFFSET; +def SHIFT; + +// You can't use !subst on an actual immediate, but you *can* use it on an +// operand record that happens to match a single immediate. So we do. 
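+//
+// For example (purely illustrative):
+//   !subst(SHIFT, imm_eq2, (shl node:$Rm, SHIFT))
+// rewrites the placeholder record SHIFT, yielding (shl node:$Rm, imm_eq2),
+// which is how the records defined below stand in for specific immediates.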
+def imm_eq0 : ImmLeaf<i64, [{ return Imm == 0; }]>; +def imm_eq1 : ImmLeaf<i64, [{ return Imm == 1; }]>; +def imm_eq2 : ImmLeaf<i64, [{ return Imm == 2; }]>; +def imm_eq3 : ImmLeaf<i64, [{ return Imm == 3; }]>; +def imm_eq4 : ImmLeaf<i64, [{ return Imm == 4; }]>; + +// If the low bits of a pointer are known to be 0 then an "or" is just as good +// as addition for computing an offset. This fragment forwards that check for +// TableGen's use. +def add_like_or : PatFrag<(ops node:$lhs, node:$rhs), (or node:$lhs, node:$rhs), +[{ + return CurDAG->isBaseWithConstantOffset(SDValue(N, 0)); +}]>; + +// Load/store (unsigned immediate) operations with relocations against global +// symbols (for lo12) are only valid if those symbols have correct alignment +// (since the immediate offset is divided by the access scale, it can't have a +// remainder). +// +// The guaranteed alignment is provided as part of the WrapperSmall +// operation, and checked against one of these. +def any_align : ImmLeaf<i32, [{ (void)Imm; return true; }]>; +def min_align2 : ImmLeaf<i32, [{ return Imm >= 2; }]>; +def min_align4 : ImmLeaf<i32, [{ return Imm >= 4; }]>; +def min_align8 : ImmLeaf<i32, [{ return Imm >= 8; }]>; +def min_align16 : ImmLeaf<i32, [{ return Imm >= 16; }]>; + +// "Normal" load/store instructions can be used on atomic operations, provided +// the ordering parameter is at most "monotonic". Anything above that needs +// special handling with acquire/release instructions. +class simple_load<PatFrag base> + : PatFrag<(ops node:$ptr), (base node:$ptr), [{ + return cast<AtomicSDNode>(N)->getOrdering() <= Monotonic; +}]>; + +def atomic_load_simple_i8 : simple_load<atomic_load_8>; +def atomic_load_simple_i16 : simple_load<atomic_load_16>; +def atomic_load_simple_i32 : simple_load<atomic_load_32>; +def atomic_load_simple_i64 : simple_load<atomic_load_64>; + +class simple_store<PatFrag base> + : PatFrag<(ops node:$ptr, node:$val), (base node:$ptr, node:$val), [{ + return cast<AtomicSDNode>(N)->getOrdering() <= Monotonic; +}]>; + +def atomic_store_simple_i8 : simple_store<atomic_store_8>; +def atomic_store_simple_i16 : simple_store<atomic_store_16>; +def atomic_store_simple_i32 : simple_store<atomic_store_32>; +def atomic_store_simple_i64 : simple_store<atomic_store_64>; + +//===------------------------------ +// 2. UImm12 and SImm9 +//===------------------------------ + +// These instructions have two operands providing the address so they can be +// treated similarly for most purposes. + +//===------------------------------ +// 2.1 Base patterns covering extend/truncate semantics +//===------------------------------ + +// Atomic patterns can be shared between integer operations of all sizes, a +// quick multiclass here allows reuse. +multiclass ls_atomic_pats<Instruction LOAD, Instruction STORE, dag Base, + dag Offset, dag address, RegisterClass TPR, + ValueType sty> { + def : Pat<(!cast<PatFrag>("atomic_load_simple_" # sty) address), + (LOAD Base, Offset)>; + + def : Pat<(!cast<PatFrag>("atomic_store_simple_" # sty) address, TPR:$Rt), + (STORE TPR:$Rt, Base, Offset)>; +} + +// Instructions accessing a memory chunk smaller than a register (or, in a +// pinch, the same size) have a characteristic set of patterns they want to +// match: extending loads and truncating stores. This class deals with the +// sign-neutral version of those patterns. +// +// It will be instantiated across multiple addressing-modes. 
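+//
+// A sketch of one eventual instantiation (uimm12 mode, i8; operand names here
+// are illustrative): the multiclass below produces, amongst others,
+//   (zextloadi8 (add GPR64xsp:$Rn, byte_uimm12:$Off)) -> (LS8_LDR $Rn, $Off)
+// plus a SUBREG_TO_REG variant that models the implicit zeroing of the high
+// 32 bits when the result is used as an i64.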
+multiclass ls_small_pats<Instruction LOAD, Instruction STORE, + dag Base, dag Offset, + dag address, ValueType sty> + : ls_atomic_pats<LOAD, STORE, Base, Offset, address, GPR32, sty> { + def : Pat<(!cast<SDNode>(zextload # sty) address), (LOAD Base, Offset)>; + + def : Pat<(!cast<SDNode>(extload # sty) address), (LOAD Base, Offset)>; + + // For zero-extension to 64-bits we have to tell LLVM that the whole 64-bit + // register was actually set. + def : Pat<(i64 (!cast<SDNode>(zextload # sty) address)), + (SUBREG_TO_REG (i64 0), (LOAD Base, Offset), sub_32)>; + + def : Pat<(i64 (!cast<SDNode>(extload # sty) address)), + (SUBREG_TO_REG (i64 0), (LOAD Base, Offset), sub_32)>; + + def : Pat<(!cast<SDNode>(truncstore # sty) GPR32:$Rt, address), + (STORE GPR32:$Rt, Base, Offset)>; + + // For truncating store from 64-bits, we have to manually tell LLVM to + // ignore the high bits of the x register. + def : Pat<(!cast<SDNode>(truncstore # sty) GPR64:$Rt, address), + (STORE (EXTRACT_SUBREG GPR64:$Rt, sub_32), Base, Offset)>; +} + +// Next come patterns for sign-extending loads. +multiclass load_signed_pats<string T, string U, dag Base, dag Offset, + dag address, ValueType sty> { + def : Pat<(i32 (!cast<SDNode>("sextload" # sty) address)), + (!cast<Instruction>("LDRS" # T # "w" # U) Base, Offset)>; + + def : Pat<(i64 (!cast<SDNode>("sextload" # sty) address)), + (!cast<Instruction>("LDRS" # T # "x" # U) Base, Offset)>; + +} + +// and finally "natural-width" loads and stores come next. +multiclass ls_neutral_pats<Instruction LOAD, Instruction STORE, dag Base, + dag Offset, dag address, RegisterClass TPR, + ValueType sty> { + def : Pat<(sty (load address)), (LOAD Base, Offset)>; + def : Pat<(store (sty TPR:$Rt), address), (STORE TPR:$Rt, Base, Offset)>; +} + +// Integer operations also get atomic instructions to select for. +multiclass ls_int_neutral_pats<Instruction LOAD, Instruction STORE, dag Base, + dag Offset, dag address, RegisterClass TPR, + ValueType sty> + : ls_neutral_pats<LOAD, STORE, Base, Offset, address, TPR, sty>, + ls_atomic_pats<LOAD, STORE, Base, Offset, address, TPR, sty>; + +//===------------------------------ +// 2.2. 
Addressing-mode instantiations +//===------------------------------ + +multiclass uimm12_pats<dag address, dag Base, dag Offset> { + defm : ls_small_pats<LS8_LDR, LS8_STR, Base, + !foreach(decls.pattern, Offset, + !subst(OFFSET, byte_uimm12, decls.pattern)), + !foreach(decls.pattern, address, + !subst(OFFSET, byte_uimm12, + !subst(ALIGN, any_align, decls.pattern))), + i8>; + defm : ls_small_pats<LS16_LDR, LS16_STR, Base, + !foreach(decls.pattern, Offset, + !subst(OFFSET, hword_uimm12, decls.pattern)), + !foreach(decls.pattern, address, + !subst(OFFSET, hword_uimm12, + !subst(ALIGN, min_align2, decls.pattern))), + i16>; + defm : ls_small_pats<LS32_LDR, LS32_STR, Base, + !foreach(decls.pattern, Offset, + !subst(OFFSET, word_uimm12, decls.pattern)), + !foreach(decls.pattern, address, + !subst(OFFSET, word_uimm12, + !subst(ALIGN, min_align4, decls.pattern))), + i32>; + + defm : ls_int_neutral_pats<LS32_LDR, LS32_STR, Base, + !foreach(decls.pattern, Offset, + !subst(OFFSET, word_uimm12, decls.pattern)), + !foreach(decls.pattern, address, + !subst(OFFSET, word_uimm12, + !subst(ALIGN, min_align4, decls.pattern))), + GPR32, i32>; + + defm : ls_int_neutral_pats<LS64_LDR, LS64_STR, Base, + !foreach(decls.pattern, Offset, + !subst(OFFSET, dword_uimm12, decls.pattern)), + !foreach(decls.pattern, address, + !subst(OFFSET, dword_uimm12, + !subst(ALIGN, min_align8, decls.pattern))), + GPR64, i64>; + + defm : ls_neutral_pats<LSFP16_LDR, LSFP16_STR, Base, + !foreach(decls.pattern, Offset, + !subst(OFFSET, hword_uimm12, decls.pattern)), + !foreach(decls.pattern, address, + !subst(OFFSET, hword_uimm12, + !subst(ALIGN, min_align2, decls.pattern))), + FPR16, f16>; + + defm : ls_neutral_pats<LSFP32_LDR, LSFP32_STR, Base, + !foreach(decls.pattern, Offset, + !subst(OFFSET, word_uimm12, decls.pattern)), + !foreach(decls.pattern, address, + !subst(OFFSET, word_uimm12, + !subst(ALIGN, min_align4, decls.pattern))), + FPR32, f32>; + + defm : ls_neutral_pats<LSFP64_LDR, LSFP64_STR, Base, + !foreach(decls.pattern, Offset, + !subst(OFFSET, dword_uimm12, decls.pattern)), + !foreach(decls.pattern, address, + !subst(OFFSET, dword_uimm12, + !subst(ALIGN, min_align8, decls.pattern))), + FPR64, f64>; + + defm : ls_neutral_pats<LSFP128_LDR, LSFP128_STR, Base, + !foreach(decls.pattern, Offset, + !subst(OFFSET, qword_uimm12, decls.pattern)), + !foreach(decls.pattern, address, + !subst(OFFSET, qword_uimm12, + !subst(ALIGN, min_align16, decls.pattern))), + FPR128, f128>; + + defm : load_signed_pats<"B", "", Base, + !foreach(decls.pattern, Offset, + !subst(OFFSET, byte_uimm12, decls.pattern)), + !foreach(decls.pattern, address, + !subst(OFFSET, byte_uimm12, + !subst(ALIGN, any_align, decls.pattern))), + i8>; + + defm : load_signed_pats<"H", "", Base, + !foreach(decls.pattern, Offset, + !subst(OFFSET, hword_uimm12, decls.pattern)), + !foreach(decls.pattern, address, + !subst(OFFSET, hword_uimm12, + !subst(ALIGN, min_align2, decls.pattern))), + i16>; + + def : Pat<(sextloadi32 !foreach(decls.pattern, address, + !subst(OFFSET, word_uimm12, + !subst(ALIGN, min_align4, decls.pattern)))), + (LDRSWx Base, !foreach(decls.pattern, Offset, + !subst(OFFSET, word_uimm12, decls.pattern)))>; +} + +// Straightforward patterns of last resort: a pointer with or without an +// appropriate offset. 
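+//
+// Concretely (an illustrative example): an i32 load of (add x1, #16) can use
+// "ldr w0, [x1, #16]", and the uimm12 field stores 16 / 4 = 4 because word
+// accesses scale the immediate by their access size.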
+defm : uimm12_pats<(i64 GPR64xsp:$Rn), (i64 GPR64xsp:$Rn), (i64 0)>;
+defm : uimm12_pats<(add GPR64xsp:$Rn, OFFSET:$UImm12),
+                   (i64 GPR64xsp:$Rn), (i64 OFFSET:$UImm12)>;
+
+// The offset could be hidden behind an "or", of course:
+defm : uimm12_pats<(add_like_or GPR64xsp:$Rn, OFFSET:$UImm12),
+                   (i64 GPR64xsp:$Rn), (i64 OFFSET:$UImm12)>;
+
+// Global addresses under the small-absolute model should use these
+// instructions. There are ELF relocations specifically for this purpose.
+defm : uimm12_pats<(A64WrapperSmall tglobaladdr:$Hi, tglobaladdr:$Lo12, ALIGN),
+                   (ADRPxi tglobaladdr:$Hi), (i64 tglobaladdr:$Lo12)>;
+
+defm : uimm12_pats<(A64WrapperSmall tglobaltlsaddr:$Hi, tglobaltlsaddr:$Lo12,
+                                    ALIGN),
+                   (ADRPxi tglobaltlsaddr:$Hi), (i64 tglobaltlsaddr:$Lo12)>;
+
+// External symbols that make it this far should also get standard relocations.
+defm : uimm12_pats<(A64WrapperSmall texternalsym:$Hi, texternalsym:$Lo12,
+                                    ALIGN),
+                   (ADRPxi texternalsym:$Hi), (i64 texternalsym:$Lo12)>;
+
+defm : uimm12_pats<(A64WrapperSmall tconstpool:$Hi, tconstpool:$Lo12, ALIGN),
+                   (ADRPxi tconstpool:$Hi), (i64 tconstpool:$Lo12)>;
+
+// We also want to use uimm12 instructions for local variables at the moment.
+def tframeindex_XFORM : SDNodeXForm<frameindex, [{
+  int FI = cast<FrameIndexSDNode>(N)->getIndex();
+  return CurDAG->getTargetFrameIndex(FI, MVT::i64);
+}]>;
+
+defm : uimm12_pats<(i64 frameindex:$Rn),
+                   (tframeindex_XFORM tframeindex:$Rn), (i64 0)>;
+
+// These can be much simpler than uimm12 because we don't have to change the
+// operand type (e.g. LDURB and LDURH take the same operands).
+multiclass simm9_pats<dag address, dag Base, dag Offset> {
+  defm : ls_small_pats<LS8_LDUR, LS8_STUR, Base, Offset, address, i8>;
+  defm : ls_small_pats<LS16_LDUR, LS16_STUR, Base, Offset, address, i16>;
+
+  defm : ls_int_neutral_pats<LS32_LDUR, LS32_STUR, Base, Offset, address,
+                             GPR32, i32>;
+  defm : ls_int_neutral_pats<LS64_LDUR, LS64_STUR, Base, Offset, address,
+                             GPR64, i64>;
+
+  defm : ls_neutral_pats<LSFP16_LDUR, LSFP16_STUR, Base, Offset, address,
+                         FPR16, f16>;
+  defm : ls_neutral_pats<LSFP32_LDUR, LSFP32_STUR, Base, Offset, address,
+                         FPR32, f32>;
+  defm : ls_neutral_pats<LSFP64_LDUR, LSFP64_STUR, Base, Offset, address,
+                         FPR64, f64>;
+  defm : ls_neutral_pats<LSFP128_LDUR, LSFP128_STUR, Base, Offset, address,
+                         FPR128, f128>;
+
+  def : Pat<(i64 (zextloadi32 address)),
+            (SUBREG_TO_REG (i64 0), (LS32_LDUR Base, Offset), sub_32)>;
+
+  def : Pat<(truncstorei32 GPR64:$Rt, address),
+            (LS32_STUR (EXTRACT_SUBREG GPR64:$Rt, sub_32), Base, Offset)>;
+
+  defm : load_signed_pats<"B", "_U", Base, Offset, address, i8>;
+  defm : load_signed_pats<"H", "_U", Base, Offset, address, i16>;
+  def : Pat<(sextloadi32 address), (LDURSWx Base, Offset)>;
+}
+
+defm : simm9_pats<(add GPR64xsp:$Rn, simm9:$SImm9),
+                  (i64 GPR64xsp:$Rn), (SDXF_simm9 simm9:$SImm9)>;
+
+defm : simm9_pats<(add_like_or GPR64xsp:$Rn, simm9:$SImm9),
+                  (i64 GPR64xsp:$Rn), (SDXF_simm9 simm9:$SImm9)>;
+
+
+//===------------------------------
+// 3. Register offset patterns
+//===------------------------------
+
+// Atomic patterns can be shared between integer operations of all sizes, so a
+// quick multiclass here allows reuse.
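+//
+// As with the earlier block (an illustrative summary, not extra patterns): a
+// monotonic-or-weaker atomic load simply selects a plain load through these
+// patterns, while acquire/release orderings fail the "<= Monotonic" predicate
+// and are left for the load-acquire/store-release instructions.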
+multiclass ro_atomic_pats<Instruction LOAD, Instruction STORE, dag Base,
+                          dag Offset, dag Extend, dag address,
+                          RegisterClass TPR, ValueType sty> {
+  def : Pat<(!cast<PatFrag>("atomic_load_simple_" # sty) address),
+            (LOAD Base, Offset, Extend)>;
+
+  def : Pat<(!cast<PatFrag>("atomic_store_simple_" # sty) address, TPR:$Rt),
+            (STORE TPR:$Rt, Base, Offset, Extend)>;
+}
+
+// The register offset instructions take three operands giving the address,
+// and have an annoying split between instructions where Rm is 32-bit and
+// 64-bit. So we need a special hierarchy to describe them. Other than that the
+// same operations should be supported as for simm9 and uimm12 addressing.
+
+multiclass ro_small_pats<Instruction LOAD, Instruction STORE,
+                         dag Base, dag Offset, dag Extend,
+                         dag address, ValueType sty>
+  : ro_atomic_pats<LOAD, STORE, Base, Offset, Extend, address, GPR32, sty> {
+  def : Pat<(!cast<SDNode>(zextload # sty) address),
+            (LOAD Base, Offset, Extend)>;
+
+  def : Pat<(!cast<SDNode>(extload # sty) address),
+            (LOAD Base, Offset, Extend)>;
+
+  // For zero-extension to 64-bits we have to tell LLVM that the whole 64-bit
+  // register was actually set.
+  def : Pat<(i64 (!cast<SDNode>(zextload # sty) address)),
+            (SUBREG_TO_REG (i64 0), (LOAD Base, Offset, Extend), sub_32)>;
+
+  def : Pat<(i64 (!cast<SDNode>(extload # sty) address)),
+            (SUBREG_TO_REG (i64 0), (LOAD Base, Offset, Extend), sub_32)>;
+
+  def : Pat<(!cast<SDNode>(truncstore # sty) GPR32:$Rt, address),
+            (STORE GPR32:$Rt, Base, Offset, Extend)>;
+
+  // For truncating store from 64-bits, we have to manually tell LLVM to
+  // ignore the high bits of the x register.
+  def : Pat<(!cast<SDNode>(truncstore # sty) GPR64:$Rt, address),
+            (STORE (EXTRACT_SUBREG GPR64:$Rt, sub_32), Base, Offset, Extend)>;
+}
+
+// Next come patterns for sign-extending loads.
+multiclass ro_signed_pats<string T, string Rm, dag Base, dag Offset, dag Extend,
+                          dag address, ValueType sty> {
+  def : Pat<(i32 (!cast<SDNode>("sextload" # sty) address)),
+            (!cast<Instruction>("LDRS" # T # "w_" # Rm # "_RegOffset")
+              Base, Offset, Extend)>;
+
+  def : Pat<(i64 (!cast<SDNode>("sextload" # sty) address)),
+            (!cast<Instruction>("LDRS" # T # "x_" # Rm # "_RegOffset")
+              Base, Offset, Extend)>;
+}
+
+// and finally "natural-width" loads and stores come next.
+multiclass ro_neutral_pats<Instruction LOAD, Instruction STORE, + dag Base, dag Offset, dag Extend, dag address, + RegisterClass TPR, ValueType sty> { + def : Pat<(sty (load address)), (LOAD Base, Offset, Extend)>; + def : Pat<(store (sty TPR:$Rt), address), + (STORE TPR:$Rt, Base, Offset, Extend)>; +} + +multiclass ro_int_neutral_pats<Instruction LOAD, Instruction STORE, + dag Base, dag Offset, dag Extend, dag address, + RegisterClass TPR, ValueType sty> + : ro_neutral_pats<LOAD, STORE, Base, Offset, Extend, address, TPR, sty>, + ro_atomic_pats<LOAD, STORE, Base, Offset, Extend, address, TPR, sty>; + +multiclass regoff_pats<string Rm, dag address, dag Base, dag Offset, + dag Extend> { + defm : ro_small_pats<!cast<Instruction>("LS8_" # Rm # "_RegOffset_LDR"), + !cast<Instruction>("LS8_" # Rm # "_RegOffset_STR"), + Base, Offset, Extend, + !foreach(decls.pattern, address, + !subst(SHIFT, imm_eq0, decls.pattern)), + i8>; + defm : ro_small_pats<!cast<Instruction>("LS16_" # Rm # "_RegOffset_LDR"), + !cast<Instruction>("LS16_" # Rm # "_RegOffset_STR"), + Base, Offset, Extend, + !foreach(decls.pattern, address, + !subst(SHIFT, imm_eq1, decls.pattern)), + i16>; + defm : ro_small_pats<!cast<Instruction>("LS32_" # Rm # "_RegOffset_LDR"), + !cast<Instruction>("LS32_" # Rm # "_RegOffset_STR"), + Base, Offset, Extend, + !foreach(decls.pattern, address, + !subst(SHIFT, imm_eq2, decls.pattern)), + i32>; + + defm : ro_int_neutral_pats< + !cast<Instruction>("LS32_" # Rm # "_RegOffset_LDR"), + !cast<Instruction>("LS32_" # Rm # "_RegOffset_STR"), + Base, Offset, Extend, + !foreach(decls.pattern, address, + !subst(SHIFT, imm_eq2, decls.pattern)), + GPR32, i32>; + + defm : ro_int_neutral_pats< + !cast<Instruction>("LS64_" # Rm # "_RegOffset_LDR"), + !cast<Instruction>("LS64_" # Rm # "_RegOffset_STR"), + Base, Offset, Extend, + !foreach(decls.pattern, address, + !subst(SHIFT, imm_eq3, decls.pattern)), + GPR64, i64>; + + defm : ro_neutral_pats<!cast<Instruction>("LSFP16_" # Rm # "_RegOffset_LDR"), + !cast<Instruction>("LSFP16_" # Rm # "_RegOffset_STR"), + Base, Offset, Extend, + !foreach(decls.pattern, address, + !subst(SHIFT, imm_eq1, decls.pattern)), + FPR16, f16>; + + defm : ro_neutral_pats<!cast<Instruction>("LSFP32_" # Rm # "_RegOffset_LDR"), + !cast<Instruction>("LSFP32_" # Rm # "_RegOffset_STR"), + Base, Offset, Extend, + !foreach(decls.pattern, address, + !subst(SHIFT, imm_eq2, decls.pattern)), + FPR32, f32>; + + defm : ro_neutral_pats<!cast<Instruction>("LSFP64_" # Rm # "_RegOffset_LDR"), + !cast<Instruction>("LSFP64_" # Rm # "_RegOffset_STR"), + Base, Offset, Extend, + !foreach(decls.pattern, address, + !subst(SHIFT, imm_eq3, decls.pattern)), + FPR64, f64>; + + defm : ro_neutral_pats<!cast<Instruction>("LSFP128_" # Rm # "_RegOffset_LDR"), + !cast<Instruction>("LSFP128_" # Rm # "_RegOffset_STR"), + Base, Offset, Extend, + !foreach(decls.pattern, address, + !subst(SHIFT, imm_eq4, decls.pattern)), + FPR128, f128>; + + defm : ro_signed_pats<"B", Rm, Base, Offset, Extend, + !foreach(decls.pattern, address, + !subst(SHIFT, imm_eq0, decls.pattern)), + i8>; + + defm : ro_signed_pats<"H", Rm, Base, Offset, Extend, + !foreach(decls.pattern, address, + !subst(SHIFT, imm_eq1, decls.pattern)), + i16>; + + def : Pat<(sextloadi32 !foreach(decls.pattern, address, + !subst(SHIFT, imm_eq2, decls.pattern))), + (!cast<Instruction>("LDRSWx_" # Rm # "_RegOffset") + Base, Offset, Extend)>; +} + + +// Finally we're in a position to tell LLVM exactly what addresses are reachable +// using register-offset instructions. 
Essentially a base plus a possibly +// extended, possibly shifted (by access size) offset. + +defm : regoff_pats<"Wm", (add GPR64xsp:$Rn, (sext GPR32:$Rm)), + (i64 GPR64xsp:$Rn), (i32 GPR32:$Rm), (i64 6)>; + +defm : regoff_pats<"Wm", (add GPR64xsp:$Rn, (shl (sext GPR32:$Rm), SHIFT)), + (i64 GPR64xsp:$Rn), (i32 GPR32:$Rm), (i64 7)>; + +defm : regoff_pats<"Wm", (add GPR64xsp:$Rn, (zext GPR32:$Rm)), + (i64 GPR64xsp:$Rn), (i32 GPR32:$Rm), (i64 2)>; + +defm : regoff_pats<"Wm", (add GPR64xsp:$Rn, (shl (zext GPR32:$Rm), SHIFT)), + (i64 GPR64xsp:$Rn), (i32 GPR32:$Rm), (i64 3)>; + +defm : regoff_pats<"Xm", (add GPR64xsp:$Rn, GPR64:$Rm), + (i64 GPR64xsp:$Rn), (i64 GPR64:$Rm), (i64 2)>; + +defm : regoff_pats<"Xm", (add GPR64xsp:$Rn, (shl GPR64:$Rm, SHIFT)), + (i64 GPR64xsp:$Rn), (i64 GPR64:$Rm), (i64 3)>; diff --git a/lib/Target/AArch64/AArch64MCInstLower.cpp b/lib/Target/AArch64/AArch64MCInstLower.cpp new file mode 100644 index 0000000..c96bf85 --- /dev/null +++ b/lib/Target/AArch64/AArch64MCInstLower.cpp @@ -0,0 +1,140 @@ +//===-- AArch64MCInstLower.cpp - Convert AArch64 MachineInstr to an MCInst -==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains code to lower AArch64 MachineInstrs to their corresponding +// MCInst records. +// +//===----------------------------------------------------------------------===// + +#include "AArch64AsmPrinter.h" +#include "AArch64TargetMachine.h" +#include "MCTargetDesc/AArch64MCExpr.h" +#include "Utils/AArch64BaseInfo.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/CodeGen/AsmPrinter.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/Target/Mangler.h" + +using namespace llvm; + +MCOperand +AArch64AsmPrinter::lowerSymbolOperand(const MachineOperand &MO, + const MCSymbol *Sym) const { + const MCExpr *Expr = 0; + + Expr = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_None, OutContext); + + switch (MO.getTargetFlags()) { + case AArch64II::MO_GOT: + Expr = AArch64MCExpr::CreateGOT(Expr, OutContext); + break; + case AArch64II::MO_GOT_LO12: + Expr = AArch64MCExpr::CreateGOTLo12(Expr, OutContext); + break; + case AArch64II::MO_LO12: + Expr = AArch64MCExpr::CreateLo12(Expr, OutContext); + break; + case AArch64II::MO_DTPREL_G1: + Expr = AArch64MCExpr::CreateDTPREL_G1(Expr, OutContext); + break; + case AArch64II::MO_DTPREL_G0_NC: + Expr = AArch64MCExpr::CreateDTPREL_G0_NC(Expr, OutContext); + break; + case AArch64II::MO_GOTTPREL: + Expr = AArch64MCExpr::CreateGOTTPREL(Expr, OutContext); + break; + case AArch64II::MO_GOTTPREL_LO12: + Expr = AArch64MCExpr::CreateGOTTPRELLo12(Expr, OutContext); + break; + case AArch64II::MO_TLSDESC: + Expr = AArch64MCExpr::CreateTLSDesc(Expr, OutContext); + break; + case AArch64II::MO_TLSDESC_LO12: + Expr = AArch64MCExpr::CreateTLSDescLo12(Expr, OutContext); + break; + case AArch64II::MO_TPREL_G1: + Expr = AArch64MCExpr::CreateTPREL_G1(Expr, OutContext); + break; + case AArch64II::MO_TPREL_G0_NC: + Expr = AArch64MCExpr::CreateTPREL_G0_NC(Expr, OutContext); + break; + case AArch64II::MO_NO_FLAG: + // Expr is already correct + break; + default: + llvm_unreachable("Unexpected MachineOperand flag"); + } + + if (!MO.isJTI() && MO.getOffset()) + Expr = MCBinaryExpr::CreateAdd(Expr, + 
MCConstantExpr::Create(MO.getOffset(),
+                                                          OutContext),
+                                   OutContext);
+
+  return MCOperand::CreateExpr(Expr);
+}
+
+bool AArch64AsmPrinter::lowerOperand(const MachineOperand &MO,
+                                     MCOperand &MCOp) const {
+  switch (MO.getType()) {
+  default: llvm_unreachable("unknown operand type");
+  case MachineOperand::MO_Register:
+    if (MO.isImplicit())
+      return false;
+    assert(!MO.getSubReg() && "Subregs should be eliminated!");
+    MCOp = MCOperand::CreateReg(MO.getReg());
+    break;
+  case MachineOperand::MO_Immediate:
+    MCOp = MCOperand::CreateImm(MO.getImm());
+    break;
+  case MachineOperand::MO_BlockAddress:
+    MCOp = lowerSymbolOperand(MO, GetBlockAddressSymbol(MO.getBlockAddress()));
+    break;
+  case MachineOperand::MO_ExternalSymbol:
+    MCOp = lowerSymbolOperand(MO, GetExternalSymbolSymbol(MO.getSymbolName()));
+    break;
+  case MachineOperand::MO_GlobalAddress:
+    MCOp = lowerSymbolOperand(MO, Mang->getSymbol(MO.getGlobal()));
+    break;
+  case MachineOperand::MO_MachineBasicBlock:
+    MCOp = MCOperand::CreateExpr(MCSymbolRefExpr::Create(
+                                   MO.getMBB()->getSymbol(), OutContext));
+    break;
+  case MachineOperand::MO_JumpTableIndex:
+    MCOp = lowerSymbolOperand(MO, GetJTISymbol(MO.getIndex()));
+    break;
+  case MachineOperand::MO_ConstantPoolIndex:
+    MCOp = lowerSymbolOperand(MO, GetCPISymbol(MO.getIndex()));
+    break;
+  case MachineOperand::MO_RegisterMask:
+    // Ignore call clobbers
+    return false;
+  }
+
+  return true;
+}
+
+void llvm::LowerAArch64MachineInstrToMCInst(const MachineInstr *MI,
+                                            MCInst &OutMI,
+                                            AArch64AsmPrinter &AP) {
+  OutMI.setOpcode(MI->getOpcode());
+
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+
+    MCOperand MCOp;
+    if (AP.lowerOperand(MO, MCOp))
+      OutMI.addOperand(MCOp);
+  }
+}
diff --git a/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp b/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
new file mode 100644
index 0000000..f45d8f7
--- /dev/null
+++ b/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
@@ -0,0 +1,18 @@
+//===-- AArch64MachineFunctionInfo.cpp - AArch64 machine function info ----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file just contains the anchor for the AArch64MachineFunctionInfo to
+// force vtable emission.
+//
+//===----------------------------------------------------------------------===//
+#include "AArch64MachineFunctionInfo.h"
+
+using namespace llvm;
+
+void AArch64MachineFunctionInfo::anchor() { }
diff --git a/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/lib/Target/AArch64/AArch64MachineFunctionInfo.h
new file mode 100644
index 0000000..33da54f
--- /dev/null
+++ b/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -0,0 +1,149 @@
+//=- AArch64MachineFunctionInfo.h - AArch64 machine function info -*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares AArch64-specific per-machine-function information.
+// +//===----------------------------------------------------------------------===// + +#ifndef AARCH64MACHINEFUNCTIONINFO_H +#define AARCH64MACHINEFUNCTIONINFO_H + +#include "llvm/CodeGen/MachineFunction.h" + +namespace llvm { + +/// This class is derived from MachineFunctionInfo and contains private AArch64 +/// target-specific information for each MachineFunction. +class AArch64MachineFunctionInfo : public MachineFunctionInfo { + virtual void anchor(); + + /// Number of bytes of arguments this function has on the stack. If the callee + /// is expected to restore the argument stack this should be a multiple of 16, + /// all usable during a tail call. + /// + /// The alternative would forbid tail call optimisation in some cases: if we + /// want to transfer control from a function with 8-bytes of stack-argument + /// space to a function with 16-bytes then misalignment of this value would + /// make a stack adjustment necessary, which could not be undone by the + /// callee. + unsigned BytesInStackArgArea; + + /// The number of bytes to restore to deallocate space for incoming + /// arguments. Canonically 0 in the C calling convention, but non-zero when + /// callee is expected to pop the args. + unsigned ArgumentStackToRestore; + + /// If the stack needs to be adjusted on frame entry in two stages, this + /// records the size of the first adjustment just prior to storing + /// callee-saved registers. The callee-saved slots are addressed assuming + /// SP == <incoming-SP> - InitialStackAdjust. + unsigned InitialStackAdjust; + + /// Number of local-dynamic TLS accesses. + unsigned NumLocalDynamics; + + /// @see AArch64 Procedure Call Standard, B.3 + /// + /// The Frame index of the area where LowerFormalArguments puts the + /// general-purpose registers that might contain variadic parameters. + int VariadicGPRIdx; + + /// @see AArch64 Procedure Call Standard, B.3 + /// + /// The size of the frame object used to store the general-purpose registers + /// which might contain variadic arguments. This is the offset from + /// VariadicGPRIdx to what's stored in __gr_top. + unsigned VariadicGPRSize; + + /// @see AArch64 Procedure Call Standard, B.3 + /// + /// The Frame index of the area where LowerFormalArguments puts the + /// floating-point registers that might contain variadic parameters. + int VariadicFPRIdx; + + /// @see AArch64 Procedure Call Standard, B.3 + /// + /// The size of the frame object used to store the floating-point registers + /// which might contain variadic arguments. This is the offset from + /// VariadicFPRIdx to what's stored in __vr_top. + unsigned VariadicFPRSize; + + /// @see AArch64 Procedure Call Standard, B.3 + /// + /// The Frame index of an object pointing just past the last known stacked + /// argument on entry to a variadic function. This goes into the __stack field + /// of the va_list type. + int VariadicStackIdx; + + /// The offset of the frame pointer from the stack pointer on function + /// entry. This is expected to be negative. 
+ int FramePointerOffset; + +public: + AArch64MachineFunctionInfo() + : BytesInStackArgArea(0), + ArgumentStackToRestore(0), + InitialStackAdjust(0), + NumLocalDynamics(0), + VariadicGPRIdx(0), + VariadicGPRSize(0), + VariadicFPRIdx(0), + VariadicFPRSize(0), + VariadicStackIdx(0), + FramePointerOffset(0) {} + + explicit AArch64MachineFunctionInfo(MachineFunction &MF) + : BytesInStackArgArea(0), + ArgumentStackToRestore(0), + InitialStackAdjust(0), + NumLocalDynamics(0), + VariadicGPRIdx(0), + VariadicGPRSize(0), + VariadicFPRIdx(0), + VariadicFPRSize(0), + VariadicStackIdx(0), + FramePointerOffset(0) {} + + unsigned getBytesInStackArgArea() const { return BytesInStackArgArea; } + void setBytesInStackArgArea (unsigned bytes) { BytesInStackArgArea = bytes;} + + unsigned getArgumentStackToRestore() const { return ArgumentStackToRestore; } + void setArgumentStackToRestore(unsigned bytes) { + ArgumentStackToRestore = bytes; + } + + unsigned getInitialStackAdjust() const { return InitialStackAdjust; } + void setInitialStackAdjust(unsigned bytes) { InitialStackAdjust = bytes; } + + unsigned getNumLocalDynamicTLSAccesses() const { return NumLocalDynamics; } + void incNumLocalDynamicTLSAccesses() { ++NumLocalDynamics; } + + int getVariadicGPRIdx() const { return VariadicGPRIdx; } + void setVariadicGPRIdx(int Idx) { VariadicGPRIdx = Idx; } + + unsigned getVariadicGPRSize() const { return VariadicGPRSize; } + void setVariadicGPRSize(unsigned Size) { VariadicGPRSize = Size; } + + int getVariadicFPRIdx() const { return VariadicFPRIdx; } + void setVariadicFPRIdx(int Idx) { VariadicFPRIdx = Idx; } + + unsigned getVariadicFPRSize() const { return VariadicFPRSize; } + void setVariadicFPRSize(unsigned Size) { VariadicFPRSize = Size; } + + int getVariadicStackIdx() const { return VariadicStackIdx; } + void setVariadicStackIdx(int Idx) { VariadicStackIdx = Idx; } + + int getFramePointerOffset() const { return FramePointerOffset; } + void setFramePointerOffset(int Idx) { FramePointerOffset = Idx; } + +}; + +} // End llvm namespace + +#endif diff --git a/lib/Target/AArch64/AArch64RegisterInfo.cpp b/lib/Target/AArch64/AArch64RegisterInfo.cpp new file mode 100644 index 0000000..20b0dcf --- /dev/null +++ b/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -0,0 +1,171 @@ +//===- AArch64RegisterInfo.cpp - AArch64 Register Information -------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the AArch64 implementation of the TargetRegisterInfo +// class. 
+// +//===----------------------------------------------------------------------===// + + +#include "AArch64RegisterInfo.h" +#include "AArch64FrameLowering.h" +#include "AArch64MachineFunctionInfo.h" +#include "AArch64TargetMachine.h" +#include "MCTargetDesc/AArch64MCTargetDesc.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RegisterScavenging.h" +#include "llvm/ADT/BitVector.h" + +#define GET_REGINFO_TARGET_DESC +#include "AArch64GenRegisterInfo.inc" + +using namespace llvm; + +AArch64RegisterInfo::AArch64RegisterInfo(const AArch64InstrInfo &tii, + const AArch64Subtarget &sti) + : AArch64GenRegisterInfo(AArch64::X30), TII(tii) { +} + +const uint16_t * +AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { + return CSR_PCS_SaveList; +} + +const uint32_t* +AArch64RegisterInfo::getCallPreservedMask(CallingConv::ID) const { + return CSR_PCS_RegMask; +} + +const uint32_t *AArch64RegisterInfo::getTLSDescCallPreservedMask() const { + return TLSDesc_RegMask; +} + +const TargetRegisterClass * +AArch64RegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const { + if (RC == &AArch64::FlagClassRegClass) + return &AArch64::GPR64RegClass; + + return RC; +} + + + +BitVector +AArch64RegisterInfo::getReservedRegs(const MachineFunction &MF) const { + BitVector Reserved(getNumRegs()); + const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); + + Reserved.set(AArch64::XSP); + Reserved.set(AArch64::WSP); + + Reserved.set(AArch64::XZR); + Reserved.set(AArch64::WZR); + + if (TFI->hasFP(MF)) { + Reserved.set(AArch64::X29); + Reserved.set(AArch64::W29); + } + + return Reserved; +} + +void +AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MBBI, + int SPAdj, + unsigned FIOperandNum, + RegScavenger *RS) const { + assert(SPAdj == 0 && "Cannot deal with nonzero SPAdj yet"); + MachineInstr &MI = *MBBI; + MachineBasicBlock &MBB = *MI.getParent(); + MachineFunction &MF = *MBB.getParent(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + const AArch64FrameLowering *TFI = + static_cast<const AArch64FrameLowering *>(MF.getTarget().getFrameLowering()); + + // In order to work out the base and offset for addressing, the FrameLowering + // code needs to know (sometimes) whether the instruction is storing/loading a + // callee-saved register, or whether it's a more generic + // operation. Fortunately the frame indices are used *only* for that purpose + // and are contiguous, so we can check here. + const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo(); + int MinCSFI = 0; + int MaxCSFI = -1; + + if (CSI.size()) { + MinCSFI = CSI[0].getFrameIdx(); + MaxCSFI = CSI[CSI.size() - 1].getFrameIdx(); + } + + int FrameIndex = MI.getOperand(FIOperandNum).getIndex(); + bool IsCalleeSaveOp = FrameIndex >= MinCSFI && FrameIndex <= MaxCSFI; + + unsigned FrameReg; + int64_t Offset; + Offset = TFI->resolveFrameIndexReference(MF, FrameIndex, FrameReg, SPAdj, + IsCalleeSaveOp); + + Offset += MI.getOperand(FIOperandNum + 1).getImm(); + + // DBG_VALUE instructions have no real restrictions so they can be handled + // easily. 
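+  // They simply record the resolved frame register and byte offset as-is,
+  // with no encodability constraints.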
+ if (MI.isDebugValue()) { + MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, /*isDef=*/ false); + MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset); + return; + } + + int MinOffset, MaxOffset, OffsetScale; + if (MI.getOpcode() == AArch64::ADDxxi_lsl0_s) { + MinOffset = 0; + MaxOffset = 0xfff; + OffsetScale = 1; + } else { + // Load/store of a stack object + TII.getAddressConstraints(MI, OffsetScale, MinOffset, MaxOffset); + } + + // The frame lowering has told us a base and offset it thinks we should use to + // access this variable, but it's still up to us to make sure the values are + // legal for the instruction in question. + if (Offset % OffsetScale != 0 || Offset < MinOffset || Offset > MaxOffset) { + unsigned BaseReg = + MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass); + emitRegUpdate(MBB, MBBI, MBBI->getDebugLoc(), TII, + BaseReg, FrameReg, BaseReg, Offset); + FrameReg = BaseReg; + Offset = 0; + } + + // Negative offsets are expected if we address from FP, but for + // now this checks nothing has gone horribly wrong. + assert(Offset >= 0 && "Unexpected negative offset from SP"); + + MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, false, false, true); + MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset / OffsetScale); +} + +unsigned +AArch64RegisterInfo::getFrameRegister(const MachineFunction &MF) const { + const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); + + if (TFI->hasFP(MF)) + return AArch64::X29; + else + return AArch64::XSP; +} + +bool +AArch64RegisterInfo::useFPForScavengingIndex(const MachineFunction &MF) const { + const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); + const AArch64FrameLowering *AFI + = static_cast<const AArch64FrameLowering*>(TFI); + return AFI->useFPForAddressing(MF); +} diff --git a/lib/Target/AArch64/AArch64RegisterInfo.h b/lib/Target/AArch64/AArch64RegisterInfo.h new file mode 100644 index 0000000..bb64fd5 --- /dev/null +++ b/lib/Target/AArch64/AArch64RegisterInfo.h @@ -0,0 +1,76 @@ +//==- AArch64RegisterInfo.h - AArch64 Register Information Impl -*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the AArch64 implementation of the MCRegisterInfo class. 
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_AARCH64REGISTERINFO_H
+#define LLVM_TARGET_AARCH64REGISTERINFO_H
+
+#include "llvm/Target/TargetRegisterInfo.h"
+
+#define GET_REGINFO_HEADER
+#include "AArch64GenRegisterInfo.inc"
+
+namespace llvm {
+
+class AArch64InstrInfo;
+class AArch64Subtarget;
+
+struct AArch64RegisterInfo : public AArch64GenRegisterInfo {
+private:
+  const AArch64InstrInfo &TII;
+
+public:
+  AArch64RegisterInfo(const AArch64InstrInfo &tii,
+                      const AArch64Subtarget &sti);
+
+  const uint16_t *getCalleeSavedRegs(const MachineFunction *MF = 0) const;
+  const uint32_t *getCallPreservedMask(CallingConv::ID) const;
+
+  const uint32_t *getTLSDescCallPreservedMask() const;
+
+  BitVector getReservedRegs(const MachineFunction &MF) const;
+  unsigned getFrameRegister(const MachineFunction &MF) const;
+
+  void eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
+                           unsigned FIOperandNum,
+                           RegScavenger *Rs = NULL) const;
+
+  /// getCrossCopyRegClass - Returns a legal register class to copy a register
+  /// in the specified class to or from. Returns the original class if it is
+  /// possible to copy between two registers of the specified class.
+  const TargetRegisterClass *
+  getCrossCopyRegClass(const TargetRegisterClass *RC) const;
+
+  /// getLargestLegalSuperClass - Returns the largest super class of RC that is
+  /// legal to use in the current sub-target and has the same spill size.
+  const TargetRegisterClass*
+  getLargestLegalSuperClass(const TargetRegisterClass *RC) const {
+    if (RC == &AArch64::tcGPR64RegClass)
+      return &AArch64::GPR64RegClass;
+
+    return RC;
+  }
+
+  bool requiresRegisterScavenging(const MachineFunction &MF) const {
+    return true;
+  }
+
+  bool requiresFrameIndexScavenging(const MachineFunction &MF) const {
+    return true;
+  }
+
+  bool useFPForScavengingIndex(const MachineFunction &MF) const;
+};
+
+} // end namespace llvm
+
+#endif // LLVM_TARGET_AARCH64REGISTERINFO_H
diff --git a/lib/Target/AArch64/AArch64RegisterInfo.td b/lib/Target/AArch64/AArch64RegisterInfo.td
new file mode 100644
index 0000000..bd79546
--- /dev/null
+++ b/lib/Target/AArch64/AArch64RegisterInfo.td
@@ -0,0 +1,203 @@
+//===- AArch64RegisterInfo.td - AArch64 Register defs ------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains declarations that describe the AArch64 register file
+//
+//===----------------------------------------------------------------------===//
+
+let Namespace = "AArch64" in {
+def sub_128 : SubRegIndex;
+def sub_64 : SubRegIndex;
+def sub_32 : SubRegIndex;
+def sub_16 : SubRegIndex;
+def sub_8  : SubRegIndex;
+
+// The VPR registers are handled as sub-registers of FPR equivalents, but
+// they're really the same thing. We give this concept a special index.
+def sub_alias : SubRegIndex;
+}
+
+// Registers are identified with 5-bit ID numbers.
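+// For example, "w7" and "x7" both carry encoding 7, while WSP/WZR and XSP/XZR
+// all share encoding 31 and are disambiguated purely by instruction context.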
+class AArch64Reg<bits<16> enc, string n> : Register<n> { + let HWEncoding = enc; + let Namespace = "AArch64"; +} + +class AArch64RegWithSubs<bits<16> enc, string n, list<Register> subregs = [], + list<SubRegIndex> inds = []> + : AArch64Reg<enc, n> { + let SubRegs = subregs; + let SubRegIndices = inds; +} + +//===----------------------------------------------------------------------===// +// Integer registers: w0-w30, wzr, wsp, x0-x30, xzr, sp +//===----------------------------------------------------------------------===// + +foreach Index = 0-30 in { + def W#Index : AArch64Reg< Index, "w"#Index>, DwarfRegNum<[Index]>; +} + +def WSP : AArch64Reg<31, "wsp">, DwarfRegNum<[31]>; +def WZR : AArch64Reg<31, "wzr">; + +// Could be combined with previous loop, but this way leaves w and x registers +// consecutive as LLVM register numbers, which makes for easier debugging. +foreach Index = 0-30 in { + def X#Index : AArch64RegWithSubs<Index, "x"#Index, + [!cast<Register>("W"#Index)], [sub_32]>, + DwarfRegNum<[Index]>; +} + +def XSP : AArch64RegWithSubs<31, "sp", [WSP], [sub_32]>, DwarfRegNum<[31]>; +def XZR : AArch64RegWithSubs<31, "xzr", [WZR], [sub_32]>; + +// Most instructions treat register 31 as zero for reads and a black-hole for +// writes. + +// Note that the order of registers is important for the Disassembler here: +// tablegen uses it to form MCRegisterClass::getRegister, which we assume can +// take an encoding value. +def GPR32 : RegisterClass<"AArch64", [i32], 32, + (add (sequence "W%u", 0, 30), WZR)> { +} + +def GPR64 : RegisterClass<"AArch64", [i64], 64, + (add (sequence "X%u", 0, 30), XZR)> { +} + +def GPR32nowzr : RegisterClass<"AArch64", [i32], 32, + (sequence "W%u", 0, 30)> { +} + +def GPR64noxzr : RegisterClass<"AArch64", [i64], 64, + (sequence "X%u", 0, 30)> { +} + +// For tail calls, we can't use callee-saved registers or the structure-return +// register, as they are supposed to be live across function calls and may be +// clobbered by the epilogue. +def tcGPR64 : RegisterClass<"AArch64", [i64], 64, + (add (sequence "X%u", 0, 7), + (sequence "X%u", 9, 18))> { +} + + +// Certain addressing-useful instructions accept sp directly. Again the order of +// registers is important to the Disassembler. +def GPR32wsp : RegisterClass<"AArch64", [i32], 32, + (add (sequence "W%u", 0, 30), WSP)> { +} + +def GPR64xsp : RegisterClass<"AArch64", [i64], 64, + (add (sequence "X%u", 0, 30), XSP)> { +} + +// Some aliases *only* apply to SP (e.g. MOV uses different encoding for SP and +// non-SP variants). 
We can't use a bare register in those patterns because
+// TableGen doesn't like it, so we need a class containing just stack
+// registers.
+def Rxsp : RegisterClass<"AArch64", [i64], 64,
+                         (add XSP)> {
+}
+
+def Rwsp : RegisterClass<"AArch64", [i32], 32,
+                         (add WSP)> {
+}
+
+//===----------------------------------------------------------------------===//
+// Scalar registers in the vector unit:
+// b0-b31, h0-h31, s0-s31, d0-d31, q0-q31
+//===----------------------------------------------------------------------===//
+
+foreach Index = 0-31 in {
+  def B # Index : AArch64Reg< Index, "b" # Index>,
+                  DwarfRegNum<[!add(Index, 64)]>;
+
+  def H # Index : AArch64RegWithSubs<Index, "h" # Index,
+                                     [!cast<Register>("B" # Index)], [sub_8]>,
+                  DwarfRegNum<[!add(Index, 64)]>;
+
+  def S # Index : AArch64RegWithSubs<Index, "s" # Index,
+                                     [!cast<Register>("H" # Index)], [sub_16]>,
+                  DwarfRegNum<[!add(Index, 64)]>;
+
+  def D # Index : AArch64RegWithSubs<Index, "d" # Index,
+                                     [!cast<Register>("S" # Index)], [sub_32]>,
+                  DwarfRegNum<[!add(Index, 64)]>;
+
+  def Q # Index : AArch64RegWithSubs<Index, "q" # Index,
+                                     [!cast<Register>("D" # Index)], [sub_64]>,
+                  DwarfRegNum<[!add(Index, 64)]>;
+}
+
+
+def FPR8 : RegisterClass<"AArch64", [i8], 8,
+                         (sequence "B%u", 0, 31)> {
+}
+
+def FPR16 : RegisterClass<"AArch64", [f16], 16,
+                          (sequence "H%u", 0, 31)> {
+}
+
+def FPR32 : RegisterClass<"AArch64", [f32], 32,
+                          (sequence "S%u", 0, 31)> {
+}
+
+def FPR64 : RegisterClass<"AArch64", [f64], 64,
+                          (sequence "D%u", 0, 31)> {
+}
+
+def FPR128 : RegisterClass<"AArch64", [f128], 128,
+                           (sequence "Q%u", 0, 31)> {
+}
+
+
+//===----------------------------------------------------------------------===//
+// Vector registers:
+//===----------------------------------------------------------------------===//
+
+// NEON registers simply specify the overall vector, and it's expected that
+// Instructions will individually specify the acceptable data layout. In
+// principle this leaves two approaches open:
+//   + An operand, giving a single ADDvvv instruction (for example). This turns
+//     out to be unworkable in the assembly parser (without every Instruction
+//     having a "cvt" function, at least) because the constraints can't be
+//     properly enforced. It also complicates specifying patterns since each
+//     instruction will accept many types.
+//   + A bare token (e.g. ".2d"). This means the AsmParser has to know specific
+//     details about NEON registers, but simplifies most other details.
+//
+// The second approach was taken.
+
+foreach Index = 0-31 in {
+  def V # Index : AArch64RegWithSubs<Index, "v" # Index,
+                                     [!cast<Register>("Q" # Index)],
+                                     [sub_alias]>,
+                  DwarfRegNum<[!add(Index, 64)]>;
+}
+
+// These two classes contain the same registers, which should be reasonably
+// sensible for MC and allocation purposes, but this allows them to be treated
+// separately for things like stack spilling.
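+// (For example, the AAPCS64 only requires the low 64 bits of v8-v15 to be
+// callee-saved, so a VPR64 value needs just a 64-bit spill slot even though
+// the V-registers alias the full 128-bit Q-registers.)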
+def VPR64 : RegisterClass<"AArch64", [v2f32, v2i32, v4i16, v8i8], 64,
+                          (sequence "V%u", 0, 31)>;
+
+def VPR128 : RegisterClass<"AArch64",
+                           [v2f64, v2i64, v4f32, v4i32, v8i16, v16i8], 128,
+                           (sequence "V%u", 0, 31)>;
+
+// Flags register
+def NZCV : Register<"nzcv"> {
+  let Namespace = "AArch64";
+}
+
+def FlagClass : RegisterClass<"AArch64", [i32], 32, (add NZCV)> {
+  let CopyCost = -1;
+  let isAllocatable = 0;
+}
diff --git a/lib/Target/AArch64/AArch64Schedule.td b/lib/Target/AArch64/AArch64Schedule.td new file mode 100644 index 0000000..e17cdaa --- /dev/null +++ b/lib/Target/AArch64/AArch64Schedule.td @@ -0,0 +1,10 @@
+//===- AArch64Schedule.td - AArch64 Scheduling Definitions -*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+def GenericItineraries : ProcessorItineraries<[], [], []>;
diff --git a/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp new file mode 100644 index 0000000..6bbe075 --- /dev/null +++ b/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp @@ -0,0 +1,25 @@
+//===-- AArch64SelectionDAGInfo.cpp - AArch64 SelectionDAG Info -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the AArch64SelectionDAGInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "aarch64-selectiondag-info"
+#include "AArch64TargetMachine.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+using namespace llvm;
+
+AArch64SelectionDAGInfo::AArch64SelectionDAGInfo(const AArch64TargetMachine &TM)
+  : TargetSelectionDAGInfo(TM),
+    Subtarget(&TM.getSubtarget<AArch64Subtarget>()) {
+}
+
+AArch64SelectionDAGInfo::~AArch64SelectionDAGInfo() {
+}
diff --git a/lib/Target/AArch64/AArch64SelectionDAGInfo.h b/lib/Target/AArch64/AArch64SelectionDAGInfo.h new file mode 100644 index 0000000..d412ed2 --- /dev/null +++ b/lib/Target/AArch64/AArch64SelectionDAGInfo.h @@ -0,0 +1,32 @@
+//===-- AArch64SelectionDAGInfo.h - AArch64 SelectionDAG Info ---*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the AArch64 subclass for TargetSelectionDAGInfo.
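+// At this stage the subclass only records the subtarget; custom SelectionDAG
+// lowering hooks (e.g. for memcpy or memset expansion) would be added here.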
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_AARCH64SELECTIONDAGINFO_H
+#define LLVM_AARCH64SELECTIONDAGINFO_H
+
+#include "llvm/Target/TargetSelectionDAGInfo.h"
+
+namespace llvm {
+
+class AArch64Subtarget;
+class AArch64TargetMachine;
+
+class AArch64SelectionDAGInfo : public TargetSelectionDAGInfo {
+  const AArch64Subtarget *Subtarget;
+public:
+  explicit AArch64SelectionDAGInfo(const AArch64TargetMachine &TM);
+  ~AArch64SelectionDAGInfo();
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/AArch64/AArch64Subtarget.cpp b/lib/Target/AArch64/AArch64Subtarget.cpp new file mode 100644 index 0000000..d17b738 --- /dev/null +++ b/lib/Target/AArch64/AArch64Subtarget.cpp @@ -0,0 +1,43 @@
+//===-- AArch64Subtarget.cpp - AArch64 Subtarget Information --------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the AArch64 specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64Subtarget.h"
+#include "AArch64RegisterInfo.h"
+#include "MCTargetDesc/AArch64MCTargetDesc.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/ADT/SmallVector.h"
+
+#define GET_SUBTARGETINFO_TARGET_DESC
+#define GET_SUBTARGETINFO_CTOR
+#include "AArch64GenSubtargetInfo.inc"
+
+using namespace llvm;
+
+AArch64Subtarget::AArch64Subtarget(StringRef TT, StringRef CPU, StringRef FS)
+  : AArch64GenSubtargetInfo(TT, CPU, FS)
+  , HasNEON(true)
+  , HasCrypto(true)
+  , TargetTriple(TT) {
+
+  ParseSubtargetFeatures(CPU, FS);
+}
+
+bool AArch64Subtarget::GVIsIndirectSymbol(const GlobalValue *GV,
+                                          Reloc::Model RelocM) const {
+  if (RelocM == Reloc::Static)
+    return false;
+
+  return !GV->hasLocalLinkage() && !GV->hasHiddenVisibility();
+}
diff --git a/lib/Target/AArch64/AArch64Subtarget.h b/lib/Target/AArch64/AArch64Subtarget.h new file mode 100644 index 0000000..2e9205f --- /dev/null +++ b/lib/Target/AArch64/AArch64Subtarget.h @@ -0,0 +1,54 @@
+//==-- AArch64Subtarget.h - Define Subtarget for the AArch64 ---*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the AArch64 specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_AARCH64_SUBTARGET_H
+#define LLVM_TARGET_AARCH64_SUBTARGET_H
+
+#include "llvm/ADT/Triple.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+#define GET_SUBTARGETINFO_HEADER
+#include "AArch64GenSubtargetInfo.inc"
+
+#include <string>
+
+namespace llvm {
+class StringRef;
+class GlobalValue;
+
+class AArch64Subtarget : public AArch64GenSubtargetInfo {
+protected:
+  bool HasNEON;
+  bool HasCrypto;
+
+  /// TargetTriple - What processor and OS we're targeting.
+  Triple TargetTriple;
+public:
+  /// This constructor initializes the data members to match that
+  /// of the specified triple.
+  ///
+  AArch64Subtarget(StringRef TT, StringRef CPU, StringRef FS);
+
+  /// ParseSubtargetFeatures - Parses features string setting specified
+  /// subtarget options.
Definition of function is auto generated by tblgen. + void ParseSubtargetFeatures(StringRef CPU, StringRef FS); + + bool GVIsIndirectSymbol(const GlobalValue *GV, Reloc::Model RelocM) const; + + bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); } + bool isTargetLinux() const { return TargetTriple.getOS() == Triple::Linux; } + +}; +} // End llvm namespace + +#endif // LLVM_TARGET_AARCH64_SUBTARGET_H diff --git a/lib/Target/AArch64/AArch64TargetMachine.cpp b/lib/Target/AArch64/AArch64TargetMachine.cpp new file mode 100644 index 0000000..df599d5 --- /dev/null +++ b/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -0,0 +1,81 @@ +//===-- AArch64TargetMachine.cpp - Define TargetMachine for AArch64 -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the implementation of the AArch64TargetMachine +// methods. Principally just setting up the passes needed to generate correct +// code on this architecture. +// +//===----------------------------------------------------------------------===// + +#include "AArch64.h" +#include "AArch64TargetMachine.h" +#include "MCTargetDesc/AArch64MCTargetDesc.h" +#include "llvm/PassManager.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/Support/TargetRegistry.h" + +using namespace llvm; + +extern "C" void LLVMInitializeAArch64Target() { + RegisterTargetMachine<AArch64TargetMachine> X(TheAArch64Target); +} + +AArch64TargetMachine::AArch64TargetMachine(const Target &T, StringRef TT, + StringRef CPU, StringRef FS, + const TargetOptions &Options, + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL) + : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), + Subtarget(TT, CPU, FS), + InstrInfo(Subtarget), + DL("e-p:64:64-i64:64:64-i128:128:128-s0:32:32-f128:128:128-n32:64-S128"), + TLInfo(*this), + TSInfo(*this), + FrameLowering(Subtarget) { +} + +namespace { +/// AArch64 Code Generator Pass Configuration Options. +class AArch64PassConfig : public TargetPassConfig { +public: + AArch64PassConfig(AArch64TargetMachine *TM, PassManagerBase &PM) + : TargetPassConfig(TM, PM) {} + + AArch64TargetMachine &getAArch64TargetMachine() const { + return getTM<AArch64TargetMachine>(); + } + + const AArch64Subtarget &getAArch64Subtarget() const { + return *getAArch64TargetMachine().getSubtargetImpl(); + } + + virtual bool addInstSelector(); + virtual bool addPreEmitPass(); +}; +} // namespace + +TargetPassConfig *AArch64TargetMachine::createPassConfig(PassManagerBase &PM) { + return new AArch64PassConfig(this, PM); +} + +bool AArch64PassConfig::addPreEmitPass() { + addPass(&UnpackMachineBundlesID); + addPass(createAArch64BranchFixupPass()); + return true; +} + +bool AArch64PassConfig::addInstSelector() { + addPass(createAArch64ISelDAG(getAArch64TargetMachine(), getOptLevel())); + + // For ELF, cleanup any local-dynamic TLS accesses. 
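+  // (Several local-dynamic accesses within one function can share a single
+  // TLSDESC call; the cleanup pass merges the redundant computations.)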
+ if (getAArch64Subtarget().isTargetELF() && getOptLevel() != CodeGenOpt::None) + addPass(createAArch64CleanupLocalDynamicTLSPass()); + + return false; +} diff --git a/lib/Target/AArch64/AArch64TargetMachine.h b/lib/Target/AArch64/AArch64TargetMachine.h new file mode 100644 index 0000000..c1f47c2 --- /dev/null +++ b/lib/Target/AArch64/AArch64TargetMachine.h @@ -0,0 +1,69 @@ +//=== AArch64TargetMachine.h - Define TargetMachine for AArch64 -*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file declares the AArch64 specific subclass of TargetMachine. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_AARCH64TARGETMACHINE_H +#define LLVM_AARCH64TARGETMACHINE_H + +#include "AArch64FrameLowering.h" +#include "AArch64ISelLowering.h" +#include "AArch64InstrInfo.h" +#include "AArch64SelectionDAGInfo.h" +#include "AArch64Subtarget.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/Target/TargetMachine.h" + +namespace llvm { + +class AArch64TargetMachine : public LLVMTargetMachine { + AArch64Subtarget Subtarget; + AArch64InstrInfo InstrInfo; + const DataLayout DL; + AArch64TargetLowering TLInfo; + AArch64SelectionDAGInfo TSInfo; + AArch64FrameLowering FrameLowering; + +public: + AArch64TargetMachine(const Target &T, StringRef TT, StringRef CPU, + StringRef FS, const TargetOptions &Options, + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL); + + const AArch64InstrInfo *getInstrInfo() const { + return &InstrInfo; + } + + const AArch64FrameLowering *getFrameLowering() const { + return &FrameLowering; + } + + const AArch64TargetLowering *getTargetLowering() const { + return &TLInfo; + } + + const AArch64SelectionDAGInfo *getSelectionDAGInfo() const { + return &TSInfo; + } + + const AArch64Subtarget *getSubtargetImpl() const { return &Subtarget; } + + const DataLayout *getDataLayout() const { return &DL; } + + const TargetRegisterInfo *getRegisterInfo() const { + return &InstrInfo.getRegisterInfo(); + } + TargetPassConfig *createPassConfig(PassManagerBase &PM); +}; + +} + +#endif diff --git a/lib/Target/AArch64/AArch64TargetObjectFile.cpp b/lib/Target/AArch64/AArch64TargetObjectFile.cpp new file mode 100644 index 0000000..b4452f5 --- /dev/null +++ b/lib/Target/AArch64/AArch64TargetObjectFile.cpp @@ -0,0 +1,24 @@ +//===-- AArch64TargetObjectFile.cpp - AArch64 Object Info -----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file deals with any AArch64 specific requirements on object files. 
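+// For now that amounts to honouring TargetOptions::UseInitArray, i.e. using
+// .init_array rather than .ctors on ELF.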
+// +//===----------------------------------------------------------------------===// + + +#include "AArch64TargetObjectFile.h" + +using namespace llvm; + +void +AArch64LinuxTargetObjectFile::Initialize(MCContext &Ctx, + const TargetMachine &TM) { + TargetLoweringObjectFileELF::Initialize(Ctx, TM); + InitializeELF(TM.Options.UseInitArray); +} diff --git a/lib/Target/AArch64/AArch64TargetObjectFile.h b/lib/Target/AArch64/AArch64TargetObjectFile.h new file mode 100644 index 0000000..bf0565a --- /dev/null +++ b/lib/Target/AArch64/AArch64TargetObjectFile.h @@ -0,0 +1,31 @@ +//===-- AArch64TargetObjectFile.h - AArch64 Object Info ---------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file deals with any AArch64 specific requirements on object files. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TARGET_AARCH64_TARGETOBJECTFILE_H +#define LLVM_TARGET_AARCH64_TARGETOBJECTFILE_H + +#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetLoweringObjectFile.h" + +namespace llvm { + + /// AArch64LinuxTargetObjectFile - This implementation is used for linux + /// AArch64. + class AArch64LinuxTargetObjectFile : public TargetLoweringObjectFileELF { + virtual void Initialize(MCContext &Ctx, const TargetMachine &TM); + }; + +} // end namespace llvm + +#endif diff --git a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp new file mode 100644 index 0000000..c1695da --- /dev/null +++ b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -0,0 +1,2188 @@ +//==- AArch64AsmParser.cpp - Parse AArch64 assembly to MCInst instructions -==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the (GNU-style) assembly parser for the AArch64 +// architecture. 
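+// Mnemonic matching itself is done by the TableGen'erated code included as
+// AArch64GenAsmMatcher.inc; this file supplies the operand parsing and
+// validation that feeds it.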
+// +//===----------------------------------------------------------------------===// + + +#include "MCTargetDesc/AArch64MCTargetDesc.h" +#include "MCTargetDesc/AArch64MCExpr.h" +#include "Utils/AArch64BaseInfo.h" +#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/StringSwitch.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MCTargetAsmParser.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCParser/MCAsmLexer.h" +#include "llvm/MC/MCParser/MCAsmParser.h" +#include "llvm/MC/MCParser/MCParsedAsmOperand.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Support/TargetRegistry.h" + +using namespace llvm; + +namespace { + +class AArch64Operand; + +class AArch64AsmParser : public MCTargetAsmParser { + MCSubtargetInfo &STI; + MCAsmParser &Parser; + +#define GET_ASSEMBLER_HEADER +#include "AArch64GenAsmMatcher.inc" + +public: + enum AArch64MatchResultTy { + Match_FirstAArch64 = FIRST_TARGET_MATCH_RESULT_TY, +#define GET_OPERAND_DIAGNOSTIC_TYPES +#include "AArch64GenAsmMatcher.inc" + }; + + AArch64AsmParser(MCSubtargetInfo &_STI, MCAsmParser &_Parser) + : MCTargetAsmParser(), STI(_STI), Parser(_Parser) { + MCAsmParserExtension::Initialize(_Parser); + + // Initialize the set of available features. + setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits())); + } + + // These are the public interface of the MCTargetAsmParser + bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc); + bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, + SMLoc NameLoc, + SmallVectorImpl<MCParsedAsmOperand*> &Operands); + + bool ParseDirective(AsmToken DirectiveID); + bool ParseDirectiveTLSDescCall(SMLoc L); + bool ParseDirectiveWord(unsigned Size, SMLoc L); + + bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, + SmallVectorImpl<MCParsedAsmOperand*> &Operands, + MCStreamer&Out, unsigned &ErrorInfo, + bool MatchingInlineAsm); + + // The rest of the sub-parsers have more freedom over interface: they return + // an OperandMatchResultTy because it's less ambiguous than true/false or + // -1/0/1 even if it is more verbose + OperandMatchResultTy + ParseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands, + StringRef Mnemonic); + + OperandMatchResultTy ParseImmediate(const MCExpr *&ExprVal); + + OperandMatchResultTy ParseRelocPrefix(AArch64MCExpr::VariantKind &RefKind); + + OperandMatchResultTy + ParseNEONLane(SmallVectorImpl<MCParsedAsmOperand*> &Operands, + uint32_t NumLanes); + + OperandMatchResultTy + ParseRegister(SmallVectorImpl<MCParsedAsmOperand*> &Operands, + uint32_t &NumLanes); + + OperandMatchResultTy + ParseImmWithLSLOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands); + + OperandMatchResultTy + ParseCondCodeOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands); + + OperandMatchResultTy + ParseCRxOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands); + + OperandMatchResultTy + ParseFPImmOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands); + + template<typename SomeNamedImmMapper> OperandMatchResultTy + ParseNamedImmOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { + return ParseNamedImmOperand(SomeNamedImmMapper(), Operands); + } + + OperandMatchResultTy + ParseNamedImmOperand(const NamedImmMapper &Mapper, + SmallVectorImpl<MCParsedAsmOperand*> &Operands); + + OperandMatchResultTy + 
ParseLSXAddressOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands); + + OperandMatchResultTy + ParseShiftExtend(SmallVectorImpl<MCParsedAsmOperand*> &Operands); + + OperandMatchResultTy + ParseSysRegOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands); + + bool validateInstruction(MCInst &Inst, + const SmallVectorImpl<MCParsedAsmOperand*> &Operands); + + /// Scan the next token (which had better be an identifier) and determine + /// whether it represents a general-purpose or vector register. It returns + /// true if an identifier was found and populates its reference arguments. It + /// does not consume the token. + bool + IdentifyRegister(unsigned &RegNum, SMLoc &RegEndLoc, StringRef &LayoutSpec, + SMLoc &LayoutLoc) const; + +}; + +} + +namespace { + +/// Instances of this class represent a parsed AArch64 machine instruction. +class AArch64Operand : public MCParsedAsmOperand { +private: + enum KindTy { + k_ImmWithLSL, // #uimm {, LSL #amt } + k_CondCode, // eq/ne/... + k_FPImmediate, // Limited-precision floating-point imm + k_Immediate, // Including expressions referencing symbols + k_Register, + k_ShiftExtend, + k_SysReg, // The register operand of MRS and MSR instructions + k_Token, // The mnemonic; other raw tokens the auto-generated + k_WrappedRegister // Load/store exclusive permit a wrapped register. + } Kind; + + SMLoc StartLoc, EndLoc; + + union { + struct { + const MCExpr *Val; + unsigned ShiftAmount; + bool ImplicitAmount; + } ImmWithLSL; + + struct { + A64CC::CondCodes Code; + } CondCode; + + struct { + double Val; + } FPImm; + + struct { + const MCExpr *Val; + } Imm; + + struct { + unsigned RegNum; + } Reg; + + struct { + A64SE::ShiftExtSpecifiers ShiftType; + unsigned Amount; + bool ImplicitAmount; + } ShiftExtend; + + struct { + const char *Data; + unsigned Length; + } SysReg; + + struct { + const char *Data; + unsigned Length; + } Tok; + }; + + AArch64Operand(KindTy K, SMLoc S, SMLoc E) + : MCParsedAsmOperand(), Kind(K), StartLoc(S), EndLoc(E) {} + +public: + AArch64Operand(const AArch64Operand &o) : MCParsedAsmOperand() { + } + + SMLoc getStartLoc() const { return StartLoc; } + SMLoc getEndLoc() const { return EndLoc; } + void print(raw_ostream&) const; + void dump() const; + + StringRef getToken() const { + assert(Kind == k_Token && "Invalid access!"); + return StringRef(Tok.Data, Tok.Length); + } + + unsigned getReg() const { + assert((Kind == k_Register || Kind == k_WrappedRegister) + && "Invalid access!"); + return Reg.RegNum; + } + + const MCExpr *getImm() const { + assert(Kind == k_Immediate && "Invalid access!"); + return Imm.Val; + } + + A64CC::CondCodes getCondCode() const { + assert(Kind == k_CondCode && "Invalid access!"); + return CondCode.Code; + } + + static bool isNonConstantExpr(const MCExpr *E, + AArch64MCExpr::VariantKind &Variant) { + if (const AArch64MCExpr *A64E = dyn_cast<AArch64MCExpr>(E)) { + Variant = A64E->getKind(); + return true; + } else if (!isa<MCConstantExpr>(E)) { + Variant = AArch64MCExpr::VK_AARCH64_None; + return true; + } + + return false; + } + + bool isCondCode() const { return Kind == k_CondCode; } + bool isToken() const { return Kind == k_Token; } + bool isReg() const { return Kind == k_Register; } + bool isImm() const { return Kind == k_Immediate; } + bool isMem() const { return false; } + bool isFPImm() const { return Kind == k_FPImmediate; } + bool isShiftOrExtend() const { return Kind == k_ShiftExtend; } + bool isSysReg() const { return Kind == k_SysReg; } + bool isImmWithLSL() const { return Kind == k_ImmWithLSL; } + 
bool isWrappedReg() const { return Kind == k_WrappedRegister; } + + bool isAddSubImmLSL0() const { + if (!isImmWithLSL()) return false; + if (ImmWithLSL.ShiftAmount != 0) return false; + + AArch64MCExpr::VariantKind Variant; + if (isNonConstantExpr(ImmWithLSL.Val, Variant)) { + return Variant == AArch64MCExpr::VK_AARCH64_LO12 + || Variant == AArch64MCExpr::VK_AARCH64_DTPREL_LO12 + || Variant == AArch64MCExpr::VK_AARCH64_DTPREL_LO12_NC + || Variant == AArch64MCExpr::VK_AARCH64_TPREL_LO12 + || Variant == AArch64MCExpr::VK_AARCH64_TPREL_LO12_NC + || Variant == AArch64MCExpr::VK_AARCH64_TLSDESC_LO12; + } + + // Otherwise it should be a real immediate in range: + const MCConstantExpr *CE = cast<MCConstantExpr>(ImmWithLSL.Val); + return CE->getValue() >= 0 && CE->getValue() <= 0xfff; + } + + bool isAddSubImmLSL12() const { + if (!isImmWithLSL()) return false; + if (ImmWithLSL.ShiftAmount != 12) return false; + + AArch64MCExpr::VariantKind Variant; + if (isNonConstantExpr(ImmWithLSL.Val, Variant)) { + return Variant == AArch64MCExpr::VK_AARCH64_DTPREL_HI12 + || Variant == AArch64MCExpr::VK_AARCH64_TPREL_HI12; + } + + // Otherwise it should be a real immediate in range: + const MCConstantExpr *CE = cast<MCConstantExpr>(ImmWithLSL.Val); + return CE->getValue() >= 0 && CE->getValue() <= 0xfff; + } + + template<unsigned MemSize, unsigned RmSize> bool isAddrRegExtend() const { + if (!isShiftOrExtend()) return false; + + A64SE::ShiftExtSpecifiers Ext = ShiftExtend.ShiftType; + if (RmSize == 32 && !(Ext == A64SE::UXTW || Ext == A64SE::SXTW)) + return false; + + if (RmSize == 64 && !(Ext == A64SE::LSL || Ext == A64SE::SXTX)) + return false; + + return ShiftExtend.Amount == Log2_32(MemSize) || ShiftExtend.Amount == 0; + } + + bool isAdrpLabel() const { + if (!isImm()) return false; + + AArch64MCExpr::VariantKind Variant; + if (isNonConstantExpr(getImm(), Variant)) { + return Variant == AArch64MCExpr::VK_AARCH64_None + || Variant == AArch64MCExpr::VK_AARCH64_GOT + || Variant == AArch64MCExpr::VK_AARCH64_GOTTPREL + || Variant == AArch64MCExpr::VK_AARCH64_TLSDESC; + } + + return isLabel<21, 4096>(); + } + + template<unsigned RegWidth> bool isBitfieldWidth() const { + if (!isImm()) return false; + + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + + return CE->getValue() >= 1 && CE->getValue() <= RegWidth; + } + + template<int RegWidth> + bool isCVTFixedPos() const { + if (!isImm()) return false; + + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + + return CE->getValue() >= 1 && CE->getValue() <= RegWidth; + } + + bool isFMOVImm() const { + if (!isFPImm()) return false; + + APFloat RealVal(FPImm.Val); + uint32_t ImmVal; + return A64Imms::isFPImm(RealVal, ImmVal); + } + + bool isFPZero() const { + if (!isFPImm()) return false; + + APFloat RealVal(FPImm.Val); + return RealVal.isPosZero(); + } + + template<unsigned field_width, unsigned scale> + bool isLabel() const { + if (!isImm()) return false; + + if (dyn_cast<MCSymbolRefExpr>(Imm.Val)) { + return true; + } else if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Imm.Val)) { + int64_t Val = CE->getValue(); + int64_t Min = - (scale * (1LL << (field_width - 1))); + int64_t Max = scale * ((1LL << (field_width - 1)) - 1); + return (Val % scale) == 0 && Val >= Min && Val <= Max; + } + + // N.b. this disallows explicit relocation specifications via an + // AArch64MCExpr. 
Users needing that behaviour + return false; + } + + bool isLane1() const { + if (!isImm()) return false; + + // Because it's come through custom assembly parsing, it must always be a + // constant expression. + return cast<MCConstantExpr>(getImm())->getValue() == 1; + } + + bool isLoadLitLabel() const { + if (!isImm()) return false; + + AArch64MCExpr::VariantKind Variant; + if (isNonConstantExpr(getImm(), Variant)) { + return Variant == AArch64MCExpr::VK_AARCH64_None + || Variant == AArch64MCExpr::VK_AARCH64_GOTTPREL; + } + + return isLabel<19, 4>(); + } + + template<unsigned RegWidth> bool isLogicalImm() const { + if (!isImm()) return false; + + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Imm.Val); + if (!CE) return false; + + uint32_t Bits; + return A64Imms::isLogicalImm(RegWidth, CE->getValue(), Bits); + } + + template<unsigned RegWidth> bool isLogicalImmMOV() const { + if (!isLogicalImm<RegWidth>()) return false; + + const MCConstantExpr *CE = cast<MCConstantExpr>(Imm.Val); + + // The move alias for ORR is only valid if the immediate cannot be + // represented with a move (immediate) instruction; they take priority. + int UImm16, Shift; + return !A64Imms::isMOVZImm(RegWidth, CE->getValue(), UImm16, Shift) + && !A64Imms::isMOVNImm(RegWidth, CE->getValue(), UImm16, Shift); + } + + template<int MemSize> + bool isOffsetUImm12() const { + if (!isImm()) return false; + + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + + // Assume they know what they're doing for now if they've given us a + // non-constant expression. In principle we could check for ridiculous + // things that can't possibly work or relocations that would almost + // certainly break resulting code. + if (!CE) + return true; + + int64_t Val = CE->getValue(); + + // Must be a multiple of the access size in bytes. + if ((Val & (MemSize - 1)) != 0) return false; + + // Must be 12-bit unsigned + return Val >= 0 && Val <= 0xfff * MemSize; + } + + template<A64SE::ShiftExtSpecifiers SHKind, bool is64Bit> + bool isShift() const { + if (!isShiftOrExtend()) return false; + + if (ShiftExtend.ShiftType != SHKind) + return false; + + return is64Bit ? 
ShiftExtend.Amount <= 63 : ShiftExtend.Amount <= 31; + } + + bool isMOVN32Imm() const { + static AArch64MCExpr::VariantKind PermittedModifiers[] = { + AArch64MCExpr::VK_AARCH64_SABS_G0, + AArch64MCExpr::VK_AARCH64_SABS_G1, + AArch64MCExpr::VK_AARCH64_DTPREL_G1, + AArch64MCExpr::VK_AARCH64_DTPREL_G0, + AArch64MCExpr::VK_AARCH64_GOTTPREL_G1, + AArch64MCExpr::VK_AARCH64_TPREL_G1, + AArch64MCExpr::VK_AARCH64_TPREL_G0, + }; + unsigned NumModifiers = llvm::array_lengthof(PermittedModifiers); + + return isMoveWideImm(32, PermittedModifiers, NumModifiers); + } + + bool isMOVN64Imm() const { + static AArch64MCExpr::VariantKind PermittedModifiers[] = { + AArch64MCExpr::VK_AARCH64_SABS_G0, + AArch64MCExpr::VK_AARCH64_SABS_G1, + AArch64MCExpr::VK_AARCH64_SABS_G2, + AArch64MCExpr::VK_AARCH64_DTPREL_G2, + AArch64MCExpr::VK_AARCH64_DTPREL_G1, + AArch64MCExpr::VK_AARCH64_DTPREL_G0, + AArch64MCExpr::VK_AARCH64_GOTTPREL_G1, + AArch64MCExpr::VK_AARCH64_TPREL_G2, + AArch64MCExpr::VK_AARCH64_TPREL_G1, + AArch64MCExpr::VK_AARCH64_TPREL_G0, + }; + unsigned NumModifiers = llvm::array_lengthof(PermittedModifiers); + + return isMoveWideImm(64, PermittedModifiers, NumModifiers); + } + + + bool isMOVZ32Imm() const { + static AArch64MCExpr::VariantKind PermittedModifiers[] = { + AArch64MCExpr::VK_AARCH64_ABS_G0, + AArch64MCExpr::VK_AARCH64_ABS_G1, + AArch64MCExpr::VK_AARCH64_SABS_G0, + AArch64MCExpr::VK_AARCH64_SABS_G1, + AArch64MCExpr::VK_AARCH64_DTPREL_G1, + AArch64MCExpr::VK_AARCH64_DTPREL_G0, + AArch64MCExpr::VK_AARCH64_GOTTPREL_G1, + AArch64MCExpr::VK_AARCH64_TPREL_G1, + AArch64MCExpr::VK_AARCH64_TPREL_G0, + }; + unsigned NumModifiers = llvm::array_lengthof(PermittedModifiers); + + return isMoveWideImm(32, PermittedModifiers, NumModifiers); + } + + bool isMOVZ64Imm() const { + static AArch64MCExpr::VariantKind PermittedModifiers[] = { + AArch64MCExpr::VK_AARCH64_ABS_G0, + AArch64MCExpr::VK_AARCH64_ABS_G1, + AArch64MCExpr::VK_AARCH64_ABS_G2, + AArch64MCExpr::VK_AARCH64_ABS_G3, + AArch64MCExpr::VK_AARCH64_SABS_G0, + AArch64MCExpr::VK_AARCH64_SABS_G1, + AArch64MCExpr::VK_AARCH64_SABS_G2, + AArch64MCExpr::VK_AARCH64_DTPREL_G2, + AArch64MCExpr::VK_AARCH64_DTPREL_G1, + AArch64MCExpr::VK_AARCH64_DTPREL_G0, + AArch64MCExpr::VK_AARCH64_GOTTPREL_G1, + AArch64MCExpr::VK_AARCH64_TPREL_G2, + AArch64MCExpr::VK_AARCH64_TPREL_G1, + AArch64MCExpr::VK_AARCH64_TPREL_G0, + }; + unsigned NumModifiers = llvm::array_lengthof(PermittedModifiers); + + return isMoveWideImm(64, PermittedModifiers, NumModifiers); + } + + bool isMOVK32Imm() const { + static AArch64MCExpr::VariantKind PermittedModifiers[] = { + AArch64MCExpr::VK_AARCH64_ABS_G0_NC, + AArch64MCExpr::VK_AARCH64_ABS_G1_NC, + AArch64MCExpr::VK_AARCH64_DTPREL_G1_NC, + AArch64MCExpr::VK_AARCH64_DTPREL_G0_NC, + AArch64MCExpr::VK_AARCH64_GOTTPREL_G0_NC, + AArch64MCExpr::VK_AARCH64_TPREL_G1_NC, + AArch64MCExpr::VK_AARCH64_TPREL_G0_NC, + }; + unsigned NumModifiers = llvm::array_lengthof(PermittedModifiers); + + return isMoveWideImm(32, PermittedModifiers, NumModifiers); + } + + bool isMOVK64Imm() const { + static AArch64MCExpr::VariantKind PermittedModifiers[] = { + AArch64MCExpr::VK_AARCH64_ABS_G0_NC, + AArch64MCExpr::VK_AARCH64_ABS_G1_NC, + AArch64MCExpr::VK_AARCH64_ABS_G2_NC, + AArch64MCExpr::VK_AARCH64_ABS_G3, + AArch64MCExpr::VK_AARCH64_DTPREL_G1_NC, + AArch64MCExpr::VK_AARCH64_DTPREL_G0_NC, + AArch64MCExpr::VK_AARCH64_GOTTPREL_G0_NC, + AArch64MCExpr::VK_AARCH64_TPREL_G1_NC, + AArch64MCExpr::VK_AARCH64_TPREL_G0_NC, + }; + unsigned NumModifiers = 
llvm::array_lengthof(PermittedModifiers); + + return isMoveWideImm(64, PermittedModifiers, NumModifiers); + } + + bool isMoveWideImm(unsigned RegWidth, + AArch64MCExpr::VariantKind *PermittedModifiers, + unsigned NumModifiers) const { + if (!isImmWithLSL()) return false; + + if (ImmWithLSL.ShiftAmount % 16 != 0) return false; + if (ImmWithLSL.ShiftAmount >= RegWidth) return false; + + AArch64MCExpr::VariantKind Modifier; + if (isNonConstantExpr(ImmWithLSL.Val, Modifier)) { + // E.g. "#:abs_g0:sym, lsl #16" makes no sense. + if (!ImmWithLSL.ImplicitAmount) return false; + + for (unsigned i = 0; i < NumModifiers; ++i) + if (PermittedModifiers[i] == Modifier) return true; + + return false; + } + + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(ImmWithLSL.Val); + return CE && CE->getValue() >= 0 && CE->getValue() <= 0xffff; + } + + template<int RegWidth, bool (*isValidImm)(int, uint64_t, int&, int&)> + bool isMoveWideMovAlias() const { + if (!isImm()) return false; + + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + + int UImm16, Shift; + uint64_t Value = CE->getValue(); + + // If this is a 32-bit instruction then all bits above 32 should be the + // same: either of these is fine because signed/unsigned values should be + // permitted. + if (RegWidth == 32) { + if ((Value >> 32) != 0 && (Value >> 32) != 0xffffffff) + return false; + + Value &= 0xffffffffULL; + } + + return isValidImm(RegWidth, Value, UImm16, Shift); + } + + bool isMSRWithReg() const { + if (!isSysReg()) return false; + + bool IsKnownRegister; + StringRef Name(SysReg.Data, SysReg.Length); + A64SysReg::MSRMapper().fromString(Name, IsKnownRegister); + + return IsKnownRegister; + } + + bool isMSRPState() const { + if (!isSysReg()) return false; + + bool IsKnownRegister; + StringRef Name(SysReg.Data, SysReg.Length); + A64PState::PStateMapper().fromString(Name, IsKnownRegister); + + return IsKnownRegister; + } + + bool isMRS() const { + if (!isSysReg()) return false; + + // First check against specific MSR-only (write-only) registers + bool IsKnownRegister; + StringRef Name(SysReg.Data, SysReg.Length); + A64SysReg::MRSMapper().fromString(Name, IsKnownRegister); + + return IsKnownRegister; + } + + bool isPRFM() const { + if (!isImm()) return false; + + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + + if (!CE) + return false; + + return CE->getValue() >= 0 && CE->getValue() <= 31; + } + + template<A64SE::ShiftExtSpecifiers SHKind> bool isRegExtend() const { + if (!isShiftOrExtend()) return false; + + if (ShiftExtend.ShiftType != SHKind) + return false; + + return ShiftExtend.Amount <= 4; + } + + bool isRegExtendLSL() const { + if (!isShiftOrExtend()) return false; + + if (ShiftExtend.ShiftType != A64SE::LSL) + return false; + + return !ShiftExtend.ImplicitAmount && ShiftExtend.Amount <= 4; + } + + template<int MemSize> bool isSImm7Scaled() const { + if (!isImm()) return false; + + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + + int64_t Val = CE->getValue(); + if (Val % MemSize != 0) return false; + + Val /= MemSize; + + return Val >= -64 && Val < 64; + } + + template<int BitWidth> + bool isSImm() const { + if (!isImm()) return false; + + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + + return CE->getValue() >= -(1LL << (BitWidth - 1)) + && CE->getValue() < (1LL << (BitWidth - 1)); + } + + template<int bitWidth> + bool isUImm() const { + if (!isImm()) return false; + + const 
MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) return false; + + return CE->getValue() >= 0 && CE->getValue() < (1LL << bitWidth); + } + + bool isUImm() const { + if (!isImm()) return false; + + return isa<MCConstantExpr>(getImm()); + } + + static AArch64Operand *CreateImmWithLSL(const MCExpr *Val, + unsigned ShiftAmount, + bool ImplicitAmount, + SMLoc S, SMLoc E) { + AArch64Operand *Op = new AArch64Operand(k_ImmWithLSL, S, E); + Op->ImmWithLSL.Val = Val; + Op->ImmWithLSL.ShiftAmount = ShiftAmount; + Op->ImmWithLSL.ImplicitAmount = ImplicitAmount; + return Op; + } + + static AArch64Operand *CreateCondCode(A64CC::CondCodes Code, + SMLoc S, SMLoc E) { + AArch64Operand *Op = new AArch64Operand(k_CondCode, S, E); + Op->CondCode.Code = Code; + return Op; + } + + static AArch64Operand *CreateFPImm(double Val, + SMLoc S, SMLoc E) { + AArch64Operand *Op = new AArch64Operand(k_FPImmediate, S, E); + Op->FPImm.Val = Val; + return Op; + } + + static AArch64Operand *CreateImm(const MCExpr *Val, SMLoc S, SMLoc E) { + AArch64Operand *Op = new AArch64Operand(k_Immediate, S, E); + Op->Imm.Val = Val; + return Op; + } + + static AArch64Operand *CreateReg(unsigned RegNum, SMLoc S, SMLoc E) { + AArch64Operand *Op = new AArch64Operand(k_Register, S, E); + Op->Reg.RegNum = RegNum; + return Op; + } + + static AArch64Operand *CreateWrappedReg(unsigned RegNum, SMLoc S, SMLoc E) { + AArch64Operand *Op = new AArch64Operand(k_WrappedRegister, S, E); + Op->Reg.RegNum = RegNum; + return Op; + } + + static AArch64Operand *CreateShiftExtend(A64SE::ShiftExtSpecifiers ShiftTyp, + unsigned Amount, + bool ImplicitAmount, + SMLoc S, SMLoc E) { + AArch64Operand *Op = new AArch64Operand(k_ShiftExtend, S, E); + Op->ShiftExtend.ShiftType = ShiftTyp; + Op->ShiftExtend.Amount = Amount; + Op->ShiftExtend.ImplicitAmount = ImplicitAmount; + return Op; + } + + static AArch64Operand *CreateSysReg(StringRef Str, SMLoc S) { + AArch64Operand *Op = new AArch64Operand(k_SysReg, S, S); + Op->Tok.Data = Str.data(); + Op->Tok.Length = Str.size(); + return Op; + } + + static AArch64Operand *CreateToken(StringRef Str, SMLoc S) { + AArch64Operand *Op = new AArch64Operand(k_Token, S, S); + Op->Tok.Data = Str.data(); + Op->Tok.Length = Str.size(); + return Op; + } + + + void addExpr(MCInst &Inst, const MCExpr *Expr) const { + // Add as immediates when possible. 
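+    // Anything still symbolic is emitted as an MCExpr and left for the
+    // relocation/fixup machinery to resolve.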
+ if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr)) + Inst.addOperand(MCOperand::CreateImm(CE->getValue())); + else + Inst.addOperand(MCOperand::CreateExpr(Expr)); + } + + template<unsigned RegWidth> + void addBFILSBOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *CE = cast<MCConstantExpr>(getImm()); + unsigned EncodedVal = (RegWidth - CE->getValue()) % RegWidth; + Inst.addOperand(MCOperand::CreateImm(EncodedVal)); + } + + void addBFIWidthOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *CE = cast<MCConstantExpr>(getImm()); + Inst.addOperand(MCOperand::CreateImm(CE->getValue() - 1)); + } + + void addBFXWidthOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + + uint64_t LSB = Inst.getOperand(Inst.getNumOperands()-1).getImm(); + const MCConstantExpr *CE = cast<MCConstantExpr>(getImm()); + + Inst.addOperand(MCOperand::CreateImm(LSB + CE->getValue() - 1)); + } + + void addCondCodeOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateImm(getCondCode())); + } + + void addCVTFixedPosOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + + const MCConstantExpr *CE = cast<MCConstantExpr>(getImm()); + Inst.addOperand(MCOperand::CreateImm(64 - CE->getValue())); + } + + void addFMOVImmOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + + APFloat RealVal(FPImm.Val); + uint32_t ImmVal; + A64Imms::isFPImm(RealVal, ImmVal); + + Inst.addOperand(MCOperand::CreateImm(ImmVal)); + } + + void addFPZeroOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands"); + Inst.addOperand(MCOperand::CreateImm(0)); + } + + void addInvCondCodeOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + unsigned Encoded = A64InvertCondCode(getCondCode()); + Inst.addOperand(MCOperand::CreateImm(Encoded)); + } + + void addRegOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateReg(getReg())); + } + + void addImmOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + addExpr(Inst, getImm()); + } + + template<int MemSize> + void addSImm7ScaledOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + + const MCConstantExpr *CE = cast<MCConstantExpr>(getImm()); + uint64_t Val = CE->getValue() / MemSize; + Inst.addOperand(MCOperand::CreateImm(Val & 0x7f)); + } + + template<int BitWidth> + void addSImmOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + + const MCConstantExpr *CE = cast<MCConstantExpr>(getImm()); + uint64_t Val = CE->getValue(); + Inst.addOperand(MCOperand::CreateImm(Val & ((1ULL << BitWidth) - 1))); + } + + void addImmWithLSLOperands(MCInst &Inst, unsigned N) const { + assert (N == 1 && "Invalid number of operands!"); + + addExpr(Inst, ImmWithLSL.Val); + } + + template<unsigned field_width, unsigned scale> + void addLabelOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Imm.Val); + + if (!CE) { + addExpr(Inst, Imm.Val); + return; + } + + int64_t Val = CE->getValue(); + assert(Val % scale == 0 && "Unaligned 
immediate in instruction"); + Val /= scale; + + Inst.addOperand(MCOperand::CreateImm(Val & ((1LL << field_width) - 1))); + } + + template<int MemSize> + void addOffsetUImm12Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + + if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm())) { + Inst.addOperand(MCOperand::CreateImm(CE->getValue() / MemSize)); + } else { + Inst.addOperand(MCOperand::CreateExpr(getImm())); + } + } + + template<unsigned RegWidth> + void addLogicalImmOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands"); + const MCConstantExpr *CE = cast<MCConstantExpr>(Imm.Val); + + uint32_t Bits; + A64Imms::isLogicalImm(RegWidth, CE->getValue(), Bits); + + Inst.addOperand(MCOperand::CreateImm(Bits)); + } + + void addMRSOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + + bool Valid; + StringRef Name(SysReg.Data, SysReg.Length); + uint32_t Bits = A64SysReg::MRSMapper().fromString(Name, Valid); + + Inst.addOperand(MCOperand::CreateImm(Bits)); + } + + void addMSRWithRegOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + + bool Valid; + StringRef Name(SysReg.Data, SysReg.Length); + uint32_t Bits = A64SysReg::MSRMapper().fromString(Name, Valid); + + Inst.addOperand(MCOperand::CreateImm(Bits)); + } + + void addMSRPStateOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + + bool Valid; + StringRef Name(SysReg.Data, SysReg.Length); + uint32_t Bits = A64PState::PStateMapper().fromString(Name, Valid); + + Inst.addOperand(MCOperand::CreateImm(Bits)); + } + + void addMoveWideImmOperands(MCInst &Inst, unsigned N) const { + assert(N == 2 && "Invalid number of operands!"); + + addExpr(Inst, ImmWithLSL.Val); + + AArch64MCExpr::VariantKind Variant; + if (!isNonConstantExpr(ImmWithLSL.Val, Variant)) { + Inst.addOperand(MCOperand::CreateImm(ImmWithLSL.ShiftAmount / 16)); + return; + } + + // We know it's relocated + switch (Variant) { + case AArch64MCExpr::VK_AARCH64_ABS_G0: + case AArch64MCExpr::VK_AARCH64_ABS_G0_NC: + case AArch64MCExpr::VK_AARCH64_SABS_G0: + case AArch64MCExpr::VK_AARCH64_DTPREL_G0: + case AArch64MCExpr::VK_AARCH64_DTPREL_G0_NC: + case AArch64MCExpr::VK_AARCH64_GOTTPREL_G0_NC: + case AArch64MCExpr::VK_AARCH64_TPREL_G0: + case AArch64MCExpr::VK_AARCH64_TPREL_G0_NC: + Inst.addOperand(MCOperand::CreateImm(0)); + break; + case AArch64MCExpr::VK_AARCH64_ABS_G1: + case AArch64MCExpr::VK_AARCH64_ABS_G1_NC: + case AArch64MCExpr::VK_AARCH64_SABS_G1: + case AArch64MCExpr::VK_AARCH64_DTPREL_G1: + case AArch64MCExpr::VK_AARCH64_DTPREL_G1_NC: + case AArch64MCExpr::VK_AARCH64_GOTTPREL_G1: + case AArch64MCExpr::VK_AARCH64_TPREL_G1: + case AArch64MCExpr::VK_AARCH64_TPREL_G1_NC: + Inst.addOperand(MCOperand::CreateImm(1)); + break; + case AArch64MCExpr::VK_AARCH64_ABS_G2: + case AArch64MCExpr::VK_AARCH64_ABS_G2_NC: + case AArch64MCExpr::VK_AARCH64_SABS_G2: + case AArch64MCExpr::VK_AARCH64_DTPREL_G2: + case AArch64MCExpr::VK_AARCH64_TPREL_G2: + Inst.addOperand(MCOperand::CreateImm(2)); + break; + case AArch64MCExpr::VK_AARCH64_ABS_G3: + Inst.addOperand(MCOperand::CreateImm(3)); + break; + default: llvm_unreachable("Inappropriate move wide relocation"); + } + } + + template<int RegWidth, bool isValidImm(int, uint64_t, int&, int&)> + void addMoveWideMovAliasOperands(MCInst &Inst, unsigned N) const { + assert(N == 2 && "Invalid number of operands!"); + int UImm16, Shift; + + const 
MCConstantExpr *CE = cast<MCConstantExpr>(getImm()); + uint64_t Value = CE->getValue(); + + if (RegWidth == 32) { + Value &= 0xffffffffULL; + } + + bool Valid = isValidImm(RegWidth, Value, UImm16, Shift); + (void)Valid; + assert(Valid && "Invalid immediates should have been weeded out by now"); + + Inst.addOperand(MCOperand::CreateImm(UImm16)); + Inst.addOperand(MCOperand::CreateImm(Shift)); + } + + void addPRFMOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + + const MCConstantExpr *CE = cast<MCConstantExpr>(getImm()); + assert(CE->getValue() >= 0 && CE->getValue() <= 31 + && "PRFM operand should be 5-bits"); + + Inst.addOperand(MCOperand::CreateImm(CE->getValue())); + } + + // For Add-sub (extended register) operands. + void addRegExtendOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + + Inst.addOperand(MCOperand::CreateImm(ShiftExtend.Amount)); + } + + // For the extend in load-store (register offset) instructions. + template<unsigned MemSize> + void addAddrRegExtendOperands(MCInst &Inst, unsigned N) const { + addAddrRegExtendOperands(Inst, N, MemSize); + } + + void addAddrRegExtendOperands(MCInst &Inst, unsigned N, + unsigned MemSize) const { + assert(N == 1 && "Invalid number of operands!"); + + // First bit of Option is set in instruction classes, the high two bits are + // as follows: + unsigned OptionHi = 0; + switch (ShiftExtend.ShiftType) { + case A64SE::UXTW: + case A64SE::LSL: + OptionHi = 1; + break; + case A64SE::SXTW: + case A64SE::SXTX: + OptionHi = 3; + break; + default: + llvm_unreachable("Invalid extend type for register offset"); + } + + unsigned S = 0; + if (MemSize == 1 && !ShiftExtend.ImplicitAmount) + S = 1; + else if (MemSize != 1 && ShiftExtend.Amount != 0) + S = 1; + + Inst.addOperand(MCOperand::CreateImm((OptionHi << 1) | S)); + } + void addShiftOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + + Inst.addOperand(MCOperand::CreateImm(ShiftExtend.Amount)); + } +}; + +} // end anonymous namespace. + +AArch64AsmParser::OperandMatchResultTy +AArch64AsmParser::ParseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands, + StringRef Mnemonic) { + + // See if the operand has a custom parser + OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic); + + // It could either succeed, fail or just not care. + if (ResTy != MatchOperand_NoMatch) + return ResTy; + + switch (getLexer().getKind()) { + default: + Error(Parser.getTok().getLoc(), "unexpected token in operand"); + return MatchOperand_ParseFail; + case AsmToken::Identifier: { + // It might be in the LSL/UXTB family ... + OperandMatchResultTy GotShift = ParseShiftExtend(Operands); + + // We can only continue if no tokens were eaten. + if (GotShift != MatchOperand_NoMatch) + return GotShift; + + // ... or it might be a register ... + uint32_t NumLanes = 0; + OperandMatchResultTy GotReg = ParseRegister(Operands, NumLanes); + assert(GotReg != MatchOperand_ParseFail + && "register parsing shouldn't partially succeed"); + + if (GotReg == MatchOperand_Success) { + if (Parser.getTok().is(AsmToken::LBrac)) + return ParseNEONLane(Operands, NumLanes); + else + return MatchOperand_Success; + } + + // ... or it might be a symbolish thing + } + // Fall through + case AsmToken::LParen: // E.g. (strcmp-4) + case AsmToken::Integer: // 1f, 2b labels + case AsmToken::String: // quoted labels + case AsmToken::Dot: // . 
is Current location + case AsmToken::Dollar: // $ is PC + case AsmToken::Colon: { + SMLoc StartLoc = Parser.getTok().getLoc(); + SMLoc EndLoc; + const MCExpr *ImmVal = 0; + + if (ParseImmediate(ImmVal) != MatchOperand_Success) + return MatchOperand_ParseFail; + + EndLoc = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1); + Operands.push_back(AArch64Operand::CreateImm(ImmVal, StartLoc, EndLoc)); + return MatchOperand_Success; + } + case AsmToken::Hash: { // Immediates + SMLoc StartLoc = Parser.getTok().getLoc(); + SMLoc EndLoc; + const MCExpr *ImmVal = 0; + Parser.Lex(); + + if (ParseImmediate(ImmVal) != MatchOperand_Success) + return MatchOperand_ParseFail; + + EndLoc = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1); + Operands.push_back(AArch64Operand::CreateImm(ImmVal, StartLoc, EndLoc)); + return MatchOperand_Success; + } + case AsmToken::LBrac: { + SMLoc Loc = Parser.getTok().getLoc(); + Operands.push_back(AArch64Operand::CreateToken("[", Loc)); + Parser.Lex(); // Eat '[' + + // There's no comma after a '[', so we can parse the next operand + // immediately. + return ParseOperand(Operands, Mnemonic); + } + // The following will likely be useful later, but not in very early cases + case AsmToken::LCurly: // Weird SIMD lists + llvm_unreachable("Don't know how to deal with '{' in operand"); + return MatchOperand_ParseFail; + } +} + +AArch64AsmParser::OperandMatchResultTy +AArch64AsmParser::ParseImmediate(const MCExpr *&ExprVal) { + if (getLexer().is(AsmToken::Colon)) { + AArch64MCExpr::VariantKind RefKind; + + OperandMatchResultTy ResTy = ParseRelocPrefix(RefKind); + if (ResTy != MatchOperand_Success) + return ResTy; + + const MCExpr *SubExprVal; + if (getParser().parseExpression(SubExprVal)) + return MatchOperand_ParseFail; + + ExprVal = AArch64MCExpr::Create(RefKind, SubExprVal, getContext()); + return MatchOperand_Success; + } + + // No weird AArch64MCExpr prefix + return getParser().parseExpression(ExprVal) + ? MatchOperand_ParseFail : MatchOperand_Success; +} + +// A lane attached to a NEON register. "[N]", which should yield three tokens: +// '[', N, ']'. A hash is not allowed to precede the immediate here. 
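+// For example, in "umov w0, v0.s[1]" the "[1]" is parsed here, and the lane
+// number must be below the 4 lanes implied by the ".s" layout.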
+AArch64AsmParser::OperandMatchResultTy +AArch64AsmParser::ParseNEONLane(SmallVectorImpl<MCParsedAsmOperand*> &Operands, + uint32_t NumLanes) { + SMLoc Loc = Parser.getTok().getLoc(); + + assert(Parser.getTok().is(AsmToken::LBrac) && "inappropriate operand"); + Operands.push_back(AArch64Operand::CreateToken("[", Loc)); + Parser.Lex(); // Eat '[' + + if (Parser.getTok().isNot(AsmToken::Integer)) { + Error(Parser.getTok().getLoc(), "expected lane number"); + return MatchOperand_ParseFail; + } + + if (Parser.getTok().getIntVal() >= NumLanes) { + Error(Parser.getTok().getLoc(), "lane number incompatible with layout"); + return MatchOperand_ParseFail; + } + + const MCExpr *Lane = MCConstantExpr::Create(Parser.getTok().getIntVal(), + getContext()); + SMLoc S = Parser.getTok().getLoc(); + Parser.Lex(); // Eat actual lane + SMLoc E = Parser.getTok().getLoc(); + Operands.push_back(AArch64Operand::CreateImm(Lane, S, E)); + + + if (Parser.getTok().isNot(AsmToken::RBrac)) { + Error(Parser.getTok().getLoc(), "expected ']' after lane"); + return MatchOperand_ParseFail; + } + + Operands.push_back(AArch64Operand::CreateToken("]", Loc)); + Parser.Lex(); // Eat ']' + + return MatchOperand_Success; +} + +AArch64AsmParser::OperandMatchResultTy +AArch64AsmParser::ParseRelocPrefix(AArch64MCExpr::VariantKind &RefKind) { + assert(getLexer().is(AsmToken::Colon) && "expected a ':'"); + Parser.Lex(); + + if (getLexer().isNot(AsmToken::Identifier)) { + Error(Parser.getTok().getLoc(), + "expected relocation specifier in operand after ':'"); + return MatchOperand_ParseFail; + } + + std::string LowerCase = Parser.getTok().getIdentifier().lower(); + RefKind = StringSwitch<AArch64MCExpr::VariantKind>(LowerCase) + .Case("got", AArch64MCExpr::VK_AARCH64_GOT) + .Case("got_lo12", AArch64MCExpr::VK_AARCH64_GOT_LO12) + .Case("lo12", AArch64MCExpr::VK_AARCH64_LO12) + .Case("abs_g0", AArch64MCExpr::VK_AARCH64_ABS_G0) + .Case("abs_g0_nc", AArch64MCExpr::VK_AARCH64_ABS_G0_NC) + .Case("abs_g1", AArch64MCExpr::VK_AARCH64_ABS_G1) + .Case("abs_g1_nc", AArch64MCExpr::VK_AARCH64_ABS_G1_NC) + .Case("abs_g2", AArch64MCExpr::VK_AARCH64_ABS_G2) + .Case("abs_g2_nc", AArch64MCExpr::VK_AARCH64_ABS_G2_NC) + .Case("abs_g3", AArch64MCExpr::VK_AARCH64_ABS_G3) + .Case("abs_g0_s", AArch64MCExpr::VK_AARCH64_SABS_G0) + .Case("abs_g1_s", AArch64MCExpr::VK_AARCH64_SABS_G1) + .Case("abs_g2_s", AArch64MCExpr::VK_AARCH64_SABS_G2) + .Case("dtprel_g2", AArch64MCExpr::VK_AARCH64_DTPREL_G2) + .Case("dtprel_g1", AArch64MCExpr::VK_AARCH64_DTPREL_G1) + .Case("dtprel_g1_nc", AArch64MCExpr::VK_AARCH64_DTPREL_G1_NC) + .Case("dtprel_g0", AArch64MCExpr::VK_AARCH64_DTPREL_G0) + .Case("dtprel_g0_nc", AArch64MCExpr::VK_AARCH64_DTPREL_G0_NC) + .Case("dtprel_hi12", AArch64MCExpr::VK_AARCH64_DTPREL_HI12) + .Case("dtprel_lo12", AArch64MCExpr::VK_AARCH64_DTPREL_LO12) + .Case("dtprel_lo12_nc", AArch64MCExpr::VK_AARCH64_DTPREL_LO12_NC) + .Case("gottprel_g1", AArch64MCExpr::VK_AARCH64_GOTTPREL_G1) + .Case("gottprel_g0_nc", AArch64MCExpr::VK_AARCH64_GOTTPREL_G0_NC) + .Case("gottprel", AArch64MCExpr::VK_AARCH64_GOTTPREL) + .Case("gottprel_lo12", AArch64MCExpr::VK_AARCH64_GOTTPREL_LO12) + .Case("tprel_g2", AArch64MCExpr::VK_AARCH64_TPREL_G2) + .Case("tprel_g1", AArch64MCExpr::VK_AARCH64_TPREL_G1) + .Case("tprel_g1_nc", AArch64MCExpr::VK_AARCH64_TPREL_G1_NC) + .Case("tprel_g0", AArch64MCExpr::VK_AARCH64_TPREL_G0) + .Case("tprel_g0_nc", AArch64MCExpr::VK_AARCH64_TPREL_G0_NC) + .Case("tprel_hi12", AArch64MCExpr::VK_AARCH64_TPREL_HI12) + .Case("tprel_lo12", 
AArch64MCExpr::VK_AARCH64_TPREL_LO12) + .Case("tprel_lo12_nc", AArch64MCExpr::VK_AARCH64_TPREL_LO12_NC) + .Case("tlsdesc", AArch64MCExpr::VK_AARCH64_TLSDESC) + .Case("tlsdesc_lo12", AArch64MCExpr::VK_AARCH64_TLSDESC_LO12) + .Default(AArch64MCExpr::VK_AARCH64_None); + + if (RefKind == AArch64MCExpr::VK_AARCH64_None) { + Error(Parser.getTok().getLoc(), + "expected relocation specifier in operand after ':'"); + return MatchOperand_ParseFail; + } + Parser.Lex(); // Eat identifier + + if (getLexer().isNot(AsmToken::Colon)) { + Error(Parser.getTok().getLoc(), + "expected ':' after relocation specifier"); + return MatchOperand_ParseFail; + } + Parser.Lex(); + return MatchOperand_Success; +} + +AArch64AsmParser::OperandMatchResultTy +AArch64AsmParser::ParseImmWithLSLOperand( + SmallVectorImpl<MCParsedAsmOperand*> &Operands) { + // FIXME?: I want to live in a world where immediates must start with + // #. Please don't dash my hopes (well, do if you have a good reason). + if (Parser.getTok().isNot(AsmToken::Hash)) return MatchOperand_NoMatch; + + SMLoc S = Parser.getTok().getLoc(); + Parser.Lex(); // Eat '#' + + const MCExpr *Imm; + if (ParseImmediate(Imm) != MatchOperand_Success) + return MatchOperand_ParseFail; + else if (Parser.getTok().isNot(AsmToken::Comma)) { + SMLoc E = Parser.getTok().getLoc(); + Operands.push_back(AArch64Operand::CreateImmWithLSL(Imm, 0, true, S, E)); + return MatchOperand_Success; + } + + // Eat ',' + Parser.Lex(); + + // The optional operand must be "lsl #N" where N is non-negative. + if (Parser.getTok().is(AsmToken::Identifier) + && Parser.getTok().getIdentifier().lower() == "lsl") { + Parser.Lex(); + + if (Parser.getTok().is(AsmToken::Hash)) { + Parser.Lex(); + + if (Parser.getTok().isNot(AsmToken::Integer)) { + Error(Parser.getTok().getLoc(), "only 'lsl #+N' valid after immediate"); + return MatchOperand_ParseFail; + } + } + } + + int64_t ShiftAmount = Parser.getTok().getIntVal(); + + if (ShiftAmount < 0) { + Error(Parser.getTok().getLoc(), "positive shift amount required"); + return MatchOperand_ParseFail; + } + Parser.Lex(); // Eat the number + + SMLoc E = Parser.getTok().getLoc(); + Operands.push_back(AArch64Operand::CreateImmWithLSL(Imm, ShiftAmount, + false, S, E)); + return MatchOperand_Success; +} + + +AArch64AsmParser::OperandMatchResultTy +AArch64AsmParser::ParseCondCodeOperand( + SmallVectorImpl<MCParsedAsmOperand*> &Operands) { + if (Parser.getTok().isNot(AsmToken::Identifier)) + return MatchOperand_NoMatch; + + StringRef Tok = Parser.getTok().getIdentifier(); + A64CC::CondCodes CondCode = A64StringToCondCode(Tok); + + if (CondCode == A64CC::Invalid) + return MatchOperand_NoMatch; + + SMLoc S = Parser.getTok().getLoc(); + Parser.Lex(); // Eat condition code + SMLoc E = Parser.getTok().getLoc(); + + Operands.push_back(AArch64Operand::CreateCondCode(CondCode, S, E)); + return MatchOperand_Success; +} + +AArch64AsmParser::OperandMatchResultTy +AArch64AsmParser::ParseCRxOperand( + SmallVectorImpl<MCParsedAsmOperand*> &Operands) { + SMLoc S = Parser.getTok().getLoc(); + if (Parser.getTok().isNot(AsmToken::Identifier)) { + Error(S, "Expected cN operand where 0 <= N <= 15"); + return MatchOperand_ParseFail; + } + + std::string LowerTok = Parser.getTok().getIdentifier().lower(); + StringRef Tok(LowerTok); + if (Tok[0] != 'c') { + Error(S, "Expected cN operand where 0 <= N <= 15"); + return MatchOperand_ParseFail; + } + + uint32_t CRNum; + bool BadNum = Tok.drop_front().getAsInteger(10, CRNum); + if (BadNum || CRNum > 15) { + Error(S, "Expected cN operand where 0 <= 
N <= 15"); + return MatchOperand_ParseFail; + } + + const MCExpr *CRImm = MCConstantExpr::Create(CRNum, getContext()); + + Parser.Lex(); + SMLoc E = Parser.getTok().getLoc(); + + Operands.push_back(AArch64Operand::CreateImm(CRImm, S, E)); + return MatchOperand_Success; +} + +AArch64AsmParser::OperandMatchResultTy +AArch64AsmParser::ParseFPImmOperand( + SmallVectorImpl<MCParsedAsmOperand*> &Operands) { + + // FIXME?: I want to live in a world where immediates must start with + // #. Please don't dash my hopes (well, do if you have a good reason). + if (Parser.getTok().isNot(AsmToken::Hash)) return MatchOperand_NoMatch; + + SMLoc S = Parser.getTok().getLoc(); + Parser.Lex(); // Eat '#' + + bool Negative = false; + if (Parser.getTok().is(AsmToken::Minus)) { + Negative = true; + Parser.Lex(); // Eat '-' + } else if (Parser.getTok().is(AsmToken::Plus)) { + Parser.Lex(); // Eat '+' + } + + if (Parser.getTok().isNot(AsmToken::Real)) { + Error(S, "Expected floating-point immediate"); + return MatchOperand_ParseFail; + } + + APFloat RealVal(APFloat::IEEEdouble, Parser.getTok().getString()); + if (Negative) RealVal.changeSign(); + double DblVal = RealVal.convertToDouble(); + + Parser.Lex(); // Eat real number + SMLoc E = Parser.getTok().getLoc(); + + Operands.push_back(AArch64Operand::CreateFPImm(DblVal, S, E)); + return MatchOperand_Success; +} + + +// Automatically generated +static unsigned MatchRegisterName(StringRef Name); + +bool +AArch64AsmParser::IdentifyRegister(unsigned &RegNum, SMLoc &RegEndLoc, + StringRef &Layout, + SMLoc &LayoutLoc) const { + const AsmToken &Tok = Parser.getTok(); + + if (Tok.isNot(AsmToken::Identifier)) + return false; + + std::string LowerReg = Tok.getString().lower(); + size_t DotPos = LowerReg.find('.'); + + RegNum = MatchRegisterName(LowerReg.substr(0, DotPos)); + if (RegNum == AArch64::NoRegister) { + RegNum = StringSwitch<unsigned>(LowerReg.substr(0, DotPos)) + .Case("ip0", AArch64::X16) + .Case("ip1", AArch64::X17) + .Case("fp", AArch64::X29) + .Case("lr", AArch64::X30) + .Default(AArch64::NoRegister); + } + if (RegNum == AArch64::NoRegister) + return false; + + SMLoc S = Tok.getLoc(); + RegEndLoc = SMLoc::getFromPointer(S.getPointer() + DotPos); + + if (DotPos == StringRef::npos) { + Layout = StringRef(); + } else { + // Everything afterwards needs to be a literal token, expected to be + // '.2d','.b' etc for vector registers. + + // This StringSwitch validates the input and (perhaps more importantly) + // gives us a permanent string to use in the token (a pointer into LowerReg + // would go out of scope when we return). 
+ LayoutLoc = SMLoc::getFromPointer(S.getPointer() + DotPos + 1); + std::string LayoutText = LowerReg.substr(DotPos, StringRef::npos); + Layout = StringSwitch<const char *>(LayoutText) + .Case(".d", ".d").Case(".1d", ".1d").Case(".2d", ".2d") + .Case(".s", ".s").Case(".2s", ".2s").Case(".4s", ".4s") + .Case(".h", ".h").Case(".4h", ".4h").Case(".8h", ".8h") + .Case(".b", ".b").Case(".8b", ".8b").Case(".16b", ".16b") + .Default(""); + + if (Layout.size() == 0) { + // Malformed register + return false; + } + } + + return true; +} + +AArch64AsmParser::OperandMatchResultTy +AArch64AsmParser::ParseRegister(SmallVectorImpl<MCParsedAsmOperand*> &Operands, + uint32_t &NumLanes) { + unsigned RegNum; + StringRef Layout; + SMLoc RegEndLoc, LayoutLoc; + SMLoc S = Parser.getTok().getLoc(); + + if (!IdentifyRegister(RegNum, RegEndLoc, Layout, LayoutLoc)) + return MatchOperand_NoMatch; + + Operands.push_back(AArch64Operand::CreateReg(RegNum, S, RegEndLoc)); + + if (Layout.size() != 0) { + unsigned long long TmpLanes = 0; + llvm::getAsUnsignedInteger(Layout.substr(1), 10, TmpLanes); + if (TmpLanes != 0) { + NumLanes = TmpLanes; + } else { + // If the number of lanes isn't specified explicitly, a valid instruction + // will have an element specifier and be capable of acting on the entire + // vector register. + switch (Layout.back()) { + default: llvm_unreachable("Invalid layout specifier"); + case 'b': NumLanes = 16; break; + case 'h': NumLanes = 8; break; + case 's': NumLanes = 4; break; + case 'd': NumLanes = 2; break; + } + } + + Operands.push_back(AArch64Operand::CreateToken(Layout, LayoutLoc)); + } + + Parser.Lex(); + return MatchOperand_Success; +} + +bool +AArch64AsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, + SMLoc &EndLoc) { + // This callback is used for things like DWARF frame directives in + // assembly. They don't care about things like NEON layouts or lanes, they + // just want to be able to produce the DWARF register number. + StringRef LayoutSpec; + SMLoc RegEndLoc, LayoutLoc; + StartLoc = Parser.getTok().getLoc(); + + if (!IdentifyRegister(RegNo, RegEndLoc, LayoutSpec, LayoutLoc)) + return true; + + Parser.Lex(); + EndLoc = Parser.getTok().getLoc(); + + return false; +} + +AArch64AsmParser::OperandMatchResultTy +AArch64AsmParser::ParseNamedImmOperand(const NamedImmMapper &Mapper, + SmallVectorImpl<MCParsedAsmOperand*> &Operands) { + // Since these operands occur in very limited circumstances, without + // alternatives, we actually signal an error if there is no match. If relaxing + // this, beware of unintended consequences: an immediate will be accepted + // during matching, no matter how it gets into the AArch64Operand. + const AsmToken &Tok = Parser.getTok(); + SMLoc S = Tok.getLoc(); + + if (Tok.is(AsmToken::Identifier)) { + bool ValidName; + uint32_t Code = Mapper.fromString(Tok.getString().lower(), ValidName); + + if (!ValidName) { + Error(S, "operand specifier not recognised"); + return MatchOperand_ParseFail; + } + + Parser.Lex(); // We're done with the identifier. 
Eat it + + SMLoc E = Parser.getTok().getLoc(); + const MCExpr *Imm = MCConstantExpr::Create(Code, getContext()); + Operands.push_back(AArch64Operand::CreateImm(Imm, S, E)); + return MatchOperand_Success; + } else if (Tok.is(AsmToken::Hash)) { + Parser.Lex(); + + const MCExpr *ImmVal; + if (ParseImmediate(ImmVal) != MatchOperand_Success) + return MatchOperand_ParseFail; + + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(ImmVal); + if (!CE || CE->getValue() < 0 || !Mapper.validImm(CE->getValue())) { + Error(S, "Invalid immediate for instruction"); + return MatchOperand_ParseFail; + } + + SMLoc E = Parser.getTok().getLoc(); + Operands.push_back(AArch64Operand::CreateImm(ImmVal, S, E)); + return MatchOperand_Success; + } + + Error(S, "unexpected operand for instruction"); + return MatchOperand_ParseFail; +} + +AArch64AsmParser::OperandMatchResultTy +AArch64AsmParser::ParseSysRegOperand( + SmallVectorImpl<MCParsedAsmOperand*> &Operands) { + const AsmToken &Tok = Parser.getTok(); + + // Any MSR/MRS operand will be an identifier, and we want to store it as some + // kind of string: SPSel is valid for two different forms of MSR with two + // different encodings. There's no collision at the moment, but the potential + // is there. + if (!Tok.is(AsmToken::Identifier)) { + return MatchOperand_NoMatch; + } + + SMLoc S = Tok.getLoc(); + Operands.push_back(AArch64Operand::CreateSysReg(Tok.getString(), S)); + Parser.Lex(); // Eat identifier + + return MatchOperand_Success; +} + +AArch64AsmParser::OperandMatchResultTy +AArch64AsmParser::ParseLSXAddressOperand( + SmallVectorImpl<MCParsedAsmOperand*> &Operands) { + SMLoc S = Parser.getTok().getLoc(); + + unsigned RegNum; + SMLoc RegEndLoc, LayoutLoc; + StringRef Layout; + if(!IdentifyRegister(RegNum, RegEndLoc, Layout, LayoutLoc) + || !AArch64MCRegisterClasses[AArch64::GPR64xspRegClassID].contains(RegNum) + || Layout.size() != 0) { + // Check Layout.size because we don't want to let "x3.4s" or similar + // through. 
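+    // An exclusive load/store address is always a bare 64-bit base register
+    // (xN or sp), so a token carrying a vector layout cannot begin one.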
+ return MatchOperand_NoMatch; + } + Parser.Lex(); // Eat register + + if (Parser.getTok().is(AsmToken::RBrac)) { + // We're done + SMLoc E = Parser.getTok().getLoc(); + Operands.push_back(AArch64Operand::CreateWrappedReg(RegNum, S, E)); + return MatchOperand_Success; + } + + // Otherwise, only ", #0" is valid + + if (Parser.getTok().isNot(AsmToken::Comma)) { + Error(Parser.getTok().getLoc(), "expected ',' or ']' after register"); + return MatchOperand_ParseFail; + } + Parser.Lex(); // Eat ',' + + if (Parser.getTok().isNot(AsmToken::Hash)) { + Error(Parser.getTok().getLoc(), "expected '#0'"); + return MatchOperand_ParseFail; + } + Parser.Lex(); // Eat '#' + + if (Parser.getTok().isNot(AsmToken::Integer) + || Parser.getTok().getIntVal() != 0 ) { + Error(Parser.getTok().getLoc(), "expected '#0'"); + return MatchOperand_ParseFail; + } + Parser.Lex(); // Eat '0' + + SMLoc E = Parser.getTok().getLoc(); + Operands.push_back(AArch64Operand::CreateWrappedReg(RegNum, S, E)); + return MatchOperand_Success; +} + +AArch64AsmParser::OperandMatchResultTy +AArch64AsmParser::ParseShiftExtend( + SmallVectorImpl<MCParsedAsmOperand*> &Operands) { + StringRef IDVal = Parser.getTok().getIdentifier(); + std::string LowerID = IDVal.lower(); + + A64SE::ShiftExtSpecifiers Spec = + StringSwitch<A64SE::ShiftExtSpecifiers>(LowerID) + .Case("lsl", A64SE::LSL) + .Case("lsr", A64SE::LSR) + .Case("asr", A64SE::ASR) + .Case("ror", A64SE::ROR) + .Case("uxtb", A64SE::UXTB) + .Case("uxth", A64SE::UXTH) + .Case("uxtw", A64SE::UXTW) + .Case("uxtx", A64SE::UXTX) + .Case("sxtb", A64SE::SXTB) + .Case("sxth", A64SE::SXTH) + .Case("sxtw", A64SE::SXTW) + .Case("sxtx", A64SE::SXTX) + .Default(A64SE::Invalid); + + if (Spec == A64SE::Invalid) + return MatchOperand_NoMatch; + + // Eat the shift + SMLoc S, E; + S = Parser.getTok().getLoc(); + Parser.Lex(); + + if (Spec != A64SE::LSL && Spec != A64SE::LSR && + Spec != A64SE::ASR && Spec != A64SE::ROR) { + // The shift amount can be omitted for the extending versions, but not real + // shifts: + // add x0, x0, x0, uxtb + // is valid, and equivalent to + // add x0, x0, x0, uxtb #0 + + if (Parser.getTok().is(AsmToken::Comma) || + Parser.getTok().is(AsmToken::EndOfStatement) || + Parser.getTok().is(AsmToken::RBrac)) { + Operands.push_back(AArch64Operand::CreateShiftExtend(Spec, 0, true, + S, E)); + return MatchOperand_Success; + } + } + + // Eat # at beginning of immediate + if (!Parser.getTok().is(AsmToken::Hash)) { + Error(Parser.getTok().getLoc(), + "expected #imm after shift specifier"); + return MatchOperand_ParseFail; + } + Parser.Lex(); + + // Make sure we do actually have a number + if (!Parser.getTok().is(AsmToken::Integer)) { + Error(Parser.getTok().getLoc(), + "expected integer shift amount"); + return MatchOperand_ParseFail; + } + unsigned Amount = Parser.getTok().getIntVal(); + Parser.Lex(); + E = Parser.getTok().getLoc(); + + Operands.push_back(AArch64Operand::CreateShiftExtend(Spec, Amount, false, + S, E)); + + return MatchOperand_Success; +} + +// FIXME: We would really like to be able to tablegen'erate this. 
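+// For example, "bfi w0, w1, #4, #29" encodes as ImmR = 28, ImmS = 28: the
+// inserted field would run past bit 31, which the ImmS >= ImmR test below
+// rejects, while "bfi w0, w1, #4, #28" (ImmS = 27) is accepted.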
+bool AArch64AsmParser:: +validateInstruction(MCInst &Inst, + const SmallVectorImpl<MCParsedAsmOperand*> &Operands) { + switch (Inst.getOpcode()) { + case AArch64::BFIwwii: + case AArch64::BFIxxii: + case AArch64::SBFIZwwii: + case AArch64::SBFIZxxii: + case AArch64::UBFIZwwii: + case AArch64::UBFIZxxii: { + unsigned ImmOps = Inst.getNumOperands() - 2; + int64_t ImmR = Inst.getOperand(ImmOps).getImm(); + int64_t ImmS = Inst.getOperand(ImmOps+1).getImm(); + + if (ImmR != 0 && ImmS >= ImmR) { + return Error(Operands[4]->getStartLoc(), + "requested insert overflows register"); + } + return false; + } + case AArch64::BFXILwwii: + case AArch64::BFXILxxii: + case AArch64::SBFXwwii: + case AArch64::SBFXxxii: + case AArch64::UBFXwwii: + case AArch64::UBFXxxii: { + unsigned ImmOps = Inst.getNumOperands() - 2; + int64_t ImmR = Inst.getOperand(ImmOps).getImm(); + int64_t ImmS = Inst.getOperand(ImmOps+1).getImm(); + int64_t RegWidth = 0; + switch (Inst.getOpcode()) { + case AArch64::SBFXxxii: case AArch64::UBFXxxii: case AArch64::BFXILxxii: + RegWidth = 64; + break; + case AArch64::SBFXwwii: case AArch64::UBFXwwii: case AArch64::BFXILwwii: + RegWidth = 32; + break; + } + + if (ImmS >= RegWidth || ImmS < ImmR) { + return Error(Operands[4]->getStartLoc(), + "requested extract overflows register"); + } + return false; + } + case AArch64::ICix: { + int64_t ImmVal = Inst.getOperand(0).getImm(); + A64IC::ICValues ICOp = static_cast<A64IC::ICValues>(ImmVal); + if (!A64IC::NeedsRegister(ICOp)) { + return Error(Operands[1]->getStartLoc(), + "specified IC op does not use a register"); + } + return false; + } + case AArch64::ICi: { + int64_t ImmVal = Inst.getOperand(0).getImm(); + A64IC::ICValues ICOp = static_cast<A64IC::ICValues>(ImmVal); + if (A64IC::NeedsRegister(ICOp)) { + return Error(Operands[1]->getStartLoc(), + "specified IC op requires a register"); + } + return false; + } + case AArch64::TLBIix: { + int64_t ImmVal = Inst.getOperand(0).getImm(); + A64TLBI::TLBIValues TLBIOp = static_cast<A64TLBI::TLBIValues>(ImmVal); + if (!A64TLBI::NeedsRegister(TLBIOp)) { + return Error(Operands[1]->getStartLoc(), + "specified TLBI op does not use a register"); + } + return false; + } + case AArch64::TLBIi: { + int64_t ImmVal = Inst.getOperand(0).getImm(); + A64TLBI::TLBIValues TLBIOp = static_cast<A64TLBI::TLBIValues>(ImmVal); + if (A64TLBI::NeedsRegister(TLBIOp)) { + return Error(Operands[1]->getStartLoc(), + "specified TLBI op requires a register"); + } + return false; + } + } + + return false; +} + + +// Parses the instruction *together with* all operands, appending each parsed +// operand to the "Operands" list +bool AArch64AsmParser::ParseInstruction(ParseInstructionInfo &Info, + StringRef Name, SMLoc NameLoc, + SmallVectorImpl<MCParsedAsmOperand*> &Operands) { + size_t CondCodePos = Name.find('.'); + + StringRef Mnemonic = Name.substr(0, CondCodePos); + Operands.push_back(AArch64Operand::CreateToken(Mnemonic, NameLoc)); + + if (CondCodePos != StringRef::npos) { + // We have a condition code + SMLoc S = SMLoc::getFromPointer(NameLoc.getPointer() + CondCodePos + 1); + StringRef CondStr = Name.substr(CondCodePos + 1, StringRef::npos); + A64CC::CondCodes Code; + + Code = A64StringToCondCode(CondStr); + + if (Code == A64CC::Invalid) { + Error(S, "invalid condition code"); + Parser.eatToEndOfStatement(); + return true; + } + + SMLoc DotL = SMLoc::getFromPointer(NameLoc.getPointer() + CondCodePos); + + Operands.push_back(AArch64Operand::CreateToken(".", DotL)); + SMLoc E = SMLoc::getFromPointer(NameLoc.getPointer() + 
CondCodePos + 3); + Operands.push_back(AArch64Operand::CreateCondCode(Code, S, E)); + } + + // Now we parse the operands of this instruction + if (getLexer().isNot(AsmToken::EndOfStatement)) { + // Read the first operand. + if (ParseOperand(Operands, Mnemonic)) { + Parser.eatToEndOfStatement(); + return true; + } + + while (getLexer().is(AsmToken::Comma)) { + Parser.Lex(); // Eat the comma. + + // Parse and remember the operand. + if (ParseOperand(Operands, Mnemonic)) { + Parser.eatToEndOfStatement(); + return true; + } + + + // After successfully parsing some operands there are two special cases to + // consider (i.e. notional operands not separated by commas). Both are due + // to memory specifiers: + // + An RBrac will end an address for load/store/prefetch + // + An '!' will indicate a pre-indexed operation. + // + // It's someone else's responsibility to make sure these tokens are sane + // in the given context! + if (Parser.getTok().is(AsmToken::RBrac)) { + SMLoc Loc = Parser.getTok().getLoc(); + Operands.push_back(AArch64Operand::CreateToken("]", Loc)); + Parser.Lex(); + } + + if (Parser.getTok().is(AsmToken::Exclaim)) { + SMLoc Loc = Parser.getTok().getLoc(); + Operands.push_back(AArch64Operand::CreateToken("!", Loc)); + Parser.Lex(); + } + } + } + + if (getLexer().isNot(AsmToken::EndOfStatement)) { + SMLoc Loc = getLexer().getLoc(); + Parser.eatToEndOfStatement(); + return Error(Loc, "expected comma before next operand"); + } + + // Eat the EndOfStatement + Parser.Lex(); + + return false; +} + +bool AArch64AsmParser::ParseDirective(AsmToken DirectiveID) { + StringRef IDVal = DirectiveID.getIdentifier(); + if (IDVal == ".hword") + return ParseDirectiveWord(2, DirectiveID.getLoc()); + else if (IDVal == ".word") + return ParseDirectiveWord(4, DirectiveID.getLoc()); + else if (IDVal == ".xword") + return ParseDirectiveWord(8, DirectiveID.getLoc()); + else if (IDVal == ".tlsdesccall") + return ParseDirectiveTLSDescCall(DirectiveID.getLoc()); + + return true; +} + +/// parseDirectiveWord +/// ::= .word [ expression (, expression)* ] +bool AArch64AsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) { + if (getLexer().isNot(AsmToken::EndOfStatement)) { + for (;;) { + const MCExpr *Value; + if (getParser().parseExpression(Value)) + return true; + + getParser().getStreamer().EmitValue(Value, Size, 0/*addrspace*/); + + if (getLexer().is(AsmToken::EndOfStatement)) + break; + + // FIXME: Improve diagnostic. 
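+      // e.g. ".word 0x1234 0x5678" lands here with an Integer token where
+      // the comma separator was expected.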
+ if (getLexer().isNot(AsmToken::Comma)) + return Error(L, "unexpected token in directive"); + Parser.Lex(); + } + } + + Parser.Lex(); + return false; +} + +// parseDirectiveTLSDescCall: +// ::= .tlsdesccall symbol +bool AArch64AsmParser::ParseDirectiveTLSDescCall(SMLoc L) { + StringRef Name; + if (getParser().parseIdentifier(Name)) + return Error(L, "expected symbol after directive"); + + MCSymbol *Sym = getContext().GetOrCreateSymbol(Name); + const MCSymbolRefExpr *Expr = MCSymbolRefExpr::Create(Sym, getContext()); + + MCInst Inst; + Inst.setOpcode(AArch64::TLSDESCCALL); + Inst.addOperand(MCOperand::CreateExpr(Expr)); + + getParser().getStreamer().EmitInstruction(Inst); + return false; +} + + +bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, + SmallVectorImpl<MCParsedAsmOperand*> &Operands, + MCStreamer &Out, unsigned &ErrorInfo, + bool MatchingInlineAsm) { + MCInst Inst; + unsigned MatchResult; + MatchResult = MatchInstructionImpl(Operands, Inst, ErrorInfo, + MatchingInlineAsm); + + if (ErrorInfo != ~0U && ErrorInfo >= Operands.size()) + return Error(IDLoc, "too few operands for instruction"); + + switch (MatchResult) { + default: break; + case Match_Success: + if (validateInstruction(Inst, Operands)) + return true; + + Out.EmitInstruction(Inst); + return false; + case Match_MissingFeature: + Error(IDLoc, "instruction requires a CPU feature not currently enabled"); + return true; + case Match_InvalidOperand: { + SMLoc ErrorLoc = IDLoc; + if (ErrorInfo != ~0U) { + ErrorLoc = ((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(); + if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc; + } + + return Error(ErrorLoc, "invalid operand for instruction"); + } + case Match_MnemonicFail: + return Error(IDLoc, "invalid instruction"); + + case Match_AddSubRegExtendSmall: + return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), + "expected '[su]xt[bhw]' or 'lsl' with optional integer in range [0, 4]"); + case Match_AddSubRegExtendLarge: + return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), + "expected 'sxtx' 'uxtx' or 'lsl' with optional integer in range [0, 4]"); + case Match_AddSubRegShift32: + return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), + "expected 'lsl', 'lsr' or 'asr' with optional integer in range [0, 31]"); + case Match_AddSubRegShift64: + return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), + "expected 'lsl', 'lsr' or 'asr' with optional integer in range [0, 63]"); + case Match_AddSubSecondSource: + return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), + "expected compatible register, symbol or integer in range [0, 4095]"); + case Match_CVTFixedPos32: + return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), + "expected integer in range [1, 32]"); + case Match_CVTFixedPos64: + return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), + "expected integer in range [1, 64]"); + case Match_CondCode: + return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), + "expected AArch64 condition code"); + case Match_FPImm: + // Any situation which allows a nontrivial floating-point constant also + // allows a register. 
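+    // (e.g. "fmov d0, #0.5" and "fmov d0, d1" are both valid forms, hence
+    // the combined suggestion in this message.)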
+ return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), + "expected compatible register or floating-point constant"); + case Match_FPZero: + return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), + "expected floating-point constant #0.0"); + case Match_Label: + return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), + "expected label or encodable integer pc offset"); + case Match_Lane1: + return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), + "expected lane specifier '[1]'"); + case Match_LoadStoreExtend32_1: + return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), + "expected 'uxtw' or 'sxtw' with optional shift of #0"); + case Match_LoadStoreExtend32_2: + return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), + "expected 'uxtw' or 'sxtw' with optional shift of #0 or #1"); + case Match_LoadStoreExtend32_4: + return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), + "expected 'uxtw' or 'sxtw' with optional shift of #0 or #2"); + case Match_LoadStoreExtend32_8: + return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), + "expected 'uxtw' or 'sxtw' with optional shift of #0 or #3"); + case Match_LoadStoreExtend32_16: + return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), + "expected 'lsl' or 'sxtw' with optional shift of #0 or #4"); + case Match_LoadStoreExtend64_1: + return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), + "expected 'lsl' or 'sxtx' with optional shift of #0"); + case Match_LoadStoreExtend64_2: + return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), + "expected 'lsl' or 'sxtx' with optional shift of #0 or #1"); + case Match_LoadStoreExtend64_4: + return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), + "expected 'lsl' or 'sxtx' with optional shift of #0 or #2"); + case Match_LoadStoreExtend64_8: + return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), + "expected 'lsl' or 'sxtx' with optional shift of #0 or #3"); + case Match_LoadStoreExtend64_16: + return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), + "expected 'lsl' or 'sxtx' with optional shift of #0 or #4"); + case Match_LoadStoreSImm7_4: + return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), + "expected integer multiple of 4 in range [-256, 252]"); + case Match_LoadStoreSImm7_8: + return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), + "expected integer multiple of 8 in range [-512, 508]"); + case Match_LoadStoreSImm7_16: + return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), + "expected integer multiple of 16 in range [-1024, 1016]"); + case Match_LoadStoreSImm9: + return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), + "expected integer in range [-256, 255]"); + case Match_LoadStoreUImm12_1: + return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), + "expected symbolic reference or integer in range [0, 4095]"); + case Match_LoadStoreUImm12_2: + return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), + "expected symbolic reference or integer in range [0, 8190]"); + case Match_LoadStoreUImm12_4: + return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), + "expected symbolic reference or integer in range [0, 16380]"); + case Match_LoadStoreUImm12_8: + return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), + "expected symbolic reference or integer in range [0, 32760]"); + case Match_LoadStoreUImm12_16: + return 
Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), + "expected symbolic reference or integer in range [0, 65520]"); + case Match_LogicalSecondSource: + return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), + "expected compatible register or logical immediate"); + case Match_MOVWUImm16: + return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), + "expected relocated symbol or integer in range [0, 65535]"); + case Match_MRS: + return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), + "expected readable system register"); + case Match_MSR: + return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), + "expected writable system register or pstate"); + case Match_NamedImm_at: + return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), + "expected symbolic 'at' operand: s1e[0-3][rw] or s12e[01][rw]"); + case Match_NamedImm_dbarrier: + return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), + "expected integer in range [0, 15] or symbolic barrier operand"); + case Match_NamedImm_dc: + return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), + "expected symbolic 'dc' operand"); + case Match_NamedImm_ic: + return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), + "expected 'ic' operand: 'ialluis', 'iallu' or 'ivau'"); + case Match_NamedImm_isb: + return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), + "expected integer in range [0, 15] or 'sy'"); + case Match_NamedImm_prefetch: + return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), + "expected prefetch hint: p(ld|st|i)l[123](strm|keep)"); + case Match_NamedImm_tlbi: + return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), + "expected translation buffer invalidation operand"); + case Match_UImm16: + return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), + "expected integer in range [0, 65535]"); + case Match_UImm3: + return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), + "expected integer in range [0, 7]"); + case Match_UImm4: + return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), + "expected integer in range [0, 15]"); + case Match_UImm5: + return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), + "expected integer in range [0, 31]"); + case Match_UImm6: + return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), + "expected integer in range [0, 63]"); + case Match_UImm7: + return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), + "expected integer in range [0, 127]"); + case Match_Width32: + return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), + "expected integer in range [<lsb>, 31]"); + case Match_Width64: + return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(), + "expected integer in range [<lsb>, 63]"); + } + + llvm_unreachable("Implement any new match types added!"); + return true; +} + +void AArch64Operand::print(raw_ostream &OS) const { + switch (Kind) { + case k_CondCode: + OS << "<CondCode: " << CondCode.Code << ">"; + break; + case k_FPImmediate: + OS << "<fpimm: " << FPImm.Val << ">"; + break; + case k_ImmWithLSL: + OS << "<immwithlsl: imm=" << ImmWithLSL.Val + << ", shift=" << ImmWithLSL.ShiftAmount << ">"; + break; + case k_Immediate: + getImm()->print(OS); + break; + case k_Register: + OS << "<register " << getReg() << '>'; + break; + case k_Token: + OS << '\'' << getToken() << '\''; + break; + case k_ShiftExtend: + OS << "<shift: type=" << ShiftExtend.ShiftType + << ", amount=" << 
ShiftExtend.Amount << ">"; + break; + case k_SysReg: { + StringRef Name(SysReg.Data, SysReg.Length); + OS << "<sysreg: " << Name << '>'; + break; + } + default: + llvm_unreachable("No idea how to print this kind of operand"); + break; + } +} + +void AArch64Operand::dump() const { + print(errs()); +} + + +/// Force static initialization. +extern "C" void LLVMInitializeAArch64AsmParser() { + RegisterMCAsmParser<AArch64AsmParser> X(TheAArch64Target); +} + +#define GET_REGISTER_MATCHER +#define GET_MATCHER_IMPLEMENTATION +#include "AArch64GenAsmMatcher.inc" diff --git a/lib/Target/AArch64/AsmParser/CMakeLists.txt b/lib/Target/AArch64/AsmParser/CMakeLists.txt new file mode 100644 index 0000000..a018a0a --- /dev/null +++ b/lib/Target/AArch64/AsmParser/CMakeLists.txt @@ -0,0 +1,7 @@ +include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. ) + +add_llvm_library(LLVMAArch64AsmParser + AArch64AsmParser.cpp + ) + +add_dependencies(LLVMAArch64AsmParser AArch64CommonTableGen) diff --git a/lib/Target/AArch64/AsmParser/LLVMBuild.txt b/lib/Target/AArch64/AsmParser/LLVMBuild.txt new file mode 100644 index 0000000..bd1fcaf --- /dev/null +++ b/lib/Target/AArch64/AsmParser/LLVMBuild.txt @@ -0,0 +1,24 @@ +;===- ./lib/Target/AArch64/AsmParser/LLVMBuild.txt -------------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. +; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = AArch64AsmParser +parent = AArch64 +required_libraries = AArch64Desc AArch64Info MC MCParser Support +add_to_library_groups = AArch64 + diff --git a/lib/Target/AArch64/AsmParser/Makefile b/lib/Target/AArch64/AsmParser/Makefile new file mode 100644 index 0000000..56c9ef5 --- /dev/null +++ b/lib/Target/AArch64/AsmParser/Makefile @@ -0,0 +1,15 @@ +##===- lib/Target/AArch64/AsmParser/Makefile ---------------*- Makefile -*-===## +# +# The LLVM Compiler Infrastructure +# +# This file is distributed under the University of Illinois Open Source +# License. See LICENSE.TXT for details. +# +##===----------------------------------------------------------------------===## +LEVEL = ../../../.. +LIBRARYNAME = LLVMAArch64AsmParser + +# Hack: we need to include 'main' target directory to grab private headers +CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. 
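+# (The CMake build gets the same effect from the include_directories() line
+# in AsmParser/CMakeLists.txt above.)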
+ +include $(LEVEL)/Makefile.common diff --git a/lib/Target/AArch64/CMakeLists.txt b/lib/Target/AArch64/CMakeLists.txt new file mode 100644 index 0000000..8164d6f --- /dev/null +++ b/lib/Target/AArch64/CMakeLists.txt @@ -0,0 +1,36 @@ +set(LLVM_TARGET_DEFINITIONS AArch64.td) + +tablegen(LLVM AArch64GenAsmMatcher.inc -gen-asm-matcher) +tablegen(LLVM AArch64GenAsmWriter.inc -gen-asm-writer) +tablegen(LLVM AArch64GenCallingConv.inc -gen-callingconv) +tablegen(LLVM AArch64GenDisassemblerTables.inc -gen-disassembler) +tablegen(LLVM AArch64GenInstrInfo.inc -gen-instr-info) +tablegen(LLVM AArch64GenMCCodeEmitter.inc -gen-emitter -mc-emitter) +tablegen(LLVM AArch64GenMCPseudoLowering.inc -gen-pseudo-lowering) +tablegen(LLVM AArch64GenRegisterInfo.inc -gen-register-info) +tablegen(LLVM AArch64GenDAGISel.inc -gen-dag-isel) +tablegen(LLVM AArch64GenSubtargetInfo.inc -gen-subtarget) +add_public_tablegen_target(AArch64CommonTableGen) + +add_llvm_target(AArch64CodeGen + AArch64AsmPrinter.cpp + AArch64BranchFixupPass.cpp + AArch64FrameLowering.cpp + AArch64ISelDAGToDAG.cpp + AArch64ISelLowering.cpp + AArch64InstrInfo.cpp + AArch64MachineFunctionInfo.cpp + AArch64MCInstLower.cpp + AArch64RegisterInfo.cpp + AArch64SelectionDAGInfo.cpp + AArch64Subtarget.cpp + AArch64TargetMachine.cpp + AArch64TargetObjectFile.cpp + ) + +add_subdirectory(AsmParser) +add_subdirectory(Disassembler) +add_subdirectory(InstPrinter) +add_subdirectory(MCTargetDesc) +add_subdirectory(TargetInfo) +add_subdirectory(Utils)
\ No newline at end of file diff --git a/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp new file mode 100644 index 0000000..eba7666 --- /dev/null +++ b/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp @@ -0,0 +1,787 @@ +//===- AArch64Disassembler.cpp - Disassembler for AArch64 ISA -------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the functions necessary to decode AArch64 instruction +// bitpatterns into MCInsts (with the help of TableGenerated information from +// the instruction definitions). +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "arm-disassembler" + +#include "AArch64.h" +#include "AArch64RegisterInfo.h" +#include "AArch64Subtarget.h" +#include "Utils/AArch64BaseInfo.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrDesc.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCDisassembler.h" +#include "llvm/MC/MCFixedLenDisassembler.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/MemoryObject.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +typedef MCDisassembler::DecodeStatus DecodeStatus; + +namespace { +/// AArch64 disassembler for all AArch64 platforms. +class AArch64Disassembler : public MCDisassembler { + const MCRegisterInfo *RegInfo; +public: + /// Initializes the disassembler. + /// + AArch64Disassembler(const MCSubtargetInfo &STI, const MCRegisterInfo *Info) + : MCDisassembler(STI), RegInfo(Info) { + } + + ~AArch64Disassembler() { + } + + /// See MCDisassembler. + DecodeStatus getInstruction(MCInst &instr, + uint64_t &size, + const MemoryObject ®ion, + uint64_t address, + raw_ostream &vStream, + raw_ostream &cStream) const; + + const MCRegisterInfo *getRegInfo() const { return RegInfo; } +}; + +} + +// Forward-declarations used in the auto-generated files. 
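+// AArch64GenDisassemblerTables.inc, included further down, refers to these
+// decoders by name, so they must be declared before that #include.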
+static DecodeStatus DecodeGPR64RegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, const void *Decoder); +static DecodeStatus +DecodeGPR64xspRegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, const void *Decoder); + +static DecodeStatus DecodeGPR32RegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, const void *Decoder); +static DecodeStatus +DecodeGPR32wspRegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, const void *Decoder); + +static DecodeStatus DecodeFPR8RegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeFPR16RegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeFPR32RegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeFPR64RegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeFPR128RegisterClass(llvm::MCInst &Inst, + unsigned RegNo, uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeVPR128RegisterClass(llvm::MCInst &Inst, + unsigned RegNo, uint64_t Address, + const void *Decoder); + +static DecodeStatus DecodeAddrRegExtendOperand(llvm::MCInst &Inst, + unsigned OptionHiS, + uint64_t Address, + const void *Decoder); + + +static DecodeStatus DecodeBitfield32ImmOperand(llvm::MCInst &Inst, + unsigned Imm6Bits, + uint64_t Address, + const void *Decoder); + +static DecodeStatus DecodeCVT32FixedPosOperand(llvm::MCInst &Inst, + unsigned Imm6Bits, + uint64_t Address, + const void *Decoder); + +template<int RegWidth> +static DecodeStatus DecodeMoveWideImmOperand(llvm::MCInst &Inst, + unsigned FullImm, + uint64_t Address, + const void *Decoder); + +template<int RegWidth> +static DecodeStatus DecodeLogicalImmOperand(llvm::MCInst &Inst, + unsigned Bits, + uint64_t Address, + const void *Decoder); + +static DecodeStatus DecodeRegExtendOperand(llvm::MCInst &Inst, + unsigned ShiftAmount, + uint64_t Address, + const void *Decoder); + +static DecodeStatus Decode32BitShiftOperand(llvm::MCInst &Inst, + unsigned ShiftAmount, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeBitfieldInstruction(llvm::MCInst &Inst, unsigned Insn, + uint64_t Address, + const void *Decoder); + +static DecodeStatus DecodeFMOVLaneInstruction(llvm::MCInst &Inst, unsigned Insn, + uint64_t Address, + const void *Decoder); + +static DecodeStatus DecodeLDSTPairInstruction(llvm::MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder); + +static DecodeStatus DecodeLoadPairExclusiveInstruction(llvm::MCInst &Inst, + unsigned Val, + uint64_t Address, + const void *Decoder); + +template<typename SomeNamedImmMapper> +static DecodeStatus DecodeNamedImmOperand(llvm::MCInst &Inst, + unsigned Val, + uint64_t Address, + const void *Decoder); + +static DecodeStatus +DecodeSysRegOperand(const A64SysReg::SysRegMapper &InstMapper, + llvm::MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); + +static DecodeStatus DecodeMRSOperand(llvm::MCInst &Inst, + unsigned Val, + uint64_t Address, + const void *Decoder); + +static DecodeStatus DecodeMSROperand(llvm::MCInst &Inst, + unsigned Val, + uint64_t Address, + const void *Decoder); + + +static DecodeStatus DecodeSingleIndexedInstruction(llvm::MCInst &Inst, + unsigned Val, + uint64_t Address, + const void *Decoder); + + +static bool Check(DecodeStatus &Out, DecodeStatus In); + +#include 
"AArch64GenDisassemblerTables.inc" +#include "AArch64GenInstrInfo.inc" + +static bool Check(DecodeStatus &Out, DecodeStatus In) { + switch (In) { + case MCDisassembler::Success: + // Out stays the same. + return true; + case MCDisassembler::SoftFail: + Out = In; + return true; + case MCDisassembler::Fail: + Out = In; + return false; + } + llvm_unreachable("Invalid DecodeStatus!"); +} + +DecodeStatus AArch64Disassembler::getInstruction(MCInst &MI, uint64_t &Size, + const MemoryObject &Region, + uint64_t Address, + raw_ostream &os, + raw_ostream &cs) const { + CommentStream = &cs; + + uint8_t bytes[4]; + + // We want to read exactly 4 bytes of data. + if (Region.readBytes(Address, 4, (uint8_t*)bytes, NULL) == -1) { + Size = 0; + return MCDisassembler::Fail; + } + + // Encoded as a small-endian 32-bit word in the stream. + uint32_t insn = (bytes[3] << 24) | + (bytes[2] << 16) | + (bytes[1] << 8) | + (bytes[0] << 0); + + // Calling the auto-generated decoder function. + DecodeStatus result = decodeInstruction(DecoderTableA6432, MI, insn, Address, + this, STI); + if (result != MCDisassembler::Fail) { + Size = 4; + return result; + } + + MI.clear(); + Size = 0; + return MCDisassembler::Fail; +} + +static unsigned getReg(const void *D, unsigned RC, unsigned RegNo) { + const AArch64Disassembler *Dis = static_cast<const AArch64Disassembler*>(D); + return Dis->getRegInfo()->getRegClass(RC).getRegister(RegNo); +} + +static DecodeStatus DecodeGPR64RegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, const void *Decoder) { + if (RegNo > 31) + return MCDisassembler::Fail; + + uint16_t Register = getReg(Decoder, AArch64::GPR64RegClassID, RegNo); + Inst.addOperand(MCOperand::CreateReg(Register)); + return MCDisassembler::Success; +} + +static DecodeStatus +DecodeGPR64xspRegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, const void *Decoder) { + if (RegNo > 31) + return MCDisassembler::Fail; + + uint16_t Register = getReg(Decoder, AArch64::GPR64xspRegClassID, RegNo); + Inst.addOperand(MCOperand::CreateReg(Register)); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeGPR32RegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder) { + if (RegNo > 31) + return MCDisassembler::Fail; + + uint16_t Register = getReg(Decoder, AArch64::GPR32RegClassID, RegNo); + Inst.addOperand(MCOperand::CreateReg(Register)); + return MCDisassembler::Success; +} + +static DecodeStatus +DecodeGPR32wspRegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, const void *Decoder) { + if (RegNo > 31) + return MCDisassembler::Fail; + + uint16_t Register = getReg(Decoder, AArch64::GPR32wspRegClassID, RegNo); + Inst.addOperand(MCOperand::CreateReg(Register)); + return MCDisassembler::Success; +} + +static DecodeStatus +DecodeFPR8RegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, const void *Decoder) { + if (RegNo > 31) + return MCDisassembler::Fail; + + uint16_t Register = getReg(Decoder, AArch64::FPR8RegClassID, RegNo); + Inst.addOperand(MCOperand::CreateReg(Register)); + return MCDisassembler::Success; +} + +static DecodeStatus +DecodeFPR16RegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, const void *Decoder) { + if (RegNo > 31) + return MCDisassembler::Fail; + + uint16_t Register = getReg(Decoder, AArch64::FPR16RegClassID, RegNo); + Inst.addOperand(MCOperand::CreateReg(Register)); + return MCDisassembler::Success; +} + + +static DecodeStatus +DecodeFPR32RegisterClass(llvm::MCInst &Inst, unsigned 
RegNo, + uint64_t Address, const void *Decoder) { + if (RegNo > 31) + return MCDisassembler::Fail; + + uint16_t Register = getReg(Decoder, AArch64::FPR32RegClassID, RegNo); + Inst.addOperand(MCOperand::CreateReg(Register)); + return MCDisassembler::Success; +} + +static DecodeStatus +DecodeFPR64RegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, const void *Decoder) { + if (RegNo > 31) + return MCDisassembler::Fail; + + uint16_t Register = getReg(Decoder, AArch64::FPR64RegClassID, RegNo); + Inst.addOperand(MCOperand::CreateReg(Register)); + return MCDisassembler::Success; +} + + +static DecodeStatus +DecodeFPR128RegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, const void *Decoder) { + if (RegNo > 31) + return MCDisassembler::Fail; + + uint16_t Register = getReg(Decoder, AArch64::FPR128RegClassID, RegNo); + Inst.addOperand(MCOperand::CreateReg(Register)); + return MCDisassembler::Success; +} + +static DecodeStatus +DecodeVPR128RegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, const void *Decoder) { + if (RegNo > 31) + return MCDisassembler::Fail; + + uint16_t Register = getReg(Decoder, AArch64::VPR128RegClassID, RegNo); + Inst.addOperand(MCOperand::CreateReg(Register)); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeAddrRegExtendOperand(llvm::MCInst &Inst, + unsigned OptionHiS, + uint64_t Address, + const void *Decoder) { + // Option{1} must be 1. OptionHiS is made up of {Option{2}, Option{1}, + // S}. Hence we want to check bit 1. + if (!(OptionHiS & 2)) + return MCDisassembler::Fail; + + Inst.addOperand(MCOperand::CreateImm(OptionHiS)); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeBitfield32ImmOperand(llvm::MCInst &Inst, + unsigned Imm6Bits, + uint64_t Address, + const void *Decoder) { + // In the 32-bit variant, bit 6 must be zero. I.e. the immediate must be + // between 0 and 31. + if (Imm6Bits > 31) + return MCDisassembler::Fail; + + Inst.addOperand(MCOperand::CreateImm(Imm6Bits)); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeCVT32FixedPosOperand(llvm::MCInst &Inst, + unsigned Imm6Bits, + uint64_t Address, + const void *Decoder) { + // 1 <= Imm <= 32. Encoded as 64 - Imm so: 63 >= Encoded >= 32. 
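+  // e.g. #fbits == 1 is encoded as 63 and #fbits == 32 as 32, so any encoded
+  // value below 32 would mean an out-of-range fixed-point position.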
+ if (Imm6Bits < 32) + return MCDisassembler::Fail; + + Inst.addOperand(MCOperand::CreateImm(Imm6Bits)); + return MCDisassembler::Success; +} + + +template<int RegWidth> +static DecodeStatus DecodeMoveWideImmOperand(llvm::MCInst &Inst, + unsigned FullImm, + uint64_t Address, + const void *Decoder) { + unsigned Imm16 = FullImm & 0xffff; + unsigned Shift = FullImm >> 16; + + if (RegWidth == 32 && Shift > 1) return MCDisassembler::Fail; + + Inst.addOperand(MCOperand::CreateImm(Imm16)); + Inst.addOperand(MCOperand::CreateImm(Shift)); + return MCDisassembler::Success; +} + +template<int RegWidth> +static DecodeStatus DecodeLogicalImmOperand(llvm::MCInst &Inst, + unsigned Bits, + uint64_t Address, + const void *Decoder) { + uint64_t Imm; + if (!A64Imms::isLogicalImmBits(RegWidth, Bits, Imm)) + return MCDisassembler::Fail; + + Inst.addOperand(MCOperand::CreateImm(Bits)); + return MCDisassembler::Success; +} + + +static DecodeStatus DecodeRegExtendOperand(llvm::MCInst &Inst, + unsigned ShiftAmount, + uint64_t Address, + const void *Decoder) { + // Only values 0-4 are valid for this 3-bit field + if (ShiftAmount > 4) + return MCDisassembler::Fail; + + Inst.addOperand(MCOperand::CreateImm(ShiftAmount)); + return MCDisassembler::Success; +} + +static DecodeStatus Decode32BitShiftOperand(llvm::MCInst &Inst, + unsigned ShiftAmount, + uint64_t Address, + const void *Decoder) { + // Only values below 32 are valid for a 32-bit register + if (ShiftAmount > 31) + return MCDisassembler::Fail; + + Inst.addOperand(MCOperand::CreateImm(ShiftAmount)); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeBitfieldInstruction(llvm::MCInst &Inst, unsigned Insn, + uint64_t Address, + const void *Decoder) { + unsigned Rd = fieldFromInstruction(Insn, 0, 5); + unsigned Rn = fieldFromInstruction(Insn, 5, 5); + unsigned ImmS = fieldFromInstruction(Insn, 10, 6); + unsigned ImmR = fieldFromInstruction(Insn, 16, 6); + unsigned SF = fieldFromInstruction(Insn, 31, 1); + + // Undef for 0b11 just in case it occurs. Don't want the compiler to optimise + // out assertions that it thinks should never be hit. + enum OpcTypes { SBFM = 0, BFM, UBFM, Undef } Opc; + Opc = (OpcTypes)fieldFromInstruction(Insn, 29, 2); + + if (!SF) { + // ImmR and ImmS must be between 0 and 31 for 32-bit instructions. + if (ImmR > 31 || ImmS > 31) + return MCDisassembler::Fail; + } + + if (SF) { + DecodeGPR64RegisterClass(Inst, Rd, Address, Decoder); + // BFM MCInsts use Rd as a source too. + if (Opc == BFM) DecodeGPR64RegisterClass(Inst, Rd, Address, Decoder); + DecodeGPR64RegisterClass(Inst, Rn, Address, Decoder); + } else { + DecodeGPR32RegisterClass(Inst, Rd, Address, Decoder); + // BFM MCInsts use Rd as a source too. + if (Opc == BFM) DecodeGPR32RegisterClass(Inst, Rd, Address, Decoder); + DecodeGPR32RegisterClass(Inst, Rn, Address, Decoder); + } + + // ASR and LSR have more specific patterns so they won't get here: + assert(!(ImmS == 31 && !SF && Opc != BFM) + && "shift should have used auto decode"); + assert(!(ImmS == 63 && SF && Opc != BFM) + && "shift should have used auto decode"); + + // Extension instructions similarly: + if (Opc == SBFM && ImmR == 0) { + assert((ImmS != 7 && ImmS != 15) && "extension got here"); + assert((ImmS != 31 || SF == 0) && "extension got here"); + } else if (Opc == UBFM && ImmR == 0) { + assert((SF != 0 || (ImmS != 7 && ImmS != 15)) && "extension got here"); + } + + if (Opc == UBFM) { + // It might be a LSL instruction, which actually takes the shift amount + // itself as an MCInst operand. 
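+  // "lsl xD, xN, #shift" is an alias of "ubfm xD, xN, #((64-shift) % 64),
+  // #(63-shift)"; e.g. ImmR == 60 with ImmS == 59 decodes to "lsl x0, x1, #4".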
+ if (SF && (ImmS + 1) % 64 == ImmR) { + Inst.setOpcode(AArch64::LSLxxi); + Inst.addOperand(MCOperand::CreateImm(63 - ImmS)); + return MCDisassembler::Success; + } else if (!SF && (ImmS + 1) % 32 == ImmR) { + Inst.setOpcode(AArch64::LSLwwi); + Inst.addOperand(MCOperand::CreateImm(31 - ImmS)); + return MCDisassembler::Success; + } + } + + // Otherwise it's definitely either an extract or an insert depending on which + // of ImmR or ImmS is larger. + unsigned ExtractOp, InsertOp; + switch (Opc) { + default: llvm_unreachable("unexpected instruction trying to decode bitfield"); + case SBFM: + ExtractOp = SF ? AArch64::SBFXxxii : AArch64::SBFXwwii; + InsertOp = SF ? AArch64::SBFIZxxii : AArch64::SBFIZwwii; + break; + case BFM: + ExtractOp = SF ? AArch64::BFXILxxii : AArch64::BFXILwwii; + InsertOp = SF ? AArch64::BFIxxii : AArch64::BFIwwii; + break; + case UBFM: + ExtractOp = SF ? AArch64::UBFXxxii : AArch64::UBFXwwii; + InsertOp = SF ? AArch64::UBFIZxxii : AArch64::UBFIZwwii; + break; + } + + // Otherwise it's a boring insert or extract + Inst.addOperand(MCOperand::CreateImm(ImmR)); + Inst.addOperand(MCOperand::CreateImm(ImmS)); + + + if (ImmS < ImmR) + Inst.setOpcode(InsertOp); + else + Inst.setOpcode(ExtractOp); + + return MCDisassembler::Success; +} + +static DecodeStatus DecodeFMOVLaneInstruction(llvm::MCInst &Inst, unsigned Insn, + uint64_t Address, + const void *Decoder) { + // This decoder exists to add the dummy Lane operand to the MCInst, which must + // be 1 in assembly but has no other real manifestation. + unsigned Rd = fieldFromInstruction(Insn, 0, 5); + unsigned Rn = fieldFromInstruction(Insn, 5, 5); + unsigned IsToVec = fieldFromInstruction(Insn, 16, 1); + + if (IsToVec) { + DecodeVPR128RegisterClass(Inst, Rd, Address, Decoder); + DecodeGPR64RegisterClass(Inst, Rn, Address, Decoder); + } else { + DecodeGPR64RegisterClass(Inst, Rd, Address, Decoder); + DecodeVPR128RegisterClass(Inst, Rn, Address, Decoder); + } + + // Add the lane + Inst.addOperand(MCOperand::CreateImm(1)); + + return MCDisassembler::Success; +} + + +static DecodeStatus DecodeLDSTPairInstruction(llvm::MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder) { + DecodeStatus Result = MCDisassembler::Success; + unsigned Rt = fieldFromInstruction(Insn, 0, 5); + unsigned Rn = fieldFromInstruction(Insn, 5, 5); + unsigned Rt2 = fieldFromInstruction(Insn, 10, 5); + unsigned SImm7 = fieldFromInstruction(Insn, 15, 7); + unsigned L = fieldFromInstruction(Insn, 22, 1); + unsigned V = fieldFromInstruction(Insn, 26, 1); + unsigned Opc = fieldFromInstruction(Insn, 30, 2); + + // Not an official name, but it turns out that bit 23 distinguishes indexed + // from non-indexed operations. + unsigned Indexed = fieldFromInstruction(Insn, 23, 1); + + if (Indexed && L == 0) { + // The MCInst for an indexed store has an out operand and 4 ins: + // Rn_wb, Rt, Rt2, Rn, Imm + DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder); + } + + // You shouldn't load to the same register twice in an instruction... + if (L && Rt == Rt2) + Result = MCDisassembler::SoftFail; + + // ... or do any operation that writes-back to a transfer register. But note + // that "stp xzr, xzr, [sp], #4" is fine because xzr and sp are different. + if (Indexed && V == 0 && Rn != 31 && (Rt == Rn || Rt2 == Rn)) + Result = MCDisassembler::SoftFail; + + // Exactly how we decode the MCInst's registers depends on the Opc and V + // fields of the instruction. 
These also obviously determine the size of the + // operation so we can fill in that information while we're at it. + if (V) { + // The instruction operates on the FP/SIMD registers + switch (Opc) { + default: return MCDisassembler::Fail; + case 0: + DecodeFPR32RegisterClass(Inst, Rt, Address, Decoder); + DecodeFPR32RegisterClass(Inst, Rt2, Address, Decoder); + break; + case 1: + DecodeFPR64RegisterClass(Inst, Rt, Address, Decoder); + DecodeFPR64RegisterClass(Inst, Rt2, Address, Decoder); + break; + case 2: + DecodeFPR128RegisterClass(Inst, Rt, Address, Decoder); + DecodeFPR128RegisterClass(Inst, Rt2, Address, Decoder); + break; + } + } else { + switch (Opc) { + default: return MCDisassembler::Fail; + case 0: + DecodeGPR32RegisterClass(Inst, Rt, Address, Decoder); + DecodeGPR32RegisterClass(Inst, Rt2, Address, Decoder); + break; + case 1: + assert(L && "unexpected \"store signed\" attempt"); + DecodeGPR64RegisterClass(Inst, Rt, Address, Decoder); + DecodeGPR64RegisterClass(Inst, Rt2, Address, Decoder); + break; + case 2: + DecodeGPR64RegisterClass(Inst, Rt, Address, Decoder); + DecodeGPR64RegisterClass(Inst, Rt2, Address, Decoder); + break; + } + } + + if (Indexed && L == 1) { + // The MCInst for an indexed load has 3 out operands and an 3 ins: + // Rt, Rt2, Rn_wb, Rt2, Rn, Imm + DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder); + } + + + DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder); + Inst.addOperand(MCOperand::CreateImm(SImm7)); + + return Result; +} + +static DecodeStatus DecodeLoadPairExclusiveInstruction(llvm::MCInst &Inst, + uint32_t Val, + uint64_t Address, + const void *Decoder) { + unsigned Rt = fieldFromInstruction(Val, 0, 5); + unsigned Rn = fieldFromInstruction(Val, 5, 5); + unsigned Rt2 = fieldFromInstruction(Val, 10, 5); + unsigned MemSize = fieldFromInstruction(Val, 30, 2); + + DecodeStatus S = MCDisassembler::Success; + if (Rt == Rt2) S = MCDisassembler::SoftFail; + + switch (MemSize) { + case 2: + if (!Check(S, DecodeGPR32RegisterClass(Inst, Rt, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeGPR32RegisterClass(Inst, Rt2, Address, Decoder))) + return MCDisassembler::Fail; + break; + case 3: + if (!Check(S, DecodeGPR64RegisterClass(Inst, Rt, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeGPR64RegisterClass(Inst, Rt2, Address, Decoder))) + return MCDisassembler::Fail; + break; + default: + llvm_unreachable("Invalid MemSize in DecodeLoadPairExclusiveInstruction"); + } + + if (!Check(S, DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + + return S; +} + +template<typename SomeNamedImmMapper> +static DecodeStatus DecodeNamedImmOperand(llvm::MCInst &Inst, + unsigned Val, + uint64_t Address, + const void *Decoder) { + SomeNamedImmMapper Mapper; + bool ValidNamed; + Mapper.toString(Val, ValidNamed); + if (ValidNamed || Mapper.validImm(Val)) { + Inst.addOperand(MCOperand::CreateImm(Val)); + return MCDisassembler::Success; + } + + return MCDisassembler::Fail; +} + +static DecodeStatus DecodeSysRegOperand(const A64SysReg::SysRegMapper &Mapper, + llvm::MCInst &Inst, + unsigned Val, + uint64_t Address, + const void *Decoder) { + bool ValidNamed; + Mapper.toString(Val, ValidNamed); + + Inst.addOperand(MCOperand::CreateImm(Val)); + + return ValidNamed ? 
MCDisassembler::Success : MCDisassembler::Fail; +} + +static DecodeStatus DecodeMRSOperand(llvm::MCInst &Inst, + unsigned Val, + uint64_t Address, + const void *Decoder) { + return DecodeSysRegOperand(A64SysReg::MRSMapper(), Inst, Val, Address, + Decoder); +} + +static DecodeStatus DecodeMSROperand(llvm::MCInst &Inst, + unsigned Val, + uint64_t Address, + const void *Decoder) { + return DecodeSysRegOperand(A64SysReg::MSRMapper(), Inst, Val, Address, + Decoder); +} + +static DecodeStatus DecodeSingleIndexedInstruction(llvm::MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder) { + unsigned Rt = fieldFromInstruction(Insn, 0, 5); + unsigned Rn = fieldFromInstruction(Insn, 5, 5); + unsigned Imm9 = fieldFromInstruction(Insn, 12, 9); + + unsigned Opc = fieldFromInstruction(Insn, 22, 2); + unsigned V = fieldFromInstruction(Insn, 26, 1); + unsigned Size = fieldFromInstruction(Insn, 30, 2); + + if (Opc == 0 || (V == 1 && Opc == 2)) { + // It's a store, the MCInst gets: Rn_wb, Rt, Rn, Imm + DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder); + } + + if (V == 0 && (Opc == 2 || Size == 3)) { + DecodeGPR64RegisterClass(Inst, Rt, Address, Decoder); + } else if (V == 0) { + DecodeGPR32RegisterClass(Inst, Rt, Address, Decoder); + } else if (V == 1 && (Opc & 2)) { + DecodeFPR128RegisterClass(Inst, Rt, Address, Decoder); + } else { + switch (Size) { + case 0: + DecodeFPR8RegisterClass(Inst, Rt, Address, Decoder); + break; + case 1: + DecodeFPR16RegisterClass(Inst, Rt, Address, Decoder); + break; + case 2: + DecodeFPR32RegisterClass(Inst, Rt, Address, Decoder); + break; + case 3: + DecodeFPR64RegisterClass(Inst, Rt, Address, Decoder); + break; + } + } + + if (Opc != 0 && (V != 1 || Opc != 2)) { + // It's a load, the MCInst gets: Rt, Rn_wb, Rn, Imm + DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder); + } + + DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder); + + Inst.addOperand(MCOperand::CreateImm(Imm9)); + + // N.b. The official documentation says undpredictable if Rt == Rn, but this + // takes place at the architectural rather than encoding level: + // + // "STR xzr, [sp], #4" is perfectly valid. + if (V == 0 && Rt == Rn && Rn != 31) + return MCDisassembler::SoftFail; + else + return MCDisassembler::Success; +} + +static MCDisassembler *createAArch64Disassembler(const Target &T, + const MCSubtargetInfo &STI) { + return new AArch64Disassembler(STI, T.createMCRegInfo("")); +} + +extern "C" void LLVMInitializeAArch64Disassembler() { + TargetRegistry::RegisterMCDisassembler(TheAArch64Target, + createAArch64Disassembler); +} + + diff --git a/lib/Target/AArch64/Disassembler/CMakeLists.txt b/lib/Target/AArch64/Disassembler/CMakeLists.txt new file mode 100644 index 0000000..d4bd163 --- /dev/null +++ b/lib/Target/AArch64/Disassembler/CMakeLists.txt @@ -0,0 +1,7 @@ +include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. ) + +add_llvm_library(LLVMAArch64Disassembler + AArch64Disassembler.cpp + ) + +add_dependencies(LLVMAArch64Disassembler AArch64CommonTableGen) diff --git a/lib/Target/AArch64/Disassembler/LLVMBuild.txt b/lib/Target/AArch64/Disassembler/LLVMBuild.txt new file mode 100644 index 0000000..a93e343 --- /dev/null +++ b/lib/Target/AArch64/Disassembler/LLVMBuild.txt @@ -0,0 +1,24 @@ +;===- ./lib/Target/AArch64/Disassembler/LLVMBuild.txt ----------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. 
+; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. +; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = AArch64Disassembler +parent = AArch64 +required_libraries = AArch64CodeGen AArch64Desc AArch64Info AArch64Utils MC Support +add_to_library_groups = AArch64 + diff --git a/lib/Target/AArch64/Disassembler/Makefile b/lib/Target/AArch64/Disassembler/Makefile new file mode 100644 index 0000000..5c86120 --- /dev/null +++ b/lib/Target/AArch64/Disassembler/Makefile @@ -0,0 +1,16 @@ +##===- lib/Target/AArch64/Disassembler/Makefile ------------*- Makefile -*-===## +# +# The LLVM Compiler Infrastructure +# +# This file is distributed under the University of Illinois Open Source +# License. See LICENSE.TXT for details. +# +##===----------------------------------------------------------------------===## + +LEVEL = ../../../.. +LIBRARYNAME = LLVMAArch64Disassembler + +# Hack: we need to include 'main' target directory to grab private headers +CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. + +include $(LEVEL)/Makefile.common diff --git a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp new file mode 100644 index 0000000..82ce80c --- /dev/null +++ b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp @@ -0,0 +1,408 @@ +//==-- AArch64InstPrinter.cpp - Convert AArch64 MCInst to assembly syntax --==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This class prints an AArch64 MCInst to a .s file. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "asm-printer" +#include "AArch64InstPrinter.h" +#include "MCTargetDesc/AArch64MCTargetDesc.h" +#include "Utils/AArch64BaseInfo.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/Format.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +#define GET_INSTRUCTION_NAME +#define PRINT_ALIAS_INSTR +#include "AArch64GenAsmWriter.inc" + +static int64_t unpackSignedImm(int BitWidth, uint64_t Value) { + assert(!(Value & ~((1ULL << BitWidth)-1)) && "immediate not n-bit"); + if (Value & (1ULL << (BitWidth - 1))) + return static_cast<int64_t>(Value) - (1LL << BitWidth); + else + return Value; +} + +AArch64InstPrinter::AArch64InstPrinter(const MCAsmInfo &MAI, + const MCInstrInfo &MII, + const MCRegisterInfo &MRI, + const MCSubtargetInfo &STI) : + MCInstPrinter(MAI, MII, MRI) { + // Initialize the set of available features. 
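+  // (The feature bits gate which alias forms the TableGen'erated
+  // printAliasInstr is allowed to use.)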
+ setAvailableFeatures(STI.getFeatureBits()); +} + +void AArch64InstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const { + OS << getRegisterName(RegNo); +} + +void +AArch64InstPrinter::printOffsetSImm9Operand(const MCInst *MI, + unsigned OpNum, raw_ostream &O) { + const MCOperand &MOImm = MI->getOperand(OpNum); + int32_t Imm = unpackSignedImm(9, MOImm.getImm()); + + O << '#' << Imm; +} + +void +AArch64InstPrinter::printAddrRegExtendOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O, unsigned MemSize, + unsigned RmSize) { + unsigned ExtImm = MI->getOperand(OpNum).getImm(); + unsigned OptionHi = ExtImm >> 1; + unsigned S = ExtImm & 1; + bool IsLSL = OptionHi == 1 && RmSize == 64; + + const char *Ext; + switch (OptionHi) { + case 1: + Ext = (RmSize == 32) ? "uxtw" : "lsl"; + break; + case 3: + Ext = (RmSize == 32) ? "sxtw" : "sxtx"; + break; + default: + llvm_unreachable("Incorrect Option on load/store (reg offset)"); + } + O << Ext; + + if (S) { + unsigned ShiftAmt = Log2_32(MemSize); + O << " #" << ShiftAmt; + } else if (IsLSL) { + O << " #0"; + } +} + +void +AArch64InstPrinter::printAddSubImmLSL0Operand(const MCInst *MI, + unsigned OpNum, raw_ostream &O) { + const MCOperand &Imm12Op = MI->getOperand(OpNum); + + if (Imm12Op.isImm()) { + int64_t Imm12 = Imm12Op.getImm(); + assert(Imm12 >= 0 && "Invalid immediate for add/sub imm"); + O << "#" << Imm12; + } else { + assert(Imm12Op.isExpr() && "Unexpected shift operand type"); + O << "#" << *Imm12Op.getExpr(); + } +} + +void +AArch64InstPrinter::printAddSubImmLSL12Operand(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + + printAddSubImmLSL0Operand(MI, OpNum, O); + + O << ", lsl #12"; +} + +void +AArch64InstPrinter::printBareImmOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + const MCOperand &MO = MI->getOperand(OpNum); + O << MO.getImm(); +} + +template<unsigned RegWidth> void +AArch64InstPrinter::printBFILSBOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + const MCOperand &ImmROp = MI->getOperand(OpNum); + unsigned LSB = ImmROp.getImm() == 0 ? 
0 : RegWidth - ImmROp.getImm(); + + O << '#' << LSB; +} + +void AArch64InstPrinter::printBFIWidthOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + const MCOperand &ImmSOp = MI->getOperand(OpNum); + unsigned Width = ImmSOp.getImm() + 1; + + O << '#' << Width; +} + +void +AArch64InstPrinter::printBFXWidthOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + const MCOperand &ImmSOp = MI->getOperand(OpNum); + const MCOperand &ImmROp = MI->getOperand(OpNum - 1); + + unsigned ImmR = ImmROp.getImm(); + unsigned ImmS = ImmSOp.getImm(); + + assert(ImmS >= ImmR && "Invalid ImmR, ImmS combination for bitfield extract"); + + O << '#' << (ImmS - ImmR + 1); +} + +void +AArch64InstPrinter::printCRxOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + const MCOperand &CRx = MI->getOperand(OpNum); + + O << 'c' << CRx.getImm(); +} + + +void +AArch64InstPrinter::printCVTFixedPosOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + const MCOperand &ScaleOp = MI->getOperand(OpNum); + + O << '#' << (64 - ScaleOp.getImm()); +} + + +void AArch64InstPrinter::printFPImmOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &o) { + const MCOperand &MOImm8 = MI->getOperand(OpNum); + + assert(MOImm8.isImm() + && "Immediate operand required for floating-point immediate inst"); + + uint32_t Imm8 = MOImm8.getImm(); + uint32_t Fraction = Imm8 & 0xf; + uint32_t Exponent = (Imm8 >> 4) & 0x7; + uint32_t Negative = (Imm8 >> 7) & 0x1; + + float Val = 1.0f + Fraction / 16.0f; + + // That is: + // 000 -> 2^1, 001 -> 2^2, 010 -> 2^3, 011 -> 2^4, + // 100 -> 2^-3, 101 -> 2^-2, 110 -> 2^-1, 111 -> 2^0 + if (Exponent & 0x4) { + Val /= 1 << (7 - Exponent); + } else { + Val *= 1 << (Exponent + 1); + } + + Val = Negative ? -Val : Val; + + o << '#' << format("%.8f", Val); +} + +void AArch64InstPrinter::printFPZeroOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &o) { + o << "#0.0"; +} + +void +AArch64InstPrinter::printCondCodeOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + const MCOperand &MO = MI->getOperand(OpNum); + + O << A64CondCodeToString(static_cast<A64CC::CondCodes>(MO.getImm())); +} + +template <unsigned field_width, unsigned scale> void +AArch64InstPrinter::printLabelOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + const MCOperand &MO = MI->getOperand(OpNum); + + if (!MO.isImm()) { + printOperand(MI, OpNum, O); + return; + } + + // The immediate of LDR (lit) instructions is a signed 19-bit immediate, which + // is multiplied by 4 (because all A64 instructions are 32-bits wide). 
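+  // Worked example of the branch-free sign extension below (assuming
+  // field_width == 19, scale == 4): UImm == 0x7ffff gives Sign == 0x40000,
+  // and (UImm & ~Sign) - Sign wraps to -1 when read back as signed, so
+  // SImm == -4.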
+ uint64_t UImm = MO.getImm(); + uint64_t Sign = UImm & (1LL << (field_width - 1)); + int64_t SImm = scale * ((UImm & ~Sign) - Sign); + + O << "#" << SImm; +} + +template<unsigned RegWidth> void +AArch64InstPrinter::printLogicalImmOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + const MCOperand &MO = MI->getOperand(OpNum); + uint64_t Val; + A64Imms::isLogicalImmBits(RegWidth, MO.getImm(), Val); + O << "#0x"; + O.write_hex(Val); +} + +void +AArch64InstPrinter::printOffsetUImm12Operand(const MCInst *MI, unsigned OpNum, + raw_ostream &O, int MemSize) { + const MCOperand &MOImm = MI->getOperand(OpNum); + + if (MOImm.isImm()) { + uint32_t Imm = MOImm.getImm() * MemSize; + + O << "#" << Imm; + } else { + O << "#" << *MOImm.getExpr(); + } +} + +void +AArch64InstPrinter::printShiftOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O, + A64SE::ShiftExtSpecifiers Shift) { + const MCOperand &MO = MI->getOperand(OpNum); + + // LSL #0 is not printed + if (Shift == A64SE::LSL && MO.isImm() && MO.getImm() == 0) + return; + + switch (Shift) { + case A64SE::LSL: O << "lsl"; break; + case A64SE::LSR: O << "lsr"; break; + case A64SE::ASR: O << "asr"; break; + case A64SE::ROR: O << "ror"; break; + default: llvm_unreachable("Invalid shift specifier in logical instruction"); + } + + O << " #" << MO.getImm(); +} + +void +AArch64InstPrinter::printMoveWideImmOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + const MCOperand &UImm16MO = MI->getOperand(OpNum); + const MCOperand &ShiftMO = MI->getOperand(OpNum + 1); + + if (UImm16MO.isImm()) { + O << '#' << UImm16MO.getImm(); + + if (ShiftMO.getImm() != 0) + O << ", lsl #" << (ShiftMO.getImm() * 16); + + return; + } + + O << "#" << *UImm16MO.getExpr(); +} + +void AArch64InstPrinter::printNamedImmOperand(const NamedImmMapper &Mapper, + const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + bool ValidName; + const MCOperand &MO = MI->getOperand(OpNum); + StringRef Name = Mapper.toString(MO.getImm(), ValidName); + + if (ValidName) + O << Name; + else + O << '#' << MO.getImm(); +} + +void +AArch64InstPrinter::printSysRegOperand(const A64SysReg::SysRegMapper &Mapper, + const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + const MCOperand &MO = MI->getOperand(OpNum); + + bool ValidName; + std::string Name = Mapper.toString(MO.getImm(), ValidName); + if (ValidName) { + O << Name; + return; + } +} + + +void AArch64InstPrinter::printRegExtendOperand(const MCInst *MI, + unsigned OpNum, + raw_ostream &O, + A64SE::ShiftExtSpecifiers Ext) { + // FIXME: In principle TableGen should be able to detect this itself far more + // easily. We will only accumulate more of these hacks. 
+ unsigned Reg0 = MI->getOperand(0).getReg(); + unsigned Reg1 = MI->getOperand(1).getReg(); + + if (isStackReg(Reg0) || isStackReg(Reg1)) { + A64SE::ShiftExtSpecifiers LSLEquiv; + + if (Reg0 == AArch64::XSP || Reg1 == AArch64::XSP) + LSLEquiv = A64SE::UXTX; + else + LSLEquiv = A64SE::UXTW; + + if (Ext == LSLEquiv) { + O << "lsl #" << MI->getOperand(OpNum).getImm(); + return; + } + } + + switch (Ext) { + case A64SE::UXTB: O << "uxtb"; break; + case A64SE::UXTH: O << "uxth"; break; + case A64SE::UXTW: O << "uxtw"; break; + case A64SE::UXTX: O << "uxtx"; break; + case A64SE::SXTB: O << "sxtb"; break; + case A64SE::SXTH: O << "sxth"; break; + case A64SE::SXTW: O << "sxtw"; break; + case A64SE::SXTX: O << "sxtx"; break; + default: llvm_unreachable("Unexpected shift type for printing"); + } + + const MCOperand &MO = MI->getOperand(OpNum); + if (MO.getImm() != 0) + O << " #" << MO.getImm(); +} + +template<int MemScale> void +AArch64InstPrinter::printSImm7ScaledOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + const MCOperand &MOImm = MI->getOperand(OpNum); + int32_t Imm = unpackSignedImm(7, MOImm.getImm()); + + O << "#" << (Imm * MemScale); +} + +void AArch64InstPrinter::printOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNo); + if (Op.isReg()) { + unsigned Reg = Op.getReg(); + O << getRegisterName(Reg); + } else if (Op.isImm()) { + O << '#' << Op.getImm(); + } else { + assert(Op.isExpr() && "unknown operand kind in printOperand"); + // If a symbolic branch target was added as a constant expression then print + // that address in hex. + const MCConstantExpr *BranchTarget = dyn_cast<MCConstantExpr>(Op.getExpr()); + int64_t Address; + if (BranchTarget && BranchTarget->EvaluateAsAbsolute(Address)) { + O << "0x"; + O.write_hex(Address); + } + else { + // Otherwise, just print the expression. + O << *Op.getExpr(); + } + } +} + + +void AArch64InstPrinter::printInst(const MCInst *MI, raw_ostream &O, + StringRef Annot) { + if (MI->getOpcode() == AArch64::TLSDESCCALL) { + // This is a special assembler directive which applies an + // R_AARCH64_TLSDESC_CALL to the following (BLR) instruction. It has a fixed + // form outside the normal TableGenerated scheme. + O << "\t.tlsdesccall " << *MI->getOperand(0).getExpr(); + } else if (!printAliasInstr(MI, O)) + printInstruction(MI, O); + + printAnnotation(O, Annot); +} diff --git a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h new file mode 100644 index 0000000..639fa86 --- /dev/null +++ b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h @@ -0,0 +1,172 @@ +//===-- AArch64InstPrinter.h - Convert AArch64 MCInst to assembly syntax --===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This class prints an AArch64 MCInst to a .s file. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_AARCH64INSTPRINTER_H +#define LLVM_AARCH64INSTPRINTER_H + +#include "MCTargetDesc/AArch64MCTargetDesc.h" +#include "Utils/AArch64BaseInfo.h" +#include "llvm/MC/MCInstPrinter.h" +#include "llvm/MC/MCSubtargetInfo.h" + +namespace llvm { + +class MCOperand; + +class AArch64InstPrinter : public MCInstPrinter { +public: + AArch64InstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, + const MCRegisterInfo &MRI, const MCSubtargetInfo &STI); + + // Autogenerated by tblgen + void printInstruction(const MCInst *MI, raw_ostream &O); + bool printAliasInstr(const MCInst *MI, raw_ostream &O); + static const char *getRegisterName(unsigned RegNo); + static const char *getInstructionName(unsigned Opcode); + + void printRegName(raw_ostream &O, unsigned RegNum) const; + + template<unsigned MemSize, unsigned RmSize> + void printAddrRegExtendOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + printAddrRegExtendOperand(MI, OpNum, O, MemSize, RmSize); + } + + + void printAddrRegExtendOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O, unsigned MemSize, + unsigned RmSize); + + void printAddSubImmLSL0Operand(const MCInst *MI, + unsigned OpNum, raw_ostream &O); + void printAddSubImmLSL12Operand(const MCInst *MI, + unsigned OpNum, raw_ostream &O); + + void printBareImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); + + template<unsigned RegWidth> + void printBFILSBOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printBFIWidthOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printBFXWidthOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); + + + void printCondCodeOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O); + + void printCRxOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O); + + void printCVTFixedPosOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O); + + void printFPImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &o); + + void printFPZeroOperand(const MCInst *MI, unsigned OpNum, raw_ostream &o); + + template<int MemScale> + void printOffsetUImm12Operand(const MCInst *MI, + unsigned OpNum, raw_ostream &o) { + printOffsetUImm12Operand(MI, OpNum, o, MemScale); + } + + void printOffsetUImm12Operand(const MCInst *MI, unsigned OpNum, + raw_ostream &o, int MemScale); + + template<unsigned field_width, unsigned scale> + void printLabelOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O); + + template<unsigned RegWidth> + void printLogicalImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); + + template<typename SomeNamedImmMapper> + void printNamedImmOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + printNamedImmOperand(SomeNamedImmMapper(), MI, OpNum, O); + } + + void printNamedImmOperand(const NamedImmMapper &Mapper, + const MCInst *MI, unsigned OpNum, + raw_ostream &O); + + void printSysRegOperand(const A64SysReg::SysRegMapper &Mapper, + const MCInst *MI, unsigned OpNum, + raw_ostream &O); + + void printMRSOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + printSysRegOperand(A64SysReg::MRSMapper(), MI, OpNum, O); + } + + void printMSROperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + printSysRegOperand(A64SysReg::MSRMapper(), MI, OpNum, O); + } + + void printShiftOperand(const char *name, const MCInst *MI, + unsigned OpIdx, raw_ostream &O); + + void printLSLOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); + + void printLSROperand(const MCInst 
*MI, unsigned OpNum, raw_ostream &O) { + printShiftOperand("lsr", MI, OpNum, O); + } + void printASROperand(const MCInst *MI, unsigned OpNum, raw_ostream &O) { + printShiftOperand("asr", MI, OpNum, O); + } + void printROROperand(const MCInst *MI, unsigned OpNum, raw_ostream &O) { + printShiftOperand("ror", MI, OpNum, O); + } + + template<A64SE::ShiftExtSpecifiers Shift> + void printShiftOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O) { + printShiftOperand(MI, OpNum, O, Shift); + } + + void printShiftOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O, A64SE::ShiftExtSpecifiers Sh); + + + void printMoveWideImmOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O); + + template<int MemSize> void + printSImm7ScaledOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); + + void printOffsetSImm9Operand(const MCInst *MI, unsigned OpNum, + raw_ostream &O); + + void printPRFMOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); + + template<A64SE::ShiftExtSpecifiers EXT> + void printRegExtendOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + printRegExtendOperand(MI, OpNum, O, EXT); + } + + void printRegExtendOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O, A64SE::ShiftExtSpecifiers Ext); + + void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + virtual void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot); + + bool isStackReg(unsigned RegNo) { + return RegNo == AArch64::XSP || RegNo == AArch64::WSP; + } + + +}; + +} + +#endif diff --git a/lib/Target/AArch64/InstPrinter/CMakeLists.txt b/lib/Target/AArch64/InstPrinter/CMakeLists.txt new file mode 100644 index 0000000..d4b980a --- /dev/null +++ b/lib/Target/AArch64/InstPrinter/CMakeLists.txt @@ -0,0 +1,8 @@ +include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. ) + +add_llvm_library(LLVMAArch64AsmPrinter + AArch64InstPrinter.cpp + ) + +add_dependencies(LLVMAArch64AsmPrinter AArch64CommonTableGen) + diff --git a/lib/Target/AArch64/InstPrinter/LLVMBuild.txt b/lib/Target/AArch64/InstPrinter/LLVMBuild.txt new file mode 100644 index 0000000..4836c7c --- /dev/null +++ b/lib/Target/AArch64/InstPrinter/LLVMBuild.txt @@ -0,0 +1,24 @@ +;===- ./lib/Target/AArch64/InstPrinter/LLVMBuild.txt -----------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. +; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = AArch64AsmPrinter +parent = AArch64 +required_libraries = AArch64Utils MC Support +add_to_library_groups = AArch64 + diff --git a/lib/Target/AArch64/InstPrinter/Makefile b/lib/Target/AArch64/InstPrinter/Makefile new file mode 100644 index 0000000..1c36a8d --- /dev/null +++ b/lib/Target/AArch64/InstPrinter/Makefile @@ -0,0 +1,15 @@ +##===- lib/Target/AArch64/AsmPrinter/Makefile --------------*- Makefile -*-===## +# +# The LLVM Compiler Infrastructure +# +# This file is distributed under the University of Illinois Open Source +# License. See LICENSE.TXT for details. +# +##===----------------------------------------------------------------------===## +LEVEL = ../../../.. 
+LIBRARYNAME = LLVMAArch64AsmPrinter + +# Hack: we need to include 'main' target directory to grab private headers +CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. + +include $(LEVEL)/Makefile.common diff --git a/lib/Target/AArch64/LLVMBuild.txt b/lib/Target/AArch64/LLVMBuild.txt new file mode 100644 index 0000000..3b296fd --- /dev/null +++ b/lib/Target/AArch64/LLVMBuild.txt @@ -0,0 +1,36 @@ +;===- ./lib/Target/AArch64/LLVMBuild.txt -----------------------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. +; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[common] +subdirectories = AsmParser Disassembler InstPrinter MCTargetDesc TargetInfo Utils + +[component_0] +type = TargetGroup +name = AArch64 +parent = Target +has_asmparser = 1 +has_asmprinter = 1 +has_disassembler = 1 +;has_jit = 1 + +[component_1] +type = Library +name = AArch64CodeGen +parent = AArch64 +required_libraries = AArch64AsmPrinter AArch64Desc AArch64Info AsmPrinter CodeGen Core MC SelectionDAG Support Target +add_to_library_groups = AArch64 + diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp new file mode 100644 index 0000000..a3373b1 --- /dev/null +++ b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp @@ -0,0 +1,585 @@ +//===-- AArch64AsmBackend.cpp - AArch64 Assembler Backend -----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the AArch64 implementation of the MCAsmBackend class, +// which is principally concerned with relaxation of the various fixup kinds. 
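+//
+// Since every A64 instruction is exactly 32 bits wide, nothing here ever
+// actually relaxes; the substantive work is applying resolved fixup values
+// into instruction fields (see adjustFixupValue below).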
+// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/AArch64FixupKinds.h" +#include "MCTargetDesc/AArch64MCTargetDesc.h" +#include "llvm/MC/MCAsmBackend.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MCELFObjectWriter.h" +#include "llvm/MC/MCFixupKindInfo.h" +#include "llvm/MC/MCObjectWriter.h" +#include "llvm/Support/ELF.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +namespace { +class AArch64AsmBackend : public MCAsmBackend { + const MCSubtargetInfo* STI; +public: + AArch64AsmBackend(const Target &T, const StringRef TT) + : MCAsmBackend(), + STI(AArch64_MC::createAArch64MCSubtargetInfo(TT, "", "")) + {} + + + ~AArch64AsmBackend() { + delete STI; + } + + bool writeNopData(uint64_t Count, MCObjectWriter *OW) const; + + virtual void processFixupValue(const MCAssembler &Asm, + const MCAsmLayout &Layout, + const MCFixup &Fixup, const MCFragment *DF, + MCValue &Target, uint64_t &Value, + bool &IsResolved); +}; +} // end anonymous namespace + +void AArch64AsmBackend::processFixupValue(const MCAssembler &Asm, + const MCAsmLayout &Layout, + const MCFixup &Fixup, + const MCFragment *DF, + MCValue &Target, uint64_t &Value, + bool &IsResolved) { + // The ADRP instruction adds some multiple of 0x1000 to the current PC & + // ~0xfff. This means that the required offset to reach a symbol can vary by + // up to one step depending on where the ADRP is in memory. For example: + // + // ADRP x0, there + // there: + // + // If the ADRP occurs at address 0xffc then "there" will be at 0x1000 and + // we'll need that as an offset. At any other address "there" will be in the + // same page as the ADRP and the instruction should encode 0x0. Assuming the + // section isn't 0x1000-aligned, we therefore need to delegate this decision + // to the linker -- a relocation! + if ((uint32_t)Fixup.getKind() == AArch64::fixup_a64_adr_prel_page || + (uint32_t)Fixup.getKind() == AArch64::fixup_a64_adr_prel_got_page || + (uint32_t)Fixup.getKind() == AArch64::fixup_a64_adr_gottprel_page || + (uint32_t)Fixup.getKind() == AArch64::fixup_a64_tlsdesc_adr_page) + IsResolved = false; +} + + +static uint64_t adjustFixupValue(unsigned Kind, uint64_t Value); + +namespace { + +class ELFAArch64AsmBackend : public AArch64AsmBackend { +public: + uint8_t OSABI; + ELFAArch64AsmBackend(const Target &T, const StringRef TT, + uint8_t _OSABI) + : AArch64AsmBackend(T, TT), OSABI(_OSABI) { } + + bool fixupNeedsRelaxation(const MCFixup &Fixup, + uint64_t Value, + const MCRelaxableFragment *DF, + const MCAsmLayout &Layout) const; + + unsigned int getNumFixupKinds() const { + return AArch64::NumTargetFixupKinds; + } + + const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const { + const static MCFixupKindInfo Infos[AArch64::NumTargetFixupKinds] = { +// This table *must* be in the order that the fixup_* kinds are defined in +// AArch64FixupKinds.h. 
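+// A mismatch is not diagnosed anywhere: getFixupKindInfo() indexes this array
+// by (Kind - FirstTargetFixupKind), so an out-of-order entry would silently
+// apply the wrong size or PC-relative flag.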
+// +// Name Offset (bits) Size (bits) Flags +{ "fixup_a64_ld_prel", 0, 32, MCFixupKindInfo::FKF_IsPCRel }, +{ "fixup_a64_adr_prel", 0, 32, MCFixupKindInfo::FKF_IsPCRel }, +{ "fixup_a64_adr_prel_page", 0, 32, MCFixupKindInfo::FKF_IsPCRel }, +{ "fixup_a64_add_lo12", 0, 32, 0 }, +{ "fixup_a64_ldst8_lo12", 0, 32, 0 }, +{ "fixup_a64_ldst16_lo12", 0, 32, 0 }, +{ "fixup_a64_ldst32_lo12", 0, 32, 0 }, +{ "fixup_a64_ldst64_lo12", 0, 32, 0 }, +{ "fixup_a64_ldst128_lo12", 0, 32, 0 }, +{ "fixup_a64_tstbr", 0, 32, MCFixupKindInfo::FKF_IsPCRel }, +{ "fixup_a64_condbr", 0, 32, MCFixupKindInfo::FKF_IsPCRel }, +{ "fixup_a64_uncondbr", 0, 32, MCFixupKindInfo::FKF_IsPCRel }, +{ "fixup_a64_call", 0, 32, MCFixupKindInfo::FKF_IsPCRel }, +{ "fixup_a64_movw_uabs_g0", 0, 32, 0 }, +{ "fixup_a64_movw_uabs_g0_nc", 0, 32, 0 }, +{ "fixup_a64_movw_uabs_g1", 0, 32, 0 }, +{ "fixup_a64_movw_uabs_g1_nc", 0, 32, 0 }, +{ "fixup_a64_movw_uabs_g2", 0, 32, 0 }, +{ "fixup_a64_movw_uabs_g2_nc", 0, 32, 0 }, +{ "fixup_a64_movw_uabs_g3", 0, 32, 0 }, +{ "fixup_a64_movw_sabs_g0", 0, 32, 0 }, +{ "fixup_a64_movw_sabs_g1", 0, 32, 0 }, +{ "fixup_a64_movw_sabs_g2", 0, 32, 0 }, +{ "fixup_a64_adr_prel_got_page", 0, 32, MCFixupKindInfo::FKF_IsPCRel }, +{ "fixup_a64_ld64_got_lo12_nc", 0, 32, 0 }, +{ "fixup_a64_movw_dtprel_g2", 0, 32, 0 }, +{ "fixup_a64_movw_dtprel_g1", 0, 32, 0 }, +{ "fixup_a64_movw_dtprel_g1_nc", 0, 32, 0 }, +{ "fixup_a64_movw_dtprel_g0", 0, 32, 0 }, +{ "fixup_a64_movw_dtprel_g0_nc", 0, 32, 0 }, +{ "fixup_a64_add_dtprel_hi12", 0, 32, 0 }, +{ "fixup_a64_add_dtprel_lo12", 0, 32, 0 }, +{ "fixup_a64_add_dtprel_lo12_nc", 0, 32, 0 }, +{ "fixup_a64_ldst8_dtprel_lo12", 0, 32, 0 }, +{ "fixup_a64_ldst8_dtprel_lo12_nc", 0, 32, 0 }, +{ "fixup_a64_ldst16_dtprel_lo12", 0, 32, 0 }, +{ "fixup_a64_ldst16_dtprel_lo12_nc", 0, 32, 0 }, +{ "fixup_a64_ldst32_dtprel_lo12", 0, 32, 0 }, +{ "fixup_a64_ldst32_dtprel_lo12_nc", 0, 32, 0 }, +{ "fixup_a64_ldst64_dtprel_lo12", 0, 32, 0 }, +{ "fixup_a64_ldst64_dtprel_lo12_nc", 0, 32, 0 }, +{ "fixup_a64_movw_gottprel_g1", 0, 32, 0 }, +{ "fixup_a64_movw_gottprel_g0_nc", 0, 32, 0 }, +{ "fixup_a64_adr_gottprel_page", 0, 32, MCFixupKindInfo::FKF_IsPCRel }, +{ "fixup_a64_ld64_gottprel_lo12_nc", 0, 32, 0 }, +{ "fixup_a64_ld_gottprel_prel19", 0, 32, MCFixupKindInfo::FKF_IsPCRel }, +{ "fixup_a64_movw_tprel_g2", 0, 32, 0 }, +{ "fixup_a64_movw_tprel_g1", 0, 32, 0 }, +{ "fixup_a64_movw_tprel_g1_nc", 0, 32, 0 }, +{ "fixup_a64_movw_tprel_g0", 0, 32, 0 }, +{ "fixup_a64_movw_tprel_g0_nc", 0, 32, 0 }, +{ "fixup_a64_add_tprel_hi12", 0, 32, 0 }, +{ "fixup_a64_add_tprel_lo12", 0, 32, 0 }, +{ "fixup_a64_add_tprel_lo12_nc", 0, 32, 0 }, +{ "fixup_a64_ldst8_tprel_lo12", 0, 32, 0 }, +{ "fixup_a64_ldst8_tprel_lo12_nc", 0, 32, 0 }, +{ "fixup_a64_ldst16_tprel_lo12", 0, 32, 0 }, +{ "fixup_a64_ldst16_tprel_lo12_nc", 0, 32, 0 }, +{ "fixup_a64_ldst32_tprel_lo12", 0, 32, 0 }, +{ "fixup_a64_ldst32_tprel_lo12_nc", 0, 32, 0 }, +{ "fixup_a64_ldst64_tprel_lo12", 0, 32, 0 }, +{ "fixup_a64_ldst64_tprel_lo12_nc", 0, 32, 0 }, +{ "fixup_a64_tlsdesc_adr_page", 0, 32, MCFixupKindInfo::FKF_IsPCRel }, +{ "fixup_a64_tlsdesc_ld64_lo12_nc", 0, 32, 0 }, +{ "fixup_a64_tlsdesc_add_lo12_nc", 0, 32, 0 }, +{ "fixup_a64_tlsdesc_call", 0, 0, 0 } + }; + if (Kind < FirstTargetFixupKind) + return MCAsmBackend::getFixupKindInfo(Kind); + + assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() && + "Invalid kind!"); + return Infos[Kind - FirstTargetFixupKind]; + } + + void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, + uint64_t Value) const 
{ + unsigned NumBytes = getFixupKindInfo(Fixup.getKind()).TargetSize / 8; + Value = adjustFixupValue(Fixup.getKind(), Value); + if (!Value) return; // Doesn't change encoding. + + unsigned Offset = Fixup.getOffset(); + assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!"); + + // For each byte of the fragment that the fixup touches, mask in the bits + // from the fixup value. + for (unsigned i = 0; i != NumBytes; ++i) { + Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff); + } + } + + bool mayNeedRelaxation(const MCInst&) const { + return false; + } + + void relaxInstruction(const MCInst&, llvm::MCInst&) const { + llvm_unreachable("Cannot relax instructions"); + } + + MCObjectWriter *createObjectWriter(raw_ostream &OS) const { + return createAArch64ELFObjectWriter(OS, OSABI); + } +}; + +} // end anonymous namespace + +bool +ELFAArch64AsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup, + uint64_t Value, + const MCRelaxableFragment *DF, + const MCAsmLayout &Layout) const { + // Correct for now. With all instructions 32-bit only very low-level + // considerations could make you select something which may fail. + return false; +} + + +bool AArch64AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const { + // Can't emit NOP with size not multiple of 32-bits + if (Count % 4 != 0) + return false; + + uint64_t NumNops = Count / 4; + for (uint64_t i = 0; i != NumNops; ++i) + OW->Write32(0xd503201f); + + return true; +} + +static unsigned ADRImmBits(unsigned Value) { + unsigned lo2 = Value & 0x3; + unsigned hi19 = (Value & 0x1fffff) >> 2; + + return (hi19 << 5) | (lo2 << 29); +} + +static uint64_t adjustFixupValue(unsigned Kind, uint64_t Value) { + switch (Kind) { + default: + llvm_unreachable("Unknown fixup kind!"); + case FK_Data_2: + assert((int64_t)Value >= -32768 && + (int64_t)Value <= 65536 && + "Out of range ABS16 fixup"); + return Value; + case FK_Data_4: + assert((int64_t)Value >= -(1LL << 31) && + (int64_t)Value <= (1LL << 32) - 1 && + "Out of range ABS32 fixup"); + return Value; + case FK_Data_8: + return Value; + + case AArch64::fixup_a64_ld_gottprel_prel19: + // R_AARCH64_LD_GOTTPREL_PREL19: Set a load-literal immediate to bits 1F + // FFFC of G(TPREL(S+A)) - P; check -2^20 <= X < 2^20. + case AArch64::fixup_a64_ld_prel: + // R_AARCH64_LD_PREL_LO19: Sets a load-literal (immediate) value to bits + // 1F FFFC of S+A-P, checking that -2^20 <= S+A-P < 2^20. + assert((int64_t)Value >= -(1LL << 20) && + (int64_t)Value < (1LL << 20) && "Out of range LDR (lit) fixup"); + return (Value & 0x1ffffc) << 3; + + case AArch64::fixup_a64_adr_prel: + // R_AARCH64_ADR_PREL_LO21: Sets an ADR immediate value to bits 1F FFFF of + // the result of S+A-P, checking that -2^20 <= S+A-P < 2^20. + assert((int64_t)Value >= -(1LL << 20) && + (int64_t)Value < (1LL << 20) && "Out of range ADR fixup"); + return ADRImmBits(Value & 0x1fffff); + + case AArch64::fixup_a64_adr_prel_page: + // R_AARCH64_ADR_PREL_PG_HI21: Sets an ADRP immediate value to bits 1 FFFF + // F000 of the result of the operation, checking that -2^32 <= result < + // 2^32. + assert((int64_t)Value >= -(1LL << 32) && + (int64_t)Value < (1LL << 32) && "Out of range ADRP fixup"); + return ADRImmBits((Value & 0x1fffff000ULL) >> 12); + + case AArch64::fixup_a64_add_dtprel_hi12: + // R_AARCH64_TLSLD_ADD_DTPREL_LO12: Set an ADD immediate field to bits + // FF F000 of DTPREL(S+A), check 0 <= X < 2^24. 
+ case AArch64::fixup_a64_add_tprel_hi12: + // R_AARCH64_TLSLD_ADD_TPREL_LO12: Set an ADD immediate field to bits + // FF F000 of TPREL(S+A), check 0 <= X < 2^24. + assert((int64_t)Value >= 0 && + (int64_t)Value < (1LL << 24) && "Out of range ADD fixup"); + return (Value & 0xfff000) >> 2; + + case AArch64::fixup_a64_add_dtprel_lo12: + // R_AARCH64_TLSLD_ADD_DTPREL_LO12: Set an ADD immediate field to bits + // FFF of DTPREL(S+A), check 0 <= X < 2^12. + case AArch64::fixup_a64_add_tprel_lo12: + // R_AARCH64_TLSLD_ADD_TPREL_LO12: Set an ADD immediate field to bits + // FFF of TPREL(S+A), check 0 <= X < 2^12. + assert((int64_t)Value >= 0 && + (int64_t)Value < (1LL << 12) && "Out of range ADD fixup"); + // ... fallthrough to no-checking versions ... + case AArch64::fixup_a64_add_dtprel_lo12_nc: + // R_AARCH64_TLSLD_ADD_DTPREL_LO12_NC: Set an ADD immediate field to bits + // FFF of DTPREL(S+A) with no overflow check. + case AArch64::fixup_a64_add_tprel_lo12_nc: + // R_AARCH64_TLSLD_ADD_TPREL_LO12_NC: Set an ADD immediate field to bits + // FFF of TPREL(S+A) with no overflow check. + case AArch64::fixup_a64_tlsdesc_add_lo12_nc: + // R_AARCH64_TLSDESC_ADD_LO12_NC: Set an ADD immediate field to bits + // FFF of G(TLSDESC(S+A)), with no overflow check. + case AArch64::fixup_a64_add_lo12: + // R_AARCH64_ADD_ABS_LO12_NC: Sets an ADD immediate value to bits FFF of + // S+A, with no overflow check. + return (Value & 0xfff) << 10; + + case AArch64::fixup_a64_ldst8_dtprel_lo12: + // R_AARCH64_TLSLD_LDST8_DTPREL_LO12: Set an LD/ST offset field to bits FFF + // of DTPREL(S+A), check 0 <= X < 2^12. + case AArch64::fixup_a64_ldst8_tprel_lo12: + // R_AARCH64_TLSLE_LDST8_TPREL_LO12: Set an LD/ST offset field to bits FFF + // of DTPREL(S+A), check 0 <= X < 2^12. + assert((int64_t) Value >= 0 && + (int64_t) Value < (1LL << 12) && "Out of range LD/ST fixup"); + // ... fallthrough to no-checking versions ... + case AArch64::fixup_a64_ldst8_dtprel_lo12_nc: + // R_AARCH64_TLSLD_LDST8_DTPREL_LO12: Set an LD/ST offset field to bits FFF + // of DTPREL(S+A), with no overflow check. + case AArch64::fixup_a64_ldst8_tprel_lo12_nc: + // R_AARCH64_TLSLD_LDST8_TPREL_LO12: Set an LD/ST offset field to bits FFF + // of TPREL(S+A), with no overflow check. + case AArch64::fixup_a64_ldst8_lo12: + // R_AARCH64_LDST8_ABS_LO12_NC: Sets an LD/ST immediate value to bits FFF + // of S+A, with no overflow check. + return (Value & 0xfff) << 10; + + case AArch64::fixup_a64_ldst16_dtprel_lo12: + // R_AARCH64_TLSLD_LDST16_DTPREL_LO12: Set an LD/ST offset field to bits FFE + // of DTPREL(S+A), check 0 <= X < 2^12. + case AArch64::fixup_a64_ldst16_tprel_lo12: + // R_AARCH64_TLSLE_LDST16_TPREL_LO12: Set an LD/ST offset field to bits FFE + // of DTPREL(S+A), check 0 <= X < 2^12. + assert((int64_t) Value >= 0 && + (int64_t) Value < (1LL << 12) && "Out of range LD/ST fixup"); + // ... fallthrough to no-checking versions ... + case AArch64::fixup_a64_ldst16_dtprel_lo12_nc: + // R_AARCH64_TLSLD_LDST16_DTPREL_LO12: Set an LD/ST offset field to bits FFE + // of DTPREL(S+A), with no overflow check. + case AArch64::fixup_a64_ldst16_tprel_lo12_nc: + // R_AARCH64_TLSLD_LDST16_TPREL_LO12: Set an LD/ST offset field to bits FFE + // of TPREL(S+A), with no overflow check. + case AArch64::fixup_a64_ldst16_lo12: + // R_AARCH64_LDST16_ABS_LO12_NC: Sets an LD/ST immediate value to bits FFE + // of S+A, with no overflow check. 
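+    // The imm12 field occupies bits 21:10 of the instruction and is scaled by
+    // the access size, so a 16-bit access shifts the byte offset right by 1
+    // (the scale) and left by 10 (the field position): a net left shift of 9.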
+ return (Value & 0xffe) << 9; + + case AArch64::fixup_a64_ldst32_dtprel_lo12: + // R_AARCH64_TLSLD_LDST32_DTPREL_LO12: Set an LD/ST offset field to bits FFC + // of DTPREL(S+A), check 0 <= X < 2^12. + case AArch64::fixup_a64_ldst32_tprel_lo12: + // R_AARCH64_TLSLE_LDST32_TPREL_LO12: Set an LD/ST offset field to bits FFC + // of DTPREL(S+A), check 0 <= X < 2^12. + assert((int64_t) Value >= 0 && + (int64_t) Value < (1LL << 12) && "Out of range LD/ST fixup"); + // ... fallthrough to no-checking versions ... + case AArch64::fixup_a64_ldst32_dtprel_lo12_nc: + // R_AARCH64_TLSLD_LDST32_DTPREL_LO12: Set an LD/ST offset field to bits FFC + // of DTPREL(S+A), with no overflow check. + case AArch64::fixup_a64_ldst32_tprel_lo12_nc: + // R_AARCH64_TLSLD_LDST32_TPREL_LO12: Set an LD/ST offset field to bits FFC + // of TPREL(S+A), with no overflow check. + case AArch64::fixup_a64_ldst32_lo12: + // R_AARCH64_LDST32_ABS_LO12_NC: Sets an LD/ST immediate value to bits FFC + // of S+A, with no overflow check. + return (Value & 0xffc) << 8; + + case AArch64::fixup_a64_ldst64_dtprel_lo12: + // R_AARCH64_TLSLD_LDST64_DTPREL_LO12: Set an LD/ST offset field to bits FF8 + // of DTPREL(S+A), check 0 <= X < 2^12. + case AArch64::fixup_a64_ldst64_tprel_lo12: + // R_AARCH64_TLSLE_LDST64_TPREL_LO12: Set an LD/ST offset field to bits FF8 + // of DTPREL(S+A), check 0 <= X < 2^12. + assert((int64_t) Value >= 0 && + (int64_t) Value < (1LL << 12) && "Out of range LD/ST fixup"); + // ... fallthrough to no-checking versions ... + case AArch64::fixup_a64_ldst64_dtprel_lo12_nc: + // R_AARCH64_TLSLD_LDST64_DTPREL_LO12: Set an LD/ST offset field to bits FF8 + // of DTPREL(S+A), with no overflow check. + case AArch64::fixup_a64_ldst64_tprel_lo12_nc: + // R_AARCH64_TLSLD_LDST64_TPREL_LO12: Set an LD/ST offset field to bits FF8 + // of TPREL(S+A), with no overflow check. + case AArch64::fixup_a64_ldst64_lo12: + // R_AARCH64_LDST64_ABS_LO12_NC: Sets an LD/ST immediate value to bits FF8 + // of S+A, with no overflow check. + return (Value & 0xff8) << 7; + + case AArch64::fixup_a64_ldst128_lo12: + // R_AARCH64_LDST128_ABS_LO12_NC: Sets an LD/ST immediate value to bits FF0 + // of S+A, with no overflow check. + return (Value & 0xff0) << 6; + + case AArch64::fixup_a64_movw_uabs_g0: + // R_AARCH64_MOVW_UABS_G0: Sets a MOVZ immediate field to bits FFFF of S+A + // with a check that S+A < 2^16 + assert(Value <= 0xffff && "Out of range move wide fixup"); + return (Value & 0xffff) << 5; + + case AArch64::fixup_a64_movw_dtprel_g0_nc: + // R_AARCH64_TLSLD_MOVW_DTPREL_G0_NC: Sets a MOVK immediate field to bits + // FFFF of DTPREL(S+A) with no overflow check. + case AArch64::fixup_a64_movw_gottprel_g0_nc: + // R_AARCH64_TLSIE_MOVW_GOTTPREL_G0_NC: Sets a MOVK immediate field to bits + // FFFF of G(TPREL(S+A)) - GOT with no overflow check. + case AArch64::fixup_a64_movw_tprel_g0_nc: + // R_AARCH64_TLSLE_MOVW_TPREL_G0_NC: Sets a MOVK immediate field to bits + // FFFF of TPREL(S+A) with no overflow check. + case AArch64::fixup_a64_movw_uabs_g0_nc: + // R_AARCH64_MOVW_UABS_G0_NC: Sets a MOVK immediate field to bits FFFF of + // S+A with no overflow check. 
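+    // The imm16 field of the MOVZ/MOVK family occupies bits 20:5 of the
+    // instruction, hence the shift left by 5.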
+ return (Value & 0xffff) << 5; + + case AArch64::fixup_a64_movw_uabs_g1: + // R_AARCH64_MOVW_UABS_G1: Sets a MOVZ immediate field to bits FFFF0000 of + // S+A with a check that S+A < 2^32 + assert(Value <= 0xffffffffull && "Out of range move wide fixup"); + return ((Value >> 16) & 0xffff) << 5; + + case AArch64::fixup_a64_movw_dtprel_g1_nc: + // R_AARCH64_TLSLD_MOVW_DTPREL_G1_NC: Set a MOVK immediate field + // to bits FFFF0000 of DTPREL(S+A), with no overflow check. + case AArch64::fixup_a64_movw_tprel_g1_nc: + // R_AARCH64_TLSLD_MOVW_TPREL_G1_NC: Set a MOVK immediate field + // to bits FFFF0000 of TPREL(S+A), with no overflow check. + case AArch64::fixup_a64_movw_uabs_g1_nc: + // R_AARCH64_MOVW_UABS_G1_NC: Sets a MOVK immediate field to bits + // FFFF0000 of S+A with no overflow check. + return ((Value >> 16) & 0xffff) << 5; + + case AArch64::fixup_a64_movw_uabs_g2: + // R_AARCH64_MOVW_UABS_G2: Sets a MOVZ immediate field to bits FFFF 0000 + // 0000 of S+A with a check that S+A < 2^48 + assert(Value <= 0xffffffffffffull && "Out of range move wide fixup"); + return ((Value >> 32) & 0xffff) << 5; + + case AArch64::fixup_a64_movw_uabs_g2_nc: + // R_AARCH64_MOVW_UABS_G2: Sets a MOVK immediate field to bits FFFF 0000 + // 0000 of S+A with no overflow check. + return ((Value >> 32) & 0xffff) << 5; + + case AArch64::fixup_a64_movw_uabs_g3: + // R_AARCH64_MOVW_UABS_G3: Sets a MOVZ immediate field to bits FFFF 0000 + // 0000 0000 of S+A (no overflow check needed) + return ((Value >> 48) & 0xffff) << 5; + + case AArch64::fixup_a64_movw_dtprel_g0: + // R_AARCH64_TLSLD_MOVW_DTPREL_G0: Set a MOV[NZ] immediate field + // to bits FFFF of DTPREL(S+A). + case AArch64::fixup_a64_movw_tprel_g0: + // R_AARCH64_TLSLE_MOVW_TPREL_G0: Set a MOV[NZ] immediate field to + // bits FFFF of TPREL(S+A). + case AArch64::fixup_a64_movw_sabs_g0: { + // R_AARCH64_MOVW_SABS_G0: Sets MOV[NZ] immediate field using bits FFFF of + // S+A (see notes below); check -2^16 <= S+A < 2^16. (notes say that we + // should convert between MOVN and MOVZ to achieve our goals). + int64_t Signed = Value; + assert(Signed >= -(1LL << 16) && Signed < (1LL << 16) + && "Out of range move wide fixup"); + if (Signed >= 0) { + Value = (Value & 0xffff) << 5; + // Bit 30 converts the MOVN encoding into a MOVZ + Value |= 1 << 30; + } else { + // MCCodeEmitter should have encoded a MOVN, which is fine. + Value = (~Value & 0xffff) << 5; + } + return Value; + } + + case AArch64::fixup_a64_movw_dtprel_g1: + // R_AARCH64_TLSLD_MOVW_DTPREL_G1: Set a MOV[NZ] immediate field + // to bits FFFF0000 of DTPREL(S+A). + case AArch64::fixup_a64_movw_gottprel_g1: + // R_AARCH64_TLSIE_MOVW_GOTTPREL_G1: Set a MOV[NZ] immediate field + // to bits FFFF0000 of G(TPREL(S+A)) - GOT. + case AArch64::fixup_a64_movw_tprel_g1: + // R_AARCH64_TLSLE_MOVW_TPREL_G1: Set a MOV[NZ] immediate field to + // bits FFFF0000 of TPREL(S+A). + case AArch64::fixup_a64_movw_sabs_g1: { + // R_AARCH64_MOVW_SABS_G1: Sets MOV[NZ] immediate field using bits FFFF 0000 + // of S+A (see notes below); check -2^32 <= S+A < 2^32. (notes say that we + // should convert between MOVN and MOVZ to achieve our goals). 
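+    // MOVN and MOVZ differ only in bit 30 of their encodings, so these
+    // MOV[NZ] fixups can convert between them: the code emitter conservatively
+    // produced a MOVN, and for a non-negative value the fixup sets bit 30 to
+    // turn it into a MOVZ; for a negative value the halfword is stored
+    // inverted, as MOVN expects.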
+ int64_t Signed = Value; + assert(Signed >= -(1LL << 32) && Signed < (1LL << 32) + && "Out of range move wide fixup"); + if (Signed >= 0) { + Value = ((Value >> 16) & 0xffff) << 5; + // Bit 30 converts the MOVN encoding into a MOVZ + Value |= 1 << 30; + } else { + Value = ((~Value >> 16) & 0xffff) << 5; + } + return Value; + } + + case AArch64::fixup_a64_movw_dtprel_g2: + // R_AARCH64_TLSLD_MOVW_DTPREL_G2: Set a MOV[NZ] immediate field + // to bits FFFF 0000 0000 of DTPREL(S+A). + case AArch64::fixup_a64_movw_tprel_g2: + // R_AARCH64_TLSLE_MOVW_TPREL_G2: Set a MOV[NZ] immediate field to + // bits FFFF 0000 0000 of TPREL(S+A). + case AArch64::fixup_a64_movw_sabs_g2: { + // R_AARCH64_MOVW_SABS_G2: Sets MOV[NZ] immediate field using bits FFFF 0000 + // 0000 of S+A (see notes below); check -2^48 <= S+A < 2^48. (notes say that + // we should convert between MOVN and MOVZ to achieve our goals). + int64_t Signed = Value; + assert(Signed >= -(1LL << 48) && Signed < (1LL << 48) + && "Out of range move wide fixup"); + if (Signed >= 0) { + Value = ((Value >> 32) & 0xffff) << 5; + // Bit 30 converts the MOVN encoding into a MOVZ + Value |= 1 << 30; + } else { + Value = ((~Value >> 32) & 0xffff) << 5; + } + return Value; + } + + case AArch64::fixup_a64_tstbr: + // R_AARCH64_TSTBR14: Sets the immediate field of a TBZ/TBNZ instruction to + // bits FFFC of S+A-P, checking -2^15 <= S+A-P < 2^15. + assert((int64_t)Value >= -(1LL << 15) && + (int64_t)Value < (1LL << 15) && "Out of range TBZ/TBNZ fixup"); + return (Value & 0xfffc) << (5 - 2); + + case AArch64::fixup_a64_condbr: + // R_AARCH64_CONDBR19: Sets the immediate field of a conditional branch + // instruction to bits 1FFFFC of S+A-P, checking -2^20 <= S+A-P < 2^20. + assert((int64_t)Value >= -(1LL << 20) && + (int64_t)Value < (1LL << 20) && "Out of range B.cond fixup"); + return (Value & 0x1ffffc) << (5 - 2); + + case AArch64::fixup_a64_uncondbr: + // R_AARCH64_JUMP26 same as below (except to a linker, possibly). + case AArch64::fixup_a64_call: + // R_AARCH64_CALL26: Sets a CALL immediate field to bits FFFFFFC of S+A-P, + // checking that -2^27 <= S+A-P < 2^27. + assert((int64_t)Value >= -(1LL << 27) && + (int64_t)Value < (1LL << 27) && "Out of range branch fixup"); + return (Value & 0xffffffc) >> 2; + + case AArch64::fixup_a64_adr_gottprel_page: + // R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21: Set an ADRP immediate field to bits + // 1FFFFF000 of Page(G(TPREL(S+A))) - Page(P); check -2^32 <= X < 2^32. + case AArch64::fixup_a64_tlsdesc_adr_page: + // R_AARCH64_TLSDESC_ADR_PAGE: Set an ADRP immediate field to bits 1FFFFF000 + // of Page(G(TLSDESC(S+A))) - Page(P); check -2^32 <= X < 2^32. + case AArch64::fixup_a64_adr_prel_got_page: + // R_AARCH64_ADR_GOT_PAGE: Sets the immediate value of an ADRP to bits + // 1FFFFF000 of the operation, checking that -2^32 < Page(G(S))-Page(GOT) < + // 2^32. + assert((int64_t)Value >= -(1LL << 32) && + (int64_t)Value < (1LL << 32) && "Out of range ADRP fixup"); + return ADRImmBits((Value & 0x1fffff000ULL) >> 12); + + case AArch64::fixup_a64_ld64_gottprel_lo12_nc: + // R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC: Set an LD offset field to bits FF8 + // of X, with no overflow check. Check that X & 7 == 0. + case AArch64::fixup_a64_tlsdesc_ld64_lo12_nc: + // R_AARCH64_TLSDESC_LD64_LO12_NC: Set an LD offset field to bits FF8 of + // G(TLSDESC(S+A)), with no overflow check. Check that X & 7 == 0. 
+ case AArch64::fixup_a64_ld64_got_lo12_nc: + // R_AARCH64_LD64_GOT_LO12_NC: Sets the LD/ST immediate field to bits FF8 of + // G(S) with no overflow check. Check X & 7 == 0 + assert(((int64_t)Value & 7) == 0 && "Misaligned fixup"); + return (Value & 0xff8) << 7; + + case AArch64::fixup_a64_tlsdesc_call: + // R_AARCH64_TLSDESC_CALL: For relaxation only. + return 0; + } +} + +MCAsmBackend * +llvm::createAArch64AsmBackend(const Target &T, StringRef TT, StringRef CPU) { + Triple TheTriple(TT); + + return new ELFAArch64AsmBackend(T, TT, TheTriple.getOS()); +} diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp new file mode 100644 index 0000000..4bcc65d --- /dev/null +++ b/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp @@ -0,0 +1,292 @@ +//===-- AArch64ELFObjectWriter.cpp - AArch64 ELF Writer -------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file handles ELF-specific object emission, converting LLVM's internal +// fixups into the appropriate relocations. +// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/AArch64FixupKinds.h" +#include "MCTargetDesc/AArch64MCTargetDesc.h" +#include "llvm/MC/MCELFObjectWriter.h" +#include "llvm/MC/MCValue.h" +#include "llvm/Support/ErrorHandling.h" + +using namespace llvm; + +namespace { +class AArch64ELFObjectWriter : public MCELFObjectTargetWriter { +public: + AArch64ELFObjectWriter(uint8_t OSABI); + + virtual ~AArch64ELFObjectWriter(); + +protected: + virtual unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup, + bool IsPCRel, bool IsRelocWithSymbol, + int64_t Addend) const; +private: +}; +} + +AArch64ELFObjectWriter::AArch64ELFObjectWriter(uint8_t OSABI) + : MCELFObjectTargetWriter(/*Is64Bit*/ true, OSABI, ELF::EM_AARCH64, + /*HasRelocationAddend*/ true) +{} + +AArch64ELFObjectWriter::~AArch64ELFObjectWriter() +{} + +unsigned AArch64ELFObjectWriter::GetRelocType(const MCValue &Target, + const MCFixup &Fixup, + bool IsPCRel, + bool IsRelocWithSymbol, + int64_t Addend) const { + unsigned Type; + if (IsPCRel) { + switch ((unsigned)Fixup.getKind()) { + default: + llvm_unreachable("Unimplemented fixup -> relocation"); + case FK_Data_8: + return ELF::R_AARCH64_PREL64; + case FK_Data_4: + return ELF::R_AARCH64_PREL32; + case FK_Data_2: + return ELF::R_AARCH64_PREL16; + case AArch64::fixup_a64_ld_prel: + Type = ELF::R_AARCH64_LD_PREL_LO19; + break; + case AArch64::fixup_a64_adr_prel: + Type = ELF::R_AARCH64_ADR_PREL_LO21; + break; + case AArch64::fixup_a64_adr_prel_page: + Type = ELF::R_AARCH64_ADR_PREL_PG_HI21; + break; + case AArch64::fixup_a64_adr_prel_got_page: + Type = ELF::R_AARCH64_ADR_GOT_PAGE; + break; + case AArch64::fixup_a64_tstbr: + Type = ELF::R_AARCH64_TSTBR14; + break; + case AArch64::fixup_a64_condbr: + Type = ELF::R_AARCH64_CONDBR19; + break; + case AArch64::fixup_a64_uncondbr: + Type = ELF::R_AARCH64_JUMP26; + break; + case AArch64::fixup_a64_call: + Type = ELF::R_AARCH64_CALL26; + break; + case AArch64::fixup_a64_adr_gottprel_page: + Type = ELF::R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21; + break; + case AArch64::fixup_a64_ld_gottprel_prel19: + Type = ELF::R_AARCH64_TLSIE_LD_GOTTPREL_PREL19; + break; + case AArch64::fixup_a64_tlsdesc_adr_page: + Type = 
ELF::R_AARCH64_TLSDESC_ADR_PAGE; + break; + } + } else { + switch ((unsigned)Fixup.getKind()) { + default: + llvm_unreachable("Unimplemented fixup -> relocation"); + case FK_Data_8: + return ELF::R_AARCH64_ABS64; + case FK_Data_4: + return ELF::R_AARCH64_ABS32; + case FK_Data_2: + return ELF::R_AARCH64_ABS16; + case AArch64::fixup_a64_add_lo12: + Type = ELF::R_AARCH64_ADD_ABS_LO12_NC; + break; + case AArch64::fixup_a64_ld64_got_lo12_nc: + Type = ELF::R_AARCH64_LD64_GOT_LO12_NC; + break; + case AArch64::fixup_a64_ldst8_lo12: + Type = ELF::R_AARCH64_LDST8_ABS_LO12_NC; + break; + case AArch64::fixup_a64_ldst16_lo12: + Type = ELF::R_AARCH64_LDST16_ABS_LO12_NC; + break; + case AArch64::fixup_a64_ldst32_lo12: + Type = ELF::R_AARCH64_LDST32_ABS_LO12_NC; + break; + case AArch64::fixup_a64_ldst64_lo12: + Type = ELF::R_AARCH64_LDST64_ABS_LO12_NC; + break; + case AArch64::fixup_a64_ldst128_lo12: + Type = ELF::R_AARCH64_LDST128_ABS_LO12_NC; + break; + case AArch64::fixup_a64_movw_uabs_g0: + Type = ELF::R_AARCH64_MOVW_UABS_G0; + break; + case AArch64::fixup_a64_movw_uabs_g0_nc: + Type = ELF::R_AARCH64_MOVW_UABS_G0_NC; + break; + case AArch64::fixup_a64_movw_uabs_g1: + Type = ELF::R_AARCH64_MOVW_UABS_G1; + break; + case AArch64::fixup_a64_movw_uabs_g1_nc: + Type = ELF::R_AARCH64_MOVW_UABS_G1_NC; + break; + case AArch64::fixup_a64_movw_uabs_g2: + Type = ELF::R_AARCH64_MOVW_UABS_G2; + break; + case AArch64::fixup_a64_movw_uabs_g2_nc: + Type = ELF::R_AARCH64_MOVW_UABS_G2_NC; + break; + case AArch64::fixup_a64_movw_uabs_g3: + Type = ELF::R_AARCH64_MOVW_UABS_G3; + break; + case AArch64::fixup_a64_movw_sabs_g0: + Type = ELF::R_AARCH64_MOVW_SABS_G0; + break; + case AArch64::fixup_a64_movw_sabs_g1: + Type = ELF::R_AARCH64_MOVW_SABS_G1; + break; + case AArch64::fixup_a64_movw_sabs_g2: + Type = ELF::R_AARCH64_MOVW_SABS_G2; + break; + + // TLS Local-dynamic block + case AArch64::fixup_a64_movw_dtprel_g2: + Type = ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G2; + break; + case AArch64::fixup_a64_movw_dtprel_g1: + Type = ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G1; + break; + case AArch64::fixup_a64_movw_dtprel_g1_nc: + Type = ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G1_NC; + break; + case AArch64::fixup_a64_movw_dtprel_g0: + Type = ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G0; + break; + case AArch64::fixup_a64_movw_dtprel_g0_nc: + Type = ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G0_NC; + break; + case AArch64::fixup_a64_add_dtprel_hi12: + Type = ELF::R_AARCH64_TLSLD_ADD_DTPREL_HI12; + break; + case AArch64::fixup_a64_add_dtprel_lo12: + Type = ELF::R_AARCH64_TLSLD_ADD_DTPREL_LO12; + break; + case AArch64::fixup_a64_add_dtprel_lo12_nc: + Type = ELF::R_AARCH64_TLSLD_ADD_DTPREL_LO12_NC; + break; + case AArch64::fixup_a64_ldst8_dtprel_lo12: + Type = ELF::R_AARCH64_TLSLD_LDST8_DTPREL_LO12; + break; + case AArch64::fixup_a64_ldst8_dtprel_lo12_nc: + Type = ELF::R_AARCH64_TLSLD_LDST8_DTPREL_LO12_NC; + break; + case AArch64::fixup_a64_ldst16_dtprel_lo12: + Type = ELF::R_AARCH64_TLSLD_LDST16_DTPREL_LO12; + break; + case AArch64::fixup_a64_ldst16_dtprel_lo12_nc: + Type = ELF::R_AARCH64_TLSLD_LDST16_DTPREL_LO12_NC; + break; + case AArch64::fixup_a64_ldst32_dtprel_lo12: + Type = ELF::R_AARCH64_TLSLD_LDST32_DTPREL_LO12; + break; + case AArch64::fixup_a64_ldst32_dtprel_lo12_nc: + Type = ELF::R_AARCH64_TLSLD_LDST32_DTPREL_LO12_NC; + break; + case AArch64::fixup_a64_ldst64_dtprel_lo12: + Type = ELF::R_AARCH64_TLSLD_LDST64_DTPREL_LO12; + break; + case AArch64::fixup_a64_ldst64_dtprel_lo12_nc: + Type = ELF::R_AARCH64_TLSLD_LDST64_DTPREL_LO12_NC; + break; + + // TLS initial-exec 
block + case AArch64::fixup_a64_movw_gottprel_g1: + Type = ELF::R_AARCH64_TLSIE_MOVW_GOTTPREL_G1; + break; + case AArch64::fixup_a64_movw_gottprel_g0_nc: + Type = ELF::R_AARCH64_TLSIE_MOVW_GOTTPREL_G0_NC; + break; + case AArch64::fixup_a64_ld64_gottprel_lo12_nc: + Type = ELF::R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC; + break; + + // TLS local-exec block + case AArch64::fixup_a64_movw_tprel_g2: + Type = ELF::R_AARCH64_TLSLE_MOVW_TPREL_G2; + break; + case AArch64::fixup_a64_movw_tprel_g1: + Type = ELF::R_AARCH64_TLSLE_MOVW_TPREL_G1; + break; + case AArch64::fixup_a64_movw_tprel_g1_nc: + Type = ELF::R_AARCH64_TLSLE_MOVW_TPREL_G1_NC; + break; + case AArch64::fixup_a64_movw_tprel_g0: + Type = ELF::R_AARCH64_TLSLE_MOVW_TPREL_G0; + break; + case AArch64::fixup_a64_movw_tprel_g0_nc: + Type = ELF::R_AARCH64_TLSLE_MOVW_TPREL_G0_NC; + break; + case AArch64::fixup_a64_add_tprel_hi12: + Type = ELF::R_AARCH64_TLSLE_ADD_TPREL_HI12; + break; + case AArch64::fixup_a64_add_tprel_lo12: + Type = ELF::R_AARCH64_TLSLE_ADD_TPREL_LO12; + break; + case AArch64::fixup_a64_add_tprel_lo12_nc: + Type = ELF::R_AARCH64_TLSLE_ADD_TPREL_LO12_NC; + break; + case AArch64::fixup_a64_ldst8_tprel_lo12: + Type = ELF::R_AARCH64_TLSLE_LDST8_TPREL_LO12; + break; + case AArch64::fixup_a64_ldst8_tprel_lo12_nc: + Type = ELF::R_AARCH64_TLSLE_LDST8_TPREL_LO12_NC; + break; + case AArch64::fixup_a64_ldst16_tprel_lo12: + Type = ELF::R_AARCH64_TLSLE_LDST16_TPREL_LO12; + break; + case AArch64::fixup_a64_ldst16_tprel_lo12_nc: + Type = ELF::R_AARCH64_TLSLE_LDST16_TPREL_LO12_NC; + break; + case AArch64::fixup_a64_ldst32_tprel_lo12: + Type = ELF::R_AARCH64_TLSLE_LDST32_TPREL_LO12; + break; + case AArch64::fixup_a64_ldst32_tprel_lo12_nc: + Type = ELF::R_AARCH64_TLSLE_LDST32_TPREL_LO12_NC; + break; + case AArch64::fixup_a64_ldst64_tprel_lo12: + Type = ELF::R_AARCH64_TLSLE_LDST64_TPREL_LO12; + break; + case AArch64::fixup_a64_ldst64_tprel_lo12_nc: + Type = ELF::R_AARCH64_TLSLE_LDST64_TPREL_LO12_NC; + break; + + // TLS general-dynamic block + case AArch64::fixup_a64_tlsdesc_adr_page: + Type = ELF::R_AARCH64_TLSDESC_ADR_PAGE; + break; + case AArch64::fixup_a64_tlsdesc_ld64_lo12_nc: + Type = ELF::R_AARCH64_TLSDESC_LD64_LO12_NC; + break; + case AArch64::fixup_a64_tlsdesc_add_lo12_nc: + Type = ELF::R_AARCH64_TLSDESC_ADD_LO12_NC; + break; + case AArch64::fixup_a64_tlsdesc_call: + Type = ELF::R_AARCH64_TLSDESC_CALL; + break; + } + } + + return Type; +} + +MCObjectWriter *llvm::createAArch64ELFObjectWriter(raw_ostream &OS, + uint8_t OSABI) { + MCELFObjectTargetWriter *MOTW = new AArch64ELFObjectWriter(OSABI); + return createELFObjectWriter(MOTW, OS, /*IsLittleEndian=*/true); +} diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp new file mode 100644 index 0000000..b83577a --- /dev/null +++ b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp @@ -0,0 +1,160 @@ +//===- lib/MC/AArch64ELFStreamer.cpp - ELF Object Output for AArch64 ------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file assembles .s files and emits AArch64 ELF .o object files. Different +// from generic ELF streamer in emitting mapping symbols ($x and $d) to delimit +// regions of data and code. 
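+//
+// Linkers and disassemblers rely on these symbols to distinguish A64
+// instructions from inline data, since ELF itself records no such boundary
+// within a section.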
+// +//===----------------------------------------------------------------------===// + +#include "llvm/MC/MCELFStreamer.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/Twine.h" +#include "llvm/MC/MCAsmBackend.h" +#include "llvm/MC/MCAssembler.h" +#include "llvm/MC/MCCodeEmitter.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCELF.h" +#include "llvm/MC/MCELFStreamer.h" +#include "llvm/MC/MCELFSymbolFlags.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCObjectStreamer.h" +#include "llvm/MC/MCSection.h" +#include "llvm/MC/MCSectionELF.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/MC/MCValue.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ELF.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +namespace { + +/// Extend the generic ELFStreamer class so that it can emit mapping symbols at +/// the appropriate points in the object files. These symbols are defined in the +/// AArch64 ELF ABI: +/// infocenter.arm.com/help/topic/com.arm.doc.ihi0056a/IHI0056A_aaelf64.pdf +/// +/// In brief: $x or $d should be emitted at the start of each contiguous region +/// of A64 code or data in a section. In practice, this emission does not rely +/// on explicit assembler directives but on inherent properties of the +/// directives doing the emission (e.g. ".byte" is data, "add x0, x0, x0" an +/// instruction). +/// +/// As a result this system is orthogonal to the DataRegion infrastructure used +/// by MachO. Beware! +class AArch64ELFStreamer : public MCELFStreamer { +public: + AArch64ELFStreamer(MCContext &Context, MCAsmBackend &TAB, + raw_ostream &OS, MCCodeEmitter *Emitter) + : MCELFStreamer(Context, TAB, OS, Emitter), + MappingSymbolCounter(0), LastEMS(EMS_None) { + } + + ~AArch64ELFStreamer() {} + + virtual void ChangeSection(const MCSection *Section) { + // We have to keep track of the mapping symbol state of any sections we + // use. Each one should start off as EMS_None, which is provided as the + // default constructor by DenseMap::lookup. + LastMappingSymbols[getPreviousSection()] = LastEMS; + LastEMS = LastMappingSymbols.lookup(Section); + + MCELFStreamer::ChangeSection(Section); + } + + /// This function is the one used to emit instruction data into the ELF + /// streamer. We override it to add the appropriate mapping symbol if + /// necessary. + virtual void EmitInstruction(const MCInst& Inst) { + EmitA64MappingSymbol(); + MCELFStreamer::EmitInstruction(Inst); + } + + /// This is one of the functions used to emit data into an ELF section, so the + /// AArch64 streamer overrides it to add the appropriate mapping symbol ($d) + /// if necessary. + virtual void EmitBytes(StringRef Data, unsigned AddrSpace) { + EmitDataMappingSymbol(); + MCELFStreamer::EmitBytes(Data, AddrSpace); + } + + /// This is one of the functions used to emit data into an ELF section, so the + /// AArch64 streamer overrides it to add the appropriate mapping symbol ($d) + /// if necessary. 
+ virtual void EmitValueImpl(const MCExpr *Value, unsigned Size, + unsigned AddrSpace) { + EmitDataMappingSymbol(); + MCELFStreamer::EmitValueImpl(Value, Size, AddrSpace); + } + +private: + enum ElfMappingSymbol { + EMS_None, + EMS_A64, + EMS_Data + }; + + void EmitDataMappingSymbol() { + if (LastEMS == EMS_Data) return; + EmitMappingSymbol("$d"); + LastEMS = EMS_Data; + } + + void EmitA64MappingSymbol() { + if (LastEMS == EMS_A64) return; + EmitMappingSymbol("$x"); + LastEMS = EMS_A64; + } + + void EmitMappingSymbol(StringRef Name) { + MCSymbol *Start = getContext().CreateTempSymbol(); + EmitLabel(Start); + + MCSymbol *Symbol = + getContext().GetOrCreateSymbol(Name + "." + + Twine(MappingSymbolCounter++)); + + MCSymbolData &SD = getAssembler().getOrCreateSymbolData(*Symbol); + MCELF::SetType(SD, ELF::STT_NOTYPE); + MCELF::SetBinding(SD, ELF::STB_LOCAL); + SD.setExternal(false); + Symbol->setSection(*getCurrentSection()); + + const MCExpr *Value = MCSymbolRefExpr::Create(Start, getContext()); + Symbol->setVariableValue(Value); + } + + int64_t MappingSymbolCounter; + + DenseMap<const MCSection *, ElfMappingSymbol> LastMappingSymbols; + ElfMappingSymbol LastEMS; + + /// @} +}; +} + +namespace llvm { + MCELFStreamer* createAArch64ELFStreamer(MCContext &Context, MCAsmBackend &TAB, + raw_ostream &OS, MCCodeEmitter *Emitter, + bool RelaxAll, bool NoExecStack) { + AArch64ELFStreamer *S = new AArch64ELFStreamer(Context, TAB, OS, Emitter); + if (RelaxAll) + S->getAssembler().setRelaxAll(true); + if (NoExecStack) + S->getAssembler().setNoExecStack(true); + return S; + } +} + + diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.h b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.h new file mode 100644 index 0000000..5a89ca5 --- /dev/null +++ b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.h @@ -0,0 +1,27 @@ +//===-- AArch64ELFStreamer.h - ELF Streamer for AArch64 ---------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements ELF streamer information for the AArch64 backend. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_AARCH64_ELF_STREAMER_H +#define LLVM_AARCH64_ELF_STREAMER_H + +#include "llvm/MC/MCELFStreamer.h" + +namespace llvm { + + MCELFStreamer* createAArch64ELFStreamer(MCContext &Context, MCAsmBackend &TAB, + raw_ostream &OS, + MCCodeEmitter *Emitter, + bool RelaxAll, bool NoExecStack); +} + +#endif // AArch64_ELF_STREAMER_H diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h b/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h new file mode 100644 index 0000000..eeb122d --- /dev/null +++ b/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h @@ -0,0 +1,113 @@ +//=- AArch64/AArch64FixupKinds.h - AArch64 Specific Fixup Entries -*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the LLVM fixups applied to MCInsts in the AArch64 +// backend. 
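+//
+// Each fixup names a symbol-dependent field of an encoded instruction. A
+// fixup is either resolved while the object file is written (adjustFixupValue
+// in AArch64AsmBackend.cpp) or lowered to the corresponding ELF relocation
+// (AArch64ELFObjectWriter.cpp).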
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_AARCH64_AARCH64FIXUPKINDS_H +#define LLVM_AARCH64_AARCH64FIXUPKINDS_H + +#include "llvm/MC/MCFixup.h" + +namespace llvm { + namespace AArch64 { + enum Fixups { + fixup_a64_ld_prel = FirstTargetFixupKind, + fixup_a64_adr_prel, + fixup_a64_adr_prel_page, + + fixup_a64_add_lo12, + + fixup_a64_ldst8_lo12, + fixup_a64_ldst16_lo12, + fixup_a64_ldst32_lo12, + fixup_a64_ldst64_lo12, + fixup_a64_ldst128_lo12, + + fixup_a64_tstbr, + fixup_a64_condbr, + fixup_a64_uncondbr, + fixup_a64_call, + + fixup_a64_movw_uabs_g0, + fixup_a64_movw_uabs_g0_nc, + fixup_a64_movw_uabs_g1, + fixup_a64_movw_uabs_g1_nc, + fixup_a64_movw_uabs_g2, + fixup_a64_movw_uabs_g2_nc, + fixup_a64_movw_uabs_g3, + + fixup_a64_movw_sabs_g0, + fixup_a64_movw_sabs_g1, + fixup_a64_movw_sabs_g2, + + fixup_a64_adr_prel_got_page, + fixup_a64_ld64_got_lo12_nc, + + // Produce offsets relative to the module's dynamic TLS area. + fixup_a64_movw_dtprel_g2, + fixup_a64_movw_dtprel_g1, + fixup_a64_movw_dtprel_g1_nc, + fixup_a64_movw_dtprel_g0, + fixup_a64_movw_dtprel_g0_nc, + fixup_a64_add_dtprel_hi12, + fixup_a64_add_dtprel_lo12, + fixup_a64_add_dtprel_lo12_nc, + fixup_a64_ldst8_dtprel_lo12, + fixup_a64_ldst8_dtprel_lo12_nc, + fixup_a64_ldst16_dtprel_lo12, + fixup_a64_ldst16_dtprel_lo12_nc, + fixup_a64_ldst32_dtprel_lo12, + fixup_a64_ldst32_dtprel_lo12_nc, + fixup_a64_ldst64_dtprel_lo12, + fixup_a64_ldst64_dtprel_lo12_nc, + + // Produce the GOT entry containing a variable's address in TLS's + // initial-exec mode. + fixup_a64_movw_gottprel_g1, + fixup_a64_movw_gottprel_g0_nc, + fixup_a64_adr_gottprel_page, + fixup_a64_ld64_gottprel_lo12_nc, + fixup_a64_ld_gottprel_prel19, + + // Produce offsets relative to the thread pointer: TPIDR_EL0. + fixup_a64_movw_tprel_g2, + fixup_a64_movw_tprel_g1, + fixup_a64_movw_tprel_g1_nc, + fixup_a64_movw_tprel_g0, + fixup_a64_movw_tprel_g0_nc, + fixup_a64_add_tprel_hi12, + fixup_a64_add_tprel_lo12, + fixup_a64_add_tprel_lo12_nc, + fixup_a64_ldst8_tprel_lo12, + fixup_a64_ldst8_tprel_lo12_nc, + fixup_a64_ldst16_tprel_lo12, + fixup_a64_ldst16_tprel_lo12_nc, + fixup_a64_ldst32_tprel_lo12, + fixup_a64_ldst32_tprel_lo12_nc, + fixup_a64_ldst64_tprel_lo12, + fixup_a64_ldst64_tprel_lo12_nc, + + // Produce the special fixups used by the general-dynamic TLS model. + fixup_a64_tlsdesc_adr_page, + fixup_a64_tlsdesc_ld64_lo12_nc, + fixup_a64_tlsdesc_add_lo12_nc, + fixup_a64_tlsdesc_call, + + + // Marker + LastTargetFixupKind, + NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind + }; + } +} + +#endif diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp new file mode 100644 index 0000000..8ec8cbf --- /dev/null +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp @@ -0,0 +1,41 @@ +//===-- AArch64MCAsmInfo.cpp - AArch64 asm properties ---------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the declarations of the AArch64MCAsmInfo properties. +// +//===----------------------------------------------------------------------===// + +#include "AArch64MCAsmInfo.h" + +using namespace llvm; + +AArch64ELFMCAsmInfo::AArch64ELFMCAsmInfo() { + PointerSize = 8; + + // ".comm align is in bytes but .align is pow-2." 
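+  // e.g. with AlignmentIsInBytes cleared, ".align 3" requests 2^3 = 8-byte
+  // alignment rather than 3-byte alignment.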
+ AlignmentIsInBytes = false; + + CommentString = "//"; + PrivateGlobalPrefix = ".L"; + Code32Directive = ".code\t32"; + + Data16bitsDirective = "\t.hword\t"; + Data32bitsDirective = "\t.word\t"; + Data64bitsDirective = "\t.xword\t"; + + UseDataRegionDirectives = true; + + WeakRefDirective = "\t.weak\t"; + + HasLEB128 = true; + SupportsDebugInformation = true; + + // Exceptions handling + ExceptionsType = ExceptionHandling::DwarfCFI; +} diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h new file mode 100644 index 0000000..a20bc47 --- /dev/null +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h @@ -0,0 +1,27 @@ +//==-- AArch64MCAsmInfo.h - AArch64 asm properties -------------*- C++ -*--===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the declaration of the AArch64MCAsmInfo class. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_AARCH64TARGETASMINFO_H +#define LLVM_AARCH64TARGETASMINFO_H + +#include "llvm/MC/MCAsmInfo.h" + +namespace llvm { + + struct AArch64ELFMCAsmInfo : public MCAsmInfo { + explicit AArch64ELFMCAsmInfo(); + }; + +} // namespace llvm + +#endif diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp new file mode 100644 index 0000000..756e037 --- /dev/null +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp @@ -0,0 +1,513 @@ +//=- AArch64/AArch64MCCodeEmitter.cpp - Convert AArch64 code to machine code =// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the AArch64MCCodeEmitter class. 
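+//
+// The emitter lowers MCInsts to their 32-bit A64 encodings, recording fixups
+// for any operands (labels, symbol modifiers) that cannot be resolved yet.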
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "mccodeemitter" +#include "MCTargetDesc/AArch64FixupKinds.h" +#include "MCTargetDesc/AArch64MCExpr.h" +#include "MCTargetDesc/AArch64MCTargetDesc.h" +#include "Utils/AArch64BaseInfo.h" +#include "llvm/MC/MCCodeEmitter.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +namespace { +class AArch64MCCodeEmitter : public MCCodeEmitter { + AArch64MCCodeEmitter(const AArch64MCCodeEmitter &) LLVM_DELETED_FUNCTION; + void operator=(const AArch64MCCodeEmitter &) LLVM_DELETED_FUNCTION; + MCContext &Ctx; + +public: + AArch64MCCodeEmitter(MCContext &ctx) : Ctx(ctx) {} + + ~AArch64MCCodeEmitter() {} + + unsigned getAddSubImmOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const; + + unsigned getAdrpLabelOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const; + + template<int MemSize> + unsigned getOffsetUImm12OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const { + return getOffsetUImm12OpValue(MI, OpIdx, Fixups, MemSize); + } + + unsigned getOffsetUImm12OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + int MemSize) const; + + unsigned getBitfield32LSLOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const; + unsigned getBitfield64LSLOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const; + + + // Labels are handled mostly the same way: a symbol is needed, and + // just gets some fixup attached. + template<AArch64::Fixups fixupDesired> + unsigned getLabelOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const; + + unsigned getLoadLitLabelOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const; + + + unsigned getMoveWideImmOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const; + + + unsigned getAddressWithFixup(const MCOperand &MO, + unsigned FixupKind, + SmallVectorImpl<MCFixup> &Fixups) const; + + + // getBinaryCodeForInstr - TableGen'erated function for getting the + // binary encoding for an instruction. + uint64_t getBinaryCodeForInstr(const MCInst &MI, + SmallVectorImpl<MCFixup> &Fixups) const; + + /// getMachineOpValue - Return binary encoding of operand. If the machine + /// operand requires relocation, record the relocation and return zero. + unsigned getMachineOpValue(const MCInst &MI,const MCOperand &MO, + SmallVectorImpl<MCFixup> &Fixups) const; + + + void EmitByte(unsigned char C, raw_ostream &OS) const { + OS << (char)C; + } + + void EmitInstruction(uint32_t Val, raw_ostream &OS) const { + // Output the constant in little endian byte order. 
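+    // e.g. the instruction word 0x91000421 is written as the byte sequence
+    // 21 04 00 91.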
+ for (unsigned i = 0; i != 4; ++i) { + EmitByte(Val & 0xff, OS); + Val >>= 8; + } + } + + + void EncodeInstruction(const MCInst &MI, raw_ostream &OS, + SmallVectorImpl<MCFixup> &Fixups) const; + + unsigned fixFCMPImm(const MCInst &MI, unsigned EncodedValue) const; + + template<int hasRs, int hasRt2> unsigned + fixLoadStoreExclusive(const MCInst &MI, unsigned EncodedValue) const; + + unsigned fixMOVZ(const MCInst &MI, unsigned EncodedValue) const; + + unsigned fixMulHigh(const MCInst &MI, unsigned EncodedValue) const; + + +}; + +} // end anonymous namespace + +unsigned AArch64MCCodeEmitter::getAddressWithFixup(const MCOperand &MO, + unsigned FixupKind, + SmallVectorImpl<MCFixup> &Fixups) const { + if (!MO.isExpr()) { + // This can occur for manually decoded or constructed MCInsts, but neither + // the assembly-parser nor instruction selection will currently produce an + // MCInst that's not a symbol reference. + assert(MO.isImm() && "Unexpected address requested"); + return MO.getImm(); + } + + const MCExpr *Expr = MO.getExpr(); + MCFixupKind Kind = MCFixupKind(FixupKind); + Fixups.push_back(MCFixup::Create(0, Expr, Kind)); + + return 0; +} + +unsigned AArch64MCCodeEmitter:: +getOffsetUImm12OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + int MemSize) const { + const MCOperand &ImmOp = MI.getOperand(OpIdx); + if (ImmOp.isImm()) + return ImmOp.getImm(); + + assert(ImmOp.isExpr() && "Unexpected operand type"); + const AArch64MCExpr *Expr = cast<AArch64MCExpr>(ImmOp.getExpr()); + unsigned FixupKind; + + + switch (Expr->getKind()) { + default: llvm_unreachable("Unexpected operand modifier"); + case AArch64MCExpr::VK_AARCH64_LO12: { + unsigned FixupsBySize[] = { AArch64::fixup_a64_ldst8_lo12, + AArch64::fixup_a64_ldst16_lo12, + AArch64::fixup_a64_ldst32_lo12, + AArch64::fixup_a64_ldst64_lo12, + AArch64::fixup_a64_ldst128_lo12 }; + assert(MemSize <= 16 && "Invalid fixup for operation"); + FixupKind = FixupsBySize[Log2_32(MemSize)]; + break; + } + case AArch64MCExpr::VK_AARCH64_GOT_LO12: + assert(MemSize == 8 && "Invalid fixup for operation"); + FixupKind = AArch64::fixup_a64_ld64_got_lo12_nc; + break; + case AArch64MCExpr::VK_AARCH64_DTPREL_LO12: { + unsigned FixupsBySize[] = { AArch64::fixup_a64_ldst8_dtprel_lo12, + AArch64::fixup_a64_ldst16_dtprel_lo12, + AArch64::fixup_a64_ldst32_dtprel_lo12, + AArch64::fixup_a64_ldst64_dtprel_lo12 }; + assert(MemSize <= 8 && "Invalid fixup for operation"); + FixupKind = FixupsBySize[Log2_32(MemSize)]; + break; + } + case AArch64MCExpr::VK_AARCH64_DTPREL_LO12_NC: { + unsigned FixupsBySize[] = { AArch64::fixup_a64_ldst8_dtprel_lo12_nc, + AArch64::fixup_a64_ldst16_dtprel_lo12_nc, + AArch64::fixup_a64_ldst32_dtprel_lo12_nc, + AArch64::fixup_a64_ldst64_dtprel_lo12_nc }; + assert(MemSize <= 8 && "Invalid fixup for operation"); + FixupKind = FixupsBySize[Log2_32(MemSize)]; + break; + } + case AArch64MCExpr::VK_AARCH64_GOTTPREL_LO12: + assert(MemSize == 8 && "Invalid fixup for operation"); + FixupKind = AArch64::fixup_a64_ld64_gottprel_lo12_nc; + break; + case AArch64MCExpr::VK_AARCH64_TPREL_LO12:{ + unsigned FixupsBySize[] = { AArch64::fixup_a64_ldst8_tprel_lo12, + AArch64::fixup_a64_ldst16_tprel_lo12, + AArch64::fixup_a64_ldst32_tprel_lo12, + AArch64::fixup_a64_ldst64_tprel_lo12 }; + assert(MemSize <= 8 && "Invalid fixup for operation"); + FixupKind = FixupsBySize[Log2_32(MemSize)]; + break; + } + case AArch64MCExpr::VK_AARCH64_TPREL_LO12_NC: { + unsigned FixupsBySize[] = { AArch64::fixup_a64_ldst8_tprel_lo12_nc, + 
AArch64::fixup_a64_ldst16_tprel_lo12_nc, + AArch64::fixup_a64_ldst32_tprel_lo12_nc, + AArch64::fixup_a64_ldst64_tprel_lo12_nc }; + assert(MemSize <= 8 && "Invalid fixup for operation"); + FixupKind = FixupsBySize[Log2_32(MemSize)]; + break; + } + case AArch64MCExpr::VK_AARCH64_TLSDESC_LO12: + assert(MemSize == 8 && "Invalid fixup for operation"); + FixupKind = AArch64::fixup_a64_tlsdesc_ld64_lo12_nc; + break; + } + + return getAddressWithFixup(ImmOp, FixupKind, Fixups); +} + +unsigned +AArch64MCCodeEmitter::getAddSubImmOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const { + const MCOperand &MO = MI.getOperand(OpIdx); + if (MO.isImm()) + return static_cast<unsigned>(MO.getImm()); + + assert(MO.isExpr()); + + unsigned FixupKind = 0; + switch(cast<AArch64MCExpr>(MO.getExpr())->getKind()) { + default: llvm_unreachable("Invalid expression modifier"); + case AArch64MCExpr::VK_AARCH64_LO12: + FixupKind = AArch64::fixup_a64_add_lo12; break; + case AArch64MCExpr::VK_AARCH64_DTPREL_HI12: + FixupKind = AArch64::fixup_a64_add_dtprel_hi12; break; + case AArch64MCExpr::VK_AARCH64_DTPREL_LO12: + FixupKind = AArch64::fixup_a64_add_dtprel_lo12; break; + case AArch64MCExpr::VK_AARCH64_DTPREL_LO12_NC: + FixupKind = AArch64::fixup_a64_add_dtprel_lo12_nc; break; + case AArch64MCExpr::VK_AARCH64_TPREL_HI12: + FixupKind = AArch64::fixup_a64_add_tprel_hi12; break; + case AArch64MCExpr::VK_AARCH64_TPREL_LO12: + FixupKind = AArch64::fixup_a64_add_tprel_lo12; break; + case AArch64MCExpr::VK_AARCH64_TPREL_LO12_NC: + FixupKind = AArch64::fixup_a64_add_tprel_lo12_nc; break; + case AArch64MCExpr::VK_AARCH64_TLSDESC_LO12: + FixupKind = AArch64::fixup_a64_tlsdesc_add_lo12_nc; break; + } + + return getAddressWithFixup(MO, FixupKind, Fixups); +} + +unsigned +AArch64MCCodeEmitter::getAdrpLabelOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const { + + const MCOperand &MO = MI.getOperand(OpIdx); + if (MO.isImm()) + return static_cast<unsigned>(MO.getImm()); + + assert(MO.isExpr()); + + unsigned Modifier = AArch64MCExpr::VK_AARCH64_None; + if (const AArch64MCExpr *Expr = dyn_cast<AArch64MCExpr>(MO.getExpr())) + Modifier = Expr->getKind(); + + unsigned FixupKind = 0; + switch(Modifier) { + case AArch64MCExpr::VK_AARCH64_None: + FixupKind = AArch64::fixup_a64_adr_prel_page; + break; + case AArch64MCExpr::VK_AARCH64_GOT: + FixupKind = AArch64::fixup_a64_adr_prel_got_page; + break; + case AArch64MCExpr::VK_AARCH64_GOTTPREL: + FixupKind = AArch64::fixup_a64_adr_gottprel_page; + break; + case AArch64MCExpr::VK_AARCH64_TLSDESC: + FixupKind = AArch64::fixup_a64_tlsdesc_adr_page; + break; + default: + llvm_unreachable("Unknown symbol reference kind for ADRP instruction"); + } + + return getAddressWithFixup(MO, FixupKind, Fixups); +} + +unsigned +AArch64MCCodeEmitter::getBitfield32LSLOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const { + + const MCOperand &MO = MI.getOperand(OpIdx); + assert(MO.isImm() && "Only immediate expected for shift"); + + return ((32 - MO.getImm()) & 0x1f) | (31 - MO.getImm()) << 6; +} + +unsigned +AArch64MCCodeEmitter::getBitfield64LSLOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const { + + const MCOperand &MO = MI.getOperand(OpIdx); + assert(MO.isImm() && "Only immediate expected for shift"); + + return ((64 - MO.getImm()) & 0x3f) | (63 - MO.getImm()) << 6; +} + + +template<AArch64::Fixups fixupDesired> unsigned +AArch64MCCodeEmitter::getLabelOpValue(const MCInst &MI, + unsigned 
OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const { + const MCOperand &MO = MI.getOperand(OpIdx); + + if (MO.isExpr()) + return getAddressWithFixup(MO, fixupDesired, Fixups); + + assert(MO.isImm()); + return MO.getImm(); +} + +unsigned +AArch64MCCodeEmitter::getLoadLitLabelOpValue(const MCInst &MI, + unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const { + const MCOperand &MO = MI.getOperand(OpIdx); + + if (MO.isImm()) + return MO.getImm(); + + assert(MO.isExpr()); + + unsigned FixupKind; + if (isa<AArch64MCExpr>(MO.getExpr())) { + assert(dyn_cast<AArch64MCExpr>(MO.getExpr())->getKind() + == AArch64MCExpr::VK_AARCH64_GOTTPREL + && "Invalid symbol modifier for literal load"); + FixupKind = AArch64::fixup_a64_ld_gottprel_prel19; + } else { + FixupKind = AArch64::fixup_a64_ld_prel; + } + + return getAddressWithFixup(MO, FixupKind, Fixups); +} + + +unsigned +AArch64MCCodeEmitter::getMachineOpValue(const MCInst &MI, + const MCOperand &MO, + SmallVectorImpl<MCFixup> &Fixups) const { + if (MO.isReg()) { + return Ctx.getRegisterInfo().getEncodingValue(MO.getReg()); + } else if (MO.isImm()) { + return static_cast<unsigned>(MO.getImm()); + } + + llvm_unreachable("Unable to encode MCOperand!"); + return 0; +} + +unsigned +AArch64MCCodeEmitter::getMoveWideImmOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups) const { + const MCOperand &UImm16MO = MI.getOperand(OpIdx); + const MCOperand &ShiftMO = MI.getOperand(OpIdx + 1); + + unsigned Result = static_cast<unsigned>(ShiftMO.getImm()) << 16; + + if (UImm16MO.isImm()) { + Result |= UImm16MO.getImm(); + return Result; + } + + const AArch64MCExpr *A64E = cast<AArch64MCExpr>(UImm16MO.getExpr()); + AArch64::Fixups requestedFixup; + switch (A64E->getKind()) { + default: llvm_unreachable("unexpected expression modifier"); + case AArch64MCExpr::VK_AARCH64_ABS_G0: + requestedFixup = AArch64::fixup_a64_movw_uabs_g0; break; + case AArch64MCExpr::VK_AARCH64_ABS_G0_NC: + requestedFixup = AArch64::fixup_a64_movw_uabs_g0_nc; break; + case AArch64MCExpr::VK_AARCH64_ABS_G1: + requestedFixup = AArch64::fixup_a64_movw_uabs_g1; break; + case AArch64MCExpr::VK_AARCH64_ABS_G1_NC: + requestedFixup = AArch64::fixup_a64_movw_uabs_g1_nc; break; + case AArch64MCExpr::VK_AARCH64_ABS_G2: + requestedFixup = AArch64::fixup_a64_movw_uabs_g2; break; + case AArch64MCExpr::VK_AARCH64_ABS_G2_NC: + requestedFixup = AArch64::fixup_a64_movw_uabs_g2_nc; break; + case AArch64MCExpr::VK_AARCH64_ABS_G3: + requestedFixup = AArch64::fixup_a64_movw_uabs_g3; break; + case AArch64MCExpr::VK_AARCH64_SABS_G0: + requestedFixup = AArch64::fixup_a64_movw_sabs_g0; break; + case AArch64MCExpr::VK_AARCH64_SABS_G1: + requestedFixup = AArch64::fixup_a64_movw_sabs_g1; break; + case AArch64MCExpr::VK_AARCH64_SABS_G2: + requestedFixup = AArch64::fixup_a64_movw_sabs_g2; break; + case AArch64MCExpr::VK_AARCH64_DTPREL_G2: + requestedFixup = AArch64::fixup_a64_movw_dtprel_g2; break; + case AArch64MCExpr::VK_AARCH64_DTPREL_G1: + requestedFixup = AArch64::fixup_a64_movw_dtprel_g1; break; + case AArch64MCExpr::VK_AARCH64_DTPREL_G1_NC: + requestedFixup = AArch64::fixup_a64_movw_dtprel_g1_nc; break; + case AArch64MCExpr::VK_AARCH64_DTPREL_G0: + requestedFixup = AArch64::fixup_a64_movw_dtprel_g0; break; + case AArch64MCExpr::VK_AARCH64_DTPREL_G0_NC: + requestedFixup = AArch64::fixup_a64_movw_dtprel_g0_nc; break; + case AArch64MCExpr::VK_AARCH64_GOTTPREL_G1: + requestedFixup = AArch64::fixup_a64_movw_gottprel_g1; break; + case AArch64MCExpr::VK_AARCH64_GOTTPREL_G0_NC: + requestedFixup = 
AArch64::fixup_a64_movw_gottprel_g0_nc; break; + case AArch64MCExpr::VK_AARCH64_TPREL_G2: + requestedFixup = AArch64::fixup_a64_movw_tprel_g2; break; + case AArch64MCExpr::VK_AARCH64_TPREL_G1: + requestedFixup = AArch64::fixup_a64_movw_tprel_g1; break; + case AArch64MCExpr::VK_AARCH64_TPREL_G1_NC: + requestedFixup = AArch64::fixup_a64_movw_tprel_g1_nc; break; + case AArch64MCExpr::VK_AARCH64_TPREL_G0: + requestedFixup = AArch64::fixup_a64_movw_tprel_g0; break; + case AArch64MCExpr::VK_AARCH64_TPREL_G0_NC: + requestedFixup = AArch64::fixup_a64_movw_tprel_g0_nc; break; + } + + return Result | getAddressWithFixup(UImm16MO, requestedFixup, Fixups); +} + +unsigned AArch64MCCodeEmitter::fixFCMPImm(const MCInst &MI, + unsigned EncodedValue) const { + // For FCMP[E] Rn, #0.0, the Rm field has a canonical representation + // with 0s, but is architecturally ignored + EncodedValue &= ~0x1f0000u; + + return EncodedValue; +} + +template<int hasRs, int hasRt2> unsigned +AArch64MCCodeEmitter::fixLoadStoreExclusive(const MCInst &MI, + unsigned EncodedValue) const { + if (!hasRs) EncodedValue |= 0x001F0000; + if (!hasRt2) EncodedValue |= 0x00007C00; + + return EncodedValue; +} + +unsigned +AArch64MCCodeEmitter::fixMOVZ(const MCInst &MI, unsigned EncodedValue) const { + // If one of the signed fixup kinds is applied to a MOVZ instruction, the + // eventual result could be either a MOVZ or a MOVN. It's the MCCodeEmitter's + // job to ensure that any bits possibly affected by this are 0. This means we + // must zero out bit 30 (essentially emitting a MOVN). + MCOperand UImm16MO = MI.getOperand(1); + + // Nothing to do if there's no fixup. + if (UImm16MO.isImm()) + return EncodedValue; + + const AArch64MCExpr *A64E = cast<AArch64MCExpr>(UImm16MO.getExpr()); + switch (A64E->getKind()) { + case AArch64MCExpr::VK_AARCH64_SABS_G0: + case AArch64MCExpr::VK_AARCH64_SABS_G1: + case AArch64MCExpr::VK_AARCH64_SABS_G2: + case AArch64MCExpr::VK_AARCH64_DTPREL_G2: + case AArch64MCExpr::VK_AARCH64_DTPREL_G1: + case AArch64MCExpr::VK_AARCH64_DTPREL_G0: + case AArch64MCExpr::VK_AARCH64_GOTTPREL_G1: + case AArch64MCExpr::VK_AARCH64_TPREL_G2: + case AArch64MCExpr::VK_AARCH64_TPREL_G1: + case AArch64MCExpr::VK_AARCH64_TPREL_G0: + return EncodedValue & ~(1u << 30); + default: + // Nothing to do for an unsigned fixup. + return EncodedValue; + } + + llvm_unreachable("Should have returned by now"); +} + +unsigned +AArch64MCCodeEmitter::fixMulHigh(const MCInst &MI, + unsigned EncodedValue) const { + // The Ra field of SMULH and UMULH is unused: it should be assembled as 31 + // (i.e. all bits 1) but is ignored by the processor. + EncodedValue |= 0x1f << 10; + return EncodedValue; +} + +MCCodeEmitter *llvm::createAArch64MCCodeEmitter(const MCInstrInfo &MCII, + const MCRegisterInfo &MRI, + const MCSubtargetInfo &STI, + MCContext &Ctx) { + return new AArch64MCCodeEmitter(Ctx); +} + +void AArch64MCCodeEmitter:: +EncodeInstruction(const MCInst &MI, raw_ostream &OS, + SmallVectorImpl<MCFixup> &Fixups) const { + if (MI.getOpcode() == AArch64::TLSDESCCALL) { + // This is a directive which applies an R_AARCH64_TLSDESC_CALL to the + // following (BLR) instruction. It doesn't emit any code itself so it + // doesn't go through the normal TableGenerated channels. 
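+    // We only record the fixup (at offset 0, where the following BLR will
+    // land) and emit no bytes for the pseudo-instruction itself.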
+ MCFixupKind Fixup = MCFixupKind(AArch64::fixup_a64_tlsdesc_call); + const MCExpr *Expr; + Expr = AArch64MCExpr::CreateTLSDesc(MI.getOperand(0).getExpr(), Ctx); + Fixups.push_back(MCFixup::Create(0, Expr, Fixup)); + return; + } + + uint32_t Binary = getBinaryCodeForInstr(MI, Fixups); + + EmitInstruction(Binary, OS); +} + + +#include "AArch64GenMCCodeEmitter.inc" diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp new file mode 100644 index 0000000..c1abfe7 --- /dev/null +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp @@ -0,0 +1,178 @@ +//===-- AArch64MCExpr.cpp - AArch64 specific MC expression classes --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the implementation of the assembly expression modifiers +// accepted by the AArch64 architecture (e.g. ":lo12:", ":gottprel_g1:", ...). +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "aarch64mcexpr" +#include "AArch64MCExpr.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCAssembler.h" +#include "llvm/MC/MCELF.h" +#include "llvm/Object/ELF.h" + +using namespace llvm; + +const AArch64MCExpr* +AArch64MCExpr::Create(VariantKind Kind, const MCExpr *Expr, + MCContext &Ctx) { + return new (Ctx) AArch64MCExpr(Kind, Expr); +} + +void AArch64MCExpr::PrintImpl(raw_ostream &OS) const { + switch (Kind) { + default: llvm_unreachable("Invalid kind!"); + case VK_AARCH64_GOT: OS << ":got:"; break; + case VK_AARCH64_GOT_LO12: OS << ":got_lo12:"; break; + case VK_AARCH64_LO12: OS << ":lo12:"; break; + case VK_AARCH64_ABS_G0: OS << ":abs_g0:"; break; + case VK_AARCH64_ABS_G0_NC: OS << ":abs_g0_nc:"; break; + case VK_AARCH64_ABS_G1: OS << ":abs_g1:"; break; + case VK_AARCH64_ABS_G1_NC: OS << ":abs_g1_nc:"; break; + case VK_AARCH64_ABS_G2: OS << ":abs_g2:"; break; + case VK_AARCH64_ABS_G2_NC: OS << ":abs_g2_nc:"; break; + case VK_AARCH64_ABS_G3: OS << ":abs_g3:"; break; + case VK_AARCH64_SABS_G0: OS << ":abs_g0_s:"; break; + case VK_AARCH64_SABS_G1: OS << ":abs_g1_s:"; break; + case VK_AARCH64_SABS_G2: OS << ":abs_g2_s:"; break; + case VK_AARCH64_DTPREL_G2: OS << ":dtprel_g2:"; break; + case VK_AARCH64_DTPREL_G1: OS << ":dtprel_g1:"; break; + case VK_AARCH64_DTPREL_G1_NC: OS << ":dtprel_g1_nc:"; break; + case VK_AARCH64_DTPREL_G0: OS << ":dtprel_g0:"; break; + case VK_AARCH64_DTPREL_G0_NC: OS << ":dtprel_g0_nc:"; break; + case VK_AARCH64_DTPREL_HI12: OS << ":dtprel_hi12:"; break; + case VK_AARCH64_DTPREL_LO12: OS << ":dtprel_lo12:"; break; + case VK_AARCH64_DTPREL_LO12_NC: OS << ":dtprel_lo12_nc:"; break; + case VK_AARCH64_GOTTPREL_G1: OS << ":gottprel_g1:"; break; + case VK_AARCH64_GOTTPREL_G0_NC: OS << ":gottprel_g0_nc:"; break; + case VK_AARCH64_GOTTPREL: OS << ":gottprel:"; break; + case VK_AARCH64_GOTTPREL_LO12: OS << ":gottprel_lo12:"; break; + case VK_AARCH64_TPREL_G2: OS << ":tprel_g2:"; break; + case VK_AARCH64_TPREL_G1: OS << ":tprel_g1:"; break; + case VK_AARCH64_TPREL_G1_NC: OS << ":tprel_g1_nc:"; break; + case VK_AARCH64_TPREL_G0: OS << ":tprel_g0:"; break; + case VK_AARCH64_TPREL_G0_NC: OS << ":tprel_g0_nc:"; break; + case VK_AARCH64_TPREL_HI12: OS << ":tprel_hi12:"; break; + case VK_AARCH64_TPREL_LO12: OS << ":tprel_lo12:"; break; + case VK_AARCH64_TPREL_LO12_NC: OS << 
":tprel_lo12_nc:"; break; + case VK_AARCH64_TLSDESC: OS << ":tlsdesc:"; break; + case VK_AARCH64_TLSDESC_LO12: OS << ":tlsdesc_lo12:"; break; + + } + + const MCExpr *Expr = getSubExpr(); + if (Expr->getKind() != MCExpr::SymbolRef) + OS << '('; + Expr->print(OS); + if (Expr->getKind() != MCExpr::SymbolRef) + OS << ')'; +} + +bool +AArch64MCExpr::EvaluateAsRelocatableImpl(MCValue &Res, + const MCAsmLayout *Layout) const { + return getSubExpr()->EvaluateAsRelocatable(Res, *Layout); +} + +static void fixELFSymbolsInTLSFixupsImpl(const MCExpr *Expr, MCAssembler &Asm) { + switch (Expr->getKind()) { + case MCExpr::Target: + llvm_unreachable("Can't handle nested target expression"); + break; + case MCExpr::Constant: + break; + + case MCExpr::Binary: { + const MCBinaryExpr *BE = cast<MCBinaryExpr>(Expr); + fixELFSymbolsInTLSFixupsImpl(BE->getLHS(), Asm); + fixELFSymbolsInTLSFixupsImpl(BE->getRHS(), Asm); + break; + } + + case MCExpr::SymbolRef: { + // We're known to be under a TLS fixup, so any symbol should be + // modified. There should be only one. + const MCSymbolRefExpr &SymRef = *cast<MCSymbolRefExpr>(Expr); + MCSymbolData &SD = Asm.getOrCreateSymbolData(SymRef.getSymbol()); + MCELF::SetType(SD, ELF::STT_TLS); + break; + } + + case MCExpr::Unary: + fixELFSymbolsInTLSFixupsImpl(cast<MCUnaryExpr>(Expr)->getSubExpr(), Asm); + break; + } +} + +void AArch64MCExpr::fixELFSymbolsInTLSFixups(MCAssembler &Asm) const { + switch (getKind()) { + default: + return; + case VK_AARCH64_DTPREL_G2: + case VK_AARCH64_DTPREL_G1: + case VK_AARCH64_DTPREL_G1_NC: + case VK_AARCH64_DTPREL_G0: + case VK_AARCH64_DTPREL_G0_NC: + case VK_AARCH64_DTPREL_HI12: + case VK_AARCH64_DTPREL_LO12: + case VK_AARCH64_DTPREL_LO12_NC: + case VK_AARCH64_GOTTPREL_G1: + case VK_AARCH64_GOTTPREL_G0_NC: + case VK_AARCH64_GOTTPREL: + case VK_AARCH64_GOTTPREL_LO12: + case VK_AARCH64_TPREL_G2: + case VK_AARCH64_TPREL_G1: + case VK_AARCH64_TPREL_G1_NC: + case VK_AARCH64_TPREL_G0: + case VK_AARCH64_TPREL_G0_NC: + case VK_AARCH64_TPREL_HI12: + case VK_AARCH64_TPREL_LO12: + case VK_AARCH64_TPREL_LO12_NC: + case VK_AARCH64_TLSDESC: + case VK_AARCH64_TLSDESC_LO12: + break; + } + + fixELFSymbolsInTLSFixupsImpl(getSubExpr(), Asm); +} + +// FIXME: This basically copies MCObjectStreamer::AddValueSymbols. Perhaps +// that method should be made public? +// FIXME: really do above: now that two backends are using it. 
+static void AddValueSymbolsImpl(const MCExpr *Value, MCAssembler *Asm) { + switch (Value->getKind()) { + case MCExpr::Target: + llvm_unreachable("Can't handle nested target expr!"); + break; + + case MCExpr::Constant: + break; + + case MCExpr::Binary: { + const MCBinaryExpr *BE = cast<MCBinaryExpr>(Value); + AddValueSymbolsImpl(BE->getLHS(), Asm); + AddValueSymbolsImpl(BE->getRHS(), Asm); + break; + } + + case MCExpr::SymbolRef: + Asm->getOrCreateSymbolData(cast<MCSymbolRefExpr>(Value)->getSymbol()); + break; + + case MCExpr::Unary: + AddValueSymbolsImpl(cast<MCUnaryExpr>(Value)->getSubExpr(), Asm); + break; + } +} + +void AArch64MCExpr::AddValueSymbols(MCAssembler *Asm) const { + AddValueSymbolsImpl(getSubExpr(), Asm); +} diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h new file mode 100644 index 0000000..c0e3b29 --- /dev/null +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h @@ -0,0 +1,167 @@ +//==- AArch64MCExpr.h - AArch64 specific MC expression classes --*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes AArch64-specific MCExprs, used for modifiers like +// ":lo12:" or ":gottprel_g1:". +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_AARCH64MCEXPR_H +#define LLVM_AARCH64MCEXPR_H + +#include "llvm/MC/MCExpr.h" + +namespace llvm { + +class AArch64MCExpr : public MCTargetExpr { +public: + enum VariantKind { + VK_AARCH64_None, + VK_AARCH64_GOT, // :got: modifier in assembly + VK_AARCH64_GOT_LO12, // :got_lo12: + VK_AARCH64_LO12, // :lo12: + + VK_AARCH64_ABS_G0, // :abs_g0: + VK_AARCH64_ABS_G0_NC, // :abs_g0_nc: + VK_AARCH64_ABS_G1, + VK_AARCH64_ABS_G1_NC, + VK_AARCH64_ABS_G2, + VK_AARCH64_ABS_G2_NC, + VK_AARCH64_ABS_G3, + + VK_AARCH64_SABS_G0, // :abs_g0_s: + VK_AARCH64_SABS_G1, + VK_AARCH64_SABS_G2, + + VK_AARCH64_DTPREL_G2, // :dtprel_g2: + VK_AARCH64_DTPREL_G1, + VK_AARCH64_DTPREL_G1_NC, + VK_AARCH64_DTPREL_G0, + VK_AARCH64_DTPREL_G0_NC, + VK_AARCH64_DTPREL_HI12, + VK_AARCH64_DTPREL_LO12, + VK_AARCH64_DTPREL_LO12_NC, + + VK_AARCH64_GOTTPREL_G1, // :gottprel: + VK_AARCH64_GOTTPREL_G0_NC, + VK_AARCH64_GOTTPREL, + VK_AARCH64_GOTTPREL_LO12, + + VK_AARCH64_TPREL_G2, // :tprel: + VK_AARCH64_TPREL_G1, + VK_AARCH64_TPREL_G1_NC, + VK_AARCH64_TPREL_G0, + VK_AARCH64_TPREL_G0_NC, + VK_AARCH64_TPREL_HI12, + VK_AARCH64_TPREL_LO12, + VK_AARCH64_TPREL_LO12_NC, + + VK_AARCH64_TLSDESC, // :tlsdesc: + VK_AARCH64_TLSDESC_LO12 + }; + +private: + const VariantKind Kind; + const MCExpr *Expr; + + explicit AArch64MCExpr(VariantKind _Kind, const MCExpr *_Expr) + : Kind(_Kind), Expr(_Expr) {} + +public: + /// @name Construction + /// @{ + + static const AArch64MCExpr *Create(VariantKind Kind, const MCExpr *Expr, + MCContext &Ctx); + + static const AArch64MCExpr *CreateLo12(const MCExpr *Expr, MCContext &Ctx) { + return Create(VK_AARCH64_LO12, Expr, Ctx); + } + + static const AArch64MCExpr *CreateGOT(const MCExpr *Expr, MCContext &Ctx) { + return Create(VK_AARCH64_GOT, Expr, Ctx); + } + + static const AArch64MCExpr *CreateGOTLo12(const MCExpr *Expr, + MCContext &Ctx) { + return Create(VK_AARCH64_GOT_LO12, Expr, Ctx); + } + + static const AArch64MCExpr *CreateDTPREL_G1(const MCExpr *Expr, + MCContext &Ctx) { + return Create(VK_AARCH64_DTPREL_G1, Expr, Ctx); + 
} + + static const AArch64MCExpr *CreateDTPREL_G0_NC(const MCExpr *Expr, + MCContext &Ctx) { + return Create(VK_AARCH64_DTPREL_G0_NC, Expr, Ctx); + } + + static const AArch64MCExpr *CreateGOTTPREL(const MCExpr *Expr, + MCContext &Ctx) { + return Create(VK_AARCH64_GOTTPREL, Expr, Ctx); + } + + static const AArch64MCExpr *CreateGOTTPRELLo12(const MCExpr *Expr, + MCContext &Ctx) { + return Create(VK_AARCH64_GOTTPREL_LO12, Expr, Ctx); + } + + static const AArch64MCExpr *CreateTLSDesc(const MCExpr *Expr, + MCContext &Ctx) { + return Create(VK_AARCH64_TLSDESC, Expr, Ctx); + } + + static const AArch64MCExpr *CreateTLSDescLo12(const MCExpr *Expr, + MCContext &Ctx) { + return Create(VK_AARCH64_TLSDESC_LO12, Expr, Ctx); + } + + static const AArch64MCExpr *CreateTPREL_G1(const MCExpr *Expr, + MCContext &Ctx) { + return Create(VK_AARCH64_TPREL_G1, Expr, Ctx); + } + + static const AArch64MCExpr *CreateTPREL_G0_NC(const MCExpr *Expr, + MCContext &Ctx) { + return Create(VK_AARCH64_TPREL_G0_NC, Expr, Ctx); + } + + /// @} + /// @name Accessors + /// @{ + + /// getOpcode - Get the kind of this expression. + VariantKind getKind() const { return Kind; } + + /// getSubExpr - Get the child of this expression. + const MCExpr *getSubExpr() const { return Expr; } + + /// @} + + void PrintImpl(raw_ostream &OS) const; + bool EvaluateAsRelocatableImpl(MCValue &Res, + const MCAsmLayout *Layout) const; + void AddValueSymbols(MCAssembler *) const; + const MCSection *FindAssociatedSection() const { + return getSubExpr()->FindAssociatedSection(); + } + + void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const; + + static bool classof(const MCExpr *E) { + return E->getKind() == MCExpr::Target; + } + + static bool classof(const AArch64MCExpr *) { return true; } + +}; +} // end namespace llvm + +#endif diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp new file mode 100644 index 0000000..7960db0 --- /dev/null +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp @@ -0,0 +1,194 @@ +//===-- AArch64MCTargetDesc.cpp - AArch64 Target Descriptions -------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file provides AArch64 specific target descriptions. 
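+//
+// It constructs the MC-layer components (instruction, register and subtarget
+// info, asm info, code emitter, streamer, instruction printer) and registers
+// them with the TargetRegistry.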
+// +//===----------------------------------------------------------------------===// + +#include "AArch64MCTargetDesc.h" +#include "AArch64ELFStreamer.h" +#include "AArch64MCAsmInfo.h" +#include "InstPrinter/AArch64InstPrinter.h" +#include "llvm/ADT/APInt.h" +#include "llvm/MC/MCCodeGenInfo.h" +#include "llvm/MC/MCInstrAnalysis.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/ErrorHandling.h" + +#define GET_REGINFO_MC_DESC +#include "AArch64GenRegisterInfo.inc" + +#define GET_INSTRINFO_MC_DESC +#include "AArch64GenInstrInfo.inc" + +#define GET_SUBTARGETINFO_MC_DESC +#include "AArch64GenSubtargetInfo.inc" + +using namespace llvm; + +MCSubtargetInfo *AArch64_MC::createAArch64MCSubtargetInfo(StringRef TT, + StringRef CPU, + StringRef FS) { + MCSubtargetInfo *X = new MCSubtargetInfo(); + InitAArch64MCSubtargetInfo(X, TT, CPU, ""); + return X; +} + + +static MCInstrInfo *createAArch64MCInstrInfo() { + MCInstrInfo *X = new MCInstrInfo(); + InitAArch64MCInstrInfo(X); + return X; +} + +static MCRegisterInfo *createAArch64MCRegisterInfo(StringRef Triple) { + MCRegisterInfo *X = new MCRegisterInfo(); + InitAArch64MCRegisterInfo(X, AArch64::X30); + return X; +} + +static MCAsmInfo *createAArch64MCAsmInfo(const Target &T, StringRef TT) { + Triple TheTriple(TT); + + MCAsmInfo *MAI = new AArch64ELFMCAsmInfo(); + MachineLocation Dst(MachineLocation::VirtualFP); + MachineLocation Src(AArch64::XSP, 0); + MAI->addInitialFrameState(0, Dst, Src); + + return MAI; +} + +static MCCodeGenInfo *createAArch64MCCodeGenInfo(StringRef TT, Reloc::Model RM, + CodeModel::Model CM, + CodeGenOpt::Level OL) { + MCCodeGenInfo *X = new MCCodeGenInfo(); + if (RM == Reloc::Default || RM == Reloc::DynamicNoPIC) { + // On ELF platforms the default static relocation model has a smart enough + // linker to cope with referencing external symbols defined in a shared + // library. Hence DynamicNoPIC doesn't need to be promoted to PIC. 
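+    // In other words, both Default and DynamicNoPIC requests fall back to
+    // plain static relocations here.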
+ RM = Reloc::Static; + } + + if (CM == CodeModel::Default) + CM = CodeModel::Small; + + X->InitMCCodeGenInfo(RM, CM, OL); + return X; +} + +static MCStreamer *createMCStreamer(const Target &T, StringRef TT, + MCContext &Ctx, MCAsmBackend &MAB, + raw_ostream &OS, + MCCodeEmitter *Emitter, + bool RelaxAll, + bool NoExecStack) { + Triple TheTriple(TT); + + return createAArch64ELFStreamer(Ctx, MAB, OS, Emitter, RelaxAll, NoExecStack); +} + + +static MCInstPrinter *createAArch64MCInstPrinter(const Target &T, + unsigned SyntaxVariant, + const MCAsmInfo &MAI, + const MCInstrInfo &MII, + const MCRegisterInfo &MRI, + const MCSubtargetInfo &STI) { + if (SyntaxVariant == 0) + return new AArch64InstPrinter(MAI, MII, MRI, STI); + return 0; +} + +namespace { + +class AArch64MCInstrAnalysis : public MCInstrAnalysis { +public: + AArch64MCInstrAnalysis(const MCInstrInfo *Info) : MCInstrAnalysis(Info) {} + + virtual bool isUnconditionalBranch(const MCInst &Inst) const { + if (Inst.getOpcode() == AArch64::Bcc + && Inst.getOperand(0).getImm() == A64CC::AL) + return true; + return MCInstrAnalysis::isUnconditionalBranch(Inst); + } + + virtual bool isConditionalBranch(const MCInst &Inst) const { + if (Inst.getOpcode() == AArch64::Bcc + && Inst.getOperand(0).getImm() == A64CC::AL) + return false; + return MCInstrAnalysis::isConditionalBranch(Inst); + } + + uint64_t evaluateBranch(const MCInst &Inst, uint64_t Addr, + uint64_t Size) const { + unsigned LblOperand = Inst.getOpcode() == AArch64::Bcc ? 1 : 0; + // FIXME: We only handle PCRel branches for now. + if (Info->get(Inst.getOpcode()).OpInfo[LblOperand].OperandType + != MCOI::OPERAND_PCREL) + return -1ULL; + + int64_t Imm = Inst.getOperand(LblOperand).getImm(); + + return Addr + Imm; + } +}; + +} + +static MCInstrAnalysis *createAArch64MCInstrAnalysis(const MCInstrInfo *Info) { + return new AArch64MCInstrAnalysis(Info); +} + + + +extern "C" void LLVMInitializeAArch64TargetMC() { + // Register the MC asm info. + RegisterMCAsmInfoFn A(TheAArch64Target, createAArch64MCAsmInfo); + + // Register the MC codegen info. + TargetRegistry::RegisterMCCodeGenInfo(TheAArch64Target, + createAArch64MCCodeGenInfo); + + // Register the MC instruction info. + TargetRegistry::RegisterMCInstrInfo(TheAArch64Target, + createAArch64MCInstrInfo); + + // Register the MC register info. + TargetRegistry::RegisterMCRegInfo(TheAArch64Target, + createAArch64MCRegisterInfo); + + // Register the MC subtarget info. + using AArch64_MC::createAArch64MCSubtargetInfo; + TargetRegistry::RegisterMCSubtargetInfo(TheAArch64Target, + createAArch64MCSubtargetInfo); + + // Register the MC instruction analyzer. + TargetRegistry::RegisterMCInstrAnalysis(TheAArch64Target, + createAArch64MCInstrAnalysis); + + // Register the MC Code Emitter + TargetRegistry::RegisterMCCodeEmitter(TheAArch64Target, + createAArch64MCCodeEmitter); + + // Register the asm backend. + TargetRegistry::RegisterMCAsmBackend(TheAArch64Target, + createAArch64AsmBackend); + + // Register the object streamer. + TargetRegistry::RegisterMCObjectStreamer(TheAArch64Target, + createMCStreamer); + + // Register the MCInstPrinter. 
+ TargetRegistry::RegisterMCInstPrinter(TheAArch64Target, + createAArch64MCInstPrinter); +} diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h new file mode 100644 index 0000000..3849fe3 --- /dev/null +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h @@ -0,0 +1,65 @@ +//===-- AArch64MCTargetDesc.h - AArch64 Target Descriptions -----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file provides AArch64 specific target descriptions. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_AARCH64MCTARGETDESC_H +#define LLVM_AARCH64MCTARGETDESC_H + +#include "llvm/Support/DataTypes.h" + +namespace llvm { +class MCAsmBackend; +class MCCodeEmitter; +class MCContext; +class MCInstrInfo; +class MCObjectWriter; +class MCRegisterInfo; +class MCSubtargetInfo; +class StringRef; +class Target; +class raw_ostream; + +extern Target TheAArch64Target; + +namespace AArch64_MC { + MCSubtargetInfo *createAArch64MCSubtargetInfo(StringRef TT, StringRef CPU, + StringRef FS); +} + +MCCodeEmitter *createAArch64MCCodeEmitter(const MCInstrInfo &MCII, + const MCRegisterInfo &MRI, + const MCSubtargetInfo &STI, + MCContext &Ctx); + +MCObjectWriter *createAArch64ELFObjectWriter(raw_ostream &OS, + uint8_t OSABI); + +MCAsmBackend *createAArch64AsmBackend(const Target &T, StringRef TT, + StringRef CPU); + +} // End llvm namespace + +// Defines symbolic names for AArch64 registers. This defines a mapping from +// register name to register number. +// +#define GET_REGINFO_ENUM +#include "AArch64GenRegisterInfo.inc" + +// Defines symbolic names for the AArch64 instructions. +// +#define GET_INSTRINFO_ENUM +#include "AArch64GenInstrInfo.inc" + +#define GET_SUBTARGETINFO_ENUM +#include "AArch64GenSubtargetInfo.inc" + +#endif diff --git a/lib/Target/AArch64/MCTargetDesc/CMakeLists.txt b/lib/Target/AArch64/MCTargetDesc/CMakeLists.txt new file mode 100644 index 0000000..44c66a2 --- /dev/null +++ b/lib/Target/AArch64/MCTargetDesc/CMakeLists.txt @@ -0,0 +1,13 @@ +add_llvm_library(LLVMAArch64Desc + AArch64AsmBackend.cpp + AArch64ELFObjectWriter.cpp + AArch64ELFStreamer.cpp + AArch64MCAsmInfo.cpp + AArch64MCCodeEmitter.cpp + AArch64MCExpr.cpp + AArch64MCTargetDesc.cpp + ) +add_dependencies(LLVMAArch64Desc AArch64CommonTableGen) + +# Hack: we need to include 'main' target directory to grab private headers +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/.. ${CMAKE_CURRENT_BINARY_DIR}/..) diff --git a/lib/Target/AArch64/MCTargetDesc/LLVMBuild.txt b/lib/Target/AArch64/MCTargetDesc/LLVMBuild.txt new file mode 100644 index 0000000..37c8035 --- /dev/null +++ b/lib/Target/AArch64/MCTargetDesc/LLVMBuild.txt @@ -0,0 +1,24 @@ +;===- ./lib/Target/AArch64/MCTargetDesc/LLVMBuild.txt ----------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. 
+; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = AArch64Desc +parent = AArch64 +required_libraries = AArch64AsmPrinter AArch64Info MC Support +add_to_library_groups = AArch64 + diff --git a/lib/Target/AArch64/MCTargetDesc/Makefile b/lib/Target/AArch64/MCTargetDesc/Makefile new file mode 100644 index 0000000..5779ac5 --- /dev/null +++ b/lib/Target/AArch64/MCTargetDesc/Makefile @@ -0,0 +1,16 @@ +##===- lib/Target/AArch64/TargetDesc/Makefile --------------*- Makefile -*-===## +# +# The LLVM Compiler Infrastructure +# +# This file is distributed under the University of Illinois Open Source +# License. See LICENSE.TXT for details. +# +##===----------------------------------------------------------------------===## + +LEVEL = ../../../.. +LIBRARYNAME = LLVMAArch64Desc + +# Hack: we need to include 'main' target directory to grab private headers +CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. + +include $(LEVEL)/Makefile.common diff --git a/lib/Target/AArch64/Makefile b/lib/Target/AArch64/Makefile new file mode 100644 index 0000000..641bb83 --- /dev/null +++ b/lib/Target/AArch64/Makefile @@ -0,0 +1,30 @@ +##===- lib/Target/AArch64/Makefile -------------------------*- Makefile -*-===## +# +# The LLVM Compiler Infrastructure +# +# This file is distributed under the University of Illinois Open Source +# License. See LICENSE.TXT for details. +# +##===----------------------------------------------------------------------===## + +LEVEL = ../../.. +LIBRARYNAME = LLVMAArch64CodeGen +TARGET = AArch64 + +# Make sure that tblgen is run, first thing. +BUILT_SOURCES = AArch64GenAsmMatcher.inc \ + AArch64GenAsmWriter.inc \ + AArch64GenCallingConv.inc \ + AArch64GenDAGISel.inc \ + AArch64GenDisassemblerTables.inc \ + AArch64GenInstrInfo.inc \ + AArch64GenMCCodeEmitter.inc \ + AArch64GenMCPseudoLowering.inc \ + AArch64GenRegisterInfo.inc \ + AArch64GenSubtargetInfo.inc + +DIRS = InstPrinter AsmParser Disassembler TargetInfo MCTargetDesc Utils + +include $(LEVEL)/Makefile.common + + diff --git a/lib/Target/AArch64/README.txt b/lib/Target/AArch64/README.txt new file mode 100644 index 0000000..601990f --- /dev/null +++ b/lib/Target/AArch64/README.txt @@ -0,0 +1,2 @@ +This file will contain changes that need to be made before AArch64 can become an +officially supported target. Currently a placeholder. diff --git a/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp b/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp new file mode 100644 index 0000000..b8099cb --- /dev/null +++ b/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp @@ -0,0 +1,24 @@ +//===-- AArch64TargetInfo.cpp - AArch64 Target Implementation -------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the key registration step for the architecture. 
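+//
+// Registering TheAArch64Target under the name "aarch64" is what makes the
+// target selectable by TargetRegistry clients such as llc and llvm-mc.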
+// +//===----------------------------------------------------------------------===// + +#include "AArch64.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/TargetRegistry.h" +using namespace llvm; + +Target llvm::TheAArch64Target; + +extern "C" void LLVMInitializeAArch64TargetInfo() { + RegisterTarget<Triple::aarch64> + X(TheAArch64Target, "aarch64", "AArch64"); +} diff --git a/lib/Target/AArch64/TargetInfo/CMakeLists.txt b/lib/Target/AArch64/TargetInfo/CMakeLists.txt new file mode 100644 index 0000000..e236eed --- /dev/null +++ b/lib/Target/AArch64/TargetInfo/CMakeLists.txt @@ -0,0 +1,7 @@ +include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. ) + +add_llvm_library(LLVMAArch64Info + AArch64TargetInfo.cpp + ) + +add_dependencies(LLVMAArch64Info AArch64CommonTableGen) diff --git a/lib/Target/AArch64/TargetInfo/LLVMBuild.txt b/lib/Target/AArch64/TargetInfo/LLVMBuild.txt new file mode 100644 index 0000000..5b003f0 --- /dev/null +++ b/lib/Target/AArch64/TargetInfo/LLVMBuild.txt @@ -0,0 +1,24 @@ +;===- ./lib/Target/AArch64/TargetInfo/LLVMBuild.txt ------------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. +; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = AArch64Info +parent = AArch64 +required_libraries = MC Support Target +add_to_library_groups = AArch64 + diff --git a/lib/Target/AArch64/TargetInfo/Makefile b/lib/Target/AArch64/TargetInfo/Makefile new file mode 100644 index 0000000..9dc9aa4 --- /dev/null +++ b/lib/Target/AArch64/TargetInfo/Makefile @@ -0,0 +1,15 @@ +##===- lib/Target/AArch64/TargetInfo/Makefile --------------*- Makefile -*-===## +# +# The LLVM Compiler Infrastructure +# +# This file is distributed under the University of Illinois Open Source +# License. See LICENSE.TXT for details. +# +##===----------------------------------------------------------------------===## +LEVEL = ../../../.. +LIBRARYNAME = LLVMAArch64Info + +# Hack: we need to include 'main' target directory to grab private headers +CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. + +include $(LEVEL)/Makefile.common diff --git a/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp b/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp new file mode 100644 index 0000000..ab9bba1 --- /dev/null +++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp @@ -0,0 +1,819 @@ +//===-- AArch64BaseInfo.cpp - AArch64 Base encoding information------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file provides basic encoding and assembly information for AArch64. 
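+//
+// Most of it consists of tables mapping the named operands of system
+// instructions (AT, DC, IC, barriers, PRFM, MRS/MSR, TLBI) to their
+// immediate encodings, plus helpers such as the FP immediate check.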
+// +//===----------------------------------------------------------------------===// +#include "AArch64BaseInfo.h" +#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Support/Regex.h" + +using namespace llvm; + +StringRef NamedImmMapper::toString(uint32_t Value, bool &Valid) const { + for (unsigned i = 0; i < NumPairs; ++i) { + if (Pairs[i].Value == Value) { + Valid = true; + return Pairs[i].Name; + } + } + + Valid = false; + return StringRef(); +} + +uint32_t NamedImmMapper::fromString(StringRef Name, bool &Valid) const { + std::string LowerCaseName = Name.lower(); + for (unsigned i = 0; i < NumPairs; ++i) { + if (Pairs[i].Name == LowerCaseName) { + Valid = true; + return Pairs[i].Value; + } + } + + Valid = false; + return -1; +} + +bool NamedImmMapper::validImm(uint32_t Value) const { + return Value < TooBigImm; +} + +const NamedImmMapper::Mapping A64AT::ATMapper::ATPairs[] = { + {"s1e1r", S1E1R}, + {"s1e2r", S1E2R}, + {"s1e3r", S1E3R}, + {"s1e1w", S1E1W}, + {"s1e2w", S1E2W}, + {"s1e3w", S1E3W}, + {"s1e0r", S1E0R}, + {"s1e0w", S1E0W}, + {"s12e1r", S12E1R}, + {"s12e1w", S12E1W}, + {"s12e0r", S12E0R}, + {"s12e0w", S12E0W}, +}; + +A64AT::ATMapper::ATMapper() + : NamedImmMapper(ATPairs, 0) {} + +const NamedImmMapper::Mapping A64DB::DBarrierMapper::DBarrierPairs[] = { + {"oshld", OSHLD}, + {"oshst", OSHST}, + {"osh", OSH}, + {"nshld", NSHLD}, + {"nshst", NSHST}, + {"nsh", NSH}, + {"ishld", ISHLD}, + {"ishst", ISHST}, + {"ish", ISH}, + {"ld", LD}, + {"st", ST}, + {"sy", SY} +}; + +A64DB::DBarrierMapper::DBarrierMapper() + : NamedImmMapper(DBarrierPairs, 16u) {} + +const NamedImmMapper::Mapping A64DC::DCMapper::DCPairs[] = { + {"zva", ZVA}, + {"ivac", IVAC}, + {"isw", ISW}, + {"cvac", CVAC}, + {"csw", CSW}, + {"cvau", CVAU}, + {"civac", CIVAC}, + {"cisw", CISW} +}; + +A64DC::DCMapper::DCMapper() + : NamedImmMapper(DCPairs, 0) {} + +const NamedImmMapper::Mapping A64IC::ICMapper::ICPairs[] = { + {"ialluis", IALLUIS}, + {"iallu", IALLU}, + {"ivau", IVAU} +}; + +A64IC::ICMapper::ICMapper() + : NamedImmMapper(ICPairs, 0) {} + +const NamedImmMapper::Mapping A64ISB::ISBMapper::ISBPairs[] = { + {"sy", SY}, +}; + +A64ISB::ISBMapper::ISBMapper() + : NamedImmMapper(ISBPairs, 16) {} + +const NamedImmMapper::Mapping A64PRFM::PRFMMapper::PRFMPairs[] = { + {"pldl1keep", PLDL1KEEP}, + {"pldl1strm", PLDL1STRM}, + {"pldl2keep", PLDL2KEEP}, + {"pldl2strm", PLDL2STRM}, + {"pldl3keep", PLDL3KEEP}, + {"pldl3strm", PLDL3STRM}, + {"plil1keep", PLIL1KEEP}, + {"plil1strm", PLIL1STRM}, + {"plil2keep", PLIL2KEEP}, + {"plil2strm", PLIL2STRM}, + {"plil3keep", PLIL3KEEP}, + {"plil3strm", PLIL3STRM}, + {"pstl1keep", PSTL1KEEP}, + {"pstl1strm", PSTL1STRM}, + {"pstl2keep", PSTL2KEEP}, + {"pstl2strm", PSTL2STRM}, + {"pstl3keep", PSTL3KEEP}, + {"pstl3strm", PSTL3STRM} +}; + +A64PRFM::PRFMMapper::PRFMMapper() + : NamedImmMapper(PRFMPairs, 32) {} + +const NamedImmMapper::Mapping A64PState::PStateMapper::PStatePairs[] = { + {"spsel", SPSel}, + {"daifset", DAIFSet}, + {"daifclr", DAIFClr} +}; + +A64PState::PStateMapper::PStateMapper() + : NamedImmMapper(PStatePairs, 0) {} + +const NamedImmMapper::Mapping A64SysReg::MRSMapper::MRSPairs[] = { + {"mdccsr_el0", MDCCSR_EL0}, + {"dbgdtrrx_el0", DBGDTRRX_EL0}, + {"mdrar_el1", MDRAR_EL1}, + {"oslsr_el1", OSLSR_EL1}, + {"dbgauthstatus_el1", DBGAUTHSTATUS_EL1}, + {"pmceid0_el0", PMCEID0_EL0}, + {"pmceid1_el0", PMCEID1_EL0}, + {"midr_el1", MIDR_EL1}, + {"ccsidr_el1", CCSIDR_EL1}, + {"clidr_el1", CLIDR_EL1}, + {"ctr_el0", 
CTR_EL0}, + {"mpidr_el1", MPIDR_EL1}, + {"revidr_el1", REVIDR_EL1}, + {"aidr_el1", AIDR_EL1}, + {"dczid_el0", DCZID_EL0}, + {"id_pfr0_el1", ID_PFR0_EL1}, + {"id_pfr1_el1", ID_PFR1_EL1}, + {"id_dfr0_el1", ID_DFR0_EL1}, + {"id_afr0_el1", ID_AFR0_EL1}, + {"id_mmfr0_el1", ID_MMFR0_EL1}, + {"id_mmfr1_el1", ID_MMFR1_EL1}, + {"id_mmfr2_el1", ID_MMFR2_EL1}, + {"id_mmfr3_el1", ID_MMFR3_EL1}, + {"id_isar0_el1", ID_ISAR0_EL1}, + {"id_isar1_el1", ID_ISAR1_EL1}, + {"id_isar2_el1", ID_ISAR2_EL1}, + {"id_isar3_el1", ID_ISAR3_EL1}, + {"id_isar4_el1", ID_ISAR4_EL1}, + {"id_isar5_el1", ID_ISAR5_EL1}, + {"id_aa64pfr0_el1", ID_AA64PFR0_EL1}, + {"id_aa64pfr1_el1", ID_AA64PFR1_EL1}, + {"id_aa64dfr0_el1", ID_AA64DFR0_EL1}, + {"id_aa64dfr1_el1", ID_AA64DFR1_EL1}, + {"id_aa64afr0_el1", ID_AA64AFR0_EL1}, + {"id_aa64afr1_el1", ID_AA64AFR1_EL1}, + {"id_aa64isar0_el1", ID_AA64ISAR0_EL1}, + {"id_aa64isar1_el1", ID_AA64ISAR1_EL1}, + {"id_aa64mmfr0_el1", ID_AA64MMFR0_EL1}, + {"id_aa64mmfr1_el1", ID_AA64MMFR1_EL1}, + {"mvfr0_el1", MVFR0_EL1}, + {"mvfr1_el1", MVFR1_EL1}, + {"mvfr2_el1", MVFR2_EL1}, + {"rvbar_el1", RVBAR_EL1}, + {"rvbar_el2", RVBAR_EL2}, + {"rvbar_el3", RVBAR_EL3}, + {"isr_el1", ISR_EL1}, + {"cntpct_el0", CNTPCT_EL0}, + {"cntvct_el0", CNTVCT_EL0} +}; + +A64SysReg::MRSMapper::MRSMapper() { + InstPairs = &MRSPairs[0]; + NumInstPairs = llvm::array_lengthof(MRSPairs); +} + +const NamedImmMapper::Mapping A64SysReg::MSRMapper::MSRPairs[] = { + {"dbgdtrtx_el0", DBGDTRTX_EL0}, + {"oslar_el1", OSLAR_EL1}, + {"pmswinc_el0", PMSWINC_EL0} +}; + +A64SysReg::MSRMapper::MSRMapper() { + InstPairs = &MSRPairs[0]; + NumInstPairs = llvm::array_lengthof(MSRPairs); +} + + +const NamedImmMapper::Mapping A64SysReg::SysRegMapper::SysRegPairs[] = { + {"osdtrrx_el1", OSDTRRX_EL1}, + {"osdtrtx_el1", OSDTRTX_EL1}, + {"teecr32_el1", TEECR32_EL1}, + {"mdccint_el1", MDCCINT_EL1}, + {"mdscr_el1", MDSCR_EL1}, + {"dbgdtr_el0", DBGDTR_EL0}, + {"oseccr_el1", OSECCR_EL1}, + {"dbgvcr32_el2", DBGVCR32_EL2}, + {"dbgbvr0_el1", DBGBVR0_EL1}, + {"dbgbvr1_el1", DBGBVR1_EL1}, + {"dbgbvr2_el1", DBGBVR2_EL1}, + {"dbgbvr3_el1", DBGBVR3_EL1}, + {"dbgbvr4_el1", DBGBVR4_EL1}, + {"dbgbvr5_el1", DBGBVR5_EL1}, + {"dbgbvr6_el1", DBGBVR6_EL1}, + {"dbgbvr7_el1", DBGBVR7_EL1}, + {"dbgbvr8_el1", DBGBVR8_EL1}, + {"dbgbvr9_el1", DBGBVR9_EL1}, + {"dbgbvr10_el1", DBGBVR10_EL1}, + {"dbgbvr11_el1", DBGBVR11_EL1}, + {"dbgbvr12_el1", DBGBVR12_EL1}, + {"dbgbvr13_el1", DBGBVR13_EL1}, + {"dbgbvr14_el1", DBGBVR14_EL1}, + {"dbgbvr15_el1", DBGBVR15_EL1}, + {"dbgbcr0_el1", DBGBCR0_EL1}, + {"dbgbcr1_el1", DBGBCR1_EL1}, + {"dbgbcr2_el1", DBGBCR2_EL1}, + {"dbgbcr3_el1", DBGBCR3_EL1}, + {"dbgbcr4_el1", DBGBCR4_EL1}, + {"dbgbcr5_el1", DBGBCR5_EL1}, + {"dbgbcr6_el1", DBGBCR6_EL1}, + {"dbgbcr7_el1", DBGBCR7_EL1}, + {"dbgbcr8_el1", DBGBCR8_EL1}, + {"dbgbcr9_el1", DBGBCR9_EL1}, + {"dbgbcr10_el1", DBGBCR10_EL1}, + {"dbgbcr11_el1", DBGBCR11_EL1}, + {"dbgbcr12_el1", DBGBCR12_EL1}, + {"dbgbcr13_el1", DBGBCR13_EL1}, + {"dbgbcr14_el1", DBGBCR14_EL1}, + {"dbgbcr15_el1", DBGBCR15_EL1}, + {"dbgwvr0_el1", DBGWVR0_EL1}, + {"dbgwvr1_el1", DBGWVR1_EL1}, + {"dbgwvr2_el1", DBGWVR2_EL1}, + {"dbgwvr3_el1", DBGWVR3_EL1}, + {"dbgwvr4_el1", DBGWVR4_EL1}, + {"dbgwvr5_el1", DBGWVR5_EL1}, + {"dbgwvr6_el1", DBGWVR6_EL1}, + {"dbgwvr7_el1", DBGWVR7_EL1}, + {"dbgwvr8_el1", DBGWVR8_EL1}, + {"dbgwvr9_el1", DBGWVR9_EL1}, + {"dbgwvr10_el1", DBGWVR10_EL1}, + {"dbgwvr11_el1", DBGWVR11_EL1}, + {"dbgwvr12_el1", DBGWVR12_EL1}, + {"dbgwvr13_el1", DBGWVR13_EL1}, + {"dbgwvr14_el1", DBGWVR14_EL1}, + {"dbgwvr15_el1", 
DBGWVR15_EL1}, + {"dbgwcr0_el1", DBGWCR0_EL1}, + {"dbgwcr1_el1", DBGWCR1_EL1}, + {"dbgwcr2_el1", DBGWCR2_EL1}, + {"dbgwcr3_el1", DBGWCR3_EL1}, + {"dbgwcr4_el1", DBGWCR4_EL1}, + {"dbgwcr5_el1", DBGWCR5_EL1}, + {"dbgwcr6_el1", DBGWCR6_EL1}, + {"dbgwcr7_el1", DBGWCR7_EL1}, + {"dbgwcr8_el1", DBGWCR8_EL1}, + {"dbgwcr9_el1", DBGWCR9_EL1}, + {"dbgwcr10_el1", DBGWCR10_EL1}, + {"dbgwcr11_el1", DBGWCR11_EL1}, + {"dbgwcr12_el1", DBGWCR12_EL1}, + {"dbgwcr13_el1", DBGWCR13_EL1}, + {"dbgwcr14_el1", DBGWCR14_EL1}, + {"dbgwcr15_el1", DBGWCR15_EL1}, + {"teehbr32_el1", TEEHBR32_EL1}, + {"osdlr_el1", OSDLR_EL1}, + {"dbgprcr_el1", DBGPRCR_EL1}, + {"dbgclaimset_el1", DBGCLAIMSET_EL1}, + {"dbgclaimclr_el1", DBGCLAIMCLR_EL1}, + {"csselr_el1", CSSELR_EL1}, + {"vpidr_el2", VPIDR_EL2}, + {"vmpidr_el2", VMPIDR_EL2}, + {"sctlr_el1", SCTLR_EL1}, + {"sctlr_el2", SCTLR_EL2}, + {"sctlr_el3", SCTLR_EL3}, + {"actlr_el1", ACTLR_EL1}, + {"actlr_el2", ACTLR_EL2}, + {"actlr_el3", ACTLR_EL3}, + {"cpacr_el1", CPACR_EL1}, + {"hcr_el2", HCR_EL2}, + {"scr_el3", SCR_EL3}, + {"mdcr_el2", MDCR_EL2}, + {"sder32_el3", SDER32_EL3}, + {"cptr_el2", CPTR_EL2}, + {"cptr_el3", CPTR_EL3}, + {"hstr_el2", HSTR_EL2}, + {"hacr_el2", HACR_EL2}, + {"mdcr_el3", MDCR_EL3}, + {"ttbr0_el1", TTBR0_EL1}, + {"ttbr0_el2", TTBR0_EL2}, + {"ttbr0_el3", TTBR0_EL3}, + {"ttbr1_el1", TTBR1_EL1}, + {"tcr_el1", TCR_EL1}, + {"tcr_el2", TCR_EL2}, + {"tcr_el3", TCR_EL3}, + {"vttbr_el2", VTTBR_EL2}, + {"vtcr_el2", VTCR_EL2}, + {"dacr32_el2", DACR32_EL2}, + {"spsr_el1", SPSR_EL1}, + {"spsr_el2", SPSR_EL2}, + {"spsr_el3", SPSR_EL3}, + {"elr_el1", ELR_EL1}, + {"elr_el2", ELR_EL2}, + {"elr_el3", ELR_EL3}, + {"sp_el0", SP_EL0}, + {"sp_el1", SP_EL1}, + {"sp_el2", SP_EL2}, + {"spsel", SPSel}, + {"nzcv", NZCV}, + {"daif", DAIF}, + {"currentel", CurrentEL}, + {"spsr_irq", SPSR_irq}, + {"spsr_abt", SPSR_abt}, + {"spsr_und", SPSR_und}, + {"spsr_fiq", SPSR_fiq}, + {"fpcr", FPCR}, + {"fpsr", FPSR}, + {"dspsr_el0", DSPSR_EL0}, + {"dlr_el0", DLR_EL0}, + {"ifsr32_el2", IFSR32_EL2}, + {"afsr0_el1", AFSR0_EL1}, + {"afsr0_el2", AFSR0_EL2}, + {"afsr0_el3", AFSR0_EL3}, + {"afsr1_el1", AFSR1_EL1}, + {"afsr1_el2", AFSR1_EL2}, + {"afsr1_el3", AFSR1_EL3}, + {"esr_el1", ESR_EL1}, + {"esr_el2", ESR_EL2}, + {"esr_el3", ESR_EL3}, + {"fpexc32_el2", FPEXC32_EL2}, + {"far_el1", FAR_EL1}, + {"far_el2", FAR_EL2}, + {"far_el3", FAR_EL3}, + {"hpfar_el2", HPFAR_EL2}, + {"par_el1", PAR_EL1}, + {"pmcr_el0", PMCR_EL0}, + {"pmcntenset_el0", PMCNTENSET_EL0}, + {"pmcntenclr_el0", PMCNTENCLR_EL0}, + {"pmovsclr_el0", PMOVSCLR_EL0}, + {"pmselr_el0", PMSELR_EL0}, + {"pmccntr_el0", PMCCNTR_EL0}, + {"pmxevtyper_el0", PMXEVTYPER_EL0}, + {"pmxevcntr_el0", PMXEVCNTR_EL0}, + {"pmuserenr_el0", PMUSERENR_EL0}, + {"pmintenset_el1", PMINTENSET_EL1}, + {"pmintenclr_el1", PMINTENCLR_EL1}, + {"pmovsset_el0", PMOVSSET_EL0}, + {"mair_el1", MAIR_EL1}, + {"mair_el2", MAIR_EL2}, + {"mair_el3", MAIR_EL3}, + {"amair_el1", AMAIR_EL1}, + {"amair_el2", AMAIR_EL2}, + {"amair_el3", AMAIR_EL3}, + {"vbar_el1", VBAR_EL1}, + {"vbar_el2", VBAR_EL2}, + {"vbar_el3", VBAR_EL3}, + {"rmr_el1", RMR_EL1}, + {"rmr_el2", RMR_EL2}, + {"rmr_el3", RMR_EL3}, + {"contextidr_el1", CONTEXTIDR_EL1}, + {"tpidr_el0", TPIDR_EL0}, + {"tpidr_el2", TPIDR_EL2}, + {"tpidr_el3", TPIDR_EL3}, + {"tpidrro_el0", TPIDRRO_EL0}, + {"tpidr_el1", TPIDR_EL1}, + {"cntfrq_el0", CNTFRQ_EL0}, + {"cntvoff_el2", CNTVOFF_EL2}, + {"cntkctl_el1", CNTKCTL_EL1}, + {"cnthctl_el2", CNTHCTL_EL2}, + {"cntp_tval_el0", CNTP_TVAL_EL0}, + {"cnthp_tval_el2", CNTHP_TVAL_EL2}, + {"cntps_tval_el1", 
CNTPS_TVAL_EL1}, + {"cntp_ctl_el0", CNTP_CTL_EL0}, + {"cnthp_ctl_el2", CNTHP_CTL_EL2}, + {"cntps_ctl_el1", CNTPS_CTL_EL1}, + {"cntp_cval_el0", CNTP_CVAL_EL0}, + {"cnthp_cval_el2", CNTHP_CVAL_EL2}, + {"cntps_cval_el1", CNTPS_CVAL_EL1}, + {"cntv_tval_el0", CNTV_TVAL_EL0}, + {"cntv_ctl_el0", CNTV_CTL_EL0}, + {"cntv_cval_el0", CNTV_CVAL_EL0}, + {"pmevcntr0_el0", PMEVCNTR0_EL0}, + {"pmevcntr1_el0", PMEVCNTR1_EL0}, + {"pmevcntr2_el0", PMEVCNTR2_EL0}, + {"pmevcntr3_el0", PMEVCNTR3_EL0}, + {"pmevcntr4_el0", PMEVCNTR4_EL0}, + {"pmevcntr5_el0", PMEVCNTR5_EL0}, + {"pmevcntr6_el0", PMEVCNTR6_EL0}, + {"pmevcntr7_el0", PMEVCNTR7_EL0}, + {"pmevcntr8_el0", PMEVCNTR8_EL0}, + {"pmevcntr9_el0", PMEVCNTR9_EL0}, + {"pmevcntr10_el0", PMEVCNTR10_EL0}, + {"pmevcntr11_el0", PMEVCNTR11_EL0}, + {"pmevcntr12_el0", PMEVCNTR12_EL0}, + {"pmevcntr13_el0", PMEVCNTR13_EL0}, + {"pmevcntr14_el0", PMEVCNTR14_EL0}, + {"pmevcntr15_el0", PMEVCNTR15_EL0}, + {"pmevcntr16_el0", PMEVCNTR16_EL0}, + {"pmevcntr17_el0", PMEVCNTR17_EL0}, + {"pmevcntr18_el0", PMEVCNTR18_EL0}, + {"pmevcntr19_el0", PMEVCNTR19_EL0}, + {"pmevcntr20_el0", PMEVCNTR20_EL0}, + {"pmevcntr21_el0", PMEVCNTR21_EL0}, + {"pmevcntr22_el0", PMEVCNTR22_EL0}, + {"pmevcntr23_el0", PMEVCNTR23_EL0}, + {"pmevcntr24_el0", PMEVCNTR24_EL0}, + {"pmevcntr25_el0", PMEVCNTR25_EL0}, + {"pmevcntr26_el0", PMEVCNTR26_EL0}, + {"pmevcntr27_el0", PMEVCNTR27_EL0}, + {"pmevcntr28_el0", PMEVCNTR28_EL0}, + {"pmevcntr29_el0", PMEVCNTR29_EL0}, + {"pmevcntr30_el0", PMEVCNTR30_EL0}, + {"pmccfiltr_el0", PMCCFILTR_EL0}, + {"pmevtyper0_el0", PMEVTYPER0_EL0}, + {"pmevtyper1_el0", PMEVTYPER1_EL0}, + {"pmevtyper2_el0", PMEVTYPER2_EL0}, + {"pmevtyper3_el0", PMEVTYPER3_EL0}, + {"pmevtyper4_el0", PMEVTYPER4_EL0}, + {"pmevtyper5_el0", PMEVTYPER5_EL0}, + {"pmevtyper6_el0", PMEVTYPER6_EL0}, + {"pmevtyper7_el0", PMEVTYPER7_EL0}, + {"pmevtyper8_el0", PMEVTYPER8_EL0}, + {"pmevtyper9_el0", PMEVTYPER9_EL0}, + {"pmevtyper10_el0", PMEVTYPER10_EL0}, + {"pmevtyper11_el0", PMEVTYPER11_EL0}, + {"pmevtyper12_el0", PMEVTYPER12_EL0}, + {"pmevtyper13_el0", PMEVTYPER13_EL0}, + {"pmevtyper14_el0", PMEVTYPER14_EL0}, + {"pmevtyper15_el0", PMEVTYPER15_EL0}, + {"pmevtyper16_el0", PMEVTYPER16_EL0}, + {"pmevtyper17_el0", PMEVTYPER17_EL0}, + {"pmevtyper18_el0", PMEVTYPER18_EL0}, + {"pmevtyper19_el0", PMEVTYPER19_EL0}, + {"pmevtyper20_el0", PMEVTYPER20_EL0}, + {"pmevtyper21_el0", PMEVTYPER21_EL0}, + {"pmevtyper22_el0", PMEVTYPER22_EL0}, + {"pmevtyper23_el0", PMEVTYPER23_EL0}, + {"pmevtyper24_el0", PMEVTYPER24_EL0}, + {"pmevtyper25_el0", PMEVTYPER25_EL0}, + {"pmevtyper26_el0", PMEVTYPER26_EL0}, + {"pmevtyper27_el0", PMEVTYPER27_EL0}, + {"pmevtyper28_el0", PMEVTYPER28_EL0}, + {"pmevtyper29_el0", PMEVTYPER29_EL0}, + {"pmevtyper30_el0", PMEVTYPER30_EL0}, +}; + +uint32_t +A64SysReg::SysRegMapper::fromString(StringRef Name, bool &Valid) const { + // First search the registers shared by all + std::string NameLower = Name.lower(); + for (unsigned i = 0; i < array_lengthof(SysRegPairs); ++i) { + if (SysRegPairs[i].Name == NameLower) { + Valid = true; + return SysRegPairs[i].Value; + } + } + + // Now try the instruction-specific registers (either read-only or + // write-only). 
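+  // For example, "nzcv" is found in the shared SysRegPairs table above for
+  // both MRS and MSR, whereas the write-only "oslar_el1" is reachable here
+  // only via the MSR mapper's InstPairs; an MRS lookup of that name falls
+  // through to the generic-form parse below and fails.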
+  for (unsigned i = 0; i < NumInstPairs; ++i) {
+    if (InstPairs[i].Name == NameLower) {
+      Valid = true;
+      return InstPairs[i].Value;
+    }
+  }
+
+  // Try to parse an S<op0>_<op1>_<Cn>_<Cm>_<op2> register name, where the
+  // bits are: 11 xxx 1x11 xxxx xxx
+  Regex GenericRegPattern("^s3_([0-7])_c(1[15])_c([0-9]|1[0-5])_([0-7])$");
+
+  SmallVector<StringRef, 4> Ops;
+  if (!GenericRegPattern.match(NameLower, &Ops)) {
+    Valid = false;
+    return -1;
+  }
+
+  uint32_t Op0 = 3, Op1 = 0, CRn = 0, CRm = 0, Op2 = 0;
+  uint32_t Bits;
+  Ops[1].getAsInteger(10, Op1);
+  Ops[2].getAsInteger(10, CRn);
+  Ops[3].getAsInteger(10, CRm);
+  Ops[4].getAsInteger(10, Op2);
+  Bits = (Op0 << 14) | (Op1 << 11) | (CRn << 7) | (CRm << 3) | Op2;
+
+  Valid = true;
+  return Bits;
+}
+
+std::string
+A64SysReg::SysRegMapper::toString(uint32_t Bits, bool &Valid) const {
+  for (unsigned i = 0; i < array_lengthof(SysRegPairs); ++i) {
+    if (SysRegPairs[i].Value == Bits) {
+      Valid = true;
+      return SysRegPairs[i].Name;
+    }
+  }
+
+  for (unsigned i = 0; i < NumInstPairs; ++i) {
+    if (InstPairs[i].Value == Bits) {
+      Valid = true;
+      return InstPairs[i].Name;
+    }
+  }
+
+  uint32_t Op0 = (Bits >> 14) & 0x3;
+  uint32_t Op1 = (Bits >> 11) & 0x7;
+  uint32_t CRn = (Bits >> 7) & 0xf;
+  uint32_t CRm = (Bits >> 3) & 0xf;
+  uint32_t Op2 = Bits & 0x7;
+
+  // Only combinations matching: 11 xxx 1x11 xxxx xxx are valid for a generic
+  // name.
+  if (Op0 != 3 || (CRn != 11 && CRn != 15)) {
+    Valid = false;
+    return "";
+  }
+
+  assert(Op0 == 3 && (CRn == 11 || CRn == 15) && "Invalid generic sysreg");
+
+  Valid = true;
+  return "s3_" + utostr(Op1) + "_c" + utostr(CRn)
+           + "_c" + utostr(CRm) + "_" + utostr(Op2);
+}
+
+const NamedImmMapper::Mapping A64TLBI::TLBIMapper::TLBIPairs[] = {
+  {"ipas2e1is", IPAS2E1IS},
+  {"ipas2le1is", IPAS2LE1IS},
+  {"vmalle1is", VMALLE1IS},
+  {"alle2is", ALLE2IS},
+  {"alle3is", ALLE3IS},
+  {"vae1is", VAE1IS},
+  {"vae2is", VAE2IS},
+  {"vae3is", VAE3IS},
+  {"aside1is", ASIDE1IS},
+  {"vaae1is", VAAE1IS},
+  {"alle1is", ALLE1IS},
+  {"vale1is", VALE1IS},
+  {"vale2is", VALE2IS},
+  {"vale3is", VALE3IS},
+  {"vmalls12e1is", VMALLS12E1IS},
+  {"vaale1is", VAALE1IS},
+  {"ipas2e1", IPAS2E1},
+  {"ipas2le1", IPAS2LE1},
+  {"vmalle1", VMALLE1},
+  {"alle2", ALLE2},
+  {"alle3", ALLE3},
+  {"vae1", VAE1},
+  {"vae2", VAE2},
+  {"vae3", VAE3},
+  {"aside1", ASIDE1},
+  {"vaae1", VAAE1},
+  {"alle1", ALLE1},
+  {"vale1", VALE1},
+  {"vale2", VALE2},
+  {"vale3", VALE3},
+  {"vmalls12e1", VMALLS12E1},
+  {"vaale1", VAALE1}
+};
+
+A64TLBI::TLBIMapper::TLBIMapper()
+  : NamedImmMapper(TLBIPairs, 0) {}
+
+bool A64Imms::isFPImm(const APFloat &Val, uint32_t &Imm8Bits) {
+  const fltSemantics &Sem = Val.getSemantics();
+  unsigned FracBits = APFloat::semanticsPrecision(Sem) - 1;
+
+  uint32_t ExpMask;
+  switch (FracBits) {
+  case 10: // IEEE half-precision
+    ExpMask = 0x1f;
+    break;
+  case 23: // IEEE single-precision
+    ExpMask = 0xff;
+    break;
+  case 52: // IEEE double-precision
+    ExpMask = 0x7ff;
+    break;
+  case 112: // IEEE quad-precision
+    // No immediates are valid for quad precision.
+    return false;
+  default:
+    llvm_unreachable("Only half, single and double precision supported");
+  }
+
+  uint32_t ExpStart = FracBits;
+  uint64_t FracMask = (1ULL << FracBits) - 1;
+
+  uint32_t Sign = Val.isNegative();
+
+  uint64_t Bits = Val.bitcastToAPInt().getLimitedValue();
+  uint64_t Fraction = Bits & FracMask;
+  int32_t Exponent = ((Bits >> ExpStart) & ExpMask);
+  Exponent -= ExpMask >> 1;
+
+  // S[d] = imm8<7>:NOT(imm8<6>):Replicate(imm8<6>, 5):imm8<5:0>:Zeros(19)
+  // D[d] = imm8<7>:NOT(imm8<6>):Replicate(imm8<6>, 8):imm8<5:0>:Zeros(48)
+  // This translates to: only 4 bits of fraction; -3 <= exp <= 4.
+  uint64_t A64FracStart = FracBits - 4;
+  uint64_t A64FracMask = 0xf;
+
+  // Are there too many fraction bits?
+  if (Fraction & ~(A64FracMask << A64FracStart))
+    return false;
+
+  if (Exponent < -3 || Exponent > 4)
+    return false;
+
+  uint32_t PackedFraction = (Fraction >> A64FracStart) & A64FracMask;
+  uint32_t PackedExp = (Exponent + 7) & 0x7;
+
+  Imm8Bits = (Sign << 7) | (PackedExp << 4) | PackedFraction;
+  return true;
+}
+
+// Encoding of the immediate for logical (immediate) instructions:
+//
+// | N | imms   | immr   | size | R            | S            |
+// |---+--------+--------+------+--------------+--------------|
+// | 1 | ssssss | rrrrrr | 64   | UInt(rrrrrr) | UInt(ssssss) |
+// | 0 | 0sssss | xrrrrr | 32   | UInt(rrrrr)  | UInt(sssss)  |
+// | 0 | 10ssss | xxrrrr | 16   | UInt(rrrr)   | UInt(ssss)   |
+// | 0 | 110sss | xxxrrr | 8    | UInt(rrr)    | UInt(sss)    |
+// | 0 | 1110ss | xxxxrr | 4    | UInt(rr)     | UInt(ss)     |
+// | 0 | 11110s | xxxxxr | 2    | UInt(r)      | UInt(s)      |
+// | 0 | 11111x | -      |      | UNALLOCATED  |              |
+//
+// Columns 'R', 'S' and 'size' specify a "bitmask immediate" of size bits in
+// which the lower S+1 bits are ones and the remaining bits are zero, then
+// rotated right by R bits, which is then replicated across the datapath.
+//
+// + Values of 'N', 'imms' and 'immr' which do not match the above table are
+//   RESERVED.
+// + If all 's' bits in the imms field are set then the instruction is
+//   RESERVED.
+// + The 'x' bits in the 'immr' field are IGNORED.
+
+bool A64Imms::isLogicalImm(unsigned RegWidth, uint64_t Imm, uint32_t &Bits) {
+  int RepeatWidth;
+  int Rotation = 0;
+  int Num1s = 0;
+
+  // Because there are S+1 ones in the replicated mask, an immediate of all
+  // zeros is not allowed. Filtering it here is probably more efficient.
+  if (Imm == 0) return false;
+
+  for (RepeatWidth = RegWidth; RepeatWidth > 1; RepeatWidth /= 2) {
+    uint64_t RepeatMask = RepeatWidth == 64 ? -1 : (1ULL << RepeatWidth) - 1;
+    uint64_t ReplicatedMask = Imm & RepeatMask;
+
+    if (ReplicatedMask == 0) continue;
+
+    // First we have to make sure the mask is actually repeated in each slot
+    // for this width-specifier.
+    bool IsReplicatedMask = true;
+    for (unsigned i = RepeatWidth; i < RegWidth; i += RepeatWidth) {
+      if (((Imm >> i) & RepeatMask) != ReplicatedMask) {
+        IsReplicatedMask = false;
+        break;
+      }
+    }
+    if (!IsReplicatedMask) continue;
+
+    // Now we have to work out the amount of rotation needed. The first part
+    // of this calculation is actually independent of RepeatWidth, but the
+    // complex case will depend on it.
+    Rotation = CountTrailingZeros_64(Imm);
+    if (Rotation == 0) {
+      // There were no trailing zeros, which means it's either in place or
+      // there are 1s at each end (e.g. 0x8003 needs rotating).
+      Rotation = RegWidth == 64 ?
CountLeadingOnes_64(Imm) : CountLeadingOnes_32(Imm);
+      Rotation = RepeatWidth - Rotation;
+    }
+
+    uint64_t ReplicatedOnes = (ReplicatedMask >> Rotation)
+        | ((ReplicatedMask << (RepeatWidth - Rotation)) & RepeatMask);
+    // Of course, they may not actually be ones, so we have to check that:
+    if (!isMask_64(ReplicatedOnes))
+      continue;
+
+    Num1s = CountTrailingOnes_64(ReplicatedOnes);
+
+    // We know we've got an almost valid encoding (certainly, if this is
+    // invalid no other parameters would work).
+    break;
+  }
+
+  // The encodings which would produce all 1s are RESERVED.
+  if (RepeatWidth == 1 || Num1s == RepeatWidth) return false;
+
+  uint32_t N = RepeatWidth == 64;
+  uint32_t ImmR = RepeatWidth - Rotation;
+  uint32_t ImmS = Num1s - 1;
+
+  switch (RepeatWidth) {
+  default: break; // No action required for the 32 and 64-bit repeat widths.
+  case 16: ImmS |= 0x20; break; // 10ssss
+  case 8:  ImmS |= 0x30; break; // 110sss
+  case 4:  ImmS |= 0x38; break; // 1110ss
+  case 2:  ImmS |= 0x3c; break; // 11110s
+  }
+
+  Bits = ImmS | (ImmR << 6) | (N << 12);
+
+  return true;
+}
+
+
+bool A64Imms::isLogicalImmBits(unsigned RegWidth, uint32_t Bits,
+                               uint64_t &Imm) {
+  uint32_t N = Bits >> 12;
+  uint32_t ImmR = (Bits >> 6) & 0x3f;
+  uint32_t ImmS = Bits & 0x3f;
+
+  // N=1 encodes a 64-bit replication and is invalid for the 32-bit
+  // instructions.
+  if (RegWidth == 32 && N != 0) return false;
+
+  int Width = 0;
+  if (N == 1)
+    Width = 64;
+  else if ((ImmS & 0x20) == 0)
+    Width = 32;
+  else if ((ImmS & 0x10) == 0)
+    Width = 16;
+  else if ((ImmS & 0x08) == 0)
+    Width = 8;
+  else if ((ImmS & 0x04) == 0)
+    Width = 4;
+  else if ((ImmS & 0x02) == 0)
+    Width = 2;
+  else {
+    // ImmS is 0b11111x: UNALLOCATED
+    return false;
+  }
+
+  int Num1s = (ImmS & (Width - 1)) + 1;
+
+  // All encodings which would map to -1 (signed) are RESERVED.
+  if (Num1s == Width) return false;
+
+  int Rotation = (ImmR & (Width - 1));
+  uint64_t Mask = (1ULL << Num1s) - 1;
+  uint64_t WidthMask = Width == 64 ? -1 : (1ULL << Width) - 1;
+  Mask = (Mask >> Rotation)
+      | ((Mask << (Width - Rotation)) & WidthMask);
+
+  Imm = 0;
+  for (unsigned i = 0; i < RegWidth / Width; ++i) {
+    Imm |= Mask;
+    Mask <<= Width;
+  }
+
+  return true;
+}
+
+bool A64Imms::isMOVZImm(int RegWidth, uint64_t Value, int &UImm16, int &Shift) {
+  // If high bits are set then a 32-bit MOVZ can't possibly work.
+  if (RegWidth == 32 && (Value & ~0xffffffffULL))
+    return false;
+
+  for (int i = 0; i < RegWidth; i += 16) {
+    // If the value is 0 when we mask out all the bits that could be set with
+    // the current LSL value then it's representable.
+    if ((Value & ~(0xffffULL << i)) == 0) {
+      Shift = i / 16;
+      UImm16 = (Value >> i) & 0xffff;
+      return true;
+    }
+  }
+  return false;
+}
+
+bool A64Imms::isMOVNImm(int RegWidth, uint64_t Value, int &UImm16, int &Shift) {
+  // MOVN is defined to set its register to NOT(LSL(imm16, shift)).
+
+  // We have to be a little careful about a 32-bit register: 0xffff_1234 *is*
+  // representable, but ~0xffff_1234 == 0xffff_ffff_0000_edcb which is not
+  // a valid input for isMOVZImm.
+  if (RegWidth == 32 && (Value & ~0xffffffffULL))
+    return false;
+
+  uint64_t MOVZEquivalent = RegWidth == 32 ? ~Value & 0xffffffff : ~Value;
+
+  return isMOVZImm(RegWidth, MOVZEquivalent, UImm16, Shift);
+}
+
+bool A64Imms::isOnlyMOVNImm(int RegWidth, uint64_t Value,
+                            int &UImm16, int &Shift) {
+  if (isMOVZImm(RegWidth, Value, UImm16, Shift))
+    return false;
+
+  return isMOVNImm(RegWidth, Value, UImm16, Shift);
+}
diff --git a/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/lib/Target/AArch64/Utils/AArch64BaseInfo.h
new file mode 100644
index 0000000..5eebf44
--- /dev/null
+++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.h
@@ -0,0 +1,784 @@
+//===-- AArch64BaseInfo.h - Top level definitions for AArch64 --*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains small standalone helper functions and enum definitions
+// for the AArch64 target useful for the compiler back-end and the MC
+// libraries. As such, it deliberately does not include references to LLVM
+// core code gen types, passes, etc.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_AARCH64_BASEINFO_H
+#define LLVM_AARCH64_BASEINFO_H
+
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/ErrorHandling.h"
+
+namespace llvm {
+
+// Enums corresponding to AArch64 condition codes
+namespace A64CC {
+  // The CondCodes constants map directly to the 4-bit encoding of the
+  // condition field for predicated instructions.
+  enum CondCodes {  // Meaning (integer)          Meaning (floating-point)
+    EQ = 0,         // Equal                      Equal
+    NE,             // Not equal                  Not equal, or unordered
+    HS,             // Unsigned higher or same    >, ==, or unordered
+    LO,             // Unsigned lower             Less than
+    MI,             // Minus, negative            Less than
+    PL,             // Plus, positive or zero     >, ==, or unordered
+    VS,             // Overflow                   Unordered
+    VC,             // No overflow                Ordered
+    HI,             // Unsigned higher            Greater than, or unordered
+    LS,             // Unsigned lower or same     Less than or equal
+    GE,             // Greater than or equal      Greater than or equal
+    LT,             // Less than                  Less than, or unordered
+    GT,             // Signed greater than        Greater than
+    LE,             // Signed less than or equal  <, ==, or unordered
+    AL,             // Always (unconditional)     Always (unconditional)
+    NV,             // Always (unconditional)     Always (unconditional)
+    // Note the NV exists purely to disassemble 0b1111. Execution
+    // is "always".
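+    // The codes are also arranged so that each condition sits next to its
+    // logical inverse (EQ/NE, HS/LO, MI/PL, VS/VC, HI/LS, GE/LT, GT/LE),
+    // which is why inverting a condition only needs a low-bit flip; see
+    // A64InvertCondCode below.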
+    Invalid
+  };
+
+} // namespace A64CC
+
+inline static const char *A64CondCodeToString(A64CC::CondCodes CC) {
+  switch (CC) {
+  default: llvm_unreachable("Unknown condition code");
+  case A64CC::EQ: return "eq";
+  case A64CC::NE: return "ne";
+  case A64CC::HS: return "hs";
+  case A64CC::LO: return "lo";
+  case A64CC::MI: return "mi";
+  case A64CC::PL: return "pl";
+  case A64CC::VS: return "vs";
+  case A64CC::VC: return "vc";
+  case A64CC::HI: return "hi";
+  case A64CC::LS: return "ls";
+  case A64CC::GE: return "ge";
+  case A64CC::LT: return "lt";
+  case A64CC::GT: return "gt";
+  case A64CC::LE: return "le";
+  case A64CC::AL: return "al";
+  case A64CC::NV: return "nv";
+  }
+}
+
+inline static A64CC::CondCodes A64StringToCondCode(StringRef CondStr) {
+  return StringSwitch<A64CC::CondCodes>(CondStr.lower())
+             .Case("eq", A64CC::EQ)
+             .Case("ne", A64CC::NE)
+             .Case("hs", A64CC::HS)
+             .Case("cs", A64CC::HS)
+             .Case("lo", A64CC::LO)
+             .Case("cc", A64CC::LO)
+             .Case("mi", A64CC::MI)
+             .Case("pl", A64CC::PL)
+             .Case("vs", A64CC::VS)
+             .Case("vc", A64CC::VC)
+             .Case("hi", A64CC::HI)
+             .Case("ls", A64CC::LS)
+             .Case("ge", A64CC::GE)
+             .Case("lt", A64CC::LT)
+             .Case("gt", A64CC::GT)
+             .Case("le", A64CC::LE)
+             .Case("al", A64CC::AL)
+             .Case("nv", A64CC::NV)
+             .Default(A64CC::Invalid);
+}
+
+inline static A64CC::CondCodes A64InvertCondCode(A64CC::CondCodes CC) {
+  // It turns out that the condition codes have been designed so that in order
+  // to reverse the intent of the condition you only have to invert the low
+  // bit.
+  return static_cast<A64CC::CondCodes>(static_cast<unsigned>(CC) ^ 0x1);
+}
+
+/// Instances of this class can perform bidirectional mapping from arbitrary
+/// identifier strings to operand encodings. For example "MSR" takes a named
+/// system-register which must be encoded somehow and decoded for printing.
+/// This central location means that the information for those transformations
+/// is not duplicated and remains in sync.
+///
+/// FIXME: currently the algorithm is a completely unoptimised linear search.
+/// Obviously this could be improved, but we would probably want to work out
+/// just how often these instructions are emitted before working on it. It
+/// might even be optimal to just reorder the tables for the common
+/// instructions rather than changing the algorithm.
+struct NamedImmMapper {
+  struct Mapping {
+    const char *Name;
+    uint32_t Value;
+  };
+
+  template<int N>
+  NamedImmMapper(const Mapping (&Pairs)[N], uint32_t TooBigImm)
+    : Pairs(&Pairs[0]), NumPairs(N), TooBigImm(TooBigImm) {}
+
+  StringRef toString(uint32_t Value, bool &Valid) const;
+  uint32_t fromString(StringRef Name, bool &Valid) const;
+
+  /// Many of the instructions allow an alternative assembly form consisting
+  /// of a simple immediate. Currently the only valid forms are ranges [0, N),
+  /// where N == 0 indicates that no immediate syntax-form is allowed.
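+  /// For example, TLBIMapper is constructed with TooBigImm == 0, so TLBI
+  /// operations accept no immediate form at all and validImm rejects every
+  /// value.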
+ bool validImm(uint32_t Value) const; +protected: + const Mapping *Pairs; + size_t NumPairs; + uint32_t TooBigImm; +}; + +namespace A64AT { + enum ATValues { + Invalid = -1, // Op0 Op1 CRn CRm Op2 + S1E1R = 0x43c0, // 01 000 0111 1000 000 + S1E2R = 0x63c0, // 01 100 0111 1000 000 + S1E3R = 0x73c0, // 01 110 0111 1000 000 + S1E1W = 0x43c1, // 01 000 0111 1000 001 + S1E2W = 0x63c1, // 01 100 0111 1000 001 + S1E3W = 0x73c1, // 01 110 0111 1000 001 + S1E0R = 0x43c2, // 01 000 0111 1000 010 + S1E0W = 0x43c3, // 01 000 0111 1000 011 + S12E1R = 0x63c4, // 01 100 0111 1000 100 + S12E1W = 0x63c5, // 01 100 0111 1000 101 + S12E0R = 0x63c6, // 01 100 0111 1000 110 + S12E0W = 0x63c7 // 01 100 0111 1000 111 + }; + + struct ATMapper : NamedImmMapper { + const static Mapping ATPairs[]; + + ATMapper(); + }; + +} +namespace A64DB { + enum DBValues { + Invalid = -1, + OSHLD = 0x1, + OSHST = 0x2, + OSH = 0x3, + NSHLD = 0x5, + NSHST = 0x6, + NSH = 0x7, + ISHLD = 0x9, + ISHST = 0xa, + ISH = 0xb, + LD = 0xd, + ST = 0xe, + SY = 0xf + }; + + struct DBarrierMapper : NamedImmMapper { + const static Mapping DBarrierPairs[]; + + DBarrierMapper(); + }; +} + +namespace A64DC { + enum DCValues { + Invalid = -1, // Op1 CRn CRm Op2 + ZVA = 0x5ba1, // 01 011 0111 0100 001 + IVAC = 0x43b1, // 01 000 0111 0110 001 + ISW = 0x43b2, // 01 000 0111 0110 010 + CVAC = 0x5bd1, // 01 011 0111 1010 001 + CSW = 0x43d2, // 01 000 0111 1010 010 + CVAU = 0x5bd9, // 01 011 0111 1011 001 + CIVAC = 0x5bf1, // 01 011 0111 1110 001 + CISW = 0x43f2 // 01 000 0111 1110 010 + }; + + struct DCMapper : NamedImmMapper { + const static Mapping DCPairs[]; + + DCMapper(); + }; + +} + +namespace A64IC { + enum ICValues { + Invalid = -1, // Op1 CRn CRm Op2 + IALLUIS = 0x0388, // 000 0111 0001 000 + IALLU = 0x03a8, // 000 0111 0101 000 + IVAU = 0x1ba9 // 011 0111 0101 001 + }; + + + struct ICMapper : NamedImmMapper { + const static Mapping ICPairs[]; + + ICMapper(); + }; + + static inline bool NeedsRegister(ICValues Val) { + return Val == IVAU; + } +} + +namespace A64ISB { + enum ISBValues { + Invalid = -1, + SY = 0xf + }; + struct ISBMapper : NamedImmMapper { + const static Mapping ISBPairs[]; + + ISBMapper(); + }; +} + +namespace A64PRFM { + enum PRFMValues { + Invalid = -1, + PLDL1KEEP = 0x00, + PLDL1STRM = 0x01, + PLDL2KEEP = 0x02, + PLDL2STRM = 0x03, + PLDL3KEEP = 0x04, + PLDL3STRM = 0x05, + PLIL1KEEP = 0x08, + PLIL1STRM = 0x09, + PLIL2KEEP = 0x0a, + PLIL2STRM = 0x0b, + PLIL3KEEP = 0x0c, + PLIL3STRM = 0x0d, + PSTL1KEEP = 0x10, + PSTL1STRM = 0x11, + PSTL2KEEP = 0x12, + PSTL2STRM = 0x13, + PSTL3KEEP = 0x14, + PSTL3STRM = 0x15 + }; + + struct PRFMMapper : NamedImmMapper { + const static Mapping PRFMPairs[]; + + PRFMMapper(); + }; +} + +namespace A64PState { + enum PStateValues { + Invalid = -1, + SPSel = 0x05, + DAIFSet = 0x1e, + DAIFClr = 0x1f + }; + + struct PStateMapper : NamedImmMapper { + const static Mapping PStatePairs[]; + + PStateMapper(); + }; + +} + +namespace A64SE { + enum ShiftExtSpecifiers { + Invalid = -1, + LSL, + LSR, + ASR, + ROR, + + UXTB, + UXTH, + UXTW, + UXTX, + + SXTB, + SXTH, + SXTW, + SXTX + }; +} + +namespace A64SysReg { + enum SysRegROValues { + MDCCSR_EL0 = 0x9808, // 10 011 0000 0001 000 + DBGDTRRX_EL0 = 0x9828, // 10 011 0000 0101 000 + MDRAR_EL1 = 0x8080, // 10 000 0001 0000 000 + OSLSR_EL1 = 0x808c, // 10 000 0001 0001 100 + DBGAUTHSTATUS_EL1 = 0x83f6, // 10 000 0111 1110 110 + PMCEID0_EL0 = 0xdce6, // 11 011 1001 1100 110 + PMCEID1_EL0 = 0xdce7, // 11 011 1001 1100 111 + MIDR_EL1 = 0xc000, // 11 000 0000 0000 000 + 
CCSIDR_EL1 = 0xc800, // 11 001 0000 0000 000 + CLIDR_EL1 = 0xc801, // 11 001 0000 0000 001 + CTR_EL0 = 0xd801, // 11 011 0000 0000 001 + MPIDR_EL1 = 0xc005, // 11 000 0000 0000 101 + REVIDR_EL1 = 0xc006, // 11 000 0000 0000 110 + AIDR_EL1 = 0xc807, // 11 001 0000 0000 111 + DCZID_EL0 = 0xd807, // 11 011 0000 0000 111 + ID_PFR0_EL1 = 0xc008, // 11 000 0000 0001 000 + ID_PFR1_EL1 = 0xc009, // 11 000 0000 0001 001 + ID_DFR0_EL1 = 0xc00a, // 11 000 0000 0001 010 + ID_AFR0_EL1 = 0xc00b, // 11 000 0000 0001 011 + ID_MMFR0_EL1 = 0xc00c, // 11 000 0000 0001 100 + ID_MMFR1_EL1 = 0xc00d, // 11 000 0000 0001 101 + ID_MMFR2_EL1 = 0xc00e, // 11 000 0000 0001 110 + ID_MMFR3_EL1 = 0xc00f, // 11 000 0000 0001 111 + ID_ISAR0_EL1 = 0xc010, // 11 000 0000 0010 000 + ID_ISAR1_EL1 = 0xc011, // 11 000 0000 0010 001 + ID_ISAR2_EL1 = 0xc012, // 11 000 0000 0010 010 + ID_ISAR3_EL1 = 0xc013, // 11 000 0000 0010 011 + ID_ISAR4_EL1 = 0xc014, // 11 000 0000 0010 100 + ID_ISAR5_EL1 = 0xc015, // 11 000 0000 0010 101 + ID_AA64PFR0_EL1 = 0xc020, // 11 000 0000 0100 000 + ID_AA64PFR1_EL1 = 0xc021, // 11 000 0000 0100 001 + ID_AA64DFR0_EL1 = 0xc028, // 11 000 0000 0101 000 + ID_AA64DFR1_EL1 = 0xc029, // 11 000 0000 0101 001 + ID_AA64AFR0_EL1 = 0xc02c, // 11 000 0000 0101 100 + ID_AA64AFR1_EL1 = 0xc02d, // 11 000 0000 0101 101 + ID_AA64ISAR0_EL1 = 0xc030, // 11 000 0000 0110 000 + ID_AA64ISAR1_EL1 = 0xc031, // 11 000 0000 0110 001 + ID_AA64MMFR0_EL1 = 0xc038, // 11 000 0000 0111 000 + ID_AA64MMFR1_EL1 = 0xc039, // 11 000 0000 0111 001 + MVFR0_EL1 = 0xc018, // 11 000 0000 0011 000 + MVFR1_EL1 = 0xc019, // 11 000 0000 0011 001 + MVFR2_EL1 = 0xc01a, // 11 000 0000 0011 010 + RVBAR_EL1 = 0xc601, // 11 000 1100 0000 001 + RVBAR_EL2 = 0xe601, // 11 100 1100 0000 001 + RVBAR_EL3 = 0xf601, // 11 110 1100 0000 001 + ISR_EL1 = 0xc608, // 11 000 1100 0001 000 + CNTPCT_EL0 = 0xdf01, // 11 011 1110 0000 001 + CNTVCT_EL0 = 0xdf02 // 11 011 1110 0000 010 + }; + + enum SysRegWOValues { + DBGDTRTX_EL0 = 0x9828, // 10 011 0000 0101 000 + OSLAR_EL1 = 0x8084, // 10 000 0001 0000 100 + PMSWINC_EL0 = 0xdce4 // 11 011 1001 1100 100 + }; + + enum SysRegValues { + Invalid = -1, // Op0 Op1 CRn CRm Op2 + OSDTRRX_EL1 = 0x8002, // 10 000 0000 0000 010 + OSDTRTX_EL1 = 0x801a, // 10 000 0000 0011 010 + TEECR32_EL1 = 0x9000, // 10 010 0000 0000 000 + MDCCINT_EL1 = 0x8010, // 10 000 0000 0010 000 + MDSCR_EL1 = 0x8012, // 10 000 0000 0010 010 + DBGDTR_EL0 = 0x9820, // 10 011 0000 0100 000 + OSECCR_EL1 = 0x8032, // 10 000 0000 0110 010 + DBGVCR32_EL2 = 0xa038, // 10 100 0000 0111 000 + DBGBVR0_EL1 = 0x8004, // 10 000 0000 0000 100 + DBGBVR1_EL1 = 0x800c, // 10 000 0000 0001 100 + DBGBVR2_EL1 = 0x8014, // 10 000 0000 0010 100 + DBGBVR3_EL1 = 0x801c, // 10 000 0000 0011 100 + DBGBVR4_EL1 = 0x8024, // 10 000 0000 0100 100 + DBGBVR5_EL1 = 0x802c, // 10 000 0000 0101 100 + DBGBVR6_EL1 = 0x8034, // 10 000 0000 0110 100 + DBGBVR7_EL1 = 0x803c, // 10 000 0000 0111 100 + DBGBVR8_EL1 = 0x8044, // 10 000 0000 1000 100 + DBGBVR9_EL1 = 0x804c, // 10 000 0000 1001 100 + DBGBVR10_EL1 = 0x8054, // 10 000 0000 1010 100 + DBGBVR11_EL1 = 0x805c, // 10 000 0000 1011 100 + DBGBVR12_EL1 = 0x8064, // 10 000 0000 1100 100 + DBGBVR13_EL1 = 0x806c, // 10 000 0000 1101 100 + DBGBVR14_EL1 = 0x8074, // 10 000 0000 1110 100 + DBGBVR15_EL1 = 0x807c, // 10 000 0000 1111 100 + DBGBCR0_EL1 = 0x8005, // 10 000 0000 0000 101 + DBGBCR1_EL1 = 0x800d, // 10 000 0000 0001 101 + DBGBCR2_EL1 = 0x8015, // 10 000 0000 0010 101 + DBGBCR3_EL1 = 0x801d, // 10 000 0000 0011 101 + DBGBCR4_EL1 = 0x8025, // 
10 000 0000 0100 101 + DBGBCR5_EL1 = 0x802d, // 10 000 0000 0101 101 + DBGBCR6_EL1 = 0x8035, // 10 000 0000 0110 101 + DBGBCR7_EL1 = 0x803d, // 10 000 0000 0111 101 + DBGBCR8_EL1 = 0x8045, // 10 000 0000 1000 101 + DBGBCR9_EL1 = 0x804d, // 10 000 0000 1001 101 + DBGBCR10_EL1 = 0x8055, // 10 000 0000 1010 101 + DBGBCR11_EL1 = 0x805d, // 10 000 0000 1011 101 + DBGBCR12_EL1 = 0x8065, // 10 000 0000 1100 101 + DBGBCR13_EL1 = 0x806d, // 10 000 0000 1101 101 + DBGBCR14_EL1 = 0x8075, // 10 000 0000 1110 101 + DBGBCR15_EL1 = 0x807d, // 10 000 0000 1111 101 + DBGWVR0_EL1 = 0x8006, // 10 000 0000 0000 110 + DBGWVR1_EL1 = 0x800e, // 10 000 0000 0001 110 + DBGWVR2_EL1 = 0x8016, // 10 000 0000 0010 110 + DBGWVR3_EL1 = 0x801e, // 10 000 0000 0011 110 + DBGWVR4_EL1 = 0x8026, // 10 000 0000 0100 110 + DBGWVR5_EL1 = 0x802e, // 10 000 0000 0101 110 + DBGWVR6_EL1 = 0x8036, // 10 000 0000 0110 110 + DBGWVR7_EL1 = 0x803e, // 10 000 0000 0111 110 + DBGWVR8_EL1 = 0x8046, // 10 000 0000 1000 110 + DBGWVR9_EL1 = 0x804e, // 10 000 0000 1001 110 + DBGWVR10_EL1 = 0x8056, // 10 000 0000 1010 110 + DBGWVR11_EL1 = 0x805e, // 10 000 0000 1011 110 + DBGWVR12_EL1 = 0x8066, // 10 000 0000 1100 110 + DBGWVR13_EL1 = 0x806e, // 10 000 0000 1101 110 + DBGWVR14_EL1 = 0x8076, // 10 000 0000 1110 110 + DBGWVR15_EL1 = 0x807e, // 10 000 0000 1111 110 + DBGWCR0_EL1 = 0x8007, // 10 000 0000 0000 111 + DBGWCR1_EL1 = 0x800f, // 10 000 0000 0001 111 + DBGWCR2_EL1 = 0x8017, // 10 000 0000 0010 111 + DBGWCR3_EL1 = 0x801f, // 10 000 0000 0011 111 + DBGWCR4_EL1 = 0x8027, // 10 000 0000 0100 111 + DBGWCR5_EL1 = 0x802f, // 10 000 0000 0101 111 + DBGWCR6_EL1 = 0x8037, // 10 000 0000 0110 111 + DBGWCR7_EL1 = 0x803f, // 10 000 0000 0111 111 + DBGWCR8_EL1 = 0x8047, // 10 000 0000 1000 111 + DBGWCR9_EL1 = 0x804f, // 10 000 0000 1001 111 + DBGWCR10_EL1 = 0x8057, // 10 000 0000 1010 111 + DBGWCR11_EL1 = 0x805f, // 10 000 0000 1011 111 + DBGWCR12_EL1 = 0x8067, // 10 000 0000 1100 111 + DBGWCR13_EL1 = 0x806f, // 10 000 0000 1101 111 + DBGWCR14_EL1 = 0x8077, // 10 000 0000 1110 111 + DBGWCR15_EL1 = 0x807f, // 10 000 0000 1111 111 + TEEHBR32_EL1 = 0x9080, // 10 010 0001 0000 000 + OSDLR_EL1 = 0x809c, // 10 000 0001 0011 100 + DBGPRCR_EL1 = 0x80a4, // 10 000 0001 0100 100 + DBGCLAIMSET_EL1 = 0x83c6, // 10 000 0111 1000 110 + DBGCLAIMCLR_EL1 = 0x83ce, // 10 000 0111 1001 110 + CSSELR_EL1 = 0xd000, // 11 010 0000 0000 000 + VPIDR_EL2 = 0xe000, // 11 100 0000 0000 000 + VMPIDR_EL2 = 0xe005, // 11 100 0000 0000 101 + CPACR_EL1 = 0xc082, // 11 000 0001 0000 010 + SCTLR_EL1 = 0xc080, // 11 000 0001 0000 000 + SCTLR_EL2 = 0xe080, // 11 100 0001 0000 000 + SCTLR_EL3 = 0xf080, // 11 110 0001 0000 000 + ACTLR_EL1 = 0xc081, // 11 000 0001 0000 001 + ACTLR_EL2 = 0xe081, // 11 100 0001 0000 001 + ACTLR_EL3 = 0xf081, // 11 110 0001 0000 001 + HCR_EL2 = 0xe088, // 11 100 0001 0001 000 + SCR_EL3 = 0xf088, // 11 110 0001 0001 000 + MDCR_EL2 = 0xe089, // 11 100 0001 0001 001 + SDER32_EL3 = 0xf089, // 11 110 0001 0001 001 + CPTR_EL2 = 0xe08a, // 11 100 0001 0001 010 + CPTR_EL3 = 0xf08a, // 11 110 0001 0001 010 + HSTR_EL2 = 0xe08b, // 11 100 0001 0001 011 + HACR_EL2 = 0xe08f, // 11 100 0001 0001 111 + MDCR_EL3 = 0xf099, // 11 110 0001 0011 001 + TTBR0_EL1 = 0xc100, // 11 000 0010 0000 000 + TTBR0_EL2 = 0xe100, // 11 100 0010 0000 000 + TTBR0_EL3 = 0xf100, // 11 110 0010 0000 000 + TTBR1_EL1 = 0xc101, // 11 000 0010 0000 001 + TCR_EL1 = 0xc102, // 11 000 0010 0000 010 + TCR_EL2 = 0xe102, // 11 100 0010 0000 010 + TCR_EL3 = 0xf102, // 11 110 0010 0000 010 + VTTBR_EL2 = 0xe108, 
// 11 100 0010 0001 000 + VTCR_EL2 = 0xe10a, // 11 100 0010 0001 010 + DACR32_EL2 = 0xe180, // 11 100 0011 0000 000 + SPSR_EL1 = 0xc200, // 11 000 0100 0000 000 + SPSR_EL2 = 0xe200, // 11 100 0100 0000 000 + SPSR_EL3 = 0xf200, // 11 110 0100 0000 000 + ELR_EL1 = 0xc201, // 11 000 0100 0000 001 + ELR_EL2 = 0xe201, // 11 100 0100 0000 001 + ELR_EL3 = 0xf201, // 11 110 0100 0000 001 + SP_EL0 = 0xc208, // 11 000 0100 0001 000 + SP_EL1 = 0xe208, // 11 100 0100 0001 000 + SP_EL2 = 0xf208, // 11 110 0100 0001 000 + SPSel = 0xc210, // 11 000 0100 0010 000 + NZCV = 0xda10, // 11 011 0100 0010 000 + DAIF = 0xda11, // 11 011 0100 0010 001 + CurrentEL = 0xc212, // 11 000 0100 0010 010 + SPSR_irq = 0xe218, // 11 100 0100 0011 000 + SPSR_abt = 0xe219, // 11 100 0100 0011 001 + SPSR_und = 0xe21a, // 11 100 0100 0011 010 + SPSR_fiq = 0xe21b, // 11 100 0100 0011 011 + FPCR = 0xda20, // 11 011 0100 0100 000 + FPSR = 0xda21, // 11 011 0100 0100 001 + DSPSR_EL0 = 0xda28, // 11 011 0100 0101 000 + DLR_EL0 = 0xda29, // 11 011 0100 0101 001 + IFSR32_EL2 = 0xe281, // 11 100 0101 0000 001 + AFSR0_EL1 = 0xc288, // 11 000 0101 0001 000 + AFSR0_EL2 = 0xe288, // 11 100 0101 0001 000 + AFSR0_EL3 = 0xf288, // 11 110 0101 0001 000 + AFSR1_EL1 = 0xc289, // 11 000 0101 0001 001 + AFSR1_EL2 = 0xe289, // 11 100 0101 0001 001 + AFSR1_EL3 = 0xf289, // 11 110 0101 0001 001 + ESR_EL1 = 0xc290, // 11 000 0101 0010 000 + ESR_EL2 = 0xe290, // 11 100 0101 0010 000 + ESR_EL3 = 0xf290, // 11 110 0101 0010 000 + FPEXC32_EL2 = 0xe298, // 11 100 0101 0011 000 + FAR_EL1 = 0xc300, // 11 000 0110 0000 000 + FAR_EL2 = 0xe300, // 11 100 0110 0000 000 + FAR_EL3 = 0xf300, // 11 110 0110 0000 000 + HPFAR_EL2 = 0xe304, // 11 100 0110 0000 100 + PAR_EL1 = 0xc3a0, // 11 000 0111 0100 000 + PMCR_EL0 = 0xdce0, // 11 011 1001 1100 000 + PMCNTENSET_EL0 = 0xdce1, // 11 011 1001 1100 001 + PMCNTENCLR_EL0 = 0xdce2, // 11 011 1001 1100 010 + PMOVSCLR_EL0 = 0xdce3, // 11 011 1001 1100 011 + PMSELR_EL0 = 0xdce5, // 11 011 1001 1100 101 + PMCCNTR_EL0 = 0xdce8, // 11 011 1001 1101 000 + PMXEVTYPER_EL0 = 0xdce9, // 11 011 1001 1101 001 + PMXEVCNTR_EL0 = 0xdcea, // 11 011 1001 1101 010 + PMUSERENR_EL0 = 0xdcf0, // 11 011 1001 1110 000 + PMINTENSET_EL1 = 0xc4f1, // 11 000 1001 1110 001 + PMINTENCLR_EL1 = 0xc4f2, // 11 000 1001 1110 010 + PMOVSSET_EL0 = 0xdcf3, // 11 011 1001 1110 011 + MAIR_EL1 = 0xc510, // 11 000 1010 0010 000 + MAIR_EL2 = 0xe510, // 11 100 1010 0010 000 + MAIR_EL3 = 0xf510, // 11 110 1010 0010 000 + AMAIR_EL1 = 0xc518, // 11 000 1010 0011 000 + AMAIR_EL2 = 0xe518, // 11 100 1010 0011 000 + AMAIR_EL3 = 0xf518, // 11 110 1010 0011 000 + VBAR_EL1 = 0xc600, // 11 000 1100 0000 000 + VBAR_EL2 = 0xe600, // 11 100 1100 0000 000 + VBAR_EL3 = 0xf600, // 11 110 1100 0000 000 + RMR_EL1 = 0xc602, // 11 000 1100 0000 010 + RMR_EL2 = 0xe602, // 11 100 1100 0000 010 + RMR_EL3 = 0xf602, // 11 110 1100 0000 010 + CONTEXTIDR_EL1 = 0xc681, // 11 000 1101 0000 001 + TPIDR_EL0 = 0xde82, // 11 011 1101 0000 010 + TPIDR_EL2 = 0xe682, // 11 100 1101 0000 010 + TPIDR_EL3 = 0xf682, // 11 110 1101 0000 010 + TPIDRRO_EL0 = 0xde83, // 11 011 1101 0000 011 + TPIDR_EL1 = 0xc684, // 11 000 1101 0000 100 + CNTFRQ_EL0 = 0xdf00, // 11 011 1110 0000 000 + CNTVOFF_EL2 = 0xe703, // 11 100 1110 0000 011 + CNTKCTL_EL1 = 0xc708, // 11 000 1110 0001 000 + CNTHCTL_EL2 = 0xe708, // 11 100 1110 0001 000 + CNTP_TVAL_EL0 = 0xdf10, // 11 011 1110 0010 000 + CNTHP_TVAL_EL2 = 0xe710, // 11 100 1110 0010 000 + CNTPS_TVAL_EL1 = 0xff10, // 11 111 1110 0010 000 + CNTP_CTL_EL0 = 0xdf11, // 11 011 
1110 0010 001 + CNTHP_CTL_EL2 = 0xe711, // 11 100 1110 0010 001 + CNTPS_CTL_EL1 = 0xff11, // 11 111 1110 0010 001 + CNTP_CVAL_EL0 = 0xdf12, // 11 011 1110 0010 010 + CNTHP_CVAL_EL2 = 0xe712, // 11 100 1110 0010 010 + CNTPS_CVAL_EL1 = 0xff12, // 11 111 1110 0010 010 + CNTV_TVAL_EL0 = 0xdf18, // 11 011 1110 0011 000 + CNTV_CTL_EL0 = 0xdf19, // 11 011 1110 0011 001 + CNTV_CVAL_EL0 = 0xdf1a, // 11 011 1110 0011 010 + PMEVCNTR0_EL0 = 0xdf40, // 11 011 1110 1000 000 + PMEVCNTR1_EL0 = 0xdf41, // 11 011 1110 1000 001 + PMEVCNTR2_EL0 = 0xdf42, // 11 011 1110 1000 010 + PMEVCNTR3_EL0 = 0xdf43, // 11 011 1110 1000 011 + PMEVCNTR4_EL0 = 0xdf44, // 11 011 1110 1000 100 + PMEVCNTR5_EL0 = 0xdf45, // 11 011 1110 1000 101 + PMEVCNTR6_EL0 = 0xdf46, // 11 011 1110 1000 110 + PMEVCNTR7_EL0 = 0xdf47, // 11 011 1110 1000 111 + PMEVCNTR8_EL0 = 0xdf48, // 11 011 1110 1001 000 + PMEVCNTR9_EL0 = 0xdf49, // 11 011 1110 1001 001 + PMEVCNTR10_EL0 = 0xdf4a, // 11 011 1110 1001 010 + PMEVCNTR11_EL0 = 0xdf4b, // 11 011 1110 1001 011 + PMEVCNTR12_EL0 = 0xdf4c, // 11 011 1110 1001 100 + PMEVCNTR13_EL0 = 0xdf4d, // 11 011 1110 1001 101 + PMEVCNTR14_EL0 = 0xdf4e, // 11 011 1110 1001 110 + PMEVCNTR15_EL0 = 0xdf4f, // 11 011 1110 1001 111 + PMEVCNTR16_EL0 = 0xdf50, // 11 011 1110 1010 000 + PMEVCNTR17_EL0 = 0xdf51, // 11 011 1110 1010 001 + PMEVCNTR18_EL0 = 0xdf52, // 11 011 1110 1010 010 + PMEVCNTR19_EL0 = 0xdf53, // 11 011 1110 1010 011 + PMEVCNTR20_EL0 = 0xdf54, // 11 011 1110 1010 100 + PMEVCNTR21_EL0 = 0xdf55, // 11 011 1110 1010 101 + PMEVCNTR22_EL0 = 0xdf56, // 11 011 1110 1010 110 + PMEVCNTR23_EL0 = 0xdf57, // 11 011 1110 1010 111 + PMEVCNTR24_EL0 = 0xdf58, // 11 011 1110 1011 000 + PMEVCNTR25_EL0 = 0xdf59, // 11 011 1110 1011 001 + PMEVCNTR26_EL0 = 0xdf5a, // 11 011 1110 1011 010 + PMEVCNTR27_EL0 = 0xdf5b, // 11 011 1110 1011 011 + PMEVCNTR28_EL0 = 0xdf5c, // 11 011 1110 1011 100 + PMEVCNTR29_EL0 = 0xdf5d, // 11 011 1110 1011 101 + PMEVCNTR30_EL0 = 0xdf5e, // 11 011 1110 1011 110 + PMCCFILTR_EL0 = 0xdf7f, // 11 011 1110 1111 111 + PMEVTYPER0_EL0 = 0xdf60, // 11 011 1110 1100 000 + PMEVTYPER1_EL0 = 0xdf61, // 11 011 1110 1100 001 + PMEVTYPER2_EL0 = 0xdf62, // 11 011 1110 1100 010 + PMEVTYPER3_EL0 = 0xdf63, // 11 011 1110 1100 011 + PMEVTYPER4_EL0 = 0xdf64, // 11 011 1110 1100 100 + PMEVTYPER5_EL0 = 0xdf65, // 11 011 1110 1100 101 + PMEVTYPER6_EL0 = 0xdf66, // 11 011 1110 1100 110 + PMEVTYPER7_EL0 = 0xdf67, // 11 011 1110 1100 111 + PMEVTYPER8_EL0 = 0xdf68, // 11 011 1110 1101 000 + PMEVTYPER9_EL0 = 0xdf69, // 11 011 1110 1101 001 + PMEVTYPER10_EL0 = 0xdf6a, // 11 011 1110 1101 010 + PMEVTYPER11_EL0 = 0xdf6b, // 11 011 1110 1101 011 + PMEVTYPER12_EL0 = 0xdf6c, // 11 011 1110 1101 100 + PMEVTYPER13_EL0 = 0xdf6d, // 11 011 1110 1101 101 + PMEVTYPER14_EL0 = 0xdf6e, // 11 011 1110 1101 110 + PMEVTYPER15_EL0 = 0xdf6f, // 11 011 1110 1101 111 + PMEVTYPER16_EL0 = 0xdf70, // 11 011 1110 1110 000 + PMEVTYPER17_EL0 = 0xdf71, // 11 011 1110 1110 001 + PMEVTYPER18_EL0 = 0xdf72, // 11 011 1110 1110 010 + PMEVTYPER19_EL0 = 0xdf73, // 11 011 1110 1110 011 + PMEVTYPER20_EL0 = 0xdf74, // 11 011 1110 1110 100 + PMEVTYPER21_EL0 = 0xdf75, // 11 011 1110 1110 101 + PMEVTYPER22_EL0 = 0xdf76, // 11 011 1110 1110 110 + PMEVTYPER23_EL0 = 0xdf77, // 11 011 1110 1110 111 + PMEVTYPER24_EL0 = 0xdf78, // 11 011 1110 1111 000 + PMEVTYPER25_EL0 = 0xdf79, // 11 011 1110 1111 001 + PMEVTYPER26_EL0 = 0xdf7a, // 11 011 1110 1111 010 + PMEVTYPER27_EL0 = 0xdf7b, // 11 011 1110 1111 011 + PMEVTYPER28_EL0 = 0xdf7c, // 11 011 1110 1111 100 + PMEVTYPER29_EL0 = 
0xdf7d, // 11 011 1110 1111 101 + PMEVTYPER30_EL0 = 0xdf7e // 11 011 1110 1111 110 + }; + + // Note that these do not inherit from NamedImmMapper. This class is + // sufficiently different in its behaviour that I don't believe it's worth + // burdening the common NamedImmMapper with abstractions only needed in + // this one case. + struct SysRegMapper { + static const NamedImmMapper::Mapping SysRegPairs[]; + + const NamedImmMapper::Mapping *InstPairs; + size_t NumInstPairs; + + SysRegMapper() {} + uint32_t fromString(StringRef Name, bool &Valid) const; + std::string toString(uint32_t Bits, bool &Valid) const; + }; + + struct MSRMapper : SysRegMapper { + static const NamedImmMapper::Mapping MSRPairs[]; + MSRMapper(); + }; + + struct MRSMapper : SysRegMapper { + static const NamedImmMapper::Mapping MRSPairs[]; + MRSMapper(); + }; + + uint32_t ParseGenericRegister(StringRef Name, bool &Valid); +} + +namespace A64TLBI { + enum TLBIValues { + Invalid = -1, // Op0 Op1 CRn CRm Op2 + IPAS2E1IS = 0x6401, // 01 100 1000 0000 001 + IPAS2LE1IS = 0x6405, // 01 100 1000 0000 101 + VMALLE1IS = 0x4418, // 01 000 1000 0011 000 + ALLE2IS = 0x6418, // 01 100 1000 0011 000 + ALLE3IS = 0x7418, // 01 110 1000 0011 000 + VAE1IS = 0x4419, // 01 000 1000 0011 001 + VAE2IS = 0x6419, // 01 100 1000 0011 001 + VAE3IS = 0x7419, // 01 110 1000 0011 001 + ASIDE1IS = 0x441a, // 01 000 1000 0011 010 + VAAE1IS = 0x441b, // 01 000 1000 0011 011 + ALLE1IS = 0x641c, // 01 100 1000 0011 100 + VALE1IS = 0x441d, // 01 000 1000 0011 101 + VALE2IS = 0x641d, // 01 100 1000 0011 101 + VALE3IS = 0x741d, // 01 110 1000 0011 101 + VMALLS12E1IS = 0x641e, // 01 100 1000 0011 110 + VAALE1IS = 0x441f, // 01 000 1000 0011 111 + IPAS2E1 = 0x6421, // 01 100 1000 0100 001 + IPAS2LE1 = 0x6425, // 01 100 1000 0100 101 + VMALLE1 = 0x4438, // 01 000 1000 0111 000 + ALLE2 = 0x6438, // 01 100 1000 0111 000 + ALLE3 = 0x7438, // 01 110 1000 0111 000 + VAE1 = 0x4439, // 01 000 1000 0111 001 + VAE2 = 0x6439, // 01 100 1000 0111 001 + VAE3 = 0x7439, // 01 110 1000 0111 001 + ASIDE1 = 0x443a, // 01 000 1000 0111 010 + VAAE1 = 0x443b, // 01 000 1000 0111 011 + ALLE1 = 0x643c, // 01 100 1000 0111 100 + VALE1 = 0x443d, // 01 000 1000 0111 101 + VALE2 = 0x643d, // 01 100 1000 0111 101 + VALE3 = 0x743d, // 01 110 1000 0111 101 + VMALLS12E1 = 0x643e, // 01 100 1000 0111 110 + VAALE1 = 0x443f // 01 000 1000 0111 111 + }; + + struct TLBIMapper : NamedImmMapper { + const static Mapping TLBIPairs[]; + + TLBIMapper(); + }; + + static inline bool NeedsRegister(TLBIValues Val) { + switch (Val) { + case VMALLE1IS: + case ALLE2IS: + case ALLE3IS: + case ALLE1IS: + case VMALLS12E1IS: + case VMALLE1: + case ALLE2: + case ALLE3: + case ALLE1: + case VMALLS12E1: + return false; + default: + return true; + } + } +} + +namespace AArch64II { + + enum TOF { + //===--------------------------------------------------------------===// + // AArch64 Specific MachineOperand flags. + + MO_NO_FLAG, + + // MO_GOT - Represents a relocation referring to the GOT entry of a given + // symbol. Used in adrp. + MO_GOT, + + // MO_GOT_LO12 - Represents a relocation referring to the low 12 bits of the + // GOT entry of a given symbol. Used in ldr only. + MO_GOT_LO12, + + // MO_DTPREL_* - Represents a relocation referring to the offset from a + // module's dynamic thread pointer. Used in the local-dynamic TLS access + // model. + MO_DTPREL_G1, + MO_DTPREL_G0_NC, + + // MO_GOTTPREL_* - Represents a relocation referring to a GOT entry + // providing the offset of a variable from the thread-pointer. 
Used in
+    // initial-exec TLS model where this offset is assigned in the static
+    // thread block and thus known by the dynamic linker.
+    MO_GOTTPREL,
+    MO_GOTTPREL_LO12,
+
+    // MO_TLSDESC_* - Represents a relocation referring to a GOT entry
+    // providing a TLS descriptor chosen by the dynamic linker. Used for the
+    // general-dynamic and local-dynamic TLS access models where very little
+    // is known at link-time.
+    MO_TLSDESC,
+    MO_TLSDESC_LO12,
+
+    // MO_TPREL_* - Represents a relocation referring to the offset of a
+    // variable from the thread pointer itself. Used in the local-exec TLS
+    // access model.
+    MO_TPREL_G1,
+    MO_TPREL_G0_NC,
+
+    // MO_LO12 - On a symbol operand, this represents a relocation containing
+    // lower 12 bits of the address. Used in add/sub/ldr/str.
+    MO_LO12
+  };
+}
+
+class APFloat;
+
+namespace A64Imms {
+  bool isFPImm(const APFloat &Val, uint32_t &Imm8Bits);
+
+  inline bool isFPImm(const APFloat &Val) {
+    uint32_t Imm8;
+    return isFPImm(Val, Imm8);
+  }
+
+  bool isLogicalImm(unsigned RegWidth, uint64_t Imm, uint32_t &Bits);
+  bool isLogicalImmBits(unsigned RegWidth, uint32_t Bits, uint64_t &Imm);
+
+  bool isMOVZImm(int RegWidth, uint64_t Value, int &UImm16, int &Shift);
+  bool isMOVNImm(int RegWidth, uint64_t Value, int &UImm16, int &Shift);
+
+  // We sometimes want to know whether the immediate is representable with a
+  // MOVN but *not* with a MOVZ (because that would take priority).
+  bool isOnlyMOVNImm(int RegWidth, uint64_t Value, int &UImm16, int &Shift);
+
+}
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/AArch64/Utils/CMakeLists.txt b/lib/Target/AArch64/Utils/CMakeLists.txt
new file mode 100644
index 0000000..2c28348
--- /dev/null
+++ b/lib/Target/AArch64/Utils/CMakeLists.txt
@@ -0,0 +1,5 @@
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
+add_llvm_library(LLVMAArch64Utils
+  AArch64BaseInfo.cpp
+  )
diff --git a/lib/Target/AArch64/Utils/LLVMBuild.txt b/lib/Target/AArch64/Utils/LLVMBuild.txt
new file mode 100644
index 0000000..1be5375
--- /dev/null
+++ b/lib/Target/AArch64/Utils/LLVMBuild.txt
@@ -0,0 +1,23 @@
+;===- ./lib/Target/AArch64/Utils/LLVMBuild.txt -----------------*- Conf -*--===;
+;
+; The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+;   http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Library
+name = AArch64Utils
+parent = AArch64
+required_libraries = Core Support
+add_to_library_groups = AArch64
diff --git a/lib/Target/AArch64/Utils/Makefile b/lib/Target/AArch64/Utils/Makefile
new file mode 100644
index 0000000..0f4a645
--- /dev/null
+++ b/lib/Target/AArch64/Utils/Makefile
@@ -0,0 +1,15 @@
+##===- lib/Target/AArch64/Utils/Makefile -------------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+LEVEL = ../../../..
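+# LEVEL points back to the LLVM source root so that the
+# "include $(LEVEL)/Makefile.common" at the bottom of this file resolves.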
+LIBRARYNAME = LLVMAArch64Utils + +# Hack: we need to include 'main' AArch64 target directory to grab private headers +#CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. + +include $(LEVEL)/Makefile.common diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td index a76715a..46915ee 100644 --- a/lib/Target/ARM/ARM.td +++ b/lib/Target/ARM/ARM.td @@ -110,6 +110,11 @@ def FeatureMP : SubtargetFeature<"mp", "HasMPExtension", "true", def FeatureMClass : SubtargetFeature<"mclass", "IsMClass", "true", "Is microcontroller profile ('M' series)">; +// Special TRAP encoding for NaCl, which looks like a TRAP in Thumb too. +// See ARMInstrInfo.td for details. +def FeatureNaClTrap : SubtargetFeature<"nacl-trap", "UseNaClTrap", "true", + "NaCl trap">; + // ARM ISAs. def HasV4TOps : SubtargetFeature<"v4t", "HasV4TOps", "true", "Support ARM v4T instructions">; diff --git a/lib/Target/ARM/ARMAsmPrinter.cpp b/lib/Target/ARM/ARMAsmPrinter.cpp index fc6ac90..58c7798 100644 --- a/lib/Target/ARM/ARMAsmPrinter.cpp +++ b/lib/Target/ARM/ARMAsmPrinter.cpp @@ -37,6 +37,7 @@ #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCContext.h" +#include "llvm/MC/MCELFStreamer.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstBuilder.h" #include "llvm/MC/MCObjectStreamer.h" @@ -45,6 +46,7 @@ #include "llvm/MC/MCSymbol.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/ELF.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/raw_ostream.h" @@ -182,7 +184,7 @@ namespace { const size_t TagHeaderSize = 1 + 4; Streamer.EmitIntValue(VendorHeaderSize + TagHeaderSize + ContentsSize, 4); - Streamer.EmitBytes(CurrentVendor, 0); + Streamer.EmitBytes(CurrentVendor); Streamer.EmitIntValue(0, 1); // '\0' Streamer.EmitIntValue(ARMBuildAttrs::File, 1); @@ -192,14 +194,14 @@ namespace { // emit each field as its type (ULEB or String) for (unsigned int i=0; i<Contents.size(); ++i) { AttributeItemType item = Contents[i]; - Streamer.EmitULEB128IntValue(item.Tag, 0); + Streamer.EmitULEB128IntValue(item.Tag); switch (item.Type) { default: llvm_unreachable("Invalid attribute type"); case AttributeItemType::NumericAttribute: - Streamer.EmitULEB128IntValue(item.IntValue, 0); + Streamer.EmitULEB128IntValue(item.IntValue); break; case AttributeItemType::TextAttribute: - Streamer.EmitBytes(item.StringValue.upper(), 0); + Streamer.EmitBytes(item.StringValue.upper()); Streamer.EmitIntValue(0, 1); // '\0' break; } @@ -340,6 +342,11 @@ void ARMAsmPrinter::printOperand(const MachineInstr *MI, int OpNum, unsigned Reg = MO.getReg(); assert(TargetRegisterInfo::isPhysicalRegister(Reg)); assert(!MO.getSubReg() && "Subregs should be eliminated!"); + if(ARM::GPRPairRegClass.contains(Reg)) { + const MachineFunction &MF = *MI->getParent()->getParent(); + const TargetRegisterInfo *TRI = MF.getTarget().getRegisterInfo(); + Reg = TRI->getSubReg(Reg, ARM::gsub_0); + } O << ARMInstPrinter::getRegisterName(Reg); break; } @@ -528,14 +535,12 @@ bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, const MachineOperand &MO = MI->getOperand(OpNum); if (!MO.isReg()) return true; - const TargetRegisterClass &RC = ARM::GPRRegClass; const MachineFunction &MF = *MI->getParent()->getParent(); const TargetRegisterInfo *TRI = MF.getTarget().getRegisterInfo(); - - unsigned RegIdx = TRI->getEncodingValue(MO.getReg()); - RegIdx |= 1; //The odd register is also the higher-numbered one of a pair. 
- - unsigned Reg = RC.getRegister(RegIdx); + unsigned Reg = MO.getReg(); + if(!ARM::GPRPairRegClass.contains(Reg)) + return false; + Reg = TRI->getSubReg(Reg, ARM::gsub_1); O << ARMInstPrinter::getRegisterName(Reg); return false; } @@ -657,7 +662,7 @@ void ARMAsmPrinter::EmitEndOfAsmFile(Module &M) { if (MCSym.getInt()) // External to current translation unit. - OutStreamer.EmitIntValue(0, 4/*size*/, 0/*addrspace*/); + OutStreamer.EmitIntValue(0, 4/*size*/); else // Internal to current translation unit. // @@ -667,7 +672,7 @@ void ARMAsmPrinter::EmitEndOfAsmFile(Module &M) { // We need to fill in the value for the NLP in those cases. OutStreamer.EmitValue(MCSymbolRefExpr::Create(MCSym.getPointer(), OutContext), - 4/*size*/, 0/*addrspace*/); + 4/*size*/); } Stubs.clear(); @@ -685,7 +690,7 @@ void ARMAsmPrinter::EmitEndOfAsmFile(Module &M) { OutStreamer.EmitValue(MCSymbolRefExpr:: Create(Stubs[i].second.getPointer(), OutContext), - 4/*size*/, 0/*addrspace*/); + 4/*size*/); } Stubs.clear(); @@ -699,6 +704,11 @@ void ARMAsmPrinter::EmitEndOfAsmFile(Module &M) { // generates code that does this, it is always safe to set. OutStreamer.EmitAssemblerFlag(MCAF_SubsectionsViaSymbols); } + // FIXME: This should eventually end up somewhere else where more + // intelligent flag decisions can be made. For now we are just maintaining + // the status quo for ARM and setting EF_ARM_EABI_VER5 as the default. + if (MCELFStreamer *MES = dyn_cast<MCELFStreamer>(&OutStreamer)) + MES->getAssembler().setELFHeaderEFlags(ELF::EF_ARM_EABI_VER5); } //===----------------------------------------------------------------------===// @@ -1682,6 +1692,13 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { } break; } + case ARM::TRAPNaCl: { + //.long 0xe7fedef0 @ trap + uint32_t Val = 0xe7fedef0UL; + OutStreamer.AddComment("trap"); + OutStreamer.EmitIntValue(Val, 4); + return; + } case ARM::tTRAP: { // Non-Darwin binutils don't yet support the "trap" mnemonic. // FIXME: Remove this special case when they do. diff --git a/lib/Target/ARM/ARMAsmPrinter.h b/lib/Target/ARM/ARMAsmPrinter.h index f7392fb..c945e4f 100644 --- a/lib/Target/ARM/ARMAsmPrinter.h +++ b/lib/Target/ARM/ARMAsmPrinter.h @@ -1,4 +1,4 @@ -//===-- ARMAsmPrinter.h - Print machine code to an ARM .s file --*- C++ -*-===// +//===-- ARMAsmPrinter.h - ARM implementation of AsmPrinter ------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -6,10 +6,6 @@ // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// -// -// ARM Assembly printer class. -// -//===----------------------------------------------------------------------===// #ifndef ARMASMPRINTER_H #define ARMASMPRINTER_H @@ -54,7 +50,7 @@ public: } virtual const char *getPassName() const LLVM_OVERRIDE { - return "ARM Assembly Printer"; + return "ARM Assembly / Object Emitter"; } void printOperand(const MachineInstr *MI, int OpNum, raw_ostream &O, diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp index 0076910..ed001ea 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -2719,7 +2719,6 @@ ARMBaseInstrInfo::getNumMicroOps(const InstrItineraryData *ItinData, case ARM::t2STMDB_UPD: { unsigned NumRegs = MI->getNumOperands() - Desc.getNumOperands() + 1; if (Subtarget.isSwift()) { - // rdar://8402126 int UOps = 1 + NumRegs; // One for address computation, one for each ld / st. 
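      // (For instance, a 4-register LDM starts from an estimate of 5
      // micro-ops here, before the opcode-specific adjustments in the
      // switch below.)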
switch (Opc) { default: break; @@ -4047,7 +4046,6 @@ getPartialRegUpdateClearance(const MachineInstr *MI, case ARM::VLDRS: case ARM::FCONSTS: case ARM::VMOVSR: - // rdar://problem/8791586 case ARM::VMOVv8i8: case ARM::VMOVv4i16: case ARM::VMOVv2i32: diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/lib/Target/ARM/ARMBaseRegisterInfo.cpp index d2f6a33..abdd251 100644 --- a/lib/Target/ARM/ARMBaseRegisterInfo.cpp +++ b/lib/Target/ARM/ARMBaseRegisterInfo.cpp @@ -205,7 +205,8 @@ ARMBaseRegisterInfo::getRegAllocationHints(unsigned VirtReg, } // First prefer the paired physreg. - if (PairedPhys) + if (PairedPhys && + std::find(Order.begin(), Order.end(), PairedPhys) != Order.end()) Hints.push_back(PairedPhys); // Then prefer even or odd registers. @@ -400,64 +401,6 @@ requiresVirtualBaseRegisters(const MachineFunction &MF) const { return true; } -static void -emitSPUpdate(bool isARM, - MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, - DebugLoc dl, const ARMBaseInstrInfo &TII, - int NumBytes, - ARMCC::CondCodes Pred = ARMCC::AL, unsigned PredReg = 0) { - if (isARM) - emitARMRegPlusImmediate(MBB, MBBI, dl, ARM::SP, ARM::SP, NumBytes, - Pred, PredReg, TII); - else - emitT2RegPlusImmediate(MBB, MBBI, dl, ARM::SP, ARM::SP, NumBytes, - Pred, PredReg, TII); -} - - -void ARMBaseRegisterInfo:: -eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, - MachineBasicBlock::iterator I) const { - const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); - if (!TFI->hasReservedCallFrame(MF)) { - // If we have alloca, convert as follows: - // ADJCALLSTACKDOWN -> sub, sp, sp, amount - // ADJCALLSTACKUP -> add, sp, sp, amount - MachineInstr *Old = I; - DebugLoc dl = Old->getDebugLoc(); - unsigned Amount = Old->getOperand(0).getImm(); - if (Amount != 0) { - // We need to keep the stack aligned properly. To do this, we round the - // amount of space needed for the outgoing arguments up to the next - // alignment boundary. - unsigned Align = TFI->getStackAlignment(); - Amount = (Amount+Align-1)/Align*Align; - - ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); - assert(!AFI->isThumb1OnlyFunction() && - "This eliminateCallFramePseudoInstr does not support Thumb1!"); - bool isARM = !AFI->isThumbFunction(); - - // Replace the pseudo instruction with a new instruction... - unsigned Opc = Old->getOpcode(); - int PIdx = Old->findFirstPredOperandIdx(); - ARMCC::CondCodes Pred = (PIdx == -1) - ? ARMCC::AL : (ARMCC::CondCodes)Old->getOperand(PIdx).getImm(); - if (Opc == ARM::ADJCALLSTACKDOWN || Opc == ARM::tADJCALLSTACKDOWN) { - // Note: PredReg is operand 2 for ADJCALLSTACKDOWN. - unsigned PredReg = Old->getOperand(2).getReg(); - emitSPUpdate(isARM, MBB, I, dl, TII, -Amount, Pred, PredReg); - } else { - // Note: PredReg is operand 3 for ADJCALLSTACKUP. 
- unsigned PredReg = Old->getOperand(3).getReg(); - assert(Opc == ARM::ADJCALLSTACKUP || Opc == ARM::tADJCALLSTACKUP); - emitSPUpdate(isARM, MBB, I, dl, TII, Amount, Pred, PredReg); - } - } - } - MBB.erase(I); -} - int64_t ARMBaseRegisterInfo:: getFrameIndexInstrOffset(const MachineInstr *MI, int Idx) const { const MCInstrDesc &Desc = MI->getDesc(); @@ -717,8 +660,8 @@ bool ARMBaseRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI, void ARMBaseRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, - int SPAdj, RegScavenger *RS) const { - unsigned i = 0; + int SPAdj, unsigned FIOperandNum, + RegScavenger *RS) const { MachineInstr &MI = *II; MachineBasicBlock &MBB = *MI.getParent(); MachineFunction &MF = *MBB.getParent(); @@ -727,13 +670,7 @@ ARMBaseRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); assert(!AFI->isThumb1OnlyFunction() && "This eliminateFrameIndex does not support Thumb1!"); - - while (!MI.getOperand(i).isFI()) { - ++i; - assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!"); - } - - int FrameIndex = MI.getOperand(i).getIndex(); + int FrameIndex = MI.getOperand(FIOperandNum).getIndex(); unsigned FrameReg; int Offset = TFI->ResolveFrameIndexReference(MF, FrameIndex, FrameReg, SPAdj); @@ -755,18 +692,18 @@ ARMBaseRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // Special handling of dbg_value instructions. if (MI.isDebugValue()) { - MI.getOperand(i). ChangeToRegister(FrameReg, false /*isDef*/); - MI.getOperand(i+1).ChangeToImmediate(Offset); + MI.getOperand(FIOperandNum). ChangeToRegister(FrameReg, false /*isDef*/); + MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset); return; } // Modify MI as necessary to handle as much of 'Offset' as possible bool Done = false; if (!AFI->isThumbFunction()) - Done = rewriteARMFrameIndex(MI, i, FrameReg, Offset, TII); + Done = rewriteARMFrameIndex(MI, FIOperandNum, FrameReg, Offset, TII); else { assert(AFI->isThumb2Function()); - Done = rewriteT2FrameIndex(MI, i, FrameReg, Offset, TII); + Done = rewriteT2FrameIndex(MI, FIOperandNum, FrameReg, Offset, TII); } if (Done) return; @@ -786,7 +723,7 @@ ARMBaseRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, unsigned PredReg = (PIdx == -1) ? 0 : MI.getOperand(PIdx+1).getReg(); if (Offset == 0) // Must be addrmode4/6. - MI.getOperand(i).ChangeToRegister(FrameReg, false, false, false); + MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, false, false, false); else { ScratchReg = MF.getRegInfo().createVirtualRegister(&ARM::GPRRegClass); if (!AFI->isThumbFunction()) @@ -798,6 +735,6 @@ ARMBaseRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, Offset, Pred, PredReg, TII); } // Update the original instruction to use the scratch register. 
- MI.getOperand(i).ChangeToRegister(ScratchReg, false, false, true); + MI.getOperand(FIOperandNum).ChangeToRegister(ScratchReg, false, false,true); } } diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.h b/lib/Target/ARM/ARMBaseRegisterInfo.h index aaa56a9..725033b 100644 --- a/lib/Target/ARM/ARMBaseRegisterInfo.h +++ b/lib/Target/ARM/ARMBaseRegisterInfo.h @@ -168,12 +168,9 @@ public: virtual bool requiresVirtualBaseRegisters(const MachineFunction &MF) const; - virtual void eliminateCallFramePseudoInstr(MachineFunction &MF, - MachineBasicBlock &MBB, - MachineBasicBlock::iterator I) const; - virtual void eliminateFrameIndex(MachineBasicBlock::iterator II, - int SPAdj, RegScavenger *RS = NULL) const; + int SPAdj, unsigned FIOperandNum, + RegScavenger *RS = NULL) const; }; } // end namespace llvm diff --git a/lib/Target/ARM/ARMConstantIslandPass.cpp b/lib/Target/ARM/ARMConstantIslandPass.cpp index 70a25c2..4891609 100644 --- a/lib/Target/ARM/ARMConstantIslandPass.cpp +++ b/lib/Target/ARM/ARMConstantIslandPass.cpp @@ -1468,7 +1468,7 @@ void ARMConstantIslands::removeDeadCPEMI(MachineInstr *CPEMI) { if (CPEBB->empty()) { BBInfo[CPEBB->getNumber()].Size = 0; - // This block no longer needs to be aligned. <rdar://problem/10534709>. + // This block no longer needs to be aligned. CPEBB->setAlignment(0); } else // Entries are sorted by descending alignment, so realign from the front. diff --git a/lib/Target/ARM/ARMFastISel.cpp b/lib/Target/ARM/ARMFastISel.cpp index 94c574a..29fcd40 100644 --- a/lib/Target/ARM/ARMFastISel.cpp +++ b/lib/Target/ARM/ARMFastISel.cpp @@ -146,6 +146,7 @@ class ARMFastISel : public FastISel { virtual unsigned TargetMaterializeAlloca(const AllocaInst *AI); virtual bool TryToFoldLoad(MachineInstr *MI, unsigned OpNo, const LoadInst *LI); + virtual bool FastLowerArguments(); private: #include "ARMGenFastISel.inc" @@ -2099,6 +2100,9 @@ bool ARMFastISel::SelectRet(const Instruction *I) { if (!FuncInfo.CanLowerReturn) return false; + // Build a list of return value registers. + SmallVector<unsigned, 4> RetRegs; + CallingConv::ID CC = F.getCallingConv(); if (Ret->getNumOperands() > 0) { SmallVector<ISD::OutputArg, 4> Outs; @@ -2157,13 +2161,16 @@ bool ARMFastISel::SelectRet(const Instruction *I) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY), DstReg).addReg(SrcReg); - // Mark the register as live out of the function. - MRI.addLiveOut(VA.getLocReg()); + // Add register to return instruction. + RetRegs.push_back(VA.getLocReg()); } unsigned RetOpc = isThumb2 ? ARM::tBX_RET : ARM::BX_RET; - AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, - TII.get(RetOpc))); + MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(RetOpc)); + AddOptionalDefs(MIB); + for (unsigned i = 0, e = RetRegs.size(); i != e; ++i) + MIB.addReg(RetRegs[i], RegState::Implicit); return true; } @@ -2451,7 +2458,6 @@ bool ARMFastISel::ARMTryEmitSmallMemCpy(Address Dest, Address Src, if (Len >= 2 && Alignment == 2) VT = MVT::i16; else { - assert (Alignment == 1 && "Expected an alignment of 1!"); VT = MVT::i8; } } @@ -2562,7 +2568,8 @@ bool ARMFastISel::SelectIntrinsicCall(const IntrinsicInst &I) { return SelectCall(&I, "memset"); } case Intrinsic::trap: { - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(ARM::TRAP)); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get( + Subtarget->useNaClTrap() ? 
ARM::TRAPNaCl : ARM::TRAP)); return true; } } @@ -2877,6 +2884,80 @@ unsigned ARMFastISel::ARMLowerPICELF(const GlobalValue *GV, return DestReg2; } +bool ARMFastISel::FastLowerArguments() { + if (!FuncInfo.CanLowerReturn) + return false; + + const Function *F = FuncInfo.Fn; + if (F->isVarArg()) + return false; + + CallingConv::ID CC = F->getCallingConv(); + switch (CC) { + default: + return false; + case CallingConv::Fast: + case CallingConv::C: + case CallingConv::ARM_AAPCS_VFP: + case CallingConv::ARM_AAPCS: + case CallingConv::ARM_APCS: + break; + } + + // Only handle simple cases. i.e. Up to 4 i8/i16/i32 scalar arguments + // which are passed in r0 - r3. + unsigned Idx = 1; + for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end(); + I != E; ++I, ++Idx) { + if (Idx > 4) + return false; + + if (F->getAttributes().hasAttribute(Idx, Attribute::InReg) || + F->getAttributes().hasAttribute(Idx, Attribute::StructRet) || + F->getAttributes().hasAttribute(Idx, Attribute::ByVal)) + return false; + + Type *ArgTy = I->getType(); + if (ArgTy->isStructTy() || ArgTy->isArrayTy() || ArgTy->isVectorTy()) + return false; + + EVT ArgVT = TLI.getValueType(ArgTy); + if (!ArgVT.isSimple()) return false; + switch (ArgVT.getSimpleVT().SimpleTy) { + case MVT::i8: + case MVT::i16: + case MVT::i32: + break; + default: + return false; + } + } + + + static const uint16_t GPRArgRegs[] = { + ARM::R0, ARM::R1, ARM::R2, ARM::R3 + }; + + const TargetRegisterClass *RC = TLI.getRegClassFor(MVT::i32); + Idx = 0; + for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end(); + I != E; ++I, ++Idx) { + if (I->use_empty()) + continue; + unsigned SrcReg = GPRArgRegs[Idx]; + unsigned DstReg = FuncInfo.MF->addLiveIn(SrcReg, RC); + // FIXME: Unfortunately it's necessary to emit a copy from the livein copy. + // Without this, EmitLiveInCopies may eliminate the livein if its only + // use is a bitcast (which isn't turned into an instruction). 
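To make the conditions above concrete, a hedged source-level illustration (plain C, AAPCS; not part of the patch) of what FastLowerArguments will and will not handle; the livein copy emission continues below:

// Qualifies: at most four scalar i8/i16/i32 arguments and no
// sret/byval/inreg attributes, so the values arrive in r0-r3 and are
// lowered with plain COPY instructions.
int madd(int a, int b, int c, int d) { return a * b + c * d; }

// Falls back to the default lowering: i64 is not a handled scalar type.
long long acc(long long x, long long y) { return x + y; }

// Also falls back: a fifth integer argument would live on the stack.
int five(int a, int b, int c, int d, int e) { return a + b + c + d + e; }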
+ unsigned ResultReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY), + ResultReg).addReg(DstReg, getKillRegState(true)); + UpdateValueMap(I, ResultReg); + } + + return true; +} + namespace llvm { FastISel *ARM::createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) { diff --git a/lib/Target/ARM/ARMFrameLowering.cpp b/lib/Target/ARM/ARMFrameLowering.cpp index 39d27c4..0ca6450 100644 --- a/lib/Target/ARM/ARMFrameLowering.cpp +++ b/lib/Target/ARM/ARMFrameLowering.cpp @@ -119,13 +119,14 @@ static void emitSPUpdate(bool isARM, MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, DebugLoc dl, const ARMBaseInstrInfo &TII, - int NumBytes, unsigned MIFlags = MachineInstr::NoFlags) { + int NumBytes, unsigned MIFlags = MachineInstr::NoFlags, + ARMCC::CondCodes Pred = ARMCC::AL, unsigned PredReg = 0) { if (isARM) emitARMRegPlusImmediate(MBB, MBBI, dl, ARM::SP, ARM::SP, NumBytes, - ARMCC::AL, 0, TII, MIFlags); + Pred, PredReg, TII, MIFlags); else emitT2RegPlusImmediate(MBB, MBBI, dl, ARM::SP, ARM::SP, NumBytes, - ARMCC::AL, 0, TII, MIFlags); + Pred, PredReg, TII, MIFlags); } void ARMFrameLowering::emitPrologue(MachineFunction &MF) const { @@ -1430,3 +1431,51 @@ ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, AFI->setLRIsSpilledForFarJump(true); } } + + +void ARMFrameLowering:: +eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const { + const ARMBaseInstrInfo &TII = + *static_cast<const ARMBaseInstrInfo*>(MF.getTarget().getInstrInfo()); + if (!hasReservedCallFrame(MF)) { + // If we have alloca, convert as follows: + // ADJCALLSTACKDOWN -> sub, sp, sp, amount + // ADJCALLSTACKUP -> add, sp, sp, amount + MachineInstr *Old = I; + DebugLoc dl = Old->getDebugLoc(); + unsigned Amount = Old->getOperand(0).getImm(); + if (Amount != 0) { + // We need to keep the stack aligned properly. To do this, we round the + // amount of space needed for the outgoing arguments up to the next + // alignment boundary. + unsigned Align = getStackAlignment(); + Amount = (Amount+Align-1)/Align*Align; + + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + assert(!AFI->isThumb1OnlyFunction() && + "This eliminateCallFramePseudoInstr does not support Thumb1!"); + bool isARM = !AFI->isThumbFunction(); + + // Replace the pseudo instruction with a new instruction... + unsigned Opc = Old->getOpcode(); + int PIdx = Old->findFirstPredOperandIdx(); + ARMCC::CondCodes Pred = (PIdx == -1) + ? ARMCC::AL : (ARMCC::CondCodes)Old->getOperand(PIdx).getImm(); + if (Opc == ARM::ADJCALLSTACKDOWN || Opc == ARM::tADJCALLSTACKDOWN) { + // Note: PredReg is operand 2 for ADJCALLSTACKDOWN. + unsigned PredReg = Old->getOperand(2).getReg(); + emitSPUpdate(isARM, MBB, I, dl, TII, -Amount, MachineInstr::NoFlags, + Pred, PredReg); + } else { + // Note: PredReg is operand 3 for ADJCALLSTACKUP. 
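A worked instance of the alignment round-up a few lines above, with illustrative numbers (the ADJCALLSTACKUP handling continues below):

// With getStackAlignment() == 8 and a 20-byte outgoing-argument area:
//   Amount = (20 + 8 - 1) / 8 * 8
//          = 27 / 8 * 8      // integer division
//          = 3 * 8 = 24
// so SP stays 8-byte aligned across the ADJCALLSTACKDOWN/ADJCALLSTACKUP pair.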
+      unsigned PredReg = Old->getOperand(3).getReg();
+      assert(Opc == ARM::ADJCALLSTACKUP || Opc == ARM::tADJCALLSTACKUP);
+      emitSPUpdate(isARM, MBB, I, dl, TII, Amount, MachineInstr::NoFlags,
+                   Pred, PredReg);
+    }
+  }
+  MBB.erase(I);
+}
+
diff --git a/lib/Target/ARM/ARMFrameLowering.h b/lib/Target/ARM/ARMFrameLowering.h
index a1c2b93..efa255a 100644
--- a/lib/Target/ARM/ARMFrameLowering.h
+++ b/lib/Target/ARM/ARMFrameLowering.h
@@ -70,6 +70,11 @@ public:
                                 unsigned LdrOpc, bool isVarArg, bool NoGap,
                                 bool(*Func)(unsigned, bool),
                                 unsigned NumAlignedDPRCS2Regs) const;
+
+  virtual void eliminateCallFramePseudoInstr(
+                                        MachineFunction &MF,
+                                        MachineBasicBlock &MBB,
+                                        MachineBasicBlock::iterator MI) const;
 };
 } // End llvm namespace
diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp
index 939bed7..a83f052 100644
--- a/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -19,6 +19,7 @@
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/CodeGen/SelectionDAGISel.h"
 #include "llvm/IR/CallingConv.h"
@@ -257,6 +258,8 @@ private:
   // Select special operations if node forms integer ABS pattern
   SDNode *SelectABSOp(SDNode *N);

+  SDNode *SelectInlineAsm(SDNode *N);
+
   SDNode *SelectConcatVector(SDNode *N);

   SDNode *SelectAtomic64(SDNode *Node, unsigned Opc);
@@ -2552,6 +2555,12 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
   switch (N->getOpcode()) {
   default: break;
+  case ISD::INLINEASM: {
+    SDNode *ResNode = SelectInlineAsm(N);
+    if (ResNode)
+      return ResNode;
+    break;
+  }
   case ISD::XOR: {
     // Select special operations if XOR node forms integer ABS pattern
     SDNode *ResNode = SelectABSOp(N);
@@ -3446,6 +3455,138 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
   return SelectCode(N);
 }

+SDNode *ARMDAGToDAGISel::SelectInlineAsm(SDNode *N){
+  std::vector<SDValue> AsmNodeOperands;
+  unsigned Flag, Kind;
+  bool Changed = false;
+  unsigned NumOps = N->getNumOperands();
+
+  ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(
+      N->getOperand(InlineAsm::Op_AsmString));
+  StringRef AsmString = StringRef(S->getSymbol());
+
+  // Normally, i64 data is bound to two arbitrary GPRs for the "%r"
+  // constraint. However, some instructions (e.g. ldrexd/strexd in ARM mode)
+  // require (even/even+1) GPRs and use %n and %Hn to refer to the individual
+  // regs respectively. Since there is no constraint to explicitly specify a
+  // reg pair, we search for the %H operand inside the asm string. If it is
+  // found, the transformation below enforces a GPRPair reg class for "%r"
+  // for 64-bit data.
+  if (AsmString.find(":H}") == StringRef::npos)
+    return NULL;
+
+  DebugLoc dl = N->getDebugLoc();
+  SDValue Glue = N->getOperand(NumOps-1);
+
+  // The glue node will be appended at the end.
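For context, a hedged example of source that reaches this path (not taken from the patch): an i64 "r" operand whose template uses the H modifier. By this point the IR-level asm string renders %0/%H0 as "$0"/"${0:H}", which is what the ':H}' substring test above detects. The operand walk follows below.

// 64-bit exclusive load in ARM mode: ldrexd needs an even/odd register
// pair, addressed as %0 (low half) and %H0 (high half) of one i64 operand.
unsigned long long load64(const unsigned long long *p) {
  unsigned long long v;
  __asm__ __volatile__("ldrexd %0, %H0, [%1]"
                       : "=&r"(v)
                       : "r"(p)
                       : "memory");
  return v;
}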
+ for(unsigned i = 0; i < NumOps -1; ++i) { + SDValue op = N->getOperand(i); + AsmNodeOperands.push_back(op); + + if (i < InlineAsm::Op_FirstOperand) + continue; + + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(i))) { + Flag = C->getZExtValue(); + Kind = InlineAsm::getKind(Flag); + } + else + continue; + + if (Kind != InlineAsm::Kind_RegUse && Kind != InlineAsm::Kind_RegDef + && Kind != InlineAsm::Kind_RegDefEarlyClobber) + continue; + + unsigned RegNum = InlineAsm::getNumOperandRegisters(Flag); + unsigned RC; + bool HasRC = InlineAsm::hasRegClassConstraint(Flag, RC); + if (!HasRC || RC != ARM::GPRRegClassID || RegNum != 2) + continue; + + assert((i+2 < NumOps-1) && "Invalid number of operands in inline asm"); + SDValue V0 = N->getOperand(i+1); + SDValue V1 = N->getOperand(i+2); + unsigned Reg0 = cast<RegisterSDNode>(V0)->getReg(); + unsigned Reg1 = cast<RegisterSDNode>(V1)->getReg(); + SDValue PairedReg; + MachineRegisterInfo &MRI = MF->getRegInfo(); + + if (Kind == InlineAsm::Kind_RegDef || + Kind == InlineAsm::Kind_RegDefEarlyClobber) { + // Replace the two GPRs with 1 GPRPair and copy values from GPRPair to + // the original GPRs. + + unsigned GPVR = MRI.createVirtualRegister(&ARM::GPRPairRegClass); + PairedReg = CurDAG->getRegister(GPVR, MVT::Untyped); + SDValue Chain = SDValue(N,0); + + SDNode *GU = N->getGluedUser(); + SDValue RegCopy = CurDAG->getCopyFromReg(Chain, dl, GPVR, MVT::Untyped, + Chain.getValue(1)); + + // Extract values from a GPRPair reg and copy to the original GPR reg. + SDValue Sub0 = CurDAG->getTargetExtractSubreg(ARM::gsub_0, dl, MVT::i32, + RegCopy); + SDValue Sub1 = CurDAG->getTargetExtractSubreg(ARM::gsub_1, dl, MVT::i32, + RegCopy); + SDValue T0 = CurDAG->getCopyToReg(Sub0, dl, Reg0, Sub0, + RegCopy.getValue(1)); + SDValue T1 = CurDAG->getCopyToReg(Sub1, dl, Reg1, Sub1, T0.getValue(1)); + + // Update the original glue user. + std::vector<SDValue> Ops(GU->op_begin(), GU->op_end()-1); + Ops.push_back(T1.getValue(1)); + CurDAG->UpdateNodeOperands(GU, &Ops[0], Ops.size()); + GU = T1.getNode(); + } + else { + // For Kind == InlineAsm::Kind_RegUse, we first copy two GPRs into a + // GPRPair and then pass the GPRPair to the inline asm. + SDValue Chain = AsmNodeOperands[InlineAsm::Op_InputChain]; + + // As REG_SEQ doesn't take RegisterSDNode, we copy them first. + SDValue T0 = CurDAG->getCopyFromReg(Chain, dl, Reg0, MVT::i32, + Chain.getValue(1)); + SDValue T1 = CurDAG->getCopyFromReg(Chain, dl, Reg1, MVT::i32, + T0.getValue(1)); + SDValue Pair = SDValue(createGPRPairNode(MVT::Untyped, T0, T1), 0); + + // Copy REG_SEQ into a GPRPair-typed VR and replace the original two + // i32 VRs of inline asm with it. + unsigned GPVR = MRI.createVirtualRegister(&ARM::GPRPairRegClass); + PairedReg = CurDAG->getRegister(GPVR, MVT::Untyped); + Chain = CurDAG->getCopyToReg(T1, dl, GPVR, Pair, T1.getValue(1)); + + AsmNodeOperands[InlineAsm::Op_InputChain] = Chain; + Glue = Chain.getValue(1); + } + + Changed = true; + + if(PairedReg.getNode()) { + Flag = InlineAsm::getFlagWord(Kind, 1 /* RegNum*/); + Flag = InlineAsm::getFlagWordForRegClass(Flag, ARM::GPRPairRegClassID); + // Replace the current flag. + AsmNodeOperands[AsmNodeOperands.size() -1] = CurDAG->getTargetConstant( + Flag, MVT::i32); + // Add the new register node and skip the original two GPRs. + AsmNodeOperands.push_back(PairedReg); + // Skip the next two GPRs. 
+ i += 2; + } + } + + AsmNodeOperands.push_back(Glue); + if (!Changed) + return NULL; + + SDValue New = CurDAG->getNode(ISD::INLINEASM, N->getDebugLoc(), + CurDAG->getVTList(MVT::Other, MVT::Glue), &AsmNodeOperands[0], + AsmNodeOperands.size()); + New->setNodeId(-1); + return New.getNode(); +} + + bool ARMDAGToDAGISel:: SelectInlineAsmMemoryOperand(const SDValue &Op, char ConstraintCode, std::vector<SDValue> &OutOps) { diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index 5b3e31f..ef96e56 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -781,6 +781,8 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) setOperationAction(ISD::FSIN, MVT::f32, Expand); setOperationAction(ISD::FCOS, MVT::f32, Expand); setOperationAction(ISD::FCOS, MVT::f64, Expand); + setOperationAction(ISD::FSINCOS, MVT::f64, Expand); + setOperationAction(ISD::FSINCOS, MVT::f32, Expand); setOperationAction(ISD::FREM, MVT::f64, Expand); setOperationAction(ISD::FREM, MVT::f32, Expand); if (!TM.Options.UseSoftFloat && Subtarget->hasVFP2() && @@ -833,21 +835,21 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) setSchedulingPreference(Sched::Hybrid); //// temporary - rewrite interface to use type - maxStoresPerMemset = 8; - maxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 8 : 4; - maxStoresPerMemcpy = 4; // For @llvm.memcpy -> sequence of stores - maxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 4 : 2; - maxStoresPerMemmove = 4; // For @llvm.memmove -> sequence of stores - maxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 4 : 2; + MaxStoresPerMemset = 8; + MaxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 8 : 4; + MaxStoresPerMemcpy = 4; // For @llvm.memcpy -> sequence of stores + MaxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 4 : 2; + MaxStoresPerMemmove = 4; // For @llvm.memmove -> sequence of stores + MaxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 4 : 2; // On ARM arguments smaller than 4 bytes are extended, so all arguments // are at least 4 bytes aligned. setMinStackArgumentAlignment(4); - benefitFromCodePlacementOpt = true; + BenefitFromCodePlacementOpt = true; // Prefer likely predicted branches to selects on out-of-order cores. - predictableSelectIsExpensive = Subtarget->isLikeA9(); + PredictableSelectIsExpensive = Subtarget->isLikeA9(); setMinFunctionAlignment(Subtarget->isThumb() ? 1 : 2); } @@ -1926,15 +1928,9 @@ ARMTargetLowering::LowerReturn(SDValue Chain, CCInfo.AnalyzeReturn(Outs, CCAssignFnForNode(CallConv, /* Return */ true, isVarArg)); - // If this is the first return lowered for this function, add - // the regs to the liveout set for the function. - if (DAG.getMachineFunction().getRegInfo().liveout_empty()) { - for (unsigned i = 0; i != RVLocs.size(); ++i) - if (RVLocs[i].isRegLoc()) - DAG.getMachineFunction().getRegInfo().addLiveOut(RVLocs[i].getLocReg()); - } - SDValue Flag; + SmallVector<SDValue, 4> RetOps; + RetOps.push_back(Chain); // Operand #0 = Chain (updated below) // Copy the result values into the output registers. 
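A sketch of the node shape LowerReturn now produces (illustration only; the copy loop continues below):

// For an i64 value returned in R0:R1, the return is built as
//   ARMISD::RET_FLAG Chain, Register:i32 R0, Register:i32 R1, Glue
// rather than recording liveouts in MachineRegisterInfo. This is why
// ARMretflag gains SDNPVariadic in ARMInstrInfo.td later in this patch,
// and why the FastISel path above attaches the same registers to BX_RET
// as implicit uses.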
  for (unsigned i = 0, realRVLocIdx = 0;
@@ -1963,10 +1959,12 @@ ARMTargetLowering::LowerReturn(SDValue Chain,
       Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), HalfGPRs, Flag);
       Flag = Chain.getValue(1);
+      RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
       VA = RVLocs[++i]; // skip ahead to next loc
       Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), HalfGPRs.getValue(1),
                                Flag);
       Flag = Chain.getValue(1);
+      RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
       VA = RVLocs[++i]; // skip ahead to next loc

       // Extract the 2nd half and fall through to handle it as an f64 value.
@@ -1979,6 +1977,7 @@ ARMTargetLowering::LowerReturn(SDValue Chain,
                         DAG.getVTList(MVT::i32, MVT::i32), &Arg, 1);
       Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), fmrrd, Flag);
       Flag = Chain.getValue(1);
+      RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
       VA = RVLocs[++i]; // skip ahead to next loc
       Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), fmrrd.getValue(1),
                                Flag);
@@ -1988,15 +1987,16 @@ ARMTargetLowering::LowerReturn(SDValue Chain,
     // Guarantee that all emitted copies are
     // stuck together, avoiding something bad.
     Flag = Chain.getValue(1);
+    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
   }

-  SDValue result;
+  // Update chain and glue.
+  RetOps[0] = Chain;
   if (Flag.getNode())
-    result = DAG.getNode(ARMISD::RET_FLAG, dl, MVT::Other, Chain, Flag);
-  else // Return Void
-    result = DAG.getNode(ARMISD::RET_FLAG, dl, MVT::Other, Chain);
+    RetOps.push_back(Flag);

-  return result;
+  return DAG.getNode(ARMISD::RET_FLAG, dl, MVT::Other,
+                     RetOps.data(), RetOps.size());
 }

 bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
@@ -2576,7 +2576,7 @@ ARMTargetLowering::computeRegArea(CCState &CCInfo, MachineFunction &MF,
 }

 // The remaining GPRs hold either the beginning of variable-argument
-// data, or the beginning of an aggregate passed by value (usuall
+// data, or the beginning of an aggregate passed by value (usually
 // byval). Either way, we allocate stack slots adjacent to the data
 // provided by our caller, and store the unallocated registers there.
 // If this is a variadic function, the va_list pointer will begin with
@@ -4294,6 +4294,21 @@ static bool isVZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
   return true;
 }

+/// \return true if this is a reverse operation on a vector.
+static bool isReverseMask(ArrayRef<int> M, EVT VT) {
+  unsigned NumElts = VT.getVectorNumElements();
+  // Make sure the mask has the right size.
+  if (NumElts != M.size())
+    return false;
+
+  // Look for <15, ..., 3, -1, 1, 0>.
+  for (unsigned i = 0; i != NumElts; ++i)
+    if (M[i] >= 0 && M[i] != (int) (NumElts - 1 - i))
+      return false;
+
+  return true;
+}
+
 // If N is an integer constant that can be moved into a register in one
 // instruction, return an SDValue of such a constant (will become a MOV
 // instruction). Otherwise return null.
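A standalone restatement of the mask predicate above with a few worked masks; this mirrors the logic for illustration (C++11) and is not LLVM code:

#include <cassert>
#include <cstddef>
#include <vector>

// Element i must be undef (-1) or exactly NumElts-1-i.
static bool isReverseMaskLike(const std::vector<int> &M) {
  for (std::size_t i = 0, e = M.size(); i != e; ++i)
    if (M[i] >= 0 && M[i] != int(e - 1 - i))
      return false;
  return true;
}

int main() {
  assert(isReverseMaskLike({7, 6, 5, 4, 3, 2, 1, 0}));   // full v8i16 reverse
  assert(isReverseMaskLike({7, -1, 5, 4, -1, 2, 1, 0})); // undef lanes allowed
  assert(!isReverseMaskLike({0, 1, 2, 3, 4, 5, 6, 7}));  // identity, not a reverse
  return 0;
}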
@@ -4689,7 +4704,8 @@ ARMTargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M, isVZIPMask(M, VT, WhichResult) || isVTRN_v_undef_Mask(M, VT, WhichResult) || isVUZP_v_undef_Mask(M, VT, WhichResult) || - isVZIP_v_undef_Mask(M, VT, WhichResult)); + isVZIP_v_undef_Mask(M, VT, WhichResult) || + ((VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(M, VT))); } /// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit @@ -4793,6 +4809,23 @@ static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op, &VTBLMask[0], 8)); } +static SDValue LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(SDValue Op, + SelectionDAG &DAG) { + DebugLoc DL = Op.getDebugLoc(); + SDValue OpLHS = Op.getOperand(0); + EVT VT = OpLHS.getValueType(); + + assert((VT == MVT::v8i16 || VT == MVT::v16i8) && + "Expect an v8i16/v16i8 type"); + OpLHS = DAG.getNode(ARMISD::VREV64, DL, VT, OpLHS); + // For a v16i8 type: After the VREV, we have got <8, ...15, 8, ..., 0>. Now, + // extract the first 8 bytes into the top double word and the last 8 bytes + // into the bottom double word. The v8i16 case is similar. + unsigned ExtractNum = (VT == MVT::v16i8) ? 8 : 4; + return DAG.getNode(ARMISD::VEXT, DL, VT, OpLHS, OpLHS, + DAG.getConstant(ExtractNum, MVT::i32)); +} + static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); @@ -4930,6 +4963,9 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { return DAG.getNode(ISD::BITCAST, dl, VT, Val); } + if ((VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(ShuffleMask, VT)) + return LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(Op, DAG); + if (VT == MVT::v8i8) { SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG); if (NewOp.getNode()) @@ -5967,9 +6003,6 @@ ARMTargetLowering::EmitAtomicBinary64(MachineInstr *MI, MachineBasicBlock *BB, MRI.constrainRegClass(ptr, &ARM::rGPRRegClass); } - unsigned ldrOpc = isThumb2 ? ARM::t2LDREXD : ARM::LDREXD; - unsigned strOpc = isThumb2 ? ARM::t2STREXD : ARM::STREXD; - MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *contBB = 0, *cont2BB = 0; if (IsCmpxchg || IsMinMax) @@ -6007,42 +6040,26 @@ ARMTargetLowering::EmitAtomicBinary64(MachineInstr *MI, MachineBasicBlock *BB, // cmp storesuccess, #0 // bne- loopMBB // fallthrough --> exitMBB - // - // Note that the registers are explicitly specified because there is not any - // way to force the register allocator to allocate a register pair. - // - // FIXME: The hardcoded registers are not necessary for Thumb2, but we - // need to properly enforce the restriction that the two output registers - // for ldrexd must be different. BB = loopMBB; + // Load - unsigned GPRPair0 = MRI.createVirtualRegister(&ARM::GPRPairRegClass); - unsigned GPRPair1 = MRI.createVirtualRegister(&ARM::GPRPairRegClass); - unsigned GPRPair2; - if (IsMinMax) { - //We need an extra double register for doing min/max. 
- unsigned undef = MRI.createVirtualRegister(&ARM::GPRPairRegClass); - unsigned r1 = MRI.createVirtualRegister(&ARM::GPRPairRegClass); - GPRPair2 = MRI.createVirtualRegister(&ARM::GPRPairRegClass); - BuildMI(BB, dl, TII->get(TargetOpcode::IMPLICIT_DEF), undef); - BuildMI(BB, dl, TII->get(TargetOpcode::INSERT_SUBREG), r1) - .addReg(undef) - .addReg(vallo) - .addImm(ARM::gsub_0); - BuildMI(BB, dl, TII->get(TargetOpcode::INSERT_SUBREG), GPRPair2) - .addReg(r1) - .addReg(valhi) - .addImm(ARM::gsub_1); + if (isThumb2) { + AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2LDREXD)) + .addReg(destlo, RegState::Define) + .addReg(desthi, RegState::Define) + .addReg(ptr)); + } else { + unsigned GPRPair0 = MRI.createVirtualRegister(&ARM::GPRPairRegClass); + AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::LDREXD)) + .addReg(GPRPair0, RegState::Define).addReg(ptr)); + // Copy r2/r3 into dest. (This copy will normally be coalesced.) + BuildMI(BB, dl, TII->get(TargetOpcode::COPY), destlo) + .addReg(GPRPair0, 0, ARM::gsub_0); + BuildMI(BB, dl, TII->get(TargetOpcode::COPY), desthi) + .addReg(GPRPair0, 0, ARM::gsub_1); } - AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc)) - .addReg(GPRPair0, RegState::Define).addReg(ptr)); - // Copy r2/r3 into dest. (This copy will normally be coalesced.) - BuildMI(BB, dl, TII->get(TargetOpcode::COPY), destlo) - .addReg(GPRPair0, 0, ARM::gsub_0); - BuildMI(BB, dl, TII->get(TargetOpcode::COPY), desthi) - .addReg(GPRPair0, 0, ARM::gsub_1); - + unsigned StoreLo, StoreHi; if (IsCmpxchg) { // Add early exit for (unsigned i = 0; i < 2; i++) { @@ -6058,19 +6075,8 @@ ARMTargetLowering::EmitAtomicBinary64(MachineInstr *MI, MachineBasicBlock *BB, } // Copy to physregs for strexd - unsigned setlo = MI->getOperand(5).getReg(); - unsigned sethi = MI->getOperand(6).getReg(); - unsigned undef = MRI.createVirtualRegister(&ARM::GPRPairRegClass); - unsigned r1 = MRI.createVirtualRegister(&ARM::GPRPairRegClass); - BuildMI(BB, dl, TII->get(TargetOpcode::IMPLICIT_DEF), undef); - BuildMI(BB, dl, TII->get(TargetOpcode::INSERT_SUBREG), r1) - .addReg(undef) - .addReg(setlo) - .addImm(ARM::gsub_0); - BuildMI(BB, dl, TII->get(TargetOpcode::INSERT_SUBREG), GPRPair1) - .addReg(r1) - .addReg(sethi) - .addImm(ARM::gsub_1); + StoreLo = MI->getOperand(5).getReg(); + StoreHi = MI->getOperand(6).getReg(); } else if (Op1) { // Perform binary operation unsigned tmpRegLo = MRI.createVirtualRegister(TRC); @@ -6082,32 +6088,13 @@ ARMTargetLowering::EmitAtomicBinary64(MachineInstr *MI, MachineBasicBlock *BB, .addReg(desthi).addReg(valhi)) .addReg(IsMinMax ? 
ARM::CPSR : 0, getDefRegState(IsMinMax)); - unsigned UndefPair = MRI.createVirtualRegister(&ARM::GPRPairRegClass); - BuildMI(BB, dl, TII->get(TargetOpcode::IMPLICIT_DEF), UndefPair); - unsigned r1 = MRI.createVirtualRegister(&ARM::GPRPairRegClass); - BuildMI(BB, dl, TII->get(TargetOpcode::INSERT_SUBREG), r1) - .addReg(UndefPair) - .addReg(tmpRegLo) - .addImm(ARM::gsub_0); - BuildMI(BB, dl, TII->get(TargetOpcode::INSERT_SUBREG), GPRPair1) - .addReg(r1) - .addReg(tmpRegHi) - .addImm(ARM::gsub_1); + StoreLo = tmpRegLo; + StoreHi = tmpRegHi; } else { // Copy to physregs for strexd - unsigned UndefPair = MRI.createVirtualRegister(&ARM::GPRPairRegClass); - unsigned r1 = MRI.createVirtualRegister(&ARM::GPRPairRegClass); - BuildMI(BB, dl, TII->get(TargetOpcode::IMPLICIT_DEF), UndefPair); - BuildMI(BB, dl, TII->get(TargetOpcode::INSERT_SUBREG), r1) - .addReg(UndefPair) - .addReg(vallo) - .addImm(ARM::gsub_0); - BuildMI(BB, dl, TII->get(TargetOpcode::INSERT_SUBREG), GPRPair1) - .addReg(r1) - .addReg(valhi) - .addImm(ARM::gsub_1); + StoreLo = vallo; + StoreHi = valhi; } - unsigned GPRPairStore = GPRPair1; if (IsMinMax) { // Compare and branch to exit block. BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)) @@ -6115,12 +6102,33 @@ ARMTargetLowering::EmitAtomicBinary64(MachineInstr *MI, MachineBasicBlock *BB, BB->addSuccessor(exitMBB); BB->addSuccessor(contBB); BB = contBB; - GPRPairStore = GPRPair2; + StoreLo = vallo; + StoreHi = valhi; } // Store - AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), storesuccess) - .addReg(GPRPairStore).addReg(ptr)); + if (isThumb2) { + AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2STREXD), storesuccess) + .addReg(StoreLo).addReg(StoreHi).addReg(ptr)); + } else { + // Marshal a pair... + unsigned StorePair = MRI.createVirtualRegister(&ARM::GPRPairRegClass); + unsigned UndefPair = MRI.createVirtualRegister(&ARM::GPRPairRegClass); + unsigned r1 = MRI.createVirtualRegister(&ARM::GPRPairRegClass); + BuildMI(BB, dl, TII->get(TargetOpcode::IMPLICIT_DEF), UndefPair); + BuildMI(BB, dl, TII->get(TargetOpcode::INSERT_SUBREG), r1) + .addReg(UndefPair) + .addReg(StoreLo) + .addImm(ARM::gsub_0); + BuildMI(BB, dl, TII->get(TargetOpcode::INSERT_SUBREG), StorePair) + .addReg(r1) + .addReg(StoreHi) + .addImm(ARM::gsub_1); + + // ...and store it + AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::STREXD), storesuccess) + .addReg(StorePair).addReg(ptr)); + } // Cmp+jump AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri)) .addReg(storesuccess).addImm(0)); @@ -6329,7 +6337,16 @@ EmitSjLjDispatchBlock(MachineInstr *MI, MachineBasicBlock *MBB) const { DispatchBB->setIsLandingPad(); MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock(); - BuildMI(TrapBB, dl, TII->get(Subtarget->isThumb() ? ARM::tTRAP : ARM::TRAP)); + unsigned trap_opcode; + if (Subtarget->isThumb()) { + trap_opcode = ARM::tTRAP; + } else { + if (Subtarget->useNaClTrap()) + trap_opcode = ARM::TRAPNaCl; + else + trap_opcode = ARM::TRAP; + } + BuildMI(TrapBB, dl, TII->get(trap_opcode)); DispatchBB->addSuccessor(TrapBB); MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock(); @@ -7123,7 +7140,7 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr, isThumb2 ? ARM::t2SBCrr : ARM::SBCrr, /*NeedsCarry*/ true, /*IsCmpxchg*/false, - /*IsMinMax*/ true, ARMCC::LE); + /*IsMinMax*/ true, ARMCC::LT); case ARM::ATOMMAX6432: return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr, isThumb2 ? 
ARM::t2SBCrr : ARM::SBCrr,
@@ -7133,7 +7150,7 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
     return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr,
                               isThumb2 ? ARM::t2SBCrr : ARM::SBCrr,
                               /*NeedsCarry*/ true, /*IsCmpxchg*/false,
-                              /*IsMinMax*/ true, ARMCC::LS);
+                              /*IsMinMax*/ true, ARMCC::LO);
   case ARM::ATOMUMAX6432:
     return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr,
                               isThumb2 ? ARM::t2SBCrr : ARM::SBCrr,
@@ -10343,4 +10360,3 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,

   return false;
 }
-
diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td
index 12712c0..9409f35 100644
--- a/lib/Target/ARM/ARMInstrInfo.td
+++ b/lib/Target/ARM/ARMInstrInfo.td
@@ -117,7 +117,7 @@ def ARMcall_nolink : SDNode<"ARMISD::CALL_NOLINK", SDT_ARMcall,
                             SDNPVariadic]>;

 def ARMretflag : SDNode<"ARMISD::RET_FLAG", SDTNone,
-                        [SDNPHasChain, SDNPOptInGlue]>;
+                        [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;

 def ARMcmov : SDNode<"ARMISD::CMOV", SDT_ARMCMov,
                      [SDNPInGlue]>;
@@ -239,6 +239,9 @@ def IsARM : Predicate<"!Subtarget->isThumb()">,
 def IsIOS : Predicate<"Subtarget->isTargetIOS()">;
 def IsNotIOS : Predicate<"!Subtarget->isTargetIOS()">;
 def IsNaCl : Predicate<"Subtarget->isTargetNaCl()">;
+def UseNaClTrap : Predicate<"Subtarget->useNaClTrap()">,
+                  AssemblerPredicate<"FeatureNaClTrap", "NaCl">;
+def DontUseNaClTrap : Predicate<"!Subtarget->useNaClTrap()">;

 // FIXME: Eventually this will be just "hasV6T2Ops".
 def UseMovt : Predicate<"Subtarget->useMovt()">;
@@ -1762,11 +1765,32 @@ def DBG : AI<(outs), (ins imm0_15:$opt), MiscFrm, NoItinerary, "dbg", "\t$opt",
   let Inst{3-0} = opt;
 }

-// A5.4 Permanently UNDEFINED instructions.
+/*
+ * A5.4 Permanently UNDEFINED instructions.
+ *
+ * For most targets use UDF #65006, for which the OS will generate SIGTRAP.
+ * Other UDF encodings generate SIGILL.
+ *
+ * NaCl's OS instead chooses an ARM UDF encoding that's also a UDF in Thumb.
+ * Encoding A1:
+ *  1110 0111 1111 iiii iiii iiii 1111 iiii
+ * Encoding T1:
+ *  1101 1110 iiii iiii
+ * It uses the following encoding:
+ *  1110 0111 1111 1110 1101 1110 1111 0000
+ *  - In ARM: UDF #60896;
+ *  - In Thumb: UDF #240 followed by a branch-to-self.
+ */ +let isBarrier = 1, isTerminator = 1 in +def TRAPNaCl : AXI<(outs), (ins), MiscFrm, NoItinerary, + "trap", [(trap)]>, + Requires<[IsARM,UseNaClTrap]> { + let Inst = 0xe7fedef0; +} let isBarrier = 1, isTerminator = 1 in def TRAP : AXI<(outs), (ins), MiscFrm, NoItinerary, "trap", [(trap)]>, - Requires<[IsARM]> { + Requires<[IsARM,DontUseNaClTrap]> { let Inst = 0xe7ffdefe; } @@ -2079,6 +2103,18 @@ def SRSIB_UPD : SRSI<1, "srsib\tsp!, $mode"> { let Inst{24-23} = 0b11; } +def : ARMInstAlias<"srsda $mode", (SRSDA imm0_31:$mode)>; +def : ARMInstAlias<"srsda $mode!", (SRSDA_UPD imm0_31:$mode)>; + +def : ARMInstAlias<"srsdb $mode", (SRSDB imm0_31:$mode)>; +def : ARMInstAlias<"srsdb $mode!", (SRSDB_UPD imm0_31:$mode)>; + +def : ARMInstAlias<"srsia $mode", (SRSIA imm0_31:$mode)>; +def : ARMInstAlias<"srsia $mode!", (SRSIA_UPD imm0_31:$mode)>; + +def : ARMInstAlias<"srsib $mode", (SRSIB imm0_31:$mode)>; +def : ARMInstAlias<"srsib $mode!", (SRSIB_UPD imm0_31:$mode)>; + // Return From Exception class RFEI<bit wb, string asm> : XI<(outs), (ins GPR:$Rn), AddrModeNone, 4, IndexModeNone, BrFrm, diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td index 697a8d2..0411ac4 100644 --- a/lib/Target/ARM/ARMInstrNEON.td +++ b/lib/Target/ARM/ARMInstrNEON.td @@ -4264,6 +4264,7 @@ def VCEQfd : N3VD<0,0,0b00,0b1110,0, IIC_VBIND, "vceq", "f32", v2i32, v2f32, def VCEQfq : N3VQ<0,0,0b00,0b1110,0, IIC_VBINQ, "vceq", "f32", v4i32, v4f32, NEONvceq, 1>; +let TwoOperandAliasConstraint = "$Vm = $Vd" in defm VCEQz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00010, 0, "vceq", "i", "$Vd, $Vm, #0", NEONvceqz>; @@ -4277,10 +4278,12 @@ def VCGEfd : N3VD<1,0,0b00,0b1110,0, IIC_VBIND, "vcge", "f32", v2i32, v2f32, def VCGEfq : N3VQ<1,0,0b00,0b1110,0, IIC_VBINQ, "vcge", "f32", v4i32, v4f32, NEONvcge, 0>; +let TwoOperandAliasConstraint = "$Vm = $Vd" in { defm VCGEz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00001, 0, "vcge", "s", "$Vd, $Vm, #0", NEONvcgez>; defm VCLEz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00011, 0, "vcle", "s", "$Vd, $Vm, #0", NEONvclez>; +} // VCGT : Vector Compare Greater Than defm VCGTs : N3V_QHS<0, 0, 0b0011, 0, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, @@ -4292,10 +4295,12 @@ def VCGTfd : N3VD<1,0,0b10,0b1110,0, IIC_VBIND, "vcgt", "f32", v2i32, v2f32, def VCGTfq : N3VQ<1,0,0b10,0b1110,0, IIC_VBINQ, "vcgt", "f32", v4i32, v4f32, NEONvcgt, 0>; +let TwoOperandAliasConstraint = "$Vm = $Vd" in { defm VCGTz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00000, 0, "vcgt", "s", "$Vd, $Vm, #0", NEONvcgtz>; defm VCLTz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00100, 0, "vclt", "s", "$Vd, $Vm, #0", NEONvcltz>; +} // VACGE : Vector Absolute Compare Greater Than or Equal (aka VCAGE) def VACGEd : N3VDInt<1, 0, 0b00, 0b1110, 1, N3RegFrm, IIC_VBIND, "vacge", @@ -5740,6 +5745,10 @@ def : Pat<(v2f64 (bitconvert (v8i16 QPR:$src))), (v2f64 QPR:$src)>; def : Pat<(v2f64 (bitconvert (v16i8 QPR:$src))), (v2f64 QPR:$src)>; def : Pat<(v2f64 (bitconvert (v4f32 QPR:$src))), (v2f64 QPR:$src)>; +// Fold extracting an element out of a v2i32 into a vfp register. +def : Pat<(f32 (bitconvert (i32 (extractelt (v2i32 DPR:$src), imm:$lane)))), + (f32 (EXTRACT_SUBREG DPR:$src, (SSubReg_f32_reg imm:$lane)))>; + // Vector lengthening move with load, matching extending loads. // extload, zextload and sextload for a standard lengthening load. 
Example: diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td index cf8b302..c9d709e 100644 --- a/lib/Target/ARM/ARMInstrThumb2.td +++ b/lib/Target/ARM/ARMInstrThumb2.td @@ -3481,6 +3481,13 @@ def t2SRSIA_UPD : T2SRS<0b11, 1, (outs), (ins imm0_31:$mode), NoItinerary, def t2SRSIA : T2SRS<0b11, 0, (outs), (ins imm0_31:$mode), NoItinerary, "srsia","\tsp, $mode", []>; + +def : t2InstAlias<"srsdb${p} $mode", (t2SRSDB imm0_31:$mode, pred:$p)>; +def : t2InstAlias<"srsdb${p} $mode!", (t2SRSDB_UPD imm0_31:$mode, pred:$p)>; + +def : t2InstAlias<"srsia${p} $mode", (t2SRSIA imm0_31:$mode, pred:$p)>; +def : t2InstAlias<"srsia${p} $mode!", (t2SRSIA_UPD imm0_31:$mode, pred:$p)>; + // Return From Exception is a system instruction. class T2RFE<bits<12> op31_20, dag oops, dag iops, InstrItinClass itin, string opc, string asm, list<dag> pattern> diff --git a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp index a1c21ee..98bd6c1 100644 --- a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp +++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp @@ -1188,7 +1188,6 @@ bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB, OddDeadKill = true; } // Never kill the base register in the first instruction. - // <rdar://problem/11101911> if (EvenReg == BaseReg) EvenDeadKill = false; InsertLDR_STR(MBB, MBBI, OffImm, isLd, dl, NewOpc, diff --git a/lib/Target/ARM/ARMScheduleA9.td b/lib/Target/ARM/ARMScheduleA9.td index 404634f..4191931 100644 --- a/lib/Target/ARM/ARMScheduleA9.td +++ b/lib/Target/ARM/ARMScheduleA9.td @@ -1887,6 +1887,9 @@ def CortexA9Model : SchedMachineModel { let LoadLatency = 2; // Optimistic load latency assuming bypass. // This is overriden by OperandCycles if the // Itineraries are queried instead. + let ILPWindow = 10; // Don't reschedule small blocks to hide + // latency. Minimum latency requirements are already + // modeled strictly by reserving resources. let MispredictPenalty = 8; // Based on estimate of pipeline depth. 
let Itineraries = CortexA9Itineraries; diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp index 058d4c4..f4d568c 100644 --- a/lib/Target/ARM/ARMSubtarget.cpp +++ b/lib/Target/ARM/ARMSubtarget.cpp @@ -14,7 +14,9 @@ #include "ARMSubtarget.h" #include "ARMBaseInstrInfo.h" #include "ARMBaseRegisterInfo.h" +#include "llvm/IR/Attributes.h" #include "llvm/IR/GlobalValue.h" +#include "llvm/IR/Function.h" #include "llvm/Support/CommandLine.h" #include "llvm/Target/TargetInstrInfo.h" @@ -43,58 +45,83 @@ ARMSubtarget::ARMSubtarget(const std::string &TT, const std::string &CPU, const std::string &FS) : ARMGenSubtargetInfo(TT, CPU, FS) , ARMProcFamily(Others) - , HasV4TOps(false) - , HasV5TOps(false) - , HasV5TEOps(false) - , HasV6Ops(false) - , HasV6T2Ops(false) - , HasV7Ops(false) - , HasVFPv2(false) - , HasVFPv3(false) - , HasVFPv4(false) - , HasNEON(false) - , UseNEONForSinglePrecisionFP(false) - , UseMulOps(UseFusedMulOps) - , SlowFPVMLx(false) - , HasVMLxForwarding(false) - , SlowFPBrcc(false) - , InThumbMode(false) - , HasThumb2(false) - , IsMClass(false) - , NoARM(false) - , PostRAScheduler(false) - , IsR9Reserved(ReserveR9) - , UseMovt(false) - , SupportsTailCall(false) - , HasFP16(false) - , HasD16(false) - , HasHardwareDivide(false) - , HasHardwareDivideInARM(false) - , HasT2ExtractPack(false) - , HasDataBarrier(false) - , Pref32BitThumb(false) - , AvoidCPSRPartialUpdate(false) - , AvoidMOVsShifterOperand(false) - , HasRAS(false) - , HasMPExtension(false) - , FPOnlySP(false) - , AllowsUnalignedMem(false) - , Thumb2DSP(false) , stackAlignment(4) , CPUString(CPU) , TargetTriple(TT) , TargetABI(ARM_ABI_APCS) { - // Determine default and user specified characteristics + initializeEnvironment(); + resetSubtargetFeatures(CPU, FS); +} + +void ARMSubtarget::initializeEnvironment() { + HasV4TOps = false; + HasV5TOps = false; + HasV5TEOps = false; + HasV6Ops = false; + HasV6T2Ops = false; + HasV7Ops = false; + HasVFPv2 = false; + HasVFPv3 = false; + HasVFPv4 = false; + HasNEON = false; + UseNEONForSinglePrecisionFP = false; + UseMulOps = UseFusedMulOps; + SlowFPVMLx = false; + HasVMLxForwarding = false; + SlowFPBrcc = false; + InThumbMode = false; + HasThumb2 = false; + IsMClass = false; + NoARM = false; + PostRAScheduler = false; + IsR9Reserved = ReserveR9; + UseMovt = false; + SupportsTailCall = false; + HasFP16 = false; + HasD16 = false; + HasHardwareDivide = false; + HasHardwareDivideInARM = false; + HasT2ExtractPack = false; + HasDataBarrier = false; + Pref32BitThumb = false; + AvoidCPSRPartialUpdate = false; + AvoidMOVsShifterOperand = false; + HasRAS = false; + HasMPExtension = false; + FPOnlySP = false; + AllowsUnalignedMem = false; + Thumb2DSP = false; + UseNaClTrap = false; +} + +void ARMSubtarget::resetSubtargetFeatures(const MachineFunction *MF) { + AttributeSet FnAttrs = MF->getFunction()->getAttributes(); + Attribute CPUAttr = FnAttrs.getAttribute(AttributeSet::FunctionIndex, + "target-cpu"); + Attribute FSAttr = FnAttrs.getAttribute(AttributeSet::FunctionIndex, + "target-features"); + std::string CPU = + !CPUAttr.hasAttribute(Attribute::None) ?CPUAttr.getValueAsString() : ""; + std::string FS = + !FSAttr.hasAttribute(Attribute::None) ? 
FSAttr.getValueAsString() : ""; + if (!FS.empty()) { + initializeEnvironment(); + resetSubtargetFeatures(CPU, FS); + } +} + +void ARMSubtarget::resetSubtargetFeatures(StringRef CPU, StringRef FS) { if (CPUString.empty()) CPUString = "generic"; // Insert the architecture feature derived from the target triple into the // feature string. This is important for setting features that are implied // based on the architecture version. - std::string ArchFS = ARM_MC::ParseARMTriple(TT, CPUString); + std::string ArchFS = ARM_MC::ParseARMTriple(TargetTriple.getTriple(), + CPUString); if (!FS.empty()) { if (!ArchFS.empty()) - ArchFS = ArchFS + "," + FS; + ArchFS = ArchFS + "," + FS.str(); else ArchFS = FS; } @@ -111,7 +138,8 @@ ARMSubtarget::ARMSubtarget(const std::string &TT, const std::string &CPU, // Initialize scheduling itinerary for the specified CPU. InstrItins = getInstrItineraryForCPU(CPUString); - if ((TT.find("eabi") != std::string::npos) || (isTargetIOS() && isMClass())) + if ((TargetTriple.getTriple().find("eabi") != std::string::npos) || + (isTargetIOS() && isMClass())) // FIXME: We might want to separate AAPCS and EABI. Some systems, e.g. // Darwin-EABI conforms to AACPS but not the rest of EABI. TargetABI = ARM_ABI_AAPCS; diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h index 64878cd..8ce22e1 100644 --- a/lib/Target/ARM/ARMSubtarget.h +++ b/lib/Target/ARM/ARMSubtarget.h @@ -156,6 +156,9 @@ protected: /// and such) instructions in Thumb2 code. bool Thumb2DSP; + /// NaCl TRAP instruction is generated instead of the regular TRAP. + bool UseNaClTrap; + /// stackAlignment - The minimum alignment known to hold of the stack frame on /// entry to the function and which must be maintained by every function. unsigned stackAlignment; @@ -199,6 +202,12 @@ protected: /// subtarget options. Definition of function is auto generated by tblgen. void ParseSubtargetFeatures(StringRef CPU, StringRef FS); + /// \brief Reset the features for the ARM target. 
+ virtual void resetSubtargetFeatures(const MachineFunction *MF); +private: + void initializeEnvironment(); + void resetSubtargetFeatures(StringRef CPU, StringRef FS); +public: void computeIssueWidth(); bool hasV4TOps() const { return HasV4TOps; } @@ -241,6 +250,7 @@ protected: bool hasRAS() const { return HasRAS; } bool hasMPExtension() const { return HasMPExtension; } bool hasThumb2DSP() const { return Thumb2DSP; } + bool useNaClTrap() const { return UseNaClTrap; } bool hasFP16() const { return HasFP16; } bool hasD16() const { return HasD16; } diff --git a/lib/Target/ARM/ARMTargetMachine.h b/lib/Target/ARM/ARMTargetMachine.h index be6bec7..d4caf5c 100644 --- a/lib/Target/ARM/ARMTargetMachine.h +++ b/lib/Target/ARM/ARMTargetMachine.h @@ -46,6 +46,10 @@ public: virtual ARMJITInfo *getJITInfo() { return &JITInfo; } virtual const ARMSubtarget *getSubtargetImpl() const { return &Subtarget; } + virtual const ARMTargetLowering *getTargetLowering() const { + // Implemented by derived classes + llvm_unreachable("getTargetLowering not implemented"); + } virtual const InstrItineraryData *getInstrItineraryData() const { return &InstrItins; } diff --git a/lib/Target/ARM/ARMTargetTransformInfo.cpp b/lib/Target/ARM/ARMTargetTransformInfo.cpp index 03a23be..01c04b4 100644 --- a/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -20,6 +20,7 @@ #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Support/Debug.h" #include "llvm/Target/TargetLowering.h" +#include "llvm/Target/CostTable.h" using namespace llvm; // Declare the pass initialization routine locally as target-specific passes @@ -34,18 +35,20 @@ namespace { class ARMTTI : public ImmutablePass, public TargetTransformInfo { const ARMBaseTargetMachine *TM; const ARMSubtarget *ST; + const ARMTargetLowering *TLI; /// Estimate the overhead of scalarizing an instruction. Insert and Extract /// are set if the result needs to be inserted and/or extracted from vectors. 
unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const; public: - ARMTTI() : ImmutablePass(ID), TM(0), ST(0) { + ARMTTI() : ImmutablePass(ID), TM(0), ST(0), TLI(0) { llvm_unreachable("This pass cannot be directly constructed"); } ARMTTI(const ARMBaseTargetMachine *TM) - : ImmutablePass(ID), TM(TM), ST(TM->getSubtargetImpl()) { + : ImmutablePass(ID), TM(TM), ST(TM->getSubtargetImpl()), + TLI(TM->getTargetLowering()) { initializeARMTTIPass(*PassRegistry::getPassRegistry()); } @@ -77,6 +80,52 @@ public: virtual unsigned getIntImmCost(const APInt &Imm, Type *Ty) const; /// @} + + + /// \name Vector TTI Implementations + /// @{ + + unsigned getNumberOfRegisters(bool Vector) const { + if (Vector) { + if (ST->hasNEON()) + return 16; + return 0; + } + + if (ST->isThumb1Only()) + return 8; + return 16; + } + + unsigned getRegisterBitWidth(bool Vector) const { + if (Vector) { + if (ST->hasNEON()) + return 128; + return 0; + } + + return 32; + } + + unsigned getMaximumUnrollFactor() const { + // These are out of order CPUs: + if (ST->isCortexA15() || ST->isSwift()) + return 2; + return 1; + } + + unsigned getShuffleCost(ShuffleKind Kind, Type *Tp, + int Index, Type *SubTp) const; + + unsigned getCastInstrCost(unsigned Opcode, Type *Dst, + Type *Src) const; + + unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) const; + + unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) const; + + unsigned getAddressComputationCost(Type *Val) const; + /// @} }; } // end anonymous namespace @@ -122,3 +171,200 @@ unsigned ARMTTI::getIntImmCost(const APInt &Imm, Type *Ty) const { } return 2; } + +unsigned ARMTTI::getCastInstrCost(unsigned Opcode, Type *Dst, + Type *Src) const { + int ISD = TLI->InstructionOpcodeToISD(Opcode); + assert(ISD && "Invalid opcode"); + + EVT SrcTy = TLI->getValueType(Src); + EVT DstTy = TLI->getValueType(Dst); + + if (!SrcTy.isSimple() || !DstTy.isSimple()) + return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src); + + // Some arithmetic, load and store operations have specific instructions + // to cast up/down their types automatically at no extra cost. + // TODO: Get these tables to know at least what the related operations are. + static const TypeConversionCostTblEntry<MVT> NEONVectorConversionTbl[] = { + { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 0 }, + { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 0 }, + { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 1 }, + { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 1 }, + { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 0 }, + { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 }, + + // Vector float <-> i32 conversions. + { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, + { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, + { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 }, + { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 }, + + // Vector double <-> i32 conversions. + { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 }, + { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 }, + { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 }, + { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 } + }; + + if (SrcTy.isVector() && ST->hasNEON()) { + int Idx = ConvertCostTableLookup<MVT>(NEONVectorConversionTbl, + array_lengthof(NEONVectorConversionTbl), + ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()); + if (Idx != -1) + return NEONVectorConversionTbl[Idx].Cost; + } + + // Scalar float to integer conversions. 
+  static const TypeConversionCostTblEntry<MVT> NEONFloatConversionTbl[] = {
+    { ISD::FP_TO_SINT, MVT::i1, MVT::f32, 2 },
+    { ISD::FP_TO_UINT, MVT::i1, MVT::f32, 2 },
+    { ISD::FP_TO_SINT, MVT::i1, MVT::f64, 2 },
+    { ISD::FP_TO_UINT, MVT::i1, MVT::f64, 2 },
+    { ISD::FP_TO_SINT, MVT::i8, MVT::f32, 2 },
+    { ISD::FP_TO_UINT, MVT::i8, MVT::f32, 2 },
+    { ISD::FP_TO_SINT, MVT::i8, MVT::f64, 2 },
+    { ISD::FP_TO_UINT, MVT::i8, MVT::f64, 2 },
+    { ISD::FP_TO_SINT, MVT::i16, MVT::f32, 2 },
+    { ISD::FP_TO_UINT, MVT::i16, MVT::f32, 2 },
+    { ISD::FP_TO_SINT, MVT::i16, MVT::f64, 2 },
+    { ISD::FP_TO_UINT, MVT::i16, MVT::f64, 2 },
+    { ISD::FP_TO_SINT, MVT::i32, MVT::f32, 2 },
+    { ISD::FP_TO_UINT, MVT::i32, MVT::f32, 2 },
+    { ISD::FP_TO_SINT, MVT::i32, MVT::f64, 2 },
+    { ISD::FP_TO_UINT, MVT::i32, MVT::f64, 2 },
+    { ISD::FP_TO_SINT, MVT::i64, MVT::f32, 10 },
+    { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 10 },
+    { ISD::FP_TO_SINT, MVT::i64, MVT::f64, 10 },
+    { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 10 }
+  };
+  if (SrcTy.isFloatingPoint() && ST->hasNEON()) {
+    int Idx = ConvertCostTableLookup<MVT>(NEONFloatConversionTbl,
+                                        array_lengthof(NEONFloatConversionTbl),
+                                        ISD, DstTy.getSimpleVT(),
+                                        SrcTy.getSimpleVT());
+    if (Idx != -1)
+      return NEONFloatConversionTbl[Idx].Cost;
+  }
+
+
+  // Scalar integer to float conversions.
+  static const TypeConversionCostTblEntry<MVT> NEONIntegerConversionTbl[] = {
+    { ISD::SINT_TO_FP, MVT::f32, MVT::i1, 2 },
+    { ISD::UINT_TO_FP, MVT::f32, MVT::i1, 2 },
+    { ISD::SINT_TO_FP, MVT::f64, MVT::i1, 2 },
+    { ISD::UINT_TO_FP, MVT::f64, MVT::i1, 2 },
+    { ISD::SINT_TO_FP, MVT::f32, MVT::i8, 2 },
+    { ISD::UINT_TO_FP, MVT::f32, MVT::i8, 2 },
+    { ISD::SINT_TO_FP, MVT::f64, MVT::i8, 2 },
+    { ISD::UINT_TO_FP, MVT::f64, MVT::i8, 2 },
+    { ISD::SINT_TO_FP, MVT::f32, MVT::i16, 2 },
+    { ISD::UINT_TO_FP, MVT::f32, MVT::i16, 2 },
+    { ISD::SINT_TO_FP, MVT::f64, MVT::i16, 2 },
+    { ISD::UINT_TO_FP, MVT::f64, MVT::i16, 2 },
+    { ISD::SINT_TO_FP, MVT::f32, MVT::i32, 2 },
+    { ISD::UINT_TO_FP, MVT::f32, MVT::i32, 2 },
+    { ISD::SINT_TO_FP, MVT::f64, MVT::i32, 2 },
+    { ISD::UINT_TO_FP, MVT::f64, MVT::i32, 2 },
+    { ISD::SINT_TO_FP, MVT::f32, MVT::i64, 10 },
+    { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 10 },
+    { ISD::SINT_TO_FP, MVT::f64, MVT::i64, 10 },
+    { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 10 }
+  };
+
+  if (SrcTy.isInteger() && ST->hasNEON()) {
+    int Idx = ConvertCostTableLookup<MVT>(NEONIntegerConversionTbl,
+                                      array_lengthof(NEONIntegerConversionTbl),
+                                      ISD, DstTy.getSimpleVT(),
+                                      SrcTy.getSimpleVT());
+    if (Idx != -1)
+      return NEONIntegerConversionTbl[Idx].Cost;
+  }
+
+  // Scalar integer conversion costs.
+  static const TypeConversionCostTblEntry<MVT> ARMIntegerConversionTbl[] = {
+    // i16 -> i64 requires two dependent operations.
+    { ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 2 },
+
+    // Truncates on i64 are assumed to be free.
+    { ISD::TRUNCATE, MVT::i32, MVT::i64, 0 },
+    { ISD::TRUNCATE, MVT::i16, MVT::i64, 0 },
+    { ISD::TRUNCATE, MVT::i8, MVT::i64, 0 },
+    { ISD::TRUNCATE, MVT::i1, MVT::i64, 0 }
+  };
+
+  if (SrcTy.isInteger()) {
+    int Idx =
+      ConvertCostTableLookup<MVT>(ARMIntegerConversionTbl,
+                                  array_lengthof(ARMIntegerConversionTbl),
+                                  ISD, DstTy.getSimpleVT(),
+                                  SrcTy.getSimpleVT());
+    if (Idx != -1)
+      return ARMIntegerConversionTbl[Idx].Cost;
+  }
+
+
+  return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src);
+}
+
+unsigned ARMTTI::getVectorInstrCost(unsigned Opcode, Type *ValTy,
+                                    unsigned Index) const {
+  // Penalize inserting into a D-subregister. We end up with a three times
+  // lower estimated throughput on swift.
+  if (ST->isSwift() &&
+      Opcode == Instruction::InsertElement &&
+      ValTy->isVectorTy() &&
+      ValTy->getScalarSizeInBits() <= 32)
+    return 3;
+
+  return TargetTransformInfo::getVectorInstrCost(Opcode, ValTy, Index);
+}
+
+unsigned ARMTTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
+                                    Type *CondTy) const {
+
+  int ISD = TLI->InstructionOpcodeToISD(Opcode);
+  // On NEON a vector select gets lowered to vbsl.
+  if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT) {
+    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(ValTy);
+    return LT.first;
+  }
+
+  return TargetTransformInfo::getCmpSelInstrCost(Opcode, ValTy, CondTy);
+}
+
+unsigned ARMTTI::getAddressComputationCost(Type *Ty) const {
+  // In many cases the address computation is not merged into the instruction
+  // addressing mode.
+  return 1;
+}
+
+unsigned ARMTTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index,
+                                Type *SubTp) const {
+  // We only handle costs of reverse shuffles for now.
+  if (Kind != SK_Reverse)
+    return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
+
+  static const CostTblEntry<MVT> NEONShuffleTbl[] = {
+    // Reverse shuffles cost one instruction if we are shuffling within a
+    // double word (vrev) or two if we shuffle a quad word (vrev, vext).
+    { ISD::VECTOR_SHUFFLE, MVT::v2i32, 1 },
+    { ISD::VECTOR_SHUFFLE, MVT::v2f32, 1 },
+    { ISD::VECTOR_SHUFFLE, MVT::v2i64, 1 },
+    { ISD::VECTOR_SHUFFLE, MVT::v2f64, 1 },
+
+    { ISD::VECTOR_SHUFFLE, MVT::v4i32, 2 },
+    { ISD::VECTOR_SHUFFLE, MVT::v4f32, 2 },
+    { ISD::VECTOR_SHUFFLE, MVT::v8i16, 2 },
+    { ISD::VECTOR_SHUFFLE, MVT::v16i8, 2 }
+  };
+
+  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp);
+
+  int Idx = CostTableLookup<MVT>(NEONShuffleTbl, array_lengthof(NEONShuffleTbl),
+                                 ISD::VECTOR_SHUFFLE, LT.second);
+  if (Idx == -1)
+    return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
+
+  return LT.first * NEONShuffleTbl[Idx].Cost;
+}
diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index ad37a21..6c678fd 100644
--- a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -18,7 +18,9 @@
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCAssembler.h"
 #include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCELFStreamer.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCInstrDesc.h"
@@ -28,6 +30,7 @@
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/ELF.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/SourceMgr.h"
 #include "llvm/Support/TargetRegistry.h"
@@ -250,6 +253,13 @@ public:
     // Not in an ITBlock to start with.
     ITState.CurPosition = ~0U;
+
+    // Set ELF header flags.
+    // FIXME: This should eventually end up somewhere else where more
+    // intelligent flag decisions can be made. For now we are just maintaining
+    // the status quo for ARM and setting EF_ARM_EABI_VER5 as the default.
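Stepping back to the TTI cost tables above, a hedged sketch of how a client such as the loop vectorizer would observe them on a NEON subtarget; the TTI and Ctx parameters are assumed, and the expected values come from the NEONShuffleTbl and NEONVectorConversionTbl entries. The ELF header-flag code the FIXME refers to continues below.

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/LLVMContext.h"

static unsigned queryARMCosts(const llvm::TargetTransformInfo &TTI,
                              llvm::LLVMContext &Ctx) {
  // Reversing a v16i8 costs 2 (vrev64 + vext), per
  // { ISD::VECTOR_SHUFFLE, MVT::v16i8, 2 }.
  llvm::Type *Bytes = llvm::VectorType::get(llvm::Type::getInt8Ty(Ctx), 16);
  unsigned RevCost = TTI.getShuffleCost(
      llvm::TargetTransformInfo::SK_Reverse, Bytes, /*Index=*/0, /*SubTp=*/0);

  // A v4i32 -> v4f32 signed conversion costs 1, per
  // { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }.
  llvm::Type *I32s = llvm::VectorType::get(llvm::Type::getInt32Ty(Ctx), 4);
  llvm::Type *F32s = llvm::VectorType::get(llvm::Type::getFloatTy(Ctx), 4);
  unsigned CvtCost =
      TTI.getCastInstrCost(llvm::Instruction::SIntToFP, F32s, I32s);

  return RevCost + CvtCost; // 3 in total on NEON
}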
+ if (MCELFStreamer *MES = dyn_cast<MCELFStreamer>(&Parser.getStreamer())) + MES->getAssembler().setELFHeaderEFlags(ELF::EF_ARM_EABI_VER5); } // Implementation of the MCTargetAsmParser interface: @@ -259,6 +269,7 @@ public: SmallVectorImpl<MCParsedAsmOperand*> &Operands); bool ParseDirective(AsmToken DirectiveID); + unsigned validateTargetOperandClass(MCParsedAsmOperand *Op, unsigned Kind); unsigned checkTargetMatchPredicate(MCInst &Inst); bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, @@ -271,7 +282,7 @@ public: namespace { /// ARMOperand - Instances of this class represent a parsed ARM machine -/// instruction. +/// operand. class ARMOperand : public MCParsedAsmOperand { enum KindTy { k_CondCode, @@ -2557,7 +2568,7 @@ int ARMAsmParser::tryParseShiftRegister( Parser.Lex(); // Eat hash. SMLoc ImmLoc = Parser.getTok().getLoc(); const MCExpr *ShiftExpr = 0; - if (getParser().ParseExpression(ShiftExpr, EndLoc)) { + if (getParser().parseExpression(ShiftExpr, EndLoc)) { Error(ImmLoc, "invalid immediate shift value"); return -1; } @@ -2640,7 +2651,7 @@ tryParseRegisterWithWriteBack(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { Parser.Lex(); // Eat left bracket token. const MCExpr *ImmVal; - if (getParser().ParseExpression(ImmVal)) + if (getParser().parseExpression(ImmVal)) return true; const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(ImmVal); if (!MCE) @@ -2785,7 +2796,7 @@ parseCoprocOptionOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { const MCExpr *Expr; SMLoc Loc = Parser.getTok().getLoc(); - if (getParser().ParseExpression(Expr)) { + if (getParser().parseExpression(Expr)) { Error(Loc, "illegal expression"); return MatchOperand_ParseFail; } @@ -2998,7 +3009,7 @@ parseVectorLane(VectorLaneTy &LaneKind, unsigned &Index, SMLoc &EndLoc) { const MCExpr *LaneIndex; SMLoc Loc = Parser.getTok().getLoc(); - if (getParser().ParseExpression(LaneIndex)) { + if (getParser().parseExpression(LaneIndex)) { Error(Loc, "illegal expression"); return MatchOperand_ParseFail; } @@ -3316,7 +3327,7 @@ parseMemBarrierOptOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { SMLoc Loc = Parser.getTok().getLoc(); const MCExpr *MemBarrierID; - if (getParser().ParseExpression(MemBarrierID)) { + if (getParser().parseExpression(MemBarrierID)) { Error(Loc, "illegal expression"); return MatchOperand_ParseFail; } @@ -3532,7 +3543,7 @@ parsePKHImm(SmallVectorImpl<MCParsedAsmOperand*> &Operands, StringRef Op, const MCExpr *ShiftAmount; SMLoc Loc = Parser.getTok().getLoc(); SMLoc EndLoc; - if (getParser().ParseExpression(ShiftAmount, EndLoc)) { + if (getParser().parseExpression(ShiftAmount, EndLoc)) { Error(Loc, "illegal expression"); return MatchOperand_ParseFail; } @@ -3612,7 +3623,7 @@ parseShifterImm(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { const MCExpr *ShiftAmount; SMLoc EndLoc; - if (getParser().ParseExpression(ShiftAmount, EndLoc)) { + if (getParser().parseExpression(ShiftAmount, EndLoc)) { Error(ExLoc, "malformed shift expression"); return MatchOperand_ParseFail; } @@ -3673,7 +3684,7 @@ parseRotImm(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { const MCExpr *ShiftAmount; SMLoc EndLoc; - if (getParser().ParseExpression(ShiftAmount, EndLoc)) { + if (getParser().parseExpression(ShiftAmount, EndLoc)) { Error(ExLoc, "malformed rotate expression"); return MatchOperand_ParseFail; } @@ -3710,7 +3721,7 @@ parseBitfield(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { const MCExpr *LSBExpr; SMLoc E = Parser.getTok().getLoc(); - if (getParser().ParseExpression(LSBExpr)) { + if 
(getParser().parseExpression(LSBExpr)) { Error(E, "malformed immediate expression"); return MatchOperand_ParseFail; } @@ -3743,7 +3754,7 @@ parseBitfield(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { const MCExpr *WidthExpr; SMLoc EndLoc; - if (getParser().ParseExpression(WidthExpr, EndLoc)) { + if (getParser().parseExpression(WidthExpr, EndLoc)) { Error(E, "malformed immediate expression"); return MatchOperand_ParseFail; } @@ -3839,7 +3850,7 @@ parseAM3Offset(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { bool isNegative = Parser.getTok().is(AsmToken::Minus); const MCExpr *Offset; SMLoc E; - if (getParser().ParseExpression(Offset, E)) + if (getParser().parseExpression(Offset, E)) return MatchOperand_ParseFail; const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Offset); if (!CE) { @@ -4226,9 +4237,10 @@ parseMemory(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { if (BaseRegNum == -1) return Error(BaseRegTok.getLoc(), "register expected"); - // The next token must either be a comma or a closing bracket. + // The next token must either be a comma, a colon or a closing bracket. const AsmToken &Tok = Parser.getTok(); - if (!Tok.is(AsmToken::Comma) && !Tok.is(AsmToken::RBrac)) + if (!Tok.is(AsmToken::Colon) && !Tok.is(AsmToken::Comma) && + !Tok.is(AsmToken::RBrac)) return Error(Tok.getLoc(), "malformed memory operand"); if (Tok.is(AsmToken::RBrac)) { @@ -4248,8 +4260,11 @@ parseMemory(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { return false; } - assert(Tok.is(AsmToken::Comma) && "Lost comma in memory operand?!"); - Parser.Lex(); // Eat the comma. + assert((Tok.is(AsmToken::Colon) || Tok.is(AsmToken::Comma)) && + "Lost colon or comma in memory operand?!"); + if (Tok.is(AsmToken::Comma)) { + Parser.Lex(); // Eat the comma. + } // If we have a ':', it's an alignment specifier. if (Parser.getTok().is(AsmToken::Colon)) { @@ -4257,7 +4272,7 @@ parseMemory(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { E = Parser.getTok().getLoc(); const MCExpr *Expr; - if (getParser().ParseExpression(Expr)) + if (getParser().parseExpression(Expr)) return true; // The expression has to be a constant. Memory references with relocations @@ -4313,7 +4328,7 @@ parseMemory(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { bool isNegative = getParser().getTok().is(AsmToken::Minus); const MCExpr *Offset; - if (getParser().ParseExpression(Offset)) + if (getParser().parseExpression(Offset)) return true; // The expression has to be a constant. Memory references with relocations @@ -4432,7 +4447,7 @@ bool ARMAsmParser::parseMemRegOffsetShift(ARM_AM::ShiftOpc &St, Parser.Lex(); // Eat hash token. const MCExpr *Expr; - if (getParser().ParseExpression(Expr)) + if (getParser().parseExpression(Expr)) return true; // Range check the immediate. // lsl, ror: 0 <= imm <= 31 @@ -4461,7 +4476,7 @@ bool ARMAsmParser::parseMemRegOffsetShift(ARM_AM::ShiftOpc &St, ARMAsmParser::OperandMatchResultTy ARMAsmParser:: parseFPImm(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { // Anything that can accept a floating point constant as an operand - // needs to go through here, as the regular ParseExpression is + // needs to go through here, as the regular parseExpression is // integer only. // // This routine still creates a generic Immediate operand, containing @@ -4581,7 +4596,7 @@ bool ARMAsmParser::parseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands, // identifier (like labels) as expressions and create them as immediates. 
const MCExpr *IdVal; S = Parser.getTok().getLoc(); - if (getParser().ParseExpression(IdVal)) + if (getParser().parseExpression(IdVal)) return true; E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1); Operands.push_back(ARMOperand::CreateImm(IdVal, S, E)); @@ -4600,7 +4615,7 @@ bool ARMAsmParser::parseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands, if (Parser.getTok().isNot(AsmToken::Colon)) { bool isNegative = Parser.getTok().is(AsmToken::Minus); const MCExpr *ImmVal; - if (getParser().ParseExpression(ImmVal)) + if (getParser().parseExpression(ImmVal)) return true; const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(ImmVal); if (CE) { @@ -4610,6 +4625,15 @@ bool ARMAsmParser::parseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands, } E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1); Operands.push_back(ARMOperand::CreateImm(ImmVal, S, E)); + + // There can be a trailing '!' on operands that we want as a separate + // '!' Token operand. Handle that here. For example, the compatibility + // alias for 'srsdb sp!, #imm' is 'srsdb #imm!'. + if (Parser.getTok().is(AsmToken::Exclaim)) { + Operands.push_back(ARMOperand::CreateToken(Parser.getTok().getString(), + Parser.getTok().getLoc())); + Parser.Lex(); // Eat exclaim token + } return false; } // w/ a ':' after the '#', it's just like a plain ':'. @@ -4624,7 +4648,7 @@ bool ARMAsmParser::parseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands, return true; const MCExpr *SubExprVal; - if (getParser().ParseExpression(SubExprVal)) + if (getParser().parseExpression(SubExprVal)) return true; const MCExpr *ExprVal = ARMMCExpr::Create(RefKind, SubExprVal, @@ -4997,7 +5021,7 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, // In Thumb1, only the branch (B) instruction can be predicated. if (isThumbOne() && PredicationCode != ARMCC::AL && Mnemonic != "b") { - Parser.EatToEndOfStatement(); + Parser.eatToEndOfStatement(); return Error(NameLoc, "conditional execution not supported in Thumb1"); } @@ -5011,14 +5035,14 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, if (Mnemonic == "it") { SMLoc Loc = SMLoc::getFromPointer(NameLoc.getPointer() + 2); if (ITMask.size() > 3) { - Parser.EatToEndOfStatement(); + Parser.eatToEndOfStatement(); return Error(Loc, "too many conditions on IT instruction"); } unsigned Mask = 8; for (unsigned i = ITMask.size(); i != 0; --i) { char pos = ITMask[i - 1]; if (pos != 't' && pos != 'e') { - Parser.EatToEndOfStatement(); + Parser.eatToEndOfStatement(); return Error(Loc, "illegal IT block condition mask '" + ITMask + "'"); } Mask >>= 1; @@ -5044,14 +5068,14 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, // If we had a carry-set on an instruction that can't do that, issue an // error. if (!CanAcceptCarrySet && CarrySetting) { - Parser.EatToEndOfStatement(); + Parser.eatToEndOfStatement(); return Error(NameLoc, "instruction '" + Mnemonic + "' can not set flags, but 's' suffix specified"); } // If we had a predication code on an instruction that can't do that, issue an // error.
if (!CanAcceptPredicationCode && PredicationCode != ARMCC::AL) { - Parser.EatToEndOfStatement(); + Parser.eatToEndOfStatement(); return Error(NameLoc, "instruction '" + Mnemonic + "' is not predicable, but condition code specified"); } @@ -5100,7 +5124,7 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, if (getLexer().isNot(AsmToken::EndOfStatement)) { // Read the first operand. if (parseOperand(Operands, Mnemonic)) { - Parser.EatToEndOfStatement(); + Parser.eatToEndOfStatement(); return true; } @@ -5109,7 +5133,7 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, // Parse and remember the operand. if (parseOperand(Operands, Mnemonic)) { - Parser.EatToEndOfStatement(); + Parser.eatToEndOfStatement(); return true; } } @@ -5117,7 +5141,7 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, if (getLexer().isNot(AsmToken::EndOfStatement)) { SMLoc Loc = getLexer().getLoc(); - Parser.EatToEndOfStatement(); + Parser.eatToEndOfStatement(); return Error(Loc, "unexpected token in argument list"); } @@ -5148,53 +5172,6 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, delete Op; } - // The vector-compare-to-zero instructions have a literal token "#0" at - // the end that comes to here as an immediate operand. Convert it to a - // token to play nicely with the matcher. - if ((Mnemonic == "vceq" || Mnemonic == "vcge" || Mnemonic == "vcgt" || - Mnemonic == "vcle" || Mnemonic == "vclt") && Operands.size() == 6 && - static_cast<ARMOperand*>(Operands[5])->isImm()) { - ARMOperand *Op = static_cast<ARMOperand*>(Operands[5]); - const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Op->getImm()); - if (CE && CE->getValue() == 0) { - Operands.erase(Operands.begin() + 5); - Operands.push_back(ARMOperand::CreateToken("#0", Op->getStartLoc())); - delete Op; - } - } - // VCMP{E} does the same thing, but with a different operand count. - if ((Mnemonic == "vcmp" || Mnemonic == "vcmpe") && Operands.size() == 5 && - static_cast<ARMOperand*>(Operands[4])->isImm()) { - ARMOperand *Op = static_cast<ARMOperand*>(Operands[4]); - const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Op->getImm()); - if (CE && CE->getValue() == 0) { - Operands.erase(Operands.begin() + 4); - Operands.push_back(ARMOperand::CreateToken("#0", Op->getStartLoc())); - delete Op; - } - } - // Similarly, the Thumb1 "RSB" instruction has a literal "#0" on the - // end. Convert it to a token here. Take care not to convert those - // that should hit the Thumb2 encoding. - if (Mnemonic == "rsb" && isThumb() && Operands.size() == 6 && - static_cast<ARMOperand*>(Operands[3])->isReg() && - static_cast<ARMOperand*>(Operands[4])->isReg() && - static_cast<ARMOperand*>(Operands[5])->isImm()) { - ARMOperand *Op = static_cast<ARMOperand*>(Operands[5]); - const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Op->getImm()); - if (CE && CE->getValue() == 0 && - (isThumbOne() || - // The cc_out operand matches the IT block. - ((inITBlock() != CarrySetting) && - // Neither register operand is a high register. - (isARMLowRegister(static_cast<ARMOperand*>(Operands[3])->getReg()) && - isARMLowRegister(static_cast<ARMOperand*>(Operands[4])->getReg()))))){ - Operands.erase(Operands.begin() + 5); - Operands.push_back(ARMOperand::CreateToken("#0", Op->getStartLoc())); - delete Op; - } - } - // Adjust operands of ldrexd/strexd to MCK_GPRPair. // ldrexd/strexd require even/odd GPR pair. 
To enforce this constraint, // a single GPRPair reg operand is used in the .td file to replace the two @@ -7646,10 +7623,10 @@ bool ARMAsmParser::parseDirectiveWord(unsigned Size, SMLoc L) { if (getLexer().isNot(AsmToken::EndOfStatement)) { for (;;) { const MCExpr *Value; - if (getParser().ParseExpression(Value)) + if (getParser().parseExpression(Value)) return true; - getParser().getStreamer().EmitValue(Value, Size, 0/*addrspace*/); + getParser().getStreamer().EmitValue(Value, Size); if (getLexer().is(AsmToken::EndOfStatement)) break; @@ -7793,13 +7770,13 @@ bool ARMAsmParser::parseDirectiveReq(StringRef Name, SMLoc L) { unsigned Reg; SMLoc SRegLoc, ERegLoc; if (ParseRegister(Reg, SRegLoc, ERegLoc)) { - Parser.EatToEndOfStatement(); + Parser.eatToEndOfStatement(); return Error(SRegLoc, "register name expected"); } // Shouldn't be anything else. if (Parser.getTok().isNot(AsmToken::EndOfStatement)) { - Parser.EatToEndOfStatement(); + Parser.eatToEndOfStatement(); return Error(Parser.getTok().getLoc(), "unexpected input in .req directive."); } @@ -7817,7 +7794,7 @@ bool ARMAsmParser::parseDirectiveReq(StringRef Name, SMLoc L) { /// ::= .unreq registername bool ARMAsmParser::parseDirectiveUnreq(SMLoc L) { if (Parser.getTok().isNot(AsmToken::Identifier)) { - Parser.EatToEndOfStatement(); + Parser.eatToEndOfStatement(); return Error(L, "unexpected input in .unreq directive."); } RegisterReqs.erase(Parser.getTok().getIdentifier()); @@ -7847,3 +7824,21 @@ extern "C" void LLVMInitializeARMAsmParser() { #define GET_SUBTARGET_FEATURE_NAME #define GET_MATCHER_IMPLEMENTATION #include "ARMGenAsmMatcher.inc" + +// Define this matcher function after the auto-generated include so we +// have the match class enum definitions. +unsigned ARMAsmParser::validateTargetOperandClass(MCParsedAsmOperand *AsmOp, + unsigned Kind) { + ARMOperand *Op = static_cast<ARMOperand*>(AsmOp); + // If the kind is a token for a literal immediate, check if our asm + // operand matches. This is for InstAliases which have a fixed-value + // immediate in the syntax. + if (Kind == MCK__35_0 && Op->isImm()) { + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Op->getImm()); + if (!CE) + return Match_InvalidOperand; + if (CE->getValue() == 0) + return Match_Success; + } + return Match_InvalidOperand; +} diff --git a/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp b/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp index d48b37e..2afb20d 100644 --- a/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp +++ b/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp @@ -627,8 +627,7 @@ void ARMInstPrinter::printAddrMode6Operand(const MCInst *MI, unsigned OpNum, O << markup("<mem:") << "["; printRegName(O, MO1.getReg()); if (MO2.getImm()) { - // FIXME: Both darwin as and GNU as violate ARM docs here. 
- O << ", :" << (MO2.getImm() << 3); + O << ":" << (MO2.getImm() << 3); } O << "]" << markup(">"); } diff --git a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp index 1f1b334..e66e985 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp @@ -11,6 +11,7 @@ #include "MCTargetDesc/ARMAddressingModes.h" #include "MCTargetDesc/ARMBaseInfo.h" #include "MCTargetDesc/ARMFixupKinds.h" +#include "llvm/ADT/StringSwitch.h" #include "llvm/MC/MCAsmBackend.h" #include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCContext.h" @@ -663,25 +664,20 @@ MCAsmBackend *llvm::createARMAsmBackend(const Target &T, StringRef TT, StringRef Triple TheTriple(TT); if (TheTriple.isOSDarwin()) { - if (TheTriple.getArchName() == "armv4t" || - TheTriple.getArchName() == "thumbv4t") - return new DarwinARMAsmBackend(T, TT, object::mach::CSARM_V4T); - else if (TheTriple.getArchName() == "armv5e" || - TheTriple.getArchName() == "thumbv5e") - return new DarwinARMAsmBackend(T, TT, object::mach::CSARM_V5TEJ); - else if (TheTriple.getArchName() == "armv6" || - TheTriple.getArchName() == "thumbv6") - return new DarwinARMAsmBackend(T, TT, object::mach::CSARM_V6); - else if (TheTriple.getArchName() == "armv7f" || - TheTriple.getArchName() == "thumbv7f") - return new DarwinARMAsmBackend(T, TT, object::mach::CSARM_V7F); - else if (TheTriple.getArchName() == "armv7k" || - TheTriple.getArchName() == "thumbv7k") - return new DarwinARMAsmBackend(T, TT, object::mach::CSARM_V7K); - else if (TheTriple.getArchName() == "armv7s" || - TheTriple.getArchName() == "thumbv7s") - return new DarwinARMAsmBackend(T, TT, object::mach::CSARM_V7S); - return new DarwinARMAsmBackend(T, TT, object::mach::CSARM_V7); + object::mach::CPUSubtypeARM CS = + StringSwitch<object::mach::CPUSubtypeARM>(TheTriple.getArchName()) + .Cases("armv4t", "thumbv4t", object::mach::CSARM_V4T) + .Cases("armv5e", "thumbv5e",object::mach::CSARM_V5TEJ) + .Cases("armv6", "thumbv6", object::mach::CSARM_V6) + .Cases("armv6m", "thumbv6m", object::mach::CSARM_V6M) + .Cases("armv7em", "thumbv7em", object::mach::CSARM_V7EM) + .Cases("armv7f", "thumbv7f", object::mach::CSARM_V7F) + .Cases("armv7k", "thumbv7k", object::mach::CSARM_V7K) + .Cases("armv7m", "thumbv7m", object::mach::CSARM_V7M) + .Cases("armv7s", "thumbv7s", object::mach::CSARM_V7S) + .Default(object::mach::CSARM_V7); + + return new DarwinARMAsmBackend(T, TT, CS); } if (TheTriple.isOSWindows()) diff --git a/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp b/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp index 9193e40..f98bbd2 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp @@ -37,7 +37,6 @@ namespace { virtual unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup, bool IsPCRel, bool IsRelocWithSymbol, int64_t Addend) const; - virtual unsigned getEFlags() const; virtual const MCSymbol *ExplicitRelSym(const MCAssembler &Asm, const MCValue &Target, const MCFragment &F, @@ -53,11 +52,6 @@ ARMELFObjectWriter::ARMELFObjectWriter(uint8_t OSABI) ARMELFObjectWriter::~ARMELFObjectWriter() {} -// FIXME: get the real EABI Version from the Triple. -unsigned ARMELFObjectWriter::getEFlags() const { - return ELF::EF_ARM_EABIMASK & DefaultEABIVersion; -} - // In ARM, _MergedGlobals and other most symbols get emitted directly. // I.e. not as an offset to a section symbol. // This code is an approximation of what ARM/gcc does. 
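The createARMAsmBackend hunk above swaps a chain of getArchName() if/else comparisons for llvm::StringSwitch, which expresses the same string-to-value mapping as one declarative table. A minimal sketch of the pattern, assuming only LLVM's ADT headers; the ArchKind enum and classifyArch function are illustrative stand-ins, not the real object::mach::CPUSubtypeARM constants used in the backend:

    #include "llvm/ADT/StringRef.h"
    #include "llvm/ADT/StringSwitch.h"

    // Stand-in result type; the hunk above maps to Mach-O CPU subtypes instead.
    enum ArchKind { AK_V4T, AK_V5TEJ, AK_V6, AK_V7 };

    static ArchKind classifyArch(llvm::StringRef Name) {
      // Each .Cases entry covers the arm/thumb spelling pair in one line;
      // .Default supplies the fallback that the old trailing return provided.
      return llvm::StringSwitch<ArchKind>(Name)
          .Cases("armv4t", "thumbv4t", AK_V4T)
          .Cases("armv5e", "thumbv5e", AK_V5TEJ)
          .Cases("armv6", "thumbv6", AK_V6)
          .Default(AK_V7);
    }

The first matching entry wins, so the table reads top to bottom exactly like the comparison chain it replaces, and supporting a new subtype (as the hunk does for armv6m, armv7em, and armv7m) becomes a one-line addition instead of a new else-if block.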
diff --git a/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp index 39ded8f..418971d 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp @@ -13,6 +13,7 @@ // //===----------------------------------------------------------------------===// +#include "ARMUnwindOp.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Twine.h" #include "llvm/MC/MCAsmBackend.h" @@ -53,14 +54,27 @@ namespace { /// by MachO. Beware! class ARMELFStreamer : public MCELFStreamer { public: - ARMELFStreamer(MCContext &Context, MCAsmBackend &TAB, - raw_ostream &OS, MCCodeEmitter *Emitter, bool IsThumb) - : MCELFStreamer(Context, TAB, OS, Emitter), - IsThumb(IsThumb), MappingSymbolCounter(0), LastEMS(EMS_None) { - } + ARMELFStreamer(MCContext &Context, MCAsmBackend &TAB, raw_ostream &OS, + MCCodeEmitter *Emitter, bool IsThumb) + : MCELFStreamer(SK_ARMELFStreamer, Context, TAB, OS, Emitter), + IsThumb(IsThumb), MappingSymbolCounter(0), LastEMS(EMS_None), ExTab(0), + FnStart(0), Personality(0), CantUnwind(false) {} ~ARMELFStreamer() {} + // ARM exception handling directives + virtual void EmitFnStart(); + virtual void EmitFnEnd(); + virtual void EmitCantUnwind(); + virtual void EmitPersonality(const MCSymbol *Per); + virtual void EmitHandlerData(); + virtual void EmitSetFP(unsigned NewFpReg, + unsigned NewSpReg, + int64_t Offset = 0); + virtual void EmitPad(int64_t Offset); + virtual void EmitRegSave(const SmallVectorImpl<unsigned> &RegList, + bool isVector); + virtual void ChangeSection(const MCSection *Section) { // We have to keep track of the mapping symbol state of any sections we // use. Each one should start off as EMS_None, which is provided as the @@ -119,6 +133,10 @@ public: } } + static bool classof(const MCStreamer *S) { + return S->getKind() == SK_ARMELFStreamer; + } + private: enum ElfMappingSymbol { EMS_None, @@ -172,6 +190,15 @@ private: SD.setFlags(SD.getFlags() | ELF_Other_ThumbFunc); } + // Helper functions for ARM exception handling directives + void Reset(); + + void EmitPersonalityFixup(StringRef Name); + + void SwitchToEHSection(const char *Prefix, unsigned Type, unsigned Flags, + SectionKind Kind, const MCSymbol &Fn); + void SwitchToExTabSection(const MCSymbol &FnStart); + void SwitchToExIdxSection(const MCSymbol &FnStart); bool IsThumb; int64_t MappingSymbolCounter; @@ -179,10 +206,200 @@ private: DenseMap<const MCSection *, ElfMappingSymbol> LastMappingSymbols; ElfMappingSymbol LastEMS; - /// @} + // ARM Exception Handling Frame Information + MCSymbol *ExTab; + MCSymbol *FnStart; + const MCSymbol *Personality; + bool CantUnwind; }; } +inline void ARMELFStreamer::SwitchToEHSection(const char *Prefix, + unsigned Type, + unsigned Flags, + SectionKind Kind, + const MCSymbol &Fn) { + const MCSectionELF &FnSection = + static_cast<const MCSectionELF &>(Fn.getSection()); + + // Create the name for new section + StringRef FnSecName(FnSection.getSectionName()); + SmallString<128> EHSecName(Prefix); + if (FnSecName != ".text") { + EHSecName += FnSecName; + } + + // Get .ARM.extab or .ARM.exidx section + const MCSectionELF *EHSection = NULL; + if (const MCSymbol *Group = FnSection.getGroup()) { + EHSection = getContext().getELFSection( + EHSecName, Type, Flags | ELF::SHF_GROUP, Kind, + FnSection.getEntrySize(), Group->getName()); + } else { + EHSection = getContext().getELFSection(EHSecName, Type, Flags, Kind); + } + assert(EHSection); + + // Switch to .ARM.extab or .ARM.exidx section + 
SwitchSection(EHSection); + EmitCodeAlignment(4, 0); +} + +inline void ARMELFStreamer::SwitchToExTabSection(const MCSymbol &FnStart) { + SwitchToEHSection(".ARM.extab", + ELF::SHT_PROGBITS, + ELF::SHF_ALLOC, + SectionKind::getDataRel(), + FnStart); +} + +inline void ARMELFStreamer::SwitchToExIdxSection(const MCSymbol &FnStart) { + SwitchToEHSection(".ARM.exidx", + ELF::SHT_ARM_EXIDX, + ELF::SHF_ALLOC | ELF::SHF_LINK_ORDER, + SectionKind::getDataRel(), + FnStart); +} + +void ARMELFStreamer::Reset() { + ExTab = NULL; + FnStart = NULL; + Personality = NULL; + CantUnwind = false; +} + +// Add the R_ARM_NONE fixup at the same position +void ARMELFStreamer::EmitPersonalityFixup(StringRef Name) { + const MCSymbol *PersonalitySym = getContext().GetOrCreateSymbol(Name); + + const MCSymbolRefExpr *PersonalityRef = + MCSymbolRefExpr::Create(PersonalitySym, + MCSymbolRefExpr::VK_ARM_NONE, + getContext()); + + AddValueSymbols(PersonalityRef); + MCDataFragment *DF = getOrCreateDataFragment(); + DF->getFixups().push_back( + MCFixup::Create(DF->getContents().size(), PersonalityRef, + MCFixup::getKindForSize(4, false))); +} + +void ARMELFStreamer::EmitFnStart() { + assert(FnStart == 0); + FnStart = getContext().CreateTempSymbol(); + EmitLabel(FnStart); +} + +void ARMELFStreamer::EmitFnEnd() { + assert(FnStart && ".fnstart must precede .fnend"); + + // Emit unwind opcodes if there is no .handlerdata directive + int PersonalityIndex = -1; + if (!ExTab && !CantUnwind) { + // For __aeabi_unwind_cpp_pr1, we have to emit opcodes in .ARM.extab. + SwitchToExTabSection(*FnStart); + + // Create .ARM.extab label for offset in .ARM.exidx + ExTab = getContext().CreateTempSymbol(); + EmitLabel(ExTab); + + PersonalityIndex = 1; + + uint32_t Entry = 0; + uint32_t NumExtraEntryWords = 0; + Entry |= NumExtraEntryWords << 24; + Entry |= (EHT_COMPACT | PersonalityIndex) << 16; + + // TODO: This should be generated according to .save, .vsave, .setfp + // directives. Currently, we are simply generating FINISH opcode. + Entry |= UNWIND_OPCODE_FINISH << 8; + Entry |= UNWIND_OPCODE_FINISH; + + EmitIntValue(Entry, 4, 0); + } + + // Emit the exception index table entry + SwitchToExIdxSection(*FnStart); + + if (PersonalityIndex == 1) + EmitPersonalityFixup("__aeabi_unwind_cpp_pr1"); + + const MCSymbolRefExpr *FnStartRef = + MCSymbolRefExpr::Create(FnStart, + MCSymbolRefExpr::VK_ARM_PREL31, + getContext()); + + EmitValue(FnStartRef, 4, 0); + + if (CantUnwind) { + EmitIntValue(EXIDX_CANTUNWIND, 4, 0); + } else { + const MCSymbolRefExpr *ExTabEntryRef = + MCSymbolRefExpr::Create(ExTab, + MCSymbolRefExpr::VK_ARM_PREL31, + getContext()); + EmitValue(ExTabEntryRef, 4, 0); + } + + // Clean exception handling frame information + Reset(); +} + +void ARMELFStreamer::EmitCantUnwind() { + CantUnwind = true; +} + +void ARMELFStreamer::EmitHandlerData() { + SwitchToExTabSection(*FnStart); + + // Create .ARM.extab label for offset in .ARM.exidx + assert(!ExTab); + ExTab = getContext().CreateTempSymbol(); + EmitLabel(ExTab); + + // Emit Personality + assert(Personality && ".personality directive must precede .handlerdata"); + + const MCSymbolRefExpr *PersonalityRef = + MCSymbolRefExpr::Create(Personality, + MCSymbolRefExpr::VK_ARM_PREL31, + getContext()); + + EmitValue(PersonalityRef, 4, 0); + + // Emit unwind opcodes + uint32_t Entry = 0; + uint32_t NumExtraEntryWords = 0; + + // TODO: This should be generated according to .save, .vsave, .setfp + // directives. Currently, we are simply generating FINISH opcode.
+ Entry |= NumExtraEntryWords << 24; + Entry |= UNWIND_OPCODE_FINISH << 16; + Entry |= UNWIND_OPCODE_FINISH << 8; + Entry |= UNWIND_OPCODE_FINISH; + + EmitIntValue(Entry, 4, 0); +} + +void ARMELFStreamer::EmitPersonality(const MCSymbol *Per) { + Personality = Per; +} + +void ARMELFStreamer::EmitSetFP(unsigned NewFpReg, + unsigned NewSpReg, + int64_t Offset) { + // TODO: Not implemented +} + +void ARMELFStreamer::EmitPad(int64_t Offset) { + // TODO: Not implemented +} + +void ARMELFStreamer::EmitRegSave(const SmallVectorImpl<unsigned> &RegList, + bool IsVector) { + // TODO: Not implemented +} + namespace llvm { MCELFStreamer* createARMELFStreamer(MCContext &Context, MCAsmBackend &TAB, raw_ostream &OS, MCCodeEmitter *Emitter, diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h b/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h index b404e6c..cd4067a 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h +++ b/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h @@ -64,6 +64,9 @@ public: return getSubExpr()->FindAssociatedSection(); } + // There are no TLS ARMMCExprs at the moment. + void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const {} + static bool classof(const MCExpr *E) { return E->getKind() == MCExpr::Target; } diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp index f4958f3..f09fb5a 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp @@ -11,11 +11,12 @@ // //===----------------------------------------------------------------------===// -#include "ARMMCTargetDesc.h" #include "ARMBaseInfo.h" #include "ARMELFStreamer.h" #include "ARMMCAsmInfo.h" +#include "ARMMCTargetDesc.h" #include "InstPrinter/ARMInstPrinter.h" +#include "llvm/ADT/Triple.h" #include "llvm/MC/MCCodeGenInfo.h" #include "llvm/MC/MCInstrAnalysis.h" #include "llvm/MC/MCInstrInfo.h" @@ -37,6 +38,8 @@ using namespace llvm; std::string ARM_MC::ParseARMTriple(StringRef TT, StringRef CPU) { + Triple triple(TT); + // Set the boolean corresponding to the current target triple, or the default // if one cannot be determined, to true. unsigned Len = TT.size(); @@ -119,6 +122,13 @@ std::string ARM_MC::ParseARMTriple(StringRef TT, StringRef CPU) { ARMArchFeature += ",+thumb-mode"; } + if (triple.isOSNaCl()) { + if (ARMArchFeature.empty()) + ARMArchFeature = "+nacl-trap"; + else + ARMArchFeature += ",+nacl-trap"; + } + return ARMArchFeature; } diff --git a/lib/Target/ARM/MCTargetDesc/ARMUnwindOp.h b/lib/Target/ARM/MCTargetDesc/ARMUnwindOp.h new file mode 100644 index 0000000..dad5576 --- /dev/null +++ b/lib/Target/ARM/MCTargetDesc/ARMUnwindOp.h @@ -0,0 +1,112 @@ +//===-- ARMUnwindOp.h - ARM Unwind Opcodes ----------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the constants for the ARM unwind opcodes and exception +// handling table entry kinds. 
+// +//===----------------------------------------------------------------------===// + +#ifndef ARM_UNWIND_OP_H +#define ARM_UNWIND_OP_H + +namespace llvm { + + /// ARM exception handling table entry kinds + enum ARMEHTEntryKind { + EHT_GENERIC = 0x00, + EHT_COMPACT = 0x80 + }; + + enum { + /// Special entry for a function that never unwinds + EXIDX_CANTUNWIND = 0x1 + }; + + /// ARM-defined frame unwinding opcodes + enum ARMUnwindOpcodes { + // Format: 00xxxxxx + // Purpose: vsp = vsp + ((x << 2) + 4) + UNWIND_OPCODE_INC_VSP = 0x00, + + // Format: 01xxxxxx + // Purpose: vsp = vsp - ((x << 2) + 4) + UNWIND_OPCODE_DEC_VSP = 0x40, + + // Format: 10000000 00000000 + // Purpose: refuse to unwind + UNWIND_OPCODE_REFUSE = 0x8000, + + // Format: 1000xxxx xxxxxxxx + // Purpose: pop r[15:12], r[11:4] + // Constraint: x != 0 + UNWIND_OPCODE_POP_REG_MASK_R4 = 0x8000, + + // Format: 1001xxxx + // Purpose: vsp = r[x] + // Constraint: x != 13 && x != 15 + UNWIND_OPCODE_SET_VSP = 0x90, + + // Format: 10100xxx + // Purpose: pop r[(4+x):4] + UNWIND_OPCODE_POP_REG_RANGE_R4 = 0xa0, + + // Format: 10101xxx + // Purpose: pop r14, r[(4+x):4] + UNWIND_OPCODE_POP_REG_RANGE_R4_R14 = 0xa8, + + // Format: 10110000 + // Purpose: finish + UNWIND_OPCODE_FINISH = 0xb0, + + // Format: 10110001 0000xxxx + // Purpose: pop r[3:0] + // Constraint: x != 0 + UNWIND_OPCODE_POP_REG_MASK = 0xb100, + + // Format: 10110010 x(uleb128) + // Purpose: vsp = vsp + ((x << 2) + 0x204) + UNWIND_OPCODE_INC_VSP_ULEB128 = 0xb2, + + // Format: 10110011 xxxxyyyy + // Purpose: pop d[(x+y):x] + UNWIND_OPCODE_POP_VFP_REG_RANGE_FSTMFDX = 0xb300, + + // Format: 10111xxx + // Purpose: pop d[(8+x):8] + UNWIND_OPCODE_POP_VFP_REG_RANGE_FSTMFDX_D8 = 0xb8, + + // Format: 11000xxx + // Purpose: pop wR[(10+x):10] + UNWIND_OPCODE_POP_WIRELESS_MMX_REG_RANGE_WR10 = 0xc0, + + // Format: 11000110 xxxxyyyy + // Purpose: pop wR[(x+y):x] + UNWIND_OPCODE_POP_WIRELESS_MMX_REG_RANGE = 0xc600, + + // Format: 11000111 0000xxxx + // Purpose: pop wCGR[3:0] + // Constraint: x != 0 + UNWIND_OPCODE_POP_WIRELESS_MMX_REG_MASK = 0xc700, + + // Format: 11001000 xxxxyyyy + // Purpose: pop d[(16+x+y):(16+x)] + UNWIND_OPCODE_POP_VFP_REG_RANGE_FSTMFDD_D16 = 0xc800, + + // Format: 11001001 xxxxyyyy + // Purpose: pop d[(x+y):x] + UNWIND_OPCODE_POP_VFP_REG_RANGE_FSTMFDD = 0xc900, + + // Format: 11010xxx + // Purpose: pop d[(8+x):8] + UNWIND_OPCODE_POP_VFP_REG_RANGE_FSTMFDD_D8 = 0xd0 + }; + +} + +#endif // ARM_UNWIND_OP_H diff --git a/lib/Target/ARM/Thumb1FrameLowering.cpp b/lib/Target/ARM/Thumb1FrameLowering.cpp index 123ada6..2c3388c 100644 --- a/lib/Target/ARM/Thumb1FrameLowering.cpp +++ b/lib/Target/ARM/Thumb1FrameLowering.cpp @@ -43,6 +43,41 @@ emitSPUpdate(MachineBasicBlock &MBB, MRI, MIFlags); } + +void Thumb1FrameLowering:: +eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const { + const Thumb1InstrInfo &TII = + *static_cast<const Thumb1InstrInfo*>(MF.getTarget().getInstrInfo()); + const Thumb1RegisterInfo *RegInfo = + static_cast<const Thumb1RegisterInfo*>(MF.getTarget().getRegisterInfo()); + if (!hasReservedCallFrame(MF)) { + // If we have alloca, convert as follows: + // ADJCALLSTACKDOWN -> sub, sp, sp, amount + // ADJCALLSTACKUP -> add, sp, sp, amount + MachineInstr *Old = I; + DebugLoc dl = Old->getDebugLoc(); + unsigned Amount = Old->getOperand(0).getImm(); + if (Amount != 0) { + // We need to keep the stack aligned properly.
To do this, we round the + // amount of space needed for the outgoing arguments up to the next + // alignment boundary. + unsigned Align = getStackAlignment(); + Amount = (Amount+Align-1)/Align*Align; + + // Replace the pseudo instruction with a new instruction... + unsigned Opc = Old->getOpcode(); + if (Opc == ARM::ADJCALLSTACKDOWN || Opc == ARM::tADJCALLSTACKDOWN) { + emitSPUpdate(MBB, I, TII, dl, *RegInfo, -Amount); + } else { + assert(Opc == ARM::ADJCALLSTACKUP || Opc == ARM::tADJCALLSTACKUP); + emitSPUpdate(MBB, I, TII, dl, *RegInfo, Amount); + } + } + } + MBB.erase(I); +} + void Thumb1FrameLowering::emitPrologue(MachineFunction &MF) const { MachineBasicBlock &MBB = MF.front(); MachineBasicBlock::iterator MBBI = MBB.begin(); @@ -124,14 +159,17 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF) const { unsigned DPRCSOffset = NumBytes - (GPRCS1Size + GPRCS2Size + DPRCSSize); unsigned GPRCS2Offset = DPRCSOffset + DPRCSSize; unsigned GPRCS1Offset = GPRCS2Offset + GPRCS2Size; - AFI->setFramePtrSpillOffset(MFI->getObjectOffset(FramePtrSpillFI) + NumBytes); + bool HasFP = hasFP(MF); + if (HasFP) + AFI->setFramePtrSpillOffset(MFI->getObjectOffset(FramePtrSpillFI) + + NumBytes); AFI->setGPRCalleeSavedArea1Offset(GPRCS1Offset); AFI->setGPRCalleeSavedArea2Offset(GPRCS2Offset); AFI->setDPRCalleeSavedAreaOffset(DPRCSOffset); NumBytes = DPRCSOffset; // Adjust FP so it point to the stack slot that contains the previous FP. - if (hasFP(MF)) { + if (HasFP) { AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tADDrSPi), FramePtr) .addFrameIndex(FramePtrSpillFI).addImm(0) .setMIFlags(MachineInstr::FrameSetup)); @@ -146,7 +184,7 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF) const { emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, -NumBytes, MachineInstr::FrameSetup); - if (STI.isTargetELF() && hasFP(MF)) + if (STI.isTargetELF() && HasFP) MFI->setOffsetAdjustment(MFI->getOffsetAdjustment() - AFI->getFramePtrSpillOffset()); diff --git a/lib/Target/ARM/Thumb1FrameLowering.h b/lib/Target/ARM/Thumb1FrameLowering.h index bcfc516..5a300af 100644 --- a/lib/Target/ARM/Thumb1FrameLowering.h +++ b/lib/Target/ARM/Thumb1FrameLowering.h @@ -45,6 +45,10 @@ public: const TargetRegisterInfo *TRI) const; bool hasReservedCallFrame(const MachineFunction &MF) const; + + void eliminateCallFramePseudoInstr(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI) const; }; } // End llvm namespace diff --git a/lib/Target/ARM/Thumb1RegisterInfo.cpp b/lib/Target/ARM/Thumb1RegisterInfo.cpp index 57cc7d8..609d502 100644 --- a/lib/Target/ARM/Thumb1RegisterInfo.cpp +++ b/lib/Target/ARM/Thumb1RegisterInfo.cpp @@ -296,47 +296,6 @@ void llvm::emitThumbRegPlusImmediate(MachineBasicBlock &MBB, } } -static void emitSPUpdate(MachineBasicBlock &MBB, - MachineBasicBlock::iterator &MBBI, - const TargetInstrInfo &TII, DebugLoc dl, - const Thumb1RegisterInfo &MRI, - int NumBytes) { - emitThumbRegPlusImmediate(MBB, MBBI, dl, ARM::SP, ARM::SP, NumBytes, TII, - MRI); -} - -void Thumb1RegisterInfo:: -eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, - MachineBasicBlock::iterator I) const { - const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); - - if (!TFI->hasReservedCallFrame(MF)) { - // If we have alloca, convert as follows: - // ADJCALLSTACKDOWN -> sub, sp, sp, amount - // ADJCALLSTACKUP -> add, sp, sp, amount - MachineInstr *Old = I; - DebugLoc dl = Old->getDebugLoc(); - unsigned Amount = Old->getOperand(0).getImm(); - if (Amount != 0) { - // We need to keep 
the stack aligned properly. To do this, we round the - // amount of space needed for the outgoing arguments up to the next - // alignment boundary. - unsigned Align = TFI->getStackAlignment(); - Amount = (Amount+Align-1)/Align*Align; - - // Replace the pseudo instruction with a new instruction... - unsigned Opc = Old->getOpcode(); - if (Opc == ARM::ADJCALLSTACKDOWN || Opc == ARM::tADJCALLSTACKDOWN) { - emitSPUpdate(MBB, I, TII, dl, *this, -Amount); - } else { - assert(Opc == ARM::ADJCALLSTACKUP || Opc == ARM::tADJCALLSTACKUP); - emitSPUpdate(MBB, I, TII, dl, *this, Amount); - } - } - } - MBB.erase(I); -} - /// emitThumbConstant - Emit a series of instructions to materialize a /// constant. static void emitThumbConstant(MachineBasicBlock &MBB, @@ -593,9 +552,9 @@ Thumb1RegisterInfo::saveScavengerRegister(MachineBasicBlock &MBB, void Thumb1RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, - int SPAdj, RegScavenger *RS) const { + int SPAdj, unsigned FIOperandNum, + RegScavenger *RS) const { unsigned VReg = 0; - unsigned i = 0; MachineInstr &MI = *II; MachineBasicBlock &MBB = *MI.getParent(); MachineFunction &MF = *MBB.getParent(); @@ -603,13 +562,8 @@ Thumb1RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, DebugLoc dl = MI.getDebugLoc(); MachineInstrBuilder MIB(*MBB.getParent(), &MI); - while (!MI.getOperand(i).isFI()) { - ++i; - assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!"); - } - unsigned FrameReg = ARM::SP; - int FrameIndex = MI.getOperand(i).getIndex(); + int FrameIndex = MI.getOperand(FIOperandNum).getIndex(); int Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex) + MF.getFrameInfo()->getStackSize() + SPAdj; @@ -646,15 +600,15 @@ Thumb1RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // Special handling of dbg_value instructions. if (MI.isDebugValue()) { - MI.getOperand(i). ChangeToRegister(FrameReg, false /*isDef*/); - MI.getOperand(i+1).ChangeToImmediate(Offset); + MI.getOperand(FIOperandNum). ChangeToRegister(FrameReg, false /*isDef*/); + MI.getOperand(FIOperandNum+1).ChangeToImmediate(Offset); return; } // Modify MI as necessary to handle as much of 'Offset' as possible assert(AFI->isThumbFunction() && "This eliminateFrameIndex only supports Thumb1!"); - if (rewriteFrameIndex(MI, i, FrameReg, Offset, TII)) + if (rewriteFrameIndex(MI, FIOperandNum, FrameReg, Offset, TII)) return; // If we get here, the immediate doesn't fit into the instruction. We folded @@ -687,11 +641,12 @@ Thumb1RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, } MI.setDesc(TII.get(UseRR ? ARM::tLDRr : ARM::tLDRi)); - MI.getOperand(i).ChangeToRegister(TmpReg, false, false, true); + MI.getOperand(FIOperandNum).ChangeToRegister(TmpReg, false, false, true); if (UseRR) // Use [reg, reg] addrmode. Replace the immediate operand w/ the frame // register. The offset is already handled in the vreg value. - MI.getOperand(i+1).ChangeToRegister(FrameReg, false, false, false); + MI.getOperand(FIOperandNum+1).ChangeToRegister(FrameReg, false, false, + false); } else if (MI.mayStore()) { VReg = MF.getRegInfo().createVirtualRegister(&ARM::tGPRRegClass); bool UseRR = false; @@ -708,11 +663,12 @@ Thumb1RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, emitThumbRegPlusImmediate(MBB, II, dl, VReg, FrameReg, Offset, TII, *this); MI.setDesc(TII.get(UseRR ? 
ARM::tSTRr : ARM::tSTRi)); - MI.getOperand(i).ChangeToRegister(VReg, false, false, true); + MI.getOperand(FIOperandNum).ChangeToRegister(VReg, false, false, true); if (UseRR) // Use [reg, reg] addrmode. Replace the immediate operand w/ the frame // register. The offset is already handled in the vreg value. - MI.getOperand(i+1).ChangeToRegister(FrameReg, false, false, false); + MI.getOperand(FIOperandNum+1).ChangeToRegister(FrameReg, false, false, + false); } else { llvm_unreachable("Unexpected opcode!"); } diff --git a/lib/Target/ARM/Thumb1RegisterInfo.h b/lib/Target/ARM/Thumb1RegisterInfo.h index f2e4b08..ebbab36 100644 --- a/lib/Target/ARM/Thumb1RegisterInfo.h +++ b/lib/Target/ARM/Thumb1RegisterInfo.h @@ -43,11 +43,6 @@ public: unsigned PredReg = 0, unsigned MIFlags = MachineInstr::NoFlags) const; - /// Code Generation virtual methods... - void eliminateCallFramePseudoInstr(MachineFunction &MF, - MachineBasicBlock &MBB, - MachineBasicBlock::iterator I) const; - // rewrite MI to access 'Offset' bytes from the FP. Update Offset to be // however much remains to be handled. Return 'true' if no further // work is required. @@ -62,7 +57,8 @@ public: const TargetRegisterClass *RC, unsigned Reg) const; void eliminateFrameIndex(MachineBasicBlock::iterator II, - int SPAdj, RegScavenger *RS = NULL) const; + int SPAdj, unsigned FIOperandNum, + RegScavenger *RS = NULL) const; }; } diff --git a/lib/Target/CppBackend/CPPBackend.cpp b/lib/Target/CppBackend/CPPBackend.cpp index f468861..604abf9 100644 --- a/lib/Target/CppBackend/CPPBackend.cpp +++ b/lib/Target/CppBackend/CPPBackend.cpp @@ -470,18 +470,19 @@ void CppWriter::printAttributes(const AttributeSet &PAL, nl(Out); if (!PAL.isEmpty()) { Out << '{'; in(); nl(Out); - Out << "SmallVector<AttributeWithIndex, 4> Attrs;"; nl(Out); - Out << "AttributeWithIndex PAWI;"; nl(Out); + Out << "SmallVector<AttributeSet, 4> Attrs;"; nl(Out); + Out << "AttributeSet PAS;"; in(); nl(Out); for (unsigned i = 0; i < PAL.getNumSlots(); ++i) { - unsigned index = PAL.getSlot(i).Index; - AttrBuilder attrs(PAL.getSlot(i).Attrs); - Out << "PAWI.Index = " << index << "U;\n"; - Out << " {\n AttrBuilder B;\n"; - -#define HANDLE_ATTR(X) \ - if (attrs.contains(Attribute::X)) \ - Out << " B.addAttribute(Attribute::" #X ");\n"; \ - attrs.removeAttribute(Attribute::X); + unsigned index = PAL.getSlotIndex(i); + AttrBuilder attrs(PAL.getSlotAttributes(i), index); + Out << "{"; in(); nl(Out); + Out << "AttrBuilder B;"; nl(Out); + +#define HANDLE_ATTR(X) \ + if (attrs.contains(Attribute::X)) { \ + Out << "B.addAttribute(Attribute::" #X ");"; nl(Out); \ + attrs.removeAttribute(Attribute::X); \ + } HANDLE_ATTR(SExt); HANDLE_ATTR(ZExt); @@ -499,6 +500,7 @@ void CppWriter::printAttributes(const AttributeSet &PAL, HANDLE_ATTR(OptimizeForSize); HANDLE_ATTR(StackProtect); HANDLE_ATTR(StackProtectReq); + HANDLE_ATTR(StackProtectStrong); HANDLE_ATTR(NoCapture); HANDLE_ATTR(NoRedZone); HANDLE_ATTR(NoImplicitFloat); @@ -509,14 +511,23 @@ void CppWriter::printAttributes(const AttributeSet &PAL, HANDLE_ATTR(NonLazyBind); HANDLE_ATTR(MinSize); #undef HANDLE_ATTR - if (attrs.contains(Attribute::StackAlignment)) - Out << " B.addStackAlignmentAttr(" << attrs.getStackAlignment() << ")\n"; - attrs.removeAttribute(Attribute::StackAlignment); + + if (attrs.contains(Attribute::StackAlignment)) { + Out << "B.addStackAlignmentAttr(" << attrs.getStackAlignment()<<')'; + nl(Out); + attrs.removeAttribute(Attribute::StackAlignment); + } + assert(!attrs.hasAttributes() && "Unhandled attribute!"); - Out << " 
PAWI.Attrs = Attribute::get(mod->getContext(), B);\n }"; - nl(Out); - Out << "Attrs.push_back(PAWI);"; + Out << "PAS = AttributeSet::get(mod->getContext(), "; + if (index == ~0U) + Out << "~0U,"; + else + Out << index << "U,"; + Out << " B);"; out(); nl(Out); + Out << "}"; out(); nl(Out); nl(Out); + Out << "Attrs.push_back(PAS);"; nl(Out); } Out << name << "_PAL = AttributeSet::get(mod->getContext(), Attrs);"; nl(Out); @@ -1888,23 +1899,24 @@ void CppWriter::printModuleBody() { void CppWriter::printProgram(const std::string& fname, const std::string& mName) { - Out << "#include <llvm/LLVMContext.h>\n"; - Out << "#include <llvm/Module.h>\n"; - Out << "#include <llvm/DerivedTypes.h>\n"; - Out << "#include <llvm/Constants.h>\n"; - Out << "#include <llvm/GlobalVariable.h>\n"; - Out << "#include <llvm/Function.h>\n"; - Out << "#include <llvm/CallingConv.h>\n"; - Out << "#include <llvm/BasicBlock.h>\n"; - Out << "#include <llvm/Instructions.h>\n"; - Out << "#include <llvm/InlineAsm.h>\n"; - Out << "#include <llvm/Support/FormattedStream.h>\n"; - Out << "#include <llvm/Support/MathExtras.h>\n"; Out << "#include <llvm/Pass.h>\n"; Out << "#include <llvm/PassManager.h>\n"; + Out << "#include <llvm/ADT/SmallVector.h>\n"; Out << "#include <llvm/Analysis/Verifier.h>\n"; Out << "#include <llvm/Assembly/PrintModulePass.h>\n"; + Out << "#include <llvm/IR/BasicBlock.h>\n"; + Out << "#include <llvm/IR/CallingConv.h>\n"; + Out << "#include <llvm/IR/Constants.h>\n"; + Out << "#include <llvm/IR/DerivedTypes.h>\n"; + Out << "#include <llvm/IR/Function.h>\n"; + Out << "#include <llvm/IR/GlobalVariable.h>\n"; + Out << "#include <llvm/IR/InlineAsm.h>\n"; + Out << "#include <llvm/IR/Instructions.h>\n"; + Out << "#include <llvm/IR/LLVMContext.h>\n"; + Out << "#include <llvm/IR/Module.h>\n"; + Out << "#include <llvm/Support/FormattedStream.h>\n"; + Out << "#include <llvm/Support/MathExtras.h>\n"; Out << "#include <algorithm>\n"; Out << "using namespace llvm;\n\n"; Out << "Module* " << fname << "();\n\n"; diff --git a/lib/Target/Hexagon/CMakeLists.txt b/lib/Target/Hexagon/CMakeLists.txt index aee43ba..b5b887e 100644 --- a/lib/Target/Hexagon/CMakeLists.txt +++ b/lib/Target/Hexagon/CMakeLists.txt @@ -18,6 +18,7 @@ add_llvm_target(HexagonCodeGen HexagonExpandPredSpillCode.cpp HexagonFrameLowering.cpp HexagonHardwareLoops.cpp + HexagonFixupHwLoops.cpp HexagonMachineScheduler.cpp HexagonMCInstLower.cpp HexagonInstrInfo.cpp diff --git a/lib/Target/Hexagon/Hexagon.h b/lib/Target/Hexagon/Hexagon.h index 45f857b..dfbefc8 100644 --- a/lib/Target/Hexagon/Hexagon.h +++ b/lib/Target/Hexagon/Hexagon.h @@ -21,14 +21,16 @@ namespace llvm { class FunctionPass; + class ModulePass; class TargetMachine; class MachineInstr; - class MCInst; + class HexagonMCInst; class HexagonAsmPrinter; class HexagonTargetMachine; class raw_ostream; - FunctionPass *createHexagonISelDag(HexagonTargetMachine &TM); + FunctionPass *createHexagonISelDag(HexagonTargetMachine &TM, + CodeGenOpt::Level OptLevel); FunctionPass *createHexagonDelaySlotFillerPass(TargetMachine &TM); FunctionPass *createHexagonFPMoverPass(TargetMachine &TM); FunctionPass *createHexagonRemoveExtendOps(HexagonTargetMachine &TM); @@ -53,7 +55,7 @@ namespace llvm { TargetAsmBackend *createHexagonAsmBackend(const Target &, const std::string &); */ - void HexagonLowerToMC(const MachineInstr *MI, MCInst &MCI, + void HexagonLowerToMC(const MachineInstr *MI, HexagonMCInst &MCI, HexagonAsmPrinter &AP); } // end namespace llvm; diff --git a/lib/Target/Hexagon/HexagonAsmPrinter.cpp 
b/lib/Target/Hexagon/HexagonAsmPrinter.cpp index 58b89d1..88cd3fb 100644 --- a/lib/Target/Hexagon/HexagonAsmPrinter.cpp +++ b/lib/Target/Hexagon/HexagonAsmPrinter.cpp @@ -14,12 +14,12 @@ //===----------------------------------------------------------------------===// #define DEBUG_TYPE "asm-printer" -#include "HexagonAsmPrinter.h" #include "Hexagon.h" -#include "HexagonMCInst.h" +#include "HexagonAsmPrinter.h" #include "HexagonMachineFunctionInfo.h" -#include "HexagonSubtarget.h" #include "HexagonTargetMachine.h" +#include "HexagonSubtarget.h" +#include "MCTargetDesc/HexagonMCInst.h" #include "InstPrinter/HexagonInstPrinter.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/SmallVector.h" @@ -220,8 +220,8 @@ void HexagonAsmPrinter::EmitInstruction(const MachineInstr *MI) { assert((Size+IgnoreCount) == MI->getBundleSize() && "Corrupt Bundle!"); for (unsigned Index = 0; Index < Size; Index++) { HexagonMCInst MCI; - MCI.setStartPacket(Index == 0); - MCI.setEndPacket(Index == (Size-1)); + MCI.setPacketStart(Index == 0); + MCI.setPacketEnd(Index == (Size-1)); HexagonLowerToMC(BundleMIs[Index], MCI, *this); OutStreamer.EmitInstruction(MCI); @@ -230,8 +230,8 @@ void HexagonAsmPrinter::EmitInstruction(const MachineInstr *MI) { else { HexagonMCInst MCI; if (MI->getOpcode() == Hexagon::ENDLOOP0) { - MCI.setStartPacket(true); - MCI.setEndPacket(true); + MCI.setPacketStart(true); + MCI.setPacketEnd(true); } HexagonLowerToMC(MI, MCI, *this); OutStreamer.EmitInstruction(MCI); diff --git a/lib/Target/Hexagon/HexagonFixupHwLoops.cpp b/lib/Target/Hexagon/HexagonFixupHwLoops.cpp new file mode 100644 index 0000000..240cc95 --- /dev/null +++ b/lib/Target/Hexagon/HexagonFixupHwLoops.cpp @@ -0,0 +1,183 @@ +//===---- HexagonFixupHwLoops.cpp - Fixup HW loops too far from LOOPn. ----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +// The loop start address in the LOOPn instruction is encoded as a distance +// from the LOOPn instruction itself. If the start address is too far from +// the LOOPn instruction, the loop needs to be set up manually, i.e. via +// direct transfers to SAn and LCn. +// This pass will identify and convert such LOOPn instructions to a proper +// form. +//===----------------------------------------------------------------------===// + + +#include "llvm/ADT/DenseMap.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/RegisterScavenging.h" +#include "llvm/PassSupport.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "Hexagon.h" +#include "HexagonTargetMachine.h" + +using namespace llvm; + +namespace llvm { + void initializeHexagonFixupHwLoopsPass(PassRegistry&); +} + +namespace { + struct HexagonFixupHwLoops : public MachineFunctionPass { + public: + static char ID; + + HexagonFixupHwLoops() : MachineFunctionPass(ID) { + initializeHexagonFixupHwLoopsPass(*PassRegistry::getPassRegistry()); + } + + virtual bool runOnMachineFunction(MachineFunction &MF); + + const char *getPassName() const { return "Hexagon Hardware Loop Fixup"; } + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + private: + /// \brief Maximum distance between the loop instr and the basic block. + /// Just an estimate. 
+ static const unsigned MAX_LOOP_DISTANCE = 200; + + /// \brief Check the offset between each loop instruction and + /// the loop basic block to determine if we can use the LOOP instruction + /// or if we need to set the LC/SA registers explicitly. + bool fixupLoopInstrs(MachineFunction &MF); + + /// \brief Add the instruction to set the LC and SA registers explicitly. + void convertLoopInstr(MachineFunction &MF, + MachineBasicBlock::iterator &MII, + RegScavenger &RS); + + }; + + char HexagonFixupHwLoops::ID = 0; +} + +INITIALIZE_PASS(HexagonFixupHwLoops, "hwloopsfixup", + "Hexagon Hardware Loops Fixup", false, false) + +FunctionPass *llvm::createHexagonFixupHwLoops() { + return new HexagonFixupHwLoops(); +} + + +/// \brief Returns true if the instruction is a hardware loop instruction. +static bool isHardwareLoop(const MachineInstr *MI) { + return MI->getOpcode() == Hexagon::LOOP0_r || + MI->getOpcode() == Hexagon::LOOP0_i; +} + + +bool HexagonFixupHwLoops::runOnMachineFunction(MachineFunction &MF) { + bool Changed = fixupLoopInstrs(MF); + return Changed; +} + + +/// \brief For Hexagon, if the loop label is too far from the +/// loop instruction then we need to set the LC0 and SA0 registers +/// explicitly instead of using LOOP(start,count). This function +/// checks the distance, and generates register assignments if needed. +/// +/// This function makes two passes over the basic blocks. The first +/// pass computes the offset of the basic block from the start. +/// The second pass checks all the loop instructions. +bool HexagonFixupHwLoops::fixupLoopInstrs(MachineFunction &MF) { + + // Offset of the current instruction from the start. + unsigned InstOffset = 0; + // Map for each basic block to its first instruction. + DenseMap<MachineBasicBlock*, unsigned> BlockToInstOffset; + + // First pass - compute the offset of each basic block. + for (MachineFunction::iterator MBB = MF.begin(), MBBe = MF.end(); + MBB != MBBe; ++MBB) { + BlockToInstOffset[MBB] = InstOffset; + InstOffset += (MBB->size() * 4); + } + + // Second pass - check each loop instruction to see if it needs to + // be converted. + InstOffset = 0; + bool Changed = false; + RegScavenger RS; + + // Loop over all the basic blocks. + for (MachineFunction::iterator MBB = MF.begin(), MBBe = MF.end(); + MBB != MBBe; ++MBB) { + InstOffset = BlockToInstOffset[MBB]; + RS.enterBasicBlock(MBB); + + // Loop over all the instructions. + MachineBasicBlock::iterator MIE = MBB->end(); + MachineBasicBlock::iterator MII = MBB->begin(); + while (MII != MIE) { + if (isHardwareLoop(MII)) { + RS.forward(MII); + assert(MII->getOperand(0).isMBB() && + "Expect a basic block as loop operand"); + int Sub = InstOffset - BlockToInstOffset[MII->getOperand(0).getMBB()]; + unsigned Dist = Sub > 0 ? Sub : -Sub; + if (Dist > MAX_LOOP_DISTANCE) { + // Convert to explicitly setting LC0 and SA0. + convertLoopInstr(MF, MII, RS); + MII = MBB->erase(MII); + Changed = true; + } else { + ++MII; + } + } else { + ++MII; + } + InstOffset += 4; + } + } + + return Changed; +} + + +/// \brief Convert a loop instruction to a sequence of instructions that +/// set the LC0 and SA0 register explicitly.
+void HexagonFixupHwLoops::convertLoopInstr(MachineFunction &MF, + MachineBasicBlock::iterator &MII, + RegScavenger &RS) { + const TargetInstrInfo *TII = MF.getTarget().getInstrInfo(); + MachineBasicBlock *MBB = MII->getParent(); + DebugLoc DL = MII->getDebugLoc(); + unsigned Scratch = RS.scavengeRegister(&Hexagon::IntRegsRegClass, MII, 0); + + // First, set the LC0 with the trip count. + if (MII->getOperand(1).isReg()) { + // Trip count is a register + BuildMI(*MBB, MII, DL, TII->get(Hexagon::TFCR), Hexagon::LC0) + .addReg(MII->getOperand(1).getReg()); + } else { + // Trip count is an immediate. + BuildMI(*MBB, MII, DL, TII->get(Hexagon::TFRI), Scratch) + .addImm(MII->getOperand(1).getImm()); + BuildMI(*MBB, MII, DL, TII->get(Hexagon::TFCR), Hexagon::LC0) + .addReg(Scratch); + } + // Then, set the SA0 with the loop start address. + BuildMI(*MBB, MII, DL, TII->get(Hexagon::CONST32_Label), Scratch) + .addMBB(MII->getOperand(0).getMBB()); + BuildMI(*MBB, MII, DL, TII->get(Hexagon::TFCR), Hexagon::SA0) + .addReg(Scratch); +} diff --git a/lib/Target/Hexagon/HexagonFrameLowering.cpp b/lib/Target/Hexagon/HexagonFrameLowering.cpp index 9043cf9..d6a9329 100644 --- a/lib/Target/Hexagon/HexagonFrameLowering.cpp +++ b/lib/Target/Hexagon/HexagonFrameLowering.cpp @@ -327,6 +327,21 @@ bool HexagonFrameLowering::restoreCalleeSavedRegisters( return true; } +void HexagonFrameLowering:: +eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const { + MachineInstr &MI = *I; + + if (MI.getOpcode() == Hexagon::ADJCALLSTACKDOWN) { + // Hexagon_TODO: add code + } else if (MI.getOpcode() == Hexagon::ADJCALLSTACKUP) { + // Hexagon_TODO: add code + } else { + llvm_unreachable("Cannot handle this call frame pseudo instruction"); + } + MBB.erase(I); +} + int HexagonFrameLowering::getFrameIndexOffset(const MachineFunction &MF, int FI) const { return MF.getFrameInfo()->getObjectOffset(FI); diff --git a/lib/Target/Hexagon/HexagonFrameLowering.h b/lib/Target/Hexagon/HexagonFrameLowering.h index ad87f11..a62c76a 100644 --- a/lib/Target/Hexagon/HexagonFrameLowering.h +++ b/lib/Target/Hexagon/HexagonFrameLowering.h @@ -35,6 +35,11 @@ public: MachineBasicBlock::iterator MI, const std::vector<CalleeSavedInfo> &CSI, const TargetRegisterInfo *TRI) const; + + void eliminateCallFramePseudoInstr(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const; + virtual bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, diff --git a/lib/Target/Hexagon/HexagonHardwareLoops.cpp b/lib/Target/Hexagon/HexagonHardwareLoops.cpp index 2a00a9f..62aed13 100644 --- a/lib/Target/Hexagon/HexagonHardwareLoops.cpp +++ b/lib/Target/Hexagon/HexagonHardwareLoops.cpp @@ -27,9 +27,7 @@ //===----------------------------------------------------------------------===// #define DEBUG_TYPE "hwloops" -#include "Hexagon.h" -#include "HexagonTargetMachine.h" -#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunction.h" @@ -37,79 +35,194 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/Passes.h" -#include "llvm/CodeGen/RegisterScavenging.h" -#include "llvm/IR/Constants.h" #include "llvm/PassSupport.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include 
"llvm/Target/TargetInstrInfo.h" +#include "Hexagon.h" +#include "HexagonTargetMachine.h" + #include <algorithm> +#include <vector> using namespace llvm; +#ifndef NDEBUG +static cl::opt<int> HWLoopLimit("max-hwloop", cl::Hidden, cl::init(-1)); +#endif + STATISTIC(NumHWLoops, "Number of loops converted to hardware loops"); +namespace llvm { + void initializeHexagonHardwareLoopsPass(PassRegistry&); +} + namespace { class CountValue; struct HexagonHardwareLoops : public MachineFunctionPass { - MachineLoopInfo *MLI; - MachineRegisterInfo *MRI; - const TargetInstrInfo *TII; + MachineLoopInfo *MLI; + MachineRegisterInfo *MRI; + MachineDominatorTree *MDT; + const HexagonTargetMachine *TM; + const HexagonInstrInfo *TII; + const HexagonRegisterInfo *TRI; +#ifndef NDEBUG + static int Counter; +#endif public: - static char ID; // Pass identification, replacement for typeid + static char ID; - HexagonHardwareLoops() : MachineFunctionPass(ID) {} + HexagonHardwareLoops() : MachineFunctionPass(ID) { + initializeHexagonHardwareLoopsPass(*PassRegistry::getPassRegistry()); + } virtual bool runOnMachineFunction(MachineFunction &MF); const char *getPassName() const { return "Hexagon Hardware Loops"; } virtual void getAnalysisUsage(AnalysisUsage &AU) const { - AU.setPreservesCFG(); AU.addRequired<MachineDominatorTree>(); - AU.addPreserved<MachineDominatorTree>(); AU.addRequired<MachineLoopInfo>(); - AU.addPreserved<MachineLoopInfo>(); MachineFunctionPass::getAnalysisUsage(AU); } private: - /// getCanonicalInductionVariable - Check to see if the loop has a canonical - /// induction variable. - /// Should be defined in MachineLoop. Based upon version in class Loop. - const MachineInstr *getCanonicalInductionVariable(MachineLoop *L) const; - - /// getTripCount - Return a loop-invariant LLVM register indicating the - /// number of times the loop will be executed. If the trip-count cannot - /// be determined, this return null. - CountValue *getTripCount(MachineLoop *L) const; - - /// isInductionOperation - Return true if the instruction matches the - /// pattern for an opertion that defines an induction variable. - bool isInductionOperation(const MachineInstr *MI, unsigned IVReg) const; + /// Kinds of comparisons in the compare instructions. + struct Comparison { + enum Kind { + EQ = 0x01, + NE = 0x02, + L = 0x04, // Less-than property. + G = 0x08, // Greater-than property. + U = 0x40, // Unsigned property. + LTs = L, + LEs = L | EQ, + GTs = G, + GEs = G | EQ, + LTu = L | U, + LEu = L | EQ | U, + GTu = G | U, + GEu = G | EQ | U + }; + + static Kind getSwappedComparison(Kind Cmp) { + assert ((!((Cmp & L) && (Cmp & G))) && "Malformed comparison operator"); + if ((Cmp & L) || (Cmp & G)) + return (Kind)(Cmp ^ (L|G)); + return Cmp; + } + }; - /// isInvalidOperation - Return true if the instruction is not valid within - /// a hardware loop. + /// \brief Find the register that contains the loop controlling + /// induction variable. + /// If successful, it will return true and set the \p Reg, \p IVBump + /// and \p IVOp arguments. Otherwise it will return false. + /// The returned induction register is the register R that follows the + /// following induction pattern: + /// loop: + /// R = phi ..., [ R.next, LatchBlock ] + /// R.next = R + #bump + /// if (R.next < #N) goto loop + /// IVBump is the immediate value added to R, and IVOp is the instruction + /// "R.next = R + #bump". 
+ bool findInductionRegister(MachineLoop *L, unsigned &Reg, + int64_t &IVBump, MachineInstr *&IVOp) const; + + /// \brief Analyze the statements in a loop to determine if the loop + /// has a computable trip count and, if so, return a value that represents + /// the trip count expression. + CountValue *getLoopTripCount(MachineLoop *L, + SmallVector<MachineInstr*, 2> &OldInsts); + + /// \brief Return the expression that represents the number of times + /// a loop iterates. The function takes the operands that represent the + /// loop start value, loop end value, and induction value. Based upon + /// these operands, the function attempts to compute the trip count. + /// If the trip count is not directly available (as an immediate value, + /// or a register), the function will attempt to insert computation of it + /// to the loop's preheader. + CountValue *computeCount(MachineLoop *Loop, + const MachineOperand *Start, + const MachineOperand *End, + unsigned IVReg, + int64_t IVBump, + Comparison::Kind Cmp) const; + + /// \brief Return true if the instruction is not valid within a hardware + /// loop. bool isInvalidLoopOperation(const MachineInstr *MI) const; - /// containsInavlidInstruction - Return true if the loop contains an - /// instruction that inhibits using the hardware loop. + /// \brief Return true if the loop contains an instruction that inhibits + /// using the hardware loop. bool containsInvalidInstruction(MachineLoop *L) const; - /// converToHardwareLoop - Given a loop, check if we can convert it to a - /// hardware loop. If so, then perform the conversion and return true. + /// \brief Given a loop, check if we can convert it to a hardware loop. + /// If so, then perform the conversion and return true. bool convertToHardwareLoop(MachineLoop *L); + /// \brief Return true if the instruction is now dead. + bool isDead(const MachineInstr *MI, + SmallVector<MachineInstr*, 1> &DeadPhis) const; + + /// \brief Remove the instruction if it is now dead. + void removeIfDead(MachineInstr *MI); + + /// \brief Make sure that the "bump" instruction executes before the + /// compare. We need that for the IV fixup, so that the compare + /// instruction would not use a bumped value that has not yet been + /// defined. If the instructions are out of order, try to reorder them. + bool orderBumpCompare(MachineInstr *BumpI, MachineInstr *CmpI); + + /// \brief Get the instruction that loads an immediate value into \p R, + /// or 0 if such an instruction does not exist. + MachineInstr *defWithImmediate(unsigned R); + + /// \brief Get the immediate value referenced by \p MO, either for + /// immediate operands, or for register operands, where the register + /// was defined with an immediate value. + int64_t getImmediate(MachineOperand &MO); + + /// \brief Reset the given machine operand to now refer to a new immediate + /// value. Assumes that the operand was already referencing an immediate + /// value, either directly, or via a register. + void setImmediate(MachineOperand &MO, int64_t Val); + + /// \brief Fix the data flow of the induction variable. + /// The desired flow is: phi ---> bump -+-> comparison-in-latch. + /// | + /// +-> back to phi + /// where "bump" is the increment of the induction variable: + /// iv = iv + #const. + /// Due to some prior code transformations, the actual flow may look + /// like this: + /// phi -+-> bump ---> back to phi + /// | + /// +-> comparison-in-latch (against upper_bound-bump), + /// i.e.
the comparison that controls the loop execution may be using + /// the value of the induction variable from before the increment. + /// + /// Return true if the loop's flow is the desired one (i.e. it's + /// either been fixed, or no fixing was necessary). + /// Otherwise, return false. This can happen if the induction variable + /// couldn't be identified, or if the value in the latch's comparison + /// cannot be adjusted to reflect the post-bump value. + bool fixupInductionVariable(MachineLoop *L); + + /// \brief Given a loop, if it does not have a preheader, create one. + /// Return the block that is the preheader. + MachineBasicBlock *createPreheaderForLoop(MachineLoop *L); }; char HexagonHardwareLoops::ID = 0; +#ifndef NDEBUG + int HexagonHardwareLoops::Counter = 0; +#endif - - // CountValue class - Abstraction for a trip count of a loop. A - // smaller vesrsion of the MachineOperand class without the concerns - // of changing the operand representation. + /// \brief Abstraction for a trip count of a loop. A smaller version + /// of the MachineOperand class without the concerns of changing the + /// operand representation. class CountValue { public: enum CountValueType { @@ -119,101 +232,62 @@ namespace { private: CountValueType Kind; union Values { - unsigned RegNum; - int64_t ImmVal; - Values(unsigned r) : RegNum(r) {} - Values(int64_t i) : ImmVal(i) {} + struct { + unsigned Reg; + unsigned Sub; + } R; + unsigned ImmVal; } Contents; - bool isNegative; public: - CountValue(unsigned r, bool neg) : Kind(CV_Register), Contents(r), - isNegative(neg) {} - explicit CountValue(int64_t i) : Kind(CV_Immediate), Contents(i), - isNegative(i < 0) {} - CountValueType getType() const { return Kind; } + explicit CountValue(CountValueType t, unsigned v, unsigned u = 0) { + Kind = t; + if (Kind == CV_Register) { + Contents.R.Reg = v; + Contents.R.Sub = u; + } else { + Contents.ImmVal = v; + } + } bool isReg() const { return Kind == CV_Register; } bool isImm() const { return Kind == CV_Immediate; } - bool isNeg() const { return isNegative; } unsigned getReg() const { assert(isReg() && "Wrong CountValue accessor"); - return Contents.RegNum; + return Contents.R.Reg; } - void setReg(unsigned Val) { - Contents.RegNum = Val; + unsigned getSubReg() const { + assert(isReg() && "Wrong CountValue accessor"); + return Contents.R.Sub; } - int64_t getImm() const { + unsigned getImm() const { assert(isImm() && "Wrong CountValue accessor"); - if (isNegative) { - return -Contents.ImmVal; - } return Contents.ImmVal; } - void setImm(int64_t Val) { - Contents.ImmVal = Val; - } void print(raw_ostream &OS, const TargetMachine *TM = 0) const { - if (isReg()) { OS << PrintReg(getReg()); } - if (isImm()) { OS << getImm(); } - } - }; - - struct HexagonFixupHwLoops : public MachineFunctionPass { - public: - static char ID; // Pass identification, replacement for typeid. - - HexagonFixupHwLoops() : MachineFunctionPass(ID) {} - - virtual bool runOnMachineFunction(MachineFunction &MF); - - const char *getPassName() const { return "Hexagon Hardware Loop Fixup"; } - - virtual void getAnalysisUsage(AnalysisUsage &AU) const { - AU.setPreservesCFG(); - MachineFunctionPass::getAnalysisUsage(AU); + const TargetRegisterInfo *TRI = TM ? TM->getRegisterInfo() : 0; + if (isReg()) { OS << PrintReg(Contents.R.Reg, TRI, Contents.R.Sub); } + if (isImm()) { OS << Contents.ImmVal; } } - - private: - /// Maximum distance between the loop instr and the basic block. - /// Just an estimate.
- static const unsigned MAX_LOOP_DISTANCE = 200; - - /// fixupLoopInstrs - Check the offset between each loop instruction and - /// the loop basic block to determine if we can use the LOOP instruction - /// or if we need to set the LC/SA registers explicitly. - bool fixupLoopInstrs(MachineFunction &MF); - - /// convertLoopInstr - Add the instruction to set the LC and SA registers - /// explicitly. - void convertLoopInstr(MachineFunction &MF, - MachineBasicBlock::iterator &MII, - RegScavenger &RS); - }; +} // end anonymous namespace - char HexagonFixupHwLoops::ID = 0; -} // end anonymous namespace +INITIALIZE_PASS_BEGIN(HexagonHardwareLoops, "hwloops", + "Hexagon Hardware Loops", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) +INITIALIZE_PASS_END(HexagonHardwareLoops, "hwloops", + "Hexagon Hardware Loops", false, false) -/// isHardwareLoop - Returns true if the instruction is a hardware loop -/// instruction. +/// \brief Returns true if the instruction is a hardware loop instruction. static bool isHardwareLoop(const MachineInstr *MI) { return MI->getOpcode() == Hexagon::LOOP0_r || MI->getOpcode() == Hexagon::LOOP0_i; } -/// isCompareEquals - Returns true if the instruction is a compare equals -/// instruction with an immediate operand. -static bool isCompareEqualsImm(const MachineInstr *MI) { - return MI->getOpcode() == Hexagon::CMPEQri; -} - - -/// createHexagonHardwareLoops - Factory for creating -/// the hardware loop phase. FunctionPass *llvm::createHexagonHardwareLoops() { return new HexagonHardwareLoops(); } @@ -224,45 +298,149 @@ bool HexagonHardwareLoops::runOnMachineFunction(MachineFunction &MF) { bool Changed = false; - // get the loop information MLI = &getAnalysis<MachineLoopInfo>(); - // get the register information MRI = &MF.getRegInfo(); - // the target specific instructio info. - TII = MF.getTarget().getInstrInfo(); + MDT = &getAnalysis<MachineDominatorTree>(); + TM = static_cast<const HexagonTargetMachine*>(&MF.getTarget()); + TII = static_cast<const HexagonInstrInfo*>(TM->getInstrInfo()); + TRI = static_cast<const HexagonRegisterInfo*>(TM->getRegisterInfo()); for (MachineLoopInfo::iterator I = MLI->begin(), E = MLI->end(); I != E; ++I) { MachineLoop *L = *I; - if (!L->getParentLoop()) { + if (!L->getParentLoop()) Changed |= convertToHardwareLoop(L); - } } return Changed; } -/// getCanonicalInductionVariable - Check to see if the loop has a canonical -/// induction variable. We check for a simple recurrence pattern - an -/// integer recurrence that decrements by one each time through the loop and -/// ends at zero. If so, return the phi node that corresponds to it. -/// -/// Based upon the similar code in LoopInfo except this code is specific to -/// the machine. -/// This method assumes that the IndVarSimplify pass has been run by 'opt'. + +bool HexagonHardwareLoops::findInductionRegister(MachineLoop *L, + unsigned &Reg, + int64_t &IVBump, + MachineInstr *&IVOp + ) const { + MachineBasicBlock *Header = L->getHeader(); + MachineBasicBlock *Preheader = L->getLoopPreheader(); + MachineBasicBlock *Latch = L->getLoopLatch(); + if (!Header || !Preheader || !Latch) + return false; + + // This pair represents an induction register together with an immediate + // value that will be added to it in each loop iteration. 
+ typedef std::pair<unsigned,int64_t> RegisterBump; + + // Mapping: R.next -> (R, bump), where R, R.next and bump are derived + // from an induction operation + // R.next = R + bump + // where bump is an immediate value. + typedef std::map<unsigned,RegisterBump> InductionMap; + + InductionMap IndMap; + + typedef MachineBasicBlock::instr_iterator instr_iterator; + for (instr_iterator I = Header->instr_begin(), E = Header->instr_end(); + I != E && I->isPHI(); ++I) { + MachineInstr *Phi = &*I; + + // Have a PHI instruction. Get the operand that corresponds to the + // latch block, and see if it is a result of an addition of form "reg+imm", + // where the "reg" is defined by the PHI node we are looking at. + for (unsigned i = 1, n = Phi->getNumOperands(); i < n; i += 2) { + if (Phi->getOperand(i+1).getMBB() != Latch) + continue; + + unsigned PhiOpReg = Phi->getOperand(i).getReg(); + MachineInstr *DI = MRI->getVRegDef(PhiOpReg); + unsigned UpdOpc = DI->getOpcode(); + bool isAdd = (UpdOpc == Hexagon::ADD_ri); + + if (isAdd) { + // If the register operand to the add is the PHI we're + // looking at, this meets the induction pattern. + unsigned IndReg = DI->getOperand(1).getReg(); + if (MRI->getVRegDef(IndReg) == Phi) { + unsigned UpdReg = DI->getOperand(0).getReg(); + int64_t V = DI->getOperand(2).getImm(); + IndMap.insert(std::make_pair(UpdReg, std::make_pair(IndReg, V))); + } + } + } // for (i) + } // for (instr) + + SmallVector<MachineOperand,2> Cond; + MachineBasicBlock *TB = 0, *FB = 0; + bool NotAnalyzed = TII->AnalyzeBranch(*Latch, TB, FB, Cond, false); + if (NotAnalyzed) + return false; + + unsigned CSz = Cond.size(); + assert (CSz == 1 || CSz == 2); + unsigned PredR = Cond[CSz-1].getReg(); + + MachineInstr *PredI = MRI->getVRegDef(PredR); + if (!PredI->isCompare()) + return false; + + unsigned CmpReg1 = 0, CmpReg2 = 0; + int CmpImm = 0, CmpMask = 0; + bool CmpAnalyzed = TII->analyzeCompare(PredI, CmpReg1, CmpReg2, + CmpMask, CmpImm); + // Fail if the compare was not analyzed, or it's not comparing a register + // with an immediate value. Not checking the mask here, since we handle + // the individual compare opcodes (including CMPb) later on. + if (!CmpAnalyzed) + return false; + + // Exactly one of the input registers to the comparison should be among + // the induction registers. + InductionMap::iterator IndMapEnd = IndMap.end(); + InductionMap::iterator F = IndMapEnd; + if (CmpReg1 != 0) { + InductionMap::iterator F1 = IndMap.find(CmpReg1); + if (F1 != IndMapEnd) + F = F1; + } + if (CmpReg2 != 0) { + InductionMap::iterator F2 = IndMap.find(CmpReg2); + if (F2 != IndMapEnd) { + if (F != IndMapEnd) + return false; + F = F2; + } + } + if (F == IndMapEnd) + return false; + + Reg = F->second.first; + IVBump = F->second.second; + IVOp = MRI->getVRegDef(F->first); + return true; +} + + +/// \brief Analyze the statements in a loop to determine if the loop has +/// a computable trip count and, if so, return a value that represents +/// the trip count expression. /// -const MachineInstr -*HexagonHardwareLoops::getCanonicalInductionVariable(MachineLoop *L) const { +/// This function iterates over the phi nodes in the loop to check for +/// induction variable patterns that are used in the calculation for +/// the number of times the loop is executed.
+CountValue *HexagonHardwareLoops::getLoopTripCount(MachineLoop *L, + SmallVector<MachineInstr*, 2> &OldInsts) { MachineBasicBlock *TopMBB = L->getTopBlock(); MachineBasicBlock::pred_iterator PI = TopMBB->pred_begin(); assert(PI != TopMBB->pred_end() && "Loop must have more than one incoming edge!"); MachineBasicBlock *Backedge = *PI++; - if (PI == TopMBB->pred_end()) return 0; // dead loop + if (PI == TopMBB->pred_end()) // dead loop? + return 0; MachineBasicBlock *Incoming = *PI++; - if (PI != TopMBB->pred_end()) return 0; // multiple backedges? + if (PI != TopMBB->pred_end()) // multiple backedges? + return 0; - // make sure there is one incoming and one backedge and determine which + // Make sure there is one incoming and one backedge and determine which // is which. if (L->contains(Incoming)) { if (L->contains(Backedge)) @@ -271,139 +449,433 @@ const MachineInstr } else if (!L->contains(Backedge)) return 0; - // Loop over all of the PHI nodes, looking for a canonical induction variable: - // - The PHI node is "reg1 = PHI reg2, BB1, reg3, BB2". - // - The recurrence comes from the backedge. - // - the definition is an induction operatio.n - for (MachineBasicBlock::iterator I = TopMBB->begin(), E = TopMBB->end(); - I != E && I->isPHI(); ++I) { - const MachineInstr *MPhi = &*I; - unsigned DefReg = MPhi->getOperand(0).getReg(); - for (unsigned i = 1; i != MPhi->getNumOperands(); i += 2) { - // Check each operand for the value from the backedge. - MachineBasicBlock *MBB = MPhi->getOperand(i+1).getMBB(); - if (L->contains(MBB)) { // operands comes from the backedge - // Check if the definition is an induction operation. - const MachineInstr *DI = MRI->getVRegDef(MPhi->getOperand(i).getReg()); - if (isInductionOperation(DI, DefReg)) { - return MPhi; - } - } + // Look for the cmp instruction to determine if we can get a useful trip + // count. The trip count can be either a register or an immediate. The + // location of the value depends upon the type (reg or imm). + MachineBasicBlock *Latch = L->getLoopLatch(); + if (!Latch) + return 0; + + unsigned IVReg = 0; + int64_t IVBump = 0; + MachineInstr *IVOp; + bool FoundIV = findInductionRegister(L, IVReg, IVBump, IVOp); + if (!FoundIV) + return 0; + + MachineBasicBlock *Preheader = L->getLoopPreheader(); + + MachineOperand *InitialValue = 0; + MachineInstr *IV_Phi = MRI->getVRegDef(IVReg); + for (unsigned i = 1, n = IV_Phi->getNumOperands(); i < n; i += 2) { + MachineBasicBlock *MBB = IV_Phi->getOperand(i+1).getMBB(); + if (MBB == Preheader) + InitialValue = &IV_Phi->getOperand(i); + else if (MBB == Latch) + IVReg = IV_Phi->getOperand(i).getReg(); // Want IV reg after bump. + } + if (!InitialValue) + return 0; + + SmallVector<MachineOperand,2> Cond; + MachineBasicBlock *TB = 0, *FB = 0; + bool NotAnalyzed = TII->AnalyzeBranch(*Latch, TB, FB, Cond, false); + if (NotAnalyzed) + return 0; + + MachineBasicBlock *Header = L->getHeader(); + // TB must be non-null. If FB is also non-null, one of them must be + // the header. Otherwise, branch to TB could be exiting the loop, and + // the fall through can go to the header. + assert (TB && "Latch block without a branch?"); + assert ((!FB || TB == Header || FB == Header) && "Branches not to header?"); + if (!TB || (FB && TB != Header && FB != Header)) + return 0; + + // Branches of form "if (!P) ..." cause HexagonInstrInfo::AnalyzeBranch + // to put imm(0), followed by P in the vector Cond. + // If TB is not the header, it means that the "not-taken" path must lead + // to the header. 
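The Negated flag computed next folds these four branch shapes into a single XOR. A minimal standalone sketch of the same decision table (illustrative only; the helper name and its parameters are invented):

#include <cassert>

// Negated is true exactly when the predicate must be inverted to obtain the
// "stay in the loop" condition: a two-element Cond encodes "if (!P) jump TB",
// and TB != Header means the taken branch exits the loop.
static bool isNegated(unsigned CondSize, bool TakenBranchIsHeader) {
  return (CondSize > 1) ^ (!TakenBranchIsHeader);
}

int main() {
  assert(!isNegated(1, true));  // if (P) jump header  -> P as-is
  assert( isNegated(2, true));  // if (!P) jump header -> inverted
  assert( isNegated(1, false)); // if (P) jump exit    -> inverted
  assert(!isNegated(2, false)); // if (!P) jump exit   -> P as-is
  return 0;
}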
+ bool Negated = (Cond.size() > 1) ^ (TB != Header); + unsigned PredReg = Cond[Cond.size()-1].getReg(); + MachineInstr *CondI = MRI->getVRegDef(PredReg); + unsigned CondOpc = CondI->getOpcode(); + + unsigned CmpReg1 = 0, CmpReg2 = 0; + int Mask = 0, ImmValue = 0; + bool AnalyzedCmp = TII->analyzeCompare(CondI, CmpReg1, CmpReg2, + Mask, ImmValue); + if (!AnalyzedCmp) + return 0; + + // The comparison operator type determines how we compute the loop + // trip count. + OldInsts.push_back(CondI); + OldInsts.push_back(IVOp); + + // Sadly, the following code gets information based on the position + // of the operands in the compare instruction. This has to be done + // this way, because the comparisons check for a specific relationship + // between the operands (e.g. is-less-than), rather than to find out + // what relationship the operands are in (as on PPC). + Comparison::Kind Cmp; + bool isSwapped = false; + const MachineOperand &Op1 = CondI->getOperand(1); + const MachineOperand &Op2 = CondI->getOperand(2); + const MachineOperand *EndValue = 0; + + if (Op1.isReg()) { + if (Op2.isImm() || Op1.getReg() == IVReg) + EndValue = &Op2; + else { + EndValue = &Op1; + isSwapped = true; } } - return 0; -} -/// getTripCount - Return a loop-invariant LLVM value indicating the -/// number of times the loop will be executed. The trip count can -/// be either a register or a constant value. If the trip-count -/// cannot be determined, this returns null. -/// -/// We find the trip count from the phi instruction that defines the -/// induction variable. We follow the links to the CMP instruction -/// to get the trip count. -/// -/// Based upon getTripCount in LoopInfo. -/// -CountValue *HexagonHardwareLoops::getTripCount(MachineLoop *L) const { - // Check that the loop has a induction variable. - const MachineInstr *IV_Inst = getCanonicalInductionVariable(L); - if (IV_Inst == 0) return 0; - - // Canonical loops will end with a 'cmpeq_ri IV, Imm', - // if Imm is 0, get the count from the PHI opnd - // if Imm is -M, than M is the count - // Otherwise, Imm is the count - const MachineOperand *IV_Opnd; - const MachineOperand *InitialValue; - if (!L->contains(IV_Inst->getOperand(2).getMBB())) { - InitialValue = &IV_Inst->getOperand(1); - IV_Opnd = &IV_Inst->getOperand(3); - } else { - InitialValue = &IV_Inst->getOperand(3); - IV_Opnd = &IV_Inst->getOperand(1); - } - - // Look for the cmp instruction to determine if we - // can get a useful trip count. The trip count can - // be either a register or an immediate. The location - // of the value depends upon the type (reg or imm). - for (MachineRegisterInfo::reg_iterator - RI = MRI->reg_begin(IV_Opnd->getReg()), RE = MRI->reg_end(); - RI != RE; ++RI) { - IV_Opnd = &RI.getOperand(); - const MachineInstr *MI = IV_Opnd->getParent(); - if (L->contains(MI) && isCompareEqualsImm(MI)) { - const MachineOperand &MO = MI->getOperand(2); - assert(MO.isImm() && "IV Cmp Operand should be 0"); - int64_t ImmVal = MO.getImm(); - - const MachineInstr *IV_DefInstr = MRI->getVRegDef(IV_Opnd->getReg()); - assert(L->contains(IV_DefInstr->getParent()) && - "IV definition should occurs in loop"); - int64_t iv_value = IV_DefInstr->getOperand(2).getImm(); - - if (ImmVal == 0) { - // Make sure the induction variable changes by one on each iteration. - if (iv_value != 1 && iv_value != -1) { + if (!EndValue) + return 0; + + switch (CondOpc) { + case Hexagon::CMPEQri: + case Hexagon::CMPEQrr: + Cmp = !Negated ? Comparison::EQ : Comparison::NE; + break; + case Hexagon::CMPLTrr: + Cmp = !Negated ? 
Comparison::LTs : Comparison::GEs; + break; + case Hexagon::CMPLTUrr: + Cmp = !Negated ? Comparison::LTu : Comparison::GEu; + break; + case Hexagon::CMPGTUri: + case Hexagon::CMPGTUrr: + Cmp = !Negated ? Comparison::GTu : Comparison::LEu; + break; + case Hexagon::CMPGTri: + case Hexagon::CMPGTrr: + Cmp = !Negated ? Comparison::GTs : Comparison::LEs; + break; + // Very limited support for byte/halfword compares. + case Hexagon::CMPbEQri_V4: + case Hexagon::CMPhEQri_V4: { + if (IVBump != 1) + return 0; + + int64_t InitV, EndV; + // Since the comparisons are "ri", the EndValue should be an + // immediate. Check it just in case. + assert(EndValue->isImm() && "Unrecognized latch comparison"); + EndV = EndValue->getImm(); + // Allow InitialValue to be a register defined with an immediate. + if (InitialValue->isReg()) { + if (!defWithImmediate(InitialValue->getReg())) return 0; - } - return new CountValue(InitialValue->getReg(), iv_value > 0); + InitV = getImmediate(*InitialValue); } else { - assert(InitialValue->isReg() && "Expecting register for init value"); - const MachineInstr *DefInstr = MRI->getVRegDef(InitialValue->getReg()); - if (DefInstr && DefInstr->getOpcode() == Hexagon::TFRI) { - int64_t count = ImmVal - DefInstr->getOperand(1).getImm(); - if ((count % iv_value) != 0) { - return 0; - } - return new CountValue(count/iv_value); - } + assert(InitialValue->isImm()); + InitV = InitialValue->getImm(); + } + if (InitV >= EndV) + return 0; + if (CondOpc == Hexagon::CMPbEQri_V4) { + if (!isInt<8>(InitV) || !isInt<8>(EndV)) + return 0; + } else { // Hexagon::CMPhEQri_V4 + if (!isInt<16>(InitV) || !isInt<16>(EndV)) + return 0; } + Cmp = !Negated ? Comparison::EQ : Comparison::NE; + break; } + default: + return 0; } - return 0; + + if (isSwapped) + Cmp = Comparison::getSwappedComparison(Cmp); + + if (InitialValue->isReg()) { + unsigned R = InitialValue->getReg(); + MachineBasicBlock *DefBB = MRI->getVRegDef(R)->getParent(); + if (!MDT->properlyDominates(DefBB, Header)) + return 0; + OldInsts.push_back(MRI->getVRegDef(R)); + } + if (EndValue->isReg()) { + unsigned R = EndValue->getReg(); + MachineBasicBlock *DefBB = MRI->getVRegDef(R)->getParent(); + if (!MDT->properlyDominates(DefBB, Header)) + return 0; + } + + return computeCount(L, InitialValue, EndValue, IVReg, IVBump, Cmp); } -/// isInductionOperation - return true if the operation is matches the -/// pattern that defines an induction variable: -/// add iv, c -/// -bool -HexagonHardwareLoops::isInductionOperation(const MachineInstr *MI, - unsigned IVReg) const { - return (MI->getOpcode() == - Hexagon::ADD_ri && MI->getOperand(1).getReg() == IVReg); +/// \brief Helper function that returns the expression that represents the +/// number of times a loop iterates. The function takes the operands that +/// represent the loop start value, loop end value, and induction value. +/// Based upon these operands, the function attempts to compute the trip count. +CountValue *HexagonHardwareLoops::computeCount(MachineLoop *Loop, + const MachineOperand *Start, + const MachineOperand *End, + unsigned IVReg, + int64_t IVBump, + Comparison::Kind Cmp) const { + // Cannot handle comparison EQ, i.e. while (A == B). + if (Cmp == Comparison::EQ) + return 0; + + // Check if either the start or end values are an assignment of an immediate. + // If so, use the immediate value rather than the register. 
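Stepping back to the Comparison::getSwappedComparison call above: the flag encoding makes operand swapping a single bit-toggle on the L and G bits, leaving EQ and U intact. A standalone sketch of that property (illustrative; it mirrors the enum defined earlier in the patch):

#include <cassert>

enum Kind {
  EQ = 0x01, NE = 0x02, L = 0x04, G = 0x08, U = 0x40,
  LTs = L, LEs = L | EQ, GTs = G, GEs = G | EQ,
  LTu = L | U, LEu = L | EQ | U, GTu = G | U, GEu = G | EQ | U
};

// Swapping the operands of "a < b" yields "b > a": flip L and G,
// keep equality and signedness untouched.
static Kind getSwappedComparison(Kind Cmp) {
  assert(!((Cmp & L) && (Cmp & G)) && "Malformed comparison operator");
  if ((Cmp & L) || (Cmp & G))
    return static_cast<Kind>(Cmp ^ (L | G));
  return Cmp;
}

int main() {
  assert(getSwappedComparison(LTs) == GTs); // a < b  <=>  b > a
  assert(getSwappedComparison(LEu) == GEu); // unsigned bit preserved
  assert(getSwappedComparison(EQ) == EQ);   // equality is symmetric
  return 0;
}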
+ if (Start->isReg()) { + const MachineInstr *StartValInstr = MRI->getVRegDef(Start->getReg()); + if (StartValInstr && StartValInstr->getOpcode() == Hexagon::TFRI) + Start = &StartValInstr->getOperand(1); + } + if (End->isReg()) { + const MachineInstr *EndValInstr = MRI->getVRegDef(End->getReg()); + if (EndValInstr && EndValInstr->getOpcode() == Hexagon::TFRI) + End = &EndValInstr->getOperand(1); + } + + assert (Start->isReg() || Start->isImm()); + assert (End->isReg() || End->isImm()); + + bool CmpLess = Cmp & Comparison::L; + bool CmpGreater = Cmp & Comparison::G; + bool CmpHasEqual = Cmp & Comparison::EQ; + + // Avoid certain wrap-arounds. This doesn't detect all wrap-arounds. + // If the loop executes while iv is "less" with the iv value going down, then + // the iv must wrap. + if (CmpLess && IVBump < 0) + return 0; + // If the loop executes while iv is "greater" with the iv value going up, then + // the iv must wrap. + if (CmpGreater && IVBump > 0) + return 0; + + if (Start->isImm() && End->isImm()) { + // Both start and end are immediates. + int64_t StartV = Start->getImm(); + int64_t EndV = End->getImm(); + int64_t Dist = EndV - StartV; + if (Dist == 0) + return 0; + + bool Exact = (Dist % IVBump) == 0; + + if (Cmp == Comparison::NE) { + if (!Exact) + return 0; + if ((Dist < 0) ^ (IVBump < 0)) + return 0; + } + + // For comparisons that include the final value (i.e. include equality + // with the final value), we need to increase the distance by 1. + if (CmpHasEqual) + Dist = Dist > 0 ? Dist+1 : Dist-1; + + // assert (CmpLess => Dist > 0); + assert ((!CmpLess || Dist > 0) && "Loop should never iterate!"); + // assert (CmpGreater => Dist < 0); + assert ((!CmpGreater || Dist < 0) && "Loop should never iterate!"); + + // "Normalized" distance, i.e. with the bump set to +-1. + int64_t Dist1 = (IVBump > 0) ? (Dist + (IVBump-1)) / IVBump + : (-Dist + (-IVBump-1)) / (-IVBump); + assert (Dist1 > 0 && "Fishy thing. Both operands have the same sign."); + + uint64_t Count = Dist1; + + if (Count > 0xFFFFFFFFULL) + return 0; + + return new CountValue(CountValue::CV_Immediate, Count); + } + + // A general case: Start and End are some values, but the actual + // iteration count may not be available. If it is not, insert + // a computation of it into the preheader. + + // If the induction variable bump is not a power of 2, quit. + // Otherwise we'd need a general integer division. + if (!isPowerOf2_64(abs(IVBump))) + return 0; + + MachineBasicBlock *PH = Loop->getLoopPreheader(); + assert (PH && "Should have a preheader by now"); + MachineBasicBlock::iterator InsertPos = PH->getFirstTerminator(); + DebugLoc DL = (InsertPos != PH->end()) ? InsertPos->getDebugLoc() + : DebugLoc(); + + // If Start is an immediate and End is a register, the trip count + // will be "reg - imm". Hexagon's "subtract immediate" instruction + // is actually "reg + -imm". + + // If the loop IV is going downwards, i.e. if the bump is negative, + // then the iteration count (computed as End-Start) will need to be + // negated. To avoid the negation, just swap Start and End. + if (IVBump < 0) { + std::swap(Start, End); + IVBump = -IVBump; + } + // Cmp may now have a wrong direction, e.g. LEs may now be GEs. + // Signedness, and "including equality" are preserved.
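A worked version of the immediate/immediate computation above, including the CmpHasEqual adjustment (an illustrative sketch; the helper name is invented and the wrap-around and sanity checks of the real code are omitted):

#include <cassert>
#include <cstdint>

// Dist is widened by one for comparisons that include equality, then
// divided by the bump, rounding away from zero.
static uint64_t tripCount(int64_t StartV, int64_t EndV, int64_t IVBump,
                          bool CmpHasEqual) {
  int64_t Dist = EndV - StartV;
  if (CmpHasEqual)
    Dist = Dist > 0 ? Dist + 1 : Dist - 1;
  return (IVBump > 0) ? (Dist + (IVBump - 1)) / IVBump
                      : (-Dist + (-IVBump - 1)) / (-IVBump);
}

int main() {
  // for (i = 0; i < 10; i += 2)  -> i = 0,2,4,6,8    : 5 iterations
  assert(tripCount(0, 10, 2, false) == 5);
  // for (i = 0; i <= 10; i += 2) -> i = 0,2,4,6,8,10 : 6 iterations
  assert(tripCount(0, 10, 2, true) == 6);
  // for (i = 10; i > 0; i -= 3)  -> i = 10,7,4,1     : 4 iterations
  assert(tripCount(10, 0, -3, false) == 4);
  return 0;
}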
+ + bool RegToImm = Start->isReg() && End->isImm(); // for (reg..imm) + bool RegToReg = Start->isReg() && End->isReg(); // for (reg..reg) + + int64_t StartV = 0, EndV = 0; + if (Start->isImm()) + StartV = Start->getImm(); + if (End->isImm()) + EndV = End->getImm(); + + int64_t AdjV = 0; + // To compute the iteration count, we would need this computation: + // Count = (End - Start + (IVBump-1)) / IVBump + // or, when CmpHasEqual: + // Count = (End - Start + (IVBump-1)+1) / IVBump + // The "IVBump-1" part is the adjustment (AdjV). We can avoid + // generating an instruction specifically to add it if we can adjust + // the immediate values for Start or End. + + if (CmpHasEqual) { + // Need to add 1 to the total iteration count. + if (Start->isImm()) + StartV--; + else if (End->isImm()) + EndV++; + else + AdjV += 1; + } + + if (Cmp != Comparison::NE) { + if (Start->isImm()) + StartV -= (IVBump-1); + else if (End->isImm()) + EndV += (IVBump-1); + else + AdjV += (IVBump-1); + } + + unsigned R = 0, SR = 0; + if (Start->isReg()) { + R = Start->getReg(); + SR = Start->getSubReg(); + } else { + R = End->getReg(); + SR = End->getSubReg(); + } + const TargetRegisterClass *RC = MRI->getRegClass(R); + // Hardware loops cannot handle 64-bit registers. If it's a double + // register, it has to have a subregister. + if (!SR && RC == &Hexagon::DoubleRegsRegClass) + return 0; + const TargetRegisterClass *IntRC = &Hexagon::IntRegsRegClass; + + // Compute DistR (register with the distance between Start and End). + unsigned DistR, DistSR; + + // Avoid special case, where the start value is an imm(0). + if (Start->isImm() && StartV == 0) { + DistR = End->getReg(); + DistSR = End->getSubReg(); + } else { + const MCInstrDesc &SubD = RegToReg ? TII->get(Hexagon::SUB_rr) : + (RegToImm ? TII->get(Hexagon::SUB_ri) : + TII->get(Hexagon::ADD_ri)); + unsigned SubR = MRI->createVirtualRegister(IntRC); + MachineInstrBuilder SubIB = + BuildMI(*PH, InsertPos, DL, SubD, SubR); + + if (RegToReg) { + SubIB.addReg(End->getReg(), 0, End->getSubReg()) + .addReg(Start->getReg(), 0, Start->getSubReg()); + } else if (RegToImm) { + SubIB.addImm(EndV) + .addReg(Start->getReg(), 0, Start->getSubReg()); + } else { // ImmToReg + SubIB.addReg(End->getReg(), 0, End->getSubReg()) + .addImm(-StartV); + } + DistR = SubR; + DistSR = 0; + } + + // From DistR, compute AdjR (register with the adjusted distance). + unsigned AdjR, AdjSR; + + if (AdjV == 0) { + AdjR = DistR; + AdjSR = DistSR; + } else { + // Generate CountR = ADD DistR, AdjVal + unsigned AddR = MRI->createVirtualRegister(IntRC); + const MCInstrDesc &AddD = TII->get(Hexagon::ADD_ri); + BuildMI(*PH, InsertPos, DL, AddD, AddR) + .addReg(DistR, 0, DistSR) + .addImm(AdjV); + + AdjR = AddR; + AdjSR = 0; + } + + // From AdjR, compute CountR (register with the final count). + unsigned CountR, CountSR; + + if (IVBump == 1) { + CountR = AdjR; + CountSR = AdjSR; + } else { + // The IV bump is a power of two. Log_2(IV bump) is the shift amount. + unsigned Shift = Log2_32(IVBump); + + // Generate NormR = LSR DistR, Shift. + unsigned LsrR = MRI->createVirtualRegister(IntRC); + const MCInstrDesc &LsrD = TII->get(Hexagon::LSR_ri); + BuildMI(*PH, InsertPos, DL, LsrD, LsrR) + .addReg(AdjR, 0, AdjSR) + .addImm(Shift); + + CountR = LsrR; + CountSR = 0; + } + + return new CountValue(CountValue::CV_Register, CountR, CountSR); } -/// isInvalidOperation - Return true if the operation is invalid within -/// hardware loop. 
-bool -HexagonHardwareLoops::isInvalidLoopOperation(const MachineInstr *MI) const { + +/// \brief Return true if the operation is invalid within a hardware loop. +bool HexagonHardwareLoops::isInvalidLoopOperation( + const MachineInstr *MI) const { // call is not allowed because the callee may use a hardware loop - if (MI->getDesc().isCall()) { + if (MI->getDesc().isCall()) return true; - } + // do not allow nested hardware loops - if (isHardwareLoop(MI)) { + if (isHardwareLoop(MI)) return true; - } + // check if the instruction defines a hardware loop register for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { const MachineOperand &MO = MI->getOperand(i); - if (MO.isReg() && MO.isDef() && - (MO.getReg() == Hexagon::LC0 || MO.getReg() == Hexagon::LC1 || - MO.getReg() == Hexagon::SA0 || MO.getReg() == Hexagon::SA0)) { + if (!MO.isReg() || !MO.isDef()) + continue; + unsigned R = MO.getReg(); + if (R == Hexagon::LC0 || R == Hexagon::LC1 || + R == Hexagon::SA0 || R == Hexagon::SA1) return true; - } } return false; } -/// containsInvalidInstruction - Return true if the loop contains -/// an instruction that inhibits the use of the hardware loop function. -/// + +/// \brief Return true if the loop contains an instruction that inhibits +/// the use of the hardware loop function. bool HexagonHardwareLoops::containsInvalidInstruction(MachineLoop *L) const { const std::vector<MachineBasicBlock*> Blocks = L->getBlocks(); for (unsigned i = 0, e = Blocks.size(); i != e; ++i) { @@ -411,58 +883,184 @@ bool HexagonHardwareLoops::containsInvalidInstruction(MachineLoop *L) const { for (MachineBasicBlock::iterator MII = MBB->begin(), E = MBB->end(); MII != E; ++MII) { const MachineInstr *MI = &*MII; - if (isInvalidLoopOperation(MI)) { + if (isInvalidLoopOperation(MI)) return true; - } } } return false; } -/// converToHardwareLoop - check if the loop is a candidate for -/// converting to a hardware loop. If so, then perform the -/// transformation. + +/// \brief Returns true if the instruction is dead. This was essentially +/// copied from DeadMachineInstructionElim::isDead, but with special cases +/// for inline asm, physical registers and instructions with side effects +/// removed. +bool HexagonHardwareLoops::isDead(const MachineInstr *MI, + SmallVector<MachineInstr*, 1> &DeadPhis) const { + // Examine each operand. + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + const MachineOperand &MO = MI->getOperand(i); + if (!MO.isReg() || !MO.isDef()) + continue; + + unsigned Reg = MO.getReg(); + if (MRI->use_nodbg_empty(Reg)) + continue; + + typedef MachineRegisterInfo::use_nodbg_iterator use_nodbg_iterator; + + // This instruction has users, but if the only user is the phi node for the + // parent block, and the only use of that phi node is this instruction, then + // this instruction is dead: both it and the phi node can be removed.
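Concretely, the "instruction -> phi -> instruction" shape being tested here is (illustrative sketch in the pass's own pseudo-IR notation; names invented):

    %iv      = PHI [ %iv.init, preheader ], [ %iv.next, latch ]
    %iv.next = ADD_ri %iv, #1     ; sole user of %iv; %iv.next only feeds the PHI

Once the latch compare has been replaced by an endloop instruction, the ADD's only user is the PHI and the PHI's only user is the ADD, so the pair forms a dead cycle and both can be erased.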
+ use_nodbg_iterator I = MRI->use_nodbg_begin(Reg); + use_nodbg_iterator End = MRI->use_nodbg_end(); + if (llvm::next(I) != End || !I.getOperand().getParent()->isPHI()) + return false; + + MachineInstr *OnePhi = I.getOperand().getParent(); + for (unsigned j = 0, f = OnePhi->getNumOperands(); j != f; ++j) { + const MachineOperand &OPO = OnePhi->getOperand(j); + if (!OPO.isReg() || !OPO.isDef()) + continue; + + unsigned OPReg = OPO.getReg(); + use_nodbg_iterator nextJ; + for (use_nodbg_iterator J = MRI->use_nodbg_begin(OPReg); + J != End; J = nextJ) { + nextJ = llvm::next(J); + MachineOperand &Use = J.getOperand(); + MachineInstr *UseMI = Use.getParent(); + + // If the phi node has a user that is not MI, bail... + if (MI != UseMI) + return false; + } + } + DeadPhis.push_back(OnePhi); + } + + // If there are no defs with uses, the instruction is dead. + return true; +} + +void HexagonHardwareLoops::removeIfDead(MachineInstr *MI) { + // This procedure was essentially copied from DeadMachineInstructionElim. + + SmallVector<MachineInstr*, 1> DeadPhis; + if (isDead(MI, DeadPhis)) { + DEBUG(dbgs() << "HW looping will remove: " << *MI); + + // It is possible that some DBG_VALUE instructions refer to this + // instruction. Examine each def operand for such references; + // if found, mark the DBG_VALUE as undef (but don't delete it). + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + const MachineOperand &MO = MI->getOperand(i); + if (!MO.isReg() || !MO.isDef()) + continue; + unsigned Reg = MO.getReg(); + MachineRegisterInfo::use_iterator nextI; + for (MachineRegisterInfo::use_iterator I = MRI->use_begin(Reg), + E = MRI->use_end(); I != E; I = nextI) { + nextI = llvm::next(I); // I is invalidated by the setReg + MachineOperand &Use = I.getOperand(); + MachineInstr *UseMI = Use.getParent(); + if (UseMI == MI) + continue; + if (Use.isDebug()) + UseMI->getOperand(0).setReg(0U); + // This may also be a "instr -> phi -> instr" case which can + // be removed too. + } + } + + MI->eraseFromParent(); + for (unsigned i = 0; i < DeadPhis.size(); ++i) + DeadPhis[i]->eraseFromParent(); + } +} + +/// \brief Check if the loop is a candidate for converting to a hardware +/// loop. If so, then perform the transformation. /// -/// This function works on innermost loops first. A loop can -/// be converted if it is a counting loop; either a register -/// value or an immediate. +/// This function works on innermost loops first. A loop can be converted +/// if it is a counting loop; either a register value or an immediate. /// -/// The code makes several assumptions about the representation -/// of the loop in llvm. +/// The code makes several assumptions about the representation of the loop +/// in llvm. bool HexagonHardwareLoops::convertToHardwareLoop(MachineLoop *L) { + // This is just for sanity. + assert(L->getHeader() && "Loop without a header?"); + bool Changed = false; // Process nested loops first. - for (MachineLoop::iterator I = L->begin(), E = L->end(); I != E; ++I) { + for (MachineLoop::iterator I = L->begin(), E = L->end(); I != E; ++I) Changed |= convertToHardwareLoop(*I); - } + // If a nested loop has been converted, then we can't convert this loop. - if (Changed) { + if (Changed) return Changed; + +#ifndef NDEBUG + // Stop trying after reaching the limit (if any). + int Limit = HWLoopLimit; + if (Limit >= 0) { + if (Counter >= HWLoopLimit) + return false; + Counter++; } - // Are we able to determine the trip count for the loop? 
- CountValue *TripCount = getTripCount(L); - if (TripCount == 0) { - return false; - } +#endif + // Does the loop contain any invalid instructions? - if (containsInvalidInstruction(L)) { + if (containsInvalidInstruction(L)) return false; - } - MachineBasicBlock *Preheader = L->getLoopPreheader(); - // No preheader means there's not place for the loop instr. - if (Preheader == 0) { + + // Is the induction variable bump feeding the latch condition? + if (!fixupInductionVariable(L)) return false; - } - MachineBasicBlock::iterator InsertPos = Preheader->getFirstTerminator(); MachineBasicBlock *LastMBB = L->getExitingBlock(); // Don't generate hw loop if the loop has more than one exit. - if (LastMBB == 0) { + if (LastMBB == 0) return false; - } + MachineBasicBlock::iterator LastI = LastMBB->getFirstTerminator(); - if (LastI == LastMBB->end()) { + if (LastI == LastMBB->end()) return false; + + // Ensure the loop has a preheader: the loop instruction will be + // placed there. + bool NewPreheader = false; + MachineBasicBlock *Preheader = L->getLoopPreheader(); + if (!Preheader) { + Preheader = createPreheaderForLoop(L); + if (!Preheader) + return false; + NewPreheader = true; + } + MachineBasicBlock::iterator InsertPos = Preheader->getFirstTerminator(); + + SmallVector<MachineInstr*, 2> OldInsts; + // Are we able to determine the trip count for the loop? + CountValue *TripCount = getLoopTripCount(L, OldInsts); + if (TripCount == 0) + return false; + + // Is the trip count available in the preheader? + if (TripCount->isReg()) { + // There will be a use of the register inserted into the preheader, + // so make sure that the register is actually defined at that point. + MachineInstr *TCDef = MRI->getVRegDef(TripCount->getReg()); + MachineBasicBlock *BBDef = TCDef->getParent(); + if (!NewPreheader) { + if (!MDT->dominates(BBDef, Preheader)) + return false; + } else { + // If we have just created a preheader, the dominator tree won't be + // aware of it. Check if the definition of the register dominates + // the header, but is not the header itself. + if (!MDT->properlyDominates(BBDef, L->getHeader())) + return false; + } } // Determine the loop start. @@ -470,53 +1068,53 @@ bool HexagonHardwareLoops::convertToHardwareLoop(MachineLoop *L) { if (L->getLoopLatch() != LastMBB) { // When the exit and latch are not the same, use the latch block as the // start. - // The loop start address is used only after the 1st iteration, and the loop - // latch may contains instrs. that need to be executed after the 1st iter. + // The loop start address is used only after the 1st iteration, and the + // loop latch may contains instrs. that need to be executed after the + // first iteration. LoopStart = L->getLoopLatch(); // Make sure the latch is a successor of the exit, otherwise it won't work. - if (!LastMBB->isSuccessor(LoopStart)) { + if (!LastMBB->isSuccessor(LoopStart)) return false; - } } - // Convert the loop to a hardware loop + // Convert the loop to a hardware loop. DEBUG(dbgs() << "Change to hardware loop at "; L->dump()); - DebugLoc InsertPosDL; + DebugLoc DL; if (InsertPos != Preheader->end()) - InsertPosDL = InsertPos->getDebugLoc(); + DL = InsertPos->getDebugLoc(); if (TripCount->isReg()) { // Create a copy of the loop count register. 
- MachineFunction *MF = LastMBB->getParent(); - const TargetRegisterClass *RC = - MF->getRegInfo().getRegClass(TripCount->getReg()); - unsigned CountReg = MF->getRegInfo().createVirtualRegister(RC); - BuildMI(*Preheader, InsertPos, InsertPosDL, - TII->get(TargetOpcode::COPY), CountReg).addReg(TripCount->getReg()); - if (TripCount->isNeg()) { - unsigned CountReg1 = CountReg; - CountReg = MF->getRegInfo().createVirtualRegister(RC); - BuildMI(*Preheader, InsertPos, InsertPosDL, - TII->get(Hexagon::NEG), CountReg).addReg(CountReg1); - } - + unsigned CountReg = MRI->createVirtualRegister(&Hexagon::IntRegsRegClass); + BuildMI(*Preheader, InsertPos, DL, TII->get(TargetOpcode::COPY), CountReg) + .addReg(TripCount->getReg(), 0, TripCount->getSubReg()); // Add the Loop instruction to the beginning of the loop. - BuildMI(*Preheader, InsertPos, InsertPosDL, - TII->get(Hexagon::LOOP0_r)).addMBB(LoopStart).addReg(CountReg); + BuildMI(*Preheader, InsertPos, DL, TII->get(Hexagon::LOOP0_r)) + .addMBB(LoopStart) + .addReg(CountReg); } else { - assert(TripCount->isImm() && "Expecting immedate vaule for trip count"); - // Add the Loop immediate instruction to the beginning of the loop. + assert(TripCount->isImm() && "Expecting immediate value for trip count"); + // Add the Loop immediate instruction to the beginning of the loop, + // if the immediate fits in the instruction. Otherwise, we need to + // create a new virtual register. int64_t CountImm = TripCount->getImm(); - BuildMI(*Preheader, InsertPos, InsertPosDL, - TII->get(Hexagon::LOOP0_i)).addMBB(LoopStart).addImm(CountImm); + if (!TII->isValidOffset(Hexagon::LOOP0_i, CountImm)) { + unsigned CountReg = MRI->createVirtualRegister(&Hexagon::IntRegsRegClass); + BuildMI(*Preheader, InsertPos, DL, TII->get(Hexagon::TFRI), CountReg) + .addImm(CountImm); + BuildMI(*Preheader, InsertPos, DL, TII->get(Hexagon::LOOP0_r)) + .addMBB(LoopStart).addReg(CountReg); + } else + BuildMI(*Preheader, InsertPos, DL, TII->get(Hexagon::LOOP0_i)) + .addMBB(LoopStart).addImm(CountImm); } - // Make sure the loop start always has a reference in the CFG. We need to - // create a BlockAddress operand to get this mechanism to work both the + // Make sure the loop start always has a reference in the CFG. We need + // to create a BlockAddress operand to get this mechanism to work; both the // MachineBasicBlock and BasicBlock objects need the flag set. LoopStart->setHasAddressTaken(); // This line is needed to set the hasAddressTaken flag on the BasicBlock - // object + // object. BlockAddress::get(const_cast<BasicBlock *>(LoopStart->getBasicBlock())); // Replace the loop branch with an endloop instruction. @@ -529,13 +1127,12 @@ bool HexagonHardwareLoops::convertToHardwareLoop(MachineLoop *L) { // - a conditional branch to the loop start. if (LastI->getOpcode() == Hexagon::JMP_c || LastI->getOpcode() == Hexagon::JMP_cNot) { - // delete one and change/add an uncond. branch to out of the loop + // Delete one and change/add an uncond. branch out of the loop.
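Schematically, the overall rewrite performed by this function looks as follows (illustrative Hexagon assembly only, not emitted verbatim; labels and registers invented):

    Before:
      .Lloop:
        ...
        r0 = add(r0, #1)
        p0 = cmp.gt(r5, r0)
        if (p0) jump .Lloop

    After (r1 holds the trip count materialized in the preheader):
        loop0(.Lloop, r1)        // LOOP0_r: sets SA0 = .Lloop, LC0 = r1
      .Lloop:
        ...
      { ... } :endloop0          // hardware decrements LC0, branches to SA0

The compare and the conditional branch disappear from the loop body; the induction update is also removed when it has no remaining uses.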
MachineBasicBlock *BranchTarget = LastI->getOperand(1).getMBB(); LastI = LastMBB->erase(LastI); if (!L->contains(BranchTarget)) { - if (LastI != LastMBB->end()) { - TII->RemoveBranch(*LastMBB); - } + if (LastI != LastMBB->end()) + LastI = LastMBB->erase(LastI); SmallVector<MachineOperand, 0> Cond; TII->InsertBranch(*LastMBB, BranchTarget, 0, Cond, LastIDL); } @@ -545,110 +1142,414 @@ bool HexagonHardwareLoops::convertToHardwareLoop(MachineLoop *L) { } delete TripCount; + // The induction operation and the comparison may now be + // unneeded. If these are unneeded, then remove them. + for (unsigned i = 0; i < OldInsts.size(); ++i) + removeIfDead(OldInsts[i]); + ++NumHWLoops; return true; } -/// createHexagonFixupHwLoops - Factory for creating the hardware loop -/// phase. -FunctionPass *llvm::createHexagonFixupHwLoops() { - return new HexagonFixupHwLoops(); + +bool HexagonHardwareLoops::orderBumpCompare(MachineInstr *BumpI, + MachineInstr *CmpI) { + assert (BumpI != CmpI && "Bump and compare in the same instruction?"); + + MachineBasicBlock *BB = BumpI->getParent(); + if (CmpI->getParent() != BB) + return false; + + typedef MachineBasicBlock::instr_iterator instr_iterator; + // Check if things are in order to begin with. + for (instr_iterator I = BumpI, E = BB->instr_end(); I != E; ++I) + if (&*I == CmpI) + return true; + + // Out of order. + unsigned PredR = CmpI->getOperand(0).getReg(); + bool FoundBump = false; + instr_iterator CmpIt = CmpI, NextIt = llvm::next(CmpIt); + for (instr_iterator I = NextIt, E = BB->instr_end(); I != E; ++I) { + MachineInstr *In = &*I; + for (unsigned i = 0, n = In->getNumOperands(); i < n; ++i) { + MachineOperand &MO = In->getOperand(i); + if (MO.isReg() && MO.isUse()) { + if (MO.getReg() == PredR) // Found an intervening use of PredR. + return false; + } + } + + if (In == BumpI) { + instr_iterator After = BumpI; + instr_iterator From = CmpI; + BB->splice(llvm::next(After), BB, From); + FoundBump = true; + break; + } + } + assert (FoundBump && "Cannot determine instruction order"); + return FoundBump; } -bool HexagonFixupHwLoops::runOnMachineFunction(MachineFunction &MF) { - DEBUG(dbgs() << "****** Hexagon Hardware Loop Fixup ******\n"); - bool Changed = fixupLoopInstrs(MF); - return Changed; +MachineInstr *HexagonHardwareLoops::defWithImmediate(unsigned R) { + MachineInstr *DI = MRI->getVRegDef(R); + unsigned DOpc = DI->getOpcode(); + switch (DOpc) { + case Hexagon::TFRI: + case Hexagon::TFRI64: + case Hexagon::CONST32_Int_Real: + case Hexagon::CONST64_Int_Real: + return DI; + } + return 0; } -/// fixupLoopInsts - For Hexagon, if the loop label is to far from the -/// loop instruction then we need to set the LC0 and SA0 registers -/// explicitly instead of using LOOP(start,count). This function -/// checks the distance, and generates register assignments if needed. -/// -/// This function makes two passes over the basic blocks. The first -/// pass computes the offset of the basic block from the start. -/// The second pass checks all the loop instructions. -bool HexagonFixupHwLoops::fixupLoopInstrs(MachineFunction &MF) { - - // Offset of the current instruction from the start. - unsigned InstOffset = 0; - // Map for each basic block to it's first instruction. - DenseMap<MachineBasicBlock*, unsigned> BlockToInstOffset; - - // First pass - compute the offset of each basic block. 
- for (MachineFunction::iterator MBB = MF.begin(), MBBe = MF.end(); - MBB != MBBe; ++MBB) { - BlockToInstOffset[MBB] = InstOffset; - InstOffset += (MBB->size() * 4); - } - - // Second pass - check each loop instruction to see if it needs to - // be converted. - InstOffset = 0; - bool Changed = false; - RegScavenger RS; - - // Loop over all the basic blocks. - for (MachineFunction::iterator MBB = MF.begin(), MBBe = MF.end(); - MBB != MBBe; ++MBB) { - InstOffset = BlockToInstOffset[MBB]; - RS.enterBasicBlock(MBB); - - // Loop over all the instructions. - MachineBasicBlock::iterator MIE = MBB->end(); - MachineBasicBlock::iterator MII = MBB->begin(); - while (MII != MIE) { - if (isHardwareLoop(MII)) { - RS.forward(MII); - assert(MII->getOperand(0).isMBB() && - "Expect a basic block as loop operand"); - int diff = InstOffset - BlockToInstOffset[MII->getOperand(0).getMBB()]; - diff = (diff > 0 ? diff : -diff); - if ((unsigned)diff > MAX_LOOP_DISTANCE) { - // Convert to explicity setting LC0 and SA0. - convertLoopInstr(MF, MII, RS); - MII = MBB->erase(MII); - Changed = true; - } else { - ++MII; + +int64_t HexagonHardwareLoops::getImmediate(MachineOperand &MO) { + if (MO.isImm()) + return MO.getImm(); + assert(MO.isReg()); + unsigned R = MO.getReg(); + MachineInstr *DI = defWithImmediate(R); + assert(DI && "Need an immediate operand"); + // All currently supported "define-with-immediate" instructions have the + // actual immediate value in the operand(1). + int64_t v = DI->getOperand(1).getImm(); + return v; +} + + +void HexagonHardwareLoops::setImmediate(MachineOperand &MO, int64_t Val) { + if (MO.isImm()) { + MO.setImm(Val); + return; + } + + assert(MO.isReg()); + unsigned R = MO.getReg(); + MachineInstr *DI = defWithImmediate(R); + if (MRI->hasOneNonDBGUse(R)) { + // If R has only one use, then just change its defining instruction to + // the new immediate value. + DI->getOperand(1).setImm(Val); + return; + } + + const TargetRegisterClass *RC = MRI->getRegClass(R); + unsigned NewR = MRI->createVirtualRegister(RC); + MachineBasicBlock &B = *DI->getParent(); + DebugLoc DL = DI->getDebugLoc(); + BuildMI(B, DI, DL, TII->get(DI->getOpcode()), NewR) + .addImm(Val); + MO.setReg(NewR); +} + + +bool HexagonHardwareLoops::fixupInductionVariable(MachineLoop *L) { + MachineBasicBlock *Header = L->getHeader(); + MachineBasicBlock *Preheader = L->getLoopPreheader(); + MachineBasicBlock *Latch = L->getLoopLatch(); + + if (!Header || !Preheader || !Latch) + return false; + + // These data structures follow the same concept as the corresponding + // ones in findInductionRegister (where some comments are). + typedef std::pair<unsigned,int64_t> RegisterBump; + typedef std::pair<unsigned,RegisterBump> RegisterInduction; + typedef std::set<RegisterInduction> RegisterInductionSet; + + // Register candidates for induction variables, with their associated bumps. + RegisterInductionSet IndRegs; + + // Look for induction patterns: + // vreg1 = PHI ..., [ latch, vreg2 ] + // vreg2 = ADD vreg1, imm + typedef MachineBasicBlock::instr_iterator instr_iterator; + for (instr_iterator I = Header->instr_begin(), E = Header->instr_end(); + I != E && I->isPHI(); ++I) { + MachineInstr *Phi = &*I; + + // Have a PHI instruction. 
+ for (unsigned i = 1, n = Phi->getNumOperands(); i < n; i += 2) { + if (Phi->getOperand(i+1).getMBB() != Latch) + continue; + + unsigned PhiReg = Phi->getOperand(i).getReg(); + MachineInstr *DI = MRI->getVRegDef(PhiReg); + unsigned UpdOpc = DI->getOpcode(); + bool isAdd = (UpdOpc == Hexagon::ADD_ri); + + if (isAdd) { + // If the register operand to the add/sub is the PHI we are looking + // at, this meets the induction pattern. + unsigned IndReg = DI->getOperand(1).getReg(); + if (MRI->getVRegDef(IndReg) == Phi) { + unsigned UpdReg = DI->getOperand(0).getReg(); + int64_t V = DI->getOperand(2).getImm(); + IndRegs.insert(std::make_pair(UpdReg, std::make_pair(IndReg, V))); } - } else { - ++MII; } - InstOffset += 4; + } // for (i) + } // for (instr) + + if (IndRegs.empty()) + return false; + + MachineBasicBlock *TB = 0, *FB = 0; + SmallVector<MachineOperand,2> Cond; + // AnalyzeBranch returns true if it fails to analyze branch. + bool NotAnalyzed = TII->AnalyzeBranch(*Latch, TB, FB, Cond, false); + if (NotAnalyzed) + return false; + + // Check if the latch branch is unconditional. + if (Cond.empty()) + return false; + + if (TB != Header && FB != Header) + // The latch does not go back to the header. Not a latch we know and love. + return false; + + // Expecting a predicate register as a condition. It won't be a hardware + // predicate register at this point yet, just a vreg. + // HexagonInstrInfo::AnalyzeBranch for negated branches inserts imm(0) + // into Cond, followed by the predicate register. For non-negated branches + // it's just the register. + unsigned CSz = Cond.size(); + if (CSz != 1 && CSz != 2) + return false; + + unsigned P = Cond[CSz-1].getReg(); + MachineInstr *PredDef = MRI->getVRegDef(P); + + if (!PredDef->isCompare()) + return false; + + SmallSet<unsigned,2> CmpRegs; + MachineOperand *CmpImmOp = 0; + + // Go over all operands to the compare and look for immediate and register + // operands. Assume that if the compare has a single register use and a + // single immediate operand, then the register is being compared with the + // immediate value. + for (unsigned i = 0, n = PredDef->getNumOperands(); i < n; ++i) { + MachineOperand &MO = PredDef->getOperand(i); + if (MO.isReg()) { + // Skip all implicit references. In one case there was: + // %vreg140<def> = FCMPUGT32_rr %vreg138, %vreg139, %USR<imp-use> + if (MO.isImplicit()) + continue; + if (MO.isUse()) { + unsigned R = MO.getReg(); + if (!defWithImmediate(R)) { + CmpRegs.insert(MO.getReg()); + continue; + } + // Consider the register to be the "immediate" operand. + if (CmpImmOp) + return false; + CmpImmOp = &MO; + } + } else if (MO.isImm()) { + if (CmpImmOp) // A second immediate argument? Confusing. Bail out. + return false; + CmpImmOp = &MO; } } - return Changed; + if (CmpRegs.empty()) + return false; + + // Check if the compared register follows the order we want. Fix if needed. + for (RegisterInductionSet::iterator I = IndRegs.begin(), E = IndRegs.end(); + I != E; ++I) { + // This is a success. If the register used in the comparison is one that + // we have identified as a bumped (updated) induction register, there is + // nothing to do. + if (CmpRegs.count(I->first)) + return true; + + // Otherwise, if the register being compared comes out of a PHI node, + // and has been recognized as following the induction pattern, and is + // compared against an immediate, we can fix it. 
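For example (illustrative, names invented): if the latch tests the pre-bump value, as in

    %iv.next = ADD_ri %iv, #1
    %p       = CMPGTUri %iv, #99       ; uses %iv rather than %iv.next

then, since iv == iv.next - 1, the test iv > 99 is equivalent to iv.next > 100, and the fixup below rewrites the compare to

    %p       = CMPGTUri %iv.next, #100

after making sure (via orderBumpCompare) that the add executes before the compare.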
+ const RegisterBump &RB = I->second; + if (CmpRegs.count(RB.first)) { + if (!CmpImmOp) + return false; + + int64_t CmpImm = getImmediate(*CmpImmOp); + int64_t V = RB.second; + if (V > 0 && CmpImm+V < CmpImm) // Overflow (64-bit). + return false; + if (V < 0 && CmpImm+V > CmpImm) // Overflow (64-bit). + return false; + CmpImm += V; + // Some forms of cmp-immediate allow u9 and s10. Assume the worst case + // scenario, i.e. an 8-bit value. + if (CmpImmOp->isImm() && !isInt<8>(CmpImm)) + return false; + + // Make sure that the compare happens after the bump. Otherwise, + // after the fixup, the compare would use a yet-undefined register. + MachineInstr *BumpI = MRI->getVRegDef(I->first); + bool Order = orderBumpCompare(BumpI, PredDef); + if (!Order) + return false; + + // Finally, fix the compare instruction. + setImmediate(*CmpImmOp, CmpImm); + for (unsigned i = 0, n = PredDef->getNumOperands(); i < n; ++i) { + MachineOperand &MO = PredDef->getOperand(i); + if (MO.isReg() && MO.getReg() == RB.first) { + MO.setReg(I->first); + return true; + } + } + } + } + return false; } -/// convertLoopInstr - convert a loop instruction to a sequence of instructions -/// that set the lc and sa register explicitly. -void HexagonFixupHwLoops::convertLoopInstr(MachineFunction &MF, - MachineBasicBlock::iterator &MII, - RegScavenger &RS) { - const TargetInstrInfo *TII = MF.getTarget().getInstrInfo(); - MachineBasicBlock *MBB = MII->getParent(); - DebugLoc DL = MII->getDebugLoc(); - unsigned Scratch = RS.scavengeRegister(&Hexagon::IntRegsRegClass, MII, 0); - - // First, set the LC0 with the trip count. - if (MII->getOperand(1).isReg()) { - // Trip count is a register - BuildMI(*MBB, MII, DL, TII->get(Hexagon::TFCR), Hexagon::LC0) - .addReg(MII->getOperand(1).getReg()); + +/// \brief Create a preheader for a given loop. +MachineBasicBlock *HexagonHardwareLoops::createPreheaderForLoop( + MachineLoop *L) { + if (MachineBasicBlock *TmpPH = L->getLoopPreheader()) + return TmpPH; + + MachineBasicBlock *Header = L->getHeader(); + MachineBasicBlock *Latch = L->getLoopLatch(); + MachineFunction *MF = Header->getParent(); + DebugLoc DL; + + if (!Latch || Header->hasAddressTaken()) + return 0; + + typedef MachineBasicBlock::instr_iterator instr_iterator; + typedef MachineBasicBlock::pred_iterator pred_iterator; + + // Verify that all existing predecessors have analyzable branches + // (or no branches at all). + typedef std::vector<MachineBasicBlock*> MBBVector; + MBBVector Preds(Header->pred_begin(), Header->pred_end()); + SmallVector<MachineOperand,2> Tmp1; + MachineBasicBlock *TB = 0, *FB = 0; + + if (TII->AnalyzeBranch(*Latch, TB, FB, Tmp1, false)) + return 0; + + for (MBBVector::iterator I = Preds.begin(), E = Preds.end(); I != E; ++I) { + MachineBasicBlock *PB = *I; + if (PB != Latch) { + bool NotAnalyzed = TII->AnalyzeBranch(*PB, TB, FB, Tmp1, false); + if (NotAnalyzed) + return 0; + } + } + + MachineBasicBlock *NewPH = MF->CreateMachineBasicBlock(); + MF->insert(Header, NewPH); + + if (Header->pred_size() > 2) { + // Ensure that the header has only two predecessors: the preheader and + // the loop latch. Any additional predecessors of the header should + // join at the newly created preheader. Inspect all PHI nodes from the + // header and create appropriate corresponding PHI nodes in the preheader. 
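To illustrate the rewiring (a sketch; names invented), a header PHI with outside predecessors B1 and B2 is split so that the outside values join in the new preheader:

    // header, before:
    %r = PHI [ %a, B1 ], [ %b, B2 ], [ %c, latch ]

    // after: new preheader merges the outside values...
    %p = PHI [ %a, B1 ], [ %b, B2 ]
    // ...and the header PHI is reduced to two incoming values:
    %r = PHI [ %p, preheader ], [ %c, latch ]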
+ + for (instr_iterator I = Header->instr_begin(), E = Header->instr_end(); + I != E && I->isPHI(); ++I) { + MachineInstr *PN = &*I; + + const MCInstrDesc &PD = TII->get(TargetOpcode::PHI); + MachineInstr *NewPN = MF->CreateMachineInstr(PD, DL); + NewPH->insert(NewPH->end(), NewPN); + + unsigned PR = PN->getOperand(0).getReg(); + const TargetRegisterClass *RC = MRI->getRegClass(PR); + unsigned NewPR = MRI->createVirtualRegister(RC); + NewPN->addOperand(MachineOperand::CreateReg(NewPR, true)); + + // Copy all non-latch operands of a header's PHI node to the newly + // created PHI node in the preheader. + for (unsigned i = 1, n = PN->getNumOperands(); i < n; i += 2) { + unsigned PredR = PN->getOperand(i).getReg(); + MachineBasicBlock *PredB = PN->getOperand(i+1).getMBB(); + if (PredB == Latch) + continue; + + NewPN->addOperand(MachineOperand::CreateReg(PredR, false)); + NewPN->addOperand(MachineOperand::CreateMBB(PredB)); + } + + // Remove copied operands from the old PHI node and add the value + // coming from the preheader's PHI. + for (int i = PN->getNumOperands()-2; i > 0; i -= 2) { + MachineBasicBlock *PredB = PN->getOperand(i+1).getMBB(); + if (PredB != Latch) { + PN->RemoveOperand(i+1); + PN->RemoveOperand(i); + } + } + PN->addOperand(MachineOperand::CreateReg(NewPR, false)); + PN->addOperand(MachineOperand::CreateMBB(NewPH)); + } + + } else { - // Trip count is an immediate. - BuildMI(*MBB, MII, DL, TII->get(Hexagon::TFRI), Scratch) - .addImm(MII->getOperand(1).getImm()); - BuildMI(*MBB, MII, DL, TII->get(Hexagon::TFCR), Hexagon::LC0) - .addReg(Scratch); - } - // Then, set the SA0 with the loop start address. - BuildMI(*MBB, MII, DL, TII->get(Hexagon::CONST32_Label), Scratch) - .addMBB(MII->getOperand(0).getMBB()); - BuildMI(*MBB, MII, DL, TII->get(Hexagon::TFCR), Hexagon::SA0).addReg(Scratch); + assert(Header->pred_size() == 2); + + // The header has only two predecessors, but the non-latch predecessor + // is not a preheader (e.g. it has other successors, etc.) + // In such a case we don't need any extra PHI nodes in the new preheader, + // all we need is to adjust existing PHIs in the header to now refer to + // the new preheader. + for (instr_iterator I = Header->instr_begin(), E = Header->instr_end(); + I != E && I->isPHI(); ++I) { + MachineInstr *PN = &*I; + for (unsigned i = 1, n = PN->getNumOperands(); i < n; i += 2) { + MachineOperand &MO = PN->getOperand(i+1); + if (MO.getMBB() != Latch) + MO.setMBB(NewPH); + } + } + } + + // "Reroute" the CFG edges to link in the new preheader. + // If any of the predecessors falls through to the header, insert a branch + // to the new preheader in that place. + SmallVector<MachineOperand,1> Tmp2; + SmallVector<MachineOperand,1> EmptyCond; + + TB = FB = 0; + + for (MBBVector::iterator I = Preds.begin(), E = Preds.end(); I != E; ++I) { + MachineBasicBlock *PB = *I; + if (PB != Latch) { + Tmp2.clear(); + bool NotAnalyzed = TII->AnalyzeBranch(*PB, TB, FB, Tmp2, false); + (void)NotAnalyzed; // suppress compiler warning + assert (!NotAnalyzed && "Should be analyzable!"); + if (TB != Header && (Tmp2.empty() || FB != Header)) + TII->InsertBranch(*PB, NewPH, 0, EmptyCond, DL); + PB->ReplaceUsesOfBlockWith(Header, NewPH); + } + } + + // It can happen that the latch block will fall through into the header. + // Insert an unconditional branch to the header.
+ TB = FB = 0; + bool LatchNotAnalyzed = TII->AnalyzeBranch(*Latch, TB, FB, Tmp2, false); + (void)LatchNotAnalyzed; // suppress compiler warning + assert (!LatchNotAnalyzed && "Should be analyzable!"); + if (!TB && !FB) + TII->InsertBranch(*Latch, Header, 0, EmptyCond, DL); + + // Finally, the branch from the preheader to the header. + TII->InsertBranch(*NewPH, Header, 0, EmptyCond, DL); + NewPH->addSuccessor(Header); + + return NewPH; } diff --git a/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp b/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp index db292f2..3a1c48b 100644 --- a/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp +++ b/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp @@ -15,18 +15,29 @@ #include "Hexagon.h" #include "HexagonISelLowering.h" #include "HexagonTargetMachine.h" -#include "llvm/CodeGen/SelectionDAGISel.h" +#include "llvm/ADT/DenseMap.h" #include "llvm/IR/Intrinsics.h" +#include "llvm/CodeGen/SelectionDAGISel.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" - using namespace llvm; +static +cl::opt<unsigned> +MaxNumOfUsesForConstExtenders("ga-max-num-uses-for-constant-extenders", + cl::Hidden, cl::init(2), + cl::desc("Maximum number of uses of a global address such that we still use a " + "constant-extended instruction")); //===----------------------------------------------------------------------===// // Instruction Selector Implementation //===----------------------------------------------------------------------===// +namespace llvm { + void initializeHexagonDAGToDAGISelPass(PassRegistry&); +} + //===--------------------------------------------------------------------===// /// HexagonDAGToDAGISel - Hexagon specific code to select Hexagon machine /// instructions for SelectionDAG operations. @@ -40,19 +51,24 @@ class HexagonDAGToDAGISel : public SelectionDAGISel { // Keep a reference to HexagonTargetMachine. HexagonTargetMachine& TM; const HexagonInstrInfo *TII; - + DenseMap<const GlobalValue *, unsigned> GlobalAddressUseCountMap; public: - explicit HexagonDAGToDAGISel(HexagonTargetMachine &targetmachine) - : SelectionDAGISel(targetmachine), + explicit HexagonDAGToDAGISel(HexagonTargetMachine &targetmachine, + CodeGenOpt::Level OptLevel) + : SelectionDAGISel(targetmachine, OptLevel), Subtarget(targetmachine.getSubtarget<HexagonSubtarget>()), TM(targetmachine), TII(static_cast<const HexagonInstrInfo*>(TM.getInstrInfo())) { - + initializeHexagonDAGToDAGISelPass(*PassRegistry::getPassRegistry()); } + bool hasNumUsesBelowThresGA(SDNode *N) const; SDNode *Select(SDNode *N); // Complex Pattern Selectors. + inline bool foldGlobalAddress(SDValue &N, SDValue &R); + inline bool foldGlobalAddressGP(SDValue &N, SDValue &R); + bool foldGlobalAddressImpl(SDValue &N, SDValue &R, bool ShouldLookForGP); bool SelectADDRri(SDValue& N, SDValue &R1, SDValue &R2); bool SelectADDRriS11_0(SDValue& N, SDValue &R1, SDValue &R2); bool SelectADDRriS11_1(SDValue& N, SDValue &R1, SDValue &R2); @@ -97,7 +113,14 @@ public: SDNode *SelectAdd(SDNode *N); bool isConstExtProfitable(SDNode *N) const; - // Include the pieces autogenerated from the target description. +// XformU7ToU7M1Imm - Return a target constant decremented by 1, in range +// [1..128], used in cmpb.gtu instructions. +inline SDValue XformU7ToU7M1Imm(signed Imm) { + assert((Imm >= 1 && Imm <= 128) && "Constant out of range for cmpb op"); + return CurDAG->getTargetConstant(Imm - 1, MVT::i8); +} + +// Include the pieces autogenerated from the target description.
 #include "HexagonGenDAGISel.inc"
 };
 }  // end anonymous namespace
@@ -106,10 +129,23 @@ public:
 /// createHexagonISelDag - This pass converts a legalized DAG into a
 /// Hexagon-specific DAG, ready for instruction scheduling.
 ///
-FunctionPass *llvm::createHexagonISelDag(HexagonTargetMachine &TM) {
-  return new HexagonDAGToDAGISel(TM);
+FunctionPass *llvm::createHexagonISelDag(HexagonTargetMachine &TM,
+                                         CodeGenOpt::Level OptLevel) {
+  return new HexagonDAGToDAGISel(TM, OptLevel);
+}
+
+static void initializePassOnce(PassRegistry &Registry) {
+  const char *Name = "Hexagon DAG->DAG Pattern Instruction Selection";
+  PassInfo *PI = new PassInfo(Name, "hexagon-isel",
+                              &SelectionDAGISel::ID, 0, false, false);
+  Registry.registerPass(*PI, true);
+}
+
+void llvm::initializeHexagonDAGToDAGISelPass(PassRegistry &Registry) {
+  CALL_ONCE_INITIALIZATION(initializePassOnce)
 }
 
+
 static bool IsS11_0_Offset(SDNode * S) {
   ConstantSDNode *N = cast<ConstantSDNode>(S);
@@ -608,8 +644,8 @@ SDNode *HexagonDAGToDAGISel::SelectIndexedStore(StoreSDNode *ST, DebugLoc dl) {
   // Offset value must be within representable range
   // and must have correct alignment properties.
   if (TII->isValidAutoIncImm(StoredVT, Val)) {
-    SDValue Ops[] = { Value, Base,
-                      CurDAG->getTargetConstant(Val, MVT::i32), Chain};
+    SDValue Ops[] = {Base, CurDAG->getTargetConstant(Val, MVT::i32), Value,
+                     Chain};
     unsigned Opcode = 0;
 
     // Figure out the post inc version of opcode.
@@ -1519,3 +1555,69 @@ bool HexagonDAGToDAGISel::isConstExtProfitable(SDNode *N) const {
 
   return (UseCount <= 1);
 }
+
+//===--------------------------------------------------------------------===//
+// Return 'true' if the use count of the global address is below threshold.
+//===--------------------------------------------------------------------===//
+bool HexagonDAGToDAGISel::hasNumUsesBelowThresGA(SDNode *N) const {
+  assert(N->getOpcode() == ISD::TargetGlobalAddress &&
+         "Expecting a target global address");
+
+  // Always try to fold the address.
+  if (TM.getOptLevel() == CodeGenOpt::Aggressive)
+    return true;
+
+  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(N);
+  DenseMap<const GlobalValue *, unsigned>::const_iterator GI =
+    GlobalAddressUseCountMap.find(GA->getGlobal());
+
+  if (GI == GlobalAddressUseCountMap.end())
+    return false;
+
+  return GI->second <= MaxNumOfUsesForConstExtenders;
+}
+
+//===--------------------------------------------------------------------===//
+// Return true if the non GP-relative global address can be folded.
+//===--------------------------------------------------------------------===//
+inline bool HexagonDAGToDAGISel::foldGlobalAddress(SDValue &N, SDValue &R) {
+  return foldGlobalAddressImpl(N, R, false);
+}
+
+//===--------------------------------------------------------------------===//
+// Return true if the GP-relative global address can be folded.
+//===--------------------------------------------------------------------===//
+inline bool HexagonDAGToDAGISel::foldGlobalAddressGP(SDValue &N, SDValue &R) {
+  return foldGlobalAddressImpl(N, R, true);
+}
+
+//===--------------------------------------------------------------------===//
+// Fold offset of the global address if the number of uses is below threshold.
+//===--------------------------------------------------------------------===// +bool HexagonDAGToDAGISel::foldGlobalAddressImpl(SDValue &N, SDValue &R, + bool ShouldLookForGP) { + if (N.getOpcode() == ISD::ADD) { + SDValue N0 = N.getOperand(0); + SDValue N1 = N.getOperand(1); + if ((ShouldLookForGP && (N0.getOpcode() == HexagonISD::CONST32_GP)) || + (!ShouldLookForGP && (N0.getOpcode() == HexagonISD::CONST32))) { + ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N1); + GlobalAddressSDNode *GA = + dyn_cast<GlobalAddressSDNode>(N0.getOperand(0)); + + if (Const && GA && + (GA->getOpcode() == ISD::TargetGlobalAddress)) { + if ((N0.getOpcode() == HexagonISD::CONST32) && + !hasNumUsesBelowThresGA(GA)) + return false; + R = CurDAG->getTargetGlobalAddress(GA->getGlobal(), + Const->getDebugLoc(), + N.getValueType(), + GA->getOffset() + + (uint64_t)Const->getSExtValue()); + return true; + } + } + } + return false; +} diff --git a/lib/Target/Hexagon/HexagonISelLowering.cpp b/lib/Target/Hexagon/HexagonISelLowering.cpp index 16cec5c..fac931a 100644 --- a/lib/Target/Hexagon/HexagonISelLowering.cpp +++ b/lib/Target/Hexagon/HexagonISelLowering.cpp @@ -304,15 +304,9 @@ HexagonTargetLowering::LowerReturn(SDValue Chain, // Analyze return values of ISD::RET CCInfo.AnalyzeReturn(Outs, RetCC_Hexagon); - // If this is the first return lowered for this function, add the regs to the - // liveout set for the function. - if (DAG.getMachineFunction().getRegInfo().liveout_empty()) { - for (unsigned i = 0; i != RVLocs.size(); ++i) - if (RVLocs[i].isRegLoc()) - DAG.getMachineFunction().getRegInfo().addLiveOut(RVLocs[i].getLocReg()); - } - SDValue Flag; + SmallVector<SDValue, 4> RetOps(1, Chain); + // Copy the result values into the output registers. for (unsigned i = 0; i != RVLocs.size(); ++i) { CCValAssign &VA = RVLocs[i]; @@ -321,12 +315,17 @@ HexagonTargetLowering::LowerReturn(SDValue Chain, // Guarantee that all emitted copies are stuck together with flags. Flag = Chain.getValue(1); + RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); } + RetOps[0] = Chain; // Update chain. + + // Add the flag if we have it. 
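The folding in foldGlobalAddressImpl above is plain reassociation, (CONST32 @g + base) + addend becoming one target address @g + (base + addend), with the non-GP-relative form gated by the per-global use count. A rough standalone sketch of that decision, with illustrative names standing in for the SelectionDAG types:

    #include <iostream>
    #include <map>
    #include <string>

    static const unsigned MaxUsesForConstExtenders = 2; // the cl::opt default

    // Decide whether "(CONST32 @g + Base) + Addend" may be folded into a
    // single target address "@g + (Base + Addend)".  GP-relative addresses
    // fold unconditionally; the plain form is gated by the per-global use
    // count, and is always allowed under aggressive optimization (the
    // CodeGenOpt::Aggressive check above).
    static bool tryFold(const std::map<std::string, unsigned> &UseCount,
                        const std::string &G, long Base, long Addend,
                        bool GPRelative, bool Aggressive, long &FoldedOffset) {
      if (!GPRelative && !Aggressive) {
        std::map<std::string, unsigned>::const_iterator I = UseCount.find(G);
        // Unknown globals are conservatively not folded; known ones must be
        // used at most MaxUsesForConstExtenders times.
        if (I == UseCount.end() || I->second > MaxUsesForConstExtenders)
          return false;
      }
      FoldedOffset = Base + Addend;
      return true;
    }

    int main() {
      std::map<std::string, unsigned> UseCount;
      UseCount["g"] = 2; // used twice: folding still profitable
      UseCount["h"] = 7; // used often: keep the shared CONST32 instead

      long Off = 0;
      std::cout << "g: " << tryFold(UseCount, "g", 16, 4, false, false, Off)
                << " offset " << Off << "\n"; // folds, offset 20
      std::cout << "h: " << tryFold(UseCount, "h", 0, 8, false, false, Off)
                << "\n";                      // not folded
      return 0;
    }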
if (Flag.getNode()) - return DAG.getNode(HexagonISD::RET_FLAG, dl, MVT::Other, Chain, Flag); + RetOps.push_back(Flag); - return DAG.getNode(HexagonISD::RET_FLAG, dl, MVT::Other, Chain); + return DAG.getNode(HexagonISD::RET_FLAG, dl, MVT::Other, + &RetOps[0], RetOps.size()); } @@ -1016,8 +1015,8 @@ SDValue HexagonTargetLowering::LowerGLOBALADDRESS(SDValue Op, DebugLoc dl = Op.getDebugLoc(); Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), Offset); - HexagonTargetObjectFile &TLOF = - (HexagonTargetObjectFile&)getObjFileLowering(); + const HexagonTargetObjectFile &TLOF = + static_cast<const HexagonTargetObjectFile &>(getObjFileLowering()); if (TLOF.IsGlobalInSmallSection(GV, getTargetMachine())) { return DAG.getNode(HexagonISD::CONST32_GP, dl, getPointerTy(), Result); } @@ -1053,8 +1052,8 @@ HexagonTargetLowering::HexagonTargetLowering(HexagonTargetMachine setPrefLoopAlignment(4); // Limits for inline expansion of memcpy/memmove - maxStoresPerMemcpy = 6; - maxStoresPerMemmove = 6; + MaxStoresPerMemcpy = 6; + MaxStoresPerMemmove = 6; // // Library calls for unsupported operations @@ -1364,11 +1363,18 @@ HexagonTargetLowering::HexagonTargetLowering(HexagonTargetMachine setOperationAction(ISD::FSIN , MVT::f32, Expand); setOperationAction(ISD::FCOS , MVT::f32, Expand); setOperationAction(ISD::FREM , MVT::f32, Expand); + setOperationAction(ISD::FSINCOS, MVT::f64, Expand); + setOperationAction(ISD::FSINCOS, MVT::f32, Expand); setOperationAction(ISD::CTPOP, MVT::i32, Expand); + setOperationAction(ISD::CTPOP, MVT::i64, Expand); setOperationAction(ISD::CTTZ , MVT::i32, Expand); + setOperationAction(ISD::CTTZ , MVT::i64, Expand); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand); setOperationAction(ISD::CTLZ , MVT::i32, Expand); + setOperationAction(ISD::CTLZ , MVT::i64, Expand); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand); setOperationAction(ISD::ROTL , MVT::i32, Expand); setOperationAction(ISD::ROTR , MVT::i32, Expand); setOperationAction(ISD::BSWAP, MVT::i32, Expand); diff --git a/lib/Target/Hexagon/HexagonISelLowering.h b/lib/Target/Hexagon/HexagonISelLowering.h index 5a415eb..65dab85 100644 --- a/lib/Target/Hexagon/HexagonISelLowering.h +++ b/lib/Target/Hexagon/HexagonISelLowering.h @@ -52,6 +52,8 @@ namespace llvm { WrapperCP, WrapperCombineII, WrapperCombineRR, + WrapperCombineRI_V4, + WrapperCombineIR_V4, WrapperPackhl, WrapperSplatB, WrapperSplatH, diff --git a/lib/Target/Hexagon/HexagonInstrFormats.td b/lib/Target/Hexagon/HexagonInstrFormats.td index 71c620b..587fa7d 100644 --- a/lib/Target/Hexagon/HexagonInstrFormats.td +++ b/lib/Target/Hexagon/HexagonInstrFormats.td @@ -13,19 +13,19 @@ // *** Must match HexagonBaseInfo.h *** //===----------------------------------------------------------------------===// -class Type<bits<5> t> { +class IType<bits<5> t> { bits<5> Value = t; } -def TypePSEUDO : Type<0>; -def TypeALU32 : Type<1>; -def TypeCR : Type<2>; -def TypeJR : Type<3>; -def TypeJ : Type<4>; -def TypeLD : Type<5>; -def TypeST : Type<6>; -def TypeSYSTEM : Type<7>; -def TypeXTYPE : Type<8>; -def TypeMARKER : Type<31>; +def TypePSEUDO : IType<0>; +def TypeALU32 : IType<1>; +def TypeCR : IType<2>; +def TypeJR : IType<3>; +def TypeJ : IType<4>; +def TypeLD : IType<5>; +def TypeST : IType<6>; +def TypeSYSTEM : IType<7>; +def TypeXTYPE : IType<8>; +def TypeENDLOOP: IType<31>; // Maintain list of valid subtargets for each instruction. 
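The new setOperationAction calls above mark the 64-bit bit-count nodes as Expand, meaning the legalizer must rewrite them in terms of operations the target does support instead of selecting a native instruction. For CTPOP the usual shape is divide-and-combine over the 32-bit halves; a sketch of that idea (the exact DAG the legalizer emits may differ):

    #include <cassert>
    #include <stdint.h>

    // One standard way a 64-bit CTPOP can be computed when it is not legal:
    // popcount of each 32-bit half, then add.  The 32-bit step is the classic
    // shift-and-mask (SWAR) reduction.
    static uint32_t popcount32(uint32_t x) {
      x = x - ((x >> 1) & 0x55555555u);
      x = (x & 0x33333333u) + ((x >> 2) & 0x33333333u);
      x = (x + (x >> 4)) & 0x0F0F0F0Fu;
      return (x * 0x01010101u) >> 24;
    }

    static uint32_t popcount64(uint64_t x) {
      return popcount32((uint32_t)x) + popcount32((uint32_t)(x >> 32));
    }

    int main() {
      assert(popcount64(0) == 0);
      assert(popcount64(0xFFFFFFFFFFFFFFFFull) == 64);
      assert(popcount64(0x8000000000000001ull) == 2);
      return 0;
    }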
 class SubTarget<bits<4> value> {
@@ -44,8 +44,8 @@ def HasV5SubT : SubTarget<0x8>;
 def NoV5SubT : SubTarget<0x7>;
 
 // Addressing modes for load/store instructions
-class AddrModeType<bits<4> value> {
-  bits<4> Value = value;
+class AddrModeType<bits<3> value> {
+  bits<3> Value = value;
 }
 
 def NoAddrMode     : AddrModeType<0>;  // No addressing mode
@@ -55,14 +55,35 @@ def BaseImmOffset  : AddrModeType<3>;  // Indirect with offset
 def BaseLongOffset : AddrModeType<4>;  // Indirect with long offset
 def BaseRegOffset  : AddrModeType<5>;  // Indirect with register offset
 
+class MemAccessSize<bits<3> value> {
+  bits<3> Value = value;
+}
+
+def NoMemAccess      : MemAccessSize<0>;// Not a memory access instruction.
+def ByteAccess       : MemAccessSize<1>;// Byte access instruction (memb).
+def HalfWordAccess   : MemAccessSize<2>;// Half word access instruction (memh).
+def WordAccess       : MemAccessSize<3>;// Word access instruction (memw).
+def DoubleWordAccess : MemAccessSize<4>;// Double word access instruction (memd)
+
+
 //===----------------------------------------------------------------------===//
 // Instruction Class Declaration +
 //===----------------------------------------------------------------------===//
-class InstHexagon<dag outs, dag ins, string asmstr, list<dag> pattern,
-                  string cstr, InstrItinClass itin, Type type> : Instruction {
-  field bits<32> Inst;
+class OpcodeHexagon {
+  field bits<32> Inst = ?; // Default to an invalid insn.
+  bits<4> IClass = 0; // ICLASS
+  bits<2> IParse = 0; // Parse bits.
+
+  let Inst{31-28} = IClass;
+  let Inst{15-14} = IParse;
+
+  bits<1> zero = 0;
+}
 
+class InstHexagon<dag outs, dag ins, string asmstr, list<dag> pattern,
+                  string cstr, InstrItinClass itin, IType type>
+  : Instruction, OpcodeHexagon {
   let Namespace = "Hexagon";
 
   dag OutOperandList = outs;
@@ -73,48 +94,63 @@ class InstHexagon<dag outs, dag ins, string asmstr, list<dag> pattern,
   let Itinerary = itin;
   let Size = 4;
 
-  // *** Must match HexagonBaseInfo.h ***
+  // *** Must match MCTargetDesc/HexagonBaseInfo.h ***
+
   // Instruction type according to the ISA.
-  Type HexagonType = type;
-  let TSFlags{4-0} = HexagonType.Value;
+  IType Type = type;
+  let TSFlags{4-0} = Type.Value;
+
   // Solo instructions, i.e., those that cannot be in a packet with others.
-  bits<1> isHexagonSolo = 0;
-  let TSFlags{5} = isHexagonSolo;
+  bits<1> isSolo = 0;
+  let TSFlags{5} = isSolo;
+
   // Predicated instructions.
   bits<1> isPredicated = 0;
   let TSFlags{6} = isPredicated;
+  bits<1> isPredicatedFalse = 0;
+  let TSFlags{7} = isPredicatedFalse;
   bits<1> isPredicatedNew = 0;
-  let TSFlags{7} = isPredicatedNew;
-
-  // Stores that can be newified.
+  let TSFlags{8} = isPredicatedNew;
+
+  // New-value insn helper fields.
+  bits<1> isNewValue = 0;
+  let TSFlags{9} = isNewValue; // New-value consumer insn.
+  bits<1> hasNewValue = 0;
+  let TSFlags{10} = hasNewValue; // New-value producer insn.
+  bits<3> opNewValue = 0;
+  let TSFlags{13-11} = opNewValue; // New-value produced operand.
+  bits<2> opNewBits = 0;
+  let TSFlags{15-14} = opNewBits; // New-value opcode bits location: 0, 8, 16.
   bits<1> isNVStorable = 0;
-  let TSFlags{8} = isNVStorable;
-
-  // New-value store instructions.
+  let TSFlags{16} = isNVStorable; // Store that can become new-value store.
   bits<1> isNVStore = 0;
-  let TSFlags{9} = isNVStore;
+  let TSFlags{17} = isNVStore; // New-value store insn.
 
   // Immediate extender helper fields.
   bits<1> isExtendable = 0;
-  let TSFlags{10} = isExtendable; // Insn may be extended.
+  let TSFlags{18} = isExtendable; // Insn may be extended.
bits<1> isExtended = 0; - let TSFlags{11} = isExtended; // Insn must be extended. + let TSFlags{19} = isExtended; // Insn must be extended. bits<3> opExtendable = 0; - let TSFlags{14-12} = opExtendable; // Which operand may be extended. + let TSFlags{22-20} = opExtendable; // Which operand may be extended. bits<1> isExtentSigned = 0; - let TSFlags{15} = isExtentSigned; // Signed or unsigned range. + let TSFlags{23} = isExtentSigned; // Signed or unsigned range. bits<5> opExtentBits = 0; - let TSFlags{20-16} = opExtentBits; //Number of bits of range before extending. + let TSFlags{28-24} = opExtentBits; //Number of bits of range before extending. // If an instruction is valid on a subtarget (v2-v5), set the corresponding // bit from validSubTargets. v2 is the least significant bit. // By default, instruction is valid on all subtargets. SubTarget validSubTargets = HasV2SubT; - let TSFlags{24-21} = validSubTargets.Value; + let TSFlags{32-29} = validSubTargets.Value; - // Addressing mode for load/store instrutions. + // Addressing mode for load/store instructions. AddrModeType addrMode = NoAddrMode; - let TSFlags{28-25} = addrMode.Value; + let TSFlags{35-33} = addrMode.Value; + + // Memory access size for mem access instructions (load/store) + MemAccessSize accessSize = NoMemAccess; + let TSFlags{38-36} = accessSize.Value; // Fields used for relation models. string BaseOpcode = ""; @@ -124,6 +160,11 @@ class InstHexagon<dag outs, dag ins, string asmstr, list<dag> pattern, string InputType = ""; // Input is "imm" or "reg" type. string isMEMri = "false"; // Set to "true" for load/store with MEMri operand. string isFloat = "false"; // Set to "true" for the floating-point load/store. + string isBrTaken = ""; // Set to "true"/"false" for jump instructions + + let PredSense = !if(isPredicated, !if(isPredicatedFalse, "false", "true"), + ""); + let PNewValue = !if(isPredicatedNew, "new", ""); // *** Must match MCTargetDesc/HexagonBaseInfo.h *** } @@ -134,187 +175,143 @@ class InstHexagon<dag outs, dag ins, string asmstr, list<dag> pattern, // LD Instruction Class in V2/V3/V4. // Definition of the instruction class NOT CHANGED. -class LDInst<dag outs, dag ins, string asmstr, list<dag> pattern> - : InstHexagon<outs, ins, asmstr, pattern, "", LD, TypeLD> { - bits<5> rd; - bits<5> rs; - bits<13> imm13; -} +class LDInst<dag outs, dag ins, string asmstr, list<dag> pattern = [], + string cstr = ""> + : InstHexagon<outs, ins, asmstr, pattern, cstr, LD, TypeLD>; -class LDInst2<dag outs, dag ins, string asmstr, list<dag> pattern> - : InstHexagon<outs, ins, asmstr, pattern, "", LD, TypeLD> { - bits<5> rd; - bits<5> rs; - bits<13> imm13; - let mayLoad = 1; -} +let mayLoad = 1 in +class LDInst2<dag outs, dag ins, string asmstr, list<dag> pattern = [], + string cstr = ""> + : LDInst<outs, ins, asmstr, pattern, cstr>; + +class CONSTLDInst<dag outs, dag ins, string asmstr, list<dag> pattern = [], + string cstr = ""> + : LDInst<outs, ins, asmstr, pattern, cstr>; // LD Instruction Class in V2/V3/V4. // Definition of the instruction class NOT CHANGED. 
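The TSFlags relayout above only works if the C++ side decodes the same bit positions, which is why the comments insist this file stay in sync with MCTargetDesc/HexagonBaseInfo.h. The decoding is ordinary shift-and-mask; a generic sketch using the new field positions (the accessor name is illustrative, not the real HexagonBaseInfo.h API):

    #include <cassert>
    #include <stdint.h>

    // Decode a 'Width'-bit field starting at bit 'Pos' from a TSFlags word,
    // matching "let TSFlags{Hi-Lo} = ..." in the .td file.
    static uint64_t getField(uint64_t TSFlags, unsigned Pos, unsigned Width) {
      return (TSFlags >> Pos) & ((1ull << Width) - 1);
    }

    int main() {
      // Build a flags word with Type{4-0} = TypeLD (5), isExtendable (bit 18)
      // set, opExtentBits{28-24} = 11, and addrMode{35-33} = BaseImmOffset (3).
      uint64_t TSFlags = 0;
      TSFlags |= 5ull;        // Type, bits 4-0
      TSFlags |= 1ull << 18;  // isExtendable
      TSFlags |= 11ull << 24; // opExtentBits
      TSFlags |= 3ull << 33;  // addrMode

      assert(getField(TSFlags, 0, 5) == 5);
      assert(getField(TSFlags, 18, 1) == 1);
      assert(getField(TSFlags, 24, 5) == 11);
      assert(getField(TSFlags, 33, 3) == 3);
      return 0;
    }

Note that fields such as addrMode now sit above bit 31, so TSFlags must be read as a 64-bit quantity.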
-class LDInstPost<dag outs, dag ins, string asmstr, list<dag> pattern, - string cstr> - : InstHexagon<outs, ins, asmstr, pattern, cstr, LD, TypeLD> { - bits<5> rd; - bits<5> rs; - bits<5> rt; - bits<13> imm13; -} +class LDInstPost<dag outs, dag ins, string asmstr, list<dag> pattern = [], + string cstr = ""> + : LDInst<outs, ins, asmstr, pattern, cstr>; + +let mayLoad = 1 in +class LD0Inst<dag outs, dag ins, string asmstr, list<dag> pattern = [], + string cstr = ""> + : LDInst<outs, ins, asmstr, pattern, cstr>; // ST Instruction Class in V2/V3 can take SLOT0 only. // ST Instruction Class in V4 can take SLOT0 & SLOT1. // Definition of the instruction class CHANGED from V2/V3 to V4. -class STInst<dag outs, dag ins, string asmstr, list<dag> pattern> - : InstHexagon<outs, ins, asmstr, pattern, "", ST, TypeST> { - bits<5> rd; - bits<5> rs; - bits<13> imm13; -} +let mayStore = 1 in +class STInst<dag outs, dag ins, string asmstr, list<dag> pattern = [], + string cstr = ""> + : InstHexagon<outs, ins, asmstr, pattern, cstr, ST, TypeST>; -class STInst2<dag outs, dag ins, string asmstr, list<dag> pattern> - : InstHexagon<outs, ins, asmstr, pattern, "", ST, TypeST> { - bits<5> rd; - bits<5> rs; - bits<13> imm13; - let mayStore = 1; -} +class STInst2<dag outs, dag ins, string asmstr, list<dag> pattern = [], + string cstr = ""> + : STInst<outs, ins, asmstr, pattern, cstr>; -// SYSTEM Instruction Class in V4 can take SLOT0 only -// In V2/V3 we used ST for this but in v4 ST can take SLOT0 or SLOT1. -class SYSInst<dag outs, dag ins, string asmstr, list<dag> pattern> - : InstHexagon<outs, ins, asmstr, pattern, "", SYS, TypeSYSTEM> { - bits<5> rd; - bits<5> rs; - bits<13> imm13; -} +let mayStore = 1 in +class ST0Inst<dag outs, dag ins, string asmstr, list<dag> pattern = [], + string cstr = ""> + : InstHexagon<outs, ins, asmstr, pattern, cstr, ST0, TypeST>; // ST Instruction Class in V2/V3 can take SLOT0 only. // ST Instruction Class in V4 can take SLOT0 & SLOT1. // Definition of the instruction class CHANGED from V2/V3 to V4. -class STInstPost<dag outs, dag ins, string asmstr, list<dag> pattern, - string cstr> - : InstHexagon<outs, ins, asmstr, pattern, cstr, ST, TypeST> { - bits<5> rd; - bits<5> rs; - bits<5> rt; - bits<13> imm13; -} +class STInstPost<dag outs, dag ins, string asmstr, list<dag> pattern = [], + string cstr = ""> + : STInst<outs, ins, asmstr, pattern, cstr>; + +// SYSTEM Instruction Class in V4 can take SLOT0 only +// In V2/V3 we used ST for this but in v4 ST can take SLOT0 or SLOT1. +class SYSInst<dag outs, dag ins, string asmstr, list<dag> pattern = [], + string cstr = ""> + : InstHexagon<outs, ins, asmstr, pattern, cstr, SYS, TypeSYSTEM>; // ALU32 Instruction Class in V2/V3/V4. // Definition of the instruction class NOT CHANGED. -class ALU32Type<dag outs, dag ins, string asmstr, list<dag> pattern> - : InstHexagon<outs, ins, asmstr, pattern, "", ALU32, TypeALU32> { - bits<5> rd; - bits<5> rs; - bits<5> rt; - bits<16> imm16; - bits<16> imm16_2; -} +class ALU32Inst<dag outs, dag ins, string asmstr, list<dag> pattern = [], + string cstr = ""> + : InstHexagon<outs, ins, asmstr, pattern, cstr, ALU32, TypeALU32>; // ALU64 Instruction Class in V2/V3. // XTYPE Instruction Class in V4. // Definition of the instruction class NOT CHANGED. // Name of the Instruction Class changed from ALU64 to XTYPE from V2/V3 to V4. 
-class ALU64Type<dag outs, dag ins, string asmstr, list<dag> pattern> - : InstHexagon<outs, ins, asmstr, pattern, "", ALU64, TypeXTYPE> { - bits<5> rd; - bits<5> rs; - bits<5> rt; - bits<16> imm16; - bits<16> imm16_2; -} +class ALU64Inst<dag outs, dag ins, string asmstr, list<dag> pattern = [], + string cstr = ""> + : InstHexagon<outs, ins, asmstr, pattern, cstr, ALU64, TypeXTYPE>; + +class ALU64_acc<dag outs, dag ins, string asmstr, list<dag> pattern = [], + string cstr = ""> + : ALU64Inst<outs, ins, asmstr, pattern, cstr>; -class ALU64_acc<dag outs, dag ins, string asmstr, list<dag> pattern, - string cstr> - : InstHexagon<outs, ins, asmstr, pattern, cstr, ALU64, TypeXTYPE> { - bits<5> rd; - bits<5> rs; - bits<5> rt; - bits<16> imm16; - bits<16> imm16_2; -} // M Instruction Class in V2/V3. // XTYPE Instruction Class in V4. // Definition of the instruction class NOT CHANGED. // Name of the Instruction Class changed from M to XTYPE from V2/V3 to V4. -class MInst<dag outs, dag ins, string asmstr, list<dag> pattern> - : InstHexagon<outs, ins, asmstr, pattern, "", M, TypeXTYPE> { - bits<5> rd; - bits<5> rs; - bits<5> rt; -} +class MInst<dag outs, dag ins, string asmstr, list<dag> pattern = [], + string cstr = ""> + : InstHexagon<outs, ins, asmstr, pattern, cstr, M, TypeXTYPE>; // M Instruction Class in V2/V3. // XTYPE Instruction Class in V4. // Definition of the instruction class NOT CHANGED. // Name of the Instruction Class changed from M to XTYPE from V2/V3 to V4. -class MInst_acc<dag outs, dag ins, string asmstr, list<dag> pattern, - string cstr> - : InstHexagon<outs, ins, asmstr, pattern, cstr, M, TypeXTYPE> { - bits<5> rd; - bits<5> rs; - bits<5> rt; -} +class MInst_acc<dag outs, dag ins, string asmstr, list<dag> pattern = [], + string cstr = ""> + : MInst<outs, ins, asmstr, pattern, cstr>; // S Instruction Class in V2/V3. // XTYPE Instruction Class in V4. // Definition of the instruction class NOT CHANGED. // Name of the Instruction Class changed from S to XTYPE from V2/V3 to V4. -class SInst<dag outs, dag ins, string asmstr, list<dag> pattern> - : InstHexagon<outs, ins, asmstr, pattern, "", S, TypeXTYPE> { - bits<5> rd; - bits<5> rs; - bits<5> rt; -} +class SInst<dag outs, dag ins, string asmstr, list<dag> pattern = [], + string cstr = ""> + : InstHexagon<outs, ins, asmstr, pattern, cstr, S, TypeXTYPE>; // S Instruction Class in V2/V3. // XTYPE Instruction Class in V4. // Definition of the instruction class NOT CHANGED. // Name of the Instruction Class changed from S to XTYPE from V2/V3 to V4. -class SInst_acc<dag outs, dag ins, string asmstr, list<dag> pattern, - string cstr> - : InstHexagon<outs, ins, asmstr, pattern, cstr, S, TypeXTYPE> { -// : InstHexagon<outs, ins, asmstr, pattern, cstr, S> { -// : InstHexagon<outs, ins, asmstr, pattern, cstr, !if(V4T, XTYPE_V4, S)> { - bits<5> rd; - bits<5> rs; - bits<5> rt; -} +class SInst_acc<dag outs, dag ins, string asmstr, list<dag> pattern = [], + string cstr = ""> + : SInst<outs, ins, asmstr, pattern, cstr>; // J Instruction Class in V2/V3/V4. // Definition of the instruction class NOT CHANGED. -class JType<dag outs, dag ins, string asmstr, list<dag> pattern> - : InstHexagon<outs, ins, asmstr, pattern, "", J, TypeJ> { - bits<16> imm16; -} +class JInst<dag outs, dag ins, string asmstr, list<dag> pattern = [], + string cstr = ""> + : InstHexagon<outs, ins, asmstr, pattern, cstr, J, TypeJ>; // JR Instruction Class in V2/V3/V4. // Definition of the instruction class NOT CHANGED. 
-class JRType<dag outs, dag ins, string asmstr, list<dag> pattern>
-  : InstHexagon<outs, ins, asmstr, pattern, "", JR, TypeJR> {
-  bits<5> rs;
-  bits<5> pu; // Predicate register
-}
+class JRInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+             string cstr = "">
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, JR, TypeJR>;
 
 // CR Instruction Class in V2/V3/V4.
 // Definition of the instruction class NOT CHANGED.
-class CRInst<dag outs, dag ins, string asmstr, list<dag> pattern>
-  : InstHexagon<outs, ins, asmstr, pattern, "", CR, TypeCR> {
-  bits<5> rs;
-  bits<10> imm10;
-}
+class CRInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+             string cstr = "">
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, CR, TypeCR>;
 
-class Marker<dag outs, dag ins, string asmstr, list<dag> pattern>
-  : InstHexagon<outs, ins, asmstr, pattern, "", MARKER, TypeMARKER> {
-  let isCodeGenOnly = 1;
-  let isPseudo = 1;
-}
+let isCodeGenOnly = 1, isPseudo = 1 in
+class Endloop<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+              string cstr = "">
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, ENDLOOP, TypeENDLOOP>;
 
-class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
-  : InstHexagon<outs, ins, asmstr, pattern, "", PSEUDO, TypePSEUDO> {
-  let isCodeGenOnly = 1;
-  let isPseudo = 1;
-}
+let isCodeGenOnly = 1, isPseudo = 1 in
+class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+             string cstr = "">
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, PSEUDO, TypePSEUDO>;
+
+let isCodeGenOnly = 1, isPseudo = 1 in
+class PseudoM<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+              string cstr="">
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, PSEUDOM, TypePSEUDO>;
 
 //===----------------------------------------------------------------------===//
 // Instruction Classes Definitions -
@@ -324,75 +321,52 @@ class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
 //
 // ALU32 patterns
 //.
-class ALU32_rr<dag outs, dag ins, string asmstr, list<dag> pattern>
-  : ALU32Type<outs, ins, asmstr, pattern> {
-}
+class ALU32_rr<dag outs, dag ins, string asmstr, list<dag> pattern,
+               string cstr = "">
+  : ALU32Inst<outs, ins, asmstr, pattern, cstr>;
 
-class ALU32_ir<dag outs, dag ins, string asmstr, list<dag> pattern>
-  : ALU32Type<outs, ins, asmstr, pattern> {
-  let rt{0-4} = 0;
-}
+class ALU32_ir<dag outs, dag ins, string asmstr, list<dag> pattern,
+               string cstr = "">
+  : ALU32Inst<outs, ins, asmstr, pattern, cstr>;
 
-class ALU32_ri<dag outs, dag ins, string asmstr, list<dag> pattern>
-  : ALU32Type<outs, ins, asmstr, pattern> {
-  let rt{0-4} = 0;
-}
+class ALU32_ri<dag outs, dag ins, string asmstr, list<dag> pattern,
+               string cstr = "">
+  : ALU32Inst<outs, ins, asmstr, pattern, cstr>;
 
-class ALU32_ii<dag outs, dag ins, string asmstr, list<dag> pattern>
-  : ALU32Type<outs, ins, asmstr, pattern> {
-  let rt{0-4} = 0;
-}
+class ALU32_ii<dag outs, dag ins, string asmstr, list<dag> pattern,
+               string cstr = "">
+  : ALU32Inst<outs, ins, asmstr, pattern, cstr>;
 
 //
 // ALU64 patterns.
 //
-class ALU64_rr<dag outs, dag ins, string asmstr, list<dag> pattern>
-  : ALU64Type<outs, ins, asmstr, pattern> {
-}
-
-class ALU64_ri<dag outs, dag ins, string asmstr, list<dag> pattern>
-  : ALU64Type<outs, ins, asmstr, pattern> {
-  let rt{0-4} = 0;
-}
-
-// J Type Instructions.
-class JInst<dag outs, dag ins, string asmstr, list<dag> pattern>
-  : JType<outs, ins, asmstr, pattern> {
-}
-
-// JR type Instructions.
-class JRInst<dag outs, dag ins, string asmstr, list<dag> pattern>
-  : JRType<outs, ins, asmstr, pattern> {
-}
+class ALU64_rr<dag outs, dag ins, string asmstr, list<dag> pattern,
+               string cstr = "">
+  : ALU64Inst<outs, ins, asmstr, pattern, cstr>;
 
+class ALU64_ri<dag outs, dag ins, string asmstr, list<dag> pattern,
+               string cstr = "">
+  : ALU64Inst<outs, ins, asmstr, pattern, cstr>;
 
 // Post increment ST Instruction.
-class STInstPI<dag outs, dag ins, string asmstr, list<dag> pattern,
-               string cstr>
-  : STInstPost<outs, ins, asmstr, pattern, cstr> {
-  let rt{0-4} = 0;
-}
+class STInstPI<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+               string cstr = "">
+  : STInst<outs, ins, asmstr, pattern, cstr>;
 
-class STInst2PI<dag outs, dag ins, string asmstr, list<dag> pattern,
-                string cstr>
-  : STInstPost<outs, ins, asmstr, pattern, cstr> {
-  let rt{0-4} = 0;
-  let mayStore = 1;
-}
+let mayStore = 1 in
+class STInst2PI<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+                string cstr = "">
+  : STInst<outs, ins, asmstr, pattern, cstr>;
 
 // Post increment LD Instruction.
-class LDInstPI<dag outs, dag ins, string asmstr, list<dag> pattern,
-               string cstr>
-  : LDInstPost<outs, ins, asmstr, pattern, cstr> {
-  let rt{0-4} = 0;
-}
-
-class LDInst2PI<dag outs, dag ins, string asmstr, list<dag> pattern,
-                string cstr>
-  : LDInstPost<outs, ins, asmstr, pattern, cstr> {
-  let rt{0-4} = 0;
-  let mayLoad = 1;
-}
+class LDInstPI<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+               string cstr = "">
+  : LDInst<outs, ins, asmstr, pattern, cstr>;
+
+let mayLoad = 1 in
+class LDInst2PI<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+                string cstr = "">
+  : LDInst<outs, ins, asmstr, pattern, cstr>;
 
 //===----------------------------------------------------------------------===//
 // V4 Instruction Format Definitions +
 
diff --git a/lib/Target/Hexagon/HexagonInstrFormatsV4.td b/lib/Target/Hexagon/HexagonInstrFormatsV4.td
index 05f1e23..9fda0da 100644
--- a/lib/Target/Hexagon/HexagonInstrFormatsV4.td
+++ b/lib/Target/Hexagon/HexagonInstrFormatsV4.td
@@ -17,9 +17,9 @@
 // *** Must match BaseInfo.h ***
 //----------------------------------------------------------------------------//
 
-def TypeMEMOP  : Type<9>;
-def TypeNV     : Type<10>;
-def TypePREFIX : Type<30>;
+def TypeMEMOP  : IType<9>;
+def TypeNV     : IType<10>;
+def TypePREFIX : IType<30>;
 
 //----------------------------------------------------------------------------//
 // Instruction Classes Definitions +
@@ -28,36 +28,38 @@ def TypePREFIX : Type<30>;
 
 //
 // NV type instructions.
 //
-class NVInst_V4<dag outs, dag ins, string asmstr, list<dag> pattern>
-  : InstHexagon<outs, ins, asmstr, pattern, "", NV_V4, TypeNV> {
-  bits<5> rd;
-  bits<5> rs;
-  bits<13> imm13;
-}
+class NVInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+             string cstr = "">
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, NV_V4, TypeNV>;
+
+class NVInst_V4<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+                string cstr = "">
+  : NVInst<outs, ins, asmstr, pattern, cstr>;
 
 // Definition of Post increment new value store.
-class NVInstPost_V4<dag outs, dag ins, string asmstr, list<dag> pattern,
-                    string cstr>
-  : InstHexagon<outs, ins, asmstr, pattern, cstr, NV_V4, TypeNV> {
-  bits<5> rd;
-  bits<5> rs;
-  bits<5> rt;
-  bits<13> imm13;
-}
+class NVInstPost_V4<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+                    string cstr = "">
+  : NVInst<outs, ins, asmstr, pattern, cstr>;
 
 // Post increment ST Instruction.
-class NVInstPI_V4<dag outs, dag ins, string asmstr, list<dag> pattern, - string cstr> - : NVInstPost_V4<outs, ins, asmstr, pattern, cstr> { - let rt{0-4} = 0; -} +let mayStore = 1 in +class NVInstPI_V4<dag outs, dag ins, string asmstr, list<dag> pattern = [], + string cstr = ""> + : NVInst<outs, ins, asmstr, pattern, cstr>; + +// New-value conditional branch. +class NCJInst<dag outs, dag ins, string asmstr, list<dag> pattern = [], + string cstr = ""> + : NVInst<outs, ins, asmstr, pattern, cstr>; + +let mayLoad = 1, mayStore = 1 in +class MEMInst<dag outs, dag ins, string asmstr, list<dag> pattern = [], + string cstr = ""> + : InstHexagon<outs, ins, asmstr, pattern, cstr, MEM_V4, TypeMEMOP>; -class MEMInst_V4<dag outs, dag ins, string asmstr, list<dag> pattern> - : InstHexagon<outs, ins, asmstr, pattern, "", MEM_V4, TypeMEMOP> { - bits<5> rd; - bits<5> rs; - bits<6> imm6; -} +class MEMInst_V4<dag outs, dag ins, string asmstr, list<dag> pattern = [], + string cstr = ""> + : MEMInst<outs, ins, asmstr, pattern, cstr>; let isCodeGenOnly = 1 in class EXTENDERInst<dag outs, dag ins, string asmstr, list<dag> pattern = []> diff --git a/lib/Target/Hexagon/HexagonInstrInfo.cpp b/lib/Target/Hexagon/HexagonInstrInfo.cpp index 3b1ae09..d30cdda 100644 --- a/lib/Target/Hexagon/HexagonInstrInfo.cpp +++ b/lib/Target/Hexagon/HexagonInstrInfo.cpp @@ -305,6 +305,88 @@ unsigned HexagonInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { } +/// \brief For a comparison instruction, return the source registers in +/// \p SrcReg and \p SrcReg2 if having two register operands, and the value it +/// compares against in CmpValue. Return true if the comparison instruction +/// can be analyzed. +bool HexagonInstrInfo::analyzeCompare(const MachineInstr *MI, + unsigned &SrcReg, unsigned &SrcReg2, + int &Mask, int &Value) const { + unsigned Opc = MI->getOpcode(); + + // Set mask and the first source register. + switch (Opc) { + case Hexagon::CMPEHexagon4rr: + case Hexagon::CMPEQri: + case Hexagon::CMPEQrr: + case Hexagon::CMPGT64rr: + case Hexagon::CMPGTU64rr: + case Hexagon::CMPGTUri: + case Hexagon::CMPGTUrr: + case Hexagon::CMPGTri: + case Hexagon::CMPGTrr: + case Hexagon::CMPLTUrr: + case Hexagon::CMPLTrr: + SrcReg = MI->getOperand(1).getReg(); + Mask = ~0; + break; + case Hexagon::CMPbEQri_V4: + case Hexagon::CMPbEQrr_sbsb_V4: + case Hexagon::CMPbEQrr_ubub_V4: + case Hexagon::CMPbGTUri_V4: + case Hexagon::CMPbGTUrr_V4: + case Hexagon::CMPbGTrr_V4: + SrcReg = MI->getOperand(1).getReg(); + Mask = 0xFF; + break; + case Hexagon::CMPhEQri_V4: + case Hexagon::CMPhEQrr_shl_V4: + case Hexagon::CMPhEQrr_xor_V4: + case Hexagon::CMPhGTUri_V4: + case Hexagon::CMPhGTUrr_V4: + case Hexagon::CMPhGTrr_shl_V4: + SrcReg = MI->getOperand(1).getReg(); + Mask = 0xFFFF; + break; + } + + // Set the value/second source register. 
+ switch (Opc) { + case Hexagon::CMPEHexagon4rr: + case Hexagon::CMPEQrr: + case Hexagon::CMPGT64rr: + case Hexagon::CMPGTU64rr: + case Hexagon::CMPGTUrr: + case Hexagon::CMPGTrr: + case Hexagon::CMPbEQrr_sbsb_V4: + case Hexagon::CMPbEQrr_ubub_V4: + case Hexagon::CMPbGTUrr_V4: + case Hexagon::CMPbGTrr_V4: + case Hexagon::CMPhEQrr_shl_V4: + case Hexagon::CMPhEQrr_xor_V4: + case Hexagon::CMPhGTUrr_V4: + case Hexagon::CMPhGTrr_shl_V4: + case Hexagon::CMPLTUrr: + case Hexagon::CMPLTrr: + SrcReg2 = MI->getOperand(2).getReg(); + return true; + + case Hexagon::CMPEQri: + case Hexagon::CMPGTUri: + case Hexagon::CMPGTri: + case Hexagon::CMPbEQri_V4: + case Hexagon::CMPbGTUri_V4: + case Hexagon::CMPhEQri_V4: + case Hexagon::CMPhGTUri_V4: + SrcReg2 = 0; + Value = MI->getOperand(2).getImm(); + return true; + } + + return false; +} + + void HexagonInstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL, unsigned DestReg, unsigned SrcReg, @@ -344,6 +426,18 @@ void HexagonInstrInfo::copyPhysReg(MachineBasicBlock &MBB, BuildMI(MBB, I, DL, get(Hexagon::TFCR), DestReg).addReg(SrcReg); return; } + if (Hexagon::PredRegsRegClass.contains(SrcReg) && + Hexagon::IntRegsRegClass.contains(DestReg)) { + BuildMI(MBB, I, DL, get(Hexagon::TFR_RsPd), DestReg). + addReg(SrcReg, getKillRegState(KillSrc)); + return; + } + if (Hexagon::IntRegsRegClass.contains(SrcReg) && + Hexagon::PredRegsRegClass.contains(DestReg)) { + BuildMI(MBB, I, DL, get(Hexagon::TFR_PdRs), DestReg). + addReg(SrcReg, getKillRegState(KillSrc)); + return; + } llvm_unreachable("Unimplemented"); } @@ -608,30 +702,6 @@ bool HexagonInstrInfo::isExtended(const MachineInstr *MI) const { case Hexagon::STriw_abs_setimm_V4: // V4 global address load. - case Hexagon::LDrid_GP_cPt_V4 : - case Hexagon::LDrid_GP_cNotPt_V4 : - case Hexagon::LDrid_GP_cdnPt_V4 : - case Hexagon::LDrid_GP_cdnNotPt_V4 : - case Hexagon::LDrib_GP_cPt_V4 : - case Hexagon::LDrib_GP_cNotPt_V4 : - case Hexagon::LDrib_GP_cdnPt_V4 : - case Hexagon::LDrib_GP_cdnNotPt_V4 : - case Hexagon::LDriub_GP_cPt_V4 : - case Hexagon::LDriub_GP_cNotPt_V4 : - case Hexagon::LDriub_GP_cdnPt_V4 : - case Hexagon::LDriub_GP_cdnNotPt_V4 : - case Hexagon::LDrih_GP_cPt_V4 : - case Hexagon::LDrih_GP_cNotPt_V4 : - case Hexagon::LDrih_GP_cdnPt_V4 : - case Hexagon::LDrih_GP_cdnNotPt_V4 : - case Hexagon::LDriuh_GP_cPt_V4 : - case Hexagon::LDriuh_GP_cNotPt_V4 : - case Hexagon::LDriuh_GP_cdnPt_V4 : - case Hexagon::LDriuh_GP_cdnNotPt_V4 : - case Hexagon::LDriw_GP_cPt_V4 : - case Hexagon::LDriw_GP_cNotPt_V4 : - case Hexagon::LDriw_GP_cdnPt_V4 : - case Hexagon::LDriw_GP_cdnNotPt_V4 : case Hexagon::LDd_GP_cPt_V4 : case Hexagon::LDd_GP_cNotPt_V4 : case Hexagon::LDd_GP_cdnPt_V4 : @@ -658,22 +728,6 @@ bool HexagonInstrInfo::isExtended(const MachineInstr *MI) const { case Hexagon::LDw_GP_cdnNotPt_V4 : // V4 global address store. 
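Stepping back to analyzeCompare above: the Mask output tells a later peephole how much of the source register the compare actually inspects (~0 for word compares, 0xFF for cmpb, 0xFFFF for cmph). That is what keeps, say, a byte compare from being treated as equivalent to a word compare on the same registers. A standalone model of how a consumer might use those outputs (illustrative, not the actual optimizer):

    #include <cassert>

    // Mirrors the outputs of analyzeCompare: the registers compared, the
    // mask of inspected bits, and the immediate (when SrcReg2 == 0).
    struct CmpInfo {
      unsigned SrcReg;
      unsigned SrcReg2; // 0 when the second operand is an immediate
      int Mask;
      int Value;        // immediate operand, valid when SrcReg2 == 0
    };

    // Two compares are interchangeable only if they test the same operands
    // under the same mask; for immediate forms the constants must match too.
    static bool sameComparison(const CmpInfo &A, const CmpInfo &B) {
      return A.SrcReg == B.SrcReg && A.SrcReg2 == B.SrcReg2 &&
             A.Mask == B.Mask && (A.SrcReg2 != 0 || A.Value == B.Value);
    }

    int main() {
      CmpInfo WordCmp = { 1, 2, ~0, 0 };   // like CMPEQrr r1, r2
      CmpInfo ByteCmp = { 1, 2, 0xFF, 0 }; // like CMPbEQrr_* r1, r2
      CmpInfo ImmCmp  = { 1, 0, ~0, 42 };  // like CMPEQri r1, #42

      assert(!sameComparison(WordCmp, ByteCmp)); // masks differ
      assert(sameComparison(ImmCmp, ImmCmp));
      return 0;
    }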
- case Hexagon::STrid_GP_cPt_V4 : - case Hexagon::STrid_GP_cNotPt_V4 : - case Hexagon::STrid_GP_cdnPt_V4 : - case Hexagon::STrid_GP_cdnNotPt_V4 : - case Hexagon::STrib_GP_cPt_V4 : - case Hexagon::STrib_GP_cNotPt_V4 : - case Hexagon::STrib_GP_cdnPt_V4 : - case Hexagon::STrib_GP_cdnNotPt_V4 : - case Hexagon::STrih_GP_cPt_V4 : - case Hexagon::STrih_GP_cNotPt_V4 : - case Hexagon::STrih_GP_cdnPt_V4 : - case Hexagon::STrih_GP_cdnNotPt_V4 : - case Hexagon::STriw_GP_cPt_V4 : - case Hexagon::STriw_GP_cNotPt_V4 : - case Hexagon::STriw_GP_cdnPt_V4 : - case Hexagon::STriw_GP_cdnNotPt_V4 : case Hexagon::STd_GP_cPt_V4 : case Hexagon::STd_GP_cNotPt_V4 : case Hexagon::STd_GP_cdnPt_V4 : @@ -692,18 +746,6 @@ bool HexagonInstrInfo::isExtended(const MachineInstr *MI) const { case Hexagon::STw_GP_cdnNotPt_V4 : // V4 predicated global address new value store. - case Hexagon::STrib_GP_cPt_nv_V4 : - case Hexagon::STrib_GP_cNotPt_nv_V4 : - case Hexagon::STrib_GP_cdnPt_nv_V4 : - case Hexagon::STrib_GP_cdnNotPt_nv_V4 : - case Hexagon::STrih_GP_cPt_nv_V4 : - case Hexagon::STrih_GP_cNotPt_nv_V4 : - case Hexagon::STrih_GP_cdnPt_nv_V4 : - case Hexagon::STrih_GP_cdnNotPt_nv_V4 : - case Hexagon::STriw_GP_cPt_nv_V4 : - case Hexagon::STriw_GP_cNotPt_nv_V4 : - case Hexagon::STriw_GP_cdnPt_nv_V4 : - case Hexagon::STriw_GP_cdnNotPt_nv_V4 : case Hexagon::STb_GP_cPt_nv_V4 : case Hexagon::STb_GP_cNotPt_nv_V4 : case Hexagon::STb_GP_cdnPt_nv_V4 : @@ -1095,7 +1137,6 @@ bool HexagonInstrInfo::isNewValueStore(const MachineInstr *MI) const { case Hexagon::STrib_indexed_nv_V4: case Hexagon::STrib_indexed_shl_nv_V4: case Hexagon::STrib_shl_nv_V4: - case Hexagon::STrib_GP_nv_V4: case Hexagon::STb_GP_nv_V4: case Hexagon::POST_STbri_nv_V4: case Hexagon::STrib_cPt_nv_V4: @@ -1118,10 +1159,6 @@ bool HexagonInstrInfo::isNewValueStore(const MachineInstr *MI) const { case Hexagon::STb_GP_cNotPt_nv_V4: case Hexagon::STb_GP_cdnPt_nv_V4: case Hexagon::STb_GP_cdnNotPt_nv_V4: - case Hexagon::STrib_GP_cPt_nv_V4: - case Hexagon::STrib_GP_cNotPt_nv_V4: - case Hexagon::STrib_GP_cdnPt_nv_V4: - case Hexagon::STrib_GP_cdnNotPt_nv_V4: case Hexagon::STrib_abs_nv_V4: case Hexagon::STrib_abs_cPt_nv_V4: case Hexagon::STrib_abs_cdnPt_nv_V4: @@ -1138,7 +1175,6 @@ bool HexagonInstrInfo::isNewValueStore(const MachineInstr *MI) const { case Hexagon::STrih_indexed_nv_V4: case Hexagon::STrih_indexed_shl_nv_V4: case Hexagon::STrih_shl_nv_V4: - case Hexagon::STrih_GP_nv_V4: case Hexagon::STh_GP_nv_V4: case Hexagon::POST_SThri_nv_V4: case Hexagon::STrih_cPt_nv_V4: @@ -1161,10 +1197,6 @@ bool HexagonInstrInfo::isNewValueStore(const MachineInstr *MI) const { case Hexagon::STh_GP_cNotPt_nv_V4: case Hexagon::STh_GP_cdnPt_nv_V4: case Hexagon::STh_GP_cdnNotPt_nv_V4: - case Hexagon::STrih_GP_cPt_nv_V4: - case Hexagon::STrih_GP_cNotPt_nv_V4: - case Hexagon::STrih_GP_cdnPt_nv_V4: - case Hexagon::STrih_GP_cdnNotPt_nv_V4: case Hexagon::STrih_abs_nv_V4: case Hexagon::STrih_abs_cPt_nv_V4: case Hexagon::STrih_abs_cdnPt_nv_V4: @@ -1181,7 +1213,6 @@ bool HexagonInstrInfo::isNewValueStore(const MachineInstr *MI) const { case Hexagon::STriw_indexed_nv_V4: case Hexagon::STriw_indexed_shl_nv_V4: case Hexagon::STriw_shl_nv_V4: - case Hexagon::STriw_GP_nv_V4: case Hexagon::STw_GP_nv_V4: case Hexagon::POST_STwri_nv_V4: case Hexagon::STriw_cPt_nv_V4: @@ -1204,10 +1235,6 @@ bool HexagonInstrInfo::isNewValueStore(const MachineInstr *MI) const { case Hexagon::STw_GP_cNotPt_nv_V4: case Hexagon::STw_GP_cdnPt_nv_V4: case Hexagon::STw_GP_cdnNotPt_nv_V4: - case Hexagon::STriw_GP_cPt_nv_V4: - case 
Hexagon::STriw_GP_cNotPt_nv_V4: - case Hexagon::STriw_GP_cdnPt_nv_V4: - case Hexagon::STriw_GP_cdnNotPt_nv_V4: case Hexagon::STriw_abs_nv_V4: case Hexagon::STriw_abs_cPt_nv_V4: case Hexagon::STriw_abs_cdnPt_nv_V4: @@ -1500,26 +1527,11 @@ unsigned HexagonInstrInfo::getInvertedPredicatedOpcode(const int Opc) const { return Hexagon::JMPR_cPt; // V4 indexed+scaled load. - case Hexagon::LDrid_indexed_cPt_V4: - return Hexagon::LDrid_indexed_cNotPt_V4; - case Hexagon::LDrid_indexed_cNotPt_V4: - return Hexagon::LDrid_indexed_cPt_V4; - case Hexagon::LDrid_indexed_shl_cPt_V4: return Hexagon::LDrid_indexed_shl_cNotPt_V4; case Hexagon::LDrid_indexed_shl_cNotPt_V4: return Hexagon::LDrid_indexed_shl_cPt_V4; - case Hexagon::LDrib_indexed_cPt_V4: - return Hexagon::LDrib_indexed_cNotPt_V4; - case Hexagon::LDrib_indexed_cNotPt_V4: - return Hexagon::LDrib_indexed_cPt_V4; - - case Hexagon::LDriub_indexed_cPt_V4: - return Hexagon::LDriub_indexed_cNotPt_V4; - case Hexagon::LDriub_indexed_cNotPt_V4: - return Hexagon::LDriub_indexed_cPt_V4; - case Hexagon::LDrib_indexed_shl_cPt_V4: return Hexagon::LDrib_indexed_shl_cNotPt_V4; case Hexagon::LDrib_indexed_shl_cNotPt_V4: @@ -1530,16 +1542,6 @@ unsigned HexagonInstrInfo::getInvertedPredicatedOpcode(const int Opc) const { case Hexagon::LDriub_indexed_shl_cNotPt_V4: return Hexagon::LDriub_indexed_shl_cPt_V4; - case Hexagon::LDrih_indexed_cPt_V4: - return Hexagon::LDrih_indexed_cNotPt_V4; - case Hexagon::LDrih_indexed_cNotPt_V4: - return Hexagon::LDrih_indexed_cPt_V4; - - case Hexagon::LDriuh_indexed_cPt_V4: - return Hexagon::LDriuh_indexed_cNotPt_V4; - case Hexagon::LDriuh_indexed_cNotPt_V4: - return Hexagon::LDriuh_indexed_cPt_V4; - case Hexagon::LDrih_indexed_shl_cPt_V4: return Hexagon::LDrih_indexed_shl_cNotPt_V4; case Hexagon::LDrih_indexed_shl_cNotPt_V4: @@ -1550,11 +1552,6 @@ unsigned HexagonInstrInfo::getInvertedPredicatedOpcode(const int Opc) const { case Hexagon::LDriuh_indexed_shl_cNotPt_V4: return Hexagon::LDriuh_indexed_shl_cPt_V4; - case Hexagon::LDriw_indexed_cPt_V4: - return Hexagon::LDriw_indexed_cNotPt_V4; - case Hexagon::LDriw_indexed_cNotPt_V4: - return Hexagon::LDriw_indexed_cPt_V4; - case Hexagon::LDriw_indexed_shl_cPt_V4: return Hexagon::LDriw_indexed_shl_cNotPt_V4; case Hexagon::LDriw_indexed_shl_cNotPt_V4: @@ -1680,26 +1677,6 @@ unsigned HexagonInstrInfo::getInvertedPredicatedOpcode(const int Opc) const { case Hexagon::STw_GP_cNotPt_V4: return Hexagon::STw_GP_cPt_V4; - case Hexagon::STrid_GP_cPt_V4: - return Hexagon::STrid_GP_cNotPt_V4; - case Hexagon::STrid_GP_cNotPt_V4: - return Hexagon::STrid_GP_cPt_V4; - - case Hexagon::STrib_GP_cPt_V4: - return Hexagon::STrib_GP_cNotPt_V4; - case Hexagon::STrib_GP_cNotPt_V4: - return Hexagon::STrib_GP_cPt_V4; - - case Hexagon::STrih_GP_cPt_V4: - return Hexagon::STrih_GP_cNotPt_V4; - case Hexagon::STrih_GP_cNotPt_V4: - return Hexagon::STrih_GP_cPt_V4; - - case Hexagon::STriw_GP_cPt_V4: - return Hexagon::STriw_GP_cNotPt_V4; - case Hexagon::STriw_GP_cNotPt_V4: - return Hexagon::STriw_GP_cPt_V4; - // Load. case Hexagon::LDrid_cPt: return Hexagon::LDrid_cNotPt; @@ -1965,75 +1942,26 @@ getMatchingCondBranchOpcode(int Opc, bool invertPredicate) const { Hexagon::JMPR_cNotPt; // V4 indexed+scaled load. - case Hexagon::LDrid_indexed_V4: - return !invertPredicate ? Hexagon::LDrid_indexed_cPt_V4 : - Hexagon::LDrid_indexed_cNotPt_V4; case Hexagon::LDrid_indexed_shl_V4: return !invertPredicate ? 
Hexagon::LDrid_indexed_shl_cPt_V4 : Hexagon::LDrid_indexed_shl_cNotPt_V4; - case Hexagon::LDrib_indexed_V4: - return !invertPredicate ? Hexagon::LDrib_indexed_cPt_V4 : - Hexagon::LDrib_indexed_cNotPt_V4; - case Hexagon::LDriub_indexed_V4: - return !invertPredicate ? Hexagon::LDriub_indexed_cPt_V4 : - Hexagon::LDriub_indexed_cNotPt_V4; - case Hexagon::LDriub_ae_indexed_V4: - return !invertPredicate ? Hexagon::LDriub_indexed_cPt_V4 : - Hexagon::LDriub_indexed_cNotPt_V4; case Hexagon::LDrib_indexed_shl_V4: return !invertPredicate ? Hexagon::LDrib_indexed_shl_cPt_V4 : Hexagon::LDrib_indexed_shl_cNotPt_V4; case Hexagon::LDriub_indexed_shl_V4: return !invertPredicate ? Hexagon::LDriub_indexed_shl_cPt_V4 : Hexagon::LDriub_indexed_shl_cNotPt_V4; - case Hexagon::LDriub_ae_indexed_shl_V4: - return !invertPredicate ? Hexagon::LDriub_indexed_shl_cPt_V4 : - Hexagon::LDriub_indexed_shl_cNotPt_V4; - case Hexagon::LDrih_indexed_V4: - return !invertPredicate ? Hexagon::LDrih_indexed_cPt_V4 : - Hexagon::LDrih_indexed_cNotPt_V4; - case Hexagon::LDriuh_indexed_V4: - return !invertPredicate ? Hexagon::LDriuh_indexed_cPt_V4 : - Hexagon::LDriuh_indexed_cNotPt_V4; - case Hexagon::LDriuh_ae_indexed_V4: - return !invertPredicate ? Hexagon::LDriuh_indexed_cPt_V4 : - Hexagon::LDriuh_indexed_cNotPt_V4; case Hexagon::LDrih_indexed_shl_V4: return !invertPredicate ? Hexagon::LDrih_indexed_shl_cPt_V4 : Hexagon::LDrih_indexed_shl_cNotPt_V4; case Hexagon::LDriuh_indexed_shl_V4: return !invertPredicate ? Hexagon::LDriuh_indexed_shl_cPt_V4 : Hexagon::LDriuh_indexed_shl_cNotPt_V4; - case Hexagon::LDriuh_ae_indexed_shl_V4: - return !invertPredicate ? Hexagon::LDriuh_indexed_shl_cPt_V4 : - Hexagon::LDriuh_indexed_shl_cNotPt_V4; - case Hexagon::LDriw_indexed_V4: - return !invertPredicate ? Hexagon::LDriw_indexed_cPt_V4 : - Hexagon::LDriw_indexed_cNotPt_V4; case Hexagon::LDriw_indexed_shl_V4: return !invertPredicate ? Hexagon::LDriw_indexed_shl_cPt_V4 : Hexagon::LDriw_indexed_shl_cNotPt_V4; // V4 Load from global address - case Hexagon::LDrid_GP_V4: - return !invertPredicate ? Hexagon::LDrid_GP_cPt_V4 : - Hexagon::LDrid_GP_cNotPt_V4; - case Hexagon::LDrib_GP_V4: - return !invertPredicate ? Hexagon::LDrib_GP_cPt_V4 : - Hexagon::LDrib_GP_cNotPt_V4; - case Hexagon::LDriub_GP_V4: - return !invertPredicate ? Hexagon::LDriub_GP_cPt_V4 : - Hexagon::LDriub_GP_cNotPt_V4; - case Hexagon::LDrih_GP_V4: - return !invertPredicate ? Hexagon::LDrih_GP_cPt_V4 : - Hexagon::LDrih_GP_cNotPt_V4; - case Hexagon::LDriuh_GP_V4: - return !invertPredicate ? Hexagon::LDriuh_GP_cPt_V4 : - Hexagon::LDriuh_GP_cNotPt_V4; - case Hexagon::LDriw_GP_V4: - return !invertPredicate ? Hexagon::LDriw_GP_cPt_V4 : - Hexagon::LDriw_GP_cNotPt_V4; - case Hexagon::LDd_GP_V4: return !invertPredicate ? Hexagon::LDd_GP_cPt_V4 : Hexagon::LDd_GP_cNotPt_V4; @@ -2116,19 +2044,6 @@ getMatchingCondBranchOpcode(int Opc, bool invertPredicate) const { Hexagon::STrid_indexed_shl_cNotPt_V4; // V4 Store to global address - case Hexagon::STrid_GP_V4: - return !invertPredicate ? Hexagon::STrid_GP_cPt_V4 : - Hexagon::STrid_GP_cNotPt_V4; - case Hexagon::STrib_GP_V4: - return !invertPredicate ? Hexagon::STrib_GP_cPt_V4 : - Hexagon::STrib_GP_cNotPt_V4; - case Hexagon::STrih_GP_V4: - return !invertPredicate ? Hexagon::STrih_GP_cPt_V4 : - Hexagon::STrih_GP_cNotPt_V4; - case Hexagon::STriw_GP_V4: - return !invertPredicate ? Hexagon::STriw_GP_cPt_V4 : - Hexagon::STriw_GP_cNotPt_V4; - case Hexagon::STd_GP_V4: return !invertPredicate ? 
Hexagon::STd_GP_cPt_V4 : Hexagon::STd_GP_cNotPt_V4; @@ -2215,38 +2130,141 @@ PredicateInstruction(MachineInstr *MI, assert (isPredicable(MI) && "Expected predicable instruction"); bool invertJump = (!Cond.empty() && Cond[0].isImm() && (Cond[0].getImm() == 0)); + + // This will change MI's opcode to its predicate version. + // However, its operand list is still the old one, i.e. the + // non-predicate one. MI->setDesc(get(getMatchingCondBranchOpcode(Opc, invertJump))); - // - // This assumes that the predicate is always the first operand - // in the set of inputs. - // - MI->addOperand(MI->getOperand(MI->getNumOperands()-1)); - int oper; - for (oper = MI->getNumOperands() - 3; oper >= 0; --oper) { - MachineOperand MO = MI->getOperand(oper); - if ((MO.isReg() && !MO.isUse() && !MO.isImplicit())) { - break; - } - if (MO.isReg()) { - MI->getOperand(oper+1).ChangeToRegister(MO.getReg(), MO.isDef(), - MO.isImplicit(), MO.isKill(), - MO.isDead(), MO.isUndef(), - MO.isDebug()); - } else if (MO.isImm()) { - MI->getOperand(oper+1).ChangeToImmediate(MO.getImm()); - } else { - llvm_unreachable("Unexpected operand type"); + int oper = -1; + unsigned int GAIdx = 0; + + // Indicates whether the current MI has a GlobalAddress operand + bool hasGAOpnd = false; + std::vector<MachineOperand> tmpOpnds; + + // Indicates whether we need to shift operands to right. + bool needShift = true; + + // The predicate is ALWAYS the FIRST input operand !!! + if (MI->getNumOperands() == 0) { + // The non-predicate version of MI does not take any operands, + // i.e. no outs and no ins. In this condition, the predicate + // operand will be directly placed at Operands[0]. No operand + // shift is needed. + // Example: BARRIER + needShift = false; + oper = -1; + } + else if ( MI->getOperand(MI->getNumOperands()-1).isReg() + && MI->getOperand(MI->getNumOperands()-1).isDef() + && !MI->getOperand(MI->getNumOperands()-1).isImplicit()) { + // The non-predicate version of MI does not have any input operands. + // In this condition, we extend the length of Operands[] by one and + // copy the original last operand to the newly allocated slot. + // At this moment, it is just a place holder. Later, we will put + // predicate operand directly into it. No operand shift is needed. + // Example: r0=BARRIER (this is a faked insn used here for illustration) + MI->addOperand(MI->getOperand(MI->getNumOperands()-1)); + needShift = false; + oper = MI->getNumOperands() - 2; + } + else { + // We need to right shift all input operands by one. Duplicate the + // last operand into the newly allocated slot. + MI->addOperand(MI->getOperand(MI->getNumOperands()-1)); + } + + if (needShift) + { + // Operands[ MI->getNumOperands() - 2 ] has been copied into + // Operands[ MI->getNumOperands() - 1 ], so we start from + // Operands[ MI->getNumOperands() - 3 ]. + // oper is a signed int. + // It is ok if "MI->getNumOperands()-3" is -3, -2, or -1. 
+ for (oper = MI->getNumOperands() - 3; oper >= 0; --oper) + { + MachineOperand &MO = MI->getOperand(oper); + + // Opnd[0] Opnd[1] Opnd[2] Opnd[3] Opnd[4] Opnd[5] Opnd[6] Opnd[7] + // <Def0> <Def1> <Use0> <Use1> <ImpDef0> <ImpDef1> <ImpUse0> <ImpUse1> + // /\~ + // /||\~ + // || + // Predicate Operand here + if (MO.isReg() && !MO.isUse() && !MO.isImplicit()) { + break; + } + if (MO.isReg()) { + MI->getOperand(oper+1).ChangeToRegister(MO.getReg(), MO.isDef(), + MO.isImplicit(), MO.isKill(), + MO.isDead(), MO.isUndef(), + MO.isDebug()); + } + else if (MO.isImm()) { + MI->getOperand(oper+1).ChangeToImmediate(MO.getImm()); + } + else if (MO.isGlobal()) { + // MI can not have more than one GlobalAddress operand. + assert(hasGAOpnd == false && "MI can only have one GlobalAddress opnd"); + + // There is no member function called "ChangeToGlobalAddress" in the + // MachineOperand class (not like "ChangeToRegister" and + // "ChangeToImmediate"). So we have to remove them from Operands[] list + // first, and then add them back after we have inserted the predicate + // operand. tmpOpnds[] is to remember these operands before we remove + // them. + tmpOpnds.push_back(MO); + + // Operands[oper] is a GlobalAddress operand; + // Operands[oper+1] has been copied into Operands[oper+2]; + hasGAOpnd = true; + GAIdx = oper; + continue; + } + else { + assert(false && "Unexpected operand type"); + } } } int regPos = invertJump ? 1 : 0; MachineOperand PredMO = Cond[regPos]; + + // [oper] now points to the last explicit Def. Predicate operand must be + // located at [oper+1]. See diagram above. + // This assumes that the predicate is always the first operand, + // i.e. Operands[0+numResults], in the set of inputs + // It is better to have an assert here to check this. But I don't know how + // to write this assert because findFirstPredOperandIdx() would return -1 + if (oper < -1) oper = -1; MI->getOperand(oper+1).ChangeToRegister(PredMO.getReg(), PredMO.isDef(), PredMO.isImplicit(), PredMO.isKill(), PredMO.isDead(), PredMO.isUndef(), PredMO.isDebug()); + if (hasGAOpnd) + { + unsigned int i; + + // Operands[GAIdx] is the original GlobalAddress operand, which is + // already copied into tmpOpnds[0]. + // Operands[GAIdx] now stores a copy of Operands[GAIdx-1] + // Operands[GAIdx+1] has already been copied into Operands[GAIdx+2], + // so we start from [GAIdx+2] + for (i = GAIdx + 2; i < MI->getNumOperands(); ++i) + tmpOpnds.push_back(MI->getOperand(i)); + + // Remove all operands in range [ (GAIdx+1) ... (MI->getNumOperands()-1) ] + // It is very important that we always remove from the end of Operands[] + // MI->getNumOperands() is at least 2 if program goes to here. 
+ for (i = MI->getNumOperands() - 1; i > GAIdx; --i) + MI->RemoveOperand(i); + + for (i = 0; i < tmpOpnds.size(); ++i) + MI->addOperand(tmpOpnds[i]); + } + return true; } @@ -2352,7 +2370,9 @@ isValidOffset(const int Opcode, const int Offset) const { switch(Opcode) { case Hexagon::LDriw: + case Hexagon::LDriw_indexed: case Hexagon::LDriw_f: + case Hexagon::STriw_indexed: case Hexagon::STriw: case Hexagon::STriw_f: assert((Offset % 4 == 0) && "Offset has incorrect alignment"); @@ -2360,8 +2380,10 @@ isValidOffset(const int Opcode, const int Offset) const { (Offset <= Hexagon_MEMW_OFFSET_MAX); case Hexagon::LDrid: + case Hexagon::LDrid_indexed: case Hexagon::LDrid_f: case Hexagon::STrid: + case Hexagon::STrid_indexed: case Hexagon::STrid_f: assert((Offset % 8 == 0) && "Offset has incorrect alignment"); return (Offset >= Hexagon_MEMD_OFFSET_MIN) && @@ -2435,6 +2457,9 @@ isValidOffset(const int Opcode, const int Offset) const { case Hexagon::LDriw_pred: return true; + case Hexagon::LOOP0_i: + return isUInt<10>(Offset); + // INLINEASM is very special. case Hexagon::INLINEASM: return true; @@ -2643,28 +2668,16 @@ isConditionalLoad (const MachineInstr* MI) const { case Hexagon::POST_LDriub_cPt : case Hexagon::POST_LDriub_cNotPt : return QRI.Subtarget.hasV4TOps(); - case Hexagon::LDrid_indexed_cPt_V4 : - case Hexagon::LDrid_indexed_cNotPt_V4 : case Hexagon::LDrid_indexed_shl_cPt_V4 : case Hexagon::LDrid_indexed_shl_cNotPt_V4 : - case Hexagon::LDrib_indexed_cPt_V4 : - case Hexagon::LDrib_indexed_cNotPt_V4 : case Hexagon::LDrib_indexed_shl_cPt_V4 : case Hexagon::LDrib_indexed_shl_cNotPt_V4 : - case Hexagon::LDriub_indexed_cPt_V4 : - case Hexagon::LDriub_indexed_cNotPt_V4 : case Hexagon::LDriub_indexed_shl_cPt_V4 : case Hexagon::LDriub_indexed_shl_cNotPt_V4 : - case Hexagon::LDrih_indexed_cPt_V4 : - case Hexagon::LDrih_indexed_cNotPt_V4 : case Hexagon::LDrih_indexed_shl_cPt_V4 : case Hexagon::LDrih_indexed_shl_cNotPt_V4 : - case Hexagon::LDriuh_indexed_cPt_V4 : - case Hexagon::LDriuh_indexed_cNotPt_V4 : case Hexagon::LDriuh_indexed_shl_cPt_V4 : case Hexagon::LDriuh_indexed_shl_cNotPt_V4 : - case Hexagon::LDriw_indexed_cPt_V4 : - case Hexagon::LDriw_indexed_cNotPt_V4 : case Hexagon::LDriw_indexed_shl_cPt_V4 : case Hexagon::LDriw_indexed_shl_cNotPt_V4 : return QRI.Subtarget.hasV4TOps(); @@ -2747,14 +2760,6 @@ isConditionalStore (const MachineInstr* MI) const { return QRI.Subtarget.hasV4TOps(); // V4 global address store before promoting to dot new. 
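The operand shuffle in PredicateInstruction above boils down to this: operands are laid out as defs followed by uses, and the predicate register must become the first use, so every use shifts right by one slot (with global-address operands detoured through a temporary list because they cannot be changed in place). A standalone model of just the shift, with illustrative names rather than the MachineInstr API:

    #include <cassert>
    #include <string>
    #include <vector>

    // Operands are laid out [defs..., uses...]; make PredReg the first use
    // by shifting every use one slot to the right.  Like the patch, this
    // duplicates the last operand into a newly grown slot, then copies
    // right-to-left before dropping the predicate into the hole.
    static void insertPredicate(std::vector<std::string> &Ops, size_t NumDefs,
                                const std::string &PredReg) {
      if (Ops.empty()) { // e.g. BARRIER: no outs, no ins
        Ops.push_back(PredReg);
        return;
      }
      Ops.push_back(Ops.back());           // grow by one slot
      for (size_t i = Ops.size() - 1; i > NumDefs; --i)
        Ops[i] = Ops[i - 1];               // shift uses right
      Ops[NumDefs] = PredReg;              // predicate is the first input
    }

    int main() {
      // r0 = add(r1, r2)  ->  if (p0) r0 = add(r1, r2)
      std::vector<std::string> Ops;
      Ops.push_back("r0"); // def
      Ops.push_back("r1"); // use
      Ops.push_back("r2"); // use
      insertPredicate(Ops, 1, "p0");
      assert(Ops.size() == 4 && Ops[1] == "p0" &&
             Ops[2] == "r1" && Ops[3] == "r2");
      return 0;
    }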
- case Hexagon::STrid_GP_cPt_V4 : - case Hexagon::STrid_GP_cNotPt_V4 : - case Hexagon::STrib_GP_cPt_V4 : - case Hexagon::STrib_GP_cNotPt_V4 : - case Hexagon::STrih_GP_cPt_V4 : - case Hexagon::STrih_GP_cNotPt_V4 : - case Hexagon::STriw_GP_cPt_V4 : - case Hexagon::STriw_GP_cNotPt_V4 : case Hexagon::STd_GP_cPt_V4 : case Hexagon::STd_GP_cNotPt_V4 : case Hexagon::STb_GP_cPt_V4 : diff --git a/lib/Target/Hexagon/HexagonInstrInfo.h b/lib/Target/Hexagon/HexagonInstrInfo.h index 29e3eb1..4e36dfb 100644 --- a/lib/Target/Hexagon/HexagonInstrInfo.h +++ b/lib/Target/Hexagon/HexagonInstrInfo.h @@ -66,6 +66,10 @@ public: const SmallVectorImpl<MachineOperand> &Cond, DebugLoc DL) const; + virtual bool analyzeCompare(const MachineInstr *MI, + unsigned &SrcReg, unsigned &SrcReg2, + int &Mask, int &Value) const; + virtual void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL, unsigned DestReg, unsigned SrcReg, diff --git a/lib/Target/Hexagon/HexagonInstrInfo.td b/lib/Target/Hexagon/HexagonInstrInfo.td index 8b183b9..082772a 100644 --- a/lib/Target/Hexagon/HexagonInstrInfo.td +++ b/lib/Target/Hexagon/HexagonInstrInfo.td @@ -808,7 +808,7 @@ let isBranch = 1, isTerminator=1, neverHasSideEffects = 1, Defs = [PC], // JR + //===----------------------------------------------------------------------===// def retflag : SDNode<"HexagonISD::RET_FLAG", SDTNone, - [SDNPHasChain, SDNPOptInGlue]>; + [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; // Jump to address from register. let isPredicable =1, isReturn = 1, isTerminator = 1, isBarrier = 1, @@ -1195,57 +1195,65 @@ let Defs = [R29, R30, R31], Uses = [R29], neverHasSideEffects = 1 in { //===----------------------------------------------------------------------===// // Multiply and use lower result. // Rd=+mpyi(Rs,#u8) -def MPYI_riu : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, u8Imm:$src2), +let isExtendable = 1, opExtendable = 2, isExtentSigned = 0, opExtentBits = 8 in +def MPYI_riu : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, u8Ext:$src2), "$dst =+ mpyi($src1, #$src2)", [(set (i32 IntRegs:$dst), (mul (i32 IntRegs:$src1), - u8ImmPred:$src2))]>; + u8ExtPred:$src2))]>; // Rd=-mpyi(Rs,#u8) -def MPYI_rin : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, n8Imm:$src2), +def MPYI_rin : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, u8Imm:$src2), "$dst =- mpyi($src1, #$src2)", - [(set (i32 IntRegs:$dst), (mul (i32 IntRegs:$src1), - n8ImmPred:$src2))]>; + [(set (i32 IntRegs:$dst), (ineg (mul (i32 IntRegs:$src1), + u8ImmPred:$src2)))]>; // Rd=mpyi(Rs,#m9) // s9 is NOT the same as m9 - but it works.. so far. // Assembler maps to either Rd=+mpyi(Rs,#u8 or Rd=-mpyi(Rs,#u8) // depending on the value of m9. See Arch Spec. 
-def MPYI_ri : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, s9Imm:$src2), +let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 9, +CextOpcode = "MPYI", InputType = "imm" in +def MPYI_ri : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, s9Ext:$src2), "$dst = mpyi($src1, #$src2)", [(set (i32 IntRegs:$dst), (mul (i32 IntRegs:$src1), - s9ImmPred:$src2))]>; + s9ExtPred:$src2))]>, ImmRegRel; // Rd=mpyi(Rs,Rt) +let CextOpcode = "MPYI", InputType = "reg" in def MPYI : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), "$dst = mpyi($src1, $src2)", [(set (i32 IntRegs:$dst), (mul (i32 IntRegs:$src1), - (i32 IntRegs:$src2)))]>; + (i32 IntRegs:$src2)))]>, ImmRegRel; // Rx+=mpyi(Rs,#u8) +let isExtendable = 1, opExtendable = 3, isExtentSigned = 0, opExtentBits = 8, +CextOpcode = "MPYI_acc", InputType = "imm" in def MPYI_acc_ri : MInst_acc<(outs IntRegs:$dst), - (ins IntRegs:$src1, IntRegs:$src2, u8Imm:$src3), + (ins IntRegs:$src1, IntRegs:$src2, u8Ext:$src3), "$dst += mpyi($src2, #$src3)", [(set (i32 IntRegs:$dst), - (add (mul (i32 IntRegs:$src2), u8ImmPred:$src3), + (add (mul (i32 IntRegs:$src2), u8ExtPred:$src3), (i32 IntRegs:$src1)))], - "$src1 = $dst">; + "$src1 = $dst">, ImmRegRel; // Rx+=mpyi(Rs,Rt) +let CextOpcode = "MPYI_acc", InputType = "reg" in def MPYI_acc_rr : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), "$dst += mpyi($src2, $src3)", [(set (i32 IntRegs:$dst), (add (mul (i32 IntRegs:$src2), (i32 IntRegs:$src3)), (i32 IntRegs:$src1)))], - "$src1 = $dst">; + "$src1 = $dst">, ImmRegRel; // Rx-=mpyi(Rs,#u8) +let isExtendable = 1, opExtendable = 3, isExtentSigned = 0, opExtentBits = 8 in def MPYI_sub_ri : MInst_acc<(outs IntRegs:$dst), - (ins IntRegs:$src1, IntRegs:$src2, u8Imm:$src3), + (ins IntRegs:$src1, IntRegs:$src2, u8Ext:$src3), "$dst -= mpyi($src2, #$src3)", [(set (i32 IntRegs:$dst), (sub (i32 IntRegs:$src1), (mul (i32 IntRegs:$src2), - u8ImmPred:$src3)))], + u8ExtPred:$src3)))], "$src1 = $dst">; // Multiply and use upper result. 
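The isExtendable/isExtentSigned/opExtentBits attributes attached to the mpyi forms above encode when an immediate still fits the instruction's native field and when it needs a constant-extender word in the packet. The test itself is simple range arithmetic; a sketch (the real checks live behind the TSFlags accessors, not in this form):

    #include <cassert>
    #include <stdint.h>

    // An operand with opExtentBits = N natively holds a signed N-bit value
    // when isExtentSigned is set, otherwise an unsigned N-bit value.  An
    // immediate outside that range requires a constant extender.
    static bool needsConstExtender(int64_t Imm, bool Signed, unsigned Bits) {
      if (Signed) {
        int64_t Min = -(1ll << (Bits - 1));
        int64_t Max = (1ll << (Bits - 1)) - 1;
        return Imm < Min || Imm > Max;
      }
      return Imm < 0 || Imm > (int64_t)((1ull << Bits) - 1);
    }

    int main() {
      // MPYI_ri: isExtentSigned = 1, opExtentBits = 9 (s9).
      assert(!needsConstExtender(255, true, 9)); // fits s9: [-256, 255]
      assert(needsConstExtender(256, true, 9));  // needs an extender
      // MPYI_riu: unsigned 8-bit field (u8).
      assert(!needsConstExtender(255, false, 8));
      assert(needsConstExtender(256, false, 8));
      return 0;
    }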
@@ -1314,7 +1322,7 @@ def MPYU64_acc : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, // Rxx-=mpyu(Rs,Rt) def MPYU64_sub : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3), - "$dst += mpyu($src2, $src3)", + "$dst -= mpyu($src2, $src3)", [(set (i64 DoubleRegs:$dst), (sub (i64 DoubleRegs:$src1), (mul (i64 (anyext (i32 IntRegs:$src2))), @@ -1322,37 +1330,43 @@ def MPYU64_sub : MInst_acc<(outs DoubleRegs:$dst), "$src1 = $dst">; +let InputType = "reg", CextOpcode = "ADD_acc" in def ADDrr_acc : MInst_acc<(outs IntRegs: $dst), (ins IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), "$dst += add($src2, $src3)", [(set (i32 IntRegs:$dst), (add (add (i32 IntRegs:$src2), (i32 IntRegs:$src3)), (i32 IntRegs:$src1)))], - "$src1 = $dst">; + "$src1 = $dst">, ImmRegRel; +let isExtendable = 1, opExtendable = 3, isExtentSigned = 1, opExtentBits = 8, +InputType = "imm", CextOpcode = "ADD_acc" in def ADDri_acc : MInst_acc<(outs IntRegs: $dst), (ins IntRegs:$src1, - IntRegs:$src2, s8Imm:$src3), + IntRegs:$src2, s8Ext:$src3), "$dst += add($src2, #$src3)", [(set (i32 IntRegs:$dst), (add (add (i32 IntRegs:$src2), - s8ImmPred:$src3), + s8_16ExtPred:$src3), (i32 IntRegs:$src1)))], - "$src1 = $dst">; + "$src1 = $dst">, ImmRegRel; +let CextOpcode = "SUB_acc", InputType = "reg" in def SUBrr_acc : MInst_acc<(outs IntRegs: $dst), (ins IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), "$dst -= add($src2, $src3)", [(set (i32 IntRegs:$dst), (sub (i32 IntRegs:$src1), (add (i32 IntRegs:$src2), (i32 IntRegs:$src3))))], - "$src1 = $dst">; + "$src1 = $dst">, ImmRegRel; +let isExtendable = 1, opExtendable = 3, isExtentSigned = 1, opExtentBits = 8, +CextOpcode = "SUB_acc", InputType = "imm" in def SUBri_acc : MInst_acc<(outs IntRegs: $dst), (ins IntRegs:$src1, - IntRegs:$src2, s8Imm:$src3), + IntRegs:$src2, s8Ext:$src3), "$dst -= add($src2, #$src3)", [(set (i32 IntRegs:$dst), (sub (i32 IntRegs:$src1), (add (i32 IntRegs:$src2), - s8ImmPred:$src3)))], - "$src1 = $dst">; + s8_16ExtPred:$src3)))], + "$src1 = $dst">, ImmRegRel; //===----------------------------------------------------------------------===// // MTYPE/MPYH - @@ -1405,35 +1419,71 @@ def STd_GP : STInst2<(outs), []>, Requires<[NoV4T]>; -let hasCtrlDep = 1, isPredicable = 1 in -def POST_STdri : STInstPI<(outs IntRegs:$dst), - (ins DoubleRegs:$src1, IntRegs:$src2, s4Imm:$offset), - "memd($src2++#$offset) = $src1", - [(set IntRegs:$dst, - (post_store (i64 DoubleRegs:$src1), (i32 IntRegs:$src2), - s4_3ImmPred:$offset))], - "$src2 = $dst">; +//===----------------------------------------------------------------------===// +// Post increment store +//===----------------------------------------------------------------------===// -// if ([!]Pv) memd(Rx++#s4:3)=Rtt -// if (Pv) memd(Rx++#s4:3)=Rtt -let AddedComplexity = 10, neverHasSideEffects = 1, - isPredicated = 1 in -def POST_STdri_cPt : STInst2PI<(outs IntRegs:$dst), - (ins PredRegs:$src1, DoubleRegs:$src2, IntRegs:$src3, - s4_3Imm:$offset), - "if ($src1) memd($src3++#$offset) = $src2", - [], - "$src3 = $dst">; - -// if (!Pv) memd(Rx++#s4:3)=Rtt -let AddedComplexity = 10, neverHasSideEffects = 1, isPredicated = 1, - isPredicated = 1 in -def POST_STdri_cNotPt : STInst2PI<(outs IntRegs:$dst), - (ins PredRegs:$src1, DoubleRegs:$src2, IntRegs:$src3, - s4_3Imm:$offset), - "if (!$src1) memd($src3++#$offset) = $src2", +multiclass ST_PostInc_Pbase<string mnemonic, RegisterClass RC, Operand ImmOp, + bit isNot, bit isPredNew> { + let PNewValue = !if(isPredNew, "new", "") in + def NAME : 
STInst2PI<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, ImmOp:$offset, RC:$src3), + !if(isNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ", + ") ")#mnemonic#"($src2++#$offset) = $src3", [], - "$src3 = $dst">; + "$src2 = $dst">; +} + +multiclass ST_PostInc_Pred<string mnemonic, RegisterClass RC, + Operand ImmOp, bit PredNot> { + let PredSense = !if(PredNot, "false", "true") in { + defm _c#NAME# : ST_PostInc_Pbase<mnemonic, RC, ImmOp, PredNot, 0>; + // Predicate new + let Predicates = [HasV4T], validSubTargets = HasV4SubT in + defm _cdn#NAME#_V4 : ST_PostInc_Pbase<mnemonic, RC, ImmOp, PredNot, 1>; + } +} + +let hasCtrlDep = 1, isNVStorable = 1, neverHasSideEffects = 1 in +multiclass ST_PostInc<string mnemonic, string BaseOp, RegisterClass RC, + Operand ImmOp> { + + let hasCtrlDep = 1, BaseOpcode = "POST_"#BaseOp in { + let isPredicable = 1 in + def NAME : STInst2PI<(outs IntRegs:$dst), + (ins IntRegs:$src1, ImmOp:$offset, RC:$src2), + #mnemonic#"($src1++#$offset) = $src2", + [], + "$src1 = $dst">; + + let isPredicated = 1 in { + defm Pt : ST_PostInc_Pred<mnemonic, RC, ImmOp, 0 >; + defm NotPt : ST_PostInc_Pred<mnemonic, RC, ImmOp, 1 >; + } + } +} + +defm POST_STbri: ST_PostInc <"memb", "STrib", IntRegs, s4_0Imm>, AddrModeRel; +defm POST_SThri: ST_PostInc <"memh", "STrih", IntRegs, s4_1Imm>, AddrModeRel; +defm POST_STwri: ST_PostInc <"memw", "STriw", IntRegs, s4_2Imm>, AddrModeRel; + +let isNVStorable = 0 in +defm POST_STdri: ST_PostInc <"memd", "STrid", DoubleRegs, s4_3Imm>, AddrModeRel; + +def : Pat<(post_truncsti8 (i32 IntRegs:$src1), IntRegs:$src2, + s4_0ImmPred:$offset), + (POST_STbri IntRegs:$src2, s4_0ImmPred:$offset, IntRegs:$src1)>; + +def : Pat<(post_truncsti16 (i32 IntRegs:$src1), IntRegs:$src2, + s4_1ImmPred:$offset), + (POST_SThri IntRegs:$src2, s4_1ImmPred:$offset, IntRegs:$src1)>; + +def : Pat<(post_store (i32 IntRegs:$src1), IntRegs:$src2, s4_2ImmPred:$offset), + (POST_STwri IntRegs:$src2, s4_2ImmPred:$offset, IntRegs:$src1)>; + +def : Pat<(post_store (i64 DoubleRegs:$src1), IntRegs:$src2, + s4_3ImmPred:$offset), + (POST_STdri IntRegs:$src2, s4_3ImmPred:$offset, DoubleRegs:$src1)>; //===----------------------------------------------------------------------===// // multiclass for the store instructions with MEMri operand.
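To keep the renames trackable: the multiclasses above regenerate, by name concatenation, the hand-written post-increment stores that this patch deletes further down. A sketch of the expected expansion of one defm (derived from the NAME pasting in ST_PostInc/ST_PostInc_Pred above; not literal TableGen output):

// defm POST_STbri : ST_PostInc<"memb", "STrib", IntRegs, s4_0Imm> should yield:
//   POST_STbri             : "memb($src1++#$offset) = $src2"             (isPredicable)
//   POST_STbri_cPt         : "if ($src1) memb($src2++#$offset) = $src3"
//   POST_STbri_cNotPt      : "if (!$src1) memb($src2++#$offset) = $src3"
//   POST_STbri_cdnPt_V4    : "if ($src1.new) memb($src2++#$offset) = $src3"  (HasV4T)
//   POST_STbri_cdnNotPt_V4 : "if (!$src1.new) memb($src2++#$offset) = $src3" (HasV4T)
// That is, one predicable base plus four predicated variants per mnemonic,
// matching the set of POST_ST*_c(dn)(Not)Pt defs removed below.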
@@ -1595,32 +1645,6 @@ def STb_GP : STInst2<(outs), []>, Requires<[NoV4T]>; -// memb(Rx++#s4:0)=Rt -let hasCtrlDep = 1, isPredicable = 1 in -def POST_STbri : STInstPI<(outs IntRegs:$dst), (ins IntRegs:$src1, - IntRegs:$src2, - s4Imm:$offset), - "memb($src2++#$offset) = $src1", - [(set IntRegs:$dst, - (post_truncsti8 (i32 IntRegs:$src1), (i32 IntRegs:$src2), - s4_0ImmPred:$offset))], - "$src2 = $dst">; - -// if ([!]Pv) memb(Rx++#s4:0)=Rt -// if (Pv) memb(Rx++#s4:0)=Rt -let hasCtrlDep = 1, isPredicated = 1 in -def POST_STbri_cPt : STInst2PI<(outs IntRegs:$dst), - (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_0Imm:$offset), - "if ($src1) memb($src3++#$offset) = $src2", - [],"$src3 = $dst">; - -// if (!Pv) memb(Rx++#s4:0)=Rt -let hasCtrlDep = 1, isPredicated = 1 in -def POST_STbri_cNotPt : STInst2PI<(outs IntRegs:$dst), - (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_0Imm:$offset), - "if (!$src1) memb($src3++#$offset) = $src2", - [],"$src3 = $dst">; - let neverHasSideEffects = 1 in def STrih_GP : STInst2<(outs), (ins globaladdress:$global, u16Imm:$offset, IntRegs:$src), @@ -1636,31 +1660,6 @@ def STh_GP : STInst2<(outs), Requires<[NoV4T]>; // memh(Rx++#s4:1)=Rt.H -// memh(Rx++#s4:1)=Rt -let hasCtrlDep = 1, isPredicable = 1 in -def POST_SThri : STInstPI<(outs IntRegs:$dst), - (ins IntRegs:$src1, IntRegs:$src2, s4Imm:$offset), - "memh($src2++#$offset) = $src1", - [(set IntRegs:$dst, - (post_truncsti16 (i32 IntRegs:$src1), (i32 IntRegs:$src2), - s4_1ImmPred:$offset))], - "$src2 = $dst">; - -// if ([!]Pv) memh(Rx++#s4:1)=Rt -// if (Pv) memh(Rx++#s4:1)=Rt -let hasCtrlDep = 1, isPredicated = 1 in -def POST_SThri_cPt : STInst2PI<(outs IntRegs:$dst), - (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_1Imm:$offset), - "if ($src1) memh($src3++#$offset) = $src2", - [],"$src3 = $dst">; - -// if (!Pv) memh(Rx++#s4:1)=Rt -let hasCtrlDep = 1, isPredicated = 1 in -def POST_SThri_cNotPt : STInst2PI<(outs IntRegs:$dst), - (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_1Imm:$offset), - "if (!$src1) memh($src3++#$offset) = $src2", - [],"$src3 = $dst">; - // Store word. // Store predicate. @@ -1684,32 +1683,6 @@ def STw_GP : STInst2<(outs), []>, Requires<[NoV4T]>; -let hasCtrlDep = 1, isPredicable = 1 in -def POST_STwri : STInstPI<(outs IntRegs:$dst), - (ins IntRegs:$src1, IntRegs:$src2, s4Imm:$offset), - "memw($src2++#$offset) = $src1", - [(set IntRegs:$dst, - (post_store (i32 IntRegs:$src1), (i32 IntRegs:$src2), - s4_2ImmPred:$offset))], - "$src2 = $dst">; - -// if ([!]Pv) memw(Rx++#s4:2)=Rt -// if (Pv) memw(Rx++#s4:2)=Rt -let hasCtrlDep = 1, isPredicated = 1 in -def POST_STwri_cPt : STInst2PI<(outs IntRegs:$dst), - (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_2Imm:$offset), - "if ($src1) memw($src3++#$offset) = $src2", - [],"$src3 = $dst">; - -// if (!Pv) memw(Rx++#s4:2)=Rt -let hasCtrlDep = 1, isPredicated = 1 in -def POST_STwri_cNotPt : STInst2PI<(outs IntRegs:$dst), - (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_2Imm:$offset), - "if (!$src1) memw($src3++#$offset) = $src2", - [],"$src3 = $dst">; - - - // Allocate stack frame. 
let Defs = [R29, R30], Uses = [R31, R30], neverHasSideEffects = 1 in { def ALLOCFRAME : STInst2<(outs), @@ -1912,7 +1885,7 @@ def SDHexagonBARRIER: SDTypeProfile<0, 0, []>; def HexagonBARRIER: SDNode<"HexagonISD::BARRIER", SDHexagonBARRIER, [SDNPHasChain]>; -let hasSideEffects = 1, isHexagonSolo = 1 in +let hasSideEffects = 1, isSolo = 1 in def BARRIER : SYSInst<(outs), (ins), "barrier", [(HexagonBARRIER)]>; @@ -1987,9 +1960,9 @@ def LOOP0_r : CRInst<(outs), (ins brtarget:$offset, IntRegs:$src2), let isBranch = 1, isTerminator = 1, neverHasSideEffects = 1, Defs = [PC, LC0], Uses = [SA0, LC0] in { -def ENDLOOP0 : Marker<(outs), (ins brtarget:$offset), - ":endloop0", - []>; +def ENDLOOP0 : Endloop<(outs), (ins brtarget:$offset), + ":endloop0", + []>; } // Support for generating global address. @@ -2852,23 +2825,42 @@ def : Pat <(i32 (zext (i1 PredRegs:$src1))), // i1 -> i64 def : Pat <(i64 (zext (i1 PredRegs:$src1))), - (i64 (COMBINE_rr (TFRI 0), (MUX_ii (i1 PredRegs:$src1), 1, 0)))>; + (i64 (COMBINE_rr (TFRI 0), (MUX_ii (i1 PredRegs:$src1), 1, 0)))>, + Requires<[NoV4T]>; // i32 -> i64 def : Pat <(i64 (zext (i32 IntRegs:$src1))), - (i64 (COMBINE_rr (TFRI 0), (i32 IntRegs:$src1)))>; + (i64 (COMBINE_rr (TFRI 0), (i32 IntRegs:$src1)))>, + Requires<[NoV4T]>; // i8 -> i64 def: Pat <(i64 (zextloadi8 ADDRriS11_0:$src1)), - (i64 (COMBINE_rr (TFRI 0), (LDriub ADDRriS11_0:$src1)))>; + (i64 (COMBINE_rr (TFRI 0), (LDriub ADDRriS11_0:$src1)))>, + Requires<[NoV4T]>; + +let AddedComplexity = 20 in +def: Pat <(i64 (zextloadi8 (add (i32 IntRegs:$src1), + s11_0ExtPred:$offset))), + (i64 (COMBINE_rr (TFRI 0), (LDriub_indexed IntRegs:$src1, + s11_0ExtPred:$offset)))>, + Requires<[NoV4T]>; // i16 -> i64 def: Pat <(i64 (zextloadi16 ADDRriS11_1:$src1)), - (i64 (COMBINE_rr (TFRI 0), (LDriuh ADDRriS11_1:$src1)))>; + (i64 (COMBINE_rr (TFRI 0), (LDriuh ADDRriS11_1:$src1)))>, + Requires<[NoV4T]>; + +let AddedComplexity = 20 in +def: Pat <(i64 (zextloadi16 (add (i32 IntRegs:$src1), + s11_1ExtPred:$offset))), + (i64 (COMBINE_rr (TFRI 0), (LDriuh_indexed IntRegs:$src1, + s11_1ExtPred:$offset)))>, + Requires<[NoV4T]>; // i32 -> i64 def: Pat <(i64 (zextloadi32 ADDRriS11_2:$src1)), - (i64 (COMBINE_rr (TFRI 0), (LDriw ADDRriS11_2:$src1)))>; + (i64 (COMBINE_rr (TFRI 0), (LDriw ADDRriS11_2:$src1)))>, + Requires<[NoV4T]>; def: Pat <(i32 (zextloadi1 ADDRriS11_0:$src1)), (i32 (LDriw ADDRriS11_0:$src1))>; @@ -2889,15 +2881,41 @@ def : Pat <(i64 (anyext (i1 PredRegs:$src1))), // Any extended 64-bit load. // anyext i32 -> i64 def: Pat <(i64 (extloadi32 ADDRriS11_2:$src1)), - (i64 (COMBINE_rr (TFRI 0), (LDriw ADDRriS11_2:$src1)))>; + (i64 (COMBINE_rr (TFRI 0), (LDriw ADDRriS11_2:$src1)))>, + Requires<[NoV4T]>; + +// When there is an offset, we should prefer the pattern below over the one above. +// The complexity of the pattern above is 13 (gleaned from HexagonGenDAGISel.inc), +// so the complexity below is set comfortably higher so that it is chosen. +// Otherwise we generate address sequences such as +// ******************************************** +// r1 = add (r0, #4) +// r1 = memw(r1 + #0) +// instead of +// r1 = memw(r0 + #4) +// ******************************************** +let AddedComplexity = 100 in +def: Pat <(i64 (extloadi32 (i32 (add IntRegs:$src1, s11_2ExtPred:$offset)))), + (i64 (COMBINE_rr (TFRI 0), (LDriw_indexed IntRegs:$src1, + s11_2ExtPred:$offset)))>, + Requires<[NoV4T]>; // anyext i16 -> i64.
def: Pat <(i64 (extloadi16 ADDRriS11_2:$src1)), - (i64 (COMBINE_rr (TFRI 0), (LDrih ADDRriS11_2:$src1)))>; + (i64 (COMBINE_rr (TFRI 0), (LDrih ADDRriS11_2:$src1)))>, + Requires<[NoV4T]>; + +let AddedComplexity = 20 in +def: Pat <(i64 (extloadi16 (add (i32 IntRegs:$src1), + s11_1ExtPred:$offset))), + (i64 (COMBINE_rr (TFRI 0), (LDrih_indexed IntRegs:$src1, + s11_1ExtPred:$offset)))>, + Requires<[NoV4T]>; // Map from Rdd = zxtw(Rs) -> Rdd = combine(0, Rs). def : Pat<(i64 (zext (i32 IntRegs:$src1))), - (i64 (COMBINE_rr (TFRI 0), (i32 IntRegs:$src1)))>; + (i64 (COMBINE_rr (TFRI 0), (i32 IntRegs:$src1)))>, + Requires<[NoV4T]>; // Multiply 64-bit unsigned and use upper result. def : Pat <(mulhu (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2)), diff --git a/lib/Target/Hexagon/HexagonInstrInfoV4.td b/lib/Target/Hexagon/HexagonInstrInfoV4.td index 372de9a..e1b2f88 100644 --- a/lib/Target/Hexagon/HexagonInstrInfoV4.td +++ b/lib/Target/Hexagon/HexagonInstrInfoV4.td @@ -21,6 +21,17 @@ def IMMEXT_c : T_Immext<(ins calltarget:$imm)>; def IMMEXT_g : T_Immext<(ins globaladdress:$imm)>; def IMMEXT_i : T_Immext<(ins u26_6Imm:$imm)>; +// Fold (add (CONST32 tglobaladdr:$addr) <offset>) into a global address. +def FoldGlobalAddr : ComplexPattern<i32, 1, "foldGlobalAddress", [], []>; + +// Fold (add (CONST32_GP tglobaladdr:$addr) <offset>) into a global address. +def FoldGlobalAddrGP : ComplexPattern<i32, 1, "foldGlobalAddressGP", [], []>; + +def NumUsesBelowThresCONST32 : PatFrag<(ops node:$addr), + (HexagonCONST32 node:$addr), [{ + return hasNumUsesBelowThresGA(N->getOperand(0).getNode()); +}]>; + // Hexagon V4 Architecture spec defines 8 instruction classes: // LD ST ALU32 XTYPE J JR MEMOP NV CR SYSTEM(system is not implemented in the // compiler) @@ -251,6 +262,54 @@ def TFR_FI_immext_V4 : ALU32_ri<(outs IntRegs:$dst), []>, Requires<[HasV4T]>; +// Rd=cmp.eq(Rs,#s8) +let validSubTargets = HasV4SubT, isExtendable = 1, opExtendable = 2, +isExtentSigned = 1, opExtentBits = 8 in +def V4_A4_rcmpeqi : ALU32_ri<(outs IntRegs:$Rd), + (ins IntRegs:$Rs, s8Ext:$s8), + "$Rd = cmp.eq($Rs, #$s8)", + [(set (i32 IntRegs:$Rd), + (i32 (zext (i1 (seteq (i32 IntRegs:$Rs), + s8ExtPred:$s8)))))]>, + Requires<[HasV4T]>; + +// Preserve the TSTBIT generation +def : Pat <(i32 (zext (i1 (setne (i32 (and (i32 (shl 1, (i32 IntRegs:$src2))), + (i32 IntRegs:$src1))), 0)))), + (i32 (MUX_ii (i1 (TSTBIT_rr (i32 IntRegs:$src1), (i32 IntRegs:$src2))), + 1, 0))>; + +// Interfered with tstbit generation, above pattern preserves, see : tstbit.ll +// Rd=cmp.ne(Rs,#s8) +let validSubTargets = HasV4SubT, isExtendable = 1, opExtendable = 2, +isExtentSigned = 1, opExtentBits = 8 in +def V4_A4_rcmpneqi : ALU32_ri<(outs IntRegs:$Rd), + (ins IntRegs:$Rs, s8Ext:$s8), + "$Rd = !cmp.eq($Rs, #$s8)", + [(set (i32 IntRegs:$Rd), + (i32 (zext (i1 (setne (i32 IntRegs:$Rs), + s8ExtPred:$s8)))))]>, + Requires<[HasV4T]>; + +// Rd=cmp.eq(Rs,Rt) +let validSubTargets = HasV4SubT in +def V4_A4_rcmpeq : ALU32_ri<(outs IntRegs:$Rd), + (ins IntRegs:$Rs, IntRegs:$Rt), + "$Rd = cmp.eq($Rs, $Rt)", + [(set (i32 IntRegs:$Rd), + (i32 (zext (i1 (seteq (i32 IntRegs:$Rs), + IntRegs:$Rt)))))]>, + Requires<[HasV4T]>; + +// Rd=cmp.ne(Rs,Rt) +let validSubTargets = HasV4SubT in +def V4_A4_rcmpneq : ALU32_ri<(outs IntRegs:$Rd), + (ins IntRegs:$Rs, IntRegs:$Rt), + "$Rd = !cmp.eq($Rs, $Rt)", + [(set (i32 IntRegs:$Rd), + (i32 (zext (i1 (setne (i32 IntRegs:$Rs), + IntRegs:$Rt)))))]>, + Requires<[HasV4T]>; //===----------------------------------------------------------------------===// // ALU32 
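A note on the TSTBIT comments above: both the explicit mux/tstbit Pat and the new V4_A4_rcmpneqi can match a zext-of-setne, because the bit test compares against 0, which is a legal s8 immediate. A hedged sketch of the two shapes (source shapes taken from the patterns above):

// Shape A, bit test - keep tstbit (the explicit Pat above):
//   (zext (setne (and (shl 1, Rt), Rs), 0))
//     -> p0 = tstbit(Rs, Rt); Rd = mux(p0, #1, #0)
// Shape B, plain compare against an immediate - the new V4 compare:
//   (zext (setne Rs, #s8))
//     -> Rd = !cmp.eq(Rs, #s8)        (V4_A4_rcmpneqi)
// Shape A's source pattern contains more nodes, so TableGen assigns it a higher
// matching complexity and ISel tries it first; without the explicit Pat, Shape A
// inputs would presumably decay to an and + rcmp sequence (see tstbit.ll).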
- @@ -280,6 +339,19 @@ def COMBINE_Ir_V4 : ALU32_ir<(outs DoubleRegs:$dst), []>, Requires<[HasV4T]>; +def HexagonWrapperCombineRI_V4 : + SDNode<"HexagonISD::WrapperCombineRI_V4", SDTHexagonI64I32I32>; +def HexagonWrapperCombineIR_V4 : + SDNode<"HexagonISD::WrapperCombineIR_V4", SDTHexagonI64I32I32>; + +def : Pat <(HexagonWrapperCombineRI_V4 IntRegs:$r, s8ExtPred:$i), + (COMBINE_rI_V4 IntRegs:$r, s8ExtPred:$i)>, + Requires<[HasV4T]>; + +def : Pat <(HexagonWrapperCombineIR_V4 s8ExtPred:$i, IntRegs:$r), + (COMBINE_Ir_V4 s8ExtPred:$i, IntRegs:$r)>, + Requires<[HasV4T]>; + let isExtendable = 1, opExtendable = 2, isExtentSigned = 0, opExtentBits = 6, neverHasSideEffects = 1, validSubTargets = HasV4SubT in def COMBINE_iI_V4 : ALU32_ii<(outs DoubleRegs:$dst), @@ -299,120 +371,95 @@ def COMBINE_iI_V4 : ALU32_ii<(outs DoubleRegs:$dst), // These absolute set addressing mode instructions accept immediate as // an operand. We have duplicated these patterns to take global address. -let neverHasSideEffects = 1 in +let isExtended = 1, opExtendable = 2, neverHasSideEffects = 1, +validSubTargets = HasV4SubT in { def LDrid_abs_setimm_V4 : LDInst2<(outs DoubleRegs:$dst1, IntRegs:$dst2), - (ins u6Imm:$addr), - "$dst1 = memd($dst2=#$addr)", + (ins u0AlwaysExt:$addr), + "$dst1 = memd($dst2=##$addr)", []>, Requires<[HasV4T]>; // Rd=memb(Re=#U6) -let neverHasSideEffects = 1 in def LDrib_abs_setimm_V4 : LDInst2<(outs IntRegs:$dst1, IntRegs:$dst2), - (ins u6Imm:$addr), - "$dst1 = memb($dst2=#$addr)", + (ins u0AlwaysExt:$addr), + "$dst1 = memb($dst2=##$addr)", []>, Requires<[HasV4T]>; // Rd=memh(Re=#U6) -let neverHasSideEffects = 1 in def LDrih_abs_setimm_V4 : LDInst2<(outs IntRegs:$dst1, IntRegs:$dst2), - (ins u6Imm:$addr), - "$dst1 = memh($dst2=#$addr)", + (ins u0AlwaysExt:$addr), + "$dst1 = memh($dst2=##$addr)", []>, Requires<[HasV4T]>; // Rd=memub(Re=#U6) -let neverHasSideEffects = 1 in def LDriub_abs_setimm_V4 : LDInst2<(outs IntRegs:$dst1, IntRegs:$dst2), - (ins u6Imm:$addr), - "$dst1 = memub($dst2=#$addr)", + (ins u0AlwaysExt:$addr), + "$dst1 = memub($dst2=##$addr)", []>, Requires<[HasV4T]>; // Rd=memuh(Re=#U6) -let neverHasSideEffects = 1 in def LDriuh_abs_setimm_V4 : LDInst2<(outs IntRegs:$dst1, IntRegs:$dst2), - (ins u6Imm:$addr), - "$dst1 = memuh($dst2=#$addr)", + (ins u0AlwaysExt:$addr), + "$dst1 = memuh($dst2=##$addr)", []>, Requires<[HasV4T]>; // Rd=memw(Re=#U6) -let neverHasSideEffects = 1 in def LDriw_abs_setimm_V4 : LDInst2<(outs IntRegs:$dst1, IntRegs:$dst2), - (ins u6Imm:$addr), - "$dst1 = memw($dst2=#$addr)", + (ins u0AlwaysExt:$addr), + "$dst1 = memw($dst2=##$addr)", []>, Requires<[HasV4T]>; +} // Following patterns are defined for absolute set addressing mode // instruction which take global address as operand. 
-let neverHasSideEffects = 1 in +let isExtended = 1, opExtendable = 2, neverHasSideEffects = 1, +validSubTargets = HasV4SubT in { def LDrid_abs_set_V4 : LDInst2<(outs DoubleRegs:$dst1, IntRegs:$dst2), - (ins globaladdress:$addr), + (ins globaladdressExt:$addr), "$dst1 = memd($dst2=##$addr)", []>, Requires<[HasV4T]>; // Rd=memb(Re=#U6) -let neverHasSideEffects = 1 in def LDrib_abs_set_V4 : LDInst2<(outs IntRegs:$dst1, IntRegs:$dst2), - (ins globaladdress:$addr), + (ins globaladdressExt:$addr), "$dst1 = memb($dst2=##$addr)", []>, Requires<[HasV4T]>; // Rd=memh(Re=#U6) -let neverHasSideEffects = 1 in def LDrih_abs_set_V4 : LDInst2<(outs IntRegs:$dst1, IntRegs:$dst2), - (ins globaladdress:$addr), + (ins globaladdressExt:$addr), "$dst1 = memh($dst2=##$addr)", []>, Requires<[HasV4T]>; // Rd=memub(Re=#U6) -let neverHasSideEffects = 1 in def LDriub_abs_set_V4 : LDInst2<(outs IntRegs:$dst1, IntRegs:$dst2), - (ins globaladdress:$addr), + (ins globaladdressExt:$addr), "$dst1 = memub($dst2=##$addr)", []>, Requires<[HasV4T]>; // Rd=memuh(Re=#U6) -let neverHasSideEffects = 1 in def LDriuh_abs_set_V4 : LDInst2<(outs IntRegs:$dst1, IntRegs:$dst2), - (ins globaladdress:$addr), + (ins globaladdressExt:$addr), "$dst1 = memuh($dst2=##$addr)", []>, Requires<[HasV4T]>; // Rd=memw(Re=#U6) -let neverHasSideEffects = 1 in def LDriw_abs_set_V4 : LDInst2<(outs IntRegs:$dst1, IntRegs:$dst2), - (ins globaladdress:$addr), + (ins globaladdressExt:$addr), "$dst1 = memw($dst2=##$addr)", []>, Requires<[HasV4T]>; - -// Load doubleword. -// -// Make sure that in post increment load, the first operand is always the post -// increment operand. -// -// Rdd=memd(Rs+Rt<<#u2) -// Special case pattern for indexed load without offset which is easier to -// match. AddedComplexity of this pattern should be lower than base+offset load -// and lower yet than the more generic version with offset/shift below -// Similar approach is taken for all other base+index loads. -let AddedComplexity = 10, isPredicable = 1 in -def LDrid_indexed_V4 : LDInst<(outs DoubleRegs:$dst), - (ins IntRegs:$src1, IntRegs:$src2), - "$dst=memd($src1+$src2<<#0)", - [(set (i64 DoubleRegs:$dst), - (i64 (load (add (i32 IntRegs:$src1), - (i32 IntRegs:$src2)))))]>, - Requires<[HasV4T]>; +} // multiclass for load instructions with base + register offset // addressing mode @@ -512,534 +559,42 @@ def : Pat <(i64 (load (add IntRegs:$src1, Requires<[HasV4T]>; } -//// Load doubleword conditionally. 
-// if ([!]Pv[.new]) Rd=memd(Rs+Rt<<#u2) -// if (Pv) Rd=memd(Rs+Rt<<#u2) -let AddedComplexity = 15, isPredicated = 1 in -def LDrid_indexed_cPt_V4 : LDInst2<(outs DoubleRegs:$dst), - (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), - "if ($src1) $dst=memd($src2+$src3<<#0)", - []>, - Requires<[HasV4T]>; - -// if (Pv.new) Rd=memd(Rs+Rt<<#u2) -let AddedComplexity = 15, isPredicated = 1 in -def LDrid_indexed_cdnPt_V4 : LDInst2<(outs DoubleRegs:$dst), - (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), - "if ($src1.new) $dst=memd($src2+$src3<<#0)", - []>, - Requires<[HasV4T]>; - -// if (!Pv) Rd=memd(Rs+Rt<<#u2) -let AddedComplexity = 15, isPredicated = 1 in -def LDrid_indexed_cNotPt_V4 : LDInst2<(outs DoubleRegs:$dst), - (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), - "if (!$src1) $dst=memd($src2+$src3<<#0)", - []>, - Requires<[HasV4T]>; - -// if (!Pv.new) Rd=memd(Rs+Rt<<#u2) -let AddedComplexity = 15, isPredicated = 1 in -def LDrid_indexed_cdnNotPt_V4 : LDInst2<(outs DoubleRegs:$dst), - (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), - "if (!$src1.new) $dst=memd($src2+$src3<<#0)", - []>, - Requires<[HasV4T]>; - -// Rdd=memd(Rt<<#u2+#U6) - -//// Load byte. -// Rd=memb(Rs+Rt<<#u2) -let AddedComplexity = 10, isPredicable = 1 in -def LDrib_indexed_V4 : LDInst<(outs IntRegs:$dst), - (ins IntRegs:$src1, IntRegs:$src2), - "$dst=memb($src1+$src2<<#0)", - [(set (i32 IntRegs:$dst), - (i32 (sextloadi8 (add (i32 IntRegs:$src1), - (i32 IntRegs:$src2)))))]>, - Requires<[HasV4T]>; - -let AddedComplexity = 10, isPredicable = 1 in -def LDriub_indexed_V4 : LDInst<(outs IntRegs:$dst), - (ins IntRegs:$src1, IntRegs:$src2), - "$dst=memub($src1+$src2<<#0)", - [(set (i32 IntRegs:$dst), - (i32 (zextloadi8 (add (i32 IntRegs:$src1), - (i32 IntRegs:$src2)))))]>, - Requires<[HasV4T]>; - -let AddedComplexity = 10, isPredicable = 1 in -def LDriub_ae_indexed_V4 : LDInst<(outs IntRegs:$dst), - (ins IntRegs:$src1, IntRegs:$src2), - "$dst=memub($src1+$src2<<#0)", - [(set (i32 IntRegs:$dst), - (i32 (extloadi8 (add (i32 IntRegs:$src1), - (i32 IntRegs:$src2)))))]>, - Requires<[HasV4T]>; - -let AddedComplexity = 40, isPredicable = 1 in -def LDriub_ae_indexed_shl_V4 : LDInst<(outs IntRegs:$dst), - (ins IntRegs:$src1, IntRegs:$src2, u2Imm:$offset), - "$dst=memub($src1+$src2<<#$offset)", - [(set (i32 IntRegs:$dst), - (i32 (extloadi8 (add (i32 IntRegs:$src1), - (shl (i32 IntRegs:$src2), - u2ImmPred:$offset)))))]>, - Requires<[HasV4T]>; - -//// Load byte conditionally. 
-// if ([!]Pv[.new]) Rd=memb(Rs+Rt<<#u2) -// if (Pv) Rd=memb(Rs+Rt<<#u2) -let AddedComplexity = 15, isPredicated = 1 in -def LDrib_indexed_cPt_V4 : LDInst2<(outs IntRegs:$dst), - (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), - "if ($src1) $dst=memb($src2+$src3<<#0)", - []>, - Requires<[HasV4T]>; - -// if (Pv.new) Rd=memb(Rs+Rt<<#u2) -let AddedComplexity = 15, isPredicated = 1 in -def LDrib_indexed_cdnPt_V4 : LDInst2<(outs IntRegs:$dst), - (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), - "if ($src1.new) $dst=memb($src2+$src3<<#0)", - []>, - Requires<[HasV4T]>; - -// if (!Pv) Rd=memb(Rs+Rt<<#u2) -let AddedComplexity = 15, isPredicated = 1 in -def LDrib_indexed_cNotPt_V4 : LDInst2<(outs IntRegs:$dst), - (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), - "if (!$src1) $dst=memb($src2+$src3<<#0)", - []>, - Requires<[HasV4T]>; - -// if (!Pv.new) Rd=memb(Rs+Rt<<#u2) -let AddedComplexity = 15, isPredicated = 1 in -def LDrib_indexed_cdnNotPt_V4 : LDInst2<(outs IntRegs:$dst), - (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), - "if (!$src1.new) $dst=memb($src2+$src3<<#0)", - []>, - Requires<[HasV4T]>; - -//// Load unsigned byte conditionally. -// if ([!]Pv[.new]) Rd=memub(Rs+Rt<<#u2) -// if (Pv) Rd=memub(Rs+Rt<<#u2) -let AddedComplexity = 15, isPredicated = 1 in -def LDriub_indexed_cPt_V4 : LDInst2<(outs IntRegs:$dst), - (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), - "if ($src1) $dst=memub($src2+$src3<<#0)", - []>, - Requires<[HasV4T]>; - -// if (Pv.new) Rd=memub(Rs+Rt<<#u2) -let AddedComplexity = 15, isPredicated = 1 in -def LDriub_indexed_cdnPt_V4 : LDInst2<(outs IntRegs:$dst), - (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), - "if ($src1.new) $dst=memub($src2+$src3<<#0)", - []>, - Requires<[HasV4T]>; - -// if (!Pv) Rd=memub(Rs+Rt<<#u2) -let AddedComplexity = 15, isPredicated = 1 in -def LDriub_indexed_cNotPt_V4 : LDInst2<(outs IntRegs:$dst), - (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), - "if (!$src1) $dst=memub($src2+$src3<<#0)", - []>, - Requires<[HasV4T]>; - -// if (!Pv.new) Rd=memub(Rs+Rt<<#u2) -let AddedComplexity = 15, isPredicated = 1 in -def LDriub_indexed_cdnNotPt_V4 : LDInst2<(outs IntRegs:$dst), - (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), - "if (!$src1.new) $dst=memub($src2+$src3<<#0)", - []>, - Requires<[HasV4T]>; - -// Rd=memb(Rt<<#u2+#U6) - -//// Load halfword -// Rd=memh(Rs+Rt<<#u2) -let AddedComplexity = 10, isPredicable = 1 in -def LDrih_indexed_V4 : LDInst<(outs IntRegs:$dst), - (ins IntRegs:$src1, IntRegs:$src2), - "$dst=memh($src1+$src2<<#0)", - [(set (i32 IntRegs:$dst), - (i32 (sextloadi16 (add (i32 IntRegs:$src1), - (i32 IntRegs:$src2)))))]>, - Requires<[HasV4T]>; - -let AddedComplexity = 10, isPredicable = 1 in -def LDriuh_indexed_V4 : LDInst<(outs IntRegs:$dst), - (ins IntRegs:$src1, IntRegs:$src2), - "$dst=memuh($src1+$src2<<#0)", - [(set (i32 IntRegs:$dst), - (i32 (zextloadi16 (add (i32 IntRegs:$src1), - (i32 IntRegs:$src2)))))]>, - Requires<[HasV4T]>; - -let AddedComplexity = 10, isPredicable = 1 in -def LDriuh_ae_indexed_V4 : LDInst<(outs IntRegs:$dst), - (ins IntRegs:$src1, IntRegs:$src2), - "$dst=memuh($src1+$src2<<#0)", - [(set (i32 IntRegs:$dst), - (i32 (extloadi16 (add (i32 IntRegs:$src1), - (i32 IntRegs:$src2)))))]>, - Requires<[HasV4T]>; - -let AddedComplexity = 40, isPredicable = 1 in -def LDriuh_ae_indexed_shl_V4 : LDInst<(outs IntRegs:$dst), - (ins IntRegs:$src1, IntRegs:$src2, u2Imm:$offset), - "$dst=memuh($src1+$src2<<#$offset)", - [(set (i32 IntRegs:$dst), - (i32 (extloadi16 (add (i32 IntRegs:$src1), - (shl 
(i32 IntRegs:$src2), - u2ImmPred:$offset)))))]>, - Requires<[HasV4T]>; - -//// Load halfword conditionally. -// if ([!]Pv[.new]) Rd=memh(Rs+Rt<<#u2) -// if (Pv) Rd=memh(Rs+Rt<<#u2) -let AddedComplexity = 15, isPredicated = 1 in -def LDrih_indexed_cPt_V4 : LDInst2<(outs IntRegs:$dst), - (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), - "if ($src1) $dst=memh($src2+$src3<<#0)", - []>, - Requires<[HasV4T]>; - -// if (Pv.new) Rd=memh(Rs+Rt<<#u2) -let AddedComplexity = 15, isPredicated = 1 in -def LDrih_indexed_cdnPt_V4 : LDInst2<(outs IntRegs:$dst), - (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), - "if ($src1.new) $dst=memh($src2+$src3<<#0)", - []>, - Requires<[HasV4T]>; - -// if (!Pv) Rd=memh(Rs+Rt<<#u2) -let AddedComplexity = 15, isPredicated = 1 in -def LDrih_indexed_cNotPt_V4 : LDInst2<(outs IntRegs:$dst), - (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), - "if (!$src1) $dst=memh($src2+$src3<<#0)", - []>, - Requires<[HasV4T]>; - -// if (!Pv.new) Rd=memh(Rs+Rt<<#u2) -let AddedComplexity = 15, isPredicated = 1 in -def LDrih_indexed_cdnNotPt_V4 : LDInst2<(outs IntRegs:$dst), - (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), - "if (!$src1.new) $dst=memh($src2+$src3<<#0)", - []>, - Requires<[HasV4T]>; - -//// Load unsigned halfword conditionally. -// if ([!]Pv[.new]) Rd=memuh(Rs+Rt<<#u2) -// if (Pv) Rd=memuh(Rs+Rt<<#u2) -let AddedComplexity = 15, isPredicated = 1 in -def LDriuh_indexed_cPt_V4 : LDInst2<(outs IntRegs:$dst), - (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), - "if ($src1) $dst=memuh($src2+$src3<<#0)", - []>, - Requires<[HasV4T]>; - -// if (Pv.new) Rd=memuh(Rs+Rt<<#u2) -let AddedComplexity = 15, isPredicated = 1 in -def LDriuh_indexed_cdnPt_V4 : LDInst2<(outs IntRegs:$dst), - (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), - "if ($src1.new) $dst=memuh($src2+$src3<<#0)", - []>, - Requires<[HasV4T]>; - -// if (!Pv) Rd=memuh(Rs+Rt<<#u2) -let AddedComplexity = 15, isPredicated = 1 in -def LDriuh_indexed_cNotPt_V4 : LDInst2<(outs IntRegs:$dst), - (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), - "if (!$src1) $dst=memuh($src2+$src3<<#0)", - []>, - Requires<[HasV4T]>; - -// if (!Pv.new) Rd=memuh(Rs+Rt<<#u2) -let AddedComplexity = 15, isPredicated = 1 in -def LDriuh_indexed_cdnNotPt_V4 : LDInst2<(outs IntRegs:$dst), - (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), - "if (!$src1.new) $dst=memuh($src2+$src3<<#0)", - []>, - Requires<[HasV4T]>; - -// Rd=memh(Rt<<#u2+#U6) - -//// Load word. -// Load predicate: Fix for bug 5279. -let neverHasSideEffects = 1 in -def LDriw_pred_V4 : LDInst2<(outs PredRegs:$dst), - (ins MEMri:$addr), - "Error; should not emit", - []>, - Requires<[HasV4T]>; - -// Rd=memw(Re=#U6) - -// Rd=memw(Rs+Rt<<#u2) -let AddedComplexity = 10, isPredicable = 1 in -def LDriw_indexed_V4 : LDInst<(outs IntRegs:$dst), - (ins IntRegs:$src1, IntRegs:$src2), - "$dst=memw($src1+$src2<<#0)", - [(set (i32 IntRegs:$dst), - (i32 (load (add (i32 IntRegs:$src1), - (i32 IntRegs:$src2)))))]>, - Requires<[HasV4T]>; - -//// Load word conditionally. 
-// if ([!]Pv[.new]) Rd=memw(Rs+Rt<<#u2) -// if (Pv) Rd=memw(Rs+Rt<<#u2) -let AddedComplexity = 15, isPredicated = 1 in -def LDriw_indexed_cPt_V4 : LDInst2<(outs IntRegs:$dst), - (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), - "if ($src1) $dst=memw($src2+$src3<<#0)", - []>, - Requires<[HasV4T]>; - -// if (Pv.new) Rd=memh(Rs+Rt<<#u2) -let AddedComplexity = 15, isPredicated = 1 in -def LDriw_indexed_cdnPt_V4 : LDInst2<(outs IntRegs:$dst), - (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), - "if ($src1.new) $dst=memw($src2+$src3<<#0)", - []>, - Requires<[HasV4T]>; - -// if (!Pv) Rd=memh(Rs+Rt<<#u2) -let AddedComplexity = 15, isPredicated = 1 in -def LDriw_indexed_cNotPt_V4 : LDInst2<(outs IntRegs:$dst), - (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), - "if (!$src1) $dst=memw($src2+$src3<<#0)", - []>, - Requires<[HasV4T]>; - -// if (!Pv.new) Rd=memh(Rs+Rt<<#u2) -let AddedComplexity = 15, isPredicated = 1 in -def LDriw_indexed_cdnNotPt_V4 : LDInst2<(outs IntRegs:$dst), - (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3), - "if (!$src1.new) $dst=memw($src2+$src3<<#0)", - []>, - Requires<[HasV4T]>; - -/// Load from global offset - -let isPredicable = 1, neverHasSideEffects = 1 in -def LDrid_GP_V4 : LDInst2<(outs DoubleRegs:$dst), - (ins globaladdress:$global, u16Imm:$offset), - "$dst=memd(#$global+$offset)", - []>, - Requires<[HasV4T]>; -let neverHasSideEffects = 1, isPredicated = 1 in -def LDrid_GP_cPt_V4 : LDInst2<(outs DoubleRegs:$dst), - (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset), - "if ($src1) $dst=memd(##$global+$offset)", - []>, - Requires<[HasV4T]>; - -let neverHasSideEffects = 1, isPredicated = 1 in -def LDrid_GP_cNotPt_V4 : LDInst2<(outs DoubleRegs:$dst), - (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset), - "if (!$src1) $dst=memd(##$global+$offset)", - []>, +// 'def pats' for load instruction base + register offset and +// zero immediate value. 
+let AddedComplexity = 10 in { +def : Pat <(i64 (load (add IntRegs:$src1, IntRegs:$src2))), + (LDrid_indexed_shl_V4 IntRegs:$src1, IntRegs:$src2, 0)>, Requires<[HasV4T]>; -let neverHasSideEffects = 1, isPredicated = 1 in -def LDrid_GP_cdnPt_V4 : LDInst2<(outs DoubleRegs:$dst), - (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset), - "if ($src1.new) $dst=memd(##$global+$offset)", - []>, +def : Pat <(i32 (sextloadi8 (add IntRegs:$src1, IntRegs:$src2))), + (LDrib_indexed_shl_V4 IntRegs:$src1, IntRegs:$src2, 0)>, Requires<[HasV4T]>; -let neverHasSideEffects = 1, isPredicated = 1 in -def LDrid_GP_cdnNotPt_V4 : LDInst2<(outs DoubleRegs:$dst), - (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset), - "if (!$src1.new) $dst=memd(##$global+$offset)", - []>, +def : Pat <(i32 (zextloadi8 (add IntRegs:$src1, IntRegs:$src2))), + (LDriub_indexed_shl_V4 IntRegs:$src1, IntRegs:$src2, 0)>, Requires<[HasV4T]>; -let isPredicable = 1, neverHasSideEffects = 1 in -def LDrib_GP_V4 : LDInst2<(outs IntRegs:$dst), - (ins globaladdress:$global, u16Imm:$offset), - "$dst=memb(#$global+$offset)", - []>, +def : Pat <(i32 (extloadi8 (add IntRegs:$src1, IntRegs:$src2))), + (LDriub_indexed_shl_V4 IntRegs:$src1, IntRegs:$src2, 0)>, Requires<[HasV4T]>; -let neverHasSideEffects = 1, isPredicated = 1 in -def LDrib_GP_cPt_V4 : LDInst2<(outs IntRegs:$dst), - (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset), - "if ($src1) $dst=memb(##$global+$offset)", - []>, +def : Pat <(i32 (sextloadi16 (add IntRegs:$src1, IntRegs:$src2))), + (LDrih_indexed_shl_V4 IntRegs:$src1, IntRegs:$src2, 0)>, Requires<[HasV4T]>; -let neverHasSideEffects = 1, isPredicated = 1 in -def LDrib_GP_cNotPt_V4 : LDInst2<(outs IntRegs:$dst), - (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset), - "if (!$src1) $dst=memb(##$global+$offset)", - []>, +def : Pat <(i32 (zextloadi16 (add IntRegs:$src1, IntRegs:$src2))), + (LDriuh_indexed_shl_V4 IntRegs:$src1, IntRegs:$src2, 0)>, Requires<[HasV4T]>; -let neverHasSideEffects = 1, isPredicated = 1 in -def LDrib_GP_cdnPt_V4 : LDInst2<(outs IntRegs:$dst), - (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset), - "if ($src1.new) $dst=memb(##$global+$offset)", - []>, +def : Pat <(i32 (extloadi16 (add IntRegs:$src1, IntRegs:$src2))), + (LDriuh_indexed_shl_V4 IntRegs:$src1, IntRegs:$src2, 0)>, Requires<[HasV4T]>; -let neverHasSideEffects = 1, isPredicated = 1 in -def LDrib_GP_cdnNotPt_V4 : LDInst2<(outs IntRegs:$dst), - (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset), - "if (!$src1.new) $dst=memb(##$global+$offset)", - []>, - Requires<[HasV4T]>; - - -let isPredicable = 1, neverHasSideEffects = 1 in -def LDriub_GP_V4 : LDInst2<(outs IntRegs:$dst), - (ins globaladdress:$global, u16Imm:$offset), - "$dst=memub(#$global+$offset)", - []>, - Requires<[HasV4T]>; - - -let neverHasSideEffects = 1, isPredicated = 1 in -def LDriub_GP_cPt_V4 : LDInst2<(outs IntRegs:$dst), - (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset), - "if ($src1) $dst=memub(##$global+$offset)", - []>, - Requires<[HasV4T]>; - -let neverHasSideEffects = 1, isPredicated = 1 in -def LDriub_GP_cNotPt_V4 : LDInst2<(outs IntRegs:$dst), - (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset), - "if (!$src1) $dst=memub(##$global+$offset)", - []>, - Requires<[HasV4T]>; - -let neverHasSideEffects = 1, isPredicated = 1 in -def LDriub_GP_cdnPt_V4 : LDInst2<(outs IntRegs:$dst), - (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset), - "if ($src1.new) $dst=memub(##$global+$offset)", - []>, - 
Requires<[HasV4T]>; - -let neverHasSideEffects = 1, isPredicated = 1 in -def LDriub_GP_cdnNotPt_V4 : LDInst2<(outs IntRegs:$dst), - (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset), - "if (!$src1.new) $dst=memub(##$global+$offset)", - []>, - Requires<[HasV4T]>; - - -let isPredicable = 1, neverHasSideEffects = 1 in -def LDrih_GP_V4 : LDInst2<(outs IntRegs:$dst), - (ins globaladdress:$global, u16Imm:$offset), - "$dst=memh(#$global+$offset)", - []>, - Requires<[HasV4T]>; - - -let neverHasSideEffects = 1, isPredicated = 1 in -def LDrih_GP_cPt_V4 : LDInst2<(outs IntRegs:$dst), - (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset), - "if ($src1) $dst=memh(##$global+$offset)", - []>, - Requires<[HasV4T]>; - -let neverHasSideEffects = 1, isPredicated = 1 in -def LDrih_GP_cNotPt_V4 : LDInst2<(outs IntRegs:$dst), - (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset), - "if (!$src1) $dst=memh(##$global+$offset)", - []>, - Requires<[HasV4T]>; - -let neverHasSideEffects = 1, isPredicated = 1 in -def LDrih_GP_cdnPt_V4 : LDInst2<(outs IntRegs:$dst), - (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset), - "if ($src1.new) $dst=memh(##$global+$offset)", - []>, - Requires<[HasV4T]>; - -let neverHasSideEffects = 1, isPredicated = 1 in -def LDrih_GP_cdnNotPt_V4 : LDInst2<(outs IntRegs:$dst), - (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset), - "if (!$src1.new) $dst=memh(##$global+$offset)", - []>, - Requires<[HasV4T]>; - - -let isPredicable = 1, neverHasSideEffects = 1 in -def LDriuh_GP_V4 : LDInst2<(outs IntRegs:$dst), - (ins globaladdress:$global, u16Imm:$offset), - "$dst=memuh(#$global+$offset)", - []>, - Requires<[HasV4T]>; - -let neverHasSideEffects = 1, isPredicated = 1 in -def LDriuh_GP_cPt_V4 : LDInst2<(outs IntRegs:$dst), - (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset), - "if ($src1) $dst=memuh(##$global+$offset)", - []>, - Requires<[HasV4T]>; - -let neverHasSideEffects = 1, isPredicated = 1 in -def LDriuh_GP_cNotPt_V4 : LDInst2<(outs IntRegs:$dst), - (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset), - "if (!$src1) $dst=memuh(##$global+$offset)", - []>, - Requires<[HasV4T]>; - -let neverHasSideEffects = 1, isPredicated = 1 in -def LDriuh_GP_cdnPt_V4 : LDInst2<(outs IntRegs:$dst), - (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset), - "if ($src1.new) $dst=memuh(##$global+$offset)", - []>, - Requires<[HasV4T]>; - -let neverHasSideEffects = 1, isPredicated = 1 in -def LDriuh_GP_cdnNotPt_V4 : LDInst2<(outs IntRegs:$dst), - (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset), - "if (!$src1.new) $dst=memuh(##$global+$offset)", - []>, - Requires<[HasV4T]>; - -let isPredicable = 1, neverHasSideEffects = 1 in -def LDriw_GP_V4 : LDInst2<(outs IntRegs:$dst), - (ins globaladdress:$global, u16Imm:$offset), - "$dst=memw(#$global+$offset)", - []>, - Requires<[HasV4T]>; - - -let neverHasSideEffects = 1, isPredicated = 1 in -def LDriw_GP_cPt_V4 : LDInst2<(outs IntRegs:$dst), - (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset), - "if ($src1) $dst=memw(##$global+$offset)", - []>, +def : Pat <(i32 (load (add IntRegs:$src1, IntRegs:$src2))), + (LDriw_indexed_shl_V4 IntRegs:$src1, IntRegs:$src2, 0)>, Requires<[HasV4T]>; - -let neverHasSideEffects = 1, isPredicated = 1 in -def LDriw_GP_cNotPt_V4 : LDInst2<(outs IntRegs:$dst), - (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset), - "if (!$src1) $dst=memw(##$global+$offset)", - []>, - Requires<[HasV4T]>; - - -let neverHasSideEffects = 1, isPredicated = 1 in -def 
LDriw_GP_cdnPt_V4 : LDInst2<(outs IntRegs:$dst), - (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset), - "if ($src1.new) $dst=memw(##$global+$offset)", - []>, - Requires<[HasV4T]>; - -let neverHasSideEffects = 1, isPredicated = 1 in -def LDriw_GP_cdnNotPt_V4 : LDInst2<(outs IntRegs:$dst), - (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset), - "if (!$src1.new) $dst=memw(##$global+$offset)", - []>, - Requires<[HasV4T]>; - +} let isPredicable = 1, neverHasSideEffects = 1, validSubTargets = HasV4SubT in def LDd_GP_V4 : LDInst2<(outs DoubleRegs:$dst), @@ -1364,82 +919,73 @@ def : Pat <(i32 (load (HexagonCONST32_GP tglobaladdr:$global))), (i32 (LDw_GP_V4 tglobaladdr:$global))>, Requires<[HasV4T]>; -def : Pat <(atomic_load_64 (add (HexagonCONST32_GP tglobaladdr:$global), - u16ImmPred:$offset)), - (i64 (LDrid_GP_V4 tglobaladdr:$global, u16ImmPred:$offset))>, - Requires<[HasV4T]>; - -def : Pat <(atomic_load_32 (add (HexagonCONST32_GP tglobaladdr:$global), - u16ImmPred:$offset)), - (i32 (LDriw_GP_V4 tglobaladdr:$global, u16ImmPred:$offset))>, - Requires<[HasV4T]>; - -def : Pat <(atomic_load_16 (add (HexagonCONST32_GP tglobaladdr:$global), - u16ImmPred:$offset)), - (i32 (LDriuh_GP_V4 tglobaladdr:$global, u16ImmPred:$offset))>, - Requires<[HasV4T]>; - -def : Pat <(atomic_load_8 (add (HexagonCONST32_GP tglobaladdr:$global), - u16ImmPred:$offset)), - (i32 (LDriub_GP_V4 tglobaladdr:$global, u16ImmPred:$offset))>, - Requires<[HasV4T]>; - -// Map from load(globaladdress + x) -> memd(#foo + x) -let AddedComplexity = 100 in -def : Pat <(i64 (load (add (HexagonCONST32_GP tglobaladdr:$global), - u16ImmPred:$offset))), - (i64 (LDrid_GP_V4 tglobaladdr:$global, u16ImmPred:$offset))>, - Requires<[HasV4T]>; - -// Map from load(globaladdress + x) -> memb(#foo + x) -let AddedComplexity = 100 in -def : Pat <(i32 (extloadi8 (add (HexagonCONST32_GP tglobaladdr:$global), - u16ImmPred:$offset))), - (i32 (LDrib_GP_V4 tglobaladdr:$global, u16ImmPred:$offset))>, - Requires<[HasV4T]>; +// zext i1->i64 +def : Pat <(i64 (zext (i1 PredRegs:$src1))), + (i64 (COMBINE_Ir_V4 0, (MUX_ii (i1 PredRegs:$src1), 1, 0)))>, + Requires<[HasV4T]>; + +// zext i32->i64 +def : Pat <(i64 (zext (i32 IntRegs:$src1))), + (i64 (COMBINE_Ir_V4 0, (i32 IntRegs:$src1)))>, + Requires<[HasV4T]>; +// zext i8->i64 +def: Pat <(i64 (zextloadi8 ADDRriS11_0:$src1)), + (i64 (COMBINE_Ir_V4 0, (LDriub ADDRriS11_0:$src1)))>, + Requires<[HasV4T]>; + +let AddedComplexity = 20 in +def: Pat <(i64 (zextloadi8 (add (i32 IntRegs:$src1), + s11_0ExtPred:$offset))), + (i64 (COMBINE_Ir_V4 0, (LDriub_indexed IntRegs:$src1, + s11_0ExtPred:$offset)))>, + Requires<[HasV4T]>; + +// zext i16->i64 +def: Pat <(i64 (zextloadi16 ADDRriS11_1:$src1)), + (i64 (COMBINE_Ir_V4 0, (LDriuh ADDRriS11_1:$src1)))>, + Requires<[HasV4T]>; + +let AddedComplexity = 20 in +def: Pat <(i64 (zextloadi16 (add (i32 IntRegs:$src1), + s11_1ExtPred:$offset))), + (i64 (COMBINE_Ir_V4 0, (LDriuh_indexed IntRegs:$src1, + s11_1ExtPred:$offset)))>, + Requires<[HasV4T]>; + +// anyext i16->i64 +def: Pat <(i64 (extloadi16 ADDRriS11_2:$src1)), + (i64 (COMBINE_Ir_V4 0, (LDrih ADDRriS11_2:$src1)))>, + Requires<[HasV4T]>; + +let AddedComplexity = 20 in +def: Pat <(i64 (extloadi16 (add (i32 IntRegs:$src1), + s11_1ExtPred:$offset))), + (i64 (COMBINE_Ir_V4 0, (LDrih_indexed IntRegs:$src1, + s11_1ExtPred:$offset)))>, + Requires<[HasV4T]>; + +// zext i32->i64 +def: Pat <(i64 (zextloadi32 ADDRriS11_2:$src1)), + (i64 (COMBINE_Ir_V4 0, (LDriw ADDRriS11_2:$src1)))>, + Requires<[HasV4T]>; -// Map from 
load(globaladdress + x) -> memb(#foo + x) let AddedComplexity = 100 in -def : Pat <(i32 (sextloadi8 (add (HexagonCONST32_GP tglobaladdr:$global), - u16ImmPred:$offset))), - (i32 (LDrib_GP_V4 tglobaladdr:$global, u16ImmPred:$offset))>, - Requires<[HasV4T]>; +def: Pat <(i64 (zextloadi32 (i32 (add IntRegs:$src1, s11_2ExtPred:$offset)))), + (i64 (COMBINE_Ir_V4 0, (LDriw_indexed IntRegs:$src1, + s11_2ExtPred:$offset)))>, + Requires<[HasV4T]>; -// Map from load(globaladdress + x) -> memub(#foo + x) -let AddedComplexity = 100 in -def : Pat <(i32 (zextloadi8 (add (HexagonCONST32_GP tglobaladdr:$global), - u16ImmPred:$offset))), - (i32 (LDriub_GP_V4 tglobaladdr:$global, u16ImmPred:$offset))>, - Requires<[HasV4T]>; +// anyext i32->i64 +def: Pat <(i64 (extloadi32 ADDRriS11_2:$src1)), + (i64 (COMBINE_Ir_V4 0, (LDriw ADDRriS11_2:$src1)))>, + Requires<[HasV4T]>; -// Map from load(globaladdress + x) -> memuh(#foo + x) let AddedComplexity = 100 in -def : Pat <(i32 (extloadi16 (add (HexagonCONST32_GP tglobaladdr:$global), - u16ImmPred:$offset))), - (i32 (LDrih_GP_V4 tglobaladdr:$global, u16ImmPred:$offset))>, - Requires<[HasV4T]>; +def: Pat <(i64 (extloadi32 (i32 (add IntRegs:$src1, s11_2ExtPred:$offset)))), + (i64 (COMBINE_Ir_V4 0, (LDriw_indexed IntRegs:$src1, + s11_2ExtPred:$offset)))>, + Requires<[HasV4T]>; -// Map from load(globaladdress + x) -> memh(#foo + x) -let AddedComplexity = 100 in -def : Pat <(i32 (sextloadi16 (add (HexagonCONST32_GP tglobaladdr:$global), - u16ImmPred:$offset))), - (i32 (LDrih_GP_V4 tglobaladdr:$global, u16ImmPred:$offset))>, - Requires<[HasV4T]>; - - -// Map from load(globaladdress + x) -> memuh(#foo + x) -let AddedComplexity = 100 in -def : Pat <(i32 (zextloadi16 (add (HexagonCONST32_GP tglobaladdr:$global), - u16ImmPred:$offset))), - (i32 (LDriuh_GP_V4 tglobaladdr:$global, u16ImmPred:$offset))>, - Requires<[HasV4T]>; - -// Map from load(globaladdress + x) -> memw(#foo + x) -let AddedComplexity = 100 in -def : Pat <(i32 (load (add (HexagonCONST32_GP tglobaladdr:$global), - u16ImmPred:$offset))), - (i32 (LDriw_GP_V4 tglobaladdr:$global, u16ImmPred:$offset))>, - Requires<[HasV4T]>; //===----------------------------------------------------------------------===// @@ -1457,62 +1003,65 @@ def : Pat <(i32 (load (add (HexagonCONST32_GP tglobaladdr:$global), /// last operand. 
/// -// memd(Re=#U6)=Rtt +// memd(Re=#U)=Rtt +let isExtended = 1, opExtendable = 2, validSubTargets = HasV4SubT in { def STrid_abs_setimm_V4 : STInst2<(outs IntRegs:$dst1), - (ins DoubleRegs:$src1, u6Imm:$src2), - "memd($dst1=#$src2) = $src1", + (ins DoubleRegs:$src1, u0AlwaysExt:$src2), + "memd($dst1=##$src2) = $src1", []>, Requires<[HasV4T]>; -// memb(Re=#U6)=Rs +// memb(Re=#U)=Rs def STrib_abs_setimm_V4 : STInst2<(outs IntRegs:$dst1), - (ins IntRegs:$src1, u6Imm:$src2), - "memb($dst1=#$src2) = $src1", + (ins IntRegs:$src1, u0AlwaysExt:$src2), + "memb($dst1=##$src2) = $src1", []>, Requires<[HasV4T]>; -// memh(Re=#U6)=Rs +// memh(Re=#U)=Rs def STrih_abs_setimm_V4 : STInst2<(outs IntRegs:$dst1), - (ins IntRegs:$src1, u6Imm:$src2), - "memh($dst1=#$src2) = $src1", + (ins IntRegs:$src1, u0AlwaysExt:$src2), + "memh($dst1=##$src2) = $src1", []>, Requires<[HasV4T]>; -// memw(Re=#U6)=Rs +// memw(Re=#U)=Rs def STriw_abs_setimm_V4 : STInst2<(outs IntRegs:$dst1), - (ins IntRegs:$src1, u6Imm:$src2), - "memw($dst1=#$src2) = $src1", + (ins IntRegs:$src1, u0AlwaysExt:$src2), + "memw($dst1=##$src2) = $src1", []>, Requires<[HasV4T]>; +} -// memd(Re=#U6)=Rtt +// memd(Re=#U)=Rtt +let isExtended = 1, opExtendable = 2, validSubTargets = HasV4SubT in { def STrid_abs_set_V4 : STInst2<(outs IntRegs:$dst1), - (ins DoubleRegs:$src1, globaladdress:$src2), + (ins DoubleRegs:$src1, globaladdressExt:$src2), "memd($dst1=##$src2) = $src1", []>, Requires<[HasV4T]>; -// memb(Re=#U6)=Rs +// memb(Re=#U)=Rs def STrib_abs_set_V4 : STInst2<(outs IntRegs:$dst1), - (ins IntRegs:$src1, globaladdress:$src2), + (ins IntRegs:$src1, globaladdressExt:$src2), "memb($dst1=##$src2) = $src1", []>, Requires<[HasV4T]>; -// memh(Re=#U6)=Rs +// memh(Re=#U)=Rs def STrih_abs_set_V4 : STInst2<(outs IntRegs:$dst1), - (ins IntRegs:$src1, globaladdress:$src2), + (ins IntRegs:$src1, globaladdressExt:$src2), "memh($dst1=##$src2) = $src1", []>, Requires<[HasV4T]>; -// memw(Re=#U6)=Rs +// memw(Re=#U)=Rs def STriw_abs_set_V4 : STInst2<(outs IntRegs:$dst1), - (ins IntRegs:$src1, globaladdress:$src2), + (ins IntRegs:$src1, globaladdressExt:$src2), "memw($dst1=##$src2) = $src1", []>, Requires<[HasV4T]>; - +} // multiclass for store instructions with base + register offset addressing // mode @@ -1632,13 +1181,14 @@ def : Pat<(store (i64 DoubleRegs:$src4), } // memd(Ru<<#u2+#U6)=Rtt -let AddedComplexity = 10 in +let isExtended = 1, opExtendable = 2, AddedComplexity = 10, +validSubTargets = HasV4SubT in def STrid_shl_V4 : STInst<(outs), - (ins IntRegs:$src1, u2Imm:$src2, u6Imm:$src3, DoubleRegs:$src4), + (ins IntRegs:$src1, u2Imm:$src2, u0AlwaysExt:$src3, DoubleRegs:$src4), "memd($src1<<#$src2+#$src3) = $src4", [(store (i64 DoubleRegs:$src4), (add (shl (i32 IntRegs:$src1), u2ImmPred:$src2), - u6ImmPred:$src3))]>, + u0AlwaysExtPred:$src3))]>, Requires<[HasV4T]>; // memd(Rx++#s4:3)=Rtt @@ -1652,34 +1202,12 @@ def STrid_shl_V4 : STInst<(outs), // if ([!]Pv[.new]) memd(#u6)=Rtt // TODO: needs to be implemented. 
-// if ([!]Pv[.new]) memd(Rx++#s4:3)=Rtt -// if (Pv) memd(Rx++#s4:3)=Rtt -// if (Pv.new) memd(Rx++#s4:3)=Rtt -let AddedComplexity = 10, neverHasSideEffects = 1, - isPredicated = 1 in -def POST_STdri_cdnPt_V4 : STInst2PI<(outs IntRegs:$dst), - (ins PredRegs:$src1, DoubleRegs:$src2, IntRegs:$src3, - s4_3Imm:$offset), - "if ($src1.new) memd($src3++#$offset) = $src2", - [], - "$src3 = $dst">, - Requires<[HasV4T]>; - -// if (!Pv) memd(Rx++#s4:3)=Rtt -// if (!Pv.new) memd(Rx++#s4:3)=Rtt -let AddedComplexity = 10, neverHasSideEffects = 1, - isPredicated = 1 in -def POST_STdri_cdnNotPt_V4 : STInst2PI<(outs IntRegs:$dst), - (ins PredRegs:$src1, DoubleRegs:$src2, IntRegs:$src3, - s4_3Imm:$offset), - "if (!$src1.new) memd($src3++#$offset) = $src2", - [], - "$src3 = $dst">, - Requires<[HasV4T]>; - - +//===----------------------------------------------------------------------===// // multiclass for store instructions with base + immediate offset // addressing mode and immediate stored value. +// mem[bhw](Rx++#s4:3)=#s8 +// if ([!]Pv[.new]) mem[bhw](Rx++#s4:3)=#s6 +//===----------------------------------------------------------------------===// multiclass ST_Imm_Pbase<string mnemonic, Operand OffsetOp, bit isNot, bit isPredNew> { let PNewValue = !if(isPredNew, "new", "") in @@ -1718,9 +1246,9 @@ multiclass ST_Imm<string mnemonic, string CextOp, Operand OffsetOp> { let addrMode = BaseImmOffset, InputType = "imm", validSubTargets = HasV4SubT in { - defm STrib_imm : ST_Imm<"memb", "STrib", u6_0Imm>, ImmRegRel; - defm STrih_imm : ST_Imm<"memh", "STrih", u6_1Imm>, ImmRegRel; - defm STriw_imm : ST_Imm<"memw", "STriw", u6_2Imm>, ImmRegRel; + defm STrib_imm : ST_Imm<"memb", "STrib", u6_0Imm>, ImmRegRel, PredNewRel; + defm STrih_imm : ST_Imm<"memh", "STrih", u6_1Imm>, ImmRegRel, PredNewRel; + defm STriw_imm : ST_Imm<"memw", "STriw", u6_2Imm>, ImmRegRel, PredNewRel; } let Predicates = [HasV4T], AddedComplexity = 10 in { @@ -1741,13 +1269,14 @@ def : Pat <(truncstorei8 s8ExtPred:$src2, (i32 IntRegs:$src1)), Requires<[HasV4T]>; // memb(Ru<<#u2+#U6)=Rt -let AddedComplexity = 10 in +let isExtended = 1, opExtendable = 2, AddedComplexity = 10, isNVStorable = 1, +validSubTargets = HasV4SubT in def STrib_shl_V4 : STInst<(outs), - (ins IntRegs:$src1, u2Imm:$src2, u6Imm:$src3, IntRegs:$src4), + (ins IntRegs:$src1, u2Imm:$src2, u0AlwaysExt:$src3, IntRegs:$src4), "memb($src1<<#$src2+#$src3) = $src4", [(truncstorei8 (i32 IntRegs:$src4), (add (shl (i32 IntRegs:$src1), u2ImmPred:$src2), - u6ImmPred:$src3))]>, + u0AlwaysExtPred:$src3))]>, Requires<[HasV4T]>; // memb(Rx++#s4:0:circ(Mu))=Rt @@ -1757,30 +1286,6 @@ def STrib_shl_V4 : STInst<(outs), // memb(gp+#u16:0)=Rt -// Store byte conditionally. -// if ([!]Pv[.new]) memb(#u6)=Rt -// if ([!]Pv[.new]) memb(Rx++#s4:0)=Rt -// if (Pv) memb(Rx++#s4:0)=Rt -// if (Pv.new) memb(Rx++#s4:0)=Rt -let hasCtrlDep = 1, - isPredicated = 1 in -def POST_STbri_cdnPt_V4 : STInst2PI<(outs IntRegs:$dst), - (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_0Imm:$offset), - "if ($src1.new) memb($src3++#$offset) = $src2", - [],"$src3 = $dst">, - Requires<[HasV4T]>; - -// if (!Pv) memb(Rx++#s4:0)=Rt -// if (!Pv.new) memb(Rx++#s4:0)=Rt -let hasCtrlDep = 1, - isPredicated = 1 in -def POST_STbri_cdnNotPt_V4 : STInst2PI<(outs IntRegs:$dst), - (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_0Imm:$offset), - "if (!$src1.new) memb($src3++#$offset) = $src2", - [],"$src3 = $dst">, - Requires<[HasV4T]>; - - // Store halfword. 
// TODO: needs to be implemented // memh(Re=#U6)=Rt.H @@ -1795,13 +1300,14 @@ def : Pat <(truncstorei16 s8ExtPred:$src2, (i32 IntRegs:$src1)), // memh(Ru<<#u2+#U6)=Rt.H // memh(Ru<<#u2+#U6)=Rt -let AddedComplexity = 10 in +let isExtended = 1, opExtendable = 2, AddedComplexity = 10, isNVStorable = 1, +validSubTargets = HasV4SubT in def STrih_shl_V4 : STInst<(outs), - (ins IntRegs:$src1, u2Imm:$src2, u6Imm:$src3, IntRegs:$src4), + (ins IntRegs:$src1, u2Imm:$src2, u0AlwaysExt:$src3, IntRegs:$src4), "memh($src1<<#$src2+#$src3) = $src4", [(truncstorei16 (i32 IntRegs:$src4), (add (shl (i32 IntRegs:$src1), u2ImmPred:$src2), - u6ImmPred:$src3))]>, + u0AlwaysExtPred:$src3))]>, Requires<[HasV4T]>; // memh(Rx++#s4:1:circ(Mu))=Rt.H @@ -1823,28 +1329,6 @@ def STrih_shl_V4 : STInst<(outs), // if ([!]Pv[.new]) memh(Rx++#s4:1)=Rt.H // TODO: Needs to be implemented. -// if ([!]Pv[.new]) memh(Rx++#s4:1)=Rt -// if (Pv) memh(Rx++#s4:1)=Rt -// if (Pv.new) memh(Rx++#s4:1)=Rt -let hasCtrlDep = 1, - isPredicated = 1 in -def POST_SThri_cdnPt_V4 : STInst2PI<(outs IntRegs:$dst), - (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_1Imm:$offset), - "if ($src1.new) memh($src3++#$offset) = $src2", - [],"$src3 = $dst">, - Requires<[HasV4T]>; - -// if (!Pv) memh(Rx++#s4:1)=Rt -// if (!Pv.new) memh(Rx++#s4:1)=Rt -let hasCtrlDep = 1, - isPredicated = 1 in -def POST_SThri_cdnNotPt_V4 : STInst2PI<(outs IntRegs:$dst), - (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_1Imm:$offset), - "if (!$src1.new) memh($src3++#$offset) = $src2", - [],"$src3 = $dst">, - Requires<[HasV4T]>; - - // Store word. // memw(Re=#U6)=Rt // TODO: Needs to be implemented. @@ -1863,13 +1347,14 @@ def : Pat <(store s8ExtPred:$src2, (i32 IntRegs:$src1)), Requires<[HasV4T]>; // memw(Ru<<#u2+#U6)=Rt -let AddedComplexity = 10 in +let isExtended = 1, opExtendable = 2, AddedComplexity = 10, isNVStorable = 1, +validSubTargets = HasV4SubT in def STriw_shl_V4 : STInst<(outs), - (ins IntRegs:$src1, u2Imm:$src2, u6Imm:$src3, IntRegs:$src4), + (ins IntRegs:$src1, u2Imm:$src2, u0AlwaysExt:$src3, IntRegs:$src4), "memw($src1<<#$src2+#$src3) = $src4", [(store (i32 IntRegs:$src4), (add (shl (i32 IntRegs:$src1), u2ImmPred:$src2), - u6ImmPred:$src3))]>, + u0AlwaysExtPred:$src3))]>, Requires<[HasV4T]>; // memw(Rx++#s4:2)=Rt @@ -1880,188 +1365,9 @@ def STriw_shl_V4 : STInst<(outs), // memw(gp+#u16:2)=Rt -// if ([!]Pv[.new]) memw(Rx++#s4:2)=Rt -// if (Pv) memw(Rx++#s4:2)=Rt -// if (Pv.new) memw(Rx++#s4:2)=Rt -let hasCtrlDep = 1, - isPredicated = 1 in -def POST_STwri_cdnPt_V4 : STInst2PI<(outs IntRegs:$dst), - (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_2Imm:$offset), - "if ($src1.new) memw($src3++#$offset) = $src2", - [],"$src3 = $dst">, - Requires<[HasV4T]>; - -// if (!Pv) memw(Rx++#s4:2)=Rt -// if (!Pv.new) memw(Rx++#s4:2)=Rt -let hasCtrlDep = 1, - isPredicated = 1 in -def POST_STwri_cdnNotPt_V4 : STInst2PI<(outs IntRegs:$dst), - (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_2Imm:$offset), - "if (!$src1.new) memw($src3++#$offset) = $src2", - [],"$src3 = $dst">, - Requires<[HasV4T]>; - - -/// store to global address - -let isPredicable = 1, neverHasSideEffects = 1 in -def STrid_GP_V4 : STInst2<(outs), - (ins globaladdress:$global, u16Imm:$offset, DoubleRegs:$src), - "memd(#$global+$offset) = $src", - []>, - Requires<[HasV4T]>; - -let neverHasSideEffects = 1, isPredicated = 1 in -def STrid_GP_cPt_V4 : STInst2<(outs), - (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset, - DoubleRegs:$src2), - "if ($src1) memd(##$global+$offset) = $src2", - 
[]>, - Requires<[HasV4T]>; - -let neverHasSideEffects = 1, isPredicated = 1 in -def STrid_GP_cNotPt_V4 : STInst2<(outs), - (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset, - DoubleRegs:$src2), - "if (!$src1) memd(##$global+$offset) = $src2", - []>, - Requires<[HasV4T]>; - -let neverHasSideEffects = 1, isPredicated = 1 in -def STrid_GP_cdnPt_V4 : STInst2<(outs), - (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset, - DoubleRegs:$src2), - "if ($src1.new) memd(##$global+$offset) = $src2", - []>, - Requires<[HasV4T]>; - -let neverHasSideEffects = 1, isPredicated = 1 in -def STrid_GP_cdnNotPt_V4 : STInst2<(outs), - (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset, - DoubleRegs:$src2), - "if (!$src1.new) memd(##$global+$offset) = $src2", - []>, - Requires<[HasV4T]>; - -let isPredicable = 1, neverHasSideEffects = 1 in -def STrib_GP_V4 : STInst2<(outs), - (ins globaladdress:$global, u16Imm:$offset, IntRegs:$src), - "memb(#$global+$offset) = $src", - []>, - Requires<[HasV4T]>; - -let neverHasSideEffects = 1, isPredicated = 1 in -def STrib_GP_cPt_V4 : STInst2<(outs), - (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset, - IntRegs:$src2), - "if ($src1) memb(##$global+$offset) = $src2", - []>, - Requires<[HasV4T]>; - -let neverHasSideEffects = 1, isPredicated = 1 in -def STrib_GP_cNotPt_V4 : STInst2<(outs), - (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset, - IntRegs:$src2), - "if (!$src1) memb(##$global+$offset) = $src2", - []>, - Requires<[HasV4T]>; - -let neverHasSideEffects = 1, isPredicated = 1 in -def STrib_GP_cdnPt_V4 : STInst2<(outs), - (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset, - IntRegs:$src2), - "if ($src1.new) memb(##$global+$offset) = $src2", - []>, - Requires<[HasV4T]>; - -let neverHasSideEffects = 1, isPredicated = 1 in -def STrib_GP_cdnNotPt_V4 : STInst2<(outs), - (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset, - IntRegs:$src2), - "if (!$src1.new) memb(##$global+$offset) = $src2", - []>, - Requires<[HasV4T]>; - -let isPredicable = 1, neverHasSideEffects = 1 in -def STrih_GP_V4 : STInst2<(outs), - (ins globaladdress:$global, u16Imm:$offset, IntRegs:$src), - "memh(#$global+$offset) = $src", - []>, - Requires<[HasV4T]>; - -let neverHasSideEffects = 1, isPredicated = 1 in -def STrih_GP_cPt_V4 : STInst2<(outs), - (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset, - IntRegs:$src2), - "if ($src1) memh(##$global+$offset) = $src2", - []>, - Requires<[HasV4T]>; - -let neverHasSideEffects = 1, isPredicated = 1 in -def STrih_GP_cNotPt_V4 : STInst2<(outs), - (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset, - IntRegs:$src2), - "if (!$src1) memh(##$global+$offset) = $src2", - []>, - Requires<[HasV4T]>; - -let neverHasSideEffects = 1, isPredicated = 1 in -def STrih_GP_cdnPt_V4 : STInst2<(outs), - (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset, - IntRegs:$src2), - "if ($src1.new) memh(##$global+$offset) = $src2", - []>, - Requires<[HasV4T]>; - -let neverHasSideEffects = 1, isPredicated = 1 in -def STrih_GP_cdnNotPt_V4 : STInst2<(outs), - (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset, - IntRegs:$src2), - "if (!$src1.new) memh(##$global+$offset) = $src2", - []>, - Requires<[HasV4T]>; - -let isPredicable = 1, neverHasSideEffects = 1 in -def STriw_GP_V4 : STInst2<(outs), - (ins globaladdress:$global, u16Imm:$offset, IntRegs:$src), - "memw(#$global+$offset) = $src", - []>, - Requires<[HasV4T]>; - -let neverHasSideEffects = 1, isPredicated = 1 in -def STriw_GP_cPt_V4 : STInst2<(outs), - 
(ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset, - IntRegs:$src2), - "if ($src1) memw(##$global+$offset) = $src2", - []>, - Requires<[HasV4T]>; - -let neverHasSideEffects = 1, isPredicated = 1 in -def STriw_GP_cNotPt_V4 : STInst2<(outs), - (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset, - IntRegs:$src2), - "if (!$src1) memw(##$global+$offset) = $src2", - []>, - Requires<[HasV4T]>; - -let neverHasSideEffects = 1, isPredicated = 1 in -def STriw_GP_cdnPt_V4 : STInst2<(outs), - (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset, - IntRegs:$src2), - "if ($src1.new) memw(##$global+$offset) = $src2", - []>, - Requires<[HasV4T]>; - -let neverHasSideEffects = 1, isPredicated = 1 in -def STriw_GP_cdnNotPt_V4 : STInst2<(outs), - (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset, - IntRegs:$src2), - "if (!$src1.new) memw(##$global+$offset) = $src2", - []>, - Requires<[HasV4T]>; - // memd(#global)=Rtt -let isPredicable = 1, neverHasSideEffects = 1 in +let isPredicable = 1, mayStore = 1, neverHasSideEffects = 1, +validSubTargets = HasV4SubT in def STd_GP_V4 : STInst2<(outs), (ins globaladdress:$global, DoubleRegs:$src), "memd(#$global) = $src", @@ -2069,7 +1375,8 @@ def STd_GP_V4 : STInst2<(outs), Requires<[HasV4T]>; // if (Pv) memd(##global) = Rtt -let neverHasSideEffects = 1, isPredicated = 1 in +let mayStore = 1, neverHasSideEffects = 1, isPredicated = 1, +isExtended = 1, opExtendable = 1, validSubTargets = HasV4SubT in { def STd_GP_cPt_V4 : STInst2<(outs), (ins PredRegs:$src1, globaladdress:$global, DoubleRegs:$src2), "if ($src1) memd(##$global) = $src2", @@ -2077,7 +1384,6 @@ def STd_GP_cPt_V4 : STInst2<(outs), Requires<[HasV4T]>; // if (!Pv) memd(##global) = Rtt -let neverHasSideEffects = 1, isPredicated = 1 in def STd_GP_cNotPt_V4 : STInst2<(outs), (ins PredRegs:$src1, globaladdress:$global, DoubleRegs:$src2), "if (!$src1) memd(##$global) = $src2", @@ -2085,7 +1391,6 @@ def STd_GP_cNotPt_V4 : STInst2<(outs), Requires<[HasV4T]>; // if (Pv) memd(##global) = Rtt -let neverHasSideEffects = 1, isPredicated = 1 in def STd_GP_cdnPt_V4 : STInst2<(outs), (ins PredRegs:$src1, globaladdress:$global, DoubleRegs:$src2), "if ($src1.new) memd(##$global) = $src2", @@ -2093,15 +1398,16 @@ def STd_GP_cdnPt_V4 : STInst2<(outs), Requires<[HasV4T]>; // if (!Pv) memd(##global) = Rtt -let neverHasSideEffects = 1, isPredicated = 1 in def STd_GP_cdnNotPt_V4 : STInst2<(outs), (ins PredRegs:$src1, globaladdress:$global, DoubleRegs:$src2), "if (!$src1.new) memd(##$global) = $src2", []>, Requires<[HasV4T]>; +} // memb(#global)=Rt -let isPredicable = 1, neverHasSideEffects = 1 in +let isPredicable = 1, neverHasSideEffects = 1, isNVStorable = 1, +validSubTargets = HasV4SubT in def STb_GP_V4 : STInst2<(outs), (ins globaladdress:$global, IntRegs:$src), "memb(#$global) = $src", @@ -2109,7 +1415,8 @@ def STb_GP_V4 : STInst2<(outs), Requires<[HasV4T]>; // if (Pv) memb(##global) = Rt -let neverHasSideEffects = 1, isPredicated = 1 in +let neverHasSideEffects = 1, isPredicated = 1, isNVStorable = 1, +isExtended = 1, opExtendable = 1, validSubTargets = HasV4SubT in { def STb_GP_cPt_V4 : STInst2<(outs), (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2), "if ($src1) memb(##$global) = $src2", @@ -2117,7 +1424,6 @@ def STb_GP_cPt_V4 : STInst2<(outs), Requires<[HasV4T]>; // if (!Pv) memb(##global) = Rt -let neverHasSideEffects = 1, isPredicated = 1 in def STb_GP_cNotPt_V4 : STInst2<(outs), (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2), "if (!$src1) memb(##$global) = $src2", @@ 
-2125,7 +1431,6 @@ def STb_GP_cNotPt_V4 : STInst2<(outs), Requires<[HasV4T]>; // if (Pv) memb(##global) = Rt -let neverHasSideEffects = 1, isPredicated = 1 in def STb_GP_cdnPt_V4 : STInst2<(outs), (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2), "if ($src1.new) memb(##$global) = $src2", @@ -2133,15 +1438,16 @@ def STb_GP_cdnPt_V4 : STInst2<(outs), Requires<[HasV4T]>; // if (!Pv) memb(##global) = Rt -let neverHasSideEffects = 1, isPredicated = 1 in def STb_GP_cdnNotPt_V4 : STInst2<(outs), (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2), "if (!$src1.new) memb(##$global) = $src2", []>, Requires<[HasV4T]>; +} // memh(#global)=Rt -let isPredicable = 1, neverHasSideEffects = 1 in +let isPredicable = 1, neverHasSideEffects = 1, isNVStorable = 1, +validSubTargets = HasV4SubT in def STh_GP_V4 : STInst2<(outs), (ins globaladdress:$global, IntRegs:$src), "memh(#$global) = $src", @@ -2149,7 +1455,8 @@ def STh_GP_V4 : STInst2<(outs), Requires<[HasV4T]>; // if (Pv) memh(##global) = Rt -let neverHasSideEffects = 1, isPredicated = 1 in +let neverHasSideEffects = 1, isPredicated = 1, isNVStorable = 1, +isExtended = 1, opExtendable = 1, validSubTargets = HasV4SubT in { def STh_GP_cPt_V4 : STInst2<(outs), (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2), "if ($src1) memh(##$global) = $src2", @@ -2157,7 +1464,6 @@ def STh_GP_cPt_V4 : STInst2<(outs), Requires<[HasV4T]>; // if (!Pv) memh(##global) = Rt -let neverHasSideEffects = 1, isPredicated = 1 in def STh_GP_cNotPt_V4 : STInst2<(outs), (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2), "if (!$src1) memh(##$global) = $src2", @@ -2165,7 +1471,6 @@ def STh_GP_cNotPt_V4 : STInst2<(outs), Requires<[HasV4T]>; // if (Pv) memh(##global) = Rt -let neverHasSideEffects = 1, isPredicated = 1 in def STh_GP_cdnPt_V4 : STInst2<(outs), (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2), "if ($src1.new) memh(##$global) = $src2", @@ -2173,15 +1478,16 @@ def STh_GP_cdnPt_V4 : STInst2<(outs), Requires<[HasV4T]>; // if (!Pv) memh(##global) = Rt -let neverHasSideEffects = 1, isPredicated = 1 in def STh_GP_cdnNotPt_V4 : STInst2<(outs), (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2), "if (!$src1.new) memh(##$global) = $src2", []>, Requires<[HasV4T]>; +} // memw(#global)=Rt -let isPredicable = 1, neverHasSideEffects = 1 in +let isPredicable = 1, neverHasSideEffects = 1, isNVStorable = 1, +validSubTargets = HasV4SubT in def STw_GP_V4 : STInst2<(outs), (ins globaladdress:$global, IntRegs:$src), "memw(#$global) = $src", @@ -2189,7 +1495,8 @@ def STw_GP_V4 : STInst2<(outs), Requires<[HasV4T]>; // if (Pv) memw(##global) = Rt -let neverHasSideEffects = 1, isPredicated = 1 in +let neverHasSideEffects = 1, isPredicated = 1, isNVStorable = 1, +isExtended = 1, opExtendable = 1, validSubTargets = HasV4SubT in { def STw_GP_cPt_V4 : STInst2<(outs), (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2), "if ($src1) memw(##$global) = $src2", @@ -2197,7 +1504,6 @@ def STw_GP_cPt_V4 : STInst2<(outs), Requires<[HasV4T]>; // if (!Pv) memw(##global) = Rt -let neverHasSideEffects = 1, isPredicated = 1 in def STw_GP_cNotPt_V4 : STInst2<(outs), (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2), "if (!$src1) memw(##$global) = $src2", @@ -2205,7 +1511,6 @@ def STw_GP_cNotPt_V4 : STInst2<(outs), Requires<[HasV4T]>; // if (Pv) memw(##global) = Rt -let neverHasSideEffects = 1, isPredicated = 1 in def STw_GP_cdnPt_V4 : STInst2<(outs), (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2), "if ($src1.new) memw(##$global) = $src2", 
@@ -2213,12 +1518,12 @@ def STw_GP_cdnPt_V4 : STInst2<(outs), Requires<[HasV4T]>; // if (!Pv) memw(##global) = Rt -let neverHasSideEffects = 1, isPredicated = 1 in def STw_GP_cdnNotPt_V4 : STInst2<(outs), (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2), "if (!$src1.new) memw(##$global) = $src2", []>, Requires<[HasV4T]>; +} // 64 bit atomic store def : Pat <(atomic_store_64 (HexagonCONST32_GP tglobaladdr:$global), @@ -2277,72 +1582,6 @@ def : Pat<(store (i32 IntRegs:$src1), (HexagonCONST32_GP tglobaladdr:$global)), (STw_GP_V4 tglobaladdr:$global, (i32 IntRegs:$src1))>, Requires<[HasV4T]>; -def : Pat<(atomic_store_64 (add (HexagonCONST32_GP tglobaladdr:$global), - u16ImmPred:$offset), - (i64 DoubleRegs:$src1)), - (STrid_GP_V4 tglobaladdr:$global, u16ImmPred:$offset, - (i64 DoubleRegs:$src1))>, - Requires<[HasV4T]>; - -def : Pat<(atomic_store_32 (add (HexagonCONST32_GP tglobaladdr:$global), - u16ImmPred:$offset), - (i32 IntRegs:$src1)), - (STriw_GP_V4 tglobaladdr:$global, u16ImmPred:$offset, - (i32 IntRegs:$src1))>, - Requires<[HasV4T]>; - -def : Pat<(atomic_store_16 (add (HexagonCONST32_GP tglobaladdr:$global), - u16ImmPred:$offset), - (i32 IntRegs:$src1)), - (STrih_GP_V4 tglobaladdr:$global, u16ImmPred:$offset, - (i32 IntRegs:$src1))>, - Requires<[HasV4T]>; - -def : Pat<(atomic_store_8 (add (HexagonCONST32_GP tglobaladdr:$global), - u16ImmPred:$offset), - (i32 IntRegs:$src1)), - (STrib_GP_V4 tglobaladdr:$global, u16ImmPred:$offset, - (i32 IntRegs:$src1))>, - Requires<[HasV4T]>; - -// Map from store(globaladdress + x) -> memd(#foo + x) -let AddedComplexity = 100 in -def : Pat<(store (i64 DoubleRegs:$src1), - (add (HexagonCONST32_GP tglobaladdr:$global), - u16ImmPred:$offset)), - (STrid_GP_V4 tglobaladdr:$global, u16ImmPred:$offset, - (i64 DoubleRegs:$src1))>, - Requires<[HasV4T]>; - -// Map from store(globaladdress + x) -> memb(#foo + x) -let AddedComplexity = 100 in -def : Pat<(truncstorei8 (i32 IntRegs:$src1), - (add (HexagonCONST32_GP tglobaladdr:$global), - u16ImmPred:$offset)), - (STrib_GP_V4 tglobaladdr:$global, u16ImmPred:$offset, - (i32 IntRegs:$src1))>, - Requires<[HasV4T]>; - -// Map from store(globaladdress + x) -> memh(#foo + x) -let AddedComplexity = 100 in -def : Pat<(truncstorei16 (i32 IntRegs:$src1), - (add (HexagonCONST32_GP tglobaladdr:$global), - u16ImmPred:$offset)), - (STrih_GP_V4 tglobaladdr:$global, u16ImmPred:$offset, - (i32 IntRegs:$src1))>, - Requires<[HasV4T]>; - -// Map from store(globaladdress + x) -> memw(#foo + x) -let AddedComplexity = 100 in -def : Pat<(store (i32 IntRegs:$src1), - (add (HexagonCONST32_GP tglobaladdr:$global), - u16ImmPred:$offset)), - (STriw_GP_V4 tglobaladdr:$global, u16ImmPred:$offset, - (i32 IntRegs:$src1))>, - Requires<[HasV4T]>; - - - //===----------------------------------------------------------------------=== // ST - //===----------------------------------------------------------------------=== @@ -2456,35 +1695,72 @@ mayStore = 1 in { } // memb(Ru<<#u2+#U6)=Nt.new -let mayStore = 1, AddedComplexity = 10 in +let isExtended = 1, opExtendable = 2, mayStore = 1, AddedComplexity = 10, +isNVStore = 1, validSubTargets = HasV4SubT in def STrib_shl_nv_V4 : NVInst_V4<(outs), - (ins IntRegs:$src1, u2Imm:$src2, u6Imm:$src3, IntRegs:$src4), + (ins IntRegs:$src1, u2Imm:$src2, u0AlwaysExt:$src3, IntRegs:$src4), "memb($src1<<#$src2+#$src3) = $src4.new", []>, Requires<[HasV4T]>; -// memb(Rx++#s4:0)=Nt.new -let mayStore = 1, hasCtrlDep = 1, isPredicable = 1 in -def POST_STbri_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst), - (ins IntRegs:$src1, 
IntRegs:$src2, s4_0Imm:$offset), - "memb($src2++#$offset) = $src1.new", +//===----------------------------------------------------------------------===// +// Post increment store +// mem[bhwd](Rx++#s4:[0123])=Nt.new +//===----------------------------------------------------------------------===// + +multiclass ST_PostInc_Pbase_nv<string mnemonic, RegisterClass RC, Operand ImmOp, + bit isNot, bit isPredNew> { + let PNewValue = !if(isPredNew, "new", "") in + def NAME#_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst), + (ins PredRegs:$src1, IntRegs:$src2, ImmOp:$offset, RC:$src3), + !if(isNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ", + ") ")#mnemonic#"($src2++#$offset) = $src3.new", [], "$src2 = $dst">, Requires<[HasV4T]>; +} + +multiclass ST_PostInc_Pred_nv<string mnemonic, RegisterClass RC, + Operand ImmOp, bit PredNot> { + let PredSense = !if(PredNot, "false", "true") in { + defm _c#NAME : ST_PostInc_Pbase_nv<mnemonic, RC, ImmOp, PredNot, 0>; + // Predicate new + let Predicates = [HasV4T], validSubTargets = HasV4SubT in + defm _cdn#NAME : ST_PostInc_Pbase_nv<mnemonic, RC, ImmOp, PredNot, 1>; + } +} + +let hasCtrlDep = 1, isNVStore = 1, neverHasSideEffects = 1 in +multiclass ST_PostInc_nv<string mnemonic, string BaseOp, RegisterClass RC, + Operand ImmOp> { + + let BaseOpcode = "POST_"#BaseOp in { + let isPredicable = 1 in + def NAME#_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst), + (ins IntRegs:$src1, ImmOp:$offset, RC:$src2), + mnemonic#"($src1++#$offset) = $src2.new", + [], + "$src1 = $dst">, + Requires<[HasV4T]>; + + let isPredicated = 1 in { + defm Pt : ST_PostInc_Pred_nv<mnemonic, RC, ImmOp, 0 >; + defm NotPt : ST_PostInc_Pred_nv<mnemonic, RC, ImmOp, 1 >; + } + } +} + +let validSubTargets = HasV4SubT in { +defm POST_STbri: ST_PostInc_nv <"memb", "STrib", IntRegs, s4_0Imm>, AddrModeRel; +defm POST_SThri: ST_PostInc_nv <"memh", "STrih", IntRegs, s4_1Imm>, AddrModeRel; +defm POST_STwri: ST_PostInc_nv <"memw", "STriw", IntRegs, s4_2Imm>, AddrModeRel; +} // memb(Rx++#s4:0:circ(Mu))=Nt.new // memb(Rx++I:circ(Mu))=Nt.new // memb(Rx++Mu)=Nt.new // memb(Rx++Mu:brev)=Nt.new -// memb(gp+#u16:0)=Nt.new -let mayStore = 1, neverHasSideEffects = 1 in -def STrib_GP_nv_V4 : NVInst_V4<(outs), - (ins globaladdress:$global, u16Imm:$offset, IntRegs:$src), - "memb(#$global+$offset) = $src.new", - []>, - Requires<[HasV4T]>; - // memb(#global)=Nt.new let mayStore = 1, neverHasSideEffects = 1 in def STb_GP_nv_V4 : NVInst_V4<(outs), @@ -2493,73 +1769,20 @@ def STb_GP_nv_V4 : NVInst_V4<(outs), []>, Requires<[HasV4T]>; -// if ([!]Pv[.new]) memb(Rx++#s4:0)=Nt.new -// if (Pv) memb(Rx++#s4:0)=Nt.new -let mayStore = 1, hasCtrlDep = 1, - isPredicated = 1 in -def POST_STbri_cPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst), - (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_0Imm:$offset), - "if ($src1) memb($src3++#$offset) = $src2.new", - [],"$src3 = $dst">, - Requires<[HasV4T]>; - -// if (Pv.new) memb(Rx++#s4:0)=Nt.new -let mayStore = 1, hasCtrlDep = 1, - isPredicated = 1 in -def POST_STbri_cdnPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst), - (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_0Imm:$offset), - "if ($src1.new) memb($src3++#$offset) = $src2.new", - [],"$src3 = $dst">, - Requires<[HasV4T]>; - -// if (!Pv) memb(Rx++#s4:0)=Nt.new -let mayStore = 1, hasCtrlDep = 1, - isPredicated = 1 in -def POST_STbri_cNotPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst), - (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_0Imm:$offset), - "if (!$src1) memb($src3++#$offset) = $src2.new", - [],"$src3 = $dst">, - Requires<[HasV4T]>; 
- -// if (!Pv.new) memb(Rx++#s4:0)=Nt.new -let mayStore = 1, hasCtrlDep = 1, - isPredicated = 1 in -def POST_STbri_cdnNotPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst), - (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_0Imm:$offset), - "if (!$src1.new) memb($src3++#$offset) = $src2.new", - [],"$src3 = $dst">, - Requires<[HasV4T]>; - // memh(Ru<<#u2+#U6)=Nt.new -let mayStore = 1, AddedComplexity = 10 in +let isExtended = 1, opExtendable = 2, mayStore = 1, AddedComplexity = 10, +isNVStore = 1, validSubTargets = HasV4SubT in def STrih_shl_nv_V4 : NVInst_V4<(outs), - (ins IntRegs:$src1, u2Imm:$src2, u6Imm:$src3, IntRegs:$src4), + (ins IntRegs:$src1, u2Imm:$src2, u0AlwaysExt:$src3, IntRegs:$src4), "memh($src1<<#$src2+#$src3) = $src4.new", []>, Requires<[HasV4T]>; -// memh(Rx++#s4:1)=Nt.new -let mayStore = 1, hasCtrlDep = 1, isPredicable = 1 in -def POST_SThri_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst), - (ins IntRegs:$src1, IntRegs:$src2, s4_1Imm:$offset), - "memh($src2++#$offset) = $src1.new", - [], - "$src2 = $dst">, - Requires<[HasV4T]>; - // memh(Rx++#s4:1:circ(Mu))=Nt.new // memh(Rx++I:circ(Mu))=Nt.new // memh(Rx++Mu)=Nt.new // memh(Rx++Mu:brev)=Nt.new -// memh(gp+#u16:1)=Nt.new -let mayStore = 1, neverHasSideEffects = 1 in -def STrih_GP_nv_V4 : NVInst_V4<(outs), - (ins globaladdress:$global, u16Imm:$offset, IntRegs:$src), - "memh(#$global+$offset) = $src.new", - []>, - Requires<[HasV4T]>; - // memh(#global)=Nt.new let mayStore = 1, neverHasSideEffects = 1 in def STh_GP_nv_V4 : NVInst_V4<(outs), @@ -2568,121 +1791,32 @@ def STh_GP_nv_V4 : NVInst_V4<(outs), []>, Requires<[HasV4T]>; - -// if ([!]Pv[]) memh(Rx++#s4:1)=Nt.new -// if (Pv) memh(Rx++#s4:1)=Nt.new -let mayStore = 1, hasCtrlDep = 1, - isPredicated = 1 in -def POST_SThri_cPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst), - (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_1Imm:$offset), - "if ($src1) memh($src3++#$offset) = $src2.new", - [],"$src3 = $dst">, - Requires<[HasV4T]>; - -// if (Pv.new) memh(Rx++#s4:1)=Nt.new -let mayStore = 1, hasCtrlDep = 1, - isPredicated = 1 in -def POST_SThri_cdnPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst), - (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_1Imm:$offset), - "if ($src1.new) memh($src3++#$offset) = $src2.new", - [],"$src3 = $dst">, - Requires<[HasV4T]>; - -// if (!Pv) memh(Rx++#s4:1)=Nt.new -let mayStore = 1, hasCtrlDep = 1, - isPredicated = 1 in -def POST_SThri_cNotPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst), - (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_1Imm:$offset), - "if (!$src1) memh($src3++#$offset) = $src2.new", - [],"$src3 = $dst">, - Requires<[HasV4T]>; - -// if (!Pv.new) memh(Rx++#s4:1)=Nt.new -let mayStore = 1, hasCtrlDep = 1, - isPredicated = 1 in -def POST_SThri_cdnNotPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst), - (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_1Imm:$offset), - "if (!$src1.new) memh($src3++#$offset) = $src2.new", - [],"$src3 = $dst">, - Requires<[HasV4T]>; - // memw(Ru<<#u2+#U6)=Nt.new -let mayStore = 1, AddedComplexity = 10 in +let isExtended = 1, opExtendable = 2, mayStore = 1, AddedComplexity = 10, +isNVStore = 1, validSubTargets = HasV4SubT in def STriw_shl_nv_V4 : NVInst_V4<(outs), - (ins IntRegs:$src1, u2Imm:$src2, u6Imm:$src3, IntRegs:$src4), + (ins IntRegs:$src1, u2Imm:$src2, u0AlwaysExt:$src3, IntRegs:$src4), "memw($src1<<#$src2+#$src3) = $src4.new", []>, Requires<[HasV4T]>; -// memw(Rx++#s4:2)=Nt.new -let mayStore = 1, hasCtrlDep = 1, isPredicable = 1 in -def POST_STwri_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst), - (ins IntRegs:$src1, 
IntRegs:$src2, s4_2Imm:$offset), - "memw($src2++#$offset) = $src1.new", - [], - "$src2 = $dst">, - Requires<[HasV4T]>; - // memw(Rx++#s4:2:circ(Mu))=Nt.new // memw(Rx++I:circ(Mu))=Nt.new // memw(Rx++Mu)=Nt.new // memw(Rx++Mu:brev)=Nt.new // memw(gp+#u16:2)=Nt.new -let mayStore = 1, neverHasSideEffects = 1 in -def STriw_GP_nv_V4 : NVInst_V4<(outs), - (ins globaladdress:$global, u16Imm:$offset, IntRegs:$src), - "memw(#$global+$offset) = $src.new", - []>, - Requires<[HasV4T]>; -let mayStore = 1, neverHasSideEffects = 1 in +let mayStore = 1, neverHasSideEffects = 1, isNVStore = 1, +validSubTargets = HasV4SubT in def STw_GP_nv_V4 : NVInst_V4<(outs), (ins globaladdress:$global, IntRegs:$src), "memw(#$global) = $src.new", []>, Requires<[HasV4T]>; -// if ([!]Pv[.new]) memw(Rx++#s4:2)=Nt.new -// if (Pv) memw(Rx++#s4:2)=Nt.new -let mayStore = 1, hasCtrlDep = 1, - isPredicated = 1 in -def POST_STwri_cPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst), - (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_2Imm:$offset), - "if ($src1) memw($src3++#$offset) = $src2.new", - [],"$src3 = $dst">, - Requires<[HasV4T]>; - -// if (Pv.new) memw(Rx++#s4:2)=Nt.new -let mayStore = 1, hasCtrlDep = 1, - isPredicated = 1 in -def POST_STwri_cdnPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst), - (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_2Imm:$offset), - "if ($src1.new) memw($src3++#$offset) = $src2.new", - [],"$src3 = $dst">, - Requires<[HasV4T]>; - -// if (!Pv) memw(Rx++#s4:2)=Nt.new -let mayStore = 1, hasCtrlDep = 1, - isPredicated = 1 in -def POST_STwri_cNotPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst), - (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_2Imm:$offset), - "if (!$src1) memw($src3++#$offset) = $src2.new", - [],"$src3 = $dst">, - Requires<[HasV4T]>; - -// if (!Pv.new) memw(Rx++#s4:2)=Nt.new -let mayStore = 1, hasCtrlDep = 1, - isPredicated = 1 in -def POST_STwri_cdnNotPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst), - (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, s4_2Imm:$offset), - "if (!$src1.new) memw($src3++#$offset) = $src2.new", - [],"$src3 = $dst">, - Requires<[HasV4T]>; - - - // if (Pv) memb(##global) = Rt -let mayStore = 1, neverHasSideEffects = 1 in +let mayStore = 1, neverHasSideEffects = 1, isNVStore = 1, +isExtended = 1, opExtendable = 1, validSubTargets = HasV4SubT in { def STb_GP_cPt_nv_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2), "if ($src1) memb(##$global) = $src2.new", @@ -2690,7 +1824,6 @@ def STb_GP_cPt_nv_V4 : NVInst_V4<(outs), Requires<[HasV4T]>; // if (!Pv) memb(##global) = Rt -let mayStore = 1, neverHasSideEffects = 1 in def STb_GP_cNotPt_nv_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2), "if (!$src1) memb(##$global) = $src2.new", @@ -2698,7 +1831,6 @@ def STb_GP_cNotPt_nv_V4 : NVInst_V4<(outs), Requires<[HasV4T]>; // if (Pv) memb(##global) = Rt -let mayStore = 1, neverHasSideEffects = 1 in def STb_GP_cdnPt_nv_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2), "if ($src1.new) memb(##$global) = $src2.new", @@ -2706,7 +1838,6 @@ def STb_GP_cdnPt_nv_V4 : NVInst_V4<(outs), Requires<[HasV4T]>; // if (!Pv) memb(##global) = Rt -let mayStore = 1, neverHasSideEffects = 1 in def STb_GP_cdnNotPt_nv_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2), "if (!$src1.new) memb(##$global) = $src2.new", @@ -2714,7 +1845,6 @@ def STb_GP_cdnNotPt_nv_V4 : NVInst_V4<(outs), Requires<[HasV4T]>; // if (Pv) memh(##global) = Rt -let mayStore = 1, neverHasSideEffects = 1 
in def STh_GP_cPt_nv_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2), "if ($src1) memh(##$global) = $src2.new", @@ -2722,7 +1852,6 @@ def STh_GP_cPt_nv_V4 : NVInst_V4<(outs), Requires<[HasV4T]>; // if (!Pv) memh(##global) = Rt -let mayStore = 1, neverHasSideEffects = 1 in def STh_GP_cNotPt_nv_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2), "if (!$src1) memh(##$global) = $src2.new", @@ -2730,7 +1859,6 @@ def STh_GP_cNotPt_nv_V4 : NVInst_V4<(outs), Requires<[HasV4T]>; // if (Pv) memh(##global) = Rt -let mayStore = 1, neverHasSideEffects = 1 in def STh_GP_cdnPt_nv_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2), "if ($src1.new) memh(##$global) = $src2.new", @@ -2738,7 +1866,6 @@ def STh_GP_cdnPt_nv_V4 : NVInst_V4<(outs), Requires<[HasV4T]>; // if (!Pv) memh(##global) = Rt -let mayStore = 1, neverHasSideEffects = 1 in def STh_GP_cdnNotPt_nv_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2), "if (!$src1.new) memh(##$global) = $src2.new", @@ -2746,7 +1873,6 @@ def STh_GP_cdnNotPt_nv_V4 : NVInst_V4<(outs), Requires<[HasV4T]>; // if (Pv) memw(##global) = Rt -let mayStore = 1, neverHasSideEffects = 1 in def STw_GP_cPt_nv_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2), "if ($src1) memw(##$global) = $src2.new", @@ -2754,7 +1880,6 @@ def STw_GP_cPt_nv_V4 : NVInst_V4<(outs), Requires<[HasV4T]>; // if (!Pv) memw(##global) = Rt -let mayStore = 1, neverHasSideEffects = 1 in def STw_GP_cNotPt_nv_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2), "if (!$src1) memw(##$global) = $src2.new", @@ -2762,7 +1887,6 @@ def STw_GP_cNotPt_nv_V4 : NVInst_V4<(outs), Requires<[HasV4T]>; // if (Pv) memw(##global) = Rt -let mayStore = 1, neverHasSideEffects = 1 in def STw_GP_cdnPt_nv_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2), "if ($src1.new) memw(##$global) = $src2.new", @@ -2770,108 +1894,12 @@ def STw_GP_cdnPt_nv_V4 : NVInst_V4<(outs), Requires<[HasV4T]>; // if (!Pv) memw(##global) = Rt -let mayStore = 1, neverHasSideEffects = 1 in def STw_GP_cdnNotPt_nv_V4 : NVInst_V4<(outs), (ins PredRegs:$src1, globaladdress:$global, IntRegs:$src2), "if (!$src1.new) memw(##$global) = $src2.new", []>, Requires<[HasV4T]>; - -let mayStore = 1, neverHasSideEffects = 1 in -def STrib_GP_cPt_nv_V4 : NVInst_V4<(outs), - (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset, - IntRegs:$src2), - "if ($src1) memb(##$global+$offset) = $src2.new", - []>, - Requires<[HasV4T]>; - -let mayStore = 1, neverHasSideEffects = 1 in -def STrib_GP_cNotPt_nv_V4 : NVInst_V4<(outs), - (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset, - IntRegs:$src2), - "if (!$src1) memb(##$global+$offset) = $src2.new", - []>, - Requires<[HasV4T]>; - -let mayStore = 1, neverHasSideEffects = 1 in -def STrib_GP_cdnPt_nv_V4 : NVInst_V4<(outs), - (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset, - IntRegs:$src2), - "if ($src1.new) memb(##$global+$offset) = $src2.new", - []>, - Requires<[HasV4T]>; - -let mayStore = 1, neverHasSideEffects = 1 in -def STrib_GP_cdnNotPt_nv_V4 : NVInst_V4<(outs), - (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset, - IntRegs:$src2), - "if (!$src1.new) memb(##$global+$offset) = $src2.new", - []>, - Requires<[HasV4T]>; - -let mayStore = 1, neverHasSideEffects = 1 in -def STrih_GP_cPt_nv_V4 : NVInst_V4<(outs), - (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset, - 
IntRegs:$src2), - "if ($src1) memh(##$global+$offset) = $src2.new", - []>, - Requires<[HasV4T]>; - -let mayStore = 1, neverHasSideEffects = 1 in -def STrih_GP_cNotPt_nv_V4 : NVInst_V4<(outs), - (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset, - IntRegs:$src2), - "if (!$src1) memh(##$global+$offset) = $src2.new", - []>, - Requires<[HasV4T]>; - -let mayStore = 1, neverHasSideEffects = 1 in -def STrih_GP_cdnPt_nv_V4 : NVInst_V4<(outs), - (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset, - IntRegs:$src2), - "if ($src1.new) memh(##$global+$offset) = $src2.new", - []>, - Requires<[HasV4T]>; - -let mayStore = 1, neverHasSideEffects = 1 in -def STrih_GP_cdnNotPt_nv_V4 : NVInst_V4<(outs), - (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset, - IntRegs:$src2), - "if (!$src1.new) memh(##$global+$offset) = $src2.new", - []>, - Requires<[HasV4T]>; - -let mayStore = 1, neverHasSideEffects = 1 in -def STriw_GP_cPt_nv_V4 : NVInst_V4<(outs), - (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset, - IntRegs:$src2), - "if ($src1) memw(##$global+$offset) = $src2.new", - []>, - Requires<[HasV4T]>; - -let mayStore = 1, neverHasSideEffects = 1 in -def STriw_GP_cNotPt_nv_V4 : NVInst_V4<(outs), - (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset, - IntRegs:$src2), - "if (!$src1) memw(##$global+$offset) = $src2.new", - []>, - Requires<[HasV4T]>; - -let mayStore = 1, neverHasSideEffects = 1 in -def STriw_GP_cdnPt_nv_V4 : NVInst_V4<(outs), - (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset, - IntRegs:$src2), - "if ($src1.new) memw(##$global+$offset) = $src2.new", - []>, - Requires<[HasV4T]>; - -let mayStore = 1, neverHasSideEffects = 1 in -def STriw_GP_cdnNotPt_nv_V4 : NVInst_V4<(outs), - (ins PredRegs:$src1, globaladdress:$global, u16Imm:$offset, - IntRegs:$src2), - "if (!$src1.new) memw(##$global+$offset) = $src2.new", - []>, - Requires<[HasV4T]>; +} //===----------------------------------------------------------------------===// // NV/ST - @@ -3061,31 +2089,37 @@ let isBranch = 1, isTerminator=1, neverHasSideEffects = 1, Defs = [PC] in { // Add and accumulate. // Rd=add(Rs,add(Ru,#s6)) +let isExtendable = 1, opExtendable = 3, isExtentSigned = 1, opExtentBits = 6, +validSubTargets = HasV4SubT in def ADDr_ADDri_V4 : MInst<(outs IntRegs:$dst), - (ins IntRegs:$src1, IntRegs:$src2, s6Imm:$src3), + (ins IntRegs:$src1, IntRegs:$src2, s6Ext:$src3), "$dst = add($src1, add($src2, #$src3))", [(set (i32 IntRegs:$dst), (add (i32 IntRegs:$src1), (add (i32 IntRegs:$src2), - s6ImmPred:$src3)))]>, + s6_16ExtPred:$src3)))]>, Requires<[HasV4T]>; // Rd=add(Rs,sub(#s6,Ru)) +let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 6, +validSubTargets = HasV4SubT in def ADDr_SUBri_V4 : MInst<(outs IntRegs:$dst), - (ins IntRegs:$src1, s6Imm:$src2, IntRegs:$src3), + (ins IntRegs:$src1, s6Ext:$src2, IntRegs:$src3), "$dst = add($src1, sub(#$src2, $src3))", [(set (i32 IntRegs:$dst), - (add (i32 IntRegs:$src1), (sub s6ImmPred:$src2, + (add (i32 IntRegs:$src1), (sub s6_10ExtPred:$src2, (i32 IntRegs:$src3))))]>, Requires<[HasV4T]>; // Generates the same instruction as ADDr_SUBri_V4 but matches different // pattern. 
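// (Both forms are needed because (add Rs, (sub #s6, Ru)) and
// (sub (add Rs, #s6), Ru) compute the same value but appear as different
// DAG shapes, so each shape needs its own selection pattern.)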
// Rd=add(Rs,sub(#s6,Ru)) +let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 6, +validSubTargets = HasV4SubT in def ADDri_SUBr_V4 : MInst<(outs IntRegs:$dst), - (ins IntRegs:$src1, s6Imm:$src2, IntRegs:$src3), + (ins IntRegs:$src1, s6Ext:$src2, IntRegs:$src3), "$dst = add($src1, sub(#$src2, $src3))", [(set (i32 IntRegs:$dst), - (sub (add (i32 IntRegs:$src1), s6ImmPred:$src2), + (sub (add (i32 IntRegs:$src1), s6_10ExtPred:$src2), (i32 IntRegs:$src3)))]>, Requires<[HasV4T]>; @@ -3099,6 +2133,7 @@ def ADDri_SUBr_V4 : MInst<(outs IntRegs:$dst), // Logical doublewords. // Rdd=and(Rtt,~Rss) +let validSubTargets = HasV4SubT in def ANDd_NOTd_V4 : MInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2), "$dst = and($src1, ~$src2)", @@ -3107,6 +2142,7 @@ def ANDd_NOTd_V4 : MInst<(outs DoubleRegs:$dst), Requires<[HasV4T]>; // Rdd=or(Rtt,~Rss) +let validSubTargets = HasV4SubT in def ORd_NOTd_V4 : MInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2), "$dst = or($src1, ~$src2)", @@ -3117,6 +2153,7 @@ def ORd_NOTd_V4 : MInst<(outs DoubleRegs:$dst), // Logical-logical doublewords. // Rxx^=xor(Rss,Rtt) +let validSubTargets = HasV4SubT in def XORd_XORdd: MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3), "$dst ^= xor($src2, $src3)", @@ -3129,17 +2166,20 @@ def XORd_XORdd: MInst_acc<(outs DoubleRegs:$dst), // Logical-logical words. // Rx=or(Ru,and(Rx,#s10)) +let isExtendable = 1, opExtendable = 3, isExtentSigned = 1, opExtentBits = 10, +validSubTargets = HasV4SubT in def ORr_ANDri_V4 : MInst_acc<(outs IntRegs:$dst), - (ins IntRegs:$src1, IntRegs: $src2, s10Imm:$src3), + (ins IntRegs:$src1, IntRegs: $src2, s10Ext:$src3), "$dst = or($src1, and($src2, #$src3))", [(set (i32 IntRegs:$dst), (or (i32 IntRegs:$src1), (and (i32 IntRegs:$src2), - s10ImmPred:$src3)))], + s10ExtPred:$src3)))], "$src2 = $dst">, Requires<[HasV4T]>; // Rx[&|^]=and(Rs,Rt) // Rx&=and(Rs,Rt) +let validSubTargets = HasV4SubT in def ANDr_ANDrr_V4 : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3), "$dst &= and($src2, $src3)", @@ -3150,6 +2190,7 @@ def ANDr_ANDrr_V4 : MInst_acc<(outs IntRegs:$dst), Requires<[HasV4T]>; // Rx|=and(Rs,Rt) +let validSubTargets = HasV4SubT, CextOpcode = "ORr_ANDr", InputType = "reg" in def ORr_ANDrr_V4 : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3), "$dst |= and($src2, $src3)", @@ -3157,9 +2198,10 @@ def ORr_ANDrr_V4 : MInst_acc<(outs IntRegs:$dst), (or (i32 IntRegs:$src1), (and (i32 IntRegs:$src2), (i32 IntRegs:$src3))))], "$src1 = $dst">, - Requires<[HasV4T]>; + Requires<[HasV4T]>, ImmRegRel; // Rx^=and(Rs,Rt) +let validSubTargets = HasV4SubT in def XORr_ANDrr_V4 : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3), "$dst ^= and($src2, $src3)", @@ -3171,6 +2213,7 @@ def XORr_ANDrr_V4 : MInst_acc<(outs IntRegs:$dst), // Rx[&|^]=and(Rs,~Rt) // Rx&=and(Rs,~Rt) +let validSubTargets = HasV4SubT in def ANDr_ANDr_NOTr_V4 : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3), "$dst &= and($src2, ~$src3)", @@ -3181,6 +2224,7 @@ def ANDr_ANDr_NOTr_V4 : MInst_acc<(outs IntRegs:$dst), Requires<[HasV4T]>; // Rx|=and(Rs,~Rt) +let validSubTargets = HasV4SubT in def ORr_ANDr_NOTr_V4 : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3), "$dst |= and($src2, ~$src3)", @@ -3191,6 +2235,7 @@ def ORr_ANDr_NOTr_V4 : MInst_acc<(outs IntRegs:$dst), Requires<[HasV4T]>; // 
Rx^=and(Rs,~Rt) +let validSubTargets = HasV4SubT in def XORr_ANDr_NOTr_V4 : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3), "$dst ^= and($src2, ~$src3)", @@ -3202,6 +2247,7 @@ def XORr_ANDr_NOTr_V4 : MInst_acc<(outs IntRegs:$dst), // Rx[&|^]=or(Rs,Rt) // Rx&=or(Rs,Rt) +let validSubTargets = HasV4SubT in def ANDr_ORrr_V4 : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3), "$dst &= or($src2, $src3)", @@ -3212,6 +2258,7 @@ def ANDr_ORrr_V4 : MInst_acc<(outs IntRegs:$dst), Requires<[HasV4T]>; // Rx|=or(Rs,Rt) +let validSubTargets = HasV4SubT, CextOpcode = "ORr_ORr", InputType = "reg" in def ORr_ORrr_V4 : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3), "$dst |= or($src2, $src3)", @@ -3219,9 +2266,10 @@ def ORr_ORrr_V4 : MInst_acc<(outs IntRegs:$dst), (or (i32 IntRegs:$src1), (or (i32 IntRegs:$src2), (i32 IntRegs:$src3))))], "$src1 = $dst">, - Requires<[HasV4T]>; + Requires<[HasV4T]>, ImmRegRel; // Rx^=or(Rs,Rt) +let validSubTargets = HasV4SubT in def XORr_ORrr_V4 : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3), "$dst ^= or($src2, $src3)", @@ -3233,6 +2281,7 @@ def XORr_ORrr_V4 : MInst_acc<(outs IntRegs:$dst), // Rx[&|^]=xor(Rs,Rt) // Rx&=xor(Rs,Rt) +let validSubTargets = HasV4SubT in def ANDr_XORrr_V4 : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3), "$dst &= xor($src2, $src3)", @@ -3243,6 +2292,7 @@ def ANDr_XORrr_V4 : MInst_acc<(outs IntRegs:$dst), Requires<[HasV4T]>; // Rx|=xor(Rs,Rt) +let validSubTargets = HasV4SubT in def ORr_XORrr_V4 : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3), "$dst |= xor($src2, $src3)", @@ -3253,6 +2303,7 @@ def ORr_XORrr_V4 : MInst_acc<(outs IntRegs:$dst), Requires<[HasV4T]>; // Rx^=xor(Rs,Rt) +let validSubTargets = HasV4SubT in def XORr_XORrr_V4 : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3), "$dst ^= xor($src2, $src3)", @@ -3263,24 +2314,28 @@ def XORr_XORrr_V4 : MInst_acc<(outs IntRegs:$dst), Requires<[HasV4T]>; // Rx|=and(Rs,#s10) +let isExtendable = 1, opExtendable = 3, isExtentSigned = 1, opExtentBits = 10, +validSubTargets = HasV4SubT, CextOpcode = "ORr_ANDr", InputType = "imm" in def ORr_ANDri2_V4 : MInst_acc<(outs IntRegs:$dst), - (ins IntRegs:$src1, IntRegs: $src2, s10Imm:$src3), + (ins IntRegs:$src1, IntRegs: $src2, s10Ext:$src3), "$dst |= and($src2, #$src3)", [(set (i32 IntRegs:$dst), (or (i32 IntRegs:$src1), (and (i32 IntRegs:$src2), - s10ImmPred:$src3)))], + s10ExtPred:$src3)))], "$src1 = $dst">, - Requires<[HasV4T]>; + Requires<[HasV4T]>, ImmRegRel; // Rx|=or(Rs,#s10) +let isExtendable = 1, opExtendable = 3, isExtentSigned = 1, opExtentBits = 10, +validSubTargets = HasV4SubT, CextOpcode = "ORr_ORr", InputType = "imm" in def ORr_ORri_V4 : MInst_acc<(outs IntRegs:$dst), - (ins IntRegs:$src1, IntRegs: $src2, s10Imm:$src3), + (ins IntRegs:$src1, IntRegs: $src2, s10Ext:$src3), "$dst |= or($src2, #$src3)", [(set (i32 IntRegs:$dst), (or (i32 IntRegs:$src1), (and (i32 IntRegs:$src2), - s10ImmPred:$src3)))], + s10ExtPred:$src3)))], "$src1 = $dst">, - Requires<[HasV4T]>; + Requires<[HasV4T]>, ImmRegRel; // Modulo wrap @@ -3327,25 +2382,41 @@ def ORr_ORri_V4 : MInst_acc<(outs IntRegs:$dst), // Multiply and use lower result.
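// Illustrative sketch (assumed C input, not taken from this patch): for
//   int f(int x) { return 7 + x * 3; }
// the patterns below let instruction selection emit the single compound
//   r0 = add(#7, mpyi(r0, #3))
// instead of a separate multiply followed by an add.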
// Rd=add(#u6,mpyi(Rs,#U6)) +let isExtendable = 1, opExtendable = 1, isExtentSigned = 0, opExtentBits = 6, +validSubTargets = HasV4SubT in def ADDi_MPYri_V4 : MInst<(outs IntRegs:$dst), - (ins u6Imm:$src1, IntRegs:$src2, u6Imm:$src3), + (ins u6Ext:$src1, IntRegs:$src2, u6Imm:$src3), "$dst = add(#$src1, mpyi($src2, #$src3))", [(set (i32 IntRegs:$dst), (add (mul (i32 IntRegs:$src2), u6ImmPred:$src3), - u6ImmPred:$src1))]>, + u6ExtPred:$src1))]>, Requires<[HasV4T]>; -// Rd=add(#u6,mpyi(Rs,Rt)) +// Rd=add(##,mpyi(Rs,#U6)) +def : Pat <(add (mul (i32 IntRegs:$src2), u6ImmPred:$src3), + (HexagonCONST32 tglobaladdr:$src1)), + (i32 (ADDi_MPYri_V4 tglobaladdr:$src1, IntRegs:$src2, + u6ImmPred:$src3))>; +// Rd=add(#u6,mpyi(Rs,Rt)) +let isExtendable = 1, opExtendable = 1, isExtentSigned = 0, opExtentBits = 6, +validSubTargets = HasV4SubT, InputType = "imm", CextOpcode = "ADD_MPY" in def ADDi_MPYrr_V4 : MInst<(outs IntRegs:$dst), - (ins u6Imm:$src1, IntRegs:$src2, IntRegs:$src3), + (ins u6Ext:$src1, IntRegs:$src2, IntRegs:$src3), "$dst = add(#$src1, mpyi($src2, $src3))", [(set (i32 IntRegs:$dst), (add (mul (i32 IntRegs:$src2), (i32 IntRegs:$src3)), - u6ImmPred:$src1))]>, - Requires<[HasV4T]>; + u6ExtPred:$src1))]>, + Requires<[HasV4T]>, ImmRegRel; + +// Rd=add(##,mpyi(Rs,Rt)) +def : Pat <(add (mul (i32 IntRegs:$src2), (i32 IntRegs:$src3)), + (HexagonCONST32 tglobaladdr:$src1)), + (i32 (ADDi_MPYrr_V4 tglobaladdr:$src1, IntRegs:$src2, + IntRegs:$src3))>; // Rd=add(Ru,mpyi(#u6:2,Rs)) +let validSubTargets = HasV4SubT in def ADDr_MPYir_V4 : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, u6Imm:$src2, IntRegs:$src3), "$dst = add($src1, mpyi(#$src2, $src3))", @@ -3355,15 +2426,18 @@ def ADDr_MPYir_V4 : MInst<(outs IntRegs:$dst), Requires<[HasV4T]>; // Rd=add(Ru,mpyi(Rs,#u6)) +let isExtendable = 1, opExtendable = 3, isExtentSigned = 0, opExtentBits = 6, +validSubTargets = HasV4SubT, InputType = "imm", CextOpcode = "ADD_MPY" in def ADDr_MPYri_V4 : MInst<(outs IntRegs:$dst), - (ins IntRegs:$src1, IntRegs:$src2, u6Imm:$src3), + (ins IntRegs:$src1, IntRegs:$src2, u6Ext:$src3), "$dst = add($src1, mpyi($src2, #$src3))", [(set (i32 IntRegs:$dst), (add (i32 IntRegs:$src1), (mul (i32 IntRegs:$src2), - u6ImmPred:$src3)))]>, - Requires<[HasV4T]>; + u6ExtPred:$src3)))]>, + Requires<[HasV4T]>, ImmRegRel; // Rx=add(Ru,mpyi(Rx,Rs)) +let validSubTargets = HasV4SubT, InputType = "reg", CextOpcode = "ADD_MPY" in def ADDr_MPYrr_V4 : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2, IntRegs:$src3), "$dst = add($src1, mpyi($src2, $src3))", @@ -3371,7 +2445,7 @@ def ADDr_MPYrr_V4 : MInst_acc<(outs IntRegs:$dst), (add (i32 IntRegs:$src1), (mul (i32 IntRegs:$src2), (i32 IntRegs:$src3))))], "$src2 = $dst">, - Requires<[HasV4T]>; + Requires<[HasV4T]>, ImmRegRel; // Polynomial multiply words @@ -3414,92 +2488,107 @@ def ADDr_MPYrr_V4 : MInst_acc<(outs IntRegs:$dst), // Shift by immediate and accumulate. 
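// Illustrative sketch (assumed C input): for
//   unsigned g(unsigned x) { return 10 + (x << 2); }
// the ADDi_ASLri_V4 pattern below folds the shift into the add, giving
//   r0 = add(#10, asl(r0, #2))
// Note the shifted register is tied to the destination ("$src2 = $dst"),
// so these forms update the accumulating register in place.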
// Rx=add(#u8,asl(Rx,#U5)) +let isExtendable = 1, opExtendable = 1, isExtentSigned = 0, opExtentBits = 8, +validSubTargets = HasV4SubT in def ADDi_ASLri_V4 : MInst_acc<(outs IntRegs:$dst), - (ins u8Imm:$src1, IntRegs:$src2, u5Imm:$src3), + (ins u8Ext:$src1, IntRegs:$src2, u5Imm:$src3), "$dst = add(#$src1, asl($src2, #$src3))", [(set (i32 IntRegs:$dst), (add (shl (i32 IntRegs:$src2), u5ImmPred:$src3), - u8ImmPred:$src1))], + u8ExtPred:$src1))], "$src2 = $dst">, Requires<[HasV4T]>; // Rx=add(#u8,lsr(Rx,#U5)) +let isExtendable = 1, opExtendable = 1, isExtentSigned = 0, opExtentBits = 8, +validSubTargets = HasV4SubT in def ADDi_LSRri_V4 : MInst_acc<(outs IntRegs:$dst), - (ins u8Imm:$src1, IntRegs:$src2, u5Imm:$src3), + (ins u8Ext:$src1, IntRegs:$src2, u5Imm:$src3), "$dst = add(#$src1, lsr($src2, #$src3))", [(set (i32 IntRegs:$dst), (add (srl (i32 IntRegs:$src2), u5ImmPred:$src3), - u8ImmPred:$src1))], + u8ExtPred:$src1))], "$src2 = $dst">, Requires<[HasV4T]>; // Rx=sub(#u8,asl(Rx,#U5)) +let isExtendable = 1, opExtendable = 1, isExtentSigned = 0, opExtentBits = 8, +validSubTargets = HasV4SubT in def SUBi_ASLri_V4 : MInst_acc<(outs IntRegs:$dst), - (ins u8Imm:$src1, IntRegs:$src2, u5Imm:$src3), + (ins u8Ext:$src1, IntRegs:$src2, u5Imm:$src3), "$dst = sub(#$src1, asl($src2, #$src3))", [(set (i32 IntRegs:$dst), (sub (shl (i32 IntRegs:$src2), u5ImmPred:$src3), - u8ImmPred:$src1))], + u8ExtPred:$src1))], "$src2 = $dst">, Requires<[HasV4T]>; // Rx=sub(#u8,lsr(Rx,#U5)) +let isExtendable = 1, opExtendable = 1, isExtentSigned = 0, opExtentBits = 8, +validSubTargets = HasV4SubT in def SUBi_LSRri_V4 : MInst_acc<(outs IntRegs:$dst), - (ins u8Imm:$src1, IntRegs:$src2, u5Imm:$src3), + (ins u8Ext:$src1, IntRegs:$src2, u5Imm:$src3), "$dst = sub(#$src1, lsr($src2, #$src3))", [(set (i32 IntRegs:$dst), (sub (srl (i32 IntRegs:$src2), u5ImmPred:$src3), - u8ImmPred:$src1))], + u8ExtPred:$src1))], "$src2 = $dst">, Requires<[HasV4T]>; //Shift by immediate and logical. 
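// Illustrative sketch (assumed C input): a mask applied after a shift,
//   r = (x >> 3) & 0x1f;
// can be selected as the single compound instruction
//   r0 = and(#31, lsr(r0, #3))
// via the ANDi_LSRri_V4 pattern below, again with the $src2 = $dst tie.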
//Rx=and(#u8,asl(Rx,#U5)) +let isExtendable = 1, opExtendable = 1, isExtentSigned = 0, opExtentBits = 8, +validSubTargets = HasV4SubT in def ANDi_ASLri_V4 : MInst_acc<(outs IntRegs:$dst), - (ins u8Imm:$src1, IntRegs:$src2, u5Imm:$src3), + (ins u8Ext:$src1, IntRegs:$src2, u5Imm:$src3), "$dst = and(#$src1, asl($src2, #$src3))", [(set (i32 IntRegs:$dst), (and (shl (i32 IntRegs:$src2), u5ImmPred:$src3), - u8ImmPred:$src1))], + u8ExtPred:$src1))], "$src2 = $dst">, Requires<[HasV4T]>; //Rx=and(#u8,lsr(Rx,#U5)) +let isExtendable = 1, opExtendable = 1, isExtentSigned = 0, opExtentBits = 8, +validSubTargets = HasV4SubT in def ANDi_LSRri_V4 : MInst_acc<(outs IntRegs:$dst), - (ins u8Imm:$src1, IntRegs:$src2, u5Imm:$src3), + (ins u8Ext:$src1, IntRegs:$src2, u5Imm:$src3), "$dst = and(#$src1, lsr($src2, #$src3))", [(set (i32 IntRegs:$dst), (and (srl (i32 IntRegs:$src2), u5ImmPred:$src3), - u8ImmPred:$src1))], + u8ExtPred:$src1))], "$src2 = $dst">, Requires<[HasV4T]>; //Rx=or(#u8,asl(Rx,#U5)) -let AddedComplexity = 30 in +let isExtendable = 1, opExtendable = 1, isExtentSigned = 0, opExtentBits = 8, +AddedComplexity = 30, validSubTargets = HasV4SubT in def ORi_ASLri_V4 : MInst_acc<(outs IntRegs:$dst), - (ins u8Imm:$src1, IntRegs:$src2, u5Imm:$src3), + (ins u8Ext:$src1, IntRegs:$src2, u5Imm:$src3), "$dst = or(#$src1, asl($src2, #$src3))", [(set (i32 IntRegs:$dst), (or (shl (i32 IntRegs:$src2), u5ImmPred:$src3), - u8ImmPred:$src1))], + u8ExtPred:$src1))], "$src2 = $dst">, Requires<[HasV4T]>; //Rx=or(#u8,lsr(Rx,#U5)) -let AddedComplexity = 30 in +let isExtendable = 1, opExtendable = 1, isExtentSigned = 0, opExtentBits = 8, +AddedComplexity = 30, validSubTargets = HasV4SubT in def ORi_LSRri_V4 : MInst_acc<(outs IntRegs:$dst), - (ins u8Imm:$src1, IntRegs:$src2, u5Imm:$src3), + (ins u8Ext:$src1, IntRegs:$src2, u5Imm:$src3), "$dst = or(#$src1, lsr($src2, #$src3))", [(set (i32 IntRegs:$dst), (or (srl (i32 IntRegs:$src2), u5ImmPred:$src3), - u8ImmPred:$src1))], + u8ExtPred:$src1))], "$src2 = $dst">, Requires<[HasV4T]>; //Shift by register. //Rd=lsl(#s6,Rt) +let validSubTargets = HasV4SubT in { def LSLi_V4 : MInst<(outs IntRegs:$dst), (ins s6Imm:$src1, IntRegs:$src2), "$dst = lsl(#$src1, $src2)", [(set (i32 IntRegs:$dst), (shl s6ImmPred:$src1, @@ -3547,7 +2636,7 @@ def LSRd_rr_xor_V4 : MInst_acc<(outs DoubleRegs:$dst), (i32 IntRegs:$src3))))], "$src1 = $dst">, Requires<[HasV4T]>; - +} //===----------------------------------------------------------------------===// // XTYPE/SHIFT - @@ -3981,7 +3070,61 @@ def MEMb_ORr_MEM_V4 : MEMInst_V4<(outs), // incorrect code for negative numbers. 
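// Illustrative sketch for the byte compares that follow (assumed C input):
//   int is_nl(unsigned char c) { return c == '\n'; }
// The and-with-255 in these patterns models the unsigned-byte operand, so
// the test can be selected as
//   p0 = cmpb.eq(r0, #10)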
// Pd=cmpb.eq(Rs,#u8) -let isCompare = 1 in +// p=!cmp.eq(r1,r2) +let isCompare = 1, validSubTargets = HasV4SubT in +def CMPnotEQ_rr : ALU32_rr<(outs PredRegs:$dst), + (ins IntRegs:$src1, IntRegs:$src2), + "$dst = !cmp.eq($src1, $src2)", + [(set (i1 PredRegs:$dst), + (setne (i32 IntRegs:$src1), (i32 IntRegs:$src2)))]>, + Requires<[HasV4T]>; + +// p=!cmp.eq(r1,#s10) +let isCompare = 1, validSubTargets = HasV4SubT in +def CMPnotEQ_ri : ALU32_ri<(outs PredRegs:$dst), + (ins IntRegs:$src1, s10Ext:$src2), + "$dst = !cmp.eq($src1, #$src2)", + [(set (i1 PredRegs:$dst), + (setne (i32 IntRegs:$src1), s10ImmPred:$src2))]>, + Requires<[HasV4T]>; + +// p=!cmp.gt(r1,r2) +let isCompare = 1, validSubTargets = HasV4SubT in +def CMPnotGT_rr : ALU32_rr<(outs PredRegs:$dst), + (ins IntRegs:$src1, IntRegs:$src2), + "$dst = !cmp.gt($src1, $src2)", + [(set (i1 PredRegs:$dst), + (not (setgt (i32 IntRegs:$src1), (i32 IntRegs:$src2))))]>, + Requires<[HasV4T]>; + +// p=!cmp.gt(r1,#s10) +let isCompare = 1, validSubTargets = HasV4SubT in +def CMPnotGT_ri : ALU32_ri<(outs PredRegs:$dst), + (ins IntRegs:$src1, s10Ext:$src2), + "$dst = !cmp.gt($src1, #$src2)", + [(set (i1 PredRegs:$dst), + (not (setgt (i32 IntRegs:$src1), s10ImmPred:$src2)))]>, + Requires<[HasV4T]>; + +// p=!cmp.gtu(r1,r2) +let isCompare = 1, validSubTargets = HasV4SubT in +def CMPnotGTU_rr : ALU32_rr<(outs PredRegs:$dst), + (ins IntRegs:$src1, IntRegs:$src2), + "$dst = !cmp.gtu($src1, $src2)", + [(set (i1 PredRegs:$dst), + (not (setugt (i32 IntRegs:$src1), (i32 IntRegs:$src2))))]>, + Requires<[HasV4T]>; + +// p=!cmp.gtu(r1,#u9) +let isCompare = 1, validSubTargets = HasV4SubT in +def CMPnotGTU_ri : ALU32_ri<(outs PredRegs:$dst), + (ins IntRegs:$src1, u9Ext:$src2), + "$dst = !cmp.gtu($src1, #$src2)", + [(set (i1 PredRegs:$dst), + (not (setugt (i32 IntRegs:$src1), u9ImmPred:$src2)))]>, + Requires<[HasV4T]>; + +let isCompare = 1, validSubTargets = HasV4SubT in def CMPbEQri_V4 : MInst<(outs PredRegs:$dst), (ins IntRegs:$src1, u8Imm:$src2), "$dst = cmpb.eq($src1, #$src2)", @@ -3989,8 +3132,14 @@ def CMPbEQri_V4 : MInst<(outs PredRegs:$dst), (seteq (and (i32 IntRegs:$src1), 255), u8ImmPred:$src2))]>, Requires<[HasV4T]>; +def : Pat <(brcond (i1 (setne (and (i32 IntRegs:$src1), 255), u8ImmPred:$src2)), + bb:$offset), + (JMP_cNot (CMPbEQri_V4 (i32 IntRegs:$src1), u8ImmPred:$src2), + bb:$offset)>, + Requires<[HasV4T]>; + // Pd=cmpb.eq(Rs,Rt) -let isCompare = 1 in +let isCompare = 1, validSubTargets = HasV4SubT in def CMPbEQrr_ubub_V4 : MInst<(outs PredRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), "$dst = cmpb.eq($src1, $src2)", @@ -4000,7 +3149,7 @@ def CMPbEQrr_ubub_V4 : MInst<(outs PredRegs:$dst), Requires<[HasV4T]>; // Pd=cmpb.eq(Rs,Rt) -let isCompare = 1 in +let isCompare = 1, validSubTargets = HasV4SubT in def CMPbEQrr_sbsb_V4 : MInst<(outs PredRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), "$dst = cmpb.eq($src1, $src2)", @@ -4010,7 +3159,7 @@ def CMPbEQrr_sbsb_V4 : MInst<(outs PredRegs:$dst), Requires<[HasV4T]>; // Pd=cmpb.gt(Rs,Rt) -let isCompare = 1 in +let isCompare = 1, validSubTargets = HasV4SubT in def CMPbGTrr_V4 : MInst<(outs PredRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), "$dst = cmpb.gt($src1, $src2)", @@ -4020,29 +3169,237 @@ def CMPbGTrr_V4 : MInst<(outs PredRegs:$dst), Requires<[HasV4T]>; // Pd=cmpb.gtu(Rs,#u7) -let isCompare = 1 in +let isExtendable = 1, opExtendable = 2, isExtentSigned = 0, opExtentBits = 7, +isCompare = 1, validSubTargets = HasV4SubT, CextOpcode = "CMPbGTU", InputType = "imm" in def CMPbGTUri_V4 : MInst<(outs 
PredRegs:$dst), - (ins IntRegs:$src1, u7Imm:$src2), + (ins IntRegs:$src1, u7Ext:$src2), "$dst = cmpb.gtu($src1, #$src2)", [(set (i1 PredRegs:$dst), (setugt (and (i32 IntRegs:$src1), 255), - u7ImmPred:$src2))]>, - Requires<[HasV4T]>; + u7ExtPred:$src2))]>, + Requires<[HasV4T]>, ImmRegRel; + +// SDNode for converting immediate C to C-1. +def DEC_CONST_BYTE : SDNodeXForm<imm, [{ + // Return the byte immediate const-1 as an SDNode. + int32_t imm = N->getSExtValue(); + return XformU7ToU7M1Imm(imm); +}]>; + +// For the sequence +// zext( seteq ( and(Rs, 255), u8)) +// Generate +// Pd=cmpb.eq(Rs, #u8) +// if (Pd.new) Rd=#1 +// if (!Pd.new) Rd=#0 +def : Pat <(i32 (zext (i1 (seteq (i32 (and (i32 IntRegs:$Rs), 255)), + u8ExtPred:$u8)))), + (i32 (TFR_condset_ii (i1 (CMPbEQri_V4 (i32 IntRegs:$Rs), + (u8ExtPred:$u8))), + 1, 0))>, + Requires<[HasV4T]>; + +// For the sequence +// zext( setne ( and(Rs, 255), u8)) +// Generate +// Pd=cmpb.eq(Rs, #u8) +// if (Pd.new) Rd=#0 +// if (!Pd.new) Rd=#1 +def : Pat <(i32 (zext (i1 (setne (i32 (and (i32 IntRegs:$Rs), 255)), + u8ExtPred:$u8)))), + (i32 (TFR_condset_ii (i1 (CMPbEQri_V4 (i32 IntRegs:$Rs), + (u8ExtPred:$u8))), + 0, 1))>, + Requires<[HasV4T]>; + +// For the sequence +// zext( seteq (Rs, and(Rt, 255))) +// Generate +// Pd=cmpb.eq(Rs, Rt) +// if (Pd.new) Rd=#1 +// if (!Pd.new) Rd=#0 +def : Pat <(i32 (zext (i1 (seteq (i32 IntRegs:$Rt), + (i32 (and (i32 IntRegs:$Rs), 255)))))), + (i32 (TFR_condset_ii (i1 (CMPbEQrr_ubub_V4 (i32 IntRegs:$Rs), + (i32 IntRegs:$Rt))), + 1, 0))>, + Requires<[HasV4T]>; + +// For the sequence +// zext( setne (Rs, and(Rt, 255))) +// Generate +// Pd=cmpb.eq(Rs, Rt) +// if (Pd.new) Rd=#0 +// if (!Pd.new) Rd=#1 +def : Pat <(i32 (zext (i1 (setne (i32 IntRegs:$Rt), + (i32 (and (i32 IntRegs:$Rs), 255)))))), + (i32 (TFR_condset_ii (i1 (CMPbEQrr_ubub_V4 (i32 IntRegs:$Rs), + (i32 IntRegs:$Rt))), + 0, 1))>, + Requires<[HasV4T]>; + +// For the sequence +// zext( setugt ( and(Rs, 255), u8)) +// Generate +// Pd=cmpb.gtu(Rs, #u8) +// if (Pd.new) Rd=#1 +// if (!Pd.new) Rd=#0 +def : Pat <(i32 (zext (i1 (setugt (i32 (and (i32 IntRegs:$Rs), 255)), + u8ExtPred:$u8)))), + (i32 (TFR_condset_ii (i1 (CMPbGTUri_V4 (i32 IntRegs:$Rs), + (u8ExtPred:$u8))), + 1, 0))>, + Requires<[HasV4T]>; + +// For the sequence +// zext( setugt ( and(Rs, 254), u8)) +// Generate +// Pd=cmpb.gtu(Rs, #u8) +// if (Pd.new) Rd=#1 +// if (!Pd.new) Rd=#0 +def : Pat <(i32 (zext (i1 (setugt (i32 (and (i32 IntRegs:$Rs), 254)), + u8ExtPred:$u8)))), + (i32 (TFR_condset_ii (i1 (CMPbGTUri_V4 (i32 IntRegs:$Rs), + (u8ExtPred:$u8))), + 1, 0))>, + Requires<[HasV4T]>; + +// For the sequence +// zext( setult ( Rs, Rt)) +// Generate +// Pd=cmp.ltu(Rs, Rt) +// if (Pd.new) Rd=#1 +// if (!Pd.new) Rd=#0 +// cmp.ltu(Rs, Rt) -> cmp.gtu(Rt, Rs) +def : Pat <(i32 (zext (i1 (setult (i32 IntRegs:$Rs), (i32 IntRegs:$Rt))))), + (i32 (TFR_condset_ii (i1 (CMPGTUrr (i32 IntRegs:$Rt), + (i32 IntRegs:$Rs))), + 1, 0))>, + Requires<[HasV4T]>; + +// For the sequence +// zext( setlt ( Rs, Rt)) +// Generate +// Pd=cmp.lt(Rs, Rt) +// if (Pd.new) Rd=#1 +// if (!Pd.new) Rd=#0 +// cmp.lt(Rs, Rt) -> cmp.gt(Rt, Rs) +def : Pat <(i32 (zext (i1 (setlt (i32 IntRegs:$Rs), (i32 IntRegs:$Rt))))), + (i32 (TFR_condset_ii (i1 (CMPGTrr (i32 IntRegs:$Rt), + (i32 IntRegs:$Rs))), + 1, 0))>, + Requires<[HasV4T]>; + +// For the sequence +// zext( setugt ( Rs, Rt)) +// Generate +// Pd=cmp.gtu(Rs, Rt) +// if (Pd.new) Rd=#1 +// if (!Pd.new) Rd=#0 +def : Pat <(i32 (zext (i1 (setugt (i32 IntRegs:$Rs), (i32 IntRegs:$Rt))))), + (i32 (TFR_condset_ii 
(i1 (CMPGTUrr (i32 IntRegs:$Rs), + (i32 IntRegs:$Rt))), + 1, 0))>, + Requires<[HasV4T]>; + +// This pattern interferes with CoreMark performance, so it is not implemented +// at this time. +// For the sequence +// zext( setgt ( Rs, Rt)) +// Generate +// Pd=cmp.gt(Rs, Rt) +// if (Pd.new) Rd=#1 +// if (!Pd.new) Rd=#0 + +// For the sequence +// zext( setuge ( Rs, Rt)) +// Generate +// Pd=cmp.ltu(Rs, Rt) +// if (Pd.new) Rd=#0 +// if (!Pd.new) Rd=#1 +// cmp.ltu(Rs, Rt) -> cmp.gtu(Rt, Rs) +def : Pat <(i32 (zext (i1 (setuge (i32 IntRegs:$Rs), (i32 IntRegs:$Rt))))), + (i32 (TFR_condset_ii (i1 (CMPGTUrr (i32 IntRegs:$Rt), + (i32 IntRegs:$Rs))), + 0, 1))>, + Requires<[HasV4T]>; + +// For the sequence +// zext( setge ( Rs, Rt)) +// Generate +// Pd=cmp.lt(Rs, Rt) +// if (Pd.new) Rd=#0 +// if (!Pd.new) Rd=#1 +// cmp.lt(Rs, Rt) -> cmp.gt(Rt, Rs) +def : Pat <(i32 (zext (i1 (setge (i32 IntRegs:$Rs), (i32 IntRegs:$Rt))))), + (i32 (TFR_condset_ii (i1 (CMPGTrr (i32 IntRegs:$Rt), + (i32 IntRegs:$Rs))), + 0, 1))>, + Requires<[HasV4T]>; + +// For the sequence +// zext( setule ( Rs, Rt)) +// Generate +// Pd=cmp.gtu(Rs, Rt) +// if (Pd.new) Rd=#0 +// if (!Pd.new) Rd=#1 +def : Pat <(i32 (zext (i1 (setule (i32 IntRegs:$Rs), (i32 IntRegs:$Rt))))), + (i32 (TFR_condset_ii (i1 (CMPGTUrr (i32 IntRegs:$Rs), + (i32 IntRegs:$Rt))), + 0, 1))>, + Requires<[HasV4T]>; + +// For the sequence +// zext( setle ( Rs, Rt)) +// Generate +// Pd=cmp.gt(Rs, Rt) +// if (Pd.new) Rd=#0 +// if (!Pd.new) Rd=#1 +def : Pat <(i32 (zext (i1 (setle (i32 IntRegs:$Rs), (i32 IntRegs:$Rt))))), + (i32 (TFR_condset_ii (i1 (CMPGTrr (i32 IntRegs:$Rs), + (i32 IntRegs:$Rt))), + 0, 1))>, + Requires<[HasV4T]>; + +// For the sequence +// zext( setult ( and(Rs, 255), u8)) +// Use the isdigit transformation below + +// Generate code of the form 'mux_ii(cmpbgtu(Rdd, C-1),0,1)' +// for C code of the form r = ((c>='0') & (c<='9')) ? 1 : 0;. +// The isdigit transformation relies on two 'clever' aspects: +// 1) The data type is unsigned which allows us to eliminate a zero test after +// biasing the expression by 48. We are depending on the representation +// and semantics of the unsigned types. +// 2) The front end has converted <= 9 into < 10 on entry to LLVM +// +// For the C code: +// retval = ((c>='0') & (c<='9')) ? 1 : 0; +// The code is transformed upstream of llvm into +// retval = (c-48) < 10 ? 1 : 0; +let AddedComplexity = 139 in +def : Pat <(i32 (zext (i1 (setult (i32 (and (i32 IntRegs:$src1), 255)), + u7StrictPosImmPred:$src2)))), + (i32 (MUX_ii (i1 (CMPbGTUri_V4 (i32 IntRegs:$src1), + (DEC_CONST_BYTE u7StrictPosImmPred:$src2))), + 0, 1))>, + Requires<[HasV4T]>; // Pd=cmpb.gtu(Rs,Rt) -let isCompare = 1 in +let isCompare = 1, validSubTargets = HasV4SubT, CextOpcode = "CMPbGTU", +InputType = "reg" in def CMPbGTUrr_V4 : MInst<(outs PredRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), "$dst = cmpb.gtu($src1, $src2)", [(set (i1 PredRegs:$dst), (setugt (and (i32 IntRegs:$src1), 255), (and (i32 IntRegs:$src2), 255)))]>, - Requires<[HasV4T]>; + Requires<[HasV4T]>, ImmRegRel; // Following instruction is not being extended as it results in incorrect // code for negative numbers. // Signed half compare(.eq) ri.
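// Illustrative sketch (assumed C input): comparing a signed short against
// a small constant,
//   int e(short a) { return a == 3; }
// is matched through the halfword compare forms below, selecting
//   p0 = cmph.eq(r0, #3)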
// Pd=cmph.eq(Rs,#s8) -let isCompare = 1 in +let isCompare = 1, validSubTargets = HasV4SubT in def CMPhEQri_V4 : MInst<(outs PredRegs:$dst), (ins IntRegs:$src1, s8Imm:$src2), "$dst = cmph.eq($src1, #$src2)", @@ -4056,7 +3413,7 @@ def CMPhEQri_V4 : MInst<(outs PredRegs:$dst), // r0=and(r0,#0xffff) // p0=cmp.eq(r0,#0) // Pd=cmph.eq(Rs,Rt) -let isCompare = 1 in +let isCompare = 1, validSubTargets = HasV4SubT in def CMPhEQrr_xor_V4 : MInst<(outs PredRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), "$dst = cmph.eq($src1, $src2)", @@ -4071,7 +3428,7 @@ def CMPhEQrr_xor_V4 : MInst<(outs PredRegs:$dst), // r1=asl(r1,16) // p0=cmp.eq(r0,r1) // Pd=cmph.eq(Rs,Rt) -let isCompare = 1 in +let isCompare = 1, validSubTargets = HasV4SubT in def CMPhEQrr_shl_V4 : MInst<(outs PredRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), "$dst = cmph.eq($src1, $src2)", @@ -4085,19 +3442,20 @@ used in the cmph.gt instruction. // Signed half compare(.gt) ri. // Pd=cmph.gt(Rs,#s8) -let isCompare = 1 in +let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 8, +isCompare = 1, validSubTargets = HasV4SubT in def CMPhGTri_V4 : MInst<(outs PredRegs:$dst), - (ins IntRegs:$src1, s8Imm:$src2), + (ins IntRegs:$src1, s8Ext:$src2), "$dst = cmph.gt($src1, #$src2)", [(set (i1 PredRegs:$dst), (setgt (shl (i32 IntRegs:$src1), (i32 16)), - s8ImmPred:$src2))]>, + s8ExtPred:$src2))]>, Requires<[HasV4T]>; */ // Signed half compare(.gt) rr. // Pd=cmph.gt(Rs,Rt) -let isCompare = 1 in +let isCompare = 1, validSubTargets = HasV4SubT in def CMPhGTrr_shl_V4 : MInst<(outs PredRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), "$dst = cmph.gt($src1, $src2)", @@ -4108,24 +3466,41 @@ def CMPhGTrr_shl_V4 : MInst<(outs PredRegs:$dst), // Unsigned half compare rr (.gtu). // Pd=cmph.gtu(Rs,Rt) -let isCompare = 1 in +let isCompare = 1, validSubTargets = HasV4SubT, CextOpcode = "CMPhGTU", +InputType = "reg" in def CMPhGTUrr_V4 : MInst<(outs PredRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), "$dst = cmph.gtu($src1, $src2)", [(set (i1 PredRegs:$dst), (setugt (and (i32 IntRegs:$src1), 65535), (and (i32 IntRegs:$src2), 65535)))]>, - Requires<[HasV4T]>; + Requires<[HasV4T]>, ImmRegRel; // Unsigned half compare ri (.gtu). 
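// Illustrative sketch (assumed C input): an unsigned halfword threshold,
//   int h(unsigned short a) { return a > 100; }
// The and-with-65535 in the pattern below models the halfword operand, so
// the test selects
//   p0 = cmph.gtu(r0, #100)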
// Pd=cmph.gtu(Rs,#u7) -let isCompare = 1 in +let isExtendable = 1, opExtendable = 2, isExtentSigned = 0, opExtentBits = 7, +isCompare = 1, validSubTargets = HasV4SubT, CextOpcode = "CMPhGTU", +InputType = "imm" in def CMPhGTUri_V4 : MInst<(outs PredRegs:$dst), - (ins IntRegs:$src1, u7Imm:$src2), + (ins IntRegs:$src1, u7Ext:$src2), "$dst = cmph.gtu($src1, #$src2)", [(set (i1 PredRegs:$dst), (setugt (and (i32 IntRegs:$src1), 65535), - u7ImmPred:$src2))]>, - Requires<[HasV4T]>; + u7ExtPred:$src2))]>, + Requires<[HasV4T]>, ImmRegRel; + +let validSubTargets = HasV4SubT in +def NTSTBIT_rr : SInst<(outs PredRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2), + "$dst = !tstbit($src1, $src2)", + [(set (i1 PredRegs:$dst), + (seteq (and (shl 1, (i32 IntRegs:$src2)), (i32 IntRegs:$src1)), 0))]>, + Requires<[HasV4T]>; + +let validSubTargets = HasV4SubT in +def NTSTBIT_ri : SInst<(outs PredRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2), + "$dst = !tstbit($src1, $src2)", + [(set (i1 PredRegs:$dst), + (seteq (and (shl 1, u5ImmPred:$src2), (i32 IntRegs:$src1)), 0))]>, + Requires<[HasV4T]>; //===----------------------------------------------------------------------===// // XTYPE/PRED - @@ -4237,227 +3612,156 @@ let isReturn = 1, isTerminator = 1, Requires<[HasV4T]>; } - // Load/Store with absolute addressing mode // memw(#u6)=Rt -multiclass ST_abs<string OpcStr> { - let isPredicable = 1 in - def _abs_V4 : STInst2<(outs), - (ins globaladdress:$absaddr, IntRegs:$src), - !strconcat(OpcStr, "(##$absaddr) = $src"), - []>, - Requires<[HasV4T]>; - - let isPredicated = 1 in - def _abs_cPt_V4 : STInst2<(outs), - (ins PredRegs:$src1, globaladdress:$absaddr, IntRegs:$src2), - !strconcat("if ($src1)", - !strconcat(OpcStr, "(##$absaddr) = $src2")), - []>, - Requires<[HasV4T]>; - - let isPredicated = 1 in - def _abs_cNotPt_V4 : STInst2<(outs), - (ins PredRegs:$src1, globaladdress:$absaddr, IntRegs:$src2), - !strconcat("if (!$src1)", - !strconcat(OpcStr, "(##$absaddr) = $src2")), +multiclass ST_Abs_Predbase<string mnemonic, RegisterClass RC, bit isNot, + bit isPredNew> { + let PNewValue = !if(isPredNew, "new", "") in + def NAME#_V4 : STInst2<(outs), + (ins PredRegs:$src1, globaladdressExt:$absaddr, RC: $src2), + !if(isNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ", + ") ")#mnemonic#"(##$absaddr) = $src2", []>, Requires<[HasV4T]>; +} - let isPredicated = 1 in - def _abs_cdnPt_V4 : STInst2<(outs), - (ins PredRegs:$src1, globaladdress:$absaddr, IntRegs:$src2), - !strconcat("if ($src1.new)", - !strconcat(OpcStr, "(##$absaddr) = $src2")), - []>, - Requires<[HasV4T]>; +multiclass ST_Abs_Pred<string mnemonic, RegisterClass RC, bit PredNot> { + let PredSense = !if(PredNot, "false", "true") in { + defm _c#NAME : ST_Abs_Predbase<mnemonic, RC, PredNot, 0>; + // Predicate new + defm _cdn#NAME : ST_Abs_Predbase<mnemonic, RC, PredNot, 1>; + } +} - let isPredicated = 1 in - def _abs_cdnNotPt_V4 : STInst2<(outs), - (ins PredRegs:$src1, globaladdress:$absaddr, IntRegs:$src2), - !strconcat("if (!$src1.new)", - !strconcat(OpcStr, "(##$absaddr) = $src2")), +let isNVStorable = 1, isExtended = 1, neverHasSideEffects = 1 in +multiclass ST_Abs<string mnemonic, string CextOp, RegisterClass RC> { + let CextOpcode = CextOp, BaseOpcode = CextOp#_abs in { + let opExtendable = 0, isPredicable = 1 in + def NAME#_V4 : STInst2<(outs), + (ins globaladdressExt:$absaddr, RC:$src), + mnemonic#"(##$absaddr) = $src", []>, Requires<[HasV4T]>; - def _abs_nv_V4 : STInst2<(outs), - (ins globaladdress:$absaddr, IntRegs:$src), - !strconcat(OpcStr, "(##$absaddr) 
= $src.new"), - []>, - Requires<[HasV4T]>; + let opExtendable = 1, isPredicated = 1 in { + defm Pt : ST_Abs_Pred<mnemonic, RC, 0>; + defm NotPt : ST_Abs_Pred<mnemonic, RC, 1>; + } + } +} - let isPredicated = 1 in - def _abs_cPt_nv_V4 : STInst2<(outs), - (ins PredRegs:$src1, globaladdress:$absaddr, IntRegs:$src2), - !strconcat("if ($src1)", - !strconcat(OpcStr, "(##$absaddr) = $src2.new")), +multiclass ST_Abs_Predbase_nv<string mnemonic, RegisterClass RC, bit isNot, + bit isPredNew> { + let PNewValue = !if(isPredNew, "new", "") in + def NAME#_nv_V4 : NVInst_V4<(outs), + (ins PredRegs:$src1, globaladdressExt:$absaddr, RC: $src2), + !if(isNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ", + ") ")#mnemonic#"(##$absaddr) = $src2.new", []>, Requires<[HasV4T]>; +} - let isPredicated = 1 in - def _abs_cNotPt_nv_V4 : STInst2<(outs), - (ins PredRegs:$src1, globaladdress:$absaddr, IntRegs:$src2), - !strconcat("if (!$src1)", - !strconcat(OpcStr, "(##$absaddr) = $src2.new")), - []>, - Requires<[HasV4T]>; +multiclass ST_Abs_Pred_nv<string mnemonic, RegisterClass RC, bit PredNot> { + let PredSense = !if(PredNot, "false", "true") in { + defm _c#NAME : ST_Abs_Predbase_nv<mnemonic, RC, PredNot, 0>; + // Predicate new + defm _cdn#NAME : ST_Abs_Predbase_nv<mnemonic, RC, PredNot, 1>; + } +} - let isPredicated = 1 in - def _abs_cdnPt_nv_V4 : STInst2<(outs), - (ins PredRegs:$src1, globaladdress:$absaddr, IntRegs:$src2), - !strconcat("if ($src1.new)", - !strconcat(OpcStr, "(##$absaddr) = $src2.new")), +let mayStore = 1, isNVStore = 1, isExtended = 1, neverHasSideEffects = 1 in +multiclass ST_Abs_nv<string mnemonic, string CextOp, RegisterClass RC> { + let CextOpcode = CextOp, BaseOpcode = CextOp#_abs in { + let opExtendable = 0, isPredicable = 1 in + def NAME#_nv_V4 : NVInst_V4<(outs), + (ins globaladdressExt:$absaddr, RC:$src), + mnemonic#"(##$absaddr) = $src.new", []>, Requires<[HasV4T]>; - let isPredicated = 1 in - def _abs_cdnNotPt_nv_V4 : STInst2<(outs), - (ins PredRegs:$src1, globaladdress:$absaddr, IntRegs:$src2), - !strconcat("if (!$src1.new)", - !strconcat(OpcStr, "(##$absaddr) = $src2.new")), - []>, - Requires<[HasV4T]>; + let opExtendable = 1, isPredicated = 1 in { + defm Pt : ST_Abs_Pred_nv<mnemonic, RC, 0>; + defm NotPt : ST_Abs_Pred_nv<mnemonic, RC, 1>; + } + } } -let AddedComplexity = 30, isPredicable = 1 in -def STrid_abs_V4 : STInst<(outs), - (ins globaladdress:$absaddr, DoubleRegs:$src), - "memd(##$absaddr) = $src", - [(store (i64 DoubleRegs:$src), - (HexagonCONST32 tglobaladdr:$absaddr))]>, - Requires<[HasV4T]>; +let addrMode = Absolute in { + defm STrib_abs : ST_Abs<"memb", "STrib", IntRegs>, + ST_Abs_nv<"memb", "STrib", IntRegs>, AddrModeRel; -let AddedComplexity = 30, isPredicated = 1 in -def STrid_abs_cPt_V4 : STInst2<(outs), - (ins PredRegs:$src1, globaladdress:$absaddr, DoubleRegs:$src2), - "if ($src1) memd(##$absaddr) = $src2", - []>, - Requires<[HasV4T]>; + defm STrih_abs : ST_Abs<"memh", "STrih", IntRegs>, + ST_Abs_nv<"memh", "STrih", IntRegs>, AddrModeRel; -let AddedComplexity = 30, isPredicated = 1 in -def STrid_abs_cNotPt_V4 : STInst2<(outs), - (ins PredRegs:$src1, globaladdress:$absaddr, DoubleRegs:$src2), - "if (!$src1) memd(##$absaddr) = $src2", - []>, - Requires<[HasV4T]>; + defm STriw_abs : ST_Abs<"memw", "STriw", IntRegs>, + ST_Abs_nv<"memw", "STriw", IntRegs>, AddrModeRel; -let AddedComplexity = 30, isPredicated = 1 in -def STrid_abs_cdnPt_V4 : STInst2<(outs), - (ins PredRegs:$src1, globaladdress:$absaddr, DoubleRegs:$src2), - "if ($src1.new) memd(##$absaddr) = $src2", 
- []>, - Requires<[HasV4T]>; - -let AddedComplexity = 30, isPredicated = 1 in -def STrid_abs_cdnNotPt_V4 : STInst2<(outs), - (ins PredRegs:$src1, globaladdress:$absaddr, DoubleRegs:$src2), - "if (!$src1.new) memd(##$absaddr) = $src2", - []>, - Requires<[HasV4T]>; - -defm STrib : ST_abs<"memb">; -defm STrih : ST_abs<"memh">; -defm STriw : ST_abs<"memw">; + let isNVStorable = 0 in + defm STrid_abs : ST_Abs<"memd", "STrid", DoubleRegs>, AddrModeRel; +} -let Predicates = [HasV4T], AddedComplexity = 30 in +let Predicates = [HasV4T], AddedComplexity = 30 in { def : Pat<(truncstorei8 (i32 IntRegs:$src1), (HexagonCONST32 tglobaladdr:$absaddr)), (STrib_abs_V4 tglobaladdr: $absaddr, IntRegs: $src1)>; -let Predicates = [HasV4T], AddedComplexity = 30 in def : Pat<(truncstorei16 (i32 IntRegs:$src1), (HexagonCONST32 tglobaladdr:$absaddr)), (STrih_abs_V4 tglobaladdr: $absaddr, IntRegs: $src1)>; -let Predicates = [HasV4T], AddedComplexity = 30 in def : Pat<(store (i32 IntRegs:$src1), (HexagonCONST32 tglobaladdr:$absaddr)), (STriw_abs_V4 tglobaladdr: $absaddr, IntRegs: $src1)>; +def : Pat<(store (i64 DoubleRegs:$src1), + (HexagonCONST32 tglobaladdr:$absaddr)), + (STrid_abs_V4 tglobaladdr: $absaddr, DoubleRegs: $src1)>; +} -multiclass LD_abs<string OpcStr> { - let isPredicable = 1 in - def _abs_V4 : LDInst2<(outs IntRegs:$dst), - (ins globaladdress:$absaddr), - !strconcat("$dst = ", !strconcat(OpcStr, "(##$absaddr)")), - []>, - Requires<[HasV4T]>; - - let isPredicated = 1 in - def _abs_cPt_V4 : LDInst2<(outs IntRegs:$dst), - (ins PredRegs:$src1, globaladdress:$absaddr), - !strconcat("if ($src1) $dst = ", - !strconcat(OpcStr, "(##$absaddr)")), +multiclass LD_Abs_Predbase<string mnemonic, RegisterClass RC, bit isNot, + bit isPredNew> { + let PNewValue = !if(isPredNew, "new", "") in + def NAME : LDInst2<(outs RC:$dst), + (ins PredRegs:$src1, globaladdressExt:$absaddr), + !if(isNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ", + ") ")#"$dst = "#mnemonic#"(##$absaddr)", []>, Requires<[HasV4T]>; +} - let isPredicated = 1 in - def _abs_cNotPt_V4 : LDInst2<(outs IntRegs:$dst), - (ins PredRegs:$src1, globaladdress:$absaddr), - !strconcat("if (!$src1) $dst = ", - !strconcat(OpcStr, "(##$absaddr)")), - []>, - Requires<[HasV4T]>; +multiclass LD_Abs_Pred<string mnemonic, RegisterClass RC, bit PredNot> { + let PredSense = !if(PredNot, "false", "true") in { + defm _c#NAME : LD_Abs_Predbase<mnemonic, RC, PredNot, 0>; + // Predicate new + defm _cdn#NAME : LD_Abs_Predbase<mnemonic, RC, PredNot, 1>; + } +} - let isPredicated = 1 in - def _abs_cdnPt_V4 : LDInst2<(outs IntRegs:$dst), - (ins PredRegs:$src1, globaladdress:$absaddr), - !strconcat("if ($src1.new) $dst = ", - !strconcat(OpcStr, "(##$absaddr)")), +let isExtended = 1, neverHasSideEffects = 1 in +multiclass LD_Abs<string mnemonic, string CextOp, RegisterClass RC> { + let CextOpcode = CextOp, BaseOpcode = CextOp#_abs in { + let opExtendable = 1, isPredicable = 1 in + def NAME#_V4 : LDInst2<(outs RC:$dst), + (ins globaladdressExt:$absaddr), + "$dst = "#mnemonic#"(##$absaddr)", []>, Requires<[HasV4T]>; - let isPredicated = 1 in - def _abs_cdnNotPt_V4 : LDInst2<(outs IntRegs:$dst), - (ins PredRegs:$src1, globaladdress:$absaddr), - !strconcat("if (!$src1.new) $dst = ", - !strconcat(OpcStr, "(##$absaddr)")), - []>, - Requires<[HasV4T]>; + let opExtendable = 2, isPredicated = 1 in { + defm Pt_V4 : LD_Abs_Pred<mnemonic, RC, 0>; + defm NotPt_V4 : LD_Abs_Pred<mnemonic, RC, 1>; + } + } } -let AddedComplexity = 30 in -def LDrid_abs_V4 : LDInst<(outs DoubleRegs:$dst), - (ins 
globaladdress:$absaddr), - "$dst = memd(##$absaddr)", - [(set (i64 DoubleRegs:$dst), - (load (HexagonCONST32 tglobaladdr:$absaddr)))]>, - Requires<[HasV4T]>; - -let AddedComplexity = 30, isPredicated = 1 in -def LDrid_abs_cPt_V4 : LDInst2<(outs DoubleRegs:$dst), - (ins PredRegs:$src1, globaladdress:$absaddr), - "if ($src1) $dst = memd(##$absaddr)", - []>, - Requires<[HasV4T]>; - -let AddedComplexity = 30, isPredicated = 1 in -def LDrid_abs_cNotPt_V4 : LDInst2<(outs DoubleRegs:$dst), - (ins PredRegs:$src1, globaladdress:$absaddr), - "if (!$src1) $dst = memd(##$absaddr)", - []>, - Requires<[HasV4T]>; - -let AddedComplexity = 30, isPredicated = 1 in -def LDrid_abs_cdnPt_V4 : LDInst2<(outs DoubleRegs:$dst), - (ins PredRegs:$src1, globaladdress:$absaddr), - "if ($src1.new) $dst = memd(##$absaddr)", - []>, - Requires<[HasV4T]>; - -let AddedComplexity = 30, isPredicated = 1 in -def LDrid_abs_cdnNotPt_V4 : LDInst2<(outs DoubleRegs:$dst), - (ins PredRegs:$src1, globaladdress:$absaddr), - "if (!$src1.new) $dst = memd(##$absaddr)", - []>, - Requires<[HasV4T]>; - -defm LDrib : LD_abs<"memb">; -defm LDriub : LD_abs<"memub">; -defm LDrih : LD_abs<"memh">; -defm LDriuh : LD_abs<"memuh">; -defm LDriw : LD_abs<"memw">; - +let addrMode = Absolute in { + defm LDrib_abs : LD_Abs<"memb", "LDrib", IntRegs>, AddrModeRel; + defm LDriub_abs : LD_Abs<"memub", "LDriub", IntRegs>, AddrModeRel; + defm LDrih_abs : LD_Abs<"memh", "LDrih", IntRegs>, AddrModeRel; + defm LDriuh_abs : LD_Abs<"memuh", "LDriuh", IntRegs>, AddrModeRel; + defm LDriw_abs : LD_Abs<"memw", "LDriw", IntRegs>, AddrModeRel; + defm LDrid_abs : LD_Abs<"memd", "LDrid", DoubleRegs>, AddrModeRel; +} let Predicates = [HasV4T], AddedComplexity = 30 in def : Pat<(i32 (load (HexagonCONST32 tglobaladdr:$absaddr))), @@ -4577,172 +3881,167 @@ defm STrih_ind : ST_indirect_lo<"memh", truncstorei16>; defm STriw_ind : ST_indirect_lo<"memw", store>; // Store - absolute addressing mode: These instruction take constant -// value as the extended operand +// value as the extended operand. 
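A note on the "##" markers that appear throughout the rewritten assembly strings: on Hexagon V4, "##" denotes an operand carried in a separate constant-extender word in the packet, which is why these patterns can relax the old 6-bit u6Imm absolute operands to u0AlwaysExt (an always-extended, full 32-bit immediate). The sketch below is a simplified model of the fit-or-extend decision, not the backend's real encoder; field widths are illustrative only.

#include <cstdio>

// Minimal model of the fit-or-extend decision for an absolute store
// address. The real instruction encodings live in the Hexagon .td files.
static bool fitsUnsigned(unsigned value, unsigned bits) {
  return bits >= 32 || value < (1u << bits);
}

int main() {
  for (unsigned addr : {0x30u, 0x12345678u}) {
    if (fitsUnsigned(addr, 6)) // old u6Imm operand: fits in the slot
      std::printf("memw(#0x%x) = r0  ; no extender needed\n", addr);
    else // u0AlwaysExt: a 32-bit constant-extender word carries ##addr
      std::printf("memw(##0x%x) = r0 ; extender word emitted\n", addr);
  }
  return 0;
}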
multiclass ST_absimm<string OpcStr> { - let isPredicable = 1 in +let isExtended = 1, opExtendable = 0, isPredicable = 1, +validSubTargets = HasV4SubT in def _abs_V4 : STInst2<(outs), - (ins u6Imm:$src1, IntRegs:$src2), - !strconcat(OpcStr, "(#$src1) = $src2"), + (ins u0AlwaysExt:$src1, IntRegs:$src2), + !strconcat(OpcStr, "(##$src1) = $src2"), []>, Requires<[HasV4T]>; - let isPredicated = 1 in +let isExtended = 1, opExtendable = 1, isPredicated = 1, +validSubTargets = HasV4SubT in { def _abs_cPt_V4 : STInst2<(outs), - (ins PredRegs:$src1, u6Imm:$src2, IntRegs:$src3), - !strconcat("if ($src1)", !strconcat(OpcStr, "(#$src2) = $src3")), + (ins PredRegs:$src1, u0AlwaysExt:$src2, IntRegs:$src3), + !strconcat("if ($src1)", !strconcat(OpcStr, "(##$src2) = $src3")), []>, Requires<[HasV4T]>; - let isPredicated = 1 in def _abs_cNotPt_V4 : STInst2<(outs), - (ins PredRegs:$src1, u6Imm:$src2, IntRegs:$src3), - !strconcat("if (!$src1)", !strconcat(OpcStr, "(#$src2) = $src3")), + (ins PredRegs:$src1, u0AlwaysExt:$src2, IntRegs:$src3), + !strconcat("if (!$src1)", !strconcat(OpcStr, "(##$src2) = $src3")), []>, Requires<[HasV4T]>; - let isPredicated = 1 in def _abs_cdnPt_V4 : STInst2<(outs), - (ins PredRegs:$src1, u6Imm:$src2, IntRegs:$src3), + (ins PredRegs:$src1, u0AlwaysExt:$src2, IntRegs:$src3), !strconcat("if ($src1.new)", - !strconcat(OpcStr, "(#$src2) = $src3")), + !strconcat(OpcStr, "(##$src2) = $src3")), []>, Requires<[HasV4T]>; - let isPredicated = 1 in def _abs_cdnNotPt_V4 : STInst2<(outs), - (ins PredRegs:$src1, u6Imm:$src2, IntRegs:$src3), + (ins PredRegs:$src1, u0AlwaysExt:$src2, IntRegs:$src3), !strconcat("if (!$src1.new)", - !strconcat(OpcStr, "(#$src2) = $src3")), + !strconcat(OpcStr, "(##$src2) = $src3")), []>, Requires<[HasV4T]>; +} - def _abs_nv_V4 : STInst2<(outs), - (ins u6Imm:$src1, IntRegs:$src2), - !strconcat(OpcStr, "(#$src1) = $src2.new"), +let isExtended = 1, opExtendable = 0, mayStore = 1, isNVStore = 1, +validSubTargets = HasV4SubT in + def _abs_nv_V4 : NVInst_V4<(outs), + (ins u0AlwaysExt:$src1, IntRegs:$src2), + !strconcat(OpcStr, "(##$src1) = $src2.new"), []>, Requires<[HasV4T]>; - let isPredicated = 1 in - def _abs_cPt_nv_V4 : STInst2<(outs), - (ins PredRegs:$src1, u6Imm:$src2, IntRegs:$src3), +let isExtended = 1, opExtendable = 1, mayStore = 1, isPredicated = 1, +isNVStore = 1, validSubTargets = HasV4SubT in { + def _abs_cPt_nv_V4 : NVInst_V4<(outs), + (ins PredRegs:$src1, u0AlwaysExt:$src2, IntRegs:$src3), !strconcat("if ($src1)", - !strconcat(OpcStr, "(#$src2) = $src3.new")), + !strconcat(OpcStr, "(##$src2) = $src3.new")), []>, Requires<[HasV4T]>; - let isPredicated = 1 in - def _abs_cNotPt_nv_V4 : STInst2<(outs), - (ins PredRegs:$src1, u6Imm:$src2, IntRegs:$src3), + def _abs_cNotPt_nv_V4 : NVInst_V4<(outs), + (ins PredRegs:$src1, u0AlwaysExt:$src2, IntRegs:$src3), !strconcat("if (!$src1)", - !strconcat(OpcStr, "(#$src2) = $src3.new")), + !strconcat(OpcStr, "(##$src2) = $src3.new")), []>, Requires<[HasV4T]>; - let isPredicated = 1 in - def _abs_cdnPt_nv_V4 : STInst2<(outs), - (ins PredRegs:$src1, u6Imm:$src2, IntRegs:$src3), + def _abs_cdnPt_nv_V4 : NVInst_V4<(outs), + (ins PredRegs:$src1, u0AlwaysExt:$src2, IntRegs:$src3), !strconcat("if ($src1.new)", - !strconcat(OpcStr, "(#$src2) = $src3.new")), + !strconcat(OpcStr, "(##$src2) = $src3.new")), []>, Requires<[HasV4T]>; - let isPredicated = 1 in - def _abs_cdnNotPt_nv_V4 : STInst2<(outs), - (ins PredRegs:$src1, u6Imm:$src2, IntRegs:$src3), + def _abs_cdnNotPt_nv_V4 : NVInst_V4<(outs), + (ins PredRegs:$src1, 
u0AlwaysExt:$src2, IntRegs:$src3), !strconcat("if (!$src1.new)", - !strconcat(OpcStr, "(#$src2) = $src3.new")), + !strconcat(OpcStr, "(##$src2) = $src3.new")), []>, Requires<[HasV4T]>; } +} defm STrib_imm : ST_absimm<"memb">; defm STrih_imm : ST_absimm<"memh">; defm STriw_imm : ST_absimm<"memw">; -let Predicates = [HasV4T], AddedComplexity = 30 in -def : Pat<(truncstorei8 (i32 IntRegs:$src1), u6ImmPred:$src2), - (STrib_imm_abs_V4 u6ImmPred:$src2, IntRegs: $src1)>; +let Predicates = [HasV4T], AddedComplexity = 30 in { +def : Pat<(truncstorei8 (i32 IntRegs:$src1), u0AlwaysExtPred:$src2), + (STrib_imm_abs_V4 u0AlwaysExtPred:$src2, IntRegs: $src1)>; -let Predicates = [HasV4T], AddedComplexity = 30 in -def : Pat<(truncstorei16 (i32 IntRegs:$src1), u6ImmPred:$src2), - (STrih_imm_abs_V4 u6ImmPred:$src2, IntRegs: $src1)>; - -let Predicates = [HasV4T], AddedComplexity = 30 in -def : Pat<(store (i32 IntRegs:$src1), u6ImmPred:$src2), - (STriw_imm_abs_V4 u6ImmPred:$src2, IntRegs: $src1)>; +def : Pat<(truncstorei16 (i32 IntRegs:$src1), u0AlwaysExtPred:$src2), + (STrih_imm_abs_V4 u0AlwaysExtPred:$src2, IntRegs: $src1)>; +def : Pat<(store (i32 IntRegs:$src1), u0AlwaysExtPred:$src2), + (STriw_imm_abs_V4 u0AlwaysExtPred:$src2, IntRegs: $src1)>; +} // Load - absolute addressing mode: These instruction take constant // value as the extended operand multiclass LD_absimm<string OpcStr> { - let isPredicable = 1 in +let isExtended = 1, opExtendable = 1, isPredicable = 1, +validSubTargets = HasV4SubT in def _abs_V4 : LDInst2<(outs IntRegs:$dst), - (ins u6Imm:$src), + (ins u0AlwaysExt:$src), !strconcat("$dst = ", - !strconcat(OpcStr, "(#$src)")), + !strconcat(OpcStr, "(##$src)")), []>, Requires<[HasV4T]>; - let isPredicated = 1 in +let isExtended = 1, opExtendable = 2, isPredicated = 1, +validSubTargets = HasV4SubT in { def _abs_cPt_V4 : LDInst2<(outs IntRegs:$dst), - (ins PredRegs:$src1, u6Imm:$src2), + (ins PredRegs:$src1, u0AlwaysExt:$src2), !strconcat("if ($src1) $dst = ", - !strconcat(OpcStr, "(#$src2)")), + !strconcat(OpcStr, "(##$src2)")), []>, Requires<[HasV4T]>; - let isPredicated = 1 in def _abs_cNotPt_V4 : LDInst2<(outs IntRegs:$dst), - (ins PredRegs:$src1, u6Imm:$src2), + (ins PredRegs:$src1, u0AlwaysExt:$src2), !strconcat("if (!$src1) $dst = ", - !strconcat(OpcStr, "(#$src2)")), + !strconcat(OpcStr, "(##$src2)")), []>, Requires<[HasV4T]>; - let isPredicated = 1 in def _abs_cdnPt_V4 : LDInst2<(outs IntRegs:$dst), - (ins PredRegs:$src1, u6Imm:$src2), + (ins PredRegs:$src1, u0AlwaysExt:$src2), !strconcat("if ($src1.new) $dst = ", - !strconcat(OpcStr, "(#$src2)")), + !strconcat(OpcStr, "(##$src2)")), []>, Requires<[HasV4T]>; - let isPredicated = 1 in def _abs_cdnNotPt_V4 : LDInst2<(outs IntRegs:$dst), - (ins PredRegs:$src1, u6Imm:$src2), + (ins PredRegs:$src1, u0AlwaysExt:$src2), !strconcat("if (!$src1.new) $dst = ", - !strconcat(OpcStr, "(#$src2)")), + !strconcat(OpcStr, "(##$src2)")), []>, Requires<[HasV4T]>; } +} -defm LDrib_imm : LD_absimm<"memb">; +defm LDrib_imm : LD_absimm<"memb">; defm LDriub_imm : LD_absimm<"memub">; -defm LDrih_imm : LD_absimm<"memh">; +defm LDrih_imm : LD_absimm<"memh">; defm LDriuh_imm : LD_absimm<"memuh">; -defm LDriw_imm : LD_absimm<"memw">; +defm LDriw_imm : LD_absimm<"memw">; -let Predicates = [HasV4T], AddedComplexity = 30 in -def : Pat<(i32 (load u6ImmPred:$src)), - (LDriw_imm_abs_V4 u6ImmPred:$src)>; +let Predicates = [HasV4T], AddedComplexity = 30 in { +def : Pat<(i32 (load u0AlwaysExtPred:$src)), + (LDriw_imm_abs_V4 u0AlwaysExtPred:$src)>; -let Predicates = [HasV4T], 
AddedComplexity=30 in -def : Pat<(i32 (sextloadi8 u6ImmPred:$src)), - (LDrib_imm_abs_V4 u6ImmPred:$src)>; +def : Pat<(i32 (sextloadi8 u0AlwaysExtPred:$src)), + (LDrib_imm_abs_V4 u0AlwaysExtPred:$src)>; -let Predicates = [HasV4T], AddedComplexity=30 in -def : Pat<(i32 (zextloadi8 u6ImmPred:$src)), - (LDriub_imm_abs_V4 u6ImmPred:$src)>; +def : Pat<(i32 (zextloadi8 u0AlwaysExtPred:$src)), + (LDriub_imm_abs_V4 u0AlwaysExtPred:$src)>; -let Predicates = [HasV4T], AddedComplexity=30 in -def : Pat<(i32 (sextloadi16 u6ImmPred:$src)), - (LDrih_imm_abs_V4 u6ImmPred:$src)>; - -let Predicates = [HasV4T], AddedComplexity=30 in -def : Pat<(i32 (zextloadi16 u6ImmPred:$src)), - (LDriuh_imm_abs_V4 u6ImmPred:$src)>; +def : Pat<(i32 (sextloadi16 u0AlwaysExtPred:$src)), + (LDrih_imm_abs_V4 u0AlwaysExtPred:$src)>; +def : Pat<(i32 (zextloadi16 u0AlwaysExtPred:$src)), + (LDriuh_imm_abs_V4 u0AlwaysExtPred:$src)>; +} // Indexed store double word - global address. // memw(Rs+#u6:2)=#S8 @@ -4764,3 +4063,109 @@ def STrih_offset_ext_V4 : STInst<(outs), [(truncstorei16 (HexagonCONST32 tglobaladdr:$src3), (add IntRegs:$src1, u6_1ImmPred:$src2))]>, Requires<[HasV4T]>; +// Map from store(globaladdress + x) -> memd(#foo + x) +let AddedComplexity = 100 in +def : Pat<(store (i64 DoubleRegs:$src1), + FoldGlobalAddrGP:$addr), + (STrid_abs_V4 FoldGlobalAddrGP:$addr, (i64 DoubleRegs:$src1))>, + Requires<[HasV4T]>; + +def : Pat<(atomic_store_64 FoldGlobalAddrGP:$addr, + (i64 DoubleRegs:$src1)), + (STrid_abs_V4 FoldGlobalAddrGP:$addr, (i64 DoubleRegs:$src1))>, + Requires<[HasV4T]>; + +// Map from store(globaladdress + x) -> memb(#foo + x) +let AddedComplexity = 100 in +def : Pat<(truncstorei8 (i32 IntRegs:$src1), FoldGlobalAddrGP:$addr), + (STrib_abs_V4 FoldGlobalAddrGP:$addr, (i32 IntRegs:$src1))>, + Requires<[HasV4T]>; + +def : Pat<(atomic_store_8 FoldGlobalAddrGP:$addr, (i32 IntRegs:$src1)), + (STrib_abs_V4 FoldGlobalAddrGP:$addr, (i32 IntRegs:$src1))>, + Requires<[HasV4T]>; + +// Map from store(globaladdress + x) -> memh(#foo + x) +let AddedComplexity = 100 in +def : Pat<(truncstorei16 (i32 IntRegs:$src1), FoldGlobalAddrGP:$addr), + (STrih_abs_V4 FoldGlobalAddrGP:$addr, (i32 IntRegs:$src1))>, + Requires<[HasV4T]>; + +def : Pat<(atomic_store_16 FoldGlobalAddrGP:$addr, (i32 IntRegs:$src1)), + (STrih_abs_V4 FoldGlobalAddrGP:$addr, (i32 IntRegs:$src1))>, + Requires<[HasV4T]>; + +// Map from store(globaladdress + x) -> memw(#foo + x) +let AddedComplexity = 100 in +def : Pat<(store (i32 IntRegs:$src1), FoldGlobalAddrGP:$addr), + (STriw_abs_V4 FoldGlobalAddrGP:$addr, (i32 IntRegs:$src1))>, + Requires<[HasV4T]>; + +def : Pat<(atomic_store_32 FoldGlobalAddrGP:$addr, (i32 IntRegs:$src1)), + (STriw_abs_V4 FoldGlobalAddrGP:$addr, (i32 IntRegs:$src1))>, + Requires<[HasV4T]>; + +// Map from load(globaladdress + x) -> memd(#foo + x) +let AddedComplexity = 100 in +def : Pat<(i64 (load FoldGlobalAddrGP:$addr)), + (i64 (LDrid_abs_V4 FoldGlobalAddrGP:$addr))>, + Requires<[HasV4T]>; + +def : Pat<(atomic_load_64 FoldGlobalAddrGP:$addr), + (i64 (LDrid_abs_V4 FoldGlobalAddrGP:$addr))>, + Requires<[HasV4T]>; + +// Map from load(globaladdress + x) -> memb(#foo + x) +let AddedComplexity = 100 in +def : Pat<(i32 (extloadi8 FoldGlobalAddrGP:$addr)), + (i32 (LDrib_abs_V4 FoldGlobalAddrGP:$addr))>, + Requires<[HasV4T]>; + +// Map from load(globaladdress + x) -> memb(#foo + x) +let AddedComplexity = 100 in +def : Pat<(i32 (sextloadi8 FoldGlobalAddrGP:$addr)), + (i32 (LDrib_abs_V4 FoldGlobalAddrGP:$addr))>, + Requires<[HasV4T]>; + +// Map from load(globaladdress + x) -> memh(#foo + x) +let AddedComplexity = 100 in +def : Pat<(i32 (extloadi16 FoldGlobalAddrGP:$addr)), + (i32 (LDrih_abs_V4 FoldGlobalAddrGP:$addr))>, + Requires<[HasV4T]>; + +// Map from load(globaladdress + x) -> memh(#foo + x) +let AddedComplexity = 100 in +def : Pat<(i32 (sextloadi16 FoldGlobalAddrGP:$addr)), + (i32 (LDrih_abs_V4 FoldGlobalAddrGP:$addr))>, + Requires<[HasV4T]>; + +// Map from load(globaladdress + x) -> memuh(#foo + x) +let AddedComplexity = 100 in +def : Pat<(i32 (zextloadi16 FoldGlobalAddrGP:$addr)), + (i32 (LDriuh_abs_V4 FoldGlobalAddrGP:$addr))>, + Requires<[HasV4T]>; + +def : Pat<(atomic_load_16 FoldGlobalAddrGP:$addr), + (i32 (LDriuh_abs_V4 FoldGlobalAddrGP:$addr))>, + Requires<[HasV4T]>; + +// Map from load(globaladdress + x) -> memub(#foo + x) +let AddedComplexity = 100 in +def : Pat<(i32 (zextloadi8 FoldGlobalAddrGP:$addr)), + (i32 (LDriub_abs_V4 FoldGlobalAddrGP:$addr))>, + Requires<[HasV4T]>; + +def : Pat<(atomic_load_8 FoldGlobalAddrGP:$addr), + (i32 (LDriub_abs_V4 FoldGlobalAddrGP:$addr))>, + Requires<[HasV4T]>; + +// Map from load(globaladdress + x) -> memw(#foo + x) +let AddedComplexity = 100 in +def : Pat<(i32 (load FoldGlobalAddrGP:$addr)), + (i32 (LDriw_abs_V4 FoldGlobalAddrGP:$addr))>, + Requires<[HasV4T]>; + +def : Pat<(atomic_load_32 FoldGlobalAddrGP:$addr), + (i32 (LDriw_abs_V4 FoldGlobalAddrGP:$addr))>, + Requires<[HasV4T]>; + diff --git a/lib/Target/Hexagon/HexagonMCInstLower.cpp b/lib/Target/Hexagon/HexagonMCInstLower.cpp index db36ac0..f011d51 100644 --- a/lib/Target/Hexagon/HexagonMCInstLower.cpp +++ b/lib/Target/Hexagon/HexagonMCInstLower.cpp @@ -15,6 +15,7 @@ #include "Hexagon.h" #include "HexagonAsmPrinter.h" #include "HexagonMachineFunctionInfo.h" +#include "MCTargetDesc/HexagonMCInst.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/IR/Constants.h" #include "llvm/MC/MCExpr.h" @@ -38,9 +39,10 @@ static MCOperand GetSymbolRef(const MachineOperand& MO, const MCSymbol* Symbol, } // Create an MCInst from a MachineInstr -void llvm::HexagonLowerToMC(const MachineInstr* MI, MCInst& MCI, +void llvm::HexagonLowerToMC(const MachineInstr* MI, HexagonMCInst& MCI, HexagonAsmPrinter& AP) { MCI.setOpcode(MI->getOpcode()); + MCI.setDesc(MI->getDesc()); for (unsigned i = 0, e = MI->getNumOperands(); i < e; i++) { const MachineOperand &MO = MI->getOperand(i); diff --git a/lib/Target/Hexagon/HexagonMachineScheduler.cpp b/lib/Target/Hexagon/HexagonMachineScheduler.cpp index aef6830..ced17b3 100644 --- a/lib/Target/Hexagon/HexagonMachineScheduler.cpp +++ b/lib/Target/Hexagon/HexagonMachineScheduler.cpp @@ -152,6 +152,12 @@ void VLIWMachineScheduler::schedule() { // Postprocess the DAG to add platform specific artificial dependencies. postprocessDAG(); + SmallVector<SUnit*, 8> TopRoots, BotRoots; + findRootsAndBiasEdges(TopRoots, BotRoots); + + // Initialize the strategy before modifying the DAG. + SchedImpl->initialize(this); + // To view Height/Depth correctly, they should be accessed at least once.
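The comment above alludes to SUnit height and depth being computed lazily and memoized on first query; a debug dump that never reads them would show uninitialized values, so the DEBUG loop that follows touches every unit once before dumping. A toy version of that memoization pattern (stand-in code, not LLVM's SUnit):

#include <cstdio>

// Toy memoized attribute, mimicking how a DAG node might cache its
// critical-path height on first query.
struct Node {
  mutable int Height = -1; // -1 means "not computed yet"
  int computeHeight() const { return 7; /* stand-in for a DAG walk */ }
  int getHeight() const {
    if (Height < 0)
      Height = computeHeight(); // computed lazily, cached afterwards
    return Height;
  }
};

int main() {
  Node N;
  std::printf("cached: %d\n", N.Height);       // -1: nothing computed yet
  std::printf("queried: %d\n", N.getHeight()); // 7: forced, now cached
  return 0;
}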
DEBUG(unsigned maxH = 0; for (unsigned su = 0, e = SUnits.size(); su != e; ++su) @@ -166,7 +172,7 @@ void VLIWMachineScheduler::schedule() { DEBUG(for (unsigned su = 0, e = SUnits.size(); su != e; ++su) SUnits[su].dumpAll(this)); - initQueues(); + initQueues(TopRoots, BotRoots); bool IsTopNode = false; while (SUnit *SU = SchedImpl->pickNode(IsTopNode)) { @@ -186,6 +192,7 @@ void ConvergingVLIWScheduler::initialize(ScheduleDAGMI *dag) { DAG = static_cast<VLIWMachineScheduler*>(dag); SchedModel = DAG->getSchedModel(); TRI = DAG->TRI; + Top.init(DAG, SchedModel); Bot.init(DAG, SchedModel); @@ -193,6 +200,8 @@ void ConvergingVLIWScheduler::initialize(ScheduleDAGMI *dag) { // are disabled, then these HazardRecs will be disabled. const InstrItineraryData *Itin = DAG->getSchedModel()->getInstrItineraries(); const TargetMachine &TM = DAG->MF.getTarget(); + delete Top.HazardRec; + delete Bot.HazardRec; Top.HazardRec = TM.getInstrInfo()->CreateTargetMIHazardRecognizer(Itin, DAG); Bot.HazardRec = TM.getInstrInfo()->CreateTargetMIHazardRecognizer(Itin, DAG); @@ -677,4 +686,3 @@ void ConvergingVLIWScheduler::schedNode(SUnit *SU, bool IsTopNode) { Bot.bumpNode(SU); } } - diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.cpp b/lib/Target/Hexagon/HexagonRegisterInfo.cpp index d1882de..f947dfc 100644 --- a/lib/Target/Hexagon/HexagonRegisterInfo.cpp +++ b/lib/Target/Hexagon/HexagonRegisterInfo.cpp @@ -117,37 +117,15 @@ HexagonRegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const { "architecture version"); } -void HexagonRegisterInfo:: -eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, - MachineBasicBlock::iterator I) const { - MachineInstr &MI = *I; - - if (MI.getOpcode() == Hexagon::ADJCALLSTACKDOWN) { - // Hexagon_TODO: add code - } else if (MI.getOpcode() == Hexagon::ADJCALLSTACKUP) { - // Hexagon_TODO: add code - } else { - llvm_unreachable("Cannot handle this call frame pseudo instruction"); - } - MBB.erase(I); -} - void HexagonRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, - int SPAdj, RegScavenger *RS) const { - + int SPAdj, unsigned FIOperandNum, + RegScavenger *RS) const { // // Hexagon_TODO: Do we need to enforce this for Hexagon? assert(SPAdj == 0 && "Unexpected"); - - unsigned i = 0; MachineInstr &MI = *II; - while (!MI.getOperand(i).isFI()) { - ++i; - assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!"); - } - - int FrameIndex = MI.getOperand(i).getIndex(); + int FrameIndex = MI.getOperand(FIOperandNum).getIndex(); // Addressable stack objects are accessed using neg. offsets from %fp. MachineFunction &MF = *MI.getParent()->getParent(); @@ -167,8 +145,9 @@ void HexagonRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, TII.isValidOffset(MI.getOpcode(), (FrameSize+Offset)) && !TII.isSpillPredRegOp(&MI)) { // Replace frame index with a stack pointer reference. - MI.getOperand(i).ChangeToRegister(getStackRegister(), false, false, true); - MI.getOperand(i+1).ChangeToImmediate(FrameSize+Offset); + MI.getOperand(FIOperandNum).ChangeToRegister(getStackRegister(), false, + false, true); + MI.getOperand(FIOperandNum + 1).ChangeToImmediate(FrameSize+Offset); } else { // Replace frame index with a frame pointer reference. 
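The change running through eliminateFrameIndex is mechanical: the target-independent interface now passes the frame-index operand's position in as FIOperandNum, so each target's scan-for-the-FI-operand loop (deleted above) goes away. A schematic before/after using stand-in types rather than the real MachineInstr API:

#include <cassert>
#include <vector>

struct Operand { bool IsFI = false; };
struct Instr { std::vector<Operand> Ops; };

// Old style: every target rediscovered the frame-index operand itself.
unsigned findFIOperand(const Instr &MI) {
  unsigned i = 0;
  while (!MI.Ops[i].IsFI) {
    ++i;
    assert(i < MI.Ops.size() && "Instr doesn't have FrameIndex operand!");
  }
  return i;
}

// New style: the caller already knows the index and passes it down,
// mirroring the FIOperandNum parameter added to eliminateFrameIndex.
void eliminateFrameIndex(Instr &MI, unsigned FIOperandNum) {
  assert(MI.Ops[FIOperandNum].IsFI && "caller passed the wrong index");
  MI.Ops[FIOperandNum].IsFI = false; // rewrite to reg+offset, elided here
}

int main() {
  Instr MI{{Operand{false}, Operand{true}}};
  eliminateFrameIndex(MI, findFIOperand(MI));
  return 0;
}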
if (!TII.isValidOffset(MI.getOpcode(), Offset)) { @@ -205,8 +184,8 @@ void HexagonRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, dstReg).addReg(FrameReg).addImm(Offset); } - MI.getOperand(i).ChangeToRegister(dstReg, false, false, true); - MI.getOperand(i+1).ChangeToImmediate(0); + MI.getOperand(FIOperandNum).ChangeToRegister(dstReg, false, false,true); + MI.getOperand(FIOperandNum+1).ChangeToImmediate(0); } else if ((MI.getOpcode() == Hexagon::STriw_indexed) || (MI.getOpcode() == Hexagon::STriw) || (MI.getOpcode() == Hexagon::STrid) || @@ -233,29 +212,31 @@ void HexagonRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, TII.get(Hexagon::ADD_ri), resReg).addReg(FrameReg).addImm(Offset); } - MI.getOperand(i).ChangeToRegister(resReg, false, false, true); - MI.getOperand(i+1).ChangeToImmediate(0); + MI.getOperand(FIOperandNum).ChangeToRegister(resReg, false, false,true); + MI.getOperand(FIOperandNum+1).ChangeToImmediate(0); } else if (TII.isMemOp(&MI)) { unsigned resReg = HEXAGON_RESERVED_REG_1; if (!MFI.hasVarSizedObjects() && TII.isValidOffset(MI.getOpcode(), (FrameSize+Offset))) { - MI.getOperand(i).ChangeToRegister(getStackRegister(), false, false, - true); - MI.getOperand(i+1).ChangeToImmediate(FrameSize+Offset); + MI.getOperand(FIOperandNum).ChangeToRegister(getStackRegister(), + false, false, true); + MI.getOperand(FIOperandNum+1).ChangeToImmediate(FrameSize+Offset); } else if (!TII.isValidOffset(Hexagon::ADD_ri, Offset)) { BuildMI(*MI.getParent(), II, MI.getDebugLoc(), TII.get(Hexagon::CONST32_Int_Real), resReg).addImm(Offset); BuildMI(*MI.getParent(), II, MI.getDebugLoc(), TII.get(Hexagon::ADD_rr), resReg).addReg(FrameReg).addReg(resReg); - MI.getOperand(i).ChangeToRegister(resReg, false, false, true); - MI.getOperand(i+1).ChangeToImmediate(0); + MI.getOperand(FIOperandNum).ChangeToRegister(resReg, false, false, + true); + MI.getOperand(FIOperandNum+1).ChangeToImmediate(0); } else { BuildMI(*MI.getParent(), II, MI.getDebugLoc(), TII.get(Hexagon::ADD_ri), resReg).addReg(FrameReg).addImm(Offset); - MI.getOperand(i).ChangeToRegister(resReg, false, false, true); - MI.getOperand(i+1).ChangeToImmediate(0); + MI.getOperand(FIOperandNum).ChangeToRegister(resReg, false, false, + true); + MI.getOperand(FIOperandNum+1).ChangeToImmediate(0); } } else { unsigned dstReg = MI.getOperand(0).getReg(); @@ -265,14 +246,14 @@ void HexagonRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, TII.get(Hexagon::ADD_rr), dstReg).addReg(FrameReg).addReg(dstReg); // Can we delete MI??? r2 = add (r2, #0). - MI.getOperand(i).ChangeToRegister(dstReg, false, false, true); - MI.getOperand(i+1).ChangeToImmediate(0); + MI.getOperand(FIOperandNum).ChangeToRegister(dstReg, false, false,true); + MI.getOperand(FIOperandNum+1).ChangeToImmediate(0); } } else { // If the offset is small enough to fit in the immediate field, directly // encode it. - MI.getOperand(i).ChangeToRegister(FrameReg, false); - MI.getOperand(i+1).ChangeToImmediate(Offset); + MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, false); + MI.getOperand(FIOperandNum+1).ChangeToImmediate(Offset); } } @@ -310,58 +291,6 @@ void HexagonRegisterInfo::getInitialFrameState(std::vector<MachineMove> Moves.push_back(MachineMove(0, Dst, Src)); } -// Get the weight in units of pressure for this register class. 
-const RegClassWeight & -HexagonRegisterInfo::getRegClassWeight(const TargetRegisterClass *RC) const { - // Each TargetRegisterClass has a per register weight, and weight - // limit which must be less than the limits of its pressure sets. - static const RegClassWeight RCWeightTable[] = { - {1, 32}, // IntRegs - {1, 8}, // CRRegs - {1, 4}, // PredRegs - {2, 16}, // DoubleRegs - {0, 0} }; - return RCWeightTable[RC->getID()]; -} - -/// Get the number of dimensions of register pressure. -unsigned HexagonRegisterInfo::getNumRegPressureSets() const { - return 4; -} - -/// Get the name of this register unit pressure set. -const char *HexagonRegisterInfo::getRegPressureSetName(unsigned Idx) const { - static const char *const RegPressureSetName[] = { - "IntRegsRegSet", - "CRRegsRegSet", - "PredRegsRegSet", - "DoubleRegsRegSet" - }; - assert((Idx < 4) && "Index out of bounds"); - return RegPressureSetName[Idx]; -} - -/// Get the register unit pressure limit for this dimension. -/// This limit must be adjusted dynamically for reserved registers. -unsigned HexagonRegisterInfo::getRegPressureSetLimit(unsigned Idx) const { - static const int RegPressureLimit [] = { 16, 4, 2, 8 }; - assert((Idx < 4) && "Index out of bounds"); - return RegPressureLimit[Idx]; -} - -const int* -HexagonRegisterInfo::getRegClassPressureSets(const TargetRegisterClass *RC) - const { - static const int RCSetsTable[] = { - 0, -1, // IntRegs - 1, -1, // CRRegs - 2, -1, // PredRegs - 0, -1, // DoubleRegs - -1 }; - static const unsigned RCSetStartTable[] = { 0, 2, 4, 6, 0 }; - unsigned SetListStart = RCSetStartTable[RC->getID()]; - return &RCSetsTable[SetListStart]; -} unsigned HexagonRegisterInfo::getEHExceptionRegister() const { llvm_unreachable("What is the exception register"); } diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.h b/lib/Target/Hexagon/HexagonRegisterInfo.h index e8f3cfb..8a3f94a 100644 --- a/lib/Target/Hexagon/HexagonRegisterInfo.h +++ b/lib/Target/Hexagon/HexagonRegisterInfo.h @@ -56,12 +56,9 @@ struct HexagonRegisterInfo : public HexagonGenRegisterInfo { BitVector getReservedRegs(const MachineFunction &MF) const; - void eliminateCallFramePseudoInstr(MachineFunction &MF, - MachineBasicBlock &MBB, - MachineBasicBlock::iterator I) const; - void eliminateFrameIndex(MachineBasicBlock::iterator II, - int SPAdj, RegScavenger *RS = NULL) const; + int SPAdj, unsigned FIOperandNum, + RegScavenger *RS = NULL) const; /// determineFrameLayout - Determine the size of the frame and maximum call /// frame size. @@ -87,11 +84,6 @@ struct HexagonRegisterInfo : public HexagonGenRegisterInfo { // Exception handling queries. 
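The pressure-set queries deleted above (their declarations are removed from the header just below) kept hand-written tables indexed by register-class ID in lockstep with the .td definitions; presumably the TableGen-generated HexagonGenRegisterInfo now supplies these, though the generated file is not part of this diff, so that is an assumption. The model those tables encoded is simple: each live register adds its class weight to every pressure set its class maps to, and a set over its limit signals likely spilling. A small illustration using the weights and the shared IntRegs/DoubleRegs pressure set from the removed tables:

#include <cstdio>

int main() {
  // From the deleted Hexagon tables: IntRegs weigh 1, DoubleRegs weigh 2,
  // both map to pressure set 0 ("IntRegsRegSet") with a limit of 16.
  const unsigned IntRegWeight = 1, DoubleRegWeight = 2;
  const unsigned IntRegsSetLimit = 16;
  unsigned liveInt = 10, liveDouble = 4; // hypothetical live counts
  unsigned pressure = liveInt * IntRegWeight + liveDouble * DoubleRegWeight;
  std::printf("pressure %u vs limit %u -> %s\n", pressure, IntRegsSetLimit,
              pressure > IntRegsSetLimit ? "spill likely" : "ok");
  return 0;
}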
unsigned getEHExceptionRegister() const; unsigned getEHHandlerRegister() const; - const RegClassWeight &getRegClassWeight(const TargetRegisterClass *RC) const; - unsigned getNumRegPressureSets() const; - const char *getRegPressureSetName(unsigned Idx) const; - unsigned getRegPressureSetLimit(unsigned Idx) const; - const int* getRegClassPressureSets(const TargetRegisterClass *RC) const; }; } // end namespace llvm diff --git a/lib/Target/Hexagon/HexagonSchedule.td b/lib/Target/Hexagon/HexagonSchedule.td index b5ff69a..c2cfbb9 100644 --- a/lib/Target/Hexagon/HexagonSchedule.td +++ b/lib/Target/Hexagon/HexagonSchedule.td @@ -8,10 +8,11 @@ //===----------------------------------------------------------------------===// // Functional Units -def LUNIT : FuncUnit; -def LSUNIT : FuncUnit; -def MUNIT : FuncUnit; -def SUNIT : FuncUnit; +def LSUNIT : FuncUnit; // SLOT0 +def LUNIT : FuncUnit; // SLOT1 +def MUNIT : FuncUnit; // SLOT2 +def SUNIT : FuncUnit; // SLOT3 +def LOOPUNIT : FuncUnit; // Itinerary classes def ALU32 : InstrItinClass; @@ -20,27 +21,34 @@ def CR : InstrItinClass; def J : InstrItinClass; def JR : InstrItinClass; def LD : InstrItinClass; +def LD0 : InstrItinClass; def M : InstrItinClass; def ST : InstrItinClass; +def ST0 : InstrItinClass; def S : InstrItinClass; def SYS : InstrItinClass; -def MARKER : InstrItinClass; +def ENDLOOP : InstrItinClass; def PSEUDO : InstrItinClass; +def PSEUDOM : InstrItinClass; def HexagonItineraries : - ProcessorItineraries<[LUNIT, LSUNIT, MUNIT, SUNIT], [], [ + ProcessorItineraries<[LSUNIT, LUNIT, MUNIT, SUNIT, LOOPUNIT], [], [ InstrItinData<ALU32 , [InstrStage<1, [LUNIT, LSUNIT, MUNIT, SUNIT]>]>, InstrItinData<ALU64 , [InstrStage<1, [MUNIT, SUNIT]>]>, InstrItinData<CR , [InstrStage<1, [SUNIT]>]>, InstrItinData<J , [InstrStage<1, [SUNIT, MUNIT]>]>, InstrItinData<JR , [InstrStage<1, [MUNIT]>]>, InstrItinData<LD , [InstrStage<1, [LUNIT, LSUNIT]>]>, + InstrItinData<LD0 , [InstrStage<1, [LSUNIT]>]>, InstrItinData<M , [InstrStage<1, [MUNIT, SUNIT]>]>, InstrItinData<ST , [InstrStage<1, [LSUNIT]>]>, + InstrItinData<ST0 , [InstrStage<1, [LSUNIT]>]>, InstrItinData<S , [InstrStage<1, [SUNIT, MUNIT]>]>, InstrItinData<SYS , [InstrStage<1, [LSUNIT]>]>, - InstrItinData<MARKER , [InstrStage<1, [LUNIT, LSUNIT, MUNIT, SUNIT]>]>, - InstrItinData<PSEUDO , [InstrStage<1, [LUNIT, LSUNIT, MUNIT, SUNIT]>]> + InstrItinData<ENDLOOP, [InstrStage<1, [LOOPUNIT]>]>, + InstrItinData<PSEUDO , [InstrStage<1, [LUNIT, LSUNIT, MUNIT, SUNIT]>]>, + InstrItinData<PSEUDOM, [InstrStage<1, [MUNIT, SUNIT], 0>, + InstrStage<1, [MUNIT, SUNIT]>]> ]>; def HexagonModel : SchedMachineModel { diff --git a/lib/Target/Hexagon/HexagonScheduleV4.td b/lib/Target/Hexagon/HexagonScheduleV4.td index 5668ae8..ef72cf4 100644 --- a/lib/Target/Hexagon/HexagonScheduleV4.td +++ b/lib/Target/Hexagon/HexagonScheduleV4.td @@ -28,6 +28,10 @@ def SLOT0 : FuncUnit; def SLOT1 : FuncUnit; def SLOT2 : FuncUnit; def SLOT3 : FuncUnit; +// Endloop is a pseudo instruction that is encoded with 2 bits in a packet +// rather than taking an execution slot. This special unit is needed +// to schedule an ENDLOOP with 4 other instructions. +def SLOT_ENDLOOP: FuncUnit; // Itinerary classes. 
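A note on SLOT_ENDLOOP above: as the in-file comment says, ENDLOOP is encoded in two spare bits of the packet word rather than occupying one of the four execution slots, so giving it a fictitious fifth functional unit lets the itinerary-driven packetizer accept it alongside four real instructions. A toy resource model of the idea (the real mechanism is the TableGen-built DFA packetizer):

#include <cstdio>

// Toy packet model: four real slots plus one pseudo-resource for
// ENDLOOP, which is encoded in packet bits, not an execution slot.
enum : unsigned {
  SLOT0 = 1u << 0, SLOT1 = 1u << 1, SLOT2 = 1u << 2, SLOT3 = 1u << 3,
  SLOT_ENDLOOP = 1u << 4,
};

struct Packet {
  unsigned Used = 0;
  bool tryAdd(unsigned Needs) {
    if (Used & Needs) return false; // resource already taken
    Used |= Needs;
    return true;
  }
};

int main() {
  Packet P;
  for (unsigned N : {SLOT0, SLOT1, SLOT2, SLOT3}) // four real instructions
    P.tryAdd(N);
  // A fifth slot-consuming instruction is rejected, but ENDLOOP still fits:
  std::printf("5th ALU op fits: %d\n", P.tryAdd(SLOT2));        // prints 0
  std::printf("ENDLOOP fits:    %d\n", P.tryAdd(SLOT_ENDLOOP)); // prints 1
  return 0;
}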
def NV_V4 : InstrItinClass; @@ -36,22 +40,26 @@ def MEM_V4 : InstrItinClass; def PREFIX : InstrItinClass; def HexagonItinerariesV4 : - ProcessorItineraries<[SLOT0, SLOT1, SLOT2, SLOT3], [], [ + ProcessorItineraries<[SLOT0, SLOT1, SLOT2, SLOT3, SLOT_ENDLOOP], [], [ InstrItinData<ALU32 , [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>, InstrItinData<ALU64 , [InstrStage<1, [SLOT2, SLOT3]>]>, InstrItinData<CR , [InstrStage<1, [SLOT3]>]>, InstrItinData<J , [InstrStage<1, [SLOT2, SLOT3]>]>, InstrItinData<JR , [InstrStage<1, [SLOT2]>]>, InstrItinData<LD , [InstrStage<1, [SLOT0, SLOT1]>]>, + InstrItinData<LD0 , [InstrStage<1, [SLOT0]>]>, InstrItinData<M , [InstrStage<1, [SLOT2, SLOT3]>]>, InstrItinData<ST , [InstrStage<1, [SLOT0, SLOT1]>]>, + InstrItinData<ST0 , [InstrStage<1, [SLOT0]>]>, InstrItinData<S , [InstrStage<1, [SLOT2, SLOT3]>]>, InstrItinData<SYS , [InstrStage<1, [SLOT0]>]>, InstrItinData<NV_V4 , [InstrStage<1, [SLOT0]>]>, InstrItinData<MEM_V4 , [InstrStage<1, [SLOT0]>]>, - InstrItinData<MARKER , [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>, + InstrItinData<ENDLOOP, [InstrStage<1, [SLOT_ENDLOOP]>]>, InstrItinData<PREFIX , [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>, - InstrItinData<PSEUDO , [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]> + InstrItinData<PSEUDO , [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>, + InstrItinData<PSEUDOM, [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [SLOT2, SLOT3]>]> ]>; def HexagonModelV4 : SchedMachineModel { diff --git a/lib/Target/Hexagon/HexagonTargetMachine.cpp b/lib/Target/Hexagon/HexagonTargetMachine.cpp index 287b3d6..d9fef3e 100644 --- a/lib/Target/Hexagon/HexagonTargetMachine.cpp +++ b/lib/Target/Hexagon/HexagonTargetMachine.cpp @@ -122,7 +122,7 @@ TargetPassConfig *HexagonTargetMachine::createPassConfig(PassManagerBase &PM) { bool HexagonPassConfig::addInstSelector() { addPass(createHexagonRemoveExtendOps(getHexagonTargetMachine())); - addPass(createHexagonISelDag(getHexagonTargetMachine())); + addPass(createHexagonISelDag(getHexagonTargetMachine(), getOptLevel())); addPass(createHexagonPeephole()); return false; } diff --git a/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp b/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp index 409a243..aff6b86 100644 --- a/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp +++ b/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp @@ -376,7 +376,6 @@ bool HexagonPacketizerList::IsNewifyStore (MachineInstr* MI) { case Hexagon::STrib_indexed: case Hexagon::STrib_indexed_shl_V4: case Hexagon::STrib_shl_V4: - case Hexagon::STrib_GP_V4: case Hexagon::STb_GP_V4: case Hexagon::POST_STbri: case Hexagon::STrib_cPt: @@ -399,17 +398,12 @@ bool HexagonPacketizerList::IsNewifyStore (MachineInstr* MI) { case Hexagon::STb_GP_cNotPt_V4: case Hexagon::STb_GP_cdnPt_V4: case Hexagon::STb_GP_cdnNotPt_V4: - case Hexagon::STrib_GP_cPt_V4: - case Hexagon::STrib_GP_cNotPt_V4: - case Hexagon::STrib_GP_cdnPt_V4: - case Hexagon::STrib_GP_cdnNotPt_V4: // store halfword case Hexagon::STrih: case Hexagon::STrih_indexed: case Hexagon::STrih_indexed_shl_V4: case Hexagon::STrih_shl_V4: - case Hexagon::STrih_GP_V4: case Hexagon::STh_GP_V4: case Hexagon::POST_SThri: case Hexagon::STrih_cPt: @@ -432,17 +426,12 @@ bool HexagonPacketizerList::IsNewifyStore (MachineInstr* MI) { case Hexagon::STh_GP_cNotPt_V4: case Hexagon::STh_GP_cdnPt_V4: case Hexagon::STh_GP_cdnNotPt_V4: - case Hexagon::STrih_GP_cPt_V4: - case Hexagon::STrih_GP_cNotPt_V4: - case Hexagon::STrih_GP_cdnPt_V4: - case Hexagon::STrih_GP_cdnNotPt_V4: // store word case Hexagon::STriw: case 
Hexagon::STriw_indexed: case Hexagon::STriw_indexed_shl_V4: case Hexagon::STriw_shl_V4: - case Hexagon::STriw_GP_V4: case Hexagon::STw_GP_V4: case Hexagon::POST_STwri: case Hexagon::STriw_cPt: @@ -465,10 +454,6 @@ bool HexagonPacketizerList::IsNewifyStore (MachineInstr* MI) { case Hexagon::STw_GP_cNotPt_V4: case Hexagon::STw_GP_cdnPt_V4: case Hexagon::STw_GP_cdnNotPt_V4: - case Hexagon::STriw_GP_cPt_V4: - case Hexagon::STriw_GP_cNotPt_V4: - case Hexagon::STriw_GP_cdnPt_V4: - case Hexagon::STriw_GP_cdnNotPt_V4: return QRI->Subtarget.hasV4TOps(); } return false; @@ -508,9 +493,6 @@ static int GetDotNewOp(const int opc) { case Hexagon::STrib_shl_V4: return Hexagon::STrib_shl_nv_V4; - case Hexagon::STrib_GP_V4: - return Hexagon::STrib_GP_nv_V4; - case Hexagon::STb_GP_V4: return Hexagon::STb_GP_nv_V4; @@ -577,18 +559,6 @@ static int GetDotNewOp(const int opc) { case Hexagon::STb_GP_cdnNotPt_V4: return Hexagon::STb_GP_cdnNotPt_nv_V4; - case Hexagon::STrib_GP_cPt_V4: - return Hexagon::STrib_GP_cPt_nv_V4; - - case Hexagon::STrib_GP_cNotPt_V4: - return Hexagon::STrib_GP_cNotPt_nv_V4; - - case Hexagon::STrib_GP_cdnPt_V4: - return Hexagon::STrib_GP_cdnPt_nv_V4; - - case Hexagon::STrib_GP_cdnNotPt_V4: - return Hexagon::STrib_GP_cdnNotPt_nv_V4; - // store new value halfword case Hexagon::STrih: return Hexagon::STrih_nv_V4; @@ -602,9 +572,6 @@ static int GetDotNewOp(const int opc) { case Hexagon::STrih_shl_V4: return Hexagon::STrih_shl_nv_V4; - case Hexagon::STrih_GP_V4: - return Hexagon::STrih_GP_nv_V4; - case Hexagon::STh_GP_V4: return Hexagon::STh_GP_nv_V4; @@ -671,18 +638,6 @@ static int GetDotNewOp(const int opc) { case Hexagon::STh_GP_cdnNotPt_V4: return Hexagon::STh_GP_cdnNotPt_nv_V4; - case Hexagon::STrih_GP_cPt_V4: - return Hexagon::STrih_GP_cPt_nv_V4; - - case Hexagon::STrih_GP_cNotPt_V4: - return Hexagon::STrih_GP_cNotPt_nv_V4; - - case Hexagon::STrih_GP_cdnPt_V4: - return Hexagon::STrih_GP_cdnPt_nv_V4; - - case Hexagon::STrih_GP_cdnNotPt_V4: - return Hexagon::STrih_GP_cdnNotPt_nv_V4; - // store new value word case Hexagon::STriw: return Hexagon::STriw_nv_V4; @@ -696,9 +651,6 @@ static int GetDotNewOp(const int opc) { case Hexagon::STriw_shl_V4: return Hexagon::STriw_shl_nv_V4; - case Hexagon::STriw_GP_V4: - return Hexagon::STriw_GP_nv_V4; - case Hexagon::STw_GP_V4: return Hexagon::STw_GP_nv_V4; @@ -765,17 +717,6 @@ static int GetDotNewOp(const int opc) { case Hexagon::STw_GP_cdnNotPt_V4: return Hexagon::STw_GP_cdnNotPt_nv_V4; - case Hexagon::STriw_GP_cPt_V4: - return Hexagon::STriw_GP_cPt_nv_V4; - - case Hexagon::STriw_GP_cNotPt_V4: - return Hexagon::STriw_GP_cNotPt_nv_V4; - - case Hexagon::STriw_GP_cdnPt_V4: - return Hexagon::STriw_GP_cdnPt_nv_V4; - - case Hexagon::STriw_GP_cdnNotPt_V4: - return Hexagon::STriw_GP_cdnNotPt_nv_V4; } } @@ -821,12 +762,6 @@ static int GetDotNewPredOp(const int opc) { case Hexagon::STb_GP_cNotPt_V4 : return Hexagon::STb_GP_cdnNotPt_V4; - case Hexagon::STrib_GP_cPt_V4 : - return Hexagon::STrib_GP_cdnPt_V4; - - case Hexagon::STrib_GP_cNotPt_V4 : - return Hexagon::STrib_GP_cdnNotPt_V4; - // Store doubleword conditionally case Hexagon::STrid_cPt : return Hexagon::STrid_cdnPt_V4; @@ -858,12 +793,6 @@ static int GetDotNewPredOp(const int opc) { case Hexagon::STd_GP_cNotPt_V4 : return Hexagon::STd_GP_cdnNotPt_V4; - case Hexagon::STrid_GP_cPt_V4 : - return Hexagon::STrid_GP_cdnPt_V4; - - case Hexagon::STrid_GP_cNotPt_V4 : - return Hexagon::STrid_GP_cdnNotPt_V4; - // Store halfword conditionally case Hexagon::STrih_cPt : return Hexagon::STrih_cdnPt_V4; @@ -901,12 +830,6 
@@ static int GetDotNewPredOp(const int opc) { case Hexagon::STh_GP_cNotPt_V4 : return Hexagon::STh_GP_cdnNotPt_V4; - case Hexagon::STrih_GP_cPt_V4 : - return Hexagon::STrih_GP_cdnPt_V4; - - case Hexagon::STrih_GP_cNotPt_V4 : - return Hexagon::STrih_GP_cdnNotPt_V4; - // Store word conditionally case Hexagon::STriw_cPt : return Hexagon::STriw_cdnPt_V4; @@ -944,12 +867,6 @@ static int GetDotNewPredOp(const int opc) { case Hexagon::STw_GP_cNotPt_V4 : return Hexagon::STw_GP_cdnNotPt_V4; - case Hexagon::STriw_GP_cPt_V4 : - return Hexagon::STriw_GP_cdnPt_V4; - - case Hexagon::STriw_GP_cNotPt_V4 : - return Hexagon::STriw_GP_cdnNotPt_V4; - // Condtional Jumps case Hexagon::JMP_c: return Hexagon::JMP_cdnPt; @@ -1092,72 +1009,36 @@ static int GetDotNewPredOp(const int opc) { // V4 indexed+scaled load - case Hexagon::LDrid_indexed_cPt_V4 : - return Hexagon::LDrid_indexed_cdnPt_V4; - - case Hexagon::LDrid_indexed_cNotPt_V4 : - return Hexagon::LDrid_indexed_cdnNotPt_V4; - case Hexagon::LDrid_indexed_shl_cPt_V4 : return Hexagon::LDrid_indexed_shl_cdnPt_V4; case Hexagon::LDrid_indexed_shl_cNotPt_V4 : return Hexagon::LDrid_indexed_shl_cdnNotPt_V4; - case Hexagon::LDrib_indexed_cPt_V4 : - return Hexagon::LDrib_indexed_cdnPt_V4; - - case Hexagon::LDrib_indexed_cNotPt_V4 : - return Hexagon::LDrib_indexed_cdnNotPt_V4; - case Hexagon::LDrib_indexed_shl_cPt_V4 : return Hexagon::LDrib_indexed_shl_cdnPt_V4; case Hexagon::LDrib_indexed_shl_cNotPt_V4 : return Hexagon::LDrib_indexed_shl_cdnNotPt_V4; - case Hexagon::LDriub_indexed_cPt_V4 : - return Hexagon::LDriub_indexed_cdnPt_V4; - - case Hexagon::LDriub_indexed_cNotPt_V4 : - return Hexagon::LDriub_indexed_cdnNotPt_V4; - case Hexagon::LDriub_indexed_shl_cPt_V4 : return Hexagon::LDriub_indexed_shl_cdnPt_V4; case Hexagon::LDriub_indexed_shl_cNotPt_V4 : return Hexagon::LDriub_indexed_shl_cdnNotPt_V4; - case Hexagon::LDrih_indexed_cPt_V4 : - return Hexagon::LDrih_indexed_cdnPt_V4; - - case Hexagon::LDrih_indexed_cNotPt_V4 : - return Hexagon::LDrih_indexed_cdnNotPt_V4; - case Hexagon::LDrih_indexed_shl_cPt_V4 : return Hexagon::LDrih_indexed_shl_cdnPt_V4; case Hexagon::LDrih_indexed_shl_cNotPt_V4 : return Hexagon::LDrih_indexed_shl_cdnNotPt_V4; - case Hexagon::LDriuh_indexed_cPt_V4 : - return Hexagon::LDriuh_indexed_cdnPt_V4; - - case Hexagon::LDriuh_indexed_cNotPt_V4 : - return Hexagon::LDriuh_indexed_cdnNotPt_V4; - case Hexagon::LDriuh_indexed_shl_cPt_V4 : return Hexagon::LDriuh_indexed_shl_cdnPt_V4; case Hexagon::LDriuh_indexed_shl_cNotPt_V4 : return Hexagon::LDriuh_indexed_shl_cdnNotPt_V4; - case Hexagon::LDriw_indexed_cPt_V4 : - return Hexagon::LDriw_indexed_cdnPt_V4; - - case Hexagon::LDriw_indexed_cNotPt_V4 : - return Hexagon::LDriw_indexed_cdnNotPt_V4; - case Hexagon::LDriw_indexed_shl_cPt_V4 : return Hexagon::LDriw_indexed_shl_cdnPt_V4; @@ -1202,42 +1083,6 @@ static int GetDotNewPredOp(const int opc) { case Hexagon::LDw_GP_cNotPt_V4: return Hexagon::LDw_GP_cdnNotPt_V4; - case Hexagon::LDrid_GP_cPt_V4: - return Hexagon::LDrid_GP_cdnPt_V4; - - case Hexagon::LDrid_GP_cNotPt_V4: - return Hexagon::LDrid_GP_cdnNotPt_V4; - - case Hexagon::LDrib_GP_cPt_V4: - return Hexagon::LDrib_GP_cdnPt_V4; - - case Hexagon::LDrib_GP_cNotPt_V4: - return Hexagon::LDrib_GP_cdnNotPt_V4; - - case Hexagon::LDriub_GP_cPt_V4: - return Hexagon::LDriub_GP_cdnPt_V4; - - case Hexagon::LDriub_GP_cNotPt_V4: - return Hexagon::LDriub_GP_cdnNotPt_V4; - - case Hexagon::LDrih_GP_cPt_V4: - return Hexagon::LDrih_GP_cdnPt_V4; - - case Hexagon::LDrih_GP_cNotPt_V4: - return Hexagon::LDrih_GP_cdnNotPt_V4; - 
- case Hexagon::LDriuh_GP_cPt_V4: - return Hexagon::LDriuh_GP_cdnPt_V4; - - case Hexagon::LDriuh_GP_cNotPt_V4: - return Hexagon::LDriuh_GP_cdnNotPt_V4; - - case Hexagon::LDriw_GP_cPt_V4: - return Hexagon::LDriw_GP_cdnPt_V4; - - case Hexagon::LDriw_GP_cNotPt_V4: - return Hexagon::LDriw_GP_cdnNotPt_V4; - // Conditional store new-value byte case Hexagon::STrib_cPt_nv_V4 : return Hexagon::STrib_cdnPt_nv_V4; @@ -1265,12 +1110,6 @@ static int GetDotNewPredOp(const int opc) { case Hexagon::STb_GP_cNotPt_nv_V4 : return Hexagon::STb_GP_cdnNotPt_nv_V4; - case Hexagon::STrib_GP_cPt_nv_V4 : - return Hexagon::STrib_GP_cdnPt_nv_V4; - - case Hexagon::STrib_GP_cNotPt_nv_V4 : - return Hexagon::STrib_GP_cdnNotPt_nv_V4; - // Conditional store new-value halfword case Hexagon::STrih_cPt_nv_V4 : return Hexagon::STrih_cdnPt_nv_V4; @@ -1298,12 +1137,6 @@ static int GetDotNewPredOp(const int opc) { case Hexagon::STh_GP_cNotPt_nv_V4 : return Hexagon::STh_GP_cdnNotPt_nv_V4; - case Hexagon::STrih_GP_cPt_nv_V4 : - return Hexagon::STrih_GP_cdnPt_nv_V4; - - case Hexagon::STrih_GP_cNotPt_nv_V4 : - return Hexagon::STrih_GP_cdnNotPt_nv_V4; - // Conditional store new-value word case Hexagon::STriw_cPt_nv_V4 : return Hexagon::STriw_cdnPt_nv_V4; @@ -1331,12 +1164,6 @@ static int GetDotNewPredOp(const int opc) { case Hexagon::STw_GP_cNotPt_nv_V4 : return Hexagon::STw_GP_cdnNotPt_nv_V4; - case Hexagon::STriw_GP_cPt_nv_V4 : - return Hexagon::STriw_GP_cdnPt_nv_V4; - - case Hexagon::STriw_GP_cNotPt_nv_V4 : - return Hexagon::STriw_GP_cdnNotPt_nv_V4; - // Conditional add case Hexagon::ADD_ri_cPt : return Hexagon::ADD_ri_cdnPt; @@ -1623,72 +1450,36 @@ static int GetDotOldOp(const int opc) { // V4 indexed+scaled Load - case Hexagon::LDrid_indexed_cdnPt_V4 : - return Hexagon::LDrid_indexed_cPt_V4; - - case Hexagon::LDrid_indexed_cdnNotPt_V4 : - return Hexagon::LDrid_indexed_cNotPt_V4; - case Hexagon::LDrid_indexed_shl_cdnPt_V4 : return Hexagon::LDrid_indexed_shl_cPt_V4; case Hexagon::LDrid_indexed_shl_cdnNotPt_V4 : return Hexagon::LDrid_indexed_shl_cNotPt_V4; - case Hexagon::LDrib_indexed_cdnPt_V4 : - return Hexagon::LDrib_indexed_cPt_V4; - - case Hexagon::LDrib_indexed_cdnNotPt_V4 : - return Hexagon::LDrib_indexed_cNotPt_V4; - case Hexagon::LDrib_indexed_shl_cdnPt_V4 : return Hexagon::LDrib_indexed_shl_cPt_V4; case Hexagon::LDrib_indexed_shl_cdnNotPt_V4 : return Hexagon::LDrib_indexed_shl_cNotPt_V4; - case Hexagon::LDriub_indexed_cdnPt_V4 : - return Hexagon::LDriub_indexed_cPt_V4; - - case Hexagon::LDriub_indexed_cdnNotPt_V4 : - return Hexagon::LDriub_indexed_cNotPt_V4; - case Hexagon::LDriub_indexed_shl_cdnPt_V4 : return Hexagon::LDriub_indexed_shl_cPt_V4; case Hexagon::LDriub_indexed_shl_cdnNotPt_V4 : return Hexagon::LDriub_indexed_shl_cNotPt_V4; - case Hexagon::LDrih_indexed_cdnPt_V4 : - return Hexagon::LDrih_indexed_cPt_V4; - - case Hexagon::LDrih_indexed_cdnNotPt_V4 : - return Hexagon::LDrih_indexed_cNotPt_V4; - case Hexagon::LDrih_indexed_shl_cdnPt_V4 : return Hexagon::LDrih_indexed_shl_cPt_V4; case Hexagon::LDrih_indexed_shl_cdnNotPt_V4 : return Hexagon::LDrih_indexed_shl_cNotPt_V4; - case Hexagon::LDriuh_indexed_cdnPt_V4 : - return Hexagon::LDriuh_indexed_cPt_V4; - - case Hexagon::LDriuh_indexed_cdnNotPt_V4 : - return Hexagon::LDriuh_indexed_cNotPt_V4; - case Hexagon::LDriuh_indexed_shl_cdnPt_V4 : return Hexagon::LDriuh_indexed_shl_cPt_V4; case Hexagon::LDriuh_indexed_shl_cdnNotPt_V4 : return Hexagon::LDriuh_indexed_shl_cNotPt_V4; - case Hexagon::LDriw_indexed_cdnPt_V4 : - return Hexagon::LDriw_indexed_cPt_V4; - - case 
Hexagon::LDriw_indexed_cdnNotPt_V4 : - return Hexagon::LDriw_indexed_cNotPt_V4; - case Hexagon::LDriw_indexed_shl_cdnPt_V4 : return Hexagon::LDriw_indexed_shl_cPt_V4; @@ -1733,42 +1524,6 @@ static int GetDotOldOp(const int opc) { case Hexagon::LDw_GP_cdnNotPt_V4: return Hexagon::LDw_GP_cNotPt_V4; - case Hexagon::LDrid_GP_cdnPt_V4: - return Hexagon::LDrid_GP_cPt_V4; - - case Hexagon::LDrid_GP_cdnNotPt_V4: - return Hexagon::LDrid_GP_cNotPt_V4; - - case Hexagon::LDrib_GP_cdnPt_V4: - return Hexagon::LDrib_GP_cPt_V4; - - case Hexagon::LDrib_GP_cdnNotPt_V4: - return Hexagon::LDrib_GP_cNotPt_V4; - - case Hexagon::LDriub_GP_cdnPt_V4: - return Hexagon::LDriub_GP_cPt_V4; - - case Hexagon::LDriub_GP_cdnNotPt_V4: - return Hexagon::LDriub_GP_cNotPt_V4; - - case Hexagon::LDrih_GP_cdnPt_V4: - return Hexagon::LDrih_GP_cPt_V4; - - case Hexagon::LDrih_GP_cdnNotPt_V4: - return Hexagon::LDrih_GP_cNotPt_V4; - - case Hexagon::LDriuh_GP_cdnPt_V4: - return Hexagon::LDriuh_GP_cPt_V4; - - case Hexagon::LDriuh_GP_cdnNotPt_V4: - return Hexagon::LDriuh_GP_cNotPt_V4; - - case Hexagon::LDriw_GP_cdnPt_V4: - return Hexagon::LDriw_GP_cPt_V4; - - case Hexagon::LDriw_GP_cdnNotPt_V4: - return Hexagon::LDriw_GP_cNotPt_V4; - // Conditional add case Hexagon::ADD_ri_cdnPt : @@ -1902,16 +1657,6 @@ static int GetDotOldOp(const int opc) { case Hexagon::STb_GP_cNotPt_nv_V4: return Hexagon::STb_GP_cNotPt_V4; - case Hexagon::STrib_GP_cdnPt_nv_V4: - case Hexagon::STrib_GP_cdnPt_V4: - case Hexagon::STrib_GP_cPt_nv_V4: - return Hexagon::STrib_GP_cPt_V4; - - case Hexagon::STrib_GP_cdnNotPt_nv_V4: - case Hexagon::STrib_GP_cdnNotPt_V4: - case Hexagon::STrib_GP_cNotPt_nv_V4: - return Hexagon::STrib_GP_cNotPt_V4; - // Store new-value byte - unconditional case Hexagon::STrib_nv_V4: return Hexagon::STrib; @@ -1925,9 +1670,6 @@ static int GetDotOldOp(const int opc) { case Hexagon::STrib_shl_nv_V4: return Hexagon::STrib_shl_V4; - case Hexagon::STrib_GP_nv_V4: - return Hexagon::STrib_GP_V4; - case Hexagon::STb_GP_nv_V4: return Hexagon::STb_GP_V4; @@ -1991,16 +1733,6 @@ static int GetDotOldOp(const int opc) { case Hexagon::STh_GP_cNotPt_nv_V4: return Hexagon::STh_GP_cNotPt_V4; - case Hexagon::STrih_GP_cdnPt_nv_V4: - case Hexagon::STrih_GP_cdnPt_V4: - case Hexagon::STrih_GP_cPt_nv_V4: - return Hexagon::STrih_GP_cPt_V4; - - case Hexagon::STrih_GP_cdnNotPt_nv_V4: - case Hexagon::STrih_GP_cdnNotPt_V4: - case Hexagon::STrih_GP_cNotPt_nv_V4: - return Hexagon::STrih_GP_cNotPt_V4; - // Store new-value halfword - unconditional case Hexagon::STrih_nv_V4: @@ -2015,9 +1747,6 @@ static int GetDotOldOp(const int opc) { case Hexagon::STrih_shl_nv_V4: return Hexagon::STrih_shl_V4; - case Hexagon::STrih_GP_nv_V4: - return Hexagon::STrih_GP_V4; - case Hexagon::STh_GP_nv_V4: return Hexagon::STh_GP_V4; @@ -2082,16 +1811,6 @@ static int GetDotOldOp(const int opc) { case Hexagon::STw_GP_cNotPt_nv_V4: return Hexagon::STw_GP_cNotPt_V4; - case Hexagon::STriw_GP_cdnPt_nv_V4: - case Hexagon::STriw_GP_cdnPt_V4: - case Hexagon::STriw_GP_cPt_nv_V4: - return Hexagon::STriw_GP_cPt_V4; - - case Hexagon::STriw_GP_cdnNotPt_nv_V4: - case Hexagon::STriw_GP_cdnNotPt_V4: - case Hexagon::STriw_GP_cNotPt_nv_V4: - return Hexagon::STriw_GP_cNotPt_V4; - // Store new-value word - unconditional case Hexagon::STriw_nv_V4: @@ -2106,9 +1825,6 @@ static int GetDotOldOp(const int opc) { case Hexagon::STriw_shl_nv_V4: return Hexagon::STriw_shl_V4; - case Hexagon::STriw_GP_nv_V4: - return Hexagon::STriw_GP_V4; - case Hexagon::STw_GP_nv_V4: return Hexagon::STw_GP_V4; @@ -2147,11 +1863,6 @@ static int 
GetDotOldOp(const int opc) { case Hexagon::STd_GP_cdnNotPt_V4 : return Hexagon::STd_GP_cNotPt_V4; - case Hexagon::STrid_GP_cdnPt_V4 : - return Hexagon::STrid_GP_cPt_V4; - - case Hexagon::STrid_GP_cdnNotPt_V4 : - return Hexagon::STrid_GP_cNotPt_V4; } } @@ -2249,28 +1960,16 @@ static bool GetPredicateSense(MachineInstr* MI, case Hexagon::LDriub_indexed_cdnPt : case Hexagon::POST_LDriub_cPt : case Hexagon::POST_LDriub_cdnPt_V4 : - case Hexagon::LDrid_indexed_cPt_V4 : - case Hexagon::LDrid_indexed_cdnPt_V4 : case Hexagon::LDrid_indexed_shl_cPt_V4 : case Hexagon::LDrid_indexed_shl_cdnPt_V4 : - case Hexagon::LDrib_indexed_cPt_V4 : - case Hexagon::LDrib_indexed_cdnPt_V4 : case Hexagon::LDrib_indexed_shl_cPt_V4 : case Hexagon::LDrib_indexed_shl_cdnPt_V4 : - case Hexagon::LDriub_indexed_cPt_V4 : - case Hexagon::LDriub_indexed_cdnPt_V4 : case Hexagon::LDriub_indexed_shl_cPt_V4 : case Hexagon::LDriub_indexed_shl_cdnPt_V4 : - case Hexagon::LDrih_indexed_cPt_V4 : - case Hexagon::LDrih_indexed_cdnPt_V4 : case Hexagon::LDrih_indexed_shl_cPt_V4 : case Hexagon::LDrih_indexed_shl_cdnPt_V4 : - case Hexagon::LDriuh_indexed_cPt_V4 : - case Hexagon::LDriuh_indexed_cdnPt_V4 : case Hexagon::LDriuh_indexed_shl_cPt_V4 : case Hexagon::LDriuh_indexed_shl_cdnPt_V4 : - case Hexagon::LDriw_indexed_cPt_V4 : - case Hexagon::LDriw_indexed_cdnPt_V4 : case Hexagon::LDriw_indexed_shl_cPt_V4 : case Hexagon::LDriw_indexed_shl_cdnPt_V4 : case Hexagon::ADD_ri_cPt : @@ -2299,42 +1998,22 @@ static bool GetPredicateSense(MachineInstr* MI, case Hexagon::ZXTB_cdnPt_V4 : case Hexagon::ZXTH_cPt_V4 : case Hexagon::ZXTH_cdnPt_V4 : - case Hexagon::LDrid_GP_cPt_V4 : - case Hexagon::LDrib_GP_cPt_V4 : - case Hexagon::LDriub_GP_cPt_V4 : - case Hexagon::LDrih_GP_cPt_V4 : - case Hexagon::LDriuh_GP_cPt_V4 : - case Hexagon::LDriw_GP_cPt_V4 : case Hexagon::LDd_GP_cPt_V4 : case Hexagon::LDb_GP_cPt_V4 : case Hexagon::LDub_GP_cPt_V4 : case Hexagon::LDh_GP_cPt_V4 : case Hexagon::LDuh_GP_cPt_V4 : case Hexagon::LDw_GP_cPt_V4 : - case Hexagon::STrid_GP_cPt_V4 : - case Hexagon::STrib_GP_cPt_V4 : - case Hexagon::STrih_GP_cPt_V4 : - case Hexagon::STriw_GP_cPt_V4 : case Hexagon::STd_GP_cPt_V4 : case Hexagon::STb_GP_cPt_V4 : case Hexagon::STh_GP_cPt_V4 : case Hexagon::STw_GP_cPt_V4 : - case Hexagon::LDrid_GP_cdnPt_V4 : - case Hexagon::LDrib_GP_cdnPt_V4 : - case Hexagon::LDriub_GP_cdnPt_V4 : - case Hexagon::LDrih_GP_cdnPt_V4 : - case Hexagon::LDriuh_GP_cdnPt_V4 : - case Hexagon::LDriw_GP_cdnPt_V4 : case Hexagon::LDd_GP_cdnPt_V4 : case Hexagon::LDb_GP_cdnPt_V4 : case Hexagon::LDub_GP_cdnPt_V4 : case Hexagon::LDh_GP_cdnPt_V4 : case Hexagon::LDuh_GP_cdnPt_V4 : case Hexagon::LDw_GP_cdnPt_V4 : - case Hexagon::STrid_GP_cdnPt_V4 : - case Hexagon::STrib_GP_cdnPt_V4 : - case Hexagon::STrih_GP_cdnPt_V4 : - case Hexagon::STriw_GP_cdnPt_V4 : case Hexagon::STd_GP_cdnPt_V4 : case Hexagon::STb_GP_cdnPt_V4 : case Hexagon::STh_GP_cdnPt_V4 : @@ -2420,28 +2099,16 @@ static bool GetPredicateSense(MachineInstr* MI, case Hexagon::LDriub_indexed_cdnNotPt : case Hexagon::POST_LDriub_cNotPt : case Hexagon::POST_LDriub_cdnNotPt_V4 : - case Hexagon::LDrid_indexed_cNotPt_V4 : - case Hexagon::LDrid_indexed_cdnNotPt_V4 : case Hexagon::LDrid_indexed_shl_cNotPt_V4 : case Hexagon::LDrid_indexed_shl_cdnNotPt_V4 : - case Hexagon::LDrib_indexed_cNotPt_V4 : - case Hexagon::LDrib_indexed_cdnNotPt_V4 : case Hexagon::LDrib_indexed_shl_cNotPt_V4 : case Hexagon::LDrib_indexed_shl_cdnNotPt_V4 : - case Hexagon::LDriub_indexed_cNotPt_V4 : - case Hexagon::LDriub_indexed_cdnNotPt_V4 : case 
Hexagon::LDriub_indexed_shl_cNotPt_V4 : case Hexagon::LDriub_indexed_shl_cdnNotPt_V4 : - case Hexagon::LDrih_indexed_cNotPt_V4 : - case Hexagon::LDrih_indexed_cdnNotPt_V4 : case Hexagon::LDrih_indexed_shl_cNotPt_V4 : case Hexagon::LDrih_indexed_shl_cdnNotPt_V4 : - case Hexagon::LDriuh_indexed_cNotPt_V4 : - case Hexagon::LDriuh_indexed_cdnNotPt_V4 : case Hexagon::LDriuh_indexed_shl_cNotPt_V4 : case Hexagon::LDriuh_indexed_shl_cdnNotPt_V4 : - case Hexagon::LDriw_indexed_cNotPt_V4 : - case Hexagon::LDriw_indexed_cdnNotPt_V4 : case Hexagon::LDriw_indexed_shl_cNotPt_V4 : case Hexagon::LDriw_indexed_shl_cdnNotPt_V4 : case Hexagon::ADD_ri_cNotPt : @@ -2471,42 +2138,22 @@ static bool GetPredicateSense(MachineInstr* MI, case Hexagon::ZXTH_cNotPt_V4 : case Hexagon::ZXTH_cdnNotPt_V4 : - case Hexagon::LDrid_GP_cNotPt_V4 : - case Hexagon::LDrib_GP_cNotPt_V4 : - case Hexagon::LDriub_GP_cNotPt_V4 : - case Hexagon::LDrih_GP_cNotPt_V4 : - case Hexagon::LDriuh_GP_cNotPt_V4 : - case Hexagon::LDriw_GP_cNotPt_V4 : case Hexagon::LDd_GP_cNotPt_V4 : case Hexagon::LDb_GP_cNotPt_V4 : case Hexagon::LDub_GP_cNotPt_V4 : case Hexagon::LDh_GP_cNotPt_V4 : case Hexagon::LDuh_GP_cNotPt_V4 : case Hexagon::LDw_GP_cNotPt_V4 : - case Hexagon::STrid_GP_cNotPt_V4 : - case Hexagon::STrib_GP_cNotPt_V4 : - case Hexagon::STrih_GP_cNotPt_V4 : - case Hexagon::STriw_GP_cNotPt_V4 : case Hexagon::STd_GP_cNotPt_V4 : case Hexagon::STb_GP_cNotPt_V4 : case Hexagon::STh_GP_cNotPt_V4 : case Hexagon::STw_GP_cNotPt_V4 : - case Hexagon::LDrid_GP_cdnNotPt_V4 : - case Hexagon::LDrib_GP_cdnNotPt_V4 : - case Hexagon::LDriub_GP_cdnNotPt_V4 : - case Hexagon::LDrih_GP_cdnNotPt_V4 : - case Hexagon::LDriuh_GP_cdnNotPt_V4 : - case Hexagon::LDriw_GP_cdnNotPt_V4 : case Hexagon::LDd_GP_cdnNotPt_V4 : case Hexagon::LDb_GP_cdnNotPt_V4 : case Hexagon::LDub_GP_cdnNotPt_V4 : case Hexagon::LDh_GP_cdnNotPt_V4 : case Hexagon::LDuh_GP_cdnNotPt_V4 : case Hexagon::LDw_GP_cdnNotPt_V4 : - case Hexagon::STrid_GP_cdnNotPt_V4 : - case Hexagon::STrib_GP_cdnNotPt_V4 : - case Hexagon::STrih_GP_cdnNotPt_V4 : - case Hexagon::STriw_GP_cdnNotPt_V4 : case Hexagon::STd_GP_cdnNotPt_V4 : case Hexagon::STb_GP_cdnNotPt_V4 : case Hexagon::STh_GP_cdnNotPt_V4 : @@ -2563,28 +2210,16 @@ bool HexagonPacketizerList::isDotNewInst(MachineInstr* MI) { case Hexagon::POST_LDriub_cdnPt_V4 : case Hexagon::POST_LDriub_cdnNotPt_V4 : - case Hexagon::LDrid_indexed_cdnPt_V4 : - case Hexagon::LDrid_indexed_cdnNotPt_V4 : case Hexagon::LDrid_indexed_shl_cdnPt_V4 : case Hexagon::LDrid_indexed_shl_cdnNotPt_V4 : - case Hexagon::LDrib_indexed_cdnPt_V4 : - case Hexagon::LDrib_indexed_cdnNotPt_V4 : case Hexagon::LDrib_indexed_shl_cdnPt_V4 : case Hexagon::LDrib_indexed_shl_cdnNotPt_V4 : - case Hexagon::LDriub_indexed_cdnPt_V4 : - case Hexagon::LDriub_indexed_cdnNotPt_V4 : case Hexagon::LDriub_indexed_shl_cdnPt_V4 : case Hexagon::LDriub_indexed_shl_cdnNotPt_V4 : - case Hexagon::LDrih_indexed_cdnPt_V4 : - case Hexagon::LDrih_indexed_cdnNotPt_V4 : case Hexagon::LDrih_indexed_shl_cdnPt_V4 : case Hexagon::LDrih_indexed_shl_cdnNotPt_V4 : - case Hexagon::LDriuh_indexed_cdnPt_V4 : - case Hexagon::LDriuh_indexed_cdnNotPt_V4 : case Hexagon::LDriuh_indexed_shl_cdnPt_V4 : case Hexagon::LDriuh_indexed_shl_cdnNotPt_V4 : - case Hexagon::LDriw_indexed_cdnPt_V4 : - case Hexagon::LDriw_indexed_cdnNotPt_V4 : case Hexagon::LDriw_indexed_shl_cdnPt_V4 : case Hexagon::LDriw_indexed_shl_cdnNotPt_V4 : @@ -2680,27 +2315,7 @@ bool HexagonPacketizerList::isDotNewInst(MachineInstr* MI) { case Hexagon::LDuh_GP_cdnNotPt_V4: case 
Hexagon::LDw_GP_cdnPt_V4:
      case Hexagon::LDw_GP_cdnNotPt_V4:
-     case Hexagon::LDrid_GP_cdnPt_V4:
-     case Hexagon::LDrid_GP_cdnNotPt_V4:
-     case Hexagon::LDrib_GP_cdnPt_V4:
-     case Hexagon::LDrib_GP_cdnNotPt_V4:
-     case Hexagon::LDriub_GP_cdnPt_V4:
-     case Hexagon::LDriub_GP_cdnNotPt_V4:
-     case Hexagon::LDrih_GP_cdnPt_V4:
-     case Hexagon::LDrih_GP_cdnNotPt_V4:
-     case Hexagon::LDriuh_GP_cdnPt_V4:
-     case Hexagon::LDriuh_GP_cdnNotPt_V4:
-     case Hexagon::LDriw_GP_cdnPt_V4:
-     case Hexagon::LDriw_GP_cdnNotPt_V4:
-
-     case Hexagon::STrid_GP_cdnPt_V4:
-     case Hexagon::STrid_GP_cdnNotPt_V4:
-     case Hexagon::STrib_GP_cdnPt_V4:
-     case Hexagon::STrib_GP_cdnNotPt_V4:
-     case Hexagon::STrih_GP_cdnPt_V4:
-     case Hexagon::STrih_GP_cdnNotPt_V4:
-     case Hexagon::STriw_GP_cdnPt_V4:
-     case Hexagon::STriw_GP_cdnNotPt_V4:
+
      case Hexagon::STd_GP_cdnPt_V4:
      case Hexagon::STd_GP_cdnNotPt_V4:
      case Hexagon::STb_GP_cdnPt_V4:
diff --git a/lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.cpp b/lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.cpp
index c700354..36da6df 100644
--- a/lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.cpp
+++ b/lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.cpp
@@ -12,14 +12,14 @@
 //===----------------------------------------------------------------------===//
 
 #define DEBUG_TYPE "asm-printer"
-#include "HexagonInstPrinter.h"
-#include "Hexagon.h"
 #include "HexagonAsmPrinter.h"
-#include "HexagonMCInst.h"
+#include "Hexagon.h"
+#include "HexagonInstPrinter.h"
+#include "MCTargetDesc/HexagonMCInst.h"
+#include "llvm/MC/MCInst.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCInst.h"
 #include "llvm/Support/raw_ostream.h"
 #include <cstdio>
@@ -28,6 +28,8 @@ using namespace llvm;
 #define GET_INSTRUCTION_NAME
 #include "HexagonGenAsmWriter.inc"
 
+const char HexagonInstPrinter::PacketPadding = '\t';
+
 StringRef HexagonInstPrinter::getOpcodeName(unsigned Opcode) const {
   return MII.getName(Opcode);
 }
@@ -43,43 +45,42 @@ void HexagonInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
 
 void HexagonInstPrinter::printInst(const HexagonMCInst *MI, raw_ostream &O,
                                    StringRef Annot) {
-  const char packetPadding[] = " ";
   const char startPacket = '{', endPacket = '}';
 
   // TODO: add outer HW loop when it's supported too.
   if (MI->getOpcode() == Hexagon::ENDLOOP0) {
     // Ending a hardware loop is different from ending a regular packet.
-    assert(MI->isEndPacket() && "Loop end must also end the packet");
+    assert(MI->isPacketEnd() && "Loop-end must also end the packet");
 
-    if (MI->isStartPacket()) {
+    if (MI->isPacketStart()) {
       // There must be a packet to end a loop.
       // FIXME: when shuffling is always run, this shouldn't be needed.
       HexagonMCInst Nop;
       StringRef NoAnnot;
 
       Nop.setOpcode (Hexagon::NOP);
-      Nop.setStartPacket (MI->isStartPacket());
+      Nop.setPacketStart (MI->isPacketStart());
       printInst (&Nop, O, NoAnnot);
     }
 
     // Close the packet.
-    if (MI->isEndPacket())
-      O << packetPadding << endPacket;
+    if (MI->isPacketEnd())
+      O << PacketPadding << endPacket;
 
     printInstruction(MI, O);
   } else {
     // Prefix the insn opening the packet.
-    if (MI->isStartPacket())
-      O << packetPadding << startPacket << '\n';
+    if (MI->isPacketStart())
+      O << PacketPadding << startPacket << '\n';
 
     printInstruction(MI, O);
 
     // Suffix the insn closing the packet.
-    if (MI->isEndPacket())
+    if (MI->isPacketEnd())
      // Suffix the packet in a new line always, since the GNU assembler has
      // issues with a closing brace on the same line as CONST{32,64}.
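     // (Illustrative sketch, not part of this patch: with the bracketing
     // above, a two-instruction packet renders as
     //     {
     //       insn0
     //       insn1
     //     }
     // where PacketPadding ('\t') precedes each brace and the closing brace
     // is always forced onto its own line, per the GNU-assembler workaround
     // noted here. The instruction names are placeholders.)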
- O << '\n' << packetPadding << endPacket; + O << '\n' << PacketPadding << endPacket; } printAnnotation(O, Annot); @@ -102,12 +103,23 @@ void HexagonInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, void HexagonInstPrinter::printImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) const { - O << MI->getOperand(OpNo).getImm(); + const MCOperand& MO = MI->getOperand(OpNo); + + if(MO.isExpr()) { + O << *MO.getExpr(); + } else if(MO.isImm()) { + O << MI->getOperand(OpNo).getImm(); + } else { + llvm_unreachable("Unknown operand"); + } } void HexagonInstPrinter::printExtOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) const { - O << MI->getOperand(OpNo).getImm(); + const HexagonMCInst *HMCI = static_cast<const HexagonMCInst*>(MI); + if (HMCI->isConstExtended()) + O << "#"; + printOperand(MI, OpNo, O); } void HexagonInstPrinter::printUnsignedImmOperand(const MCInst *MI, diff --git a/lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.h b/lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.h index 902a323..d0cef68 100644 --- a/lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.h +++ b/lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.h @@ -14,16 +14,18 @@ #ifndef HEXAGONINSTPRINTER_H #define HEXAGONINSTPRINTER_H -#include "HexagonMCInst.h" #include "llvm/MC/MCInstPrinter.h" +#include "llvm/MC/MCInstrInfo.h" namespace llvm { + class HexagonMCInst; + class HexagonInstPrinter : public MCInstPrinter { public: explicit HexagonInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, const MCRegisterInfo &MRI) - : MCInstPrinter(MAI, MII, MRI) {} + : MCInstPrinter(MAI, MII, MRI), MII(MII) {} virtual void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot); void printInst(const HexagonMCInst *MI, raw_ostream &O, StringRef Annot); @@ -65,10 +67,19 @@ namespace llvm { void printSymbolLo(const MCInst *MI, unsigned OpNo, raw_ostream &O) const { printSymbol(MI, OpNo, O, false); } - bool isConstExtended(const MCInst *MI) const; + const MCInstrInfo &getMII() const { + return MII; + } + protected: void printSymbol(const MCInst *MI, unsigned OpNo, raw_ostream &O, bool hi) const; + + static const char PacketPadding; + + private: + const MCInstrInfo &MII; + }; } // end namespace llvm diff --git a/lib/Target/Hexagon/InstPrinter/LLVMBuild.txt b/lib/Target/Hexagon/InstPrinter/LLVMBuild.txt index 8678401..59849aa 100644 --- a/lib/Target/Hexagon/InstPrinter/LLVMBuild.txt +++ b/lib/Target/Hexagon/InstPrinter/LLVMBuild.txt @@ -19,5 +19,5 @@ type = Library name = HexagonAsmPrinter parent = Hexagon -required_libraries = MC Support +required_libraries = HexagonDesc MC Support add_to_library_groups = Hexagon diff --git a/lib/Target/Hexagon/MCTargetDesc/CMakeLists.txt b/lib/Target/Hexagon/MCTargetDesc/CMakeLists.txt index 8e3da99..62b9b60 100644 --- a/lib/Target/Hexagon/MCTargetDesc/CMakeLists.txt +++ b/lib/Target/Hexagon/MCTargetDesc/CMakeLists.txt @@ -1,6 +1,7 @@ add_llvm_library(LLVMHexagonDesc - HexagonMCTargetDesc.cpp HexagonMCAsmInfo.cpp + HexagonMCInst.cpp + HexagonMCTargetDesc.cpp ) add_dependencies(LLVMHexagonDesc HexagonCommonTableGen) diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h b/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h index 9fc826f..5f9718b 100644 --- a/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h @@ -17,6 +17,9 @@ #ifndef HEXAGONBASEINFO_H #define HEXAGONBASEINFO_H +#include "HexagonMCTargetDesc.h" +#include "llvm/Support/ErrorHandling.h" + namespace llvm { /// HexagonII - This namespace holds all 
of the target specific flags that
@@ -28,19 +31,19 @@ namespace HexagonII {
   // Insn types.
   // *** Must match HexagonInstrFormat*.td ***
   enum Type {
-    TypePSEUDO = 0,
-    TypeALU32 = 1,
-    TypeCR = 2,
-    TypeJR = 3,
-    TypeJ = 4,
-    TypeLD = 5,
-    TypeST = 6,
-    TypeSYSTEM = 7,
-    TypeXTYPE = 8,
-    TypeMEMOP = 9,
-    TypeNV = 10,
-    TypePREFIX = 30, // Such as extenders.
-    TypeMARKER = 31 // Such as end of a HW loop.
+    TypePSEUDO = 0,
+    TypeALU32 = 1,
+    TypeCR = 2,
+    TypeJR = 3,
+    TypeJ = 4,
+    TypeLD = 5,
+    TypeST = 6,
+    TypeSYSTEM = 7,
+    TypeXTYPE = 8,
+    TypeMEMOP = 9,
+    TypeNV = 10,
+    TypePREFIX = 30, // Such as extenders.
+    TypeENDLOOP = 31 // Such as end of a HW loop.
   };
 
   enum SubTarget {
@@ -65,6 +68,14 @@ namespace HexagonII {
     BaseRegOffset = 5 // Indirect with register offset
   };
 
+  enum MemAccessSize {
+    NoMemAccess = 0, // Not a memory access instruction.
+    ByteAccess = 1, // Byte access instruction (memb).
+    HalfWordAccess = 2, // Half word access instruction (memh).
+    WordAccess = 3, // Word access instruction (memw).
+    DoubleWordAccess = 4 // Double word access instruction (memd)
+  };
+
   // MCInstrDesc TSFlags
   // *** Must match HexagonInstrFormat*.td ***
   enum {
@@ -79,46 +90,67 @@ namespace HexagonII {
     // Predicated instructions.
     PredicatedPos = 6,
     PredicatedMask = 0x1,
-    PredicatedNewPos = 7,
+    PredicatedFalsePos = 7,
+    PredicatedFalseMask = 0x1,
+    PredicatedNewPos = 8,
     PredicatedNewMask = 0x1,
 
-    // Stores that can be newified.
-    mayNVStorePos = 8,
+    // New-Value consumer instructions.
+    NewValuePos = 9,
+    NewValueMask = 0x1,
+
+    // New-Value producer instructions.
+    hasNewValuePos = 10,
+    hasNewValueMask = 0x1,
+
+    // Which operand consumes or produces a new value.
+    NewValueOpPos = 11,
+    NewValueOpMask = 0x7,
+
+    // Which bits encode the new value.
+    NewValueBitsPos = 14,
+    NewValueBitsMask = 0x3,
+
+    // Stores that can become new-value stores.
+    mayNVStorePos = 16,
     mayNVStoreMask = 0x1,
 
-    // Dot new value store instructions.
-    NVStorePos = 9,
+    // New-value store instructions.
+    NVStorePos = 17,
     NVStoreMask = 0x1,
 
     // Extendable insns.
-    ExtendablePos = 10,
+    ExtendablePos = 18,
     ExtendableMask = 0x1,
 
     // Insns must be extended.
-    ExtendedPos = 11,
+    ExtendedPos = 19,
     ExtendedMask = 0x1,
 
     // Which operand may be extended.
-    ExtendableOpPos = 12,
+    ExtendableOpPos = 20,
     ExtendableOpMask = 0x7,
 
     // Signed or unsigned range.
-    ExtentSignedPos = 15,
+    ExtentSignedPos = 23,
     ExtentSignedMask = 0x1,
 
     // Number of bits of range before extending operand.
-    ExtentBitsPos = 16,
+    ExtentBitsPos = 24,
     ExtentBitsMask = 0x1f,
 
     // Valid subtargets
-    validSubTargetPos = 21,
+    validSubTargetPos = 29,
     validSubTargetMask = 0xf,
 
-    // Addressing mode for load/store instructions
-    AddrModePos = 25,
-    AddrModeMask = 0xf
+    // Addressing mode for load/store instructions.
+    AddrModePos = 33,
+    AddrModeMask = 0x7,
 
-  };
+    // Access size of memory access instructions (load/store).
+    MemAccessSizePos = 36,
+    MemAccesSizeMask = 0x7
+  };
 
   // *** The code above must match HexagonInstrFormat*.td *** //
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCInst.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCInst.cpp
new file mode 100644
index 0000000..9260b4a
--- /dev/null
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCInst.cpp
@@ -0,0 +1,175 @@
+//===- HexagonMCInst.cpp - Hexagon sub-class of MCInst --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// +//===----------------------------------------------------------------------===// +// +// This class extends MCInst to allow some Hexagon VLIW annotations. +// +//===----------------------------------------------------------------------===// + +#include "HexagonInstrInfo.h" +#include "MCTargetDesc/HexagonBaseInfo.h" +#include "MCTargetDesc/HexagonMCInst.h" +#include "MCTargetDesc/HexagonMCTargetDesc.h" + +using namespace llvm; + +// Return the slots used by the insn. +unsigned HexagonMCInst::getUnits(const HexagonTargetMachine* TM) const { + const HexagonInstrInfo* QII = TM->getInstrInfo(); + const InstrItineraryData* II = TM->getInstrItineraryData(); + const InstrStage* + IS = II->beginStage(QII->get(this->getOpcode()).getSchedClass()); + + return (IS->getUnits()); +} + +// Return the Hexagon ISA class for the insn. +unsigned HexagonMCInst::getType() const { + const uint64_t F = MCID->TSFlags; + + return ((F >> HexagonII::TypePos) & HexagonII::TypeMask); +} + +// Return whether the insn is an actual insn. +bool HexagonMCInst::isCanon() const { + return (!MCID->isPseudo() && + !isPrefix() && + getType() != HexagonII::TypeENDLOOP); +} + +// Return whether the insn is a prefix. +bool HexagonMCInst::isPrefix() const { + return (getType() == HexagonII::TypePREFIX); +} + +// Return whether the insn is solo, i.e., cannot be in a packet. +bool HexagonMCInst::isSolo() const { + const uint64_t F = MCID->TSFlags; + return ((F >> HexagonII::SoloPos) & HexagonII::SoloMask); +} + +// Return whether the insn is a new-value consumer. +bool HexagonMCInst::isNewValue() const { + const uint64_t F = MCID->TSFlags; + return ((F >> HexagonII::NewValuePos) & HexagonII::NewValueMask); +} + +// Return whether the instruction is a legal new-value producer. +bool HexagonMCInst::hasNewValue() const { + const uint64_t F = MCID->TSFlags; + return ((F >> HexagonII::hasNewValuePos) & HexagonII::hasNewValueMask); +} + +// Return the operand that consumes or produces a new value. +const MCOperand& HexagonMCInst::getNewValue() const { + const uint64_t F = MCID->TSFlags; + const unsigned O = (F >> HexagonII::NewValueOpPos) & + HexagonII::NewValueOpMask; + const MCOperand& MCO = getOperand(O); + + assert ((isNewValue() || hasNewValue()) && MCO.isReg()); + return (MCO); +} + +// Return whether the instruction needs to be constant extended. +// 1) Always return true if the instruction has 'isExtended' flag set. +// +// isExtendable: +// 2) For immediate extended operands, return true only if the value is +// out-of-range. +// 3) For global address, always return true. + +bool HexagonMCInst::isConstExtended(void) const { + if (isExtended()) + return true; + + if (!isExtendable()) + return false; + + short ExtOpNum = getCExtOpNum(); + int MinValue = getMinValue(); + int MaxValue = getMaxValue(); + const MCOperand& MO = getOperand(ExtOpNum); + + // We could be using an instruction with an extendable immediate and shoehorn + // a global address into it. If it is a global address it will be constant + // extended. We do this for COMBINE. + // We currently only handle isGlobal() because it is the only kind of + // object we are going to end up with here for now. + // In the future we probably should add isSymbol(), etc. + if (MO.isExpr()) + return true; + + // If the extendable operand is not 'Immediate' type, the instruction should + // have 'isExtended' flag set. 
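// (Worked example, illustrative numbers only: for a signed operand with an
// 8-bit extent, getMinValue()/getMaxValue() below give the range
// [-128, 127], so an immediate of 200 would be constant-extended while an
// immediate of 100 would not.)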
+ assert(MO.isImm() && "Extendable operand must be Immediate type"); + + int ImmValue = MO.getImm(); + return (ImmValue < MinValue || ImmValue > MaxValue); +} + +// Return whether the instruction must be always extended. +bool HexagonMCInst::isExtended(void) const { + const uint64_t F = MCID->TSFlags; + return (F >> HexagonII::ExtendedPos) & HexagonII::ExtendedMask; +} + +// Return true if the instruction may be extended based on the operand value. +bool HexagonMCInst::isExtendable(void) const { + const uint64_t F = MCID->TSFlags; + return (F >> HexagonII::ExtendablePos) & HexagonII::ExtendableMask; +} + +// Return number of bits in the constant extended operand. +unsigned HexagonMCInst::getBitCount(void) const { + const uint64_t F = MCID->TSFlags; + return ((F >> HexagonII::ExtentBitsPos) & HexagonII::ExtentBitsMask); +} + +// Return constant extended operand number. +unsigned short HexagonMCInst::getCExtOpNum(void) const { + const uint64_t F = MCID->TSFlags; + return ((F >> HexagonII::ExtendableOpPos) & HexagonII::ExtendableOpMask); +} + +// Return whether the operand can be constant extended. +bool HexagonMCInst::isOperandExtended(const unsigned short OperandNum) const { + const uint64_t F = MCID->TSFlags; + return ((F >> HexagonII::ExtendableOpPos) & HexagonII::ExtendableOpMask) + == OperandNum; +} + +// Return the min value that a constant extendable operand can have +// without being extended. +int HexagonMCInst::getMinValue(void) const { + const uint64_t F = MCID->TSFlags; + unsigned isSigned = (F >> HexagonII::ExtentSignedPos) + & HexagonII::ExtentSignedMask; + unsigned bits = (F >> HexagonII::ExtentBitsPos) + & HexagonII::ExtentBitsMask; + + if (isSigned) // if value is signed + return -1 << (bits - 1); + else + return 0; +} + +// Return the max value that a constant extendable operand can have +// without being extended. +int HexagonMCInst::getMaxValue(void) const { + const uint64_t F = MCID->TSFlags; + unsigned isSigned = (F >> HexagonII::ExtentSignedPos) + & HexagonII::ExtentSignedMask; + unsigned bits = (F >> HexagonII::ExtentBitsPos) + & HexagonII::ExtentBitsMask; + + if (isSigned) // if value is signed + return ~(-1 << (bits - 1)); + else + return ~(-1 << bits); +} diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCInst.h b/lib/Target/Hexagon/MCTargetDesc/HexagonMCInst.h new file mode 100644 index 0000000..3ca71f0 --- /dev/null +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCInst.h @@ -0,0 +1,100 @@ +//===- HexagonMCInst.h - Hexagon sub-class of MCInst ----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This class extends MCInst to allow some VLIW annotations. +// +//===----------------------------------------------------------------------===// + +#ifndef HEXAGONMCINST_H +#define HEXAGONMCINST_H + +#include "HexagonTargetMachine.h" +#include "llvm/MC/MCInst.h" + +namespace llvm { + class MCOperand; + + class HexagonMCInst: public MCInst { + // MCID is set during instruction lowering. + // It is needed in order to access TSFlags for + // use in checking MC instruction properties. 
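// (All of the TSFlags queries in this class follow one pattern, sketched
// here with the Pos/Mask pairs from HexagonBaseInfo.h; "SomeField" is a
// placeholder, not a real field name:
//     bool Bit = (MCID->TSFlags >> HexagonII::SomeFieldPos) &
//                HexagonII::SomeFieldMask;)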
+ const MCInstrDesc *MCID; + + // Packet start and end markers + unsigned packetStart: 1, packetEnd: 1; + + public: + explicit HexagonMCInst(): + MCInst(), MCID(0), packetStart(0), packetEnd(0) {}; + HexagonMCInst(const MCInstrDesc& mcid): + MCInst(), MCID(&mcid), packetStart(0), packetEnd(0) {}; + + bool isPacketStart() const { return (packetStart); }; + bool isPacketEnd() const { return (packetEnd); }; + void setPacketStart(bool Y) { packetStart = Y; }; + void setPacketEnd(bool Y) { packetEnd = Y; }; + void resetPacket() { setPacketStart(false); setPacketEnd(false); }; + + // Return the slots used by the insn. + unsigned getUnits(const HexagonTargetMachine* TM) const; + + // Return the Hexagon ISA class for the insn. + unsigned getType() const; + + void setDesc(const MCInstrDesc& mcid) { MCID = &mcid; }; + const MCInstrDesc& getDesc(void) const { return *MCID; }; + + // Return whether the insn is an actual insn. + bool isCanon() const; + + // Return whether the insn is a prefix. + bool isPrefix() const; + + // Return whether the insn is solo, i.e., cannot be in a packet. + bool isSolo() const; + + // Return whether the instruction needs to be constant extended. + bool isConstExtended() const; + + // Return constant extended operand number. + unsigned short getCExtOpNum(void) const; + + // Return whether the insn is a new-value consumer. + bool isNewValue() const; + + // Return whether the instruction is a legal new-value producer. + bool hasNewValue() const; + + // Return the operand that consumes or produces a new value. + const MCOperand& getNewValue() const; + + // Return number of bits in the constant extended operand. + unsigned getBitCount(void) const; + + private: + // Return whether the instruction must be always extended. + bool isExtended() const; + + // Return true if the insn may be extended based on the operand value. + bool isExtendable() const; + + // Return true if the operand can be constant extended. + bool isOperandExtended(const unsigned short OperandNum) const; + + // Return the min value that a constant extendable operand can have + // without being extended. + int getMinValue() const; + + // Return the max value that a constant extendable operand can have + // without being extended. 
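// (Illustrative values: a signed 12-bit extent gives a maximum of 2047 and,
// via getMinValue(), a minimum of -2048; an unsigned 6-bit extent gives
// [0, 63].)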
+ int getMaxValue() const; + }; +} + +#endif diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp index 737789b..6b1d2d1 100644 --- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp @@ -13,11 +13,13 @@ #include "HexagonMCTargetDesc.h" #include "HexagonMCAsmInfo.h" +#include "InstPrinter/HexagonInstPrinter.h" +#include "llvm/MC/MachineLocation.h" #include "llvm/MC/MCCodeGenInfo.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" -#include "llvm/MC/MachineLocation.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/TargetRegistry.h" diff --git a/lib/Target/LLVMBuild.txt b/lib/Target/LLVMBuild.txt index f3a9c1c..c06e8bc 100644 --- a/lib/Target/LLVMBuild.txt +++ b/lib/Target/LLVMBuild.txt @@ -16,7 +16,7 @@ ;===------------------------------------------------------------------------===; [common] -subdirectories = ARM CppBackend Hexagon MBlaze MSP430 NVPTX Mips PowerPC R600 Sparc X86 XCore +subdirectories = AArch64 ARM CppBackend Hexagon MBlaze MSP430 NVPTX Mips PowerPC R600 Sparc X86 XCore ; This is a special group whose required libraries are extended (by llvm-build) ; with the best execution engine (the native JIT, if available, or the diff --git a/lib/Target/MBlaze/AsmParser/MBlazeAsmParser.cpp b/lib/Target/MBlaze/AsmParser/MBlazeAsmParser.cpp index 2ab163e..ad495ff 100644 --- a/lib/Target/MBlaze/AsmParser/MBlazeAsmParser.cpp +++ b/lib/Target/MBlaze/AsmParser/MBlazeAsmParser.cpp @@ -451,7 +451,7 @@ MBlazeOperand *MBlazeAsmParser::ParseImmediate() { case AsmToken::Minus: case AsmToken::Integer: case AsmToken::Identifier: - if (getParser().ParseExpression(EVal)) + if (getParser().parseExpression(EVal)) return 0; return MBlazeOperand::CreateImm(EVal, S, E); @@ -537,10 +537,10 @@ bool MBlazeAsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) { if (getLexer().isNot(AsmToken::EndOfStatement)) { for (;;) { const MCExpr *Value; - if (getParser().ParseExpression(Value)) + if (getParser().parseExpression(Value)) return true; - getParser().getStreamer().EmitValue(Value, Size, 0 /*addrspace*/); + getParser().getStreamer().EmitValue(Value, Size); if (getLexer().is(AsmToken::EndOfStatement)) break; diff --git a/lib/Target/MBlaze/MBlazeFrameLowering.cpp b/lib/Target/MBlaze/MBlazeFrameLowering.cpp index b6edbba..172304b 100644 --- a/lib/Target/MBlaze/MBlazeFrameLowering.cpp +++ b/lib/Target/MBlaze/MBlazeFrameLowering.cpp @@ -426,6 +426,45 @@ void MBlazeFrameLowering::emitEpilogue(MachineFunction &MF, } } +// Eliminate ADJCALLSTACKDOWN/ADJCALLSTACKUP pseudo instructions +void MBlazeFrameLowering:: +eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const { + const MBlazeInstrInfo &TII = + *static_cast<const MBlazeInstrInfo*>(MF.getTarget().getInstrInfo()); + if (!hasReservedCallFrame(MF)) { + // If we have a frame pointer, turn the adjcallstackup instruction into a + // 'addi r1, r1, -<amt>' and the adjcallstackdown instruction into + // 'addi r1, r1, <amt>' + MachineInstr *Old = I; + int Amount = Old->getOperand(0).getImm() + 4; + if (Amount != 0) { + // We need to keep the stack aligned properly. To do this, we round the + // amount of space needed for the outgoing arguments up to the next + // alignment boundary. 
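// (Round-up sketch for the code below, with illustrative numbers: for
// Align = 8 and Amount = 10, (10 + 8 - 1) / 8 * 8 = 16, the next 8-byte
// boundary; an already-aligned Amount such as 16 maps to itself.)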
+ unsigned Align = getStackAlignment(); + Amount = (Amount+Align-1)/Align*Align; + + MachineInstr *New; + if (Old->getOpcode() == MBlaze::ADJCALLSTACKDOWN) { + New = BuildMI(MF,Old->getDebugLoc(), TII.get(MBlaze::ADDIK),MBlaze::R1) + .addReg(MBlaze::R1).addImm(-Amount); + } else { + assert(Old->getOpcode() == MBlaze::ADJCALLSTACKUP); + New = BuildMI(MF,Old->getDebugLoc(), TII.get(MBlaze::ADDIK),MBlaze::R1) + .addReg(MBlaze::R1).addImm(Amount); + } + + // Replace the pseudo instruction with a new instruction... + MBB.insert(I, New); + } + } + + // Simply discard ADJCALLSTACKDOWN, ADJCALLSTACKUP instructions. + MBB.erase(I); +} + + void MBlazeFrameLowering:: processFunctionBeforeCalleeSavedScan(MachineFunction &MF, RegScavenger *RS) const { diff --git a/lib/Target/MBlaze/MBlazeFrameLowering.h b/lib/Target/MBlaze/MBlazeFrameLowering.h index 01e6578..f4228c5 100644 --- a/lib/Target/MBlaze/MBlazeFrameLowering.h +++ b/lib/Target/MBlaze/MBlazeFrameLowering.h @@ -39,6 +39,10 @@ public: void emitPrologue(MachineFunction &MF) const; void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const; + void eliminateCallFramePseudoInstr(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const; + bool hasFP(const MachineFunction &MF) const; int getFrameIndexOffset(const MachineFunction &MF, int FI) const; diff --git a/lib/Target/MBlaze/MBlazeISelLowering.cpp b/lib/Target/MBlaze/MBlazeISelLowering.cpp index 8a9f092..7664c60 100644 --- a/lib/Target/MBlaze/MBlazeISelLowering.cpp +++ b/lib/Target/MBlaze/MBlazeISelLowering.cpp @@ -81,6 +81,7 @@ MBlazeTargetLowering::MBlazeTargetLowering(MBlazeTargetMachine &TM) setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); setOperationAction(ISD::FSIN, MVT::f32, Expand); setOperationAction(ISD::FCOS, MVT::f32, Expand); + setOperationAction(ISD::FSINCOS, MVT::f32, Expand); setOperationAction(ISD::FPOWI, MVT::f32, Expand); setOperationAction(ISD::FPOW, MVT::f32, Expand); setOperationAction(ISD::FLOG, MVT::f32, Expand); @@ -1027,15 +1028,17 @@ LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, // Analize return values. CCInfo.AnalyzeReturn(Outs, RetCC_MBlaze); - // If this is the first return lowered for this function, add - // the regs to the liveout set for the function. - if (DAG.getMachineFunction().getRegInfo().liveout_empty()) { - for (unsigned i = 0; i != RVLocs.size(); ++i) - if (RVLocs[i].isRegLoc()) - DAG.getMachineFunction().getRegInfo().addLiveOut(RVLocs[i].getLocReg()); - } - SDValue Flag; + SmallVector<SDValue, 4> RetOps(1, Chain); + + // If this function is using the interrupt_handler calling convention + // then use "rtid r14, 0" otherwise use "rtsd r15, 8" + unsigned Ret = (CallConv == CallingConv::MBLAZE_INTR) ? MBlazeISD::IRet + : MBlazeISD::Ret; + unsigned Reg = (CallConv == CallingConv::MBLAZE_INTR) ? MBlaze::R14 + : MBlaze::R15; + RetOps.push_back(DAG.getRegister(Reg, MVT::i32)); + // Copy the result values into the output registers. for (unsigned i = 0; i != RVLocs.size(); ++i) { @@ -1048,20 +1051,16 @@ LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, // guarantee that all emitted copies are // stuck together, avoiding something bad Flag = Chain.getValue(1); + RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); } - // If this function is using the interrupt_handler calling convention - // then use "rtid r14, 0" otherwise use "rtsd r15, 8" - unsigned Ret = (CallConv == CallingConv::MBLAZE_INTR) ? 
MBlazeISD::IRet - : MBlazeISD::Ret; - unsigned Reg = (CallConv == CallingConv::MBLAZE_INTR) ? MBlaze::R14 - : MBlaze::R15; - SDValue DReg = DAG.getRegister(Reg, MVT::i32); + RetOps[0] = Chain; // Update chain. + // Add the flag if we have it. if (Flag.getNode()) - return DAG.getNode(Ret, dl, MVT::Other, Chain, DReg, Flag); + RetOps.push_back(Flag); - return DAG.getNode(Ret, dl, MVT::Other, Chain, DReg); + return DAG.getNode(Ret, dl, MVT::Other, &RetOps[0], RetOps.size()); } //===----------------------------------------------------------------------===// diff --git a/lib/Target/MBlaze/MBlazeInstrInfo.td b/lib/Target/MBlaze/MBlazeInstrInfo.td index 139bf71..f86bc0b 100644 --- a/lib/Target/MBlaze/MBlazeInstrInfo.td +++ b/lib/Target/MBlaze/MBlazeInstrInfo.td @@ -28,9 +28,9 @@ def SDT_MBCallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>; //===----------------------------------------------------------------------===// def MBlazeRet : SDNode<"MBlazeISD::Ret", SDT_MBlazeRet, - [SDNPHasChain, SDNPOptInGlue]>; + [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; def MBlazeIRet : SDNode<"MBlazeISD::IRet", SDT_MBlazeIRet, - [SDNPHasChain, SDNPOptInGlue]>; + [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; def MBlazeJmpLink : SDNode<"MBlazeISD::JmpLink",SDT_MBlazeJmpLink, [SDNPHasChain,SDNPOptInGlue,SDNPOutGlue, diff --git a/lib/Target/MBlaze/MBlazeRegisterInfo.cpp b/lib/Target/MBlaze/MBlazeRegisterInfo.cpp index ed06cc4..d0fd7dc 100644 --- a/lib/Target/MBlaze/MBlazeRegisterInfo.cpp +++ b/lib/Target/MBlaze/MBlazeRegisterInfo.cpp @@ -83,67 +83,21 @@ getReservedRegs(const MachineFunction &MF) const { return Reserved; } -// This function eliminate ADJCALLSTACKDOWN/ADJCALLSTACKUP pseudo instructions -void MBlazeRegisterInfo:: -eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, - MachineBasicBlock::iterator I) const { - const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); - - if (!TFI->hasReservedCallFrame(MF)) { - // If we have a frame pointer, turn the adjcallstackup instruction into a - // 'addi r1, r1, -<amt>' and the adjcallstackdown instruction into - // 'addi r1, r1, <amt>' - MachineInstr *Old = I; - int Amount = Old->getOperand(0).getImm() + 4; - if (Amount != 0) { - // We need to keep the stack aligned properly. To do this, we round the - // amount of space needed for the outgoing arguments up to the next - // alignment boundary. - unsigned Align = TFI->getStackAlignment(); - Amount = (Amount+Align-1)/Align*Align; - - MachineInstr *New; - if (Old->getOpcode() == MBlaze::ADJCALLSTACKDOWN) { - New = BuildMI(MF,Old->getDebugLoc(),TII.get(MBlaze::ADDIK),MBlaze::R1) - .addReg(MBlaze::R1).addImm(-Amount); - } else { - assert(Old->getOpcode() == MBlaze::ADJCALLSTACKUP); - New = BuildMI(MF,Old->getDebugLoc(),TII.get(MBlaze::ADDIK),MBlaze::R1) - .addReg(MBlaze::R1).addImm(Amount); - } - - // Replace the pseudo instruction with a new instruction... - MBB.insert(I, New); - } - } - - // Simply discard ADJCALLSTACKDOWN, ADJCALLSTACKUP instructions. - MBB.erase(I); -} - // FrameIndex represent objects inside a abstract stack. // We must replace FrameIndex with an stack/frame pointer // direct reference. 
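// (Context for the signature change below, illustrative rather than taken
// from this patch: generic frame-index elimination now locates the FI
// operand once and hands its index down, roughly
//     for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i)
//       if (MI.getOperand(i).isFI())
//         TRI->eliminateFrameIndex(II, SPAdj, i, RS);
// which is why the per-target operand-scanning loop is deleted in the hunk
// that follows.)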
void MBlazeRegisterInfo:: eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, - RegScavenger *RS) const { + unsigned FIOperandNum, RegScavenger *RS) const { MachineInstr &MI = *II; MachineFunction &MF = *MI.getParent()->getParent(); MachineFrameInfo *MFI = MF.getFrameInfo(); - - unsigned i = 0; - while (!MI.getOperand(i).isFI()) { - ++i; - assert(i < MI.getNumOperands() && - "Instr doesn't have FrameIndex operand!"); - } - - unsigned oi = i == 2 ? 1 : 2; + unsigned OFIOperandNum = FIOperandNum == 2 ? 1 : 2; DEBUG(dbgs() << "\nFunction : " << MF.getName() << "\n"; dbgs() << "<--------->\n" << MI); - int FrameIndex = MI.getOperand(i).getIndex(); + int FrameIndex = MI.getOperand(FIOperandNum).getIndex(); int stackSize = MFI->getStackSize(); int spOffset = MFI->getObjectOffset(FrameIndex); @@ -159,12 +113,12 @@ eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, // as explained on LowerFormalArguments, detect negative offsets // and adjust SPOffsets considering the final stack size. int Offset = (spOffset < 0) ? (stackSize - spOffset) : spOffset; - Offset += MI.getOperand(oi).getImm(); + Offset += MI.getOperand(OFIOperandNum).getImm(); DEBUG(dbgs() << "Offset : " << Offset << "\n" << "<--------->\n"); - MI.getOperand(oi).ChangeToImmediate(Offset); - MI.getOperand(i).ChangeToRegister(getFrameRegister(MF), false); + MI.getOperand(OFIOperandNum).ChangeToImmediate(Offset); + MI.getOperand(FIOperandNum).ChangeToRegister(getFrameRegister(MF), false); } void MBlazeRegisterInfo:: diff --git a/lib/Target/MBlaze/MBlazeRegisterInfo.h b/lib/Target/MBlaze/MBlazeRegisterInfo.h index 1d51162..99a2fac 100644 --- a/lib/Target/MBlaze/MBlazeRegisterInfo.h +++ b/lib/Target/MBlaze/MBlazeRegisterInfo.h @@ -50,13 +50,10 @@ struct MBlazeRegisterInfo : public MBlazeGenRegisterInfo { BitVector getReservedRegs(const MachineFunction &MF) const; - void eliminateCallFramePseudoInstr(MachineFunction &MF, - MachineBasicBlock &MBB, - MachineBasicBlock::iterator I) const; - /// Stack Frame Processing Methods void eliminateFrameIndex(MachineBasicBlock::iterator II, - int SPAdj, RegScavenger *RS = NULL) const; + int SPAdj, unsigned FIOperandNum, + RegScavenger *RS = NULL) const; void processFunctionBeforeFrameFinalized(MachineFunction &MF) const; diff --git a/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.cpp b/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.cpp index 2e328cb..3c95760 100644 --- a/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.cpp +++ b/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.cpp @@ -18,7 +18,7 @@ using namespace llvm; void MSP430MCAsmInfo::anchor() { } MSP430MCAsmInfo::MSP430MCAsmInfo(const Target &T, StringRef TT) { - PointerSize = 2; + PointerSize = CalleeSaveStackSlotSize = 2; PrivateGlobalPrefix = ".L"; WeakRefDirective ="\t.weak\t"; diff --git a/lib/Target/MSP430/MSP430FrameLowering.cpp b/lib/Target/MSP430/MSP430FrameLowering.cpp index aef45d8..ae2e556 100644 --- a/lib/Target/MSP430/MSP430FrameLowering.cpp +++ b/lib/Target/MSP430/MSP430FrameLowering.cpp @@ -222,13 +222,73 @@ MSP430FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB, return true; } +void MSP430FrameLowering:: +eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const { + const MSP430InstrInfo &TII = + *static_cast<const MSP430InstrInfo*>(MF.getTarget().getInstrInfo()); + unsigned StackAlign = getStackAlignment(); + + if (!hasReservedCallFrame(MF)) { + // If the stack pointer can be changed after prologue, turn the + // adjcallstackup 
instruction into a 'sub SPW, <amt>' and the + // adjcallstackdown instruction into 'add SPW, <amt>' + // TODO: consider using push / pop instead of sub + store / add + MachineInstr *Old = I; + uint64_t Amount = Old->getOperand(0).getImm(); + if (Amount != 0) { + // We need to keep the stack aligned properly. To do this, we round the + // amount of space needed for the outgoing arguments up to the next + // alignment boundary. + Amount = (Amount+StackAlign-1)/StackAlign*StackAlign; + + MachineInstr *New = 0; + if (Old->getOpcode() == TII.getCallFrameSetupOpcode()) { + New = BuildMI(MF, Old->getDebugLoc(), + TII.get(MSP430::SUB16ri), MSP430::SPW) + .addReg(MSP430::SPW).addImm(Amount); + } else { + assert(Old->getOpcode() == TII.getCallFrameDestroyOpcode()); + // factor out the amount the callee already popped. + uint64_t CalleeAmt = Old->getOperand(1).getImm(); + Amount -= CalleeAmt; + if (Amount) + New = BuildMI(MF, Old->getDebugLoc(), + TII.get(MSP430::ADD16ri), MSP430::SPW) + .addReg(MSP430::SPW).addImm(Amount); + } + + if (New) { + // The SRW implicit def is dead. + New->getOperand(3).setIsDead(); + + // Replace the pseudo instruction with a new instruction... + MBB.insert(I, New); + } + } + } else if (I->getOpcode() == TII.getCallFrameDestroyOpcode()) { + // If we are performing frame pointer elimination and if the callee pops + // something off the stack pointer, add it back. + if (uint64_t CalleeAmt = I->getOperand(1).getImm()) { + MachineInstr *Old = I; + MachineInstr *New = + BuildMI(MF, Old->getDebugLoc(), TII.get(MSP430::SUB16ri), + MSP430::SPW).addReg(MSP430::SPW).addImm(CalleeAmt); + // The SRW implicit def is dead. + New->getOperand(3).setIsDead(); + + MBB.insert(I, New); + } + } + + MBB.erase(I); +} + void MSP430FrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF) const { - const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); - // Create a frame entry for the FPW register that must be saved. - if (TFI->hasFP(MF)) { + if (hasFP(MF)) { int FrameIdx = MF.getFrameInfo()->CreateFixedObject(2, -4, true); (void)FrameIdx; assert(FrameIdx == MF.getFrameInfo()->getObjectIndexBegin() && diff --git a/lib/Target/MSP430/MSP430FrameLowering.h b/lib/Target/MSP430/MSP430FrameLowering.h index cb02545..a077dd7 100644 --- a/lib/Target/MSP430/MSP430FrameLowering.h +++ b/lib/Target/MSP430/MSP430FrameLowering.h @@ -35,6 +35,10 @@ public: void emitPrologue(MachineFunction &MF) const; void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const; + void eliminateCallFramePseudoInstr(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const; + bool spillCalleeSavedRegisters(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const std::vector<CalleeSavedInfo> &CSI, diff --git a/lib/Target/MSP430/MSP430ISelLowering.cpp b/lib/Target/MSP430/MSP430ISelLowering.cpp index 5a156c1..09cdf32 100644 --- a/lib/Target/MSP430/MSP430ISelLowering.cpp +++ b/lib/Target/MSP430/MSP430ISelLowering.cpp @@ -423,15 +423,8 @@ MSP430TargetLowering::LowerReturn(SDValue Chain, // Analize return values. CCInfo.AnalyzeReturn(Outs, RetCC_MSP430); - // If this is the first return lowered for this function, add the regs to the - // liveout set for the function. 
- if (DAG.getMachineFunction().getRegInfo().liveout_empty()) { - for (unsigned i = 0; i != RVLocs.size(); ++i) - if (RVLocs[i].isRegLoc()) - DAG.getMachineFunction().getRegInfo().addLiveOut(RVLocs[i].getLocReg()); - } - SDValue Flag; + SmallVector<SDValue, 4> RetOps(1, Chain); // Copy the result values into the output registers. for (unsigned i = 0; i != RVLocs.size(); ++i) { @@ -444,16 +437,19 @@ MSP430TargetLowering::LowerReturn(SDValue Chain, // Guarantee that all emitted copies are stuck together, // avoiding something bad. Flag = Chain.getValue(1); + RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); } unsigned Opc = (CallConv == CallingConv::MSP430_INTR ? MSP430ISD::RETI_FLAG : MSP430ISD::RET_FLAG); + RetOps[0] = Chain; // Update chain. + + // Add the flag if we have it. if (Flag.getNode()) - return DAG.getNode(Opc, dl, MVT::Other, Chain, Flag); + RetOps.push_back(Flag); - // Return Void - return DAG.getNode(Opc, dl, MVT::Other, Chain); + return DAG.getNode(Opc, dl, MVT::Other, &RetOps[0], RetOps.size()); } /// LowerCCCCallTo - functions arguments are copied from virtual regs to diff --git a/lib/Target/MSP430/MSP430InstrInfo.td b/lib/Target/MSP430/MSP430InstrInfo.td index f003574..e45780d 100644 --- a/lib/Target/MSP430/MSP430InstrInfo.td +++ b/lib/Target/MSP430/MSP430InstrInfo.td @@ -40,9 +40,9 @@ def SDT_MSP430Shift : SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>, // MSP430 Specific Node Definitions. //===----------------------------------------------------------------------===// def MSP430retflag : SDNode<"MSP430ISD::RET_FLAG", SDTNone, - [SDNPHasChain, SDNPOptInGlue]>; + [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; def MSP430retiflag : SDNode<"MSP430ISD::RETI_FLAG", SDTNone, - [SDNPHasChain, SDNPOptInGlue]>; + [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; def MSP430rra : SDNode<"MSP430ISD::RRA", SDTIntUnaryOp, []>; def MSP430rla : SDNode<"MSP430ISD::RLA", SDTIntUnaryOp, []>; diff --git a/lib/Target/MSP430/MSP430RegisterInfo.cpp b/lib/Target/MSP430/MSP430RegisterInfo.cpp index 8f7813a..0b3e9e2 100644 --- a/lib/Target/MSP430/MSP430RegisterInfo.cpp +++ b/lib/Target/MSP430/MSP430RegisterInfo.cpp @@ -101,83 +101,18 @@ MSP430RegisterInfo::getPointerRegClass(const MachineFunction &MF, unsigned Kind) return &MSP430::GR16RegClass; } -void MSP430RegisterInfo:: -eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, - MachineBasicBlock::iterator I) const { - const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); - - if (!TFI->hasReservedCallFrame(MF)) { - // If the stack pointer can be changed after prologue, turn the - // adjcallstackup instruction into a 'sub SPW, <amt>' and the - // adjcallstackdown instruction into 'add SPW, <amt>' - // TODO: consider using push / pop instead of sub + store / add - MachineInstr *Old = I; - uint64_t Amount = Old->getOperand(0).getImm(); - if (Amount != 0) { - // We need to keep the stack aligned properly. To do this, we round the - // amount of space needed for the outgoing arguments up to the next - // alignment boundary. - Amount = (Amount+StackAlign-1)/StackAlign*StackAlign; - - MachineInstr *New = 0; - if (Old->getOpcode() == TII.getCallFrameSetupOpcode()) { - New = BuildMI(MF, Old->getDebugLoc(), - TII.get(MSP430::SUB16ri), MSP430::SPW) - .addReg(MSP430::SPW).addImm(Amount); - } else { - assert(Old->getOpcode() == TII.getCallFrameDestroyOpcode()); - // factor out the amount the callee already popped. 
- uint64_t CalleeAmt = Old->getOperand(1).getImm(); - Amount -= CalleeAmt; - if (Amount) - New = BuildMI(MF, Old->getDebugLoc(), - TII.get(MSP430::ADD16ri), MSP430::SPW) - .addReg(MSP430::SPW).addImm(Amount); - } - - if (New) { - // The SRW implicit def is dead. - New->getOperand(3).setIsDead(); - - // Replace the pseudo instruction with a new instruction... - MBB.insert(I, New); - } - } - } else if (I->getOpcode() == TII.getCallFrameDestroyOpcode()) { - // If we are performing frame pointer elimination and if the callee pops - // something off the stack pointer, add it back. - if (uint64_t CalleeAmt = I->getOperand(1).getImm()) { - MachineInstr *Old = I; - MachineInstr *New = - BuildMI(MF, Old->getDebugLoc(), TII.get(MSP430::SUB16ri), - MSP430::SPW).addReg(MSP430::SPW).addImm(CalleeAmt); - // The SRW implicit def is dead. - New->getOperand(3).setIsDead(); - - MBB.insert(I, New); - } - } - - MBB.erase(I); -} - void MSP430RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, - int SPAdj, RegScavenger *RS) const { + int SPAdj, unsigned FIOperandNum, + RegScavenger *RS) const { assert(SPAdj == 0 && "Unexpected"); - unsigned i = 0; MachineInstr &MI = *II; MachineBasicBlock &MBB = *MI.getParent(); MachineFunction &MF = *MBB.getParent(); const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); DebugLoc dl = MI.getDebugLoc(); - while (!MI.getOperand(i).isFI()) { - ++i; - assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!"); - } - - int FrameIndex = MI.getOperand(i).getIndex(); + int FrameIndex = MI.getOperand(FIOperandNum).getIndex(); unsigned BasePtr = (TFI->hasFP(MF) ? MSP430::FPW : MSP430::SPW); int Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex); @@ -191,7 +126,7 @@ MSP430RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, Offset += 2; // Skip the saved FPW // Fold imm into offset - Offset += MI.getOperand(i+1).getImm(); + Offset += MI.getOperand(FIOperandNum + 1).getImm(); if (MI.getOpcode() == MSP430::ADD16ri) { // This is actually "load effective address" of the stack slot @@ -199,7 +134,7 @@ MSP430RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // expand it into mov + add MI.setDesc(TII.get(MSP430::MOV16rr)); - MI.getOperand(i).ChangeToRegister(BasePtr, false); + MI.getOperand(FIOperandNum).ChangeToRegister(BasePtr, false); if (Offset == 0) return; @@ -216,8 +151,8 @@ MSP430RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, return; } - MI.getOperand(i).ChangeToRegister(BasePtr, false); - MI.getOperand(i+1).ChangeToImmediate(Offset); + MI.getOperand(FIOperandNum).ChangeToRegister(BasePtr, false); + MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset); } unsigned MSP430RegisterInfo::getFrameRegister(const MachineFunction &MF) const { diff --git a/lib/Target/MSP430/MSP430RegisterInfo.h b/lib/Target/MSP430/MSP430RegisterInfo.h index 64a43bc..69cccb2 100644 --- a/lib/Target/MSP430/MSP430RegisterInfo.h +++ b/lib/Target/MSP430/MSP430RegisterInfo.h @@ -42,12 +42,9 @@ public: const TargetRegisterClass* getPointerRegClass(const MachineFunction &MF, unsigned Kind = 0) const; - void eliminateCallFramePseudoInstr(MachineFunction &MF, - MachineBasicBlock &MBB, - MachineBasicBlock::iterator I) const; - void eliminateFrameIndex(MachineBasicBlock::iterator II, - int SPAdj, RegScavenger *RS = NULL) const; + int SPAdj, unsigned FIOperandNum, + RegScavenger *RS = NULL) const; // Debug information queries. 
unsigned getFrameRegister(const MachineFunction &MF) const; diff --git a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp index 085503eb..ade6084 100644 --- a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp +++ b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp @@ -84,15 +84,30 @@ class MipsAsmParser : public MCTargetAsmParser { bool ParseDirective(AsmToken DirectiveID); MipsAsmParser::OperandMatchResultTy - parseMemOperand(SmallVectorImpl<MCParsedAsmOperand*>&); + parseMemOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands); + + MipsAsmParser::OperandMatchResultTy + parseCPURegs(SmallVectorImpl<MCParsedAsmOperand*> &Operands); + + MipsAsmParser::OperandMatchResultTy + parseCPU64Regs(SmallVectorImpl<MCParsedAsmOperand*> &Operands); + + MipsAsmParser::OperandMatchResultTy + parseHWRegs(SmallVectorImpl<MCParsedAsmOperand*> &Operands); + + MipsAsmParser::OperandMatchResultTy + parseHW64Regs(SmallVectorImpl<MCParsedAsmOperand*> &Operands); + + MipsAsmParser::OperandMatchResultTy + parseCCRRegs(SmallVectorImpl<MCParsedAsmOperand*> &Operands); bool ParseOperand(SmallVectorImpl<MCParsedAsmOperand*> &, StringRef Mnemonic); - int tryParseRegister(StringRef Mnemonic); + int tryParseRegister(bool is64BitReg); bool tryParseRegisterOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands, - StringRef Mnemonic); + bool is64BitReg); bool needsExpansion(MCInst &Inst); @@ -107,7 +122,7 @@ class MipsAsmParser : public MCTargetAsmParser { bool reportParseError(StringRef ErrorMsg); bool parseMemOffset(const MCExpr *&Res); - bool parseRelocOperand(const MCExpr *&Res, SMLoc &E); + bool parseRelocOperand(const MCExpr *&Res); bool parseDirectiveSet(); @@ -118,6 +133,8 @@ class MipsAsmParser : public MCTargetAsmParser { bool parseSetReorderDirective(); bool parseSetNoReorderDirective(); + bool parseDirectiveWord(unsigned Size, SMLoc L); + MCSymbolRefExpr::VariantKind getVariantKind(StringRef Symbol); bool isMips64() const { @@ -128,9 +145,11 @@ class MipsAsmParser : public MCTargetAsmParser { return (STI.getFeatureBits() & Mips::FeatureFP64Bit) != 0; } - int matchRegisterName(StringRef Symbol); + int matchRegisterName(StringRef Symbol, bool is64BitReg); - int matchRegisterByNumber(unsigned RegNum, StringRef Mnemonic); + int matchCPURegisterName(StringRef Symbol); + + int matchRegisterByNumber(unsigned RegNum, unsigned RegClass); void setFpFormat(FpFormatTy Format) { FpFormat = Format; @@ -146,7 +165,7 @@ class MipsAsmParser : public MCTargetAsmParser { unsigned getReg(int RC,int RegNo); - unsigned getATReg(); + int getATReg(); public: MipsAsmParser(MCSubtargetInfo &sti, MCAsmParser &parser) : MCTargetAsmParser(), STI(sti), Parser(parser) { @@ -166,6 +185,20 @@ namespace { /// instruction. 
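/// (Sketch of how the new RegisterKind tagging is meant to be used,
/// illustrative rather than quoted from this patch:
///     MipsOperand *Op = MipsOperand::CreateReg(RegNo, S, E);
///     Op->setRegKind(MipsOperand::Kind_CPURegs);
/// so that predicates such as isCPURegsAsm() can pick out the right
/// register class during operand matching.)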
class MipsOperand : public MCParsedAsmOperand { +public: + enum RegisterKind { + Kind_None, + Kind_CPURegs, + Kind_CPU64Regs, + Kind_HWRegs, + Kind_HW64Regs, + Kind_FGR32Regs, + Kind_FGR64Regs, + Kind_AFGR64Regs, + Kind_CCRRegs + }; + +private: enum KindTy { k_CondCode, k_CoprocNum, @@ -186,6 +219,7 @@ class MipsOperand : public MCParsedAsmOperand { struct { unsigned RegNum; + RegisterKind Kind; } Reg; struct { @@ -246,6 +280,11 @@ public: return Reg.RegNum; } + void setRegKind(RegisterKind RegKind) { + assert((Kind == k_Register) && "Invalid access!"); + Reg.Kind = RegKind; + } + const MCExpr *getImm() const { assert((Kind == k_Immediate) && "Invalid access!"); return Imm.Val; @@ -296,6 +335,45 @@ public: return Op; } + bool isCPURegsAsm() const { + return Kind == k_Register && Reg.Kind == Kind_CPURegs; + } + void addCPURegsAsmOperands(MCInst &Inst, unsigned N) const { + Inst.addOperand(MCOperand::CreateReg(Reg.RegNum)); + } + + bool isCPU64RegsAsm() const { + return Kind == k_Register && Reg.Kind == Kind_CPU64Regs; + } + void addCPU64RegsAsmOperands(MCInst &Inst, unsigned N) const { + Inst.addOperand(MCOperand::CreateReg(Reg.RegNum)); + } + + bool isHWRegsAsm() const { + assert((Kind == k_Register) && "Invalid access!"); + return Reg.Kind == Kind_HWRegs; + } + void addHWRegsAsmOperands(MCInst &Inst, unsigned N) const { + Inst.addOperand(MCOperand::CreateReg(Reg.RegNum)); + } + + bool isHW64RegsAsm() const { + assert((Kind == k_Register) && "Invalid access!"); + return Reg.Kind == Kind_HW64Regs; + } + void addHW64RegsAsmOperands(MCInst &Inst, unsigned N) const { + Inst.addOperand(MCOperand::CreateReg(Reg.RegNum)); + } + + void addCCRAsmOperands(MCInst &Inst, unsigned N) const { + Inst.addOperand(MCOperand::CreateReg(Reg.RegNum)); + } + + bool isCCRAsm() const { + assert((Kind == k_Register) && "Invalid access!"); + return Reg.Kind == Kind_CCRRegs; + } + /// getStartLoc - Get the location of the first token of this operand. SMLoc getStartLoc() const { return StartLoc; } /// getEndLoc - Get the location of the last token of this operand. @@ -344,31 +422,31 @@ void MipsAsmParser::expandLoadImm(MCInst &Inst, SMLoc IDLoc, if ( 0 <= ImmValue && ImmValue <= 65535) { // for 0 <= j <= 65535. // li d,j => ori d,$zero,j - tmpInst.setOpcode(isMips64() ? Mips::ORi64 : Mips::ORi); + tmpInst.setOpcode(Mips::ORi); tmpInst.addOperand(MCOperand::CreateReg(RegOp.getReg())); tmpInst.addOperand( - MCOperand::CreateReg(isMips64() ? Mips::ZERO_64 : Mips::ZERO)); + MCOperand::CreateReg(Mips::ZERO)); tmpInst.addOperand(MCOperand::CreateImm(ImmValue)); Instructions.push_back(tmpInst); } else if ( ImmValue < 0 && ImmValue >= -32768) { // for -32768 <= j < 0. // li d,j => addiu d,$zero,j - tmpInst.setOpcode(Mips::ADDiu); //TODO:no ADDiu64 in td files? + tmpInst.setOpcode(Mips::ADDiu); tmpInst.addOperand(MCOperand::CreateReg(RegOp.getReg())); tmpInst.addOperand( - MCOperand::CreateReg(isMips64() ? Mips::ZERO_64 : Mips::ZERO)); + MCOperand::CreateReg(Mips::ZERO)); tmpInst.addOperand(MCOperand::CreateImm(ImmValue)); Instructions.push_back(tmpInst); } else { // for any other value of j that is representable as a 32-bit integer. // li d,j => lui d,hi16(j) // ori d,d,lo16(j) - tmpInst.setOpcode(isMips64() ? Mips::LUi64 : Mips::LUi); + tmpInst.setOpcode(Mips::LUi); tmpInst.addOperand(MCOperand::CreateReg(RegOp.getReg())); tmpInst.addOperand(MCOperand::CreateImm((ImmValue & 0xffff0000) >> 16)); Instructions.push_back(tmpInst); tmpInst.clear(); - tmpInst.setOpcode(isMips64() ? 
Mips::ORi64 : Mips::ORi); + tmpInst.setOpcode(Mips::ORi); tmpInst.addOperand(MCOperand::CreateReg(RegOp.getReg())); tmpInst.addOperand(MCOperand::CreateReg(RegOp.getReg())); tmpInst.addOperand(MCOperand::CreateImm(ImmValue & 0xffff)); @@ -390,7 +468,7 @@ void MipsAsmParser::expandLoadAddressReg(MCInst &Inst, SMLoc IDLoc, if ( -32768 <= ImmValue && ImmValue <= 65535) { //for -32768 <= j <= 65535. //la d,j(s) => addiu d,s,j - tmpInst.setOpcode(Mips::ADDiu); //TODO:no ADDiu64 in td files? + tmpInst.setOpcode(Mips::ADDiu); tmpInst.addOperand(MCOperand::CreateReg(DstRegOp.getReg())); tmpInst.addOperand(MCOperand::CreateReg(SrcRegOp.getReg())); tmpInst.addOperand(MCOperand::CreateImm(ImmValue)); @@ -400,12 +478,12 @@ void MipsAsmParser::expandLoadAddressReg(MCInst &Inst, SMLoc IDLoc, //la d,j(s) => lui d,hi16(j) // ori d,d,lo16(j) // addu d,d,s - tmpInst.setOpcode(isMips64()?Mips::LUi64:Mips::LUi); + tmpInst.setOpcode(Mips::LUi); tmpInst.addOperand(MCOperand::CreateReg(DstRegOp.getReg())); tmpInst.addOperand(MCOperand::CreateImm((ImmValue & 0xffff0000) >> 16)); Instructions.push_back(tmpInst); tmpInst.clear(); - tmpInst.setOpcode(isMips64()?Mips::ORi64:Mips::ORi); + tmpInst.setOpcode(Mips::ORi); tmpInst.addOperand(MCOperand::CreateReg(DstRegOp.getReg())); tmpInst.addOperand(MCOperand::CreateReg(DstRegOp.getReg())); tmpInst.addOperand(MCOperand::CreateImm(ImmValue & 0xffff)); @@ -433,19 +511,19 @@ void MipsAsmParser::expandLoadAddressImm(MCInst &Inst, SMLoc IDLoc, tmpInst.setOpcode(Mips::ADDiu); tmpInst.addOperand(MCOperand::CreateReg(RegOp.getReg())); tmpInst.addOperand( - MCOperand::CreateReg(isMips64()?Mips::ZERO_64:Mips::ZERO)); + MCOperand::CreateReg(Mips::ZERO)); tmpInst.addOperand(MCOperand::CreateImm(ImmValue)); Instructions.push_back(tmpInst); } else { //for any other value of j that is representable as a 32-bit integer. 
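// (Illustrative split for the pattern shown next: j = 0x00054321 gives
// hi16(j) = 0x0005 and lo16(j) = 0x4321.)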
//la d,j => lui d,hi16(j) // ori d,d,lo16(j) - tmpInst.setOpcode(isMips64()?Mips::LUi64:Mips::LUi); + tmpInst.setOpcode(Mips::LUi); tmpInst.addOperand(MCOperand::CreateReg(RegOp.getReg())); tmpInst.addOperand(MCOperand::CreateImm((ImmValue & 0xffff0000) >> 16)); Instructions.push_back(tmpInst); tmpInst.clear(); - tmpInst.setOpcode(isMips64()?Mips::ORi64:Mips::ORi); + tmpInst.setOpcode(Mips::ORi); tmpInst.addOperand(MCOperand::CreateReg(RegOp.getReg())); tmpInst.addOperand(MCOperand::CreateReg(RegOp.getReg())); tmpInst.addOperand(MCOperand::CreateImm(ImmValue & 0xffff)); @@ -498,84 +576,72 @@ MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, return true; } -int MipsAsmParser::matchRegisterName(StringRef Name) { - +int MipsAsmParser::matchCPURegisterName(StringRef Name) { int CC; - if (!isMips64()) + + if (Name == "at") + return getATReg(); + CC = StringSwitch<unsigned>(Name) - .Case("zero", Mips::ZERO) - .Case("a0", Mips::A0) - .Case("a1", Mips::A1) - .Case("a2", Mips::A2) - .Case("a3", Mips::A3) - .Case("v0", Mips::V0) - .Case("v1", Mips::V1) - .Case("s0", Mips::S0) - .Case("s1", Mips::S1) - .Case("s2", Mips::S2) - .Case("s3", Mips::S3) - .Case("s4", Mips::S4) - .Case("s5", Mips::S5) - .Case("s6", Mips::S6) - .Case("s7", Mips::S7) - .Case("k0", Mips::K0) - .Case("k1", Mips::K1) - .Case("sp", Mips::SP) - .Case("fp", Mips::FP) - .Case("gp", Mips::GP) - .Case("ra", Mips::RA) - .Case("t0", Mips::T0) - .Case("t1", Mips::T1) - .Case("t2", Mips::T2) - .Case("t3", Mips::T3) - .Case("t4", Mips::T4) - .Case("t5", Mips::T5) - .Case("t6", Mips::T6) - .Case("t7", Mips::T7) - .Case("t8", Mips::T8) - .Case("t9", Mips::T9) - .Case("at", Mips::AT) - .Case("fcc0", Mips::FCC0) - .Default(-1); - else + .Case("zero", 0) + .Case("a0", 4) + .Case("a1", 5) + .Case("a2", 6) + .Case("a3", 7) + .Case("v0", 2) + .Case("v1", 3) + .Case("s0", 16) + .Case("s1", 17) + .Case("s2", 18) + .Case("s3", 19) + .Case("s4", 20) + .Case("s5", 21) + .Case("s6", 22) + .Case("s7", 23) + .Case("k0", 26) + .Case("k1", 27) + .Case("sp", 29) + .Case("fp", 30) + .Case("gp", 28) + .Case("ra", 31) + .Case("t0", 8) + .Case("t1", 9) + .Case("t2", 10) + .Case("t3", 11) + .Case("t4", 12) + .Case("t5", 13) + .Case("t6", 14) + .Case("t7", 15) + .Case("t8", 24) + .Case("t9", 25) + .Default(-1); + + // Although SGI documentation just cut out t0-t3 for n32/n64, + // GNU pushes the values of t0-t3 to override the o32/o64 values for t4-t7 + // We are supporting both cases, so for t0-t3 we'll just push them to t4-t7. 
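// (Illustrative: "t0" matches 8 in the switch above; under n32/n64 the
// adjustment below remaps it to register number 12, the o32 "t4" slot,
// while "a4".."a7" take numbers 8..11 via the second switch.)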
+ if (isMips64() && 8 <= CC && CC <= 11) + CC += 4; + + if (CC == -1 && isMips64()) CC = StringSwitch<unsigned>(Name) - .Case("zero", Mips::ZERO_64) - .Case("at", Mips::AT_64) - .Case("v0", Mips::V0_64) - .Case("v1", Mips::V1_64) - .Case("a0", Mips::A0_64) - .Case("a1", Mips::A1_64) - .Case("a2", Mips::A2_64) - .Case("a3", Mips::A3_64) - .Case("a4", Mips::T0_64) - .Case("a5", Mips::T1_64) - .Case("a6", Mips::T2_64) - .Case("a7", Mips::T3_64) - .Case("t4", Mips::T4_64) - .Case("t5", Mips::T5_64) - .Case("t6", Mips::T6_64) - .Case("t7", Mips::T7_64) - .Case("s0", Mips::S0_64) - .Case("s1", Mips::S1_64) - .Case("s2", Mips::S2_64) - .Case("s3", Mips::S3_64) - .Case("s4", Mips::S4_64) - .Case("s5", Mips::S5_64) - .Case("s6", Mips::S6_64) - .Case("s7", Mips::S7_64) - .Case("t8", Mips::T8_64) - .Case("t9", Mips::T9_64) - .Case("kt0", Mips::K0_64) - .Case("kt1", Mips::K1_64) - .Case("gp", Mips::GP_64) - .Case("sp", Mips::SP_64) - .Case("fp", Mips::FP_64) - .Case("s8", Mips::FP_64) - .Case("ra", Mips::RA_64) + .Case("a4", 8) + .Case("a5", 9) + .Case("a6", 10) + .Case("a7", 11) + .Case("kt0", 26) + .Case("kt1", 27) + .Case("s8", 30) .Default(-1); + return CC; +} +int MipsAsmParser::matchRegisterName(StringRef Name, bool is64BitReg) { + + int CC; + CC = matchCPURegisterName(Name); if (CC != -1) - return CC; + return matchRegisterByNumber(CC,is64BitReg?Mips::CPU64RegsRegClassID: + Mips::CPURegsRegClassID); if (Name[0] == 'f') { StringRef NumString = Name.substr(1); @@ -639,75 +705,49 @@ bool MipsAssemblerOptions::setATReg(unsigned Reg) { return true; } -unsigned MipsAsmParser::getATReg() { - unsigned Reg = Options.getATRegNum(); - if (isMips64()) - return getReg(Mips::CPU64RegsRegClassID,Reg); - - return getReg(Mips::CPURegsRegClassID,Reg); +int MipsAsmParser::getATReg() { + return Options.getATRegNum(); } unsigned MipsAsmParser::getReg(int RC,int RegNo) { return *(getContext().getRegisterInfo().getRegClass(RC).begin() + RegNo); } -int MipsAsmParser::matchRegisterByNumber(unsigned RegNum, StringRef Mnemonic) { - - if (Mnemonic.lower() == "rdhwr") { - // at the moment only hwreg29 is supported - if (RegNum != 29) - return -1; - return Mips::HWR29; - } +int MipsAsmParser::matchRegisterByNumber(unsigned RegNum, unsigned RegClass) { if (RegNum > 31) return -1; - // MIPS64 registers are numbered 1 after the 32-bit equivalents - return getReg(Mips::CPURegsRegClassID, RegNum) + isMips64(); + return getReg(RegClass, RegNum); } -int MipsAsmParser::tryParseRegister(StringRef Mnemonic) { +int MipsAsmParser::tryParseRegister(bool is64BitReg) { const AsmToken &Tok = Parser.getTok(); int RegNum = -1; if (Tok.is(AsmToken::Identifier)) { std::string lowerCase = Tok.getString().lower(); - RegNum = matchRegisterName(lowerCase); + RegNum = matchRegisterName(lowerCase, is64BitReg); } else if (Tok.is(AsmToken::Integer)) RegNum = matchRegisterByNumber(static_cast<unsigned>(Tok.getIntVal()), - Mnemonic.lower()); - else - return RegNum; //error - // 64 bit div operations require Mips::ZERO instead of MIPS::ZERO_64 - if (isMips64() && RegNum == Mips::ZERO_64) { - if (Mnemonic.find("ddiv") != StringRef::npos) - RegNum = Mips::ZERO; - } + is64BitReg ? 
Mips::CPU64RegsRegClassID + : Mips::CPURegsRegClassID); return RegNum; } bool MipsAsmParser:: tryParseRegisterOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands, - StringRef Mnemonic){ + bool is64BitReg){ SMLoc S = Parser.getTok().getLoc(); - SMLoc E = Parser.getTok().getEndLoc(); int RegNo = -1; - // FIXME: we should make a more generic method for CCR - if ((Mnemonic == "cfc1" || Mnemonic == "ctc1") - && Operands.size() == 2 && Parser.getTok().is(AsmToken::Integer)){ - RegNo = Parser.getTok().getIntVal(); // get the int value - // at the moment only fcc0 is supported - if (RegNo == 0) - RegNo = Mips::FCC0; - } else - RegNo = tryParseRegister(Mnemonic); + RegNo = tryParseRegister(is64BitReg); if (RegNo == -1) return true; - Operands.push_back(MipsOperand::CreateReg(RegNo, S, E)); + Operands.push_back(MipsOperand::CreateReg(RegNo, S, + Parser.getTok().getLoc())); Parser.Lex(); // Eat register token. return false; } @@ -734,7 +774,7 @@ bool MipsAsmParser::ParseOperand(SmallVectorImpl<MCParsedAsmOperand*>&Operands, SMLoc S = Parser.getTok().getLoc(); Parser.Lex(); // Eat dollar token. // parse register operand - if (!tryParseRegisterOperand(Operands, Mnemonic)) { + if (!tryParseRegisterOperand(Operands, isMips64())) { if (getLexer().is(AsmToken::LParen)) { // check if it is indexed addressing operand Operands.push_back(MipsOperand::CreateToken("(", S)); @@ -743,7 +783,7 @@ bool MipsAsmParser::ParseOperand(SmallVectorImpl<MCParsedAsmOperand*>&Operands, return true; Parser.Lex(); // eat dollar - if (tryParseRegisterOperand(Operands, Mnemonic)) + if (tryParseRegisterOperand(Operands, isMips64())) return true; if (!getLexer().is(AsmToken::RParen)) @@ -757,10 +797,10 @@ bool MipsAsmParser::ParseOperand(SmallVectorImpl<MCParsedAsmOperand*>&Operands, } // maybe it is a symbol reference StringRef Identifier; - if (Parser.ParseIdentifier(Identifier)) + if (Parser.parseIdentifier(Identifier)) return true; - SMLoc E = SMLoc::getFromPointer(Identifier.end()); + SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1); MCSymbol *Sym = getContext().GetOrCreateSymbol("$" + Identifier); @@ -780,9 +820,9 @@ bool MipsAsmParser::ParseOperand(SmallVectorImpl<MCParsedAsmOperand*>&Operands, // quoted label names const MCExpr *IdVal; SMLoc S = Parser.getTok().getLoc(); - SMLoc E; - if (getParser().ParseExpression(IdVal, E)) + if (getParser().parseExpression(IdVal)) return true; + SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1); Operands.push_back(MipsOperand::CreateImm(IdVal, S, E)); return false; } @@ -790,10 +830,11 @@ bool MipsAsmParser::ParseOperand(SmallVectorImpl<MCParsedAsmOperand*>&Operands, // it is a symbol reference or constant expression const MCExpr *IdVal; SMLoc S = Parser.getTok().getLoc(); // start location of the operand - SMLoc E; - if (parseRelocOperand(IdVal, E)) + if (parseRelocOperand(IdVal)) return true; + SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1); + Operands.push_back(MipsOperand::CreateImm(IdVal, S, E)); return false; } // case AsmToken::Percent @@ -801,7 +842,7 @@ bool MipsAsmParser::ParseOperand(SmallVectorImpl<MCParsedAsmOperand*>&Operands, return true; } -bool MipsAsmParser::parseRelocOperand(const MCExpr *&Res, SMLoc &EndLoc) { +bool MipsAsmParser::parseRelocOperand(const MCExpr *&Res) { Parser.Lex(); // eat % token const AsmToken &Tok = Parser.getTok(); // get next token, operation @@ -813,6 +854,7 @@ bool MipsAsmParser::parseRelocOperand(const MCExpr *&Res, SMLoc &EndLoc) { Parser.Lex(); // eat 
identifier // now make expression from the rest of the operand const MCExpr *IdVal; + SMLoc EndLoc; if (getLexer().getKind() == AsmToken::LParen) { while (1) { @@ -830,13 +872,11 @@ bool MipsAsmParser::parseRelocOperand(const MCExpr *&Res, SMLoc &EndLoc) { } else break; } - if (getParser().ParseParenExpression(IdVal,EndLoc)) + if (getParser().parseParenExpression(IdVal,EndLoc)) return true; - while (getLexer().getKind() == AsmToken::RParen) { - EndLoc = Parser.getTok().getEndLoc(); + while (getLexer().getKind() == AsmToken::RParen) Parser.Lex(); // eat ')' token - } } else return true; // parenthesis must follow reloc operand @@ -848,7 +888,12 @@ bool MipsAsmParser::parseRelocOperand(const MCExpr *&Res, SMLoc &EndLoc) { if (Str == "lo") { Val = Val & 0xffff; } else if (Str == "hi") { + int LoSign = Val & 0x8000; Val = (Val & 0xffff0000) >> 16; + //lower part is treated as signed int, so if it is negative + //we must add 1 to hi part to compensate + if (LoSign) + Val++; } Res = MCConstantExpr::Create(Val, getContext()); return false; @@ -868,23 +913,24 @@ bool MipsAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) { StartLoc = Parser.getTok().getLoc(); - EndLoc = Parser.getTok().getEndLoc(); - RegNo = tryParseRegister(""); + RegNo = tryParseRegister(isMips64()); + EndLoc = Parser.getTok().getLoc(); return (RegNo == (unsigned)-1); } bool MipsAsmParser::parseMemOffset(const MCExpr *&Res) { + + SMLoc S; + switch(getLexer().getKind()) { default: return true; case AsmToken::Integer: case AsmToken::Minus: case AsmToken::Plus: - return getParser().ParseExpression(Res); - case AsmToken::Percent: { - SMLoc E; - return parseRelocOperand(Res, E); - } + return (getParser().parseExpression(Res)); + case AsmToken::Percent: + return parseRelocOperand(Res); case AsmToken::LParen: return false; // it's probably assuming 0 } @@ -895,8 +941,9 @@ MipsAsmParser::OperandMatchResultTy MipsAsmParser::parseMemOperand( SmallVectorImpl<MCParsedAsmOperand*>&Operands) { const MCExpr *IdVal = 0; - SMLoc S = Parser.getTok().getLoc(); - SMLoc E = Parser.getTok().getEndLoc(); + SMLoc S; + // first operand is the offset + S = Parser.getTok().getLoc(); if (parseMemOffset(IdVal)) return MatchOperand_ParseFail; @@ -905,6 +952,7 @@ MipsAsmParser::OperandMatchResultTy MipsAsmParser::parseMemOperand( if (Tok.isNot(AsmToken::LParen)) { MipsOperand *Mnemonic = static_cast<MipsOperand*>(Operands[0]); if (Mnemonic->getToken() == "la") { + SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() -1); Operands.push_back(MipsOperand::CreateImm(IdVal, S, E)); return MatchOperand_Success; } @@ -917,7 +965,7 @@ MipsAsmParser::OperandMatchResultTy MipsAsmParser::parseMemOperand( const AsmToken &Tok1 = Parser.getTok(); // get next token if (Tok1.is(AsmToken::Dollar)) { Parser.Lex(); // Eat '$' token. - if (tryParseRegisterOperand(Operands,"")) { + if (tryParseRegisterOperand(Operands, isMips64())) { Error(Parser.getTok().getLoc(), "unexpected token in operand"); return MatchOperand_ParseFail; } @@ -933,7 +981,8 @@ MipsAsmParser::OperandMatchResultTy MipsAsmParser::parseMemOperand( return MatchOperand_ParseFail; } - E = Parser.getTok().getEndLoc(); + SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1); + Parser.Lex(); // Eat ')' token. 
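The sign compensation added to parseRelocOperand above follows the standard MIPS %hi/%lo convention: %lo is consumed as a *signed* 16-bit immediate (e.g. by addiu), so %hi must absorb a carry whenever bit 15 of the value is set. A small self-contained check of that identity, assuming plain C++ (loPart/hiPart are illustrative names, not the patch's API; test values are chosen to avoid signed-overflow undefined behavior in the check):

#include <cassert>
#include <cstdint>

// %lo: low 16 bits, later sign-extended by instructions such as addiu.
static int32_t loPart(int32_t Val) { return Val & 0xffff; }

// %hi: high 16 bits plus a carry when the low half is negative as int16_t.
static int32_t hiPart(int32_t Val) {
  int32_t Hi = (Val >> 16) & 0xffff;
  if (Val & 0x8000) // low half would sign-extend negative
    ++Hi;
  return Hi & 0xffff;
}

int main() {
  // (hi << 16) + sign_extend16(lo) must reconstruct the original value.
  for (int32_t V : {0x12348000, 0x1234ffff, 0x00007fff, -4096}) {
    int32_t Lo = (int16_t)loPart(V); // sign-extended, as addiu would do
    assert((hiPart(V) << 16) + Lo == V);
  }
  return 0;
}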
if (IdVal == 0) @@ -950,6 +999,132 @@ MipsAsmParser::OperandMatchResultTy MipsAsmParser::parseMemOperand( return MatchOperand_Success; } +MipsAsmParser::OperandMatchResultTy +MipsAsmParser::parseCPU64Regs(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { + + if (!isMips64()) + return MatchOperand_NoMatch; + // if the first token is not '$' we have an error + if (Parser.getTok().isNot(AsmToken::Dollar)) + return MatchOperand_NoMatch; + + Parser.Lex(); // Eat $ + if(!tryParseRegisterOperand(Operands, true)) { + // set the proper register kind + MipsOperand* op = static_cast<MipsOperand*>(Operands.back()); + op->setRegKind(MipsOperand::Kind_CPU64Regs); + return MatchOperand_Success; + } + return MatchOperand_NoMatch; +} + +MipsAsmParser::OperandMatchResultTy +MipsAsmParser::parseCPURegs(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { + + // if the first token is not '$' we have an error + if (Parser.getTok().isNot(AsmToken::Dollar)) + return MatchOperand_NoMatch; + + Parser.Lex(); // Eat $ + if(!tryParseRegisterOperand(Operands, false)) { + // set the propper register kind + MipsOperand* op = static_cast<MipsOperand*>(Operands.back()); + op->setRegKind(MipsOperand::Kind_CPURegs); + return MatchOperand_Success; + } + return MatchOperand_NoMatch; +} + +MipsAsmParser::OperandMatchResultTy +MipsAsmParser::parseHWRegs(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { + + if (isMips64()) + return MatchOperand_NoMatch; + + // if the first token is not '$' we have error + if (Parser.getTok().isNot(AsmToken::Dollar)) + return MatchOperand_NoMatch; + SMLoc S = Parser.getTok().getLoc(); + Parser.Lex(); // Eat $ + + const AsmToken &Tok = Parser.getTok(); // get next token + if (Tok.isNot(AsmToken::Integer)) + return MatchOperand_NoMatch; + + unsigned RegNum = Tok.getIntVal(); + // at the moment only hwreg29 is supported + if (RegNum != 29) + return MatchOperand_ParseFail; + + MipsOperand *op = MipsOperand::CreateReg(Mips::HWR29, S, + Parser.getTok().getLoc()); + op->setRegKind(MipsOperand::Kind_HWRegs); + Operands.push_back(op); + + Parser.Lex(); // Eat reg number + return MatchOperand_Success; +} + +MipsAsmParser::OperandMatchResultTy +MipsAsmParser::parseHW64Regs(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { + + if (!isMips64()) + return MatchOperand_NoMatch; + //if the first token is not '$' we have error + if (Parser.getTok().isNot(AsmToken::Dollar)) + return MatchOperand_NoMatch; + SMLoc S = Parser.getTok().getLoc(); + Parser.Lex(); // Eat $ + + const AsmToken &Tok = Parser.getTok(); // get next token + if (Tok.isNot(AsmToken::Integer)) + return MatchOperand_NoMatch; + + unsigned RegNum = Tok.getIntVal(); + // at the moment only hwreg29 is supported + if (RegNum != 29) + return MatchOperand_ParseFail; + + MipsOperand *op = MipsOperand::CreateReg(Mips::HWR29_64, S, + Parser.getTok().getLoc()); + op->setRegKind(MipsOperand::Kind_HW64Regs); + Operands.push_back(op); + + Parser.Lex(); // Eat reg number + return MatchOperand_Success; +} + +MipsAsmParser::OperandMatchResultTy +MipsAsmParser::parseCCRRegs(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { + unsigned RegNum; + //if the first token is not '$' we have error + if (Parser.getTok().isNot(AsmToken::Dollar)) + return MatchOperand_NoMatch; + SMLoc S = Parser.getTok().getLoc(); + Parser.Lex(); // Eat $ + + const AsmToken &Tok = Parser.getTok(); // get next token + if (Tok.is(AsmToken::Integer)) { + RegNum = Tok.getIntVal(); + // at the moment only fcc0 is supported + if (RegNum != 0) + return MatchOperand_ParseFail; + } else if 
(Tok.is(AsmToken::Identifier)) { + // at the moment only fcc0 is supported + if (Tok.getIdentifier() != "fcc0") + return MatchOperand_ParseFail; + } else + return MatchOperand_NoMatch; + + MipsOperand *op = MipsOperand::CreateReg(Mips::FCC0, S, + Parser.getTok().getLoc()); + op->setRegKind(MipsOperand::Kind_CCRRegs); + Operands.push_back(op); + + Parser.Lex(); // Eat reg number + return MatchOperand_Success; +} + MCSymbolRefExpr::VariantKind MipsAsmParser::getVariantKind(StringRef Symbol) { MCSymbolRefExpr::VariantKind VK @@ -1019,13 +1194,13 @@ parseMathOperation(StringRef Name, SMLoc NameLoc, // Read the first operand. if (ParseOperand(Operands, Name)) { SMLoc Loc = getLexer().getLoc(); - Parser.EatToEndOfStatement(); + Parser.eatToEndOfStatement(); return Error(Loc, "unexpected token in argument list"); } if (getLexer().isNot(AsmToken::Comma)) { SMLoc Loc = getLexer().getLoc(); - Parser.EatToEndOfStatement(); + Parser.eatToEndOfStatement(); return Error(Loc, "unexpected token in argument list"); } @@ -1037,14 +1212,14 @@ parseMathOperation(StringRef Name, SMLoc NameLoc, // Parse and remember the operand. if (ParseOperand(Operands, Name)) { SMLoc Loc = getLexer().getLoc(); - Parser.EatToEndOfStatement(); + Parser.eatToEndOfStatement(); return Error(Loc, "unexpected token in argument list"); } } if (getLexer().isNot(AsmToken::EndOfStatement)) { SMLoc Loc = getLexer().getLoc(); - Parser.EatToEndOfStatement(); + Parser.eatToEndOfStatement(); return Error(Loc, "unexpected token in argument list"); } @@ -1055,16 +1230,18 @@ parseMathOperation(StringRef Name, SMLoc NameLoc, bool MipsAsmParser:: ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, SmallVectorImpl<MCParsedAsmOperand*> &Operands) { + StringRef Mnemonic; // floating point instructions: should register be treated as double? if (requestsDoubleOperand(Name)) { setFpFormat(FP_FORMAT_D); Operands.push_back(MipsOperand::CreateToken(Name, NameLoc)); + Mnemonic = Name; } else { setDefaultFpFormat(); // Create the leading tokens for the mnemonic, split by '.' characters. size_t Start = 0, Next = Name.find('.'); - StringRef Mnemonic = Name.slice(Start, Next); + Mnemonic = Name.slice(Start, Next); Operands.push_back(MipsOperand::CreateToken(Mnemonic, NameLoc)); @@ -1083,8 +1260,8 @@ ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, if (Cc == -1) { return Error(NameLoc, "Invalid conditional code"); } - // FIXME: May include trailing whitespace... - SMLoc E = Parser.getTok().getLoc(); + SMLoc E = SMLoc::getFromPointer( + Parser.getTok().getLoc().getPointer() -1 ); Operands.push_back(MipsOperand::CreateImm( MCConstantExpr::Create(Cc, getContext()), NameLoc, E)); } else { @@ -1104,9 +1281,9 @@ ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, // Read the remaining operands. if (getLexer().isNot(AsmToken::EndOfStatement)) { // Read the first operand. - if (ParseOperand(Operands, Name)) { + if (ParseOperand(Operands, Mnemonic)) { SMLoc Loc = getLexer().getLoc(); - Parser.EatToEndOfStatement(); + Parser.eatToEndOfStatement(); return Error(Loc, "unexpected token in argument list"); } @@ -1116,7 +1293,7 @@ ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, // Parse and remember the operand. 
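The parseCPU64Regs/parseCPURegs/parseHWRegs/parseCCRRegs hooks added above all follow the same tri-state contract used by the generated operand matcher: NoMatch means this parser does not recognize the operand and another may try, ParseFail reports a hard error after a partial match, and Success consumes the tokens and pushes an operand. A reduced sketch of that contract, assuming plain C++ with hypothetical Token and operand types (not the MCParsedAsmOperand machinery):

#include <string>
#include <vector>

enum OperandMatchResult { MatchSuccess, MatchNoMatch, MatchParseFail };

struct Token { std::string Text; };

// Hypothetical parser state: a token stream plus an output operand list.
static OperandMatchResult parseCcrReg(std::vector<Token> &Toks,
                                      std::vector<std::string> &Operands) {
  if (Toks.empty() || Toks.front().Text != "$")
    return MatchNoMatch; // not our kind of operand; let another parser try
  if (Toks.size() < 2 || (Toks[1].Text != "fcc0" && Toks[1].Text != "0"))
    return MatchParseFail; // it was a $-operand, but an invalid one
  Toks.erase(Toks.begin(), Toks.begin() + 2); // consume '$' and the register
  Operands.push_back("FCC0");
  return MatchSuccess;
}

int main() {
  std::vector<Token> Toks = {{"$"}, {"fcc0"}};
  std::vector<std::string> Ops;
  return parseCcrReg(Toks, Ops) == MatchSuccess ? 0 : 1;
}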
if (ParseOperand(Operands, Name)) { SMLoc Loc = getLexer().getLoc(); - Parser.EatToEndOfStatement(); + Parser.eatToEndOfStatement(); return Error(Loc, "unexpected token in argument list"); } } @@ -1124,7 +1301,7 @@ ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, if (getLexer().isNot(AsmToken::EndOfStatement)) { SMLoc Loc = getLexer().getLoc(); - Parser.EatToEndOfStatement(); + Parser.eatToEndOfStatement(); return Error(Loc, "unexpected token in argument list"); } @@ -1134,7 +1311,7 @@ ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, bool MipsAsmParser::reportParseError(StringRef ErrorMsg) { SMLoc Loc = getLexer().getLoc(); - Parser.EatToEndOfStatement(); + Parser.eatToEndOfStatement(); return Error(Loc, ErrorMsg); } @@ -1157,6 +1334,7 @@ bool MipsAsmParser::parseSetAtDirective() { // line can be // .set at - defaults to $1 // or .set at=$reg + int AtRegNo; getParser().Lex(); if (getLexer().is(AsmToken::EndOfStatement)) { Options.setATReg(1); @@ -1169,12 +1347,22 @@ bool MipsAsmParser::parseSetAtDirective() { return false; } Parser.Lex(); // eat '$' - if (getLexer().isNot(AsmToken::Integer)) { + const AsmToken &Reg = Parser.getTok(); + if (Reg.is(AsmToken::Identifier)) { + AtRegNo = matchCPURegisterName(Reg.getIdentifier()); + } else if (Reg.is(AsmToken::Integer)) { + AtRegNo = Reg.getIntVal(); + } else { reportParseError("unexpected token in statement"); return false; } - const AsmToken &Reg = Parser.getTok(); - if (!Options.setATReg(Reg.getIntVal())) { + + if ( AtRegNo < 1 || AtRegNo > 31) { + reportParseError("unexpected token in statement"); + return false; + } + + if (!Options.setATReg(AtRegNo)) { reportParseError("unexpected token in statement"); return false; } @@ -1262,55 +1450,88 @@ bool MipsAsmParser::parseDirectiveSet() { return parseSetNoMacroDirective(); } else if (Tok.getString() == "nomips16") { // ignore this directive for now - Parser.EatToEndOfStatement(); + Parser.eatToEndOfStatement(); return false; } else if (Tok.getString() == "nomicromips") { // ignore this directive for now - Parser.EatToEndOfStatement(); + Parser.eatToEndOfStatement(); return false; } + return true; } +/// parseDirectiveWord +/// ::= .word [ expression (, expression)* ] +bool MipsAsmParser::parseDirectiveWord(unsigned Size, SMLoc L) { + if (getLexer().isNot(AsmToken::EndOfStatement)) { + for (;;) { + const MCExpr *Value; + if (getParser().parseExpression(Value)) + return true; + + getParser().getStreamer().EmitValue(Value, Size); + + if (getLexer().is(AsmToken::EndOfStatement)) + break; + + // FIXME: Improve diagnostic. 
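The parseDirectiveWord routine introduced above implements the usual comma-separated expression list for data directives, so ".word 1, 2, 0x2a" emits three 4-byte values. A reduced, self-contained sketch of the loop shape, assuming plain C++ with a toy token stream standing in for the lexer and std::stol standing in for parseExpression (illustrative names only):

#include <cstdio>
#include <string>
#include <vector>

// Hypothetical token stream: the ".word" operands, already tokenized.
struct Lexer {
  std::vector<std::string> Toks;
  size_t Pos = 0;
  bool atEnd() const { return Pos >= Toks.size(); }
  bool atComma() const { return !atEnd() && Toks[Pos] == ","; }
  std::string lex() { return Toks[Pos++]; } // consume one token
};

// Parses "expr (, expr)*" and emits each value as Size bytes.
// Returns true on error, mirroring the AsmParser convention above.
static bool parseWordDirective(Lexer &L, unsigned Size) {
  while (!L.atEnd()) {
    long Value = std::stol(L.lex(), nullptr, 0); // stand-in for parseExpression
    std::printf("emit %u-byte value %ld\n", Size, Value);
    if (L.atEnd())
      break;
    if (!L.atComma())
      return true; // unexpected token in directive
    L.lex();       // eat ','
  }
  return false;
}

int main() {
  Lexer L{{"1", ",", "2", ",", "0x2a"}}; // .word 1, 2, 0x2a
  return parseWordDirective(L, 4) ? 1 : 0;
}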
+ if (getLexer().isNot(AsmToken::Comma)) + return Error(L, "unexpected token in directive"); + Parser.Lex(); + } + } + + Parser.Lex(); + return false; +} + bool MipsAsmParser::ParseDirective(AsmToken DirectiveID) { - if (DirectiveID.getString() == ".ent") { + StringRef IDVal = DirectiveID.getString(); + + if ( IDVal == ".ent") { // ignore this directive for now Parser.Lex(); return false; } - if (DirectiveID.getString() == ".end") { + if (IDVal == ".end") { // ignore this directive for now Parser.Lex(); return false; } - if (DirectiveID.getString() == ".frame") { + if (IDVal == ".frame") { // ignore this directive for now - Parser.EatToEndOfStatement(); + Parser.eatToEndOfStatement(); return false; } - if (DirectiveID.getString() == ".set") { + if (IDVal == ".set") { return parseDirectiveSet(); } - if (DirectiveID.getString() == ".fmask") { + if (IDVal == ".fmask") { // ignore this directive for now - Parser.EatToEndOfStatement(); + Parser.eatToEndOfStatement(); return false; } - if (DirectiveID.getString() == ".mask") { + if (IDVal == ".mask") { // ignore this directive for now - Parser.EatToEndOfStatement(); + Parser.eatToEndOfStatement(); return false; } - if (DirectiveID.getString() == ".gpword") { + if (IDVal == ".gpword") { // ignore this directive for now - Parser.EatToEndOfStatement(); + Parser.eatToEndOfStatement(); + return false; + } + + if (IDVal == ".word") { + parseDirectiveWord(4, DirectiveID.getLoc()); return false; } diff --git a/lib/Target/Mips/Disassembler/LLVMBuild.txt b/lib/Target/Mips/Disassembler/LLVMBuild.txt index 048ad0d..7101c06 100644 --- a/lib/Target/Mips/Disassembler/LLVMBuild.txt +++ b/lib/Target/Mips/Disassembler/LLVMBuild.txt @@ -1,4 +1,4 @@ -;===- ./lib/Target/Mips/Disassembler/LLVMBuild.txt --------------*- Conf -*--===; +;===- ./lib/Target/Mips/Disassembler/LLVMBuild.txt -------------*- Conf -*--===; ; ; The LLVM Compiler Infrastructure ; diff --git a/lib/Target/Mips/Disassembler/Makefile b/lib/Target/Mips/Disassembler/Makefile index a78feba..7900373 100644 --- a/lib/Target/Mips/Disassembler/Makefile +++ b/lib/Target/Mips/Disassembler/Makefile @@ -1,4 +1,4 @@ -##===- lib/Target/Mips/Disassembler/Makefile ----------------*- Makefile -*-===## +##===- lib/Target/Mips/Disassembler/Makefile ---------------*- Makefile -*-===## # # The LLVM Compiler Infrastructure # diff --git a/lib/Target/Mips/Disassembler/MipsDisassembler.cpp b/lib/Target/Mips/Disassembler/MipsDisassembler.cpp index 9560f3f..025a783 100644 --- a/lib/Target/Mips/Disassembler/MipsDisassembler.cpp +++ b/lib/Target/Mips/Disassembler/MipsDisassembler.cpp @@ -93,6 +93,11 @@ static DecodeStatus DecodeCPU64RegsRegisterClass(MCInst &Inst, uint64_t Address, const void *Decoder); +static DecodeStatus DecodeCPU16RegsRegisterClass(MCInst &Inst, + unsigned RegNo, + uint64_t Address, + const void *Decoder); + static DecodeStatus DecodeCPURegsRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, @@ -322,6 +327,15 @@ static unsigned getReg(const void *D, unsigned RC, unsigned RegNo) { return *(Dis->getRegInfo()->getRegClass(RC).begin() + RegNo); } +static DecodeStatus DecodeCPU16RegsRegisterClass(MCInst &Inst, + unsigned RegNo, + uint64_t Address, + const void *Decoder) { + + return MCDisassembler::Fail; + +} + static DecodeStatus DecodeCPU64RegsRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, diff --git a/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp b/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp index 68d3ac5..fc23cd3 100644 --- a/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp +++ 
b/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp @@ -23,6 +23,7 @@ #include "llvm/Support/raw_ostream.h" using namespace llvm; +#define PRINT_ALIAS_INSTR #include "MipsGenAsmWriter.inc" const char* Mips::MipsFCCToString(Mips::CondCode CC) { @@ -78,7 +79,9 @@ void MipsInstPrinter::printInst(const MCInst *MI, raw_ostream &O, O << "\t.set\tmips32r2\n"; } - printInstruction(MI, O); + // Try to print any aliases first. + if (!printAliasInstr(MI, O)) + printInstruction(MI, O); printAnnotation(O, Annot); switch (MI->getOpcode()) { @@ -149,6 +152,11 @@ static void printExpr(const MCExpr *Expr, raw_ostream &OS) { OS << ')'; } +void MipsInstPrinter::printCPURegs(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + printRegName(O, MI->getOperand(OpNo).getReg()); +} + void MipsInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) { const MCOperand &Op = MI->getOperand(OpNo); diff --git a/lib/Target/Mips/InstPrinter/MipsInstPrinter.h b/lib/Target/Mips/InstPrinter/MipsInstPrinter.h index 3d8a6f9..d1b561f 100644 --- a/lib/Target/Mips/InstPrinter/MipsInstPrinter.h +++ b/lib/Target/Mips/InstPrinter/MipsInstPrinter.h @@ -87,6 +87,9 @@ public: virtual void printRegName(raw_ostream &OS, unsigned RegNo) const; virtual void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot); + void printCPURegs(const MCInst *MI, unsigned OpNo, raw_ostream &O); + + bool printAliasInstr(const MCInst *MI, raw_ostream &OS); private: void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); diff --git a/lib/Target/Mips/MCTargetDesc/CMakeLists.txt b/lib/Target/Mips/MCTargetDesc/CMakeLists.txt index be5d7e4..4212c94 100644 --- a/lib/Target/Mips/MCTargetDesc/CMakeLists.txt +++ b/lib/Target/Mips/MCTargetDesc/CMakeLists.txt @@ -5,6 +5,8 @@ add_llvm_library(LLVMMipsDesc MipsMCCodeEmitter.cpp MipsMCTargetDesc.cpp MipsELFObjectWriter.cpp + MipsReginfo.cpp + MipsELFStreamer.cpp ) add_dependencies(LLVMMipsDesc MipsCommonTableGen) diff --git a/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp b/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp index f82e203..6471b51 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp @@ -42,7 +42,6 @@ namespace { virtual unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup, bool IsPCRel, bool IsRelocWithSymbol, int64_t Addend) const; - virtual unsigned getEFlags() const; virtual const MCSymbol *ExplicitRelSym(const MCAssembler &Asm, const MCValue &Target, const MCFragment &F, @@ -61,19 +60,6 @@ MipsELFObjectWriter::MipsELFObjectWriter(bool _is64Bit, uint8_t OSABI, MipsELFObjectWriter::~MipsELFObjectWriter() {} -// FIXME: get the real EABI Version from the Subtarget class. 
-unsigned MipsELFObjectWriter::getEFlags() const { - - // FIXME: We can't tell if we are PIC (dynamic) or CPIC (static) - unsigned Flag = ELF::EF_MIPS_NOREORDER; - - if (is64Bit()) - Flag |= ELF::EF_MIPS_ARCH_64R2; - else - Flag |= ELF::EF_MIPS_ARCH_32R2; - return Flag; -} - const MCSymbol *MipsELFObjectWriter::ExplicitRelSym(const MCAssembler &Asm, const MCValue &Target, const MCFragment &F, @@ -108,7 +94,13 @@ unsigned MipsELFObjectWriter::GetRelocType(const MCValue &Target, Type = ELF::R_MIPS_64; break; case FK_GPRel_4: - Type = ELF::R_MIPS_GPREL32; + if (isN64()) { + Type = setRType((unsigned)ELF::R_MIPS_GPREL32, Type); + Type = setRType2((unsigned)ELF::R_MIPS_64, Type); + Type = setRType3((unsigned)ELF::R_MIPS_NONE, Type); + } + else + Type = ELF::R_MIPS_GPREL32; break; case Mips::fixup_Mips_GPREL16: Type = ELF::R_MIPS_GPREL16; diff --git a/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp b/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp new file mode 100644 index 0000000..c33bc9a --- /dev/null +++ b/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp @@ -0,0 +1,89 @@ +//===-- MipsELFStreamer.cpp - MipsELFStreamer ---------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===-------------------------------------------------------------------===// +#include "MCTargetDesc/MipsELFStreamer.h" +#include "MipsSubtarget.h" +#include "llvm/MC/MCAssembler.h" +#include "llvm/MC/MCELF.h" +#include "llvm/MC/MCELFSymbolFlags.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Support/ELF.h" +#include "llvm/Support/ErrorHandling.h" + +namespace llvm { + + MCELFStreamer* createMipsELFStreamer(MCContext &Context, MCAsmBackend &TAB, + raw_ostream &OS, MCCodeEmitter *Emitter, + bool RelaxAll, bool NoExecStack) { + MipsELFStreamer *S = new MipsELFStreamer(Context, TAB, OS, Emitter, + RelaxAll, NoExecStack); + return S; + } + + // For llc. Set a group of ELF header flags + void + MipsELFStreamer::emitELFHeaderFlagsCG(const MipsSubtarget &Subtarget) { + + if (hasRawTextSupport()) + return; + + // Update e_header flags + MCAssembler& MCA = getAssembler(); + unsigned EFlags = MCA.getELFHeaderEFlags(); + + if (Subtarget.inMips16Mode()) + EFlags |= ELF::EF_MIPS_ARCH_ASE_M16; + else + EFlags |= ELF::EF_MIPS_NOREORDER; + + // Architecture + if (Subtarget.hasMips64r2()) + EFlags |= ELF::EF_MIPS_ARCH_64R2; + else if (Subtarget.hasMips64()) + EFlags |= ELF::EF_MIPS_ARCH_64; + else if (Subtarget.hasMips32r2()) + EFlags |= ELF::EF_MIPS_ARCH_32R2; + else + EFlags |= ELF::EF_MIPS_ARCH_32; + + if (Subtarget.inMicroMipsMode()) + EFlags |= ELF::EF_MIPS_MICROMIPS; + + // ABI + if (Subtarget.isABI_O32()) + EFlags |= ELF::EF_MIPS_ABI_O32; + + // Relocation Model + Reloc::Model RM = Subtarget.getRelocationModel(); + if (RM == Reloc::PIC_ || RM == Reloc::Default) + EFlags |= ELF::EF_MIPS_PIC; + else if (RM == Reloc::Static) + ; // Do nothing for Reloc::Static + else + llvm_unreachable("Unsupported relocation model for e_flags"); + + MCA.setELFHeaderEFlags(EFlags); + } + + // For llc. Set a symbol's STO flags + void + MipsELFStreamer::emitMipsSTOCG(const MipsSubtarget &Subtarget, + MCSymbol *Sym, + unsigned Val) { + + if (hasRawTextSupport()) + return; + + MCSymbolData &Data = getOrCreateSymbolData(Sym); + // The "other" values are stored in the last 6 bits of the second byte + // The traditional defines for STO values assume the full byte and thus + // the shift to pack it. 
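The >> 2 in the call below reflects the ELF st_other layout: the low 2 bits of that byte hold the symbol visibility, the upper 6 bits hold the processor-specific STO flags, while the traditional STO_* constants are defined over the full byte. A minimal standalone illustration of the packing, assuming plain C++ (packStOther and the constant shown are illustrative; the exact MCELF behavior should be checked against llvm/MC/MCELF.h):

#include <cassert>
#include <cstdint>

// st_other layout: [ STO flags : 6 | visibility : 2 ]
static uint8_t packStOther(uint8_t StoFullByte, uint8_t Visibility) {
  // Full-byte STO_* constants have zero low bits by construction, so
  // dropping them (>> 2) yields the 6-bit field value.
  return uint8_t((StoFullByte >> 2) << 2) | (Visibility & 0x3);
}

int main() {
  const uint8_t STO_MIPS_MICROMIPS = 0x80; // full-byte convention
  const uint8_t STV_DEFAULT = 0;
  assert(packStOther(STO_MIPS_MICROMIPS, STV_DEFAULT) == 0x80);
  return 0;
}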
+ MCELF::setOther(Data, Val >> 2); + } + +} // namespace llvm diff --git a/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h b/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h new file mode 100644 index 0000000..b10ccc7 --- /dev/null +++ b/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h @@ -0,0 +1,43 @@ +//=== MipsELFStreamer.h - MipsELFStreamer ------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENCE.TXT for details. +// +//===-------------------------------------------------------------------===// +#ifndef MIPSELFSTREAMER_H_ +#define MIPSELFSTREAMER_H_ + +#include "llvm/MC/MCELFStreamer.h" + +namespace llvm { +class MipsAsmPrinter; +class MipsSubtarget; +class MCSymbol; + +class MipsELFStreamer : public MCELFStreamer { +public: + MipsELFStreamer(MCContext &Context, MCAsmBackend &TAB, + raw_ostream &OS, MCCodeEmitter *Emitter, + bool RelaxAll, bool NoExecStack) + : MCELFStreamer(SK_MipsELFStreamer, Context, TAB, OS, Emitter) { + } + + ~MipsELFStreamer() {} + void emitELFHeaderFlagsCG(const MipsSubtarget &Subtarget); + void emitMipsSTOCG(const MipsSubtarget &Subtarget, + MCSymbol *Sym, + unsigned Val); + + static bool classof(const MCStreamer *S) { + return S->getKind() == SK_MipsELFStreamer; + } +}; + + MCELFStreamer* createMipsELFStreamer(MCContext &Context, MCAsmBackend &TAB, + raw_ostream &OS, MCCodeEmitter *Emitter, + bool RelaxAll, bool NoExecStack); +} + +#endif /* MIPSELFSTREAMER_H_ */ diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp index a679749..5d4b32d 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp @@ -25,8 +25,9 @@ MipsMCAsmInfo::MipsMCAsmInfo(const Target &T, StringRef TT) { IsLittleEndian = false; if ((TheTriple.getArch() == Triple::mips64el) || - (TheTriple.getArch() == Triple::mips64)) - PointerSize = 8; + (TheTriple.getArch() == Triple::mips64)) { + PointerSize = CalleeSaveStackSlotSize = 8; + } AlignmentIsInBytes = false; Data16bitsDirective = "\t.2byte\t"; diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp index 4b68b7e..96f93a0 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp @@ -38,7 +38,8 @@ class MipsMCCodeEmitter : public MCCodeEmitter { bool IsLittleEndian; public: - MipsMCCodeEmitter(const MCInstrInfo &mcii, MCContext &Ctx_, bool IsLittle) : + MipsMCCodeEmitter(const MCInstrInfo &mcii, MCContext &Ctx_, + const MCSubtargetInfo &sti, bool IsLittle) : MCII(mcii), Ctx(Ctx_), IsLittleEndian(IsLittle) {} ~MipsMCCodeEmitter() {} @@ -95,7 +96,7 @@ MCCodeEmitter *llvm::createMipsMCCodeEmitterEB(const MCInstrInfo &MCII, const MCSubtargetInfo &STI, MCContext &Ctx) { - return new MipsMCCodeEmitter(MCII, Ctx, false); + return new MipsMCCodeEmitter(MCII, Ctx, STI, false); } MCCodeEmitter *llvm::createMipsMCCodeEmitterEL(const MCInstrInfo &MCII, @@ -103,7 +104,7 @@ MCCodeEmitter *llvm::createMipsMCCodeEmitterEL(const MCInstrInfo &MCII, const MCSubtargetInfo &STI, MCContext &Ctx) { - return new MipsMCCodeEmitter(MCII, Ctx, true); + return new MipsMCCodeEmitter(MCII, Ctx, STI, true); } /// EncodeInstruction - Emit the instruction. 
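For reference, the e_flags selection implemented by emitELFHeaderFlagsCG in the new streamer above reduces to a priority chain over the subtarget features. A condensed sketch, assuming plain C++; the Features struct is a hypothetical stand-in for the MipsSubtarget queries, and the numeric flag values are the standard MIPS ELF ones (verify against llvm/Support/ELF.h):

#include <cstdint>

// Subset of the standard MIPS e_flags.
enum : uint32_t {
  EF_MIPS_NOREORDER    = 0x00000001,
  EF_MIPS_PIC          = 0x00000002,
  EF_MIPS_ABI_O32      = 0x00001000,
  EF_MIPS_MICROMIPS    = 0x02000000,
  EF_MIPS_ARCH_ASE_M16 = 0x04000000,
  EF_MIPS_ARCH_32      = 0x50000000,
  EF_MIPS_ARCH_64      = 0x60000000,
  EF_MIPS_ARCH_32R2    = 0x70000000,
  EF_MIPS_ARCH_64R2    = 0x80000000,
};

struct Features { // hypothetical stand-in for MipsSubtarget queries
  bool Mips16, MicroMips, Mips64r2, Mips64, Mips32r2, AbiO32, Pic;
};

static uint32_t computeEFlags(const Features &F) {
  uint32_t Flags = F.Mips16 ? EF_MIPS_ARCH_ASE_M16 : EF_MIPS_NOREORDER;
  // Highest supported architecture revision wins.
  Flags |= F.Mips64r2 ? EF_MIPS_ARCH_64R2
         : F.Mips64   ? EF_MIPS_ARCH_64
         : F.Mips32r2 ? EF_MIPS_ARCH_32R2
                      : EF_MIPS_ARCH_32;
  if (F.MicroMips) Flags |= EF_MIPS_MICROMIPS;
  if (F.AbiO32)    Flags |= EF_MIPS_ABI_O32;
  if (F.Pic)       Flags |= EF_MIPS_PIC;
  return Flags;
}

int main() {
  Features F{false, false, false, false, /*Mips32r2=*/true,
             /*AbiO32=*/true, /*Pic=*/true};
  return computeEFlags(F) == (EF_MIPS_NOREORDER | EF_MIPS_ARCH_32R2 |
                              EF_MIPS_ABI_O32 | EF_MIPS_PIC) ? 0 : 1;
}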
@@ -141,12 +142,6 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, llvm_unreachable("unimplemented opcode in EncodeInstruction()"); const MCInstrDesc &Desc = MCII.get(TmpInst.getOpcode()); - uint64_t TSFlags = Desc.TSFlags; - - // Pseudo instructions don't get encoded and shouldn't be here - // in the first place! - if ((TSFlags & MipsII::FormMask) == MipsII::Pseudo) - llvm_unreachable("Pseudo opcode found in EncodeInstruction()"); // Get byte count of instruction unsigned Size = Desc.getSize(); diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp index 9360971..be83b54 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp @@ -11,6 +11,7 @@ // //===----------------------------------------------------------------------===// +#include "MCTargetDesc/MipsELFStreamer.h" #include "MipsMCTargetDesc.h" #include "InstPrinter/MipsInstPrinter.h" #include "MipsMCAsmInfo.h" @@ -131,7 +132,7 @@ static MCStreamer *createMCStreamer(const Target &T, StringRef TT, bool NoExecStack) { Triple TheTriple(TT); - return createELFStreamer(Ctx, MAB, _OS, _Emitter, RelaxAll, NoExecStack); + return createMipsELFStreamer(Ctx, MAB, _OS, _Emitter, RelaxAll, NoExecStack); } extern "C" void LLVMInitializeMipsTargetMC() { diff --git a/lib/Target/Mips/MCTargetDesc/MipsReginfo.cpp b/lib/Target/Mips/MCTargetDesc/MipsReginfo.cpp new file mode 100644 index 0000000..1dc9bcb --- /dev/null +++ b/lib/Target/Mips/MCTargetDesc/MipsReginfo.cpp @@ -0,0 +1,80 @@ +//===-- MipsReginfo.cpp - Registerinfo handling --------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +// .reginfo +// Elf32_Word ri_gprmask +// Elf32_Word ri_cprmask[4] +// Elf32_Word ri_gp_value +// +// .MIPS.options - N64 +// Elf64_Byte kind (ODK_REGINFO) +// Elf64_Byte size (40 bytes) +// Elf64_Section section (0) +// Elf64_Word info (unused) +// Elf64_Word ri_gprmask () +// Elf64_Word ri_pad () +// Elf64_Word[4] ri_cprmask () +// Elf64_Addr ri_gp_value () +// +// .MIPS.options - N32 +// Elf32_Byte kind (ODK_REGINFO) +// Elf32_Byte size (36 bytes) +// Elf32_Section section (0) +// Elf32_Word info (unused) +// Elf32_Word ri_gprmask () +// Elf32_Word ri_pad () +// Elf32_Word[4] ri_cprmask () +// Elf32_Addr ri_gp_value () +// +//===----------------------------------------------------------------------===// +#include "MCTargetDesc/MipsReginfo.h" +#include "MipsSubtarget.h" +#include "MipsTargetObjectFile.h" +#include "llvm/MC/MCStreamer.h" + +using namespace llvm; + +// Integrated assembler version +void +MipsReginfo::emitMipsReginfoSectionCG(MCStreamer &OS, + const TargetLoweringObjectFile &TLOF, + const MipsSubtarget &MST) const +{ + + if (OS.hasRawTextSupport()) + return; + + const MipsTargetObjectFile &TLOFELF = + static_cast<const MipsTargetObjectFile &>(TLOF); + OS.SwitchSection(TLOFELF.getReginfoSection()); + + // .reginfo + if (MST.isABI_O32()) { + OS.EmitIntValue(0, 4); // ri_gprmask + OS.EmitIntValue(0, 4); // ri_cpr[0]mask + OS.EmitIntValue(0, 4); // ri_cpr[1]mask + OS.EmitIntValue(0, 4); // ri_cpr[2]mask + OS.EmitIntValue(0, 4); // ri_cpr[3]mask + OS.EmitIntValue(0, 4); // ri_gp_value + } + // .MIPS.options + else if (MST.isABI_N64()) { + OS.EmitIntValue(1, 1); // kind + OS.EmitIntValue(40, 1); // size + OS.EmitIntValue(0, 2); // section + OS.EmitIntValue(0, 4); // info + OS.EmitIntValue(0, 4); // ri_gprmask + OS.EmitIntValue(0, 4); // pad + OS.EmitIntValue(0, 4); // ri_cpr[0]mask + OS.EmitIntValue(0, 4); // ri_cpr[1]mask + OS.EmitIntValue(0, 4); // ri_cpr[2]mask + OS.EmitIntValue(0, 4); // ri_cpr[3]mask + OS.EmitIntValue(0, 8); // ri_gp_value + } + else llvm_unreachable("Unsupported abi for reginfo"); +} + diff --git a/lib/Target/Mips/MCTargetDesc/MipsReginfo.h b/lib/Target/Mips/MCTargetDesc/MipsReginfo.h new file mode 100644 index 0000000..039b8ea --- /dev/null +++ b/lib/Target/Mips/MCTargetDesc/MipsReginfo.h @@ -0,0 +1,31 @@ +//=== MipsReginfo.h - MipsReginfo -----------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENCE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef MIPSREGINFO_H +#define MIPSREGINFO_H + +namespace llvm { + class MCStreamer; + class TargetLoweringObjectFile; + class MipsSubtarget; + + class MipsReginfo { + void anchor(); + public: + MipsReginfo() {} + + void emitMipsReginfoSectionCG(MCStreamer &OS, + const TargetLoweringObjectFile &TLOF, + const MipsSubtarget &MST) const; + }; + +} // namespace llvm + +#endif + diff --git a/lib/Target/Mips/Mips.td b/lib/Target/Mips/Mips.td index 23e2a94..1326623 100644 --- a/lib/Target/Mips/Mips.td +++ b/lib/Target/Mips/Mips.td @@ -80,6 +80,9 @@ def FeatureDSP : SubtargetFeature<"dsp", "HasDSP", "true", "Mips DSP ASE">; def FeatureDSPR2 : SubtargetFeature<"dspr2", "HasDSPR2", "true", "Mips DSP-R2 ASE", [FeatureDSP]>; +def FeatureMicroMips : SubtargetFeature<"micromips", "InMicroMipsMode", "true", + "microMips mode">; + //===----------------------------------------------------------------------===// // Mips processors supported. 
//===----------------------------------------------------------------------===// diff --git a/lib/Target/Mips/Mips16FrameLowering.cpp b/lib/Target/Mips/Mips16FrameLowering.cpp index 127fcf2..1bb6fe4 100644 --- a/lib/Target/Mips/Mips16FrameLowering.cpp +++ b/lib/Target/Mips/Mips16FrameLowering.cpp @@ -139,6 +139,25 @@ bool Mips16FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB, return true; } +// Eliminate ADJCALLSTACKDOWN, ADJCALLSTACKUP pseudo instructions +void Mips16FrameLowering:: +eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const { + if (!hasReservedCallFrame(MF)) { + int64_t Amount = I->getOperand(0).getImm(); + + if (I->getOpcode() == Mips::ADJCALLSTACKDOWN) + Amount = -Amount; + + const Mips16InstrInfo &TII = + *static_cast<const Mips16InstrInfo*>(MF.getTarget().getInstrInfo()); + + TII.adjustStackPtr(Mips::SP, Amount, MBB, I); + } + + MBB.erase(I); +} + bool Mips16FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); diff --git a/lib/Target/Mips/Mips16FrameLowering.h b/lib/Target/Mips/Mips16FrameLowering.h index 01db71e..25f4ffb 100644 --- a/lib/Target/Mips/Mips16FrameLowering.h +++ b/lib/Target/Mips/Mips16FrameLowering.h @@ -27,6 +27,10 @@ public: void emitPrologue(MachineFunction &MF) const; void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const; + void eliminateCallFramePseudoInstr(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const; + bool spillCalleeSavedRegisters(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const std::vector<CalleeSavedInfo> &CSI, diff --git a/lib/Target/Mips/Mips16InstrFormats.td b/lib/Target/Mips/Mips16InstrFormats.td index 61602b6..4ff62ef 100644 --- a/lib/Target/Mips/Mips16InstrFormats.td +++ b/lib/Target/Mips/Mips16InstrFormats.td @@ -29,45 +29,13 @@ // //===----------------------------------------------------------------------===// -// Format specifies the encoding used by the instruction. This is part of the -// ad-hoc solution used to emit machine instruction encodings by our machine -// code emitter. -// -class Format16<bits<5> val> { - bits<5> Value = val; -} - -def Pseudo16 : Format16<0>; -def FrmI16 : Format16<1>; -def FrmRI16 : Format16<2>; -def FrmRR16 : Format16<3>; -def FrmRRI16 : Format16<4>; -def FrmRRR16 : Format16<5>; -def FrmRRI_A16 : Format16<6>; -def FrmSHIFT16 : Format16<7>; -def FrmI8_TYPE16 : Format16<8>; -def FrmI8_MOVR3216 : Format16<9>; -def FrmI8_MOV32R16 : Format16<10>; -def FrmI8_SVRS16 : Format16<11>; -def FrmJAL16 : Format16<12>; -def FrmJALX16 : Format16<13>; -def FrmEXT_I16 : Format16<14>; -def FrmASMACRO16 : Format16<15>; -def FrmEXT_RI16 : Format16<16>; -def FrmEXT_RRI16 : Format16<17>; -def FrmEXT_RRI_A16 : Format16<18>; -def FrmEXT_SHIFT16 : Format16<19>; -def FrmEXT_I816 : Format16<20>; -def FrmEXT_I8_SVRS16 : Format16<21>; -def FrmOther16 : Format16<22>; // Instruction w/ a custom format // Base class for Mips 16 Format // This class does not depend on the instruction size // class MipsInst16_Base<dag outs, dag ins, string asmstr, list<dag> pattern, - InstrItinClass itin, Format16 f>: Instruction + InstrItinClass itin>: Instruction { - Format16 Form = f; let Namespace = "Mips"; @@ -78,14 +46,6 @@ class MipsInst16_Base<dag outs, dag ins, string asmstr, list<dag> pattern, let Pattern = pattern; let Itinerary = itin; - // - // Attributes specific to Mips instructions... 
- // - bits<5> FormBits = Form.Value; - - // TSFlags layout should be kept in sync with MipsInstrInfo.h. - let TSFlags{4-0} = FormBits; - let Predicates = [InMips16Mode]; } @@ -93,30 +53,35 @@ class MipsInst16_Base<dag outs, dag ins, string asmstr, list<dag> pattern, // Generic Mips 16 Format // class MipsInst16<dag outs, dag ins, string asmstr, list<dag> pattern, - InstrItinClass itin, Format16 f>: - MipsInst16_Base<outs, ins, asmstr, pattern, itin, f> + InstrItinClass itin>: + MipsInst16_Base<outs, ins, asmstr, pattern, itin> { field bits<16> Inst; bits<5> Opcode = 0; // Top 5 bits are the 'opcode' field let Inst{15-11} = Opcode; + + let Size=2; + field bits<16> SoftFail = 0; } // // For 32 bit extended instruction forms. // class MipsInst16_32<dag outs, dag ins, string asmstr, list<dag> pattern, - InstrItinClass itin, Format16 f>: - MipsInst16_Base<outs, ins, asmstr, pattern, itin, f> + InstrItinClass itin>: + MipsInst16_Base<outs, ins, asmstr, pattern, itin> { field bits<32> Inst; - + + let Size=4; + field bits<32> SoftFail = 0; } class MipsInst16_EXTEND<dag outs, dag ins, string asmstr, list<dag> pattern, - InstrItinClass itin, Format16 f>: - MipsInst16_32<outs, ins, asmstr, pattern, itin, f> + InstrItinClass itin>: + MipsInst16_32<outs, ins, asmstr, pattern, itin> { let Inst{31-27} = 0b11110; } @@ -125,7 +90,7 @@ class MipsInst16_EXTEND<dag outs, dag ins, string asmstr, list<dag> pattern, // Mips Pseudo Instructions Format class MipsPseudo16<dag outs, dag ins, string asmstr, list<dag> pattern>: - MipsInst16<outs, ins, asmstr, pattern, IIPseudo, Pseudo16> { + MipsInst16<outs, ins, asmstr, pattern, IIPseudo> { let isCodeGenOnly = 1; let isPseudo = 1; } @@ -137,7 +102,7 @@ class MipsPseudo16<dag outs, dag ins, string asmstr, list<dag> pattern>: class FI16<bits<5> op, dag outs, dag ins, string asmstr, list<dag> pattern, InstrItinClass itin>: - MipsInst16<outs, ins, asmstr, pattern, itin, FrmI16> + MipsInst16<outs, ins, asmstr, pattern, itin> { bits<11> imm11; @@ -152,7 +117,7 @@ class FI16<bits<5> op, dag outs, dag ins, string asmstr, list<dag> pattern, class FRI16<bits<5> op, dag outs, dag ins, string asmstr, list<dag> pattern, InstrItinClass itin>: - MipsInst16<outs, ins, asmstr, pattern, itin, FrmRI16> + MipsInst16<outs, ins, asmstr, pattern, itin> { bits<3> rx; bits<8> imm8; @@ -169,7 +134,7 @@ class FRI16<bits<5> op, dag outs, dag ins, string asmstr, class FRR16<bits<5> _funct, dag outs, dag ins, string asmstr, list<dag> pattern, InstrItinClass itin>: - MipsInst16<outs, ins, asmstr, pattern, itin, FrmRR16> + MipsInst16<outs, ins, asmstr, pattern, itin> { bits<3> rx; bits<3> ry; @@ -188,7 +153,7 @@ class FRR16<bits<5> _funct, dag outs, dag ins, string asmstr, // class FRR_SF16<bits<5> _funct, bits<3> _subfunct, dag outs, dag ins, string asmstr, list<dag> pattern, InstrItinClass itin>: - MipsInst16<outs, ins, asmstr, pattern, itin, FrmRR16> + MipsInst16<outs, ins, asmstr, pattern, itin> { bits<3> rx; bits<3> subfunct; @@ -208,7 +173,7 @@ class FRR_SF16<bits<5> _funct, bits<3> _subfunct, dag outs, dag ins, // class FC16<bits<5> _funct, dag outs, dag ins, string asmstr, list<dag> pattern, InstrItinClass itin>: - MipsInst16<outs, ins, asmstr, pattern, itin, FrmRR16> + MipsInst16<outs, ins, asmstr, pattern, itin> { bits<6> _code; // code is a keyword in tablegen bits<5> funct; @@ -226,7 +191,7 @@ class FC16<bits<5> _funct, dag outs, dag ins, string asmstr, class FRR16_JALRC<bits<1> _nd, bits<1> _l, bits<1> r_a, dag outs, dag ins, string asmstr, list<dag> pattern, InstrItinClass itin>: - 
MipsInst16<outs, ins, asmstr, pattern, itin, FrmRR16> + MipsInst16<outs, ins, asmstr, pattern, itin> { bits<3> rx; bits<1> nd; @@ -252,7 +217,7 @@ class FRR16_JALRC<bits<1> _nd, bits<1> _l, bits<1> r_a, class FRRI16<bits<5> op, dag outs, dag ins, string asmstr, list<dag> pattern, InstrItinClass itin>: - MipsInst16<outs, ins, asmstr, pattern, itin, FrmRRI16> + MipsInst16<outs, ins, asmstr, pattern, itin> { bits<3> rx; bits<3> ry; @@ -272,7 +237,7 @@ class FRRI16<bits<5> op, dag outs, dag ins, string asmstr, class FRRR16<bits<2> _f, dag outs, dag ins, string asmstr, list<dag> pattern, InstrItinClass itin>: - MipsInst16<outs, ins, asmstr, pattern, itin, FrmRRR16> + MipsInst16<outs, ins, asmstr, pattern, itin> { bits<3> rx; bits<3> ry; @@ -294,7 +259,7 @@ class FRRR16<bits<2> _f, dag outs, dag ins, string asmstr, class FRRI_A16<bits<1> _f, dag outs, dag ins, string asmstr, list<dag> pattern, InstrItinClass itin>: - MipsInst16<outs, ins, asmstr, pattern, itin, FrmRRI_A16> + MipsInst16<outs, ins, asmstr, pattern, itin> { bits<3> rx; bits<3> ry; @@ -316,7 +281,7 @@ class FRRI_A16<bits<1> _f, dag outs, dag ins, string asmstr, class FSHIFT16<bits<2> _f, dag outs, dag ins, string asmstr, list<dag> pattern, InstrItinClass itin>: - MipsInst16<outs, ins, asmstr, pattern, itin, FrmSHIFT16> + MipsInst16<outs, ins, asmstr, pattern, itin> { bits<3> rx; bits<3> ry; @@ -338,7 +303,7 @@ class FSHIFT16<bits<2> _f, dag outs, dag ins, string asmstr, class FI816<bits<3> _func, dag outs, dag ins, string asmstr, list<dag> pattern, InstrItinClass itin>: - MipsInst16<outs, ins, asmstr, pattern, itin, FrmI8_TYPE16> + MipsInst16<outs, ins, asmstr, pattern, itin> { bits<3> func; bits<8> imm8; @@ -356,7 +321,7 @@ class FI816<bits<3> _func, dag outs, dag ins, string asmstr, class FI8_MOVR3216<dag outs, dag ins, string asmstr, list<dag> pattern, InstrItinClass itin>: - MipsInst16<outs, ins, asmstr, pattern, itin, FrmI8_MOVR3216> + MipsInst16<outs, ins, asmstr, pattern, itin> { bits<4> ry; @@ -378,7 +343,7 @@ class FI8_MOVR3216<dag outs, dag ins, string asmstr, class FI8_MOV32R16<dag outs, dag ins, string asmstr, list<dag> pattern, InstrItinClass itin>: - MipsInst16<outs, ins, asmstr, pattern, itin, FrmI8_MOV32R16> + MipsInst16<outs, ins, asmstr, pattern, itin> { bits<3> func; @@ -402,7 +367,7 @@ class FI8_MOV32R16<dag outs, dag ins, string asmstr, class FI8_SVRS16<bits<1> _s, dag outs, dag ins, string asmstr, list<dag> pattern, InstrItinClass itin>: - MipsInst16<outs, ins, asmstr, pattern, itin, FrmI8_SVRS16> + MipsInst16<outs, ins, asmstr, pattern, itin> { bits<1> s; bits<1> ra = 0; @@ -429,7 +394,7 @@ class FI8_SVRS16<bits<1> _s, dag outs, dag ins, string asmstr, class FJAL16<bits<1> _X, dag outs, dag ins, string asmstr, list<dag> pattern, InstrItinClass itin>: - MipsInst16_32<outs, ins, asmstr, pattern, itin, FrmJAL16> + MipsInst16_32<outs, ins, asmstr, pattern, itin> { bits<1> X; bits<26> imm26; @@ -452,7 +417,7 @@ class FJAL16<bits<1> _X, dag outs, dag ins, string asmstr, class FEXT_I16<bits<5> _eop, dag outs, dag ins, string asmstr, list<dag> pattern, InstrItinClass itin>: - MipsInst16_EXTEND<outs, ins, asmstr, pattern, itin, FrmEXT_I16> + MipsInst16_EXTEND<outs, ins, asmstr, pattern, itin> { bits<16> imm16; bits<5> eop; @@ -474,7 +439,7 @@ class FEXT_I16<bits<5> _eop, dag outs, dag ins, string asmstr, class FASMACRO16<dag outs, dag ins, string asmstr, list<dag> pattern, InstrItinClass itin>: - MipsInst16_EXTEND<outs, ins, asmstr, pattern, itin, FrmASMACRO16> + MipsInst16_EXTEND<outs, ins, asmstr, pattern, itin> { 
bits<3> select; bits<3> p4; @@ -503,7 +468,7 @@ class FASMACRO16<dag outs, dag ins, string asmstr, class FEXT_RI16<bits<5> _op, dag outs, dag ins, string asmstr, list<dag> pattern, InstrItinClass itin>: - MipsInst16_EXTEND<outs, ins, asmstr, pattern, itin, FrmEXT_RI16> + MipsInst16_EXTEND<outs, ins, asmstr, pattern, itin> { bits<16> imm16; bits<5> op; @@ -527,7 +492,7 @@ class FEXT_RI16<bits<5> _op, dag outs, dag ins, string asmstr, class FEXT_RRI16<bits<5> _op, dag outs, dag ins, string asmstr, list<dag> pattern, InstrItinClass itin>: - MipsInst16_EXTEND<outs, ins, asmstr, pattern, itin, FrmEXT_RRI16> + MipsInst16_EXTEND<outs, ins, asmstr, pattern, itin> { bits<5> op; bits<16> imm16; @@ -552,7 +517,7 @@ class FEXT_RRI16<bits<5> _op, dag outs, dag ins, string asmstr, class FEXT_RRI_A16<bits<1> _f, dag outs, dag ins, string asmstr, list<dag> pattern, InstrItinClass itin>: - MipsInst16_EXTEND<outs, ins, asmstr, pattern, itin, FrmEXT_RRI_A16> + MipsInst16_EXTEND<outs, ins, asmstr, pattern, itin> { bits<15> imm15; bits<3> rx; @@ -578,7 +543,7 @@ class FEXT_RRI_A16<bits<1> _f, dag outs, dag ins, string asmstr, class FEXT_SHIFT16<bits<2> _f, dag outs, dag ins, string asmstr, list<dag> pattern, InstrItinClass itin>: - MipsInst16_EXTEND<outs, ins, asmstr, pattern, itin, FrmEXT_SHIFT16> + MipsInst16_EXTEND<outs, ins, asmstr, pattern, itin> { bits<6> sa6; bits<3> rx; @@ -605,7 +570,7 @@ class FEXT_SHIFT16<bits<2> _f, dag outs, dag ins, string asmstr, class FEXT_I816<bits<3> _funct, dag outs, dag ins, string asmstr, list<dag> pattern, InstrItinClass itin>: - MipsInst16_EXTEND<outs, ins, asmstr, pattern, itin, FrmEXT_I816> + MipsInst16_EXTEND<outs, ins, asmstr, pattern, itin> { bits<16> imm16; bits<5> I8; @@ -630,7 +595,7 @@ class FEXT_I816<bits<3> _funct, dag outs, dag ins, string asmstr, class FEXT_I8_SVRS16<bits<1> s_, dag outs, dag ins, string asmstr, list<dag> pattern, InstrItinClass itin>: - MipsInst16_EXTEND<outs, ins, asmstr, pattern, itin, FrmI8_SVRS16> + MipsInst16_EXTEND<outs, ins, asmstr, pattern, itin> { bits<3> xsregs =0; bits<8> framesize =0; @@ -659,5 +624,3 @@ class FEXT_I8_SVRS16<bits<1> s_, dag outs, dag ins, string asmstr, } - - diff --git a/lib/Target/Mips/Mips16InstrInfo.cpp b/lib/Target/Mips/Mips16InstrInfo.cpp index 91b5ba0..fd3cc8f 100644 --- a/lib/Target/Mips/Mips16InstrInfo.cpp +++ b/lib/Target/Mips/Mips16InstrInfo.cpp @@ -19,7 +19,9 @@ #include "llvm/ADT/StringRef.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RegisterScavenging.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/TargetRegistry.h" @@ -28,7 +30,8 @@ using namespace llvm; static cl::opt<bool> NeverUseSaveRestore( "mips16-never-use-save-restore", cl::init(false), - cl::desc("For testing ability to adjust stack pointer without save/restore instruction"), + cl::desc("For testing ability to adjust stack pointer " + "without save/restore instruction"), cl::Hidden); @@ -129,7 +132,6 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, bool Mips16InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { MachineBasicBlock &MBB = *MI->getParent(); - switch(MI->getDesc().getOpcode()) { default: return false; @@ -169,19 +171,20 @@ unsigned Mips16InstrInfo::GetOppositeBranchOpc(unsigned Opc) const { } // Adjust SP by FrameSize bytes. 
Save RA, S0, S1 -void Mips16InstrInfo::makeFrame(unsigned SP, int64_t FrameSize, MachineBasicBlock &MBB, +void Mips16InstrInfo::makeFrame(unsigned SP, int64_t FrameSize, + MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const { DebugLoc DL = I != MBB.end() ? I->getDebugLoc() : DebugLoc(); if (!NeverUseSaveRestore) { if (isUInt<11>(FrameSize)) BuildMI(MBB, I, DL, get(Mips::SaveRaF16)).addImm(FrameSize); else { - int Base = 2040; // should create template function like isUInt that returns largest - // possible n bit unsigned integer + int Base = 2040; // should create template function like isUInt that + // returns largest possible n bit unsigned integer int64_t Remainder = FrameSize - Base; BuildMI(MBB, I, DL, get(Mips::SaveRaF16)). addImm(Base); if (isInt<16>(-Remainder)) - BuildMI(MBB, I, DL, get(Mips::AddiuSpImmX16)). addImm(-Remainder); + BuildAddiuSpImm(MBB, I, -Remainder); else adjustStackPtrBig(SP, -Remainder, MBB, I, Mips::V0, Mips::V1); } @@ -193,13 +196,16 @@ void Mips16InstrInfo::makeFrame(unsigned SP, int64_t FrameSize, MachineBasicBloc // sw s1, -8[sp] // sw s0, -12[sp] - MachineInstrBuilder MIB1 = BuildMI(MBB, I, DL, get(Mips::SwRxSpImmX16), Mips::RA); + MachineInstrBuilder MIB1 = BuildMI(MBB, I, DL, get(Mips::SwRxSpImmX16), + Mips::RA); MIB1.addReg(Mips::SP); MIB1.addImm(-4); - MachineInstrBuilder MIB2 = BuildMI(MBB, I, DL, get(Mips::SwRxSpImmX16), Mips::S1); + MachineInstrBuilder MIB2 = BuildMI(MBB, I, DL, get(Mips::SwRxSpImmX16), + Mips::S1); MIB2.addReg(Mips::SP); MIB2.addImm(-8); - MachineInstrBuilder MIB3 = BuildMI(MBB, I, DL, get(Mips::SwRxSpImmX16), Mips::S0); + MachineInstrBuilder MIB3 = BuildMI(MBB, I, DL, get(Mips::SwRxSpImmX16), + Mips::S0); MIB3.addReg(Mips::SP); MIB3.addImm(-12); adjustStackPtrBig(SP, -FrameSize, MBB, I, Mips::V0, Mips::V1); @@ -207,18 +213,19 @@ void Mips16InstrInfo::makeFrame(unsigned SP, int64_t FrameSize, MachineBasicBloc } // Adjust SP by FrameSize bytes. Restore RA, S0, S1 -void Mips16InstrInfo::restoreFrame(unsigned SP, int64_t FrameSize, MachineBasicBlock &MBB, - MachineBasicBlock::iterator I) const { +void Mips16InstrInfo::restoreFrame(unsigned SP, int64_t FrameSize, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const { DebugLoc DL = I != MBB.end() ? I->getDebugLoc() : DebugLoc(); if (!NeverUseSaveRestore) { if (isUInt<11>(FrameSize)) BuildMI(MBB, I, DL, get(Mips::RestoreRaF16)).addImm(FrameSize); else { - int Base = 2040; // should create template function like isUInt that returns largest - // possible n bit unsigned integer + int Base = 2040; // should create template function like isUInt that + // returns largest possible n bit unsigned integer int64_t Remainder = FrameSize - Base; if (isInt<16>(Remainder)) - BuildMI(MBB, I, DL, get(Mips::AddiuSpImmX16)). addImm(Remainder); + BuildAddiuSpImm(MBB, I, Remainder); else adjustStackPtrBig(SP, Remainder, MBB, I, Mips::A0, Mips::A1); BuildMI(MBB, I, DL, get(Mips::RestoreRaF16)). 
addImm(Base); @@ -229,15 +236,19 @@ void Mips16InstrInfo::restoreFrame(unsigned SP, int64_t FrameSize, MachineBasicB // lw ra, -4[sp] // lw s1, -8[sp] // lw s0, -12[sp] - MachineInstrBuilder MIB1 = BuildMI(MBB, I, DL, get(Mips::LwRxSpImmX16), Mips::A0); + MachineInstrBuilder MIB1 = BuildMI(MBB, I, DL, get(Mips::LwRxSpImmX16), + Mips::A0); MIB1.addReg(Mips::SP); MIB1.addImm(-4); - MachineInstrBuilder MIB0 = BuildMI(MBB, I, DL, get(Mips::Move32R16), Mips::RA); + MachineInstrBuilder MIB0 = BuildMI(MBB, I, DL, get(Mips::Move32R16), + Mips::RA); MIB0.addReg(Mips::A0); - MachineInstrBuilder MIB2 = BuildMI(MBB, I, DL, get(Mips::LwRxSpImmX16), Mips::S1); + MachineInstrBuilder MIB2 = BuildMI(MBB, I, DL, get(Mips::LwRxSpImmX16), + Mips::S1); MIB2.addReg(Mips::SP); MIB2.addImm(-8); - MachineInstrBuilder MIB3 = BuildMI(MBB, I, DL, get(Mips::LwRxSpImmX16), Mips::S0); + MachineInstrBuilder MIB3 = BuildMI(MBB, I, DL, get(Mips::LwRxSpImmX16), + Mips::S0); MIB3.addReg(Mips::SP); MIB3.addImm(-12); } @@ -245,10 +256,12 @@ void Mips16InstrInfo::restoreFrame(unsigned SP, int64_t FrameSize, MachineBasicB } // Adjust SP by Amount bytes where bytes can be up to 32bit number. -// This can only be called at times that we know that there is at least one free register. +// This can only be called at times that we know that there is at least one free +// register. // This is clearly safe at prologue and epilogue. // -void Mips16InstrInfo::adjustStackPtrBig(unsigned SP, int64_t Amount, MachineBasicBlock &MBB, +void Mips16InstrInfo::adjustStackPtrBig(unsigned SP, int64_t Amount, + MachineBasicBlock &MBB, MachineBasicBlock::iterator I, unsigned Reg1, unsigned Reg2) const { DebugLoc DL = I != MBB.end() ? I->getDebugLoc() : DebugLoc(); @@ -269,11 +282,13 @@ void Mips16InstrInfo::adjustStackPtrBig(unsigned SP, int64_t Amount, MachineBasi MachineInstrBuilder MIB3 = BuildMI(MBB, I, DL, get(Mips::AdduRxRyRz16), Reg1); MIB3.addReg(Reg1); MIB3.addReg(Reg2, RegState::Kill); - MachineInstrBuilder MIB4 = BuildMI(MBB, I, DL, get(Mips::Move32R16), Mips::SP); + MachineInstrBuilder MIB4 = BuildMI(MBB, I, DL, get(Mips::Move32R16), + Mips::SP); MIB4.addReg(Reg1, RegState::Kill); } -void Mips16InstrInfo::adjustStackPtrBigUnrestricted(unsigned SP, int64_t Amount, MachineBasicBlock &MBB, +void Mips16InstrInfo::adjustStackPtrBigUnrestricted(unsigned SP, int64_t Amount, + MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const { assert(false && "adjust stack pointer amount exceeded"); } @@ -282,9 +297,8 @@ void Mips16InstrInfo::adjustStackPtrBigUnrestricted(unsigned SP, int64_t Amount, void Mips16InstrInfo::adjustStackPtr(unsigned SP, int64_t Amount, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const { - DebugLoc DL = I != MBB.end() ? I->getDebugLoc() : DebugLoc(); if (isInt<16>(Amount)) // need to change to addiu sp, ....and isInt<16> - BuildMI(MBB, I, DL, get(Mips::AddiuSpImmX16)). addImm(Amount); + BuildAddiuSpImm(MBB, I, Amount); else adjustStackPtrBigUnrestricted(SP, Amount, MBB, I); } @@ -292,11 +306,79 @@ void Mips16InstrInfo::adjustStackPtr(unsigned SP, int64_t Amount, /// This function generates the sequence of instructions needed to get the /// result of adding register REG and immediate IMM. 
unsigned -Mips16InstrInfo::loadImmediate(int64_t Imm, MachineBasicBlock &MBB, +Mips16InstrInfo::loadImmediate(unsigned FrameReg, + int64_t Imm, MachineBasicBlock &MBB, MachineBasicBlock::iterator II, DebugLoc DL, - unsigned *NewImm) const { + unsigned &NewImm) const { + // + // given original instruction is: + // Instr rx, T[offset] where offset is too big. + // + // lo = offset & 0xFFFF + // hi = ((offset >> 16) + (lo >> 15)) & 0xFFFF; + // + // let T = temporary register + // li T, hi + // shl T, 16 + // add T, Rx, T + // + RegScavenger rs; + int32_t lo = Imm & 0xFFFF; + int32_t hi = ((Imm >> 16) + (lo >> 15)) & 0xFFFF; + NewImm = lo; + unsigned Reg =0; + unsigned SpReg = 0; + rs.enterBasicBlock(&MBB); + rs.forward(II); + // + // we use T0 for the first register, if we need to save something away. + // we use T1 for the second register, if we need to save something away. + // + unsigned FirstRegSaved =0, SecondRegSaved=0; + unsigned FirstRegSavedTo = 0, SecondRegSavedTo = 0; + + Reg = rs.FindUnusedReg(&Mips::CPU16RegsRegClass); + if (Reg == 0) { + FirstRegSaved = Reg = Mips::V0; + FirstRegSavedTo = Mips::T0; + copyPhysReg(MBB, II, DL, FirstRegSavedTo, FirstRegSaved, true); + } + else + rs.setUsed(Reg); + BuildMI(MBB, II, DL, get(Mips::LiRxImmX16), Reg).addImm(hi); + BuildMI(MBB, II, DL, get(Mips::SllX16), Reg).addReg(Reg). + addImm(16); + if (FrameReg == Mips::SP) { + SpReg = rs.FindUnusedReg(&Mips::CPU16RegsRegClass); + if (SpReg == 0) { + if (Reg != Mips::V1) { + SecondRegSaved = SpReg = Mips::V1; + SecondRegSavedTo = Mips::T1; + } + else { + SecondRegSaved = SpReg = Mips::V0; + SecondRegSavedTo = Mips::T0; + } + copyPhysReg(MBB, II, DL, SecondRegSavedTo, SecondRegSaved, true); + } + else + rs.setUsed(SpReg); - return 0; + copyPhysReg(MBB, II, DL, SpReg, Mips::SP, false); + BuildMI(MBB, II, DL, get(Mips:: AdduRxRyRz16), Reg).addReg(SpReg) + .addReg(Reg); + } + else + BuildMI(MBB, II, DL, get(Mips:: AdduRxRyRz16), Reg).addReg(FrameReg) + .addReg(Reg, RegState::Kill); + if (FirstRegSaved || SecondRegSaved) { + II = llvm::next(II); + if (FirstRegSaved) + copyPhysReg(MBB, II, DL, FirstRegSaved, FirstRegSavedTo, true); + if (SecondRegSaved) + copyPhysReg(MBB, II, DL, SecondRegSaved, SecondRegSavedTo, true); + } + return Reg; } unsigned Mips16InstrInfo::GetAnalyzableBrOpc(unsigned Opc) const { @@ -317,6 +399,20 @@ void Mips16InstrInfo::ExpandRetRA16(MachineBasicBlock &MBB, BuildMI(MBB, I, I->getDebugLoc(), get(Opc)); } + +const MCInstrDesc &Mips16InstrInfo::AddiuSpImm(int64_t Imm) const { + if (validSpImm8(Imm)) + return get(Mips::AddiuSpImm16); + else + return get(Mips::AddiuSpImmX16); +} + +void Mips16InstrInfo::BuildAddiuSpImm + (MachineBasicBlock &MBB, MachineBasicBlock::iterator I, int64_t Imm) const { + DebugLoc DL = I != MBB.end() ? I->getDebugLoc() : DebugLoc(); + BuildMI(MBB, I, DL, AddiuSpImm(Imm)).addImm(Imm); +} + const MipsInstrInfo *llvm::createMips16InstrInfo(MipsTargetMachine &TM) { return new Mips16InstrInfo(TM); } diff --git a/lib/Target/Mips/Mips16InstrInfo.h b/lib/Target/Mips/Mips16InstrInfo.h index 3704e25..1cb1dfe 100644 --- a/lib/Target/Mips/Mips16InstrInfo.h +++ b/lib/Target/Mips/Mips16InstrInfo.h @@ -77,12 +77,27 @@ public: void adjustStackPtr(unsigned SP, int64_t Amount, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const; - /// Emit a series of instructions to load an immediate. If NewImm is a - /// non-NULL parameter, the last instruction is not emitted, but instead - /// its immediate operand is returned in NewImm. 
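BuildAddiuSpImm above picks between the 16-bit AddiuSpImm16 encoding and the 32-bit extended AddiuSpImmX16 form. The validSpImm8 test — a multiple of 8 that fits in 11 signed bits — matches reading the short form's immediate as a signed 8-bit field scaled by 8 (my inference from the test; the encoding should be checked against the MIPS16 spec). A standalone restatement, assuming plain C++:

#include <cassert>

// True when Offset fits the short ADDIU-sp form: a signed 8-bit field
// scaled by 8, i.e. a multiple of 8 in [-1024, 1016].
static bool validSpImm8(int Offset) {
  return (Offset & 7) == 0 && Offset >= -1024 && Offset < 1024;
}

int main() {
  assert(validSpImm8(-1024) && validSpImm8(1016) && validSpImm8(0));
  assert(!validSpImm8(4));    // not a multiple of 8 -> extended form
  assert(!validSpImm8(1024)); // out of range        -> extended form
  return 0;
}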
diff --git a/lib/Target/Mips/Mips16InstrInfo.h b/lib/Target/Mips/Mips16InstrInfo.h
index 3704e25..1cb1dfe 100644
--- a/lib/Target/Mips/Mips16InstrInfo.h
+++ b/lib/Target/Mips/Mips16InstrInfo.h
@@ -77,12 +77,27 @@ public:
   void adjustStackPtr(unsigned SP, int64_t Amount, MachineBasicBlock &MBB,
                       MachineBasicBlock::iterator I) const;
 
-  /// Emit a series of instructions to load an immediate. If NewImm is a
-  /// non-NULL parameter, the last instruction is not emitted, but instead
-  /// its immediate operand is returned in NewImm.
-  unsigned loadImmediate(int64_t Imm, MachineBasicBlock &MBB,
+  /// Emit a series of instructions to load an immediate.
+  /// This is used to adjust some FrameReg: the return value is the new
+  /// register to use in place of FrameReg, and the adjusted immediate
+  /// field is returned in NewImm.
+  unsigned loadImmediate(unsigned FrameReg,
+                         int64_t Imm, MachineBasicBlock &MBB,
                          MachineBasicBlock::iterator II, DebugLoc DL,
-                         unsigned *NewImm) const;
+                         unsigned &NewImm) const;
+
+  static bool validSpImm8(int offset) {
+    return ((offset & 7) == 0) && isInt<11>(offset);
+  }
+
+  // Build the proper AddiuSpImm variant based on the Imm field.
+  const MCInstrDesc &AddiuSpImm(int64_t Imm) const;
+
+  void BuildAddiuSpImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                       int64_t Imm) const;
 
 private:
   virtual unsigned GetAnalyzableBrOpc(unsigned Opc) const;
 
@@ -100,7 +115,6 @@ private:
                                     MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator I) const;
 
-
 };
 
 }
diff --git a/lib/Target/Mips/Mips16InstrInfo.td b/lib/Target/Mips/Mips16InstrInfo.td
index e8e2f3c..a9e9c52 100644
--- a/lib/Target/Mips/Mips16InstrInfo.td
+++ b/lib/Target/Mips/Mips16InstrInfo.td
@@ -32,18 +32,76 @@ def mem16_ea : Operand<i32> {
 }
 
 //
+//
+// I8 instruction format
+//
+
+class FI816_ins_base<bits<3> _func, string asmstr,
+                     string asmstr2, InstrItinClass itin>:
+  FI816<_func, (outs), (ins simm16:$imm), !strconcat(asmstr, asmstr2),
+        [], itin>;
+
+
+class FI816_SP_ins<bits<3> _func, string asmstr,
+                   InstrItinClass itin>:
+  FI816_ins_base<_func, asmstr, "\t$$sp, $imm # 16 bit inst", itin>;
+
+//
+// RI instruction format
+//
+
+
+class FRI16_ins_base<bits<5> op, string asmstr, string asmstr2,
+                     InstrItinClass itin>:
+  FRI16<op, (outs CPU16Regs:$rx), (ins simm16:$imm),
+        !strconcat(asmstr, asmstr2), [], itin>;
+
+class FRI16_ins<bits<5> op, string asmstr,
+                InstrItinClass itin>:
+  FRI16_ins_base<op, asmstr, "\t$rx, $imm \t# 16 bit inst", itin>;
+
+class FRI16R_ins_base<bits<5> op, string asmstr, string asmstr2,
+                      InstrItinClass itin>:
+  FRI16<op, (outs), (ins CPU16Regs:$rx, simm16:$imm),
+        !strconcat(asmstr, asmstr2), [], itin>;
+
+class FRI16R_ins<bits<5> op, string asmstr,
+                 InstrItinClass itin>:
+  FRI16R_ins_base<op, asmstr, "\t$rx, $imm \t# 16 bit inst", itin>;
+
+class F2RI16_ins<bits<5> _op, string asmstr,
+                 InstrItinClass itin>:
+  FRI16<_op, (outs CPU16Regs:$rx), (ins CPU16Regs:$rx_, simm16:$imm),
+        !strconcat(asmstr, "\t$rx, $imm\t# 16 bit inst"), [], itin> {
+  let Constraints = "$rx_ = $rx";
+}
+
+class FRI16_B_ins<bits<5> _op, string asmstr,
+                  InstrItinClass itin>:
+  FRI16<_op, (outs), (ins CPU16Regs:$rx, brtarget:$imm),
+        !strconcat(asmstr, "\t$rx, $imm # 16 bit inst"), [], itin>;
+//
 // Compare a register and immediate and place result in CC
 // Implicit use of T8
 //
 // EXT-CCRR Instruction format
 //
-class FEXT_CCRXI16_ins<bits<5> _op, string asmstr,
-                       InstrItinClass itin>:
-  FEXT_RI16<_op, (outs CPU16Regs:$cc), (ins CPU16Regs:$rx, simm16:$imm),
-            !strconcat(asmstr, "\t$rx, $imm\n\tmove\t$cc, $$t8"), [], itin> {
+class FEXT_CCRXI16_ins<string asmstr>:
+  MipsPseudo16<(outs CPU16Regs:$cc), (ins CPU16Regs:$rx, simm16:$imm),
+               !strconcat(asmstr, "\t$rx, $imm\n\tmove\t$cc, $$t8"), []> {
   let isCodeGenOnly=1;
+  let usesCustomInserter = 1;
 }
 
+// JAL and JALX instruction format
+//
+class FJAL16_ins<bits<1> _X, string asmstr,
+                 InstrItinClass itin>:
+  FJAL16<_X, (outs), (ins simm20:$imm),
+         !strconcat(asmstr, "\t$imm\n\tnop"),[],
+         itin> {
+  let isCodeGenOnly=1;
+}
 //
 // EXT-I instruction format
 //
@@ -77,10 +135,11 @@ class FEXT_I816_SP_ins<bits<3> _func,
string asmstr, // // CC-RR Instruction format // -class FCCRR16_ins<bits<5> f, string asmstr, InstrItinClass itin> : - FRR16<f, (outs CPU16Regs:$cc), (ins CPU16Regs:$rx, CPU16Regs:$ry), - !strconcat(asmstr, "\t$rx, $ry\n\tmove\t$cc, $$t8"), [], itin> { +class FCCRR16_ins<string asmstr> : + MipsPseudo16<(outs CPU16Regs:$cc), (ins CPU16Regs:$rx, CPU16Regs:$ry), + !strconcat(asmstr, "\t$rx, $ry\n\tmove\t$cc, $$t8"), []> { let isCodeGenOnly=1; + let usesCustomInserter = 1; } // @@ -96,6 +155,15 @@ class FEXT_RI16_ins<bits<5> _op, string asmstr, InstrItinClass itin>: FEXT_RI16_ins_base<_op, asmstr, "\t$rx, $imm", itin>; +class FEXT_RI16R_ins_base<bits<5> _op, string asmstr, string asmstr2, + InstrItinClass itin>: + FEXT_RI16<_op, (outs ), (ins CPU16Regs:$rx, simm16:$imm), + !strconcat(asmstr, asmstr2), [], itin>; + +class FEXT_RI16R_ins<bits<5> _op, string asmstr, + InstrItinClass itin>: + FEXT_RI16R_ins_base<_op, asmstr, "\t$rx, $imm", itin>; + class FEXT_RI16_PC_ins<bits<5> _op, string asmstr, InstrItinClass itin>: FEXT_RI16_ins_base<_op, asmstr, "\t$rx, $$pc, $imm", itin>; @@ -153,25 +221,25 @@ class FEXT_SHIFT16_ins<bits<2> _f, string asmstr, InstrItinClass itin>: // // EXT-T8I8 // -class FEXT_T8I816_ins<bits<3> _func, string asmstr, string asmstr2, - InstrItinClass itin>: - FEXT_I816<_func, (outs), - (ins CPU16Regs:$rx, CPU16Regs:$ry, brtarget:$imm), - !strconcat(asmstr2, !strconcat("\t$rx, $ry\n\t", - !strconcat(asmstr, "\t$imm"))),[], itin> { +class FEXT_T8I816_ins<string asmstr, string asmstr2>: + MipsPseudo16<(outs), + (ins CPU16Regs:$rx, CPU16Regs:$ry, brtarget:$imm), + !strconcat(asmstr2, !strconcat("\t$rx, $ry\n\t", + !strconcat(asmstr, "\t$imm"))),[]> { let isCodeGenOnly=1; + let usesCustomInserter = 1; } // // EXT-T8I8I // -class FEXT_T8I8I16_ins<bits<3> _func, string asmstr, string asmstr2, - InstrItinClass itin>: - FEXT_I816<_func, (outs), - (ins CPU16Regs:$rx, simm16:$imm, brtarget:$targ), - !strconcat(asmstr2, !strconcat("\t$rx, $imm\n\t", - !strconcat(asmstr, "\t$targ"))), [], itin> { +class FEXT_T8I8I16_ins<string asmstr, string asmstr2>: + MipsPseudo16<(outs), + (ins CPU16Regs:$rx, simm16:$imm, brtarget:$targ), + !strconcat(asmstr2, !strconcat("\t$rx, $imm\n\t", + !strconcat(asmstr, "\t$targ"))), []> { let isCodeGenOnly=1; + let usesCustomInserter = 1; } // @@ -219,9 +287,14 @@ class FRR16_ins<bits<5> f, string asmstr, InstrItinClass itin> : !strconcat(asmstr, "\t$rx, $ry"), [], itin> { } -class FRRTR16_ins<bits<5> f, string asmstr, InstrItinClass itin> : - FRR16<f, (outs CPU16Regs:$rz), (ins CPU16Regs:$rx, CPU16Regs:$ry), - !strconcat(asmstr, "\t$rx, $ry\n\tmove\t$rz, $$t8"), [], itin> ; +class FRR16R_ins<bits<5> f, string asmstr, InstrItinClass itin> : + FRR16<f, (outs), (ins CPU16Regs:$rx, CPU16Regs:$ry), + !strconcat(asmstr, "\t$rx, $ry"), [], itin> { +} + +class FRRTR16_ins<string asmstr> : + MipsPseudo16<(outs CPU16Regs:$rz), (ins CPU16Regs:$rx, CPU16Regs:$ry), + !strconcat(asmstr, "\t$rx, $ry\n\tmove\t$rz, $$t8"), []> ; // // maybe refactor but need a $zero as a dummy first parameter @@ -257,7 +330,7 @@ class FRR16_JALRC_RA_only_ins<bits<1> nd_, bits<1> l_, class FRR16_JALRC_ins<bits<1> nd, bits<1> l, bits<1> ra, string asmstr, InstrItinClass itin>: - FRR16_JALRC<nd, l, ra, (outs), (ins CPU16Regs:$rx), + FRR16_JALRC<nd, l, ra, (outs), (ins CPU16Regs:$rx), !strconcat(asmstr, "\t $rx"), [], itin> ; // @@ -296,13 +369,13 @@ class FRRR16_ins<bits<2> _f, string asmstr, InstrItinClass itin> : // // So this pseudo class only has one operand, i.e. 
op // -class Sel<bits<5> f1, string op, InstrItinClass itin>: - MipsInst16_32<(outs CPU16Regs:$rd_), (ins CPU16Regs:$rd, CPU16Regs:$rs, - CPU16Regs:$rt), - !strconcat(op, "\t$rt, .+4\n\t\n\tmove $rd, $rs"), [], itin, - Pseudo16> { - let isCodeGenOnly=1; +class Sel<string op>: + MipsPseudo16<(outs CPU16Regs:$rd_), (ins CPU16Regs:$rd, CPU16Regs:$rs, + CPU16Regs:$rt), + !strconcat(op, "\t$rt, .+4\n\t\n\tmove $rd, $rs"), []> { + //let isCodeGenOnly=1; let Constraints = "$rd = $rd_"; + let usesCustomInserter = 1; } // @@ -320,16 +393,15 @@ class Sel<bits<5> f1, string op, InstrItinClass itin>: // move $rd, $rs // // -class SeliT<bits<5> f1, string op1, bits<5> f2, string op2, - InstrItinClass itin>: - MipsInst16_32<(outs CPU16Regs:$rd_), (ins CPU16Regs:$rd, CPU16Regs:$rs, - CPU16Regs:$rl, simm16:$imm), - !strconcat(op2, - !strconcat("\t$rl, $imm\n\t", - !strconcat(op1, "\t.+4\n\tmove $rd, $rs"))), [], itin, - Pseudo16> { +class SeliT<string op1, string op2>: + MipsPseudo16<(outs CPU16Regs:$rd_), (ins CPU16Regs:$rd, CPU16Regs:$rs, + CPU16Regs:$rl, simm16:$imm), + !strconcat(op2, + !strconcat("\t$rl, $imm\n\t", + !strconcat(op1, "\t.+4\n\tmove $rd, $rs"))), []> { let isCodeGenOnly=1; let Constraints = "$rd = $rd_"; + let usesCustomInserter = 1; } // @@ -344,16 +416,16 @@ class SeliT<bits<5> f1, string op1, bits<5> f2, string op2, // move $rd, $rs // // -class SelT<bits<5> f1, string op1, bits<5> f2, string op2, - InstrItinClass itin>: - MipsInst16_32<(outs CPU16Regs:$rd_), (ins CPU16Regs:$rd, CPU16Regs:$rs, +class SelT<string op1, string op2>: + MipsPseudo16<(outs CPU16Regs:$rd_), + (ins CPU16Regs:$rd, CPU16Regs:$rs, CPU16Regs:$rl, CPU16Regs:$rr), - !strconcat(op2, - !strconcat("\t$rl, $rr\n\t", - !strconcat(op1, "\t.+4\n\tmove $rd, $rs"))), [], itin, - Pseudo16> { + !strconcat(op2, + !strconcat("\t$rl, $rr\n\t", + !strconcat(op1, "\t.+4\n\tmove $rd, $rs"))), []> { let isCodeGenOnly=1; let Constraints = "$rd = $rd_"; + let usesCustomInserter = 1; } // @@ -363,7 +435,7 @@ def imm32: Operand<i32>; def Constant32: MipsPseudo16<(outs), (ins imm32:$imm), "\t.word $imm", []>; - + def LwConstant32: MipsPseudo16<(outs), (ins CPU16Regs:$rx, imm32:$imm), "lw\t$rx, 1f\n\tb\t2f\n\t.align\t2\n1: \t.word\t$imm\n2:", []>; @@ -401,14 +473,21 @@ class MayStore { } // + // Format: ADDIU rx, immediate MIPS16e // Purpose: Add Immediate Unsigned Word (2-Operand, Extended) // To add a constant to a 32-bit integer. // def AddiuRxImmX16: FEXT_RI16_ins<0b01001, "addiu", IIAlu>; +def AddiuRxRxImm16: F2RI16_ins<0b01001, "addiu", IIAlu>, + ArithLogic16Defs<0> { + let AddedComplexity = 5; +} def AddiuRxRxImmX16: FEXT_2RI16_ins<0b01001, "addiu", IIAlu>, - ArithLogic16Defs<0>; + ArithLogic16Defs<0> { + let isCodeGenOnly = 1; +} def AddiuRxRyOffMemX16: FEXT_RRI_A16_mem_ins<0, "addiu", mem16_ea, IIAlu>; @@ -426,11 +505,18 @@ def AddiuRxPcImmX16: FEXT_RI16_PC_ins<0b00001, "addiu", IIAlu>; // Purpose: Add Immediate Unsigned Word (2-Operand, SP-Relative, Extended) // To add a constant to the stack pointer. // +def AddiuSpImm16 + : FI816_SP_ins<0b011, "addiu", IIAlu> { + let Defs = [SP]; + let Uses = [SP]; + let AddedComplexity = 5; +} + def AddiuSpImmX16 : FEXT_I816_SP_ins<0b011, "addiu", IIAlu> { let Defs = [SP]; let Uses = [SP]; -} +} // // Format: ADDU rz, rx, ry MIPS16e @@ -450,6 +536,14 @@ def AndRxRxRy16: FRxRxRy16_ins<0b01100, "and", IIAlu>, ArithLogic16Defs<1>; // // Format: BEQZ rx, offset MIPS16e +// Purpose: Branch on Equal to Zero +// To test a GPR then do a PC-relative conditional branch. 
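The Sel, SeliT, and SelT pseudos earlier in this hunk all expand to a conditional branch over a single move, a poor man's conditional move. A model of the emitted "beqz $rt, .+4 / move $rd, $rs" sequence, sketched in standalone C++ (illustrative only, with an invented function name, not the LLVM lowering):

#include <cassert>
#include <cstdint>

// beqz skips the move when rt == 0, so the move runs only for rt != 0.
// The "$rd = $rd_" constraint ties the output to the incoming rd, which
// is why rd appears as both input and result.
int32_t selBeqZ(int32_t rd, int32_t rs, int32_t rt) {
  if (rt != 0)
    rd = rs;
  return rd;
}

int main() {
  assert(selBeqZ(/*rd=*/7, /*rs=*/9, /*rt=*/0) == 7); // branch taken, rd kept
  assert(selBeqZ(/*rd=*/7, /*rs=*/9, /*rt=*/1) == 9); // fall through, rd = rs
  return 0;
}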
+// +def BeqzRxImm16: FRI16_B_ins<0b00100, "beqz", IIAlu>, cbranch16; + + +// +// Format: BEQZ rx, offset MIPS16e // Purpose: Branch on Equal to Zero (Extended) // To test a GPR then do a PC-relative conditional branch. // @@ -463,6 +557,13 @@ def BimmX16: FEXT_I16_ins<0b00010, "b", IIAlu>, branch16; // // Format: BNEZ rx, offset MIPS16e +// Purpose: Branch on Not Equal to Zero +// To test a GPR then do a PC-relative conditional branch. +// +def BnezRxImm16: FRI16_B_ins<0b00101, "bnez", IIAlu>, cbranch16; + +// +// Format: BNEZ rx, offset MIPS16e // Purpose: Branch on Not Equal to Zero (Extended) // To test a GPR then do a PC-relative conditional branch. // @@ -473,20 +574,22 @@ def BnezRxImmX16: FEXT_RI16_B_ins<0b00101, "bnez", IIAlu>, cbranch16; // Purpose: Branch on T Equal to Zero (Extended) // To test special register T then do a PC-relative conditional branch. // -def BteqzX16: FEXT_I816_ins<0b000, "bteqz", IIAlu>, cbranch16; +def BteqzX16: FEXT_I816_ins<0b000, "bteqz", IIAlu>, cbranch16 { + let Uses = [T8]; +} -def BteqzT8CmpX16: FEXT_T8I816_ins<0b000, "bteqz", "cmp", IIAlu>, cbranch16; +def BteqzT8CmpX16: FEXT_T8I816_ins<"bteqz", "cmp">, cbranch16; -def BteqzT8CmpiX16: FEXT_T8I8I16_ins<0b000, "bteqz", "cmpi", IIAlu>, +def BteqzT8CmpiX16: FEXT_T8I8I16_ins<"bteqz", "cmpi">, cbranch16; -def BteqzT8SltX16: FEXT_T8I816_ins<0b000, "bteqz", "slt", IIAlu>, cbranch16; +def BteqzT8SltX16: FEXT_T8I816_ins<"bteqz", "slt">, cbranch16; -def BteqzT8SltuX16: FEXT_T8I816_ins<0b000, "bteqz", "sltu", IIAlu>, cbranch16; +def BteqzT8SltuX16: FEXT_T8I816_ins<"bteqz", "sltu">, cbranch16; -def BteqzT8SltiX16: FEXT_T8I8I16_ins<0b000, "bteqz", "slti", IIAlu>, cbranch16; +def BteqzT8SltiX16: FEXT_T8I8I16_ins<"bteqz", "slti">, cbranch16; -def BteqzT8SltiuX16: FEXT_T8I8I16_ins<0b000, "bteqz", "sltiu", IIAlu>, +def BteqzT8SltiuX16: FEXT_T8I8I16_ins<"bteqz", "sltiu">, cbranch16; // @@ -494,22 +597,52 @@ def BteqzT8SltiuX16: FEXT_T8I8I16_ins<0b000, "bteqz", "sltiu", IIAlu>, // Purpose: Branch on T Not Equal to Zero (Extended) // To test special register T then do a PC-relative conditional branch. // -def BtnezX16: FEXT_I816_ins<0b001, "btnez", IIAlu> ,cbranch16; +def BtnezX16: FEXT_I816_ins<0b001, "btnez", IIAlu> ,cbranch16 { + let Uses = [T8]; +} -def BtnezT8CmpX16: FEXT_T8I816_ins<0b000, "btnez", "cmp", IIAlu>, cbranch16; +def BtnezT8CmpX16: FEXT_T8I816_ins<"btnez", "cmp">, cbranch16; -def BtnezT8CmpiX16: FEXT_T8I8I16_ins<0b000, "btnez", "cmpi", IIAlu>, cbranch16; +def BtnezT8CmpiX16: FEXT_T8I8I16_ins<"btnez", "cmpi">, cbranch16; -def BtnezT8SltX16: FEXT_T8I816_ins<0b000, "btnez", "slt", IIAlu>, cbranch16; +def BtnezT8SltX16: FEXT_T8I816_ins<"btnez", "slt">, cbranch16; -def BtnezT8SltuX16: FEXT_T8I816_ins<0b000, "btnez", "sltu", IIAlu>, cbranch16; +def BtnezT8SltuX16: FEXT_T8I816_ins<"btnez", "sltu">, cbranch16; -def BtnezT8SltiX16: FEXT_T8I8I16_ins<0b000, "btnez", "slti", IIAlu>, cbranch16; +def BtnezT8SltiX16: FEXT_T8I8I16_ins<"btnez", "slti">, cbranch16; -def BtnezT8SltiuX16: FEXT_T8I8I16_ins<0b000, "btnez", "sltiu", IIAlu>, +def BtnezT8SltiuX16: FEXT_T8I8I16_ins<"btnez", "sltiu">, cbranch16; // +// Format: CMP rx, ry MIPS16e +// Purpose: Compare +// To compare the contents of two GPRs. +// +def CmpRxRy16: FRR16R_ins<0b01010, "cmp", IIAlu> { + let Defs = [T8]; +} + +// +// Format: CMPI rx, immediate MIPS16e +// Purpose: Compare Immediate +// To compare a constant with the contents of a GPR. 
+// +def CmpiRxImm16: FRI16R_ins<0b01110, "cmpi", IIAlu> { + let Defs = [T8]; +} + +// +// Format: CMPI rx, immediate MIPS16e +// Purpose: Compare Immediate (Extended) +// To compare a constant with the contents of a GPR. +// +def CmpiRxImmX16: FEXT_RI16R_ins<0b01110, "cmpi", IIAlu> { + let Defs = [T8]; +} + + +// // Format: DIV rx, ry MIPS16e // Purpose: Divide Word // To divide 32-bit signed integers. @@ -526,7 +659,19 @@ def DivRxRy16: FRR16_div_ins<0b11010, "div", IIAlu> { def DivuRxRy16: FRR16_div_ins<0b11011, "divu", IIAlu> { let Defs = [HI, LO]; } +// +// Format: JAL target MIPS16e +// Purpose: Jump and Link +// To execute a procedure call within the current 256 MB-aligned +// region and preserve the current ISA. +// +def Jal16 : FJAL16_ins<0b0, "jal", IIAlu> { + let isBranch = 1; + let hasDelaySlot = 0; // not true, but we add the nop for now + let isTerminator=1; + let isBarrier=1; +} // // Format: JR ra MIPS16e @@ -543,7 +688,7 @@ def JrRa16: FRR16_JALRC_RA_only_ins<0, 0, "jr", IIAlu> { let isBarrier=1; } -def JrcRa16: FRR16_JALRC_RA_only_ins<0, 0, "jrc", IIAlu> { +def JrcRa16: FRR16_JALRC_RA_only_ins<1, 1, "jrc", IIAlu> { let isBranch = 1; let isIndirectBranch = 1; let isTerminator=1; @@ -561,7 +706,9 @@ def JrcRx16: FRR16_JALRC_ins<1, 1, 0, "jrc", IIAlu> { // Purpose: Load Byte (Extended) // To load a byte from memory as a signed value. // -def LbRxRyOffMemX16: FEXT_RRI16_mem_ins<0b10011, "lb", mem16, IILoad>, MayLoad; +def LbRxRyOffMemX16: FEXT_RRI16_mem_ins<0b10011, "lb", mem16, IILoad>, MayLoad{ + let isCodeGenOnly = 1; +} // // Format: LBU ry, offset(rx) MIPS16e @@ -569,14 +716,18 @@ def LbRxRyOffMemX16: FEXT_RRI16_mem_ins<0b10011, "lb", mem16, IILoad>, MayLoad; // To load a byte from memory as a unsigned value. // def LbuRxRyOffMemX16: - FEXT_RRI16_mem_ins<0b10100, "lbu", mem16, IILoad>, MayLoad; + FEXT_RRI16_mem_ins<0b10100, "lbu", mem16, IILoad>, MayLoad { + let isCodeGenOnly = 1; +} // // Format: LH ry, offset(rx) MIPS16e // Purpose: Load Halfword signed (Extended) // To load a halfword from memory as a signed value. // -def LhRxRyOffMemX16: FEXT_RRI16_mem_ins<0b10100, "lh", mem16, IILoad>, MayLoad; +def LhRxRyOffMemX16: FEXT_RRI16_mem_ins<0b10100, "lh", mem16, IILoad>, MayLoad{ + let isCodeGenOnly = 1; +} // // Format: LHU ry, offset(rx) MIPS16e @@ -584,7 +735,16 @@ def LhRxRyOffMemX16: FEXT_RRI16_mem_ins<0b10100, "lh", mem16, IILoad>, MayLoad; // To load a halfword from memory as an unsigned value. // def LhuRxRyOffMemX16: - FEXT_RRI16_mem_ins<0b10100, "lhu", mem16, IILoad>, MayLoad; + FEXT_RRI16_mem_ins<0b10100, "lhu", mem16, IILoad>, MayLoad { + let isCodeGenOnly = 1; +} + +// +// Format: LI rx, immediate MIPS16e +// Purpose: Load Immediate +// To load a constant into a GPR. +// +def LiRxImm16: FRI16_ins<0b01101, "li", IIAlu>; // // Format: LI rx, immediate MIPS16e @@ -598,7 +758,9 @@ def LiRxImmX16: FEXT_RI16_ins<0b01101, "li", IIAlu>; // Purpose: Load Word (Extended) // To load a word from memory as a signed value. 
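The 256 MB-aligned-region wording in the Jal16 comment above refers to the usual MIPS J-type target computation: the 26-bit instruction index, shifted left by 2, replaces the low 28 bits of the region base. A hedged sketch (simplified; the exact PC the base is taken from differs slightly between the MIPS32 and MIPS16e encodings, and the helper name is invented):

#include <cassert>
#include <cstdint>

uint32_t jalTarget(uint32_t pc, uint32_t instrIndex26) {
  uint32_t regionBase = pc & 0xF0000000u; // current 256 MB-aligned region
  return regionBase | ((instrIndex26 & 0x03FFFFFFu) << 2);
}

int main() {
  // A call from 0x1000F000 can reach any word-aligned address in
  // [0x10000000, 0x1FFFFFFC], but nothing outside that region.
  assert(jalTarget(0x1000F000u, 0x03FFFFFFu) == 0x1FFFFFFCu);
  return 0;
}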
// -def LwRxRyOffMemX16: FEXT_RRI16_mem_ins<0b10011, "lw", mem16, IILoad>, MayLoad; +def LwRxRyOffMemX16: FEXT_RRI16_mem_ins<0b10011, "lw", mem16, IILoad>, MayLoad{ + let isCodeGenOnly = 1; +} // Format: LW rx, offset(sp) MIPS16e // Purpose: Load Word (SP-Relative, Extended) @@ -779,7 +941,7 @@ def SbRxRyOffMemX16: // Purpose: if rt==0, do nothing // else rs = rt // -def SelBeqZ: Sel<0b00100, "beqz", IIAlu>; +def SelBeqZ: Sel<"beqz">; // // Format: SelTBteqZCmp rd, rs, rl, rr @@ -787,7 +949,7 @@ def SelBeqZ: Sel<0b00100, "beqz", IIAlu>; // If b==0 then do nothing. // if b!=0 then rd = rs // -def SelTBteqZCmp: SelT<0b000, "bteqz", 0b01010, "cmp", IIAlu>; +def SelTBteqZCmp: SelT<"bteqz", "cmp">; // // Format: SelTBteqZCmpi rd, rs, rl, rr @@ -795,7 +957,7 @@ def SelTBteqZCmp: SelT<0b000, "bteqz", 0b01010, "cmp", IIAlu>; // If b==0 then do nothing. // if b!=0 then rd = rs // -def SelTBteqZCmpi: SeliT<0b000, "bteqz", 0b01110, "cmpi", IIAlu>; +def SelTBteqZCmpi: SeliT<"bteqz", "cmpi">; // // Format: SelTBteqZSlt rd, rs, rl, rr @@ -803,7 +965,7 @@ def SelTBteqZCmpi: SeliT<0b000, "bteqz", 0b01110, "cmpi", IIAlu>; // If b==0 then do nothing. // if b!=0 then rd = rs // -def SelTBteqZSlt: SelT<0b000, "bteqz", 0b00010, "slt", IIAlu>; +def SelTBteqZSlt: SelT<"bteqz", "slt">; // // Format: SelTBteqZSlti rd, rs, rl, rr @@ -811,7 +973,7 @@ def SelTBteqZSlt: SelT<0b000, "bteqz", 0b00010, "slt", IIAlu>; // If b==0 then do nothing. // if b!=0 then rd = rs // -def SelTBteqZSlti: SeliT<0b000, "bteqz", 0b01010, "slti", IIAlu>; +def SelTBteqZSlti: SeliT<"bteqz", "slti">; // // Format: SelTBteqZSltu rd, rs, rl, rr @@ -819,7 +981,7 @@ def SelTBteqZSlti: SeliT<0b000, "bteqz", 0b01010, "slti", IIAlu>; // If b==0 then do nothing. // if b!=0 then rd = rs // -def SelTBteqZSltu: SelT<0b000, "bteqz", 0b00011, "sltu", IIAlu>; +def SelTBteqZSltu: SelT<"bteqz", "sltu">; // // Format: SelTBteqZSltiu rd, rs, rl, rr @@ -827,14 +989,14 @@ def SelTBteqZSltu: SelT<0b000, "bteqz", 0b00011, "sltu", IIAlu>; // If b==0 then do nothing. // if b!=0 then rd = rs // -def SelTBteqZSltiu: SeliT<0b000, "bteqz", 0b01011, "sltiu", IIAlu>; +def SelTBteqZSltiu: SeliT<"bteqz", "sltiu">; // // Format: SelBnez rd, rs, rt // Purpose: if rt!=0, do nothing // else rs = rt // -def SelBneZ: Sel<0b00101, "bnez", IIAlu>; +def SelBneZ: Sel<"bnez">; // // Format: SelTBtneZCmp rd, rs, rl, rr @@ -842,7 +1004,7 @@ def SelBneZ: Sel<0b00101, "bnez", IIAlu>; // If b!=0 then do nothing. // if b0=0 then rd = rs // -def SelTBtneZCmp: SelT<0b001, "btnez", 0b01010, "cmp", IIAlu>; +def SelTBtneZCmp: SelT<"btnez", "cmp">; // // Format: SelTBtnezCmpi rd, rs, rl, rr @@ -850,7 +1012,7 @@ def SelTBtneZCmp: SelT<0b001, "btnez", 0b01010, "cmp", IIAlu>; // If b!=0 then do nothing. // if b==0 then rd = rs // -def SelTBtneZCmpi: SeliT<0b000, "btnez", 0b01110, "cmpi", IIAlu>; +def SelTBtneZCmpi: SeliT<"btnez", "cmpi">; // // Format: SelTBtneZSlt rd, rs, rl, rr @@ -858,7 +1020,7 @@ def SelTBtneZCmpi: SeliT<0b000, "btnez", 0b01110, "cmpi", IIAlu>; // If b!=0 then do nothing. // if b==0 then rd = rs // -def SelTBtneZSlt: SelT<0b001, "btnez", 0b00010, "slt", IIAlu>; +def SelTBtneZSlt: SelT<"btnez", "slt">; // // Format: SelTBtneZSlti rd, rs, rl, rr @@ -866,7 +1028,7 @@ def SelTBtneZSlt: SelT<0b001, "btnez", 0b00010, "slt", IIAlu>; // If b!=0 then do nothing. 
// if b==0 then rd = rs // -def SelTBtneZSlti: SeliT<0b001, "btnez", 0b01010, "slti", IIAlu>; +def SelTBtneZSlti: SeliT<"btnez", "slti">; // // Format: SelTBtneZSltu rd, rs, rl, rr @@ -874,7 +1036,7 @@ def SelTBtneZSlti: SeliT<0b001, "btnez", 0b01010, "slti", IIAlu>; // If b!=0 then do nothing. // if b==0 then rd = rs // -def SelTBtneZSltu: SelT<0b001, "btnez", 0b00011, "sltu", IIAlu>; +def SelTBtneZSltu: SelT<"btnez", "sltu">; // // Format: SelTBtneZSltiu rd, rs, rl, rr @@ -882,7 +1044,7 @@ def SelTBtneZSltu: SelT<0b001, "btnez", 0b00011, "sltu", IIAlu>; // If b!=0 then do nothing. // if b==0 then rd = rs // -def SelTBtneZSltiu: SeliT<0b001, "btnez", 0b01011, "sltiu", IIAlu>; +def SelTBtneZSltiu: SeliT<"btnez", "sltiu">; // // // Format: SH ry, offset(rx) MIPS16e @@ -906,39 +1068,78 @@ def SllX16: FEXT_SHIFT16_ins<0b00, "sll", IIAlu>; // def SllvRxRy16 : FRxRxRy16_ins<0b00100, "sllv", IIAlu>; +// Format: SLTI rx, immediate MIPS16e +// Purpose: Set on Less Than Immediate +// To record the result of a less-than comparison with a constant. +// +// +def SltiRxImm16: FRI16R_ins<0b01010, "slti", IIAlu> { + let Defs = [T8]; +} + // // Format: SLTI rx, immediate MIPS16e // Purpose: Set on Less Than Immediate (Extended) // To record the result of a less-than comparison with a constant. // -def SltiCCRxImmX16: FEXT_CCRXI16_ins<0b01010, "slti", IIAlu>; +// +def SltiRxImmX16: FEXT_RI16R_ins<0b01010, "slti", IIAlu> { + let Defs = [T8]; +} + +def SltiCCRxImmX16: FEXT_CCRXI16_ins<"slti">; +// Format: SLTIU rx, immediate MIPS16e +// Purpose: Set on Less Than Immediate Unsigned +// To record the result of a less-than comparison with a constant. +// +// +def SltiuRxImm16: FRI16R_ins<0b01011, "sltiu", IIAlu> { + let Defs = [T8]; +} + +// +// Format: SLTI rx, immediate MIPS16e +// Purpose: Set on Less Than Immediate Unsigned (Extended) +// To record the result of a less-than comparison with a constant. +// +// +def SltiuRxImmX16: FEXT_RI16R_ins<0b01011, "sltiu", IIAlu> { + let Defs = [T8]; +} // // Format: SLTIU rx, immediate MIPS16e // Purpose: Set on Less Than Immediate Unsigned (Extended) // To record the result of a less-than comparison with a constant. // -def SltiuCCRxImmX16: FEXT_CCRXI16_ins<0b01011, "sltiu", IIAlu>; +def SltiuCCRxImmX16: FEXT_CCRXI16_ins<"sltiu">; // // Format: SLT rx, ry MIPS16e // Purpose: Set on Less Than // To record the result of a less-than comparison. // -def SltRxRy16: FRR16_ins<0b00010, "slt", IIAlu>; +def SltRxRy16: FRR16R_ins<0b00010, "slt", IIAlu>{ + let Defs = [T8]; +} -def SltCCRxRy16: FCCRR16_ins<0b00010, "slt", IIAlu>; +def SltCCRxRy16: FCCRR16_ins<"slt">; // Format: SLTU rx, ry MIPS16e // Purpose: Set on Less Than Unsigned // To record the result of an unsigned less-than comparison. 
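Several defs above and below pin down the implicit condition register: cmp/cmpi/slt/slti and friends now carry Defs = [T8], while bteqz/btnez carry Uses = [T8]. A toy model of that idiom in standalone C++ (illustrative only; per the MIPS16e spec, cmp XORs its operands into t8, so t8 == 0 means equal):

#include <cassert>
#include <cstdint>

// Compares write the dedicated t8 register; bteqz/btnez branch on it.
struct Mips16Cond {
  uint32_t T8 = 0;
  void cmp(uint32_t rx, uint32_t ry) { T8 = rx ^ ry; }       // 0 iff equal
  void slt(int32_t rx, int32_t ry) { T8 = rx < ry ? 1 : 0; }
  bool bteqz() const { return T8 == 0; } // branch when t8 == 0
  bool btnez() const { return T8 != 0; } // branch when t8 != 0
};

int main() {
  Mips16Cond CC;
  CC.cmp(5, 5);
  assert(CC.bteqz()); // "cmp rx, ry; bteqz L" branches exactly when rx == ry
  CC.slt(3, 4);
  assert(CC.btnez()); // "slt rx, ry; btnez L" branches when rx < ry
  return 0;
}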
// -def SltuRxRyRz16: FRRTR16_ins<0b00011, "sltu", IIAlu> { +def SltuRxRy16: FRR16R_ins<0b00011, "sltu", IIAlu>{ + let Defs = [T8]; +} + +def SltuRxRyRz16: FRRTR16_ins<"sltu"> { let isCodeGenOnly=1; + let Defs = [T8]; } -def SltuCCRxRy16: FCCRR16_ins<0b00011, "sltu", IIAlu>; +def SltuCCRxRy16: FCCRR16_ins<"sltu">; // // Format: SRAV ry, rx MIPS16e // Purpose: Shift Word Right Arithmetic Variable @@ -1034,6 +1235,7 @@ class ArithLogicI16_pat<SDNode OpNode, PatFrag imm_type, Instruction I> : Mips16Pat<(OpNode CPU16Regs:$in, imm_type:$imm), (I CPU16Regs:$in, imm_type:$imm)>; +def: ArithLogicI16_pat<add, immSExt8, AddiuRxRxImm16>; def: ArithLogicI16_pat<add, immSExt16, AddiuRxRxImmX16>; def: ArithLogicI16_pat<shl, immZExt5, SllX16>; def: ArithLogicI16_pat<srl, immZExt5, SrlX16>; @@ -1067,14 +1269,19 @@ def: StoreM16_pat<store, SwRxRyOffMemX16>; // Unconditional branch class UncondBranch16_pat<SDNode OpNode, Instruction I>: Mips16Pat<(OpNode bb:$imm16), (I bb:$imm16)> { - let Predicates = [RelocPIC, InMips16Mode]; + let Predicates = [InMips16Mode]; } +def : Mips16Pat<(MipsJmpLink (i32 tglobaladdr:$dst)), + (Jal16 tglobaladdr:$dst)>; + +def : Mips16Pat<(MipsJmpLink (i32 texternalsym:$dst)), + (Jal16 texternalsym:$dst)>; + // Indirect branch def: Mips16Pat< - (brind CPU16Regs:$rs), - (JrcRx16 CPU16Regs:$rs)>; - + (brind CPU16Regs:$rs), + (JrcRx16 CPU16Regs:$rs)>; // Jump and Link (Call) let isCall=1, hasDelaySlot=0 in @@ -1502,7 +1709,7 @@ def: Mips16Pat // def: Mips16Pat <(setle CPU16Regs:$lhs, CPU16Regs:$rhs), - (XorRxRxRy16 (SltCCRxRy16 CPU16Regs:$rhs, CPU16Regs:$lhs), (LiRxImmX16 1))>; + (XorRxRxRy16 (SltCCRxRy16 CPU16Regs:$rhs, CPU16Regs:$lhs), (LiRxImm16 1))>; // // setlt @@ -1562,7 +1769,11 @@ def: Mips16Pat<(add CPU16Regs:$hi, (MipsLo tglobaladdr:$lo)), // hi/lo relocs -def : Mips16Pat<(MipsHi tglobaltlsaddr:$in), +def : Mips16Pat<(MipsHi tglobaladdr:$in), + (SllX16 (LiRxImmX16 tglobaladdr:$in), 16)>; +def : Mips16Pat<(MipsHi tjumptable:$in), + (SllX16 (LiRxImmX16 tjumptable:$in), 16)>; +def : Mips16Pat<(MipsHi tglobaltlsaddr:$in), (SllX16 (LiRxImmX16 tglobaltlsaddr:$in), 16)>; // wrapper_pic diff --git a/lib/Target/Mips/Mips16RegisterInfo.cpp b/lib/Target/Mips/Mips16RegisterInfo.cpp index c2e09a7..0ea9368 100644 --- a/lib/Target/Mips/Mips16RegisterInfo.cpp +++ b/lib/Target/Mips/Mips16RegisterInfo.cpp @@ -1,3 +1,4 @@ + //===-- Mips16RegisterInfo.cpp - MIPS16 Register Information -== ----------===// // // The LLVM Compiler Infrastructure @@ -12,6 +13,7 @@ //===----------------------------------------------------------------------===// #include "Mips16RegisterInfo.h" +#include "Mips16InstrInfo.h" #include "Mips.h" #include "Mips16InstrInfo.h" #include "MipsAnalyzeImmediate.h" @@ -23,6 +25,7 @@ #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/ValueTypes.h" #include "llvm/DebugInfo.h" #include "llvm/IR/Constants.h" @@ -69,27 +72,6 @@ bool Mips16RegisterInfo::saveScavengerRegister return true; } -// This function eliminate ADJCALLSTACKDOWN, -// ADJCALLSTACKUP pseudo instructions -void Mips16RegisterInfo:: -eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, - MachineBasicBlock::iterator I) const { - const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); - - if (!TFI->hasReservedCallFrame(MF)) { - int64_t Amount = I->getOperand(0).getImm(); - - if (I->getOpcode() == Mips::ADJCALLSTACKDOWN) - Amount = -Amount; 
- - const Mips16InstrInfo *II = static_cast<const Mips16InstrInfo*>(&TII); - - II->adjustStackPtr(Mips::SP, Amount, MBB, I); - } - - MBB.erase(I); -} - void Mips16RegisterInfo::eliminateFI(MachineBasicBlock::iterator II, unsigned OpNo, int FrameIndex, uint64_t StackSize, @@ -140,6 +122,7 @@ void Mips16RegisterInfo::eliminateFI(MachineBasicBlock::iterator II, // by adding the size of the stack: // incoming argument, callee-saved register location or local variable. int64_t Offset; + bool IsKill = false; Offset = SPOffset + (int64_t)StackSize; Offset += MI.getOperand(OpNo + 1).getImm(); @@ -148,9 +131,14 @@ void Mips16RegisterInfo::eliminateFI(MachineBasicBlock::iterator II, if (!MI.isDebugValue() && ( ((FrameReg != Mips::SP) && !isInt<16>(Offset)) || ((FrameReg == Mips::SP) && !isInt<15>(Offset)) )) { - llvm_unreachable("frame offset does not fit in instruction"); + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = II->getDebugLoc(); + unsigned NewImm; + FrameReg = TII.loadImmediate(FrameReg, Offset, MBB, II, DL, NewImm); + Offset = SignExtend64<16>(NewImm); + IsKill = true; } - MI.getOperand(OpNo).ChangeToRegister(FrameReg, false); + MI.getOperand(OpNo).ChangeToRegister(FrameReg, false, false, IsKill); MI.getOperand(OpNo + 1).ChangeToImmediate(Offset); diff --git a/lib/Target/Mips/Mips16RegisterInfo.h b/lib/Target/Mips/Mips16RegisterInfo.h index 6101739..b8f818a 100644 --- a/lib/Target/Mips/Mips16RegisterInfo.h +++ b/lib/Target/Mips/Mips16RegisterInfo.h @@ -25,10 +25,6 @@ public: Mips16RegisterInfo(const MipsSubtarget &Subtarget, const Mips16InstrInfo &TII); - void eliminateCallFramePseudoInstr(MachineFunction &MF, - MachineBasicBlock &MBB, - MachineBasicBlock::iterator I) const; - bool requiresRegisterScavenging(const MachineFunction &MF) const; bool requiresFrameIndexScavenging(const MachineFunction &MF) const; diff --git a/lib/Target/Mips/Mips64InstrInfo.td b/lib/Target/Mips/Mips64InstrInfo.td index cdf12c8..494ba87 100644 --- a/lib/Target/Mips/Mips64InstrInfo.td +++ b/lib/Target/Mips/Mips64InstrInfo.td @@ -71,52 +71,55 @@ let usesCustomInserter = 1, Predicates = [HasStdEnc], //===----------------------------------------------------------------------===// let DecoderNamespace = "Mips64" in { /// Arithmetic Instructions (ALU Immediate) -def DADDi : ArithLogicI<"daddi", simm16_64, CPU64Regs>, ADDI_FM<0x18>; -def DADDiu : ArithLogicI<"daddiu", simm16_64, CPU64Regs, immSExt16, add>, +def DADDi : ArithLogicI<"daddi", simm16_64, CPU64RegsOpnd>, ADDI_FM<0x18>; +def DADDiu : ArithLogicI<"daddiu", simm16_64, CPU64RegsOpnd, immSExt16, add>, ADDI_FM<0x19>, IsAsCheapAsAMove; -def DANDi : ArithLogicI<"andi", uimm16_64, CPU64Regs, immZExt16, and>, +def DANDi : ArithLogicI<"andi", uimm16_64, CPU64RegsOpnd, immZExt16, and>, ADDI_FM<0xc>; def SLTi64 : SetCC_I<"slti", setlt, simm16_64, immSExt16, CPU64Regs>, SLTI_FM<0xa>; def SLTiu64 : SetCC_I<"sltiu", setult, simm16_64, immSExt16, CPU64Regs>, SLTI_FM<0xb>; -def ORi64 : ArithLogicI<"ori", uimm16_64, CPU64Regs, immZExt16, or>, +def ORi64 : ArithLogicI<"ori", uimm16_64, CPU64RegsOpnd, immZExt16, or>, ADDI_FM<0xd>; -def XORi64 : ArithLogicI<"xori", uimm16_64, CPU64Regs, immZExt16, xor>, +def XORi64 : ArithLogicI<"xori", uimm16_64, CPU64RegsOpnd, immZExt16, xor>, ADDI_FM<0xe>; def LUi64 : LoadUpper<"lui", CPU64Regs, uimm16_64>, LUI_FM; /// Arithmetic Instructions (3-Operand, R-Type) -def DADD : ArithLogicR<"dadd", CPU64Regs>, ADD_FM<0, 0x2c>; -def DADDu : ArithLogicR<"daddu", CPU64Regs, 1, IIAlu, add>, ADD_FM<0, 0x2d>; -def DSUBu : 
ArithLogicR<"dsubu", CPU64Regs, 0, IIAlu, sub>, ADD_FM<0, 0x2f>; +def DADD : ArithLogicR<"dadd", CPU64RegsOpnd>, ADD_FM<0, 0x2c>; +def DADDu : ArithLogicR<"daddu", CPU64RegsOpnd, 1, IIAlu, add>, + ADD_FM<0, 0x2d>; +def DSUBu : ArithLogicR<"dsubu", CPU64RegsOpnd, 0, IIAlu, sub>, + ADD_FM<0, 0x2f>; def SLT64 : SetCC_R<"slt", setlt, CPU64Regs>, ADD_FM<0, 0x2a>; def SLTu64 : SetCC_R<"sltu", setult, CPU64Regs>, ADD_FM<0, 0x2b>; -def AND64 : ArithLogicR<"and", CPU64Regs, 1, IIAlu, and>, ADD_FM<0, 0x24>; -def OR64 : ArithLogicR<"or", CPU64Regs, 1, IIAlu, or>, ADD_FM<0, 0x25>; -def XOR64 : ArithLogicR<"xor", CPU64Regs, 1, IIAlu, xor>, ADD_FM<0, 0x26>; -def NOR64 : LogicNOR<"nor", CPU64Regs>, ADD_FM<0, 0x27>; +def AND64 : ArithLogicR<"and", CPU64RegsOpnd, 1, IIAlu, and>, ADD_FM<0, 0x24>; +def OR64 : ArithLogicR<"or", CPU64RegsOpnd, 1, IIAlu, or>, ADD_FM<0, 0x25>; +def XOR64 : ArithLogicR<"xor", CPU64RegsOpnd, 1, IIAlu, xor>, ADD_FM<0, 0x26>; +def NOR64 : LogicNOR<"nor", CPU64RegsOpnd>, ADD_FM<0, 0x27>; /// Shift Instructions -def DSLL : shift_rotate_imm<"dsll", shamt, CPU64Regs, shl, immZExt6>, +def DSLL : shift_rotate_imm<"dsll", shamt, CPU64RegsOpnd, shl, immZExt6>, SRA_FM<0x38, 0>; -def DSRL : shift_rotate_imm<"dsrl", shamt, CPU64Regs, srl, immZExt6>, +def DSRL : shift_rotate_imm<"dsrl", shamt, CPU64RegsOpnd, srl, immZExt6>, SRA_FM<0x3a, 0>; -def DSRA : shift_rotate_imm<"dsra", shamt, CPU64Regs, sra, immZExt6>, +def DSRA : shift_rotate_imm<"dsra", shamt, CPU64RegsOpnd, sra, immZExt6>, SRA_FM<0x3b, 0>; -def DSLLV : shift_rotate_reg<"dsllv", CPU64Regs, shl>, SRLV_FM<0x14, 0>; -def DSRLV : shift_rotate_reg<"dsrlv", CPU64Regs, srl>, SRLV_FM<0x16, 0>; -def DSRAV : shift_rotate_reg<"dsrav", CPU64Regs, sra>, SRLV_FM<0x17, 0>; -def DSLL32 : shift_rotate_imm<"dsll32", shamt, CPU64Regs>, SRA_FM<0x3c, 0>; -def DSRL32 : shift_rotate_imm<"dsrl32", shamt, CPU64Regs>, SRA_FM<0x3e, 0>; -def DSRA32 : shift_rotate_imm<"dsra32", shamt, CPU64Regs>, SRA_FM<0x3f, 0>; +def DSLLV : shift_rotate_reg<"dsllv", CPU64RegsOpnd, shl>, SRLV_FM<0x14, 0>; +def DSRLV : shift_rotate_reg<"dsrlv", CPU64RegsOpnd, srl>, SRLV_FM<0x16, 0>; +def DSRAV : shift_rotate_reg<"dsrav", CPU64RegsOpnd, sra>, SRLV_FM<0x17, 0>; +def DSLL32 : shift_rotate_imm<"dsll32", shamt, CPU64RegsOpnd>, SRA_FM<0x3c, 0>; +def DSRL32 : shift_rotate_imm<"dsrl32", shamt, CPU64RegsOpnd>, SRA_FM<0x3e, 0>; +def DSRA32 : shift_rotate_imm<"dsra32", shamt, CPU64RegsOpnd>, SRA_FM<0x3f, 0>; } // Rotate Instructions let Predicates = [HasMips64r2, HasStdEnc], DecoderNamespace = "Mips64" in { - def DROTR : shift_rotate_imm<"drotr", shamt, CPU64Regs, rotr, immZExt6>, - SRA_FM<0x3a, 1>; - def DROTRV : shift_rotate_reg<"drotrv", CPU64Regs, rotr>, SRLV_FM<0x16, 1>; + def DROTR : shift_rotate_imm<"drotr", shamt, CPU64RegsOpnd, rotr, immZExt6>, + SRA_FM<0x3a, 1>; + def DROTRV : shift_rotate_reg<"drotrv", CPU64RegsOpnd, rotr>, + SRLV_FM<0x16, 1>; } let DecoderNamespace = "Mips64" in { @@ -135,12 +138,11 @@ defm LD : LoadM<"ld", CPU64Regs, load>, LW_FM<0x37>; defm SD : StoreM<"sd", CPU64Regs, store>, LW_FM<0x3f>; /// load/store left/right -let isCodeGenOnly = 1 in { - defm LWL64 : LoadLeftRightM<"lwl", MipsLWL, CPU64Regs>, LW_FM<0x22>; - defm LWR64 : LoadLeftRightM<"lwr", MipsLWR, CPU64Regs>, LW_FM<0x26>; - defm SWL64 : StoreLeftRightM<"swl", MipsSWL, CPU64Regs>, LW_FM<0x2a>; - defm SWR64 : StoreLeftRightM<"swr", MipsSWR, CPU64Regs>, LW_FM<0x2e>; -} +defm LWL64 : LoadLeftRightM<"lwl", MipsLWL, CPU64Regs>, LW_FM<0x22>; +defm LWR64 : LoadLeftRightM<"lwr", MipsLWR, CPU64Regs>, 
LW_FM<0x26>; +defm SWL64 : StoreLeftRightM<"swl", MipsSWL, CPU64Regs>, LW_FM<0x2a>; +defm SWR64 : StoreLeftRightM<"swr", MipsSWR, CPU64Regs>, LW_FM<0x2e>; + defm LDL : LoadLeftRightM<"ldl", MipsLDL, CPU64Regs>, LW_FM<0x1a>; defm LDR : LoadLeftRightM<"ldr", MipsLDR, CPU64Regs>, LW_FM<0x1b>; defm SDL : StoreLeftRightM<"sdl", MipsSDL, CPU64Regs>, LW_FM<0x2c>; @@ -148,13 +150,13 @@ defm SDR : StoreLeftRightM<"sdr", MipsSDR, CPU64Regs>, LW_FM<0x2d>; /// Load-linked, Store-conditional let Predicates = [NotN64, HasStdEnc] in { - def LLD : LLBase<"lld", CPU64Regs, mem>, LW_FM<0x34>; - def SCD : SCBase<"scd", CPU64Regs, mem>, LW_FM<0x3c>; + def LLD : LLBase<"lld", CPU64RegsOpnd, mem>, LW_FM<0x34>; + def SCD : SCBase<"scd", CPU64RegsOpnd, mem>, LW_FM<0x3c>; } let Predicates = [IsN64, HasStdEnc], isCodeGenOnly = 1 in { - def LLD_P8 : LLBase<"lld", CPU64Regs, mem64>, LW_FM<0x34>; - def SCD_P8 : SCBase<"scd", CPU64Regs, mem64>, LW_FM<0x3c>; + def LLD_P8 : LLBase<"lld", CPU64RegsOpnd, mem64>, LW_FM<0x34>; + def SCD_P8 : SCBase<"scd", CPU64RegsOpnd, mem64>, LW_FM<0x3c>; } /// Jump and Branch Instructions @@ -168,15 +170,18 @@ def BLTZ64 : CBranchZero<"bltz", setlt, CPU64Regs>, BGEZ_FM<1, 0>; } let DecoderNamespace = "Mips64" in def JALR64 : JumpLinkReg<"jalr", CPU64Regs>, JALR_FM; +def JALR64Pseudo : JumpLinkRegPseudo<CPU64Regs, JALR64, RA_64>; def TAILCALL64_R : JumpFR<CPU64Regs, MipsTailCall>, MTLO_FM<8>, IsTailCall; let DecoderNamespace = "Mips64" in { /// Multiply and Divide Instructions. -def DMULT : Mult<"dmult", IIImul, CPU64Regs, [HI64, LO64]>, MULT_FM<0, 0x1c>; -def DMULTu : Mult<"dmultu", IIImul, CPU64Regs, [HI64, LO64]>, MULT_FM<0, 0x1d>; -def DSDIV : Div<MipsDivRem, "ddiv", IIIdiv, CPU64Regs, [HI64, LO64]>, +def DMULT : Mult<"dmult", IIImul, CPU64RegsOpnd, [HI64, LO64]>, + MULT_FM<0, 0x1c>; +def DMULTu : Mult<"dmultu", IIImul, CPU64RegsOpnd, [HI64, LO64]>, + MULT_FM<0, 0x1d>; +def DSDIV : Div<MipsDivRem, "ddiv", IIIdiv, CPU64RegsOpnd, [HI64, LO64]>, MULT_FM<0, 0x1e>; -def DUDIV : Div<MipsDivRemU, "ddivu", IIIdiv, CPU64Regs, [HI64, LO64]>, +def DUDIV : Div<MipsDivRemU, "ddivu", IIIdiv, CPU64RegsOpnd, [HI64, LO64]>, MULT_FM<0, 0x1f>; def MTHI64 : MoveToLOHI<"mthi", CPU64Regs, [HI64]>, MTLO_FM<0x11>; @@ -189,28 +194,28 @@ def SEB64 : SignExtInReg<"seb", i8, CPU64Regs>, SEB_FM<0x10, 0x20>; def SEH64 : SignExtInReg<"seh", i16, CPU64Regs>, SEB_FM<0x18, 0x20>; /// Count Leading -def DCLZ : CountLeading0<"dclz", CPU64Regs>, CLO_FM<0x24>; -def DCLO : CountLeading1<"dclo", CPU64Regs>, CLO_FM<0x25>; +def DCLZ : CountLeading0<"dclz", CPU64RegsOpnd>, CLO_FM<0x24>; +def DCLO : CountLeading1<"dclo", CPU64RegsOpnd>, CLO_FM<0x25>; /// Double Word Swap Bytes/HalfWords -def DSBH : SubwordSwap<"dsbh", CPU64Regs>, SEB_FM<2, 0x24>; -def DSHD : SubwordSwap<"dshd", CPU64Regs>, SEB_FM<5, 0x24>; +def DSBH : SubwordSwap<"dsbh", CPU64RegsOpnd>, SEB_FM<2, 0x24>; +def DSHD : SubwordSwap<"dshd", CPU64RegsOpnd>, SEB_FM<5, 0x24>; def LEA_ADDiu64 : EffectiveAddress<"daddiu", CPU64Regs, mem_ea_64>, LW_FM<0x19>; } let DecoderNamespace = "Mips64" in { -def RDHWR64 : ReadHardware<CPU64Regs, HWRegs64>, RDHWR_FM; +def RDHWR64 : ReadHardware<CPU64Regs, HW64RegsOpnd>, RDHWR_FM; -def DEXT : ExtBase<"dext", CPU64Regs>, EXT_FM<3>; +def DEXT : ExtBase<"dext", CPU64RegsOpnd>, EXT_FM<3>; let Pattern = []<dag> in { - def DEXTU : ExtBase<"dextu", CPU64Regs>, EXT_FM<2>; - def DEXTM : ExtBase<"dextm", CPU64Regs>, EXT_FM<1>; + def DEXTU : ExtBase<"dextu", CPU64RegsOpnd>, EXT_FM<2>; + def DEXTM : ExtBase<"dextm", CPU64RegsOpnd>, EXT_FM<1>; } 
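DEXT above and DINS just below (with their U/M variants for wider positions and sizes) are plain bit-field extract and insert. Their architectural effect, sketched in standalone C++ (the semantics are standard MIPS64r2; this is not the LLVM lowering):

#include <cassert>
#include <cstdint>

// dext rt, rs, pos, size: rt = zero-extended field rs[pos+size-1 : pos].
uint64_t dext(uint64_t rs, unsigned pos, unsigned size) {
  uint64_t mask = (size < 64) ? ((1ULL << size) - 1) : ~0ULL;
  return (rs >> pos) & mask;
}

// dins rt, rs, pos, size: replace rt[pos+size-1 : pos] with rs[size-1 : 0].
uint64_t dins(uint64_t rt, uint64_t rs, unsigned pos, unsigned size) {
  uint64_t mask = ((size < 64) ? ((1ULL << size) - 1) : ~0ULL) << pos;
  return (rt & ~mask) | ((rs << pos) & mask);
}

int main() {
  assert(dext(0x00ABCD00u, 8, 16) == 0xABCDu);
  assert(dins(0xFFFFFFFFu, 0xABCDu, 8, 16) == 0xFFABCDFFu);
  return 0;
}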
-def DINS : InsBase<"dins", CPU64Regs>, EXT_FM<7>; +def DINS : InsBase<"dins", CPU64RegsOpnd>, EXT_FM<7>; let Pattern = []<dag> in { - def DINSU : InsBase<"dinsu", CPU64Regs>, EXT_FM<6>; - def DINSM : InsBase<"dinsm", CPU64Regs>, EXT_FM<5>; + def DINSU : InsBase<"dinsu", CPU64RegsOpnd>, EXT_FM<6>; + def DINSM : InsBase<"dinsm", CPU64RegsOpnd>, EXT_FM<5>; } let isCodeGenOnly = 1, rs = 0, shamt = 0 in { @@ -304,38 +309,60 @@ def : MipsPat<(bswap CPU64Regs:$rt), (DSHD (DSBH CPU64Regs:$rt))>; //===----------------------------------------------------------------------===// // Instruction aliases //===----------------------------------------------------------------------===// -def : InstAlias<"move $dst,$src", (DADD CPU64Regs:$dst,CPU64Regs:$src,ZERO_64)>; +def : InstAlias<"move $dst, $src", + (DADDu CPU64RegsOpnd:$dst, CPU64RegsOpnd:$src, ZERO_64), 1>, + Requires<[HasMips64]>; +def : InstAlias<"move $dst, $src", + (OR64 CPU64RegsOpnd:$dst, CPU64RegsOpnd:$src, ZERO_64), 0>, + Requires<[HasMips64]>; +def : InstAlias<"and $rs, $rt, $imm", + (DANDi CPU64RegsOpnd:$rs, CPU64RegsOpnd:$rt, uimm16_64:$imm), + 1>, + Requires<[HasMips64]>; +def : InstAlias<"slt $rs, $rt, $imm", + (SLTi64 CPURegsOpnd:$rs, CPU64Regs:$rt, simm16_64:$imm), 1>, + Requires<[HasMips64]>; +def : InstAlias<"xor $rs, $rt, $imm", + (XORi64 CPU64RegsOpnd:$rs, CPU64RegsOpnd:$rt, uimm16_64:$imm), + 1>, + Requires<[HasMips64]>; +def : InstAlias<"not $rt, $rs", + (NOR64 CPU64RegsOpnd:$rt, CPU64RegsOpnd:$rs, ZERO_64), 1>, + Requires<[HasMips64]>; +def : InstAlias<"j $rs", (JR64 CPU64Regs:$rs), 0>, Requires<[HasMips64]>; +def : InstAlias<"jalr $rs", (JALR64 RA_64, CPU64Regs:$rs)>, + Requires<[HasMips64]>; +def : InstAlias<"daddu $rs, $rt, $imm", + (DADDiu CPU64RegsOpnd:$rs, CPU64RegsOpnd:$rt, simm16_64:$imm), + 1>; +def : InstAlias<"dadd $rs, $rt, $imm", + (DADDi CPU64RegsOpnd:$rs, CPU64RegsOpnd:$rt, simm16_64:$imm), + 1>; /// Move between CPU and coprocessor registers + let DecoderNamespace = "Mips64" in { -def MFC0_3OP64 : MFC3OP<(outs CPU64Regs:$rt), (ins CPU64Regs:$rd, uimm16:$sel), - "mfc0\t$rt, $rd, $sel">, MFC3OP_FM<0x10, 0>; -def MTC0_3OP64 : MFC3OP<(outs CPU64Regs:$rd, uimm16:$sel), (ins CPU64Regs:$rt), - "mtc0\t$rt, $rd, $sel">, MFC3OP_FM<0x10, 4>; -def MFC2_3OP64 : MFC3OP<(outs CPU64Regs:$rt), (ins CPU64Regs:$rd, uimm16:$sel), - "mfc2\t$rt, $rd, $sel">, MFC3OP_FM<0x12, 0>; -def MTC2_3OP64 : MFC3OP<(outs CPU64Regs:$rd, uimm16:$sel), (ins CPU64Regs:$rt), - "mtc2\t$rt, $rd, $sel">, MFC3OP_FM<0x12, 4>; -def DMFC0_3OP64 : MFC3OP<(outs CPU64Regs:$rt), (ins CPU64Regs:$rd, uimm16:$sel), +def DMFC0_3OP64 : MFC3OP<(outs CPU64RegsOpnd:$rt), + (ins CPU64RegsOpnd:$rd, uimm16:$sel), "dmfc0\t$rt, $rd, $sel">, MFC3OP_FM<0x10, 1>; -def DMTC0_3OP64 : MFC3OP<(outs CPU64Regs:$rd, uimm16:$sel), (ins CPU64Regs:$rt), +def DMTC0_3OP64 : MFC3OP<(outs CPU64RegsOpnd:$rd, uimm16:$sel), + (ins CPU64RegsOpnd:$rt), "dmtc0\t$rt, $rd, $sel">, MFC3OP_FM<0x10, 5>; -def DMFC2_3OP64 : MFC3OP<(outs CPU64Regs:$rt), (ins CPU64Regs:$rd, uimm16:$sel), +def DMFC2_3OP64 : MFC3OP<(outs CPU64RegsOpnd:$rt), + (ins CPU64RegsOpnd:$rd, uimm16:$sel), "dmfc2\t$rt, $rd, $sel">, MFC3OP_FM<0x12, 1>; -def DMTC2_3OP64 : MFC3OP<(outs CPU64Regs:$rd, uimm16:$sel), (ins CPU64Regs:$rt), +def DMTC2_3OP64 : MFC3OP<(outs CPU64RegsOpnd:$rd, uimm16:$sel), + (ins CPU64RegsOpnd:$rt), "dmtc2\t$rt, $rd, $sel">, MFC3OP_FM<0x12, 5>; } + // Two operand (implicit 0 selector) versions: -def : InstAlias<"mfc0 $rt, $rd", (MFC0_3OP64 CPU64Regs:$rt, CPU64Regs:$rd, 0)>; -def : InstAlias<"mtc0 $rt, $rd", 
(MTC0_3OP64 CPU64Regs:$rd, 0, CPU64Regs:$rt)>; -def : InstAlias<"mfc2 $rt, $rd", (MFC2_3OP64 CPU64Regs:$rt, CPU64Regs:$rd, 0)>; -def : InstAlias<"mtc2 $rt, $rd", (MTC2_3OP64 CPU64Regs:$rd, 0, CPU64Regs:$rt)>; def : InstAlias<"dmfc0 $rt, $rd", - (DMFC0_3OP64 CPU64Regs:$rt, CPU64Regs:$rd, 0)>; + (DMFC0_3OP64 CPU64RegsOpnd:$rt, CPU64RegsOpnd:$rd, 0), 0>; def : InstAlias<"dmtc0 $rt, $rd", - (DMTC0_3OP64 CPU64Regs:$rd, 0, CPU64Regs:$rt)>; + (DMTC0_3OP64 CPU64RegsOpnd:$rd, 0, CPU64RegsOpnd:$rt), 0>; def : InstAlias<"dmfc2 $rt, $rd", - (DMFC2_3OP64 CPU64Regs:$rt, CPU64Regs:$rd, 0)>; + (DMFC2_3OP64 CPU64RegsOpnd:$rt, CPU64RegsOpnd:$rd, 0), 0>; def : InstAlias<"dmtc2 $rt, $rd", - (DMTC2_3OP64 CPU64Regs:$rd, 0, CPU64Regs:$rt)>; + (DMTC2_3OP64 CPU64RegsOpnd:$rd, 0, CPU64RegsOpnd:$rt), 0>; diff --git a/lib/Target/Mips/MipsAsmPrinter.cpp b/lib/Target/Mips/MipsAsmPrinter.cpp index 6ad7e96..1876cb6 100644 --- a/lib/Target/Mips/MipsAsmPrinter.cpp +++ b/lib/Target/Mips/MipsAsmPrinter.cpp @@ -13,10 +13,11 @@ //===----------------------------------------------------------------------===// #define DEBUG_TYPE "mips-asm-printer" -#include "MipsAsmPrinter.h" #include "InstPrinter/MipsInstPrinter.h" #include "MCTargetDesc/MipsBaseInfo.h" +#include "MCTargetDesc/MipsELFStreamer.h" #include "Mips.h" +#include "MipsAsmPrinter.h" #include "MipsInstrInfo.h" #include "MipsMCInstLower.h" #include "llvm/ADT/SmallString.h" @@ -35,6 +36,7 @@ #include "llvm/MC/MCInst.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" +#include "llvm/Support/ELF.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/Mangler.h" @@ -65,19 +67,28 @@ void MipsAsmPrinter::EmitInstruction(const MachineInstr *MI) { return; } - // Do any auto-generated pseudo lowerings. - if (emitPseudoExpansionLowering(OutStreamer, MI)) - return; - MachineBasicBlock::const_instr_iterator I = MI; MachineBasicBlock::const_instr_iterator E = MI->getParent()->instr_end(); do { - MCInst TmpInst0; - MCInstLowering.Lower(I++, TmpInst0); + // Do any auto-generated pseudo lowerings. + if (emitPseudoExpansionLowering(OutStreamer, &*I)) + continue; + + // The inMips16Mode() test is not permanent. + // Some instructions are marked as pseudo right now which + // would make the test fail for the wrong reason but + // that will be fixed soon. We need this here because we are + // removing another test for this situation downstream in the + // callchain. 
+ // + if (I->isPseudo() && !Subtarget->inMips16Mode()) + llvm_unreachable("Pseudo opcode found in EmitInstruction()"); + MCInst TmpInst0; + MCInstLowering.Lower(I, TmpInst0); OutStreamer.EmitInstruction(TmpInst0); - } while ((I != E) && I->isInsideBundle()); // Delay slot check + } while ((++I != E) && I->isInsideBundle()); // Delay slot check } //===----------------------------------------------------------------------===// @@ -221,6 +232,11 @@ void MipsAsmPrinter::EmitFunctionEntryLabel() { // OutStreamer.EmitRawText(StringRef("\t.set\tnomicromips")); OutStreamer.EmitRawText("\t.ent\t" + Twine(CurrentFnSym->getName())); } + + if (Subtarget->inMicroMipsMode()) + if (MipsELFStreamer *MES = dyn_cast<MipsELFStreamer>(&OutStreamer)) + MES->emitMipsSTOCG(*Subtarget, CurrentFnSym, + (unsigned)ELF::STO_MIPS_MICROMIPS); OutStreamer.EmitLabel(CurrentFnSym); } @@ -236,10 +252,11 @@ void MipsAsmPrinter::EmitFunctionBodyStart() { raw_svector_ostream OS(Str); printSavedRegsBitmask(OS); OutStreamer.EmitRawText(OS.str()); - - OutStreamer.EmitRawText(StringRef("\t.set\tnoreorder")); - OutStreamer.EmitRawText(StringRef("\t.set\tnomacro")); - OutStreamer.EmitRawText(StringRef("\t.set\tnoat")); + if (!Subtarget->inMips16Mode()) { + OutStreamer.EmitRawText(StringRef("\t.set\tnoreorder")); + OutStreamer.EmitRawText(StringRef("\t.set\tnomacro")); + OutStreamer.EmitRawText(StringRef("\t.set\tnoat")); + } } } @@ -250,9 +267,11 @@ void MipsAsmPrinter::EmitFunctionBodyEnd() { // always be at the function end, and we can't emit and // break with BB logic. if (OutStreamer.hasRawTextSupport()) { - OutStreamer.EmitRawText(StringRef("\t.set\tat")); - OutStreamer.EmitRawText(StringRef("\t.set\tmacro")); - OutStreamer.EmitRawText(StringRef("\t.set\treorder")); + if (!Subtarget->inMips16Mode()) { + OutStreamer.EmitRawText(StringRef("\t.set\tat")); + OutStreamer.EmitRawText(StringRef("\t.set\tmacro")); + OutStreamer.EmitRawText(StringRef("\t.set\treorder")); + } OutStreamer.EmitRawText("\t.end\t" + Twine(CurrentFnSym->getName())); } } @@ -540,6 +559,18 @@ void MipsAsmPrinter::EmitStartOfAsmFile(Module &M) { // return to previous section if (OutStreamer.hasRawTextSupport()) OutStreamer.EmitRawText(StringRef("\t.previous")); + +} + +void MipsAsmPrinter::EmitEndOfAsmFile(Module &M) { + + if (OutStreamer.hasRawTextSupport()) return; + + // Emit Mips ELF register info + Subtarget->getMReginfo().emitMipsReginfoSectionCG( + OutStreamer, getObjFileLowering(), *Subtarget); + if (MipsELFStreamer *MES = dyn_cast<MipsELFStreamer>(&OutStreamer)) + MES->emitELFHeaderFlagsCG(*Subtarget); } MachineLocation diff --git a/lib/Target/Mips/MipsAsmPrinter.h b/lib/Target/Mips/MipsAsmPrinter.h index d8fbeeb..dbdaf26 100644 --- a/lib/Target/Mips/MipsAsmPrinter.h +++ b/lib/Target/Mips/MipsAsmPrinter.h @@ -80,6 +80,7 @@ public: void printFCCOperand(const MachineInstr *MI, int opNum, raw_ostream &O, const char *Modifier = 0); void EmitStartOfAsmFile(Module &M); + void EmitEndOfAsmFile(Module &M); virtual MachineLocation getDebugValueLocation(const MachineInstr *MI) const; void PrintDebugValueComment(const MachineInstr *MI, raw_ostream &OS); }; diff --git a/lib/Target/Mips/MipsCodeEmitter.cpp b/lib/Target/Mips/MipsCodeEmitter.cpp index 52fa95b..df877b6 100644 --- a/lib/Target/Mips/MipsCodeEmitter.cpp +++ b/lib/Target/Mips/MipsCodeEmitter.cpp @@ -25,6 +25,7 @@ #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineJumpTableInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" #include 
"llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/Passes.h" @@ -62,67 +63,73 @@ class MipsCodeEmitter : public MachineFunctionPass { static char ID; - public: - MipsCodeEmitter(TargetMachine &tm, JITCodeEmitter &mce) : - MachineFunctionPass(ID), JTI(0), - II((const MipsInstrInfo *) tm.getInstrInfo()), - TD(tm.getDataLayout()), TM(tm), MCE(mce), MCPEs(0), MJTEs(0), - IsPIC(TM.getRelocationModel() == Reloc::PIC_) { - } +public: + MipsCodeEmitter(TargetMachine &tm, JITCodeEmitter &mce) + : MachineFunctionPass(ID), JTI(0), + II((const MipsInstrInfo *) tm.getInstrInfo()), TD(tm.getDataLayout()), + TM(tm), MCE(mce), MCPEs(0), MJTEs(0), + IsPIC(TM.getRelocationModel() == Reloc::PIC_) {} - bool runOnMachineFunction(MachineFunction &MF); + bool runOnMachineFunction(MachineFunction &MF); - virtual const char *getPassName() const { - return "Mips Machine Code Emitter"; - } + virtual const char *getPassName() const { + return "Mips Machine Code Emitter"; + } - /// getBinaryCodeForInstr - This function, generated by the - /// CodeEmitterGenerator using TableGen, produces the binary encoding for - /// machine instructions. - uint64_t getBinaryCodeForInstr(const MachineInstr &MI) const; + /// getBinaryCodeForInstr - This function, generated by the + /// CodeEmitterGenerator using TableGen, produces the binary encoding for + /// machine instructions. + uint64_t getBinaryCodeForInstr(const MachineInstr &MI) const; - void emitInstruction(const MachineInstr &MI); + void emitInstruction(MachineBasicBlock::instr_iterator MI, + MachineBasicBlock &MBB); - private: +private: - void emitWord(unsigned Word); + void emitWord(unsigned Word); - /// Routines that handle operands which add machine relocations which are - /// fixed up by the relocation stage. - void emitGlobalAddress(const GlobalValue *GV, unsigned Reloc, - bool MayNeedFarStub) const; - void emitExternalSymbolAddress(const char *ES, unsigned Reloc) const; - void emitConstPoolAddress(unsigned CPI, unsigned Reloc) const; - void emitJumpTableAddress(unsigned JTIndex, unsigned Reloc) const; - void emitMachineBasicBlock(MachineBasicBlock *BB, unsigned Reloc) const; + /// Routines that handle operands which add machine relocations which are + /// fixed up by the relocation stage. + void emitGlobalAddress(const GlobalValue *GV, unsigned Reloc, + bool MayNeedFarStub) const; + void emitExternalSymbolAddress(const char *ES, unsigned Reloc) const; + void emitConstPoolAddress(unsigned CPI, unsigned Reloc) const; + void emitJumpTableAddress(unsigned JTIndex, unsigned Reloc) const; + void emitMachineBasicBlock(MachineBasicBlock *BB, unsigned Reloc) const; - /// getMachineOpValue - Return binary encoding of operand. If the machine - /// operand requires relocation, record the relocation and return zero. - unsigned getMachineOpValue(const MachineInstr &MI, - const MachineOperand &MO) const; + /// getMachineOpValue - Return binary encoding of operand. If the machine + /// operand requires relocation, record the relocation and return zero. 
+ unsigned getMachineOpValue(const MachineInstr &MI, + const MachineOperand &MO) const; - unsigned getRelocation(const MachineInstr &MI, - const MachineOperand &MO) const; + unsigned getRelocation(const MachineInstr &MI, + const MachineOperand &MO) const; - unsigned getJumpTargetOpValue(const MachineInstr &MI, unsigned OpNo) const; + unsigned getJumpTargetOpValue(const MachineInstr &MI, unsigned OpNo) const; - unsigned getBranchTargetOpValue(const MachineInstr &MI, - unsigned OpNo) const; - unsigned getMemEncoding(const MachineInstr &MI, unsigned OpNo) const; - unsigned getSizeExtEncoding(const MachineInstr &MI, unsigned OpNo) const; - unsigned getSizeInsEncoding(const MachineInstr &MI, unsigned OpNo) const; + unsigned getBranchTargetOpValue(const MachineInstr &MI, unsigned OpNo) const; + unsigned getMemEncoding(const MachineInstr &MI, unsigned OpNo) const; + unsigned getSizeExtEncoding(const MachineInstr &MI, unsigned OpNo) const; + unsigned getSizeInsEncoding(const MachineInstr &MI, unsigned OpNo) const; - void emitGlobalAddressUnaligned(const GlobalValue *GV, unsigned Reloc, - int Offset) const; - }; + void emitGlobalAddressUnaligned(const GlobalValue *GV, unsigned Reloc, + int Offset) const; + + /// \brief Expand pseudo instruction. Return true if MI was expanded. + bool expandPseudos(MachineBasicBlock::instr_iterator &MI, + MachineBasicBlock &MBB) const; +}; } char MipsCodeEmitter::ID = 0; bool MipsCodeEmitter::runOnMachineFunction(MachineFunction &MF) { - JTI = ((MipsTargetMachine&) MF.getTarget()).getJITInfo(); - II = ((const MipsTargetMachine&) MF.getTarget()).getInstrInfo(); - TD = ((const MipsTargetMachine&) MF.getTarget()).getDataLayout(); + MipsTargetMachine &Target = static_cast<MipsTargetMachine &>( + const_cast<TargetMachine &>(MF.getTarget())); + + JTI = Target.getJITInfo(); + II = Target.getInstrInfo(); + TD = Target.getDataLayout(); Subtarget = &TM.getSubtarget<MipsSubtarget> (); MCPEs = &MF.getConstantPool()->getConstants(); MJTEs = 0; @@ -139,8 +146,8 @@ bool MipsCodeEmitter::runOnMachineFunction(MachineFunction &MF) { MBB != E; ++MBB){ MCE.StartMachineBasicBlock(MBB); for (MachineBasicBlock::instr_iterator I = MBB->instr_begin(), - E = MBB->instr_end(); I != E; ++I) - emitInstruction(*I); + E = MBB->instr_end(); I != E;) + emitInstruction(*I++, *MBB); } } while (MCE.finishFunction(MF)); @@ -265,19 +272,21 @@ void MipsCodeEmitter::emitMachineBasicBlock(MachineBasicBlock *BB, Reloc, BB)); } -void MipsCodeEmitter::emitInstruction(const MachineInstr &MI) { - DEBUG(errs() << "JIT: " << (void*)MCE.getCurrentPCValue() << ":\t" << MI); - - MCE.processDebugLoc(MI.getDebugLoc(), true); +void MipsCodeEmitter::emitInstruction(MachineBasicBlock::instr_iterator MI, + MachineBasicBlock &MBB) { + DEBUG(errs() << "JIT: " << (void*)MCE.getCurrentPCValue() << ":\t" << *MI); - // Skip pseudo instructions. - if ((MI.getDesc().TSFlags & MipsII::FormMask) == MipsII::Pseudo) + // Expand pseudo instruction. Skip if MI was not expanded. 
+ if (((MI->getDesc().TSFlags & MipsII::FormMask) == MipsII::Pseudo) && + !expandPseudos(MI, MBB)) return; - emitWord(getBinaryCodeForInstr(MI)); + MCE.processDebugLoc(MI->getDebugLoc(), true); + + emitWord(getBinaryCodeForInstr(*MI)); ++NumEmitted; // Keep track of the # of mi's emitted - MCE.processDebugLoc(MI.getDebugLoc(), false); + MCE.processDebugLoc(MI->getDebugLoc(), false); } void MipsCodeEmitter::emitWord(unsigned Word) { @@ -289,6 +298,25 @@ void MipsCodeEmitter::emitWord(unsigned Word) { MCE.emitWordBE(Word); } +bool MipsCodeEmitter::expandPseudos(MachineBasicBlock::instr_iterator &MI, + MachineBasicBlock &MBB) const { + switch (MI->getOpcode()) { + case Mips::NOP: + BuildMI(MBB, &*MI, MI->getDebugLoc(), II->get(Mips::SLL), Mips::ZERO) + .addReg(Mips::ZERO).addImm(0); + break; + case Mips::JALRPseudo: + BuildMI(MBB, &*MI, MI->getDebugLoc(), II->get(Mips::JALR), Mips::RA) + .addReg(MI->getOperand(0).getReg()); + break; + default: + return false; + } + + (MI--)->eraseFromBundle(); + return true; +} + /// createMipsJITCodeEmitterPass - Return a pass that emits the collected Mips /// code to the specified MCE object. FunctionPass *llvm::createMipsJITCodeEmitterPass(MipsTargetMachine &TM, diff --git a/lib/Target/Mips/MipsDelaySlotFiller.cpp b/lib/Target/Mips/MipsDelaySlotFiller.cpp index 041a9d0..d62b166 100644 --- a/lib/Target/Mips/MipsDelaySlotFiller.cpp +++ b/lib/Target/Mips/MipsDelaySlotFiller.cpp @@ -1,4 +1,4 @@ -//===-- DelaySlotFiller.cpp - Mips Delay Slot Filler ----------------------===// +//===-- MipsDelaySlotFiller.cpp - Mips Delay Slot Filler ------------------===// // // The LLVM Compiler Infrastructure // @@ -7,7 +7,7 @@ // //===----------------------------------------------------------------------===// // -// Simple pass to fills delay slots with useful instructions. +// Simple pass to fill delay slots with useful instructions. // //===----------------------------------------------------------------------===// @@ -15,7 +15,7 @@ #include "Mips.h" #include "MipsTargetMachine.h" -#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/BitVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -33,8 +33,7 @@ STATISTIC(UsefulSlots, "Number of delay slots filled with instructions that" static cl::opt<bool> DisableDelaySlotFiller( "disable-mips-delay-filler", cl::init(false), - cl::desc("Disable the delay slot filler, which attempts to fill the Mips" - "delay slots with useful instructions."), + cl::desc("Fill all delay slots with NOPs."), cl::Hidden); // This option can be used to silence complaints by machine verifier passes. @@ -45,15 +44,25 @@ static cl::opt<bool> SkipDelaySlotFiller( cl::Hidden); namespace { - struct Filler : public MachineFunctionPass { - typedef MachineBasicBlock::instr_iterator InstrIter; - typedef MachineBasicBlock::reverse_instr_iterator ReverseInstrIter; + class RegDefsUses { + public: + RegDefsUses(TargetMachine &TM); + void init(const MachineInstr &MI); + bool update(const MachineInstr &MI, unsigned Begin, unsigned End); - TargetMachine &TM; - const TargetInstrInfo *TII; - InstrIter LastFiller; + private: + bool checkRegDefsUses(BitVector &NewDefs, BitVector &NewUses, unsigned Reg, + bool IsDef) const; - static char ID; + /// Returns true if Reg or its alias is in RegSet. 
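expandPseudos above materializes Mips::NOP as "sll $zero, $zero, 0". That is the canonical choice because the all-zero instruction word decodes as exactly that sll, so no separate NOP encoding is needed. A small standalone check (plain C++, encoding per the MIPS R-type layout; the helper name is invented):

#include <cassert>
#include <cstdint>

// R-type layout: opcode(6) rs(5) rt(5) rd(5) shamt(5) funct(6).
// SLL has opcode 0 and funct 0, so sll $zero, $zero, 0 is the zero word.
uint32_t encodeSll(unsigned rd, unsigned rt, unsigned shamt) {
  return (0u << 26) | (0u << 21) | (rt << 16) | (rd << 11) | (shamt << 6) | 0u;
}

int main() {
  assert(encodeSll(/*rd=*/0, /*rt=*/0, /*shamt=*/0) == 0u); // the MIPS NOP
  return 0;
}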
+ bool isRegInSet(const BitVector &RegSet, unsigned Reg) const; + + const TargetRegisterInfo &TRI; + BitVector Defs, Uses; + }; + + class Filler : public MachineFunctionPass { + public: Filler(TargetMachine &tm) : MachineFunctionPass(ID), TM(tm), TII(tm.getInstrInfo()) { } @@ -61,7 +70,6 @@ namespace { return "Mips Delay Slot Filler"; } - bool runOnMachineBasicBlock(MachineBasicBlock &MBB); bool runOnMachineFunction(MachineFunction &F) { if (SkipDelaySlotFiller) return false; @@ -73,66 +81,115 @@ namespace { return Changed; } - bool isDelayFiller(MachineBasicBlock &MBB, - InstrIter candidate); + private: + typedef MachineBasicBlock::iterator Iter; + typedef MachineBasicBlock::reverse_iterator ReverseIter; - void insertCallUses(InstrIter MI, - SmallSet<unsigned, 32> &RegDefs, - SmallSet<unsigned, 32> &RegUses); - - void insertDefsUses(InstrIter MI, - SmallSet<unsigned, 32> &RegDefs, - SmallSet<unsigned, 32> &RegUses); + bool runOnMachineBasicBlock(MachineBasicBlock &MBB); - bool IsRegInSet(SmallSet<unsigned, 32> &RegSet, - unsigned Reg); + /// This function checks if it is valid to move Candidate to the delay slot + /// and returns true if it isn't. It also updates load and store flags and + /// register defs and uses. + bool delayHasHazard(const MachineInstr &Candidate, bool &SawLoad, + bool &SawStore, RegDefsUses &RegDU) const; - bool delayHasHazard(InstrIter candidate, - bool &sawLoad, bool &sawStore, - SmallSet<unsigned, 32> &RegDefs, - SmallSet<unsigned, 32> &RegUses); + bool findDelayInstr(MachineBasicBlock &MBB, Iter slot, Iter &Filler) const; - bool - findDelayInstr(MachineBasicBlock &MBB, InstrIter slot, - InstrIter &Filler); + bool terminateSearch(const MachineInstr &Candidate) const; + TargetMachine &TM; + const TargetInstrInfo *TII; + static char ID; }; char Filler::ID = 0; } // end of anonymous namespace +RegDefsUses::RegDefsUses(TargetMachine &TM) + : TRI(*TM.getRegisterInfo()), Defs(TRI.getNumRegs(), false), + Uses(TRI.getNumRegs(), false) {} + +void RegDefsUses::init(const MachineInstr &MI) { + // Add all register operands which are explicit and non-variadic. + update(MI, 0, MI.getDesc().getNumOperands()); + + // If MI is a call, add RA to Defs to prevent users of RA from going into + // delay slot. + if (MI.isCall()) + Defs.set(Mips::RA); + + // Add all implicit register operands of branch instructions except + // register AT. + if (MI.isBranch()) { + update(MI, MI.getDesc().getNumOperands(), MI.getNumOperands()); + Defs.reset(Mips::AT); + } +} + +bool RegDefsUses::update(const MachineInstr &MI, unsigned Begin, unsigned End) { + BitVector NewDefs(TRI.getNumRegs()), NewUses(TRI.getNumRegs()); + bool HasHazard = false; + + for (unsigned I = Begin; I != End; ++I) { + const MachineOperand &MO = MI.getOperand(I); + + if (MO.isReg() && MO.getReg()) + HasHazard |= checkRegDefsUses(NewDefs, NewUses, MO.getReg(), MO.isDef()); + } + + Defs |= NewDefs; + Uses |= NewUses; + + return HasHazard; +} + +bool RegDefsUses::checkRegDefsUses(BitVector &NewDefs, BitVector &NewUses, + unsigned Reg, bool IsDef) const { + if (IsDef) { + NewDefs.set(Reg); + // check whether Reg has already been defined or used. + return (isRegInSet(Defs, Reg) || isRegInSet(Uses, Reg)); + } + + NewUses.set(Reg); + // check whether Reg has already been defined. + return isRegInSet(Defs, Reg); +} + +bool RegDefsUses::isRegInSet(const BitVector &RegSet, unsigned Reg) const { + // Check Reg and all aliased Registers. 
+ for (MCRegAliasIterator AI(Reg, &TRI, true); AI.isValid(); ++AI) + if (RegSet.test(*AI)) + return true; + return false; +} + /// runOnMachineBasicBlock - Fill in delay slots for the given basic block. /// We assume there is only one delay slot per delayed instruction. -bool Filler:: -runOnMachineBasicBlock(MachineBasicBlock &MBB) { +bool Filler::runOnMachineBasicBlock(MachineBasicBlock &MBB) { bool Changed = false; - LastFiller = MBB.instr_end(); - - for (InstrIter I = MBB.instr_begin(); I != MBB.instr_end(); ++I) - if (I->hasDelaySlot()) { - ++FilledSlots; - Changed = true; - InstrIter InstrWithSlot = I; - InstrIter D; - - // Delay slot filling is disabled at -O0. - if (!DisableDelaySlotFiller && (TM.getOptLevel() != CodeGenOpt::None) && - findDelayInstr(MBB, I, D)) { - MBB.splice(llvm::next(I), &MBB, D); - ++UsefulSlots; - } else - BuildMI(MBB, llvm::next(I), I->getDebugLoc(), TII->get(Mips::NOP)); - - // Record the filler instruction that filled the delay slot. - // The instruction after it will be visited in the next iteration. - LastFiller = ++I; - - // Bundle the delay slot filler to InstrWithSlot so that the machine - // verifier doesn't expect this instruction to be a terminator. - MIBundleBuilder(MBB, InstrWithSlot, llvm::next(LastFiller)); - } - return Changed; + for (Iter I = MBB.begin(); I != MBB.end(); ++I) { + if (!I->hasDelaySlot()) + continue; + + ++FilledSlots; + Changed = true; + Iter D; + + // Delay slot filling is disabled at -O0. + if (!DisableDelaySlotFiller && (TM.getOptLevel() != CodeGenOpt::None) && + findDelayInstr(MBB, I, D)) { + MBB.splice(llvm::next(I), &MBB, D); + ++UsefulSlots; + } else + BuildMI(MBB, llvm::next(I), I->getDebugLoc(), TII->get(Mips::NOP)); + + // Bundle the delay slot filler to the instruction with the delay slot. + MIBundleBuilder(MBB, I, llvm::next(llvm::next(I))); + } + + return Changed; } /// createMipsDelaySlotFillerPass - Returns a pass that fills in delay @@ -141,146 +198,57 @@ FunctionPass *llvm::createMipsDelaySlotFillerPass(MipsTargetMachine &tm) { return new Filler(tm); } -bool Filler::findDelayInstr(MachineBasicBlock &MBB, - InstrIter slot, - InstrIter &Filler) { - SmallSet<unsigned, 32> RegDefs; - SmallSet<unsigned, 32> RegUses; +bool Filler::findDelayInstr(MachineBasicBlock &MBB, Iter Slot, + Iter &Filler) const { + RegDefsUses RegDU(TM); - insertDefsUses(slot, RegDefs, RegUses); + RegDU.init(*Slot); - bool sawLoad = false; - bool sawStore = false; + bool SawLoad = false; + bool SawStore = false; - for (ReverseInstrIter I(slot); I != MBB.instr_rend(); ++I) { + for (ReverseIter I(Slot); I != MBB.rend(); ++I) { // skip debug value if (I->isDebugValue()) continue; - // Convert to forward iterator. - InstrIter FI(llvm::next(I).base()); - - if (I->hasUnmodeledSideEffects() - || I->isInlineAsm() - || I->isLabel() - || FI == LastFiller - || I->isPseudo() - // - // Should not allow: - // ERET, DERET or WAIT, PAUSE. Need to add these to instruction - // list. TBD. 
- ) + if (terminateSearch(*I)) break; - if (delayHasHazard(FI, sawLoad, sawStore, RegDefs, RegUses)) { - insertDefsUses(FI, RegDefs, RegUses); + if (delayHasHazard(*I, SawLoad, SawStore, RegDU)) continue; - } - Filler = FI; + Filler = llvm::next(I).base(); return true; } return false; } -bool Filler::delayHasHazard(InstrIter candidate, - bool &sawLoad, bool &sawStore, - SmallSet<unsigned, 32> &RegDefs, - SmallSet<unsigned, 32> &RegUses) { - if (candidate->isImplicitDef() || candidate->isKill()) - return true; +bool Filler::delayHasHazard(const MachineInstr &Candidate, bool &SawLoad, + bool &SawStore, RegDefsUses &RegDU) const { + bool HasHazard = (Candidate.isImplicitDef() || Candidate.isKill()); // Loads or stores cannot be moved past a store to the delay slot // and stores cannot be moved past a load. - if (candidate->mayLoad()) { - if (sawStore) - return true; - sawLoad = true; - } - - if (candidate->mayStore()) { - if (sawStore) - return true; - sawStore = true; - if (sawLoad) - return true; + if (Candidate.mayStore() || Candidate.hasOrderedMemoryRef()) { + HasHazard |= SawStore | SawLoad; + SawStore = true; + } else if (Candidate.mayLoad()) { + HasHazard |= SawStore; + SawLoad = true; } - assert((!candidate->isCall() && !candidate->isReturn()) && + assert((!Candidate.isCall() && !Candidate.isReturn()) && "Cannot put calls or returns in delay slot."); - for (unsigned i = 0, e = candidate->getNumOperands(); i!= e; ++i) { - const MachineOperand &MO = candidate->getOperand(i); - unsigned Reg; + HasHazard |= RegDU.update(Candidate, 0, Candidate.getNumOperands()); - if (!MO.isReg() || !(Reg = MO.getReg())) - continue; // skip - - if (MO.isDef()) { - // check whether Reg is defined or used before delay slot. - if (IsRegInSet(RegDefs, Reg) || IsRegInSet(RegUses, Reg)) - return true; - } - if (MO.isUse()) { - // check whether Reg is defined before delay slot. - if (IsRegInSet(RegDefs, Reg)) - return true; - } - } - return false; -} - -// Helper function for getting a MachineOperand's register number and adding it -// to RegDefs or RegUses. -static void insertDefUse(const MachineOperand &MO, - SmallSet<unsigned, 32> &RegDefs, - SmallSet<unsigned, 32> &RegUses, - unsigned ExcludedReg = 0) { - unsigned Reg; - - if (!MO.isReg() || !(Reg = MO.getReg()) || (Reg == ExcludedReg)) - return; - - if (MO.isDef()) - RegDefs.insert(Reg); - else if (MO.isUse()) - RegUses.insert(Reg); -} - -// Insert Defs and Uses of MI into the sets RegDefs and RegUses. -void Filler::insertDefsUses(InstrIter MI, - SmallSet<unsigned, 32> &RegDefs, - SmallSet<unsigned, 32> &RegUses) { - unsigned I, E = MI->getDesc().getNumOperands(); - - for (I = 0; I != E; ++I) - insertDefUse(MI->getOperand(I), RegDefs, RegUses); - - // If MI is a call, add RA to RegDefs to prevent users of RA from going into - // delay slot. - if (MI->isCall()) { - RegDefs.insert(Mips::RA); - return; - } - - // Return if MI is a return. - if (MI->isReturn()) - return; - - // Examine the implicit operands. Exclude register AT which is in the list of - // clobbered registers of branch instructions. - E = MI->getNumOperands(); - for (; I != E; ++I) - insertDefUse(MI->getOperand(I), RegDefs, RegUses, Mips::AT); + return HasHazard; } -//returns true if the Reg or its alias is in the RegSet. -bool Filler::IsRegInSet(SmallSet<unsigned, 32> &RegSet, unsigned Reg) { - // Check Reg and all aliased Registers. 
- for (MCRegAliasIterator AI(Reg, TM.getRegisterInfo(), true); - AI.isValid(); ++AI) - if (RegSet.count(*AI)) - return true; - return false; +bool Filler::terminateSearch(const MachineInstr &Candidate) const { + return (Candidate.isTerminator() || Candidate.isCall() || + Candidate.isLabel() || Candidate.isInlineAsm() || + Candidate.hasUnmodeledSideEffects()); } diff --git a/lib/Target/Mips/MipsISelDAGToDAG.cpp b/lib/Target/Mips/MipsISelDAGToDAG.cpp index c5f1290..78c74ef 100644 --- a/lib/Target/Mips/MipsISelDAGToDAG.cpp +++ b/lib/Target/Mips/MipsISelDAGToDAG.cpp @@ -96,7 +96,14 @@ private: SDNode *Select(SDNode *N); // Complex Pattern. - bool SelectAddr(SDNode *Parent, SDValue N, SDValue &Base, SDValue &Offset); + /// (reg + imm). + bool selectAddrRegImm(SDValue Addr, SDValue &Base, SDValue &Offset) const; + + /// Fall back on this function if all else fails. + bool selectAddrDefault(SDValue Addr, SDValue &Base, SDValue &Offset) const; + + /// Match integer address pattern. + bool selectIntAddr(SDValue Addr, SDValue &Base, SDValue &Offset) const; bool SelectAddr16(SDNode *Parent, SDValue N, SDValue &Base, SDValue &Offset, SDValue &Alias); @@ -323,8 +330,8 @@ SDValue MipsDAGToDAGISel::getMips16SPAliasReg() { /// ComplexPattern used on MipsInstrInfo /// Used on Mips Load/Store instructions -bool MipsDAGToDAGISel:: -SelectAddr(SDNode *Parent, SDValue Addr, SDValue &Base, SDValue &Offset) { +bool MipsDAGToDAGISel::selectAddrRegImm(SDValue Addr, SDValue &Base, + SDValue &Offset) const { EVT ValTy = Addr.getValueType(); // if Address is FI, get the TargetFrameIndex. @@ -384,21 +391,24 @@ SelectAddr(SDNode *Parent, SDValue Addr, SDValue &Base, SDValue &Offset) { return true; } } - - // If an indexed floating point load/store can be emitted, return false. - const LSBaseSDNode *LS = dyn_cast<LSBaseSDNode>(Parent); - - if (LS && - (LS->getMemoryVT() == MVT::f32 || LS->getMemoryVT() == MVT::f64) && - Subtarget.hasFPIdx()) - return false; } - Base = Addr; - Offset = CurDAG->getTargetConstant(0, ValTy); + return false; +} + +bool MipsDAGToDAGISel::selectAddrDefault(SDValue Addr, SDValue &Base, + SDValue &Offset) const { + Base = Addr; + Offset = CurDAG->getTargetConstant(0, Addr.getValueType()); return true; } +bool MipsDAGToDAGISel::selectIntAddr(SDValue Addr, SDValue &Base, + SDValue &Offset) const { + return selectAddrRegImm(Addr, Base, Offset) || + selectAddrDefault(Addr, Base, Offset); +} + void MipsDAGToDAGISel::getMips16SPRefReg(SDNode *Parent, SDValue &AliasReg) { SDValue AliasFPReg = CurDAG->getRegister(Mips::S0, TLI.getPointerTy()); if (Parent) { diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp index a309040..36e1a15 100644 --- a/lib/Target/Mips/MipsISelLowering.cpp +++ b/lib/Target/Mips/MipsISelLowering.cpp @@ -11,8 +11,8 @@ // selection DAG. 
// //===----------------------------------------------------------------------===// - #define DEBUG_TYPE "mips-lower" +#include <set> #include "MipsISelLowering.h" #include "InstPrinter/MipsInstPrinter.h" #include "MCTargetDesc/MipsBaseInfo.h" @@ -55,6 +55,12 @@ Mips16HardFloat("mips16-hard-float", cl::NotHidden, cl::desc("MIPS: mips16 hard float enable."), cl::init(false)); +static cl::opt<bool> DontExpandCondPseudos16( + "mips16-dont-expand-cond-pseudo", + cl::init(false), + cl::desc("Dont expand conditional move related " + "pseudos for Mips 16"), + cl::Hidden); static const uint16_t O32IntRegs[4] = { @@ -162,6 +168,7 @@ const char *MipsTargetLowering::getTargetNodeName(unsigned Opcode) const { case MipsISD::GPRel: return "MipsISD::GPRel"; case MipsISD::ThreadPointer: return "MipsISD::ThreadPointer"; case MipsISD::Ret: return "MipsISD::Ret"; + case MipsISD::EH_RETURN: return "MipsISD::EH_RETURN"; case MipsISD::FPBrcond: return "MipsISD::FPBrcond"; case MipsISD::FPCmp: return "MipsISD::FPCmp"; case MipsISD::CMovFP_T: return "MipsISD::CMovFP_T"; @@ -205,39 +212,56 @@ const char *MipsTargetLowering::getTargetNodeName(unsigned Opcode) const { } } +namespace { + struct ltstr { + bool operator()(const char *s1, const char *s2) const + { + return strcmp(s1, s2) < 0; + } + }; + + std::set<const char*, ltstr> noHelperNeeded; +} + +void MipsTargetLowering::SetMips16LibcallName + (RTLIB::Libcall l, const char *Name) { + setLibcallName(l, Name); + noHelperNeeded.insert(Name); +} + void MipsTargetLowering::setMips16HardFloatLibCalls() { - setLibcallName(RTLIB::ADD_F32, "__mips16_addsf3"); - setLibcallName(RTLIB::ADD_F64, "__mips16_adddf3"); - setLibcallName(RTLIB::SUB_F32, "__mips16_subsf3"); - setLibcallName(RTLIB::SUB_F64, "__mips16_subdf3"); - setLibcallName(RTLIB::MUL_F32, "__mips16_mulsf3"); - setLibcallName(RTLIB::MUL_F64, "__mips16_muldf3"); - setLibcallName(RTLIB::DIV_F32, "__mips16_divsf3"); - setLibcallName(RTLIB::DIV_F64, "__mips16_divdf3"); - setLibcallName(RTLIB::FPEXT_F32_F64, "__mips16_extendsfdf2"); - setLibcallName(RTLIB::FPROUND_F64_F32, "__mips16_truncdfsf2"); - setLibcallName(RTLIB::FPTOSINT_F32_I32, "__mips16_fix_truncsfsi"); - setLibcallName(RTLIB::FPTOSINT_F64_I32, "__mips16_fix_truncdfsi"); - setLibcallName(RTLIB::SINTTOFP_I32_F32, "__mips16_floatsisf"); - setLibcallName(RTLIB::SINTTOFP_I32_F64, "__mips16_floatsidf"); - setLibcallName(RTLIB::UINTTOFP_I32_F32, "__mips16_floatunsisf"); - setLibcallName(RTLIB::UINTTOFP_I32_F64, "__mips16_floatunsidf"); - setLibcallName(RTLIB::OEQ_F32, "__mips16_eqsf2"); - setLibcallName(RTLIB::OEQ_F64, "__mips16_eqdf2"); - setLibcallName(RTLIB::UNE_F32, "__mips16_nesf2"); - setLibcallName(RTLIB::UNE_F64, "__mips16_nedf2"); - setLibcallName(RTLIB::OGE_F32, "__mips16_gesf2"); - setLibcallName(RTLIB::OGE_F64, "__mips16_gedf2"); - setLibcallName(RTLIB::OLT_F32, "__mips16_ltsf2"); - setLibcallName(RTLIB::OLT_F64, "__mips16_ltdf2"); - setLibcallName(RTLIB::OLE_F32, "__mips16_lesf2"); - setLibcallName(RTLIB::OLE_F64, "__mips16_ledf2"); - setLibcallName(RTLIB::OGT_F32, "__mips16_gtsf2"); - setLibcallName(RTLIB::OGT_F64, "__mips16_gtdf2"); - setLibcallName(RTLIB::UO_F32, "__mips16_unordsf2"); - setLibcallName(RTLIB::UO_F64, "__mips16_unorddf2"); - setLibcallName(RTLIB::O_F32, "__mips16_unordsf2"); - setLibcallName(RTLIB::O_F64, "__mips16_unorddf2"); + SetMips16LibcallName(RTLIB::ADD_F32, "__mips16_addsf3"); + SetMips16LibcallName(RTLIB::ADD_F64, "__mips16_adddf3"); + SetMips16LibcallName(RTLIB::SUB_F32, "__mips16_subsf3"); + 
SetMips16LibcallName(RTLIB::SUB_F64, "__mips16_subdf3"); + SetMips16LibcallName(RTLIB::MUL_F32, "__mips16_mulsf3"); + SetMips16LibcallName(RTLIB::MUL_F64, "__mips16_muldf3"); + SetMips16LibcallName(RTLIB::DIV_F32, "__mips16_divsf3"); + SetMips16LibcallName(RTLIB::DIV_F64, "__mips16_divdf3"); + SetMips16LibcallName(RTLIB::FPEXT_F32_F64, "__mips16_extendsfdf2"); + SetMips16LibcallName(RTLIB::FPROUND_F64_F32, "__mips16_truncdfsf2"); + SetMips16LibcallName(RTLIB::FPTOSINT_F32_I32, "__mips16_fix_truncsfsi"); + SetMips16LibcallName(RTLIB::FPTOSINT_F64_I32, "__mips16_fix_truncdfsi"); + SetMips16LibcallName(RTLIB::SINTTOFP_I32_F32, "__mips16_floatsisf"); + SetMips16LibcallName(RTLIB::SINTTOFP_I32_F64, "__mips16_floatsidf"); + SetMips16LibcallName(RTLIB::UINTTOFP_I32_F32, "__mips16_floatunsisf"); + SetMips16LibcallName(RTLIB::UINTTOFP_I32_F64, "__mips16_floatunsidf"); + SetMips16LibcallName(RTLIB::OEQ_F32, "__mips16_eqsf2"); + SetMips16LibcallName(RTLIB::OEQ_F64, "__mips16_eqdf2"); + SetMips16LibcallName(RTLIB::UNE_F32, "__mips16_nesf2"); + SetMips16LibcallName(RTLIB::UNE_F64, "__mips16_nedf2"); + SetMips16LibcallName(RTLIB::OGE_F32, "__mips16_gesf2"); + SetMips16LibcallName(RTLIB::OGE_F64, "__mips16_gedf2"); + SetMips16LibcallName(RTLIB::OLT_F32, "__mips16_ltsf2"); + SetMips16LibcallName(RTLIB::OLT_F64, "__mips16_ltdf2"); + SetMips16LibcallName(RTLIB::OLE_F32, "__mips16_lesf2"); + SetMips16LibcallName(RTLIB::OLE_F64, "__mips16_ledf2"); + SetMips16LibcallName(RTLIB::OGT_F32, "__mips16_gtsf2"); + SetMips16LibcallName(RTLIB::OGT_F64, "__mips16_gtdf2"); + SetMips16LibcallName(RTLIB::UO_F32, "__mips16_unordsf2"); + SetMips16LibcallName(RTLIB::UO_F64, "__mips16_unorddf2"); + SetMips16LibcallName(RTLIB::O_F32, "__mips16_unordsf2"); + SetMips16LibcallName(RTLIB::O_F64, "__mips16_unorddf2"); } MipsTargetLowering:: @@ -404,6 +428,8 @@ MipsTargetLowering(MipsTargetMachine &TM) setOperationAction(ISD::FSIN, MVT::f64, Expand); setOperationAction(ISD::FCOS, MVT::f32, Expand); setOperationAction(ISD::FCOS, MVT::f64, Expand); + setOperationAction(ISD::FSINCOS, MVT::f32, Expand); + setOperationAction(ISD::FSINCOS, MVT::f64, Expand); setOperationAction(ISD::FPOWI, MVT::f32, Expand); setOperationAction(ISD::FPOW, MVT::f32, Expand); setOperationAction(ISD::FPOW, MVT::f64, Expand); @@ -426,6 +452,8 @@ MipsTargetLowering(MipsTargetMachine &TM) setOperationAction(ISD::EHSELECTION, MVT::i32, Expand); setOperationAction(ISD::EHSELECTION, MVT::i64, Expand); + setOperationAction(ISD::EH_RETURN, MVT::Other, Custom); + setOperationAction(ISD::VAARG, MVT::Other, Expand); setOperationAction(ISD::VACOPY, MVT::Other, Expand); setOperationAction(ISD::VAEND, MVT::Other, Expand); @@ -498,7 +526,7 @@ MipsTargetLowering(MipsTargetMachine &TM) setExceptionPointerRegister(IsN64 ? Mips::A0_64 : Mips::A0); setExceptionSelectorRegister(IsN64 ? 
Mips::A1_64 : Mips::A1); - maxStoresPerMemcpy = 16; + MaxStoresPerMemcpy = 16; } bool @@ -1026,6 +1054,7 @@ LowerOperation(SDValue Op, SelectionDAG &DAG) const case ISD::FABS: return LowerFABS(Op, DAG); case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); + case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG); case ISD::MEMBARRIER: return LowerMEMBARRIER(Op, DAG); case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, DAG); case ISD::SHL_PARTS: return LowerShiftLeftParts(Op, DAG); @@ -1207,11 +1236,290 @@ MipsTargetLowering::EmitBPOSGE32(MachineInstr *MI, MachineBasicBlock *BB) const{ return Sink; } +MachineBasicBlock *MipsTargetLowering::EmitSel16(unsigned Opc, MachineInstr *MI, + MachineBasicBlock *BB) const { + if (DontExpandCondPseudos16) + return BB; + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + DebugLoc dl = MI->getDebugLoc(); + // To "insert" a SELECT_CC instruction, we actually have to insert the + // diamond control-flow pattern. The incoming instruction knows the + // destination vreg to set, the condition code register to branch on, the + // true/false values to select between, and a branch opcode to use. + const BasicBlock *LLVM_BB = BB->getBasicBlock(); + MachineFunction::iterator It = BB; + ++It; + + // thisMBB: + // ... + // TrueVal = ... + // setcc r1, r2, r3 + // bNE r1, r0, copy1MBB + // fallthrough --> copy0MBB + MachineBasicBlock *thisMBB = BB; + MachineFunction *F = BB->getParent(); + MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); + F->insert(It, copy0MBB); + F->insert(It, sinkMBB); + + // Transfer the remainder of BB and its successor edges to sinkMBB. + sinkMBB->splice(sinkMBB->begin(), BB, + llvm::next(MachineBasicBlock::iterator(MI)), + BB->end()); + sinkMBB->transferSuccessorsAndUpdatePHIs(BB); + + // Next, add the true and fallthrough blocks as its successors. + BB->addSuccessor(copy0MBB); + BB->addSuccessor(sinkMBB); + + BuildMI(BB, dl, TII->get(Opc)).addReg(MI->getOperand(3).getReg()) + .addMBB(sinkMBB); + + // copy0MBB: + // %FalseValue = ... + // # fallthrough to sinkMBB + BB = copy0MBB; + + // Update machine-CFG edges + BB->addSuccessor(sinkMBB); + + // sinkMBB: + // %Result = phi [ %TrueValue, thisMBB ], [ %FalseValue, copy0MBB ] + // ... + BB = sinkMBB; + + BuildMI(*BB, BB->begin(), dl, + TII->get(Mips::PHI), MI->getOperand(0).getReg()) + .addReg(MI->getOperand(1).getReg()).addMBB(thisMBB) + .addReg(MI->getOperand(2).getReg()).addMBB(copy0MBB); + + MI->eraseFromParent(); // The pseudo instruction is gone now. + return BB; +} + +MachineBasicBlock *MipsTargetLowering::EmitSelT16 + (unsigned Opc1, unsigned Opc2, + MachineInstr *MI, MachineBasicBlock *BB) const { + if (DontExpandCondPseudos16) + return BB; + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + DebugLoc dl = MI->getDebugLoc(); + // To "insert" a SELECT_CC instruction, we actually have to insert the + // diamond control-flow pattern. The incoming instruction knows the + // destination vreg to set, the condition code register to branch on, the + // true/false values to select between, and a branch opcode to use. + const BasicBlock *LLVM_BB = BB->getBasicBlock(); + MachineFunction::iterator It = BB; + ++It; + + // thisMBB: + // ... + // TrueVal = ... 
+ // setcc r1, r2, r3 + // bNE r1, r0, copy1MBB + // fallthrough --> copy0MBB + MachineBasicBlock *thisMBB = BB; + MachineFunction *F = BB->getParent(); + MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); + F->insert(It, copy0MBB); + F->insert(It, sinkMBB); + + // Transfer the remainder of BB and its successor edges to sinkMBB. + sinkMBB->splice(sinkMBB->begin(), BB, + llvm::next(MachineBasicBlock::iterator(MI)), + BB->end()); + sinkMBB->transferSuccessorsAndUpdatePHIs(BB); + + // Next, add the true and fallthrough blocks as its successors. + BB->addSuccessor(copy0MBB); + BB->addSuccessor(sinkMBB); + + BuildMI(BB, dl, TII->get(Opc2)).addReg(MI->getOperand(3).getReg()) + .addReg(MI->getOperand(4).getReg()); + BuildMI(BB, dl, TII->get(Opc1)).addMBB(sinkMBB); + + // copy0MBB: + // %FalseValue = ... + // # fallthrough to sinkMBB + BB = copy0MBB; + + // Update machine-CFG edges + BB->addSuccessor(sinkMBB); + + // sinkMBB: + // %Result = phi [ %TrueValue, thisMBB ], [ %FalseValue, copy0MBB ] + // ... + BB = sinkMBB; + + BuildMI(*BB, BB->begin(), dl, + TII->get(Mips::PHI), MI->getOperand(0).getReg()) + .addReg(MI->getOperand(1).getReg()).addMBB(thisMBB) + .addReg(MI->getOperand(2).getReg()).addMBB(copy0MBB); + + MI->eraseFromParent(); // The pseudo instruction is gone now. + return BB; + +} + + +MachineBasicBlock *MipsTargetLowering::EmitSeliT16 + (unsigned Opc1, unsigned Opc2, + MachineInstr *MI, MachineBasicBlock *BB) const { + if (DontExpandCondPseudos16) + return BB; + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + DebugLoc dl = MI->getDebugLoc(); + // To "insert" a SELECT_CC instruction, we actually have to insert the + // diamond control-flow pattern. The incoming instruction knows the + // destination vreg to set, the condition code register to branch on, the + // true/false values to select between, and a branch opcode to use. + const BasicBlock *LLVM_BB = BB->getBasicBlock(); + MachineFunction::iterator It = BB; + ++It; + + // thisMBB: + // ... + // TrueVal = ... + // setcc r1, r2, r3 + // bNE r1, r0, copy1MBB + // fallthrough --> copy0MBB + MachineBasicBlock *thisMBB = BB; + MachineFunction *F = BB->getParent(); + MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); + F->insert(It, copy0MBB); + F->insert(It, sinkMBB); + + // Transfer the remainder of BB and its successor edges to sinkMBB. + sinkMBB->splice(sinkMBB->begin(), BB, + llvm::next(MachineBasicBlock::iterator(MI)), + BB->end()); + sinkMBB->transferSuccessorsAndUpdatePHIs(BB); + + // Next, add the true and fallthrough blocks as its successors. + BB->addSuccessor(copy0MBB); + BB->addSuccessor(sinkMBB); + + BuildMI(BB, dl, TII->get(Opc2)).addReg(MI->getOperand(3).getReg()) + .addImm(MI->getOperand(4).getImm()); + BuildMI(BB, dl, TII->get(Opc1)).addMBB(sinkMBB); + + // copy0MBB: + // %FalseValue = ... + // # fallthrough to sinkMBB + BB = copy0MBB; + + // Update machine-CFG edges + BB->addSuccessor(sinkMBB); + + // sinkMBB: + // %Result = phi [ %TrueValue, thisMBB ], [ %FalseValue, copy0MBB ] + // ... + BB = sinkMBB; + + BuildMI(*BB, BB->begin(), dl, + TII->get(Mips::PHI), MI->getOperand(0).getReg()) + .addReg(MI->getOperand(1).getReg()).addMBB(thisMBB) + .addReg(MI->getOperand(2).getReg()).addMBB(copy0MBB); + + MI->eraseFromParent(); // The pseudo instruction is gone now. 
+ return BB; + +} + + +MachineBasicBlock + *MipsTargetLowering::EmitFEXT_T8I816_ins(unsigned BtOpc, unsigned CmpOpc, + MachineInstr *MI, + MachineBasicBlock *BB) const { + if (DontExpandCondPseudos16) + return BB; + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + unsigned regX = MI->getOperand(0).getReg(); + unsigned regY = MI->getOperand(1).getReg(); + MachineBasicBlock *target = MI->getOperand(2).getMBB(); + BuildMI(*BB, MI, MI->getDebugLoc(), TII->get(CmpOpc)).addReg(regX).addReg(regY); + BuildMI(*BB, MI, MI->getDebugLoc(), TII->get(BtOpc)).addMBB(target); + MI->eraseFromParent(); // The pseudo instruction is gone now. + return BB; +} + + +MachineBasicBlock *MipsTargetLowering::EmitFEXT_T8I8I16_ins( + unsigned BtOpc, unsigned CmpiOpc, unsigned CmpiXOpc, + MachineInstr *MI, MachineBasicBlock *BB) const { + if (DontExpandCondPseudos16) + return BB; + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + unsigned regX = MI->getOperand(0).getReg(); + int64_t imm = MI->getOperand(1).getImm(); + MachineBasicBlock *target = MI->getOperand(2).getMBB(); + unsigned CmpOpc; + if (isUInt<8>(imm)) + CmpOpc = CmpiOpc; + else if (isUInt<16>(imm)) + CmpOpc = CmpiXOpc; + else + llvm_unreachable("immediate field not usable"); + BuildMI(*BB, MI, MI->getDebugLoc(), TII->get(CmpOpc)).addReg(regX).addImm(imm); + BuildMI(*BB, MI, MI->getDebugLoc(), TII->get(BtOpc)).addMBB(target); + MI->eraseFromParent(); // The pseudo instruction is gone now. + return BB; +} + + +static unsigned Mips16WhichOp8uOr16simm + (unsigned shortOp, unsigned longOp, int64_t Imm) { + if (isUInt<8>(Imm)) + return shortOp; + else if (isInt<16>(Imm)) + return longOp; + else + llvm_unreachable("immediate field not usable"); +} + +MachineBasicBlock *MipsTargetLowering::EmitFEXT_CCRX16_ins( + unsigned SltOpc, + MachineInstr *MI, MachineBasicBlock *BB) const { + if (DontExpandCondPseudos16) + return BB; + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + unsigned CC = MI->getOperand(0).getReg(); + unsigned regX = MI->getOperand(1).getReg(); + unsigned regY = MI->getOperand(2).getReg(); + BuildMI(*BB, MI, MI->getDebugLoc(), + TII->get(SltOpc)).addReg(regX).addReg(regY); + BuildMI(*BB, MI, MI->getDebugLoc(), + TII->get(Mips::MoveR3216), CC).addReg(Mips::T8); + MI->eraseFromParent(); // The pseudo instruction is gone now. + return BB; +} +MachineBasicBlock *MipsTargetLowering::EmitFEXT_CCRXI16_ins( + unsigned SltiOpc, unsigned SltiXOpc, + MachineInstr *MI, MachineBasicBlock *BB )const { + if (DontExpandCondPseudos16) + return BB; + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + unsigned CC = MI->getOperand(0).getReg(); + unsigned regX = MI->getOperand(1).getReg(); + int64_t Imm = MI->getOperand(2).getImm(); + unsigned SltOpc = Mips16WhichOp8uOr16simm(SltiOpc, SltiXOpc, Imm); + BuildMI(*BB, MI, MI->getDebugLoc(), + TII->get(SltOpc)).addReg(regX).addImm(Imm); + BuildMI(*BB, MI, MI->getDebugLoc(), + TII->get(Mips::MoveR3216), CC).addReg(Mips::T8); + MI->eraseFromParent(); // The pseudo instruction is gone now. 
+ return BB; + +} MachineBasicBlock * MipsTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *BB) const { switch (MI->getOpcode()) { - default: llvm_unreachable("Unexpected instr type to insert"); + default: + llvm_unreachable("Unexpected instr type to insert"); case Mips::ATOMIC_LOAD_ADD_I8: case Mips::ATOMIC_LOAD_ADD_I8_P8: return EmitAtomicBinaryPartword(MI, BB, 1, Mips::ADDu); @@ -1317,6 +1625,75 @@ MipsTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, return EmitAtomicCmpSwap(MI, BB, 8); case Mips::BPOSGE32_PSEUDO: return EmitBPOSGE32(MI, BB); + case Mips::SelBeqZ: + return EmitSel16(Mips::BeqzRxImm16, MI, BB); + case Mips::SelBneZ: + return EmitSel16(Mips::BnezRxImm16, MI, BB); + case Mips::SelTBteqZCmpi: + return EmitSeliT16(Mips::BteqzX16, Mips::CmpiRxImmX16, MI, BB); + case Mips::SelTBteqZSlti: + return EmitSeliT16(Mips::BteqzX16, Mips::SltiRxImmX16, MI, BB); + case Mips::SelTBteqZSltiu: + return EmitSeliT16(Mips::BteqzX16, Mips::SltiuRxImmX16, MI, BB); + case Mips::SelTBtneZCmpi: + return EmitSeliT16(Mips::BtnezX16, Mips::CmpiRxImmX16, MI, BB); + case Mips::SelTBtneZSlti: + return EmitSeliT16(Mips::BtnezX16, Mips::SltiRxImmX16, MI, BB); + case Mips::SelTBtneZSltiu: + return EmitSeliT16(Mips::BtnezX16, Mips::SltiuRxImmX16, MI, BB); + case Mips::SelTBteqZCmp: + return EmitSelT16(Mips::BteqzX16, Mips::CmpRxRy16, MI, BB); + case Mips::SelTBteqZSlt: + return EmitSelT16(Mips::BteqzX16, Mips::SltRxRy16, MI, BB); + case Mips::SelTBteqZSltu: + return EmitSelT16(Mips::BteqzX16, Mips::SltuRxRy16, MI, BB); + case Mips::SelTBtneZCmp: + return EmitSelT16(Mips::BtnezX16, Mips::CmpRxRy16, MI, BB); + case Mips::SelTBtneZSlt: + return EmitSelT16(Mips::BtnezX16, Mips::SltRxRy16, MI, BB); + case Mips::SelTBtneZSltu: + return EmitSelT16(Mips::BtnezX16, Mips::SltuRxRy16, MI, BB); + case Mips::BteqzT8CmpX16: + return EmitFEXT_T8I816_ins(Mips::BteqzX16, Mips::CmpRxRy16, MI, BB); + case Mips::BteqzT8SltX16: + return EmitFEXT_T8I816_ins(Mips::BteqzX16, Mips::SltRxRy16, MI, BB); + case Mips::BteqzT8SltuX16: + // TBD: figure out a way to get this or remove the instruction + // altogether. + return EmitFEXT_T8I816_ins(Mips::BteqzX16, Mips::SltuRxRy16, MI, BB); + case Mips::BtnezT8CmpX16: + return EmitFEXT_T8I816_ins(Mips::BtnezX16, Mips::CmpRxRy16, MI, BB); + case Mips::BtnezT8SltX16: + return EmitFEXT_T8I816_ins(Mips::BtnezX16, Mips::SltRxRy16, MI, BB); + case Mips::BtnezT8SltuX16: + // TBD: figure out a way to get this or remove the instruction + // altogether. 
+    return EmitFEXT_T8I816_ins(Mips::BtnezX16, Mips::SltuRxRy16, MI, BB);
+  case Mips::BteqzT8CmpiX16: return EmitFEXT_T8I8I16_ins(
+    Mips::BteqzX16, Mips::CmpiRxImm16, Mips::CmpiRxImmX16, MI, BB);
+  case Mips::BteqzT8SltiX16: return EmitFEXT_T8I8I16_ins(
+    Mips::BteqzX16, Mips::SltiRxImm16, Mips::SltiRxImmX16, MI, BB);
+  case Mips::BteqzT8SltiuX16: return EmitFEXT_T8I8I16_ins(
+    Mips::BteqzX16, Mips::SltiuRxImm16, Mips::SltiuRxImmX16, MI, BB);
+  case Mips::BtnezT8CmpiX16: return EmitFEXT_T8I8I16_ins(
+    Mips::BtnezX16, Mips::CmpiRxImm16, Mips::CmpiRxImmX16, MI, BB);
+  case Mips::BtnezT8SltiX16: return EmitFEXT_T8I8I16_ins(
+    Mips::BtnezX16, Mips::SltiRxImm16, Mips::SltiRxImmX16, MI, BB);
+  case Mips::BtnezT8SltiuX16: return EmitFEXT_T8I8I16_ins(
+    Mips::BtnezX16, Mips::SltiuRxImm16, Mips::SltiuRxImmX16, MI, BB);
+    break;
+  case Mips::SltCCRxRy16:
+    return EmitFEXT_CCRX16_ins(Mips::SltRxRy16, MI, BB);
+    break;
+  case Mips::SltiCCRxImmX16:
+    return EmitFEXT_CCRXI16_ins
+      (Mips::SltiRxImm16, Mips::SltiRxImmX16, MI, BB);
+  case Mips::SltiuCCRxImmX16:
+    return EmitFEXT_CCRXI16_ins
+      (Mips::SltiuRxImm16, Mips::SltiuRxImmX16, MI, BB);
+  case Mips::SltuCCRxRy16:
+    return EmitFEXT_CCRX16_ins
+      (Mips::SltuRxRy16, MI, BB);
  }
}

@@ -2209,6 +2586,34 @@ SDValue MipsTargetLowering::LowerRETURNADDR(SDValue Op,
  return DAG.getCopyFromReg(DAG.getEntryNode(), Op.getDebugLoc(), Reg, VT);
}

+// An EH_RETURN is the result of lowering llvm.eh.return, which in turn is
+// generated from __builtin_eh_return (offset, handler).
+// The effect of this is to adjust the stack pointer by "offset"
+// and then branch to "handler".
+SDValue MipsTargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG)
+  const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
+
+  MipsFI->setCallsEhReturn();
+  SDValue Chain = Op.getOperand(0);
+  SDValue Offset = Op.getOperand(1);
+  SDValue Handler = Op.getOperand(2);
+  DebugLoc DL = Op.getDebugLoc();
+  EVT Ty = IsN64 ? MVT::i64 : MVT::i32;
+
+  // Store stack offset in V1, store jump target in V0. Glue CopyToReg and
+  // EH_RETURN nodes, so that instructions are emitted back-to-back.
+  unsigned OffsetReg = IsN64 ? Mips::V1_64 : Mips::V1;
+  unsigned AddrReg = IsN64 ? Mips::V0_64 : Mips::V0;
+  Chain = DAG.getCopyToReg(Chain, DL, OffsetReg, Offset, SDValue());
+  Chain = DAG.getCopyToReg(Chain, DL, AddrReg, Handler, Chain.getValue(1));
+  return DAG.getNode(MipsISD::EH_RETURN, DL, MVT::Other, Chain,
+                     DAG.getRegister(OffsetReg, Ty),
+                     DAG.getRegister(AddrReg, getPointerTy()),
+                     Chain.getValue(1));
+}
+
// TODO: set SType according to the desired memory barrier behavior.
SDValue
MipsTargetLowering::LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG) const {
@@ -2754,6 +3159,163 @@ MipsTargetLowering::passArgOnStack(SDValue StackPtr, unsigned Offset,
                    /*isVolatile=*/ true, false, 0);
}

+//
+// The Mips16 hard float is a crazy quilt inherited from gcc. I have a much
+// cleaner way to do all of this, but it will have to wait until the
+// traditional gcc mechanism is completed.
+//
+// For PIC, in order for Mips16 code to call Mips32 code which, according to
+// the ABI, has either arguments or returned values placed in floating point
+// registers, we use a set of helper functions. (This includes functions whose
+// return type is complex, which on Mips is returned in a pair of floating
+// point registers.)
+//
+// This is an encoding that we inherited from gcc.
+// In Mips traditional O32, N32 ABI, floating point numbers are passed in
+// floating point argument registers 1,2 only when the first and optionally
+// the second arguments are float (sf) or double (df).
+// For Mips16 we are only concerned with the situations where floating point
+// arguments are being passed in floating point registers by the ABI, because
+// Mips16 mode code cannot execute floating point instructions to load those
+// values and hence helper functions are needed.
+// The possibilities are (), (sf), (sf, sf), (sf, df), (df), (df, sf), (df, df);
+// the helper function suffixes for these are:
+// 0, 1, 5, 9, 2, 6, 10
+// This suffix can then be calculated as follows:
+// for a given argument Arg:
+// Arg1x, Arg2x = 1 : Arg is sf
+//                2 : Arg is df
+//                0 : Arg is neither sf nor df
+// So this stub is the string for number Arg1x + Arg2x*4.
+// However, not all numbers between 0 and 10 are possible; we check anyway and
+// assert if an impossible value occurs.
+//
+
+unsigned int MipsTargetLowering::getMips16HelperFunctionStubNumber
+  (ArgListTy &Args) const {
+  unsigned int resultNum = 0;
+  if (Args.size() >= 1) {
+    Type *t = Args[0].Ty;
+    if (t->isFloatTy()) {
+      resultNum = 1;
+    }
+    else if (t->isDoubleTy()) {
+      resultNum = 2;
+    }
+  }
+  if (resultNum) {
+    if (Args.size() >= 2) {
+      Type *t = Args[1].Ty;
+      if (t->isFloatTy()) {
+        resultNum += 4;
+      }
+      else if (t->isDoubleTy()) {
+        resultNum += 8;
+      }
+    }
+  }
+  return resultNum;
+}
+
+//
+// Prefixes are attached to stub numbers depending on the return type.
+// return type: float          sf_
+//              double         df_
+//              single complex sc_
+//              double complex dc_
+//              others         NO PREFIX
+//
+//
+// The full name of a helper function is __mips16_call_stub +
+// return type dependent prefix + stub number
+//
+//
+// This is something that probably should be in a different source file and
+// perhaps done differently, but my main purpose is to not waste runtime
+// on something that we can enumerate in the source. Another possibility is
+// to have a python script to generate these mapping tables. This will do
+// for now. There are a whole series of helper function mapping arrays, one
+// for each return type class as outlined above. There are 11 possible
+// entries. Ones with 0 are ones which should never be selected.
+//
+// All the arrays are similar except for ones which return neither
+// sf, df, sc, nor dc, in which we only care about ones which have sf or df as
+// a first parameter.
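To make that encoding concrete before the macro tables that follow, here is a minimal, self-contained C++ sketch of the numbering and naming scheme the comment describes. This is an editorial illustration, not part of the patch: FpKind, encodeArg, and mips16HelperName are made-up names, and the complex-return prefixes (sc_, dc_) are omitted for brevity.

#include <cassert>
#include <string>

enum FpKind { FpNone, FpFloat, FpDouble };

// Encode one argument as described above: 1 for float (sf), 2 for double
// (df), 0 for anything else.
static unsigned encodeArg(FpKind K) {
  return K == FpFloat ? 1 : K == FpDouble ? 2 : 0;
}

// Stub number = Arg1x + Arg2x * 4, which yields 0, 1, 5, 9, 2, 6, 10 for the
// argument lists (), (sf), (sf, sf), (sf, df), (df), (df, sf), (df, df).
// The second argument only matters when the first one is sf or df.
static std::string mips16HelperName(FpKind Ret, FpKind Arg1, FpKind Arg2) {
  unsigned StubNum = encodeArg(Arg1);
  if (StubNum)
    StubNum += encodeArg(Arg2) * 4;
  // Only 0, 1, 2, 5, 6, 9 and 10 can actually occur.
  static const bool Valid[11] = {true, true, true,  false, false, true,
                                 true, false, false, true,  true};
  assert(StubNum <= 10 && Valid[StubNum] && "impossible stub number");
  const char *Prefix = Ret == FpFloat ? "sf_" : Ret == FpDouble ? "df_" : "";
  return std::string("__mips16_call_stub_") + Prefix + std::to_string(StubNum);
}

For example, mips16HelperName(FpDouble, FpFloat, FpDouble) yields "__mips16_call_stub_df_9", matching dfMips16Helper[9] in the tables below.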
+// +#define P_ "__mips16_call_stub_" +#define MAX_STUB_NUMBER 10 +#define T1 P "1", P "2", 0, 0, P "5", P "6", 0, 0, P "9", P "10" +#define T P "0" , T1 +#define P P_ +static char const * vMips16Helper[MAX_STUB_NUMBER+1] = + {0, T1 }; +#undef P +#define P P_ "sf_" +static char const * sfMips16Helper[MAX_STUB_NUMBER+1] = + { T }; +#undef P +#define P P_ "df_" +static char const * dfMips16Helper[MAX_STUB_NUMBER+1] = + { T }; +#undef P +#define P P_ "sc_" +static char const * scMips16Helper[MAX_STUB_NUMBER+1] = + { T }; +#undef P +#define P P_ "dc_" +static char const * dcMips16Helper[MAX_STUB_NUMBER+1] = + { T }; +#undef P +#undef P_ + + +const char* MipsTargetLowering:: + getMips16HelperFunction + (Type* RetTy, ArgListTy &Args, bool &needHelper) const { + const unsigned int stubNum = getMips16HelperFunctionStubNumber(Args); +#ifndef NDEBUG + const unsigned int maxStubNum = 10; + assert(stubNum <= maxStubNum); + const bool validStubNum[maxStubNum+1] = + {true, true, true, false, false, true, true, false, false, true, true}; + assert(validStubNum[stubNum]); +#endif + const char *result; + if (RetTy->isFloatTy()) { + result = sfMips16Helper[stubNum]; + } + else if (RetTy ->isDoubleTy()) { + result = dfMips16Helper[stubNum]; + } + else if (RetTy->isStructTy()) { + // check if it's complex + if (RetTy->getNumContainedTypes() == 2) { + if ((RetTy->getContainedType(0)->isFloatTy()) && + (RetTy->getContainedType(1)->isFloatTy())) { + result = scMips16Helper[stubNum]; + } + else if ((RetTy->getContainedType(0)->isDoubleTy()) && + (RetTy->getContainedType(1)->isDoubleTy())) { + result = dcMips16Helper[stubNum]; + } + else { + llvm_unreachable("Uncovered condition"); + } + } + else { + llvm_unreachable("Uncovered condition"); + } + } + else { + if (stubNum == 0) { + needHelper = false; + return ""; + } + result = vMips16Helper[stubNum]; + } + needHelper = true; + return result; +} + /// LowerCall - functions arguments are copied from virtual regs to /// (physical regs)/(stack frame), CALLSEQ_START and CALLSEQ_END are emitted. SDValue @@ -2770,6 +3332,26 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, CallingConv::ID CallConv = CLI.CallConv; bool isVarArg = CLI.IsVarArg; + const char* mips16HelperFunction = 0; + bool needMips16Helper = false; + + if (Subtarget->inMips16Mode() && getTargetMachine().Options.UseSoftFloat && + Mips16HardFloat) { + // + // currently we don't have symbols tagged with the mips16 or mips32 + // qualifier so we will assume that we don't know what kind it is. + // and generate the helper + // + bool lookupHelper = true; + if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { + if (noHelperNeeded.find(S->getSymbol()) != noHelperNeeded.end()) { + lookupHelper = false; + } + } + if (lookupHelper) mips16HelperFunction = + getMips16HelperFunction(CLI.RetTy, CLI.Args, needMips16Helper); + + } MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo *MFI = MF.getFrameInfo(); const TargetFrameLowering *TFL = MF.getTarget().getFrameLowering(); @@ -2779,9 +3361,9 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVector<CCValAssign, 16> ArgLocs; CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), getTargetMachine(), ArgLocs, *DAG.getContext()); - MipsCC MipsCCInfo(CallConv, isVarArg, IsO32, CCInfo); + MipsCC MipsCCInfo(CallConv, IsO32, CCInfo); - MipsCCInfo.analyzeCallOperands(Outs); + MipsCCInfo.analyzeCallOperands(Outs, isVarArg); // Get a count of how many bytes are to be pushed on the stack. 
unsigned NextStackOffset = CCInfo.getNextStackOffset(); @@ -2810,7 +3392,7 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, getPointerTy()); // With EABI is it possible to have 16 args on registers. - SmallVector<std::pair<unsigned, SDValue>, 16> RegsToPass; + std::deque< std::pair<unsigned, SDValue> > RegsToPass; SmallVector<SDValue, 8> MemOpChains; MipsCC::byval_iterator ByValArg = MipsCCInfo.byval_begin(); @@ -2920,31 +3502,31 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, else if (LargeGOT) Callee = getAddrGlobalLargeGOT(Callee, DAG, MipsII::MO_CALL_HI16, MipsII::MO_CALL_LO16); - else if (HasMips64) - Callee = getAddrGlobal(Callee, DAG, MipsII::MO_GOT_DISP); - else // O32 & PIC + else // N64 || PIC Callee = getAddrGlobal(Callee, DAG, MipsII::MO_GOT_CALL); GlobalOrExternal = true; } - SDValue InFlag; - - // T9 register operand. - SDValue T9; + SDValue JumpTarget = Callee; // T9 should contain the address of the callee function if // -reloction-model=pic or it is an indirect call. if (IsPICCall || !GlobalOrExternal) { - // copy to T9 unsigned T9Reg = IsN64 ? Mips::T9_64 : Mips::T9; - Chain = DAG.getCopyToReg(Chain, dl, T9Reg, Callee, SDValue(0, 0)); - InFlag = Chain.getValue(1); + unsigned V0Reg = Mips::V0; + if (needMips16Helper) { + RegsToPass.push_front(std::make_pair(V0Reg, Callee)); + JumpTarget = DAG.getExternalSymbol( + mips16HelperFunction, getPointerTy()); + JumpTarget = getAddrGlobal(JumpTarget, DAG, MipsII::MO_GOT); + } + else { + RegsToPass.push_front(std::make_pair(T9Reg, Callee)); - if (Subtarget->inMips16Mode()) - T9 = DAG.getRegister(T9Reg, getPointerTy()); - else - Callee = DAG.getRegister(T9Reg, getPointerTy()); + if (!Subtarget->inMips16Mode()) + JumpTarget = SDValue(); + } } // Insert node "GP copy globalreg" before call to function. @@ -2962,6 +3544,8 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // chain and flag operands which copy the outgoing args into registers. // The InFlag in necessary since all emitted instructions must be // stuck together. + SDValue InFlag; + for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, RegsToPass[i].second, InFlag); @@ -2973,9 +3557,10 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // // Returns a chain & a flag for retval copy to use. SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); - SmallVector<SDValue, 8> Ops; - Ops.push_back(Chain); - Ops.push_back(Callee); + SmallVector<SDValue, 8> Ops(1, Chain); + + if (JumpTarget.getNode()) + Ops.push_back(JumpTarget); // Add argument registers to the end of the list so that they are // known live into the call. @@ -2983,10 +3568,6 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, Ops.push_back(DAG.getRegister(RegsToPass[i].first, RegsToPass[i].second.getValueType())); - // Add T9 register operand. - if (T9.getNode()) - Ops.push_back(T9); - // Add a register mask operand representing the call-preserved registers. 
const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); const uint32_t *Mask = TRI->getCallPreservedMask(CallConv); @@ -3065,7 +3646,7 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain, SmallVector<CCValAssign, 16> ArgLocs; CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), getTargetMachine(), ArgLocs, *DAG.getContext()); - MipsCC MipsCCInfo(CallConv, isVarArg, IsO32, CCInfo); + MipsCC MipsCCInfo(CallConv, IsO32, CCInfo); MipsCCInfo.analyzeFormalArguments(Ins); MipsFI->setFormalArgInfo(CCInfo.getNextStackOffset(), @@ -3225,15 +3806,8 @@ MipsTargetLowering::LowerReturn(SDValue Chain, // Analize return values. CCInfo.AnalyzeReturn(Outs, RetCC_Mips); - // If this is the first return lowered for this function, add - // the regs to the liveout set for the function. - if (DAG.getMachineFunction().getRegInfo().liveout_empty()) { - for (unsigned i = 0; i != RVLocs.size(); ++i) - if (RVLocs[i].isRegLoc()) - DAG.getMachineFunction().getRegInfo().addLiveOut(RVLocs[i].getLocReg()); - } - SDValue Flag; + SmallVector<SDValue, 4> RetOps(1, Chain); // Copy the result values into the output registers. for (unsigned i = 0; i != RVLocs.size(); ++i) { @@ -3242,9 +3816,9 @@ MipsTargetLowering::LowerReturn(SDValue Chain, Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), OutVals[i], Flag); - // guarantee that all emitted copies are - // stuck together, avoiding something bad + // Guarantee that all emitted copies are stuck together with flags. Flag = Chain.getValue(1); + RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); } // The mips ABIs for returning structs by value requires that we copy @@ -3263,15 +3837,17 @@ MipsTargetLowering::LowerReturn(SDValue Chain, Chain = DAG.getCopyToReg(Chain, dl, V0, Val, Flag); Flag = Chain.getValue(1); - MF.getRegInfo().addLiveOut(V0); + RetOps.push_back(DAG.getRegister(V0, getPointerTy())); } - // Return on Mips is always a "jr $ra" + RetOps[0] = Chain; // Update chain. + + // Add the flag if we have it. if (Flag.getNode()) - return DAG.getNode(MipsISD::Ret, dl, MVT::Other, Chain, Flag); + RetOps.push_back(Flag); - // Return Void - return DAG.getNode(MipsISD::Ret, dl, MVT::Other, Chain); + // Return on Mips is always a "jr $ra" + return DAG.getNode(MipsISD::Ret, dl, MVT::Other, &RetOps[0], RetOps.size()); } //===----------------------------------------------------------------------===// @@ -3552,40 +4128,21 @@ unsigned MipsTargetLowering::getJumpTableEncoding() const { return TargetLowering::getJumpTableEncoding(); } -MipsTargetLowering::MipsCC::MipsCC(CallingConv::ID CallConv, bool IsVarArg, - bool IsO32, CCState &Info) : CCInfo(Info) { - UseRegsForByval = true; - - if (IsO32) { - RegSize = 4; - NumIntArgRegs = array_lengthof(O32IntRegs); - ReservedArgArea = 16; - IntArgRegs = ShadowRegs = O32IntRegs; - FixedFn = VarFn = CC_MipsO32; - } else { - RegSize = 8; - NumIntArgRegs = array_lengthof(Mips64IntRegs); - ReservedArgArea = 0; - IntArgRegs = Mips64IntRegs; - ShadowRegs = Mips64DPRegs; - FixedFn = CC_MipsN; - VarFn = CC_MipsN_VarArg; - } - - if (CallConv == CallingConv::Fast) { - assert(!IsVarArg); - UseRegsForByval = false; - ReservedArgArea = 0; - FixedFn = VarFn = CC_Mips_FastCC; - } - +MipsTargetLowering::MipsCC::MipsCC(CallingConv::ID CC, bool IsO32_, + CCState &Info) + : CCInfo(Info), CallConv(CC), IsO32(IsO32_) { // Pre-allocate reserved argument area. 
- CCInfo.AllocateStack(ReservedArgArea, 1); + CCInfo.AllocateStack(reservedArgArea(), 1); } void MipsTargetLowering::MipsCC:: -analyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Args) { +analyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Args, + bool IsVarArg) { + assert((CallConv != CallingConv::Fast || !IsVarArg) && + "CallingConv::Fast shouldn't be used for vararg functions."); + unsigned NumOpnds = Args.size(); + llvm::CCAssignFn *FixedFn = fixedArgFn(), *VarFn = varArgFn(); for (unsigned I = 0; I != NumOpnds; ++I) { MVT ArgVT = Args[I].VT; @@ -3597,10 +4154,10 @@ analyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Args) { continue; } - if (Args[I].IsFixed) - R = FixedFn(I, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo); - else + if (IsVarArg && !Args[I].IsFixed) R = VarFn(I, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo); + else + R = FixedFn(I, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo); if (R) { #ifndef NDEBUG @@ -3615,6 +4172,7 @@ analyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Args) { void MipsTargetLowering::MipsCC:: analyzeFormalArguments(const SmallVectorImpl<ISD::InputArg> &Args) { unsigned NumArgs = Args.size(); + llvm::CCAssignFn *FixedFn = fixedArgFn(); for (unsigned I = 0; I != NumArgs; ++I) { MVT ArgVT = Args[I].VT; @@ -3644,11 +4202,12 @@ MipsTargetLowering::MipsCC::handleByValArg(unsigned ValNo, MVT ValVT, assert(ArgFlags.getByValSize() && "Byval argument's size shouldn't be 0."); struct ByValArgInfo ByVal; + unsigned RegSize = regSize(); unsigned ByValSize = RoundUpToAlignment(ArgFlags.getByValSize(), RegSize); unsigned Align = std::min(std::max(ArgFlags.getByValAlign(), RegSize), RegSize * 2); - if (UseRegsForByval) + if (useRegsForByval()) allocateRegs(ByVal, ByValSize, Align); // Allocate space on caller's stack. @@ -3659,9 +4218,38 @@ MipsTargetLowering::MipsCC::handleByValArg(unsigned ValNo, MVT ValVT, ByValArgs.push_back(ByVal); } +unsigned MipsTargetLowering::MipsCC::numIntArgRegs() const { + return IsO32 ? array_lengthof(O32IntRegs) : array_lengthof(Mips64IntRegs); +} + +unsigned MipsTargetLowering::MipsCC::reservedArgArea() const { + return (IsO32 && (CallConv != CallingConv::Fast)) ? 16 : 0; +} + +const uint16_t *MipsTargetLowering::MipsCC::intArgRegs() const { + return IsO32 ? O32IntRegs : Mips64IntRegs; +} + +llvm::CCAssignFn *MipsTargetLowering::MipsCC::fixedArgFn() const { + if (CallConv == CallingConv::Fast) + return CC_Mips_FastCC; + + return IsO32 ? CC_MipsO32 : CC_MipsN; +} + +llvm::CCAssignFn *MipsTargetLowering::MipsCC::varArgFn() const { + return IsO32 ? CC_MipsO32 : CC_MipsN_VarArg; +} + +const uint16_t *MipsTargetLowering::MipsCC::shadowRegs() const { + return IsO32 ? O32IntRegs : Mips64DPRegs; +} + void MipsTargetLowering::MipsCC::allocateRegs(ByValArgInfo &ByVal, unsigned ByValSize, unsigned Align) { + unsigned RegSize = regSize(), NumIntArgRegs = numIntArgRegs(); + const uint16_t *IntArgRegs = intArgRegs(), *ShadowRegs = shadowRegs(); assert(!(ByValSize % RegSize) && !(Align % RegSize) && "Byval argument's size and alignment should be a multiple of" "RegSize."); @@ -3726,7 +4314,7 @@ copyByValRegs(SDValue Chain, DebugLoc DL, std::vector<SDValue> &OutChains, // Copy byVal arg to registers and stack. 
void MipsTargetLowering:: passByValArg(SDValue Chain, DebugLoc DL, - SmallVector<std::pair<unsigned, SDValue>, 16> &RegsToPass, + std::deque< std::pair<unsigned, SDValue> > &RegsToPass, SmallVector<SDValue, 8> &MemOpChains, SDValue StackPtr, MachineFrameInfo *MFI, SelectionDAG &DAG, SDValue Arg, const MipsCC &CC, const ByValArgInfo &ByVal, diff --git a/lib/Target/Mips/MipsISelLowering.h b/lib/Target/Mips/MipsISelLowering.h index c4b38c6..f0f3782 100644 --- a/lib/Target/Mips/MipsISelLowering.h +++ b/lib/Target/Mips/MipsISelLowering.h @@ -20,6 +20,8 @@ #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/Target/TargetLowering.h" +#include <deque> +#include <string> namespace llvm { namespace MipsISD { @@ -63,6 +65,8 @@ namespace llvm { // Return Ret, + EH_RETURN, + // MAdd/Sub nodes MAdd, MAddu, @@ -174,8 +178,16 @@ namespace llvm { virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const; private: + void SetMips16LibcallName(RTLIB::Libcall, const char *Name); + void setMips16HardFloatLibCalls(); + unsigned int + getMips16HelperFunctionStubNumber(ArgListTy &Args) const; + + const char *getMips16HelperFunction + (Type* RetTy, ArgListTy &Args, bool &needHelper) const; + /// ByValArgInfo - Byval argument information. struct ByValArgInfo { unsigned FirstIdx; // Index of the first register used. @@ -189,53 +201,57 @@ namespace llvm { /// arguments and inquire about calling convention information. class MipsCC { public: - MipsCC(CallingConv::ID CallConv, bool IsVarArg, bool IsO32, - CCState &Info); + MipsCC(CallingConv::ID CallConv, bool IsO32, CCState &Info); - void analyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Outs); + void analyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Outs, + bool IsVarArg); void analyzeFormalArguments(const SmallVectorImpl<ISD::InputArg> &Ins); - void handleByValArg(unsigned ValNo, MVT ValVT, MVT LocVT, - CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags); - const CCState &getCCInfo() const { return CCInfo; } /// hasByValArg - Returns true if function has byval arguments. bool hasByValArg() const { return !ByValArgs.empty(); } - /// useRegsForByval - Returns true if the calling convention allows the - /// use of registers to pass byval arguments. - bool useRegsForByval() const { return UseRegsForByval; } - /// regSize - Size (in number of bits) of integer registers. - unsigned regSize() const { return RegSize; } + unsigned regSize() const { return IsO32 ? 4 : 8; } /// numIntArgRegs - Number of integer registers available for calls. - unsigned numIntArgRegs() const { return NumIntArgRegs; } + unsigned numIntArgRegs() const; /// reservedArgArea - The size of the area the caller reserves for /// register arguments. This is 16-byte if ABI is O32. - unsigned reservedArgArea() const { return ReservedArgArea; } + unsigned reservedArgArea() const; - /// intArgRegs - Pointer to array of integer registers. - const uint16_t *intArgRegs() const { return IntArgRegs; } + /// Return pointer to array of integer argument registers. 
+ const uint16_t *intArgRegs() const; typedef SmallVector<ByValArgInfo, 2>::const_iterator byval_iterator; byval_iterator byval_begin() const { return ByValArgs.begin(); } byval_iterator byval_end() const { return ByValArgs.end(); } private: + void handleByValArg(unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, + ISD::ArgFlagsTy ArgFlags); + + /// useRegsForByval - Returns true if the calling convention allows the + /// use of registers to pass byval arguments. + bool useRegsForByval() const { return CallConv != CallingConv::Fast; } + + /// Return the function that analyzes fixed argument list functions. + llvm::CCAssignFn *fixedArgFn() const; + + /// Return the function that analyzes variable argument list functions. + llvm::CCAssignFn *varArgFn() const; + + const uint16_t *shadowRegs() const; + void allocateRegs(ByValArgInfo &ByVal, unsigned ByValSize, unsigned Align); CCState &CCInfo; - bool UseRegsForByval; - unsigned RegSize; - unsigned NumIntArgRegs; - unsigned ReservedArgArea; - const uint16_t *IntArgRegs, *ShadowRegs; + CallingConv::ID CallConv; + bool IsO32; SmallVector<ByValArgInfo, 2> ByValArgs; - llvm::CCAssignFn *FixedFn, *VarFn; }; // Subtarget Info @@ -265,6 +281,7 @@ namespace llvm { SDValue LowerFABS(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const; SDValue LowerMEMBARRIER(SDValue Op, SelectionDAG& DAG) const; SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG& DAG) const; SDValue LowerShiftLeftParts(SDValue Op, SelectionDAG& DAG) const; @@ -294,7 +311,7 @@ namespace llvm { /// passByValArg - Pass a byval argument in registers or on stack. void passByValArg(SDValue Chain, DebugLoc DL, - SmallVector<std::pair<unsigned, SDValue>, 16> &RegsToPass, + std::deque< std::pair<unsigned, SDValue> > &RegsToPass, SmallVector<SDValue, 8> &MemOpChains, SDValue StackPtr, MachineFrameInfo *MFI, SelectionDAG &DAG, SDValue Arg, const MipsCC &CC, const ByValArgInfo &ByVal, @@ -387,6 +404,28 @@ namespace llvm { MachineBasicBlock *BB, unsigned Size) const; MachineBasicBlock *EmitAtomicCmpSwapPartword(MachineInstr *MI, MachineBasicBlock *BB, unsigned Size) const; + MachineBasicBlock *EmitSel16(unsigned Opc, MachineInstr *MI, + MachineBasicBlock *BB) const; + MachineBasicBlock *EmitSeliT16(unsigned Opc1, unsigned Opc2, + MachineInstr *MI, + MachineBasicBlock *BB) const; + + MachineBasicBlock *EmitSelT16(unsigned Opc1, unsigned Opc2, + MachineInstr *MI, + MachineBasicBlock *BB) const; + MachineBasicBlock *EmitFEXT_T8I816_ins(unsigned BtOpc, unsigned CmpOpc, + MachineInstr *MI, + MachineBasicBlock *BB) const; + MachineBasicBlock *EmitFEXT_T8I8I16_ins( + unsigned BtOpc, unsigned CmpiOpc, unsigned CmpiXOpc, + MachineInstr *MI, MachineBasicBlock *BB) const; + MachineBasicBlock *EmitFEXT_CCRX16_ins( + unsigned SltOpc, + MachineInstr *MI, MachineBasicBlock *BB) const; + MachineBasicBlock *EmitFEXT_CCRXI16_ins( + unsigned SltiOpc, unsigned SltiXOpc, + MachineInstr *MI, MachineBasicBlock *BB )const; + }; } diff --git a/lib/Target/Mips/MipsInstrFPU.td b/lib/Target/Mips/MipsInstrFPU.td index ab6f8ab..891bdc1 100644 --- a/lib/Target/Mips/MipsInstrFPU.td +++ b/lib/Target/Mips/MipsInstrFPU.td @@ -107,7 +107,8 @@ multiclass ADDS_M<string opstr, InstrItinClass Itin, bit IsComm, class ABSS_FT<string opstr, RegisterClass DstRC, RegisterClass SrcRC, InstrItinClass Itin, SDPatternOperator OpNode= null_frag> : InstSE<(outs 
DstRC:$fd), (ins SrcRC:$fs), !strconcat(opstr, "\t$fd, $fs"), - [(set DstRC:$fd, (OpNode SrcRC:$fs))], Itin, FrmFR>; + [(set DstRC:$fd, (OpNode SrcRC:$fs))], Itin, FrmFR>, + NeverHasSideEffects; multiclass ABSS_M<string opstr, InstrItinClass Itin, SDPatternOperator OpNode= null_frag> { @@ -138,17 +139,27 @@ class MTC1_FT<string opstr, RegisterClass DstRC, RegisterClass SrcRC, InstSE<(outs DstRC:$fs), (ins SrcRC:$rt), !strconcat(opstr, "\t$rt, $fs"), [(set DstRC:$fs, (OpNode SrcRC:$rt))], Itin, FrmFR>; +class MFC1_FT_CCR<string opstr, RegisterClass DstRC, RegisterOperand SrcRC, + InstrItinClass Itin, SDPatternOperator OpNode= null_frag> : + InstSE<(outs DstRC:$rt), (ins SrcRC:$fs), !strconcat(opstr, "\t$rt, $fs"), + [(set DstRC:$rt, (OpNode SrcRC:$fs))], Itin, FrmFR>; + +class MTC1_FT_CCR<string opstr, RegisterOperand DstRC, RegisterClass SrcRC, + InstrItinClass Itin, SDPatternOperator OpNode= null_frag> : + InstSE<(outs DstRC:$fs), (ins SrcRC:$rt), !strconcat(opstr, "\t$rt, $fs"), + [(set DstRC:$fs, (OpNode SrcRC:$rt))], Itin, FrmFR>; + class LW_FT<string opstr, RegisterClass RC, InstrItinClass Itin, Operand MemOpnd, SDPatternOperator OpNode= null_frag> : InstSE<(outs RC:$rt), (ins MemOpnd:$addr), !strconcat(opstr, "\t$rt, $addr"), - [(set RC:$rt, (OpNode addr:$addr))], Itin, FrmFI> { + [(set RC:$rt, (OpNode addrDefault:$addr))], Itin, FrmFI> { let DecoderMethod = "DecodeFMem"; } class SW_FT<string opstr, RegisterClass RC, InstrItinClass Itin, Operand MemOpnd, SDPatternOperator OpNode= null_frag> : InstSE<(outs), (ins RC:$rt, MemOpnd:$addr), !strconcat(opstr, "\t$rt, $addr"), - [(OpNode RC:$rt, addr:$addr)], Itin, FrmFI> { + [(OpNode RC:$rt, addrDefault:$addr)], Itin, FrmFI> { let DecoderMethod = "DecodeFMem"; } @@ -169,13 +180,17 @@ class LWXC1_FT<string opstr, RegisterClass DRC, RegisterClass PRC, InstrItinClass Itin, SDPatternOperator OpNode = null_frag> : InstSE<(outs DRC:$fd), (ins PRC:$base, PRC:$index), !strconcat(opstr, "\t$fd, ${index}(${base})"), - [(set DRC:$fd, (OpNode (add PRC:$base, PRC:$index)))], Itin, FrmFI>; + [(set DRC:$fd, (OpNode (add PRC:$base, PRC:$index)))], Itin, FrmFI> { + let AddedComplexity = 20; +} class SWXC1_FT<string opstr, RegisterClass DRC, RegisterClass PRC, InstrItinClass Itin, SDPatternOperator OpNode = null_frag> : InstSE<(outs), (ins DRC:$fs, PRC:$base, PRC:$index), !strconcat(opstr, "\t$fs, ${index}(${base})"), - [(OpNode DRC:$fs, (add PRC:$base, PRC:$index))], Itin, FrmFI>; + [(OpNode DRC:$fs, (add PRC:$base, PRC:$index))], Itin, FrmFI> { + let AddedComplexity = 20; +} class BC1F_FT<string opstr, InstrItinClass Itin, SDPatternOperator Op = null_frag> : @@ -203,15 +218,13 @@ def ROUND_W_S : ABSS_FT<"round.w.s", FGR32, FGR32, IIFcvt>, ABSS_FM<0xc, 16>; def TRUNC_W_S : ABSS_FT<"trunc.w.s", FGR32, FGR32, IIFcvt>, ABSS_FM<0xd, 16>; def CEIL_W_S : ABSS_FT<"ceil.w.s", FGR32, FGR32, IIFcvt>, ABSS_FM<0xe, 16>; def FLOOR_W_S : ABSS_FT<"floor.w.s", FGR32, FGR32, IIFcvt>, ABSS_FM<0xf, 16>; -def CVT_W_S : ABSS_FT<"cvt.w.s", FGR32, FGR32, IIFcvt>, ABSS_FM<0x24, 16>, - NeverHasSideEffects; +def CVT_W_S : ABSS_FT<"cvt.w.s", FGR32, FGR32, IIFcvt>, ABSS_FM<0x24, 16>; defm ROUND_W : ROUND_M<"round.w.d", IIFcvt>, ABSS_FM<0xc, 17>; defm TRUNC_W : ROUND_M<"trunc.w.d", IIFcvt>, ABSS_FM<0xd, 17>; defm CEIL_W : ROUND_M<"ceil.w.d", IIFcvt>, ABSS_FM<0xe, 17>; defm FLOOR_W : ROUND_M<"floor.w.d", IIFcvt>, ABSS_FM<0xf, 17>; -defm CVT_W : ROUND_M<"cvt.w.d", IIFcvt>, ABSS_FM<0x24, 17>, - NeverHasSideEffects; +defm CVT_W : ROUND_M<"cvt.w.d", IIFcvt>, ABSS_FM<0x24, 17>; let 
Predicates = [IsFP64bit, HasStdEnc], DecoderNamespace = "Mips64" in { def ROUND_L_S : ABSS_FT<"round.l.s", FGR64, FGR32, IIFcvt>, ABSS_FM<0x8, 16>; @@ -228,19 +241,16 @@ let Predicates = [IsFP64bit, HasStdEnc], DecoderNamespace = "Mips64" in { } def CVT_S_W : ABSS_FT<"cvt.s.w", FGR32, FGR32, IIFcvt>, ABSS_FM<0x20, 20>; -def CVT_L_S : ABSS_FT<"cvt.l.s", FGR64, FGR32, IIFcvt>, ABSS_FM<0x25, 16>, - NeverHasSideEffects; -def CVT_L_D64: ABSS_FT<"cvt.l.d", FGR64, FGR64, IIFcvt>, ABSS_FM<0x25, 17>, - NeverHasSideEffects; +def CVT_L_S : ABSS_FT<"cvt.l.s", FGR64, FGR32, IIFcvt>, ABSS_FM<0x25, 16>; +def CVT_L_D64: ABSS_FT<"cvt.l.d", FGR64, FGR64, IIFcvt>, ABSS_FM<0x25, 17>; -let Predicates = [NotFP64bit, HasStdEnc], neverHasSideEffects = 1 in { +let Predicates = [NotFP64bit, HasStdEnc] in { def CVT_S_D32 : ABSS_FT<"cvt.s.d", FGR32, AFGR64, IIFcvt>, ABSS_FM<0x20, 17>; def CVT_D32_W : ABSS_FT<"cvt.d.w", AFGR64, FGR32, IIFcvt>, ABSS_FM<0x21, 20>; def CVT_D32_S : ABSS_FT<"cvt.d.s", AFGR64, FGR32, IIFcvt>, ABSS_FM<0x21, 16>; } -let Predicates = [IsFP64bit, HasStdEnc], DecoderNamespace = "Mips64", - neverHasSideEffects = 1 in { +let Predicates = [IsFP64bit, HasStdEnc], DecoderNamespace = "Mips64" in { def CVT_S_D64 : ABSS_FT<"cvt.s.d", FGR32, FGR64, IIFcvt>, ABSS_FM<0x20, 17>; def CVT_S_L : ABSS_FT<"cvt.s.l", FGR32, FGR64, IIFcvt>, ABSS_FM<0x20, 21>; def CVT_D64_W : ABSS_FT<"cvt.d.w", FGR64, FGR32, IIFcvt>, ABSS_FM<0x21, 20>; @@ -265,8 +275,8 @@ defm FSQRT : ABSS_M<"sqrt.d", IIFsqrtDouble, fsqrt>, ABSS_FM<0x4, 17>; // regardless of register aliasing. /// Move Control Registers From/To CPU Registers -def CFC1 : MFC1_FT<"cfc1", CPURegs, CCR, IIFmove>, MFC1_FM<2>; -def CTC1 : MTC1_FT<"ctc1", CCR, CPURegs, IIFmove>, MFC1_FM<6>; +def CFC1 : MFC1_FT_CCR<"cfc1", CPURegs, CCROpnd, IIFmove>, MFC1_FM<2>; +def CTC1 : MTC1_FT_CCR<"ctc1", CCROpnd, CPURegs, IIFmove>, MFC1_FM<6>; def MFC1 : MFC1_FT<"mfc1", CPURegs, FGR32, IIFmove, bitconvert>, MFC1_FM<0>; def MTC1 : MTC1_FT<"mtc1", FGR32, CPURegs, IIFmove, bitconvert>, MFC1_FM<4>; def DMFC1 : MFC1_FT<"dmfc1", CPU64Regs, FGR64, IIFmove, bitconvert>, MFC1_FM<1>; @@ -437,7 +447,7 @@ def FCMP_D64 : CEQS_FT<"d", FGR64, IIFcmp, MipsFPCmp>, CEQS_FM<17>, //===----------------------------------------------------------------------===// // Floating Point Pseudo-Instructions //===----------------------------------------------------------------------===// -def MOVCCRToCCR : PseudoSE<(outs CCR:$dst), (ins CCR:$src), []>; +def MOVCCRToCCR : PseudoSE<(outs CCR:$dst), (ins CCROpnd:$src), []>; // This pseudo instr gets expanded into 2 mtc1 instrs after register // allocation. @@ -492,3 +502,33 @@ let Predicates = [IsFP64bit, HasStdEnc] in { def : MipsPat<(f32 (fround FGR64:$src)), (CVT_S_D64 FGR64:$src)>; def : MipsPat<(f64 (fextend FGR32:$src)), (CVT_D64_S FGR32:$src)>; } + +// Load/Store patterns. 
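Aside on the AddedComplexity = 40 wrapper that opens the pattern block below: SelectionDAG instruction selection tries candidate patterns in decreasing complexity order and commits to the first one that matches, so these reg+imm floating-point load/store patterns outrank the generic addr-based ones. A standalone C++ sketch of that ordering rule, with illustrative types only (the real scoring lives in TableGen's pattern sorter):

    #include <algorithm>
    #include <iostream>
    #include <string>
    #include <vector>

    struct Pattern { std::string Name; int Complexity; bool Matches; };

    // Try candidates from highest AddedComplexity to lowest and take the
    // first that matches; complexity 40 beats the default of 0.
    std::string selectPattern(std::vector<Pattern> Cands) {
      std::stable_sort(Cands.begin(), Cands.end(),
                       [](const Pattern &A, const Pattern &B) {
                         return A.Complexity > B.Complexity;
                       });
      for (const Pattern &P : Cands)
        if (P.Matches)
          return P.Name;
      return "<no match>";
    }

    int main() {
      std::cout << selectPattern({{"generic addr", 0, true},
                                  {"addrRegImm", 40, true}})
                << "\n"; // prints "addrRegImm"
    }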
+let AddedComplexity = 40 in { + let Predicates = [IsN64, HasStdEnc] in { + def : MipsPat<(f32 (load addrRegImm:$a)), (LWC1_P8 addrRegImm:$a)>; + def : MipsPat<(store FGR32:$v, addrRegImm:$a), + (SWC1_P8 FGR32:$v, addrRegImm:$a)>; + def : MipsPat<(f64 (load addrRegImm:$a)), (LDC164_P8 addrRegImm:$a)>; + def : MipsPat<(store FGR64:$v, addrRegImm:$a), + (SDC164_P8 FGR64:$v, addrRegImm:$a)>; + } + + let Predicates = [NotN64, HasStdEnc] in { + def : MipsPat<(f32 (load addrRegImm:$a)), (LWC1 addrRegImm:$a)>; + def : MipsPat<(store FGR32:$v, addrRegImm:$a), + (SWC1 FGR32:$v, addrRegImm:$a)>; + } + + let Predicates = [NotN64, HasMips64, HasStdEnc] in { + def : MipsPat<(f64 (load addrRegImm:$a)), (LDC164 addrRegImm:$a)>; + def : MipsPat<(store FGR64:$v, addrRegImm:$a), + (SDC164 FGR64:$v, addrRegImm:$a)>; + } + + let Predicates = [NotN64, NotMips64, HasStdEnc] in { + def : MipsPat<(f64 (load addrRegImm:$a)), (LDC1 addrRegImm:$a)>; + def : MipsPat<(store AFGR64:$v, addrRegImm:$a), + (SDC1 AFGR64:$v, addrRegImm:$a)>; + } +} diff --git a/lib/Target/Mips/MipsInstrFormats.td b/lib/Target/Mips/MipsInstrFormats.td index c026b5d..ee432c8 100644 --- a/lib/Target/Mips/MipsInstrFormats.td +++ b/lib/Target/Mips/MipsInstrFormats.td @@ -366,13 +366,8 @@ class LUI_FM { let Inst{15-0} = imm16; } -class NOP_FM { - bits<32> Inst; - - let Inst{31-0} = 0; -} - class JALR_FM { + bits<5> rd; bits<5> rs; bits<32> Inst; @@ -380,7 +375,7 @@ class JALR_FM { let Inst{31-26} = 0; let Inst{25-21} = rs; let Inst{20-16} = 0; - let Inst{15-11} = 31; + let Inst{15-11} = rd; let Inst{10-6} = 0; let Inst{5-0} = 9; } diff --git a/lib/Target/Mips/MipsInstrInfo.td b/lib/Target/Mips/MipsInstrInfo.td index 8f2ce6f..de09c9e 100644 --- a/lib/Target/Mips/MipsInstrInfo.td +++ b/lib/Target/Mips/MipsInstrInfo.td @@ -72,7 +72,8 @@ def MipsTprelLo : SDNode<"MipsISD::TprelLo", SDTIntUnaryOp>; def MipsThreadPointer: SDNode<"MipsISD::ThreadPointer", SDT_MipsThreadPointer>; // Return -def MipsRet : SDNode<"MipsISD::Ret", SDTNone, [SDNPHasChain, SDNPOptInGlue]>; +def MipsRet : SDNode<"MipsISD::Ret", SDTNone, + [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; // These are target-independent nodes, but have target-specific formats. def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_MipsCallSeqStart, @@ -232,6 +233,10 @@ def calltarget64: Operand<i64>; def simm16 : Operand<i32> { let DecoderMethod= "DecodeSimm16"; } + +def simm20 : Operand<i32> { +} + def simm16_64 : Operand<i64>; def shamt : Operand<i32>; @@ -296,6 +301,10 @@ def HI16 : SDNodeXForm<imm, [{ // Node immediate fits as 16-bit sign extended on target immediate. // e.g. addi, andi +def immSExt8 : PatLeaf<(imm), [{ return isInt<8>(N->getSExtValue()); }]>; + +// Node immediate fits as 16-bit sign extended on target immediate. +// e.g. addi, andi def immSExt16 : PatLeaf<(imm), [{ return isInt<16>(N->getSExtValue()); }]>; // Node immediate fits as 15-bit sign extended on target immediate. @@ -325,19 +334,25 @@ def immZExt5 : ImmLeaf<i32, [{return Imm == (Imm & 0x1f);}]>; // Mips Address Mode! SDNode frameindex could possibly be a match // since load and store instructions that access the stack use it.
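The addr, addrRegImm, and addrDefault ComplexPatterns defined next hand address matching to C++ selectors (selectIntAddr, selectAddrRegImm, selectAddrDefault). A minimal standalone sketch of the test a reg+imm selector has to make, assuming the classic MIPS 16-bit signed displacement field; the types are simplified stand-ins, not LLVM's SDValue API:

    #include <cstdint>
    #include <iostream>

    struct Addr { unsigned BaseReg; int16_t Offset; };

    // A reg+imm selector only succeeds when the displacement fits the
    // 16-bit signed immediate of MIPS load/store encodings; otherwise
    // selection falls through to a more general pattern (addrDefault).
    bool selectRegImm(unsigned BaseReg, int64_t Disp, Addr &Out) {
      if (Disp < -32768 || Disp > 32767)
        return false; // too far: base must be materialized with lui/addiu
      Out = {BaseReg, static_cast<int16_t>(Disp)};
      return true;
    }

    int main() {
      Addr A;
      std::cout << selectRegImm(29, 16, A) << "\n";    // 1: fits, $29+16
      std::cout << selectRegImm(29, 70000, A) << "\n"; // 0: out of range
    }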
def addr : - ComplexPattern<iPTR, 2, "SelectAddr", [frameindex], [SDNPWantParent]>; + ComplexPattern<iPTR, 2, "selectIntAddr", [frameindex]>; + +def addrRegImm : + ComplexPattern<iPTR, 2, "selectAddrRegImm", [frameindex]>; + +def addrDefault : + ComplexPattern<iPTR, 2, "selectAddrDefault", [frameindex]>; //===----------------------------------------------------------------------===// // Instructions specific format //===----------------------------------------------------------------------===// // Arithmetic and logical instructions with 3 register operands. -class ArithLogicR<string opstr, RegisterClass RC, bit isComm = 0, +class ArithLogicR<string opstr, RegisterOperand RO, bit isComm = 0, InstrItinClass Itin = NoItinerary, SDPatternOperator OpNode = null_frag>: - InstSE<(outs RC:$rd), (ins RC:$rs, RC:$rt), + InstSE<(outs RO:$rd), (ins RO:$rs, RO:$rt), !strconcat(opstr, "\t$rd, $rs, $rt"), - [(set RC:$rd, (OpNode RC:$rs, RC:$rt))], Itin, FrmR> { + [(set RO:$rd, (OpNode RO:$rs, RO:$rt))], Itin, FrmR> { let isCommutable = isComm; let isReMaterializable = 1; string BaseOpcode; @@ -345,27 +360,27 @@ class ArithLogicR<string opstr, RegisterClass RC, bit isComm = 0, } // Arithmetic and logical instructions with 2 register operands. -class ArithLogicI<string opstr, Operand Od, RegisterClass RC, +class ArithLogicI<string opstr, Operand Od, RegisterOperand RO, SDPatternOperator imm_type = null_frag, SDPatternOperator OpNode = null_frag> : - InstSE<(outs RC:$rt), (ins RC:$rs, Od:$imm16), + InstSE<(outs RO:$rt), (ins RO:$rs, Od:$imm16), !strconcat(opstr, "\t$rt, $rs, $imm16"), - [(set RC:$rt, (OpNode RC:$rs, imm_type:$imm16))], IIAlu, FrmI> { + [(set RO:$rt, (OpNode RO:$rs, imm_type:$imm16))], IIAlu, FrmI> { let isReMaterializable = 1; } // Arithmetic Multiply ADD/SUB class MArithR<string opstr, SDPatternOperator op = null_frag, bit isComm = 0> : - InstSE<(outs), (ins CPURegs:$rs, CPURegs:$rt), + InstSE<(outs), (ins CPURegsOpnd:$rs, CPURegsOpnd:$rt), !strconcat(opstr, "\t$rs, $rt"), - [(op CPURegs:$rs, CPURegs:$rt, LO, HI)], IIImul, FrmR> { + [(op CPURegsOpnd:$rs, CPURegsOpnd:$rt, LO, HI)], IIImul, FrmR> { let Defs = [HI, LO]; let Uses = [HI, LO]; let isCommutable = isComm; } // Logical -class LogicNOR<string opstr, RegisterClass RC>: +class LogicNOR<string opstr, RegisterOperand RC>: InstSE<(outs RC:$rd), (ins RC:$rs, RC:$rt), !strconcat(opstr, "\t$rd, $rs, $rt"), [(set RC:$rd, (not (or RC:$rs, RC:$rt)))], IIAlu, FrmR> { @@ -374,17 +389,17 @@ class LogicNOR<string opstr, RegisterClass RC>: // Shifts class shift_rotate_imm<string opstr, Operand ImmOpnd, - RegisterClass RC, SDPatternOperator OpNode = null_frag, + RegisterOperand RC, SDPatternOperator OpNode = null_frag, SDPatternOperator PF = null_frag> : InstSE<(outs RC:$rd), (ins RC:$rt, ImmOpnd:$shamt), !strconcat(opstr, "\t$rd, $rt, $shamt"), [(set RC:$rd, (OpNode RC:$rt, PF:$shamt))], IIAlu, FrmR>; -class shift_rotate_reg<string opstr, RegisterClass RC, +class shift_rotate_reg<string opstr, RegisterOperand RC, SDPatternOperator OpNode = null_frag>: - InstSE<(outs RC:$rd), (ins CPURegs:$rs, RC:$rt), + InstSE<(outs RC:$rd), (ins CPURegsOpnd:$rs, RC:$rt), !strconcat(opstr, "\t$rd, $rt, $rs"), - [(set RC:$rd, (OpNode RC:$rt, CPURegs:$rs))], IIAlu, FrmR>; + [(set RC:$rd, (OpNode RC:$rt, CPURegsOpnd:$rs))], IIAlu, FrmR>; // Load Upper Immediate class LoadUpper<string opstr, RegisterClass RC, Operand Imm>: @@ -498,15 +513,16 @@ class CBranchZero<string opstr, PatFrag cond_op, RegisterClass RC> : // SetCC class SetCC_R<string opstr, PatFrag cond_op,
RegisterClass RC> : - InstSE<(outs CPURegs:$rd), (ins RC:$rs, RC:$rt), + InstSE<(outs CPURegsOpnd:$rd), (ins RC:$rs, RC:$rt), !strconcat(opstr, "\t$rd, $rs, $rt"), - [(set CPURegs:$rd, (cond_op RC:$rs, RC:$rt))], IIAlu, FrmR>; + [(set CPURegsOpnd:$rd, (cond_op RC:$rs, RC:$rt))], IIAlu, FrmR>; class SetCC_I<string opstr, PatFrag cond_op, Operand Od, PatLeaf imm_type, RegisterClass RC>: - InstSE<(outs CPURegs:$rt), (ins RC:$rs, Od:$imm16), + InstSE<(outs CPURegsOpnd:$rt), (ins RC:$rs, Od:$imm16), !strconcat(opstr, "\t$rt, $rs, $imm16"), - [(set CPURegs:$rt, (cond_op RC:$rs, imm_type:$imm16))], IIAlu, FrmI>; + [(set CPURegsOpnd:$rt, (cond_op RC:$rs, imm_type:$imm16))], + IIAlu, FrmI>; // Jump class JumpFJ<DAGOperand opnd, string opstr, SDPatternOperator operator, @@ -559,12 +575,17 @@ let isCall=1, hasDelaySlot=1, Defs = [RA] in { let DecoderMethod = "DecodeJumpTarget"; } + class JumpLinkRegPseudo<RegisterClass RC, Instruction JALRInst, + Register RetReg>: + PseudoSE<(outs), (ins RC:$rs), [(MipsJmpLink RC:$rs)], IIBranch>, + PseudoInstExpansion<(JALRInst RetReg, RC:$rs)>; + class JumpLinkReg<string opstr, RegisterClass RC>: - InstSE<(outs), (ins RC:$rs), !strconcat(opstr, "\t$rs"), - [(MipsJmpLink RC:$rs)], IIBranch, FrmR>; + InstSE<(outs RC:$rd), (ins RC:$rs), !strconcat(opstr, "\t$rd, $rs"), + [], IIBranch, FrmR>; - class BGEZAL_FT<string opstr, RegisterClass RC> : - InstSE<(outs), (ins RC:$rs, brtarget:$offset), + class BGEZAL_FT<string opstr, RegisterOperand RO> : + InstSE<(outs), (ins RO:$rs, brtarget:$offset), !strconcat(opstr, "\t$rs, $offset"), [], IIBranch, FrmI>; } @@ -585,19 +606,19 @@ class SYNC_FT : NoItinerary, FrmOther>; // Mul, Div -class Mult<string opstr, InstrItinClass itin, RegisterClass RC, +class Mult<string opstr, InstrItinClass itin, RegisterOperand RO, list<Register> DefRegs> : - InstSE<(outs), (ins RC:$rs, RC:$rt), !strconcat(opstr, "\t$rs, $rt"), [], + InstSE<(outs), (ins RO:$rs, RO:$rt), !strconcat(opstr, "\t$rs, $rt"), [], itin, FrmR> { let isCommutable = 1; let Defs = DefRegs; let neverHasSideEffects = 1; } -class Div<SDNode op, string opstr, InstrItinClass itin, RegisterClass RC, +class Div<SDNode op, string opstr, InstrItinClass itin, RegisterOperand RO, list<Register> DefRegs> : - InstSE<(outs), (ins RC:$rs, RC:$rt), - !strconcat(opstr, "\t$$zero, $rs, $rt"), [(op RC:$rs, RC:$rt)], itin, + InstSE<(outs), (ins RO:$rs, RO:$rt), + !strconcat(opstr, "\t$$zero, $rs, $rt"), [(op RO:$rs, RO:$rt)], itin, FrmR> { let Defs = DefRegs; } @@ -623,14 +644,14 @@ class EffectiveAddress<string opstr, RegisterClass RC, Operand Mem> : } // Count Leading Ones/Zeros in Word -class CountLeading0<string opstr, RegisterClass RC>: - InstSE<(outs RC:$rd), (ins RC:$rs), !strconcat(opstr, "\t$rd, $rs"), - [(set RC:$rd, (ctlz RC:$rs))], IIAlu, FrmR>, +class CountLeading0<string opstr, RegisterOperand RO>: + InstSE<(outs RO:$rd), (ins RO:$rs), !strconcat(opstr, "\t$rd, $rs"), + [(set RO:$rd, (ctlz RO:$rs))], IIAlu, FrmR>, Requires<[HasBitCount, HasStdEnc]>; -class CountLeading1<string opstr, RegisterClass RC>: - InstSE<(outs RC:$rd), (ins RC:$rs), !strconcat(opstr, "\t$rd, $rs"), - [(set RC:$rd, (ctlz (not RC:$rs)))], IIAlu, FrmR>, +class CountLeading1<string opstr, RegisterOperand RO>: + InstSE<(outs RO:$rd), (ins RO:$rs), !strconcat(opstr, "\t$rd, $rs"), + [(set RO:$rd, (ctlz (not RO:$rs)))], IIAlu, FrmR>, Requires<[HasBitCount, HasStdEnc]>; @@ -642,31 +663,31 @@ class SignExtInReg<string opstr, ValueType vt, RegisterClass RC> : } // Subword Swap -class SubwordSwap<string opstr, 
RegisterClass RC>: - InstSE<(outs RC:$rd), (ins RC:$rt), !strconcat(opstr, "\t$rd, $rt"), [], +class SubwordSwap<string opstr, RegisterOperand RO>: + InstSE<(outs RO:$rd), (ins RO:$rt), !strconcat(opstr, "\t$rd, $rt"), [], NoItinerary, FrmR> { let Predicates = [HasSwap, HasStdEnc]; let neverHasSideEffects = 1; } // Read Hardware -class ReadHardware<RegisterClass CPURegClass, RegisterClass HWRegClass> : - InstSE<(outs CPURegClass:$rt), (ins HWRegClass:$rd), "rdhwr\t$rt, $rd", [], +class ReadHardware<RegisterClass CPURegClass, RegisterOperand RO> : + InstSE<(outs CPURegClass:$rt), (ins RO:$rd), "rdhwr\t$rt, $rd", [], IIAlu, FrmR>; // Ext and Ins -class ExtBase<string opstr, RegisterClass RC>: - InstSE<(outs RC:$rt), (ins RC:$rs, uimm16:$pos, size_ext:$size), +class ExtBase<string opstr, RegisterOperand RO>: + InstSE<(outs RO:$rt), (ins RO:$rs, uimm16:$pos, size_ext:$size), !strconcat(opstr, " $rt, $rs, $pos, $size"), - [(set RC:$rt, (MipsExt RC:$rs, imm:$pos, imm:$size))], NoItinerary, + [(set RO:$rt, (MipsExt RO:$rs, imm:$pos, imm:$size))], NoItinerary, FrmR> { let Predicates = [HasMips32r2, HasStdEnc]; } -class InsBase<string opstr, RegisterClass RC>: - InstSE<(outs RC:$rt), (ins RC:$rs, uimm16:$pos, size_ins:$size, RC:$src), +class InsBase<string opstr, RegisterOperand RO>: + InstSE<(outs RO:$rt), (ins RO:$rs, uimm16:$pos, size_ins:$size, RO:$src), !strconcat(opstr, " $rt, $rs, $pos, $size"), - [(set RC:$rt, (MipsIns RC:$rs, imm:$pos, imm:$size, RC:$src))], + [(set RO:$rt, (MipsIns RO:$rs, imm:$pos, imm:$size, RO:$src))], NoItinerary, FrmR> { let Predicates = [HasMips32r2, HasStdEnc]; let Constraints = "$src = $rt"; @@ -699,15 +720,15 @@ multiclass AtomicCmpSwap32<PatFrag Op> { } } -class LLBase<string opstr, RegisterClass RC, Operand Mem> : - InstSE<(outs RC:$rt), (ins Mem:$addr), !strconcat(opstr, "\t$rt, $addr"), +class LLBase<string opstr, RegisterOperand RO, Operand Mem> : + InstSE<(outs RO:$rt), (ins Mem:$addr), !strconcat(opstr, "\t$rt, $addr"), [], NoItinerary, FrmI> { let DecoderMethod = "DecodeMem"; let mayLoad = 1; } -class SCBase<string opstr, RegisterClass RC, Operand Mem> : - InstSE<(outs RC:$dst), (ins RC:$rt, Mem:$addr), +class SCBase<string opstr, RegisterOperand RO, Operand Mem> : + InstSE<(outs RO:$dst), (ins RO:$rt, Mem:$addr), !strconcat(opstr, "\t$rt, $addr"), [], NoItinerary, FrmI> { let DecoderMethod = "DecodeMem"; let mayStore = 1; @@ -769,42 +790,48 @@ let usesCustomInserter = 1 in { //===----------------------------------------------------------------------===// /// Arithmetic Instructions (ALU Immediate) -def ADDiu : ArithLogicI<"addiu", simm16, CPURegs, immSExt16, add>, +def ADDiu : ArithLogicI<"addiu", simm16, CPURegsOpnd, immSExt16, add>, ADDI_FM<0x9>, IsAsCheapAsAMove; -def ADDi : ArithLogicI<"addi", simm16, CPURegs>, ADDI_FM<0x8>; +def ADDi : ArithLogicI<"addi", simm16, CPURegsOpnd>, ADDI_FM<0x8>; def SLTi : SetCC_I<"slti", setlt, simm16, immSExt16, CPURegs>, SLTI_FM<0xa>; def SLTiu : SetCC_I<"sltiu", setult, simm16, immSExt16, CPURegs>, SLTI_FM<0xb>; -def ANDi : ArithLogicI<"andi", uimm16, CPURegs, immZExt16, and>, ADDI_FM<0xc>; -def ORi : ArithLogicI<"ori", uimm16, CPURegs, immZExt16, or>, ADDI_FM<0xd>; -def XORi : ArithLogicI<"xori", uimm16, CPURegs, immZExt16, xor>, ADDI_FM<0xe>; +def ANDi : ArithLogicI<"andi", uimm16, CPURegsOpnd, immZExt16, and>, + ADDI_FM<0xc>; +def ORi : ArithLogicI<"ori", uimm16, CPURegsOpnd, immZExt16, or>, + ADDI_FM<0xd>; +def XORi : ArithLogicI<"xori", uimm16, CPURegsOpnd, immZExt16, xor>, + ADDI_FM<0xe>; def LUi : 
LoadUpper<"lui", CPURegs, uimm16>, LUI_FM; /// Arithmetic Instructions (3-Operand, R-Type) -def ADDu : ArithLogicR<"addu", CPURegs, 1, IIAlu, add>, ADD_FM<0, 0x21>; -def SUBu : ArithLogicR<"subu", CPURegs, 0, IIAlu, sub>, ADD_FM<0, 0x23>; -def MUL : ArithLogicR<"mul", CPURegs, 1, IIImul, mul>, ADD_FM<0x1c, 2>; -def ADD : ArithLogicR<"add", CPURegs>, ADD_FM<0, 0x20>; -def SUB : ArithLogicR<"sub", CPURegs>, ADD_FM<0, 0x22>; +def ADDu : ArithLogicR<"addu", CPURegsOpnd, 1, IIAlu, add>, ADD_FM<0, 0x21>; +def SUBu : ArithLogicR<"subu", CPURegsOpnd, 0, IIAlu, sub>, ADD_FM<0, 0x23>; +def MUL : ArithLogicR<"mul", CPURegsOpnd, 1, IIImul, mul>, ADD_FM<0x1c, 2>; +def ADD : ArithLogicR<"add", CPURegsOpnd>, ADD_FM<0, 0x20>; +def SUB : ArithLogicR<"sub", CPURegsOpnd>, ADD_FM<0, 0x22>; def SLT : SetCC_R<"slt", setlt, CPURegs>, ADD_FM<0, 0x2a>; def SLTu : SetCC_R<"sltu", setult, CPURegs>, ADD_FM<0, 0x2b>; -def AND : ArithLogicR<"and", CPURegs, 1, IIAlu, and>, ADD_FM<0, 0x24>; -def OR : ArithLogicR<"or", CPURegs, 1, IIAlu, or>, ADD_FM<0, 0x25>; -def XOR : ArithLogicR<"xor", CPURegs, 1, IIAlu, xor>, ADD_FM<0, 0x26>; -def NOR : LogicNOR<"nor", CPURegs>, ADD_FM<0, 0x27>; +def AND : ArithLogicR<"and", CPURegsOpnd, 1, IIAlu, and>, ADD_FM<0, 0x24>; +def OR : ArithLogicR<"or", CPURegsOpnd, 1, IIAlu, or>, ADD_FM<0, 0x25>; +def XOR : ArithLogicR<"xor", CPURegsOpnd, 1, IIAlu, xor>, ADD_FM<0, 0x26>; +def NOR : LogicNOR<"nor", CPURegsOpnd>, ADD_FM<0, 0x27>; /// Shift Instructions -def SLL : shift_rotate_imm<"sll", shamt, CPURegs, shl, immZExt5>, SRA_FM<0, 0>; -def SRL : shift_rotate_imm<"srl", shamt, CPURegs, srl, immZExt5>, SRA_FM<2, 0>; -def SRA : shift_rotate_imm<"sra", shamt, CPURegs, sra, immZExt5>, SRA_FM<3, 0>; -def SLLV : shift_rotate_reg<"sllv", CPURegs, shl>, SRLV_FM<4, 0>; -def SRLV : shift_rotate_reg<"srlv", CPURegs, srl>, SRLV_FM<6, 0>; -def SRAV : shift_rotate_reg<"srav", CPURegs, sra>, SRLV_FM<7, 0>; +def SLL : shift_rotate_imm<"sll", shamt, CPURegsOpnd, shl, immZExt5>, + SRA_FM<0, 0>; +def SRL : shift_rotate_imm<"srl", shamt, CPURegsOpnd, srl, immZExt5>, + SRA_FM<2, 0>; +def SRA : shift_rotate_imm<"sra", shamt, CPURegsOpnd, sra, immZExt5>, + SRA_FM<3, 0>; +def SLLV : shift_rotate_reg<"sllv", CPURegsOpnd, shl>, SRLV_FM<4, 0>; +def SRLV : shift_rotate_reg<"srlv", CPURegsOpnd, srl>, SRLV_FM<6, 0>; +def SRAV : shift_rotate_reg<"srav", CPURegsOpnd, sra>, SRLV_FM<7, 0>; // Rotate Instructions let Predicates = [HasMips32r2, HasStdEnc] in { - def ROTR : shift_rotate_imm<"rotr", shamt, CPURegs, rotr, immZExt5>, + def ROTR : shift_rotate_imm<"rotr", shamt, CPURegsOpnd, rotr, immZExt5>, SRA_FM<2, 1>; - def ROTRV : shift_rotate_reg<"rotrv", CPURegs, rotr>, SRLV_FM<6, 1>; + def ROTRV : shift_rotate_reg<"rotrv", CPURegsOpnd, rotr>, SRLV_FM<6, 1>; } /// Load and Store Instructions @@ -828,13 +855,13 @@ def SYNC : SYNC_FT, SYNC_FM; /// Load-linked, Store-conditional let Predicates = [NotN64, HasStdEnc] in { - def LL : LLBase<"ll", CPURegs, mem>, LW_FM<0x30>; - def SC : SCBase<"sc", CPURegs, mem>, LW_FM<0x38>; + def LL : LLBase<"ll", CPURegsOpnd, mem>, LW_FM<0x30>; + def SC : SCBase<"sc", CPURegsOpnd, mem>, LW_FM<0x38>; } let Predicates = [IsN64, HasStdEnc], DecoderNamespace = "Mips64" in { - def LL_P8 : LLBase<"ll", CPURegs, mem64>, LW_FM<0x30>; - def SC_P8 : SCBase<"sc", CPURegs, mem64>, LW_FM<0x38>; + def LL_P8 : LLBase<"ll", CPURegsOpnd, mem64>, LW_FM<0x30>; + def SC_P8 : SCBase<"sc", CPURegsOpnd, mem64>, LW_FM<0x38>; } /// Jump and Branch Instructions @@ -853,18 +880,41 @@ def BAL_BR: BAL_FT, BAL_FM; def JAL : 
JumpLink<"jal">, FJ<3>; def JALR : JumpLinkReg<"jalr", CPURegs>, JALR_FM; -def BGEZAL : BGEZAL_FT<"bgezal", CPURegs>, BGEZAL_FM<0x11>; -def BLTZAL : BGEZAL_FT<"bltzal", CPURegs>, BGEZAL_FM<0x10>; +def JALRPseudo : JumpLinkRegPseudo<CPURegs, JALR, RA>; +def BGEZAL : BGEZAL_FT<"bgezal", CPURegsOpnd>, BGEZAL_FM<0x11>; +def BLTZAL : BGEZAL_FT<"bltzal", CPURegsOpnd>, BGEZAL_FM<0x10>; def TAILCALL : JumpFJ<calltarget, "j", MipsTailCall, imm>, FJ<2>, IsTailCall; def TAILCALL_R : JumpFR<CPURegs, MipsTailCall>, MTLO_FM<8>, IsTailCall; def RET : RetBase<CPURegs>, MTLO_FM<8>; +// Exception handling related node and instructions. +// The conversion sequence is: +// ISD::EH_RETURN -> MipsISD::EH_RETURN -> +// MIPSeh_return -> (stack change + indirect branch) +// +// MIPSeh_return takes the place of the regular return instruction +// but takes two arguments (V1, V0) which are used for storing +// the offset and return address, respectively. +def SDT_MipsEHRET : SDTypeProfile<0, 2, [SDTCisInt<0>, SDTCisPtrTy<1>]>; + +def MIPSehret : SDNode<"MipsISD::EH_RETURN", SDT_MipsEHRET, + [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; + +let Uses = [V0, V1], isTerminator = 1, isReturn = 1, isBarrier = 1 in { + def MIPSeh_return32 : MipsPseudo<(outs), (ins CPURegs:$spoff, CPURegs:$dst), + [(MIPSehret CPURegs:$spoff, CPURegs:$dst)]>; + def MIPSeh_return64 : MipsPseudo<(outs), (ins CPU64Regs:$spoff, + CPU64Regs:$dst), + [(MIPSehret CPU64Regs:$spoff, CPU64Regs:$dst)]>; +} + /// Multiply and Divide Instructions. -def MULT : Mult<"mult", IIImul, CPURegs, [HI, LO]>, MULT_FM<0, 0x18>; -def MULTu : Mult<"multu", IIImul, CPURegs, [HI, LO]>, MULT_FM<0, 0x19>; -def SDIV : Div<MipsDivRem, "div", IIIdiv, CPURegs, [HI, LO]>, MULT_FM<0, 0x1a>; -def UDIV : Div<MipsDivRemU, "divu", IIIdiv, CPURegs, [HI, LO]>, +def MULT : Mult<"mult", IIImul, CPURegsOpnd, [HI, LO]>, MULT_FM<0, 0x18>; +def MULTu : Mult<"multu", IIImul, CPURegsOpnd, [HI, LO]>, MULT_FM<0, 0x19>; +def SDIV : Div<MipsDivRem, "div", IIIdiv, CPURegsOpnd, [HI, LO]>, + MULT_FM<0, 0x1a>; +def UDIV : Div<MipsDivRemU, "divu", IIIdiv, CPURegsOpnd, [HI, LO]>, MULT_FM<0, 0x1b>; def MTHI : MoveToLOHI<"mthi", CPURegs, [HI]>, MTLO_FM<0x11>; @@ -877,15 +927,14 @@ def SEB : SignExtInReg<"seb", i8, CPURegs>, SEB_FM<0x10, 0x20>; def SEH : SignExtInReg<"seh", i16, CPURegs>, SEB_FM<0x18, 0x20>; /// Count Leading -def CLZ : CountLeading0<"clz", CPURegs>, CLO_FM<0x20>; -def CLO : CountLeading1<"clo", CPURegs>, CLO_FM<0x21>; +def CLZ : CountLeading0<"clz", CPURegsOpnd>, CLO_FM<0x20>; +def CLO : CountLeading1<"clo", CPURegsOpnd>, CLO_FM<0x21>; /// Word Swap Bytes Within Halfwords -def WSBH : SubwordSwap<"wsbh", CPURegs>, SEB_FM<2, 0x20>; +def WSBH : SubwordSwap<"wsbh", CPURegsOpnd>, SEB_FM<2, 0x20>; /// No operation. -/// FIXME: NOP should be an alias of "sll $0, $0, 0". -def NOP : InstSE<(outs), (ins), "nop", [], IIAlu, FrmJ>, NOP_FM; +def NOP : PseudoSE<(outs), (ins), []>, PseudoInstExpansion<(SLL ZERO, ZERO, 0)>; // FrameIndexes are legalized when they are operands of load/store // instructions.
The same does not happen for stack address copies, so an @@ -899,66 +948,86 @@ def MADDU : MArithR<"maddu", MipsMAddu, 1>, MULT_FM<0x1c, 1>; def MSUB : MArithR<"msub", MipsMSub>, MULT_FM<0x1c, 4>; def MSUBU : MArithR<"msubu", MipsMSubu>, MULT_FM<0x1c, 5>; -def RDHWR : ReadHardware<CPURegs, HWRegs>, RDHWR_FM; +def RDHWR : ReadHardware<CPURegs, HWRegsOpnd>, RDHWR_FM; -def EXT : ExtBase<"ext", CPURegs>, EXT_FM<0>; -def INS : InsBase<"ins", CPURegs>, EXT_FM<4>; +def EXT : ExtBase<"ext", CPURegsOpnd>, EXT_FM<0>; +def INS : InsBase<"ins", CPURegsOpnd>, EXT_FM<4>; /// Move Control Registers From/To CPU Registers -def MFC0_3OP : MFC3OP<(outs CPURegs:$rt), (ins CPURegs:$rd, uimm16:$sel), +def MFC0_3OP : MFC3OP<(outs CPURegsOpnd:$rt), + (ins CPURegsOpnd:$rd, uimm16:$sel), "mfc0\t$rt, $rd, $sel">, MFC3OP_FM<0x10, 0>; -def MTC0_3OP : MFC3OP<(outs CPURegs:$rd, uimm16:$sel), (ins CPURegs:$rt), +def MTC0_3OP : MFC3OP<(outs CPURegsOpnd:$rd, uimm16:$sel), + (ins CPURegsOpnd:$rt), "mtc0\t$rt, $rd, $sel">, MFC3OP_FM<0x10, 4>; -def MFC2_3OP : MFC3OP<(outs CPURegs:$rt), (ins CPURegs:$rd, uimm16:$sel), +def MFC2_3OP : MFC3OP<(outs CPURegsOpnd:$rt), + (ins CPURegsOpnd:$rd, uimm16:$sel), "mfc2\t$rt, $rd, $sel">, MFC3OP_FM<0x12, 0>; -def MTC2_3OP : MFC3OP<(outs CPURegs:$rd, uimm16:$sel), (ins CPURegs:$rt), +def MTC2_3OP : MFC3OP<(outs CPURegsOpnd:$rd, uimm16:$sel), + (ins CPURegsOpnd:$rt), "mtc2\t$rt, $rd, $sel">, MFC3OP_FM<0x12, 4>; //===----------------------------------------------------------------------===// // Instruction aliases //===----------------------------------------------------------------------===// -def : InstAlias<"move $dst,$src", (ADD CPURegs:$dst,CPURegs:$src,ZERO)>; -def : InstAlias<"bal $offset", (BGEZAL RA,brtarget:$offset)>; -def : InstAlias<"addu $rs,$rt,$imm", - (ADDiu CPURegs:$rs,CPURegs:$rt,simm16:$imm)>; -def : InstAlias<"add $rs,$rt,$imm", - (ADDi CPURegs:$rs,CPURegs:$rt,simm16:$imm)>; -def : InstAlias<"and $rs,$rt,$imm", - (ANDi CPURegs:$rs,CPURegs:$rt,simm16:$imm)>; -def : InstAlias<"j $rs", (JR CPURegs:$rs)>; -def : InstAlias<"not $rt,$rs", (NOR CPURegs:$rt,CPURegs:$rs,ZERO)>; -def : InstAlias<"neg $rt,$rs", (SUB CPURegs:$rt,ZERO,CPURegs:$rs)>; -def : InstAlias<"negu $rt,$rs", (SUBu CPURegs:$rt,ZERO,CPURegs:$rs)>; -def : InstAlias<"slt $rs,$rt,$imm", - (SLTi CPURegs:$rs,CPURegs:$rt,simm16:$imm)>; -def : InstAlias<"xor $rs,$rt,$imm", - (XORi CPURegs:$rs,CPURegs:$rt,simm16:$imm)>; -def : InstAlias<"mfc0 $rt, $rd", (MFC0_3OP CPURegs:$rt, CPURegs:$rd, 0)>; -def : InstAlias<"mtc0 $rt, $rd", (MTC0_3OP CPURegs:$rd, 0, CPURegs:$rt)>; -def : InstAlias<"mfc2 $rt, $rd", (MFC2_3OP CPURegs:$rt, CPURegs:$rd, 0)>; -def : InstAlias<"mtc2 $rt, $rd", (MTC2_3OP CPURegs:$rd, 0, CPURegs:$rt)>; +def : InstAlias<"move $dst, $src", + (ADDu CPURegsOpnd:$dst, CPURegsOpnd:$src,ZERO), 1>, + Requires<[NotMips64]>; +def : InstAlias<"move $dst, $src", + (OR CPURegsOpnd:$dst, CPURegsOpnd:$src,ZERO), 0>, + Requires<[NotMips64]>; +def : InstAlias<"bal $offset", (BGEZAL RA, brtarget:$offset), 1>; +def : InstAlias<"addu $rs, $rt, $imm", + (ADDiu CPURegsOpnd:$rs, CPURegsOpnd:$rt, simm16:$imm), 0>; +def : InstAlias<"add $rs, $rt, $imm", + (ADDi CPURegsOpnd:$rs, CPURegsOpnd:$rt, simm16:$imm), 0>; +def : InstAlias<"and $rs, $rt, $imm", + (ANDi CPURegsOpnd:$rs, CPURegsOpnd:$rt, simm16:$imm), 0>; +def : InstAlias<"j $rs", (JR CPURegs:$rs), 0>, + Requires<[NotMips64]>; +def : InstAlias<"jalr $rs", (JALR RA, CPURegs:$rs)>, Requires<[NotMips64]>; +def : InstAlias<"not $rt, $rs", + (NOR CPURegsOpnd:$rt, CPURegsOpnd:$rs, ZERO),
1>; +def : InstAlias<"neg $rt, $rs", + (SUB CPURegsOpnd:$rt, ZERO, CPURegsOpnd:$rs), 1>; +def : InstAlias<"negu $rt, $rs", + (SUBu CPURegsOpnd:$rt, ZERO, CPURegsOpnd:$rs), 1>; +def : InstAlias<"slt $rs, $rt, $imm", + (SLTi CPURegsOpnd:$rs, CPURegs:$rt, simm16:$imm), 0>; +def : InstAlias<"xor $rs, $rt, $imm", + (XORi CPURegsOpnd:$rs, CPURegsOpnd:$rt, simm16:$imm), 0>, + Requires<[NotMips64]>; +def : InstAlias<"nop", (SLL ZERO, ZERO, 0), 1>; +def : InstAlias<"mfc0 $rt, $rd", + (MFC0_3OP CPURegsOpnd:$rt, CPURegsOpnd:$rd, 0), 0>; +def : InstAlias<"mtc0 $rt, $rd", + (MTC0_3OP CPURegsOpnd:$rd, 0, CPURegsOpnd:$rt), 0>; +def : InstAlias<"mfc2 $rt, $rd", + (MFC2_3OP CPURegsOpnd:$rt, CPURegsOpnd:$rd, 0), 0>; +def : InstAlias<"mtc2 $rt, $rd", + (MTC2_3OP CPURegsOpnd:$rd, 0, CPURegsOpnd:$rt), 0>; //===----------------------------------------------------------------------===// // Assembler Pseudo Instructions //===----------------------------------------------------------------------===// -class LoadImm32< string instr_asm, Operand Od, RegisterClass RC> : - MipsAsmPseudoInst<(outs RC:$rt), (ins Od:$imm32), +class LoadImm32< string instr_asm, Operand Od, RegisterOperand RO> : + MipsAsmPseudoInst<(outs RO:$rt), (ins Od:$imm32), !strconcat(instr_asm, "\t$rt, $imm32")> ; -def LoadImm32Reg : LoadImm32<"li", shamt,CPURegs>; +def LoadImm32Reg : LoadImm32<"li", shamt,CPURegsOpnd>; -class LoadAddress<string instr_asm, Operand MemOpnd, RegisterClass RC> : - MipsAsmPseudoInst<(outs RC:$rt), (ins MemOpnd:$addr), +class LoadAddress<string instr_asm, Operand MemOpnd, RegisterOperand RO> : + MipsAsmPseudoInst<(outs RO:$rt), (ins MemOpnd:$addr), !strconcat(instr_asm, "\t$rt, $addr")> ; -def LoadAddr32Reg : LoadAddress<"la", mem, CPURegs>; +def LoadAddr32Reg : LoadAddress<"la", mem, CPURegsOpnd>; -class LoadAddressImm<string instr_asm, Operand Od, RegisterClass RC> : - MipsAsmPseudoInst<(outs RC:$rt), (ins Od:$imm32), +class LoadAddressImm<string instr_asm, Operand Od, RegisterOperand RO> : + MipsAsmPseudoInst<(outs RO:$rt), (ins Od:$imm32), !strconcat(instr_asm, "\t$rt, $imm32")> ; -def LoadAddr32Imm : LoadAddressImm<"la", shamt,CPURegs>; +def LoadAddr32Imm : LoadAddressImm<"la", shamt,CPURegsOpnd>; @@ -1045,7 +1114,7 @@ def : WrapperPat<tglobaltlsaddr, ADDiu, CPURegs>; // Mips does not have "not", so we expand our way def : MipsPat<(not CPURegs:$in), - (NOR CPURegs:$in, ZERO)>; + (NOR CPURegsOpnd:$in, ZERO)>; // extended loads let Predicates = [NotN64, HasStdEnc] in { diff --git a/lib/Target/Mips/MipsLongBranch.cpp b/lib/Target/Mips/MipsLongBranch.cpp index 30f68b1..2efe534 100644 --- a/lib/Target/Mips/MipsLongBranch.cpp +++ b/lib/Target/Mips/MipsLongBranch.cpp @@ -10,10 +10,10 @@ // This pass expands a branch or jump instruction into a long branch if its // offset is too large to fit into its immediate field. // -// FIXME: -// 1. Fix pc-region jump instructions which cross 256MB segment boundaries. +// FIXME: +// 1. Fix pc-region jump instructions which cross 256MB segment boundaries. // 2. If program has inline assembly statements whose size cannot be -// determined accurately, load branch target addresses from the GOT. +// determined accurately, load branch target addresses from the GOT. 
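For context on the pass header above: a MIPS PC-relative branch encodes a 16-bit signed word offset from the instruction after the branch, giving roughly a +/-128KB reach, and anything farther must be rewritten as a long branch. A standalone sketch of that range test, assuming the classic encoding rather than quoting the pass's actual code:

    #include <cstdint>
    #include <iostream>

    // Same idea as llvm::isInt<N>: does V fit in an N-bit signed field?
    template <unsigned N> bool fitsSigned(int64_t V) {
      return V >= -(int64_t(1) << (N - 1)) && V < (int64_t(1) << (N - 1));
    }

    // A taken branch reaches Target = PC + 4 + (imm16 << 2), so the byte
    // offset must be 4-byte aligned and fit in 18 signed bits.
    bool needsLongBranch(int64_t BranchPC, int64_t TargetPC) {
      int64_t ByteOffset = TargetPC - (BranchPC + 4);
      return (ByteOffset & 3) != 0 || !fitsSigned<18>(ByteOffset);
    }

    int main() {
      std::cout << needsLongBranch(0x1000, 0x2000) << "\n";     // 0: in range
      std::cout << needsLongBranch(0x1000, 0x10001000) << "\n"; // 1: expand
    }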
//===----------------------------------------------------------------------===// #define DEBUG_TYPE "mips-long-branch" diff --git a/lib/Target/Mips/MipsMachineFunction.cpp b/lib/Target/Mips/MipsMachineFunction.cpp index 0c71596..59b23f7 100644 --- a/lib/Target/Mips/MipsMachineFunction.cpp +++ b/lib/Target/Mips/MipsMachineFunction.cpp @@ -56,4 +56,20 @@ unsigned MipsFunctionInfo::getMips16SPAliasReg() { return Mips16SPAliasReg = MF.getRegInfo().createVirtualRegister(RC); } +void MipsFunctionInfo::createEhDataRegsFI() { + for (int I = 0; I < 4; ++I) { + const MipsSubtarget &ST = MF.getTarget().getSubtarget<MipsSubtarget>(); + const TargetRegisterClass *RC = ST.isABI_N64() ? + &Mips::CPU64RegsRegClass : &Mips::CPURegsRegClass; + + EhDataRegFI[I] = MF.getFrameInfo()->CreateStackObject(RC->getSize(), + RC->getAlignment(), false); + } +} + +bool MipsFunctionInfo::isEhDataRegFI(int FI) const { + return CallsEhReturn && (FI == EhDataRegFI[0] || FI == EhDataRegFI[1] + || FI == EhDataRegFI[2] || FI == EhDataRegFI[3]); +} + void MipsFunctionInfo::anchor() { } diff --git a/lib/Target/Mips/MipsMachineFunction.h b/lib/Target/Mips/MipsMachineFunction.h index eb6e1cf..b05b348 100644 --- a/lib/Target/Mips/MipsMachineFunction.h +++ b/lib/Target/Mips/MipsMachineFunction.h @@ -53,10 +53,16 @@ class MipsFunctionInfo : public MachineFunctionInfo { /// Size of incoming argument area. unsigned IncomingArgSize; + /// CallsEhReturn - Whether the function calls llvm.eh.return. + bool CallsEhReturn; + + /// Frame objects for spilling eh data registers. + int EhDataRegFI[4]; + public: MipsFunctionInfo(MachineFunction& MF) : MF(MF), SRetReturnReg(0), GlobalBaseReg(0), Mips16SPAliasReg(0), - VarArgsFrameIndex(0) + VarArgsFrameIndex(0), CallsEhReturn(false) {} unsigned getSRetReturnReg() const { return SRetReturnReg; } @@ -78,6 +84,14 @@ public: } unsigned getIncomingArgSize() const { return IncomingArgSize; } + + bool callsEhReturn() const { return CallsEhReturn; } + void setCallsEhReturn() { CallsEhReturn = true; } + + void createEhDataRegsFI(); + int getEhDataRegFI(unsigned Reg) const { return EhDataRegFI[Reg]; } + bool isEhDataRegFI(int FI) const; + }; } // end of namespace llvm diff --git a/lib/Target/Mips/MipsRegisterInfo.cpp b/lib/Target/Mips/MipsRegisterInfo.cpp index 70eb6f3..3250733 100644 --- a/lib/Target/Mips/MipsRegisterInfo.cpp +++ b/lib/Target/Mips/MipsRegisterInfo.cpp @@ -47,6 +47,28 @@ MipsRegisterInfo::MipsRegisterInfo(const MipsSubtarget &ST) unsigned MipsRegisterInfo::getPICCallReg() { return Mips::T9; } + +unsigned +MipsRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, + MachineFunction &MF) const { + switch (RC->getID()) { + default: + return 0; + case Mips::CPURegsRegClassID: + case Mips::CPU64RegsRegClassID: + case Mips::DSPRegsRegClassID: { + const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); + return 28 - TFI->hasFP(MF); + } + case Mips::FGR32RegClassID: + return 32; + case Mips::AFGR64RegClassID: + return 16; + case Mips::FGR64RegClassID: + return 32; + } +} + //===----------------------------------------------------------------------===// // Callee Saved Registers methods //===----------------------------------------------------------------------===// @@ -155,21 +177,14 @@ MipsRegisterInfo::trackLivenessAfterRegAlloc(const MachineFunction &MF) const { // direct reference. 
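The hunk that follows is one instance of an LLVM-wide interface change: eliminateFrameIndex() now receives the frame-index operand's position (FIOperandNum) from the generic frame-lowering code instead of each target rescanning the operand list. A minimal standalone sketch of the before/after shape, with simplified stand-in types rather than MachineInstr:

    #include <cassert>
    #include <cstddef>
    #include <vector>

    struct Operand { bool IsFI; int Value; }; // frame index or register
    using Instr = std::vector<Operand>;

    // Old style: every target repeated this scan before rewriting.
    std::size_t findFIOperand(const Instr &MI) {
      std::size_t I = 0;
      while (!MI[I].IsFI) {
        ++I;
        assert(I < MI.size() && "Instr doesn't have FrameIndex operand!");
      }
      return I;
    }

    // New style: the caller passes FIOperandNum, and the target only
    // rewrites that operand into a frame-register reference.
    void eliminateFrameIndex(Instr &MI, std::size_t FIOperandNum,
                             int FrameReg) {
      assert(MI[FIOperandNum].IsFI && "not a frame index operand");
      MI[FIOperandNum] = {false, FrameReg};
    }

    int main() {
      Instr MI = {{false, 4}, {true, 0}};
      eliminateFrameIndex(MI, findFIOperand(MI), 29); // rewrite FI -> $sp
    }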
void MipsRegisterInfo:: eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, - RegScavenger *RS) const { + unsigned FIOperandNum, RegScavenger *RS) const { MachineInstr &MI = *II; MachineFunction &MF = *MI.getParent()->getParent(); - unsigned i = 0; - while (!MI.getOperand(i).isFI()) { - ++i; - assert(i < MI.getNumOperands() && - "Instr doesn't have FrameIndex operand!"); - } - DEBUG(errs() << "\nFunction : " << MF.getName() << "\n"; errs() << "<--------->\n" << MI); - int FrameIndex = MI.getOperand(i).getIndex(); + int FrameIndex = MI.getOperand(FIOperandNum).getIndex(); uint64_t stackSize = MF.getFrameInfo()->getStackSize(); int64_t spOffset = MF.getFrameInfo()->getObjectOffset(FrameIndex); @@ -177,7 +192,7 @@ eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, << "spOffset : " << spOffset << "\n" << "stackSize : " << stackSize << "\n"); - eliminateFI(MI, i, FrameIndex, stackSize, spOffset); + eliminateFI(MI, FIOperandNum, FrameIndex, stackSize, spOffset); } unsigned MipsRegisterInfo:: diff --git a/lib/Target/Mips/MipsRegisterInfo.h b/lib/Target/Mips/MipsRegisterInfo.h index 78adf7f..13b2a6a 100644 --- a/lib/Target/Mips/MipsRegisterInfo.h +++ b/lib/Target/Mips/MipsRegisterInfo.h @@ -42,6 +42,8 @@ public: void adjustMipsStackFrame(MachineFunction &MF) const; /// Code Generation virtual methods... + unsigned getRegPressureLimit(const TargetRegisterClass *RC, + MachineFunction &MF) const; const uint16_t *getCalleeSavedRegs(const MachineFunction *MF = 0) const; const uint32_t *getCallPreservedMask(CallingConv::ID) const; @@ -53,7 +55,8 @@ public: /// Stack Frame Processing Methods void eliminateFrameIndex(MachineBasicBlock::iterator II, - int SPAdj, RegScavenger *RS = NULL) const; + int SPAdj, unsigned FIOperandNum, + RegScavenger *RS = NULL) const; void processFunctionBeforeFrameFinalized(MachineFunction &MF) const; diff --git a/lib/Target/Mips/MipsRegisterInfo.td b/lib/Target/Mips/MipsRegisterInfo.td index f07a10c..f93dd86 100644 --- a/lib/Target/Mips/MipsRegisterInfo.td +++ b/lib/Target/Mips/MipsRegisterInfo.td @@ -331,3 +331,48 @@ def HWRegs64 : RegisterClass<"Mips", [i64], 32, (add HWR29_64)>; // Accumulator Registers def ACRegs : RegisterClass<"Mips", [i64], 64, (sequence "AC%u", 0, 3)>; + +def CPURegsAsmOperand : AsmOperandClass { + let Name = "CPURegsAsm"; + let ParserMethod = "parseCPURegs"; +} + +def CPU64RegsAsmOperand : AsmOperandClass { + let Name = "CPU64RegsAsm"; + let ParserMethod = "parseCPU64Regs"; +} + +def CCRAsmOperand : AsmOperandClass { + let Name = "CCRAsm"; + let ParserMethod = "parseCCRRegs"; +} + +def CPURegsOpnd : RegisterOperand<CPURegs, "printCPURegs"> { + let ParserMatchClass = CPURegsAsmOperand; +} + +def CPU64RegsOpnd : RegisterOperand<CPU64Regs, "printCPURegs"> { + let ParserMatchClass = CPU64RegsAsmOperand; +} + +def CCROpnd : RegisterOperand<CCR, "printCPURegs"> { + let ParserMatchClass = CCRAsmOperand; +} + +def HWRegsAsmOperand : AsmOperandClass { + let Name = "HWRegsAsm"; + let ParserMethod = "parseHWRegs"; +} + +def HW64RegsAsmOperand : AsmOperandClass { + let Name = "HW64RegsAsm"; + let ParserMethod = "parseHW64Regs"; +} + +def HWRegsOpnd : RegisterOperand<HWRegs, "printCPURegs"> { + let ParserMatchClass = HWRegsAsmOperand; +} + +def HW64RegsOpnd : RegisterOperand<HWRegs64, "printCPURegs"> { + let ParserMatchClass = HW64RegsAsmOperand; +} diff --git a/lib/Target/Mips/MipsSEFrameLowering.cpp b/lib/Target/Mips/MipsSEFrameLowering.cpp index 60b1233..0dd6713 100644 --- a/lib/Target/Mips/MipsSEFrameLowering.cpp +++ 
b/lib/Target/Mips/MipsSEFrameLowering.cpp @@ -29,9 +29,21 @@ using namespace llvm; +unsigned MipsSEFrameLowering::ehDataReg(unsigned I) const { + static const unsigned EhDataReg[] = { + Mips::A0, Mips::A1, Mips::A2, Mips::A3 + }; + static const unsigned EhDataReg64[] = { + Mips::A0_64, Mips::A1_64, Mips::A2_64, Mips::A3_64 + }; + + return STI.isABI_N64() ? EhDataReg64[I] : EhDataReg[I]; +} + void MipsSEFrameLowering::emitPrologue(MachineFunction &MF) const { MachineBasicBlock &MBB = MF.front(); MachineFrameInfo *MFI = MF.getFrameInfo(); + MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>(); const MipsRegisterInfo *RegInfo = static_cast<const MipsRegisterInfo*>(MF.getTarget().getRegisterInfo()); const MipsSEInstrInfo &TII = @@ -105,6 +117,30 @@ void MipsSEFrameLowering::emitPrologue(MachineFunction &MF) const { } } + if (MipsFI->callsEhReturn()) { + const TargetRegisterClass *RC = STI.isABI_N64() ? + &Mips::CPU64RegsRegClass : &Mips::CPURegsRegClass; + + // Insert instructions that spill eh data registers. + for (int I = 0; I < 4; ++I) { + if (!MBB.isLiveIn(ehDataReg(I))) + MBB.addLiveIn(ehDataReg(I)); + TII.storeRegToStackSlot(MBB, MBBI, ehDataReg(I), false, + MipsFI->getEhDataRegFI(I), RC, RegInfo); + } + + // Emit .cfi_offset directives for eh data registers. + MCSymbol *CSLabel2 = MMI.getContext().CreateTempSymbol(); + BuildMI(MBB, MBBI, dl, + TII.get(TargetOpcode::PROLOG_LABEL)).addSym(CSLabel2); + for (int I = 0; I < 4; ++I) { + int64_t Offset = MFI->getObjectOffset(MipsFI->getEhDataRegFI(I)); + DstML = MachineLocation(MachineLocation::VirtualFP, Offset); + SrcML = MachineLocation(ehDataReg(I)); + Moves.push_back(MachineMove(CSLabel2, DstML, SrcML)); + } + } + // if framepointer enabled, set it to point to the stack pointer. if (hasFP(MF)) { // Insert instruction "move $fp, $sp" at this location. @@ -124,6 +160,9 @@ void MipsSEFrameLowering::emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const { MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); MachineFrameInfo *MFI = MF.getFrameInfo(); + MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>(); + const MipsRegisterInfo *RegInfo = + static_cast<const MipsRegisterInfo*>(MF.getTarget().getRegisterInfo()); const MipsSEInstrInfo &TII = *static_cast<const MipsSEInstrInfo*>(MF.getTarget().getInstrInfo()); DebugLoc dl = MBBI->getDebugLoc(); @@ -144,6 +183,22 @@ void MipsSEFrameLowering::emitEpilogue(MachineFunction &MF, BuildMI(MBB, I, dl, TII.get(ADDu), SP).addReg(FP).addReg(ZERO); } + if (MipsFI->callsEhReturn()) { + const TargetRegisterClass *RC = STI.isABI_N64() ? + &Mips::CPU64RegsRegClass : &Mips::CPURegsRegClass; + + // Find first instruction that restores a callee-saved register. + MachineBasicBlock::iterator I = MBBI; + for (unsigned i = 0; i < MFI->getCalleeSavedInfo().size(); ++i) + --I; + + // Insert instructions that restore eh data registers. 
+ for (int J = 0; J < 4; ++J) { + TII.loadRegFromStackSlot(MBB, I, ehDataReg(J), MipsFI->getEhDataRegFI(J), + RC, RegInfo); + } + } + // Get the number of bytes from FrameInfo uint64_t StackSize = MFI->getStackSize(); @@ -194,16 +249,41 @@ MipsSEFrameLowering::hasReservedCallFrame(const MachineFunction &MF) const { return isInt<16>(MFI->getMaxCallFrameSize()) && !MFI->hasVarSizedObjects(); } +// Eliminate ADJCALLSTACKDOWN, ADJCALLSTACKUP pseudo instructions +void MipsSEFrameLowering:: +eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const { + const MipsSEInstrInfo &TII = + *static_cast<const MipsSEInstrInfo*>(MF.getTarget().getInstrInfo()); + + if (!hasReservedCallFrame(MF)) { + int64_t Amount = I->getOperand(0).getImm(); + + if (I->getOpcode() == Mips::ADJCALLSTACKDOWN) + Amount = -Amount; + + unsigned SP = STI.isABI_N64() ? Mips::SP_64 : Mips::SP; + TII.adjustStackPtr(SP, Amount, MBB, I); + } + + MBB.erase(I); +} + void MipsSEFrameLowering:: processFunctionBeforeCalleeSavedScan(MachineFunction &MF, RegScavenger *RS) const { MachineRegisterInfo &MRI = MF.getRegInfo(); + MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>(); unsigned FP = STI.isABI_N64() ? Mips::FP_64 : Mips::FP; // Mark $fp as used if function has dedicated frame pointer. if (hasFP(MF)) MRI.setPhysRegUsed(FP); + // Create spill slots for eh data registers if function calls eh_return. + if (MipsFI->callsEhReturn()) + MipsFI->createEhDataRegsFI(); + // Set scavenging frame index if necessary. uint64_t MaxSPOffset = MF.getInfo<MipsFunctionInfo>()->getIncomingArgSize() + estimateStackSize(MF); diff --git a/lib/Target/Mips/MipsSEFrameLowering.h b/lib/Target/Mips/MipsSEFrameLowering.h index 6481a0a..7becd25 100644 --- a/lib/Target/Mips/MipsSEFrameLowering.h +++ b/lib/Target/Mips/MipsSEFrameLowering.h @@ -28,6 +28,10 @@ public: void emitPrologue(MachineFunction &MF) const; void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const; + void eliminateCallFramePseudoInstr(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const; + bool spillCalleeSavedRegisters(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const std::vector<CalleeSavedInfo> &CSI, @@ -37,6 +41,7 @@ public: void processFunctionBeforeCalleeSavedScan(MachineFunction &MF, RegScavenger *RS) const; + unsigned ehDataReg(unsigned I) const; }; } // End llvm namespace diff --git a/lib/Target/Mips/MipsSEInstrInfo.cpp b/lib/Target/Mips/MipsSEInstrInfo.cpp index cd8f9f4..a9809ef 100644 --- a/lib/Target/Mips/MipsSEInstrInfo.cpp +++ b/lib/Target/Mips/MipsSEInstrInfo.cpp @@ -220,6 +220,10 @@ bool MipsSEInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { case Mips::ExtractElementF64: ExpandExtractElementF64(MBB, MI); break; + case Mips::MIPSeh_return32: + case Mips::MIPSeh_return64: + ExpandEhReturn(MBB, MI); + break; } MBB.erase(MI); @@ -356,6 +360,31 @@ void MipsSEInstrInfo::ExpandBuildPairF64(MachineBasicBlock &MBB, .addReg(HiReg); } +void MipsSEInstrInfo::ExpandEhReturn(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const { + // This pseudo instruction is generated as part of the lowering of + // ISD::EH_RETURN. We convert it to a stack increment by OffsetReg and an + // indirect jump to TargetReg. + const MipsSubtarget &STI = TM.getSubtarget<MipsSubtarget>(); + unsigned ADDU = STI.isABI_N64() ? Mips::DADDu : Mips::ADDu; + unsigned OR = STI.isABI_N64() ? Mips::OR64 : Mips::OR; + unsigned JR = STI.isABI_N64() ?
Mips::JR64 : Mips::JR; + unsigned SP = STI.isABI_N64() ? Mips::SP_64 : Mips::SP; + unsigned RA = STI.isABI_N64() ? Mips::RA_64 : Mips::RA; + unsigned ZERO = STI.isABI_N64() ? Mips::ZERO_64 : Mips::ZERO; + unsigned OffsetReg = I->getOperand(0).getReg(); + unsigned TargetReg = I->getOperand(1).getReg(); + + // or $ra, $v0, $zero + // addu $sp, $sp, $v1 + // jr $ra + BuildMI(MBB, I, I->getDebugLoc(), TM.getInstrInfo()->get(OR), RA) + .addReg(TargetReg).addReg(ZERO); + BuildMI(MBB, I, I->getDebugLoc(), TM.getInstrInfo()->get(ADDU), SP) + .addReg(SP).addReg(OffsetReg); + BuildMI(MBB, I, I->getDebugLoc(), TM.getInstrInfo()->get(JR)).addReg(RA); +} + const MipsInstrInfo *llvm::createMipsSEInstrInfo(MipsTargetMachine &TM) { return new MipsSEInstrInfo(TM); } diff --git a/lib/Target/Mips/MipsSEInstrInfo.h b/lib/Target/Mips/MipsSEInstrInfo.h index 55b78b2..3e22b33 100644 --- a/lib/Target/Mips/MipsSEInstrInfo.h +++ b/lib/Target/Mips/MipsSEInstrInfo.h @@ -85,6 +85,8 @@ private: MachineBasicBlock::iterator I) const; void ExpandBuildPairF64(MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const; + void ExpandEhReturn(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const; }; } diff --git a/lib/Target/Mips/MipsSERegisterInfo.cpp b/lib/Target/Mips/MipsSERegisterInfo.cpp index abeab7b..a39b393 100644 --- a/lib/Target/Mips/MipsSERegisterInfo.cpp +++ b/lib/Target/Mips/MipsSERegisterInfo.cpp @@ -54,28 +54,6 @@ requiresFrameIndexScavenging(const MachineFunction &MF) const { return true; } -// This function eliminate ADJCALLSTACKDOWN, -// ADJCALLSTACKUP pseudo instructions -void MipsSERegisterInfo:: -eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, - MachineBasicBlock::iterator I) const { - const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); - - if (!TFI->hasReservedCallFrame(MF)) { - int64_t Amount = I->getOperand(0).getImm(); - - if (I->getOpcode() == Mips::ADJCALLSTACKDOWN) - Amount = -Amount; - - const MipsSEInstrInfo *II = static_cast<const MipsSEInstrInfo*>(&TII); - unsigned SP = Subtarget.isABI_N64() ? Mips::SP_64 : Mips::SP; - - II->adjustStackPtr(SP, Amount, MBB, I); - } - - MBB.erase(I); -} - void MipsSERegisterInfo::eliminateFI(MachineBasicBlock::iterator II, unsigned OpNo, int FrameIndex, uint64_t StackSize, @@ -83,6 +61,7 @@ void MipsSERegisterInfo::eliminateFI(MachineBasicBlock::iterator II, MachineInstr &MI = *II; MachineFunction &MF = *MI.getParent()->getParent(); MachineFrameInfo *MFI = MF.getFrameInfo(); + MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>(); const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo(); int MinCSFI = 0; @@ -93,15 +72,18 @@ void MipsSERegisterInfo::eliminateFI(MachineBasicBlock::iterator II, MaxCSFI = CSI[CSI.size() - 1].getFrameIdx(); } + bool EhDataRegFI = MipsFI->isEhDataRegFI(FrameIndex); + // The following stack frame objects are always referenced relative to $sp: // 1. Outgoing arguments. // 2. Pointer to dynamically allocated stack space. // 3. Locations for callee-saved registers. + // 4. Locations for eh data registers. // Everything else is referenced relative to whatever register // getFrameRegister() returns. unsigned FrameReg; - if (FrameIndex >= MinCSFI && FrameIndex <= MaxCSFI) + if ((FrameIndex >= MinCSFI && FrameIndex <= MaxCSFI) || EhDataRegFI) FrameReg = Subtarget.isABI_N64() ? 
Mips::SP_64 : Mips::SP; else FrameReg = getFrameRegister(MF); diff --git a/lib/Target/Mips/MipsSERegisterInfo.h b/lib/Target/Mips/MipsSERegisterInfo.h index 7437bd3..f6827e9 100644 --- a/lib/Target/Mips/MipsSERegisterInfo.h +++ b/lib/Target/Mips/MipsSERegisterInfo.h @@ -31,10 +31,6 @@ public: bool requiresFrameIndexScavenging(const MachineFunction &MF) const; - void eliminateCallFramePseudoInstr(MachineFunction &MF, - MachineBasicBlock &MBB, - MachineBasicBlock::iterator I) const; - private: virtual void eliminateFI(MachineBasicBlock::iterator II, unsigned OpNo, int FrameIndex, uint64_t StackSize, diff --git a/lib/Target/Mips/MipsSubtarget.cpp b/lib/Target/Mips/MipsSubtarget.cpp index 30d377a..75b4c98 100644 --- a/lib/Target/Mips/MipsSubtarget.cpp +++ b/lib/Target/Mips/MipsSubtarget.cpp @@ -26,13 +26,14 @@ void MipsSubtarget::anchor() { } MipsSubtarget::MipsSubtarget(const std::string &TT, const std::string &CPU, const std::string &FS, bool little, - Reloc::Model RM) : + Reloc::Model _RM) : MipsGenSubtargetInfo(TT, CPU, FS), MipsArchVersion(Mips32), MipsABI(UnknownABI), IsLittle(little), IsSingleFloat(false), IsFP64bit(false), IsGP64bit(false), HasVFPU(false), IsLinux(true), HasSEInReg(false), HasCondMov(false), HasSwap(false), HasBitCount(false), HasFPIdx(false), - InMips16Mode(false), HasDSP(false), HasDSPR2(false), IsAndroid(false) + InMips16Mode(false), InMicroMipsMode(false), HasDSP(false), HasDSPR2(false), + IsAndroid(false), RM(_RM) { std::string CPUName = CPU; if (CPUName.empty()) diff --git a/lib/Target/Mips/MipsSubtarget.h b/lib/Target/Mips/MipsSubtarget.h index 6a20815..32baa3d 100644 --- a/lib/Target/Mips/MipsSubtarget.h +++ b/lib/Target/Mips/MipsSubtarget.h @@ -14,6 +14,7 @@ #ifndef MIPSSUBTARGET_H #define MIPSSUBTARGET_H +#include "MCTargetDesc/MipsReginfo.h" #include "llvm/MC/MCInstrItineraries.h" #include "llvm/Target/TargetSubtargetInfo.h" #include <string> @@ -88,6 +89,9 @@ protected: // InMips16 -- can process Mips16 instructions bool InMips16Mode; + // InMicroMips -- can process MicroMips instructions + bool InMicroMipsMode; + // HasDSP, HasDSPR2 -- supports DSP ASE. 
bool HasDSP, HasDSPR2; @@ -96,6 +100,12 @@ protected: InstrItineraryData InstrItins; + // The instance of the register info section object + MipsReginfo MRI; + + // Relocation Model + Reloc::Model RM; + public: virtual bool enablePostRAScheduler(CodeGenOpt::Level OptLevel, AntiDepBreakMode& Mode, @@ -131,6 +141,7 @@ public: bool isNotSingleFloat() const { return !IsSingleFloat; } bool hasVFPU() const { return HasVFPU; } bool inMips16Mode() const { return InMips16Mode; } + bool inMicroMipsMode() const { return InMicroMipsMode; } bool hasDSP() const { return HasDSP; } bool hasDSPR2() const { return HasDSPR2; } bool isAndroid() const { return IsAndroid; } @@ -145,6 +156,12 @@ public: bool hasSwap() const { return HasSwap; } bool hasBitCount() const { return HasBitCount; } bool hasFPIdx() const { return HasFPIdx; } + + // Grab MipsRegInfo object + const MipsReginfo &getMReginfo() const { return MRI; } + + // Grab relocation model + Reloc::Model getRelocationModel() const {return RM;} }; } // End llvm namespace diff --git a/lib/Target/Mips/MipsTargetObjectFile.cpp b/lib/Target/Mips/MipsTargetObjectFile.cpp index 9aea764..4c748c5 100644 --- a/lib/Target/Mips/MipsTargetObjectFile.cpp +++ b/lib/Target/Mips/MipsTargetObjectFile.cpp @@ -38,6 +38,20 @@ void MipsTargetObjectFile::Initialize(MCContext &Ctx, const TargetMachine &TM){ ELF::SHF_WRITE |ELF::SHF_ALLOC, SectionKind::getBSS()); + // Register info section + const MipsSubtarget &Subtarget = TM.getSubtarget<MipsSubtarget>(); + if (Subtarget.isABI_N64() || Subtarget.isABI_N32()) + ReginfoSection = + getContext().getELFSection(".MIPS.options", + ELF::SHT_MIPS_OPTIONS, + ELF::SHF_ALLOC |ELF::SHF_MIPS_NOSTRIP, + SectionKind::getMetadata()); + else + ReginfoSection = + getContext().getELFSection(".reginfo", + ELF::SHT_MIPS_REGINFO, + ELF::SHF_ALLOC, + SectionKind::getMetadata()); } // An address must be loaded from a small section if its size is less than the diff --git a/lib/Target/Mips/MipsTargetObjectFile.h b/lib/Target/Mips/MipsTargetObjectFile.h index c394a9d..c0e9140 100644 --- a/lib/Target/Mips/MipsTargetObjectFile.h +++ b/lib/Target/Mips/MipsTargetObjectFile.h @@ -17,6 +17,7 @@ namespace llvm { class MipsTargetObjectFile : public TargetLoweringObjectFileELF { const MCSection *SmallDataSection; const MCSection *SmallBSSSection; + const MCSection *ReginfoSection; public: void Initialize(MCContext &Ctx, const TargetMachine &TM); @@ -35,6 +36,7 @@ namespace llvm { const TargetMachine &TM) const; // TODO: Classify globals as mips wishes.
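Restating the section choice made in MipsTargetObjectFile::Initialize() earlier in this patch: the N32 and N64 ABIs carry register-usage metadata in .MIPS.options (SHT_MIPS_OPTIONS), while O32 keeps the legacy .reginfo section (SHT_MIPS_REGINFO). A standalone sketch; the helper name is invented for illustration:

    #include <iostream>
    #include <string>

    // Illustrative helper, not part of the patch: pick the ELF section
    // that carries MIPS register-usage metadata for a given ABI.
    std::string reginfoSectionName(bool IsN64, bool IsN32) {
      return (IsN64 || IsN32) ? ".MIPS.options" // SHT_MIPS_OPTIONS
                              : ".reginfo";     // SHT_MIPS_REGINFO
    }

    int main() {
      std::cout << reginfoSectionName(true, false) << "\n";  // .MIPS.options
      std::cout << reginfoSectionName(false, false) << "\n"; // .reginfo
    }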
+ const MCSection *getReginfoSection() const { return ReginfoSection; } }; } // end namespace llvm diff --git a/lib/Target/NVPTX/CMakeLists.txt b/lib/Target/NVPTX/CMakeLists.txt index 7cb16b4..47baef6 100644 --- a/lib/Target/NVPTX/CMakeLists.txt +++ b/lib/Target/NVPTX/CMakeLists.txt @@ -22,7 +22,6 @@ set(NVPTXCodeGen_sources NVPTXAllocaHoisting.cpp NVPTXAsmPrinter.cpp NVPTXUtilities.cpp - VectorElementize.cpp ) add_llvm_target(NVPTXCodeGen ${NVPTXCodeGen_sources}) diff --git a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp index 1d41665..6191819 100644 --- a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp +++ b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp @@ -30,8 +30,9 @@ void NVPTXMCAsmInfo::anchor() { } NVPTXMCAsmInfo::NVPTXMCAsmInfo(const Target &T, const StringRef &TT) { Triple TheTriple(TT); - if (TheTriple.getArch() == Triple::nvptx64) - PointerSize = 8; + if (TheTriple.getArch() == Triple::nvptx64) { + PointerSize = CalleeSaveStackSlotSize = 8; + } CommentString = "//"; diff --git a/lib/Target/NVPTX/NVPTX.h b/lib/Target/NVPTX/NVPTX.h index 097b50a..b46ea88 100644 --- a/lib/Target/NVPTX/NVPTX.h +++ b/lib/Target/NVPTX/NVPTX.h @@ -53,7 +53,6 @@ inline static const char *NVPTXCondCodeToString(NVPTXCC::CondCodes CC) { FunctionPass *createNVPTXISelDag(NVPTXTargetMachine &TM, llvm::CodeGenOpt::Level OptLevel); -FunctionPass *createVectorElementizePass(NVPTXTargetMachine &); FunctionPass *createLowerStructArgsPass(NVPTXTargetMachine &); FunctionPass *createNVPTXReMatPass(NVPTXTargetMachine &); FunctionPass *createNVPTXReMatBlockPass(NVPTXTargetMachine &); diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp index 22da8f3..0115e1f 100644 --- a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp +++ b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp @@ -503,21 +503,7 @@ NVPTXAsmPrinter::getVirtualRegisterName(unsigned vr, bool isVec, O << getNVPTXRegClassStr(RC) << mapped_vr; return; } - // Vector virtual register - if (getNVPTXVectorSize(RC) == 4) - O << "{" - << getNVPTXRegClassStr(RC) << mapped_vr << "_0, " - << getNVPTXRegClassStr(RC) << mapped_vr << "_1, " - << getNVPTXRegClassStr(RC) << mapped_vr << "_2, " - << getNVPTXRegClassStr(RC) << mapped_vr << "_3" - << "}"; - else if (getNVPTXVectorSize(RC) == 2) - O << "{" - << getNVPTXRegClassStr(RC) << mapped_vr << "_0, " - << getNVPTXRegClassStr(RC) << mapped_vr << "_1" - << "}"; - else - llvm_unreachable("Unsupported vector size"); + report_fatal_error("Bad register!"); } void @@ -1314,7 +1300,8 @@ void NVPTXAsmPrinter::emitPTXAddressSpace(unsigned int AddressSpace, O << "shared" ; break; default: - llvm_unreachable("unexpected address space"); + report_fatal_error("Bad address space found while emitting PTX"); + break; } } @@ -2023,29 +2010,9 @@ bool NVPTXAsmPrinter::ignoreLoc(const MachineInstr &MI) case NVPTX::StoreParamI64: case NVPTX::StoreParamI8: case NVPTX::StoreParamS32I8: case NVPTX::StoreParamU32I8: case NVPTX::StoreParamS32I16: case NVPTX::StoreParamU32I16: - case NVPTX::StoreParamScalar2F32: case NVPTX::StoreParamScalar2F64: - case NVPTX::StoreParamScalar2I16: case NVPTX::StoreParamScalar2I32: - case NVPTX::StoreParamScalar2I64: case NVPTX::StoreParamScalar2I8: - case NVPTX::StoreParamScalar4F32: case NVPTX::StoreParamScalar4I16: - case NVPTX::StoreParamScalar4I32: case NVPTX::StoreParamScalar4I8: - case NVPTX::StoreParamV2F32: case NVPTX::StoreParamV2F64: - case NVPTX::StoreParamV2I16: case NVPTX::StoreParamV2I32: - case NVPTX::StoreParamV2I64: case 
NVPTX::StoreParamV2I8: - case NVPTX::StoreParamV4F32: case NVPTX::StoreParamV4I16: - case NVPTX::StoreParamV4I32: case NVPTX::StoreParamV4I8: case NVPTX::StoreRetvalF32: case NVPTX::StoreRetvalF64: case NVPTX::StoreRetvalI16: case NVPTX::StoreRetvalI32: case NVPTX::StoreRetvalI64: case NVPTX::StoreRetvalI8: - case NVPTX::StoreRetvalScalar2F32: case NVPTX::StoreRetvalScalar2F64: - case NVPTX::StoreRetvalScalar2I16: case NVPTX::StoreRetvalScalar2I32: - case NVPTX::StoreRetvalScalar2I64: case NVPTX::StoreRetvalScalar2I8: - case NVPTX::StoreRetvalScalar4F32: case NVPTX::StoreRetvalScalar4I16: - case NVPTX::StoreRetvalScalar4I32: case NVPTX::StoreRetvalScalar4I8: - case NVPTX::StoreRetvalV2F32: case NVPTX::StoreRetvalV2F64: - case NVPTX::StoreRetvalV2I16: case NVPTX::StoreRetvalV2I32: - case NVPTX::StoreRetvalV2I64: case NVPTX::StoreRetvalV2I8: - case NVPTX::StoreRetvalV4F32: case NVPTX::StoreRetvalV4I16: - case NVPTX::StoreRetvalV4I32: case NVPTX::StoreRetvalV4I8: case NVPTX::LastCallArgF32: case NVPTX::LastCallArgF64: case NVPTX::LastCallArgI16: case NVPTX::LastCallArgI32: case NVPTX::LastCallArgI32imm: case NVPTX::LastCallArgI64: @@ -2056,16 +2023,6 @@ bool NVPTXAsmPrinter::ignoreLoc(const MachineInstr &MI) case NVPTX::LoadParamRegF32: case NVPTX::LoadParamRegF64: case NVPTX::LoadParamRegI16: case NVPTX::LoadParamRegI32: case NVPTX::LoadParamRegI64: case NVPTX::LoadParamRegI8: - case NVPTX::LoadParamScalar2F32: case NVPTX::LoadParamScalar2F64: - case NVPTX::LoadParamScalar2I16: case NVPTX::LoadParamScalar2I32: - case NVPTX::LoadParamScalar2I64: case NVPTX::LoadParamScalar2I8: - case NVPTX::LoadParamScalar4F32: case NVPTX::LoadParamScalar4I16: - case NVPTX::LoadParamScalar4I32: case NVPTX::LoadParamScalar4I8: - case NVPTX::LoadParamV2F32: case NVPTX::LoadParamV2F64: - case NVPTX::LoadParamV2I16: case NVPTX::LoadParamV2I32: - case NVPTX::LoadParamV2I64: case NVPTX::LoadParamV2I8: - case NVPTX::LoadParamV4F32: case NVPTX::LoadParamV4I16: - case NVPTX::LoadParamV4I32: case NVPTX::LoadParamV4I8: case NVPTX::PrototypeInst: case NVPTX::DBG_VALUE: return true; } diff --git a/lib/Target/NVPTX/NVPTXFrameLowering.cpp b/lib/Target/NVPTX/NVPTXFrameLowering.cpp index 50072c5..bb2c55c 100644 --- a/lib/Target/NVPTX/NVPTXFrameLowering.cpp +++ b/lib/Target/NVPTX/NVPTXFrameLowering.cpp @@ -74,3 +74,14 @@ void NVPTXFrameLowering::emitPrologue(MachineFunction &MF) const { void NVPTXFrameLowering::emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const { } + +// This function eliminates ADJCALLSTACKDOWN, +// ADJCALLSTACKUP pseudo instructions +void NVPTXFrameLowering:: +eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const { + // Simply discard ADJCALLSTACKDOWN, + // ADJCALLSTACKUP instructions. 
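For contrast with the Mips version earlier in this patch, which folds the pseudo's immediate into a real $sp adjustment whenever the call frame is not reserved: PTX is a virtual ISA with no physical stack pointer to adjust, so NVPTX can drop the pseudos outright. A simplified side-by-side sketch with stand-in types, not LLVM's:

    #include <cstdint>
    #include <list>

    enum Opcode { ADJCALLSTACKDOWN, ADJCALLSTACKUP, ADDIU_SP };
    struct Instr { Opcode Op; int64_t Imm; };
    using Block = std::list<Instr>;

    // Mips style: rewrite the pseudo into a real $sp adjustment when the
    // call frame is not reserved, then delete the pseudo.
    void eliminateMipsStyle(Block &MBB, Block::iterator I,
                            bool ReservedFrame) {
      if (!ReservedFrame) {
        int64_t Amount = I->Imm;
        if (I->Op == ADJCALLSTACKDOWN)
          Amount = -Amount;                // the stack grows downwards
        MBB.insert(I, {ADDIU_SP, Amount}); // real $sp += Amount
      }
      MBB.erase(I); // the pseudo always disappears
    }

    // NVPTX style: nothing to adjust, just discard the pseudo.
    void eliminateNVPTXStyle(Block &MBB, Block::iterator I) {
      MBB.erase(I);
    }

    int main() {
      Block B = {{ADJCALLSTACKDOWN, 16}};
      eliminateMipsStyle(B, B.begin(), /*ReservedFrame=*/false);
      return B.size() == 1 && B.front().Op == ADDIU_SP ? 0 : 1;
    }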
+ MBB.erase(I); +} + diff --git a/lib/Target/NVPTX/NVPTXFrameLowering.h b/lib/Target/NVPTX/NVPTXFrameLowering.h index ee87b39..d34e7be 100644 --- a/lib/Target/NVPTX/NVPTXFrameLowering.h +++ b/lib/Target/NVPTX/NVPTXFrameLowering.h @@ -33,6 +33,10 @@ public: virtual void emitPrologue(MachineFunction &MF) const; virtual void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const; + + void eliminateCallFramePseudoInstr(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const; }; } // End llvm namespace diff --git a/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 36ab7f5..481f13a 100644 --- a/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -105,6 +105,21 @@ SDNode* NVPTXDAGToDAGISel::Select(SDNode *N) { case ISD::STORE: ResNode = SelectStore(N); break; + case NVPTXISD::LoadV2: + case NVPTXISD::LoadV4: + ResNode = SelectLoadVector(N); + break; + case NVPTXISD::LDGV2: + case NVPTXISD::LDGV4: + case NVPTXISD::LDUV2: + case NVPTXISD::LDUV4: + ResNode = SelectLDGLDUVector(N); + break; + case NVPTXISD::StoreV2: + case NVPTXISD::StoreV4: + ResNode = SelectStoreVector(N); + break; + default: break; } if (ResNode) return ResNode; @@ -214,16 +229,6 @@ SDNode* NVPTXDAGToDAGISel::SelectLoad(SDNode *N) { case MVT::i64: Opcode = NVPTX::LD_i64_avar; break; case MVT::f32: Opcode = NVPTX::LD_f32_avar; break; case MVT::f64: Opcode = NVPTX::LD_f64_avar; break; - case MVT::v2i8: Opcode = NVPTX::LD_v2i8_avar; break; - case MVT::v2i16: Opcode = NVPTX::LD_v2i16_avar; break; - case MVT::v2i32: Opcode = NVPTX::LD_v2i32_avar; break; - case MVT::v2i64: Opcode = NVPTX::LD_v2i64_avar; break; - case MVT::v2f32: Opcode = NVPTX::LD_v2f32_avar; break; - case MVT::v2f64: Opcode = NVPTX::LD_v2f64_avar; break; - case MVT::v4i8: Opcode = NVPTX::LD_v4i8_avar; break; - case MVT::v4i16: Opcode = NVPTX::LD_v4i16_avar; break; - case MVT::v4i32: Opcode = NVPTX::LD_v4i32_avar; break; - case MVT::v4f32: Opcode = NVPTX::LD_v4f32_avar; break; default: return NULL; } SDValue Ops[] = { getI32Imm(isVolatile), @@ -244,16 +249,6 @@ SDNode* NVPTXDAGToDAGISel::SelectLoad(SDNode *N) { case MVT::i64: Opcode = NVPTX::LD_i64_asi; break; case MVT::f32: Opcode = NVPTX::LD_f32_asi; break; case MVT::f64: Opcode = NVPTX::LD_f64_asi; break; - case MVT::v2i8: Opcode = NVPTX::LD_v2i8_asi; break; - case MVT::v2i16: Opcode = NVPTX::LD_v2i16_asi; break; - case MVT::v2i32: Opcode = NVPTX::LD_v2i32_asi; break; - case MVT::v2i64: Opcode = NVPTX::LD_v2i64_asi; break; - case MVT::v2f32: Opcode = NVPTX::LD_v2f32_asi; break; - case MVT::v2f64: Opcode = NVPTX::LD_v2f64_asi; break; - case MVT::v4i8: Opcode = NVPTX::LD_v4i8_asi; break; - case MVT::v4i16: Opcode = NVPTX::LD_v4i16_asi; break; - case MVT::v4i32: Opcode = NVPTX::LD_v4i32_asi; break; - case MVT::v4f32: Opcode = NVPTX::LD_v4f32_asi; break; default: return NULL; } SDValue Ops[] = { getI32Imm(isVolatile), @@ -267,24 +262,26 @@ SDNode* NVPTXDAGToDAGISel::SelectLoad(SDNode *N) { } else if (Subtarget.is64Bit()? 
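// Register-plus-immediate ("ari") addressing follows. The machine opcode
// must agree with the width of the address register class (Int32Regs vs
// Int64Regs), so the 64-bit ABI picks the new *_ari_64 opcode variants
// below while the printed "ld" mnemonic stays the same.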
SelectADDRri64(N1.getNode(), N1, Base, Offset): SelectADDRri(N1.getNode(), N1, Base, Offset)) { - switch (TargetVT) { - case MVT::i8: Opcode = NVPTX::LD_i8_ari; break; - case MVT::i16: Opcode = NVPTX::LD_i16_ari; break; - case MVT::i32: Opcode = NVPTX::LD_i32_ari; break; - case MVT::i64: Opcode = NVPTX::LD_i64_ari; break; - case MVT::f32: Opcode = NVPTX::LD_f32_ari; break; - case MVT::f64: Opcode = NVPTX::LD_f64_ari; break; - case MVT::v2i8: Opcode = NVPTX::LD_v2i8_ari; break; - case MVT::v2i16: Opcode = NVPTX::LD_v2i16_ari; break; - case MVT::v2i32: Opcode = NVPTX::LD_v2i32_ari; break; - case MVT::v2i64: Opcode = NVPTX::LD_v2i64_ari; break; - case MVT::v2f32: Opcode = NVPTX::LD_v2f32_ari; break; - case MVT::v2f64: Opcode = NVPTX::LD_v2f64_ari; break; - case MVT::v4i8: Opcode = NVPTX::LD_v4i8_ari; break; - case MVT::v4i16: Opcode = NVPTX::LD_v4i16_ari; break; - case MVT::v4i32: Opcode = NVPTX::LD_v4i32_ari; break; - case MVT::v4f32: Opcode = NVPTX::LD_v4f32_ari; break; - default: return NULL; + if (Subtarget.is64Bit()) { + switch (TargetVT) { + case MVT::i8: Opcode = NVPTX::LD_i8_ari_64; break; + case MVT::i16: Opcode = NVPTX::LD_i16_ari_64; break; + case MVT::i32: Opcode = NVPTX::LD_i32_ari_64; break; + case MVT::i64: Opcode = NVPTX::LD_i64_ari_64; break; + case MVT::f32: Opcode = NVPTX::LD_f32_ari_64; break; + case MVT::f64: Opcode = NVPTX::LD_f64_ari_64; break; + default: return NULL; + } + } else { + switch (TargetVT) { + case MVT::i8: Opcode = NVPTX::LD_i8_ari; break; + case MVT::i16: Opcode = NVPTX::LD_i16_ari; break; + case MVT::i32: Opcode = NVPTX::LD_i32_ari; break; + case MVT::i64: Opcode = NVPTX::LD_i64_ari; break; + case MVT::f32: Opcode = NVPTX::LD_f32_ari; break; + case MVT::f64: Opcode = NVPTX::LD_f64_ari; break; + default: return NULL; + } } SDValue Ops[] = { getI32Imm(isVolatile), getI32Imm(codeAddrSpace), @@ -296,24 +293,26 @@ SDNode* NVPTXDAGToDAGISel::SelectLoad(SDNode *N) { MVT::Other, Ops, 8); } else { - switch (TargetVT) { - case MVT::i8: Opcode = NVPTX::LD_i8_areg; break; - case MVT::i16: Opcode = NVPTX::LD_i16_areg; break; - case MVT::i32: Opcode = NVPTX::LD_i32_areg; break; - case MVT::i64: Opcode = NVPTX::LD_i64_areg; break; - case MVT::f32: Opcode = NVPTX::LD_f32_areg; break; - case MVT::f64: Opcode = NVPTX::LD_f64_areg; break; - case MVT::v2i8: Opcode = NVPTX::LD_v2i8_areg; break; - case MVT::v2i16: Opcode = NVPTX::LD_v2i16_areg; break; - case MVT::v2i32: Opcode = NVPTX::LD_v2i32_areg; break; - case MVT::v2i64: Opcode = NVPTX::LD_v2i64_areg; break; - case MVT::v2f32: Opcode = NVPTX::LD_v2f32_areg; break; - case MVT::v2f64: Opcode = NVPTX::LD_v2f64_areg; break; - case MVT::v4i8: Opcode = NVPTX::LD_v4i8_areg; break; - case MVT::v4i16: Opcode = NVPTX::LD_v4i16_areg; break; - case MVT::v4i32: Opcode = NVPTX::LD_v4i32_areg; break; - case MVT::v4f32: Opcode = NVPTX::LD_v4f32_areg; break; - default: return NULL; + if (Subtarget.is64Bit()) { + switch (TargetVT) { + case MVT::i8: Opcode = NVPTX::LD_i8_areg_64; break; + case MVT::i16: Opcode = NVPTX::LD_i16_areg_64; break; + case MVT::i32: Opcode = NVPTX::LD_i32_areg_64; break; + case MVT::i64: Opcode = NVPTX::LD_i64_areg_64; break; + case MVT::f32: Opcode = NVPTX::LD_f32_areg_64; break; + case MVT::f64: Opcode = NVPTX::LD_f64_areg_64; break; + default: return NULL; + } + } else { + switch (TargetVT) { + case MVT::i8: Opcode = NVPTX::LD_i8_areg; break; + case MVT::i16: Opcode = NVPTX::LD_i16_areg; break; + case MVT::i32: Opcode = NVPTX::LD_i32_areg; break; + case MVT::i64: Opcode = NVPTX::LD_i64_areg; break; + case 
MVT::f32: Opcode = NVPTX::LD_f32_areg; break; + case MVT::f64: Opcode = NVPTX::LD_f64_areg; break; + default: return NULL; + } } SDValue Ops[] = { getI32Imm(isVolatile), getI32Imm(codeAddrSpace), @@ -334,6 +333,370 @@ SDNode* NVPTXDAGToDAGISel::SelectLoad(SDNode *N) { return NVPTXLD; } +SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) { + + SDValue Chain = N->getOperand(0); + SDValue Op1 = N->getOperand(1); + SDValue Addr, Offset, Base; + unsigned Opcode; + DebugLoc DL = N->getDebugLoc(); + SDNode *LD; + MemSDNode *MemSD = cast<MemSDNode>(N); + EVT LoadedVT = MemSD->getMemoryVT(); + + + if (!LoadedVT.isSimple()) + return NULL; + + // Address Space Setting + unsigned int CodeAddrSpace = getCodeAddrSpace(MemSD, Subtarget); + + // Volatile Setting + // - .volatile is only availalble for .global and .shared + bool IsVolatile = MemSD->isVolatile(); + if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL && + CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED && + CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC) + IsVolatile = false; + + // Vector Setting + MVT SimpleVT = LoadedVT.getSimpleVT(); + + // Type Setting: fromType + fromTypeWidth + // + // Sign : ISD::SEXTLOAD + // Unsign : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the + // type is integer + // Float : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float + MVT ScalarVT = SimpleVT.getScalarType(); + unsigned FromTypeWidth = ScalarVT.getSizeInBits(); + unsigned int FromType; + // The last operand holds the original LoadSDNode::getExtensionType() value + unsigned ExtensionType = + cast<ConstantSDNode>(N->getOperand(N->getNumOperands()-1))->getZExtValue(); + if (ExtensionType == ISD::SEXTLOAD) + FromType = NVPTX::PTXLdStInstCode::Signed; + else if (ScalarVT.isFloatingPoint()) + FromType = NVPTX::PTXLdStInstCode::Float; + else + FromType = NVPTX::PTXLdStInstCode::Unsigned; + + unsigned VecType; + + switch (N->getOpcode()) { + case NVPTXISD::LoadV2: VecType = NVPTX::PTXLdStInstCode::V2; break; + case NVPTXISD::LoadV4: VecType = NVPTX::PTXLdStInstCode::V4; break; + default: return NULL; + } + + EVT EltVT = N->getValueType(0); + + if (SelectDirectAddr(Op1, Addr)) { + switch (N->getOpcode()) { + default: return NULL; + case NVPTXISD::LoadV2: + switch (EltVT.getSimpleVT().SimpleTy) { + default: return NULL; + case MVT::i8: Opcode = NVPTX::LDV_i8_v2_avar; break; + case MVT::i16: Opcode = NVPTX::LDV_i16_v2_avar; break; + case MVT::i32: Opcode = NVPTX::LDV_i32_v2_avar; break; + case MVT::i64: Opcode = NVPTX::LDV_i64_v2_avar; break; + case MVT::f32: Opcode = NVPTX::LDV_f32_v2_avar; break; + case MVT::f64: Opcode = NVPTX::LDV_f64_v2_avar; break; + } + break; + case NVPTXISD::LoadV4: + switch (EltVT.getSimpleVT().SimpleTy) { + default: return NULL; + case MVT::i8: Opcode = NVPTX::LDV_i8_v4_avar; break; + case MVT::i16: Opcode = NVPTX::LDV_i16_v4_avar; break; + case MVT::i32: Opcode = NVPTX::LDV_i32_v4_avar; break; + case MVT::f32: Opcode = NVPTX::LDV_f32_v4_avar; break; + } + break; + } + + SDValue Ops[] = { getI32Imm(IsVolatile), + getI32Imm(CodeAddrSpace), + getI32Imm(VecType), + getI32Imm(FromType), + getI32Imm(FromTypeWidth), + Addr, Chain }; + LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops, 7); + } else if (Subtarget.is64Bit()? 
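// Each LDV node carries the same immediate flags as the scalar loads:
// volatile, address space, vector arity (v2/v4), element type class and
// element width, followed by the address operand(s) and the chain. The
// LdStCode modifiers in NVPTXInstrInfo.td render these as the ld.*
// qualifiers; a v2 i32 global load, for example, prints roughly as:
//
//   ld.global.v2.u32 {%r0, %r1}, [gbl];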
+ SelectADDRsi64(Op1.getNode(), Op1, Base, Offset): + SelectADDRsi(Op1.getNode(), Op1, Base, Offset)) { + switch (N->getOpcode()) { + default: return NULL; + case NVPTXISD::LoadV2: + switch (EltVT.getSimpleVT().SimpleTy) { + default: return NULL; + case MVT::i8: Opcode = NVPTX::LDV_i8_v2_asi; break; + case MVT::i16: Opcode = NVPTX::LDV_i16_v2_asi; break; + case MVT::i32: Opcode = NVPTX::LDV_i32_v2_asi; break; + case MVT::i64: Opcode = NVPTX::LDV_i64_v2_asi; break; + case MVT::f32: Opcode = NVPTX::LDV_f32_v2_asi; break; + case MVT::f64: Opcode = NVPTX::LDV_f64_v2_asi; break; + } + break; + case NVPTXISD::LoadV4: + switch (EltVT.getSimpleVT().SimpleTy) { + default: return NULL; + case MVT::i8: Opcode = NVPTX::LDV_i8_v4_asi; break; + case MVT::i16: Opcode = NVPTX::LDV_i16_v4_asi; break; + case MVT::i32: Opcode = NVPTX::LDV_i32_v4_asi; break; + case MVT::f32: Opcode = NVPTX::LDV_f32_v4_asi; break; + } + break; + } + + SDValue Ops[] = { getI32Imm(IsVolatile), + getI32Imm(CodeAddrSpace), + getI32Imm(VecType), + getI32Imm(FromType), + getI32Imm(FromTypeWidth), + Base, Offset, Chain }; + LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops, 8); + } else if (Subtarget.is64Bit()? + SelectADDRri64(Op1.getNode(), Op1, Base, Offset): + SelectADDRri(Op1.getNode(), Op1, Base, Offset)) { + if (Subtarget.is64Bit()) { + switch (N->getOpcode()) { + default: return NULL; + case NVPTXISD::LoadV2: + switch (EltVT.getSimpleVT().SimpleTy) { + default: return NULL; + case MVT::i8: Opcode = NVPTX::LDV_i8_v2_ari_64; break; + case MVT::i16: Opcode = NVPTX::LDV_i16_v2_ari_64; break; + case MVT::i32: Opcode = NVPTX::LDV_i32_v2_ari_64; break; + case MVT::i64: Opcode = NVPTX::LDV_i64_v2_ari_64; break; + case MVT::f32: Opcode = NVPTX::LDV_f32_v2_ari_64; break; + case MVT::f64: Opcode = NVPTX::LDV_f64_v2_ari_64; break; + } + break; + case NVPTXISD::LoadV4: + switch (EltVT.getSimpleVT().SimpleTy) { + default: return NULL; + case MVT::i8: Opcode = NVPTX::LDV_i8_v4_ari_64; break; + case MVT::i16: Opcode = NVPTX::LDV_i16_v4_ari_64; break; + case MVT::i32: Opcode = NVPTX::LDV_i32_v4_ari_64; break; + case MVT::f32: Opcode = NVPTX::LDV_f32_v4_ari_64; break; + } + break; + } + } else { + switch (N->getOpcode()) { + default: return NULL; + case NVPTXISD::LoadV2: + switch (EltVT.getSimpleVT().SimpleTy) { + default: return NULL; + case MVT::i8: Opcode = NVPTX::LDV_i8_v2_ari; break; + case MVT::i16: Opcode = NVPTX::LDV_i16_v2_ari; break; + case MVT::i32: Opcode = NVPTX::LDV_i32_v2_ari; break; + case MVT::i64: Opcode = NVPTX::LDV_i64_v2_ari; break; + case MVT::f32: Opcode = NVPTX::LDV_f32_v2_ari; break; + case MVT::f64: Opcode = NVPTX::LDV_f64_v2_ari; break; + } + break; + case NVPTXISD::LoadV4: + switch (EltVT.getSimpleVT().SimpleTy) { + default: return NULL; + case MVT::i8: Opcode = NVPTX::LDV_i8_v4_ari; break; + case MVT::i16: Opcode = NVPTX::LDV_i16_v4_ari; break; + case MVT::i32: Opcode = NVPTX::LDV_i32_v4_ari; break; + case MVT::f32: Opcode = NVPTX::LDV_f32_v4_ari; break; + } + break; + } + } + + SDValue Ops[] = { getI32Imm(IsVolatile), + getI32Imm(CodeAddrSpace), + getI32Imm(VecType), + getI32Imm(FromType), + getI32Imm(FromTypeWidth), + Base, Offset, Chain }; + + LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops, 8); + } else { + if (Subtarget.is64Bit()) { + switch (N->getOpcode()) { + default: return NULL; + case NVPTXISD::LoadV2: + switch (EltVT.getSimpleVT().SimpleTy) { + default: return NULL; + case MVT::i8: Opcode = NVPTX::LDV_i8_v2_areg_64; break; + case MVT::i16: Opcode = NVPTX::LDV_i16_v2_areg_64; break; 
+ case MVT::i32: Opcode = NVPTX::LDV_i32_v2_areg_64; break; + case MVT::i64: Opcode = NVPTX::LDV_i64_v2_areg_64; break; + case MVT::f32: Opcode = NVPTX::LDV_f32_v2_areg_64; break; + case MVT::f64: Opcode = NVPTX::LDV_f64_v2_areg_64; break; + } + break; + case NVPTXISD::LoadV4: + switch (EltVT.getSimpleVT().SimpleTy) { + default: return NULL; + case MVT::i8: Opcode = NVPTX::LDV_i8_v4_areg_64; break; + case MVT::i16: Opcode = NVPTX::LDV_i16_v4_areg_64; break; + case MVT::i32: Opcode = NVPTX::LDV_i32_v4_areg_64; break; + case MVT::f32: Opcode = NVPTX::LDV_f32_v4_areg_64; break; + } + break; + } + } else { + switch (N->getOpcode()) { + default: return NULL; + case NVPTXISD::LoadV2: + switch (EltVT.getSimpleVT().SimpleTy) { + default: return NULL; + case MVT::i8: Opcode = NVPTX::LDV_i8_v2_areg; break; + case MVT::i16: Opcode = NVPTX::LDV_i16_v2_areg; break; + case MVT::i32: Opcode = NVPTX::LDV_i32_v2_areg; break; + case MVT::i64: Opcode = NVPTX::LDV_i64_v2_areg; break; + case MVT::f32: Opcode = NVPTX::LDV_f32_v2_areg; break; + case MVT::f64: Opcode = NVPTX::LDV_f64_v2_areg; break; + } + break; + case NVPTXISD::LoadV4: + switch (EltVT.getSimpleVT().SimpleTy) { + default: return NULL; + case MVT::i8: Opcode = NVPTX::LDV_i8_v4_areg; break; + case MVT::i16: Opcode = NVPTX::LDV_i16_v4_areg; break; + case MVT::i32: Opcode = NVPTX::LDV_i32_v4_areg; break; + case MVT::f32: Opcode = NVPTX::LDV_f32_v4_areg; break; + } + break; + } + } + + SDValue Ops[] = { getI32Imm(IsVolatile), + getI32Imm(CodeAddrSpace), + getI32Imm(VecType), + getI32Imm(FromType), + getI32Imm(FromTypeWidth), + Op1, Chain }; + LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops, 7); + } + + MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1); + MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand(); + cast<MachineSDNode>(LD)->setMemRefs(MemRefs0, MemRefs0 + 1); + + return LD; +} + +SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) { + + SDValue Chain = N->getOperand(0); + SDValue Op1 = N->getOperand(1); + unsigned Opcode; + DebugLoc DL = N->getDebugLoc(); + SDNode *LD; + + EVT RetVT = N->getValueType(0); + + // Select opcode + if (Subtarget.is64Bit()) { + switch (N->getOpcode()) { + default: return NULL; + case NVPTXISD::LDGV2: + switch (RetVT.getSimpleVT().SimpleTy) { + default: return NULL; + case MVT::i8: Opcode = NVPTX::INT_PTX_LDG_G_v2i8_ELE_64; break; + case MVT::i16: Opcode = NVPTX::INT_PTX_LDG_G_v2i16_ELE_64; break; + case MVT::i32: Opcode = NVPTX::INT_PTX_LDG_G_v2i32_ELE_64; break; + case MVT::i64: Opcode = NVPTX::INT_PTX_LDG_G_v2i64_ELE_64; break; + case MVT::f32: Opcode = NVPTX::INT_PTX_LDG_G_v2f32_ELE_64; break; + case MVT::f64: Opcode = NVPTX::INT_PTX_LDG_G_v2f64_ELE_64; break; + } + break; + case NVPTXISD::LDGV4: + switch (RetVT.getSimpleVT().SimpleTy) { + default: return NULL; + case MVT::i8: Opcode = NVPTX::INT_PTX_LDG_G_v4i8_ELE_64; break; + case MVT::i16: Opcode = NVPTX::INT_PTX_LDG_G_v4i16_ELE_64; break; + case MVT::i32: Opcode = NVPTX::INT_PTX_LDG_G_v4i32_ELE_64; break; + case MVT::f32: Opcode = NVPTX::INT_PTX_LDG_G_v4f32_ELE_64; break; + } + break; + case NVPTXISD::LDUV2: + switch (RetVT.getSimpleVT().SimpleTy) { + default: return NULL; + case MVT::i8: Opcode = NVPTX::INT_PTX_LDU_G_v2i8_ELE_64; break; + case MVT::i16: Opcode = NVPTX::INT_PTX_LDU_G_v2i16_ELE_64; break; + case MVT::i32: Opcode = NVPTX::INT_PTX_LDU_G_v2i32_ELE_64; break; + case MVT::i64: Opcode = NVPTX::INT_PTX_LDU_G_v2i64_ELE_64; break; + case MVT::f32: Opcode = NVPTX::INT_PTX_LDU_G_v2f32_ELE_64; break; + case MVT::f64: 
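// LDG corresponds to PTX ld.global.nc, a read-only load through the
// non-coherent cache, and LDU to ldu.global, a warp-uniform load; both
// reach instruction selection only via the nvvm_ldg_global_* and
// nvvm_ldu_global_* intrinsics, never through plain ISD::LOAD nodes.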
Opcode = NVPTX::INT_PTX_LDU_G_v2f64_ELE_64; break; + } + break; + case NVPTXISD::LDUV4: + switch (RetVT.getSimpleVT().SimpleTy) { + default: return NULL; + case MVT::i8: Opcode = NVPTX::INT_PTX_LDU_G_v4i8_ELE_64; break; + case MVT::i16: Opcode = NVPTX::INT_PTX_LDU_G_v4i16_ELE_64; break; + case MVT::i32: Opcode = NVPTX::INT_PTX_LDU_G_v4i32_ELE_64; break; + case MVT::f32: Opcode = NVPTX::INT_PTX_LDU_G_v4f32_ELE_64; break; + } + break; + } + } else { + switch (N->getOpcode()) { + default: return NULL; + case NVPTXISD::LDGV2: + switch (RetVT.getSimpleVT().SimpleTy) { + default: return NULL; + case MVT::i8: Opcode = NVPTX::INT_PTX_LDG_G_v2i8_ELE_32; break; + case MVT::i16: Opcode = NVPTX::INT_PTX_LDG_G_v2i16_ELE_32; break; + case MVT::i32: Opcode = NVPTX::INT_PTX_LDG_G_v2i32_ELE_32; break; + case MVT::i64: Opcode = NVPTX::INT_PTX_LDG_G_v2i64_ELE_32; break; + case MVT::f32: Opcode = NVPTX::INT_PTX_LDG_G_v2f32_ELE_32; break; + case MVT::f64: Opcode = NVPTX::INT_PTX_LDG_G_v2f64_ELE_32; break; + } + break; + case NVPTXISD::LDGV4: + switch (RetVT.getSimpleVT().SimpleTy) { + default: return NULL; + case MVT::i8: Opcode = NVPTX::INT_PTX_LDG_G_v4i8_ELE_32; break; + case MVT::i16: Opcode = NVPTX::INT_PTX_LDG_G_v4i16_ELE_32; break; + case MVT::i32: Opcode = NVPTX::INT_PTX_LDG_G_v4i32_ELE_32; break; + case MVT::f32: Opcode = NVPTX::INT_PTX_LDG_G_v4f32_ELE_32; break; + } + break; + case NVPTXISD::LDUV2: + switch (RetVT.getSimpleVT().SimpleTy) { + default: return NULL; + case MVT::i8: Opcode = NVPTX::INT_PTX_LDU_G_v2i8_ELE_32; break; + case MVT::i16: Opcode = NVPTX::INT_PTX_LDU_G_v2i16_ELE_32; break; + case MVT::i32: Opcode = NVPTX::INT_PTX_LDU_G_v2i32_ELE_32; break; + case MVT::i64: Opcode = NVPTX::INT_PTX_LDU_G_v2i64_ELE_32; break; + case MVT::f32: Opcode = NVPTX::INT_PTX_LDU_G_v2f32_ELE_32; break; + case MVT::f64: Opcode = NVPTX::INT_PTX_LDU_G_v2f64_ELE_32; break; + } + break; + case NVPTXISD::LDUV4: + switch (RetVT.getSimpleVT().SimpleTy) { + default: return NULL; + case MVT::i8: Opcode = NVPTX::INT_PTX_LDU_G_v4i8_ELE_32; break; + case MVT::i16: Opcode = NVPTX::INT_PTX_LDU_G_v4i16_ELE_32; break; + case MVT::i32: Opcode = NVPTX::INT_PTX_LDU_G_v4i32_ELE_32; break; + case MVT::f32: Opcode = NVPTX::INT_PTX_LDU_G_v4f32_ELE_32; break; + } + break; + } + } + + SDValue Ops[] = { Op1, Chain }; + LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(), &Ops[0], 2); + + MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1); + MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand(); + cast<MachineSDNode>(LD)->setMemRefs(MemRefs0, MemRefs0 + 1); + + return LD; +} + + SDNode* NVPTXDAGToDAGISel::SelectStore(SDNode *N) { DebugLoc dl = N->getDebugLoc(); StoreSDNode *ST = cast<StoreSDNode>(N); @@ -400,16 +763,6 @@ SDNode* NVPTXDAGToDAGISel::SelectStore(SDNode *N) { case MVT::i64: Opcode = NVPTX::ST_i64_avar; break; case MVT::f32: Opcode = NVPTX::ST_f32_avar; break; case MVT::f64: Opcode = NVPTX::ST_f64_avar; break; - case MVT::v2i8: Opcode = NVPTX::ST_v2i8_avar; break; - case MVT::v2i16: Opcode = NVPTX::ST_v2i16_avar; break; - case MVT::v2i32: Opcode = NVPTX::ST_v2i32_avar; break; - case MVT::v2i64: Opcode = NVPTX::ST_v2i64_avar; break; - case MVT::v2f32: Opcode = NVPTX::ST_v2f32_avar; break; - case MVT::v2f64: Opcode = NVPTX::ST_v2f64_avar; break; - case MVT::v4i8: Opcode = NVPTX::ST_v4i8_avar; break; - case MVT::v4i16: Opcode = NVPTX::ST_v4i16_avar; break; - case MVT::v4i32: Opcode = NVPTX::ST_v4i32_avar; break; - case MVT::v4f32: Opcode = NVPTX::ST_v4f32_avar; break; default: return NULL; } SDValue Ops[] = { N1, 
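// For stores, the value being stored (N1) leads the operand list; the same
// volatile / address-space / vector / type / width immediates used by the
// loads follow, then the address operands and the chain.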
@@ -431,16 +784,6 @@ SDNode* NVPTXDAGToDAGISel::SelectStore(SDNode *N) { case MVT::i64: Opcode = NVPTX::ST_i64_asi; break; case MVT::f32: Opcode = NVPTX::ST_f32_asi; break; case MVT::f64: Opcode = NVPTX::ST_f64_asi; break; - case MVT::v2i8: Opcode = NVPTX::ST_v2i8_asi; break; - case MVT::v2i16: Opcode = NVPTX::ST_v2i16_asi; break; - case MVT::v2i32: Opcode = NVPTX::ST_v2i32_asi; break; - case MVT::v2i64: Opcode = NVPTX::ST_v2i64_asi; break; - case MVT::v2f32: Opcode = NVPTX::ST_v2f32_asi; break; - case MVT::v2f64: Opcode = NVPTX::ST_v2f64_asi; break; - case MVT::v4i8: Opcode = NVPTX::ST_v4i8_asi; break; - case MVT::v4i16: Opcode = NVPTX::ST_v4i16_asi; break; - case MVT::v4i32: Opcode = NVPTX::ST_v4i32_asi; break; - case MVT::v4f32: Opcode = NVPTX::ST_v4f32_asi; break; default: return NULL; } SDValue Ops[] = { N1, @@ -455,24 +798,26 @@ SDNode* NVPTXDAGToDAGISel::SelectStore(SDNode *N) { } else if (Subtarget.is64Bit()? SelectADDRri64(N2.getNode(), N2, Base, Offset): SelectADDRri(N2.getNode(), N2, Base, Offset)) { - switch (SourceVT) { - case MVT::i8: Opcode = NVPTX::ST_i8_ari; break; - case MVT::i16: Opcode = NVPTX::ST_i16_ari; break; - case MVT::i32: Opcode = NVPTX::ST_i32_ari; break; - case MVT::i64: Opcode = NVPTX::ST_i64_ari; break; - case MVT::f32: Opcode = NVPTX::ST_f32_ari; break; - case MVT::f64: Opcode = NVPTX::ST_f64_ari; break; - case MVT::v2i8: Opcode = NVPTX::ST_v2i8_ari; break; - case MVT::v2i16: Opcode = NVPTX::ST_v2i16_ari; break; - case MVT::v2i32: Opcode = NVPTX::ST_v2i32_ari; break; - case MVT::v2i64: Opcode = NVPTX::ST_v2i64_ari; break; - case MVT::v2f32: Opcode = NVPTX::ST_v2f32_ari; break; - case MVT::v2f64: Opcode = NVPTX::ST_v2f64_ari; break; - case MVT::v4i8: Opcode = NVPTX::ST_v4i8_ari; break; - case MVT::v4i16: Opcode = NVPTX::ST_v4i16_ari; break; - case MVT::v4i32: Opcode = NVPTX::ST_v4i32_ari; break; - case MVT::v4f32: Opcode = NVPTX::ST_v4f32_ari; break; - default: return NULL; + if (Subtarget.is64Bit()) { + switch (SourceVT) { + case MVT::i8: Opcode = NVPTX::ST_i8_ari_64; break; + case MVT::i16: Opcode = NVPTX::ST_i16_ari_64; break; + case MVT::i32: Opcode = NVPTX::ST_i32_ari_64; break; + case MVT::i64: Opcode = NVPTX::ST_i64_ari_64; break; + case MVT::f32: Opcode = NVPTX::ST_f32_ari_64; break; + case MVT::f64: Opcode = NVPTX::ST_f64_ari_64; break; + default: return NULL; + } + } else { + switch (SourceVT) { + case MVT::i8: Opcode = NVPTX::ST_i8_ari; break; + case MVT::i16: Opcode = NVPTX::ST_i16_ari; break; + case MVT::i32: Opcode = NVPTX::ST_i32_ari; break; + case MVT::i64: Opcode = NVPTX::ST_i64_ari; break; + case MVT::f32: Opcode = NVPTX::ST_f32_ari; break; + case MVT::f64: Opcode = NVPTX::ST_f64_ari; break; + default: return NULL; + } } SDValue Ops[] = { N1, getI32Imm(isVolatile), @@ -484,24 +829,26 @@ SDNode* NVPTXDAGToDAGISel::SelectStore(SDNode *N) { NVPTXST = CurDAG->getMachineNode(Opcode, dl, MVT::Other, Ops, 9); } else { - switch (SourceVT) { - case MVT::i8: Opcode = NVPTX::ST_i8_areg; break; - case MVT::i16: Opcode = NVPTX::ST_i16_areg; break; - case MVT::i32: Opcode = NVPTX::ST_i32_areg; break; - case MVT::i64: Opcode = NVPTX::ST_i64_areg; break; - case MVT::f32: Opcode = NVPTX::ST_f32_areg; break; - case MVT::f64: Opcode = NVPTX::ST_f64_areg; break; - case MVT::v2i8: Opcode = NVPTX::ST_v2i8_areg; break; - case MVT::v2i16: Opcode = NVPTX::ST_v2i16_areg; break; - case MVT::v2i32: Opcode = NVPTX::ST_v2i32_areg; break; - case MVT::v2i64: Opcode = NVPTX::ST_v2i64_areg; break; - case MVT::v2f32: Opcode = NVPTX::ST_v2f32_areg; break; - case MVT::v2f64: 
Opcode = NVPTX::ST_v2f64_areg; break; - case MVT::v4i8: Opcode = NVPTX::ST_v4i8_areg; break; - case MVT::v4i16: Opcode = NVPTX::ST_v4i16_areg; break; - case MVT::v4i32: Opcode = NVPTX::ST_v4i32_areg; break; - case MVT::v4f32: Opcode = NVPTX::ST_v4f32_areg; break; - default: return NULL; + if (Subtarget.is64Bit()) { + switch (SourceVT) { + case MVT::i8: Opcode = NVPTX::ST_i8_areg_64; break; + case MVT::i16: Opcode = NVPTX::ST_i16_areg_64; break; + case MVT::i32: Opcode = NVPTX::ST_i32_areg_64; break; + case MVT::i64: Opcode = NVPTX::ST_i64_areg_64; break; + case MVT::f32: Opcode = NVPTX::ST_f32_areg_64; break; + case MVT::f64: Opcode = NVPTX::ST_f64_areg_64; break; + default: return NULL; + } + } else { + switch (SourceVT) { + case MVT::i8: Opcode = NVPTX::ST_i8_areg; break; + case MVT::i16: Opcode = NVPTX::ST_i16_areg; break; + case MVT::i32: Opcode = NVPTX::ST_i32_areg; break; + case MVT::i64: Opcode = NVPTX::ST_i64_areg; break; + case MVT::f32: Opcode = NVPTX::ST_f32_areg; break; + case MVT::f64: Opcode = NVPTX::ST_f64_areg; break; + default: return NULL; + } } SDValue Ops[] = { N1, getI32Imm(isVolatile), @@ -523,6 +870,244 @@ SDNode* NVPTXDAGToDAGISel::SelectStore(SDNode *N) { return NVPTXST; } +SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) { + SDValue Chain = N->getOperand(0); + SDValue Op1 = N->getOperand(1); + SDValue Addr, Offset, Base; + unsigned Opcode; + DebugLoc DL = N->getDebugLoc(); + SDNode *ST; + EVT EltVT = Op1.getValueType(); + MemSDNode *MemSD = cast<MemSDNode>(N); + EVT StoreVT = MemSD->getMemoryVT(); + + // Address Space Setting + unsigned CodeAddrSpace = getCodeAddrSpace(MemSD, Subtarget); + + if (CodeAddrSpace == NVPTX::PTXLdStInstCode::CONSTANT) { + report_fatal_error("Cannot store to pointer that points to constant " + "memory space"); + } + + // Volatile Setting + // - .volatile is only availalble for .global and .shared + bool IsVolatile = MemSD->isVolatile(); + if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL && + CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED && + CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC) + IsVolatile = false; + + // Type Setting: toType + toTypeWidth + // - for integer type, always use 'u' + assert(StoreVT.isSimple() && "Store value is not simple"); + MVT ScalarVT = StoreVT.getSimpleVT().getScalarType(); + unsigned ToTypeWidth = ScalarVT.getSizeInBits(); + unsigned ToType; + if (ScalarVT.isFloatingPoint()) + ToType = NVPTX::PTXLdStInstCode::Float; + else + ToType = NVPTX::PTXLdStInstCode::Unsigned; + + + SmallVector<SDValue, 12> StOps; + SDValue N2; + unsigned VecType; + + switch (N->getOpcode()) { + case NVPTXISD::StoreV2: + VecType = NVPTX::PTXLdStInstCode::V2; + StOps.push_back(N->getOperand(1)); + StOps.push_back(N->getOperand(2)); + N2 = N->getOperand(3); + break; + case NVPTXISD::StoreV4: + VecType = NVPTX::PTXLdStInstCode::V4; + StOps.push_back(N->getOperand(1)); + StOps.push_back(N->getOperand(2)); + StOps.push_back(N->getOperand(3)); + StOps.push_back(N->getOperand(4)); + N2 = N->getOperand(5); + break; + default: return NULL; + } + + StOps.push_back(getI32Imm(IsVolatile)); + StOps.push_back(getI32Imm(CodeAddrSpace)); + StOps.push_back(getI32Imm(VecType)); + StOps.push_back(getI32Imm(ToType)); + StOps.push_back(getI32Imm(ToTypeWidth)); + + if (SelectDirectAddr(N2, Addr)) { + switch (N->getOpcode()) { + default: return NULL; + case NVPTXISD::StoreV2: + switch (EltVT.getSimpleVT().SimpleTy) { + default: return NULL; + case MVT::i8: Opcode = NVPTX::STV_i8_v2_avar; break; + case MVT::i16: Opcode = 
NVPTX::STV_i16_v2_avar; break; + case MVT::i32: Opcode = NVPTX::STV_i32_v2_avar; break; + case MVT::i64: Opcode = NVPTX::STV_i64_v2_avar; break; + case MVT::f32: Opcode = NVPTX::STV_f32_v2_avar; break; + case MVT::f64: Opcode = NVPTX::STV_f64_v2_avar; break; + } + break; + case NVPTXISD::StoreV4: + switch (EltVT.getSimpleVT().SimpleTy) { + default: return NULL; + case MVT::i8: Opcode = NVPTX::STV_i8_v4_avar; break; + case MVT::i16: Opcode = NVPTX::STV_i16_v4_avar; break; + case MVT::i32: Opcode = NVPTX::STV_i32_v4_avar; break; + case MVT::f32: Opcode = NVPTX::STV_f32_v4_avar; break; + } + break; + } + StOps.push_back(Addr); + } else if (Subtarget.is64Bit()? + SelectADDRsi64(N2.getNode(), N2, Base, Offset): + SelectADDRsi(N2.getNode(), N2, Base, Offset)) { + switch (N->getOpcode()) { + default: return NULL; + case NVPTXISD::StoreV2: + switch (EltVT.getSimpleVT().SimpleTy) { + default: return NULL; + case MVT::i8: Opcode = NVPTX::STV_i8_v2_asi; break; + case MVT::i16: Opcode = NVPTX::STV_i16_v2_asi; break; + case MVT::i32: Opcode = NVPTX::STV_i32_v2_asi; break; + case MVT::i64: Opcode = NVPTX::STV_i64_v2_asi; break; + case MVT::f32: Opcode = NVPTX::STV_f32_v2_asi; break; + case MVT::f64: Opcode = NVPTX::STV_f64_v2_asi; break; + } + break; + case NVPTXISD::StoreV4: + switch (EltVT.getSimpleVT().SimpleTy) { + default: return NULL; + case MVT::i8: Opcode = NVPTX::STV_i8_v4_asi; break; + case MVT::i16: Opcode = NVPTX::STV_i16_v4_asi; break; + case MVT::i32: Opcode = NVPTX::STV_i32_v4_asi; break; + case MVT::f32: Opcode = NVPTX::STV_f32_v4_asi; break; + } + break; + } + StOps.push_back(Base); + StOps.push_back(Offset); + } else if (Subtarget.is64Bit()? + SelectADDRri64(N2.getNode(), N2, Base, Offset): + SelectADDRri(N2.getNode(), N2, Base, Offset)) { + if (Subtarget.is64Bit()) { + switch (N->getOpcode()) { + default: return NULL; + case NVPTXISD::StoreV2: + switch (EltVT.getSimpleVT().SimpleTy) { + default: return NULL; + case MVT::i8: Opcode = NVPTX::STV_i8_v2_ari_64; break; + case MVT::i16: Opcode = NVPTX::STV_i16_v2_ari_64; break; + case MVT::i32: Opcode = NVPTX::STV_i32_v2_ari_64; break; + case MVT::i64: Opcode = NVPTX::STV_i64_v2_ari_64; break; + case MVT::f32: Opcode = NVPTX::STV_f32_v2_ari_64; break; + case MVT::f64: Opcode = NVPTX::STV_f64_v2_ari_64; break; + } + break; + case NVPTXISD::StoreV4: + switch (EltVT.getSimpleVT().SimpleTy) { + default: return NULL; + case MVT::i8: Opcode = NVPTX::STV_i8_v4_ari_64; break; + case MVT::i16: Opcode = NVPTX::STV_i16_v4_ari_64; break; + case MVT::i32: Opcode = NVPTX::STV_i32_v4_ari_64; break; + case MVT::f32: Opcode = NVPTX::STV_f32_v4_ari_64; break; + } + break; + } + } else { + switch (N->getOpcode()) { + default: return NULL; + case NVPTXISD::StoreV2: + switch (EltVT.getSimpleVT().SimpleTy) { + default: return NULL; + case MVT::i8: Opcode = NVPTX::STV_i8_v2_ari; break; + case MVT::i16: Opcode = NVPTX::STV_i16_v2_ari; break; + case MVT::i32: Opcode = NVPTX::STV_i32_v2_ari; break; + case MVT::i64: Opcode = NVPTX::STV_i64_v2_ari; break; + case MVT::f32: Opcode = NVPTX::STV_f32_v2_ari; break; + case MVT::f64: Opcode = NVPTX::STV_f64_v2_ari; break; + } + break; + case NVPTXISD::StoreV4: + switch (EltVT.getSimpleVT().SimpleTy) { + default: return NULL; + case MVT::i8: Opcode = NVPTX::STV_i8_v4_ari; break; + case MVT::i16: Opcode = NVPTX::STV_i16_v4_ari; break; + case MVT::i32: Opcode = NVPTX::STV_i32_v4_ari; break; + case MVT::f32: Opcode = NVPTX::STV_f32_v4_ari; break; + } + break; + } + } + StOps.push_back(Base); + StOps.push_back(Offset); + } else { 
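// Fallback path: neither a direct symbol (avar), a symbol+immediate (asi),
// nor a register+immediate (ari) form matched, so the pointer is used
// directly from a register ("areg"). Every selector in this file runs
// essentially the same four-way cascade:
//
//   SelectDirectAddr ? avar
//     : SelectADDRsi(64) ? asi
//     : SelectADDRri(64) ? ari : areg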
+ if (Subtarget.is64Bit()) { + switch (N->getOpcode()) { + default: return NULL; + case NVPTXISD::StoreV2: + switch (EltVT.getSimpleVT().SimpleTy) { + default: return NULL; + case MVT::i8: Opcode = NVPTX::STV_i8_v2_areg_64; break; + case MVT::i16: Opcode = NVPTX::STV_i16_v2_areg_64; break; + case MVT::i32: Opcode = NVPTX::STV_i32_v2_areg_64; break; + case MVT::i64: Opcode = NVPTX::STV_i64_v2_areg_64; break; + case MVT::f32: Opcode = NVPTX::STV_f32_v2_areg_64; break; + case MVT::f64: Opcode = NVPTX::STV_f64_v2_areg_64; break; + } + break; + case NVPTXISD::StoreV4: + switch (EltVT.getSimpleVT().SimpleTy) { + default: return NULL; + case MVT::i8: Opcode = NVPTX::STV_i8_v4_areg_64; break; + case MVT::i16: Opcode = NVPTX::STV_i16_v4_areg_64; break; + case MVT::i32: Opcode = NVPTX::STV_i32_v4_areg_64; break; + case MVT::f32: Opcode = NVPTX::STV_f32_v4_areg_64; break; + } + break; + } + } else { + switch (N->getOpcode()) { + default: return NULL; + case NVPTXISD::StoreV2: + switch (EltVT.getSimpleVT().SimpleTy) { + default: return NULL; + case MVT::i8: Opcode = NVPTX::STV_i8_v2_areg; break; + case MVT::i16: Opcode = NVPTX::STV_i16_v2_areg; break; + case MVT::i32: Opcode = NVPTX::STV_i32_v2_areg; break; + case MVT::i64: Opcode = NVPTX::STV_i64_v2_areg; break; + case MVT::f32: Opcode = NVPTX::STV_f32_v2_areg; break; + case MVT::f64: Opcode = NVPTX::STV_f64_v2_areg; break; + } + break; + case NVPTXISD::StoreV4: + switch (EltVT.getSimpleVT().SimpleTy) { + default: return NULL; + case MVT::i8: Opcode = NVPTX::STV_i8_v4_areg; break; + case MVT::i16: Opcode = NVPTX::STV_i16_v4_areg; break; + case MVT::i32: Opcode = NVPTX::STV_i32_v4_areg; break; + case MVT::f32: Opcode = NVPTX::STV_f32_v4_areg; break; + } + break; + } + } + StOps.push_back(N2); + } + + StOps.push_back(Chain); + + ST = CurDAG->getMachineNode(Opcode, DL, MVT::Other, &StOps[0], StOps.size()); + + MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1); + MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand(); + cast<MachineSDNode>(ST)->setMemRefs(MemRefs0, MemRefs0 + 1); + + return ST; +} + // SelectDirectAddr - Match a direct address for DAG. // A direct address could be a globaladdress or externalsymbol. 
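// Matching succeeds for addresses PTX can name symbolically, e.g. the
// target-wrapped GlobalAddress or ExternalSymbol nodes created for
// module-scope variables; such operands select the *_avar instruction
// forms used above.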
bool NVPTXDAGToDAGISel::SelectDirectAddr(SDValue N, SDValue &Address) { diff --git a/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/lib/Target/NVPTX/NVPTXISelDAGToDAG.h index 14f2091..4ec9241 100644 --- a/lib/Target/NVPTX/NVPTXISelDAGToDAG.h +++ b/lib/Target/NVPTX/NVPTXISelDAGToDAG.h @@ -72,8 +72,11 @@ private: #include "NVPTXGenDAGISel.inc" SDNode *Select(SDNode *N); - SDNode* SelectLoad(SDNode *N); - SDNode* SelectStore(SDNode *N); + SDNode *SelectLoad(SDNode *N); + SDNode *SelectLoadVector(SDNode *N); + SDNode *SelectLDGLDUVector(SDNode *N); + SDNode *SelectStore(SDNode *N); + SDNode *SelectStoreVector(SDNode *N); inline SDValue getI32Imm(unsigned Imm) { return CurDAG->getTargetConstant(Imm, MVT::i32); diff --git a/lib/Target/NVPTX/NVPTXISelLowering.cpp b/lib/Target/NVPTX/NVPTXISelLowering.cpp index b3ab9fc..5ee747a 100644 --- a/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -45,15 +45,27 @@ using namespace llvm; static unsigned int uniqueCallSite = 0; static cl::opt<bool> -RetainVectorOperands("nvptx-codegen-vectors", - cl::desc("NVPTX Specific: Retain LLVM's vectors and generate PTX vectors"), - cl::init(true)); - -static cl::opt<bool> sched4reg("nvptx-sched4reg", cl::desc("NVPTX Specific: schedule for register pressue"), cl::init(false)); +static bool IsPTXVectorType(MVT VT) { + switch (VT.SimpleTy) { + default: return false; + case MVT::v2i8: + case MVT::v4i8: + case MVT::v2i16: + case MVT::v4i16: + case MVT::v2i32: + case MVT::v4i32: + case MVT::v2i64: + case MVT::v2f32: + case MVT::v4f32: + case MVT::v2f64: + return true; + } +} + // NVPTXTargetLowering Constructor. NVPTXTargetLowering::NVPTXTargetLowering(NVPTXTargetMachine &TM) : TargetLowering(TM, new NVPTXTargetObjectFile()), @@ -63,9 +75,9 @@ NVPTXTargetLowering::NVPTXTargetLowering(NVPTXTargetMachine &TM) // always lower memset, memcpy, and memmove intrinsics to load/store // instructions, rather // then generating calls to memset, mempcy or memmove. 
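// There is no libc to call into from PTX, so a memset/memcpy libcall could
// never link; saturating these thresholds makes SelectionDAG expand every
// such intrinsic inline as load/store sequences regardless of its size.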
- maxStoresPerMemset = (unsigned)0xFFFFFFFF; - maxStoresPerMemcpy = (unsigned)0xFFFFFFFF; - maxStoresPerMemmove = (unsigned)0xFFFFFFFF; + MaxStoresPerMemset = (unsigned)0xFFFFFFFF; + MaxStoresPerMemcpy = (unsigned)0xFFFFFFFF; + MaxStoresPerMemmove = (unsigned)0xFFFFFFFF; setBooleanContents(ZeroOrNegativeOneBooleanContent); @@ -87,41 +99,6 @@ NVPTXTargetLowering::NVPTXTargetLowering(NVPTXTargetMachine &TM) addRegisterClass(MVT::f32, &NVPTX::Float32RegsRegClass); addRegisterClass(MVT::f64, &NVPTX::Float64RegsRegClass); - if (RetainVectorOperands) { - addRegisterClass(MVT::v2f32, &NVPTX::V2F32RegsRegClass); - addRegisterClass(MVT::v4f32, &NVPTX::V4F32RegsRegClass); - addRegisterClass(MVT::v2i32, &NVPTX::V2I32RegsRegClass); - addRegisterClass(MVT::v4i32, &NVPTX::V4I32RegsRegClass); - addRegisterClass(MVT::v2f64, &NVPTX::V2F64RegsRegClass); - addRegisterClass(MVT::v2i64, &NVPTX::V2I64RegsRegClass); - addRegisterClass(MVT::v2i16, &NVPTX::V2I16RegsRegClass); - addRegisterClass(MVT::v4i16, &NVPTX::V4I16RegsRegClass); - addRegisterClass(MVT::v2i8, &NVPTX::V2I8RegsRegClass); - addRegisterClass(MVT::v4i8, &NVPTX::V4I8RegsRegClass); - - setOperationAction(ISD::BUILD_VECTOR, MVT::v4i32 , Custom); - setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32 , Custom); - setOperationAction(ISD::BUILD_VECTOR, MVT::v4i16 , Custom); - setOperationAction(ISD::BUILD_VECTOR, MVT::v4i8 , Custom); - setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64 , Custom); - setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64 , Custom); - setOperationAction(ISD::BUILD_VECTOR, MVT::v2i32 , Custom); - setOperationAction(ISD::BUILD_VECTOR, MVT::v2f32 , Custom); - setOperationAction(ISD::BUILD_VECTOR, MVT::v2i16 , Custom); - setOperationAction(ISD::BUILD_VECTOR, MVT::v2i8 , Custom); - - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i32 , Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f32 , Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i16 , Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i8 , Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i64 , Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f64 , Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32 , Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32 , Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i16 , Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i8 , Custom); - } - // Operations not directly supported by NVPTX. setOperationAction(ISD::SELECT_CC, MVT::Other, Expand); setOperationAction(ISD::BR_CC, MVT::Other, Expand); @@ -191,42 +168,16 @@ NVPTXTargetLowering::NVPTXTargetLowering(NVPTXTargetMachine &TM) // TRAP can be lowered to PTX trap setOperationAction(ISD::TRAP, MVT::Other, Legal); - // By default, CONCAT_VECTORS is implemented via store/load - // through stack. It is slow and uses local memory. We need - // to custom-lowering them. 
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32 , Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32 , Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i16 , Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i8 , Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i64 , Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v2f64 , Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i32 , Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v2f32 , Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i16 , Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i8 , Custom); - - // Expand vector int to float and float to int conversions - // - For SINT_TO_FP and UINT_TO_FP, the src type - // (Node->getOperand(0).getValueType()) - // is used to determine the action, while for FP_TO_UINT and FP_TO_SINT, - // the dest type (Node->getValueType(0)) is used. - // - // See VectorLegalizer::LegalizeOp() (LegalizeVectorOps.cpp) for the vector - // case, and - // SelectionDAGLegalize::LegalizeOp() (LegalizeDAG.cpp) for the scalar case. - // - // That is why v4i32 or v2i32 are used here. - // - // The expansion for vectors happens in VectorLegalizer::LegalizeOp() - // (LegalizeVectorOps.cpp). - setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Expand); - setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Expand); - setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Expand); - setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Expand); - setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Expand); - setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Expand); - setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Expand); - setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Expand); + // Register custom handling for vector loads/stores + for (int i = MVT::FIRST_VECTOR_VALUETYPE; + i <= MVT::LAST_VECTOR_VALUETYPE; ++i) { + MVT VT = (MVT::SimpleValueType)i; + if (IsPTXVectorType(VT)) { + setOperationAction(ISD::LOAD, VT, Custom); + setOperationAction(ISD::STORE, VT, Custom); + setOperationAction(ISD::INTRINSIC_W_CHAIN, VT, Custom); + } + } // Now deduce the information based on the above mentioned // actions @@ -268,6 +219,14 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const { case NVPTXISD::RETURN: return "NVPTXISD::RETURN"; case NVPTXISD::CallSeqBegin: return "NVPTXISD::CallSeqBegin"; case NVPTXISD::CallSeqEnd: return "NVPTXISD::CallSeqEnd"; + case NVPTXISD::LoadV2: return "NVPTXISD::LoadV2"; + case NVPTXISD::LoadV4: return "NVPTXISD::LoadV4"; + case NVPTXISD::LDGV2: return "NVPTXISD::LDGV2"; + case NVPTXISD::LDGV4: return "NVPTXISD::LDGV4"; + case NVPTXISD::LDUV2: return "NVPTXISD::LDUV2"; + case NVPTXISD::LDUV4: return "NVPTXISD::LDUV4"; + case NVPTXISD::StoreV2: return "NVPTXISD::StoreV2"; + case NVPTXISD::StoreV4: return "NVPTXISD::StoreV4"; } } @@ -868,12 +827,19 @@ LowerOperation(SDValue Op, SelectionDAG &DAG) const { } +SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { + if (Op.getValueType() == MVT::i1) + return LowerLOADi1(Op, DAG); + else + return SDValue(); +} + // v = ld i1* addr // => // v1 = ld i8* addr // v = trunc v1 to i1 SDValue NVPTXTargetLowering:: -LowerLOAD(SDValue Op, SelectionDAG &DAG) const { +LowerLOADi1(SDValue Op, SelectionDAG &DAG) const { SDNode *Node = Op.getNode(); LoadSDNode *LD = cast<LoadSDNode>(Node); DebugLoc dl = Node->getDebugLoc(); @@ -893,12 +859,109 @@ LowerLOAD(SDValue Op, SelectionDAG &DAG) const { return DAG.getMergeValues(Ops, 2, dl); } +SDValue 
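// Vector memory traffic is handled in two directions from here on. Stores
// are rewritten by LowerSTORE/LowerSTOREVector into multi-operand
// StoreV2/StoreV4 target nodes; vector-typed loads are split in
// ReplaceNodeResults/ReplaceLoadVector into multi-result LoadV2/LoadV4
// nodes whose scalars are reassembled with a BUILD_VECTOR. Schematically,
// for a <2 x float> load:
//
//   (v2f32 (load p))
//     ==> {f32 a, f32 b, ch} = NVPTXISD::LoadV2 p
//     ==> (v2f32 (BUILD_VECTOR a, b)), ch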
NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { + EVT ValVT = Op.getOperand(1).getValueType(); + if (ValVT == MVT::i1) + return LowerSTOREi1(Op, DAG); + else if (ValVT.isVector()) + return LowerSTOREVector(Op, DAG); + else + return SDValue(); +} + +SDValue +NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const { + SDNode *N = Op.getNode(); + SDValue Val = N->getOperand(1); + DebugLoc DL = N->getDebugLoc(); + EVT ValVT = Val.getValueType(); + + if (ValVT.isVector()) { + // We only handle "native" vector sizes for now, e.g. <4 x double> is not + // legal. We can (and should) split that into 2 stores of <2 x double> here + // but I'm leaving that as a TODO for now. + if (!ValVT.isSimple()) + return SDValue(); + switch (ValVT.getSimpleVT().SimpleTy) { + default: return SDValue(); + case MVT::v2i8: + case MVT::v2i16: + case MVT::v2i32: + case MVT::v2i64: + case MVT::v2f32: + case MVT::v2f64: + case MVT::v4i8: + case MVT::v4i16: + case MVT::v4i32: + case MVT::v4f32: + // This is a "native" vector type + break; + } + + unsigned Opcode = 0; + EVT EltVT = ValVT.getVectorElementType(); + unsigned NumElts = ValVT.getVectorNumElements(); + + // Since StoreV2 is a target node, we cannot rely on DAG type legalization. + // Therefore, we must ensure the type is legal. For i1 and i8, we set the + // stored type to i16 and propogate the "real" type as the memory type. + bool NeedExt = false; + if (EltVT.getSizeInBits() < 16) + NeedExt = true; + + switch (NumElts) { + default: return SDValue(); + case 2: + Opcode = NVPTXISD::StoreV2; + break; + case 4: { + Opcode = NVPTXISD::StoreV4; + break; + } + } + + SmallVector<SDValue, 8> Ops; + + // First is the chain + Ops.push_back(N->getOperand(0)); + + // Then the split values + for (unsigned i = 0; i < NumElts; ++i) { + SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val, + DAG.getIntPtrConstant(i)); + if (NeedExt) + // ANY_EXTEND is correct here since the store will only look at the + // lower-order bits anyway. + ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, ExtVal); + Ops.push_back(ExtVal); + } + + // Then any remaining arguments + for (unsigned i = 2, e = N->getNumOperands(); i != e; ++i) { + Ops.push_back(N->getOperand(i)); + } + + MemSDNode *MemSD = cast<MemSDNode>(N); + + SDValue NewSt = DAG.getMemIntrinsicNode(Opcode, DL, + DAG.getVTList(MVT::Other), &Ops[0], + Ops.size(), MemSD->getMemoryVT(), + MemSD->getMemOperand()); + + + //return DCI.CombineTo(N, NewSt, true); + return NewSt; + } + + return SDValue(); +} + // st i1 v, addr // => // v1 = zxt v to i8 // st i8, addr SDValue NVPTXTargetLowering:: -LowerSTORE(SDValue Op, SelectionDAG &DAG) const { +LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const { SDNode *Node = Op.getNode(); DebugLoc dl = Node->getDebugLoc(); StoreSDNode *ST = cast<StoreSDNode>(Node); @@ -1027,9 +1090,11 @@ NVPTXTargetLowering::LowerFormalArguments(SDValue Chain, if (isABI || isKernel) { // If ABI, load from the param symbol SDValue Arg = getParamSymbol(DAG, idx); - Value *srcValue = new Argument(PointerType::get(ObjectVT.getTypeForEVT( - F->getContext()), - llvm::ADDRESS_SPACE_PARAM)); + // Conjure up a value that we can get the address space from. + // FIXME: Using a constant here is a hack. 
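// (MachinePointerInfo only needs some IR Value whose pointer type encodes
// the param address space; a null pointer constant carries that without
// leaking a heap-allocated Argument the way the previous code did.)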
+ Value *srcValue = Constant::getNullValue(PointerType::get( + ObjectVT.getTypeForEVT(F->getContext()), + llvm::ADDRESS_SPACE_PARAM)); SDValue p = DAG.getLoad(ObjectVT, dl, Root, Arg, MachinePointerInfo(srcValue), false, false, false, @@ -1346,3 +1411,242 @@ NVPTXTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, unsigned NVPTXTargetLowering::getFunctionAlignment(const Function *) const { return 4; } + +/// ReplaceVectorLoad - Convert vector loads into multi-output scalar loads. +static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG, + SmallVectorImpl<SDValue>& Results) { + EVT ResVT = N->getValueType(0); + DebugLoc DL = N->getDebugLoc(); + + assert(ResVT.isVector() && "Vector load must have vector type"); + + // We only handle "native" vector sizes for now, e.g. <4 x double> is not + // legal. We can (and should) split that into 2 loads of <2 x double> here + // but I'm leaving that as a TODO for now. + assert(ResVT.isSimple() && "Can only handle simple types"); + switch (ResVT.getSimpleVT().SimpleTy) { + default: return; + case MVT::v2i8: + case MVT::v2i16: + case MVT::v2i32: + case MVT::v2i64: + case MVT::v2f32: + case MVT::v2f64: + case MVT::v4i8: + case MVT::v4i16: + case MVT::v4i32: + case MVT::v4f32: + // This is a "native" vector type + break; + } + + EVT EltVT = ResVT.getVectorElementType(); + unsigned NumElts = ResVT.getVectorNumElements(); + + // Since LoadV2 is a target node, we cannot rely on DAG type legalization. + // Therefore, we must ensure the type is legal. For i1 and i8, we set the + // loaded type to i16 and propogate the "real" type as the memory type. + bool NeedTrunc = false; + if (EltVT.getSizeInBits() < 16) { + EltVT = MVT::i16; + NeedTrunc = true; + } + + unsigned Opcode = 0; + SDVTList LdResVTs; + + switch (NumElts) { + default: return; + case 2: + Opcode = NVPTXISD::LoadV2; + LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other); + break; + case 4: { + Opcode = NVPTXISD::LoadV4; + EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other }; + LdResVTs = DAG.getVTList(ListVTs, 5); + break; + } + } + + SmallVector<SDValue, 8> OtherOps; + + // Copy regular operands + for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) + OtherOps.push_back(N->getOperand(i)); + + LoadSDNode *LD = cast<LoadSDNode>(N); + + // The select routine does not have access to the LoadSDNode instance, so + // pass along the extension information + OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType())); + + SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, &OtherOps[0], + OtherOps.size(), LD->getMemoryVT(), + LD->getMemOperand()); + + SmallVector<SDValue, 4> ScalarRes; + + for (unsigned i = 0; i < NumElts; ++i) { + SDValue Res = NewLD.getValue(i); + if (NeedTrunc) + Res = DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res); + ScalarRes.push_back(Res); + } + + SDValue LoadChain = NewLD.getValue(NumElts); + + SDValue BuildVec = DAG.getNode(ISD::BUILD_VECTOR, DL, ResVT, &ScalarRes[0], NumElts); + + Results.push_back(BuildVec); + Results.push_back(LoadChain); +} + +static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, + SelectionDAG &DAG, + SmallVectorImpl<SDValue> &Results) { + SDValue Chain = N->getOperand(0); + SDValue Intrin = N->getOperand(1); + DebugLoc DL = N->getDebugLoc(); + + // Get the intrinsic ID + unsigned IntrinNo = cast<ConstantSDNode>(Intrin.getNode())->getZExtValue(); + switch(IntrinNo) { + default: return; + case Intrinsic::nvvm_ldg_global_i: + case Intrinsic::nvvm_ldg_global_f: + case Intrinsic::nvvm_ldg_global_p: 
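// As in ReplaceLoadVector above, vector ldg/ldu results with elements
// narrower than 16 bits are produced as i16 values (the real element type
// survives as the memory VT) and truncated back before the BUILD_VECTOR.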
+ case Intrinsic::nvvm_ldu_global_i: + case Intrinsic::nvvm_ldu_global_f: + case Intrinsic::nvvm_ldu_global_p: { + EVT ResVT = N->getValueType(0); + + if (ResVT.isVector()) { + // Vector LDG/LDU + + unsigned NumElts = ResVT.getVectorNumElements(); + EVT EltVT = ResVT.getVectorElementType(); + + // Since LDU/LDG are target nodes, we cannot rely on DAG type legalization. + // Therefore, we must ensure the type is legal. For i1 and i8, we set the + // loaded type to i16 and propogate the "real" type as the memory type. + bool NeedTrunc = false; + if (EltVT.getSizeInBits() < 16) { + EltVT = MVT::i16; + NeedTrunc = true; + } + + unsigned Opcode = 0; + SDVTList LdResVTs; + + switch (NumElts) { + default: return; + case 2: + switch(IntrinNo) { + default: return; + case Intrinsic::nvvm_ldg_global_i: + case Intrinsic::nvvm_ldg_global_f: + case Intrinsic::nvvm_ldg_global_p: + Opcode = NVPTXISD::LDGV2; + break; + case Intrinsic::nvvm_ldu_global_i: + case Intrinsic::nvvm_ldu_global_f: + case Intrinsic::nvvm_ldu_global_p: + Opcode = NVPTXISD::LDUV2; + break; + } + LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other); + break; + case 4: { + switch(IntrinNo) { + default: return; + case Intrinsic::nvvm_ldg_global_i: + case Intrinsic::nvvm_ldg_global_f: + case Intrinsic::nvvm_ldg_global_p: + Opcode = NVPTXISD::LDGV4; + break; + case Intrinsic::nvvm_ldu_global_i: + case Intrinsic::nvvm_ldu_global_f: + case Intrinsic::nvvm_ldu_global_p: + Opcode = NVPTXISD::LDUV4; + break; + } + EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other }; + LdResVTs = DAG.getVTList(ListVTs, 5); + break; + } + } + + SmallVector<SDValue, 8> OtherOps; + + // Copy regular operands + + OtherOps.push_back(Chain); // Chain + // Skip operand 1 (intrinsic ID) + // Others + for (unsigned i = 2, e = N->getNumOperands(); i != e; ++i) + OtherOps.push_back(N->getOperand(i)); + + MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N); + + SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, &OtherOps[0], + OtherOps.size(), MemSD->getMemoryVT(), + MemSD->getMemOperand()); + + SmallVector<SDValue, 4> ScalarRes; + + for (unsigned i = 0; i < NumElts; ++i) { + SDValue Res = NewLD.getValue(i); + if (NeedTrunc) + Res = DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res); + ScalarRes.push_back(Res); + } + + SDValue LoadChain = NewLD.getValue(NumElts); + + SDValue BuildVec = DAG.getNode(ISD::BUILD_VECTOR, DL, ResVT, &ScalarRes[0], NumElts); + + Results.push_back(BuildVec); + Results.push_back(LoadChain); + } else { + // i8 LDG/LDU + assert(ResVT.isSimple() && ResVT.getSimpleVT().SimpleTy == MVT::i8 && + "Custom handling of non-i8 ldu/ldg?"); + + // Just copy all operands as-is + SmallVector<SDValue, 4> Ops; + for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) + Ops.push_back(N->getOperand(i)); + + // Force output to i16 + SDVTList LdResVTs = DAG.getVTList(MVT::i16, MVT::Other); + + MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N); + + // We make sure the memory type is i8, which will be used during isel + // to select the proper instruction. 
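// (PTX ld/ldu of a .u8 value writes a 16-bit or wider register, so the
// node is re-created with an i16 result; keeping MVT::i8 as the memory
// type is what still steers isel to the 8-bit ldg/ldu variant.)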
+ SDValue NewLD = DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, + LdResVTs, &Ops[0], + Ops.size(), MVT::i8, + MemSD->getMemOperand()); + + Results.push_back(NewLD.getValue(0)); + Results.push_back(NewLD.getValue(1)); + } + } + } +} + +void NVPTXTargetLowering::ReplaceNodeResults(SDNode *N, + SmallVectorImpl<SDValue> &Results, + SelectionDAG &DAG) const { + switch (N->getOpcode()) { + default: report_fatal_error("Unhandled custom legalization"); + case ISD::LOAD: + ReplaceLoadVector(N, DAG, Results); + return; + case ISD::INTRINSIC_W_CHAIN: + ReplaceINTRINSIC_W_CHAIN(N, DAG, Results); + return; + } +} diff --git a/lib/Target/NVPTX/NVPTXISelLowering.h b/lib/Target/NVPTX/NVPTXISelLowering.h index 0a1833a..95e7b55 100644 --- a/lib/Target/NVPTX/NVPTXISelLowering.h +++ b/lib/Target/NVPTX/NVPTXISelLowering.h @@ -58,7 +58,16 @@ enum NodeType { RETURN, CallSeqBegin, CallSeqEnd, - Dummy + Dummy, + + LoadV2 = ISD::FIRST_TARGET_MEMORY_OPCODE, + LoadV4, + LDGV2, // LDG.v2 + LDGV4, // LDG.v4 + LDUV2, // LDU.v2 + LDUV4, // LDU.v4 + StoreV2, + StoreV4 }; } @@ -143,8 +152,16 @@ private: SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerLOADi1(SDValue Op, SelectionDAG &DAG) const; + + SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const; + + virtual void ReplaceNodeResults(SDNode *N, + SmallVectorImpl<SDValue> &Results, + SelectionDAG &DAG) const; }; } // namespace llvm diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.cpp b/lib/Target/NVPTX/NVPTXInstrInfo.cpp index 6fe654cb..9e73d80 100644 --- a/lib/Target/NVPTX/NVPTXInstrInfo.cpp +++ b/lib/Target/NVPTX/NVPTXInstrInfo.cpp @@ -65,46 +65,6 @@ void NVPTXInstrInfo::copyPhysReg (MachineBasicBlock &MBB, NVPTX::Float64RegsRegClass.contains(SrcReg)) BuildMI(MBB, I, DL, get(NVPTX::FMOV64rr), DestReg) .addReg(SrcReg, getKillRegState(KillSrc)); - else if (NVPTX::V4F32RegsRegClass.contains(DestReg) && - NVPTX::V4F32RegsRegClass.contains(SrcReg)) - BuildMI(MBB, I, DL, get(NVPTX::V4f32Mov), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); - else if (NVPTX::V4I32RegsRegClass.contains(DestReg) && - NVPTX::V4I32RegsRegClass.contains(SrcReg)) - BuildMI(MBB, I, DL, get(NVPTX::V4i32Mov), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); - else if (NVPTX::V2F32RegsRegClass.contains(DestReg) && - NVPTX::V2F32RegsRegClass.contains(SrcReg)) - BuildMI(MBB, I, DL, get(NVPTX::V2f32Mov), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); - else if (NVPTX::V2I32RegsRegClass.contains(DestReg) && - NVPTX::V2I32RegsRegClass.contains(SrcReg)) - BuildMI(MBB, I, DL, get(NVPTX::V2i32Mov), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); - else if (NVPTX::V4I8RegsRegClass.contains(DestReg) && - NVPTX::V4I8RegsRegClass.contains(SrcReg)) - BuildMI(MBB, I, DL, get(NVPTX::V4i8Mov), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); - else if (NVPTX::V2I8RegsRegClass.contains(DestReg) && - NVPTX::V2I8RegsRegClass.contains(SrcReg)) - BuildMI(MBB, I, DL, get(NVPTX::V2i8Mov), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); - else if (NVPTX::V4I16RegsRegClass.contains(DestReg) && - NVPTX::V4I16RegsRegClass.contains(SrcReg)) - BuildMI(MBB, I, DL, get(NVPTX::V4i16Mov), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); - else if (NVPTX::V2I16RegsRegClass.contains(DestReg) && - 
NVPTX::V2I16RegsRegClass.contains(SrcReg)) - BuildMI(MBB, I, DL, get(NVPTX::V2i16Mov), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); - else if (NVPTX::V2I64RegsRegClass.contains(DestReg) && - NVPTX::V2I64RegsRegClass.contains(SrcReg)) - BuildMI(MBB, I, DL, get(NVPTX::V2i64Mov), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); - else if (NVPTX::V2F64RegsRegClass.contains(DestReg) && - NVPTX::V2F64RegsRegClass.contains(SrcReg)) - BuildMI(MBB, I, DL, get(NVPTX::V2f64Mov), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); else { llvm_unreachable("Don't know how to copy a register"); } diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.td b/lib/Target/NVPTX/NVPTXInstrInfo.td index 8a410b8..f43abe2 100644 --- a/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -52,6 +52,7 @@ def hasAtomAddF32 : Predicate<"Subtarget.hasAtomAddF32()">; def hasVote : Predicate<"Subtarget.hasVote()">; def hasDouble : Predicate<"Subtarget.hasDouble()">; def reqPTX20 : Predicate<"Subtarget.reqPTX20()">; +def hasLDG : Predicate<"Subtarget.hasLDG()">; def hasLDU : Predicate<"Subtarget.hasLDU()">; def hasGenericLdSt : Predicate<"Subtarget.hasGenericLdSt()">; @@ -2153,11 +2154,21 @@ multiclass LD<NVPTXRegClass regclass> { i32imm:$fromWidth, Int32Regs:$addr), !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", "$fromWidth \t$dst, [$addr];"), []>; + def _areg_64 : NVPTXInst<(outs regclass:$dst), + (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, Int64Regs:$addr), + !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth", + " \t$dst, [$addr];"), []>; def _ari : NVPTXInst<(outs regclass:$dst), (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset), !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", "$fromWidth \t$dst, [$addr+$offset];"), []>; + def _ari_64 : NVPTXInst<(outs regclass:$dst), + (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset), + !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth", + " \t$dst, [$addr+$offset];"), []>; def _asi : NVPTXInst<(outs regclass:$dst), (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, i32imm:$offset), @@ -2174,19 +2185,6 @@ defm LD_f32 : LD<Float32Regs>; defm LD_f64 : LD<Float64Regs>; } -let VecInstType=isVecLD.Value, mayLoad=1, neverHasSideEffects=1 in { -defm LD_v2i8 : LD<V2I8Regs>; -defm LD_v4i8 : LD<V4I8Regs>; -defm LD_v2i16 : LD<V2I16Regs>; -defm LD_v4i16 : LD<V4I16Regs>; -defm LD_v2i32 : LD<V2I32Regs>; -defm LD_v4i32 : LD<V4I32Regs>; -defm LD_v2f32 : LD<V2F32Regs>; -defm LD_v4f32 : LD<V4F32Regs>; -defm LD_v2i64 : LD<V2I64Regs>; -defm LD_v2f64 : LD<V2F64Regs>; -} - multiclass ST<NVPTXRegClass regclass> { def _avar : NVPTXInst<(outs), (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, @@ -2198,11 +2196,21 @@ multiclass ST<NVPTXRegClass regclass> { LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr), !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth", " \t[$addr], $src;"), []>; + def _areg_64 : NVPTXInst<(outs), + (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, + LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr), + !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth ", + "\t[$addr], $src;"), []>; def _ari : NVPTXInst<(outs), (ins 
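// Suffix key for these load/store multiclasses: _avar = direct symbol,
// _asi = symbol+immediate, _ari = register+immediate, _areg = plain
// register, with a _64 twin wherever the address lives in Int64Regs for
// the 64-bit ABI.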
regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr, i32imm:$offset), !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth", " \t[$addr+$offset], $src;"), []>; + def _ari_64 : NVPTXInst<(outs), + (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, + LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr, i32imm:$offset), + !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth ", + "\t[$addr+$offset], $src;"), []>; def _asi : NVPTXInst<(outs), (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, imem:$addr, i32imm:$offset), @@ -2219,19 +2227,6 @@ defm ST_f32 : ST<Float32Regs>; defm ST_f64 : ST<Float64Regs>; } -let VecInstType=isVecST.Value, mayStore=1, neverHasSideEffects=1 in { -defm ST_v2i8 : ST<V2I8Regs>; -defm ST_v4i8 : ST<V4I8Regs>; -defm ST_v2i16 : ST<V2I16Regs>; -defm ST_v4i16 : ST<V4I16Regs>; -defm ST_v2i32 : ST<V2I32Regs>; -defm ST_v4i32 : ST<V4I32Regs>; -defm ST_v2f32 : ST<V2F32Regs>; -defm ST_v4f32 : ST<V4F32Regs>; -defm ST_v2i64 : ST<V2I64Regs>; -defm ST_v2f64 : ST<V2F64Regs>; -} - // The following is used only in and after vector elementizations. // Vector elementization happens at the machine instruction level, so the // following instruction @@ -2247,11 +2242,21 @@ multiclass LD_VEC<NVPTXRegClass regclass> { i32imm:$fromWidth, Int32Regs:$addr), !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", "$fromWidth \t{{$dst1, $dst2}}, [$addr];"), []>; + def _v2_areg_64 : NVPTXInst<(outs regclass:$dst1, regclass:$dst2), + (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, Int64Regs:$addr), + !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", + "$fromWidth \t{{$dst1, $dst2}}, [$addr];"), []>; def _v2_ari : NVPTXInst<(outs regclass:$dst1, regclass:$dst2), (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset), !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", "$fromWidth \t{{$dst1, $dst2}}, [$addr+$offset];"), []>; + def _v2_ari_64 : NVPTXInst<(outs regclass:$dst1, regclass:$dst2), + (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset), + !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", + "$fromWidth \t{{$dst1, $dst2}}, [$addr+$offset];"), []>; def _v2_asi : NVPTXInst<(outs regclass:$dst1, regclass:$dst2), (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, i32imm:$offset), @@ -2269,6 +2274,12 @@ multiclass LD_VEC<NVPTXRegClass regclass> { i32imm:$fromWidth, Int32Regs:$addr), !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", "$fromWidth \t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];"), []>; + def _v4_areg_64 : NVPTXInst<(outs regclass:$dst1, regclass:$dst2, + regclass:$dst3, regclass:$dst4), + (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, Int64Regs:$addr), + !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", + "$fromWidth \t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];"), []>; def _v4_ari : NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, @@ -2276,6 +2287,13 @@ multiclass LD_VEC<NVPTXRegClass regclass> { 
!strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", "$fromWidth \t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];"), []>; + def _v4_ari_64 : NVPTXInst<(outs regclass:$dst1, regclass:$dst2, + regclass:$dst3, regclass:$dst4), + (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset), + !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", + "$fromWidth \t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];"), + []>; def _v4_asi : NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, @@ -2304,12 +2322,23 @@ multiclass ST_VEC<NVPTXRegClass regclass> { LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr), !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", "$fromWidth \t[$addr], {{$src1, $src2}};"), []>; + def _v2_areg_64 : NVPTXInst<(outs), + (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp, + LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr), + !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", + "$fromWidth \t[$addr], {{$src1, $src2}};"), []>; def _v2_ari : NVPTXInst<(outs), (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset), !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", "$fromWidth \t[$addr+$offset], {{$src1, $src2}};"), []>; + def _v2_ari_64 : NVPTXInst<(outs), + (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp, + LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, + i32imm:$offset), + !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", + "$fromWidth \t[$addr+$offset], {{$src1, $src2}};"), []>; def _v2_asi : NVPTXInst<(outs), (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, @@ -2328,6 +2357,12 @@ multiclass ST_VEC<NVPTXRegClass regclass> { i32imm:$fromWidth, Int32Regs:$addr), !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", "$fromWidth \t[$addr], {{$src1, $src2, $src3, $src4}};"), []>; + def _v4_areg_64 : NVPTXInst<(outs), + (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, + LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, Int64Regs:$addr), + !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", + "$fromWidth \t[$addr], {{$src1, $src2, $src3, $src4}};"), []>; def _v4_ari : NVPTXInst<(outs), (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, @@ -2335,6 +2370,13 @@ multiclass ST_VEC<NVPTXRegClass regclass> { !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", "$fromWidth \t[$addr+$offset], {{$src1, $src2, $src3, $src4}};"), []>; + def _v4_ari_64 : NVPTXInst<(outs), + (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, + LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset), + !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}", + "$fromWidth \t[$addr+$offset], {{$src1, $src2, $src3, $src4}};"), + []>; def _v4_asi : NVPTXInst<(outs), (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, 
LdStCode:$Sign, @@ -2822,8 +2864,6 @@ def trapinst : NVPTXInst<(outs), (ins), "trap;", [(trap)]>; -include "NVPTXVector.td" - include "NVPTXIntrinsics.td" diff --git a/lib/Target/NVPTX/NVPTXIntrinsics.td b/lib/Target/NVPTX/NVPTXIntrinsics.td index 028a94b..49e2568 100644 --- a/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -1343,52 +1343,113 @@ defm INT_PTX_LDU_G_v4f32_ELE : VLDU_G_ELE_V4<"v4.f32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Float32Regs>; -// Vector ldu -multiclass VLDU_G<string TyStr, NVPTXRegClass regclass, Intrinsic IntOp, - NVPTXInst eleInst, NVPTXInst eleInst64> { - def _32: NVPTXVecInst<(outs regclass:$result), (ins Int32Regs:$src), - !strconcat("ldu.global.", TyStr), - [(set regclass:$result, (IntOp Int32Regs:$src))], eleInst>, - Requires<[hasLDU]>; - def _64: NVPTXVecInst<(outs regclass:$result), (ins Int64Regs:$src), - !strconcat("ldu.global.", TyStr), - [(set regclass:$result, (IntOp Int64Regs:$src))], eleInst64>, - Requires<[hasLDU]>; + +//----------------------------------- +// Support for ldg on sm_35 or later +//----------------------------------- + +def ldg_i8 : PatFrag<(ops node:$ptr), (int_nvvm_ldg_global_i node:$ptr), [{ + MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N); + return M->getMemoryVT() == MVT::i8; +}]>; + +multiclass LDG_G<string TyStr, NVPTXRegClass regclass, Intrinsic IntOp> { + def areg: NVPTXInst<(outs regclass:$result), (ins Int32Regs:$src), + !strconcat("ld.global.nc.", TyStr), + [(set regclass:$result, (IntOp Int32Regs:$src))]>, Requires<[hasLDG]>; + def areg64: NVPTXInst<(outs regclass:$result), (ins Int64Regs:$src), + !strconcat("ld.global.nc.", TyStr), + [(set regclass:$result, (IntOp Int64Regs:$src))]>, Requires<[hasLDG]>; + def avar: NVPTXInst<(outs regclass:$result), (ins imem:$src), + !strconcat("ld.global.nc.", TyStr), + [(set regclass:$result, (IntOp (Wrapper tglobaladdr:$src)))]>, + Requires<[hasLDG]>; + def ari : NVPTXInst<(outs regclass:$result), (ins MEMri:$src), + !strconcat("ld.global.nc.", TyStr), + [(set regclass:$result, (IntOp ADDRri:$src))]>, Requires<[hasLDG]>; + def ari64 : NVPTXInst<(outs regclass:$result), (ins MEMri64:$src), + !strconcat("ld.global.nc.", TyStr), + [(set regclass:$result, (IntOp ADDRri64:$src))]>, Requires<[hasLDG]>; +} + +multiclass LDG_G_NOINTRIN<string TyStr, NVPTXRegClass regclass, PatFrag IntOp> { + def areg: NVPTXInst<(outs regclass:$result), (ins Int32Regs:$src), + !strconcat("ld.global.nc.", TyStr), + [(set regclass:$result, (IntOp Int32Regs:$src))]>, Requires<[hasLDG]>; + def areg64: NVPTXInst<(outs regclass:$result), (ins Int64Regs:$src), + !strconcat("ld.global.nc.", TyStr), + [(set regclass:$result, (IntOp Int64Regs:$src))]>, Requires<[hasLDG]>; + def avar: NVPTXInst<(outs regclass:$result), (ins imem:$src), + !strconcat("ld.global.nc.", TyStr), + [(set regclass:$result, (IntOp (Wrapper tglobaladdr:$src)))]>, + Requires<[hasLDG]>; + def ari : NVPTXInst<(outs regclass:$result), (ins MEMri:$src), + !strconcat("ld.global.nc.", TyStr), + [(set regclass:$result, (IntOp ADDRri:$src))]>, Requires<[hasLDG]>; + def ari64 : NVPTXInst<(outs regclass:$result), (ins MEMri64:$src), + !strconcat("ld.global.nc.", TyStr), + [(set regclass:$result, (IntOp ADDRri64:$src))]>, Requires<[hasLDG]>; +} + +defm INT_PTX_LDG_GLOBAL_i8 + : LDG_G_NOINTRIN<"u8 \t$result, [$src];", Int16Regs, ldg_i8>; +defm INT_PTX_LDG_GLOBAL_i16 + : LDG_G<"u16 \t$result, [$src];", Int16Regs, int_nvvm_ldg_global_i>; +defm INT_PTX_LDG_GLOBAL_i32 + : LDG_G<"u32 \t$result, [$src];", Int32Regs, 
int_nvvm_ldg_global_i>; +defm INT_PTX_LDG_GLOBAL_i64 + : LDG_G<"u64 \t$result, [$src];", Int64Regs, int_nvvm_ldg_global_i>; +defm INT_PTX_LDG_GLOBAL_f32 + : LDG_G<"f32 \t$result, [$src];", Float32Regs, int_nvvm_ldg_global_f>; +defm INT_PTX_LDG_GLOBAL_f64 + : LDG_G<"f64 \t$result, [$src];", Float64Regs, int_nvvm_ldg_global_f>; +defm INT_PTX_LDG_GLOBAL_p32 + : LDG_G<"u32 \t$result, [$src];", Int32Regs, int_nvvm_ldg_global_p>; +defm INT_PTX_LDG_GLOBAL_p64 + : LDG_G<"u64 \t$result, [$src];", Int64Regs, int_nvvm_ldg_global_p>; + +// vector + +// Elementized vector ldg +multiclass VLDG_G_ELE_V2<string TyStr, NVPTXRegClass regclass> { + def _32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2), + (ins Int32Regs:$src), + !strconcat("ld.global.nc.", TyStr), []>; + def _64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2), + (ins Int64Regs:$src), + !strconcat("ld.global.nc.", TyStr), []>; } -let VecInstType=isVecLD.Value in { -defm INT_PTX_LDU_G_v2i8 : VLDU_G<"v2.u8 \t${result:vecfull}, [$src];", - V2I8Regs, int_nvvm_ldu_global_i, INT_PTX_LDU_G_v2i8_ELE_32, - INT_PTX_LDU_G_v2i8_ELE_64>; -defm INT_PTX_LDU_G_v4i8 : VLDU_G<"v4.u8 \t${result:vecfull}, [$src];", - V4I8Regs, int_nvvm_ldu_global_i, INT_PTX_LDU_G_v4i8_ELE_32, - INT_PTX_LDU_G_v4i8_ELE_64>; -defm INT_PTX_LDU_G_v2i16 : VLDU_G<"v2.u16 \t${result:vecfull}, [$src];", - V2I16Regs, int_nvvm_ldu_global_i, INT_PTX_LDU_G_v2i16_ELE_32, - INT_PTX_LDU_G_v2i16_ELE_64>; -defm INT_PTX_LDU_G_v4i16 : VLDU_G<"v4.u16 \t${result:vecfull}, [$src];", - V4I16Regs, int_nvvm_ldu_global_i, INT_PTX_LDU_G_v4i16_ELE_32, - INT_PTX_LDU_G_v4i16_ELE_64>; -defm INT_PTX_LDU_G_v2i32 : VLDU_G<"v2.u32 \t${result:vecfull}, [$src];", - V2I32Regs, int_nvvm_ldu_global_i, INT_PTX_LDU_G_v2i32_ELE_32, - INT_PTX_LDU_G_v2i32_ELE_64>; -defm INT_PTX_LDU_G_v4i32 : VLDU_G<"v4.u32 \t${result:vecfull}, [$src];", - V4I32Regs, int_nvvm_ldu_global_i, INT_PTX_LDU_G_v4i32_ELE_32, - INT_PTX_LDU_G_v4i32_ELE_64>; -defm INT_PTX_LDU_G_v2f32 : VLDU_G<"v2.f32 \t${result:vecfull}, [$src];", - V2F32Regs, int_nvvm_ldu_global_f, INT_PTX_LDU_G_v2f32_ELE_32, - INT_PTX_LDU_G_v2f32_ELE_64>; -defm INT_PTX_LDU_G_v4f32 : VLDU_G<"v4.f32 \t${result:vecfull}, [$src];", - V4F32Regs, int_nvvm_ldu_global_f, INT_PTX_LDU_G_v4f32_ELE_32, - INT_PTX_LDU_G_v4f32_ELE_64>; -defm INT_PTX_LDU_G_v2i64 : VLDU_G<"v2.u64 \t${result:vecfull}, [$src];", - V2I64Regs, int_nvvm_ldu_global_i, INT_PTX_LDU_G_v2i64_ELE_32, - INT_PTX_LDU_G_v2i64_ELE_64>; -defm INT_PTX_LDU_G_v2f64 : VLDU_G<"v2.f64 \t${result:vecfull}, [$src];", - V2F64Regs, int_nvvm_ldu_global_f, INT_PTX_LDU_G_v2f64_ELE_32, - INT_PTX_LDU_G_v2f64_ELE_64>; +multiclass VLDG_G_ELE_V4<string TyStr, NVPTXRegClass regclass> { + def _32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, + regclass:$dst3, regclass:$dst4), (ins Int32Regs:$src), + !strconcat("ld.global.nc.", TyStr), []>; + def _64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, + regclass:$dst3, regclass:$dst4), (ins Int64Regs:$src), + !strconcat("ld.global.nc.", TyStr), []>; } +// FIXME: 8-bit LDG should be fixed once LDG/LDU nodes are made into proper loads. 
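A note on the u8 forms above and below: PTX has no 8-bit register file, so the i8 flavours of ldg load with a u8 memory width into Int16Regs, and a consumer that needs a true i8 value narrows the widened result itself. A minimal sketch of that consumer side, assuming the usual SelectionDAG context; the helper name is illustrative and not part of this patch:

  // Illustrative only: consume an i8 ldg result that was materialized in a
  // 16-bit register, as INT_PTX_LDG_GLOBAL_i8 above does.
  static SDValue truncateLDGResult(SelectionDAG &DAG, DebugLoc DL, SDValue Wide) {
    // Wide carries MVT::i16; narrow it back to the i8 the IR expects.
    return DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Wide);
  }

Keeping the memory VT at i8 while the register VT is i16 is what lets the ldg_i8 PatFrag above tell the u8 form apart from the genuine 16-bit loads.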
+defm INT_PTX_LDG_G_v2i8_ELE + : VLDG_G_ELE_V2<"v2.u8 \t{{$dst1, $dst2}}, [$src];", Int16Regs>; +defm INT_PTX_LDG_G_v2i16_ELE + : VLDG_G_ELE_V2<"v2.u16 \t{{$dst1, $dst2}}, [$src];", Int16Regs>; +defm INT_PTX_LDG_G_v2i32_ELE + : VLDG_G_ELE_V2<"v2.u32 \t{{$dst1, $dst2}}, [$src];", Int32Regs>; +defm INT_PTX_LDG_G_v2f32_ELE + : VLDG_G_ELE_V2<"v2.f32 \t{{$dst1, $dst2}}, [$src];", Float32Regs>; +defm INT_PTX_LDG_G_v2i64_ELE + : VLDG_G_ELE_V2<"v2.u64 \t{{$dst1, $dst2}}, [$src];", Int64Regs>; +defm INT_PTX_LDG_G_v2f64_ELE + : VLDG_G_ELE_V2<"v2.f64 \t{{$dst1, $dst2}}, [$src];", Float64Regs>; +defm INT_PTX_LDG_G_v4i8_ELE + : VLDG_G_ELE_V4<"v4.u8 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Int16Regs>; +defm INT_PTX_LDG_G_v4i16_ELE + : VLDG_G_ELE_V4<"v4.u16 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Int16Regs>; +defm INT_PTX_LDG_G_v4i32_ELE + : VLDG_G_ELE_V4<"v4.u32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Int32Regs>; +defm INT_PTX_LDG_G_v4f32_ELE + : VLDG_G_ELE_V4<"v4.f32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Float32Regs>; multiclass NG_TO_G<string Str, Intrinsic Intrin> { diff --git a/lib/Target/NVPTX/NVPTXRegisterInfo.cpp b/lib/Target/NVPTX/NVPTXRegisterInfo.cpp index 08be917..350a2c5 100644 --- a/lib/Target/NVPTX/NVPTXRegisterInfo.cpp +++ b/lib/Target/NVPTX/NVPTXRegisterInfo.cpp @@ -54,36 +54,6 @@ std::string getNVPTXRegClassName (TargetRegisterClass const *RC) { else if (RC == &NVPTX::SpecialRegsRegClass) { return "!Special!"; } - else if (RC == &NVPTX::V2F32RegsRegClass) { - return ".v2.f32"; - } - else if (RC == &NVPTX::V4F32RegsRegClass) { - return ".v4.f32"; - } - else if (RC == &NVPTX::V2I32RegsRegClass) { - return ".v2.s32"; - } - else if (RC == &NVPTX::V4I32RegsRegClass) { - return ".v4.s32"; - } - else if (RC == &NVPTX::V2F64RegsRegClass) { - return ".v2.f64"; - } - else if (RC == &NVPTX::V2I64RegsRegClass) { - return ".v2.s64"; - } - else if (RC == &NVPTX::V2I16RegsRegClass) { - return ".v2.s16"; - } - else if (RC == &NVPTX::V4I16RegsRegClass) { - return ".v4.s16"; - } - else if (RC == &NVPTX::V2I8RegsRegClass) { - return ".v2.s16"; - } - else if (RC == &NVPTX::V4I8RegsRegClass) { - return ".v4.s16"; - } else { return "INTERNAL"; } @@ -115,137 +85,11 @@ std::string getNVPTXRegClassStr (TargetRegisterClass const *RC) { else if (RC == &NVPTX::SpecialRegsRegClass) { return "!Special!"; } - else if (RC == &NVPTX::V2F32RegsRegClass) { - return "%v2f"; - } - else if (RC == &NVPTX::V4F32RegsRegClass) { - return "%v4f"; - } - else if (RC == &NVPTX::V2I32RegsRegClass) { - return "%v2r"; - } - else if (RC == &NVPTX::V4I32RegsRegClass) { - return "%v4r"; - } - else if (RC == &NVPTX::V2F64RegsRegClass) { - return "%v2fd"; - } - else if (RC == &NVPTX::V2I64RegsRegClass) { - return "%v2rd"; - } - else if (RC == &NVPTX::V2I16RegsRegClass) { - return "%v2s"; - } - else if (RC == &NVPTX::V4I16RegsRegClass) { - return "%v4rs"; - } - else if (RC == &NVPTX::V2I8RegsRegClass) { - return "%v2rc"; - } - else if (RC == &NVPTX::V4I8RegsRegClass) { - return "%v4rc"; - } else { return "INTERNAL"; } return ""; } - -bool isNVPTXVectorRegClass(TargetRegisterClass const *RC) { - if (RC->getID() == NVPTX::V2F32RegsRegClassID) - return true; - if (RC->getID() == NVPTX::V2F64RegsRegClassID) - return true; - if (RC->getID() == NVPTX::V2I16RegsRegClassID) - return true; - if (RC->getID() == NVPTX::V2I32RegsRegClassID) - return true; - if (RC->getID() == NVPTX::V2I64RegsRegClassID) - return true; - if (RC->getID() == NVPTX::V2I8RegsRegClassID) - return true; - if (RC->getID() == NVPTX::V4F32RegsRegClassID) - 
return true; - if (RC->getID() == NVPTX::V4I16RegsRegClassID) - return true; - if (RC->getID() == NVPTX::V4I32RegsRegClassID) - return true; - if (RC->getID() == NVPTX::V4I8RegsRegClassID) - return true; - return false; -} - -std::string getNVPTXElemClassName(TargetRegisterClass const *RC) { - if (RC->getID() == NVPTX::V2F32RegsRegClassID) - return getNVPTXRegClassName(&NVPTX::Float32RegsRegClass); - if (RC->getID() == NVPTX::V2F64RegsRegClassID) - return getNVPTXRegClassName(&NVPTX::Float64RegsRegClass); - if (RC->getID() == NVPTX::V2I16RegsRegClassID) - return getNVPTXRegClassName(&NVPTX::Int16RegsRegClass); - if (RC->getID() == NVPTX::V2I32RegsRegClassID) - return getNVPTXRegClassName(&NVPTX::Int32RegsRegClass); - if (RC->getID() == NVPTX::V2I64RegsRegClassID) - return getNVPTXRegClassName(&NVPTX::Int64RegsRegClass); - if (RC->getID() == NVPTX::V2I8RegsRegClassID) - return getNVPTXRegClassName(&NVPTX::Int8RegsRegClass); - if (RC->getID() == NVPTX::V4F32RegsRegClassID) - return getNVPTXRegClassName(&NVPTX::Float32RegsRegClass); - if (RC->getID() == NVPTX::V4I16RegsRegClassID) - return getNVPTXRegClassName(&NVPTX::Int16RegsRegClass); - if (RC->getID() == NVPTX::V4I32RegsRegClassID) - return getNVPTXRegClassName(&NVPTX::Int32RegsRegClass); - if (RC->getID() == NVPTX::V4I8RegsRegClassID) - return getNVPTXRegClassName(&NVPTX::Int8RegsRegClass); - llvm_unreachable("Not a vector register class"); -} - -const TargetRegisterClass *getNVPTXElemClass(TargetRegisterClass const *RC) { - if (RC->getID() == NVPTX::V2F32RegsRegClassID) - return (&NVPTX::Float32RegsRegClass); - if (RC->getID() == NVPTX::V2F64RegsRegClassID) - return (&NVPTX::Float64RegsRegClass); - if (RC->getID() == NVPTX::V2I16RegsRegClassID) - return (&NVPTX::Int16RegsRegClass); - if (RC->getID() == NVPTX::V2I32RegsRegClassID) - return (&NVPTX::Int32RegsRegClass); - if (RC->getID() == NVPTX::V2I64RegsRegClassID) - return (&NVPTX::Int64RegsRegClass); - if (RC->getID() == NVPTX::V2I8RegsRegClassID) - return (&NVPTX::Int8RegsRegClass); - if (RC->getID() == NVPTX::V4F32RegsRegClassID) - return (&NVPTX::Float32RegsRegClass); - if (RC->getID() == NVPTX::V4I16RegsRegClassID) - return (&NVPTX::Int16RegsRegClass); - if (RC->getID() == NVPTX::V4I32RegsRegClassID) - return (&NVPTX::Int32RegsRegClass); - if (RC->getID() == NVPTX::V4I8RegsRegClassID) - return (&NVPTX::Int8RegsRegClass); - llvm_unreachable("Not a vector register class"); -} - -int getNVPTXVectorSize(TargetRegisterClass const *RC) { - if (RC->getID() == NVPTX::V2F32RegsRegClassID) - return 2; - if (RC->getID() == NVPTX::V2F64RegsRegClassID) - return 2; - if (RC->getID() == NVPTX::V2I16RegsRegClassID) - return 2; - if (RC->getID() == NVPTX::V2I32RegsRegClassID) - return 2; - if (RC->getID() == NVPTX::V2I64RegsRegClassID) - return 2; - if (RC->getID() == NVPTX::V2I8RegsRegClassID) - return 2; - if (RC->getID() == NVPTX::V4F32RegsRegClassID) - return 4; - if (RC->getID() == NVPTX::V4I16RegsRegClassID) - return 4; - if (RC->getID() == NVPTX::V4I32RegsRegClassID) - return 4; - if (RC->getID() == NVPTX::V4I8RegsRegClassID) - return 4; - llvm_unreachable("Not a vector register class"); -} } NVPTXRegisterInfo::NVPTXRegisterInfo(const TargetInstrInfo &tii, @@ -277,30 +121,22 @@ BitVector NVPTXRegisterInfo::getReservedRegs(const MachineFunction &MF) const { void NVPTXRegisterInfo:: eliminateFrameIndex(MachineBasicBlock::iterator II, - int SPAdj, + int SPAdj, unsigned FIOperandNum, RegScavenger *RS) const { assert(SPAdj == 0 && "Unexpected"); - unsigned i = 0; MachineInstr &MI = *II; - while 
(!MI.getOperand(i).isFI()) { - ++i; - assert(i < MI.getNumOperands() && - "Instr doesn't have FrameIndex operand!"); - } - - int FrameIndex = MI.getOperand(i).getIndex(); + int FrameIndex = MI.getOperand(FIOperandNum).getIndex(); MachineFunction &MF = *MI.getParent()->getParent(); int Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex) + - MI.getOperand(i+1).getImm(); + MI.getOperand(FIOperandNum+1).getImm(); // Using I0 as the frame pointer - MI.getOperand(i).ChangeToRegister(NVPTX::VRFrame, false); - MI.getOperand(i+1).ChangeToImmediate(Offset); + MI.getOperand(FIOperandNum).ChangeToRegister(NVPTX::VRFrame, false); + MI.getOperand(FIOperandNum+1).ChangeToImmediate(Offset); } - int NVPTXRegisterInfo:: getDwarfRegNum(unsigned RegNum, bool isEH) const { return 0; @@ -314,12 +150,3 @@ unsigned NVPTXRegisterInfo::getRARegister() const { return 0; } -// This function eliminates ADJCALLSTACKDOWN, -// ADJCALLSTACKUP pseudo instructions -void NVPTXRegisterInfo:: -eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, - MachineBasicBlock::iterator I) const { - // Simply discard ADJCALLSTACKDOWN, - // ADJCALLSTACKUP instructions. - MBB.erase(I); -} diff --git a/lib/Target/NVPTX/NVPTXRegisterInfo.h b/lib/Target/NVPTX/NVPTXRegisterInfo.h index 5951783..69f73f2 100644 --- a/lib/Target/NVPTX/NVPTXRegisterInfo.h +++ b/lib/Target/NVPTX/NVPTXRegisterInfo.h @@ -55,13 +55,9 @@ public: virtual BitVector getReservedRegs(const MachineFunction &MF) const; virtual void eliminateFrameIndex(MachineBasicBlock::iterator MI, - int SPAdj, + int SPAdj, unsigned FIOperandNum, RegScavenger *RS=NULL) const; - void eliminateCallFramePseudoInstr(MachineFunction &MF, - MachineBasicBlock &MBB, - MachineBasicBlock::iterator I) const; - virtual int getDwarfRegNum(unsigned RegNum, bool isEH) const; virtual unsigned getFrameRegister(const MachineFunction &MF) const; virtual unsigned getRARegister() const; @@ -81,10 +77,6 @@ public: std::string getNVPTXRegClassName (const TargetRegisterClass *RC); std::string getNVPTXRegClassStr (const TargetRegisterClass *RC); -bool isNVPTXVectorRegClass (const TargetRegisterClass *RC); -std::string getNVPTXElemClassName (const TargetRegisterClass *RC); -int getNVPTXVectorSize (const TargetRegisterClass *RC); -const TargetRegisterClass *getNVPTXElemClass(const TargetRegisterClass *RC); } // end namespace llvm diff --git a/lib/Target/NVPTX/NVPTXRegisterInfo.td b/lib/Target/NVPTX/NVPTXRegisterInfo.td index ba15825..8d100d6 100644 --- a/lib/Target/NVPTX/NVPTXRegisterInfo.td +++ b/lib/Target/NVPTX/NVPTXRegisterInfo.td @@ -37,9 +37,6 @@ foreach i = 0-395 in { def RL#i : NVPTXReg<"%rl"#i>; // 64-bit def F#i : NVPTXReg<"%f"#i>; // 32-bit float def FL#i : NVPTXReg<"%fl"#i>; // 64-bit float - // Vectors - foreach s = [ "2b8", "2b16", "2b32", "2b64", "4b8", "4b16", "4b32" ] in - def v#s#_#i : NVPTXReg<"%v"#s#"_"#i>; // Arguments def ia#i : NVPTXReg<"%ia"#i>; @@ -65,44 +62,3 @@ def Float64ArgRegs : NVPTXRegClass<[f64], 64, (add (sequence "da%u", 0, 395))>; // Read NVPTXRegisterInfo.cpp to see how VRFrame and VRDepot are used. 
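VRFrame is the register that the rewritten eliminateFrameIndex above substitutes for every frame index, and the per-target operand-scan loop could go away because generic frame-index rewriting now locates the operand once and hands its position down as FIOperandNum. A minimal sketch of that caller side, assuming a PrologEpilogInserter-style walk; simplified and illustrative, not code from this patch:

  // Illustrative only: the generic side finds the frame-index operand and
  // passes its slot number to the target hook, so targets stop re-scanning.
  static void rewriteFrameIndices(MachineBasicBlock &MBB,
                                  const TargetRegisterInfo &TRI,
                                  RegScavenger *RS) {
    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E; ++I)
      for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i)
        if (I->getOperand(i).isFI()) {
          TRI.eliminateFrameIndex(I, /*SPAdj=*/0, /*FIOperandNum=*/i, RS);
          break;  // assumes at most one frame index per instruction
        }
  }

The sketch skips bookkeeping such as re-visiting instructions whose operand list changed under the hook.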
def SpecialRegs : NVPTXRegClass<[i32], 32, (add VRFrame, VRDepot)>; - -class NVPTXVecRegClass<list<ValueType> regTypes, int alignment, dag regList, - NVPTXRegClass sClass, - int e, - string n> - : NVPTXRegClass<regTypes, alignment, regList> -{ - NVPTXRegClass scalarClass=sClass; - int elems=e; - string name=n; -} -def V2F32Regs - : NVPTXVecRegClass<[v2f32], 64, (add (sequence "v2b32_%u", 0, 395)), - Float32Regs, 2, ".v2.f32">; -def V4F32Regs - : NVPTXVecRegClass<[v4f32], 128, (add (sequence "v4b32_%u", 0, 395)), - Float32Regs, 4, ".v4.f32">; -def V2I32Regs - : NVPTXVecRegClass<[v2i32], 64, (add (sequence "v2b32_%u", 0, 395)), - Int32Regs, 2, ".v2.u32">; -def V4I32Regs - : NVPTXVecRegClass<[v4i32], 128, (add (sequence "v4b32_%u", 0, 395)), - Int32Regs, 4, ".v4.u32">; -def V2F64Regs - : NVPTXVecRegClass<[v2f64], 128, (add (sequence "v2b64_%u", 0, 395)), - Float64Regs, 2, ".v2.f64">; -def V2I64Regs - : NVPTXVecRegClass<[v2i64], 128, (add (sequence "v2b64_%u", 0, 395)), - Int64Regs, 2, ".v2.u64">; -def V2I16Regs - : NVPTXVecRegClass<[v2i16], 32, (add (sequence "v2b16_%u", 0, 395)), - Int16Regs, 2, ".v2.u16">; -def V4I16Regs - : NVPTXVecRegClass<[v4i16], 64, (add (sequence "v4b16_%u", 0, 395)), - Int16Regs, 4, ".v4.u16">; -def V2I8Regs - : NVPTXVecRegClass<[v2i8], 16, (add (sequence "v2b8_%u", 0, 395)), - Int8Regs, 2, ".v2.u8">; -def V4I8Regs - : NVPTXVecRegClass<[v4i8], 32, (add (sequence "v4b8_%u", 0, 395)), - Int8Regs, 4, ".v4.u8">; diff --git a/lib/Target/NVPTX/NVPTXSubtarget.h b/lib/Target/NVPTX/NVPTXSubtarget.h index e6cb7c2..beea77e 100644 --- a/lib/Target/NVPTX/NVPTXSubtarget.h +++ b/lib/Target/NVPTX/NVPTXSubtarget.h @@ -57,6 +57,7 @@ public: bool hasF32FTZ() const { return SmVersion >= 20; } bool hasFMAF32() const { return SmVersion >= 20; } bool hasFMAF64() const { return SmVersion >= 13; } + bool hasLDG() const { return SmVersion >= 32; } bool hasLDU() const { return SmVersion >= 20; } bool hasGenericLdSt() const { return SmVersion >= 20; } inline bool hasHWROT32() const { return false; } diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/lib/Target/NVPTX/NVPTXTargetMachine.cpp index b4e049e..cd765fa 100644 --- a/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ b/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -123,7 +123,6 @@ bool NVPTXPassConfig::addInstSelector() { addPass(createSplitBBatBarPass()); addPass(createAllocaHoisting()); addPass(createNVPTXISelDag(getNVPTXTargetMachine(), getOptLevel())); - addPass(createVectorElementizePass(getNVPTXTargetMachine())); return false; } diff --git a/lib/Target/NVPTX/VectorElementize.cpp b/lib/Target/NVPTX/VectorElementize.cpp deleted file mode 100644 index f1b285d..0000000 --- a/lib/Target/NVPTX/VectorElementize.cpp +++ /dev/null @@ -1,1239 +0,0 @@ -//===-- VectorElementize.cpp - Remove unreachable blocks for codegen --===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This pass converts operations on vector types to operations on their -// element types. -// -// For generic binary and unary vector instructions, the conversion is simple. -// Suppose we have -// av = bv Vop cv -// where av, bv, and cv are vector virtual registers, and Vop is a vector op. -// This gets converted to the following : -// a1 = b1 Sop c1 -// a2 = b2 Sop c2 -// -// VectorToScalarMap maintains the vector vreg to scalar vreg mapping. 
-// For the above example, the map will look as follows: -// av => [a1, a2] -// bv => [b1, b2] -// -// In addition, initVectorInfo creates the following opcode->opcode map. -// Vop => Sop -// OtherVop => OtherSop -// ... -// -// For vector specific instructions like vecbuild, vecshuffle etc, the -// conversion is different. Look at comments near the functions with -// prefix createVec<...>. -// -//===----------------------------------------------------------------------===// - -#include "NVPTX.h" -#include "NVPTXTargetMachine.h" -#include "llvm/ADT/DepthFirstIterator.h" -#include "llvm/ADT/SmallPtrSet.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineModuleInfo.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/Passes.h" -#include "llvm/IR/Constant.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/Type.h" -#include "llvm/Pass.h" -#include "llvm/Support/CFG.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Compiler.h" -#include "llvm/Target/TargetInstrInfo.h" - -using namespace llvm; - -namespace { - -class LLVM_LIBRARY_VISIBILITY VectorElementize : public MachineFunctionPass { - virtual bool runOnMachineFunction(MachineFunction &F); - - NVPTXTargetMachine &TM; - MachineRegisterInfo *MRI; - const NVPTXRegisterInfo *RegInfo; - const NVPTXInstrInfo *InstrInfo; - - llvm::DenseMap<const TargetRegisterClass *, const TargetRegisterClass *> - RegClassMap; - llvm::DenseMap<unsigned, bool> SimpleMoveMap; - - llvm::DenseMap<unsigned, SmallVector<unsigned, 4> > VectorToScalarMap; - - bool isVectorInstr(MachineInstr *); - - SmallVector<unsigned, 4> getScalarRegisters(unsigned); - unsigned getScalarVersion(unsigned); - unsigned getScalarVersion(MachineInstr *); - - bool isVectorRegister(unsigned); - const TargetRegisterClass *getScalarRegClass(const TargetRegisterClass *RC); - unsigned numCopiesNeeded(MachineInstr *); - - void createLoadCopy(MachineFunction&, MachineInstr *, - std::vector<MachineInstr *>&); - void createStoreCopy(MachineFunction&, MachineInstr *, - std::vector<MachineInstr *>&); - - void createVecDest(MachineFunction&, MachineInstr *, - std::vector<MachineInstr *>&); - - void createCopies(MachineFunction&, MachineInstr *, - std::vector<MachineInstr *>&); - - unsigned copyProp(MachineFunction&); - unsigned removeDeadMoves(MachineFunction&); - - void elementize(MachineFunction&); - - bool isSimpleMove(MachineInstr *); - - void createVecShuffle(MachineFunction& F, MachineInstr *Instr, - std::vector<MachineInstr *>& copies); - - void createVecExtract(MachineFunction& F, MachineInstr *Instr, - std::vector<MachineInstr *>& copies); - - void createVecInsert(MachineFunction& F, MachineInstr *Instr, - std::vector<MachineInstr *>& copies); - - void createVecBuild(MachineFunction& F, MachineInstr *Instr, - std::vector<MachineInstr *>& copies); - -public: - - static char ID; // Pass identification, replacement for typeid - VectorElementize(NVPTXTargetMachine &tm) - : MachineFunctionPass(ID), TM(tm) {} - - virtual const char *getPassName() const { - return "Convert LLVM vector types to their element types"; - } -}; - -char VectorElementize::ID = 1; -} - -static cl::opt<bool> -RemoveRedundantMoves("nvptx-remove-redundant-moves", - cl::desc("NVPTX: Remove redundant moves introduced by vector lowering"), - cl::init(true)); - -#define VECINST(x) ((((x)->getDesc().TSFlags) & NVPTX::VecInstTypeMask) \ - >> NVPTX::VecInstTypeShift) -#define ISVECINST(x) 
(VECINST(x) != NVPTX::VecNOP) -#define ISVECLOAD(x) (VECINST(x) == NVPTX::VecLoad) -#define ISVECSTORE(x) (VECINST(x) == NVPTX::VecStore) -#define ISVECBUILD(x) (VECINST(x) == NVPTX::VecBuild) -#define ISVECSHUFFLE(x) (VECINST(x) == NVPTX::VecShuffle) -#define ISVECEXTRACT(x) (VECINST(x) == NVPTX::VecExtract) -#define ISVECINSERT(x) (VECINST(x) == NVPTX::VecInsert) -#define ISVECDEST(x) (VECINST(x) == NVPTX::VecDest) - -bool VectorElementize::isSimpleMove(MachineInstr *mi) { - if (mi->isCopy()) - return true; - unsigned TSFlags = (mi->getDesc().TSFlags & NVPTX::SimpleMoveMask) - >> NVPTX::SimpleMoveShift; - return (TSFlags == 1); -} - -bool VectorElementize::isVectorInstr(MachineInstr *mi) { - if ((mi->getOpcode() == NVPTX::PHI) || - (mi->getOpcode() == NVPTX::IMPLICIT_DEF) || mi->isCopy()) { - MachineOperand dest = mi->getOperand(0); - return isVectorRegister(dest.getReg()); - } - return ISVECINST(mi); -} - -unsigned VectorElementize::getScalarVersion(MachineInstr *mi) { - return getScalarVersion(mi->getOpcode()); -} - -///============================================================================= -///Instr is assumed to be a vector instruction. For most vector instructions, -///the size of the destination vector register gives the number of scalar copies -///needed. For VecStore, size of getOperand(1) gives the number of scalar copies -///needed. For VecExtract, the dest is a scalar. So getOperand(1) gives the -///number of scalar copies needed. -///============================================================================= -unsigned VectorElementize::numCopiesNeeded(MachineInstr *Instr) { - unsigned numDefs=0; - unsigned def; - for (unsigned i=0, e=Instr->getNumOperands(); i!=e; ++i) { - MachineOperand oper = Instr->getOperand(i); - - if (!oper.isReg()) continue; - if (!oper.isDef()) continue; - def = i; - numDefs++; - } - assert((numDefs <= 1) && "Only 0 or 1 defs supported"); - - if (numDefs == 1) { - unsigned regnum = Instr->getOperand(def).getReg(); - if (ISVECEXTRACT(Instr)) - regnum = Instr->getOperand(1).getReg(); - return getNVPTXVectorSize(MRI->getRegClass(regnum)); - } - else if (numDefs == 0) { - assert(ISVECSTORE(Instr) - && "Only 0 def instruction supported is vector store"); - - unsigned regnum = Instr->getOperand(0).getReg(); - return getNVPTXVectorSize(MRI->getRegClass(regnum)); - } - return 1; -} - -const TargetRegisterClass *VectorElementize:: -getScalarRegClass(const TargetRegisterClass *RC) { - assert(isNVPTXVectorRegClass(RC) && - "Not a vector register class"); - return getNVPTXElemClass(RC); -} - -bool VectorElementize::isVectorRegister(unsigned reg) { - const TargetRegisterClass *RC=MRI->getRegClass(reg); - return isNVPTXVectorRegClass(RC); -} - -///============================================================================= -///For every vector register 'v' that is not already in the VectorToScalarMap, -///create n scalar registers of the corresponding element type, where n -///is 2 or 4 (getNVPTXVectorSize) and add it VectorToScalarMap. -///============================================================================= -SmallVector<unsigned, 4> VectorElementize::getScalarRegisters(unsigned regnum) { - assert(isVectorRegister(regnum) && "Expecting a vector register here"); - // Create the scalar registers and put them in the map, if not already there. 
- if (VectorToScalarMap.find(regnum) == VectorToScalarMap.end()) { - const TargetRegisterClass *vecClass = MRI->getRegClass(regnum); - const TargetRegisterClass *scalarClass = getScalarRegClass(vecClass); - - SmallVector<unsigned, 4> temp; - - for (unsigned i=0, e=getNVPTXVectorSize(vecClass); i!=e; ++i) - temp.push_back(MRI->createVirtualRegister(scalarClass)); - - VectorToScalarMap[regnum] = temp; - } - return VectorToScalarMap[regnum]; -} - -///============================================================================= -///For a vector load of the form -///va <= ldv2 [addr] -///the following multi output instruction is created : -///[v1, v2] <= LD [addr] -///Look at NVPTXVector.td for the definitions of multi output loads. -///============================================================================= -void VectorElementize::createLoadCopy(MachineFunction& F, MachineInstr *Instr, - std::vector<MachineInstr *>& copies) { - copies.push_back(F.CloneMachineInstr(Instr)); - - MachineInstrBuilder copy(F, copies[0]); - copy->setDesc(InstrInfo->get(getScalarVersion(copy))); - - // Remove the dest, that should be a vector operand. - MachineOperand dest = copy->getOperand(0); - unsigned regnum = dest.getReg(); - - SmallVector<unsigned, 4> scalarRegs = getScalarRegisters(regnum); - copy->RemoveOperand(0); - - std::vector<MachineOperand> otherOperands; - for (unsigned i=0, e=copy->getNumOperands(); i!=e; ++i) - otherOperands.push_back(copy->getOperand(i)); - - for (unsigned i=0, e=copy->getNumOperands(); i!=e; ++i) - copy->RemoveOperand(0); - - for (unsigned i=0, e=scalarRegs.size(); i!=e; ++i) - copy.addReg(scalarRegs[i], RegState::Define); - - for (unsigned i=0, e=otherOperands.size(); i!=e; ++i) - copy.addOperand(otherOperands[i]); - -} - -///============================================================================= -///For a vector store of the form -///stv2 va, [addr] -///the following multi input instruction is created : -///ST v1, v2, [addr] -///Look at NVPTXVector.td for the definitions of multi input stores. -///============================================================================= -void VectorElementize::createStoreCopy(MachineFunction& F, MachineInstr *Instr, - std::vector<MachineInstr *>& copies) { - copies.push_back(F.CloneMachineInstr(Instr)); - - MachineInstrBuilder copy(F, copies[0]); - copy->setDesc(InstrInfo->get(getScalarVersion(copy))); - - MachineOperand src = copy->getOperand(0); - unsigned regnum = src.getReg(); - - SmallVector<unsigned, 4> scalarRegs = getScalarRegisters(regnum); - copy->RemoveOperand(0); - - std::vector<MachineOperand> otherOperands; - for (unsigned i=0, e=copy->getNumOperands(); i!=e; ++i) - otherOperands.push_back(copy->getOperand(i)); - - for (unsigned i=0, e=copy->getNumOperands(); i!=e; ++i) - copy->RemoveOperand(0); - - for (unsigned i=0, e=scalarRegs.size(); i!=e; ++i) - copy.addReg(scalarRegs[i]); - - for (unsigned i=0, e=otherOperands.size(); i!=e; ++i) - copy.addOperand(otherOperands[i]); -} - -///============================================================================= -///va <= shufflev2 vb, vc, <i1>, <i2> -///gets converted to 2 moves into a1 and a2. The source of the moves depend on -///i1 and i2. i1, i2 can belong to the set {0, 1, 2, 3} for shufflev2. For -///shufflev4 the set is {0,..7}. 
For example, if i1=3, i2=0, the move -///instructions will be -///a1 <= c2 -///a2 <= b1 -///============================================================================= -void VectorElementize::createVecShuffle(MachineFunction& F, MachineInstr *Instr, - std::vector<MachineInstr *>& copies) { - unsigned numcopies=numCopiesNeeded(Instr); - - unsigned destregnum = Instr->getOperand(0).getReg(); - unsigned src1regnum = Instr->getOperand(1).getReg(); - unsigned src2regnum = Instr->getOperand(2).getReg(); - - SmallVector<unsigned, 4> dest = getScalarRegisters(destregnum); - SmallVector<unsigned, 4> src1 = getScalarRegisters(src1regnum); - SmallVector<unsigned, 4> src2 = getScalarRegisters(src2regnum); - - DebugLoc DL = Instr->getDebugLoc(); - - for (unsigned i=0; i<numcopies; i++) { - MachineInstrBuilder copy = - BuildMI(F, DL, InstrInfo->get(getScalarVersion(Instr)), dest[i]); - MachineOperand which=Instr->getOperand(3+i); - assert(which.isImm() && "Shuffle operand not a constant"); - - int src=which.getImm(); - int elem=src%numcopies; - - if (which.getImm() < numcopies) - copy.addReg(src1[elem]); - else - copy.addReg(src2[elem]); - copies.push_back(copy); - } -} - -///============================================================================= -///a <= extractv2 va, <i1> -///gets turned into a simple move to the scalar register a. The source depends -///on i1. -///============================================================================= -void VectorElementize::createVecExtract(MachineFunction& F, MachineInstr *Instr, - std::vector<MachineInstr *>& copies) { - unsigned srcregnum = Instr->getOperand(1).getReg(); - - SmallVector<unsigned, 4> src = getScalarRegisters(srcregnum); - - MachineOperand which = Instr->getOperand(2); - assert(which.isImm() && "Extract operand not a constant"); - - DebugLoc DL = Instr->getDebugLoc(); - copies.push_back(BuildMI(F, DL, InstrInfo->get(getScalarVersion(Instr)), - Instr->getOperand(0).getReg()) - .addReg(src[which.getImm()])); -} - -///============================================================================= -///va <= vecinsertv2 vb, c, <i1> -///This instruction copies all elements of vb to va, except the 'i1'th element. -///The scalar value c becomes the 'i1'th element of va. -///This gets translated to 2 (4 for vecinsertv4) moves. 
-///============================================================================= -void VectorElementize::createVecInsert(MachineFunction& F, MachineInstr *Instr, - std::vector<MachineInstr *>& copies) { - unsigned numcopies=numCopiesNeeded(Instr); - - unsigned destregnum = Instr->getOperand(0).getReg(); - unsigned srcregnum = Instr->getOperand(1).getReg(); - - SmallVector<unsigned, 4> dest = getScalarRegisters(destregnum); - SmallVector<unsigned, 4> src = getScalarRegisters(srcregnum); - - MachineOperand which=Instr->getOperand(3); - assert(which.isImm() && "Insert operand not a constant"); - unsigned int elem=which.getImm(); - - DebugLoc DL = Instr->getDebugLoc(); - - for (unsigned i=0; i<numcopies; i++) { - MachineInstrBuilder copy = - BuildMI(F, DL, InstrInfo->get(getScalarVersion(Instr)), dest[i]); - - if (i != elem) - copy.addReg(src[i]); - else - copy.addOperand(Instr->getOperand(2)); - - copies.push_back(copy); - } - -} - -///============================================================================= -///va <= buildv2 b1, b2 -///gets translated to -///a1 <= b1 -///a2 <= b2 -///============================================================================= -void VectorElementize::createVecBuild(MachineFunction& F, MachineInstr *Instr, - std::vector<MachineInstr *>& copies) { - unsigned numcopies=numCopiesNeeded(Instr); - - unsigned destregnum = Instr->getOperand(0).getReg(); - - SmallVector<unsigned, 4> dest = getScalarRegisters(destregnum); - - DebugLoc DL = Instr->getDebugLoc(); - - for (unsigned i=0; i<numcopies; i++) - copies.push_back(BuildMI(F, DL, InstrInfo->get(getScalarVersion(Instr)), - dest[i]) - .addOperand(Instr->getOperand(1+i))); -} - -///============================================================================= -///For a tex inst of the form -///va <= op [scalar operands] -///the following multi output instruction is created : -///[v1, v2] <= op' [scalar operands] -///============================================================================= -void VectorElementize::createVecDest(MachineFunction& F, MachineInstr *Instr, - std::vector<MachineInstr *>& copies) { - copies.push_back(F.CloneMachineInstr(Instr)); - - MachineInstrBuilder copy(F, copies[0]); - copy->setDesc(InstrInfo->get(getScalarVersion(copy))); - - // Remove the dest, that should be a vector operand. - MachineOperand dest = copy->getOperand(0); - unsigned regnum = dest.getReg(); - - SmallVector<unsigned, 4> scalarRegs = getScalarRegisters(regnum); - copy->RemoveOperand(0); - - std::vector<MachineOperand> otherOperands; - for (unsigned i=0, e=copy->getNumOperands(); i!=e; ++i) - otherOperands.push_back(copy->getOperand(i)); - - for (unsigned i=0, e=copy->getNumOperands(); i!=e; ++i) - copy->RemoveOperand(0); - - for (unsigned i=0, e=scalarRegs.size(); i!=e; ++i) - copy.addReg(scalarRegs[i], RegState::Define); - - for (unsigned i=0, e=otherOperands.size(); i!=e; ++i) - copy.addOperand(otherOperands[i]); -} - -///============================================================================= -///Look at the vector instruction type and dispatch to the createVec<...> -///function that creates the scalar copies. 
-///============================================================================= -void VectorElementize::createCopies(MachineFunction& F, MachineInstr *Instr, - std::vector<MachineInstr *>& copies) { - if (ISVECLOAD(Instr)) { - createLoadCopy(F, Instr, copies); - return; - } - if (ISVECSTORE(Instr)) { - createStoreCopy(F, Instr, copies); - return; - } - if (ISVECSHUFFLE(Instr)) { - createVecShuffle(F, Instr, copies); - return; - } - if (ISVECEXTRACT(Instr)) { - createVecExtract(F, Instr, copies); - return; - } - if (ISVECINSERT(Instr)) { - createVecInsert(F, Instr, copies); - return; - } - if (ISVECDEST(Instr)) { - createVecDest(F, Instr, copies); - return; - } - if (ISVECBUILD(Instr)) { - createVecBuild(F, Instr, copies); - return; - } - - unsigned numcopies=numCopiesNeeded(Instr); - - for (unsigned i=0; i<numcopies; ++i) - copies.push_back(F.CloneMachineInstr(Instr)); - - for (unsigned i=0; i<numcopies; ++i) { - MachineInstrBuilder copy(F, copies[i]); - - std::vector<MachineOperand> allOperands; - std::vector<bool> isDef; - - for (unsigned j=0, e=copy->getNumOperands(); j!=e; ++j) { - MachineOperand oper = copy->getOperand(j); - allOperands.push_back(oper); - if (oper.isReg()) - isDef.push_back(oper.isDef()); - else - isDef.push_back(false); - } - - for (unsigned j=0, e=copy->getNumOperands(); j!=e; ++j) - copy->RemoveOperand(0); - - copy->setDesc(InstrInfo->get(getScalarVersion(Instr))); - - for (unsigned j=0, e=allOperands.size(); j!=e; ++j) { - MachineOperand oper=allOperands[j]; - if (oper.isReg()) { - unsigned regnum = oper.getReg(); - if (isVectorRegister(regnum)) { - - SmallVector<unsigned, 4> scalarRegs = getScalarRegisters(regnum); - copy.addReg(scalarRegs[i], getDefRegState(isDef[j])); - } - else - copy.addOperand(oper); - } - else - copy.addOperand(oper); - } - } -} - -///============================================================================= -///Scan through all basic blocks, looking for vector instructions. -///For each vector instruction I, insert the scalar copies before I, and -///add I into toRemove vector. Finally remove all instructions in toRemove. -///============================================================================= -void VectorElementize::elementize(MachineFunction &F) { - for (MachineFunction::reverse_iterator BI=F.rbegin(), BE=F.rend(); - BI!=BE; ++BI) { - MachineBasicBlock *BB = &*BI; - - std::vector<MachineInstr *> copies; - std::vector<MachineInstr *> toRemove; - - for (MachineBasicBlock::iterator II=BB->begin(), IE=BB->end(); - II!=IE; ++II) { - MachineInstr *Instr = &*II; - - if (!isVectorInstr(Instr)) - continue; - - copies.clear(); - createCopies(F, Instr, copies); - for (unsigned i=0, e=copies.size(); i!=e; ++i) - BB->insert(II, copies[i]); - - assert((copies.size() > 0) && "Problem in createCopies"); - toRemove.push_back(Instr); - } - for (unsigned i=0, e=toRemove.size(); i!=e; ++i) - F.DeleteMachineInstr(toRemove[i]->getParent()->remove(toRemove[i])); - } -} - -///============================================================================= -///a <= b -///... -///... -///x <= op(a, ...) -///gets converted to -/// -///x <= op(b, ...) -///The original move is still present. This works on SSA form machine code. -///Note that a <= b should be a simple vreg-to-vreg move instruction. -///TBD : I didn't find a function that can do replaceOperand, so I remove -///all operands and add all of them again, replacing the one while adding. 
-///============================================================================= -unsigned VectorElementize::copyProp(MachineFunction &F) { - unsigned numReplacements = 0; - - for (MachineFunction::reverse_iterator BI=F.rbegin(), BE=F.rend(); BI!=BE; - ++BI) { - MachineBasicBlock *BB = &*BI; - - for (MachineBasicBlock::iterator II=BB->begin(), IE=BB->end(); II!=IE; - ++II) { - MachineInstr *Instr = &*II; - - // Don't do copy propagation on PHI as it will cause unnecessary - // live range overlap. - if ((Instr->getOpcode() == TargetOpcode::PHI) || - (Instr->getOpcode() == TargetOpcode::DBG_VALUE)) - continue; - - bool needsReplacement = false; - - for (unsigned i=0, e=Instr->getNumOperands(); i!=e; ++i) { - MachineOperand oper = Instr->getOperand(i); - if (!oper.isReg()) continue; - if (oper.isDef()) continue; - if (!RegInfo->isVirtualRegister(oper.getReg())) continue; - - MachineInstr *defInstr = MRI->getVRegDef(oper.getReg()); - - if (!defInstr) continue; - - if (!isSimpleMove(defInstr)) continue; - - MachineOperand defSrc = defInstr->getOperand(1); - if (!defSrc.isReg()) continue; - if (!RegInfo->isVirtualRegister(defSrc.getReg())) continue; - - needsReplacement = true; - - } - if (!needsReplacement) continue; - - numReplacements++; - - std::vector<MachineOperand> operands; - - for (unsigned i=0, e=Instr->getNumOperands(); i!=e; ++i) { - MachineOperand oper = Instr->getOperand(i); - bool flag = false; - do { - if (!(oper.isReg())) - break; - if (oper.isDef()) - break; - if (!(RegInfo->isVirtualRegister(oper.getReg()))) - break; - MachineInstr *defInstr = MRI->getVRegDef(oper.getReg()); - if (!(isSimpleMove(defInstr))) - break; - MachineOperand defSrc = defInstr->getOperand(1); - if (!(defSrc.isReg())) - break; - if (!(RegInfo->isVirtualRegister(defSrc.getReg()))) - break; - operands.push_back(defSrc); - flag = true; - } while (0); - if (flag == false) - operands.push_back(oper); - } - - for (unsigned i=0, e=Instr->getNumOperands(); i!=e; ++i) - Instr->RemoveOperand(0); - for (unsigned i=0, e=operands.size(); i!=e; ++i) - Instr->addOperand(F, operands[i]); - - } - } - return numReplacements; -} - -///============================================================================= -///Look for simple vreg-to-vreg instructions whose use_empty() is true, add -///them to deadMoves vector. Then remove all instructions in deadMoves. -///============================================================================= -unsigned VectorElementize::removeDeadMoves(MachineFunction &F) { - std::vector<MachineInstr *> deadMoves; - for (MachineFunction::reverse_iterator BI=F.rbegin(), BE=F.rend(); BI!=BE; - ++BI) { - MachineBasicBlock *BB = &*BI; - - for (MachineBasicBlock::iterator II=BB->begin(), IE=BB->end(); II!=IE; - ++II) { - MachineInstr *Instr = &*II; - - if (!isSimpleMove(Instr)) continue; - - MachineOperand dest = Instr->getOperand(0); - assert(dest.isReg() && "dest of move not a register"); - assert(RegInfo->isVirtualRegister(dest.getReg()) && - "dest of move not a virtual register"); - - if (MRI->use_empty(dest.getReg())) { - deadMoves.push_back(Instr); - } - } - } - - for (unsigned i=0, e=deadMoves.size(); i!=e; ++i) - F.DeleteMachineInstr(deadMoves[i]->getParent()->remove(deadMoves[i])); - - return deadMoves.size(); -} - -///============================================================================= -///Main function for this pass. 
-///=============================================================================
-bool VectorElementize::runOnMachineFunction(MachineFunction &F) {
-  MRI = &F.getRegInfo();
-
-  RegInfo = TM.getRegisterInfo();
-  InstrInfo = TM.getInstrInfo();
-
-  VectorToScalarMap.clear();
-
-  elementize(F);
-
-  if (RemoveRedundantMoves)
-    while (1) {
-      if (copyProp(F) == 0) break;
-      removeDeadMoves(F);
-    }
-
-  return true;
-}
-
-FunctionPass *llvm::createVectorElementizePass(NVPTXTargetMachine &tm) {
-  return new VectorElementize(tm);
-}
-
-unsigned VectorElementize::getScalarVersion(unsigned opcode) {
-  if (opcode == NVPTX::PHI)
-    return opcode;
-  if (opcode == NVPTX::IMPLICIT_DEF)
-    return opcode;
-  switch(opcode) {
-  default: llvm_unreachable("Scalar version not set, fix NVPTXVector.td");
-  case TargetOpcode::COPY: return TargetOpcode::COPY;
-  case NVPTX::AddCCCV2I32: return NVPTX::ADDCCCi32rr;
-  case NVPTX::AddCCCV4I32: return NVPTX::ADDCCCi32rr;
-  case NVPTX::AddCCV2I32: return NVPTX::ADDCCi32rr;
-  case NVPTX::AddCCV4I32: return NVPTX::ADDCCi32rr;
-  case NVPTX::Build_Vector2_f32: return NVPTX::FMOV32rr;
-  case NVPTX::Build_Vector2_f64: return NVPTX::FMOV64rr;
-  case NVPTX::Build_Vector2_i16: return NVPTX::IMOV16rr;
-  case NVPTX::Build_Vector2_i32: return NVPTX::IMOV32rr;
-  case NVPTX::Build_Vector2_i64: return NVPTX::IMOV64rr;
-  case NVPTX::Build_Vector2_i8: return NVPTX::IMOV8rr;
-  case NVPTX::Build_Vector4_f32: return NVPTX::FMOV32rr;
-  case NVPTX::Build_Vector4_i16: return NVPTX::IMOV16rr;
-  case NVPTX::Build_Vector4_i32: return NVPTX::IMOV32rr;
-  case NVPTX::Build_Vector4_i8: return NVPTX::IMOV8rr;
-  case NVPTX::CVTv2i16tov2i32: return NVPTX::Zint_extendext16to32;
-  case NVPTX::CVTv2i64tov2i32: return NVPTX::TRUNC_64to32;
-  case NVPTX::CVTv2i8tov2i32: return NVPTX::Zint_extendext8to32;
-  case NVPTX::CVTv4i16tov4i32: return NVPTX::Zint_extendext16to32;
-  case NVPTX::CVTv4i8tov4i32: return NVPTX::Zint_extendext8to32;
-  case NVPTX::F32MAD_ftzV2: return NVPTX::FMAD32_ftzrrr;
-  case NVPTX::F32MADV2: return NVPTX::FMAD32rrr;
-  case NVPTX::F32MAD_ftzV4: return NVPTX::FMAD32_ftzrrr;
-  case NVPTX::F32MADV4: return NVPTX::FMAD32rrr;
-  case NVPTX::F32FMA_ftzV2: return NVPTX::FMA32_ftzrrr;
-  case NVPTX::F32FMAV2: return NVPTX::FMA32rrr;
-  case NVPTX::F32FMA_ftzV4: return NVPTX::FMA32_ftzrrr;
-  case NVPTX::F32FMAV4: return NVPTX::FMA32rrr;
-  case NVPTX::F64FMAV2: return NVPTX::FMA64rrr;
-  case NVPTX::FVecEQV2F32: return NVPTX::FSetEQf32rr_toi32;
-  case NVPTX::FVecEQV2F64: return NVPTX::FSetEQf64rr_toi64;
-  case NVPTX::FVecEQV4F32: return NVPTX::FSetEQf32rr_toi32;
-  case NVPTX::FVecGEV2F32: return NVPTX::FSetGEf32rr_toi32;
-  case NVPTX::FVecGEV2F64: return NVPTX::FSetGEf64rr_toi64;
-  case NVPTX::FVecGEV4F32: return NVPTX::FSetGEf32rr_toi32;
-  case NVPTX::FVecGTV2F32: return NVPTX::FSetGTf32rr_toi32;
-  case NVPTX::FVecGTV2F64: return NVPTX::FSetGTf64rr_toi64;
-  case NVPTX::FVecGTV4F32: return NVPTX::FSetGTf32rr_toi32;
-  case NVPTX::FVecLEV2F32: return NVPTX::FSetLEf32rr_toi32;
-  case NVPTX::FVecLEV2F64: return NVPTX::FSetLEf64rr_toi64;
-  case NVPTX::FVecLEV4F32: return NVPTX::FSetLEf32rr_toi32;
-  case NVPTX::FVecLTV2F32: return NVPTX::FSetLTf32rr_toi32;
-  case NVPTX::FVecLTV2F64: return NVPTX::FSetLTf64rr_toi64;
-  case NVPTX::FVecLTV4F32: return NVPTX::FSetLTf32rr_toi32;
-  case NVPTX::FVecNANV2F32: return NVPTX::FSetNANf32rr_toi32;
-  case NVPTX::FVecNANV2F64: return NVPTX::FSetNANf64rr_toi64;
-  case NVPTX::FVecNANV4F32: return NVPTX::FSetNANf32rr_toi32;
-  case NVPTX::FVecNEV2F32: return NVPTX::FSetNEf32rr_toi32;
-  case NVPTX::FVecNEV2F64: return NVPTX::FSetNEf64rr_toi64;
-  case NVPTX::FVecNEV4F32: return NVPTX::FSetNEf32rr_toi32;
-  case NVPTX::FVecNUMV2F32: return NVPTX::FSetNUMf32rr_toi32;
-  case NVPTX::FVecNUMV2F64: return NVPTX::FSetNUMf64rr_toi64;
-  case NVPTX::FVecNUMV4F32: return NVPTX::FSetNUMf32rr_toi32;
-  case NVPTX::FVecUEQV2F32: return NVPTX::FSetUEQf32rr_toi32;
-  case NVPTX::FVecUEQV2F64: return NVPTX::FSetUEQf64rr_toi64;
-  case NVPTX::FVecUEQV4F32: return NVPTX::FSetUEQf32rr_toi32;
-  case NVPTX::FVecUGEV2F32: return NVPTX::FSetUGEf32rr_toi32;
-  case NVPTX::FVecUGEV2F64: return NVPTX::FSetUGEf64rr_toi64;
-  case NVPTX::FVecUGEV4F32: return NVPTX::FSetUGEf32rr_toi32;
-  case NVPTX::FVecUGTV2F32: return NVPTX::FSetUGTf32rr_toi32;
-  case NVPTX::FVecUGTV2F64: return NVPTX::FSetUGTf64rr_toi64;
-  case NVPTX::FVecUGTV4F32: return NVPTX::FSetUGTf32rr_toi32;
-  case NVPTX::FVecULEV2F32: return NVPTX::FSetULEf32rr_toi32;
-  case NVPTX::FVecULEV2F64: return NVPTX::FSetULEf64rr_toi64;
-  case NVPTX::FVecULEV4F32: return NVPTX::FSetULEf32rr_toi32;
-  case NVPTX::FVecULTV2F32: return NVPTX::FSetULTf32rr_toi32;
-  case NVPTX::FVecULTV2F64: return NVPTX::FSetULTf64rr_toi64;
-  case NVPTX::FVecULTV4F32: return NVPTX::FSetULTf32rr_toi32;
-  case NVPTX::FVecUNEV2F32: return NVPTX::FSetUNEf32rr_toi32;
-  case NVPTX::FVecUNEV2F64: return NVPTX::FSetUNEf64rr_toi64;
-  case NVPTX::FVecUNEV4F32: return NVPTX::FSetUNEf32rr_toi32;
-  case NVPTX::I16MADV2: return NVPTX::MAD16rrr;
-  case NVPTX::I16MADV4: return NVPTX::MAD16rrr;
-  case NVPTX::I32MADV2: return NVPTX::MAD32rrr;
-  case NVPTX::I32MADV4: return NVPTX::MAD32rrr;
-  case NVPTX::I64MADV2: return NVPTX::MAD64rrr;
-  case NVPTX::I8MADV2: return NVPTX::MAD8rrr;
-  case NVPTX::I8MADV4: return NVPTX::MAD8rrr;
-  case NVPTX::ShiftLV2I16: return NVPTX::SHLi16rr;
-  case NVPTX::ShiftLV2I32: return NVPTX::SHLi32rr;
-  case NVPTX::ShiftLV2I64: return NVPTX::SHLi64rr;
-  case NVPTX::ShiftLV2I8: return NVPTX::SHLi8rr;
-  case NVPTX::ShiftLV4I16: return NVPTX::SHLi16rr;
-  case NVPTX::ShiftLV4I32: return NVPTX::SHLi32rr;
-  case NVPTX::ShiftLV4I8: return NVPTX::SHLi8rr;
-  case NVPTX::ShiftRAV2I16: return NVPTX::SRAi16rr;
-  case NVPTX::ShiftRAV2I32: return NVPTX::SRAi32rr;
-  case NVPTX::ShiftRAV2I64: return NVPTX::SRAi64rr;
-  case NVPTX::ShiftRAV2I8: return NVPTX::SRAi8rr;
-  case NVPTX::ShiftRAV4I16: return NVPTX::SRAi16rr;
-  case NVPTX::ShiftRAV4I32: return NVPTX::SRAi32rr;
-  case NVPTX::ShiftRAV4I8: return NVPTX::SRAi8rr;
-  case NVPTX::ShiftRLV2I16: return NVPTX::SRLi16rr;
-  case NVPTX::ShiftRLV2I32: return NVPTX::SRLi32rr;
-  case NVPTX::ShiftRLV2I64: return NVPTX::SRLi64rr;
-  case NVPTX::ShiftRLV2I8: return NVPTX::SRLi8rr;
-  case NVPTX::ShiftRLV4I16: return NVPTX::SRLi16rr;
-  case NVPTX::ShiftRLV4I32: return NVPTX::SRLi32rr;
-  case NVPTX::ShiftRLV4I8: return NVPTX::SRLi8rr;
-  case NVPTX::SubCCCV2I32: return NVPTX::SUBCCCi32rr;
-  case NVPTX::SubCCCV4I32: return NVPTX::SUBCCCi32rr;
-  case NVPTX::SubCCV2I32: return NVPTX::SUBCCi32rr;
-  case NVPTX::SubCCV4I32: return NVPTX::SUBCCi32rr;
-  case NVPTX::V2F32Div_prec_ftz: return NVPTX::FDIV32rr_prec_ftz;
-  case NVPTX::V2F32Div_prec: return NVPTX::FDIV32rr_prec;
-  case NVPTX::V2F32Div_ftz: return NVPTX::FDIV32rr_ftz;
-  case NVPTX::V2F32Div: return NVPTX::FDIV32rr;
-  case NVPTX::V2F32_Select: return NVPTX::SELECTf32rr;
-  case NVPTX::V2F64Div: return NVPTX::FDIV64rr;
-  case NVPTX::V2F64_Select: return NVPTX::SELECTf64rr;
-  case NVPTX::V2I16_Select: return NVPTX::SELECTi16rr;
-  case NVPTX::V2I32_Select: return NVPTX::SELECTi32rr;
-  case NVPTX::V2I64_Select: return NVPTX::SELECTi64rr;
-  case NVPTX::V2I8_Select: return NVPTX::SELECTi8rr;
-  case NVPTX::V2f32Extract: return NVPTX::FMOV32rr;
-  case NVPTX::V2f32Insert: return NVPTX::FMOV32rr;
-  case NVPTX::V2f32Mov: return NVPTX::FMOV32rr;
-  case NVPTX::V2f64Extract: return NVPTX::FMOV64rr;
-  case NVPTX::V2f64Insert: return NVPTX::FMOV64rr;
-  case NVPTX::V2f64Mov: return NVPTX::FMOV64rr;
-  case NVPTX::V2i16Extract: return NVPTX::IMOV16rr;
-  case NVPTX::V2i16Insert: return NVPTX::IMOV16rr;
-  case NVPTX::V2i16Mov: return NVPTX::IMOV16rr;
-  case NVPTX::V2i32Extract: return NVPTX::IMOV32rr;
-  case NVPTX::V2i32Insert: return NVPTX::IMOV32rr;
-  case NVPTX::V2i32Mov: return NVPTX::IMOV32rr;
-  case NVPTX::V2i64Extract: return NVPTX::IMOV64rr;
-  case NVPTX::V2i64Insert: return NVPTX::IMOV64rr;
-  case NVPTX::V2i64Mov: return NVPTX::IMOV64rr;
-  case NVPTX::V2i8Extract: return NVPTX::IMOV8rr;
-  case NVPTX::V2i8Insert: return NVPTX::IMOV8rr;
-  case NVPTX::V2i8Mov: return NVPTX::IMOV8rr;
-  case NVPTX::V4F32Div_prec_ftz: return NVPTX::FDIV32rr_prec_ftz;
-  case NVPTX::V4F32Div_prec: return NVPTX::FDIV32rr_prec;
-  case NVPTX::V4F32Div_ftz: return NVPTX::FDIV32rr_ftz;
-  case NVPTX::V4F32Div: return NVPTX::FDIV32rr;
-  case NVPTX::V4F32_Select: return NVPTX::SELECTf32rr;
-  case NVPTX::V4I16_Select: return NVPTX::SELECTi16rr;
-  case NVPTX::V4I32_Select: return NVPTX::SELECTi32rr;
-  case NVPTX::V4I8_Select: return NVPTX::SELECTi8rr;
-  case NVPTX::V4f32Extract: return NVPTX::FMOV32rr;
-  case NVPTX::V4f32Insert: return NVPTX::FMOV32rr;
-  case NVPTX::V4f32Mov: return NVPTX::FMOV32rr;
-  case NVPTX::V4i16Extract: return NVPTX::IMOV16rr;
-  case NVPTX::V4i16Insert: return NVPTX::IMOV16rr;
-  case NVPTX::V4i16Mov: return NVPTX::IMOV16rr;
-  case NVPTX::V4i32Extract: return NVPTX::IMOV32rr;
-  case NVPTX::V4i32Insert: return NVPTX::IMOV32rr;
-  case NVPTX::V4i32Mov: return NVPTX::IMOV32rr;
-  case NVPTX::V4i8Extract: return NVPTX::IMOV8rr;
-  case NVPTX::V4i8Insert: return NVPTX::IMOV8rr;
-  case NVPTX::V4i8Mov: return NVPTX::IMOV8rr;
-  case NVPTX::VAddV2I16: return NVPTX::ADDi16rr;
-  case NVPTX::VAddV2I32: return NVPTX::ADDi32rr;
-  case NVPTX::VAddV2I64: return NVPTX::ADDi64rr;
-  case NVPTX::VAddV2I8: return NVPTX::ADDi8rr;
-  case NVPTX::VAddV4I16: return NVPTX::ADDi16rr;
-  case NVPTX::VAddV4I32: return NVPTX::ADDi32rr;
-  case NVPTX::VAddV4I8: return NVPTX::ADDi8rr;
-  case NVPTX::VAddfV2F32: return NVPTX::FADDf32rr;
-  case NVPTX::VAddfV2F32_ftz: return NVPTX::FADDf32rr_ftz;
-  case NVPTX::VAddfV2F64: return NVPTX::FADDf64rr;
-  case NVPTX::VAddfV4F32: return NVPTX::FADDf32rr;
-  case NVPTX::VAddfV4F32_ftz: return NVPTX::FADDf32rr_ftz;
-  case NVPTX::VAndV2I16: return NVPTX::ANDb16rr;
-  case NVPTX::VAndV2I32: return NVPTX::ANDb32rr;
-  case NVPTX::VAndV2I64: return NVPTX::ANDb64rr;
-  case NVPTX::VAndV2I8: return NVPTX::ANDb8rr;
-  case NVPTX::VAndV4I16: return NVPTX::ANDb16rr;
-  case NVPTX::VAndV4I32: return NVPTX::ANDb32rr;
-  case NVPTX::VAndV4I8: return NVPTX::ANDb8rr;
-  case NVPTX::VMulfV2F32_ftz: return NVPTX::FMULf32rr_ftz;
-  case NVPTX::VMulfV2F32: return NVPTX::FMULf32rr;
-  case NVPTX::VMulfV2F64: return NVPTX::FMULf64rr;
-  case NVPTX::VMulfV4F32_ftz: return NVPTX::FMULf32rr_ftz;
-  case NVPTX::VMulfV4F32: return NVPTX::FMULf32rr;
-  case NVPTX::VMultHSV2I16: return NVPTX::MULTHSi16rr;
-  case NVPTX::VMultHSV2I32: return NVPTX::MULTHSi32rr;
-  case NVPTX::VMultHSV2I64: return NVPTX::MULTHSi64rr;
-  case NVPTX::VMultHSV2I8: return NVPTX::MULTHSi8rr;
-  case NVPTX::VMultHSV4I16: return NVPTX::MULTHSi16rr;
-  case NVPTX::VMultHSV4I32: return NVPTX::MULTHSi32rr;
-  case NVPTX::VMultHSV4I8: return NVPTX::MULTHSi8rr;
-  case NVPTX::VMultHUV2I16: return NVPTX::MULTHUi16rr;
-  case NVPTX::VMultHUV2I32: return NVPTX::MULTHUi32rr;
-  case NVPTX::VMultHUV2I64: return NVPTX::MULTHUi64rr;
-  case NVPTX::VMultHUV2I8: return NVPTX::MULTHUi8rr;
-  case NVPTX::VMultHUV4I16: return NVPTX::MULTHUi16rr;
-  case NVPTX::VMultHUV4I32: return NVPTX::MULTHUi32rr;
-  case NVPTX::VMultHUV4I8: return NVPTX::MULTHUi8rr;
-  case NVPTX::VMultV2I16: return NVPTX::MULTi16rr;
-  case NVPTX::VMultV2I32: return NVPTX::MULTi32rr;
-  case NVPTX::VMultV2I64: return NVPTX::MULTi64rr;
-  case NVPTX::VMultV2I8: return NVPTX::MULTi8rr;
-  case NVPTX::VMultV4I16: return NVPTX::MULTi16rr;
-  case NVPTX::VMultV4I32: return NVPTX::MULTi32rr;
-  case NVPTX::VMultV4I8: return NVPTX::MULTi8rr;
-  case NVPTX::VNegV2I16: return NVPTX::INEG16;
-  case NVPTX::VNegV2I32: return NVPTX::INEG32;
-  case NVPTX::VNegV2I64: return NVPTX::INEG64;
-  case NVPTX::VNegV2I8: return NVPTX::INEG8;
-  case NVPTX::VNegV4I16: return NVPTX::INEG16;
-  case NVPTX::VNegV4I32: return NVPTX::INEG32;
-  case NVPTX::VNegV4I8: return NVPTX::INEG8;
-  case NVPTX::VNegv2f32: return NVPTX::FNEGf32;
-  case NVPTX::VNegv2f32_ftz: return NVPTX::FNEGf32_ftz;
-  case NVPTX::VNegv2f64: return NVPTX::FNEGf64;
-  case NVPTX::VNegv4f32: return NVPTX::FNEGf32;
-  case NVPTX::VNegv4f32_ftz: return NVPTX::FNEGf32_ftz;
-  case NVPTX::VNotV2I16: return NVPTX::NOT16;
-  case NVPTX::VNotV2I32: return NVPTX::NOT32;
-  case NVPTX::VNotV2I64: return NVPTX::NOT64;
-  case NVPTX::VNotV2I8: return NVPTX::NOT8;
-  case NVPTX::VNotV4I16: return NVPTX::NOT16;
-  case NVPTX::VNotV4I32: return NVPTX::NOT32;
-  case NVPTX::VNotV4I8: return NVPTX::NOT8;
-  case NVPTX::VOrV2I16: return NVPTX::ORb16rr;
-  case NVPTX::VOrV2I32: return NVPTX::ORb32rr;
-  case NVPTX::VOrV2I64: return NVPTX::ORb64rr;
-  case NVPTX::VOrV2I8: return NVPTX::ORb8rr;
-  case NVPTX::VOrV4I16: return NVPTX::ORb16rr;
-  case NVPTX::VOrV4I32: return NVPTX::ORb32rr;
-  case NVPTX::VOrV4I8: return NVPTX::ORb8rr;
-  case NVPTX::VSDivV2I16: return NVPTX::SDIVi16rr;
-  case NVPTX::VSDivV2I32: return NVPTX::SDIVi32rr;
-  case NVPTX::VSDivV2I64: return NVPTX::SDIVi64rr;
-  case NVPTX::VSDivV2I8: return NVPTX::SDIVi8rr;
-  case NVPTX::VSDivV4I16: return NVPTX::SDIVi16rr;
-  case NVPTX::VSDivV4I32: return NVPTX::SDIVi32rr;
-  case NVPTX::VSDivV4I8: return NVPTX::SDIVi8rr;
-  case NVPTX::VSRemV2I16: return NVPTX::SREMi16rr;
-  case NVPTX::VSRemV2I32: return NVPTX::SREMi32rr;
-  case NVPTX::VSRemV2I64: return NVPTX::SREMi64rr;
-  case NVPTX::VSRemV2I8: return NVPTX::SREMi8rr;
-  case NVPTX::VSRemV4I16: return NVPTX::SREMi16rr;
-  case NVPTX::VSRemV4I32: return NVPTX::SREMi32rr;
-  case NVPTX::VSRemV4I8: return NVPTX::SREMi8rr;
-  case NVPTX::VSubV2I16: return NVPTX::SUBi16rr;
-  case NVPTX::VSubV2I32: return NVPTX::SUBi32rr;
-  case NVPTX::VSubV2I64: return NVPTX::SUBi64rr;
-  case NVPTX::VSubV2I8: return NVPTX::SUBi8rr;
-  case NVPTX::VSubV4I16: return NVPTX::SUBi16rr;
-  case NVPTX::VSubV4I32: return NVPTX::SUBi32rr;
-  case NVPTX::VSubV4I8: return NVPTX::SUBi8rr;
-  case NVPTX::VSubfV2F32_ftz: return NVPTX::FSUBf32rr_ftz;
-  case NVPTX::VSubfV2F32: return NVPTX::FSUBf32rr;
-  case NVPTX::VSubfV2F64: return NVPTX::FSUBf64rr;
-  case NVPTX::VSubfV4F32_ftz: return NVPTX::FSUBf32rr_ftz;
-  case NVPTX::VSubfV4F32: return NVPTX::FSUBf32rr;
-  case NVPTX::VUDivV2I16: return NVPTX::UDIVi16rr;
-  case NVPTX::VUDivV2I32: return NVPTX::UDIVi32rr;
-  case NVPTX::VUDivV2I64: return NVPTX::UDIVi64rr;
-  case NVPTX::VUDivV2I8: return NVPTX::UDIVi8rr;
-  case NVPTX::VUDivV4I16: return NVPTX::UDIVi16rr;
-  case NVPTX::VUDivV4I32: return NVPTX::UDIVi32rr;
-  case NVPTX::VUDivV4I8: return NVPTX::UDIVi8rr;
-  case NVPTX::VURemV2I16: return NVPTX::UREMi16rr;
-  case NVPTX::VURemV2I32: return NVPTX::UREMi32rr;
-  case NVPTX::VURemV2I64: return NVPTX::UREMi64rr;
-  case NVPTX::VURemV2I8: return NVPTX::UREMi8rr;
-  case NVPTX::VURemV4I16: return NVPTX::UREMi16rr;
-  case NVPTX::VURemV4I32: return NVPTX::UREMi32rr;
-  case NVPTX::VURemV4I8: return NVPTX::UREMi8rr;
-  case NVPTX::VXorV2I16: return NVPTX::XORb16rr;
-  case NVPTX::VXorV2I32: return NVPTX::XORb32rr;
-  case NVPTX::VXorV2I64: return NVPTX::XORb64rr;
-  case NVPTX::VXorV2I8: return NVPTX::XORb8rr;
-  case NVPTX::VXorV4I16: return NVPTX::XORb16rr;
-  case NVPTX::VXorV4I32: return NVPTX::XORb32rr;
-  case NVPTX::VXorV4I8: return NVPTX::XORb8rr;
-  case NVPTX::VecSEQV2I16: return NVPTX::ISetSEQi16rr_toi16;
-  case NVPTX::VecSEQV2I32: return NVPTX::ISetSEQi32rr_toi32;
-  case NVPTX::VecSEQV2I64: return NVPTX::ISetSEQi64rr_toi64;
-  case NVPTX::VecSEQV2I8: return NVPTX::ISetSEQi8rr_toi8;
-  case NVPTX::VecSEQV4I16: return NVPTX::ISetSEQi16rr_toi16;
-  case NVPTX::VecSEQV4I32: return NVPTX::ISetSEQi32rr_toi32;
-  case NVPTX::VecSEQV4I8: return NVPTX::ISetSEQi8rr_toi8;
-  case NVPTX::VecSGEV2I16: return NVPTX::ISetSGEi16rr_toi16;
-  case NVPTX::VecSGEV2I32: return NVPTX::ISetSGEi32rr_toi32;
-  case NVPTX::VecSGEV2I64: return NVPTX::ISetSGEi64rr_toi64;
-  case NVPTX::VecSGEV2I8: return NVPTX::ISetSGEi8rr_toi8;
-  case NVPTX::VecSGEV4I16: return NVPTX::ISetSGEi16rr_toi16;
-  case NVPTX::VecSGEV4I32: return NVPTX::ISetSGEi32rr_toi32;
-  case NVPTX::VecSGEV4I8: return NVPTX::ISetSGEi8rr_toi8;
-  case NVPTX::VecSGTV2I16: return NVPTX::ISetSGTi16rr_toi16;
-  case NVPTX::VecSGTV2I32: return NVPTX::ISetSGTi32rr_toi32;
-  case NVPTX::VecSGTV2I64: return NVPTX::ISetSGTi64rr_toi64;
-  case NVPTX::VecSGTV2I8: return NVPTX::ISetSGTi8rr_toi8;
-  case NVPTX::VecSGTV4I16: return NVPTX::ISetSGTi16rr_toi16;
-  case NVPTX::VecSGTV4I32: return NVPTX::ISetSGTi32rr_toi32;
-  case NVPTX::VecSGTV4I8: return NVPTX::ISetSGTi8rr_toi8;
-  case NVPTX::VecSLEV2I16: return NVPTX::ISetSLEi16rr_toi16;
-  case NVPTX::VecSLEV2I32: return NVPTX::ISetSLEi32rr_toi32;
-  case NVPTX::VecSLEV2I64: return NVPTX::ISetSLEi64rr_toi64;
-  case NVPTX::VecSLEV2I8: return NVPTX::ISetSLEi8rr_toi8;
-  case NVPTX::VecSLEV4I16: return NVPTX::ISetSLEi16rr_toi16;
-  case NVPTX::VecSLEV4I32: return NVPTX::ISetSLEi32rr_toi32;
-  case NVPTX::VecSLEV4I8: return NVPTX::ISetSLEi8rr_toi8;
-  case NVPTX::VecSLTV2I16: return NVPTX::ISetSLTi16rr_toi16;
-  case NVPTX::VecSLTV2I32: return NVPTX::ISetSLTi32rr_toi32;
-  case NVPTX::VecSLTV2I64: return NVPTX::ISetSLTi64rr_toi64;
-  case NVPTX::VecSLTV2I8: return NVPTX::ISetSLTi8rr_toi8;
-  case NVPTX::VecSLTV4I16: return NVPTX::ISetSLTi16rr_toi16;
-  case NVPTX::VecSLTV4I32: return NVPTX::ISetSLTi32rr_toi32;
-  case NVPTX::VecSLTV4I8: return NVPTX::ISetSLTi8rr_toi8;
-  case NVPTX::VecSNEV2I16: return NVPTX::ISetSNEi16rr_toi16;
-  case NVPTX::VecSNEV2I32: return NVPTX::ISetSNEi32rr_toi32;
-  case NVPTX::VecSNEV2I64: return NVPTX::ISetSNEi64rr_toi64;
-  case NVPTX::VecSNEV2I8: return NVPTX::ISetSNEi8rr_toi8;
-  case NVPTX::VecSNEV4I16: return NVPTX::ISetSNEi16rr_toi16;
-  case NVPTX::VecSNEV4I32: return NVPTX::ISetSNEi32rr_toi32;
-  case NVPTX::VecSNEV4I8: return NVPTX::ISetSNEi8rr_toi8;
-  case NVPTX::VecShuffle_v2f32: return NVPTX::FMOV32rr;
-  case NVPTX::VecShuffle_v2f64: return NVPTX::FMOV64rr;
-  case NVPTX::VecShuffle_v2i16: return NVPTX::IMOV16rr;
-  case NVPTX::VecShuffle_v2i32: return NVPTX::IMOV32rr;
-  case NVPTX::VecShuffle_v2i64: return NVPTX::IMOV64rr;
-  case NVPTX::VecShuffle_v2i8: return NVPTX::IMOV8rr;
-  case NVPTX::VecShuffle_v4f32: return NVPTX::FMOV32rr;
-  case NVPTX::VecShuffle_v4i16: return NVPTX::IMOV16rr;
-  case NVPTX::VecShuffle_v4i32: return NVPTX::IMOV32rr;
-  case NVPTX::VecShuffle_v4i8: return NVPTX::IMOV8rr;
-  case NVPTX::VecUEQV2I16: return NVPTX::ISetUEQi16rr_toi16;
-  case NVPTX::VecUEQV2I32: return NVPTX::ISetUEQi32rr_toi32;
-  case NVPTX::VecUEQV2I64: return NVPTX::ISetUEQi64rr_toi64;
-  case NVPTX::VecUEQV2I8: return NVPTX::ISetUEQi8rr_toi8;
-  case NVPTX::VecUEQV4I16: return NVPTX::ISetUEQi16rr_toi16;
-  case NVPTX::VecUEQV4I32: return NVPTX::ISetUEQi32rr_toi32;
-  case NVPTX::VecUEQV4I8: return NVPTX::ISetUEQi8rr_toi8;
-  case NVPTX::VecUGEV2I16: return NVPTX::ISetUGEi16rr_toi16;
-  case NVPTX::VecUGEV2I32: return NVPTX::ISetUGEi32rr_toi32;
-  case NVPTX::VecUGEV2I64: return NVPTX::ISetUGEi64rr_toi64;
-  case NVPTX::VecUGEV2I8: return NVPTX::ISetUGEi8rr_toi8;
-  case NVPTX::VecUGEV4I16: return NVPTX::ISetUGEi16rr_toi16;
-  case NVPTX::VecUGEV4I32: return NVPTX::ISetUGEi32rr_toi32;
-  case NVPTX::VecUGEV4I8: return NVPTX::ISetUGEi8rr_toi8;
-  case NVPTX::VecUGTV2I16: return NVPTX::ISetUGTi16rr_toi16;
-  case NVPTX::VecUGTV2I32: return NVPTX::ISetUGTi32rr_toi32;
-  case NVPTX::VecUGTV2I64: return NVPTX::ISetUGTi64rr_toi64;
-  case NVPTX::VecUGTV2I8: return NVPTX::ISetUGTi8rr_toi8;
-  case NVPTX::VecUGTV4I16: return NVPTX::ISetUGTi16rr_toi16;
-  case NVPTX::VecUGTV4I32: return NVPTX::ISetUGTi32rr_toi32;
-  case NVPTX::VecUGTV4I8: return NVPTX::ISetUGTi8rr_toi8;
-  case NVPTX::VecULEV2I16: return NVPTX::ISetULEi16rr_toi16;
-  case NVPTX::VecULEV2I32: return NVPTX::ISetULEi32rr_toi32;
-  case NVPTX::VecULEV2I64: return NVPTX::ISetULEi64rr_toi64;
-  case NVPTX::VecULEV2I8: return NVPTX::ISetULEi8rr_toi8;
-  case NVPTX::VecULEV4I16: return NVPTX::ISetULEi16rr_toi16;
-  case NVPTX::VecULEV4I32: return NVPTX::ISetULEi32rr_toi32;
-  case NVPTX::VecULEV4I8: return NVPTX::ISetULEi8rr_toi8;
-  case NVPTX::VecULTV2I16: return NVPTX::ISetULTi16rr_toi16;
-  case NVPTX::VecULTV2I32: return NVPTX::ISetULTi32rr_toi32;
-  case NVPTX::VecULTV2I64: return NVPTX::ISetULTi64rr_toi64;
-  case NVPTX::VecULTV2I8: return NVPTX::ISetULTi8rr_toi8;
-  case NVPTX::VecULTV4I16: return NVPTX::ISetULTi16rr_toi16;
-  case NVPTX::VecULTV4I32: return NVPTX::ISetULTi32rr_toi32;
-  case NVPTX::VecULTV4I8: return NVPTX::ISetULTi8rr_toi8;
-  case NVPTX::VecUNEV2I16: return NVPTX::ISetUNEi16rr_toi16;
-  case NVPTX::VecUNEV2I32: return NVPTX::ISetUNEi32rr_toi32;
-  case NVPTX::VecUNEV2I64: return NVPTX::ISetUNEi64rr_toi64;
-  case NVPTX::VecUNEV2I8: return NVPTX::ISetUNEi8rr_toi8;
-  case NVPTX::VecUNEV4I16: return NVPTX::ISetUNEi16rr_toi16;
-  case NVPTX::VecUNEV4I32: return NVPTX::ISetUNEi32rr_toi32;
-  case NVPTX::VecUNEV4I8: return NVPTX::ISetUNEi8rr_toi8;
-  case NVPTX::INT_PTX_LDU_G_v2i8_32: return NVPTX::INT_PTX_LDU_G_v2i8_ELE_32;
-  case NVPTX::INT_PTX_LDU_G_v4i8_32: return NVPTX::INT_PTX_LDU_G_v4i8_ELE_32;
-  case NVPTX::INT_PTX_LDU_G_v2i16_32: return NVPTX::INT_PTX_LDU_G_v2i16_ELE_32;
-  case NVPTX::INT_PTX_LDU_G_v4i16_32: return NVPTX::INT_PTX_LDU_G_v4i16_ELE_32;
-  case NVPTX::INT_PTX_LDU_G_v2i32_32: return NVPTX::INT_PTX_LDU_G_v2i32_ELE_32;
-  case NVPTX::INT_PTX_LDU_G_v4i32_32: return NVPTX::INT_PTX_LDU_G_v4i32_ELE_32;
-  case NVPTX::INT_PTX_LDU_G_v2f32_32: return NVPTX::INT_PTX_LDU_G_v2f32_ELE_32;
-  case NVPTX::INT_PTX_LDU_G_v4f32_32: return NVPTX::INT_PTX_LDU_G_v4f32_ELE_32;
-  case NVPTX::INT_PTX_LDU_G_v2i64_32: return NVPTX::INT_PTX_LDU_G_v2i64_ELE_32;
-  case NVPTX::INT_PTX_LDU_G_v2f64_32: return NVPTX::INT_PTX_LDU_G_v2f64_ELE_32;
-  case NVPTX::INT_PTX_LDU_G_v2i8_64: return NVPTX::INT_PTX_LDU_G_v2i8_ELE_64;
-  case NVPTX::INT_PTX_LDU_G_v4i8_64: return NVPTX::INT_PTX_LDU_G_v4i8_ELE_64;
-  case NVPTX::INT_PTX_LDU_G_v2i16_64: return NVPTX::INT_PTX_LDU_G_v2i16_ELE_64;
-  case NVPTX::INT_PTX_LDU_G_v4i16_64: return NVPTX::INT_PTX_LDU_G_v4i16_ELE_64;
-  case NVPTX::INT_PTX_LDU_G_v2i32_64: return NVPTX::INT_PTX_LDU_G_v2i32_ELE_64;
-  case NVPTX::INT_PTX_LDU_G_v4i32_64: return NVPTX::INT_PTX_LDU_G_v4i32_ELE_64;
-  case NVPTX::INT_PTX_LDU_G_v2f32_64: return NVPTX::INT_PTX_LDU_G_v2f32_ELE_64;
-  case NVPTX::INT_PTX_LDU_G_v4f32_64: return NVPTX::INT_PTX_LDU_G_v4f32_ELE_64;
-  case NVPTX::INT_PTX_LDU_G_v2i64_64: return NVPTX::INT_PTX_LDU_G_v2i64_ELE_64;
-  case NVPTX::INT_PTX_LDU_G_v2f64_64: return NVPTX::INT_PTX_LDU_G_v2f64_ELE_64;
-
-  case NVPTX::LoadParamV4I32: return NVPTX::LoadParamScalar4I32;
-  case NVPTX::LoadParamV4I16: return NVPTX::LoadParamScalar4I16;
-  case NVPTX::LoadParamV4I8: return NVPTX::LoadParamScalar4I8;
-  case NVPTX::LoadParamV2I64: return NVPTX::LoadParamScalar2I64;
-  case NVPTX::LoadParamV2I32: return NVPTX::LoadParamScalar2I32;
-  case NVPTX::LoadParamV2I16: return NVPTX::LoadParamScalar2I16;
-  case NVPTX::LoadParamV2I8: return NVPTX::LoadParamScalar2I8;
-  case NVPTX::LoadParamV4F32: return NVPTX::LoadParamScalar4F32;
-  case NVPTX::LoadParamV2F32: return NVPTX::LoadParamScalar2F32;
-  case NVPTX::LoadParamV2F64: return NVPTX::LoadParamScalar2F64;
-  case NVPTX::StoreParamV4I32: return NVPTX::StoreParamScalar4I32;
-  case NVPTX::StoreParamV4I16: return NVPTX::StoreParamScalar4I16;
-  case NVPTX::StoreParamV4I8: return NVPTX::StoreParamScalar4I8;
-  case NVPTX::StoreParamV2I64: return NVPTX::StoreParamScalar2I64;
-  case NVPTX::StoreParamV2I32: return NVPTX::StoreParamScalar2I32;
-  case NVPTX::StoreParamV2I16: return NVPTX::StoreParamScalar2I16;
-  case NVPTX::StoreParamV2I8: return NVPTX::StoreParamScalar2I8;
-  case NVPTX::StoreParamV4F32: return NVPTX::StoreParamScalar4F32;
-  case NVPTX::StoreParamV2F32: return NVPTX::StoreParamScalar2F32;
-  case NVPTX::StoreParamV2F64: return NVPTX::StoreParamScalar2F64;
-  case NVPTX::StoreRetvalV4I32: return NVPTX::StoreRetvalScalar4I32;
-  case NVPTX::StoreRetvalV4I16: return NVPTX::StoreRetvalScalar4I16;
-  case NVPTX::StoreRetvalV4I8: return NVPTX::StoreRetvalScalar4I8;
-  case NVPTX::StoreRetvalV2I64: return NVPTX::StoreRetvalScalar2I64;
-  case NVPTX::StoreRetvalV2I32: return NVPTX::StoreRetvalScalar2I32;
-  case NVPTX::StoreRetvalV2I16: return NVPTX::StoreRetvalScalar2I16;
-  case NVPTX::StoreRetvalV2I8: return NVPTX::StoreRetvalScalar2I8;
-  case NVPTX::StoreRetvalV4F32: return NVPTX::StoreRetvalScalar4F32;
-  case NVPTX::StoreRetvalV2F32: return NVPTX::StoreRetvalScalar2F32;
-  case NVPTX::StoreRetvalV2F64: return NVPTX::StoreRetvalScalar2F64;
-  case NVPTX::VecI32toV4I8: return NVPTX::I32toV4I8;
-  case NVPTX::VecI64toV4I16: return NVPTX::I64toV4I16;
-  case NVPTX::VecI16toV2I8: return NVPTX::I16toV2I8;
-  case NVPTX::VecI32toV2I16: return NVPTX::I32toV2I16;
-  case NVPTX::VecI64toV2I32: return NVPTX::I64toV2I32;
-  case NVPTX::VecF64toV2F32: return NVPTX::F64toV2F32;
-
-  case NVPTX::LD_v2i8_avar: return NVPTX::LDV_i8_v2_avar;
-  case NVPTX::LD_v2i8_areg: return NVPTX::LDV_i8_v2_areg;
-  case NVPTX::LD_v2i8_ari: return NVPTX::LDV_i8_v2_ari;
-  case NVPTX::LD_v2i8_asi: return NVPTX::LDV_i8_v2_asi;
-  case NVPTX::LD_v4i8_avar: return NVPTX::LDV_i8_v4_avar;
-  case NVPTX::LD_v4i8_areg: return NVPTX::LDV_i8_v4_areg;
-  case NVPTX::LD_v4i8_ari: return NVPTX::LDV_i8_v4_ari;
-  case NVPTX::LD_v4i8_asi: return NVPTX::LDV_i8_v4_asi;
-
-  case NVPTX::LD_v2i16_avar: return NVPTX::LDV_i16_v2_avar;
-  case NVPTX::LD_v2i16_areg: return NVPTX::LDV_i16_v2_areg;
-  case NVPTX::LD_v2i16_ari: return NVPTX::LDV_i16_v2_ari;
-  case NVPTX::LD_v2i16_asi: return NVPTX::LDV_i16_v2_asi;
-  case NVPTX::LD_v4i16_avar: return NVPTX::LDV_i16_v4_avar;
-  case NVPTX::LD_v4i16_areg: return NVPTX::LDV_i16_v4_areg;
-  case NVPTX::LD_v4i16_ari: return NVPTX::LDV_i16_v4_ari;
-  case NVPTX::LD_v4i16_asi: return NVPTX::LDV_i16_v4_asi;
-
-  case NVPTX::LD_v2i32_avar: return NVPTX::LDV_i32_v2_avar;
-  case NVPTX::LD_v2i32_areg: return NVPTX::LDV_i32_v2_areg;
-  case NVPTX::LD_v2i32_ari: return NVPTX::LDV_i32_v2_ari;
-  case NVPTX::LD_v2i32_asi: return NVPTX::LDV_i32_v2_asi;
-  case NVPTX::LD_v4i32_avar: return NVPTX::LDV_i32_v4_avar;
-  case NVPTX::LD_v4i32_areg: return NVPTX::LDV_i32_v4_areg;
-  case NVPTX::LD_v4i32_ari: return NVPTX::LDV_i32_v4_ari;
-  case NVPTX::LD_v4i32_asi: return NVPTX::LDV_i32_v4_asi;
-
-  case NVPTX::LD_v2f32_avar: return NVPTX::LDV_f32_v2_avar;
-  case NVPTX::LD_v2f32_areg: return NVPTX::LDV_f32_v2_areg;
-  case NVPTX::LD_v2f32_ari: return NVPTX::LDV_f32_v2_ari;
-  case NVPTX::LD_v2f32_asi: return NVPTX::LDV_f32_v2_asi;
-  case NVPTX::LD_v4f32_avar: return NVPTX::LDV_f32_v4_avar;
-  case NVPTX::LD_v4f32_areg: return NVPTX::LDV_f32_v4_areg;
-  case NVPTX::LD_v4f32_ari: return NVPTX::LDV_f32_v4_ari;
-  case NVPTX::LD_v4f32_asi: return NVPTX::LDV_f32_v4_asi;
-
-  case NVPTX::LD_v2i64_avar: return NVPTX::LDV_i64_v2_avar;
-  case NVPTX::LD_v2i64_areg: return NVPTX::LDV_i64_v2_areg;
-  case NVPTX::LD_v2i64_ari: return NVPTX::LDV_i64_v2_ari;
-  case NVPTX::LD_v2i64_asi: return NVPTX::LDV_i64_v2_asi;
-  case NVPTX::LD_v2f64_avar: return NVPTX::LDV_f64_v2_avar;
-  case NVPTX::LD_v2f64_areg: return NVPTX::LDV_f64_v2_areg;
-  case NVPTX::LD_v2f64_ari: return NVPTX::LDV_f64_v2_ari;
-  case NVPTX::LD_v2f64_asi: return NVPTX::LDV_f64_v2_asi;
-
-  case NVPTX::ST_v2i8_avar: return NVPTX::STV_i8_v2_avar;
-  case NVPTX::ST_v2i8_areg: return NVPTX::STV_i8_v2_areg;
-  case NVPTX::ST_v2i8_ari: return NVPTX::STV_i8_v2_ari;
-  case NVPTX::ST_v2i8_asi: return NVPTX::STV_i8_v2_asi;
-  case NVPTX::ST_v4i8_avar: return NVPTX::STV_i8_v4_avar;
-  case NVPTX::ST_v4i8_areg: return NVPTX::STV_i8_v4_areg;
-  case NVPTX::ST_v4i8_ari: return NVPTX::STV_i8_v4_ari;
-  case NVPTX::ST_v4i8_asi: return NVPTX::STV_i8_v4_asi;
-
-  case NVPTX::ST_v2i16_avar: return NVPTX::STV_i16_v2_avar;
-  case NVPTX::ST_v2i16_areg: return NVPTX::STV_i16_v2_areg;
-  case NVPTX::ST_v2i16_ari: return NVPTX::STV_i16_v2_ari;
-  case NVPTX::ST_v2i16_asi: return NVPTX::STV_i16_v2_asi;
-  case NVPTX::ST_v4i16_avar: return NVPTX::STV_i16_v4_avar;
-  case NVPTX::ST_v4i16_areg: return NVPTX::STV_i16_v4_areg;
-  case NVPTX::ST_v4i16_ari: return NVPTX::STV_i16_v4_ari;
-  case NVPTX::ST_v4i16_asi: return NVPTX::STV_i16_v4_asi;
-
-  case NVPTX::ST_v2i32_avar: return NVPTX::STV_i32_v2_avar;
-  case NVPTX::ST_v2i32_areg: return NVPTX::STV_i32_v2_areg;
-  case NVPTX::ST_v2i32_ari: return NVPTX::STV_i32_v2_ari;
-  case NVPTX::ST_v2i32_asi: return NVPTX::STV_i32_v2_asi;
-  case NVPTX::ST_v4i32_avar: return NVPTX::STV_i32_v4_avar;
-  case NVPTX::ST_v4i32_areg: return NVPTX::STV_i32_v4_areg;
-  case NVPTX::ST_v4i32_ari: return NVPTX::STV_i32_v4_ari;
-  case NVPTX::ST_v4i32_asi: return NVPTX::STV_i32_v4_asi;
-
-  case NVPTX::ST_v2f32_avar: return NVPTX::STV_f32_v2_avar;
-  case NVPTX::ST_v2f32_areg: return NVPTX::STV_f32_v2_areg;
-  case NVPTX::ST_v2f32_ari: return NVPTX::STV_f32_v2_ari;
-  case NVPTX::ST_v2f32_asi: return NVPTX::STV_f32_v2_asi;
-  case NVPTX::ST_v4f32_avar: return NVPTX::STV_f32_v4_avar;
-  case NVPTX::ST_v4f32_areg: return NVPTX::STV_f32_v4_areg;
-  case NVPTX::ST_v4f32_ari: return NVPTX::STV_f32_v4_ari;
-  case NVPTX::ST_v4f32_asi: return NVPTX::STV_f32_v4_asi;
-
-  case NVPTX::ST_v2i64_avar: return NVPTX::STV_i64_v2_avar;
-  case NVPTX::ST_v2i64_areg: return NVPTX::STV_i64_v2_areg;
-  case NVPTX::ST_v2i64_ari: return NVPTX::STV_i64_v2_ari;
-  case NVPTX::ST_v2i64_asi: return NVPTX::STV_i64_v2_asi;
-  case NVPTX::ST_v2f64_avar: return NVPTX::STV_f64_v2_avar;
-  case NVPTX::ST_v2f64_areg: return NVPTX::STV_f64_v2_areg;
-  case NVPTX::ST_v2f64_ari: return NVPTX::STV_f64_v2_ari;
-  case NVPTX::ST_v2f64_asi: return NVPTX::STV_f64_v2_asi;
-  }
-  return 0;
-}
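The getScalarVersion() table deleted above is the heart of the removed VectorElementize pass: every pseudo vector opcode maps to the scalar opcode that performs the same operation on a single element, and each v2/v4 instruction is then rewritten as two or four scalar instructions. A minimal sketch of that idea, using a hypothetical opcode table and instruction tuples rather than the real NVPTX enums:

# Sketch of the "elementize" transformation: look up the per-element
# scalar opcode, then expand one vector instruction into one scalar
# instruction per lane. Opcode names and the tuple format are
# illustrative only, not the NVPTX instruction enums.
SCALAR_VERSION = {
    "VAddV4I32": "ADDi32rr",
    "VMultV4I32": "MULTi32rr",
    "VSubV2I64": "SUBi64rr",
}

def elementize(inst):
    """inst = (opcode, dest_regs, src1_regs, src2_regs); returns the
    scalar instructions that together compute the vector operation."""
    op, dsts, s1s, s2s = inst
    scalar_op = SCALAR_VERSION[op]  # a KeyError == "fix NVPTXVector.td"
    return [(scalar_op, d, a, b) for d, a, b in zip(dsts, s1s, s2s)]

print(elementize(("VAddV4I32",
                  ["r0", "r1", "r2", "r3"],
                  ["r4", "r5", "r6", "r7"],
                  ["r8", "r9", "r10", "r11"])))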
diff --git a/lib/Target/NVPTX/gen-register-defs.py b/lib/Target/NVPTX/gen-register-defs.py
deleted file mode 100644
index ed06668..0000000
--- a/lib/Target/NVPTX/gen-register-defs.py
+++ /dev/null
@@ -1,202 +0,0 @@
-#!/usr/bin/env python
-
-num_regs = 396
-
-outFile = open('NVPTXRegisterInfo.td', 'w')
-
-outFile.write('''
-//===-- NVPTXRegisterInfo.td - NVPTX Register defs ---------*- tablegen -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// Declarations that describe the PTX register file
-//===----------------------------------------------------------------------===//
-
-class NVPTXReg<string n> : Register<n> {
-  let Namespace = "NVPTX";
-}
-
-class NVPTXRegClass<list<ValueType> regTypes, int alignment, dag regList>
-     : RegisterClass <"NVPTX", regTypes, alignment, regList>;
-
-//===----------------------------------------------------------------------===//
-//  Registers
-//===----------------------------------------------------------------------===//
-
-// Special Registers used as stack pointer
-def VRFrame      : NVPTXReg<"%SP">;
-def VRFrameLocal : NVPTXReg<"%SPL">;
-
-// Special Registers used as the stack
-def VRDepot : NVPTXReg<"%Depot">;
-''')
-
-# Predicates
-outFile.write('''
-//===--- Predicate --------------------------------------------------------===//
-''')
-for i in range(0, num_regs):
-  outFile.write('def P%d : NVPTXReg<"%%p%d">;\n' % (i, i))
-
-# Int8
-outFile.write('''
-//===--- 8-bit ------------------------------------------------------------===//
-''')
-for i in range(0, num_regs):
-  outFile.write('def RC%d : NVPTXReg<"%%rc%d">;\n' % (i, i))
-
-# Int16
-outFile.write('''
-//===--- 16-bit -----------------------------------------------------------===//
-''')
-for i in range(0, num_regs):
-  outFile.write('def RS%d : NVPTXReg<"%%rs%d">;\n' % (i, i))
-
-# Int32
-outFile.write('''
-//===--- 32-bit -----------------------------------------------------------===//
-''')
-for i in range(0, num_regs):
-  outFile.write('def R%d : NVPTXReg<"%%r%d">;\n' % (i, i))
-
-# Int64
-outFile.write('''
-//===--- 64-bit -----------------------------------------------------------===//
-''')
-for i in range(0, num_regs):
-  outFile.write('def RL%d : NVPTXReg<"%%rl%d">;\n' % (i, i))
-
-# F32
-outFile.write('''
-//===--- 32-bit float -----------------------------------------------------===//
-''')
-for i in range(0, num_regs):
-  outFile.write('def F%d : NVPTXReg<"%%f%d">;\n' % (i, i))
-
-# F64
-outFile.write('''
-//===--- 64-bit float -----------------------------------------------------===//
-''')
-for i in range(0, num_regs):
-  outFile.write('def FL%d : NVPTXReg<"%%fl%d">;\n' % (i, i))
-
-# Vector registers
-outFile.write('''
-//===--- Vector -----------------------------------------------------------===//
-''')
-for i in range(0, num_regs):
-  outFile.write('def v2b8_%d : NVPTXReg<"%%v2b8_%d">;\n' % (i, i))
-for i in range(0, num_regs):
-  outFile.write('def v2b16_%d : NVPTXReg<"%%v2b16_%d">;\n' % (i, i))
-for i in range(0, num_regs):
-  outFile.write('def v2b32_%d : NVPTXReg<"%%v2b32_%d">;\n' % (i, i))
-for i in range(0, num_regs):
-  outFile.write('def v2b64_%d : NVPTXReg<"%%v2b64_%d">;\n' % (i, i))
-
-for i in range(0, num_regs):
-  outFile.write('def v4b8_%d : NVPTXReg<"%%v4b8_%d">;\n' % (i, i))
-for i in range(0, num_regs):
-  outFile.write('def v4b16_%d : NVPTXReg<"%%v4b16_%d">;\n' % (i, i))
-for i in range(0, num_regs):
-  outFile.write('def v4b32_%d : NVPTXReg<"%%v4b32_%d">;\n' % (i, i))
-
-# Argument registers
-outFile.write('''
-//===--- Arguments --------------------------------------------------------===//
-''')
-for i in range(0, num_regs):
-  outFile.write('def ia%d : NVPTXReg<"%%ia%d">;\n' % (i, i))
-for i in range(0, num_regs):
-  outFile.write('def la%d : NVPTXReg<"%%la%d">;\n' % (i, i))
-for i in range(0, num_regs):
-  outFile.write('def fa%d : NVPTXReg<"%%fa%d">;\n' % (i, i))
-for i in range(0, num_regs):
-  outFile.write('def da%d : NVPTXReg<"%%da%d">;\n' % (i, i))
-
-outFile.write('''
-//===----------------------------------------------------------------------===//
-//  Register classes
-//===----------------------------------------------------------------------===//
-''')
-
-outFile.write('def Int1Regs : NVPTXRegClass<[i1], 8, (add (sequence "P%%u", 0, %d))>;\n' % (num_regs-1))
-outFile.write('def Int8Regs : NVPTXRegClass<[i8], 8, (add (sequence "RC%%u", 0, %d))>;\n' % (num_regs-1))
-outFile.write('def Int16Regs : NVPTXRegClass<[i16], 16, (add (sequence "RS%%u", 0, %d))>;\n' % (num_regs-1))
-outFile.write('def Int32Regs : NVPTXRegClass<[i32], 32, (add (sequence "R%%u", 0, %d))>;\n' % (num_regs-1))
-outFile.write('def Int64Regs : NVPTXRegClass<[i64], 64, (add (sequence "RL%%u", 0, %d))>;\n' % (num_regs-1))
-
-outFile.write('def Float32Regs : NVPTXRegClass<[f32], 32, (add (sequence "F%%u", 0, %d))>;\n' % (num_regs-1))
-outFile.write('def Float64Regs : NVPTXRegClass<[f64], 64, (add (sequence "FL%%u", 0, %d))>;\n' % (num_regs-1))
-
-outFile.write('def Int32ArgRegs : NVPTXRegClass<[i32], 32, (add (sequence "ia%%u", 0, %d))>;\n' % (num_regs-1))
-outFile.write('def Int64ArgRegs : NVPTXRegClass<[i64], 64, (add (sequence "la%%u", 0, %d))>;\n' % (num_regs-1))
-outFile.write('def Float32ArgRegs : NVPTXRegClass<[f32], 32, (add (sequence "fa%%u", 0, %d))>;\n' % (num_regs-1))
-outFile.write('def Float64ArgRegs : NVPTXRegClass<[f64], 64, (add (sequence "da%%u", 0, %d))>;\n' % (num_regs-1))
-
-outFile.write('''
-// Read NVPTXRegisterInfo.cpp to see how VRFrame and VRDepot are used.
-def SpecialRegs : NVPTXRegClass<[i32], 32, (add VRFrame, VRDepot)>;
-''')
-
-outFile.write('''
-class NVPTXVecRegClass<list<ValueType> regTypes, int alignment, dag regList,
-                       NVPTXRegClass sClass,
-                       int e,
-                       string n>
-  : NVPTXRegClass<regTypes, alignment, regList>
-{
-  NVPTXRegClass scalarClass=sClass;
-  int elems=e;
-  string name=n;
-}
-''')
-
-
-outFile.write('def V2F32Regs\n  : NVPTXVecRegClass<[v2f32], 64, (add (sequence "v2b32_%%u", 0, %d)),\n                     Float32Regs, 2, ".v2.f32">;\n' % (num_regs-1))
-outFile.write('def V4F32Regs\n  : NVPTXVecRegClass<[v4f32], 128, (add (sequence "v4b32_%%u", 0, %d)),\n                     Float32Regs, 4, ".v4.f32">;\n' % (num_regs-1))
-
-outFile.write('def V2I32Regs\n  : NVPTXVecRegClass<[v2i32], 64, (add (sequence "v2b32_%%u", 0, %d)),\n                     Int32Regs, 2, ".v2.u32">;\n' % (num_regs-1))
-outFile.write('def V4I32Regs\n  : NVPTXVecRegClass<[v4i32], 128, (add (sequence "v4b32_%%u", 0, %d)),\n                     Int32Regs, 4, ".v4.u32">;\n' % (num_regs-1))
-
-outFile.write('def V2F64Regs\n  : NVPTXVecRegClass<[v2f64], 128, (add (sequence "v2b64_%%u", 0, %d)),\n                     Float64Regs, 2, ".v2.f64">;\n' % (num_regs-1))
-outFile.write('def V2I64Regs\n  : NVPTXVecRegClass<[v2i64], 128, (add (sequence "v2b64_%%u", 0, %d)),\n                     Int64Regs, 2, ".v2.u64">;\n' % (num_regs-1))
-
-outFile.write('def V2I16Regs\n  : NVPTXVecRegClass<[v2i16], 32, (add (sequence "v2b16_%%u", 0, %d)),\n                     Int16Regs, 2, ".v2.u16">;\n' % (num_regs-1))
-outFile.write('def V4I16Regs\n  : NVPTXVecRegClass<[v4i16], 64, (add (sequence "v4b16_%%u", 0, %d)),\n                     Int16Regs, 4, ".v4.u16">;\n' % (num_regs-1))
-
-outFile.write('def V2I8Regs\n  : NVPTXVecRegClass<[v2i8], 16, (add (sequence "v2b8_%%u", 0, %d)),\n                     Int8Regs, 2, ".v2.u8">;\n' % (num_regs-1))
-outFile.write('def V4I8Regs\n  : NVPTXVecRegClass<[v4i8], 32, (add (sequence "v4b8_%%u", 0, %d)),\n                     Int8Regs, 4, ".v4.u8">;\n' % (num_regs-1))
-
-outFile.close()
-
-
-outFile = open('NVPTXNumRegisters.h', 'w')
-outFile.write('''
-//===-- NVPTXNumRegisters.h - PTX Register Info ---------------------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef NVPTX_NUM_REGISTERS_H
-#define NVPTX_NUM_REGISTERS_H
-
-namespace llvm {
-
-const unsigned NVPTXNumRegisters = %d;
-
-}
-
-#endif
-''' % num_regs)
-
-outFile.close()
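The deleted generator above simply stamped out one TableGen def per register name, 396 per family, while the register classes it emitted already used TableGen's (sequence ...) operator; since sequence can synthesize the register names directly in the .td file, keeping a generated file checked in was unnecessary. A reduced sketch of the output shape (register count shrunk for illustration):

# Sketch of what gen-register-defs.py emitted: a TableGen def per
# register in each family. The same sets are expressed directly in
# TableGen as e.g. (sequence "R%u", 0, 395), which is why the
# generator could be deleted. num_regs reduced here for brevity.
num_regs = 4  # the real script used 396

defs = []
for prefix, td_name in [("p", "P"), ("r", "R"), ("f", "F")]:
    for i in range(num_regs):
        defs.append('def %s%d : NVPTXReg<"%%%s%d">;' % (td_name, i, prefix, i))

print("\n".join(defs))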
diff --git a/lib/Target/PowerPC/CMakeLists.txt b/lib/Target/PowerPC/CMakeLists.txt
index 192d18d..6036428 100644
--- a/lib/Target/PowerPC/CMakeLists.txt
+++ b/lib/Target/PowerPC/CMakeLists.txt
@@ -26,6 +26,7 @@ add_llvm_target(PowerPCCodeGen
   PPCRegisterInfo.cpp
   PPCSubtarget.cpp
   PPCTargetMachine.cpp
+  PPCTargetTransformInfo.cpp
   PPCSelectionDAGInfo.cpp
   )
 
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp
index d61e741..61868d4 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp
@@ -151,7 +151,24 @@ unsigned PPCELFObjectWriter::getRelocTypeInner(const MCValue &Target,
       Type = ELF::R_PPC64_TOC;
       break;
     case PPC::fixup_ppc_toc16:
-      Type = ELF::R_PPC64_TOC16;
+      switch (Modifier) {
+      default: llvm_unreachable("Unsupported Modifier");
+      case MCSymbolRefExpr::VK_PPC_TPREL16_LO:
+        Type = ELF::R_PPC64_TPREL16_LO;
+        break;
+      case MCSymbolRefExpr::VK_PPC_DTPREL16_LO:
+        Type = ELF::R_PPC64_DTPREL16_LO;
+        break;
+      case MCSymbolRefExpr::VK_None:
+        Type = ELF::R_PPC64_TOC16;
+        break;
+      case MCSymbolRefExpr::VK_PPC_TOC16_LO:
+        Type = ELF::R_PPC64_TOC16_LO;
+        break;
+      case MCSymbolRefExpr::VK_PPC_GOT_TLSLD16_LO:
+        Type = ELF::R_PPC64_GOT_TLSLD16_LO;
+        break;
+      }
       break;
     case PPC::fixup_ppc_toc16_ds:
       switch (Modifier) {
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
index 215aa40..a25d7fe 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
@@ -17,8 +17,9 @@ using namespace llvm;
 void PPCMCAsmInfoDarwin::anchor() { }
 
 PPCMCAsmInfoDarwin::PPCMCAsmInfoDarwin(bool is64Bit) {
-  if (is64Bit)
-    PointerSize = 8;
+  if (is64Bit) {
+    PointerSize = CalleeSaveStackSlotSize = 8;
+  }
   IsLittleEndian = false;
 
   PCSymbol = ".";
@@ -35,8 +36,9 @@ PPCMCAsmInfoDarwin::PPCMCAsmInfoDarwin(bool is64Bit) {
 void PPCLinuxMCAsmInfo::anchor() { }
 
 PPCLinuxMCAsmInfo::PPCLinuxMCAsmInfo(bool is64Bit) {
-  if (is64Bit)
-    PointerSize = 8;
+  if (is64Bit) {
+    PointerSize = CalleeSaveStackSlotSize = 8;
+  }
   IsLittleEndian = false;
 
   // ".comm align is in bytes but .align is pow-2."
diff --git a/lib/Target/PowerPC/PPC.h b/lib/Target/PowerPC/PPC.h
index e6d38eb..f71979f 100644
--- a/lib/Target/PowerPC/PPC.h
+++ b/lib/Target/PowerPC/PPC.h
@@ -25,6 +25,7 @@ namespace llvm {
   class PPCTargetMachine;
   class FunctionPass;
+  class ImmutablePass;
   class JITCodeEmitter;
   class MachineInstr;
   class AsmPrinter;
@@ -37,6 +38,9 @@ namespace llvm {
                               JITCodeEmitter &MCE);
   void LowerPPCMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
                                     AsmPrinter &AP, bool isDarwin);
+
+  /// \brief Creates an PPC-specific Target Transformation Info pass.
+  ImmutablePass *createPPCTargetTransformInfoPass(const PPCTargetMachine *TM);
 
   namespace PPCII {
 
@@ -53,26 +57,32 @@ namespace llvm {
     /// MO_PIC_FLAG - If this bit is set, the symbol reference is relative to
     /// the function's picbase, e.g. lo16(symbol-picbase).
-    MO_PIC_FLAG = 4,
+    MO_PIC_FLAG = 2,
 
     /// MO_NLP_FLAG - If this bit is set, the symbol reference is actually to
     /// the non_lazy_ptr for the global, e.g. lo16(symbol$non_lazy_ptr-picbase).
-    MO_NLP_FLAG = 8,
+    MO_NLP_FLAG = 4,
 
     /// MO_NLP_HIDDEN_FLAG - If this bit is set, the symbol reference is to a
     /// symbol with hidden visibility.  This causes a different kind of
     /// non-lazy-pointer to be generated.
-    MO_NLP_HIDDEN_FLAG = 16,
+    MO_NLP_HIDDEN_FLAG = 8,
 
     /// The next are not flags but distinct values.
-    MO_ACCESS_MASK = 0xe0,
+    MO_ACCESS_MASK = 0xf0,
 
     /// MO_LO16, MO_HA16 - lo16(symbol) and ha16(symbol)
-    MO_LO16 = 1 << 5,
-    MO_HA16 = 2 << 5,
+    MO_LO16 = 1 << 4,
+    MO_HA16 = 2 << 4,
+
+    MO_TPREL16_HA = 3 << 4,
+    MO_TPREL16_LO = 4 << 4,
 
-    MO_TPREL16_HA = 3 << 5,
-    MO_TPREL16_LO = 4 << 5
+    /// These values identify relocations on immediates folded
+    /// into memory operations.
+    MO_DTPREL16_LO = 5 << 4,
+    MO_TLSLD16_LO = 6 << 4,
+    MO_TOC16_LO = 7 << 4
   };
 } // end namespace PPCII
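The renumbering in PPC.h above packs the boolean operand flags into the low nibble and the mutually exclusive access-kind values into the high nibble, with MO_ACCESS_MASK growing from 0xe0 to 0xf0 so the three new memory-op relocation kinds fit alongside the existing ones. A small sanity check of the new layout, with the values copied from the diff:

# Sketch of the PPCII target-flag layout after this change: boolean
# flags occupy the low four bits, and the access-kind values occupy
# the high nibble selected by MO_ACCESS_MASK.
MO_PIC_FLAG        = 2
MO_NLP_FLAG        = 4
MO_NLP_HIDDEN_FLAG = 8
MO_ACCESS_MASK     = 0xf0
MO_LO16, MO_HA16 = 1 << 4, 2 << 4
MO_TPREL16_HA, MO_TPREL16_LO = 3 << 4, 4 << 4
MO_DTPREL16_LO, MO_TLSLD16_LO, MO_TOC16_LO = 5 << 4, 6 << 4, 7 << 4

flags = MO_PIC_FLAG | MO_TOC16_LO  # combine one flag with one access kind
assert (flags & MO_ACCESS_MASK) == MO_TOC16_LO   # access kind extracts cleanly
assert (flags & ~MO_ACCESS_MASK) == MO_PIC_FLAG  # flag bits are untouched
# Every access kind must fit inside the widened mask.
for kind in (MO_LO16, MO_HA16, MO_TPREL16_HA, MO_TPREL16_LO,
             MO_DTPREL16_LO, MO_TLSLD16_LO, MO_TOC16_LO):
    assert (kind & MO_ACCESS_MASK) == kind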
diff --git a/lib/Target/PowerPC/PPC.td b/lib/Target/PowerPC/PPC.td
index cb15dad..9929136 100644
--- a/lib/Target/PowerPC/PPC.td
+++ b/lib/Target/PowerPC/PPC.td
@@ -39,7 +39,12 @@ def DirectiveE500mc : SubtargetFeature<"", "DarwinDirective",
                                        "PPC::DIR_E500mc", "">;
 def DirectiveE5500  : SubtargetFeature<"", "DarwinDirective",
                                        "PPC::DIR_E5500", "">;
+def DirectivePwr3: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR3", "">;
+def DirectivePwr4: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR4", "">;
+def DirectivePwr5: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR5", "">;
+def DirectivePwr5x: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR5X", "">;
 def DirectivePwr6: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR6", "">;
+def DirectivePwr6x: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR6X", "">;
 def DirectivePwr7: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR7", "">;
 
 def Feature64Bit     : SubtargetFeature<"64bit","Has64BitSupport", "true",
@@ -58,6 +63,25 @@ def FeatureISEL      : SubtargetFeature<"isel","HasISEL", "true",
                                         "Enable the isel instruction">;
 def FeatureBookE     : SubtargetFeature<"booke", "IsBookE", "true",
                                         "Enable Book E instructions">;
+def FeatureQPX       : SubtargetFeature<"qpx","HasQPX", "true",
+                                        "Enable QPX instructions">;
+
+// Note: Future features to add when support is extended to more
+// recent ISA levels:
+//
+// CMPB          p6, p6x, p7        cmpb
+// DFP           p6, p6x, p7        decimal floating-point instructions
+// FLT_CVT       p7                 fcfids, fcfidu, fcfidus, fcfiduz, fctiwuz
+// FPRND         p5x, p6, p6x, p7   frim, frin, frip, friz
+// FRE           p5 through p7      fre (vs. fres, available since p3)
+// FRSQRTES      p5 through p7      frsqrtes (vs. frsqrte, available since p3)
+// LDBRX         p7                 load with byte reversal
+// LFIWAX        p6, p6x, p7        lfiwax
+// LFIWZX        p7                 lfiwzx
+// POPCNTB       p5 through p7      popcntb and related instructions
+// POPCNTD       p7                 popcntd and related instructions
+// RECIP_PREC    p6, p6x, p7        higher precision reciprocal estimates
+// VSX           p7                 vector-scalar instruction set
 
 //===----------------------------------------------------------------------===//
 // Register File Description
@@ -109,10 +133,30 @@ def : Processor<"a2", PPCA2Itineraries, [DirectiveA2, FeatureBookE,
                                          FeatureSTFIWX, FeatureISEL,
                                          Feature64Bit /*, Feature64BitRegs */]>;
+def : Processor<"a2q", PPCA2Itineraries, [DirectiveA2, FeatureBookE,
+                                          FeatureMFOCRF, FeatureFSqrt,
+                                          FeatureSTFIWX, FeatureISEL,
+                                          Feature64Bit /*, Feature64BitRegs */,
+                                          FeatureQPX]>;
+def : Processor<"pwr3", G5Itineraries,
+                [DirectivePwr3, FeatureAltivec, FeatureMFOCRF,
+                 FeatureSTFIWX, Feature64Bit]>;
+def : Processor<"pwr4", G5Itineraries,
+                [DirectivePwr4, FeatureAltivec, FeatureMFOCRF,
+                 FeatureFSqrt, FeatureSTFIWX, Feature64Bit]>;
+def : Processor<"pwr5", G5Itineraries,
+                [DirectivePwr5, FeatureAltivec, FeatureMFOCRF,
+                 FeatureFSqrt, FeatureSTFIWX, Feature64Bit]>;
+def : Processor<"pwr5x", G5Itineraries,
+                [DirectivePwr5x, FeatureAltivec, FeatureMFOCRF,
+                 FeatureFSqrt, FeatureSTFIWX, Feature64Bit]>;
 def : Processor<"pwr6", G5Itineraries,
                 [DirectivePwr6, FeatureAltivec, FeatureMFOCRF,
                  FeatureFSqrt, FeatureSTFIWX,
                  Feature64Bit /*, Feature64BitRegs */]>;
+def : Processor<"pwr6x", G5Itineraries,
+                [DirectivePwr5x, FeatureAltivec, FeatureMFOCRF,
+                 FeatureFSqrt, FeatureSTFIWX, Feature64Bit]>;
 def : Processor<"pwr7", G5Itineraries,
                 [DirectivePwr7, FeatureAltivec, FeatureMFOCRF,
                  FeatureFSqrt, FeatureSTFIWX,
diff --git a/lib/Target/PowerPC/PPCAsmPrinter.cpp b/lib/Target/PowerPC/PPCAsmPrinter.cpp
index adb673b..eae9b7b 100644
--- a/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -464,12 +464,15 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
     // associated TOC entry.  Otherwise reference the symbol directly.
     TmpInst.setOpcode(PPC::LDrs);
     const MachineOperand &MO = MI->getOperand(1);
-    assert((MO.isGlobal() || MO.isJTI()) && "Invalid operand for LDtocL!");
+    assert((MO.isGlobal() || MO.isJTI() || MO.isCPI()) &&
+           "Invalid operand for LDtocL!");
     MCSymbol *MOSymbol = 0;
     if (MO.isJTI())
       MOSymbol = lookUpOrCreateTOCEntry(GetJTISymbol(MO.getIndex()));
-    else {
+    else if (MO.isCPI())
+      MOSymbol = GetCPISymbol(MO.getIndex());
+    else if (MO.isGlobal()) {
      const GlobalValue *GValue = MO.getGlobal();
      const GlobalAlias *GAlias = dyn_cast<GlobalAlias>(GValue);
      const GlobalValue *RealGValue = GAlias ?
@@ -732,14 +735,14 @@ void PPCLinuxAsmPrinter::EmitFunctionEntryLabel() {
   // Generates a R_PPC64_ADDR64 (from FK_DATA_8) relocation for the function
   // entry point.
   OutStreamer.EmitValue(MCSymbolRefExpr::Create(Symbol1, OutContext),
-                        8/*size*/, 0/*addrspace*/);
+                        8 /*size*/);
   MCSymbol *Symbol2 = OutContext.GetOrCreateSymbol(StringRef(".TOC."));
   // Generates a R_PPC64_TOC relocation for TOC base insertion.
   OutStreamer.EmitValue(MCSymbolRefExpr::Create(Symbol2,
                         MCSymbolRefExpr::VK_PPC_TOC, OutContext),
-                        8/*size*/, 0/*addrspace*/);
+                        8/*size*/);
   // Emit a null environment pointer.
-  OutStreamer.EmitIntValue(0, 8 /* size */, 0 /* addrspace */);
+  OutStreamer.EmitIntValue(0, 8 /* size */);
   OutStreamer.SwitchSection(Current);
 
   MCSymbol *RealFnSym = OutContext.GetOrCreateSymbol(
@@ -768,6 +771,25 @@ bool PPCLinuxAsmPrinter::doFinalization(Module &M) {
     }
   }
 
+  MachineModuleInfoELF &MMIELF =
+    MMI->getObjFileInfo<MachineModuleInfoELF>();
+
+  MachineModuleInfoELF::SymbolListTy Stubs = MMIELF.GetGVStubList();
+  if (!Stubs.empty()) {
+    OutStreamer.SwitchSection(getObjFileLowering().getDataSection());
+    for (unsigned i = 0, e = Stubs.size(); i != e; ++i) {
+      // L_foo$stub:
+      OutStreamer.EmitLabel(Stubs[i].first);
+      // .long _foo
+      OutStreamer.EmitValue(MCSymbolRefExpr::Create(Stubs[i].second.getPointer(),
+                                                    OutContext),
+                            isPPC64 ? 8 : 4/*size*/, 0/*addrspace*/);
+    }
+
+    Stubs.clear();
+    OutStreamer.AddBlankLine();
+  }
+
   return AsmPrinter::doFinalization(M);
 }
 
@@ -802,7 +824,12 @@ void PPCDarwinAsmPrinter::EmitStartOfAsmFile(Module &M) {
     "ppcA2",
     "ppce500mc",
    "ppce5500",
+    "power3",
+    "power4",
+    "power5",
+    "power5x",
     "power6",
+    "power6x",
     "power7",
     "ppc64"
   };
@@ -817,8 +844,11 @@
   assert(Directive <= PPC::DIR_64 && "Directive out of range.");
 
   // FIXME: This is a total hack, finish mc'izing the PPC backend.
-  if (OutStreamer.hasRawTextSupport())
+  if (OutStreamer.hasRawTextSupport()) {
+    assert(Directive < sizeof(CPUDirectives) / sizeof(*CPUDirectives) &&
+           "CPUDirectives[] might not be up-to-date!");
     OutStreamer.EmitRawText("\t.machine " + Twine(CPUDirectives[Directive]));
+  }
 
   // Prime text sections so they are adjacent.  This reduces the likelihood a
   // large data or debug section causes a branch to exceed 16M limit.
@@ -1031,7 +1061,7 @@ bool PPCDarwinAsmPrinter::doFinalization(Module &M) {
 
       if (MCSym.getInt())
         // External to current translation unit.
-        OutStreamer.EmitIntValue(0, isPPC64 ? 8 : 4/*size*/, 0/*addrspace*/);
+        OutStreamer.EmitIntValue(0, isPPC64 ? 8 : 4/*size*/);
       else
         // Internal to current translation unit.
        //
@@ -1041,7 +1071,7 @@ bool PPCDarwinAsmPrinter::doFinalization(Module &M) {
         // fill in the value for the NLP in those cases.
         OutStreamer.EmitValue(MCSymbolRefExpr::Create(MCSym.getPointer(),
                                                       OutContext),
-                              isPPC64 ? 8 : 4/*size*/, 0/*addrspace*/);
+                              isPPC64 ? 8 : 4/*size*/);
     }
 
     Stubs.clear();
@@ -1060,7 +1090,7 @@ bool PPCDarwinAsmPrinter::doFinalization(Module &M) {
       OutStreamer.EmitValue(MCSymbolRefExpr::
                             Create(Stubs[i].second.getPointer(), OutContext),
-                            isPPC64 ? 8 : 4/*size*/, 0/*addrspace*/);
+                            isPPC64 ? 8 : 4/*size*/);
     }
 
     Stubs.clear();
diff --git a/lib/Target/PowerPC/PPCBranchSelector.cpp b/lib/Target/PowerPC/PPCBranchSelector.cpp
index 9911575..bd1c378 100644
--- a/lib/Target/PowerPC/PPCBranchSelector.cpp
+++ b/lib/Target/PowerPC/PPCBranchSelector.cpp
@@ -28,10 +28,16 @@ using namespace llvm;
 
 STATISTIC(NumExpanded, "Number of branches expanded to long format");
 
+namespace llvm {
+  void initializePPCBSelPass(PassRegistry&);
+}
+
 namespace {
   struct PPCBSel : public MachineFunctionPass {
     static char ID;
-    PPCBSel() : MachineFunctionPass(ID) {}
+    PPCBSel() : MachineFunctionPass(ID) {
+      initializePPCBSelPass(*PassRegistry::getPassRegistry());
+    }
 
     /// BlockSizes - The sizes of the basic blocks in the function.
     std::vector<unsigned> BlockSizes;
@@ -45,6 +51,9 @@ namespace {
   char PPCBSel::ID = 0;
 }
 
+INITIALIZE_PASS(PPCBSel, "ppc-branch-select", "PowerPC Branch Selector",
+                false, false)
+
 /// createPPCBranchSelectionPass - returns an instance of the Branch Selection
 /// Pass
 ///
diff --git a/lib/Target/PowerPC/PPCCTRLoops.cpp b/lib/Target/PowerPC/PPCCTRLoops.cpp
index a74932c..b98cc48 100644
--- a/lib/Target/PowerPC/PPCCTRLoops.cpp
+++ b/lib/Target/PowerPC/PPCCTRLoops.cpp
@@ -54,6 +54,10 @@ using namespace llvm;
 
 STATISTIC(NumCTRLoops, "Number of loops converted to CTR loops");
 
+namespace llvm {
+  void initializePPCCTRLoopsPass(PassRegistry&);
+}
+
 namespace {
   class CountValue;
   struct PPCCTRLoops : public MachineFunctionPass {
@@ -64,7 +68,9 @@ namespace {
 
   public:
     static char ID;   // Pass identification, replacement for typeid
-    PPCCTRLoops() : MachineFunctionPass(ID) {}
+    PPCCTRLoops() : MachineFunctionPass(ID) {
+      initializePPCCTRLoopsPass(*PassRegistry::getPassRegistry());
+    }
 
     virtual bool runOnMachineFunction(MachineFunction &MF);
 
@@ -174,6 +180,12 @@ namespace {
 };
 } // end anonymous namespace
 
+INITIALIZE_PASS_BEGIN(PPCCTRLoops, "ppc-ctr-loops", "PowerPC CTR Loops",
+                      false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_END(PPCCTRLoops, "ppc-ctr-loops", "PowerPC CTR Loops",
+                    false, false)
 
 /// isCompareEquals - Returns true if the instruction is a compare equals
 /// instruction with an immediate operand.
diff --git a/lib/Target/PowerPC/PPCCallingConv.td b/lib/Target/PowerPC/PPCCallingConv.td
index 3f87e88..caeb179 100644
--- a/lib/Target/PowerPC/PPCCallingConv.td
+++ b/lib/Target/PowerPC/PPCCallingConv.td
@@ -27,9 +27,10 @@ def RetCC_PPC : CallingConv<[
   CCIfType<[i32], CCAssignToReg<[R3, R4, R5, R6, R7, R8, R9, R10]>>,
   CCIfType<[i64], CCAssignToReg<[X3, X4, X5, X6]>>,
+  CCIfType<[i128], CCAssignToReg<[X3, X4, X5, X6]>>,
 
-  CCIfType<[f32], CCAssignToReg<[F1]>>,
-  CCIfType<[f64], CCAssignToReg<[F1, F2]>>,
+  CCIfType<[f32], CCAssignToReg<[F1, F2]>>,
+  CCIfType<[f64], CCAssignToReg<[F1, F2, F3, F4]>>,
 
   // Vector types are always returned in V2.
   CCIfType<[v16i8, v8i16, v4i32, v4f32], CCAssignToReg<[V2]>>
@@ -37,49 +38,20 @@ def RetCC_PPC : CallingConv<[
 
 //===----------------------------------------------------------------------===//
-// PowerPC Argument Calling Conventions
-//===----------------------------------------------------------------------===//
-/*
-def CC_PPC : CallingConv<[
-  // The first 8 integer arguments are passed in integer registers.
-  CCIfType<[i32], CCAssignToReg<[R3, R4, R5, R6, R7, R8, R9, R10]>>,
-  CCIfType<[i64], CCAssignToReg<[X3, X4, X5, X6, X7, X8, X9, X10]>>,
-
-  // Common sub-targets passes FP values in F1 - F13
-  CCIfType<[f32, f64],
-           CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8,F9,F10,F11,F12,F13]>>,
-
-  // The first 12 Vector arguments are passed in altivec registers.
-  CCIfType<[v16i8, v8i16, v4i32, v4f32],
-           CCAssignToReg<[V2, V3, V4, V5, V6, V7, V8, V9, V10,V11,V12,V13]>>
-
-/*
-  // Integer/FP values get stored in stack slots that are 8 bytes in size and
-  // 8-byte aligned if there are no more registers to hold them.
-  CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>,
-
-  // Vectors get 16-byte stack slots that are 16-byte aligned.
-  CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
-           CCAssignToStack<16, 16>>*/
-]>;
-
-*/
-
-//===----------------------------------------------------------------------===//
-// PowerPC System V Release 4 ABI
+// PowerPC System V Release 4 32-bit ABI
 //===----------------------------------------------------------------------===//
 
-def CC_PPC_SVR4_Common : CallingConv<[
+def CC_PPC32_SVR4_Common : CallingConv<[
   // The ABI requires i64 to be passed in two adjacent registers with the first
   // register having an odd register number.
-  CCIfType<[i32], CCIfSplit<CCCustom<"CC_PPC_SVR4_Custom_AlignArgRegs">>>,
+  CCIfType<[i32], CCIfSplit<CCCustom<"CC_PPC32_SVR4_Custom_AlignArgRegs">>>,
 
   // The first 8 integer arguments are passed in integer registers.
   CCIfType<[i32], CCAssignToReg<[R3, R4, R5, R6, R7, R8, R9, R10]>>,
 
   // Make sure the i64 words from a long double are either both passed in
   // registers or both passed on the stack.
-  CCIfType<[f64], CCIfSplit<CCCustom<"CC_PPC_SVR4_Custom_AlignFPArgRegs">>>,
+  CCIfType<[f64], CCIfSplit<CCCustom<"CC_PPC32_SVR4_Custom_AlignFPArgRegs">>>,
 
   // FP values are passed in F1 - F8.
   CCIfType<[f32, f64], CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>,
@@ -100,18 +72,18 @@ def CC_PPC_SVR4_Common : CallingConv<[
 // This calling convention puts vector arguments always on the stack. It is used
 // to assign vector arguments which belong to the variable portion of the
 // parameter list of a variable argument function.
-def CC_PPC_SVR4_VarArg : CallingConv<[
-  CCDelegateTo<CC_PPC_SVR4_Common>
+def CC_PPC32_SVR4_VarArg : CallingConv<[
+  CCDelegateTo<CC_PPC32_SVR4_Common>
 ]>;
 
-// In contrast to CC_PPC_SVR4_VarArg, this calling convention first tries to put
-// vector arguments in vector registers before putting them on the stack.
-def CC_PPC_SVR4 : CallingConv<[
+// In contrast to CC_PPC32_SVR4_VarArg, this calling convention first tries to
+// put vector arguments in vector registers before putting them on the stack.
+def CC_PPC32_SVR4 : CallingConv<[
   // The first 12 Vector arguments are passed in AltiVec registers.
   CCIfType<[v16i8, v8i16, v4i32, v4f32],
            CCAssignToReg<[V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13]>>,
 
-  CCDelegateTo<CC_PPC_SVR4_Common>
+  CCDelegateTo<CC_PPC32_SVR4_Common>
 ]>;
 
 // Helper "calling convention" to handle aggregate by value arguments.
@@ -122,15 +94,15 @@ def CC_PPC_SVR4 : CallingConv<[
 // Still, the address of the aggregate copy in the callers stack frame is passed
 // in a GPR (or in the parameter list area if all GPRs are allocated) from the
 // caller to the callee. The location for the address argument is assigned by
-// the CC_PPC_SVR4 calling convention.
+// the CC_PPC32_SVR4 calling convention.
 //
-// The only purpose of CC_PPC_SVR4_Custom_Dummy is to skip arguments which are
+// The only purpose of CC_PPC32_SVR4_Custom_Dummy is to skip arguments which are
 // not passed by value.
-def CC_PPC_SVR4_ByVal : CallingConv<[
+def CC_PPC32_SVR4_ByVal : CallingConv<[
   CCIfByVal<CCPassByVal<4, 4>>,
 
-  CCCustom<"CC_PPC_SVR4_Custom_Dummy">
+  CCCustom<"CC_PPC32_SVR4_Custom_Dummy">
 ]>;
 
 def CSR_Darwin32 : CalleeSavedRegs<(add R13, R14, R15, R16, R17, R18, R19, R20,
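CC_PPC32_SVR4_Common above keeps the rule that an i64 split into two i32 halves must land in an adjacent GPR pair whose first register is odd-numbered (R3, R5, R7, R9), so the custom handler skips a register when necessary. A simplified model of that assignment, not the real CCCustom logic:

# Sketch of the register-pair alignment enforced by
# CC_PPC32_SVR4_Custom_AlignArgRegs: the argument GPRs are R3..R10, and
# an i64's first half must start at an odd-numbered register.
ARG_GPRS = [3, 4, 5, 6, 7, 8, 9, 10]  # R3..R10

def assign_i64(next_free):
    """next_free: index into ARG_GPRS; returns (pair, new_index), with
    pair None when the i64 has to be passed on the stack instead."""
    if next_free < len(ARG_GPRS) and ARG_GPRS[next_free] % 2 == 0:
        next_free += 1                  # skip an even register to align
    if next_free + 1 >= len(ARG_GPRS):
        return None, len(ARG_GPRS)      # not enough registers left
    return (ARG_GPRS[next_free], ARG_GPRS[next_free + 1]), next_free + 2

print(assign_i64(0))  # ((3, 4), 2)  -- pair starts at R3
print(assign_i64(1))  # ((5, 6), 4)  -- R4 is skipped, pair starts at R5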
diff --git a/lib/Target/PowerPC/PPCFrameLowering.cpp b/lib/Target/PowerPC/PPCFrameLowering.cpp
index 5901f36..0a396e6 100644
--- a/lib/Target/PowerPC/PPCFrameLowering.cpp
+++ b/lib/Target/PowerPC/PPCFrameLowering.cpp
@@ -119,12 +119,21 @@ static void HandleVRSaveUpdate(MachineInstr *MI, const TargetInstrInfo &TII) {
     if (VRRegNo[RegNo] == I->first)        // If this really is a vector reg.
       UsedRegMask &= ~(1 << (31-RegNo));   // Doesn't need to be marked.
   }
-  for (MachineRegisterInfo::liveout_iterator
-       I = MF->getRegInfo().liveout_begin(),
-       E = MF->getRegInfo().liveout_end(); I != E; ++I) {
-    unsigned RegNo = getPPCRegisterNumbering(*I);
-    if (VRRegNo[RegNo] == *I)              // If this really is a vector reg.
-      UsedRegMask &= ~(1 << (31-RegNo));   // Doesn't need to be marked.
+
+  // Live out registers appear as use operands on return instructions.
+  for (MachineFunction::const_iterator BI = MF->begin(), BE = MF->end();
+       UsedRegMask != 0 && BI != BE; ++BI) {
+    const MachineBasicBlock &MBB = *BI;
+    if (MBB.empty() || !MBB.back().isReturn())
+      continue;
+    const MachineInstr &Ret = MBB.back();
+    for (unsigned I = 0, E = Ret.getNumOperands(); I != E; ++I) {
+      const MachineOperand &MO = Ret.getOperand(I);
+      if (!MO.isReg() || !PPC::VRRCRegClass.contains(MO.getReg()))
+        continue;
+      unsigned RegNo = getPPCRegisterNumbering(MO.getReg());
+      UsedRegMask &= ~(1 << (31-RegNo));
+    }
   }
 
   // If no registers are used, turn this into a copy.
@@ -198,13 +207,14 @@ void PPCFrameLowering::determineFrameLayout(MachineFunction &MF) const {
   // to adjust the stack pointer (we fit in the Red Zone).  For 64-bit
   // SVR4, we also require a stack frame if we need to spill the CR,
   // since this spill area is addressed relative to the stack pointer.
+  // The 32-bit SVR4 ABI has no Red Zone. However, it can still generate
+  // stackless code if all local vars are reg-allocated.
   bool DisableRedZone = MF.getFunction()->getAttributes().
     hasAttribute(AttributeSet::FunctionIndex, Attribute::NoRedZone);
-  // FIXME SVR4 The 32-bit SVR4 ABI has no red zone.  However, it can
-  // still generate stackless code if all local vars are reg-allocated.
-  // Try: (FrameSize <= 224
-  //       || (FrameSize == 0 && Subtarget.isPPC32 && Subtarget.isSVR4ABI()))
   if (!DisableRedZone &&
+      (Subtarget.isPPC64() ||                      // 32-bit SVR4, no stack-
+       !Subtarget.isSVR4ABI() ||                   //   allocated locals.
+       FrameSize == 0) &&
       FrameSize <= 224 &&                          // Fits in red zone.
       !MFI->hasVarSizedObjects() &&                // No dynamic alloca.
       !MFI->adjustsStack() &&                      // No calls.
@@ -777,7 +787,8 @@ PPCFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
   PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
   unsigned LR = RegInfo->getRARegister();
   FI->setMustSaveLR(MustSaveLR(MF, LR));
-  MF.getRegInfo().setPhysRegUnused(LR);
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  MRI.setPhysRegUnused(LR);
 
   //  Save R31 if necessary
   int FPSI = FI->getFramePointerSaveIndex();
@@ -802,6 +813,16 @@ PPCFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
     MFI->CreateFixedObject(-1 * TCSPDelta, TCSPDelta, true);
   }
 
+  // For 32-bit SVR4, allocate the nonvolatile CR spill slot iff the
+  // function uses CR 2, 3, or 4.
+  if (!isPPC64 && !isDarwinABI &&
+      (MRI.isPhysRegUsed(PPC::CR2) ||
+       MRI.isPhysRegUsed(PPC::CR3) ||
+       MRI.isPhysRegUsed(PPC::CR4))) {
+    int FrameIdx = MFI->CreateFixedObject((uint64_t)4, (int64_t)-4, true);
+    FI->setCRSpillFrameIndex(FrameIdx);
+  }
+
   // Reserve a slot closest to SP or frame pointer if we have a dynalloc or
   // a large stack, which will require scavenging a register to materialize a
   // large offset.
@@ -1115,6 +1136,47 @@ restoreCRs(bool isPPC64, bool CR2Spilled, bool CR3Spilled, bool CR4Spilled,
            .addReg(MoveReg));
 }
 
+void PPCFrameLowering::
+eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator I) const {
+  const PPCInstrInfo &TII =
+    *static_cast<const PPCInstrInfo*>(MF.getTarget().getInstrInfo());
+  if (MF.getTarget().Options.GuaranteedTailCallOpt &&
+      I->getOpcode() == PPC::ADJCALLSTACKUP) {
+    // Add (actually subtract) back the amount the callee popped on return.
+    if (int CalleeAmt = I->getOperand(1).getImm()) {
+      bool is64Bit = Subtarget.isPPC64();
+      CalleeAmt *= -1;
+      unsigned StackReg = is64Bit ? PPC::X1 : PPC::R1;
+      unsigned TmpReg = is64Bit ? PPC::X0 : PPC::R0;
+      unsigned ADDIInstr = is64Bit ? PPC::ADDI8 : PPC::ADDI;
+      unsigned ADDInstr = is64Bit ? PPC::ADD8 : PPC::ADD4;
+      unsigned LISInstr = is64Bit ? PPC::LIS8 : PPC::LIS;
+      unsigned ORIInstr = is64Bit ? PPC::ORI8 : PPC::ORI;
+      MachineInstr *MI = I;
+      DebugLoc dl = MI->getDebugLoc();
+
+      if (isInt<16>(CalleeAmt)) {
+        BuildMI(MBB, I, dl, TII.get(ADDIInstr), StackReg)
+          .addReg(StackReg, RegState::Kill)
+          .addImm(CalleeAmt);
+      } else {
+        MachineBasicBlock::iterator MBBI = I;
+        BuildMI(MBB, MBBI, dl, TII.get(LISInstr), TmpReg)
+          .addImm(CalleeAmt >> 16);
+        BuildMI(MBB, MBBI, dl, TII.get(ORIInstr), TmpReg)
+          .addReg(TmpReg, RegState::Kill)
+          .addImm(CalleeAmt & 0xFFFF);
+        BuildMI(MBB, MBBI, dl, TII.get(ADDInstr), StackReg)
+          .addReg(StackReg, RegState::Kill)
+          .addReg(TmpReg);
+      }
+    }
+  }
+  // Simply discard ADJCALLSTACKDOWN, ADJCALLSTACKUP instructions.
+  MBB.erase(I);
+}
+
 bool
 PPCFrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
                                               MachineBasicBlock::iterator MI,
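eliminateCallFramePseudoInstr above picks between two ways of re-adjusting the stack pointer by the callee-popped amount: a single ADDI when the amount fits in a signed 16-bit immediate, otherwise LIS (load the top 16 bits, shifted) plus ORI (or in the low 16 bits) to materialize the constant in a scratch register, followed by ADD. A sketch of that decision in isolation, with the register names fixed to the 32-bit case for brevity:

# Sketch of the immediate-materialization choice made above.  LIS puts
# its operand in the high halfword of the register and ORI fills the
# low halfword, so (amt >> 16) and (amt & 0xFFFF) reassemble amt.
def is_int16(x):
    return -(1 << 15) <= x < (1 << 15)

def materialize_adjust(callee_amt):
    amt = -callee_amt                  # "add (actually subtract) back"
    if is_int16(amt):
        return ["addi r1, r1, %d" % amt]
    hi, lo = amt >> 16, amt & 0xFFFF
    return ["lis r0, %d" % hi, "ori r0, r0, %d" % lo, "add r1, r1, r0"]

print(materialize_adjust(-32000))   # fits: a single addi
print(materialize_adjust(-100000))  # too big: lis/ori/add sequence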
32 : 16, 0), Subtarget(sti) { } @@ -50,6 +51,10 @@ public: const std::vector<CalleeSavedInfo> &CSI, const TargetRegisterInfo *TRI) const; + void eliminateCallFramePseudoInstr(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const; + bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const std::vector<CalleeSavedInfo> &CSI, diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index 762b346..17bea8a 100644 --- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -23,9 +23,9 @@ #include "llvm/CodeGen/SelectionDAGISel.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" +#include "llvm/IR/GlobalAlias.h" #include "llvm/IR/GlobalValue.h" #include "llvm/IR/GlobalVariable.h" -#include "llvm/IR/GlobalAlias.h" #include "llvm/IR/Intrinsics.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" @@ -34,6 +34,10 @@ #include "llvm/Target/TargetOptions.h" using namespace llvm; +namespace llvm { + void initializePPCDAGToDAGISelPass(PassRegistry&); +} + namespace { //===--------------------------------------------------------------------===// /// PPCDAGToDAGISel - PPC specific code to select PPC machine @@ -48,7 +52,9 @@ namespace { explicit PPCDAGToDAGISel(PPCTargetMachine &tm) : SelectionDAGISel(tm), TM(tm), PPCLowering(*TM.getTargetLowering()), - PPCSubTarget(*TM.getSubtargetImpl()) {} + PPCSubTarget(*TM.getSubtargetImpl()) { + initializePPCDAGToDAGISelPass(*PassRegistry::getPassRegistry()); + } virtual bool runOnMachineFunction(MachineFunction &MF) { // Make sure we re-emit a set of the global base reg if necessary @@ -61,6 +67,8 @@ namespace { return true; } + virtual void PostprocessISelDAG(); + /// getI32Imm - Return a target constant with the specified value, of type /// i32. inline SDValue getI32Imm(unsigned Imm) { @@ -1273,16 +1281,17 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) { case PPCISD::TOC_ENTRY: { assert (PPCSubTarget.isPPC64() && "Only supported for 64-bit ABI"); - // For medium code model, we generate two instructions as described - // below. Otherwise we allow SelectCodeCommon to handle this, selecting - // one of LDtoc, LDtocJTI, and LDtocCPT. - if (TM.getCodeModel() != CodeModel::Medium) + // For medium and large code model, we generate two instructions as + // described below. Otherwise we allow SelectCodeCommon to handle this, + // selecting one of LDtoc, LDtocJTI, and LDtocCPT. + CodeModel::Model CModel = TM.getCodeModel(); + if (CModel != CodeModel::Medium && CModel != CodeModel::Large) break; // The first source operand is a TargetGlobalAddress or a // TargetJumpTable. 
If it is an externally defined symbol, a symbol
   // with common linkage, a function address, or a jump table address,
-  // we generate:
+  // or if we are generating code for large code model, we generate:
   //   LDtocL(<ga:@sym>, ADDIStocHA(%X2, <ga:@sym>))
   // Otherwise we generate:
   //   ADDItocL(ADDIStocHA(%X2, <ga:@sym>), <ga:@sym>)
@@ -1291,7 +1300,7 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
     SDNode *Tmp = CurDAG->getMachineNode(PPC::ADDIStocHA, dl, MVT::i64,
                                          TOCbase, GA);
-    if (isa<JumpTableSDNode>(GA))
+    if (isa<JumpTableSDNode>(GA) || CModel == CodeModel::Large)
       return CurDAG->getMachineNode(PPC::LDtocL, dl, MVT::i64, GA,
                                     SDValue(Tmp, 0));
@@ -1316,11 +1325,231 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
     return CurDAG->getMachineNode(PPC::ADDItocL, dl, MVT::i64,
                                   SDValue(Tmp, 0), GA);
   }
+  case PPCISD::VADD_SPLAT: {
+    // This expands into one of three sequences, depending on whether
+    // the first operand is odd or even, positive or negative.
+    assert(isa<ConstantSDNode>(N->getOperand(0)) &&
+           isa<ConstantSDNode>(N->getOperand(1)) &&
+           "Invalid operand on VADD_SPLAT!");
+
+    int Elt = N->getConstantOperandVal(0);
+    int EltSize = N->getConstantOperandVal(1);
+    unsigned Opc1, Opc2, Opc3;
+    EVT VT;
+
+    if (EltSize == 1) {
+      Opc1 = PPC::VSPLTISB;
+      Opc2 = PPC::VADDUBM;
+      Opc3 = PPC::VSUBUBM;
+      VT = MVT::v16i8;
+    } else if (EltSize == 2) {
+      Opc1 = PPC::VSPLTISH;
+      Opc2 = PPC::VADDUHM;
+      Opc3 = PPC::VSUBUHM;
+      VT = MVT::v8i16;
+    } else {
+      assert(EltSize == 4 && "Invalid element size on VADD_SPLAT!");
+      Opc1 = PPC::VSPLTISW;
+      Opc2 = PPC::VADDUWM;
+      Opc3 = PPC::VSUBUWM;
+      VT = MVT::v4i32;
+    }
+
+    if ((Elt & 1) == 0) {
+      // Elt is even, in the range [-32,-18] + [16,30].
+      //
+      // Convert: VADD_SPLAT elt, size
+      // Into:    tmp = VSPLTIS[BHW] elt
+      //          VADDU[BHW]M tmp, tmp
+      // Where:   [BHW] = B for size = 1, H for size = 2, W for size = 4
+      SDValue EltVal = getI32Imm(Elt >> 1);
+      SDNode *Tmp = CurDAG->getMachineNode(Opc1, dl, VT, EltVal);
+      SDValue TmpVal = SDValue(Tmp, 0);
+      return CurDAG->getMachineNode(Opc2, dl, VT, TmpVal, TmpVal);
+
+    } else if (Elt > 0) {
+      // Elt is odd and positive, in the range [17,31].
+      //
+      // Convert: VADD_SPLAT elt, size
+      // Into:    tmp1 = VSPLTIS[BHW] elt-16
+      //          tmp2 = VSPLTIS[BHW] -16
+      //          VSUBU[BHW]M tmp1, tmp2
+      SDValue EltVal = getI32Imm(Elt - 16);
+      SDNode *Tmp1 = CurDAG->getMachineNode(Opc1, dl, VT, EltVal);
+      EltVal = getI32Imm(-16);
+      SDNode *Tmp2 = CurDAG->getMachineNode(Opc1, dl, VT, EltVal);
+      return CurDAG->getMachineNode(Opc3, dl, VT, SDValue(Tmp1, 0),
+                                    SDValue(Tmp2, 0));
+
+    } else {
+      // Elt is odd and negative, in the range [-31,-17].
+      //
+      // Convert: VADD_SPLAT elt, size
+      // Into:    tmp1 = VSPLTIS[BHW] elt+16
+      //          tmp2 = VSPLTIS[BHW] -16
+      //          VADDU[BHW]M tmp1, tmp2
+      SDValue EltVal = getI32Imm(Elt + 16);
+      SDNode *Tmp1 = CurDAG->getMachineNode(Opc1, dl, VT, EltVal);
+      EltVal = getI32Imm(-16);
+      SDNode *Tmp2 = CurDAG->getMachineNode(Opc1, dl, VT, EltVal);
+      return CurDAG->getMachineNode(Opc2, dl, VT, SDValue(Tmp1, 0),
+                                    SDValue(Tmp2, 0));
+    }
+  }
   }
 
   return SelectCode(N);
 }
 
+/// PostprocessISelDAG - Perform some late peephole optimizations
+/// on the DAG representation.
+void PPCDAGToDAGISel::PostprocessISelDAG() {
+
+  // Skip peepholes at -O0.
+  if (TM.getOptLevel() == CodeGenOpt::None)
+    return;
+
+  // These optimizations are currently supported only for 64-bit SVR4.
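+  // As an illustrative sketch (operands abbreviated), the rewrite performed
+  // below turns the medium code model TOC sequence
+  //
+  //     %addr = ADDItocL %hi, <ga:@sym>
+  //     %val  = LD 0, %addr
+  //
+  // into a single D-form load whose displacement carries the low-part
+  // relocation:
+  //
+  //     %val = LD <ga:@sym>[@toc@l], %hi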
+ if (PPCSubTarget.isDarwin() || !PPCSubTarget.isPPC64()) + return; + + SelectionDAG::allnodes_iterator Position(CurDAG->getRoot().getNode()); + ++Position; + + while (Position != CurDAG->allnodes_begin()) { + SDNode *N = --Position; + // Skip dead nodes and any non-machine opcodes. + if (N->use_empty() || !N->isMachineOpcode()) + continue; + + unsigned FirstOp; + unsigned StorageOpcode = N->getMachineOpcode(); + + switch (StorageOpcode) { + default: continue; + + case PPC::LBZ: + case PPC::LBZ8: + case PPC::LD: + case PPC::LFD: + case PPC::LFS: + case PPC::LHA: + case PPC::LHA8: + case PPC::LHZ: + case PPC::LHZ8: + case PPC::LWA: + case PPC::LWZ: + case PPC::LWZ8: + FirstOp = 0; + break; + + case PPC::STB: + case PPC::STB8: + case PPC::STD: + case PPC::STFD: + case PPC::STFS: + case PPC::STH: + case PPC::STH8: + case PPC::STW: + case PPC::STW8: + FirstOp = 1; + break; + } + + // If this is a load or store with a zero offset, we may be able to + // fold an add-immediate into the memory operation. + if (!isa<ConstantSDNode>(N->getOperand(FirstOp)) || + N->getConstantOperandVal(FirstOp) != 0) + continue; + + SDValue Base = N->getOperand(FirstOp + 1); + if (!Base.isMachineOpcode()) + continue; + + unsigned Flags = 0; + bool ReplaceFlags = true; + + // When the feeding operation is an add-immediate of some sort, + // determine whether we need to add relocation information to the + // target flags on the immediate operand when we fold it into the + // load instruction. + // + // For something like ADDItocL, the relocation information is + // inferred from the opcode; when we process it in the AsmPrinter, + // we add the necessary relocation there. A load, though, can receive + // relocation from various flavors of ADDIxxx, so we need to carry + // the relocation information in the target flags. + switch (Base.getMachineOpcode()) { + default: continue; + + case PPC::ADDI8: + case PPC::ADDI8L: + case PPC::ADDIL: + // In some cases (such as TLS) the relocation information + // is already in place on the operand, so copying the operand + // is sufficient. + ReplaceFlags = false; + // For these cases, the immediate may not be divisible by 4, in + // which case the fold is illegal for DS-form instructions. (The + // other cases provide aligned addresses and are always safe.) + if ((StorageOpcode == PPC::LWA || + StorageOpcode == PPC::LD || + StorageOpcode == PPC::STD) && + (!isa<ConstantSDNode>(Base.getOperand(1)) || + Base.getConstantOperandVal(1) % 4 != 0)) + continue; + break; + case PPC::ADDIdtprelL: + Flags = PPCII::MO_DTPREL16_LO; + break; + case PPC::ADDItlsldL: + Flags = PPCII::MO_TLSLD16_LO; + break; + case PPC::ADDItocL: + Flags = PPCII::MO_TOC16_LO; + break; + } + + // We found an opportunity. Reverse the operands from the add + // immediate and substitute them into the load or store. If + // needed, update the target flags for the immediate operand to + // reflect the necessary relocation information. + DEBUG(dbgs() << "Folding add-immediate into mem-op:\nBase: "); + DEBUG(Base->dump(CurDAG)); + DEBUG(dbgs() << "\nN: "); + DEBUG(N->dump(CurDAG)); + DEBUG(dbgs() << "\n"); + + SDValue ImmOpnd = Base.getOperand(1); + + // If the relocation information isn't already present on the + // immediate operand, add it now. 
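+      // For example, a plain TargetGlobalAddress <ga:@x> taken from an
+      // ADDItocL is rebuilt below with target flag MO_TOC16_LO, so the
+      // AsmPrinter emits x@toc@l on the folded displacement.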
+ if (ReplaceFlags) { + if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(ImmOpnd)) { + DebugLoc dl = GA->getDebugLoc(); + const GlobalValue *GV = GA->getGlobal(); + ImmOpnd = CurDAG->getTargetGlobalAddress(GV, dl, MVT::i64, 0, Flags); + } else if (ConstantPoolSDNode *CP = + dyn_cast<ConstantPoolSDNode>(ImmOpnd)) { + const Constant *C = CP->getConstVal(); + ImmOpnd = CurDAG->getTargetConstantPool(C, MVT::i64, + CP->getAlignment(), + 0, Flags); + } + } + + if (FirstOp == 1) // Store + (void)CurDAG->UpdateNodeOperands(N, N->getOperand(0), ImmOpnd, + Base.getOperand(0), N->getOperand(3)); + else // Load + (void)CurDAG->UpdateNodeOperands(N, ImmOpnd, Base.getOperand(0), + N->getOperand(2)); + + // The add-immediate may now be dead, in which case remove it. + if (Base.getNode()->use_empty()) + CurDAG->RemoveDeadNode(Base.getNode()); + } +} /// createPPCISelDag - This pass converts a legalized DAG into a @@ -1330,3 +1559,14 @@ FunctionPass *llvm::createPPCISelDag(PPCTargetMachine &TM) { return new PPCDAGToDAGISel(TM); } +static void initializePassOnce(PassRegistry &Registry) { + const char *Name = "PowerPC DAG->DAG Pattern Instruction Selection"; + PassInfo *PI = new PassInfo(Name, "ppc-codegen", &SelectionDAGISel::ID, 0, + false, false); + Registry.registerPass(*PI, true); +} + +void llvm::initializePPCDAGToDAGISelPass(PassRegistry &Registry) { + CALL_ONCE_INITIALIZATION(initializePassOnce); +} + diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp index 9966b2c..cf1f459 100644 --- a/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/lib/Target/PowerPC/PPCISelLowering.cpp @@ -36,20 +36,20 @@ #include "llvm/Target/TargetOptions.h" using namespace llvm; -static bool CC_PPC_SVR4_Custom_Dummy(unsigned &ValNo, MVT &ValVT, MVT &LocVT, - CCValAssign::LocInfo &LocInfo, - ISD::ArgFlagsTy &ArgFlags, - CCState &State); -static bool CC_PPC_SVR4_Custom_AlignArgRegs(unsigned &ValNo, MVT &ValVT, - MVT &LocVT, - CCValAssign::LocInfo &LocInfo, - ISD::ArgFlagsTy &ArgFlags, - CCState &State); -static bool CC_PPC_SVR4_Custom_AlignFPArgRegs(unsigned &ValNo, MVT &ValVT, +static bool CC_PPC32_SVR4_Custom_Dummy(unsigned &ValNo, MVT &ValVT, MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, + CCState &State); +static bool CC_PPC32_SVR4_Custom_AlignArgRegs(unsigned &ValNo, MVT &ValVT, MVT &LocVT, CCValAssign::LocInfo &LocInfo, ISD::ArgFlagsTy &ArgFlags, CCState &State); +static bool CC_PPC32_SVR4_Custom_AlignFPArgRegs(unsigned &ValNo, MVT &ValVT, + MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, + CCState &State); static cl::opt<bool> DisablePPCPreinc("disable-ppc-preinc", cl::desc("disable preincrement load/store generation on PPC"), cl::Hidden); @@ -132,11 +132,13 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) // We don't support sin/cos/sqrt/fmod/pow setOperationAction(ISD::FSIN , MVT::f64, Expand); setOperationAction(ISD::FCOS , MVT::f64, Expand); + setOperationAction(ISD::FSINCOS, MVT::f64, Expand); setOperationAction(ISD::FREM , MVT::f64, Expand); setOperationAction(ISD::FPOW , MVT::f64, Expand); setOperationAction(ISD::FMA , MVT::f64, Legal); setOperationAction(ISD::FSIN , MVT::f32, Expand); setOperationAction(ISD::FCOS , MVT::f32, Expand); + setOperationAction(ISD::FSINCOS, MVT::f32, Expand); setOperationAction(ISD::FREM , MVT::f32, Expand); setOperationAction(ISD::FPOW , MVT::f32, Expand); setOperationAction(ISD::FMA , MVT::f32, Legal); @@ -498,15 +500,15 @@ 
PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) // friends. Gcc uses same threshold of 128 bytes (= 32 word stores). if (Subtarget->getDarwinDirective() == PPC::DIR_E500mc || Subtarget->getDarwinDirective() == PPC::DIR_E5500) { - maxStoresPerMemset = 32; - maxStoresPerMemsetOptSize = 16; - maxStoresPerMemcpy = 32; - maxStoresPerMemcpyOptSize = 8; - maxStoresPerMemmove = 32; - maxStoresPerMemmoveOptSize = 8; + MaxStoresPerMemset = 32; + MaxStoresPerMemsetOptSize = 16; + MaxStoresPerMemcpy = 32; + MaxStoresPerMemcpyOptSize = 8; + MaxStoresPerMemmove = 32; + MaxStoresPerMemmoveOptSize = 8; setPrefFunctionAlignment(4); - benefitFromCodePlacementOpt = true; + BenefitFromCodePlacementOpt = true; } } @@ -592,6 +594,7 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { case PPCISD::GET_TLSLD_ADDR: return "PPCISD::GET_TLSLD_ADDR"; case PPCISD::ADDIS_DTPREL_HA: return "PPCISD::ADDIS_DTPREL_HA"; case PPCISD::ADDI_DTPREL_L: return "PPCISD::ADDI_DTPREL_L"; + case PPCISD::VADD_SPLAT: return "PPCISD::VADD_SPLAT"; } } @@ -1746,18 +1749,18 @@ SDValue PPCTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG, #include "PPCGenCallingConv.inc" -static bool CC_PPC_SVR4_Custom_Dummy(unsigned &ValNo, MVT &ValVT, MVT &LocVT, - CCValAssign::LocInfo &LocInfo, - ISD::ArgFlagsTy &ArgFlags, - CCState &State) { +static bool CC_PPC32_SVR4_Custom_Dummy(unsigned &ValNo, MVT &ValVT, MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, + CCState &State) { return true; } -static bool CC_PPC_SVR4_Custom_AlignArgRegs(unsigned &ValNo, MVT &ValVT, - MVT &LocVT, - CCValAssign::LocInfo &LocInfo, - ISD::ArgFlagsTy &ArgFlags, - CCState &State) { +static bool CC_PPC32_SVR4_Custom_AlignArgRegs(unsigned &ValNo, MVT &ValVT, + MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, + CCState &State) { static const uint16_t ArgRegs[] = { PPC::R3, PPC::R4, PPC::R5, PPC::R6, PPC::R7, PPC::R8, PPC::R9, PPC::R10, @@ -1780,11 +1783,11 @@ static bool CC_PPC_SVR4_Custom_AlignArgRegs(unsigned &ValNo, MVT &ValVT, return false; } -static bool CC_PPC_SVR4_Custom_AlignFPArgRegs(unsigned &ValNo, MVT &ValVT, - MVT &LocVT, - CCValAssign::LocInfo &LocInfo, - ISD::ArgFlagsTy &ArgFlags, - CCState &State) { +static bool CC_PPC32_SVR4_Custom_AlignFPArgRegs(unsigned &ValNo, MVT &ValVT, + MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, + CCState &State) { static const uint16_t ArgRegs[] = { PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7, PPC::F8 @@ -1907,7 +1910,7 @@ PPCTargetLowering::LowerFormalArguments_32SVR4( // Reserve space for the linkage area on the stack. CCInfo.AllocateStack(PPCFrameLowering::getLinkageSize(false, false), PtrByteSize); - CCInfo.AnalyzeFormalArguments(Ins, CC_PPC_SVR4); + CCInfo.AnalyzeFormalArguments(Ins, CC_PPC32_SVR4); for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; @@ -1968,7 +1971,7 @@ PPCTargetLowering::LowerFormalArguments_32SVR4( // Reserve stack space for the allocations in CCInfo. CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrByteSize); - CCByValInfo.AnalyzeFormalArguments(Ins, CC_PPC_SVR4_ByVal); + CCByValInfo.AnalyzeFormalArguments(Ins, CC_PPC32_SVR4_ByVal); // Area that is at least reserved in the caller of this function. 
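  // (This is the space the callee may assume its caller reserved: the
  // linkage area allocated first, the parameter locations from CCInfo, and
  // the by-value copy area accumulated in CCByValInfo above.)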
unsigned MinReservedArea = CCByValInfo.getNextStackOffset(); @@ -2160,13 +2163,16 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( SmallVector<SDValue, 8> MemOps; unsigned nAltivecParamsAtEnd = 0; Function::const_arg_iterator FuncArg = MF.getFunction()->arg_begin(); - for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo, ++FuncArg) { + unsigned CurArgIdx = 0; + for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) { SDValue ArgVal; bool needsLoad = false; EVT ObjectVT = Ins[ArgNo].VT; unsigned ObjSize = ObjectVT.getSizeInBits()/8; unsigned ArgSize = ObjSize; ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags; + std::advance(FuncArg, Ins[ArgNo].OrigArgIndex - CurArgIdx); + CurArgIdx = Ins[ArgNo].OrigArgIndex; unsigned CurArgOffset = ArgOffset; @@ -2501,6 +2507,9 @@ PPCTargetLowering::LowerFormalArguments_Darwin( SmallVector<SDValue, 8> MemOps; unsigned nAltivecParamsAtEnd = 0; + // FIXME: FuncArg and Ins[ArgNo] must reference the same argument. + // When passing anonymous aggregates, this is currently not true. + // See LowerFormalArguments_64SVR4 for a fix. Function::const_arg_iterator FuncArg = MF.getFunction()->arg_begin(); for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo, ++FuncArg) { SDValue ArgVal; @@ -3323,7 +3332,7 @@ PPCTargetLowering::FinishCall(CallingConv::ID CallConv, DebugLoc dl, // When performing tail call optimization the callee pops its arguments off // the stack. Account for this here so these bytes can be pushed back on in - // PPCRegisterInfo::eliminateCallFramePseudoInstr. + // PPCFrameLowering::eliminateCallFramePseudoInstr. int BytesCalleePops = (CallConv == CallingConv::Fast && getTargetMachine().Options.GuaranteedTailCallOpt) ? NumBytes : 0; @@ -3339,17 +3348,6 @@ PPCTargetLowering::FinishCall(CallingConv::ID CallConv, DebugLoc dl, // Emit tail call. if (isTailCall) { - // If this is the first return lowered for this function, add the regs - // to the liveout set for the function. - if (DAG.getMachineFunction().getRegInfo().liveout_empty()) { - SmallVector<CCValAssign, 16> RVLocs; - CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), RVLocs, *DAG.getContext()); - CCInfo.AnalyzeCallResult(Ins, RetCC_PPC); - for (unsigned i = 0; i != RVLocs.size(); ++i) - DAG.getMachineFunction().getRegInfo().addLiveOut(RVLocs[i].getLocReg()); - } - assert(((Callee.getOpcode() == ISD::Register && cast<RegisterSDNode>(Callee)->getReg() == PPC::CTR) || Callee.getOpcode() == ISD::TargetExternalSymbol || @@ -3493,11 +3491,11 @@ PPCTargetLowering::LowerCall_32SVR4(SDValue Chain, SDValue Callee, bool Result; if (Outs[i].IsFixed) { - Result = CC_PPC_SVR4(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, - CCInfo); + Result = CC_PPC32_SVR4(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, + CCInfo); } else { - Result = CC_PPC_SVR4_VarArg(i, ArgVT, ArgVT, CCValAssign::Full, - ArgFlags, CCInfo); + Result = CC_PPC32_SVR4_VarArg(i, ArgVT, ArgVT, CCValAssign::Full, + ArgFlags, CCInfo); } if (Result) { @@ -3510,7 +3508,7 @@ PPCTargetLowering::LowerCall_32SVR4(SDValue Chain, SDValue Callee, } } else { // All arguments are treated the same. - CCInfo.AnalyzeCallOperands(Outs, CC_PPC_SVR4); + CCInfo.AnalyzeCallOperands(Outs, CC_PPC32_SVR4); } // Assign locations to all of the outgoing aggregate by value arguments. @@ -3521,7 +3519,7 @@ PPCTargetLowering::LowerCall_32SVR4(SDValue Chain, SDValue Callee, // Reserve stack space for the allocations in CCInfo. 
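  // (The by-value copy area thus begins where CCInfo's allocations end,
  // i.e. past the linkage and parameter save areas.)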
CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrByteSize); - CCByValInfo.AnalyzeCallOperands(Outs, CC_PPC_SVR4_ByVal); + CCByValInfo.AnalyzeCallOperands(Outs, CC_PPC32_SVR4_ByVal); // Size of the linkage area, parameter list area and the part of the local // space variable where copies of aggregates which are passed by value are @@ -4415,14 +4413,8 @@ PPCTargetLowering::LowerReturn(SDValue Chain, getTargetMachine(), RVLocs, *DAG.getContext()); CCInfo.AnalyzeReturn(Outs, RetCC_PPC); - // If this is the first return lowered for this function, add the regs to the - // liveout set for the function. - if (DAG.getMachineFunction().getRegInfo().liveout_empty()) { - for (unsigned i = 0; i != RVLocs.size(); ++i) - DAG.getMachineFunction().getRegInfo().addLiveOut(RVLocs[i].getLocReg()); - } - SDValue Flag; + SmallVector<SDValue, 4> RetOps(1, Chain); // Copy the result values into the output registers. for (unsigned i = 0; i != RVLocs.size(); ++i) { @@ -4447,12 +4439,17 @@ PPCTargetLowering::LowerReturn(SDValue Chain, Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag); Flag = Chain.getValue(1); + RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); } + RetOps[0] = Chain; // Update chain. + + // Add the flag if we have it. if (Flag.getNode()) - return DAG.getNode(PPCISD::RET_FLAG, dl, MVT::Other, Chain, Flag); - else - return DAG.getNode(PPCISD::RET_FLAG, dl, MVT::Other, Chain); + RetOps.push_back(Flag); + + return DAG.getNode(PPCISD::RET_FLAG, dl, MVT::Other, + &RetOps[0], RetOps.size()); } SDValue PPCTargetLowering::LowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG, @@ -5028,11 +5025,21 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op, // Two instruction sequences. // If this value is in the range [-32,30] and is even, use: - // tmp = VSPLTI[bhw], result = add tmp, tmp - if (SextVal >= -32 && SextVal <= 30 && (SextVal & 1) == 0) { - SDValue Res = BuildSplatI(SextVal >> 1, SplatSize, MVT::Other, DAG, dl); - Res = DAG.getNode(ISD::ADD, dl, Res.getValueType(), Res, Res); - return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res); + // VSPLTI[bhw](val/2) + VSPLTI[bhw](val/2) + // If this value is in the range [17,31] and is odd, use: + // VSPLTI[bhw](val-16) - VSPLTI[bhw](-16) + // If this value is in the range [-31,-17] and is odd, use: + // VSPLTI[bhw](val+16) + VSPLTI[bhw](-16) + // Note the last two are three-instruction sequences. + if (SextVal >= -32 && SextVal <= 31) { + // To avoid having these optimizations undone by constant folding, + // we convert to a pseudo that will be expanded later into one of + // the above forms. + SDValue Elt = DAG.getConstant(SextVal, MVT::i32); + EVT VT = Op.getValueType(); + int Size = VT == MVT::v16i8 ? 1 : (VT == MVT::v8i16 ? 2 : 4); + SDValue EltSize = DAG.getConstant(Size, MVT::i32); + return DAG.getNode(PPCISD::VADD_SPLAT, dl, VT, Elt, EltSize); } // If this is 0x8000_0000 x 4, turn into vspltisw + vslw. If it is @@ -5128,23 +5135,6 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op, } } - // Three instruction sequences. - - // Odd, in range [17,31]: (vsplti C)-(vsplti -16). - if (SextVal >= 0 && SextVal <= 31) { - SDValue LHS = BuildSplatI(SextVal-16, SplatSize, MVT::Other, DAG, dl); - SDValue RHS = BuildSplatI(-16, SplatSize, MVT::Other, DAG, dl); - LHS = DAG.getNode(ISD::SUB, dl, LHS.getValueType(), LHS, RHS); - return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), LHS); - } - // Odd, in range [-31,-17]: (vsplti C)+(vsplti -16). 
- if (SextVal >= -31 && SextVal <= 0) { - SDValue LHS = BuildSplatI(SextVal+16, SplatSize, MVT::Other, DAG, dl); - SDValue RHS = BuildSplatI(-16, SplatSize, MVT::Other, DAG, dl); - LHS = DAG.getNode(ISD::ADD, dl, LHS.getValueType(), LHS, RHS); - return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), LHS); - } - return SDValue(); } diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h index 12b3df7..f5d418c 100644 --- a/lib/Target/PowerPC/PPCISelLowering.h +++ b/lib/Target/PowerPC/PPCISelLowering.h @@ -237,6 +237,12 @@ namespace llvm { /// sym@got@dtprel@l. ADDI_DTPREL_L, + /// VRRC = VADD_SPLAT Elt, EltSize - Temporary node to be expanded + /// during instruction selection to optimize a BUILD_VECTOR into + /// operations on splats. This is necessary to avoid losing these + /// optimizations due to constant folding. + VADD_SPLAT, + /// STD_32 - This is the STD instruction for use with "32-bit" registers. STD_32 = ISD::FIRST_TARGET_MEMORY_OPCODE, @@ -252,13 +258,14 @@ namespace llvm { /// or i32. LBRX, - /// G8RC = ADDIS_TOC_HA %X2, Symbol - For medium code model, produces - /// an ADDIS8 instruction that adds the TOC base register to sym@toc@ha. + /// G8RC = ADDIS_TOC_HA %X2, Symbol - For medium and large code model, + /// produces an ADDIS8 instruction that adds the TOC base register to + /// sym@toc@ha. ADDIS_TOC_HA, - /// G8RC = LD_TOC_L Symbol, G8RReg - For medium code model, produces a - /// LD instruction with base register G8RReg and offset sym@toc@l. - /// Preceded by an ADDIS_TOC_HA to form a full 32-bit offset. + /// G8RC = LD_TOC_L Symbol, G8RReg - For medium and large code model, + /// produces a LD instruction with base register G8RReg and offset + /// sym@toc@l. Preceded by an ADDIS_TOC_HA to form a full 32-bit offset. LD_TOC_L, /// G8RC = ADDI_TOC_L G8RReg, Symbol - For medium code model, produces diff --git a/lib/Target/PowerPC/PPCInstr64Bit.td b/lib/Target/PowerPC/PPCInstr64Bit.td index 1dd5415..0120130 100644 --- a/lib/Target/PowerPC/PPCInstr64Bit.td +++ b/lib/Target/PowerPC/PPCInstr64Bit.td @@ -701,7 +701,7 @@ def : Pat<(PPCload ixaddr:$src), def : Pat<(PPCload xaddr:$src), (LDX xaddr:$src)>; -// Support for medium code model. +// Support for medium and large code model. def ADDIStocHA: Pseudo<(outs G8RC:$rD), (ins G8RC:$reg, tocentry:$disp), "#ADDIStocHA", [(set G8RC:$rD, diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td index 8c077b7..460e943 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.td +++ b/lib/Target/PowerPC/PPCInstrInfo.td @@ -181,7 +181,7 @@ def PPClarx : SDNode<"PPCISD::LARX", SDT_PPClarx, def PPCstcx : SDNode<"PPCISD::STCX", SDT_PPCstcx, [SDNPHasChain, SDNPMayStore]>; -// Instructions to support medium code model +// Instructions to support medium and large code model def PPCaddisTocHA : SDNode<"PPCISD::ADDIS_TOC_HA", SDTIntBinOp, []>; def PPCldTocL : SDNode<"PPCISD::LD_TOC_L", SDTIntBinOp, [SDNPMayLoad]>; def PPCaddiTocL : SDNode<"PPCISD::ADDI_TOC_L", SDTIntBinOp, []>; @@ -346,7 +346,7 @@ def crbitm: Operand<i8> { // Address operands def memri : Operand<iPTR> { let PrintMethod = "printMemRegImm"; - let MIOperandInfo = (ops i32imm:$imm, ptr_rc:$reg); + let MIOperandInfo = (ops symbolLo:$imm, ptr_rc:$reg); let EncoderMethod = "getMemRIEncoding"; } def memrr : Operand<iPTR> { @@ -355,7 +355,7 @@ def memrr : Operand<iPTR> { } def memrix : Operand<iPTR> { // memri where the imm is shifted 2 bits. 
let PrintMethod = "printMemRegImmShifted"; - let MIOperandInfo = (ops i32imm:$imm, ptr_rc:$reg); + let MIOperandInfo = (ops symbolLo:$imm, ptr_rc:$reg); let EncoderMethod = "getMemRIXEncoding"; } diff --git a/lib/Target/PowerPC/PPCJITInfo.cpp b/lib/Target/PowerPC/PPCJITInfo.cpp index 851de17..cfcd749 100644 --- a/lib/Target/PowerPC/PPCJITInfo.cpp +++ b/lib/Target/PowerPC/PPCJITInfo.cpp @@ -115,7 +115,7 @@ asm( "lwz r2, 208(r1)\n" // stub's frame "lwz r4, 8(r2)\n" // stub's lr "li r5, 0\n" // 0 == 32 bit - "bl _PPCCompilationCallbackC\n" + "bl _LLVMPPCCompilationCallback\n" "mtctr r3\n" // Restore all int arg registers "lwz r10, 204(r1)\n" "lwz r9, 200(r1)\n" @@ -178,7 +178,7 @@ asm( "lwz 5, 104(1)\n" // stub's frame "lwz 4, 4(5)\n" // stub's lr "li 5, 0\n" // 0 == 32 bit - "bl PPCCompilationCallbackC\n" + "bl LLVMPPCCompilationCallback\n" "mtctr 3\n" // Restore all int arg registers "lwz 10, 100(1)\n" "lwz 9, 96(1)\n" @@ -259,10 +259,10 @@ asm( "ld 4, 16(5)\n" // stub's lr "li 5, 1\n" // 1 == 64 bit #ifdef __ELF__ - "bl PPCCompilationCallbackC\n" + "bl LLVMPPCCompilationCallback\n" "nop\n" #else - "bl _PPCCompilationCallbackC\n" + "bl _LLVMPPCCompilationCallback\n" #endif "mtctr 3\n" // Restore all int arg registers @@ -292,9 +292,10 @@ void PPC64CompilationCallback() { #endif extern "C" { -static void* LLVM_ATTRIBUTE_USED PPCCompilationCallbackC(unsigned *StubCallAddrPlus4, - unsigned *OrigCallAddrPlus4, - bool is64Bit) { +LLVM_LIBRARY_VISIBILITY void * +LLVMPPCCompilationCallback(unsigned *StubCallAddrPlus4, + unsigned *OrigCallAddrPlus4, + bool is64Bit) { // Adjust the pointer to the address of the call instruction in the stub // emitted by emitFunctionStub, rather than the instruction after it. unsigned *StubCallAddr = StubCallAddrPlus4 - 1; diff --git a/lib/Target/PowerPC/PPCMCInstLower.cpp b/lib/Target/PowerPC/PPCMCInstLower.cpp index 73f7a2c..9b0df3e 100644 --- a/lib/Target/PowerPC/PPCMCInstLower.cpp +++ b/lib/Target/PowerPC/PPCMCInstLower.cpp @@ -17,6 +17,7 @@ #include "llvm/CodeGen/AsmPrinter.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineModuleInfoImpls.h" +#include "llvm/IR/GlobalValue.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" @@ -114,6 +115,12 @@ static MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol, break; case PPCII::MO_TPREL16_LO: RefKind = MCSymbolRefExpr::VK_PPC_TPREL16_LO; break; + case PPCII::MO_DTPREL16_LO: RefKind = MCSymbolRefExpr::VK_PPC_DTPREL16_LO; + break; + case PPCII::MO_TLSLD16_LO: RefKind = MCSymbolRefExpr::VK_PPC_GOT_TLSLD16_LO; + break; + case PPCII::MO_TOC16_LO: RefKind = MCSymbolRefExpr::VK_PPC_TOC16_LO; + break; } // FIXME: This isn't right, but we don't have a good way to express this in diff --git a/lib/Target/PowerPC/PPCMachineFunctionInfo.h b/lib/Target/PowerPC/PPCMachineFunctionInfo.h index 24caffa..045b375 100644 --- a/lib/Target/PowerPC/PPCMachineFunctionInfo.h +++ b/lib/Target/PowerPC/PPCMachineFunctionInfo.h @@ -71,6 +71,9 @@ class PPCFunctionInfo : public MachineFunctionInfo { /// register for parameter passing. unsigned VarArgsNumFPR; + /// CRSpillFrameIndex - FrameIndex for CR spill slot for 32-bit SVR4. 
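+  /// A single slot serves all of CR2, CR3 and CR4; caching the index here
+  /// makes it per-function state, replacing the mutable CRSpillFrameIdx
+  /// member removed from PPCRegisterInfo below.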
+ int CRSpillFrameIndex; + public: explicit PPCFunctionInfo(MachineFunction &MF) : FramePointerSaveIndex(0), @@ -83,7 +86,8 @@ public: VarArgsFrameIndex(0), VarArgsStackOffset(0), VarArgsNumGPR(0), - VarArgsNumFPR(0) {} + VarArgsNumFPR(0), + CRSpillFrameIndex(0) {} int getFramePointerSaveIndex() const { return FramePointerSaveIndex; } void setFramePointerSaveIndex(int Idx) { FramePointerSaveIndex = Idx; } @@ -125,6 +129,9 @@ public: unsigned getVarArgsNumFPR() const { return VarArgsNumFPR; } void setVarArgsNumFPR(unsigned Num) { VarArgsNumFPR = Num; } + + int getCRSpillFrameIndex() const { return CRSpillFrameIndex; } + void setCRSpillFrameIndex(int idx) { CRSpillFrameIndex = idx; } }; } // end of namespace llvm diff --git a/lib/Target/PowerPC/PPCRegisterInfo.cpp b/lib/Target/PowerPC/PPCRegisterInfo.cpp index 378c147..df245cc 100644 --- a/lib/Target/PowerPC/PPCRegisterInfo.cpp +++ b/lib/Target/PowerPC/PPCRegisterInfo.cpp @@ -71,7 +71,7 @@ PPCRegisterInfo::PPCRegisterInfo(const PPCSubtarget &ST, : PPCGenRegisterInfo(ST.isPPC64() ? PPC::LR8 : PPC::LR, ST.isPPC64() ? 0 : 1, ST.isPPC64() ? 0 : 1), - Subtarget(ST), TII(tii), CRSpillFrameIdx(0) { + Subtarget(ST), TII(tii) { ImmToIdxMap[PPC::LD] = PPC::LDX; ImmToIdxMap[PPC::STD] = PPC::STDX; ImmToIdxMap[PPC::LBZ] = PPC::LBZX; ImmToIdxMap[PPC::STB] = PPC::STBX; ImmToIdxMap[PPC::LHZ] = PPC::LHZX; ImmToIdxMap[PPC::LHA] = PPC::LHAX; @@ -111,11 +111,6 @@ PPCRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { return Subtarget.isPPC64() ? CSR_Darwin64_SaveList : CSR_Darwin32_SaveList; - // For 32-bit SVR4, also initialize the frame index associated with - // the CR spill slot. - if (!Subtarget.isPPC64()) - CRSpillFrameIdx = 0; - return Subtarget.isPPC64() ? CSR_SVR464_SaveList : CSR_SVR432_SaveList; } @@ -222,45 +217,6 @@ PPCRegisterInfo::avoidWriteAfterWrite(const TargetRegisterClass *RC) const { // Stack Frame Processing methods //===----------------------------------------------------------------------===// -void PPCRegisterInfo:: -eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, - MachineBasicBlock::iterator I) const { - if (MF.getTarget().Options.GuaranteedTailCallOpt && - I->getOpcode() == PPC::ADJCALLSTACKUP) { - // Add (actually subtract) back the amount the callee popped on return. - if (int CalleeAmt = I->getOperand(1).getImm()) { - bool is64Bit = Subtarget.isPPC64(); - CalleeAmt *= -1; - unsigned StackReg = is64Bit ? PPC::X1 : PPC::R1; - unsigned TmpReg = is64Bit ? PPC::X0 : PPC::R0; - unsigned ADDIInstr = is64Bit ? PPC::ADDI8 : PPC::ADDI; - unsigned ADDInstr = is64Bit ? PPC::ADD8 : PPC::ADD4; - unsigned LISInstr = is64Bit ? PPC::LIS8 : PPC::LIS; - unsigned ORIInstr = is64Bit ? PPC::ORI8 : PPC::ORI; - MachineInstr *MI = I; - DebugLoc dl = MI->getDebugLoc(); - - if (isInt<16>(CalleeAmt)) { - BuildMI(MBB, I, dl, TII.get(ADDIInstr), StackReg) - .addReg(StackReg, RegState::Kill) - .addImm(CalleeAmt); - } else { - MachineBasicBlock::iterator MBBI = I; - BuildMI(MBB, MBBI, dl, TII.get(LISInstr), TmpReg) - .addImm(CalleeAmt >> 16); - BuildMI(MBB, MBBI, dl, TII.get(ORIInstr), TmpReg) - .addReg(TmpReg, RegState::Kill) - .addImm(CalleeAmt & 0xFFFF); - BuildMI(MBB, MBBI, dl, TII.get(ADDInstr), StackReg) - .addReg(StackReg, RegState::Kill) - .addReg(TmpReg); - } - } - } - // Simply discard ADJCALLSTACKDOWN, ADJCALLSTACKUP instructions. - MBB.erase(I); -} - /// findScratchRegister - Find a 'free' PPC register. Try for a call-clobbered /// register first and then a spilled callee-saved register if that fails. 
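/// (Used, for example, when a frame-index offset must be materialized in a
/// register because it does not fit an instruction's immediate field.)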
static @@ -489,19 +445,14 @@ PPCRegisterInfo::hasReservedSpillSlot(const MachineFunction &MF, // For the nonvolatile condition registers (CR2, CR3, CR4) in an SVR4 // ABI, return true to prevent allocating an additional frame slot. // For 64-bit, the CR save area is at SP+8; the value of FrameIdx = 0 - // is arbitrary and will be subsequently ignored. For 32-bit, we must - // create exactly one stack slot and return its FrameIdx for all - // nonvolatiles. + // is arbitrary and will be subsequently ignored. For 32-bit, we have + // previously created the stack slot if needed, so return its FrameIdx. if (Subtarget.isSVR4ABI() && PPC::CR2 <= Reg && Reg <= PPC::CR4) { - if (Subtarget.isPPC64()) { + if (Subtarget.isPPC64()) FrameIdx = 0; - } else if (CRSpillFrameIdx) { - FrameIdx = CRSpillFrameIdx; - } else { - MachineFrameInfo *MFI = - (const_cast<MachineFunction &>(MF)).getFrameInfo(); - FrameIdx = MFI->CreateFixedObject((uint64_t)4, (int64_t)-4, true); - CRSpillFrameIdx = FrameIdx; + else { + const PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>(); + FrameIdx = FI->getCRSpillFrameIndex(); } return true; } @@ -510,7 +461,8 @@ PPCRegisterInfo::hasReservedSpillSlot(const MachineFunction &MF, void PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, - int SPAdj, RegScavenger *RS) const { + int SPAdj, unsigned FIOperandNum, + RegScavenger *RS) const { assert(SPAdj == 0 && "Unexpected"); // Get the instruction. @@ -524,20 +476,13 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); DebugLoc dl = MI.getDebugLoc(); - // Find out which operand is the frame index. - unsigned FIOperandNo = 0; - while (!MI.getOperand(FIOperandNo).isFI()) { - ++FIOperandNo; - assert(FIOperandNo != MI.getNumOperands() && - "Instr doesn't have FrameIndex operand!"); - } // Take into account whether it's an add or mem instruction - unsigned OffsetOperandNo = (FIOperandNo == 2) ? 1 : 2; + unsigned OffsetOperandNo = (FIOperandNum == 2) ? 1 : 2; if (MI.isInlineAsm()) - OffsetOperandNo = FIOperandNo-1; + OffsetOperandNo = FIOperandNum-1; // Get the frame index. - int FrameIndex = MI.getOperand(FIOperandNo).getIndex(); + int FrameIndex = MI.getOperand(FIOperandNum).getIndex(); // Get the frame pointer save index. Users of this index are primarily // DYNALLOC instructions. @@ -567,7 +512,7 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // Replace the FrameIndex with base register with GPR1 (SP) or GPR31 (FP). bool is64Bit = Subtarget.isPPC64(); - MI.getOperand(FIOperandNo).ChangeToRegister(TFI->hasFP(MF) ? + MI.getOperand(FIOperandNum).ChangeToRegister(TFI->hasFP(MF) ? (is64Bit ? PPC::X31 : PPC::R31) : (is64Bit ? 
PPC::X1 : PPC::R1), false); @@ -649,7 +594,7 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, OperandBase = OffsetOperandNo; } - unsigned StackReg = MI.getOperand(FIOperandNo).getReg(); + unsigned StackReg = MI.getOperand(FIOperandNum).getReg(); MI.getOperand(OperandBase).ChangeToRegister(StackReg, false); MI.getOperand(OperandBase + 1).ChangeToRegister(SReg, false, false, true); } diff --git a/lib/Target/PowerPC/PPCRegisterInfo.h b/lib/Target/PowerPC/PPCRegisterInfo.h index a8fd796..9840666 100644 --- a/lib/Target/PowerPC/PPCRegisterInfo.h +++ b/lib/Target/PowerPC/PPCRegisterInfo.h @@ -30,7 +30,6 @@ class PPCRegisterInfo : public PPCGenRegisterInfo { std::map<unsigned, unsigned> ImmToIdxMap; const PPCSubtarget &Subtarget; const TargetInstrInfo &TII; - mutable int CRSpillFrameIdx; public: PPCRegisterInfo(const PPCSubtarget &SubTarget, const TargetInstrInfo &tii); @@ -56,10 +55,6 @@ public: bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const; - void eliminateCallFramePseudoInstr(MachineFunction &MF, - MachineBasicBlock &MBB, - MachineBasicBlock::iterator I) const; - void lowerDynamicAlloc(MachineBasicBlock::iterator II, int SPAdj, RegScavenger *RS) const; void lowerCRSpilling(MachineBasicBlock::iterator II, unsigned FrameIndex, @@ -69,7 +64,8 @@ public: bool hasReservedSpillSlot(const MachineFunction &MF, unsigned Reg, int &FrameIdx) const; void eliminateFrameIndex(MachineBasicBlock::iterator II, - int SPAdj, RegScavenger *RS = NULL) const; + int SPAdj, unsigned FIOperandNum, + RegScavenger *RS = NULL) const; // Debug information queries. unsigned getFrameRegister(const MachineFunction &MF) const; diff --git a/lib/Target/PowerPC/PPCRegisterInfo.td b/lib/Target/PowerPC/PPCRegisterInfo.td index 5ca3876..8ee9b1e 100644 --- a/lib/Target/PowerPC/PPCRegisterInfo.td +++ b/lib/Target/PowerPC/PPCRegisterInfo.td @@ -63,142 +63,28 @@ class CRBIT<bits<5> num, string n> : PPCReg<n> { field bits<5> Num = num; } - // General-purpose registers -def R0 : GPR< 0, "r0">, DwarfRegNum<[-2, 0]>; -def R1 : GPR< 1, "r1">, DwarfRegNum<[-2, 1]>; -def R2 : GPR< 2, "r2">, DwarfRegNum<[-2, 2]>; -def R3 : GPR< 3, "r3">, DwarfRegNum<[-2, 3]>; -def R4 : GPR< 4, "r4">, DwarfRegNum<[-2, 4]>; -def R5 : GPR< 5, "r5">, DwarfRegNum<[-2, 5]>; -def R6 : GPR< 6, "r6">, DwarfRegNum<[-2, 6]>; -def R7 : GPR< 7, "r7">, DwarfRegNum<[-2, 7]>; -def R8 : GPR< 8, "r8">, DwarfRegNum<[-2, 8]>; -def R9 : GPR< 9, "r9">, DwarfRegNum<[-2, 9]>; -def R10 : GPR<10, "r10">, DwarfRegNum<[-2, 10]>; -def R11 : GPR<11, "r11">, DwarfRegNum<[-2, 11]>; -def R12 : GPR<12, "r12">, DwarfRegNum<[-2, 12]>; -def R13 : GPR<13, "r13">, DwarfRegNum<[-2, 13]>; -def R14 : GPR<14, "r14">, DwarfRegNum<[-2, 14]>; -def R15 : GPR<15, "r15">, DwarfRegNum<[-2, 15]>; -def R16 : GPR<16, "r16">, DwarfRegNum<[-2, 16]>; -def R17 : GPR<17, "r17">, DwarfRegNum<[-2, 17]>; -def R18 : GPR<18, "r18">, DwarfRegNum<[-2, 18]>; -def R19 : GPR<19, "r19">, DwarfRegNum<[-2, 19]>; -def R20 : GPR<20, "r20">, DwarfRegNum<[-2, 20]>; -def R21 : GPR<21, "r21">, DwarfRegNum<[-2, 21]>; -def R22 : GPR<22, "r22">, DwarfRegNum<[-2, 22]>; -def R23 : GPR<23, "r23">, DwarfRegNum<[-2, 23]>; -def R24 : GPR<24, "r24">, DwarfRegNum<[-2, 24]>; -def R25 : GPR<25, "r25">, DwarfRegNum<[-2, 25]>; -def R26 : GPR<26, "r26">, DwarfRegNum<[-2, 26]>; -def R27 : GPR<27, "r27">, DwarfRegNum<[-2, 27]>; -def R28 : GPR<28, "r28">, DwarfRegNum<[-2, 28]>; -def R29 : GPR<29, "r29">, DwarfRegNum<[-2, 29]>; -def R30 : GPR<30, "r30">, DwarfRegNum<[-2, 30]>; -def R31 : GPR<31, "r31">, 
DwarfRegNum<[-2, 31]>; +foreach Index = 0-31 in { + def R#Index : GPR<Index, "r"#Index>, DwarfRegNum<[-2, Index]>; +} // 64-bit General-purpose registers -def X0 : GP8< R0, "r0">, DwarfRegNum<[0, -2]>; -def X1 : GP8< R1, "r1">, DwarfRegNum<[1, -2]>; -def X2 : GP8< R2, "r2">, DwarfRegNum<[2, -2]>; -def X3 : GP8< R3, "r3">, DwarfRegNum<[3, -2]>; -def X4 : GP8< R4, "r4">, DwarfRegNum<[4, -2]>; -def X5 : GP8< R5, "r5">, DwarfRegNum<[5, -2]>; -def X6 : GP8< R6, "r6">, DwarfRegNum<[6, -2]>; -def X7 : GP8< R7, "r7">, DwarfRegNum<[7, -2]>; -def X8 : GP8< R8, "r8">, DwarfRegNum<[8, -2]>; -def X9 : GP8< R9, "r9">, DwarfRegNum<[9, -2]>; -def X10 : GP8<R10, "r10">, DwarfRegNum<[10, -2]>; -def X11 : GP8<R11, "r11">, DwarfRegNum<[11, -2]>; -def X12 : GP8<R12, "r12">, DwarfRegNum<[12, -2]>; -def X13 : GP8<R13, "r13">, DwarfRegNum<[13, -2]>; -def X14 : GP8<R14, "r14">, DwarfRegNum<[14, -2]>; -def X15 : GP8<R15, "r15">, DwarfRegNum<[15, -2]>; -def X16 : GP8<R16, "r16">, DwarfRegNum<[16, -2]>; -def X17 : GP8<R17, "r17">, DwarfRegNum<[17, -2]>; -def X18 : GP8<R18, "r18">, DwarfRegNum<[18, -2]>; -def X19 : GP8<R19, "r19">, DwarfRegNum<[19, -2]>; -def X20 : GP8<R20, "r20">, DwarfRegNum<[20, -2]>; -def X21 : GP8<R21, "r21">, DwarfRegNum<[21, -2]>; -def X22 : GP8<R22, "r22">, DwarfRegNum<[22, -2]>; -def X23 : GP8<R23, "r23">, DwarfRegNum<[23, -2]>; -def X24 : GP8<R24, "r24">, DwarfRegNum<[24, -2]>; -def X25 : GP8<R25, "r25">, DwarfRegNum<[25, -2]>; -def X26 : GP8<R26, "r26">, DwarfRegNum<[26, -2]>; -def X27 : GP8<R27, "r27">, DwarfRegNum<[27, -2]>; -def X28 : GP8<R28, "r28">, DwarfRegNum<[28, -2]>; -def X29 : GP8<R29, "r29">, DwarfRegNum<[29, -2]>; -def X30 : GP8<R30, "r30">, DwarfRegNum<[30, -2]>; -def X31 : GP8<R31, "r31">, DwarfRegNum<[31, -2]>; +foreach Index = 0-31 in { + def X#Index : GP8<!cast<GPR>("R"#Index), "r"#Index>, + DwarfRegNum<[Index, -2]>; +} // Floating-point registers -def F0 : FPR< 0, "f0">, DwarfRegNum<[32, 32]>; -def F1 : FPR< 1, "f1">, DwarfRegNum<[33, 33]>; -def F2 : FPR< 2, "f2">, DwarfRegNum<[34, 34]>; -def F3 : FPR< 3, "f3">, DwarfRegNum<[35, 35]>; -def F4 : FPR< 4, "f4">, DwarfRegNum<[36, 36]>; -def F5 : FPR< 5, "f5">, DwarfRegNum<[37, 37]>; -def F6 : FPR< 6, "f6">, DwarfRegNum<[38, 38]>; -def F7 : FPR< 7, "f7">, DwarfRegNum<[39, 39]>; -def F8 : FPR< 8, "f8">, DwarfRegNum<[40, 40]>; -def F9 : FPR< 9, "f9">, DwarfRegNum<[41, 41]>; -def F10 : FPR<10, "f10">, DwarfRegNum<[42, 42]>; -def F11 : FPR<11, "f11">, DwarfRegNum<[43, 43]>; -def F12 : FPR<12, "f12">, DwarfRegNum<[44, 44]>; -def F13 : FPR<13, "f13">, DwarfRegNum<[45, 45]>; -def F14 : FPR<14, "f14">, DwarfRegNum<[46, 46]>; -def F15 : FPR<15, "f15">, DwarfRegNum<[47, 47]>; -def F16 : FPR<16, "f16">, DwarfRegNum<[48, 48]>; -def F17 : FPR<17, "f17">, DwarfRegNum<[49, 49]>; -def F18 : FPR<18, "f18">, DwarfRegNum<[50, 50]>; -def F19 : FPR<19, "f19">, DwarfRegNum<[51, 51]>; -def F20 : FPR<20, "f20">, DwarfRegNum<[52, 52]>; -def F21 : FPR<21, "f21">, DwarfRegNum<[53, 53]>; -def F22 : FPR<22, "f22">, DwarfRegNum<[54, 54]>; -def F23 : FPR<23, "f23">, DwarfRegNum<[55, 55]>; -def F24 : FPR<24, "f24">, DwarfRegNum<[56, 56]>; -def F25 : FPR<25, "f25">, DwarfRegNum<[57, 57]>; -def F26 : FPR<26, "f26">, DwarfRegNum<[58, 58]>; -def F27 : FPR<27, "f27">, DwarfRegNum<[59, 59]>; -def F28 : FPR<28, "f28">, DwarfRegNum<[60, 60]>; -def F29 : FPR<29, "f29">, DwarfRegNum<[61, 61]>; -def F30 : FPR<30, "f30">, DwarfRegNum<[62, 62]>; -def F31 : FPR<31, "f31">, DwarfRegNum<[63, 63]>; +foreach Index = 0-31 in { + def F#Index : FPR<Index, "f"#Index>, + 
DwarfRegNum<[!add(Index, 32), !add(Index, 32)]>; +} // Vector registers -def V0 : VR< 0, "v0">, DwarfRegNum<[77, 77]>; -def V1 : VR< 1, "v1">, DwarfRegNum<[78, 78]>; -def V2 : VR< 2, "v2">, DwarfRegNum<[79, 79]>; -def V3 : VR< 3, "v3">, DwarfRegNum<[80, 80]>; -def V4 : VR< 4, "v4">, DwarfRegNum<[81, 81]>; -def V5 : VR< 5, "v5">, DwarfRegNum<[82, 82]>; -def V6 : VR< 6, "v6">, DwarfRegNum<[83, 83]>; -def V7 : VR< 7, "v7">, DwarfRegNum<[84, 84]>; -def V8 : VR< 8, "v8">, DwarfRegNum<[85, 85]>; -def V9 : VR< 9, "v9">, DwarfRegNum<[86, 86]>; -def V10 : VR<10, "v10">, DwarfRegNum<[87, 87]>; -def V11 : VR<11, "v11">, DwarfRegNum<[88, 88]>; -def V12 : VR<12, "v12">, DwarfRegNum<[89, 89]>; -def V13 : VR<13, "v13">, DwarfRegNum<[90, 90]>; -def V14 : VR<14, "v14">, DwarfRegNum<[91, 91]>; -def V15 : VR<15, "v15">, DwarfRegNum<[92, 92]>; -def V16 : VR<16, "v16">, DwarfRegNum<[93, 93]>; -def V17 : VR<17, "v17">, DwarfRegNum<[94, 94]>; -def V18 : VR<18, "v18">, DwarfRegNum<[95, 95]>; -def V19 : VR<19, "v19">, DwarfRegNum<[96, 96]>; -def V20 : VR<20, "v20">, DwarfRegNum<[97, 97]>; -def V21 : VR<21, "v21">, DwarfRegNum<[98, 98]>; -def V22 : VR<22, "v22">, DwarfRegNum<[99, 99]>; -def V23 : VR<23, "v23">, DwarfRegNum<[100, 100]>; -def V24 : VR<24, "v24">, DwarfRegNum<[101, 101]>; -def V25 : VR<25, "v25">, DwarfRegNum<[102, 102]>; -def V26 : VR<26, "v26">, DwarfRegNum<[103, 103]>; -def V27 : VR<27, "v27">, DwarfRegNum<[104, 104]>; -def V28 : VR<28, "v28">, DwarfRegNum<[105, 105]>; -def V29 : VR<29, "v29">, DwarfRegNum<[106, 106]>; -def V30 : VR<30, "v30">, DwarfRegNum<[107, 107]>; -def V31 : VR<31, "v31">, DwarfRegNum<[108, 108]>; +foreach Index = 0-31 in { + def V#Index : VR<Index, "v"#Index>, + DwarfRegNum<[!add(Index, 77), !add(Index, 77)]>; +} // Condition register bits def CR0LT : CRBIT< 0, "0">; diff --git a/lib/Target/PowerPC/PPCSubtarget.cpp b/lib/Target/PowerPC/PPCSubtarget.cpp index d9b4e30..18e4c07 100644 --- a/lib/Target/PowerPC/PPCSubtarget.cpp +++ b/lib/Target/PowerPC/PPCSubtarget.cpp @@ -36,6 +36,7 @@ PPCSubtarget::PPCSubtarget(const std::string &TT, const std::string &CPU, , Use64BitRegs(false) , IsPPC64(is64Bit) , HasAltivec(false) + , HasQPX(false) , HasFSQRT(false) , HasSTFIWX(false) , HasISEL(false) @@ -82,6 +83,12 @@ PPCSubtarget::PPCSubtarget(const std::string &TT, const std::string &CPU, // Set up darwin-specific properties. if (isDarwin()) HasLazyResolverStubs = true; + + // QPX requires a 32-byte aligned stack. Note that we need to do this if + // we're compiling for a BG/Q system regardless of whether or not QPX + // is enabled because external functions will assume this alignment. 
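+  // (A QPX register holds four doubles, i.e. 32 bytes, which is where this
+  // figure comes from; the PPCFrameLowering constructor applies the same
+  // hasQPX() || isBGQ() test when choosing the frame alignment.)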
+ if (hasQPX() || isBGQ()) + StackAlignment = 32; } /// SetJITMode - This is called to inform the subtarget info that we are diff --git a/lib/Target/PowerPC/PPCSubtarget.h b/lib/Target/PowerPC/PPCSubtarget.h index 416c0f3..15885bd 100644 --- a/lib/Target/PowerPC/PPCSubtarget.h +++ b/lib/Target/PowerPC/PPCSubtarget.h @@ -43,7 +43,12 @@ namespace PPC { DIR_A2, DIR_E500mc, DIR_E5500, + DIR_PWR3, + DIR_PWR4, + DIR_PWR5, + DIR_PWR5X, DIR_PWR6, + DIR_PWR6X, DIR_PWR7, DIR_64 }; @@ -70,6 +75,7 @@ protected: bool Use64BitRegs; bool IsPPC64; bool HasAltivec; + bool HasQPX; bool HasFSQRT; bool HasSTFIWX; bool HasISEL; @@ -150,6 +156,7 @@ public: bool hasFSQRT() const { return HasFSQRT; } bool hasSTFIWX() const { return HasSTFIWX; } bool hasAltivec() const { return HasAltivec; } + bool hasQPX() const { return HasQPX; } bool hasMFOCRF() const { return HasMFOCRF; } bool hasISEL() const { return HasISEL; } bool isBookE() const { return IsBookE; } @@ -160,6 +167,8 @@ public: bool isDarwin() const { return TargetTriple.isMacOSX(); } /// isBGP - True if this is a BG/P platform. bool isBGP() const { return TargetTriple.getVendor() == Triple::BGP; } + /// isBGQ - True if this is a BG/Q platform. + bool isBGQ() const { return TargetTriple.getVendor() == Triple::BGQ; } bool isDarwinABI() const { return isDarwin(); } bool isSVR4ABI() const { return !isDarwin(); } diff --git a/lib/Target/PowerPC/PPCTargetMachine.cpp b/lib/Target/PowerPC/PPCTargetMachine.cpp index b8b7882..fe851c1 100644 --- a/lib/Target/PowerPC/PPCTargetMachine.cpp +++ b/lib/Target/PowerPC/PPCTargetMachine.cpp @@ -126,3 +126,12 @@ bool PPCTargetMachine::addCodeEmitter(PassManagerBase &PM, return false; } + +void PPCTargetMachine::addAnalysisPasses(PassManagerBase &PM) { + // Add first the target-independent BasicTTI pass, then our PPC pass. This + // allows the PPC pass to delegate to the target independent layer when + // appropriate. + PM.add(createBasicTargetTransformInfoPass(getTargetLowering())); + PM.add(createPPCTargetTransformInfoPass(this)); +} + diff --git a/lib/Target/PowerPC/PPCTargetMachine.h b/lib/Target/PowerPC/PPCTargetMachine.h index d917d99..606ccb3 100644 --- a/lib/Target/PowerPC/PPCTargetMachine.h +++ b/lib/Target/PowerPC/PPCTargetMachine.h @@ -68,6 +68,9 @@ public: virtual TargetPassConfig *createPassConfig(PassManagerBase &PM); virtual bool addCodeEmitter(PassManagerBase &PM, JITCodeEmitter &JCE); + + /// \brief Register PPC analysis passes with a pass manager. + virtual void addAnalysisPasses(PassManagerBase &PM); }; /// PPC32TargetMachine - PowerPC 32-bit target machine. diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp new file mode 100644 index 0000000..5e9ad34 --- /dev/null +++ b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -0,0 +1,236 @@ +//===-- PPCTargetTransformInfo.cpp - PPC specific TTI pass ----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This file implements a TargetTransformInfo analysis pass specific to the +/// PPC target machine. It uses the target's detailed information to provide +/// more precise answers to certain TTI queries, while letting the target +/// independent and default TTI implementations handle the rest. 
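+/// (Most of the entry points below simply defer to the target-independent
+/// implementation; the notable PPC-specific costs are for vector element
+/// insert/extract and for unaligned memory operations.)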
+///
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "ppctti"
+#include "PPC.h"
+#include "PPCTargetMachine.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/CostTable.h"
+using namespace llvm;
+
+// Declare the pass initialization routine locally as target-specific passes
+// don't have a target-wide initialization entry point, and so we rely on the
+// pass constructor initialization.
+namespace llvm {
+void initializePPCTTIPass(PassRegistry &);
+}
+
+namespace {
+
+class PPCTTI : public ImmutablePass, public TargetTransformInfo {
+  const PPCTargetMachine *TM;
+  const PPCSubtarget *ST;
+  const PPCTargetLowering *TLI;
+
+  /// Estimate the overhead of scalarizing an instruction. Insert and Extract
+  /// are set if the result needs to be inserted and/or extracted from vectors.
+  unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const;
+
+public:
+  PPCTTI() : ImmutablePass(ID), TM(0), ST(0), TLI(0) {
+    llvm_unreachable("This pass cannot be directly constructed");
+  }
+
+  PPCTTI(const PPCTargetMachine *TM)
+      : ImmutablePass(ID), TM(TM), ST(TM->getSubtargetImpl()),
+        TLI(TM->getTargetLowering()) {
+    initializePPCTTIPass(*PassRegistry::getPassRegistry());
+  }
+
+  virtual void initializePass() {
+    pushTTIStack(this);
+  }
+
+  virtual void finalizePass() {
+    popTTIStack();
+  }
+
+  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+    TargetTransformInfo::getAnalysisUsage(AU);
+  }
+
+  /// Pass identification.
+  static char ID;
+
+  /// Provide necessary pointer adjustments for the two base classes.
+  virtual void *getAdjustedAnalysisPointer(const void *ID) {
+    if (ID == &TargetTransformInfo::ID)
+      return (TargetTransformInfo*)this;
+    return this;
+  }
+
+  /// \name Scalar TTI Implementations
+  /// @{
+  virtual PopcntSupportKind getPopcntSupport(unsigned TyWidth) const;
+
+  /// @}
+
+  /// \name Vector TTI Implementations
+  /// @{
+
+  virtual unsigned getNumberOfRegisters(bool Vector) const;
+  virtual unsigned getRegisterBitWidth(bool Vector) const;
+  virtual unsigned getMaximumUnrollFactor() const;
+  virtual unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty) const;
+  virtual unsigned getShuffleCost(ShuffleKind Kind, Type *Tp,
+                                  int Index, Type *SubTp) const;
+  virtual unsigned getCastInstrCost(unsigned Opcode, Type *Dst,
+                                    Type *Src) const;
+  virtual unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
+                                      Type *CondTy) const;
+  virtual unsigned getVectorInstrCost(unsigned Opcode, Type *Val,
+                                      unsigned Index) const;
+  virtual unsigned getMemoryOpCost(unsigned Opcode, Type *Src,
+                                   unsigned Alignment,
+                                   unsigned AddressSpace) const;
+
+  /// @}
+};
+
+} // end anonymous namespace
+
+INITIALIZE_AG_PASS(PPCTTI, TargetTransformInfo, "ppctti",
+                   "PPC Target Transform Info", true, true, false)
+char PPCTTI::ID = 0;
+
+ImmutablePass *
+llvm::createPPCTargetTransformInfoPass(const PPCTargetMachine *TM) {
+  return new PPCTTI(TM);
+}
+
+
+//===----------------------------------------------------------------------===//
+//
+// PPC cost model.
+//
+//===----------------------------------------------------------------------===//
+
+PPCTTI::PopcntSupportKind PPCTTI::getPopcntSupport(unsigned TyWidth) const {
+  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
+  // FIXME: PPC currently does not have custom popcnt lowering even though
+  // there is hardware support.
Once this is fixed, update this function + // to reflect the real capabilities of the hardware. + return PSK_Software; +} + +unsigned PPCTTI::getNumberOfRegisters(bool Vector) const { + if (Vector && !ST->hasAltivec()) + return 0; + return 32; +} + +unsigned PPCTTI::getRegisterBitWidth(bool Vector) const { + if (Vector) { + if (ST->hasAltivec()) return 128; + return 0; + } + + if (ST->isPPC64()) + return 64; + return 32; + +} + +unsigned PPCTTI::getMaximumUnrollFactor() const { + unsigned Directive = ST->getDarwinDirective(); + // The 440 has no SIMD support, but floating-point instructions + // have a 5-cycle latency, so unroll by 5x for latency hiding. + if (Directive == PPC::DIR_440) + return 5; + + // The A2 has no SIMD support, but floating-point instructions + // have a 6-cycle latency, so unroll by 6x for latency hiding. + if (Directive == PPC::DIR_A2) + return 6; + + // FIXME: For lack of any better information, do no harm... + if (Directive == PPC::DIR_E500mc || Directive == PPC::DIR_E5500) + return 1; + + // For most things, modern systems have two execution units (and + // out-of-order execution). + return 2; +} + +unsigned PPCTTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty) const { + assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode"); + + // Fallback to the default implementation. + return TargetTransformInfo::getArithmeticInstrCost(Opcode, Ty); +} + +unsigned PPCTTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index, + Type *SubTp) const { + return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp); +} + +unsigned PPCTTI::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const { + assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode"); + + return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src); +} + +unsigned PPCTTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, + Type *CondTy) const { + return TargetTransformInfo::getCmpSelInstrCost(Opcode, ValTy, CondTy); +} + +unsigned PPCTTI::getVectorInstrCost(unsigned Opcode, Type *Val, + unsigned Index) const { + assert(Val->isVectorTy() && "This must be a vector type"); + + int ISD = TLI->InstructionOpcodeToISD(Opcode); + assert(ISD && "Invalid opcode"); + + // Estimated cost of a load-hit-store delay. This was obtained + // experimentally as a minimum needed to prevent unprofitable + // vectorization for the paq8p benchmark. It may need to be + // raised further if other unprofitable cases remain. + unsigned LHSPenalty = 12; + + // Vector element insert/extract with Altivec is very expensive, + // because they require store and reload with the attendant + // processor stall for load-hit-store. Until VSX is available, + // these need to be estimated as very costly. + if (ISD == ISD::EXTRACT_VECTOR_ELT || + ISD == ISD::INSERT_VECTOR_ELT) + return LHSPenalty + + TargetTransformInfo::getVectorInstrCost(Opcode, Val, Index); + + return TargetTransformInfo::getVectorInstrCost(Opcode, Val, Index); +} + +unsigned PPCTTI::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, + unsigned AddressSpace) const { + // Legalize the type. + std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src); + assert((Opcode == Instruction::Load || Opcode == Instruction::Store) && + "Invalid Opcode"); + + // Each load/store unit costs 1. + unsigned Cost = LT.first * 1; + + // PPC in general does not support unaligned loads and stores. They'll need + // to be decomposed based on the alignment factor. 
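+  // For example, a 16-byte vector access with 4-byte alignment is costed
+  // below as 16/4 = 4 times an aligned access.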
+ unsigned SrcBytes = LT.second.getStoreSize(); + if (SrcBytes && Alignment && Alignment < SrcBytes) + Cost *= (SrcBytes/Alignment); + + return Cost; +} + diff --git a/lib/Target/R600/AMDGPU.h b/lib/Target/R600/AMDGPU.h index 0f5125d..ba87918 100644 --- a/lib/Target/R600/AMDGPU.h +++ b/lib/Target/R600/AMDGPU.h @@ -23,17 +23,19 @@ class AMDGPUTargetMachine; // R600 Passes FunctionPass* createR600KernelParametersPass(const DataLayout *TD); FunctionPass *createR600ExpandSpecialInstrsPass(TargetMachine &tm); +FunctionPass *createR600LowerConstCopy(TargetMachine &tm); // SI Passes FunctionPass *createSIAnnotateControlFlowPass(); FunctionPass *createSIAssignInterpRegsPass(TargetMachine &tm); FunctionPass *createSILowerControlFlowPass(TargetMachine &tm); FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS); -FunctionPass *createSILowerLiteralConstantsPass(TargetMachine &tm); +FunctionPass *createSIInsertWaits(TargetMachine &tm); // Passes common to R600 and SI Pass *createAMDGPUStructurizeCFGPass(); FunctionPass *createAMDGPUConvertToISAPass(TargetMachine &tm); +FunctionPass* createAMDGPUIndirectAddressingPass(TargetMachine &tm); } // End namespace llvm diff --git a/lib/Target/R600/AMDGPUAsmPrinter.cpp b/lib/Target/R600/AMDGPUAsmPrinter.cpp index 754506c..c30dbe4 100644 --- a/lib/Target/R600/AMDGPUAsmPrinter.cpp +++ b/lib/Target/R600/AMDGPUAsmPrinter.cpp @@ -47,6 +47,9 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { #endif } SetupMachineFunction(MF); + if (OutStreamer.hasRawTextSupport()) { + OutStreamer.EmitRawText("@" + MF.getName() + ":"); + } OutStreamer.SwitchSection(getObjFileLowering().getTextSection()); if (STM.device()->getGeneration() > AMDGPUDeviceInfo::HD6XXX) { EmitProgramInfo(MF); @@ -88,8 +91,6 @@ void AMDGPUAsmPrinter::EmitProgramInfo(MachineFunction &MF) { switch (reg) { default: break; case AMDGPU::EXEC: - case AMDGPU::SI_LITERAL_CONSTANT: - case AMDGPU::SREG_LIT_0: case AMDGPU::M0: continue; } @@ -115,10 +116,16 @@ void AMDGPUAsmPrinter::EmitProgramInfo(MachineFunction &MF) { } else if (AMDGPU::SReg_256RegClass.contains(reg)) { isSGPR = true; width = 8; + } else if (AMDGPU::VReg_256RegClass.contains(reg)) { + isSGPR = false; + width = 8; + } else if (AMDGPU::VReg_512RegClass.contains(reg)) { + isSGPR = false; + width = 16; } else { assert(!"Unknown register class"); } - hwReg = RI->getEncodingValue(reg); + hwReg = RI->getEncodingValue(reg) & 0xff; maxUsed = hwReg + width - 1; if (isSGPR) { MaxSGPR = maxUsed > MaxSGPR ? maxUsed : MaxSGPR; diff --git a/lib/Target/R600/AMDGPUCodeEmitter.h b/lib/Target/R600/AMDGPUCodeEmitter.h deleted file mode 100644 index 84f3588..0000000 --- a/lib/Target/R600/AMDGPUCodeEmitter.h +++ /dev/null @@ -1,49 +0,0 @@ -//===-- AMDGPUCodeEmitter.h - AMDGPU Code Emitter interface -----------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief CodeEmitter interface for R600 and SI codegen. 
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef AMDGPUCODEEMITTER_H
-#define AMDGPUCODEEMITTER_H
-
-namespace llvm {
-
-class AMDGPUCodeEmitter {
-public:
-  uint64_t getBinaryCodeForInstr(const MachineInstr &MI) const;
-  virtual uint64_t getMachineOpValue(const MachineInstr &MI,
-                                     const MachineOperand &MO) const { return 0; }
-  virtual unsigned GPR4AlignEncode(const MachineInstr &MI,
-                                   unsigned OpNo) const {
-    return 0;
-  }
-  virtual unsigned GPR2AlignEncode(const MachineInstr &MI,
-                                   unsigned OpNo) const {
-    return 0;
-  }
-  virtual uint64_t VOPPostEncode(const MachineInstr &MI,
-                                 uint64_t Value) const {
-    return Value;
-  }
-  virtual uint64_t i32LiteralEncode(const MachineInstr &MI,
-                                    unsigned OpNo) const {
-    return 0;
-  }
-  virtual uint32_t SMRDmemriEncode(const MachineInstr &MI, unsigned OpNo)
-                                   const {
-    return 0;
-  }
-};
-
-} // End namespace llvm
-
-#endif // AMDGPUCODEEMITTER_H
diff --git a/lib/Target/R600/AMDGPUFrameLowering.cpp b/lib/Target/R600/AMDGPUFrameLowering.cpp
new file mode 100644
index 0000000..815d6f7
--- /dev/null
+++ b/lib/Target/R600/AMDGPUFrameLowering.cpp
@@ -0,0 +1,122 @@
+//===----------------------- AMDGPUFrameLowering.cpp ----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+// Interface to describe a layout of a stack frame on an AMDIL target machine
+//
+//===----------------------------------------------------------------------===//
+#include "AMDGPUFrameLowering.h"
+#include "AMDGPURegisterInfo.h"
+#include "R600MachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Instructions.h"
+
+using namespace llvm;
+AMDGPUFrameLowering::AMDGPUFrameLowering(StackDirection D, unsigned StackAl,
+                                         int LAO, unsigned TransAl)
+  : TargetFrameLowering(D, StackAl, LAO, TransAl) { }
+
+AMDGPUFrameLowering::~AMDGPUFrameLowering() { }
+
+unsigned AMDGPUFrameLowering::getStackWidth(const MachineFunction &MF) const {
+
+  // XXX: Hardcoding to 1 for now.
+  //
+  // I think the StackWidth should be stored as metadata associated with the
+  // MachineFunction.  This metadata can either be added by a frontend, or
+  // calculated by an R600 specific LLVM IR pass.
+  //
+  // The StackWidth determines how stack objects are laid out in memory.
+  // For a vector stack variable, like: int4 stack[2], the data will be stored
+  // in the following ways depending on the StackWidth.
+  //
+  // StackWidth = 1:
+  //
+  // T0.X = stack[0].x
+  // T1.X = stack[0].y
+  // T2.X = stack[0].z
+  // T3.X = stack[0].w
+  // T4.X = stack[1].x
+  // T5.X = stack[1].y
+  // T6.X = stack[1].z
+  // T7.X = stack[1].w
+  //
+  // StackWidth = 2:
+  //
+  // T0.X = stack[0].x
+  // T0.Y = stack[0].y
+  // T1.X = stack[0].z
+  // T1.Y = stack[0].w
+  // T2.X = stack[1].x
+  // T2.Y = stack[1].y
+  // T3.X = stack[1].z
+  // T3.Y = stack[1].w
+  //
+  // StackWidth = 4:
+  // T0.X = stack[0].x
+  // T0.Y = stack[0].y
+  // T0.Z = stack[0].z
+  // T0.W = stack[0].w
+  // T1.X = stack[1].x
+  // T1.Y = stack[1].y
+  // T1.Z = stack[1].z
+  // T1.W = stack[1].w
+  return 1;
+}
+
+/// \returns The register offset of the object at frame index \p FI, i.e. the
+/// number of registers allocated to the stack objects that precede it.
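+/// For example, with getStackWidth() == 1, an 'int4 stack[2]' alloca at the
+/// first frame index occupies (4 / 1) * 2 = 8 registers (T0.X .. T7.X in the
+/// layout above), so for the next frame index this returns an offset of 8.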
+int AMDGPUFrameLowering::getFrameIndexOffset(const MachineFunction &MF, + int FI) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + unsigned Offset = 0; + int UpperBound = FI == -1 ? MFI->getNumObjects() : FI; + + for (int i = MFI->getObjectIndexBegin(); i < UpperBound; ++i) { + const AllocaInst *Alloca = MFI->getObjectAllocation(i); + unsigned ArrayElements; + const Type *AllocaType = Alloca->getAllocatedType(); + const Type *ElementType; + + if (AllocaType->isArrayTy()) { + ArrayElements = AllocaType->getArrayNumElements(); + ElementType = AllocaType->getArrayElementType(); + } else { + ArrayElements = 1; + ElementType = AllocaType; + } + + unsigned VectorElements; + if (ElementType->isVectorTy()) { + VectorElements = ElementType->getVectorNumElements(); + } else { + VectorElements = 1; + } + + Offset += (VectorElements / getStackWidth(MF)) * ArrayElements; + } + return Offset; +} + +const TargetFrameLowering::SpillSlot * +AMDGPUFrameLowering::getCalleeSavedSpillSlots(unsigned &NumEntries) const { + NumEntries = 0; + return 0; +} +void +AMDGPUFrameLowering::emitPrologue(MachineFunction &MF) const { +} +void +AMDGPUFrameLowering::emitEpilogue(MachineFunction &MF, + MachineBasicBlock &MBB) const { +} + +bool +AMDGPUFrameLowering::hasFP(const MachineFunction &MF) const { + return false; +} diff --git a/lib/Target/R600/AMDILFrameLowering.h b/lib/Target/R600/AMDGPUFrameLowering.h index 51337c3..cf5742e 100644 --- a/lib/Target/R600/AMDILFrameLowering.h +++ b/lib/Target/R600/AMDGPUFrameLowering.h @@ -1,4 +1,4 @@ -//===--------------------- AMDILFrameLowering.h -----------------*- C++ -*-===// +//===--------------------- AMDGPUFrameLowering.h ----------------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -30,6 +30,10 @@ public: AMDGPUFrameLowering(StackDirection D, unsigned StackAl, int LAO, unsigned TransAl = 1); virtual ~AMDGPUFrameLowering(); + + /// \returns The number of 32-bit sub-registers that are used when storing + /// values to the stack. 
+ virtual unsigned getStackWidth(const MachineFunction &MF) const; virtual int getFrameIndexOffset(const MachineFunction &MF, int FI) const; virtual const SpillSlot *getCalleeSavedSpillSlots(unsigned &NumEntries) const; virtual void emitPrologue(MachineFunction &MF) const; diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp index 473dac4..0a33264 100644 --- a/lib/Target/R600/AMDGPUISelLowering.cpp +++ b/lib/Target/R600/AMDGPUISelLowering.cpp @@ -127,9 +127,6 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return LowerIntrinsicLRP(Op, DAG); case AMDGPUIntrinsic::AMDIL_fraction: return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1)); - case AMDGPUIntrinsic::AMDIL_mad: - return DAG.getNode(AMDGPUISD::MAD, DL, VT, Op.getOperand(1), - Op.getOperand(2), Op.getOperand(3)); case AMDGPUIntrinsic::AMDIL_max: return DAG.getNode(AMDGPUISD::FMAX, DL, VT, Op.getOperand(1), Op.getOperand(2)); @@ -176,9 +173,9 @@ SDValue AMDGPUTargetLowering::LowerIntrinsicLRP(SDValue Op, Op.getOperand(1)); SDValue OneSubAC = DAG.getNode(ISD::FMUL, DL, VT, OneSubA, Op.getOperand(3)); - return DAG.getNode(AMDGPUISD::MAD, DL, VT, Op.getOperand(1), - Op.getOperand(2), - OneSubAC); + return DAG.getNode(ISD::FADD, DL, VT, + DAG.getNode(ISD::FMUL, DL, VT, Op.getOperand(1), Op.getOperand(2)), + OneSubAC); } /// \brief Generate Min/Max node @@ -393,7 +390,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { switch (Opcode) { default: return 0; // AMDIL DAG nodes - NODE_NAME_CASE(MAD); NODE_NAME_CASE(CALL); NODE_NAME_CASE(UMUL); NODE_NAME_CASE(DIV_INF); @@ -410,8 +406,9 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(SMIN) NODE_NAME_CASE(UMIN) NODE_NAME_CASE(URECIP) - NODE_NAME_CASE(INTERP) - NODE_NAME_CASE(INTERP_P0) NODE_NAME_CASE(EXPORT) + NODE_NAME_CASE(CONST_ADDRESS) + NODE_NAME_CASE(REGISTER_LOAD) + NODE_NAME_CASE(REGISTER_STORE) } } diff --git a/lib/Target/R600/AMDGPUISelLowering.h b/lib/Target/R600/AMDGPUISelLowering.h index c7abaf6..9e7d997 100644 --- a/lib/Target/R600/AMDGPUISelLowering.h +++ b/lib/Target/R600/AMDGPUISelLowering.h @@ -53,6 +53,11 @@ public: const SmallVectorImpl<ISD::OutputArg> &Outs, const SmallVectorImpl<SDValue> &OutVals, DebugLoc DL, SelectionDAG &DAG) const; + virtual SDValue LowerCall(CallLoweringInfo &CLI, + SmallVectorImpl<SDValue> &InVals) const { + CLI.Callee.dump(); + llvm_unreachable("Undefined function"); + } virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const; SDValue LowerIntrinsicIABS(SDValue Op, SelectionDAG &DAG) const; @@ -60,6 +65,10 @@ public: SDValue LowerMinMax(SDValue Op, SelectionDAG &DAG) const; virtual const char* getTargetNodeName(unsigned Opcode) const; + virtual SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const { + return N; + } + // Functions defined in AMDILISelLowering.cpp public: @@ -103,7 +112,6 @@ namespace AMDGPUISD { enum { // AMDIL ISD Opcodes FIRST_NUMBER = ISD::BUILTIN_OP_END, - MAD, // 32bit Fused Multiply Add instruction CALL, // Function call based on a single integer UMUL, // 32bit unsigned multiplication DIV_INF, // Divide with infinity returned on zero divisor @@ -120,25 +128,16 @@ enum { SMIN, UMIN, URECIP, - INTERP, - INTERP_P0, EXPORT, + CONST_ADDRESS, + REGISTER_LOAD, + REGISTER_STORE, LAST_AMDGPU_ISD_NUMBER }; } // End namespace AMDGPUISD -namespace SIISD { - -enum { - SI_FIRST = AMDGPUISD::LAST_AMDGPU_ISD_NUMBER, - VCC_AND, - VCC_BITCAST -}; - -} // End namespace SIISD - } // 
End namespace llvm

#endif // AMDGPUISELLOWERING_H
diff --git a/lib/Target/R600/AMDGPUIndirectAddressing.cpp b/lib/Target/R600/AMDGPUIndirectAddressing.cpp
new file mode 100644
index 0000000..15840b3
--- /dev/null
+++ b/lib/Target/R600/AMDGPUIndirectAddressing.cpp
@@ -0,0 +1,344 @@
+//===-- AMDGPUIndirectAddressing.cpp - Indirect Addressing Support --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+///
+/// Instructions can use indirect addressing to index the register file as if it
+/// were memory. This pass lowers RegisterLoad and RegisterStore instructions
+/// to either a COPY or a MOV that uses indirect addressing.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "R600InstrInfo.h"
+#include "R600MachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+namespace {
+
+class AMDGPUIndirectAddressingPass : public MachineFunctionPass {
+
+private:
+  static char ID;
+  const AMDGPUInstrInfo *TII;
+
+  bool regHasExplicitDef(MachineRegisterInfo &MRI, unsigned Reg) const;
+
+public:
+  AMDGPUIndirectAddressingPass(TargetMachine &tm) :
+    MachineFunctionPass(ID),
+    TII(static_cast<const AMDGPUInstrInfo*>(tm.getInstrInfo()))
+    { }
+
+  virtual bool runOnMachineFunction(MachineFunction &MF);
+
+  const char *getPassName() const { return "R600 Handle indirect addressing"; }
+
+};
+
+} // End anonymous namespace
+
+char AMDGPUIndirectAddressingPass::ID = 0;
+
+FunctionPass *llvm::createAMDGPUIndirectAddressingPass(TargetMachine &tm) {
+  return new AMDGPUIndirectAddressingPass(tm);
+}
+
+bool AMDGPUIndirectAddressingPass::runOnMachineFunction(MachineFunction &MF) {
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+
+  int IndirectBegin = TII->getIndirectIndexBegin(MF);
+  int IndirectEnd = TII->getIndirectIndexEnd(MF);
+
+  if (IndirectBegin == -1) {
+    // No indirect addressing, we can skip this pass
+    assert(IndirectEnd == -1);
+    return false;
+  }
+
+  // The map keeps track of the indirect address that is represented by
+  // each virtual register. The key is the register and the value is the
+  // indirect address it uses.
+  std::map<unsigned, unsigned> RegisterAddressMap;
+
+  // First pass - Lower all of the RegisterStore instructions and track which
+  // registers are live.
+  for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
+                                 BB != BB_E; ++BB) {
+    // This map keeps track of the current live indirect registers.
+    // The key is the address and the value is the register
+    std::map<unsigned, unsigned> LiveAddressRegisterMap;
+    MachineBasicBlock &MBB = *BB;
+
+    for (MachineBasicBlock::iterator I = MBB.begin(), Next = llvm::next(I);
+         I != MBB.end(); I = Next) {
+      Next = llvm::next(I);
+      MachineInstr &MI = *I;
+
+      if (!TII->isRegisterStore(MI)) {
+        continue;
+      }
+
+      // Lower RegisterStore
+
+      unsigned RegIndex = MI.getOperand(2).getImm();
+      unsigned Channel = MI.getOperand(3).getImm();
+      unsigned Address = TII->calculateIndirectAddress(RegIndex, Channel);
+      const TargetRegisterClass *IndirectStoreRegClass =
+          TII->getIndirectAddrStoreRegClass(MI.getOperand(0).getReg());
+
+      if (MI.getOperand(1).getReg() == AMDGPU::INDIRECT_BASE_ADDR) {
+        // Direct register access.
+        unsigned DstReg = MRI.createVirtualRegister(IndirectStoreRegClass);
+
+        BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::COPY), DstReg)
+            .addOperand(MI.getOperand(0));
+
+        RegisterAddressMap[DstReg] = Address;
+        LiveAddressRegisterMap[Address] = DstReg;
+      } else {
+        // Indirect register access.
+        MachineInstrBuilder MOV = TII->buildIndirectWrite(BB, I,
+                                      MI.getOperand(0).getReg(), // Value
+                                      Address,
+                                      MI.getOperand(1).getReg()); // Offset
+        for (int i = IndirectBegin; i <= IndirectEnd; ++i) {
+          unsigned Addr = TII->calculateIndirectAddress(i, Channel);
+          unsigned DstReg = MRI.createVirtualRegister(IndirectStoreRegClass);
+          MOV.addReg(DstReg, RegState::Define | RegState::Implicit);
+          RegisterAddressMap[DstReg] = Addr;
+          LiveAddressRegisterMap[Addr] = DstReg;
+        }
+      }
+      MI.eraseFromParent();
+    }
+
+    // Update the live-ins of the successor blocks
+    for (MachineBasicBlock::succ_iterator Succ = MBB.succ_begin(),
+                                          SuccEnd = MBB.succ_end();
+                                          SuccEnd != Succ; ++Succ) {
+      std::map<unsigned, unsigned>::const_iterator Key, KeyEnd;
+      for (Key = LiveAddressRegisterMap.begin(),
+           KeyEnd = LiveAddressRegisterMap.end(); KeyEnd != Key; ++Key) {
+        (*Succ)->addLiveIn(Key->second);
+      }
+    }
+  }
+
+  // Second pass - Lower the RegisterLoad instructions
+  for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
+                                 BB != BB_E; ++BB) {
+    // Key is the address and the value is the register
+    std::map<unsigned, unsigned> LiveAddressRegisterMap;
+    MachineBasicBlock &MBB = *BB;
+
+    MachineBasicBlock::livein_iterator LI = MBB.livein_begin();
+    while (LI != MBB.livein_end()) {
+      std::vector<unsigned> PhiRegisters;
+
+      // Make sure this live-in is used for indirect addressing
+      if (RegisterAddressMap.find(*LI) == RegisterAddressMap.end()) {
+        ++LI;
+        continue;
+      }
+
+      unsigned Address = RegisterAddressMap[*LI];
+      LiveAddressRegisterMap[Address] = *LI;
+      PhiRegisters.push_back(*LI);
+
+      // Check if there are other live-in registers which map to the same
+      // indirect address.
+      for (MachineBasicBlock::livein_iterator LJ = llvm::next(LI),
+                                              LE = MBB.livein_end();
+                                              LJ != LE; ++LJ) {
+        unsigned Reg = *LJ;
+        if (RegisterAddressMap.find(Reg) == RegisterAddressMap.end()) {
+          continue;
+        }
+
+        if (RegisterAddressMap[Reg] == Address) {
+          PhiRegisters.push_back(Reg);
+        }
+      }
+
+      if (PhiRegisters.size() == 1) {
+        // We don't need to insert a Phi instruction, so we can just add the
+        // registers to the live list for the block.
+        LiveAddressRegisterMap[Address] = *LI;
+        MBB.removeLiveIn(*LI);
+      } else {
+        // We need to insert a PHI, because we have the same address being
+        // written in multiple predecessor blocks.
+        const TargetRegisterClass *PhiDstClass =
+            TII->getIndirectAddrStoreRegClass(*(PhiRegisters.begin()));
+        unsigned PhiDstReg = MRI.createVirtualRegister(PhiDstClass);
+        MachineInstrBuilder Phi = BuildMI(MBB, MBB.begin(),
+                                          MBB.findDebugLoc(MBB.begin()),
+                                          TII->get(AMDGPU::PHI), PhiDstReg);
+
+        for (std::vector<unsigned>::const_iterator RI = PhiRegisters.begin(),
+                                                   RE = PhiRegisters.end();
+                                                   RI != RE; ++RI) {
+          unsigned Reg = *RI;
+          MachineInstr *DefInst = MRI.getVRegDef(Reg);
+          assert(DefInst);
+          MachineBasicBlock *RegBlock = DefInst->getParent();
+          Phi.addReg(Reg);
+          Phi.addMBB(RegBlock);
+          MBB.removeLiveIn(Reg);
+        }
+        RegisterAddressMap[PhiDstReg] = Address;
+        LiveAddressRegisterMap[Address] = PhiDstReg;
+      }
+      LI = MBB.livein_begin();
+    }
+
+    for (MachineBasicBlock::iterator I = MBB.begin(), Next = llvm::next(I);
+         I != MBB.end(); I = Next) {
+      Next = llvm::next(I);
+      MachineInstr &MI = *I;
+
+      if (!TII->isRegisterLoad(MI)) {
+        if (MI.getOpcode() == AMDGPU::PHI) {
+          continue;
+        }
+        // Check for indirect register defs
+        for (unsigned OpIdx = 0, NumOperands = MI.getNumOperands();
+             OpIdx < NumOperands; ++OpIdx) {
+          MachineOperand &MO = MI.getOperand(OpIdx);
+          if (MO.isReg() && MO.isDef() &&
+              RegisterAddressMap.find(MO.getReg()) != RegisterAddressMap.end()) {
+            unsigned Reg = MO.getReg();
+            unsigned LiveAddress = RegisterAddressMap[Reg];
+            // Chain the live-ins
+            if (LiveAddressRegisterMap.find(LiveAddress) !=
+                RegisterAddressMap.end()) {
+              MI.addOperand(MachineOperand::CreateReg(
+                  LiveAddressRegisterMap[LiveAddress],
+                  false, // isDef
+                  true,  // isImp
+                  true)); // isKill
+            }
+            LiveAddressRegisterMap[LiveAddress] = Reg;
+          }
+        }
+        continue;
+      }
+
+      const TargetRegisterClass *SuperIndirectRegClass =
+          TII->getSuperIndirectRegClass();
+      const TargetRegisterClass *IndirectLoadRegClass =
+          TII->getIndirectAddrLoadRegClass();
+      unsigned IndirectReg = MRI.createVirtualRegister(SuperIndirectRegClass);
+
+      unsigned RegIndex = MI.getOperand(2).getImm();
+      unsigned Channel = MI.getOperand(3).getImm();
+      unsigned Address = TII->calculateIndirectAddress(RegIndex, Channel);
+
+      if (MI.getOperand(1).getReg() == AMDGPU::INDIRECT_BASE_ADDR) {
+        // Direct register access
+        unsigned Reg = LiveAddressRegisterMap[Address];
+        unsigned AddrReg = IndirectLoadRegClass->getRegister(Address);
+
+        if (regHasExplicitDef(MRI, Reg)) {
+          // If the register we are reading from has an explicit def, then that
+          // means it was written via a direct register access (i.e. COPY
+          // or other instruction that doesn't use indirect addressing). In
+          // this case we know where the value has been stored, so we can just
+          // issue a copy.
+          BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::COPY),
+                  MI.getOperand(0).getReg())
+              .addReg(Reg);
+        } else {
+          // If the register we are reading has an implicit def, then that
+          // means it was written by an indirect register access (i.e. an
+          // instruction that uses indirect addressing).
+          BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::COPY),
+                  MI.getOperand(0).getReg())
+              .addReg(AddrReg)
+              .addReg(Reg, RegState::Implicit);
+        }
+      } else {
+        // Indirect register access
+
+        // Note on REG_SEQUENCE instructions: You can't actually use the register
+        // it defines unless you have an instruction that takes the defined
+        // register class as an operand.
+
+        MachineInstrBuilder Sequence = BuildMI(MBB, I, MBB.findDebugLoc(I),
+                                               TII->get(AMDGPU::REG_SEQUENCE),
+                                               IndirectReg);
+        for (int i = IndirectBegin; i <= IndirectEnd; ++i) {
+          unsigned Addr = TII->calculateIndirectAddress(i, Channel);
+          if (LiveAddressRegisterMap.find(Addr) == LiveAddressRegisterMap.end()) {
+            continue;
+          }
+          unsigned Reg = LiveAddressRegisterMap[Addr];
+
+          // We only need to use REG_SEQUENCE for explicit defs, since the
+          // register coalescer won't do anything with the implicit defs.
+          MachineInstr *DefInstr = MRI.getVRegDef(Reg);
+          if (!regHasExplicitDef(MRI, Reg)) {
+            continue;
+          }
+
+          // Insert a REG_SEQUENCE instruction to force the register allocator
+          // to allocate the virtual register to the correct physical register.
+          Sequence.addReg(LiveAddressRegisterMap[Addr]);
+          Sequence.addImm(TII->getRegisterInfo().getIndirectSubReg(Addr));
+        }
+        MachineInstrBuilder Mov = TII->buildIndirectRead(BB, I,
+                                      MI.getOperand(0).getReg(), // Value
+                                      Address,
+                                      MI.getOperand(1).getReg()); // Offset
+
+
+
+        Mov.addReg(IndirectReg, RegState::Implicit | RegState::Kill);
+        Mov.addReg(LiveAddressRegisterMap[Address], RegState::Implicit);
+
+      }
+      MI.eraseFromParent();
+    }
+  }
+  return false;
+}
+
+bool AMDGPUIndirectAddressingPass::regHasExplicitDef(MachineRegisterInfo &MRI,
+                                                     unsigned Reg) const {
+  MachineInstr *DefInstr = MRI.getVRegDef(Reg);
+
+  if (!DefInstr) {
+    return false;
+  }
+
+  if (DefInstr->getOpcode() == AMDGPU::PHI) {
+    bool Explicit = false;
+    for (MachineInstr::const_mop_iterator I = DefInstr->operands_begin(),
+                                          E = DefInstr->operands_end();
+                                          I != E; ++I) {
+      const MachineOperand &MO = *I;
+      if (!MO.isReg() || MO.isDef()) {
+        continue;
+      }
+
+      Explicit = Explicit || regHasExplicitDef(MRI, MO.getReg());
+    }
+    return Explicit;
+  }
+
+  return DefInstr->getOperand(0).isReg() &&
+         DefInstr->getOperand(0).getReg() == Reg;
+}
diff --git a/lib/Target/R600/AMDGPUInstrInfo.cpp b/lib/Target/R600/AMDGPUInstrInfo.cpp
index e42a46d..30f736c 100644
--- a/lib/Target/R600/AMDGPUInstrInfo.cpp
+++ b/lib/Target/R600/AMDGPUInstrInfo.cpp
@@ -22,6 +22,7 @@
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 
 #define GET_INSTRINFO_CTOR
+#define GET_INSTRMAP_INFO
 #include "AMDGPUGenInstrInfo.inc"
 
 using namespace llvm;
@@ -234,7 +235,16 @@ AMDGPUInstrInfo::isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const {
   // TODO: Implement this function
   return true;
 }
-
+
+bool AMDGPUInstrInfo::isRegisterStore(const MachineInstr &MI) const {
+  return get(MI.getOpcode()).TSFlags & AMDGPU_FLAG_REGISTER_STORE;
+}
+
+bool AMDGPUInstrInfo::isRegisterLoad(const MachineInstr &MI) const {
+  return get(MI.getOpcode()).TSFlags & AMDGPU_FLAG_REGISTER_LOAD;
+}
+
+
 void AMDGPUInstrInfo::convertToISA(MachineInstr & MI, MachineFunction &MF,
     DebugLoc DL) const {
   MachineRegisterInfo &MRI = MF.getRegInfo();
diff --git a/lib/Target/R600/AMDGPUInstrInfo.h b/lib/Target/R600/AMDGPUInstrInfo.h
index cb97af9..3909e4e 100644
--- a/lib/Target/R600/AMDGPUInstrInfo.h
+++ b/lib/Target/R600/AMDGPUInstrInfo.h
@@ -40,9 +40,10 @@ class MachineInstrBuilder;
 class AMDGPUInstrInfo : public AMDGPUGenInstrInfo {
 private:
   const AMDGPURegisterInfo RI;
-  TargetMachine &TM;
   bool getNextBranchInstr(MachineBasicBlock::iterator &iter,
                           MachineBasicBlock &MBB) const;
+protected:
+  TargetMachine &TM;
 public:
   explicit AMDGPUInstrInfo(TargetMachine &tm);
 
@@ -130,12 +131,66 @@ public:
   bool isAExtLoadInst(llvm::MachineInstr *MI) const;
   bool isStoreInst(llvm::MachineInstr *MI) const;
   bool isTruncStoreInst(llvm::MachineInstr *MI) const;
+  bool isRegisterStore(const MachineInstr &MI) const;
+  bool isRegisterLoad(const MachineInstr &MI) const;
+
+//===---------------------------------------------------------------------===//
+// Pure virtual functions to be implemented by sub-classes.
+//===---------------------------------------------------------------------===//
 
   virtual MachineInstr* getMovImmInstr(MachineFunction *MF, unsigned DstReg,
                                        int64_t Imm) const = 0;
   virtual unsigned getIEQOpcode() const = 0;
   virtual bool isMov(unsigned opcode) const = 0;
 
+  /// \returns the smallest register index that will be accessed by an indirect
+  /// read or write or -1 if indirect addressing is not used by this program.
+  virtual int getIndirectIndexBegin(const MachineFunction &MF) const = 0;
+
+  /// \returns the largest register index that will be accessed by an indirect
+  /// read or write or -1 if indirect addressing is not used by this program.
+  virtual int getIndirectIndexEnd(const MachineFunction &MF) const = 0;
+
+  /// \brief Calculate the "Indirect Address" for the given \p RegIndex and
+  /// \p Channel
+  ///
+  /// We model indirect addressing using a virtual address space that can be
+  /// accessed with loads and stores. The "Indirect Address" is the memory
+  /// address in this virtual address space that maps to the given \p RegIndex
+  /// and \p Channel.
+  virtual unsigned calculateIndirectAddress(unsigned RegIndex,
+                                            unsigned Channel) const = 0;
+
+  /// \returns The register class to be used for storing values to an
+  /// "Indirect Address".
+  virtual const TargetRegisterClass *getIndirectAddrStoreRegClass(
+                                                  unsigned SourceReg) const = 0;
+
+  /// \returns The register class to be used for loading values from
+  /// an "Indirect Address".
+  virtual const TargetRegisterClass *getIndirectAddrLoadRegClass() const = 0;
+
+  /// \brief Build instruction(s) for an indirect register write.
+  ///
+  /// \returns The instruction that performs the indirect register write
+  virtual MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB,
+                                    MachineBasicBlock::iterator I,
+                                    unsigned ValueReg, unsigned Address,
+                                    unsigned OffsetReg) const = 0;
+
+  /// \brief Build instruction(s) for an indirect register read.
+  ///
+  /// \returns The instruction that performs the indirect register read
+  virtual MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB,
+                                    MachineBasicBlock::iterator I,
+                                    unsigned ValueReg, unsigned Address,
+                                    unsigned OffsetReg) const = 0;
+
+  /// \returns the register class whose sub-registers are the set of all
+  /// possible registers that can be used for indirect addressing.
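// Illustrative sketch (hypothetical, not from this patch) of one way a
// subclass could satisfy the calculateIndirectAddress contract documented
// above: pack the four channels of each register index consecutively. The
// real packing is target-specific.
static unsigned sketchCalculateIndirectAddress(unsigned RegIndex,
                                               unsigned Channel) {
  // Virtual address space layout assumed here:
  // |reg0.x|reg0.y|reg0.z|reg0.w|reg1.x|...
  return RegIndex * 4 + Channel;
}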
+ virtual const TargetRegisterClass *getSuperIndirectRegClass() const = 0; + + /// \brief Convert the AMDIL MachineInstr to a supported ISA /// MachineInstr virtual void convertToISA(MachineInstr & MI, MachineFunction &MF, @@ -145,4 +200,7 @@ public: } // End llvm namespace +#define AMDGPU_FLAG_REGISTER_LOAD (UINT64_C(1) << 63) +#define AMDGPU_FLAG_REGISTER_STORE (UINT64_C(1) << 62) + #endif // AMDGPUINSTRINFO_H diff --git a/lib/Target/R600/AMDGPUInstrInfo.td b/lib/Target/R600/AMDGPUInstrInfo.td index 96368e8..b66ae87 100644 --- a/lib/Target/R600/AMDGPUInstrInfo.td +++ b/lib/Target/R600/AMDGPUInstrInfo.td @@ -72,3 +72,11 @@ def AMDGPUumin : SDNode<"AMDGPUISD::UMIN", SDTIntBinOp, def AMDGPUurecip : SDNode<"AMDGPUISD::URECIP", SDTIntUnaryOp>; def fpow : SDNode<"ISD::FPOW", SDTFPBinOp>; + +def AMDGPUregister_load : SDNode<"AMDGPUISD::REGISTER_LOAD", + SDTypeProfile<1, 2, [SDTCisPtrTy<1>, SDTCisInt<2>]>, + [SDNPHasChain, SDNPMayLoad]>; + +def AMDGPUregister_store : SDNode<"AMDGPUISD::REGISTER_STORE", + SDTypeProfile<0, 3, [SDTCisPtrTy<1>, SDTCisInt<2>]>, + [SDNPHasChain, SDNPMayStore]>; diff --git a/lib/Target/R600/AMDGPUInstructions.td b/lib/Target/R600/AMDGPUInstructions.td index e634d20..960f108 100644 --- a/lib/Target/R600/AMDGPUInstructions.td +++ b/lib/Target/R600/AMDGPUInstructions.td @@ -13,8 +13,8 @@ //===----------------------------------------------------------------------===// class AMDGPUInst <dag outs, dag ins, string asm, list<dag> pattern> : Instruction { - field bits<16> AMDILOp = 0; - field bits<3> Gen = 0; + field bit isRegisterLoad = 0; + field bit isRegisterStore = 0; let Namespace = "AMDGPU"; let OutOperandList = outs; @@ -22,8 +22,9 @@ class AMDGPUInst <dag outs, dag ins, string asm, list<dag> pattern> : Instructio let AsmString = asm; let Pattern = pattern; let Itinerary = NullALU; - let TSFlags{42-40} = Gen; - let TSFlags{63-48} = AMDILOp; + + let TSFlags{63} = isRegisterLoad; + let TSFlags{62} = isRegisterStore; } class AMDGPUShaderInst <dag outs, dag ins, string asm, list<dag> pattern> @@ -76,6 +77,11 @@ def COND_LE : PatLeaf < case ISD::SETLE: return true;}}}] >; +def COND_NULL : PatLeaf < + (cond), + [{return false;}] +>; + //===----------------------------------------------------------------------===// // Load/Store Pattern Fragments //===----------------------------------------------------------------------===// @@ -101,7 +107,9 @@ def FP_ONE : PatLeaf < [{return N->isExactlyValue(1.0);}] >; -let isCodeGenOnly = 1, isPseudo = 1, usesCustomInserter = 1 in { +let isCodeGenOnly = 1, isPseudo = 1 in { + +let usesCustomInserter = 1 in { class CLAMP <RegisterClass rc> : AMDGPUShaderInst < (outs rc:$dst), @@ -131,7 +139,31 @@ def SHADER_TYPE : AMDGPUShaderInst < [(int_AMDGPU_shader_type imm:$type)] >; -} // End isCodeGenOnly = 1, isPseudo = 1, hasCustomInserter = 1 +} // usesCustomInserter = 1 + +multiclass RegisterLoadStore <RegisterClass dstClass, Operand addrClass, + ComplexPattern addrPat> { + def RegisterLoad : AMDGPUShaderInst < + (outs dstClass:$dst), + (ins addrClass:$addr, i32imm:$chan), + "RegisterLoad $dst, $addr", + [(set (i32 dstClass:$dst), (AMDGPUregister_load addrPat:$addr, + (i32 timm:$chan)))] + > { + let isRegisterLoad = 1; + } + + def RegisterStore : AMDGPUShaderInst < + (outs), + (ins dstClass:$val, addrClass:$addr, i32imm:$chan), + "RegisterStore $val, $addr", + [(AMDGPUregister_store (i32 dstClass:$val), addrPat:$addr, (i32 timm:$chan))] + > { + let isRegisterStore = 1; + } +} + +} // End isCodeGenOnly = 1, isPseudo = 1 /* Generic helper patterns 
for intrinsics */ /* -------------------------------------- */ @@ -164,13 +196,64 @@ class Insert_Element <ValueType elem_type, ValueType vec_type, >; // Vector Build pattern +class Vector1_Build <ValueType vecType, RegisterClass vectorClass, + ValueType elemType, RegisterClass elemClass> : Pat < + (vecType (build_vector (elemType elemClass:$src))), + (vecType elemClass:$src) +>; + +class Vector2_Build <ValueType vecType, RegisterClass vectorClass, + ValueType elemType, RegisterClass elemClass> : Pat < + (vecType (build_vector (elemType elemClass:$sub0), (elemType elemClass:$sub1))), + (INSERT_SUBREG (INSERT_SUBREG + (vecType (IMPLICIT_DEF)), elemClass:$sub0, sub0), elemClass:$sub1, sub1) +>; + class Vector_Build <ValueType vecType, RegisterClass vectorClass, ValueType elemType, RegisterClass elemClass> : Pat < (vecType (build_vector (elemType elemClass:$x), (elemType elemClass:$y), (elemType elemClass:$z), (elemType elemClass:$w))), (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG - (vecType (IMPLICIT_DEF)), elemClass:$x, sel_x), elemClass:$y, sel_y), - elemClass:$z, sel_z), elemClass:$w, sel_w) + (vecType (IMPLICIT_DEF)), elemClass:$x, sub0), elemClass:$y, sub1), + elemClass:$z, sub2), elemClass:$w, sub3) +>; + +class Vector8_Build <ValueType vecType, RegisterClass vectorClass, + ValueType elemType, RegisterClass elemClass> : Pat < + (vecType (build_vector (elemType elemClass:$sub0), (elemType elemClass:$sub1), + (elemType elemClass:$sub2), (elemType elemClass:$sub3), + (elemType elemClass:$sub4), (elemType elemClass:$sub5), + (elemType elemClass:$sub6), (elemType elemClass:$sub7))), + (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG + (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG + (vecType (IMPLICIT_DEF)), elemClass:$sub0, sub0), elemClass:$sub1, sub1), + elemClass:$sub2, sub2), elemClass:$sub3, sub3), + elemClass:$sub4, sub4), elemClass:$sub5, sub5), + elemClass:$sub6, sub6), elemClass:$sub7, sub7) +>; + +class Vector16_Build <ValueType vecType, RegisterClass vectorClass, + ValueType elemType, RegisterClass elemClass> : Pat < + (vecType (build_vector (elemType elemClass:$sub0), (elemType elemClass:$sub1), + (elemType elemClass:$sub2), (elemType elemClass:$sub3), + (elemType elemClass:$sub4), (elemType elemClass:$sub5), + (elemType elemClass:$sub6), (elemType elemClass:$sub7), + (elemType elemClass:$sub8), (elemType elemClass:$sub9), + (elemType elemClass:$sub10), (elemType elemClass:$sub11), + (elemType elemClass:$sub12), (elemType elemClass:$sub13), + (elemType elemClass:$sub14), (elemType elemClass:$sub15))), + (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG + (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG + (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG + (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG + (vecType (IMPLICIT_DEF)), elemClass:$sub0, sub0), elemClass:$sub1, sub1), + elemClass:$sub2, sub2), elemClass:$sub3, sub3), + elemClass:$sub4, sub4), elemClass:$sub5, sub5), + elemClass:$sub6, sub6), elemClass:$sub7, sub7), + elemClass:$sub8, sub8), elemClass:$sub9, sub9), + elemClass:$sub10, sub10), elemClass:$sub11, sub11), + elemClass:$sub12, sub12), elemClass:$sub13, sub13), + elemClass:$sub14, sub14), elemClass:$sub15, sub15) >; // bitconvert pattern diff --git a/lib/Target/R600/AMDGPURegisterInfo.cpp b/lib/Target/R600/AMDGPURegisterInfo.cpp index eeafec8..fe994d2 100644 --- a/lib/Target/R600/AMDGPURegisterInfo.cpp +++ b/lib/Target/R600/AMDGPURegisterInfo.cpp @@ -38,6 +38,7 @@ const uint16_t* 
AMDGPURegisterInfo::getCalleeSavedRegs(const MachineFunction *MF void AMDGPURegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj, + unsigned FIOperandNum, RegScavenger *RS) const { assert(!"Subroutines not supported yet"); } @@ -47,5 +48,28 @@ unsigned AMDGPURegisterInfo::getFrameRegister(const MachineFunction &MF) const { return 0; } +unsigned AMDGPURegisterInfo::getIndirectSubReg(unsigned IndirectIndex) const { + + switch(IndirectIndex) { + case 0: return AMDGPU::sub0; + case 1: return AMDGPU::sub1; + case 2: return AMDGPU::sub2; + case 3: return AMDGPU::sub3; + case 4: return AMDGPU::sub4; + case 5: return AMDGPU::sub5; + case 6: return AMDGPU::sub6; + case 7: return AMDGPU::sub7; + case 8: return AMDGPU::sub8; + case 9: return AMDGPU::sub9; + case 10: return AMDGPU::sub10; + case 11: return AMDGPU::sub11; + case 12: return AMDGPU::sub12; + case 13: return AMDGPU::sub13; + case 14: return AMDGPU::sub14; + case 15: return AMDGPU::sub15; + default: llvm_unreachable("indirect index out of range"); + } +} + #define GET_REGINFO_TARGET_DESC #include "AMDGPUGenRegisterInfo.inc" diff --git a/lib/Target/R600/AMDGPURegisterInfo.h b/lib/Target/R600/AMDGPURegisterInfo.h index 76ee7ae..1fc88e7 100644 --- a/lib/Target/R600/AMDGPURegisterInfo.h +++ b/lib/Target/R600/AMDGPURegisterInfo.h @@ -53,9 +53,12 @@ struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo { const uint16_t* getCalleeSavedRegs(const MachineFunction *MF) const; void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj, + unsigned FIOperandNum, RegScavenger *RS) const; unsigned getFrameRegister(const MachineFunction &MF) const; + unsigned getIndirectSubReg(unsigned IndirectIndex) const; + }; } // End namespace llvm diff --git a/lib/Target/R600/AMDGPURegisterInfo.td b/lib/Target/R600/AMDGPURegisterInfo.td index 8181e02..b5aca03 100644 --- a/lib/Target/R600/AMDGPURegisterInfo.td +++ b/lib/Target/R600/AMDGPURegisterInfo.td @@ -12,10 +12,13 @@ //===----------------------------------------------------------------------===// let Namespace = "AMDGPU" in { - def sel_x : SubRegIndex; - def sel_y : SubRegIndex; - def sel_z : SubRegIndex; - def sel_w : SubRegIndex; + +foreach Index = 0-15 in { + def sub#Index : SubRegIndex; +} + +def INDIRECT_BASE_ADDR : Register <"INDIRECT_BASE_ADDR">; + } include "R600RegisterInfo.td" diff --git a/lib/Target/R600/AMDGPUStructurizeCFG.cpp b/lib/Target/R600/AMDGPUStructurizeCFG.cpp index 8295efd..26f842e 100644 --- a/lib/Target/R600/AMDGPUStructurizeCFG.cpp +++ b/lib/Target/R600/AMDGPUStructurizeCFG.cpp @@ -22,30 +22,101 @@ #include "llvm/Analysis/RegionPass.h" #include "llvm/IR/Module.h" #include "llvm/Transforms/Utils/SSAUpdater.h" +#include "llvm/Support/PatternMatch.h" using namespace llvm; +using namespace llvm::PatternMatch; namespace { // Definition of the complex types used in this pass. 
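// Illustrative sketch (hypothetical, not from this patch): if TableGen were
// guaranteed to assign sub0..sub15 consecutive enum values, getIndirectSubReg
// above would reduce to simple arithmetic; the switch form deliberately avoids
// relying on that enum layout.
static unsigned sketchIndirectSubReg(unsigned IndirectIndex) {
  assert(IndirectIndex < 16 && "indirect index out of range");
  return AMDGPU::sub0 + IndirectIndex; // assumes consecutive enum values
}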
typedef std::pair<BasicBlock *, Value *> BBValuePair; -typedef ArrayRef<BasicBlock*> BBVecRef; typedef SmallVector<RegionNode*, 8> RNVector; typedef SmallVector<BasicBlock*, 8> BBVector; +typedef SmallVector<BranchInst*, 8> BranchVector; typedef SmallVector<BBValuePair, 2> BBValueVector; +typedef SmallPtrSet<BasicBlock *, 8> BBSet; + typedef DenseMap<PHINode *, BBValueVector> PhiMap; +typedef DenseMap<DomTreeNode *, unsigned> DTN2UnsignedMap; typedef DenseMap<BasicBlock *, PhiMap> BBPhiMap; typedef DenseMap<BasicBlock *, Value *> BBPredicates; typedef DenseMap<BasicBlock *, BBPredicates> PredMap; -typedef DenseMap<BasicBlock *, unsigned> VisitedMap; +typedef DenseMap<BasicBlock *, BasicBlock*> BB2BBMap; +typedef DenseMap<BasicBlock *, BBVector> BB2BBVecMap; // The name for newly created blocks. static const char *FlowBlockName = "Flow"; +/// @brief Find the nearest common dominator for multiple BasicBlocks +/// +/// Helper class for AMDGPUStructurizeCFG +/// TODO: Maybe move into common code +class NearestCommonDominator { + + DominatorTree *DT; + + DTN2UnsignedMap IndexMap; + + BasicBlock *Result; + unsigned ResultIndex; + bool ExplicitMentioned; + +public: + /// \brief Start a new query + NearestCommonDominator(DominatorTree *DomTree) { + DT = DomTree; + Result = 0; + } + + /// \brief Add BB to the resulting dominator + void addBlock(BasicBlock *BB, bool Remember = true) { + + DomTreeNode *Node = DT->getNode(BB); + + if (Result == 0) { + unsigned Numbering = 0; + for (;Node;Node = Node->getIDom()) + IndexMap[Node] = ++Numbering; + Result = BB; + ResultIndex = 1; + ExplicitMentioned = Remember; + return; + } + + for (;Node;Node = Node->getIDom()) + if (IndexMap.count(Node)) + break; + else + IndexMap[Node] = 0; + + assert(Node && "Dominator tree invalid!"); + + unsigned Numbering = IndexMap[Node]; + if (Numbering > ResultIndex) { + Result = Node->getBlock(); + ResultIndex = Numbering; + ExplicitMentioned = Remember && (Result == BB); + } else if (Numbering == ResultIndex) { + ExplicitMentioned |= Remember; + } + } + + /// \brief Is "Result" one of the BBs added with "Remember" = True? + bool wasResultExplicitMentioned() { + return ExplicitMentioned; + } + + /// \brief Get the query result + BasicBlock *getResult() { + return Result; + } +}; + /// @brief Transforms the control flow graph on one single entry/exit region /// at a time. 
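// Illustrative usage sketch (hypothetical, not from this patch) for the
// NearestCommonDominator helper defined above; assumes DT is up to date and
// A, B, C are reachable blocks of the same function.
static BasicBlock *sketchFindNCD(DominatorTree *DT, BasicBlock *A,
                                 BasicBlock *B, BasicBlock *C) {
  NearestCommonDominator Dominator(DT);
  Dominator.addBlock(A, false); // seed only, not remembered as a candidate
  Dominator.addBlock(B);
  Dominator.addBlock(C);
  // wasResultExplicitMentioned() now reports whether the answer is B or C
  // itself; insertConditions and setPhiValues below use this to decide
  // whether a default value must still be registered for the result.
  return Dominator.getResult();
}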
 ///
@@ -106,45 +177,62 @@ class AMDGPUStructurizeCFG : public RegionPass {
   DominatorTree *DT;
 
   RNVector Order;
-  VisitedMap Visited;
-  PredMap Predicates;
+  BBSet Visited;
+
   BBPhiMap DeletedPhis;
-  BBVector FlowsInserted;
+  BB2BBVecMap AddedPhis;
+
+  PredMap Predicates;
+  BranchVector Conditions;
 
-  BasicBlock *LoopStart;
-  BasicBlock *LoopEnd;
-  BBPredicates LoopPred;
+  BB2BBMap Loops;
+  PredMap LoopPreds;
+  BranchVector LoopConds;
+
+  RegionNode *PrevNode;
 
   void orderNodes();
 
-  void buildPredicate(BranchInst *Term, unsigned Idx,
-                      BBPredicates &Pred, bool Invert);
+  void analyzeLoops(RegionNode *N);
+
+  Value *invert(Value *Condition);
 
-  void analyzeBlock(BasicBlock *BB);
+  Value *buildCondition(BranchInst *Term, unsigned Idx, bool Invert);
 
-  void analyzeLoop(BasicBlock *BB, unsigned &LoopIdx);
+  void gatherPredicates(RegionNode *N);
 
   void collectInfos();
 
-  bool dominatesPredicates(BasicBlock *A, BasicBlock *B);
+  void insertConditions(bool Loops);
+
+  void delPhiValues(BasicBlock *From, BasicBlock *To);
+
+  void addPhiValues(BasicBlock *From, BasicBlock *To);
+
+  void setPhiValues();
 
   void killTerminator(BasicBlock *BB);
 
-  RegionNode *skipChained(RegionNode *Node);
+  void changeExit(RegionNode *Node, BasicBlock *NewExit,
+                  bool IncludeDominator);
 
-  void delPhiValues(BasicBlock *From, BasicBlock *To);
+  BasicBlock *getNextFlow(BasicBlock *Dominator);
 
-  void addPhiValues(BasicBlock *From, BasicBlock *To);
+  BasicBlock *needPrefix(bool NeedEmpty);
 
-  BasicBlock *getNextFlow(BasicBlock *Prev);
+  BasicBlock *needPostfix(BasicBlock *Flow, bool ExitUseAllowed);
 
-  bool isPredictableTrue(BasicBlock *Prev, BasicBlock *Node);
+  void setPrevNode(BasicBlock *BB);
 
-  BasicBlock *wireFlowBlock(BasicBlock *Prev, RegionNode *Node);
+  bool dominatesPredicates(BasicBlock *BB, RegionNode *Node);
 
-  void createFlow();
+  bool isPredictableTrue(RegionNode *Node);
+
+  void wireFlow(bool ExitUseAllowed, BasicBlock *LoopEnd);
 
-  void insertConditions();
+  void handleLoops(bool ExitUseAllowed, BasicBlock *LoopEnd);
+
+  void createFlow();
 
   void rebuildSSA();
 
@@ -198,212 +286,214 @@ void AMDGPUStructurizeCFG::orderNodes() {
   }
 }
 
-/// \brief Build blocks and loop predicates
-void AMDGPUStructurizeCFG::buildPredicate(BranchInst *Term, unsigned Idx,
-                                          BBPredicates &Pred, bool Invert) {
-  Value *True = Invert ? BoolFalse : BoolTrue;
-  Value *False = Invert ? BoolTrue : BoolFalse;
+/// \brief Determine the end of the loops
+void AMDGPUStructurizeCFG::analyzeLoops(RegionNode *N) {
 
-  RegionInfo *RI = ParentRegion->getRegionInfo();
-  BasicBlock *BB = Term->getParent();
+  if (N->isSubRegion()) {
+    // Test for exit as back edge
+    BasicBlock *Exit = N->getNodeAs<Region>()->getExit();
+    if (Visited.count(Exit))
+      Loops[Exit] = N->getEntry();
+
+  } else {
+    // Test for successors as back edge
+    BasicBlock *BB = N->getNodeAs<BasicBlock>();
+    BranchInst *Term = cast<BranchInst>(BB->getTerminator());
+
+    for (unsigned i = 0, e = Term->getNumSuccessors(); i != e; ++i) {
+      BasicBlock *Succ = Term->getSuccessor(i);
 
-  // Handle the case where multiple regions start at the same block
-  Region *R = BB != ParentRegion->getEntry() ?
- RI->getRegionFor(BB) : ParentRegion; + if (Visited.count(Succ)) + Loops[Succ] = BB; + } + } +} - if (R == ParentRegion) { - // It's a top level block in our region - Value *Cond = True; - if (Term->isConditional()) { - BasicBlock *Other = Term->getSuccessor(!Idx); +/// \brief Invert the given condition +Value *AMDGPUStructurizeCFG::invert(Value *Condition) { - if (Visited.count(Other)) { - if (!Pred.count(Other)) - Pred[Other] = False; + // First: Check if it's a constant + if (Condition == BoolTrue) + return BoolFalse; - if (!Pred.count(BB)) - Pred[BB] = True; - return; - } - Cond = Term->getCondition(); + if (Condition == BoolFalse) + return BoolTrue; - if (Idx != Invert) - Cond = BinaryOperator::CreateNot(Cond, "", Term); - } + if (Condition == BoolUndef) + return BoolUndef; - Pred[BB] = Cond; + // Second: If the condition is already inverted, return the original value + if (match(Condition, m_Not(m_Value(Condition)))) + return Condition; - } else if (ParentRegion->contains(R)) { - // It's a block in a sub region - while(R->getParent() != ParentRegion) - R = R->getParent(); + // Third: Check all the users for an invert + BasicBlock *Parent = cast<Instruction>(Condition)->getParent(); + for (Value::use_iterator I = Condition->use_begin(), + E = Condition->use_end(); I != E; ++I) { - Pred[R->getEntry()] = True; + Instruction *User = dyn_cast<Instruction>(*I); + if (!User || User->getParent() != Parent) + continue; - } else { - // It's a branch from outside into our parent region - Pred[BB] = True; + if (match(*I, m_Not(m_Specific(Condition)))) + return *I; } -} -/// \brief Analyze the successors of each block and build up predicates -void AMDGPUStructurizeCFG::analyzeBlock(BasicBlock *BB) { - pred_iterator PI = pred_begin(BB), PE = pred_end(BB); - BBPredicates &Pred = Predicates[BB]; + // Last option: Create a new instruction + return BinaryOperator::CreateNot(Condition, "", Parent->getTerminator()); +} - for (; PI != PE; ++PI) { - BranchInst *Term = cast<BranchInst>((*PI)->getTerminator()); +/// \brief Build the condition for one edge +Value *AMDGPUStructurizeCFG::buildCondition(BranchInst *Term, unsigned Idx, + bool Invert) { + Value *Cond = Invert ? 
BoolFalse : BoolTrue; + if (Term->isConditional()) { + Cond = Term->getCondition(); - for (unsigned i = 0, e = Term->getNumSuccessors(); i != e; ++i) { - BasicBlock *Succ = Term->getSuccessor(i); - if (Succ != BB) - continue; - buildPredicate(Term, i, Pred, false); - } + if (Idx != Invert) + Cond = invert(Cond); } + return Cond; } -/// \brief Analyze the conditions leading to loop to a previous block -void AMDGPUStructurizeCFG::analyzeLoop(BasicBlock *BB, unsigned &LoopIdx) { - BranchInst *Term = cast<BranchInst>(BB->getTerminator()); +/// \brief Analyze the predecessors of each block and build up predicates +void AMDGPUStructurizeCFG::gatherPredicates(RegionNode *N) { - for (unsigned i = 0, e = Term->getNumSuccessors(); i != e; ++i) { - BasicBlock *Succ = Term->getSuccessor(i); + RegionInfo *RI = ParentRegion->getRegionInfo(); + BasicBlock *BB = N->getEntry(); + BBPredicates &Pred = Predicates[BB]; + BBPredicates &LPred = LoopPreds[BB]; + + for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); + PI != PE; ++PI) { - // Ignore it if it's not a back edge - if (!Visited.count(Succ)) + // Ignore it if it's a branch from outside into our region entry + if (!ParentRegion->contains(*PI)) continue; - buildPredicate(Term, i, LoopPred, true); + Region *R = RI->getRegionFor(*PI); + if (R == ParentRegion) { - LoopEnd = BB; - if (Visited[Succ] < LoopIdx) { - LoopIdx = Visited[Succ]; - LoopStart = Succ; + // It's a top level block in our region + BranchInst *Term = cast<BranchInst>((*PI)->getTerminator()); + for (unsigned i = 0, e = Term->getNumSuccessors(); i != e; ++i) { + BasicBlock *Succ = Term->getSuccessor(i); + if (Succ != BB) + continue; + + if (Visited.count(*PI)) { + // Normal forward edge + if (Term->isConditional()) { + // Try to treat it like an ELSE block + BasicBlock *Other = Term->getSuccessor(!i); + if (Visited.count(Other) && !Loops.count(Other) && + !Pred.count(Other) && !Pred.count(*PI)) { + + Pred[Other] = BoolFalse; + Pred[*PI] = BoolTrue; + continue; + } + } + Pred[*PI] = buildCondition(Term, i, false); + + } else { + // Back edge + LPred[*PI] = buildCondition(Term, i, true); + } + } + + } else { + + // It's an exit from a sub region + while(R->getParent() != ParentRegion) + R = R->getParent(); + + // Edge from inside a subregion to its entry, ignore it + if (R == N) + continue; + + BasicBlock *Entry = R->getEntry(); + if (Visited.count(Entry)) + Pred[Entry] = BoolTrue; + else + LPred[Entry] = BoolFalse; } } } /// \brief Collect various loop and predicate infos void AMDGPUStructurizeCFG::collectInfos() { - unsigned Number = 0, LoopIdx = ~0; // Reset predicate Predicates.clear(); // and loop infos - LoopStart = LoopEnd = 0; - LoopPred.clear(); + Loops.clear(); + LoopPreds.clear(); + + // Reset the visited nodes + Visited.clear(); - RNVector::reverse_iterator OI = Order.rbegin(), OE = Order.rend(); - for (Visited.clear(); OI != OE; Visited[(*OI++)->getEntry()] = ++Number) { + for (RNVector::reverse_iterator OI = Order.rbegin(), OE = Order.rend(); + OI != OE; ++OI) { // Analyze all the conditions leading to a node - analyzeBlock((*OI)->getEntry()); + gatherPredicates(*OI); - if ((*OI)->isSubRegion()) - continue; + // Remember that we've seen this node + Visited.insert((*OI)->getEntry()); - // Find the first/last loop nodes and loop predicates - analyzeLoop((*OI)->getNodeAs<BasicBlock>(), LoopIdx); + // Find the last back edges + analyzeLoops(*OI); } } -/// \brief Does A dominate all the predicates of B ? 
-bool AMDGPUStructurizeCFG::dominatesPredicates(BasicBlock *A, BasicBlock *B) { - BBPredicates &Preds = Predicates[B]; - for (BBPredicates::iterator PI = Preds.begin(), PE = Preds.end(); - PI != PE; ++PI) { +/// \brief Insert the missing branch conditions +void AMDGPUStructurizeCFG::insertConditions(bool Loops) { + BranchVector &Conds = Loops ? LoopConds : Conditions; + Value *Default = Loops ? BoolTrue : BoolFalse; + SSAUpdater PhiInserter; - if (!DT->dominates(A, PI->first)) - return false; - } - return true; -} + for (BranchVector::iterator I = Conds.begin(), + E = Conds.end(); I != E; ++I) { -/// \brief Remove phi values from all successors and the remove the terminator. -void AMDGPUStructurizeCFG::killTerminator(BasicBlock *BB) { - TerminatorInst *Term = BB->getTerminator(); - if (!Term) - return; + BranchInst *Term = *I; + assert(Term->isConditional()); - for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); - SI != SE; ++SI) { + BasicBlock *Parent = Term->getParent(); + BasicBlock *SuccTrue = Term->getSuccessor(0); + BasicBlock *SuccFalse = Term->getSuccessor(1); - delPhiValues(BB, *SI); - } + PhiInserter.Initialize(Boolean, ""); + PhiInserter.AddAvailableValue(&Func->getEntryBlock(), Default); + PhiInserter.AddAvailableValue(Loops ? SuccFalse : Parent, Default); - Term->eraseFromParent(); -} + BBPredicates &Preds = Loops ? LoopPreds[SuccFalse] : Predicates[SuccTrue]; -/// First: Skip forward to the first region node that either isn't a subregion or not -/// dominating it's exit, remove all the skipped nodes from the node order. -/// -/// Second: Handle the first successor directly if the resulting nodes successor -/// predicates are still dominated by the original entry -RegionNode *AMDGPUStructurizeCFG::skipChained(RegionNode *Node) { - BasicBlock *Entry = Node->getEntry(); + NearestCommonDominator Dominator(DT); + Dominator.addBlock(Parent, false); - // Skip forward as long as it is just a linear flow - while (true) { - BasicBlock *Entry = Node->getEntry(); - BasicBlock *Exit; + Value *ParentValue = 0; + for (BBPredicates::iterator PI = Preds.begin(), PE = Preds.end(); + PI != PE; ++PI) { - if (Node->isSubRegion()) { - Exit = Node->getNodeAs<Region>()->getExit(); - } else { - TerminatorInst *Term = Entry->getTerminator(); - if (Term->getNumSuccessors() != 1) + if (PI->first == Parent) { + ParentValue = PI->second; break; - Exit = Term->getSuccessor(0); + } + PhiInserter.AddAvailableValue(PI->first, PI->second); + Dominator.addBlock(PI->first); } - // It's a back edge, break here so we can insert a loop node - if (!Visited.count(Exit)) - return Node; - - // More than node edges are pointing to exit - if (!DT->dominates(Entry, Exit)) - return Node; - - RegionNode *Next = ParentRegion->getNode(Exit); - RNVector::iterator I = std::find(Order.begin(), Order.end(), Next); - assert(I != Order.end()); - - Visited.erase(Next->getEntry()); - Order.erase(I); - Node = Next; - } + if (ParentValue) { + Term->setCondition(ParentValue); + } else { + if (!Dominator.wasResultExplicitMentioned()) + PhiInserter.AddAvailableValue(Dominator.getResult(), Default); - BasicBlock *BB = Node->getEntry(); - TerminatorInst *Term = BB->getTerminator(); - if (Term->getNumSuccessors() != 2) - return Node; - - // Our node has exactly two succesors, check if we can handle - // any of them directly - BasicBlock *Succ = Term->getSuccessor(0); - if (!Visited.count(Succ) || !dominatesPredicates(Entry, Succ)) { - Succ = Term->getSuccessor(1); - if (!Visited.count(Succ) || !dominatesPredicates(Entry, Succ)) - 
return Node;
-  } else {
-    BasicBlock *Succ2 = Term->getSuccessor(1);
-    if (Visited.count(Succ2) && Visited[Succ] > Visited[Succ2] &&
-        dominatesPredicates(Entry, Succ2))
-      Succ = Succ2;
+    if (ParentValue) {
+      Term->setCondition(ParentValue);
+    } else {
+      if (!Dominator.wasResultExplicitMentioned())
+        PhiInserter.AddAvailableValue(Dominator.getResult(), Default);
+
+      Term->setCondition(PhiInserter.GetValueInMiddleOfBlock(Parent));
+    }
   }
-
-  RegionNode *Next = ParentRegion->getNode(Succ);
-  RNVector::iterator E = Order.end();
-  RNVector::iterator I = std::find(Order.begin(), E, Next);
-  assert(I != E);
-
-  killTerminator(BB);
-  FlowsInserted.push_back(BB);
-  Visited.erase(Succ);
-  Order.erase(I);
-  return ParentRegion->getNode(wireFlowBlock(BB, Next));
 }
 
 /// \brief Remove all PHI values coming from "From" into "To" and remember
@@ -421,224 +511,306 @@ void AMDGPUStructurizeCFG::delPhiValues(BasicBlock *From, BasicBlock *To) {
   }
 }
 
-/// \brief Add the PHI values back once we knew the new predecessor
+/// \brief Add a dummy PHI value as soon as we know the new predecessor
 void AMDGPUStructurizeCFG::addPhiValues(BasicBlock *From, BasicBlock *To) {
-  if (!DeletedPhis.count(To))
-    return;
+  for (BasicBlock::iterator I = To->begin(), E = To->end();
+       I != E && isa<PHINode>(*I);) {
+
+    PHINode &Phi = cast<PHINode>(*I++);
+    Value *Undef = UndefValue::get(Phi.getType());
+    Phi.addIncoming(Undef, From);
+  }
+  AddedPhis[To].push_back(From);
+}
+
+/// \brief Add the real PHI value as soon as everything is set up
 void AMDGPUStructurizeCFG::setPhiValues() {
 
-  PhiMap &Map = DeletedPhis[To];
   SSAUpdater Updater;
+  for (BB2BBVecMap::iterator AI = AddedPhis.begin(), AE = AddedPhis.end();
+       AI != AE; ++AI) {
 
-  for (PhiMap::iterator I = Map.begin(), E = Map.end(); I != E; ++I) {
+    BasicBlock *To = AI->first;
+    BBVector &From = AI->second;
 
-    PHINode *Phi = I->first;
-    Updater.Initialize(Phi->getType(), "");
-    BasicBlock *Fallback = To;
-    bool HaveFallback = false;
+    if (!DeletedPhis.count(To))
+      continue;
 
-    for (BBValueVector::iterator VI = I->second.begin(), VE = I->second.end();
-         VI != VE; ++VI) {
+    PhiMap &Map = DeletedPhis[To];
+    for (PhiMap::iterator PI = Map.begin(), PE = Map.end();
+         PI != PE; ++PI) {
 
-      Updater.AddAvailableValue(VI->first, VI->second);
-      BasicBlock *Dom = DT->findNearestCommonDominator(Fallback, VI->first);
-      if (Dom == VI->first)
-        HaveFallback = true;
-      else if (Dom != Fallback)
-        HaveFallback = false;
-      Fallback = Dom;
-    }
-    if (!HaveFallback) {
+      PHINode *Phi = PI->first;
       Value *Undef = UndefValue::get(Phi->getType());
-      Updater.AddAvailableValue(Fallback, Undef);
+      Updater.Initialize(Phi->getType(), "");
+      Updater.AddAvailableValue(&Func->getEntryBlock(), Undef);
+      Updater.AddAvailableValue(To, Undef);
+
+      NearestCommonDominator Dominator(DT);
+      Dominator.addBlock(To, false);
+      for (BBValueVector::iterator VI = PI->second.begin(),
+           VE = PI->second.end(); VI != VE; ++VI) {
+
+        Updater.AddAvailableValue(VI->first, VI->second);
+        Dominator.addBlock(VI->first);
+      }
+
+      if (!Dominator.wasResultExplicitMentioned())
+        Updater.AddAvailableValue(Dominator.getResult(), Undef);
+
+      for (BBVector::iterator FI = From.begin(), FE = From.end();
+           FI != FE; ++FI) {
+
+        int Idx = Phi->getBasicBlockIndex(*FI);
+        assert(Idx != -1);
+        Phi->setIncomingValue(Idx, Updater.GetValueAtEndOfBlock(*FI));
+      }
+    }
+
+    DeletedPhis.erase(To);
+  }
+  assert(DeletedPhis.empty());
+}
+
+/// \brief Remove phi values from all successors and then remove the terminator.
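// Illustrative sketch (hypothetical helper, not from this patch) of the
// SSAUpdater pattern that setPhiValues above is built on: register the value
// each defining block provides, then ask for the value reaching the end of a
// predecessor; SSAUpdater inserts any PHI nodes needed along the way.
static Value *sketchValueAtEndOf(Type *Ty, BasicBlock *Entry,
                                 const BBValueVector &Defs, BasicBlock *Pred) {
  SSAUpdater Updater;
  Updater.Initialize(Ty, "");
  Updater.AddAvailableValue(Entry, UndefValue::get(Ty)); // undef fallback
  for (BBValueVector::const_iterator I = Defs.begin(), E = Defs.end();
       I != E; ++I)
    Updater.AddAvailableValue(I->first, I->second); // real incoming values
  return Updater.GetValueAtEndOfBlock(Pred);
}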
+void AMDGPUStructurizeCFG::killTerminator(BasicBlock *BB) { + TerminatorInst *Term = BB->getTerminator(); + if (!Term) + return; + + for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); + SI != SE; ++SI) { + + delPhiValues(BB, *SI); + } + + Term->eraseFromParent(); +} + +/// \brief Let node exit(s) point to NewExit +void AMDGPUStructurizeCFG::changeExit(RegionNode *Node, BasicBlock *NewExit, + bool IncludeDominator) { + + if (Node->isSubRegion()) { + Region *SubRegion = Node->getNodeAs<Region>(); + BasicBlock *OldExit = SubRegion->getExit(); + BasicBlock *Dominator = 0; + + // Find all the edges from the sub region to the exit + for (pred_iterator I = pred_begin(OldExit), E = pred_end(OldExit); + I != E;) { + + BasicBlock *BB = *I++; + if (!SubRegion->contains(BB)) + continue; + + // Modify the edges to point to the new exit + delPhiValues(BB, OldExit); + BB->getTerminator()->replaceUsesOfWith(OldExit, NewExit); + addPhiValues(BB, NewExit); + + // Find the new dominator (if requested) + if (IncludeDominator) { + if (!Dominator) + Dominator = BB; + else + Dominator = DT->findNearestCommonDominator(Dominator, BB); + } } - Phi->addIncoming(Updater.GetValueAtEndOfBlock(From), From); + // Change the dominator (if requested) + if (Dominator) + DT->changeImmediateDominator(NewExit, Dominator); + + // Update the region info + SubRegion->replaceExit(NewExit); + + } else { + BasicBlock *BB = Node->getNodeAs<BasicBlock>(); + killTerminator(BB); + BranchInst::Create(NewExit, BB); + addPhiValues(BB, NewExit); + if (IncludeDominator) + DT->changeImmediateDominator(NewExit, BB); } - DeletedPhis.erase(To); } /// \brief Create a new flow node and update dominator tree and region info -BasicBlock *AMDGPUStructurizeCFG::getNextFlow(BasicBlock *Prev) { +BasicBlock *AMDGPUStructurizeCFG::getNextFlow(BasicBlock *Dominator) { LLVMContext &Context = Func->getContext(); BasicBlock *Insert = Order.empty() ? ParentRegion->getExit() : Order.back()->getEntry(); BasicBlock *Flow = BasicBlock::Create(Context, FlowBlockName, Func, Insert); - DT->addNewBlock(Flow, Prev); + DT->addNewBlock(Flow, Dominator); ParentRegion->getRegionInfo()->setRegionFor(Flow, ParentRegion); - FlowsInserted.push_back(Flow); return Flow; } +/// \brief Create a new or reuse the previous node as flow node +BasicBlock *AMDGPUStructurizeCFG::needPrefix(bool NeedEmpty) { + + BasicBlock *Entry = PrevNode->getEntry(); + + if (!PrevNode->isSubRegion()) { + killTerminator(Entry); + if (!NeedEmpty || Entry->getFirstInsertionPt() == Entry->end()) + return Entry; + + } + + // create a new flow node + BasicBlock *Flow = getNextFlow(Entry); + + // and wire it up + changeExit(PrevNode, Flow, true); + PrevNode = ParentRegion->getBBNode(Flow); + return Flow; +} + +/// \brief Returns the region exit if possible, otherwise just a new flow node +BasicBlock *AMDGPUStructurizeCFG::needPostfix(BasicBlock *Flow, + bool ExitUseAllowed) { + + if (Order.empty() && ExitUseAllowed) { + BasicBlock *Exit = ParentRegion->getExit(); + DT->changeImmediateDominator(Exit, Flow); + addPhiValues(Flow, Exit); + return Exit; + } + return getNextFlow(Flow); +} + +/// \brief Set the previous node +void AMDGPUStructurizeCFG::setPrevNode(BasicBlock *BB) { + PrevNode = ParentRegion->contains(BB) ? ParentRegion->getBBNode(BB) : 0; +} + +/// \brief Does BB dominate all the predicates of Node ? 
+bool AMDGPUStructurizeCFG::dominatesPredicates(BasicBlock *BB, RegionNode *Node) {
+  BBPredicates &Preds = Predicates[Node->getEntry()];
+  for (BBPredicates::iterator PI = Preds.begin(), PE = Preds.end();
+       PI != PE; ++PI) {
+
+    if (!DT->dominates(BB, PI->first))
+      return false;
+  }
+  return true;
+}
+
 /// \brief Can we predict that this node will always be called?
-bool AMDGPUStructurizeCFG::isPredictableTrue(BasicBlock *Prev,
-                                             BasicBlock *Node) {
-  BBPredicates &Preds = Predicates[Node];
+bool AMDGPUStructurizeCFG::isPredictableTrue(RegionNode *Node) {
+
+  BBPredicates &Preds = Predicates[Node->getEntry()];
   bool Dominated = false;
 
+  // Region entry is always true
+  if (PrevNode == 0)
+    return true;
+
   for (BBPredicates::iterator I = Preds.begin(), E = Preds.end();
        I != E; ++I) {
 
     if (I->second != BoolTrue)
       return false;
 
-    if (!Dominated && DT->dominates(I->first, Prev))
+    if (!Dominated && DT->dominates(I->first, PrevNode->getEntry()))
       Dominated = true;
   }
+
+  // TODO: The dominator check is too strict
   return Dominated;
 }
 
-/// \brief Wire up the new control flow by inserting or updating the branch
-/// instructions at node exits
-BasicBlock *AMDGPUStructurizeCFG::wireFlowBlock(BasicBlock *Prev,
-                                                RegionNode *Node) {
-  BasicBlock *Entry = Node->getEntry();
-
-  if (LoopStart == Entry) {
-    LoopStart = Prev;
-    LoopPred[Prev] = BoolTrue;
-  }
+/// Take one node from the order vector and wire it up
+void AMDGPUStructurizeCFG::wireFlow(bool ExitUseAllowed,
+                                    BasicBlock *LoopEnd) {
 
-  // Wire it up temporary, skipChained may recurse into us
-  BranchInst::Create(Entry, Prev);
-  DT->changeImmediateDominator(Entry, Prev);
-  addPhiValues(Prev, Entry);
+  RegionNode *Node = Order.pop_back_val();
+  Visited.insert(Node->getEntry());
 
-  Node = skipChained(Node);
+  if (isPredictableTrue(Node)) {
+    // Just a linear flow
+    if (PrevNode) {
+      changeExit(PrevNode, Node->getEntry(), true);
+    }
+    PrevNode = Node;
 
-  BasicBlock *Next = getNextFlow(Prev);
-  if (!isPredictableTrue(Prev, Entry)) {
-    // Let Prev point to entry and next block
-    Prev->getTerminator()->eraseFromParent();
-    BranchInst::Create(Entry, Next, BoolUndef, Prev);
   } else {
-    DT->changeImmediateDominator(Next, Entry);
-  }
+    // Insert extra prefix node (or reuse last one)
+    BasicBlock *Flow = needPrefix(false);
 
-  // Let node exit(s) point to next block
-  if (Node->isSubRegion()) {
-    Region *SubRegion = Node->getNodeAs<Region>();
-    BasicBlock *Exit = SubRegion->getExit();
+    // Insert extra postfix node (or use exit instead)
+    BasicBlock *Entry = Node->getEntry();
+    BasicBlock *Next = needPostfix(Flow, ExitUseAllowed);
 
-    // Find all the edges from the sub region to the exit
-    BBVector ToDo;
-    for (pred_iterator I = pred_begin(Exit), E = pred_end(Exit); I != E; ++I) {
-      if (SubRegion->contains(*I))
-        ToDo.push_back(*I);
-    }
+    // let it point to entry and next block
+    Conditions.push_back(BranchInst::Create(Entry, Next, BoolUndef, Flow));
+    addPhiValues(Flow, Entry);
+    DT->changeImmediateDominator(Entry, Flow);
 
-    // Modify the edges to point to the new flow block
-    for (BBVector::iterator I = ToDo.begin(), E = ToDo.end(); I != E; ++I) {
-      delPhiValues(*I, Exit);
-      TerminatorInst *Term = (*I)->getTerminator();
-      Term->replaceUsesOfWith(Exit, Next);
+    PrevNode = Node;
+    while (!Order.empty() && !Visited.count(LoopEnd) &&
+           dominatesPredicates(Entry, Order.back())) {
+      handleLoops(false, LoopEnd);
     }
 
-    // Update the region info
-    SubRegion->replaceExit(Next);
-
-  } else {
-    BasicBlock *BB = Node->getNodeAs<BasicBlock>();
-    killTerminator(BB);
-
BranchInst::Create(Next, BB); - - if (BB == LoopEnd) - LoopEnd = 0; + changeExit(PrevNode, Next, false); + setPrevNode(Next); } - - return Next; } -/// Destroy node order and visited map, build up flow order instead. -/// After this function control flow looks like it should be, but -/// branches only have undefined conditions. -void AMDGPUStructurizeCFG::createFlow() { - DeletedPhis.clear(); +void AMDGPUStructurizeCFG::handleLoops(bool ExitUseAllowed, + BasicBlock *LoopEnd) { + RegionNode *Node = Order.back(); + BasicBlock *LoopStart = Node->getEntry(); - BasicBlock *Prev = Order.pop_back_val()->getEntry(); - assert(Prev == ParentRegion->getEntry() && "Incorrect node order!"); - Visited.erase(Prev); - - if (LoopStart == Prev) { - // Loop starts at entry, split entry so that we can predicate it - BasicBlock::iterator Insert = Prev->getFirstInsertionPt(); - BasicBlock *Split = Prev->splitBasicBlock(Insert, FlowBlockName); - DT->addNewBlock(Split, Prev); - ParentRegion->getRegionInfo()->setRegionFor(Split, ParentRegion); - Predicates[Split] = Predicates[Prev]; - Order.push_back(ParentRegion->getBBNode(Split)); - LoopPred[Prev] = BoolTrue; - - } else if (LoopStart == Order.back()->getEntry()) { - // Loop starts behind entry, split entry so that we can jump to it - Instruction *Term = Prev->getTerminator(); - BasicBlock *Split = Prev->splitBasicBlock(Term, FlowBlockName); - DT->addNewBlock(Split, Prev); - ParentRegion->getRegionInfo()->setRegionFor(Split, ParentRegion); - Prev = Split; + if (!Loops.count(LoopStart)) { + wireFlow(ExitUseAllowed, LoopEnd); + return; } - killTerminator(Prev); - FlowsInserted.clear(); - FlowsInserted.push_back(Prev); + if (!isPredictableTrue(Node)) + LoopStart = needPrefix(true); - while (!Order.empty()) { - RegionNode *Node = Order.pop_back_val(); - Visited.erase(Node->getEntry()); - Prev = wireFlowBlock(Prev, Node); - if (LoopStart && !LoopEnd) { - // Create an extra loop end node - LoopEnd = Prev; - Prev = getNextFlow(LoopEnd); - BranchInst::Create(Prev, LoopStart, BoolUndef, LoopEnd); - addPhiValues(LoopEnd, LoopStart); - } + LoopEnd = Loops[Node->getEntry()]; + wireFlow(false, LoopEnd); + while (!Visited.count(LoopEnd)) { + handleLoops(false, LoopEnd); } - BasicBlock *Exit = ParentRegion->getExit(); - BranchInst::Create(Exit, Prev); - addPhiValues(Prev, Exit); - if (DT->dominates(ParentRegion->getEntry(), Exit)) - DT->changeImmediateDominator(Exit, Prev); - - if (LoopStart && LoopEnd) { - BBVector::iterator FI = std::find(FlowsInserted.begin(), - FlowsInserted.end(), - LoopStart); - for (; *FI != LoopEnd; ++FI) { - addPhiValues(*FI, (*FI)->getTerminator()->getSuccessor(0)); - } - } - - assert(Order.empty()); - assert(Visited.empty()); - assert(DeletedPhis.empty()); + // Create an extra loop end node + LoopEnd = needPrefix(false); + BasicBlock *Next = needPostfix(LoopEnd, ExitUseAllowed); + LoopConds.push_back(BranchInst::Create(Next, LoopStart, + BoolUndef, LoopEnd)); + addPhiValues(LoopEnd, LoopStart); + setPrevNode(Next); } -/// \brief Insert the missing branch conditions -void AMDGPUStructurizeCFG::insertConditions() { - SSAUpdater PhiInserter; - - for (BBVector::iterator FI = FlowsInserted.begin(), FE = FlowsInserted.end(); - FI != FE; ++FI) { - - BranchInst *Term = cast<BranchInst>((*FI)->getTerminator()); - if (Term->isUnconditional()) - continue; +/// After this function control flow looks like it should be, but +/// branches and PHI nodes only have undefined conditions. 
+void AMDGPUStructurizeCFG::createFlow() { - PhiInserter.Initialize(Boolean, ""); - PhiInserter.AddAvailableValue(&Func->getEntryBlock(), BoolFalse); + BasicBlock *Exit = ParentRegion->getExit(); + bool EntryDominatesExit = DT->dominates(ParentRegion->getEntry(), Exit); - BasicBlock *Succ = Term->getSuccessor(0); - BBPredicates &Preds = (*FI == LoopEnd) ? LoopPred : Predicates[Succ]; - for (BBPredicates::iterator PI = Preds.begin(), PE = Preds.end(); - PI != PE; ++PI) { + DeletedPhis.clear(); + AddedPhis.clear(); + Conditions.clear(); + LoopConds.clear(); - PhiInserter.AddAvailableValue(PI->first, PI->second); - } + PrevNode = 0; + Visited.clear(); - Term->setCondition(PhiInserter.GetValueAtEndOfBlock(*FI)); + while (!Order.empty()) { + handleLoops(EntryDominatesExit, 0); } + + if (PrevNode) + changeExit(PrevNode, Exit, EntryDominatesExit); + else + assert(EntryDominatesExit); } /// Handle a rare case where the disintegrated nodes instructions @@ -696,14 +868,21 @@ bool AMDGPUStructurizeCFG::runOnRegion(Region *R, RGPassManager &RGM) { orderNodes(); collectInfos(); createFlow(); - insertConditions(); + insertConditions(false); + insertConditions(true); + setPhiValues(); rebuildSSA(); + // Cleanup Order.clear(); Visited.clear(); - Predicates.clear(); DeletedPhis.clear(); - FlowsInserted.clear(); + AddedPhis.clear(); + Predicates.clear(); + Conditions.clear(); + Loops.clear(); + LoopPreds.clear(); + LoopConds.clear(); return true; } diff --git a/lib/Target/R600/AMDGPUSubtarget.h b/lib/Target/R600/AMDGPUSubtarget.h index cab7884..1973fc6 100644 --- a/lib/Target/R600/AMDGPUSubtarget.h +++ b/lib/Target/R600/AMDGPUSubtarget.h @@ -44,7 +44,7 @@ public: virtual ~AMDGPUSubtarget(); const InstrItineraryData &getInstrItineraryData() const { return InstrItins; } - virtual void ParseSubtargetFeatures(llvm::StringRef CPU, llvm::StringRef FS); + virtual void ParseSubtargetFeatures(StringRef CPU, StringRef FS); bool isOverride(AMDGPUDeviceInfo::Caps) const; bool is64bit() const; diff --git a/lib/Target/R600/AMDGPUTargetMachine.cpp b/lib/Target/R600/AMDGPUTargetMachine.cpp index d09dc2e..e2f00be 100644 --- a/lib/Target/R600/AMDGPUTargetMachine.cpp +++ b/lib/Target/R600/AMDGPUTargetMachine.cpp @@ -102,6 +102,12 @@ AMDGPUPassConfig::addPreISel() { bool AMDGPUPassConfig::addInstSelector() { addPass(createAMDGPUPeepholeOpt(*TM)); addPass(createAMDGPUISelDag(getAMDGPUTargetMachine())); + + const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(); + if (ST.device()->getGeneration() <= AMDGPUDeviceInfo::HD6XXX) { + // The callbacks this pass uses are not implemented yet on SI.
+ addPass(createAMDGPUIndirectAddressingPass(*TM)); + } return false; } @@ -116,6 +122,11 @@ bool AMDGPUPassConfig::addPreRegAlloc() { } bool AMDGPUPassConfig::addPostRegAlloc() { + const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(); + + if (ST.device()->getGeneration() > AMDGPUDeviceInfo::HD6XXX) { + addPass(createSIInsertWaits(*TM)); + } return false; } @@ -132,8 +143,8 @@ bool AMDGPUPassConfig::addPreEmitPass() { addPass(createAMDGPUCFGStructurizerPass(*TM)); addPass(createR600ExpandSpecialInstrsPass(*TM)); addPass(&FinalizeMachineBundlesID); + addPass(createR600LowerConstCopy(*TM)); } else { - addPass(createSILowerLiteralConstantsPass(*TM)); addPass(createSILowerControlFlowPass(*TM)); } diff --git a/lib/Target/R600/AMDGPUTargetMachine.h b/lib/Target/R600/AMDGPUTargetMachine.h index 91f9a83..2afe787 100644 --- a/lib/Target/R600/AMDGPUTargetMachine.h +++ b/lib/Target/R600/AMDGPUTargetMachine.h @@ -15,9 +15,9 @@ #ifndef AMDGPU_TARGET_MACHINE_H #define AMDGPU_TARGET_MACHINE_H +#include "AMDGPUFrameLowering.h" #include "AMDGPUInstrInfo.h" #include "AMDGPUSubtarget.h" -#include "AMDILFrameLowering.h" #include "AMDILIntrinsicInfo.h" #include "R600ISelLowering.h" #include "llvm/ADT/OwningPtr.h" diff --git a/lib/Target/R600/AMDIL.h b/lib/Target/R600/AMDIL.h index 4e577dc..b39fbdb 100644 --- a/lib/Target/R600/AMDIL.h +++ b/lib/Target/R600/AMDIL.h @@ -90,14 +90,30 @@ namespace AMDGPUAS { enum AddressSpaces { PRIVATE_ADDRESS = 0, ///< Address space for private memory. GLOBAL_ADDRESS = 1, ///< Address space for global memory (RAT0, VTX0). - CONSTANT_ADDRESS = 2, ///< Address space for constant memory. + CONSTANT_ADDRESS = 2, ///< Address space for constant memory LOCAL_ADDRESS = 3, ///< Address space for local memory. REGION_ADDRESS = 4, ///< Address space for region memory. ADDRESS_NONE = 5, ///< Address space for unknown memory. PARAM_D_ADDRESS = 6, ///< Address space for direct addressible parameter memory (CONST0) PARAM_I_ADDRESS = 7, ///< Address space for indirect addressible parameter memory (VTX1) USER_SGPR_ADDRESS = 8, ///< Address space for USER_SGPRS on SI - LAST_ADDRESS = 9 + CONSTANT_BUFFER_0 = 9, + CONSTANT_BUFFER_1 = 10, + CONSTANT_BUFFER_2 = 11, + CONSTANT_BUFFER_3 = 12, + CONSTANT_BUFFER_4 = 13, + CONSTANT_BUFFER_5 = 14, + CONSTANT_BUFFER_6 = 15, + CONSTANT_BUFFER_7 = 16, + CONSTANT_BUFFER_8 = 17, + CONSTANT_BUFFER_9 = 18, + CONSTANT_BUFFER_10 = 19, + CONSTANT_BUFFER_11 = 20, + CONSTANT_BUFFER_12 = 21, + CONSTANT_BUFFER_13 = 22, + CONSTANT_BUFFER_14 = 23, + CONSTANT_BUFFER_15 = 24, + LAST_ADDRESS = 25 }; } // namespace AMDGPUAS diff --git a/lib/Target/R600/AMDILDevice.h b/lib/Target/R600/AMDILDevice.h index b9a1560..97df98c 100644 --- a/lib/Target/R600/AMDILDevice.h +++ b/lib/Target/R600/AMDILDevice.h @@ -104,7 +104,7 @@ public: static const unsigned int QuarterWavefrontSize = 16; protected: virtual void setCaps(); - llvm::BitVector mHWBits; + BitVector mHWBits; llvm::BitVector mSWBits; AMDGPUSubtarget *mSTM; uint32_t DeviceFlag; diff --git a/lib/Target/R600/AMDILFrameLowering.cpp b/lib/Target/R600/AMDILFrameLowering.cpp deleted file mode 100644 index 9ad495a..0000000 --- a/lib/Target/R600/AMDILFrameLowering.cpp +++ /dev/null @@ -1,47 +0,0 @@ -//===----------------------- AMDILFrameLowering.cpp -----------------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//==-----------------------------------------------------------------------===// -// -/// \file -/// \brief Interface to describe a layout of a stack frame on a AMDGPU target -/// machine. -// -//===----------------------------------------------------------------------===// -#include "AMDILFrameLowering.h" -#include "llvm/CodeGen/MachineFrameInfo.h" - -using namespace llvm; -AMDGPUFrameLowering::AMDGPUFrameLowering(StackDirection D, unsigned StackAl, - int LAO, unsigned TransAl) - : TargetFrameLowering(D, StackAl, LAO, TransAl) { -} - -AMDGPUFrameLowering::~AMDGPUFrameLowering() { -} - -int AMDGPUFrameLowering::getFrameIndexOffset(const MachineFunction &MF, - int FI) const { - const MachineFrameInfo *MFI = MF.getFrameInfo(); - return MFI->getObjectOffset(FI); -} - -const TargetFrameLowering::SpillSlot * -AMDGPUFrameLowering::getCalleeSavedSpillSlots(unsigned &NumEntries) const { - NumEntries = 0; - return 0; -} -void -AMDGPUFrameLowering::emitPrologue(MachineFunction &MF) const { -} -void -AMDGPUFrameLowering::emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const { -} -bool -AMDGPUFrameLowering::hasFP(const MachineFunction &MF) const { - return false; -} diff --git a/lib/Target/R600/AMDILISelDAGToDAG.cpp b/lib/Target/R600/AMDILISelDAGToDAG.cpp index d15ed39..e77b9dc 100644 --- a/lib/Target/R600/AMDILISelDAGToDAG.cpp +++ b/lib/Target/R600/AMDILISelDAGToDAG.cpp @@ -16,10 +16,12 @@ #include "AMDGPURegisterInfo.h" #include "AMDILDevices.h" #include "R600InstrInfo.h" +#include "SIISelLowering.h" #include "llvm/ADT/ValueMap.h" #include "llvm/CodeGen/PseudoSourceValue.h" #include "llvm/CodeGen/SelectionDAGISel.h" #include "llvm/Support/Compiler.h" +#include "llvm/CodeGen/SelectionDAG.h" #include <list> #include <queue> @@ -42,9 +44,11 @@ public: SDNode *Select(SDNode *N); virtual const char *getPassName() const; + virtual void PostprocessISelDAG(); private: inline SDValue getSmallIPtrImm(unsigned Imm); + bool FoldOperands(unsigned, const R600InstrInfo *, std::vector<SDValue> &); // Complex pattern selectors bool SelectADDRParam(SDValue Addr, SDValue& R1, SDValue& R2); @@ -67,9 +71,11 @@ private: static bool isLocalLoad(const LoadSDNode *N); static bool isRegionLoad(const LoadSDNode *N); - bool SelectADDR8BitOffset(SDValue Addr, SDValue& Base, SDValue& Offset); - bool SelectADDRReg(SDValue Addr, SDValue& Base, SDValue& Offset); + bool SelectGlobalValueConstantOffset(SDValue Addr, SDValue& IntPtr); + bool SelectGlobalValueVariableOffset(SDValue Addr, + SDValue &BaseReg, SDValue& Offset); bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset); + bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset); // Include the pieces autogenerated from the target description. 
#include "AMDGPUGenDAGISel.inc" @@ -156,16 +162,6 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { } switch (Opc) { default: break; - case ISD::FrameIndex: { - if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(N)) { - unsigned int FI = FIN->getIndex(); - EVT OpVT = N->getValueType(0); - unsigned int NewOpc = AMDGPU::COPY; - SDValue TFI = CurDAG->getTargetFrameIndex(FI, MVT::i32); - return CurDAG->SelectNodeTo(N, NewOpc, OpVT, TFI); - } - break; - } case ISD::ConstantFP: case ISD::Constant: { const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>(); @@ -224,7 +220,9 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { continue; } } else { - if (!TII->isALUInstr(Use->getMachineOpcode())) { + if (!TII->isALUInstr(Use->getMachineOpcode()) || + (TII->get(Use->getMachineOpcode()).TSFlags & + R600_InstFlag::VECTOR)) { continue; } @@ -259,7 +257,116 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { break; } } - return SelectCode(N); + SDNode *Result = SelectCode(N); + + // Fold operands of selected node + + const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>(); + if (ST.device()->getGeneration() <= AMDGPUDeviceInfo::HD6XXX) { + const R600InstrInfo *TII = + static_cast<const R600InstrInfo*>(TM.getInstrInfo()); + if (Result && Result->isMachineOpcode() && + !(TII->get(Result->getMachineOpcode()).TSFlags & R600_InstFlag::VECTOR) + && TII->isALUInstr(Result->getMachineOpcode())) { + // Fold FNEG/FABS/CONST_ADDRESS + // TODO: Isel can generate multiple MachineInst, we need to recursively + // parse Result + bool IsModified = false; + do { + std::vector<SDValue> Ops; + for(SDNode::op_iterator I = Result->op_begin(), E = Result->op_end(); + I != E; ++I) + Ops.push_back(*I); + IsModified = FoldOperands(Result->getMachineOpcode(), TII, Ops); + if (IsModified) { + Result = CurDAG->UpdateNodeOperands(Result, Ops.data(), Ops.size()); + } + } while (IsModified); + + // If node has a single use which is CLAMP_R600, folds it + if (Result->hasOneUse() && Result->isMachineOpcode()) { + SDNode *PotentialClamp = *Result->use_begin(); + if (PotentialClamp->isMachineOpcode() && + PotentialClamp->getMachineOpcode() == AMDGPU::CLAMP_R600) { + unsigned ClampIdx = + TII->getOperandIdx(Result->getMachineOpcode(), R600Operands::CLAMP); + std::vector<SDValue> Ops; + unsigned NumOp = Result->getNumOperands(); + for (unsigned i = 0; i < NumOp; ++i) { + Ops.push_back(Result->getOperand(i)); + } + Ops[ClampIdx - 1] = CurDAG->getTargetConstant(1, MVT::i32); + Result = CurDAG->SelectNodeTo(PotentialClamp, + Result->getMachineOpcode(), PotentialClamp->getVTList(), + Ops.data(), NumOp); + } + } + } + } + + return Result; +} + +bool AMDGPUDAGToDAGISel::FoldOperands(unsigned Opcode, + const R600InstrInfo *TII, std::vector<SDValue> &Ops) { + int OperandIdx[] = { + TII->getOperandIdx(Opcode, R600Operands::SRC0), + TII->getOperandIdx(Opcode, R600Operands::SRC1), + TII->getOperandIdx(Opcode, R600Operands::SRC2) + }; + int SelIdx[] = { + TII->getOperandIdx(Opcode, R600Operands::SRC0_SEL), + TII->getOperandIdx(Opcode, R600Operands::SRC1_SEL), + TII->getOperandIdx(Opcode, R600Operands::SRC2_SEL) + }; + int NegIdx[] = { + TII->getOperandIdx(Opcode, R600Operands::SRC0_NEG), + TII->getOperandIdx(Opcode, R600Operands::SRC1_NEG), + TII->getOperandIdx(Opcode, R600Operands::SRC2_NEG) + }; + int AbsIdx[] = { + TII->getOperandIdx(Opcode, R600Operands::SRC0_ABS), + TII->getOperandIdx(Opcode, R600Operands::SRC1_ABS), + -1 + }; + + for (unsigned i = 0; i < 3; i++) { + if (OperandIdx[i] < 0) + return false; + SDValue Operand = 
Ops[OperandIdx[i] - 1]; + switch (Operand.getOpcode()) { + case AMDGPUISD::CONST_ADDRESS: { + if (i == 2) + break; + SDValue CstOffset; + if (!Operand.getValueType().isVector() && + SelectGlobalValueConstantOffset(Operand.getOperand(0), CstOffset)) { + Ops[OperandIdx[i] - 1] = CurDAG->getRegister(AMDGPU::ALU_CONST, MVT::f32); + Ops[SelIdx[i] - 1] = CstOffset; + return true; + } + } + break; + case ISD::FNEG: + if (NegIdx[i] < 0) + break; + Ops[OperandIdx[i] - 1] = Operand.getOperand(0); + Ops[NegIdx[i] - 1] = CurDAG->getTargetConstant(1, MVT::i32); + return true; + case ISD::FABS: + if (AbsIdx[i] < 0) + break; + Ops[OperandIdx[i] - 1] = Operand.getOperand(0); + Ops[AbsIdx[i] - 1] = CurDAG->getTargetConstant(1, MVT::i32); + return true; + case ISD::BITCAST: + Ops[OperandIdx[i] - 1] = Operand.getOperand(0); + return true; + default: + break; + } + } + return false; } bool AMDGPUDAGToDAGISel::checkType(const Value *ptr, unsigned int addrspace) { @@ -406,41 +513,23 @@ const char *AMDGPUDAGToDAGISel::getPassName() const { ///==== AMDGPU Functions ====/// -bool AMDGPUDAGToDAGISel::SelectADDR8BitOffset(SDValue Addr, SDValue& Base, - SDValue& Offset) { - if (Addr.getOpcode() == ISD::TargetExternalSymbol || - Addr.getOpcode() == ISD::TargetGlobalAddress) { - return false; +bool AMDGPUDAGToDAGISel::SelectGlobalValueConstantOffset(SDValue Addr, + SDValue& IntPtr) { + if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Addr)) { + IntPtr = CurDAG->getIntPtrConstant(Cst->getZExtValue() / 4, true); + return true; } + return false; +} - - if (Addr.getOpcode() == ISD::ADD) { - bool Match = false; - - // Find the base ptr and the offset - for (unsigned i = 0; i < Addr.getNumOperands(); i++) { - SDValue Arg = Addr.getOperand(i); - ConstantSDNode * OffsetNode = dyn_cast<ConstantSDNode>(Arg); - // This arg isn't a constant so it must be the base PTR. - if (!OffsetNode) { - Base = Addr.getOperand(i); - continue; - } - // Check if the constant argument fits in 8-bits. The offset is in bytes - // so we need to convert it to dwords. 
- if (isUInt<8>(OffsetNode->getZExtValue() >> 2)) { - Match = true; - Offset = CurDAG->getTargetConstant(OffsetNode->getZExtValue() >> 2, - MVT::i32); - } - } - return Match; +bool AMDGPUDAGToDAGISel::SelectGlobalValueVariableOffset(SDValue Addr, + SDValue& BaseReg, SDValue &Offset) { + if (!dyn_cast<ConstantSDNode>(Addr)) { + BaseReg = Addr; + Offset = CurDAG->getIntPtrConstant(0, true); + return true; } - - // Default case, no offset - Base = Addr; - Offset = CurDAG->getTargetConstant(0, MVT::i32); - return true; + return false; } bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base, @@ -470,16 +559,39 @@ bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base, return true; } -bool AMDGPUDAGToDAGISel::SelectADDRReg(SDValue Addr, SDValue& Base, - SDValue& Offset) { - if (Addr.getOpcode() == ISD::TargetExternalSymbol || - Addr.getOpcode() == ISD::TargetGlobalAddress || - Addr.getOpcode() != ISD::ADD) { - return false; +bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base, + SDValue &Offset) { + ConstantSDNode *C; + + if ((C = dyn_cast<ConstantSDNode>(Addr))) { + Base = CurDAG->getRegister(AMDGPU::INDIRECT_BASE_ADDR, MVT::i32); + Offset = CurDAG->getTargetConstant(C->getZExtValue(), MVT::i32); + } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) && + (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) { + Base = Addr.getOperand(0); + Offset = CurDAG->getTargetConstant(C->getZExtValue(), MVT::i32); + } else { + Base = Addr; + Offset = CurDAG->getTargetConstant(0, MVT::i32); } - Base = Addr.getOperand(0); - Offset = Addr.getOperand(1); - return true; } + +void AMDGPUDAGToDAGISel::PostprocessISelDAG() { + + // Go over all selected nodes and try to fold them a bit more + const AMDGPUTargetLowering& Lowering = ((const AMDGPUTargetLowering&)TLI); + for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(), + E = CurDAG->allnodes_end(); I != E; ++I) { + + MachineSDNode *Node = dyn_cast<MachineSDNode>(I); + if (!Node) + continue; + + SDNode *ResNode = Lowering.PostISelFolding(Node, *CurDAG); + if (ResNode != Node) + ReplaceUses(Node, ResNode); + } +} + diff --git a/lib/Target/R600/AMDILISelLowering.cpp b/lib/Target/R600/AMDILISelLowering.cpp index 2e60adc..f65e1f3 100644 --- a/lib/Target/R600/AMDILISelLowering.cpp +++ b/lib/Target/R600/AMDILISelLowering.cpp @@ -220,9 +220,9 @@ void AMDGPUTargetLowering::InitAMDILLowering() { setSelectIsExpensive(true); setJumpIsExpensive(true); - maxStoresPerMemcpy = 4096; - maxStoresPerMemmove = 4096; - maxStoresPerMemset = 4096; + MaxStoresPerMemcpy = 4096; + MaxStoresPerMemmove = 4096; + MaxStoresPerMemset = 4096; } @@ -451,7 +451,8 @@ AMDGPUTargetLowering::LowerSDIV24(SDValue Op, SelectionDAG &DAG) const { SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FLTTY, fq); // float fr = mad(fqneg, fb, fa); - SDValue fr = DAG.getNode(AMDGPUISD::MAD, DL, FLTTY, fqneg, fb, fa); + SDValue fr = DAG.getNode(ISD::FADD, DL, FLTTY, + DAG.getNode(ISD::MUL, DL, FLTTY, fqneg, fb), fa); // int iq = (int)fq; SDValue iq = DAG.getNode(ISD::FP_TO_SINT, DL, INTTY, fq); diff --git a/lib/Target/R600/AMDILInstrInfo.td b/lib/Target/R600/AMDILInstrInfo.td index e969bbf..110f147 100644 --- a/lib/Target/R600/AMDILInstrInfo.td +++ b/lib/Target/R600/AMDILInstrInfo.td @@ -116,7 +116,6 @@ def IL_retflag : SDNode<"AMDGPUISD::RET_FLAG", SDTNone, //===--------------------------------------------------------------------===// // Floating point math functions def IL_div_inf : SDNode<"AMDGPUISD::DIV_INF", SDTIL_GenBinaryOp>; 
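Note on the FoldOperands() hunk in AMDILISelDAGToDAG.cpp above: selected R600 ALU nodes carry explicit modifier operands, and the fold consumes wrapper nodes by flipping those operands instead of emitting extra instructions. A sketch of the FNEG case (the FABS and CONST_ADDRESS cases follow the same shape):

  // Before the fold, conceptually:  MUL dst, (fneg x), y   ... src0_neg = 0
  // After the fold:                 MUL dst, x,        y   ... src0_neg = 1
  Ops[OperandIdx[i] - 1] = Operand.getOperand(0);              // use x itself
  Ops[NegIdx[i] - 1] = CurDAG->getTargetConstant(1, MVT::i32); // set neg bit

The "- 1" when indexing Ops compensates for getOperandIdx() counting the machine instruction's result operand, which an SDNode's operand list does not contain. Returning true after a single change makes the caller re-run the fold until the operand list reaches a fixed point.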
-def IL_mad : SDNode<"AMDGPUISD::MAD", SDTIL_GenTernaryOp>; //===----------------------------------------------------------------------===// // Integer functions diff --git a/lib/Target/R600/AMDILIntrinsics.td b/lib/Target/R600/AMDILIntrinsics.td index 3f9e20f..6ec3559 100644 --- a/lib/Target/R600/AMDILIntrinsics.td +++ b/lib/Target/R600/AMDILIntrinsics.td @@ -92,12 +92,6 @@ let TargetPrefix = "AMDIL", isTarget = 1 in { TernaryIntInt; def int_AMDIL_bfm : GCCBuiltin<"__amdil_bfm">, BinaryIntInt; - def int_AMDIL_mad_i32 : GCCBuiltin<"__amdil_imad">, - TernaryIntInt; - def int_AMDIL_mad_u32 : GCCBuiltin<"__amdil_umad">, - TernaryIntInt; - def int_AMDIL_mad : GCCBuiltin<"__amdil_mad">, - TernaryIntFloat; def int_AMDIL_mulhi_i32 : GCCBuiltin<"__amdil_imul_high">, BinaryIntInt; def int_AMDIL_mulhi_u32 : GCCBuiltin<"__amdil_umul_high">, @@ -110,10 +104,6 @@ let TargetPrefix = "AMDIL", isTarget = 1 in { BinaryIntInt; def int_AMDIL_mulhi24_u32 : GCCBuiltin<"__amdil_umul24_high">, BinaryIntInt; - def int_AMDIL_mad24_i32 : GCCBuiltin<"__amdil_imad24">, - TernaryIntInt; - def int_AMDIL_mad24_u32 : GCCBuiltin<"__amdil_umad24">, - TernaryIntInt; def int_AMDIL_carry_i32 : GCCBuiltin<"__amdil_carry">, BinaryIntInt; def int_AMDIL_borrow_i32 : GCCBuiltin<"__amdil_borrow">, diff --git a/lib/Target/R600/AMDILPeepholeOptimizer.cpp b/lib/Target/R600/AMDILPeepholeOptimizer.cpp index a3d30af..3a28038 100644 --- a/lib/Target/R600/AMDILPeepholeOptimizer.cpp +++ b/lib/Target/R600/AMDILPeepholeOptimizer.cpp @@ -366,7 +366,7 @@ AMDGPUPeepholeOpt::optimizeCallInst(BasicBlock::iterator *bbb) { std::string buffer(F->getName().str() + "_noret"); F = dyn_cast<Function>( F->getParent()->getOrInsertFunction(buffer, F->getFunctionType())); - atomicFuncs.push_back(std::make_pair <CallInst*, Function*>(CI, F)); + atomicFuncs.push_back(std::make_pair(CI, F)); } if (!mSTM->device()->isSupported(AMDGPUDeviceInfo::ArenaSegment) @@ -613,7 +613,7 @@ AMDGPUPeepholeOpt::optimizeBitInsert(Instruction *inst) { if (isVector) { name += "_v" + itostr(numEle) + "u32"; } else { name += "_u32"; } Function *Func = dyn_cast<Function>(inst->getParent()->getParent()->getParent()-> - getOrInsertFunction(llvm::StringRef(name), funcType)); + getOrInsertFunction(StringRef(name), funcType)); Value *Operands[4] = { width, offset, @@ -777,7 +777,7 @@ AMDGPUPeepholeOpt::optimizeBitExtract(Instruction *inst) { // Lets create the function. 
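Note: with AMDGPUISD::MAD and the __amdil_mad family of intrinsics removed in the hunks above, LowerSDIV24 (AMDILISelLowering.cpp hunk above) now builds the multiply-add from generic nodes as FADD(FMUL(fqneg, fb), fa). The identity relied on, assuming the unfused multiply-add the removed node denoted:

  // mad(a, b, c) == a * b + c, computed here as two generic float nodes.
  static float mad(float A, float B, float C) { return A * B + C; }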
Function *Func = dyn_cast<Function>(inst->getParent()->getParent()->getParent()-> - getOrInsertFunction(llvm::StringRef(name), funcType)); + getOrInsertFunction(StringRef(name), funcType)); Value *Operands[3] = { ShiftInst->getOperand(0), shiftValConst, @@ -967,7 +967,7 @@ AMDGPUPeepholeOpt::expandSigned24BitOps(CallInst *CI) { } Function *Func = dyn_cast<Function>( CI->getParent()->getParent()->getParent()-> - getOrInsertFunction(llvm::StringRef(name), funcType)); + getOrInsertFunction(StringRef(name), funcType)); Value *Operands[3] = { CI->getOperand(0), CI->getOperand(1), @@ -999,7 +999,7 @@ AMDGPUPeepholeOpt::expandSigned24BitOps(CallInst *CI) { } Function *Func = dyn_cast<Function>( CI->getParent()->getParent()->getParent()-> - getOrInsertFunction(llvm::StringRef(name), funcType)); + getOrInsertFunction(StringRef(name), funcType)); Value *Operands[2] = { CI->getOperand(0), CI->getOperand(1) diff --git a/lib/Target/R600/CMakeLists.txt b/lib/Target/R600/CMakeLists.txt index ce0b56b..00f8b10 100644 --- a/lib/Target/R600/CMakeLists.txt +++ b/lib/Target/R600/CMakeLists.txt @@ -17,7 +17,6 @@ add_llvm_target(R600CodeGen AMDILDevice.cpp AMDILDeviceInfo.cpp AMDILEvergreenDevice.cpp - AMDILFrameLowering.cpp AMDILIntrinsicInfo.cpp AMDILISelDAGToDAG.cpp AMDILISelLowering.cpp @@ -25,6 +24,8 @@ add_llvm_target(R600CodeGen AMDILPeepholeOptimizer.cpp AMDILSIDevice.cpp AMDGPUAsmPrinter.cpp + AMDGPUFrameLowering.cpp + AMDGPUIndirectAddressing.cpp AMDGPUMCInstLower.cpp AMDGPUSubtarget.cpp AMDGPUStructurizeCFG.cpp @@ -36,13 +37,14 @@ add_llvm_target(R600CodeGen R600ExpandSpecialInstrs.cpp R600InstrInfo.cpp R600ISelLowering.cpp + R600LowerConstCopy.cpp R600MachineFunctionInfo.cpp R600RegisterInfo.cpp SIAnnotateControlFlow.cpp SIAssignInterpRegs.cpp + SIInsertWaits.cpp SIInstrInfo.cpp SIISelLowering.cpp - SILowerLiteralConstants.cpp SILowerControlFlow.cpp SIMachineFunctionInfo.cpp SIRegisterInfo.cpp diff --git a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp index e6c550b..10547a5 100644 --- a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp +++ b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp @@ -11,6 +11,7 @@ #include "AMDGPUInstPrinter.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/MC/MCInst.h" +#include "llvm/MC/MCExpr.h" using namespace llvm; @@ -35,11 +36,29 @@ void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, O << Op.getImm(); } else if (Op.isFPImm()) { O << Op.getFPImm(); + } else if (Op.isExpr()) { + const MCExpr *Exp = Op.getExpr(); + Exp->print(O); } else { assert(!"unknown operand type in printOperand"); } } +void AMDGPUInstPrinter::printInterpSlot(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + unsigned Imm = MI->getOperand(OpNum).getImm(); + + if (Imm == 2) { + O << "P0"; + } else if (Imm == 1) { + O << "P20"; + } else if (Imm == 0) { + O << "P10"; + } else { + assert(!"Invalid interpolation parameter slot"); + } +} + void AMDGPUInstPrinter::printMemOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) { printOperand(MI, OpNo, O); @@ -105,10 +124,7 @@ void AMDGPUInstPrinter::printOMOD(const MCInst *MI, unsigned OpNo, void AMDGPUInstPrinter::printRel(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - const MCOperand &Op = MI->getOperand(OpNo); - if (Op.getImm() != 0) { - O << " + " << Op.getImm(); - } + printIfSet(MI, OpNo, O, "+"); } void AMDGPUInstPrinter::printUpdateExecMask(const MCInst *MI, unsigned OpNo, @@ -129,4 +145,28 @@ void AMDGPUInstPrinter::printWrite(const MCInst *MI, 
unsigned OpNo, } } +void AMDGPUInstPrinter::printSel(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + const char * chans = "XYZW"; + int sel = MI->getOperand(OpNo).getImm(); + + int chan = sel & 3; + sel >>= 2; + + if (sel >= 512) { + sel -= 512; + int cb = sel >> 12; + sel &= 4095; + O << cb << "[" << sel << "]"; + } else if (sel >= 448) { + sel -= 448; + O << sel; + } else if (sel >= 0){ + O << sel; + } + + if (sel >= 0) + O << "." << chans[chan]; +} + #include "AMDGPUGenAsmWriter.inc" diff --git a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h index 96e0e46..767a708 100644 --- a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h +++ b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h @@ -33,6 +33,7 @@ public: private: void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printInterpSlot(const MCInst *MI, unsigned OpNum, raw_ostream &O); void printMemOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printIfSet(const MCInst *MI, unsigned OpNo, raw_ostream &O, StringRef Asm); void printAbs(const MCInst *MI, unsigned OpNo, raw_ostream &O); @@ -45,6 +46,7 @@ private: void printUpdateExecMask(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printUpdatePred(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printWrite(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printSel(const MCInst *MI, unsigned OpNo, raw_ostream &O); }; } // End namespace llvm diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h b/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h index 9d0d6cf..8721f80 100644 --- a/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h +++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h @@ -42,17 +42,6 @@ public: SmallVectorImpl<MCFixup> &Fixups) const { return 0; } - virtual uint64_t VOPPostEncode(const MCInst &MI, uint64_t Value) const { - return Value; - } - virtual uint64_t i32LiteralEncode(const MCInst &MI, unsigned OpNo, - SmallVectorImpl<MCFixup> &Fixups) const { - return 0; - } - virtual uint32_t SMRDmemriEncode(const MCInst &MI, unsigned OpNo, - SmallVectorImpl<MCFixup> &Fixups) const { - return 0; - } }; } // End namespace llvm diff --git a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp index 36deae9..d207160 100644 --- a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp +++ b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp @@ -38,8 +38,8 @@ using namespace llvm; namespace { class R600MCCodeEmitter : public AMDGPUMCCodeEmitter { - R600MCCodeEmitter(const R600MCCodeEmitter &); // DO NOT IMPLEMENT - void operator=(const R600MCCodeEmitter &); // DO NOT IMPLEMENT + R600MCCodeEmitter(const R600MCCodeEmitter &) LLVM_DELETED_FUNCTION; + void operator=(const R600MCCodeEmitter &) LLVM_DELETED_FUNCTION; const MCInstrInfo &MCII; const MCRegisterInfo &MRI; const MCSubtargetInfo &STI; @@ -63,8 +63,8 @@ private: void EmitALUInstr(const MCInst &MI, SmallVectorImpl<MCFixup> &Fixups, raw_ostream &OS) const; void EmitSrc(const MCInst &MI, unsigned OpIdx, raw_ostream &OS) const; - void EmitSrcISA(const MCInst &MI, unsigned OpIdx, uint64_t &Value, - raw_ostream &OS) const; + void EmitSrcISA(const MCInst &MI, unsigned RegOpIdx, unsigned SelOpIdx, + raw_ostream &OS) const; void EmitDst(const MCInst &MI, raw_ostream &OS) const; void EmitTexInstr(const MCInst &MI, SmallVectorImpl<MCFixup> &Fixups, raw_ostream &OS) const; @@ -161,9 +161,12 @@ void R600MCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS, case 
AMDGPU::VTX_READ_PARAM_8_eg: case AMDGPU::VTX_READ_PARAM_16_eg: case AMDGPU::VTX_READ_PARAM_32_eg: + case AMDGPU::VTX_READ_PARAM_128_eg: case AMDGPU::VTX_READ_GLOBAL_8_eg: case AMDGPU::VTX_READ_GLOBAL_32_eg: - case AMDGPU::VTX_READ_GLOBAL_128_eg: { + case AMDGPU::VTX_READ_GLOBAL_128_eg: + case AMDGPU::TEX_VTX_CONSTBUF: + case AMDGPU::TEX_VTX_TEXBUF : { uint64_t InstWord01 = getBinaryCodeForInstr(MI, Fixups); uint32_t InstWord2 = MI.getOperand(2).getImm(); // Offset @@ -193,7 +196,6 @@ void R600MCCodeEmitter::EmitALUInstr(const MCInst &MI, SmallVectorImpl<MCFixup> &Fixups, raw_ostream &OS) const { const MCInstrDesc &MCDesc = MCII.get(MI.getOpcode()); - unsigned NumOperands = MI.getNumOperands(); // Emit instruction type EmitByte(INSTR_ALU, OS); @@ -209,19 +211,21 @@ void R600MCCodeEmitter::EmitALUInstr(const MCInst &MI, InstWord01 |= ISAOpCode << 1; } - unsigned SrcIdx = 0; - for (unsigned int OpIdx = 1; OpIdx < NumOperands; ++OpIdx) { - if (MI.getOperand(OpIdx).isImm() || MI.getOperand(OpIdx).isFPImm() || - OpIdx == (unsigned)MCDesc.findFirstPredOperandIdx()) { - continue; - } - EmitSrcISA(MI, OpIdx, InstWord01, OS); - SrcIdx++; - } + unsigned SrcNum = MCDesc.TSFlags & R600_InstFlag::OP3 ? 3 : + MCDesc.TSFlags & R600_InstFlag::OP2 ? 2 : 1; - // Emit zeros for unused sources - for ( ; SrcIdx < 3; SrcIdx++) { - EmitNullBytes(SRC_BYTE_COUNT - 6, OS); + EmitByte(SrcNum, OS); + + const unsigned SrcOps[3][2] = { + {R600Operands::SRC0, R600Operands::SRC0_SEL}, + {R600Operands::SRC1, R600Operands::SRC1_SEL}, + {R600Operands::SRC2, R600Operands::SRC2_SEL} + }; + + for (unsigned SrcIdx = 0; SrcIdx < SrcNum; ++SrcIdx) { + unsigned RegOpIdx = R600Operands::ALUOpTable[SrcNum-1][SrcOps[SrcIdx][0]]; + unsigned SelOpIdx = R600Operands::ALUOpTable[SrcNum-1][SrcOps[SrcIdx][1]]; + EmitSrcISA(MI, RegOpIdx, SelOpIdx, OS); } Emit(InstWord01, OS); @@ -292,34 +296,37 @@ void R600MCCodeEmitter::EmitSrc(const MCInst &MI, unsigned OpIdx, } -void R600MCCodeEmitter::EmitSrcISA(const MCInst &MI, unsigned OpIdx, - uint64_t &Value, raw_ostream &OS) const { - const MCOperand &MO = MI.getOperand(OpIdx); +void R600MCCodeEmitter::EmitSrcISA(const MCInst &MI, unsigned RegOpIdx, + unsigned SelOpIdx, raw_ostream &OS) const { + const MCOperand &RegMO = MI.getOperand(RegOpIdx); + const MCOperand &SelMO = MI.getOperand(SelOpIdx); + union { float f; uint32_t i; } InlineConstant; InlineConstant.i = 0; - // Emit the source select (2 bytes). For GPRs, this is the register index. - // For other potential instruction operands, (e.g. constant registers) the - // value of the source select is defined in the r600isa docs. - if (MO.isReg()) { - unsigned Reg = MO.getReg(); - if (AMDGPUMCRegisterClasses[AMDGPU::R600_CReg32RegClassID].contains(Reg)) { - EmitByte(1, OS); - } else { - EmitByte(0, OS); - } + // Emit source type (1 byte) and source select (4 bytes). For GPRs type is 0 + // and select is 0 (GPR index is encoded in the instr encoding). For constants + // type is 1 and select is the original const select passed from the driver.
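Note on the EmitALUInstr()/EmitSrcISA() rework above: source operands are now located through R600Operands::ALUOpTable (added in the R600Defines.h hunk further down), where ALUOpTable[SrcNum-1][SRCn] and ALUOpTable[SrcNum-1][SRCn_SEL] give the machine-operand indices of source n and its select for 1-, 2-, and 3-source opcodes. Each source is then written as one type byte plus a four-byte select word. A stand-alone sketch of that emission, assuming Emit() writes words little-endian like the byte loop in EncodeInstruction():

  #include <cstdint>
  #include <vector>

  // One ALU source: type byte (0 = GPR, 1 = constant), then a 4-byte select.
  // For GPRs the select word is 0; the register index travels inside the
  // main instruction words instead.
  static void emitSrcISA(std::vector<uint8_t> &OS, bool IsConst, uint32_t Sel) {
    OS.push_back(IsConst ? 1 : 0);
    uint32_t Word = IsConst ? Sel : 0;
    for (unsigned i = 0; i < 4; ++i)
      OS.push_back(uint8_t((Word >> (8 * i)) & 0xff));
  }

  // e.g. an ALU_CONST source with select 17 emits the bytes 01 11 00 00 00.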
+ unsigned Reg = RegMO.getReg(); + if (Reg == AMDGPU::ALU_CONST) { + EmitByte(1, OS); + uint32_t Sel = SelMO.getImm(); + Emit(Sel, OS); + } else { + EmitByte(0, OS); + Emit((uint32_t)0, OS); + } - if (Reg == AMDGPU::ALU_LITERAL_X) { - unsigned ImmOpIndex = MI.getNumOperands() - 1; - MCOperand ImmOp = MI.getOperand(ImmOpIndex); - if (ImmOp.isFPImm()) { - InlineConstant.f = ImmOp.getFPImm(); - } else { - assert(ImmOp.isImm()); - InlineConstant.i = ImmOp.getImm(); - } + if (Reg == AMDGPU::ALU_LITERAL_X) { + unsigned ImmOpIndex = MI.getNumOperands() - 1; + MCOperand ImmOp = MI.getOperand(ImmOpIndex); + if (ImmOp.isFPImm()) { + InlineConstant.f = ImmOp.getFPImm(); + } else { + assert(ImmOp.isImm()); + InlineConstant.i = ImmOp.getImm(); } } diff --git a/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp b/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp index b4bdb25..6cc0077 100644 --- a/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp +++ b/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp @@ -24,46 +24,33 @@ #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/Support/raw_ostream.h" -#define VGPR_BIT(src_idx) (1ULL << (9 * src_idx - 1)) -#define SI_INSTR_FLAGS_ENCODING_MASK 0xf - -// These must be kept in sync with SIInstructions.td and also the -// InstrEncodingInfo array in SIInstrInfo.cpp. -// -// NOTE: This enum is only used to identify the encoding type within LLVM, -// the actual encoding type that is part of the instruction format is different -namespace SIInstrEncodingType { - enum Encoding { - EXP = 0, - LDS = 1, - MIMG = 2, - MTBUF = 3, - MUBUF = 4, - SMRD = 5, - SOP1 = 6, - SOP2 = 7, - SOPC = 8, - SOPK = 9, - SOPP = 10, - VINTRP = 11, - VOP1 = 12, - VOP2 = 13, - VOP3 = 14, - VOPC = 15 - }; -} - using namespace llvm; namespace { + +/// \brief Helper type used in encoding +typedef union { + int32_t I; + float F; +} IntFloatUnion; + class SIMCCodeEmitter : public AMDGPUMCCodeEmitter { - SIMCCodeEmitter(const SIMCCodeEmitter &); // DO NOT IMPLEMENT - void operator=(const SIMCCodeEmitter &); // DO NOT IMPLEMENT + SIMCCodeEmitter(const SIMCCodeEmitter &) LLVM_DELETED_FUNCTION; + void operator=(const SIMCCodeEmitter &) LLVM_DELETED_FUNCTION; const MCInstrInfo &MCII; const MCRegisterInfo &MRI; const MCSubtargetInfo &STI; MCContext &Ctx; + /// \brief Encode a sequence of registers with the correct alignment. + unsigned GPRAlign(const MCInst &MI, unsigned OpNo, unsigned shift) const; + + /// \brief Can this operand also contain immediate values? + bool isSrcOperand(const MCInstrDesc &Desc, unsigned OpNo) const; + + /// \brief Encode an fp or int literal + uint32_t getLitEncoding(const MCOperand &MO) const; + public: SIMCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri, const MCSubtargetInfo &sti, MCContext &ctx) @@ -79,11 +66,6 @@ public: virtual uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO, SmallVectorImpl<MCFixup> &Fixups) const; -public: - - /// \brief Encode a sequence of registers with the correct alignment. 
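Note: getLitEncoding(), defined in the following hunk, is the core of the new SI literal handling. Every [SV]Src operand is first mapped into the 8-bit inline-operand space; only a value with no inline form gets the literal encoding 255, in which case EncodeInstruction() appends a single extra 32-bit dword after the instruction (and only one literal is allowed per instruction). A stand-alone restatement of the integer mapping, with the float rows noted alongside (values taken from the hunk):

  #include <cassert>
  #include <cstdint>
  #include <cstdlib>

  // SI inline operands:  0..64 -> 128+I,  -16..-1 -> 192+|I|,
  // +-0.5, +-1.0, +-2.0, +-4.0 -> 240..247,  anything else -> 255 (literal).
  static uint32_t inlineEncoding(int32_t I) {
    if (I >= 0 && I <= 64)
      return 128 + I;
    if (I >= -16 && I <= -1)
      return 192 + std::abs(I);
    return 255; // no inline form: a trailing literal dword is required
  }

  int main() {
    assert(inlineEncoding(0) == 128);
    assert(inlineEncoding(64) == 192);
    assert(inlineEncoding(-16) == 208);
    assert(inlineEncoding(100) == 255);
    return 0;
  }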
- unsigned GPRAlign(const MCInst &MI, unsigned OpNo, unsigned shift) const; - /// \brief Encoding for when 2 consecutive registers are used virtual unsigned GPR2AlignEncode(const MCInst &MI, unsigned OpNo, SmallVectorImpl<MCFixup> &Fixup) const; @@ -91,73 +73,142 @@ public: /// \brief Encoding for when 4 consectuive registers are used virtual unsigned GPR4AlignEncode(const MCInst &MI, unsigned OpNo, SmallVectorImpl<MCFixup> &Fixup) const; +}; - /// \brief Encoding for SMRD indexed loads - virtual uint32_t SMRDmemriEncode(const MCInst &MI, unsigned OpNo, - SmallVectorImpl<MCFixup> &Fixup) const; +} // End anonymous namespace + +MCCodeEmitter *llvm::createSIMCCodeEmitter(const MCInstrInfo &MCII, + const MCRegisterInfo &MRI, + const MCSubtargetInfo &STI, + MCContext &Ctx) { + return new SIMCCodeEmitter(MCII, MRI, STI, Ctx); +} - /// \brief Post-Encoder method for VOP instructions - virtual uint64_t VOPPostEncode(const MCInst &MI, uint64_t Value) const; +bool SIMCCodeEmitter::isSrcOperand(const MCInstrDesc &Desc, + unsigned OpNo) const { -private: + unsigned RegClass = Desc.OpInfo[OpNo].RegClass; + return (AMDGPU::SSrc_32RegClassID == RegClass) || + (AMDGPU::SSrc_64RegClassID == RegClass) || + (AMDGPU::VSrc_32RegClassID == RegClass) || + (AMDGPU::VSrc_64RegClassID == RegClass); +} - /// \returns this SIInstrEncodingType for this instruction. - unsigned getEncodingType(const MCInst &MI) const; +uint32_t SIMCCodeEmitter::getLitEncoding(const MCOperand &MO) const { - /// \brief Get then size in bytes of this instructions encoding. - unsigned getEncodingBytes(const MCInst &MI) const; + IntFloatUnion Imm; + if (MO.isImm()) + Imm.I = MO.getImm(); + else if (MO.isFPImm()) + Imm.F = MO.getFPImm(); + else + return ~0; - /// \returns the hardware encoding for a register - unsigned getRegBinaryCode(unsigned reg) const; + if (Imm.I >= 0 && Imm.I <= 64) + return 128 + Imm.I; - /// \brief Generated function that returns the hardware encoding for - /// a register - unsigned getHWRegNum(unsigned reg) const; + if (Imm.I >= -16 && Imm.I <= -1) + return 192 + abs(Imm.I); -}; + if (Imm.F == 0.5f) + return 240; -} // End anonymous namespace + if (Imm.F == -0.5f) + return 241; -MCCodeEmitter *llvm::createSIMCCodeEmitter(const MCInstrInfo &MCII, - const MCRegisterInfo &MRI, - const MCSubtargetInfo &STI, - MCContext &Ctx) { - return new SIMCCodeEmitter(MCII, MRI, STI, Ctx); + if (Imm.F == 1.0f) + return 242; + + if (Imm.F == -1.0f) + return 243; + + if (Imm.F == 2.0f) + return 244; + + if (Imm.F == -2.0f) + return 245; + + if (Imm.F == 4.0f) + return 246; + + if (Imm.F == -4.0f) + return 247; + + return 255; } void SIMCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups) const { + uint64_t Encoding = getBinaryCodeForInstr(MI, Fixups); - unsigned bytes = getEncodingBytes(MI); + const MCInstrDesc &Desc = MCII.get(MI.getOpcode()); + unsigned bytes = Desc.getSize(); + for (unsigned i = 0; i < bytes; i++) { OS.write((uint8_t) ((Encoding >> (8 * i)) & 0xff)); } + + if (bytes > 4) + return; + + // Check for additional literals in SRC0/1/2 (Op 1/2/3) + for (unsigned i = 0, e = MI.getNumOperands(); i < e; ++i) { + + // Check if this operand should be encoded as [SV]Src + if (!isSrcOperand(Desc, i)) + continue; + + // Is this operand a literal immediate? + const MCOperand &Op = MI.getOperand(i); + if (getLitEncoding(Op) != 255) + continue; + + // Yes! 
Encode it + IntFloatUnion Imm; + if (Op.isImm()) + Imm.I = Op.getImm(); + else + Imm.F = Op.getFPImm(); + + for (unsigned j = 0; j < 4; j++) { + OS.write((uint8_t) ((Imm.I >> (8 * j)) & 0xff)); + } + + // Only one literal value allowed + break; + } } uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI, const MCOperand &MO, SmallVectorImpl<MCFixup> &Fixups) const { - if (MO.isReg()) { - return getRegBinaryCode(MO.getReg()); - } else if (MO.isImm()) { - return MO.getImm(); - } else if (MO.isFPImm()) { - // XXX: Not all instructions can use inline literals - // XXX: We should make sure this is a 32-bit constant - union { - float F; - uint32_t I; - } Imm; - Imm.F = MO.getFPImm(); - return Imm.I; - } else if (MO.isExpr()) { + if (MO.isReg()) + return MRI.getEncodingValue(MO.getReg()); + + if (MO.isExpr()) { const MCExpr *Expr = MO.getExpr(); MCFixupKind Kind = MCFixupKind(FK_PCRel_4); Fixups.push_back(MCFixup::Create(0, Expr, Kind, MI.getLoc())); return 0; - } else{ - llvm_unreachable("Encoding of this operand type is not supported yet."); } + + // Figure out the operand number, needed for isSrcOperand check + unsigned OpNo = 0; + for (unsigned e = MI.getNumOperands(); OpNo < e; ++OpNo) { + if (&MO == &MI.getOperand(OpNo)) + break; + } + + const MCInstrDesc &Desc = MCII.get(MI.getOpcode()); + if (isSrcOperand(Desc, OpNo)) { + uint32_t Enc = getLitEncoding(MO); + if (Enc != ~0U && (Enc != 255 || Desc.getSize() == 4)) + return Enc; + + } else if (MO.isImm()) + return MO.getImm(); + + llvm_unreachable("Encoding of this operand type is not supported yet."); return 0; } @@ -167,10 +218,10 @@ uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI, unsigned SIMCCodeEmitter::GPRAlign(const MCInst &MI, unsigned OpNo, unsigned shift) const { - unsigned regCode = getRegBinaryCode(MI.getOperand(OpNo).getReg()); - return regCode >> shift; - return 0; + unsigned regCode = MRI.getEncodingValue(MI.getOperand(OpNo).getReg()); + return (regCode & 0xff) >> shift; } + unsigned SIMCCodeEmitter::GPR2AlignEncode(const MCInst &MI, unsigned OpNo , SmallVectorImpl<MCFixup> &Fixup) const { @@ -182,117 +233,3 @@ unsigned SIMCCodeEmitter::GPR4AlignEncode(const MCInst &MI, SmallVectorImpl<MCFixup> &Fixup) const { return GPRAlign(MI, OpNo, 2); } - -#define SMRD_OFFSET_MASK 0xff -#define SMRD_IMM_SHIFT 8 -#define SMRD_SBASE_MASK 0x3f -#define SMRD_SBASE_SHIFT 9 -/// This function is responsibe for encoding the offset -/// and the base ptr for SMRD instructions it should return a bit string in -/// this format: -/// -/// OFFSET = bits{7-0} -/// IMM = bits{8} -/// SBASE = bits{14-9} -/// -uint32_t SIMCCodeEmitter::SMRDmemriEncode(const MCInst &MI, unsigned OpNo, - SmallVectorImpl<MCFixup> &Fixup) const { - uint32_t Encoding; - - const MCOperand &OffsetOp = MI.getOperand(OpNo + 1); - - //XXX: Use this function for SMRD loads with register offsets - assert(OffsetOp.isImm()); - - Encoding = - (getMachineOpValue(MI, OffsetOp, Fixup) & SMRD_OFFSET_MASK) - | (1 << SMRD_IMM_SHIFT) //XXX If the Offset is a register we shouldn't set this bit - | ((GPR2AlignEncode(MI, OpNo, Fixup) & SMRD_SBASE_MASK) << SMRD_SBASE_SHIFT) - ; - - return Encoding; -} - -//===----------------------------------------------------------------------===// -// Post Encoder Callbacks -//===----------------------------------------------------------------------===// - -uint64_t SIMCCodeEmitter::VOPPostEncode(const MCInst &MI, uint64_t Value) const{ - unsigned encodingType = getEncodingType(MI); - unsigned numSrcOps; - unsigned vgprBitOffset; - - if 
(encodingType == SIInstrEncodingType::VOP3) { - numSrcOps = 3; - vgprBitOffset = 32; - } else { - numSrcOps = 1; - vgprBitOffset = 0; - } - - // Add one to skip over the destination reg operand. - for (unsigned opIdx = 1; opIdx < numSrcOps + 1; opIdx++) { - const MCOperand &MO = MI.getOperand(opIdx); - if (MO.isReg()) { - unsigned reg = MI.getOperand(opIdx).getReg(); - if (AMDGPUMCRegisterClasses[AMDGPU::VReg_32RegClassID].contains(reg) || - AMDGPUMCRegisterClasses[AMDGPU::VReg_64RegClassID].contains(reg)) { - Value |= (VGPR_BIT(opIdx)) << vgprBitOffset; - } - } else if (MO.isFPImm()) { - union { - float f; - uint32_t i; - } Imm; - // XXX: Not all instructions can use inline literals - // XXX: We should make sure this is a 32-bit constant - Imm.f = MO.getFPImm(); - Value |= ((uint64_t)Imm.i) << 32; - } - } - return Value; -} - -//===----------------------------------------------------------------------===// -// Encoding helper functions -//===----------------------------------------------------------------------===// - -unsigned SIMCCodeEmitter::getEncodingType(const MCInst &MI) const { - return MCII.get(MI.getOpcode()).TSFlags & SI_INSTR_FLAGS_ENCODING_MASK; -} - -unsigned SIMCCodeEmitter::getEncodingBytes(const MCInst &MI) const { - - // These instructions aren't real instructions with an encoding type, so - // we need to manually specify their size. - switch (MI.getOpcode()) { - default: break; - case AMDGPU::SI_LOAD_LITERAL_I32: - case AMDGPU::SI_LOAD_LITERAL_F32: - return 4; - } - - unsigned encoding_type = getEncodingType(MI); - switch (encoding_type) { - case SIInstrEncodingType::EXP: - case SIInstrEncodingType::LDS: - case SIInstrEncodingType::MUBUF: - case SIInstrEncodingType::MTBUF: - case SIInstrEncodingType::MIMG: - case SIInstrEncodingType::VOP3: - return 8; - default: - return 4; - } -} - - -unsigned SIMCCodeEmitter::getRegBinaryCode(unsigned reg) const { - switch (reg) { - case AMDGPU::M0: return 124; - case AMDGPU::SREG_LIT_0: return 128; - case AMDGPU::SI_LITERAL_CONSTANT: return 255; - default: return MRI.getEncodingValue(reg); - } -} - diff --git a/lib/Target/R600/Processors.td b/lib/Target/R600/Processors.td index 3dc1ecd..868810c 100644 --- a/lib/Target/R600/Processors.td +++ b/lib/Target/R600/Processors.td @@ -13,6 +13,7 @@ class Proc<string Name, ProcessorItineraries itin, list<SubtargetFeature> Features> : Processor<Name, itin, Features>; +def : Proc<"", R600_EG_Itin, [FeatureR600ALUInst]>; def : Proc<"r600", R600_EG_Itin, [FeatureR600ALUInst]>; def : Proc<"rv710", R600_EG_Itin, []>; def : Proc<"rv730", R600_EG_Itin, []>; diff --git a/lib/Target/R600/R600Defines.h b/lib/Target/R600/R600Defines.h index 7dea8e4..16cfcf5 100644 --- a/lib/Target/R600/R600Defines.h +++ b/lib/Target/R600/R600Defines.h @@ -49,6 +49,9 @@ namespace R600_InstFlag { #define HW_REG_MASK 0x1ff #define HW_CHAN_SHIFT 9 +#define GET_REG_CHAN(reg) ((reg) >> HW_CHAN_SHIFT) +#define GET_REG_INDEX(reg) ((reg) & HW_REG_MASK) + namespace R600Operands { enum Ops { DST, @@ -62,18 +65,33 @@ namespace R600Operands { SRC0_NEG, SRC0_REL, SRC0_ABS, + SRC0_SEL, SRC1, SRC1_NEG, SRC1_REL, SRC1_ABS, + SRC1_SEL, SRC2, SRC2_NEG, SRC2_REL, + SRC2_SEL, LAST, PRED_SEL, IMM, COUNT }; + + const static int ALUOpTable[3][R600Operands::COUNT] = { +// W C S S S S S S S S S S S +// R O D L S R R R R S R R R R S R R R L P +// D U I M R A R C C C C R C C C C R C C C A R I +// S E U T O E M C 0 0 0 0 C 1 1 1 1 C 2 2 2 S E M +// T M P E D L P 0 N R A S 1 N R A S 2 N R S T D M + {0,-1,-1, 1, 2, 3, 4, 5, 6, 7, 8, 
9,-1,-1,-1,-1,-1,-1,-1,-1,-1,10,11,12}, + {0, 1, 2, 3, 4 ,5 ,6 ,7, 8, 9,10,11,12,13,14,15,16,-1,-1,-1,-1,17,18,19}, + {0,-1,-1,-1,-1, 1, 2, 3, 4, 5,-1, 6, 7, 8, 9,-1,10,11,12,13,14,15,16,17} + }; + } #endif // R600DEFINES_H_ diff --git a/lib/Target/R600/R600ExpandSpecialInstrs.cpp b/lib/Target/R600/R600ExpandSpecialInstrs.cpp index b903d4a..f8c900f 100644 --- a/lib/Target/R600/R600ExpandSpecialInstrs.cpp +++ b/lib/Target/R600/R600ExpandSpecialInstrs.cpp @@ -55,118 +55,6 @@ FunctionPass *llvm::createR600ExpandSpecialInstrsPass(TargetMachine &TM) { return new R600ExpandSpecialInstrsPass(TM); } -bool R600ExpandSpecialInstrsPass::ExpandInputPerspective(MachineInstr &MI) { - const R600RegisterInfo &TRI = TII->getRegisterInfo(); - if (MI.getOpcode() != AMDGPU::input_perspective) - return false; - - MachineBasicBlock::iterator I = &MI; - unsigned DstReg = MI.getOperand(0).getReg(); - R600MachineFunctionInfo *MFI = MI.getParent()->getParent() - ->getInfo<R600MachineFunctionInfo>(); - unsigned IJIndexBase; - - // In Evergreen ISA doc section 8.3.2 : - // We need to interpolate XY and ZW in two different instruction groups. - // An INTERP_* must occupy all 4 slots of an instruction group. - // Output of INTERP_XY is written in X,Y slots - // Output of INTERP_ZW is written in Z,W slots - // - // Thus interpolation requires the following sequences : - // - // AnyGPR.x = INTERP_ZW; (Write Masked Out) - // AnyGPR.y = INTERP_ZW; (Write Masked Out) - // DstGPR.z = INTERP_ZW; - // DstGPR.w = INTERP_ZW; (End of first IG) - // DstGPR.x = INTERP_XY; - // DstGPR.y = INTERP_XY; - // AnyGPR.z = INTERP_XY; (Write Masked Out) - // AnyGPR.w = INTERP_XY; (Write Masked Out) (End of second IG) - // - switch (MI.getOperand(1).getImm()) { - case 0: - IJIndexBase = MFI->GetIJPerspectiveIndex(); - break; - case 1: - IJIndexBase = MFI->GetIJLinearIndex(); - break; - default: - assert(0 && "Unknow ij index"); - } - - for (unsigned i = 0; i < 8; i++) { - unsigned IJIndex = AMDGPU::R600_TReg32RegClass.getRegister( - 2 * IJIndexBase + ((i + 1) % 2)); - unsigned ReadReg = AMDGPU::R600_ArrayBaseRegClass.getRegister( - MI.getOperand(2).getImm()); - - - unsigned Sel = AMDGPU::sel_x; - switch (i % 4) { - case 0:Sel = AMDGPU::sel_x;break; - case 1:Sel = AMDGPU::sel_y;break; - case 2:Sel = AMDGPU::sel_z;break; - case 3:Sel = AMDGPU::sel_w;break; - default:break; - } - - unsigned Res = TRI.getSubReg(DstReg, Sel); - - unsigned Opcode = (i < 4)?AMDGPU::INTERP_ZW:AMDGPU::INTERP_XY; - - MachineBasicBlock &MBB = *(MI.getParent()); - MachineInstr *NewMI = - TII->buildDefaultInstruction(MBB, I, Opcode, Res, IJIndex, ReadReg); - - if (!(i> 1 && i < 6)) { - TII->addFlag(NewMI, 0, MO_FLAG_MASK); - } - - if (i % 4 != 3) - TII->addFlag(NewMI, 0, MO_FLAG_NOT_LAST); - } - - MI.eraseFromParent(); - - return true; -} - -bool R600ExpandSpecialInstrsPass::ExpandInputConstant(MachineInstr &MI) { - const R600RegisterInfo &TRI = TII->getRegisterInfo(); - if (MI.getOpcode() != AMDGPU::input_constant) - return false; - - MachineBasicBlock::iterator I = &MI; - unsigned DstReg = MI.getOperand(0).getReg(); - - for (unsigned i = 0; i < 4; i++) { - unsigned ReadReg = AMDGPU::R600_ArrayBaseRegClass.getRegister( - MI.getOperand(1).getImm()); - - unsigned Sel = AMDGPU::sel_x; - switch (i % 4) { - case 0:Sel = AMDGPU::sel_x;break; - case 1:Sel = AMDGPU::sel_y;break; - case 2:Sel = AMDGPU::sel_z;break; - case 3:Sel = AMDGPU::sel_w;break; - default:break; - } - - unsigned Res = TRI.getSubReg(DstReg, Sel); - - MachineBasicBlock &MBB = *(MI.getParent()); - MachineInstr 
*NewMI = TII->buildDefaultInstruction( - MBB, I, AMDGPU::INTERP_LOAD_P0, Res, ReadReg); - - if (i % 4 != 3) - TII->addFlag(NewMI, 0, MO_FLAG_NOT_LAST); - } - - MI.eraseFromParent(); - - return true; -} - bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) { const R600RegisterInfo &TRI = TII->getRegisterInfo(); @@ -200,7 +88,7 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) { MI.eraseFromParent(); continue; } - case AMDGPU::BREAK: + case AMDGPU::BREAK: { MachineInstr *PredSet = TII->buildDefaultInstruction(MBB, I, AMDGPU::PRED_SETE_INT, AMDGPU::PREDICATE_BIT, @@ -214,12 +102,87 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) { .addReg(AMDGPU::PREDICATE_BIT); MI.eraseFromParent(); continue; - } + } - if (ExpandInputPerspective(MI)) - continue; - if (ExpandInputConstant(MI)) - continue; + case AMDGPU::INTERP_PAIR_XY: { + MachineInstr *BMI; + unsigned PReg = AMDGPU::R600_ArrayBaseRegClass.getRegister( + MI.getOperand(2).getImm()); + + for (unsigned Chan = 0; Chan < 4; ++Chan) { + unsigned DstReg; + + if (Chan < 2) + DstReg = MI.getOperand(Chan).getReg(); + else + DstReg = Chan == 2 ? AMDGPU::T0_Z : AMDGPU::T0_W; + + BMI = TII->buildDefaultInstruction(MBB, I, AMDGPU::INTERP_XY, + DstReg, MI.getOperand(3 + (Chan % 2)).getReg(), PReg); + + if (Chan > 0) { + BMI->bundleWithPred(); + } + if (Chan >= 2) + TII->addFlag(BMI, 0, MO_FLAG_MASK); + if (Chan != 3) + TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST); + } + + MI.eraseFromParent(); + continue; + } + + case AMDGPU::INTERP_PAIR_ZW: { + MachineInstr *BMI; + unsigned PReg = AMDGPU::R600_ArrayBaseRegClass.getRegister( + MI.getOperand(2).getImm()); + + for (unsigned Chan = 0; Chan < 4; ++Chan) { + unsigned DstReg; + + if (Chan < 2) + DstReg = Chan == 0 ? 
AMDGPU::T0_X : AMDGPU::T0_Y; + else + DstReg = MI.getOperand(Chan-2).getReg(); + + BMI = TII->buildDefaultInstruction(MBB, I, AMDGPU::INTERP_ZW, + DstReg, MI.getOperand(3 + (Chan % 2)).getReg(), PReg); + + if (Chan > 0) { + BMI->bundleWithPred(); + } + if (Chan < 2) + TII->addFlag(BMI, 0, MO_FLAG_MASK); + if (Chan != 3) + TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST); + } + + MI.eraseFromParent(); + continue; + } + + case AMDGPU::INTERP_VEC_LOAD: { + const R600RegisterInfo &TRI = TII->getRegisterInfo(); + MachineInstr *BMI; + unsigned PReg = AMDGPU::R600_ArrayBaseRegClass.getRegister( + MI.getOperand(1).getImm()); + unsigned DstReg = MI.getOperand(0).getReg(); + + for (unsigned Chan = 0; Chan < 4; ++Chan) { + BMI = TII->buildDefaultInstruction(MBB, I, AMDGPU::INTERP_LOAD_P0, + TRI.getSubReg(DstReg, TRI.getSubRegFromChannel(Chan)), PReg); + if (Chan > 0) { + BMI->bundleWithPred(); + } + if (Chan != 3) + TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST); + } + + MI.eraseFromParent(); + continue; + } + } bool IsReduction = TII->isReductionOp(MI.getOpcode()); bool IsVector = TII->isVector(MI); diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp index f0eece3..b5c2a93 100644 --- a/lib/Target/R600/R600ISelLowering.cpp +++ b/lib/Target/R600/R600ISelLowering.cpp @@ -16,6 +16,7 @@ #include "R600Defines.h" #include "R600InstrInfo.h" #include "R600MachineFunctionInfo.h" +#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/SelectionDAG.h" @@ -71,10 +72,27 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) : setOperationAction(ISD::SELECT, MVT::i32, Custom); setOperationAction(ISD::SELECT, MVT::f32, Custom); + // Legalize loads and stores to the private address space. 
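Note: the R600ISelLowering.cpp changes beginning here replace the old RESERVE_REG/R600_LOAD_CONST machinery (deleted below) with custom lowering of private-address loads, stores, and frame indices. ISD::FrameIndex collapses to a plain i32 constant in LowerFrameIndex() in a later hunk, computed as Offset * 4 * getStackWidth(MF). A tiny restatement of that arithmetic, with sample values that are illustrative only:

  // Frame object offset -> indirect-address immediate, per LowerFrameIndex().
  static unsigned frameIndexImmediate(unsigned Offset, unsigned StackWidth) {
    return Offset * 4 * StackWidth; // 4 dword channels per stack-width unit
  }
  // e.g. Offset = 2, StackWidth = 1  ->  immediate 8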
+ setOperationAction(ISD::LOAD, MVT::i32, Custom); + setOperationAction(ISD::LOAD, MVT::v2i32, Custom); + setOperationAction(ISD::LOAD, MVT::v4i32, Custom); + setLoadExtAction(ISD::EXTLOAD, MVT::v4i8, Custom); + setLoadExtAction(ISD::EXTLOAD, MVT::i8, Custom); + setLoadExtAction(ISD::ZEXTLOAD, MVT::i8, Custom); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i8, Custom); + setOperationAction(ISD::STORE, MVT::i8, Custom); setOperationAction(ISD::STORE, MVT::i32, Custom); + setOperationAction(ISD::STORE, MVT::v2i32, Custom); setOperationAction(ISD::STORE, MVT::v4i32, Custom); + setOperationAction(ISD::LOAD, MVT::i32, Custom); + setOperationAction(ISD::LOAD, MVT::v4i32, Custom); + setOperationAction(ISD::FrameIndex, MVT::i32, Custom); + setTargetDAGCombine(ISD::FP_ROUND); + setTargetDAGCombine(ISD::FP_TO_SINT); + setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); + setTargetDAGCombine(ISD::SELECT_CC); setSchedulingPreference(Sched::VLIW); } @@ -115,15 +133,6 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter( break; } - case AMDGPU::R600_LOAD_CONST: { - int64_t RegIndex = MI->getOperand(1).getImm(); - unsigned ConstantReg = AMDGPU::R600_CReg32RegClass.getRegister(RegIndex); - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::COPY)) - .addOperand(MI->getOperand(0)) - .addReg(ConstantReg); - break; - } - case AMDGPU::MASK_WRITE: { unsigned maskedRegister = MI->getOperand(0).getReg(); assert(TargetRegisterInfo::isVirtualRegister(maskedRegister)); @@ -154,18 +163,6 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter( break; } - case AMDGPU::RESERVE_REG: { - R600MachineFunctionInfo * MFI = MF->getInfo<R600MachineFunctionInfo>(); - int64_t ReservedIndex = MI->getOperand(0).getImm(); - unsigned ReservedReg = - AMDGPU::R600_TReg32RegClass.getRegister(ReservedIndex); - MFI->ReservedRegs.push_back(ReservedReg); - unsigned SuperReg = - AMDGPU::R600_Reg128RegClass.getRegister(ReservedIndex / 4); - MFI->ReservedRegs.push_back(SuperReg); - break; - } - case AMDGPU::TXD: { unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass); unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass); @@ -250,33 +247,26 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter( break; } - case AMDGPU::input_perspective: { - R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>(); - - // XXX Be more fine about register reservation - for (unsigned i = 0; i < 4; i ++) { - unsigned ReservedReg = AMDGPU::R600_TReg32RegClass.getRegister(i); - MFI->ReservedRegs.push_back(ReservedReg); - } - - switch (MI->getOperand(1).getImm()) { - case 0:// Perspective - MFI->HasPerspectiveInterpolation = true; - break; - case 1:// Linear - MFI->HasLinearInterpolation = true; - break; - default: - assert(0 && "Unknow ij index"); - } - - return BB; - } - case AMDGPU::EG_ExportSwz: case AMDGPU::R600_ExportSwz: { + // Instruction is left unmodified if it's not the last one of its type + bool isLastInstructionOfItsType = true; + unsigned InstExportType = MI->getOperand(1).getImm(); + for (MachineBasicBlock::iterator NextExportInst = llvm::next(I), + EndBlock = BB->end(); NextExportInst != EndBlock; + NextExportInst = llvm::next(NextExportInst)) { + if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz || + NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) { + unsigned CurrentInstExportType = NextExportInst->getOperand(1) + .getImm(); + if (CurrentInstExportType == InstExportType) { + isLastInstructionOfItsType = false; + break; + } + } + } bool EOP =
(llvm::next(I)->getOpcode() == AMDGPU::RETURN)? 1 : 0; - if (!EOP) + if (!EOP && !isLastInstructionOfItsType) return BB; unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz)? 84 : 40; BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode())) @@ -288,9 +278,18 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter( .addOperand(MI->getOperand(5)) .addOperand(MI->getOperand(6)) .addImm(CfInst) - .addImm(1); + .addImm(EOP); break; } + case AMDGPU::RETURN: { + // RETURN instructions must have the live-out registers as implicit uses, + // otherwise they appear dead. + R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>(); + MachineInstrBuilder MIB(*MF, MI); + for (unsigned i = 0, e = MFI->LiveOuts.size(); i != e; ++i) + MIB.addReg(MFI->LiveOuts[i], RegState::Implicit); + return BB; + } } MI->eraseFromParent(); @@ -304,57 +303,6 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter( using namespace llvm::Intrinsic; using namespace llvm::AMDGPUIntrinsic; -static SDValue -InsertScalarToRegisterExport(SelectionDAG &DAG, DebugLoc DL, SDNode **ExportMap, - unsigned Slot, unsigned Channel, unsigned Inst, unsigned Type, - SDValue Scalar, SDValue Chain) { - if (!ExportMap[Slot]) { - SDValue Vector = DAG.getNode(ISD::INSERT_VECTOR_ELT, - DL, MVT::v4f32, - DAG.getUNDEF(MVT::v4f32), - Scalar, - DAG.getConstant(Channel, MVT::i32)); - - unsigned Mask = 1 << Channel; - - const SDValue Ops[] = {Chain, Vector, DAG.getConstant(Inst, MVT::i32), - DAG.getConstant(Type, MVT::i32), DAG.getConstant(Slot, MVT::i32), - DAG.getConstant(Mask, MVT::i32)}; - - SDValue Res = DAG.getNode( - AMDGPUISD::EXPORT, - DL, - MVT::Other, - Ops, 6); - ExportMap[Slot] = Res.getNode(); - return Res; - } - - SDNode *ExportInstruction = (SDNode *) ExportMap[Slot] ; - SDValue PreviousVector = ExportInstruction->getOperand(1); - SDValue Vector = DAG.getNode(ISD::INSERT_VECTOR_ELT, - DL, MVT::v4f32, - PreviousVector, - Scalar, - DAG.getConstant(Channel, MVT::i32)); - - unsigned Mask = dyn_cast<ConstantSDNode>(ExportInstruction->getOperand(5)) - ->getZExtValue(); - Mask |= (1 << Channel); - - const SDValue Ops[] = {ExportInstruction->getOperand(0), Vector, - DAG.getConstant(Inst, MVT::i32), - DAG.getConstant(Type, MVT::i32), - DAG.getConstant(Slot, MVT::i32), - DAG.getConstant(Mask, MVT::i32)}; - - DAG.UpdateNodeOperands(ExportInstruction, - Ops, 6); - - return Chain; - -} - SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); @@ -364,7 +312,9 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const case ISD::SELECT: return LowerSELECT(Op, DAG); case ISD::SETCC: return LowerSETCC(Op, DAG); case ISD::STORE: return LowerSTORE(Op, DAG); + case ISD::LOAD: return LowerLOAD(Op, DAG); case ISD::FPOW: return LowerFPOW(Op, DAG); + case ISD::FrameIndex: return LowerFrameIndex(Op, DAG); case ISD::INTRINSIC_VOID: { SDValue Chain = Op.getOperand(0); unsigned IntrinsicID = @@ -372,58 +322,27 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const switch (IntrinsicID) { case AMDGPUIntrinsic::AMDGPU_store_output: { MachineFunction &MF = DAG.getMachineFunction(); - MachineRegisterInfo &MRI = MF.getRegInfo(); + R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>(); int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue(); unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex); - if 
(!MRI.isLiveOut(Reg)) { - MRI.addLiveOut(Reg); - } + MFI->LiveOuts.push_back(Reg); return DAG.getCopyToReg(Chain, Op.getDebugLoc(), Reg, Op.getOperand(2)); } - case AMDGPUIntrinsic::R600_store_pixel_color: { - MachineFunction &MF = DAG.getMachineFunction(); - R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>(); - int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue(); - - SDNode **OutputsMap = MFI->Outputs; - return InsertScalarToRegisterExport(DAG, Op.getDebugLoc(), OutputsMap, - RegIndex / 4, RegIndex % 4, 0, 0, Op.getOperand(2), - Chain); - + case AMDGPUIntrinsic::R600_store_swizzle: { + const SDValue Args[8] = { + Chain, + Op.getOperand(2), // Export Value + Op.getOperand(3), // ArrayBase + Op.getOperand(4), // Type + DAG.getConstant(0, MVT::i32), // SWZ_X + DAG.getConstant(1, MVT::i32), // SWZ_Y + DAG.getConstant(2, MVT::i32), // SWZ_Z + DAG.getConstant(3, MVT::i32) // SWZ_W + }; + return DAG.getNode(AMDGPUISD::EXPORT, Op.getDebugLoc(), Op.getValueType(), + Args, 8); } - case AMDGPUIntrinsic::R600_store_stream_output : { - MachineFunction &MF = DAG.getMachineFunction(); - R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>(); - int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue(); - int64_t BufIndex = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue(); - - SDNode **OutputsMap = MFI->StreamOutputs[BufIndex]; - unsigned Inst; - switch (cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue() ) { - // STREAM3 - case 3: - Inst = 4; - break; - // STREAM2 - case 2: - Inst = 3; - break; - // STREAM1 - case 1: - Inst = 2; - break; - // STREAM0 - case 0: - Inst = 1; - break; - default: - llvm_unreachable("Wrong buffer id for stream outputs !"); - } - return InsertScalarToRegisterExport(DAG, Op.getDebugLoc(), OutputsMap, - RegIndex / 4, RegIndex % 4, Inst, 0, Op.getOperand(2), - Chain); - } // default for switch(IntrinsicID) default: break; } @@ -442,38 +361,35 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex); return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, Reg, VT); } - case AMDGPUIntrinsic::R600_load_input_perspective: { - int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); - if (slot < 0) - return DAG.getUNDEF(MVT::f32); - SDValue FullVector = DAG.getNode( - AMDGPUISD::INTERP, - DL, MVT::v4f32, - DAG.getConstant(0, MVT::i32), DAG.getConstant(slot / 4 , MVT::i32)); - return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, - DL, VT, FullVector, DAG.getConstant(slot % 4, MVT::i32)); - } - case AMDGPUIntrinsic::R600_load_input_linear: { - int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); - if (slot < 0) - return DAG.getUNDEF(MVT::f32); - SDValue FullVector = DAG.getNode( - AMDGPUISD::INTERP, - DL, MVT::v4f32, - DAG.getConstant(1, MVT::i32), DAG.getConstant(slot / 4 , MVT::i32)); - return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, - DL, VT, FullVector, DAG.getConstant(slot % 4, MVT::i32)); - } - case AMDGPUIntrinsic::R600_load_input_constant: { + + case AMDGPUIntrinsic::R600_interp_input: { int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); - if (slot < 0) - return DAG.getUNDEF(MVT::f32); - SDValue FullVector = DAG.getNode( - AMDGPUISD::INTERP_P0, - DL, MVT::v4f32, - DAG.getConstant(slot / 4 , MVT::i32)); - return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, - DL, VT, FullVector, DAG.getConstant(slot % 4, MVT::i32)); + int ijb = cast<ConstantSDNode>(Op.getOperand(2))->getSExtValue(); + 
MachineSDNode *interp; + if (ijb < 0) { + interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL, + MVT::v4f32, DAG.getTargetConstant(slot / 4 , MVT::i32)); + return DAG.getTargetExtractSubreg( + TII->getRegisterInfo().getSubRegFromChannel(slot % 4), + DL, MVT::f32, SDValue(interp, 0)); + } + + if (slot % 4 < 2) + interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL, + MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32), + CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, + AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1), MVT::f32), + CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, + AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb), MVT::f32)); + else + interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL, + MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32), + CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, + AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1), MVT::f32), + CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, + AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb), MVT::f32)); + + return SDValue(interp, slot % 2); } case r600_read_ngroups_x: @@ -527,6 +443,20 @@ void R600TargetLowering::ReplaceNodeResults(SDNode *N, switch (N->getOpcode()) { default: return; case ISD::FP_TO_UINT: Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG)); + return; + case ISD::LOAD: { + SDNode *Node = LowerLOAD(SDValue(N, 0), DAG).getNode(); + Results.push_back(SDValue(Node, 0)); + Results.push_back(SDValue(Node, 1)); + // XXX: LLVM seems not to replace Chain Value inside CustomWidenLowerNode + // function + DAG.ReplaceAllUsesOfValueWith(SDValue(N,1), SDValue(Node, 1)); + return; + } + case ISD::STORE: + SDNode *Node = LowerSTORE(SDValue(N, 0), DAG).getNode(); + Results.push_back(SDValue(Node, 0)); + return; } } @@ -594,6 +524,20 @@ SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT, false, false, false, 0); } +SDValue R600TargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const { + + MachineFunction &MF = DAG.getMachineFunction(); + const AMDGPUFrameLowering *TFL = + static_cast<const AMDGPUFrameLowering*>(getTargetMachine().getFrameLowering()); + + FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Op); + assert(FIN); + + unsigned FrameIndex = FIN->getIndex(); + unsigned Offset = TFL->getFrameIndexOffset(MF, FrameIndex); + return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), MVT::i32); +} + SDValue R600TargetLowering::LowerROTL(SDValue Op, SelectionDAG &DAG) const { DebugLoc DL = Op.getDebugLoc(); EVT VT = Op.getValueType(); @@ -680,9 +624,12 @@ SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const } // Try to lower to a SET* instruction: - // We need all the operands of SELECT_CC to have the same value type, so if - // necessary we need to change True and False to be the same type as LHS and - // RHS, and then convert the result of the select_cc back to the correct type. + // + // CompareVT == MVT::f32 and VT == MVT::i32 is supported by the hardware, + // but for the other case where CompareVT != VT, all operands of + // SELECT_CC need to have the same value type, so we need to change True and + // False to be the same type as LHS and RHS, and then convert the result of + // the select_cc back to the correct type. // Move hardware True/False values to the correct operand. 
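A small worked example of the swap performed just below (values illustrative; 1.0f/0.0f are the hardware true/false constants that isHWTrueValue/isHWFalseValue recognize for f32):

    // Before: (select_cc x, y, 0.0f /*True*/, 1.0f /*False*/, SETLT)
    // After swapping True/False and inverting the condition code:
    //         (select_cc x, y, 1.0f /*True*/, 0.0f /*False*/, SETGE)
    // Both select 1.0f exactly when x >= y, so the node can now match a
    // pattern such as SGE directly instead of needing an extra CND*.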
   if (isHWTrueValue(False) && isHWFalseValue(True)) {
@@ -692,32 +639,17 @@ SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const
   }
 
   if (isHWTrueValue(True) && isHWFalseValue(False)) {
-    if (CompareVT != VT) {
-      if (VT == MVT::f32 && CompareVT == MVT::i32) {
-        SDValue Boolean = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
-            LHS, RHS,
-            DAG.getConstant(-1, MVT::i32),
-            DAG.getConstant(0, MVT::i32),
-            CC);
-        // Convert integer values of true (-1) and false (0) to fp values of
-        // true (1.0f) and false (0.0f).
-        SDValue LSB = DAG.getNode(ISD::AND, DL, MVT::i32, Boolean,
-            DAG.getConstant(1, MVT::i32));
-        return DAG.getNode(ISD::UINT_TO_FP, DL, VT, LSB);
-      } else if (VT == MVT::i32 && CompareVT == MVT::f32) {
-        SDValue BoolAsFlt = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
-            LHS, RHS,
-            DAG.getConstantFP(1.0f, MVT::f32),
-            DAG.getConstantFP(0.0f, MVT::f32),
-            CC);
-        // Convert fp values of true (1.0f) and false (0.0f) to integer values
-        // of true (-1) and false (0).
-        SDValue Neg = DAG.getNode(ISD::FNEG, DL, MVT::f32, BoolAsFlt);
-        return DAG.getNode(ISD::FP_TO_SINT, DL, VT, Neg);
-      } else {
-        // I don't think there will be any other type pairings.
-        assert(!"Unhandled operand type parings in SELECT_CC");
-      }
+    if (CompareVT != VT && VT == MVT::f32 && CompareVT == MVT::i32) {
+      SDValue Boolean = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
+          LHS, RHS,
+          DAG.getConstant(-1, MVT::i32),
+          DAG.getConstant(0, MVT::i32),
+          CC);
+      // Convert integer values of true (-1) and false (0) to fp values of
+      // true (1.0f) and false (0.0f).
+      SDValue LSB = DAG.getNode(ISD::AND, DL, MVT::i32, Boolean,
+          DAG.getConstant(1, MVT::i32));
+      return DAG.getNode(ISD::UINT_TO_FP, DL, VT, LSB);
     } else {
       // This SELECT_CC is already legal.
       return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
@@ -808,6 +740,61 @@ SDValue R600TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
   return Cond;
 }
 
+/// LLVM generates byte-addressed pointers. For indirect addressing, we need to
+/// convert these pointers to a register index. Each register holds
+/// 16 bytes (4 x 32-bit sub-registers), but we need to take into account the
+/// \p StackWidth, which tells us how many of the 4 sub-registers will be used
+/// for indirect addressing.
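A sketch of the address arithmetic this describes (shift amounts taken from the stackPtrToRegIndex switch that follows; the example address is made up):

    // StackWidth 1: RegIndex = Addr >> 2   (one 32-bit channel per slot)
    // StackWidth 2: RegIndex = Addr >> 3   (two channels per slot)
    // StackWidth 4: RegIndex = Addr >> 4   (all four channels per slot)
    // e.g. with StackWidth == 2, byte address 24 maps to register index 3.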
+SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
+                                               unsigned StackWidth,
+                                               SelectionDAG &DAG) const {
+  unsigned SRLPad;
+  switch(StackWidth) {
+  case 1:
+    SRLPad = 2;
+    break;
+  case 2:
+    SRLPad = 3;
+    break;
+  case 4:
+    SRLPad = 4;
+    break;
+  default: llvm_unreachable("Invalid stack width");
+  }
+
+  return DAG.getNode(ISD::SRL, Ptr.getDebugLoc(), Ptr.getValueType(), Ptr,
+                     DAG.getConstant(SRLPad, MVT::i32));
+}
+
+void R600TargetLowering::getStackAddress(unsigned StackWidth,
+                                         unsigned ElemIdx,
+                                         unsigned &Channel,
+                                         unsigned &PtrIncr) const {
+  switch (StackWidth) {
+  default:
+  case 1:
+    Channel = 0;
+    if (ElemIdx > 0) {
+      PtrIncr = 1;
+    } else {
+      PtrIncr = 0;
+    }
+    break;
+  case 2:
+    Channel = ElemIdx % 2;
+    if (ElemIdx == 2) {
+      PtrIncr = 1;
+    } else {
+      PtrIncr = 0;
+    }
+    break;
+  case 4:
+    Channel = ElemIdx;
+    PtrIncr = 0;
+    break;
+  }
+}
+
 SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
   DebugLoc DL = Op.getDebugLoc();
   StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
@@ -829,9 +816,188 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
     }
     return Chain;
   }
-  return SDValue();
+
+  EVT ValueVT = Value.getValueType();
+
+  if (StoreNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
+    return SDValue();
+  }
+
+  // Lowering for indirect addressing
+
+  const MachineFunction &MF = DAG.getMachineFunction();
+  const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
+      getTargetMachine().getFrameLowering());
+  unsigned StackWidth = TFL->getStackWidth(MF);
+
+  Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
+
+  if (ValueVT.isVector()) {
+    unsigned NumElemVT = ValueVT.getVectorNumElements();
+    EVT ElemVT = ValueVT.getVectorElementType();
+    SDValue Stores[4];
+
+    assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
+                                      "vector width in store");
+
+    for (unsigned i = 0; i < NumElemVT; ++i) {
+      unsigned Channel, PtrIncr;
+      getStackAddress(StackWidth, i, Channel, PtrIncr);
+      Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
+                        DAG.getConstant(PtrIncr, MVT::i32));
+      SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT,
+                                 Value, DAG.getConstant(i, MVT::i32));
+
+      Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
+                              Chain, Elem, Ptr,
+                              DAG.getTargetConstant(Channel, MVT::i32));
+    }
+    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores, NumElemVT);
+  } else {
+    if (ValueVT == MVT::i8) {
+      Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value);
+    }
+    Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value, Ptr,
+                        DAG.getTargetConstant(0, MVT::i32)); // Channel
+  }
+
+  return Chain;
+}
+
+// Returns 512 + (kc_bank << 12)
+static int
+ConstantAddressBlock(unsigned AddressSpace) {
+  switch (AddressSpace) {
+  case AMDGPUAS::CONSTANT_BUFFER_0:
+    return 512;
+  case AMDGPUAS::CONSTANT_BUFFER_1:
+    return 512 + 4096;
+  case AMDGPUAS::CONSTANT_BUFFER_2:
+    return 512 + 4096 * 2;
+  case AMDGPUAS::CONSTANT_BUFFER_3:
+    return 512 + 4096 * 3;
+  case AMDGPUAS::CONSTANT_BUFFER_4:
+    return 512 + 4096 * 4;
+  case AMDGPUAS::CONSTANT_BUFFER_5:
+    return 512 + 4096 * 5;
+  case AMDGPUAS::CONSTANT_BUFFER_6:
+    return 512 + 4096 * 6;
+  case AMDGPUAS::CONSTANT_BUFFER_7:
+    return 512 + 4096 * 7;
+  case AMDGPUAS::CONSTANT_BUFFER_8:
+    return 512 + 4096 * 8;
+  case AMDGPUAS::CONSTANT_BUFFER_9:
+    return 512 + 4096 * 9;
+  case AMDGPUAS::CONSTANT_BUFFER_10:
+    return 512 + 4096 * 10;
+  case AMDGPUAS::CONSTANT_BUFFER_11:
+    return 512 + 4096 * 11;
+  case AMDGPUAS::CONSTANT_BUFFER_12:
+    return 512 + 4096 * 12;
+  case AMDGPUAS::CONSTANT_BUFFER_13:
+    return 512 + 4096 * 13;
+  case AMDGPUAS::CONSTANT_BUFFER_14:
+    return 512 + 4096 * 14;
+  case AMDGPUAS::CONSTANT_BUFFER_15:
+    return 512 + 4096 * 15;
+  default:
+    return -1;
+  }
+}
+
+SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
+{
+  EVT VT = Op.getValueType();
+  DebugLoc DL = Op.getDebugLoc();
+  LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
+  SDValue Chain = Op.getOperand(0);
+  SDValue Ptr = Op.getOperand(1);
+  SDValue LoweredLoad;
+
+  int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
+  if (ConstantBlock > -1) {
+    SDValue Result;
+    if (dyn_cast<ConstantExpr>(LoadNode->getSrcValue()) ||
+        dyn_cast<Constant>(LoadNode->getSrcValue())) {
+      SDValue Slots[4];
+      for (unsigned i = 0; i < 4; i++) {
+        // We want the Const position encoded with the following formula:
+        // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
+        // const_index is Ptr computed by llvm using an alignment of 16.
+        // Thus we add (512 + (kc_bank << 12)) * 16 + (chan * 4) here and
+        // then divide by 4 at the ISel step
+        SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
+            DAG.getConstant(4 * i + ConstantBlock * 16, MVT::i32));
+        Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
+      }
+      Result = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Slots, 4);
+    } else {
+      // A non-constant Ptr can't be folded; keep it as a v4i32 load
+      Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
+          DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(4, MVT::i32))
+          );
+    }
+
+    if (!VT.isVector()) {
+      Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
+          DAG.getConstant(0, MVT::i32));
+    }
+
+    SDValue MergedValues[2] = {
+      Result,
+      Chain
+    };
+    return DAG.getMergeValues(MergedValues, 2, DL);
+  }
+
+  if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
+    return SDValue();
+  }
+
+  // Lowering for indirect addressing
+  const MachineFunction &MF = DAG.getMachineFunction();
+  const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
+      getTargetMachine().getFrameLowering());
+  unsigned StackWidth = TFL->getStackWidth(MF);
+
+  Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
+
+  if (VT.isVector()) {
+    unsigned NumElemVT = VT.getVectorNumElements();
+    EVT ElemVT = VT.getVectorElementType();
+    SDValue Loads[4];
+
+    assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
+                                      "vector width in load");
+
+    for (unsigned i = 0; i < NumElemVT; ++i) {
+      unsigned Channel, PtrIncr;
+      getStackAddress(StackWidth, i, Channel, PtrIncr);
+      Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
+                        DAG.getConstant(PtrIncr, MVT::i32));
+      Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT,
+                             Chain, Ptr,
+                             DAG.getTargetConstant(Channel, MVT::i32),
+                             Op.getOperand(2));
+    }
+    for (unsigned i = NumElemVT; i < 4; ++i) {
+      Loads[i] = DAG.getUNDEF(ElemVT);
+    }
+    EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4);
+    LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads, 4);
+  } else {
+    LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
+                              Chain, Ptr,
+                              DAG.getTargetConstant(0, MVT::i32), // Channel
+                              Op.getOperand(2));
+  }
+
+  SDValue Ops[2];
+  Ops[0] = LoweredLoad;
+  Ops[1] = Chain;
+
+  return DAG.getMergeValues(Ops, 2, DL);
+}
 
 SDValue R600TargetLowering::LowerFPOW(SDValue Op, SelectionDAG &DAG) const {
@@ -873,7 +1039,7 @@ SDValue R600TargetLowering::LowerFormalArguments(
                                     AMDGPUAS::PARAM_I_ADDRESS);
     SDValue Arg =
DAG.getExtLoad(ISD::ZEXTLOAD, DL, VT, DAG.getRoot(),
                            DAG.getConstant(ParamOffsetBytes, MVT::i32),
-                           MachinePointerInfo(new Argument(PtrTy)),
+                           MachinePointerInfo(UndefValue::get(PtrTy)),
                            ArgVT, false, false, ArgBytes);
     InVals.push_back(Arg);
     ParamOffsetBytes += ArgBytes;
@@ -904,6 +1070,121 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
       }
       break;
     }
+
+  // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) ->
+  // (i32 select_cc f32, f32, -1, 0 cc)
+  //
+  // Mesa's GLSL frontend generates the above pattern a lot and we can lower
+  // this to one of the SET*_DX10 instructions.
+  case ISD::FP_TO_SINT: {
+    SDValue FNeg = N->getOperand(0);
+    if (FNeg.getOpcode() != ISD::FNEG) {
+      return SDValue();
+    }
+    SDValue SelectCC = FNeg.getOperand(0);
+    if (SelectCC.getOpcode() != ISD::SELECT_CC ||
+        SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS
+        SelectCC.getOperand(2).getValueType() != MVT::f32 || // True
+        !isHWTrueValue(SelectCC.getOperand(2)) ||
+        !isHWFalseValue(SelectCC.getOperand(3))) {
+      return SDValue();
+    }
+
+    return DAG.getNode(ISD::SELECT_CC, N->getDebugLoc(), N->getValueType(0),
+                       SelectCC.getOperand(0), // LHS
+                       SelectCC.getOperand(1), // RHS
+                       DAG.getConstant(-1, MVT::i32), // True
+                       DAG.getConstant(0, MVT::i32),  // False
+                       SelectCC.getOperand(4)); // CC
+  }
+  // An EXTRACT_VECTOR_ELT of a BUILD_VECTOR generated by custom lowering
+  // also needs a custom combine
+  case ISD::EXTRACT_VECTOR_ELT: {
+    SDValue Arg = N->getOperand(0);
+    if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
+      if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
+        unsigned Element = Const->getZExtValue();
+        return Arg->getOperand(Element);
+      }
+    }
+    if (Arg.getOpcode() == ISD::BITCAST &&
+        Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
+      if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
+        unsigned Element = Const->getZExtValue();
+        return DAG.getNode(ISD::BITCAST, N->getDebugLoc(), N->getVTList(),
+            Arg->getOperand(0).getOperand(Element));
+      }
+    }
+    break;
+  }
+
+  case ISD::SELECT_CC: {
+    // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
+    // selectcc x, y, a, b, inv(cc)
+    SDValue LHS = N->getOperand(0);
+    if (LHS.getOpcode() != ISD::SELECT_CC) {
+      return SDValue();
+    }
+
+    SDValue RHS = N->getOperand(1);
+    SDValue True = N->getOperand(2);
+    SDValue False = N->getOperand(3);
+
+    if (LHS.getOperand(2).getNode() != True.getNode() ||
+        LHS.getOperand(3).getNode() != False.getNode() ||
+        RHS.getNode() != False.getNode() ||
+        cast<CondCodeSDNode>(N->getOperand(4))->get() != ISD::SETEQ) {
+      return SDValue();
+    }
+
+    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(LHS->getOperand(4))->get();
+    CCOpcode = ISD::getSetCCInverse(
+        CCOpcode, LHS.getOperand(0).getValueType().isInteger());
+    return DAG.getSelectCC(N->getDebugLoc(),
+                           LHS.getOperand(0),
+                           LHS.getOperand(1),
+                           LHS.getOperand(2),
+                           LHS.getOperand(3),
+                           CCOpcode);
+  }
+  case AMDGPUISD::EXPORT: {
+    SDValue Arg = N->getOperand(1);
+    if (Arg.getOpcode() != ISD::BUILD_VECTOR)
+      break;
+    SDValue NewBldVec[4] = {
+      DAG.getUNDEF(MVT::f32),
+      DAG.getUNDEF(MVT::f32),
+      DAG.getUNDEF(MVT::f32),
+      DAG.getUNDEF(MVT::f32)
+    };
+    SDValue NewArgs[8] = {
+      N->getOperand(0), // Chain
+      SDValue(),
+      N->getOperand(2), // ArrayBase
+      N->getOperand(3), // Type
+      N->getOperand(4), // SWZ_X
+      N->getOperand(5), // SWZ_Y
+      N->getOperand(6), // SWZ_Z
+      N->getOperand(7)  // SWZ_W
+    };
+    for (unsigned i = 0; i < Arg.getNumOperands(); i++) {
+      if (ConstantFPSDNode *C =
dyn_cast<ConstantFPSDNode>(Arg.getOperand(i))) {
+        if (C->isZero()) {
+          NewArgs[4 + i] = DAG.getConstant(4, MVT::i32); // SEL_0
+        } else if (C->isExactlyValue(1.0)) {
+          NewArgs[4 + i] = DAG.getConstant(5, MVT::i32); // SEL_1
+        } else {
+          NewBldVec[i] = Arg.getOperand(i);
+        }
+      } else {
+        NewBldVec[i] = Arg.getOperand(i);
+      }
+    }
+    DebugLoc DL = N->getDebugLoc();
+    NewArgs[1] = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4f32, NewBldVec, 4);
+    return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs, 8);
+  }
   }
   return SDValue();
 }
diff --git a/lib/Target/R600/R600ISelLowering.h b/lib/Target/R600/R600ISelLowering.h
index 2b954da..afa3897 100644
--- a/lib/Target/R600/R600ISelLowering.h
+++ b/lib/Target/R600/R600ISelLowering.h
@@ -63,7 +63,13 @@ private:
   SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFPOW(SDValue Op, SelectionDAG &DAG) const;
-
+  SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const;
+
+  SDValue stackPtrToRegIndex(SDValue Ptr, unsigned StackWidth,
+                             SelectionDAG &DAG) const;
+  void getStackAddress(unsigned StackWidth, unsigned ElemIdx,
+                       unsigned &Channel, unsigned &PtrIncr) const;
   bool isZero(SDValue Op) const;
 };
diff --git a/lib/Target/R600/R600InstrInfo.cpp b/lib/Target/R600/R600InstrInfo.cpp
index 06b78d0..7e3f005 100644
--- a/lib/Target/R600/R600InstrInfo.cpp
+++ b/lib/Target/R600/R600InstrInfo.cpp
@@ -16,8 +16,11 @@
 #include "AMDGPUSubtarget.h"
 #include "AMDGPUTargetMachine.h"
 #include "R600Defines.h"
+#include "R600MachineFunctionInfo.h"
 #include "R600RegisterInfo.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
 
 #define GET_INSTRINFO_CTOR
 #include "AMDGPUGenDFAPacketizer.inc"
@@ -104,7 +107,6 @@ bool R600InstrInfo::isPlaceHolderOpcode(unsigned Opcode) const {
   switch (Opcode) {
   default: return false;
   case AMDGPU::RETURN:
-  case AMDGPU::RESERVE_REG:
     return true;
   }
 }
@@ -466,6 +468,124 @@ unsigned int R600InstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
   return 2;
 }
 
+int R600InstrInfo::getIndirectIndexBegin(const MachineFunction &MF) const {
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  int Offset = 0;
+
+  if (MFI->getNumObjects() == 0) {
+    return -1;
+  }
+
+  if (MRI.livein_empty()) {
+    return 0;
+  }
+
+  for (MachineRegisterInfo::livein_iterator LI = MRI.livein_begin(),
+                                            LE = MRI.livein_end();
+                                            LI != LE; ++LI) {
+    Offset = std::max(Offset,
+                      GET_REG_INDEX(RI.getEncodingValue(LI->first)));
+  }
+
+  return Offset + 1;
+}
+
+int R600InstrInfo::getIndirectIndexEnd(const MachineFunction &MF) const {
+  int Offset = 0;
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+
+  // Variable sized objects are not supported
+  assert(!MFI->hasVarSizedObjects());
+
+  if (MFI->getNumObjects() == 0) {
+    return -1;
+  }
+
+  Offset = TM.getFrameLowering()->getFrameIndexOffset(MF, -1);
+
+  return getIndirectIndexBegin(MF) + Offset;
+}
+
+std::vector<unsigned> R600InstrInfo::getIndirectReservedRegs(
+                                             const MachineFunction &MF) const {
+  const AMDGPUFrameLowering *TFL =
+      static_cast<const AMDGPUFrameLowering*>(TM.getFrameLowering());
+  std::vector<unsigned> Regs;
+
+  unsigned StackWidth = TFL->getStackWidth(MF);
+  int End = getIndirectIndexEnd(MF);
+
+  if (End == -1) {
+    return Regs;
+  }
+
+  for (int Index = getIndirectIndexBegin(MF); Index <= End; ++Index) {
+    unsigned SuperReg =
AMDGPU::R600_Reg128RegClass.getRegister(Index); + Regs.push_back(SuperReg); + for (unsigned Chan = 0; Chan < StackWidth; ++Chan) { + unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister((4 * Index) + Chan); + Regs.push_back(Reg); + } + } + return Regs; +} + +unsigned R600InstrInfo::calculateIndirectAddress(unsigned RegIndex, + unsigned Channel) const { + // XXX: Remove when we support a stack width > 2 + assert(Channel == 0); + return RegIndex; +} + +const TargetRegisterClass * R600InstrInfo::getIndirectAddrStoreRegClass( + unsigned SourceReg) const { + return &AMDGPU::R600_TReg32RegClass; +} + +const TargetRegisterClass *R600InstrInfo::getIndirectAddrLoadRegClass() const { + return &AMDGPU::TRegMemRegClass; +} + +MachineInstrBuilder R600InstrInfo::buildIndirectWrite(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, unsigned Address, + unsigned OffsetReg) const { + unsigned AddrReg = AMDGPU::R600_AddrRegClass.getRegister(Address); + MachineInstr *MOVA = buildDefaultInstruction(*MBB, I, AMDGPU::MOVA_INT_eg, + AMDGPU::AR_X, OffsetReg); + setImmOperand(MOVA, R600Operands::WRITE, 0); + + MachineInstrBuilder Mov = buildDefaultInstruction(*MBB, I, AMDGPU::MOV, + AddrReg, ValueReg) + .addReg(AMDGPU::AR_X, RegState::Implicit); + setImmOperand(Mov, R600Operands::DST_REL, 1); + return Mov; +} + +MachineInstrBuilder R600InstrInfo::buildIndirectRead(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, unsigned Address, + unsigned OffsetReg) const { + unsigned AddrReg = AMDGPU::R600_AddrRegClass.getRegister(Address); + MachineInstr *MOVA = buildDefaultInstruction(*MBB, I, AMDGPU::MOVA_INT_eg, + AMDGPU::AR_X, + OffsetReg); + setImmOperand(MOVA, R600Operands::WRITE, 0); + MachineInstrBuilder Mov = buildDefaultInstruction(*MBB, I, AMDGPU::MOV, + ValueReg, + AddrReg) + .addReg(AMDGPU::AR_X, RegState::Implicit); + setImmOperand(Mov, R600Operands::SRC0_REL, 1); + + return Mov; +} + +const TargetRegisterClass *R600InstrInfo::getSuperIndirectRegClass() const { + return &AMDGPU::IndirectRegRegClass; +} + + MachineInstrBuilder R600InstrInfo::buildDefaultInstruction(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, unsigned Opcode, @@ -486,13 +606,15 @@ MachineInstrBuilder R600InstrInfo::buildDefaultInstruction(MachineBasicBlock &MB .addReg(Src0Reg) // $src0 .addImm(0) // $src0_neg .addImm(0) // $src0_rel - .addImm(0); // $src0_abs + .addImm(0) // $src0_abs + .addImm(-1); // $src0_sel if (Src1Reg) { MIB.addReg(Src1Reg) // $src1 .addImm(0) // $src1_neg .addImm(0) // $src1_rel - .addImm(0); // $src1_abs + .addImm(0) // $src1_abs + .addImm(-1); // $src1_sel } //XXX: The r600g finalizer expects this to be 1, once we've moved the @@ -521,16 +643,6 @@ int R600InstrInfo::getOperandIdx(const MachineInstr &MI, int R600InstrInfo::getOperandIdx(unsigned Opcode, R600Operands::Ops Op) const { - const static int OpTable[3][R600Operands::COUNT] = { -// W C S S S S S S S S -// R O D L S R R R S R R R S R R L P -// D U I M R A R C C C C C C C R C C A R I -// S E U T O E M C 0 0 0 C 1 1 1 C 2 2 S E M -// T M P E D L P 0 N R A 1 N R A 2 N R T D M - {0,-1,-1, 1, 2, 3, 4, 5, 6, 7, 8,-1,-1,-1,-1,-1,-1,-1, 9,10,11}, - {0, 1, 2, 3, 4 ,5 ,6 ,7, 8, 9,10,11,12,-1,-1,-1,13,14,15,16,17}, - {0,-1,-1,-1,-1, 1, 2, 3, 4, 5,-1, 6, 7, 8,-1, 9,10,11,12,13,14} - }; unsigned TargetFlags = get(Opcode).TSFlags; unsigned OpTableIdx; @@ -556,7 +668,7 @@ int R600InstrInfo::getOperandIdx(unsigned Opcode, OpTableIdx = 2; } - return OpTable[OpTableIdx][Op]; + return 
R600Operands::ALUOpTable[OpTableIdx][Op];
 }
 
 void R600InstrInfo::setImmOperand(MachineInstr *MI, R600Operands::Ops Op,
diff --git a/lib/Target/R600/R600InstrInfo.h b/lib/Target/R600/R600InstrInfo.h
index 11685af..efe721c 100644
--- a/lib/Target/R600/R600InstrInfo.h
+++ b/lib/Target/R600/R600InstrInfo.h
@@ -113,6 +113,38 @@ namespace llvm {
   virtual int getInstrLatency(const InstrItineraryData *ItinData,
                               SDNode *Node) const { return 1;}
 
+  /// \returns a list of all the registers that may be accessed using indirect
+  /// addressing.
+  std::vector<unsigned> getIndirectReservedRegs(const MachineFunction &MF) const;
+
+  virtual int getIndirectIndexBegin(const MachineFunction &MF) const;
+
+  virtual int getIndirectIndexEnd(const MachineFunction &MF) const;
+
+
+  virtual unsigned calculateIndirectAddress(unsigned RegIndex,
+                                            unsigned Channel) const;
+
+  virtual const TargetRegisterClass *getIndirectAddrStoreRegClass(
+                                                  unsigned SourceReg) const;
+
+  virtual const TargetRegisterClass *getIndirectAddrLoadRegClass() const;
+
+  virtual MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB,
+                                  MachineBasicBlock::iterator I,
+                                  unsigned ValueReg, unsigned Address,
+                                  unsigned OffsetReg) const;
+
+  virtual MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB,
+                                  MachineBasicBlock::iterator I,
+                                  unsigned ValueReg, unsigned Address,
+                                  unsigned OffsetReg) const;
+
+  virtual const TargetRegisterClass *getSuperIndirectRegClass() const;
+
+
+  /// buildDefaultInstruction - This function returns a MachineInstr with
+  /// all the instruction modifiers initialized to their default values.
   /// You can use this function to avoid manually specifying each instruction
   /// modifier operand when building a new instruction.
   ///
diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td
index 64bab18..8242df9 100644
--- a/lib/Target/R600/R600Instructions.td
+++ b/lib/Target/R600/R600Instructions.td
@@ -70,6 +70,11 @@ class InstFlag<string PM = "printOperand", int Default = 0>
   let PrintMethod = PM;
 }
 
+// src_sel for ALU src operands, see also ALU_CONST, ALU_PARAM registers
+def SEL : OperandWithDefaultOps <i32, (ops (i32 -1))> {
+  let PrintMethod = "printSel";
+}
+
 def LITERAL : InstFlag<"printLiteral">;
 
 def WRITE : InstFlag <"printWrite", 1>;
@@ -86,9 +91,16 @@ def UP : InstFlag <"printUpdatePred">;
 // default to 0.
def LAST : InstFlag<"printLast", 1>; +def FRAMEri : Operand<iPTR> { + let MIOperandInfo = (ops R600_Reg32:$ptr, i32imm:$index); +} + def ADDRParam : ComplexPattern<i32, 2, "SelectADDRParam", [], []>; def ADDRDWord : ComplexPattern<i32, 1, "SelectADDRDWord", [], []>; def ADDRVTX_READ : ComplexPattern<i32, 2, "SelectADDRVTX_READ", [], []>; +def ADDRGA_CONST_OFFSET : ComplexPattern<i32, 1, "SelectGlobalValueConstantOffset", [], []>; +def ADDRGA_VAR_OFFSET : ComplexPattern<i32, 2, "SelectGlobalValueVariableOffset", [], []>; +def ADDRIndirect : ComplexPattern<iPTR, 2, "SelectADDRIndirect", [], []>; class R600ALU_Word0 { field bits<32> Word0; @@ -173,6 +185,55 @@ class R600ALU_Word1_OP3 <bits<5> alu_inst> : R600ALU_Word1{ let Word1{17-13} = alu_inst; } +class VTX_WORD0 { + field bits<32> Word0; + bits<7> SRC_GPR; + bits<5> VC_INST; + bits<2> FETCH_TYPE; + bits<1> FETCH_WHOLE_QUAD; + bits<8> BUFFER_ID; + bits<1> SRC_REL; + bits<2> SRC_SEL_X; + bits<6> MEGA_FETCH_COUNT; + + let Word0{4-0} = VC_INST; + let Word0{6-5} = FETCH_TYPE; + let Word0{7} = FETCH_WHOLE_QUAD; + let Word0{15-8} = BUFFER_ID; + let Word0{22-16} = SRC_GPR; + let Word0{23} = SRC_REL; + let Word0{25-24} = SRC_SEL_X; + let Word0{31-26} = MEGA_FETCH_COUNT; +} + +class VTX_WORD1_GPR { + field bits<32> Word1; + bits<7> DST_GPR; + bits<1> DST_REL; + bits<3> DST_SEL_X; + bits<3> DST_SEL_Y; + bits<3> DST_SEL_Z; + bits<3> DST_SEL_W; + bits<1> USE_CONST_FIELDS; + bits<6> DATA_FORMAT; + bits<2> NUM_FORMAT_ALL; + bits<1> FORMAT_COMP_ALL; + bits<1> SRF_MODE_ALL; + + let Word1{6-0} = DST_GPR; + let Word1{7} = DST_REL; + let Word1{8} = 0; // Reserved + let Word1{11-9} = DST_SEL_X; + let Word1{14-12} = DST_SEL_Y; + let Word1{17-15} = DST_SEL_Z; + let Word1{20-18} = DST_SEL_W; + let Word1{21} = USE_CONST_FIELDS; + let Word1{27-22} = DATA_FORMAT; + let Word1{29-28} = NUM_FORMAT_ALL; + let Word1{30} = FORMAT_COMP_ALL; + let Word1{31} = SRF_MODE_ALL; +} + /* XXX: R600 subtarget uses a slightly different encoding than the other subtargets. 
We currently handle this in R600MCCodeEmitter, but we may @@ -214,11 +275,11 @@ class R600_1OP <bits<11> inst, string opName, list<dag> pattern, InstR600 <0, (outs R600_Reg32:$dst), (ins WRITE:$write, OMOD:$omod, REL:$dst_rel, CLAMP:$clamp, - R600_Reg32:$src0, NEG:$src0_neg, REL:$src0_rel, ABS:$src0_abs, + R600_Reg32:$src0, NEG:$src0_neg, REL:$src0_rel, ABS:$src0_abs, SEL:$src0_sel, LAST:$last, R600_Pred:$pred_sel, LITERAL:$literal), !strconcat(opName, "$clamp $dst$write$dst_rel$omod, " - "$src0_neg$src0_abs$src0$src0_abs$src0_rel, " + "$src0_neg$src0_abs$src0$src0_sel$src0_abs$src0_rel, " "$literal $pred_sel$last"), pattern, itin>, @@ -254,13 +315,13 @@ class R600_2OP <bits<11> inst, string opName, list<dag> pattern, (outs R600_Reg32:$dst), (ins UEM:$update_exec_mask, UP:$update_pred, WRITE:$write, OMOD:$omod, REL:$dst_rel, CLAMP:$clamp, - R600_Reg32:$src0, NEG:$src0_neg, REL:$src0_rel, ABS:$src0_abs, - R600_Reg32:$src1, NEG:$src1_neg, REL:$src1_rel, ABS:$src1_abs, + R600_Reg32:$src0, NEG:$src0_neg, REL:$src0_rel, ABS:$src0_abs, SEL:$src0_sel, + R600_Reg32:$src1, NEG:$src1_neg, REL:$src1_rel, ABS:$src1_abs, SEL:$src1_sel, LAST:$last, R600_Pred:$pred_sel, LITERAL:$literal), !strconcat(opName, "$clamp $update_exec_mask$update_pred$dst$write$dst_rel$omod, " - "$src0_neg$src0_abs$src0$src0_abs$src0_rel, " - "$src1_neg$src1_abs$src1$src1_abs$src1_rel, " + "$src0_neg$src0_abs$src0$src0_sel$src0_abs$src0_rel, " + "$src1_neg$src1_abs$src1$src1_sel$src1_abs$src1_rel, " "$literal $pred_sel$last"), pattern, itin>, @@ -291,14 +352,14 @@ class R600_3OP <bits<5> inst, string opName, list<dag> pattern, InstR600 <0, (outs R600_Reg32:$dst), (ins REL:$dst_rel, CLAMP:$clamp, - R600_Reg32:$src0, NEG:$src0_neg, REL:$src0_rel, - R600_Reg32:$src1, NEG:$src1_neg, REL:$src1_rel, - R600_Reg32:$src2, NEG:$src2_neg, REL:$src2_rel, + R600_Reg32:$src0, NEG:$src0_neg, REL:$src0_rel, SEL:$src0_sel, + R600_Reg32:$src1, NEG:$src1_neg, REL:$src1_rel, SEL:$src1_sel, + R600_Reg32:$src2, NEG:$src2_neg, REL:$src2_rel, SEL:$src2_sel, LAST:$last, R600_Pred:$pred_sel, LITERAL:$literal), !strconcat(opName, "$clamp $dst$dst_rel, " - "$src0_neg$src0$src0_rel, " - "$src1_neg$src1$src1_rel, " - "$src2_neg$src2$src2_rel, " + "$src0_neg$src0$src0_sel$src0_rel, " + "$src1_neg$src1$src1_sel$src1_rel, " + "$src2_neg$src2$src2_sel$src2_rel, " "$literal $pred_sel$last"), pattern, itin>, @@ -342,6 +403,27 @@ def TEX_SHADOW : PatLeaf< }] >; +def TEX_RECT : PatLeaf< + (imm), + [{uint32_t TType = (uint32_t)N->getZExtValue(); + return TType == 5; + }] +>; + +def TEX_ARRAY : PatLeaf< + (imm), + [{uint32_t TType = (uint32_t)N->getZExtValue(); + return TType == 9 || TType == 10 || TType == 15 || TType == 16; + }] +>; + +def TEX_SHADOW_ARRAY : PatLeaf< + (imm), + [{uint32_t TType = (uint32_t)N->getZExtValue(); + return TType == 11 || TType == 12 || TType == 17; + }] +>; + class EG_CF_RAT <bits <8> cf_inst, bits <6> rat_inst, bits<4> rat_id, dag outs, dag ins, string asm, list<dag> pattern> : InstR600ISA <outs, ins, asm, pattern> { @@ -414,32 +496,35 @@ def isR600toCayman : Predicate< "Subtarget.device()->getGeneration() <= AMDGPUDeviceInfo::HD6XXX">; //===----------------------------------------------------------------------===// -// Interpolation Instructions +// R600 SDNodes //===----------------------------------------------------------------------===// -def INTERP: SDNode<"AMDGPUISD::INTERP", - SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisInt<1>, SDTCisInt<2>]> - >; +def INTERP_PAIR_XY : AMDGPUShaderInst < + (outs R600_TReg32_X:$dst0, 
R600_TReg32_Y:$dst1), + (ins i32imm:$src0, R600_Reg32:$src1, R600_Reg32:$src2), + "INTERP_PAIR_XY $src0 $src1 $src2 : $dst0 dst1", + []>; -def INTERP_P0: SDNode<"AMDGPUISD::INTERP_P0", - SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisInt<1>]> - >; +def INTERP_PAIR_ZW : AMDGPUShaderInst < + (outs R600_TReg32_Z:$dst0, R600_TReg32_W:$dst1), + (ins i32imm:$src0, R600_Reg32:$src1, R600_Reg32:$src2), + "INTERP_PAIR_ZW $src0 $src1 $src2 : $dst0 dst1", + []>; -let usesCustomInserter = 1 in { -def input_perspective : AMDGPUShaderInst < - (outs R600_Reg128:$dst), - (ins i32imm:$src0, i32imm:$src1), - "input_perspective $src0 $src1 : dst", - [(set R600_Reg128:$dst, (INTERP (i32 imm:$src0), (i32 imm:$src1)))]>; -} // End usesCustomInserter = 1 - -def input_constant : AMDGPUShaderInst < - (outs R600_Reg128:$dst), - (ins i32imm:$src), - "input_perspective $src : dst", - [(set R600_Reg128:$dst, (INTERP_P0 (i32 imm:$src)))]>; +def CONST_ADDRESS: SDNode<"AMDGPUISD::CONST_ADDRESS", + SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisPtrTy<1>]>, + [SDNPMayLoad] +>; +//===----------------------------------------------------------------------===// +// Interpolation Instructions +//===----------------------------------------------------------------------===// +def INTERP_VEC_LOAD : AMDGPUShaderInst < + (outs R600_Reg128:$dst), + (ins i32imm:$src0), + "INTERP_LOAD $src0 : $dst", + []>; def INTERP_XY : R600_2OP <0xD6, "INTERP_XY", []> { let bank_swizzle = 5; @@ -455,7 +540,7 @@ def INTERP_LOAD_P0 : R600_1OP <0xE0, "INTERP_LOAD_P0", []>; // Export Instructions //===----------------------------------------------------------------------===// -def ExportType : SDTypeProfile<0, 5, [SDTCisFP<0>, SDTCisInt<1>]>; +def ExportType : SDTypeProfile<0, 7, [SDTCisFP<0>, SDTCisInt<1>]>; def EXPORT: SDNode<"AMDGPUISD::EXPORT", ExportType, [SDNPHasChain, SDNPSideEffect]>; @@ -507,53 +592,59 @@ class ExportBufWord1 { multiclass ExportPattern<Instruction ExportInst, bits<8> cf_inst> { def : Pat<(int_R600_store_pixel_depth R600_Reg32:$reg), (ExportInst - (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), R600_Reg32:$reg, sel_x), + (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), R600_Reg32:$reg, sub0), 0, 61, 0, 7, 7, 7, cf_inst, 0) >; def : Pat<(int_R600_store_pixel_stencil R600_Reg32:$reg), (ExportInst - (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), R600_Reg32:$reg, sel_x), + (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), R600_Reg32:$reg, sub0), 0, 61, 7, 0, 7, 7, cf_inst, 0) >; - def : Pat<(int_R600_store_pixel_dummy), + def : Pat<(int_R600_store_dummy (i32 imm:$type)), (ExportInst - (v4f32 (IMPLICIT_DEF)), 0, 0, 7, 7, 7, 7, cf_inst, 0) + (v4f32 (IMPLICIT_DEF)), imm:$type, 0, 7, 7, 7, 7, cf_inst, 0) >; - def : Pat<(EXPORT (v4f32 R600_Reg128:$src), (i32 0), - (i32 imm:$type), (i32 imm:$arraybase), (i32 imm)), - (ExportInst R600_Reg128:$src, imm:$type, imm:$arraybase, - 0, 1, 2, 3, cf_inst, 0) + def : Pat<(int_R600_store_dummy 1), + (ExportInst + (v4f32 (IMPLICIT_DEF)), 1, 60, 7, 7, 7, 7, cf_inst, 0) + >; + + def : Pat<(EXPORT (v4f32 R600_Reg128:$src), (i32 imm:$base), (i32 imm:$type), + (i32 imm:$swz_x), (i32 imm:$swz_y), (i32 imm:$swz_z), (i32 imm:$swz_w)), + (ExportInst R600_Reg128:$src, imm:$type, imm:$base, + imm:$swz_x, imm:$swz_y, imm:$swz_z, imm:$swz_w, cf_inst, 0) >; + } multiclass SteamOutputExportPattern<Instruction ExportInst, bits<8> buf0inst, bits<8> buf1inst, bits<8> buf2inst, bits<8> buf3inst> { // Stream0 - def : Pat<(EXPORT (v4f32 R600_Reg128:$src), (i32 1), - (i32 imm:$type), (i32 imm:$arraybase), (i32 imm:$mask)), - (ExportInst R600_Reg128:$src, imm:$type, 
imm:$arraybase, + def : Pat<(int_R600_store_stream_output (v4f32 R600_Reg128:$src), + (i32 imm:$arraybase), (i32 0), (i32 imm:$mask)), + (ExportInst R600_Reg128:$src, 0, imm:$arraybase, 4095, imm:$mask, buf0inst, 0)>; // Stream1 - def : Pat<(EXPORT (v4f32 R600_Reg128:$src), (i32 2), - (i32 imm:$type), (i32 imm:$arraybase), (i32 imm:$mask)), - (ExportInst R600_Reg128:$src, imm:$type, imm:$arraybase, + def : Pat<(int_R600_store_stream_output (v4f32 R600_Reg128:$src), + (i32 imm:$arraybase), (i32 1), (i32 imm:$mask)), + (ExportInst R600_Reg128:$src, 0, imm:$arraybase, 4095, imm:$mask, buf1inst, 0)>; // Stream2 - def : Pat<(EXPORT (v4f32 R600_Reg128:$src), (i32 3), - (i32 imm:$type), (i32 imm:$arraybase), (i32 imm:$mask)), - (ExportInst R600_Reg128:$src, imm:$type, imm:$arraybase, + def : Pat<(int_R600_store_stream_output (v4f32 R600_Reg128:$src), + (i32 imm:$arraybase), (i32 2), (i32 imm:$mask)), + (ExportInst R600_Reg128:$src, 0, imm:$arraybase, 4095, imm:$mask, buf2inst, 0)>; // Stream3 - def : Pat<(EXPORT (v4f32 R600_Reg128:$src), (i32 4), - (i32 imm:$type), (i32 imm:$arraybase), (i32 imm:$mask)), - (ExportInst R600_Reg128:$src, imm:$type, imm:$arraybase, + def : Pat<(int_R600_store_stream_output (v4f32 R600_Reg128:$src), + (i32 imm:$arraybase), (i32 3), (i32 imm:$mask)), + (ExportInst R600_Reg128:$src, 0, imm:$arraybase, 4095, imm:$mask, buf3inst, 0)>; } -let isTerminator = 1, usesCustomInserter = 1 in { +let usesCustomInserter = 1 in { class ExportSwzInst : InstR600ISA<( outs), @@ -567,7 +658,7 @@ class ExportSwzInst : InstR600ISA<( let Inst{63-32} = Word1; } -} // End isTerminator = 1, usesCustomInserter = 1 +} // End usesCustomInserter = 1 class ExportBufInst : InstR600ISA<( outs), @@ -580,7 +671,7 @@ class ExportBufInst : InstR600ISA<( let Inst{63-32} = Word1; } -let Predicates = [isR600toCayman] in { +let Predicates = [isR600toCayman] in { //===----------------------------------------------------------------------===// // Common Instructions R600, R700, Evergreen, Cayman @@ -624,6 +715,34 @@ def SNE : R600_2OP < COND_NE))] >; +def SETE_DX10 : R600_2OP < + 0xC, "SETE_DX10", + [(set R600_Reg32:$dst, + (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, (i32 -1), (i32 0), + COND_EQ))] +>; + +def SETGT_DX10 : R600_2OP < + 0xD, "SETGT_DX10", + [(set R600_Reg32:$dst, + (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, (i32 -1), (i32 0), + COND_GT))] +>; + +def SETGE_DX10 : R600_2OP < + 0xE, "SETGE_DX10", + [(set R600_Reg32:$dst, + (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, (i32 -1), (i32 0), + COND_GE))] +>; + +def SETNE_DX10 : R600_2OP < + 0xF, "SETNE_DX10", + [(set R600_Reg32:$dst, + (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, (i32 -1), (i32 0), + COND_NE))] +>; + def FRACT : R600_1OP_Helper <0x10, "FRACT", AMDGPUfract>; def TRUNC : R600_1OP_Helper <0x11, "TRUNC", int_AMDGPU_trunc>; def CEIL : R600_1OP_Helper <0x12, "CEIL", fceil>; @@ -684,7 +803,7 @@ def SETE_INT : R600_2OP < >; def SETGT_INT : R600_2OP < - 0x3B, "SGT_INT", + 0x3B, "SETGT_INT", [(set (i32 R600_Reg32:$dst), (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETGT))] >; @@ -830,8 +949,13 @@ class MUL_LIT_Common <bits<5> inst> : R600_3OP < class MULADD_Common <bits<5> inst> : R600_3OP < inst, "MULADD", + [] +>; + +class MULADD_IEEE_Common <bits<5> inst> : R600_3OP < + inst, "MULADD_IEEE", [(set (f32 R600_Reg32:$dst), - (IL_mad R600_Reg32:$src0, R600_Reg32:$src1, R600_Reg32:$src2))] + (fadd (fmul R600_Reg32:$src0, R600_Reg32:$src1), R600_Reg32:$src2))] >; class CNDE_Common <bits<5> inst> : 
R600_3OP < @@ -988,6 +1112,7 @@ let Predicates = [isR600] in { def MUL_LIT_r600 : MUL_LIT_Common<0x0C>; def MULADD_r600 : MULADD_Common<0x10>; + def MULADD_IEEE_r600 : MULADD_IEEE_Common<0x14>; def CNDE_r600 : CNDE_Common<0x18>; def CNDGT_r600 : CNDGT_Common<0x19>; def CNDGE_r600 : CNDGE_Common<0x1A>; @@ -1070,7 +1195,7 @@ let Predicates = [isR700] in { //===----------------------------------------------------------------------===// let Predicates = [isEG] in { - + def RECIP_IEEE_eg : RECIP_IEEE_Common<0x86>; defm DIV_eg : DIV_Common<RECIP_IEEE_eg>; @@ -1127,6 +1252,7 @@ let Predicates = [isEGorCayman] in { >; def MULADD_eg : MULADD_Common<0x14>; + def MULADD_IEEE_eg : MULADD_IEEE_Common<0x18>; def ASHR_eg : ASHR_Common<0x15>; def LSHR_eg : LSHR_Common<0x16>; def LSHL_eg : LSHL_Common<0x17>; @@ -1138,6 +1264,10 @@ let Predicates = [isEGorCayman] in { defm DOT4_eg : DOT4_Common<0xBE>; defm CUBE_eg : CUBE_Common<0xC0>; +let hasSideEffects = 1 in { + def MOVA_INT_eg : R600_1OP <0xCC, "MOVA_INT", []>; +} + def TGSI_LIT_Z_eg : TGSI_LIT_Z_Common<MUL_LIT_eg, LOG_CLAMPED_eg, EXP_IEEE_eg>; def FLT_TO_INT_eg : FLT_TO_INT_Common<0x50> { @@ -1228,37 +1358,30 @@ def RAT_WRITE_CACHELESS_128_eg : RAT_WRITE_CACHELESS_eg < >; class VTX_READ_eg <string name, bits<8> buffer_id, dag outs, list<dag> pattern> - : InstR600ISA <outs, (ins MEMxi:$ptr), name#" $dst, $ptr", pattern> { - - // Operands - bits<7> DST_GPR; - bits<7> SRC_GPR; + : InstR600ISA <outs, (ins MEMxi:$ptr), name#" $dst, $ptr", pattern>, + VTX_WORD1_GPR, VTX_WORD0 { // Static fields - bits<5> VC_INST = 0; - bits<2> FETCH_TYPE = 2; - bits<1> FETCH_WHOLE_QUAD = 0; - bits<8> BUFFER_ID = buffer_id; - bits<1> SRC_REL = 0; + let VC_INST = 0; + let FETCH_TYPE = 2; + let FETCH_WHOLE_QUAD = 0; + let BUFFER_ID = buffer_id; + let SRC_REL = 0; // XXX: We can infer this field based on the SRC_GPR. This would allow us // to store vertex addresses in any channel, not just X. - bits<2> SRC_SEL_X = 0; - bits<6> MEGA_FETCH_COUNT; - bits<1> DST_REL = 0; - bits<3> DST_SEL_X; - bits<3> DST_SEL_Y; - bits<3> DST_SEL_Z; - bits<3> DST_SEL_W; + let SRC_SEL_X = 0; + let DST_REL = 0; // The docs say that if this bit is set, then DATA_FORMAT, NUM_FORMAT_ALL, // FORMAT_COMP_ALL, SRF_MODE_ALL, and ENDIAN_SWAP fields will be ignored, // however, based on my testing if USE_CONST_FIELDS is set, then all // these fields need to be set to 0. 
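For concreteness, the two configurations this patch itself uses, summarized from the TEX_VTX_CONSTBUF and TEX_VTX_TEXBUF definitions added later in this file (a summary of the patch's own values, not a hardware spec):

    // USE_CONST_FIELDS = 0: the format is spelled out in the instruction,
    //   e.g. DATA_FORMAT = 35, NUM_FORMAT_ALL = 2, FORMAT_COMP_ALL = 1
    // USE_CONST_FIELDS = 1: the format comes from the fetch constant, and
    //   DATA_FORMAT, NUM_FORMAT_ALL and FORMAT_COMP_ALL are all set to 0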
-  bits<1> USE_CONST_FIELDS = 0;
-  bits<6> DATA_FORMAT;
-  bits<2> NUM_FORMAT_ALL = 1;
-  bits<1> FORMAT_COMP_ALL = 0;
-  bits<1> SRF_MODE_ALL = 0;
+  let USE_CONST_FIELDS = 0;
+  let NUM_FORMAT_ALL = 1;
+  let FORMAT_COMP_ALL = 0;
+  let SRF_MODE_ALL = 0;
+  let Inst{31-0} = Word0;
+  let Inst{63-32} = Word1;
 
   // LLVM can only encode 64-bit instructions, so these fields are manually
   // encoded in R600CodeEmitter
   //
@@ -1269,29 +1392,7 @@ class VTX_READ_eg <string name, bits<8> buffer_id, dag outs, list<dag> pattern>
   // bits<1> ALT_CONST = 0;
   // bits<2> BUFFER_INDEX_MODE = 0;
 
-  // VTX_WORD0
-  let Inst{4-0} = VC_INST;
-  let Inst{6-5} = FETCH_TYPE;
-  let Inst{7} = FETCH_WHOLE_QUAD;
-  let Inst{15-8} = BUFFER_ID;
-  let Inst{22-16} = SRC_GPR;
-  let Inst{23} = SRC_REL;
-  let Inst{25-24} = SRC_SEL_X;
-  let Inst{31-26} = MEGA_FETCH_COUNT;
-
-  // VTX_WORD1_GPR
-  let Inst{38-32} = DST_GPR;
-  let Inst{39} = DST_REL;
-  let Inst{40} = 0; // Reserved
-  let Inst{43-41} = DST_SEL_X;
-  let Inst{46-44} = DST_SEL_Y;
-  let Inst{49-47} = DST_SEL_Z;
-  let Inst{52-50} = DST_SEL_W;
-  let Inst{53} = USE_CONST_FIELDS;
-  let Inst{59-54} = DATA_FORMAT;
-  let Inst{61-60} = NUM_FORMAT_ALL;
-  let Inst{62} = FORMAT_COMP_ALL;
-  let Inst{63} = SRF_MODE_ALL;
+
   // VTX_WORD2 (LLVM can only encode 64-bit instructions, so WORD2 encoding
   // is done in R600CodeEmitter
@@ -1346,7 +1447,7 @@ class VTX_READ_32_eg <bits<8> buffer_id, list<dag> pattern>
 
   // This is not really necessary, but there were some GPU hangs that appeared
   // to be caused by ALU instructions in the next instruction group that wrote
-  // to the $ptr registers of the VTX_READ. 
+  // to the $ptr registers of the VTX_READ.
   // e.g.
   // %T3_X<def> = VTX_READ_PARAM_32_eg %T2_X<kill>, 24
   // %T2_X<def> = MOV %ZERO
@@ -1387,6 +1488,10 @@ def VTX_READ_PARAM_32_eg : VTX_READ_32_eg <0,
   [(set (i32 R600_TReg32_X:$dst), (load_param ADDRVTX_READ:$ptr))]
 >;
 
+def VTX_READ_PARAM_128_eg : VTX_READ_128_eg <0,
+  [(set (v4i32 R600_Reg128:$dst), (load_param ADDRVTX_READ:$ptr))]
+>;
+
 //===----------------------------------------------------------------------===//
 // VTX Read from global memory space
 //===----------------------------------------------------------------------===//
@@ -1417,9 +1522,15 @@ def CONSTANT_LOAD_eg : VTX_READ_32_eg <1,
 
 }
 
+//===----------------------------------------------------------------------===//
+// Register loads and stores - for indirect addressing
+//===----------------------------------------------------------------------===//
+
+defm R600_ : RegisterLoadStore <R600_Reg32, FRAMEri, ADDRIndirect>;
+
 let Predicates = [isCayman] in {
 
-let isVector = 1 in { 
+let isVector = 1 in {
 
 def RECIP_IEEE_cm : RECIP_IEEE_Common<0x86>;
 
@@ -1476,6 +1587,7 @@ def PRED_X : InstR600 <
   (ins R600_Reg32:$src0, i32imm:$src1, i32imm:$flags),
   "", [], NullALU> {
   let FlagOperandIdx = 3;
+  let isTerminator = 1;
 }
 
 let isTerminator = 1, isBranch = 1, isBarrier = 1 in {
@@ -1502,19 +1614,6 @@ def MASK_WRITE : AMDGPUShaderInst <
 
 } // End mayLoad = 0, mayStore = 0, hasSideEffects = 1
 
-def R600_LOAD_CONST : AMDGPUShaderInst <
-  (outs R600_Reg32:$dst),
-  (ins i32imm:$src0),
-  "R600_LOAD_CONST $dst, $src0",
-  [(set R600_Reg32:$dst, (int_AMDGPU_load_const imm:$src0))]
->;
-
-def RESERVE_REG : AMDGPUShaderInst <
-  (outs),
-  (ins i32imm:$src),
-  "RESERVE_REG $src",
-  [(int_AMDGPU_reserve_reg imm:$src)]
->;
 
 def TXD: AMDGPUShaderInst <
   (outs R600_Reg128:$dst),
@@ -1540,11 +1639,138 @@ def FNEG_R600 : FNEG<R600_Reg32>;
 //===---------------------------------------------------------------------===//
 // Return
instruction //===---------------------------------------------------------------------===// -let isTerminator = 1, isReturn = 1, isBarrier = 1, hasCtrlDep = 1 in { +let isTerminator = 1, isReturn = 1, isBarrier = 1, hasCtrlDep = 1, + usesCustomInserter = 1 in { def RETURN : ILFormat<(outs), (ins variable_ops), "RETURN", [(IL_retflag)]>; } + +//===----------------------------------------------------------------------===// +// Constant Buffer Addressing Support +//===----------------------------------------------------------------------===// + +let isCodeGenOnly = 1, isPseudo = 1, Namespace = "AMDGPU" in { +def CONST_COPY : Instruction { + let OutOperandList = (outs R600_Reg32:$dst); + let InOperandList = (ins i32imm:$src); + let Pattern = [(set R600_Reg32:$dst, (CONST_ADDRESS ADDRGA_CONST_OFFSET:$src))]; + let AsmString = "CONST_COPY"; + let neverHasSideEffects = 1; + let isAsCheapAsAMove = 1; + let Itinerary = NullALU; +} +} // end isCodeGenOnly = 1, isPseudo = 1, Namespace = "AMDGPU" + +def TEX_VTX_CONSTBUF : + InstR600ISA <(outs R600_Reg128:$dst), (ins MEMxi:$ptr), "VTX_READ_eg $dst, $ptr", + [(set R600_Reg128:$dst, (CONST_ADDRESS ADDRGA_VAR_OFFSET:$ptr))]>, + VTX_WORD1_GPR, VTX_WORD0 { + + let VC_INST = 0; + let FETCH_TYPE = 2; + let FETCH_WHOLE_QUAD = 0; + let BUFFER_ID = 0; + let SRC_REL = 0; + let SRC_SEL_X = 0; + let DST_REL = 0; + let USE_CONST_FIELDS = 0; + let NUM_FORMAT_ALL = 2; + let FORMAT_COMP_ALL = 1; + let SRF_MODE_ALL = 1; + let MEGA_FETCH_COUNT = 16; + let DST_SEL_X = 0; + let DST_SEL_Y = 1; + let DST_SEL_Z = 2; + let DST_SEL_W = 3; + let DATA_FORMAT = 35; + + let Inst{31-0} = Word0; + let Inst{63-32} = Word1; + +// LLVM can only encode 64-bit instructions, so these fields are manually +// encoded in R600CodeEmitter +// +// bits<16> OFFSET; +// bits<2> ENDIAN_SWAP = 0; +// bits<1> CONST_BUF_NO_STRIDE = 0; +// bits<1> MEGA_FETCH = 0; +// bits<1> ALT_CONST = 0; +// bits<2> BUFFER_INDEX_MODE = 0; + + + +// VTX_WORD2 (LLVM can only encode 64-bit instructions, so WORD2 encoding +// is done in R600CodeEmitter +// +// Inst{79-64} = OFFSET; +// Inst{81-80} = ENDIAN_SWAP; +// Inst{82} = CONST_BUF_NO_STRIDE; +// Inst{83} = MEGA_FETCH; +// Inst{84} = ALT_CONST; +// Inst{86-85} = BUFFER_INDEX_MODE; +// Inst{95-86} = 0; Reserved + +// VTX_WORD3 (Padding) +// +// Inst{127-96} = 0; +} + +def TEX_VTX_TEXBUF: + InstR600ISA <(outs R600_Reg128:$dst), (ins MEMxi:$ptr, i32imm:$BUFFER_ID), "TEX_VTX_EXPLICIT_READ $dst, $ptr", + [(set R600_Reg128:$dst, (int_R600_load_texbuf ADDRGA_VAR_OFFSET:$ptr, imm:$BUFFER_ID))]>, +VTX_WORD1_GPR, VTX_WORD0 { + +let VC_INST = 0; +let FETCH_TYPE = 2; +let FETCH_WHOLE_QUAD = 0; +let SRC_REL = 0; +let SRC_SEL_X = 0; +let DST_REL = 0; +let USE_CONST_FIELDS = 1; +let NUM_FORMAT_ALL = 0; +let FORMAT_COMP_ALL = 0; +let SRF_MODE_ALL = 1; +let MEGA_FETCH_COUNT = 16; +let DST_SEL_X = 0; +let DST_SEL_Y = 1; +let DST_SEL_Z = 2; +let DST_SEL_W = 3; +let DATA_FORMAT = 0; + +let Inst{31-0} = Word0; +let Inst{63-32} = Word1; + +// LLVM can only encode 64-bit instructions, so these fields are manually +// encoded in R600CodeEmitter +// +// bits<16> OFFSET; +// bits<2> ENDIAN_SWAP = 0; +// bits<1> CONST_BUF_NO_STRIDE = 0; +// bits<1> MEGA_FETCH = 0; +// bits<1> ALT_CONST = 0; +// bits<2> BUFFER_INDEX_MODE = 0; + + + +// VTX_WORD2 (LLVM can only encode 64-bit instructions, so WORD2 encoding +// is done in R600CodeEmitter +// +// Inst{79-64} = OFFSET; +// Inst{81-80} = ENDIAN_SWAP; +// Inst{82} = CONST_BUF_NO_STRIDE; +// Inst{83} = MEGA_FETCH; +// Inst{84} = ALT_CONST; +// 
Inst{86-85} = BUFFER_INDEX_MODE;
+// Inst{95-86} = 0; Reserved
+
+// VTX_WORD3 (Padding)
+//
+// Inst{127-96} = 0;
+}
+
+
+
 //===--------------------------------------------------------------------===//
 // Instructions support
 //===--------------------------------------------------------------------===//
@@ -1641,7 +1867,19 @@ def : Pat <
 
 // SGE Reverse args
 def : Pat <
   (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, FP_ONE, FP_ZERO, COND_LE),
-  (SGE R600_Reg32:$src1, R600_Reg32:$src0) 
+  (SGE R600_Reg32:$src1, R600_Reg32:$src0)
+>;
+
+// SETGT_DX10 reverse args
+def : Pat <
+  (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, COND_LT),
+  (SETGT_DX10 R600_Reg32:$src1, R600_Reg32:$src0)
+>;
+
+// SETGE_DX10 reverse args
+def : Pat <
+  (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, COND_LE),
+  (SETGE_DX10 R600_Reg32:$src1, R600_Reg32:$src0)
 >;
 
 // SETGT_INT reverse args
@@ -1682,31 +1920,43 @@ def : Pat <
   (SETE R600_Reg32:$src0, R600_Reg32:$src1)
 >;
 
+//SETE_DX10 - 'true if ordered'
+def : Pat <
+  (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETO),
+  (SETE_DX10 R600_Reg32:$src0, R600_Reg32:$src1)
+>;
+
 //SNE - 'true if unordered'
 def : Pat <
   (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, FP_ONE, FP_ZERO, SETUO),
   (SNE R600_Reg32:$src0, R600_Reg32:$src1)
 >;
 
-def : Extract_Element <f32, v4f32, R600_Reg128, 0, sel_x>;
-def : Extract_Element <f32, v4f32, R600_Reg128, 1, sel_y>;
-def : Extract_Element <f32, v4f32, R600_Reg128, 2, sel_z>;
-def : Extract_Element <f32, v4f32, R600_Reg128, 3, sel_w>;
+//SETNE_DX10 - 'true if unordered'
+def : Pat <
+  (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETUO),
+  (SETNE_DX10 R600_Reg32:$src0, R600_Reg32:$src1)
+>;
+
+def : Extract_Element <f32, v4f32, R600_Reg128, 0, sub0>;
+def : Extract_Element <f32, v4f32, R600_Reg128, 1, sub1>;
+def : Extract_Element <f32, v4f32, R600_Reg128, 2, sub2>;
+def : Extract_Element <f32, v4f32, R600_Reg128, 3, sub3>;
 
-def : Insert_Element <f32, v4f32, R600_Reg32, R600_Reg128, 0, sel_x>;
-def : Insert_Element <f32, v4f32, R600_Reg32, R600_Reg128, 1, sel_y>;
-def : Insert_Element <f32, v4f32, R600_Reg32, R600_Reg128, 2, sel_z>;
-def : Insert_Element <f32, v4f32, R600_Reg32, R600_Reg128, 3, sel_w>;
+def : Insert_Element <f32, v4f32, R600_Reg32, R600_Reg128, 0, sub0>;
+def : Insert_Element <f32, v4f32, R600_Reg32, R600_Reg128, 1, sub1>;
+def : Insert_Element <f32, v4f32, R600_Reg32, R600_Reg128, 2, sub2>;
+def : Insert_Element <f32, v4f32, R600_Reg32, R600_Reg128, 3, sub3>;
 
-def : Extract_Element <i32, v4i32, R600_Reg128, 0, sel_x>;
-def : Extract_Element <i32, v4i32, R600_Reg128, 1, sel_y>;
-def : Extract_Element <i32, v4i32, R600_Reg128, 2, sel_z>;
-def : Extract_Element <i32, v4i32, R600_Reg128, 3, sel_w>;
+def : Extract_Element <i32, v4i32, R600_Reg128, 0, sub0>;
+def : Extract_Element <i32, v4i32, R600_Reg128, 1, sub1>;
+def : Extract_Element <i32, v4i32, R600_Reg128, 2, sub2>;
+def : Extract_Element <i32, v4i32, R600_Reg128, 3, sub3>;
 
-def : Insert_Element <i32, v4i32, R600_Reg32, R600_Reg128, 0, sel_x>;
-def : Insert_Element <i32, v4i32, R600_Reg32, R600_Reg128, 1, sel_y>;
-def : Insert_Element <i32, v4i32, R600_Reg32, R600_Reg128, 2, sel_z>;
-def : Insert_Element <i32, v4i32, R600_Reg32, R600_Reg128, 3, sel_w>;
+def : Insert_Element <i32, v4i32, R600_Reg32, R600_Reg128, 0, sub0>;
+def : Insert_Element <i32, v4i32, R600_Reg32, R600_Reg128, 1, sub1>;
+def : Insert_Element <i32, v4i32, R600_Reg32, R600_Reg128, 2, sub2>;
+def : Insert_Element <i32, v4i32, R600_Reg32,
R600_Reg128, 3, sub3>;
 
 def : Vector_Build <v4f32, R600_Reg128, f32, R600_Reg32>;
 def : Vector_Build <v4i32, R600_Reg128, i32, R600_Reg32>;
diff --git a/lib/Target/R600/R600Intrinsics.td b/lib/Target/R600/R600Intrinsics.td
index 3825bc4..dc8980a 100644
--- a/lib/Target/R600/R600Intrinsics.td
+++ b/lib/Target/R600/R600Intrinsics.td
@@ -12,21 +12,20 @@
 //===----------------------------------------------------------------------===//
 
 let TargetPrefix = "R600", isTarget = 1 in {
-  def int_R600_load_input : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>;
-  def int_R600_load_input_perspective :
-    Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrReadMem]>;
-  def int_R600_load_input_constant :
-    Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrReadMem]>;
-  def int_R600_load_input_linear :
-    Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrReadMem]>;
+  def int_R600_load_input :
+    Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>;
+  def int_R600_interp_input :
+    Intrinsic<[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_R600_load_texbuf :
+    Intrinsic<[llvm_v4f32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_R600_store_swizzle :
+    Intrinsic<[], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], []>;
   def int_R600_store_stream_output :
-    Intrinsic<[], [llvm_float_ty, llvm_i32_ty, llvm_i32_ty], []>;
-  def int_R600_store_pixel_color :
-    Intrinsic<[], [llvm_float_ty, llvm_i32_ty], []>;
+    Intrinsic<[], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>;
   def int_R600_store_pixel_depth :
     Intrinsic<[], [llvm_float_ty], []>;
   def int_R600_store_pixel_stencil :
     Intrinsic<[], [llvm_float_ty], []>;
-  def int_R600_store_pixel_dummy :
-    Intrinsic<[], [], []>;
+  def int_R600_store_dummy :
+    Intrinsic<[], [llvm_i32_ty], []>;
 }
diff --git a/lib/Target/R600/R600LowerConstCopy.cpp b/lib/Target/R600/R600LowerConstCopy.cpp
new file mode 100644
index 0000000..3ebe653
--- /dev/null
+++ b/lib/Target/R600/R600LowerConstCopy.cpp
@@ -0,0 +1,222 @@
+//===-- R600LowerConstCopy.cpp - Propagate ConstCopy / lower them to MOV---===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass handles the ConstCopy pseudo MachineInstrs that remain after
+/// instruction selection. ISel folds each constant buffer read into a scalar
+/// ALU instruction, but it cannot fold one into a vector instruction such as
+/// DOT4 or CUBE, so it emits a ConstCopy instead. This pass (executed after
+/// ExpandingSpecialInstr) folds ConstCopys where possible and replaces the
+/// rest with a MOV.
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "R600InstrInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/IR/GlobalValue.h" + +namespace llvm { + +class R600LowerConstCopy : public MachineFunctionPass { +private: + static char ID; + const R600InstrInfo *TII; + + struct ConstPairs { + unsigned XYPair; + unsigned ZWPair; + }; + + bool canFoldInBundle(ConstPairs &UsedConst, unsigned ReadConst) const; +public: + R600LowerConstCopy(TargetMachine &tm); + virtual bool runOnMachineFunction(MachineFunction &MF); + + const char *getPassName() const { return "R600 Eliminate Symbolic Operand"; } +}; + +char R600LowerConstCopy::ID = 0; + +R600LowerConstCopy::R600LowerConstCopy(TargetMachine &tm) : + MachineFunctionPass(ID), + TII (static_cast<const R600InstrInfo *>(tm.getInstrInfo())) +{ +} + +bool R600LowerConstCopy::canFoldInBundle(ConstPairs &UsedConst, + unsigned ReadConst) const { + unsigned ReadConstChan = ReadConst & 3; + unsigned ReadConstIndex = ReadConst & (~3); + if (ReadConstChan < 2) { + if (!UsedConst.XYPair) { + UsedConst.XYPair = ReadConstIndex; + } + return UsedConst.XYPair == ReadConstIndex; + } else { + if (!UsedConst.ZWPair) { + UsedConst.ZWPair = ReadConstIndex; + } + return UsedConst.ZWPair == ReadConstIndex; + } +} + +static bool isControlFlow(const MachineInstr &MI) { + return (MI.getOpcode() == AMDGPU::IF_PREDICATE_SET) || + (MI.getOpcode() == AMDGPU::ENDIF) || + (MI.getOpcode() == AMDGPU::ELSE) || + (MI.getOpcode() == AMDGPU::WHILELOOP) || + (MI.getOpcode() == AMDGPU::BREAK); +} + +bool R600LowerConstCopy::runOnMachineFunction(MachineFunction &MF) { + + for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); + BB != BB_E; ++BB) { + MachineBasicBlock &MBB = *BB; + DenseMap<unsigned, MachineInstr *> RegToConstIndex; + for (MachineBasicBlock::instr_iterator I = MBB.instr_begin(), + E = MBB.instr_end(); I != E;) { + + if (I->getOpcode() == AMDGPU::CONST_COPY) { + MachineInstr &MI = *I; + I = llvm::next(I); + unsigned DstReg = MI.getOperand(0).getReg(); + DenseMap<unsigned, MachineInstr *>::iterator SrcMI = + RegToConstIndex.find(DstReg); + if (SrcMI != RegToConstIndex.end()) { + SrcMI->second->eraseFromParent(); + RegToConstIndex.erase(SrcMI); + } + MachineInstr *NewMI = + TII->buildDefaultInstruction(MBB, &MI, AMDGPU::MOV, + MI.getOperand(0).getReg(), AMDGPU::ALU_CONST); + TII->setImmOperand(NewMI, R600Operands::SRC0_SEL, + MI.getOperand(1).getImm()); + RegToConstIndex[DstReg] = NewMI; + MI.eraseFromParent(); + continue; + } + + std::vector<unsigned> Defs; + // We consider all Instructions as bundled because algorithm that handle + // const read port limitations inside an IG is still valid with single + // instructions. 
+ std::vector<MachineInstr *> Bundle; + + if (I->isBundle()) { + unsigned BundleSize = I->getBundleSize(); + for (unsigned i = 0; i < BundleSize; i++) { + I = llvm::next(I); + Bundle.push_back(I); + } + } else if (TII->isALUInstr(I->getOpcode())){ + Bundle.push_back(I); + } else if (isControlFlow(*I)) { + RegToConstIndex.clear(); + I = llvm::next(I); + continue; + } else { + MachineInstr &MI = *I; + for (MachineInstr::mop_iterator MOp = MI.operands_begin(), + MOpE = MI.operands_end(); MOp != MOpE; ++MOp) { + MachineOperand &MO = *MOp; + if (!MO.isReg()) + continue; + if (MO.isDef()) { + Defs.push_back(MO.getReg()); + } else { + // Either a TEX or an Export inst, prevent from erasing def of used + // operand + RegToConstIndex.erase(MO.getReg()); + for (MCSubRegIterator SR(MO.getReg(), &TII->getRegisterInfo()); + SR.isValid(); ++SR) { + RegToConstIndex.erase(*SR); + } + } + } + } + + + R600Operands::Ops OpTable[3][2] = { + {R600Operands::SRC0, R600Operands::SRC0_SEL}, + {R600Operands::SRC1, R600Operands::SRC1_SEL}, + {R600Operands::SRC2, R600Operands::SRC2_SEL}, + }; + + for(std::vector<MachineInstr *>::iterator It = Bundle.begin(), + ItE = Bundle.end(); It != ItE; ++It) { + MachineInstr *MI = *It; + if (TII->isPredicated(MI)) { + // We don't want to erase previous assignment + RegToConstIndex.erase(MI->getOperand(0).getReg()); + } else { + int WriteIDX = TII->getOperandIdx(MI->getOpcode(), R600Operands::WRITE); + if (WriteIDX < 0 || MI->getOperand(WriteIDX).getImm()) + Defs.push_back(MI->getOperand(0).getReg()); + } + } + + ConstPairs CP = {0,0}; + for (unsigned SrcOp = 0; SrcOp < 3; SrcOp++) { + for(std::vector<MachineInstr *>::iterator It = Bundle.begin(), + ItE = Bundle.end(); It != ItE; ++It) { + MachineInstr *MI = *It; + int SrcIdx = TII->getOperandIdx(MI->getOpcode(), OpTable[SrcOp][0]); + if (SrcIdx < 0) + continue; + MachineOperand &MO = MI->getOperand(SrcIdx); + DenseMap<unsigned, MachineInstr *>::iterator SrcMI = + RegToConstIndex.find(MO.getReg()); + if (SrcMI != RegToConstIndex.end()) { + MachineInstr *CstMov = SrcMI->second; + int ConstMovSel = + TII->getOperandIdx(CstMov->getOpcode(), R600Operands::SRC0_SEL); + unsigned ConstIndex = CstMov->getOperand(ConstMovSel).getImm(); + if (MI->isInsideBundle() && canFoldInBundle(CP, ConstIndex)) { + TII->setImmOperand(MI, OpTable[SrcOp][1], ConstIndex); + MI->getOperand(SrcIdx).setReg(AMDGPU::ALU_CONST); + } else { + RegToConstIndex.erase(SrcMI); + } + } + } + } + + for (std::vector<unsigned>::iterator It = Defs.begin(), ItE = Defs.end(); + It != ItE; ++It) { + DenseMap<unsigned, MachineInstr *>::iterator SrcMI = + RegToConstIndex.find(*It); + if (SrcMI != RegToConstIndex.end()) { + SrcMI->second->eraseFromParent(); + RegToConstIndex.erase(SrcMI); + } + } + I = llvm::next(I); + } + + if (MBB.succ_empty()) { + for (DenseMap<unsigned, MachineInstr *>::iterator + DI = RegToConstIndex.begin(), DE = RegToConstIndex.end(); + DI != DE; ++DI) { + DI->second->eraseFromParent(); + } + } + } + return false; +} + +FunctionPass *createR600LowerConstCopy(TargetMachine &tm) { + return new R600LowerConstCopy(tm); +} + +} + + diff --git a/lib/Target/R600/R600MachineFunctionInfo.cpp b/lib/Target/R600/R600MachineFunctionInfo.cpp index 4eb5efa..40aec83 100644 --- a/lib/Target/R600/R600MachineFunctionInfo.cpp +++ b/lib/Target/R600/R600MachineFunctionInfo.cpp @@ -13,22 +13,6 @@ using namespace llvm; R600MachineFunctionInfo::R600MachineFunctionInfo(const MachineFunction &MF) - : MachineFunctionInfo(), - HasLinearInterpolation(false), - 
HasPerspectiveInterpolation(false) { + : MachineFunctionInfo() { memset(Outputs, 0, sizeof(Outputs)); - memset(StreamOutputs, 0, sizeof(StreamOutputs)); } - -unsigned R600MachineFunctionInfo::GetIJPerspectiveIndex() const { - assert(HasPerspectiveInterpolation); - return 0; -} - -unsigned R600MachineFunctionInfo::GetIJLinearIndex() const { - assert(HasLinearInterpolation); - if (HasPerspectiveInterpolation) - return 1; - else - return 0; -} diff --git a/lib/Target/R600/R600MachineFunctionInfo.h b/lib/Target/R600/R600MachineFunctionInfo.h index e97fb5b..4b901f4 100644 --- a/lib/Target/R600/R600MachineFunctionInfo.h +++ b/lib/Target/R600/R600MachineFunctionInfo.h @@ -13,6 +13,7 @@ #ifndef R600MACHINEFUNCTIONINFO_H #define R600MACHINEFUNCTIONINFO_H +#include "llvm/ADT/BitVector.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/SelectionDAG.h" #include <vector> @@ -23,15 +24,9 @@ class R600MachineFunctionInfo : public MachineFunctionInfo { public: R600MachineFunctionInfo(const MachineFunction &MF); - std::vector<unsigned> ReservedRegs; + SmallVector<unsigned, 4> LiveOuts; + std::vector<unsigned> IndirectRegs; SDNode *Outputs[16]; - SDNode *StreamOutputs[64][4]; - bool HasLinearInterpolation; - bool HasPerspectiveInterpolation; - - unsigned GetIJLinearIndex() const; - unsigned GetIJPerspectiveIndex() const; - }; } // End llvm namespace diff --git a/lib/Target/R600/R600RegisterInfo.cpp b/lib/Target/R600/R600RegisterInfo.cpp index a39f83d..bbd7995 100644 --- a/lib/Target/R600/R600RegisterInfo.cpp +++ b/lib/Target/R600/R600RegisterInfo.cpp @@ -15,6 +15,7 @@ #include "R600RegisterInfo.h" #include "AMDGPUTargetMachine.h" #include "R600Defines.h" +#include "R600InstrInfo.h" #include "R600MachineFunctionInfo.h" using namespace llvm; @@ -28,7 +29,6 @@ R600RegisterInfo::R600RegisterInfo(AMDGPUTargetMachine &tm, BitVector R600RegisterInfo::getReservedRegs(const MachineFunction &MF) const { BitVector Reserved(getNumRegs()); - const R600MachineFunctionInfo * MFI = MF.getInfo<R600MachineFunctionInfo>(); Reserved.set(AMDGPU::ZERO); Reserved.set(AMDGPU::HALF); @@ -38,21 +38,30 @@ BitVector R600RegisterInfo::getReservedRegs(const MachineFunction &MF) const { Reserved.set(AMDGPU::NEG_ONE); Reserved.set(AMDGPU::PV_X); Reserved.set(AMDGPU::ALU_LITERAL_X); + Reserved.set(AMDGPU::ALU_CONST); Reserved.set(AMDGPU::PREDICATE_BIT); Reserved.set(AMDGPU::PRED_SEL_OFF); Reserved.set(AMDGPU::PRED_SEL_ZERO); Reserved.set(AMDGPU::PRED_SEL_ONE); - for (TargetRegisterClass::iterator I = AMDGPU::R600_CReg32RegClass.begin(), - E = AMDGPU::R600_CReg32RegClass.end(); I != E; ++I) { + for (TargetRegisterClass::iterator I = AMDGPU::R600_AddrRegClass.begin(), + E = AMDGPU::R600_AddrRegClass.end(); I != E; ++I) { Reserved.set(*I); } - for (std::vector<unsigned>::const_iterator I = MFI->ReservedRegs.begin(), - E = MFI->ReservedRegs.end(); I != E; ++I) { + for (TargetRegisterClass::iterator I = AMDGPU::TRegMemRegClass.begin(), + E = AMDGPU::TRegMemRegClass.end(); + I != E; ++I) { Reserved.set(*I); } + const R600InstrInfo *RII = static_cast<const R600InstrInfo*>(&TII); + std::vector<unsigned> IndirectRegs = RII->getIndirectReservedRegs(MF); + for (std::vector<unsigned>::iterator I = IndirectRegs.begin(), + E = IndirectRegs.end(); + I != E; ++I) { + Reserved.set(*I); + } return Reserved; } @@ -81,9 +90,10 @@ const TargetRegisterClass * R600RegisterInfo::getCFGStructurizerRegClass( unsigned R600RegisterInfo::getSubRegFromChannel(unsigned Channel) const { switch (Channel) { default: assert(!"Invalid channel index"); return 
0; - case 0: return AMDGPU::sel_x; - case 1: return AMDGPU::sel_y; - case 2: return AMDGPU::sel_z; - case 3: return AMDGPU::sel_w; + case 0: return AMDGPU::sub0; + case 1: return AMDGPU::sub1; + case 2: return AMDGPU::sub2; + case 3: return AMDGPU::sub3; } } + diff --git a/lib/Target/R600/R600RegisterInfo.td b/lib/Target/R600/R600RegisterInfo.td index d3d6d25..ce5994c 100644 --- a/lib/Target/R600/R600RegisterInfo.td +++ b/lib/Target/R600/R600RegisterInfo.td @@ -19,7 +19,7 @@ class R600RegWithChan <string name, bits<9> sel, string chan> : class R600Reg_128<string n, list<Register> subregs, bits<16> encoding> : RegisterWithSubRegs<n, subregs> { let Namespace = "AMDGPU"; - let SubRegIndices = [sel_x, sel_y, sel_z, sel_w]; + let SubRegIndices = [sub0, sub1, sub2, sub3]; let HWEncoding = encoding; } @@ -28,9 +28,11 @@ foreach Index = 0-127 in { // 32-bit Temporary Registers def T#Index#_#Chan : R600RegWithChan <"T"#Index#"."#Chan, Index, Chan>; - // 32-bit Constant Registers (There are more than 128, this the number - // that is currently supported. - def C#Index#_#Chan : R600RegWithChan <"C"#Index#"."#Chan, Index, Chan>; + // Indirect addressing offset registers + def Addr#Index#_#Chan : R600RegWithChan <"T("#Index#" + AR.x)."#Chan, + Index, Chan>; + def TRegMem#Index#_#Chan : R600RegWithChan <"T"#Index#"."#Chan, Index, + Chan>; } // 128-bit Temporary Registers def T#Index#_XYZW : R600Reg_128 <"T"#Index#".XYZW", @@ -42,7 +44,7 @@ foreach Index = 0-127 in { } // Array Base Register holding input in FS -foreach Index = 448-464 in { +foreach Index = 448-480 in { def ArrayBase#Index : R600Reg<"ARRAY_BASE", Index>; } @@ -61,19 +63,25 @@ def PREDICATE_BIT : R600Reg<"PredicateBit", 0>; def PRED_SEL_OFF: R600Reg<"Pred_sel_off", 0>; def PRED_SEL_ZERO : R600Reg<"Pred_sel_zero", 2>; def PRED_SEL_ONE : R600Reg<"Pred_sel_one", 3>; +def AR_X : R600Reg<"AR.x", 0>; def R600_ArrayBase : RegisterClass <"AMDGPU", [f32, i32], 32, - (add (sequence "ArrayBase%u", 448, 464))>; + (add (sequence "ArrayBase%u", 448, 480))>; +// special registers for ALU src operands +// const buffer reference, SRCx_SEL contains index +def ALU_CONST : R600Reg<"CBuf", 0>; +// interpolation param reference, SRCx_SEL contains index +def ALU_PARAM : R600Reg<"Param", 0>; -def R600_CReg32 : RegisterClass <"AMDGPU", [f32, i32], 32, - (add (interleave - (interleave (sequence "C%u_X", 0, 127), - (sequence "C%u_Z", 0, 127)), - (interleave (sequence "C%u_Y", 0, 127), - (sequence "C%u_W", 0, 127))))>; +let isAllocatable = 0 in { + +// XXX: Only use the X channel, until we support wider stack widths +def R600_Addr : RegisterClass <"AMDGPU", [i32], 127, (add (sequence "Addr%u_X", 0, 127))>; + +} // End isAllocatable = 0 def R600_TReg32_X : RegisterClass <"AMDGPU", [f32, i32], 32, - (add (sequence "T%u_X", 0, 127))>; + (add (sequence "T%u_X", 0, 127), AR_X)>; def R600_TReg32_Y : RegisterClass <"AMDGPU", [f32, i32], 32, (add (sequence "T%u_Y", 0, 127))>; @@ -85,15 +93,16 @@ def R600_TReg32_W : RegisterClass <"AMDGPU", [f32, i32], 32, (add (sequence "T%u_W", 0, 127))>; def R600_TReg32 : RegisterClass <"AMDGPU", [f32, i32], 32, - (add (interleave - (interleave R600_TReg32_X, R600_TReg32_Z), - (interleave R600_TReg32_Y, R600_TReg32_W)))>; + (interleave R600_TReg32_X, R600_TReg32_Y, + R600_TReg32_Z, R600_TReg32_W)>; def R600_Reg32 : RegisterClass <"AMDGPU", [f32, i32], 32, (add R600_TReg32, - R600_CReg32, R600_ArrayBase, - ZERO, HALF, ONE, ONE_INT, PV_X, ALU_LITERAL_X, NEG_ONE, NEG_HALF)>; + R600_Addr, + ZERO, HALF, ONE, ONE_INT, PV_X, ALU_LITERAL_X, 
NEG_ONE, NEG_HALF, + ALU_CONST, ALU_PARAM + )>; def R600_Predicate : RegisterClass <"AMDGPU", [i32], 32, (add PRED_SEL_OFF, PRED_SEL_ZERO, PRED_SEL_ONE)>; @@ -105,3 +114,33 @@ def R600_Reg128 : RegisterClass<"AMDGPU", [v4f32, v4i32], 128, (add (sequence "T%u_XYZW", 0, 127))> { let CopyCost = -1; } + +//===----------------------------------------------------------------------===// +// Register classes for indirect addressing +//===----------------------------------------------------------------------===// + +// Super register for all the Indirect Registers. This register class is used +// by the REG_SEQUENCE instruction to specify the registers to use for direct +// reads / writes which may be written / read by an indirect address. +class IndirectSuper<string n, list<Register> subregs> : + RegisterWithSubRegs<n, subregs> { + let Namespace = "AMDGPU"; + let SubRegIndices = + [sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7, + sub8, sub9, sub10, sub11, sub12, sub13, sub14, sub15]; +} + +def IndirectSuperReg : IndirectSuper<"Indirect", + [TRegMem0_X, TRegMem1_X, TRegMem2_X, TRegMem3_X, TRegMem4_X, TRegMem5_X, + TRegMem6_X, TRegMem7_X, TRegMem8_X, TRegMem9_X, TRegMem10_X, TRegMem11_X, + TRegMem12_X, TRegMem13_X, TRegMem14_X, TRegMem15_X] +>; + +def IndirectReg : RegisterClass<"AMDGPU", [f32, i32], 32, (add IndirectSuperReg)>; + +// This register class defines the registers that are the storage units for +// the "Indirect Addressing" pseudo memory space. +// XXX: Only use the X channel, until we support wider stack widths +def TRegMem : RegisterClass<"AMDGPU", [f32, i32], 32, + (add (sequence "TRegMem%u_X", 0, 16)) +>; diff --git a/lib/Target/R600/SIAnnotateControlFlow.cpp b/lib/Target/R600/SIAnnotateControlFlow.cpp index f580377..2477e2a 100644 --- a/lib/Target/R600/SIAnnotateControlFlow.cpp +++ b/lib/Target/R600/SIAnnotateControlFlow.cpp @@ -147,7 +147,7 @@ bool SIAnnotateControlFlow::doInitialization(Module &M) { /// \brief Is BB the last block saved on the stack ? 
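/// Returns false for an empty stack as well; without the added guard,
/// calling back() on an empty stack would be undefined behavior.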
bool SIAnnotateControlFlow::isTopOfStack(BasicBlock *BB) { - return Stack.back().first == BB; + return !Stack.empty() && Stack.back().first == BB; } /// \brief Pop the last saved value from the control flow stack diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp index 4c672ca..0a0fbd9 100644 --- a/lib/Target/R600/SIISelLowering.cpp +++ b/lib/Target/R600/SIISelLowering.cpp @@ -26,21 +26,22 @@ using namespace llvm; SITargetLowering::SITargetLowering(TargetMachine &TM) : AMDGPUTargetLowering(TM), - TII(static_cast<const SIInstrInfo*>(TM.getInstrInfo())) { + TII(static_cast<const SIInstrInfo*>(TM.getInstrInfo())), + TRI(TM.getRegisterInfo()) { addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass); addRegisterClass(MVT::f32, &AMDGPU::VReg_32RegClass); addRegisterClass(MVT::i32, &AMDGPU::VReg_32RegClass); addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass); - addRegisterClass(MVT::i1, &AMDGPU::SCCRegRegClass); - addRegisterClass(MVT::i1, &AMDGPU::VCCRegRegClass); + addRegisterClass(MVT::i1, &AMDGPU::SReg_64RegClass); - addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass); - addRegisterClass(MVT::v8i32, &AMDGPU::SReg_256RegClass); + addRegisterClass(MVT::v1i32, &AMDGPU::VReg_32RegClass); + addRegisterClass(MVT::v2i32, &AMDGPU::VReg_64RegClass); + addRegisterClass(MVT::v4i32, &AMDGPU::VReg_128RegClass); + addRegisterClass(MVT::v8i32, &AMDGPU::VReg_256RegClass); + addRegisterClass(MVT::v16i32, &AMDGPU::VReg_512RegClass); computeRegisterProperties(); - setOperationAction(ISD::AND, MVT::i1, Custom); - setOperationAction(ISD::ADD, MVT::i64, Legal); setOperationAction(ISD::ADD, MVT::i32, Legal); @@ -62,63 +63,13 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( MachineInstr * MI, MachineBasicBlock * BB) const { - const TargetInstrInfo * TII = getTargetMachine().getInstrInfo(); MachineRegisterInfo & MRI = BB->getParent()->getRegInfo(); MachineBasicBlock::iterator I = MI; - if (TII->get(MI->getOpcode()).TSFlags & SIInstrFlags::NEED_WAIT) { - AppendS_WAITCNT(MI, *BB, llvm::next(I)); - return BB; - } - switch (MI->getOpcode()) { default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); case AMDGPU::BRANCH: return BB; - case AMDGPU::CLAMP_SI: - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::V_MOV_B32_e64)) - .addOperand(MI->getOperand(0)) - .addOperand(MI->getOperand(1)) - // VSRC1-2 are unused, but we still need to fill all the - // operand slots, so we just reuse the VSRC0 operand - .addOperand(MI->getOperand(1)) - .addOperand(MI->getOperand(1)) - .addImm(0) // ABS - .addImm(1) // CLAMP - .addImm(0) // OMOD - .addImm(0); // NEG - MI->eraseFromParent(); - break; - - case AMDGPU::FABS_SI: - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::V_MOV_B32_e64)) - .addOperand(MI->getOperand(0)) - .addOperand(MI->getOperand(1)) - // VSRC1-2 are unused, but we still need to fill all the - // operand slots, so we just reuse the VSRC0 operand - .addOperand(MI->getOperand(1)) - .addOperand(MI->getOperand(1)) - .addImm(1) // ABS - .addImm(0) // CLAMP - .addImm(0) // OMOD - .addImm(0); // NEG - MI->eraseFromParent(); - break; - - case AMDGPU::FNEG_SI: - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::V_MOV_B32_e64)) - .addOperand(MI->getOperand(0)) - .addOperand(MI->getOperand(1)) - // VSRC1-2 are unused, but we still need to fill all the - // operand slots, so we just reuse the VSRC0 operand - .addOperand(MI->getOperand(1)) - .addOperand(MI->getOperand(1)) - .addImm(0) // 
ABS - .addImm(0) // CLAMP - .addImm(0) // OMOD - .addImm(1); // NEG - MI->eraseFromParent(); - break; case AMDGPU::SHADER_TYPE: BB->getParent()->getInfo<SIMachineFunctionInfo>()->ShaderType = MI->getOperand(0).getImm(); @@ -128,29 +79,13 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( case AMDGPU::SI_INTERP: LowerSI_INTERP(MI, *BB, I, MRI); break; - case AMDGPU::SI_INTERP_CONST: - LowerSI_INTERP_CONST(MI, *BB, I, MRI); - break; - case AMDGPU::SI_KIL: - LowerSI_KIL(MI, *BB, I, MRI); - break; case AMDGPU::SI_WQM: LowerSI_WQM(MI, *BB, I, MRI); break; - case AMDGPU::SI_V_CNDLT: - LowerSI_V_CNDLT(MI, *BB, I, MRI); - break; } return BB; } -void SITargetLowering::AppendS_WAITCNT(MachineInstr *MI, MachineBasicBlock &BB, - MachineBasicBlock::iterator I) const { - BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::S_WAITCNT)) - .addImm(0); -} - - void SITargetLowering::LowerSI_WQM(MachineInstr *MI, MachineBasicBlock &BB, MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const { BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::S_WQM_B64), AMDGPU::EXEC) @@ -190,57 +125,6 @@ void SITargetLowering::LowerSI_INTERP(MachineInstr *MI, MachineBasicBlock &BB, MI->eraseFromParent(); } -void SITargetLowering::LowerSI_INTERP_CONST(MachineInstr *MI, - MachineBasicBlock &BB, MachineBasicBlock::iterator I, - MachineRegisterInfo &MRI) const { - MachineOperand dst = MI->getOperand(0); - MachineOperand attr_chan = MI->getOperand(1); - MachineOperand attr = MI->getOperand(2); - MachineOperand params = MI->getOperand(3); - unsigned M0 = MRI.createVirtualRegister(&AMDGPU::M0RegRegClass); - - BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::S_MOV_B32), M0) - .addOperand(params); - - BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::V_INTERP_MOV_F32)) - .addOperand(dst) - .addOperand(attr_chan) - .addOperand(attr) - .addReg(M0); - - MI->eraseFromParent(); -} - -void SITargetLowering::LowerSI_KIL(MachineInstr *MI, MachineBasicBlock &BB, - MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const { - // Clear this pixel from the exec mask if the operand is negative - BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::V_CMPX_LE_F32_e32), - AMDGPU::VCC) - .addReg(AMDGPU::SREG_LIT_0) - .addOperand(MI->getOperand(0)); - - MI->eraseFromParent(); -} - -void SITargetLowering::LowerSI_V_CNDLT(MachineInstr *MI, MachineBasicBlock &BB, - MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const { - unsigned VCC = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); - - BuildMI(BB, I, BB.findDebugLoc(I), - TII->get(AMDGPU::V_CMP_GT_F32_e32), - VCC) - .addReg(AMDGPU::SREG_LIT_0) - .addOperand(MI->getOperand(1)); - - BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::V_CNDMASK_B32_e32)) - .addOperand(MI->getOperand(0)) - .addOperand(MI->getOperand(3)) - .addOperand(MI->getOperand(2)) - .addReg(VCC); - - MI->eraseFromParent(); -} - EVT SITargetLowering::getSetCCResultType(EVT VT) const { return MVT::i1; } @@ -255,7 +139,6 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::BRCOND: return LowerBRCOND(Op, DAG); case ISD::LOAD: return LowerLOAD(Op, DAG); case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); - case ISD::AND: return Loweri1ContextSwitch(Op, DAG, ISD::AND); case ISD::INTRINSIC_WO_CHAIN: { unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); @@ -272,30 +155,6 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return SDValue(); } -/// \brief The function is for lowering i1 
operations on the -/// VCC register. -/// -/// In the VALU context, VCC is a one bit register, but in the -/// SALU context the VCC is a 64-bit register (1-bit per thread). Since only -/// the SALU can perform operations on the VCC register, we need to promote -/// the operand types from i1 to i64 in order for tablegen to be able to match -/// this operation to the correct SALU instruction. We do this promotion by -/// wrapping the operands in a CopyToReg node. -/// -SDValue SITargetLowering::Loweri1ContextSwitch(SDValue Op, - SelectionDAG &DAG, - unsigned VCCNode) const { - DebugLoc DL = Op.getDebugLoc(); - - SDValue OpNode = DAG.getNode(VCCNode, DL, MVT::i64, - DAG.getNode(SIISD::VCC_BITCAST, DL, MVT::i64, - Op.getOperand(0)), - DAG.getNode(SIISD::VCC_BITCAST, DL, MVT::i64, - Op.getOperand(1))); - - return DAG.getNode(SIISD::VCC_BITCAST, DL, MVT::i1, OpNode); -} - /// \brief Helper function for LowerBRCOND static SDNode *findUser(SDValue Value, unsigned Opcode) { @@ -500,12 +359,252 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, return SDValue(); } -#define NODE_NAME_CASE(node) case SIISD::node: return #node; +/// \brief Test if RegClass is one of the VSrc classes +static bool isVSrc(unsigned RegClass) { + return AMDGPU::VSrc_32RegClassID == RegClass || + AMDGPU::VSrc_64RegClassID == RegClass; +} + +/// \brief Test if RegClass is one of the SSrc classes +static bool isSSrc(unsigned RegClass) { + return AMDGPU::SSrc_32RegClassID == RegClass || + AMDGPU::SSrc_64RegClassID == RegClass; +} + +/// \brief Analyze the possible immediate value Op +/// +/// Returns -1 if it isn't an immediate, 0 if it's an inline immediate, +/// and the immediate value if it's a literal immediate +int32_t SITargetLowering::analyzeImmediate(const SDNode *N) const { + + union { + int32_t I; + float F; + } Imm; + + if (const ConstantSDNode *Node = dyn_cast<ConstantSDNode>(N)) + Imm.I = Node->getSExtValue(); + else if (const ConstantFPSDNode *Node = dyn_cast<ConstantFPSDNode>(N)) + Imm.F = Node->getValueAPF().convertToFloat(); + else + return -1; // It isn't an immediate + + if ((Imm.I >= -16 && Imm.I <= 64) || + Imm.F == 0.5f || Imm.F == -0.5f || + Imm.F == 1.0f || Imm.F == -1.0f || + Imm.F == 2.0f || Imm.F == -2.0f || + Imm.F == 4.0f || Imm.F == -4.0f) + return 0; // It's an inline immediate + + return Imm.I; // It's a literal immediate +} + +/// \brief Try to fold an immediate directly into an instruction +bool SITargetLowering::foldImm(SDValue &Operand, int32_t &Immediate, + bool &ScalarSlotUsed) const { + + MachineSDNode *Mov = dyn_cast<MachineSDNode>(Operand); + if (Mov == 0 || !TII->isMov(Mov->getMachineOpcode())) + return false; + + const SDValue &Op = Mov->getOperand(0); + int32_t Value = analyzeImmediate(Op.getNode()); + if (Value == -1) { + // Not an immediate at all + return false; + + } else if (Value == 0) { + // Inline immediates can always be folded + Operand = Op; + return true; + + } else if (Value == Immediate) { + // Same literal immediate as the one already folded + Operand = Op; + return true; + + } else if (!ScalarSlotUsed && !Immediate) { + // Fold this literal immediate + ScalarSlotUsed = true; + Immediate = Value; + Operand = Op; + return true; -const char* SITargetLowering::getTargetNodeName(unsigned Opcode) const { - switch (Opcode) { - default: return AMDGPUTargetLowering::getTargetNodeName(Opcode); - NODE_NAME_CASE(VCC_AND) - NODE_NAME_CASE(VCC_BITCAST) } + + return false; +} + +/// \brief Does "Op" fit into register class "RegClass"?
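An aside on the tri-state convention of analyzeImmediate above, since foldImm leans on it heavily: the sketch below restates the integer half of the rule with a few spot checks. The name classifyImm is invented for illustration, and the floating-point inline immediates (plus/minus 0.5, 1.0, 2.0 and 4.0) are omitted for brevity.

    #include <cassert>
    #include <cstdint>

    // -1 = not an immediate at all, 0 = inline immediate (encoded for free),
    // anything else = a literal immediate occupying the single scalar slot.
    int32_t classifyImm(bool IsConstant, int32_t I) {
      if (!IsConstant)
        return -1;               // e.g. a register operand
      if (I >= -16 && I <= 64)
        return 0;                // SI encodes small integers inline
      return I;
    }

    int main() {
      assert(classifyImm(false, 0) == -1);   // not an immediate
      assert(classifyImm(true, 64) == 0);    // inline (upper bound)
      assert(classifyImm(true, 65) == 65);   // literal: consumes the slot
      assert(classifyImm(true, -17) == -17); // literal
      return 0;
    }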
+bool SITargetLowering::fitsRegClass(SelectionDAG &DAG, SDValue &Op, + unsigned RegClass) const { + + MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); + SDNode *Node = Op.getNode(); + + int OpClass; + if (MachineSDNode *MN = dyn_cast<MachineSDNode>(Node)) { + const MCInstrDesc &Desc = TII->get(MN->getMachineOpcode()); + OpClass = Desc.OpInfo[Op.getResNo()].RegClass; + + } else if (Node->getOpcode() == ISD::CopyFromReg) { + RegisterSDNode *Reg = cast<RegisterSDNode>(Node->getOperand(1).getNode()); + OpClass = MRI.getRegClass(Reg->getReg())->getID(); + + } else + return false; + + if (OpClass == -1) + return false; + + return TRI->getRegClass(RegClass)->hasSubClassEq(TRI->getRegClass(OpClass)); +} + +/// \brief Make sure that we don't exceed the number of allowed scalars +void SITargetLowering::ensureSRegLimit(SelectionDAG &DAG, SDValue &Operand, + unsigned RegClass, + bool &ScalarSlotUsed) const { + + // First map the operand's register class to a destination class + if (RegClass == AMDGPU::VSrc_32RegClassID) + RegClass = AMDGPU::VReg_32RegClassID; + else if (RegClass == AMDGPU::VSrc_64RegClassID) + RegClass = AMDGPU::VReg_64RegClassID; + else + return; + + // Nothing to do if they fit naturally + if (fitsRegClass(DAG, Operand, RegClass)) + return; + + // If the scalar slot isn't used yet, use it now + if (!ScalarSlotUsed) { + ScalarSlotUsed = true; + return; + } + + // This is a conservative approach; it is possible that we can't determine + // the correct register class and copy too often, but better safe than sorry. + SDValue RC = DAG.getTargetConstant(RegClass, MVT::i32); + SDNode *Node = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS, DebugLoc(), + Operand.getValueType(), Operand, RC); + Operand = SDValue(Node, 0); +} + +SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node, + SelectionDAG &DAG) const { + + // Original encoding (either e32 or e64) + int Opcode = Node->getMachineOpcode(); + const MCInstrDesc *Desc = &TII->get(Opcode); + + unsigned NumDefs = Desc->getNumDefs(); + unsigned NumOps = Desc->getNumOperands(); + + // e64 version if available, -1 otherwise + int OpcodeE64 = AMDGPU::getVOPe64(Opcode); + const MCInstrDesc *DescE64 = OpcodeE64 == -1 ? 0 : &TII->get(OpcodeE64); + + assert(!DescE64 || DescE64->getNumDefs() == NumDefs); + assert(!DescE64 || DescE64->getNumOperands() == (NumOps + 4)); + + int32_t Immediate = Desc->getSize() == 4 ? 0 : -1; + bool HaveVSrc = false, HaveSSrc = false; + + // First figure out what we already have in this instruction + for (unsigned i = 0, e = Node->getNumOperands(), Op = NumDefs; + i != e && Op < NumOps; ++i, ++Op) { + + unsigned RegClass = Desc->OpInfo[Op].RegClass; + if (isVSrc(RegClass)) + HaveVSrc = true; + else if (isSSrc(RegClass)) + HaveSSrc = true; + else + continue; + + int32_t Imm = analyzeImmediate(Node->getOperand(i).getNode()); + if (Imm != -1 && Imm != 0) { + // Literal immediate + Immediate = Imm; + } + } + + // If we neither have VSrc nor SSrc it makes no sense to continue + if (!HaveVSrc && !HaveSSrc) + return Node; + + // No scalar allowed when we have both VSrc and SSrc + bool ScalarSlotUsed = HaveVSrc && HaveSSrc; + + // Second, go over the operands and try to fold them + std::vector<SDValue> Ops; + bool Promote2e64 = false; + for (unsigned i = 0, e = Node->getNumOperands(), Op = NumDefs; + i != e && Op < NumOps; ++i, ++Op) { + + const SDValue &Operand = Node->getOperand(i); + Ops.push_back(Operand); + + // Already folded immediate?
+ if (isa<ConstantSDNode>(Operand.getNode()) || + isa<ConstantFPSDNode>(Operand.getNode())) + continue; + + // Is this a VSrc or SSrc operand? + unsigned RegClass = Desc->OpInfo[Op].RegClass; + if (!isVSrc(RegClass) && !isSSrc(RegClass)) { + + if (i == 1 && Desc->isCommutable() && + fitsRegClass(DAG, Ops[0], RegClass) && + foldImm(Ops[1], Immediate, ScalarSlotUsed)) { + + assert(isVSrc(Desc->OpInfo[NumDefs].RegClass) || + isSSrc(Desc->OpInfo[NumDefs].RegClass)); + + // Swap commutable operands + SDValue Tmp = Ops[1]; + Ops[1] = Ops[0]; + Ops[0] = Tmp; + + } else if (DescE64 && !Immediate) { + // Test if it makes sense to switch to e64 encoding + + RegClass = DescE64->OpInfo[Op].RegClass; + int32_t TmpImm = -1; + if ((isVSrc(RegClass) || isSSrc(RegClass)) && + foldImm(Ops[i], TmpImm, ScalarSlotUsed)) { + + Immediate = -1; + Promote2e64 = true; + Desc = DescE64; + DescE64 = 0; + } + } + continue; + } + + // Try to fold the immediates + if (!foldImm(Ops[i], Immediate, ScalarSlotUsed)) { + // Folding didn't work, make sure we don't hit the SReg limit + ensureSRegLimit(DAG, Ops[i], RegClass, ScalarSlotUsed); + } + } + + if (Promote2e64) { + // Add the modifier flags while promoting + for (unsigned i = 0; i < 4; ++i) + Ops.push_back(DAG.getTargetConstant(0, MVT::i32)); + } + + // Add optional chain and glue + for (unsigned i = NumOps - NumDefs, e = Node->getNumOperands(); i < e; ++i) + Ops.push_back(Node->getOperand(i)); + + // Either create a completely new instruction or update the current one + if (Promote2e64) + return DAG.getMachineNode(OpcodeE64, Node->getDebugLoc(), + Node->getVTList(), Ops.data(), Ops.size()); + else + return DAG.UpdateNodeOperands(Node, Ops.data(), Ops.size()); } diff --git a/lib/Target/R600/SIISelLowering.h b/lib/Target/R600/SIISelLowering.h index c088112..737162f 100644 --- a/lib/Target/R600/SIISelLowering.h +++ b/lib/Target/R600/SIISelLowering.h @@ -22,31 +22,25 @@ namespace llvm { class SITargetLowering : public AMDGPUTargetLowering { const SIInstrInfo * TII; + const TargetRegisterInfo * TRI; - /// Memory reads and writes are syncronized using the S_WAITCNT instruction. - /// This function takes the most conservative approach and inserts an - /// S_WAITCNT instruction after every read and write.
- void AppendS_WAITCNT(MachineInstr *MI, MachineBasicBlock &BB, - MachineBasicBlock::iterator I) const; void LowerMOV_IMM(MachineInstr *MI, MachineBasicBlock &BB, MachineBasicBlock::iterator I, unsigned Opcode) const; void LowerSI_INTERP(MachineInstr *MI, MachineBasicBlock &BB, MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const; - void LowerSI_INTERP_CONST(MachineInstr *MI, MachineBasicBlock &BB, - MachineBasicBlock::iterator I, MachineRegisterInfo &MRI) const; - void LowerSI_KIL(MachineInstr *MI, MachineBasicBlock &BB, - MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const; void LowerSI_WQM(MachineInstr *MI, MachineBasicBlock &BB, MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const; - void LowerSI_V_CNDLT(MachineInstr *MI, MachineBasicBlock &BB, - MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const; - SDValue Loweri1ContextSwitch(SDValue Op, SelectionDAG &DAG, - unsigned VCCNode) const; SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; + bool foldImm(SDValue &Operand, int32_t &Immediate, + bool &ScalarSlotUsed) const; + bool fitsRegClass(SelectionDAG &DAG, SDValue &Op, unsigned RegClass) const; + void ensureSRegLimit(SelectionDAG &DAG, SDValue &Operand, + unsigned RegClass, bool &ScalarSlotUsed) const; + public: SITargetLowering(TargetMachine &tm); virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr * MI, @@ -54,7 +48,9 @@ public: virtual EVT getSetCCResultType(EVT VT) const; virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const; virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const; - virtual const char* getTargetNodeName(unsigned Opcode) const; + virtual SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const; + + int32_t analyzeImmediate(const SDNode *N) const; }; } // End namespace llvm diff --git a/lib/Target/R600/SIInsertWaits.cpp b/lib/Target/R600/SIInsertWaits.cpp new file mode 100644 index 0000000..24fc929 --- /dev/null +++ b/lib/Target/R600/SIInsertWaits.cpp @@ -0,0 +1,353 @@ +//===-- SIInsertWaits.cpp - Insert S_WAITCNT instructions -----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Insert wait instructions for memory reads and writes. +/// +/// Memory reads and writes are issued asynchronously, so we need to insert +/// S_WAITCNT instructions when we want to access any of their results or +/// overwrite any register that's used asynchronously.
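The mechanics behind this description live in insertWait further down: the pass packs one target value per hardware counter into the S_WAITCNT immediate, where a field at its hardware maximum means "do not wait on this counter". Below is a minimal sketch of that packing, using the bit layout and the 15/7/7 limits from this file; encodeWaitcnt is an invented name, not part of the patch.

    #include <cassert>
    #include <cstdint>

    // Matches the BuildMI(...S_WAITCNT).addImm(...) expression in insertWait:
    // VM in bits [3:0], EXP in bits [6:4], LGKM in bits [10:8].
    uint32_t encodeWaitcnt(unsigned VM, unsigned EXP, unsigned LGKM) {
      assert(VM <= 15 && EXP <= 7 && LGKM <= 7); // hardware limits (WaitCounts)
      return (VM & 0xF) | ((EXP & 0x7) << 4) | ((LGKM & 0x7) << 8);
    }

    int main() {
      // Drain all VM operations, leave EXP and LGKM unconstrained:
      assert(encodeWaitcnt(0, 7, 7) == 0x770);
      // Wait for absolutely everything:
      assert(encodeWaitcnt(0, 0, 0) == 0);
      return 0;
    }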
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "SIInstrInfo.h" +#include "SIMachineFunctionInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" + +using namespace llvm; + +namespace { + +/// \brief One variable for each of the hardware counters +typedef union { + struct { + unsigned VM; + unsigned EXP; + unsigned LGKM; + } Named; + unsigned Array[3]; + +} Counters; + +typedef Counters RegCounters[512]; +typedef std::pair<unsigned, unsigned> RegInterval; + +class SIInsertWaits : public MachineFunctionPass { + +private: + static char ID; + const SIInstrInfo *TII; + const SIRegisterInfo &TRI; + const MachineRegisterInfo *MRI; + + /// \brief Constant hardware limits + static const Counters WaitCounts; + + /// \brief Constant zero value + static const Counters ZeroCounts; + + /// \brief Counter values we have already waited on. + Counters WaitedOn; + + /// \brief Counter values for last instruction issued. + Counters LastIssued; + + /// \brief Registers used by async instructions. + RegCounters UsedRegs; + + /// \brief Registers defined by async instructions. + RegCounters DefinedRegs; + + /// \brief Different export instruction types seen since last wait. + unsigned ExpInstrTypesSeen; + + /// \brief Get increment/decrement amount for this instruction. + Counters getHwCounts(MachineInstr &MI); + + /// \brief Is operand relevant for async execution? + bool isOpRelevant(MachineOperand &Op); + + /// \brief Get register interval an operand affects. + RegInterval getRegInterval(MachineOperand &Op); + + /// \brief Handle an instruction's async components + void pushInstruction(MachineInstr &MI); + + /// \brief Insert the actual wait instruction + bool insertWait(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + const Counters &Counts); + + /// \brief Resolve all operand dependencies to counter requirements + Counters handleOperands(MachineInstr &MI); + +public: + SIInsertWaits(TargetMachine &tm) : + MachineFunctionPass(ID), + TII(static_cast<const SIInstrInfo*>(tm.getInstrInfo())), + TRI(TII->getRegisterInfo()) { } + + virtual bool runOnMachineFunction(MachineFunction &MF); + + const char *getPassName() const { + return "SI insert wait instructions"; + } + +}; + +} // End anonymous namespace + +char SIInsertWaits::ID = 0; + +const Counters SIInsertWaits::WaitCounts = { { 15, 7, 7 } }; +const Counters SIInsertWaits::ZeroCounts = { { 0, 0, 0 } }; + +FunctionPass *llvm::createSIInsertWaits(TargetMachine &tm) { + return new SIInsertWaits(tm); +} + +Counters SIInsertWaits::getHwCounts(MachineInstr &MI) { + + uint64_t TSFlags = TII->get(MI.getOpcode()).TSFlags; + Counters Result; + + Result.Named.VM = !!(TSFlags & SIInstrFlags::VM_CNT); + + // Only consider stores or EXP for EXP_CNT + Result.Named.EXP = !!(TSFlags & SIInstrFlags::EXP_CNT && + (MI.getOpcode() == AMDGPU::EXP || !MI.getDesc().mayStore())); + + // LGKM may use larger values + if (TSFlags & SIInstrFlags::LGKM_CNT) { + + MachineOperand &Op = MI.getOperand(0); + assert(Op.isReg() && "First LGKM operand must be a register!"); + + unsigned Reg = Op.getReg(); + unsigned Size = TRI.getMinimalPhysRegClass(Reg)->getSize(); + Result.Named.LGKM = Size > 4 ?
2 : 1; + + } else { + Result.Named.LGKM = 0; + } + + return Result; +} + +bool SIInsertWaits::isOpRelevant(MachineOperand &Op) { + + // Constants are always irrelevant + if (!Op.isReg()) + return false; + + // Defines are always relevant + if (Op.isDef()) + return true; + + // For exports all registers are relevant + MachineInstr &MI = *Op.getParent(); + if (MI.getOpcode() == AMDGPU::EXP) + return true; + + // For stores the stored value is also relevant + if (!MI.getDesc().mayStore()) + return false; + + for (MachineInstr::mop_iterator I = MI.operands_begin(), + E = MI.operands_end(); I != E; ++I) { + + if (I->isReg() && I->isUse()) + return Op.isIdenticalTo(*I); + } + + return false; +} + +RegInterval SIInsertWaits::getRegInterval(MachineOperand &Op) { + + if (!Op.isReg()) + return std::make_pair(0, 0); + + unsigned Reg = Op.getReg(); + unsigned Size = TRI.getMinimalPhysRegClass(Reg)->getSize(); + + assert(Size >= 4); + + RegInterval Result; + Result.first = TRI.getEncodingValue(Reg); + Result.second = Result.first + Size / 4; + + return Result; +} + +void SIInsertWaits::pushInstruction(MachineInstr &MI) { + + // Get the hardware counter increments and sum them up + Counters Increment = getHwCounts(MI); + unsigned Sum = 0; + + for (unsigned i = 0; i < 3; ++i) { + LastIssued.Array[i] += Increment.Array[i]; + Sum += Increment.Array[i]; + } + + // If we don't increase anything then that's it + if (Sum == 0) + return; + + // Remember which export instructions we have seen + if (Increment.Named.EXP) { + ExpInstrTypesSeen |= MI.getOpcode() == AMDGPU::EXP ? 1 : 2; + } + + for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { + + MachineOperand &Op = MI.getOperand(i); + if (!isOpRelevant(Op)) + continue; + + RegInterval Interval = getRegInterval(Op); + for (unsigned j = Interval.first; j < Interval.second; ++j) { + + // Remember which registers we define + if (Op.isDef()) + DefinedRegs[j] = LastIssued; + + // and which one we are using + if (Op.isUse()) + UsedRegs[j] = LastIssued; + } + } +} + +bool SIInsertWaits::insertWait(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + const Counters &Required) { + + // End of program? No need to wait on anything + if (I != MBB.end() && I->getOpcode() == AMDGPU::S_ENDPGM) + return false; + + // Figure out if the async instructions execute in order + bool Ordered[3]; + + // VM_CNT is always ordered + Ordered[0] = true; + + // EXP_CNT is unordered if we have both EXP & VM-writes + Ordered[1] = ExpInstrTypesSeen == 3; + + // LGKM_CNT is handled as always unordered. TODO: Handle LDS and GDS + Ordered[2] = false; + + // The values we are going to put into the S_WAITCNT instruction + Counters Counts = WaitCounts; + + // Do we really need to wait? 
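The answer computed below distinguishes ordered from unordered counters: for an ordered counter the wait target is relative, allowing up to LastIssued - Required operations to stay in flight (clamped to what the S_WAITCNT field can express), while an unordered counter has to drain to zero. A one-line sketch of the clamp, with an invented helper name:

    #include <algorithm>

    // Mirrors the Ordered[i] branch below.
    unsigned orderedWait(unsigned LastIssued, unsigned Required, unsigned HwMax) {
      return std::min(LastIssued - Required, HwMax);
    }
    // e.g. orderedWait(5, 3, 15) == 2: wait until at most two ops are pending.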
+ bool NeedWait = false; + + for (unsigned i = 0; i < 3; ++i) { + + if (Required.Array[i] <= WaitedOn.Array[i]) + continue; + + NeedWait = true; + + if (Ordered[i]) { + unsigned Value = LastIssued.Array[i] - Required.Array[i]; + + // adjust the value to the real hardware possibilities + Counts.Array[i] = std::min(Value, WaitCounts.Array[i]); + + } else + Counts.Array[i] = 0; + + // Remember what we have waited on + WaitedOn.Array[i] = LastIssued.Array[i] - Counts.Array[i]; + } + + if (!NeedWait) + return false; + + // Reset EXP_CNT instruction types + if (Counts.Named.EXP == 0) + ExpInstrTypesSeen = 0; + + // Build the wait instruction + BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)) + .addImm((Counts.Named.VM & 0xF) | + ((Counts.Named.EXP & 0x7) << 4) | + ((Counts.Named.LGKM & 0x7) << 8)); + + return true; +} + +/// \brief Helper function for handleOperands +static void increaseCounters(Counters &Dst, const Counters &Src) { + + for (unsigned i = 0; i < 3; ++i) + Dst.Array[i] = std::max(Dst.Array[i], Src.Array[i]); +} + +Counters SIInsertWaits::handleOperands(MachineInstr &MI) { + + Counters Result = ZeroCounts; + + // For each register affected by this + // instruction, increase the result counters + for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { + + MachineOperand &Op = MI.getOperand(i); + RegInterval Interval = getRegInterval(Op); + for (unsigned j = Interval.first; j < Interval.second; ++j) { + + if (Op.isDef()) + increaseCounters(Result, UsedRegs[j]); + + if (Op.isUse()) + increaseCounters(Result, DefinedRegs[j]); + } + } + + return Result; +} + +bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) { + + bool Changes = false; + + MRI = &MF.getRegInfo(); + + WaitedOn = ZeroCounts; + LastIssued = ZeroCounts; + + memset(&UsedRegs, 0, sizeof(UsedRegs)); + memset(&DefinedRegs, 0, sizeof(DefinedRegs)); + + for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); + BI != BE; ++BI) { + + MachineBasicBlock &MBB = *BI; + for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); + I != E; ++I) { + + Changes |= insertWait(MBB, I, handleOperands(*I)); + pushInstruction(*I); + } + + // Wait for everything at the end of the MBB + Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued); + } + + return Changes; +} diff --git a/lib/Target/R600/SIInstrFormats.td b/lib/Target/R600/SIInstrFormats.td index aea3b5a..fe417d6 100644 --- a/lib/Target/R600/SIInstrFormats.td +++ b/lib/Target/R600/SIInstrFormats.td @@ -1,4 +1,4 @@ -//===-- SIInstrFormats.td - SI Instruction Formats ------------------------===// +//===-- SIInstrFormats.td - SI Instruction Encodings ----------------------===// // // The LLVM Compiler Infrastructure // @@ -9,138 +9,418 @@ // // SI Instruction format definitions. // -// Instructions with _32 take 32-bit operands. -// Instructions with _64 take 64-bit operands. -// -// VOP_* instructions can use either a 32-bit or 64-bit encoding. The 32-bit -// encoding is the standard encoding, but instruction that make use of -// any of the instruction modifiers must use the 64-bit encoding. -// -// Instructions with _e32 use the 32-bit encoding. -// Instructions with _e64 use the 64-bit encoding.
-// //===----------------------------------------------------------------------===// -class VOP3b_2IN <bits<9> op, string opName, RegisterClass dstClass, - RegisterClass src0Class, RegisterClass src1Class, - list<dag> pattern> - : VOP3b <op, (outs dstClass:$vdst), - (ins src0Class:$src0, src1Class:$src1, InstFlag:$src2, InstFlag:$sdst, - InstFlag:$omod, InstFlag:$neg), - opName, pattern ->; +class InstSI <dag outs, dag ins, string asm, list<dag> pattern> : + AMDGPUInst<outs, ins, asm, pattern> { + field bits<1> VM_CNT = 0; + field bits<1> EXP_CNT = 0; + field bits<1> LGKM_CNT = 0; -class VOP3_1_32 <bits<9> op, string opName, list<dag> pattern> - : VOP3b_2IN <op, opName, SReg_1, AllReg_32, VReg_32, pattern>; + let TSFlags{0} = VM_CNT; + let TSFlags{1} = EXP_CNT; + let TSFlags{2} = LGKM_CNT; +} -class VOP3_32 <bits<9> op, string opName, list<dag> pattern> - : VOP3 <op, (outs VReg_32:$dst), (ins AllReg_32:$src0, VReg_32:$src1, VReg_32:$src2, i32imm:$src3, i32imm:$src4, i32imm:$src5, i32imm:$src6), opName, pattern>; +class Enc32 <dag outs, dag ins, string asm, list<dag> pattern> : + InstSI <outs, ins, asm, pattern> { -class VOP3_64 <bits<9> op, string opName, list<dag> pattern> - : VOP3 <op, (outs VReg_64:$dst), (ins AllReg_64:$src0, VReg_64:$src1, VReg_64:$src2, i32imm:$src3, i32imm:$src4, i32imm:$src5, i32imm:$src6), opName, pattern>; + field bits<32> Inst; + let Size = 4; +} +class Enc64 <dag outs, dag ins, string asm, list<dag> pattern> : + InstSI <outs, ins, asm, pattern> { -class SOP1_32 <bits<8> op, string opName, list<dag> pattern> - : SOP1 <op, (outs SReg_32:$dst), (ins SReg_32:$src0), opName, pattern>; + field bits<64> Inst; + let Size = 8; +} -class SOP1_64 <bits<8> op, string opName, list<dag> pattern> - : SOP1 <op, (outs SReg_64:$dst), (ins SReg_64:$src0), opName, pattern>; +//===----------------------------------------------------------------------===// +// Scalar operations +//===----------------------------------------------------------------------===// -class SOP2_32 <bits<7> op, string opName, list<dag> pattern> - : SOP2 <op, (outs SReg_32:$dst), (ins SReg_32:$src0, SReg_32:$src1), opName, pattern>; +class SOP1 <bits<8> op, dag outs, dag ins, string asm, list<dag> pattern> : + Enc32<outs, ins, asm, pattern> { -class SOP2_64 <bits<7> op, string opName, list<dag> pattern> - : SOP2 <op, (outs SReg_64:$dst), (ins SReg_64:$src0, SReg_64:$src1), opName, pattern>; + bits<7> SDST; + bits<8> SSRC0; -class SOP2_VCC <bits<7> op, string opName, list<dag> pattern> - : SOP2 <op, (outs SReg_1:$vcc), (ins SReg_64:$src0, SReg_64:$src1), opName, pattern>; + let Inst{7-0} = SSRC0; + let Inst{15-8} = op; + let Inst{22-16} = SDST; + let Inst{31-23} = 0x17d; //encoding; -class VOP1_Helper <bits<8> op, RegisterClass vrc, RegisterClass arc, - string opName, list<dag> pattern> : - VOP1 < - op, (outs vrc:$dst), (ins arc:$src0), opName, pattern - >; + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; +} -multiclass VOP1_32 <bits<8> op, string opName, list<dag> pattern> { - def _e32: VOP1_Helper <op, VReg_32, AllReg_32, opName, pattern>; - def _e64 : VOP3_32 <{1, 1, op{6}, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}}, - opName, [] - >; +class SOP2 <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> : + Enc32 <outs, ins, asm, pattern> { + + bits<7> SDST; + bits<8> SSRC0; + bits<8> SSRC1; + + let Inst{7-0} = SSRC0; + let Inst{15-8} = SSRC1; + let Inst{22-16} = SDST; + let Inst{29-23} = op; + let Inst{31-30} = 0x2; // encoding + + let mayLoad = 0; + let mayStore = 0; + let 
hasSideEffects = 0; } -multiclass VOP1_64 <bits<8> op, string opName, list<dag> pattern> { +class SOPC <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> : + Enc32<outs, ins, asm, pattern> { + + bits<8> SSRC0; + bits<8> SSRC1; - def _e32 : VOP1_Helper <op, VReg_64, AllReg_64, opName, pattern>; + let Inst{7-0} = SSRC0; + let Inst{15-8} = SSRC1; + let Inst{22-16} = op; + let Inst{31-23} = 0x17e; - def _e64 : VOP3_64 < - {1, 1, op{6}, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}}, - opName, [] - >; + let DisableEncoding = "$dst"; + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; } -class VOP2_Helper <bits<6> op, RegisterClass vrc, RegisterClass arc, - string opName, list<dag> pattern> : - VOP2 < - op, (outs vrc:$dst), (ins arc:$src0, vrc:$src1), opName, pattern - >; +class SOPK <bits<5> op, dag outs, dag ins, string asm, list<dag> pattern> : + Enc32 <outs, ins , asm, pattern> { -multiclass VOP2_32 <bits<6> op, string opName, list<dag> pattern> { + bits <7> SDST; + bits <16> SIMM16; + + let Inst{15-0} = SIMM16; + let Inst{22-16} = SDST; + let Inst{27-23} = op; + let Inst{31-28} = 0xb; //encoding + + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; +} - def _e32 : VOP2_Helper <op, VReg_32, AllReg_32, opName, pattern>; +class SOPP <bits<7> op, dag ins, string asm, list<dag> pattern> : Enc32 < + (outs), + ins, + asm, + pattern > { - def _e64 : VOP3_32 <{1, 0, 0, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}}, - opName, [] - >; + bits <16> SIMM16; + + let Inst{15-0} = SIMM16; + let Inst{22-16} = op; + let Inst{31-23} = 0x17f; // encoding + + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; } -multiclass VOP2_64 <bits<6> op, string opName, list<dag> pattern> { - def _e32: VOP2_Helper <op, VReg_64, AllReg_64, opName, pattern>; +class SMRD <bits<5> op, bits<1> imm, dag outs, dag ins, string asm, + list<dag> pattern> : Enc32<outs, ins, asm, pattern> { + + bits<7> SDST; + bits<6> SBASE; + bits<8> OFFSET; + + let Inst{7-0} = OFFSET; + let Inst{8} = imm; + let Inst{14-9} = SBASE; + let Inst{21-15} = SDST; + let Inst{26-22} = op; + let Inst{31-27} = 0x18; //encoding + + let LGKM_CNT = 1; +} - def _e64 : VOP3_64 < - {1, 0, 0, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}}, - opName, [] - >; +//===----------------------------------------------------------------------===// +// Vector ALU operations +//===----------------------------------------------------------------------===// + +let Uses = [EXEC] in { + +class VOP1 <bits<8> op, dag outs, dag ins, string asm, list<dag> pattern> : + Enc32 <outs, ins, asm, pattern> { + + bits<8> VDST; + bits<9> SRC0; + + let Inst{8-0} = SRC0; + let Inst{16-9} = op; + let Inst{24-17} = VDST; + let Inst{31-25} = 0x3f; //encoding + + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; +} + +class VOP2 <bits<6> op, dag outs, dag ins, string asm, list<dag> pattern> : + Enc32 <outs, ins, asm, pattern> { + + bits<8> VDST; + bits<9> SRC0; + bits<8> VSRC1; + + let Inst{8-0} = SRC0; + let Inst{16-9} = VSRC1; + let Inst{24-17} = VDST; + let Inst{30-25} = op; + let Inst{31} = 0x0; //encoding + + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; } -class SOPK_32 <bits<5> op, string opName, list<dag> pattern> - : SOPK <op, (outs SReg_32:$dst), (ins i16imm:$src0), opName, pattern>; +class VOP3 <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> : + Enc64 <outs, ins, asm, pattern> { + + bits<8> VDST; + bits<9> SRC0; + bits<9> SRC1; + bits<9> SRC2; + bits<3> ABS; + bits<1> CLAMP; + bits<2> OMOD; + bits<3> NEG; + + 
let Inst{7-0} = VDST; + let Inst{10-8} = ABS; + let Inst{11} = CLAMP; + let Inst{25-17} = op; + let Inst{31-26} = 0x34; //encoding + let Inst{40-32} = SRC0; + let Inst{49-41} = SRC1; + let Inst{58-50} = SRC2; + let Inst{60-59} = OMOD; + let Inst{63-61} = NEG; + + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; +} + +class VOP3b <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> : + Enc64 <outs, ins, asm, pattern> { + + bits<8> VDST; + bits<9> SRC0; + bits<9> SRC1; + bits<9> SRC2; + bits<7> SDST; + bits<2> OMOD; + bits<3> NEG; + + let Inst{7-0} = VDST; + let Inst{14-8} = SDST; + let Inst{25-17} = op; + let Inst{31-26} = 0x34; //encoding + let Inst{40-32} = SRC0; + let Inst{49-41} = SRC1; + let Inst{58-50} = SRC2; + let Inst{60-59} = OMOD; + let Inst{63-61} = NEG; + + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; +} -class SOPK_64 <bits<5> op, string opName, list<dag> pattern> - : SOPK <op, (outs SReg_64:$dst), (ins i16imm:$src0), opName, pattern>; +class VOPC <bits<8> op, dag ins, string asm, list<dag> pattern> : + Enc32 <(outs VCCReg:$dst), ins, asm, pattern> { + + bits<9> SRC0; + bits<8> VSRC1; + + let Inst{8-0} = SRC0; + let Inst{16-9} = VSRC1; + let Inst{24-17} = op; + let Inst{31-25} = 0x3e; + + let DisableEncoding = "$dst"; + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; +} -class VOPC_Helper <bits<8> op, RegisterClass vrc, RegisterClass arc, - string opName, list<dag> pattern> : - VOPC < - op, (ins arc:$src0, vrc:$src1), opName, pattern - >; +class VINTRP <bits <2> op, dag outs, dag ins, string asm, list<dag> pattern> : + Enc32 <outs, ins, asm, pattern> { -multiclass VOPC_32 <bits<9> op, string opName, list<dag> pattern> { + bits<8> VDST; + bits<8> VSRC; + bits<2> ATTRCHAN; + bits<6> ATTR; - def _e32 : VOPC_Helper < - {op{7}, op{6}, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}}, - VReg_32, AllReg_32, opName, pattern - >; + let Inst{7-0} = VSRC; + let Inst{9-8} = ATTRCHAN; + let Inst{15-10} = ATTR; + let Inst{17-16} = op; + let Inst{25-18} = VDST; + let Inst{31-26} = 0x32; // encoding - def _e64 : VOP3_1_32 < - op, - opName, pattern - >; + let neverHasSideEffects = 1; + let mayLoad = 1; + let mayStore = 0; } -multiclass VOPC_64 <bits<8> op, string opName, list<dag> pattern> { +} // End Uses = [EXEC] - def _e32 : VOPC_Helper <op, VReg_64, AllReg_64, opName, pattern>; +//===----------------------------------------------------------------------===// +// Vector I/O operations +//===----------------------------------------------------------------------===// - def _e64 : VOP3_64 < - {0, op{7}, op{6}, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}}, - opName, [] - >; +let Uses = [EXEC] in { + +class MUBUF <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> : + Enc64<outs, ins, asm, pattern> { + + bits<8> VDATA; + bits<12> OFFSET; + bits<1> OFFEN; + bits<1> IDXEN; + bits<1> GLC; + bits<1> ADDR64; + bits<1> LDS; + bits<8> VADDR; + bits<5> SRSRC; + bits<1> SLC; + bits<1> TFE; + bits<8> SOFFSET; + + let Inst{11-0} = OFFSET; + let Inst{12} = OFFEN; + let Inst{13} = IDXEN; + let Inst{14} = GLC; + let Inst{15} = ADDR64; + let Inst{16} = LDS; + let Inst{24-18} = op; + let Inst{31-26} = 0x38; //encoding + let Inst{39-32} = VADDR; + let Inst{47-40} = VDATA; + let Inst{52-48} = SRSRC; + let Inst{54} = SLC; + let Inst{55} = TFE; + let Inst{63-56} = SOFFSET; + + let VM_CNT = 1; + let EXP_CNT = 1; + + let neverHasSideEffects = 1; } -class SOPC_32 <bits<7> op, string opName, list<dag> pattern> - : SOPC <op, (outs SCCReg:$dst), (ins 
SReg_32:$src0, SReg_32:$src1), opName, pattern>; +class MTBUF <bits<3> op, dag outs, dag ins, string asm, list<dag> pattern> : + Enc64<outs, ins, asm, pattern> { + + bits<8> VDATA; + bits<12> OFFSET; + bits<1> OFFEN; + bits<1> IDXEN; + bits<1> GLC; + bits<1> ADDR64; + bits<4> DFMT; + bits<3> NFMT; + bits<8> VADDR; + bits<5> SRSRC; + bits<1> SLC; + bits<1> TFE; + bits<8> SOFFSET; + + let Inst{11-0} = OFFSET; + let Inst{12} = OFFEN; + let Inst{13} = IDXEN; + let Inst{14} = GLC; + let Inst{15} = ADDR64; + let Inst{18-16} = op; + let Inst{22-19} = DFMT; + let Inst{25-23} = NFMT; + let Inst{31-26} = 0x3a; //encoding + let Inst{39-32} = VADDR; + let Inst{47-40} = VDATA; + let Inst{52-48} = SRSRC; + let Inst{54} = SLC; + let Inst{55} = TFE; + let Inst{63-56} = SOFFSET; + + let VM_CNT = 1; + let EXP_CNT = 1; + + let neverHasSideEffects = 1; +} -class SOPC_64 <bits<7> op, string opName, list<dag> pattern> - : SOPC <op, (outs SCCReg:$dst), (ins SReg_64:$src0, SReg_64:$src1), opName, pattern>; +class MIMG <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> : + Enc64 <outs, ins, asm, pattern> { + + bits<8> VDATA; + bits<4> DMASK; + bits<1> UNORM; + bits<1> GLC; + bits<1> DA; + bits<1> R128; + bits<1> TFE; + bits<1> LWE; + bits<1> SLC; + bits<8> VADDR; + bits<5> SRSRC; + bits<5> SSAMP; + + let Inst{11-8} = DMASK; + let Inst{12} = UNORM; + let Inst{13} = GLC; + let Inst{14} = DA; + let Inst{15} = R128; + let Inst{16} = TFE; + let Inst{17} = LWE; + let Inst{24-18} = op; + let Inst{25} = SLC; + let Inst{31-26} = 0x3c; + let Inst{39-32} = VADDR; + let Inst{47-40} = VDATA; + let Inst{52-48} = SRSRC; + let Inst{57-53} = SSAMP; + + let VM_CNT = 1; + let EXP_CNT = 1; +} + +def EXP : Enc64< + (outs), + (ins i32imm:$en, i32imm:$tgt, i32imm:$compr, i32imm:$done, i32imm:$vm, + VReg_32:$src0, VReg_32:$src1, VReg_32:$src2, VReg_32:$src3), + "EXP $en, $tgt, $compr, $done, $vm, $src0, $src1, $src2, $src3", + [] > { + + bits<4> EN; + bits<6> TGT; + bits<1> COMPR; + bits<1> DONE; + bits<1> VM; + bits<8> VSRC0; + bits<8> VSRC1; + bits<8> VSRC2; + bits<8> VSRC3; + + let Inst{3-0} = EN; + let Inst{9-4} = TGT; + let Inst{10} = COMPR; + let Inst{11} = DONE; + let Inst{12} = VM; + let Inst{31-26} = 0x3e; + let Inst{39-32} = VSRC0; + let Inst{47-40} = VSRC1; + let Inst{55-48} = VSRC2; + let Inst{63-56} = VSRC3; + + let EXP_CNT = 1; +} +} // End Uses = [EXEC] diff --git a/lib/Target/R600/SIInstrInfo.cpp b/lib/Target/R600/SIInstrInfo.cpp index c6ad4d5..d9dbd6a 100644 --- a/lib/Target/R600/SIInstrInfo.cpp +++ b/lib/Target/R600/SIInstrInfo.cpp @@ -41,7 +41,15 @@ SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, // never be necessary. 
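// Note on the VReg_64 case added below: there is no single 64-bit VALU move
// on SI, so the copy is split into two V_MOV_B32_e32, one per 32-bit half;
// the implicit def of DestReg on the first move marks the full 64-bit
// register as defined for liveness purposes.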
assert(DestReg != AMDGPU::SCC && SrcReg != AMDGPU::SCC); - if (AMDGPU::SReg_64RegClass.contains(DestReg)) { + if (AMDGPU::VReg_64RegClass.contains(DestReg)) { + assert(AMDGPU::VReg_64RegClass.contains(SrcReg) || + AMDGPU::SReg_64RegClass.contains(SrcReg)); + BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), RI.getSubReg(DestReg, AMDGPU::sub0)) + .addReg(RI.getSubReg(SrcReg, AMDGPU::sub0), getKillRegState(KillSrc)) + .addReg(DestReg, RegState::Define | RegState::Implicit); + BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), RI.getSubReg(DestReg, AMDGPU::sub1)) + .addReg(RI.getSubReg(SrcReg, AMDGPU::sub1), getKillRegState(KillSrc)); + } else if (AMDGPU::SReg_64RegClass.contains(DestReg)) { assert(AMDGPU::SReg_64RegClass.contains(SrcReg)); BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg) .addReg(SrcReg, getKillRegState(KillSrc)); @@ -58,9 +66,19 @@ SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, } } +MachineInstr *SIInstrInfo::commuteInstruction(MachineInstr *MI, + bool NewMI) const { + + if (MI->getNumOperands() < 3 || !MI->getOperand(1).isReg() || + !MI->getOperand(2).isReg()) + return 0; + + return TargetInstrInfo::commuteInstruction(MI, NewMI); +} + MachineInstr * SIInstrInfo::getMovImmInstr(MachineFunction *MF, unsigned DstReg, int64_t Imm) const { - MachineInstr * MI = MF->CreateMachineInstr(get(AMDGPU::V_MOV_IMM_I32), DebugLoc()); + MachineInstr * MI = MF->CreateMachineInstr(get(AMDGPU::V_MOV_B32_e32), DebugLoc()); MachineInstrBuilder MIB(*MF, MI); MIB.addReg(DstReg, RegState::Define); MIB.addImm(Imm); @@ -76,9 +94,6 @@ bool SIInstrInfo::isMov(unsigned Opcode) const { case AMDGPU::S_MOV_B64: case AMDGPU::V_MOV_B32_e32: case AMDGPU::V_MOV_B32_e64: - case AMDGPU::V_MOV_IMM_F32: - case AMDGPU::V_MOV_IMM_I32: - case AMDGPU::S_MOV_IMM_I32: return true; } } @@ -87,3 +102,51 @@ bool SIInstrInfo::isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const { return RC != &AMDGPU::EXECRegRegClass; } + +//===----------------------------------------------------------------------===// +// Indirect addressing callbacks +//===----------------------------------------------------------------------===// + +unsigned SIInstrInfo::calculateIndirectAddress(unsigned RegIndex, + unsigned Channel) const { + assert(Channel == 0); + return RegIndex; +} + + +int SIInstrInfo::getIndirectIndexBegin(const MachineFunction &MF) const { + llvm_unreachable("Unimplemented"); +} + +int SIInstrInfo::getIndirectIndexEnd(const MachineFunction &MF) const { + llvm_unreachable("Unimplemented"); +} + +const TargetRegisterClass *SIInstrInfo::getIndirectAddrStoreRegClass( + unsigned SourceReg) const { + llvm_unreachable("Unimplemented"); +} + +const TargetRegisterClass *SIInstrInfo::getIndirectAddrLoadRegClass() const { + llvm_unreachable("Unimplemented"); +} + +MachineInstrBuilder SIInstrInfo::buildIndirectWrite( + MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, + unsigned Address, unsigned OffsetReg) const { + llvm_unreachable("Unimplemented"); +} + +MachineInstrBuilder SIInstrInfo::buildIndirectRead( + MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, + unsigned Address, unsigned OffsetReg) const { + llvm_unreachable("Unimplemented"); +} + +const TargetRegisterClass *SIInstrInfo::getSuperIndirectRegClass() const { + llvm_unreachable("Unimplemented"); +} diff --git a/lib/Target/R600/SIInstrInfo.h b/lib/Target/R600/SIInstrInfo.h index 631f6c0..5789af5 100644 --- a/lib/Target/R600/SIInstrInfo.h +++ b/lib/Target/R600/SIInstrInfo.h @@ -35,11 +35,8 @@ public: unsigned 
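The copyPhysReg hunk above lowers a 64-bit VGPR-to-VGPR copy into two 32-bit V_MOV_B32 moves over the sub0/sub1 halves; the first move also carries an implicit define of the full 64-bit destination so liveness sees the whole register as defined before the second half is written. A standalone model of the splitting strategy, with invented register numbering and output syntax:

#include <cstdio>

// Model of the sub-register split; only the strategy (two 32-bit moves plus
// an implicit def of the full destination on the first one) comes from the
// patch, the printed form is illustrative.
void copyVReg64(unsigned DstBase, unsigned SrcBase) {
  std::printf("V_MOV_B32_e32 v%u, v%u  ; implicit-def v[%u:%u]\n",
              DstBase, SrcBase, DstBase, DstBase + 1);              // sub0
  std::printf("V_MOV_B32_e32 v%u, v%u\n", DstBase + 1, SrcBase + 1); // sub1
}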
DestReg, unsigned SrcReg, bool KillSrc) const; - /// \returns the encoding type of this instruction. - unsigned getEncodingType(const MachineInstr &MI) const; - - /// \returns the size of this instructions encoding in number of bytes. - unsigned getEncodingBytes(const MachineInstr &MI) const; + virtual MachineInstr *commuteInstruction(MachineInstr *MI, + bool NewMI=false) const; virtual MachineInstr * getMovImmInstr(MachineFunction *MF, unsigned DstReg, int64_t Imm) const; @@ -48,14 +45,48 @@ public: virtual bool isMov(unsigned Opcode) const; virtual bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const; + + virtual int getIndirectIndexBegin(const MachineFunction &MF) const; + + virtual int getIndirectIndexEnd(const MachineFunction &MF) const; + + virtual unsigned calculateIndirectAddress(unsigned RegIndex, + unsigned Channel) const; + + virtual const TargetRegisterClass *getIndirectAddrStoreRegClass( + unsigned SourceReg) const; + + virtual const TargetRegisterClass *getIndirectAddrLoadRegClass() const; + + virtual MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, + unsigned Address, + unsigned OffsetReg) const; + + virtual MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, + unsigned Address, + unsigned OffsetReg) const; + + virtual const TargetRegisterClass *getSuperIndirectRegClass() const; }; +namespace AMDGPU { + + int getVOPe64(uint16_t Opcode); + +} // End namespace AMDGPU + } // End namespace llvm namespace SIInstrFlags { enum Flags { // First 4 bits are the instruction encoding - NEED_WAIT = 1 << 4 + VM_CNT = 1 << 0, + EXP_CNT = 1 << 1, + LGKM_CNT = 1 << 2 }; } diff --git a/lib/Target/R600/SIInstrInfo.td b/lib/Target/R600/SIInstrInfo.td index 873a451..d6c3f06 100644 --- a/lib/Target/R600/SIInstrInfo.td +++ b/lib/Target/R600/SIInstrInfo.td @@ -1,4 +1,4 @@ -//===-- SIInstrInfo.td - SI Instruction Encodings ---------*- tablegen -*--===// +//===-- SIInstrInfo.td - SI Instruction Infos -------------*- tablegen -*--===// // // The LLVM Compiler Infrastructure // @@ -8,521 +8,280 @@ //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// -// SI DAG Profiles -//===----------------------------------------------------------------------===// -def SDTVCCBinaryOp : SDTypeProfile<1, 2, [ - SDTCisInt<0>, SDTCisInt<1>, SDTCisSameAs<1, 2> -]>; - -//===----------------------------------------------------------------------===// // SI DAG Nodes //===----------------------------------------------------------------------===// -// and operation on 64-bit wide vcc -def SIsreg1_and : SDNode<"SIISD::VCC_AND", SDTVCCBinaryOp, - [SDNPCommutative, SDNPAssociative] ->; - -// Special bitcast node for sharing VCC register between VALU and SALU -def SIsreg1_bitcast : SDNode<"SIISD::VCC_BITCAST", - SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisInt<1>]> +// SMRD takes a 64bit memory address and can only add a 32bit offset +def SIadd64bit32bit : SDNode<"ISD::ADD", + SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>, SDTCisVT<0, i64>, SDTCisVT<2, i32>]> >; -// and operation on 64-bit wide vcc -def SIvcc_and : SDNode<"SIISD::VCC_AND", SDTVCCBinaryOp, - [SDNPCommutative, SDNPAssociative] +// Transformation function, extract the lower 32bit of a 64bit immediate +def LO32 : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(N->getZExtValue() & 0xffffffff, MVT::i32); +}]>; + +// 
Transformation function, extract the upper 32bit of a 64bit immediate +def HI32 : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(N->getZExtValue() >> 32, MVT::i32); +}]>; + +def IMM8bitDWORD : ImmLeaf < + i32, [{ + return (Imm & ~0x3FC) == 0; + }], SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant( + N->getZExtValue() >> 2, MVT::i32); + }]> >; -// Special bitcast node for sharing VCC register between VALU and SALU -def SIvcc_bitcast : SDNode<"SIISD::VCC_BITCAST", - SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisInt<1>]> ->; - -class InstSI <dag outs, dag ins, string asm, list<dag> pattern> : - AMDGPUInst<outs, ins, asm, pattern> { - - field bits<4> EncodingType = 0; - field bits<1> NeedWait = 0; - - let TSFlags{3-0} = EncodingType; - let TSFlags{4} = NeedWait; - -} - -class Enc32 <dag outs, dag ins, string asm, list<dag> pattern> : - InstSI <outs, ins, asm, pattern> { - - field bits<32> Inst; -} - -class Enc64 <dag outs, dag ins, string asm, list<dag> pattern> : - InstSI <outs, ins, asm, pattern> { - - field bits<64> Inst; -} - -class SIOperand <ValueType vt, dag opInfo>: Operand <vt> { - let EncoderMethod = "encodeOperand"; - let MIOperandInfo = opInfo; -} - -def IMM16bit : ImmLeaf < +def IMM12bit : ImmLeaf < i16, - [{return isInt<16>(Imm);}] + [{return isUInt<12>(Imm);}] >; -def IMM8bit : ImmLeaf < - i32, - [{return (int32_t)Imm >= 0 && (int32_t)Imm <= 0xff;}] ->; +class InlineImm <ValueType vt> : PatLeaf <(vt imm), [{ + return ((const SITargetLowering &)TLI).analyzeImmediate(N) == 0; +}]>; -def IMM12bit : ImmLeaf < - i16, - [{return (int16_t)Imm >= 0 && (int16_t)Imm <= 0xfff;}] ->; +//===----------------------------------------------------------------------===// +// SI assembler operands +//===----------------------------------------------------------------------===// -def IMM32bitIn64bit : ImmLeaf < - i64, - [{return isInt<32>(Imm);}] ->; +def SIOperand { + int ZERO = 0x80; + int VCC = 0x6A; +} class GPR4Align <RegisterClass rc> : Operand <vAny> { let EncoderMethod = "GPR4AlignEncode"; let MIOperandInfo = (ops rc:$reg); } -class GPR2Align <RegisterClass rc, ValueType vt> : Operand <vt> { +class GPR2Align <RegisterClass rc> : Operand <iPTR> { let EncoderMethod = "GPR2AlignEncode"; let MIOperandInfo = (ops rc:$reg); } -def SMRDmemrr : Operand<iPTR> { - let MIOperandInfo = (ops SReg_64, SReg_32); - let EncoderMethod = "GPR2AlignEncode"; -} - -def SMRDmemri : Operand<iPTR> { - let MIOperandInfo = (ops SReg_64, i32imm); - let EncoderMethod = "SMRDmemriEncode"; -} - -def ADDR_Reg : ComplexPattern<i64, 2, "SelectADDRReg", [], []>; -def ADDR_Offset8 : ComplexPattern<i64, 2, "SelectADDR8BitOffset", [], []>; - -let Uses = [EXEC] in { - -def EXP : Enc64< - (outs), - (ins i32imm:$en, i32imm:$tgt, i32imm:$compr, i32imm:$done, i32imm:$vm, - VReg_32:$src0, VReg_32:$src1, VReg_32:$src2, VReg_32:$src3), - "EXP $en, $tgt, $compr, $done, $vm, $src0, $src1, $src2, $src3", - [] > { - - bits<4> EN; - bits<6> TGT; - bits<1> COMPR; - bits<1> DONE; - bits<1> VM; - bits<8> VSRC0; - bits<8> VSRC1; - bits<8> VSRC2; - bits<8> VSRC3; - - let Inst{3-0} = EN; - let Inst{9-4} = TGT; - let Inst{10} = COMPR; - let Inst{11} = DONE; - let Inst{12} = VM; - let Inst{31-26} = 0x3e; - let Inst{39-32} = VSRC0; - let Inst{47-40} = VSRC1; - let Inst{55-48} = VSRC2; - let Inst{63-56} = VSRC3; - let EncodingType = 0; //SIInstrEncodingType::EXP - - let NeedWait = 1; - let usesCustomInserter = 1; -} - -class MIMG <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> : - Enc64 <outs, ins, asm, pattern> { - - bits<8> 
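A standalone restatement of the arithmetic behind the LO32/HI32 XForms and the IMM8bitDWORD leaf above: the XForms split a 64-bit immediate into 32-bit halves, and IMM8bitDWORD accepts a dword-aligned value that fits in 8 bits once divided by 4 (only bits 9..2 may be set), encoding it in dword units. The function names are invented:

#include <cassert>
#include <cstdint>

// Plain-C++ equivalents of the LO32/HI32 SDNodeXForms.
uint32_t lo32(uint64_t Imm) { return uint32_t(Imm & 0xffffffff); }
uint32_t hi32(uint64_t Imm) { return uint32_t(Imm >> 32); }

// IMM8bitDWORD: dword-aligned and representable in 8 bits after dividing
// by 4, i.e. no bits outside 0x3FC.
bool isImm8bitDWORD(uint32_t Imm) { return (Imm & ~0x3FCu) == 0; }

uint32_t encodeImm8bitDWORD(uint32_t Imm) {
  assert(isImm8bitDWORD(Imm));
  return Imm >> 2; // stored in dword units, as in the attached XForm
}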
VDATA; - bits<4> DMASK; - bits<1> UNORM; - bits<1> GLC; - bits<1> DA; - bits<1> R128; - bits<1> TFE; - bits<1> LWE; - bits<1> SLC; - bits<8> VADDR; - bits<5> SRSRC; - bits<5> SSAMP; - - let Inst{11-8} = DMASK; - let Inst{12} = UNORM; - let Inst{13} = GLC; - let Inst{14} = DA; - let Inst{15} = R128; - let Inst{16} = TFE; - let Inst{17} = LWE; - let Inst{24-18} = op; - let Inst{25} = SLC; - let Inst{31-26} = 0x3c; - let Inst{39-32} = VADDR; - let Inst{47-40} = VDATA; - let Inst{52-48} = SRSRC; - let Inst{57-53} = SSAMP; - - let EncodingType = 2; //SIInstrEncodingType::MIMG - - let NeedWait = 1; - let usesCustomInserter = 1; -} - -class MTBUF <bits<3> op, dag outs, dag ins, string asm, list<dag> pattern> : - Enc64<outs, ins, asm, pattern> { - - bits<8> VDATA; - bits<12> OFFSET; - bits<1> OFFEN; - bits<1> IDXEN; - bits<1> GLC; - bits<1> ADDR64; - bits<4> DFMT; - bits<3> NFMT; - bits<8> VADDR; - bits<5> SRSRC; - bits<1> SLC; - bits<1> TFE; - bits<8> SOFFSET; - - let Inst{11-0} = OFFSET; - let Inst{12} = OFFEN; - let Inst{13} = IDXEN; - let Inst{14} = GLC; - let Inst{15} = ADDR64; - let Inst{18-16} = op; - let Inst{22-19} = DFMT; - let Inst{25-23} = NFMT; - let Inst{31-26} = 0x3a; //encoding - let Inst{39-32} = VADDR; - let Inst{47-40} = VDATA; - let Inst{52-48} = SRSRC; - let Inst{54} = SLC; - let Inst{55} = TFE; - let Inst{63-56} = SOFFSET; - let EncodingType = 3; //SIInstrEncodingType::MTBUF - - let NeedWait = 1; - let usesCustomInserter = 1; - let neverHasSideEffects = 1; -} - -class MUBUF <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> : - Enc64<outs, ins, asm, pattern> { - - bits<8> VDATA; - bits<12> OFFSET; - bits<1> OFFEN; - bits<1> IDXEN; - bits<1> GLC; - bits<1> ADDR64; - bits<1> LDS; - bits<8> VADDR; - bits<5> SRSRC; - bits<1> SLC; - bits<1> TFE; - bits<8> SOFFSET; - - let Inst{11-0} = OFFSET; - let Inst{12} = OFFEN; - let Inst{13} = IDXEN; - let Inst{14} = GLC; - let Inst{15} = ADDR64; - let Inst{16} = LDS; - let Inst{24-18} = op; - let Inst{31-26} = 0x38; //encoding - let Inst{39-32} = VADDR; - let Inst{47-40} = VDATA; - let Inst{52-48} = SRSRC; - let Inst{54} = SLC; - let Inst{55} = TFE; - let Inst{63-56} = SOFFSET; - let EncodingType = 4; //SIInstrEncodingType::MUBUF - - let NeedWait = 1; - let usesCustomInserter = 1; - let neverHasSideEffects = 1; -} +include "SIInstrFormats.td" -} // End Uses = [EXEC] - -class SMRD <bits<5> op, dag outs, dag ins, string asm, list<dag> pattern> : - Enc32<outs, ins, asm, pattern> { - - bits<7> SDST; - bits<15> PTR; - bits<8> OFFSET = PTR{7-0}; - bits<1> IMM = PTR{8}; - bits<6> SBASE = PTR{14-9}; - - let Inst{7-0} = OFFSET; - let Inst{8} = IMM; - let Inst{14-9} = SBASE; - let Inst{21-15} = SDST; - let Inst{26-22} = op; - let Inst{31-27} = 0x18; //encoding - let EncodingType = 5; //SIInstrEncodingType::SMRD - - let NeedWait = 1; - let usesCustomInserter = 1; -} +//===----------------------------------------------------------------------===// +// +// SI Instruction multiclass helpers. +// +// Instructions with _32 take 32-bit operands. +// Instructions with _64 take 64-bit operands. +// +// VOP_* instructions can use either a 32-bit or 64-bit encoding. The 32-bit +// encoding is the standard encoding, but instructions that make use of +// any of the instruction modifiers must use the 64-bit encoding. +// +// Instructions with _e32 use the 32-bit encoding. +// Instructions with _e64 use the 64-bit encoding. 
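As the comment above states, the compact _e32 form is only usable when none of the VOP3-level modifiers are involved. A hedged sketch of that selection rule; the struct layout is an assumption, the rule itself paraphrases the comment:

#include <cstdint>

// Any use of abs, clamp, omod or neg forces the 64-bit _e64 encoding; the
// field widths here are illustrative.
struct VOPModifiers {
  uint8_t Abs;   // per-source absolute value
  bool    Clamp; // clamp result
  uint8_t Omod;  // output modifier
  uint8_t Neg;   // per-source negate
};

bool needsE64Encoding(const VOPModifiers &M) {
  return M.Abs != 0 || M.Clamp || M.Omod != 0 || M.Neg != 0;
}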
+// +//===----------------------------------------------------------------------===// -class SOP1 <bits<8> op, dag outs, dag ins, string asm, list<dag> pattern> : - Enc32<outs, ins, asm, pattern> { +//===----------------------------------------------------------------------===// +// Scalar classes +//===----------------------------------------------------------------------===// - bits<7> SDST; - bits<8> SSRC0; +class SOP1_32 <bits<8> op, string opName, list<dag> pattern> : SOP1 < + op, (outs SReg_32:$dst), (ins SSrc_32:$src0), + opName#" $dst, $src0", pattern +>; - let Inst{7-0} = SSRC0; - let Inst{15-8} = op; - let Inst{22-16} = SDST; - let Inst{31-23} = 0x17d; //encoding; - let EncodingType = 6; //SIInstrEncodingType::SOP1 +class SOP1_64 <bits<8> op, string opName, list<dag> pattern> : SOP1 < + op, (outs SReg_64:$dst), (ins SSrc_64:$src0), + opName#" $dst, $src0", pattern +>; - let mayLoad = 0; - let mayStore = 0; - let hasSideEffects = 0; -} +class SOP2_32 <bits<7> op, string opName, list<dag> pattern> : SOP2 < + op, (outs SReg_32:$dst), (ins SSrc_32:$src0, SSrc_32:$src1), + opName#" $dst, $src0, $src1", pattern +>; -class SOP2 <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> : - Enc32 <outs, ins, asm, pattern> { - - bits<7> SDST; - bits<8> SSRC0; - bits<8> SSRC1; +class SOP2_64 <bits<7> op, string opName, list<dag> pattern> : SOP2 < + op, (outs SReg_64:$dst), (ins SSrc_64:$src0, SSrc_64:$src1), + opName#" $dst, $src0, $src1", pattern +>; - let Inst{7-0} = SSRC0; - let Inst{15-8} = SSRC1; - let Inst{22-16} = SDST; - let Inst{29-23} = op; - let Inst{31-30} = 0x2; // encoding - let EncodingType = 7; // SIInstrEncodingType::SOP2 +class SOPC_32 <bits<7> op, string opName, list<dag> pattern> : SOPC < + op, (outs SCCReg:$dst), (ins SSrc_32:$src0, SSrc_32:$src1), + opName#" $dst, $src0, $src1", pattern +>; - let mayLoad = 0; - let mayStore = 0; - let hasSideEffects = 0; -} +class SOPC_64 <bits<7> op, string opName, list<dag> pattern> : SOPC < + op, (outs SCCReg:$dst), (ins SSrc_64:$src0, SSrc_64:$src1), + opName#" $dst, $src0, $src1", pattern +>; -class SOPC <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> : - Enc32<outs, ins, asm, pattern> { +class SOPK_32 <bits<5> op, string opName, list<dag> pattern> : SOPK < + op, (outs SReg_32:$dst), (ins i16imm:$src0), + opName#" $dst, $src0", pattern +>; - bits<8> SSRC0; - bits<8> SSRC1; +class SOPK_64 <bits<5> op, string opName, list<dag> pattern> : SOPK < + op, (outs SReg_64:$dst), (ins i16imm:$src0), + opName#" $dst, $src0", pattern +>; - let Inst{7-0} = SSRC0; - let Inst{15-8} = SSRC1; - let Inst{22-16} = op; - let Inst{31-23} = 0x17e; - let EncodingType = 8; // SIInstrEncodingType::SOPC +multiclass SMRD_Helper <bits<5> op, string asm, RegisterClass dstClass> { + def _IMM : SMRD < + op, 1, (outs dstClass:$dst), + (ins GPR2Align<SReg_64>:$sbase, i32imm:$offset), + asm#" $dst, $sbase, $offset", [] + >; - let DisableEncoding = "$dst"; - let mayLoad = 0; - let mayStore = 0; - let hasSideEffects = 0; + def _SGPR : SMRD < + op, 0, (outs dstClass:$dst), + (ins GPR2Align<SReg_64>:$sbase, SReg_32:$soff), + asm#" $dst, $sbase, $soff", [] + >; } -class SOPK <bits<5> op, dag outs, dag ins, string asm, list<dag> pattern> : - Enc32 <outs, ins , asm, pattern> { - - bits <7> SDST; - bits <16> SIMM16; - - let Inst{15-0} = SIMM16; - let Inst{22-16} = SDST; - let Inst{27-23} = op; - let Inst{31-28} = 0xb; //encoding - let EncodingType = 9; // SIInstrEncodingType::SOPK 
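SMRD_Helper above instantiates each scalar load twice: an _IMM variant with the IMM bit set and an immediate offset, and an _SGPR variant that takes the offset from a scalar register. A standalone sketch using the field positions of the SMRD encoding shown (in its removed form) further down this file; the helper function itself is illustrative:

#include <cstdint>

// Pack an SMRD word following the Inst{...} layout of the SMRD class
// (OFFSET in bits 7-0, IMM in bit 8, SBASE in 14-9, SDST in 21-15,
// op in 26-22, fixed encoding 0x18 in 31-27).
uint32_t encodeSMRD(uint8_t op, uint8_t sdst, uint8_t sbase,
                    bool immForm, uint8_t offsetOrSgpr) {
  uint32_t Inst = 0;
  Inst |= uint32_t(offsetOrSgpr);        // _IMM: offset; _SGPR: register
  Inst |= uint32_t(immForm)      << 8;
  Inst |= uint32_t(sbase & 0x3f) << 9;
  Inst |= uint32_t(sdst  & 0x7f) << 15;
  Inst |= uint32_t(op    & 0x1f) << 22;
  Inst |= 0x18u                  << 27;
  return Inst;
}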
+//===----------------------------------------------------------------------===// +// Vector ALU classes +//===----------------------------------------------------------------------===// - let mayLoad = 0; - let mayStore = 0; - let hasSideEffects = 0; +class VOP <string opName> { + string OpName = opName; } -class SOPP <bits<7> op, dag ins, string asm, list<dag> pattern> : Enc32 < - (outs), - ins, - asm, - pattern > { - - bits <16> SIMM16; - - let Inst{15-0} = SIMM16; - let Inst{22-16} = op; - let Inst{31-23} = 0x17f; // encoding - let EncodingType = 10; // SIInstrEncodingType::SOPP - - let mayLoad = 0; - let mayStore = 0; - let hasSideEffects = 0; -} - -let Uses = [EXEC] in { - -class VINTRP <bits <2> op, dag outs, dag ins, string asm, list<dag> pattern> : - Enc32 <outs, ins, asm, pattern> { - - bits<8> VDST; - bits<8> VSRC; - bits<2> ATTRCHAN; - bits<6> ATTR; - - let Inst{7-0} = VSRC; - let Inst{9-8} = ATTRCHAN; - let Inst{15-10} = ATTR; - let Inst{17-16} = op; - let Inst{25-18} = VDST; - let Inst{31-26} = 0x32; // encoding - let EncodingType = 11; // SIInstrEncodingType::VINTRP - - let neverHasSideEffects = 1; - let mayLoad = 1; - let mayStore = 0; +multiclass VOP1_Helper <bits<8> op, RegisterClass drc, RegisterClass src, + string opName, list<dag> pattern> { + + def _e32 : VOP1 < + op, (outs drc:$dst), (ins src:$src0), + opName#"_e32 $dst, $src0", pattern + >, VOP <opName>; + + def _e64 : VOP3 < + {1, 1, op{6}, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}}, + (outs drc:$dst), + (ins src:$src0, + i32imm:$abs, i32imm:$clamp, + i32imm:$omod, i32imm:$neg), + opName#"_e64 $dst, $src0, $abs, $clamp, $omod, $neg", [] + >, VOP <opName> { + let SRC1 = SIOperand.ZERO; + let SRC2 = SIOperand.ZERO; + } } -class VOP1 <bits<8> op, dag outs, dag ins, string asm, list<dag> pattern> : - Enc32 <outs, ins, asm, pattern> { - - bits<8> VDST; - bits<9> SRC0; - - let Inst{8-0} = SRC0; - let Inst{16-9} = op; - let Inst{24-17} = VDST; - let Inst{31-25} = 0x3f; //encoding - - let EncodingType = 12; // SIInstrEncodingType::VOP1 - let PostEncoderMethod = "VOPPostEncode"; - - let mayLoad = 0; - let mayStore = 0; - let hasSideEffects = 0; +multiclass VOP1_32 <bits<8> op, string opName, list<dag> pattern> + : VOP1_Helper <op, VReg_32, VSrc_32, opName, pattern>; + +multiclass VOP1_64 <bits<8> op, string opName, list<dag> pattern> + : VOP1_Helper <op, VReg_64, VSrc_64, opName, pattern>; + +multiclass VOP2_Helper <bits<6> op, RegisterClass vrc, RegisterClass arc, + string opName, list<dag> pattern> { + def _e32 : VOP2 < + op, (outs vrc:$dst), (ins arc:$src0, vrc:$src1), + opName#"_e32 $dst, $src0, $src1", pattern + >, VOP <opName>; + + def _e64 : VOP3 < + {1, 0, 0, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}}, + (outs vrc:$dst), + (ins arc:$src0, arc:$src1, + i32imm:$abs, i32imm:$clamp, + i32imm:$omod, i32imm:$neg), + opName#"_e64 $dst, $src0, $src1, $abs, $clamp, $omod, $neg", [] + >, VOP <opName> { + let SRC2 = SIOperand.ZERO; + } } -class VOP2 <bits<6> op, dag outs, dag ins, string asm, list<dag> pattern> : - Enc32 <outs, ins, asm, pattern> { - - bits<8> VDST; - bits<9> SRC0; - bits<8> VSRC1; - - let Inst{8-0} = SRC0; - let Inst{16-9} = VSRC1; - let Inst{24-17} = VDST; - let Inst{30-25} = op; - let Inst{31} = 0x0; //encoding - - let EncodingType = 13; // SIInstrEncodingType::VOP2 - let PostEncoderMethod = "VOPPostEncode"; - - let mayLoad = 0; - let mayStore = 0; - let hasSideEffects = 0; +multiclass VOP2_32 <bits<6> op, string opName, list<dag> pattern> + : VOP2_Helper <op, VReg_32, VSrc_32, opName, pattern>; + +multiclass 
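The bit-list opcodes handed to the _e64 defs above concatenate a fixed prefix with the e32 opcode, which places each encoding family in its own region of the 9-bit VOP3 opcode space. Summarized as plain C++ (a reading of the TableGen bit lists, not an authoritative opcode map):

#include <cstdint>

// VOP1 e64: {1, 1, op{6-0}}    -> 0x180 | op
// VOP2 e64: {1, 0, 0, op{5-0}} -> 0x100 | op
// VOPC e64: {0, op{7-0}}       -> op itself
uint16_t vop1ToVOP3(uint8_t op) { return 0x180 | (op & 0x7f); }
uint16_t vop2ToVOP3(uint8_t op) { return 0x100 | (op & 0x3f); }
uint16_t vopcToVOP3(uint8_t op) { return op; }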
VOP2_64 <bits<6> op, string opName, list<dag> pattern> + : VOP2_Helper <op, VReg_64, VSrc_64, opName, pattern>; + +multiclass VOP2b_32 <bits<6> op, string opName, list<dag> pattern> { + + def _e32 : VOP2 < + op, (outs VReg_32:$dst), (ins VSrc_32:$src0, VReg_32:$src1), + opName#"_e32 $dst, $src0, $src1", pattern + >, VOP <opName>; + + def _e64 : VOP3b < + {1, 0, 0, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}}, + (outs VReg_32:$dst), + (ins VSrc_32:$src0, VSrc_32:$src1, + i32imm:$abs, i32imm:$clamp, + i32imm:$omod, i32imm:$neg), + opName#"_e64 $dst, $src0, $src1, $abs, $clamp, $omod, $neg", [] + >, VOP <opName> { + let SRC2 = SIOperand.ZERO; + /* the VOP2 variant puts the carry out into VCC, the VOP3 variant + can write it into any SGPR. We currently don't use the carry out, + so for now hardcode it to VCC as well */ + let SDST = SIOperand.VCC; + } } -class VOP3 <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> : - Enc64 <outs, ins, asm, pattern> { - - bits<8> VDST; - bits<9> SRC0; - bits<9> SRC1; - bits<9> SRC2; - bits<3> ABS; - bits<1> CLAMP; - bits<2> OMOD; - bits<3> NEG; - - let Inst{7-0} = VDST; - let Inst{10-8} = ABS; - let Inst{11} = CLAMP; - let Inst{25-17} = op; - let Inst{31-26} = 0x34; //encoding - let Inst{40-32} = SRC0; - let Inst{49-41} = SRC1; - let Inst{58-50} = SRC2; - let Inst{60-59} = OMOD; - let Inst{63-61} = NEG; - - let EncodingType = 14; // SIInstrEncodingType::VOP3 - let PostEncoderMethod = "VOPPostEncode"; - - let mayLoad = 0; - let mayStore = 0; - let hasSideEffects = 0; +multiclass VOPC_Helper <bits<8> op, RegisterClass vrc, RegisterClass arc, + string opName, ValueType vt, PatLeaf cond> { + + def _e32 : VOPC < + op, (ins arc:$src0, vrc:$src1), + opName#"_e32 $dst, $src0, $src1", [] + >, VOP <opName>; + + def _e64 : VOP3 < + {0, op{7}, op{6}, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}}, + (outs SReg_64:$dst), + (ins arc:$src0, arc:$src1, + InstFlag:$abs, InstFlag:$clamp, + InstFlag:$omod, InstFlag:$neg), + opName#"_e64 $dst, $src0, $src1, $abs, $clamp, $omod, $neg", + !if(!eq(!cast<string>(cond), "COND_NULL"), []<dag>, + [(set SReg_64:$dst, (i1 (setcc (vt arc:$src0), arc:$src1, cond)))] + ) + >, VOP <opName> { + let SRC2 = SIOperand.ZERO; + } } -class VOP3b <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> : - Enc64 <outs, ins, asm, pattern> { - - bits<8> VDST; - bits<9> SRC0; - bits<9> SRC1; - bits<9> SRC2; - bits<7> SDST; - bits<2> OMOD; - bits<3> NEG; - - let Inst{7-0} = VDST; - let Inst{14-8} = SDST; - let Inst{25-17} = op; - let Inst{31-26} = 0x34; //encoding - let Inst{40-32} = SRC0; - let Inst{49-41} = SRC1; - let Inst{58-50} = SRC2; - let Inst{60-59} = OMOD; - let Inst{63-61} = NEG; - - let EncodingType = 14; // SIInstrEncodingType::VOP3 - let PostEncoderMethod = "VOPPostEncode"; - - let mayLoad = 0; - let mayStore = 0; - let hasSideEffects = 0; -} +multiclass VOPC_32 <bits<8> op, string opName, + ValueType vt = untyped, PatLeaf cond = COND_NULL> + : VOPC_Helper <op, VReg_32, VSrc_32, opName, vt, cond>; -class VOPC <bits<8> op, dag ins, string asm, list<dag> pattern> : - Enc32 <(outs VCCReg:$dst), ins, asm, pattern> { +multiclass VOPC_64 <bits<8> op, string opName, + ValueType vt = untyped, PatLeaf cond = COND_NULL> + : VOPC_Helper <op, VReg_64, VSrc_64, opName, vt, cond>; - bits<9> SRC0; - bits<8> VSRC1; +class VOP3_32 <bits<9> op, string opName, list<dag> pattern> : VOP3 < + op, (outs VReg_32:$dst), + (ins VSrc_32:$src0, VSrc_32:$src1, VSrc_32:$src2, + i32imm:$abs, i32imm:$clamp, i32imm:$omod, i32imm:$neg), + opName#" $dst, $src0, 
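The VOP2b_32 comment above documents the carry-out convention: the e32 encoding has no carry-destination field, so the carry always lands in VCC, and this patch pins the e64 form's SDST to VCC as well so the two encodings behave identically. A standalone model; V_ADD_I32 as the example user and the printed syntax are assumptions (its defm is outside this hunk), while 0x6A is SIOperand.VCC from the patch:

#include <cstdio>

void emitAddI32(unsigned vdst, unsigned src0, unsigned src1, bool e64) {
  // In both forms the carry-out goes to VCC (operand encoding 0x6A).
  std::printf("V_ADD_I32_%s v%u, vcc, v%u, v%u\n",
              e64 ? "e64" : "e32", vdst, src0, src1);
}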
$src1, $src2, $abs, $clamp, $omod, $neg", pattern +>, VOP <opName>; - let Inst{8-0} = SRC0; - let Inst{16-9} = VSRC1; - let Inst{24-17} = op; - let Inst{31-25} = 0x3e; - - let EncodingType = 15; //SIInstrEncodingType::VOPC - let PostEncoderMethod = "VOPPostEncode"; - let DisableEncoding = "$dst"; - let mayLoad = 0; - let mayStore = 0; - let hasSideEffects = 0; -} +class VOP3_64 <bits<9> op, string opName, list<dag> pattern> : VOP3 < + op, (outs VReg_64:$dst), + (ins VSrc_64:$src0, VSrc_64:$src1, VSrc_64:$src2, + i32imm:$abs, i32imm:$clamp, i32imm:$omod, i32imm:$neg), + opName#" $dst, $src0, $src1, $src2, $abs, $clamp, $omod, $neg", pattern +>, VOP <opName>; -} // End Uses = [EXEC] +//===----------------------------------------------------------------------===// +// Vector I/O classes +//===----------------------------------------------------------------------===// -class MIMG_Load_Helper <bits<7> op, string asm> : MIMG < +class MTBUF_Store_Helper <bits<3> op, string asm, RegisterClass regClass> : MTBUF < op, - (outs VReg_128:$vdata), - (ins i32imm:$dmask, i1imm:$unorm, i1imm:$glc, i1imm:$da, i1imm:$r128, - i1imm:$tfe, i1imm:$lwe, i1imm:$slc, VReg_128:$vaddr, - GPR4Align<SReg_256>:$srsrc, GPR4Align<SReg_128>:$ssamp), - asm, + (outs), + (ins regClass:$vdata, i16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, + i1imm:$addr64, i8imm:$dfmt, i8imm:$nfmt, VReg_32:$vaddr, + GPR4Align<SReg_128>:$srsrc, i1imm:$slc, i1imm:$tfe, SSrc_32:$soffset), + asm#" $vdata, $offset, $offen, $idxen, $glc, $addr64, $dfmt," + #" $nfmt, $vaddr, $srsrc, $slc, $tfe, $soffset", []> { - let mayLoad = 1; - let mayStore = 0; + let mayStore = 1; + let mayLoad = 0; } class MUBUF_Load_Helper <bits<7> op, string asm, RegisterClass regClass> : MUBUF < @@ -530,8 +289,9 @@ class MUBUF_Load_Helper <bits<7> op, string asm, RegisterClass regClass> : MUBUF (outs regClass:$dst), (ins i16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64, i1imm:$lds, VReg_32:$vaddr, GPR4Align<SReg_128>:$srsrc, i1imm:$slc, - i1imm:$tfe, SReg_32:$soffset), - asm, + i1imm:$tfe, SSrc_32:$soffset), + asm#" $dst, $offset, $offen, $idxen, $glc, $addr64, " + #"$lds, $vaddr, $srsrc, $slc, $tfe, $soffset", []> { let mayLoad = 1; let mayStore = 0; @@ -542,48 +302,38 @@ class MTBUF_Load_Helper <bits<3> op, string asm, RegisterClass regClass> : MTBUF (outs regClass:$dst), (ins i16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64, i8imm:$dfmt, i8imm:$nfmt, VReg_32:$vaddr, GPR4Align<SReg_128>:$srsrc, - i1imm:$slc, i1imm:$tfe, SReg_32:$soffset), - asm, + i1imm:$slc, i1imm:$tfe, SSrc_32:$soffset), + asm#" $dst, $offset, $offen, $idxen, $glc, $addr64, $dfmt," + #" $nfmt, $vaddr, $srsrc, $slc, $tfe, $soffset", []> { let mayLoad = 1; let mayStore = 0; } -class MTBUF_Store_Helper <bits<3> op, string asm, RegisterClass regClass> : MTBUF < +class MIMG_Load_Helper <bits<7> op, string asm> : MIMG < op, - (outs), - (ins regClass:$vdata, i16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, - i1imm:$addr64, i8imm:$dfmt, i8imm:$nfmt, VReg_32:$vaddr, - GPR4Align<SReg_128>:$srsrc, i1imm:$slc, i1imm:$tfe, SReg_32:$soffset), - asm, + (outs VReg_128:$vdata), + (ins i32imm:$dmask, i1imm:$unorm, i1imm:$glc, i1imm:$da, i1imm:$r128, + i1imm:$tfe, i1imm:$lwe, i1imm:$slc, VReg_32:$vaddr, + GPR4Align<SReg_256>:$srsrc, GPR4Align<SReg_128>:$ssamp), + asm#" $vdata, $dmask, $unorm, $glc, $da, $r128," + #" $tfe, $lwe, $slc, $vaddr, $srsrc, $ssamp", []> { - let mayStore = 1; - let mayLoad = 0; + let mayLoad = 1; + let mayStore = 0; } -multiclass SMRD_Helper 
<bits<5> op, string asm, RegisterClass dstClass, - ValueType vt> { - def _IMM : SMRD < - op, - (outs dstClass:$dst), - (ins SMRDmemri:$src0), - asm, - [(set (vt dstClass:$dst), (constant_load ADDR_Offset8:$src0))] - >; - - def _SGPR : SMRD < - op, - (outs dstClass:$dst), - (ins SMRDmemrr:$src0), - asm, - [(set (vt dstClass:$dst), (constant_load ADDR_Reg:$src0))] - >; -} +//===----------------------------------------------------------------------===// +// Vector instruction mappings +//===----------------------------------------------------------------------===// -multiclass SMRD_32 <bits<5> op, string asm, RegisterClass dstClass> { - defm _F32 : SMRD_Helper <op, asm, dstClass, f32>; - defm _I32 : SMRD_Helper <op, asm, dstClass, i32>; +// Maps an opcode in e32 form to its e64 equivalent +def getVOPe64 : InstrMapping { + let FilterClass = "VOP"; + let RowFields = ["OpName"]; + let ColFields = ["Size"]; + let KeyCol = ["4"]; + let ValueCols = [["8"]]; } -include "SIInstrFormats.td" include "SIInstructions.td" diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td index 005be96..af116f0 100644 --- a/lib/Target/R600/SIInstructions.td +++ b/lib/Target/R600/SIInstructions.td @@ -11,16 +11,31 @@ // that are not yet supported remain commented out. //===----------------------------------------------------------------------===// +class InterpSlots { +int P0 = 2; +int P10 = 0; +int P20 = 1; +} +def INTERP : InterpSlots; + +def InterpSlot : Operand<i32> { + let PrintMethod = "printInterpSlot"; +} + def isSI : Predicate<"Subtarget.device()" "->getGeneration() == AMDGPUDeviceInfo::HD7XXX">; let Predicates = [isSI] in { let neverHasSideEffects = 1 in { + +let isMoveImm = 1 in { def S_MOV_B32 : SOP1_32 <0x00000003, "S_MOV_B32", []>; def S_MOV_B64 : SOP1_64 <0x00000004, "S_MOV_B64", []>; def S_CMOV_B32 : SOP1_32 <0x00000005, "S_CMOV_B32", []>; def S_CMOV_B64 : SOP1_64 <0x00000006, "S_CMOV_B64", []>; +} // End isMoveImm = 1 + def S_NOT_B32 : SOP1_32 <0x00000007, "S_NOT_B32", []>; def S_NOT_B64 : SOP1_64 <0x00000008, "S_NOT_B64", []>; def S_WQM_B32 : SOP1_32 <0x00000009, "S_WQM_B32", []>; @@ -28,6 +43,7 @@ def S_WQM_B64 : SOP1_64 <0x0000000a, "S_WQM_B64", []>; def S_BREV_B32 : SOP1_32 <0x0000000b, "S_BREV_B32", []>; def S_BREV_B64 : SOP1_64 <0x0000000c, "S_BREV_B64", []>; } // End neverHasSideEffects = 1 + ////def S_BCNT0_I32_B32 : SOP1_BCNT0 <0x0000000d, "S_BCNT0_I32_B32", []>; ////def S_BCNT0_I32_B64 : SOP1_BCNT0 <0x0000000e, "S_BCNT0_I32_B64", []>; ////def S_BCNT1_I32_B32 : SOP1_BCNT1 <0x0000000f, "S_BCNT1_I32_B32", []>; @@ -96,6 +112,7 @@ def S_CMPK_EQ_I32 : SOPK < >; */ +let isCompare = 1 in { def S_CMPK_LG_I32 : SOPK_32 <0x00000004, "S_CMPK_LG_I32", []>; def S_CMPK_GT_I32 : SOPK_32 <0x00000005, "S_CMPK_GT_I32", []>; def S_CMPK_GE_I32 : SOPK_32 <0x00000006, "S_CMPK_GE_I32", []>; @@ -107,6 +124,8 @@ def S_CMPK_GT_U32 : SOPK_32 <0x0000000b, "S_CMPK_GT_U32", []>; def S_CMPK_GE_U32 : SOPK_32 <0x0000000c, "S_CMPK_GE_U32", []>; def S_CMPK_LT_U32 : SOPK_32 <0x0000000d, "S_CMPK_LT_U32", []>; def S_CMPK_LE_U32 : SOPK_32 <0x0000000e, "S_CMPK_LE_U32", []>; +} // End isCompare = 1 + def S_ADDK_I32 : SOPK_32 <0x0000000f, "S_ADDK_I32", []>; def S_MULK_I32 : SOPK_32 <0x00000010, "S_MULK_I32", []>; //def S_CBRANCH_I_FORK : SOPK_ <0x00000011, "S_CBRANCH_I_FORK", []>; @@ -116,286 +135,262 @@ def S_GETREG_REGRD_B32 : SOPK_32 <0x00000014, "S_GETREG_REGRD_B32", []>; //def S_SETREG_IMM32_B32 : SOPK_32 <0x00000015, "S_SETREG_IMM32_B32", []>; //def EXP : EXP_ <0x00000000, "EXP", []>; -defm 
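TableGen expands the getVOPe64 InstrMapping above into a generated lookup function, declared as int getVOPe64(uint16_t) in SIInstrInfo.h earlier in this patch. A hand-rolled sketch of the table's shape; the opcode numbers are invented, and returning -1 on a missing entry follows the usual convention for generated mappings:

#include <cstdint>

struct E32ToE64 { uint16_t E32, E64; };

// One row per VOP instruction, keyed by the shared OpName field; values
// here are placeholders.
static const E32ToE64 Table[] = {
  { 100, 200 }, // e.g. some V_*_e32 -> V_*_e64 pair
  { 101, 201 },
};

int getVOPe64(uint16_t Opcode) {
  for (const E32ToE64 &E : Table)
    if (E.E32 == Opcode)
      return E.E64;
  return -1; // no e64 twin
}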
V_CMP_F_F32 : VOPC_32 <0x00000000, "V_CMP_F_F32", []>; -defm V_CMP_LT_F32 : VOPC_32 <0x00000001, "V_CMP_LT_F32", []>; -def : Pat < - (i1 (setcc (f32 AllReg_32:$src0), VReg_32:$src1, COND_LT)), - (V_CMP_LT_F32_e64 AllReg_32:$src0, VReg_32:$src1) ->; -defm V_CMP_EQ_F32 : VOPC_32 <0x00000002, "V_CMP_EQ_F32", []>; -def : Pat < - (i1 (setcc (f32 AllReg_32:$src0), VReg_32:$src1, COND_EQ)), - (V_CMP_EQ_F32_e64 AllReg_32:$src0, VReg_32:$src1) ->; -defm V_CMP_LE_F32 : VOPC_32 <0x00000003, "V_CMP_LE_F32", []>; -def : Pat < - (i1 (setcc (f32 AllReg_32:$src0), VReg_32:$src1, COND_LE)), - (V_CMP_LE_F32_e64 AllReg_32:$src0, VReg_32:$src1) ->; -defm V_CMP_GT_F32 : VOPC_32 <0x00000004, "V_CMP_GT_F32", []>; -def : Pat < - (i1 (setcc (f32 AllReg_32:$src0), VReg_32:$src1, COND_GT)), - (V_CMP_GT_F32_e64 AllReg_32:$src0, VReg_32:$src1) ->; -defm V_CMP_LG_F32 : VOPC_32 <0x00000005, "V_CMP_LG_F32", []>; -def : Pat < - (i1 (setcc (f32 AllReg_32:$src0), VReg_32:$src1, COND_NE)), - (V_CMP_LG_F32_e64 AllReg_32:$src0, VReg_32:$src1) ->; -defm V_CMP_GE_F32 : VOPC_32 <0x00000006, "V_CMP_GE_F32", []>; -def : Pat < - (i1 (setcc (f32 AllReg_32:$src0), VReg_32:$src1, COND_GE)), - (V_CMP_GE_F32_e64 AllReg_32:$src0, VReg_32:$src1) ->; -defm V_CMP_O_F32 : VOPC_32 <0x00000007, "V_CMP_O_F32", []>; -defm V_CMP_U_F32 : VOPC_32 <0x00000008, "V_CMP_U_F32", []>; -defm V_CMP_NGE_F32 : VOPC_32 <0x00000009, "V_CMP_NGE_F32", []>; -defm V_CMP_NLG_F32 : VOPC_32 <0x0000000a, "V_CMP_NLG_F32", []>; -defm V_CMP_NGT_F32 : VOPC_32 <0x0000000b, "V_CMP_NGT_F32", []>; -defm V_CMP_NLE_F32 : VOPC_32 <0x0000000c, "V_CMP_NLE_F32", []>; -defm V_CMP_NEQ_F32 : VOPC_32 <0x0000000d, "V_CMP_NEQ_F32", []>; -def : Pat < - (i1 (setcc (f32 AllReg_32:$src0), VReg_32:$src1, COND_NE)), - (V_CMP_NEQ_F32_e64 AllReg_32:$src0, VReg_32:$src1) ->; -defm V_CMP_NLT_F32 : VOPC_32 <0x0000000e, "V_CMP_NLT_F32", []>; -defm V_CMP_TRU_F32 : VOPC_32 <0x0000000f, "V_CMP_TRU_F32", []>; - -//Side effect is writing to EXEC -let hasSideEffects = 1 in { - -defm V_CMPX_F_F32 : VOPC_32 <0x00000010, "V_CMPX_F_F32", []>; -defm V_CMPX_LT_F32 : VOPC_32 <0x00000011, "V_CMPX_LT_F32", []>; -defm V_CMPX_EQ_F32 : VOPC_32 <0x00000012, "V_CMPX_EQ_F32", []>; -defm V_CMPX_LE_F32 : VOPC_32 <0x00000013, "V_CMPX_LE_F32", []>; -defm V_CMPX_GT_F32 : VOPC_32 <0x00000014, "V_CMPX_GT_F32", []>; -defm V_CMPX_LG_F32 : VOPC_32 <0x00000015, "V_CMPX_LG_F32", []>; -defm V_CMPX_GE_F32 : VOPC_32 <0x00000016, "V_CMPX_GE_F32", []>; -defm V_CMPX_O_F32 : VOPC_32 <0x00000017, "V_CMPX_O_F32", []>; -defm V_CMPX_U_F32 : VOPC_32 <0x00000018, "V_CMPX_U_F32", []>; -defm V_CMPX_NGE_F32 : VOPC_32 <0x00000019, "V_CMPX_NGE_F32", []>; -defm V_CMPX_NLG_F32 : VOPC_32 <0x0000001a, "V_CMPX_NLG_F32", []>; -defm V_CMPX_NGT_F32 : VOPC_32 <0x0000001b, "V_CMPX_NGT_F32", []>; -defm V_CMPX_NLE_F32 : VOPC_32 <0x0000001c, "V_CMPX_NLE_F32", []>; -defm V_CMPX_NEQ_F32 : VOPC_32 <0x0000001d, "V_CMPX_NEQ_F32", []>; -defm V_CMPX_NLT_F32 : VOPC_32 <0x0000001e, "V_CMPX_NLT_F32", []>; -defm V_CMPX_TRU_F32 : VOPC_32 <0x0000001f, "V_CMPX_TRU_F32", []>; - -} // End hasSideEffects = 1 - -defm V_CMP_F_F64 : VOPC_64 <0x00000020, "V_CMP_F_F64", []>; -defm V_CMP_LT_F64 : VOPC_64 <0x00000021, "V_CMP_LT_F64", []>; -defm V_CMP_EQ_F64 : VOPC_64 <0x00000022, "V_CMP_EQ_F64", []>; -defm V_CMP_LE_F64 : VOPC_64 <0x00000023, "V_CMP_LE_F64", []>; -defm V_CMP_GT_F64 : VOPC_64 <0x00000024, "V_CMP_GT_F64", []>; -defm V_CMP_LG_F64 : VOPC_64 <0x00000025, "V_CMP_LG_F64", []>; -defm V_CMP_GE_F64 : VOPC_64 <0x00000026, "V_CMP_GE_F64", []>; -defm V_CMP_O_F64 : VOPC_64 
<0x00000027, "V_CMP_O_F64", []>; -defm V_CMP_U_F64 : VOPC_64 <0x00000028, "V_CMP_U_F64", []>; -defm V_CMP_NGE_F64 : VOPC_64 <0x00000029, "V_CMP_NGE_F64", []>; -defm V_CMP_NLG_F64 : VOPC_64 <0x0000002a, "V_CMP_NLG_F64", []>; -defm V_CMP_NGT_F64 : VOPC_64 <0x0000002b, "V_CMP_NGT_F64", []>; -defm V_CMP_NLE_F64 : VOPC_64 <0x0000002c, "V_CMP_NLE_F64", []>; -defm V_CMP_NEQ_F64 : VOPC_64 <0x0000002d, "V_CMP_NEQ_F64", []>; -defm V_CMP_NLT_F64 : VOPC_64 <0x0000002e, "V_CMP_NLT_F64", []>; -defm V_CMP_TRU_F64 : VOPC_64 <0x0000002f, "V_CMP_TRU_F64", []>; - -//Side effect is writing to EXEC -let hasSideEffects = 1 in { - -defm V_CMPX_F_F64 : VOPC_64 <0x00000030, "V_CMPX_F_F64", []>; -defm V_CMPX_LT_F64 : VOPC_64 <0x00000031, "V_CMPX_LT_F64", []>; -defm V_CMPX_EQ_F64 : VOPC_64 <0x00000032, "V_CMPX_EQ_F64", []>; -defm V_CMPX_LE_F64 : VOPC_64 <0x00000033, "V_CMPX_LE_F64", []>; -defm V_CMPX_GT_F64 : VOPC_64 <0x00000034, "V_CMPX_GT_F64", []>; -defm V_CMPX_LG_F64 : VOPC_64 <0x00000035, "V_CMPX_LG_F64", []>; -defm V_CMPX_GE_F64 : VOPC_64 <0x00000036, "V_CMPX_GE_F64", []>; -defm V_CMPX_O_F64 : VOPC_64 <0x00000037, "V_CMPX_O_F64", []>; -defm V_CMPX_U_F64 : VOPC_64 <0x00000038, "V_CMPX_U_F64", []>; -defm V_CMPX_NGE_F64 : VOPC_64 <0x00000039, "V_CMPX_NGE_F64", []>; -defm V_CMPX_NLG_F64 : VOPC_64 <0x0000003a, "V_CMPX_NLG_F64", []>; -defm V_CMPX_NGT_F64 : VOPC_64 <0x0000003b, "V_CMPX_NGT_F64", []>; -defm V_CMPX_NLE_F64 : VOPC_64 <0x0000003c, "V_CMPX_NLE_F64", []>; -defm V_CMPX_NEQ_F64 : VOPC_64 <0x0000003d, "V_CMPX_NEQ_F64", []>; -defm V_CMPX_NLT_F64 : VOPC_64 <0x0000003e, "V_CMPX_NLT_F64", []>; -defm V_CMPX_TRU_F64 : VOPC_64 <0x0000003f, "V_CMPX_TRU_F64", []>; - -} // End hasSideEffects = 1 - -defm V_CMPS_F_F32 : VOPC_32 <0x00000040, "V_CMPS_F_F32", []>; -defm V_CMPS_LT_F32 : VOPC_32 <0x00000041, "V_CMPS_LT_F32", []>; -defm V_CMPS_EQ_F32 : VOPC_32 <0x00000042, "V_CMPS_EQ_F32", []>; -defm V_CMPS_LE_F32 : VOPC_32 <0x00000043, "V_CMPS_LE_F32", []>; -defm V_CMPS_GT_F32 : VOPC_32 <0x00000044, "V_CMPS_GT_F32", []>; -defm V_CMPS_LG_F32 : VOPC_32 <0x00000045, "V_CMPS_LG_F32", []>; -defm V_CMPS_GE_F32 : VOPC_32 <0x00000046, "V_CMPS_GE_F32", []>; -defm V_CMPS_O_F32 : VOPC_32 <0x00000047, "V_CMPS_O_F32", []>; -defm V_CMPS_U_F32 : VOPC_32 <0x00000048, "V_CMPS_U_F32", []>; -defm V_CMPS_NGE_F32 : VOPC_32 <0x00000049, "V_CMPS_NGE_F32", []>; -defm V_CMPS_NLG_F32 : VOPC_32 <0x0000004a, "V_CMPS_NLG_F32", []>; -defm V_CMPS_NGT_F32 : VOPC_32 <0x0000004b, "V_CMPS_NGT_F32", []>; -defm V_CMPS_NLE_F32 : VOPC_32 <0x0000004c, "V_CMPS_NLE_F32", []>; -defm V_CMPS_NEQ_F32 : VOPC_32 <0x0000004d, "V_CMPS_NEQ_F32", []>; -defm V_CMPS_NLT_F32 : VOPC_32 <0x0000004e, "V_CMPS_NLT_F32", []>; -defm V_CMPS_TRU_F32 : VOPC_32 <0x0000004f, "V_CMPS_TRU_F32", []>; -defm V_CMPSX_F_F32 : VOPC_32 <0x00000050, "V_CMPSX_F_F32", []>; -defm V_CMPSX_LT_F32 : VOPC_32 <0x00000051, "V_CMPSX_LT_F32", []>; -defm V_CMPSX_EQ_F32 : VOPC_32 <0x00000052, "V_CMPSX_EQ_F32", []>; -defm V_CMPSX_LE_F32 : VOPC_32 <0x00000053, "V_CMPSX_LE_F32", []>; -defm V_CMPSX_GT_F32 : VOPC_32 <0x00000054, "V_CMPSX_GT_F32", []>; -defm V_CMPSX_LG_F32 : VOPC_32 <0x00000055, "V_CMPSX_LG_F32", []>; -defm V_CMPSX_GE_F32 : VOPC_32 <0x00000056, "V_CMPSX_GE_F32", []>; -defm V_CMPSX_O_F32 : VOPC_32 <0x00000057, "V_CMPSX_O_F32", []>; -defm V_CMPSX_U_F32 : VOPC_32 <0x00000058, "V_CMPSX_U_F32", []>; -defm V_CMPSX_NGE_F32 : VOPC_32 <0x00000059, "V_CMPSX_NGE_F32", []>; -defm V_CMPSX_NLG_F32 : VOPC_32 <0x0000005a, "V_CMPSX_NLG_F32", []>; -defm V_CMPSX_NGT_F32 : VOPC_32 <0x0000005b, "V_CMPSX_NGT_F32", []>; -defm 
V_CMPSX_NLE_F32 : VOPC_32 <0x0000005c, "V_CMPSX_NLE_F32", []>; -defm V_CMPSX_NEQ_F32 : VOPC_32 <0x0000005d, "V_CMPSX_NEQ_F32", []>; -defm V_CMPSX_NLT_F32 : VOPC_32 <0x0000005e, "V_CMPSX_NLT_F32", []>; -defm V_CMPSX_TRU_F32 : VOPC_32 <0x0000005f, "V_CMPSX_TRU_F32", []>; -defm V_CMPS_F_F64 : VOPC_64 <0x00000060, "V_CMPS_F_F64", []>; -defm V_CMPS_LT_F64 : VOPC_64 <0x00000061, "V_CMPS_LT_F64", []>; -defm V_CMPS_EQ_F64 : VOPC_64 <0x00000062, "V_CMPS_EQ_F64", []>; -defm V_CMPS_LE_F64 : VOPC_64 <0x00000063, "V_CMPS_LE_F64", []>; -defm V_CMPS_GT_F64 : VOPC_64 <0x00000064, "V_CMPS_GT_F64", []>; -defm V_CMPS_LG_F64 : VOPC_64 <0x00000065, "V_CMPS_LG_F64", []>; -defm V_CMPS_GE_F64 : VOPC_64 <0x00000066, "V_CMPS_GE_F64", []>; -defm V_CMPS_O_F64 : VOPC_64 <0x00000067, "V_CMPS_O_F64", []>; -defm V_CMPS_U_F64 : VOPC_64 <0x00000068, "V_CMPS_U_F64", []>; -defm V_CMPS_NGE_F64 : VOPC_64 <0x00000069, "V_CMPS_NGE_F64", []>; -defm V_CMPS_NLG_F64 : VOPC_64 <0x0000006a, "V_CMPS_NLG_F64", []>; -defm V_CMPS_NGT_F64 : VOPC_64 <0x0000006b, "V_CMPS_NGT_F64", []>; -defm V_CMPS_NLE_F64 : VOPC_64 <0x0000006c, "V_CMPS_NLE_F64", []>; -defm V_CMPS_NEQ_F64 : VOPC_64 <0x0000006d, "V_CMPS_NEQ_F64", []>; -defm V_CMPS_NLT_F64 : VOPC_64 <0x0000006e, "V_CMPS_NLT_F64", []>; -defm V_CMPS_TRU_F64 : VOPC_64 <0x0000006f, "V_CMPS_TRU_F64", []>; -defm V_CMPSX_F_F64 : VOPC_64 <0x00000070, "V_CMPSX_F_F64", []>; -defm V_CMPSX_LT_F64 : VOPC_64 <0x00000071, "V_CMPSX_LT_F64", []>; -defm V_CMPSX_EQ_F64 : VOPC_64 <0x00000072, "V_CMPSX_EQ_F64", []>; -defm V_CMPSX_LE_F64 : VOPC_64 <0x00000073, "V_CMPSX_LE_F64", []>; -defm V_CMPSX_GT_F64 : VOPC_64 <0x00000074, "V_CMPSX_GT_F64", []>; -defm V_CMPSX_LG_F64 : VOPC_64 <0x00000075, "V_CMPSX_LG_F64", []>; -defm V_CMPSX_GE_F64 : VOPC_64 <0x00000076, "V_CMPSX_GE_F64", []>; -defm V_CMPSX_O_F64 : VOPC_64 <0x00000077, "V_CMPSX_O_F64", []>; -defm V_CMPSX_U_F64 : VOPC_64 <0x00000078, "V_CMPSX_U_F64", []>; -defm V_CMPSX_NGE_F64 : VOPC_64 <0x00000079, "V_CMPSX_NGE_F64", []>; -defm V_CMPSX_NLG_F64 : VOPC_64 <0x0000007a, "V_CMPSX_NLG_F64", []>; -defm V_CMPSX_NGT_F64 : VOPC_64 <0x0000007b, "V_CMPSX_NGT_F64", []>; -defm V_CMPSX_NLE_F64 : VOPC_64 <0x0000007c, "V_CMPSX_NLE_F64", []>; -defm V_CMPSX_NEQ_F64 : VOPC_64 <0x0000007d, "V_CMPSX_NEQ_F64", []>; -defm V_CMPSX_NLT_F64 : VOPC_64 <0x0000007e, "V_CMPSX_NLT_F64", []>; -defm V_CMPSX_TRU_F64 : VOPC_64 <0x0000007f, "V_CMPSX_TRU_F64", []>; -defm V_CMP_F_I32 : VOPC_32 <0x00000080, "V_CMP_F_I32", []>; -defm V_CMP_LT_I32 : VOPC_32 <0x00000081, "V_CMP_LT_I32", []>; -def : Pat < - (i1 (setcc (i32 AllReg_32:$src0), VReg_32:$src1, COND_LT)), - (V_CMP_LT_I32_e64 AllReg_32:$src0, VReg_32:$src1) ->; -defm V_CMP_EQ_I32 : VOPC_32 <0x00000082, "V_CMP_EQ_I32", []>; -def : Pat < - (i1 (setcc (i32 AllReg_32:$src0), VReg_32:$src1, COND_EQ)), - (V_CMP_EQ_I32_e64 AllReg_32:$src0, VReg_32:$src1) ->; -defm V_CMP_LE_I32 : VOPC_32 <0x00000083, "V_CMP_LE_I32", []>; -def : Pat < - (i1 (setcc (i32 AllReg_32:$src0), VReg_32:$src1, COND_LE)), - (V_CMP_LE_I32_e64 AllReg_32:$src0, VReg_32:$src1) ->; -defm V_CMP_GT_I32 : VOPC_32 <0x00000084, "V_CMP_GT_I32", []>; -def : Pat < - (i1 (setcc (i32 AllReg_32:$src0), VReg_32:$src1, COND_GT)), - (V_CMP_GT_I32_e64 AllReg_32:$src0, VReg_32:$src1) ->; -defm V_CMP_NE_I32 : VOPC_32 <0x00000085, "V_CMP_NE_I32", []>; -def : Pat < - (i1 (setcc (i32 AllReg_32:$src0), VReg_32:$src1, COND_NE)), - (V_CMP_NE_I32_e64 AllReg_32:$src0, VReg_32:$src1) ->; -defm V_CMP_GE_I32 : VOPC_32 <0x00000086, "V_CMP_GE_I32", []>; -def : Pat < - (i1 (setcc (i32 AllReg_32:$src0), 
VReg_32:$src1, COND_GE)), - (V_CMP_GE_I32_e64 AllReg_32:$src0, VReg_32:$src1) ->; -defm V_CMP_T_I32 : VOPC_32 <0x00000087, "V_CMP_T_I32", []>; - -let hasSideEffects = 1 in { +let isCompare = 1 in { + +defm V_CMP_F_F32 : VOPC_32 <0x00000000, "V_CMP_F_F32">; +defm V_CMP_LT_F32 : VOPC_32 <0x00000001, "V_CMP_LT_F32", f32, COND_LT>; +defm V_CMP_EQ_F32 : VOPC_32 <0x00000002, "V_CMP_EQ_F32", f32, COND_EQ>; +defm V_CMP_LE_F32 : VOPC_32 <0x00000003, "V_CMP_LE_F32", f32, COND_LE>; +defm V_CMP_GT_F32 : VOPC_32 <0x00000004, "V_CMP_GT_F32", f32, COND_GT>; +defm V_CMP_LG_F32 : VOPC_32 <0x00000005, "V_CMP_LG_F32", f32, COND_NE>; +defm V_CMP_GE_F32 : VOPC_32 <0x00000006, "V_CMP_GE_F32", f32, COND_GE>; +defm V_CMP_O_F32 : VOPC_32 <0x00000007, "V_CMP_O_F32">; +defm V_CMP_U_F32 : VOPC_32 <0x00000008, "V_CMP_U_F32">; +defm V_CMP_NGE_F32 : VOPC_32 <0x00000009, "V_CMP_NGE_F32">; +defm V_CMP_NLG_F32 : VOPC_32 <0x0000000a, "V_CMP_NLG_F32">; +defm V_CMP_NGT_F32 : VOPC_32 <0x0000000b, "V_CMP_NGT_F32">; +defm V_CMP_NLE_F32 : VOPC_32 <0x0000000c, "V_CMP_NLE_F32">; +defm V_CMP_NEQ_F32 : VOPC_32 <0x0000000d, "V_CMP_NEQ_F32", f32, COND_NE>; +defm V_CMP_NLT_F32 : VOPC_32 <0x0000000e, "V_CMP_NLT_F32">; +defm V_CMP_TRU_F32 : VOPC_32 <0x0000000f, "V_CMP_TRU_F32">; + +let hasSideEffects = 1, Defs = [EXEC] in { + +defm V_CMPX_F_F32 : VOPC_32 <0x00000010, "V_CMPX_F_F32">; +defm V_CMPX_LT_F32 : VOPC_32 <0x00000011, "V_CMPX_LT_F32">; +defm V_CMPX_EQ_F32 : VOPC_32 <0x00000012, "V_CMPX_EQ_F32">; +defm V_CMPX_LE_F32 : VOPC_32 <0x00000013, "V_CMPX_LE_F32">; +defm V_CMPX_GT_F32 : VOPC_32 <0x00000014, "V_CMPX_GT_F32">; +defm V_CMPX_LG_F32 : VOPC_32 <0x00000015, "V_CMPX_LG_F32">; +defm V_CMPX_GE_F32 : VOPC_32 <0x00000016, "V_CMPX_GE_F32">; +defm V_CMPX_O_F32 : VOPC_32 <0x00000017, "V_CMPX_O_F32">; +defm V_CMPX_U_F32 : VOPC_32 <0x00000018, "V_CMPX_U_F32">; +defm V_CMPX_NGE_F32 : VOPC_32 <0x00000019, "V_CMPX_NGE_F32">; +defm V_CMPX_NLG_F32 : VOPC_32 <0x0000001a, "V_CMPX_NLG_F32">; +defm V_CMPX_NGT_F32 : VOPC_32 <0x0000001b, "V_CMPX_NGT_F32">; +defm V_CMPX_NLE_F32 : VOPC_32 <0x0000001c, "V_CMPX_NLE_F32">; +defm V_CMPX_NEQ_F32 : VOPC_32 <0x0000001d, "V_CMPX_NEQ_F32">; +defm V_CMPX_NLT_F32 : VOPC_32 <0x0000001e, "V_CMPX_NLT_F32">; +defm V_CMPX_TRU_F32 : VOPC_32 <0x0000001f, "V_CMPX_TRU_F32">; + +} // End hasSideEffects = 1, Defs = [EXEC] + +defm V_CMP_F_F64 : VOPC_64 <0x00000020, "V_CMP_F_F64">; +defm V_CMP_LT_F64 : VOPC_64 <0x00000021, "V_CMP_LT_F64">; +defm V_CMP_EQ_F64 : VOPC_64 <0x00000022, "V_CMP_EQ_F64">; +defm V_CMP_LE_F64 : VOPC_64 <0x00000023, "V_CMP_LE_F64">; +defm V_CMP_GT_F64 : VOPC_64 <0x00000024, "V_CMP_GT_F64">; +defm V_CMP_LG_F64 : VOPC_64 <0x00000025, "V_CMP_LG_F64">; +defm V_CMP_GE_F64 : VOPC_64 <0x00000026, "V_CMP_GE_F64">; +defm V_CMP_O_F64 : VOPC_64 <0x00000027, "V_CMP_O_F64">; +defm V_CMP_U_F64 : VOPC_64 <0x00000028, "V_CMP_U_F64">; +defm V_CMP_NGE_F64 : VOPC_64 <0x00000029, "V_CMP_NGE_F64">; +defm V_CMP_NLG_F64 : VOPC_64 <0x0000002a, "V_CMP_NLG_F64">; +defm V_CMP_NGT_F64 : VOPC_64 <0x0000002b, "V_CMP_NGT_F64">; +defm V_CMP_NLE_F64 : VOPC_64 <0x0000002c, "V_CMP_NLE_F64">; +defm V_CMP_NEQ_F64 : VOPC_64 <0x0000002d, "V_CMP_NEQ_F64">; +defm V_CMP_NLT_F64 : VOPC_64 <0x0000002e, "V_CMP_NLT_F64">; +defm V_CMP_TRU_F64 : VOPC_64 <0x0000002f, "V_CMP_TRU_F64">; + +let hasSideEffects = 1, Defs = [EXEC] in { + +defm V_CMPX_F_F64 : VOPC_64 <0x00000030, "V_CMPX_F_F64">; +defm V_CMPX_LT_F64 : VOPC_64 <0x00000031, "V_CMPX_LT_F64">; +defm V_CMPX_EQ_F64 : VOPC_64 <0x00000032, "V_CMPX_EQ_F64">; +defm V_CMPX_LE_F64 : VOPC_64 <0x00000033, 
"V_CMPX_LE_F64">; +defm V_CMPX_GT_F64 : VOPC_64 <0x00000034, "V_CMPX_GT_F64">; +defm V_CMPX_LG_F64 : VOPC_64 <0x00000035, "V_CMPX_LG_F64">; +defm V_CMPX_GE_F64 : VOPC_64 <0x00000036, "V_CMPX_GE_F64">; +defm V_CMPX_O_F64 : VOPC_64 <0x00000037, "V_CMPX_O_F64">; +defm V_CMPX_U_F64 : VOPC_64 <0x00000038, "V_CMPX_U_F64">; +defm V_CMPX_NGE_F64 : VOPC_64 <0x00000039, "V_CMPX_NGE_F64">; +defm V_CMPX_NLG_F64 : VOPC_64 <0x0000003a, "V_CMPX_NLG_F64">; +defm V_CMPX_NGT_F64 : VOPC_64 <0x0000003b, "V_CMPX_NGT_F64">; +defm V_CMPX_NLE_F64 : VOPC_64 <0x0000003c, "V_CMPX_NLE_F64">; +defm V_CMPX_NEQ_F64 : VOPC_64 <0x0000003d, "V_CMPX_NEQ_F64">; +defm V_CMPX_NLT_F64 : VOPC_64 <0x0000003e, "V_CMPX_NLT_F64">; +defm V_CMPX_TRU_F64 : VOPC_64 <0x0000003f, "V_CMPX_TRU_F64">; + +} // End hasSideEffects = 1, Defs = [EXEC] + +defm V_CMPS_F_F32 : VOPC_32 <0x00000040, "V_CMPS_F_F32">; +defm V_CMPS_LT_F32 : VOPC_32 <0x00000041, "V_CMPS_LT_F32">; +defm V_CMPS_EQ_F32 : VOPC_32 <0x00000042, "V_CMPS_EQ_F32">; +defm V_CMPS_LE_F32 : VOPC_32 <0x00000043, "V_CMPS_LE_F32">; +defm V_CMPS_GT_F32 : VOPC_32 <0x00000044, "V_CMPS_GT_F32">; +defm V_CMPS_LG_F32 : VOPC_32 <0x00000045, "V_CMPS_LG_F32">; +defm V_CMPS_GE_F32 : VOPC_32 <0x00000046, "V_CMPS_GE_F32">; +defm V_CMPS_O_F32 : VOPC_32 <0x00000047, "V_CMPS_O_F32">; +defm V_CMPS_U_F32 : VOPC_32 <0x00000048, "V_CMPS_U_F32">; +defm V_CMPS_NGE_F32 : VOPC_32 <0x00000049, "V_CMPS_NGE_F32">; +defm V_CMPS_NLG_F32 : VOPC_32 <0x0000004a, "V_CMPS_NLG_F32">; +defm V_CMPS_NGT_F32 : VOPC_32 <0x0000004b, "V_CMPS_NGT_F32">; +defm V_CMPS_NLE_F32 : VOPC_32 <0x0000004c, "V_CMPS_NLE_F32">; +defm V_CMPS_NEQ_F32 : VOPC_32 <0x0000004d, "V_CMPS_NEQ_F32">; +defm V_CMPS_NLT_F32 : VOPC_32 <0x0000004e, "V_CMPS_NLT_F32">; +defm V_CMPS_TRU_F32 : VOPC_32 <0x0000004f, "V_CMPS_TRU_F32">; + +let hasSideEffects = 1, Defs = [EXEC] in { + +defm V_CMPSX_F_F32 : VOPC_32 <0x00000050, "V_CMPSX_F_F32">; +defm V_CMPSX_LT_F32 : VOPC_32 <0x00000051, "V_CMPSX_LT_F32">; +defm V_CMPSX_EQ_F32 : VOPC_32 <0x00000052, "V_CMPSX_EQ_F32">; +defm V_CMPSX_LE_F32 : VOPC_32 <0x00000053, "V_CMPSX_LE_F32">; +defm V_CMPSX_GT_F32 : VOPC_32 <0x00000054, "V_CMPSX_GT_F32">; +defm V_CMPSX_LG_F32 : VOPC_32 <0x00000055, "V_CMPSX_LG_F32">; +defm V_CMPSX_GE_F32 : VOPC_32 <0x00000056, "V_CMPSX_GE_F32">; +defm V_CMPSX_O_F32 : VOPC_32 <0x00000057, "V_CMPSX_O_F32">; +defm V_CMPSX_U_F32 : VOPC_32 <0x00000058, "V_CMPSX_U_F32">; +defm V_CMPSX_NGE_F32 : VOPC_32 <0x00000059, "V_CMPSX_NGE_F32">; +defm V_CMPSX_NLG_F32 : VOPC_32 <0x0000005a, "V_CMPSX_NLG_F32">; +defm V_CMPSX_NGT_F32 : VOPC_32 <0x0000005b, "V_CMPSX_NGT_F32">; +defm V_CMPSX_NLE_F32 : VOPC_32 <0x0000005c, "V_CMPSX_NLE_F32">; +defm V_CMPSX_NEQ_F32 : VOPC_32 <0x0000005d, "V_CMPSX_NEQ_F32">; +defm V_CMPSX_NLT_F32 : VOPC_32 <0x0000005e, "V_CMPSX_NLT_F32">; +defm V_CMPSX_TRU_F32 : VOPC_32 <0x0000005f, "V_CMPSX_TRU_F32">; + +} // End hasSideEffects = 1, Defs = [EXEC] + +defm V_CMPS_F_F64 : VOPC_64 <0x00000060, "V_CMPS_F_F64">; +defm V_CMPS_LT_F64 : VOPC_64 <0x00000061, "V_CMPS_LT_F64">; +defm V_CMPS_EQ_F64 : VOPC_64 <0x00000062, "V_CMPS_EQ_F64">; +defm V_CMPS_LE_F64 : VOPC_64 <0x00000063, "V_CMPS_LE_F64">; +defm V_CMPS_GT_F64 : VOPC_64 <0x00000064, "V_CMPS_GT_F64">; +defm V_CMPS_LG_F64 : VOPC_64 <0x00000065, "V_CMPS_LG_F64">; +defm V_CMPS_GE_F64 : VOPC_64 <0x00000066, "V_CMPS_GE_F64">; +defm V_CMPS_O_F64 : VOPC_64 <0x00000067, "V_CMPS_O_F64">; +defm V_CMPS_U_F64 : VOPC_64 <0x00000068, "V_CMPS_U_F64">; +defm V_CMPS_NGE_F64 : VOPC_64 <0x00000069, "V_CMPS_NGE_F64">; +defm V_CMPS_NLG_F64 : VOPC_64 <0x0000006a, 
"V_CMPS_NLG_F64">; +defm V_CMPS_NGT_F64 : VOPC_64 <0x0000006b, "V_CMPS_NGT_F64">; +defm V_CMPS_NLE_F64 : VOPC_64 <0x0000006c, "V_CMPS_NLE_F64">; +defm V_CMPS_NEQ_F64 : VOPC_64 <0x0000006d, "V_CMPS_NEQ_F64">; +defm V_CMPS_NLT_F64 : VOPC_64 <0x0000006e, "V_CMPS_NLT_F64">; +defm V_CMPS_TRU_F64 : VOPC_64 <0x0000006f, "V_CMPS_TRU_F64">; + +let hasSideEffects = 1, Defs = [EXEC] in { + +defm V_CMPSX_F_F64 : VOPC_64 <0x00000070, "V_CMPSX_F_F64">; +defm V_CMPSX_LT_F64 : VOPC_64 <0x00000071, "V_CMPSX_LT_F64">; +defm V_CMPSX_EQ_F64 : VOPC_64 <0x00000072, "V_CMPSX_EQ_F64">; +defm V_CMPSX_LE_F64 : VOPC_64 <0x00000073, "V_CMPSX_LE_F64">; +defm V_CMPSX_GT_F64 : VOPC_64 <0x00000074, "V_CMPSX_GT_F64">; +defm V_CMPSX_LG_F64 : VOPC_64 <0x00000075, "V_CMPSX_LG_F64">; +defm V_CMPSX_GE_F64 : VOPC_64 <0x00000076, "V_CMPSX_GE_F64">; +defm V_CMPSX_O_F64 : VOPC_64 <0x00000077, "V_CMPSX_O_F64">; +defm V_CMPSX_U_F64 : VOPC_64 <0x00000078, "V_CMPSX_U_F64">; +defm V_CMPSX_NGE_F64 : VOPC_64 <0x00000079, "V_CMPSX_NGE_F64">; +defm V_CMPSX_NLG_F64 : VOPC_64 <0x0000007a, "V_CMPSX_NLG_F64">; +defm V_CMPSX_NGT_F64 : VOPC_64 <0x0000007b, "V_CMPSX_NGT_F64">; +defm V_CMPSX_NLE_F64 : VOPC_64 <0x0000007c, "V_CMPSX_NLE_F64">; +defm V_CMPSX_NEQ_F64 : VOPC_64 <0x0000007d, "V_CMPSX_NEQ_F64">; +defm V_CMPSX_NLT_F64 : VOPC_64 <0x0000007e, "V_CMPSX_NLT_F64">; +defm V_CMPSX_TRU_F64 : VOPC_64 <0x0000007f, "V_CMPSX_TRU_F64">; + +} // End hasSideEffects = 1, Defs = [EXEC] + +defm V_CMP_F_I32 : VOPC_32 <0x00000080, "V_CMP_F_I32">; +defm V_CMP_LT_I32 : VOPC_32 <0x00000081, "V_CMP_LT_I32", i32, COND_LT>; +defm V_CMP_EQ_I32 : VOPC_32 <0x00000082, "V_CMP_EQ_I32", i32, COND_EQ>; +defm V_CMP_LE_I32 : VOPC_32 <0x00000083, "V_CMP_LE_I32", i32, COND_LE>; +defm V_CMP_GT_I32 : VOPC_32 <0x00000084, "V_CMP_GT_I32", i32, COND_GT>; +defm V_CMP_NE_I32 : VOPC_32 <0x00000085, "V_CMP_NE_I32", i32, COND_NE>; +defm V_CMP_GE_I32 : VOPC_32 <0x00000086, "V_CMP_GE_I32", i32, COND_GE>; +defm V_CMP_T_I32 : VOPC_32 <0x00000087, "V_CMP_T_I32">; + +let hasSideEffects = 1, Defs = [EXEC] in { + +defm V_CMPX_F_I32 : VOPC_32 <0x00000090, "V_CMPX_F_I32">; +defm V_CMPX_LT_I32 : VOPC_32 <0x00000091, "V_CMPX_LT_I32">; +defm V_CMPX_EQ_I32 : VOPC_32 <0x00000092, "V_CMPX_EQ_I32">; +defm V_CMPX_LE_I32 : VOPC_32 <0x00000093, "V_CMPX_LE_I32">; +defm V_CMPX_GT_I32 : VOPC_32 <0x00000094, "V_CMPX_GT_I32">; +defm V_CMPX_NE_I32 : VOPC_32 <0x00000095, "V_CMPX_NE_I32">; +defm V_CMPX_GE_I32 : VOPC_32 <0x00000096, "V_CMPX_GE_I32">; +defm V_CMPX_T_I32 : VOPC_32 <0x00000097, "V_CMPX_T_I32">; + +} // End hasSideEffects = 1, Defs = [EXEC] + +defm V_CMP_F_I64 : VOPC_64 <0x000000a0, "V_CMP_F_I64">; +defm V_CMP_LT_I64 : VOPC_64 <0x000000a1, "V_CMP_LT_I64">; +defm V_CMP_EQ_I64 : VOPC_64 <0x000000a2, "V_CMP_EQ_I64">; +defm V_CMP_LE_I64 : VOPC_64 <0x000000a3, "V_CMP_LE_I64">; +defm V_CMP_GT_I64 : VOPC_64 <0x000000a4, "V_CMP_GT_I64">; +defm V_CMP_NE_I64 : VOPC_64 <0x000000a5, "V_CMP_NE_I64">; +defm V_CMP_GE_I64 : VOPC_64 <0x000000a6, "V_CMP_GE_I64">; +defm V_CMP_T_I64 : VOPC_64 <0x000000a7, "V_CMP_T_I64">; + +let hasSideEffects = 1, Defs = [EXEC] in { + +defm V_CMPX_F_I64 : VOPC_64 <0x000000b0, "V_CMPX_F_I64">; +defm V_CMPX_LT_I64 : VOPC_64 <0x000000b1, "V_CMPX_LT_I64">; +defm V_CMPX_EQ_I64 : VOPC_64 <0x000000b2, "V_CMPX_EQ_I64">; +defm V_CMPX_LE_I64 : VOPC_64 <0x000000b3, "V_CMPX_LE_I64">; +defm V_CMPX_GT_I64 : VOPC_64 <0x000000b4, "V_CMPX_GT_I64">; +defm V_CMPX_NE_I64 : VOPC_64 <0x000000b5, "V_CMPX_NE_I64">; +defm V_CMPX_GE_I64 : VOPC_64 <0x000000b6, "V_CMPX_GE_I64">; +defm V_CMPX_T_I64 : VOPC_64 
<0x000000b7, "V_CMPX_T_I64">; + +} // End hasSideEffects = 1, Defs = [EXEC] + +defm V_CMP_F_U32 : VOPC_32 <0x000000c0, "V_CMP_F_U32">; +defm V_CMP_LT_U32 : VOPC_32 <0x000000c1, "V_CMP_LT_U32">; +defm V_CMP_EQ_U32 : VOPC_32 <0x000000c2, "V_CMP_EQ_U32">; +defm V_CMP_LE_U32 : VOPC_32 <0x000000c3, "V_CMP_LE_U32">; +defm V_CMP_GT_U32 : VOPC_32 <0x000000c4, "V_CMP_GT_U32">; +defm V_CMP_NE_U32 : VOPC_32 <0x000000c5, "V_CMP_NE_U32">; +defm V_CMP_GE_U32 : VOPC_32 <0x000000c6, "V_CMP_GE_U32">; +defm V_CMP_T_U32 : VOPC_32 <0x000000c7, "V_CMP_T_U32">; + +let hasSideEffects = 1, Defs = [EXEC] in { + +defm V_CMPX_F_U32 : VOPC_32 <0x000000d0, "V_CMPX_F_U32">; +defm V_CMPX_LT_U32 : VOPC_32 <0x000000d1, "V_CMPX_LT_U32">; +defm V_CMPX_EQ_U32 : VOPC_32 <0x000000d2, "V_CMPX_EQ_U32">; +defm V_CMPX_LE_U32 : VOPC_32 <0x000000d3, "V_CMPX_LE_U32">; +defm V_CMPX_GT_U32 : VOPC_32 <0x000000d4, "V_CMPX_GT_U32">; +defm V_CMPX_NE_U32 : VOPC_32 <0x000000d5, "V_CMPX_NE_U32">; +defm V_CMPX_GE_U32 : VOPC_32 <0x000000d6, "V_CMPX_GE_U32">; +defm V_CMPX_T_U32 : VOPC_32 <0x000000d7, "V_CMPX_T_U32">; + +} // End hasSideEffects = 1, Defs = [EXEC] + +defm V_CMP_F_U64 : VOPC_64 <0x000000e0, "V_CMP_F_U64">; +defm V_CMP_LT_U64 : VOPC_64 <0x000000e1, "V_CMP_LT_U64">; +defm V_CMP_EQ_U64 : VOPC_64 <0x000000e2, "V_CMP_EQ_U64">; +defm V_CMP_LE_U64 : VOPC_64 <0x000000e3, "V_CMP_LE_U64">; +defm V_CMP_GT_U64 : VOPC_64 <0x000000e4, "V_CMP_GT_U64">; +defm V_CMP_NE_U64 : VOPC_64 <0x000000e5, "V_CMP_NE_U64">; +defm V_CMP_GE_U64 : VOPC_64 <0x000000e6, "V_CMP_GE_U64">; +defm V_CMP_T_U64 : VOPC_64 <0x000000e7, "V_CMP_T_U64">; + +let hasSideEffects = 1, Defs = [EXEC] in { + +defm V_CMPX_F_U64 : VOPC_64 <0x000000f0, "V_CMPX_F_U64">; +defm V_CMPX_LT_U64 : VOPC_64 <0x000000f1, "V_CMPX_LT_U64">; +defm V_CMPX_EQ_U64 : VOPC_64 <0x000000f2, "V_CMPX_EQ_U64">; +defm V_CMPX_LE_U64 : VOPC_64 <0x000000f3, "V_CMPX_LE_U64">; +defm V_CMPX_GT_U64 : VOPC_64 <0x000000f4, "V_CMPX_GT_U64">; +defm V_CMPX_NE_U64 : VOPC_64 <0x000000f5, "V_CMPX_NE_U64">; +defm V_CMPX_GE_U64 : VOPC_64 <0x000000f6, "V_CMPX_GE_U64">; +defm V_CMPX_T_U64 : VOPC_64 <0x000000f7, "V_CMPX_T_U64">; + +} // End hasSideEffects = 1, Defs = [EXEC] + +defm V_CMP_CLASS_F32 : VOPC_32 <0x00000088, "V_CMP_CLASS_F32">; + +let hasSideEffects = 1, Defs = [EXEC] in { +defm V_CMPX_CLASS_F32 : VOPC_32 <0x00000098, "V_CMPX_CLASS_F32">; +} // End hasSideEffects = 1, Defs = [EXEC] + +defm V_CMP_CLASS_F64 : VOPC_64 <0x000000a8, "V_CMP_CLASS_F64">; + +let hasSideEffects = 1, Defs = [EXEC] in { +defm V_CMPX_CLASS_F64 : VOPC_64 <0x000000b8, "V_CMPX_CLASS_F64">; +} // End hasSideEffects = 1, Defs = [EXEC] + +} // End isCompare = 1 -defm V_CMPX_F_I32 : VOPC_32 <0x00000090, "V_CMPX_F_I32", []>; -defm V_CMPX_LT_I32 : VOPC_32 <0x00000091, "V_CMPX_LT_I32", []>; -defm V_CMPX_EQ_I32 : VOPC_32 <0x00000092, "V_CMPX_EQ_I32", []>; -defm V_CMPX_LE_I32 : VOPC_32 <0x00000093, "V_CMPX_LE_I32", []>; -defm V_CMPX_GT_I32 : VOPC_32 <0x00000094, "V_CMPX_GT_I32", []>; -defm V_CMPX_NE_I32 : VOPC_32 <0x00000095, "V_CMPX_NE_I32", []>; -defm V_CMPX_GE_I32 : VOPC_32 <0x00000096, "V_CMPX_GE_I32", []>; -defm V_CMPX_T_I32 : VOPC_32 <0x00000097, "V_CMPX_T_I32", []>; - -} // End hasSideEffects - -defm V_CMP_F_I64 : VOPC_64 <0x000000a0, "V_CMP_F_I64", []>; -defm V_CMP_LT_I64 : VOPC_64 <0x000000a1, "V_CMP_LT_I64", []>; -defm V_CMP_EQ_I64 : VOPC_64 <0x000000a2, "V_CMP_EQ_I64", []>; -defm V_CMP_LE_I64 : VOPC_64 <0x000000a3, "V_CMP_LE_I64", []>; -defm V_CMP_GT_I64 : VOPC_64 <0x000000a4, "V_CMP_GT_I64", []>; -defm V_CMP_NE_I64 : VOPC_64 <0x000000a5, 
"V_CMP_NE_I64", []>; -defm V_CMP_GE_I64 : VOPC_64 <0x000000a6, "V_CMP_GE_I64", []>; -defm V_CMP_T_I64 : VOPC_64 <0x000000a7, "V_CMP_T_I64", []>; - -let hasSideEffects = 1 in { - -defm V_CMPX_F_I64 : VOPC_64 <0x000000b0, "V_CMPX_F_I64", []>; -defm V_CMPX_LT_I64 : VOPC_64 <0x000000b1, "V_CMPX_LT_I64", []>; -defm V_CMPX_EQ_I64 : VOPC_64 <0x000000b2, "V_CMPX_EQ_I64", []>; -defm V_CMPX_LE_I64 : VOPC_64 <0x000000b3, "V_CMPX_LE_I64", []>; -defm V_CMPX_GT_I64 : VOPC_64 <0x000000b4, "V_CMPX_GT_I64", []>; -defm V_CMPX_NE_I64 : VOPC_64 <0x000000b5, "V_CMPX_NE_I64", []>; -defm V_CMPX_GE_I64 : VOPC_64 <0x000000b6, "V_CMPX_GE_I64", []>; -defm V_CMPX_T_I64 : VOPC_64 <0x000000b7, "V_CMPX_T_I64", []>; - -} // End hasSideEffects - -defm V_CMP_F_U32 : VOPC_32 <0x000000c0, "V_CMP_F_U32", []>; -defm V_CMP_LT_U32 : VOPC_32 <0x000000c1, "V_CMP_LT_U32", []>; -defm V_CMP_EQ_U32 : VOPC_32 <0x000000c2, "V_CMP_EQ_U32", []>; -defm V_CMP_LE_U32 : VOPC_32 <0x000000c3, "V_CMP_LE_U32", []>; -defm V_CMP_GT_U32 : VOPC_32 <0x000000c4, "V_CMP_GT_U32", []>; -defm V_CMP_NE_U32 : VOPC_32 <0x000000c5, "V_CMP_NE_U32", []>; -defm V_CMP_GE_U32 : VOPC_32 <0x000000c6, "V_CMP_GE_U32", []>; -defm V_CMP_T_U32 : VOPC_32 <0x000000c7, "V_CMP_T_U32", []>; - -let hasSideEffects = 1 in { - -defm V_CMPX_F_U32 : VOPC_32 <0x000000d0, "V_CMPX_F_U32", []>; -defm V_CMPX_LT_U32 : VOPC_32 <0x000000d1, "V_CMPX_LT_U32", []>; -defm V_CMPX_EQ_U32 : VOPC_32 <0x000000d2, "V_CMPX_EQ_U32", []>; -defm V_CMPX_LE_U32 : VOPC_32 <0x000000d3, "V_CMPX_LE_U32", []>; -defm V_CMPX_GT_U32 : VOPC_32 <0x000000d4, "V_CMPX_GT_U32", []>; -defm V_CMPX_NE_U32 : VOPC_32 <0x000000d5, "V_CMPX_NE_U32", []>; -defm V_CMPX_GE_U32 : VOPC_32 <0x000000d6, "V_CMPX_GE_U32", []>; -defm V_CMPX_T_U32 : VOPC_32 <0x000000d7, "V_CMPX_T_U32", []>; - -} // End hasSideEffects - -defm V_CMP_F_U64 : VOPC_64 <0x000000e0, "V_CMP_F_U64", []>; -defm V_CMP_LT_U64 : VOPC_64 <0x000000e1, "V_CMP_LT_U64", []>; -defm V_CMP_EQ_U64 : VOPC_64 <0x000000e2, "V_CMP_EQ_U64", []>; -defm V_CMP_LE_U64 : VOPC_64 <0x000000e3, "V_CMP_LE_U64", []>; -defm V_CMP_GT_U64 : VOPC_64 <0x000000e4, "V_CMP_GT_U64", []>; -defm V_CMP_NE_U64 : VOPC_64 <0x000000e5, "V_CMP_NE_U64", []>; -defm V_CMP_GE_U64 : VOPC_64 <0x000000e6, "V_CMP_GE_U64", []>; -defm V_CMP_T_U64 : VOPC_64 <0x000000e7, "V_CMP_T_U64", []>; -defm V_CMPX_F_U64 : VOPC_64 <0x000000f0, "V_CMPX_F_U64", []>; -defm V_CMPX_LT_U64 : VOPC_64 <0x000000f1, "V_CMPX_LT_U64", []>; -defm V_CMPX_EQ_U64 : VOPC_64 <0x000000f2, "V_CMPX_EQ_U64", []>; -defm V_CMPX_LE_U64 : VOPC_64 <0x000000f3, "V_CMPX_LE_U64", []>; -defm V_CMPX_GT_U64 : VOPC_64 <0x000000f4, "V_CMPX_GT_U64", []>; -defm V_CMPX_NE_U64 : VOPC_64 <0x000000f5, "V_CMPX_NE_U64", []>; -defm V_CMPX_GE_U64 : VOPC_64 <0x000000f6, "V_CMPX_GE_U64", []>; -defm V_CMPX_T_U64 : VOPC_64 <0x000000f7, "V_CMPX_T_U64", []>; -defm V_CMP_CLASS_F32 : VOPC_32 <0x00000088, "V_CMP_CLASS_F32", []>; -defm V_CMPX_CLASS_F32 : VOPC_32 <0x00000098, "V_CMPX_CLASS_F32", []>; -defm V_CMP_CLASS_F64 : VOPC_64 <0x000000a8, "V_CMP_CLASS_F64", []>; -defm V_CMPX_CLASS_F64 : VOPC_64 <0x000000b8, "V_CMPX_CLASS_F64", []>; //def BUFFER_LOAD_FORMAT_X : MUBUF_ <0x00000000, "BUFFER_LOAD_FORMAT_X", []>; //def BUFFER_LOAD_FORMAT_XY : MUBUF_ <0x00000001, "BUFFER_LOAD_FORMAT_XY", []>; //def BUFFER_LOAD_FORMAT_XYZ : MUBUF_ <0x00000002, "BUFFER_LOAD_FORMAT_XYZ", []>; @@ -461,11 +456,13 @@ def TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Load_Helper <0x00000003, "TBUFFER_LOAD_FORM //def TBUFFER_STORE_FORMAT_XYZ : MTBUF_ <0x00000006, "TBUFFER_STORE_FORMAT_XYZ", []>; //def 
TBUFFER_STORE_FORMAT_XYZW : MTBUF_ <0x00000007, "TBUFFER_STORE_FORMAT_XYZW", []>; -defm S_LOAD_DWORD : SMRD_32 <0x00000000, "S_LOAD_DWORD", SReg_32>; +let mayLoad = 1 in { + +defm S_LOAD_DWORD : SMRD_Helper <0x00000000, "S_LOAD_DWORD", SReg_32>; //def S_LOAD_DWORDX2 : SMRD_DWORDX2 <0x00000001, "S_LOAD_DWORDX2", []>; -defm S_LOAD_DWORDX4 : SMRD_Helper <0x00000002, "S_LOAD_DWORDX4", SReg_128, v4i32>; -defm S_LOAD_DWORDX8 : SMRD_Helper <0x00000003, "S_LOAD_DWORDX8", SReg_256, v8i32>; +defm S_LOAD_DWORDX4 : SMRD_Helper <0x00000002, "S_LOAD_DWORDX4", SReg_128>; +defm S_LOAD_DWORDX8 : SMRD_Helper <0x00000003, "S_LOAD_DWORDX8", SReg_256>; //def S_LOAD_DWORDX16 : SMRD_DWORDX16 <0x00000004, "S_LOAD_DWORDX16", []>; //def S_BUFFER_LOAD_DWORD : SMRD_ <0x00000008, "S_BUFFER_LOAD_DWORD", []>; //def S_BUFFER_LOAD_DWORDX2 : SMRD_DWORDX2 <0x00000009, "S_BUFFER_LOAD_DWORDX2", []>; @@ -473,6 +470,8 @@ defm S_LOAD_DWORDX8 : SMRD_Helper <0x00000003, "S_LOAD_DWORDX8", SReg_256, v8i32 //def S_BUFFER_LOAD_DWORDX8 : SMRD_DWORDX8 <0x0000000b, "S_BUFFER_LOAD_DWORDX8", []>; //def S_BUFFER_LOAD_DWORDX16 : SMRD_DWORDX16 <0x0000000c, "S_BUFFER_LOAD_DWORDX16", []>; +} // mayLoad = 1 + //def S_MEMTIME : SMRD_ <0x0000001e, "S_MEMTIME", []>; //def S_DCACHE_INV : SMRD_ <0x0000001f, "S_DCACHE_INV", []>; //def IMAGE_LOAD : MIMG_NoPattern_ <"IMAGE_LOAD", 0x00000000>; @@ -511,12 +510,12 @@ def IMAGE_SAMPLE_L : MIMG_Load_Helper <0x00000024, "IMAGE_SAMPLE_L">; def IMAGE_SAMPLE_B : MIMG_Load_Helper <0x00000025, "IMAGE_SAMPLE_B">; //def IMAGE_SAMPLE_B_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_B_CL", 0x00000026>; //def IMAGE_SAMPLE_LZ : MIMG_NoPattern_ <"IMAGE_SAMPLE_LZ", 0x00000027>; -//def IMAGE_SAMPLE_C : MIMG_NoPattern_ <"IMAGE_SAMPLE_C", 0x00000028>; +def IMAGE_SAMPLE_C : MIMG_Load_Helper <0x00000028, "IMAGE_SAMPLE_C">; //def IMAGE_SAMPLE_C_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_CL", 0x00000029>; //def IMAGE_SAMPLE_C_D : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_D", 0x0000002a>; //def IMAGE_SAMPLE_C_D_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_D_CL", 0x0000002b>; -//def IMAGE_SAMPLE_C_L : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_L", 0x0000002c>; -//def IMAGE_SAMPLE_C_B : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_B", 0x0000002d>; +def IMAGE_SAMPLE_C_L : MIMG_Load_Helper <0x0000002c, "IMAGE_SAMPLE_C_L">; +def IMAGE_SAMPLE_C_B : MIMG_Load_Helper <0x0000002d, "IMAGE_SAMPLE_C_B">; //def IMAGE_SAMPLE_C_B_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_B_CL", 0x0000002e>; //def IMAGE_SAMPLE_C_LZ : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_LZ", 0x0000002f>; //def IMAGE_SAMPLE_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_O", 0x00000030>; @@ -572,19 +571,21 @@ def IMAGE_SAMPLE_B : MIMG_Load_Helper <0x00000025, "IMAGE_SAMPLE_B">; //def IMAGE_SAMPLER : MIMG_NoPattern_ <"IMAGE_SAMPLER", 0x0000007f>; //def V_NOP : VOP1_ <0x00000000, "V_NOP", []>; -let neverHasSideEffects = 1 in { + +let neverHasSideEffects = 1, isMoveImm = 1 in { defm V_MOV_B32 : VOP1_32 <0x00000001, "V_MOV_B32", []>; -} // End neverHasSideEffects +} // End neverHasSideEffects = 1, isMoveImm = 1 + defm V_READFIRSTLANE_B32 : VOP1_32 <0x00000002, "V_READFIRSTLANE_B32", []>; //defm V_CVT_I32_F64 : VOP1_32 <0x00000003, "V_CVT_I32_F64", []>; //defm V_CVT_F64_I32 : VOP1_64 <0x00000004, "V_CVT_F64_I32", []>; defm V_CVT_F32_I32 : VOP1_32 <0x00000005, "V_CVT_F32_I32", - [(set VReg_32:$dst, (sint_to_fp AllReg_32:$src0))] + [(set VReg_32:$dst, (sint_to_fp VSrc_32:$src0))] >; //defm V_CVT_F32_U32 : VOP1_32 <0x00000006, "V_CVT_F32_U32", []>; //defm V_CVT_U32_F32 : VOP1_32 <0x00000007, "V_CVT_U32_F32", []>; defm V_CVT_I32_F32 : VOP1_32 <0x00000008, 
"V_CVT_I32_F32", - [(set VReg_32:$dst, (fp_to_sint AllReg_32:$src0))] + [(set (i32 VReg_32:$dst), (fp_to_sint VSrc_32:$src0))] >; defm V_MOV_FED_B32 : VOP1_32 <0x00000009, "V_MOV_FED_B32", []>; ////def V_CVT_F16_F32 : VOP1_F16 <0x0000000a, "V_CVT_F16_F32", []>; @@ -601,31 +602,35 @@ defm V_MOV_FED_B32 : VOP1_32 <0x00000009, "V_MOV_FED_B32", []>; //defm V_CVT_U32_F64 : VOP1_32 <0x00000015, "V_CVT_U32_F64", []>; //defm V_CVT_F64_U32 : VOP1_64 <0x00000016, "V_CVT_F64_U32", []>; defm V_FRACT_F32 : VOP1_32 <0x00000020, "V_FRACT_F32", - [(set VReg_32:$dst, (AMDGPUfract AllReg_32:$src0))] + [(set VReg_32:$dst, (AMDGPUfract VSrc_32:$src0))] >; defm V_TRUNC_F32 : VOP1_32 <0x00000021, "V_TRUNC_F32", []>; -defm V_CEIL_F32 : VOP1_32 <0x00000022, "V_CEIL_F32", []>; +defm V_CEIL_F32 : VOP1_32 <0x00000022, "V_CEIL_F32", + [(set VReg_32:$dst, (fceil VSrc_32:$src0))] +>; defm V_RNDNE_F32 : VOP1_32 <0x00000023, "V_RNDNE_F32", - [(set VReg_32:$dst, (frint AllReg_32:$src0))] + [(set VReg_32:$dst, (frint VSrc_32:$src0))] >; defm V_FLOOR_F32 : VOP1_32 <0x00000024, "V_FLOOR_F32", - [(set VReg_32:$dst, (ffloor AllReg_32:$src0))] + [(set VReg_32:$dst, (ffloor VSrc_32:$src0))] >; defm V_EXP_F32 : VOP1_32 <0x00000025, "V_EXP_F32", - [(set VReg_32:$dst, (fexp2 AllReg_32:$src0))] + [(set VReg_32:$dst, (fexp2 VSrc_32:$src0))] >; defm V_LOG_CLAMP_F32 : VOP1_32 <0x00000026, "V_LOG_CLAMP_F32", []>; -defm V_LOG_F32 : VOP1_32 <0x00000027, "V_LOG_F32", []>; +defm V_LOG_F32 : VOP1_32 <0x00000027, "V_LOG_F32", + [(set VReg_32:$dst, (flog2 VSrc_32:$src0))] +>; defm V_RCP_CLAMP_F32 : VOP1_32 <0x00000028, "V_RCP_CLAMP_F32", []>; defm V_RCP_LEGACY_F32 : VOP1_32 <0x00000029, "V_RCP_LEGACY_F32", []>; defm V_RCP_F32 : VOP1_32 <0x0000002a, "V_RCP_F32", - [(set VReg_32:$dst, (fdiv FP_ONE, AllReg_32:$src0))] + [(set VReg_32:$dst, (fdiv FP_ONE, VSrc_32:$src0))] >; defm V_RCP_IFLAG_F32 : VOP1_32 <0x0000002b, "V_RCP_IFLAG_F32", []>; defm V_RSQ_CLAMP_F32 : VOP1_32 <0x0000002c, "V_RSQ_CLAMP_F32", []>; defm V_RSQ_LEGACY_F32 : VOP1_32 < 0x0000002d, "V_RSQ_LEGACY_F32", - [(set VReg_32:$dst, (int_AMDGPU_rsq AllReg_32:$src0))] + [(set VReg_32:$dst, (int_AMDGPU_rsq VSrc_32:$src0))] >; defm V_RSQ_F32 : VOP1_32 <0x0000002e, "V_RSQ_F32", []>; defm V_RCP_F64 : VOP1_64 <0x0000002f, "V_RCP_F64", []>; @@ -655,7 +660,7 @@ def V_INTERP_P1_F32 : VINTRP < 0x00000000, (outs VReg_32:$dst), (ins VReg_32:$i, i32imm:$attr_chan, i32imm:$attr, M0Reg:$m0), - "V_INTERP_P1_F32", + "V_INTERP_P1_F32 $dst, $i, $attr_chan, $attr, [$m0]", []> { let DisableEncoding = "$m0"; } @@ -664,7 +669,7 @@ def V_INTERP_P2_F32 : VINTRP < 0x00000001, (outs VReg_32:$dst), (ins VReg_32:$src0, VReg_32:$j, i32imm:$attr_chan, i32imm:$attr, M0Reg:$m0), - "V_INTERP_P2_F32", + "V_INTERP_P2_F32 $dst, [$src0], $j, $attr_chan, $attr, [$m0]", []> { let Constraints = "$src0 = $dst"; @@ -675,10 +680,9 @@ def V_INTERP_P2_F32 : VINTRP < def V_INTERP_MOV_F32 : VINTRP < 0x00000002, (outs VReg_32:$dst), - (ins i32imm:$attr_chan, i32imm:$attr, M0Reg:$m0), - "V_INTERP_MOV_F32", + (ins InterpSlot:$src0, i32imm:$attr_chan, i32imm:$attr, M0Reg:$m0), + "V_INTERP_MOV_F32 $dst, $src0, $attr_chan, $attr, [$m0]", []> { - let VSRC = 0; let DisableEncoding = "$m0"; } @@ -695,7 +699,7 @@ def S_ENDPGM : SOPP <0x00000001, (ins), "S_ENDPGM", let isBranch = 1 in { def S_BRANCH : SOPP < - 0x00000002, (ins brtarget:$target), "S_BRANCH", + 0x00000002, (ins brtarget:$target), "S_BRANCH $target", [(br bb:$target)]> { let isBarrier = 1; } @@ -703,35 +707,35 @@ def S_BRANCH : SOPP < let DisableEncoding = "$scc" in { def 
S_CBRANCH_SCC0 : SOPP < 0x00000004, (ins brtarget:$target, SCCReg:$scc), - "S_CBRANCH_SCC0", [] + "S_CBRANCH_SCC0 $target", [] >; def S_CBRANCH_SCC1 : SOPP < 0x00000005, (ins brtarget:$target, SCCReg:$scc), - "S_CBRANCH_SCC1", + "S_CBRANCH_SCC1 $target", [] >; } // End DisableEncoding = "$scc" def S_CBRANCH_VCCZ : SOPP < 0x00000006, (ins brtarget:$target, VCCReg:$vcc), - "S_CBRANCH_VCCZ", + "S_CBRANCH_VCCZ $target", [] >; def S_CBRANCH_VCCNZ : SOPP < 0x00000007, (ins brtarget:$target, VCCReg:$vcc), - "S_CBRANCH_VCCNZ", + "S_CBRANCH_VCCNZ $target", [] >; let DisableEncoding = "$exec" in { def S_CBRANCH_EXECZ : SOPP < 0x00000008, (ins brtarget:$target, EXECReg:$exec), - "S_CBRANCH_EXECZ", + "S_CBRANCH_EXECZ $target", [] >; def S_CBRANCH_EXECNZ : SOPP < 0x00000009, (ins brtarget:$target, EXECReg:$exec), - "S_CBRANCH_EXECNZ", + "S_CBRANCH_EXECNZ $target", [] >; } // End DisableEncoding = "$exec" @@ -758,80 +762,101 @@ def S_WAITCNT : SOPP <0x0000000c, (ins i32imm:$simm16), "S_WAITCNT $simm16", //def S_TTRACEDATA : SOPP_ <0x00000016, "S_TTRACEDATA", []>; def V_CNDMASK_B32_e32 : VOP2 <0x00000000, (outs VReg_32:$dst), - (ins AllReg_32:$src0, VReg_32:$src1, VCCReg:$vcc), "V_CNDMASK_B32_e32", + (ins VSrc_32:$src0, VReg_32:$src1, VCCReg:$vcc), + "V_CNDMASK_B32_e32 $dst, $src0, $src1, [$vcc]", [] >{ let DisableEncoding = "$vcc"; } def V_CNDMASK_B32_e64 : VOP3 <0x00000100, (outs VReg_32:$dst), - (ins VReg_32:$src0, VReg_32:$src1, SReg_1:$src2, InstFlag:$abs, InstFlag:$clamp, InstFlag:$omod, InstFlag:$neg), - "V_CNDMASK_B32_e64", - [(set (i32 VReg_32:$dst), (select SReg_1:$src2, VReg_32:$src1, VReg_32:$src0))] + (ins VSrc_32:$src0, VSrc_32:$src1, SSrc_64:$src2, + InstFlag:$abs, InstFlag:$clamp, InstFlag:$omod, InstFlag:$neg), + "V_CNDMASK_B32_e64 $dst, $src0, $src1, $src2, $abs, $clamp, $omod, $neg", + [(set (i32 VReg_32:$dst), (select (i1 SSrc_64:$src2), + VSrc_32:$src1, VSrc_32:$src0))] >; //f32 pattern for V_CNDMASK_B32_e64 def : Pat < - (f32 (select SReg_1:$src2, VReg_32:$src1, VReg_32:$src0)), - (V_CNDMASK_B32_e64 VReg_32:$src0, VReg_32:$src1, SReg_1:$src2) + (f32 (select (i1 SSrc_64:$src2), VSrc_32:$src1, VSrc_32:$src0)), + (V_CNDMASK_B32_e64 VSrc_32:$src0, VSrc_32:$src1, SSrc_64:$src2) >; defm V_READLANE_B32 : VOP2_32 <0x00000001, "V_READLANE_B32", []>; defm V_WRITELANE_B32 : VOP2_32 <0x00000002, "V_WRITELANE_B32", []>; -defm V_ADD_F32 : VOP2_32 <0x00000003, "V_ADD_F32", []>; -def : Pat < - (f32 (fadd AllReg_32:$src0, VReg_32:$src1)), - (V_ADD_F32_e32 AllReg_32:$src0, VReg_32:$src1) +let isCommutable = 1 in { +defm V_ADD_F32 : VOP2_32 <0x00000003, "V_ADD_F32", + [(set VReg_32:$dst, (fadd VSrc_32:$src0, VReg_32:$src1))] >; +} // End isCommutable = 1 -defm V_SUB_F32 : VOP2_32 <0x00000004, "V_SUB_F32", []>; -def : Pat < - (f32 (fsub AllReg_32:$src0, VReg_32:$src1)), - (V_SUB_F32_e32 AllReg_32:$src0, VReg_32:$src1) +defm V_SUB_F32 : VOP2_32 <0x00000004, "V_SUB_F32", + [(set VReg_32:$dst, (fsub VSrc_32:$src0, VReg_32:$src1))] >; + defm V_SUBREV_F32 : VOP2_32 <0x00000005, "V_SUBREV_F32", []>; defm V_MAC_LEGACY_F32 : VOP2_32 <0x00000006, "V_MAC_LEGACY_F32", []>; + +let isCommutable = 1 in { + defm V_MUL_LEGACY_F32 : VOP2_32 < 0x00000007, "V_MUL_LEGACY_F32", - [(set VReg_32:$dst, (int_AMDGPU_mul AllReg_32:$src0, VReg_32:$src1))] + [(set VReg_32:$dst, (int_AMDGPU_mul VSrc_32:$src0, VReg_32:$src1))] >; defm V_MUL_F32 : VOP2_32 <0x00000008, "V_MUL_F32", - [(set VReg_32:$dst, (fmul AllReg_32:$src0, VReg_32:$src1))] + [(set VReg_32:$dst, (fmul VSrc_32:$src0, VReg_32:$src1))] >; + +} // End isCommutable = 1 + 
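The let isCommutable = 1 wrappers above are load-bearing: a VOP2 encoding accepts a constant or scalar source only in src0 (the VSrc_32 operand), while src1 must stay a VGPR, so flagging fadd, fmul, min/max and the bitwise ops as commutable lets the compiler swap operands when the foldable value sits on the wrong side. A minimal standalone C++ sketch of that swap follows; the Operand type and function are illustrative stand-ins, not LLVM API:

    #include <cassert>
    #include <utility>

    struct Operand { bool isVGPR; };

    // Order a commutative VOP2's inputs so that a constant or SGPR value,
    // if present, lands in src0, the only VOP2 slot that accepts one.
    std::pair<Operand, Operand> canonicalize(Operand a, Operand b) {
      if (a.isVGPR && !b.isVGPR)
        return {b, a}; // swap, legal only because the operation commutes
      return {a, b};
    }

    int main() {
      Operand vgpr{true}, sgpr{false};
      std::pair<Operand, Operand> ops = canonicalize(vgpr, sgpr);
      assert(!ops.first.isVGPR && ops.second.isVGPR);
      return 0;
    }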
 //defm V_MUL_I32_I24 : VOP2_32 <0x00000009, "V_MUL_I32_I24", []>;
 //defm V_MUL_HI_I32_I24 : VOP2_32 <0x0000000a, "V_MUL_HI_I32_I24", []>;
 //defm V_MUL_U32_U24 : VOP2_32 <0x0000000b, "V_MUL_U32_U24", []>;
 //defm V_MUL_HI_U32_U24 : VOP2_32 <0x0000000c, "V_MUL_HI_U32_U24", []>;
+
+let isCommutable = 1 in {
+
 defm V_MIN_LEGACY_F32 : VOP2_32 <0x0000000d, "V_MIN_LEGACY_F32",
-  [(set VReg_32:$dst, (AMDGPUfmin AllReg_32:$src0, VReg_32:$src1))]
+  [(set VReg_32:$dst, (AMDGPUfmin VSrc_32:$src0, VReg_32:$src1))]
 >;
 defm V_MAX_LEGACY_F32 : VOP2_32 <0x0000000e, "V_MAX_LEGACY_F32",
-  [(set VReg_32:$dst, (AMDGPUfmax AllReg_32:$src0, VReg_32:$src1))]
+  [(set VReg_32:$dst, (AMDGPUfmax VSrc_32:$src0, VReg_32:$src1))]
 >;
+
 defm V_MIN_F32 : VOP2_32 <0x0000000f, "V_MIN_F32", []>;
 defm V_MAX_F32 : VOP2_32 <0x00000010, "V_MAX_F32", []>;
 defm V_MIN_I32 : VOP2_32 <0x00000011, "V_MIN_I32", []>;
 defm V_MAX_I32 : VOP2_32 <0x00000012, "V_MAX_I32", []>;
 defm V_MIN_U32 : VOP2_32 <0x00000013, "V_MIN_U32", []>;
 defm V_MAX_U32 : VOP2_32 <0x00000014, "V_MAX_U32", []>;
+
+} // End isCommutable = 1
+
 defm V_LSHR_B32 : VOP2_32 <0x00000015, "V_LSHR_B32", []>;
 defm V_LSHRREV_B32 : VOP2_32 <0x00000016, "V_LSHRREV_B32", []>;
 defm V_ASHR_I32 : VOP2_32 <0x00000017, "V_ASHR_I32", []>;
 defm V_ASHRREV_I32 : VOP2_32 <0x00000018, "V_ASHRREV_I32", []>;
 defm V_LSHL_B32 : VOP2_32 <0x00000019, "V_LSHL_B32", []>;
 defm V_LSHLREV_B32 : VOP2_32 <0x0000001a, "V_LSHLREV_B32", []>;
+
+let isCommutable = 1 in {
+
 defm V_AND_B32 : VOP2_32 <0x0000001b, "V_AND_B32",
-  [(set VReg_32:$dst, (and AllReg_32:$src0, VReg_32:$src1))]
+  [(set VReg_32:$dst, (and VSrc_32:$src0, VReg_32:$src1))]
 >;
 defm V_OR_B32 : VOP2_32 <0x0000001c, "V_OR_B32",
-  [(set VReg_32:$dst, (or AllReg_32:$src0, VReg_32:$src1))]
+  [(set VReg_32:$dst, (or VSrc_32:$src0, VReg_32:$src1))]
 >;
 defm V_XOR_B32 : VOP2_32 <0x0000001d, "V_XOR_B32",
-  [(set VReg_32:$dst, (xor AllReg_32:$src0, VReg_32:$src1))]
+  [(set VReg_32:$dst, (xor VSrc_32:$src0, VReg_32:$src1))]
 >;
+
+} // End isCommutable = 1
+
 defm V_BFM_B32 : VOP2_32 <0x0000001e, "V_BFM_B32", []>;
 defm V_MAC_F32 : VOP2_32 <0x0000001f, "V_MAC_F32", []>;
 defm V_MADMK_F32 : VOP2_32 <0x00000020, "V_MADMK_F32", []>;
@@ -840,23 +865,30 @@ defm V_MADAK_F32 : VOP2_32 <0x00000021, "V_MADAK_F32", []>;
 //defm V_MBCNT_LO_U32_B32 : VOP2_32 <0x00000023, "V_MBCNT_LO_U32_B32", []>;
 //defm V_MBCNT_HI_U32_B32 : VOP2_32 <0x00000024, "V_MBCNT_HI_U32_B32", []>;
 let Defs = [VCC] in { // Carry-out goes to VCC
-defm V_ADD_I32 : VOP2_32 <0x00000025, "V_ADD_I32",
-  [(set VReg_32:$dst, (add (i32 AllReg_32:$src0), (i32 VReg_32:$src1)))]
+
+let isCommutable = 1 in {
+defm V_ADD_I32 : VOP2b_32 <0x00000025, "V_ADD_I32",
+  [(set VReg_32:$dst, (add (i32 VSrc_32:$src0), (i32 VReg_32:$src1)))]
 >;
-defm V_SUB_I32 : VOP2_32 <0x00000026, "V_SUB_I32",
-  [(set VReg_32:$dst, (sub (i32 AllReg_32:$src0), (i32 VReg_32:$src1)))]
+} // End isCommutable = 1
+
+defm V_SUB_I32 : VOP2b_32 <0x00000026, "V_SUB_I32",
+  [(set VReg_32:$dst, (sub (i32 VSrc_32:$src0), (i32 VReg_32:$src1)))]
 >;
+
+defm V_SUBREV_I32 : VOP2b_32 <0x00000027, "V_SUBREV_I32", []>;
+let Uses = [VCC] in { // Carry-in comes from VCC
+defm V_ADDC_U32 : VOP2b_32 <0x00000028, "V_ADDC_U32", []>;
+defm V_SUBB_U32 : VOP2b_32 <0x00000029, "V_SUBB_U32", []>;
+defm V_SUBBREV_U32 : VOP2b_32 <0x0000002a, "V_SUBBREV_U32", []>;
+} // End Uses = [VCC]
 } // End Defs = [VCC]
-defm V_SUBREV_I32 : VOP2_32 <0x00000027, "V_SUBREV_I32", []>;
-defm V_ADDC_U32 : VOP2_32 <0x00000028, "V_ADDC_U32", []>;
-defm V_SUBB_U32 : VOP2_32 <0x00000029,
"V_SUBB_U32", []>; -defm V_SUBBREV_U32 : VOP2_32 <0x0000002a, "V_SUBBREV_U32", []>; defm V_LDEXP_F32 : VOP2_32 <0x0000002b, "V_LDEXP_F32", []>; ////def V_CVT_PKACCUM_U8_F32 : VOP2_U8 <0x0000002c, "V_CVT_PKACCUM_U8_F32", []>; ////def V_CVT_PKNORM_I16_F32 : VOP2_I16 <0x0000002d, "V_CVT_PKNORM_I16_F32", []>; ////def V_CVT_PKNORM_U16_F32 : VOP2_U16 <0x0000002e, "V_CVT_PKNORM_U16_F32", []>; defm V_CVT_PKRTZ_F16_F32 : VOP2_32 <0x0000002f, "V_CVT_PKRTZ_F16_F32", - [(set VReg_32:$dst, (int_SI_packf16 AllReg_32:$src0, VReg_32:$src1))] + [(set VReg_32:$dst, (int_SI_packf16 VSrc_32:$src0, VReg_32:$src1))] >; ////def V_CVT_PK_U16_U32 : VOP2_U16 <0x00000030, "V_CVT_PK_U16_U32", []>; ////def V_CVT_PK_I16_I32 : VOP2_I16 <0x00000031, "V_CVT_PK_I16_I32", []>; @@ -926,6 +958,10 @@ def V_LDEXP_F64 : VOP3_64 <0x00000168, "V_LDEXP_F64", []>; def V_MUL_LO_U32 : VOP3_32 <0x00000169, "V_MUL_LO_U32", []>; def V_MUL_HI_U32 : VOP3_32 <0x0000016a, "V_MUL_HI_U32", []>; def V_MUL_LO_I32 : VOP3_32 <0x0000016b, "V_MUL_LO_I32", []>; +def : Pat < + (mul VSrc_32:$src0, VReg_32:$src1), + (V_MUL_LO_I32 VSrc_32:$src0, VReg_32:$src1, (i32 0), 0, 0, 0, 0) +>; def V_MUL_HI_I32 : VOP3_32 <0x0000016c, "V_MUL_HI_I32", []>; def V_DIV_SCALE_F32 : VOP3_32 <0x0000016d, "V_DIV_SCALE_F32", []>; def V_DIV_SCALE_F64 : VOP3_64 <0x0000016e, "V_DIV_SCALE_F64", []>; @@ -949,27 +985,35 @@ def S_MAX_U32 : SOP2_32 <0x00000009, "S_MAX_U32", []>; def S_CSELECT_B32 : SOP2 < 0x0000000a, (outs SReg_32:$dst), (ins SReg_32:$src0, SReg_32:$src1, SCCReg:$scc), "S_CSELECT_B32", - [(set (i32 SReg_32:$dst), (select SCCReg:$scc, SReg_32:$src0, SReg_32:$src1))] + [(set (i32 SReg_32:$dst), (select (i1 SCCReg:$scc), + SReg_32:$src0, SReg_32:$src1))] >; def S_CSELECT_B64 : SOP2_64 <0x0000000b, "S_CSELECT_B64", []>; // f32 pattern for S_CSELECT_B32 def : Pat < - (f32 (select SCCReg:$scc, SReg_32:$src0, SReg_32:$src1)), + (f32 (select (i1 SCCReg:$scc), SReg_32:$src0, SReg_32:$src1)), (S_CSELECT_B32 SReg_32:$src0, SReg_32:$src1, SCCReg:$scc) >; def S_AND_B32 : SOP2_32 <0x0000000e, "S_AND_B32", []>; def S_AND_B64 : SOP2_64 <0x0000000f, "S_AND_B64", - [(set SReg_64:$dst, (and SReg_64:$src0, SReg_64:$src1))] + [(set SReg_64:$dst, (i64 (and SSrc_64:$src0, SSrc_64:$src1)))] >; -def S_AND_VCC : SOP2_VCC <0x0000000f, "S_AND_B64", - [(set SReg_1:$vcc, (SIvcc_and SReg_64:$src0, SReg_64:$src1))] + +def : Pat < + (i1 (and SSrc_64:$src0, SSrc_64:$src1)), + (S_AND_B64 SSrc_64:$src0, SSrc_64:$src1) >; + def S_OR_B32 : SOP2_32 <0x00000010, "S_OR_B32", []>; def S_OR_B64 : SOP2_64 <0x00000011, "S_OR_B64", []>; +def : Pat < + (i1 (or SSrc_64:$src0, SSrc_64:$src1)), + (S_OR_B64 SSrc_64:$src0, SSrc_64:$src1) +>; def S_XOR_B32 : SOP2_32 <0x00000012, "S_XOR_B32", []>; def S_XOR_B64 : SOP2_64 <0x00000013, "S_XOR_B64", []>; def S_ANDN2_B32 : SOP2_32 <0x00000014, "S_ANDN2_B32", []>; @@ -998,54 +1042,12 @@ def S_BFE_I64 : SOP2_64 <0x0000002a, "S_BFE_I64", []>; //def S_CBRANCH_G_FORK : SOP2_ <0x0000002b, "S_CBRANCH_G_FORK", []>; def S_ABSDIFF_I32 : SOP2_32 <0x0000002c, "S_ABSDIFF_I32", []>; -class V_MOV_IMM <Operand immType, SDNode immNode> : InstSI < - (outs VReg_32:$dst), - (ins immType:$src0), - "V_MOV_IMM", - [(set VReg_32:$dst, (immNode:$src0))] ->; - -let isCodeGenOnly = 1, isPseudo = 1 in { - -def V_MOV_IMM_I32 : V_MOV_IMM<i32imm, imm>; -def V_MOV_IMM_F32 : V_MOV_IMM<f32imm, fpimm>; - -def S_MOV_IMM_I32 : InstSI < - (outs SReg_32:$dst), - (ins i32imm:$src0), - "S_MOV_IMM_I32", - [(set SReg_32:$dst, (imm:$src0))] ->; - -// i64 immediates aren't really supported in hardware, but LLVM will 
use the i64 -// type for indices on load and store instructions. The pattern for -// S_MOV_IMM_I64 will only match i64 immediates that can fit into 32-bits, -// which the hardware can handle. -def S_MOV_IMM_I64 : InstSI < - (outs SReg_64:$dst), - (ins i64imm:$src0), - "S_MOV_IMM_I64 $dst, $src0", - [(set SReg_64:$dst, (IMM32bitIn64bit:$src0))] ->; - -} // End isCodeGenOnly, isPseudo = 1 - -class SI_LOAD_LITERAL<Operand ImmType> : - Enc32 <(outs), (ins ImmType:$imm), "LOAD_LITERAL $imm", []> { - - bits<32> imm; - let Inst{31-0} = imm; -} - -def SI_LOAD_LITERAL_I32 : SI_LOAD_LITERAL<i32imm>; -def SI_LOAD_LITERAL_F32 : SI_LOAD_LITERAL<f32imm>; - let isCodeGenOnly = 1, isPseudo = 1 in { def SET_M0 : InstSI < (outs SReg_32:$dst), (ins i32imm:$src0), - "SET_M0", + "SET_M0 $dst, $src0", [(set SReg_32:$dst, (int_SI_set_M0 imm:$src0))] >; @@ -1058,13 +1060,6 @@ def LOAD_CONST : AMDGPUShaderInst < let usesCustomInserter = 1 in { -def SI_V_CNDLT : InstSI < - (outs VReg_32:$dst), - (ins VReg_32:$src0, VReg_32:$src1, VReg_32:$src2), - "SI_V_CNDLT $dst, $src0, $src1, $src2", - [(set VReg_32:$dst, (int_AMDGPU_cndlt VReg_32:$src0, VReg_32:$src1, VReg_32:$src2))] ->; - def SI_INTERP : InstSI < (outs VReg_32:$dst), (ins VReg_32:$i, VReg_32:$j, i32imm:$attr_chan, i32imm:$attr, SReg_32:$params), @@ -1072,21 +1067,6 @@ def SI_INTERP : InstSI < [] >; -def SI_INTERP_CONST : InstSI < - (outs VReg_32:$dst), - (ins i32imm:$attr_chan, i32imm:$attr, SReg_32:$params), - "SI_INTERP_CONST $dst, $attr_chan, $attr, $params", - [(set VReg_32:$dst, (int_SI_fs_interp_constant imm:$attr_chan, - imm:$attr, SReg_32:$params))] ->; - -def SI_KIL : InstSI < - (outs), - (ins VReg_32:$src), - "SI_KIL $src", - [(int_AMDGPU_kill VReg_32:$src)] ->; - def SI_WQM : InstSI < (outs), (ins), @@ -1106,15 +1086,15 @@ let isBranch = 1, isTerminator = 1 in { def SI_IF : InstSI < (outs SReg_64:$dst), - (ins SReg_1:$vcc, brtarget:$target), - "SI_IF", - [(set SReg_64:$dst, (int_SI_if SReg_1:$vcc, bb:$target))] + (ins SReg_64:$vcc, brtarget:$target), + "SI_IF $dst, $vcc, $target", + [(set SReg_64:$dst, (int_SI_if SReg_64:$vcc, bb:$target))] >; def SI_ELSE : InstSI < (outs SReg_64:$dst), (ins SReg_64:$src, brtarget:$target), - "SI_ELSE", + "SI_ELSE $dst, $src, $target", [(set SReg_64:$dst, (int_SI_else SReg_64:$src, bb:$target))]> { let Constraints = "$src = $dst"; @@ -1123,7 +1103,7 @@ def SI_ELSE : InstSI < def SI_LOOP : InstSI < (outs), (ins SReg_64:$saved, brtarget:$target), - "SI_LOOP", + "SI_LOOP $saved, $target", [(int_SI_loop SReg_64:$saved, bb:$target)] >; @@ -1132,43 +1112,60 @@ def SI_LOOP : InstSI < def SI_BREAK : InstSI < (outs SReg_64:$dst), (ins SReg_64:$src), - "SI_ELSE", + "SI_ELSE $dst, $src", [(set SReg_64:$dst, (int_SI_break SReg_64:$src))] >; def SI_IF_BREAK : InstSI < (outs SReg_64:$dst), - (ins SReg_1:$vcc, SReg_64:$src), - "SI_IF_BREAK", - [(set SReg_64:$dst, (int_SI_if_break SReg_1:$vcc, SReg_64:$src))] + (ins SReg_64:$vcc, SReg_64:$src), + "SI_IF_BREAK $dst, $vcc, $src", + [(set SReg_64:$dst, (int_SI_if_break SReg_64:$vcc, SReg_64:$src))] >; def SI_ELSE_BREAK : InstSI < (outs SReg_64:$dst), (ins SReg_64:$src0, SReg_64:$src1), - "SI_ELSE_BREAK", + "SI_ELSE_BREAK $dst, $src0, $src1", [(set SReg_64:$dst, (int_SI_else_break SReg_64:$src0, SReg_64:$src1))] >; def SI_END_CF : InstSI < (outs), (ins SReg_64:$saved), - "SI_END_CF", + "SI_END_CF $saved", [(int_SI_end_cf SReg_64:$saved)] >; +def SI_KILL : InstSI < + (outs), + (ins VReg_32:$src), + "SI_KIL $src", + [(int_AMDGPU_kill VReg_32:$src)] +>; + } // end mayLoad = 1, mayStore = 
1, hasSideEffects = 1 // Uses = [EXEC], Defs = [EXEC] } // end IsCodeGenOnly, isPseudo +def : Pat< + (int_AMDGPU_cndlt VReg_32:$src0, VReg_32:$src1, VReg_32:$src2), + (V_CNDMASK_B32_e64 VReg_32:$src2, VReg_32:$src1, (V_CMP_GT_F32_e64 0, VReg_32:$src0)) +>; + +def : Pat < + (int_AMDGPU_kilp), + (SI_KILL (V_MOV_B32_e32 0xbf800000)) +>; + /* int_SI_vs_load_input */ def : Pat< (int_SI_vs_load_input SReg_128:$tlst, IMM12bit:$attr_offset, VReg_32:$buf_idx_vgpr), (BUFFER_LOAD_FORMAT_XYZW imm:$attr_offset, 0, 1, 0, 0, 0, VReg_32:$buf_idx_vgpr, SReg_128:$tlst, - 0, 0, (i32 SREG_LIT_0)) + 0, 0, 0) >; /* int_SI_export */ @@ -1179,43 +1176,101 @@ def : Pat < VReg_32:$src0, VReg_32:$src1, VReg_32:$src2, VReg_32:$src3) >; -/* int_SI_sample */ -def : Pat < - (int_SI_sample imm:$writemask, VReg_128:$coord, SReg_256:$rsrc, SReg_128:$sampler), - (IMAGE_SAMPLE imm:$writemask, 0, 0, 0, 0, 0, 0, 0, VReg_128:$coord, - SReg_256:$rsrc, SReg_128:$sampler) ->; -/* int_SI_sample_lod */ +/* int_SI_sample for simple 1D texture lookup */ def : Pat < - (int_SI_sample_lod imm:$writemask, VReg_128:$coord, SReg_256:$rsrc, SReg_128:$sampler), - (IMAGE_SAMPLE_L imm:$writemask, 0, 0, 0, 0, 0, 0, 0, VReg_128:$coord, - SReg_256:$rsrc, SReg_128:$sampler) + (int_SI_sample imm:$writemask, (v1i32 VReg_32:$addr), + SReg_256:$rsrc, SReg_128:$sampler, imm), + (IMAGE_SAMPLE imm:$writemask, 0, 0, 0, 0, 0, 0, 0, + (i32 (COPY_TO_REGCLASS VReg_32:$addr, VReg_32)), + SReg_256:$rsrc, SReg_128:$sampler) >; -/* int_SI_sample_bias */ -def : Pat < - (int_SI_sample_bias imm:$writemask, VReg_128:$coord, SReg_256:$rsrc, SReg_128:$sampler), - (IMAGE_SAMPLE_B imm:$writemask, 0, 0, 0, 0, 0, 0, 0, VReg_128:$coord, - SReg_256:$rsrc, SReg_128:$sampler) ->; +class SamplePattern<Intrinsic name, MIMG opcode, RegisterClass addr_class, + ValueType addr_type> : Pat < + (name imm:$writemask, (addr_type addr_class:$addr), + SReg_256:$rsrc, SReg_128:$sampler, imm), + (opcode imm:$writemask, 0, 0, 0, 0, 0, 0, 0, + (EXTRACT_SUBREG addr_class:$addr, sub0), + SReg_256:$rsrc, SReg_128:$sampler) +>; + +class SampleRectPattern<Intrinsic name, MIMG opcode, RegisterClass addr_class, + ValueType addr_type> : Pat < + (name imm:$writemask, (addr_type addr_class:$addr), + SReg_256:$rsrc, SReg_128:$sampler, TEX_RECT), + (opcode imm:$writemask, 1, 0, 0, 0, 0, 0, 0, + (EXTRACT_SUBREG addr_class:$addr, sub0), + SReg_256:$rsrc, SReg_128:$sampler) +>; + +class SampleArrayPattern<Intrinsic name, MIMG opcode, RegisterClass addr_class, + ValueType addr_type> : Pat < + (name imm:$writemask, (addr_type addr_class:$addr), + SReg_256:$rsrc, SReg_128:$sampler, TEX_ARRAY), + (opcode imm:$writemask, 0, 0, 1, 0, 0, 0, 0, + (EXTRACT_SUBREG addr_class:$addr, sub0), + SReg_256:$rsrc, SReg_128:$sampler) +>; + +class SampleShadowPattern<Intrinsic name, MIMG opcode, + RegisterClass addr_class, ValueType addr_type> : Pat < + (name imm:$writemask, (addr_type addr_class:$addr), + SReg_256:$rsrc, SReg_128:$sampler, TEX_SHADOW), + (opcode imm:$writemask, 0, 0, 0, 0, 0, 0, 0, + (EXTRACT_SUBREG addr_class:$addr, sub0), + SReg_256:$rsrc, SReg_128:$sampler) +>; + +class SampleShadowArrayPattern<Intrinsic name, MIMG opcode, + RegisterClass addr_class, ValueType addr_type> : Pat < + (name imm:$writemask, (addr_type addr_class:$addr), + SReg_256:$rsrc, SReg_128:$sampler, TEX_SHADOW_ARRAY), + (opcode imm:$writemask, 0, 0, 1, 0, 0, 0, 0, + (EXTRACT_SUBREG addr_class:$addr, sub0), + SReg_256:$rsrc, SReg_128:$sampler) +>; + +/* int_SI_sample* for texture lookups consuming more address parameters */ +multiclass 
SamplePatterns<RegisterClass addr_class, ValueType addr_type> { + def : SamplePattern <int_SI_sample, IMAGE_SAMPLE, addr_class, addr_type>; + def : SampleRectPattern <int_SI_sample, IMAGE_SAMPLE, addr_class, addr_type>; + def : SampleArrayPattern <int_SI_sample, IMAGE_SAMPLE, addr_class, addr_type>; + def : SampleShadowPattern <int_SI_sample, IMAGE_SAMPLE_C, addr_class, addr_type>; + def : SampleShadowArrayPattern <int_SI_sample, IMAGE_SAMPLE_C, addr_class, addr_type>; + + def : SamplePattern <int_SI_samplel, IMAGE_SAMPLE_L, addr_class, addr_type>; + def : SampleArrayPattern <int_SI_samplel, IMAGE_SAMPLE_L, addr_class, addr_type>; + def : SampleShadowPattern <int_SI_samplel, IMAGE_SAMPLE_C_L, addr_class, addr_type>; + def : SampleShadowArrayPattern <int_SI_samplel, IMAGE_SAMPLE_C_L, addr_class, addr_type>; + + def : SamplePattern <int_SI_sampleb, IMAGE_SAMPLE_B, addr_class, addr_type>; + def : SampleArrayPattern <int_SI_sampleb, IMAGE_SAMPLE_B, addr_class, addr_type>; + def : SampleShadowPattern <int_SI_sampleb, IMAGE_SAMPLE_C_B, addr_class, addr_type>; + def : SampleShadowArrayPattern <int_SI_sampleb, IMAGE_SAMPLE_C_B, addr_class, addr_type>; +} -def CLAMP_SI : CLAMP<VReg_32>; -def FABS_SI : FABS<VReg_32>; -def FNEG_SI : FNEG<VReg_32>; +defm : SamplePatterns<VReg_64, v2i32>; +defm : SamplePatterns<VReg_128, v4i32>; +defm : SamplePatterns<VReg_256, v8i32>; +defm : SamplePatterns<VReg_512, v16i32>; -def : Extract_Element <f32, v4f32, VReg_128, 0, sel_x>; -def : Extract_Element <f32, v4f32, VReg_128, 1, sel_y>; -def : Extract_Element <f32, v4f32, VReg_128, 2, sel_z>; -def : Extract_Element <f32, v4f32, VReg_128, 3, sel_w>; +def : Extract_Element <f32, v4f32, VReg_128, 0, sub0>; +def : Extract_Element <f32, v4f32, VReg_128, 1, sub1>; +def : Extract_Element <f32, v4f32, VReg_128, 2, sub2>; +def : Extract_Element <f32, v4f32, VReg_128, 3, sub3>; -def : Insert_Element <f32, v4f32, VReg_32, VReg_128, 4, sel_x>; -def : Insert_Element <f32, v4f32, VReg_32, VReg_128, 5, sel_y>; -def : Insert_Element <f32, v4f32, VReg_32, VReg_128, 6, sel_z>; -def : Insert_Element <f32, v4f32, VReg_32, VReg_128, 7, sel_w>; +def : Insert_Element <f32, v4f32, VReg_32, VReg_128, 4, sub0>; +def : Insert_Element <f32, v4f32, VReg_32, VReg_128, 5, sub1>; +def : Insert_Element <f32, v4f32, VReg_32, VReg_128, 6, sub2>; +def : Insert_Element <f32, v4f32, VReg_32, VReg_128, 7, sub3>; +def : Vector1_Build <v1i32, VReg_32, i32, VReg_32>; +def : Vector2_Build <v2i32, VReg_64, i32, VReg_32>; def : Vector_Build <v4f32, VReg_128, f32, VReg_32>; -def : Vector_Build <v4i32, SReg_128, i32, SReg_32>; +def : Vector_Build <v4i32, VReg_128, i32, VReg_32>; +def : Vector8_Build <v8i32, VReg_256, i32, VReg_32>; +def : Vector16_Build <v16i32, VReg_512, i32, VReg_32>; def : BitConvert <i32, f32, SReg_32>; def : BitConvert <i32, f32, VReg_32>; @@ -1223,24 +1278,68 @@ def : BitConvert <i32, f32, VReg_32>; def : BitConvert <f32, i32, SReg_32>; def : BitConvert <f32, i32, VReg_32>; +/********** =================== **********/ +/********** Src & Dst modifiers **********/ +/********** =================== **********/ + +def : Pat < + (int_AMDIL_clamp VReg_32:$src, (f32 FP_ZERO), (f32 FP_ONE)), + (V_ADD_F32_e64 VReg_32:$src, (i32 0 /* SRC1 */), + 0 /* ABS */, 1 /* CLAMP */, 0 /* OMOD */, 0 /* NEG */) +>; + +def : Pat < + (fabs VReg_32:$src), + (V_ADD_F32_e64 VReg_32:$src, (i32 0 /* SRC1 */), + 1 /* ABS */, 0 /* CLAMP */, 0 /* OMOD */, 0 /* NEG */) +>; + +def : Pat < + (fneg VReg_32:$src), + (V_ADD_F32_e64 VReg_32:$src, (i32 0 /* SRC1 */), + 0 /* ABS 
*/, 0 /* CLAMP */, 0 /* OMOD */, 1 /* NEG */) +>; + +/********** ================== **********/ +/********** Immediate Patterns **********/ +/********** ================== **********/ + +def : Pat < + (i1 imm:$imm), + (S_MOV_B64 imm:$imm) +>; + def : Pat < - (i64 (SIsreg1_bitcast SReg_1:$vcc)), - (S_MOV_B64 (COPY_TO_REGCLASS SReg_1:$vcc, SReg_64)) + (i32 imm:$imm), + (V_MOV_B32_e32 imm:$imm) >; def : Pat < - (i1 (SIsreg1_bitcast SReg_64:$vcc)), - (COPY_TO_REGCLASS SReg_64:$vcc, SReg_1) + (f32 fpimm:$imm), + (V_MOV_B32_e32 fpimm:$imm) >; def : Pat < - (i64 (SIvcc_bitcast VCCReg:$vcc)), - (S_MOV_B64 (COPY_TO_REGCLASS VCCReg:$vcc, SReg_64)) + (i32 imm:$imm), + (S_MOV_B32 imm:$imm) >; def : Pat < - (i1 (SIvcc_bitcast SReg_64:$vcc)), - (COPY_TO_REGCLASS SReg_64:$vcc, VCCReg) + (f32 fpimm:$imm), + (S_MOV_B32 fpimm:$imm) +>; + +def : Pat < + (i64 InlineImm<i64>:$imm), + (S_MOV_B64 InlineImm<i64>:$imm) +>; + +// i64 immediates aren't supported in hardware, split it into two 32bit values +def : Pat < + (i64 imm:$imm), + (INSERT_SUBREG (INSERT_SUBREG (i64 (IMPLICIT_DEF)), + (S_MOV_B32 (i32 (LO32 imm:$imm))), sub0), + (S_MOV_B32 (i32 (HI32 imm:$imm))), sub1) >; /********** ===================== **********/ @@ -1248,6 +1347,12 @@ def : Pat < /********** ===================== **********/ def : Pat < + (int_SI_fs_interp_constant imm:$attr_chan, imm:$attr, SReg_32:$params), + (V_INTERP_MOV_F32 INTERP.P0, imm:$attr_chan, imm:$attr, + (S_MOV_B32 SReg_32:$params)) +>; + +def : Pat < (int_SI_fs_interp_linear_center imm:$attr_chan, imm:$attr, SReg_32:$params), (SI_INTERP (f32 LINEAR_CENTER_I), (f32 LINEAR_CENTER_J), imm:$attr_chan, imm:$attr, SReg_32:$params) @@ -1305,47 +1410,86 @@ def : Pat < def : POW_Common <V_LOG_F32_e32, V_EXP_F32_e32, V_MUL_F32_e32, VReg_32>; def : Pat < - (int_AMDGPU_div AllReg_32:$src0, AllReg_32:$src1), - (V_MUL_LEGACY_F32_e32 AllReg_32:$src0, (V_RCP_LEGACY_F32_e32 AllReg_32:$src1)) + (int_AMDGPU_div VSrc_32:$src0, VSrc_32:$src1), + (V_MUL_LEGACY_F32_e32 VSrc_32:$src0, (V_RCP_LEGACY_F32_e32 VSrc_32:$src1)) >; def : Pat< - (fdiv AllReg_32:$src0, AllReg_32:$src1), - (V_MUL_F32_e32 AllReg_32:$src0, (V_RCP_F32_e32 AllReg_32:$src1)) + (fdiv VSrc_32:$src0, VSrc_32:$src1), + (V_MUL_F32_e32 VSrc_32:$src0, (V_RCP_F32_e32 VSrc_32:$src1)) >; def : Pat < - (int_AMDGPU_kilp), - (SI_KIL (V_MOV_IMM_I32 0xbf800000)) + (fcos VSrc_32:$src0), + (V_COS_F32_e32 (V_MUL_F32_e32 VSrc_32:$src0, (V_MOV_B32_e32 CONST.TWO_PI_INV))) +>; + +def : Pat < + (fsin VSrc_32:$src0), + (V_SIN_F32_e32 (V_MUL_F32_e32 VSrc_32:$src0, (V_MOV_B32_e32 CONST.TWO_PI_INV))) >; def : Pat < (int_AMDGPU_cube VReg_128:$src), (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), - (V_CUBETC_F32 (EXTRACT_SUBREG VReg_128:$src, sel_x), - (EXTRACT_SUBREG VReg_128:$src, sel_y), - (EXTRACT_SUBREG VReg_128:$src, sel_z), - 0, 0, 0, 0), sel_x), - (V_CUBESC_F32 (EXTRACT_SUBREG VReg_128:$src, sel_x), - (EXTRACT_SUBREG VReg_128:$src, sel_y), - (EXTRACT_SUBREG VReg_128:$src, sel_z), - 0, 0, 0, 0), sel_y), - (V_CUBEMA_F32 (EXTRACT_SUBREG VReg_128:$src, sel_x), - (EXTRACT_SUBREG VReg_128:$src, sel_y), - (EXTRACT_SUBREG VReg_128:$src, sel_z), - 0, 0, 0, 0), sel_z), - (V_CUBEID_F32 (EXTRACT_SUBREG VReg_128:$src, sel_x), - (EXTRACT_SUBREG VReg_128:$src, sel_y), - (EXTRACT_SUBREG VReg_128:$src, sel_z), - 0, 0, 0, 0), sel_w) + (V_CUBETC_F32 (EXTRACT_SUBREG VReg_128:$src, sub0), + (EXTRACT_SUBREG VReg_128:$src, sub1), + (EXTRACT_SUBREG VReg_128:$src, sub2), + 0, 0, 0, 0), sub0), + (V_CUBESC_F32 (EXTRACT_SUBREG VReg_128:$src, 
sub0), + (EXTRACT_SUBREG VReg_128:$src, sub1), + (EXTRACT_SUBREG VReg_128:$src, sub2), + 0, 0, 0, 0), sub1), + (V_CUBEMA_F32 (EXTRACT_SUBREG VReg_128:$src, sub0), + (EXTRACT_SUBREG VReg_128:$src, sub1), + (EXTRACT_SUBREG VReg_128:$src, sub2), + 0, 0, 0, 0), sub2), + (V_CUBEID_F32 (EXTRACT_SUBREG VReg_128:$src, sub0), + (EXTRACT_SUBREG VReg_128:$src, sub1), + (EXTRACT_SUBREG VReg_128:$src, sub2), + 0, 0, 0, 0), sub3) +>; + +def : Pat < + (i32 (sext (i1 SReg_64:$src0))), + (V_CNDMASK_B32_e64 (i32 0), (i32 -1), SReg_64:$src0) >; /********** ================== **********/ /********** VOP3 Patterns **********/ /********** ================== **********/ -def : Pat <(f32 (IL_mad AllReg_32:$src0, VReg_32:$src1, VReg_32:$src2)), - (V_MAD_LEGACY_F32 AllReg_32:$src0, VReg_32:$src1, VReg_32:$src2, +def : Pat <(f32 (fadd (fmul VSrc_32:$src0, VSrc_32:$src1), VSrc_32:$src2)), + (V_MAD_F32 VSrc_32:$src0, VSrc_32:$src1, VSrc_32:$src2, 0, 0, 0, 0)>; +/********** ================== **********/ +/********** SMRD Patterns **********/ +/********** ================== **********/ + +multiclass SMRD_Pattern <SMRD Instr_IMM, SMRD Instr_SGPR, ValueType vt> { + // 1. Offset as 8bit DWORD immediate + def : Pat < + (constant_load (SIadd64bit32bit SReg_64:$sbase, IMM8bitDWORD:$offset)), + (vt (Instr_IMM SReg_64:$sbase, IMM8bitDWORD:$offset)) + >; + + // 2. Offset loaded in an 32bit SGPR + def : Pat < + (constant_load (SIadd64bit32bit SReg_64:$sbase, imm:$offset)), + (vt (Instr_SGPR SReg_64:$sbase, (S_MOV_B32 imm:$offset))) + >; + + // 3. No offset at all + def : Pat < + (constant_load SReg_64:$sbase), + (vt (Instr_IMM SReg_64:$sbase, 0)) + >; +} + +defm : SMRD_Pattern <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, f32>; +defm : SMRD_Pattern <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, i32>; +defm : SMRD_Pattern <S_LOAD_DWORDX4_IMM, S_LOAD_DWORDX4_SGPR, v4i32>; +defm : SMRD_Pattern <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v8i32>; + } // End isSI predicate diff --git a/lib/Target/R600/SIIntrinsics.td b/lib/Target/R600/SIIntrinsics.td index c322fef..611b9c4 100644 --- a/lib/Target/R600/SIIntrinsics.td +++ b/lib/Target/R600/SIIntrinsics.td @@ -22,9 +22,11 @@ let TargetPrefix = "SI", isTarget = 1 in { def int_SI_vs_load_input : Intrinsic <[llvm_v4f32_ty], [llvm_v4i32_ty, llvm_i16_ty, llvm_i32_ty], [IntrReadMem]> ; def int_SI_wqm : Intrinsic <[], [], []>; - def int_SI_sample : Intrinsic <[llvm_v4f32_ty], [llvm_i32_ty, llvm_v4f32_ty, llvm_v8i32_ty, llvm_v4i32_ty], [IntrReadMem]>; - def int_SI_sample_bias : Intrinsic <[llvm_v4f32_ty], [llvm_i32_ty, llvm_v4f32_ty, llvm_v8i32_ty, llvm_v4i32_ty], [IntrReadMem]>; - def int_SI_sample_lod : Intrinsic <[llvm_v4f32_ty], [llvm_i32_ty, llvm_v4f32_ty, llvm_v8i32_ty, llvm_v4i32_ty], [IntrReadMem]>; + class Sample : Intrinsic <[llvm_v4f32_ty], [llvm_i32_ty, llvm_anyvector_ty, llvm_v8i32_ty, llvm_v4i32_ty, llvm_i32_ty], [IntrReadMem]>; + + def int_SI_sample : Sample; + def int_SI_sampleb : Sample; + def int_SI_samplel : Sample; /* Interpolation Intrinsics */ diff --git a/lib/Target/R600/SILowerControlFlow.cpp b/lib/Target/R600/SILowerControlFlow.cpp index 3fbe653..b215aa2 100644 --- a/lib/Target/R600/SILowerControlFlow.cpp +++ b/lib/Target/R600/SILowerControlFlow.cpp @@ -68,7 +68,10 @@ private: static char ID; const TargetInstrInfo *TII; - void Skip(MachineInstr &MI, MachineOperand &To); + bool shouldSkip(MachineBasicBlock *From, MachineBasicBlock *To); + + void Skip(MachineInstr &From, MachineOperand &To); + void SkipIfDead(MachineInstr &MI); void If(MachineInstr &MI); void Else(MachineInstr &MI); @@ 
-78,6 +81,7 @@ private:
   void Loop(MachineInstr &MI);
   void EndCf(MachineInstr &MI);
 
+  void Kill(MachineInstr &MI);
   void Branch(MachineInstr &MI);
 
 public:
@@ -100,22 +104,29 @@ FunctionPass *llvm::createSILowerControlFlowPass(TargetMachine &tm) {
   return new SILowerControlFlowPass(tm);
 }
 
-void SILowerControlFlowPass::Skip(MachineInstr &From, MachineOperand &To) {
+bool SILowerControlFlowPass::shouldSkip(MachineBasicBlock *From,
+                                        MachineBasicBlock *To) {
+
   unsigned NumInstr = 0;
 
-  for (MachineBasicBlock *MBB = *From.getParent()->succ_begin();
-       NumInstr < SkipThreshold && MBB != To.getMBB() && !MBB->succ_empty();
+  for (MachineBasicBlock *MBB = From; MBB != To && !MBB->succ_empty();
        MBB = *MBB->succ_begin()) {
 
     for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
          NumInstr < SkipThreshold && I != E; ++I) {
 
       if (I->isBundle() || !I->isBundled())
-        ++NumInstr;
+        if (++NumInstr >= SkipThreshold)
+          return true;
     }
   }
 
-  if (NumInstr < SkipThreshold)
+  return false;
+}
+
+void SILowerControlFlowPass::Skip(MachineInstr &From, MachineOperand &To) {
+
+  if (!shouldSkip(*From.getParent()->succ_begin(), To.getMBB()))
     return;
 
   DebugLoc DL = From.getDebugLoc();
@@ -124,6 +135,38 @@ void SILowerControlFlowPass::Skip(MachineInstr &From, MachineOperand &To) {
     .addReg(AMDGPU::EXEC);
 }
 
+void SILowerControlFlowPass::SkipIfDead(MachineInstr &MI) {
+
+  MachineBasicBlock &MBB = *MI.getParent();
+  DebugLoc DL = MI.getDebugLoc();
+
+  if (!shouldSkip(&MBB, &MBB.getParent()->back()))
+    return;
+
+  MachineBasicBlock::iterator Insert = &MI;
+  ++Insert;
+
+  // If the exec mask is non-zero, skip the next two instructions
+  BuildMI(MBB, Insert, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
+    .addImm(3)
+    .addReg(AMDGPU::EXEC);
+
+  // Exec mask is zero: Export to NULL target...
+  BuildMI(MBB, Insert, DL, TII->get(AMDGPU::EXP))
+    .addImm(0)
+    .addImm(0x09) // V_008DFC_SQ_EXP_NULL
+    .addImm(0)
+    .addImm(1)
+    .addImm(1)
+    .addReg(AMDGPU::VGPR0)
+    .addReg(AMDGPU::VGPR0)
+    .addReg(AMDGPU::VGPR0)
+    .addReg(AMDGPU::VGPR0);
+
+  // ... and terminate wavefront
+  BuildMI(MBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM));
+}
+
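The hunk above factors the instruction counting out of Skip() into shouldSkip(), so the new SkipIfDead() can apply the same cost test before it emits the S_CBRANCH_EXECNZ guard, the export to the NULL target, and S_ENDPGM. A compact standalone C++ sketch of that walk, with the per-instruction loop collapsed into per-block counts (the Block type and the numbers are invented for illustration):

    #include <cassert>

    // Toy model: each block contributes some instruction count (bundle
    // headers and unbundled instructions) and has one fall-through successor.
    struct Block {
      int NumRealInstrs;
      const Block *Succ; // nullptr at the end of the function
    };

    // Follow the successor chain from From toward To; report whether at
    // least Threshold instructions would run, i.e. whether a skip branch
    // over the region pays for itself.
    bool shouldSkip(const Block *From, const Block *To, int Threshold) {
      int NumInstr = 0;
      for (const Block *B = From; B && B != To; B = B->Succ)
        if ((NumInstr += B->NumRealInstrs) >= Threshold)
          return true;
      return false;
    }

    int main() {
      Block c = {4, nullptr};
      Block b = {5, &c};
      Block a = {3, &b};
      assert(shouldSkip(&a, &c, 8));  // 3 + 5 reaches the threshold
      assert(!shouldSkip(&a, &c, 9)); // 8 < 9: fall through instead
      return 0;
    }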
 void SILowerControlFlowPass::If(MachineInstr &MI) {
   MachineBasicBlock &MBB = *MI.getParent();
   DebugLoc DL = MI.getDebugLoc();
@@ -242,8 +285,27 @@ void SILowerControlFlowPass::Branch(MachineInstr &MI) {
   assert(0);
 }
 
+void SILowerControlFlowPass::Kill(MachineInstr &MI) {
+
+  MachineBasicBlock &MBB = *MI.getParent();
+  DebugLoc DL = MI.getDebugLoc();
+
+  // Kill is only allowed in pixel shaders
+  assert(MBB.getParent()->getInfo<SIMachineFunctionInfo>()->ShaderType ==
+         ShaderType::PIXEL);
+
+  // Clear this pixel from the exec mask if the operand is negative
+  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32), AMDGPU::VCC)
+    .addImm(0)
+    .addOperand(MI.getOperand(0));
+
+  MI.eraseFromParent();
+}
+
 bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
-  bool HaveCf = false;
+
+  bool HaveKill = false;
+  unsigned Depth = 0;
 
   for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
        BI != BE; ++BI) {
@@ -257,6 +319,7 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
       switch (MI.getOpcode()) {
         default: break;
         case AMDGPU::SI_IF:
+          ++Depth;
           If(MI);
           break;
@@ -277,14 +340,26 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
           break;
 
         case AMDGPU::SI_LOOP:
+          ++Depth;
           Loop(MI);
           break;
 
         case AMDGPU::SI_END_CF:
-          HaveCf = true;
+          if (--Depth == 0 && HaveKill) {
+            SkipIfDead(MI);
+            HaveKill = false;
+          }
           EndCf(MI);
           break;
 
+        case AMDGPU::SI_KILL:
+          if (Depth == 0)
+            SkipIfDead(MI);
+          else
+            HaveKill = true;
+          Kill(MI);
+          break;
+
         case AMDGPU::S_BRANCH:
           Branch(MI);
           break;
@@ -292,40 +367,5 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
       }
     }
 
-  // TODO: What is this good for?
-  unsigned ShaderType = MF.getInfo<SIMachineFunctionInfo>()->ShaderType;
-  if (HaveCf && ShaderType == ShaderType::PIXEL) {
-    for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
-         BI != BE; ++BI) {
-
-      MachineBasicBlock &MBB = *BI;
-      if (MBB.succ_empty()) {
-
-        MachineInstr &MI = *MBB.getFirstNonPHI();
-        DebugLoc DL = MI.getDebugLoc();
-
-        // If the exec mask is non-zero, skip the next two instructions
-        BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
-          .addImm(3)
-          .addReg(AMDGPU::EXEC);
-
-        // Exec mask is zero: Export to NULL target...
-        BuildMI(MBB, &MI, DL, TII->get(AMDGPU::EXP))
-          .addImm(0)
-          .addImm(0x09) // V_008DFC_SQ_EXP_NULL
-          .addImm(0)
-          .addImm(1)
-          .addImm(1)
-          .addReg(AMDGPU::SREG_LIT_0)
-          .addReg(AMDGPU::SREG_LIT_0)
-          .addReg(AMDGPU::SREG_LIT_0)
-          .addReg(AMDGPU::SREG_LIT_0);
-
-        // ... and terminate wavefront
-        BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ENDPGM));
-      }
-    }
-  }
-
   return true;
 }
diff --git a/lib/Target/R600/SILowerLiteralConstants.cpp b/lib/Target/R600/SILowerLiteralConstants.cpp
deleted file mode 100644
index c0411e9..0000000
--- a/lib/Target/R600/SILowerLiteralConstants.cpp
+++ /dev/null
@@ -1,108 +0,0 @@
-//===-- SILowerLiteralConstants.cpp - Lower intrs using literal constants--===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief This pass performs the following transformation on instructions with -/// literal constants: -/// -/// %VGPR0 = V_MOV_IMM_I32 1 -/// -/// becomes: -/// -/// BUNDLE -/// * %VGPR = V_MOV_B32_32 SI_LITERAL_CONSTANT -/// * SI_LOAD_LITERAL 1 -/// -/// The resulting sequence matches exactly how the hardware handles immediate -/// operands, so this transformation greatly simplifies the code generator. -/// -/// Only the *_MOV_IMM_* support immediate operands at the moment, but when -/// support for immediate operands is added to other instructions, they -/// will be lowered here as well. -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineInstrBundle.h" - -using namespace llvm; - -namespace { - -class SILowerLiteralConstantsPass : public MachineFunctionPass { - -private: - static char ID; - const TargetInstrInfo *TII; - -public: - SILowerLiteralConstantsPass(TargetMachine &tm) : - MachineFunctionPass(ID), TII(tm.getInstrInfo()) { } - - virtual bool runOnMachineFunction(MachineFunction &MF); - - const char *getPassName() const { - return "SI Lower literal constants pass"; - } -}; - -} // End anonymous namespace - -char SILowerLiteralConstantsPass::ID = 0; - -FunctionPass *llvm::createSILowerLiteralConstantsPass(TargetMachine &tm) { - return new SILowerLiteralConstantsPass(tm); -} - -bool SILowerLiteralConstantsPass::runOnMachineFunction(MachineFunction &MF) { - for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); - BB != BB_E; ++BB) { - MachineBasicBlock &MBB = *BB; - for (MachineBasicBlock::iterator I = MBB.begin(), Next = llvm::next(I); - I != MBB.end(); I = Next) { - Next = llvm::next(I); - MachineInstr &MI = *I; - switch (MI.getOpcode()) { - default: break; - case AMDGPU::S_MOV_IMM_I32: - case AMDGPU::S_MOV_IMM_I64: - case AMDGPU::V_MOV_IMM_F32: - case AMDGPU::V_MOV_IMM_I32: { - unsigned MovOpcode; - unsigned LoadLiteralOpcode; - MachineOperand LiteralOp = MI.getOperand(1); - if (AMDGPU::VReg_32RegClass.contains(MI.getOperand(0).getReg())) { - MovOpcode = AMDGPU::V_MOV_B32_e32; - } else { - MovOpcode = AMDGPU::S_MOV_B32; - } - if (LiteralOp.isImm()) { - LoadLiteralOpcode = AMDGPU::SI_LOAD_LITERAL_I32; - } else { - LoadLiteralOpcode = AMDGPU::SI_LOAD_LITERAL_F32; - } - MIBundleBuilder Bundle(MBB, I); - Bundle - .append(BuildMI(MF, MBB.findDebugLoc(I), TII->get(MovOpcode), - MI.getOperand(0).getReg()) - .addReg(AMDGPU::SI_LITERAL_CONSTANT)) - .append(BuildMI(MF, MBB.findDebugLoc(I), - TII->get(LoadLiteralOpcode)) - .addOperand(MI.getOperand(1))); - llvm::finalizeBundle(MBB, Bundle.begin()); - MI.eraseFromParent(); - break; - } - } - } - } - return false; -} diff --git a/lib/Target/R600/SIRegisterInfo.td b/lib/Target/R600/SIRegisterInfo.td index c3f1361..9e04e24 100644 --- a/lib/Target/R600/SIRegisterInfo.td +++ b/lib/Target/R600/SIRegisterInfo.td @@ -1,44 +1,40 @@ - -let Namespace = "AMDGPU" in { - def low : SubRegIndex; - def high : SubRegIndex; - - def sub0 : SubRegIndex; - def sub1 : SubRegIndex; - def sub2 : SubRegIndex; - def sub3 : SubRegIndex; - def sub4 : SubRegIndex; - def sub5 : SubRegIndex; - def sub6 : SubRegIndex; - def sub7 : SubRegIndex; -} +//===-- SIRegisterInfo.td - SI Register defs ---------------*- tablegen -*-===// +// +// The LLVM 
Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Declarations that describe the SI registers +//===----------------------------------------------------------------------===// class SIReg <string n, bits<16> encoding = 0> : Register<n> { let Namespace = "AMDGPU"; let HWEncoding = encoding; } -class SI_64 <string n, list<Register> subregs, bits<16> encoding> : RegisterWithSubRegs<n, subregs> { - let Namespace = "AMDGPU"; - let SubRegIndices = [low, high]; - let HWEncoding = encoding; -} - -class SGPR_32 <bits<16> num, string name> : SIReg<name, num>; - -class VGPR_32 <bits<16> num, string name> : SIReg<name, num>; - // Special Registers def VCC : SIReg<"VCC", 106>; -def EXEC_LO : SIReg <"EXEC LO", 126>; -def EXEC_HI : SIReg <"EXEC HI", 127>; -def EXEC : SI_64<"EXEC", [EXEC_LO, EXEC_HI], 126>; +def EXEC : SIReg<"EXEC", 126>; def SCC : SIReg<"SCC", 253>; -def SREG_LIT_0 : SIReg <"S LIT 0", 128>; -def SI_LITERAL_CONSTANT : SIReg<"LITERAL CONSTANT", 255>; def M0 : SIReg <"M0", 124>; -//Interpolation registers +// SGPR registers +foreach Index = 0-101 in { + def SGPR#Index : SIReg <"SGPR"#Index, Index>; +} + +// VGPR registers +foreach Index = 0-255 in { + def VGPR#Index : SIReg <"VGPR"#Index, Index> { + let HWEncoding{8} = 1; + } +} + +// virtual Interpolation registers def PERSP_SAMPLE_I : SIReg <"PERSP_SAMPLE_I">; def PERSP_SAMPLE_J : SIReg <"PERSP_SAMPLE_J">; def PERSP_CENTER_I : SIReg <"PERSP_CENTER_I">; @@ -64,73 +60,150 @@ def ANCILLARY : SIReg <"ANCILLARY">; def SAMPLE_COVERAGE : SIReg <"SAMPLE_COVERAGE">; def POS_FIXED_PT : SIReg <"POS_FIXED_PT">; -// SGPR 32-bit registers -foreach Index = 0-101 in { - def SGPR#Index : SGPR_32 <Index, "SGPR"#Index>; -} +//===----------------------------------------------------------------------===// +// Groupings using register classes and tuples +//===----------------------------------------------------------------------===// +// SGPR 32-bit registers def SGPR_32 : RegisterClass<"AMDGPU", [f32, i32], 32, (add (sequence "SGPR%u", 0, 101))>; // SGPR 64-bit registers -def SGPR_64 : RegisterTuples<[low, high], - [(add (decimate SGPR_32, 2)), - (add(decimate (rotl SGPR_32, 1), 2))]>; +def SGPR_64 : RegisterTuples<[sub0, sub1], + [(add (decimate (trunc SGPR_32, 101), 2)), + (add (decimate (shl SGPR_32, 1), 2))]>; // SGPR 128-bit registers -def SGPR_128 : RegisterTuples<[sel_x, sel_y, sel_z, sel_w], - [(add (decimate SGPR_32, 4)), - (add (decimate (rotl SGPR_32, 1), 4)), - (add (decimate (rotl SGPR_32, 2), 4)), - (add (decimate (rotl SGPR_32, 3), 4))]>; +def SGPR_128 : RegisterTuples<[sub0, sub1, sub2, sub3], + [(add (decimate (trunc SGPR_32, 99), 4)), + (add (decimate (shl SGPR_32, 1), 4)), + (add (decimate (shl SGPR_32, 2), 4)), + (add (decimate (shl SGPR_32, 3), 4))]>; // SGPR 256-bit registers def SGPR_256 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7], - [(add (decimate SGPR_32, 8)), - (add (decimate (rotl SGPR_32, 1), 8)), - (add (decimate (rotl SGPR_32, 2), 8)), - (add (decimate (rotl SGPR_32, 3), 8)), - (add (decimate (rotl SGPR_32, 4), 8)), - (add (decimate (rotl SGPR_32, 5), 8)), - (add (decimate (rotl SGPR_32, 6), 8)), - (add (decimate (rotl SGPR_32, 7), 8))]>; + [(add (decimate (trunc SGPR_32, 95), 4)), + (add (decimate (shl SGPR_32, 1), 4)), + (add (decimate (shl 
SGPR_32, 2), 4)),
+                               (add (decimate (shl SGPR_32, 3), 4)),
+                               (add (decimate (shl SGPR_32, 4), 4)),
+                               (add (decimate (shl SGPR_32, 5), 4)),
+                               (add (decimate (shl SGPR_32, 6), 4)),
+                               (add (decimate (shl SGPR_32, 7), 4))]>;
+
+// SGPR 512-bit registers
+def SGPR_512 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7,
+                               sub8, sub9, sub10, sub11, sub12, sub13, sub14, sub15],
+                              [(add (decimate (trunc SGPR_32, 87), 4)),
+                               (add (decimate (shl SGPR_32, 1), 4)),
+                               (add (decimate (shl SGPR_32, 2), 4)),
+                               (add (decimate (shl SGPR_32, 3), 4)),
+                               (add (decimate (shl SGPR_32, 4), 4)),
+                               (add (decimate (shl SGPR_32, 5), 4)),
+                               (add (decimate (shl SGPR_32, 6), 4)),
+                               (add (decimate (shl SGPR_32, 7), 4)),
+                               (add (decimate (shl SGPR_32, 8), 4)),
+                               (add (decimate (shl SGPR_32, 9), 4)),
+                               (add (decimate (shl SGPR_32, 10), 4)),
+                               (add (decimate (shl SGPR_32, 11), 4)),
+                               (add (decimate (shl SGPR_32, 12), 4)),
+                               (add (decimate (shl SGPR_32, 13), 4)),
+                               (add (decimate (shl SGPR_32, 14), 4)),
+                               (add (decimate (shl SGPR_32, 15), 4))]>;
 
 // VGPR 32-bit registers
-foreach Index = 0-255 in {
-  def VGPR#Index : VGPR_32 <Index, "VGPR"#Index>;
-}
-
 def VGPR_32 : RegisterClass<"AMDGPU", [f32, i32], 32,
                             (add (sequence "VGPR%u", 0, 255))>;
 
 // VGPR 64-bit registers
-def VGPR_64 : RegisterTuples<[low, high],
-                             [(add VGPR_32),
-                              (add (rotl VGPR_32, 1))]>;
+def VGPR_64 : RegisterTuples<[sub0, sub1],
+                             [(add (trunc VGPR_32, 255)),
+                              (add (shl VGPR_32, 1))]>;
 
 // VGPR 128-bit registers
-def VGPR_128 : RegisterTuples<[sel_x, sel_y, sel_z, sel_w],
-                              [(add VGPR_32),
-                               (add (rotl VGPR_32, 1)),
-                               (add (rotl VGPR_32, 2)),
-                               (add (rotl VGPR_32, 3))]>;
+def VGPR_128 : RegisterTuples<[sub0, sub1, sub2, sub3],
+                              [(add (trunc VGPR_32, 253)),
+                               (add (shl VGPR_32, 1)),
+                               (add (shl VGPR_32, 2)),
+                               (add (shl VGPR_32, 3))]>;
+
+// VGPR 256-bit registers
+def VGPR_256 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7],
+                              [(add (trunc VGPR_32, 249)),
+                               (add (shl VGPR_32, 1)),
+                               (add (shl VGPR_32, 2)),
+                               (add (shl VGPR_32, 3)),
+                               (add (shl VGPR_32, 4)),
+                               (add (shl VGPR_32, 5)),
+                               (add (shl VGPR_32, 6)),
+                               (add (shl VGPR_32, 7))]>;
+
+// VGPR 512-bit registers
+def VGPR_512 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7,
+                               sub8, sub9, sub10, sub11, sub12, sub13, sub14, sub15],
+                              [(add (trunc VGPR_32, 241)),
+                               (add (shl VGPR_32, 1)),
+                               (add (shl VGPR_32, 2)),
+                               (add (shl VGPR_32, 3)),
+                               (add (shl VGPR_32, 4)),
+                               (add (shl VGPR_32, 5)),
+                               (add (shl VGPR_32, 6)),
+                               (add (shl VGPR_32, 7)),
+                               (add (shl VGPR_32, 8)),
+                               (add (shl VGPR_32, 9)),
+                               (add (shl VGPR_32, 10)),
+                               (add (shl VGPR_32, 11)),
+                               (add (shl VGPR_32, 12)),
+                               (add (shl VGPR_32, 13)),
+                               (add (shl VGPR_32, 14)),
+                               (add (shl VGPR_32, 15))]>;
+
+//===----------------------------------------------------------------------===//
+//  Register classes used as source and destination
+//===----------------------------------------------------------------------===//
+
+// Special register classes for predicates and the M0 register
+def SCCReg : RegisterClass<"AMDGPU", [i32, i1], 32, (add SCC)>;
+def VCCReg : RegisterClass<"AMDGPU", [i64, i1], 64, (add VCC)>;
+def EXECReg : RegisterClass<"AMDGPU", [i64, i1], 64, (add EXEC)>;
+def M0Reg : RegisterClass<"AMDGPU", [i32], 32, (add M0)>;
 
 // Register class for all scalar registers (SGPRs + Special Registers)
 def SReg_32 : RegisterClass<"AMDGPU", [f32, i32], 32,
-  (add SGPR_32, SREG_LIT_0, M0, EXEC_LO, EXEC_HI)
+  (add SGPR_32, M0Reg)
 >;
 
-def SReg_64 : RegisterClass<"AMDGPU", [i64], 64, (add SGPR_64, VCC, EXEC)>;
-
-def SReg_1 : RegisterClass<"AMDGPU", [i1], 1, (add VCC, SGPR_64, EXEC)>;
+def SReg_64 : RegisterClass<"AMDGPU", [i64, i1], 64,
+  (add SGPR_64, VCCReg, EXECReg)
+>;
 
 def SReg_128 : RegisterClass<"AMDGPU", [v4f32, v4i32], 128, (add SGPR_128)>;
 
 def SReg_256 : RegisterClass<"AMDGPU", [v8i32], 256, (add SGPR_256)>;
 
+def SReg_512 : RegisterClass<"AMDGPU", [v16i32], 512, (add SGPR_512)>;
+
 // Register class for all vector registers (VGPRs + Interpolation Registers)
-def VReg_32 : RegisterClass<"AMDGPU", [f32, i32], 32,
-  (add VGPR_32,
+def VReg_32 : RegisterClass<"AMDGPU", [f32, i32, v1i32], 32, (add VGPR_32)>;
+
+def VReg_64 : RegisterClass<"AMDGPU", [i64, v2i32], 64, (add VGPR_64)>;
+
+def VReg_128 : RegisterClass<"AMDGPU", [v4f32, v4i32], 128, (add VGPR_128)>;
+
+def VReg_256 : RegisterClass<"AMDGPU", [v8i32], 256, (add VGPR_256)>;
+
+def VReg_512 : RegisterClass<"AMDGPU", [v16i32], 512, (add VGPR_512)>;
+
+//===----------------------------------------------------------------------===//
+//  [SV]Src_* register classes, can have either an immediate or a register
+//===----------------------------------------------------------------------===//
+
+def SSrc_32 : RegisterClass<"AMDGPU", [i32, f32], 32, (add SReg_32)>;
+
+def SSrc_64 : RegisterClass<"AMDGPU", [i64, i1], 64, (add SReg_64)>;
+
+def VSrc_32 : RegisterClass<"AMDGPU", [i32, f32], 32,
+  (add VReg_32, SReg_32,
   PERSP_SAMPLE_I, PERSP_SAMPLE_J,
   PERSP_CENTER_I, PERSP_CENTER_J,
   PERSP_CENTROID_I, PERSP_CENTROID_J,
@@ -147,21 +220,8 @@ def VReg_32 : RegisterClass<"AMDGPU", [f32, i32], 32,
   ANCILLARY,
   SAMPLE_COVERAGE,
   POS_FIXED_PT
-  )
+  )
 >;
 
-def VReg_64 : RegisterClass<"AMDGPU", [i64], 64, (add VGPR_64)>;
-
-def VReg_128 : RegisterClass<"AMDGPU", [v4f32], 128, (add VGPR_128)>;
-
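Read the RegisterTuples definitions above as three combinators: trunc bounds how far the base sequence runs, decimate N keeps every Nth starting register, and shl K supplies the register K places further along as the Kth subregister. For SGPR_64 that pairing yields (SGPR0, SGPR1), (SGPR2, SGPR3), and so on. The throwaway C++ program below enumerates that expansion; the register count is hard-coded for illustration and the tuple names are just printed, not LLVM definitions:

    #include <cstdio>

    int main() {
      const int NumSGPRs = 102; // SGPR0..SGPR101 on SI
      // 'decimate ... 2' keeps every second register as sub0; 'shl ... 1'
      // pairs it with the next one as sub1; 'trunc' keeps the last tuple
      // from reading past the end of the bank.
      for (int Base = 0; Base + 1 < NumSGPRs; Base += 2)
        std::printf("SGPR%d_SGPR%d = [SGPR%d, SGPR%d]\n",
                    Base, Base + 1, Base, Base + 1);
      return 0;
    }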
-// AllReg_* - A set of all scalar and vector registers of a given width.
-def AllReg_32 : RegisterClass<"AMDGPU", [f32, i32], 32, (add VReg_32, SReg_32)>;
-
-def AllReg_64 : RegisterClass<"AMDGPU", [f64, i64], 64, (add SReg_64, VReg_64)>;
-
-// Special register classes for predicates and the M0 register
-def SCCReg : RegisterClass<"AMDGPU", [i1], 1, (add SCC)>;
-def VCCReg : RegisterClass<"AMDGPU", [i1], 1, (add VCC)>;
-def EXECReg : RegisterClass<"AMDGPU", [i1], 1, (add EXEC)>;
-def M0Reg : RegisterClass<"AMDGPU", [i32], 32, (add M0)>;
+def VSrc_64 : RegisterClass<"AMDGPU", [i64], 64, (add VReg_64, SReg_64)>;
diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp b/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp
index f5e10fc..3d4bfdc 100644
--- a/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp
+++ b/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp
@@ -21,8 +21,9 @@ void SparcELFMCAsmInfo::anchor() { }
 SparcELFMCAsmInfo::SparcELFMCAsmInfo(const Target &T, StringRef TT) {
   IsLittleEndian = false;
   Triple TheTriple(TT);
-  if (TheTriple.getArch() == Triple::sparcv9)
-    PointerSize = 8;
+  if (TheTriple.getArch() == Triple::sparcv9) {
+    PointerSize = CalleeSaveStackSlotSize = 8;
+  }
 
   Data16bitsDirective = "\t.half\t";
   Data32bitsDirective = "\t.word\t";
diff --git a/lib/Target/Sparc/SparcFrameLowering.cpp b/lib/Target/Sparc/SparcFrameLowering.cpp
index 6c47c70..a0dae6e 100644
--- a/lib/Target/Sparc/SparcFrameLowering.cpp
+++ b/lib/Target/Sparc/SparcFrameLowering.cpp
@@ -67,6 +67,22 @@ void SparcFrameLowering::emitPrologue(MachineFunction &MF) const {
   }
 }
 
+void SparcFrameLowering::
+eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator I) const {
+  MachineInstr &MI = *I;
+  DebugLoc dl = MI.getDebugLoc();
+  int Size = MI.getOperand(0).getImm();
+  if (MI.getOpcode() == SP::ADJCALLSTACKDOWN)
+    Size = -Size;
+  const SparcInstrInfo &TII =
+    *static_cast<const SparcInstrInfo*>(MF.getTarget().getInstrInfo());
+  if (Size)
+    BuildMI(MBB, I, dl, TII.get(SP::ADDri), SP::O6).addReg(SP::O6).addImm(Size);
+  MBB.erase(I);
+}
+
+
 void SparcFrameLowering::emitEpilogue(MachineFunction &MF,
                                       MachineBasicBlock &MBB) const {
   MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
diff --git a/lib/Target/Sparc/SparcFrameLowering.h b/lib/Target/Sparc/SparcFrameLowering.h
index 6b593c9..464233e 100644
--- a/lib/Target/Sparc/SparcFrameLowering.h
+++ b/lib/Target/Sparc/SparcFrameLowering.h
@@ -32,6 +32,10 @@ public:
   void emitPrologue(MachineFunction &MF) const;
   void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
 
+  void eliminateCallFramePseudoInstr(MachineFunction &MF,
+                                     MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator I) const;
+
   bool hasFP(const MachineFunction &MF) const { return false; }
 };
 
diff --git a/lib/Target/Sparc/SparcISelLowering.cpp b/lib/Target/Sparc/SparcISelLowering.cpp
index 168640f..138b92d 100644
--- a/lib/Target/Sparc/SparcISelLowering.cpp
+++ b/lib/Target/Sparc/SparcISelLowering.cpp
@@ -95,15 +95,10 @@ SparcTargetLowering::LowerReturn(SDValue Chain,
   // Analyze return values.
   CCInfo.AnalyzeReturn(Outs, RetCC_Sparc32);
 
-  // If this is the first return lowered for this function, add the regs to the
-  // liveout set for the function.
-  if (MF.getRegInfo().liveout_empty()) {
-    for (unsigned i = 0; i != RVLocs.size(); ++i)
-      if (RVLocs[i].isRegLoc())
-        MF.getRegInfo().addLiveOut(RVLocs[i].getLocReg());
-  }
-
   SDValue Flag;
+  SmallVector<SDValue, 4> RetOps(1, Chain);
+  // Make room for the return address offset.
+  RetOps.push_back(SDValue());
 
   // Copy the result values into the output registers.
   for (unsigned i = 0; i != RVLocs.size(); ++i) {
@@ -115,6 +110,7 @@ SparcTargetLowering::LowerReturn(SDValue Chain,
 
     // Guarantee that all emitted copies are stuck together with flags.
     Flag = Chain.getValue(1);
+    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
   }
 
   unsigned RetAddrOffset = 8; //Call Inst + Delay Slot
@@ -127,18 +123,19 @@ SparcTargetLowering::LowerReturn(SDValue Chain,
     SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy());
     Chain = DAG.getCopyToReg(Chain, dl, SP::I0, Val, Flag);
     Flag = Chain.getValue(1);
-    if (MF.getRegInfo().liveout_empty())
-      MF.getRegInfo().addLiveOut(SP::I0);
+    RetOps.push_back(DAG.getRegister(SP::I0, getPointerTy()));
     RetAddrOffset = 12; // CallInst + Delay Slot + Unimp
   }
 
-  SDValue RetAddrOffsetNode = DAG.getConstant(RetAddrOffset, MVT::i32);
+  RetOps[0] = Chain;  // Update chain.
+  RetOps[1] = DAG.getConstant(RetAddrOffset, MVT::i32);
+
+  // Add the flag if we have it.
   if (Flag.getNode())
-    return DAG.getNode(SPISD::RET_FLAG, dl, MVT::Other, Chain,
-                       RetAddrOffsetNode, Flag);
-  return DAG.getNode(SPISD::RET_FLAG, dl, MVT::Other, Chain,
-                     RetAddrOffsetNode);
+    RetOps.push_back(Flag);
+
+  return DAG.getNode(SPISD::RET_FLAG, dl, MVT::Other,
+                     &RetOps[0], RetOps.size());
 }
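The LowerReturn rewrite above replaces the old liveout bookkeeping with a variadic RET_FLAG node: the operand vector starts with the chain, reserves slot 1 for the return-address offset (8 for call plus delay slot, or 12 when a struct-return pointer adds the unimp word), then appends one register operand per returned value; the matching SDNPVariadic flag on retflag appears in the SparcInstrInfo.td hunk below. A toy C++ sketch of that reserve-then-patch construction, with strings standing in for SDValues (illustrative only, not the SelectionDAG API):

    #include <cassert>
    #include <string>
    #include <vector>

    // Only the shape of the vector is the point: chain first, a slot that
    // gets patched later, the copied registers, then optional glue.
    std::vector<std::string> buildRetOps(bool hasStructRet, bool hasGlue) {
      std::vector<std::string> RetOps(1, "chain");
      RetOps.push_back("");                  // room for return-address offset
      RetOps.push_back("reg-copy");          // one entry per returned register
      RetOps[1] = hasStructRet ? "offset:12" // call + delay slot + unimp
                               : "offset:8"; // call + delay slot
      if (hasGlue)
        RetOps.push_back("glue");
      return RetOps;
    }

    int main() {
      assert(buildRetOps(true, true)[1] == "offset:12");
      assert(buildRetOps(false, false)[1] == "offset:8");
      return 0;
    }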
 
 /// LowerFormalArguments - V8 uses a very simple ABI, where all values are
@@ -759,10 +756,12 @@ SparcTargetLowering::SparcTargetLowering(TargetMachine &TM)
 
   setOperationAction(ISD::FSIN , MVT::f64, Expand);
   setOperationAction(ISD::FCOS , MVT::f64, Expand);
+  setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
   setOperationAction(ISD::FREM , MVT::f64, Expand);
   setOperationAction(ISD::FMA  , MVT::f64, Expand);
   setOperationAction(ISD::FSIN , MVT::f32, Expand);
   setOperationAction(ISD::FCOS , MVT::f32, Expand);
+  setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
   setOperationAction(ISD::FREM , MVT::f32, Expand);
   setOperationAction(ISD::FMA  , MVT::f32, Expand);
   setOperationAction(ISD::CTPOP, MVT::i32, Expand);
diff --git a/lib/Target/Sparc/SparcInstrInfo.td b/lib/Target/Sparc/SparcInstrInfo.td
index e64c140..90b698d 100644
--- a/lib/Target/Sparc/SparcInstrInfo.td
+++ b/lib/Target/Sparc/SparcInstrInfo.td
@@ -126,7 +126,7 @@ def call : SDNode<"SPISD::CALL", SDT_SPCall,
 
 def SDT_SPRet : SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>;
 def retflag : SDNode<"SPISD::RET_FLAG", SDT_SPRet,
-                     [SDNPHasChain, SDNPOptInGlue]>;
+                     [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
 
 def flushw : SDNode<"SPISD::FLUSHW", SDTNone,
                     [SDNPHasChain, SDNPSideEffect, SDNPMayStore]>;
diff --git a/lib/Target/Sparc/SparcRegisterInfo.cpp b/lib/Target/Sparc/SparcRegisterInfo.cpp
index 9c1c30b..25e90b7 100644
--- a/lib/Target/Sparc/SparcRegisterInfo.cpp
+++ b/lib/Target/Sparc/SparcRegisterInfo.cpp
@@ -56,45 +56,27 @@ BitVector SparcRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
   return Reserved;
 }
 
-void SparcRegisterInfo::
-eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
-                              MachineBasicBlock::iterator I) const {
-  MachineInstr &MI = *I;
-  DebugLoc dl = MI.getDebugLoc();
-  int Size = MI.getOperand(0).getImm();
-  if (MI.getOpcode() == SP::ADJCALLSTACKDOWN)
-    Size = -Size;
-  if (Size)
-    BuildMI(MBB, I, dl, TII.get(SP::ADDri), SP::O6).addReg(SP::O6).addImm(Size);
-  MBB.erase(I);
-}
-
 void
 SparcRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
-                                       int SPAdj, RegScavenger *RS) const {
+                                       int SPAdj, unsigned FIOperandNum,
+                                       RegScavenger *RS) const {
   assert(SPAdj == 0 && "Unexpected");
 
-  unsigned i = 0;
   MachineInstr &MI = *II;
   DebugLoc dl = MI.getDebugLoc();
-  while
(!MI.getOperand(i).isFI()) { - ++i; - assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!"); - } - - int FrameIndex = MI.getOperand(i).getIndex(); + int FrameIndex = MI.getOperand(FIOperandNum).getIndex(); // Addressable stack objects are accessed using neg. offsets from %fp MachineFunction &MF = *MI.getParent()->getParent(); int Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex) + - MI.getOperand(i+1).getImm(); + MI.getOperand(FIOperandNum + 1).getImm(); // Replace frame index with a frame pointer reference. if (Offset >= -4096 && Offset <= 4095) { // If the offset is small enough to fit in the immediate field, directly // encode it. - MI.getOperand(i).ChangeToRegister(SP::I6, false); - MI.getOperand(i+1).ChangeToImmediate(Offset); + MI.getOperand(FIOperandNum).ChangeToRegister(SP::I6, false); + MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset); } else { // Otherwise, emit a G1 = SETHI %hi(offset). FIXME: it would be better to // scavenge a register here instead of reserving G1 all of the time. @@ -104,8 +86,8 @@ SparcRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, BuildMI(*MI.getParent(), II, dl, TII.get(SP::ADDrr), SP::G1).addReg(SP::G1) .addReg(SP::I6); // Insert: G1+%lo(offset) into the user. - MI.getOperand(i).ChangeToRegister(SP::G1, false); - MI.getOperand(i+1).ChangeToImmediate(Offset & ((1 << 10)-1)); + MI.getOperand(FIOperandNum).ChangeToRegister(SP::G1, false); + MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset & ((1 << 10)-1)); } } diff --git a/lib/Target/Sparc/SparcRegisterInfo.h b/lib/Target/Sparc/SparcRegisterInfo.h index 9515ad3..357879b 100644 --- a/lib/Target/Sparc/SparcRegisterInfo.h +++ b/lib/Target/Sparc/SparcRegisterInfo.h @@ -36,12 +36,9 @@ struct SparcRegisterInfo : public SparcGenRegisterInfo { BitVector getReservedRegs(const MachineFunction &MF) const; - void eliminateCallFramePseudoInstr(MachineFunction &MF, - MachineBasicBlock &MBB, - MachineBasicBlock::iterator I) const; - void eliminateFrameIndex(MachineBasicBlock::iterator II, - int SPAdj, RegScavenger *RS = NULL) const; + int SPAdj, unsigned FIOperandNum, + RegScavenger *RS = NULL) const; void processFunctionBeforeFrameFinalized(MachineFunction &MF) const; diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp index ca438eb..b2c6d55 100644 --- a/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -57,7 +57,7 @@ private: X86Operand *ParseATTOperand(); X86Operand *ParseIntelOperand(); X86Operand *ParseIntelOffsetOfOperator(SMLoc StartLoc); - X86Operand *ParseIntelTypeOperator(SMLoc StartLoc); + X86Operand *ParseIntelOperator(SMLoc StartLoc, unsigned OpKind); X86Operand *ParseIntelMemOperand(unsigned SegReg, SMLoc StartLoc); X86Operand *ParseIntelBracExpression(unsigned SegReg, unsigned Size); X86Operand *ParseMemOperand(unsigned SegReg, SMLoc StartLoc); @@ -168,6 +168,7 @@ struct X86Operand : public MCParsedAsmOperand { SMLoc StartLoc, EndLoc; SMLoc OffsetOfLoc; + bool AddressOf; union { struct { @@ -340,6 +341,10 @@ struct X86Operand : public MCParsedAsmOperand { return OffsetOfLoc.getPointer(); } + bool needAddressOf() const { + return AddressOf; + } + bool needSizeDirective() const { assert(Kind == Memory && "Invalid access!"); return Mem.NeedSizeDir; @@ -471,9 +476,11 @@ struct X86Operand : public MCParsedAsmOperand { } static X86Operand *CreateReg(unsigned RegNo, SMLoc StartLoc, SMLoc EndLoc, + bool AddressOf = false, SMLoc OffsetOfLoc = SMLoc()) { X86Operand 
*Res = new X86Operand(Register, StartLoc, EndLoc); Res->Reg.RegNo = RegNo; + Res->AddressOf = AddressOf; Res->OffsetOfLoc = OffsetOfLoc; return Res; } @@ -488,7 +495,7 @@ struct X86Operand : public MCParsedAsmOperand { /// Create an absolute memory operand. static X86Operand *CreateMem(const MCExpr *Disp, SMLoc StartLoc, SMLoc EndLoc, - unsigned Size = 0, bool NeedSizeDir = false){ + unsigned Size = 0, bool NeedSizeDir = false) { X86Operand *Res = new X86Operand(Memory, StartLoc, EndLoc); Res->Mem.SegReg = 0; Res->Mem.Disp = Disp; @@ -497,6 +504,7 @@ struct X86Operand : public MCParsedAsmOperand { Res->Mem.Scale = 1; Res->Mem.Size = Size; Res->Mem.NeedSizeDir = NeedSizeDir; + Res->AddressOf = false; return Res; } @@ -520,6 +528,7 @@ struct X86Operand : public MCParsedAsmOperand { Res->Mem.Scale = Scale; Res->Mem.Size = Size; Res->Mem.NeedSizeDir = NeedSizeDir; + Res->AddressOf = false; return Res; } }; @@ -675,115 +684,299 @@ static unsigned getIntelMemOperandSize(StringRef OpStr) { return Size; } +enum IntelBracExprState { + IBES_START, + IBES_LBRAC, + IBES_RBRAC, + IBES_REGISTER, + IBES_REGISTER_STAR, + IBES_REGISTER_STAR_INTEGER, + IBES_INTEGER, + IBES_INTEGER_STAR, + IBES_INDEX_REGISTER, + IBES_IDENTIFIER, + IBES_DISP_EXPR, + IBES_MINUS, + IBES_ERROR +}; + +class IntelBracExprStateMachine { + IntelBracExprState State; + unsigned BaseReg, IndexReg, Scale; + int64_t Disp; + + unsigned TmpReg; + int64_t TmpInteger; + + bool isPlus; + +public: + IntelBracExprStateMachine(MCAsmParser &parser) : + State(IBES_START), BaseReg(0), IndexReg(0), Scale(1), Disp(0), + TmpReg(0), TmpInteger(0), isPlus(true) {} + + unsigned getBaseReg() { return BaseReg; } + unsigned getIndexReg() { return IndexReg; } + unsigned getScale() { return Scale; } + int64_t getDisp() { return Disp; } + bool isValidEndState() { return State == IBES_RBRAC; } + + void onPlus() { + switch (State) { + default: + State = IBES_ERROR; + break; + case IBES_INTEGER: + State = IBES_START; + if (isPlus) + Disp += TmpInteger; + else + Disp -= TmpInteger; + break; + case IBES_REGISTER: + State = IBES_START; + // If we already have a BaseReg, then assume this is the IndexReg with a + // scale of 1. + if (!BaseReg) { + BaseReg = TmpReg; + } else { + assert (!IndexReg && "BaseReg/IndexReg already set!"); + IndexReg = TmpReg; + Scale = 1; + } + break; + case IBES_INDEX_REGISTER: + State = IBES_START; + break; + } + isPlus = true; + } + void onMinus() { + switch (State) { + default: + State = IBES_ERROR; + break; + case IBES_START: + State = IBES_MINUS; + break; + case IBES_INTEGER: + State = IBES_START; + if (isPlus) + Disp += TmpInteger; + else + Disp -= TmpInteger; + break; + case IBES_REGISTER: + State = IBES_START; + // If we already have a BaseReg, then assume this is the IndexReg with a + // scale of 1. 
+ if (!BaseReg) { + BaseReg = TmpReg; + } else { + assert (!IndexReg && "BaseReg/IndexReg already set!"); + IndexReg = TmpReg; + Scale = 1; + } + break; + case IBES_INDEX_REGISTER: + State = IBES_START; + break; + } + isPlus = false; + } + void onRegister(unsigned Reg) { + switch (State) { + default: + State = IBES_ERROR; + break; + case IBES_START: + State = IBES_REGISTER; + TmpReg = Reg; + break; + case IBES_INTEGER_STAR: + assert (!IndexReg && "IndexReg already set!"); + State = IBES_INDEX_REGISTER; + IndexReg = Reg; + Scale = TmpInteger; + break; + } + } + void onDispExpr() { + switch (State) { + default: + State = IBES_ERROR; + break; + case IBES_START: + State = IBES_DISP_EXPR; + break; + } + } + void onInteger(int64_t TmpInt) { + switch (State) { + default: + State = IBES_ERROR; + break; + case IBES_START: + State = IBES_INTEGER; + TmpInteger = TmpInt; + break; + case IBES_MINUS: + State = IBES_INTEGER; + TmpInteger = TmpInt; + break; + case IBES_REGISTER_STAR: + assert (!IndexReg && "IndexReg already set!"); + State = IBES_INDEX_REGISTER; + IndexReg = TmpReg; + Scale = TmpInt; + break; + } + } + void onStar() { + switch (State) { + default: + State = IBES_ERROR; + break; + case IBES_INTEGER: + State = IBES_INTEGER_STAR; + break; + case IBES_REGISTER: + State = IBES_REGISTER_STAR; + break; + } + } + void onLBrac() { + switch (State) { + default: + State = IBES_ERROR; + break; + case IBES_RBRAC: + State = IBES_START; + isPlus = true; + break; + } + } + void onRBrac() { + switch (State) { + default: + State = IBES_ERROR; + break; + case IBES_DISP_EXPR: + State = IBES_RBRAC; + break; + case IBES_INTEGER: + State = IBES_RBRAC; + if (isPlus) + Disp += TmpInteger; + else + Disp -= TmpInteger; + break; + case IBES_REGISTER: + State = IBES_RBRAC; + // If we already have a BaseReg, then assume this is the IndexReg with a + // scale of 1. + if (!BaseReg) { + BaseReg = TmpReg; + } else { + assert (!IndexReg && "BaseReg/IndexReg already set!"); + IndexReg = TmpReg; + Scale = 1; + } + break; + case IBES_INDEX_REGISTER: + State = IBES_RBRAC; + break; + } + } +}; + X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg, unsigned Size) { - unsigned BaseReg = 0, IndexReg = 0, Scale = 1; const AsmToken &Tok = Parser.getTok(); SMLoc Start = Tok.getLoc(), End = Tok.getEndLoc(); - const MCExpr *Disp = MCConstantExpr::Create(0, getContext()); - // Parse [ BaseReg + Scale*IndexReg + Disp ] or [ symbol ] - // Eat '[' if (getLexer().isNot(AsmToken::LBrac)) return ErrorOperand(Start, "Expected '[' token!"); Parser.Lex(); + unsigned TmpReg = 0; + + // Try to handle '[' 'symbol' ']' if (getLexer().is(AsmToken::Identifier)) { - // Parse BaseReg - if (ParseRegister(BaseReg, Start, End)) { - // Handle '[' 'symbol' ']' - if (getParser().ParseExpression(Disp, End)) return 0; + if (ParseRegister(TmpReg, Start, End)) { + const MCExpr *Disp; + if (getParser().parseExpression(Disp, End)) + return 0; + if (getLexer().isNot(AsmToken::RBrac)) return ErrorOperand(Parser.getTok().getLoc(), "Expected ']' token!"); - End = Parser.getTok().getEndLoc(); + // Adjust the EndLoc due to the ']'. 
+ End = SMLoc::getFromPointer(Parser.getTok().getEndLoc().getPointer()-1); Parser.Lex(); return X86Operand::CreateMem(Disp, Start, End, Size); } - } else if (getLexer().is(AsmToken::Integer)) { - int64_t Val = Tok.getIntVal(); - Parser.Lex(); - SMLoc Loc = Tok.getLoc(); - if (getLexer().is(AsmToken::RBrac)) { - // Handle '[' number ']' - End = Parser.getTok().getEndLoc(); - Parser.Lex(); - const MCExpr *Disp = MCConstantExpr::Create(Val, getContext()); - if (SegReg) - return X86Operand::CreateMem(SegReg, Disp, 0, 0, Scale, - Start, End, Size); - return X86Operand::CreateMem(Disp, Start, End, Size); - } else if (getLexer().is(AsmToken::Star)) { - // Handle '[' Scale*IndexReg ']' - Parser.Lex(); - SMLoc IdxRegLoc = Tok.getLoc(); - if (ParseRegister(IndexReg, IdxRegLoc, End)) - return ErrorOperand(IdxRegLoc, "Expected register"); - Scale = Val; - } else - return ErrorOperand(Loc, "Unexpected token"); } - // Parse ][ as a plus. - bool ExpectRBrac = true; - if (getLexer().is(AsmToken::RBrac)) { - ExpectRBrac = false; - End = Parser.getTok().getEndLoc(); - Parser.Lex(); - } + // Parse [ BaseReg + Scale*IndexReg + Disp ]. + bool Done = false; + IntelBracExprStateMachine SM(Parser); + + // If we parsed a register, then the end loc has already been set and + // the identifier has already been lexed. We also need to update the + // state. + if (TmpReg) + SM.onRegister(TmpReg); + + const MCExpr *Disp = 0; + while (!Done) { + bool UpdateLocLex = true; - if (getLexer().is(AsmToken::Plus) || getLexer().is(AsmToken::Minus) || - getLexer().is(AsmToken::LBrac)) { - ExpectRBrac = true; - bool isPlus = getLexer().is(AsmToken::Plus) || - getLexer().is(AsmToken::LBrac); - Parser.Lex(); - SMLoc PlusLoc = Tok.getLoc(); - if (getLexer().is(AsmToken::Integer)) { + // The period in the dot operator (e.g., [ebx].foo.bar) is parsed as an + // identifier. Don't try to parse it as a register. + if (Tok.getString().startswith(".")) + break; + + switch (getLexer().getKind()) { + default: { + if (SM.isValidEndState()) { + Done = true; + break; + } + return ErrorOperand(Tok.getLoc(), "Unexpected token!"); + } + case AsmToken::Identifier: { + // This could be a register or a displacement expression. + if(!ParseRegister(TmpReg, Start, End)) { + SM.onRegister(TmpReg); + UpdateLocLex = false; + break; + } else if (!getParser().parseExpression(Disp, End)) { + SM.onDispExpr(); + UpdateLocLex = false; + break; + } + return ErrorOperand(Tok.getLoc(), "Unexpected identifier!"); + } + case AsmToken::Integer: { int64_t Val = Tok.getIntVal(); - Parser.Lex(); - if (getLexer().is(AsmToken::Star)) { - Parser.Lex(); - SMLoc IdxRegLoc = Tok.getLoc(); - if (ParseRegister(IndexReg, IdxRegLoc, End)) - return ErrorOperand(IdxRegLoc, "Expected register"); - Scale = Val; - } else if (getLexer().is(AsmToken::RBrac)) { - const MCExpr *ValExpr = MCConstantExpr::Create(Val, getContext()); - Disp = isPlus ? ValExpr : MCConstantExpr::Create(0-Val, getContext()); - } else - return ErrorOperand(PlusLoc, "unexpected token after +"); - } else if (getLexer().is(AsmToken::Identifier)) { - // This could be an index register or a displacement expression. - if (!IndexReg) - ParseRegister(IndexReg, Start, End); - else if (getParser().ParseExpression(Disp, End)) - return 0; + SM.onInteger(Val); + break; } - } - - // Parse ][ as a plus.
- if (getLexer().is(AsmToken::RBrac)) { - ExpectRBrac = false; - End = Parser.getTok().getEndLoc(); - Parser.Lex(); - if (getLexer().is(AsmToken::LBrac)) { - ExpectRBrac = true; - Parser.Lex(); - if (getParser().ParseExpression(Disp, End)) - return 0; + case AsmToken::Plus: SM.onPlus(); break; + case AsmToken::Minus: SM.onMinus(); break; + case AsmToken::Star: SM.onStar(); break; + case AsmToken::LBrac: SM.onLBrac(); break; + case AsmToken::RBrac: SM.onRBrac(); break; + } + if (!Done && UpdateLocLex) { + End = Tok.getLoc(); + Parser.Lex(); // Consume the token. } - } else if (ExpectRBrac) { - if (getParser().ParseExpression(Disp, End)) - return 0; } - if (ExpectRBrac) { - if (getLexer().isNot(AsmToken::RBrac)) - return ErrorOperand(End, "expected ']' token!"); - End = Parser.getTok().getEndLoc(); - Parser.Lex(); - } + if (!Disp) + Disp = MCConstantExpr::Create(SM.getDisp(), getContext()); // Parse the dot operator (e.g., [ebx].foo.bar). if (Tok.getString().startswith(".")) { @@ -797,10 +990,18 @@ X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg, Disp = NewDisp; } + int BaseReg = SM.getBaseReg(); + int IndexReg = SM.getIndexReg(); + // handle [-42] - if (!BaseReg && !IndexReg) - return X86Operand::CreateMem(Disp, Start, End, Size); + if (!BaseReg && !IndexReg) { + if (!SegReg) + return X86Operand::CreateMem(Disp, Start, End); + else + return X86Operand::CreateMem(SegReg, Disp, 0, 0, 1, Start, End, Size); + } + int Scale = SM.getScale(); return X86Operand::CreateMem(SegReg, Disp, BaseReg, IndexReg, Scale, Start, End, Size); } @@ -832,28 +1033,43 @@ X86Operand *X86AsmParser::ParseIntelMemOperand(unsigned SegReg, SMLoc Start) { } const MCExpr *Disp = MCConstantExpr::Create(0, getParser().getContext()); - if (getParser().ParseExpression(Disp, End)) + if (getParser().parseExpression(Disp, End)) return 0; bool NeedSizeDir = false; - if (!Size && isParsingInlineAsm()) { + bool IsVarDecl = false; + if (isParsingInlineAsm()) { if (const MCSymbolRefExpr *SymRef = dyn_cast<MCSymbolRefExpr>(Disp)) { const MCSymbol &Sym = SymRef->getSymbol(); // FIXME: The SemaLookup will fail if the name is anything other than an // identifier. // FIXME: Pass a valid SMLoc. - SemaCallback->LookupInlineAsmIdentifier(Sym.getName(), NULL, Size); + unsigned tLength, tSize, tType; + SemaCallback->LookupInlineAsmIdentifier(Sym.getName(), NULL, tLength, + tSize, tType, IsVarDecl); + if (!Size) + Size = tType * 8; // Size is in terms of bits in this context. NeedSizeDir = Size > 0; } } if (!isParsingInlineAsm()) return X86Operand::CreateMem(Disp, Start, End, Size); - else + else { + // If this is not a VarDecl then assume it is a FuncDecl or some other label + // reference. We need an 'r' constraint here, so we need to create a register + // operand to ensure proper matching. Just pick a GPR based on the size of + // a pointer. + if (!IsVarDecl) { + unsigned RegNo = is64BitMode() ? X86::RBX : X86::EBX; + return X86Operand::CreateReg(RegNo, Start, End, /*AddressOf=*/true); + } + // When parsing inline assembly we set the base register to a non-zero value + // as we don't know the actual value at this time. This is necessary to + // get the matching correct in some cases. return X86Operand::CreateMem(/*SegReg*/0, Disp, /*BaseReg*/1, /*IndexReg*/0, /*Scale*/1, Start, End, Size, NeedSizeDir); + } } /// Parse the '.' operator.
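// ---------------------------------------------------------------------------
// [Editor's note: illustrative sketch, not part of the patch above. The new
// IntelBracExprStateMachine replaces the ad-hoc token handling for Intel
// memory references of the form [BaseReg + Scale*IndexReg + Disp]. The
// standalone model below (all names hypothetical) shows the accumulator idea
// in isolation; the driver is assumed to map the "N*reg"/"reg*N" token
// pattern to onScaledIndex, as the real machine does via its *_STAR states.]
#include <cassert>
#include <cstdint>

struct BracExprAccumulator {
  unsigned BaseReg = 0, IndexReg = 0;
  int64_t Scale = 1, Disp = 0;
  bool IsPlus = true; // sign applied to the next integer term

  void onPlus() { IsPlus = true; }
  void onMinus() { IsPlus = false; }
  void onInteger(int64_t V) { Disp += IsPlus ? V : -V; }
  void onScaledIndex(unsigned Reg, int64_t S) {
    assert(!IndexReg && "IndexReg already set!");
    IndexReg = Reg;
    Scale = S;
  }
  void onRegister(unsigned Reg) {
    // The first bare register is the base; a second bare register becomes
    // the index with an implicit scale of 1, mirroring the onPlus/onRBrac
    // handling in the patch.
    if (!BaseReg)
      BaseReg = Reg;
    else
      onScaledIndex(Reg, 1);
  }
};
// Feeding "[eax + 4*ebx - 8]" through this object -- onRegister(EAX),
// onPlus(), onScaledIndex(EBX, 4), onMinus(), onInteger(8) -- leaves
// BaseReg=EAX, IndexReg=EBX, Scale=4, Disp=-8: the components that
// ParseIntelBracExpression finally hands to X86Operand::CreateMem.
// ---------------------------------------------------------------------------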
@@ -919,7 +1135,7 @@ X86Operand *X86AsmParser::ParseIntelOffsetOfOperator(SMLoc Start) { SMLoc End; const MCExpr *Val; - if (getParser().ParseExpression(Val, End)) + if (getParser().parseExpression(Val, End)) return ErrorOperand(Start, "Unable to parse expression!"); // Don't emit the offset operator. @@ -929,13 +1145,23 @@ X86Operand *X86AsmParser::ParseIntelOffsetOfOperator(SMLoc Start) { // register operand to ensure proper matching. Just pick a GPR based on // the size of a pointer. unsigned RegNo = is64BitMode() ? X86::RBX : X86::EBX; - return X86Operand::CreateReg(RegNo, Start, End, OffsetOfLoc); + return X86Operand::CreateReg(RegNo, Start, End, /*GetAddress=*/true, + OffsetOfLoc); } -/// Parse the 'TYPE' operator. The TYPE operator returns the size of a C or -/// C++ type or variable. If the variable is an array, TYPE returns the size of -/// a single element of the array. -X86Operand *X86AsmParser::ParseIntelTypeOperator(SMLoc Start) { +enum IntelOperatorKind { + IOK_LENGTH, + IOK_SIZE, + IOK_TYPE +}; + +/// Parse the 'LENGTH', 'TYPE' and 'SIZE' operators. The LENGTH operator +/// returns the number of elements in an array. It returns the value 1 for +/// non-array variables. The SIZE operator returns the size of a C or C++ +/// variable. A variable's size is the product of its LENGTH and TYPE. The +/// TYPE operator returns the size of a C or C++ type or variable. If the +/// variable is an array, TYPE returns the size of a single element. +X86Operand *X86AsmParser::ParseIntelOperator(SMLoc Start, unsigned OpKind) { SMLoc TypeLoc = Start; Parser.Lex(); // Eat the operator. Start = Parser.getTok().getLoc(); @@ -943,60 +1169,63 @@ X86Operand *X86AsmParser::ParseIntelTypeOperator(SMLoc Start) { SMLoc End; const MCExpr *Val; - if (getParser().ParseExpression(Val, End)) + if (getParser().parseExpression(Val, End)) return 0; - unsigned Size = 0; + unsigned Length = 0, Size = 0, Type = 0; if (const MCSymbolRefExpr *SymRef = dyn_cast<MCSymbolRefExpr>(Val)) { const MCSymbol &Sym = SymRef->getSymbol(); // FIXME: The SemaLookup will fail if the name is anything other than an // identifier. // FIXME: Pass a valid SMLoc. - if (!SemaCallback->LookupInlineAsmIdentifier(Sym.getName(), NULL, Size)) - return ErrorOperand(Start, "Unable to lookup TYPE of expr!"); - - Size /= 8; // Size is in terms of bits, but we want bytes in the context. + bool IsVarDecl; + if (!SemaCallback->LookupInlineAsmIdentifier(Sym.getName(), NULL, Length, + Size, Type, IsVarDecl)) + return ErrorOperand(Start, "Unable to lookup expr!"); + } + unsigned CVal; + switch(OpKind) { + default: llvm_unreachable("Unexpected operand kind!"); + case IOK_LENGTH: CVal = Length; break; + case IOK_SIZE: CVal = Size; break; + case IOK_TYPE: CVal = Type; break; } // Rewrite the type operator and the C or C++ type or variable in terms of an // immediate. E.g. TYPE foo -> $$4 unsigned Len = End.getPointer() - TypeLoc.getPointer(); - InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_Imm, TypeLoc, Len, Size)); + InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_Imm, TypeLoc, Len, CVal)); - const MCExpr *Imm = MCConstantExpr::Create(Size, getContext()); + const MCExpr *Imm = MCConstantExpr::Create(CVal, getContext()); return X86Operand::CreateImm(Imm, Start, End, /*NeedAsmRewrite*/false); } X86Operand *X86AsmParser::ParseIntelOperand() { SMLoc Start = Parser.getTok().getLoc(), End; - - // offset operator.
StringRef AsmTokStr = Parser.getTok().getString(); - if ((AsmTokStr == "offset" || AsmTokStr == "OFFSET") && - isParsingInlineAsm()) - return ParseIntelOffsetOfOperator(Start); - - // Type directive. - if ((AsmTokStr == "type" || AsmTokStr == "TYPE") && - isParsingInlineAsm()) - return ParseIntelTypeOperator(Start); - - // Unsupported directives. - if (isParsingIntelSyntax() && - (AsmTokStr == "size" || AsmTokStr == "SIZE" || - AsmTokStr == "length" || AsmTokStr == "LENGTH")) - return ErrorOperand(Start, "Unsupported directive!"); - - // immediate. + + // Offset, length, type and size operators. + if (isParsingInlineAsm()) { + if (AsmTokStr == "offset" || AsmTokStr == "OFFSET") + return ParseIntelOffsetOfOperator(Start); + if (AsmTokStr == "length" || AsmTokStr == "LENGTH") + return ParseIntelOperator(Start, IOK_LENGTH); + if (AsmTokStr == "size" || AsmTokStr == "SIZE") + return ParseIntelOperator(Start, IOK_SIZE); + if (AsmTokStr == "type" || AsmTokStr == "TYPE") + return ParseIntelOperator(Start, IOK_TYPE); + } + + // Immediate. if (getLexer().is(AsmToken::Integer) || getLexer().is(AsmToken::Real) || getLexer().is(AsmToken::Minus)) { const MCExpr *Val; - if (!getParser().ParseExpression(Val, End)) { + if (!getParser().parseExpression(Val, End)) { return X86Operand::CreateImm(Val, Start, End); } } - // register + // Register. unsigned RegNo = 0; if (!ParseRegister(RegNo, Start, End)) { // If this is a segment register followed by a ':', then this is the start @@ -1008,7 +1237,7 @@ X86Operand *X86AsmParser::ParseIntelOperand() { return ParseIntelMemOperand(RegNo, Start); } - // mem operand + // Memory operand. return ParseIntelMemOperand(0, Start); } @@ -1042,7 +1271,7 @@ X86Operand *X86AsmParser::ParseATTOperand() { SMLoc Start = Parser.getTok().getLoc(), End; Parser.Lex(); const MCExpr *Val; - if (getParser().ParseExpression(Val, End)) + if (getParser().parseExpression(Val, End)) return 0; return X86Operand::CreateImm(Val, Start, End); } @@ -1060,7 +1289,7 @@ X86Operand *X86AsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) { const MCExpr *Disp = MCConstantExpr::Create(0, getParser().getContext()); if (getLexer().isNot(AsmToken::LParen)) { SMLoc ExprEnd; - if (getParser().ParseExpression(Disp, ExprEnd)) return 0; + if (getParser().parseExpression(Disp, ExprEnd)) return 0; // After parsing the base expression we could either have a parenthesized // memory address or not. If not, return now. If so, eat the (. @@ -1086,7 +1315,7 @@ X86Operand *X86AsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) { SMLoc ExprEnd; // It must be a parenthesized expression, parse it now.
- if (getParser().ParseParenExpression(Disp, ExprEnd)) + if (getParser().parseParenExpression(Disp, ExprEnd)) return 0; // After parsing the base expression we could either have a parenthesized @@ -1146,7 +1375,7 @@ X86Operand *X86AsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) { SMLoc Loc = Parser.getTok().getLoc(); int64_t ScaleVal; - if (getParser().ParseAbsoluteExpression(ScaleVal)){ + if (getParser().parseAbsoluteExpression(ScaleVal)){ Error(Loc, "expected scale expression"); return 0; } @@ -1165,7 +1394,7 @@ X86Operand *X86AsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) { SMLoc Loc = Parser.getTok().getLoc(); int64_t Value; - if (getParser().ParseAbsoluteExpression(Value)) + if (getParser().parseAbsoluteExpression(Value)) return 0; if (Value != 1) @@ -1306,7 +1535,7 @@ ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, if (X86Operand *Op = ParseOperand()) Operands.push_back(Op); else { - Parser.EatToEndOfStatement(); + Parser.eatToEndOfStatement(); return true; } @@ -1317,14 +1546,14 @@ ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, if (X86Operand *Op = ParseOperand()) Operands.push_back(Op); else { - Parser.EatToEndOfStatement(); + Parser.eatToEndOfStatement(); return true; } } if (getLexer().isNot(AsmToken::EndOfStatement)) { SMLoc Loc = getLexer().getLoc(); - Parser.EatToEndOfStatement(); + Parser.eatToEndOfStatement(); return Error(Loc, "unexpected token in argument list"); } } @@ -2014,10 +2243,10 @@ bool X86AsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) { if (getLexer().isNot(AsmToken::EndOfStatement)) { for (;;) { const MCExpr *Value; - if (getParser().ParseExpression(Value)) + if (getParser().parseExpression(Value)) return true; - getParser().getStreamer().EmitValue(Value, Size, 0 /*addrspace*/); + getParser().getStreamer().EmitValue(Value, Size); if (getLexer().is(AsmToken::EndOfStatement)) break; diff --git a/lib/Target/X86/InstPrinter/X86InstComments.cpp b/lib/Target/X86/InstPrinter/X86InstComments.cpp index 64ac5e6..0f6eeb1 100644 --- a/lib/Target/X86/InstPrinter/X86InstComments.cpp +++ b/lib/Target/X86/InstPrinter/X86InstComments.cpp @@ -34,10 +34,6 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, switch (MI->getOpcode()) { case X86::INSERTPSrr: - Src1Name = getRegName(MI->getOperand(0).getReg()); - Src2Name = getRegName(MI->getOperand(2).getReg()); - DecodeINSERTPSMask(MI->getOperand(3).getImm(), ShuffleMask); - break; case X86::VINSERTPSrr: DestName = getRegName(MI->getOperand(0).getReg()); Src1Name = getRegName(MI->getOperand(1).getReg()); @@ -46,10 +42,6 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, break; case X86::MOVLHPSrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - Src1Name = getRegName(MI->getOperand(0).getReg()); - DecodeMOVLHPSMask(2, ShuffleMask); - break; case X86::VMOVLHPSrr: Src2Name = getRegName(MI->getOperand(2).getReg()); Src1Name = getRegName(MI->getOperand(1).getReg()); @@ -58,10 +50,6 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, break; case X86::MOVHLPSrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - Src1Name = getRegName(MI->getOperand(0).getReg()); - DecodeMOVHLPSMask(2, ShuffleMask); - break; case X86::VMOVHLPSrr: Src2Name = getRegName(MI->getOperand(2).getReg()); Src1Name = getRegName(MI->getOperand(1).getReg()); @@ -69,6 +57,29 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, DecodeMOVHLPSMask(2, ShuffleMask); break; + case X86::PALIGNR128rr: 
+ case X86::VPALIGNR128rr: + Src1Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::PALIGNR128rm: + case X86::VPALIGNR128rm: + Src2Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodePALIGNRMask(MVT::v16i8, + MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); + break; + case X86::VPALIGNR256rr: + Src1Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VPALIGNR256rm: + Src2Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodePALIGNRMask(MVT::v32i8, + MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); + break; + case X86::PSHUFDri: case X86::VPSHUFDri: Src1Name = getRegName(MI->getOperand(1).getReg()); @@ -131,15 +142,10 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, break; case X86::PUNPCKHBWrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::PUNPCKHBWrm: - Src1Name = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKHMask(MVT::v16i8, ShuffleMask); - break; case X86::VPUNPCKHBWrr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. + case X86::PUNPCKHBWrm: case X86::VPUNPCKHBWrm: Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); @@ -154,15 +160,10 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, DecodeUNPCKHMask(MVT::v32i8, ShuffleMask); break; case X86::PUNPCKHWDrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::PUNPCKHWDrm: - Src1Name = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKHMask(MVT::v8i16, ShuffleMask); - break; case X86::VPUNPCKHWDrr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. + case X86::PUNPCKHWDrm: case X86::VPUNPCKHWDrm: Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); @@ -177,15 +178,10 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, DecodeUNPCKHMask(MVT::v16i16, ShuffleMask); break; case X86::PUNPCKHDQrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::PUNPCKHDQrm: - Src1Name = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKHMask(MVT::v4i32, ShuffleMask); - break; case X86::VPUNPCKHDQrr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. + case X86::PUNPCKHDQrm: case X86::VPUNPCKHDQrm: Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); @@ -200,15 +196,10 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, DecodeUNPCKHMask(MVT::v8i32, ShuffleMask); break; case X86::PUNPCKHQDQrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::PUNPCKHQDQrm: - Src1Name = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKHMask(MVT::v2i64, ShuffleMask); - break; case X86::VPUNPCKHQDQrr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. + case X86::PUNPCKHQDQrm: case X86::VPUNPCKHQDQrm: Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); @@ -224,15 +215,10 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, break; case X86::PUNPCKLBWrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. 
- case X86::PUNPCKLBWrm: - Src1Name = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKLMask(MVT::v16i8, ShuffleMask); - break; case X86::VPUNPCKLBWrr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. + case X86::PUNPCKLBWrm: case X86::VPUNPCKLBWrm: Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); @@ -247,15 +233,10 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, DecodeUNPCKLMask(MVT::v32i8, ShuffleMask); break; case X86::PUNPCKLWDrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::PUNPCKLWDrm: - Src1Name = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKLMask(MVT::v8i16, ShuffleMask); - break; case X86::VPUNPCKLWDrr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. + case X86::PUNPCKLWDrm: case X86::VPUNPCKLWDrm: Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); @@ -270,15 +251,10 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, DecodeUNPCKLMask(MVT::v16i16, ShuffleMask); break; case X86::PUNPCKLDQrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::PUNPCKLDQrm: - Src1Name = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKLMask(MVT::v4i32, ShuffleMask); - break; case X86::VPUNPCKLDQrr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. + case X86::PUNPCKLDQrm: case X86::VPUNPCKLDQrm: Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); @@ -293,15 +269,10 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, DecodeUNPCKLMask(MVT::v8i32, ShuffleMask); break; case X86::PUNPCKLQDQrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::PUNPCKLQDQrm: - Src1Name = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKLMask(MVT::v2i64, ShuffleMask); - break; case X86::VPUNPCKLQDQrr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. + case X86::PUNPCKLQDQrm: case X86::VPUNPCKLQDQrm: Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); @@ -317,16 +288,10 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, break; case X86::SHUFPDrri: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::SHUFPDrmi: - DecodeSHUFPMask(MVT::v2f64, MI->getOperand(MI->getNumOperands()-1).getImm(), - ShuffleMask); - Src1Name = getRegName(MI->getOperand(0).getReg()); - break; case X86::VSHUFPDrri: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. + case X86::SHUFPDrmi: case X86::VSHUFPDrmi: DecodeSHUFPMask(MVT::v2f64, MI->getOperand(MI->getNumOperands()-1).getImm(), ShuffleMask); @@ -344,16 +309,10 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, break; case X86::SHUFPSrri: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::SHUFPSrmi: - DecodeSHUFPMask(MVT::v4f32, MI->getOperand(MI->getNumOperands()-1).getImm(), - ShuffleMask); - Src1Name = getRegName(MI->getOperand(0).getReg()); - break; case X86::VSHUFPSrri: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. 
+ case X86::SHUFPSrmi: case X86::VSHUFPSrmi: DecodeSHUFPMask(MVT::v4f32, MI->getOperand(MI->getNumOperands()-1).getImm(), ShuffleMask); @@ -371,15 +330,10 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, break; case X86::UNPCKLPDrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::UNPCKLPDrm: - DecodeUNPCKLMask(MVT::v2f64, ShuffleMask); - Src1Name = getRegName(MI->getOperand(0).getReg()); - break; case X86::VUNPCKLPDrr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. + case X86::UNPCKLPDrm: case X86::VUNPCKLPDrm: DecodeUNPCKLMask(MVT::v2f64, ShuffleMask); Src1Name = getRegName(MI->getOperand(1).getReg()); @@ -394,15 +348,10 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, DestName = getRegName(MI->getOperand(0).getReg()); break; case X86::UNPCKLPSrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::UNPCKLPSrm: - DecodeUNPCKLMask(MVT::v4f32, ShuffleMask); - Src1Name = getRegName(MI->getOperand(0).getReg()); - break; case X86::VUNPCKLPSrr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. + case X86::UNPCKLPSrm: case X86::VUNPCKLPSrm: DecodeUNPCKLMask(MVT::v4f32, ShuffleMask); Src1Name = getRegName(MI->getOperand(1).getReg()); @@ -417,15 +366,10 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, DestName = getRegName(MI->getOperand(0).getReg()); break; case X86::UNPCKHPDrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::UNPCKHPDrm: - DecodeUNPCKHMask(MVT::v2f64, ShuffleMask); - Src1Name = getRegName(MI->getOperand(0).getReg()); - break; case X86::VUNPCKHPDrr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. + case X86::UNPCKHPDrm: case X86::VUNPCKHPDrm: DecodeUNPCKHMask(MVT::v2f64, ShuffleMask); Src1Name = getRegName(MI->getOperand(1).getReg()); @@ -440,15 +384,10 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, DestName = getRegName(MI->getOperand(0).getReg()); break; case X86::UNPCKHPSrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::UNPCKHPSrm: - DecodeUNPCKHMask(MVT::v4f32, ShuffleMask); - Src1Name = getRegName(MI->getOperand(0).getReg()); - break; case X86::VUNPCKHPSrr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. + case X86::UNPCKHPSrm: case X86::VUNPCKHPSrm: DecodeUNPCKHMask(MVT::v4f32, ShuffleMask); Src1Name = getRegName(MI->getOperand(1).getReg()); diff --git a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h index 7ea1961..9e68388 100644 --- a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h +++ b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h @@ -104,7 +104,7 @@ namespace X86II { /// MO_TLSLD - On a symbol operand this indicates that the immediate is /// the offset of the GOT entry with the TLS index for the module that - /// contains the symbol. When this index is passed to a call to to + /// contains the symbol. When this index is passed to a call to /// __tls_get_addr, the function will return the base address of the TLS /// block for the symbol. Used in the x86-64 local dynamic TLS access model. /// @@ -114,7 +114,7 @@ namespace X86II { /// MO_TLSLDM - On a symbol operand this indicates that the immediate is /// the offset of the GOT entry with the TLS index for the module that - /// contains the symbol. When this index is passed to a call to to + /// contains the symbol. 
When this index is passed to a call to /// ___tls_get_addr, the function will return the base address of the TLS /// block for the symbol. Used in the IA32 local dynamic TLS access model. /// diff --git a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp index 16488eb..7815ae9 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp @@ -44,7 +44,7 @@ void X86MCAsmInfoDarwin::anchor() { } X86MCAsmInfoDarwin::X86MCAsmInfoDarwin(const Triple &T) { bool is64Bit = T.getArch() == Triple::x86_64; if (is64Bit) - PointerSize = 8; + PointerSize = CalleeSaveStackSlotSize = 8; AssemblerDialect = AsmWriterFlavor; @@ -76,8 +76,16 @@ X86_64MCAsmInfoDarwin::X86_64MCAsmInfoDarwin(const Triple &Triple) void X86ELFMCAsmInfo::anchor() { } X86ELFMCAsmInfo::X86ELFMCAsmInfo(const Triple &T) { - if (T.getArch() == Triple::x86_64) - PointerSize = 8; + bool is64Bit = T.getArch() == Triple::x86_64; + bool isX32 = T.getEnvironment() == Triple::GNUX32; + + // For ELF, x86-64 pointer size depends on the ABI. + // For x86-64 without the x32 ABI, pointer size is 8. For x86 and for x86-64 + // with the x32 ABI, pointer size remains the default 4. + PointerSize = (is64Bit && !isX32) ? 8 : 4; + + // OTOH, stack slot size is always 8 for x86-64, even with the x32 ABI. + CalleeSaveStackSlotSize = is64Bit ? 8 : 4; AssemblerDialect = AsmWriterFlavor; diff --git a/lib/Target/X86/README-SSE.txt b/lib/Target/X86/README-SSE.txt index 4011035..496b704 100644 --- a/lib/Target/X86/README-SSE.txt +++ b/lib/Target/X86/README-SSE.txt @@ -953,3 +953,12 @@ similarly, v[0]-v[1] should match to hsubpd, and {v[0]-v[1], w[0]-w[1]} should turn into hsubpd also. //===---------------------------------------------------------------------===// + +define <2 x i32> @foo(<2 x double> %in) { + %x = fptosi <2 x double> %in to <2 x i32> + ret <2 x i32> %x +} + +Should compile into cvttpd2dq instead of being scalarized into 2 cvttsd2si. + +//===---------------------------------------------------------------------===// diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.cpp b/lib/Target/X86/Utils/X86ShuffleDecode.cpp index 8b87c1f..bbd4904 100644 --- a/lib/Target/X86/Utils/X86ShuffleDecode.cpp +++ b/lib/Target/X86/Utils/X86ShuffleDecode.cpp @@ -61,6 +61,24 @@ void DecodeMOVLHPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask) { ShuffleMask.push_back(NElts+i); } +void DecodePALIGNRMask(MVT VT, unsigned Imm, + SmallVectorImpl<int> &ShuffleMask) { + unsigned NumElts = VT.getVectorNumElements(); + unsigned Offset = Imm * (VT.getVectorElementType().getSizeInBits() / 8); + + unsigned NumLanes = VT.getSizeInBits() / 128; + unsigned NumLaneElts = NumElts / NumLanes; + + for (unsigned l = 0; l != NumElts; l += NumLaneElts) { + for (unsigned i = 0; i != NumLaneElts; ++i) { + unsigned Base = i + Offset; + // if i+offset is out of this lane then we actually need the other source + if (Base >= NumLaneElts) Base += NumElts - NumLaneElts; + ShuffleMask.push_back(Base + l); + } + } +} + /// DecodePSHUFMask - This decodes the shuffle masks for pshufd, and vpermilp*. /// VT indicates the type of the vector allowing it to handle different /// datatypes and vector widths. 
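// ---------------------------------------------------------------------------
// [Editor's note: illustrative sketch, not part of the patch. A standalone
// model of the DecodePALIGNRMask logic added above; names are hypothetical.]
#include <vector>

// PALIGNR shifts the concatenation of two sources right by Imm bytes within
// each 128-bit lane. Mask entries >= NumElts select elements of the other
// source in the decoded two-input shuffle.
static std::vector<int> decodePALIGNRMask(unsigned NumElts, unsigned EltBits,
                                          unsigned Imm) {
  std::vector<int> Mask;
  unsigned Offset = Imm * (EltBits / 8);
  unsigned NumLanes = (NumElts * EltBits) / 128;
  unsigned NumLaneElts = NumElts / NumLanes;
  for (unsigned l = 0; l != NumElts; l += NumLaneElts)
    for (unsigned i = 0; i != NumLaneElts; ++i) {
      unsigned Base = i + Offset;
      // An element shifted past the end of its lane comes from the other
      // source, so rebase its index past NumElts.
      if (Base >= NumLaneElts)
        Base += NumElts - NumLaneElts;
      Mask.push_back(Base + l);
    }
  return Mask;
}
// Example: decodePALIGNRMask(16, 8, 4), i.e. v16i8 with Imm = 4, yields
// <4,5,...,15,16,17,18,19>: bytes 4..15 of one source followed by bytes
// 0..3 of the other, which is what the new PALIGNR cases in
// X86InstComments.cpp print as a shuffle comment.
// ---------------------------------------------------------------------------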
diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.h b/lib/Target/X86/Utils/X86ShuffleDecode.h index 70d8171..017ab32 100644 --- a/lib/Target/X86/Utils/X86ShuffleDecode.h +++ b/lib/Target/X86/Utils/X86ShuffleDecode.h @@ -35,6 +35,8 @@ void DecodeMOVHLPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask); // <0,2> or <0,1,4,5> void DecodeMOVLHPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask); +void DecodePALIGNRMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); + void DecodePSHUFMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); void DecodePSHUFHWMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td index 3ab2899..0216252 100644 --- a/lib/Target/X86/X86.td +++ b/lib/Target/X86/X86.td @@ -120,6 +120,8 @@ def FeatureBMI2 : SubtargetFeature<"bmi2", "HasBMI2", "true", "Support BMI2 instructions">; def FeatureRTM : SubtargetFeature<"rtm", "HasRTM", "true", "Support RTM instructions">; +def FeatureADX : SubtargetFeature<"adx", "HasADX", "true", + "Support ADX instructions">; def FeatureLeaForSP : SubtargetFeature<"lea-sp", "UseLeaForSP", "true", "Use LEA for adjusting the stack pointer">; def FeatureSlowDivide : SubtargetFeature<"idiv-to-divb", diff --git a/lib/Target/X86/X86AsmPrinter.cpp b/lib/Target/X86/X86AsmPrinter.cpp index 5b3e0ba..ac5daec 100644 --- a/lib/Target/X86/X86AsmPrinter.cpp +++ b/lib/Target/X86/X86AsmPrinter.cpp @@ -252,14 +252,15 @@ void X86AsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo, } case MachineOperand::MO_Immediate: - O << '$' << MO.getImm(); + if (AsmVariant == 0) O << '$'; + O << MO.getImm(); return; case MachineOperand::MO_JumpTableIndex: case MachineOperand::MO_ConstantPoolIndex: case MachineOperand::MO_GlobalAddress: case MachineOperand::MO_ExternalSymbol: { - O << '$'; + if (AsmVariant == 0) O << '$'; printSymbolOperand(MO, O); break; } @@ -355,19 +356,23 @@ void X86AsmPrinter::printIntelMemReference(const MachineInstr *MI, unsigned Op, NeedPlus = true; } - assert (DispSpec.isImm() && "Displacement is not an immediate!"); - int64_t DispVal = DispSpec.getImm(); - if (DispVal || (!IndexReg.getReg() && !BaseReg.getReg())) { - if (NeedPlus) { - if (DispVal > 0) - O << " + "; - else { - O << " - "; - DispVal = -DispVal; + if (!DispSpec.isImm()) { + if (NeedPlus) O << " + "; + printOperand(MI, Op+3, O, Modifier, AsmVariant); + } else { + int64_t DispVal = DispSpec.getImm(); + if (DispVal || (!IndexReg.getReg() && !BaseReg.getReg())) { + if (NeedPlus) { + if (DispVal > 0) + O << " + "; + else { + O << " - "; + DispVal = -DispVal; + } } + O << DispVal; } - O << DispVal; - } + } O << ']'; } @@ -543,7 +548,7 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) { MCSA_IndirectSymbol); // hlt; hlt; hlt; hlt; hlt hlt = 0xf4. const char HltInsts[] = "\xf4\xf4\xf4\xf4\xf4"; - OutStreamer.EmitBytes(StringRef(HltInsts, 5), 0/*addrspace*/); + OutStreamer.EmitBytes(StringRef(HltInsts, 5)); } Stubs.clear(); @@ -569,7 +574,7 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) { // .long 0 if (MCSym.getInt()) // External to current translation unit. - OutStreamer.EmitIntValue(0, 4/*size*/, 0/*addrspace*/); + OutStreamer.EmitIntValue(0, 4/*size*/); else // Internal to current translation unit. // @@ -578,8 +583,7 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) { // using NLPs. However, sometimes the types are local to the file. So // we need to fill in the value for the NLP in those cases. 
OutStreamer.EmitValue(MCSymbolRefExpr::Create(MCSym.getPointer(), - OutContext), - 4/*size*/, 0/*addrspace*/); + OutContext), 4/*size*/); } Stubs.clear(); OutStreamer.AddBlankLine(); @@ -596,8 +600,7 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) { // .long _foo OutStreamer.EmitValue(MCSymbolRefExpr:: Create(Stubs[i].second.getPointer(), - OutContext), - 4/*size*/, 0/*addrspace*/); + OutContext), 4/*size*/); } Stubs.clear(); OutStreamer.AddBlankLine(); @@ -663,7 +666,7 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) { name += ",DATA"; else name += ",data"; - OutStreamer.EmitBytes(name, 0); + OutStreamer.EmitBytes(name); } for (unsigned i = 0, e = DLLExportedFns.size(); i != e; ++i) { @@ -672,7 +675,7 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) { else name = " -export:"; name += DLLExportedFns[i]->getName(); - OutStreamer.EmitBytes(name, 0); + OutStreamer.EmitBytes(name); } } } @@ -692,7 +695,7 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) { for (unsigned i = 0, e = Stubs.size(); i != e; ++i) { OutStreamer.EmitLabel(Stubs[i].first); OutStreamer.EmitSymbolValue(Stubs[i].second.getPointer(), - TD->getPointerSize(), 0); + TD->getPointerSize()); } Stubs.clear(); } diff --git a/lib/Target/X86/X86AsmPrinter.h b/lib/Target/X86/X86AsmPrinter.h index 61eb14e..bc7496b 100644 --- a/lib/Target/X86/X86AsmPrinter.h +++ b/lib/Target/X86/X86AsmPrinter.h @@ -1,4 +1,4 @@ -//===-- X86AsmPrinter.h - Convert X86 LLVM code to assembly -----*- C++ -*-===// +//===-- X86AsmPrinter.h - X86 implementation of AsmPrinter ------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -6,10 +6,6 @@ // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// -// -// AT&T assembly code printer class. -// -//===----------------------------------------------------------------------===// #ifndef X86ASMPRINTER_H #define X86ASMPRINTER_H @@ -35,7 +31,7 @@ class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter { } virtual const char *getPassName() const LLVM_OVERRIDE { - return "X86 AT&T-Style Assembly Printer"; + return "X86 Assembly / Object Emitter"; } const X86Subtarget &getSubtarget() const { return *Subtarget; } diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td index 7ad2fdd..b516be0 100644 --- a/lib/Target/X86/X86CallingConv.td +++ b/lib/Target/X86/X86CallingConv.td @@ -519,6 +519,9 @@ def CSR_64EHRet : CalleeSavedRegs<(add RAX, RDX, CSR_64)>; def CSR_Win64 : CalleeSavedRegs<(add RBX, RBP, RDI, RSI, R12, R13, R14, R15, (sequence "XMM%u", 6, 15))>; +def CSR_MostRegs_64 : CalleeSavedRegs<(add RBX, RCX, RDX, RSI, RDI, R8, R9, R10, + R11, R12, R13, R14, R15, RBP, + (sequence "XMM%u", 0, 15))>; // Standard C + YMM6-15 def CSR_Win64_Intel_OCL_BI_AVX : CalleeSavedRegs<(add RBX, RBP, RDI, RSI, R12, diff --git a/lib/Target/X86/X86CodeEmitter.cpp b/lib/Target/X86/X86CodeEmitter.cpp index bc77334..ece38aa 100644 --- a/lib/Target/X86/X86CodeEmitter.cpp +++ b/lib/Target/X86/X86CodeEmitter.cpp @@ -124,7 +124,7 @@ template<class CodeEmitter> } // end anonymous namespace. /// createX86CodeEmitterPass - Return a pass that emits the collected X86 code -/// to the specified templated MachineCodeEmitter object. +/// to the specified JITCodeEmitter object. 
FunctionPass *llvm::createX86JITCodeEmitterPass(X86TargetMachine &TM, JITCodeEmitter &JCE) { return new Emitter<JITCodeEmitter>(TM, JCE); diff --git a/lib/Target/X86/X86CompilationCallback_Win64.asm b/lib/Target/X86/X86CompilationCallback_Win64.asm index f321778..69b4c71 100644 --- a/lib/Target/X86/X86CompilationCallback_Win64.asm +++ b/lib/Target/X86/X86CompilationCallback_Win64.asm @@ -11,7 +11,7 @@ ;; ;;===----------------------------------------------------------------------=== -extrn X86CompilationCallback2: PROC +extrn LLVMX86CompilationCallback2: PROC .code X86CompilationCallback proc @@ -42,7 +42,7 @@ X86CompilationCallback proc ; Pass prev frame and return address. mov rcx, rbp mov rdx, qword ptr [rbp+8] - call X86CompilationCallback2 + call LLVMX86CompilationCallback2 ; Restore all XMM arg registers. movaps xmm3, [rsp+48+32] diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp index 5facb7b..b5c3270 100644 --- a/lib/Target/X86/X86FastISel.cpp +++ b/lib/Target/X86/X86FastISel.cpp @@ -75,6 +75,8 @@ public: virtual bool TryToFoldLoad(MachineInstr *MI, unsigned OpNo, const LoadInst *LI); + virtual bool FastLowerArguments(); + #include "X86GenFastISel.inc" private: @@ -326,12 +328,11 @@ bool X86FastISel::X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT, unsigned &ResultReg) { unsigned RR = FastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Opc, Src, /*TODO: Kill=*/false); - - if (RR != 0) { - ResultReg = RR; - return true; - } else + if (RR == 0) return false; + + ResultReg = RR; + return true; } /// X86SelectAddress - Attempt to fill in an address from the given value. @@ -727,7 +728,7 @@ bool X86FastISel::X86SelectRet(const Instruction *I) { // Don't handle popping bytes on return for now. if (X86MFInfo->getBytesToPopOnReturn() != 0) - return 0; + return false; // fastcc with -tailcallopt is intended to provide a guaranteed // tail call optimization. Fastisel doesn't know how to do that. @@ -738,6 +739,9 @@ bool X86FastISel::X86SelectRet(const Instruction *I) { if (F.isVarArg()) return false; + // Build a list of return value registers. + SmallVector<unsigned, 4> RetRegs; + if (Ret->getNumOperands() > 0) { SmallVector<ISD::OutputArg, 4> Outs; GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI); @@ -805,8 +809,8 @@ bool X86FastISel::X86SelectRet(const Instruction *I) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY), DstReg).addReg(SrcReg); - // Mark the register as live out of the function. - MRI.addLiveOut(VA.getLocReg()); + // Add register to return instruction. + RetRegs.push_back(VA.getLocReg()); } // The x86-64 ABI for returning structs by value requires that we copy @@ -819,11 +823,14 @@ bool X86FastISel::X86SelectRet(const Instruction *I) { "SRetReturnReg should have been set in LowerFormalArguments()!"); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY), X86::RAX).addReg(Reg); - MRI.addLiveOut(X86::RAX); + RetRegs.push_back(X86::RAX); } // Now emit the RET. 
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::RET)); + MachineInstrBuilder MIB = + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::RET)); + for (unsigned i = 0, e = RetRegs.size(); i != e; ++i) + MIB.addReg(RetRegs[i], RegState::Implicit); return true; } @@ -1372,7 +1379,6 @@ bool X86FastISel::TryEmitSmallMemcpy(X86AddressMode DestAM, else if (Len >= 2) VT = MVT::i16; else { - assert(Len == 1); VT = MVT::i8; } @@ -1516,6 +1522,78 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) { } } +bool X86FastISel::FastLowerArguments() { + if (!FuncInfo.CanLowerReturn) + return false; + + const Function *F = FuncInfo.Fn; + if (F->isVarArg()) + return false; + + CallingConv::ID CC = F->getCallingConv(); + if (CC != CallingConv::C) + return false; + + if (!Subtarget->is64Bit()) + return false; + + // Only handle simple cases, i.e. up to 6 i32/i64 scalar arguments. + unsigned Idx = 1; + for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end(); + I != E; ++I, ++Idx) { + if (Idx > 6) + return false; + + if (F->getAttributes().hasAttribute(Idx, Attribute::ByVal) || + F->getAttributes().hasAttribute(Idx, Attribute::InReg) || + F->getAttributes().hasAttribute(Idx, Attribute::StructRet) || + F->getAttributes().hasAttribute(Idx, Attribute::Nest)) + return false; + + Type *ArgTy = I->getType(); + if (ArgTy->isStructTy() || ArgTy->isArrayTy() || ArgTy->isVectorTy()) + return false; + + EVT ArgVT = TLI.getValueType(ArgTy); + if (!ArgVT.isSimple()) return false; + switch (ArgVT.getSimpleVT().SimpleTy) { + case MVT::i32: + case MVT::i64: + break; + default: + return false; + } + } + + static const uint16_t GPR32ArgRegs[] = { + X86::EDI, X86::ESI, X86::EDX, X86::ECX, X86::R8D, X86::R9D + }; + static const uint16_t GPR64ArgRegs[] = { + X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8 , X86::R9 + }; + + Idx = 0; + const TargetRegisterClass *RC32 = TLI.getRegClassFor(MVT::i32); + const TargetRegisterClass *RC64 = TLI.getRegClassFor(MVT::i64); + for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end(); + I != E; ++I, ++Idx) { + if (I->use_empty()) + continue; + bool is32Bit = TLI.getValueType(I->getType()) == MVT::i32; + const TargetRegisterClass *RC = is32Bit ? RC32 : RC64; + unsigned SrcReg = is32Bit ? GPR32ArgRegs[Idx] : GPR64ArgRegs[Idx]; + unsigned DstReg = FuncInfo.MF->addLiveIn(SrcReg, RC); + // FIXME: Unfortunately it's necessary to emit a copy from the livein copy. + // Without this, EmitLiveInCopies may eliminate the livein if its only + // use is a bitcast (which isn't turned into an instruction).
+ unsigned ResultReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY), + ResultReg).addReg(DstReg, getKillRegState(true)); + UpdateValueMap(I, ResultReg); + } + return true; +} + bool X86FastISel::X86SelectCall(const Instruction *I) { const CallInst *CI = cast<CallInst>(I); const Value *Callee = CI->getCalledValue(); diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp index 420aeb8..a05cf5c 100644 --- a/lib/Target/X86/X86FrameLowering.cpp +++ b/lib/Target/X86/X86FrameLowering.cpp @@ -50,13 +50,13 @@ bool X86FrameLowering::hasFP(const MachineFunction &MF) const { return (MF.getTarget().Options.DisableFramePointerElim(MF) || RegInfo->needsStackRealignment(MF) || MFI->hasVarSizedObjects() || - MFI->isFrameAddressTaken() || + MFI->isFrameAddressTaken() || MF.hasMSInlineAsm() || MF.getInfo<X86MachineFunctionInfo>()->getForceFramePointer() || MMI.callsUnwindInit() || MMI.callsEHReturn()); } -static unsigned getSUBriOpcode(unsigned is64Bit, int64_t Imm) { - if (is64Bit) { +static unsigned getSUBriOpcode(unsigned IsLP64, int64_t Imm) { + if (IsLP64) { if (isInt<8>(Imm)) return X86::SUB64ri8; return X86::SUB64ri32; @@ -67,8 +67,8 @@ static unsigned getSUBriOpcode(unsigned is64Bit, int64_t Imm) { } } -static unsigned getADDriOpcode(unsigned is64Bit, int64_t Imm) { - if (is64Bit) { +static unsigned getADDriOpcode(unsigned IsLP64, int64_t Imm) { + if (IsLP64) { if (isInt<8>(Imm)) return X86::ADD64ri8; return X86::ADD64ri32; @@ -79,8 +79,8 @@ static unsigned getADDriOpcode(unsigned is64Bit, int64_t Imm) { } } -static unsigned getLEArOpcode(unsigned is64Bit) { - return is64Bit ? X86::LEA64r : X86::LEA32r; +static unsigned getLEArOpcode(unsigned IsLP64) { + return IsLP64 ? X86::LEA64r : X86::LEA32r; } /// findDeadCallerSavedReg - Return a caller-saved register that isn't live @@ -145,17 +145,17 @@ static unsigned findDeadCallerSavedReg(MachineBasicBlock &MBB, static void emitSPUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, unsigned StackPtr, int64_t NumBytes, - bool Is64Bit, bool UseLEA, + bool Is64Bit, bool IsLP64, bool UseLEA, const TargetInstrInfo &TII, const TargetRegisterInfo &TRI) { bool isSub = NumBytes < 0; uint64_t Offset = isSub ? -NumBytes : NumBytes; unsigned Opc; if (UseLEA) - Opc = getLEArOpcode(Is64Bit); + Opc = getLEArOpcode(IsLP64); else Opc = isSub - ? getSUBriOpcode(Is64Bit, Offset) - : getADDriOpcode(Is64Bit, Offset); + ? getSUBriOpcode(IsLP64, Offset) + : getADDriOpcode(IsLP64, Offset); uint64_t Chunk = (1LL << 31) - 1; DebugLoc DL = MBB.findDebugLoc(MBBI); @@ -660,6 +660,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { uint64_t StackSize = MFI->getStackSize(); // Number of bytes to allocate. bool HasFP = hasFP(MF); bool Is64Bit = STI.is64Bit(); + bool IsLP64 = STI.isTarget64BitLP64(); bool IsWin64 = STI.isTargetWin64(); bool UseLEA = STI.useLeaForSP(); unsigned StackAlign = getStackAlignment(); @@ -711,7 +712,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { if (TailCallReturnAddrDelta < 0) { MachineInstr *MI = BuildMI(MBB, MBBI, DL, - TII.get(getSUBriOpcode(Is64Bit, -TailCallReturnAddrDelta)), + TII.get(getSUBriOpcode(IsLP64, -TailCallReturnAddrDelta)), StackPtr) .addReg(StackPtr) .addImm(-TailCallReturnAddrDelta) @@ -927,7 +928,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { // MSVC x64's __chkstk needs to adjust %rsp. // FIXME: %rax preserves the offset and should be available. 
if (isSPUpdateNeeded) - emitSPUpdate(MBB, MBBI, StackPtr, -(int64_t)NumBytes, Is64Bit, + emitSPUpdate(MBB, MBBI, StackPtr, -(int64_t)NumBytes, Is64Bit, IsLP64, UseLEA, TII, *RegInfo); if (isEAXAlive) { @@ -939,7 +940,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { MBB.insert(MBBI, MI); } } else if (NumBytes) - emitSPUpdate(MBB, MBBI, StackPtr, -(int64_t)NumBytes, Is64Bit, + emitSPUpdate(MBB, MBBI, StackPtr, -(int64_t)NumBytes, Is64Bit, IsLP64, UseLEA, TII, *RegInfo); // If we need a base pointer, set it up here. It's whatever the value @@ -996,6 +997,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, unsigned RetOpcode = MBBI->getOpcode(); DebugLoc DL = MBBI->getDebugLoc(); bool Is64Bit = STI.is64Bit(); + bool IsLP64 = STI.isTarget64BitLP64(); bool UseLEA = STI.useLeaForSP(); unsigned StackAlign = getStackAlignment(); unsigned SlotSize = RegInfo->getSlotSize(); @@ -1081,7 +1083,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, if (RegInfo->needsStackRealignment(MF)) MBBI = FirstCSPop; if (CSSize != 0) { - unsigned Opc = getLEArOpcode(Is64Bit); + unsigned Opc = getLEArOpcode(IsLP64); addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr), FramePtr, false, -CSSize); } else { @@ -1091,7 +1093,8 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, } } else if (NumBytes) { // Adjust stack pointer back: ESP += numbytes. - emitSPUpdate(MBB, MBBI, StackPtr, NumBytes, Is64Bit, UseLEA, TII, *RegInfo); + emitSPUpdate(MBB, MBBI, StackPtr, NumBytes, Is64Bit, IsLP64, UseLEA, + TII, *RegInfo); } // We're returning from function via eh_return. @@ -1126,7 +1129,8 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, if (Offset) { // Check for possible merge with preceding ADD instruction. Offset += mergeSPUpdates(MBB, MBBI, StackPtr, true); - emitSPUpdate(MBB, MBBI, StackPtr, Offset, Is64Bit, UseLEA, TII, *RegInfo); + emitSPUpdate(MBB, MBBI, StackPtr, Offset, Is64Bit, IsLP64, + UseLEA, TII, *RegInfo); } // Jump to label or value in register. @@ -1169,7 +1173,8 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, // Check for possible merge with preceding ADD instruction. delta += mergeSPUpdates(MBB, MBBI, StackPtr, true); - emitSPUpdate(MBB, MBBI, StackPtr, delta, Is64Bit, UseLEA, TII, *RegInfo); + emitSPUpdate(MBB, MBBI, StackPtr, delta, Is64Bit, IsLP64, UseLEA, TII, + *RegInfo); } } @@ -1382,16 +1387,25 @@ HasNestArgument(const MachineFunction *MF) { } -/// GetScratchRegister - Get a register for performing work in the segmented -/// stack prologue. Depending on platform and the properties of the function -/// either one or two registers will be needed. Set primary to true for -/// the first register, false for the second. +/// GetScratchRegister - Get a temp register for performing work in the +/// segmented stack and the Erlang/HiPE stack prologue. Depending on platform +/// and the properties of the function either one or two registers will be +/// needed. Set primary to true for the first register, false for the second. static unsigned GetScratchRegister(bool Is64Bit, const MachineFunction &MF, bool Primary) { + CallingConv::ID CallingConvention = MF.getFunction()->getCallingConv(); + + // Erlang stuff. + if (CallingConvention == CallingConv::HiPE) { + if (Is64Bit) + return Primary ? X86::R14 : X86::R13; + else + return Primary ? X86::EBX : X86::EDI; + } + if (Is64Bit) return Primary ? 
X86::R11 : X86::R12; - CallingConv::ID CallingConvention = MF.getFunction()->getCallingConv(); bool IsNested = HasNestArgument(&MF); if (CallingConvention == CallingConv::X86_FastCall || @@ -1419,7 +1433,6 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const { bool Is64Bit = STI.is64Bit(); unsigned TlsReg, TlsOffset; DebugLoc DL; - const X86Subtarget *ST = &MF.getTarget().getSubtarget<X86Subtarget>(); unsigned ScratchReg = GetScratchRegister(Is64Bit, MF, true); assert(!MF.getRegInfo().isLiveIn(ScratchReg) && @@ -1427,8 +1440,8 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const { if (MF.getFunction()->isVarArg()) report_fatal_error("Segmented stacks do not support vararg functions."); - if (!ST->isTargetLinux() && !ST->isTargetDarwin() && - !ST->isTargetWin32() && !ST->isTargetFreeBSD()) + if (!STI.isTargetLinux() && !STI.isTargetDarwin() && + !STI.isTargetWin32() && !STI.isTargetFreeBSD()) report_fatal_error("Segmented stacks not supported on this platform."); MachineBasicBlock *allocMBB = MF.CreateMachineBasicBlock(); @@ -1466,13 +1479,13 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const { // Read the limit off the current stacklet off the stack_guard location. if (Is64Bit) { - if (ST->isTargetLinux()) { + if (STI.isTargetLinux()) { TlsReg = X86::FS; TlsOffset = 0x70; - } else if (ST->isTargetDarwin()) { + } else if (STI.isTargetDarwin()) { TlsReg = X86::GS; TlsOffset = 0x60 + 90*8; // See pthread_machdep.h. Steal TLS slot 90. - } else if (ST->isTargetFreeBSD()) { + } else if (STI.isTargetFreeBSD()) { TlsReg = X86::FS; TlsOffset = 0x18; } else { @@ -1488,16 +1501,16 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const { BuildMI(checkMBB, DL, TII.get(X86::CMP64rm)).addReg(ScratchReg) .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg); } else { - if (ST->isTargetLinux()) { + if (STI.isTargetLinux()) { TlsReg = X86::GS; TlsOffset = 0x30; - } else if (ST->isTargetDarwin()) { + } else if (STI.isTargetDarwin()) { TlsReg = X86::GS; TlsOffset = 0x48 + 90*4; - } else if (ST->isTargetWin32()) { + } else if (STI.isTargetWin32()) { TlsReg = X86::FS; TlsOffset = 0x14; // pvArbitrary, reserved for application use - } else if (ST->isTargetFreeBSD()) { + } else if (STI.isTargetFreeBSD()) { report_fatal_error("Segmented stacks not supported on FreeBSD i386."); } else { report_fatal_error("Segmented stacks not supported on this platform."); @@ -1509,10 +1522,10 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const { BuildMI(checkMBB, DL, TII.get(X86::LEA32r), ScratchReg).addReg(X86::ESP) .addImm(1).addReg(0).addImm(-StackSize).addReg(0); - if (ST->isTargetLinux() || ST->isTargetWin32()) { + if (STI.isTargetLinux() || STI.isTargetWin32()) { BuildMI(checkMBB, DL, TII.get(X86::CMP32rm)).addReg(ScratchReg) .addReg(0).addImm(0).addReg(0).addImm(TlsOffset).addReg(TlsReg); - } else if (ST->isTargetDarwin()) { + } else if (STI.isTargetDarwin()) { // TlsOffset doesn't fit into a mod r/m byte so we need an extra register unsigned ScratchReg2; @@ -1598,3 +1611,229 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const { MF.verify(); #endif } + +// Erlang programs may need a special prologue to handle the stack size they +// might need at runtime. That is because Erlang/OTP does not implement a C +// stack but uses a custom implementation of hybrid stack/heap +// architecture. (for more information see Eric Stenman's Ph.D. 
thesis:
+// http://publications.uu.se/uu/fulltext/nbn_se_uu_diva-2688.pdf)
+//
+//
+// CheckStack:
+//       temp0 = sp - MaxStack
+//       if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart
+// OldStart:
+//       ...
+// IncStack:
+//       call inc_stack   # doubles the stack space
+//       temp0 = sp - MaxStack
+//       if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart
+void X86FrameLowering::adjustForHiPEPrologue(MachineFunction &MF) const {
+  const X86InstrInfo &TII = *TM.getInstrInfo();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  const unsigned SlotSize = TM.getRegisterInfo()->getSlotSize();
+  const bool Is64Bit = STI.is64Bit();
+  DebugLoc DL;
+  // HiPE-specific values
+  const unsigned HipeLeafWords = 24;
+  const unsigned CCRegisteredArgs = Is64Bit ? 6 : 5;
+  const unsigned Guaranteed = HipeLeafWords * SlotSize;
+  unsigned CallerStkArity = MF.getFunction()->arg_size() > CCRegisteredArgs ?
+    MF.getFunction()->arg_size() - CCRegisteredArgs : 0;
+  unsigned MaxStack = MFI->getStackSize() + CallerStkArity*SlotSize + SlotSize;
+
+  assert(STI.isTargetLinux() &&
+         "HiPE prologue is only supported on Linux operating systems.");
+
+  // Compute the largest caller's frame that is needed to fit the callees'
+  // frames. This 'MaxStack' is computed from:
+  //
+  // a) the fixed frame size, which is the space needed for all spilled temps,
+  // b) outgoing on-stack parameter areas, and
+  // c) the minimum stack space this function needs to make available for the
+  //    functions it calls (a tunable ABI property).
+  if (MFI->hasCalls()) {
+    unsigned MoreStackForCalls = 0;
+
+    for (MachineFunction::iterator MBBI = MF.begin(), MBBE = MF.end();
+         MBBI != MBBE; ++MBBI)
+      for (MachineBasicBlock::iterator MI = MBBI->begin(), ME = MBBI->end();
+           MI != ME; ++MI) {
+        if (!MI->isCall())
+          continue;
+
+        // Get callee operand.
+        const MachineOperand &MO = MI->getOperand(0);
+
+        // Only take into account global function calls (no closures etc.).
+        if (!MO.isGlobal())
+          continue;
+
+        const Function *F = dyn_cast<Function>(MO.getGlobal());
+        if (!F)
+          continue;
+
+        // Do not update 'MaxStack' for primitive and built-in functions
+        // (encoded with names either starting with "erlang."/"bif_" or not
+        // having a ".", such as a simple <Module>.<Function>.<Arity>, or an
+        // "_", such as the BIF "suspend_0") as they are executed on another
+        // stack.
+        if (F->getName().find("erlang.") != StringRef::npos ||
+            F->getName().find("bif_") != StringRef::npos ||
+            F->getName().find_first_of("._") == StringRef::npos)
+          continue;
+
+        unsigned CalleeStkArity =
+          F->arg_size() > CCRegisteredArgs ? F->arg_size()-CCRegisteredArgs : 0;
+        if (HipeLeafWords - 1 > CalleeStkArity)
+          MoreStackForCalls = std::max(MoreStackForCalls,
+                               (HipeLeafWords - 1 - CalleeStkArity) * SlotSize);
+      }
+    MaxStack += MoreStackForCalls;
+  }
+
+  // If the stack frame needed is larger than the guaranteed amount, then
+  // runtime checks and calls to the "inc_stack_0" BIF should be inserted in
+  // the assembly prologue.
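  // A worked example of the computation above (illustrative numbers only):
  // on x86-64, SlotSize is 8 and HipeLeafWords is 24, so Guaranteed =
  // 24 * 8 = 192 bytes. A function with a 160-byte frame, no on-stack
  // arguments (CallerStkArity = 0), and one call to a global function that
  // also takes no on-stack arguments (CalleeStkArity = 0) gets:
  //   MaxStack          = 160 + 0*8 + 8    = 168
  //   MoreStackForCalls = (24 - 1 - 0) * 8 = 184
  //   MaxStack          = 168 + 184 = 352 > 192 = Guaranteed
  // so the stack check and the call to inc_stack_0 below are emitted.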
+  if (MaxStack > Guaranteed) {
+    MachineBasicBlock &prologueMBB = MF.front();
+    MachineBasicBlock *stackCheckMBB = MF.CreateMachineBasicBlock();
+    MachineBasicBlock *incStackMBB = MF.CreateMachineBasicBlock();
+
+    for (MachineBasicBlock::livein_iterator I = prologueMBB.livein_begin(),
+         E = prologueMBB.livein_end(); I != E; I++) {
+      stackCheckMBB->addLiveIn(*I);
+      incStackMBB->addLiveIn(*I);
+    }
+
+    MF.push_front(incStackMBB);
+    MF.push_front(stackCheckMBB);
+
+    unsigned ScratchReg, SPReg, PReg, SPLimitOffset;
+    unsigned LEAop, CMPop, CALLop;
+    if (Is64Bit) {
+      SPReg = X86::RSP;
+      PReg = X86::RBP;
+      LEAop = X86::LEA64r;
+      CMPop = X86::CMP64rm;
+      CALLop = X86::CALL64pcrel32;
+      SPLimitOffset = 0x90;
+    } else {
+      SPReg = X86::ESP;
+      PReg = X86::EBP;
+      LEAop = X86::LEA32r;
+      CMPop = X86::CMP32rm;
+      CALLop = X86::CALLpcrel32;
+      SPLimitOffset = 0x4c;
+    }
+
+    ScratchReg = GetScratchRegister(Is64Bit, MF, true);
+    assert(!MF.getRegInfo().isLiveIn(ScratchReg) &&
+           "HiPE prologue scratch register is live-in");
+
+    // Create new MBB for StackCheck:
+    addRegOffset(BuildMI(stackCheckMBB, DL, TII.get(LEAop), ScratchReg),
+                 SPReg, false, -MaxStack);
+    // SPLimitOffset is in a fixed heap location (pointed by BP).
+    addRegOffset(BuildMI(stackCheckMBB, DL, TII.get(CMPop))
+                 .addReg(ScratchReg), PReg, false, SPLimitOffset);
+    BuildMI(stackCheckMBB, DL, TII.get(X86::JAE_4)).addMBB(&prologueMBB);
+
+    // Create new MBB for IncStack:
+    BuildMI(incStackMBB, DL, TII.get(CALLop)).
+      addExternalSymbol("inc_stack_0");
+    addRegOffset(BuildMI(incStackMBB, DL, TII.get(LEAop), ScratchReg),
+                 SPReg, false, -MaxStack);
+    addRegOffset(BuildMI(incStackMBB, DL, TII.get(CMPop))
+                 .addReg(ScratchReg), PReg, false, SPLimitOffset);
+    BuildMI(incStackMBB, DL, TII.get(X86::JLE_4)).addMBB(incStackMBB);
+
+    stackCheckMBB->addSuccessor(&prologueMBB, 99);
+    stackCheckMBB->addSuccessor(incStackMBB, 1);
+    incStackMBB->addSuccessor(&prologueMBB, 99);
+    incStackMBB->addSuccessor(incStackMBB, 1);
+  }
+#ifdef XDEBUG
+  MF.verify();
+#endif
+}
+
+void X86FrameLowering::
+eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator I) const {
+  const X86InstrInfo &TII = *TM.getInstrInfo();
+  const X86RegisterInfo &RegInfo = *TM.getRegisterInfo();
+  unsigned StackPtr = RegInfo.getStackRegister();
+  bool reserveCallFrame = hasReservedCallFrame(MF);
+  int Opcode = I->getOpcode();
+  bool isDestroy = Opcode == TII.getCallFrameDestroyOpcode();
+  bool IsLP64 = STI.isTarget64BitLP64();
+  DebugLoc DL = I->getDebugLoc();
+  uint64_t Amount = !reserveCallFrame ? I->getOperand(0).getImm() : 0;
+  uint64_t CalleeAmt = isDestroy ? I->getOperand(1).getImm() : 0;
+  I = MBB.erase(I);
+
+  if (!reserveCallFrame) {
+    // If the stack pointer can be changed after prologue, turn the
+    // adjcallstackdown instruction into a 'sub ESP, <amt>' and the
+    // adjcallstackup instruction into 'add ESP, <amt>'
+    // TODO: consider using push / pop instead of sub + store / add
+    if (Amount == 0)
+      return;
+
+    // We need to keep the stack aligned properly. To do this, we round the
+    // amount of space needed for the outgoing arguments up to the next
+    // alignment boundary.
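    // For example (illustrative values): with StackAlign = 16 and an
    // outgoing-argument area of Amount = 20 bytes, the rounding below gives
    //   (20 + 16 - 1) / 16 * 16  =  35 / 16 * 16  =  2 * 16  =  32,
    // i.e. the adjustment is rounded up to the next 16-byte boundary using
    // only integer division.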
+ unsigned StackAlign = TM.getFrameLowering()->getStackAlignment(); + Amount = (Amount + StackAlign - 1) / StackAlign * StackAlign; + + MachineInstr *New = 0; + if (Opcode == TII.getCallFrameSetupOpcode()) { + New = BuildMI(MF, DL, TII.get(getSUBriOpcode(IsLP64, Amount)), + StackPtr) + .addReg(StackPtr) + .addImm(Amount); + } else { + assert(Opcode == TII.getCallFrameDestroyOpcode()); + + // Factor out the amount the callee already popped. + Amount -= CalleeAmt; + + if (Amount) { + unsigned Opc = getADDriOpcode(IsLP64, Amount); + New = BuildMI(MF, DL, TII.get(Opc), StackPtr) + .addReg(StackPtr).addImm(Amount); + } + } + + if (New) { + // The EFLAGS implicit def is dead. + New->getOperand(3).setIsDead(); + + // Replace the pseudo instruction with a new instruction. + MBB.insert(I, New); + } + + return; + } + + if (Opcode == TII.getCallFrameDestroyOpcode() && CalleeAmt) { + // If we are performing frame pointer elimination and if the callee pops + // something off the stack pointer, add it back. We do this until we have + // more advanced stack pointer tracking ability. + unsigned Opc = getSUBriOpcode(IsLP64, CalleeAmt); + MachineInstr *New = BuildMI(MF, DL, TII.get(Opc), StackPtr) + .addReg(StackPtr).addImm(CalleeAmt); + + // The EFLAGS implicit def is dead. + New->getOperand(3).setIsDead(); + + // We are not tracking the stack pointer adjustment by the callee, so make + // sure we restore the stack pointer immediately after the call, there may + // be spill code inserted between the CALL and ADJCALLSTACKUP instructions. + MachineBasicBlock::iterator B = MBB.begin(); + while (I != B && !llvm::prior(I)->isCall()) + --I; + MBB.insert(I, New); + } +} + diff --git a/lib/Target/X86/X86FrameLowering.h b/lib/Target/X86/X86FrameLowering.h index dc515dc..3f08b9a 100644 --- a/lib/Target/X86/X86FrameLowering.h +++ b/lib/Target/X86/X86FrameLowering.h @@ -43,6 +43,8 @@ public: void adjustForSegmentedStacks(MachineFunction &MF) const; + void adjustForHiPEPrologue(MachineFunction &MF) const; + void processFunctionBeforeCalleeSavedScan(MachineFunction &MF, RegScavenger *RS = NULL) const; @@ -63,6 +65,10 @@ public: int getFrameIndexReference(const MachineFunction &MF, int FI, unsigned &FrameReg) const; uint32_t getCompactUnwindEncoding(MachineFunction &MF) const; + + void eliminateCallFramePseudoInstr(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI) const; }; } // End llvm namespace diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp index 935f9bd..00fbe69 100644 --- a/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -280,13 +280,13 @@ namespace { /// getTargetMachine - Return a reference to the TargetMachine, casted /// to the target-specific type. - const X86TargetMachine &getTargetMachine() { + const X86TargetMachine &getTargetMachine() const { return static_cast<const X86TargetMachine &>(TM); } /// getInstrInfo - Return a reference to the TargetInstrInfo, casted /// to the target-specific type. - const X86InstrInfo *getInstrInfo() { + const X86InstrInfo *getInstrInfo() const { return getTargetMachine().getInstrInfo(); } }; @@ -446,7 +446,7 @@ void X86DAGToDAGISel::PreprocessISelDAG() { if (OptLevel != CodeGenOpt::None && (N->getOpcode() == X86ISD::CALL || (N->getOpcode() == X86ISD::TC_RETURN && - // Only does this if load can be foled into TC_RETURN. + // Only does this if load can be folded into TC_RETURN. 
(Subtarget->is64Bit() || getTargetMachine().getRelocationModel() != Reloc::PIC_)))) { /// Also try moving call address load from outside callseq_start to just diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 4ab92ad..1c3b9ae 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -605,10 +605,12 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::FGETSIGN, MVT::i32, Custom); // We don't support sin/cos/fmod - setOperationAction(ISD::FSIN , MVT::f64, Expand); - setOperationAction(ISD::FCOS , MVT::f64, Expand); - setOperationAction(ISD::FSIN , MVT::f32, Expand); - setOperationAction(ISD::FCOS , MVT::f32, Expand); + setOperationAction(ISD::FSIN , MVT::f64, Expand); + setOperationAction(ISD::FCOS , MVT::f64, Expand); + setOperationAction(ISD::FSINCOS, MVT::f64, Expand); + setOperationAction(ISD::FSIN , MVT::f32, Expand); + setOperationAction(ISD::FCOS , MVT::f32, Expand); + setOperationAction(ISD::FSINCOS, MVT::f32, Expand); // Expand FP immediates into loads from the stack, except for the special // cases we handle. @@ -633,8 +635,9 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); // We don't support sin/cos/fmod - setOperationAction(ISD::FSIN , MVT::f32, Expand); - setOperationAction(ISD::FCOS , MVT::f32, Expand); + setOperationAction(ISD::FSIN , MVT::f32, Expand); + setOperationAction(ISD::FCOS , MVT::f32, Expand); + setOperationAction(ISD::FSINCOS, MVT::f32, Expand); // Special cases we handle for FP constants. addLegalFPImmediate(APFloat(+0.0f)); // xorps @@ -644,8 +647,9 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS if (!TM.Options.UnsafeFPMath) { - setOperationAction(ISD::FSIN , MVT::f64 , Expand); - setOperationAction(ISD::FCOS , MVT::f64 , Expand); + setOperationAction(ISD::FSIN , MVT::f64, Expand); + setOperationAction(ISD::FCOS , MVT::f64, Expand); + setOperationAction(ISD::FSINCOS, MVT::f64, Expand); } } else if (!TM.Options.UseSoftFloat) { // f32 and f64 in x87. 
@@ -659,10 +663,12 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
 
     if (!TM.Options.UnsafeFPMath) {
-      setOperationAction(ISD::FSIN , MVT::f32 , Expand);
-      setOperationAction(ISD::FSIN , MVT::f64 , Expand);
-      setOperationAction(ISD::FCOS , MVT::f32 , Expand);
-      setOperationAction(ISD::FCOS , MVT::f64 , Expand);
+      setOperationAction(ISD::FSIN   , MVT::f64, Expand);
+      setOperationAction(ISD::FSIN   , MVT::f32, Expand);
+      setOperationAction(ISD::FCOS   , MVT::f64, Expand);
+      setOperationAction(ISD::FCOS   , MVT::f32, Expand);
+      setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
+      setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
     }
     addLegalFPImmediate(APFloat(+0.0)); // FLD0
     addLegalFPImmediate(APFloat(+1.0)); // FLD1
@@ -699,8 +705,9 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
   }
 
   if (!TM.Options.UnsafeFPMath) {
-    setOperationAction(ISD::FSIN , MVT::f80 , Expand);
-    setOperationAction(ISD::FCOS , MVT::f80 , Expand);
+    setOperationAction(ISD::FSIN   , MVT::f80, Expand);
+    setOperationAction(ISD::FCOS   , MVT::f80, Expand);
+    setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
   }
 
   setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
@@ -748,7 +755,9 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
     setOperationAction(ISD::INSERT_SUBVECTOR, VT, Expand);
     setOperationAction(ISD::FABS, VT, Expand);
     setOperationAction(ISD::FSIN, VT, Expand);
+    setOperationAction(ISD::FSINCOS, VT, Expand);
     setOperationAction(ISD::FCOS, VT, Expand);
+    setOperationAction(ISD::FSINCOS, VT, Expand);
     setOperationAction(ISD::FREM, VT, Expand);
     setOperationAction(ISD::FMA, VT, Expand);
     setOperationAction(ISD::FPOWI, VT, Expand);
@@ -1047,6 +1056,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
       setOperationAction(ISD::SRA, MVT::v4i32, Custom);
     }
+    setOperationAction(ISD::SDIV, MVT::v8i16, Custom);
+    setOperationAction(ISD::SDIV, MVT::v4i32, Custom);
   }
 
   if (!TM.Options.UseSoftFloat && Subtarget->hasFp256()) {
@@ -1111,6 +1122,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
     setOperationAction(ISD::SRA, MVT::v16i16, Custom);
     setOperationAction(ISD::SRA, MVT::v32i8, Custom);
 
+    setOperationAction(ISD::SDIV, MVT::v16i16, Custom);
+
     setOperationAction(ISD::SETCC, MVT::v32i8, Custom);
     setOperationAction(ISD::SETCC, MVT::v16i16, Custom);
     setOperationAction(ISD::SETCC, MVT::v8i32, Custom);
@@ -1166,6 +1179,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
       setOperationAction(ISD::SHL, MVT::v8i32, Legal);
       setOperationAction(ISD::SRA, MVT::v8i32, Legal);
+
+      setOperationAction(ISD::SDIV, MVT::v8i32, Custom);
     } else {
       setOperationAction(ISD::ADD, MVT::v4i64, Custom);
       setOperationAction(ISD::ADD, MVT::v8i32, Custom);
@@ -1275,6 +1290,19 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
     setLibcallName(RTLIB::SRA_I128, 0);
   }
 
+  // Combine sin / cos into one node or libcall if possible.
+  if (Subtarget->hasSinCos()) {
+    setLibcallName(RTLIB::SINCOS_F32, "sincosf");
+    setLibcallName(RTLIB::SINCOS_F64, "sincos");
+    if (Subtarget->isTargetDarwin()) {
+      // For MacOSX, we don't want the normal expansion of a libcall to
+      // sincos. We want to issue a libcall to __sincos_stret to avoid memory
+      // traffic.
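      // Sketch of the intended effect (hypothetical IR, for illustration):
      //   %s = call double @sin(double %x)
      //   %c = call double @cos(double %x)
      // can be combined into a single ISD::FSINCOS node and emitted as one
      // __sincos_stret call that returns both results in registers instead
      // of through sincos' two output pointers in memory.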
+      setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
+      setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
+    }
+  }
+
   // We have target-specific dag combine patterns for the following nodes:
   setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
@@ -1295,6 +1323,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
   setTargetDAGCombine(ISD::ZERO_EXTEND);
   setTargetDAGCombine(ISD::ANY_EXTEND);
   setTargetDAGCombine(ISD::SIGN_EXTEND);
+  setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
   setTargetDAGCombine(ISD::TRUNCATE);
   setTargetDAGCombine(ISD::SINT_TO_FP);
   setTargetDAGCombine(ISD::SETCC);
@@ -1306,17 +1335,17 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
 
   // On Darwin, -Os means optimize for size without hurting performance,
   // so do not reduce the limit.
-  maxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
-  maxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 16 : 8;
-  maxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
-  maxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
-  maxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
-  maxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
+  MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
+  MaxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 16 : 8;
+  MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
+  MaxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
+  MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
+  MaxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
   setPrefLoopAlignment(4); // 2^4 bytes.
-  benefitFromCodePlacementOpt = true;
+  BenefitFromCodePlacementOpt = true;
 
   // Predictable cmovs don't hurt on Atom because it's in-order.
-  predictableSelectIsExpensive = !Subtarget->isAtom();
+  PredictableSelectIsExpensive = !Subtarget->isAtom();
 
   setPrefFunctionAlignment(4); // 2^4 bytes.
 }
@@ -1562,14 +1591,7 @@ X86TargetLowering::LowerReturn(SDValue Chain,
                  RVLocs, *DAG.getContext());
   CCInfo.AnalyzeReturn(Outs, RetCC_X86);
 
-  // Add the regs to the liveout set for the function.
-  MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
-  for (unsigned i = 0; i != RVLocs.size(); ++i)
-    if (RVLocs[i].isRegLoc() && !MRI.isLiveOut(RVLocs[i].getLocReg()))
-      MRI.addLiveOut(RVLocs[i].getLocReg());
   SDValue Flag;
-
   SmallVector<SDValue, 6> RetOps;
   RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
   // Operand #1 = Bytes To Pop
@@ -1638,12 +1660,13 @@ X86TargetLowering::LowerReturn(SDValue Chain,
     Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag);
     Flag = Chain.getValue(1);
+    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
   }
 
-  // The x86-64 ABI for returning structs by value requires that we copy
-  // the sret argument into %rax for the return. We saved the argument into
-  // a virtual register in the entry block, so now we copy the value out
-  // and into %rax.
+  // The x86-64 ABIs require that for returning structs by value we copy
+  // the sret argument into %rax/%eax (depending on ABI) for the return.
+  // We saved the argument into a virtual register in the entry block,
+  // so now we copy the value out and into %rax/%eax.
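  // For illustration, C code such as
  //   struct S { int v[8]; };
  //   struct S make(void);   // lowered with a hidden sret pointer argument
  // must leave that hidden pointer in %rax on return (or %eax on a 64-bit
  // target with 32-bit pointers), which is what the code below arranges.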
if (Subtarget->is64Bit() && DAG.getMachineFunction().getFunction()->hasStructRetAttr()) { MachineFunction &MF = DAG.getMachineFunction(); @@ -1653,11 +1676,12 @@ X86TargetLowering::LowerReturn(SDValue Chain, "SRetReturnReg should have been set in LowerFormalArguments()."); SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy()); - Chain = DAG.getCopyToReg(Chain, dl, X86::RAX, Val, Flag); + unsigned RetValReg = Subtarget->isTarget64BitILP32() ? X86::EAX : X86::RAX; + Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag); Flag = Chain.getValue(1); - // RAX now acts like a return value. - MRI.addLiveOut(X86::RAX); + // RAX/EAX now acts like a return value. + RetOps.push_back(DAG.getRegister(RetValReg, MVT::i64)); } RetOps[0] = Chain; // Update chain. @@ -2009,14 +2033,16 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, InVals.push_back(ArgValue); } - // The x86-64 ABI for returning structs by value requires that we copy - // the sret argument into %rax for the return. Save the argument into - // a virtual register so that we can access it from the return points. + // The x86-64 ABIs require that for returning structs by value we copy + // the sret argument into %rax/%eax (depending on ABI) for the return. + // Save the argument into a virtual register so that we can access it + // from the return points. if (Is64Bit && MF.getFunction()->hasStructRetAttr()) { X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); unsigned Reg = FuncInfo->getSRetReturnReg(); if (!Reg) { - Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64)); + MVT PtrTy = getPointerTy(); + Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy)); FuncInfo->setSRetReturnReg(Reg); } SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]); @@ -2630,8 +2656,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // This isn't right, although it's probably harmless on x86; liveouts // should be computed from returns not tail calls. Consider a void // function making a tail call to a function returning int. - return DAG.getNode(X86ISD::TC_RETURN, dl, - NodeTys, &Ops[0], Ops.size()); + return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, &Ops[0], Ops.size()); } Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, &Ops[0], Ops.size()); @@ -2789,7 +2814,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, const SmallVectorImpl<ISD::OutputArg> &Outs, const SmallVectorImpl<SDValue> &OutVals, const SmallVectorImpl<ISD::InputArg> &Ins, - SelectionDAG& DAG) const { + SelectionDAG &DAG) const { if (!IsTailCallConvention(CalleeCC) && CalleeCC != CallingConv::C) return false; @@ -2828,7 +2853,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, // An stdcall caller is expected to clean up its arguments; the callee // isn't going to do that. - if (!CCMatch && CallerCC==CallingConv::X86_StdCall) + if (!CCMatch && CallerCC == CallingConv::X86_StdCall) return false; // Do not sibcall optimize vararg calls unless all arguments are passed via @@ -2948,9 +2973,15 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, // callee-saved registers are restored. These happen to be the same // registers used to pass 'inreg' arguments so watch out for those. 
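  // Rough intuition for the check below (illustrative): a 32-bit PIC call
  // needs an extra register to materialize the callee's address, so of the
  // call-clobbered EAX/ECX/EDX only two remain for 'inreg' arguments; with a
  // direct non-PIC call all three stay available.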
if (!Subtarget->is64Bit() && - !isa<GlobalAddressSDNode>(Callee) && - !isa<ExternalSymbolSDNode>(Callee)) { + ((!isa<GlobalAddressSDNode>(Callee) && + !isa<ExternalSymbolSDNode>(Callee)) || + getTargetMachine().getRelocationModel() == Reloc::PIC_)) { unsigned NumInRegs = 0; + // In PIC we need an extra register to formulate the address computation + // for the callee. + unsigned MaxInRegs = + (getTargetMachine().getRelocationModel() == Reloc::PIC_) ? 2 : 3; + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; if (!VA.isRegLoc()) @@ -2959,7 +2990,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, switch (Reg) { default: break; case X86::EAX: case X86::EDX: case X86::ECX: - if (++NumInRegs == 3) + if (++NumInRegs == MaxInRegs) return false; break; } @@ -2995,7 +3026,7 @@ static bool isTargetShuffle(unsigned Opcode) { case X86ISD::PSHUFHW: case X86ISD::PSHUFLW: case X86ISD::SHUFP: - case X86ISD::PALIGN: + case X86ISD::PALIGNR: case X86ISD::MOVLHPS: case X86ISD::MOVLHPD: case X86ISD::MOVHLPS: @@ -3045,7 +3076,7 @@ static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, SelectionDAG &DAG) { switch(Opc) { default: llvm_unreachable("Unknown x86 shuffle node"); - case X86ISD::PALIGN: + case X86ISD::PALIGNR: case X86ISD::SHUFP: case X86ISD::VPERM2X128: return DAG.getNode(Opc, dl, VT, V1, V2, @@ -3355,8 +3386,8 @@ static bool isPSHUFLWMask(ArrayRef<int> Mask, EVT VT, bool HasInt256) { /// is suitable for input to PALIGNR. static bool isPALIGNRMask(ArrayRef<int> Mask, EVT VT, const X86Subtarget *Subtarget) { - if ((VT.getSizeInBits() == 128 && !Subtarget->hasSSSE3()) || - (VT.getSizeInBits() == 256 && !Subtarget->hasInt256())) + if ((VT.is128BitVector() && !Subtarget->hasSSSE3()) || + (VT.is256BitVector() && !Subtarget->hasInt256())) return false; unsigned NumElts = VT.getVectorNumElements(); @@ -3445,7 +3476,7 @@ static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask, /// reverse of what x86 shuffles want. static bool isSHUFPMask(ArrayRef<int> Mask, EVT VT, bool HasFp256, bool Commuted = false) { - if (!HasFp256 && VT.getSizeInBits() == 256) + if (!HasFp256 && VT.is256BitVector()) return false; unsigned NumElems = VT.getVectorNumElements(); @@ -3580,7 +3611,7 @@ static bool isMOVLHPSMask(ArrayRef<int> Mask, EVT VT) { static SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { - EVT VT = SVOp->getValueType(0); + MVT VT = SVOp->getValueType(0).getSimpleVT(); DebugLoc dl = SVOp->getDebugLoc(); if (VT != MVT::v8i32 && VT != MVT::v8f32) @@ -3630,7 +3661,7 @@ static bool isUNPCKLMask(ArrayRef<int> Mask, EVT VT, assert((VT.is128BitVector() || VT.is256BitVector()) && "Unsupported vector type for unpckh"); - if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8 && + if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 && (!HasInt256 || (NumElts != 16 && NumElts != 32))) return false; @@ -3669,7 +3700,7 @@ static bool isUNPCKHMask(ArrayRef<int> Mask, EVT VT, assert((VT.is128BitVector() || VT.is256BitVector()) && "Unsupported vector type for unpckh"); - if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8 && + if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 && (!HasInt256 || (NumElts != 16 && NumElts != 32))) return false; @@ -3700,14 +3731,14 @@ static bool isUNPCKHMask(ArrayRef<int> Mask, EVT VT, /// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form /// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. 
vector_shuffle v, undef,
/// <0, 0, 1, 1>
-static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, EVT VT,
-                                  bool HasInt256) {
+static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, EVT VT, bool HasInt256) {
   unsigned NumElts = VT.getVectorNumElements();
+  bool Is256BitVec = VT.is256BitVector();
 
   assert((VT.is128BitVector() || VT.is256BitVector()) &&
          "Unsupported vector type for unpckh");
 
-  if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8 &&
+  if (Is256BitVec && NumElts != 4 && NumElts != 8 &&
       (!HasInt256 || (NumElts != 16 && NumElts != 32)))
     return false;
 
@@ -3715,7 +3746,7 @@ static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, EVT VT,
   // FIXME: Need a better way to get rid of this, there's no latency difference
   // between UNPCKLPD and MOVDDUP, the latter should always be checked first and
   // the former later. We should also remove the "_undef" special mask.
-  if (NumElts == 4 && VT.getSizeInBits() == 256)
+  if (NumElts == 4 && Is256BitVec)
     return false;
 
   // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
@@ -3749,7 +3780,7 @@ static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, EVT VT, bool HasInt256) {
   assert((VT.is128BitVector() || VT.is256BitVector()) &&
          "Unsupported vector type for unpckh");
 
-  if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8 &&
+  if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
       (!HasInt256 || (NumElts != 16 && NumElts != 32)))
     return false;
 
@@ -3831,7 +3862,7 @@ static bool isVPERM2X128Mask(ArrayRef<int> Mask, EVT VT, bool HasFp256) {
 /// getShuffleVPERM2X128Immediate - Return the appropriate immediate to shuffle
 /// the specified VECTOR_MASK mask with VPERM2F128/VPERM2I128 instructions.
 static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) {
-  EVT VT = SVOp->getValueType(0);
+  MVT VT = SVOp->getValueType(0).getSimpleVT();
 
   unsigned HalfSize = VT.getVectorNumElements()/2;
 
@@ -3865,7 +3896,7 @@ static bool isVPERMILPMask(ArrayRef<int> Mask, EVT VT, bool HasFp256) {
   unsigned NumElts = VT.getVectorNumElements();
   // Only match 256-bit with 32/64-bit types
-  if (VT.getSizeInBits() != 256 || (NumElts != 4 && NumElts != 8))
+  if (!VT.is256BitVector() || (NumElts != 4 && NumElts != 8))
     return false;
 
   unsigned NumLanes = VT.getSizeInBits()/128;
@@ -3921,8 +3952,8 @@ static bool isMOVSHDUPMask(ArrayRef<int> Mask, EVT VT,
   unsigned NumElems = VT.getVectorNumElements();
 
-  if ((VT.getSizeInBits() == 128 && NumElems != 4) ||
-      (VT.getSizeInBits() == 256 && NumElems != 8))
+  if ((VT.is128BitVector() && NumElems != 4) ||
+      (VT.is256BitVector() && NumElems != 8))
     return false;
 
   // "i+1" is the value the indexed mask element must have
@@ -3944,8 +3975,8 @@ static bool isMOVSLDUPMask(ArrayRef<int> Mask, EVT VT,
   unsigned NumElems = VT.getVectorNumElements();
 
-  if ((VT.getSizeInBits() == 128 && NumElems != 4) ||
-      (VT.getSizeInBits() == 256 && NumElems != 8))
+  if ((VT.is128BitVector() && NumElems != 4) ||
+      (VT.is256BitVector() && NumElems != 8))
     return false;
 
   // "i" is the value the indexed mask element must have
@@ -4005,9 +4036,8 @@ bool X86::isVEXTRACTF128Index(SDNode *N) {
   uint64_t Index =
     cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
 
-  unsigned VL = N->getValueType(0).getVectorNumElements();
-  unsigned VBits = N->getValueType(0).getSizeInBits();
-  unsigned ElSize = VBits / VL;
+  MVT VT = N->getValueType(0).getSimpleVT();
+  unsigned ElSize = VT.getVectorElementType().getSizeInBits();
   bool Result = (Index * ElSize) % 128 == 0;
 
   return Result;
@@ -4024,9 +4054,8 @@ bool
X86::isVINSERTF128Index(SDNode *N) { uint64_t Index = cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue(); - unsigned VL = N->getValueType(0).getVectorNumElements(); - unsigned VBits = N->getValueType(0).getSizeInBits(); - unsigned ElSize = VBits / VL; + MVT VT = N->getValueType(0).getSimpleVT(); + unsigned ElSize = VT.getVectorElementType().getSizeInBits(); bool Result = (Index * ElSize) % 128 == 0; return Result; @@ -4036,7 +4065,7 @@ bool X86::isVINSERTF128Index(SDNode *N) { /// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions. /// Handles 128-bit and 256-bit. static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) { - EVT VT = N->getValueType(0); + MVT VT = N->getValueType(0).getSimpleVT(); assert((VT.is128BitVector() || VT.is256BitVector()) && "Unsupported vector type for PSHUF/SHUFP"); @@ -4066,7 +4095,7 @@ static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) { /// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle /// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction. static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N) { - EVT VT = N->getValueType(0); + MVT VT = N->getValueType(0).getSimpleVT(); assert((VT == MVT::v8i16 || VT == MVT::v16i16) && "Unsupported vector type for PSHUFHW"); @@ -4090,7 +4119,7 @@ static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N) { /// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle /// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction. static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) { - EVT VT = N->getValueType(0); + MVT VT = N->getValueType(0).getSimpleVT(); assert((VT == MVT::v8i16 || VT == MVT::v16i16) && "Unsupported vector type for PSHUFHW"); @@ -4114,7 +4143,7 @@ static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) { /// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle /// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction. static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) { - EVT VT = SVOp->getValueType(0); + MVT VT = SVOp->getValueType(0).getSimpleVT(); unsigned EltSize = VT.getVectorElementType().getSizeInBits() >> 3; unsigned NumElts = VT.getVectorNumElements(); @@ -4145,8 +4174,8 @@ unsigned X86::getExtractVEXTRACTF128Immediate(SDNode *N) { uint64_t Index = cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue(); - EVT VecVT = N->getOperand(0).getValueType(); - EVT ElVT = VecVT.getVectorElementType(); + MVT VecVT = N->getOperand(0).getValueType().getSimpleVT(); + MVT ElVT = VecVT.getVectorElementType(); unsigned NumElemsPerChunk = 128 / ElVT.getSizeInBits(); return Index / NumElemsPerChunk; @@ -4162,8 +4191,8 @@ unsigned X86::getInsertVINSERTF128Immediate(SDNode *N) { uint64_t Index = cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue(); - EVT VecVT = N->getValueType(0); - EVT ElVT = VecVT.getVectorElementType(); + MVT VecVT = N->getValueType(0).getSimpleVT(); + MVT ElVT = VecVT.getVectorElementType(); unsigned NumElemsPerChunk = 128 / ElVT.getSizeInBits(); return Index / NumElemsPerChunk; @@ -4173,7 +4202,7 @@ unsigned X86::getInsertVINSERTF128Immediate(SDNode *N) { /// the specified VECTOR_SHUFFLE mask with VPERMQ and VPERMPD instructions. /// Handles 256-bit. 
static unsigned getShuffleCLImmediate(ShuffleVectorSDNode *N) {
-  EVT VT = N->getValueType(0);
+  MVT VT = N->getValueType(0).getSimpleVT();
 
   unsigned NumElts = VT.getVectorNumElements();
 
@@ -4193,17 +4222,18 @@ static unsigned getShuffleCLImmediate(ShuffleVectorSDNode *N) {
 /// isZeroNode - Returns true if Elt is a constant zero or a floating point
 /// constant +0.0.
 bool X86::isZeroNode(SDValue Elt) {
-  return ((isa<ConstantSDNode>(Elt) &&
-           cast<ConstantSDNode>(Elt)->isNullValue()) ||
-          (isa<ConstantFPSDNode>(Elt) &&
-           cast<ConstantFPSDNode>(Elt)->getValueAPF().isPosZero()));
+  if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Elt))
+    return CN->isNullValue();
+  if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Elt))
+    return CFP->getValueAPF().isPosZero();
+  return false;
 }
 
 /// CommuteVectorShuffle - Swap vector_shuffle operands as well as values in
 /// their permute mask.
 static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp,
                                     SelectionDAG &DAG) {
-  EVT VT = SVOp->getValueType(0);
+  MVT VT = SVOp->getValueType(0).getSimpleVT();
   unsigned NumElems = VT.getVectorNumElements();
   SmallVector<int, 8> MaskVec;
 
@@ -4352,12 +4382,11 @@ static bool isZeroShuffle(ShuffleVectorSDNode *N) {
 static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
                              SelectionDAG &DAG, DebugLoc dl) {
   assert(VT.isVector() && "Expected a vector type");
-  unsigned Size = VT.getSizeInBits();
 
   // Always build SSE zero vectors as <4 x i32> bitcasted
   // to their dest type. This ensures they get CSE'd.
   SDValue Vec;
-  if (Size == 128) {  // SSE
+  if (VT.is128BitVector()) {  // SSE
     if (Subtarget->hasSSE2()) {  // SSE2
       SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
@@ -4365,7 +4394,7 @@ static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
       SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
     }
-  } else if (Size == 256) {  // AVX
+  } else if (VT.is256BitVector()) {  // AVX
     if (Subtarget->hasInt256()) {  // AVX2
       SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
@@ -4387,14 +4416,13 @@ static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
 /// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with
 /// no AVX2 support, use two <4 x i32> inserted in a <8 x i32> appropriately.
 /// Then bitcast to their original type, ensuring they get CSE'd.
-static SDValue getOnesVector(EVT VT, bool HasInt256, SelectionDAG &DAG,
+static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG,
                              DebugLoc dl) {
   assert(VT.isVector() && "Expected a vector type");
-  unsigned Size = VT.getSizeInBits();
 
   SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32);
   SDValue Vec;
-  if (Size == 256) {
+  if (VT.is256BitVector()) {
     if (HasInt256) { // AVX2
       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops, 8);
@@ -4402,7 +4430,7 @@ static SDValue getOnesVector(EVT VT, bool HasInt256, SelectionDAG &DAG,
       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
       Vec = Concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl);
     }
-  } else if (Size == 128) {
+  } else if (VT.is128BitVector()) {
     Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
   } else
     llvm_unreachable("Unexpected vector type");
@@ -4481,14 +4509,13 @@ static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) {
 static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) {
   EVT VT = V.getValueType();
   DebugLoc dl = V.getDebugLoc();
-  unsigned Size = VT.getSizeInBits();
 
-  if (Size == 128) {
+  if (VT.is128BitVector()) {
     V = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V);
     int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
     V = DAG.getVectorShuffle(MVT::v4f32, dl, V, DAG.getUNDEF(MVT::v4f32),
                              &SplatMask[0]);
-  } else if (Size == 256) {
+  } else if (VT.is256BitVector()) {
     // To use VPERMILPS to splat scalars, the second half of indices must
     // refer to the higher part, which is a duplication of the lower one,
     // because VPERMILPS can only handle in-lane permutations.
@@ -4512,14 +4539,14 @@ static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
   int EltNo = SV->getSplatIndex();
   int NumElems = SrcVT.getVectorNumElements();
-  unsigned Size = SrcVT.getSizeInBits();
+  bool Is256BitVec = SrcVT.is256BitVector();
 
-  assert(((Size == 128 && NumElems > 4) || Size == 256) &&
-         "Unknown how to promote splat for type");
+  assert(((SrcVT.is128BitVector() && NumElems > 4) || Is256BitVec) &&
+         "Unknown how to promote splat for type");
 
   // Extract the 128-bit part containing the splat element and update
   // the splat element index when it refers to the higher register.
-  if (Size == 256) {
+  if (Is256BitVec) {
     V1 = Extract128BitVector(V1, EltNo, DAG, dl);
     if (EltNo >= NumElems/2)
       EltNo -= NumElems/2;
@@ -4536,7 +4563,7 @@ static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
   // Recreate the 256-bit vector and place the same 128-bit vector
   // into the low and high part. This is necessary because we want
  // to use VPERM* to shuffle the vectors
-  if (Size == 256) {
+  if (Is256BitVec) {
     V1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT, V1, V1);
   }
 
@@ -4588,6 +4615,10 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT,
   case X86ISD::MOVLHPS:
     DecodeMOVLHPSMask(NumElems, Mask);
     break;
+  case X86ISD::PALIGNR:
+    ImmN = N->getOperand(N->getNumOperands()-1);
+    DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+    break;
   case X86ISD::PSHUFD:
   case X86ISD::VPERMILP:
     ImmN = N->getOperand(N->getNumOperands()-1);
@@ -4631,7 +4662,6 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT,
   case X86ISD::MOVLPS:
   case X86ISD::MOVSHDUP:
   case X86ISD::MOVSLDUP:
-  case X86ISD::PALIGN:
     // Not yet implemented
     return false;
   default: llvm_unreachable("unknown target shuffle node");
@@ -5099,7 +5129,7 @@ X86TargetLowering::LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const {
   if (!Subtarget->hasFp256())
     return SDValue();
 
-  EVT VT = Op.getValueType();
+  MVT VT = Op.getValueType().getSimpleVT();
   DebugLoc dl = Op.getDebugLoc();
 
   assert((VT.is128BitVector() || VT.is256BitVector()) &&
@@ -5297,8 +5327,8 @@ SDValue
 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
   DebugLoc dl = Op.getDebugLoc();
 
-  EVT VT = Op.getValueType();
-  EVT ExtVT = VT.getVectorElementType();
+  MVT VT = Op.getValueType().getSimpleVT();
+  MVT ExtVT = VT.getVectorElementType();
   unsigned NumElems = Op.getNumOperands();
 
   // Vectors containing all zeros can be matched by pxor and xorps later
@@ -5314,7 +5344,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
   // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
   // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
   // vpcmpeqd on 256-bit vectors.
-  if (ISD::isBuildVectorAllOnes(Op.getNode())) {
+  if (Subtarget->hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
     if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasInt256()))
       return Op;
 
@@ -5629,7 +5659,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
 // to create 256-bit vectors from two other 128-bit ones.
 static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
   DebugLoc dl = Op.getDebugLoc();
-  EVT ResVT = Op.getValueType();
+  MVT ResVT = Op.getValueType().getSimpleVT();
 
   assert(ResVT.is256BitVector() && "Value type must be 256-bit wide");
 
@@ -5655,8 +5685,8 @@ LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp,
   SDValue V1 = SVOp->getOperand(0);
   SDValue V2 = SVOp->getOperand(1);
   DebugLoc dl = SVOp->getDebugLoc();
-  EVT VT = SVOp->getValueType(0);
-  EVT EltVT = VT.getVectorElementType();
+  MVT VT = SVOp->getValueType(0).getSimpleVT();
+  MVT EltVT = VT.getVectorElementType();
   unsigned NumElems = VT.getVectorNumElements();
 
   if (!Subtarget->hasSSE41() || EltVT == MVT::i8)
@@ -5667,41 +5697,40 @@ LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp,
   // Check the mask for BLEND and build the value.
   unsigned MaskValue = 0;
   // There are 2 lanes if (NumElems > 8), and 1 lane otherwise.
-  unsigned NumLanes = (NumElems-1)/8 + 1;
+  unsigned NumLanes = (NumElems-1)/8 + 1;
   unsigned NumElemsInLane = NumElems / NumLanes;
 
   // Blend for v16i16 should be symmetric for both lanes.
   for (unsigned i = 0; i < NumElemsInLane; ++i) {
-    int SndLaneEltIdx = (NumLanes == 2) ?
+    int SndLaneEltIdx = (NumLanes == 2) ?
SVOp->getMaskElt(i + NumElemsInLane) : -1; int EltIdx = SVOp->getMaskElt(i); - if ((EltIdx == -1 || EltIdx == (int)i) && - (SndLaneEltIdx == -1 || SndLaneEltIdx == (int)(i + NumElemsInLane))) + if ((EltIdx < 0 || EltIdx == (int)i) && + (SndLaneEltIdx < 0 || SndLaneEltIdx == (int)(i + NumElemsInLane))) continue; - if (((unsigned)EltIdx == (i + NumElems)) && - (SndLaneEltIdx == -1 || + if (((unsigned)EltIdx == (i + NumElems)) && + (SndLaneEltIdx < 0 || (unsigned)SndLaneEltIdx == i + NumElems + NumElemsInLane)) MaskValue |= (1<<i); - else + else return SDValue(); } // Convert i32 vectors to floating point if it is not AVX2. // AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors. - EVT BlendVT = VT; + MVT BlendVT = VT; if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) { - BlendVT = EVT::getVectorVT(*DAG.getContext(), - EVT::getFloatingPointVT(EltVT.getSizeInBits()), - NumElems); + BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()), + NumElems); V1 = DAG.getNode(ISD::BITCAST, dl, VT, V1); V2 = DAG.getNode(ISD::BITCAST, dl, VT, V2); } - - SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, V1, V2, - DAG.getConstant(MaskValue, MVT::i32)); + + SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, V1, V2, + DAG.getConstant(MaskValue, MVT::i32)); return DAG.getNode(ISD::BITCAST, dl, VT, Ret); } @@ -5836,6 +5865,11 @@ LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget, } } + // Promote splats to a larger type which usually leads to more efficient code. + // FIXME: Is this true if pshufb is available? + if (SVOp->isSplat()) + return PromoteSplat(SVOp, DAG); + // If we have SSSE3, and all words of the result are from 1 input vector, // case 2 is generated, otherwise case 3 is generated. If no SSSE3 // is present, fall back to case 4. @@ -5851,7 +5885,7 @@ LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget, int EltIdx = MaskVals[i] * 2; int Idx0 = (TwoInputs && (EltIdx >= 16)) ? 0x80 : EltIdx; int Idx1 = (TwoInputs && (EltIdx >= 16)) ? 0x80 : EltIdx+1; - pshufbMask.push_back(DAG.getConstant(Idx0, MVT::i8)); + pshufbMask.push_back(DAG.getConstant(Idx0, MVT::i8)); pshufbMask.push_back(DAG.getConstant(Idx1, MVT::i8)); } V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V1); @@ -5969,6 +6003,11 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, DebugLoc dl = SVOp->getDebugLoc(); ArrayRef<int> MaskVals = SVOp->getMask(); + // Promote splats to a larger type which usually leads to more efficient code. + // FIXME: Is this true if pshufb is available? + if (SVOp->isSplat()) + return PromoteSplat(SVOp, DAG); + // If we have SSSE3, case 1 is generated when all result bytes come from // one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is // present, fall back to case 3. 
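  // For reference (illustrative): PSHUFB picks each result byte by the index
  // in the corresponding mask byte, and a mask byte with the high bit set
  // (e.g. the 0x80 used above) writes a zero instead, which is how bytes from
  // the unused input are discarded.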
@@ -6087,7 +6126,7 @@ static SDValue LowerVECTOR_SHUFFLEv32i8(ShuffleVectorSDNode *SVOp, const X86Subtarget *Subtarget, SelectionDAG &DAG) { - EVT VT = SVOp->getValueType(0); + MVT VT = SVOp->getValueType(0).getSimpleVT(); SDValue V1 = SVOp->getOperand(0); SDValue V2 = SVOp->getOperand(1); DebugLoc dl = SVOp->getDebugLoc(); @@ -6134,8 +6173,9 @@ SDValue LowerVECTOR_SHUFFLEv32i8(ShuffleVectorSDNode *SVOp, /// vector_shuffle X, Y, <2, 3, | 10, 11, | 0, 1, | 14, 15> static SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp, - SelectionDAG &DAG, DebugLoc dl) { + SelectionDAG &DAG) { MVT VT = SVOp->getValueType(0).getSimpleVT(); + DebugLoc dl = SVOp->getDebugLoc(); unsigned NumElems = VT.getVectorNumElements(); MVT NewVT; unsigned Scale; @@ -6171,7 +6211,7 @@ SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp, /// getVZextMovL - Return a zero-extending vector move low node. /// -static SDValue getVZextMovL(EVT VT, EVT OpVT, +static SDValue getVZextMovL(MVT VT, EVT OpVT, SDValue SrcOp, SelectionDAG &DAG, const X86Subtarget *Subtarget, DebugLoc dl) { if (VT == MVT::v2f64 || VT == MVT::v4f32) { @@ -6213,14 +6253,14 @@ LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { if (NewOp.getNode()) return NewOp; - EVT VT = SVOp->getValueType(0); + MVT VT = SVOp->getValueType(0).getSimpleVT(); unsigned NumElems = VT.getVectorNumElements(); unsigned NumLaneElems = NumElems / 2; DebugLoc dl = SVOp->getDebugLoc(); - MVT EltVT = VT.getVectorElementType().getSimpleVT(); - EVT NVT = MVT::getVectorVT(EltVT, NumLaneElems); + MVT EltVT = VT.getVectorElementType(); + MVT NVT = MVT::getVectorVT(EltVT, NumLaneElems); SDValue Output[2]; SmallVector<int, 16> Mask; @@ -6325,7 +6365,7 @@ LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { SDValue V1 = SVOp->getOperand(0); SDValue V2 = SVOp->getOperand(1); DebugLoc dl = SVOp->getDebugLoc(); - EVT VT = SVOp->getValueType(0); + MVT VT = SVOp->getValueType(0).getSimpleVT(); assert(VT.is128BitVector() && "Unsupported vector size"); @@ -6579,7 +6619,7 @@ SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasSSE2) { // Reduce a vector shuffle to zext. SDValue -X86TargetLowering::lowerVectorIntExtend(SDValue Op, SelectionDAG &DAG) const { +X86TargetLowering::LowerVectorIntExtend(SDValue Op, SelectionDAG &DAG) const { // PMOVZX is only available from SSE41. if (!Subtarget->hasSSE41()) return SDValue(); @@ -6623,9 +6663,10 @@ X86TargetLowering::lowerVectorIntExtend(SDValue Op, SelectionDAG &DAG) const { return SDValue(); } + LLVMContext *Context = DAG.getContext(); unsigned NBits = VT.getVectorElementType().getSizeInBits() << Shift; - EVT NeVT = EVT::getIntegerVT(*DAG.getContext(), NBits); - EVT NVT = EVT::getVectorVT(*DAG.getContext(), NeVT, NumElems >> Shift); + EVT NeVT = EVT::getIntegerVT(*Context, NBits); + EVT NVT = EVT::getVectorVT(*Context, NeVT, NumElems >> Shift); if (!isTypeLegal(NVT)) return SDValue(); @@ -6644,8 +6685,21 @@ X86TargetLowering::lowerVectorIntExtend(SDValue Op, SelectionDAG &DAG) const { // If it's foldable, i.e. normal load with single use, we will let code // selection to fold it. Otherwise, we will short the conversion sequence. if (CIdx && CIdx->getZExtValue() == 0 && - (!ISD::isNormalLoad(V.getNode()) || !V.hasOneUse())) + (!ISD::isNormalLoad(V.getNode()) || !V.hasOneUse())) { + if (V.getValueSizeInBits() > V1.getValueSizeInBits()) { + // The "ext_vec_elt" node is wider than the result node. + // In this case we should extract subvector from V. 
+ // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast (extract_subvector x)). + unsigned Ratio = V.getValueSizeInBits() / V1.getValueSizeInBits(); + EVT FullVT = V.getValueType(); + EVT SubVecVT = EVT::getVectorVT(*Context, + FullVT.getVectorElementType(), + FullVT.getVectorNumElements()/Ratio); + V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVecVT, V, + DAG.getIntPtrConstant(0)); + } V1 = DAG.getNode(ISD::BITCAST, DL, V1.getValueType(), V); + } } return DAG.getNode(ISD::BITCAST, DL, VT, @@ -6655,7 +6709,7 @@ X86TargetLowering::lowerVectorIntExtend(SDValue Op, SelectionDAG &DAG) const { SDValue X86TargetLowering::NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const { ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); - EVT VT = Op.getValueType(); + MVT VT = Op.getValueType().getSimpleVT(); DebugLoc dl = Op.getDebugLoc(); SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); @@ -6665,25 +6719,14 @@ X86TargetLowering::NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const { // Handle splat operations if (SVOp->isSplat()) { - unsigned NumElem = VT.getVectorNumElements(); - int Size = VT.getSizeInBits(); - // Use vbroadcast whenever the splat comes from a foldable load SDValue Broadcast = LowerVectorBroadcast(Op, DAG); if (Broadcast.getNode()) return Broadcast; - - // Handle splats by matching through known shuffle masks - if ((Size == 128 && NumElem <= 4) || - (Size == 256 && NumElem <= 8)) - return SDValue(); - - // All remaning splats are promoted to target supported vector shuffles. - return PromoteSplat(SVOp, DAG); } // Check integer expanding shuffles. - SDValue NewOp = lowerVectorIntExtend(Op, DAG); + SDValue NewOp = LowerVectorIntExtend(Op, DAG); if (NewOp.getNode()) return NewOp; @@ -6691,7 +6734,7 @@ X86TargetLowering::NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const { // do it! if (VT == MVT::v8i16 || VT == MVT::v16i8 || VT == MVT::v16i16 || VT == MVT::v32i8) { - SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl); + SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG); if (NewOp.getNode()) return DAG.getNode(ISD::BITCAST, dl, VT, NewOp); } else if ((VT == MVT::v4i32 || @@ -6699,18 +6742,18 @@ X86TargetLowering::NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const { // FIXME: Figure out a cleaner way to do this. // Try to make use of movq to zero out the top part. 
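  // The movq trick (illustrative): MOVQ between XMM registers copies the low
  // 64 bits and zeroes the upper 64, so a shuffle of a value with an
  // all-zeros vector in which only the low half survives can be emitted as a
  // single zero-extending move (getVZextMovL above).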
if (ISD::isBuildVectorAllZeros(V2.getNode())) { - SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl); + SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG); if (NewOp.getNode()) { - EVT NewVT = NewOp.getValueType(); + MVT NewVT = NewOp.getValueType().getSimpleVT(); if (isCommutedMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(), NewVT, true, false)) return getVZextMovL(VT, NewVT, NewOp.getOperand(0), DAG, Subtarget, dl); } } else if (ISD::isBuildVectorAllZeros(V1.getNode())) { - SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl); + SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG); if (NewOp.getNode()) { - EVT NewVT = NewOp.getValueType(); + MVT NewVT = NewOp.getValueType().getSimpleVT(); if (isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(), NewVT)) return getVZextMovL(VT, NewVT, NewOp.getOperand(1), DAG, Subtarget, dl); @@ -6725,7 +6768,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); - EVT VT = Op.getValueType(); + MVT VT = Op.getValueType().getSimpleVT(); DebugLoc dl = Op.getDebugLoc(); unsigned NumElems = VT.getVectorNumElements(); bool V1IsUndef = V1.getOpcode() == ISD::UNDEF; @@ -6816,7 +6859,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { if (isShift && ShVal.hasOneUse()) { // If the shifted value has multiple uses, it may be cheaper to use // v_set0 + movlhps or movhlps, etc. - EVT EltVT = VT.getVectorElementType(); + MVT EltVT = VT.getVectorElementType(); ShAmt *= EltVT.getSizeInBits(); return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl); } @@ -6855,7 +6898,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { if (isShift) { // No better options. Use a vshldq / vsrldq. - EVT EltVT = VT.getVectorElementType(); + MVT EltVT = VT.getVectorElementType(); ShAmt *= EltVT.getSizeInBits(); return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl); } @@ -6926,7 +6969,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { // nodes, and remove one by one until they don't return Op anymore. if (isPALIGNRMask(M, VT, Subtarget)) - return getTargetShuffleNode(X86ISD::PALIGN, dl, VT, V1, V2, + return getTargetShuffleNode(X86ISD::PALIGNR, dl, VT, V1, V2, getShufflePALIGNRImmediate(SVOp), DAG); @@ -7035,13 +7078,11 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { return SDValue(); } -SDValue -X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, - SelectionDAG &DAG) const { - EVT VT = Op.getValueType(); +static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) { + MVT VT = Op.getValueType().getSimpleVT(); DebugLoc dl = Op.getDebugLoc(); - if (!Op.getOperand(0).getValueType().is128BitVector()) + if (!Op.getOperand(0).getValueType().getSimpleVT().is128BitVector()) return SDValue(); if (VT.getSizeInBits() == 8) { @@ -7106,7 +7147,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, return SDValue(); SDValue Vec = Op.getOperand(0); - EVT VecVT = Vec.getValueType(); + MVT VecVT = Vec.getValueType().getSimpleVT(); // If this is a 256-bit vector result, first extract the 128-bit vector and // then extract the element from the 128-bit vector. @@ -7133,7 +7174,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, return Res; } - EVT VT = Op.getValueType(); + MVT VT = Op.getValueType().getSimpleVT(); DebugLoc dl = Op.getDebugLoc(); // TODO: handle v16i8. 
if (VT.getSizeInBits() == 16) { @@ -7146,7 +7187,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, MVT::v4i32, Vec), Op.getOperand(1))); // Transform it so it match pextrw which produces a 32-bit result. - EVT EltVT = MVT::i32; + MVT EltVT = MVT::i32; SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT, Op.getOperand(0), Op.getOperand(1)); SDValue Assert = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract, @@ -7161,7 +7202,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, // SHUFPS the element to the lowest double word, then movss. int Mask[4] = { static_cast<int>(Idx), -1, -1, -1 }; - EVT VVT = Op.getOperand(0).getValueType(); + MVT VVT = Op.getOperand(0).getValueType().getSimpleVT(); SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0), DAG.getUNDEF(VVT), Mask); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, @@ -7180,7 +7221,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, // Note if the lower 64 bits of the result of the UNPCKHPD is then stored // to a f64mem, the whole operation is folded into a single MOVHPDmr. int Mask[2] = { 1, -1 }; - EVT VVT = Op.getOperand(0).getValueType(); + MVT VVT = Op.getOperand(0).getValueType().getSimpleVT(); SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0), DAG.getUNDEF(VVT), Mask); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, @@ -7190,11 +7231,9 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, return SDValue(); } -SDValue -X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, - SelectionDAG &DAG) const { - EVT VT = Op.getValueType(); - EVT EltVT = VT.getVectorElementType(); +static SDValue LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) { + MVT VT = Op.getValueType().getSimpleVT(); + MVT EltVT = VT.getVectorElementType(); DebugLoc dl = Op.getDebugLoc(); SDValue N0 = Op.getOperand(0); @@ -7247,8 +7286,8 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { - EVT VT = Op.getValueType(); - EVT EltVT = VT.getVectorElementType(); + MVT VT = Op.getValueType().getSimpleVT(); + MVT EltVT = VT.getVectorElementType(); DebugLoc dl = Op.getDebugLoc(); SDValue N0 = Op.getOperand(0); @@ -7296,7 +7335,7 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) { LLVMContext *Context = DAG.getContext(); DebugLoc dl = Op.getDebugLoc(); - EVT OpVT = Op.getValueType(); + MVT OpVT = Op.getValueType().getSimpleVT(); // If this is a 256-bit vector result, first insert into a 128-bit // vector and then insert into the 256-bit vector. @@ -7511,8 +7550,7 @@ X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { SDValue X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl, - int64_t Offset, - SelectionDAG &DAG) const { + int64_t Offset, SelectionDAG &DAG) const { // Create the TargetGlobalAddress node, folding in the constant // offset if it is legal. 
unsigned char OpFlags = @@ -7732,7 +7770,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { case TLSModel::LocalExec: return LowerToTLSExecModel(GA, DAG, getPointerTy(), model, Subtarget->is64Bit(), - getTargetMachine().getRelocationModel() == Reloc::PIC_); + getTargetMachine().getRelocationModel() == Reloc::PIC_); } llvm_unreachable("Unknown TLS model."); } @@ -8015,9 +8053,11 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, SmallVector<Constant*,2> CV1; CV1.push_back( - ConstantFP::get(*Context, APFloat(APInt(64, 0x4330000000000000ULL)))); + ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble, + APInt(64, 0x4330000000000000ULL)))); CV1.push_back( - ConstantFP::get(*Context, APFloat(APInt(64, 0x4530000000000000ULL)))); + ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble, + APInt(64, 0x4530000000000000ULL)))); Constant *C1 = ConstantVector::get(CV1); SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16); @@ -8111,7 +8151,8 @@ SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op, SVT == MVT::v8i8 || SVT == MVT::v8i16) && "Custom UINT_TO_FP is not supported!"); - EVT NVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, SVT.getVectorNumElements()); + EVT NVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, + SVT.getVectorNumElements()); return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0)); } @@ -8204,8 +8245,9 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0)); } -std::pair<SDValue,SDValue> X86TargetLowering:: -FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned, bool IsReplace) const { +std::pair<SDValue,SDValue> +X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, + bool IsSigned, bool IsReplace) const { DebugLoc DL = Op.getDebugLoc(); EVT DstTy = Op.getValueType(); @@ -8299,9 +8341,9 @@ FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned, bool IsReplace) co static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG, const X86Subtarget *Subtarget) { - EVT VT = Op->getValueType(0); + MVT VT = Op->getValueType(0).getSimpleVT(); SDValue In = Op->getOperand(0); - EVT InVT = In.getValueType(); + MVT InVT = In.getValueType().getSimpleVT(); DebugLoc dl = Op->getDebugLoc(); // Optimize vectors in AVX mode: @@ -8330,7 +8372,7 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG, SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef); SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? 
ZeroVec : Undef); - EVT HVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), + MVT HVT = MVT::getVectorVT(VT.getVectorElementType(), VT.getVectorNumElements()/2); OpLo = DAG.getNode(ISD::BITCAST, dl, HVT, OpLo); @@ -8352,9 +8394,9 @@ SDValue X86TargetLowering::LowerANY_EXTEND(SDValue Op, SDValue X86TargetLowering::LowerZERO_EXTEND(SDValue Op, SelectionDAG &DAG) const { DebugLoc DL = Op.getDebugLoc(); - EVT VT = Op.getValueType(); + MVT VT = Op.getValueType().getSimpleVT(); SDValue In = Op.getOperand(0); - EVT SVT = In.getValueType(); + MVT SVT = In.getValueType().getSimpleVT(); if (Subtarget->hasFp256()) { SDValue Res = LowerAVXExtend(Op, DAG, Subtarget); @@ -8382,11 +8424,11 @@ SDValue X86TargetLowering::LowerZERO_EXTEND(SDValue Op, return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i32, Lo, Hi); } -SDValue X86TargetLowering::lowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { +SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { DebugLoc DL = Op.getDebugLoc(); - EVT VT = Op.getValueType(); + MVT VT = Op.getValueType().getSimpleVT(); SDValue In = Op.getOperand(0); - EVT SVT = In.getValueType(); + MVT SVT = In.getValueType().getSimpleVT(); if ((VT == MVT::v4i32) && (SVT == MVT::v4i64)) { // On AVX2, v4i64 -> v4i32 becomes VPERMD. @@ -8501,9 +8543,10 @@ SDValue X86TargetLowering::lowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const { - if (Op.getValueType().isVector()) { - if (Op.getValueType() == MVT::v8i16) - return DAG.getNode(ISD::TRUNCATE, Op.getDebugLoc(), Op.getValueType(), + MVT VT = Op.getValueType().getSimpleVT(); + if (VT.isVector()) { + if (VT == MVT::v8i16) + return DAG.getNode(ISD::TRUNCATE, Op.getDebugLoc(), VT, DAG.getNode(ISD::FP_TO_SINT, Op.getDebugLoc(), MVT::v8i32, Op.getOperand(0))); return SDValue(); @@ -8542,12 +8585,11 @@ SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, return FIST; } -SDValue X86TargetLowering::lowerFP_EXTEND(SDValue Op, - SelectionDAG &DAG) const { +static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) { DebugLoc DL = Op.getDebugLoc(); - EVT VT = Op.getValueType(); + MVT VT = Op.getValueType().getSimpleVT(); SDValue In = Op.getOperand(0); - EVT SVT = In.getValueType(); + MVT SVT = In.getValueType().getSimpleVT(); assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!"); @@ -8559,8 +8601,8 @@ SDValue X86TargetLowering::lowerFP_EXTEND(SDValue Op, SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) const { LLVMContext *Context = DAG.getContext(); DebugLoc dl = Op.getDebugLoc(); - EVT VT = Op.getValueType(); - EVT EltVT = VT; + MVT VT = Op.getValueType().getSimpleVT(); + MVT EltVT = VT; unsigned NumElts = VT == MVT::f64 ? 
2 : 4; if (VT.isVector()) { EltVT = VT.getVectorElementType(); @@ -8568,9 +8610,11 @@ SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) const { } Constant *C; if (EltVT == MVT::f64) - C = ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63)))); + C = ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble, + APInt(64, ~(1ULL << 63)))); else - C = ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31)))); + C = ConstantFP::get(*Context, APFloat(APFloat::IEEEsingle, + APInt(32, ~(1U << 31)))); C = ConstantVector::getSplat(NumElts, C); SDValue CPIdx = DAG.getConstantPool(C, getPointerTy()); unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment(); @@ -8591,8 +8635,8 @@ SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) const { SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const { LLVMContext *Context = DAG.getContext(); DebugLoc dl = Op.getDebugLoc(); - EVT VT = Op.getValueType(); - EVT EltVT = VT; + MVT VT = Op.getValueType().getSimpleVT(); + MVT EltVT = VT; unsigned NumElts = VT == MVT::f64 ? 2 : 4; if (VT.isVector()) { EltVT = VT.getVectorElementType(); @@ -8600,9 +8644,11 @@ SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const { } Constant *C; if (EltVT == MVT::f64) - C = ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63))); + C = ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble, + APInt(64, 1ULL << 63))); else - C = ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31))); + C = ConstantFP::get(*Context, APFloat(APFloat::IEEEsingle, + APInt(32, 1U << 31))); C = ConstantVector::getSplat(NumElts, C); SDValue CPIdx = DAG.getConstantPool(C, getPointerTy()); unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment(); @@ -8626,8 +8672,8 @@ SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); DebugLoc dl = Op.getDebugLoc(); - EVT VT = Op.getValueType(); - EVT SrcVT = Op1.getValueType(); + MVT VT = Op.getValueType().getSimpleVT(); + MVT SrcVT = Op1.getValueType().getSimpleVT(); // If second operand is smaller, extend it first. if (SrcVT.bitsLT(VT)) { @@ -8646,13 +8692,15 @@ SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { // First get the sign bit of second operand. 
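// Scalar sketch of the mask-based copysign computed below (assuming IEEE
// single precision; the DAG code uses constant-pool vector masks with
// FAND/FOR rather than scalar integer ops):
//
//   #include <cstdint>
//   #include <cstring>
//   float copysignSketch(float Mag, float Sgn) {
//     uint32_t M, S;
//     std::memcpy(&M, &Mag, sizeof(M));   // bit-cast, like ISD::BITCAST
//     std::memcpy(&S, &Sgn, sizeof(S));
//     uint32_t R = (M & ~(1U << 31))      // clear first operand's sign bit
//                | (S & (1U << 31));      // take second operand's sign bit
//     std::memcpy(&Mag, &R, sizeof(R));
//     return Mag;
//   }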
SmallVector<Constant*,4> CV; if (SrcVT == MVT::f64) { - CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63)))); - CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0)))); + const fltSemantics &Sem = APFloat::IEEEdouble; + CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(64, 1ULL << 63)))); + CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(64, 0)))); } else { - CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31)))); - CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); - CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); - CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); + const fltSemantics &Sem = APFloat::IEEEsingle; + CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 1U << 31)))); + CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0)))); + CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0)))); + CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0)))); } Constant *C = ConstantVector::get(CV); SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); @@ -8675,13 +8723,17 @@ SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { // Clear first operand sign bit. CV.clear(); if (VT == MVT::f64) { - CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63))))); - CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0)))); + const fltSemantics &Sem = APFloat::IEEEdouble; + CV.push_back(ConstantFP::get(*Context, APFloat(Sem, + APInt(64, ~(1ULL << 63))))); + CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(64, 0)))); } else { - CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31))))); - CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); - CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); - CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); + const fltSemantics &Sem = APFloat::IEEEsingle; + CV.push_back(ConstantFP::get(*Context, APFloat(Sem, + APInt(32, ~(1U << 31))))); + CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0)))); + CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0)))); + CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0)))); } C = ConstantVector::get(CV); CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); @@ -8697,7 +8749,7 @@ SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) { SDValue N0 = Op.getOperand(0); DebugLoc dl = Op.getDebugLoc(); - EVT VT = Op.getValueType(); + MVT VT = Op.getValueType().getSimpleVT(); // Lower ISD::FGETSIGN to (AND (X86ISD::FGETSIGNx86 ...) 1). SDValue xFGETSIGN = DAG.getNode(X86ISD::FGETSIGNx86, dl, VT, N0, @@ -8707,7 +8759,8 @@ static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) { // LowerVectorAllZeroTest - Check whether an OR'd tree is PTEST-able. 
// -SDValue X86TargetLowering::LowerVectorAllZeroTest(SDValue Op, SelectionDAG &DAG) const { +SDValue X86TargetLowering::LowerVectorAllZeroTest(SDValue Op, + SelectionDAG &DAG) const { assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree."); if (!Subtarget->hasSSE41()) @@ -9139,65 +9192,10 @@ SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC, return SDValue(); } -SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { - - if (Op.getValueType().isVector()) return LowerVSETCC(Op, DAG); - - assert(Op.getValueType() == MVT::i8 && "SetCC type must be 8-bit integer"); - SDValue Op0 = Op.getOperand(0); - SDValue Op1 = Op.getOperand(1); - DebugLoc dl = Op.getDebugLoc(); - ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); - - // Optimize to BT if possible. - // Lower (X & (1 << N)) == 0 to BT(X, N). - // Lower ((X >>u N) & 1) != 0 to BT(X, N). - // Lower ((X >>s N) & 1) != 0 to BT(X, N). - if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && - Op1.getOpcode() == ISD::Constant && - cast<ConstantSDNode>(Op1)->isNullValue() && - (CC == ISD::SETEQ || CC == ISD::SETNE)) { - SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG); - if (NewSetCC.getNode()) - return NewSetCC; - } - - // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of - // these. - if (Op1.getOpcode() == ISD::Constant && - (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 || - cast<ConstantSDNode>(Op1)->isNullValue()) && - (CC == ISD::SETEQ || CC == ISD::SETNE)) { - - // If the input is a setcc, then reuse the input setcc or use a new one with - // the inverted condition. - if (Op0.getOpcode() == X86ISD::SETCC) { - X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0); - bool Invert = (CC == ISD::SETNE) ^ - cast<ConstantSDNode>(Op1)->isNullValue(); - if (!Invert) return Op0; - - CCode = X86::GetOppositeBranchCondition(CCode); - return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, - DAG.getConstant(CCode, MVT::i8), Op0.getOperand(1)); - } - } - - bool isFP = Op1.getValueType().isFloatingPoint(); - unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG); - if (X86CC == X86::COND_INVALID) - return SDValue(); - - SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, DAG); - EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG); - return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, - DAG.getConstant(X86CC, MVT::i8), EFLAGS); -} - // Lower256IntVSETCC - Break a VSETCC 256-bit integer VSETCC into two new 128 // ones, and then concatenate the result back. 
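// The split modeled on plain arrays (a sketch only; int32_t elements stand
// in for one possible 256-bit case, v8i32, and -1 models the all-ones lanes
// a vector setcc produces):
//
//   #include <array>
//   #include <cstdint>
//   using V8i32 = std::array<int32_t, 8>;
//   V8i32 setccEQ256(const V8i32 &L, const V8i32 &R) {
//     V8i32 Res;
//     for (int i = 0; i < 4; ++i)          // compare the low 128-bit half
//       Res[i] = (L[i] == R[i]) ? -1 : 0;
//     for (int i = 4; i < 8; ++i)          // compare the high 128-bit half
//       Res[i] = (L[i] == R[i]) ? -1 : 0;
//     return Res;                          // CONCAT_VECTORS of the halves
//   }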
static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) { - EVT VT = Op.getValueType(); + MVT VT = Op.getValueType().getSimpleVT(); assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC && "Unsupported value type for operation"); @@ -9217,26 +9215,27 @@ static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) { SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl); // Issue the operation on the smaller types and concatenate the result back - MVT EltVT = VT.getVectorElementType().getSimpleVT(); - EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); + MVT EltVT = VT.getVectorElementType(); + MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC), DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC)); } -SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const { +static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { SDValue Cond; SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); SDValue CC = Op.getOperand(2); - EVT VT = Op.getValueType(); + MVT VT = Op.getValueType().getSimpleVT(); ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); - bool isFP = Op.getOperand(1).getValueType().isFloatingPoint(); + bool isFP = Op.getOperand(1).getValueType().getSimpleVT().isFloatingPoint(); DebugLoc dl = Op.getDebugLoc(); if (isFP) { #ifndef NDEBUG - EVT EltVT = Op0.getValueType().getVectorElementType(); + MVT EltVT = Op0.getValueType().getVectorElementType().getSimpleVT(); assert(EltVT == MVT::f32 || EltVT == MVT::f64); #endif @@ -9377,6 +9376,63 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const { return Result; } +SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { + + MVT VT = Op.getValueType().getSimpleVT(); + + if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG); + + assert(VT == MVT::i8 && "SetCC type must be 8-bit integer"); + SDValue Op0 = Op.getOperand(0); + SDValue Op1 = Op.getOperand(1); + DebugLoc dl = Op.getDebugLoc(); + ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); + + // Optimize to BT if possible. + // Lower (X & (1 << N)) == 0 to BT(X, N). + // Lower ((X >>u N) & 1) != 0 to BT(X, N). + // Lower ((X >>s N) & 1) != 0 to BT(X, N). + if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && + Op1.getOpcode() == ISD::Constant && + cast<ConstantSDNode>(Op1)->isNullValue() && + (CC == ISD::SETEQ || CC == ISD::SETNE)) { + SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG); + if (NewSetCC.getNode()) + return NewSetCC; + } + + // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of + // these. + if (Op1.getOpcode() == ISD::Constant && + (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 || + cast<ConstantSDNode>(Op1)->isNullValue()) && + (CC == ISD::SETEQ || CC == ISD::SETNE)) { + + // If the input is a setcc, then reuse the input setcc or use a new one with + // the inverted condition. 
+ if (Op0.getOpcode() == X86ISD::SETCC) { + X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0); + bool Invert = (CC == ISD::SETNE) ^ + cast<ConstantSDNode>(Op1)->isNullValue(); + if (!Invert) return Op0; + + CCode = X86::GetOppositeBranchCondition(CCode); + return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, + DAG.getConstant(CCode, MVT::i8), Op0.getOperand(1)); + } + } + + bool isFP = Op1.getValueType().getSimpleVT().isFloatingPoint(); + unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG); + if (X86CC == X86::COND_INVALID) + return SDValue(); + + SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, DAG); + EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG); + return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, + DAG.getConstant(X86CC, MVT::i8), EFLAGS); +} + // isX86LogicalCmp - Return true if opcode is a X86 logical comparison. static bool isX86LogicalCmp(SDValue Op) { unsigned Opc = Op.getNode()->getOpcode(); @@ -9499,7 +9555,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { SDValue Cmp = Cond.getOperand(1); unsigned Opc = Cmp.getOpcode(); - EVT VT = Op.getValueType(); + MVT VT = Op.getValueType().getSimpleVT(); bool IllegalFPCMov = false; if (VT.isFloatingPoint() && !VT.isVector() && @@ -9610,9 +9666,9 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { SDValue X86TargetLowering::LowerSIGN_EXTEND(SDValue Op, SelectionDAG &DAG) const { - EVT VT = Op->getValueType(0); + MVT VT = Op->getValueType(0).getSimpleVT(); SDValue In = Op->getOperand(0); - EVT InVT = In.getValueType(); + MVT InVT = In.getValueType().getSimpleVT(); DebugLoc dl = Op->getDebugLoc(); if ((VT != MVT::v4i64 || InVT != MVT::v4i32) && @@ -9646,7 +9702,7 @@ SDValue X86TargetLowering::LowerSIGN_EXTEND(SDValue Op, SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, &ShufMask2[0]); - EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), + MVT HalfVT = MVT::getVectorVT(VT.getScalarType(), VT.getVectorNumElements()/2); OpLo = DAG.getNode(X86ISD::VSEXT_MOVL, dl, HalfVT, OpLo); @@ -10155,7 +10211,7 @@ static SDValue LowerVACOPY(SDValue Op, const X86Subtarget *Subtarget, MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV)); } -// getTargetVShiftNOde - Handle vector element shifts where the shift amount +// getTargetVShiftNode - Handle vector element shifts where the shift amount // may or may not be a constant. Takes immediate version of shift as input. static SDValue getTargetVShiftNode(unsigned Opc, DebugLoc dl, EVT VT, SDValue SrcOp, SDValue ShAmt, @@ -11377,13 +11433,55 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget, return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo); } +SDValue X86TargetLowering::LowerSDIV(SDValue Op, SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + EVT EltTy = VT.getVectorElementType(); + unsigned NumElts = VT.getVectorNumElements(); + SDValue N0 = Op.getOperand(0); + DebugLoc dl = Op.getDebugLoc(); + + // Lower sdiv X, pow2-const. + BuildVectorSDNode *C = dyn_cast<BuildVectorSDNode>(Op.getOperand(1)); + if (!C) + return SDValue(); + + APInt SplatValue, SplatUndef; + unsigned MinSplatBits; + bool HasAnyUndefs; + if (!C->isConstantSplat(SplatValue, SplatUndef, MinSplatBits, HasAnyUndefs)) + return SDValue(); + + if ((SplatValue != 0) && + (SplatValue.isPowerOf2() || (-SplatValue).isPowerOf2())) { + unsigned lg2 = SplatValue.countTrailingZeros(); + // Splat the sign bit. 
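// Worked scalar example of the fix-up sequence that follows (i32 elements,
// divisor 8, so lg2 == 3; this mirrors the standard signed
// divide-by-power-of-two idiom):
//
//   #include <cstdint>
//   int32_t sdivByPow2(int32_t X) {
//     int32_t Sgn = X >> 31;                      // splat the sign bit (VSRAI)
//     uint32_t Bias = uint32_t(Sgn) >> (32 - 3);  // 7 if X < 0, else 0 (VSRLI)
//     return (X + int32_t(Bias)) >> 3;            // truncating divide (VSRAI)
//   }
//
// For example, X = -1 gives Sgn = -1, Bias = 7, and (-1 + 7) >> 3 == 0,
// matching -1 / 8 == 0, where a bare arithmetic shift would wrongly give -1.
// When the splat is a negative power of two, the code below additionally
// negates the result via the SUB from a zero vector.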
+ SDValue Sz = DAG.getConstant(EltTy.getSizeInBits()-1, MVT::i32); + SDValue SGN = getTargetVShiftNode(X86ISD::VSRAI, dl, VT, N0, Sz, DAG); + // Add (N0 < 0) ? abs2 - 1 : 0; + SDValue Amt = DAG.getConstant(EltTy.getSizeInBits() - lg2, MVT::i32); + SDValue SRL = getTargetVShiftNode(X86ISD::VSRLI, dl, VT, SGN, Amt, DAG); + SDValue ADD = DAG.getNode(ISD::ADD, dl, VT, N0, SRL); + SDValue Lg2Amt = DAG.getConstant(lg2, MVT::i32); + SDValue SRA = getTargetVShiftNode(X86ISD::VSRAI, dl, VT, ADD, Lg2Amt, DAG); + + // If we're dividing by a positive value, we're done. Otherwise, we must + // negate the result. + if (SplatValue.isNonNegative()) + return SRA; + + SmallVector<SDValue, 16> V(NumElts, DAG.getConstant(0, EltTy)); + SDValue Zero = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], NumElts); + return DAG.getNode(ISD::SUB, dl, VT, Zero, SRA); + } + return SDValue(); +} + SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); DebugLoc dl = Op.getDebugLoc(); SDValue R = Op.getOperand(0); SDValue Amt = Op.getOperand(1); - LLVMContext *Context = DAG.getContext(); if (!Subtarget->hasSSE2()) return SDValue(); @@ -11500,17 +11598,9 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { // Lower SHL with variable shift amount. if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) { - Op = DAG.getNode(X86ISD::VSHLI, dl, VT, Op.getOperand(1), - DAG.getConstant(23, MVT::i32)); - - const uint32_t CV[] = { 0x3f800000U, 0x3f800000U, 0x3f800000U, 0x3f800000U}; - Constant *C = ConstantDataVector::get(*Context, CV); - SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); - SDValue Addend = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, - MachinePointerInfo::getConstantPool(), - false, false, false, 16); + Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, VT)); - Op = DAG.getNode(ISD::ADD, dl, VT, Op, Addend); + Op = DAG.getNode(ISD::ADD, dl, VT, Op, DAG.getConstant(0x3f800000U, VT)); Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, Op); Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op); return DAG.getNode(ISD::MUL, dl, VT, Op, R); @@ -11519,8 +11609,7 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { assert(Subtarget->hasSSE2() && "Need SSE2 for pslli/pcmpeq."); // a = a << 5; - Op = DAG.getNode(X86ISD::VSHLI, dl, MVT::v8i16, Op.getOperand(1), - DAG.getConstant(5, MVT::i32)); + Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(5, VT)); Op = DAG.getNode(ISD::BITCAST, dl, VT, Op); // Turn 'a' into a mask suitable for VSELECT @@ -11952,6 +12041,43 @@ static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) { Op.getOperand(1), Op.getOperand(2)); } +SDValue X86TargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { + assert(Subtarget->isTargetDarwin() && Subtarget->is64Bit()); + + // For MacOSX, we want to call an alternative entry point: __sincos_stret, + // which returns the values in two XMM registers. + DebugLoc dl = Op.getDebugLoc(); + SDValue Arg = Op.getOperand(0); + EVT ArgVT = Arg.getValueType(); + Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); + + ArgListTy Args; + ArgListEntry Entry; + + Entry.Node = Arg; + Entry.Ty = ArgTy; + Entry.isSExt = false; + Entry.isZExt = false; + Args.push_back(Entry); + + // Only optimize x86_64 for now. i386 is a bit messy. For f32, + // the small struct {f32, f32} is returned in (eax, edx). For f64, + // the results are returned via SRet in memory. + const char *LibcallName = (ArgVT == MVT::f64) + ? 
"__sincos_stret" : "__sincosf_stret"; + SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy()); + + StructType *RetTy = StructType::get(ArgTy, ArgTy, NULL); + TargetLowering:: + CallLoweringInfo CLI(DAG.getEntryNode(), RetTy, + false, false, false, false, 0, + CallingConv::C, /*isTaillCall=*/false, + /*doesNotRet=*/false, /*isReturnValueUsed*/true, + Callee, Args, DAG, dl); + std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); + return CallResult.first; +} + /// LowerOperation - Provide custom lowering hooks for some operations. /// SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { @@ -11981,13 +12107,13 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG); case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); - case ISD::TRUNCATE: return lowerTRUNCATE(Op, DAG); + case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG); case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, DAG); case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, DAG); case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, DAG); case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG); - case ISD::FP_EXTEND: return lowerFP_EXTEND(Op, DAG); + case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG); case ISD::FABS: return LowerFABS(Op, DAG); case ISD::FNEG: return LowerFNEG(Op, DAG); case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); @@ -12033,6 +12159,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG); case ISD::ADD: return LowerADD(Op, DAG); case ISD::SUB: return LowerSUB(Op, DAG); + case ISD::SDIV: return LowerSDIV(Op, DAG); + case ISD::FSINCOS: return LowerFSINCOS(Op, DAG); } } @@ -12372,7 +12500,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM"; case X86ISD::PTEST: return "X86ISD::PTEST"; case X86ISD::TESTP: return "X86ISD::TESTP"; - case X86ISD::PALIGN: return "X86ISD::PALIGN"; + case X86ISD::PALIGNR: return "X86ISD::PALIGNR"; case X86ISD::PSHUFD: return "X86ISD::PSHUFD"; case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW"; case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW"; @@ -12783,7 +12911,7 @@ X86TargetLowering::EmitAtomicLoadArith(MachineInstr *MI, MachineFunction::iterator I = MBB; ++I; - assert(MI->getNumOperands() <= X86::AddrNumOperands + 2 && + assert(MI->getNumOperands() <= X86::AddrNumOperands + 4 && "Unexpected number of operands"); assert(MI->hasOneMemOperand() && @@ -13015,7 +13143,7 @@ X86TargetLowering::EmitAtomicLoadArith6432(MachineInstr *MI, MachineFunction::iterator I = MBB; ++I; - assert(MI->getNumOperands() <= X86::AddrNumOperands + 4 && + assert(MI->getNumOperands() <= X86::AddrNumOperands + 7 && "Unexpected number of operands"); assert(MI->hasOneMemOperand() && @@ -15246,13 +15374,9 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD && isSplatVector(CondRHS.getNode()) && isSplatVector(OpRHS.getNode())) { APInt A = cast<ConstantSDNode>(OpRHS.getOperand(0))->getAPIntValue(); - if (CondRHS.getConstantOperandVal(0) == -A-1) { - SmallVector<SDValue, 32> V(VT.getVectorNumElements(), - DAG.getConstant(-A, VT.getScalarType())); + if (CondRHS.getConstantOperandVal(0) == -A-1) return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, - 
DAG.getNode(ISD::BUILD_VECTOR, DL, VT, - V.data(), V.size())); - } + DAG.getConstant(-A, VT)); } // Another special case: If C was a sign bit, the sub has been @@ -15552,7 +15676,7 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, ConstantSDNode *CmpAgainst = 0; if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) && (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) && - dyn_cast<ConstantSDNode>(Cond.getOperand(0)) == 0) { + !isa<ConstantSDNode>(Cond.getOperand(0))) { if (CC == X86::COND_NE && CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) { @@ -15832,8 +15956,7 @@ static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG, if (VT == MVT::f32 || VT == MVT::f64) { bool ExpectingFlags = false; // Check for any users that want flags: - for (SDNode::use_iterator UI = N->use_begin(), - UE = N->use_end(); + for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); !ExpectingFlags && UI != UE; ++UI) switch (UI->getOpcode()) { default: @@ -15920,7 +16043,7 @@ static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { EVT VT = N->getValueType(0); - if (VT.getSizeInBits() != 256) + if (!VT.is256BitVector()) return SDValue(); assert((N->getOpcode() == ISD::ANY_EXTEND || @@ -15929,7 +16052,7 @@ static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG, SDValue Narrow = N->getOperand(0); EVT NarrowVT = Narrow->getValueType(0); - if (NarrowVT.getSizeInBits() != 128) + if (!NarrowVT.is128BitVector()) return SDValue(); if (Narrow->getOpcode() != ISD::XOR && @@ -16125,11 +16248,6 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, DebugLoc DL = N->getDebugLoc(); - // We are going to replace the AND, OR, NAND with either BLEND - // or PSIGN, which only look at the MSB. The VSRAI instruction - // does not affect the highest bit, so we can get rid of it. - Mask = Mask.getOperand(0); - // Now we know we at least have a pblendvb with the mask val. See if // we can form a psignb/w/d. // psign = x.type == y.type == mask.type && y = sub(0, x); @@ -16138,7 +16256,7 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, X.getValueType() == MaskVT && Y.getValueType() == MaskVT) { assert((EltBits == 8 || EltBits == 16 || EltBits == 32) && "Unsupported VT for PSIGN"); - Mask = DAG.getNode(X86ISD::PSIGN, DL, MaskVT, X, Mask); + Mask = DAG.getNode(X86ISD::PSIGN, DL, MaskVT, X, Mask.getOperand(0)); return DAG.getNode(ISD::BITCAST, DL, VT, Mask); } // PBLENDVB only available on SSE 4.1 @@ -16296,8 +16414,42 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG, EVT MemVT = Ld->getMemoryVT(); DebugLoc dl = Ld->getDebugLoc(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + unsigned RegSz = RegVT.getSizeInBits(); ISD::LoadExtType Ext = Ld->getExtensionType(); + unsigned Alignment = Ld->getAlignment(); + bool IsAligned = Alignment == 0 || Alignment == MemVT.getSizeInBits()/8; + + // On Sandy Bridge, unaligned 256-bit loads are inefficient.
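// Intrinsic-level sketch of the load split performed below (assumes AVX and
// a 32-byte vector of floats; the DAG form also token-factors the two load
// chains and records the halved alignment):
//
//   #include <immintrin.h>
//   __m256 loadu256Split(const float *P) {
//     __m128 Lo = _mm_loadu_ps(P);      // first 16 bytes
//     __m128 Hi = _mm_loadu_ps(P + 4);  // second 16 bytes
//     return _mm256_insertf128_ps(_mm256_castps128_ps256(Lo), Hi, 1);
//   }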
+ if (RegVT.is256BitVector() && !Subtarget->hasInt256() && + !DCI.isBeforeLegalizeOps() && !IsAligned && Ext == ISD::NON_EXTLOAD) { + unsigned NumElems = RegVT.getVectorNumElements(); + if (NumElems < 2) + return SDValue(); + + SDValue Ptr = Ld->getBasePtr(); + SDValue Increment = DAG.getConstant(16, TLI.getPointerTy()); + + EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), + NumElems/2); + SDValue Load1 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, + Ld->getPointerInfo(), Ld->isVolatile(), + Ld->isNonTemporal(), Ld->isInvariant(), + Alignment); + Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment); + SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, + Ld->getPointerInfo(), Ld->isVolatile(), + Ld->isNonTemporal(), Ld->isInvariant(), + std::max(Alignment/2U, 1U)); + SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, + Load1.getValue(1), + Load2.getValue(1)); + + SDValue NewVec = DAG.getUNDEF(RegVT); + NewVec = Insert128BitVector(NewVec, Load1, 0, DAG, dl); + NewVec = Insert128BitVector(NewVec, Load2, NumElems/2, DAG, dl); + return DCI.CombineTo(N, NewVec, TF, true); + } // If this is a vector EXT Load then attempt to optimize it using a // shuffle. If SSSE3 is not available we may emit an illegal shuffle but the @@ -16312,7 +16464,6 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG, assert(MemVT.isVector() && "Must load a vector from memory"); unsigned NumElems = RegVT.getVectorNumElements(); - unsigned RegSz = RegVT.getSizeInBits(); unsigned MemSz = MemVT.getSizeInBits(); assert(RegSz > MemSz && "Register size must be greater than the mem size"); @@ -16356,8 +16507,8 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG, // Represent the data using the same element type that is stored in // memory. In practice, we ''widen'' MemVT. - EVT WideVecVT = - EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), + EVT WideVecVT = + EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), loadRegZize/MemVT.getScalarType().getSizeInBits()); assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() && @@ -16426,10 +16577,8 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG, // Build the arithmetic shift. unsigned Amt = RegVT.getVectorElementType().getSizeInBits() - MemVT.getVectorElementType().getSizeInBits(); - SmallVector<SDValue, 8> C(NumElems, - DAG.getConstant(Amt, RegVT.getScalarType())); - SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, RegVT, &C[0], C.size()); - Shuff = DAG.getNode(ISD::SRA, dl, RegVT, Shuff, BV); + Shuff = DAG.getNode(ISD::SRA, dl, RegVT, Shuff, + DAG.getConstant(Amt, RegVT)); return DCI.CombineTo(N, Shuff, TF, true); } @@ -16462,16 +16611,21 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, DebugLoc dl = St->getDebugLoc(); SDValue StoredVal = St->getOperand(1); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + unsigned Alignment = St->getAlignment(); + bool IsAligned = Alignment == 0 || Alignment == VT.getSizeInBits()/8; // If we are saving a concatenation of two XMM registers, perform two stores. // On Sandy Bridge, 256-bit memory operations are executed by two // 128-bit ports. However, on Haswell it is better to issue a single 256-bit // memory operation. 
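// Matching intrinsic-level sketch for the store side (same assumptions as
// the load split above: AVX and a 32-byte vector of floats):
//
//   #include <immintrin.h>
//   void storeu256Split(float *P, __m256 V) {
//     _mm_storeu_ps(P, _mm256_castps256_ps128(V));        // low 128 bits
//     _mm_storeu_ps(P + 4, _mm256_extractf128_ps(V, 1));  // high 128 bits
//   }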
if (VT.is256BitVector() && !Subtarget->hasInt256() && - StoredVal.getNode()->getOpcode() == ISD::CONCAT_VECTORS && - StoredVal.getNumOperands() == 2) { - SDValue Value0 = StoredVal.getOperand(0); - SDValue Value1 = StoredVal.getOperand(1); + StVT == VT && !IsAligned) { + unsigned NumElems = VT.getVectorNumElements(); + if (NumElems < 2) + return SDValue(); + + SDValue Value0 = Extract128BitVector(StoredVal, 0, DAG, dl); + SDValue Value1 = Extract128BitVector(StoredVal, NumElems/2, DAG, dl); SDValue Stride = DAG.getConstant(16, TLI.getPointerTy()); SDValue Ptr0 = St->getBasePtr(); @@ -16479,10 +16633,11 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, SDValue Ch0 = DAG.getStore(St->getChain(), dl, Value0, Ptr0, St->getPointerInfo(), St->isVolatile(), - St->isNonTemporal(), St->getAlignment()); + St->isNonTemporal(), Alignment); SDValue Ch1 = DAG.getStore(St->getChain(), dl, Value1, Ptr1, St->getPointerInfo(), St->isVolatile(), - St->isNonTemporal(), St->getAlignment()); + St->isNonTemporal(), + std::max(Alignment/2U, 1U)); return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1); } @@ -16917,6 +17072,41 @@ static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) { return SDValue(); } +static SDValue PerformSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + EVT VT = N->getValueType(0); + if (!VT.isVector()) + return SDValue(); + + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + EVT ExtraVT = cast<VTSDNode>(N1)->getVT(); + DebugLoc dl = N->getDebugLoc(); + + // The SIGN_EXTEND_INREG to v4i64 is expensive operation on the + // both SSE and AVX2 since there is no sign-extended shift right + // operation on a vector with 64-bit elements. + //(sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) -> + // (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT))) + if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND || + N0.getOpcode() == ISD::SIGN_EXTEND)) { + SDValue N00 = N0.getOperand(0); + + // EXTLOAD has a better solution on AVX2, + // it may be replaced with X86ISD::VSEXT node. + if (N00.getOpcode() == ISD::LOAD && Subtarget->hasInt256()) + if (!ISD::isNormalLoad(N00.getNode())) + return SDValue(); + + if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) { + SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, + N00, N1); + return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp); + } + } + return SDValue(); +} + static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { @@ -17002,7 +17192,7 @@ static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG, } } - if (VT.isVector() && VT.getSizeInBits() == 256) { + if (VT.is256BitVector()) { SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget); if (R.getNode()) return R; @@ -17037,8 +17227,8 @@ static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG) { return SDValue(); } -// Helper function of PerformSETCCCombine. It is to materialize "setb reg" -// as "sbb reg,reg", since it can be extended without zext and produces +// Helper function of PerformSETCCCombine. It is to materialize "setb reg" +// as "sbb reg,reg", since it can be extended without zext and produces // an all-ones bit which is more useful than 0/1 in some cases. 
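// The trick in scalar form: subtracting a register from itself with borrow
// computes 0 - 0 - CF, i.e. 0 or all-ones (a model of the flag dataflow,
// with the carry flag made an explicit parameter):
//
//   #include <cstdint>
//   int32_t sbbSelf(bool CF) {
//     return 0 - 0 - (CF ? 1 : 0);  // 0x00000000 or 0xFFFFFFFF
//   }
//
// Because the all-ones value already fills every bit, it widens to larger
// registers without a zero extension, unlike setb's 0/1.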
static SDValue MaterializeSETB(DebugLoc DL, SDValue EFLAGS, SelectionDAG &DAG) { return DAG.getNode(ISD::AND, DL, MVT::i8, @@ -17056,13 +17246,13 @@ static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG, SDValue EFLAGS = N->getOperand(1); if (CC == X86::COND_A) { - // Try to convert COND_A into COND_B in an attempt to facilitate + // Try to convert COND_A into COND_B in an attempt to facilitate // materializing "setb reg". // // Do not flip "e > c", where "c" is a constant, because Cmp instruction // cannot take an immediate as its first operand. // - if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() && + if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() && EFLAGS.getValueType().isInteger() && !isa<ConstantSDNode>(EFLAGS.getOperand(1))) { SDValue NewSub = DAG.getNode(X86ISD::SUB, EFLAGS.getDebugLoc(), @@ -17270,7 +17460,8 @@ static SDValue performVZEXTCombine(SDNode *N, SelectionDAG &DAG, if (In.getOpcode() != X86ISD::VZEXT) return SDValue(); - return DAG.getNode(X86ISD::VZEXT, N->getDebugLoc(), N->getValueType(0), In.getOperand(0)); + return DAG.getNode(X86ISD::VZEXT, N->getDebugLoc(), N->getValueType(0), + In.getOperand(0)); } SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, @@ -17308,13 +17499,14 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::ANY_EXTEND: case ISD::ZERO_EXTEND: return PerformZExtCombine(N, DAG, DCI, Subtarget); case ISD::SIGN_EXTEND: return PerformSExtCombine(N, DAG, DCI, Subtarget); + case ISD::SIGN_EXTEND_INREG: return PerformSIGN_EXTEND_INREGCombine(N, DAG, Subtarget); case ISD::TRUNCATE: return PerformTruncateCombine(N, DAG,DCI,Subtarget); case ISD::SETCC: return PerformISDSETCCCombine(N, DAG); case X86ISD::SETCC: return PerformSETCCCombine(N, DAG, DCI, Subtarget); case X86ISD::BRCOND: return PerformBrCondCombine(N, DAG, DCI, Subtarget); case X86ISD::VZEXT: return performVZEXTCombine(N, DAG, DCI, Subtarget); case X86ISD::SHUFP: // Handle all target specific shuffles - case X86ISD::PALIGN: + case X86ISD::PALIGNR: case X86ISD::UNPCKH: case X86ISD::UNPCKL: case X86ISD::MOVHLPS: @@ -17497,7 +17689,7 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { AsmPieces.clear(); const std::string &ConstraintsStr = IA->getConstraintString(); SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ","); - std::sort(AsmPieces.begin(), AsmPieces.end()); + array_pod_sort(AsmPieces.begin(), AsmPieces.end()); if (AsmPieces.size() == 4 && AsmPieces[0] == "~{cc}" && AsmPieces[1] == "~{dirflag}" && @@ -17515,7 +17707,7 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { AsmPieces.clear(); const std::string &ConstraintsStr = IA->getConstraintString(); SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ","); - std::sort(AsmPieces.begin(), AsmPieces.end()); + array_pod_sort(AsmPieces.begin(), AsmPieces.end()); if (AsmPieces.size() == 4 && AsmPieces[0] == "~{cc}" && AsmPieces[1] == "~{dirflag}" && @@ -17995,7 +18187,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, // really want an 8-bit or 32-bit register, map to the appropriate register // class and return the appropriate register. 
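// Shape of the remapping for the "a" register family (a hypothetical helper
// over the backend's register enums; the code below handles the full
// AX/DX/CX/SI/DI/BP/SP set via switches):
//
//   unsigned remapARegForVT(MVT VT) {
//     if (VT == MVT::i8 || VT == MVT::i1)   return X86::AL;   // 8-bit alias
//     if (VT == MVT::i32 || VT == MVT::f32) return X86::EAX;  // 32-bit alias
//     if (VT == MVT::i64 || VT == MVT::f64) return X86::RAX;  // 64-bit alias
//     return X86::AX;                                         // keep 16-bit
//   }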
if (Res.second == &X86::GR16RegClass) { - if (VT == MVT::i8) { + if (VT == MVT::i8 || VT == MVT::i1) { unsigned DestReg = 0; switch (Res.first) { default: break; @@ -18008,7 +18200,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, Res.first = DestReg; Res.second = &X86::GR8RegClass; } - } else if (VT == MVT::i32) { + } else if (VT == MVT::i32 || VT == MVT::f32) { unsigned DestReg = 0; switch (Res.first) { default: break; @@ -18025,7 +18217,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, Res.first = DestReg; Res.second = &X86::GR32RegClass; } - } else if (VT == MVT::i64) { + } else if (VT == MVT::i64 || VT == MVT::f64) { unsigned DestReg = 0; switch (Res.first) { default: break; diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index 16ce364..958ceb0 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -234,11 +234,8 @@ namespace llvm { // EH_SJLJ_LONGJMP - SjLj exception handling longjmp. EH_SJLJ_LONGJMP, - /// TC_RETURN - Tail call return. - /// operand #0 chain - /// operand #1 callee (register or absolute) - /// operand #2 stack adjustment - /// operand #3 optional in flag + /// TC_RETURN - Tail call return. See X86TargetLowering::LowerCall for + /// the list of operands. TC_RETURN, // VZEXT_MOVL - Vector move low and zero extend. @@ -294,7 +291,7 @@ namespace llvm { TESTP, // Several flavors of instructions with vector shuffle behaviors. - PALIGN, + PALIGNR, PSHUFD, PSHUFHW, PSHUFLW, @@ -794,9 +791,7 @@ namespace llvm { SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) const; SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl, @@ -811,20 +806,18 @@ namespace llvm { SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG) const; SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG) const; SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG) const; - SDValue lowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerZERO_EXTEND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSIGN_EXTEND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerANY_EXTEND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const; - SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFABS(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFNEG(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const; SDValue LowerToBT(SDValue And, ISD::CondCode CC, DebugLoc dl, SelectionDAG &DAG) const; SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerMEMSET(SDValue Op, SelectionDAG &DAG) const; @@ -841,8 +834,9 @@ namespace llvm { SDValue 
LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const; SDValue LowerShift(SDValue Op, SelectionDAG &DAG) const; - + SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const; // Utility functions to help LowerVECTOR_SHUFFLE & LowerBUILD_VECTOR SDValue LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const; @@ -851,7 +845,7 @@ namespace llvm { SDValue LowerVectorAllZeroTest(SDValue Op, SelectionDAG &DAG) const; - SDValue lowerVectorIntExtend(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVectorIntExtend(SDValue Op, SelectionDAG &DAG) const; virtual SDValue LowerFormalArguments(SDValue Chain, diff --git a/lib/Target/X86/X86Instr3DNow.td b/lib/Target/X86/X86Instr3DNow.td index 54b91c3..bb362f5 100644 --- a/lib/Target/X86/X86Instr3DNow.td +++ b/lib/Target/X86/X86Instr3DNow.td @@ -87,12 +87,10 @@ defm PMULHRW : I3DNow_binop_rm_int<0xB7, "pmulhrw">; def FEMMS : I3DNow<0x0E, RawFrm, (outs), (ins), "femms", [(int_x86_mmx_femms)]>; def PREFETCH : I3DNow<0x0D, MRM0m, (outs), (ins i32mem:$addr), - "prefetch $addr", []>; + "prefetch\t$addr", []>; -// FIXME: Diassembler gets a bogus decode conflict. -let isAsmParserOnly = 1 in def PREFETCHW : I3DNow<0x0D, MRM1m, (outs), (ins i16mem:$addr), - "prefetchw $addr", []>; + "prefetchw\t$addr", []>; // "3DNowA" instructions defm PF2IW : I3DNow_conv_rm_int<0x1C, "pf2iw", "a">; diff --git a/lib/Target/X86/X86InstrArithmetic.td b/lib/Target/X86/X86InstrArithmetic.td index 0eecd5f..d86a406 100644 --- a/lib/Target/X86/X86InstrArithmetic.td +++ b/lib/Target/X86/X86InstrArithmetic.td @@ -29,11 +29,11 @@ def LEA32r : I<0x8D, MRMSrcMem, def LEA64_32r : I<0x8D, MRMSrcMem, (outs GR32:$dst), (ins lea64_32mem:$src), "lea{l}\t{$src|$dst}, {$dst|$src}", - [(set GR32:$dst, lea32addr:$src)], IIC_LEA>, + [(set GR32:$dst, lea64_32addr:$src)], IIC_LEA>, Requires<[In64BitMode]>; let isReMaterializable = 1 in -def LEA64r : RI<0x8D, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), +def LEA64r : RI<0x8D, MRMSrcMem, (outs GR64:$dst), (ins lea64mem:$src), "lea{q}\t{$src|$dst}, {$dst|$src}", [(set GR64:$dst, lea64addr:$src)], IIC_LEA>; @@ -1256,3 +1256,49 @@ let Predicates = [HasBMI2] in { let Uses = [RDX] in defm MULX64 : bmi_mulx<"mulx{q}", GR64, i64mem>, VEX_W; } + +//===----------------------------------------------------------------------===// +// ADCX Instruction +// +let hasSideEffects = 0, Predicates = [HasADX], Defs = [EFLAGS] in { + def ADCX32rr : I<0xF6, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), + "adcx{l}\t{$src, $dst|$dst, $src}", + [], IIC_BIN_NONMEM>, T8, OpSize; + + def ADCX64rr : I<0xF6, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), + "adcx{q}\t{$src, $dst|$dst, $src}", + [], IIC_BIN_NONMEM>, T8, OpSize, REX_W, Requires<[In64BitMode]>; + + let mayLoad = 1 in { + def ADCX32rm : I<0xF6, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), + "adcx{l}\t{$src, $dst|$dst, $src}", + [], IIC_BIN_MEM>, T8, OpSize; + + def ADCX64rm : I<0xF6, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), + "adcx{q}\t{$src, $dst|$dst, $src}", + [], IIC_BIN_MEM>, T8, OpSize, REX_W, Requires<[In64BitMode]>; + } +} + +//===----------------------------------------------------------------------===// +// ADOX Instruction +// +let hasSideEffects = 0, Predicates = [HasADX], Defs = [EFLAGS] in { + def ADOX32rr : I<0xF6, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), + "adox{l}\t{$src, $dst|$dst, $src}", + [], 
IIC_BIN_NONMEM>, T8XS; + + def ADOX64rr : I<0xF6, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), + "adox{q}\t{$src, $dst|$dst, $src}", + [], IIC_BIN_NONMEM>, T8XS, REX_W, Requires<[In64BitMode]>; + + let mayLoad = 1 in { + def ADOX32rm : I<0xF6, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), + "adox{l}\t{$src, $dst|$dst, $src}", + [], IIC_BIN_MEM>, T8XS; + + def ADOX64rm : I<0xF6, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), + "adox{q}\t{$src, $dst|$dst, $src}", + [], IIC_BIN_MEM>, T8XS, REX_W, Requires<[In64BitMode]>; + } +} diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td index 2a26a22..734e598 100644 --- a/lib/Target/X86/X86InstrCompiler.td +++ b/lib/Target/X86/X86InstrCompiler.td @@ -513,15 +513,19 @@ def CMOV_RFP80 : I<0, Pseudo, multiclass PSEUDO_ATOMIC_LOAD_BINOP<string mnemonic> { let usesCustomInserter = 1, mayLoad = 1, mayStore = 1 in { + let Defs = [EFLAGS, AL] in def NAME#8 : I<0, Pseudo, (outs GR8:$dst), (ins i8mem:$ptr, GR8:$val), !strconcat(mnemonic, "8 PSEUDO!"), []>; + let Defs = [EFLAGS, AX] in def NAME#16 : I<0, Pseudo,(outs GR16:$dst), (ins i16mem:$ptr, GR16:$val), !strconcat(mnemonic, "16 PSEUDO!"), []>; + let Defs = [EFLAGS, EAX] in def NAME#32 : I<0, Pseudo, (outs GR32:$dst), (ins i32mem:$ptr, GR32:$val), !strconcat(mnemonic, "32 PSEUDO!"), []>; + let Defs = [EFLAGS, RAX] in def NAME#64 : I<0, Pseudo, (outs GR64:$dst), (ins i64mem:$ptr, GR64:$val), !strconcat(mnemonic, "64 PSEUDO!"), []>; @@ -559,7 +563,8 @@ defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMUMAX", "atomic_load_umax">; defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMUMIN", "atomic_load_umin">; multiclass PSEUDO_ATOMIC_LOAD_BINOP6432<string mnemonic> { - let usesCustomInserter = 1, mayLoad = 1, mayStore = 1, hasSideEffects = 0 in + let usesCustomInserter = 1, Defs = [EFLAGS, EAX, EDX], + mayLoad = 1, mayStore = 1, hasSideEffects = 0 in def NAME#6432 : I<0, Pseudo, (outs GR32:$dst1, GR32:$dst2), (ins i64mem:$ptr, GR32:$val1, GR32:$val2), !strconcat(mnemonic, "6432 PSEUDO!"), []>; @@ -1076,12 +1081,14 @@ def : Pat<(X86cmp GR64:$src1, 0), // inverted. 
multiclass CMOVmr<PatLeaf InvertedCond, Instruction Inst16, Instruction Inst32, Instruction Inst64> { - def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, InvertedCond, EFLAGS), - (Inst16 GR16:$src2, addr:$src1)>; - def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, InvertedCond, EFLAGS), - (Inst32 GR32:$src2, addr:$src1)>; - def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, InvertedCond, EFLAGS), - (Inst64 GR64:$src2, addr:$src1)>; + let Predicates = [HasCMov] in { + def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, InvertedCond, EFLAGS), + (Inst16 GR16:$src2, addr:$src1)>; + def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, InvertedCond, EFLAGS), + (Inst32 GR32:$src2, addr:$src1)>; + def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, InvertedCond, EFLAGS), + (Inst64 GR64:$src2, addr:$src1)>; + } } defm : CMOVmr<X86_COND_B , CMOVAE16rm, CMOVAE32rm, CMOVAE64rm>; diff --git a/lib/Target/X86/X86InstrFMA.td b/lib/Target/X86/X86InstrFMA.td index f48f133..7759a8a 100644 --- a/lib/Target/X86/X86InstrFMA.td +++ b/lib/Target/X86/X86InstrFMA.td @@ -60,14 +60,14 @@ multiclass fma3p_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231, PatFrag MemFrag128, PatFrag MemFrag256, SDNode Op, ValueType OpTy128, ValueType OpTy256> { defm r213 : fma3p_rm<opc213, - !strconcat(OpcodeStr, !strconcat("213", PackTy)), + !strconcat(OpcodeStr, "213", PackTy), MemFrag128, MemFrag256, OpTy128, OpTy256, Op>; let neverHasSideEffects = 1 in { defm r132 : fma3p_rm<opc132, - !strconcat(OpcodeStr, !strconcat("132", PackTy)), + !strconcat(OpcodeStr, "132", PackTy), MemFrag128, MemFrag256, OpTy128, OpTy256>; defm r231 : fma3p_rm<opc231, - !strconcat(OpcodeStr, !strconcat("231", PackTy)), + !strconcat(OpcodeStr, "231", PackTy), MemFrag128, MemFrag256, OpTy128, OpTy256>; } // neverHasSideEffects = 1 } @@ -160,15 +160,15 @@ multiclass fma3s_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231, X86MemOperand x86memop, Operand memop, PatFrag mem_frag, ComplexPattern mem_cpat> { let neverHasSideEffects = 1 in { - defm r132 : fma3s_rm<opc132, !strconcat(OpStr, !strconcat("132", PackTy)), + defm r132 : fma3s_rm<opc132, !strconcat(OpStr, "132", PackTy), x86memop, RC, OpVT, mem_frag>; - defm r231 : fma3s_rm<opc231, !strconcat(OpStr, !strconcat("231", PackTy)), + defm r231 : fma3s_rm<opc231, !strconcat(OpStr, "231", PackTy), x86memop, RC, OpVT, mem_frag>; } -defm r213 : fma3s_rm<opc213, !strconcat(OpStr, !strconcat("213", PackTy)), +defm r213 : fma3s_rm<opc213, !strconcat(OpStr, "213", PackTy), x86memop, RC, OpVT, mem_frag, OpNode>, - fma3s_rm_int<opc213, !strconcat(OpStr, !strconcat("213", PackTy)), + fma3s_rm_int<opc213, !strconcat(OpStr, "213", PackTy), memop, mem_cpat, Int, RC>; } diff --git a/lib/Target/X86/X86InstrFormats.td b/lib/Target/X86/X86InstrFormats.td index 6151d5c..44e574d 100644 --- a/lib/Target/X86/X86InstrFormats.td +++ b/lib/Target/X86/X86InstrFormats.td @@ -570,7 +570,7 @@ class FMA3<bits<8> o, Format F, dag outs, dag ins, string asm, // FMA4 Instruction Templates class FMA4<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag>pattern, InstrItinClass itin = IIC_DEFAULT> - : I<o, F, outs, ins, asm, pattern, itin>, TA, + : Ii8<o, F, outs, ins, asm, pattern, itin>, TA, OpSize, VEX_4V, VEX_I8IMM, Requires<[HasFMA4]>; // XOP 2, 3 and 4 Operand Instruction Template diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td index 7025e93..2a72fb6 100644 --- a/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -160,7 +160,7 @@ 
def SDTBlend : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, def SDTFma : SDTypeProfile<1, 3, [SDTCisSameAs<0,1>, SDTCisSameAs<1,2>, SDTCisSameAs<1,3>]>; -def X86PAlign : SDNode<"X86ISD::PALIGN", SDTShuff3OpI>; +def X86PAlignr : SDNode<"X86ISD::PALIGNR", SDTShuff3OpI>; def X86PShufd : SDNode<"X86ISD::PSHUFD", SDTShuff2OpI>; def X86PShufhw : SDNode<"X86ISD::PSHUFHW", SDTShuff2OpI>; diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td index 9ecf5e2..d989ec7 100644 --- a/lib/Target/X86/X86InstrInfo.td +++ b/lib/Target/X86/X86InstrInfo.td @@ -525,6 +525,13 @@ def lea64_32mem : Operand<i32> { let ParserMatchClass = X86MemAsmOperand; } +// Memory operands that use 64-bit pointers in both ILP32 and LP64. +def lea64mem : Operand<i64> { + let PrintMethod = "printi64mem"; + let MIOperandInfo = (ops GR64, i8imm, GR64_NOSP, i32imm, i8imm); + let ParserMatchClass = X86MemAsmOperand; +} + //===----------------------------------------------------------------------===// // X86 Complex Pattern Definitions. @@ -535,6 +542,12 @@ def addr : ComplexPattern<iPTR, 5, "SelectAddr", [], [SDNPWantParent]>; def lea32addr : ComplexPattern<i32, 5, "SelectLEAAddr", [add, sub, mul, X86mul_imm, shl, or, frameindex], []>; +// In 64-bit mode 32-bit LEAs can use RIP-relative addressing. +def lea64_32addr : ComplexPattern<i32, 5, "SelectLEAAddr", + [add, sub, mul, X86mul_imm, shl, or, + frameindex, X86WrapperRIP], + []>; + def tls32addr : ComplexPattern<i32, 5, "SelectTLSADDRAddr", [tglobaltlsaddr], []>; @@ -590,6 +603,7 @@ def HasLZCNT : Predicate<"Subtarget->hasLZCNT()">; def HasBMI : Predicate<"Subtarget->hasBMI()">; def HasBMI2 : Predicate<"Subtarget->hasBMI2()">; def HasRTM : Predicate<"Subtarget->hasRTM()">; +def HasADX : Predicate<"Subtarget->hasADX()">; def FPStackf32 : Predicate<"!Subtarget->hasSSE1()">; def FPStackf64 : Predicate<"!Subtarget->hasSSE2()">; def HasCmpxchg16b: Predicate<"Subtarget->hasCmpxchg16b()">; @@ -856,16 +870,14 @@ let Defs = [RSP], Uses = [RSP, EFLAGS], mayStore = 1, neverHasSideEffects=1 in def PUSHF64 : I<0x9C, RawFrm, (outs), (ins), "pushfq", [], IIC_PUSH_F>, Requires<[In64BitMode]>; - - let Defs = [EDI, ESI, EBP, EBX, EDX, ECX, EAX, ESP], Uses = [ESP], mayLoad=1, neverHasSideEffects=1 in { -def POPA32 : I<0x61, RawFrm, (outs), (ins), "popa{l}", [], IIC_POP_A>, +def POPA32 : I<0x61, RawFrm, (outs), (ins), "popa{l|d}", [], IIC_POP_A>, Requires<[In32BitMode]>; } let Defs = [ESP], Uses = [EDI, ESI, EBP, EBX, EDX, ECX, EAX, ESP], mayStore=1, neverHasSideEffects=1 in { -def PUSHA32 : I<0x60, RawFrm, (outs), (ins), "pusha{l}", [], IIC_PUSH_A>, +def PUSHA32 : I<0x60, RawFrm, (outs), (ins), "pusha{l|d}", [], IIC_PUSH_A>, Requires<[In32BitMode]>; } diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 3175324..0979752 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -436,93 +436,69 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, // in terms of a copy, and just mentioned, we don't use movss/movsd for copies. 
//===----------------------------------------------------------------------===// -class sse12_move_rr<RegisterClass RC, SDNode OpNode, ValueType vt, string asm> : - SI<0x10, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, RC:$src2), asm, - [(set VR128:$dst, (vt (OpNode VR128:$src1, - (scalar_to_vector RC:$src2))))], - IIC_SSE_MOV_S_RR>; +multiclass sse12_move_rr<RegisterClass RC, SDNode OpNode, ValueType vt, + X86MemOperand x86memop, string base_opc, + string asm_opr> { + def rr : SI<0x10, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, RC:$src2), + !strconcat(base_opc, asm_opr), + [(set VR128:$dst, (vt (OpNode VR128:$src1, + (scalar_to_vector RC:$src2))))], + IIC_SSE_MOV_S_RR>; -// Loading from memory automatically zeroing upper bits. -class sse12_move_rm<RegisterClass RC, X86MemOperand x86memop, - PatFrag mem_pat, string OpcodeStr> : - SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), - !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set RC:$dst, (mem_pat addr:$src))], - IIC_SSE_MOV_S_RM>; - -// AVX -def VMOVSSrr : sse12_move_rr<FR32, X86Movss, v4f32, - "movss\t{$src2, $src1, $dst|$dst, $src1, $src2}">, XS, VEX_4V, - VEX_LIG; -def VMOVSDrr : sse12_move_rr<FR64, X86Movsd, v2f64, - "movsd\t{$src2, $src1, $dst|$dst, $src1, $src2}">, XD, VEX_4V, - VEX_LIG; - -// For the disassembler -let isCodeGenOnly = 1, hasSideEffects = 0 in { - def VMOVSSrr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst), - (ins VR128:$src1, FR32:$src2), - "movss\t{$src2, $src1, $dst|$dst, $src1, $src2}", [], - IIC_SSE_MOV_S_RR>, - XS, VEX_4V, VEX_LIG; - def VMOVSDrr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst), - (ins VR128:$src1, FR64:$src2), - "movsd\t{$src2, $src1, $dst|$dst, $src1, $src2}", [], - IIC_SSE_MOV_S_RR>, - XD, VEX_4V, VEX_LIG; + // For the disassembler + let isCodeGenOnly = 1, hasSideEffects = 0 in + def rr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst), + (ins VR128:$src1, RC:$src2), + !strconcat(base_opc, asm_opr), + [], IIC_SSE_MOV_S_RR>; } -let canFoldAsLoad = 1, isReMaterializable = 1 in { - def VMOVSSrm : sse12_move_rm<FR32, f32mem, loadf32, "movss">, XS, VEX, - VEX_LIG; - let AddedComplexity = 20 in - def VMOVSDrm : sse12_move_rm<FR64, f64mem, loadf64, "movsd">, XD, VEX, - VEX_LIG; -} +multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt, + X86MemOperand x86memop, string OpcodeStr> { + // AVX + defm V#NAME : sse12_move_rr<RC, OpNode, vt, x86memop, OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}">, + VEX_4V, VEX_LIG; -def VMOVSSmr : SI<0x11, MRMDestMem, (outs), (ins f32mem:$dst, FR32:$src), - "movss\t{$src, $dst|$dst, $src}", - [(store FR32:$src, addr:$dst)], IIC_SSE_MOV_S_MR>, - XS, VEX, VEX_LIG; -def VMOVSDmr : SI<0x11, MRMDestMem, (outs), (ins f64mem:$dst, FR64:$src), - "movsd\t{$src, $dst|$dst, $src}", - [(store FR64:$src, addr:$dst)], IIC_SSE_MOV_S_MR>, - XD, VEX, VEX_LIG; + def V#NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(store RC:$src, addr:$dst)], IIC_SSE_MOV_S_MR>, + VEX, VEX_LIG; + // SSE1 & 2 + let Constraints = "$src1 = $dst" in { + defm NAME : sse12_move_rr<RC, OpNode, vt, x86memop, OpcodeStr, + "\t{$src2, $dst|$dst, $src2}">; + } -// SSE1 & 2 -let Constraints = "$src1 = $dst" in { - def MOVSSrr : sse12_move_rr<FR32, X86Movss, v4f32, - "movss\t{$src2, $dst|$dst, $src2}">, XS; - def MOVSDrr : sse12_move_rr<FR64, X86Movsd, v2f64, - "movsd\t{$src2, $dst|$dst, $src2}">, XD; + def NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src), + !strconcat(OpcodeStr, 
"\t{$src, $dst|$dst, $src}"), + [(store RC:$src, addr:$dst)], IIC_SSE_MOV_S_MR>; +} - // For the disassembler - let isCodeGenOnly = 1, hasSideEffects = 0 in { - def MOVSSrr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst), - (ins VR128:$src1, FR32:$src2), - "movss\t{$src2, $dst|$dst, $src2}", [], - IIC_SSE_MOV_S_RR>, XS; - def MOVSDrr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst), - (ins VR128:$src1, FR64:$src2), - "movsd\t{$src2, $dst|$dst, $src2}", [], - IIC_SSE_MOV_S_RR>, XD; - } +// Loading from memory automatically zeroing upper bits. +multiclass sse12_move_rm<RegisterClass RC, X86MemOperand x86memop, + PatFrag mem_pat, string OpcodeStr> { + def V#NAME#rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set RC:$dst, (mem_pat addr:$src))], + IIC_SSE_MOV_S_RM>, VEX, VEX_LIG; + def NAME#rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set RC:$dst, (mem_pat addr:$src))], + IIC_SSE_MOV_S_RM>; } +defm MOVSS : sse12_move<FR32, X86Movss, v4f32, f32mem, "movss">, XS; +defm MOVSD : sse12_move<FR64, X86Movsd, v2f64, f64mem, "movsd">, XD; + let canFoldAsLoad = 1, isReMaterializable = 1 in { - def MOVSSrm : sse12_move_rm<FR32, f32mem, loadf32, "movss">, XS; + defm MOVSS : sse12_move_rm<FR32, f32mem, loadf32, "movss">, XS; let AddedComplexity = 20 in - def MOVSDrm : sse12_move_rm<FR64, f64mem, loadf64, "movsd">, XD; + defm MOVSD : sse12_move_rm<FR64, f64mem, loadf64, "movsd">, XD; } -def MOVSSmr : SSI<0x11, MRMDestMem, (outs), (ins f32mem:$dst, FR32:$src), - "movss\t{$src, $dst|$dst, $src}", - [(store FR32:$src, addr:$dst)], IIC_SSE_MOV_S_MR>; -def MOVSDmr : SDI<0x11, MRMDestMem, (outs), (ins f64mem:$dst, FR64:$src), - "movsd\t{$src, $dst|$dst, $src}", - [(store FR64:$src, addr:$dst)], IIC_SSE_MOV_S_MR>; - // Patterns let Predicates = [HasAVX] in { let AddedComplexity = 15 in { @@ -1110,34 +1086,41 @@ def FsMOVAPDrm : PDI<0x28, MRMSrcMem, (outs FR64:$dst), (ins f128mem:$src), // SSE 1 & 2 - Move Low packed FP Instructions //===----------------------------------------------------------------------===// -multiclass sse12_mov_hilo_packed<bits<8>opc, RegisterClass RC, - SDNode psnode, SDNode pdnode, string base_opc, - string asm_opr, InstrItinClass itin> { +multiclass sse12_mov_hilo_packed_base<bits<8>opc, SDNode psnode, SDNode pdnode, + string base_opc, string asm_opr, + InstrItinClass itin> { def PSrm : PI<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2), !strconcat(base_opc, "s", asm_opr), - [(set RC:$dst, - (psnode RC:$src1, + [(set VR128:$dst, + (psnode VR128:$src1, (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))))], itin, SSEPackedSingle>, TB; def PDrm : PI<opc, MRMSrcMem, - (outs RC:$dst), (ins RC:$src1, f64mem:$src2), + (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2), !strconcat(base_opc, "d", asm_opr), - [(set RC:$dst, (v2f64 (pdnode RC:$src1, + [(set VR128:$dst, (v2f64 (pdnode VR128:$src1, (scalar_to_vector (loadf64 addr:$src2)))))], itin, SSEPackedDouble>, TB, OpSize; + } -let AddedComplexity = 20 in { - defm VMOVL : sse12_mov_hilo_packed<0x12, VR128, X86Movlps, X86Movlpd, "movlp", - "\t{$src2, $src1, $dst|$dst, $src1, $src2}", - IIC_SSE_MOV_LH>, VEX_4V; +multiclass sse12_mov_hilo_packed<bits<8>opc, SDNode psnode, SDNode pdnode, + string base_opc, InstrItinClass itin> { + defm V#NAME : sse12_mov_hilo_packed_base<opc, psnode, pdnode, base_opc, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}", + itin>, VEX_4V; + +let Constraints = 
"$src1 = $dst" in + defm NAME : sse12_mov_hilo_packed_base<opc, psnode, pdnode, base_opc, + "\t{$src2, $dst|$dst, $src2}", + itin>; } -let Constraints = "$src1 = $dst", AddedComplexity = 20 in { - defm MOVL : sse12_mov_hilo_packed<0x12, VR128, X86Movlps, X86Movlpd, "movlp", - "\t{$src2, $dst|$dst, $src2}", - IIC_SSE_MOV_LH>; + +let AddedComplexity = 20 in { + defm MOVL : sse12_mov_hilo_packed<0x12, X86Movlps, X86Movlpd, "movlp", + IIC_SSE_MOV_LH>; } def VMOVLPSmr : VPSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), @@ -1235,14 +1218,8 @@ let Predicates = [UseSSE2] in { //===----------------------------------------------------------------------===// let AddedComplexity = 20 in { - defm VMOVH : sse12_mov_hilo_packed<0x16, VR128, X86Movlhps, X86Movlhpd, "movhp", - "\t{$src2, $src1, $dst|$dst, $src1, $src2}", - IIC_SSE_MOV_LH>, VEX_4V; -} -let Constraints = "$src1 = $dst", AddedComplexity = 20 in { - defm MOVH : sse12_mov_hilo_packed<0x16, VR128, X86Movlhps, X86Movlhpd, "movhp", - "\t{$src2, $dst|$dst, $src2}", - IIC_SSE_MOV_LH>; + defm MOVH : sse12_mov_hilo_packed<0x16, X86Movlhps, X86Movlhpd, "movhp", + IIC_SSE_MOV_LH>; } // v2f64 extract element 1 is always custom lowered to unpack high to low @@ -3012,18 +2989,18 @@ multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, let Predicates = [HasAVX], hasSideEffects = 0 in { def V#NAME#SSr : SSI<opc, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src1, FR32:$src2), - !strconcat(!strconcat("v", OpcodeStr), + !strconcat("v", OpcodeStr, "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, VEX_4V, VEX_LIG; let mayLoad = 1 in { def V#NAME#SSm : SSI<opc, MRMSrcMem, (outs FR32:$dst), (ins FR32:$src1,f32mem:$src2), - !strconcat(!strconcat("v", OpcodeStr), + !strconcat("v", OpcodeStr, "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, VEX_4V, VEX_LIG; def V#NAME#SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2), - !strconcat(!strconcat("v", OpcodeStr), + !strconcat("v", OpcodeStr, "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, VEX_4V, VEX_LIG; } @@ -3054,18 +3031,18 @@ multiclass sse1_fp_unop_rw<bits<8> opc, string OpcodeStr, SDNode OpNode, let Predicates = [HasAVX], hasSideEffects = 0 in { def V#NAME#SSr : SSI<opc, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src1, FR32:$src2), - !strconcat(!strconcat("v", OpcodeStr), + !strconcat("v", OpcodeStr, "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, VEX_4V, VEX_LIG; let mayLoad = 1 in { def V#NAME#SSm : SSI<opc, MRMSrcMem, (outs FR32:$dst), (ins FR32:$src1,f32mem:$src2), - !strconcat(!strconcat("v", OpcodeStr), + !strconcat("v", OpcodeStr, "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, VEX_4V, VEX_LIG; def V#NAME#SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2), - !strconcat(!strconcat("v", OpcodeStr), + !strconcat("v", OpcodeStr, "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, VEX_4V, VEX_LIG; } @@ -3100,22 +3077,22 @@ multiclass sse1_fp_unop_p<bits<8> opc, string OpcodeStr, SDNode OpNode, OpndItins itins> { let Predicates = [HasAVX] in { def V#NAME#PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - !strconcat(!strconcat("v", OpcodeStr), + !strconcat("v", OpcodeStr, "ps\t{$src, $dst|$dst, $src}"), [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))], itins.rr>, VEX; def V#NAME#PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), - !strconcat(!strconcat("v", OpcodeStr), + !strconcat("v", OpcodeStr, "ps\t{$src, $dst|$dst, $src}"), [(set VR128:$dst, (OpNode (memopv4f32 addr:$src)))], itins.rm>, VEX; def 
V#NAME#PSYr : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), - !strconcat(!strconcat("v", OpcodeStr), + !strconcat("v", OpcodeStr, "ps\t{$src, $dst|$dst, $src}"), [(set VR256:$dst, (v8f32 (OpNode VR256:$src)))], itins.rr>, VEX, VEX_L; def V#NAME#PSYm : PSI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), - !strconcat(!strconcat("v", OpcodeStr), + !strconcat("v", OpcodeStr, "ps\t{$src, $dst|$dst, $src}"), [(set VR256:$dst, (OpNode (memopv8f32 addr:$src)))], itins.rm>, VEX, VEX_L; @@ -3135,23 +3112,23 @@ multiclass sse1_fp_unop_p_int<bits<8> opc, string OpcodeStr, OpndItins itins> { let Predicates = [HasAVX] in { def V#NAME#PSr_Int : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - !strconcat(!strconcat("v", OpcodeStr), + !strconcat("v", OpcodeStr, "ps\t{$src, $dst|$dst, $src}"), [(set VR128:$dst, (V4F32Int VR128:$src))], itins.rr>, VEX; def V#NAME#PSm_Int : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), - !strconcat(!strconcat("v", OpcodeStr), + !strconcat("v", OpcodeStr, "ps\t{$src, $dst|$dst, $src}"), [(set VR128:$dst, (V4F32Int (memopv4f32 addr:$src)))], itins.rm>, VEX; def V#NAME#PSYr_Int : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), - !strconcat(!strconcat("v", OpcodeStr), + !strconcat("v", OpcodeStr, "ps\t{$src, $dst|$dst, $src}"), [(set VR256:$dst, (V8F32Int VR256:$src))], itins.rr>, VEX, VEX_L; def V#NAME#PSYm_Int : PSI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), - !strconcat(!strconcat("v", OpcodeStr), + !strconcat("v", OpcodeStr, "ps\t{$src, $dst|$dst, $src}"), [(set VR256:$dst, (V8F32Int (memopv8f32 addr:$src)))], itins.rm>, VEX, VEX_L; @@ -3173,18 +3150,18 @@ multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, let Predicates = [HasAVX], hasSideEffects = 0 in { def V#NAME#SDr : SDI<opc, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src1, FR64:$src2), - !strconcat(!strconcat("v", OpcodeStr), + !strconcat("v", OpcodeStr, "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, VEX_4V, VEX_LIG; let mayLoad = 1 in { def V#NAME#SDm : SDI<opc, MRMSrcMem, (outs FR64:$dst), (ins FR64:$src1,f64mem:$src2), - !strconcat(!strconcat("v", OpcodeStr), + !strconcat("v", OpcodeStr, "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, VEX_4V, VEX_LIG; def V#NAME#SDm_Int : SDI<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2), - !strconcat(!strconcat("v", OpcodeStr), + !strconcat("v", OpcodeStr, "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, VEX_4V, VEX_LIG; } @@ -3211,22 +3188,22 @@ multiclass sse2_fp_unop_p<bits<8> opc, string OpcodeStr, SDNode OpNode, OpndItins itins> { let Predicates = [HasAVX] in { def V#NAME#PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - !strconcat(!strconcat("v", OpcodeStr), + !strconcat("v", OpcodeStr, "pd\t{$src, $dst|$dst, $src}"), [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))], itins.rr>, VEX; def V#NAME#PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), - !strconcat(!strconcat("v", OpcodeStr), + !strconcat("v", OpcodeStr, "pd\t{$src, $dst|$dst, $src}"), [(set VR128:$dst, (OpNode (memopv2f64 addr:$src)))], itins.rm>, VEX; def V#NAME#PDYr : PDI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), - !strconcat(!strconcat("v", OpcodeStr), + !strconcat("v", OpcodeStr, "pd\t{$src, $dst|$dst, $src}"), [(set VR256:$dst, (v4f64 (OpNode VR256:$src)))], itins.rr>, VEX, VEX_L; def V#NAME#PDYm : PDI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), - !strconcat(!strconcat("v", OpcodeStr), + !strconcat("v", OpcodeStr, "pd\t{$src, $dst|$dst, $src}"), [(set VR256:$dst, 
(OpNode (memopv4f64 addr:$src)))], itins.rm>, VEX, VEX_L; @@ -3985,14 +3962,14 @@ multiclass sse2_pshuffle<string OpcodeStr, ValueType vt128, ValueType vt256, let Predicates = [HasAVX] in { def V#NAME#ri : Ii8<0x70, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, i8imm:$src2), - !strconcat(!strconcat("v", OpcodeStr), + !strconcat("v", OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))], IIC_SSE_PSHUF>, VEX; def V#NAME#mi : Ii8<0x70, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src1, i8imm:$src2), - !strconcat(!strconcat("v", OpcodeStr), + !strconcat("v", OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, (vt128 (OpNode (bitconvert (memopv2i64 addr:$src1)), @@ -4002,14 +3979,14 @@ let Predicates = [HasAVX] in { let Predicates = [HasAVX2] in { def V#NAME#Yri : Ii8<0x70, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src1, i8imm:$src2), - !strconcat(!strconcat("v", OpcodeStr), + !strconcat("v", OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR256:$dst, (vt256 (OpNode VR256:$src1, (i8 imm:$src2))))], IIC_SSE_PSHUF>, VEX, VEX_L; def V#NAME#Ymi : Ii8<0x70, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src1, i8imm:$src2), - !strconcat(!strconcat("v", OpcodeStr), + !strconcat("v", OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR256:$dst, (vt256 (OpNode (bitconvert (memopv4i64 addr:$src1)), @@ -5190,7 +5167,7 @@ defm PMULHRSW : SS3I_binop_rm_int<0x0B, "pmulhrsw", // SSSE3 - Packed Align Instruction Patterns //===---------------------------------------------------------------------===// -multiclass ssse3_palign<string asm, bit Is2Addr = 1> { +multiclass ssse3_palignr<string asm, bit Is2Addr = 1> { let neverHasSideEffects = 1 in { def R128rr : SS3AI<0x0F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i8imm:$src3), @@ -5210,7 +5187,7 @@ multiclass ssse3_palign<string asm, bit Is2Addr = 1> { } } -multiclass ssse3_palign_y<string asm, bit Is2Addr = 1> { +multiclass ssse3_palignr_y<string asm, bit Is2Addr = 1> { let neverHasSideEffects = 1 in { def R256rr : SS3AI<0x0F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src1, VR256:$src2, i8imm:$src3), @@ -5227,42 +5204,42 @@ multiclass ssse3_palign_y<string asm, bit Is2Addr = 1> { } let Predicates = [HasAVX] in - defm VPALIGN : ssse3_palign<"vpalignr", 0>, VEX_4V; + defm VPALIGN : ssse3_palignr<"vpalignr", 0>, VEX_4V; let Predicates = [HasAVX2] in - defm VPALIGN : ssse3_palign_y<"vpalignr", 0>, VEX_4V, VEX_L; + defm VPALIGN : ssse3_palignr_y<"vpalignr", 0>, VEX_4V, VEX_L; let Constraints = "$src1 = $dst", Predicates = [UseSSSE3] in - defm PALIGN : ssse3_palign<"palignr">; + defm PALIGN : ssse3_palignr<"palignr">; let Predicates = [HasAVX2] in { -def : Pat<(v8i32 (X86PAlign VR256:$src1, VR256:$src2, (i8 imm:$imm))), +def : Pat<(v8i32 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))), (VPALIGNR256rr VR256:$src2, VR256:$src1, imm:$imm)>; -def : Pat<(v8f32 (X86PAlign VR256:$src1, VR256:$src2, (i8 imm:$imm))), +def : Pat<(v8f32 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))), (VPALIGNR256rr VR256:$src2, VR256:$src1, imm:$imm)>; -def : Pat<(v16i16 (X86PAlign VR256:$src1, VR256:$src2, (i8 imm:$imm))), +def : Pat<(v16i16 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))), (VPALIGNR256rr VR256:$src2, VR256:$src1, imm:$imm)>; -def : Pat<(v32i8 (X86PAlign VR256:$src1, VR256:$src2, (i8 imm:$imm))), +def : Pat<(v32i8 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))), (VPALIGNR256rr VR256:$src2, VR256:$src1, imm:$imm)>; } let 
Predicates = [HasAVX] in { -def : Pat<(v4i32 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))), +def : Pat<(v4i32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))), (VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>; -def : Pat<(v4f32 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))), +def : Pat<(v4f32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))), (VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>; -def : Pat<(v8i16 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))), +def : Pat<(v8i16 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))), (VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>; -def : Pat<(v16i8 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))), +def : Pat<(v16i8 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))), (VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>; } let Predicates = [UseSSSE3] in { -def : Pat<(v4i32 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))), +def : Pat<(v4i32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))), (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>; -def : Pat<(v4f32 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))), +def : Pat<(v4f32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))), (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>; -def : Pat<(v8i16 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))), +def : Pat<(v8i16 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))), (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>; -def : Pat<(v16i8 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))), +def : Pat<(v16i8 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))), (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>; } @@ -5590,6 +5567,30 @@ defm PMOVSXBQ : SS41I_binop_rm_int2<0x22, "pmovsxbq", int_x86_sse41_pmovsxbq>; defm PMOVZXBQ : SS41I_binop_rm_int2<0x32, "pmovzxbq", int_x86_sse41_pmovzxbq>; let Predicates = [HasAVX2] in { + def : Pat<(v16i16 (X86vsext (v16i8 VR128:$src))), (VPMOVSXBWYrr VR128:$src)>; + def : Pat<(v8i32 (X86vsext (v16i8 VR128:$src))), (VPMOVSXBDYrr VR128:$src)>; + def : Pat<(v4i64 (X86vsext (v16i8 VR128:$src))), (VPMOVSXBQYrr VR128:$src)>; + + def : Pat<(v8i32 (X86vsext (v8i16 VR128:$src))), (VPMOVSXWDYrr VR128:$src)>; + def : Pat<(v4i64 (X86vsext (v8i16 VR128:$src))), (VPMOVSXWQYrr VR128:$src)>; + + def : Pat<(v4i64 (X86vsext (v4i32 VR128:$src))), (VPMOVSXDQYrr VR128:$src)>; + + def : Pat<(v16i16 (X86vsext (v32i8 VR256:$src))), + (VPMOVSXBWYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>; + def : Pat<(v8i32 (X86vsext (v32i8 VR256:$src))), + (VPMOVSXBDYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>; + def : Pat<(v4i64 (X86vsext (v32i8 VR256:$src))), + (VPMOVSXBQYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>; + + def : Pat<(v8i32 (X86vsext (v16i16 VR256:$src))), + (VPMOVSXWDYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>; + def : Pat<(v4i64 (X86vsext (v16i16 VR256:$src))), + (VPMOVSXWQYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>; + + def : Pat<(v4i64 (X86vsext (v8i32 VR256:$src))), + (VPMOVSXDQYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>; + def : Pat<(v8i32 (X86vsmovl (v8i16 (bitconvert (v2i64 (load addr:$src)))))), (VPMOVSXWDYrm addr:$src)>; def : Pat<(v4i64 (X86vsmovl (v4i32 (bitconvert (v2i64 (load addr:$src)))))), @@ -5628,6 +5629,15 @@ let Predicates = [HasAVX] in { } let Predicates = [UseSSE41] in { + def : Pat<(v8i16 (X86vsext (v16i8 VR128:$src))), (PMOVSXBWrr VR128:$src)>; + def : Pat<(v4i32 (X86vsext (v16i8 VR128:$src))), (PMOVSXBDrr VR128:$src)>; + def : Pat<(v2i64 (X86vsext (v16i8 VR128:$src))), (PMOVSXBQrr VR128:$src)>; + + def : Pat<(v4i32 (X86vsext (v8i16 VR128:$src))), (PMOVSXWDrr 
VR128:$src)>; + def : Pat<(v2i64 (X86vsext (v8i16 VR128:$src))), (PMOVSXWQrr VR128:$src)>; + + def : Pat<(v2i64 (X86vsext (v4i32 VR128:$src))), (PMOVSXDQrr VR128:$src)>; + // Common patterns involving scalar load def : Pat<(int_x86_sse41_pmovsxbq (bitconvert (v4i32 (X86vzmovl @@ -5727,6 +5737,15 @@ let Predicates = [HasAVX] in { def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2i64 (X86vzload addr:$src)))))), (VPMOVZXDQrm addr:$src)>; + def : Pat<(v8i16 (X86vsext (v16i8 VR128:$src))), (VPMOVSXBWrr VR128:$src)>; + def : Pat<(v4i32 (X86vsext (v16i8 VR128:$src))), (VPMOVSXBDrr VR128:$src)>; + def : Pat<(v2i64 (X86vsext (v16i8 VR128:$src))), (VPMOVSXBQrr VR128:$src)>; + + def : Pat<(v4i32 (X86vsext (v8i16 VR128:$src))), (VPMOVSXWDrr VR128:$src)>; + def : Pat<(v2i64 (X86vsext (v8i16 VR128:$src))), (VPMOVSXWQrr VR128:$src)>; + + def : Pat<(v2i64 (X86vsext (v4i32 VR128:$src))), (VPMOVSXDQrr VR128:$src)>; + def : Pat<(v4i32 (X86vsext (v8i16 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))), (VPMOVSXWDrm addr:$src)>; diff --git a/lib/Target/X86/X86InstrSystem.td b/lib/Target/X86/X86InstrSystem.td index ea716bf..3caa1b5 100644 --- a/lib/Target/X86/X86InstrSystem.td +++ b/lib/Target/X86/X86InstrSystem.td @@ -352,11 +352,11 @@ def VERWm : I<0x00, MRM5m, (outs), (ins i16mem:$seg), // Descriptor-table support instructions def SGDT16m : I<0x01, MRM0m, (outs opaque48mem:$dst), (ins), - "sgdtw\t$dst", [], IIC_SGDT>, TB, OpSize, Requires<[In32BitMode]>; + "sgdt{w}\t$dst", [], IIC_SGDT>, TB, OpSize, Requires<[In32BitMode]>; def SGDTm : I<0x01, MRM0m, (outs opaque48mem:$dst), (ins), "sgdt\t$dst", [], IIC_SGDT>, TB; def SIDT16m : I<0x01, MRM1m, (outs opaque48mem:$dst), (ins), - "sidtw\t$dst", [], IIC_SIDT>, TB, OpSize, Requires<[In32BitMode]>; + "sidt{w}\t$dst", [], IIC_SIDT>, TB, OpSize, Requires<[In32BitMode]>; def SIDTm : I<0x01, MRM1m, (outs opaque48mem:$dst), (ins), "sidt\t$dst", []>, TB; def SLDT16r : I<0x00, MRM0r, (outs GR16:$dst), (ins), @@ -374,11 +374,11 @@ def SLDT64m : RI<0x00, MRM0m, (outs i16mem:$dst), (ins), "sldt{q}\t$dst", [], IIC_SLDT>, TB; def LGDT16m : I<0x01, MRM2m, (outs), (ins opaque48mem:$src), - "lgdtw\t$src", [], IIC_LGDT>, TB, OpSize, Requires<[In32BitMode]>; + "lgdt{w}\t$src", [], IIC_LGDT>, TB, OpSize, Requires<[In32BitMode]>; def LGDTm : I<0x01, MRM2m, (outs), (ins opaque48mem:$src), "lgdt\t$src", [], IIC_LGDT>, TB; def LIDT16m : I<0x01, MRM3m, (outs), (ins opaque48mem:$src), - "lidtw\t$src", [], IIC_LIDT>, TB, OpSize, Requires<[In32BitMode]>; + "lidt{w}\t$src", [], IIC_LIDT>, TB, OpSize, Requires<[In32BitMode]>; def LIDTm : I<0x01, MRM3m, (outs), (ins opaque48mem:$src), "lidt\t$src", [], IIC_LIDT>, TB; def LLDT16r : I<0x00, MRM2r, (outs), (ins GR16:$src), diff --git a/lib/Target/X86/X86JITInfo.cpp b/lib/Target/X86/X86JITInfo.cpp index cca391f..44d8cce 100644 --- a/lib/Target/X86/X86JITInfo.cpp +++ b/lib/Target/X86/X86JITInfo.cpp @@ -79,7 +79,7 @@ static TargetJITInfo::JITCompilerFn JITCompilerFunction; # define CFI(x) #endif -// Provide a wrapper for X86CompilationCallback2 that saves non-traditional +// Provide a wrapper for LLVMX86CompilationCallback2 that saves non-traditional // callee saved registers, for the fastcc calling convention. 
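// (The callback is now a plainly named external symbol rather than a
// file-static function: the hand-written assembly below reaches it through
// a "call", and presumably an LLVM-prefixed, library-visible definition is
// the most reliable way to guarantee the symbol survives into the object
// file on every host toolchain.)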
extern "C" { #if defined(X86_64_JIT) @@ -131,12 +131,12 @@ extern "C" { "subq $32, %rsp\n" "movq %rbp, %rcx\n" // Pass prev frame and return address "movq 8(%rbp), %rdx\n" - "call " ASMPREFIX "X86CompilationCallback2\n" + "call " ASMPREFIX "LLVMX86CompilationCallback2\n" "addq $32, %rsp\n" #else "movq %rbp, %rdi\n" // Pass prev frame and return address "movq 8(%rbp), %rsi\n" - "call " ASMPREFIX "X86CompilationCallback2\n" + "call " ASMPREFIX "LLVMX86CompilationCallback2\n" #endif // Restore all XMM arg registers "movaps 112(%rsp), %xmm7\n" @@ -213,7 +213,7 @@ extern "C" { "movl 4(%ebp), %eax\n" // Pass prev frame and return address "movl %eax, 4(%esp)\n" "movl %ebp, (%esp)\n" - "call " ASMPREFIX "X86CompilationCallback2\n" + "call " ASMPREFIX "LLVMX86CompilationCallback2\n" "movl %ebp, %esp\n" // Restore ESP CFI(".cfi_def_cfa_register %esp\n") "subl $12, %esp\n" @@ -269,7 +269,7 @@ extern "C" { "movl 4(%ebp), %eax\n" // Pass prev frame and return address "movl %eax, 4(%esp)\n" "movl %ebp, (%esp)\n" - "call " ASMPREFIX "X86CompilationCallback2\n" + "call " ASMPREFIX "LLVMX86CompilationCallback2\n" "addl $16, %esp\n" "movaps 48(%esp), %xmm3\n" CFI(".cfi_restore %xmm3\n") @@ -300,10 +300,7 @@ extern "C" { SIZE(X86CompilationCallback_SSE) ); # else - // the following function is called only from this translation unit, - // unless we are under 64bit Windows with MSC, where there is - // no support for inline assembly - static void X86CompilationCallback2(intptr_t *StackPtr, intptr_t RetAddr); + void LLVMX86CompilationCallback2(intptr_t *StackPtr, intptr_t RetAddr); _declspec(naked) void X86CompilationCallback(void) { __asm { @@ -317,7 +314,7 @@ extern "C" { mov eax, dword ptr [ebp+4] mov dword ptr [esp+4], eax mov dword ptr [esp], ebp - call X86CompilationCallback2 + call LLVMX86CompilationCallback2 mov esp, ebp sub esp, 12 pop ecx @@ -337,20 +334,17 @@ extern "C" { #endif } -/// X86CompilationCallback2 - This is the target-specific function invoked by the +/// This is the target-specific function invoked by the /// function stub when we did not know the real target of a call. This function /// must locate the start of the stub or call site and pass it into the JIT /// compiler function. extern "C" { -#if !(defined (X86_64_JIT) && defined(_MSC_VER)) - // the following function is called only from this translation unit, - // unless we are under 64bit Windows with MSC, where there is - // no support for inline assembly -static -#endif -void LLVM_ATTRIBUTE_USED -X86CompilationCallback2(intptr_t *StackPtr, intptr_t RetAddr) { +LLVM_LIBRARY_VISIBILITY void LLVMX86CompilationCallback2(intptr_t *StackPtr, + intptr_t RetAddr) { intptr_t *RetAddrLoc = &StackPtr[1]; + // We are reading raw stack data here. Tell MemorySanitizer that it is + // sufficiently initialized. + __msan_unpoison(RetAddrLoc, sizeof(*RetAddrLoc)); assert(*RetAddrLoc == RetAddr && "Could not find return address on the stack!"); @@ -517,7 +511,7 @@ void *X86JITInfo::emitFunctionStub(const Function* F, void *Target, // This used to use 0xCD, but that value is used by JITMemoryManager to // initialize the buffer with garbage, which means it may follow a - // noreturn function call, confusing X86CompilationCallback2. PR 4929. + // noreturn function call, confusing LLVMX86CompilationCallback2. PR 4929. JCE.emitByte(0xCE); // Interrupt - Just a marker identifying the stub! 
return Result; } diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp index 5a1e1b8..3af1b3e 100644 --- a/lib/Target/X86/X86MCInstLower.cpp +++ b/lib/Target/X86/X86MCInstLower.cpp @@ -239,7 +239,8 @@ static void lower_lea64_32mem(MCInst *MI, unsigned OpNo) { if (!MI->getOperand(OpNo+i).isReg()) continue; unsigned Reg = MI->getOperand(OpNo+i).getReg(); - if (Reg == 0) continue; + // LEAs can use RIP-relative addressing, and RIP has no sub/super register. + if (Reg == 0 || Reg == X86::RIP) continue; MI->getOperand(OpNo+i).setReg(getX86SubSuperRegister(Reg, MVT::i64)); } diff --git a/lib/Target/X86/X86PadShortFunction.cpp b/lib/Target/X86/X86PadShortFunction.cpp index c22872f..83e75ea 100644 --- a/lib/Target/X86/X86PadShortFunction.cpp +++ b/lib/Target/X86/X86PadShortFunction.cpp @@ -33,6 +33,19 @@ using namespace llvm; STATISTIC(NumBBsPadded, "Number of basic blocks padded"); namespace { + struct VisitedBBInfo { + // HasReturn - Whether the BB contains a return instruction + bool HasReturn; + + // Cycles - Number of cycles until return if HasReturn is true, otherwise + // number of cycles until end of the BB + unsigned int Cycles; + + VisitedBBInfo() : HasReturn(false), Cycles(0) {} + VisitedBBInfo(bool HasReturn, unsigned int Cycles) + : HasReturn(HasReturn), Cycles(Cycles) {} + }; + struct PadShortFunc : public MachineFunctionPass { static char ID; PadShortFunc() : MachineFunctionPass(ID) @@ -49,16 +62,21 @@ namespace { unsigned int Cycles = 0); bool cyclesUntilReturn(MachineBasicBlock *MBB, - unsigned int &Cycles, - MachineBasicBlock::iterator *Location = 0); + unsigned int &Cycles); void addPadding(MachineBasicBlock *MBB, MachineBasicBlock::iterator &MBBI, unsigned int NOOPsToAdd); const unsigned int Threshold; + + // ReturnBBs - Maps basic blocks that return to the minimum number of + // cycles until the return, starting from the entry block. DenseMap<MachineBasicBlock*, unsigned int> ReturnBBs; + // VisitedBBs - Cache of previously visited BBs. + DenseMap<MachineBasicBlock*, VisitedBBInfo> VisitedBBs; + const TargetMachine *TM; const TargetInstrInfo *TII; }; @@ -73,25 +91,26 @@ FunctionPass *llvm::createX86PadShortFunctions() { /// runOnMachineFunction - Loop over all of the basic blocks, inserting /// NOOP instructions before early exits. bool PadShortFunc::runOnMachineFunction(MachineFunction &MF) { - bool OptForSize = MF.getFunction()->getAttributes(). - hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize); - - if (OptForSize) + const AttributeSet &FnAttrs = MF.getFunction()->getAttributes(); + if (FnAttrs.hasAttribute(AttributeSet::FunctionIndex, + Attribute::OptimizeForSize) || + FnAttrs.hasAttribute(AttributeSet::FunctionIndex, + Attribute::MinSize)) { return false; + } TM = &MF.getTarget(); TII = TM->getInstrInfo(); // Search through basic blocks and mark the ones that have early returns ReturnBBs.clear(); + VisitedBBs.clear(); findReturns(MF.begin()); bool MadeChange = false; - MachineBasicBlock::iterator ReturnLoc; MachineBasicBlock *MBB; unsigned int Cycles = 0; - unsigned int BBCycles; // Pad the identified basic blocks with NOOPs for (DenseMap<MachineBasicBlock*, unsigned int>::iterator I = ReturnBBs.begin(); @@ -100,8 +119,16 @@ bool PadShortFunc::runOnMachineFunction(MachineFunction &MF) { Cycles = I->second; if (Cycles < Threshold) { - if (!cyclesUntilReturn(MBB, BBCycles, &ReturnLoc)) - continue; + // BB ends in a return. Skip over any DBG_VALUE instructions + // trailing the terminator. 
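  // (If ReturnLoc were left pointing at a DBG_VALUE, the padding would land
  // after the real RET and the isReturn() assertion below would fire in -g
  // builds, so the backwards walk past debug values is needed for
  // correctness, not just tidiness.)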
+ assert(MBB->size() > 0 && + "Basic block should contain at least a RET but is empty"); + MachineBasicBlock::iterator ReturnLoc = --MBB->end(); + + while (ReturnLoc->isDebugValue()) + --ReturnLoc; + assert(ReturnLoc->isReturn() && !ReturnLoc->isCall() && + "Basic block does not end with RET"); addPadding(MBB, ReturnLoc, Threshold - Cycles); NumBBsPadded++; @@ -127,18 +154,30 @@ void PadShortFunc::findReturns(MachineBasicBlock *MBB, unsigned int Cycles) { // Follow branches in BB and look for returns for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(); - I != MBB->succ_end(); ++I) { + I != MBB->succ_end(); ++I) { + if (*I == MBB) + continue; findReturns(*I, Cycles); } } -/// cyclesUntilReturn - if the MBB has a return instruction, set Location -/// to the instruction and return true. Return false otherwise. +/// cyclesUntilReturn - return true if the MBB has a return instruction, +/// and return false otherwise. /// Cycles will be incremented by the number of cycles taken to reach the /// return or the end of the BB, whichever occurs first. bool PadShortFunc::cyclesUntilReturn(MachineBasicBlock *MBB, - unsigned int &Cycles, - MachineBasicBlock::iterator *Location) { + unsigned int &Cycles) { + // Return cached result if BB was previously visited + DenseMap<MachineBasicBlock*, VisitedBBInfo>::iterator it + = VisitedBBs.find(MBB); + if (it != VisitedBBs.end()) { + VisitedBBInfo BBInfo = it->second; + Cycles += BBInfo.Cycles; + return BBInfo.HasReturn; + } + + unsigned int CyclesToEnd = 0; + for (MachineBasicBlock::iterator MBBI = MBB->begin(); MBBI != MBB->end(); ++MBBI) { MachineInstr *MI = MBBI; @@ -146,14 +185,16 @@ bool PadShortFunc::cyclesUntilReturn(MachineBasicBlock *MBB, // functions do not count because the called function will be padded, // if necessary. if (MI->isReturn() && !MI->isCall()) { - if (Location) - *Location = MBBI; + VisitedBBs[MBB] = VisitedBBInfo(true, CyclesToEnd); + Cycles += CyclesToEnd; return true; } - Cycles += TII->getInstrLatency(TM->getInstrItineraryData(), MI); + CyclesToEnd += TII->getInstrLatency(TM->getInstrItineraryData(), MI); } + VisitedBBs[MBB] = VisitedBBInfo(false, CyclesToEnd); + Cycles += CyclesToEnd; return false; } diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp index 58064b8..16886e4 100644 --- a/lib/Target/X86/X86RegisterInfo.cpp +++ b/lib/Target/X86/X86RegisterInfo.cpp @@ -50,7 +50,7 @@ ForceStackAlign("force-align-stack", " needed for the function."), cl::init(false), cl::Hidden); -cl::opt<bool> +static cl::opt<bool> EnableBasePointer("x86-use-base-pointer", cl::Hidden, cl::init(true), cl::desc("Enable use of a base pointer for complex stack frames")); @@ -177,20 +177,21 @@ X86RegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC) const{ const TargetRegisterClass * X86RegisterInfo::getPointerRegClass(const MachineFunction &MF, unsigned Kind) const { + const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>(); switch (Kind) { default: llvm_unreachable("Unexpected Kind in getPointerRegClass!"); case 0: // Normal GPRs. - if (TM.getSubtarget<X86Subtarget>().is64Bit()) + if (Subtarget.isTarget64BitLP64()) return &X86::GR64RegClass; return &X86::GR32RegClass; case 1: // Normal GPRs except the stack pointer (for encoding reasons). - if (TM.getSubtarget<X86Subtarget>().is64Bit()) + if (Subtarget.isTarget64BitLP64()) return &X86::GR64_NOSPRegClass; return &X86::GR32_NOSPRegClass; case 2: // Available for tailcall (not callee-saved GPRs). 
- if (TM.getSubtarget<X86Subtarget>().isTargetWin64()) + if (Subtarget.isTargetWin64()) return &X86::GR64_TCW64RegClass; - if (TM.getSubtarget<X86Subtarget>().is64Bit()) + else if (Subtarget.is64Bit()) return &X86::GR64_TCRegClass; const Function *F = MF.getFunction(); @@ -234,38 +235,40 @@ X86RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, const uint16_t * X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { - bool callsEHReturn = false; - bool ghcCall = false; - bool oclBiCall = false; - bool hipeCall = false; - bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX(); - - if (MF) { - callsEHReturn = MF->getMMI().callsEHReturn(); - const Function *F = MF->getFunction(); - ghcCall = (F ? F->getCallingConv() == CallingConv::GHC : false); - oclBiCall = (F ? F->getCallingConv() == CallingConv::Intel_OCL_BI : false); - hipeCall = (F ? F->getCallingConv() == CallingConv::HiPE : false); - } - - if (ghcCall || hipeCall) + switch (MF->getFunction()->getCallingConv()) { + case CallingConv::GHC: + case CallingConv::HiPE: return CSR_NoRegs_SaveList; - if (oclBiCall) { + + case CallingConv::Intel_OCL_BI: { + bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX(); if (HasAVX && IsWin64) - return CSR_Win64_Intel_OCL_BI_AVX_SaveList; + return CSR_Win64_Intel_OCL_BI_AVX_SaveList; if (HasAVX && Is64Bit) - return CSR_64_Intel_OCL_BI_AVX_SaveList; + return CSR_64_Intel_OCL_BI_AVX_SaveList; if (!HasAVX && !IsWin64 && Is64Bit) - return CSR_64_Intel_OCL_BI_SaveList; + return CSR_64_Intel_OCL_BI_SaveList; + break; } + + case CallingConv::Cold: + if (Is64Bit) + return CSR_MostRegs_64_SaveList; + break; + + default: + break; + } + + bool CallsEHReturn = MF->getMMI().callsEHReturn(); if (Is64Bit) { if (IsWin64) return CSR_Win64_SaveList; - if (callsEHReturn) + if (CallsEHReturn) return CSR_64EHRet_SaveList; return CSR_64_SaveList; } - if (callsEHReturn) + if (CallsEHReturn) return CSR_32EHRet_SaveList; return CSR_32_SaveList; } @@ -286,6 +289,8 @@ X86RegisterInfo::getCallPreservedMask(CallingConv::ID CC) const { return CSR_NoRegs_RegMask; if (!Is64Bit) return CSR_32_RegMask; + if (CC == CallingConv::Cold) + return CSR_MostRegs_64_RegMask; if (IsWin64) return CSR_Win64_RegMask; return CSR_64_RegMask; @@ -389,7 +394,13 @@ bool X86RegisterInfo::hasBasePointer(const MachineFunction &MF) const { // When we need stack realignment and there are dynamic allocas, we can't // reference off of the stack pointer, so we reserve a base pointer. - if (needsStackRealignment(MF) && MFI->hasVarSizedObjects()) + // + // This is also true if the function contain MS-style inline assembly. We + // do this because if any stack changes occur in the inline assembly, e.g., + // "pusha", then any C local variable or C argument references in the + // inline assembly will be wrong because the SP is not properly tracked. 
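  // A hypothetical illustration: given
  //   void f() { int x = 0; __asm { pusha } /* ... */ }
  // an ESP-relative address for 'x' formed around the 'pusha' is stale by
  // the size of the pushed registers, whereas a base-pointer reference is
  // unaffected.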
+ if ((needsStackRealignment(MF) && MFI->hasVarSizedObjects()) || + MF.hasMSInlineAsm()) return true; return false; @@ -440,123 +451,16 @@ bool X86RegisterInfo::hasReservedSpillSlot(const MachineFunction &MF, return false; } -static unsigned getSUBriOpcode(unsigned is64Bit, int64_t Imm) { - if (is64Bit) { - if (isInt<8>(Imm)) - return X86::SUB64ri8; - return X86::SUB64ri32; - } else { - if (isInt<8>(Imm)) - return X86::SUB32ri8; - return X86::SUB32ri; - } -} - -static unsigned getADDriOpcode(unsigned is64Bit, int64_t Imm) { - if (is64Bit) { - if (isInt<8>(Imm)) - return X86::ADD64ri8; - return X86::ADD64ri32; - } else { - if (isInt<8>(Imm)) - return X86::ADD32ri8; - return X86::ADD32ri; - } -} - -void X86RegisterInfo:: -eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, - MachineBasicBlock::iterator I) const { - const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); - bool reseveCallFrame = TFI->hasReservedCallFrame(MF); - int Opcode = I->getOpcode(); - bool isDestroy = Opcode == TII.getCallFrameDestroyOpcode(); - DebugLoc DL = I->getDebugLoc(); - uint64_t Amount = !reseveCallFrame ? I->getOperand(0).getImm() : 0; - uint64_t CalleeAmt = isDestroy ? I->getOperand(1).getImm() : 0; - I = MBB.erase(I); - - if (!reseveCallFrame) { - // If the stack pointer can be changed after prologue, turn the - // adjcallstackup instruction into a 'sub ESP, <amt>' and the - // adjcallstackdown instruction into 'add ESP, <amt>' - // TODO: consider using push / pop instead of sub + store / add - if (Amount == 0) - return; - - // We need to keep the stack aligned properly. To do this, we round the - // amount of space needed for the outgoing arguments up to the next - // alignment boundary. - unsigned StackAlign = TM.getFrameLowering()->getStackAlignment(); - Amount = (Amount + StackAlign - 1) / StackAlign * StackAlign; - - MachineInstr *New = 0; - if (Opcode == TII.getCallFrameSetupOpcode()) { - New = BuildMI(MF, DL, TII.get(getSUBriOpcode(Is64Bit, Amount)), - StackPtr) - .addReg(StackPtr) - .addImm(Amount); - } else { - assert(Opcode == TII.getCallFrameDestroyOpcode()); - - // Factor out the amount the callee already popped. - Amount -= CalleeAmt; - - if (Amount) { - unsigned Opc = getADDriOpcode(Is64Bit, Amount); - New = BuildMI(MF, DL, TII.get(Opc), StackPtr) - .addReg(StackPtr).addImm(Amount); - } - } - - if (New) { - // The EFLAGS implicit def is dead. - New->getOperand(3).setIsDead(); - - // Replace the pseudo instruction with a new instruction. - MBB.insert(I, New); - } - - return; - } - - if (Opcode == TII.getCallFrameDestroyOpcode() && CalleeAmt) { - // If we are performing frame pointer elimination and if the callee pops - // something off the stack pointer, add it back. We do this until we have - // more advanced stack pointer tracking ability. - unsigned Opc = getSUBriOpcode(Is64Bit, CalleeAmt); - MachineInstr *New = BuildMI(MF, DL, TII.get(Opc), StackPtr) - .addReg(StackPtr).addImm(CalleeAmt); - - // The EFLAGS implicit def is dead. - New->getOperand(3).setIsDead(); - - // We are not tracking the stack pointer adjustment by the callee, so make - // sure we restore the stack pointer immediately after the call, there may - // be spill code inserted between the CALL and ADJCALLSTACKUP instructions. 
- MachineBasicBlock::iterator B = MBB.begin(); - while (I != B && !llvm::prior(I)->isCall()) - --I; - MBB.insert(I, New); - } -} - void X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, - int SPAdj, RegScavenger *RS) const { + int SPAdj, unsigned FIOperandNum, + RegScavenger *RS) const { assert(SPAdj == 0 && "Unexpected"); - unsigned i = 0; MachineInstr &MI = *II; MachineFunction &MF = *MI.getParent()->getParent(); const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); - - while (!MI.getOperand(i).isFI()) { - ++i; - assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!"); - } - - int FrameIndex = MI.getOperand(i).getIndex(); + int FrameIndex = MI.getOperand(FIOperandNum).getIndex(); unsigned BasePtr; unsigned Opc = MI.getOpcode(); @@ -572,7 +476,7 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // This must be part of a four operand memory reference. Replace the // FrameIndex with base register with EBP. Add an offset to the offset. - MI.getOperand(i).ChangeToRegister(BasePtr, false); + MI.getOperand(FIOperandNum).ChangeToRegister(BasePtr, false); // Now add the frame object offset to the offset from EBP. int FIOffset; @@ -583,17 +487,18 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, } else FIOffset = TFI->getFrameIndexOffset(MF, FrameIndex); - if (MI.getOperand(i+3).isImm()) { + if (MI.getOperand(FIOperandNum+3).isImm()) { // Offset is a 32-bit integer. - int Imm = (int)(MI.getOperand(i + 3).getImm()); + int Imm = (int)(MI.getOperand(FIOperandNum + 3).getImm()); int Offset = FIOffset + Imm; assert((!Is64Bit || isInt<32>((long long)FIOffset + Imm)) && "Requesting 64-bit offset in 32-bit immediate!"); - MI.getOperand(i + 3).ChangeToImmediate(Offset); + MI.getOperand(FIOperandNum + 3).ChangeToImmediate(Offset); } else { // Offset is symbolic. This is extremely rare. - uint64_t Offset = FIOffset + (uint64_t)MI.getOperand(i+3).getOffset(); - MI.getOperand(i+3).setOffset(Offset); + uint64_t Offset = FIOffset + + (uint64_t)MI.getOperand(FIOperandNum+3).getOffset(); + MI.getOperand(FIOperandNum + 3).setOffset(Offset); } } @@ -618,7 +523,15 @@ unsigned getX86SubSuperRegister(unsigned Reg, MVT::SimpleValueType VT, case MVT::i8: if (High) { switch (Reg) { - default: return getX86SubSuperRegister(Reg, MVT::i64, High); + default: return getX86SubSuperRegister(Reg, MVT::i64); + case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI: + return X86::SI; + case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI: + return X86::DI; + case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP: + return X86::BP; + case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP: + return X86::SP; case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: return X86::AH; case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX: @@ -738,22 +651,6 @@ unsigned getX86SubSuperRegister(unsigned Reg, MVT::SimpleValueType VT, return X86::R15D; } case MVT::i64: - // For 64-bit mode if we've requested a "high" register and the - // Q or r constraints we want one of these high registers or - // just the register name otherwise. - if (High) { - switch (Reg) { - case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI: - return X86::SI; - case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI: - return X86::DI; - case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP: - return X86::BP; - case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP: - return X86::SP; - // Fallthrough. 
- } - } switch (Reg) { default: llvm_unreachable("Unexpected register"); case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: diff --git a/lib/Target/X86/X86RegisterInfo.h b/lib/Target/X86/X86RegisterInfo.h index 7932ede..b9d7b8c 100644 --- a/lib/Target/X86/X86RegisterInfo.h +++ b/lib/Target/X86/X86RegisterInfo.h @@ -117,12 +117,9 @@ public: bool hasReservedSpillSlot(const MachineFunction &MF, unsigned Reg, int &FrameIdx) const; - void eliminateCallFramePseudoInstr(MachineFunction &MF, - MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI) const; - void eliminateFrameIndex(MachineBasicBlock::iterator MI, - int SPAdj, RegScavenger *RS = NULL) const; + int SPAdj, unsigned FIOperandNum, + RegScavenger *RS = NULL) const; // Debug information queries. unsigned getFrameRegister(const MachineFunction &MF) const; diff --git a/lib/Target/X86/X86Schedule.td b/lib/Target/X86/X86Schedule.td index c14407f..d99d085 100644 --- a/lib/Target/X86/X86Schedule.td +++ b/lib/Target/X86/X86Schedule.td @@ -470,12 +470,17 @@ def IIC_NOP : InstrItinClass; // latencies. Since these latencies are not used for pipeline hazards, // they do not need to be exact. // +// ILPWindow=10 is an arbitrary threshold that approximates cycles of +// latency hidden by instruction buffers. The actual value is not very +// important but should be zero for inorder and nonzero for OOO processors. +// // The GenericModel contains no instruciton itineraries. def GenericModel : SchedMachineModel { let IssueWidth = 4; let MinLatency = 0; let LoadLatency = 4; let HighLatency = 10; + let ILPWindow = 10; } include "X86ScheduleAtom.td" diff --git a/lib/Target/X86/X86ScheduleAtom.td b/lib/Target/X86/X86ScheduleAtom.td index 8710261..1e5f2d6 100644 --- a/lib/Target/X86/X86ScheduleAtom.td +++ b/lib/Target/X86/X86ScheduleAtom.td @@ -525,6 +525,7 @@ def AtomModel : SchedMachineModel { // OperandCycles may be used for expected latency. let LoadLatency = 3; // Expected cycles, may be overriden by OperandCycles. let HighLatency = 30;// Expected, may be overriden by OperandCycles. + let ILPWindow = 0; // Always try to hide expected latency. let Itineraries = AtomItineraries; } diff --git a/lib/Target/X86/X86SelectionDAGInfo.cpp b/lib/Target/X86/X86SelectionDAGInfo.cpp index 757e8c7..f934fdd 100644 --- a/lib/Target/X86/X86SelectionDAGInfo.cpp +++ b/lib/Target/X86/X86SelectionDAGInfo.cpp @@ -202,6 +202,14 @@ X86SelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl, SrcPtrInfo.getAddrSpace() >= 256) return SDValue(); + // ESI might be used as a base pointer, in that case we can't simply overwrite + // the register. Fall back to generic code. 
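  // (The rep;movs sequence emitted below takes its source pointer in
  // ESI/RSI, so clobbering ESI is unavoidable on this path; functions whose
  // base pointer lives in ESI must take the generic memcpy lowering
  // instead.)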
+ const X86RegisterInfo *TRI = + static_cast<const X86RegisterInfo *>(DAG.getTarget().getRegisterInfo()); + if (TRI->hasBasePointer(DAG.getMachineFunction()) && + TRI->getBaseRegister() == X86::ESI) + return SDValue(); + MVT AVT; if (Align & 1) AVT = MVT::i8; diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp index 53c28f4..0f2c008 100644 --- a/lib/Target/X86/X86Subtarget.cpp +++ b/lib/Target/X86/X86Subtarget.cpp @@ -14,6 +14,8 @@ #define DEBUG_TYPE "subtarget" #include "X86Subtarget.h" #include "X86InstrInfo.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/Function.h" #include "llvm/IR/GlobalValue.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" @@ -155,6 +157,12 @@ const char *X86Subtarget::getBZeroEntry() const { return 0; } +bool X86Subtarget::hasSinCos() const { + return getTargetTriple().isMacOSX() && + !getTargetTriple().isMacOSXVersionLT(10, 9) && + is64Bit(); +} + /// IsLegalToCallImmediateAddr - Return true if the subtarget allows calls /// to immediate address. bool X86Subtarget::IsLegalToCallImmediateAddr(const TargetMachine &TM) const { @@ -318,45 +326,23 @@ void X86Subtarget::AutoDetectSubtargetFeatures() { } } -X86Subtarget::X86Subtarget(const std::string &TT, const std::string &CPU, - const std::string &FS, - unsigned StackAlignOverride, bool is64Bit) - : X86GenSubtargetInfo(TT, CPU, FS) - , X86ProcFamily(Others) - , PICStyle(PICStyles::None) - , X86SSELevel(NoMMXSSE) - , X863DNowLevel(NoThreeDNow) - , HasCMov(false) - , HasX86_64(false) - , HasPOPCNT(false) - , HasSSE4A(false) - , HasAES(false) - , HasPCLMUL(false) - , HasFMA(false) - , HasFMA4(false) - , HasXOP(false) - , HasMOVBE(false) - , HasRDRAND(false) - , HasF16C(false) - , HasFSGSBase(false) - , HasLZCNT(false) - , HasBMI(false) - , HasBMI2(false) - , HasRTM(false) - , IsBTMemSlow(false) - , IsUAMemFast(false) - , HasVectorUAMem(false) - , HasCmpxchg16b(false) - , UseLeaForSP(false) - , HasSlowDivide(false) - , PostRAScheduler(false) - , PadShortFunctions(false) - , stackAlignment(4) - // FIXME: this is a known good value for Yonah. How about others? - , MaxInlineSizeThreshold(128) - , TargetTriple(TT) - , In64BitMode(is64Bit) { - // Determine default and user specified characteristics +void X86Subtarget::resetSubtargetFeatures(const MachineFunction *MF) { + AttributeSet FnAttrs = MF->getFunction()->getAttributes(); + Attribute CPUAttr = FnAttrs.getAttribute(AttributeSet::FunctionIndex, + "target-cpu"); + Attribute FSAttr = FnAttrs.getAttribute(AttributeSet::FunctionIndex, + "target-features"); + std::string CPU = + !CPUAttr.hasAttribute(Attribute::None) ?CPUAttr.getValueAsString() : ""; + std::string FS = + !FSAttr.hasAttribute(Attribute::None) ? 
FSAttr.getValueAsString() : ""; + if (!FS.empty()) { + initializeEnvironment(); + resetSubtargetFeatures(CPU, FS); + } +} + +void X86Subtarget::resetSubtargetFeatures(StringRef CPU, StringRef FS) { std::string CPUName = CPU; if (!FS.empty() || !CPU.empty()) { if (CPUName.empty()) { @@ -433,6 +419,53 @@ X86Subtarget::X86Subtarget(const std::string &TT, const std::string &CPU, stackAlignment = 16; } +void X86Subtarget::initializeEnvironment() { + X86SSELevel = NoMMXSSE; + X863DNowLevel = NoThreeDNow; + HasCMov = false; + HasX86_64 = false; + HasPOPCNT = false; + HasSSE4A = false; + HasAES = false; + HasPCLMUL = false; + HasFMA = false; + HasFMA4 = false; + HasXOP = false; + HasMOVBE = false; + HasRDRAND = false; + HasF16C = false; + HasFSGSBase = false; + HasLZCNT = false; + HasBMI = false; + HasBMI2 = false; + HasRTM = false; + HasADX = false; + IsBTMemSlow = false; + IsUAMemFast = false; + HasVectorUAMem = false; + HasCmpxchg16b = false; + UseLeaForSP = false; + HasSlowDivide = false; + PostRAScheduler = false; + PadShortFunctions = false; + stackAlignment = 4; + // FIXME: this is a known good value for Yonah. How about others? + MaxInlineSizeThreshold = 128; +} + +X86Subtarget::X86Subtarget(const std::string &TT, const std::string &CPU, + const std::string &FS, + unsigned StackAlignOverride, bool is64Bit) + : X86GenSubtargetInfo(TT, CPU, FS) + , X86ProcFamily(Others) + , PICStyle(PICStyles::None) + , TargetTriple(TT) + , StackAlignOverride(StackAlignOverride) + , In64BitMode(is64Bit) { + initializeEnvironment(); + resetSubtargetFeatures(CPU, FS); +} + bool X86Subtarget::enablePostRAScheduler( CodeGenOpt::Level OptLevel, TargetSubtargetInfo::AntiDepBreakMode& Mode, diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h index 080f4cf..e97da4b 100644 --- a/lib/Target/X86/X86Subtarget.h +++ b/lib/Target/X86/X86Subtarget.h @@ -121,6 +121,9 @@ protected: /// HasRTM - Processor has RTM instructions. bool HasRTM; + /// HasADX - Processor has ADX instructions. + bool HasADX; + /// IsBTMemSlow - True if BT (bit test) of memory instructions are slow. bool IsBTMemSlow; @@ -165,11 +168,13 @@ protected: InstrItineraryData InstrItins; private: + /// StackAlignOverride - Override the stack alignment. + unsigned StackAlignOverride; + /// In64BitMode - True if compiling for 64-bit, false for 32-bit. bool In64BitMode; public: - /// This constructor initializes the data members to match that /// of the specified triple. /// @@ -194,7 +199,26 @@ public: /// instruction. void AutoDetectSubtargetFeatures(); - bool is64Bit() const { return In64BitMode; } + /// \brief Reset the features for the X86 target. + virtual void resetSubtargetFeatures(const MachineFunction *MF); +private: + void initializeEnvironment(); + void resetSubtargetFeatures(StringRef CPU, StringRef FS); +public: + /// Is this x86_64? (disregarding specific ABI / programming model) + bool is64Bit() const { + return In64BitMode; + } + + /// Is this x86_64 with the ILP32 programming model (x32 ABI)? + bool isTarget64BitILP32() const { + return In64BitMode && (TargetTriple.getEnvironment() == Triple::GNUX32); + } + + /// Is this x86_64 with the LP64 programming model (standard AMD64, no x32)? 
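  /// (Exactly one of isTarget64BitILP32() and isTarget64BitLP64() holds in
  /// 64-bit mode; elsewhere in this patch the DataLayout string and the
  /// pointer register classes are both chosen from these predicates.)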
+ bool isTarget64BitLP64() const { + return In64BitMode && (TargetTriple.getEnvironment() != Triple::GNUX32); + } PICStyles::Style getPICStyle() const { return PICStyle; } void setPICStyle(PICStyles::Style Style) { PICStyle = Style; } @@ -229,6 +253,7 @@ public: bool hasBMI() const { return HasBMI; } bool hasBMI2() const { return HasBMI2; } bool hasRTM() const { return HasRTM; } + bool hasADX() const { return HasADX; } bool isBTMemSlow() const { return IsBTMemSlow; } bool isUnalignedMemAccessFast() const { return IsUAMemFast; } bool hasVectorUAMem() const { return HasVectorUAMem; } @@ -315,6 +340,10 @@ public: /// memset with zero passed as the second argument. Otherwise it /// returns null. const char *getBZeroEntry() const; + + /// This function returns true if the target has sincos() routine in its + /// compiler runtime or math libraries. + bool hasSinCos() const; /// enablePostRAScheduler - run for Atom optimization. bool enablePostRAScheduler(CodeGenOpt::Level OptLevel, diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp index 706e64a..8aa58a2 100644 --- a/lib/Target/X86/X86TargetMachine.cpp +++ b/lib/Target/X86/X86TargetMachine.cpp @@ -59,8 +59,12 @@ X86_64TargetMachine::X86_64TargetMachine(const Target &T, StringRef TT, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL) : X86TargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true), - DL("e-p:64:64-s:64-f64:64:64-i64:64:64-f80:128:128-f128:128:128-" - "n8:16:32:64-S128"), + // The x32 ABI dictates the ILP32 programming model for x64. + DL(getSubtargetImpl()->isTarget64BitILP32() ? + "e-p:32:32-s:64-f64:64:64-i64:64:64-f80:128:128-f128:128:128-" + "n8:16:32:64-S128" : + "e-p:64:64-s:64-f64:64:64-i64:64:64-f80:128:128-f128:128:128-" + "n8:16:32:64-S128"), InstrInfo(*this), TLInfo(*this), TSInfo(*this), @@ -151,6 +155,7 @@ public: } virtual bool addInstSelector(); + virtual bool addILPOpts(); virtual bool addPreRegAlloc(); virtual bool addPostRegAlloc(); virtual bool addPreEmitPass(); @@ -158,12 +163,7 @@ public: } // namespace TargetPassConfig *X86TargetMachine::createPassConfig(PassManagerBase &PM) { - X86PassConfig *PC = new X86PassConfig(this, PM); - - if (X86EarlyIfConv && Subtarget.hasCMov()) - PC->enablePass(&EarlyIfConverterID); - - return PC; + return new X86PassConfig(this, PM); } bool X86PassConfig::addInstSelector() { @@ -181,6 +181,14 @@ bool X86PassConfig::addInstSelector() { return false; } +bool X86PassConfig::addILPOpts() { + if (X86EarlyIfConv && getX86Subtarget().hasCMov()) { + addPass(&EarlyIfConverterID); + return true; + } + return false; +} + bool X86PassConfig::addPreRegAlloc() { return false; // -print-machineinstr shouldn't print after this. 
} diff --git a/lib/Target/X86/X86TargetObjectFile.cpp b/lib/Target/X86/X86TargetObjectFile.cpp index b8ee319..871dacd 100644 --- a/lib/Target/X86/X86TargetObjectFile.cpp +++ b/lib/Target/X86/X86TargetObjectFile.cpp @@ -8,16 +8,12 @@ //===----------------------------------------------------------------------===// #include "X86TargetObjectFile.h" -#include "X86TargetMachine.h" -#include "llvm/ADT/StringExtras.h" -#include "llvm/CodeGen/MachineModuleInfoImpls.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCSectionELF.h" -#include "llvm/MC/MCSectionMachO.h" #include "llvm/Support/Dwarf.h" -#include "llvm/Support/ELF.h" #include "llvm/Target/Mangler.h" + using namespace llvm; using namespace dwarf; diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp index 9cc1b18..fefb479 100644 --- a/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/lib/Target/X86/X86TargetTransformInfo.cpp @@ -20,6 +20,7 @@ #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Support/Debug.h" #include "llvm/Target/TargetLowering.h" +#include "llvm/Target/CostTable.h" using namespace llvm; // Declare the pass initialization routine locally as target-specific passes @@ -75,7 +76,6 @@ public: /// \name Scalar TTI Implementations /// @{ - virtual PopcntSupportKind getPopcntSupport(unsigned TyWidth) const; /// @} @@ -84,6 +84,8 @@ public: /// @{ virtual unsigned getNumberOfRegisters(bool Vector) const; + virtual unsigned getRegisterBitWidth(bool Vector) const; + virtual unsigned getMaximumUnrollFactor() const; virtual unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty) const; virtual unsigned getShuffleCost(ShuffleKind Kind, Type *Tp, int Index, Type *SubTp) const; @@ -118,45 +120,6 @@ llvm::createX86TargetTransformInfoPass(const X86TargetMachine *TM) { // //===----------------------------------------------------------------------===// -namespace { -struct X86CostTblEntry { - int ISD; - MVT Type; - unsigned Cost; -}; -} - -static int -FindInTable(const X86CostTblEntry *Tbl, unsigned len, int ISD, MVT Ty) { - for (unsigned int i = 0; i < len; ++i) - if (Tbl[i].ISD == ISD && Tbl[i].Type == Ty) - return i; - - // Could not find an entry. - return -1; -} - -namespace { -struct X86TypeConversionCostTblEntry { - int ISD; - MVT Dst; - MVT Src; - unsigned Cost; -}; -} - -static int -FindInConvertTable(const X86TypeConversionCostTblEntry *Tbl, unsigned len, - int ISD, MVT Dst, MVT Src) { - for (unsigned int i = 0; i < len; ++i) - if (Tbl[i].ISD == ISD && Tbl[i].Src == Src && Tbl[i].Dst == Dst) - return i; - - // Could not find an entry. - return -1; -} - - X86TTI::PopcntSupportKind X86TTI::getPopcntSupport(unsigned TyWidth) const { assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2"); // TODO: Currently the __builtin_popcount() implementation using SSE3 @@ -166,11 +129,39 @@ X86TTI::PopcntSupportKind X86TTI::getPopcntSupport(unsigned TyWidth) const { } unsigned X86TTI::getNumberOfRegisters(bool Vector) const { + if (Vector && !ST->hasSSE1()) + return 0; + if (ST->is64Bit()) return 16; return 8; } +unsigned X86TTI::getRegisterBitWidth(bool Vector) const { + if (Vector) { + if (ST->hasAVX()) return 256; + if (ST->hasSSE1()) return 128; + return 0; + } + + if (ST->is64Bit()) + return 64; + return 32; + +} + +unsigned X86TTI::getMaximumUnrollFactor() const { + if (ST->isAtom()) + return 1; + + // Sandybridge and Haswell have multiple execution ports and pipelined + // vector units. 
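  // (The returned factors are heuristics: 1 keeps code small on in-order
  // Atom, 4 gives the out-of-order AVX cores enough independent operations
  // to fill their ports, and 2 is a conservative default for older SSE
  // hardware.)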
+ if (ST->hasAVX()) + return 4; + + return 2; +} + unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty) const { // Legalize the type. std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Ty); @@ -178,7 +169,7 @@ unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty) const { int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && "Invalid opcode"); - static const X86CostTblEntry AVX1CostTable[] = { + static const CostTblEntry<MVT> AVX1CostTable[] = { // We don't have to scalarize unsupported ops. We can issue two half-sized // operations and we only need to extract the upper YMM half. // Two ops + 1 extract + 1 insert = 4. @@ -192,7 +183,7 @@ unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty) const { // Look for AVX1 lowering tricks. if (ST->hasAVX()) { - int Idx = FindInTable(AVX1CostTable, array_lengthof(AVX1CostTable), ISD, + int Idx = CostTableLookup<MVT>(AVX1CostTable, array_lengthof(AVX1CostTable), ISD, LT.second); if (Idx != -1) return LT.first * AVX1CostTable[Idx].Cost; @@ -226,7 +217,7 @@ unsigned X86TTI::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const { if (!SrcTy.isSimple() || !DstTy.isSimple()) return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src); - static const X86TypeConversionCostTblEntry AVXConversionTbl[] = { + static const TypeConversionCostTblEntry<MVT> AVXConversionTbl[] = { { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1 }, { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 }, { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 }, @@ -241,11 +232,14 @@ unsigned X86TTI::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const { { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 1 }, { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 6 }, { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 9 }, + { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 8 }, + { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 8 }, + { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 8 }, { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 3 }, }; if (ST->hasAVX()) { - int Idx = FindInConvertTable(AVXConversionTbl, + int Idx = ConvertCostTableLookup<MVT>(AVXConversionTbl, array_lengthof(AVXConversionTbl), ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()); if (Idx != -1) @@ -265,7 +259,7 @@ unsigned X86TTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && "Invalid opcode"); - static const X86CostTblEntry SSE42CostTbl[] = { + static const CostTblEntry<MVT> SSE42CostTbl[] = { { ISD::SETCC, MVT::v2f64, 1 }, { ISD::SETCC, MVT::v4f32, 1 }, { ISD::SETCC, MVT::v2i64, 1 }, @@ -274,7 +268,7 @@ unsigned X86TTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, { ISD::SETCC, MVT::v16i8, 1 }, }; - static const X86CostTblEntry AVX1CostTbl[] = { + static const CostTblEntry<MVT> AVX1CostTbl[] = { { ISD::SETCC, MVT::v4f64, 1 }, { ISD::SETCC, MVT::v8f32, 1 }, // AVX1 does not support 8-wide integer compare. 
@@ -284,7 +278,7 @@ unsigned X86TTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, { ISD::SETCC, MVT::v32i8, 4 }, }; - static const X86CostTblEntry AVX2CostTbl[] = { + static const CostTblEntry<MVT> AVX2CostTbl[] = { { ISD::SETCC, MVT::v4i64, 1 }, { ISD::SETCC, MVT::v8i32, 1 }, { ISD::SETCC, MVT::v16i16, 1 }, @@ -292,19 +286,19 @@ unsigned X86TTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, }; if (ST->hasAVX2()) { - int Idx = FindInTable(AVX2CostTbl, array_lengthof(AVX2CostTbl), ISD, MTy); + int Idx = CostTableLookup<MVT>(AVX2CostTbl, array_lengthof(AVX2CostTbl), ISD, MTy); if (Idx != -1) return LT.first * AVX2CostTbl[Idx].Cost; } if (ST->hasAVX()) { - int Idx = FindInTable(AVX1CostTbl, array_lengthof(AVX1CostTbl), ISD, MTy); + int Idx = CostTableLookup<MVT>(AVX1CostTbl, array_lengthof(AVX1CostTbl), ISD, MTy); if (Idx != -1) return LT.first * AVX1CostTbl[Idx].Cost; } if (ST->hasSSE42()) { - int Idx = FindInTable(SSE42CostTbl, array_lengthof(SSE42CostTbl), ISD, MTy); + int Idx = CostTableLookup<MVT>(SSE42CostTbl, array_lengthof(SSE42CostTbl), ISD, MTy); if (Idx != -1) return LT.first * SSE42CostTbl[Idx].Cost; } diff --git a/lib/Target/X86/X86VZeroUpper.cpp b/lib/Target/X86/X86VZeroUpper.cpp index c4a5887..0f77948 100644 --- a/lib/Target/X86/X86VZeroUpper.cpp +++ b/lib/Target/X86/X86VZeroUpper.cpp @@ -120,9 +120,19 @@ static bool checkFnHasLiveInYmm(MachineRegisterInfo &MRI) { return false; } +static bool clobbersAllYmmRegs(const MachineOperand &MO) { + for (unsigned reg = X86::YMM0; reg < X86::YMM15; ++reg) { + if (!MO.clobbersPhysReg(reg)) + return false; + } + return true; +} + static bool hasYmmReg(MachineInstr *MI) { for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { const MachineOperand &MO = MI->getOperand(i); + if (MI->isCall() && MO.isRegMask() && !clobbersAllYmmRegs(MO)) + return true; if (!MO.isReg()) continue; if (MO.isDebug()) diff --git a/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp b/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp index 094f18c..7e7d396 100644 --- a/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp +++ b/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp @@ -92,11 +92,19 @@ static DecodeStatus DecodeGRRegsRegisterClass(MCInst &Inst, static DecodeStatus DecodeBitpOperand(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder); +static DecodeStatus DecodeMEMiiOperand(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); + static DecodeStatus Decode2RInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder); +static DecodeStatus Decode2RImmInstruction(MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder); + static DecodeStatus DecodeR2RInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, @@ -132,6 +140,66 @@ static DecodeStatus DecodeLR2RInstruction(MCInst &Inst, uint64_t Address, const void *Decoder); +static DecodeStatus Decode3RInstruction(MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder); + +static DecodeStatus Decode3RImmInstruction(MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder); + +static DecodeStatus Decode2RUSInstruction(MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder); + +static DecodeStatus Decode2RUSBitpInstruction(MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder); + +static DecodeStatus DecodeL3RInstruction(MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder); + +static DecodeStatus DecodeL3RSrcDstInstruction(MCInst &Inst, + 
unsigned Insn, + uint64_t Address, + const void *Decoder); + +static DecodeStatus DecodeL2RUSInstruction(MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder); + +static DecodeStatus DecodeL2RUSBitpInstruction(MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder); + +static DecodeStatus DecodeL6RInstruction(MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder); + +static DecodeStatus DecodeL5RInstruction(MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder); + +static DecodeStatus DecodeL4RSrcDstInstruction(MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder); + +static DecodeStatus DecodeL4RSrcDstSrcDstInstruction(MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder); + #include "XCoreGenDisassemblerTables.inc" static DecodeStatus DecodeGRRegsRegisterClass(MCInst &Inst, @@ -157,13 +225,24 @@ static DecodeStatus DecodeBitpOperand(MCInst &Inst, unsigned Val, return MCDisassembler::Success; } +static DecodeStatus DecodeMEMiiOperand(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder) { + Inst.addOperand(MCOperand::CreateImm(Val)); + Inst.addOperand(MCOperand::CreateImm(0)); + return MCDisassembler::Success; +} + static DecodeStatus Decode2OpInstruction(unsigned Insn, unsigned &Op1, unsigned &Op2) { - unsigned Combined = fieldFromInstruction(Insn, 6, 5) + - fieldFromInstruction(Insn, 5, 1) * 5 - 27; - if (Combined >= 9) + unsigned Combined = fieldFromInstruction(Insn, 6, 5); + if (Combined < 27) return MCDisassembler::Fail; - + if (fieldFromInstruction(Insn, 5, 1)) { + if (Combined == 31) + return MCDisassembler::Fail; + Combined += 5; + } + Combined -= 27; unsigned Op1High = Combined % 3; unsigned Op2High = Combined / 3; Op1 = (Op1High << 2) | fieldFromInstruction(Insn, 2, 2); @@ -172,14 +251,114 @@ Decode2OpInstruction(unsigned Insn, unsigned &Op1, unsigned &Op2) { } static DecodeStatus +Decode3OpInstruction(unsigned Insn, unsigned &Op1, unsigned &Op2, + unsigned &Op3) { + unsigned Combined = fieldFromInstruction(Insn, 6, 5); + if (Combined >= 27) + return MCDisassembler::Fail; + + unsigned Op1High = Combined % 3; + unsigned Op2High = (Combined / 3) % 3; + unsigned Op3High = Combined / 9; + Op1 = (Op1High << 2) | fieldFromInstruction(Insn, 4, 2); + Op2 = (Op2High << 2) | fieldFromInstruction(Insn, 2, 2); + Op3 = (Op3High << 2) | fieldFromInstruction(Insn, 0, 2); + return MCDisassembler::Success; +} + +static DecodeStatus +Decode2OpInstructionFail(MCInst &Inst, unsigned Insn, uint64_t Address, + const void *Decoder) { + // Try and decode as a 3R instruction. 
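The rewritten Decode2OpInstruction above unpacks the XCore short 2-operand encoding: the 5-bit field at bits 6-10, adjusted by bit 5 and offset by 27, yields a value 0-8 whose two base-3 digits are the operands' high bits, while the low two bits of each operand sit at bits 2-3 and 0-1. A standalone sketch of the same arithmetic, where field() is a hypothetical stand-in for fieldFromInstruction:

#include <cstdint>
#include <cstdio>

// Extract Bits bits starting at StartBit.
static unsigned field(uint32_t Insn, unsigned StartBit, unsigned Bits) {
  return (Insn >> StartBit) & ((1u << Bits) - 1);
}

// Returns false for encodings reserved for other formats (Combined < 27,
// or the Combined == 31 escape when bit 5 is set), true otherwise.
static bool decode2Op(uint32_t Insn, unsigned &Op1, unsigned &Op2) {
  unsigned Combined = field(Insn, 6, 5);
  if (Combined < 27)
    return false;
  if (field(Insn, 5, 1)) {
    if (Combined == 31)
      return false;
    Combined += 5; // bit 5 selects the upper half of the 0-8 range
  }
  Combined -= 27; // now 0..8: two base-3 digits
  Op1 = (Combined % 3) << 2 | field(Insn, 2, 2);
  Op2 = (Combined / 3) << 2 | field(Insn, 0, 2);
  return true;
}

int main() {
  unsigned Op1, Op2;
  // Combined = 28 (bits 6-10), operand low bits 2 (bits 2-3) and 1 (0-1):
  // 28 - 27 = 1 -> Op1High = 1, Op2High = 0 -> Op1 = 6, Op2 = 1.
  if (!decode2Op(28u << 6 | 2u << 2 | 1u, Op1, Op2))
    return 1;
  std::printf("Op1=%u Op2=%u\n", Op1, Op2); // prints Op1=6 Op2=1
  return 0;
}

Any encoding this helper rejects is what Decode2OpInstructionFail below retries as a 3R / 2RUS instruction.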
+ unsigned Opcode = fieldFromInstruction(Insn, 11, 5); + switch (Opcode) { + case 0x0: + Inst.setOpcode(XCore::STW_2rus); + return Decode2RUSInstruction(Inst, Insn, Address, Decoder); + case 0x1: + Inst.setOpcode(XCore::LDW_2rus); + return Decode2RUSInstruction(Inst, Insn, Address, Decoder); + case 0x2: + Inst.setOpcode(XCore::ADD_3r); + return Decode3RInstruction(Inst, Insn, Address, Decoder); + case 0x3: + Inst.setOpcode(XCore::SUB_3r); + return Decode3RInstruction(Inst, Insn, Address, Decoder); + case 0x4: + Inst.setOpcode(XCore::SHL_3r); + return Decode3RInstruction(Inst, Insn, Address, Decoder); + case 0x5: + Inst.setOpcode(XCore::SHR_3r); + return Decode3RInstruction(Inst, Insn, Address, Decoder); + case 0x6: + Inst.setOpcode(XCore::EQ_3r); + return Decode3RInstruction(Inst, Insn, Address, Decoder); + case 0x7: + Inst.setOpcode(XCore::AND_3r); + return Decode3RInstruction(Inst, Insn, Address, Decoder); + case 0x8: + Inst.setOpcode(XCore::OR_3r); + return Decode3RInstruction(Inst, Insn, Address, Decoder); + case 0x9: + Inst.setOpcode(XCore::LDW_3r); + return Decode3RInstruction(Inst, Insn, Address, Decoder); + case 0x10: + Inst.setOpcode(XCore::LD16S_3r); + return Decode3RInstruction(Inst, Insn, Address, Decoder); + case 0x11: + Inst.setOpcode(XCore::LD8U_3r); + return Decode3RInstruction(Inst, Insn, Address, Decoder); + case 0x12: + Inst.setOpcode(XCore::ADD_2rus); + return Decode2RUSInstruction(Inst, Insn, Address, Decoder); + case 0x13: + Inst.setOpcode(XCore::SUB_2rus); + return Decode2RUSInstruction(Inst, Insn, Address, Decoder); + case 0x14: + Inst.setOpcode(XCore::SHL_2rus); + return Decode2RUSBitpInstruction(Inst, Insn, Address, Decoder); + case 0x15: + Inst.setOpcode(XCore::SHR_2rus); + return Decode2RUSBitpInstruction(Inst, Insn, Address, Decoder); + case 0x16: + Inst.setOpcode(XCore::EQ_2rus); + return Decode2RUSInstruction(Inst, Insn, Address, Decoder); + case 0x17: + Inst.setOpcode(XCore::TSETR_3r); + return Decode3RImmInstruction(Inst, Insn, Address, Decoder); + case 0x18: + Inst.setOpcode(XCore::LSS_3r); + return Decode3RInstruction(Inst, Insn, Address, Decoder); + case 0x19: + Inst.setOpcode(XCore::LSU_3r); + return Decode3RInstruction(Inst, Insn, Address, Decoder); + } + return MCDisassembler::Fail; +} + +static DecodeStatus Decode2RInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder) { unsigned Op1, Op2; DecodeStatus S = Decode2OpInstruction(Insn, Op1, Op2); - if (S == MCDisassembler::Success) { - DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder); - DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder); - } + if (S != MCDisassembler::Success) + return Decode2OpInstructionFail(Inst, Insn, Address, Decoder); + + DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder); + DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder); + return S; +} + +static DecodeStatus +Decode2RImmInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, + const void *Decoder) { + unsigned Op1, Op2; + DecodeStatus S = Decode2OpInstruction(Insn, Op1, Op2); + if (S != MCDisassembler::Success) + return Decode2OpInstructionFail(Inst, Insn, Address, Decoder); + + Inst.addOperand(MCOperand::CreateImm(Op1)); + DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder); return S; } @@ -188,10 +367,11 @@ DecodeR2RInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder) { unsigned Op1, Op2; DecodeStatus S = Decode2OpInstruction(Insn, Op2, Op1); - if (S == MCDisassembler::Success) { - DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder); - 
DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder); - } + if (S != MCDisassembler::Success) + return Decode2OpInstructionFail(Inst, Insn, Address, Decoder); + + DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder); + DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder); return S; } @@ -200,11 +380,12 @@ Decode2RSrcDstInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder) { unsigned Op1, Op2; DecodeStatus S = Decode2OpInstruction(Insn, Op1, Op2); - if (S == MCDisassembler::Success) { - DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder); - DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder); - DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder); - } + if (S != MCDisassembler::Success) + return Decode2OpInstructionFail(Inst, Insn, Address, Decoder); + + DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder); + DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder); + DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder); return S; } @@ -213,10 +394,11 @@ DecodeRUSInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder) { unsigned Op1, Op2; DecodeStatus S = Decode2OpInstruction(Insn, Op1, Op2); - if (S == MCDisassembler::Success) { - DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder); - Inst.addOperand(MCOperand::CreateImm(Op2)); - } + if (S != MCDisassembler::Success) + return Decode2OpInstructionFail(Inst, Insn, Address, Decoder); + + DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder); + Inst.addOperand(MCOperand::CreateImm(Op2)); return S; } @@ -225,10 +407,11 @@ DecodeRUSBitpInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder) { unsigned Op1, Op2; DecodeStatus S = Decode2OpInstruction(Insn, Op1, Op2); - if (S == MCDisassembler::Success) { - DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder); - DecodeBitpOperand(Inst, Op2, Address, Decoder); - } + if (S != MCDisassembler::Success) + return Decode2OpInstructionFail(Inst, Insn, Address, Decoder); + + DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder); + DecodeBitpOperand(Inst, Op2, Address, Decoder); return S; } @@ -237,24 +420,97 @@ DecodeRUSSrcDstBitpInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder) { unsigned Op1, Op2; DecodeStatus S = Decode2OpInstruction(Insn, Op1, Op2); - if (S == MCDisassembler::Success) { - DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder); - DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder); - DecodeBitpOperand(Inst, Op2, Address, Decoder); - } + if (S != MCDisassembler::Success) + return Decode2OpInstructionFail(Inst, Insn, Address, Decoder); + + DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder); + DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder); + DecodeBitpOperand(Inst, Op2, Address, Decoder); return S; } static DecodeStatus +DecodeL2OpInstructionFail(MCInst &Inst, unsigned Insn, uint64_t Address, + const void *Decoder) { + // Try and decode as a L3R / L2RUS instruction. 
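DecodeL2OpInstructionFail above re-reads the 32-bit word as a long-format instruction whose opcode is split across the word: the low nibble sits at bits 16-19 and the high five bits at 27-31. A small sketch of the reassembly, with field() again standing in for fieldFromInstruction:

#include <cstdint>
#include <cstdio>

static unsigned field(uint32_t Insn, unsigned StartBit, unsigned Bits) {
  return (Insn >> StartBit) & ((1u << Bits) - 1);
}

// Reassemble the split long-format opcode used by the L3R/L2RUS dispatch.
static unsigned longOpcode(uint32_t Insn) {
  return field(Insn, 16, 4) | field(Insn, 27, 5) << 4;
}

int main() {
  // OUTPW_l2rus in the table above is case 0x12d: high bits 0b10010 at
  // bits 27-31, low nibble 0xd at bits 16-19.
  uint32_t Insn = 0x12u << 27 | 0xdu << 16;
  std::printf("opcode = 0x%x\n", longOpcode(Insn)); // prints 0x12d
  return 0;
}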
+ unsigned Opcode = fieldFromInstruction(Insn, 16, 4) | + fieldFromInstruction(Insn, 27, 5) << 4; + switch (Opcode) { + case 0x0c: + Inst.setOpcode(XCore::STW_l3r); + return DecodeL3RInstruction(Inst, Insn, Address, Decoder); + case 0x1c: + Inst.setOpcode(XCore::XOR_l3r); + return DecodeL3RInstruction(Inst, Insn, Address, Decoder); + case 0x2c: + Inst.setOpcode(XCore::ASHR_l3r); + return DecodeL3RInstruction(Inst, Insn, Address, Decoder); + case 0x3c: + Inst.setOpcode(XCore::LDAWF_l3r); + return DecodeL3RInstruction(Inst, Insn, Address, Decoder); + case 0x4c: + Inst.setOpcode(XCore::LDAWB_l3r); + return DecodeL3RInstruction(Inst, Insn, Address, Decoder); + case 0x5c: + Inst.setOpcode(XCore::LDA16F_l3r); + return DecodeL3RInstruction(Inst, Insn, Address, Decoder); + case 0x6c: + Inst.setOpcode(XCore::LDA16B_l3r); + return DecodeL3RInstruction(Inst, Insn, Address, Decoder); + case 0x7c: + Inst.setOpcode(XCore::MUL_l3r); + return DecodeL3RInstruction(Inst, Insn, Address, Decoder); + case 0x8c: + Inst.setOpcode(XCore::DIVS_l3r); + return DecodeL3RInstruction(Inst, Insn, Address, Decoder); + case 0x9c: + Inst.setOpcode(XCore::DIVU_l3r); + return DecodeL3RInstruction(Inst, Insn, Address, Decoder); + case 0x10c: + Inst.setOpcode(XCore::ST16_l3r); + return DecodeL3RInstruction(Inst, Insn, Address, Decoder); + case 0x11c: + Inst.setOpcode(XCore::ST8_l3r); + return DecodeL3RInstruction(Inst, Insn, Address, Decoder); + case 0x12c: + Inst.setOpcode(XCore::ASHR_l2rus); + return DecodeL2RUSBitpInstruction(Inst, Insn, Address, Decoder); + case 0x12d: + Inst.setOpcode(XCore::OUTPW_l2rus); + return DecodeL2RUSBitpInstruction(Inst, Insn, Address, Decoder); + case 0x12e: + Inst.setOpcode(XCore::INPW_l2rus); + return DecodeL2RUSBitpInstruction(Inst, Insn, Address, Decoder); + case 0x13c: + Inst.setOpcode(XCore::LDAWF_l2rus); + return DecodeL2RUSInstruction(Inst, Insn, Address, Decoder); + case 0x14c: + Inst.setOpcode(XCore::LDAWB_l2rus); + return DecodeL2RUSInstruction(Inst, Insn, Address, Decoder); + case 0x15c: + Inst.setOpcode(XCore::CRC_l3r); + return DecodeL3RSrcDstInstruction(Inst, Insn, Address, Decoder); + case 0x18c: + Inst.setOpcode(XCore::REMS_l3r); + return DecodeL3RInstruction(Inst, Insn, Address, Decoder); + case 0x19c: + Inst.setOpcode(XCore::REMU_l3r); + return DecodeL3RInstruction(Inst, Insn, Address, Decoder); + } + return MCDisassembler::Fail; +} + +static DecodeStatus DecodeL2RInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder) { unsigned Op1, Op2; DecodeStatus S = Decode2OpInstruction(fieldFromInstruction(Insn, 0, 16), Op1, Op2); - if (S == MCDisassembler::Success) { - DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder); - DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder); - } + if (S != MCDisassembler::Success) + return DecodeL2OpInstructionFail(Inst, Insn, Address, Decoder); + + DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder); + DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder); return S; } @@ -264,9 +520,212 @@ DecodeLR2RInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, unsigned Op1, Op2; DecodeStatus S = Decode2OpInstruction(fieldFromInstruction(Insn, 0, 16), Op1, Op2); + if (S != MCDisassembler::Success) + return DecodeL2OpInstructionFail(Inst, Insn, Address, Decoder); + + DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder); + DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder); + return S; +} + +static DecodeStatus +Decode3RInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, + const void *Decoder) { + unsigned 
Op1, Op2, Op3; + DecodeStatus S = Decode3OpInstruction(Insn, Op1, Op2, Op3); if (S == MCDisassembler::Success) { + DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder); DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder); + DecodeGRRegsRegisterClass(Inst, Op3, Address, Decoder); + } + return S; +} + +static DecodeStatus +Decode3RImmInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, + const void *Decoder) { + unsigned Op1, Op2, Op3; + DecodeStatus S = Decode3OpInstruction(Insn, Op1, Op2, Op3); + if (S == MCDisassembler::Success) { + Inst.addOperand(MCOperand::CreateImm(Op1)); + DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder); + DecodeGRRegsRegisterClass(Inst, Op3, Address, Decoder); + } + return S; +} + +static DecodeStatus +Decode2RUSInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, + const void *Decoder) { + unsigned Op1, Op2, Op3; + DecodeStatus S = Decode3OpInstruction(Insn, Op1, Op2, Op3); + if (S == MCDisassembler::Success) { DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder); + DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder); + Inst.addOperand(MCOperand::CreateImm(Op3)); + } + return S; +} + +static DecodeStatus +Decode2RUSBitpInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, + const void *Decoder) { + unsigned Op1, Op2, Op3; + DecodeStatus S = Decode3OpInstruction(Insn, Op1, Op2, Op3); + if (S == MCDisassembler::Success) { + DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder); + DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder); + DecodeBitpOperand(Inst, Op3, Address, Decoder); + } + return S; +} + +static DecodeStatus +DecodeL3RInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, + const void *Decoder) { + unsigned Op1, Op2, Op3; + DecodeStatus S = + Decode3OpInstruction(fieldFromInstruction(Insn, 0, 16), Op1, Op2, Op3); + if (S == MCDisassembler::Success) { + DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder); + DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder); + DecodeGRRegsRegisterClass(Inst, Op3, Address, Decoder); + } + return S; +} + +static DecodeStatus +DecodeL3RSrcDstInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, + const void *Decoder) { + unsigned Op1, Op2, Op3; + DecodeStatus S = + Decode3OpInstruction(fieldFromInstruction(Insn, 0, 16), Op1, Op2, Op3); + if (S == MCDisassembler::Success) { + DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder); + DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder); + DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder); + DecodeGRRegsRegisterClass(Inst, Op3, Address, Decoder); + } + return S; +} + +static DecodeStatus +DecodeL2RUSInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, + const void *Decoder) { + unsigned Op1, Op2, Op3; + DecodeStatus S = + Decode3OpInstruction(fieldFromInstruction(Insn, 0, 16), Op1, Op2, Op3); + if (S == MCDisassembler::Success) { + DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder); + DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder); + Inst.addOperand(MCOperand::CreateImm(Op3)); + } + return S; +} + +static DecodeStatus +DecodeL2RUSBitpInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, + const void *Decoder) { + unsigned Op1, Op2, Op3; + DecodeStatus S = + Decode3OpInstruction(fieldFromInstruction(Insn, 0, 16), Op1, Op2, Op3); + if (S == MCDisassembler::Success) { + DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder); + DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder); + DecodeBitpOperand(Inst, Op3, Address, Decoder); + } + return S; +} + +static DecodeStatus 
+DecodeL6RInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, + const void *Decoder) { + unsigned Op1, Op2, Op3, Op4, Op5, Op6; + DecodeStatus S = + Decode3OpInstruction(fieldFromInstruction(Insn, 0, 16), Op1, Op2, Op3); + if (S != MCDisassembler::Success) + return S; + S = Decode3OpInstruction(fieldFromInstruction(Insn, 16, 16), Op4, Op5, Op6); + if (S != MCDisassembler::Success) + return S; + DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder); + DecodeGRRegsRegisterClass(Inst, Op4, Address, Decoder); + DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder); + DecodeGRRegsRegisterClass(Inst, Op3, Address, Decoder); + DecodeGRRegsRegisterClass(Inst, Op5, Address, Decoder); + DecodeGRRegsRegisterClass(Inst, Op6, Address, Decoder); + return S; +} + +static DecodeStatus +DecodeL5RInstructionFail(MCInst &Inst, unsigned Insn, uint64_t Address, + const void *Decoder) { + // Try and decode as a L6R instruction. + Inst.clear(); + unsigned Opcode = fieldFromInstruction(Insn, 27, 5); + switch (Opcode) { + case 0x00: + Inst.setOpcode(XCore::LMUL_l6r); + return DecodeL6RInstruction(Inst, Insn, Address, Decoder); + } + return MCDisassembler::Fail; +} + +static DecodeStatus +DecodeL5RInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, + const void *Decoder) { + unsigned Op1, Op2, Op3, Op4, Op5; + DecodeStatus S = + Decode3OpInstruction(fieldFromInstruction(Insn, 0, 16), Op1, Op2, Op3); + if (S != MCDisassembler::Success) + return DecodeL5RInstructionFail(Inst, Insn, Address, Decoder); + S = Decode2OpInstruction(fieldFromInstruction(Insn, 16, 16), Op4, Op5); + if (S != MCDisassembler::Success) + return DecodeL5RInstructionFail(Inst, Insn, Address, Decoder); + + DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder); + DecodeGRRegsRegisterClass(Inst, Op4, Address, Decoder); + DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder); + DecodeGRRegsRegisterClass(Inst, Op3, Address, Decoder); + DecodeGRRegsRegisterClass(Inst, Op5, Address, Decoder); + return S; +} + +static DecodeStatus +DecodeL4RSrcDstInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, + const void *Decoder) { + unsigned Op1, Op2, Op3; + unsigned Op4 = fieldFromInstruction(Insn, 16, 4); + DecodeStatus S = + Decode3OpInstruction(fieldFromInstruction(Insn, 0, 16), Op1, Op2, Op3); + if (S == MCDisassembler::Success) { + DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder); + S = DecodeGRRegsRegisterClass(Inst, Op4, Address, Decoder); + } + if (S == MCDisassembler::Success) { + DecodeGRRegsRegisterClass(Inst, Op4, Address, Decoder); + DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder); + DecodeGRRegsRegisterClass(Inst, Op3, Address, Decoder); + } + return S; +} + +static DecodeStatus +DecodeL4RSrcDstSrcDstInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, + const void *Decoder) { + unsigned Op1, Op2, Op3; + unsigned Op4 = fieldFromInstruction(Insn, 16, 4); + DecodeStatus S = + Decode3OpInstruction(fieldFromInstruction(Insn, 0, 16), Op1, Op2, Op3); + if (S == MCDisassembler::Success) { + DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder); + S = DecodeGRRegsRegisterClass(Inst, Op4, Address, Decoder); + } + if (S == MCDisassembler::Success) { + DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder); + DecodeGRRegsRegisterClass(Inst, Op4, Address, Decoder); + DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder); + DecodeGRRegsRegisterClass(Inst, Op3, Address, Decoder); } return S; } diff --git a/lib/Target/XCore/XCoreAsmPrinter.cpp b/lib/Target/XCore/XCoreAsmPrinter.cpp index ea77d92..0d146ba 100644 --- 
a/lib/Target/XCore/XCoreAsmPrinter.cpp +++ b/lib/Target/XCore/XCoreAsmPrinter.cpp @@ -171,7 +171,7 @@ void XCoreAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) { // The ABI requires that unsigned scalar types smaller than 32 bits // are padded to 32 bits. if (Size < 4) - OutStreamer.EmitZeros(4 - Size, 0); + OutStreamer.EmitZeros(4 - Size); // Mark the end of the global OutStreamer.EmitRawText("\t.cc_bottom " + Twine(GVSym->getName()) + ".data"); diff --git a/lib/Target/XCore/XCoreFrameLowering.cpp b/lib/Target/XCore/XCoreFrameLowering.cpp index bb9c77a..019c457 100644 --- a/lib/Target/XCore/XCoreFrameLowering.cpp +++ b/lib/Target/XCore/XCoreFrameLowering.cpp @@ -332,6 +332,58 @@ bool XCoreFrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB, return true; } +// This function eliminates ADJCALLSTACKDOWN, +// ADJCALLSTACKUP pseudo instructions +void XCoreFrameLowering:: +eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const { + const XCoreInstrInfo &TII = + *static_cast<const XCoreInstrInfo*>(MF.getTarget().getInstrInfo()); + if (!hasReservedCallFrame(MF)) { + // Turn the adjcallstackdown instruction into 'extsp <amt>' and the + // adjcallstackup instruction into 'ldaw sp, sp[<amt>]' + MachineInstr *Old = I; + uint64_t Amount = Old->getOperand(0).getImm(); + if (Amount != 0) { + // We need to keep the stack aligned properly. To do this, we round the + // amount of space needed for the outgoing arguments up to the next + // alignment boundary. + unsigned Align = getStackAlignment(); + Amount = (Amount+Align-1)/Align*Align; + + assert(Amount%4 == 0); + Amount /= 4; + + bool isU6 = isImmU6(Amount); + if (!isU6 && !isImmU16(Amount)) { + // FIX could emit multiple instructions in this case. +#ifndef NDEBUG + errs() << "eliminateCallFramePseudoInstr size too big: " + << Amount << "\n"; +#endif + llvm_unreachable(0); + } + + MachineInstr *New; + if (Old->getOpcode() == XCore::ADJCALLSTACKDOWN) { + int Opcode = isU6 ? XCore::EXTSP_u6 : XCore::EXTSP_lu6; + New=BuildMI(MF, Old->getDebugLoc(), TII.get(Opcode)) + .addImm(Amount); + } else { + assert(Old->getOpcode() == XCore::ADJCALLSTACKUP); + int Opcode = isU6 ? XCore::LDAWSP_ru6_RRegs : XCore::LDAWSP_lru6_RRegs; + New=BuildMI(MF, Old->getDebugLoc(), TII.get(Opcode), XCore::SP) + .addImm(Amount); + } + + // Replace the pseudo instruction with a new instruction... 
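A note on the arithmetic earlier in this hunk: the outgoing-argument size is rounded up to the stack alignment and then converted from bytes to words, because the EXTSP and LDAWSP immediates count 4-byte words. A minimal sketch, with an alignment of 8 assumed for the example and u6/u16 predicates mirroring the ones used above:

#include <cassert>
#include <cstdint>
#include <cstdio>

static bool isImmU6(uint64_t V) { return V < (1 << 6); }
static bool isImmU16(uint64_t V) { return V < (1 << 16); }

// Round a byte count up to the stack alignment and convert to words.
static uint64_t bytesToSpWords(uint64_t Amount, unsigned Align) {
  Amount = (Amount + Align - 1) / Align * Align; // round up
  assert(Amount % 4 == 0 && "stack adjustment must be word-sized");
  return Amount / 4; // EXTSP / LDAWSP immediates are in words
}

int main() {
  uint64_t Words = bytesToSpWords(13, 8); // 13 bytes -> 16 bytes -> 4 words
  std::printf("words=%llu u6=%d u16=%d\n", (unsigned long long)Words,
              isImmU6(Words), isImmU16(Words)); // words=4 u6=1 u16=1
  return 0;
}

The isU6 check above then selects the short EXTSP_u6/LDAWSP_ru6 encoding when the word count fits in six bits.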
+ MBB.insert(I, New); + } + } + + MBB.erase(I); +} + void XCoreFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, RegScavenger *RS) const { diff --git a/lib/Target/XCore/XCoreFrameLowering.h b/lib/Target/XCore/XCoreFrameLowering.h index db1bbb6..ebad62f 100644 --- a/lib/Target/XCore/XCoreFrameLowering.h +++ b/lib/Target/XCore/XCoreFrameLowering.h @@ -39,6 +39,10 @@ namespace llvm { const std::vector<CalleeSavedInfo> &CSI, const TargetRegisterInfo *TRI) const; + void eliminateCallFramePseudoInstr(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const; + bool hasFP(const MachineFunction &MF) const; void processFunctionBeforeCalleeSavedScan(MachineFunction &MF, diff --git a/lib/Target/XCore/XCoreISelDAGToDAG.cpp b/lib/Target/XCore/XCoreISelDAGToDAG.cpp index 472ce63..fbf86c5 100644 --- a/lib/Target/XCore/XCoreISelDAGToDAG.cpp +++ b/lib/Target/XCore/XCoreISelDAGToDAG.cpp @@ -211,15 +211,10 @@ SDNode *XCoreDAGToDAGISel::Select(SDNode *N) { return CurDAG->getMachineNode(XCore::LMUL_l6r, dl, MVT::i32, MVT::i32, Ops, 4); } - case ISD::INTRINSIC_WO_CHAIN: { - unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); - switch (IntNo) { - case Intrinsic::xcore_crc8: - SDValue Ops[] = { N->getOperand(1), N->getOperand(2), N->getOperand(3) }; - return CurDAG->getMachineNode(XCore::CRC8_l4r, dl, MVT::i32, MVT::i32, - Ops, 3); - } - break; + case XCoreISD::CRC8: { + SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2) }; + return CurDAG->getMachineNode(XCore::CRC8_l4r, dl, MVT::i32, MVT::i32, + Ops, 3); } case ISD::BRIND: if (SDNode *ResNode = SelectBRIND(N)) diff --git a/lib/Target/XCore/XCoreISelLowering.cpp b/lib/Target/XCore/XCoreISelLowering.cpp index 6e894ac..f8a9125 100644 --- a/lib/Target/XCore/XCoreISelLowering.cpp +++ b/lib/Target/XCore/XCoreISelLowering.cpp @@ -54,6 +54,7 @@ getTargetNodeName(unsigned Opcode) const case XCoreISD::LMUL : return "XCoreISD::LMUL"; case XCoreISD::MACCU : return "XCoreISD::MACCU"; case XCoreISD::MACCS : return "XCoreISD::MACCS"; + case XCoreISD::CRC8 : return "XCoreISD::CRC8"; case XCoreISD::BR_JT : return "XCoreISD::BR_JT"; case XCoreISD::BR_JT32 : return "XCoreISD::BR_JT32"; default : return NULL; @@ -152,9 +153,12 @@ XCoreTargetLowering::XCoreTargetLowering(XCoreTargetMachine &XTM) setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom); setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom); - maxStoresPerMemset = maxStoresPerMemsetOptSize = 4; - maxStoresPerMemmove = maxStoresPerMemmoveOptSize - = maxStoresPerMemcpy = maxStoresPerMemcpyOptSize = 2; + // We want to custom lower some of our intrinsics. 
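The comment above marks the new hook-up: flagging ISD::INTRINSIC_WO_CHAIN as Custom makes the legalizer route those nodes back through LowerOperation, whose switch (extended just below) forwards them to the new LowerINTRINSIC_WO_CHAIN. A toy model of that dispatch in plain C++, with hypothetical stand-in types; only the control flow is meant to match:

#include <cstdio>

// Stand-ins for ISD opcodes and legalize actions (hypothetical).
enum Opcode { INTRINSIC_WO_CHAIN, ADD };
enum Action { Legal, Custom };

// Equivalent of the setOperationAction(..., Custom) call below.
static Action getOperationAction(Opcode Opc) {
  return Opc == INTRINSIC_WO_CHAIN ? Custom : Legal;
}

static void lowerIntrinsicWOChain() {
  std::printf("LowerINTRINSIC_WO_CHAIN: emit a target node (e.g. CRC8)\n");
}

// Equivalent of LowerOperation's switch: reached only for Custom nodes.
static void lowerOperation(Opcode Opc) {
  switch (Opc) {
  case INTRINSIC_WO_CHAIN: lowerIntrinsicWOChain(); break;
  default:                 std::printf("unimplemented operand\n"); break;
  }
}

int main() {
  Opcode N = INTRINSIC_WO_CHAIN;
  if (getOperationAction(N) == Custom)
    lowerOperation(N); // the legalizer performs this callback
  return 0;
}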
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); + + MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 4; + MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize + = MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 2; // We have target-specific dag combine patterns for the following nodes: setTargetDAGCombine(ISD::STORE); @@ -167,24 +171,25 @@ SDValue XCoreTargetLowering:: LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { - case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); - case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); - case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); - case ISD::ConstantPool: return LowerConstantPool(Op, DAG); - case ISD::BR_JT: return LowerBR_JT(Op, DAG); - case ISD::LOAD: return LowerLOAD(Op, DAG); - case ISD::STORE: return LowerSTORE(Op, DAG); - case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); - case ISD::VAARG: return LowerVAARG(Op, DAG); - case ISD::VASTART: return LowerVASTART(Op, DAG); - case ISD::SMUL_LOHI: return LowerSMUL_LOHI(Op, DAG); - case ISD::UMUL_LOHI: return LowerUMUL_LOHI(Op, DAG); + case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); + case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); + case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); + case ISD::ConstantPool: return LowerConstantPool(Op, DAG); + case ISD::BR_JT: return LowerBR_JT(Op, DAG); + case ISD::LOAD: return LowerLOAD(Op, DAG); + case ISD::STORE: return LowerSTORE(Op, DAG); + case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); + case ISD::VAARG: return LowerVAARG(Op, DAG); + case ISD::VASTART: return LowerVASTART(Op, DAG); + case ISD::SMUL_LOHI: return LowerSMUL_LOHI(Op, DAG); + case ISD::UMUL_LOHI: return LowerUMUL_LOHI(Op, DAG); // FIXME: Remove these when LegalizeDAGTypes lands. case ISD::ADD: - case ISD::SUB: return ExpandADDSUB(Op.getNode(), DAG); - case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); - case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG); - case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG); + case ISD::SUB: return ExpandADDSUB(Op.getNode(), DAG); + case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); + case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG); + case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG); + case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); default: llvm_unreachable("unimplemented operand"); } @@ -736,13 +741,13 @@ ExpandADDSUB(SDNode *N, SelectionDAG &DAG) const unsigned Opcode = (N->getOpcode() == ISD::ADD) ? 
XCoreISD::LADD : XCoreISD::LSUB; SDValue Zero = DAG.getConstant(0, MVT::i32); - SDValue Carry = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32), - LHSL, RHSL, Zero); - SDValue Lo(Carry.getNode(), 1); + SDValue Lo = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32), + LHSL, RHSL, Zero); + SDValue Carry(Lo.getNode(), 1); - SDValue Ignored = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32), - LHSH, RHSH, Carry); - SDValue Hi(Ignored.getNode(), 1); + SDValue Hi = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32), + LHSH, RHSH, Carry); + SDValue Ignored(Hi.getNode(), 1); // Merge the pieces return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi); } @@ -858,6 +863,23 @@ LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 5); } +SDValue XCoreTargetLowering:: +LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { + DebugLoc DL = Op.getDebugLoc(); + unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); + switch (IntNo) { + case Intrinsic::xcore_crc8: + EVT VT = Op.getValueType(); + SDValue Data = + DAG.getNode(XCoreISD::CRC8, DL, DAG.getVTList(VT, VT), + Op.getOperand(1), Op.getOperand(2) , Op.getOperand(3)); + SDValue Crc(Data.getNode(), 1); + SDValue Results[] = { Crc, Data }; + return DAG.getMergeValues(Results, 2, DL); + } + return SDValue(); +} + //===----------------------------------------------------------------------===// // Calling Convention Implementation //===----------------------------------------------------------------------===// @@ -1227,15 +1249,11 @@ XCoreTargetLowering::LowerReturn(SDValue Chain, // Analyze return values. CCInfo.AnalyzeReturn(Outs, RetCC_XCore); - // If this is the first return lowered for this function, add - // the regs to the liveout set for the function. - if (DAG.getMachineFunction().getRegInfo().liveout_empty()) { - for (unsigned i = 0; i != RVLocs.size(); ++i) - if (RVLocs[i].isRegLoc()) - DAG.getMachineFunction().getRegInfo().addLiveOut(RVLocs[i].getLocReg()); - } - SDValue Flag; + SmallVector<SDValue, 4> RetOps(1, Chain); + + // Return on XCore is always a "retsp 0" + RetOps.push_back(DAG.getConstant(0, MVT::i32)); // Copy the result values into the output registers. for (unsigned i = 0; i != RVLocs.size(); ++i) { @@ -1248,15 +1266,17 @@ XCoreTargetLowering::LowerReturn(SDValue Chain, // guarantee that all emitted copies are // stuck together, avoiding something bad Flag = Chain.getValue(1); + RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); } - // Return on XCore is always a "retsp 0" + RetOps[0] = Chain; // Update chain. + + // Add the flag if we have it. 
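LowerReturn above now threads the copied return registers through the RETSP node itself, in the order chain, the retsp immediate, one operand per register, then the glue if present, instead of registering them as function live-outs; the XCoreRetsp SDNode later in this diff gains SDNPVariadic to accept the extra operands. A toy sketch of the operand order, with strings standing in for SDValues:

#include <cstdio>
#include <string>
#include <vector>

int main() {
  // Mirrors the RetOps vector built above (illustrative only).
  std::vector<std::string> RetOps(1, "chain");
  RetOps.push_back("imm:0"); // "retsp 0": the stack-adjustment operand
  const char *Regs[] = {"r0", "r1"}; // hypothetical return registers
  for (const char *Reg : Regs)
    RetOps.push_back(std::string("reg:") + Reg);
  bool HaveGlue = true; // set when a CopyToReg produced glue
  if (HaveGlue)
    RetOps.push_back("glue");
  for (size_t I = 0; I != RetOps.size(); ++I)
    std::printf("%s ", RetOps[I].c_str()); // chain imm:0 reg:r0 reg:r1 glue
  std::printf("\n");
  return 0;
}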
if (Flag.getNode()) - return DAG.getNode(XCoreISD::RETSP, dl, MVT::Other, - Chain, DAG.getConstant(0, MVT::i32), Flag); - else // Return Void - return DAG.getNode(XCoreISD::RETSP, dl, MVT::Other, - Chain, DAG.getConstant(0, MVT::i32)); + RetOps.push_back(Flag); + + return DAG.getNode(XCoreISD::RETSP, dl, MVT::Other, + &RetOps[0], RetOps.size()); } //===----------------------------------------------------------------------===// @@ -1353,13 +1373,13 @@ SDValue XCoreTargetLowering::PerformDAGCombine(SDNode *N, SDValue Carry = DAG.getConstant(0, VT); SDValue Result = DAG.getNode(ISD::AND, dl, VT, N2, DAG.getConstant(1, VT)); - SDValue Ops [] = { Carry, Result }; + SDValue Ops[] = { Result, Carry }; return DAG.getMergeValues(Ops, 2, dl); } // fold (ladd x, 0, y) -> 0, add x, y iff carry is unused and y has only the // low bit set - if (N1C && N1C->isNullValue() && N->hasNUsesOfValue(0, 0)) { + if (N1C && N1C->isNullValue() && N->hasNUsesOfValue(0, 1)) { APInt KnownZero, KnownOne; APInt Mask = APInt::getHighBitsSet(VT.getSizeInBits(), VT.getSizeInBits() - 1); @@ -1367,7 +1387,7 @@ SDValue XCoreTargetLowering::PerformDAGCombine(SDNode *N, if ((KnownZero & Mask) == Mask) { SDValue Carry = DAG.getConstant(0, VT); SDValue Result = DAG.getNode(ISD::ADD, dl, VT, N0, N2); - SDValue Ops [] = { Carry, Result }; + SDValue Ops[] = { Result, Carry }; return DAG.getMergeValues(Ops, 2, dl); } } @@ -1391,14 +1411,14 @@ SDValue XCoreTargetLowering::PerformDAGCombine(SDNode *N, SDValue Borrow = N2; SDValue Result = DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, VT), N2); - SDValue Ops [] = { Borrow, Result }; + SDValue Ops[] = { Result, Borrow }; return DAG.getMergeValues(Ops, 2, dl); } } // fold (lsub x, 0, y) -> 0, sub x, y iff borrow is unused and y has only the // low bit set - if (N1C && N1C->isNullValue() && N->hasNUsesOfValue(0, 0)) { + if (N1C && N1C->isNullValue() && N->hasNUsesOfValue(0, 1)) { APInt KnownZero, KnownOne; APInt Mask = APInt::getHighBitsSet(VT.getSizeInBits(), VT.getSizeInBits() - 1); @@ -1406,7 +1426,7 @@ SDValue XCoreTargetLowering::PerformDAGCombine(SDNode *N, if ((KnownZero & Mask) == Mask) { SDValue Borrow = DAG.getConstant(0, VT); SDValue Result = DAG.getNode(ISD::SUB, dl, VT, N0, N2); - SDValue Ops [] = { Borrow, Result }; + SDValue Ops[] = { Result, Borrow }; return DAG.getMergeValues(Ops, 2, dl); } } @@ -1432,11 +1452,15 @@ SDValue XCoreTargetLowering::PerformDAGCombine(SDNode *N, // If the high result is unused fold to add(a, b) if (N->hasNUsesOfValue(0, 0)) { SDValue Lo = DAG.getNode(ISD::ADD, dl, VT, N2, N3); - SDValue Ops [] = { Lo, Lo }; + SDValue Ops[] = { Lo, Lo }; return DAG.getMergeValues(Ops, 2, dl); } // Otherwise fold to ladd(a, b, 0) - return DAG.getNode(XCoreISD::LADD, dl, DAG.getVTList(VT, VT), N2, N3, N1); + SDValue Result = + DAG.getNode(XCoreISD::LADD, dl, DAG.getVTList(VT, VT), N2, N3, N1); + SDValue Carry(Result.getNode(), 1); + SDValue Ops[] = { Carry, Result }; + return DAG.getMergeValues(Ops, 2, dl); } } break; @@ -1530,7 +1554,7 @@ void XCoreTargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, default: break; case XCoreISD::LADD: case XCoreISD::LSUB: - if (Op.getResNo() == 0) { + if (Op.getResNo() == 1) { // Top bits of carry / borrow are clear. 
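Because the merge-value reordering earlier in this hunk makes the sum result #0 and the carry result #1 of LADD/LSUB, every consumer flips its result index: the hasNUsesOfValue checks now test value 1, and the known-bits code below reports the carry's high bits clear for getResNo() == 1. A sketch of the mask that APInt::getHighBitsSet(32, 31) produces, in plain integers:

#include <cstdint>
#include <cstdio>

// A carry/borrow is 0 or 1, so bits 1..31 of the 32-bit result are known
// zero. Plain-integer getHighBitsSet (assumes 0 < HiBits < Width).
static uint32_t highBitsSet(unsigned Width, unsigned HiBits) {
  return ~((1u << (Width - HiBits)) - 1);
}

int main() {
  std::printf("0x%08x\n", highBitsSet(32, 31)); // prints 0xfffffffe
  return 0;
}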
KnownZero = APInt::getHighBitsSet(KnownZero.getBitWidth(), KnownZero.getBitWidth() - 1); diff --git a/lib/Target/XCore/XCoreISelLowering.h b/lib/Target/XCore/XCoreISelLowering.h index 2874f00..6d430ef 100644 --- a/lib/Target/XCore/XCoreISelLowering.h +++ b/lib/Target/XCore/XCoreISelLowering.h @@ -63,6 +63,9 @@ namespace llvm { // Corresponds to MACCS instruction MACCS, + // Corresponds to CRC8 instruction + CRC8, + // Jumptable branch. BR_JT, @@ -147,6 +150,7 @@ namespace llvm { SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; // Inline asm support std::pair<unsigned, const TargetRegisterClass*> diff --git a/lib/Target/XCore/XCoreInstrFormats.td b/lib/Target/XCore/XCoreInstrFormats.td index 44ac45c..379cc39 100644 --- a/lib/Target/XCore/XCoreInstrFormats.td +++ b/lib/Target/XCore/XCoreInstrFormats.td @@ -33,44 +33,122 @@ class PseudoInstXCore<dag outs, dag ins, string asmstr, list<dag> pattern> // Instruction formats //===----------------------------------------------------------------------===// -class _F3R<dag outs, dag ins, string asmstr, list<dag> pattern> +class _F3R<bits<5> opc, dag outs, dag ins, string asmstr, list<dag> pattern> : InstXCore<2, outs, ins, asmstr, pattern> { + let Inst{15-11} = opc; + let DecoderMethod = "Decode3RInstruction"; } -class _FL3R<dag outs, dag ins, string asmstr, list<dag> pattern> +// 3R with first operand as an immediate. Used for TSETR where the first +// operand is treated as an immediate since it refers to a register number in +// another thread. +class _F3RImm<bits<5> opc, dag outs, dag ins, string asmstr, list<dag> pattern> + : _F3R<opc, outs, ins, asmstr, pattern> { + let DecoderMethod = "Decode3RImmInstruction"; +} + +class _FL3R<bits<9> opc, dag outs, dag ins, string asmstr, list<dag> pattern> : InstXCore<4, outs, ins, asmstr, pattern> { + let Inst{31-27} = opc{8-4}; + let Inst{26-20} = 0b1111110; + let Inst{19-16} = opc{3-0}; + + let Inst{15-11} = 0b11111; + let DecoderMethod = "DecodeL3RInstruction"; } -class _F2RUS<dag outs, dag ins, string asmstr, list<dag> pattern> +// L3R with first operand as both a source and a destination. 
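The _FL3R class above pins down the fixed fields of the XCore long 32-bit format: opc{8-4} at bits 31-27, the 0b1111110 marker at 26-20, opc{3-0} at 19-16, and 0b11111 at 15-11, which is exactly what the DecodeL2OpInstructionFail dispatch earlier in this diff reassembles. A sketch of the encoder side, with the operand fields left zero:

#include <cstdint>
#include <cstdio>

// Assemble the fixed fields of an XCore long 3R instruction from a
// 9-bit opcode, following the _FL3R layout above.
static uint32_t encodeFL3R(unsigned Opc9) {
  uint32_t Insn = 0;
  Insn |= ((Opc9 >> 4) & 0x1f) << 27; // opc{8-4}
  Insn |= 0x7eu << 20;                // 0b1111110 long-format marker
  Insn |= (Opc9 & 0xf) << 16;         // opc{3-0}
  Insn |= 0x1fu << 11;                // 0b11111 escapes the short format
  return Insn;                        // operand fields at bits 10-0 stay 0
}

int main() {
  // STW_l3r later in this diff uses opc 0b000001100 (0x00c), matching
  // case 0x0c in the decoder dispatch.
  std::printf("0x%08x\n", encodeFL3R(0x00c)); // prints 0x07ecf800
  return 0;
}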
+class _FL3RSrcDst<bits<9> opc, dag outs, dag ins, string asmstr, + list<dag> pattern> : _FL3R<opc, outs, ins, asmstr, pattern> { + let DecoderMethod = "DecodeL3RSrcDstInstruction"; +} + +class _F2RUS<bits<5> opc, dag outs, dag ins, string asmstr, list<dag> pattern> : InstXCore<2, outs, ins, asmstr, pattern> { + let Inst{15-11} = opc; + let DecoderMethod = "Decode2RUSInstruction"; +} + +// 2RUS with bitp operand +class _F2RUSBitp<bits<5> opc, dag outs, dag ins, string asmstr, + list<dag> pattern> + : _F2RUS<opc, outs, ins, asmstr, pattern> { + let DecoderMethod = "Decode2RUSBitpInstruction"; } -class _FL2RUS<dag outs, dag ins, string asmstr, list<dag> pattern> +class _FL2RUS<bits<9> opc, dag outs, dag ins, string asmstr, list<dag> pattern> : InstXCore<4, outs, ins, asmstr, pattern> { + let Inst{31-27} = opc{8-4}; + let Inst{26-20} = 0b1111110; + let Inst{19-16} = opc{3-0}; + + let Inst{15-11} = 0b11111; + let DecoderMethod = "DecodeL2RUSInstruction"; +} + +// L2RUS with bitp operand +class _FL2RUSBitp<bits<9> opc, dag outs, dag ins, string asmstr, + list<dag> pattern> + : _FL2RUS<opc, outs, ins, asmstr, pattern> { + let DecoderMethod = "DecodeL2RUSBitpInstruction"; } -class _FRU6<dag outs, dag ins, string asmstr, list<dag> pattern> +class _FRU6<bits<6> opc, dag outs, dag ins, string asmstr, list<dag> pattern> : InstXCore<2, outs, ins, asmstr, pattern> { + bits<4> a; + bits<6> b; + + let Inst{15-10} = opc; + let Inst{9-6} = a; + let Inst{5-0} = b; } -class _FLRU6<dag outs, dag ins, string asmstr, list<dag> pattern> +class _FLRU6<bits<6> opc, dag outs, dag ins, string asmstr, list<dag> pattern> : InstXCore<4, outs, ins, asmstr, pattern> { + bits<4> a; + bits<16> b; + + let Inst{31-26} = opc; + let Inst{25-22} = a; + let Inst{21-16} = b{5-0}; + let Inst{15-10} = 0b111100; + let Inst{9-0} = b{15-6}; } -class _FU6<dag outs, dag ins, string asmstr, list<dag> pattern> +class _FU6<bits<10> opc, dag outs, dag ins, string asmstr, list<dag> pattern> : InstXCore<2, outs, ins, asmstr, pattern> { + bits<6> a; + + let Inst{15-6} = opc; + let Inst{5-0} = a; } -class _FLU6<dag outs, dag ins, string asmstr, list<dag> pattern> +class _FLU6<bits<10> opc, dag outs, dag ins, string asmstr, list<dag> pattern> : InstXCore<4, outs, ins, asmstr, pattern> { + bits<16> a; + + let Inst{31-22} = opc; + let Inst{21-16} = a{5-0}; + let Inst{15-10} = 0b111100; + let Inst{9-0} = a{15-6}; } -class _FU10<dag outs, dag ins, string asmstr, list<dag> pattern> +class _FU10<bits<6> opc, dag outs, dag ins, string asmstr, list<dag> pattern> : InstXCore<2, outs, ins, asmstr, pattern> { + bits<10> a; + + let Inst{15-10} = opc; + let Inst{9-0} = a; } -class _FLU10<dag outs, dag ins, string asmstr, list<dag> pattern> +class _FLU10<bits<6> opc, dag outs, dag ins, string asmstr, list<dag> pattern> : InstXCore<4, outs, ins, asmstr, pattern> { + bits<20> a; + + let Inst{31-26} = opc; + let Inst{25-16} = a{9-0}; + let Inst{15-10} = 0b111100; + let Inst{9-0} = a{19-10}; } class _F2R<bits<6> opc, dag outs, dag ins, string asmstr, list<dag> pattern> @@ -80,6 +158,14 @@ class _F2R<bits<6> opc, dag outs, dag ins, string asmstr, list<dag> pattern> let DecoderMethod = "Decode2RInstruction"; } +// 2R with first operand as an immediate. Used for TSETMR where the first +// operand is treated as an immediate since it refers to a register number in +// another thread. 
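The _FLRU6/_FLU6 classes above split a long-form immediate across the two 16-bit halves of the word: a{5-0} goes to bits 21-16 and a{15-6} to bits 9-0, with the 0b111100 prefix marker at bits 15-10. A round-trip sketch of that split; the opcode value here is made up:

#include <cassert>
#include <cstdint>
#include <cstdio>

// Encode a 10-bit opcode and 16-bit immediate in the _FLU6 layout.
static uint32_t encodeLU6Imm(unsigned Opc10, unsigned Imm16) {
  assert(Imm16 < (1u << 16));
  uint32_t Insn = 0;
  Insn |= (Opc10 & 0x3ff) << 22; // opcode, bits 31-22
  Insn |= (Imm16 & 0x3f) << 16;  // a{5-0}, bits 21-16
  Insn |= 0x3cu << 10;           // 0b111100 marker, bits 15-10
  Insn |= (Imm16 >> 6) & 0x3ff;  // a{15-6}, bits 9-0
  return Insn;
}

int main() {
  uint32_t Insn = encodeLU6Imm(0x155, 1000); // opcode value is made up
  // Recover the immediate: low six bits from 21-16, high ten from 9-0.
  unsigned Imm = ((Insn >> 16) & 0x3f) | (Insn & 0x3ff) << 6;
  std::printf("imm=%u\n", Imm); // prints imm=1000
  return 0;
}

This is why the short u6 form covers immediates below 64 while the long form extends the same mnemonic to 16 bits.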
+class _F2RImm<bits<6> opc, dag outs, dag ins, string asmstr, list<dag> pattern> + : _F2R<opc, outs, ins, asmstr, pattern> { + let DecoderMethod = "Decode2RImmInstruction"; +} + // 2R with first operand as both a source and a destination. class _F2RSrcDst<bits<6> opc, dag outs, dag ins, string asmstr, list<dag> pattern> : _F2R<opc, outs, ins, asmstr, pattern> { @@ -148,14 +234,44 @@ class _F0R<bits<10> opc, dag outs, dag ins, string asmstr, list<dag> pattern> let Inst{4-0} = opc{4-0}; } -class _L4R<dag outs, dag ins, string asmstr, list<dag> pattern> +class _FL4R<bits<6> opc, dag outs, dag ins, string asmstr, list<dag> pattern> : InstXCore<4, outs, ins, asmstr, pattern> { + bits<4> d; + + let Inst{31-27} = opc{5-1}; + let Inst{26-21} = 0b111111; + let Inst{20} = opc{0}; + let Inst{19-16} = d; + let Inst{15-11} = 0b11111; } -class _L5R<dag outs, dag ins, string asmstr, list<dag> pattern> +// L4R with 4th operand as both a source and a destination. +class _FL4RSrcDst<bits<6> opc, dag outs, dag ins, string asmstr, + list<dag> pattern> + : _FL4R<opc, outs, ins, asmstr, pattern> { + let DecoderMethod = "DecodeL4RSrcDstInstruction"; +} + +// L4R with 1st and 4th operand as both a source and a destination. +class _FL4RSrcDstSrcDst<bits<6> opc, dag outs, dag ins, string asmstr, + list<dag> pattern> + : _FL4R<opc, outs, ins, asmstr, pattern> { + let DecoderMethod = "DecodeL4RSrcDstSrcDstInstruction"; +} + +class _FL5R<bits<6> opc, dag outs, dag ins, string asmstr, list<dag> pattern> : InstXCore<4, outs, ins, asmstr, pattern> { + let Inst{31-27} = opc{5-1}; + let Inst{20} = opc{0}; + let Inst{15-11} = 0b11111; + + let DecoderMethod = "DecodeL5RInstruction"; } -class _L6R<dag outs, dag ins, string asmstr, list<dag> pattern> +class _FL6R<bits<5> opc, dag outs, dag ins, string asmstr, list<dag> pattern> : InstXCore<4, outs, ins, asmstr, pattern> { + let Inst{31-27} = opc; + let Inst{15-11} = 0b11111; + + let DecoderMethod = "DecodeL6RInstruction"; } diff --git a/lib/Target/XCore/XCoreInstrInfo.td b/lib/Target/XCore/XCoreInstrInfo.td index 95b076f..e140ef2 100644 --- a/lib/Target/XCore/XCoreInstrInfo.td +++ b/lib/Target/XCore/XCoreInstrInfo.td @@ -32,8 +32,8 @@ def XCoreBranchLink : SDNode<"XCoreISD::BL",SDT_XCoreBranchLink, [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, SDNPVariadic]>; -def XCoreRetsp : SDNode<"XCoreISD::RETSP", SDTBrind, - [SDNPHasChain, SDNPOptInGlue, SDNPMayLoad]>; +def XCoreRetsp : SDNode<"XCoreISD::RETSP", SDTBrind, + [SDNPHasChain, SDNPOptInGlue, SDNPMayLoad, SDNPVariadic]>; def SDT_XCoreBR_JT : SDTypeProfile<0, 2, [SDTCisVT<0, i32>, SDTCisVT<1, i32>]>; @@ -182,6 +182,7 @@ def ADDRcpii : ComplexPattern<i32, 2, "SelectADDRcpii", [add, cprelwrapper], // Address operands def MEMii : Operand<i32> { let PrintMethod = "printMemOperand"; + let DecoderMethod = "DecodeMEMiiOperand"; let MIOperandInfo = (ops i32imm, i32imm); } @@ -200,146 +201,110 @@ def InlineJT32 : Operand<i32> { // Three operand short -multiclass F3R_2RUS<string OpcStr, SDNode OpNode> { - def _3r: _F3R< - (outs GRRegs:$dst), (ins GRRegs:$b, GRRegs:$c), - !strconcat(OpcStr, " $dst, $b, $c"), - [(set GRRegs:$dst, (OpNode GRRegs:$b, GRRegs:$c))]>; - def _2rus : _F2RUS< - (outs GRRegs:$dst), (ins GRRegs:$b, i32imm:$c), - !strconcat(OpcStr, " $dst, $b, $c"), - [(set GRRegs:$dst, (OpNode GRRegs:$b, immUs:$c))]>; +multiclass F3R_2RUS<bits<5> opc1, bits<5> opc2, string OpcStr, SDNode OpNode> { + def _3r: _F3R<opc1, (outs GRRegs:$dst), (ins GRRegs:$b, GRRegs:$c), + !strconcat(OpcStr, " $dst, $b, $c"), + [(set GRRegs:$dst, (OpNode 
GRRegs:$b, GRRegs:$c))]>; + def _2rus : _F2RUS<opc2, (outs GRRegs:$dst), (ins GRRegs:$b, i32imm:$c), + !strconcat(OpcStr, " $dst, $b, $c"), + [(set GRRegs:$dst, (OpNode GRRegs:$b, immUs:$c))]>; } -multiclass F3R_2RUS_np<string OpcStr> { - def _3r: _F3R< - (outs GRRegs:$dst), (ins GRRegs:$b, GRRegs:$c), - !strconcat(OpcStr, " $dst, $b, $c"), - []>; - def _2rus : _F2RUS< - (outs GRRegs:$dst), (ins GRRegs:$b, i32imm:$c), - !strconcat(OpcStr, " $dst, $b, $c"), - []>; +multiclass F3R_2RUS_np<bits<5> opc1, bits<5> opc2, string OpcStr> { + def _3r: _F3R<opc1, (outs GRRegs:$dst), (ins GRRegs:$b, GRRegs:$c), + !strconcat(OpcStr, " $dst, $b, $c"), []>; + def _2rus : _F2RUS<opc2, (outs GRRegs:$dst), (ins GRRegs:$b, i32imm:$c), + !strconcat(OpcStr, " $dst, $b, $c"), []>; } -multiclass F3R_2RBITP<string OpcStr, SDNode OpNode> { - def _3r: _F3R< - (outs GRRegs:$dst), (ins GRRegs:$b, GRRegs:$c), - !strconcat(OpcStr, " $dst, $b, $c"), - [(set GRRegs:$dst, (OpNode GRRegs:$b, GRRegs:$c))]>; - def _2rus : _F2RUS< - (outs GRRegs:$dst), (ins GRRegs:$b, i32imm:$c), - !strconcat(OpcStr, " $dst, $b, $c"), - [(set GRRegs:$dst, (OpNode GRRegs:$b, immBitp:$c))]>; +multiclass F3R_2RBITP<bits<5> opc1, bits<5> opc2, string OpcStr, + SDNode OpNode> { + def _3r: _F3R<opc1, (outs GRRegs:$dst), (ins GRRegs:$b, GRRegs:$c), + !strconcat(OpcStr, " $dst, $b, $c"), + [(set GRRegs:$dst, (OpNode GRRegs:$b, GRRegs:$c))]>; + def _2rus : _F2RUSBitp<opc2, (outs GRRegs:$dst), (ins GRRegs:$b, i32imm:$c), + !strconcat(OpcStr, " $dst, $b, $c"), + [(set GRRegs:$dst, (OpNode GRRegs:$b, immBitp:$c))]>; } -class F3R<string OpcStr, SDNode OpNode> : _F3R< - (outs GRRegs:$dst), (ins GRRegs:$b, GRRegs:$c), - !strconcat(OpcStr, " $dst, $b, $c"), - [(set GRRegs:$dst, (OpNode GRRegs:$b, GRRegs:$c))]>; +class F3R<bits<5> opc, string OpcStr, SDNode OpNode> : + _F3R<opc, (outs GRRegs:$dst), (ins GRRegs:$b, GRRegs:$c), + !strconcat(OpcStr, " $dst, $b, $c"), + [(set GRRegs:$dst, (OpNode GRRegs:$b, GRRegs:$c))]>; -class F3R_np<string OpcStr> : _F3R< - (outs GRRegs:$dst), (ins GRRegs:$b, GRRegs:$c), - !strconcat(OpcStr, " $dst, $b, $c"), - []>; +class F3R_np<bits<5> opc, string OpcStr> : + _F3R<opc, (outs GRRegs:$dst), (ins GRRegs:$b, GRRegs:$c), + !strconcat(OpcStr, " $dst, $b, $c"), []>; // Three operand long /// FL3R_L2RUS multiclass - Define a normal FL3R/FL2RUS pattern in one shot. -multiclass FL3R_L2RUS<string OpcStr, SDNode OpNode> { - def _l3r: _FL3R< - (outs GRRegs:$dst), (ins GRRegs:$b, GRRegs:$c), - !strconcat(OpcStr, " $dst, $b, $c"), - [(set GRRegs:$dst, (OpNode GRRegs:$b, GRRegs:$c))]>; - def _l2rus : _FL2RUS< - (outs GRRegs:$dst), (ins GRRegs:$b, i32imm:$c), - !strconcat(OpcStr, " $dst, $b, $c"), - [(set GRRegs:$dst, (OpNode GRRegs:$b, immUs:$c))]>; +multiclass FL3R_L2RUS<bits<9> opc1, bits<9> opc2, string OpcStr, + SDNode OpNode> { + def _l3r: _FL3R<opc1, (outs GRRegs:$dst), (ins GRRegs:$b, GRRegs:$c), + !strconcat(OpcStr, " $dst, $b, $c"), + [(set GRRegs:$dst, (OpNode GRRegs:$b, GRRegs:$c))]>; + def _l2rus : _FL2RUS<opc2, (outs GRRegs:$dst), (ins GRRegs:$b, i32imm:$c), + !strconcat(OpcStr, " $dst, $b, $c"), + [(set GRRegs:$dst, (OpNode GRRegs:$b, immUs:$c))]>; } /// FL3R_L2RUS multiclass - Define a normal FL3R/FL2RUS pattern in one shot. 
-multiclass FL3R_L2RBITP<string OpcStr, SDNode OpNode> { - def _l3r: _FL3R< - (outs GRRegs:$dst), (ins GRRegs:$b, GRRegs:$c), - !strconcat(OpcStr, " $dst, $b, $c"), - [(set GRRegs:$dst, (OpNode GRRegs:$b, GRRegs:$c))]>; - def _l2rus : _FL2RUS< - (outs GRRegs:$dst), (ins GRRegs:$b, i32imm:$c), - !strconcat(OpcStr, " $dst, $b, $c"), - [(set GRRegs:$dst, (OpNode GRRegs:$b, immBitp:$c))]>; +multiclass FL3R_L2RBITP<bits<9> opc1, bits<9> opc2, string OpcStr, + SDNode OpNode> { + def _l3r: _FL3R<opc1, (outs GRRegs:$dst), (ins GRRegs:$b, GRRegs:$c), + !strconcat(OpcStr, " $dst, $b, $c"), + [(set GRRegs:$dst, (OpNode GRRegs:$b, GRRegs:$c))]>; + def _l2rus : _FL2RUSBitp<opc2, (outs GRRegs:$dst), (ins GRRegs:$b, i32imm:$c), + !strconcat(OpcStr, " $dst, $b, $c"), + [(set GRRegs:$dst, (OpNode GRRegs:$b, immBitp:$c))]>; } -class FL3R<string OpcStr, SDNode OpNode> : _FL3R< - (outs GRRegs:$dst), (ins GRRegs:$b, GRRegs:$c), - !strconcat(OpcStr, " $dst, $b, $c"), - [(set GRRegs:$dst, (OpNode GRRegs:$b, GRRegs:$c))]>; +class FL3R<bits<9> opc, string OpcStr, SDNode OpNode> : + _FL3R<opc, (outs GRRegs:$dst), (ins GRRegs:$b, GRRegs:$c), + !strconcat(OpcStr, " $dst, $b, $c"), + [(set GRRegs:$dst, (OpNode GRRegs:$b, GRRegs:$c))]>; // Register - U6 // Operand register - U6 -multiclass FRU6_LRU6_branch<string OpcStr> { - def _ru6: _FRU6< - (outs), (ins GRRegs:$cond, brtarget:$dest), - !strconcat(OpcStr, " $cond, $dest"), - []>; - def _lru6: _FLRU6< - (outs), (ins GRRegs:$cond, brtarget:$dest), - !strconcat(OpcStr, " $cond, $dest"), - []>; +multiclass FRU6_LRU6_branch<bits<6> opc, string OpcStr> { + def _ru6: _FRU6<opc, (outs), (ins GRRegs:$a, brtarget:$b), + !strconcat(OpcStr, " $a, $b"), []>; + def _lru6: _FLRU6<opc, (outs), (ins GRRegs:$a, brtarget:$b), + !strconcat(OpcStr, " $a, $b"), []>; } -multiclass FRU6_LRU6_cp<string OpcStr> { - def _ru6: _FRU6< - (outs GRRegs:$dst), (ins i32imm:$a), - !strconcat(OpcStr, " $dst, cp[$a]"), - []>; - def _lru6: _FLRU6< - (outs GRRegs:$dst), (ins i32imm:$a), - !strconcat(OpcStr, " $dst, cp[$a]"), - []>; +multiclass FRU6_LRU6_backwards_branch<bits<6> opc, string OpcStr> { + def _ru6: _FRU6<opc, (outs), (ins GRRegs:$a, brtarget:$b), + !strconcat(OpcStr, " $a, -$b"), []>; + def _lru6: _FLRU6<opc, (outs), (ins GRRegs:$a, brtarget:$b), + !strconcat(OpcStr, " $a, -$b"), []>; } -// U6 -multiclass FU6_LU6<string OpcStr, SDNode OpNode> { - def _u6: _FU6< - (outs), (ins i32imm:$b), - !strconcat(OpcStr, " $b"), - [(OpNode immU6:$b)]>; - def _lu6: _FLU6< - (outs), (ins i32imm:$b), - !strconcat(OpcStr, " $b"), - [(OpNode immU16:$b)]>; +multiclass FRU6_LRU6_cp<bits<6> opc, string OpcStr> { + def _ru6: _FRU6<opc, (outs GRRegs:$a), (ins i32imm:$b), + !strconcat(OpcStr, " $a, cp[$b]"), []>; + def _lru6: _FLRU6<opc, (outs GRRegs:$a), (ins i32imm:$b), + !strconcat(OpcStr, " $a, cp[$b]"), []>; } -multiclass FU6_LU6_int<string OpcStr, Intrinsic Int> { - def _u6: _FU6< - (outs), (ins i32imm:$b), - !strconcat(OpcStr, " $b"), - [(Int immU6:$b)]>; - def _lu6: _FLU6< - (outs), (ins i32imm:$b), - !strconcat(OpcStr, " $b"), - [(Int immU16:$b)]>; + +// U6 +multiclass FU6_LU6<bits<10> opc, string OpcStr, SDNode OpNode> { + def _u6: _FU6<opc, (outs), (ins i32imm:$a), !strconcat(OpcStr, " $a"), + [(OpNode immU6:$a)]>; + def _lu6: _FLU6<opc, (outs), (ins i32imm:$a), !strconcat(OpcStr, " $a"), + [(OpNode immU16:$a)]>; } -multiclass FU6_LU6_np<string OpcStr> { - def _u6: _FU6< - (outs), (ins i32imm:$b), - !strconcat(OpcStr, " $b"), - []>; - def _lu6: _FLU6< - (outs), (ins i32imm:$b), - !strconcat(OpcStr, " 
$b"), - []>; +multiclass FU6_LU6_int<bits<10> opc, string OpcStr, Intrinsic Int> { + def _u6: _FU6<opc, (outs), (ins i32imm:$a), !strconcat(OpcStr, " $a"), + [(Int immU6:$a)]>; + def _lu6: _FLU6<opc, (outs), (ins i32imm:$a), !strconcat(OpcStr, " $a"), + [(Int immU16:$a)]>; } -// U10 -multiclass FU10_LU10_np<string OpcStr> { - def _u10: _FU10< - (outs), (ins i32imm:$b), - !strconcat(OpcStr, " $b"), - []>; - def _lu10: _FLU10< - (outs), (ins i32imm:$b), - !strconcat(OpcStr, " $b"), - []>; +multiclass FU6_LU6_np<bits<10> opc, string OpcStr> { + def _u6: _FU6<opc, (outs), (ins i32imm:$a), !strconcat(OpcStr, " $a"), []>; + def _lu6: _FLU6<opc, (outs), (ins i32imm:$a), !strconcat(OpcStr, " $a"), []>; } // Two operand short @@ -390,368 +355,351 @@ let usesCustomInserter = 1 in { //===----------------------------------------------------------------------===// // Three operand short -defm ADD : F3R_2RUS<"add", add>; -defm SUB : F3R_2RUS<"sub", sub>; +defm ADD : F3R_2RUS<0b00010, 0b10010, "add", add>; +defm SUB : F3R_2RUS<0b00011, 0b10011, "sub", sub>; let neverHasSideEffects = 1 in { -defm EQ : F3R_2RUS_np<"eq">; -def LSS_3r : F3R_np<"lss">; -def LSU_3r : F3R_np<"lsu">; +defm EQ : F3R_2RUS_np<0b00110, 0b10110, "eq">; +def LSS_3r : F3R_np<0b11000, "lss">; +def LSU_3r : F3R_np<0b11001, "lsu">; } -def AND_3r : F3R<"and", and>; -def OR_3r : F3R<"or", or>; +def AND_3r : F3R<0b00111, "and", and>; +def OR_3r : F3R<0b01000, "or", or>; let mayLoad=1 in { -def LDW_3r : _F3R<(outs GRRegs:$dst), (ins GRRegs:$addr, GRRegs:$offset), - "ldw $dst, $addr[$offset]", - []>; +def LDW_3r : _F3R<0b01001, (outs GRRegs:$dst), + (ins GRRegs:$addr, GRRegs:$offset), + "ldw $dst, $addr[$offset]", []>; -def LDW_2rus : _F2RUS<(outs GRRegs:$dst), (ins GRRegs:$addr, i32imm:$offset), - "ldw $dst, $addr[$offset]", - []>; +def LDW_2rus : _F2RUS<0b00001, (outs GRRegs:$dst), + (ins GRRegs:$addr, i32imm:$offset), + "ldw $dst, $addr[$offset]", []>; -def LD16S_3r : _F3R<(outs GRRegs:$dst), (ins GRRegs:$addr, GRRegs:$offset), - "ld16s $dst, $addr[$offset]", - []>; +def LD16S_3r : _F3R<0b10000, (outs GRRegs:$dst), + (ins GRRegs:$addr, GRRegs:$offset), + "ld16s $dst, $addr[$offset]", []>; -def LD8U_3r : _F3R<(outs GRRegs:$dst), (ins GRRegs:$addr, GRRegs:$offset), - "ld8u $dst, $addr[$offset]", - []>; +def LD8U_3r : _F3R<0b10001, (outs GRRegs:$dst), + (ins GRRegs:$addr, GRRegs:$offset), + "ld8u $dst, $addr[$offset]", []>; } let mayStore=1 in { -def STW_3r : _F3R<(outs), (ins GRRegs:$val, GRRegs:$addr, GRRegs:$offset), - "stw $val, $addr[$offset]", - []>; +def STW_l3r : _FL3R<0b000001100, (outs), + (ins GRRegs:$val, GRRegs:$addr, GRRegs:$offset), + "stw $val, $addr[$offset]", []>; -def STW_2rus : _F2RUS<(outs), (ins GRRegs:$val, GRRegs:$addr, i32imm:$offset), - "stw $val, $addr[$offset]", - []>; +def STW_2rus : _F2RUS<0b0000, (outs), + (ins GRRegs:$val, GRRegs:$addr, i32imm:$offset), + "stw $val, $addr[$offset]", []>; } -defm SHL : F3R_2RBITP<"shl", shl>; -defm SHR : F3R_2RBITP<"shr", srl>; -// TODO tsetr +defm SHL : F3R_2RBITP<0b00100, 0b10100, "shl", shl>; +defm SHR : F3R_2RBITP<0b00101, 0b10101, "shr", srl>; + +// The first operand is treated as an immediate since it refers to a register +// number in another thread. 
+def TSETR_3r : _F3RImm<0b10111, (outs), (ins i32imm:$a, GRRegs:$b, GRRegs:$c), + "set t[$c]:r$a, $b", []>; // Three operand long -def LDAWF_l3r : _FL3R<(outs GRRegs:$dst), (ins GRRegs:$addr, GRRegs:$offset), - "ldaw $dst, $addr[$offset]", - [(set GRRegs:$dst, (ldawf GRRegs:$addr, GRRegs:$offset))]>; +def LDAWF_l3r : _FL3R<0b000111100, (outs GRRegs:$dst), + (ins GRRegs:$addr, GRRegs:$offset), + "ldaw $dst, $addr[$offset]", + [(set GRRegs:$dst, + (ldawf GRRegs:$addr, GRRegs:$offset))]>; let neverHasSideEffects = 1 in -def LDAWF_l2rus : _FL2RUS<(outs GRRegs:$dst), - (ins GRRegs:$addr, i32imm:$offset), - "ldaw $dst, $addr[$offset]", - []>; +def LDAWF_l2rus : _FL2RUS<0b100111100, (outs GRRegs:$dst), + (ins GRRegs:$addr, i32imm:$offset), + "ldaw $dst, $addr[$offset]", []>; -def LDAWB_l3r : _FL3R<(outs GRRegs:$dst), (ins GRRegs:$addr, GRRegs:$offset), - "ldaw $dst, $addr[-$offset]", - [(set GRRegs:$dst, (ldawb GRRegs:$addr, GRRegs:$offset))]>; +def LDAWB_l3r : _FL3R<0b001001100, (outs GRRegs:$dst), + (ins GRRegs:$addr, GRRegs:$offset), + "ldaw $dst, $addr[-$offset]", + [(set GRRegs:$dst, + (ldawb GRRegs:$addr, GRRegs:$offset))]>; let neverHasSideEffects = 1 in -def LDAWB_l2rus : _FL2RUS<(outs GRRegs:$dst), - (ins GRRegs:$addr, i32imm:$offset), - "ldaw $dst, $addr[-$offset]", - []>; - -def LDA16F_l3r : _FL3R<(outs GRRegs:$dst), (ins GRRegs:$addr, GRRegs:$offset), - "lda16 $dst, $addr[$offset]", - [(set GRRegs:$dst, (lda16f GRRegs:$addr, GRRegs:$offset))]>; - -def LDA16B_l3r : _FL3R<(outs GRRegs:$dst), (ins GRRegs:$addr, GRRegs:$offset), - "lda16 $dst, $addr[-$offset]", - [(set GRRegs:$dst, (lda16b GRRegs:$addr, GRRegs:$offset))]>; - -def MUL_l3r : FL3R<"mul", mul>; +def LDAWB_l2rus : _FL2RUS<0b101001100, (outs GRRegs:$dst), + (ins GRRegs:$addr, i32imm:$offset), + "ldaw $dst, $addr[-$offset]", []>; + +def LDA16F_l3r : _FL3R<0b001011100, (outs GRRegs:$dst), + (ins GRRegs:$addr, GRRegs:$offset), + "lda16 $dst, $addr[$offset]", + [(set GRRegs:$dst, + (lda16f GRRegs:$addr, GRRegs:$offset))]>; + +def LDA16B_l3r : _FL3R<0b001101100, (outs GRRegs:$dst), + (ins GRRegs:$addr, GRRegs:$offset), + "lda16 $dst, $addr[-$offset]", + [(set GRRegs:$dst, + (lda16b GRRegs:$addr, GRRegs:$offset))]>; + +def MUL_l3r : FL3R<0b001111100, "mul", mul>; // Instructions which may trap are marked as side effecting. 
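On the comment above: divs/divu/rems/remu can trap at run time (for example on a zero divisor), so marking them hasSideEffects = 1 stops the optimizer from speculating one above its guard or deleting an apparently dead division. The guarded pattern being protected, in plain C++:

#include <cstdio>

// The div must stay below the guard: hoisting it would execute a
// division that may trap when B == 0.
static int safeDiv(int A, int B) {
  if (B == 0)
    return 0;
  return A / B;
}

int main() {
  std::printf("%d %d\n", safeDiv(10, 3), safeDiv(10, 0)); // prints 3 0
  return 0;
}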
let hasSideEffects = 1 in { -def DIVS_l3r : FL3R<"divs", sdiv>; -def DIVU_l3r : FL3R<"divu", udiv>; -def REMS_l3r : FL3R<"rems", srem>; -def REMU_l3r : FL3R<"remu", urem>; +def DIVS_l3r : FL3R<0b010001100, "divs", sdiv>; +def DIVU_l3r : FL3R<0b010011100, "divu", udiv>; +def REMS_l3r : FL3R<0b110001100, "rems", srem>; +def REMU_l3r : FL3R<0b110011100, "remu", urem>; } -def XOR_l3r : FL3R<"xor", xor>; -defm ASHR : FL3R_L2RBITP<"ashr", sra>; +def XOR_l3r : FL3R<0b000011100, "xor", xor>; +defm ASHR : FL3R_L2RBITP<0b000101100, 0b100101100, "ashr", sra>; let Constraints = "$src1 = $dst" in -def CRC_l3r : _FL3R<(outs GRRegs:$dst), - (ins GRRegs:$src1, GRRegs:$src2, GRRegs:$src3), - "crc32 $dst, $src2, $src3", - [(set GRRegs:$dst, - (int_xcore_crc32 GRRegs:$src1, GRRegs:$src2, - GRRegs:$src3))]>; +def CRC_l3r : _FL3RSrcDst<0b101011100, (outs GRRegs:$dst), + (ins GRRegs:$src1, GRRegs:$src2, GRRegs:$src3), + "crc32 $dst, $src2, $src3", + [(set GRRegs:$dst, + (int_xcore_crc32 GRRegs:$src1, GRRegs:$src2, + GRRegs:$src3))]>; -// TODO inpw, outpw let mayStore=1 in { -def ST16_l3r : _FL3R<(outs), (ins GRRegs:$val, GRRegs:$addr, GRRegs:$offset), - "st16 $val, $addr[$offset]", - []>; +def ST16_l3r : _FL3R<0b100001100, (outs), + (ins GRRegs:$val, GRRegs:$addr, GRRegs:$offset), + "st16 $val, $addr[$offset]", []>; -def ST8_l3r : _FL3R<(outs), (ins GRRegs:$val, GRRegs:$addr, GRRegs:$offset), - "st8 $val, $addr[$offset]", - []>; +def ST8_l3r : _FL3R<0b100011100, (outs), + (ins GRRegs:$val, GRRegs:$addr, GRRegs:$offset), + "st8 $val, $addr[$offset]", []>; } -// Four operand long -let Constraints = "$src1 = $dst1,$src2 = $dst2" in { -def MACCU_l4r : _L4R<(outs GRRegs:$dst1, GRRegs:$dst2), - (ins GRRegs:$src1, GRRegs:$src2, GRRegs:$src3, - GRRegs:$src4), - "maccu $dst1, $dst2, $src3, $src4", - []>; +def INPW_l2rus : _FL2RUSBitp<0b100101110, (outs GRRegs:$a), + (ins GRRegs:$b, i32imm:$c), "inpw $a, res[$b], $c", + []>; -def MACCS_l4r : _L4R<(outs GRRegs:$dst1, GRRegs:$dst2), - (ins GRRegs:$src1, GRRegs:$src2, GRRegs:$src3, - GRRegs:$src4), - "maccs $dst1, $dst2, $src3, $src4", - []>; +def OUTPW_l2rus : _FL2RUSBitp<0b100101101, (outs), + (ins GRRegs:$a, GRRegs:$b, i32imm:$c), + "outpw res[$b], $a, $c", []>; + +// Four operand long +let Constraints = "$e = $a,$f = $b" in { +def MACCU_l4r : _FL4RSrcDstSrcDst< + 0b000001, (outs GRRegs:$a, GRRegs:$b), + (ins GRRegs:$e, GRRegs:$f, GRRegs:$c, GRRegs:$d), "maccu $a, $b, $c, $d", []>; + +def MACCS_l4r : _FL4RSrcDstSrcDst< + 0b000010, (outs GRRegs:$a, GRRegs:$b), + (ins GRRegs:$e, GRRegs:$f, GRRegs:$c, GRRegs:$d), "maccs $a, $b, $c, $d", []>; } -let Constraints = "$src1 = $dst1" in -def CRC8_l4r : _L4R<(outs GRRegs:$dst1, GRRegs:$dst2), - (ins GRRegs:$src1, GRRegs:$src2, GRRegs:$src3), - "crc8 $dst1, $dst2, $src2, $src3", - []>; +let Constraints = "$e = $b" in +def CRC8_l4r : _FL4RSrcDst<0b000000, (outs GRRegs:$a, GRRegs:$b), + (ins GRRegs:$e, GRRegs:$c, GRRegs:$d), + "crc8 $b, $a, $c, $d", []>; // Five operand long -def LADD_l5r : _L5R<(outs GRRegs:$dst1, GRRegs:$dst2), - (ins GRRegs:$src1, GRRegs:$src2, GRRegs:$src3), - "ladd $dst1, $dst2, $src1, $src2, $src3", - []>; +def LADD_l5r : _FL5R<0b000001, (outs GRRegs:$dst1, GRRegs:$dst2), + (ins GRRegs:$src1, GRRegs:$src2, GRRegs:$src3), + "ladd $dst2, $dst1, $src1, $src2, $src3", + []>; -def LSUB_l5r : _L5R<(outs GRRegs:$dst1, GRRegs:$dst2), - (ins GRRegs:$src1, GRRegs:$src2, GRRegs:$src3), - "lsub $dst1, $dst2, $src1, $src2, $src3", - []>; +def LSUB_l5r : _FL5R<0b000010, (outs GRRegs:$dst1, GRRegs:$dst2), + (ins GRRegs:$src1, 
GRRegs:$src2, GRRegs:$src3), + "lsub $dst2, $dst1, $src1, $src2, $src3", []>; -def LDIV_l5r : _L5R<(outs GRRegs:$dst1, GRRegs:$dst2), - (ins GRRegs:$src1, GRRegs:$src2, GRRegs:$src3), - "ldiv $dst1, $dst2, $src1, $src2, $src3", - []>; +def LDIVU_l5r : _FL5R<0b000000, (outs GRRegs:$dst1, GRRegs:$dst2), + (ins GRRegs:$src1, GRRegs:$src2, GRRegs:$src3), + "ldivu $dst1, $dst2, $src3, $src1, $src2", []>; // Six operand long -def LMUL_l6r : _L6R<(outs GRRegs:$dst1, GRRegs:$dst2), - (ins GRRegs:$src1, GRRegs:$src2, GRRegs:$src3, - GRRegs:$src4), - "lmul $dst1, $dst2, $src1, $src2, $src3, $src4", - []>; +def LMUL_l6r : _FL6R< + 0b00000, (outs GRRegs:$dst1, GRRegs:$dst2), + (ins GRRegs:$src1, GRRegs:$src2, GRRegs:$src3, GRRegs:$src4), + "lmul $dst1, $dst2, $src1, $src2, $src3, $src4", []>; // Register - U6 //let Uses = [DP] in ... let neverHasSideEffects = 1, isReMaterializable = 1 in -def LDAWDP_ru6: _FRU6<(outs GRRegs:$dst), (ins MEMii:$a), - "ldaw $dst, dp[$a]", - []>; +def LDAWDP_ru6: _FRU6<0b011000, (outs GRRegs:$a), (ins MEMii:$b), + "ldaw $a, dp[$b]", []>; let isReMaterializable = 1 in -def LDAWDP_lru6: _FLRU6< - (outs GRRegs:$dst), (ins MEMii:$a), - "ldaw $dst, dp[$a]", - [(set GRRegs:$dst, ADDRdpii:$a)]>; +def LDAWDP_lru6: _FLRU6<0b011000, (outs GRRegs:$a), (ins MEMii:$b), + "ldaw $a, dp[$b]", + [(set GRRegs:$a, ADDRdpii:$b)]>; let mayLoad=1 in -def LDWDP_ru6: _FRU6<(outs GRRegs:$dst), (ins MEMii:$a), - "ldw $dst, dp[$a]", - []>; - -def LDWDP_lru6: _FLRU6< - (outs GRRegs:$dst), (ins MEMii:$a), - "ldw $dst, dp[$a]", - [(set GRRegs:$dst, (load ADDRdpii:$a))]>; +def LDWDP_ru6: _FRU6<0b010110, (outs GRRegs:$a), (ins MEMii:$b), + "ldw $a, dp[$b]", []>; + +def LDWDP_lru6: _FLRU6<0b010110, (outs GRRegs:$a), (ins MEMii:$b), + "ldw $a, dp[$b]", + [(set GRRegs:$a, (load ADDRdpii:$b))]>; let mayStore=1 in -def STWDP_ru6 : _FRU6<(outs), (ins GRRegs:$val, MEMii:$addr), - "stw $val, dp[$addr]", - []>; +def STWDP_ru6 : _FRU6<0b010100, (outs), (ins GRRegs:$a, MEMii:$b), + "stw $a, dp[$b]", []>; -def STWDP_lru6 : _FLRU6<(outs), (ins GRRegs:$val, MEMii:$addr), - "stw $val, dp[$addr]", - [(store GRRegs:$val, ADDRdpii:$addr)]>; +def STWDP_lru6 : _FLRU6<0b010100, (outs), (ins GRRegs:$a, MEMii:$b), + "stw $a, dp[$b]", + [(store GRRegs:$a, ADDRdpii:$b)]>; //let Uses = [CP] in .. 
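// Both the dp forms above and the cp forms that follow address off an
// implicit pool pointer rather than a register operand. A hedged sketch of
// the addressing this implies, assuming the immediate is a word offset as it
// is for the sp forms (the removed eliminateCallFramePseudoInstr later in
// this patch divides byte amounts by 4 before encoding). Name hypothetical.
static unsigned dpWordAddress(unsigned DP, unsigned WordOffset) {
  return DP + 4 * WordOffset; // ldw/stw $a, dp[$b] with a word-scaled $b
}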
let mayLoad = 1, isReMaterializable = 1, neverHasSideEffects = 1 in -defm LDWCP : FRU6_LRU6_cp<"ldw">; +defm LDWCP : FRU6_LRU6_cp<0b011011, "ldw">; let Uses = [SP] in { let mayStore=1 in { -def STWSP_ru6 : _FRU6< - (outs), (ins GRRegs:$val, i32imm:$index), - "stw $val, sp[$index]", - [(XCoreStwsp GRRegs:$val, immU6:$index)]>; - -def STWSP_lru6 : _FLRU6< - (outs), (ins GRRegs:$val, i32imm:$index), - "stw $val, sp[$index]", - [(XCoreStwsp GRRegs:$val, immU16:$index)]>; +def STWSP_ru6 : _FRU6<0b010101, (outs), (ins GRRegs:$a, i32imm:$b), + "stw $a, sp[$b]", + [(XCoreStwsp GRRegs:$a, immU6:$b)]>; + +def STWSP_lru6 : _FLRU6<0b010101, (outs), (ins GRRegs:$a, i32imm:$b), + "stw $a, sp[$b]", + [(XCoreStwsp GRRegs:$a, immU16:$b)]>; } let mayLoad=1 in { -def LDWSP_ru6 : _FRU6< - (outs GRRegs:$dst), (ins i32imm:$b), - "ldw $dst, sp[$b]", - []>; +def LDWSP_ru6 : _FRU6<0b010111, (outs GRRegs:$a), (ins i32imm:$b), + "ldw $a, sp[$b]", []>; -def LDWSP_lru6 : _FLRU6< - (outs GRRegs:$dst), (ins i32imm:$b), - "ldw $dst, sp[$b]", - []>; +def LDWSP_lru6 : _FLRU6<0b010111, (outs GRRegs:$a), (ins i32imm:$b), + "ldw $a, sp[$b]", []>; } let neverHasSideEffects = 1 in { -def LDAWSP_ru6 : _FRU6< - (outs GRRegs:$dst), (ins i32imm:$b), - "ldaw $dst, sp[$b]", - []>; +def LDAWSP_ru6 : _FRU6<0b011001, (outs GRRegs:$a), (ins i32imm:$b), + "ldaw $a, sp[$b]", []>; -def LDAWSP_lru6 : _FLRU6< - (outs GRRegs:$dst), (ins i32imm:$b), - "ldaw $dst, sp[$b]", - []>; +def LDAWSP_lru6 : _FLRU6<0b011001, (outs GRRegs:$a), (ins i32imm:$b), + "ldaw $a, sp[$b]", []>; -def LDAWSP_ru6_RRegs : _FRU6< - (outs RRegs:$dst), (ins i32imm:$b), - "ldaw $dst, sp[$b]", - []>; +let isCodeGenOnly = 1 in +def LDAWSP_ru6_RRegs : _FRU6<0b011001, (outs RRegs:$a), (ins i32imm:$b), + "ldaw $a, sp[$b]", []>; -def LDAWSP_lru6_RRegs : _FLRU6< - (outs RRegs:$dst), (ins i32imm:$b), - "ldaw $dst, sp[$b]", - []>; +let isCodeGenOnly = 1 in +def LDAWSP_lru6_RRegs : _FLRU6<0b011001, (outs RRegs:$a), (ins i32imm:$b), + "ldaw $a, sp[$b]", []>; } } let isReMaterializable = 1 in { -def LDC_ru6 : _FRU6< - (outs GRRegs:$dst), (ins i32imm:$b), - "ldc $dst, $b", - [(set GRRegs:$dst, immU6:$b)]>; - -def LDC_lru6 : _FLRU6< - (outs GRRegs:$dst), (ins i32imm:$b), - "ldc $dst, $b", - [(set GRRegs:$dst, immU16:$b)]>; +def LDC_ru6 : _FRU6<0b011010, (outs GRRegs:$a), (ins i32imm:$b), + "ldc $a, $b", [(set GRRegs:$a, immU6:$b)]>; + +def LDC_lru6 : _FLRU6<0b011010, (outs GRRegs:$a), (ins i32imm:$b), + "ldc $a, $b", [(set GRRegs:$a, immU16:$b)]>; } -def SETC_ru6 : _FRU6<(outs), (ins GRRegs:$r, i32imm:$val), - "setc res[$r], $val", - [(int_xcore_setc GRRegs:$r, immU6:$val)]>; +def SETC_ru6 : _FRU6<0b111010, (outs), (ins GRRegs:$a, i32imm:$b), + "setc res[$a], $b", + [(int_xcore_setc GRRegs:$a, immU6:$b)]>; -def SETC_lru6 : _FLRU6<(outs), (ins GRRegs:$r, i32imm:$val), - "setc res[$r], $val", - [(int_xcore_setc GRRegs:$r, immU16:$val)]>; +def SETC_lru6 : _FLRU6<0b111010, (outs), (ins GRRegs:$a, i32imm:$b), + "setc res[$a], $b", + [(int_xcore_setc GRRegs:$a, immU16:$b)]>; // Operand register - U6 let isBranch = 1, isTerminator = 1 in { -defm BRFT: FRU6_LRU6_branch<"bt">; -defm BRBT: FRU6_LRU6_branch<"bt">; -defm BRFF: FRU6_LRU6_branch<"bf">; -defm BRBF: FRU6_LRU6_branch<"bf">; +defm BRFT: FRU6_LRU6_branch<0b011100, "bt">; +defm BRBT: FRU6_LRU6_backwards_branch<0b011101, "bt">; +defm BRFF: FRU6_LRU6_branch<0b011110, "bf">; +defm BRBF: FRU6_LRU6_backwards_branch<0b011111, "bf">; } // U6 let Defs = [SP], Uses = [SP] in { let neverHasSideEffects = 1 in -defm EXTSP : FU6_LU6_np<"extsp">; +defm 
EXTSP : FU6_LU6_np<0b0111011110, "extsp">; + let mayStore = 1 in -defm ENTSP : FU6_LU6_np<"entsp">; +defm ENTSP : FU6_LU6_np<0b0111011101, "entsp">; let isReturn = 1, isTerminator = 1, mayLoad = 1, isBarrier = 1 in { -defm RETSP : FU6_LU6<"retsp", XCoreRetsp>; +defm RETSP : FU6_LU6<0b0111011111, "retsp", XCoreRetsp>; } } -// TODO extdp, kentsp, krestsp, blat -// getsr, kalli +let neverHasSideEffects = 1 in +defm EXTDP : FU6_LU6_np<0b0111001110, "extdp">; + +let Uses = [R11], isCall=1 in +defm BLAT : FU6_LU6_np<0b0111001101, "blat">; + let isBranch = 1, isTerminator = 1, isBarrier = 1 in { -def BRBU_u6 : _FU6< - (outs), - (ins brtarget:$target), - "bu $target", - []>; +def BRBU_u6 : _FU6<0b0111011100, (outs), (ins brtarget:$a), "bu -$a", []>; -def BRBU_lu6 : _FLU6< - (outs), - (ins brtarget:$target), - "bu $target", - []>; +def BRBU_lu6 : _FLU6<0b0111011100, (outs), (ins brtarget:$a), "bu -$a", []>; -def BRFU_u6 : _FU6< - (outs), - (ins brtarget:$target), - "bu $target", - []>; +def BRFU_u6 : _FU6<0b0111001100, (outs), (ins brtarget:$a), "bu $a", []>; -def BRFU_lu6 : _FLU6< - (outs), - (ins brtarget:$target), - "bu $target", - []>; +def BRFU_lu6 : _FLU6<0b0111001100, (outs), (ins brtarget:$a), "bu $a", []>; } //let Uses = [CP] in ... let Defs = [R11], neverHasSideEffects = 1, isReMaterializable = 1 in -def LDAWCP_u6: _FRU6<(outs), (ins MEMii:$a), - "ldaw r11, cp[$a]", +def LDAWCP_u6: _FU6<0b0111111101, (outs), (ins MEMii:$a), "ldaw r11, cp[$a]", []>; let Defs = [R11], isReMaterializable = 1 in -def LDAWCP_lu6: _FLRU6< - (outs), (ins MEMii:$a), - "ldaw r11, cp[$a]", - [(set R11, ADDRcpii:$a)]>; +def LDAWCP_lu6: _FLU6<0b0111111101, (outs), (ins MEMii:$a), "ldaw r11, cp[$a]", + [(set R11, ADDRcpii:$a)]>; -defm SETSR : FU6_LU6_int<"setsr", int_xcore_setsr>; +let Defs = [R11] in +defm GETSR : FU6_LU6_np<0b0111111100, "getsr r11,">; -defm CLRSR : FU6_LU6_int<"clrsr", int_xcore_clrsr>; +defm SETSR : FU6_LU6_int<0b0111101101, "setsr", int_xcore_setsr>; + +defm CLRSR : FU6_LU6_int<0b0111101100, "clrsr", int_xcore_clrsr>; // setsr may cause a branch if it is used to enable events. clrsr may // branch if it is executed while events are enabled. 
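// Hence the codegen-only *_branch variants defined next: when codegen cannot
// prove events stay disabled, it should pick an opcode modelled as an
// indirect branch so nothing is moved across it. A sketch of that selection
// (assumed, not from the patch; the enum names follow the _u6/_lu6
// multiclass suffixes and come from the generated XCore opcode enum):
static unsigned pickSetSROpcode(bool MayEnableEvents, bool FitsU6) {
  if (MayEnableEvents)
    return FitsU6 ? XCore::SETSR_branch_u6 : XCore::SETSR_branch_lu6;
  return FitsU6 ? XCore::SETSR_u6 : XCore::SETSR_lu6;
}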
-let isBranch=1, isIndirectBranch=1, isTerminator=1, isBarrier = 1 in { -defm SETSR_branch : FU6_LU6_np<"setsr">; -defm CLRSR_branch : FU6_LU6_np<"clrsr">; +let isBranch=1, isIndirectBranch=1, isTerminator=1, isBarrier = 1, + isCodeGenOnly = 1 in { +defm SETSR_branch : FU6_LU6_np<0b0111101101, "setsr">; +defm CLRSR_branch : FU6_LU6_np<0b0111101100, "clrsr">; } +defm KCALL : FU6_LU6_np<0b0111001111, "kcall">; + +let Uses = [SP], Defs = [SP], mayStore = 1 in +defm KENTSP : FU6_LU6_np<0b0111101110, "kentsp">; + +let Uses = [SP], Defs = [SP], mayLoad = 1 in +defm KRESTSP : FU6_LU6_np<0b0111101111, "krestsp">; + // U10 -// TODO ldwcpl, blacp let Defs = [R11], isReMaterializable = 1, neverHasSideEffects = 1 in -def LDAP_u10 : _FU10< - (outs), - (ins i32imm:$addr), - "ldap r11, $addr", - []>; +def LDAPF_u10 : _FU10<0b110110, (outs), (ins i32imm:$a), "ldap r11, $a", []>; let Defs = [R11], isReMaterializable = 1 in -def LDAP_lu10 : _FLU10< - (outs), - (ins i32imm:$addr), - "ldap r11, $addr", - [(set R11, (pcrelwrapper tglobaladdr:$addr))]>; +def LDAPF_lu10 : _FLU10<0b110110, (outs), (ins i32imm:$a), "ldap r11, $a", + [(set R11, (pcrelwrapper tglobaladdr:$a))]>; -let Defs = [R11], isReMaterializable = 1 in -def LDAP_lu10_ba : _FLU10<(outs), - (ins i32imm:$addr), - "ldap r11, $addr", - [(set R11, (pcrelwrapper tblockaddress:$addr))]>; +let Defs = [R11], isReMaterializable = 1, isCodeGenOnly = 1 in +def LDAPF_lu10_ba : _FLU10<0b110110, (outs), (ins i32imm:$a), "ldap r11, $a", + [(set R11, (pcrelwrapper tblockaddress:$a))]>; let isCall=1, // All calls clobber the link register and the non-callee-saved registers: Defs = [R0, R1, R2, R3, R11, LR], Uses = [SP] in { -def BL_u10 : _FU10< - (outs), (ins calltarget:$target), - "bl $target", - [(XCoreBranchLink immU10:$target)]>; - -def BL_lu10 : _FLU10< - (outs), (ins calltarget:$target), - "bl $target", - [(XCoreBranchLink immU20:$target)]>; +def BLACP_u10 : _FU10<0b111000, (outs), (ins i32imm:$a), "bla cp[$a]", []>; + +def BLACP_lu10 : _FLU10<0b111000, (outs), (ins i32imm:$a), "bla cp[$a]", []>; + +def BLRF_u10 : _FU10<0b110100, (outs), (ins calltarget:$a), "bl $a", + [(XCoreBranchLink immU10:$a)]>; + +def BLRF_lu10 : _FLU10<0b110100, (outs), (ins calltarget:$a), "bl $a", + [(XCoreBranchLink immU20:$a)]>; +} + +let Defs = [R11], mayLoad = 1, isReMaterializable = 1, + neverHasSideEffects = 1 in { +def LDWCP_u10 : _FU10<0b111001, (outs), (ins i32imm:$a), "ldw r11, cp[$a]", []>; + +def LDWCP_lu10 : _FLU10<0b111001, (outs), (ins i32imm:$a), "ldw r11, cp[$a]", + []>; } // Two operand short -// TODO eet, eef, tsetmr def NOT : _F2R<0b100010, (outs GRRegs:$dst), (ins GRRegs:$b), "not $dst, $b", [(set GRRegs:$dst, (not GRRegs:$b))]>; @@ -867,9 +815,9 @@ def SETD_2r : _FR2R<0b000101, (outs), (ins GRRegs:$r, GRRegs:$val), "setd res[$r], $val", [(int_xcore_setd GRRegs:$r, GRRegs:$val)]>; -def SETPSC_l2r : _FR2R<0b110000, (outs), (ins GRRegs:$src1, GRRegs:$src2), - "setpsc res[$src1], $src2", - [(int_xcore_setpsc GRRegs:$src1, GRRegs:$src2)]>; +def SETPSC_2r : _FR2R<0b110000, (outs), (ins GRRegs:$src1, GRRegs:$src2), + "setpsc res[$src1], $src2", + [(int_xcore_setpsc GRRegs:$src1, GRRegs:$src2)]>; def GETST_2r : _F2R<0b000001, (outs GRRegs:$dst), (ins GRRegs:$r), "getst $dst, res[$r]", @@ -899,8 +847,16 @@ def ENDIN_2r : _F2R<0b100101, (outs GRRegs:$dst), (ins GRRegs:$src), "endin $dst, res[$src]", [(set GRRegs:$dst, (int_xcore_endin GRRegs:$src))]>; +def EEF_2r : _F2R<0b001011, (outs), (ins GRRegs:$a, GRRegs:$b), + "eef $a, res[$b]", []>; + +def EET_2r : _F2R<0b001001, 
(outs), (ins GRRegs:$a, GRRegs:$b), + "eet $a, res[$b]", []>; + +def TSETMR_2r : _F2RImm<0b000111, (outs), (ins i32imm:$a, GRRegs:$b), + "tsetmr r$a, $b", []>; + // Two operand long -// getd, testlcl def BITREV_l2r : _FL2R<0b0000011000, (outs GRRegs:$dst), (ins GRRegs:$src), "bitrev $dst, $src", [(set GRRegs:$dst, (int_xcore_bitrev GRRegs:$src))]>; @@ -913,6 +869,12 @@ def CLZ_l2r : _FL2R<0b000111000, (outs GRRegs:$dst), (ins GRRegs:$src), "clz $dst, $src", [(set GRRegs:$dst, (ctlz GRRegs:$src))]>; +def GETD_l2r : _FL2R<0b0001111001, (outs GRRegs:$dst), (ins GRRegs:$src), + "getd $dst, res[$src]", []>; + +def GETN_l2r : _FL2R<0b0011011001, (outs GRRegs:$dst), (ins GRRegs:$src), + "getn $dst, res[$src]", []>; + def SETC_l2r : _FL2R<0b0010111001, (outs), (ins GRRegs:$r, GRRegs:$val), "setc res[$r], $val", [(int_xcore_setc GRRegs:$r, GRRegs:$val)]>; @@ -937,14 +899,17 @@ def SETCLK_l2r : _FLR2R<0b0000111001, (outs), (ins GRRegs:$src1, GRRegs:$src2), "setclk res[$src1], $src2", [(int_xcore_setclk GRRegs:$src1, GRRegs:$src2)]>; +def SETN_l2r : _FLR2R<0b0011011000, (outs), (ins GRRegs:$src1, GRRegs:$src2), + "setn res[$src1], $src2", []>; + def SETRDY_l2r : _FLR2R<0b0010111000, (outs), (ins GRRegs:$src1, GRRegs:$src2), "setrdy res[$src1], $src2", [(int_xcore_setrdy GRRegs:$src1, GRRegs:$src2)]>; +def TESTLCL_l2r : _FL2R<0b0010011000, (outs GRRegs:$dst), (ins GRRegs:$src), + "testlcl $dst, res[$src]", []>; + // One operand short -// TODO edu, eeu, waitet, waitef, tstart, clrtp -// setdp, setcp, setev, kcall -// dgetreg def MSYNC_1r : _F1R<0b000111, (outs), (ins GRRegs:$a), "msync res[$a]", [(int_xcore_msync GRRegs:$a)]>; @@ -968,9 +933,13 @@ def BR_JT32 : PseudoInstXCore<(outs), (ins InlineJT32:$t, GRRegs:$i), [(XCoreBR_JT32 tjumptable:$t, GRRegs:$i)]>; let Defs=[SP], neverHasSideEffects=1 in -def SETSP_1r : _F1R<0b001011, (outs), (ins GRRegs:$a), - "set sp, $a", - []>; +def SETSP_1r : _F1R<0b001011, (outs), (ins GRRegs:$a), "set sp, $a", []>; + +let neverHasSideEffects=1 in +def SETDP_1r : _F1R<0b001100, (outs), (ins GRRegs:$a), "set dp, $a", []>; + +let neverHasSideEffects=1 in +def SETCP_1r : _F1R<0b001101, (outs), (ins GRRegs:$a), "set cp, $a", []>; let hasCtrlDep = 1 in def ECALLT_1r : _F1R<0b010011, (outs), (ins GRRegs:$a), @@ -1008,17 +977,40 @@ def SETEV_1r : _F1R<0b001111, (outs), (ins GRRegs:$a), [(int_xcore_setev GRRegs:$a, R11)]>; } +def DGETREG_1r : _F1R<0b001110, (outs GRRegs:$a), (ins), "dgetreg $a", []>; + +def EDU_1r : _F1R<0b000000, (outs), (ins GRRegs:$a), "edu res[$a]", []>; + def EEU_1r : _F1R<0b000001, (outs), (ins GRRegs:$a), "eeu res[$a]", [(int_xcore_eeu GRRegs:$a)]>; +def KCALL_1r : _F1R<0b010000, (outs), (ins GRRegs:$a), "kcall $a", []>; + +def WAITEF_1R : _F1R<0b000011, (outs), (ins GRRegs:$a), "waitef $a", []>; + +def WAITET_1R : _F1R<0b000010, (outs), (ins GRRegs:$a), "waitet $a", []>; + +def TSTART_1R : _F1R<0b000110, (outs), (ins GRRegs:$a), "start t[$a]", []>; + +def CLRPT_1R : _F1R<0b100000, (outs), (ins GRRegs:$a), "clrpt res[$a]", []>; + // Zero operand short -// TODO freet, ldspc, stspc, ldssr, stssr, ldsed, stsed, -// stet, getkep, getksp, setkep, getid, kret, dcall, dret, -// dentsp, drestsp def CLRE_0R : _F0R<0b0000001101, (outs), (ins), "clre", [(int_xcore_clre)]>; +def DCALL_0R : _F0R<0b0000011100, (outs), (ins), "dcall", []>; + +let Defs = [SP], Uses = [SP] in +def DENTSP_0R : _F0R<0b0001001100, (outs), (ins), "dentsp", []>; + +let Defs = [SP] in +def DRESTSP_0R : _F0R<0b0001001101, (outs), (ins), "drestsp", []>; + +def DRET_0R : _F0R<0b0000011110, 
(outs), (ins), "dret", []>; + +def FREET_0R : _F0R<0b0000001111, (outs), (ins), "freet", []>; + let Defs = [R11] in { def GETID_0R : _F0R<0b0001001110, (outs), (ins), "get r11, id", @@ -1031,12 +1023,44 @@ def GETED_0R : _F0R<0b0000111110, (outs), (ins), def GETET_0R : _F0R<0b0000111111, (outs), (ins), "get r11, et", [(set R11, (int_xcore_getet))]>; + +def GETKEP_0R : _F0R<0b0001001111, (outs), (ins), + "get r11, kep", []>; + +def GETKSP_0R : _F0R<0b0001011100, (outs), (ins), + "get r11, ksp", []>; } +let Defs = [SP] in +def KRET_0R : _F0R<0b0000011101, (outs), (ins), "kret", []>; + +let Uses = [SP], mayLoad = 1 in { +def LDET_0R : _F0R<0b0001011110, (outs), (ins), "ldw et, sp[4]", []>; + +def LDSED_0R : _F0R<0b0001011101, (outs), (ins), "ldw sed, sp[3]", []>; + +def LDSPC_0R : _F0R<0b0000101100, (outs), (ins), "ldw spc, sp[1]", []>; + +def LDSSR_0R : _F0R<0b0000101110, (outs), (ins), "ldw ssr, sp[2]", []>; +} + +let Uses=[R11] in +def SETKEP_0R : _F0R<0b0000011111, (outs), (ins), "set kep, r11", []>; + def SSYNC_0r : _F0R<0b0000001110, (outs), (ins), "ssync", [(int_xcore_ssync)]>; +let Uses = [SP], mayStore = 1 in { +def STET_0R : _F0R<0b0000111101, (outs), (ins), "stw et, sp[4]", []>; + +def STSED_0R : _F0R<0b0000111100, (outs), (ins), "stw sed, sp[3]", []>; + +def STSPC_0R : _F0R<0b0000101101, (outs), (ins), "stw spc, sp[1]", []>; + +def STSSR_0R : _F0R<0b0000101111, (outs), (ins), "stw ssr, sp[2]", []>; +} + let isBranch=1, isIndirectBranch=1, isTerminator=1, isBarrier = 1, hasSideEffects = 1 in def WAITEU_0R : _F0R<0b0000001100, (outs), (ins), @@ -1047,8 +1071,8 @@ def WAITEU_0R : _F0R<0b0000001100, (outs), (ins), // Non-Instruction Patterns //===----------------------------------------------------------------------===// -def : Pat<(XCoreBranchLink tglobaladdr:$addr), (BL_lu10 tglobaladdr:$addr)>; -def : Pat<(XCoreBranchLink texternalsym:$addr), (BL_lu10 texternalsym:$addr)>; +def : Pat<(XCoreBranchLink tglobaladdr:$addr), (BLRF_lu10 tglobaladdr:$addr)>; +def : Pat<(XCoreBranchLink texternalsym:$addr), (BLRF_lu10 texternalsym:$addr)>; /// sext_inreg def : Pat<(sext_inreg GRRegs:$b, i1), (SEXT_rus GRRegs:$b, 1)>; @@ -1090,7 +1114,7 @@ def : Pat<(truncstorei16 GRRegs:$val, GRRegs:$addr), (ST16_l3r GRRegs:$val, GRRegs:$addr, (LDC_ru6 0))>; def : Pat<(store GRRegs:$val, (ldawf GRRegs:$addr, GRRegs:$offset)), - (STW_3r GRRegs:$val, GRRegs:$addr, GRRegs:$offset)>; + (STW_l3r GRRegs:$val, GRRegs:$addr, GRRegs:$offset)>; def : Pat<(store GRRegs:$val, (add GRRegs:$addr, immUs4:$offset)), (STW_2rus GRRegs:$val, GRRegs:$addr, (div4_xform immUs4:$offset))>; def : Pat<(store GRRegs:$val, GRRegs:$addr), diff --git a/lib/Target/XCore/XCoreRegisterInfo.cpp b/lib/Target/XCore/XCoreRegisterInfo.cpp index e637d9a..49b5634 100644 --- a/lib/Target/XCore/XCoreRegisterInfo.cpp +++ b/lib/Target/XCore/XCoreRegisterInfo.cpp @@ -101,72 +101,14 @@ XCoreRegisterInfo::useFPForScavengingIndex(const MachineFunction &MF) const { return false; } -// This function eliminates ADJCALLSTACKDOWN, -// ADJCALLSTACKUP pseudo instructions -void XCoreRegisterInfo:: -eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, - MachineBasicBlock::iterator I) const { - const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); - - if (!TFI->hasReservedCallFrame(MF)) { - // Turn the adjcallstackdown instruction into 'extsp <amt>' and the - // adjcallstackup instruction into 'ldaw sp, sp[<amt>]' - MachineInstr *Old = I; - uint64_t Amount = Old->getOperand(0).getImm(); - if (Amount != 0) { - // We need to 
keep the stack aligned properly. To do this, we round the - // amount of space needed for the outgoing arguments up to the next - // alignment boundary. - unsigned Align = TFI->getStackAlignment(); - Amount = (Amount+Align-1)/Align*Align; - - assert(Amount%4 == 0); - Amount /= 4; - - bool isU6 = isImmU6(Amount); - if (!isU6 && !isImmU16(Amount)) { - // FIX could emit multiple instructions in this case. -#ifndef NDEBUG - errs() << "eliminateCallFramePseudoInstr size too big: " - << Amount << "\n"; -#endif - llvm_unreachable(0); - } - - MachineInstr *New; - if (Old->getOpcode() == XCore::ADJCALLSTACKDOWN) { - int Opcode = isU6 ? XCore::EXTSP_u6 : XCore::EXTSP_lu6; - New=BuildMI(MF, Old->getDebugLoc(), TII.get(Opcode)) - .addImm(Amount); - } else { - assert(Old->getOpcode() == XCore::ADJCALLSTACKUP); - int Opcode = isU6 ? XCore::LDAWSP_ru6_RRegs : XCore::LDAWSP_lru6_RRegs; - New=BuildMI(MF, Old->getDebugLoc(), TII.get(Opcode), XCore::SP) - .addImm(Amount); - } - - // Replace the pseudo instruction with a new instruction... - MBB.insert(I, New); - } - } - - MBB.erase(I); -} - void XCoreRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, - int SPAdj, RegScavenger *RS) const { + int SPAdj, unsigned FIOperandNum, + RegScavenger *RS) const { assert(SPAdj == 0 && "Unexpected"); MachineInstr &MI = *II; DebugLoc dl = MI.getDebugLoc(); - unsigned i = 0; - - while (!MI.getOperand(i).isFI()) { - ++i; - assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!"); - } - - MachineOperand &FrameOp = MI.getOperand(i); + MachineOperand &FrameOp = MI.getOperand(FIOperandNum); int FrameIndex = FrameOp.getIndex(); MachineFunction &MF = *MI.getParent()->getParent(); @@ -190,14 +132,14 @@ XCoreRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // Special handling of DBG_VALUE instructions. if (MI.isDebugValue()) { - MI.getOperand(i).ChangeToRegister(FrameReg, false /*isDef*/); - MI.getOperand(i+1).ChangeToImmediate(Offset); + MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, false /*isDef*/); + MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset); return; } // fold constant into offset. - Offset += MI.getOperand(i + 1).getImm(); - MI.getOperand(i + 1).ChangeToImmediate(0); + Offset += MI.getOperand(FIOperandNum + 1).getImm(); + MI.getOperand(FIOperandNum + 1).ChangeToImmediate(0); assert(Offset%4 == 0 && "Misaligned stack offset"); @@ -231,7 +173,7 @@ XCoreRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, .addReg(ScratchReg, RegState::Kill); break; case XCore::STWFI: - BuildMI(MBB, II, dl, TII.get(XCore::STW_3r)) + BuildMI(MBB, II, dl, TII.get(XCore::STW_l3r)) .addReg(Reg, getKillRegState(isKill)) .addReg(FrameReg) .addReg(ScratchReg, RegState::Kill); diff --git a/lib/Target/XCore/XCoreRegisterInfo.h b/lib/Target/XCore/XCoreRegisterInfo.h index c4dcb6b..1db3248 100644 --- a/lib/Target/XCore/XCoreRegisterInfo.h +++ b/lib/Target/XCore/XCoreRegisterInfo.h @@ -54,12 +54,9 @@ public: bool useFPForScavengingIndex(const MachineFunction &MF) const; - void eliminateCallFramePseudoInstr(MachineFunction &MF, - MachineBasicBlock &MBB, - MachineBasicBlock::iterator I) const; - void eliminateFrameIndex(MachineBasicBlock::iterator II, - int SPAdj, RegScavenger *RS = NULL) const; + int SPAdj, unsigned FIOperandNum, + RegScavenger *RS = NULL) const; // Debug information queries. 
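// For reference, the new FIOperandNum parameter in the signature above
// replaces the operand scan each target used to repeat (quoted from the
// lines removed earlier in this hunk); callers now pass the frame-index
// operand's position directly:
//
//   unsigned i = 0;
//   while (!MI.getOperand(i).isFI()) {
//     ++i;
//     assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!");
//   }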
unsigned getFrameRegister(const MachineFunction &MF) const; diff --git a/lib/Transforms/CMakeLists.txt b/lib/Transforms/CMakeLists.txt index de1353e..2bb6e90 100644 --- a/lib/Transforms/CMakeLists.txt +++ b/lib/Transforms/CMakeLists.txt @@ -5,3 +5,4 @@ add_subdirectory(Scalar) add_subdirectory(IPO) add_subdirectory(Vectorize) add_subdirectory(Hello) +add_subdirectory(ObjCARC) diff --git a/lib/Transforms/IPO/ArgumentPromotion.cpp b/lib/Transforms/IPO/ArgumentPromotion.cpp index 385544a..e6fa4ed 100644 --- a/lib/Transforms/IPO/ArgumentPromotion.cpp +++ b/lib/Transforms/IPO/ArgumentPromotion.cpp @@ -514,14 +514,13 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F, // Attribute - Keep track of the parameter attributes for the arguments // that we are *not* promoting. For the ones that we do promote, the parameter // attributes are lost - SmallVector<AttributeWithIndex, 8> AttributesVec; + SmallVector<AttributeSet, 8> AttributesVec; const AttributeSet &PAL = F->getAttributes(); // Add any return attributes. - Attribute attrs = PAL.getRetAttributes(); - if (attrs.hasAttributes()) - AttributesVec.push_back(AttributeWithIndex::get(AttributeSet::ReturnIndex, - attrs)); + if (PAL.hasAttributes(AttributeSet::ReturnIndex)) + AttributesVec.push_back(AttributeSet::get(F->getContext(), + PAL.getRetAttributes())); // First, determine the new argument list unsigned ArgIndex = 1; @@ -537,9 +536,12 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F, } else if (!ArgsToPromote.count(I)) { // Unchanged argument Params.push_back(I->getType()); - Attribute attrs = PAL.getParamAttributes(ArgIndex); - if (attrs.hasAttributes()) - AttributesVec.push_back(AttributeWithIndex::get(Params.size(), attrs)); + AttributeSet attrs = PAL.getParamAttributes(ArgIndex); + if (attrs.hasAttributes(ArgIndex)) { + AttrBuilder B(attrs, ArgIndex); + AttributesVec. + push_back(AttributeSet::get(F->getContext(), Params.size(), B)); + } } else if (I->use_empty()) { // Dead argument (which are always marked as promotable) ++NumArgumentsDead; @@ -591,10 +593,9 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F, } // Add any function attributes. - attrs = PAL.getFnAttributes(); - if (attrs.hasAttributes()) - AttributesVec.push_back(AttributeWithIndex::get(AttributeSet::FunctionIndex, - attrs)); + if (PAL.hasAttributes(AttributeSet::FunctionIndex)) + AttributesVec.push_back(AttributeSet::get(FTy->getContext(), + PAL.getFnAttributes())); Type *RetTy = FTy->getReturnType(); @@ -639,10 +640,9 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F, const AttributeSet &CallPAL = CS.getAttributes(); // Add any return attributes. - Attribute attrs = CallPAL.getRetAttributes(); - if (attrs.hasAttributes()) - AttributesVec.push_back(AttributeWithIndex::get(AttributeSet::ReturnIndex, - attrs)); + if (CallPAL.hasAttributes(AttributeSet::ReturnIndex)) + AttributesVec.push_back(AttributeSet::get(F->getContext(), + CallPAL.getRetAttributes())); // Loop over the operands, inserting GEP and loads in the caller as // appropriate. @@ -653,10 +653,11 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F, if (!ArgsToPromote.count(I) && !ByValArgsToTransform.count(I)) { Args.push_back(*AI); // Unmodified argument - Attribute Attrs = CallPAL.getParamAttributes(ArgIndex); - if (Attrs.hasAttributes()) - AttributesVec.push_back(AttributeWithIndex::get(Args.size(), Attrs)); - + if (CallPAL.hasAttributes(ArgIndex)) { + AttrBuilder B(CallPAL, ArgIndex); + AttributesVec. 
+ push_back(AttributeSet::get(F->getContext(), Args.size(), B)); + } } else if (ByValArgsToTransform.count(I)) { // Emit a GEP and load for each element of the struct. Type *AgTy = cast<PointerType>(I->getType())->getElementType(); @@ -715,16 +716,17 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F, // Push any varargs arguments on the list. for (; AI != CS.arg_end(); ++AI, ++ArgIndex) { Args.push_back(*AI); - Attribute Attrs = CallPAL.getParamAttributes(ArgIndex); - if (Attrs.hasAttributes()) - AttributesVec.push_back(AttributeWithIndex::get(Args.size(), Attrs)); + if (CallPAL.hasAttributes(ArgIndex)) { + AttrBuilder B(CallPAL, ArgIndex); + AttributesVec. + push_back(AttributeSet::get(F->getContext(), Args.size(), B)); + } } // Add any function attributes. - attrs = CallPAL.getFnAttributes(); - if (attrs.hasAttributes()) - AttributesVec.push_back(AttributeWithIndex::get(AttributeSet::FunctionIndex, - attrs)); + if (CallPAL.hasAttributes(AttributeSet::FunctionIndex)) + AttributesVec.push_back(AttributeSet::get(Call->getContext(), + CallPAL.getFnAttributes())); Instruction *New; if (InvokeInst *II = dyn_cast<InvokeInst>(Call)) { diff --git a/lib/Transforms/IPO/DeadArgumentElimination.cpp b/lib/Transforms/IPO/DeadArgumentElimination.cpp index ff040e7..49ef1e7 100644 --- a/lib/Transforms/IPO/DeadArgumentElimination.cpp +++ b/lib/Transforms/IPO/DeadArgumentElimination.cpp @@ -272,14 +272,13 @@ bool DAE::DeleteDeadVarargs(Function &Fn) { // Drop any attributes that were on the vararg arguments. AttributeSet PAL = CS.getAttributes(); - if (!PAL.isEmpty() && PAL.getSlot(PAL.getNumSlots() - 1).Index > NumArgs) { - SmallVector<AttributeWithIndex, 8> AttributesVec; - for (unsigned i = 0; PAL.getSlot(i).Index <= NumArgs; ++i) - AttributesVec.push_back(PAL.getSlot(i)); - Attribute FnAttrs = PAL.getFnAttributes(); - if (FnAttrs.hasAttributes()) - AttributesVec.push_back(AttributeWithIndex::get(AttributeSet::FunctionIndex, - FnAttrs)); + if (!PAL.isEmpty() && PAL.getSlotIndex(PAL.getNumSlots() - 1) > NumArgs) { + SmallVector<AttributeSet, 8> AttributesVec; + for (unsigned i = 0; PAL.getSlotIndex(i) <= NumArgs; ++i) + AttributesVec.push_back(PAL.getSlotAttributes(i)); + if (PAL.hasAttributes(AttributeSet::FunctionIndex)) + AttributesVec.push_back(AttributeSet::get(Fn.getContext(), + PAL.getFnAttributes())); PAL = AttributeSet::get(Fn.getContext(), AttributesVec); } @@ -351,7 +350,7 @@ bool DAE::RemoveDeadArgumentsFromCallers(Function &Fn) if (Fn.use_empty()) return false; - llvm::SmallVector<unsigned, 8> UnusedArgs; + SmallVector<unsigned, 8> UnusedArgs; for (Function::arg_iterator I = Fn.arg_begin(), E = Fn.arg_end(); I != E; ++I) { Argument *Arg = I; @@ -697,15 +696,10 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) { std::vector<Type*> Params; // Set up to build a new list of parameter attributes. - SmallVector<AttributeWithIndex, 8> AttributesVec; + SmallVector<AttributeSet, 8> AttributesVec; const AttributeSet &PAL = F->getAttributes(); - // The existing function return attributes. - Attribute RAttrs = PAL.getRetAttributes(); - Attribute FnAttrs = PAL.getFnAttributes(); - // Find out the new return value. - Type *RetTy = FTy->getReturnType(); Type *NRetTy = NULL; unsigned RetCount = NumRetVals(F); @@ -759,22 +753,29 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) { assert(NRetTy && "No new return type found?"); + // The existing function return attributes. 
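// The same mechanical migration recurs through all of these attribute hunks:
// the old per-slot Attribute bundle becomes an indexed AttributeSet query.
// Roughly, sketched from the surrounding diff (Ctx/NewIndex stand in for the
// context and recomputed position at each site):
//
//   Attribute A = PAL.getParamAttributes(i);   // old: opaque bundle
//   if (A.hasAttributes()) ...
//
// becomes
//
//   if (PAL.hasAttributes(i)) {                // new: query by index
//     AttrBuilder B(PAL, i);
//     AttributesVec.push_back(AttributeSet::get(Ctx, NewIndex, B));
//   }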
+ AttributeSet RAttrs = PAL.getRetAttributes(); + // Remove any incompatible attributes, but only if we removed all return // values. Otherwise, ensure that we don't have any conflicting attributes // here. Currently, this should not be possible, but special handling might be // required when new return value attributes are added. if (NRetTy->isVoidTy()) RAttrs = - Attribute::get(NRetTy->getContext(), AttrBuilder(RAttrs). - removeAttributes(Attribute::typeIncompatible(NRetTy))); + AttributeSet::get(NRetTy->getContext(), AttributeSet::ReturnIndex, + AttrBuilder(RAttrs, AttributeSet::ReturnIndex). + removeAttributes(AttributeFuncs:: + typeIncompatible(NRetTy, AttributeSet::ReturnIndex), + AttributeSet::ReturnIndex)); else - assert(!AttrBuilder(RAttrs). - hasAttributes(Attribute::typeIncompatible(NRetTy)) && + assert(!AttrBuilder(RAttrs, AttributeSet::ReturnIndex). + hasAttributes(AttributeFuncs:: + typeIncompatible(NRetTy, AttributeSet::ReturnIndex), + AttributeSet::ReturnIndex) && "Return attributes no longer compatible?"); - if (RAttrs.hasAttributes()) - AttributesVec.push_back(AttributeWithIndex::get(AttributeSet::ReturnIndex, - RAttrs)); + if (RAttrs.hasAttributes(AttributeSet::ReturnIndex)) + AttributesVec.push_back(AttributeSet::get(NRetTy->getContext(), RAttrs)); // Remember which arguments are still alive. SmallVector<bool, 10> ArgAlive(FTy->getNumParams(), false); @@ -791,9 +792,11 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) { // Get the original parameter attributes (skipping the first one, that is // for the return value. - Attribute Attrs = PAL.getParamAttributes(i + 1); - if (Attrs.hasAttributes()) - AttributesVec.push_back(AttributeWithIndex::get(Params.size(), Attrs)); + if (PAL.hasAttributes(i + 1)) { + AttrBuilder B(PAL, i + 1); + AttributesVec. + push_back(AttributeSet::get(F->getContext(), Params.size(), B)); + } } else { ++NumArgumentsEliminated; DEBUG(dbgs() << "DAE - Removing argument " << i << " (" << I->getName() @@ -801,9 +804,9 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) { } } - if (FnAttrs.hasAttributes()) - AttributesVec.push_back(AttributeWithIndex::get(AttributeSet::FunctionIndex, - FnAttrs)); + if (PAL.hasAttributes(AttributeSet::FunctionIndex)) + AttributesVec.push_back(AttributeSet::get(F->getContext(), + PAL.getFnAttributes())); // Reconstruct the AttributesList based on the vector we constructed. AttributeSet NewPAL = AttributeSet::get(F->getContext(), AttributesVec); @@ -836,15 +839,18 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) { const AttributeSet &CallPAL = CS.getAttributes(); // The call return attributes. - Attribute RAttrs = CallPAL.getRetAttributes(); - Attribute FnAttrs = CallPAL.getFnAttributes(); + AttributeSet RAttrs = CallPAL.getRetAttributes(); + // Adjust in case the function was changed to return void. RAttrs = - Attribute::get(NF->getContext(), AttrBuilder(RAttrs). - removeAttributes(Attribute::typeIncompatible(NF->getReturnType()))); - if (RAttrs.hasAttributes()) - AttributesVec.push_back(AttributeWithIndex::get(AttributeSet::ReturnIndex, - RAttrs)); + AttributeSet::get(NF->getContext(), AttributeSet::ReturnIndex, + AttrBuilder(RAttrs, AttributeSet::ReturnIndex). 
+ removeAttributes(AttributeFuncs:: + typeIncompatible(NF->getReturnType(), + AttributeSet::ReturnIndex), + AttributeSet::ReturnIndex)); + if (RAttrs.hasAttributes(AttributeSet::ReturnIndex)) + AttributesVec.push_back(AttributeSet::get(NF->getContext(), RAttrs)); // Declare these outside of the loops, so we can reuse them for the second // loop, which loops the varargs. @@ -856,22 +862,26 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) { if (ArgAlive[i]) { Args.push_back(*I); // Get original parameter attributes, but skip return attributes. - Attribute Attrs = CallPAL.getParamAttributes(i + 1); - if (Attrs.hasAttributes()) - AttributesVec.push_back(AttributeWithIndex::get(Args.size(), Attrs)); + if (CallPAL.hasAttributes(i + 1)) { + AttrBuilder B(CallPAL, i + 1); + AttributesVec. + push_back(AttributeSet::get(F->getContext(), Args.size(), B)); + } } // Push any varargs arguments on the list. Don't forget their attributes. for (CallSite::arg_iterator E = CS.arg_end(); I != E; ++I, ++i) { Args.push_back(*I); - Attribute Attrs = CallPAL.getParamAttributes(i + 1); - if (Attrs.hasAttributes()) - AttributesVec.push_back(AttributeWithIndex::get(Args.size(), Attrs)); + if (CallPAL.hasAttributes(i + 1)) { + AttrBuilder B(CallPAL, i + 1); + AttributesVec. + push_back(AttributeSet::get(F->getContext(), Args.size(), B)); + } } - if (FnAttrs.hasAttributes()) - AttributesVec.push_back(AttributeWithIndex::get(AttributeSet::FunctionIndex, - FnAttrs)); + if (CallPAL.hasAttributes(AttributeSet::FunctionIndex)) + AttributesVec.push_back(AttributeSet::get(Call->getContext(), + CallPAL.getFnAttributes())); // Reconstruct the AttributesList based on the vector we constructed. AttributeSet NewCallPAL = AttributeSet::get(F->getContext(), AttributesVec); diff --git a/lib/Transforms/IPO/FunctionAttrs.cpp b/lib/Transforms/IPO/FunctionAttrs.cpp index e9bc4ad..a75212a 100644 --- a/lib/Transforms/IPO/FunctionAttrs.cpp +++ b/lib/Transforms/IPO/FunctionAttrs.cpp @@ -215,14 +215,13 @@ bool FunctionAttrs::AddReadAttrs(const CallGraphSCC &SCC) { AttrBuilder B; B.addAttribute(Attribute::ReadOnly) .addAttribute(Attribute::ReadNone); - F->removeAttribute(AttributeSet::FunctionIndex, - Attribute::get(F->getContext(), B)); + F->removeAttributes(AttributeSet::FunctionIndex, + AttributeSet::get(F->getContext(), + AttributeSet::FunctionIndex, B)); // Add in the new attribute. - B.clear(); - B.addAttribute(ReadsMemory ? Attribute::ReadOnly : Attribute::ReadNone); F->addAttribute(AttributeSet::FunctionIndex, - Attribute::get(F->getContext(), B)); + ReadsMemory ? Attribute::ReadOnly : Attribute::ReadNone); if (ReadsMemory) ++NumReadOnly; @@ -381,7 +380,7 @@ bool FunctionAttrs::AddNoCaptureAttrs(const CallGraphSCC &SCC) { for (Function::arg_iterator A = F->arg_begin(), E = F->arg_end(); A != E; ++A) { if (A->getType()->isPointerTy() && !A->hasNoCaptureAttr()) { - A->addAttr(Attribute::get(F->getContext(), B)); + A->addAttr(AttributeSet::get(F->getContext(), A->getArgNo() + 1, B)); ++NumNoCapture; Changed = true; } @@ -396,7 +395,7 @@ bool FunctionAttrs::AddNoCaptureAttrs(const CallGraphSCC &SCC) { if (!Tracker.Captured) { if (Tracker.Uses.empty()) { // If it's trivially not captured, mark it nocapture now. 
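// Note the recurring "+ 1" in the new calls below: attribute index 0 is
// reserved for the return value (AttributeSet::ReturnIndex), so parameter i
// lives at index i + 1, with AttributeSet::FunctionIndex as a separate
// sentinel. A trivial helper capturing the convention (hypothetical name):
static unsigned paramAttrIndex(unsigned ArgNo) { return ArgNo + 1; }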
- A->addAttr(Attribute::get(F->getContext(), B)); + A->addAttr(AttributeSet::get(F->getContext(), A->getArgNo()+1, B)); ++NumNoCapture; Changed = true; } else { @@ -431,7 +430,9 @@ bool FunctionAttrs::AddNoCaptureAttrs(const CallGraphSCC &SCC) { ArgumentSCC[0]->Uses[0] == ArgumentSCC[0]) { ArgumentSCC[0]-> Definition-> - addAttr(Attribute::get(ArgumentSCC[0]->Definition->getContext(), B)); + addAttr(AttributeSet::get(ArgumentSCC[0]->Definition->getContext(), + ArgumentSCC[0]->Definition->getArgNo() + 1, + B)); ++NumNoCapture; Changed = true; } @@ -473,7 +474,7 @@ bool FunctionAttrs::AddNoCaptureAttrs(const CallGraphSCC &SCC) { for (unsigned i = 0, e = ArgumentSCC.size(); i != e; ++i) { Argument *A = ArgumentSCC[i]->Definition; - A->addAttr(Attribute::get(A->getContext(), B)); + A->addAttr(AttributeSet::get(A->getContext(), A->getArgNo() + 1, B)); ++NumNoCapture; Changed = true; } diff --git a/lib/Transforms/IPO/GlobalOpt.cpp b/lib/Transforms/IPO/GlobalOpt.cpp index abd37c2..2b9d667 100644 --- a/lib/Transforms/IPO/GlobalOpt.cpp +++ b/lib/Transforms/IPO/GlobalOpt.cpp @@ -448,8 +448,8 @@ static bool CleanupPointerRootUsers(GlobalVariable *GV, Dead[i].second->eraseFromParent(); Instruction *I = Dead[i].first; do { - if (isAllocationFn(I, TLI)) - break; + if (isAllocationFn(I, TLI)) + break; Instruction *J = dyn_cast<Instruction>(I->getOperand(0)); if (!J) break; @@ -1825,7 +1825,8 @@ static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal) { GlobalValue::InternalLinkage, ConstantInt::getFalse(GV->getContext()), GV->getName()+".b", - GV->getThreadLocalMode()); + GV->getThreadLocalMode(), + GV->getType()->getAddressSpace()); GV->getParent()->getGlobalList().insert(GV, NewGV); Constant *InitVal = GV->getInitializer(); @@ -1845,10 +1846,10 @@ static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal) { bool StoringOther = SI->getOperand(0) == OtherVal; // Only do this if we weren't storing a loaded value. Value *StoreVal; - if (StoringOther || SI->getOperand(0) == InitVal) + if (StoringOther || SI->getOperand(0) == InitVal) { StoreVal = ConstantInt::get(Type::getInt1Ty(GV->getContext()), StoringOther); - else { + } else { // Otherwise, we are storing a previously loaded copy. To do this, // change the copy from copying the original value to just copying the // bool. @@ -1887,6 +1888,9 @@ static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal) { UI->eraseFromParent(); } + // Retain the name of the old global variable. People who are debugging their + // programs may expect these variables to be named the same. + NewGV->takeName(GV); GV->eraseFromParent(); return true; } @@ -1989,7 +1993,7 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV, return Changed; } else if (GS.StoredType <= GlobalStatus::isInitializerStored) { - DEBUG(dbgs() << "MARKING CONSTANT: " << *GV); + DEBUG(dbgs() << "MARKING CONSTANT: " << *GV << "\n"); GV->setConstant(true); // Clean up any obviously simplifiable users now. @@ -2067,12 +2071,12 @@ static void ChangeCalleesToFastCall(Function *F) { static AttributeSet StripNest(LLVMContext &C, const AttributeSet &Attrs) { for (unsigned i = 0, e = Attrs.getNumSlots(); i != e; ++i) { - if (!Attrs.getSlot(i).Attrs.hasAttribute(Attribute::Nest)) + unsigned Index = Attrs.getSlotIndex(i); + if (!Attrs.getSlotAttributes(i).hasAttribute(Index, Attribute::Nest)) continue; // There can be only one. 
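// ("Only one" because a function signature may carry at most one 'nest'
// parameter, so the first slot that has it is the only one.) A hedged sketch
// of a query with that property, using the slot-iteration idiom the new code
// below adopts:
#include "llvm/IR/Attributes.h"
static bool hasNestParam(const llvm::AttributeSet &Attrs) {
  for (unsigned i = 0, e = Attrs.getNumSlots(); i != e; ++i)
    if (Attrs.getSlotAttributes(i).hasAttribute(Attrs.getSlotIndex(i),
                                                llvm::Attribute::Nest))
      return true; // at most one slot can carry 'nest'
  return false;
}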
- return Attrs.removeAttr(C, Attrs.getSlot(i).Index, - Attribute::get(C, Attribute::Nest)); + return Attrs.removeAttribute(C, Index, Attribute::Nest); } return Attrs; @@ -2584,24 +2588,38 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, while (1) { Constant *InstResult = 0; + DEBUG(dbgs() << "Evaluating Instruction: " << *CurInst << "\n"); + if (StoreInst *SI = dyn_cast<StoreInst>(CurInst)) { - if (!SI->isSimple()) return false; // no volatile/atomic accesses. + if (!SI->isSimple()) { + DEBUG(dbgs() << "Store is not simple! Can not evaluate.\n"); + return false; // no volatile/atomic accesses. + } Constant *Ptr = getVal(SI->getOperand(1)); - if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ptr)) + if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ptr)) { + DEBUG(dbgs() << "Folding constant ptr expression: " << *Ptr); Ptr = ConstantFoldConstantExpression(CE, TD, TLI); - if (!isSimpleEnoughPointerToCommit(Ptr)) + DEBUG(dbgs() << "; To: " << *Ptr << "\n"); + } + if (!isSimpleEnoughPointerToCommit(Ptr)) { // If this is too complex for us to commit, reject it. + DEBUG(dbgs() << "Pointer is too complex for us to evaluate store."); return false; + } Constant *Val = getVal(SI->getOperand(0)); // If this might be too difficult for the backend to handle (e.g. the addr // of one global variable divided by another) then we can't commit it. - if (!isSimpleEnoughValueToCommit(Val, SimpleConstants, TD)) + if (!isSimpleEnoughValueToCommit(Val, SimpleConstants, TD)) { + DEBUG(dbgs() << "Store value is too complex to evaluate store. " << *Val + << "\n"); return false; + } - if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ptr)) + if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ptr)) { if (CE->getOpcode() == Instruction::BitCast) { + DEBUG(dbgs() << "Attempting to resolve bitcast on constant ptr.\n"); // If we're evaluating a store through a bitcast, then we need // to pull the bitcast off the pointer type and push it onto the // stored value. @@ -2630,6 +2648,8 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, // If we can't improve the situation by introspecting NewTy, // we have to give up. } else { + DEBUG(dbgs() << "Failed to bitcast constant ptr, can not " + "evaluate.\n"); return false; } } @@ -2637,25 +2657,36 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, // If we found compatible types, go ahead and push the bitcast // onto the stored value. Val = ConstantExpr::getBitCast(Val, NewTy); + + DEBUG(dbgs() << "Evaluated bitcast: " << *Val << "\n"); } + } MutatedMemory[Ptr] = Val; } else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(CurInst)) { InstResult = ConstantExpr::get(BO->getOpcode(), getVal(BO->getOperand(0)), getVal(BO->getOperand(1))); + DEBUG(dbgs() << "Found a BinaryOperator! Simplifying: " << *InstResult + << "\n"); } else if (CmpInst *CI = dyn_cast<CmpInst>(CurInst)) { InstResult = ConstantExpr::getCompare(CI->getPredicate(), getVal(CI->getOperand(0)), getVal(CI->getOperand(1))); + DEBUG(dbgs() << "Found a CmpInst! Simplifying: " << *InstResult + << "\n"); } else if (CastInst *CI = dyn_cast<CastInst>(CurInst)) { InstResult = ConstantExpr::getCast(CI->getOpcode(), getVal(CI->getOperand(0)), CI->getType()); + DEBUG(dbgs() << "Found a Cast! Simplifying: " << *InstResult + << "\n"); } else if (SelectInst *SI = dyn_cast<SelectInst>(CurInst)) { InstResult = ConstantExpr::getSelect(getVal(SI->getOperand(0)), getVal(SI->getOperand(1)), getVal(SI->getOperand(2))); + DEBUG(dbgs() << "Found a Select! 
Simplifying: " << *InstResult + << "\n"); } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(CurInst)) { Constant *P = getVal(GEP->getOperand(0)); SmallVector<Constant*, 8> GEPOps; @@ -2665,41 +2696,70 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, InstResult = ConstantExpr::getGetElementPtr(P, GEPOps, cast<GEPOperator>(GEP)->isInBounds()); + DEBUG(dbgs() << "Found a GEP! Simplifying: " << *InstResult + << "\n"); } else if (LoadInst *LI = dyn_cast<LoadInst>(CurInst)) { - if (!LI->isSimple()) return false; // no volatile/atomic accesses. + + if (!LI->isSimple()) { + DEBUG(dbgs() << "Found a Load! Not a simple load, can not evaluate.\n"); + return false; // no volatile/atomic accesses. + } + Constant *Ptr = getVal(LI->getOperand(0)); - if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ptr)) + if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ptr)) { Ptr = ConstantFoldConstantExpression(CE, TD, TLI); + DEBUG(dbgs() << "Found a constant pointer expression, constant " + "folding: " << *Ptr << "\n"); + } InstResult = ComputeLoadResult(Ptr); - if (InstResult == 0) return false; // Could not evaluate load. + if (InstResult == 0) { + DEBUG(dbgs() << "Failed to compute load result. Can not evaluate load." + "\n"); + return false; // Could not evaluate load. + } + + DEBUG(dbgs() << "Evaluated load: " << *InstResult << "\n"); } else if (AllocaInst *AI = dyn_cast<AllocaInst>(CurInst)) { - if (AI->isArrayAllocation()) return false; // Cannot handle array allocs. + if (AI->isArrayAllocation()) { + DEBUG(dbgs() << "Found an array alloca. Can not evaluate.\n"); + return false; // Cannot handle array allocs. + } Type *Ty = AI->getType()->getElementType(); AllocaTmps.push_back(new GlobalVariable(Ty, false, GlobalValue::InternalLinkage, UndefValue::get(Ty), AI->getName())); InstResult = AllocaTmps.back(); + DEBUG(dbgs() << "Found an alloca. Result: " << *InstResult << "\n"); } else if (isa<CallInst>(CurInst) || isa<InvokeInst>(CurInst)) { CallSite CS(CurInst); // Debug info can safely be ignored here. if (isa<DbgInfoIntrinsic>(CS.getInstruction())) { + DEBUG(dbgs() << "Ignoring debug info.\n"); ++CurInst; continue; } // Cannot handle inline asm. - if (isa<InlineAsm>(CS.getCalledValue())) return false; + if (isa<InlineAsm>(CS.getCalledValue())) { + DEBUG(dbgs() << "Found inline asm, can not evaluate.\n"); + return false; + } if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CS.getInstruction())) { if (MemSetInst *MSI = dyn_cast<MemSetInst>(II)) { - if (MSI->isVolatile()) return false; + if (MSI->isVolatile()) { + DEBUG(dbgs() << "Can not optimize a volatile memset " << + "intrinsic.\n"); + return false; + } Constant *Ptr = getVal(MSI->getDest()); Constant *Val = getVal(MSI->getValue()); Constant *DestVal = ComputeLoadResult(getVal(Ptr)); if (Val->isNullValue() && DestVal && DestVal->isNullValue()) { // This memset is a no-op. + DEBUG(dbgs() << "Ignoring no-op memset.\n"); ++CurInst; continue; } @@ -2707,6 +2767,7 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, if (II->getIntrinsicID() == Intrinsic::lifetime_start || II->getIntrinsicID() == Intrinsic::lifetime_end) { + DEBUG(dbgs() << "Ignoring lifetime intrinsic.\n"); ++CurInst; continue; } @@ -2714,8 +2775,10 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, if (II->getIntrinsicID() == Intrinsic::invariant_start) { // We don't insert an entry into Values, as it doesn't have a // meaningful return value. - if (!II->use_empty()) + if (!II->use_empty()) { + DEBUG(dbgs() << "Found unused invariant_start. 
Cant evaluate.\n"); return false; + } ConstantInt *Size = cast<ConstantInt>(II->getArgOperand(0)); Value *PtrArg = getVal(II->getArgOperand(1)); Value *Ptr = PtrArg->stripPointerCasts(); @@ -2723,20 +2786,30 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, Type *ElemTy = cast<PointerType>(GV->getType())->getElementType(); if (!Size->isAllOnesValue() && Size->getValue().getLimitedValue() >= - TD->getTypeStoreSize(ElemTy)) + TD->getTypeStoreSize(ElemTy)) { Invariants.insert(GV); + DEBUG(dbgs() << "Found a global var that is an invariant: " << *GV + << "\n"); + } else { + DEBUG(dbgs() << "Found a global var, but can not treat it as an " + "invariant.\n"); + } } // Continue even if we do nothing. ++CurInst; continue; } + + DEBUG(dbgs() << "Unknown intrinsic. Can not evaluate.\n"); return false; } // Resolve function pointers. Function *Callee = dyn_cast<Function>(getVal(CS.getCalledValue())); - if (!Callee || Callee->mayBeOverridden()) + if (!Callee || Callee->mayBeOverridden()) { + DEBUG(dbgs() << "Can not resolve function pointer.\n"); return false; // Cannot resolve. + } SmallVector<Constant*, 8> Formals; for (User::op_iterator i = CS.arg_begin(), e = CS.arg_end(); i != e; ++i) @@ -2746,22 +2819,38 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, // If this is a function we can constant fold, do it. if (Constant *C = ConstantFoldCall(Callee, Formals, TLI)) { InstResult = C; + DEBUG(dbgs() << "Constant folded function call. Result: " << + *InstResult << "\n"); } else { + DEBUG(dbgs() << "Can not constant fold function call.\n"); return false; } } else { - if (Callee->getFunctionType()->isVarArg()) + if (Callee->getFunctionType()->isVarArg()) { + DEBUG(dbgs() << "Can not constant fold vararg function call.\n"); return false; + } - Constant *RetVal; + Constant *RetVal = 0; // Execute the call, if successful, use the return value. ValueStack.push_back(new DenseMap<Value*, Constant*>); - if (!EvaluateFunction(Callee, RetVal, Formals)) + if (!EvaluateFunction(Callee, RetVal, Formals)) { + DEBUG(dbgs() << "Failed to evaluate function.\n"); return false; + } delete ValueStack.pop_back_val(); InstResult = RetVal; + + if (InstResult != NULL) { + DEBUG(dbgs() << "Successfully evaluated function. Result: " << + InstResult << "\n\n"); + } else { + DEBUG(dbgs() << "Successfully evaluated function. Result: 0\n\n"); + } } } else if (isa<TerminatorInst>(CurInst)) { + DEBUG(dbgs() << "Found a terminator instruction.\n"); + if (BranchInst *BI = dyn_cast<BranchInst>(CurInst)) { if (BI->isUnconditional()) { NextBB = BI->getSuccessor(0); @@ -2787,13 +2876,17 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, NextBB = 0; } else { // invoke, unwind, resume, unreachable. + DEBUG(dbgs() << "Can not handle terminator."); return false; // Cannot handle this terminator. } // We succeeded at evaluating this block! + DEBUG(dbgs() << "Successfully evaluated block.\n"); return true; } else { // Did not know how to evaluate this! + DEBUG(dbgs() << "Failed to evaluate block due to unhandled instruction." + "\n"); return false; } @@ -2807,6 +2900,7 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, // If we just processed an invoke, we finished evaluating the block. if (InvokeInst *II = dyn_cast<InvokeInst>(CurInst)) { NextBB = II->getNormalDest(); + DEBUG(dbgs() << "Found an invoke instruction. 
Finished Block.\n\n"); return true; } @@ -2845,6 +2939,8 @@ bool Evaluator::EvaluateFunction(Function *F, Constant *&RetVal, while (1) { BasicBlock *NextBB = 0; // Initialized to avoid compiler warnings. + DEBUG(dbgs() << "Trying to evaluate BB: " << *CurBB << "\n"); + if (!EvaluateBlock(CurInst, NextBB)) return false; @@ -2924,6 +3020,7 @@ bool GlobalOpt::OptimizeGlobalCtorsList(GlobalVariable *&GCL) { } break; } + DEBUG(dbgs() << "Optimizing Global Constructor: " << *F << "\n"); // We cannot simplify external ctor functions. if (F->empty()) continue; diff --git a/lib/Transforms/IPO/InlineAlways.cpp b/lib/Transforms/IPO/InlineAlways.cpp index 2971803..a0095da 100644 --- a/lib/Transforms/IPO/InlineAlways.cpp +++ b/lib/Transforms/IPO/InlineAlways.cpp @@ -30,35 +30,41 @@ using namespace llvm; namespace { - // AlwaysInliner only inlines functions that are mark as "always inline". - class AlwaysInliner : public Inliner { - InlineCostAnalyzer CA; - public: - // Use extremely low threshold. - AlwaysInliner() : Inliner(ID, -2000000000, /*InsertLifetime*/true) { - initializeAlwaysInlinerPass(*PassRegistry::getPassRegistry()); - } - AlwaysInliner(bool InsertLifetime) : Inliner(ID, -2000000000, - InsertLifetime) { - initializeAlwaysInlinerPass(*PassRegistry::getPassRegistry()); - } - static char ID; // Pass identification, replacement for typeid - virtual InlineCost getInlineCost(CallSite CS); - - using llvm::Pass::doInitialization; - using llvm::Pass::doFinalization; - - virtual bool doFinalization(CallGraph &CG) { - return removeDeadFunctions(CG, /*AlwaysInlineOnly=*/true); - } - virtual bool doInitialization(CallGraph &CG); - }; +/// \brief Inliner pass which only handles "always inline" functions. +class AlwaysInliner : public Inliner { + InlineCostAnalysis *ICA; + +public: + // Use extremely low threshold. + AlwaysInliner() : Inliner(ID, -2000000000, /*InsertLifetime*/ true), ICA(0) { + initializeAlwaysInlinerPass(*PassRegistry::getPassRegistry()); + } + + AlwaysInliner(bool InsertLifetime) + : Inliner(ID, -2000000000, InsertLifetime), ICA(0) { + initializeAlwaysInlinerPass(*PassRegistry::getPassRegistry()); + } + + static char ID; // Pass identification, replacement for typeid + + virtual InlineCost getInlineCost(CallSite CS); + + virtual void getAnalysisUsage(AnalysisUsage &AU) const; + virtual bool runOnSCC(CallGraphSCC &SCC); + + using llvm::Pass::doFinalization; + virtual bool doFinalization(CallGraph &CG) { + return removeDeadFunctions(CG, /*AlwaysInlineOnly=*/ true); + } +}; + } char AlwaysInliner::ID = 0; INITIALIZE_PASS_BEGIN(AlwaysInliner, "always-inline", "Inliner for always_inline functions", false, false) INITIALIZE_AG_DEPENDENCY(CallGraph) +INITIALIZE_PASS_DEPENDENCY(InlineCostAnalysis) INITIALIZE_PASS_END(AlwaysInliner, "always-inline", "Inliner for always_inline functions", false, false) @@ -89,15 +95,18 @@ InlineCost AlwaysInliner::getInlineCost(CallSite CS) { if (Callee && !Callee->isDeclaration() && Callee->getAttributes().hasAttribute(AttributeSet::FunctionIndex, Attribute::AlwaysInline) && - CA.isInlineViable(*Callee)) + ICA->isInlineViable(*Callee)) return InlineCost::getAlways(); return InlineCost::getNever(); } -// doInitialization - Initializes the vector of functions that have not -// been annotated with the "always inline" attribute. 
-bool AlwaysInliner::doInitialization(CallGraph &CG) { - CA.setDataLayout(getAnalysisIfAvailable<DataLayout>()); - return false; +bool AlwaysInliner::runOnSCC(CallGraphSCC &SCC) { + ICA = &getAnalysis<InlineCostAnalysis>(); + return Inliner::runOnSCC(SCC); +} + +void AlwaysInliner::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<InlineCostAnalysis>(); + Inliner::getAnalysisUsage(AU); } diff --git a/lib/Transforms/IPO/InlineSimple.cpp b/lib/Transforms/IPO/InlineSimple.cpp index 9682923..a4f7026 100644 --- a/lib/Transforms/IPO/InlineSimple.cpp +++ b/lib/Transforms/IPO/InlineSimple.cpp @@ -28,29 +28,41 @@ using namespace llvm; namespace { - class SimpleInliner : public Inliner { - InlineCostAnalyzer CA; - public: - SimpleInliner() : Inliner(ID) { - initializeSimpleInlinerPass(*PassRegistry::getPassRegistry()); - } - SimpleInliner(int Threshold) : Inliner(ID, Threshold, - /*InsertLifetime*/true) { - initializeSimpleInlinerPass(*PassRegistry::getPassRegistry()); - } - static char ID; // Pass identification, replacement for typeid - InlineCost getInlineCost(CallSite CS) { - return CA.getInlineCost(CS, getInlineThreshold(CS)); - } - using llvm::Pass::doInitialization; - virtual bool doInitialization(CallGraph &CG); - }; -} +/// \brief Actual inliner pass implementation. +/// +/// The common implementation of the inlining logic is shared between this +/// inliner pass and the always inliner pass. The two passes use different cost +/// analyses to determine when to inline. +class SimpleInliner : public Inliner { + InlineCostAnalysis *ICA; + +public: + SimpleInliner() : Inliner(ID), ICA(0) { + initializeSimpleInlinerPass(*PassRegistry::getPassRegistry()); + } + + SimpleInliner(int Threshold) + : Inliner(ID, Threshold, /*InsertLifetime*/ true), ICA(0) { + initializeSimpleInlinerPass(*PassRegistry::getPassRegistry()); + } + + static char ID; // Pass identification, replacement for typeid + + InlineCost getInlineCost(CallSite CS) { + return ICA->getInlineCost(CS, getInlineThreshold(CS)); + } + + virtual bool runOnSCC(CallGraphSCC &SCC); + virtual void getAnalysisUsage(AnalysisUsage &AU) const; +}; + +} // end anonymous namespace char SimpleInliner::ID = 0; INITIALIZE_PASS_BEGIN(SimpleInliner, "inline", "Function Integration/Inlining", false, false) INITIALIZE_AG_DEPENDENCY(CallGraph) +INITIALIZE_PASS_DEPENDENCY(InlineCostAnalysis) INITIALIZE_PASS_END(SimpleInliner, "inline", "Function Integration/Inlining", false, false) @@ -60,10 +72,12 @@ Pass *llvm::createFunctionInliningPass(int Threshold) { return new SimpleInliner(Threshold); } -// doInitialization - Initializes the vector of functions that have been -// annotated with the noinline attribute.
-bool SimpleInliner::doInitialization(CallGraph &CG) { - CA.setDataLayout(getAnalysisIfAvailable<DataLayout>()); - return false; +bool SimpleInliner::runOnSCC(CallGraphSCC &SCC) { + ICA = &getAnalysis<InlineCostAnalysis>(); + return Inliner::runOnSCC(SCC); } +void SimpleInliner::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<InlineCostAnalysis>(); + Inliner::getAnalysisUsage(AU); +} diff --git a/lib/Transforms/IPO/Inliner.cpp b/lib/Transforms/IPO/Inliner.cpp index 2187a2a..663ddb7 100644 --- a/lib/Transforms/IPO/Inliner.cpp +++ b/lib/Transforms/IPO/Inliner.cpp @@ -72,6 +72,40 @@ void Inliner::getAnalysisUsage(AnalysisUsage &AU) const { typedef DenseMap<ArrayType*, std::vector<AllocaInst*> > InlinedArrayAllocasTy; +/// \brief If the inlined function had a higher stack protection level than the +/// calling function, then bump up the caller's stack protection level. +static void AdjustCallerSSPLevel(Function *Caller, Function *Callee) { + // If upgrading the SSP attribute, clear out the old SSP Attributes first. + // Having multiple SSP attributes doesn't actually hurt, but it adds useless + // clutter to the IR. + AttrBuilder B; + B.addAttribute(Attribute::StackProtect) + .addAttribute(Attribute::StackProtectStrong); + AttributeSet OldSSPAttr = AttributeSet::get(Caller->getContext(), + AttributeSet::FunctionIndex, + B); + AttributeSet CallerAttr = Caller->getAttributes(), + CalleeAttr = Callee->getAttributes(); + + if (CalleeAttr.hasAttribute(AttributeSet::FunctionIndex, + Attribute::StackProtectReq)) { + Caller->removeAttributes(AttributeSet::FunctionIndex, OldSSPAttr); + Caller->addFnAttr(Attribute::StackProtectReq); + } else if (CalleeAttr.hasAttribute(AttributeSet::FunctionIndex, + Attribute::StackProtectStrong) && + !CallerAttr.hasAttribute(AttributeSet::FunctionIndex, + Attribute::StackProtectReq)) { + Caller->removeAttributes(AttributeSet::FunctionIndex, OldSSPAttr); + Caller->addFnAttr(Attribute::StackProtectStrong); + } else if (CalleeAttr.hasAttribute(AttributeSet::FunctionIndex, + Attribute::StackProtect) && + !CallerAttr.hasAttribute(AttributeSet::FunctionIndex, + Attribute::StackProtectReq) && + !CallerAttr.hasAttribute(AttributeSet::FunctionIndex, + Attribute::StackProtectStrong)) + Caller->addFnAttr(Attribute::StackProtect); +} + /// InlineCallIfPossible - If it is possible to inline the specified call site, /// do so and update the CallGraph for this operation. /// @@ -91,16 +125,7 @@ static bool InlineCallIfPossible(CallSite CS, InlineFunctionInfo &IFI, if (!InlineFunction(CS, IFI, InsertLifetime)) return false; - // If the inlined function had a higher stack protection level than the - // calling function, then bump up the caller's stack protection level. - if (Callee->getAttributes().hasAttribute(AttributeSet::FunctionIndex, - Attribute::StackProtectReq)) - Caller->addFnAttr(Attribute::StackProtectReq); - else if (Callee->getAttributes().hasAttribute(AttributeSet::FunctionIndex, - Attribute::StackProtect) && - !Caller->getAttributes().hasAttribute(AttributeSet::FunctionIndex, - Attribute::StackProtectReq)) - Caller->addFnAttr(Attribute::StackProtect); + AdjustCallerSSPLevel(Caller, Callee); // Look at all of the allocas that we inlined through this call site. 
If we // have already inlined other allocas through other calls into this function, diff --git a/lib/Transforms/IPO/Internalize.cpp b/lib/Transforms/IPO/Internalize.cpp index 70d55b0..4bfab5b 100644 --- a/lib/Transforms/IPO/Internalize.cpp +++ b/lib/Transforms/IPO/Internalize.cpp @@ -50,6 +50,8 @@ namespace { explicit InternalizePass(); explicit InternalizePass(ArrayRef<const char *> exportList); void LoadFile(const char *Filename); + void ClearExportList(); + void AddToExportList(const std::string &val); virtual bool runOnModule(Module &M); virtual void getAnalysisUsage(AnalysisUsage &AU) const { @@ -97,6 +99,14 @@ void InternalizePass::LoadFile(const char *Filename) { } } +void InternalizePass::ClearExportList() { + ExternalNames.clear(); +} + +void InternalizePass::AddToExportList(const std::string &val) { + ExternalNames.insert(val); +} + bool InternalizePass::runOnModule(Module &M) { CallGraph *CG = getAnalysisIfAvailable<CallGraph>(); CallGraphNode *ExternalNode = CG ? CG->getExternalCallingNode() : 0; diff --git a/lib/Transforms/IPO/LLVMBuild.txt b/lib/Transforms/IPO/LLVMBuild.txt index b18c915..124cbb6 100644 --- a/lib/Transforms/IPO/LLVMBuild.txt +++ b/lib/Transforms/IPO/LLVMBuild.txt @@ -20,4 +20,4 @@ type = Library name = IPO parent = Transforms library_name = ipo -required_libraries = Analysis Core IPA InstCombine Scalar Vectorize Support Target TransformUtils +required_libraries = Analysis Core IPA InstCombine Scalar Vectorize Support Target TransformUtils ObjCARC diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp index 6dc1773..47b2b51 100644 --- a/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -214,6 +214,10 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) { MPM.add(createGVNPass()); // Remove redundancies else MPM.add(createEarlyCSEPass()); // Catch trivial redundancies + + // BBVectorize may have significantly shortened a loop body; unroll again. + if (!DisableUnrollLoops) + MPM.add(createLoopUnrollPass()); } MPM.add(createAggressiveDCEPass()); // Delete dead instructions diff --git a/lib/Transforms/IPO/PruneEH.cpp b/lib/Transforms/IPO/PruneEH.cpp index d872f0c..73d9323 100644 --- a/lib/Transforms/IPO/PruneEH.cpp +++ b/lib/Transforms/IPO/PruneEH.cpp @@ -146,9 +146,11 @@ bool PruneEH::runOnSCC(CallGraphSCC &SCC) { Function *F = (*I)->getFunction(); const AttributeSet &PAL = F->getAttributes(); - const AttributeSet &NPAL = PAL.addAttr(F->getContext(), ~0, - Attribute::get(F->getContext(), - NewAttributes)); + const AttributeSet &NPAL = + PAL.addAttributes(F->getContext(), AttributeSet::FunctionIndex, + AttributeSet::get(F->getContext(), + AttributeSet::FunctionIndex, + NewAttributes)); if (PAL != NPAL) { MadeChange = true; F->setAttributes(NPAL); diff --git a/lib/Transforms/InstCombine/InstCombine.h b/lib/Transforms/InstCombine/InstCombine.h index 959daa2..1f6a3a5e 100644 --- a/lib/Transforms/InstCombine/InstCombine.h +++ b/lib/Transforms/InstCombine/InstCombine.h @@ -27,7 +27,7 @@ namespace llvm { class DbgDeclareInst; class MemIntrinsic; class MemSetInst; - + /// SelectPatternFlavor - We can match a variety of different patterns for /// select operations. enum SelectPatternFlavor { @@ -36,7 +36,7 @@ enum SelectPatternFlavor { SPF_SMAX, SPF_UMAX //SPF_ABS - TODO. }; - + /// getComplexity: Assign a complexity or rank value to LLVM Values... 
/// 0 -> undef, 1 -> Const, 2 -> Other, 3 -> Arg, 3 -> Unary, 4 -> OtherInst static inline unsigned getComplexity(Value *V) { @@ -51,23 +51,23 @@ static inline unsigned getComplexity(Value *V) { return isa<Constant>(V) ? (isa<UndefValue>(V) ? 0 : 1) : 2; } - + /// InstCombineIRInserter - This is an IRBuilder insertion helper that works /// just like the normal insertion helper, but also adds any new instructions /// to the instcombine worklist. -class LLVM_LIBRARY_VISIBILITY InstCombineIRInserter +class LLVM_LIBRARY_VISIBILITY InstCombineIRInserter : public IRBuilderDefaultInserter<true> { InstCombineWorklist &Worklist; public: InstCombineIRInserter(InstCombineWorklist &WL) : Worklist(WL) {} - + void InsertHelper(Instruction *I, const Twine &Name, BasicBlock *BB, BasicBlock::iterator InsertPt) const { IRBuilderDefaultInserter<true>::InsertHelper(I, Name, BB, InsertPt); Worklist.Add(I); } }; - + /// InstCombiner - The -instcombine pass. class LLVM_LIBRARY_VISIBILITY InstCombiner : public FunctionPass, @@ -85,7 +85,7 @@ public: /// instructions into the worklist when they are created. typedef IRBuilder<true, TargetFolder, InstCombineIRInserter> BuilderTy; BuilderTy *Builder; - + static char ID; // Pass identification, replacement for typeid InstCombiner() : FunctionPass(ID), TD(0), Builder(0) { MinimizeSize = false; @@ -94,7 +94,7 @@ public: public: virtual bool runOnFunction(Function &F); - + bool DoOneIteration(Function &F, unsigned ItNum); virtual void getAnalysisUsage(AnalysisUsage &AU) const; @@ -211,11 +211,11 @@ public: private: bool ShouldChangeType(Type *From, Type *To) const; Value *dyn_castNegVal(Value *V) const; - Value *dyn_castFNegVal(Value *V) const; - Type *FindElementAtOffset(Type *Ty, int64_t Offset, + Value *dyn_castFNegVal(Value *V, bool NoSignedZero=false) const; + Type *FindElementAtOffset(Type *Ty, int64_t Offset, SmallVectorImpl<Value*> &NewIndices); Instruction *FoldOpIntoSelect(Instruction &Op, SelectInst *SI); - + /// ShouldOptimizeCast - Return true if the cast from "V to Ty" actually /// results in any code being generated and is interesting to optimize out. If /// the cast can be eliminated by some other simple transformation, we prefer @@ -247,7 +247,7 @@ public: return New; } - // InsertNewInstWith - same as InsertNewInstBefore, but also sets the + // InsertNewInstWith - same as InsertNewInstBefore, but also sets the // debug loc. // Instruction *InsertNewInstWith(Instruction *New, Instruction &Old) { @@ -263,10 +263,10 @@ public: // Instruction *ReplaceInstUsesWith(Instruction &I, Value *V) { Worklist.AddUsersToWorkList(I); // Add all modified instrs to worklist. - + // If we are replacing the instruction with itself, this must be in a // segment of unreachable code, so just clobber the instruction. - if (&I == V) + if (&I == V) V = UndefValue::get(I.getType()); DEBUG(errs() << "IC: Replacing " << I << "\n" @@ -296,13 +296,13 @@ public: MadeIRChange = true; return 0; // Don't do anything with FI } - + void ComputeMaskedBits(Value *V, APInt &KnownZero, APInt &KnownOne, unsigned Depth = 0) const { return llvm::ComputeMaskedBits(V, KnownZero, KnownOne, TD, Depth); } - - bool MaskedValueIsZero(Value *V, const APInt &Mask, + + bool MaskedValueIsZero(Value *V, const APInt &Mask, unsigned Depth = 0) const { return llvm::MaskedValueIsZero(V, Mask, TD, Depth); } @@ -325,10 +325,10 @@ private: /// SimplifyDemandedUseBits - Attempts to replace V with a simpler value /// based on the demanded bits. 
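The getComplexity ranking above exists so commutative operations can be canonicalized with the more complex operand first, letting later matchers assume constants sit on the right. A toy C++ analogue of that canonicalization (Kind and Val are invented stand-ins, not LLVM types):

#include <cassert>
#include <utility>

// Toy mirror of the ranking: 0 undef, 1 constant, 2 other, 3 arg/unary,
// 4 other instruction.
enum class Kind { Undef = 0, Constant = 1, Other = 2, Argument = 3, Inst = 4 };
struct Val { Kind K; };

unsigned complexity(const Val &V) { return static_cast<unsigned>(V.K); }

// Put the higher-complexity value first, as instcombine does for
// commutative operators, so matchers can assume constants are on the right.
void canonicalize(Val &L, Val &R) {
  if (complexity(L) < complexity(R))
    std::swap(L, R);
}

int main() {
  Val L{Kind::Constant}, R{Kind::Inst};
  canonicalize(L, R);
  assert(L.K == Kind::Inst && R.K == Kind::Constant);
}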
- Value *SimplifyDemandedUseBits(Value *V, APInt DemandedMask, + Value *SimplifyDemandedUseBits(Value *V, APInt DemandedMask, APInt& KnownZero, APInt& KnownOne, unsigned Depth); - bool SimplifyDemandedBits(Use &U, APInt DemandedMask, + bool SimplifyDemandedBits(Use &U, APInt DemandedMask, APInt& KnownZero, APInt& KnownOne, unsigned Depth=0); /// Helper routine of SimplifyDemandedUseBits. It tries to simplify demanded @@ -336,15 +336,15 @@ private: Value *SimplifyShrShlDemandedBits(Instruction *Lsr, Instruction *Sftl, APInt DemandedMask, APInt &KnownZero, APInt &KnownOne); - + /// SimplifyDemandedInstructionBits - Inst is an integer instruction that /// SimplifyDemandedBits knows about. See if the instruction has any /// properties that allow us to simplify its operands. bool SimplifyDemandedInstructionBits(Instruction &Inst); - + Value *SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, APInt& UndefElts, unsigned Depth = 0); - + // FoldOpIntoPhi - Given a binary operator, cast instruction, or select // which has a PHI node as operand #0, see if we can fold the instruction // into the PHI (which is only possible if all operands to the PHI are @@ -360,10 +360,10 @@ private: Instruction *FoldPHIArgGEPIntoPHI(PHINode &PN); Instruction *FoldPHIArgLoadIntoPHI(PHINode &PN); - + Instruction *OptAndOp(Instruction *Op, ConstantInt *OpRHS, ConstantInt *AndRHS, BinaryOperator &TheAnd); - + Value *FoldLogicalPlusAnd(Value *LHS, Value *RHS, ConstantInt *Mask, bool isSub, Instruction &I); Value *InsertRangeTest(Value *V, Constant *Lo, Constant *Hi, @@ -382,8 +382,8 @@ private: Value *Descale(Value *Val, APInt Scale, bool &NoSignedWrap); }; - - + + } // end namespace llvm. #endif diff --git a/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/lib/Transforms/InstCombine/InstCombineAddSub.cpp index f07c58d..c6d60d6 100644 --- a/lib/Transforms/InstCombine/InstCombineAddSub.cpp +++ b/lib/Transforms/InstCombine/InstCombineAddSub.cpp @@ -66,10 +66,12 @@ namespace { bool insaneIntVal(int V) { return V > 4 || V < -4; } APFloat *getFpValPtr(void) { return reinterpret_cast<APFloat*>(&FpValBuf.buffer[0]); } + const APFloat *getFpValPtr(void) const + { return reinterpret_cast<const APFloat*>(&FpValBuf.buffer[0]); } const APFloat &getFpVal(void) const { assert(IsFp && BufHasFpVal && "Incorrect state"); - return *reinterpret_cast<const APFloat*>(&FpValBuf.buffer[0]); + return *getFpValPtr(); } APFloat &getFpVal(void) @@ -1248,6 +1250,16 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) { if (SimplifyDemandedInstructionBits(I)) return &I; + + // Fold (sub 0, (zext bool to B)) --> (sext bool to B) + if (C->isZero() && match(Op1, m_ZExt(m_Value(X)))) + if (X->getType()->isIntegerTy(1)) + return CastInst::CreateSExtOrBitCast(X, Op1->getType()); + + // Fold (sub 0, (sext bool to B)) --> (zext bool to B) + if (C->isZero() && match(Op1, m_SExt(m_Value(X)))) + if (X->getType()->isIntegerTy(1)) + return CastInst::CreateZExtOrBitCast(X, Op1->getType()); } diff --git a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index c1e60d4..4332467 100644 --- a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -1245,6 +1245,34 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) { } } + { + Value *X = 0; + bool OpsSwapped = false; + // Canonicalize SExt or Not to the LHS + if (match(Op1, m_SExt(m_Value())) || + match(Op1, m_Not(m_Value()))) { + std::swap(Op0, Op1); + OpsSwapped = true; + } + + // Fold (and
(sext bool to A), B) --> (select bool, B, 0) + if (match(Op0, m_SExt(m_Value(X))) && + X->getType()->getScalarType()->isIntegerTy(1)) { + Value *Zero = Constant::getNullValue(Op1->getType()); + return SelectInst::Create(X, Op1, Zero); + } + + // Fold (and ~(sext bool to A), B) --> (select bool, 0, B) + if (match(Op0, m_Not(m_SExt(m_Value(X)))) && + X->getType()->getScalarType()->isIntegerTy(1)) { + Value *Zero = Constant::getNullValue(Op0->getType()); + return SelectInst::Create(X, Zero, Op1); + } + + if (OpsSwapped) + std::swap(Op0, Op1); + } + return Changed ? &I : 0; } @@ -2043,6 +2071,20 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) { return BinaryOperator::CreateOr(Inner, C1); } + // Change (or (bool?A:B),(bool?C:D)) --> (bool?(or A,C):(or B,D)) + // Since this OR statement hasn't been optimized further yet, we hope + // that this transformation will allow the new ORs to be optimized. + { + Value *X = 0, *Y = 0; + if (Op0->hasOneUse() && Op1->hasOneUse() && + match(Op0, m_Select(m_Value(X), m_Value(A), m_Value(B))) && + match(Op1, m_Select(m_Value(Y), m_Value(C), m_Value(D))) && X == Y) { + Value *orTrue = Builder->CreateOr(A, C); + Value *orFalse = Builder->CreateOr(B, D); + return SelectInst::Create(X, orTrue, orFalse); + } + } + return Changed ? &I : 0; } diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp index d17879b..64cd1bd 100644 --- a/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -1014,8 +1014,11 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) { return false; // Cannot transform this return value. if (!CallerPAL.isEmpty() && !Caller->use_empty()) { - AttrBuilder RAttrs = CallerPAL.getRetAttributes(); - if (RAttrs.hasAttributes(Attribute::typeIncompatible(NewRetTy))) + AttrBuilder RAttrs(CallerPAL, AttributeSet::ReturnIndex); + if (RAttrs. + hasAttributes(AttributeFuncs:: + typeIncompatible(NewRetTy, AttributeSet::ReturnIndex), + AttributeSet::ReturnIndex)) return false; // Attribute not compatible with transformed value. } @@ -1044,14 +1047,16 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) { if (!CastInst::isCastable(ActTy, ParamTy)) return false; // Cannot transform this parameter value. - Attribute Attrs = CallerPAL.getParamAttributes(i + 1); - if (AttrBuilder(Attrs). - hasAttributes(Attribute::typeIncompatible(ParamTy))) + if (AttrBuilder(CallerPAL.getParamAttributes(i + 1), i + 1). + hasAttributes(AttributeFuncs:: + typeIncompatible(ParamTy, i + 1), i + 1)) return false; // Attribute not compatible with transformed value. // If the parameter is passed as a byval argument, then we have to have a // sized type and the sized type has to have the same size as the old type. - if (ParamTy != ActTy && Attrs.hasAttribute(Attribute::ByVal)) { + if (ParamTy != ActTy && + CallerPAL.getParamAttributes(i + 1).hasAttribute(i + 1, + Attribute::ByVal)) { PointerType *ParamPTy = dyn_cast<PointerType>(ParamTy); if (ParamPTy == 0 || !ParamPTy->getElementType()->isSized() || TD == 0) return false; @@ -1100,11 +1105,13 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) { // won't be dropping them. Check that these extra arguments have attributes // that are compatible with being a vararg call argument. 
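The visitSub and visitAnd folds above all lean on the same i1 identities: negating a zext'd bool yields its sext (0 or -1), and a sext'd bool acts as an all-ones/all-zeros mask, which is exactly a select. A self-contained C++ check of those identities:

#include <cassert>
#include <cstdint>

int32_t zext1(bool B) { return B ? 1 : 0; }  // zext i1 -> i32
int32_t sext1(bool B) { return B ? -1 : 0; } // sext i1 -> i32

int main() {
  for (int I = 0; I != 2; ++I) {
    bool B = (I != 0);
    int32_t X = 0x1234ABCD;
    assert(0 - zext1(B) == sext1(B));       // (sub 0, (zext b)) --> (sext b)
    assert(0 - sext1(B) == zext1(B));       // (sub 0, (sext b)) --> (zext b)
    assert((sext1(B) & X) == (B ? X : 0));  // (and (sext b), X) --> select
    assert((~sext1(B) & X) == (B ? 0 : X)); // (and ~(sext b), X) --> select
  }
}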
for (unsigned i = CallerPAL.getNumSlots(); i; --i) { - if (CallerPAL.getSlot(i - 1).Index <= FT->getNumParams()) + unsigned Index = CallerPAL.getSlotIndex(i - 1); + if (Index <= FT->getNumParams()) break; - Attribute PAttrs = CallerPAL.getSlot(i - 1).Attrs; + // Check if it has an attribute that's incompatible with varargs. - if (PAttrs.hasAttribute(Attribute::StructRet)) + AttributeSet PAttrs = CallerPAL.getSlotAttributes(i - 1); + if (PAttrs.hasAttribute(Index, Attribute::StructRet)) return false; } @@ -1113,21 +1120,23 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) { // inserting cast instructions as necessary. std::vector<Value*> Args; Args.reserve(NumActualArgs); - SmallVector<AttributeWithIndex, 8> attrVec; + SmallVector<AttributeSet, 8> attrVec; attrVec.reserve(NumCommonArgs); // Get any return attributes. - AttrBuilder RAttrs = CallerPAL.getRetAttributes(); + AttrBuilder RAttrs(CallerPAL, AttributeSet::ReturnIndex); // If the return value is not being used, the type may not be compatible // with the existing attributes. Wipe out any problematic attributes. - RAttrs.removeAttributes(Attribute::typeIncompatible(NewRetTy)); + RAttrs. + removeAttributes(AttributeFuncs:: + typeIncompatible(NewRetTy, AttributeSet::ReturnIndex), + AttributeSet::ReturnIndex); // Add the new return attributes. if (RAttrs.hasAttributes()) - attrVec.push_back( - AttributeWithIndex::get(AttributeSet::ReturnIndex, - Attribute::get(FT->getContext(), RAttrs))); + attrVec.push_back(AttributeSet::get(Caller->getContext(), + AttributeSet::ReturnIndex, RAttrs)); AI = CS.arg_begin(); for (unsigned i = 0; i != NumCommonArgs; ++i, ++AI) { @@ -1141,9 +1150,10 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) { } // Add any parameter attributes. - Attribute PAttrs = CallerPAL.getParamAttributes(i + 1); + AttrBuilder PAttrs(CallerPAL.getParamAttributes(i + 1), i + 1); if (PAttrs.hasAttributes()) - attrVec.push_back(AttributeWithIndex::get(i + 1, PAttrs)); + attrVec.push_back(AttributeSet::get(Caller->getContext(), i + 1, + PAttrs)); } // If the function takes more arguments than the call was taking, add them @@ -1168,23 +1178,23 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) { } // Add any parameter attributes. - Attribute PAttrs = CallerPAL.getParamAttributes(i + 1); + AttrBuilder PAttrs(CallerPAL.getParamAttributes(i + 1), i + 1); if (PAttrs.hasAttributes()) - attrVec.push_back(AttributeWithIndex::get(i + 1, PAttrs)); + attrVec.push_back(AttributeSet::get(FT->getContext(), i + 1, + PAttrs)); } } } - Attribute FnAttrs = CallerPAL.getFnAttributes(); - if (FnAttrs.hasAttributes()) - attrVec.push_back(AttributeWithIndex::get(AttributeSet::FunctionIndex, - FnAttrs)); + AttributeSet FnAttrs = CallerPAL.getFnAttributes(); + if (CallerPAL.hasAttributes(AttributeSet::FunctionIndex)) + attrVec.push_back(AttributeSet::get(Callee->getContext(), FnAttrs)); if (NewRetTy->isVoidTy()) Caller->setName(""); // Void type should not have a name. const AttributeSet &NewCallerPAL = AttributeSet::get(Callee->getContext(), - attrVec); + attrVec); Instruction *NC; if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) { @@ -1262,12 +1272,12 @@ InstCombiner::transformCallThroughTrampoline(CallSite CS, if (!NestAttrs.isEmpty()) { unsigned NestIdx = 1; Type *NestTy = 0; - Attribute NestAttr; + AttributeSet NestAttr; // Look for a parameter marked with the 'nest' attribute. 
for (FunctionType::param_iterator I = NestFTy->param_begin(), E = NestFTy->param_end(); I != E; ++NestIdx, ++I) - if (NestAttrs.getParamAttributes(NestIdx).hasAttribute(Attribute::Nest)){ + if (NestAttrs.hasAttribute(NestIdx, Attribute::Nest)) { // Record the parameter type and any other attributes. NestTy = *I; NestAttr = NestAttrs.getParamAttributes(NestIdx); @@ -1279,17 +1289,16 @@ InstCombiner::transformCallThroughTrampoline(CallSite CS, std::vector<Value*> NewArgs; NewArgs.reserve(unsigned(CS.arg_end()-CS.arg_begin())+1); - SmallVector<AttributeWithIndex, 8> NewAttrs; + SmallVector<AttributeSet, 8> NewAttrs; NewAttrs.reserve(Attrs.getNumSlots() + 1); // Insert the nest argument into the call argument list, which may // mean appending it. Likewise for attributes. // Add any result attributes. - Attribute Attr = Attrs.getRetAttributes(); - if (Attr.hasAttributes()) - NewAttrs.push_back(AttributeWithIndex::get(AttributeSet::ReturnIndex, - Attr)); + if (Attrs.hasAttributes(AttributeSet::ReturnIndex)) + NewAttrs.push_back(AttributeSet::get(Caller->getContext(), + Attrs.getRetAttributes())); { unsigned Idx = 1; @@ -1301,7 +1310,8 @@ InstCombiner::transformCallThroughTrampoline(CallSite CS, if (NestVal->getType() != NestTy) NestVal = Builder->CreateBitCast(NestVal, NestTy, "nest"); NewArgs.push_back(NestVal); - NewAttrs.push_back(AttributeWithIndex::get(NestIdx, NestAttr)); + NewAttrs.push_back(AttributeSet::get(Caller->getContext(), + NestAttr)); } if (I == E) @@ -1309,20 +1319,21 @@ InstCombiner::transformCallThroughTrampoline(CallSite CS, // Add the original argument and attributes. NewArgs.push_back(*I); - Attr = Attrs.getParamAttributes(Idx); - if (Attr.hasAttributes()) - NewAttrs.push_back - (AttributeWithIndex::get(Idx + (Idx >= NestIdx), Attr)); + AttributeSet Attr = Attrs.getParamAttributes(Idx); + if (Attr.hasAttributes(Idx)) { + AttrBuilder B(Attr, Idx); + NewAttrs.push_back(AttributeSet::get(Caller->getContext(), + Idx + (Idx >= NestIdx), B)); + } ++Idx, ++I; } while (1); } // Add any function attributes. - Attr = Attrs.getFnAttributes(); - if (Attr.hasAttributes()) - NewAttrs.push_back(AttributeWithIndex::get(AttributeSet::FunctionIndex, - Attr)); + if (Attrs.hasAttributes(AttributeSet::FunctionIndex)) + NewAttrs.push_back(AttributeSet::get(FTy->getContext(), + Attrs.getFnAttributes())); // The trampoline may have been bitcast to a bogus type (FTy). // Handle this by synthesizing a new function type, equal to FTy diff --git a/lib/Transforms/InstCombine/InstCombineCasts.cpp b/lib/Transforms/InstCombine/InstCombineCasts.cpp index 5af4442..a960ab2 100644 --- a/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -30,7 +30,7 @@ static Value *DecomposeSimpleLinearExpr(Value *Val, unsigned &Scale, Scale = 0; return ConstantInt::get(Val->getType(), 0); } - + if (BinaryOperator *I = dyn_cast<BinaryOperator>(Val)) { // Cannot look past anything that might overflow. OverflowingBinaryOperator *OBI = dyn_cast<OverflowingBinaryOperator>(Val); @@ -47,19 +47,19 @@ static Value *DecomposeSimpleLinearExpr(Value *Val, unsigned &Scale, Offset = 0; return I->getOperand(0); } - + if (I->getOpcode() == Instruction::Mul) { // This value is scaled by 'RHS'. Scale = RHS->getZExtValue(); Offset = 0; return I->getOperand(0); } - + if (I->getOpcode() == Instruction::Add) { - // We have X+C. Check to see if we really have (X*C2)+C1, + // We have X+C. Check to see if we really have (X*C2)+C1, // where C1 is divisible by C2. 
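DecomposeSimpleLinearExpr recovers a Scale and Offset such that Val = X*Scale + Offset from nested shl/mul/add-by-constant expressions; PromoteCastOfAllocation then checks divisibility against the new element size. A small C++ sketch of the algebra being recovered (the Linear struct and helpers are illustrative, not LLVM API):

#include <cassert>
#include <cstdint>

// Val = X*Scale + Offset; the helpers model the instructions the
// decomposition walks through (shl/mul/add by constants).
struct Linear { uint64_t Scale, Offset; };

Linear mulByConst(Linear L, uint64_t C) { return {L.Scale * C, L.Offset * C}; }
Linear addConst(Linear L, uint64_t C)   { return {L.Scale, L.Offset + C}; }
Linear shlByConst(Linear L, uint64_t C) { return mulByConst(L, uint64_t(1) << C); }

int main() {
  Linear X{1, 0};                            // the bare value X
  Linear E = addConst(shlByConst(X, 3), 16); // (X << 3) + 16
  // Reported as X*8 + 16; since 16 is divisible by 8, a rescaled
  // allocation of 8-byte elements can absorb the whole expression.
  assert(E.Scale == 8 && E.Offset == 16);
}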
unsigned SubScale; - Value *SubVal = + Value *SubVal = DecomposeSimpleLinearExpr(I->getOperand(0), SubScale, Offset); Offset += RHS->getZExtValue(); Scale = SubScale; @@ -82,7 +82,7 @@ Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI, if (!TD) return 0; PointerType *PTy = cast<PointerType>(CI.getType()); - + BuilderTy AllocaBuilder(*Builder); AllocaBuilder.SetInsertPoint(AI.getParent(), &AI); @@ -110,7 +110,7 @@ Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI, uint64_t ArrayOffset; Value *NumElements = // See if the array size is a decomposable linear expr. DecomposeSimpleLinearExpr(AI.getOperand(0), ArraySizeScale, ArrayOffset); - + // If we can now satisfy the modulus, by using a non-1 scale, we really can // do the xform. if ((AllocElTySize*ArraySizeScale) % CastElTySize != 0 || @@ -125,17 +125,17 @@ Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI, // Insert before the alloca, not before the cast. Amt = AllocaBuilder.CreateMul(Amt, NumElements); } - + if (uint64_t Offset = (AllocElTySize*ArrayOffset)/CastElTySize) { Value *Off = ConstantInt::get(AI.getArraySize()->getType(), Offset, true); Amt = AllocaBuilder.CreateAdd(Amt, Off); } - + AllocaInst *New = AllocaBuilder.CreateAlloca(CastElTy, Amt); New->setAlignment(AI.getAlignment()); New->takeName(&AI); - + // If the allocation has multiple real uses, insert a cast and change all // things that used it to use the new cast. This will also hack on CI, but it // will die soon. @@ -148,10 +148,10 @@ Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI, return ReplaceInstUsesWith(CI, New); } -/// EvaluateInDifferentType - Given an expression that +/// EvaluateInDifferentType - Given an expression that /// CanEvaluateTruncated or CanEvaluateSExtd returns true for, actually /// insert the code to evaluate the expression. -Value *InstCombiner::EvaluateInDifferentType(Value *V, Type *Ty, +Value *InstCombiner::EvaluateInDifferentType(Value *V, Type *Ty, bool isSigned) { if (Constant *C = dyn_cast<Constant>(V)) { C = ConstantExpr::getIntegerCast(C, Ty, isSigned /*Sext or ZExt*/); @@ -181,7 +181,7 @@ Value *InstCombiner::EvaluateInDifferentType(Value *V, Type *Ty, Value *RHS = EvaluateInDifferentType(I->getOperand(1), Ty, isSigned); Res = BinaryOperator::Create((Instruction::BinaryOps)Opc, LHS, RHS); break; - } + } case Instruction::Trunc: case Instruction::ZExt: case Instruction::SExt: @@ -190,7 +190,7 @@ Value *InstCombiner::EvaluateInDifferentType(Value *V, Type *Ty, // new. if (I->getOperand(0)->getType() == Ty) return I->getOperand(0); - + // Otherwise, must be the same type of cast, so just reinsert a new one. // This also handles the case of zext(trunc(x)) -> zext(x). Res = CastInst::CreateIntegerCast(I->getOperand(0), Ty, @@ -212,11 +212,11 @@ Value *InstCombiner::EvaluateInDifferentType(Value *V, Type *Ty, Res = NPN; break; } - default: + default: // TODO: Can handle more cases here. llvm_unreachable("Unreachable!"); } - + Res->takeName(I); return InsertNewInstWith(Res, *I); } @@ -224,7 +224,7 @@ Value *InstCombiner::EvaluateInDifferentType(Value *V, Type *Ty, /// This function is a wrapper around CastInst::isEliminableCastPair. It /// simply extracts arguments and returns what that function returns. 
-static Instruction::CastOps +static Instruction::CastOps isEliminableCastPair( const CastInst *CI, ///< The first cast instruction unsigned opcode, ///< The opcode of the second cast instruction @@ -253,7 +253,7 @@ isEliminableCastPair( if ((Res == Instruction::IntToPtr && SrcTy != DstIntPtrTy) || (Res == Instruction::PtrToInt && DstTy != SrcIntPtrTy)) Res = 0; - + return Instruction::CastOps(Res); } @@ -265,18 +265,18 @@ bool InstCombiner::ShouldOptimizeCast(Instruction::CastOps opc, const Value *V, Type *Ty) { // Noop casts and casts of constants should be eliminated trivially. if (V->getType() == Ty || isa<Constant>(V)) return false; - + // If this is another cast that can be eliminated, we prefer to have it // eliminated. if (const CastInst *CI = dyn_cast<CastInst>(V)) if (isEliminableCastPair(CI, opc, Ty, TD)) return false; - + // If this is a vector sext from a compare, then we don't want to break the // idiom where each element of the extended vector is either zero or all ones. if (opc == Instruction::SExt && isa<CmpInst>(V) && Ty->isVectorTy()) return false; - + return true; } @@ -288,7 +288,7 @@ Instruction *InstCombiner::commonCastTransforms(CastInst &CI) { // Many cases of "cast of a cast" are eliminable. If it's eliminable we just // eliminate it now. if (CastInst *CSrc = dyn_cast<CastInst>(Src)) { // A->B->C cast - if (Instruction::CastOps opc = + if (Instruction::CastOps opc = isEliminableCastPair(CSrc, CI.getOpcode(), CI.getType(), TD)) { // The first cast (CSrc) is eliminable so we need to fix up or replace // the second cast (CI). CSrc will then have a good chance of being dead. @@ -311,7 +311,7 @@ Instruction *InstCombiner::commonCastTransforms(CastInst &CI) { if (Instruction *NV = FoldOpIntoPhi(CI)) return NV; } - + return 0; } @@ -330,15 +330,15 @@ static bool CanEvaluateTruncated(Value *V, Type *Ty) { // We can always evaluate constants in another type. if (isa<Constant>(V)) return true; - + Instruction *I = dyn_cast<Instruction>(V); if (!I) return false; - + Type *OrigTy = V->getType(); - + // If this is an extension from the dest type, we can eliminate it, even if it // has multiple uses. - if ((isa<ZExtInst>(I) || isa<SExtInst>(I)) && + if ((isa<ZExtInst>(I) || isa<SExtInst>(I)) && I->getOperand(0)->getType() == Ty) return true; @@ -423,29 +423,29 @@ static bool CanEvaluateTruncated(Value *V, Type *Ty) { // TODO: Can handle more cases here. break; } - + return false; } Instruction *InstCombiner::visitTrunc(TruncInst &CI) { if (Instruction *Result = commonCastTransforms(CI)) return Result; - - // See if we can simplify any instructions used by the input whose sole + + // See if we can simplify any instructions used by the input whose sole // purpose is to compute bits we don't care about. if (SimplifyDemandedInstructionBits(CI)) return &CI; - + Value *Src = CI.getOperand(0); Type *DestTy = CI.getType(), *SrcTy = Src->getType(); - + // Attempt to truncate the entire input expression tree to the destination // type. Only do this if the dest type is a simple type, don't convert the // expression tree to something weird like i93 unless the source is also // strange. if ((DestTy->isVectorTy() || ShouldChangeType(SrcTy, DestTy)) && CanEvaluateTruncated(Src, DestTy)) { - + // If this cast is a truncate, evaluating in a different type always // eliminates the cast, so it is always a win.
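CanEvaluateTruncated is sound because truncation distributes over the wrapping integer operations it whitelists, so the whole tree can be recomputed in the narrow type. A quick C++ demonstration of that distribution:

#include <cassert>
#include <cstdint>

int main() {
  // Truncation distributes over wrapping add/mul and the bitwise ops,
  // which is why the whole tree can be evaluated directly in the narrow type.
  uint64_t X = 0x123456789ABCDEF0ull, Y = 0x0FEDCBA987654321ull;
  uint32_t TruncOfExpr = (uint32_t)(X * 3 + (Y ^ 0xFFFF));          // trunc(expr)
  uint32_t ExprOfTrunc = (uint32_t)X * 3 + ((uint32_t)Y ^ 0xFFFF);  // expr(trunc)
  assert(TruncOfExpr == ExprOfTrunc);
}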
DEBUG(dbgs() << "ICE: EvaluateInDifferentType converting expression type" @@ -462,7 +462,7 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) { Value *Zero = Constant::getNullValue(Src->getType()); return new ICmpInst(ICmpInst::ICMP_NE, Src, Zero); } - + // Transform trunc(lshr (zext A), Cst) to eliminate one type conversion. Value *A = 0; ConstantInt *Cst = 0; if (Src->hasOneUse() && @@ -472,7 +472,7 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) { // ASize < MidSize and MidSize > ResultSize, but don't know the relation // between ASize and ResultSize. unsigned ASize = A->getType()->getPrimitiveSizeInBits(); - + // If the shift amount is larger than the size of A, then the result is // known to be zero because all the input bits got shifted out. if (Cst->getZExtValue() >= ASize) @@ -485,7 +485,7 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) { Shift->takeName(Src); return CastInst::CreateIntegerCast(Shift, CI.getType(), false); } - + // Transform "trunc (and X, cst)" -> "and (trunc X), cst" so long as the dest // type isn't non-native. if (Src->hasOneUse() && isa<IntegerType>(Src->getType()) && @@ -508,7 +508,7 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, Instruction &CI, // cast to integer to avoid the comparison. if (ConstantInt *Op1C = dyn_cast<ConstantInt>(ICI->getOperand(1))) { const APInt &Op1CV = Op1C->getValue(); - + // zext (x <s 0) to i32 --> x>>u31 true if signbit set. // zext (x >s -1) to i32 --> (x>>u31)^1 true if signbit clear. if ((ICI->getPredicate() == ICmpInst::ICMP_SLT && Op1CV == 0) || @@ -538,14 +538,14 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, Instruction &CI, // zext (X != 0) to i32 --> X>>1 iff X has only the 2nd bit set. // zext (X != 1) to i32 --> X^1 iff X has only the low bit set. // zext (X != 2) to i32 --> (X>>1)^1 iff X has only the 2nd bit set. - if ((Op1CV == 0 || Op1CV.isPowerOf2()) && + if ((Op1CV == 0 || Op1CV.isPowerOf2()) && // This only works for EQ and NE ICI->isEquality()) { // If Op1C some other power of two, convert: uint32_t BitWidth = Op1C->getType()->getBitWidth(); APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0); ComputeMaskedBits(ICI->getOperand(0), KnownZero, KnownOne); - + APInt KnownZeroMask(~KnownZero); if (KnownZeroMask.isPowerOf2()) { // Exactly 1 possible 1? if (!DoXform) return ICI; @@ -559,7 +559,7 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, Instruction &CI, Res = ConstantExpr::getZExt(Res, CI.getType()); return ReplaceInstUsesWith(CI, Res); } - + uint32_t ShiftAmt = KnownZeroMask.logBase2(); Value *In = ICI->getOperand(0); if (ShiftAmt) { @@ -568,12 +568,12 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, Instruction &CI, In = Builder->CreateLShr(In, ConstantInt::get(In->getType(),ShiftAmt), In->getName()+".lobit"); } - + if ((Op1CV != 0) == isNE) { // Toggle the low bit. Constant *One = ConstantInt::get(In->getType(), 1); In = Builder->CreateXor(In, One); } - + if (CI.getType() == In->getType()) return ReplaceInstUsesWith(CI, In); return CastInst::CreateIntegerCast(In, CI.getType(), false/*ZExt*/); @@ -646,19 +646,19 @@ static bool CanEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear) { BitsToClear = 0; if (isa<Constant>(V)) return true; - + Instruction *I = dyn_cast<Instruction>(V); if (!I) return false; - + // If the input is a truncate from the destination type, we can trivially // eliminate it. 
if (isa<TruncInst>(I) && I->getOperand(0)->getType() == Ty) return true; - + // We can't extend or shrink something that has multiple uses: doing so would // require duplicating the instruction in general, which isn't profitable. if (!I->hasOneUse()) return false; - + unsigned Opc = I->getOpcode(), Tmp; switch (Opc) { case Instruction::ZExt: // zext(zext(x)) -> zext(x). @@ -678,7 +678,7 @@ static bool CanEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear) { // These can all be promoted if neither operand has 'bits to clear'. if (BitsToClear == 0 && Tmp == 0) return true; - + // If the operation is an AND/OR/XOR and the bits to clear are zero in the // other side, BitsToClear is ok. if (Tmp == 0 && @@ -691,10 +691,10 @@ static bool CanEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear) { APInt::getHighBitsSet(VSize, BitsToClear))) return true; } - + // Otherwise, we don't know how to analyze this BitsToClear case yet. return false; - + case Instruction::LShr: // We can promote lshr(x, cst) if we can promote x. This requires the // ultimate 'and' to clear out the high zero bits we're clearing out though. @@ -716,7 +716,7 @@ static bool CanEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear) { Tmp != BitsToClear) return false; return true; - + case Instruction::PHI: { // We can change a phi if we can change all operands. Note that we never // get into trouble with cyclic PHIs here because we only consider @@ -739,48 +739,48 @@ static bool CanEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear) { } Instruction *InstCombiner::visitZExt(ZExtInst &CI) { - // If this zero extend is only used by a truncate, let the truncate by + // If this zero extend is only used by a truncate, let the truncate be // eliminated before we try to optimize this zext. if (CI.hasOneUse() && isa<TruncInst>(CI.use_back())) return 0; - + // If one of the common conversion will work, do it. if (Instruction *Result = commonCastTransforms(CI)) return Result; - // See if we can simplify any instructions used by the input whose sole + // See if we can simplify any instructions used by the input whose sole // purpose is to compute bits we don't care about. if (SimplifyDemandedInstructionBits(CI)) return &CI; - + Value *Src = CI.getOperand(0); Type *SrcTy = Src->getType(), *DestTy = CI.getType(); - + // Attempt to extend the entire input expression tree to the destination // type. Only do this if the dest type is a simple type, don't convert the // expression tree to something weird like i93 unless the source is also // strange. unsigned BitsToClear; if ((DestTy->isVectorTy() || ShouldChangeType(SrcTy, DestTy)) && - CanEvaluateZExtd(Src, DestTy, BitsToClear)) { + CanEvaluateZExtd(Src, DestTy, BitsToClear)) { assert(BitsToClear < SrcTy->getScalarSizeInBits() && "Unreasonable BitsToClear"); - + // Okay, we can transform this! Insert the new expression now. DEBUG(dbgs() << "ICE: EvaluateInDifferentType converting expression type" " to avoid zero extend: " << CI); Value *Res = EvaluateInDifferentType(Src, DestTy, false); assert(Res->getType() == DestTy); - + uint32_t SrcBitsKept = SrcTy->getScalarSizeInBits()-BitsToClear; uint32_t DestBitSize = DestTy->getScalarSizeInBits(); - + // If the high bits are already filled with zeros, just replace this // cast with the result. if (MaskedValueIsZero(Res, APInt::getHighBitsSet(DestBitSize, DestBitSize-SrcBitsKept))) return ReplaceInstUsesWith(CI, Res); - + // We need to emit an AND to clear the high bits. 
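CanEvaluateZExtd's BitsToClear bookkeeping captures exactly this: an expression may be computed directly in the wide type provided the bits above the original width are masked off afterwards, which is the single AND visitZExt emits. In C++ terms:

#include <cassert>
#include <cstdint>

int main() {
  // Promote a 16-bit add to 32 bits: compute wide, then clear the bits
  // above the original width, exactly the AND emitted when BitsToClear
  // says the high bits may be dirty.
  uint16_t A = 0xF234, B = 0x8421;
  uint32_t Ref  = (uint32_t)(uint16_t)(A + B);           // zext of the i16 add
  uint32_t Wide = ((uint32_t)A + (uint32_t)B) & 0xFFFFu; // wide add + mask
  assert(Ref == Wide);
}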
Constant *C = ConstantInt::get(Res->getType(), APInt::getLowBitsSet(DestBitSize, SrcBitsKept)); @@ -792,7 +792,7 @@ Instruction *InstCombiner::visitZExt(ZExtInst &CI) { // 'and' which will be much cheaper than the pair of casts. if (TruncInst *CSrc = dyn_cast<TruncInst>(Src)) { // A->B->C cast // TODO: Subsume this into EvaluateInDifferentType. - + // Get the sizes of the types involved. We know that the intermediate type // will be smaller than A or C, but don't know the relation between A and C. Value *A = CSrc->getOperand(0); @@ -809,7 +809,7 @@ Instruction *InstCombiner::visitZExt(ZExtInst &CI) { Value *And = Builder->CreateAnd(A, AndConst, CSrc->getName()+".mask"); return new ZExtInst(And, CI.getType()); } - + if (SrcSize == DstSize) { APInt AndValue(APInt::getLowBitsSet(SrcSize, MidSize)); return BinaryOperator::CreateAnd(A, ConstantInt::get(A->getType(), @@ -818,7 +818,7 @@ Instruction *InstCombiner::visitZExt(ZExtInst &CI) { if (SrcSize > DstSize) { Value *Trunc = Builder->CreateTrunc(A, CI.getType()); APInt AndValue(APInt::getLowBitsSet(DstSize, MidSize)); - return BinaryOperator::CreateAnd(Trunc, + return BinaryOperator::CreateAnd(Trunc, ConstantInt::get(Trunc->getType(), AndValue)); } @@ -876,7 +876,7 @@ Instruction *InstCombiner::visitZExt(ZExtInst &CI) { Value *New = Builder->CreateZExt(X, CI.getType()); return BinaryOperator::CreateXor(New, ConstantInt::get(CI.getType(), 1)); } - + return 0; } @@ -989,14 +989,14 @@ static bool CanEvaluateSExtd(Value *V, Type *Ty) { // If this is a constant, it can be trivially promoted. if (isa<Constant>(V)) return true; - + Instruction *I = dyn_cast<Instruction>(V); if (!I) return false; - + // If this is a truncate from the dest type, we can trivially eliminate it. if (isa<TruncInst>(I) && I->getOperand(0)->getType() == Ty) return true; - + // We can't extend or shrink something that has multiple uses: doing so would // require duplicating the instruction in general, which isn't profitable. if (!I->hasOneUse()) return false; @@ -1015,14 +1015,14 @@ static bool CanEvaluateSExtd(Value *V, Type *Ty) { // These operators can all arbitrarily be extended if their inputs can. return CanEvaluateSExtd(I->getOperand(0), Ty) && CanEvaluateSExtd(I->getOperand(1), Ty); - + //case Instruction::Shl: TODO //case Instruction::LShr: TODO - + case Instruction::Select: return CanEvaluateSExtd(I->getOperand(1), Ty) && CanEvaluateSExtd(I->getOperand(2), Ty); - + case Instruction::PHI: { // We can change a phi if we can change all operands. Note that we never // get into trouble with cyclic PHIs here because we only consider @@ -1036,24 +1036,24 @@ static bool CanEvaluateSExtd(Value *V, Type *Ty) { // TODO: Can handle more cases here. break; } - + return false; } Instruction *InstCombiner::visitSExt(SExtInst &CI) { - // If this sign extend is only used by a truncate, let the truncate by - // eliminated before we try to optimize this zext. + // If this sign extend is only used by a truncate, let the truncate be + // eliminated before we try to optimize this sext. if (CI.hasOneUse() && isa<TruncInst>(CI.use_back())) return 0; - + if (Instruction *I = commonCastTransforms(CI)) return I; - - // See if we can simplify any instructions used by the input whose sole + + // See if we can simplify any instructions used by the input whose sole // purpose is to compute bits we don't care about. 
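The A->B->C zext-of-trunc case above collapses two casts into one AND with a low-bits mask. The same-width case in C++ (i64 -> i32 -> i64):

#include <cassert>
#include <cstdint>

int main() {
  // zext(trunc(X)) where source and destination widths match folds to a
  // single AND with the low-bits mask, replacing two casts.
  uint64_t X = 0xDEADBEEFCAFEF00Dull;
  uint64_t ViaCasts = (uint64_t)(uint32_t)X; // i64 -> i32 -> i64
  uint64_t ViaMask  = X & 0xFFFFFFFFull;     // one AND
  assert(ViaCasts == ViaMask);
}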
if (SimplifyDemandedInstructionBits(CI)) return &CI; - + Value *Src = CI.getOperand(0); Type *SrcTy = Src->getType(), *DestTy = CI.getType(); @@ -1076,7 +1076,7 @@ Instruction *InstCombiner::visitSExt(SExtInst &CI) { // cast with the result. if (ComputeNumSignBits(Res) > DestBitSize - SrcBitSize) return ReplaceInstUsesWith(CI, Res); - + // We need to emit a shl + ashr to do the sign extend. Value *ShAmt = ConstantInt::get(DestTy, DestBitSize-SrcBitSize); return BinaryOperator::CreateAShr(Builder->CreateShl(Res, ShAmt, "sext"), @@ -1089,7 +1089,7 @@ Instruction *InstCombiner::visitSExt(SExtInst &CI) { if (TI->hasOneUse() && TI->getOperand(0)->getType() == DestTy) { uint32_t SrcBitSize = SrcTy->getScalarSizeInBits(); uint32_t DestBitSize = DestTy->getScalarSizeInBits(); - + // We need to emit a shl + ashr to do the sign extend. Value *ShAmt = ConstantInt::get(DestTy, DestBitSize-SrcBitSize); Value *Res = Builder->CreateShl(TI->getOperand(0), ShAmt, "sext"); @@ -1125,7 +1125,7 @@ Instruction *InstCombiner::visitSExt(SExtInst &CI) { A = Builder->CreateShl(A, ShAmtV, CI.getName()); return BinaryOperator::CreateAShr(A, ShAmtV); } - + return 0; } @@ -1147,7 +1147,7 @@ static Value *LookThroughFPExtensions(Value *V) { if (Instruction *I = dyn_cast<Instruction>(V)) if (I->getOpcode() == Instruction::FPExt) return LookThroughFPExtensions(I->getOperand(0)); - + // If this value is a constant, return the constant in the smallest FP type // that can accurately represent it. This allows us to turn // (float)((double)X+2.0) into x+2.0f. @@ -1166,14 +1166,14 @@ static Value *LookThroughFPExtensions(Value *V) { return V; // Don't try to shrink to various long double types. } - + return V; } Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) { if (Instruction *I = commonCastTransforms(CI)) return I; - + // If we have fptrunc(fadd (fpextend x), (fpextend y)), where x and y are // smaller than the destination type, we can eliminate the truncate by doing // the add as the smaller type. 
This applies to fadd/fsub/fmul/fdiv as well @@ -1190,7 +1190,7 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) { Type *SrcTy = OpI->getType(); Value *LHSTrunc = LookThroughFPExtensions(OpI->getOperand(0)); Value *RHSTrunc = LookThroughFPExtensions(OpI->getOperand(1)); - if (LHSTrunc->getType() != SrcTy && + if (LHSTrunc->getType() != SrcTy && RHSTrunc->getType() != SrcTy) { unsigned DstSize = CI.getType()->getScalarSizeInBits(); // If the source types were both smaller than the destination type of @@ -1202,10 +1202,36 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) { return BinaryOperator::Create(OpI->getOpcode(), LHSTrunc, RHSTrunc); } } - break; + break; + } + + // (fptrunc (fneg x)) -> (fneg (fptrunc x)) + if (BinaryOperator::isFNeg(OpI)) { + Value *InnerTrunc = Builder->CreateFPTrunc(OpI->getOperand(1), + CI.getType()); + return BinaryOperator::CreateFNeg(InnerTrunc); + } + } + + IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI.getOperand(0)); + if (II) { + switch (II->getIntrinsicID()) { + default: break; + case Intrinsic::fabs: { + // (fptrunc (fabs x)) -> (fabs (fptrunc x)) + Value *InnerTrunc = Builder->CreateFPTrunc(II->getArgOperand(0), + CI.getType()); + Type *IntrinsicType[] = { CI.getType() }; + Function *Overload = + Intrinsic::getDeclaration(CI.getParent()->getParent()->getParent(), + II->getIntrinsicID(), IntrinsicType); + + Value *Args[] = { InnerTrunc }; + return CallInst::Create(Overload, Args, II->getName()); + } } } - + // Fold (fptrunc (sqrt (fpext x))) -> (sqrtf x) CallInst *Call = dyn_cast<CallInst>(CI.getOperand(0)); if (Call && Call->getCalledFunction() && TLI->has(LibFunc::sqrtf) && @@ -1220,7 +1246,7 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) { Arg->getOperand(0)->getType()->isFloatTy()) { Function *Callee = Call->getCalledFunction(); Module *M = CI.getParent()->getParent()->getParent(); - Constant *SqrtfFunc = M->getOrInsertFunction("sqrtf", + Constant *SqrtfFunc = M->getOrInsertFunction("sqrtf", Callee->getAttributes(), Builder->getFloatTy(), Builder->getFloatTy(), @@ -1228,15 +1254,15 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) { CallInst *ret = CallInst::Create(SqrtfFunc, Arg->getOperand(0), "sqrtfcall"); ret->setAttributes(Callee->getAttributes()); - - + + // Remove the old Call. With -fmath-errno, it won't get marked readnone. ReplaceInstUsesWith(*Call, UndefValue::get(Call->getType())); EraseInstFromFunction(*Call); return ret; } } - + return 0; } @@ -1254,7 +1280,7 @@ Instruction *InstCombiner::visitFPToUI(FPToUIInst &FI) { // This is safe if the intermediate type has enough bits in its mantissa to // accurately represent all values of X. For example, do not do this with // i64->float->i64. This is also safe for sitofp case, because any negative - // 'X' value would cause an undefined result for the fptoui. + // 'X' value would cause an undefined result for the fptoui. if ((isa<UIToFPInst>(OpI) || isa<SIToFPInst>(OpI)) && OpI->getOperand(0)->getType() == FI.getType() && (int)FI.getType()->getScalarSizeInBits() < /*extra bit for sign */ @@ -1268,19 +1294,19 @@ Instruction *InstCombiner::visitFPToSI(FPToSIInst &FI) { Instruction *OpI = dyn_cast<Instruction>(FI.getOperand(0)); if (OpI == 0) return commonCastTransforms(FI); - + // fptosi(sitofp(X)) --> X // fptosi(uitofp(X)) --> X // This is safe if the intermediate type has enough bits in its mantissa to // accurately represent all values of X. For example, do not do this with // i64->float->i64. 
This is also safe for sitofp case, because any negative - // 'X' value would cause an undefined result for the fptoui. + // 'X' value would cause an undefined result for the fptoui. if ((isa<UIToFPInst>(OpI) || isa<SIToFPInst>(OpI)) && OpI->getOperand(0)->getType() == FI.getType() && (int)FI.getType()->getScalarSizeInBits() <= OpI->getType()->getFPMantissaWidth()) return ReplaceInstUsesWith(FI, OpI->getOperand(0)); - + return commonCastTransforms(FI); } @@ -1296,21 +1322,16 @@ Instruction *InstCombiner::visitIntToPtr(IntToPtrInst &CI) { // If the source integer type is not the intptr_t type for this target, do a // trunc or zext to the intptr_t type, then inttoptr of it. This allows the // cast to be exposed to other transforms. - if (TD) { - if (CI.getOperand(0)->getType()->getScalarSizeInBits() > - TD->getPointerSizeInBits()) { - Value *P = Builder->CreateTrunc(CI.getOperand(0), - TD->getIntPtrType(CI.getContext())); - return new IntToPtrInst(P, CI.getType()); - } - if (CI.getOperand(0)->getType()->getScalarSizeInBits() < - TD->getPointerSizeInBits()) { - Value *P = Builder->CreateZExt(CI.getOperand(0), - TD->getIntPtrType(CI.getContext())); - return new IntToPtrInst(P, CI.getType()); - } + if (TD && CI.getOperand(0)->getType()->getScalarSizeInBits() != + TD->getPointerSizeInBits()) { + Type *Ty = TD->getIntPtrType(CI.getContext()); + if (CI.getType()->isVectorTy()) // Handle vectors of pointers. + Ty = VectorType::get(Ty, CI.getType()->getVectorNumElements()); + + Value *P = Builder->CreateZExtOrTrunc(CI.getOperand(0), Ty); + return new IntToPtrInst(P, CI.getType()); } - + if (Instruction *I = commonCastTransforms(CI)) return I; @@ -1320,19 +1341,19 @@ Instruction *InstCombiner::visitIntToPtr(IntToPtrInst &CI) { /// @brief Implement the transforms for cast of pointer (bitcast/ptrtoint) Instruction *InstCombiner::commonPointerCastTransforms(CastInst &CI) { Value *Src = CI.getOperand(0); - + if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Src)) { // If casting the result of a getelementptr instruction with no offset, turn // this into a cast of the original pointer! if (GEP->hasAllZeroIndices()) { // Changing the cast operand is usually not a good idea but it is safe - // here because the pointer operand is being replaced with another + // here because the pointer operand is being replaced with another // pointer operand so the opcode doesn't need to change. Worklist.Add(GEP); CI.setOperand(0, GEP->getOperand(0)); return &CI; } - + // If the GEP has a single use, and the base pointer is a bitcast, and the // GEP computes a constant offset, see if we can convert these three // instructions into fewer. This typically happens with unions and other @@ -1353,15 +1374,15 @@ Instruction *InstCombiner::commonPointerCastTransforms(CastInst &CI) { Builder->CreateInBoundsGEP(OrigBase, NewIndices) : Builder->CreateGEP(OrigBase, NewIndices); NGEP->takeName(GEP); - + if (isa<BitCastInst>(CI)) return new BitCastInst(NGEP, CI.getType()); assert(isa<PtrToIntInst>(CI)); return new PtrToIntInst(NGEP, CI.getType()); - } + } } } - + return commonCastTransforms(CI); } @@ -1369,19 +1390,15 @@ Instruction *InstCombiner::visitPtrToInt(PtrToIntInst &CI) { // If the destination integer type is not the intptr_t type for this target, // do a ptrtoint to intptr_t then do a trunc or zext. This allows the cast // to be exposed to other transforms. 
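The inttoptr/ptrtoint canonicalization described above pivots every pointer/integer cast through the pointer-sized integer type so later transforms see one canonical shape. The C++ analogue of that pointer-sized step is uintptr_t; a round-trip sketch:

#include <cassert>
#include <cstdint>

int main() {
  // The pointer-sized integer step corresponds to uintptr_t; narrower or
  // wider integer types are then reached with an ordinary trunc/zext on
  // the integer side.
  int Obj = 0;
  std::uintptr_t Bits = reinterpret_cast<std::uintptr_t>(&Obj);
  int *Back = reinterpret_cast<int *>(Bits); // inttoptr(ptrtoint p) == p
  assert(Back == &Obj);
}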
- if (TD) { - if (CI.getType()->getScalarSizeInBits() < TD->getPointerSizeInBits()) { - Value *P = Builder->CreatePtrToInt(CI.getOperand(0), - TD->getIntPtrType(CI.getContext())); - return new TruncInst(P, CI.getType()); - } - if (CI.getType()->getScalarSizeInBits() > TD->getPointerSizeInBits()) { - Value *P = Builder->CreatePtrToInt(CI.getOperand(0), - TD->getIntPtrType(CI.getContext())); - return new ZExtInst(P, CI.getType()); - } + if (TD && CI.getType()->getScalarSizeInBits() != TD->getPointerSizeInBits()) { + Type *Ty = TD->getIntPtrType(CI.getContext()); + if (CI.getType()->isVectorTy()) // Handle vectors of pointers. + Ty = VectorType::get(Ty, CI.getType()->getVectorNumElements()); + + Value *P = Builder->CreatePtrToInt(CI.getOperand(0), Ty); + return CastInst::CreateIntegerCast(P, CI.getType(), /*isSigned=*/false); } - + return commonPointerCastTransforms(CI); } @@ -1396,33 +1413,33 @@ static Instruction *OptimizeVectorResize(Value *InVal, VectorType *DestTy, // element size, or the input is a multiple of the output element size. // Convert the input type to have the same element type as the output. VectorType *SrcTy = cast<VectorType>(InVal->getType()); - + if (SrcTy->getElementType() != DestTy->getElementType()) { // The input types don't need to be identical, but for now they must be the // same size. There is no specific reason we couldn't handle things like // <4 x i16> -> <4 x i32> by bitcasting to <2 x i32> but haven't gotten - // there yet. + // there yet. if (SrcTy->getElementType()->getPrimitiveSizeInBits() != DestTy->getElementType()->getPrimitiveSizeInBits()) return 0; - + SrcTy = VectorType::get(DestTy->getElementType(), SrcTy->getNumElements()); InVal = IC.Builder->CreateBitCast(InVal, SrcTy); } - + // Now that the element types match, get the shuffle mask and RHS of the // shuffle to use, which depends on whether we're increasing or decreasing the // size of the input. SmallVector<uint32_t, 16> ShuffleMask; Value *V2; - + if (SrcTy->getNumElements() > DestTy->getNumElements()) { // If we're shrinking the number of elements, just shuffle in the low // elements from the input and use undef as the second shuffle input. V2 = UndefValue::get(SrcTy); for (unsigned i = 0, e = DestTy->getNumElements(); i != e; ++i) ShuffleMask.push_back(i); - + } else { // If we're increasing the number of elements, shuffle in all of the // elements from InVal and fill the rest of the result elements with zeros @@ -1436,7 +1453,7 @@ static Instruction *OptimizeVectorResize(Value *InVal, VectorType *DestTy, for (unsigned i = 0, e = DestTy->getNumElements()-SrcElts; i != e; ++i) ShuffleMask.push_back(SrcElts); } - + return new ShuffleVectorInst(InVal, V2, ConstantDataVector::get(V2->getContext(), ShuffleMask)); @@ -1463,7 +1480,7 @@ static bool CollectInsertionElements(Value *V, unsigned ElementIndex, Type *VecEltTy) { // Undef values never contribute useful bits to the result. if (isa<UndefValue>(V)) return true; - + // If we got down to a value of the right type, we win, try inserting into the // right element. if (V->getType() == VecEltTy) { @@ -1471,15 +1488,15 @@ static bool CollectInsertionElements(Value *V, unsigned ElementIndex, if (Constant *C = dyn_cast<Constant>(V)) if (C->isNullValue()) return true; - + // Fail if multiple elements are inserted into this slot. 
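OptimizeVectorResize above builds its shuffle mask in two regimes: take the low lanes when shrinking, or take every source lane and pad with a lane of the zero vector when growing. A direct C++ sketch of that mask construction:

#include <cassert>
#include <vector>

// Mask construction mirroring OptimizeVectorResize: low lanes when
// shrinking; all source lanes plus padding lanes taken from a zero vector
// when growing (index SrcElts selects lane 0 of the second shuffle operand).
std::vector<int> resizeMask(int SrcElts, int DstElts) {
  std::vector<int> Mask;
  if (SrcElts > DstElts) {
    for (int I = 0; I != DstElts; ++I)
      Mask.push_back(I);
  } else {
    for (int I = 0; I != SrcElts; ++I)
      Mask.push_back(I);
    for (int I = SrcElts; I != DstElts; ++I)
      Mask.push_back(SrcElts);
  }
  return Mask;
}

int main() {
  assert(resizeMask(4, 2) == (std::vector<int>{0, 1}));
  assert(resizeMask(2, 4) == (std::vector<int>{0, 1, 2, 2}));
}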
if (ElementIndex >= Elements.size() || Elements[ElementIndex] != 0) return false; - + Elements[ElementIndex] = V; return true; } - + if (Constant *C = dyn_cast<Constant>(V)) { // Figure out the # elements this provides, and bitcast it or slice it up // as required. @@ -1490,7 +1507,7 @@ static bool CollectInsertionElements(Value *V, unsigned ElementIndex, if (NumElts == 1) return CollectInsertionElements(ConstantExpr::getBitCast(C, VecEltTy), ElementIndex, Elements, VecEltTy); - + // Okay, this is a constant that covers multiple elements. Slice it up into // pieces and insert each element-sized piece into the vector. if (!isa<IntegerType>(C->getType())) @@ -1498,7 +1515,7 @@ static bool CollectInsertionElements(Value *V, unsigned ElementIndex, C->getType()->getPrimitiveSizeInBits())); unsigned ElementSize = VecEltTy->getPrimitiveSizeInBits(); Type *ElementIntTy = IntegerType::get(C->getContext(), ElementSize); - + for (unsigned i = 0; i != NumElts; ++i) { Constant *Piece = ConstantExpr::getLShr(C, ConstantInt::get(C->getType(), i*ElementSize)); @@ -1508,23 +1525,23 @@ static bool CollectInsertionElements(Value *V, unsigned ElementIndex, } return true; } - + if (!V->hasOneUse()) return false; - + Instruction *I = dyn_cast<Instruction>(V); if (I == 0) return false; switch (I->getOpcode()) { default: return false; // Unhandled case. case Instruction::BitCast: return CollectInsertionElements(I->getOperand(0), ElementIndex, - Elements, VecEltTy); + Elements, VecEltTy); case Instruction::ZExt: if (!isMultipleOfTypeSize( I->getOperand(0)->getType()->getPrimitiveSizeInBits(), VecEltTy)) return false; return CollectInsertionElements(I->getOperand(0), ElementIndex, - Elements, VecEltTy); + Elements, VecEltTy); case Instruction::Or: return CollectInsertionElements(I->getOperand(0), ElementIndex, Elements, VecEltTy) && @@ -1536,11 +1553,11 @@ static bool CollectInsertionElements(Value *V, unsigned ElementIndex, if (CI == 0) return false; if (!isMultipleOfTypeSize(CI->getZExtValue(), VecEltTy)) return false; unsigned IndexShift = getTypeSizeIndex(CI->getZExtValue(), VecEltTy); - + return CollectInsertionElements(I->getOperand(0), ElementIndex+IndexShift, Elements, VecEltTy); } - + } } @@ -1575,11 +1592,11 @@ static Value *OptimizeIntegerToVectorInsertions(BitCastInst &CI, Value *Result = Constant::getNullValue(CI.getType()); for (unsigned i = 0, e = Elements.size(); i != e; ++i) { if (Elements[i] == 0) continue; // Unset element. 
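CollectInsertionElements slices a wide integer constant into element-sized pieces with lshr + trunc, low piece first. The same slicing in plain C++, for an assumed i64-to-4-x-i16 split:

#include <cassert>
#include <cstdint>
#include <vector>

// Slice a 64-bit constant into four 16-bit elements with shift + truncate,
// lowest piece first, matching the lshr/trunc loop above.
std::vector<uint16_t> sliceToElements(uint64_t C) {
  std::vector<uint16_t> Elts;
  for (int I = 0; I != 4; ++I)
    Elts.push_back((uint16_t)(C >> (16 * I)));
  return Elts;
}

int main() {
  std::vector<uint16_t> E = sliceToElements(0x1111222233334444ull);
  assert(E == (std::vector<uint16_t>{0x4444, 0x3333, 0x2222, 0x1111}));
}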
- + Result = IC.Builder->CreateInsertElement(Result, Elements[i], IC.Builder->getInt32(i)); } - + return Result; } @@ -1607,11 +1624,11 @@ static Instruction *OptimizeIntToFloatBitCast(BitCastInst &CI,InstCombiner &IC){ VecTy->getPrimitiveSizeInBits() / DestWidth); VecInput = IC.Builder->CreateBitCast(VecInput, VecTy); } - + return ExtractElementInst::Create(VecInput, IC.Builder->getInt32(0)); } } - + // bitcast(trunc(lshr(bitcast(somevector), cst)) ConstantInt *ShAmt = 0; if (match(Src, m_Trunc(m_LShr(m_BitCast(m_Value(VecInput)), @@ -1628,7 +1645,7 @@ static Instruction *OptimizeIntToFloatBitCast(BitCastInst &CI,InstCombiner &IC){ VecTy->getPrimitiveSizeInBits() / DestWidth); VecInput = IC.Builder->CreateBitCast(VecInput, VecTy); } - + unsigned Elt = ShAmt->getZExtValue() / DestWidth; return ExtractElementInst::Create(VecInput, IC.Builder->getInt32(Elt)); } @@ -1652,12 +1669,12 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) { PointerType *SrcPTy = cast<PointerType>(SrcTy); Type *DstElTy = DstPTy->getElementType(); Type *SrcElTy = SrcPTy->getElementType(); - + // If the address spaces don't match, don't eliminate the bitcast, which is // required for changing types. if (SrcPTy->getAddressSpace() != DstPTy->getAddressSpace()) return 0; - + // If we are casting a alloca to a pointer to a type of the same // size, rewrite the allocation instruction to allocate the "right" type. // There is no need to modify malloc calls because it is their bitcast that @@ -1665,14 +1682,14 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) { if (AllocaInst *AI = dyn_cast<AllocaInst>(Src)) if (Instruction *V = PromoteCastOfAllocation(CI, *AI)) return V; - + // If the source and destination are pointers, and this cast is equivalent // to a getelementptr X, 0, 0, 0... turn it into the appropriate gep. // This can enhance SROA and other transforms that want type-safe pointers. Constant *ZeroUInt = Constant::getNullValue(Type::getInt32Ty(CI.getContext())); unsigned NumZeros = 0; - while (SrcElTy != DstElTy && + while (SrcElTy != DstElTy && isa<CompositeType>(SrcElTy) && !SrcElTy->isPointerTy() && SrcElTy->getNumContainedTypes() /* not "{}" */) { SrcElTy = cast<CompositeType>(SrcElTy)->getTypeAtIndex(ZeroUInt); @@ -1685,7 +1702,7 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) { return GetElementPtrInst::CreateInBounds(Src, Idxs); } } - + // Try to optimize int -> float bitcasts. if ((DestTy->isFloatTy() || DestTy->isDoubleTy()) && isa<IntegerType>(SrcTy)) if (Instruction *I = OptimizeIntToFloatBitCast(CI, *this)) @@ -1698,7 +1715,7 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) { Constant::getNullValue(Type::getInt32Ty(CI.getContext()))); // FIXME: Canonicalize bitcast(insertelement) -> insertelement(bitcast) } - + if (isa<IntegerType>(SrcTy)) { // If this is a cast from an integer to vector, check to see if the input // is a trunc or zext of a bitcast from vector. If so, we can replace all @@ -1711,7 +1728,7 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) { cast<VectorType>(DestTy), *this)) return I; } - + // If the input is an 'or' instruction, we may be doing shifts and ors to // assemble the elements of the vector manually. Try to rip the code out // and replace it with insertelements. 
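The int-to-float bitcast folds above rely on bitcast being a pure reinterpretation of bits. The portable C++ equivalent is memcpy between same-sized objects; a one-value check:

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  // A bitcast changes the type, never the bits; memcpy between same-sized
  // objects is the portable C++ spelling of that reinterpretation.
  uint32_t Bits = 0x3F800000u; // IEEE-754 single-precision 1.0f
  float F;
  std::memcpy(&F, &Bits, sizeof F);
  assert(F == 1.0f);
}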
@@ -1721,18 +1738,29 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) { } if (VectorType *SrcVTy = dyn_cast<VectorType>(SrcTy)) { - if (SrcVTy->getNumElements() == 1 && !DestTy->isVectorTy()) { - Value *Elem = - Builder->CreateExtractElement(Src, - Constant::getNullValue(Type::getInt32Ty(CI.getContext()))); - return CastInst::Create(Instruction::BitCast, Elem, DestTy); + if (SrcVTy->getNumElements() == 1) { + // If our destination is not a vector, then make this a straight + // scalar-scalar cast. + if (!DestTy->isVectorTy()) { + Value *Elem = + Builder->CreateExtractElement(Src, + Constant::getNullValue(Type::getInt32Ty(CI.getContext()))); + return CastInst::Create(Instruction::BitCast, Elem, DestTy); + } + + // Otherwise, see if our source is an insert. If so, then use the scalar + // component directly. + if (InsertElementInst *IEI = + dyn_cast<InsertElementInst>(CI.getOperand(0))) + return CastInst::Create(Instruction::BitCast, IEI->getOperand(1), + DestTy); } } if (ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(Src)) { // Okay, we have (bitcast (shuffle ..)). Check to see if this is // a bitcast to a vector with the same # elts. - if (SVI->hasOneUse() && DestTy->isVectorTy() && + if (SVI->hasOneUse() && DestTy->isVectorTy() && cast<VectorType>(DestTy)->getNumElements() == SVI->getType()->getNumElements() && SVI->getType()->getNumElements() == @@ -1741,9 +1769,9 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) { // If either of the operands is a cast from CI.getType(), then // evaluating the shuffle in the casted destination's type will allow // us to eliminate at least one cast. - if (((Tmp = dyn_cast<BitCastInst>(SVI->getOperand(0))) && + if (((Tmp = dyn_cast<BitCastInst>(SVI->getOperand(0))) && Tmp->getOperand(0)->getType() == DestTy) || - ((Tmp = dyn_cast<BitCastInst>(SVI->getOperand(1))) && + ((Tmp = dyn_cast<BitCastInst>(SVI->getOperand(1))) && Tmp->getOperand(0)->getType() == DestTy)) { Value *LHS = Builder->CreateBitCast(SVI->getOperand(0), DestTy); Value *RHS = Builder->CreateBitCast(SVI->getOperand(1), DestTy); @@ -1753,7 +1781,7 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) { } } } - + if (SrcTy->isPointerTy()) return commonPointerCastTransforms(CI); return commonCastTransforms(CI); diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp index 40e559e..bad46b4 100644 --- a/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -1331,6 +1331,25 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, return new ICmpInst(TrueIfSigned ? ICmpInst::ICMP_NE : ICmpInst::ICMP_EQ, And, Constant::getNullValue(And->getType())); } + + // Transform (icmp pred iM (shl iM %v, N), CI) + // -> (icmp pred i(M-N) (trunc %v iM to i(M-N)), (trunc (CI>>N))) + // Transform the shl to a trunc if (trunc (CI>>N)) has no loss. + // This lets us get rid of the shift in favor of a trunc, which can be + // free on the target. It has the additional benefit of comparing against a + // smaller constant, which tends to be more target-friendly.
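// Editor's sketch (not from the patch): the arithmetic fact behind the
// transform above, shown for the equality case on i32 with hypothetical
// helper names. When CI's low N bits are all zero, comparing (v << N)
// against CI is the same as comparing the low 32-N bits of v against CI >> N.
#include <cstdint>
static bool cmpViaShl(uint32_t V, unsigned N, uint32_t CI) {
  return (V << N) == CI;              // original form: icmp eq (shl V, N), CI
}
static bool cmpViaTrunc(uint32_t V, unsigned N, uint32_t CI) {
  uint32_t LowBits = ~0u >> N;        // models the trunc to i(32-N)
  return (V & LowBits) == (CI >> N);  // valid because CI's low N bits are 0
}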
+ unsigned Amt = ShAmt->getLimitedValue(TypeBits-1); + if (Amt != 0 && RHSV.countTrailingZeros() >= Amt) { + Type *NTy = IntegerType::get(ICI.getContext(), TypeBits - Amt); + Constant *NCI = ConstantExpr::getTrunc( + ConstantExpr::getAShr(RHS, + ConstantInt::get(RHS->getType(), Amt)), + NTy); + return new ICmpInst(ICI.getPredicate(), + Builder->CreateTrunc(LHSI->getOperand(0), NTy), + NCI); + } + break; } diff --git a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp index d0f4392..8e4267f 100644 --- a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp +++ b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp @@ -377,6 +377,8 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) { if (Value *V = SimplifyFMulInst(Op0, Op1, I.getFastMathFlags(), TD)) return ReplaceInstUsesWith(I, V); + bool AllowReassociate = I.hasUnsafeAlgebra(); + // Simplify mul instructions with a constant RHS. if (isa<Constant>(Op1)) { // Try to fold constant mul into select arguments. @@ -389,7 +391,7 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) { return NV; ConstantFP *C = dyn_cast<ConstantFP>(Op1); - if (C && I.hasUnsafeAlgebra() && C->getValueAPF().isNormal()) { + if (C && AllowReassociate && C->getValueAPF().isNormal()) { // Let MDC denote an expression in one of these forms: // X * C, C/X, X/C, where C is a constant. // @@ -430,7 +432,7 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) { BinaryOperator::CreateFAdd(M0, M1) : BinaryOperator::CreateFSub(M0, M1); Instruction *RI = cast<Instruction>(R); - RI->setHasUnsafeAlgebra(true); + RI->copyFastMathFlags(&I); return RI; } } @@ -438,9 +440,6 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) { } } - if (Value *Op0v = dyn_castFNegVal(Op0)) // -X * -Y = X*Y - if (Value *Op1v = dyn_castFNegVal(Op1)) - return BinaryOperator::CreateFMul(Op0v, Op1v); // Under unsafe algebra do: // X * log2(0.5*Y) = X*log2(Y) - X @@ -469,36 +468,66 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) { } } - // X * cond ? 1.0 : 0.0 => cond ? 
X : 0.0 - if (I.hasNoNaNs() && I.hasNoSignedZeros()) { - Value *V0 = I.getOperand(0); - Value *V1 = I.getOperand(1); - Value *Cond, *SLHS, *SRHS; - bool Match = false; - - if (match(V0, m_Select(m_Value(Cond), m_Value(SLHS), m_Value(SRHS)))) { - Match = true; - } else if (match(V1, m_Select(m_Value(Cond), m_Value(SLHS), - m_Value(SRHS)))) { - Match = true; - std::swap(V0, V1); + // Handle the symmetric situation in a 2-iteration loop + Value *Opnd0 = Op0; + Value *Opnd1 = Op1; + for (int i = 0; i < 2; i++) { + bool IgnoreZeroSign = I.hasNoSignedZeros(); + if (BinaryOperator::isFNeg(Opnd0, IgnoreZeroSign)) { + Value *N0 = dyn_castFNegVal(Opnd0, IgnoreZeroSign); + Value *N1 = dyn_castFNegVal(Opnd1, IgnoreZeroSign); + + // -X * -Y => X*Y + if (N1) + return BinaryOperator::CreateFMul(N0, N1); + + if (Opnd0->hasOneUse()) { + // -X * Y => -(X*Y) (Promote negation as high as possible) + Value *T = Builder->CreateFMul(N0, Opnd1); + cast<Instruction>(T)->setDebugLoc(I.getDebugLoc()); + Instruction *Neg = BinaryOperator::CreateFNeg(T); + if (I.getFastMathFlags().any()) { + cast<Instruction>(T)->copyFastMathFlags(&I); + Neg->copyFastMathFlags(&I); + } + return Neg; + } } - if (Match) { - ConstantFP *C0 = dyn_cast<ConstantFP>(SLHS); - ConstantFP *C1 = dyn_cast<ConstantFP>(SRHS); - - if (C0 && C1 && - ((C0->isZero() && C1->isExactlyValue(1.0)) || - (C1->isZero() && C0->isExactlyValue(1.0)))) { - Value *T; - if (C0->isZero()) - T = Builder->CreateSelect(Cond, SLHS, V1); - else - T = Builder->CreateSelect(Cond, V1, SRHS); - return ReplaceInstUsesWith(I, T); + // (X*Y) * X => (X*X) * Y where Y != X + // The purpose is two-fold: + // 1) to form a power expression (of X). + // 2) potentially shorten the critical path: After transformation, the + // latency of the instruction Y is amortized by the expression of X*X, + // and therefore Y is in a "less critical" position compared to what it + // was before the transformation. + // + if (AllowReassociate) { + Value *Opnd0_0, *Opnd0_1; + if (Opnd0->hasOneUse() && + match(Opnd0, m_FMul(m_Value(Opnd0_0), m_Value(Opnd0_1)))) { + Value *Y = 0; + if (Opnd0_0 == Opnd1 && Opnd0_1 != Opnd1) + Y = Opnd0_1; + else if (Opnd0_1 == Opnd1 && Opnd0_0 != Opnd1) + Y = Opnd0_0; + + if (Y) { + Instruction *T = cast<Instruction>(Builder->CreateFMul(Opnd1, Opnd1)); + T->copyFastMathFlags(&I); + T->setDebugLoc(I.getDebugLoc()); + + Instruction *R = BinaryOperator::CreateFMul(T, Y); + R->copyFastMathFlags(&I); + return R; + } } } + + if (!isa<Constant>(Op1)) + std::swap(Opnd0, Opnd1); + else + break; } return Changed ? &I : 0; @@ -784,21 +813,140 @@ Instruction *InstCombiner::visitSDiv(BinaryOperator &I) { return 0; } +/// CvtFDivConstToReciprocal tries to convert X/C into X*1/C if C is not a +/// special FP value and: +/// 1) 1/C is exact, or +/// 2) reciprocal is allowed. +/// If the conversion was successful, the simplified expression "X * 1/C" is +/// returned; otherwise, NULL is returned.
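// Editor's illustration (not part of the patch): what "1/C is exact" means in
// binary floating point, using a plain-C++ stand-in for
// APFloat::getExactInverse. Reciprocals of powers of two are exact
// (e.g. 1/4.0 == 0.25), so X/C and X*(1/C) round identically; most other
// constants (e.g. 3.0) have inexact reciprocals, and the rewrite then happens
// only when reciprocals are explicitly allowed.
#include <cmath>
static bool reciprocalIsExact(double C) {
  int Exp;
  // |C| is a power of two iff its mantissa is exactly 0.5; this ignores the
  // range checks (overflow / denormal inverse) the real code also performs.
  return std::frexp(std::fabs(C), &Exp) == 0.5;
}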
+/// +static Instruction *CvtFDivConstToReciprocal(Value *Dividend, + ConstantFP *Divisor, + bool AllowReciprocal) { + const APFloat &FpVal = Divisor->getValueAPF(); + APFloat Reciprocal(FpVal.getSemantics()); + bool Cvt = FpVal.getExactInverse(&Reciprocal); + + if (!Cvt && AllowReciprocal && FpVal.isNormal()) { + Reciprocal = APFloat(FpVal.getSemantics(), 1.0f); + (void)Reciprocal.divide(FpVal, APFloat::rmNearestTiesToEven); + Cvt = !Reciprocal.isDenormal(); + } + + if (!Cvt) + return 0; + + ConstantFP *R; + R = ConstantFP::get(Dividend->getType()->getContext(), Reciprocal); + return BinaryOperator::CreateFMul(Dividend, R); +} + Instruction *InstCombiner::visitFDiv(BinaryOperator &I) { Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); if (Value *V = SimplifyFDivInst(Op0, Op1, TD)) return ReplaceInstUsesWith(I, V); + bool AllowReassociate = I.hasUnsafeAlgebra(); + bool AllowReciprocal = I.hasAllowReciprocal(); + if (ConstantFP *Op1C = dyn_cast<ConstantFP>(Op1)) { - const APFloat &Op1F = Op1C->getValueAPF(); - - // If the divisor has an exact multiplicative inverse we can turn the fdiv - // into a cheaper fmul. - APFloat Reciprocal(Op1F.getSemantics()); - if (Op1F.getExactInverse(&Reciprocal)) { - ConstantFP *RFP = ConstantFP::get(Builder->getContext(), Reciprocal); - return BinaryOperator::CreateFMul(Op0, RFP); + if (AllowReassociate) { + ConstantFP *C1 = 0; + ConstantFP *C2 = Op1C; + Value *X; + Instruction *Res = 0; + + if (match(Op0, m_FMul(m_Value(X), m_ConstantFP(C1)))) { + // (X*C1)/C2 => X * (C1/C2) + // + Constant *C = ConstantExpr::getFDiv(C1, C2); + const APFloat &F = cast<ConstantFP>(C)->getValueAPF(); + if (F.isNormal() && !F.isDenormal()) + Res = BinaryOperator::CreateFMul(X, C); + } else if (match(Op0, m_FDiv(m_Value(X), m_ConstantFP(C1)))) { + // (X/C1)/C2 => X /(C2*C1) [=> X * 1/(C2*C1) if reciprocal is allowed] + // + Constant *C = ConstantExpr::getFMul(C1, C2); + const APFloat &F = cast<ConstantFP>(C)->getValueAPF(); + if (F.isNormal() && !F.isDenormal()) { + Res = CvtFDivConstToReciprocal(X, cast<ConstantFP>(C), + AllowReciprocal); + if (!Res) + Res = BinaryOperator::CreateFDiv(X, C); + } + } + + if (Res) { + Res->setFastMathFlags(I.getFastMathFlags()); + return Res; + } + } + + // X / C => X * 1/C + if (Instruction *T = CvtFDivConstToReciprocal(Op0, Op1C, AllowReciprocal)) + return T; + + return 0; + } + + if (AllowReassociate && isa<ConstantFP>(Op0)) { + ConstantFP *C1 = cast<ConstantFP>(Op0), *C2; + Constant *Fold = 0; + Value *X; + bool CreateDiv = true; + + // C1 / (X*C2) => (C1/C2) / X + if (match(Op1, m_FMul(m_Value(X), m_ConstantFP(C2)))) + Fold = ConstantExpr::getFDiv(C1, C2); + else if (match(Op1, m_FDiv(m_Value(X), m_ConstantFP(C2)))) { + // C1 / (X/C2) => (C1*C2) / X + Fold = ConstantExpr::getFMul(C1, C2); + } else if (match(Op1, m_FDiv(m_ConstantFP(C2), m_Value(X)))) { + // C1 / (C2/X) => (C1/C2) * X + Fold = ConstantExpr::getFDiv(C1, C2); + CreateDiv = false; + } + + if (Fold) { + const APFloat &FoldC = cast<ConstantFP>(Fold)->getValueAPF(); + if (FoldC.isNormal() && !FoldC.isDenormal()) { + Instruction *R = CreateDiv ? 
+ BinaryOperator::CreateFDiv(Fold, X) : + BinaryOperator::CreateFMul(X, Fold); + R->setFastMathFlags(I.getFastMathFlags()); + return R; + } + } + return 0; + } + + if (AllowReassociate) { + Value *X, *Y; + Value *NewInst = 0; + Instruction *SimpR = 0; + + if (Op0->hasOneUse() && match(Op0, m_FDiv(m_Value(X), m_Value(Y)))) { + // (X/Y) / Z => X / (Y*Z) + // + if (!isa<ConstantFP>(Y) || !isa<ConstantFP>(Op1)) { + NewInst = Builder->CreateFMul(Y, Op1); + SimpR = BinaryOperator::CreateFDiv(X, NewInst); + } + } else if (Op1->hasOneUse() && match(Op1, m_FDiv(m_Value(X), m_Value(Y)))) { + // Z / (X/Y) => Z*Y / X + // + if (!isa<ConstantFP>(Y) || !isa<ConstantFP>(Op0)) { + NewInst = Builder->CreateFMul(Op0, Y); + SimpR = BinaryOperator::CreateFDiv(NewInst, X); + } + } + + if (NewInst) { + if (Instruction *T = dyn_cast<Instruction>(NewInst)) + T->setDebugLoc(I.getDebugLoc()); + SimpR->setFastMathFlags(I.getFastMathFlags()); + return SimpR; } } diff --git a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp index dd7ea14..4f71db1 100644 --- a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp +++ b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp @@ -13,7 +13,9 @@ //===----------------------------------------------------------------------===// #include "InstCombine.h" +#include "llvm/Support/PatternMatch.h" using namespace llvm; +using namespace PatternMatch; /// CheapToScalarize - Return true if the value is cheaper to scalarize than it /// is to leave as a vector operation. isConstant indicates whether we're @@ -92,6 +94,13 @@ static Value *FindScalarElement(Value *V, unsigned EltNo) { return FindScalarElement(SVI->getOperand(1), InEl - LHSWidth); } + // Extract a value from a vector add operation with a constant zero. + Value *Val = 0; Constant *Con = 0; + if (match(V, m_Add(m_Value(Val), m_Constant(Con)))) { + if (Con->getAggregateElement(EltNo)->isNullValue()) + return FindScalarElement(Val, EltNo); + } + // Otherwise, we don't know. return 0; } @@ -295,12 +304,12 @@ static Value *CollectShuffleElements(Value *V, SmallVectorImpl<Constant*> &Mask, Mask.assign(NumElts, UndefValue::get(Type::getInt32Ty(V->getContext()))); return V; } - + if (isa<ConstantAggregateZero>(V)) { Mask.assign(NumElts, ConstantInt::get(Type::getInt32Ty(V->getContext()),0)); return V; } - + if (InsertElementInst *IEI = dyn_cast<InsertElementInst>(V)) { // If this is an insert of an extract from some other vector, include it. Value *VecOp = IEI->getOperand(0); @@ -595,12 +604,12 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) { // ShuffleVectorInst is equivalent to the original one. for (unsigned i = 0; i < VWidth; ++i) { int eltMask; - if (Mask[i] == -1) { + if (Mask[i] < 0) { // This element is an undef value. eltMask = -1; } else if (Mask[i] < (int)LHSWidth) { // This element is from left hand side vector operand. - // + // // If LHS is going to be replaced (case 1, 2, or 4), calculate the // new mask value for the element. if (newLHS != LHS) { @@ -609,8 +618,7 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) { // with a -1 mask value. 
if (eltMask >= (int)LHSOp0Width && isa<UndefValue>(LHSOp1)) eltMask = -1; - } - else + } else eltMask = Mask[i]; } else { // This element is from right hand side vector operand @@ -630,8 +638,7 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) { && "should have been check above"); eltMask = -1; } - } - else + } else eltMask = Mask[i]-LHSWidth; // If LHS's width is changed, shift the mask value accordingly. diff --git a/lib/Transforms/InstCombine/InstCombineWorklist.h b/lib/Transforms/InstCombine/InstCombineWorklist.h index 57ed9e3..49efce5 100644 --- a/lib/Transforms/InstCombine/InstCombineWorklist.h +++ b/lib/Transforms/InstCombine/InstCombineWorklist.h @@ -19,20 +19,20 @@ #include "llvm/Support/raw_ostream.h" namespace llvm { - + /// InstCombineWorklist - This is the worklist management logic for /// InstCombine. class LLVM_LIBRARY_VISIBILITY InstCombineWorklist { SmallVector<Instruction*, 256> Worklist; DenseMap<Instruction*, unsigned> WorklistMap; - + void operator=(const InstCombineWorklist&RHS) LLVM_DELETED_FUNCTION; InstCombineWorklist(const InstCombineWorklist&) LLVM_DELETED_FUNCTION; public: InstCombineWorklist() {} - + bool isEmpty() const { return Worklist.empty(); } - + /// Add - Add the specified instruction to the worklist if it isn't already /// in it. void Add(Instruction *I) { @@ -41,12 +41,12 @@ public: Worklist.push_back(I); } } - + void AddValue(Value *V) { if (Instruction *I = dyn_cast<Instruction>(V)) Add(I); } - + /// AddInitialGroup - Add the specified batch of stuff in reverse order, /// which should only be done when the worklist is empty and when the group /// has no duplicates. @@ -61,25 +61,25 @@ public: Worklist.push_back(I); } } - + // Remove - remove I from the worklist if it exists. void Remove(Instruction *I) { DenseMap<Instruction*, unsigned>::iterator It = WorklistMap.find(I); if (It == WorklistMap.end()) return; // Not in worklist. - + // Don't bother moving everything down, just null out the slot. Worklist[It->second] = 0; - + WorklistMap.erase(It); } - + Instruction *RemoveOne() { Instruction *I = Worklist.back(); Worklist.pop_back(); WorklistMap.erase(I); return I; } - + /// AddUsersToWorkList - When an instruction is simplified, add all users of /// the instruction to the work lists because they might get more simplified /// now. @@ -89,18 +89,18 @@ public: UI != UE; ++UI) Add(cast<Instruction>(*UI)); } - - + + /// Zap - check that the worklist is empty and nuke the backing store for /// the map if it is large. void Zap() { assert(WorklistMap.empty() && "Worklist empty, but map not?"); - + // Do an explicit clear, this shrinks the map if needed. WorklistMap.clear(); } }; - + } // end namespace llvm. #endif diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp index 6f24cdd..c6115e3 100644 --- a/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -162,6 +162,21 @@ static bool MaintainNoSignedWrap(BinaryOperator &I, Value *B, Value *C) { return !Overflow; } +/// Conservatively clears subclassOptionalData after a reassociation or +/// commutation. Fast-math flags are preserved when applicable, since +/// reassociation does not invalidate them.
+static void ClearSubclassDataAfterReassociation(BinaryOperator &I) { + FPMathOperator *FPMO = dyn_cast<FPMathOperator>(&I); + if (!FPMO) { + I.clearSubclassOptionalData(); + return; + } + + FastMathFlags FMF = I.getFastMathFlags(); + I.clearSubclassOptionalData(); + I.setFastMathFlags(FMF); +} + /// SimplifyAssociativeOrCommutative - This performs a few simplifications for /// operators which are associative or commutative: // @@ -219,7 +234,7 @@ bool InstCombiner::SimplifyAssociativeOrCommutative(BinaryOperator &I) { I.clearSubclassOptionalData(); I.setHasNoSignedWrap(true); } else { - I.clearSubclassOptionalData(); + ClearSubclassDataAfterReassociation(I); } Changed = true; @@ -241,7 +256,7 @@ bool InstCombiner::SimplifyAssociativeOrCommutative(BinaryOperator &I) { I.setOperand(1, C); // Conservatively clear the optional flags, since they may not be // preserved by the reassociation. - I.clearSubclassOptionalData(); + ClearSubclassDataAfterReassociation(I); Changed = true; ++NumReassoc; continue; @@ -263,7 +278,7 @@ bool InstCombiner::SimplifyAssociativeOrCommutative(BinaryOperator &I) { I.setOperand(1, B); // Conservatively clear the optional flags, since they may not be // preserved by the reassociation. - I.clearSubclassOptionalData(); + ClearSubclassDataAfterReassociation(I); Changed = true; ++NumReassoc; continue; @@ -283,7 +298,7 @@ bool InstCombiner::SimplifyAssociativeOrCommutative(BinaryOperator &I) { I.setOperand(1, V); // Conservatively clear the optional flags, since they may not be // preserved by the reassociation. - I.clearSubclassOptionalData(); + ClearSubclassDataAfterReassociation(I); Changed = true; ++NumReassoc; continue; @@ -310,7 +325,7 @@ bool InstCombiner::SimplifyAssociativeOrCommutative(BinaryOperator &I) { I.setOperand(1, Folded); // Conservatively clear the optional flags, since they may not be // preserved by the reassociation. - I.clearSubclassOptionalData(); + ClearSubclassDataAfterReassociation(I); Changed = true; continue; @@ -516,8 +531,8 @@ Value *InstCombiner::dyn_castNegVal(Value *V) const { // instruction if the LHS is a constant negative zero (which is the 'negate' // form). // -Value *InstCombiner::dyn_castFNegVal(Value *V) const { - if (BinaryOperator::isFNeg(V)) +Value *InstCombiner::dyn_castFNegVal(Value *V, bool IgnoreZeroSign) const { + if (BinaryOperator::isFNeg(V, IgnoreZeroSign)) return BinaryOperator::getFNegArgument(V); // Constants can be considered to be negated values if they can be folded. 
diff --git a/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/lib/Transforms/Instrumentation/AddressSanitizer.cpp index 9bd3239..6877475 100644 --- a/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -16,7 +16,6 @@ #define DEBUG_TYPE "asan" #include "llvm/Transforms/Instrumentation.h" -#include "BlackList.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DepthFirstIterator.h" @@ -36,6 +35,7 @@ #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" #include "llvm/InstVisitor.h" +#include "llvm/Support/CallSite.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/DataTypes.h" #include "llvm/Support/Debug.h" @@ -43,6 +43,7 @@ #include "llvm/Support/system_error.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/BlackList.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/ModuleUtils.h" #include <algorithm> @@ -53,7 +54,8 @@ using namespace llvm; static const uint64_t kDefaultShadowScale = 3; static const uint64_t kDefaultShadowOffset32 = 1ULL << 29; static const uint64_t kDefaultShadowOffset64 = 1ULL << 44; -static const uint64_t kDefaultShadowOffsetAndroid = 0; +static const uint64_t kDefaultShort64bitShadowOffset = 0x7FFF8000; // < 2G. +static const uint64_t kPPC64_ShadowOffset64 = 1ULL << 41; static const size_t kMaxStackMallocSize = 1 << 16; // 64K static const uintptr_t kCurrentStackFrameMagic = 0x41B58AB3; @@ -63,11 +65,13 @@ static const char *kAsanModuleCtorName = "asan.module_ctor"; static const char *kAsanModuleDtorName = "asan.module_dtor"; static const int kAsanCtorAndCtorPriority = 1; static const char *kAsanReportErrorTemplate = "__asan_report_"; +static const char *kAsanReportLoadN = "__asan_report_load_n"; +static const char *kAsanReportStoreN = "__asan_report_store_n"; static const char *kAsanRegisterGlobalsName = "__asan_register_globals"; static const char *kAsanUnregisterGlobalsName = "__asan_unregister_globals"; static const char *kAsanPoisonGlobalsName = "__asan_before_dynamic_init"; static const char *kAsanUnpoisonGlobalsName = "__asan_after_dynamic_init"; -static const char *kAsanInitName = "__asan_init"; +static const char *kAsanInitName = "__asan_init_v1"; static const char *kAsanHandleNoReturnName = "__asan_handle_no_return"; static const char *kAsanMappingOffsetName = "__asan_mapping_offset"; static const char *kAsanMappingScaleName = "__asan_mapping_scale"; @@ -133,6 +137,9 @@ static cl::opt<int> ClMappingScale("asan-mapping-scale", cl::desc("scale of asan shadow mapping"), cl::Hidden, cl::init(0)); static cl::opt<int> ClMappingOffsetLog("asan-mapping-offset-log", cl::desc("offset of asan shadow mapping"), cl::Hidden, cl::init(-1)); +static cl::opt<bool> ClShort64BitOffset("asan-short-64bit-mapping-offset", + cl::desc("Use short immediate constant as the mapping offset for 64bit"), + cl::Hidden, cl::init(true)); // Optimization flags. Not user visible, used mostly for testing // and benchmarking the tool. @@ -186,14 +193,53 @@ class SetOfDynamicallyInitializedGlobals { SmallSet<GlobalValue*, 32> DynInitGlobals; }; -static int MappingScale() { - return ClMappingScale ? ClMappingScale : kDefaultShadowScale; +/// This struct defines the shadow mapping using the rule: +/// shadow = (mem >> Scale) ADD-or-OR Offset. 
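// Editor's sketch (not part of the patch): the mapping rule above,
// instantiated with the x86-64 defaults this change introduces (Scale = 3,
// Offset = kDefaultShort64bitShadowOffset = 0x7FFF8000). Every 8 application
// bytes share one shadow byte.
#include <cstdint>
static uint64_t memToShadowSketch(uint64_t Addr) {
  return (Addr >> 3) + 0x7FFF8000ULL;  // ADD form; the OR form is equivalent
                                       // only when the offset bits cannot
                                       // overlap the bits of (Addr >> 3)
}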
+struct ShadowMapping { + int Scale; + uint64_t Offset; + bool OrShadowOffset; +}; + +static ShadowMapping getShadowMapping(const Module &M, int LongSize, + bool ZeroBaseShadow) { + llvm::Triple TargetTriple(M.getTargetTriple()); + bool IsAndroid = TargetTriple.getEnvironment() == llvm::Triple::Android; + bool IsMacOSX = TargetTriple.getOS() == llvm::Triple::MacOSX; + bool IsPPC64 = TargetTriple.getArch() == llvm::Triple::ppc64; + bool IsX86_64 = TargetTriple.getArch() == llvm::Triple::x86_64; + + ShadowMapping Mapping; + + // OR-ing the shadow offset is more efficient (at least on x86), + // but on ppc64 we have to use add since the shadow offset is not necessarily + // 1/8-th of the address space. + Mapping.OrShadowOffset = !IsPPC64 && !ClShort64BitOffset; + + Mapping.Offset = (IsAndroid || ZeroBaseShadow) ? 0 : + (LongSize == 32 ? kDefaultShadowOffset32 : + IsPPC64 ? kPPC64_ShadowOffset64 : kDefaultShadowOffset64); + if (!ZeroBaseShadow && ClShort64BitOffset && IsX86_64 && !IsMacOSX) { + assert(LongSize == 64); + Mapping.Offset = kDefaultShort64bitShadowOffset; + } + if (!ZeroBaseShadow && ClMappingOffsetLog >= 0) { + // Zero offset log is the special case. + Mapping.Offset = (ClMappingOffsetLog == 0) ? 0 : 1ULL << ClMappingOffsetLog; + } + + Mapping.Scale = kDefaultShadowScale; + if (ClMappingScale) { + Mapping.Scale = ClMappingScale; + } + + return Mapping; } -static size_t RedzoneSize() { +static size_t RedzoneSizeForScale(int MappingScale) { // Redzone used for stack and globals is at least 32 bytes. // For scales 6 and 7, the redzone has to be 64 and 128 bytes respectively. - return std::max(32U, 1U << MappingScale()); + return std::max(32U, 1U << MappingScale); } /// AddressSanitizer: instrument the code in module to find memory bugs. @@ -201,23 +247,27 @@ struct AddressSanitizer : public FunctionPass { AddressSanitizer(bool CheckInitOrder = false, bool CheckUseAfterReturn = false, bool CheckLifetime = false, - StringRef BlacklistFile = StringRef()) + StringRef BlacklistFile = StringRef(), + bool ZeroBaseShadow = false) : FunctionPass(ID), CheckInitOrder(CheckInitOrder || ClInitializers), CheckUseAfterReturn(CheckUseAfterReturn || ClUseAfterReturn), CheckLifetime(CheckLifetime || ClCheckLifetime), BlacklistFile(BlacklistFile.empty() ?
ClBlacklistFile - : BlacklistFile) {} + : BlacklistFile), + ZeroBaseShadow(ZeroBaseShadow) {} virtual const char *getPassName() const { return "AddressSanitizerFunctionPass"; } void instrumentMop(Instruction *I); - void instrumentAddress(Instruction *OrigIns, IRBuilder<> &IRB, - Value *Addr, uint32_t TypeSize, bool IsWrite); + void instrumentAddress(Instruction *OrigIns, Instruction *InsertBefore, + Value *Addr, uint32_t TypeSize, bool IsWrite, + Value *SizeArgument); Value *createSlowPathCmp(IRBuilder<> &IRB, Value *AddrLong, Value *ShadowValue, uint32_t TypeSize); Instruction *generateCrashCode(Instruction *InsertBefore, Value *Addr, - bool IsWrite, size_t AccessSizeIndex); + bool IsWrite, size_t AccessSizeIndex, + Value *SizeArgument); bool instrumentMemIntrinsic(MemIntrinsic *MI); void instrumentMemIntrinsicParam(Instruction *OrigIns, Value *Addr, Value *Size, @@ -227,6 +277,7 @@ struct AddressSanitizer : public FunctionPass { void createInitializerPoisonCalls(Module &M, Value *FirstAddr, Value *LastAddr); bool maybeInsertAsanInitAtFunctionEntry(Function &F); + void emitShadowMapping(Module &M, IRBuilder<> &IRB) const; virtual bool doInitialization(Module &M); static char ID; // Pass identification, replacement for typeid @@ -240,18 +291,22 @@ struct AddressSanitizer : public FunctionPass { bool CheckInitOrder; bool CheckUseAfterReturn; bool CheckLifetime; + SmallString<64> BlacklistFile; + bool ZeroBaseShadow; + LLVMContext *C; DataLayout *TD; - uint64_t MappingOffset; int LongSize; Type *IntptrTy; + ShadowMapping Mapping; Function *AsanCtorFunction; Function *AsanInitFunction; Function *AsanHandleNoReturnFunc; - SmallString<64> BlacklistFile; OwningPtr<BlackList> BL; // This array is indexed by AccessIsWrite and log2(AccessSize). Function *AsanErrorCallback[2][kNumberOfAccessSizes]; + // This array is indexed by AccessIsWrite. + Function *AsanErrorCallbackSized[2]; InlineAsm *EmptyAsm; SetOfDynamicallyInitializedGlobals DynamicallyInitializedGlobals; @@ -261,11 +316,13 @@ struct AddressSanitizer : public FunctionPass { class AddressSanitizerModule : public ModulePass { public: AddressSanitizerModule(bool CheckInitOrder = false, - StringRef BlacklistFile = StringRef()) + StringRef BlacklistFile = StringRef(), + bool ZeroBaseShadow = false) : ModulePass(ID), CheckInitOrder(CheckInitOrder || ClInitializers), BlacklistFile(BlacklistFile.empty() ? 
ClBlacklistFile - : BlacklistFile) {} + : BlacklistFile), + ZeroBaseShadow(ZeroBaseShadow) {} bool runOnModule(Module &M); static char ID; // Pass identification, replacement for typeid virtual const char *getPassName() const { @@ -278,14 +335,20 @@ class AddressSanitizerModule : public ModulePass { bool ShouldInstrumentGlobal(GlobalVariable *G); void createInitializerPoisonCalls(Module &M, Value *FirstAddr, Value *LastAddr); + size_t RedzoneSize() const { + return RedzoneSizeForScale(Mapping.Scale); + } bool CheckInitOrder; SmallString<64> BlacklistFile; + bool ZeroBaseShadow; + OwningPtr<BlackList> BL; SetOfDynamicallyInitializedGlobals DynamicallyInitializedGlobals; Type *IntptrTy; LLVMContext *C; DataLayout *TD; + ShadowMapping Mapping; Function *AsanPoisonGlobals; Function *AsanUnpoisonGlobals; Function *AsanRegisterGlobals; @@ -308,6 +371,7 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> { LLVMContext *C; Type *IntptrTy; Type *IntptrPtrTy; + ShadowMapping Mapping; SmallVector<AllocaInst*, 16> AllocaVec; SmallVector<Instruction*, 8> RetVec; @@ -332,7 +396,8 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> { FunctionStackPoisoner(Function &F, AddressSanitizer &ASan) : F(F), ASan(ASan), DIB(*F.getParent()), C(ASan.C), IntptrTy(ASan.IntptrTy), IntptrPtrTy(PointerType::get(IntptrTy, 0)), - TotalStackSize(0), StackAlignment(1 << MappingScale()) {} + Mapping(ASan.Mapping), + TotalStackSize(0), StackAlignment(1 << Mapping.Scale) {} bool runOnFunction() { if (!ClStack) return false; @@ -411,6 +476,9 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> { AI.getAllocatedType()->isSized()); } + size_t RedzoneSize() const { + return RedzoneSizeForScale(Mapping.Scale); + } uint64_t getAllocaSizeInBytes(AllocaInst *AI) { Type *Ty = AI->getAllocatedType(); uint64_t SizeInBytes = ASan.TD->getTypeAllocSize(Ty); @@ -439,9 +507,9 @@ INITIALIZE_PASS(AddressSanitizer, "asan", false, false) FunctionPass *llvm::createAddressSanitizerFunctionPass( bool CheckInitOrder, bool CheckUseAfterReturn, bool CheckLifetime, - StringRef BlacklistFile) { + StringRef BlacklistFile, bool ZeroBaseShadow) { return new AddressSanitizer(CheckInitOrder, CheckUseAfterReturn, - CheckLifetime, BlacklistFile); + CheckLifetime, BlacklistFile, ZeroBaseShadow); } char AddressSanitizerModule::ID = 0; @@ -449,8 +517,9 @@ INITIALIZE_PASS(AddressSanitizerModule, "asan-module", "AddressSanitizer: detects use-after-free and out-of-bounds bugs." 
"ModulePass", false, false) ModulePass *llvm::createAddressSanitizerModulePass( - bool CheckInitOrder, StringRef BlacklistFile) { - return new AddressSanitizerModule(CheckInitOrder, BlacklistFile); + bool CheckInitOrder, StringRef BlacklistFile, bool ZeroBaseShadow) { + return new AddressSanitizerModule(CheckInitOrder, BlacklistFile, + ZeroBaseShadow); } static size_t TypeSizeToSizeIndex(uint32_t TypeSize) { @@ -473,32 +542,30 @@ static bool GlobalWasGeneratedByAsan(GlobalVariable *G) { Value *AddressSanitizer::memToShadow(Value *Shadow, IRBuilder<> &IRB) { // Shadow >> scale - Shadow = IRB.CreateLShr(Shadow, MappingScale()); - if (MappingOffset == 0) + Shadow = IRB.CreateLShr(Shadow, Mapping.Scale); + if (Mapping.Offset == 0) return Shadow; // (Shadow >> scale) | offset - return IRB.CreateOr(Shadow, ConstantInt::get(IntptrTy, - MappingOffset)); + if (Mapping.OrShadowOffset) + return IRB.CreateOr(Shadow, ConstantInt::get(IntptrTy, Mapping.Offset)); + else + return IRB.CreateAdd(Shadow, ConstantInt::get(IntptrTy, Mapping.Offset)); } void AddressSanitizer::instrumentMemIntrinsicParam( Instruction *OrigIns, Value *Addr, Value *Size, Instruction *InsertBefore, bool IsWrite) { + IRBuilder<> IRB(InsertBefore); + if (Size->getType() != IntptrTy) + Size = IRB.CreateIntCast(Size, IntptrTy, false); // Check the first byte. - { - IRBuilder<> IRB(InsertBefore); - instrumentAddress(OrigIns, IRB, Addr, 8, IsWrite); - } + instrumentAddress(OrigIns, InsertBefore, Addr, 8, IsWrite, Size); // Check the last byte. - { - IRBuilder<> IRB(InsertBefore); - Value *SizeMinusOne = IRB.CreateSub( - Size, ConstantInt::get(Size->getType(), 1)); - SizeMinusOne = IRB.CreateIntCast(SizeMinusOne, IntptrTy, false); - Value *AddrLong = IRB.CreatePointerCast(Addr, IntptrTy); - Value *AddrPlusSizeMinisOne = IRB.CreateAdd(AddrLong, SizeMinusOne); - instrumentAddress(OrigIns, IRB, AddrPlusSizeMinisOne, 8, IsWrite); - } + IRB.SetInsertPoint(InsertBefore); + Value *SizeMinusOne = IRB.CreateSub(Size, ConstantInt::get(IntptrTy, 1)); + Value *AddrLong = IRB.CreatePointerCast(Addr, IntptrTy); + Value *AddrLast = IRB.CreateAdd(AddrLong, SizeMinusOne); + instrumentAddress(OrigIns, InsertBefore, AddrLast, 8, IsWrite, Size); } // Instrument memset/memmove/memcpy @@ -577,14 +644,24 @@ void AddressSanitizer::instrumentMop(Instruction *I) { assert(OrigTy->isSized()); uint32_t TypeSize = TD->getTypeStoreSizeInBits(OrigTy); - if (TypeSize != 8 && TypeSize != 16 && - TypeSize != 32 && TypeSize != 64 && TypeSize != 128) { - // Ignore all unusual sizes. - return; - } + assert((TypeSize % 8) == 0); + // Instrument a 1-, 2-, 4-, 8-, or 16- byte access with one check. + if (TypeSize == 8 || TypeSize == 16 || + TypeSize == 32 || TypeSize == 64 || TypeSize == 128) + return instrumentAddress(I, I, Addr, TypeSize, IsWrite, 0); + // Instrument unusual size (but still multiple of 8). + // We can not do it with a single check, so we do 1-byte check for the first + // and the last bytes. We call __asan_report_*_n(addr, real_size) to be able + // to report the actual access size. 
IRBuilder<> IRB(I); - instrumentAddress(I, IRB, Addr, TypeSize, IsWrite); + Value *LastByte = IRB.CreateIntToPtr( + IRB.CreateAdd(IRB.CreatePointerCast(Addr, IntptrTy), + ConstantInt::get(IntptrTy, TypeSize / 8 - 1)), + OrigPtrTy); + Value *Size = ConstantInt::get(IntptrTy, TypeSize / 8); + instrumentAddress(I, I, Addr, 8, IsWrite, Size); + instrumentAddress(I, I, LastByte, 8, IsWrite, Size); } // Validate the result of Module::getOrInsertFunction called for an interface @@ -600,10 +677,12 @@ static Function *checkInterfaceFunction(Constant *FuncOrBitcast) { Instruction *AddressSanitizer::generateCrashCode( Instruction *InsertBefore, Value *Addr, - bool IsWrite, size_t AccessSizeIndex) { + bool IsWrite, size_t AccessSizeIndex, Value *SizeArgument) { IRBuilder<> IRB(InsertBefore); - CallInst *Call = IRB.CreateCall(AsanErrorCallback[IsWrite][AccessSizeIndex], - Addr); + CallInst *Call = SizeArgument + ? IRB.CreateCall2(AsanErrorCallbackSized[IsWrite], Addr, SizeArgument) + : IRB.CreateCall(AsanErrorCallback[IsWrite][AccessSizeIndex], Addr); + // We don't do Call->setDoesNotReturn() because the BB already has // UnreachableInst at the end. // This EmptyAsm is required to avoid callback merge. @@ -614,7 +693,7 @@ Instruction *AddressSanitizer::generateCrashCode( Value *AddressSanitizer::createSlowPathCmp(IRBuilder<> &IRB, Value *AddrLong, Value *ShadowValue, uint32_t TypeSize) { - size_t Granularity = 1 << MappingScale(); + size_t Granularity = 1 << Mapping.Scale; // Addr & (Granularity - 1) Value *LastAccessedByte = IRB.CreateAnd( AddrLong, ConstantInt::get(IntptrTy, Granularity - 1)); @@ -630,12 +709,14 @@ Value *AddressSanitizer::createSlowPathCmp(IRBuilder<> &IRB, Value *AddrLong, } void AddressSanitizer::instrumentAddress(Instruction *OrigIns, - IRBuilder<> &IRB, Value *Addr, - uint32_t TypeSize, bool IsWrite) { + Instruction *InsertBefore, + Value *Addr, uint32_t TypeSize, + bool IsWrite, Value *SizeArgument) { + IRBuilder<> IRB(InsertBefore); Value *AddrLong = IRB.CreatePointerCast(Addr, IntptrTy); Type *ShadowTy = IntegerType::get( - *C, std::max(8U, TypeSize >> MappingScale())); + *C, std::max(8U, TypeSize >> Mapping.Scale)); Type *ShadowPtrTy = PointerType::get(ShadowTy, 0); Value *ShadowPtr = memToShadow(AddrLong, IRB); Value *CmpVal = Constant::getNullValue(ShadowTy); @@ -644,7 +725,7 @@ void AddressSanitizer::instrumentAddress(Instruction *OrigIns, Value *Cmp = IRB.CreateICmpNE(ShadowValue, CmpVal); size_t AccessSizeIndex = TypeSizeToSizeIndex(TypeSize); - size_t Granularity = 1 << MappingScale(); + size_t Granularity = 1 << Mapping.Scale; TerminatorInst *CrashTerm = 0; if (ClAlwaysSlowPath || (TypeSize < 8 * Granularity)) { @@ -663,8 +744,8 @@ void AddressSanitizer::instrumentAddress(Instruction *OrigIns, CrashTerm = SplitBlockAndInsertIfThen(cast<Instruction>(Cmp), true); } - Instruction *Crash = - generateCrashCode(CrashTerm, AddrLong, IsWrite, AccessSizeIndex); + Instruction *Crash = generateCrashCode( + CrashTerm, AddrLong, IsWrite, AccessSizeIndex, SizeArgument); Crash->setDebugLoc(OrigIns->getDebugLoc()); } @@ -782,7 +863,9 @@ bool AddressSanitizerModule::runOnModule(Module &M) { BL.reset(new BlackList(BlacklistFile)); if (BL->isIn(M)) return false; C = &(M.getContext()); - IntptrTy = Type::getIntNTy(*C, TD->getPointerSizeInBits()); + int LongSize = TD->getPointerSizeInBits(); + IntptrTy = Type::getIntNTy(*C, LongSize); + Mapping = getShadowMapping(M, LongSize, ZeroBaseShadow); initializeCallbacks(M); DynamicallyInitializedGlobals.Init(M); @@ -819,12 +902,22 @@ bool 
AddressSanitizerModule::runOnModule(Module &M) { Value *FirstDynamic = 0, *LastDynamic = 0; for (size_t i = 0; i < n; i++) { + static const uint64_t kMaxGlobalRedzone = 1 << 18; GlobalVariable *G = GlobalsToChange[i]; PointerType *PtrTy = cast<PointerType>(G->getType()); Type *Ty = PtrTy->getElementType(); uint64_t SizeInBytes = TD->getTypeAllocSize(Ty); - size_t RZ = RedzoneSize(); - uint64_t RightRedzoneSize = RZ + (RZ - (SizeInBytes % RZ)); + uint64_t MinRZ = RedzoneSize(); + // MinRZ <= RZ <= kMaxGlobalRedzone + // and trying to make RZ to be ~ 1/4 of SizeInBytes. + uint64_t RZ = std::max(MinRZ, + std::min(kMaxGlobalRedzone, + (SizeInBytes / MinRZ / 4) * MinRZ)); + uint64_t RightRedzoneSize = RZ; + // Round up to MinRZ + if (SizeInBytes % MinRZ) + RightRedzoneSize += MinRZ - (SizeInBytes % MinRZ); + assert(((RightRedzoneSize + SizeInBytes) % MinRZ) == 0); Type *RightRedZoneTy = ArrayType::get(IRB.getInt8Ty(), RightRedzoneSize); // Determine whether this global should be poisoned in initialization. bool GlobalHasDynamicInitializer = @@ -848,7 +941,7 @@ bool AddressSanitizerModule::runOnModule(Module &M) { M, NewTy, G->isConstant(), G->getLinkage(), NewInitializer, "", G, G->getThreadLocalMode()); NewGlobal->copyAttributesFrom(G); - NewGlobal->setAlignment(RZ); + NewGlobal->setAlignment(MinRZ); Value *Indices2[2]; Indices2[0] = IRB.getInt32(0); @@ -921,6 +1014,10 @@ void AddressSanitizer::initializeCallbacks(Module &M) { FunctionName, IRB.getVoidTy(), IntptrTy, NULL)); } } + AsanErrorCallbackSized[0] = checkInterfaceFunction(M.getOrInsertFunction( + kAsanReportLoadN, IRB.getVoidTy(), IntptrTy, IntptrTy, NULL)); + AsanErrorCallbackSized[1] = checkInterfaceFunction(M.getOrInsertFunction( + kAsanReportStoreN, IRB.getVoidTy(), IntptrTy, IntptrTy, NULL)); AsanHandleNoReturnFunc = checkInterfaceFunction(M.getOrInsertFunction( kAsanHandleNoReturnName, IRB.getVoidTy(), NULL)); @@ -930,6 +1027,23 @@ void AddressSanitizer::initializeCallbacks(Module &M) { /*hasSideEffects=*/true); } +void AddressSanitizer::emitShadowMapping(Module &M, IRBuilder<> &IRB) const { + // Tell the values of mapping offset and scale to the run-time. + GlobalValue *asan_mapping_offset = + new GlobalVariable(M, IntptrTy, true, GlobalValue::LinkOnceODRLinkage, + ConstantInt::get(IntptrTy, Mapping.Offset), + kAsanMappingOffsetName); + // Read the global, otherwise it may be optimized away. + IRB.CreateLoad(asan_mapping_offset, true); + + GlobalValue *asan_mapping_scale = + new GlobalVariable(M, IntptrTy, true, GlobalValue::LinkOnceODRLinkage, + ConstantInt::get(IntptrTy, Mapping.Scale), + kAsanMappingScaleName); + // Read the global, otherwise it may be optimized away. + IRB.CreateLoad(asan_mapping_scale, true); +} + // virtual bool AddressSanitizer::doInitialization(Module &M) { // Initialize the private fields. No one has accessed them before. @@ -955,41 +1069,10 @@ bool AddressSanitizer::doInitialization(Module &M) { AsanInitFunction->setLinkage(Function::ExternalLinkage); IRB.CreateCall(AsanInitFunction); - llvm::Triple targetTriple(M.getTargetTriple()); - bool isAndroid = targetTriple.getEnvironment() == llvm::Triple::Android; - - MappingOffset = isAndroid ? kDefaultShadowOffsetAndroid : - (LongSize == 32 ? 
kDefaultShadowOffset32 : kDefaultShadowOffset64); - if (ClMappingOffsetLog >= 0) { - if (ClMappingOffsetLog == 0) { - // special case - MappingOffset = 0; - } else { - MappingOffset = 1ULL << ClMappingOffsetLog; - } - } - - - if (ClMappingOffsetLog >= 0) { - // Tell the run-time the current values of mapping offset and scale. - GlobalValue *asan_mapping_offset = - new GlobalVariable(M, IntptrTy, true, GlobalValue::LinkOnceODRLinkage, - ConstantInt::get(IntptrTy, MappingOffset), - kAsanMappingOffsetName); - // Read the global, otherwise it may be optimized away. - IRB.CreateLoad(asan_mapping_offset, true); - } - if (ClMappingScale) { - GlobalValue *asan_mapping_scale = - new GlobalVariable(M, IntptrTy, true, GlobalValue::LinkOnceODRLinkage, - ConstantInt::get(IntptrTy, MappingScale()), - kAsanMappingScaleName); - // Read the global, otherwise it may be optimized away. - IRB.CreateLoad(asan_mapping_scale, true); - } + Mapping = getShadowMapping(M, LongSize, ZeroBaseShadow); + emitShadowMapping(M, IRB); appendToGlobalCtors(M, AsanCtorFunction, kAsanCtorAndCtorPriority); - return true; } @@ -1015,11 +1098,11 @@ bool AddressSanitizer::runOnFunction(Function &F) { DEBUG(dbgs() << "ASAN instrumenting:\n" << F << "\n"); initializeCallbacks(*F.getParent()); - // If needed, insert __asan_init before checking for AddressSafety attr. + // If needed, insert __asan_init before checking for SanitizeAddress attr. maybeInsertAsanInitAtFunctionEntry(F); if (!F.getAttributes().hasAttribute(AttributeSet::FunctionIndex, - Attribute::AddressSafety)) + Attribute::SanitizeAddress)) return false; if (!ClDebugFunc.empty() && ClDebugFunc != F.getName()) @@ -1048,12 +1131,12 @@ bool AddressSanitizer::runOnFunction(Function &F) { } else if (isa<MemIntrinsic>(BI) && ClMemIntrin) { // ok, take it. } else { - if (CallInst *CI = dyn_cast<CallInst>(BI)) { + CallSite CS(BI); + if (CS) { // A call inside BB. TempsToInstrument.clear(); - if (CI->doesNotReturn()) { - NoReturnCalls.push_back(CI); - } + if (CS.doesNotReturn()) + NoReturnCalls.push_back(CS.getInstruction()); } continue; } @@ -1147,7 +1230,7 @@ void FunctionStackPoisoner::initializeCallbacks(Module &M) { void FunctionStackPoisoner::poisonRedZones( const ArrayRef<AllocaInst*> &AllocaVec, IRBuilder<> IRB, Value *ShadowBase, bool DoPoison) { - size_t ShadowRZSize = RedzoneSize() >> MappingScale(); + size_t ShadowRZSize = RedzoneSize() >> Mapping.Scale; assert(ShadowRZSize >= 1 && ShadowRZSize <= 4); Type *RZTy = Type::getIntNTy(*C, ShadowRZSize * 8); Type *RZPtrTy = PointerType::get(RZTy, 0); @@ -1178,13 +1261,13 @@ void FunctionStackPoisoner::poisonRedZones( // Poison the partial redzone at right Ptr = IRB.CreateAdd( ShadowBase, ConstantInt::get(IntptrTy, - (Pos >> MappingScale()) - ShadowRZSize)); + (Pos >> Mapping.Scale) - ShadowRZSize)); size_t AddressableBytes = RedzoneSize() - (AlignedSize - SizeInBytes); uint32_t Poison = 0; if (DoPoison) { PoisonShadowPartialRightRedzone((uint8_t*)&Poison, AddressableBytes, RedzoneSize(), - 1ULL << MappingScale(), + 1ULL << Mapping.Scale, kAsanStackPartialRedzoneMagic); } Value *PartialPoison = ConstantInt::get(RZTy, Poison); @@ -1193,7 +1276,7 @@ void FunctionStackPoisoner::poisonRedZones( // Poison the full redzone at right. Ptr = IRB.CreateAdd(ShadowBase, - ConstantInt::get(IntptrTy, Pos >> MappingScale())); + ConstantInt::get(IntptrTy, Pos >> Mapping.Scale)); bool LastAlloca = (i == AllocaVec.size() - 1); Value *Poison = LastAlloca ? 
PoisonRight : PoisonMid; IRB.CreateStore(Poison, IRB.CreateIntToPtr(Ptr, RZPtrTy)); diff --git a/lib/Transforms/Instrumentation/BlackList.cpp b/lib/Transforms/Instrumentation/BlackList.cpp index 4fcbea4..927982d 100644 --- a/lib/Transforms/Instrumentation/BlackList.cpp +++ b/lib/Transforms/Instrumentation/BlackList.cpp @@ -13,7 +13,7 @@ // //===----------------------------------------------------------------------===// -#include "BlackList.h" +#include "llvm/Transforms/Utils/BlackList.h" #include "llvm/ADT/OwningPtr.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" @@ -78,21 +78,21 @@ BlackList::BlackList(const StringRef Path) { } // Iterate through each of the prefixes, and create Regexs for them. - for (StringMap<std::string>::iterator I = Regexps.begin(), E = Regexps.end(); - I != E; ++I) { + for (StringMap<std::string>::const_iterator I = Regexps.begin(), + E = Regexps.end(); I != E; ++I) { Entries[I->getKey()] = new Regex(I->getValue()); } } -bool BlackList::isIn(const Function &F) { +bool BlackList::isIn(const Function &F) const { return isIn(*F.getParent()) || inSection("fun", F.getName()); } -bool BlackList::isIn(const GlobalVariable &G) { +bool BlackList::isIn(const GlobalVariable &G) const { return isIn(*G.getParent()) || inSection("global", G.getName()); } -bool BlackList::isIn(const Module &M) { +bool BlackList::isIn(const Module &M) const { return inSection("src", M.getModuleIdentifier()); } @@ -107,14 +107,15 @@ static StringRef GetGVTypeString(const GlobalVariable &G) { return "<unknown type>"; } -bool BlackList::isInInit(const GlobalVariable &G) { +bool BlackList::isInInit(const GlobalVariable &G) const { return (isIn(*G.getParent()) || inSection("global-init", G.getName()) || inSection("global-init-type", GetGVTypeString(G))); } -bool BlackList::inSection(const StringRef Section, const StringRef Query) { - StringMap<Regex*>::iterator I = Entries.find(Section); +bool BlackList::inSection(const StringRef Section, + const StringRef Query) const { + StringMap<Regex*>::const_iterator I = Entries.find(Section); if (I == Entries.end()) return false; Regex *FunctionRegex = I->getValue(); diff --git a/lib/Transforms/Instrumentation/EdgeProfiling.cpp b/lib/Transforms/Instrumentation/EdgeProfiling.cpp index 0b18b4c..a2459fb 100644 --- a/lib/Transforms/Instrumentation/EdgeProfiling.cpp +++ b/lib/Transforms/Instrumentation/EdgeProfiling.cpp @@ -21,7 +21,6 @@ #include "llvm/Transforms/Instrumentation.h" #include "ProfilingUtils.h" #include "llvm/ADT/Statistic.h" -#include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/Pass.h" #include "llvm/Support/raw_ostream.h" @@ -55,8 +54,8 @@ ModulePass *llvm::createEdgeProfilerPass() { return new EdgeProfiler(); } bool EdgeProfiler::runOnModule(Module &M) { Function *Main = M.getFunction("main"); if (Main == 0) { - M.getContext().emitWarning("cannot insert edge profiling into a module" - " with no main function"); + errs() << "WARNING: cannot insert edge profiling into a module" + << " with no main function!\n"; return false; // No main, no instrumentation! 
} diff --git a/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/lib/Transforms/Instrumentation/MemorySanitizer.cpp index 58d5801..80705af 100644 --- a/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -71,7 +71,6 @@ #define DEBUG_TYPE "msan" #include "llvm/Transforms/Instrumentation.h" -#include "BlackList.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/SmallVector.h" @@ -91,6 +90,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/BlackList.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/ModuleUtils.h" @@ -127,6 +127,10 @@ static cl::opt<bool> ClHandleICmp("msan-handle-icmp", cl::desc("propagate shadow through ICmpEQ and ICmpNE"), cl::Hidden, cl::init(true)); +static cl::opt<bool> ClHandleICmpExact("msan-handle-icmp-exact", + cl::desc("exact handling of relational integer ICmp"), + cl::Hidden, cl::init(false)); + static cl::opt<bool> ClStoreCleanOrigin("msan-store-clean-origin", cl::desc("store origin for clean (fully initialized) values"), cl::Hidden, cl::init(false)); @@ -361,6 +365,9 @@ bool MemorySanitizer::doInitialization(Module &M) { new GlobalVariable(M, IRB.getInt32Ty(), true, GlobalValue::WeakODRLinkage, IRB.getInt32(TrackOrigins), "__msan_track_origins"); + new GlobalVariable(M, IRB.getInt32Ty(), true, GlobalValue::WeakODRLinkage, + IRB.getInt32(ClKeepGoing), "__msan_keep_going"); + return true; } @@ -451,9 +458,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { IRB.CreateAlignedStore(Shadow, ShadowPtr, I.getAlignment()); DEBUG(dbgs() << " STORE: " << *NewSI << "\n"); (void)NewSI; - // If the store is volatile, add a check. - if (I.isVolatile()) - insertCheck(Val, &I); + if (ClCheckAccessAddress) insertCheck(Addr, &I); @@ -574,7 +579,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { if (IntegerType *IT = dyn_cast<IntegerType>(OrigTy)) return IT; if (VectorType *VT = dyn_cast<VectorType>(OrigTy)) { - uint32_t EltSize = MS.TD->getTypeStoreSizeInBits(VT->getElementType()); + uint32_t EltSize = MS.TD->getTypeSizeInBits(VT->getElementType()); return VectorType::get(IntegerType::get(*MS.C, EltSize), VT->getNumElements()); } @@ -586,7 +591,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { DEBUG(dbgs() << "getShadowTy: " << *ST << " ===> " << *Res << "\n"); return Res; } - uint32_t TypeSize = MS.TD->getTypeStoreSizeInBits(OrigTy); + uint32_t TypeSize = MS.TD->getTypeSizeInBits(OrigTy); return IntegerType::get(*MS.C, TypeSize); } @@ -847,7 +852,6 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { /// /// Stores the corresponding shadow and (optionally) origin. /// Optionally, checks that the store address is fully defined. - /// Volatile stores check that the value being stored is fully defined. void visitStoreInst(StoreInst &I) { StoreList.push_back(&I); } @@ -1127,10 +1131,13 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { Value *B = I.getOperand(1); Value *Sa = getShadow(A); Value *Sb = getShadow(B); - if (A->getType()->isPointerTy()) - A = IRB.CreatePointerCast(A, MS.IntptrTy); - if (B->getType()->isPointerTy()) - B = IRB.CreatePointerCast(B, MS.IntptrTy); + + // Get rid of pointers and vectors of pointers. 
+ // For ints (and vectors of ints), types of A and Sa match, + // and this is a no-op. + A = IRB.CreatePointerCast(A, Sa->getType()); + B = IRB.CreatePointerCast(B, Sb->getType()); + // A == B <==> (C = A^B) == 0 // A != B <==> (C = A^B) != 0 // Sc = Sa | Sb @@ -1152,6 +1159,73 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { setOriginForNaryOp(I); } + /// \brief Build the lowest possible value of V, taking into account V's + /// uninitialized bits. + Value *getLowestPossibleValue(IRBuilder<> &IRB, Value *A, Value *Sa, + bool isSigned) { + if (isSigned) { + // Split shadow into sign bit and other bits. + Value *SaOtherBits = IRB.CreateLShr(IRB.CreateShl(Sa, 1), 1); + Value *SaSignBit = IRB.CreateXor(Sa, SaOtherBits); + // Maximise the undefined shadow bit, minimize other undefined bits. + return + IRB.CreateOr(IRB.CreateAnd(A, IRB.CreateNot(SaOtherBits)), SaSignBit); + } else { + // Minimize undefined bits. + return IRB.CreateAnd(A, IRB.CreateNot(Sa)); + } + } + + /// \brief Build the highest possible value of V, taking into account V's + /// uninitialized bits. + Value *getHighestPossibleValue(IRBuilder<> &IRB, Value *A, Value *Sa, + bool isSigned) { + if (isSigned) { + // Split shadow into sign bit and other bits. + Value *SaOtherBits = IRB.CreateLShr(IRB.CreateShl(Sa, 1), 1); + Value *SaSignBit = IRB.CreateXor(Sa, SaOtherBits); + // Minimise the undefined shadow bit, maximise other undefined bits. + return + IRB.CreateOr(IRB.CreateAnd(A, IRB.CreateNot(SaSignBit)), SaOtherBits); + } else { + // Maximize undefined bits. + return IRB.CreateOr(A, Sa); + } + } + + /// \brief Instrument relational comparisons. + /// + /// This function does exact shadow propagation for all relational + /// comparisons of integers, pointers and vectors of those. + /// FIXME: output seems suboptimal when one of the operands is a constant + void handleRelationalComparisonExact(ICmpInst &I) { + IRBuilder<> IRB(&I); + Value *A = I.getOperand(0); + Value *B = I.getOperand(1); + Value *Sa = getShadow(A); + Value *Sb = getShadow(B); + + // Get rid of pointers and vectors of pointers. + // For ints (and vectors of ints), types of A and Sa match, + // and this is a no-op. + A = IRB.CreatePointerCast(A, Sa->getType()); + B = IRB.CreatePointerCast(B, Sb->getType()); + + // Let [a0, a1] be the interval of possible values of A, taking into account + // its undefined bits. Let [b0, b1] be the interval of possible values of B. + // Then (A cmp B) is defined iff (a0 cmp b1) == (a1 cmp b0). + bool IsSigned = I.isSigned(); + Value *S1 = IRB.CreateICmp(I.getPredicate(), + getLowestPossibleValue(IRB, A, Sa, IsSigned), + getHighestPossibleValue(IRB, B, Sb, IsSigned)); + Value *S2 = IRB.CreateICmp(I.getPredicate(), + getHighestPossibleValue(IRB, A, Sa, IsSigned), + getLowestPossibleValue(IRB, B, Sb, IsSigned)); + Value *Si = IRB.CreateXor(S1, S2); + setShadow(&I, Si); + setOriginForNaryOp(I); + } + /// \brief Instrument signed relational comparisons. 
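// Editor's illustration (not part of the patch): the interval rule used by
// handleRelationalComparisonExact above, specialized to unsigned 8-bit
// values. A bit set in the shadow Sa means the corresponding bit of A is
// uninitialized, so A can be anything in [A & ~Sa, A | Sa]; the comparison
// result is defined exactly when both endpoint comparisons agree.
#include <cstdint>
static bool ultResultIsDefined(uint8_t A, uint8_t Sa, uint8_t B, uint8_t Sb) {
  uint8_t A0 = A & ~Sa, A1 = A | Sa;  // lowest / highest possible A
  uint8_t B0 = B & ~Sb, B1 = B | Sb;  // lowest / highest possible B
  return (A0 < B1) == (A1 < B0);      // (a0 cmp b1) == (a1 cmp b0)
}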
/// /// Handle (x<0) and (x>=0) comparisons (essentially, sign bit tests) by @@ -1181,12 +1255,32 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { } void visitICmpInst(ICmpInst &I) { - if (ClHandleICmp && I.isEquality()) + if (!ClHandleICmp) { + handleShadowOr(I); + return; + } + if (I.isEquality()) { handleEqualityComparison(I); - else if (ClHandleICmp && I.isSigned() && I.isRelational()) + return; + } + + assert(I.isRelational()); + if (ClHandleICmpExact) { + handleRelationalComparisonExact(I); + return; + } + if (I.isSigned()) { handleSignedRelationalComparison(I); - else - handleShadowOr(I); + return; + } + + assert(I.isUnsigned()); + if ((isa<Constant>(I.getOperand(0)) || isa<Constant>(I.getOperand(1)))) { + handleRelationalComparisonExact(I); + return; + } + + handleShadowOr(I); } void visitFCmpInst(FCmpInst &I) { @@ -1458,8 +1552,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { AttrBuilder B; B.addAttribute(Attribute::ReadOnly) .addAttribute(Attribute::ReadNone); - Func->removeAttribute(AttributeSet::FunctionIndex, - Attribute::get(Func->getContext(), B)); + Func->removeAttributes(AttributeSet::FunctionIndex, + AttributeSet::get(Func->getContext(), + AttributeSet::FunctionIndex, + B)); } } IRBuilder<> IRB(&I); @@ -1498,6 +1594,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { if (MS.TrackOrigins) IRB.CreateStore(getOrigin(A), getOriginPtrForArgument(A, IRB, ArgOffset)); + (void)Store; assert(Size != 0 && Store != 0); DEBUG(dbgs() << " Param:" << *Store << "\n"); ArgOffset += DataLayout::RoundUpAlignment(Size, 8); @@ -1774,7 +1871,7 @@ struct VarArgAMD64Helper : public VarArgHelper { // Unpoison the whole __va_list_tag. // FIXME: magic ABI constants. IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()), - /* size */24, /* alignment */16, false); + /* size */24, /* alignment */8, false); } void visitVACopyInst(VACopyInst &I) { @@ -1785,7 +1882,7 @@ struct VarArgAMD64Helper : public VarArgHelper { // Unpoison the whole __va_list_tag. // FIXME: magic ABI constants. 
IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()), - /* size */ 24, /* alignment */ 16, false); + /* size */24, /* alignment */8, false); } void finalizeInstrumentation() { @@ -1850,8 +1947,9 @@ bool MemorySanitizer::runOnFunction(Function &F) { AttrBuilder B; B.addAttribute(Attribute::ReadOnly) .addAttribute(Attribute::ReadNone); - F.removeAttribute(AttributeSet::FunctionIndex, - Attribute::get(F.getContext(), B)); + F.removeAttributes(AttributeSet::FunctionIndex, + AttributeSet::get(F.getContext(), + AttributeSet::FunctionIndex, B)); return Visitor.runOnFunction(); } diff --git a/lib/Transforms/Instrumentation/OptimalEdgeProfiling.cpp b/lib/Transforms/Instrumentation/OptimalEdgeProfiling.cpp index c5a1fe9..b45aef6 100644 --- a/lib/Transforms/Instrumentation/OptimalEdgeProfiling.cpp +++ b/lib/Transforms/Instrumentation/OptimalEdgeProfiling.cpp @@ -22,7 +22,6 @@ #include "llvm/Analysis/ProfileInfo.h" #include "llvm/Analysis/ProfileInfoLoader.h" #include "llvm/IR/Constants.h" -#include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" @@ -76,8 +75,8 @@ inline static void printEdgeCounter(ProfileInfo::Edge e, bool OptimalEdgeProfiler::runOnModule(Module &M) { Function *Main = M.getFunction("main"); if (Main == 0) { - M.getContext().emitWarning("cannot insert edge profiling into a module" - " with no main function"); + errs() << "WARNING: cannot insert edge profiling into a module" + << " with no main function!\n"; return false; // No main, no instrumentation! } diff --git a/lib/Transforms/Instrumentation/PathProfiling.cpp b/lib/Transforms/Instrumentation/PathProfiling.cpp index 358bbeb..7de7326 100644 --- a/lib/Transforms/Instrumentation/PathProfiling.cpp +++ b/lib/Transforms/Instrumentation/PathProfiling.cpp @@ -1345,8 +1345,8 @@ bool PathProfiler::runOnModule(Module &M) { Main = M.getFunction("MAIN__"); if (!Main) { - Context->emitWarning("cannot insert edge profiling into a module" - " with no main function"); + errs() << "WARNING: cannot insert path profiling into a module" + << " with no main function!\n"; return false; } diff --git a/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/lib/Transforms/Instrumentation/ThreadSanitizer.cpp index 29d2ece..f93c5ab 100644 --- a/lib/Transforms/Instrumentation/ThreadSanitizer.cpp +++ b/lib/Transforms/Instrumentation/ThreadSanitizer.cpp @@ -22,7 +22,6 @@ #define DEBUG_TYPE "tsan" #include "llvm/Transforms/Instrumentation.h" -#include "BlackList.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/SmallVector.h" @@ -41,6 +40,7 @@ #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/BlackList.h" #include "llvm/Transforms/Utils/ModuleUtils.h" using namespace llvm; diff --git a/lib/Transforms/LLVMBuild.txt b/lib/Transforms/LLVMBuild.txt index f7bca06..15e9fba 100644 --- a/lib/Transforms/LLVMBuild.txt +++ b/lib/Transforms/LLVMBuild.txt @@ -16,7 +16,7 @@ ;===------------------------------------------------------------------------===; [common] -subdirectories = IPO InstCombine Instrumentation Scalar Utils Vectorize +subdirectories = IPO InstCombine Instrumentation Scalar Utils Vectorize ObjCARC [component_0] type = Group diff --git a/lib/Transforms/Makefile b/lib/Transforms/Makefile index 8b1df92..c390517 100644 --- a/lib/Transforms/Makefile +++ b/lib/Transforms/Makefile @@ -8,7 +8,7 @@ 
##===----------------------------------------------------------------------===## LEVEL = ../.. -PARALLEL_DIRS = Utils Instrumentation Scalar InstCombine IPO Vectorize Hello +PARALLEL_DIRS = Utils Instrumentation Scalar InstCombine IPO Vectorize Hello ObjCARC include $(LEVEL)/Makefile.config diff --git a/lib/Transforms/ObjCARC/CMakeLists.txt b/lib/Transforms/ObjCARC/CMakeLists.txt new file mode 100644 index 0000000..233deb3 --- /dev/null +++ b/lib/Transforms/ObjCARC/CMakeLists.txt @@ -0,0 +1,13 @@ +add_llvm_library(LLVMObjCARCOpts + ObjCARC.cpp + ObjCARCOpts.cpp + ObjCARCExpand.cpp + ObjCARCAPElim.cpp + ObjCARCAliasAnalysis.cpp + ObjCARCUtil.cpp + ObjCARCContract.cpp + DependencyAnalysis.cpp + ProvenanceAnalysis.cpp + ) + +add_dependencies(LLVMObjCARCOpts intrinsics_gen) diff --git a/lib/Transforms/ObjCARC/DependencyAnalysis.cpp b/lib/Transforms/ObjCARC/DependencyAnalysis.cpp new file mode 100644 index 0000000..5aada9c --- /dev/null +++ b/lib/Transforms/ObjCARC/DependencyAnalysis.cpp @@ -0,0 +1,261 @@ +//===- DependencyAnalysis.cpp - ObjC ARC Optimization ---------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// This file defines special dependency analysis routines used in Objective C +/// ARC Optimizations. +/// +/// WARNING: This file knows about certain library functions. It recognizes them +/// by name, and hardwires knowledge of their semantics. +/// +/// WARNING: This file knows about how certain Objective-C library functions are +/// used. Naive LLVM IR transformations which would otherwise be +/// behavior-preserving may break these assumptions. +/// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "objc-arc-dependency" +#include "ObjCARC.h" +#include "DependencyAnalysis.h" +#include "ProvenanceAnalysis.h" +#include "llvm/Support/CFG.h" + +using namespace llvm; +using namespace llvm::objcarc; + +/// Test whether the given instruction can result in a reference count +/// modification (positive or negative) for the pointer's object. +bool +llvm::objcarc::CanAlterRefCount(const Instruction *Inst, const Value *Ptr, + ProvenanceAnalysis &PA, + InstructionClass Class) { + switch (Class) { + case IC_Autorelease: + case IC_AutoreleaseRV: + case IC_User: + // These operations never directly modify a reference count. + return false; + default: break; + } + + ImmutableCallSite CS = static_cast<const Value *>(Inst); + assert(CS && "Only calls can alter reference counts!"); + + // See if AliasAnalysis can help us with the call. + AliasAnalysis::ModRefBehavior MRB = PA.getAA()->getModRefBehavior(CS); + if (AliasAnalysis::onlyReadsMemory(MRB)) + return false; + if (AliasAnalysis::onlyAccessesArgPointees(MRB)) { + for (ImmutableCallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end(); + I != E; ++I) { + const Value *Op = *I; + if (IsPotentialRetainableObjPtr(Op, *PA.getAA()) && PA.related(Ptr, Op)) + return true; + } + return false; + } + + // Assume the worst. + return true; +} + +/// Test whether the given instruction can "use" the given pointer's object in a +/// way that requires the reference count to be positive. 
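// A hypothetical client of CanAlterRefCount above, shown for illustration
// only (blockMayAlterRefCount is not part of this patch; it assumes this
// file's usual usings): scan a basic block for any operation that might
// touch Ptr's reference count.
static bool blockMayAlterRefCount(const BasicBlock &BB, const Value *Ptr,
                                  ProvenanceAnalysis &PA) {
  for (BasicBlock::const_iterator I = BB.begin(), E = BB.end(); I != E; ++I)
    if (CanAlterRefCount(&*I, Ptr, PA, GetInstructionClass(&*I)))
      return true;
  return false;
}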
+bool +llvm::objcarc::CanUse(const Instruction *Inst, const Value *Ptr, + ProvenanceAnalysis &PA, InstructionClass Class) { + // IC_Call operations (as opposed to IC_CallOrUser) never "use" objc pointers. + if (Class == IC_Call) + return false; + + // Consider various instructions which may have pointer arguments which are + // not "uses". + if (const ICmpInst *ICI = dyn_cast<ICmpInst>(Inst)) { + // Comparing a pointer with null, or any other constant, isn't really a use, + // because we don't care what the pointer points to, or about the values + // of any other dynamic reference-counted pointers. + if (!IsPotentialRetainableObjPtr(ICI->getOperand(1), *PA.getAA())) + return false; + } else if (ImmutableCallSite CS = static_cast<const Value *>(Inst)) { + // For calls, just check the arguments (and not the callee operand). + for (ImmutableCallSite::arg_iterator OI = CS.arg_begin(), + OE = CS.arg_end(); OI != OE; ++OI) { + const Value *Op = *OI; + if (IsPotentialRetainableObjPtr(Op, *PA.getAA()) && PA.related(Ptr, Op)) + return true; + } + return false; + } else if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) { + // Special-case stores, because we don't care about the stored value, just + // the store address. + const Value *Op = GetUnderlyingObjCPtr(SI->getPointerOperand()); + // If we can't tell what the underlying object was, assume there is a + // dependence. + return IsPotentialRetainableObjPtr(Op, *PA.getAA()) && PA.related(Op, Ptr); + } + + // Check each operand for a match. + for (User::const_op_iterator OI = Inst->op_begin(), OE = Inst->op_end(); + OI != OE; ++OI) { + const Value *Op = *OI; + if (IsPotentialRetainableObjPtr(Op, *PA.getAA()) && PA.related(Ptr, Op)) + return true; + } + return false; +} + +/// Test if there can be dependencies on Inst through Arg. This function only +/// tests dependencies relevant for removing pairs of calls. +bool +llvm::objcarc::Depends(DependenceKind Flavor, Instruction *Inst, + const Value *Arg, ProvenanceAnalysis &PA) { + // If we've reached the definition of Arg, stop. + if (Inst == Arg) + return true; + + switch (Flavor) { + case NeedsPositiveRetainCount: { + InstructionClass Class = GetInstructionClass(Inst); + switch (Class) { + case IC_AutoreleasepoolPop: + case IC_AutoreleasepoolPush: + case IC_None: + return false; + default: + return CanUse(Inst, Arg, PA, Class); + } + } + + case AutoreleasePoolBoundary: { + InstructionClass Class = GetInstructionClass(Inst); + switch (Class) { + case IC_AutoreleasepoolPop: + case IC_AutoreleasepoolPush: + // These mark the end and begin of an autorelease pool scope. + return true; + default: + // Nothing else does this. + return false; + } + } + + case CanChangeRetainCount: { + InstructionClass Class = GetInstructionClass(Inst); + switch (Class) { + case IC_AutoreleasepoolPop: + // Conservatively assume this can decrement any count. + return true; + case IC_AutoreleasepoolPush: + case IC_None: + return false; + default: + return CanAlterRefCount(Inst, Arg, PA, Class); + } + } + + case RetainAutoreleaseDep: + switch (GetBasicInstructionClass(Inst)) { + case IC_AutoreleasepoolPop: + case IC_AutoreleasepoolPush: + // Don't merge an objc_autorelease with an objc_retain inside a different + // autoreleasepool scope. + return true; + case IC_Retain: + case IC_RetainRV: + // Check for a retain of the same pointer for merging. + return GetObjCArg(Inst) == Arg; + default: + // Nothing else matters for objc_retainAutorelease formation. 
+ return false; + } + + case RetainAutoreleaseRVDep: { + InstructionClass Class = GetBasicInstructionClass(Inst); + switch (Class) { + case IC_Retain: + case IC_RetainRV: + // Check for a retain of the same pointer for merging. + return GetObjCArg(Inst) == Arg; + default: + // Anything that can autorelease interrupts + // retainAutoreleaseReturnValue formation. + return CanInterruptRV(Class); + } + } + + case RetainRVDep: + return CanInterruptRV(GetBasicInstructionClass(Inst)); + } + + llvm_unreachable("Invalid dependence flavor"); +} + +/// Walk up the CFG from StartPos (which is in StartBB) and find local and +/// non-local dependencies on Arg. +/// +/// TODO: Cache results? +void +llvm::objcarc::FindDependencies(DependenceKind Flavor, + const Value *Arg, + BasicBlock *StartBB, Instruction *StartInst, + SmallPtrSet<Instruction *, 4> &DependingInsts, + SmallPtrSet<const BasicBlock *, 4> &Visited, + ProvenanceAnalysis &PA) { + BasicBlock::iterator StartPos = StartInst; + + SmallVector<std::pair<BasicBlock *, BasicBlock::iterator>, 4> Worklist; + Worklist.push_back(std::make_pair(StartBB, StartPos)); + do { + std::pair<BasicBlock *, BasicBlock::iterator> Pair = + Worklist.pop_back_val(); + BasicBlock *LocalStartBB = Pair.first; + BasicBlock::iterator LocalStartPos = Pair.second; + BasicBlock::iterator StartBBBegin = LocalStartBB->begin(); + for (;;) { + if (LocalStartPos == StartBBBegin) { + pred_iterator PI(LocalStartBB), PE(LocalStartBB, false); + if (PI == PE) + // If we've reached the function entry, produce a null dependence. + DependingInsts.insert(0); + else + // Add the predecessors to the worklist. + do { + BasicBlock *PredBB = *PI; + if (Visited.insert(PredBB)) + Worklist.push_back(std::make_pair(PredBB, PredBB->end())); + } while (++PI != PE); + break; + } + + Instruction *Inst = --LocalStartPos; + if (Depends(Flavor, Inst, Arg, PA)) { + DependingInsts.insert(Inst); + break; + } + } + } while (!Worklist.empty()); + + // Determine whether the original StartBB post-dominates all of the blocks we + // visited. If not, insert a sentinel indicating that most optimizations are + // not safe. + for (SmallPtrSet<const BasicBlock *, 4>::const_iterator I = Visited.begin(), + E = Visited.end(); I != E; ++I) { + const BasicBlock *BB = *I; + if (BB == StartBB) + continue; + const TerminatorInst *TI = cast<TerminatorInst>(&BB->back()); + for (succ_const_iterator SI(TI), SE(TI, false); SI != SE; ++SI) { + const BasicBlock *Succ = *SI; + if (Succ != StartBB && !Visited.count(Succ)) { + DependingInsts.insert(reinterpret_cast<Instruction *>(-1)); + return; + } + } + } +} diff --git a/lib/Transforms/ObjCARC/DependencyAnalysis.h b/lib/Transforms/ObjCARC/DependencyAnalysis.h new file mode 100644 index 0000000..24d358b --- /dev/null +++ b/lib/Transforms/ObjCARC/DependencyAnalysis.h @@ -0,0 +1,79 @@ +//===- DependencyAnalysis.h - ObjC ARC Optimization ---*- mode: c++ -*-----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// This file declares special dependency analysis routines used in Objective C +/// ARC Optimizations. +/// +/// WARNING: This file knows about certain library functions. It recognizes them +/// by name, and hardwires knowledge of their semantics. +/// +/// WARNING: This file knows about how certain Objective-C library functions are +/// used.
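// Callers of FindDependencies above must understand two sentinels inserted
// into DependingInsts: a null Instruction* means the walk reached the
// function entry, and (Instruction*)-1 means StartBB does not post-dominate
// the visited blocks. A sketch of the caller-side contract, with a
// hypothetical helper name:
static bool foundSingleCleanDependency(SmallPtrSet<Instruction *, 4> &Deps) {
  return Deps.size() == 1 &&
         *Deps.begin() != 0 &&
         *Deps.begin() != reinterpret_cast<Instruction *>(-1);
}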
Naive LLVM IR transformations which would otherwise be +/// behavior-preserving may break these assumptions. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_OBJCARC_DEPEDENCYANALYSIS_H +#define LLVM_TRANSFORMS_OBJCARC_DEPEDENCYANALYSIS_H + +#include "llvm/ADT/SmallPtrSet.h" + +namespace llvm { + class BasicBlock; + class Instruction; + class Value; +} + +namespace llvm { +namespace objcarc { + +class ProvenanceAnalysis; + +/// \enum DependenceKind +/// \brief Defines different dependence kinds among various ARC constructs. +/// +/// There are several kinds of dependence-like concepts in use here. +/// +enum DependenceKind { + NeedsPositiveRetainCount, + AutoreleasePoolBoundary, + CanChangeRetainCount, + RetainAutoreleaseDep, ///< Blocks objc_retainAutorelease. + RetainAutoreleaseRVDep, ///< Blocks objc_retainAutoreleaseReturnValue. + RetainRVDep ///< Blocks objc_retainAutoreleasedReturnValue. +}; + +void FindDependencies(DependenceKind Flavor, + const Value *Arg, + BasicBlock *StartBB, Instruction *StartInst, + SmallPtrSet<Instruction *, 4> &DependingInstructions, + SmallPtrSet<const BasicBlock *, 4> &Visited, + ProvenanceAnalysis &PA); + +bool +Depends(DependenceKind Flavor, Instruction *Inst, const Value *Arg, + ProvenanceAnalysis &PA); + +/// Test whether the given instruction can "use" the given pointer's object in a +/// way that requires the reference count to be positive. +bool +CanUse(const Instruction *Inst, const Value *Ptr, ProvenanceAnalysis &PA, + InstructionClass Class); + +/// Test whether the given instruction can result in a reference count +/// modification (positive or negative) for the pointer's object. +bool +CanAlterRefCount(const Instruction *Inst, const Value *Ptr, + ProvenanceAnalysis &PA, InstructionClass Class); + +} // namespace objcarc +} // namespace llvm + +#endif // LLVM_TRANSFORMS_OBJCARC_DEPEDENCYANALYSIS_H diff --git a/lib/Transforms/ObjCARC/LLVMBuild.txt b/lib/Transforms/ObjCARC/LLVMBuild.txt new file mode 100644 index 0000000..90a2338 --- /dev/null +++ b/lib/Transforms/ObjCARC/LLVMBuild.txt @@ -0,0 +1,23 @@ +;===- ./lib/Transforms/ObjCARC/LLVMBuild.txt -------------------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. +; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = ObjCARC +parent = Transforms +library_name = ObjCARCOpts +required_libraries = Analysis Core Support TransformUtils diff --git a/lib/Transforms/ObjCARC/Makefile b/lib/Transforms/ObjCARC/Makefile new file mode 100644 index 0000000..2a34e21 --- /dev/null +++ b/lib/Transforms/ObjCARC/Makefile @@ -0,0 +1,15 @@ +##===- lib/Transforms/ObjCARC/Makefile ---------------------*- Makefile -*-===## +# +# The LLVM Compiler Infrastructure +# +# This file is distributed under the University of Illinois Open Source +# License. See LICENSE.TXT for details. +# +##===----------------------------------------------------------------------===## + +LEVEL = ../../.. 
+LIBRARYNAME = LLVMObjCARCOpts +BUILD_ARCHIVE = 1 + +include $(LEVEL)/Makefile.common + diff --git a/lib/Transforms/ObjCARC/ObjCARC.cpp b/lib/Transforms/ObjCARC/ObjCARC.cpp new file mode 100644 index 0000000..53a31b0 --- /dev/null +++ b/lib/Transforms/ObjCARC/ObjCARC.cpp @@ -0,0 +1,48 @@ +//===-- ObjCARC.cpp -------------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements common infrastructure for libLLVMObjCARCOpts.a, which +// implements several scalar transformations over the LLVM intermediate +// representation, including the C bindings for that library. +// +//===----------------------------------------------------------------------===// + +#include "ObjCARC.h" +#include "llvm-c/Core.h" +#include "llvm-c/Initialization.h" +#include "llvm/InitializePasses.h" +#include "llvm/Support/CommandLine.h" + +namespace llvm { + class PassRegistry; +} + +using namespace llvm; +using namespace llvm::objcarc; + +/// \brief A handy option to enable/disable all ARC Optimizations. +bool llvm::objcarc::EnableARCOpts; +static cl::opt<bool, true> +EnableARCOptimizations("enable-objc-arc-opts", + cl::location(EnableARCOpts), + cl::init(true)); + +/// initializeObjCARCOptsPasses - Initialize all passes linked into the +/// ObjCARCOpts library. +void llvm::initializeObjCARCOpts(PassRegistry &Registry) { + initializeObjCARCAliasAnalysisPass(Registry); + initializeObjCARCAPElimPass(Registry); + initializeObjCARCExpandPass(Registry); + initializeObjCARCContractPass(Registry); + initializeObjCARCOptPass(Registry); +} + +void LLVMInitializeObjCARCOpts(LLVMPassRegistryRef R) { + initializeObjCARCOpts(*unwrap(R)); +} diff --git a/lib/Transforms/ObjCARC/ObjCARC.h b/lib/Transforms/ObjCARC/ObjCARC.h new file mode 100644 index 0000000..e062b66 --- /dev/null +++ b/lib/Transforms/ObjCARC/ObjCARC.h @@ -0,0 +1,389 @@ +//===- ObjCARC.h - ObjC ARC Optimization --------------*- mode: c++ -*-----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This file defines common definitions/declarations used by the ObjC ARC +/// Optimizer. ARC stands for Automatic Reference Counting and is a system for +/// managing reference counts for objects in Objective C. +/// +/// WARNING: This file knows about certain library functions. It recognizes them +/// by name, and hardwires knowledge of their semantics. +/// +/// WARNING: This file knows about how certain Objective-C library functions are +/// used. Naive LLVM IR transformations which would otherwise be +/// behavior-preserving may break these assumptions. 
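// EnableARCOpts above demonstrates the cl::opt external-storage pattern: the
// flag's storage is a plain bool (via cl::location), so code that never sees
// CommandLine.h can still test it. A minimal sketch of the same pattern with
// hypothetical names:
static bool MyFeatureEnabled;
static llvm::cl::opt<bool, true> // 'true' selects external storage
MyFeatureOpt("enable-my-feature", llvm::cl::location(MyFeatureEnabled),
             llvm::cl::init(true));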
+/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_SCALAR_OBJCARC_H +#define LLVM_TRANSFORMS_SCALAR_OBJCARC_H + +#include "llvm/ADT/StringSwitch.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/Passes.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/Support/CallSite.h" +#include "llvm/Support/InstIterator.h" +#include "llvm/Transforms/ObjCARC.h" +#include "llvm/Transforms/Utils/Local.h" + +namespace llvm { +class raw_ostream; +} + +namespace llvm { +namespace objcarc { + +/// \brief A handy option to enable/disable all ARC Optimizations. +extern bool EnableARCOpts; + +/// \brief Test if the given module looks interesting to run ARC optimization +/// on. +static inline bool ModuleHasARC(const Module &M) { + return + M.getNamedValue("objc_retain") || + M.getNamedValue("objc_release") || + M.getNamedValue("objc_autorelease") || + M.getNamedValue("objc_retainAutoreleasedReturnValue") || + M.getNamedValue("objc_retainBlock") || + M.getNamedValue("objc_autoreleaseReturnValue") || + M.getNamedValue("objc_autoreleasePoolPush") || + M.getNamedValue("objc_loadWeakRetained") || + M.getNamedValue("objc_loadWeak") || + M.getNamedValue("objc_destroyWeak") || + M.getNamedValue("objc_storeWeak") || + M.getNamedValue("objc_initWeak") || + M.getNamedValue("objc_moveWeak") || + M.getNamedValue("objc_copyWeak") || + M.getNamedValue("objc_retainedObject") || + M.getNamedValue("objc_unretainedObject") || + M.getNamedValue("objc_unretainedPointer"); +} + +/// \enum InstructionClass +/// \brief A simple classification for instructions. +enum InstructionClass { + IC_Retain, ///< objc_retain + IC_RetainRV, ///< objc_retainAutoreleasedReturnValue + IC_RetainBlock, ///< objc_retainBlock + IC_Release, ///< objc_release + IC_Autorelease, ///< objc_autorelease + IC_AutoreleaseRV, ///< objc_autoreleaseReturnValue + IC_AutoreleasepoolPush, ///< objc_autoreleasePoolPush + IC_AutoreleasepoolPop, ///< objc_autoreleasePoolPop + IC_NoopCast, ///< objc_retainedObject, etc. + IC_FusedRetainAutorelease, ///< objc_retainAutorelease + IC_FusedRetainAutoreleaseRV, ///< objc_retainAutoreleaseReturnValue + IC_LoadWeakRetained, ///< objc_loadWeakRetained (primitive) + IC_StoreWeak, ///< objc_storeWeak (primitive) + IC_InitWeak, ///< objc_initWeak (derived) + IC_LoadWeak, ///< objc_loadWeak (derived) + IC_MoveWeak, ///< objc_moveWeak (derived) + IC_CopyWeak, ///< objc_copyWeak (derived) + IC_DestroyWeak, ///< objc_destroyWeak (derived) + IC_StoreStrong, ///< objc_storeStrong (derived) + IC_CallOrUser, ///< could call objc_release and/or "use" pointers + IC_Call, ///< could call objc_release + IC_User, ///< could "use" a pointer + IC_None ///< anything else +}; + +raw_ostream &operator<<(raw_ostream &OS, const InstructionClass Class); + +/// \brief Test if the given class is objc_retain or equivalent. +static inline bool IsRetain(InstructionClass Class) { + return Class == IC_Retain || + Class == IC_RetainRV; +} + +/// \brief Test if the given class is objc_autorelease or equivalent. +static inline bool IsAutorelease(InstructionClass Class) { + return Class == IC_Autorelease || + Class == IC_AutoreleaseRV; +} + +/// \brief Test if the given class represents instructions which return their +/// argument verbatim. 
+static inline bool IsForwarding(InstructionClass Class) { + // objc_retainBlock technically doesn't always return its argument + // verbatim, but it doesn't matter for our purposes here. + return Class == IC_Retain || + Class == IC_RetainRV || + Class == IC_Autorelease || + Class == IC_AutoreleaseRV || + Class == IC_RetainBlock || + Class == IC_NoopCast; +} + +/// \brief Test if the given class represents instructions which do nothing if +/// passed a null pointer. +static inline bool IsNoopOnNull(InstructionClass Class) { + return Class == IC_Retain || + Class == IC_RetainRV || + Class == IC_Release || + Class == IC_Autorelease || + Class == IC_AutoreleaseRV || + Class == IC_RetainBlock; +} + +/// \brief Test if the given class represents instructions which are always safe +/// to mark with the "tail" keyword. +static inline bool IsAlwaysTail(InstructionClass Class) { + // IC_RetainBlock may be given a stack argument. + return Class == IC_Retain || + Class == IC_RetainRV || + Class == IC_AutoreleaseRV; +} + +/// \brief Test if the given class represents instructions which are never safe +/// to mark with the "tail" keyword. +static inline bool IsNeverTail(InstructionClass Class) { + /// It is never safe to tail call objc_autorelease since by tail calling + /// objc_autorelease, we also tail call -[NSObject autorelease] which supports + /// fast autoreleasing causing our object to be potentially reclaimed from the + /// autorelease pool which violates the semantics of __autoreleasing types in + /// ARC. + return Class == IC_Autorelease; +} + +/// \brief Test if the given class represents instructions which are always safe +/// to mark with the nounwind attribute. +static inline bool IsNoThrow(InstructionClass Class) { + // objc_retainBlock is not nounwind because it calls user copy constructors + // which could theoretically throw. + return Class == IC_Retain || + Class == IC_RetainRV || + Class == IC_Release || + Class == IC_Autorelease || + Class == IC_AutoreleaseRV || + Class == IC_AutoreleasepoolPush || + Class == IC_AutoreleasepoolPop; +} + +/// Test whether the given instruction can autorelease any pointer or cause an +/// autoreleasepool pop. +static inline bool +CanInterruptRV(InstructionClass Class) { + switch (Class) { + case IC_AutoreleasepoolPop: + case IC_CallOrUser: + case IC_Call: + case IC_Autorelease: + case IC_AutoreleaseRV: + case IC_FusedRetainAutorelease: + case IC_FusedRetainAutoreleaseRV: + return true; + default: + return false; + } +} + +/// \brief Determine if F is one of the special known Functions. If it isn't, +/// return IC_CallOrUser. +InstructionClass GetFunctionClass(const Function *F); + +/// \brief Determine which objc runtime call instruction class V belongs to. +/// +/// This is similar to GetInstructionClass except that it only detects objc +/// runtime calls. This allows it to be faster. +/// +static inline InstructionClass GetBasicInstructionClass(const Value *V) { + if (const CallInst *CI = dyn_cast<CallInst>(V)) { + if (const Function *F = CI->getCalledFunction()) + return GetFunctionClass(F); + // Otherwise, be conservative. + return IC_CallOrUser; + } + + // Otherwise, be conservative. + return isa<InvokeInst>(V) ? IC_CallOrUser : IC_User; +} + +/// \brief Determine what kind of construct V is. +InstructionClass GetInstructionClass(const Value *V); + +/// \brief This is a wrapper around getUnderlyingObject which also knows how to +/// look through objc_retain and objc_autorelease calls, which we know to return +/// their argument verbatim. 
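// How the tail-call predicates above combine in practice: a sketch of a
// fix-up step over a runtime call (adjustTailMarker is hypothetical and
// assumes this header's context):
static void adjustTailMarker(llvm::CallInst *CI) {
  InstructionClass Class = GetBasicInstructionClass(CI);
  if (IsAlwaysTail(Class))
    CI->setTailCall(true);   // e.g. objc_retain, objc_autoreleaseReturnValue
  else if (IsNeverTail(Class))
    CI->setTailCall(false);  // objc_autorelease must never be a tail call
}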
+static inline const Value *GetUnderlyingObjCPtr(const Value *V) { + for (;;) { + V = GetUnderlyingObject(V); + if (!IsForwarding(GetBasicInstructionClass(V))) + break; + V = cast<CallInst>(V)->getArgOperand(0); + } + + return V; +} + +/// \brief This is a wrapper around Value::stripPointerCasts which also knows +/// how to look through objc_retain and objc_autorelease calls, which we know to +/// return their argument verbatim. +static inline const Value *StripPointerCastsAndObjCCalls(const Value *V) { + for (;;) { + V = V->stripPointerCasts(); + if (!IsForwarding(GetBasicInstructionClass(V))) + break; + V = cast<CallInst>(V)->getArgOperand(0); + } + return V; +} + +/// \brief This is a wrapper around Value::stripPointerCasts which also knows +/// how to look through objc_retain and objc_autorelease calls, which we know to +/// return their argument verbatim. +static inline Value *StripPointerCastsAndObjCCalls(Value *V) { + for (;;) { + V = V->stripPointerCasts(); + if (!IsForwarding(GetBasicInstructionClass(V))) + break; + V = cast<CallInst>(V)->getArgOperand(0); + } + return V; +} + +/// \brief Assuming the given instruction is one of the special calls such as +/// objc_retain or objc_release, return the argument value, stripped of no-op +/// casts and forwarding calls. +static inline Value *GetObjCArg(Value *Inst) { + return StripPointerCastsAndObjCCalls(cast<CallInst>(Inst)->getArgOperand(0)); +} + +static inline bool isNullOrUndef(const Value *V) { + return isa<ConstantPointerNull>(V) || isa<UndefValue>(V); +} + +static inline bool isNoopInstruction(const Instruction *I) { + return isa<BitCastInst>(I) || + (isa<GetElementPtrInst>(I) && + cast<GetElementPtrInst>(I)->hasAllZeroIndices()); +} + + +/// \brief Erase the given instruction. +/// +/// Many ObjC calls return their argument verbatim, +/// so if it's such a call and the return value has users, replace them with the +/// argument value. +/// +static inline void EraseInstruction(Instruction *CI) { + Value *OldArg = cast<CallInst>(CI)->getArgOperand(0); + + bool Unused = CI->use_empty(); + + if (!Unused) { + // Replace the return value with the argument. + assert(IsForwarding(GetBasicInstructionClass(CI)) && + "Can't delete non-forwarding instruction with users!"); + CI->replaceAllUsesWith(OldArg); + } + + CI->eraseFromParent(); + + if (Unused) + RecursivelyDeleteTriviallyDeadInstructions(OldArg); +} + +/// \brief Test whether the given value is possibly a retainable object pointer. +static inline bool IsPotentialRetainableObjPtr(const Value *Op) { + // Pointers to static or stack storage are not valid retainable object + // pointers. + if (isa<Constant>(Op) || isa<AllocaInst>(Op)) + return false; + // Special arguments cannot be valid retainable object pointers. + if (const Argument *Arg = dyn_cast<Argument>(Op)) + if (Arg->hasByValAttr() || + Arg->hasNestAttr() || + Arg->hasStructRetAttr()) + return false; + // Only consider values with pointer types. + // + // It seems intuitive to exclude function pointer types as well, since + // functions are never retainable object pointers, however clang occasionally + // bitcasts retainable object pointers to function-pointer type temporarily. + PointerType *Ty = dyn_cast<PointerType>(Op->getType()); + if (!Ty) + return false; + // Conservatively assume anything else is a potential retainable object + // pointer. + return true; +} + +static inline bool IsPotentialRetainableObjPtr(const Value *Op, + AliasAnalysis &AA) { + // First make the rudimentary check.
+ if (!IsPotentialRetainableObjPtr(Op)) + return false; + + // Objects in constant memory are not reference-counted. + if (AA.pointsToConstantMemory(Op)) + return false; + + // Pointers in constant memory are not pointing to reference-counted objects. + if (const LoadInst *LI = dyn_cast<LoadInst>(Op)) + if (AA.pointsToConstantMemory(LI->getPointerOperand())) + return false; + + // Otherwise assume the worst. + return true; +} + +/// \brief Helper for GetInstructionClass. Determines what kind of construct CS +/// is. +static inline InstructionClass GetCallSiteClass(ImmutableCallSite CS) { + for (ImmutableCallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end(); + I != E; ++I) + if (IsPotentialRetainableObjPtr(*I)) + return CS.onlyReadsMemory() ? IC_User : IC_CallOrUser; + + return CS.onlyReadsMemory() ? IC_None : IC_Call; +} + +/// \brief Return true if this value refers to a distinct and identifiable +/// object. +/// +/// This is similar to AliasAnalysis's isIdentifiedObject, except that it uses +/// special knowledge of ObjC conventions. +static inline bool IsObjCIdentifiedObject(const Value *V) { + // Assume that call results and arguments have their own "provenance". + // Constants (including GlobalVariables) and Allocas are never + // reference-counted. + if (isa<CallInst>(V) || isa<InvokeInst>(V) || + isa<Argument>(V) || isa<Constant>(V) || + isa<AllocaInst>(V)) + return true; + + if (const LoadInst *LI = dyn_cast<LoadInst>(V)) { + const Value *Pointer = + StripPointerCastsAndObjCCalls(LI->getPointerOperand()); + if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(Pointer)) { + // A constant pointer can't be pointing to an object on the heap. It may + // be reference-counted, but it won't be deleted. + if (GV->isConstant()) + return true; + StringRef Name = GV->getName(); + // These special variables are known to hold values which are not + // reference-counted pointers. + if (Name.startswith("\01L_OBJC_SELECTOR_REFERENCES_") || + Name.startswith("\01L_OBJC_CLASSLIST_REFERENCES_") || + Name.startswith("\01L_OBJC_CLASSLIST_SUP_REFS_$_") || + Name.startswith("\01L_OBJC_METH_VAR_NAME_") || + Name.startswith("\01l_objc_msgSend_fixup_")) + return true; + } + } + + return false; +} + +} // end namespace objcarc +} // end namespace llvm + +#endif // LLVM_TRANSFORMS_SCALAR_OBJCARC_H diff --git a/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp b/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp new file mode 100644 index 0000000..00d9864 --- /dev/null +++ b/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp @@ -0,0 +1,175 @@ +//===- ObjCARCAPElim.cpp - ObjC ARC Optimization --------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// This file defines ObjC ARC optimizations. ARC stands for Automatic +/// Reference Counting and is a system for managing reference counts for objects +/// in Objective C. +/// +/// This specific file implements optimizations which remove extraneous +/// autorelease pools. +/// +/// WARNING: This file knows about certain library functions. It recognizes them +/// by name, and hardwires knowledge of their semantics. +/// +/// WARNING: This file knows about how certain Objective-C library functions are +/// used. Naive LLVM IR transformations which would otherwise be +/// behavior-preserving may break these assumptions. 
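// The last few helpers in ObjCARC.h feed provenance reasoning: two values
// that are each a distinct identified object cannot be the same ObjC object.
// A sketch of that use, assuming the helpers above (provablyDistinctObjects
// itself is hypothetical):
static bool provablyDistinctObjects(const llvm::Value *A, const llvm::Value *B) {
  using namespace llvm::objcarc;
  A = StripPointerCastsAndObjCCalls(A);
  B = StripPointerCastsAndObjCCalls(B);
  return A != B && IsObjCIdentifiedObject(A) && IsObjCIdentifiedObject(B);
}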
+/// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "objc-arc-ap-elim" +#include "ObjCARC.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/IR/Constants.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; +using namespace llvm::objcarc; + +namespace { + /// \brief Autorelease pool elimination. + class ObjCARCAPElim : public ModulePass { + virtual void getAnalysisUsage(AnalysisUsage &AU) const; + virtual bool runOnModule(Module &M); + + static bool MayAutorelease(ImmutableCallSite CS, unsigned Depth = 0); + static bool OptimizeBB(BasicBlock *BB); + + public: + static char ID; + ObjCARCAPElim() : ModulePass(ID) { + initializeObjCARCAPElimPass(*PassRegistry::getPassRegistry()); + } + }; +} + +char ObjCARCAPElim::ID = 0; +INITIALIZE_PASS(ObjCARCAPElim, + "objc-arc-apelim", + "ObjC ARC autorelease pool elimination", + false, false) + +Pass *llvm::createObjCARCAPElimPass() { + return new ObjCARCAPElim(); +} + +void ObjCARCAPElim::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesCFG(); +} + +/// Interprocedurally determine if calls made by the given call site can +/// possibly produce autoreleases. +bool ObjCARCAPElim::MayAutorelease(ImmutableCallSite CS, unsigned Depth) { + if (const Function *Callee = CS.getCalledFunction()) { + if (Callee->isDeclaration() || Callee->mayBeOverridden()) + return true; + for (Function::const_iterator I = Callee->begin(), E = Callee->end(); + I != E; ++I) { + const BasicBlock *BB = I; + for (BasicBlock::const_iterator J = BB->begin(), F = BB->end(); + J != F; ++J) + if (ImmutableCallSite JCS = ImmutableCallSite(J)) + // This recursion depth limit is arbitrary. It's just great + // enough to cover known interesting testcases. + if (Depth < 3 && + !JCS.onlyReadsMemory() && + MayAutorelease(JCS, Depth + 1)) + return true; + } + return false; + } + + return true; +} + +bool ObjCARCAPElim::OptimizeBB(BasicBlock *BB) { + bool Changed = false; + + Instruction *Push = 0; + for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ) { + Instruction *Inst = I++; + switch (GetBasicInstructionClass(Inst)) { + case IC_AutoreleasepoolPush: + Push = Inst; + break; + case IC_AutoreleasepoolPop: + // If this pop matches a push and nothing in between can autorelease, + // zap the pair. + if (Push && cast<CallInst>(Inst)->getArgOperand(0) == Push) { + Changed = true; + DEBUG(dbgs() << "ObjCARCAPElim::OptimizeBB: Zapping push pop " + "autorelease pair:\n" + " Pop: " << *Inst << "\n" + << " Push: " << *Push << "\n"); + Inst->eraseFromParent(); + Push->eraseFromParent(); + } + Push = 0; + break; + case IC_CallOrUser: + if (MayAutorelease(ImmutableCallSite(Inst))) + Push = 0; + break; + default: + break; + } + } + + return Changed; +} + +bool ObjCARCAPElim::runOnModule(Module &M) { + if (!EnableARCOpts) + return false; + + // If nothing in the Module uses ARC, don't do anything. + if (!ModuleHasARC(M)) + return false; + + // Find the llvm.global_ctors variable, as the first step in + // identifying the global constructors. In theory, unnecessary autorelease + // pools could occur anywhere, but in practice it's pretty rare. Global + // ctors are a place where autorelease pools get inserted automatically, + // so it's pretty common for them to be unnecessary, and it's pretty + // profitable to eliminate them. 
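// For reference, llvm.global_ctors is an appending global whose elements are
// { priority, function } pairs; the loop below walks those pairs and takes
// operand 1 of each struct. IR shape (sketch; @init is an arbitrary example
// constructor):
//
//   @llvm.global_ctors = appending global [1 x { i32, void ()* }]
//                        [{ i32, void ()* } { i32 65535, void ()* @init }]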
+ GlobalVariable *GV = M.getGlobalVariable("llvm.global_ctors"); + if (!GV) + return false; + + assert(GV->hasDefinitiveInitializer() && + "llvm.global_ctors is uncooperative!"); + + bool Changed = false; + + // Dig the constructor functions out of GV's initializer. + ConstantArray *Init = cast<ConstantArray>(GV->getInitializer()); + for (User::op_iterator OI = Init->op_begin(), OE = Init->op_end(); + OI != OE; ++OI) { + Value *Op = *OI; + // llvm.global_ctors is an array of pairs where the second members + // are constructor functions. + Function *F = dyn_cast<Function>(cast<ConstantStruct>(Op)->getOperand(1)); + // If the user used a constructor function with the wrong signature and + // it got bitcasted or whatever, look the other way. + if (!F) + continue; + // Only look at function definitions. + if (F->isDeclaration()) + continue; + // Only look at functions with one basic block. + if (llvm::next(F->begin()) != F->end()) + continue; + // Ok, a single-block constructor function definition. Try to optimize it. + Changed |= OptimizeBB(F->begin()); + } + + return Changed; +} diff --git a/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.cpp b/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.cpp new file mode 100644 index 0000000..46b2de7 --- /dev/null +++ b/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.cpp @@ -0,0 +1,162 @@ +//===- ObjCARCAliasAnalysis.cpp - ObjC ARC Optimization -*- mode: c++ -*---===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This file defines a simple ARC-aware AliasAnalysis using special knowledge +/// of Objective C to enhance other optimization passes which rely on the Alias +/// Analysis infrastructure. +/// +/// WARNING: This file knows about certain library functions. It recognizes them +/// by name, and hardwires knowledge of their semantics. +/// +/// WARNING: This file knows about how certain Objective-C library functions are +/// used. Naive LLVM IR transformations which would otherwise be +/// behavior-preserving may break these assumptions. +/// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "objc-arc-aa" +#include "ObjCARC.h" +#include "ObjCARCAliasAnalysis.h" +#include "llvm/IR/Instruction.h" +#include "llvm/InitializePasses.h" +#include "llvm/PassAnalysisSupport.h" +#include "llvm/PassSupport.h" + +namespace llvm { + class Function; + class Value; +} + +using namespace llvm; +using namespace llvm::objcarc; + +// Register this pass... +char ObjCARCAliasAnalysis::ID = 0; +INITIALIZE_AG_PASS(ObjCARCAliasAnalysis, AliasAnalysis, "objc-arc-aa", + "ObjC-ARC-Based Alias Analysis", false, true, false) + +ImmutablePass *llvm::createObjCARCAliasAnalysisPass() { + return new ObjCARCAliasAnalysis(); +} + +void +ObjCARCAliasAnalysis::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesAll(); + AliasAnalysis::getAnalysisUsage(AU); +} + +AliasAnalysis::AliasResult +ObjCARCAliasAnalysis::alias(const Location &LocA, const Location &LocB) { + if (!EnableARCOpts) + return AliasAnalysis::alias(LocA, LocB); + + // First, strip off no-ops, including ObjC-specific no-ops, and try making a + // precise alias query. 
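// A worked example of the two-level query strategy used below: given
//
//   %s = call i8* @objc_retain(i8* %x)
//
// StripPointerCastsAndObjCCalls(%s) yields %x, so an alias query on (%s, %x)
// becomes the precise query (%x, %x), which can return MustAlias where the
// underlying analysis alone would have reported MayAlias.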
+ const Value *SA = StripPointerCastsAndObjCCalls(LocA.Ptr); + const Value *SB = StripPointerCastsAndObjCCalls(LocB.Ptr); + AliasResult Result = + AliasAnalysis::alias(Location(SA, LocA.Size, LocA.TBAATag), + Location(SB, LocB.Size, LocB.TBAATag)); + if (Result != MayAlias) + return Result; + + // If that failed, climb to the underlying object, including climbing through + // ObjC-specific no-ops, and try making an imprecise alias query. + const Value *UA = GetUnderlyingObjCPtr(SA); + const Value *UB = GetUnderlyingObjCPtr(SB); + if (UA != SA || UB != SB) { + Result = AliasAnalysis::alias(Location(UA), Location(UB)); + // We can't use MustAlias or PartialAlias results here because + // GetUnderlyingObjCPtr may return an offsetted pointer value. + if (Result == NoAlias) + return NoAlias; + } + + // If that failed, fail. We don't need to chain here, since that's covered + // by the earlier precise query. + return MayAlias; +} + +bool +ObjCARCAliasAnalysis::pointsToConstantMemory(const Location &Loc, + bool OrLocal) { + if (!EnableARCOpts) + return AliasAnalysis::pointsToConstantMemory(Loc, OrLocal); + + // First, strip off no-ops, including ObjC-specific no-ops, and try making + // a precise alias query. + const Value *S = StripPointerCastsAndObjCCalls(Loc.Ptr); + if (AliasAnalysis::pointsToConstantMemory(Location(S, Loc.Size, Loc.TBAATag), + OrLocal)) + return true; + + // If that failed, climb to the underlying object, including climbing through + // ObjC-specific no-ops, and try making an imprecise alias query. + const Value *U = GetUnderlyingObjCPtr(S); + if (U != S) + return AliasAnalysis::pointsToConstantMemory(Location(U), OrLocal); + + // If that failed, fail. We don't need to chain here, since that's covered + // by the earlier precise query. + return false; +} + +AliasAnalysis::ModRefBehavior +ObjCARCAliasAnalysis::getModRefBehavior(ImmutableCallSite CS) { + // We have nothing to do. Just chain to the next AliasAnalysis. + return AliasAnalysis::getModRefBehavior(CS); +} + +AliasAnalysis::ModRefBehavior +ObjCARCAliasAnalysis::getModRefBehavior(const Function *F) { + if (!EnableARCOpts) + return AliasAnalysis::getModRefBehavior(F); + + switch (GetFunctionClass(F)) { + case IC_NoopCast: + return DoesNotAccessMemory; + default: + break; + } + + return AliasAnalysis::getModRefBehavior(F); +} + +AliasAnalysis::ModRefResult +ObjCARCAliasAnalysis::getModRefInfo(ImmutableCallSite CS, const Location &Loc) { + if (!EnableARCOpts) + return AliasAnalysis::getModRefInfo(CS, Loc); + + switch (GetBasicInstructionClass(CS.getInstruction())) { + case IC_Retain: + case IC_RetainRV: + case IC_Autorelease: + case IC_AutoreleaseRV: + case IC_NoopCast: + case IC_AutoreleasepoolPush: + case IC_FusedRetainAutorelease: + case IC_FusedRetainAutoreleaseRV: + // These functions don't access any memory visible to the compiler. + // Note that this doesn't include objc_retainBlock, because it updates + // pointers when it copies block data. + return NoModRef; + default: + break; + } + + return AliasAnalysis::getModRefInfo(CS, Loc); +} + +AliasAnalysis::ModRefResult +ObjCARCAliasAnalysis::getModRefInfo(ImmutableCallSite CS1, + ImmutableCallSite CS2) { + // TODO: Theoretically we could check for dependencies between objc_* calls + // and OnlyAccessesArgumentPointees calls or other well-behaved calls. 
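// Practical effect of the NoModRef cases above (sketch): clients that ask
// about mod/ref, such as redundant-load elimination, may now forward loads
// across ARC bookkeeping calls:
//
//   %v1 = load i8** %p
//   call i8* @objc_retain(i8* %x)   ; NoModRef with respect to %p
//   %v2 = load i8** %p              ; may be replaced by %v1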
+ return AliasAnalysis::getModRefInfo(CS1, CS2); +} diff --git a/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.h b/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.h new file mode 100644 index 0000000..7abe995 --- /dev/null +++ b/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.h @@ -0,0 +1,74 @@ +//===- ObjCARCAliasAnalysis.h - ObjC ARC Optimization -*- mode: c++ -*-----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This file declares a simple ARC-aware AliasAnalysis using special knowledge +/// of Objective C to enhance other optimization passes which rely on the Alias +/// Analysis infrastructure. +/// +/// WARNING: This file knows about certain library functions. It recognizes them +/// by name, and hardwires knowledge of their semantics. +/// +/// WARNING: This file knows about how certain Objective-C library functions are +/// used. Naive LLVM IR transformations which would otherwise be +/// behavior-preserving may break these assumptions. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_OBJCARC_OBJCARCALIASANALYSIS_H +#define LLVM_TRANSFORMS_OBJCARC_OBJCARCALIASANALYSIS_H + +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Pass.h" + +namespace llvm { +namespace objcarc { + + /// \brief This is a simple alias analysis implementation that uses knowledge + /// of ARC constructs to answer queries. + /// + /// TODO: This class could be generalized to know about other ObjC-specific + /// tricks. Such as knowing that ivars in the non-fragile ABI are non-aliasing + /// even though their offsets are dynamic. + class ObjCARCAliasAnalysis : public ImmutablePass, + public AliasAnalysis { + public: + static char ID; // Class identification, replacement for typeinfo + ObjCARCAliasAnalysis() : ImmutablePass(ID) { + initializeObjCARCAliasAnalysisPass(*PassRegistry::getPassRegistry()); + } + + private: + virtual void initializePass() { + InitializeAliasAnalysis(this); + } + + /// This method is used when a pass implements an analysis interface through + /// multiple inheritance. If needed, it should override this to adjust the + /// this pointer as needed for the specified pass info. 
+ virtual void *getAdjustedAnalysisPointer(const void *PI) { + if (PI == &AliasAnalysis::ID) + return static_cast<AliasAnalysis *>(this); + return this; + } + + virtual void getAnalysisUsage(AnalysisUsage &AU) const; + virtual AliasResult alias(const Location &LocA, const Location &LocB); + virtual bool pointsToConstantMemory(const Location &Loc, bool OrLocal); + virtual ModRefBehavior getModRefBehavior(ImmutableCallSite CS); + virtual ModRefBehavior getModRefBehavior(const Function *F); + virtual ModRefResult getModRefInfo(ImmutableCallSite CS, + const Location &Loc); + virtual ModRefResult getModRefInfo(ImmutableCallSite CS1, + ImmutableCallSite CS2); + }; + +} // namespace objcarc +} // namespace llvm + +#endif // LLVM_TRANSFORMS_OBJCARC_OBJCARCALIASANALYSIS_H diff --git a/lib/Transforms/ObjCARC/ObjCARCContract.cpp b/lib/Transforms/ObjCARC/ObjCARCContract.cpp new file mode 100644 index 0000000..1c13d1c --- /dev/null +++ b/lib/Transforms/ObjCARC/ObjCARCContract.cpp @@ -0,0 +1,537 @@ +//===- ObjCARCContract.cpp - ObjC ARC Optimization ------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This file defines late ObjC ARC optimizations. ARC stands for Automatic +/// Reference Counting and is a system for managing reference counts for objects +/// in Objective C. +/// +/// This specific file mainly deals with ``contracting'' multiple lower level +/// operations into singular higher level operations through pattern matching. +/// +/// WARNING: This file knows about certain library functions. It recognizes them +/// by name, and hardwires knowledge of their semantics. +/// +/// WARNING: This file knows about how certain Objective-C library functions are +/// used. Naive LLVM IR transformations which would otherwise be +/// behavior-preserving may break these assumptions. +/// +//===----------------------------------------------------------------------===// + +// TODO: ObjCARCContract could insert PHI nodes when uses aren't +// dominated by single calls. + +#define DEBUG_TYPE "objc-arc-contract" +#include "ObjCARC.h" +#include "DependencyAnalysis.h" +#include "ProvenanceAnalysis.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/Dominators.h" +#include "llvm/IR/InlineAsm.h" +#include "llvm/IR/Operator.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; +using namespace llvm::objcarc; + +STATISTIC(NumPeeps, "Number of calls peephole-optimized"); +STATISTIC(NumStoreStrongs, "Number objc_storeStrong calls formed"); + +namespace { + /// \brief Late ARC optimizations + /// + /// These change the IR in a way that makes it difficult to be analyzed by + /// ObjCARCOpt, so it's run late. + class ObjCARCContract : public FunctionPass { + bool Changed; + AliasAnalysis *AA; + DominatorTree *DT; + ProvenanceAnalysis PA; + + /// A flag indicating whether this optimization pass should run. + bool Run; + + /// Declarations for ObjC runtime functions, for use in creating calls to + /// them. These are initialized lazily to avoid cluttering up the Module + /// with unused declarations. + + /// Declaration for objc_storeStrong(). + Constant *StoreStrongCallee; + /// Declaration for objc_retainAutorelease(). + Constant *RetainAutoreleaseCallee; + /// Declaration for objc_retainAutoreleaseReturnValue(). 
+ Constant *RetainAutoreleaseRVCallee; + + /// The inline asm string to insert between calls and RetainRV calls to make + /// the optimization work on targets which need it. + const MDString *RetainRVMarker; + + /// The set of inserted objc_storeStrong calls. If at the end of walking the + /// function we have found no alloca instructions, these calls can be marked + /// "tail". + SmallPtrSet<CallInst *, 8> StoreStrongCalls; + + Constant *getStoreStrongCallee(Module *M); + Constant *getRetainAutoreleaseCallee(Module *M); + Constant *getRetainAutoreleaseRVCallee(Module *M); + + bool ContractAutorelease(Function &F, Instruction *Autorelease, + InstructionClass Class, + SmallPtrSet<Instruction *, 4> + &DependingInstructions, + SmallPtrSet<const BasicBlock *, 4> + &Visited); + + void ContractRelease(Instruction *Release, + inst_iterator &Iter); + + virtual void getAnalysisUsage(AnalysisUsage &AU) const; + virtual bool doInitialization(Module &M); + virtual bool runOnFunction(Function &F); + + public: + static char ID; + ObjCARCContract() : FunctionPass(ID) { + initializeObjCARCContractPass(*PassRegistry::getPassRegistry()); + } + }; +} + +char ObjCARCContract::ID = 0; +INITIALIZE_PASS_BEGIN(ObjCARCContract, + "objc-arc-contract", "ObjC ARC contraction", false, false) +INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(DominatorTree) +INITIALIZE_PASS_END(ObjCARCContract, + "objc-arc-contract", "ObjC ARC contraction", false, false) + +Pass *llvm::createObjCARCContractPass() { + return new ObjCARCContract(); +} + +void ObjCARCContract::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<AliasAnalysis>(); + AU.addRequired<DominatorTree>(); + AU.setPreservesCFG(); +} + +Constant *ObjCARCContract::getStoreStrongCallee(Module *M) { + if (!StoreStrongCallee) { + LLVMContext &C = M->getContext(); + Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C)); + Type *I8XX = PointerType::getUnqual(I8X); + Type *Params[] = { I8XX, I8X }; + + AttributeSet Attr = AttributeSet() + .addAttribute(M->getContext(), AttributeSet::FunctionIndex, + Attribute::NoUnwind) + .addAttribute(M->getContext(), 1, Attribute::NoCapture); + + StoreStrongCallee = + M->getOrInsertFunction( + "objc_storeStrong", + FunctionType::get(Type::getVoidTy(C), Params, /*isVarArg=*/false), + Attr); + } + return StoreStrongCallee; +} + +Constant *ObjCARCContract::getRetainAutoreleaseCallee(Module *M) { + if (!RetainAutoreleaseCallee) { + LLVMContext &C = M->getContext(); + Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C)); + Type *Params[] = { I8X }; + FunctionType *FTy = FunctionType::get(I8X, Params, /*isVarArg=*/false); + AttributeSet Attribute = + AttributeSet().addAttribute(M->getContext(), AttributeSet::FunctionIndex, + Attribute::NoUnwind); + RetainAutoreleaseCallee = + M->getOrInsertFunction("objc_retainAutorelease", FTy, Attribute); + } + return RetainAutoreleaseCallee; +} + +Constant *ObjCARCContract::getRetainAutoreleaseRVCallee(Module *M) { + if (!RetainAutoreleaseRVCallee) { + LLVMContext &C = M->getContext(); + Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C)); + Type *Params[] = { I8X }; + FunctionType *FTy = FunctionType::get(I8X, Params, /*isVarArg=*/false); + AttributeSet Attribute = + AttributeSet().addAttribute(M->getContext(), AttributeSet::FunctionIndex, + Attribute::NoUnwind); + RetainAutoreleaseRVCallee = + M->getOrInsertFunction("objc_retainAutoreleaseReturnValue", FTy, + Attribute); + } + return RetainAutoreleaseRVCallee; +} + +/// Merge an autorelease with a retain into a fused call. 
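// The three callee getters above share one memoization pattern: declare the
// runtime function on first use and cache the declaration in a member.
// Distilled into a generic sketch (getOrCreateCallee is not in the patch):
static llvm::Constant *getOrCreateCallee(llvm::Module *M, llvm::Constant *&Slot,
                                         llvm::StringRef Name,
                                         llvm::FunctionType *FTy,
                                         llvm::AttributeSet Attrs) {
  if (!Slot)
    Slot = M->getOrInsertFunction(Name, FTy, Attrs);
  return Slot;
}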
+bool +ObjCARCContract::ContractAutorelease(Function &F, Instruction *Autorelease, + InstructionClass Class, + SmallPtrSet<Instruction *, 4> + &DependingInstructions, + SmallPtrSet<const BasicBlock *, 4> + &Visited) { + const Value *Arg = GetObjCArg(Autorelease); + + // Check that there are no instructions between the retain and the autorelease + // (such as an autorelease_pop) which may change the count. + CallInst *Retain = 0; + if (Class == IC_AutoreleaseRV) + FindDependencies(RetainAutoreleaseRVDep, Arg, + Autorelease->getParent(), Autorelease, + DependingInstructions, Visited, PA); + else + FindDependencies(RetainAutoreleaseDep, Arg, + Autorelease->getParent(), Autorelease, + DependingInstructions, Visited, PA); + + Visited.clear(); + if (DependingInstructions.size() != 1) { + DependingInstructions.clear(); + return false; + } + + Retain = dyn_cast_or_null<CallInst>(*DependingInstructions.begin()); + DependingInstructions.clear(); + + if (!Retain || + GetBasicInstructionClass(Retain) != IC_Retain || + GetObjCArg(Retain) != Arg) + return false; + + Changed = true; + ++NumPeeps; + + DEBUG(dbgs() << "ObjCARCContract::ContractAutorelease: Fusing " + "retain/autorelease. Erasing: " << *Autorelease << "\n" + " Old Retain: " + << *Retain << "\n"); + + if (Class == IC_AutoreleaseRV) + Retain->setCalledFunction(getRetainAutoreleaseRVCallee(F.getParent())); + else + Retain->setCalledFunction(getRetainAutoreleaseCallee(F.getParent())); + + DEBUG(dbgs() << " New Retain: " + << *Retain << "\n"); + + EraseInstruction(Autorelease); + return true; +} + +/// Attempt to merge an objc_release with a store, load, and objc_retain to form +/// an objc_storeStrong. This can be a little tricky because the instructions +/// don't always appear in order, and there may be unrelated intervening +/// instructions. +void ObjCARCContract::ContractRelease(Instruction *Release, + inst_iterator &Iter) { + LoadInst *Load = dyn_cast<LoadInst>(GetObjCArg(Release)); + if (!Load || !Load->isSimple()) return; + + // For now, require everything to be in one basic block. + BasicBlock *BB = Release->getParent(); + if (Load->getParent() != BB) return; + + // Walk down to find the store and the release, which may be in either order. + BasicBlock::iterator I = Load, End = BB->end(); + ++I; + AliasAnalysis::Location Loc = AA->getLocation(Load); + StoreInst *Store = 0; + bool SawRelease = false; + for (; !Store || !SawRelease; ++I) { + if (I == End) + return; + + Instruction *Inst = I; + if (Inst == Release) { + SawRelease = true; + continue; + } + + InstructionClass Class = GetBasicInstructionClass(Inst); + + // Unrelated retains are harmless. + if (IsRetain(Class)) + continue; + + if (Store) { + // The store is the point where we're going to put the objc_storeStrong, + // so make sure there are no uses after it. + if (CanUse(Inst, Load, PA, Class)) + return; + } else if (AA->getModRefInfo(Inst, Loc) & AliasAnalysis::Mod) { + // We are moving the load down to the store, so check for anything + // else which writes to the memory between the load and the store. + Store = dyn_cast<StoreInst>(Inst); + if (!Store || !Store->isSimple()) return; + if (Store->getPointerOperand() != Loc.Ptr) return; + } + } + + Value *New = StripPointerCastsAndObjCCalls(Store->getValueOperand()); + + // Walk up to find the retain. 
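// For orientation, the complete rewrite that ContractRelease builds toward
// (IR sketch; as noted above, the matched instructions need not appear in
// this exact order):
//
//   %old = load i8** %p
//   call void @objc_release(i8* %old)
//   %v = call i8* @objc_retain(i8* %x)
//   store i8* %x, i8** %p
// =>
//   call void @objc_storeStrong(i8** %p, i8* %x)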
+ I = Store; + BasicBlock::iterator Begin = BB->begin(); + while (I != Begin && GetBasicInstructionClass(I) != IC_Retain) + --I; + Instruction *Retain = I; + if (GetBasicInstructionClass(Retain) != IC_Retain) return; + if (GetObjCArg(Retain) != New) return; + + Changed = true; + ++NumStoreStrongs; + + LLVMContext &C = Release->getContext(); + Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C)); + Type *I8XX = PointerType::getUnqual(I8X); + + Value *Args[] = { Load->getPointerOperand(), New }; + if (Args[0]->getType() != I8XX) + Args[0] = new BitCastInst(Args[0], I8XX, "", Store); + if (Args[1]->getType() != I8X) + Args[1] = new BitCastInst(Args[1], I8X, "", Store); + CallInst *StoreStrong = + CallInst::Create(getStoreStrongCallee(BB->getParent()->getParent()), + Args, "", Store); + StoreStrong->setDoesNotThrow(); + StoreStrong->setDebugLoc(Store->getDebugLoc()); + + // We can't set the tail flag yet, because we haven't yet determined + // whether there are any escaping allocas. Remember this call, so that + // we can set the tail flag once we know it's safe. + StoreStrongCalls.insert(StoreStrong); + + if (&*Iter == Store) ++Iter; + Store->eraseFromParent(); + Release->eraseFromParent(); + EraseInstruction(Retain); + if (Load->use_empty()) + Load->eraseFromParent(); +} + +bool ObjCARCContract::doInitialization(Module &M) { + // If nothing in the Module uses ARC, don't do anything. + Run = ModuleHasARC(M); + if (!Run) + return false; + + // These are initialized lazily. + StoreStrongCallee = 0; + RetainAutoreleaseCallee = 0; + RetainAutoreleaseRVCallee = 0; + + // Initialize RetainRVMarker. + RetainRVMarker = 0; + if (NamedMDNode *NMD = + M.getNamedMetadata("clang.arc.retainAutoreleasedReturnValueMarker")) + if (NMD->getNumOperands() == 1) { + const MDNode *N = NMD->getOperand(0); + if (N->getNumOperands() == 1) + if (const MDString *S = dyn_cast<MDString>(N->getOperand(0))) + RetainRVMarker = S; + } + + return false; +} + +bool ObjCARCContract::runOnFunction(Function &F) { + if (!EnableARCOpts) + return false; + + // If nothing in the Module uses ARC, don't do anything. + if (!Run) + return false; + + Changed = false; + AA = &getAnalysis<AliasAnalysis>(); + DT = &getAnalysis<DominatorTree>(); + + PA.setAA(&getAnalysis<AliasAnalysis>()); + + // Track whether it's ok to mark objc_storeStrong calls with the "tail" + // keyword. Be conservative if the function has variadic arguments. + // It seems that functions which "return twice" are also unsafe for the + // "tail" argument, because they are setjmp, which could need to + // return to an earlier stack state. + bool TailOkForStoreStrongs = !F.isVarArg() && + !F.callsFunctionThatReturnsTwice(); + + // For ObjC library calls which return their argument, replace uses of the + // argument with uses of the call return value, if it dominates the use. This + // reduces register pressure. + SmallPtrSet<Instruction *, 4> DependingInstructions; + SmallPtrSet<const BasicBlock *, 4> Visited; + for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) { + Instruction *Inst = &*I++; + + DEBUG(dbgs() << "ObjCARCContract: Visiting: " << *Inst << "\n"); + + // Only these library routines return their argument. In particular, + // objc_retainBlock does not necessarily return its argument. 
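// The RetainRVMarker string cached by doInitialization above arrives as
// module-level metadata emitted by the frontend. Its shape, sketched with a
// placeholder for the target-specific marker instruction:
//
//   !clang.arc.retainAutoreleasedReturnValueMarker = !{!0}
//   !0 = metadata !{metadata !"<target-specific marker instruction>"}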
+ InstructionClass Class = GetBasicInstructionClass(Inst); + switch (Class) { + case IC_Retain: + case IC_FusedRetainAutorelease: + case IC_FusedRetainAutoreleaseRV: + break; + case IC_Autorelease: + case IC_AutoreleaseRV: + if (ContractAutorelease(F, Inst, Class, DependingInstructions, Visited)) + continue; + break; + case IC_RetainRV: { + // If we're compiling for a target which needs a special inline-asm + // marker to do the retainAutoreleasedReturnValue optimization, + // insert it now. + if (!RetainRVMarker) + break; + BasicBlock::iterator BBI = Inst; + BasicBlock *InstParent = Inst->getParent(); + + // Step up to see if the call immediately precedes the RetainRV call. + // If it's an invoke, we have to cross a block boundary. And we have + // to carefully dodge no-op instructions. + do { + if (&*BBI == InstParent->begin()) { + BasicBlock *Pred = InstParent->getSinglePredecessor(); + if (!Pred) + goto decline_rv_optimization; + BBI = Pred->getTerminator(); + break; + } + --BBI; + } while (isNoopInstruction(BBI)); + + if (&*BBI == GetObjCArg(Inst)) { + DEBUG(dbgs() << "ObjCARCContract: Adding inline asm marker for " + "retainAutoreleasedReturnValue optimization.\n"); + Changed = true; + InlineAsm *IA = + InlineAsm::get(FunctionType::get(Type::getVoidTy(Inst->getContext()), + /*isVarArg=*/false), + RetainRVMarker->getString(), + /*Constraints=*/"", /*hasSideEffects=*/true); + CallInst::Create(IA, "", Inst); + } + decline_rv_optimization: + break; + } + case IC_InitWeak: { + // objc_initWeak(p, null) => *p = null + CallInst *CI = cast<CallInst>(Inst); + if (isNullOrUndef(CI->getArgOperand(1))) { + Value *Null = + ConstantPointerNull::get(cast<PointerType>(CI->getType())); + Changed = true; + new StoreInst(Null, CI->getArgOperand(0), CI); + + DEBUG(dbgs() << "OBJCARCContract: Old = " << *CI << "\n" + << " New = " << *Null << "\n"); + + CI->replaceAllUsesWith(Null); + CI->eraseFromParent(); + } + continue; + } + case IC_Release: + ContractRelease(Inst, I); + continue; + case IC_User: + // Be conservative if the function has any alloca instructions. + // Technically we only care about escaping alloca instructions, + // but this is sufficient to handle some interesting cases. + if (isa<AllocaInst>(Inst)) + TailOkForStoreStrongs = false; + continue; + default: + continue; + } + + DEBUG(dbgs() << "ObjCARCContract: Finished List.\n\n"); + + // Don't use GetObjCArg because we don't want to look through bitcasts + // and such; to do the replacement, the argument must have type i8*. + const Value *Arg = cast<CallInst>(Inst)->getArgOperand(0); + for (;;) { + // If we're compiling bugpointed code, don't get in trouble. + if (!isa<Instruction>(Arg) && !isa<Argument>(Arg)) + break; + // Look through the uses of the pointer. + for (Value::const_use_iterator UI = Arg->use_begin(), UE = Arg->use_end(); + UI != UE; ) { + Use &U = UI.getUse(); + unsigned OperandNo = UI.getOperandNo(); + ++UI; // Increment UI now, because we may unlink its element. + + // If the call's return value dominates a use of the call's argument + // value, rewrite the use to use the return value. We check for + // reachability here because an unreachable call is considered to + // trivially dominate itself, which would lead us to rewriting its + // argument in terms of its return value, which would lead to + // infinite loops in GetObjCArg. 
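One small case from the switch above deserves a concrete reading: objc_initWeak(p, null) has nothing to register, so it degenerates to a plain null store through p, which is why the pass replaces the call with a StoreInst. A minimal sketch of that reasoning in plain C++ (hypothetical names; the real weak-reference runtime also maintains a side table for non-null referents):

```cpp
#include <cassert>

// Toy weak slot. The real runtime registers non-null referents in a side
// table so they can be zeroed on deallocation; with a null referent there
// is nothing to register, so initialization reduces to a store.
struct WeakSlot { void *referent; };

static void toyInitWeak(WeakSlot *p, void *v) {
  p->referent = v;
  // if (v) registerInSideTable(p, v);  // skipped entirely when v is null
}

int main() {
  WeakSlot w;
  toyInitWeak(&w, nullptr);   // equivalent to: w.referent = nullptr;
  assert(w.referent == nullptr);
}
```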
+ if (DT->isReachableFromEntry(U) && DT->dominates(Inst, U)) { + Changed = true; + Instruction *Replacement = Inst; + Type *UseTy = U.get()->getType(); + if (PHINode *PHI = dyn_cast<PHINode>(U.getUser())) { + // For PHI nodes, insert the bitcast in the predecessor block. + unsigned ValNo = PHINode::getIncomingValueNumForOperand(OperandNo); + BasicBlock *BB = PHI->getIncomingBlock(ValNo); + if (Replacement->getType() != UseTy) + Replacement = new BitCastInst(Replacement, UseTy, "", + &BB->back()); + // While we're here, rewrite all edges for this PHI, rather + // than just one use at a time, to minimize the number of + // bitcasts we emit. + for (unsigned i = 0, e = PHI->getNumIncomingValues(); i != e; ++i) + if (PHI->getIncomingBlock(i) == BB) { + // Keep the UI iterator valid. + if (&PHI->getOperandUse( + PHINode::getOperandNumForIncomingValue(i)) == + &UI.getUse()) + ++UI; + PHI->setIncomingValue(i, Replacement); + } + } else { + if (Replacement->getType() != UseTy) + Replacement = new BitCastInst(Replacement, UseTy, "", + cast<Instruction>(U.getUser())); + U.set(Replacement); + } + } + } + + // If Arg is a no-op casted pointer, strip one level of casts and iterate. + if (const BitCastInst *BI = dyn_cast<BitCastInst>(Arg)) + Arg = BI->getOperand(0); + else if (isa<GEPOperator>(Arg) && + cast<GEPOperator>(Arg)->hasAllZeroIndices()) + Arg = cast<GEPOperator>(Arg)->getPointerOperand(); + else if (isa<GlobalAlias>(Arg) && + !cast<GlobalAlias>(Arg)->mayBeOverridden()) + Arg = cast<GlobalAlias>(Arg)->getAliasee(); + else + break; + } + } + + // If this function has no escaping allocas or suspicious vararg usage, + // objc_storeStrong calls can be marked with the "tail" keyword. + if (TailOkForStoreStrongs) + for (SmallPtrSet<CallInst *, 8>::iterator I = StoreStrongCalls.begin(), + E = StoreStrongCalls.end(); I != E; ++I) + (*I)->setTailCall(); + StoreStrongCalls.clear(); + + return Changed; +} diff --git a/lib/Transforms/ObjCARC/ObjCARCExpand.cpp b/lib/Transforms/ObjCARC/ObjCARCExpand.cpp new file mode 100644 index 0000000..39bf8f3 --- /dev/null +++ b/lib/Transforms/ObjCARC/ObjCARCExpand.cpp @@ -0,0 +1,128 @@ +//===- ObjCARCExpand.cpp - ObjC ARC Optimization --------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This file defines ObjC ARC optimizations. ARC stands for Automatic +/// Reference Counting and is a system for managing reference counts for objects +/// in Objective C. +/// +/// This specific file deals with early optimizations which perform certain +/// cleanup operations. +/// +/// WARNING: This file knows about certain library functions. It recognizes them +/// by name, and hardwires knowledge of their semantics. +/// +/// WARNING: This file knows about how certain Objective-C library functions are +/// used. Naive LLVM IR transformations which would otherwise be +/// behavior-preserving may break these assumptions. 
+/// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "objc-arc-expand" + +#include "ObjCARC.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Value.h" +#include "llvm/Pass.h" +#include "llvm/PassAnalysisSupport.h" +#include "llvm/PassRegistry.h" +#include "llvm/PassSupport.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/InstIterator.h" +#include "llvm/Support/raw_ostream.h" + +namespace llvm { + class Module; +} + +using namespace llvm; +using namespace llvm::objcarc; + +namespace { + /// \brief Early ARC transformations. + class ObjCARCExpand : public FunctionPass { + virtual void getAnalysisUsage(AnalysisUsage &AU) const; + virtual bool doInitialization(Module &M); + virtual bool runOnFunction(Function &F); + + /// A flag indicating whether this optimization pass should run. + bool Run; + + public: + static char ID; + ObjCARCExpand() : FunctionPass(ID) { + initializeObjCARCExpandPass(*PassRegistry::getPassRegistry()); + } + }; +} + +char ObjCARCExpand::ID = 0; +INITIALIZE_PASS(ObjCARCExpand, + "objc-arc-expand", "ObjC ARC expansion", false, false) + +Pass *llvm::createObjCARCExpandPass() { + return new ObjCARCExpand(); +} + +void ObjCARCExpand::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesCFG(); +} + +bool ObjCARCExpand::doInitialization(Module &M) { + Run = ModuleHasARC(M); + return false; +} + +bool ObjCARCExpand::runOnFunction(Function &F) { + if (!EnableARCOpts) + return false; + + // If nothing in the Module uses ARC, don't do anything. + if (!Run) + return false; + + bool Changed = false; + + DEBUG(dbgs() << "ObjCARCExpand: Visiting Function: " << F.getName() << "\n"); + + for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ++I) { + Instruction *Inst = &*I; + + DEBUG(dbgs() << "ObjCARCExpand: Visiting: " << *Inst << "\n"); + + switch (GetBasicInstructionClass(Inst)) { + case IC_Retain: + case IC_RetainRV: + case IC_Autorelease: + case IC_AutoreleaseRV: + case IC_FusedRetainAutorelease: + case IC_FusedRetainAutoreleaseRV: { + // These calls return their argument verbatim, as a low-level + // optimization. However, this makes high-level optimizations + // harder. Undo any uses of this optimization that the front-end + // emitted here. We'll redo them in the contract pass. + Changed = true; + Value *Value = cast<CallInst>(Inst)->getArgOperand(0); + DEBUG(dbgs() << "ObjCARCExpand: Old = " << *Inst << "\n" + " New = " << *Value << "\n"); + Inst->replaceAllUsesWith(Value); + break; + } + default: + break; + } + } + + DEBUG(dbgs() << "ObjCARCExpand: Finished List.\n\n"); + + return Changed; +} diff --git a/lib/Transforms/ObjCARC/ObjCARCOpts.cpp b/lib/Transforms/ObjCARC/ObjCARCOpts.cpp new file mode 100644 index 0000000..9c14949 --- /dev/null +++ b/lib/Transforms/ObjCARC/ObjCARCOpts.cpp @@ -0,0 +1,2691 @@ +//===- ObjCARCOpts.cpp - ObjC ARC Optimization ----------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This file defines ObjC ARC optimizations. ARC stands for Automatic +/// Reference Counting and is a system for managing reference counts for objects +/// in Objective C. 
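The Expand pass just shown undoes the front-end's use of forwarding calls: since objc_retain and friends return their argument verbatim, every use of the call's result can be rewired to the argument itself, which is what replaceAllUsesWith does. A free-standing sketch of that rewiring over a toy def-use list (plain C++, not LLVM's use lists):

```cpp
#include <cstdio>
#include <vector>

// Toy values and uses: each entry records which Value a consumer reads.
struct Value { const char *name; };

int main() {
  Value x{"%x"};
  Value retainX{"%y = objc_retain(%x)"};            // forwarding call: returns %x
  std::vector<Value *> uses = {&retainX, &retainX}; // consumers of the result

  // replaceAllUsesWith(%x): every use of the call result now reads %x
  // directly, exposing the underlying pointer to later analyses.
  for (Value *&u : uses)
    if (u == &retainX)
      u = &x;

  for (Value *u : uses)
    std::printf("use reads %s\n", u->name);         // both print %x
}
```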
+/// +/// The optimizations performed include elimination of redundant, partially +/// redundant, and inconsequential reference count operations, elimination of +/// redundant weak pointer operations, and numerous minor simplifications. +/// +/// WARNING: This file knows about certain library functions. It recognizes them +/// by name, and hardwires knowledge of their semantics. +/// +/// WARNING: This file knows about how certain Objective-C library functions are +/// used. Naive LLVM IR transformations which would otherwise be +/// behavior-preserving may break these assumptions. +/// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "objc-arc-opts" +#include "ObjCARC.h" +#include "DependencyAnalysis.h" +#include "ObjCARCAliasAnalysis.h" +#include "ProvenanceAnalysis.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/Support/CFG.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; +using namespace llvm::objcarc; + +/// \defgroup MiscUtils Miscellaneous utilities that are not ARC specific. +/// @{ + +namespace { + /// \brief An associative container with fast insertion-order (deterministic) + /// iteration over its elements. Plus the special blot operation. + template<class KeyT, class ValueT> + class MapVector { + /// Map keys to indices in Vector. + typedef DenseMap<KeyT, size_t> MapTy; + MapTy Map; + + typedef std::vector<std::pair<KeyT, ValueT> > VectorTy; + /// Keys and values. + VectorTy Vector; + + public: + typedef typename VectorTy::iterator iterator; + typedef typename VectorTy::const_iterator const_iterator; + iterator begin() { return Vector.begin(); } + iterator end() { return Vector.end(); } + const_iterator begin() const { return Vector.begin(); } + const_iterator end() const { return Vector.end(); } + +#ifdef XDEBUG + ~MapVector() { + assert(Vector.size() >= Map.size()); // May differ due to blotting. 
+ for (typename MapTy::const_iterator I = Map.begin(), E = Map.end(); + I != E; ++I) { + assert(I->second < Vector.size()); + assert(Vector[I->second].first == I->first); + } + for (typename VectorTy::const_iterator I = Vector.begin(), + E = Vector.end(); I != E; ++I) + assert(!I->first || + (Map.count(I->first) && + Map[I->first] == size_t(I - Vector.begin()))); + } +#endif + + ValueT &operator[](const KeyT &Arg) { + std::pair<typename MapTy::iterator, bool> Pair = + Map.insert(std::make_pair(Arg, size_t(0))); + if (Pair.second) { + size_t Num = Vector.size(); + Pair.first->second = Num; + Vector.push_back(std::make_pair(Arg, ValueT())); + return Vector[Num].second; + } + return Vector[Pair.first->second].second; + } + + std::pair<iterator, bool> + insert(const std::pair<KeyT, ValueT> &InsertPair) { + std::pair<typename MapTy::iterator, bool> Pair = + Map.insert(std::make_pair(InsertPair.first, size_t(0))); + if (Pair.second) { + size_t Num = Vector.size(); + Pair.first->second = Num; + Vector.push_back(InsertPair); + return std::make_pair(Vector.begin() + Num, true); + } + return std::make_pair(Vector.begin() + Pair.first->second, false); + } + + const_iterator find(const KeyT &Key) const { + typename MapTy::const_iterator It = Map.find(Key); + if (It == Map.end()) return Vector.end(); + return Vector.begin() + It->second; + } + + /// This is similar to erase, but instead of removing the element from the + /// vector, it just zeros out the key in the vector. This leaves iterators + /// intact, but clients must be prepared for zeroed-out keys when iterating. + void blot(const KeyT &Key) { + typename MapTy::iterator It = Map.find(Key); + if (It == Map.end()) return; + Vector[It->second].first = KeyT(); + Map.erase(It); + } + + void clear() { + Map.clear(); + Vector.clear(); + } + }; +} + +/// @} +/// +/// \defgroup ARCUtilities Utility declarations/definitions specific to ARC. +/// @{ + +/// \brief This is similar to StripPointerCastsAndObjCCalls but it stops as soon +/// as it finds a value with multiple uses. +static const Value *FindSingleUseIdentifiedObject(const Value *Arg) { + if (Arg->hasOneUse()) { + if (const BitCastInst *BC = dyn_cast<BitCastInst>(Arg)) + return FindSingleUseIdentifiedObject(BC->getOperand(0)); + if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Arg)) + if (GEP->hasAllZeroIndices()) + return FindSingleUseIdentifiedObject(GEP->getPointerOperand()); + if (IsForwarding(GetBasicInstructionClass(Arg))) + return FindSingleUseIdentifiedObject( + cast<CallInst>(Arg)->getArgOperand(0)); + if (!IsObjCIdentifiedObject(Arg)) + return 0; + return Arg; + } + + // If we found an identifiable object but it has multiple uses, but they are + // trivial uses, we can still consider this to be a single-use value. + if (IsObjCIdentifiedObject(Arg)) { + for (Value::const_use_iterator UI = Arg->use_begin(), UE = Arg->use_end(); + UI != UE; ++UI) { + const User *U = *UI; + if (!U->use_empty() || StripPointerCastsAndObjCCalls(U) != Arg) + return 0; + } + + return Arg; + } + + return 0; +} + +/// \brief Test whether the given retainable object pointer escapes. +/// +/// This differs from regular escape analysis in that a use as an +/// argument to a call is not considered an escape. +/// +static bool DoesRetainableObjPtrEscape(const User *Ptr) { + DEBUG(dbgs() << "DoesRetainableObjPtrEscape: Target: " << *Ptr << "\n"); + + // Walk the def-use chains. + SmallVector<const Value *, 4> Worklist; + Worklist.push_back(Ptr); + // If Ptr has any operands add them as well. 
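The MapVector defined above pairs a DenseMap with a vector so iteration order is deterministic, and its blot operation erases by zeroing the key in place rather than shifting elements, keeping stored indices and live iterators valid. A self-contained sketch of the same idea on standard containers (an illustration, not the LLVM class):

```cpp
#include <cstdio>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

// Insertion-ordered map with "blot": erase zeroes the key in the vector so
// indices stored in the map and live iterators stay valid.
struct BlotMap {
  std::unordered_map<std::string, size_t> Map;      // key -> index in Vector
  std::vector<std::pair<std::string, int>> Vector;  // insertion order

  int &operator[](const std::string &K) {
    auto It = Map.find(K);
    if (It == Map.end()) {
      Map[K] = Vector.size();
      Vector.push_back({K, 0});
      return Vector.back().second;
    }
    return Vector[It->second].second;
  }

  void blot(const std::string &K) {
    auto It = Map.find(K);
    if (It == Map.end()) return;
    Vector[It->second].first.clear();  // zero the key; leave the slot behind
    Map.erase(It);
  }
};

int main() {
  BlotMap M;
  M["a"] = 1; M["b"] = 2; M["c"] = 3;
  M.blot("b");
  for (auto &P : M.Vector)             // clients skip blotted (empty) keys
    if (!P.first.empty())
      std::printf("%s = %d\n", P.first.c_str(), P.second);
}
```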
+  for (User::const_op_iterator I = Ptr->op_begin(), E = Ptr->op_end(); I != E;
+       ++I) {
+    Worklist.push_back(*I);
+  }
+
+  // Ensure we do not visit any value twice.
+  SmallPtrSet<const Value *, 8> VisitedSet;
+
+  do {
+    const Value *V = Worklist.pop_back_val();
+
+    DEBUG(dbgs() << "DoesRetainableObjPtrEscape: Visiting: " << *V << "\n");
+
+    for (Value::const_use_iterator UI = V->use_begin(), UE = V->use_end();
+         UI != UE; ++UI) {
+      const User *UUser = *UI;
+
+      DEBUG(dbgs() << "DoesRetainableObjPtrEscape: User: " << *UUser << "\n");
+
+      // Special - Use by a call (callee or argument) is not considered
+      // to be an escape.
+      switch (GetBasicInstructionClass(UUser)) {
+      case IC_StoreWeak:
+      case IC_InitWeak:
+      case IC_StoreStrong:
+      case IC_Autorelease:
+      case IC_AutoreleaseRV: {
+        DEBUG(dbgs() << "DoesRetainableObjPtrEscape: User copies pointer "
+              "arguments. Pointer Escapes!\n");
+        // These special functions make copies of their pointer arguments.
+        return true;
+      }
+      case IC_User:
+      case IC_None:
+        // Use by an instruction which copies the value is an escape if the
+        // result is an escape.
+        if (isa<BitCastInst>(UUser) || isa<GetElementPtrInst>(UUser) ||
+            isa<PHINode>(UUser) || isa<SelectInst>(UUser)) {
+
+          if (VisitedSet.insert(UUser)) {
+            DEBUG(dbgs() << "DoesRetainableObjPtrEscape: User copies value. "
+                  "Ptr escapes if result escapes. Adding to list.\n");
+            Worklist.push_back(UUser);
+          } else {
+            DEBUG(dbgs() << "DoesRetainableObjPtrEscape: Already visited node."
+                  "\n");
+          }
+          continue;
+        }
+        // Use by a load is not an escape.
+        if (isa<LoadInst>(UUser))
+          continue;
+        // Use by a store is not an escape if the use is the address.
+        if (const StoreInst *SI = dyn_cast<StoreInst>(UUser))
+          if (V != SI->getValueOperand())
+            continue;
+        break;
+      default:
+        // Regular calls and other stuff are not considered escapes.
+        continue;
+      }
+      // Otherwise, conservatively assume an escape.
+      DEBUG(dbgs() << "DoesRetainableObjPtrEscape: Assuming ptr escapes.\n");
+      return true;
+    }
+  } while (!Worklist.empty());
+
+  // No escapes found.
+  DEBUG(dbgs() << "DoesRetainableObjPtrEscape: Ptr does not escape.\n");
+  return false;
+}
+
+/// @}
+///
+/// \defgroup ARCOpt ARC Optimization.
+/// @{
+
+// TODO: On code like this:
+//
+// objc_retain(%x)
+// stuff_that_cannot_release()
+// objc_autorelease(%x)
+// stuff_that_cannot_release()
+// objc_retain(%x)
+// stuff_that_cannot_release()
+// objc_autorelease(%x)
+//
+// The second retain and autorelease can be deleted.
+
+// TODO: It should be possible to delete
+// objc_autoreleasePoolPush and objc_autoreleasePoolPop
+// pairs if nothing is actually autoreleased between them. Also, autorelease
+// calls followed by objc_autoreleasePoolPop calls (perhaps in ObjC++ code
+// after inlining) can be turned into plain release calls.
+
+// TODO: Critical-edge splitting. If the optimal insertion point is
+// a critical edge, the current algorithm has to fail, because it doesn't
+// know how to split edges. It should be possible to make the optimizer
+// think in terms of edges, rather than blocks, and then split critical
+// edges on demand.
+
+// TODO: OptimizeSequences could be generalized to be interprocedural.
+
+// TODO: Recognize that a bunch of other objc runtime calls have
+// non-escaping arguments and non-releasing arguments, and may be
+// non-autoreleasing.
+
+// TODO: Sink autorelease calls as far as possible. Unfortunately we
+// usually can't sink them past other calls, which would be the main
+// case where it would be useful.
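DoesRetainableObjPtrEscape, finished above, is a standard def-use worklist walk: push the pointer, pop values, classify each user, and re-queue value-copying users, with a visited set to guarantee termination on cycles. The skeleton of that traversal, stripped of the ARC-specific classification (a sketch on a toy graph, not LLVM's Value hierarchy):

```cpp
#include <cstdio>
#include <set>
#include <vector>

// Toy def-use graph node: Escapes marks a user that captures the pointer,
// Copies marks one (bitcast/GEP/phi/select-like) that forwards it.
struct Node {
  bool Escapes = false, Copies = false;
  std::vector<Node *> Users;
};

static bool doesPtrEscape(Node *Ptr) {
  std::vector<Node *> Worklist{Ptr};
  std::set<Node *> Visited;
  while (!Worklist.empty()) {
    Node *V = Worklist.back();
    Worklist.pop_back();
    for (Node *U : V->Users) {
      if (U->Escapes)
        return true;                    // conservative: any escape wins
      if (U->Copies && Visited.insert(U).second)
        Worklist.push_back(U);          // result escapes => operand escapes
    }
  }
  return false;
}

int main() {
  Node p, cast, store;
  store.Escapes = true;
  cast.Copies = true;
  cast.Users = {&store};
  p.Users = {&cast};                    // p -> bitcast -> escaping store
  std::printf("escapes: %d\n", doesPtrEscape(&p));  // prints 1
}
```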
+
+// TODO: The pointer returned from objc_loadWeakRetained is retained.
+
+// TODO: Delete release+retain pairs (rare).
+
+STATISTIC(NumNoops, "Number of no-op objc calls eliminated");
+STATISTIC(NumPartialNoops, "Number of partially no-op objc calls eliminated");
+STATISTIC(NumAutoreleases, "Number of autoreleases converted to releases");
+STATISTIC(NumRets, "Number of return value forwarding "
+                   "retain+autoreleases eliminated");
+STATISTIC(NumRRs, "Number of retain+release paths eliminated");
+STATISTIC(NumPeeps, "Number of calls peephole-optimized");
+
+namespace {
+  /// \enum Sequence
+  ///
+  /// \brief A sequence of states that a pointer may go through in which an
+  /// objc_retain and objc_release are actually needed.
+  enum Sequence {
+    S_None,
+    S_Retain,         ///< objc_retain(x).
+    S_CanRelease,     ///< foo(x) -- x could possibly see a ref count decrement.
+    S_Use,            ///< any use of x.
+    S_Stop,           ///< like S_Release, but code motion is stopped.
+    S_Release,        ///< objc_release(x).
+    S_MovableRelease  ///< objc_release(x), !clang.imprecise_release.
+  };
+
+  raw_ostream &operator<<(raw_ostream &OS, const Sequence S)
+    LLVM_ATTRIBUTE_UNUSED;
+  raw_ostream &operator<<(raw_ostream &OS, const Sequence S) {
+    switch (S) {
+    case S_None:
+      return OS << "S_None";
+    case S_Retain:
+      return OS << "S_Retain";
+    case S_CanRelease:
+      return OS << "S_CanRelease";
+    case S_Use:
+      return OS << "S_Use";
+    case S_Release:
+      return OS << "S_Release";
+    case S_MovableRelease:
+      return OS << "S_MovableRelease";
+    case S_Stop:
+      return OS << "S_Stop";
+    }
+    llvm_unreachable("Unknown sequence type.");
+  }
+}
+
+static Sequence MergeSeqs(Sequence A, Sequence B, bool TopDown) {
+  // The easy cases.
+  if (A == B)
+    return A;
+  if (A == S_None || B == S_None)
+    return S_None;
+
+  if (A > B) std::swap(A, B);
+  if (TopDown) {
+    // Choose the side which is further along in the sequence.
+    if ((A == S_Retain || A == S_CanRelease) &&
+        (B == S_CanRelease || B == S_Use))
+      return B;
+  } else {
+    // Choose the side which is further along in the sequence.
+    if ((A == S_Use || A == S_CanRelease) &&
+        (B == S_Use || B == S_Release || B == S_Stop || B == S_MovableRelease))
+      return A;
+    // If both sides are releases, choose the more conservative one.
+    if (A == S_Stop && (B == S_Release || B == S_MovableRelease))
+      return A;
+    if (A == S_Release && B == S_MovableRelease)
+      return A;
+  }
+
+  return S_None;
+}
+
+namespace {
+  /// \brief Unidirectional information about either a
+  /// retain-decrement-use-release sequence or release-use-decrement-retain
+  /// reverse sequence.
+  struct RRInfo {
+    /// After an objc_retain, the reference count of the referenced
+    /// object is known to be positive. Similarly, before an objc_release, the
+    /// reference count of the referenced object is known to be positive. If
+    /// there are retain-release pairs in code regions where the retain count
+    /// is known to be positive, they can be eliminated, regardless of any side
+    /// effects between them.
+    ///
+    /// Also, a retain+release pair nested within another retain+release
+    /// pair all on the known same pointer value can be eliminated, regardless
+    /// of any intervening side effects.
+    ///
+    /// KnownSafe is true when either of these conditions is satisfied.
+    bool KnownSafe;
+
+    /// True if the Calls are objc_retainBlock calls (as opposed to objc_retain
+    /// calls).
+    bool IsRetainBlock;
+
+    /// True if the objc_release calls are all marked with the "tail" keyword.
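MergeSeqs above exploits the enum's declaration order: after the swap, A <= B, so each direction only has to check a handful of (A, B) shapes and otherwise gives up with S_None. A trimmed, runnable rendition of the bottom-up half, useful for convincing yourself of a few cases (a sketch mirroring the code above, not a drop-in replacement):

```cpp
#include <algorithm>
#include <cstdio>

enum Sequence { S_None, S_Retain, S_CanRelease, S_Use, S_Stop, S_Release,
                S_MovableRelease };

// Bottom-up merge only: keep the side further along in the sequence, or the
// more conservative of two releases; anything else collapses to S_None.
static Sequence mergeBottomUp(Sequence A, Sequence B) {
  if (A == B) return A;
  if (A == S_None || B == S_None) return S_None;
  if (A > B) std::swap(A, B);                    // now A <= B in enum order
  if ((A == S_Use || A == S_CanRelease) &&
      (B == S_Use || B == S_Release || B == S_Stop || B == S_MovableRelease))
    return A;
  if (A == S_Stop && (B == S_Release || B == S_MovableRelease)) return A;
  if (A == S_Release && B == S_MovableRelease) return A;
  return S_None;
}

int main() {
  // A movable release on one path, a plain release on the other:
  // keep the plain (more conservative) release.
  std::printf("%d\n", mergeBottomUp(S_Release, S_MovableRelease)); // 5
  // A use on one path, a release on the other: the use wins bottom-up.
  std::printf("%d\n", mergeBottomUp(S_Use, S_Release));            // 3
}
```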
+    bool IsTailCallRelease;
+
+    /// If the Calls are objc_release calls and they all have a
+    /// clang.imprecise_release tag, this is the metadata tag.
+    MDNode *ReleaseMetadata;
+
+    /// For a top-down sequence, the set of objc_retains or
+    /// objc_retainBlocks. For bottom-up, the set of objc_releases.
+    SmallPtrSet<Instruction *, 2> Calls;
+
+    /// The set of optimal insert positions for moving calls in the opposite
+    /// sequence.
+    SmallPtrSet<Instruction *, 2> ReverseInsertPts;
+
+    RRInfo() :
+      KnownSafe(false), IsRetainBlock(false),
+      IsTailCallRelease(false),
+      ReleaseMetadata(0) {}
+
+    void clear();
+  };
+}
+
+void RRInfo::clear() {
+  KnownSafe = false;
+  IsRetainBlock = false;
+  IsTailCallRelease = false;
+  ReleaseMetadata = 0;
+  Calls.clear();
+  ReverseInsertPts.clear();
+}
+
+namespace {
+  /// \brief This class summarizes several per-pointer runtime properties which
+  /// are propagated through the flow graph.
+  class PtrState {
+    /// True if the reference count is known to be incremented.
+    bool KnownPositiveRefCount;
+
+    /// True if we've seen an opportunity for partial RR elimination, such as
+    /// pushing calls into a CFG triangle or into one side of a CFG diamond.
+    bool Partial;
+
+    /// The current position in the sequence.
+    Sequence Seq : 8;
+
+  public:
+    /// Unidirectional information about the current sequence.
+    ///
+    /// TODO: Encapsulate this better.
+    RRInfo RRI;
+
+    PtrState() : KnownPositiveRefCount(false), Partial(false),
+                 Seq(S_None) {}
+
+    void SetKnownPositiveRefCount() {
+      KnownPositiveRefCount = true;
+    }
+
+    void ClearRefCount() {
+      KnownPositiveRefCount = false;
+    }
+
+    bool IsKnownIncremented() const {
+      return KnownPositiveRefCount;
+    }
+
+    void SetSeq(Sequence NewSeq) {
+      Seq = NewSeq;
+    }
+
+    Sequence GetSeq() const {
+      return Seq;
+    }
+
+    void ClearSequenceProgress() {
+      ResetSequenceProgress(S_None);
+    }
+
+    void ResetSequenceProgress(Sequence NewSeq) {
+      Seq = NewSeq;
+      Partial = false;
+      RRI.clear();
+    }
+
+    void Merge(const PtrState &Other, bool TopDown);
+  };
+}
+
+void
+PtrState::Merge(const PtrState &Other, bool TopDown) {
+  Seq = MergeSeqs(Seq, Other.Seq, TopDown);
+  KnownPositiveRefCount = KnownPositiveRefCount && Other.KnownPositiveRefCount;
+
+  // We can't merge a plain objc_retain with an objc_retainBlock.
+  if (RRI.IsRetainBlock != Other.RRI.IsRetainBlock)
+    Seq = S_None;
+
+  // If we're not in a sequence (anymore), drop all associated state.
+  if (Seq == S_None) {
+    Partial = false;
+    RRI.clear();
+  } else if (Partial || Other.Partial) {
+    // If we're doing a merge on a path that's previously seen a partial
+    // merge, conservatively drop the sequence, to avoid doing partial
+    // RR elimination. If the branch predicates for the two merges differ,
+    // mixing them is unsafe.
+    ClearSequenceProgress();
+  } else {
+    // Conservatively merge the ReleaseMetadata information.
+    if (RRI.ReleaseMetadata != Other.RRI.ReleaseMetadata)
+      RRI.ReleaseMetadata = 0;
+
+    RRI.KnownSafe = RRI.KnownSafe && Other.RRI.KnownSafe;
+    RRI.IsTailCallRelease = RRI.IsTailCallRelease &&
+                            Other.RRI.IsTailCallRelease;
+    RRI.Calls.insert(Other.RRI.Calls.begin(), Other.RRI.Calls.end());
+
+    // Merge the insert point sets. If there are any differences,
+    // that makes this a partial merge.
+ Partial = RRI.ReverseInsertPts.size() != Other.RRI.ReverseInsertPts.size(); + for (SmallPtrSet<Instruction *, 2>::const_iterator + I = Other.RRI.ReverseInsertPts.begin(), + E = Other.RRI.ReverseInsertPts.end(); I != E; ++I) + Partial |= RRI.ReverseInsertPts.insert(*I); + } +} + +namespace { + /// \brief Per-BasicBlock state. + class BBState { + /// The number of unique control paths from the entry which can reach this + /// block. + unsigned TopDownPathCount; + + /// The number of unique control paths to exits from this block. + unsigned BottomUpPathCount; + + /// A type for PerPtrTopDown and PerPtrBottomUp. + typedef MapVector<const Value *, PtrState> MapTy; + + /// The top-down traversal uses this to record information known about a + /// pointer at the bottom of each block. + MapTy PerPtrTopDown; + + /// The bottom-up traversal uses this to record information known about a + /// pointer at the top of each block. + MapTy PerPtrBottomUp; + + /// Effective predecessors of the current block ignoring ignorable edges and + /// ignored backedges. + SmallVector<BasicBlock *, 2> Preds; + /// Effective successors of the current block ignoring ignorable edges and + /// ignored backedges. + SmallVector<BasicBlock *, 2> Succs; + + public: + BBState() : TopDownPathCount(0), BottomUpPathCount(0) {} + + typedef MapTy::iterator ptr_iterator; + typedef MapTy::const_iterator ptr_const_iterator; + + ptr_iterator top_down_ptr_begin() { return PerPtrTopDown.begin(); } + ptr_iterator top_down_ptr_end() { return PerPtrTopDown.end(); } + ptr_const_iterator top_down_ptr_begin() const { + return PerPtrTopDown.begin(); + } + ptr_const_iterator top_down_ptr_end() const { + return PerPtrTopDown.end(); + } + + ptr_iterator bottom_up_ptr_begin() { return PerPtrBottomUp.begin(); } + ptr_iterator bottom_up_ptr_end() { return PerPtrBottomUp.end(); } + ptr_const_iterator bottom_up_ptr_begin() const { + return PerPtrBottomUp.begin(); + } + ptr_const_iterator bottom_up_ptr_end() const { + return PerPtrBottomUp.end(); + } + + /// Mark this block as being an entry block, which has one path from the + /// entry by definition. + void SetAsEntry() { TopDownPathCount = 1; } + + /// Mark this block as being an exit block, which has one path to an exit by + /// definition. + void SetAsExit() { BottomUpPathCount = 1; } + + PtrState &getPtrTopDownState(const Value *Arg) { + return PerPtrTopDown[Arg]; + } + + PtrState &getPtrBottomUpState(const Value *Arg) { + return PerPtrBottomUp[Arg]; + } + + void clearBottomUpPointers() { + PerPtrBottomUp.clear(); + } + + void clearTopDownPointers() { + PerPtrTopDown.clear(); + } + + void InitFromPred(const BBState &Other); + void InitFromSucc(const BBState &Other); + void MergePred(const BBState &Other); + void MergeSucc(const BBState &Other); + + /// Return the number of possible unique paths from an entry to an exit + /// which pass through this block. This is only valid after both the + /// top-down and bottom-up traversals are complete. + unsigned GetAllPathCount() const { + assert(TopDownPathCount != 0); + assert(BottomUpPathCount != 0); + return TopDownPathCount * BottomUpPathCount; + } + + // Specialized CFG utilities. 
+ typedef SmallVectorImpl<BasicBlock *>::const_iterator edge_iterator; + edge_iterator pred_begin() { return Preds.begin(); } + edge_iterator pred_end() { return Preds.end(); } + edge_iterator succ_begin() { return Succs.begin(); } + edge_iterator succ_end() { return Succs.end(); } + + void addSucc(BasicBlock *Succ) { Succs.push_back(Succ); } + void addPred(BasicBlock *Pred) { Preds.push_back(Pred); } + + bool isExit() const { return Succs.empty(); } + }; +} + +void BBState::InitFromPred(const BBState &Other) { + PerPtrTopDown = Other.PerPtrTopDown; + TopDownPathCount = Other.TopDownPathCount; +} + +void BBState::InitFromSucc(const BBState &Other) { + PerPtrBottomUp = Other.PerPtrBottomUp; + BottomUpPathCount = Other.BottomUpPathCount; +} + +/// The top-down traversal uses this to merge information about predecessors to +/// form the initial state for a new block. +void BBState::MergePred(const BBState &Other) { + // Other.TopDownPathCount can be 0, in which case it is either dead or a + // loop backedge. Loop backedges are special. + TopDownPathCount += Other.TopDownPathCount; + + // Check for overflow. If we have overflow, fall back to conservative + // behavior. + if (TopDownPathCount < Other.TopDownPathCount) { + clearTopDownPointers(); + return; + } + + // For each entry in the other set, if our set has an entry with the same key, + // merge the entries. Otherwise, copy the entry and merge it with an empty + // entry. + for (ptr_const_iterator MI = Other.top_down_ptr_begin(), + ME = Other.top_down_ptr_end(); MI != ME; ++MI) { + std::pair<ptr_iterator, bool> Pair = PerPtrTopDown.insert(*MI); + Pair.first->second.Merge(Pair.second ? PtrState() : MI->second, + /*TopDown=*/true); + } + + // For each entry in our set, if the other set doesn't have an entry with the + // same key, force it to merge with an empty entry. + for (ptr_iterator MI = top_down_ptr_begin(), + ME = top_down_ptr_end(); MI != ME; ++MI) + if (Other.PerPtrTopDown.find(MI->first) == Other.PerPtrTopDown.end()) + MI->second.Merge(PtrState(), /*TopDown=*/true); +} + +/// The bottom-up traversal uses this to merge information about successors to +/// form the initial state for a new block. +void BBState::MergeSucc(const BBState &Other) { + // Other.BottomUpPathCount can be 0, in which case it is either dead or a + // loop backedge. Loop backedges are special. + BottomUpPathCount += Other.BottomUpPathCount; + + // Check for overflow. If we have overflow, fall back to conservative + // behavior. + if (BottomUpPathCount < Other.BottomUpPathCount) { + clearBottomUpPointers(); + return; + } + + // For each entry in the other set, if our set has an entry with the + // same key, merge the entries. Otherwise, copy the entry and merge + // it with an empty entry. + for (ptr_const_iterator MI = Other.bottom_up_ptr_begin(), + ME = Other.bottom_up_ptr_end(); MI != ME; ++MI) { + std::pair<ptr_iterator, bool> Pair = PerPtrBottomUp.insert(*MI); + Pair.first->second.Merge(Pair.second ? PtrState() : MI->second, + /*TopDown=*/false); + } + + // For each entry in our set, if the other set doesn't have an entry + // with the same key, force it to merge with an empty entry. + for (ptr_iterator MI = bottom_up_ptr_begin(), + ME = bottom_up_ptr_end(); MI != ME; ++MI) + if (Other.PerPtrBottomUp.find(MI->first) == Other.PerPtrBottomUp.end()) + MI->second.Merge(PtrState(), /*TopDown=*/false); +} + +namespace { + /// \brief The main ARC optimization pass. 
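MergePred and MergeSucc above accumulate path counts with a wraparound check: for unsigned arithmetic, the sum being smaller than one of the addends holds exactly when the addition overflowed, and on overflow the pass drops to conservative behavior by clearing its per-pointer state. The idiom in isolation (a sketch):

```cpp
#include <climits>
#include <cstdio>

// Returns false (and leaves Total meaningless) when the unsigned sum wraps,
// mirroring the clearTopDownPointers()/clearBottomUpPointers() fallback.
static bool addPathCount(unsigned &Total, unsigned Other) {
  Total += Other;
  return Total >= Other;  // wrapped iff the sum is now smaller than Other
}

int main() {
  unsigned Paths = UINT_MAX - 1;
  std::printf("ok = %d\n", addPathCount(Paths, 1));  // ok = 1
  std::printf("ok = %d\n", addPathCount(Paths, 2));  // ok = 0 -> go conservative
}
```

The same concern applies to GetAllPathCount, which multiplies the two counts; overflow there would silently corrupt the profitability heuristic if the additions were not capped first.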
+  class ObjCARCOpt : public FunctionPass {
+    bool Changed;
+    ProvenanceAnalysis PA;
+
+    /// A flag indicating whether this optimization pass should run.
+    bool Run;
+
+    /// Declarations for ObjC runtime functions, for use in creating calls to
+    /// them. These are initialized lazily to avoid cluttering up the Module
+    /// with unused declarations.
+
+    /// Declaration for ObjC runtime function
+    /// objc_retainAutoreleasedReturnValue.
+    Constant *RetainRVCallee;
+    /// Declaration for ObjC runtime function objc_autoreleaseReturnValue.
+    Constant *AutoreleaseRVCallee;
+    /// Declaration for ObjC runtime function objc_release.
+    Constant *ReleaseCallee;
+    /// Declaration for ObjC runtime function objc_retain.
+    Constant *RetainCallee;
+    /// Declaration for ObjC runtime function objc_retainBlock.
+    Constant *RetainBlockCallee;
+    /// Declaration for ObjC runtime function objc_autorelease.
+    Constant *AutoreleaseCallee;
+
+    /// Flags which determine whether each of the interesting runtime functions
+    /// is in fact used in the current function.
+    unsigned UsedInThisFunction;
+
+    /// The Metadata Kind for clang.imprecise_release metadata.
+    unsigned ImpreciseReleaseMDKind;
+
+    /// The Metadata Kind for clang.arc.copy_on_escape metadata.
+    unsigned CopyOnEscapeMDKind;
+
+    /// The Metadata Kind for clang.arc.no_objc_arc_exceptions metadata.
+    unsigned NoObjCARCExceptionsMDKind;
+
+    Constant *getRetainRVCallee(Module *M);
+    Constant *getAutoreleaseRVCallee(Module *M);
+    Constant *getReleaseCallee(Module *M);
+    Constant *getRetainCallee(Module *M);
+    Constant *getRetainBlockCallee(Module *M);
+    Constant *getAutoreleaseCallee(Module *M);
+
+    bool IsRetainBlockOptimizable(const Instruction *Inst);
+
+    void OptimizeRetainCall(Function &F, Instruction *Retain);
+    bool OptimizeRetainRVCall(Function &F, Instruction *RetainRV);
+    void OptimizeAutoreleaseRVCall(Function &F, Instruction *AutoreleaseRV,
+                                   InstructionClass &Class);
+    void OptimizeIndividualCalls(Function &F);
+
+    void CheckForCFGHazards(const BasicBlock *BB,
+                            DenseMap<const BasicBlock *, BBState> &BBStates,
+                            BBState &MyStates) const;
+    bool VisitInstructionBottomUp(Instruction *Inst,
+                                  BasicBlock *BB,
+                                  MapVector<Value *, RRInfo> &Retains,
+                                  BBState &MyStates);
+    bool VisitBottomUp(BasicBlock *BB,
+                       DenseMap<const BasicBlock *, BBState> &BBStates,
+                       MapVector<Value *, RRInfo> &Retains);
+    bool VisitInstructionTopDown(Instruction *Inst,
+                                 DenseMap<Value *, RRInfo> &Releases,
+                                 BBState &MyStates);
+    bool VisitTopDown(BasicBlock *BB,
+                      DenseMap<const BasicBlock *, BBState> &BBStates,
+                      DenseMap<Value *, RRInfo> &Releases);
+    bool Visit(Function &F,
+               DenseMap<const BasicBlock *, BBState> &BBStates,
+               MapVector<Value *, RRInfo> &Retains,
+               DenseMap<Value *, RRInfo> &Releases);
+
+    void MoveCalls(Value *Arg, RRInfo &RetainsToMove, RRInfo &ReleasesToMove,
+                   MapVector<Value *, RRInfo> &Retains,
+                   DenseMap<Value *, RRInfo> &Releases,
+                   SmallVectorImpl<Instruction *> &DeadInsts,
+                   Module *M);
+
+    bool ConnectTDBUTraversals(DenseMap<const BasicBlock *, BBState> &BBStates,
+                               MapVector<Value *, RRInfo> &Retains,
+                               DenseMap<Value *, RRInfo> &Releases,
+                               Module *M,
+                               SmallVector<Instruction *, 4> &NewRetains,
+                               SmallVector<Instruction *, 4> &NewReleases,
+                               SmallVector<Instruction *, 8> &DeadInsts,
+                               RRInfo &RetainsToMove,
+                               RRInfo &ReleasesToMove,
+                               Value *Arg,
+                               bool KnownSafe,
+                               bool &AnyPairsCompletelyEliminated);
+
+    bool PerformCodePlacement(DenseMap<const BasicBlock *, BBState> &BBStates,
+                              MapVector<Value *, RRInfo> &Retains,
+                              DenseMap<Value *,
RRInfo> &Releases, + Module *M); + + void OptimizeWeakCalls(Function &F); + + bool OptimizeSequences(Function &F); + + void OptimizeReturns(Function &F); + + virtual void getAnalysisUsage(AnalysisUsage &AU) const; + virtual bool doInitialization(Module &M); + virtual bool runOnFunction(Function &F); + virtual void releaseMemory(); + + public: + static char ID; + ObjCARCOpt() : FunctionPass(ID) { + initializeObjCARCOptPass(*PassRegistry::getPassRegistry()); + } + }; +} + +char ObjCARCOpt::ID = 0; +INITIALIZE_PASS_BEGIN(ObjCARCOpt, + "objc-arc", "ObjC ARC optimization", false, false) +INITIALIZE_PASS_DEPENDENCY(ObjCARCAliasAnalysis) +INITIALIZE_PASS_END(ObjCARCOpt, + "objc-arc", "ObjC ARC optimization", false, false) + +Pass *llvm::createObjCARCOptPass() { + return new ObjCARCOpt(); +} + +void ObjCARCOpt::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<ObjCARCAliasAnalysis>(); + AU.addRequired<AliasAnalysis>(); + // ARC optimization doesn't currently split critical edges. + AU.setPreservesCFG(); +} + +bool ObjCARCOpt::IsRetainBlockOptimizable(const Instruction *Inst) { + // Without the magic metadata tag, we have to assume this might be an + // objc_retainBlock call inserted to convert a block pointer to an id, + // in which case it really is needed. + if (!Inst->getMetadata(CopyOnEscapeMDKind)) + return false; + + // If the pointer "escapes" (not including being used in a call), + // the copy may be needed. + if (DoesRetainableObjPtrEscape(Inst)) + return false; + + // Otherwise, it's not needed. + return true; +} + +Constant *ObjCARCOpt::getRetainRVCallee(Module *M) { + if (!RetainRVCallee) { + LLVMContext &C = M->getContext(); + Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C)); + Type *Params[] = { I8X }; + FunctionType *FTy = FunctionType::get(I8X, Params, /*isVarArg=*/false); + AttributeSet Attribute = + AttributeSet().addAttribute(M->getContext(), AttributeSet::FunctionIndex, + Attribute::NoUnwind); + RetainRVCallee = + M->getOrInsertFunction("objc_retainAutoreleasedReturnValue", FTy, + Attribute); + } + return RetainRVCallee; +} + +Constant *ObjCARCOpt::getAutoreleaseRVCallee(Module *M) { + if (!AutoreleaseRVCallee) { + LLVMContext &C = M->getContext(); + Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C)); + Type *Params[] = { I8X }; + FunctionType *FTy = FunctionType::get(I8X, Params, /*isVarArg=*/false); + AttributeSet Attribute = + AttributeSet().addAttribute(M->getContext(), AttributeSet::FunctionIndex, + Attribute::NoUnwind); + AutoreleaseRVCallee = + M->getOrInsertFunction("objc_autoreleaseReturnValue", FTy, + Attribute); + } + return AutoreleaseRVCallee; +} + +Constant *ObjCARCOpt::getReleaseCallee(Module *M) { + if (!ReleaseCallee) { + LLVMContext &C = M->getContext(); + Type *Params[] = { PointerType::getUnqual(Type::getInt8Ty(C)) }; + AttributeSet Attribute = + AttributeSet().addAttribute(M->getContext(), AttributeSet::FunctionIndex, + Attribute::NoUnwind); + ReleaseCallee = + M->getOrInsertFunction( + "objc_release", + FunctionType::get(Type::getVoidTy(C), Params, /*isVarArg=*/false), + Attribute); + } + return ReleaseCallee; +} + +Constant *ObjCARCOpt::getRetainCallee(Module *M) { + if (!RetainCallee) { + LLVMContext &C = M->getContext(); + Type *Params[] = { PointerType::getUnqual(Type::getInt8Ty(C)) }; + AttributeSet Attribute = + AttributeSet().addAttribute(M->getContext(), AttributeSet::FunctionIndex, + Attribute::NoUnwind); + RetainCallee = + M->getOrInsertFunction( + "objc_retain", + FunctionType::get(Params[0], Params, /*isVarArg=*/false), + 
Attribute); + } + return RetainCallee; +} + +Constant *ObjCARCOpt::getRetainBlockCallee(Module *M) { + if (!RetainBlockCallee) { + LLVMContext &C = M->getContext(); + Type *Params[] = { PointerType::getUnqual(Type::getInt8Ty(C)) }; + // objc_retainBlock is not nounwind because it calls user copy constructors + // which could theoretically throw. + RetainBlockCallee = + M->getOrInsertFunction( + "objc_retainBlock", + FunctionType::get(Params[0], Params, /*isVarArg=*/false), + AttributeSet()); + } + return RetainBlockCallee; +} + +Constant *ObjCARCOpt::getAutoreleaseCallee(Module *M) { + if (!AutoreleaseCallee) { + LLVMContext &C = M->getContext(); + Type *Params[] = { PointerType::getUnqual(Type::getInt8Ty(C)) }; + AttributeSet Attribute = + AttributeSet().addAttribute(M->getContext(), AttributeSet::FunctionIndex, + Attribute::NoUnwind); + AutoreleaseCallee = + M->getOrInsertFunction( + "objc_autorelease", + FunctionType::get(Params[0], Params, /*isVarArg=*/false), + Attribute); + } + return AutoreleaseCallee; +} + +/// Turn objc_retain into objc_retainAutoreleasedReturnValue if the operand is a +/// return value. +void +ObjCARCOpt::OptimizeRetainCall(Function &F, Instruction *Retain) { + ImmutableCallSite CS(GetObjCArg(Retain)); + const Instruction *Call = CS.getInstruction(); + if (!Call) return; + if (Call->getParent() != Retain->getParent()) return; + + // Check that the call is next to the retain. + BasicBlock::const_iterator I = Call; + ++I; + while (isNoopInstruction(I)) ++I; + if (&*I != Retain) + return; + + // Turn it to an objc_retainAutoreleasedReturnValue.. + Changed = true; + ++NumPeeps; + + DEBUG(dbgs() << "ObjCARCOpt::OptimizeRetainCall: Transforming " + "objc_retain => objc_retainAutoreleasedReturnValue" + " since the operand is a return value.\n" + " Old: " + << *Retain << "\n"); + + cast<CallInst>(Retain)->setCalledFunction(getRetainRVCallee(F.getParent())); + + DEBUG(dbgs() << " New: " + << *Retain << "\n"); +} + +/// Turn objc_retainAutoreleasedReturnValue into objc_retain if the operand is +/// not a return value. Or, if it can be paired with an +/// objc_autoreleaseReturnValue, delete the pair and return true. +bool +ObjCARCOpt::OptimizeRetainRVCall(Function &F, Instruction *RetainRV) { + // Check for the argument being from an immediately preceding call or invoke. + const Value *Arg = GetObjCArg(RetainRV); + ImmutableCallSite CS(Arg); + if (const Instruction *Call = CS.getInstruction()) { + if (Call->getParent() == RetainRV->getParent()) { + BasicBlock::const_iterator I = Call; + ++I; + while (isNoopInstruction(I)) ++I; + if (&*I == RetainRV) + return false; + } else if (const InvokeInst *II = dyn_cast<InvokeInst>(Call)) { + BasicBlock *RetainRVParent = RetainRV->getParent(); + if (II->getNormalDest() == RetainRVParent) { + BasicBlock::const_iterator I = RetainRVParent->begin(); + while (isNoopInstruction(I)) ++I; + if (&*I == RetainRV) + return false; + } + } + } + + // Check for being preceded by an objc_autoreleaseReturnValue on the same + // pointer. In this case, we can delete the pair. 
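The getters above all follow one memoization shape: a null Constant* member doubles as a "not created yet" flag, and the declaration is materialized into the Module on first use, so functions that never touch a given entry point never clutter the Module with its declaration. The shape in miniature (a sketch with stand-in types, not the LLVM API):

```cpp
#include <cstdio>
#include <string>

// Stand-in for a runtime-function declaration, just enough to show the shape.
struct Declaration { std::string Name; };

struct PassLikeCache {
  Declaration *RetainCallee = nullptr;  // null doubles as "not created yet"

  Declaration *getRetainCallee() {
    if (!RetainCallee) {                // first use: materialize lazily
      std::printf("declaring objc_retain\n");
      RetainCallee = new Declaration{"objc_retain"};
    }
    return RetainCallee;                // every later call is a cheap load
  }
};

int main() {
  PassLikeCache P;
  P.getRetainCallee();                  // declares once
  P.getRetainCallee();                  // cached; no second declaration
  delete P.RetainCallee;
}
```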
+ BasicBlock::iterator I = RetainRV, Begin = RetainRV->getParent()->begin(); + if (I != Begin) { + do --I; while (I != Begin && isNoopInstruction(I)); + if (GetBasicInstructionClass(I) == IC_AutoreleaseRV && + GetObjCArg(I) == Arg) { + Changed = true; + ++NumPeeps; + + DEBUG(dbgs() << "ObjCARCOpt::OptimizeRetainRVCall: Erasing " << *I << "\n" + << " Erasing " << *RetainRV + << "\n"); + + EraseInstruction(I); + EraseInstruction(RetainRV); + return true; + } + } + + // Turn it to a plain objc_retain. + Changed = true; + ++NumPeeps; + + DEBUG(dbgs() << "ObjCARCOpt::OptimizeRetainRVCall: Transforming " + "objc_retainAutoreleasedReturnValue => " + "objc_retain since the operand is not a return value.\n" + " Old: " + << *RetainRV << "\n"); + + cast<CallInst>(RetainRV)->setCalledFunction(getRetainCallee(F.getParent())); + + DEBUG(dbgs() << " New: " + << *RetainRV << "\n"); + + return false; +} + +/// Turn objc_autoreleaseReturnValue into objc_autorelease if the result is not +/// used as a return value. +void +ObjCARCOpt::OptimizeAutoreleaseRVCall(Function &F, Instruction *AutoreleaseRV, + InstructionClass &Class) { + // Check for a return of the pointer value. + const Value *Ptr = GetObjCArg(AutoreleaseRV); + SmallVector<const Value *, 2> Users; + Users.push_back(Ptr); + do { + Ptr = Users.pop_back_val(); + for (Value::const_use_iterator UI = Ptr->use_begin(), UE = Ptr->use_end(); + UI != UE; ++UI) { + const User *I = *UI; + if (isa<ReturnInst>(I) || GetBasicInstructionClass(I) == IC_RetainRV) + return; + if (isa<BitCastInst>(I)) + Users.push_back(I); + } + } while (!Users.empty()); + + Changed = true; + ++NumPeeps; + + DEBUG(dbgs() << "ObjCARCOpt::OptimizeAutoreleaseRVCall: Transforming " + "objc_autoreleaseReturnValue => " + "objc_autorelease since its operand is not used as a return " + "value.\n" + " Old: " + << *AutoreleaseRV << "\n"); + + CallInst *AutoreleaseRVCI = cast<CallInst>(AutoreleaseRV); + AutoreleaseRVCI-> + setCalledFunction(getAutoreleaseCallee(F.getParent())); + AutoreleaseRVCI->setTailCall(false); // Never tail call objc_autorelease. + Class = IC_Autorelease; + + DEBUG(dbgs() << " New: " + << *AutoreleaseRV << "\n"); + +} + +/// Visit each call, one at a time, and make simplifications without doing any +/// additional analysis. +void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { + // Reset all the flags in preparation for recomputing them. + UsedInThisFunction = 0; + + // Visit all objc_* calls in F. + for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) { + Instruction *Inst = &*I++; + + InstructionClass Class = GetBasicInstructionClass(Inst); + + DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: Visiting: Class: " + << Class << "; " << *Inst << "\n"); + + switch (Class) { + default: break; + + // Delete no-op casts. These function calls have special semantics, but + // the semantics are entirely implemented via lowering in the front-end, + // so by the time they reach the optimizer, they are just no-op calls + // which return their argument. + // + // There are gray areas here, as the ability to cast reference-counted + // pointers to raw void* and back allows code to break ARC assumptions, + // however these are currently considered to be unimportant. + case IC_NoopCast: + Changed = true; + ++NumNoops; + DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: Erasing no-op cast:" + " " << *Inst << "\n"); + EraseInstruction(Inst); + continue; + + // If the pointer-to-weak-pointer is null, it's undefined behavior. 
+ case IC_StoreWeak: + case IC_LoadWeak: + case IC_LoadWeakRetained: + case IC_InitWeak: + case IC_DestroyWeak: { + CallInst *CI = cast<CallInst>(Inst); + if (isNullOrUndef(CI->getArgOperand(0))) { + Changed = true; + Type *Ty = CI->getArgOperand(0)->getType(); + new StoreInst(UndefValue::get(cast<PointerType>(Ty)->getElementType()), + Constant::getNullValue(Ty), + CI); + llvm::Value *NewValue = UndefValue::get(CI->getType()); + DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: A null " + "pointer-to-weak-pointer is undefined behavior.\n" + " Old = " << *CI << + "\n New = " << + *NewValue << "\n"); + CI->replaceAllUsesWith(NewValue); + CI->eraseFromParent(); + continue; + } + break; + } + case IC_CopyWeak: + case IC_MoveWeak: { + CallInst *CI = cast<CallInst>(Inst); + if (isNullOrUndef(CI->getArgOperand(0)) || + isNullOrUndef(CI->getArgOperand(1))) { + Changed = true; + Type *Ty = CI->getArgOperand(0)->getType(); + new StoreInst(UndefValue::get(cast<PointerType>(Ty)->getElementType()), + Constant::getNullValue(Ty), + CI); + + llvm::Value *NewValue = UndefValue::get(CI->getType()); + DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: A null " + "pointer-to-weak-pointer is undefined behavior.\n" + " Old = " << *CI << + "\n New = " << + *NewValue << "\n"); + + CI->replaceAllUsesWith(NewValue); + CI->eraseFromParent(); + continue; + } + break; + } + case IC_Retain: + OptimizeRetainCall(F, Inst); + break; + case IC_RetainRV: + if (OptimizeRetainRVCall(F, Inst)) + continue; + break; + case IC_AutoreleaseRV: + OptimizeAutoreleaseRVCall(F, Inst, Class); + break; + } + + // objc_autorelease(x) -> objc_release(x) if x is otherwise unused. + if (IsAutorelease(Class) && Inst->use_empty()) { + CallInst *Call = cast<CallInst>(Inst); + const Value *Arg = Call->getArgOperand(0); + Arg = FindSingleUseIdentifiedObject(Arg); + if (Arg) { + Changed = true; + ++NumAutoreleases; + + // Create the declaration lazily. + LLVMContext &C = Inst->getContext(); + CallInst *NewCall = + CallInst::Create(getReleaseCallee(F.getParent()), + Call->getArgOperand(0), "", Call); + NewCall->setMetadata(ImpreciseReleaseMDKind, + MDNode::get(C, ArrayRef<Value *>())); + + DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: Replacing " + "objc_autorelease(x) with objc_release(x) since x is " + "otherwise unused.\n" + " Old: " << *Call << + "\n New: " << + *NewCall << "\n"); + + EraseInstruction(Call); + Inst = NewCall; + Class = IC_Release; + } + } + + // For functions which can never be passed stack arguments, add + // a tail keyword. + if (IsAlwaysTail(Class)) { + Changed = true; + DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: Adding tail keyword" + " to function since it can never be passed stack args: " << *Inst << + "\n"); + cast<CallInst>(Inst)->setTailCall(); + } + + // Ensure that functions that can never have a "tail" keyword due to the + // semantics of ARC truly do not do so. + if (IsNeverTail(Class)) { + Changed = true; + DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: Removing tail " + "keyword from function: " << *Inst << + "\n"); + cast<CallInst>(Inst)->setTailCall(false); + } + + // Set nounwind as needed. + if (IsNoThrow(Class)) { + Changed = true; + DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: Found no throw" + " class. Setting nounwind on: " << *Inst << "\n"); + cast<CallInst>(Inst)->setDoesNotThrow(); + } + + if (!IsNoopOnNull(Class)) { + UsedInThisFunction |= 1 << Class; + continue; + } + + const Value *Arg = GetObjCArg(Inst); + + // ARC calls with null are no-ops. Delete them. 
+ if (isNullOrUndef(Arg)) { + Changed = true; + ++NumNoops; + DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: ARC calls with " + " null are no-ops. Erasing: " << *Inst << "\n"); + EraseInstruction(Inst); + continue; + } + + // Keep track of which of retain, release, autorelease, and retain_block + // are actually present in this function. + UsedInThisFunction |= 1 << Class; + + // If Arg is a PHI, and one or more incoming values to the + // PHI are null, and the call is control-equivalent to the PHI, and there + // are no relevant side effects between the PHI and the call, the call + // could be pushed up to just those paths with non-null incoming values. + // For now, don't bother splitting critical edges for this. + SmallVector<std::pair<Instruction *, const Value *>, 4> Worklist; + Worklist.push_back(std::make_pair(Inst, Arg)); + do { + std::pair<Instruction *, const Value *> Pair = Worklist.pop_back_val(); + Inst = Pair.first; + Arg = Pair.second; + + const PHINode *PN = dyn_cast<PHINode>(Arg); + if (!PN) continue; + + // Determine if the PHI has any null operands, or any incoming + // critical edges. + bool HasNull = false; + bool HasCriticalEdges = false; + for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { + Value *Incoming = + StripPointerCastsAndObjCCalls(PN->getIncomingValue(i)); + if (isNullOrUndef(Incoming)) + HasNull = true; + else if (cast<TerminatorInst>(PN->getIncomingBlock(i)->back()) + .getNumSuccessors() != 1) { + HasCriticalEdges = true; + break; + } + } + // If we have null operands and no critical edges, optimize. + if (!HasCriticalEdges && HasNull) { + SmallPtrSet<Instruction *, 4> DependingInstructions; + SmallPtrSet<const BasicBlock *, 4> Visited; + + // Check that there is nothing that cares about the reference + // count between the call and the phi. + switch (Class) { + case IC_Retain: + case IC_RetainBlock: + // These can always be moved up. + break; + case IC_Release: + // These can't be moved across things that care about the retain + // count. + FindDependencies(NeedsPositiveRetainCount, Arg, + Inst->getParent(), Inst, + DependingInstructions, Visited, PA); + break; + case IC_Autorelease: + // These can't be moved across autorelease pool scope boundaries. + FindDependencies(AutoreleasePoolBoundary, Arg, + Inst->getParent(), Inst, + DependingInstructions, Visited, PA); + break; + case IC_RetainRV: + case IC_AutoreleaseRV: + // Don't move these; the RV optimization depends on the autoreleaseRV + // being tail called, and the retainRV being immediately after a call + // (which might still happen if we get lucky with codegen layout, but + // it's not worth taking the chance). + continue; + default: + llvm_unreachable("Invalid dependence flavor"); + } + + if (DependingInstructions.size() == 1 && + *DependingInstructions.begin() == PN) { + Changed = true; + ++NumPartialNoops; + // Clone the call into each predecessor that has a non-null value. 
+ CallInst *CInst = cast<CallInst>(Inst); + Type *ParamTy = CInst->getArgOperand(0)->getType(); + for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { + Value *Incoming = + StripPointerCastsAndObjCCalls(PN->getIncomingValue(i)); + if (!isNullOrUndef(Incoming)) { + CallInst *Clone = cast<CallInst>(CInst->clone()); + Value *Op = PN->getIncomingValue(i); + Instruction *InsertPos = &PN->getIncomingBlock(i)->back(); + if (Op->getType() != ParamTy) + Op = new BitCastInst(Op, ParamTy, "", InsertPos); + Clone->setArgOperand(0, Op); + Clone->insertBefore(InsertPos); + + DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: Cloning " + << *CInst << "\n" + " And inserting " + "clone at " << *InsertPos << "\n"); + Worklist.push_back(std::make_pair(Clone, Incoming)); + } + } + // Erase the original call. + DEBUG(dbgs() << "Erasing: " << *CInst << "\n"); + EraseInstruction(CInst); + continue; + } + } + } while (!Worklist.empty()); + } + DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: Finished List.\n"); +} + +/// Check for critical edges, loop boundaries, irreducible control flow, or +/// other CFG structures where moving code across the edge would result in it +/// being executed more. +void +ObjCARCOpt::CheckForCFGHazards(const BasicBlock *BB, + DenseMap<const BasicBlock *, BBState> &BBStates, + BBState &MyStates) const { + // If any top-down local-use or possible-dec has a succ which is earlier in + // the sequence, forget it. + for (BBState::ptr_iterator I = MyStates.top_down_ptr_begin(), + E = MyStates.top_down_ptr_end(); I != E; ++I) + switch (I->second.GetSeq()) { + default: break; + case S_Use: { + const Value *Arg = I->first; + const TerminatorInst *TI = cast<TerminatorInst>(&BB->back()); + bool SomeSuccHasSame = false; + bool AllSuccsHaveSame = true; + PtrState &S = I->second; + succ_const_iterator SI(TI), SE(TI, false); + + for (; SI != SE; ++SI) { + Sequence SuccSSeq = S_None; + bool SuccSRRIKnownSafe = false; + // If VisitBottomUp has pointer information for this successor, take + // what we know about it. + DenseMap<const BasicBlock *, BBState>::iterator BBI = + BBStates.find(*SI); + assert(BBI != BBStates.end()); + const PtrState &SuccS = BBI->second.getPtrBottomUpState(Arg); + SuccSSeq = SuccS.GetSeq(); + SuccSRRIKnownSafe = SuccS.RRI.KnownSafe; + switch (SuccSSeq) { + case S_None: + case S_CanRelease: { + if (!S.RRI.KnownSafe && !SuccSRRIKnownSafe) { + S.ClearSequenceProgress(); + break; + } + continue; + } + case S_Use: + SomeSuccHasSame = true; + break; + case S_Stop: + case S_Release: + case S_MovableRelease: + if (!S.RRI.KnownSafe && !SuccSRRIKnownSafe) + AllSuccsHaveSame = false; + break; + case S_Retain: + llvm_unreachable("bottom-up pointer in retain state!"); + } + } + // If the state at the other end of any of the successor edges + // matches the current state, require all edges to match. This + // guards against loops in the middle of a sequence. + if (SomeSuccHasSame && !AllSuccsHaveSame) + S.ClearSequenceProgress(); + break; + } + case S_CanRelease: { + const Value *Arg = I->first; + const TerminatorInst *TI = cast<TerminatorInst>(&BB->back()); + bool SomeSuccHasSame = false; + bool AllSuccsHaveSame = true; + PtrState &S = I->second; + succ_const_iterator SI(TI), SE(TI, false); + + for (; SI != SE; ++SI) { + Sequence SuccSSeq = S_None; + bool SuccSRRIKnownSafe = false; + // If VisitBottomUp has pointer information for this successor, take + // what we know about it. 
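The cloning just performed rests on a simple identity: ARC entry points are no-ops on null, so a call on a maybe-null PHI is equivalent to calls on just the non-null incoming paths. In straight-line C++ terms (a toy model of the rewrite; NumPartialNoops counts how often it fires):

```cpp
#include <cstdio>

struct Obj { int rc = 1; };
static void toyRetain(Obj *o) { if (o) ++o->rc; }  // ARC calls ignore null

int main() {
  Obj q;
  for (bool cond : {true, false}) {
    // Original shape: one call on a "PHI" that is null along the else path.
    Obj *p = cond ? &q : nullptr;
    toyRetain(p);

    // After cloning into the non-null predecessor: same effect, and the
    // null path no longer carries a dynamically dead runtime call.
    if (cond)
      toyRetain(&q);
  }
  std::printf("rc = %d\n", q.rc);  // 3: each form retained once when cond held
}
```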
+        DenseMap<const BasicBlock *, BBState>::iterator BBI =
+          BBStates.find(*SI);
+        assert(BBI != BBStates.end());
+        const PtrState &SuccS = BBI->second.getPtrBottomUpState(Arg);
+        SuccSSeq = SuccS.GetSeq();
+        SuccSRRIKnownSafe = SuccS.RRI.KnownSafe;
+        switch (SuccSSeq) {
+        case S_None: {
+          if (!S.RRI.KnownSafe && !SuccSRRIKnownSafe) {
+            S.ClearSequenceProgress();
+            break;
+          }
+          continue;
+        }
+        case S_CanRelease:
+          SomeSuccHasSame = true;
+          break;
+        case S_Stop:
+        case S_Release:
+        case S_MovableRelease:
+        case S_Use:
+          if (!S.RRI.KnownSafe && !SuccSRRIKnownSafe)
+            AllSuccsHaveSame = false;
+          break;
+        case S_Retain:
+          llvm_unreachable("bottom-up pointer in retain state!");
+        }
+      }
+      // If the state at the other end of any of the successor edges
+      // matches the current state, require all edges to match. This
+      // guards against loops in the middle of a sequence.
+      if (SomeSuccHasSame && !AllSuccsHaveSame)
+        S.ClearSequenceProgress();
+      break;
+    }
+    }
+}
+
+bool
+ObjCARCOpt::VisitInstructionBottomUp(Instruction *Inst,
+                                     BasicBlock *BB,
+                                     MapVector<Value *, RRInfo> &Retains,
+                                     BBState &MyStates) {
+  bool NestingDetected = false;
+  InstructionClass Class = GetInstructionClass(Inst);
+  const Value *Arg = 0;
+
+  switch (Class) {
+  case IC_Release: {
+    Arg = GetObjCArg(Inst);
+
+    PtrState &S = MyStates.getPtrBottomUpState(Arg);
+
+    // If we see two releases in a row on the same pointer, make a note,
+    // and we'll circle back to revisit it after we've hopefully eliminated
+    // the second release, which may allow us to eliminate the first release
+    // too.
+    // Theoretically we could implement removal of nested retain+release
+    // pairs by making PtrState hold a stack of states, but this is
+    // simple and avoids adding overhead for the non-nested case.
+    if (S.GetSeq() == S_Release || S.GetSeq() == S_MovableRelease) {
+      DEBUG(dbgs() << "ObjCARCOpt::VisitInstructionBottomUp: Found nested "
+                      "releases (i.e. a release pair)\n");
+      NestingDetected = true;
+    }
+
+    MDNode *ReleaseMetadata = Inst->getMetadata(ImpreciseReleaseMDKind);
+    S.ResetSequenceProgress(ReleaseMetadata ? S_MovableRelease : S_Release);
+    S.RRI.ReleaseMetadata = ReleaseMetadata;
+    S.RRI.KnownSafe = S.IsKnownIncremented();
+    S.RRI.IsTailCallRelease = cast<CallInst>(Inst)->isTailCall();
+    S.RRI.Calls.insert(Inst);
+
+    S.SetKnownPositiveRefCount();
+    break;
+  }
+  case IC_RetainBlock:
+    // An objc_retainBlock call with just a use may need to be kept,
+    // because it may be copying a block from the stack to the heap.
+    if (!IsRetainBlockOptimizable(Inst))
+      break;
+    // FALLTHROUGH
+  case IC_Retain:
+  case IC_RetainRV: {
+    Arg = GetObjCArg(Inst);
+
+    PtrState &S = MyStates.getPtrBottomUpState(Arg);
+    S.SetKnownPositiveRefCount();
+
+    switch (S.GetSeq()) {
+    case S_Stop:
+    case S_Release:
+    case S_MovableRelease:
+    case S_Use:
+      S.RRI.ReverseInsertPts.clear();
+      // FALL THROUGH
+    case S_CanRelease:
+      // Don't do retain+release tracking for IC_RetainRV, because it's
+      // better to let it remain as the first instruction after a call.
+      if (Class != IC_RetainRV) {
+        S.RRI.IsRetainBlock = Class == IC_RetainBlock;
+        Retains[Inst] = S.RRI;
+      }
+      S.ClearSequenceProgress();
+      break;
+    case S_None:
+      break;
+    case S_Retain:
+      llvm_unreachable("bottom-up pointer in retain state!");
+    }
+    return NestingDetected;
+  }
+  case IC_AutoreleasepoolPop:
+    // Conservatively, clear MyStates for all known pointers.
+    MyStates.clearBottomUpPointers();
+    return NestingDetected;
+  case IC_AutoreleasepoolPush:
+  case IC_None:
+    // These are irrelevant.
+    return NestingDetected;
+  default:
+    break;
+  }
+
+  // Consider any other possible effects of this instruction on each
+  // pointer being tracked.
+  for (BBState::ptr_iterator MI = MyStates.bottom_up_ptr_begin(),
+       ME = MyStates.bottom_up_ptr_end(); MI != ME; ++MI) {
+    const Value *Ptr = MI->first;
+    if (Ptr == Arg)
+      continue; // Handled above.
+    PtrState &S = MI->second;
+    Sequence Seq = S.GetSeq();
+
+    // Check for possible releases.
+    if (CanAlterRefCount(Inst, Ptr, PA, Class)) {
+      S.ClearRefCount();
+      switch (Seq) {
+      case S_Use:
+        S.SetSeq(S_CanRelease);
+        continue;
+      case S_CanRelease:
+      case S_Release:
+      case S_MovableRelease:
+      case S_Stop:
+      case S_None:
+        break;
+      case S_Retain:
+        llvm_unreachable("bottom-up pointer in retain state!");
+      }
+    }
+
+    // Check for possible direct uses.
+    switch (Seq) {
+    case S_Release:
+    case S_MovableRelease:
+      if (CanUse(Inst, Ptr, PA, Class)) {
+        assert(S.RRI.ReverseInsertPts.empty());
+        // If this is an invoke instruction, we're scanning it as part of
+        // one of its successor blocks, since we can't insert code after it
+        // in its own block, and we don't want to split critical edges.
+        if (isa<InvokeInst>(Inst))
+          S.RRI.ReverseInsertPts.insert(BB->getFirstInsertionPt());
+        else
+          S.RRI.ReverseInsertPts.insert(llvm::next(BasicBlock::iterator(Inst)));
+        S.SetSeq(S_Use);
+      } else if (Seq == S_Release &&
+                 (Class == IC_User || Class == IC_CallOrUser)) {
+        // Non-movable releases depend on any possible objc pointer use.
+        S.SetSeq(S_Stop);
+        assert(S.RRI.ReverseInsertPts.empty());
+        // As above; handle invoke specially.
+        if (isa<InvokeInst>(Inst))
+          S.RRI.ReverseInsertPts.insert(BB->getFirstInsertionPt());
+        else
+          S.RRI.ReverseInsertPts.insert(llvm::next(BasicBlock::iterator(Inst)));
+      }
+      break;
+    case S_Stop:
+      if (CanUse(Inst, Ptr, PA, Class))
+        S.SetSeq(S_Use);
+      break;
+    case S_CanRelease:
+    case S_Use:
+    case S_None:
+      break;
+    case S_Retain:
+      llvm_unreachable("bottom-up pointer in retain state!");
+    }
+  }
+
+  return NestingDetected;
+}
+
+bool
+ObjCARCOpt::VisitBottomUp(BasicBlock *BB,
+                          DenseMap<const BasicBlock *, BBState> &BBStates,
+                          MapVector<Value *, RRInfo> &Retains) {
+  bool NestingDetected = false;
+  BBState &MyStates = BBStates[BB];
+
+  // Merge the states from each successor to compute the initial state
+  // for the current block.
+  BBState::edge_iterator SI(MyStates.succ_begin()),
+                         SE(MyStates.succ_end());
+  if (SI != SE) {
+    const BasicBlock *Succ = *SI;
+    DenseMap<const BasicBlock *, BBState>::iterator I = BBStates.find(Succ);
+    assert(I != BBStates.end());
+    MyStates.InitFromSucc(I->second);
+    ++SI;
+    for (; SI != SE; ++SI) {
+      Succ = *SI;
+      I = BBStates.find(Succ);
+      assert(I != BBStates.end());
+      MyStates.MergeSucc(I->second);
+    }
+  }
+
+  // Visit all the instructions, bottom-up.
+  for (BasicBlock::iterator I = BB->end(), E = BB->begin(); I != E; --I) {
+    Instruction *Inst = llvm::prior(I);
+
+    // Invoke instructions are visited as part of their successors (below).
+    if (isa<InvokeInst>(Inst))
+      continue;
+
+    DEBUG(dbgs() << "ObjCARCOpt::VisitBottomUp: Visiting " << *Inst << "\n");
+
+    NestingDetected |= VisitInstructionBottomUp(Inst, BB, Retains, MyStates);
+  }
+
+  // If there's a predecessor with an invoke, visit the invoke as if it were
+  // part of this block, since we can't insert code after an invoke in its own
+  // block, and we don't want to split critical edges.
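+  // A hypothetical illustration (block and value names invented for this
+  // sketch, not taken from the patch):
+  //   %x = invoke i8* @make() to label %cont unwind label %lpad
+  // Nothing can be inserted after the invoke inside its own block, so any
+  // reverse insertion point involving %x must live at the front of %cont or
+  // %lpad, which is why the loop below re-scans a predecessor's invoke as if
+  // it were part of this block.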
+  for (BBState::edge_iterator PI(MyStates.pred_begin()),
+       PE(MyStates.pred_end()); PI != PE; ++PI) {
+    BasicBlock *Pred = *PI;
+    if (InvokeInst *II = dyn_cast<InvokeInst>(&Pred->back()))
+      NestingDetected |= VisitInstructionBottomUp(II, BB, Retains, MyStates);
+  }
+
+  return NestingDetected;
+}
+
+bool
+ObjCARCOpt::VisitInstructionTopDown(Instruction *Inst,
+                                    DenseMap<Value *, RRInfo> &Releases,
+                                    BBState &MyStates) {
+  bool NestingDetected = false;
+  InstructionClass Class = GetInstructionClass(Inst);
+  const Value *Arg = 0;
+
+  switch (Class) {
+  case IC_RetainBlock:
+    // An objc_retainBlock call with just a use may need to be kept,
+    // because it may be copying a block from the stack to the heap.
+    if (!IsRetainBlockOptimizable(Inst))
+      break;
+    // FALLTHROUGH
+  case IC_Retain:
+  case IC_RetainRV: {
+    Arg = GetObjCArg(Inst);
+
+    PtrState &S = MyStates.getPtrTopDownState(Arg);
+
+    // Don't do retain+release tracking for IC_RetainRV, because it's
+    // better to let it remain as the first instruction after a call.
+    if (Class != IC_RetainRV) {
+      // Check for two retains in a row on the same pointer. If we see one,
+      // make a note, and we'll circle back to revisit it after we've
+      // hopefully eliminated the second retain, which may allow us to
+      // eliminate the first retain too.
+      // Theoretically we could implement removal of nested retain+release
+      // pairs by making PtrState hold a stack of states, but this is
+      // simple and avoids adding overhead for the non-nested case.
+      if (S.GetSeq() == S_Retain)
+        NestingDetected = true;
+
+      S.ResetSequenceProgress(S_Retain);
+      S.RRI.IsRetainBlock = Class == IC_RetainBlock;
+      S.RRI.KnownSafe = S.IsKnownIncremented();
+      S.RRI.Calls.insert(Inst);
+    }
+
+    S.SetKnownPositiveRefCount();
+
+    // A retain can be a potential use; proceed to the generic checking
+    // code below.
+    break;
+  }
+  case IC_Release: {
+    Arg = GetObjCArg(Inst);
+
+    PtrState &S = MyStates.getPtrTopDownState(Arg);
+    S.ClearRefCount();
+
+    switch (S.GetSeq()) {
+    case S_Retain:
+    case S_CanRelease:
+      S.RRI.ReverseInsertPts.clear();
+      // FALL THROUGH
+    case S_Use:
+      S.RRI.ReleaseMetadata = Inst->getMetadata(ImpreciseReleaseMDKind);
+      S.RRI.IsTailCallRelease = cast<CallInst>(Inst)->isTailCall();
+      Releases[Inst] = S.RRI;
+      S.ClearSequenceProgress();
+      break;
+    case S_None:
+      break;
+    case S_Stop:
+    case S_Release:
+    case S_MovableRelease:
+      llvm_unreachable("top-down pointer in release state!");
+    }
+    break;
+  }
+  case IC_AutoreleasepoolPop:
+    // Conservatively, clear MyStates for all known pointers.
+    MyStates.clearTopDownPointers();
+    return NestingDetected;
+  case IC_AutoreleasepoolPush:
+  case IC_None:
+    // These are irrelevant.
+    return NestingDetected;
+  default:
+    break;
+  }
+
+  // Consider any other possible effects of this instruction on each
+  // pointer being tracked.
+  for (BBState::ptr_iterator MI = MyStates.top_down_ptr_begin(),
+       ME = MyStates.top_down_ptr_end(); MI != ME; ++MI) {
+    const Value *Ptr = MI->first;
+    if (Ptr == Arg)
+      continue; // Handled above.
+    PtrState &S = MI->second;
+    Sequence Seq = S.GetSeq();
+
+    // Check for possible releases.
+    if (CanAlterRefCount(Inst, Ptr, PA, Class)) {
+      S.ClearRefCount();
+      switch (Seq) {
+      case S_Retain:
+        S.SetSeq(S_CanRelease);
+        assert(S.RRI.ReverseInsertPts.empty());
+        S.RRI.ReverseInsertPts.insert(Inst);
+
+        // One call can't cause a transition from S_Retain to S_CanRelease
+        // and S_CanRelease to S_Use. If we've made the first transition,
+        // we're done.
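+        // (Hypothetical example: if "call void @opaque()" just moved this
+        // pointer from S_Retain to S_CanRelease, the same call must not also
+        // be treated as a use that advances the state on to S_Use, hence the
+        // continue.)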
+ continue; + case S_Use: + case S_CanRelease: + case S_None: + break; + case S_Stop: + case S_Release: + case S_MovableRelease: + llvm_unreachable("top-down pointer in release state!"); + } + } + + // Check for possible direct uses. + switch (Seq) { + case S_CanRelease: + if (CanUse(Inst, Ptr, PA, Class)) + S.SetSeq(S_Use); + break; + case S_Retain: + case S_Use: + case S_None: + break; + case S_Stop: + case S_Release: + case S_MovableRelease: + llvm_unreachable("top-down pointer in release state!"); + } + } + + return NestingDetected; +} + +bool +ObjCARCOpt::VisitTopDown(BasicBlock *BB, + DenseMap<const BasicBlock *, BBState> &BBStates, + DenseMap<Value *, RRInfo> &Releases) { + bool NestingDetected = false; + BBState &MyStates = BBStates[BB]; + + // Merge the states from each predecessor to compute the initial state + // for the current block. + BBState::edge_iterator PI(MyStates.pred_begin()), + PE(MyStates.pred_end()); + if (PI != PE) { + const BasicBlock *Pred = *PI; + DenseMap<const BasicBlock *, BBState>::iterator I = BBStates.find(Pred); + assert(I != BBStates.end()); + MyStates.InitFromPred(I->second); + ++PI; + for (; PI != PE; ++PI) { + Pred = *PI; + I = BBStates.find(Pred); + assert(I != BBStates.end()); + MyStates.MergePred(I->second); + } + } + + // Visit all the instructions, top-down. + for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { + Instruction *Inst = I; + + DEBUG(dbgs() << "ObjCARCOpt::VisitTopDown: Visiting " << *Inst << "\n"); + + NestingDetected |= VisitInstructionTopDown(Inst, Releases, MyStates); + } + + CheckForCFGHazards(BB, BBStates, MyStates); + return NestingDetected; +} + +static void +ComputePostOrders(Function &F, + SmallVectorImpl<BasicBlock *> &PostOrder, + SmallVectorImpl<BasicBlock *> &ReverseCFGPostOrder, + unsigned NoObjCARCExceptionsMDKind, + DenseMap<const BasicBlock *, BBState> &BBStates) { + /// The visited set, for doing DFS walks. + SmallPtrSet<BasicBlock *, 16> Visited; + + // Do DFS, computing the PostOrder. + SmallPtrSet<BasicBlock *, 16> OnStack; + SmallVector<std::pair<BasicBlock *, succ_iterator>, 16> SuccStack; + + // Functions always have exactly one entry block, and we don't have + // any other block that we treat like an entry block. + BasicBlock *EntryBB = &F.getEntryBlock(); + BBState &MyStates = BBStates[EntryBB]; + MyStates.SetAsEntry(); + TerminatorInst *EntryTI = cast<TerminatorInst>(&EntryBB->back()); + SuccStack.push_back(std::make_pair(EntryBB, succ_iterator(EntryTI))); + Visited.insert(EntryBB); + OnStack.insert(EntryBB); + do { + dfs_next_succ: + BasicBlock *CurrBB = SuccStack.back().first; + TerminatorInst *TI = cast<TerminatorInst>(&CurrBB->back()); + succ_iterator SE(TI, false); + + while (SuccStack.back().second != SE) { + BasicBlock *SuccBB = *SuccStack.back().second++; + if (Visited.insert(SuccBB)) { + TerminatorInst *TI = cast<TerminatorInst>(&SuccBB->back()); + SuccStack.push_back(std::make_pair(SuccBB, succ_iterator(TI))); + BBStates[CurrBB].addSucc(SuccBB); + BBState &SuccStates = BBStates[SuccBB]; + SuccStates.addPred(CurrBB); + OnStack.insert(SuccBB); + goto dfs_next_succ; + } + + if (!OnStack.count(SuccBB)) { + BBStates[CurrBB].addSucc(SuccBB); + BBStates[SuccBB].addPred(CurrBB); + } + } + OnStack.erase(CurrBB); + PostOrder.push_back(CurrBB); + SuccStack.pop_back(); + } while (!SuccStack.empty()); + + Visited.clear(); + + // Do reverse-CFG DFS, computing the reverse-CFG PostOrder. 
+  // Functions may have many exits, and there are also blocks which we treat
+  // as exits due to ignored edges.
+  SmallVector<std::pair<BasicBlock *, BBState::edge_iterator>, 16> PredStack;
+  for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) {
+    BasicBlock *ExitBB = I;
+    BBState &MyStates = BBStates[ExitBB];
+    if (!MyStates.isExit())
+      continue;
+
+    MyStates.SetAsExit();
+
+    PredStack.push_back(std::make_pair(ExitBB, MyStates.pred_begin()));
+    Visited.insert(ExitBB);
+    while (!PredStack.empty()) {
+    reverse_dfs_next_succ:
+      BBState::edge_iterator PE = BBStates[PredStack.back().first].pred_end();
+      while (PredStack.back().second != PE) {
+        BasicBlock *BB = *PredStack.back().second++;
+        if (Visited.insert(BB)) {
+          PredStack.push_back(std::make_pair(BB, BBStates[BB].pred_begin()));
+          goto reverse_dfs_next_succ;
+        }
+      }
+      ReverseCFGPostOrder.push_back(PredStack.pop_back_val().first);
+    }
+  }
+}
+
+// Visit the function both top-down and bottom-up.
+bool
+ObjCARCOpt::Visit(Function &F,
+                  DenseMap<const BasicBlock *, BBState> &BBStates,
+                  MapVector<Value *, RRInfo> &Retains,
+                  DenseMap<Value *, RRInfo> &Releases) {
+
+  // Use reverse-postorder traversals, because we magically know that loops
+  // will be well behaved, i.e. they won't repeatedly call retain on a single
+  // pointer without doing a release. We can't use the ReversePostOrderTraversal
+  // class here because we want the reverse-CFG postorder to consider each
+  // function exit point, and we want to ignore selected cycle edges.
+  SmallVector<BasicBlock *, 16> PostOrder;
+  SmallVector<BasicBlock *, 16> ReverseCFGPostOrder;
+  ComputePostOrders(F, PostOrder, ReverseCFGPostOrder,
+                    NoObjCARCExceptionsMDKind,
+                    BBStates);
+
+  // Use reverse-postorder on the reverse CFG for bottom-up.
+  bool BottomUpNestingDetected = false;
+  for (SmallVectorImpl<BasicBlock *>::const_reverse_iterator I =
+       ReverseCFGPostOrder.rbegin(), E = ReverseCFGPostOrder.rend();
+       I != E; ++I)
+    BottomUpNestingDetected |= VisitBottomUp(*I, BBStates, Retains);
+
+  // Use reverse-postorder for top-down.
+  bool TopDownNestingDetected = false;
+  for (SmallVectorImpl<BasicBlock *>::const_reverse_iterator I =
+       PostOrder.rbegin(), E = PostOrder.rend();
+       I != E; ++I)
+    TopDownNestingDetected |= VisitTopDown(*I, BBStates, Releases);
+
+  return TopDownNestingDetected && BottomUpNestingDetected;
+}
+
+/// Move the calls in RetainsToMove and ReleasesToMove.
+void ObjCARCOpt::MoveCalls(Value *Arg,
+                           RRInfo &RetainsToMove,
+                           RRInfo &ReleasesToMove,
+                           MapVector<Value *, RRInfo> &Retains,
+                           DenseMap<Value *, RRInfo> &Releases,
+                           SmallVectorImpl<Instruction *> &DeadInsts,
+                           Module *M) {
+  Type *ArgTy = Arg->getType();
+  Type *ParamTy = PointerType::getUnqual(Type::getInt8Ty(ArgTy->getContext()));
+
+  // Insert the new retain and release calls.
+  for (SmallPtrSet<Instruction *, 2>::const_iterator
+       PI = ReleasesToMove.ReverseInsertPts.begin(),
+       PE = ReleasesToMove.ReverseInsertPts.end(); PI != PE; ++PI) {
+    Instruction *InsertPt = *PI;
+    Value *MyArg = ArgTy == ParamTy ? Arg :
+                   new BitCastInst(Arg, ParamTy, "", InsertPt);
+    CallInst *Call =
+      CallInst::Create(RetainsToMove.IsRetainBlock ?
+                       getRetainBlockCallee(M) : getRetainCallee(M),
+                       MyArg, "", InsertPt);
+    Call->setDoesNotThrow();
+    if (RetainsToMove.IsRetainBlock)
+      Call->setMetadata(CopyOnEscapeMDKind,
+                        MDNode::get(M->getContext(), ArrayRef<Value *>()));
+    else
+      Call->setTailCall();
+
+    DEBUG(dbgs() << "ObjCARCOpt::MoveCalls: Inserting new Retain: " << *Call
+                 << "\n"
+                    "                       At insertion point: " << *InsertPt
+                 << "\n");
+  }
+  for (SmallPtrSet<Instruction *, 2>::const_iterator
+       PI = RetainsToMove.ReverseInsertPts.begin(),
+       PE = RetainsToMove.ReverseInsertPts.end(); PI != PE; ++PI) {
+    Instruction *InsertPt = *PI;
+    Value *MyArg = ArgTy == ParamTy ? Arg :
+                   new BitCastInst(Arg, ParamTy, "", InsertPt);
+    CallInst *Call = CallInst::Create(getReleaseCallee(M), MyArg,
+                                      "", InsertPt);
+    // Attach a clang.imprecise_release metadata tag, if appropriate.
+    if (MDNode *M = ReleasesToMove.ReleaseMetadata)
+      Call->setMetadata(ImpreciseReleaseMDKind, M);
+    Call->setDoesNotThrow();
+    if (ReleasesToMove.IsTailCallRelease)
+      Call->setTailCall();
+
+    DEBUG(dbgs() << "ObjCARCOpt::MoveCalls: Inserting new Release: " << *Call
+                 << "\n"
+                    "                       At insertion point: " << *InsertPt
+                 << "\n");
+  }
+
+  // Delete the original retain and release calls.
+  for (SmallPtrSet<Instruction *, 2>::const_iterator
+       AI = RetainsToMove.Calls.begin(),
+       AE = RetainsToMove.Calls.end(); AI != AE; ++AI) {
+    Instruction *OrigRetain = *AI;
+    Retains.blot(OrigRetain);
+    DeadInsts.push_back(OrigRetain);
+    DEBUG(dbgs() << "ObjCARCOpt::MoveCalls: Deleting retain: " << *OrigRetain <<
+          "\n");
+  }
+  for (SmallPtrSet<Instruction *, 2>::const_iterator
+       AI = ReleasesToMove.Calls.begin(),
+       AE = ReleasesToMove.Calls.end(); AI != AE; ++AI) {
+    Instruction *OrigRelease = *AI;
+    Releases.erase(OrigRelease);
+    DeadInsts.push_back(OrigRelease);
+    DEBUG(dbgs() << "ObjCARCOpt::MoveCalls: Deleting release: " << *OrigRelease
+                 << "\n");
+  }
+}
+
+bool
+ObjCARCOpt::ConnectTDBUTraversals(DenseMap<const BasicBlock *, BBState>
+                                    &BBStates,
+                                  MapVector<Value *, RRInfo> &Retains,
+                                  DenseMap<Value *, RRInfo> &Releases,
+                                  Module *M,
+                                  SmallVector<Instruction *, 4> &NewRetains,
+                                  SmallVector<Instruction *, 4> &NewReleases,
+                                  SmallVector<Instruction *, 8> &DeadInsts,
+                                  RRInfo &RetainsToMove,
+                                  RRInfo &ReleasesToMove,
+                                  Value *Arg,
+                                  bool KnownSafe,
+                                  bool &AnyPairsCompletelyEliminated) {
+  // If a pair happens in a region where it is known that the reference count
+  // is already incremented, we can similarly ignore possible decrements.
+  bool KnownSafeTD = true, KnownSafeBU = true;
+
+  // Connect the dots between the top-down-collected RetainsToMove and
+  // bottom-up-collected ReleasesToMove to form sets of related calls.
+  // This is an iterative process so that we connect multiple releases
+  // to multiple retains if needed.
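+  // A hypothetical illustration of the path counting used below (block names
+  // invented for this sketch): in a diamond CFG, entry -> {left, right} ->
+  // exit, a retain in entry covers 2 paths and a release in exit also covers
+  // 2 paths, so the OldDelta/NewDelta bookkeeping only balances when each
+  // call is weighted by its block's GetAllPathCount().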
+  unsigned OldDelta = 0;
+  unsigned NewDelta = 0;
+  unsigned OldCount = 0;
+  unsigned NewCount = 0;
+  bool FirstRelease = true;
+  bool FirstRetain = true;
+  for (;;) {
+    for (SmallVectorImpl<Instruction *>::const_iterator
+         NI = NewRetains.begin(), NE = NewRetains.end(); NI != NE; ++NI) {
+      Instruction *NewRetain = *NI;
+      MapVector<Value *, RRInfo>::const_iterator It = Retains.find(NewRetain);
+      assert(It != Retains.end());
+      const RRInfo &NewRetainRRI = It->second;
+      KnownSafeTD &= NewRetainRRI.KnownSafe;
+      for (SmallPtrSet<Instruction *, 2>::const_iterator
+           LI = NewRetainRRI.Calls.begin(),
+           LE = NewRetainRRI.Calls.end(); LI != LE; ++LI) {
+        Instruction *NewRetainRelease = *LI;
+        DenseMap<Value *, RRInfo>::const_iterator Jt =
+          Releases.find(NewRetainRelease);
+        if (Jt == Releases.end())
+          return false;
+        const RRInfo &NewRetainReleaseRRI = Jt->second;
+        assert(NewRetainReleaseRRI.Calls.count(NewRetain));
+        if (ReleasesToMove.Calls.insert(NewRetainRelease)) {
+          OldDelta -=
+            BBStates[NewRetainRelease->getParent()].GetAllPathCount();
+
+          // Merge the ReleaseMetadata and IsTailCallRelease values.
+          if (FirstRelease) {
+            ReleasesToMove.ReleaseMetadata =
+              NewRetainReleaseRRI.ReleaseMetadata;
+            ReleasesToMove.IsTailCallRelease =
+              NewRetainReleaseRRI.IsTailCallRelease;
+            FirstRelease = false;
+          } else {
+            if (ReleasesToMove.ReleaseMetadata !=
+                NewRetainReleaseRRI.ReleaseMetadata)
+              ReleasesToMove.ReleaseMetadata = 0;
+            if (ReleasesToMove.IsTailCallRelease !=
+                NewRetainReleaseRRI.IsTailCallRelease)
+              ReleasesToMove.IsTailCallRelease = false;
+          }
+
+          // Collect the optimal insertion points.
+          if (!KnownSafe)
+            for (SmallPtrSet<Instruction *, 2>::const_iterator
+                 RI = NewRetainReleaseRRI.ReverseInsertPts.begin(),
+                 RE = NewRetainReleaseRRI.ReverseInsertPts.end();
+                 RI != RE; ++RI) {
+              Instruction *RIP = *RI;
+              if (ReleasesToMove.ReverseInsertPts.insert(RIP))
+                NewDelta -= BBStates[RIP->getParent()].GetAllPathCount();
+            }
+          NewReleases.push_back(NewRetainRelease);
+        }
+      }
+    }
+    NewRetains.clear();
+    if (NewReleases.empty()) break;
+
+    // Back the other way.
+    for (SmallVectorImpl<Instruction *>::const_iterator
+         NI = NewReleases.begin(), NE = NewReleases.end(); NI != NE; ++NI) {
+      Instruction *NewRelease = *NI;
+      DenseMap<Value *, RRInfo>::const_iterator It =
+        Releases.find(NewRelease);
+      assert(It != Releases.end());
+      const RRInfo &NewReleaseRRI = It->second;
+      KnownSafeBU &= NewReleaseRRI.KnownSafe;
+      for (SmallPtrSet<Instruction *, 2>::const_iterator
+           LI = NewReleaseRRI.Calls.begin(),
+           LE = NewReleaseRRI.Calls.end(); LI != LE; ++LI) {
+        Instruction *NewReleaseRetain = *LI;
+        MapVector<Value *, RRInfo>::const_iterator Jt =
+          Retains.find(NewReleaseRetain);
+        if (Jt == Retains.end())
+          return false;
+        const RRInfo &NewReleaseRetainRRI = Jt->second;
+        assert(NewReleaseRetainRRI.Calls.count(NewRelease));
+        if (RetainsToMove.Calls.insert(NewReleaseRetain)) {
+          unsigned PathCount =
+            BBStates[NewReleaseRetain->getParent()].GetAllPathCount();
+          OldDelta += PathCount;
+          OldCount += PathCount;
+
+          // Merge the IsRetainBlock values.
+          if (FirstRetain) {
+            RetainsToMove.IsRetainBlock = NewReleaseRetainRRI.IsRetainBlock;
+            FirstRetain = false;
+          } else if (RetainsToMove.IsRetainBlock !=
+                     NewReleaseRetainRRI.IsRetainBlock)
+            // It's not possible to merge the sequences if one uses
+            // objc_retain and the other uses objc_retainBlock.
+            return false;
+
+          // Collect the optimal insertion points.
+ if (!KnownSafe) + for (SmallPtrSet<Instruction *, 2>::const_iterator + RI = NewReleaseRetainRRI.ReverseInsertPts.begin(), + RE = NewReleaseRetainRRI.ReverseInsertPts.end(); + RI != RE; ++RI) { + Instruction *RIP = *RI; + if (RetainsToMove.ReverseInsertPts.insert(RIP)) { + PathCount = BBStates[RIP->getParent()].GetAllPathCount(); + NewDelta += PathCount; + NewCount += PathCount; + } + } + NewRetains.push_back(NewReleaseRetain); + } + } + } + NewReleases.clear(); + if (NewRetains.empty()) break; + } + + // If the pointer is known incremented or nested, we can safely delete the + // pair regardless of what's between them. + if (KnownSafeTD || KnownSafeBU) { + RetainsToMove.ReverseInsertPts.clear(); + ReleasesToMove.ReverseInsertPts.clear(); + NewCount = 0; + } else { + // Determine whether the new insertion points we computed preserve the + // balance of retain and release calls through the program. + // TODO: If the fully aggressive solution isn't valid, try to find a + // less aggressive solution which is. + if (NewDelta != 0) + return false; + } + + // Determine whether the original call points are balanced in the retain and + // release calls through the program. If not, conservatively don't touch + // them. + // TODO: It's theoretically possible to do code motion in this case, as + // long as the existing imbalances are maintained. + if (OldDelta != 0) + return false; + + Changed = true; + assert(OldCount != 0 && "Unreachable code?"); + NumRRs += OldCount - NewCount; + // Set to true if we completely removed any RR pairs. + AnyPairsCompletelyEliminated = NewCount == 0; + + // We can move calls! + return true; +} + +/// Identify pairings between the retains and releases, and delete and/or move +/// them. +bool +ObjCARCOpt::PerformCodePlacement(DenseMap<const BasicBlock *, BBState> + &BBStates, + MapVector<Value *, RRInfo> &Retains, + DenseMap<Value *, RRInfo> &Releases, + Module *M) { + bool AnyPairsCompletelyEliminated = false; + RRInfo RetainsToMove; + RRInfo ReleasesToMove; + SmallVector<Instruction *, 4> NewRetains; + SmallVector<Instruction *, 4> NewReleases; + SmallVector<Instruction *, 8> DeadInsts; + + // Visit each retain. + for (MapVector<Value *, RRInfo>::const_iterator I = Retains.begin(), + E = Retains.end(); I != E; ++I) { + Value *V = I->first; + if (!V) continue; // blotted + + Instruction *Retain = cast<Instruction>(V); + + DEBUG(dbgs() << "ObjCARCOpt::PerformCodePlacement: Visiting: " << *Retain + << "\n"); + + Value *Arg = GetObjCArg(Retain); + + // If the object being released is in static or stack storage, we know it's + // not being managed by ObjC reference counting, so we can delete pairs + // regardless of what possible decrements or uses lie between them. + bool KnownSafe = isa<Constant>(Arg) || isa<AllocaInst>(Arg); + + // A constant pointer can't be pointing to an object on the heap. It may + // be reference-counted, but it won't be deleted. + if (const LoadInst *LI = dyn_cast<LoadInst>(Arg)) + if (const GlobalVariable *GV = + dyn_cast<GlobalVariable>( + StripPointerCastsAndObjCCalls(LI->getPointerOperand()))) + if (GV->isConstant()) + KnownSafe = true; + + // Connect the dots between the top-down-collected RetainsToMove and + // bottom-up-collected ReleasesToMove to form sets of related calls. 
+    NewRetains.push_back(Retain);
+    bool PerformMoveCalls =
+      ConnectTDBUTraversals(BBStates, Retains, Releases, M, NewRetains,
+                            NewReleases, DeadInsts, RetainsToMove,
+                            ReleasesToMove, Arg, KnownSafe,
+                            AnyPairsCompletelyEliminated);
+
+    if (PerformMoveCalls) {
+      // Ok, everything checks out and we're all set. Let's move/delete some
+      // code!
+      MoveCalls(Arg, RetainsToMove, ReleasesToMove,
+                Retains, Releases, DeadInsts, M);
+    }
+
+    // Clean up state for next retain.
+    NewReleases.clear();
+    NewRetains.clear();
+    RetainsToMove.clear();
+    ReleasesToMove.clear();
+  }
+
+  // Now that we're done moving everything, we can delete the newly dead
+  // instructions, as we no longer need them as insert points.
+  while (!DeadInsts.empty())
+    EraseInstruction(DeadInsts.pop_back_val());
+
+  return AnyPairsCompletelyEliminated;
+}
+
+/// Weak pointer optimizations.
+void ObjCARCOpt::OptimizeWeakCalls(Function &F) {
+  // First, do memdep-style redundant load elimination (RLE) and
+  // store-to-load forwarding (S2L) optimizations. We can't use memdep itself
+  // because it uses AliasAnalysis and we need to do provenance queries
+  // instead.
+  for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) {
+    Instruction *Inst = &*I++;
+
+    DEBUG(dbgs() << "ObjCARCOpt::OptimizeWeakCalls: Visiting: " << *Inst <<
+          "\n");
+
+    InstructionClass Class = GetBasicInstructionClass(Inst);
+    if (Class != IC_LoadWeak && Class != IC_LoadWeakRetained)
+      continue;
+
+    // Delete objc_loadWeak calls with no users.
+    if (Class == IC_LoadWeak && Inst->use_empty()) {
+      Inst->eraseFromParent();
+      continue;
+    }
+
+    // TODO: For now, just look for an earlier available version of this value
+    // within the same block. Theoretically, we could do memdep-style non-local
+    // analysis too, but that would require caching. A better approach would be
+    // to use the technique that EarlyCSE uses.
+    inst_iterator Current = llvm::prior(I);
+    BasicBlock *CurrentBB = Current.getBasicBlockIterator();
+    for (BasicBlock::iterator B = CurrentBB->begin(),
+         J = Current.getInstructionIterator();
+         J != B; --J) {
+      Instruction *EarlierInst = &*llvm::prior(J);
+      InstructionClass EarlierClass = GetInstructionClass(EarlierInst);
+      switch (EarlierClass) {
+      case IC_LoadWeak:
+      case IC_LoadWeakRetained: {
+        // If this is loading from the same pointer, replace this load's value
+        // with that one.
+        CallInst *Call = cast<CallInst>(Inst);
+        CallInst *EarlierCall = cast<CallInst>(EarlierInst);
+        Value *Arg = Call->getArgOperand(0);
+        Value *EarlierArg = EarlierCall->getArgOperand(0);
+        switch (PA.getAA()->alias(Arg, EarlierArg)) {
+        case AliasAnalysis::MustAlias:
+          Changed = true;
+          // If the load has a builtin retain, insert a plain retain for it.
+          if (Class == IC_LoadWeakRetained) {
+            CallInst *CI =
+              CallInst::Create(getRetainCallee(F.getParent()), EarlierCall,
+                               "", Call);
+            CI->setTailCall();
+          }
+          // Zap the fully redundant load.
+          Call->replaceAllUsesWith(EarlierCall);
+          Call->eraseFromParent();
+          goto clobbered;
+        case AliasAnalysis::MayAlias:
+        case AliasAnalysis::PartialAlias:
+          goto clobbered;
+        case AliasAnalysis::NoAlias:
+          break;
+        }
+        break;
+      }
+      case IC_StoreWeak:
+      case IC_InitWeak: {
+        // If this is storing to the same pointer and has the same size etc.
+        // replace this load's value with the stored value.
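+        // A hypothetical IR sketch of the forwarding performed below (names
+        // invented for this sketch):
+        //   call i8* @objc_storeWeak(i8** %p, i8* %v)
+        //   %w = call i8* @objc_loadWeak(i8** %p)   ; %p known MustAlias
+        // lets %w be replaced by %v, the storeWeak's second argument.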
+        CallInst *Call = cast<CallInst>(Inst);
+        CallInst *EarlierCall = cast<CallInst>(EarlierInst);
+        Value *Arg = Call->getArgOperand(0);
+        Value *EarlierArg = EarlierCall->getArgOperand(0);
+        switch (PA.getAA()->alias(Arg, EarlierArg)) {
+        case AliasAnalysis::MustAlias:
+          Changed = true;
+          // If the load has a builtin retain, insert a plain retain for it.
+          if (Class == IC_LoadWeakRetained) {
+            CallInst *CI =
+              CallInst::Create(getRetainCallee(F.getParent()), EarlierCall,
+                               "", Call);
+            CI->setTailCall();
+          }
+          // Zap the fully redundant load.
+          Call->replaceAllUsesWith(EarlierCall->getArgOperand(1));
+          Call->eraseFromParent();
+          goto clobbered;
+        case AliasAnalysis::MayAlias:
+        case AliasAnalysis::PartialAlias:
+          goto clobbered;
+        case AliasAnalysis::NoAlias:
+          break;
+        }
+        break;
+      }
+      case IC_MoveWeak:
+      case IC_CopyWeak:
+        // TODO: Grab the copied value.
+        goto clobbered;
+      case IC_AutoreleasepoolPush:
+      case IC_None:
+      case IC_User:
+        // Weak pointers are only modified through the weak entry points
+        // (and arbitrary calls, which could call the weak entry points).
+        break;
+      default:
+        // Anything else could modify the weak pointer.
+        goto clobbered;
+      }
+    }
+  clobbered:;
+  }
+
+  // Then, for each destroyWeak with an alloca operand, check to see if
+  // the alloca and all its users can be zapped.
+  for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) {
+    Instruction *Inst = &*I++;
+    InstructionClass Class = GetBasicInstructionClass(Inst);
+    if (Class != IC_DestroyWeak)
+      continue;
+
+    CallInst *Call = cast<CallInst>(Inst);
+    Value *Arg = Call->getArgOperand(0);
+    if (AllocaInst *Alloca = dyn_cast<AllocaInst>(Arg)) {
+      for (Value::use_iterator UI = Alloca->use_begin(),
+           UE = Alloca->use_end(); UI != UE; ++UI) {
+        const Instruction *UserInst = cast<Instruction>(*UI);
+        switch (GetBasicInstructionClass(UserInst)) {
+        case IC_InitWeak:
+        case IC_StoreWeak:
+        case IC_DestroyWeak:
+          continue;
+        default:
+          goto done;
+        }
+      }
+      Changed = true;
+      for (Value::use_iterator UI = Alloca->use_begin(),
+           UE = Alloca->use_end(); UI != UE; ) {
+        CallInst *UserInst = cast<CallInst>(*UI++);
+        switch (GetBasicInstructionClass(UserInst)) {
+        case IC_InitWeak:
+        case IC_StoreWeak:
+          // These functions return their second argument.
+          UserInst->replaceAllUsesWith(UserInst->getArgOperand(1));
+          break;
+        case IC_DestroyWeak:
+          // No return value.
+          break;
+        default:
+          llvm_unreachable("alloca really is used!");
+        }
+        UserInst->eraseFromParent();
+      }
+      Alloca->eraseFromParent();
+    done:;
+    }
+  }
+
+  DEBUG(dbgs() << "ObjCARCOpt::OptimizeWeakCalls: Finished List.\n\n");
+
+}
+
+/// Identify program paths which execute sequences of retains and releases
+/// that can be eliminated.
+bool ObjCARCOpt::OptimizeSequences(Function &F) {
+  /// Releases, Retains - These are used to store the results of the main flow
+  /// analysis. These use Value* as the key instead of Instruction* so that the
+  /// map stays valid when we get around to rewriting code and calls get
+  /// replaced by arguments.
+  DenseMap<Value *, RRInfo> Releases;
+  MapVector<Value *, RRInfo> Retains;
+
+  /// This is used during the traversal of the function to track the
+  /// states for each identified object at each block.
+  DenseMap<const BasicBlock *, BBState> BBStates;
+
+  // Analyze the CFG of the function, and all instructions.
+  bool NestingDetected = Visit(F, BBStates, Retains, Releases);
+
+  // Transform.
+  return PerformCodePlacement(BBStates, Retains, Releases, F.getParent()) &&
+         NestingDetected;
+}
+
+/// Look for this pattern:
+/// \code
+///    %call = call i8* @something(...)
+///    %2 = call i8* @objc_retain(i8* %call)
+///    %3 = call i8* @objc_autorelease(i8* %2)
+///    ret i8* %3
+/// \endcode
+/// And delete the retain and autorelease.
+///
+/// Otherwise if it's just this:
+/// \code
+///    %3 = call i8* @objc_autorelease(i8* %2)
+///    ret i8* %3
+/// \endcode
+/// convert the autorelease to autoreleaseRV.
+void ObjCARCOpt::OptimizeReturns(Function &F) {
+  if (!F.getReturnType()->isPointerTy())
+    return;
+
+  SmallPtrSet<Instruction *, 4> DependingInstructions;
+  SmallPtrSet<const BasicBlock *, 4> Visited;
+  for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI) {
+    BasicBlock *BB = FI;
+    ReturnInst *Ret = dyn_cast<ReturnInst>(&BB->back());
+    if (!Ret) continue;
+
+    DEBUG(dbgs() << "ObjCARCOpt::OptimizeReturns: Visiting: " << *Ret << "\n");
+
+    const Value *Arg = StripPointerCastsAndObjCCalls(Ret->getOperand(0));
+    FindDependencies(NeedsPositiveRetainCount, Arg,
+                     BB, Ret, DependingInstructions, Visited, PA);
+    if (DependingInstructions.size() != 1)
+      goto next_block;
+
+    {
+      CallInst *Autorelease =
+        dyn_cast_or_null<CallInst>(*DependingInstructions.begin());
+      if (!Autorelease)
+        goto next_block;
+      InstructionClass AutoreleaseClass = GetBasicInstructionClass(Autorelease);
+      if (!IsAutorelease(AutoreleaseClass))
+        goto next_block;
+      if (GetObjCArg(Autorelease) != Arg)
+        goto next_block;
+
+      DependingInstructions.clear();
+      Visited.clear();
+
+      // Check that there is nothing that can affect the reference
+      // count between the autorelease and the retain.
+      FindDependencies(CanChangeRetainCount, Arg,
+                       BB, Autorelease, DependingInstructions, Visited, PA);
+      if (DependingInstructions.size() != 1)
+        goto next_block;
+
+      {
+        CallInst *Retain =
+          dyn_cast_or_null<CallInst>(*DependingInstructions.begin());
+
+        // Check that we found a retain with the same argument.
+        if (!Retain ||
+            !IsRetain(GetBasicInstructionClass(Retain)) ||
+            GetObjCArg(Retain) != Arg)
+          goto next_block;
+
+        DependingInstructions.clear();
+        Visited.clear();
+
+        // Convert the autorelease to an autoreleaseRV, since it's
+        // returning the value.
+        if (AutoreleaseClass == IC_Autorelease) {
+          DEBUG(dbgs() << "ObjCARCOpt::OptimizeReturns: Converting autorelease "
+                          "=> autoreleaseRV since it's returning a value.\n"
+                          "                             In: " << *Autorelease
+                       << "\n");
+          Autorelease->setCalledFunction(getAutoreleaseRVCallee(F.getParent()));
+          DEBUG(dbgs() << "                             Out: " << *Autorelease
+                       << "\n");
+          Autorelease->setTailCall(); // Always tail call autoreleaseRV.
+          AutoreleaseClass = IC_AutoreleaseRV;
+        }
+
+        // Check that there is nothing that can affect the reference
+        // count between the retain and the call.
+        // Note that Retain need not be in BB.
+        FindDependencies(CanChangeRetainCount, Arg, Retain->getParent(), Retain,
+                         DependingInstructions, Visited, PA);
+        if (DependingInstructions.size() != 1)
+          goto next_block;
+
+        {
+          CallInst *Call =
+            dyn_cast_or_null<CallInst>(*DependingInstructions.begin());
+
+          // Check that the pointer is the return value of the call.
+          if (!Call || Arg != Call)
+            goto next_block;
+
+          // Check that the call is a regular call.
+          InstructionClass Class = GetBasicInstructionClass(Call);
+          if (Class != IC_CallOrUser && Class != IC_Call)
+            goto next_block;
+
+          // If so, we can zap the retain and autorelease.
+          Changed = true;
+          ++NumRets;
+          DEBUG(dbgs() << "ObjCARCOpt::OptimizeReturns: Erasing: " << *Retain
+                       << "\n                             Erasing: "
+                       << *Autorelease << "\n");
+          EraseInstruction(Retain);
+          EraseInstruction(Autorelease);
+        }
+      }
+    }
+
+  next_block:
+    DependingInstructions.clear();
+    Visited.clear();
+  }
+
+  DEBUG(dbgs() << "ObjCARCOpt::OptimizeReturns: Finished List.\n\n");
+
+}
+
+bool ObjCARCOpt::doInitialization(Module &M) {
+  if (!EnableARCOpts)
+    return false;
+
+  // If nothing in the Module uses ARC, don't do anything.
+  Run = ModuleHasARC(M);
+  if (!Run)
+    return false;
+
+  // Identify the imprecise release metadata kind.
+  ImpreciseReleaseMDKind =
+    M.getContext().getMDKindID("clang.imprecise_release");
+  CopyOnEscapeMDKind =
+    M.getContext().getMDKindID("clang.arc.copy_on_escape");
+  NoObjCARCExceptionsMDKind =
+    M.getContext().getMDKindID("clang.arc.no_objc_arc_exceptions");
+
+  // Intuitively, objc_retain and others are nocapture, however in practice
+  // they are not, because they return their argument value. And objc_release
+  // calls finalizers which can have arbitrary side effects.
+
+  // These are initialized lazily.
+  RetainRVCallee = 0;
+  AutoreleaseRVCallee = 0;
+  ReleaseCallee = 0;
+  RetainCallee = 0;
+  RetainBlockCallee = 0;
+  AutoreleaseCallee = 0;
+
+  return false;
+}
+
+bool ObjCARCOpt::runOnFunction(Function &F) {
+  if (!EnableARCOpts)
+    return false;
+
+  // If nothing in the Module uses ARC, don't do anything.
+  if (!Run)
+    return false;
+
+  Changed = false;
+
+  DEBUG(dbgs() << "ObjCARCOpt: Visiting Function: " << F.getName() << "\n");
+
+  PA.setAA(&getAnalysis<AliasAnalysis>());
+
+  // This pass performs several distinct transformations. As a compile-time aid
+  // when compiling code that isn't ObjC, skip these if the relevant ObjC
+  // library functions aren't declared.
+
+  // Preliminary optimizations. This also computes UsedInThisFunction.
+  OptimizeIndividualCalls(F);
+
+  // Optimizations for weak pointers.
+  if (UsedInThisFunction & ((1 << IC_LoadWeak) |
+                            (1 << IC_LoadWeakRetained) |
+                            (1 << IC_StoreWeak) |
+                            (1 << IC_InitWeak) |
+                            (1 << IC_CopyWeak) |
+                            (1 << IC_MoveWeak) |
+                            (1 << IC_DestroyWeak)))
+    OptimizeWeakCalls(F);
+
+  // Optimizations for retain+release pairs.
+  if (UsedInThisFunction & ((1 << IC_Retain) |
+                            (1 << IC_RetainRV) |
+                            (1 << IC_RetainBlock)))
+    if (UsedInThisFunction & (1 << IC_Release))
+      // Run OptimizeSequences until it either stops making changes or
+      // no retain+release pair nesting is detected.
+      while (OptimizeSequences(F)) {}
+
+  // Optimizations if objc_autorelease is used.
+  if (UsedInThisFunction & ((1 << IC_Autorelease) |
+                            (1 << IC_AutoreleaseRV)))
+    OptimizeReturns(F);
+
+  DEBUG(dbgs() << "\n");
+
+  return Changed;
+}
+
+void ObjCARCOpt::releaseMemory() {
+  PA.clear();
+}
+
+/// @}
+///
diff --git a/lib/Transforms/ObjCARC/ObjCARCUtil.cpp b/lib/Transforms/ObjCARC/ObjCARCUtil.cpp
new file mode 100644
index 0000000..a841c64
--- /dev/null
+++ b/lib/Transforms/ObjCARC/ObjCARCUtil.cpp
@@ -0,0 +1,241 @@
+//===- ObjCARCUtil.cpp - ObjC ARC Optimization --------*- mode: c++ -*-----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file defines several utility functions used by various ARC
+/// optimizations which are too big to be in a header file.
+///
+/// WARNING: This file knows about certain library functions.
It recognizes them +/// by name, and hardwires knowledge of their semantics. +/// +/// WARNING: This file knows about how certain Objective-C library functions are +/// used. Naive LLVM IR transformations which would otherwise be +/// behavior-preserving may break these assumptions. +/// +//===----------------------------------------------------------------------===// + +#include "ObjCARC.h" +#include "llvm/IR/Intrinsics.h" + +using namespace llvm; +using namespace llvm::objcarc; + +raw_ostream &llvm::objcarc::operator<<(raw_ostream &OS, + const InstructionClass Class) { + switch (Class) { + case IC_Retain: + return OS << "IC_Retain"; + case IC_RetainRV: + return OS << "IC_RetainRV"; + case IC_RetainBlock: + return OS << "IC_RetainBlock"; + case IC_Release: + return OS << "IC_Release"; + case IC_Autorelease: + return OS << "IC_Autorelease"; + case IC_AutoreleaseRV: + return OS << "IC_AutoreleaseRV"; + case IC_AutoreleasepoolPush: + return OS << "IC_AutoreleasepoolPush"; + case IC_AutoreleasepoolPop: + return OS << "IC_AutoreleasepoolPop"; + case IC_NoopCast: + return OS << "IC_NoopCast"; + case IC_FusedRetainAutorelease: + return OS << "IC_FusedRetainAutorelease"; + case IC_FusedRetainAutoreleaseRV: + return OS << "IC_FusedRetainAutoreleaseRV"; + case IC_LoadWeakRetained: + return OS << "IC_LoadWeakRetained"; + case IC_StoreWeak: + return OS << "IC_StoreWeak"; + case IC_InitWeak: + return OS << "IC_InitWeak"; + case IC_LoadWeak: + return OS << "IC_LoadWeak"; + case IC_MoveWeak: + return OS << "IC_MoveWeak"; + case IC_CopyWeak: + return OS << "IC_CopyWeak"; + case IC_DestroyWeak: + return OS << "IC_DestroyWeak"; + case IC_StoreStrong: + return OS << "IC_StoreStrong"; + case IC_CallOrUser: + return OS << "IC_CallOrUser"; + case IC_Call: + return OS << "IC_Call"; + case IC_User: + return OS << "IC_User"; + case IC_None: + return OS << "IC_None"; + } + llvm_unreachable("Unknown instruction class!"); +} + +InstructionClass llvm::objcarc::GetFunctionClass(const Function *F) { + Function::const_arg_iterator AI = F->arg_begin(), AE = F->arg_end(); + + // No arguments. + if (AI == AE) + return StringSwitch<InstructionClass>(F->getName()) + .Case("objc_autoreleasePoolPush", IC_AutoreleasepoolPush) + .Default(IC_CallOrUser); + + // One argument. + const Argument *A0 = AI++; + if (AI == AE) + // Argument is a pointer. + if (PointerType *PTy = dyn_cast<PointerType>(A0->getType())) { + Type *ETy = PTy->getElementType(); + // Argument is i8*. 
+      if (ETy->isIntegerTy(8))
+        return StringSwitch<InstructionClass>(F->getName())
+          .Case("objc_retain", IC_Retain)
+          .Case("objc_retainAutoreleasedReturnValue", IC_RetainRV)
+          .Case("objc_retainBlock", IC_RetainBlock)
+          .Case("objc_release", IC_Release)
+          .Case("objc_autorelease", IC_Autorelease)
+          .Case("objc_autoreleaseReturnValue", IC_AutoreleaseRV)
+          .Case("objc_autoreleasePoolPop", IC_AutoreleasepoolPop)
+          .Case("objc_retainedObject", IC_NoopCast)
+          .Case("objc_unretainedObject", IC_NoopCast)
+          .Case("objc_unretainedPointer", IC_NoopCast)
+          .Case("objc_retain_autorelease", IC_FusedRetainAutorelease)
+          .Case("objc_retainAutorelease", IC_FusedRetainAutorelease)
+          .Case("objc_retainAutoreleaseReturnValue", IC_FusedRetainAutoreleaseRV)
+          .Default(IC_CallOrUser);
+
+      // Argument is i8**.
+      if (PointerType *Pte = dyn_cast<PointerType>(ETy))
+        if (Pte->getElementType()->isIntegerTy(8))
+          return StringSwitch<InstructionClass>(F->getName())
+            .Case("objc_loadWeakRetained", IC_LoadWeakRetained)
+            .Case("objc_loadWeak", IC_LoadWeak)
+            .Case("objc_destroyWeak", IC_DestroyWeak)
+            .Default(IC_CallOrUser);
+    }
+
+  // Two arguments, first is i8**.
+  const Argument *A1 = AI++;
+  if (AI == AE)
+    if (PointerType *PTy = dyn_cast<PointerType>(A0->getType()))
+      if (PointerType *Pte = dyn_cast<PointerType>(PTy->getElementType()))
+        if (Pte->getElementType()->isIntegerTy(8))
+          if (PointerType *PTy1 = dyn_cast<PointerType>(A1->getType())) {
+            Type *ETy1 = PTy1->getElementType();
+            // Second argument is i8*.
+            if (ETy1->isIntegerTy(8))
+              return StringSwitch<InstructionClass>(F->getName())
+                .Case("objc_storeWeak", IC_StoreWeak)
+                .Case("objc_initWeak", IC_InitWeak)
+                .Case("objc_storeStrong", IC_StoreStrong)
+                .Default(IC_CallOrUser);
+            // Second argument is i8**.
+            if (PointerType *Pte1 = dyn_cast<PointerType>(ETy1))
+              if (Pte1->getElementType()->isIntegerTy(8))
+                return StringSwitch<InstructionClass>(F->getName())
+                  .Case("objc_moveWeak", IC_MoveWeak)
+                  .Case("objc_copyWeak", IC_CopyWeak)
+                  .Default(IC_CallOrUser);
+          }
+
+  // Anything else.
+  return IC_CallOrUser;
+}
+
+/// \brief Determine what kind of construct V is.
+InstructionClass
+llvm::objcarc::GetInstructionClass(const Value *V) {
+  if (const Instruction *I = dyn_cast<Instruction>(V)) {
+    // Any instruction other than bitcast and gep with a pointer operand has a
+    // use of an objc pointer. Bitcasts, GEPs, Selects, PHIs transfer a pointer
+    // to a subsequent use, rather than using it themselves, in this sense.
+    // As a short cut, several other opcodes are known to have no pointer
+    // operands of interest. And ret is never followed by a release, so it's
+    // not interesting to examine.
+    switch (I->getOpcode()) {
+    case Instruction::Call: {
+      const CallInst *CI = cast<CallInst>(I);
+      // Check for calls to special functions.
+      if (const Function *F = CI->getCalledFunction()) {
+        InstructionClass Class = GetFunctionClass(F);
+        if (Class != IC_CallOrUser)
+          return Class;
+
+        // None of the intrinsic functions do objc_release. For intrinsics, the
+        // only question is whether or not they may be users.
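+        // For instance, a call to @llvm.dbg.value lands in the IC_None cases
+        // below, so debug intrinsics can never perturb how surrounding ARC
+        // operations are classified. (Illustrative note; the authoritative
+        // list is the switch that follows.)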
+ switch (F->getIntrinsicID()) { + case Intrinsic::returnaddress: case Intrinsic::frameaddress: + case Intrinsic::stacksave: case Intrinsic::stackrestore: + case Intrinsic::vastart: case Intrinsic::vacopy: case Intrinsic::vaend: + case Intrinsic::objectsize: case Intrinsic::prefetch: + case Intrinsic::stackprotector: + case Intrinsic::eh_return_i32: case Intrinsic::eh_return_i64: + case Intrinsic::eh_typeid_for: case Intrinsic::eh_dwarf_cfa: + case Intrinsic::eh_sjlj_lsda: case Intrinsic::eh_sjlj_functioncontext: + case Intrinsic::init_trampoline: case Intrinsic::adjust_trampoline: + case Intrinsic::lifetime_start: case Intrinsic::lifetime_end: + case Intrinsic::invariant_start: case Intrinsic::invariant_end: + // Don't let dbg info affect our results. + case Intrinsic::dbg_declare: case Intrinsic::dbg_value: + // Short cut: Some intrinsics obviously don't use ObjC pointers. + return IC_None; + default: + break; + } + } + return GetCallSiteClass(CI); + } + case Instruction::Invoke: + return GetCallSiteClass(cast<InvokeInst>(I)); + case Instruction::BitCast: + case Instruction::GetElementPtr: + case Instruction::Select: case Instruction::PHI: + case Instruction::Ret: case Instruction::Br: + case Instruction::Switch: case Instruction::IndirectBr: + case Instruction::Alloca: case Instruction::VAArg: + case Instruction::Add: case Instruction::FAdd: + case Instruction::Sub: case Instruction::FSub: + case Instruction::Mul: case Instruction::FMul: + case Instruction::SDiv: case Instruction::UDiv: case Instruction::FDiv: + case Instruction::SRem: case Instruction::URem: case Instruction::FRem: + case Instruction::Shl: case Instruction::LShr: case Instruction::AShr: + case Instruction::And: case Instruction::Or: case Instruction::Xor: + case Instruction::SExt: case Instruction::ZExt: case Instruction::Trunc: + case Instruction::IntToPtr: case Instruction::FCmp: + case Instruction::FPTrunc: case Instruction::FPExt: + case Instruction::FPToUI: case Instruction::FPToSI: + case Instruction::UIToFP: case Instruction::SIToFP: + case Instruction::InsertElement: case Instruction::ExtractElement: + case Instruction::ShuffleVector: + case Instruction::ExtractValue: + break; + case Instruction::ICmp: + // Comparing a pointer with null, or any other constant, isn't an + // interesting use, because we don't care what the pointer points to, or + // about the values of any other dynamic reference-counted pointers. + if (IsPotentialRetainableObjPtr(I->getOperand(1))) + return IC_User; + break; + default: + // For anything else, check all the operands. + // Note that this includes both operands of a Store: while the first + // operand isn't actually being dereferenced, it is being stored to + // memory where we can no longer track who might read it and dereference + // it, so we have to consider it potentially used. + for (User::const_op_iterator OI = I->op_begin(), OE = I->op_end(); + OI != OE; ++OI) + if (IsPotentialRetainableObjPtr(*OI)) + return IC_User; + } + } + + // Otherwise, it's totally inert for ARC purposes. + return IC_None; +} diff --git a/lib/Transforms/ObjCARC/ProvenanceAnalysis.cpp b/lib/Transforms/ObjCARC/ProvenanceAnalysis.cpp new file mode 100644 index 0000000..ae3c628 --- /dev/null +++ b/lib/Transforms/ObjCARC/ProvenanceAnalysis.cpp @@ -0,0 +1,177 @@ +//===- ProvenanceAnalysis.cpp - ObjC ARC Optimization ---------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file defines a special form of Alias Analysis called ``Provenance
+/// Analysis''. The word ``provenance'' refers to the history of the ownership
+/// of an object. Thus ``Provenance Analysis'' is an analysis which attempts to
+/// use various techniques to determine if two locally distinct pointer values
+/// could share the same provenance source, and thus could be related.
+///
+/// WARNING: This file knows about certain library functions. It recognizes them
+/// by name, and hardwires knowledge of their semantics.
+///
+/// WARNING: This file knows about how certain Objective-C library functions are
+/// used. Naive LLVM IR transformations which would otherwise be
+/// behavior-preserving may break these assumptions.
+///
+//===----------------------------------------------------------------------===//
+
+#include "ObjCARC.h"
+#include "ProvenanceAnalysis.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+
+using namespace llvm;
+using namespace llvm::objcarc;
+
+bool ProvenanceAnalysis::relatedSelect(const SelectInst *A,
+                                       const Value *B) {
+  // If the values are Selects with the same condition, we can do a more precise
+  // check: just check for relations between the values on corresponding arms.
+  if (const SelectInst *SB = dyn_cast<SelectInst>(B))
+    if (A->getCondition() == SB->getCondition())
+      return related(A->getTrueValue(), SB->getTrueValue()) ||
+             related(A->getFalseValue(), SB->getFalseValue());
+
+  // Check both arms of the Select node individually.
+  return related(A->getTrueValue(), B) ||
+         related(A->getFalseValue(), B);
+}
+
+bool ProvenanceAnalysis::relatedPHI(const PHINode *A,
+                                    const Value *B) {
+  // If the values are PHIs in the same block, we can do a more precise as well
+  // as efficient check: just check for relations between the values on
+  // corresponding edges.
+  if (const PHINode *PNB = dyn_cast<PHINode>(B))
+    if (PNB->getParent() == A->getParent()) {
+      for (unsigned i = 0, e = A->getNumIncomingValues(); i != e; ++i)
+        if (related(A->getIncomingValue(i),
+                    PNB->getIncomingValueForBlock(A->getIncomingBlock(i))))
+          return true;
+      return false;
+    }
+
+  // Check each unique source of the PHI node against B.
+  SmallPtrSet<const Value *, 4> UniqueSrc;
+  for (unsigned i = 0, e = A->getNumIncomingValues(); i != e; ++i) {
+    const Value *PV1 = A->getIncomingValue(i);
+    if (UniqueSrc.insert(PV1) && related(PV1, B))
+      return true;
+  }
+
+  // All of the arms checked out.
+  return false;
+}
+
+/// Test if the value of P, or any value covered by its provenance, is ever
+/// stored within the function (not counting callees).
+static bool IsStoredObjCPointer(const Value *P) {
+  SmallPtrSet<const Value *, 8> Visited;
+  SmallVector<const Value *, 8> Worklist;
+  Worklist.push_back(P);
+  Visited.insert(P);
+  do {
+    P = Worklist.pop_back_val();
+    for (Value::const_use_iterator UI = P->use_begin(), UE = P->use_end();
+         UI != UE; ++UI) {
+      const User *Ur = *UI;
+      if (isa<StoreInst>(Ur)) {
+        if (UI.getOperandNo() == 0)
+          // The pointer is stored.
+          return true;
+        // The pointer is stored through.
+        continue;
+      }
+      if (isa<CallInst>(Ur))
+        // The pointer is passed as an argument, ignore this.
+        continue;
+      if (isa<PtrToIntInst>(P))
+        // Assume the worst.
+        return true;
+      if (Visited.insert(Ur))
+        Worklist.push_back(Ur);
+    }
+  } while (!Worklist.empty());
+
+  // Everything checked out.
+  return false;
+}
+
+bool ProvenanceAnalysis::relatedCheck(const Value *A,
+                                      const Value *B) {
+  // Skip past provenance pass-throughs.
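+  // (Illustrative, hypothetical example: a bitcast of an alloca and an
+  // objc_retain applied to that same bitcast both reduce to the alloca here,
+  // since retain-like calls forward their argument, so the quick A == B
+  // check below can fire.)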
+  A = GetUnderlyingObjCPtr(A);
+  B = GetUnderlyingObjCPtr(B);
+
+  // Quick check.
+  if (A == B)
+    return true;
+
+  // Ask regular AliasAnalysis, for a first approximation.
+  switch (AA->alias(A, B)) {
+  case AliasAnalysis::NoAlias:
+    return false;
+  case AliasAnalysis::MustAlias:
+  case AliasAnalysis::PartialAlias:
+    return true;
+  case AliasAnalysis::MayAlias:
+    break;
+  }
+
+  bool AIsIdentified = IsObjCIdentifiedObject(A);
+  bool BIsIdentified = IsObjCIdentifiedObject(B);
+
+  // An ObjC-Identified object can't alias a load if it is never locally stored.
+  if (AIsIdentified) {
+    // Check for an obvious escape.
+    if (isa<LoadInst>(B))
+      return IsStoredObjCPointer(A);
+    if (BIsIdentified) {
+      // Check for an obvious escape.
+      if (isa<LoadInst>(A))
+        return IsStoredObjCPointer(B);
+      // Both pointers are identified and escapes aren't an evident problem.
+      return false;
+    }
+  } else if (BIsIdentified) {
+    // Check for an obvious escape.
+    if (isa<LoadInst>(A))
+      return IsStoredObjCPointer(B);
+  }
+
+  // Special handling for PHI and Select.
+  if (const PHINode *PN = dyn_cast<PHINode>(A))
+    return relatedPHI(PN, B);
+  if (const PHINode *PN = dyn_cast<PHINode>(B))
+    return relatedPHI(PN, A);
+  if (const SelectInst *S = dyn_cast<SelectInst>(A))
+    return relatedSelect(S, B);
+  if (const SelectInst *S = dyn_cast<SelectInst>(B))
+    return relatedSelect(S, A);
+
+  // Conservative.
+  return true;
+}
+
+bool ProvenanceAnalysis::related(const Value *A,
+                                 const Value *B) {
+  // Begin by inserting a conservative value into the map. If the insertion
+  // fails, we have the answer already. If it succeeds, leave it there until we
+  // compute the real answer to guard against recursive queries.
+  if (A > B) std::swap(A, B);
+  std::pair<CachedResultsTy::iterator, bool> Pair =
+    CachedResults.insert(std::make_pair(ValuePairTy(A, B), true));
+  if (!Pair.second)
+    return Pair.first->second;
+
+  bool Result = relatedCheck(A, B);
+  CachedResults[ValuePairTy(A, B)] = Result;
+  return Result;
+}
diff --git a/lib/Transforms/ObjCARC/ProvenanceAnalysis.h b/lib/Transforms/ObjCARC/ProvenanceAnalysis.h
new file mode 100644
index 0000000..ec449fd
--- /dev/null
+++ b/lib/Transforms/ObjCARC/ProvenanceAnalysis.h
@@ -0,0 +1,80 @@
+//===- ProvenanceAnalysis.h - ObjC ARC Optimization ---*- mode: c++ -*-----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file declares a special form of Alias Analysis called ``Provenance
+/// Analysis''. The word ``provenance'' refers to the history of the ownership
+/// of an object. Thus ``Provenance Analysis'' is an analysis which attempts to
+/// use various techniques to determine if two locally distinct pointer values
+/// could share the same provenance source, and thus could be related.
+///
+/// WARNING: This file knows about certain library functions. It recognizes them
+/// by name, and hardwires knowledge of their semantics.
+///
+/// WARNING: This file knows about how certain Objective-C library functions are
+/// used. Naive LLVM IR transformations which would otherwise be
+/// behavior-preserving may break these assumptions.
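+///
+/// As a rough, illustrative example: an allocation-derived ("identified")
+/// object that is never stored to memory within a function cannot be the
+/// value of any load in that function, so a MayAlias answer from ordinary
+/// alias analysis can sometimes be sharpened to "unrelated" here.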
+/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_OBJCARC_PROVENANCEANALYSIS_H +#define LLVM_TRANSFORMS_OBJCARC_PROVENANCEANALYSIS_H + +#include "llvm/ADT/DenseMap.h" + +namespace llvm { + class Value; + class AliasAnalysis; + class PHINode; + class SelectInst; +} + +namespace llvm { +namespace objcarc { + +/// \brief This is similar to BasicAliasAnalysis, and it uses many of the same +/// techniques, except it uses special ObjC-specific reasoning about pointer +/// relationships. +/// +/// In this context ``Provenance'' is defined as the history of an object's +/// ownership. Thus ``Provenance Analysis'' is defined by using the notion of +/// an ``independent provenance source'' of a pointer to determine whether or +/// not two pointers have the same provenance source and thus could +/// potentially be related. +class ProvenanceAnalysis { + AliasAnalysis *AA; + + typedef std::pair<const Value *, const Value *> ValuePairTy; + typedef DenseMap<ValuePairTy, bool> CachedResultsTy; + CachedResultsTy CachedResults; + + bool relatedCheck(const Value *A, const Value *B); + bool relatedSelect(const SelectInst *A, const Value *B); + bool relatedPHI(const PHINode *A, const Value *B); + + void operator=(const ProvenanceAnalysis &) LLVM_DELETED_FUNCTION; + ProvenanceAnalysis(const ProvenanceAnalysis &) LLVM_DELETED_FUNCTION; + +public: + ProvenanceAnalysis() {} + + void setAA(AliasAnalysis *aa) { AA = aa; } + + AliasAnalysis *getAA() const { return AA; } + + bool related(const Value *A, const Value *B); + + void clear() { + CachedResults.clear(); + } +}; + +} // end namespace objcarc +} // end namespace llvm + +#endif // LLVM_TRANSFORMS_OBJCARC_PROVENANCEANALYSIS_H diff --git a/lib/Transforms/Scalar/CMakeLists.txt b/lib/Transforms/Scalar/CMakeLists.txt index b3fc6e3..fd55e08 100644 --- a/lib/Transforms/Scalar/CMakeLists.txt +++ b/lib/Transforms/Scalar/CMakeLists.txt @@ -21,7 +21,6 @@ add_llvm_library(LLVMScalarOpts LoopUnswitch.cpp LowerAtomic.cpp MemCpyOptimizer.cpp - ObjCARC.cpp Reassociate.cpp Reg2Mem.cpp SCCP.cpp diff --git a/lib/Transforms/Scalar/CodeGenPrepare.cpp b/lib/Transforms/Scalar/CodeGenPrepare.cpp index d513c96..d71dd5d 100644 --- a/lib/Transforms/Scalar/CodeGenPrepare.cpp +++ b/lib/Transforms/Scalar/CodeGenPrepare.cpp @@ -729,9 +729,9 @@ bool CodeGenPrepare::DupRetToEnableTailCallOpts(BasicBlock *BB) { // It's not safe to eliminate the sign / zero extension of the return value. // See llvm::isInTailCallPosition(). const Function *F = BB->getParent(); - Attribute CallerRetAttr = F->getAttributes().getRetAttributes(); - if (CallerRetAttr.hasAttribute(Attribute::ZExt) || - CallerRetAttr.hasAttribute(Attribute::SExt)) + AttributeSet CallerAttrs = F->getAttributes(); + if (CallerAttrs.hasAttribute(AttributeSet::ReturnIndex, Attribute::ZExt) || + CallerAttrs.hasAttribute(AttributeSet::ReturnIndex, Attribute::SExt)) return false; // Make sure there are no instructions between the PHI and return, or that the @@ -788,10 +788,10 @@ bool CodeGenPrepare::DupRetToEnableTailCallOpts(BasicBlock *BB) { // Conservatively require the attributes of the call to match those of the // return. Ignore noalias because it doesn't affect the call sequence. - Attribute CalleeRetAttr = CS.getAttributes().getRetAttributes(); - if (AttrBuilder(CalleeRetAttr). + AttributeSet CalleeAttrs = CS.getAttributes(); + if (AttrBuilder(CalleeAttrs, AttributeSet::ReturnIndex). removeAttribute(Attribute::NoAlias) != - AttrBuilder(CallerRetAttr). 
+ AttrBuilder(CalleeAttrs, AttributeSet::ReturnIndex). removeAttribute(Attribute::NoAlias)) continue; diff --git a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp index 4c3631b..995782e 100644 --- a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp +++ b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp @@ -21,6 +21,8 @@ #include "llvm/IR/Instructions.h" #include "llvm/Pass.h" #include "llvm/Support/CFG.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/Local.h" using namespace llvm; @@ -97,12 +99,29 @@ bool CorrelatedValuePropagation::processPHI(PHINode *P) { Value *Incoming = P->getIncomingValue(i); if (isa<Constant>(Incoming)) continue; - Constant *C = LVI->getConstantOnEdge(P->getIncomingValue(i), - P->getIncomingBlock(i), - BB); - if (!C) continue; + Value *V = LVI->getConstantOnEdge(Incoming, P->getIncomingBlock(i), BB); - P->setIncomingValue(i, C); + // Look if the incoming value is a select with a constant but LVI tells us + // that the incoming value can never be that constant. In that case replace + // the incoming value with the other value of the select. This often allows + // us to remove the select later. + if (!V) { + SelectInst *SI = dyn_cast<SelectInst>(Incoming); + if (!SI) continue; + + Constant *C = dyn_cast<Constant>(SI->getFalseValue()); + if (!C) continue; + + if (LVI->getPredicateOnEdge(ICmpInst::ICMP_EQ, SI, C, + P->getIncomingBlock(i), BB) != + LazyValueInfo::False) + continue; + + DEBUG(dbgs() << "CVP: Threading PHI over " << *SI << '\n'); + V = SI->getTrueValue(); + } + + P->setIncomingValue(i, V); Changed = true; } diff --git a/lib/Transforms/Scalar/DeadStoreElimination.cpp b/lib/Transforms/Scalar/DeadStoreElimination.cpp index fe3acbf..57432c7 100644 --- a/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -376,10 +376,10 @@ static OverwriteResult isOverwrite(const AliasAnalysis::Location &Later, // Check to see if the later store is to the entire object (either a global, // an alloca, or a byval argument). If so, then it clearly overwrites any // other store to the same object. - const DataLayout &TD = *AA.getDataLayout(); + const DataLayout *TD = AA.getDataLayout(); - const Value *UO1 = GetUnderlyingObject(P1, &TD), - *UO2 = GetUnderlyingObject(P2, &TD); + const Value *UO1 = GetUnderlyingObject(P1, TD), + *UO2 = GetUnderlyingObject(P2, TD); // If we can't resolve the same pointers to the same object, then we can't // analyze them at all. diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp index 14201b9..c04b447 100644 --- a/lib/Transforms/Scalar/GVN.cpp +++ b/lib/Transforms/Scalar/GVN.cpp @@ -849,8 +849,8 @@ static int AnalyzeLoadFromClobberingWrite(Type *LoadTy, Value *LoadPtr, return -1; int64_t StoreOffset = 0, LoadOffset = 0; - Value *StoreBase = GetPointerBaseWithConstantOffset(WritePtr, StoreOffset,TD); - Value *LoadBase = GetPointerBaseWithConstantOffset(LoadPtr, LoadOffset, TD); + Value *StoreBase = GetPointerBaseWithConstantOffset(WritePtr,StoreOffset,&TD); + Value *LoadBase = GetPointerBaseWithConstantOffset(LoadPtr, LoadOffset, &TD); if (StoreBase != LoadBase) return -1; @@ -945,7 +945,7 @@ static int AnalyzeLoadFromClobberingLoad(Type *LoadTy, Value *LoadPtr, // then we should widen it! 
int64_t LoadOffs = 0; const Value *LoadBase = - GetPointerBaseWithConstantOffset(LoadPtr, LoadOffs, TD); + GetPointerBaseWithConstantOffset(LoadPtr, LoadOffs, &TD); unsigned LoadSize = TD.getTypeStoreSize(LoadTy); unsigned Size = MemoryDependenceAnalysis:: @@ -1526,10 +1526,8 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { BasicBlock *LoadBB = LI->getParent(); BasicBlock *TmpBB = LoadBB; - bool isSinglePred = false; bool allSingleSucc = true; while (TmpBB->getSinglePredecessor()) { - isSinglePred = true; TmpBB = TmpBB->getSinglePredecessor(); if (TmpBB == LoadBB) // Infinite (unreachable) loop. return false; @@ -1548,28 +1546,6 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { assert(TmpBB); LoadBB = TmpBB; - // FIXME: It is extremely unclear what this loop is doing, other than - // artificially restricting loadpre. - if (isSinglePred) { - bool isHot = false; - for (unsigned i = 0, e = ValuesPerBlock.size(); i != e; ++i) { - const AvailableValueInBlock &AV = ValuesPerBlock[i]; - if (AV.isSimpleValue()) - // "Hot" Instruction is in some loop (because it dominates its dep. - // instruction). - if (Instruction *I = dyn_cast<Instruction>(AV.getSimpleValue())) - if (DT->dominates(LI, I)) { - isHot = true; - break; - } - } - - // We are interested only in "hot" instructions. We don't want to do any - // mis-optimizations here. - if (!isHot) - return false; - } - // Check to see how many predecessors have the loaded value fully // available. DenseMap<BasicBlock*, Value*> PredLoads; @@ -2371,8 +2347,8 @@ bool GVN::processBlock(BasicBlock *BB) { E = InstrsToErase.end(); I != E; ++I) { DEBUG(dbgs() << "GVN removed: " << **I << '\n'); if (MD) MD->removeInstruction(*I); - (*I)->eraseFromParent(); DEBUG(verifyRemoved(*I)); + (*I)->eraseFromParent(); } InstrsToErase.clear(); @@ -2389,7 +2365,7 @@ bool GVN::processBlock(BasicBlock *BB) { /// control flow patterns and attempts to perform simple PRE at the join point. bool GVN::performPRE(Function &F) { bool Changed = false; - DenseMap<BasicBlock*, Value*> predMap; + SmallVector<std::pair<Value*, BasicBlock*>, 8> predMap; for (df_iterator<BasicBlock*> DI = df_begin(&F.getEntryBlock()), DE = df_end(&F.getEntryBlock()); DI != DE; ++DI) { BasicBlock *CurrentBlock = *DI; @@ -2445,19 +2421,22 @@ bool GVN::performPRE(Function &F) { if (P == CurrentBlock) { NumWithout = 2; break; - } else if (!DT->dominates(&F.getEntryBlock(), P)) { + } else if (!DT->isReachableFromEntry(P)) { NumWithout = 2; break; } Value* predV = findLeader(P, ValNo); if (predV == 0) { + predMap.push_back(std::make_pair(static_cast<Value *>(0), P)); PREPred = P; ++NumWithout; } else if (predV == CurInst) { + /* CurInst dominates this predecessor. */ NumWithout = 2; + break; } else { - predMap[P] = predV; + predMap.push_back(std::make_pair(predV, P)); ++NumWith; } } @@ -2504,15 +2483,14 @@ bool GVN::performPRE(Function &F) { // the PRE predecessor. This is typically because of loads which // are not value numbered precisely. if (!success) { - delete PREInstr; DEBUG(verifyRemoved(PREInstr)); + delete PREInstr; continue; } PREInstr->insertBefore(PREPred->getTerminator()); PREInstr->setName(CurInst->getName() + ".pre"); PREInstr->setDebugLoc(CurInst->getDebugLoc()); - predMap[PREPred] = PREInstr; VN.add(PREInstr, ValNo); ++NumGVNPRE; @@ -2520,13 +2498,14 @@ bool GVN::performPRE(Function &F) { addToLeaderTable(ValNo, PREInstr, PREPred); // Create a PHI to make the value available in this block. 
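The performPRE changes above swap the DenseMap predMap for a vector of (leader, predecessor) pairs, where a null leader marks the one edge that receives the hoisted instruction. A hedged sketch of the PHI construction this enables, assuming the surrounding GVN types; the helper name is invented:

#include <utility>
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

static PHINode *buildPREPhi(
    Instruction *CurInst, BasicBlock *CurrentBlock,
    const SmallVectorImpl<std::pair<Value *, BasicBlock *> > &PredMap,
    Instruction *PREInstr) {
  PHINode *Phi = PHINode::Create(CurInst->getType(), PredMap.size(),
                                 CurInst->getName() + ".pre-phi",
                                 CurrentBlock->begin());
  for (unsigned i = 0, e = PredMap.size(); i != e; ++i)
    if (Value *V = PredMap[i].first)
      Phi->addIncoming(V, PredMap[i].second);       // existing leader
    else
      Phi->addIncoming(PREInstr, PredMap[i].second); // the PRE predecessor
  return Phi;
}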
- pred_iterator PB = pred_begin(CurrentBlock), PE = pred_end(CurrentBlock); - PHINode* Phi = PHINode::Create(CurInst->getType(), std::distance(PB, PE), + PHINode* Phi = PHINode::Create(CurInst->getType(), predMap.size(), CurInst->getName() + ".pre-phi", CurrentBlock->begin()); - for (pred_iterator PI = PB; PI != PE; ++PI) { - BasicBlock *P = *PI; - Phi->addIncoming(predMap[P], P); + for (unsigned i = 0, e = predMap.size(); i != e; ++i) { + if (Value *V = predMap[i].first) + Phi->addIncoming(V, predMap[i].second); + else + Phi->addIncoming(PREInstr, PREPred); } VN.add(Phi, ValNo); @@ -2551,8 +2530,8 @@ bool GVN::performPRE(Function &F) { DEBUG(dbgs() << "GVN PRE removed: " << *CurInst << '\n'); if (MD) MD->removeInstruction(CurInst); - CurInst->eraseFromParent(); DEBUG(verifyRemoved(CurInst)); + CurInst->eraseFromParent(); Changed = true; } } diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp index dc6bef7..f94cd2a 100644 --- a/lib/Transforms/Scalar/LICM.cpp +++ b/lib/Transforms/Scalar/LICM.cpp @@ -440,13 +440,12 @@ bool LICM::canSinkOrHoistInst(Instruction &I) { } // Only these instructions are hoistable/sinkable. - bool HoistableKind = (isa<BinaryOperator>(I) || isa<CastInst>(I) || - isa<SelectInst>(I) || isa<GetElementPtrInst>(I) || - isa<CmpInst>(I) || isa<InsertElementInst>(I) || - isa<ExtractElementInst>(I) || - isa<ShuffleVectorInst>(I)); - if (!HoistableKind) - return false; + if (!isa<BinaryOperator>(I) && !isa<CastInst>(I) && !isa<SelectInst>(I) && + !isa<GetElementPtrInst>(I) && !isa<CmpInst>(I) && + !isa<InsertElementInst>(I) && !isa<ExtractElementInst>(I) && + !isa<ShuffleVectorInst>(I) && !isa<ExtractValueInst>(I) && + !isa<InsertValueInst>(I)) + return false; return isSafeToExecuteUnconditionally(I); } diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index c4f9012..8258719 100644 --- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -407,7 +407,7 @@ bool NclPopcountRecognize::detectIdiom(Instruction *&CntInst, // step 2: detect instructions corresponding to "x2 = x1 & (x1 - 1)" { - if (DefX2->getOpcode() != Instruction::And) + if (!DefX2 || DefX2->getOpcode() != Instruction::And) return false; BinaryOperator *SubOneOp; diff --git a/lib/Transforms/Scalar/LoopInstSimplify.cpp b/lib/Transforms/Scalar/LoopInstSimplify.cpp index c48808f..a23860a 100644 --- a/lib/Transforms/Scalar/LoopInstSimplify.cpp +++ b/lib/Transforms/Scalar/LoopInstSimplify.cpp @@ -14,6 +14,7 @@ #define DEBUG_TYPE "loop-instsimplify" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopInfo.h" diff --git a/lib/Transforms/Scalar/LoopRotation.cpp b/lib/Transforms/Scalar/LoopRotation.cpp index 0ea80f3..e98ae95 100644 --- a/lib/Transforms/Scalar/LoopRotation.cpp +++ b/lib/Transforms/Scalar/LoopRotation.cpp @@ -18,6 +18,7 @@ #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Function.h" #include "llvm/IR/IntrinsicInst.h" @@ -51,6 +52,7 @@ namespace { AU.addRequiredID(LCSSAID); AU.addPreservedID(LCSSAID); AU.addPreserved<ScalarEvolution>(); + AU.addRequired<TargetTransformInfo>(); } bool runOnLoop(Loop *L, LPPassManager &LPM); @@ -59,11 +61,13 @@ 
namespace { private: LoopInfo *LI; + const TargetTransformInfo *TTI; }; } char LoopRotate::ID = 0; INITIALIZE_PASS_BEGIN(LoopRotate, "loop-rotate", "Rotate Loops", false, false) +INITIALIZE_AG_DEPENDENCY(TargetTransformInfo) INITIALIZE_PASS_DEPENDENCY(LoopInfo) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_DEPENDENCY(LCSSA) @@ -75,6 +79,7 @@ Pass *llvm::createLoopRotatePass() { return new LoopRotate(); } /// the loop is rotated at least once. bool LoopRotate::runOnLoop(Loop *L, LPPassManager &LPM) { LI = &getAnalysis<LoopInfo>(); + TTI = &getAnalysis<TargetTransformInfo>(); // Simplify the loop latch before attempting to rotate the header // upward. Rotation may not be needed if the loop tail can be folded into the @@ -278,7 +283,7 @@ bool LoopRotate::rotateLoop(Loop *L) { // duplicate blocks inside it. { CodeMetrics Metrics; - Metrics.analyzeBasicBlock(OrigHeader); + Metrics.analyzeBasicBlock(OrigHeader, *TTI); if (Metrics.notDuplicatable) { DEBUG(dbgs() << "LoopRotation: NOT rotating - contains non duplicatable" << " instructions: "; L->dump()); diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp index c7b853e..4e4cb86 100644 --- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -58,6 +58,7 @@ #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallBitVector.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/IVUsers.h" #include "llvm/Analysis/LoopPass.h" @@ -237,7 +238,7 @@ struct Formula { /// BaseRegs - The list of "base" registers for this use. When this is /// non-empty, - SmallVector<const SCEV *, 2> BaseRegs; + SmallVector<const SCEV *, 4> BaseRegs; /// ScaledReg - The 'scaled' register for this use. This should be non-null /// when Scale is not zero. @@ -1087,19 +1088,19 @@ namespace { /// UniquifierDenseMapInfo - A DenseMapInfo implementation for holding /// DenseMaps and DenseSets of sorted SmallVectors of const SCEV*. struct UniquifierDenseMapInfo { - static SmallVector<const SCEV *, 2> getEmptyKey() { - SmallVector<const SCEV *, 2> V; + static SmallVector<const SCEV *, 4> getEmptyKey() { + SmallVector<const SCEV *, 4> V; V.push_back(reinterpret_cast<const SCEV *>(-1)); return V; } - static SmallVector<const SCEV *, 2> getTombstoneKey() { - SmallVector<const SCEV *, 2> V; + static SmallVector<const SCEV *, 4> getTombstoneKey() { + SmallVector<const SCEV *, 4> V; V.push_back(reinterpret_cast<const SCEV *>(-2)); return V; } - static unsigned getHashValue(const SmallVector<const SCEV *, 2> &V) { + static unsigned getHashValue(const SmallVector<const SCEV *, 4> &V) { unsigned Result = 0; for (SmallVectorImpl<const SCEV *>::const_iterator I = V.begin(), E = V.end(); I != E; ++I) @@ -1107,8 +1108,8 @@ struct UniquifierDenseMapInfo { return Result; } - static bool isEqual(const SmallVector<const SCEV *, 2> &LHS, - const SmallVector<const SCEV *, 2> &RHS) { + static bool isEqual(const SmallVector<const SCEV *, 4> &LHS, + const SmallVector<const SCEV *, 4> &RHS) { return LHS == RHS; } }; @@ -1119,7 +1120,7 @@ struct UniquifierDenseMapInfo { /// the user itself, and information about how the use may be satisfied. /// TODO: Represent multiple users of the same expression in common? 
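The LoopRotation hunk above, like the LoopUnrollPass and LoopUnswitch hunks further down, adopts one pattern: require TargetTransformInfo and feed it to CodeMetrics instead of an optional DataLayout. A sketch of that pattern with an invented pass; only the calls shown in the hunks are assumed:

#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/TargetTransformInfo.h"
using namespace llvm;

namespace {
// Hypothetical pass: demonstrates only the TTI-backed size check.
struct SizeAwareLoopPass : public LoopPass {
  static char ID;
  SizeAwareLoopPass() : LoopPass(ID) {}

  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
    AU.addRequired<TargetTransformInfo>();  // the new requirement
  }

  virtual bool runOnLoop(Loop *L, LPPassManager &) {
    const TargetTransformInfo &TTI = getAnalysis<TargetTransformInfo>();
    CodeMetrics Metrics;
    for (Loop::block_iterator I = L->block_begin(), E = L->block_end();
         I != E; ++I)
      Metrics.analyzeBasicBlock(*I, TTI);   // TTI replaces the DataLayout*
    // A real pass would gate its transformation on Metrics here.
    return false;                           // sketch makes no IR changes
  }
};
char SizeAwareLoopPass::ID = 0;
}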
class LSRUse { - DenseSet<SmallVector<const SCEV *, 2>, UniquifierDenseMapInfo> Uniquifier; + DenseSet<SmallVector<const SCEV *, 4>, UniquifierDenseMapInfo> Uniquifier; public: /// KindType - An enum for a kind of use, indicating what types of @@ -1178,7 +1179,7 @@ public: /// HasFormula - Test whether this use has a formula which has the same /// registers as the given formula. bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const { - SmallVector<const SCEV *, 2> Key = F.BaseRegs; + SmallVector<const SCEV *, 4> Key = F.BaseRegs; if (F.ScaledReg) Key.push_back(F.ScaledReg); // Unstable sort by host order ok, because this is only used for uniquifying. std::sort(Key.begin(), Key.end()); @@ -1188,7 +1189,7 @@ bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const { /// InsertFormula - If the given formula has not yet been inserted, add it to /// the list, and return true. Return false otherwise. bool LSRUse::InsertFormula(const Formula &F) { - SmallVector<const SCEV *, 2> Key = F.BaseRegs; + SmallVector<const SCEV *, 4> Key = F.BaseRegs; if (F.ScaledReg) Key.push_back(F.ScaledReg); // Unstable sort by host order ok, because this is only used for uniquifying. std::sort(Key.begin(), Key.end()); @@ -2536,6 +2537,7 @@ void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper, // Add this IV user to the end of the chain. IVChainVec[ChainIdx].add(IVInc(UserInst, IVOper, LastIncExpr)); } + IVChain &Chain = IVChainVec[ChainIdx]; SmallPtrSet<Instruction*,4> &NearUsers = ChainUsersVec[ChainIdx].NearUsers; // This chain's NearUsers become FarUsers. @@ -2553,8 +2555,19 @@ void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper, for (Value::use_iterator UseIter = IVOper->use_begin(), UseEnd = IVOper->use_end(); UseIter != UseEnd; ++UseIter) { Instruction *OtherUse = dyn_cast<Instruction>(*UseIter); - if (!OtherUse || OtherUse == UserInst) + if (!OtherUse) continue; + // Uses in the chain will no longer be uses if the chain is formed. + // Include the head of the chain in this iteration (not Chain.begin()). + IVChain::const_iterator IncIter = Chain.Incs.begin(); + IVChain::const_iterator IncEnd = Chain.Incs.end(); + for( ; IncIter != IncEnd; ++IncIter) { + if (IncIter->UserInst == OtherUse) + break; + } + if (IncIter != IncEnd) + continue; + if (SE.isSCEVable(OtherUse->getType()) && !isa<SCEVUnknown>(SE.getSCEV(OtherUse)) && IU.isIVUserOrOperand(OtherUse)) { @@ -2891,7 +2904,6 @@ void LSRInstance::InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx) { Formula F; F.InitialMatch(S, L, SE); - F.HasBaseReg = true; bool Inserted = InsertFormula(LU, LUIdx, F); assert(Inserted && "Initial formula already exists!"); (void)Inserted; } @@ -2903,6 +2915,7 @@ LSRInstance::InsertSupplementalFormula(const SCEV *S, LSRUse &LU, size_t LUIdx) { Formula F; F.BaseRegs.push_back(S); + F.HasBaseReg = true; bool Inserted = InsertFormula(LU, LUIdx, F); assert(Inserted && "Supplemental formula already exists!"); (void)Inserted; } @@ -3656,7 +3669,7 @@ void LSRInstance::FilterOutUndesirableDedicatedRegisters() { // Collect the best formula for each unique set of shared registers. This // is reset for each use.
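The SmallVector<const SCEV *, 2> to <..., 4> widening above only raises the inline capacity of the uniquifying key; the key itself is still the sorted register list. A condensed sketch of the key construction shared by HasFormulaWithSameRegs and InsertFormula, assuming the file-local Formula type:

#include <algorithm>
#include "llvm/ADT/SmallVector.h"
using namespace llvm;

// Key = base registers plus the scaled register, canonicalized by sorting.
// An unstable host-order sort is fine because the key is only used to
// detect duplicate formulae, never for output.
static void makeFormulaKey(const Formula &F,
                           SmallVector<const SCEV *, 4> &Key) {
  Key = F.BaseRegs;
  if (F.ScaledReg)
    Key.push_back(F.ScaledReg);
  std::sort(Key.begin(), Key.end());
}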
- typedef DenseMap<SmallVector<const SCEV *, 2>, size_t, UniquifierDenseMapInfo> + typedef DenseMap<SmallVector<const SCEV *, 4>, size_t, UniquifierDenseMapInfo> BestFormulaeTy; BestFormulaeTy BestFormulae; @@ -3691,7 +3704,7 @@ void LSRInstance::FilterOutUndesirableDedicatedRegisters() { dbgs() << "\n"); } else { - SmallVector<const SCEV *, 2> Key; + SmallVector<const SCEV *, 4> Key; for (SmallVectorImpl<const SCEV *>::const_iterator J = F.BaseRegs.begin(), JE = F.BaseRegs.end(); J != JE; ++J) { const SCEV *Reg = *J; @@ -3837,83 +3850,83 @@ void LSRInstance::NarrowSearchSpaceByDetectingSupersets() { /// for expressions like A, A+1, A+2, etc., allocate a single register for /// them. void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() { - if (EstimateSearchSpaceComplexity() >= ComplexityLimit) { - DEBUG(dbgs() << "The search space is too complex.\n"); + if (EstimateSearchSpaceComplexity() < ComplexityLimit) + return; - DEBUG(dbgs() << "Narrowing the search space by assuming that uses " - "separated by a constant offset will use the same " - "registers.\n"); + DEBUG(dbgs() << "The search space is too complex.\n" + "Narrowing the search space by assuming that uses separated " + "by a constant offset will use the same registers.\n"); - // This is especially useful for unrolled loops. + // This is especially useful for unrolled loops. - for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) { - LSRUse &LU = Uses[LUIdx]; - for (SmallVectorImpl<Formula>::const_iterator I = LU.Formulae.begin(), - E = LU.Formulae.end(); I != E; ++I) { - const Formula &F = *I; - if (F.BaseOffset != 0 && F.Scale == 0) { - if (LSRUse *LUThatHas = FindUseWithSimilarFormula(F, LU)) { - if (reconcileNewOffset(*LUThatHas, F.BaseOffset, - /*HasBaseReg=*/false, - LU.Kind, LU.AccessTy)) { - DEBUG(dbgs() << " Deleting use "; LU.print(dbgs()); - dbgs() << '\n'); - - LUThatHas->AllFixupsOutsideLoop &= LU.AllFixupsOutsideLoop; - - // Update the relocs to reference the new use. - for (SmallVectorImpl<LSRFixup>::iterator I = Fixups.begin(), - E = Fixups.end(); I != E; ++I) { - LSRFixup &Fixup = *I; - if (Fixup.LUIdx == LUIdx) { - Fixup.LUIdx = LUThatHas - &Uses.front(); - Fixup.Offset += F.BaseOffset; - // Add the new offset to LUThatHas' offset list. - if (LUThatHas->Offsets.back() != Fixup.Offset) { - LUThatHas->Offsets.push_back(Fixup.Offset); - if (Fixup.Offset > LUThatHas->MaxOffset) - LUThatHas->MaxOffset = Fixup.Offset; - if (Fixup.Offset < LUThatHas->MinOffset) - LUThatHas->MinOffset = Fixup.Offset; - } - DEBUG(dbgs() << "New fixup has offset " - << Fixup.Offset << '\n'); - } - if (Fixup.LUIdx == NumUses-1) - Fixup.LUIdx = LUIdx; - } + for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) { + LSRUse &LU = Uses[LUIdx]; + for (SmallVectorImpl<Formula>::const_iterator I = LU.Formulae.begin(), + E = LU.Formulae.end(); I != E; ++I) { + const Formula &F = *I; + if (F.BaseOffset == 0 || F.Scale != 0) + continue; - // Delete formulae from the new use which are no longer legal. 
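When NarrowSearchSpaceByCollapsingUnrolledCode folds a use into LUThatHas above, each fixup must be re-pointed at the surviving use and its offset shifted, keeping the offset list and range in sync. A sketch of that bookkeeping, assuming the LSRFixup and LSRUse fields visible in the hunk; the helper is invented:

#include <algorithm>
using namespace llvm;

static void migrateFixup(LSRFixup &Fixup, LSRUse &Dest, size_t DestIdx,
                         int64_t BaseOffset) {
  Fixup.LUIdx = DestIdx;                    // re-point at the surviving use
  Fixup.Offset += BaseOffset;               // fold the formula's base offset
  if (Dest.Offsets.back() != Fixup.Offset) {
    Dest.Offsets.push_back(Fixup.Offset);   // keep the offset list and the
    Dest.MaxOffset = std::max(Dest.MaxOffset, Fixup.Offset);
    Dest.MinOffset = std::min(Dest.MinOffset, Fixup.Offset); // range in sync
  }
}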
- bool Any = false; - for (size_t i = 0, e = LUThatHas->Formulae.size(); i != e; ++i) { - Formula &F = LUThatHas->Formulae[i]; - if (!isLegalUse(TTI, LUThatHas->MinOffset, LUThatHas->MaxOffset, - LUThatHas->Kind, LUThatHas->AccessTy, F)) { - DEBUG(dbgs() << " Deleting "; F.print(dbgs()); - dbgs() << '\n'); - LUThatHas->DeleteFormula(F); - --i; - --e; - Any = true; - } - } - if (Any) - LUThatHas->RecomputeRegs(LUThatHas - &Uses.front(), RegUses); + LSRUse *LUThatHas = FindUseWithSimilarFormula(F, LU); + if (!LUThatHas) + continue; - // Delete the old use. - DeleteUse(LU, LUIdx); - --LUIdx; - --NumUses; - break; - } + if (!reconcileNewOffset(*LUThatHas, F.BaseOffset, /*HasBaseReg=*/ false, + LU.Kind, LU.AccessTy)) + continue; + + DEBUG(dbgs() << " Deleting use "; LU.print(dbgs()); dbgs() << '\n'); + + LUThatHas->AllFixupsOutsideLoop &= LU.AllFixupsOutsideLoop; + + // Update the relocs to reference the new use. + for (SmallVectorImpl<LSRFixup>::iterator I = Fixups.begin(), + E = Fixups.end(); I != E; ++I) { + LSRFixup &Fixup = *I; + if (Fixup.LUIdx == LUIdx) { + Fixup.LUIdx = LUThatHas - &Uses.front(); + Fixup.Offset += F.BaseOffset; + // Add the new offset to LUThatHas' offset list. + if (LUThatHas->Offsets.back() != Fixup.Offset) { + LUThatHas->Offsets.push_back(Fixup.Offset); + if (Fixup.Offset > LUThatHas->MaxOffset) + LUThatHas->MaxOffset = Fixup.Offset; + if (Fixup.Offset < LUThatHas->MinOffset) + LUThatHas->MinOffset = Fixup.Offset; } + DEBUG(dbgs() << "New fixup has offset " << Fixup.Offset << '\n'); } + if (Fixup.LUIdx == NumUses-1) + Fixup.LUIdx = LUIdx; } - } - DEBUG(dbgs() << "After pre-selection:\n"; - print_uses(dbgs())); + // Delete formulae from the new use which are no longer legal. + bool Any = false; + for (size_t i = 0, e = LUThatHas->Formulae.size(); i != e; ++i) { + Formula &F = LUThatHas->Formulae[i]; + if (!isLegalUse(TTI, LUThatHas->MinOffset, LUThatHas->MaxOffset, + LUThatHas->Kind, LUThatHas->AccessTy, F)) { + DEBUG(dbgs() << " Deleting "; F.print(dbgs()); + dbgs() << '\n'); + LUThatHas->DeleteFormula(F); + --i; + --e; + Any = true; + } + } + + if (Any) + LUThatHas->RecomputeRegs(LUThatHas - &Uses.front(), RegUses); + + // Delete the old use. + DeleteUse(LU, LUIdx); + --LUIdx; + --NumUses; + break; + } } + + DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs())); } /// NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters - Call diff --git a/lib/Transforms/Scalar/LoopUnrollPass.cpp b/lib/Transforms/Scalar/LoopUnrollPass.cpp index e0f915b..80d060b 100644 --- a/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -17,6 +17,7 @@ #include "llvm/Analysis/CodeMetrics.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/Support/CommandLine.h" @@ -90,6 +91,7 @@ namespace { AU.addPreservedID(LCSSAID); AU.addRequired<ScalarEvolution>(); AU.addPreserved<ScalarEvolution>(); + AU.addRequired<TargetTransformInfo>(); // FIXME: Loop unroll requires LCSSA. And LCSSA requires dom info. // If loop unroll does not preserve dom info then LCSSA pass on next // loop will receive invalid dom info. 
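The restructured loop above keeps the usual delete-while-iterating idiom: after removing element i, both the index and the cached end are stepped back so the element shifted into slot i is revisited. A generic, self-contained sketch of the idiom:

#include "llvm/ADT/SmallVector.h"
using namespace llvm;

static void erasePositives(SmallVectorImpl<int> &V) {
  for (size_t i = 0, e = V.size(); i != e; ++i)
    if (V[i] > 0) {               // stand-in for !isLegalUse(...)
      V.erase(V.begin() + i);     // shifts the tail left by one
      --i;                        // revisit the element now at slot i
      --e;                        // the range shrank by one
    }
}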
@@ -101,6 +103,7 @@ namespace { char LoopUnroll::ID = 0; INITIALIZE_PASS_BEGIN(LoopUnroll, "loop-unroll", "Unroll loops", false, false) +INITIALIZE_AG_DEPENDENCY(TargetTransformInfo) INITIALIZE_PASS_DEPENDENCY(LoopInfo) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_DEPENDENCY(LCSSA) @@ -113,11 +116,12 @@ Pass *llvm::createLoopUnrollPass(int Threshold, int Count, int AllowPartial) { /// ApproximateLoopSize - Approximate the size of the loop. static unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls, - bool &NotDuplicatable, const DataLayout *TD) { + bool &NotDuplicatable, + const TargetTransformInfo &TTI) { CodeMetrics Metrics; for (Loop::block_iterator I = L->block_begin(), E = L->block_end(); I != E; ++I) - Metrics.analyzeBasicBlock(*I, TD); + Metrics.analyzeBasicBlock(*I, TTI); NumCalls = Metrics.NumInlineCandidates; NotDuplicatable = Metrics.notDuplicatable; @@ -134,6 +138,7 @@ static unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls, bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { LoopInfo *LI = &getAnalysis<LoopInfo>(); ScalarEvolution *SE = &getAnalysis<ScalarEvolution>(); + const TargetTransformInfo &TTI = getAnalysis<TargetTransformInfo>(); BasicBlock *Header = L->getHeader(); DEBUG(dbgs() << "Loop Unroll: F[" << Header->getParent()->getName() @@ -181,11 +186,10 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { // Enforce the threshold. if (Threshold != NoThreshold) { - const DataLayout *TD = getAnalysisIfAvailable<DataLayout>(); unsigned NumInlineCandidates; bool notDuplicatable; unsigned LoopSize = ApproximateLoopSize(L, NumInlineCandidates, - notDuplicatable, TD); + notDuplicatable, TTI); DEBUG(dbgs() << " Loop Size = " << LoopSize << "\n"); if (notDuplicatable) { DEBUG(dbgs() << " Not unrolling loop which contains non duplicatable" diff --git a/lib/Transforms/Scalar/LoopUnswitch.cpp b/lib/Transforms/Scalar/LoopUnswitch.cpp index 68d4423..0e8199f 100644 --- a/lib/Transforms/Scalar/LoopUnswitch.cpp +++ b/lib/Transforms/Scalar/LoopUnswitch.cpp @@ -37,6 +37,7 @@ #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" @@ -101,7 +102,7 @@ namespace { // Analyze loop. Check its size, calculate whether it is possible to unswitch // it. Returns true if we can unswitch this loop. - bool countLoop(const Loop* L); + bool countLoop(const Loop* L, const TargetTransformInfo &TTI); // Clean all data related to given loop. void forgetLoop(const Loop* L); @@ -170,6 +171,7 @@ namespace { AU.addPreservedID(LCSSAID); AU.addPreserved<DominatorTree>(); AU.addPreserved<ScalarEvolution>(); + AU.addRequired<TargetTransformInfo>(); } private: @@ -221,7 +223,7 @@ namespace { // Analyze loop. Check its size, calculate whether it is possible to unswitch // it. Returns true if we can unswitch this loop.
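countLoop above budgets unswitching per loop: the size estimate is capped at five instructions per block, and the allowed count is MaxSize divided by that estimate. A small worked example with assumed numbers:

#include <algorithm>

// With MaxSize = 100 and a 3-block, 40-instruction loop:
//   SizeEstimation = min(40, 3 * 5) = 15
//   quota          = 100 / 15      = 6 unswitchings
static unsigned unswitchQuota(unsigned NumInsts, unsigned NumBlocks,
                              unsigned MaxSize) {
  unsigned SizeEstimation = std::min(NumInsts, NumBlocks * 5);
  return SizeEstimation ? MaxSize / SizeEstimation : 0;
}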
-bool LUAnalysisCache::countLoop(const Loop* L) { +bool LUAnalysisCache::countLoop(const Loop *L, const TargetTransformInfo &TTI) { std::pair<LoopPropsMapIt, bool> InsertRes = LoopsProperties.insert(std::make_pair(L, LoopProperties())); @@ -243,7 +245,7 @@ bool LUAnalysisCache::countLoop(const Loop* L) { for (Loop::block_iterator I = L->block_begin(), E = L->block_end(); I != E; ++I) - Metrics.analyzeBasicBlock(*I); + Metrics.analyzeBasicBlock(*I, TTI); Props.SizeEstimation = std::min(Metrics.NumInsts, Metrics.NumBlocks * 5); Props.CanBeUnswitchedCount = MaxSize / (Props.SizeEstimation); @@ -334,6 +336,7 @@ void LUAnalysisCache::cloneData(const Loop* NewLoop, const Loop* OldLoop, char LoopUnswitch::ID = 0; INITIALIZE_PASS_BEGIN(LoopUnswitch, "loop-unswitch", "Unswitch loops", false, false) +INITIALIZE_AG_DEPENDENCY(TargetTransformInfo) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_DEPENDENCY(LoopInfo) INITIALIZE_PASS_DEPENDENCY(LCSSA) @@ -424,7 +427,7 @@ bool LoopUnswitch::processCurrentLoop() { // Probably we reach the quota of branches for this loop. If so // stop unswitching. - if (!BranchesInfo.countLoop(currentLoop)) + if (!BranchesInfo.countLoop(currentLoop, getAnalysis<TargetTransformInfo>())) return false; // Loop over all of the basic blocks in the loop. If we find an interior diff --git a/lib/Transforms/Scalar/ObjCARC.cpp b/lib/Transforms/Scalar/ObjCARC.cpp deleted file mode 100644 index e6ec841..0000000 --- a/lib/Transforms/Scalar/ObjCARC.cpp +++ /dev/null @@ -1,4354 +0,0 @@ -//===- ObjCARC.cpp - ObjC ARC Optimization --------------------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file defines ObjC ARC optimizations. ARC stands for -// Automatic Reference Counting and is a system for managing reference counts -// for objects in Objective C. -// -// The optimizations performed include elimination of redundant, partially -// redundant, and inconsequential reference count operations, elimination of -// redundant weak pointer operations, pattern-matching and replacement of -// low-level operations into higher-level operations, and numerous minor -// simplifications. -// -// This file also defines a simple ARC-aware AliasAnalysis. -// -// WARNING: This file knows about certain library functions. It recognizes them -// by name, and hardwires knowledge of their semantics. -// -// WARNING: This file knows about how certain Objective-C library functions are -// used. Naive LLVM IR transformations which would otherwise be -// behavior-preserving may break these assumptions. -// -//===----------------------------------------------------------------------===// - -#define DEBUG_TYPE "objc-arc" -#include "llvm/ADT/DenseMap.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" -using namespace llvm; - -// A handy option to enable/disable all optimizations in this file. -static cl::opt<bool> EnableARCOpts("enable-objc-arc-opts", cl::init(true)); - -//===----------------------------------------------------------------------===// -// Misc. Utilities -//===----------------------------------------------------------------------===// - -namespace { - /// MapVector - An associative container with fast insertion-order - /// (deterministic) iteration over its elements. 
Plus the special - /// blot operation. - template<class KeyT, class ValueT> - class MapVector { - /// Map - Map keys to indices in Vector. - typedef DenseMap<KeyT, size_t> MapTy; - MapTy Map; - - /// Vector - Keys and values. - typedef std::vector<std::pair<KeyT, ValueT> > VectorTy; - VectorTy Vector; - - public: - typedef typename VectorTy::iterator iterator; - typedef typename VectorTy::const_iterator const_iterator; - iterator begin() { return Vector.begin(); } - iterator end() { return Vector.end(); } - const_iterator begin() const { return Vector.begin(); } - const_iterator end() const { return Vector.end(); } - -#ifdef XDEBUG - ~MapVector() { - assert(Vector.size() >= Map.size()); // May differ due to blotting. - for (typename MapTy::const_iterator I = Map.begin(), E = Map.end(); - I != E; ++I) { - assert(I->second < Vector.size()); - assert(Vector[I->second].first == I->first); - } - for (typename VectorTy::const_iterator I = Vector.begin(), - E = Vector.end(); I != E; ++I) - assert(!I->first || - (Map.count(I->first) && - Map[I->first] == size_t(I - Vector.begin()))); - } -#endif - - ValueT &operator[](const KeyT &Arg) { - std::pair<typename MapTy::iterator, bool> Pair = - Map.insert(std::make_pair(Arg, size_t(0))); - if (Pair.second) { - size_t Num = Vector.size(); - Pair.first->second = Num; - Vector.push_back(std::make_pair(Arg, ValueT())); - return Vector[Num].second; - } - return Vector[Pair.first->second].second; - } - - std::pair<iterator, bool> - insert(const std::pair<KeyT, ValueT> &InsertPair) { - std::pair<typename MapTy::iterator, bool> Pair = - Map.insert(std::make_pair(InsertPair.first, size_t(0))); - if (Pair.second) { - size_t Num = Vector.size(); - Pair.first->second = Num; - Vector.push_back(InsertPair); - return std::make_pair(Vector.begin() + Num, true); - } - return std::make_pair(Vector.begin() + Pair.first->second, false); - } - - const_iterator find(const KeyT &Key) const { - typename MapTy::const_iterator It = Map.find(Key); - if (It == Map.end()) return Vector.end(); - return Vector.begin() + It->second; - } - - /// blot - This is similar to erase, but instead of removing the element - /// from the vector, it just zeros out the key in the vector. This leaves - /// iterators intact, but clients must be prepared for zeroed-out keys when - /// iterating. - void blot(const KeyT &Key) { - typename MapTy::iterator It = Map.find(Key); - if (It == Map.end()) return; - Vector[It->second].first = KeyT(); - Map.erase(It); - } - - void clear() { - Map.clear(); - Vector.clear(); - } - }; -} - -//===----------------------------------------------------------------------===// -// ARC Utilities. -//===----------------------------------------------------------------------===// - -#include "llvm/ADT/StringSwitch.h" -#include "llvm/Analysis/ValueTracking.h" -#include "llvm/IR/Intrinsics.h" -#include "llvm/IR/Module.h" -#include "llvm/Support/CallSite.h" -#include "llvm/Transforms/Utils/Local.h" - -namespace { - /// InstructionClass - A simple classification for instructions. - enum InstructionClass { - IC_Retain, ///< objc_retain - IC_RetainRV, ///< objc_retainAutoreleasedReturnValue - IC_RetainBlock, ///< objc_retainBlock - IC_Release, ///< objc_release - IC_Autorelease, ///< objc_autorelease - IC_AutoreleaseRV, ///< objc_autoreleaseReturnValue - IC_AutoreleasepoolPush, ///< objc_autoreleasePoolPush - IC_AutoreleasepoolPop, ///< objc_autoreleasePoolPop - IC_NoopCast, ///< objc_retainedObject, etc. 
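The file-local MapVector above keeps a DenseMap index beside a plain vector so iteration is deterministic, and blot() erases by zeroing the key in place, leaving iterators valid. A usage sketch, assuming that MapVector definition; the function is invented:

#include "llvm/IR/Value.h"
using namespace llvm;

// Iterate a blotted MapVector: entries whose key was zeroed by blot() are
// still present in the vector and must be skipped explicitly.
static unsigned sumLive(MapVector<const Value *, unsigned> &Counts) {
  unsigned Sum = 0;
  for (MapVector<const Value *, unsigned>::iterator I = Counts.begin(),
       E = Counts.end(); I != E; ++I)
    if (I->first)            // null key means the entry was blotted
      Sum += I->second;
  return Sum;
}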
- IC_FusedRetainAutorelease, ///< objc_retainAutorelease - IC_FusedRetainAutoreleaseRV, ///< objc_retainAutoreleaseReturnValue - IC_LoadWeakRetained, ///< objc_loadWeakRetained (primitive) - IC_StoreWeak, ///< objc_storeWeak (primitive) - IC_InitWeak, ///< objc_initWeak (derived) - IC_LoadWeak, ///< objc_loadWeak (derived) - IC_MoveWeak, ///< objc_moveWeak (derived) - IC_CopyWeak, ///< objc_copyWeak (derived) - IC_DestroyWeak, ///< objc_destroyWeak (derived) - IC_StoreStrong, ///< objc_storeStrong (derived) - IC_CallOrUser, ///< could call objc_release and/or "use" pointers - IC_Call, ///< could call objc_release - IC_User, ///< could "use" a pointer - IC_None ///< anything else - }; -} - -/// IsPotentialUse - Test whether the given value is possibly a -/// reference-counted pointer. -static bool IsPotentialUse(const Value *Op) { - // Pointers to static or stack storage are not reference-counted pointers. - if (isa<Constant>(Op) || isa<AllocaInst>(Op)) - return false; - // Special arguments are not reference-counted. - if (const Argument *Arg = dyn_cast<Argument>(Op)) - if (Arg->hasByValAttr() || - Arg->hasNestAttr() || - Arg->hasStructRetAttr()) - return false; - // Only consider values with pointer types. - // It seems intuitive to exclude function pointer types as well, since - // functions are never reference-counted, however clang occasionally - // bitcasts reference-counted pointers to function-pointer type - // temporarily. - PointerType *Ty = dyn_cast<PointerType>(Op->getType()); - if (!Ty) - return false; - // Conservatively assume anything else is a potential use. - return true; -} - -/// GetCallSiteClass - Helper for GetInstructionClass. Determines what kind -/// of construct CS is. -static InstructionClass GetCallSiteClass(ImmutableCallSite CS) { - for (ImmutableCallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end(); - I != E; ++I) - if (IsPotentialUse(*I)) - return CS.onlyReadsMemory() ? IC_User : IC_CallOrUser; - - return CS.onlyReadsMemory() ? IC_None : IC_Call; -} - -/// GetFunctionClass - Determine if F is one of the special known Functions. -/// If it isn't, return IC_CallOrUser. -static InstructionClass GetFunctionClass(const Function *F) { - Function::const_arg_iterator AI = F->arg_begin(), AE = F->arg_end(); - - // No arguments. - if (AI == AE) - return StringSwitch<InstructionClass>(F->getName()) - .Case("objc_autoreleasePoolPush", IC_AutoreleasepoolPush) - .Default(IC_CallOrUser); - - // One argument. - const Argument *A0 = AI++; - if (AI == AE) - // Argument is a pointer. - if (PointerType *PTy = dyn_cast<PointerType>(A0->getType())) { - Type *ETy = PTy->getElementType(); - // Argument is i8*.
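IsPotentialUse and GetCallSiteClass above amount to a two-by-two classification: does any argument look reference-counted, crossed with whether the callee only reads memory. An equivalent restatement, assuming the helpers and enum defined above:

#include "llvm/Support/CallSite.h"
using namespace llvm;

static InstructionClass classifyCallSite(ImmutableCallSite CS) {
  bool HasPotentialUse = false;
  for (ImmutableCallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end();
       I != E; ++I)
    if (IsPotentialUse(*I)) {
      HasPotentialUse = true;
      break;
    }
  if (CS.onlyReadsMemory())
    return HasPotentialUse ? IC_User : IC_None;       // read-only callee
  return HasPotentialUse ? IC_CallOrUser : IC_Call;   // may release
}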
- if (ETy->isIntegerTy(8)) - return StringSwitch<InstructionClass>(F->getName()) - .Case("objc_retain", IC_Retain) - .Case("objc_retainAutoreleasedReturnValue", IC_RetainRV) - .Case("objc_retainBlock", IC_RetainBlock) - .Case("objc_release", IC_Release) - .Case("objc_autorelease", IC_Autorelease) - .Case("objc_autoreleaseReturnValue", IC_AutoreleaseRV) - .Case("objc_autoreleasePoolPop", IC_AutoreleasepoolPop) - .Case("objc_retainedObject", IC_NoopCast) - .Case("objc_unretainedObject", IC_NoopCast) - .Case("objc_unretainedPointer", IC_NoopCast) - .Case("objc_retain_autorelease", IC_FusedRetainAutorelease) - .Case("objc_retainAutorelease", IC_FusedRetainAutorelease) - .Case("objc_retainAutoreleaseReturnValue",IC_FusedRetainAutoreleaseRV) - .Default(IC_CallOrUser); - - // Argument is i8** - if (PointerType *Pte = dyn_cast<PointerType>(ETy)) - if (Pte->getElementType()->isIntegerTy(8)) - return StringSwitch<InstructionClass>(F->getName()) - .Case("objc_loadWeakRetained", IC_LoadWeakRetained) - .Case("objc_loadWeak", IC_LoadWeak) - .Case("objc_destroyWeak", IC_DestroyWeak) - .Default(IC_CallOrUser); - } - - // Two arguments, first is i8**. - const Argument *A1 = AI++; - if (AI == AE) - if (PointerType *PTy = dyn_cast<PointerType>(A0->getType())) - if (PointerType *Pte = dyn_cast<PointerType>(PTy->getElementType())) - if (Pte->getElementType()->isIntegerTy(8)) - if (PointerType *PTy1 = dyn_cast<PointerType>(A1->getType())) { - Type *ETy1 = PTy1->getElementType(); - // Second argument is i8* - if (ETy1->isIntegerTy(8)) - return StringSwitch<InstructionClass>(F->getName()) - .Case("objc_storeWeak", IC_StoreWeak) - .Case("objc_initWeak", IC_InitWeak) - .Case("objc_storeStrong", IC_StoreStrong) - .Default(IC_CallOrUser); - // Second argument is i8**. - if (PointerType *Pte1 = dyn_cast<PointerType>(ETy1)) - if (Pte1->getElementType()->isIntegerTy(8)) - return StringSwitch<InstructionClass>(F->getName()) - .Case("objc_moveWeak", IC_MoveWeak) - .Case("objc_copyWeak", IC_CopyWeak) - .Default(IC_CallOrUser); - } - - // Anything else. - return IC_CallOrUser; -} - -/// GetInstructionClass - Determine what kind of construct V is. -static InstructionClass GetInstructionClass(const Value *V) { - if (const Instruction *I = dyn_cast<Instruction>(V)) { - // Any instruction other than bitcast and gep with a pointer operand have a - // use of an objc pointer. Bitcasts, GEPs, Selects, PHIs transfer a pointer - // to a subsequent use, rather than using it themselves, in this sense. - // As a short cut, several other opcodes are known to have no pointer - // operands of interest. And ret is never followed by a release, so it's - // not interesting to examine. - switch (I->getOpcode()) { - case Instruction::Call: { - const CallInst *CI = cast<CallInst>(I); - // Check for calls to special functions. - if (const Function *F = CI->getCalledFunction()) { - InstructionClass Class = GetFunctionClass(F); - if (Class != IC_CallOrUser) - return Class; - - // None of the intrinsic functions do objc_release. For intrinsics, the - // only question is whether or not they may be users. 
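GetFunctionClass above dispatches on both the runtime entry point's name and its argument shape (none, one i8*, one i8**, or two pointers). A toy version covering only the no-argument family, assuming the InstructionClass enum above; the full table is in the deleted code:

#include "llvm/ADT/StringSwitch.h"
#include "llvm/IR/Function.h"
using namespace llvm;

static InstructionClass classifyNullaryRuntimeCall(const Function *F) {
  if (!F->arg_empty())
    return IC_CallOrUser;        // signature doesn't match this family
  return StringSwitch<InstructionClass>(F->getName())
      .Case("objc_autoreleasePoolPush", IC_AutoreleasepoolPush)
      .Default(IC_CallOrUser);   // unknown externals stay conservative
}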
- switch (F->getIntrinsicID()) { - case Intrinsic::returnaddress: case Intrinsic::frameaddress: - case Intrinsic::stacksave: case Intrinsic::stackrestore: - case Intrinsic::vastart: case Intrinsic::vacopy: case Intrinsic::vaend: - case Intrinsic::objectsize: case Intrinsic::prefetch: - case Intrinsic::stackprotector: - case Intrinsic::eh_return_i32: case Intrinsic::eh_return_i64: - case Intrinsic::eh_typeid_for: case Intrinsic::eh_dwarf_cfa: - case Intrinsic::eh_sjlj_lsda: case Intrinsic::eh_sjlj_functioncontext: - case Intrinsic::init_trampoline: case Intrinsic::adjust_trampoline: - case Intrinsic::lifetime_start: case Intrinsic::lifetime_end: - case Intrinsic::invariant_start: case Intrinsic::invariant_end: - // Don't let dbg info affect our results. - case Intrinsic::dbg_declare: case Intrinsic::dbg_value: - // Short cut: Some intrinsics obviously don't use ObjC pointers. - return IC_None; - default: - break; - } - } - return GetCallSiteClass(CI); - } - case Instruction::Invoke: - return GetCallSiteClass(cast<InvokeInst>(I)); - case Instruction::BitCast: - case Instruction::GetElementPtr: - case Instruction::Select: case Instruction::PHI: - case Instruction::Ret: case Instruction::Br: - case Instruction::Switch: case Instruction::IndirectBr: - case Instruction::Alloca: case Instruction::VAArg: - case Instruction::Add: case Instruction::FAdd: - case Instruction::Sub: case Instruction::FSub: - case Instruction::Mul: case Instruction::FMul: - case Instruction::SDiv: case Instruction::UDiv: case Instruction::FDiv: - case Instruction::SRem: case Instruction::URem: case Instruction::FRem: - case Instruction::Shl: case Instruction::LShr: case Instruction::AShr: - case Instruction::And: case Instruction::Or: case Instruction::Xor: - case Instruction::SExt: case Instruction::ZExt: case Instruction::Trunc: - case Instruction::IntToPtr: case Instruction::FCmp: - case Instruction::FPTrunc: case Instruction::FPExt: - case Instruction::FPToUI: case Instruction::FPToSI: - case Instruction::UIToFP: case Instruction::SIToFP: - case Instruction::InsertElement: case Instruction::ExtractElement: - case Instruction::ShuffleVector: - case Instruction::ExtractValue: - break; - case Instruction::ICmp: - // Comparing a pointer with null, or any other constant, isn't an - // interesting use, because we don't care what the pointer points to, or - // about the values of any other dynamic reference-counted pointers. - if (IsPotentialUse(I->getOperand(1))) - return IC_User; - break; - default: - // For anything else, check all the operands. - // Note that this includes both operands of a Store: while the first - // operand isn't actually being dereferenced, it is being stored to - // memory where we can no longer track who might read it and dereference - // it, so we have to consider it potentially used. - for (User::const_op_iterator OI = I->op_begin(), OE = I->op_end(); - OI != OE; ++OI) - if (IsPotentialUse(*OI)) - return IC_User; - } - } - - // Otherwise, it's totally inert for ARC purposes. - return IC_None; -} - -/// GetBasicInstructionClass - Determine what kind of construct V is. This is -/// similar to GetInstructionClass except that it only detects objc runtime -/// calls. This allows it to be faster. -static InstructionClass GetBasicInstructionClass(const Value *V) { - if (const CallInst *CI = dyn_cast<CallInst>(V)) { - if (const Function *F = CI->getCalledFunction()) - return GetFunctionClass(F); - // Otherwise, be conservative. - return IC_CallOrUser; - } - - // Otherwise, be conservative.
- return isa<InvokeInst>(V) ? IC_CallOrUser : IC_User; -} - -/// IsRetain - Test if the given class is objc_retain or -/// equivalent. -static bool IsRetain(InstructionClass Class) { - return Class == IC_Retain || - Class == IC_RetainRV; -} - -/// IsAutorelease - Test if the given class is objc_autorelease or -/// equivalent. -static bool IsAutorelease(InstructionClass Class) { - return Class == IC_Autorelease || - Class == IC_AutoreleaseRV; -} - -/// IsForwarding - Test if the given class represents instructions which return -/// their argument verbatim. -static bool IsForwarding(InstructionClass Class) { - // objc_retainBlock technically doesn't always return its argument - // verbatim, but it doesn't matter for our purposes here. - return Class == IC_Retain || - Class == IC_RetainRV || - Class == IC_Autorelease || - Class == IC_AutoreleaseRV || - Class == IC_RetainBlock || - Class == IC_NoopCast; -} - -/// IsNoopOnNull - Test if the given class represents instructions which do -/// nothing if passed a null pointer. -static bool IsNoopOnNull(InstructionClass Class) { - return Class == IC_Retain || - Class == IC_RetainRV || - Class == IC_Release || - Class == IC_Autorelease || - Class == IC_AutoreleaseRV || - Class == IC_RetainBlock; -} - -/// IsAlwaysTail - Test if the given class represents instructions which are -/// always safe to mark with the "tail" keyword. -static bool IsAlwaysTail(InstructionClass Class) { - // IC_RetainBlock may be given a stack argument. - return Class == IC_Retain || - Class == IC_RetainRV || - Class == IC_Autorelease || - Class == IC_AutoreleaseRV; -} - -/// IsNoThrow - Test if the given class represents instructions which are always -/// safe to mark with the nounwind attribute. -static bool IsNoThrow(InstructionClass Class) { - // objc_retainBlock is not nounwind because it calls user copy constructors - // which could theoretically throw. - return Class == IC_Retain || - Class == IC_RetainRV || - Class == IC_Release || - Class == IC_Autorelease || - Class == IC_AutoreleaseRV || - Class == IC_AutoreleasepoolPush || - Class == IC_AutoreleasepoolPop; -} - -/// EraseInstruction - Erase the given instruction. Many ObjC calls return their -/// argument verbatim, so if it's such a call and the return value has users, -/// replace them with the argument value. -static void EraseInstruction(Instruction *CI) { - Value *OldArg = cast<CallInst>(CI)->getArgOperand(0); - - bool Unused = CI->use_empty(); - - if (!Unused) { - // Replace the return value with the argument. - assert(IsForwarding(GetBasicInstructionClass(CI)) && - "Can't delete non-forwarding instruction with users!"); - CI->replaceAllUsesWith(OldArg); - } - - CI->eraseFromParent(); - - if (Unused) - RecursivelyDeleteTriviallyDeadInstructions(OldArg); -} - -/// GetUnderlyingObjCPtr - This is a wrapper around getUnderlyingObject which -/// also knows how to look through objc_retain and objc_autorelease calls, which -/// we know to return their argument verbatim. -static const Value *GetUnderlyingObjCPtr(const Value *V) { - for (;;) { - V = GetUnderlyingObject(V); - if (!IsForwarding(GetBasicInstructionClass(V))) - break; - V = cast<CallInst>(V)->getArgOperand(0); - } - - return V; -} - -/// StripPointerCastsAndObjCCalls - This is a wrapper around -/// Value::stripPointerCasts which also knows how to look through objc_retain -/// and objc_autorelease calls, which we know to return their argument verbatim.
-static const Value *StripPointerCastsAndObjCCalls(const Value *V) { - for (;;) { - V = V->stripPointerCasts(); - if (!IsForwarding(GetBasicInstructionClass(V))) - break; - V = cast<CallInst>(V)->getArgOperand(0); - } - return V; -} - -/// StripPointerCastsAndObjCCalls - This is a wrapper around -/// Value::stripPointerCasts which also knows how to look through objc_retain -/// and objc_autorelease calls, which we know to return their argument verbatim. -static Value *StripPointerCastsAndObjCCalls(Value *V) { - for (;;) { - V = V->stripPointerCasts(); - if (!IsForwarding(GetBasicInstructionClass(V))) - break; - V = cast<CallInst>(V)->getArgOperand(0); - } - return V; -} - -/// GetObjCArg - Assuming the given instruction is one of the special calls such -/// as objc_retain or objc_release, return the argument value, stripped of no-op -/// casts and forwarding calls. -static Value *GetObjCArg(Value *Inst) { - return StripPointerCastsAndObjCCalls(cast<CallInst>(Inst)->getArgOperand(0)); -} - -/// IsObjCIdentifiedObject - This is similar to AliasAnalysis' -/// isObjCIdentifiedObject, except that it uses special knowledge of -/// ObjC conventions... -static bool IsObjCIdentifiedObject(const Value *V) { - // Assume that call results and arguments have their own "provenance". - // Constants (including GlobalVariables) and Allocas are never - // reference-counted. - if (isa<CallInst>(V) || isa<InvokeInst>(V) || - isa<Argument>(V) || isa<Constant>(V) || - isa<AllocaInst>(V)) - return true; - - if (const LoadInst *LI = dyn_cast<LoadInst>(V)) { - const Value *Pointer = - StripPointerCastsAndObjCCalls(LI->getPointerOperand()); - if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(Pointer)) { - // A constant pointer can't be pointing to an object on the heap. It may - // be reference-counted, but it won't be deleted. - if (GV->isConstant()) - return true; - StringRef Name = GV->getName(); - // These special variables are known to hold values which are not - // reference-counted pointers. - if (Name.startswith("\01L_OBJC_SELECTOR_REFERENCES_") || - Name.startswith("\01L_OBJC_CLASSLIST_REFERENCES_") || - Name.startswith("\01L_OBJC_CLASSLIST_SUP_REFS_$_") || - Name.startswith("\01L_OBJC_METH_VAR_NAME_") || - Name.startswith("\01l_objc_msgSend_fixup_")) - return true; - } - } - - return false; -} - -/// FindSingleUseIdentifiedObject - This is similar to -/// StripPointerCastsAndObjCCalls but it stops as soon as it finds a value -/// with multiple uses. -static const Value *FindSingleUseIdentifiedObject(const Value *Arg) { - if (Arg->hasOneUse()) { - if (const BitCastInst *BC = dyn_cast<BitCastInst>(Arg)) - return FindSingleUseIdentifiedObject(BC->getOperand(0)); - if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Arg)) - if (GEP->hasAllZeroIndices()) - return FindSingleUseIdentifiedObject(GEP->getPointerOperand()); - if (IsForwarding(GetBasicInstructionClass(Arg))) - return FindSingleUseIdentifiedObject( - cast<CallInst>(Arg)->getArgOperand(0)); - if (!IsObjCIdentifiedObject(Arg)) - return 0; - return Arg; - } - - // If we found an identifiable object but it has multiple uses, but they are - // trivial uses, we can still consider this to be a single-use value. 
- if (IsObjCIdentifiedObject(Arg)) { - for (Value::const_use_iterator UI = Arg->use_begin(), UE = Arg->use_end(); - UI != UE; ++UI) { - const User *U = *UI; - if (!U->use_empty() || StripPointerCastsAndObjCCalls(U) != Arg) - return 0; - } - - return Arg; - } - - return 0; -} - -/// ModuleHasARC - Test if the given module looks interesting to run ARC -/// optimization on. -static bool ModuleHasARC(const Module &M) { - return - M.getNamedValue("objc_retain") || - M.getNamedValue("objc_release") || - M.getNamedValue("objc_autorelease") || - M.getNamedValue("objc_retainAutoreleasedReturnValue") || - M.getNamedValue("objc_retainBlock") || - M.getNamedValue("objc_autoreleaseReturnValue") || - M.getNamedValue("objc_autoreleasePoolPush") || - M.getNamedValue("objc_loadWeakRetained") || - M.getNamedValue("objc_loadWeak") || - M.getNamedValue("objc_destroyWeak") || - M.getNamedValue("objc_storeWeak") || - M.getNamedValue("objc_initWeak") || - M.getNamedValue("objc_moveWeak") || - M.getNamedValue("objc_copyWeak") || - M.getNamedValue("objc_retainedObject") || - M.getNamedValue("objc_unretainedObject") || - M.getNamedValue("objc_unretainedPointer"); -} - -/// DoesObjCBlockEscape - Test whether the given pointer, which is an -/// Objective C block pointer, does not "escape". This differs from regular -/// escape analysis in that a use as an argument to a call is not considered -/// an escape. -static bool DoesObjCBlockEscape(const Value *BlockPtr) { - // Walk the def-use chains. - SmallVector<const Value *, 4> Worklist; - Worklist.push_back(BlockPtr); - do { - const Value *V = Worklist.pop_back_val(); - for (Value::const_use_iterator UI = V->use_begin(), UE = V->use_end(); - UI != UE; ++UI) { - const User *UUser = *UI; - // Special - Use by a call (callee or argument) is not considered - // to be an escape. - switch (GetBasicInstructionClass(UUser)) { - case IC_StoreWeak: - case IC_InitWeak: - case IC_StoreStrong: - case IC_Autorelease: - case IC_AutoreleaseRV: - // These special functions make copies of their pointer arguments. - return true; - case IC_User: - case IC_None: - // Use by an instruction which copies the value is an escape if the - // result is an escape. - if (isa<BitCastInst>(UUser) || isa<GetElementPtrInst>(UUser) || - isa<PHINode>(UUser) || isa<SelectInst>(UUser)) { - Worklist.push_back(UUser); - continue; - } - // Use by a load is not an escape. - if (isa<LoadInst>(UUser)) - continue; - // Use by a store is not an escape if the use is the address. - if (const StoreInst *SI = dyn_cast<StoreInst>(UUser)) - if (V != SI->getValueOperand()) - continue; - break; - default: - // Regular calls and other stuff are not considered escapes. - continue; - } - // Otherwise, conservatively assume an escape. - return true; - } - } while (!Worklist.empty()); - - // No escapes found. - return false; -} - -//===----------------------------------------------------------------------===// -// ARC AliasAnalysis. -//===----------------------------------------------------------------------===// - -#include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/Passes.h" -#include "llvm/Pass.h" - -namespace { - /// ObjCARCAliasAnalysis - This is a simple alias analysis - /// implementation that uses knowledge of ARC constructs to answer queries. - /// - /// TODO: This class could be generalized to know about other ObjC-specific - /// tricks. Such as knowing that ivars in the non-fragile ABI are non-aliasing - /// even though their offsets are dynamic. 
- class ObjCARCAliasAnalysis : public ImmutablePass, - public AliasAnalysis { - public: - static char ID; // Class identification, replacement for typeinfo - ObjCARCAliasAnalysis() : ImmutablePass(ID) { - initializeObjCARCAliasAnalysisPass(*PassRegistry::getPassRegistry()); - } - - private: - virtual void initializePass() { - InitializeAliasAnalysis(this); - } - - /// getAdjustedAnalysisPointer - This method is used when a pass implements - /// an analysis interface through multiple inheritance. If needed, it - /// should override this to adjust the this pointer as needed for the - /// specified pass info. - virtual void *getAdjustedAnalysisPointer(const void *PI) { - if (PI == &AliasAnalysis::ID) - return static_cast<AliasAnalysis *>(this); - return this; - } - - virtual void getAnalysisUsage(AnalysisUsage &AU) const; - virtual AliasResult alias(const Location &LocA, const Location &LocB); - virtual bool pointsToConstantMemory(const Location &Loc, bool OrLocal); - virtual ModRefBehavior getModRefBehavior(ImmutableCallSite CS); - virtual ModRefBehavior getModRefBehavior(const Function *F); - virtual ModRefResult getModRefInfo(ImmutableCallSite CS, - const Location &Loc); - virtual ModRefResult getModRefInfo(ImmutableCallSite CS1, - ImmutableCallSite CS2); - }; -} // End of anonymous namespace - -// Register this pass... -char ObjCARCAliasAnalysis::ID = 0; -INITIALIZE_AG_PASS(ObjCARCAliasAnalysis, AliasAnalysis, "objc-arc-aa", - "ObjC-ARC-Based Alias Analysis", false, true, false) - -ImmutablePass *llvm::createObjCARCAliasAnalysisPass() { - return new ObjCARCAliasAnalysis(); -} - -void -ObjCARCAliasAnalysis::getAnalysisUsage(AnalysisUsage &AU) const { - AU.setPreservesAll(); - AliasAnalysis::getAnalysisUsage(AU); -} - -AliasAnalysis::AliasResult -ObjCARCAliasAnalysis::alias(const Location &LocA, const Location &LocB) { - if (!EnableARCOpts) - return AliasAnalysis::alias(LocA, LocB); - - // First, strip off no-ops, including ObjC-specific no-ops, and try making a - // precise alias query. - const Value *SA = StripPointerCastsAndObjCCalls(LocA.Ptr); - const Value *SB = StripPointerCastsAndObjCCalls(LocB.Ptr); - AliasResult Result = - AliasAnalysis::alias(Location(SA, LocA.Size, LocA.TBAATag), - Location(SB, LocB.Size, LocB.TBAATag)); - if (Result != MayAlias) - return Result; - - // If that failed, climb to the underlying object, including climbing through - // ObjC-specific no-ops, and try making an imprecise alias query. - const Value *UA = GetUnderlyingObjCPtr(SA); - const Value *UB = GetUnderlyingObjCPtr(SB); - if (UA != SA || UB != SB) { - Result = AliasAnalysis::alias(Location(UA), Location(UB)); - // We can't use MustAlias or PartialAlias results here because - // GetUnderlyingObjCPtr may return an offsetted pointer value. - if (Result == NoAlias) - return NoAlias; - } - - // If that failed, fail. We don't need to chain here, since that's covered - // by the earlier precise query. - return MayAlias; -} - -bool -ObjCARCAliasAnalysis::pointsToConstantMemory(const Location &Loc, - bool OrLocal) { - if (!EnableARCOpts) - return AliasAnalysis::pointsToConstantMemory(Loc, OrLocal); - - // First, strip off no-ops, including ObjC-specific no-ops, and try making - // a precise alias query. 
- const Value *S = StripPointerCastsAndObjCCalls(Loc.Ptr); - if (AliasAnalysis::pointsToConstantMemory(Location(S, Loc.Size, Loc.TBAATag), - OrLocal)) - return true; - - // If that failed, climb to the underlying object, including climbing through - // ObjC-specific no-ops, and try making an imprecise alias query. - const Value *U = GetUnderlyingObjCPtr(S); - if (U != S) - return AliasAnalysis::pointsToConstantMemory(Location(U), OrLocal); - - // If that failed, fail. We don't need to chain here, since that's covered - // by the earlier precise query. - return false; -} - -AliasAnalysis::ModRefBehavior -ObjCARCAliasAnalysis::getModRefBehavior(ImmutableCallSite CS) { - // We have nothing to do. Just chain to the next AliasAnalysis. - return AliasAnalysis::getModRefBehavior(CS); -} - -AliasAnalysis::ModRefBehavior -ObjCARCAliasAnalysis::getModRefBehavior(const Function *F) { - if (!EnableARCOpts) - return AliasAnalysis::getModRefBehavior(F); - - switch (GetFunctionClass(F)) { - case IC_NoopCast: - return DoesNotAccessMemory; - default: - break; - } - - return AliasAnalysis::getModRefBehavior(F); -} - -AliasAnalysis::ModRefResult -ObjCARCAliasAnalysis::getModRefInfo(ImmutableCallSite CS, const Location &Loc) { - if (!EnableARCOpts) - return AliasAnalysis::getModRefInfo(CS, Loc); - - switch (GetBasicInstructionClass(CS.getInstruction())) { - case IC_Retain: - case IC_RetainRV: - case IC_Autorelease: - case IC_AutoreleaseRV: - case IC_NoopCast: - case IC_AutoreleasepoolPush: - case IC_FusedRetainAutorelease: - case IC_FusedRetainAutoreleaseRV: - // These functions don't access any memory visible to the compiler. - // Note that this doesn't include objc_retainBlock, because it updates - // pointers when it copies block data. - return NoModRef; - default: - break; - } - - return AliasAnalysis::getModRefInfo(CS, Loc); -} - -AliasAnalysis::ModRefResult -ObjCARCAliasAnalysis::getModRefInfo(ImmutableCallSite CS1, - ImmutableCallSite CS2) { - // TODO: Theoretically we could check for dependencies between objc_* calls - // and OnlyAccessesArgumentPointees calls or other well-behaved calls. - return AliasAnalysis::getModRefInfo(CS1, CS2); -} - -//===----------------------------------------------------------------------===// -// ARC expansion. -//===----------------------------------------------------------------------===// - -#include "llvm/Support/InstIterator.h" -#include "llvm/Transforms/Scalar.h" - -namespace { - /// ObjCARCExpand - Early ARC transformations. - class ObjCARCExpand : public FunctionPass { - virtual void getAnalysisUsage(AnalysisUsage &AU) const; - virtual bool doInitialization(Module &M); - virtual bool runOnFunction(Function &F); - - /// Run - A flag indicating whether this optimization pass should run. - bool Run; - - public: - static char ID; - ObjCARCExpand() : FunctionPass(ID) { - initializeObjCARCExpandPass(*PassRegistry::getPassRegistry()); - } - }; -} - -char ObjCARCExpand::ID = 0; -INITIALIZE_PASS(ObjCARCExpand, - "objc-arc-expand", "ObjC ARC expansion", false, false) - -Pass *llvm::createObjCARCExpandPass() { - return new ObjCARCExpand(); -} - -void ObjCARCExpand::getAnalysisUsage(AnalysisUsage &AU) const { - AU.setPreservesCFG(); -} - -bool ObjCARCExpand::doInitialization(Module &M) { - Run = ModuleHasARC(M); - return false; -} - -bool ObjCARCExpand::runOnFunction(Function &F) { - if (!EnableARCOpts) - return false; - - // If nothing in the Module uses ARC, don't do anything. 
- if (!Run) - return false; - - bool Changed = false; - - for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ++I) { - Instruction *Inst = &*I; - - DEBUG(dbgs() << "ObjCARCExpand: Visiting: " << *Inst << "\n"); - - switch (GetBasicInstructionClass(Inst)) { - case IC_Retain: - case IC_RetainRV: - case IC_Autorelease: - case IC_AutoreleaseRV: - case IC_FusedRetainAutorelease: - case IC_FusedRetainAutoreleaseRV: { - // These calls return their argument verbatim, as a low-level - // optimization. However, this makes high-level optimizations - // harder. Undo any uses of this optimization that the front-end - // emitted here. We'll redo them in the contract pass. - Changed = true; - Value *Value = cast<CallInst>(Inst)->getArgOperand(0); - DEBUG(dbgs() << "ObjCARCExpand: Old = " << *Inst << "\n" - " New = " << *Value << "\n"); - Inst->replaceAllUsesWith(Value); - break; - } - default: - break; - } - } - - DEBUG(dbgs() << "ObjCARCExpand: Finished List.\n\n"); - - return Changed; -} - -//===----------------------------------------------------------------------===// -// ARC autorelease pool elimination. -//===----------------------------------------------------------------------===// - -#include "llvm/ADT/STLExtras.h" -#include "llvm/IR/Constants.h" - -namespace { - /// ObjCARCAPElim - Autorelease pool elimination. - class ObjCARCAPElim : public ModulePass { - virtual void getAnalysisUsage(AnalysisUsage &AU) const; - virtual bool runOnModule(Module &M); - - static bool MayAutorelease(ImmutableCallSite CS, unsigned Depth = 0); - static bool OptimizeBB(BasicBlock *BB); - - public: - static char ID; - ObjCARCAPElim() : ModulePass(ID) { - initializeObjCARCAPElimPass(*PassRegistry::getPassRegistry()); - } - }; -} - -char ObjCARCAPElim::ID = 0; -INITIALIZE_PASS(ObjCARCAPElim, - "objc-arc-apelim", - "ObjC ARC autorelease pool elimination", - false, false) - -Pass *llvm::createObjCARCAPElimPass() { - return new ObjCARCAPElim(); -} - -void ObjCARCAPElim::getAnalysisUsage(AnalysisUsage &AU) const { - AU.setPreservesCFG(); -} - -/// MayAutorelease - Interprocedurally determine if calls made by the -/// given call site can possibly produce autoreleases. -bool ObjCARCAPElim::MayAutorelease(ImmutableCallSite CS, unsigned Depth) { - if (const Function *Callee = CS.getCalledFunction()) { - if (Callee->isDeclaration() || Callee->mayBeOverridden()) - return true; - for (Function::const_iterator I = Callee->begin(), E = Callee->end(); - I != E; ++I) { - const BasicBlock *BB = I; - for (BasicBlock::const_iterator J = BB->begin(), F = BB->end(); - J != F; ++J) - if (ImmutableCallSite JCS = ImmutableCallSite(J)) - // This recursion depth limit is arbitrary. It's just great - // enough to cover known interesting testcases. - if (Depth < 3 && - !JCS.onlyReadsMemory() && - MayAutorelease(JCS, Depth + 1)) - return true; - } - return false; - } - - return true; -} - -bool ObjCARCAPElim::OptimizeBB(BasicBlock *BB) { - bool Changed = false; - - Instruction *Push = 0; - for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ) { - Instruction *Inst = I++; - switch (GetBasicInstructionClass(Inst)) { - case IC_AutoreleasepoolPush: - Push = Inst; - break; - case IC_AutoreleasepoolPop: - // If this pop matches a push and nothing in between can autorelease, - // zap the pair. 
-      if (Push && cast<CallInst>(Inst)->getArgOperand(0) == Push) {
-        Changed = true;
-        DEBUG(dbgs() << "ObjCARCAPElim::OptimizeBB: Zapping push/pop autorelease pair:\n"
-                     << " Pop: " << *Inst << "\n"
-                     << " Push: " << *Push << "\n");
-        Inst->eraseFromParent();
-        Push->eraseFromParent();
-      }
-      Push = 0;
-      break;
-    case IC_CallOrUser:
-      if (MayAutorelease(ImmutableCallSite(Inst)))
-        Push = 0;
-      break;
-    default:
-      break;
-    }
-  }
-
-  return Changed;
-}
-
-bool ObjCARCAPElim::runOnModule(Module &M) {
-  if (!EnableARCOpts)
-    return false;
-
-  // If nothing in the Module uses ARC, don't do anything.
-  if (!ModuleHasARC(M))
-    return false;
-
-  // Find the llvm.global_ctors variable, as the first step in
-  // identifying the global constructors. In theory, unnecessary autorelease
-  // pools could occur anywhere, but in practice it's pretty rare. Global
-  // ctors are a place where autorelease pools get inserted automatically,
-  // so it's pretty common for them to be unnecessary, and it's pretty
-  // profitable to eliminate them.
-  GlobalVariable *GV = M.getGlobalVariable("llvm.global_ctors");
-  if (!GV)
-    return false;
-
-  assert(GV->hasDefinitiveInitializer() &&
-         "llvm.global_ctors is uncooperative!");
-
-  bool Changed = false;
-
-  // Dig the constructor functions out of GV's initializer.
-  ConstantArray *Init = cast<ConstantArray>(GV->getInitializer());
-  for (User::op_iterator OI = Init->op_begin(), OE = Init->op_end();
-       OI != OE; ++OI) {
-    Value *Op = *OI;
-    // llvm.global_ctors is an array of pairs where the second members
-    // are constructor functions.
-    Function *F = dyn_cast<Function>(cast<ConstantStruct>(Op)->getOperand(1));
-    // If the user used a constructor function with the wrong signature and
-    // it got bitcasted or whatever, look the other way.
-    if (!F)
-      continue;
-    // Only look at function definitions.
-    if (F->isDeclaration())
-      continue;
-    // Only look at functions with one basic block.
-    if (llvm::next(F->begin()) != F->end())
-      continue;
-    // Ok, a single-block constructor function definition. Try to optimize it.
-    Changed |= OptimizeBB(F->begin());
-  }
-
-  return Changed;
-}
-
-//===----------------------------------------------------------------------===//
-// ARC optimization.
-//===----------------------------------------------------------------------===//
-
-// TODO: On code like this:
-//
-// objc_retain(%x)
-// stuff_that_cannot_release()
-// objc_autorelease(%x)
-// stuff_that_cannot_release()
-// objc_retain(%x)
-// stuff_that_cannot_release()
-// objc_autorelease(%x)
-//
-// The second retain and autorelease can be deleted.
-
-// TODO: It should be possible to delete
-// objc_autoreleasePoolPush and objc_autoreleasePoolPop
-// pairs if nothing is actually autoreleased between them. Also, autorelease
-// calls followed by objc_autoreleasePoolPop calls (perhaps in ObjC++ code
-// after inlining) can be turned into plain release calls.
-
-// TODO: Critical-edge splitting. If the optimal insertion point is
-// a critical edge, the current algorithm has to fail, because it doesn't
-// know how to split edges. It should be possible to make the optimizer
-// think in terms of edges, rather than blocks, and then split critical
-// edges on demand.
-
-// TODO: OptimizeSequences could be generalized to be interprocedural.
-
-// TODO: Recognize that a bunch of other objc runtime calls have
-// non-escaping arguments and non-releasing arguments, and may be
-// non-autoreleasing.
-
-// TODO: Sink autorelease calls as far as possible. Unfortunately we
-// usually can't sink them past other calls, which would be the main
-// case where it would be useful.
-
-// TODO: The pointer returned from objc_loadWeakRetained is retained.
-
-// TODO: Delete release+retain pairs (rare).
-
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/Support/CFG.h"
-
-STATISTIC(NumNoops, "Number of no-op objc calls eliminated");
-STATISTIC(NumPartialNoops, "Number of partially no-op objc calls eliminated");
-STATISTIC(NumAutoreleases, "Number of autoreleases converted to releases");
-STATISTIC(NumRets, "Number of return value forwarding "
-                   "retain+autoreleases eliminated");
-STATISTIC(NumRRs, "Number of retain+release paths eliminated");
-STATISTIC(NumPeeps, "Number of calls peephole-optimized");
-
-namespace {
-  /// ProvenanceAnalysis - This is similar to BasicAliasAnalysis, and it
-  /// uses many of the same techniques, except it uses special ObjC-specific
-  /// reasoning about pointer relationships.
-  class ProvenanceAnalysis {
-    AliasAnalysis *AA;
-
-    typedef std::pair<const Value *, const Value *> ValuePairTy;
-    typedef DenseMap<ValuePairTy, bool> CachedResultsTy;
-    CachedResultsTy CachedResults;
-
-    bool relatedCheck(const Value *A, const Value *B);
-    bool relatedSelect(const SelectInst *A, const Value *B);
-    bool relatedPHI(const PHINode *A, const Value *B);
-
-    void operator=(const ProvenanceAnalysis &) LLVM_DELETED_FUNCTION;
-    ProvenanceAnalysis(const ProvenanceAnalysis &) LLVM_DELETED_FUNCTION;
-
-  public:
-    ProvenanceAnalysis() {}
-
-    void setAA(AliasAnalysis *aa) { AA = aa; }
-
-    AliasAnalysis *getAA() const { return AA; }
-
-    bool related(const Value *A, const Value *B);
-
-    void clear() {
-      CachedResults.clear();
-    }
-  };
-}
-
-bool ProvenanceAnalysis::relatedSelect(const SelectInst *A, const Value *B) {
-  // If the values are Selects with the same condition, we can do a more precise
-  // check: just check for relations between the values on corresponding arms.
-  if (const SelectInst *SB = dyn_cast<SelectInst>(B))
-    if (A->getCondition() == SB->getCondition())
-      return related(A->getTrueValue(), SB->getTrueValue()) ||
-             related(A->getFalseValue(), SB->getFalseValue());
-
-  // Check both arms of the Select node individually.
-  return related(A->getTrueValue(), B) ||
-         related(A->getFalseValue(), B);
-}
-
-bool ProvenanceAnalysis::relatedPHI(const PHINode *A, const Value *B) {
-  // If the values are PHIs in the same block, we can do a more precise as well
-  // as efficient check: just check for relations between the values on
-  // corresponding edges.
-  if (const PHINode *PNB = dyn_cast<PHINode>(B))
-    if (PNB->getParent() == A->getParent()) {
-      for (unsigned i = 0, e = A->getNumIncomingValues(); i != e; ++i)
-        if (related(A->getIncomingValue(i),
-                    PNB->getIncomingValueForBlock(A->getIncomingBlock(i))))
-          return true;
-      return false;
-    }
-
-  // Check each unique source of the PHI node against B.
-  SmallPtrSet<const Value *, 4> UniqueSrc;
-  for (unsigned i = 0, e = A->getNumIncomingValues(); i != e; ++i) {
-    const Value *PV1 = A->getIncomingValue(i);
-    if (UniqueSrc.insert(PV1) && related(PV1, B))
-      return true;
-  }
-
-  // All of the arms checked out.
-  return false;
-}
-
-/// isStoredObjCPointer - Test if the value of P, or any value covered by its
-/// provenance, is ever stored within the function (not counting callees).
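-///
-/// For example (an illustrative IR sketch, where %p is the queried pointer):
-///   store i8* %p, i8** %slot   ; %p itself is stored -> returns true
-///   store i8 42, i8* %p        ; a store through %p -> not counted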
-static bool isStoredObjCPointer(const Value *P) {
-  SmallPtrSet<const Value *, 8> Visited;
-  SmallVector<const Value *, 8> Worklist;
-  Worklist.push_back(P);
-  Visited.insert(P);
-  do {
-    P = Worklist.pop_back_val();
-    for (Value::const_use_iterator UI = P->use_begin(), UE = P->use_end();
-         UI != UE; ++UI) {
-      const User *Ur = *UI;
-      if (isa<StoreInst>(Ur)) {
-        if (UI.getOperandNo() == 0)
-          // The pointer is stored.
-          return true;
-        // The pointer is stored through.
-        continue;
-      }
-      if (isa<CallInst>(Ur))
-        // The pointer is passed as an argument, ignore this.
-        continue;
-      if (isa<PtrToIntInst>(P))
-        // Assume the worst.
-        return true;
-      if (Visited.insert(Ur))
-        Worklist.push_back(Ur);
-    }
-  } while (!Worklist.empty());
-
-  // Everything checked out.
-  return false;
-}
-
-bool ProvenanceAnalysis::relatedCheck(const Value *A, const Value *B) {
-  // Skip past provenance pass-throughs.
-  A = GetUnderlyingObjCPtr(A);
-  B = GetUnderlyingObjCPtr(B);
-
-  // Quick check.
-  if (A == B)
-    return true;
-
-  // Ask regular AliasAnalysis, for a first approximation.
-  switch (AA->alias(A, B)) {
-  case AliasAnalysis::NoAlias:
-    return false;
-  case AliasAnalysis::MustAlias:
-  case AliasAnalysis::PartialAlias:
-    return true;
-  case AliasAnalysis::MayAlias:
-    break;
-  }
-
-  bool AIsIdentified = IsObjCIdentifiedObject(A);
-  bool BIsIdentified = IsObjCIdentifiedObject(B);
-
-  // An ObjC-Identified object can't alias a load if it is never locally stored.
-  if (AIsIdentified) {
-    // Check for an obvious escape.
-    if (isa<LoadInst>(B))
-      return isStoredObjCPointer(A);
-    if (BIsIdentified) {
-      // Check for an obvious escape.
-      if (isa<LoadInst>(A))
-        return isStoredObjCPointer(B);
-      // Both pointers are identified and escapes aren't an evident problem.
-      return false;
-    }
-  } else if (BIsIdentified) {
-    // Check for an obvious escape.
-    if (isa<LoadInst>(A))
-      return isStoredObjCPointer(B);
-  }
-
-  // Special handling for PHI and Select.
-  if (const PHINode *PN = dyn_cast<PHINode>(A))
-    return relatedPHI(PN, B);
-  if (const PHINode *PN = dyn_cast<PHINode>(B))
-    return relatedPHI(PN, A);
-  if (const SelectInst *S = dyn_cast<SelectInst>(A))
-    return relatedSelect(S, B);
-  if (const SelectInst *S = dyn_cast<SelectInst>(B))
-    return relatedSelect(S, A);
-
-  // Conservative.
-  return true;
-}
-
-bool ProvenanceAnalysis::related(const Value *A, const Value *B) {
-  // Begin by inserting a conservative value into the map. If the insertion
-  // fails, we have the answer already. If it succeeds, leave it there until we
-  // compute the real answer to guard against recursive queries.
-  if (A > B) std::swap(A, B);
-  std::pair<CachedResultsTy::iterator, bool> Pair =
-    CachedResults.insert(std::make_pair(ValuePairTy(A, B), true));
-  if (!Pair.second)
-    return Pair.first->second;
-
-  bool Result = relatedCheck(A, B);
-  CachedResults[ValuePairTy(A, B)] = Result;
-  return Result;
-}
-
-namespace {
-  // Sequence - A sequence of states that a pointer may go through in which an
-  // objc_retain and objc_release are actually needed.
-  enum Sequence {
-    S_None,
-    S_Retain,         ///< objc_retain(x)
-    S_CanRelease,     ///< foo(x) -- x could possibly see a ref count decrement
-    S_Use,            ///< any use of x
-    S_Stop,           ///< like S_Release, but code motion is stopped
-    S_Release,        ///< objc_release(x)
-    S_MovableRelease  ///< objc_release(x), !clang.imprecise_release
-  };
-}
-
-static Sequence MergeSeqs(Sequence A, Sequence B, bool TopDown) {
-  // The easy cases.
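-  // (If both sides agree there is nothing to merge, and if either side has
-  // no sequence in progress, neither does the merge.)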
-  if (A == B)
-    return A;
-  if (A == S_None || B == S_None)
-    return S_None;
-
-  if (A > B) std::swap(A, B);
-  if (TopDown) {
-    // Choose the side which is further along in the sequence.
-    if ((A == S_Retain || A == S_CanRelease) &&
-        (B == S_CanRelease || B == S_Use))
-      return B;
-  } else {
-    // Choose the side which is further along in the sequence.
-    if ((A == S_Use || A == S_CanRelease) &&
-        (B == S_Use || B == S_Release || B == S_Stop || B == S_MovableRelease))
-      return A;
-    // If both sides are releases, choose the more conservative one.
-    if (A == S_Stop && (B == S_Release || B == S_MovableRelease))
-      return A;
-    if (A == S_Release && B == S_MovableRelease)
-      return A;
-  }
-
-  return S_None;
-}
-
-namespace {
-  /// RRInfo - Unidirectional information about either a
-  /// retain-decrement-use-release sequence or release-use-decrement-retain
-  /// reverse sequence.
-  struct RRInfo {
-    /// KnownSafe - After an objc_retain, the reference count of the referenced
-    /// object is known to be positive. Similarly, before an objc_release, the
-    /// reference count of the referenced object is known to be positive. If
-    /// there are retain-release pairs in code regions where the retain count
-    /// is known to be positive, they can be eliminated, regardless of any side
-    /// effects between them.
-    ///
-    /// Also, a retain+release pair nested within another retain+release
-    /// pair, all on the same known pointer value, can be eliminated, regardless
-    /// of any intervening side effects.
-    ///
-    /// KnownSafe is true when either of these conditions is satisfied.
-    bool KnownSafe;
-
-    /// IsRetainBlock - True if the Calls are objc_retainBlock calls (as
-    /// opposed to objc_retain calls).
-    bool IsRetainBlock;
-
-    /// IsTailCallRelease - True if the objc_release calls are all marked
-    /// with the "tail" keyword.
-    bool IsTailCallRelease;
-
-    /// ReleaseMetadata - If the Calls are objc_release calls and they all have
-    /// a clang.imprecise_release tag, this is the metadata tag.
-    MDNode *ReleaseMetadata;
-
-    /// Calls - For a top-down sequence, the set of objc_retains or
-    /// objc_retainBlocks. For bottom-up, the set of objc_releases.
-    SmallPtrSet<Instruction *, 2> Calls;
-
-    /// ReverseInsertPts - The set of optimal insert positions for
-    /// moving calls in the opposite sequence.
-    SmallPtrSet<Instruction *, 2> ReverseInsertPts;
-
-    RRInfo() :
-      KnownSafe(false), IsRetainBlock(false),
-      IsTailCallRelease(false),
-      ReleaseMetadata(0) {}
-
-    void clear();
-  };
-}
-
-void RRInfo::clear() {
-  KnownSafe = false;
-  IsRetainBlock = false;
-  IsTailCallRelease = false;
-  ReleaseMetadata = 0;
-  Calls.clear();
-  ReverseInsertPts.clear();
-}
-
-namespace {
-  /// PtrState - This class summarizes several per-pointer runtime properties
-  /// which are propagated through the flow graph.
-  class PtrState {
-    /// KnownPositiveRefCount - True if the reference count is known to
-    /// be incremented.
-    bool KnownPositiveRefCount;
-
-    /// Partial - True if we've seen an opportunity for partial RR elimination,
-    /// such as pushing calls into a CFG triangle or into one side of a
-    /// CFG diamond.
-    bool Partial;
-
-    /// Seq - The current position in the sequence.
-    Sequence Seq : 8;
-
-  public:
-    /// RRI - Unidirectional information about the current sequence.
-    /// TODO: Encapsulate this better.
-    RRInfo RRI;
-
-    PtrState() : KnownPositiveRefCount(false), Partial(false),
-                 Seq(S_None) {}
-
-    void SetKnownPositiveRefCount() {
-      KnownPositiveRefCount = true;
-    }
-
-    void ClearRefCount() {
-      KnownPositiveRefCount = false;
-    }
-
-    bool IsKnownIncremented() const {
-      return KnownPositiveRefCount;
-    }
-
-    void SetSeq(Sequence NewSeq) {
-      Seq = NewSeq;
-    }
-
-    Sequence GetSeq() const {
-      return Seq;
-    }
-
-    void ClearSequenceProgress() {
-      ResetSequenceProgress(S_None);
-    }
-
-    void ResetSequenceProgress(Sequence NewSeq) {
-      Seq = NewSeq;
-      Partial = false;
-      RRI.clear();
-    }
-
-    void Merge(const PtrState &Other, bool TopDown);
-  };
-}
-
-void
-PtrState::Merge(const PtrState &Other, bool TopDown) {
-  Seq = MergeSeqs(Seq, Other.Seq, TopDown);
-  KnownPositiveRefCount = KnownPositiveRefCount && Other.KnownPositiveRefCount;
-
-  // We can't merge a plain objc_retain with an objc_retainBlock.
-  if (RRI.IsRetainBlock != Other.RRI.IsRetainBlock)
-    Seq = S_None;
-
-  // If we're not in a sequence (anymore), drop all associated state.
-  if (Seq == S_None) {
-    Partial = false;
-    RRI.clear();
-  } else if (Partial || Other.Partial) {
-    // If we're doing a merge on a path that's previously seen a partial
-    // merge, conservatively drop the sequence, to avoid doing partial
-    // RR elimination. If the branch predicates for the two merges differ,
-    // mixing them is unsafe.
-    ClearSequenceProgress();
-  } else {
-    // Conservatively merge the ReleaseMetadata information.
-    if (RRI.ReleaseMetadata != Other.RRI.ReleaseMetadata)
-      RRI.ReleaseMetadata = 0;
-
-    RRI.KnownSafe = RRI.KnownSafe && Other.RRI.KnownSafe;
-    RRI.IsTailCallRelease = RRI.IsTailCallRelease &&
-                            Other.RRI.IsTailCallRelease;
-    RRI.Calls.insert(Other.RRI.Calls.begin(), Other.RRI.Calls.end());
-
-    // Merge the insert point sets. If there are any differences,
-    // that makes this a partial merge.
-    Partial = RRI.ReverseInsertPts.size() != Other.RRI.ReverseInsertPts.size();
-    for (SmallPtrSet<Instruction *, 2>::const_iterator
-         I = Other.RRI.ReverseInsertPts.begin(),
-         E = Other.RRI.ReverseInsertPts.end(); I != E; ++I)
-      Partial |= RRI.ReverseInsertPts.insert(*I);
-  }
-}
-
-namespace {
-  /// BBState - Per-BasicBlock state.
-  class BBState {
-    /// TopDownPathCount - The number of unique control paths from the entry
-    /// which can reach this block.
-    unsigned TopDownPathCount;
-
-    /// BottomUpPathCount - The number of unique control paths to exits
-    /// from this block.
-    unsigned BottomUpPathCount;
-
-    /// MapTy - A type for PerPtrTopDown and PerPtrBottomUp.
-    typedef MapVector<const Value *, PtrState> MapTy;
-
-    /// PerPtrTopDown - The top-down traversal uses this to record information
-    /// known about a pointer at the bottom of each block.
-    MapTy PerPtrTopDown;
-
-    /// PerPtrBottomUp - The bottom-up traversal uses this to record information
-    /// known about a pointer at the top of each block.
-    MapTy PerPtrBottomUp;
-
-    /// Preds, Succs - Effective successors and predecessors of the current
-    /// block (this ignores ignorable edges and ignored backedges).
- SmallVector<BasicBlock *, 2> Preds; - SmallVector<BasicBlock *, 2> Succs; - - public: - BBState() : TopDownPathCount(0), BottomUpPathCount(0) {} - - typedef MapTy::iterator ptr_iterator; - typedef MapTy::const_iterator ptr_const_iterator; - - ptr_iterator top_down_ptr_begin() { return PerPtrTopDown.begin(); } - ptr_iterator top_down_ptr_end() { return PerPtrTopDown.end(); } - ptr_const_iterator top_down_ptr_begin() const { - return PerPtrTopDown.begin(); - } - ptr_const_iterator top_down_ptr_end() const { - return PerPtrTopDown.end(); - } - - ptr_iterator bottom_up_ptr_begin() { return PerPtrBottomUp.begin(); } - ptr_iterator bottom_up_ptr_end() { return PerPtrBottomUp.end(); } - ptr_const_iterator bottom_up_ptr_begin() const { - return PerPtrBottomUp.begin(); - } - ptr_const_iterator bottom_up_ptr_end() const { - return PerPtrBottomUp.end(); - } - - /// SetAsEntry - Mark this block as being an entry block, which has one - /// path from the entry by definition. - void SetAsEntry() { TopDownPathCount = 1; } - - /// SetAsExit - Mark this block as being an exit block, which has one - /// path to an exit by definition. - void SetAsExit() { BottomUpPathCount = 1; } - - PtrState &getPtrTopDownState(const Value *Arg) { - return PerPtrTopDown[Arg]; - } - - PtrState &getPtrBottomUpState(const Value *Arg) { - return PerPtrBottomUp[Arg]; - } - - void clearBottomUpPointers() { - PerPtrBottomUp.clear(); - } - - void clearTopDownPointers() { - PerPtrTopDown.clear(); - } - - void InitFromPred(const BBState &Other); - void InitFromSucc(const BBState &Other); - void MergePred(const BBState &Other); - void MergeSucc(const BBState &Other); - - /// GetAllPathCount - Return the number of possible unique paths from an - /// entry to an exit which pass through this block. This is only valid - /// after both the top-down and bottom-up traversals are complete. - unsigned GetAllPathCount() const { - assert(TopDownPathCount != 0); - assert(BottomUpPathCount != 0); - return TopDownPathCount * BottomUpPathCount; - } - - // Specialized CFG utilities. - typedef SmallVectorImpl<BasicBlock *>::const_iterator edge_iterator; - edge_iterator pred_begin() { return Preds.begin(); } - edge_iterator pred_end() { return Preds.end(); } - edge_iterator succ_begin() { return Succs.begin(); } - edge_iterator succ_end() { return Succs.end(); } - - void addSucc(BasicBlock *Succ) { Succs.push_back(Succ); } - void addPred(BasicBlock *Pred) { Preds.push_back(Pred); } - - bool isExit() const { return Succs.empty(); } - }; -} - -void BBState::InitFromPred(const BBState &Other) { - PerPtrTopDown = Other.PerPtrTopDown; - TopDownPathCount = Other.TopDownPathCount; -} - -void BBState::InitFromSucc(const BBState &Other) { - PerPtrBottomUp = Other.PerPtrBottomUp; - BottomUpPathCount = Other.BottomUpPathCount; -} - -/// MergePred - The top-down traversal uses this to merge information about -/// predecessors to form the initial state for a new block. -void BBState::MergePred(const BBState &Other) { - // Other.TopDownPathCount can be 0, in which case it is either dead or a - // loop backedge. Loop backedges are special. - TopDownPathCount += Other.TopDownPathCount; - - // Check for overflow. If we have overflow, fall back to conservative behavior. - if (TopDownPathCount < Other.TopDownPathCount) { - clearTopDownPointers(); - return; - } - - // For each entry in the other set, if our set has an entry with the same key, - // merge the entries. Otherwise, copy the entry and merge it with an empty - // entry. 
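-  // (The insert below handles both cases at once: it returns the existing
-  // entry when the key is already present, and Pair.second records which
-  // case we hit.)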
-  for (ptr_const_iterator MI = Other.top_down_ptr_begin(),
-       ME = Other.top_down_ptr_end(); MI != ME; ++MI) {
-    std::pair<ptr_iterator, bool> Pair = PerPtrTopDown.insert(*MI);
-    Pair.first->second.Merge(Pair.second ? PtrState() : MI->second,
-                             /*TopDown=*/true);
-  }
-
-  // For each entry in our set, if the other set doesn't have an entry with the
-  // same key, force it to merge with an empty entry.
-  for (ptr_iterator MI = top_down_ptr_begin(),
-       ME = top_down_ptr_end(); MI != ME; ++MI)
-    if (Other.PerPtrTopDown.find(MI->first) == Other.PerPtrTopDown.end())
-      MI->second.Merge(PtrState(), /*TopDown=*/true);
-}
-
-/// MergeSucc - The bottom-up traversal uses this to merge information about
-/// successors to form the initial state for a new block.
-void BBState::MergeSucc(const BBState &Other) {
-  // Other.BottomUpPathCount can be 0, in which case it is either dead or a
-  // loop backedge. Loop backedges are special.
-  BottomUpPathCount += Other.BottomUpPathCount;
-
-  // Check for overflow. If we have overflow, fall back to conservative behavior.
-  if (BottomUpPathCount < Other.BottomUpPathCount) {
-    clearBottomUpPointers();
-    return;
-  }
-
-  // For each entry in the other set, if our set has an entry with the
-  // same key, merge the entries. Otherwise, copy the entry and merge
-  // it with an empty entry.
-  for (ptr_const_iterator MI = Other.bottom_up_ptr_begin(),
-       ME = Other.bottom_up_ptr_end(); MI != ME; ++MI) {
-    std::pair<ptr_iterator, bool> Pair = PerPtrBottomUp.insert(*MI);
-    Pair.first->second.Merge(Pair.second ? PtrState() : MI->second,
-                             /*TopDown=*/false);
-  }
-
-  // For each entry in our set, if the other set doesn't have an entry
-  // with the same key, force it to merge with an empty entry.
-  for (ptr_iterator MI = bottom_up_ptr_begin(),
-       ME = bottom_up_ptr_end(); MI != ME; ++MI)
-    if (Other.PerPtrBottomUp.find(MI->first) == Other.PerPtrBottomUp.end())
-      MI->second.Merge(PtrState(), /*TopDown=*/false);
-}
-
-namespace {
-  /// ObjCARCOpt - The main ARC optimization pass.
-  class ObjCARCOpt : public FunctionPass {
-    bool Changed;
-    ProvenanceAnalysis PA;
-
-    /// Run - A flag indicating whether this optimization pass should run.
-    bool Run;
-
-    /// RetainRVCallee, etc. - Declarations for ObjC runtime
-    /// functions, for use in creating calls to them. These are initialized
-    /// lazily to avoid cluttering up the Module with unused declarations.
-    Constant *RetainRVCallee, *AutoreleaseRVCallee, *ReleaseCallee,
-             *RetainCallee, *RetainBlockCallee, *AutoreleaseCallee;
-
-    /// UsedInThisFunction - Flags which determine whether each of the
-    /// interesting runtime functions is in fact used in the current function.
-    unsigned UsedInThisFunction;
-
-    /// ImpreciseReleaseMDKind - The Metadata Kind for clang.imprecise_release
-    /// metadata.
-    unsigned ImpreciseReleaseMDKind;
-
-    /// CopyOnEscapeMDKind - The Metadata Kind for clang.arc.copy_on_escape
-    /// metadata.
-    unsigned CopyOnEscapeMDKind;
-
-    /// NoObjCARCExceptionsMDKind - The Metadata Kind for
-    /// clang.arc.no_objc_arc_exceptions metadata.
- unsigned NoObjCARCExceptionsMDKind; - - Constant *getRetainRVCallee(Module *M); - Constant *getAutoreleaseRVCallee(Module *M); - Constant *getReleaseCallee(Module *M); - Constant *getRetainCallee(Module *M); - Constant *getRetainBlockCallee(Module *M); - Constant *getAutoreleaseCallee(Module *M); - - bool IsRetainBlockOptimizable(const Instruction *Inst); - - void OptimizeRetainCall(Function &F, Instruction *Retain); - bool OptimizeRetainRVCall(Function &F, Instruction *RetainRV); - void OptimizeAutoreleaseRVCall(Function &F, Instruction *AutoreleaseRV); - void OptimizeIndividualCalls(Function &F); - - void CheckForCFGHazards(const BasicBlock *BB, - DenseMap<const BasicBlock *, BBState> &BBStates, - BBState &MyStates) const; - bool VisitInstructionBottomUp(Instruction *Inst, - BasicBlock *BB, - MapVector<Value *, RRInfo> &Retains, - BBState &MyStates); - bool VisitBottomUp(BasicBlock *BB, - DenseMap<const BasicBlock *, BBState> &BBStates, - MapVector<Value *, RRInfo> &Retains); - bool VisitInstructionTopDown(Instruction *Inst, - DenseMap<Value *, RRInfo> &Releases, - BBState &MyStates); - bool VisitTopDown(BasicBlock *BB, - DenseMap<const BasicBlock *, BBState> &BBStates, - DenseMap<Value *, RRInfo> &Releases); - bool Visit(Function &F, - DenseMap<const BasicBlock *, BBState> &BBStates, - MapVector<Value *, RRInfo> &Retains, - DenseMap<Value *, RRInfo> &Releases); - - void MoveCalls(Value *Arg, RRInfo &RetainsToMove, RRInfo &ReleasesToMove, - MapVector<Value *, RRInfo> &Retains, - DenseMap<Value *, RRInfo> &Releases, - SmallVectorImpl<Instruction *> &DeadInsts, - Module *M); - - bool PerformCodePlacement(DenseMap<const BasicBlock *, BBState> &BBStates, - MapVector<Value *, RRInfo> &Retains, - DenseMap<Value *, RRInfo> &Releases, - Module *M); - - void OptimizeWeakCalls(Function &F); - - bool OptimizeSequences(Function &F); - - void OptimizeReturns(Function &F); - - virtual void getAnalysisUsage(AnalysisUsage &AU) const; - virtual bool doInitialization(Module &M); - virtual bool runOnFunction(Function &F); - virtual void releaseMemory(); - - public: - static char ID; - ObjCARCOpt() : FunctionPass(ID) { - initializeObjCARCOptPass(*PassRegistry::getPassRegistry()); - } - }; -} - -char ObjCARCOpt::ID = 0; -INITIALIZE_PASS_BEGIN(ObjCARCOpt, - "objc-arc", "ObjC ARC optimization", false, false) -INITIALIZE_PASS_DEPENDENCY(ObjCARCAliasAnalysis) -INITIALIZE_PASS_END(ObjCARCOpt, - "objc-arc", "ObjC ARC optimization", false, false) - -Pass *llvm::createObjCARCOptPass() { - return new ObjCARCOpt(); -} - -void ObjCARCOpt::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<ObjCARCAliasAnalysis>(); - AU.addRequired<AliasAnalysis>(); - // ARC optimization doesn't currently split critical edges. - AU.setPreservesCFG(); -} - -bool ObjCARCOpt::IsRetainBlockOptimizable(const Instruction *Inst) { - // Without the magic metadata tag, we have to assume this might be an - // objc_retainBlock call inserted to convert a block pointer to an id, - // in which case it really is needed. - if (!Inst->getMetadata(CopyOnEscapeMDKind)) - return false; - - // If the pointer "escapes" (not including being used in a call), - // the copy may be needed. - if (DoesObjCBlockEscape(Inst)) - return false; - - // Otherwise, it's not needed. 
-  return true;
-}
-
-Constant *ObjCARCOpt::getRetainRVCallee(Module *M) {
-  if (!RetainRVCallee) {
-    LLVMContext &C = M->getContext();
-    Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C));
-    Type *Params[] = { I8X };
-    FunctionType *FTy = FunctionType::get(I8X, Params, /*isVarArg=*/false);
-    AttributeSet Attribute =
-      AttributeSet().addAttr(M->getContext(), AttributeSet::FunctionIndex,
-                             Attribute::get(C, Attribute::NoUnwind));
-    RetainRVCallee =
-      M->getOrInsertFunction("objc_retainAutoreleasedReturnValue", FTy,
-                             Attribute);
-  }
-  return RetainRVCallee;
-}
-
-Constant *ObjCARCOpt::getAutoreleaseRVCallee(Module *M) {
-  if (!AutoreleaseRVCallee) {
-    LLVMContext &C = M->getContext();
-    Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C));
-    Type *Params[] = { I8X };
-    FunctionType *FTy = FunctionType::get(I8X, Params, /*isVarArg=*/false);
-    AttributeSet Attribute =
-      AttributeSet().addAttr(M->getContext(), AttributeSet::FunctionIndex,
-                             Attribute::get(C, Attribute::NoUnwind));
-    AutoreleaseRVCallee =
-      M->getOrInsertFunction("objc_autoreleaseReturnValue", FTy,
-                             Attribute);
-  }
-  return AutoreleaseRVCallee;
-}
-
-Constant *ObjCARCOpt::getReleaseCallee(Module *M) {
-  if (!ReleaseCallee) {
-    LLVMContext &C = M->getContext();
-    Type *Params[] = { PointerType::getUnqual(Type::getInt8Ty(C)) };
-    AttributeSet Attribute =
-      AttributeSet().addAttr(M->getContext(), AttributeSet::FunctionIndex,
-                             Attribute::get(C, Attribute::NoUnwind));
-    ReleaseCallee =
-      M->getOrInsertFunction(
-        "objc_release",
-        FunctionType::get(Type::getVoidTy(C), Params, /*isVarArg=*/false),
-        Attribute);
-  }
-  return ReleaseCallee;
-}
-
-Constant *ObjCARCOpt::getRetainCallee(Module *M) {
-  if (!RetainCallee) {
-    LLVMContext &C = M->getContext();
-    Type *Params[] = { PointerType::getUnqual(Type::getInt8Ty(C)) };
-    AttributeSet Attribute =
-      AttributeSet().addAttr(M->getContext(), AttributeSet::FunctionIndex,
-                             Attribute::get(C, Attribute::NoUnwind));
-    RetainCallee =
-      M->getOrInsertFunction(
-        "objc_retain",
-        FunctionType::get(Params[0], Params, /*isVarArg=*/false),
-        Attribute);
-  }
-  return RetainCallee;
-}
-
-Constant *ObjCARCOpt::getRetainBlockCallee(Module *M) {
-  if (!RetainBlockCallee) {
-    LLVMContext &C = M->getContext();
-    Type *Params[] = { PointerType::getUnqual(Type::getInt8Ty(C)) };
-    // objc_retainBlock is not nounwind because it calls user copy constructors
-    // which could theoretically throw.
-    RetainBlockCallee =
-      M->getOrInsertFunction(
-        "objc_retainBlock",
-        FunctionType::get(Params[0], Params, /*isVarArg=*/false),
-        AttributeSet());
-  }
-  return RetainBlockCallee;
-}
-
-Constant *ObjCARCOpt::getAutoreleaseCallee(Module *M) {
-  if (!AutoreleaseCallee) {
-    LLVMContext &C = M->getContext();
-    Type *Params[] = { PointerType::getUnqual(Type::getInt8Ty(C)) };
-    AttributeSet Attribute =
-      AttributeSet().addAttr(M->getContext(), AttributeSet::FunctionIndex,
-                             Attribute::get(C, Attribute::NoUnwind));
-    AutoreleaseCallee =
-      M->getOrInsertFunction(
-        "objc_autorelease",
-        FunctionType::get(Params[0], Params, /*isVarArg=*/false),
-        Attribute);
-  }
-  return AutoreleaseCallee;
-}
-
-/// IsPotentialUse - Test whether the given value is possibly a
-/// reference-counted pointer, including tests which utilize AliasAnalysis.
-static bool IsPotentialUse(const Value *Op, AliasAnalysis &AA) {
-  // First make the rudimentary check.
-  if (!IsPotentialUse(Op))
-    return false;
-
-  // Objects in constant memory are not reference-counted.
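-  // (For example, a pointer into a constant global cannot address a
-  // reference-counted object, since a reference count could never be
-  // updated in read-only memory.)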
- if (AA.pointsToConstantMemory(Op)) - return false; - - // Pointers in constant memory are not pointing to reference-counted objects. - if (const LoadInst *LI = dyn_cast<LoadInst>(Op)) - if (AA.pointsToConstantMemory(LI->getPointerOperand())) - return false; - - // Otherwise assume the worst. - return true; -} - -/// CanAlterRefCount - Test whether the given instruction can result in a -/// reference count modification (positive or negative) for the pointer's -/// object. -static bool -CanAlterRefCount(const Instruction *Inst, const Value *Ptr, - ProvenanceAnalysis &PA, InstructionClass Class) { - switch (Class) { - case IC_Autorelease: - case IC_AutoreleaseRV: - case IC_User: - // These operations never directly modify a reference count. - return false; - default: break; - } - - ImmutableCallSite CS = static_cast<const Value *>(Inst); - assert(CS && "Only calls can alter reference counts!"); - - // See if AliasAnalysis can help us with the call. - AliasAnalysis::ModRefBehavior MRB = PA.getAA()->getModRefBehavior(CS); - if (AliasAnalysis::onlyReadsMemory(MRB)) - return false; - if (AliasAnalysis::onlyAccessesArgPointees(MRB)) { - for (ImmutableCallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end(); - I != E; ++I) { - const Value *Op = *I; - if (IsPotentialUse(Op, *PA.getAA()) && PA.related(Ptr, Op)) - return true; - } - return false; - } - - // Assume the worst. - return true; -} - -/// CanUse - Test whether the given instruction can "use" the given pointer's -/// object in a way that requires the reference count to be positive. -static bool -CanUse(const Instruction *Inst, const Value *Ptr, ProvenanceAnalysis &PA, - InstructionClass Class) { - // IC_Call operations (as opposed to IC_CallOrUser) never "use" objc pointers. - if (Class == IC_Call) - return false; - - // Consider various instructions which may have pointer arguments which are - // not "uses". - if (const ICmpInst *ICI = dyn_cast<ICmpInst>(Inst)) { - // Comparing a pointer with null, or any other constant, isn't really a use, - // because we don't care what the pointer points to, or about the values - // of any other dynamic reference-counted pointers. - if (!IsPotentialUse(ICI->getOperand(1), *PA.getAA())) - return false; - } else if (ImmutableCallSite CS = static_cast<const Value *>(Inst)) { - // For calls, just check the arguments (and not the callee operand). - for (ImmutableCallSite::arg_iterator OI = CS.arg_begin(), - OE = CS.arg_end(); OI != OE; ++OI) { - const Value *Op = *OI; - if (IsPotentialUse(Op, *PA.getAA()) && PA.related(Ptr, Op)) - return true; - } - return false; - } else if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) { - // Special-case stores, because we don't care about the stored value, just - // the store address. - const Value *Op = GetUnderlyingObjCPtr(SI->getPointerOperand()); - // If we can't tell what the underlying object was, assume there is a - // dependence. - return IsPotentialUse(Op, *PA.getAA()) && PA.related(Op, Ptr); - } - - // Check each operand for a match. - for (User::const_op_iterator OI = Inst->op_begin(), OE = Inst->op_end(); - OI != OE; ++OI) { - const Value *Op = *OI; - if (IsPotentialUse(Op, *PA.getAA()) && PA.related(Ptr, Op)) - return true; - } - return false; -} - -/// CanInterruptRV - Test whether the given instruction can autorelease -/// any pointer or cause an autoreleasepool pop. 
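-/// (Plain calls are conservatively included because their callees may
-/// autorelease or pop an autorelease pool internally.)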
-static bool -CanInterruptRV(InstructionClass Class) { - switch (Class) { - case IC_AutoreleasepoolPop: - case IC_CallOrUser: - case IC_Call: - case IC_Autorelease: - case IC_AutoreleaseRV: - case IC_FusedRetainAutorelease: - case IC_FusedRetainAutoreleaseRV: - return true; - default: - return false; - } -} - -namespace { - /// DependenceKind - There are several kinds of dependence-like concepts in - /// use here. - enum DependenceKind { - NeedsPositiveRetainCount, - AutoreleasePoolBoundary, - CanChangeRetainCount, - RetainAutoreleaseDep, ///< Blocks objc_retainAutorelease. - RetainAutoreleaseRVDep, ///< Blocks objc_retainAutoreleaseReturnValue. - RetainRVDep ///< Blocks objc_retainAutoreleasedReturnValue. - }; -} - -/// Depends - Test if there can be dependencies on Inst through Arg. This -/// function only tests dependencies relevant for removing pairs of calls. -static bool -Depends(DependenceKind Flavor, Instruction *Inst, const Value *Arg, - ProvenanceAnalysis &PA) { - // If we've reached the definition of Arg, stop. - if (Inst == Arg) - return true; - - switch (Flavor) { - case NeedsPositiveRetainCount: { - InstructionClass Class = GetInstructionClass(Inst); - switch (Class) { - case IC_AutoreleasepoolPop: - case IC_AutoreleasepoolPush: - case IC_None: - return false; - default: - return CanUse(Inst, Arg, PA, Class); - } - } - - case AutoreleasePoolBoundary: { - InstructionClass Class = GetInstructionClass(Inst); - switch (Class) { - case IC_AutoreleasepoolPop: - case IC_AutoreleasepoolPush: - // These mark the end and begin of an autorelease pool scope. - return true; - default: - // Nothing else does this. - return false; - } - } - - case CanChangeRetainCount: { - InstructionClass Class = GetInstructionClass(Inst); - switch (Class) { - case IC_AutoreleasepoolPop: - // Conservatively assume this can decrement any count. - return true; - case IC_AutoreleasepoolPush: - case IC_None: - return false; - default: - return CanAlterRefCount(Inst, Arg, PA, Class); - } - } - - case RetainAutoreleaseDep: - switch (GetBasicInstructionClass(Inst)) { - case IC_AutoreleasepoolPop: - case IC_AutoreleasepoolPush: - // Don't merge an objc_autorelease with an objc_retain inside a different - // autoreleasepool scope. - return true; - case IC_Retain: - case IC_RetainRV: - // Check for a retain of the same pointer for merging. - return GetObjCArg(Inst) == Arg; - default: - // Nothing else matters for objc_retainAutorelease formation. - return false; - } - - case RetainAutoreleaseRVDep: { - InstructionClass Class = GetBasicInstructionClass(Inst); - switch (Class) { - case IC_Retain: - case IC_RetainRV: - // Check for a retain of the same pointer for merging. - return GetObjCArg(Inst) == Arg; - default: - // Anything that can autorelease interrupts - // retainAutoreleaseReturnValue formation. - return CanInterruptRV(Class); - } - } - - case RetainRVDep: - return CanInterruptRV(GetBasicInstructionClass(Inst)); - } - - llvm_unreachable("Invalid dependence flavor"); -} - -/// FindDependencies - Walk up the CFG from StartPos (which is in StartBB) and -/// find local and non-local dependencies on Arg. -/// TODO: Cache results? 
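-///
-/// (Two sentinels are used below: a null entry in DependingInstructions means
-/// the walk reached the function entry, and a -1 entry means StartBB does not
-/// post-dominate everything visited, so most optimizations are not safe.)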
-static void
-FindDependencies(DependenceKind Flavor,
-                 const Value *Arg,
-                 BasicBlock *StartBB, Instruction *StartInst,
-                 SmallPtrSet<Instruction *, 4> &DependingInstructions,
-                 SmallPtrSet<const BasicBlock *, 4> &Visited,
-                 ProvenanceAnalysis &PA) {
-  BasicBlock::iterator StartPos = StartInst;
-
-  SmallVector<std::pair<BasicBlock *, BasicBlock::iterator>, 4> Worklist;
-  Worklist.push_back(std::make_pair(StartBB, StartPos));
-  do {
-    std::pair<BasicBlock *, BasicBlock::iterator> Pair =
-      Worklist.pop_back_val();
-    BasicBlock *LocalStartBB = Pair.first;
-    BasicBlock::iterator LocalStartPos = Pair.second;
-    BasicBlock::iterator StartBBBegin = LocalStartBB->begin();
-    for (;;) {
-      if (LocalStartPos == StartBBBegin) {
-        pred_iterator PI(LocalStartBB), PE(LocalStartBB, false);
-        if (PI == PE)
-          // If we've reached the function entry, produce a null dependence.
-          DependingInstructions.insert(0);
-        else
-          // Add the predecessors to the worklist.
-          do {
-            BasicBlock *PredBB = *PI;
-            if (Visited.insert(PredBB))
-              Worklist.push_back(std::make_pair(PredBB, PredBB->end()));
-          } while (++PI != PE);
-        break;
-      }
-
-      Instruction *Inst = --LocalStartPos;
-      if (Depends(Flavor, Inst, Arg, PA)) {
-        DependingInstructions.insert(Inst);
-        break;
-      }
-    }
-  } while (!Worklist.empty());
-
-  // Determine whether the original StartBB post-dominates all of the blocks we
-  // visited. If not, insert a sentinel indicating that most optimizations are
-  // not safe.
-  for (SmallPtrSet<const BasicBlock *, 4>::const_iterator I = Visited.begin(),
-       E = Visited.end(); I != E; ++I) {
-    const BasicBlock *BB = *I;
-    if (BB == StartBB)
-      continue;
-    const TerminatorInst *TI = cast<TerminatorInst>(&BB->back());
-    for (succ_const_iterator SI(TI), SE(TI, false); SI != SE; ++SI) {
-      const BasicBlock *Succ = *SI;
-      if (Succ != StartBB && !Visited.count(Succ)) {
-        DependingInstructions.insert(reinterpret_cast<Instruction *>(-1));
-        return;
-      }
-    }
-  }
-}
-
-static bool isNullOrUndef(const Value *V) {
-  return isa<ConstantPointerNull>(V) || isa<UndefValue>(V);
-}
-
-static bool isNoopInstruction(const Instruction *I) {
-  return isa<BitCastInst>(I) ||
-         (isa<GetElementPtrInst>(I) &&
-          cast<GetElementPtrInst>(I)->hasAllZeroIndices());
-}
-
-/// OptimizeRetainCall - Turn objc_retain into
-/// objc_retainAutoreleasedReturnValue if the operand is a return value.
-void
-ObjCARCOpt::OptimizeRetainCall(Function &F, Instruction *Retain) {
-  ImmutableCallSite CS(GetObjCArg(Retain));
-  const Instruction *Call = CS.getInstruction();
-  if (!Call) return;
-  if (Call->getParent() != Retain->getParent()) return;
-
-  // Check that the call is next to the retain.
-  BasicBlock::const_iterator I = Call;
-  ++I;
-  while (isNoopInstruction(I)) ++I;
-  if (&*I != Retain)
-    return;
-
-  // Turn it into an objc_retainAutoreleasedReturnValue.
-  Changed = true;
-  ++NumPeeps;
-
-  DEBUG(dbgs() << "ObjCARCOpt::OptimizeRetainCall: Transforming "
-                  "objc_retain => objc_retainAutoreleasedReturnValue "
-                  "since the operand is a return value.\n"
-                  " Old: "
-               << *Retain << "\n");
-
-  cast<CallInst>(Retain)->setCalledFunction(getRetainRVCallee(F.getParent()));
-
-  DEBUG(dbgs() << " New: "
-               << *Retain << "\n");
-}
-
-/// OptimizeRetainRVCall - Turn objc_retainAutoreleasedReturnValue into
-/// objc_retain if the operand is not a return value. Or, if it can be paired
-/// with an objc_autoreleaseReturnValue, delete the pair and return true.
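-///
-/// An illustrative sketch of the deletable pair (names are hypothetical):
-///   %0 = call i8* @objc_autoreleaseReturnValue(i8* %p)
-///   %1 = call i8* @objc_retainAutoreleasedReturnValue(i8* %p)
-/// The two calls cancel out, so both can be erased.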
-bool
-ObjCARCOpt::OptimizeRetainRVCall(Function &F, Instruction *RetainRV) {
-  // Check for the argument being from an immediately preceding call or invoke.
-  const Value *Arg = GetObjCArg(RetainRV);
-  ImmutableCallSite CS(Arg);
-  if (const Instruction *Call = CS.getInstruction()) {
-    if (Call->getParent() == RetainRV->getParent()) {
-      BasicBlock::const_iterator I = Call;
-      ++I;
-      while (isNoopInstruction(I)) ++I;
-      if (&*I == RetainRV)
-        return false;
-    } else if (const InvokeInst *II = dyn_cast<InvokeInst>(Call)) {
-      BasicBlock *RetainRVParent = RetainRV->getParent();
-      if (II->getNormalDest() == RetainRVParent) {
-        BasicBlock::const_iterator I = RetainRVParent->begin();
-        while (isNoopInstruction(I)) ++I;
-        if (&*I == RetainRV)
-          return false;
-      }
-    }
-  }
-
-  // Check for being preceded by an objc_autoreleaseReturnValue on the same
-  // pointer. In this case, we can delete the pair.
-  BasicBlock::iterator I = RetainRV, Begin = RetainRV->getParent()->begin();
-  if (I != Begin) {
-    do --I; while (I != Begin && isNoopInstruction(I));
-    if (GetBasicInstructionClass(I) == IC_AutoreleaseRV &&
-        GetObjCArg(I) == Arg) {
-      Changed = true;
-      ++NumPeeps;
-
-      DEBUG(dbgs() << "ObjCARCOpt::OptimizeRetainRVCall: Erasing " << *I << "\n"
-                   << " Erasing " << *RetainRV
-                   << "\n");
-
-      EraseInstruction(I);
-      EraseInstruction(RetainRV);
-      return true;
-    }
-  }
-
-  // Turn it into a plain objc_retain.
-  Changed = true;
-  ++NumPeeps;
-
-  DEBUG(dbgs() << "ObjCARCOpt::OptimizeRetainRVCall: Transforming "
-                  "objc_retainAutoreleasedReturnValue => "
-                  "objc_retain since the operand is not a return value.\n"
-                  " Old: "
-               << *RetainRV << "\n");
-
-  cast<CallInst>(RetainRV)->setCalledFunction(getRetainCallee(F.getParent()));
-
-  DEBUG(dbgs() << " New: "
-               << *RetainRV << "\n");
-
-  return false;
-}
-
-/// OptimizeAutoreleaseRVCall - Turn objc_autoreleaseReturnValue into
-/// objc_autorelease if the result is not used as a return value.
-void
-ObjCARCOpt::OptimizeAutoreleaseRVCall(Function &F, Instruction *AutoreleaseRV) {
-  // Check for a return of the pointer value.
-  const Value *Ptr = GetObjCArg(AutoreleaseRV);
-  SmallVector<const Value *, 2> Users;
-  Users.push_back(Ptr);
-  do {
-    Ptr = Users.pop_back_val();
-    for (Value::const_use_iterator UI = Ptr->use_begin(), UE = Ptr->use_end();
-         UI != UE; ++UI) {
-      const User *I = *UI;
-      if (isa<ReturnInst>(I) || GetBasicInstructionClass(I) == IC_RetainRV)
-        return;
-      if (isa<BitCastInst>(I))
-        Users.push_back(I);
-    }
-  } while (!Users.empty());
-
-  Changed = true;
-  ++NumPeeps;
-
-  DEBUG(dbgs() << "ObjCARCOpt::OptimizeAutoreleaseRVCall: Transforming "
-                  "objc_autoreleaseReturnValue => "
-                  "objc_autorelease since its operand is not used as a return "
-                  "value.\n"
-                  " Old: "
-               << *AutoreleaseRV << "\n");
-
-  cast<CallInst>(AutoreleaseRV)->
-    setCalledFunction(getAutoreleaseCallee(F.getParent()));
-
-  DEBUG(dbgs() << " New: "
-               << *AutoreleaseRV << "\n");
-}
-
-/// OptimizeIndividualCalls - Visit each call, one at a time, and make
-/// simplifications without doing any additional analysis.
-void ObjCARCOpt::OptimizeIndividualCalls(Function &F) {
-  // Reset all the flags in preparation for recomputing them.
-  UsedInThisFunction = 0;
-
-  // Visit all objc_* calls in F.
- for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) { - Instruction *Inst = &*I++; - - DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: Visiting: " << - *Inst << "\n"); - - InstructionClass Class = GetBasicInstructionClass(Inst); - - switch (Class) { - default: break; - - // Delete no-op casts. These function calls have special semantics, but - // the semantics are entirely implemented via lowering in the front-end, - // so by the time they reach the optimizer, they are just no-op calls - // which return their argument. - // - // There are gray areas here, as the ability to cast reference-counted - // pointers to raw void* and back allows code to break ARC assumptions, - // however these are currently considered to be unimportant. - case IC_NoopCast: - Changed = true; - ++NumNoops; - DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: Erasing no-op cast:" - " " << *Inst << "\n"); - EraseInstruction(Inst); - continue; - - // If the pointer-to-weak-pointer is null, it's undefined behavior. - case IC_StoreWeak: - case IC_LoadWeak: - case IC_LoadWeakRetained: - case IC_InitWeak: - case IC_DestroyWeak: { - CallInst *CI = cast<CallInst>(Inst); - if (isNullOrUndef(CI->getArgOperand(0))) { - Changed = true; - Type *Ty = CI->getArgOperand(0)->getType(); - new StoreInst(UndefValue::get(cast<PointerType>(Ty)->getElementType()), - Constant::getNullValue(Ty), - CI); - llvm::Value *NewValue = UndefValue::get(CI->getType()); - DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: A null " - "pointer-to-weak-pointer is undefined behavior.\n" - " Old = " << *CI << - "\n New = " << - *NewValue << "\n"); - CI->replaceAllUsesWith(NewValue); - CI->eraseFromParent(); - continue; - } - break; - } - case IC_CopyWeak: - case IC_MoveWeak: { - CallInst *CI = cast<CallInst>(Inst); - if (isNullOrUndef(CI->getArgOperand(0)) || - isNullOrUndef(CI->getArgOperand(1))) { - Changed = true; - Type *Ty = CI->getArgOperand(0)->getType(); - new StoreInst(UndefValue::get(cast<PointerType>(Ty)->getElementType()), - Constant::getNullValue(Ty), - CI); - - llvm::Value *NewValue = UndefValue::get(CI->getType()); - DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: A null " - "pointer-to-weak-pointer is undefined behavior.\n" - " Old = " << *CI << - "\n New = " << - *NewValue << "\n"); - - CI->replaceAllUsesWith(NewValue); - CI->eraseFromParent(); - continue; - } - break; - } - case IC_Retain: - OptimizeRetainCall(F, Inst); - break; - case IC_RetainRV: - if (OptimizeRetainRVCall(F, Inst)) - continue; - break; - case IC_AutoreleaseRV: - OptimizeAutoreleaseRVCall(F, Inst); - break; - } - - // objc_autorelease(x) -> objc_release(x) if x is otherwise unused. - if (IsAutorelease(Class) && Inst->use_empty()) { - CallInst *Call = cast<CallInst>(Inst); - const Value *Arg = Call->getArgOperand(0); - Arg = FindSingleUseIdentifiedObject(Arg); - if (Arg) { - Changed = true; - ++NumAutoreleases; - - // Create the declaration lazily. 
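-        // (The replacement call is tagged with clang.imprecise_release just
-        // below, marking it as a release that later phases may move.)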
- LLVMContext &C = Inst->getContext(); - CallInst *NewCall = - CallInst::Create(getReleaseCallee(F.getParent()), - Call->getArgOperand(0), "", Call); - NewCall->setMetadata(ImpreciseReleaseMDKind, - MDNode::get(C, ArrayRef<Value *>())); - - DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: Replacing " - "objc_autorelease(x) with objc_release(x) since x is " - "otherwise unused.\n" - " Old: " << *Call << - "\n New: " << - *NewCall << "\n"); - - EraseInstruction(Call); - Inst = NewCall; - Class = IC_Release; - } - } - - // For functions which can never be passed stack arguments, add - // a tail keyword. - if (IsAlwaysTail(Class)) { - Changed = true; - DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: Adding tail keyword" - " to function since it can never be passed stack args: " << *Inst << - "\n"); - cast<CallInst>(Inst)->setTailCall(); - } - - // Set nounwind as needed. - if (IsNoThrow(Class)) { - Changed = true; - DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: Found no throw" - " class. Setting nounwind on: " << *Inst << "\n"); - cast<CallInst>(Inst)->setDoesNotThrow(); - } - - if (!IsNoopOnNull(Class)) { - UsedInThisFunction |= 1 << Class; - continue; - } - - const Value *Arg = GetObjCArg(Inst); - - // ARC calls with null are no-ops. Delete them. - if (isNullOrUndef(Arg)) { - Changed = true; - ++NumNoops; - DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: ARC calls with " - " null are no-ops. Erasing: " << *Inst << "\n"); - EraseInstruction(Inst); - continue; - } - - // Keep track of which of retain, release, autorelease, and retain_block - // are actually present in this function. - UsedInThisFunction |= 1 << Class; - - // If Arg is a PHI, and one or more incoming values to the - // PHI are null, and the call is control-equivalent to the PHI, and there - // are no relevant side effects between the PHI and the call, the call - // could be pushed up to just those paths with non-null incoming values. - // For now, don't bother splitting critical edges for this. - SmallVector<std::pair<Instruction *, const Value *>, 4> Worklist; - Worklist.push_back(std::make_pair(Inst, Arg)); - do { - std::pair<Instruction *, const Value *> Pair = Worklist.pop_back_val(); - Inst = Pair.first; - Arg = Pair.second; - - const PHINode *PN = dyn_cast<PHINode>(Arg); - if (!PN) continue; - - // Determine if the PHI has any null operands, or any incoming - // critical edges. - bool HasNull = false; - bool HasCriticalEdges = false; - for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { - Value *Incoming = - StripPointerCastsAndObjCCalls(PN->getIncomingValue(i)); - if (isNullOrUndef(Incoming)) - HasNull = true; - else if (cast<TerminatorInst>(PN->getIncomingBlock(i)->back()) - .getNumSuccessors() != 1) { - HasCriticalEdges = true; - break; - } - } - // If we have null operands and no critical edges, optimize. - if (!HasCriticalEdges && HasNull) { - SmallPtrSet<Instruction *, 4> DependingInstructions; - SmallPtrSet<const BasicBlock *, 4> Visited; - - // Check that there is nothing that cares about the reference - // count between the call and the phi. - switch (Class) { - case IC_Retain: - case IC_RetainBlock: - // These can always be moved up. - break; - case IC_Release: - // These can't be moved across things that care about the retain - // count. - FindDependencies(NeedsPositiveRetainCount, Arg, - Inst->getParent(), Inst, - DependingInstructions, Visited, PA); - break; - case IC_Autorelease: - // These can't be moved across autorelease pool scope boundaries. 
- FindDependencies(AutoreleasePoolBoundary, Arg, - Inst->getParent(), Inst, - DependingInstructions, Visited, PA); - break; - case IC_RetainRV: - case IC_AutoreleaseRV: - // Don't move these; the RV optimization depends on the autoreleaseRV - // being tail called, and the retainRV being immediately after a call - // (which might still happen if we get lucky with codegen layout, but - // it's not worth taking the chance). - continue; - default: - llvm_unreachable("Invalid dependence flavor"); - } - - if (DependingInstructions.size() == 1 && - *DependingInstructions.begin() == PN) { - Changed = true; - ++NumPartialNoops; - // Clone the call into each predecessor that has a non-null value. - CallInst *CInst = cast<CallInst>(Inst); - Type *ParamTy = CInst->getArgOperand(0)->getType(); - for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { - Value *Incoming = - StripPointerCastsAndObjCCalls(PN->getIncomingValue(i)); - if (!isNullOrUndef(Incoming)) { - CallInst *Clone = cast<CallInst>(CInst->clone()); - Value *Op = PN->getIncomingValue(i); - Instruction *InsertPos = &PN->getIncomingBlock(i)->back(); - if (Op->getType() != ParamTy) - Op = new BitCastInst(Op, ParamTy, "", InsertPos); - Clone->setArgOperand(0, Op); - Clone->insertBefore(InsertPos); - Worklist.push_back(std::make_pair(Clone, Incoming)); - } - } - // Erase the original call. - EraseInstruction(CInst); - continue; - } - } - } while (!Worklist.empty()); - - DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: Finished Queue.\n\n"); - - } -} - -/// CheckForCFGHazards - Check for critical edges, loop boundaries, irreducible -/// control flow, or other CFG structures where moving code across the edge -/// would result in it being executed more. -void -ObjCARCOpt::CheckForCFGHazards(const BasicBlock *BB, - DenseMap<const BasicBlock *, BBState> &BBStates, - BBState &MyStates) const { - // If any top-down local-use or possible-dec has a succ which is earlier in - // the sequence, forget it. - for (BBState::ptr_iterator I = MyStates.top_down_ptr_begin(), - E = MyStates.top_down_ptr_end(); I != E; ++I) - switch (I->second.GetSeq()) { - default: break; - case S_Use: { - const Value *Arg = I->first; - const TerminatorInst *TI = cast<TerminatorInst>(&BB->back()); - bool SomeSuccHasSame = false; - bool AllSuccsHaveSame = true; - PtrState &S = I->second; - succ_const_iterator SI(TI), SE(TI, false); - - // If the terminator is an invoke marked with the - // clang.arc.no_objc_arc_exceptions metadata, the unwind edge can be - // ignored, for ARC purposes. - if (isa<InvokeInst>(TI) && TI->getMetadata(NoObjCARCExceptionsMDKind)) - --SE; - - for (; SI != SE; ++SI) { - Sequence SuccSSeq = S_None; - bool SuccSRRIKnownSafe = false; - // If VisitBottomUp has pointer information for this successor, take - // what we know about it. 
-      DenseMap<const BasicBlock *, BBState>::iterator BBI =
-        BBStates.find(*SI);
-      assert(BBI != BBStates.end());
-      const PtrState &SuccS = BBI->second.getPtrBottomUpState(Arg);
-      SuccSSeq = SuccS.GetSeq();
-      SuccSRRIKnownSafe = SuccS.RRI.KnownSafe;
-      switch (SuccSSeq) {
-      case S_None:
-      case S_CanRelease: {
-        if (!S.RRI.KnownSafe && !SuccSRRIKnownSafe) {
-          S.ClearSequenceProgress();
-          break;
-        }
-        continue;
-      }
-      case S_Use:
-        SomeSuccHasSame = true;
-        break;
-      case S_Stop:
-      case S_Release:
-      case S_MovableRelease:
-        if (!S.RRI.KnownSafe && !SuccSRRIKnownSafe)
-          AllSuccsHaveSame = false;
-        break;
-      case S_Retain:
-        llvm_unreachable("bottom-up pointer in retain state!");
-      }
-    }
-    // If the state at the other end of any of the successor edges
-    // matches the current state, require all edges to match. This
-    // guards against loops in the middle of a sequence.
-    if (SomeSuccHasSame && !AllSuccsHaveSame)
-      S.ClearSequenceProgress();
-    break;
-  }
-  case S_CanRelease: {
-    const Value *Arg = I->first;
-    const TerminatorInst *TI = cast<TerminatorInst>(&BB->back());
-    bool SomeSuccHasSame = false;
-    bool AllSuccsHaveSame = true;
-    PtrState &S = I->second;
-    succ_const_iterator SI(TI), SE(TI, false);
-
-    // If the terminator is an invoke marked with the
-    // clang.arc.no_objc_arc_exceptions metadata, the unwind edge can be
-    // ignored, for ARC purposes.
-    if (isa<InvokeInst>(TI) && TI->getMetadata(NoObjCARCExceptionsMDKind))
-      --SE;
-
-    for (; SI != SE; ++SI) {
-      Sequence SuccSSeq = S_None;
-      bool SuccSRRIKnownSafe = false;
-      // If VisitBottomUp has pointer information for this successor, take
-      // what we know about it.
-      DenseMap<const BasicBlock *, BBState>::iterator BBI =
-        BBStates.find(*SI);
-      assert(BBI != BBStates.end());
-      const PtrState &SuccS = BBI->second.getPtrBottomUpState(Arg);
-      SuccSSeq = SuccS.GetSeq();
-      SuccSRRIKnownSafe = SuccS.RRI.KnownSafe;
-      switch (SuccSSeq) {
-      case S_None: {
-        if (!S.RRI.KnownSafe && !SuccSRRIKnownSafe) {
-          S.ClearSequenceProgress();
-          break;
-        }
-        continue;
-      }
-      case S_CanRelease:
-        SomeSuccHasSame = true;
-        break;
-      case S_Stop:
-      case S_Release:
-      case S_MovableRelease:
-      case S_Use:
-        if (!S.RRI.KnownSafe && !SuccSRRIKnownSafe)
-          AllSuccsHaveSame = false;
-        break;
-      case S_Retain:
-        llvm_unreachable("bottom-up pointer in retain state!");
-      }
-    }
-    // If the state at the other end of any of the successor edges
-    // matches the current state, require all edges to match. This
-    // guards against loops in the middle of a sequence.
-    if (SomeSuccHasSame && !AllSuccsHaveSame)
-      S.ClearSequenceProgress();
-    break;
-  }
-  }
-}
-
-bool
-ObjCARCOpt::VisitInstructionBottomUp(Instruction *Inst,
-                                     BasicBlock *BB,
-                                     MapVector<Value *, RRInfo> &Retains,
-                                     BBState &MyStates) {
-  bool NestingDetected = false;
-  InstructionClass Class = GetInstructionClass(Inst);
-  const Value *Arg = 0;
-
-  switch (Class) {
-  case IC_Release: {
-    Arg = GetObjCArg(Inst);
-
-    PtrState &S = MyStates.getPtrBottomUpState(Arg);
-
-    // If we see two releases in a row on the same pointer, make a note,
-    // and we'll circle back to revisit it after we've hopefully eliminated
-    // the second release, which may allow us to eliminate the first release
-    // too.
-    // Theoretically we could implement removal of nested retain+release
-    // pairs by making PtrState hold a stack of states, but this is
-    // simple and avoids adding overhead for the non-nested case.
- if (S.GetSeq() == S_Release || S.GetSeq() == S_MovableRelease) - NestingDetected = true; - - MDNode *ReleaseMetadata = Inst->getMetadata(ImpreciseReleaseMDKind); - S.ResetSequenceProgress(ReleaseMetadata ? S_MovableRelease : S_Release); - S.RRI.ReleaseMetadata = ReleaseMetadata; - S.RRI.KnownSafe = S.IsKnownIncremented(); - S.RRI.IsTailCallRelease = cast<CallInst>(Inst)->isTailCall(); - S.RRI.Calls.insert(Inst); - - S.SetKnownPositiveRefCount(); - break; - } - case IC_RetainBlock: - // An objc_retainBlock call with just a use may need to be kept, - // because it may be copying a block from the stack to the heap. - if (!IsRetainBlockOptimizable(Inst)) - break; - // FALLTHROUGH - case IC_Retain: - case IC_RetainRV: { - Arg = GetObjCArg(Inst); - - PtrState &S = MyStates.getPtrBottomUpState(Arg); - S.SetKnownPositiveRefCount(); - - switch (S.GetSeq()) { - case S_Stop: - case S_Release: - case S_MovableRelease: - case S_Use: - S.RRI.ReverseInsertPts.clear(); - // FALL THROUGH - case S_CanRelease: - // Don't do retain+release tracking for IC_RetainRV, because it's - // better to let it remain as the first instruction after a call. - if (Class != IC_RetainRV) { - S.RRI.IsRetainBlock = Class == IC_RetainBlock; - Retains[Inst] = S.RRI; - } - S.ClearSequenceProgress(); - break; - case S_None: - break; - case S_Retain: - llvm_unreachable("bottom-up pointer in retain state!"); - } - return NestingDetected; - } - case IC_AutoreleasepoolPop: - // Conservatively, clear MyStates for all known pointers. - MyStates.clearBottomUpPointers(); - return NestingDetected; - case IC_AutoreleasepoolPush: - case IC_None: - // These are irrelevant. - return NestingDetected; - default: - break; - } - - // Consider any other possible effects of this instruction on each - // pointer being tracked. - for (BBState::ptr_iterator MI = MyStates.bottom_up_ptr_begin(), - ME = MyStates.bottom_up_ptr_end(); MI != ME; ++MI) { - const Value *Ptr = MI->first; - if (Ptr == Arg) - continue; // Handled above. - PtrState &S = MI->second; - Sequence Seq = S.GetSeq(); - - // Check for possible releases. - if (CanAlterRefCount(Inst, Ptr, PA, Class)) { - S.ClearRefCount(); - switch (Seq) { - case S_Use: - S.SetSeq(S_CanRelease); - continue; - case S_CanRelease: - case S_Release: - case S_MovableRelease: - case S_Stop: - case S_None: - break; - case S_Retain: - llvm_unreachable("bottom-up pointer in retain state!"); - } - } - - // Check for possible direct uses. - switch (Seq) { - case S_Release: - case S_MovableRelease: - if (CanUse(Inst, Ptr, PA, Class)) { - assert(S.RRI.ReverseInsertPts.empty()); - // If this is an invoke instruction, we're scanning it as part of - // one of its successor blocks, since we can't insert code after it - // in its own block, and we don't want to split critical edges. - if (isa<InvokeInst>(Inst)) - S.RRI.ReverseInsertPts.insert(BB->getFirstInsertionPt()); - else - S.RRI.ReverseInsertPts.insert(llvm::next(BasicBlock::iterator(Inst))); - S.SetSeq(S_Use); - } else if (Seq == S_Release && - (Class == IC_User || Class == IC_CallOrUser)) { - // Non-movable releases depend on any possible objc pointer use. - S.SetSeq(S_Stop); - assert(S.RRI.ReverseInsertPts.empty()); - // As above; handle invoke specially. 
-        if (isa<InvokeInst>(Inst))
-          S.RRI.ReverseInsertPts.insert(BB->getFirstInsertionPt());
-        else
-          S.RRI.ReverseInsertPts.insert(llvm::next(BasicBlock::iterator(Inst)));
-      }
-      break;
-    case S_Stop:
-      if (CanUse(Inst, Ptr, PA, Class))
-        S.SetSeq(S_Use);
-      break;
-    case S_CanRelease:
-    case S_Use:
-    case S_None:
-      break;
-    case S_Retain:
-      llvm_unreachable("bottom-up pointer in retain state!");
-    }
-  }
-
-  return NestingDetected;
-}
-
-bool
-ObjCARCOpt::VisitBottomUp(BasicBlock *BB,
-                          DenseMap<const BasicBlock *, BBState> &BBStates,
-                          MapVector<Value *, RRInfo> &Retains) {
-  bool NestingDetected = false;
-  BBState &MyStates = BBStates[BB];
-
-  // Merge the states from each successor to compute the initial state
-  // for the current block.
-  BBState::edge_iterator SI(MyStates.succ_begin()),
-                         SE(MyStates.succ_end());
-  if (SI != SE) {
-    const BasicBlock *Succ = *SI;
-    DenseMap<const BasicBlock *, BBState>::iterator I = BBStates.find(Succ);
-    assert(I != BBStates.end());
-    MyStates.InitFromSucc(I->second);
-    ++SI;
-    for (; SI != SE; ++SI) {
-      Succ = *SI;
-      I = BBStates.find(Succ);
-      assert(I != BBStates.end());
-      MyStates.MergeSucc(I->second);
-    }
-  }
-
-  // Visit all the instructions, bottom-up.
-  for (BasicBlock::iterator I = BB->end(), E = BB->begin(); I != E; --I) {
-    Instruction *Inst = llvm::prior(I);
-
-    // Invoke instructions are visited as part of their successors (below).
-    if (isa<InvokeInst>(Inst))
-      continue;
-
-    NestingDetected |= VisitInstructionBottomUp(Inst, BB, Retains, MyStates);
-  }
-
-  // If there's a predecessor with an invoke, visit the invoke as if it were
-  // part of this block, since we can't insert code after an invoke in its own
-  // block, and we don't want to split critical edges.
-  for (BBState::edge_iterator PI(MyStates.pred_begin()),
-       PE(MyStates.pred_end()); PI != PE; ++PI) {
-    BasicBlock *Pred = *PI;
-    if (InvokeInst *II = dyn_cast<InvokeInst>(&Pred->back()))
-      NestingDetected |= VisitInstructionBottomUp(II, BB, Retains, MyStates);
-  }
-
-  return NestingDetected;
-}
-
-bool
-ObjCARCOpt::VisitInstructionTopDown(Instruction *Inst,
-                                    DenseMap<Value *, RRInfo> &Releases,
-                                    BBState &MyStates) {
-  bool NestingDetected = false;
-  InstructionClass Class = GetInstructionClass(Inst);
-  const Value *Arg = 0;
-
-  switch (Class) {
-  case IC_RetainBlock:
-    // An objc_retainBlock call with just a use may need to be kept,
-    // because it may be copying a block from the stack to the heap.
-    if (!IsRetainBlockOptimizable(Inst))
-      break;
-    // FALLTHROUGH
-  case IC_Retain:
-  case IC_RetainRV: {
-    Arg = GetObjCArg(Inst);
-
-    PtrState &S = MyStates.getPtrTopDownState(Arg);
-
-    // Don't do retain+release tracking for IC_RetainRV, because it's
-    // better to let it remain as the first instruction after a call.
-    if (Class != IC_RetainRV) {
-      // If we see two retains in a row on the same pointer, make
-      // a note, and we'll circle back to revisit it after we've
-      // hopefully eliminated the second retain, which may allow us to
-      // eliminate the first retain too.
-      // Theoretically we could implement removal of nested retain+release
-      // pairs by making PtrState hold a stack of states, but this is
-      // simple and avoids adding overhead for the non-nested case.
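VisitBottomUp's overall shape, seen just above, is an ordinary backward dataflow step: seed the block state from the first successor, merge in the rest, then run a transfer function over the block's instructions in reverse. A generic sketch of that shape, with illustrative template names rather than the pass's real types:

#include <cstddef>
#include <functional>
#include <vector>

template <typename State, typename Inst>
State visitBlockBottomUp(
    const std::vector<Inst> &Block,
    const std::vector<State> &SuccStates,
    const std::function<void(State &, const State &)> &Merge,
    const std::function<void(State &, const Inst &)> &Transfer) {
  State S{};                          // blocks with no successors start empty
  if (!SuccStates.empty()) {
    S = SuccStates.front();           // plays the role of InitFromSucc
    for (std::size_t i = 1; i < SuccStates.size(); ++i)
      Merge(S, SuccStates[i]);        // plays the role of MergeSucc
  }
  for (auto I = Block.rbegin(), E = Block.rend(); I != E; ++I)
    Transfer(S, *I);                  // plays the role of VisitInstructionBottomUp
  return S;
}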
-      if (S.GetSeq() == S_Retain)
-        NestingDetected = true;
-
-      S.ResetSequenceProgress(S_Retain);
-      S.RRI.IsRetainBlock = Class == IC_RetainBlock;
-      S.RRI.KnownSafe = S.IsKnownIncremented();
-      S.RRI.Calls.insert(Inst);
-    }
-
-    S.SetKnownPositiveRefCount();
-
-    // A retain can be a potential use; proceed to the generic checking
-    // code below.
-    break;
-  }
-  case IC_Release: {
-    Arg = GetObjCArg(Inst);
-
-    PtrState &S = MyStates.getPtrTopDownState(Arg);
-    S.ClearRefCount();
-
-    switch (S.GetSeq()) {
-    case S_Retain:
-    case S_CanRelease:
-      S.RRI.ReverseInsertPts.clear();
-      // FALL THROUGH
-    case S_Use:
-      S.RRI.ReleaseMetadata = Inst->getMetadata(ImpreciseReleaseMDKind);
-      S.RRI.IsTailCallRelease = cast<CallInst>(Inst)->isTailCall();
-      Releases[Inst] = S.RRI;
-      S.ClearSequenceProgress();
-      break;
-    case S_None:
-      break;
-    case S_Stop:
-    case S_Release:
-    case S_MovableRelease:
-      llvm_unreachable("top-down pointer in release state!");
-    }
-    break;
-  }
-  case IC_AutoreleasepoolPop:
-    // Conservatively, clear MyStates for all known pointers.
-    MyStates.clearTopDownPointers();
-    return NestingDetected;
-  case IC_AutoreleasepoolPush:
-  case IC_None:
-    // These are irrelevant.
-    return NestingDetected;
-  default:
-    break;
-  }
-
-  // Consider any other possible effects of this instruction on each
-  // pointer being tracked.
-  for (BBState::ptr_iterator MI = MyStates.top_down_ptr_begin(),
-       ME = MyStates.top_down_ptr_end(); MI != ME; ++MI) {
-    const Value *Ptr = MI->first;
-    if (Ptr == Arg)
-      continue; // Handled above.
-    PtrState &S = MI->second;
-    Sequence Seq = S.GetSeq();
-
-    // Check for possible releases.
-    if (CanAlterRefCount(Inst, Ptr, PA, Class)) {
-      S.ClearRefCount();
-      switch (Seq) {
-      case S_Retain:
-        S.SetSeq(S_CanRelease);
-        assert(S.RRI.ReverseInsertPts.empty());
-        S.RRI.ReverseInsertPts.insert(Inst);
-
-        // One call can't cause a transition from S_Retain to S_CanRelease
-        // and S_CanRelease to S_Use. If we've made the first transition,
-        // we're done.
-        continue;
-      case S_Use:
-      case S_CanRelease:
-      case S_None:
-        break;
-      case S_Stop:
-      case S_Release:
-      case S_MovableRelease:
-        llvm_unreachable("top-down pointer in release state!");
-      }
-    }
-
-    // Check for possible direct uses.
-    switch (Seq) {
-    case S_CanRelease:
-      if (CanUse(Inst, Ptr, PA, Class))
-        S.SetSeq(S_Use);
-      break;
-    case S_Retain:
-    case S_Use:
-    case S_None:
-      break;
-    case S_Stop:
-    case S_Release:
-    case S_MovableRelease:
-      llvm_unreachable("top-down pointer in release state!");
-    }
-  }
-
-  return NestingDetected;
-}
-
-bool
-ObjCARCOpt::VisitTopDown(BasicBlock *BB,
-                         DenseMap<const BasicBlock *, BBState> &BBStates,
-                         DenseMap<Value *, RRInfo> &Releases) {
-  bool NestingDetected = false;
-  BBState &MyStates = BBStates[BB];
-
-  // Merge the states from each predecessor to compute the initial state
-  // for the current block.
-  BBState::edge_iterator PI(MyStates.pred_begin()),
-                         PE(MyStates.pred_end());
-  if (PI != PE) {
-    const BasicBlock *Pred = *PI;
-    DenseMap<const BasicBlock *, BBState>::iterator I = BBStates.find(Pred);
-    assert(I != BBStates.end());
-    MyStates.InitFromPred(I->second);
-    ++PI;
-    for (; PI != PE; ++PI) {
-      Pred = *PI;
-      I = BBStates.find(Pred);
-      assert(I != BBStates.end());
-      MyStates.MergePred(I->second);
-    }
-  }
-
-  // Visit all the instructions, top-down.
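Before the top-down instruction walk below, the per-pointer switch above is worth seeing in miniature: it is a small state machine in which an instruction that might decrement the reference count advances S_Retain to S_CanRelease, and one that might use the pointer advances S_CanRelease to S_Use, with at most one transition per instruction. A compact model, with illustrative names only:

enum Sequence { S_None, S_Retain, S_CanRelease, S_Use };

// One top-down step for one tracked pointer at one instruction.
Sequence stepTopDown(Sequence Seq, bool MayDecrementRC, bool MayUsePtr) {
  if (MayDecrementRC && Seq == S_Retain)
    return S_CanRelease;  // a decrement could let the object be released here
  if (MayUsePtr && Seq == S_CanRelease)
    return S_Use;         // the paired release must not move above this use
  return Seq;             // a single call makes at most one transition
}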
-  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
-    Instruction *Inst = I;
-    NestingDetected |= VisitInstructionTopDown(Inst, Releases, MyStates);
-  }
-
-  CheckForCFGHazards(BB, BBStates, MyStates);
-  return NestingDetected;
-}
-
-static void
-ComputePostOrders(Function &F,
-                  SmallVectorImpl<BasicBlock *> &PostOrder,
-                  SmallVectorImpl<BasicBlock *> &ReverseCFGPostOrder,
-                  unsigned NoObjCARCExceptionsMDKind,
-                  DenseMap<const BasicBlock *, BBState> &BBStates) {
-  /// Visited - The visited set, for doing DFS walks.
-  SmallPtrSet<BasicBlock *, 16> Visited;
-
-  // Do DFS, computing the PostOrder.
-  SmallPtrSet<BasicBlock *, 16> OnStack;
-  SmallVector<std::pair<BasicBlock *, succ_iterator>, 16> SuccStack;
-
-  // Functions always have exactly one entry block, and we don't have
-  // any other block that we treat like an entry block.
-  BasicBlock *EntryBB = &F.getEntryBlock();
-  BBState &MyStates = BBStates[EntryBB];
-  MyStates.SetAsEntry();
-  TerminatorInst *EntryTI = cast<TerminatorInst>(&EntryBB->back());
-  SuccStack.push_back(std::make_pair(EntryBB, succ_iterator(EntryTI)));
-  Visited.insert(EntryBB);
-  OnStack.insert(EntryBB);
-  do {
-  dfs_next_succ:
-    BasicBlock *CurrBB = SuccStack.back().first;
-    TerminatorInst *TI = cast<TerminatorInst>(&CurrBB->back());
-    succ_iterator SE(TI, false);
-
-    // If the terminator is an invoke marked with the
-    // clang.arc.no_objc_arc_exceptions metadata, the unwind edge can be
-    // ignored, for ARC purposes.
-    if (isa<InvokeInst>(TI) && TI->getMetadata(NoObjCARCExceptionsMDKind))
-      --SE;
-
-    while (SuccStack.back().second != SE) {
-      BasicBlock *SuccBB = *SuccStack.back().second++;
-      if (Visited.insert(SuccBB)) {
-        TerminatorInst *TI = cast<TerminatorInst>(&SuccBB->back());
-        SuccStack.push_back(std::make_pair(SuccBB, succ_iterator(TI)));
-        BBStates[CurrBB].addSucc(SuccBB);
-        BBState &SuccStates = BBStates[SuccBB];
-        SuccStates.addPred(CurrBB);
-        OnStack.insert(SuccBB);
-        goto dfs_next_succ;
-      }
-
-      if (!OnStack.count(SuccBB)) {
-        BBStates[CurrBB].addSucc(SuccBB);
-        BBStates[SuccBB].addPred(CurrBB);
-      }
-    }
-    OnStack.erase(CurrBB);
-    PostOrder.push_back(CurrBB);
-    SuccStack.pop_back();
-  } while (!SuccStack.empty());
-
-  Visited.clear();
-
-  // Do reverse-CFG DFS, computing the reverse-CFG PostOrder.
-  // Functions may have many exits, and there may also be blocks which we
-  // treat as exits due to ignored edges.
-  SmallVector<std::pair<BasicBlock *, BBState::edge_iterator>, 16> PredStack;
-  for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) {
-    BasicBlock *ExitBB = I;
-    BBState &MyStates = BBStates[ExitBB];
-    if (!MyStates.isExit())
-      continue;
-
-    MyStates.SetAsExit();
-
-    PredStack.push_back(std::make_pair(ExitBB, MyStates.pred_begin()));
-    Visited.insert(ExitBB);
-    while (!PredStack.empty()) {
-    reverse_dfs_next_succ:
-      BBState::edge_iterator PE = BBStates[PredStack.back().first].pred_end();
-      while (PredStack.back().second != PE) {
-        BasicBlock *BB = *PredStack.back().second++;
-        if (Visited.insert(BB)) {
-          PredStack.push_back(std::make_pair(BB, BBStates[BB].pred_begin()));
-          goto reverse_dfs_next_succ;
-        }
-      }
-      ReverseCFGPostOrder.push_back(PredStack.pop_back_val().first);
-    }
-  }
-}
-
-// Visit - Visit the function both top-down and bottom-up.
-bool
-ObjCARCOpt::Visit(Function &F,
-                  DenseMap<const BasicBlock *, BBState> &BBStates,
-                  MapVector<Value *, RRInfo> &Retains,
-                  DenseMap<Value *, RRInfo> &Releases) {
-
-  // Use reverse-postorder traversals, because we magically know that loops
-  // will be well behaved, i.e.
they won't repeatedly call retain on a single - // pointer without doing a release. We can't use the ReversePostOrderTraversal - // class here because we want the reverse-CFG postorder to consider each - // function exit point, and we want to ignore selected cycle edges. - SmallVector<BasicBlock *, 16> PostOrder; - SmallVector<BasicBlock *, 16> ReverseCFGPostOrder; - ComputePostOrders(F, PostOrder, ReverseCFGPostOrder, - NoObjCARCExceptionsMDKind, - BBStates); - - // Use reverse-postorder on the reverse CFG for bottom-up. - bool BottomUpNestingDetected = false; - for (SmallVectorImpl<BasicBlock *>::const_reverse_iterator I = - ReverseCFGPostOrder.rbegin(), E = ReverseCFGPostOrder.rend(); - I != E; ++I) - BottomUpNestingDetected |= VisitBottomUp(*I, BBStates, Retains); - - // Use reverse-postorder for top-down. - bool TopDownNestingDetected = false; - for (SmallVectorImpl<BasicBlock *>::const_reverse_iterator I = - PostOrder.rbegin(), E = PostOrder.rend(); - I != E; ++I) - TopDownNestingDetected |= VisitTopDown(*I, BBStates, Releases); - - return TopDownNestingDetected && BottomUpNestingDetected; -} - -/// MoveCalls - Move the calls in RetainsToMove and ReleasesToMove. -void ObjCARCOpt::MoveCalls(Value *Arg, - RRInfo &RetainsToMove, - RRInfo &ReleasesToMove, - MapVector<Value *, RRInfo> &Retains, - DenseMap<Value *, RRInfo> &Releases, - SmallVectorImpl<Instruction *> &DeadInsts, - Module *M) { - Type *ArgTy = Arg->getType(); - Type *ParamTy = PointerType::getUnqual(Type::getInt8Ty(ArgTy->getContext())); - - // Insert the new retain and release calls. - for (SmallPtrSet<Instruction *, 2>::const_iterator - PI = ReleasesToMove.ReverseInsertPts.begin(), - PE = ReleasesToMove.ReverseInsertPts.end(); PI != PE; ++PI) { - Instruction *InsertPt = *PI; - Value *MyArg = ArgTy == ParamTy ? Arg : - new BitCastInst(Arg, ParamTy, "", InsertPt); - CallInst *Call = - CallInst::Create(RetainsToMove.IsRetainBlock ? - getRetainBlockCallee(M) : getRetainCallee(M), - MyArg, "", InsertPt); - Call->setDoesNotThrow(); - if (RetainsToMove.IsRetainBlock) - Call->setMetadata(CopyOnEscapeMDKind, - MDNode::get(M->getContext(), ArrayRef<Value *>())); - else - Call->setTailCall(); - } - for (SmallPtrSet<Instruction *, 2>::const_iterator - PI = RetainsToMove.ReverseInsertPts.begin(), - PE = RetainsToMove.ReverseInsertPts.end(); PI != PE; ++PI) { - Instruction *InsertPt = *PI; - Value *MyArg = ArgTy == ParamTy ? Arg : - new BitCastInst(Arg, ParamTy, "", InsertPt); - CallInst *Call = CallInst::Create(getReleaseCallee(M), MyArg, - "", InsertPt); - // Attach a clang.imprecise_release metadata tag, if appropriate. - if (MDNode *M = ReleasesToMove.ReleaseMetadata) - Call->setMetadata(ImpreciseReleaseMDKind, M); - Call->setDoesNotThrow(); - if (ReleasesToMove.IsTailCallRelease) - Call->setTailCall(); - } - - // Delete the original retain and release calls. - for (SmallPtrSet<Instruction *, 2>::const_iterator - AI = RetainsToMove.Calls.begin(), - AE = RetainsToMove.Calls.end(); AI != AE; ++AI) { - Instruction *OrigRetain = *AI; - Retains.blot(OrigRetain); - DeadInsts.push_back(OrigRetain); - } - for (SmallPtrSet<Instruction *, 2>::const_iterator - AI = ReleasesToMove.Calls.begin(), - AE = ReleasesToMove.Calls.end(); AI != AE; ++AI) { - Instruction *OrigRelease = *AI; - Releases.erase(OrigRelease); - DeadInsts.push_back(OrigRelease); - } -} - -/// PerformCodePlacement - Identify pairings between the retains and releases, -/// and delete and/or move them. 
-bool -ObjCARCOpt::PerformCodePlacement(DenseMap<const BasicBlock *, BBState> - &BBStates, - MapVector<Value *, RRInfo> &Retains, - DenseMap<Value *, RRInfo> &Releases, - Module *M) { - bool AnyPairsCompletelyEliminated = false; - RRInfo RetainsToMove; - RRInfo ReleasesToMove; - SmallVector<Instruction *, 4> NewRetains; - SmallVector<Instruction *, 4> NewReleases; - SmallVector<Instruction *, 8> DeadInsts; - - // Visit each retain. - for (MapVector<Value *, RRInfo>::const_iterator I = Retains.begin(), - E = Retains.end(); I != E; ++I) { - Value *V = I->first; - if (!V) continue; // blotted - - Instruction *Retain = cast<Instruction>(V); - Value *Arg = GetObjCArg(Retain); - - // If the object being released is in static or stack storage, we know it's - // not being managed by ObjC reference counting, so we can delete pairs - // regardless of what possible decrements or uses lie between them. - bool KnownSafe = isa<Constant>(Arg) || isa<AllocaInst>(Arg); - - // A constant pointer can't be pointing to an object on the heap. It may - // be reference-counted, but it won't be deleted. - if (const LoadInst *LI = dyn_cast<LoadInst>(Arg)) - if (const GlobalVariable *GV = - dyn_cast<GlobalVariable>( - StripPointerCastsAndObjCCalls(LI->getPointerOperand()))) - if (GV->isConstant()) - KnownSafe = true; - - // If a pair happens in a region where it is known that the reference count - // is already incremented, we can similarly ignore possible decrements. - bool KnownSafeTD = true, KnownSafeBU = true; - - // Connect the dots between the top-down-collected RetainsToMove and - // bottom-up-collected ReleasesToMove to form sets of related calls. - // This is an iterative process so that we connect multiple releases - // to multiple retains if needed. - unsigned OldDelta = 0; - unsigned NewDelta = 0; - unsigned OldCount = 0; - unsigned NewCount = 0; - bool FirstRelease = true; - bool FirstRetain = true; - NewRetains.push_back(Retain); - for (;;) { - for (SmallVectorImpl<Instruction *>::const_iterator - NI = NewRetains.begin(), NE = NewRetains.end(); NI != NE; ++NI) { - Instruction *NewRetain = *NI; - MapVector<Value *, RRInfo>::const_iterator It = Retains.find(NewRetain); - assert(It != Retains.end()); - const RRInfo &NewRetainRRI = It->second; - KnownSafeTD &= NewRetainRRI.KnownSafe; - for (SmallPtrSet<Instruction *, 2>::const_iterator - LI = NewRetainRRI.Calls.begin(), - LE = NewRetainRRI.Calls.end(); LI != LE; ++LI) { - Instruction *NewRetainRelease = *LI; - DenseMap<Value *, RRInfo>::const_iterator Jt = - Releases.find(NewRetainRelease); - if (Jt == Releases.end()) - goto next_retain; - const RRInfo &NewRetainReleaseRRI = Jt->second; - assert(NewRetainReleaseRRI.Calls.count(NewRetain)); - if (ReleasesToMove.Calls.insert(NewRetainRelease)) { - OldDelta -= - BBStates[NewRetainRelease->getParent()].GetAllPathCount(); - - // Merge the ReleaseMetadata and IsTailCallRelease values. - if (FirstRelease) { - ReleasesToMove.ReleaseMetadata = - NewRetainReleaseRRI.ReleaseMetadata; - ReleasesToMove.IsTailCallRelease = - NewRetainReleaseRRI.IsTailCallRelease; - FirstRelease = false; - } else { - if (ReleasesToMove.ReleaseMetadata != - NewRetainReleaseRRI.ReleaseMetadata) - ReleasesToMove.ReleaseMetadata = 0; - if (ReleasesToMove.IsTailCallRelease != - NewRetainReleaseRRI.IsTailCallRelease) - ReleasesToMove.IsTailCallRelease = false; - } - - // Collect the optimal insertion points. 
- if (!KnownSafe) - for (SmallPtrSet<Instruction *, 2>::const_iterator - RI = NewRetainReleaseRRI.ReverseInsertPts.begin(), - RE = NewRetainReleaseRRI.ReverseInsertPts.end(); - RI != RE; ++RI) { - Instruction *RIP = *RI; - if (ReleasesToMove.ReverseInsertPts.insert(RIP)) - NewDelta -= BBStates[RIP->getParent()].GetAllPathCount(); - } - NewReleases.push_back(NewRetainRelease); - } - } - } - NewRetains.clear(); - if (NewReleases.empty()) break; - - // Back the other way. - for (SmallVectorImpl<Instruction *>::const_iterator - NI = NewReleases.begin(), NE = NewReleases.end(); NI != NE; ++NI) { - Instruction *NewRelease = *NI; - DenseMap<Value *, RRInfo>::const_iterator It = - Releases.find(NewRelease); - assert(It != Releases.end()); - const RRInfo &NewReleaseRRI = It->second; - KnownSafeBU &= NewReleaseRRI.KnownSafe; - for (SmallPtrSet<Instruction *, 2>::const_iterator - LI = NewReleaseRRI.Calls.begin(), - LE = NewReleaseRRI.Calls.end(); LI != LE; ++LI) { - Instruction *NewReleaseRetain = *LI; - MapVector<Value *, RRInfo>::const_iterator Jt = - Retains.find(NewReleaseRetain); - if (Jt == Retains.end()) - goto next_retain; - const RRInfo &NewReleaseRetainRRI = Jt->second; - assert(NewReleaseRetainRRI.Calls.count(NewRelease)); - if (RetainsToMove.Calls.insert(NewReleaseRetain)) { - unsigned PathCount = - BBStates[NewReleaseRetain->getParent()].GetAllPathCount(); - OldDelta += PathCount; - OldCount += PathCount; - - // Merge the IsRetainBlock values. - if (FirstRetain) { - RetainsToMove.IsRetainBlock = NewReleaseRetainRRI.IsRetainBlock; - FirstRetain = false; - } else if (ReleasesToMove.IsRetainBlock != - NewReleaseRetainRRI.IsRetainBlock) - // It's not possible to merge the sequences if one uses - // objc_retain and the other uses objc_retainBlock. - goto next_retain; - - // Collect the optimal insertion points. - if (!KnownSafe) - for (SmallPtrSet<Instruction *, 2>::const_iterator - RI = NewReleaseRetainRRI.ReverseInsertPts.begin(), - RE = NewReleaseRetainRRI.ReverseInsertPts.end(); - RI != RE; ++RI) { - Instruction *RIP = *RI; - if (RetainsToMove.ReverseInsertPts.insert(RIP)) { - PathCount = BBStates[RIP->getParent()].GetAllPathCount(); - NewDelta += PathCount; - NewCount += PathCount; - } - } - NewRetains.push_back(NewReleaseRetain); - } - } - } - NewReleases.clear(); - if (NewRetains.empty()) break; - } - - // If the pointer is known incremented or nested, we can safely delete the - // pair regardless of what's between them. - if (KnownSafeTD || KnownSafeBU) { - RetainsToMove.ReverseInsertPts.clear(); - ReleasesToMove.ReverseInsertPts.clear(); - NewCount = 0; - } else { - // Determine whether the new insertion points we computed preserve the - // balance of retain and release calls through the program. - // TODO: If the fully aggressive solution isn't valid, try to find a - // less aggressive solution which is. - if (NewDelta != 0) - goto next_retain; - } - - // Determine whether the original call points are balanced in the retain and - // release calls through the program. If not, conservatively don't touch - // them. - // TODO: It's theoretically possible to do code motion in this case, as - // long as the existing imbalances are maintained. - if (OldDelta != 0) - goto next_retain; - - // Ok, everything checks out and we're all set. Let's move some code! 
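The OldDelta/NewDelta arithmetic above leans on GetAllPathCount, which weights each block by the number of entry-to-exit paths that run through it; retain and release sets may only be paired and moved when the counts weighted this way come out balanced. A small self-contained demo of that weighting on a diamond CFG (toy graph encoding, not the pass's BBState; the code-motion step itself follows below):

#include <cstdio>
#include <vector>

int main() {
  // Diamond CFG: 0 -> {1,2} -> 3, blocks indexed in topological order.
  std::vector<std::vector<int>> Preds = {{}, {0}, {0}, {1, 2}};
  std::vector<std::vector<int>> Succs = {{1, 2}, {3}, {3}, {}};
  const std::size_t N = Preds.size();

  // Top-down: number of paths from the entry block to each block.
  std::vector<unsigned> FromEntry(N, 0);
  FromEntry[0] = 1;
  for (std::size_t BB = 1; BB < N; ++BB)
    for (int P : Preds[BB])
      FromEntry[BB] += FromEntry[P];

  // Bottom-up: number of paths from each block to the exit block.
  std::vector<unsigned> ToExit(N, 0);
  ToExit[N - 1] = 1;
  for (std::size_t BB = N - 1; BB-- > 0;)
    for (int S : Succs[BB])
      ToExit[BB] += ToExit[S];

  // Paths *through* a block is the product; a retain in the entry block
  // (2 paths) balances a release in each arm (1 + 1 paths).
  for (std::size_t BB = 0; BB < N; ++BB)
    std::printf("block %zu: %u paths\n", BB, FromEntry[BB] * ToExit[BB]);
  return 0;
}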
- Changed = true; - assert(OldCount != 0 && "Unreachable code?"); - AnyPairsCompletelyEliminated = NewCount == 0; - NumRRs += OldCount - NewCount; - MoveCalls(Arg, RetainsToMove, ReleasesToMove, - Retains, Releases, DeadInsts, M); - - next_retain: - NewReleases.clear(); - NewRetains.clear(); - RetainsToMove.clear(); - ReleasesToMove.clear(); - } - - // Now that we're done moving everything, we can delete the newly dead - // instructions, as we no longer need them as insert points. - while (!DeadInsts.empty()) - EraseInstruction(DeadInsts.pop_back_val()); - - return AnyPairsCompletelyEliminated; -} - -/// OptimizeWeakCalls - Weak pointer optimizations. -void ObjCARCOpt::OptimizeWeakCalls(Function &F) { - // First, do memdep-style RLE and S2L optimizations. We can't use memdep - // itself because it uses AliasAnalysis and we need to do provenance - // queries instead. - for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) { - Instruction *Inst = &*I++; - - DEBUG(dbgs() << "ObjCARCOpt::OptimizeWeakCalls: Visiting: " << *Inst << - "\n"); - - InstructionClass Class = GetBasicInstructionClass(Inst); - if (Class != IC_LoadWeak && Class != IC_LoadWeakRetained) - continue; - - // Delete objc_loadWeak calls with no users. - if (Class == IC_LoadWeak && Inst->use_empty()) { - Inst->eraseFromParent(); - continue; - } - - // TODO: For now, just look for an earlier available version of this value - // within the same block. Theoretically, we could do memdep-style non-local - // analysis too, but that would want caching. A better approach would be to - // use the technique that EarlyCSE uses. - inst_iterator Current = llvm::prior(I); - BasicBlock *CurrentBB = Current.getBasicBlockIterator(); - for (BasicBlock::iterator B = CurrentBB->begin(), - J = Current.getInstructionIterator(); - J != B; --J) { - Instruction *EarlierInst = &*llvm::prior(J); - InstructionClass EarlierClass = GetInstructionClass(EarlierInst); - switch (EarlierClass) { - case IC_LoadWeak: - case IC_LoadWeakRetained: { - // If this is loading from the same pointer, replace this load's value - // with that one. - CallInst *Call = cast<CallInst>(Inst); - CallInst *EarlierCall = cast<CallInst>(EarlierInst); - Value *Arg = Call->getArgOperand(0); - Value *EarlierArg = EarlierCall->getArgOperand(0); - switch (PA.getAA()->alias(Arg, EarlierArg)) { - case AliasAnalysis::MustAlias: - Changed = true; - // If the load has a builtin retain, insert a plain retain for it. - if (Class == IC_LoadWeakRetained) { - CallInst *CI = - CallInst::Create(getRetainCallee(F.getParent()), EarlierCall, - "", Call); - CI->setTailCall(); - } - // Zap the fully redundant load. - Call->replaceAllUsesWith(EarlierCall); - Call->eraseFromParent(); - goto clobbered; - case AliasAnalysis::MayAlias: - case AliasAnalysis::PartialAlias: - goto clobbered; - case AliasAnalysis::NoAlias: - break; - } - break; - } - case IC_StoreWeak: - case IC_InitWeak: { - // If this is storing to the same pointer and has the same size etc. - // replace this load's value with the stored value. - CallInst *Call = cast<CallInst>(Inst); - CallInst *EarlierCall = cast<CallInst>(EarlierInst); - Value *Arg = Call->getArgOperand(0); - Value *EarlierArg = EarlierCall->getArgOperand(0); - switch (PA.getAA()->alias(Arg, EarlierArg)) { - case AliasAnalysis::MustAlias: - Changed = true; - // If the load has a builtin retain, insert a plain retain for it. 
-          if (Class == IC_LoadWeakRetained) {
-            CallInst *CI =
-              CallInst::Create(getRetainCallee(F.getParent()), EarlierCall,
-                               "", Call);
-            CI->setTailCall();
-          }
-          // Zap the fully redundant load.
-          Call->replaceAllUsesWith(EarlierCall->getArgOperand(1));
-          Call->eraseFromParent();
-          goto clobbered;
-        case AliasAnalysis::MayAlias:
-        case AliasAnalysis::PartialAlias:
-          goto clobbered;
-        case AliasAnalysis::NoAlias:
-          break;
-        }
-        break;
-      }
-      case IC_MoveWeak:
-      case IC_CopyWeak:
-        // TODO: Grab the copied value.
-        goto clobbered;
-      case IC_AutoreleasepoolPush:
-      case IC_None:
-      case IC_User:
-        // Weak pointers are only modified through the weak entry points
-        // (and arbitrary calls, which could call the weak entry points).
-        break;
-      default:
-        // Anything else could modify the weak pointer.
-        goto clobbered;
-      }
-    }
-  clobbered:;
-  }
-
-  // Then, for each destroyWeak with an alloca operand, check to see if
-  // the alloca and all its users can be zapped.
-  for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) {
-    Instruction *Inst = &*I++;
-    InstructionClass Class = GetBasicInstructionClass(Inst);
-    if (Class != IC_DestroyWeak)
-      continue;
-
-    CallInst *Call = cast<CallInst>(Inst);
-    Value *Arg = Call->getArgOperand(0);
-    if (AllocaInst *Alloca = dyn_cast<AllocaInst>(Arg)) {
-      for (Value::use_iterator UI = Alloca->use_begin(),
-           UE = Alloca->use_end(); UI != UE; ++UI) {
-        const Instruction *UserInst = cast<Instruction>(*UI);
-        switch (GetBasicInstructionClass(UserInst)) {
-        case IC_InitWeak:
-        case IC_StoreWeak:
-        case IC_DestroyWeak:
-          continue;
-        default:
-          goto done;
-        }
-      }
-      Changed = true;
-      for (Value::use_iterator UI = Alloca->use_begin(),
-           UE = Alloca->use_end(); UI != UE; ) {
-        CallInst *UserInst = cast<CallInst>(*UI++);
-        switch (GetBasicInstructionClass(UserInst)) {
-        case IC_InitWeak:
-        case IC_StoreWeak:
-          // These functions return their second argument.
-          UserInst->replaceAllUsesWith(UserInst->getArgOperand(1));
-          break;
-        case IC_DestroyWeak:
-          // No return value.
-          break;
-        default:
-          llvm_unreachable("alloca really is used!");
-        }
-        UserInst->eraseFromParent();
-      }
-      Alloca->eraseFromParent();
-    done:;
-    }
-  }
-
-  DEBUG(dbgs() << "ObjCARCOpt::OptimizeWeakCalls: Finished List.\n\n");
-
-}
-
-/// OptimizeSequences - Identify program paths which execute sequences of
-/// retains and releases which can be eliminated.
-bool ObjCARCOpt::OptimizeSequences(Function &F) {
-  /// Releases, Retains - These are used to store the results of the main flow
-  /// analysis. These use Value* as the key instead of Instruction* so that the
-  /// map stays valid when we get around to rewriting code and calls get
-  /// replaced by arguments.
-  DenseMap<Value *, RRInfo> Releases;
-  MapVector<Value *, RRInfo> Retains;
-
-  /// BBStates - This is used during the traversal of the function to track the
-  /// states for each identified object at each block.
-  DenseMap<const BasicBlock *, BBState> BBStates;
-
-  // Analyze the CFG of the function, and all instructions.
-  bool NestingDetected = Visit(F, BBStates, Retains, Releases);
-
-  // Transform.
-  return PerformCodePlacement(BBStates, Retains, Releases, F.getParent()) &&
-         NestingDetected;
-}
-
-/// OptimizeReturns - Look for this pattern:
-/// \code
-/// %call = call i8* @something(...)
-/// %2 = call i8* @objc_retain(i8* %call)
-/// %3 = call i8* @objc_autorelease(i8* %2)
-/// ret i8* %3
-/// \endcode
-/// And delete the retain and autorelease.
-/// -/// Otherwise if it's just this: -/// \code -/// %3 = call i8* @objc_autorelease(i8* %2) -/// ret i8* %3 -/// \endcode -/// convert the autorelease to autoreleaseRV. -void ObjCARCOpt::OptimizeReturns(Function &F) { - if (!F.getReturnType()->isPointerTy()) - return; - - SmallPtrSet<Instruction *, 4> DependingInstructions; - SmallPtrSet<const BasicBlock *, 4> Visited; - for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI) { - BasicBlock *BB = FI; - ReturnInst *Ret = dyn_cast<ReturnInst>(&BB->back()); - - DEBUG(dbgs() << "ObjCARCOpt::OptimizeReturns: Visiting: " << *Ret << "\n"); - - if (!Ret) continue; - - const Value *Arg = StripPointerCastsAndObjCCalls(Ret->getOperand(0)); - FindDependencies(NeedsPositiveRetainCount, Arg, - BB, Ret, DependingInstructions, Visited, PA); - if (DependingInstructions.size() != 1) - goto next_block; - - { - CallInst *Autorelease = - dyn_cast_or_null<CallInst>(*DependingInstructions.begin()); - if (!Autorelease) - goto next_block; - InstructionClass AutoreleaseClass = GetBasicInstructionClass(Autorelease); - if (!IsAutorelease(AutoreleaseClass)) - goto next_block; - if (GetObjCArg(Autorelease) != Arg) - goto next_block; - - DependingInstructions.clear(); - Visited.clear(); - - // Check that there is nothing that can affect the reference - // count between the autorelease and the retain. - FindDependencies(CanChangeRetainCount, Arg, - BB, Autorelease, DependingInstructions, Visited, PA); - if (DependingInstructions.size() != 1) - goto next_block; - - { - CallInst *Retain = - dyn_cast_or_null<CallInst>(*DependingInstructions.begin()); - - // Check that we found a retain with the same argument. - if (!Retain || - !IsRetain(GetBasicInstructionClass(Retain)) || - GetObjCArg(Retain) != Arg) - goto next_block; - - DependingInstructions.clear(); - Visited.clear(); - - // Convert the autorelease to an autoreleaseRV, since it's - // returning the value. - if (AutoreleaseClass == IC_Autorelease) { - Autorelease->setCalledFunction(getAutoreleaseRVCallee(F.getParent())); - AutoreleaseClass = IC_AutoreleaseRV; - } - - // Check that there is nothing that can affect the reference - // count between the retain and the call. - // Note that Retain need not be in BB. - FindDependencies(CanChangeRetainCount, Arg, Retain->getParent(), Retain, - DependingInstructions, Visited, PA); - if (DependingInstructions.size() != 1) - goto next_block; - - { - CallInst *Call = - dyn_cast_or_null<CallInst>(*DependingInstructions.begin()); - - // Check that the pointer is the return value of the call. - if (!Call || Arg != Call) - goto next_block; - - // Check that the call is a regular call. - InstructionClass Class = GetBasicInstructionClass(Call); - if (Class != IC_CallOrUser && Class != IC_Call) - goto next_block; - - // If so, we can zap the retain and autorelease. - Changed = true; - ++NumRets; - DEBUG(dbgs() << "ObjCARCOpt::OptimizeReturns: Erasing: " << *Retain - << "\n Erasing: " - << *Autorelease << "\n"); - EraseInstruction(Retain); - EraseInstruction(Autorelease); - } - } - } - - next_block: - DependingInstructions.clear(); - Visited.clear(); - } - - DEBUG(dbgs() << "ObjCARCOpt::OptimizeReturns: Finished List.\n\n"); - -} - -bool ObjCARCOpt::doInitialization(Module &M) { - if (!EnableARCOpts) - return false; - - // If nothing in the Module uses ARC, don't do anything. - Run = ModuleHasARC(M); - if (!Run) - return false; - - // Identify the imprecise release metadata kind. 
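The OptimizeReturns pattern above is, at heart, a peephole on the return path: a retain+autorelease pair on the value being returned cancels out once the analysis proves nothing can touch the reference count in between. A toy straight-line version of the cancellation (strings stand in for instructions; the real pass uses FindDependencies for all of the safety checks):

#include <cstddef>
#include <cstdio>
#include <string>
#include <vector>

int main() {
  std::vector<std::string> Ops = {"call", "retain", "autorelease", "ret"};
  // Look for a retain immediately followed by an autorelease, right
  // before the return of the same value.
  for (std::size_t i = 0; i + 2 < Ops.size(); ++i)
    if (Ops[i] == "retain" && Ops[i + 1] == "autorelease" &&
        Ops[i + 2] == "ret") {
      Ops.erase(Ops.begin() + i, Ops.begin() + i + 2); // zap the pair
      break;
    }
  for (const std::string &Op : Ops)
    std::printf("%s\n", Op.c_str()); // prints: call, ret
  return 0;
}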
-  ImpreciseReleaseMDKind =
-    M.getContext().getMDKindID("clang.imprecise_release");
-  CopyOnEscapeMDKind =
-    M.getContext().getMDKindID("clang.arc.copy_on_escape");
-  NoObjCARCExceptionsMDKind =
-    M.getContext().getMDKindID("clang.arc.no_objc_arc_exceptions");
-
-  // Intuitively, objc_retain and others are nocapture; however, in practice
-  // they are not, because they return their argument value. And objc_release
-  // calls finalizers which can have arbitrary side effects.
-
-  // These are initialized lazily.
-  RetainRVCallee = 0;
-  AutoreleaseRVCallee = 0;
-  ReleaseCallee = 0;
-  RetainCallee = 0;
-  RetainBlockCallee = 0;
-  AutoreleaseCallee = 0;
-
-  return false;
-}
-
-bool ObjCARCOpt::runOnFunction(Function &F) {
-  if (!EnableARCOpts)
-    return false;
-
-  // If nothing in the Module uses ARC, don't do anything.
-  if (!Run)
-    return false;
-
-  Changed = false;
-
-  PA.setAA(&getAnalysis<AliasAnalysis>());
-
-  // This pass performs several distinct transformations. As a compile-time aid
-  // when compiling code that isn't ObjC, skip these if the relevant ObjC
-  // library functions aren't declared.
-
-  // Preliminary optimizations. This also computes UsedInThisFunction.
-  OptimizeIndividualCalls(F);
-
-  // Optimizations for weak pointers.
-  if (UsedInThisFunction & ((1 << IC_LoadWeak) |
-                            (1 << IC_LoadWeakRetained) |
-                            (1 << IC_StoreWeak) |
-                            (1 << IC_InitWeak) |
-                            (1 << IC_CopyWeak) |
-                            (1 << IC_MoveWeak) |
-                            (1 << IC_DestroyWeak)))
-    OptimizeWeakCalls(F);
-
-  // Optimizations for retain+release pairs.
-  if (UsedInThisFunction & ((1 << IC_Retain) |
-                            (1 << IC_RetainRV) |
-                            (1 << IC_RetainBlock)))
-    if (UsedInThisFunction & (1 << IC_Release))
-      // Run OptimizeSequences until it either stops making changes or
-      // no retain+release pair nesting is detected.
-      while (OptimizeSequences(F)) {}
-
-  // Optimizations if objc_autorelease is used.
-  if (UsedInThisFunction & ((1 << IC_Autorelease) |
-                            (1 << IC_AutoreleaseRV)))
-    OptimizeReturns(F);
-
-  return Changed;
-}
-
-void ObjCARCOpt::releaseMemory() {
-  PA.clear();
-}
-
-//===----------------------------------------------------------------------===//
-// ARC contraction.
-//===----------------------------------------------------------------------===//
-
-// TODO: ObjCARCContract could insert PHI nodes when uses aren't
-// dominated by single calls.
-
-#include "llvm/Analysis/Dominators.h"
-#include "llvm/IR/InlineAsm.h"
-#include "llvm/IR/Operator.h"
-
-STATISTIC(NumStoreStrongs, "Number of objc_storeStrong calls formed");
-
-namespace {
-  /// ObjCARCContract - Late ARC optimizations. These change the IR in a way
-  /// that makes it difficult for ObjCARCOpt to analyze, so it's run late.
-  class ObjCARCContract : public FunctionPass {
-    bool Changed;
-    AliasAnalysis *AA;
-    DominatorTree *DT;
-    ProvenanceAnalysis PA;
-
-    /// Run - A flag indicating whether this optimization pass should run.
-    bool Run;
-
-    /// StoreStrongCallee, etc. - Declarations for ObjC runtime
-    /// functions, for use in creating calls to them. These are initialized
-    /// lazily to avoid cluttering up the Module with unused declarations.
-    Constant *StoreStrongCallee,
-             *RetainAutoreleaseCallee, *RetainAutoreleaseRVCallee;
-
-    /// RetainRVMarker - The inline asm string to insert between calls and
-    /// RetainRV calls to make the optimization work on targets which need it.
-    const MDString *RetainRVMarker;
-
-    /// StoreStrongCalls - The set of inserted objc_storeStrong calls.
If - /// at the end of walking the function we have found no alloca - /// instructions, these calls can be marked "tail". - SmallPtrSet<CallInst *, 8> StoreStrongCalls; - - Constant *getStoreStrongCallee(Module *M); - Constant *getRetainAutoreleaseCallee(Module *M); - Constant *getRetainAutoreleaseRVCallee(Module *M); - - bool ContractAutorelease(Function &F, Instruction *Autorelease, - InstructionClass Class, - SmallPtrSet<Instruction *, 4> - &DependingInstructions, - SmallPtrSet<const BasicBlock *, 4> - &Visited); - - void ContractRelease(Instruction *Release, - inst_iterator &Iter); - - virtual void getAnalysisUsage(AnalysisUsage &AU) const; - virtual bool doInitialization(Module &M); - virtual bool runOnFunction(Function &F); - - public: - static char ID; - ObjCARCContract() : FunctionPass(ID) { - initializeObjCARCContractPass(*PassRegistry::getPassRegistry()); - } - }; -} - -char ObjCARCContract::ID = 0; -INITIALIZE_PASS_BEGIN(ObjCARCContract, - "objc-arc-contract", "ObjC ARC contraction", false, false) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) -INITIALIZE_PASS_DEPENDENCY(DominatorTree) -INITIALIZE_PASS_END(ObjCARCContract, - "objc-arc-contract", "ObjC ARC contraction", false, false) - -Pass *llvm::createObjCARCContractPass() { - return new ObjCARCContract(); -} - -void ObjCARCContract::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<AliasAnalysis>(); - AU.addRequired<DominatorTree>(); - AU.setPreservesCFG(); -} - -Constant *ObjCARCContract::getStoreStrongCallee(Module *M) { - if (!StoreStrongCallee) { - LLVMContext &C = M->getContext(); - Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C)); - Type *I8XX = PointerType::getUnqual(I8X); - Type *Params[] = { I8XX, I8X }; - - AttributeSet Attribute = AttributeSet() - .addAttr(M->getContext(), AttributeSet::FunctionIndex, - Attribute::get(C, Attribute::NoUnwind)) - .addAttr(M->getContext(), 1, Attribute::get(C, Attribute::NoCapture)); - - StoreStrongCallee = - M->getOrInsertFunction( - "objc_storeStrong", - FunctionType::get(Type::getVoidTy(C), Params, /*isVarArg=*/false), - Attribute); - } - return StoreStrongCallee; -} - -Constant *ObjCARCContract::getRetainAutoreleaseCallee(Module *M) { - if (!RetainAutoreleaseCallee) { - LLVMContext &C = M->getContext(); - Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C)); - Type *Params[] = { I8X }; - FunctionType *FTy = FunctionType::get(I8X, Params, /*isVarArg=*/false); - AttributeSet Attribute = - AttributeSet().addAttr(M->getContext(), AttributeSet::FunctionIndex, - Attribute::get(C, Attribute::NoUnwind)); - RetainAutoreleaseCallee = - M->getOrInsertFunction("objc_retainAutorelease", FTy, Attribute); - } - return RetainAutoreleaseCallee; -} - -Constant *ObjCARCContract::getRetainAutoreleaseRVCallee(Module *M) { - if (!RetainAutoreleaseRVCallee) { - LLVMContext &C = M->getContext(); - Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C)); - Type *Params[] = { I8X }; - FunctionType *FTy = FunctionType::get(I8X, Params, /*isVarArg=*/false); - AttributeSet Attribute = - AttributeSet().addAttr(M->getContext(), AttributeSet::FunctionIndex, - Attribute::get(C, Attribute::NoUnwind)); - RetainAutoreleaseRVCallee = - M->getOrInsertFunction("objc_retainAutoreleaseReturnValue", FTy, - Attribute); - } - return RetainAutoreleaseRVCallee; -} - -/// ContractAutorelease - Merge an autorelease with a retain into a fused call. 
-bool -ObjCARCContract::ContractAutorelease(Function &F, Instruction *Autorelease, - InstructionClass Class, - SmallPtrSet<Instruction *, 4> - &DependingInstructions, - SmallPtrSet<const BasicBlock *, 4> - &Visited) { - const Value *Arg = GetObjCArg(Autorelease); - - // Check that there are no instructions between the retain and the autorelease - // (such as an autorelease_pop) which may change the count. - CallInst *Retain = 0; - if (Class == IC_AutoreleaseRV) - FindDependencies(RetainAutoreleaseRVDep, Arg, - Autorelease->getParent(), Autorelease, - DependingInstructions, Visited, PA); - else - FindDependencies(RetainAutoreleaseDep, Arg, - Autorelease->getParent(), Autorelease, - DependingInstructions, Visited, PA); - - Visited.clear(); - if (DependingInstructions.size() != 1) { - DependingInstructions.clear(); - return false; - } - - Retain = dyn_cast_or_null<CallInst>(*DependingInstructions.begin()); - DependingInstructions.clear(); - - if (!Retain || - GetBasicInstructionClass(Retain) != IC_Retain || - GetObjCArg(Retain) != Arg) - return false; - - Changed = true; - ++NumPeeps; - - DEBUG(dbgs() << "ObjCARCContract::ContractAutorelease: Fusing " - "retain/autorelease. Erasing: " << *Autorelease << "\n" - " Old Retain: " - << *Retain << "\n"); - - if (Class == IC_AutoreleaseRV) - Retain->setCalledFunction(getRetainAutoreleaseRVCallee(F.getParent())); - else - Retain->setCalledFunction(getRetainAutoreleaseCallee(F.getParent())); - - DEBUG(dbgs() << " New Retain: " - << *Retain << "\n"); - - EraseInstruction(Autorelease); - return true; -} - -/// ContractRelease - Attempt to merge an objc_release with a store, load, and -/// objc_retain to form an objc_storeStrong. This can be a little tricky because -/// the instructions don't always appear in order, and there may be unrelated -/// intervening instructions. -void ObjCARCContract::ContractRelease(Instruction *Release, - inst_iterator &Iter) { - LoadInst *Load = dyn_cast<LoadInst>(GetObjCArg(Release)); - if (!Load || !Load->isSimple()) return; - - // For now, require everything to be in one basic block. - BasicBlock *BB = Release->getParent(); - if (Load->getParent() != BB) return; - - // Walk down to find the store and the release, which may be in either order. - BasicBlock::iterator I = Load, End = BB->end(); - ++I; - AliasAnalysis::Location Loc = AA->getLocation(Load); - StoreInst *Store = 0; - bool SawRelease = false; - for (; !Store || !SawRelease; ++I) { - if (I == End) - return; - - Instruction *Inst = I; - if (Inst == Release) { - SawRelease = true; - continue; - } - - InstructionClass Class = GetBasicInstructionClass(Inst); - - // Unrelated retains are harmless. - if (IsRetain(Class)) - continue; - - if (Store) { - // The store is the point where we're going to put the objc_storeStrong, - // so make sure there are no uses after it. - if (CanUse(Inst, Load, PA, Class)) - return; - } else if (AA->getModRefInfo(Inst, Loc) & AliasAnalysis::Mod) { - // We are moving the load down to the store, so check for anything - // else which writes to the memory between the load and the store. - Store = dyn_cast<StoreInst>(Inst); - if (!Store || !Store->isSimple()) return; - if (Store->getPointerOperand() != Loc.Ptr) return; - } - } - - Value *New = StripPointerCastsAndObjCCalls(Store->getValueOperand()); - - // Walk up to find the retain. 
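The four instructions ContractRelease is fusing here (load the old value, retain the new one, store it, release the old one) are exactly the semantics of the runtime's objc_storeStrong; the walk the comment above announces resumes just below. A C++ model of that equivalence, with hypothetical retain/release stand-ins in place of the real runtime calls:

#include <cstdio>

struct Obj { int RefCount; };

static void retain(Obj *O)  { if (O) ++O->RefCount; }
static void release(Obj *O) { if (O) --O->RefCount; }

// Equivalent of void objc_storeStrong(id *Loc, id New).
static void storeStrong(Obj **Loc, Obj *New) {
  Obj *Old = *Loc;  // the load
  retain(New);      // the objc_retain of the new value
  *Loc = New;       // the store
  release(Old);     // the objc_release of the old value
}

int main() {
  Obj A = {1}, B = {1};
  Obj *Slot = &A;          // Slot owns one reference to A
  storeStrong(&Slot, &B);  // Slot now owns B instead
  std::printf("A.RefCount=%d B.RefCount=%d\n",
              A.RefCount, B.RefCount);  // prints 0 and 2
  return 0;
}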
- I = Store; - BasicBlock::iterator Begin = BB->begin(); - while (I != Begin && GetBasicInstructionClass(I) != IC_Retain) - --I; - Instruction *Retain = I; - if (GetBasicInstructionClass(Retain) != IC_Retain) return; - if (GetObjCArg(Retain) != New) return; - - Changed = true; - ++NumStoreStrongs; - - LLVMContext &C = Release->getContext(); - Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C)); - Type *I8XX = PointerType::getUnqual(I8X); - - Value *Args[] = { Load->getPointerOperand(), New }; - if (Args[0]->getType() != I8XX) - Args[0] = new BitCastInst(Args[0], I8XX, "", Store); - if (Args[1]->getType() != I8X) - Args[1] = new BitCastInst(Args[1], I8X, "", Store); - CallInst *StoreStrong = - CallInst::Create(getStoreStrongCallee(BB->getParent()->getParent()), - Args, "", Store); - StoreStrong->setDoesNotThrow(); - StoreStrong->setDebugLoc(Store->getDebugLoc()); - - // We can't set the tail flag yet, because we haven't yet determined - // whether there are any escaping allocas. Remember this call, so that - // we can set the tail flag once we know it's safe. - StoreStrongCalls.insert(StoreStrong); - - if (&*Iter == Store) ++Iter; - Store->eraseFromParent(); - Release->eraseFromParent(); - EraseInstruction(Retain); - if (Load->use_empty()) - Load->eraseFromParent(); -} - -bool ObjCARCContract::doInitialization(Module &M) { - // If nothing in the Module uses ARC, don't do anything. - Run = ModuleHasARC(M); - if (!Run) - return false; - - // These are initialized lazily. - StoreStrongCallee = 0; - RetainAutoreleaseCallee = 0; - RetainAutoreleaseRVCallee = 0; - - // Initialize RetainRVMarker. - RetainRVMarker = 0; - if (NamedMDNode *NMD = - M.getNamedMetadata("clang.arc.retainAutoreleasedReturnValueMarker")) - if (NMD->getNumOperands() == 1) { - const MDNode *N = NMD->getOperand(0); - if (N->getNumOperands() == 1) - if (const MDString *S = dyn_cast<MDString>(N->getOperand(0))) - RetainRVMarker = S; - } - - return false; -} - -bool ObjCARCContract::runOnFunction(Function &F) { - if (!EnableARCOpts) - return false; - - // If nothing in the Module uses ARC, don't do anything. - if (!Run) - return false; - - Changed = false; - AA = &getAnalysis<AliasAnalysis>(); - DT = &getAnalysis<DominatorTree>(); - - PA.setAA(&getAnalysis<AliasAnalysis>()); - - // Track whether it's ok to mark objc_storeStrong calls with the "tail" - // keyword. Be conservative if the function has variadic arguments. - // It seems that functions which "return twice" are also unsafe for the - // "tail" argument, because they are setjmp, which could need to - // return to an earlier stack state. - bool TailOkForStoreStrongs = !F.isVarArg() && - !F.callsFunctionThatReturnsTwice(); - - // For ObjC library calls which return their argument, replace uses of the - // argument with uses of the call return value, if it dominates the use. This - // reduces register pressure. - SmallPtrSet<Instruction *, 4> DependingInstructions; - SmallPtrSet<const BasicBlock *, 4> Visited; - for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) { - Instruction *Inst = &*I++; - - DEBUG(dbgs() << "ObjCARCContract: Visiting: " << *Inst << "\n"); - - // Only these library routines return their argument. In particular, - // objc_retainBlock does not necessarily return its argument. 
- InstructionClass Class = GetBasicInstructionClass(Inst); - switch (Class) { - case IC_Retain: - case IC_FusedRetainAutorelease: - case IC_FusedRetainAutoreleaseRV: - break; - case IC_Autorelease: - case IC_AutoreleaseRV: - if (ContractAutorelease(F, Inst, Class, DependingInstructions, Visited)) - continue; - break; - case IC_RetainRV: { - // If we're compiling for a target which needs a special inline-asm - // marker to do the retainAutoreleasedReturnValue optimization, - // insert it now. - if (!RetainRVMarker) - break; - BasicBlock::iterator BBI = Inst; - BasicBlock *InstParent = Inst->getParent(); - - // Step up to see if the call immediately precedes the RetainRV call. - // If it's an invoke, we have to cross a block boundary. And we have - // to carefully dodge no-op instructions. - do { - if (&*BBI == InstParent->begin()) { - BasicBlock *Pred = InstParent->getSinglePredecessor(); - if (!Pred) - goto decline_rv_optimization; - BBI = Pred->getTerminator(); - break; - } - --BBI; - } while (isNoopInstruction(BBI)); - - if (&*BBI == GetObjCArg(Inst)) { - DEBUG(dbgs() << "ObjCARCContract: Adding inline asm marker for " - "retainAutoreleasedReturnValue optimization.\n"); - Changed = true; - InlineAsm *IA = - InlineAsm::get(FunctionType::get(Type::getVoidTy(Inst->getContext()), - /*isVarArg=*/false), - RetainRVMarker->getString(), - /*Constraints=*/"", /*hasSideEffects=*/true); - CallInst::Create(IA, "", Inst); - } - decline_rv_optimization: - break; - } - case IC_InitWeak: { - // objc_initWeak(p, null) => *p = null - CallInst *CI = cast<CallInst>(Inst); - if (isNullOrUndef(CI->getArgOperand(1))) { - Value *Null = - ConstantPointerNull::get(cast<PointerType>(CI->getType())); - Changed = true; - new StoreInst(Null, CI->getArgOperand(0), CI); - - DEBUG(dbgs() << "OBJCARCContract: Old = " << *CI << "\n" - << " New = " << *Null << "\n"); - - CI->replaceAllUsesWith(Null); - CI->eraseFromParent(); - } - continue; - } - case IC_Release: - ContractRelease(Inst, I); - continue; - case IC_User: - // Be conservative if the function has any alloca instructions. - // Technically we only care about escaping alloca instructions, - // but this is sufficient to handle some interesting cases. - if (isa<AllocaInst>(Inst)) - TailOkForStoreStrongs = false; - continue; - default: - continue; - } - - DEBUG(dbgs() << "ObjCARCContract: Finished List.\n\n"); - - // Don't use GetObjCArg because we don't want to look through bitcasts - // and such; to do the replacement, the argument must have type i8*. - const Value *Arg = cast<CallInst>(Inst)->getArgOperand(0); - for (;;) { - // If we're compiling bugpointed code, don't get in trouble. - if (!isa<Instruction>(Arg) && !isa<Argument>(Arg)) - break; - // Look through the uses of the pointer. - for (Value::const_use_iterator UI = Arg->use_begin(), UE = Arg->use_end(); - UI != UE; ) { - Use &U = UI.getUse(); - unsigned OperandNo = UI.getOperandNo(); - ++UI; // Increment UI now, because we may unlink its element. - - // If the call's return value dominates a use of the call's argument - // value, rewrite the use to use the return value. We check for - // reachability here because an unreachable call is considered to - // trivially dominate itself, which would lead us to rewriting its - // argument in terms of its return value, which would lead to - // infinite loops in GetObjCArg. 
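The rewrite loop whose rationale the comment above gives is easier to see in miniature: because these routines return their argument, any use of the argument that the call dominates can use the call's result instead, shortening the argument's live range; the dominance-checked rewrite itself follows below. In straight-line code "dominates" degenerates to "comes earlier", which is all this toy needs (integer tags stand in for SSA values):

#include <cstddef>
#include <cstdio>
#include <vector>

int main() {
  // Each entry names the value an instruction uses: 0 = the argument,
  // 1 = the result of the call at position 2 (which returns its argument).
  std::vector<int> UsedValue = {0, 0, /*the call itself*/ 1, 0, 0};
  const std::size_t CallPos = 2;
  for (std::size_t i = CallPos + 1; i < UsedValue.size(); ++i)
    if (UsedValue[i] == 0)
      UsedValue[i] = 1; // dominated use: forward the call's return value
  for (std::size_t i = 0; i < UsedValue.size(); ++i)
    std::printf("inst %zu uses value %d\n", i, UsedValue[i]);
  return 0;
}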
- if (DT->isReachableFromEntry(U) && DT->dominates(Inst, U)) { - Changed = true; - Instruction *Replacement = Inst; - Type *UseTy = U.get()->getType(); - if (PHINode *PHI = dyn_cast<PHINode>(U.getUser())) { - // For PHI nodes, insert the bitcast in the predecessor block. - unsigned ValNo = PHINode::getIncomingValueNumForOperand(OperandNo); - BasicBlock *BB = PHI->getIncomingBlock(ValNo); - if (Replacement->getType() != UseTy) - Replacement = new BitCastInst(Replacement, UseTy, "", - &BB->back()); - // While we're here, rewrite all edges for this PHI, rather - // than just one use at a time, to minimize the number of - // bitcasts we emit. - for (unsigned i = 0, e = PHI->getNumIncomingValues(); i != e; ++i) - if (PHI->getIncomingBlock(i) == BB) { - // Keep the UI iterator valid. - if (&PHI->getOperandUse( - PHINode::getOperandNumForIncomingValue(i)) == - &UI.getUse()) - ++UI; - PHI->setIncomingValue(i, Replacement); - } - } else { - if (Replacement->getType() != UseTy) - Replacement = new BitCastInst(Replacement, UseTy, "", - cast<Instruction>(U.getUser())); - U.set(Replacement); - } - } - } - - // If Arg is a no-op casted pointer, strip one level of casts and iterate. - if (const BitCastInst *BI = dyn_cast<BitCastInst>(Arg)) - Arg = BI->getOperand(0); - else if (isa<GEPOperator>(Arg) && - cast<GEPOperator>(Arg)->hasAllZeroIndices()) - Arg = cast<GEPOperator>(Arg)->getPointerOperand(); - else if (isa<GlobalAlias>(Arg) && - !cast<GlobalAlias>(Arg)->mayBeOverridden()) - Arg = cast<GlobalAlias>(Arg)->getAliasee(); - else - break; - } - } - - // If this function has no escaping allocas or suspicious vararg usage, - // objc_storeStrong calls can be marked with the "tail" keyword. - if (TailOkForStoreStrongs) - for (SmallPtrSet<CallInst *, 8>::iterator I = StoreStrongCalls.begin(), - E = StoreStrongCalls.end(); I != E; ++I) - (*I)->setTailCall(); - StoreStrongCalls.clear(); - - return Changed; -} diff --git a/lib/Transforms/Scalar/SCCP.cpp b/lib/Transforms/Scalar/SCCP.cpp index 3e935d8..e30a274 100644 --- a/lib/Transforms/Scalar/SCCP.cpp +++ b/lib/Transforms/Scalar/SCCP.cpp @@ -271,13 +271,6 @@ public: return I->second; } - /*LatticeVal getStructLatticeValueFor(Value *V, unsigned i) const { - DenseMap<std::pair<Value*, unsigned>, LatticeVal>::const_iterator I = - StructValueState.find(std::make_pair(V, i)); - assert(I != StructValueState.end() && "V is not in valuemap!"); - return I->second; - }*/ - /// getTrackedRetVals - Get the inferred return value map. /// const DenseMap<Function*, LatticeVal> &getTrackedRetVals() { @@ -710,9 +703,6 @@ void SCCPSolver::visitPHINode(PHINode &PN) { markConstant(&PN, OperandVal); // Acquire operand value } - - - void SCCPSolver::visitReturnInst(ReturnInst &I) { if (I.getNumOperands() == 0) return; // ret void @@ -1185,7 +1175,7 @@ void SCCPSolver::Solve() { DEBUG(dbgs() << "\nPopped off OI-WL: " << *I << '\n'); // "I" got into the work list because it either made the transition from - // bottom to constant + // bottom to constant, or to overdefined. 
// // Anything on this worklist that is overdefined need not be visited // since all of its users will have already been marked as overdefined diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp index 4204171..e90fe90 100644 --- a/lib/Transforms/Scalar/SROA.cpp +++ b/lib/Transforms/Scalar/SROA.cpp @@ -43,14 +43,12 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/LLVMContext.h" -#include "llvm/IR/Module.h" #include "llvm/IR/Operator.h" #include "llvm/InstVisitor.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/GetElementPtrTypeIterator.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/Local.h" @@ -411,9 +409,9 @@ static Value *foldSelectInst(SelectInst &SI) { // early on. if (ConstantInt *CI = dyn_cast<ConstantInt>(SI.getCondition())) return SI.getOperand(1+CI->isZero()); - if (SI.getOperand(1) == SI.getOperand(2)) { + if (SI.getOperand(1) == SI.getOperand(2)) return SI.getOperand(1); - } + return 0; } @@ -621,7 +619,7 @@ private: } // Disable SRoA for any intrinsics except for lifetime invariants. - // FIXME: What about debug instrinsics? This matches old behavior, but + // FIXME: What about debug intrinsics? This matches old behavior, but // doesn't make sense. void visitIntrinsicInst(IntrinsicInst &II) { if (!IsOffsetKnown) @@ -1141,8 +1139,7 @@ void AllocaPartitioning::print(raw_ostream &OS, const_iterator I, void AllocaPartitioning::printUsers(raw_ostream &OS, const_iterator I, StringRef Indent) const { - for (const_use_iterator UI = use_begin(I), UE = use_end(I); - UI != UE; ++UI) { + for (const_use_iterator UI = use_begin(I), UE = use_end(I); UI != UE; ++UI) { if (!UI->U) continue; // Skip dead uses. OS << Indent << " [" << UI->BeginOffset << "," << UI->EndOffset << ") " @@ -1170,8 +1167,7 @@ void AllocaPartitioning::print(raw_ostream &OS) const { } OS << "Partitioning of alloca: " << AI << "\n"; - unsigned Num = 0; - for (const_iterator I = begin(), E = end(); I != E; ++I, ++Num) { + for (const_iterator I = begin(), E = end(); I != E; ++I) { print(OS, I); printUsers(OS, I); } @@ -1242,7 +1238,7 @@ public: for (SmallVector<DbgValueInst *, 4>::const_iterator I = DVIs.begin(), E = DVIs.end(); I != E; ++I) { DbgValueInst *DVI = *I; - Value *Arg = NULL; + Value *Arg = 0; if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { // If an argument is zero extended then use argument directly. The ZExt // may be zapped by an optimization pass in future. @@ -1277,7 +1273,7 @@ namespace { /// 1) It takes allocations of aggregates and analyzes the ways in which they /// are used to try to split them into smaller allocations, ideally of /// a single scalar data type. It will split up memcpy and memset accesses -/// as necessary and try to isolate invidual scalar accesses. +/// as necessary and try to isolate individual scalar accesses. /// 2) It will transform accesses into forms which are suitable for SSA value /// promotion. This can be replacing a memset with a scalar store of an /// integer value, or it can involve speculating operations on a PHI or @@ -1439,8 +1435,7 @@ private: // We can only transform this if it is safe to push the loads into the // predecessor blocks. The only thing to watch out for is that we can't put // a possibly trapping load in the predecessor if it is a critical edge. 
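The speculation described in the SROA hunk above is easiest to see at the source level: a load of a conditionally chosen pointer becomes a conditional choice between two loads, after which each underlying alloca can be promoted on its own. A small sketch of the before/after shapes, assuming (as the pass must prove) that both pointers are safe to load:

#include <cstdio>

int loadOfSelect(bool C, const int *P, const int *Q) {
  const int *Sel = C ? P : Q;
  return *Sel;              // before: one load of a selected pointer
}

int selectOfLoads(bool C, const int *P, const int *Q) {
  return C ? *P : *Q;       // after: the load is speculated into each arm
}

int main() {
  int A = 1, B = 2;
  std::printf("%d %d\n", loadOfSelect(true, &A, &B),
              selectOfLoads(true, &A, &B)); // both print 1
  return 0;
}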
- for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; - ++Idx) { + for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) { TerminatorInst *TI = PN.getIncomingBlock(Idx)->getTerminator(); Value *InVal = PN.getIncomingValue(Idx); @@ -1483,7 +1478,7 @@ private: PN.getName() + ".sroa.speculated"); // Get the TBAA tag and alignment to use from one of the loads. It doesn't - // matter which one we get and if any differ, it doesn't matter. + // matter which one we get and if any differ. LoadInst *SomeLoad = cast<LoadInst>(Loads.back()); MDNode *TBAATag = SomeLoad->getMetadata(LLVMContext::MD_tbaa); unsigned Align = SomeLoad->getAlignment(); @@ -1816,7 +1811,7 @@ static Value *getNaturalGEPWithOffset(IRBuilder<> &IRB, const DataLayout &TD, /// The strategy for finding the more natural GEPs is to peel off layers of the /// pointer, walking back through bit casts and GEPs, searching for a base /// pointer from which we can compute a natural GEP with the desired -/// properities. The algorithm tries to fold as many constant indices into +/// properties. The algorithm tries to fold as many constant indices into /// a single GEP as possible, thus making each GEP more independent of the /// surrounding code. static Value *getAdjustedPtr(IRBuilder<> &IRB, const DataLayout &TD, @@ -2062,9 +2057,9 @@ static bool isIntegerWideningViable(const DataLayout &TD, uint64_t Size = TD.getTypeStoreSize(AllocaTy); - // Check the uses to ensure the uses are (likely) promoteable integer uses. + // Check the uses to ensure the uses are (likely) promotable integer uses. // Also ensure that the alloca has a covering load or store. We don't want - // to widen the integer operotains only to fail to promote due to some other + // to widen the integer operations only to fail to promote due to some other // unsplittable entry (which we may make splittable later). bool WholeAllocaOp = false; for (; I != E; ++I) { @@ -2283,7 +2278,7 @@ class AllocaPartitionRewriter : public InstVisitor<AllocaPartitionRewriter, // If we are rewriting an alloca partition which can be written as pure // vector operations, we stash extra information here. When VecTy is - // non-null, we have some strict guarantees about the rewriten alloca: + // non-null, we have some strict guarantees about the rewritten alloca: // - The new alloca is exactly the size of the vector type here. // - The accesses all either map to the entire vector or to a single // element. @@ -2636,7 +2631,7 @@ private: /// /// Note that this routine assumes an i8 is a byte. If that isn't true, don't /// call this routine. - /// FIXME: Heed the abvice above. + /// FIXME: Heed the advice above. /// /// \param V The i8 value to splat. /// \param Size The number of bytes in the output (assuming i8 is one byte) @@ -2971,6 +2966,7 @@ private: else New = IRB.CreateLifetimeEnd(Ptr, Size); + (void)New; DEBUG(dbgs() << " to: " << *New << "\n"); return true; } @@ -3147,9 +3143,8 @@ private: void emitFunc(Type *Ty, Value *&Agg, const Twine &Name) { assert(Ty->isSingleValueType()); // Load the single value and insert it using the indices. 
- Value *Load = IRB.CreateLoad(IRB.CreateInBoundsGEP(Ptr, GEPIndices, - Name + ".gep"), - Name + ".load"); + Value *GEP = IRB.CreateInBoundsGEP(Ptr, GEPIndices, Name + ".gep"); + Value *Load = IRB.CreateLoad(GEP, Name + ".load"); Agg = IRB.CreateInsertValue(Agg, Load, Indices, Name + ".insert"); DEBUG(dbgs() << " to: " << *Load << "\n"); } @@ -3422,7 +3417,7 @@ bool SROA::rewriteAllocaPartition(AllocaInst &AI, // Check for the case where we're going to rewrite to a new alloca of the // exact same type as the original, and with the same access offsets. In that // case, re-use the existing alloca, but still run through the rewriter to - // performe phi and select speculation. + // perform phi and select speculation. AllocaInst *NewAI; if (AllocaTy == AI.getAllocatedType()) { assert(PI->BeginOffset == 0 && @@ -3589,7 +3584,7 @@ void SROA::deleteDeadInstructions(SmallPtrSet<AllocaInst*, 4> &DeletedAllocas) { /// If there is a domtree available, we attempt to promote using the full power /// of mem2reg. Otherwise, we build and use the AllocaPromoter above which is /// based on the SSAUpdater utilities. This function returns whether any -/// promotion occured. +/// promotion occurred. bool SROA::promoteAllocas(Function &F) { if (PromotableAllocas.empty()) return false; diff --git a/lib/Transforms/Scalar/Scalar.cpp b/lib/Transforms/Scalar/Scalar.cpp index 35d2fa0..8a9c7da 100644 --- a/lib/Transforms/Scalar/Scalar.cpp +++ b/lib/Transforms/Scalar/Scalar.cpp @@ -50,11 +50,6 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeLowerAtomicPass(Registry); initializeLowerExpectIntrinsicPass(Registry); initializeMemCpyOptPass(Registry); - initializeObjCARCAliasAnalysisPass(Registry); - initializeObjCARCAPElimPass(Registry); - initializeObjCARCExpandPass(Registry); - initializeObjCARCContractPass(Registry); - initializeObjCARCOptPass(Registry); initializeReassociatePass(Registry); initializeRegToMemPass(Registry); initializeSCCPPass(Registry); diff --git a/lib/Transforms/Scalar/SimplifyLibCalls.cpp b/lib/Transforms/Scalar/SimplifyLibCalls.cpp index d5cefa3..916b37d 100644 --- a/lib/Transforms/Scalar/SimplifyLibCalls.cpp +++ b/lib/Transforms/Scalar/SimplifyLibCalls.cpp @@ -165,7 +165,7 @@ bool SimplifyLibCalls::runOnFunction(Function &F) { for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ) { // Ignore non-calls. CallInst *CI = dyn_cast<CallInst>(I++); - if (!CI) continue; + if (!CI || CI->hasFnAttr(Attribute::NoBuiltin)) continue; // Ignore indirect calls and calls to non-external functions. 
Function *Callee = CI->getCalledFunction(); diff --git a/lib/Transforms/Scalar/TailRecursionElimination.cpp b/lib/Transforms/Scalar/TailRecursionElimination.cpp index 6572e09..2002e68 100644 --- a/lib/Transforms/Scalar/TailRecursionElimination.cpp +++ b/lib/Transforms/Scalar/TailRecursionElimination.cpp @@ -58,6 +58,7 @@ #include "llvm/Analysis/InlineCost.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/Loads.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" @@ -79,11 +80,15 @@ STATISTIC(NumAccumAdded, "Number of accumulators introduced"); namespace { struct TailCallElim : public FunctionPass { + const TargetTransformInfo *TTI; + static char ID; // Pass identification, replacement for typeid TailCallElim() : FunctionPass(ID) { initializeTailCallElimPass(*PassRegistry::getPassRegistry()); } + virtual void getAnalysisUsage(AnalysisUsage &AU) const; + virtual bool runOnFunction(Function &F); private: @@ -109,14 +114,21 @@ namespace { } char TailCallElim::ID = 0; -INITIALIZE_PASS(TailCallElim, "tailcallelim", - "Tail Call Elimination", false, false) +INITIALIZE_PASS_BEGIN(TailCallElim, "tailcallelim", + "Tail Call Elimination", false, false) +INITIALIZE_AG_DEPENDENCY(TargetTransformInfo) +INITIALIZE_PASS_END(TailCallElim, "tailcallelim", + "Tail Call Elimination", false, false) // Public interface to the TailCallElimination pass FunctionPass *llvm::createTailCallEliminationPass() { return new TailCallElim(); } +void TailCallElim::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<TargetTransformInfo>(); +} + /// AllocaMightEscapeToCalls - Return true if this alloca may be accessed by /// callees of this function. We only do very simple analysis right now, this /// could be expanded in the future to use mod/ref information for particular @@ -151,6 +163,7 @@ bool TailCallElim::runOnFunction(Function &F) { // right, so don't even try to convert it... if (F.getFunctionType()->isVarArg()) return false; + TTI = &getAnalysis<TargetTransformInfo>(); BasicBlock *OldEntry = 0; bool TailCallsAreMarkedTail = false; SmallVector<PHINode*, 8> ArgumentPHIs; @@ -391,7 +404,8 @@ TailCallElim::FindTRECandidate(Instruction *TI, if (BB == &F->getEntryBlock() && FirstNonDbg(BB->front()) == CI && FirstNonDbg(llvm::next(BB->begin())) == TI && - callIsSmall(CI)) { + CI->getCalledFunction() && + !TTI->isLoweredToCall(CI->getCalledFunction())) { // A single-block function with just a call and a return. Check that // the arguments match. CallSite::arg_iterator I = CallSite(CI).arg_begin(), diff --git a/lib/Transforms/Utils/BasicBlockUtils.cpp b/lib/Transforms/Utils/BasicBlockUtils.cpp index 8330e84..ba99d2e 100644 --- a/lib/Transforms/Utils/BasicBlockUtils.cpp +++ b/lib/Transforms/Utils/BasicBlockUtils.cpp @@ -37,12 +37,12 @@ void llvm::DeleteDeadBlock(BasicBlock *BB) { // Can delete self loop. BB->getSinglePredecessor() == BB) && "Block is not dead!"); TerminatorInst *BBTerm = BB->getTerminator(); - + // Loop through all of our successors and make sure they know that one // of their predecessors is going away. for (unsigned i = 0, e = BBTerm->getNumSuccessors(); i != e; ++i) BBTerm->getSuccessor(i)->removePredecessor(BB); - + // Zap all the instructions in the block. while (!BB->empty()) { Instruction &I = BB->back(); @@ -55,7 +55,7 @@ void llvm::DeleteDeadBlock(BasicBlock *BB) { I.replaceAllUsesWith(UndefValue::get(I.getType())); BB->getInstList().pop_back(); } - + // Zap the block! 
BB->eraseFromParent(); } @@ -66,25 +66,25 @@ void llvm::DeleteDeadBlock(BasicBlock *BB) { /// when the block has exactly one predecessor. void llvm::FoldSingleEntryPHINodes(BasicBlock *BB, Pass *P) { if (!isa<PHINode>(BB->begin())) return; - + AliasAnalysis *AA = 0; MemoryDependenceAnalysis *MemDep = 0; if (P) { AA = P->getAnalysisIfAvailable<AliasAnalysis>(); MemDep = P->getAnalysisIfAvailable<MemoryDependenceAnalysis>(); } - + while (PHINode *PN = dyn_cast<PHINode>(BB->begin())) { if (PN->getIncomingValue(0) != PN) PN->replaceAllUsesWith(PN->getIncomingValue(0)); else PN->replaceAllUsesWith(UndefValue::get(PN->getType())); - + if (MemDep) MemDep->removeInstruction(PN); // Memdep updates AA itself. else if (AA && isa<PointerType>(PN->getType())) AA->deleteValue(PN); - + PN->eraseFromParent(); } } @@ -115,7 +115,7 @@ bool llvm::DeleteDeadPHIs(BasicBlock *BB, const TargetLibraryInfo *TLI) { bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, Pass *P) { // Don't merge away blocks who have their address taken. if (BB->hasAddressTaken()) return false; - + // Can't merge if there are multiple predecessors, or no predecessors. BasicBlock *PredBB = BB->getUniquePredecessor(); if (!PredBB) return false; @@ -124,7 +124,7 @@ bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, Pass *P) { if (PredBB == BB) return false; // Don't break invokes. if (isa<InvokeInst>(PredBB->getTerminator())) return false; - + succ_iterator SI(succ_begin(PredBB)), SE(succ_end(PredBB)); BasicBlock *OnlySucc = BB; for (; SI != SE; ++SI) @@ -132,7 +132,7 @@ bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, Pass *P) { OnlySucc = 0; // There are multiple distinct successors! break; } - + // Can't merge if there are multiple successors. if (!OnlySucc) return false; @@ -149,21 +149,21 @@ bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, Pass *P) { // Begin by getting rid of unneeded PHIs. if (isa<PHINode>(BB->front())) FoldSingleEntryPHINodes(BB, P); - + // Delete the unconditional branch from the predecessor... PredBB->getInstList().pop_back(); - + // Make all PHI nodes that referred to BB now refer to Pred as their // source... BB->replaceAllUsesWith(PredBB); - + // Move all definitions in the successor to the predecessor... PredBB->getInstList().splice(PredBB->end(), BB->getInstList()); - + // Inherit predecessors name if it exists. if (!PredBB->hasName()) PredBB->takeName(BB); - + // Finally, erase the old block and update dominator info. if (P) { if (DominatorTree *DT = P->getAnalysisIfAvailable<DominatorTree>()) { @@ -176,16 +176,16 @@ bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, Pass *P) { DT->eraseNode(BB); } - + if (LoopInfo *LI = P->getAnalysisIfAvailable<LoopInfo>()) LI->removeBlock(BB); - + if (MemoryDependenceAnalysis *MD = P->getAnalysisIfAvailable<MemoryDependenceAnalysis>()) MD->invalidateCachedPredecessors(); } } - + BB->eraseFromParent(); return true; } @@ -251,11 +251,11 @@ unsigned llvm::GetSuccessorNumber(BasicBlock *BB, BasicBlock *Succ) { } } -/// SplitEdge - Split the edge connecting specified block. Pass P must -/// not be NULL. +/// SplitEdge - Split the edge connecting specified block. Pass P must +/// not be NULL. BasicBlock *llvm::SplitEdge(BasicBlock *BB, BasicBlock *Succ, Pass *P) { unsigned SuccNum = GetSuccessorNumber(BB, Succ); - + // If this is a critical edge, let SplitCriticalEdge do it. 
TerminatorInst *LatchTerm = BB->getTerminator(); if (SplitCriticalEdge(LatchTerm, SuccNum, P)) @@ -271,11 +271,11 @@ BasicBlock *llvm::SplitEdge(BasicBlock *BB, BasicBlock *Succ, Pass *P) { SP = NULL; return SplitBlock(Succ, Succ->begin(), P); } - + // Otherwise, if BB has a single successor, split it at the bottom of the // block. assert(BB->getTerminator()->getNumSuccessors() == 1 && - "Should have a single succ!"); + "Should have a single succ!"); return SplitBlock(BB, BB->getTerminator(), P); } @@ -301,12 +301,12 @@ BasicBlock *llvm::SplitBlock(BasicBlock *Old, Instruction *SplitPt, Pass *P) { if (DomTreeNode *OldNode = DT->getNode(Old)) { std::vector<DomTreeNode *> Children; for (DomTreeNode::iterator I = OldNode->begin(), E = OldNode->end(); - I != E; ++I) + I != E; ++I) Children.push_back(*I); DomTreeNode *NewNode = DT->addNewBlock(New,Old); for (std::vector<DomTreeNode *>::iterator I = Children.begin(), - E = Children.end(); I != E; ++I) + E = Children.end(); I != E; ++I) DT->changeImmediateDominator(*I, NewNode); } } @@ -424,7 +424,7 @@ static void UpdatePHINodes(BasicBlock *OrigBB, BasicBlock *NewBB, PHINode *NewPHI = PHINode::Create(PN->getType(), Preds.size(), PN->getName() + ".ph", BI); if (AA) AA->copyValue(PN, NewPHI); - + // Move all of the PHI values for 'Preds' to the new PHI. for (unsigned i = 0, e = Preds.size(); i != e; ++i) { Value *V = PN->removeIncomingValue(Preds[i], false); @@ -451,16 +451,16 @@ static void UpdatePHINodes(BasicBlock *OrigBB, BasicBlock *NewBB, /// preserve LoopSimplify (because it's complicated to handle the case where one /// of the edges being split is an exit of a loop with other exits). /// -BasicBlock *llvm::SplitBlockPredecessors(BasicBlock *BB, +BasicBlock *llvm::SplitBlockPredecessors(BasicBlock *BB, ArrayRef<BasicBlock*> Preds, const char *Suffix, Pass *P) { // Create new basic block, insert right before the original block. BasicBlock *NewBB = BasicBlock::Create(BB->getContext(), BB->getName()+Suffix, BB->getParent(), BB); - + // The new block unconditionally branches to the old block. BranchInst *BI = BranchInst::Create(BB, NewBB); - + // Move the edges from Preds to point to NewBB instead of BB. for (unsigned i = 0, e = Preds.size(); i != e; ++i) { // This is slightly more strict than necessary; the minimum requirement @@ -497,13 +497,13 @@ BasicBlock *llvm::SplitBlockPredecessors(BasicBlock *BB, /// block gets the remaining predecessors of OrigBB. The landingpad instruction /// OrigBB is clone into both of the new basic blocks. The new blocks are given /// the suffixes 'Suffix1' and 'Suffix2', and are returned in the NewBBs vector. -/// +/// /// This currently updates the LLVM IR, AliasAnalysis, DominatorTree, /// DominanceFrontier, LoopInfo, and LCCSA but no other analyses. In particular, /// it does not preserve LoopSimplify (because it's complicated to handle the /// case where one of the edges being split is an exit of a loop with other /// exits). 
-/// +/// void llvm::SplitLandingPadPredecessors(BasicBlock *OrigBB, ArrayRef<BasicBlock*> Preds, const char *Suffix1, const char *Suffix2, @@ -608,11 +608,11 @@ void llvm::FindFunctionBackedges(const Function &F, const BasicBlock *BB = &F.getEntryBlock(); if (succ_begin(BB) == succ_end(BB)) return; - + SmallPtrSet<const BasicBlock*, 8> Visited; SmallVector<std::pair<const BasicBlock*, succ_const_iterator>, 8> VisitStack; SmallPtrSet<const BasicBlock*, 8> InStack; - + Visited.insert(BB); VisitStack.push_back(std::make_pair(BB, succ_begin(BB))); InStack.insert(BB); @@ -620,7 +620,7 @@ void llvm::FindFunctionBackedges(const Function &F, std::pair<const BasicBlock*, succ_const_iterator> &Top = VisitStack.back(); const BasicBlock *ParentBB = Top.first; succ_const_iterator &I = Top.second; - + bool FoundNew = false; while (I != succ_end(ParentBB)) { BB = *I++; @@ -632,7 +632,7 @@ void llvm::FindFunctionBackedges(const Function &F, if (InStack.count(BB)) Result.push_back(std::make_pair(ParentBB, BB)); } - + if (FoundNew) { // Go down one level if there is a unvisited successor. InStack.insert(BB); @@ -641,7 +641,7 @@ void llvm::FindFunctionBackedges(const Function &F, // Go up one level. InStack.erase(VisitStack.pop_back_val().first); } - } while (!VisitStack.empty()); + } while (!VisitStack.empty()); } /// FoldReturnIntoUncondBranch - This method duplicates the specified return @@ -655,7 +655,7 @@ ReturnInst *llvm::FoldReturnIntoUncondBranch(ReturnInst *RI, BasicBlock *BB, // Clone the return and add it to the end of the predecessor. Instruction *NewRet = RI->clone(); Pred->getInstList().push_back(NewRet); - + // If the return instruction returns a value, and if the value was a // PHI node in "BB", propagate the right value into the return. for (User::op_iterator i = NewRet->op_begin(), e = NewRet->op_end(); @@ -679,7 +679,7 @@ ReturnInst *llvm::FoldReturnIntoUncondBranch(ReturnInst *RI, BasicBlock *BB, } } } - + // Update any PHI nodes in the returning block to realize that we no // longer branch to them. 
BB->removePredecessor(Pred); diff --git a/lib/Transforms/Utils/BuildLibCalls.cpp b/lib/Transforms/Utils/BuildLibCalls.cpp index bf540b0..6d13217 100644 --- a/lib/Transforms/Utils/BuildLibCalls.cpp +++ b/lib/Transforms/Utils/BuildLibCalls.cpp @@ -38,16 +38,16 @@ Value *llvm::EmitStrLen(Value *Ptr, IRBuilder<> &B, const DataLayout *TD, return 0; Module *M = B.GetInsertBlock()->getParent()->getParent(); - AttributeWithIndex AWI[2]; - AWI[0] = AttributeWithIndex::get(M->getContext(), 1, Attribute::NoCapture); + AttributeSet AS[2]; + AS[0] = AttributeSet::get(M->getContext(), 1, Attribute::NoCapture); Attribute::AttrKind AVs[2] = { Attribute::ReadOnly, Attribute::NoUnwind }; - AWI[1] = AttributeWithIndex::get(M->getContext(), AttributeSet::FunctionIndex, - ArrayRef<Attribute::AttrKind>(AVs, 2)); + AS[1] = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex, + ArrayRef<Attribute::AttrKind>(AVs, 2)); LLVMContext &Context = B.GetInsertBlock()->getContext(); Constant *StrLen = M->getOrInsertFunction("strlen", AttributeSet::get(M->getContext(), - AWI), + AS), TD->getIntPtrType(Context), B.getInt8PtrTy(), NULL); @@ -67,16 +67,16 @@ Value *llvm::EmitStrNLen(Value *Ptr, Value *MaxLen, IRBuilder<> &B, return 0; Module *M = B.GetInsertBlock()->getParent()->getParent(); - AttributeWithIndex AWI[2]; - AWI[0] = AttributeWithIndex::get(M->getContext(), 1, Attribute::NoCapture); + AttributeSet AS[2]; + AS[0] = AttributeSet::get(M->getContext(), 1, Attribute::NoCapture); Attribute::AttrKind AVs[2] = { Attribute::ReadOnly, Attribute::NoUnwind }; - AWI[1] = AttributeWithIndex::get(M->getContext(), AttributeSet::FunctionIndex, - ArrayRef<Attribute::AttrKind>(AVs, 2)); + AS[1] = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex, + ArrayRef<Attribute::AttrKind>(AVs, 2)); LLVMContext &Context = B.GetInsertBlock()->getContext(); Constant *StrNLen = M->getOrInsertFunction("strnlen", AttributeSet::get(M->getContext(), - AWI), + AS), TD->getIntPtrType(Context), B.getInt8PtrTy(), TD->getIntPtrType(Context), @@ -98,15 +98,15 @@ Value *llvm::EmitStrChr(Value *Ptr, char C, IRBuilder<> &B, Module *M = B.GetInsertBlock()->getParent()->getParent(); Attribute::AttrKind AVs[2] = { Attribute::ReadOnly, Attribute::NoUnwind }; - AttributeWithIndex AWI = - AttributeWithIndex::get(M->getContext(), AttributeSet::FunctionIndex, - ArrayRef<Attribute::AttrKind>(AVs, 2)); + AttributeSet AS = + AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex, + ArrayRef<Attribute::AttrKind>(AVs, 2)); Type *I8Ptr = B.getInt8PtrTy(); Type *I32Ty = B.getInt32Ty(); Constant *StrChr = M->getOrInsertFunction("strchr", AttributeSet::get(M->getContext(), - AWI), + AS), I8Ptr, I8Ptr, I32Ty, NULL); CallInst *CI = B.CreateCall2(StrChr, CastToCStr(Ptr, B), ConstantInt::get(I32Ty, C), "strchr"); @@ -123,17 +123,17 @@ Value *llvm::EmitStrNCmp(Value *Ptr1, Value *Ptr2, Value *Len, return 0; Module *M = B.GetInsertBlock()->getParent()->getParent(); - AttributeWithIndex AWI[3]; - AWI[0] = AttributeWithIndex::get(M->getContext(), 1, Attribute::NoCapture); - AWI[1] = AttributeWithIndex::get(M->getContext(), 2, Attribute::NoCapture); + AttributeSet AS[3]; + AS[0] = AttributeSet::get(M->getContext(), 1, Attribute::NoCapture); + AS[1] = AttributeSet::get(M->getContext(), 2, Attribute::NoCapture); Attribute::AttrKind AVs[2] = { Attribute::ReadOnly, Attribute::NoUnwind }; - AWI[2] = AttributeWithIndex::get(M->getContext(), AttributeSet::FunctionIndex, - ArrayRef<Attribute::AttrKind>(AVs, 2)); + AS[2] = AttributeSet::get(M->getContext(), 
AttributeSet::FunctionIndex, + ArrayRef<Attribute::AttrKind>(AVs, 2)); LLVMContext &Context = B.GetInsertBlock()->getContext(); Value *StrNCmp = M->getOrInsertFunction("strncmp", AttributeSet::get(M->getContext(), - AWI), + AS), B.getInt32Ty(), B.getInt8PtrTy(), B.getInt8PtrTy(), @@ -156,13 +156,13 @@ Value *llvm::EmitStrCpy(Value *Dst, Value *Src, IRBuilder<> &B, return 0; Module *M = B.GetInsertBlock()->getParent()->getParent(); - AttributeWithIndex AWI[2]; - AWI[0] = AttributeWithIndex::get(M->getContext(), 2, Attribute::NoCapture); - AWI[1] = AttributeWithIndex::get(M->getContext(), AttributeSet::FunctionIndex, - Attribute::NoUnwind); + AttributeSet AS[2]; + AS[0] = AttributeSet::get(M->getContext(), 2, Attribute::NoCapture); + AS[1] = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex, + Attribute::NoUnwind); Type *I8Ptr = B.getInt8PtrTy(); Value *StrCpy = M->getOrInsertFunction(Name, - AttributeSet::get(M->getContext(), AWI), + AttributeSet::get(M->getContext(), AS), I8Ptr, I8Ptr, I8Ptr, NULL); CallInst *CI = B.CreateCall2(StrCpy, CastToCStr(Dst, B), CastToCStr(Src, B), Name); @@ -180,14 +180,14 @@ Value *llvm::EmitStrNCpy(Value *Dst, Value *Src, Value *Len, return 0; Module *M = B.GetInsertBlock()->getParent()->getParent(); - AttributeWithIndex AWI[2]; - AWI[0] = AttributeWithIndex::get(M->getContext(), 2, Attribute::NoCapture); - AWI[1] = AttributeWithIndex::get(M->getContext(), AttributeSet::FunctionIndex, - Attribute::NoUnwind); + AttributeSet AS[2]; + AS[0] = AttributeSet::get(M->getContext(), 2, Attribute::NoCapture); + AS[1] = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex, + Attribute::NoUnwind); Type *I8Ptr = B.getInt8PtrTy(); Value *StrNCpy = M->getOrInsertFunction(Name, AttributeSet::get(M->getContext(), - AWI), + AS), I8Ptr, I8Ptr, I8Ptr, Len->getType(), NULL); CallInst *CI = B.CreateCall3(StrNCpy, CastToCStr(Dst, B), CastToCStr(Src, B), @@ -207,12 +207,12 @@ Value *llvm::EmitMemCpyChk(Value *Dst, Value *Src, Value *Len, Value *ObjSize, return 0; Module *M = B.GetInsertBlock()->getParent()->getParent(); - AttributeWithIndex AWI; - AWI = AttributeWithIndex::get(M->getContext(), AttributeSet::FunctionIndex, - Attribute::NoUnwind); + AttributeSet AS; + AS = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex, + Attribute::NoUnwind); LLVMContext &Context = B.GetInsertBlock()->getContext(); Value *MemCpy = M->getOrInsertFunction("__memcpy_chk", - AttributeSet::get(M->getContext(), AWI), + AttributeSet::get(M->getContext(), AS), B.getInt8PtrTy(), B.getInt8PtrTy(), B.getInt8PtrTy(), @@ -235,13 +235,13 @@ Value *llvm::EmitMemChr(Value *Ptr, Value *Val, return 0; Module *M = B.GetInsertBlock()->getParent()->getParent(); - AttributeWithIndex AWI; + AttributeSet AS; Attribute::AttrKind AVs[2] = { Attribute::ReadOnly, Attribute::NoUnwind }; - AWI = AttributeWithIndex::get(M->getContext(), AttributeSet::FunctionIndex, - ArrayRef<Attribute::AttrKind>(AVs, 2)); + AS = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex, + ArrayRef<Attribute::AttrKind>(AVs, 2)); LLVMContext &Context = B.GetInsertBlock()->getContext(); Value *MemChr = M->getOrInsertFunction("memchr", - AttributeSet::get(M->getContext(), AWI), + AttributeSet::get(M->getContext(), AS), B.getInt8PtrTy(), B.getInt8PtrTy(), B.getInt32Ty(), @@ -263,16 +263,16 @@ Value *llvm::EmitMemCmp(Value *Ptr1, Value *Ptr2, return 0; Module *M = B.GetInsertBlock()->getParent()->getParent(); - AttributeWithIndex AWI[3]; - AWI[0] = AttributeWithIndex::get(M->getContext(), 1, 
Attribute::NoCapture); - AWI[1] = AttributeWithIndex::get(M->getContext(), 2, Attribute::NoCapture); + AttributeSet AS[3]; + AS[0] = AttributeSet::get(M->getContext(), 1, Attribute::NoCapture); + AS[1] = AttributeSet::get(M->getContext(), 2, Attribute::NoCapture); Attribute::AttrKind AVs[2] = { Attribute::ReadOnly, Attribute::NoUnwind }; - AWI[2] = AttributeWithIndex::get(M->getContext(), AttributeSet::FunctionIndex, - ArrayRef<Attribute::AttrKind>(AVs, 2)); + AS[2] = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex, + ArrayRef<Attribute::AttrKind>(AVs, 2)); LLVMContext &Context = B.GetInsertBlock()->getContext(); Value *MemCmp = M->getOrInsertFunction("memcmp", - AttributeSet::get(M->getContext(), AWI), + AttributeSet::get(M->getContext(), AS), B.getInt32Ty(), B.getInt8PtrTy(), B.getInt8PtrTy(), @@ -344,13 +344,13 @@ Value *llvm::EmitPutS(Value *Str, IRBuilder<> &B, const DataLayout *TD, return 0; Module *M = B.GetInsertBlock()->getParent()->getParent(); - AttributeWithIndex AWI[2]; - AWI[0] = AttributeWithIndex::get(M->getContext(), 1, Attribute::NoCapture); - AWI[1] = AttributeWithIndex::get(M->getContext(), AttributeSet::FunctionIndex, - Attribute::NoUnwind); + AttributeSet AS[2]; + AS[0] = AttributeSet::get(M->getContext(), 1, Attribute::NoCapture); + AS[1] = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex, + Attribute::NoUnwind); Value *PutS = M->getOrInsertFunction("puts", - AttributeSet::get(M->getContext(), AWI), + AttributeSet::get(M->getContext(), AS), B.getInt32Ty(), B.getInt8PtrTy(), NULL); @@ -368,14 +368,14 @@ Value *llvm::EmitFPutC(Value *Char, Value *File, IRBuilder<> &B, return 0; Module *M = B.GetInsertBlock()->getParent()->getParent(); - AttributeWithIndex AWI[2]; - AWI[0] = AttributeWithIndex::get(M->getContext(), 2, Attribute::NoCapture); - AWI[1] = AttributeWithIndex::get(M->getContext(), AttributeSet::FunctionIndex, - Attribute::NoUnwind); + AttributeSet AS[2]; + AS[0] = AttributeSet::get(M->getContext(), 2, Attribute::NoCapture); + AS[1] = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex, + Attribute::NoUnwind); Constant *F; if (File->getType()->isPointerTy()) F = M->getOrInsertFunction("fputc", - AttributeSet::get(M->getContext(), AWI), + AttributeSet::get(M->getContext(), AS), B.getInt32Ty(), B.getInt32Ty(), File->getType(), NULL); @@ -401,16 +401,16 @@ Value *llvm::EmitFPutS(Value *Str, Value *File, IRBuilder<> &B, return 0; Module *M = B.GetInsertBlock()->getParent()->getParent(); - AttributeWithIndex AWI[3]; - AWI[0] = AttributeWithIndex::get(M->getContext(), 1, Attribute::NoCapture); - AWI[1] = AttributeWithIndex::get(M->getContext(), 2, Attribute::NoCapture); - AWI[2] = AttributeWithIndex::get(M->getContext(), AttributeSet::FunctionIndex, - Attribute::NoUnwind); + AttributeSet AS[3]; + AS[0] = AttributeSet::get(M->getContext(), 1, Attribute::NoCapture); + AS[1] = AttributeSet::get(M->getContext(), 2, Attribute::NoCapture); + AS[2] = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex, + Attribute::NoUnwind); StringRef FPutsName = TLI->getName(LibFunc::fputs); Constant *F; if (File->getType()->isPointerTy()) F = M->getOrInsertFunction(FPutsName, - AttributeSet::get(M->getContext(), AWI), + AttributeSet::get(M->getContext(), AS), B.getInt32Ty(), B.getInt8PtrTy(), File->getType(), NULL); @@ -434,17 +434,17 @@ Value *llvm::EmitFWrite(Value *Ptr, Value *Size, Value *File, return 0; Module *M = B.GetInsertBlock()->getParent()->getParent(); - AttributeWithIndex AWI[3]; - AWI[0] = 
AttributeWithIndex::get(M->getContext(), 1, Attribute::NoCapture); - AWI[1] = AttributeWithIndex::get(M->getContext(), 4, Attribute::NoCapture); - AWI[2] = AttributeWithIndex::get(M->getContext(), AttributeSet::FunctionIndex, - Attribute::NoUnwind); + AttributeSet AS[3]; + AS[0] = AttributeSet::get(M->getContext(), 1, Attribute::NoCapture); + AS[1] = AttributeSet::get(M->getContext(), 4, Attribute::NoCapture); + AS[2] = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex, + Attribute::NoUnwind); LLVMContext &Context = B.GetInsertBlock()->getContext(); StringRef FWriteName = TLI->getName(LibFunc::fwrite); Constant *F; if (File->getType()->isPointerTy()) F = M->getOrInsertFunction(FWriteName, - AttributeSet::get(M->getContext(), AWI), + AttributeSet::get(M->getContext(), AS), TD->getIntPtrType(Context), B.getInt8PtrTy(), TD->getIntPtrType(Context), diff --git a/lib/Transforms/Utils/CloneFunction.cpp b/lib/Transforms/Utils/CloneFunction.cpp index ccc3eae..a309bce 100644 --- a/lib/Transforms/Utils/CloneFunction.cpp +++ b/lib/Transforms/Utils/CloneFunction.cpp @@ -95,18 +95,16 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc, for (Function::const_arg_iterator I = OldFunc->arg_begin(), E = OldFunc->arg_end(); I != E; ++I) if (Argument* Anew = dyn_cast<Argument>(VMap[I])) - Anew->addAttr( OldFunc->getAttributes() + Anew->addAttr(OldFunc->getAttributes() .getParamAttributes(I->getArgNo() + 1)); NewFunc->setAttributes(NewFunc->getAttributes() - .addAttr(NewFunc->getContext(), - AttributeSet::ReturnIndex, - OldFunc->getAttributes() - .getRetAttributes())); + .addAttributes(NewFunc->getContext(), + AttributeSet::ReturnIndex, + OldFunc->getAttributes())); NewFunc->setAttributes(NewFunc->getAttributes() - .addAttr(NewFunc->getContext(), - AttributeSet::FunctionIndex, - OldFunc->getAttributes() - .getFnAttributes())); + .addAttributes(NewFunc->getContext(), + AttributeSet::FunctionIndex, + OldFunc->getAttributes())); } diff --git a/lib/Transforms/Utils/CodeExtractor.cpp b/lib/Transforms/Utils/CodeExtractor.cpp index 3a21528..f7c659f 100644 --- a/lib/Transforms/Utils/CodeExtractor.cpp +++ b/lib/Transforms/Utils/CodeExtractor.cpp @@ -15,6 +15,7 @@ #include "llvm/Transforms/Utils/CodeExtractor.h" #include "llvm/ADT/SetVector.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/LoopInfo.h" diff --git a/lib/Transforms/Utils/DemoteRegToStack.cpp b/lib/Transforms/Utils/DemoteRegToStack.cpp index d5c41f5..db525cd 100644 --- a/lib/Transforms/Utils/DemoteRegToStack.cpp +++ b/lib/Transforms/Utils/DemoteRegToStack.cpp @@ -7,6 +7,7 @@ // //===----------------------------------------------------------------------===// +#include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/ADT/DenseMap.h" #include "llvm/IR/Function.h" @@ -78,12 +79,21 @@ AllocaInst *llvm::DemoteRegToStack(Instruction &I, bool VolatileLoads, InsertPt = &I; ++InsertPt; } else { - // We cannot demote invoke instructions to the stack if their normal edge - // is critical. InvokeInst &II = cast<InvokeInst>(I); - assert(II.getNormalDest()->getSinglePredecessor() && - "Cannot demote invoke with a critical successor!"); - InsertPt = II.getNormalDest()->begin(); + if (II.getNormalDest()->getSinglePredecessor()) + InsertPt = II.getNormalDest()->getFirstInsertionPt(); + else { + // We cannot demote invoke instructions to the stack if their normal edge + // is critical. 
Therefore, split the critical edge and insert the store + // in the newly created basic block. + unsigned SuccNum = GetSuccessorNumber(I.getParent(), II.getNormalDest()); + TerminatorInst *TI = &cast<TerminatorInst>(I); + assert (isCriticalEdge(TI, SuccNum) && + "Expected a critical edge!"); + BasicBlock *BB = SplitCriticalEdge(TI, SuccNum); + assert (BB && "Unable to split critical edge."); + InsertPt = BB->getFirstInsertionPt(); + } } for (; isa<PHINode>(InsertPt) || isa<LandingPadInst>(InsertPt); ++InsertPt) diff --git a/lib/Transforms/Utils/IntegerDivision.cpp b/lib/Transforms/Utils/IntegerDivision.cpp index 5187d7c..3cb8ded 100644 --- a/lib/Transforms/Utils/IntegerDivision.cpp +++ b/lib/Transforms/Utils/IntegerDivision.cpp @@ -418,3 +418,107 @@ bool llvm::expandDivision(BinaryOperator *Div) { return true; } + +/// Generate code to compute the remainder of two integers of bitwidth up to +/// 32 bits. Uses the above routines and extends the inputs/truncates the +/// outputs to operate in 32 bits; that is, these routines are good for targets +/// that have no or very little support for smaller than 32 bit integer +/// arithmetic. +/// +/// @brief Replace Rem with emulation code. +bool llvm::expandRemainderUpTo32Bits(BinaryOperator *Rem) { + assert((Rem->getOpcode() == Instruction::SRem || + Rem->getOpcode() == Instruction::URem) && + "Trying to expand remainder from a non-remainder function"); + + Type *RemTy = Rem->getType(); + if (RemTy->isVectorTy()) + llvm_unreachable("Div over vectors not supported"); + + unsigned RemTyBitWidth = RemTy->getIntegerBitWidth(); + + if (RemTyBitWidth > 32) + llvm_unreachable("Div of bitwidth greater than 32 not supported"); + + if (RemTyBitWidth == 32) + return expandRemainder(Rem); + + // If bitwidth smaller than 32 extend inputs, truncate output and proceed + // with 32 bit division. + IRBuilder<> Builder(Rem); + + Value *ExtDividend; + Value *ExtDivisor; + Value *ExtRem; + Value *Trunc; + Type *Int32Ty = Builder.getInt32Ty(); + + if (Rem->getOpcode() == Instruction::SRem) { + ExtDividend = Builder.CreateSExt(Rem->getOperand(0), Int32Ty); + ExtDivisor = Builder.CreateSExt(Rem->getOperand(1), Int32Ty); + ExtRem = Builder.CreateSRem(ExtDividend, ExtDivisor); + } else { + ExtDividend = Builder.CreateZExt(Rem->getOperand(0), Int32Ty); + ExtDivisor = Builder.CreateZExt(Rem->getOperand(1), Int32Ty); + ExtRem = Builder.CreateURem(ExtDividend, ExtDivisor); + } + Trunc = Builder.CreateTrunc(ExtRem, RemTy); + + Rem->replaceAllUsesWith(Trunc); + Rem->dropAllReferences(); + Rem->eraseFromParent(); + + return expandRemainder(cast<BinaryOperator>(ExtRem)); +} + + +/// Generate code to divide two integers of bitwidth up to 32 bits. Uses the +/// above routines and extends the inputs/truncates the outputs to operate +/// in 32 bits; that is, these routines are good for targets that have no +/// or very little support for smaller than 32 bit integer arithmetic. +/// +/// @brief Replace Div with emulation code.
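// Illustrative sketch, not part of this patch: a plausible call site for
// expandRemainderUpTo32Bits above and expandDivisionUpTo32Bits defined just
// below, e.g. in a target-specific pass that softens narrow integer
// division. Only the two expand* entry points are real; the surrounding pass
// logic is hypothetical. Candidates are collected first because the
// expansion splits blocks and erases the original instruction.
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Transforms/Utils/IntegerDivision.h"
using namespace llvm;

static bool expandNarrowDivRem(Function &F) {
  SmallVector<BinaryOperator *, 8> Worklist;
  for (Function::iterator BB = F.begin(), BE = F.end(); BB != BE; ++BB)
    for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I)
      if (BinaryOperator *BO = dyn_cast<BinaryOperator>(&*I)) {
        Type *T = BO->getType();
        if (!T->isIntegerTy() || T->getIntegerBitWidth() > 32)
          continue; // the helpers only handle scalar integers up to i32
        unsigned Opc = BO->getOpcode();
        if (Opc == Instruction::SDiv || Opc == Instruction::UDiv ||
            Opc == Instruction::SRem || Opc == Instruction::URem)
          Worklist.push_back(BO);
      }

  bool Changed = false;
  for (unsigned i = 0, e = Worklist.size(); i != e; ++i) {
    unsigned Opc = Worklist[i]->getOpcode();
    if (Opc == Instruction::SDiv || Opc == Instruction::UDiv)
      Changed |= expandDivisionUpTo32Bits(Worklist[i]);
    else
      Changed |= expandRemainderUpTo32Bits(Worklist[i]);
  }
  return Changed;
}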
+bool llvm::expandDivisionUpTo32Bits(BinaryOperator *Div) { + assert((Div->getOpcode() == Instruction::SDiv || + Div->getOpcode() == Instruction::UDiv) && + "Trying to expand division from a non-division function"); + + Type *DivTy = Div->getType(); + if (DivTy->isVectorTy()) + llvm_unreachable("Div over vectors not supported"); + + unsigned DivTyBitWidth = DivTy->getIntegerBitWidth(); + + if (DivTyBitWidth > 32) + llvm_unreachable("Div of bitwidth greater than 32 not supported"); + + if (DivTyBitWidth == 32) + return expandDivision(Div); + + // If bitwidth smaller than 32 extend inputs, truncate output and proceed + // with 32 bit division. + IRBuilder<> Builder(Div); + + Value *ExtDividend; + Value *ExtDivisor; + Value *ExtDiv; + Value *Trunc; + Type *Int32Ty = Builder.getInt32Ty(); + + if (Div->getOpcode() == Instruction::SDiv) { + ExtDividend = Builder.CreateSExt(Div->getOperand(0), Int32Ty); + ExtDivisor = Builder.CreateSExt(Div->getOperand(1), Int32Ty); + ExtDiv = Builder.CreateSDiv(ExtDividend, ExtDivisor); + } else { + ExtDividend = Builder.CreateZExt(Div->getOperand(0), Int32Ty); + ExtDivisor = Builder.CreateZExt(Div->getOperand(1), Int32Ty); + ExtDiv = Builder.CreateUDiv(ExtDividend, ExtDivisor); + } + Trunc = Builder.CreateTrunc(ExtDiv, DivTy); + + Div->replaceAllUsesWith(Trunc); + Div->dropAllReferences(); + Div->eraseFromParent(); + + return expandDivision(cast<BinaryOperator>(ExtDiv)); +} diff --git a/lib/Transforms/Utils/MetaRenamer.cpp b/lib/Transforms/Utils/MetaRenamer.cpp index d519fb7..3716f58 100644 --- a/lib/Transforms/Utils/MetaRenamer.cpp +++ b/lib/Transforms/Utils/MetaRenamer.cpp @@ -72,13 +72,23 @@ namespace { // Rename all aliases for (Module::alias_iterator AI = M.alias_begin(), AE = M.alias_end(); - AI != AE; ++AI) - AI->setName("alias"); + AI != AE; ++AI) { + StringRef Name = AI->getName(); + if (Name.startswith("llvm.") || (!Name.empty() && Name[0] == 1)) + continue; + AI->setName("alias"); + } + // Rename all global variables for (Module::global_iterator GI = M.global_begin(), GE = M.global_end(); - GI != GE; ++GI) + GI != GE; ++GI) { + StringRef Name = GI->getName(); + if (Name.startswith("llvm.") || (!Name.empty() && Name[0] == 1)) + continue; + GI->setName("global"); + } // Rename all struct types TypeFinder StructTypes; @@ -95,6 +105,10 @@ namespace { // Rename all functions for (Module::iterator FI = M.begin(), FE = M.end(); FI != FE; ++FI) { + StringRef Name = FI->getName(); + if (Name.startswith("llvm.") || (!Name.empty() && Name[0] == 1)) + continue; + FI->setName(metaNames[prng.rand() % array_lengthof(metaNames)]); runOnFunction(*FI); } diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp index f10c35f..a63d31d 100644 --- a/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/lib/Transforms/Utils/SimplifyCFG.cpp @@ -1332,149 +1332,180 @@ static bool SinkThenElseCodeToEnd(BranchInst *BI1) { return Changed; } -/// SpeculativelyExecuteBB - Given a conditional branch that goes to BB1 -/// and an BB2 and the only successor of BB1 is BB2, hoist simple code -/// (for now, restricted to a single instruction that's side effect free) from -/// the BB1 into the branch block to speculatively execute it. +/// \brief Speculate a conditional basic block flattening the CFG. /// -/// Turn -/// BB: -/// %t1 = icmp -/// br i1 %t1, label %BB1, label %BB2 -/// BB1: -/// %t3 = add %t2, c +/// Note that this is a very risky transform currently. Speculating +/// instructions like this is most often not desirable. 
Instead, there is an MI +/// pass which can do it with full awareness of the resource constraints. +/// However, some cases are "obvious" and we should do them directly. An example +/// of this is speculating a single, reasonably cheap instruction. +/// +/// There is only one distinct advantage to flattening the CFG at the IR level: +/// it makes very common but simplistic optimizations, such as those in +/// instcombine and the DAG combiner, more powerful by removing CFG edges and +/// modeling their effects with SSA value graphs that are easier to +/// reason about. +/// +/// +/// An illustration of this transform is turning this IR: +/// \code +/// BB: +/// %cmp = icmp ult %x, %y +/// br i1 %cmp, label %EndBB, label %ThenBB +/// ThenBB: +/// %sub = sub %x, %y +/// br label %EndBB -/// BB2: -/// => -/// BB: -/// %t1 = icmp -/// %t4 = add %t2, c -/// %t3 = select i1 %t1, %t2, %t3 -static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *BB1) { - // Only speculatively execution a single instruction (not counting the - // terminator) for now. - Instruction *HInst = NULL; - Instruction *Term = BB1->getTerminator(); - for (BasicBlock::iterator BBI = BB1->begin(), BBE = BB1->end(); +/// EndBB: +/// %phi = phi [ %sub, %ThenBB ], [ 0, %BB ] +/// ... +/// \endcode +/// +/// Into this IR: +/// \code +/// BB: +/// %cmp = icmp ult %x, %y +/// %sub = sub %x, %y +/// %cond = select i1 %cmp, 0, %sub +/// ... +/// \endcode +/// +/// \returns true if the conditional block is removed. +static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB) { + // Be conservative for now. FP select instruction can often be expensive. + Value *BrCond = BI->getCondition(); + if (isa<FCmpInst>(BrCond)) + return false; + + BasicBlock *BB = BI->getParent(); + BasicBlock *EndBB = ThenBB->getTerminator()->getSuccessor(0); + + // If ThenBB is actually on the false edge of the conditional branch, remember + // to swap the select operands later. + bool Invert = false; + if (ThenBB != BI->getSuccessor(0)) { + assert(ThenBB == BI->getSuccessor(1) && "No edge from 'if' block?"); + Invert = true; + } + assert(EndBB == BI->getSuccessor(!Invert) && "No edge from BB to end block"); + + // Keep a count of how many times instructions are used within ThenBB when + // they are candidates for sinking into ThenBB. Specifically: + // - They are defined in BB, and + // - They have no side effects, and + // - All of their uses are in ThenBB. + SmallDenseMap<Instruction *, unsigned, 4> SinkCandidateUseCounts; + + unsigned SpeculationCost = 0; + for (BasicBlock::iterator BBI = ThenBB->begin(), + BBE = llvm::prior(ThenBB->end()); BBI != BBE; ++BBI) { Instruction *I = BBI; // Skip debug info. - if (isa<DbgInfoIntrinsic>(I)) continue; - if (I == Term) break; + if (isa<DbgInfoIntrinsic>(I)) + continue; - if (HInst) + // Only speculatively execute a single instruction (not counting the + // terminator) for now. + ++SpeculationCost; + if (SpeculationCost > 1) return false; - HInst = I; - } - - BasicBlock *BIParent = BI->getParent(); - // Check the instruction to be hoisted, if there is one. - if (HInst) { // Don't hoist the instruction if it's unsafe or expensive. - if (!isSafeToSpeculativelyExecute(HInst)) + if (!isSafeToSpeculativelyExecute(I)) return false; - if (ComputeSpeculationCost(HInst) > PHINodeFoldingThreshold) + if (ComputeSpeculationCost(I) > PHINodeFoldingThreshold) return false; // Do not hoist the instruction if any of its operands are defined but not // used in this BB.
The transformation will prevent the operand from // being sunk into the use block. - for (User::op_iterator i = HInst->op_begin(), e = HInst->op_end(); + for (User::op_iterator i = I->op_begin(), e = I->op_end(); i != e; ++i) { Instruction *OpI = dyn_cast<Instruction>(*i); - if (OpI && OpI->getParent() == BIParent && - !OpI->mayHaveSideEffects() && - !OpI->isUsedInBasicBlock(BIParent)) - return false; + if (!OpI || OpI->getParent() != BB || + OpI->mayHaveSideEffects()) + continue; // Not a candidate for sinking. + + ++SinkCandidateUseCounts[OpI]; } } - // Be conservative for now. FP select instruction can often be expensive. - Value *BrCond = BI->getCondition(); - if (isa<FCmpInst>(BrCond)) - return false; - - // If BB1 is actually on the false edge of the conditional branch, remember - // to swap the select operands later. - bool Invert = false; - if (BB1 != BI->getSuccessor(0)) { - assert(BB1 == BI->getSuccessor(1) && "No edge from 'if' block?"); - Invert = true; - } + // Consider any sink candidates which are only used in ThenBB as costs for + // speculation. Note, while we iterate over a DenseMap here, we are summing + // and so iteration order isn't significant. + for (SmallDenseMap<Instruction *, unsigned, 4>::iterator I = + SinkCandidateUseCounts.begin(), E = SinkCandidateUseCounts.end(); + I != E; ++I) + if (I->first->getNumUses() == I->second) { + ++SpeculationCost; + if (SpeculationCost > 1) + return false; + } - // Collect interesting PHIs, and scan for hazards. - SmallSetVector<std::pair<Value *, Value *>, 4> PHIs; - BasicBlock *BB2 = BB1->getTerminator()->getSuccessor(0); - for (BasicBlock::iterator I = BB2->begin(); + // Check that the PHI nodes can be converted to selects. + bool HaveRewritablePHIs = false; + for (BasicBlock::iterator I = EndBB->begin(); PHINode *PN = dyn_cast<PHINode>(I); ++I) { - Value *BB1V = PN->getIncomingValueForBlock(BB1); - Value *BIParentV = PN->getIncomingValueForBlock(BIParent); + Value *OrigV = PN->getIncomingValueForBlock(BB); + Value *ThenV = PN->getIncomingValueForBlock(ThenBB); // Skip PHIs which are trivial. - if (BB1V == BIParentV) + if (ThenV == OrigV) continue; - // Check for safety. - if (ConstantExpr *CE = dyn_cast<ConstantExpr>(BB1V)) { - // An unfolded ConstantExpr could end up getting expanded into - // Instructions. Don't speculate this and another instruction at - // the same time. - if (HInst) - return false; - if (!isSafeToSpeculativelyExecute(CE)) - return false; - if (ComputeSpeculationCost(CE) > PHINodeFoldingThreshold) - return false; - } + HaveRewritablePHIs = true; + ConstantExpr *CE = dyn_cast<ConstantExpr>(ThenV); + if (!CE) + continue; // Known safe and cheap. + + if (!isSafeToSpeculativelyExecute(CE)) + return false; + if (ComputeSpeculationCost(CE) > PHINodeFoldingThreshold) + return false; - // Ok, we may insert a select for this PHI. - PHIs.insert(std::make_pair(BB1V, BIParentV)); + // Account for the cost of an unfolded ConstantExpr which could end up + // getting expanded into Instructions. + // FIXME: This doesn't account for how many operations are combined in the + // constant expression. + ++SpeculationCost; + if (SpeculationCost > 1) + return false; } // If there are no PHIs to process, bail early. This helps ensure idempotence // as well. - if (PHIs.empty()) + if (!HaveRewritablePHIs) return false; // If we get here, we can hoist the instruction and if-convert.
- DEBUG(dbgs() << "SPECULATIVELY EXECUTING BB" << *BB1 << "\n";); + DEBUG(dbgs() << "SPECULATIVELY EXECUTING BB" << *ThenBB << "\n";); - // Hoist the instruction. - if (HInst) - BIParent->getInstList().splice(BI, BB1->getInstList(), HInst); + // Hoist the instructions. + BB->getInstList().splice(BI, ThenBB->getInstList(), ThenBB->begin(), + llvm::prior(ThenBB->end())); // Insert selects and rewrite the PHI operands. IRBuilder<true, NoFolder> Builder(BI); - for (unsigned i = 0, e = PHIs.size(); i != e; ++i) { - Value *TrueV = PHIs[i].first; - Value *FalseV = PHIs[i].second; + for (BasicBlock::iterator I = EndBB->begin(); + PHINode *PN = dyn_cast<PHINode>(I); ++I) { + unsigned OrigI = PN->getBasicBlockIndex(BB); + unsigned ThenI = PN->getBasicBlockIndex(ThenBB); + Value *OrigV = PN->getIncomingValue(OrigI); + Value *ThenV = PN->getIncomingValue(ThenI); + + // Skip PHIs which are trivial. + if (OrigV == ThenV) + continue; // Create a select whose true value is the speculatively executed value and - // false value is the previously determined FalseV. - SelectInst *SI; + // false value is the preexisting value. Swap them if the branch + // destinations were inverted. + Value *TrueV = ThenV, *FalseV = OrigV; if (Invert) - SI = cast<SelectInst> - (Builder.CreateSelect(BrCond, FalseV, TrueV, - FalseV->getName() + "." + TrueV->getName())); - else - SI = cast<SelectInst> - (Builder.CreateSelect(BrCond, TrueV, FalseV, - TrueV->getName() + "." + FalseV->getName())); - - // Make the PHI node use the select for all incoming values for "then" and - // "if" blocks. - for (BasicBlock::iterator I = BB2->begin(); - PHINode *PN = dyn_cast<PHINode>(I); ++I) { - unsigned BB1I = PN->getBasicBlockIndex(BB1); - unsigned BIParentI = PN->getBasicBlockIndex(BIParent); - Value *BB1V = PN->getIncomingValue(BB1I); - Value *BIParentV = PN->getIncomingValue(BIParentI); - if (TrueV == BB1V && FalseV == BIParentV) { - PN->setIncomingValue(BB1I, SI); - PN->setIncomingValue(BIParentI, SI); - } - } + std::swap(TrueV, FalseV); + Value *V = Builder.CreateSelect(BrCond, TrueV, FalseV, + TrueV->getName() + "." + FalseV->getName()); + PN->setIncomingValue(OrigI, V); + PN->setIncomingValue(ThenI, V); } ++NumSpeculations; @@ -3382,7 +3413,8 @@ SwitchLookupTable::SwitchLookupTable(Module &M, ConstantInt *Offset, const SmallVector<std::pair<ConstantInt*, Constant*>, 4>& Values, Constant *DefaultValue, - const DataLayout *TD) { + const DataLayout *TD) + : SingleValue(0), BitMap(0), BitMapElementTy(0), Array(0) { assert(Values.size() && "Can't build lookup table without values!"); assert(TableSize >= Values.size() && "Can't fit values in table!"); diff --git a/lib/Transforms/Utils/SimplifyLibCalls.cpp b/lib/Transforms/Utils/SimplifyLibCalls.cpp index 83c74e7..8ad566c 100644 --- a/lib/Transforms/Utils/SimplifyLibCalls.cpp +++ b/lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -50,6 +50,10 @@ public: virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) =0; + /// ignoreCallingConv - Returns false if this transformation could possibly + /// change the calling convention. + virtual bool ignoreCallingConv() { return false; } + Value *optimizeCall(CallInst *CI, const DataLayout *TD, const TargetLibraryInfo *TLI, const LibCallSimplifier *LCS, IRBuilder<> &B) { @@ -61,7 +65,7 @@ public: Context = &CI->getCalledFunction()->getContext(); // We never change the calling convention. 
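// Illustrative sketch, not part of this patch: how a transform opts out of
// the C calling-convention guard below via the new ignoreCallingConv() hook.
// The struct is hypothetical, written as it would appear in this file; it
// mirrors the real overrides added to StrLenOpt and AbsOpt further down.
struct HypotheticalArgOnlyOpt : public LibCallOptimization {
  // Folding based purely on the call's arguments never rewrites the call
  // into a different callee, so the calling convention cannot change and
  // the guard may be skipped.
  virtual bool ignoreCallingConv() { return true; }
  virtual Value *callOptimizer(Function *Callee, CallInst *CI,
                               IRBuilder<> &B) {
    return 0; // placeholder: a real optimizer returns the simplified value
  }
};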
- if (CI->getCallingConv() != llvm::CallingConv::C) + if (!ignoreCallingConv() && CI->getCallingConv() != llvm::CallingConv::C) return NULL; return callOptimizer(CI->getCalledFunction(), CI, B); @@ -724,6 +728,7 @@ struct StrNCpyOpt : public LibCallOptimization { }; struct StrLenOpt : public LibCallOptimization { + virtual bool ignoreCallingConv() { return true; } virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 1 || @@ -1260,6 +1265,7 @@ struct FFSOpt : public LibCallOptimization { }; struct AbsOpt : public LibCallOptimization { + virtual bool ignoreCallingConv() { return true; } virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { FunctionType *FT = Callee->getFunctionType(); // We require integer(integer) where the types agree. @@ -1883,6 +1889,7 @@ LibCallSimplifier::~LibCallSimplifier() { } Value *LibCallSimplifier::optimizeCall(CallInst *CI) { + if (CI->hasFnAttr(Attribute::NoBuiltin)) return 0; return Impl->optimizeCall(CI); } diff --git a/lib/Transforms/Utils/ValueMapper.cpp b/lib/Transforms/Utils/ValueMapper.cpp index a5e1643..b5941bd 100644 --- a/lib/Transforms/Utils/ValueMapper.cpp +++ b/lib/Transforms/Utils/ValueMapper.cpp @@ -63,14 +63,29 @@ Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, RemapFlags Flags, // Check all operands to see if any need to be remapped. for (unsigned i = 0, e = MD->getNumOperands(); i != e; ++i) { Value *OP = MD->getOperand(i); - if (OP == 0 || MapValue(OP, VM, Flags, TypeMapper) == OP) continue; + if (OP == 0) continue; + Value *Mapped_OP = MapValue(OP, VM, Flags, TypeMapper); + // Use identity map if Mapped_Op is null and we can ignore missing + // entries. + if (Mapped_OP == OP || + (Mapped_OP == 0 && (Flags & RF_IgnoreMissingEntries))) + continue; // Ok, at least one operand needs remapping. SmallVector<Value*, 4> Elts; Elts.reserve(MD->getNumOperands()); for (i = 0; i != e; ++i) { Value *Op = MD->getOperand(i); - Elts.push_back(Op ? MapValue(Op, VM, Flags, TypeMapper) : 0); + if (Op == 0) + Elts.push_back(0); + else { + Value *Mapped_Op = MapValue(Op, VM, Flags, TypeMapper); + // Use identity map if Mapped_Op is null and we can ignore missing + // entries. 
+ if (Mapped_Op == 0 && (Flags & RF_IgnoreMissingEntries)) + Mapped_Op = Op; + Elts.push_back(Mapped_Op); + } } MDNode *NewMD = MDNode::get(V->getContext(), Elts); Dummy->replaceAllUsesWith(NewMD); diff --git a/lib/Transforms/Vectorize/BBVectorize.cpp b/lib/Transforms/Vectorize/BBVectorize.cpp index d72a4a1..7636541 100644 --- a/lib/Transforms/Vectorize/BBVectorize.cpp +++ b/lib/Transforms/Vectorize/BBVectorize.cpp @@ -48,7 +48,6 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/Local.h" #include <algorithm> -#include <map> using namespace llvm; static cl::opt<bool> @@ -89,6 +88,10 @@ MaxInsts("bb-vectorize-max-instr-per-group", cl::init(500), cl::Hidden, cl::desc("The maximum number of pairable instructions per group")); static cl::opt<unsigned> +MaxPairs("bb-vectorize-max-pairs-per-group", cl::init(3000), cl::Hidden, + cl::desc("The maximum number of candidate instruction pairs per group")); + +static cl::opt<unsigned> MaxCandPairsForCycleCheck("bb-vectorize-max-cycle-check-pairs", cl::init(200), cl::Hidden, cl::desc("The maximum number of candidate pairs with which to use" " a full cycle check")); @@ -207,11 +210,6 @@ namespace { typedef std::pair<ValuePair, size_t> ValuePairWithDepth; typedef std::pair<ValuePair, ValuePair> VPPair; // A ValuePair pair typedef std::pair<VPPair, unsigned> VPPairWithType; - typedef std::pair<std::multimap<Value *, Value *>::iterator, - std::multimap<Value *, Value *>::iterator> VPIteratorPair; - typedef std::pair<std::multimap<ValuePair, ValuePair>::iterator, - std::multimap<ValuePair, ValuePair>::iterator> - VPPIteratorPair; AliasAnalysis *AA; DominatorTree *DT; @@ -225,7 +223,7 @@ namespace { bool getCandidatePairs(BasicBlock &BB, BasicBlock::iterator &Start, - std::multimap<Value *, Value *> &CandidatePairs, + DenseMap<Value *, std::vector<Value *> > &CandidatePairs, DenseSet<ValuePair> &FixedOrderPairs, DenseMap<ValuePair, int> &CandidatePairCostSavings, std::vector<Value *> &PairableInsts, bool NonPow2Len); @@ -239,33 +237,36 @@ namespace { PairConnectionSplat }; - void computeConnectedPairs(std::multimap<Value *, Value *> &CandidatePairs, - std::vector<Value *> &PairableInsts, - std::multimap<ValuePair, ValuePair> &ConnectedPairs, - DenseMap<VPPair, unsigned> &PairConnectionTypes); + void computeConnectedPairs( + DenseMap<Value *, std::vector<Value *> > &CandidatePairs, + DenseSet<ValuePair> &CandidatePairsSet, + std::vector<Value *> &PairableInsts, + DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, + DenseMap<VPPair, unsigned> &PairConnectionTypes); void buildDepMap(BasicBlock &BB, - std::multimap<Value *, Value *> &CandidatePairs, - std::vector<Value *> &PairableInsts, - DenseSet<ValuePair> &PairableInstUsers); - - void choosePairs(std::multimap<Value *, Value *> &CandidatePairs, - DenseMap<ValuePair, int> &CandidatePairCostSavings, - std::vector<Value *> &PairableInsts, - DenseSet<ValuePair> &FixedOrderPairs, - DenseMap<VPPair, unsigned> &PairConnectionTypes, - std::multimap<ValuePair, ValuePair> &ConnectedPairs, - std::multimap<ValuePair, ValuePair> &ConnectedPairDeps, - DenseSet<ValuePair> &PairableInstUsers, - DenseMap<Value *, Value *>& ChosenPairs); + DenseMap<Value *, std::vector<Value *> > &CandidatePairs, + std::vector<Value *> &PairableInsts, + DenseSet<ValuePair> &PairableInstUsers); + + void choosePairs(DenseMap<Value *, std::vector<Value *> > &CandidatePairs, + DenseSet<ValuePair> &CandidatePairsSet, + DenseMap<ValuePair, int> &CandidatePairCostSavings, + std::vector<Value *> &PairableInsts, + 
DenseSet<ValuePair> &FixedOrderPairs, + DenseMap<VPPair, unsigned> &PairConnectionTypes, + DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, + DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps, + DenseSet<ValuePair> &PairableInstUsers, + DenseMap<Value *, Value *>& ChosenPairs); void fuseChosenPairs(BasicBlock &BB, - std::vector<Value *> &PairableInsts, - DenseMap<Value *, Value *>& ChosenPairs, - DenseSet<ValuePair> &FixedOrderPairs, - DenseMap<VPPair, unsigned> &PairConnectionTypes, - std::multimap<ValuePair, ValuePair> &ConnectedPairs, - std::multimap<ValuePair, ValuePair> &ConnectedPairDeps); + std::vector<Value *> &PairableInsts, + DenseMap<Value *, Value *>& ChosenPairs, + DenseSet<ValuePair> &FixedOrderPairs, + DenseMap<VPPair, unsigned> &PairConnectionTypes, + DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, + DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps); bool isInstVectorizable(Instruction *I, bool &IsSimpleLoadStore); @@ -277,56 +278,63 @@ namespace { bool trackUsesOfI(DenseSet<Value *> &Users, AliasSetTracker &WriteSet, Instruction *I, Instruction *J, bool UpdateUsers = true, - std::multimap<Value *, Value *> *LoadMoveSet = 0); + DenseSet<ValuePair> *LoadMoveSetPairs = 0); - void computePairsConnectedTo( - std::multimap<Value *, Value *> &CandidatePairs, - std::vector<Value *> &PairableInsts, - std::multimap<ValuePair, ValuePair> &ConnectedPairs, - DenseMap<VPPair, unsigned> &PairConnectionTypes, - ValuePair P); + void computePairsConnectedTo( + DenseMap<Value *, std::vector<Value *> > &CandidatePairs, + DenseSet<ValuePair> &CandidatePairsSet, + std::vector<Value *> &PairableInsts, + DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, + DenseMap<VPPair, unsigned> &PairConnectionTypes, + ValuePair P); bool pairsConflict(ValuePair P, ValuePair Q, - DenseSet<ValuePair> &PairableInstUsers, - std::multimap<ValuePair, ValuePair> *PairableInstUserMap = 0); + DenseSet<ValuePair> &PairableInstUsers, + DenseMap<ValuePair, std::vector<ValuePair> > + *PairableInstUserMap = 0, + DenseSet<VPPair> *PairableInstUserPairSet = 0); bool pairWillFormCycle(ValuePair P, - std::multimap<ValuePair, ValuePair> &PairableInstUsers, - DenseSet<ValuePair> &CurrentPairs); - - void pruneTreeFor( - std::multimap<Value *, Value *> &CandidatePairs, - std::vector<Value *> &PairableInsts, - std::multimap<ValuePair, ValuePair> &ConnectedPairs, - DenseSet<ValuePair> &PairableInstUsers, - std::multimap<ValuePair, ValuePair> &PairableInstUserMap, - DenseMap<Value *, Value *> &ChosenPairs, - DenseMap<ValuePair, size_t> &Tree, - DenseSet<ValuePair> &PrunedTree, ValuePair J, - bool UseCycleCheck); - - void buildInitialTreeFor( - std::multimap<Value *, Value *> &CandidatePairs, - std::vector<Value *> &PairableInsts, - std::multimap<ValuePair, ValuePair> &ConnectedPairs, - DenseSet<ValuePair> &PairableInstUsers, - DenseMap<Value *, Value *> &ChosenPairs, - DenseMap<ValuePair, size_t> &Tree, ValuePair J); - - void findBestTreeFor( - std::multimap<Value *, Value *> &CandidatePairs, - DenseMap<ValuePair, int> &CandidatePairCostSavings, - std::vector<Value *> &PairableInsts, - DenseSet<ValuePair> &FixedOrderPairs, - DenseMap<VPPair, unsigned> &PairConnectionTypes, - std::multimap<ValuePair, ValuePair> &ConnectedPairs, - std::multimap<ValuePair, ValuePair> &ConnectedPairDeps, - DenseSet<ValuePair> &PairableInstUsers, - std::multimap<ValuePair, ValuePair> &PairableInstUserMap, - DenseMap<Value *, Value *> &ChosenPairs, - DenseSet<ValuePair> &BestTree, size_t 
&BestMaxDepth, - int &BestEffSize, VPIteratorPair ChoiceRange, - bool UseCycleCheck); + DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUsers, + DenseSet<ValuePair> &CurrentPairs); + + void pruneDAGFor( + DenseMap<Value *, std::vector<Value *> > &CandidatePairs, + std::vector<Value *> &PairableInsts, + DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, + DenseSet<ValuePair> &PairableInstUsers, + DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap, + DenseSet<VPPair> &PairableInstUserPairSet, + DenseMap<Value *, Value *> &ChosenPairs, + DenseMap<ValuePair, size_t> &DAG, + DenseSet<ValuePair> &PrunedDAG, ValuePair J, + bool UseCycleCheck); + + void buildInitialDAGFor( + DenseMap<Value *, std::vector<Value *> > &CandidatePairs, + DenseSet<ValuePair> &CandidatePairsSet, + std::vector<Value *> &PairableInsts, + DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, + DenseSet<ValuePair> &PairableInstUsers, + DenseMap<Value *, Value *> &ChosenPairs, + DenseMap<ValuePair, size_t> &DAG, ValuePair J); + + void findBestDAGFor( + DenseMap<Value *, std::vector<Value *> > &CandidatePairs, + DenseSet<ValuePair> &CandidatePairsSet, + DenseMap<ValuePair, int> &CandidatePairCostSavings, + std::vector<Value *> &PairableInsts, + DenseSet<ValuePair> &FixedOrderPairs, + DenseMap<VPPair, unsigned> &PairConnectionTypes, + DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, + DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps, + DenseSet<ValuePair> &PairableInstUsers, + DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap, + DenseSet<VPPair> &PairableInstUserPairSet, + DenseMap<Value *, Value *> &ChosenPairs, + DenseSet<ValuePair> &BestDAG, size_t &BestMaxDepth, + int &BestEffSize, Value *II, std::vector<Value *>&JJ, + bool UseCycleCheck); Value *getReplacementPointerInput(LLVMContext& Context, Instruction *I, Instruction *J, unsigned o); @@ -358,20 +366,22 @@ namespace { void collectPairLoadMoveSet(BasicBlock &BB, DenseMap<Value *, Value *> &ChosenPairs, - std::multimap<Value *, Value *> &LoadMoveSet, + DenseMap<Value *, std::vector<Value *> > &LoadMoveSet, + DenseSet<ValuePair> &LoadMoveSetPairs, Instruction *I); void collectLoadMoveSet(BasicBlock &BB, std::vector<Value *> &PairableInsts, DenseMap<Value *, Value *> &ChosenPairs, - std::multimap<Value *, Value *> &LoadMoveSet); + DenseMap<Value *, std::vector<Value *> > &LoadMoveSet, + DenseSet<ValuePair> &LoadMoveSetPairs); bool canMoveUsesOfIAfterJ(BasicBlock &BB, - std::multimap<Value *, Value *> &LoadMoveSet, + DenseSet<ValuePair> &LoadMoveSetPairs, Instruction *I, Instruction *J); void moveUsesOfIAfterJ(BasicBlock &BB, - std::multimap<Value *, Value *> &LoadMoveSet, + DenseSet<ValuePair> &LoadMoveSetPairs, Instruction *&InsertionPt, Instruction *I, Instruction *J); @@ -463,18 +473,18 @@ namespace { static inline void getInstructionTypes(Instruction *I, Type *&T1, Type *&T2) { - if (isa<StoreInst>(I)) { + if (StoreInst *SI = dyn_cast<StoreInst>(I)) { // For stores, it is the value type, not the pointer type that matters // because the value is what will come from a vector register. - Value *IVal = cast<StoreInst>(I)->getValueOperand(); + Value *IVal = SI->getValueOperand(); T1 = IVal->getType(); } else { T1 = I->getType(); } - if (I->isCast()) - T2 = cast<CastInst>(I)->getSrcTy(); + if (CastInst *CI = dyn_cast<CastInst>(I)) + T2 = CI->getSrcTy(); else T2 = T1; @@ -500,7 +510,7 @@ namespace { // InsertElement and ExtractElement have a depth factor of zero. 
This is // for two reasons: First, they cannot be usefully fused. Second, because // the pass generates a lot of these, they can confuse the simple metric - // used to compare the trees in the next iteration. Thus, giving them a + // used to compare the dags in the next iteration. Thus, giving them a // weight of zero allows the pass to essentially ignore them in // subsequent iterations when looking for vectorization opportunities // while still tracking dependency chains that flow through those @@ -661,19 +671,6 @@ namespace { } } - // Returns true if J is the second element in some pair referenced by - // some multimap pair iterator pair. - template <typename V> - bool isSecondInIteratorPair(V J, std::pair< - typename std::multimap<V, V>::iterator, - typename std::multimap<V, V>::iterator> PairRange) { - for (typename std::multimap<V, V>::iterator K = PairRange.first; - K != PairRange.second; ++K) - if (K->second == J) return true; - - return false; - } - bool isPureIEChain(InsertElementInst *IE) { InsertElementInst *IENext = IE; do { @@ -698,11 +695,12 @@ namespace { DenseMap<Value *, Value *> AllChosenPairs; DenseSet<ValuePair> AllFixedOrderPairs; DenseMap<VPPair, unsigned> AllPairConnectionTypes; - std::multimap<ValuePair, ValuePair> AllConnectedPairs, AllConnectedPairDeps; + DenseMap<ValuePair, std::vector<ValuePair> > AllConnectedPairs, + AllConnectedPairDeps; do { std::vector<Value *> PairableInsts; - std::multimap<Value *, Value *> CandidatePairs; + DenseMap<Value *, std::vector<Value *> > CandidatePairs; DenseSet<ValuePair> FixedOrderPairs; DenseMap<ValuePair, int> CandidatePairCostSavings; ShouldContinue = getCandidatePairs(BB, Start, CandidatePairs, @@ -711,6 +709,14 @@ namespace { PairableInsts, NonPow2Len); if (PairableInsts.empty()) continue; + // Build the candidate pair set for faster lookups. + DenseSet<ValuePair> CandidatePairsSet; + for (DenseMap<Value *, std::vector<Value *> >::iterator I = + CandidatePairs.begin(), E = CandidatePairs.end(); I != E; ++I) + for (std::vector<Value *>::iterator J = I->second.begin(), + JE = I->second.end(); J != JE; ++J) + CandidatePairsSet.insert(ValuePair(I->first, *J)); + // Now we have a map of all of the pairable instructions and we need to // select the best possible pairing. A good pairing is one such that the // users of the pair are also paired. This defines a (directed) forest @@ -720,30 +726,33 @@ namespace { // Note that it only matters that both members of the second pair use some // element of the first pair (to allow for splatting). 
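The CandidatePairsSet construction above trades a little memory for lookup speed: pair-membership queries become single set lookups instead of linear scans of a partner vector. A minimal standalone sketch of the same idea, with std::map/std::set standing in for LLVM's DenseMap/DenseSet and plain ints standing in for Value pointers (all names and values here are illustrative, not from the patch):

#include <cassert>
#include <map>
#include <set>
#include <utility>
#include <vector>

using Value = int;                        // stands in for llvm::Value *
using ValuePair = std::pair<Value, Value>;

int main() {
  // Mirrors DenseMap<Value *, std::vector<Value *> > CandidatePairs.
  std::map<Value, std::vector<Value>> CandidatePairs = {{1, {2, 3}}, {4, {5}}};

  // Flatten into a set so "is (I, J) a candidate pair?" is one lookup
  // instead of the multimap equal_range scan this patch removes.
  std::set<ValuePair> CandidatePairsSet;
  for (const auto &Entry : CandidatePairs)
    for (Value J : Entry.second)
      CandidatePairsSet.insert(ValuePair(Entry.first, J));

  assert(CandidatePairsSet.count(ValuePair(1, 3)));
  assert(!CandidatePairsSet.count(ValuePair(3, 1))); // pairs are ordered
  return 0;
}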
- std::multimap<ValuePair, ValuePair> ConnectedPairs, ConnectedPairDeps; + DenseMap<ValuePair, std::vector<ValuePair> > ConnectedPairs, + ConnectedPairDeps; DenseMap<VPPair, unsigned> PairConnectionTypes; - computeConnectedPairs(CandidatePairs, PairableInsts, ConnectedPairs, - PairConnectionTypes); + computeConnectedPairs(CandidatePairs, CandidatePairsSet, + PairableInsts, ConnectedPairs, PairConnectionTypes); if (ConnectedPairs.empty()) continue; - for (std::multimap<ValuePair, ValuePair>::iterator + for (DenseMap<ValuePair, std::vector<ValuePair> >::iterator I = ConnectedPairs.begin(), IE = ConnectedPairs.end(); - I != IE; ++I) { - ConnectedPairDeps.insert(VPPair(I->second, I->first)); - } + I != IE; ++I) + for (std::vector<ValuePair>::iterator J = I->second.begin(), + JE = I->second.end(); J != JE; ++J) + ConnectedPairDeps[*J].push_back(I->first); // Build the pairable-instruction dependency map DenseSet<ValuePair> PairableInstUsers; buildDepMap(BB, CandidatePairs, PairableInsts, PairableInstUsers); // There is now a graph of the connected pairs. For each variable, pick - // the pairing with the largest tree meeting the depth requirement on at - // least one branch. Then select all pairings that are part of that tree + // the pairing with the largest dag meeting the depth requirement on at + // least one branch. Then select all pairings that are part of that dag // and remove them from the list of available pairings and pairable // variables. DenseMap<Value *, Value *> ChosenPairs; - choosePairs(CandidatePairs, CandidatePairCostSavings, + choosePairs(CandidatePairs, CandidatePairsSet, + CandidatePairCostSavings, PairableInsts, FixedOrderPairs, PairConnectionTypes, ConnectedPairs, ConnectedPairDeps, PairableInstUsers, ChosenPairs); @@ -777,14 +786,15 @@ namespace { } } - for (std::multimap<ValuePair, ValuePair>::iterator + for (DenseMap<ValuePair, std::vector<ValuePair> >::iterator I = ConnectedPairs.begin(), IE = ConnectedPairs.end(); - I != IE; ++I) { - if (AllPairConnectionTypes.count(*I)) { - AllConnectedPairs.insert(*I); - AllConnectedPairDeps.insert(VPPair(I->second, I->first)); - } - } + I != IE; ++I) + for (std::vector<ValuePair>::iterator J = I->second.begin(), + JE = I->second.end(); J != JE; ++J) + if (AllPairConnectionTypes.count(VPPair(I->first, *J))) { + AllConnectedPairs[I->first].push_back(*J); + AllConnectedPairDeps[*J].push_back(I->first); + } } while (ShouldContinue); if (AllChosenPairs.empty()) return false; @@ -910,7 +920,7 @@ namespace { // This function returns true if the two provided instructions are compatible // (meaning that they can be fused into a vector instruction). This assumes // that I has already been determined to be vectorizable and that J is not - // in the use tree of I. + // in the use dag of I. 
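One note on the hunk above, before areInstsCompatible below: ConnectedPairDeps is now populated by inverting the ConnectedPairs adjacency map rather than by inserting flipped multimap entries. A minimal sketch of that inversion, assuming plain ints in place of ValuePairs:

#include <cstdio>
#include <map>
#include <vector>

int main() {
  // Edges mirror ConnectedPairs: root pair -> pairs connected to it.
  std::map<int, std::vector<int>> ConnectedPairs = {{0, {1, 2}}, {1, {2}}};
  std::map<int, std::vector<int>> ConnectedPairDeps;

  // For each edge Root -> Child, record the reverse edge Child -> Root.
  for (const auto &Entry : ConnectedPairs)
    for (int Child : Entry.second)
      ConnectedPairDeps[Child].push_back(Entry.first);

  for (const auto &Entry : ConnectedPairDeps)
    for (int Root : Entry.second)
      std::printf("pair %d feeds pair %d\n", Root, Entry.first);
  return 0;
}

Storing the reverse edges up front lets the cost-estimation code later in the pass ask "who feeds this pair" without rescanning the forward map.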
bool BBVectorize::areInstsCompatible(Instruction *I, Instruction *J, bool IsSimpleLoadStore, bool NonPow2Len, int &CostSavings, int &FixedOrder) { @@ -972,6 +982,11 @@ namespace { unsigned VCost = TTI->getMemoryOpCost(I->getOpcode(), VType, BottomAlignment, IAddressSpace); + + ICost += TTI->getAddressComputationCost(aTypeI); + JCost += TTI->getAddressComputationCost(aTypeJ); + VCost += TTI->getAddressComputationCost(VType); + if (VCost > ICost + JCost) return false; @@ -994,6 +1009,12 @@ namespace { unsigned JCost = getInstrCost(J->getOpcode(), JT1, JT2); Type *VT1 = getVecTypeForPair(IT1, JT1), *VT2 = getVecTypeForPair(IT2, JT2); + + // Note that this procedure is incorrect for insert and extract element + // instructions (because combining these often results in a shuffle), + // but this cost is ignored (because insert and extract element + // instructions are assigned a zero depth factor and are not really + // fused in general). unsigned VCost = getInstrCost(I->getOpcode(), VT1, VT2); if (VCost > ICost + JCost) @@ -1090,7 +1111,7 @@ namespace { // to contain any memory locations to which J writes. The function returns // true if J uses I. By default, alias analysis is used to determine // whether J reads from memory that overlaps with a location in WriteSet. - // If LoadMoveSet is not null, then it is a previously-computed multimap + // If LoadMoveSet is not null, then it is a previously-computed map // where the key is the memory-based user instruction and the value is // the instruction to be compared with I. So, if LoadMoveSet is provided, // then the alias analysis is not used. This is necessary because this @@ -1100,7 +1121,7 @@ namespace { bool BBVectorize::trackUsesOfI(DenseSet<Value *> &Users, AliasSetTracker &WriteSet, Instruction *I, Instruction *J, bool UpdateUsers, - std::multimap<Value *, Value *> *LoadMoveSet) { + DenseSet<ValuePair> *LoadMoveSetPairs) { bool UsesI = false; // This instruction may already be marked as a user due, for example, to @@ -1118,9 +1139,8 @@ namespace { } } if (!UsesI && J->mayReadFromMemory()) { - if (LoadMoveSet) { - VPIteratorPair JPairRange = LoadMoveSet->equal_range(J); - UsesI = isSecondInIteratorPair<Value*>(I, JPairRange); + if (LoadMoveSetPairs) { + UsesI = LoadMoveSetPairs->count(ValuePair(J, I)); } else { for (AliasSetTracker::iterator W = WriteSet.begin(), WE = WriteSet.end(); W != WE; ++W) { @@ -1144,10 +1164,11 @@ namespace { // basic block and collects all candidate pairs for vectorization. bool BBVectorize::getCandidatePairs(BasicBlock &BB, BasicBlock::iterator &Start, - std::multimap<Value *, Value *> &CandidatePairs, + DenseMap<Value *, std::vector<Value *> > &CandidatePairs, DenseSet<ValuePair> &FixedOrderPairs, DenseMap<ValuePair, int> &CandidatePairCostSavings, std::vector<Value *> &PairableInsts, bool NonPow2Len) { + size_t TotalPairs = 0; BasicBlock::iterator E = BB.end(); if (Start == E) return false; @@ -1193,7 +1214,8 @@ namespace { PairableInsts.push_back(I); } - CandidatePairs.insert(ValuePair(I, J)); + CandidatePairs[I].push_back(J); + ++TotalPairs; if (TTI) CandidatePairCostSavings.insert(ValuePairWithCost(ValuePair(I, J), CostSavings)); @@ -1217,7 +1239,8 @@ namespace { // If we have already found too many pairs, break here and this function // will be called again starting after the last instruction selected // during this invocation. 
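The getAddressComputationCost terms added earlier in this hunk fold address arithmetic into the comparison that already guarded load/store pairing: keep the pair only if the fused vector op is no more expensive than the two scalar ops it replaces. The arithmetic in isolation, with made-up costs where TTI would supply real ones:

#include <cstdio>

int main() {
  // Illustrative cost-model numbers; a real TTI would supply these.
  unsigned ICost = 1, JCost = 1;            // the two scalar memory ops
  unsigned VCost = 1;                       // the fused vector memory op
  unsigned IAddr = 0, JAddr = 0, VAddr = 1; // address computation costs

  ICost += IAddr;
  JCost += JAddr;
  VCost += VAddr;

  // Reject the pair when vectorizing is not at least cost-neutral.
  if (VCost > ICost + JCost)
    std::printf("reject pair (%u > %u)\n", VCost, ICost + JCost);
  else
    std::printf("keep pair, savings = %d\n",
                (int)(ICost + JCost) - (int)VCost);
  return 0;
}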
- if (PairableInsts.size() >= Config.MaxInsts) { + if (PairableInsts.size() >= Config.MaxInsts || + TotalPairs >= Config.MaxPairs) { ShouldContinue = true; break; } @@ -1237,11 +1260,12 @@ namespace { // it looks for pairs such that both members have an input which is an // output of PI or PJ. void BBVectorize::computePairsConnectedTo( - std::multimap<Value *, Value *> &CandidatePairs, - std::vector<Value *> &PairableInsts, - std::multimap<ValuePair, ValuePair> &ConnectedPairs, - DenseMap<VPPair, unsigned> &PairConnectionTypes, - ValuePair P) { + DenseMap<Value *, std::vector<Value *> > &CandidatePairs, + DenseSet<ValuePair> &CandidatePairsSet, + std::vector<Value *> &PairableInsts, + DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, + DenseMap<VPPair, unsigned> &PairConnectionTypes, + ValuePair P) { StoreInst *SI, *SJ; // For each possible pairing for this variable, look at the uses of @@ -1259,8 +1283,6 @@ namespace { continue; } - VPIteratorPair IPairRange = CandidatePairs.equal_range(*I); - // For each use of the first variable, look for uses of the second // variable... for (Value::use_iterator J = P.second->use_begin(), @@ -1269,19 +1291,17 @@ namespace { P.second == SJ->getPointerOperand()) continue; - VPIteratorPair JPairRange = CandidatePairs.equal_range(*J); - // Look for <I, J>: - if (isSecondInIteratorPair<Value*>(*J, IPairRange)) { + if (CandidatePairsSet.count(ValuePair(*I, *J))) { VPPair VP(P, ValuePair(*I, *J)); - ConnectedPairs.insert(VP); + ConnectedPairs[VP.first].push_back(VP.second); PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionDirect)); } // Look for <J, I>: - if (isSecondInIteratorPair<Value*>(*I, JPairRange)) { + if (CandidatePairsSet.count(ValuePair(*J, *I))) { VPPair VP(P, ValuePair(*J, *I)); - ConnectedPairs.insert(VP); + ConnectedPairs[VP.first].push_back(VP.second); PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSwap)); } } @@ -1294,9 +1314,9 @@ namespace { P.first == SJ->getPointerOperand()) continue; - if (isSecondInIteratorPair<Value*>(*J, IPairRange)) { + if (CandidatePairsSet.count(ValuePair(*I, *J))) { VPPair VP(P, ValuePair(*I, *J)); - ConnectedPairs.insert(VP); + ConnectedPairs[VP.first].push_back(VP.second); PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSplat)); } } @@ -1313,16 +1333,14 @@ namespace { P.second == SI->getPointerOperand()) continue; - VPIteratorPair IPairRange = CandidatePairs.equal_range(*I); - for (Value::use_iterator J = P.second->use_begin(); J != E; ++J) { if ((SJ = dyn_cast<StoreInst>(*J)) && P.second == SJ->getPointerOperand()) continue; - if (isSecondInIteratorPair<Value*>(*J, IPairRange)) { + if (CandidatePairsSet.count(ValuePair(*I, *J))) { VPPair VP(P, ValuePair(*I, *J)); - ConnectedPairs.insert(VP); + ConnectedPairs[VP.first].push_back(VP.second); PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSplat)); } } @@ -1333,55 +1351,73 @@ namespace { // connected if some output of the first pair forms an input to both members // of the second pair. 
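With the candidate set in hand, computePairsConnectedTo (rewritten in the hunks below) classifies a connection with two set lookups instead of multimap range scans. A toy version of the direct/swap distinction, assuming ints for values and a prefilled candidate set; a pair whose members both use the same member of P would instead be a splat connection:

#include <cstdio>
#include <set>
#include <utility>

using ValuePair = std::pair<int, int>;

int main() {
  std::set<ValuePair> CandidatePairsSet = {{10, 11}};
  int I = 10; // a user of P.first
  int J = 11; // a user of P.second

  // <I, J> paired as-is means the connection is direct; <J, I> means the
  // operands would arrive swapped.
  if (CandidatePairsSet.count(ValuePair(I, J)))
    std::printf("<%d, %d> is a direct connection\n", I, J);
  if (CandidatePairsSet.count(ValuePair(J, I)))
    std::printf("<%d, %d> is a swap connection\n", J, I);
  return 0;
}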
void BBVectorize::computeConnectedPairs( - std::multimap<Value *, Value *> &CandidatePairs, - std::vector<Value *> &PairableInsts, - std::multimap<ValuePair, ValuePair> &ConnectedPairs, - DenseMap<VPPair, unsigned> &PairConnectionTypes) { - + DenseMap<Value *, std::vector<Value *> > &CandidatePairs, + DenseSet<ValuePair> &CandidatePairsSet, + std::vector<Value *> &PairableInsts, + DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, + DenseMap<VPPair, unsigned> &PairConnectionTypes) { for (std::vector<Value *>::iterator PI = PairableInsts.begin(), PE = PairableInsts.end(); PI != PE; ++PI) { - VPIteratorPair choiceRange = CandidatePairs.equal_range(*PI); + DenseMap<Value *, std::vector<Value *> >::iterator PP = + CandidatePairs.find(*PI); + if (PP == CandidatePairs.end()) + continue; - for (std::multimap<Value *, Value *>::iterator P = choiceRange.first; - P != choiceRange.second; ++P) - computePairsConnectedTo(CandidatePairs, PairableInsts, - ConnectedPairs, PairConnectionTypes, *P); + for (std::vector<Value *>::iterator P = PP->second.begin(), + E = PP->second.end(); P != E; ++P) + computePairsConnectedTo(CandidatePairs, CandidatePairsSet, + PairableInsts, ConnectedPairs, + PairConnectionTypes, ValuePair(*PI, *P)); } - DEBUG(dbgs() << "BBV: found " << ConnectedPairs.size() + DEBUG(size_t TotalPairs = 0; + for (DenseMap<ValuePair, std::vector<ValuePair> >::iterator I = + ConnectedPairs.begin(), IE = ConnectedPairs.end(); I != IE; ++I) + TotalPairs += I->second.size(); + dbgs() << "BBV: found " << TotalPairs << " pair connections.\n"); } // This function builds a set of use tuples such that <A, B> is in the set - // if B is in the use tree of A. If B is in the use tree of A, then B + // if B is in the use dag of A. If B is in the use dag of A, then B // depends on the output of A. void BBVectorize::buildDepMap( BasicBlock &BB, - std::multimap<Value *, Value *> &CandidatePairs, + DenseMap<Value *, std::vector<Value *> > &CandidatePairs, std::vector<Value *> &PairableInsts, DenseSet<ValuePair> &PairableInstUsers) { DenseSet<Value *> IsInPair; - for (std::multimap<Value *, Value *>::iterator C = CandidatePairs.begin(), - E = CandidatePairs.end(); C != E; ++C) { + for (DenseMap<Value *, std::vector<Value *> >::iterator C = + CandidatePairs.begin(), E = CandidatePairs.end(); C != E; ++C) { IsInPair.insert(C->first); - IsInPair.insert(C->second); + IsInPair.insert(C->second.begin(), C->second.end()); } - // Iterate through the basic block, recording all Users of each + // Iterate through the basic block, recording all users of each // pairable instruction. - BasicBlock::iterator E = BB.end(); + BasicBlock::iterator E = BB.end(), EL = + BasicBlock::iterator(cast<Instruction>(PairableInsts.back())); for (BasicBlock::iterator I = BB.getFirstInsertionPt(); I != E; ++I) { if (IsInPair.find(I) == IsInPair.end()) continue; DenseSet<Value *> Users; AliasSetTracker WriteSet(*AA); - for (BasicBlock::iterator J = llvm::next(I); J != E; ++J) + for (BasicBlock::iterator J = llvm::next(I); J != E; ++J) { (void) trackUsesOfI(Users, WriteSet, I, J); + if (J == EL) + break; + } + for (DenseSet<Value *>::iterator U = Users.begin(), E = Users.end(); - U != E; ++U) + U != E; ++U) { + if (IsInPair.find(*U) == IsInPair.end()) continue; PairableInstUsers.insert(ValuePair(I, *U)); + } + + if (I == EL) + break; } } @@ -1389,8 +1425,9 @@ namespace { // input of pair Q is an output of pair P. If this is the case, then these // two pairs cannot be simultaneously fused. 
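Before the definition of pairsConflict below, here is the conflict test it implements, reduced to standard containers (ints stand in for ValuePair members; the example data is invented):

#include <cstdio>
#include <set>
#include <utility>

using ValuePair = std::pair<int, int>;

// True when some member of Q is a user of some member of P; Users holds
// <A, B> whenever B is in the use dag of A.
static bool usesPair(const std::set<ValuePair> &Users,
                     const ValuePair &P, const ValuePair &Q) {
  return Users.count(ValuePair(P.first, Q.first)) ||
         Users.count(ValuePair(P.first, Q.second)) ||
         Users.count(ValuePair(P.second, Q.first)) ||
         Users.count(ValuePair(P.second, Q.second));
}

int main() {
  std::set<ValuePair> PairableInstUsers = {{1, 3}, {4, 2}};
  ValuePair P(1, 2), Q(3, 4);
  bool QUsesP = usesPair(PairableInstUsers, P, Q);
  bool PUsesQ = usesPair(PairableInstUsers, Q, P);
  // Mutual use means fusing both pairs would create a dependency cycle.
  std::printf("conflict: %s\n", (QUsesP && PUsesQ) ? "yes" : "no");
  return 0;
}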
bool BBVectorize::pairsConflict(ValuePair P, ValuePair Q, - DenseSet<ValuePair> &PairableInstUsers, - std::multimap<ValuePair, ValuePair> *PairableInstUserMap) { + DenseSet<ValuePair> &PairableInstUsers, + DenseMap<ValuePair, std::vector<ValuePair> > *PairableInstUserMap, + DenseSet<VPPair> *PairableInstUserPairSet) { // Two pairs are in conflict if they are mutual Users of eachother. bool QUsesP = PairableInstUsers.count(ValuePair(P.first, Q.first)) || PairableInstUsers.count(ValuePair(P.first, Q.second)) || @@ -1403,17 +1440,14 @@ namespace { if (PairableInstUserMap) { // FIXME: The expensive part of the cycle check is not so much the cycle // check itself but this edge insertion procedure. This needs some - // profiling and probably a different data structure (same is true of - // most uses of std::multimap). + // profiling and probably a different data structure. if (PUsesQ) { - VPPIteratorPair QPairRange = PairableInstUserMap->equal_range(Q); - if (!isSecondInIteratorPair(P, QPairRange)) - PairableInstUserMap->insert(VPPair(Q, P)); + if (PairableInstUserPairSet->insert(VPPair(Q, P)).second) + (*PairableInstUserMap)[Q].push_back(P); } if (QUsesP) { - VPPIteratorPair PPairRange = PairableInstUserMap->equal_range(P); - if (!isSecondInIteratorPair(Q, PPairRange)) - PairableInstUserMap->insert(VPPair(P, Q)); + if (PairableInstUserPairSet->insert(VPPair(P, Q)).second) + (*PairableInstUserMap)[P].push_back(Q); } } @@ -1423,8 +1457,8 @@ namespace { // This function walks the use graph of current pairs to see if, starting // from P, the walk returns to P. bool BBVectorize::pairWillFormCycle(ValuePair P, - std::multimap<ValuePair, ValuePair> &PairableInstUserMap, - DenseSet<ValuePair> &CurrentPairs) { + DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap, + DenseSet<ValuePair> &CurrentPairs) { DEBUG(if (DebugCycleCheck) dbgs() << "BBV: starting cycle check for : " << *P.first << " <-> " << *P.second << "\n"); @@ -1441,36 +1475,41 @@ namespace { DEBUG(if (DebugCycleCheck) dbgs() << "BBV: cycle check visiting: " << *QTop.first << " <-> " << *QTop.second << "\n"); - VPPIteratorPair QPairRange = PairableInstUserMap.equal_range(QTop); - for (std::multimap<ValuePair, ValuePair>::iterator C = QPairRange.first; - C != QPairRange.second; ++C) { - if (C->second == P) { + DenseMap<ValuePair, std::vector<ValuePair> >::iterator QQ = + PairableInstUserMap.find(QTop); + if (QQ == PairableInstUserMap.end()) + continue; + + for (std::vector<ValuePair>::iterator C = QQ->second.begin(), + CE = QQ->second.end(); C != CE; ++C) { + if (*C == P) { DEBUG(dbgs() << "BBV: rejected to prevent non-trivial cycle formation: " - << *C->first.first << " <-> " << *C->first.second << "\n"); + << QTop.first << " <-> " << C->second << "\n"); return true; } - if (CurrentPairs.count(C->second) && !Visited.count(C->second)) - Q.push_back(C->second); + if (CurrentPairs.count(*C) && !Visited.count(*C)) + Q.push_back(*C); } } while (!Q.empty()); return false; } - // This function builds the initial tree of connected pairs with the + // This function builds the initial dag of connected pairs with the // pair J at the root. - void BBVectorize::buildInitialTreeFor( - std::multimap<Value *, Value *> &CandidatePairs, - std::vector<Value *> &PairableInsts, - std::multimap<ValuePair, ValuePair> &ConnectedPairs, - DenseSet<ValuePair> &PairableInstUsers, - DenseMap<Value *, Value *> &ChosenPairs, - DenseMap<ValuePair, size_t> &Tree, ValuePair J) { - // Each of these pairs is viewed as the root node of a Tree. 
The Tree + void BBVectorize::buildInitialDAGFor( + DenseMap<Value *, std::vector<Value *> > &CandidatePairs, + DenseSet<ValuePair> &CandidatePairsSet, + std::vector<Value *> &PairableInsts, + DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, + DenseSet<ValuePair> &PairableInstUsers, + DenseMap<Value *, Value *> &ChosenPairs, + DenseMap<ValuePair, size_t> &DAG, ValuePair J) { + // Each of these pairs is viewed as the root node of a DAG. The DAG // is then walked (depth-first). As this happens, we keep track of - // the pairs that compose the Tree and the maximum depth of the Tree. + // the pairs that compose the DAG and the maximum depth of the DAG. SmallVector<ValuePairWithDepth, 32> Q; // General depth-first post-order traversal: Q.push_back(ValuePairWithDepth(J, getDepthFactor(J.first))); @@ -1480,69 +1519,65 @@ namespace { // Push each child onto the queue: bool MoreChildren = false; size_t MaxChildDepth = QTop.second; - VPPIteratorPair qtRange = ConnectedPairs.equal_range(QTop.first); - for (std::multimap<ValuePair, ValuePair>::iterator k = qtRange.first; - k != qtRange.second; ++k) { - // Make sure that this child pair is still a candidate: - bool IsStillCand = false; - VPIteratorPair checkRange = - CandidatePairs.equal_range(k->second.first); - for (std::multimap<Value *, Value *>::iterator m = checkRange.first; - m != checkRange.second; ++m) { - if (m->second == k->second.second) { - IsStillCand = true; - break; - } - } - - if (IsStillCand) { - DenseMap<ValuePair, size_t>::iterator C = Tree.find(k->second); - if (C == Tree.end()) { - size_t d = getDepthFactor(k->second.first); - Q.push_back(ValuePairWithDepth(k->second, QTop.second+d)); - MoreChildren = true; - } else { - MaxChildDepth = std::max(MaxChildDepth, C->second); + DenseMap<ValuePair, std::vector<ValuePair> >::iterator QQ = + ConnectedPairs.find(QTop.first); + if (QQ != ConnectedPairs.end()) + for (std::vector<ValuePair>::iterator k = QQ->second.begin(), + ke = QQ->second.end(); k != ke; ++k) { + // Make sure that this child pair is still a candidate: + if (CandidatePairsSet.count(*k)) { + DenseMap<ValuePair, size_t>::iterator C = DAG.find(*k); + if (C == DAG.end()) { + size_t d = getDepthFactor(k->first); + Q.push_back(ValuePairWithDepth(*k, QTop.second+d)); + MoreChildren = true; + } else { + MaxChildDepth = std::max(MaxChildDepth, C->second); + } } } - } if (!MoreChildren) { - // Record the current pair as part of the Tree: - Tree.insert(ValuePairWithDepth(QTop.first, MaxChildDepth)); + // Record the current pair as part of the DAG: + DAG.insert(ValuePairWithDepth(QTop.first, MaxChildDepth)); Q.pop_back(); } } while (!Q.empty()); } - // Given some initial tree, prune it by removing conflicting pairs (pairs + // Given some initial dag, prune it by removing conflicting pairs (pairs // that cannot be simultaneously chosen for vectorization). 
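The explicit-stack walk in buildInitialDAGFor above is a general depth-first post-order traversal that records, for each node, the maximum depth reachable through it; the pruning function follows below. The same control flow on a four-node toy DAG (unit depth factors, invented edges, ints in place of ValuePairs):

#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <map>
#include <utility>
#include <vector>

int main() {
  std::map<int, std::vector<int>> Children = {{0, {1, 2}}, {1, {3}}};
  std::map<int, std::size_t> DAG; // node -> max depth through that node

  std::vector<std::pair<int, std::size_t>> Q;
  Q.push_back(std::make_pair(0, std::size_t(1))); // root, its own depth
  while (!Q.empty()) {
    std::pair<int, std::size_t> Top = Q.back();
    bool MoreChildren = false;
    std::size_t MaxChildDepth = Top.second;
    for (int K : Children[Top.first]) {
      auto C = DAG.find(K);
      if (C == DAG.end()) {             // child not finished yet: descend
        Q.push_back(std::make_pair(K, Top.second + 1));
        MoreChildren = true;
      } else {
        MaxChildDepth = std::max(MaxChildDepth, C->second);
      }
    }
    if (!MoreChildren) {                // post-order: children done first
      DAG[Top.first] = MaxChildDepth;
      Q.pop_back();
    }
  }
  std::printf("max depth through root: %zu\n", DAG[0]);
  return 0;
}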
- void BBVectorize::pruneTreeFor( - std::multimap<Value *, Value *> &CandidatePairs, - std::vector<Value *> &PairableInsts, - std::multimap<ValuePair, ValuePair> &ConnectedPairs, - DenseSet<ValuePair> &PairableInstUsers, - std::multimap<ValuePair, ValuePair> &PairableInstUserMap, - DenseMap<Value *, Value *> &ChosenPairs, - DenseMap<ValuePair, size_t> &Tree, - DenseSet<ValuePair> &PrunedTree, ValuePair J, - bool UseCycleCheck) { + void BBVectorize::pruneDAGFor( + DenseMap<Value *, std::vector<Value *> > &CandidatePairs, + std::vector<Value *> &PairableInsts, + DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, + DenseSet<ValuePair> &PairableInstUsers, + DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap, + DenseSet<VPPair> &PairableInstUserPairSet, + DenseMap<Value *, Value *> &ChosenPairs, + DenseMap<ValuePair, size_t> &DAG, + DenseSet<ValuePair> &PrunedDAG, ValuePair J, + bool UseCycleCheck) { SmallVector<ValuePairWithDepth, 32> Q; // General depth-first post-order traversal: Q.push_back(ValuePairWithDepth(J, getDepthFactor(J.first))); do { ValuePairWithDepth QTop = Q.pop_back_val(); - PrunedTree.insert(QTop.first); + PrunedDAG.insert(QTop.first); // Visit each child, pruning as necessary... SmallVector<ValuePairWithDepth, 8> BestChildren; - VPPIteratorPair QTopRange = ConnectedPairs.equal_range(QTop.first); - for (std::multimap<ValuePair, ValuePair>::iterator K = QTopRange.first; - K != QTopRange.second; ++K) { - DenseMap<ValuePair, size_t>::iterator C = Tree.find(K->second); - if (C == Tree.end()) continue; + DenseMap<ValuePair, std::vector<ValuePair> >::iterator QQ = + ConnectedPairs.find(QTop.first); + if (QQ == ConnectedPairs.end()) + continue; - // This child is in the Tree, now we need to make sure it is the + for (std::vector<ValuePair>::iterator K = QQ->second.begin(), + KE = QQ->second.end(); K != KE; ++K) { + DenseMap<ValuePair, size_t>::iterator C = DAG.find(*K); + if (C == DAG.end()) continue; + + // This child is in the DAG, now we need to make sure it is the // best of any conflicting children. There could be multiple // conflicting children, so first, determine if we're keeping // this child, then delete conflicting children as necessary. @@ -1556,7 +1591,7 @@ namespace { // fusing (a,b) we have y .. a/b .. x where y is an input // to a/b and x is an output to a/b: x and y can no longer // be legally fused. To prevent this condition, we must - // make sure that a child pair added to the Tree is not + // make sure that a child pair added to the DAG is not // both an input and output of an already-selected pair. // Pairing-induced dependencies can also form from more complicated @@ -1575,7 +1610,8 @@ namespace { C2->first.second == C->first.first || C2->first.second == C->first.second || pairsConflict(C2->first, C->first, PairableInstUsers, - UseCycleCheck ? &PairableInstUserMap : 0)) { + UseCycleCheck ? &PairableInstUserMap : 0, + UseCycleCheck ? &PairableInstUserPairSet : 0)) { if (C2->second >= C->second) { CanAdd = false; break; @@ -1587,15 +1623,16 @@ namespace { if (!CanAdd) continue; // Even worse, this child could conflict with another node already - // selected for the Tree. If that is the case, ignore this child. - for (DenseSet<ValuePair>::iterator T = PrunedTree.begin(), - E2 = PrunedTree.end(); T != E2; ++T) { + // selected for the DAG. If that is the case, ignore this child. 
+ for (DenseSet<ValuePair>::iterator T = PrunedDAG.begin(), + E2 = PrunedDAG.end(); T != E2; ++T) { if (T->first == C->first.first || T->first == C->first.second || T->second == C->first.first || T->second == C->first.second || pairsConflict(*T, C->first, PairableInstUsers, - UseCycleCheck ? &PairableInstUserMap : 0)) { + UseCycleCheck ? &PairableInstUserMap : 0, + UseCycleCheck ? &PairableInstUserPairSet : 0)) { CanAdd = false; break; } } @@ -1612,7 +1649,8 @@ namespace { C2->first.second == C->first.first || C2->first.second == C->first.second || pairsConflict(C2->first, C->first, PairableInstUsers, - UseCycleCheck ? &PairableInstUserMap : 0)) { + UseCycleCheck ? &PairableInstUserMap : 0, + UseCycleCheck ? &PairableInstUserPairSet : 0)) { CanAdd = false; break; } @@ -1627,7 +1665,8 @@ namespace { ChosenPairs.begin(), E2 = ChosenPairs.end(); C2 != E2; ++C2) { if (pairsConflict(*C2, C->first, PairableInstUsers, - UseCycleCheck ? &PairableInstUserMap : 0)) { + UseCycleCheck ? &PairableInstUserMap : 0, + UseCycleCheck ? &PairableInstUserPairSet : 0)) { CanAdd = false; break; } @@ -1639,7 +1678,7 @@ namespace { // To check for non-trivial cycles formed by the addition of the // current pair we've formed a list of all relevant pairs, now use a // graph walk to check for a cycle. We start from the current pair and - // walk the use tree to see if we again reach the current pair. If we + // walk the use dag to see if we again reach the current pair. If we // do, then the current pair is rejected. // FIXME: It may be more efficient to use a topological-ordering @@ -1676,34 +1715,40 @@ } while (!Q.empty()); } - // This function finds the best tree of mutually-compatible connected + // This function finds the best dag of mutually-compatible connected // pairs, given the choice of root pairs as an iterator range.
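The graph walk described in the comment above (and implemented in pairWillFormCycle earlier in this hunk) is a plain worklist search that reports a cycle if it ever returns to the starting pair; findBestDAGFor, which uses it, follows below. A self-contained version, with ints for pairs and an invented three-node cycle:

#include <cstdio>
#include <map>
#include <set>
#include <vector>

int main() {
  // Edges mirror PairableInstUserMap: pair -> pairs that use it.
  std::map<int, std::vector<int>> UserMap = {{0, {1}}, {1, {2}}, {2, {0}}};
  std::set<int> CurrentPairs = {0, 1, 2}; // pairs selected so far, plus P
  int P = 0;

  std::set<int> Visited;
  std::vector<int> Q(1, P);
  bool Cycle = false;
  while (!Q.empty() && !Cycle) {
    int Top = Q.back();
    Q.pop_back();
    Visited.insert(Top);
    for (int C : UserMap[Top]) {
      if (C == P) { Cycle = true; break; } // the walk returned to P
      if (CurrentPairs.count(C) && !Visited.count(C))
        Q.push_back(C);                    // only walk selected pairs
    }
  }
  std::printf("would form a cycle: %s\n", Cycle ? "yes" : "no");
  return 0;
}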
- void BBVectorize::findBestTreeFor( - std::multimap<Value *, Value *> &CandidatePairs, - DenseMap<ValuePair, int> &CandidatePairCostSavings, - std::vector<Value *> &PairableInsts, - DenseSet<ValuePair> &FixedOrderPairs, - DenseMap<VPPair, unsigned> &PairConnectionTypes, - std::multimap<ValuePair, ValuePair> &ConnectedPairs, - std::multimap<ValuePair, ValuePair> &ConnectedPairDeps, - DenseSet<ValuePair> &PairableInstUsers, - std::multimap<ValuePair, ValuePair> &PairableInstUserMap, - DenseMap<Value *, Value *> &ChosenPairs, - DenseSet<ValuePair> &BestTree, size_t &BestMaxDepth, - int &BestEffSize, VPIteratorPair ChoiceRange, - bool UseCycleCheck) { - for (std::multimap<Value *, Value *>::iterator J = ChoiceRange.first; - J != ChoiceRange.second; ++J) { + void BBVectorize::findBestDAGFor( + DenseMap<Value *, std::vector<Value *> > &CandidatePairs, + DenseSet<ValuePair> &CandidatePairsSet, + DenseMap<ValuePair, int> &CandidatePairCostSavings, + std::vector<Value *> &PairableInsts, + DenseSet<ValuePair> &FixedOrderPairs, + DenseMap<VPPair, unsigned> &PairConnectionTypes, + DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, + DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps, + DenseSet<ValuePair> &PairableInstUsers, + DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap, + DenseSet<VPPair> &PairableInstUserPairSet, + DenseMap<Value *, Value *> &ChosenPairs, + DenseSet<ValuePair> &BestDAG, size_t &BestMaxDepth, + int &BestEffSize, Value *II, std::vector<Value *>&JJ, + bool UseCycleCheck) { + for (std::vector<Value *>::iterator J = JJ.begin(), JE = JJ.end(); + J != JE; ++J) { + ValuePair IJ(II, *J); + if (!CandidatePairsSet.count(IJ)) + continue; // Before going any further, make sure that this pair does not // conflict with any already-selected pairs (see comment below - // near the Tree pruning for more details). + // near the DAG pruning for more details). DenseSet<ValuePair> ChosenPairSet; bool DoesConflict = false; for (DenseMap<Value *, Value *>::iterator C = ChosenPairs.begin(), E = ChosenPairs.end(); C != E; ++C) { - if (pairsConflict(*C, *J, PairableInstUsers, - UseCycleCheck ? &PairableInstUserMap : 0)) { + if (pairsConflict(*C, IJ, PairableInstUsers, + UseCycleCheck ? &PairableInstUserMap : 0, + UseCycleCheck ? &PairableInstUserPairSet : 0)) { DoesConflict = true; break; } @@ -1713,40 +1758,42 @@ namespace { if (DoesConflict) continue; if (UseCycleCheck && - pairWillFormCycle(*J, PairableInstUserMap, ChosenPairSet)) + pairWillFormCycle(IJ, PairableInstUserMap, ChosenPairSet)) continue; - DenseMap<ValuePair, size_t> Tree; - buildInitialTreeFor(CandidatePairs, PairableInsts, ConnectedPairs, - PairableInstUsers, ChosenPairs, Tree, *J); + DenseMap<ValuePair, size_t> DAG; + buildInitialDAGFor(CandidatePairs, CandidatePairsSet, + PairableInsts, ConnectedPairs, + PairableInstUsers, ChosenPairs, DAG, IJ); // Because we'll keep the child with the largest depth, the largest - // depth is still the same in the unpruned Tree. - size_t MaxDepth = Tree.lookup(*J); + // depth is still the same in the unpruned DAG. 
+ size_t MaxDepth = DAG.lookup(IJ); - DEBUG(if (DebugPairSelection) dbgs() << "BBV: found Tree for pair {" - << *J->first << " <-> " << *J->second << "} of depth " << - MaxDepth << " and size " << Tree.size() << "\n"); + DEBUG(if (DebugPairSelection) dbgs() << "BBV: found DAG for pair {" + << IJ.first << " <-> " << IJ.second << "} of depth " << + MaxDepth << " and size " << DAG.size() << "\n"); - // At this point the Tree has been constructed, but, may contain + // At this point the DAG has been constructed, but, may contain // contradictory children (meaning that different children of - // some tree node may be attempting to fuse the same instruction). - // So now we walk the tree again, in the case of a conflict, + // some dag node may be attempting to fuse the same instruction). + // So now we walk the dag again, in the case of a conflict, // keep only the child with the largest depth. To break a tie, // favor the first child. - DenseSet<ValuePair> PrunedTree; - pruneTreeFor(CandidatePairs, PairableInsts, ConnectedPairs, - PairableInstUsers, PairableInstUserMap, ChosenPairs, Tree, - PrunedTree, *J, UseCycleCheck); + DenseSet<ValuePair> PrunedDAG; + pruneDAGFor(CandidatePairs, PairableInsts, ConnectedPairs, + PairableInstUsers, PairableInstUserMap, + PairableInstUserPairSet, + ChosenPairs, DAG, PrunedDAG, IJ, UseCycleCheck); int EffSize = 0; if (TTI) { - DenseSet<Value *> PrunedTreeInstrs; - for (DenseSet<ValuePair>::iterator S = PrunedTree.begin(), - E = PrunedTree.end(); S != E; ++S) { - PrunedTreeInstrs.insert(S->first); - PrunedTreeInstrs.insert(S->second); + DenseSet<Value *> PrunedDAGInstrs; + for (DenseSet<ValuePair>::iterator S = PrunedDAG.begin(), + E = PrunedDAG.end(); S != E; ++S) { + PrunedDAGInstrs.insert(S->first); + PrunedDAGInstrs.insert(S->second); } // The set of pairs that have already contributed to the total cost. @@ -1759,8 +1806,8 @@ namespace { // The node weights represent the cost savings associated with // fusing the pair of instructions. - for (DenseSet<ValuePair>::iterator S = PrunedTree.begin(), - E = PrunedTree.end(); S != E; ++S) { + for (DenseSet<ValuePair>::iterator S = PrunedDAG.begin(), + E = PrunedDAG.end(); S != E; ++S) { if (!isa<ShuffleVectorInst>(S->first) && !isa<InsertElementInst>(S->first) && !isa<ExtractElementInst>(S->first)) @@ -1778,15 +1825,17 @@ namespace { // The edge weights contribute in a negative sense: they represent // the cost of shuffles. 
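The node-weight/edge-weight bookkeeping described above boils down to signed accumulation: per-pair cost savings add, shuffle costs for mismatched connections subtract, and the DAG survives only if the net is positive. With invented numbers:

#include <cstdio>

int main() {
  // Illustrative: savings per fused pair (node weights) and shuffle
  // costs per flipped or splatted connection (edge weights).
  int NodeSavings[] = {2, 3, 1};
  int ShuffleCosts[] = {1, 2};

  int EffSize = 0;
  for (int S : NodeSavings)
    EffSize += S;   // node weights contribute positively
  for (int C : ShuffleCosts)
    EffSize -= C;   // edge weights contribute negatively
  std::printf("effective size %d: %s\n", EffSize,
              EffSize > 0 ? "worth vectorizing" : "skip");
  return 0;
}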
- VPPIteratorPair IP = ConnectedPairDeps.equal_range(*S); - if (IP.first != ConnectedPairDeps.end()) { + DenseMap<ValuePair, std::vector<ValuePair> >::iterator SS = + ConnectedPairDeps.find(*S); + if (SS != ConnectedPairDeps.end()) { unsigned NumDepsDirect = 0, NumDepsSwap = 0; - for (std::multimap<ValuePair, ValuePair>::iterator Q = IP.first; - Q != IP.second; ++Q) { - if (!PrunedTree.count(Q->second)) + for (std::vector<ValuePair>::iterator T = SS->second.begin(), + TE = SS->second.end(); T != TE; ++T) { + VPPair Q(*S, *T); + if (!PrunedDAG.count(Q.second)) continue; DenseMap<VPPair, unsigned>::iterator R = - PairConnectionTypes.find(VPPair(Q->second, Q->first)); + PairConnectionTypes.find(VPPair(Q.second, Q.first)); assert(R != PairConnectionTypes.end() && "Cannot find pair connection type"); if (R->second == PairConnectionDirect) @@ -1802,24 +1851,35 @@ namespace { ((NumDepsSwap > NumDepsDirect) || FixedOrderPairs.count(ValuePair(S->second, S->first))); - for (std::multimap<ValuePair, ValuePair>::iterator Q = IP.first; - Q != IP.second; ++Q) { - if (!PrunedTree.count(Q->second)) + for (std::vector<ValuePair>::iterator T = SS->second.begin(), + TE = SS->second.end(); T != TE; ++T) { + VPPair Q(*S, *T); + if (!PrunedDAG.count(Q.second)) continue; DenseMap<VPPair, unsigned>::iterator R = - PairConnectionTypes.find(VPPair(Q->second, Q->first)); + PairConnectionTypes.find(VPPair(Q.second, Q.first)); assert(R != PairConnectionTypes.end() && "Cannot find pair connection type"); - Type *Ty1 = Q->second.first->getType(), - *Ty2 = Q->second.second->getType(); + Type *Ty1 = Q.second.first->getType(), + *Ty2 = Q.second.second->getType(); Type *VTy = getVecTypeForPair(Ty1, Ty2); if ((R->second == PairConnectionDirect && FlipOrder) || (R->second == PairConnectionSwap && !FlipOrder) || R->second == PairConnectionSplat) { int ESContrib = (int) getInstrCost(Instruction::ShuffleVector, VTy, VTy); + + if (VTy->getVectorNumElements() == 2) { + if (R->second == PairConnectionSplat) + ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost( + TargetTransformInfo::SK_Broadcast, VTy)); + else + ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost( + TargetTransformInfo::SK_Reverse, VTy)); + } + DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" << - *Q->second.first << " <-> " << *Q->second.second << + *Q.second.first << " <-> " << *Q.second.second << "} -> {" << *S->first << " <-> " << *S->second << "} = " << ESContrib << "\n"); @@ -1846,7 +1906,7 @@ namespace { } if (isa<ExtractElementInst>(*I)) continue; - if (PrunedTreeInstrs.count(*I)) + if (PrunedDAGInstrs.count(*I)) continue; NeedsExtraction = true; break; @@ -1854,10 +1914,12 @@ namespace { if (NeedsExtraction) { int ESContrib; - if (Ty1->isVectorTy()) + if (Ty1->isVectorTy()) { ESContrib = (int) getInstrCost(Instruction::ShuffleVector, Ty1, VTy); - else + ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost( + TargetTransformInfo::SK_ExtractSubvector, VTy, 0, Ty1)); + } else ESContrib = (int) TTI->getVectorInstrCost( Instruction::ExtractElement, VTy, 0); @@ -1876,7 +1938,7 @@ namespace { } if (isa<ExtractElementInst>(*I)) continue; - if (PrunedTreeInstrs.count(*I)) + if (PrunedDAGInstrs.count(*I)) continue; NeedsExtraction = true; break; @@ -1884,10 +1946,13 @@ namespace { if (NeedsExtraction) { int ESContrib; - if (Ty2->isVectorTy()) + if (Ty2->isVectorTy()) { ESContrib = (int) getInstrCost(Instruction::ShuffleVector, Ty2, VTy); - else + ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost( + 
TargetTransformInfo::SK_ExtractSubvector, VTy, + Ty1->isVectorTy() ? Ty1->getVectorNumElements() : 1, Ty2)); + } else ESContrib = (int) TTI->getVectorInstrCost( Instruction::ExtractElement, VTy, 1); DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" << @@ -1915,7 +1980,7 @@ namespace { ValuePair VPR = ValuePair(O2, O1); // Internal edges are not handled here. - if (PrunedTree.count(VP) || PrunedTree.count(VPR)) + if (PrunedDAG.count(VP) || PrunedDAG.count(VPR)) continue; Type *Ty1 = O1->getType(), @@ -1963,6 +2028,10 @@ namespace { } else if (IncomingPairs.count(VPR)) { ESContrib = (int) getInstrCost(Instruction::ShuffleVector, VTy, VTy); + + if (VTy->getVectorNumElements() == 2) + ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost( + TargetTransformInfo::SK_Reverse, VTy)); } else if (!Ty1->isVectorTy() && !Ty2->isVectorTy()) { ESContrib = (int) TTI->getVectorInstrCost( Instruction::InsertElement, VTy, 0); @@ -2005,27 +2074,27 @@ namespace { if (!HasNontrivialInsts) { DEBUG(if (DebugPairSelection) dbgs() << - "\tNo non-trivial instructions in tree;" + "\tNo non-trivial instructions in DAG;" " override to zero effective size\n"); EffSize = 0; } } else { - for (DenseSet<ValuePair>::iterator S = PrunedTree.begin(), - E = PrunedTree.end(); S != E; ++S) + for (DenseSet<ValuePair>::iterator S = PrunedDAG.begin(), + E = PrunedDAG.end(); S != E; ++S) EffSize += (int) getDepthFactor(S->first); } DEBUG(if (DebugPairSelection) - dbgs() << "BBV: found pruned Tree for pair {" - << *J->first << " <-> " << *J->second << "} of depth " << - MaxDepth << " and size " << PrunedTree.size() << + dbgs() << "BBV: found pruned DAG for pair {" + << IJ.first << " <-> " << IJ.second << "} of depth " << + MaxDepth << " and size " << PrunedDAG.size() << " (effective size: " << EffSize << ")\n"); if (((TTI && !UseChainDepthWithTI) || MaxDepth >= Config.ReqChainDepth) && EffSize > 0 && EffSize > BestEffSize) { BestMaxDepth = MaxDepth; BestEffSize = EffSize; - BestTree = PrunedTree; + BestDAG = PrunedDAG; } } } @@ -2033,66 +2102,98 @@ namespace { // Given the list of candidate pairs, this function selects those // that will be fused into vector instructions. 
void BBVectorize::choosePairs( - std::multimap<Value *, Value *> &CandidatePairs, - DenseMap<ValuePair, int> &CandidatePairCostSavings, - std::vector<Value *> &PairableInsts, - DenseSet<ValuePair> &FixedOrderPairs, - DenseMap<VPPair, unsigned> &PairConnectionTypes, - std::multimap<ValuePair, ValuePair> &ConnectedPairs, - std::multimap<ValuePair, ValuePair> &ConnectedPairDeps, - DenseSet<ValuePair> &PairableInstUsers, - DenseMap<Value *, Value *>& ChosenPairs) { + DenseMap<Value *, std::vector<Value *> > &CandidatePairs, + DenseSet<ValuePair> &CandidatePairsSet, + DenseMap<ValuePair, int> &CandidatePairCostSavings, + std::vector<Value *> &PairableInsts, + DenseSet<ValuePair> &FixedOrderPairs, + DenseMap<VPPair, unsigned> &PairConnectionTypes, + DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, + DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps, + DenseSet<ValuePair> &PairableInstUsers, + DenseMap<Value *, Value *>& ChosenPairs) { bool UseCycleCheck = - CandidatePairs.size() <= Config.MaxCandPairsForCycleCheck; - std::multimap<ValuePair, ValuePair> PairableInstUserMap; + CandidatePairsSet.size() <= Config.MaxCandPairsForCycleCheck; + + DenseMap<Value *, std::vector<Value *> > CandidatePairs2; + for (DenseSet<ValuePair>::iterator I = CandidatePairsSet.begin(), + E = CandidatePairsSet.end(); I != E; ++I) { + std::vector<Value *> &JJ = CandidatePairs2[I->second]; + if (JJ.empty()) JJ.reserve(32); + JJ.push_back(I->first); + } + + DenseMap<ValuePair, std::vector<ValuePair> > PairableInstUserMap; + DenseSet<VPPair> PairableInstUserPairSet; for (std::vector<Value *>::iterator I = PairableInsts.begin(), E = PairableInsts.end(); I != E; ++I) { // The number of possible pairings for this variable: - size_t NumChoices = CandidatePairs.count(*I); + size_t NumChoices = CandidatePairs.lookup(*I).size(); if (!NumChoices) continue; - VPIteratorPair ChoiceRange = CandidatePairs.equal_range(*I); + std::vector<Value *> &JJ = CandidatePairs[*I]; - // The best pair to choose and its tree: + // The best pair to choose and its dag: size_t BestMaxDepth = 0; int BestEffSize = 0; - DenseSet<ValuePair> BestTree; - findBestTreeFor(CandidatePairs, CandidatePairCostSavings, + DenseSet<ValuePair> BestDAG; + findBestDAGFor(CandidatePairs, CandidatePairsSet, + CandidatePairCostSavings, PairableInsts, FixedOrderPairs, PairConnectionTypes, ConnectedPairs, ConnectedPairDeps, - PairableInstUsers, PairableInstUserMap, ChosenPairs, - BestTree, BestMaxDepth, BestEffSize, ChoiceRange, + PairableInstUsers, PairableInstUserMap, + PairableInstUserPairSet, ChosenPairs, + BestDAG, BestMaxDepth, BestEffSize, *I, JJ, UseCycleCheck); - // A tree has been chosen (or not) at this point. If no tree was + if (BestDAG.empty()) + continue; + + // A dag has been chosen (or not) at this point. If no dag was // chosen, then this instruction, I, cannot be paired (and is no longer // considered). - DEBUG(if (BestTree.size() > 0) - dbgs() << "BBV: selected pairs in the best tree for: " - << *cast<Instruction>(*I) << "\n"); + DEBUG(dbgs() << "BBV: selected pairs in the best DAG for: " + << *cast<Instruction>(*I) << "\n"); - for (DenseSet<ValuePair>::iterator S = BestTree.begin(), - SE2 = BestTree.end(); S != SE2; ++S) { - // Insert the members of this tree into the list of chosen pairs. + for (DenseSet<ValuePair>::iterator S = BestDAG.begin(), + SE2 = BestDAG.end(); S != SE2; ++S) { + // Insert the members of this dag into the list of chosen pairs. 
ChosenPairs.insert(ValuePair(S->first, S->second)); DEBUG(dbgs() << "BBV: selected pair: " << *S->first << " <-> " << *S->second << "\n"); - // Remove all candidate pairs that have values in the chosen tree. - for (std::multimap<Value *, Value *>::iterator K = - CandidatePairs.begin(); K != CandidatePairs.end();) { - if (K->first == S->first || K->second == S->first || - K->second == S->second || K->first == S->second) { - // Don't remove the actual pair chosen so that it can be used - // in subsequent tree selections. - if (!(K->first == S->first && K->second == S->second)) - CandidatePairs.erase(K++); - else - ++K; - } else { - ++K; - } + // Remove all candidate pairs that have values in the chosen dag. + std::vector<Value *> &KK = CandidatePairs[S->first]; + for (std::vector<Value *>::iterator K = KK.begin(), KE = KK.end(); + K != KE; ++K) { + if (*K == S->second) + continue; + + CandidatePairsSet.erase(ValuePair(S->first, *K)); + } + + std::vector<Value *> &LL = CandidatePairs2[S->second]; + for (std::vector<Value *>::iterator L = LL.begin(), LE = LL.end(); + L != LE; ++L) { + if (*L == S->first) + continue; + + CandidatePairsSet.erase(ValuePair(*L, S->second)); + } + + std::vector<Value *> &MM = CandidatePairs[S->second]; + for (std::vector<Value *>::iterator M = MM.begin(), ME = MM.end(); + M != ME; ++M) { + assert(*M != S->first && "Flipped pair in candidate list?"); + CandidatePairsSet.erase(ValuePair(S->second, *M)); + } + + std::vector<Value *> &NN = CandidatePairs2[S->first]; + for (std::vector<Value *>::iterator N = NN.begin(), NE = NN.end(); + N != NE; ++N) { + assert(*N != S->second && "Flipped pair in candidate list?"); + CandidatePairsSet.erase(ValuePair(*N, S->first)); } } } @@ -2696,7 +2797,7 @@ namespace { // Move all uses of the function I (including pairing-induced uses) after J. bool BBVectorize::canMoveUsesOfIAfterJ(BasicBlock &BB, - std::multimap<Value *, Value *> &LoadMoveSet, + DenseSet<ValuePair> &LoadMoveSetPairs, Instruction *I, Instruction *J) { // Skip to the first instruction past I. BasicBlock::iterator L = llvm::next(BasicBlock::iterator(I)); @@ -2704,18 +2805,18 @@ namespace { DenseSet<Value *> Users; AliasSetTracker WriteSet(*AA); for (; cast<Instruction>(L) != J; ++L) - (void) trackUsesOfI(Users, WriteSet, I, L, true, &LoadMoveSet); + (void) trackUsesOfI(Users, WriteSet, I, L, true, &LoadMoveSetPairs); assert(cast<Instruction>(L) == J && "Tracking has not proceeded far enough to check for dependencies"); // If J is now in the use set of I, then trackUsesOfI will return true // and we have a dependency cycle (and the fusing operation must abort). - return !trackUsesOfI(Users, WriteSet, I, J, true, &LoadMoveSet); + return !trackUsesOfI(Users, WriteSet, I, J, true, &LoadMoveSetPairs); } // Move all uses of the function I (including pairing-induced uses) after J. void BBVectorize::moveUsesOfIAfterJ(BasicBlock &BB, - std::multimap<Value *, Value *> &LoadMoveSet, + DenseSet<ValuePair> &LoadMoveSetPairs, Instruction *&InsertionPt, Instruction *I, Instruction *J) { // Skip to the first instruction past I. @@ -2724,7 +2825,7 @@ namespace { DenseSet<Value *> Users; AliasSetTracker WriteSet(*AA); for (; cast<Instruction>(L) != J;) { - if (trackUsesOfI(Users, WriteSet, I, L, true, &LoadMoveSet)) { + if (trackUsesOfI(Users, WriteSet, I, L, true, &LoadMoveSetPairs)) { // Move this instruction Instruction *InstToMove = L; ++L; @@ -2744,7 +2845,8 @@ namespace { // to be moved after J (the second instruction) when the pair is fused. 
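The four erasure loops in the choosePairs hunk above implement one rule: once (A, B) is chosen, every other candidate pair touching A or B, in either position, is dropped, while (A, B) itself is kept for later DAG selections. The rule in miniature (invented pairs; collectPairLoadMoveSet, which the comment above introduces, follows below):

#include <cstdio>
#include <set>
#include <utility>

using ValuePair = std::pair<int, int>;

int main() {
  std::set<ValuePair> Cands = {{1, 2}, {1, 3}, {4, 2}, {5, 6}};
  ValuePair S(1, 2); // the pair just chosen

  for (auto It = Cands.begin(); It != Cands.end();) {
    bool Touches = It->first == S.first || It->first == S.second ||
                   It->second == S.first || It->second == S.second;
    if (Touches && *It != S)
      It = Cands.erase(It); // conflicts with the chosen pair
    else
      ++It;
  }
  for (const ValuePair &P : Cands)
    std::printf("(%d, %d) still a candidate\n", P.first, P.second);
  return 0;
}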
void BBVectorize::collectPairLoadMoveSet(BasicBlock &BB, DenseMap<Value *, Value *> &ChosenPairs, - std::multimap<Value *, Value *> &LoadMoveSet, + DenseMap<Value *, std::vector<Value *> > &LoadMoveSet, + DenseSet<ValuePair> &LoadMoveSetPairs, Instruction *I) { // Skip to the first instruction past I. BasicBlock::iterator L = llvm::next(BasicBlock::iterator(I)); @@ -2757,8 +2859,10 @@ namespace { // could be before I if this is an inverted input. for (BasicBlock::iterator E = BB.end(); cast<Instruction>(L) != E; ++L) { if (trackUsesOfI(Users, WriteSet, I, L)) { - if (L->mayReadFromMemory()) - LoadMoveSet.insert(ValuePair(L, I)); + if (L->mayReadFromMemory()) { + LoadMoveSet[L].push_back(I); + LoadMoveSetPairs.insert(ValuePair(L, I)); + } } } } @@ -2767,20 +2871,22 @@ namespace { // are chosen for vectorization, we can end up in a situation where the // aliasing analysis starts returning different query results as the // process of fusing instruction pairs continues. Because the algorithm - // relies on finding the same use trees here as were found earlier, we'll + // relies on finding the same use dags here as were found earlier, we'll // need to precompute the necessary aliasing information here and then // manually update it during the fusion process. void BBVectorize::collectLoadMoveSet(BasicBlock &BB, std::vector<Value *> &PairableInsts, DenseMap<Value *, Value *> &ChosenPairs, - std::multimap<Value *, Value *> &LoadMoveSet) { + DenseMap<Value *, std::vector<Value *> > &LoadMoveSet, + DenseSet<ValuePair> &LoadMoveSetPairs) { for (std::vector<Value *>::iterator PI = PairableInsts.begin(), PIE = PairableInsts.end(); PI != PIE; ++PI) { DenseMap<Value *, Value *>::iterator P = ChosenPairs.find(*PI); if (P == ChosenPairs.end()) continue; Instruction *I = cast<Instruction>(P->first); - collectPairLoadMoveSet(BB, ChosenPairs, LoadMoveSet, I); + collectPairLoadMoveSet(BB, ChosenPairs, LoadMoveSet, + LoadMoveSetPairs, I); } } @@ -2816,12 +2922,12 @@ namespace { // because the vector instruction is inserted in the location of the pair's // second member). 
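Before fuseChosenPairs below, note that collectPairLoadMoveSet above now maintains the load-move information twice: a map of vectors for enumeration and a flat pair set for the constant-time membership tests in trackUsesOfI. A minimal mirror of that dual bookkeeping (ints and the record helper are illustrative):

#include <cassert>
#include <map>
#include <set>
#include <utility>
#include <vector>

int main() {
  std::map<int, std::vector<int>> LoadMoveSet;    // L -> {I, ...}
  std::set<std::pair<int, int>> LoadMoveSetPairs; // {(L, I), ...}

  auto record = [&](int L, int I) {
    LoadMoveSet[L].push_back(I);                   // for enumeration
    LoadMoveSetPairs.insert(std::make_pair(L, I)); // for fast lookup
  };

  record(10, 1);
  record(10, 2);
  assert(LoadMoveSetPairs.count(std::make_pair(10, 1)));
  assert(LoadMoveSet[10].size() == 2);
  return 0;
}

The duplication trades a little memory for removing the linear partner scans the multimap version needed.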
void BBVectorize::fuseChosenPairs(BasicBlock &BB, - std::vector<Value *> &PairableInsts, - DenseMap<Value *, Value *> &ChosenPairs, - DenseSet<ValuePair> &FixedOrderPairs, - DenseMap<VPPair, unsigned> &PairConnectionTypes, - std::multimap<ValuePair, ValuePair> &ConnectedPairs, - std::multimap<ValuePair, ValuePair> &ConnectedPairDeps) { + std::vector<Value *> &PairableInsts, + DenseMap<Value *, Value *> &ChosenPairs, + DenseSet<ValuePair> &FixedOrderPairs, + DenseMap<VPPair, unsigned> &PairConnectionTypes, + DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, + DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps) { LLVMContext& Context = BB.getContext(); // During the vectorization process, the order of the pairs to be fused @@ -2835,8 +2941,10 @@ namespace { E = FlippedPairs.end(); P != E; ++P) ChosenPairs.insert(*P); - std::multimap<Value *, Value *> LoadMoveSet; - collectLoadMoveSet(BB, PairableInsts, ChosenPairs, LoadMoveSet); + DenseMap<Value *, std::vector<Value *> > LoadMoveSet; + DenseSet<ValuePair> LoadMoveSetPairs; + collectLoadMoveSet(BB, PairableInsts, ChosenPairs, + LoadMoveSet, LoadMoveSetPairs); DEBUG(dbgs() << "BBV: initial: \n" << BB << "\n"); @@ -2868,7 +2976,7 @@ namespace { ChosenPairs.erase(FP); ChosenPairs.erase(P); - if (!canMoveUsesOfIAfterJ(BB, LoadMoveSet, I, J)) { + if (!canMoveUsesOfIAfterJ(BB, LoadMoveSetPairs, I, J)) { DEBUG(dbgs() << "BBV: fusion of: " << *I << " <-> " << *J << " aborted because of non-trivial dependency cycle\n"); @@ -2885,18 +2993,20 @@ namespace { // of dependencies connected via swaps, and those directly connected, // and flip the order if the number of swaps is greater. bool OrigOrder = true; - VPPIteratorPair IP = ConnectedPairDeps.equal_range(ValuePair(I, J)); - if (IP.first == ConnectedPairDeps.end()) { - IP = ConnectedPairDeps.equal_range(ValuePair(J, I)); + DenseMap<ValuePair, std::vector<ValuePair> >::iterator IJ = + ConnectedPairDeps.find(ValuePair(I, J)); + if (IJ == ConnectedPairDeps.end()) { + IJ = ConnectedPairDeps.find(ValuePair(J, I)); OrigOrder = false; } - if (IP.first != ConnectedPairDeps.end()) { + if (IJ != ConnectedPairDeps.end()) { unsigned NumDepsDirect = 0, NumDepsSwap = 0; - for (std::multimap<ValuePair, ValuePair>::iterator Q = IP.first; - Q != IP.second; ++Q) { + for (std::vector<ValuePair>::iterator T = IJ->second.begin(), + TE = IJ->second.end(); T != TE; ++T) { + VPPair Q(IJ->first, *T); DenseMap<VPPair, unsigned>::iterator R = - PairConnectionTypes.find(VPPair(Q->second, Q->first)); + PairConnectionTypes.find(VPPair(Q.second, Q.first)); assert(R != PairConnectionTypes.end() && "Cannot find pair connection type"); if (R->second == PairConnectionDirect) @@ -2922,17 +3032,20 @@ namespace { // If the pair being fused uses the opposite order from that in the pair // connection map, then we need to flip the types. 
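The dependency counting above decides the fused operand order by majority vote; the type flipping that a reversed order requires follows in the next hunk. The decision in isolation, with invented tallies:

#include <cstdio>

int main() {
  // Illustrative counts of incoming connections for the pair (I, J).
  unsigned NumDepsDirect = 1, NumDepsSwap = 3;
  bool FixedOrderSwapped = false; // would come from FixedOrderPairs

  bool FlipOrder = NumDepsSwap > NumDepsDirect || FixedOrderSwapped;
  std::printf("fuse as %s\n", FlipOrder ? "<J, I>" : "<I, J>");
  return 0;
}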
- VPPIteratorPair IP = ConnectedPairs.equal_range(ValuePair(H, L)); - for (std::multimap<ValuePair, ValuePair>::iterator Q = IP.first; - Q != IP.second; ++Q) { - DenseMap<VPPair, unsigned>::iterator R = PairConnectionTypes.find(*Q); - assert(R != PairConnectionTypes.end() && - "Cannot find pair connection type"); - if (R->second == PairConnectionDirect) - R->second = PairConnectionSwap; - else if (R->second == PairConnectionSwap) - R->second = PairConnectionDirect; - } + DenseMap<ValuePair, std::vector<ValuePair> >::iterator HL = + ConnectedPairs.find(ValuePair(H, L)); + if (HL != ConnectedPairs.end()) + for (std::vector<ValuePair>::iterator T = HL->second.begin(), + TE = HL->second.end(); T != TE; ++T) { + VPPair Q(HL->first, *T); + DenseMap<VPPair, unsigned>::iterator R = PairConnectionTypes.find(Q); + assert(R != PairConnectionTypes.end() && + "Cannot find pair connection type"); + if (R->second == PairConnectionDirect) + R->second = PairConnectionSwap; + else if (R->second == PairConnectionSwap) + R->second = PairConnectionDirect; + } bool LBeforeH = !FlipPairOrder; unsigned NumOperands = I->getNumOperands(); @@ -2964,12 +3077,12 @@ namespace { Instruction *K1 = 0, *K2 = 0; replaceOutputsOfPair(Context, L, H, K, InsertionPt, K1, K2); - // The use tree of the first original instruction must be moved to after - // the location of the second instruction. The entire use tree of the - // first instruction is disjoint from the input tree of the second + // The use dag of the first original instruction must be moved to after + // the location of the second instruction. The entire use dag of the + // first instruction is disjoint from the input dag of the second // (by definition), and so commutes with it. - moveUsesOfIAfterJ(BB, LoadMoveSet, InsertionPt, I, J); + moveUsesOfIAfterJ(BB, LoadMoveSetPairs, InsertionPt, I, J); if (!isa<StoreInst>(I)) { L->replaceAllUsesWith(K1); @@ -2986,17 +3099,23 @@ namespace { // yet-to-be-fused pair. The loads in question are the keys of the map. if (I->mayReadFromMemory()) { std::vector<ValuePair> NewSetMembers; - VPIteratorPair IPairRange = LoadMoveSet.equal_range(I); - VPIteratorPair JPairRange = LoadMoveSet.equal_range(J); - for (std::multimap<Value *, Value *>::iterator N = IPairRange.first; - N != IPairRange.second; ++N) - NewSetMembers.push_back(ValuePair(K, N->second)); - for (std::multimap<Value *, Value *>::iterator N = JPairRange.first; - N != JPairRange.second; ++N) - NewSetMembers.push_back(ValuePair(K, N->second)); + DenseMap<Value *, std::vector<Value *> >::iterator II = + LoadMoveSet.find(I); + if (II != LoadMoveSet.end()) + for (std::vector<Value *>::iterator N = II->second.begin(), + NE = II->second.end(); N != NE; ++N) + NewSetMembers.push_back(ValuePair(K, *N)); + DenseMap<Value *, std::vector<Value *> >::iterator JJ = + LoadMoveSet.find(J); + if (JJ != LoadMoveSet.end()) + for (std::vector<Value *>::iterator N = JJ->second.begin(), + NE = JJ->second.end(); N != NE; ++N) + NewSetMembers.push_back(ValuePair(K, *N)); for (std::vector<ValuePair>::iterator A = NewSetMembers.begin(), - AE = NewSetMembers.end(); A != AE; ++A) - LoadMoveSet.insert(*A); + AE = NewSetMembers.end(); A != AE; ++A) { + LoadMoveSet[A->first].push_back(A->second); + LoadMoveSetPairs.insert(*A); + } } // Before removing I, set the iterator to the next instruction. 
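When the fused order is reversed, the hunk above toggles every recorded connection type. The toggle by itself, using an enum shaped like the pass's PairConnection values (the names are copied for readability; the flip function is a sketch, not the pass's code):

#include <cstdio>

enum PairConnection { PairConnectionDirect, PairConnectionSwap,
                      PairConnectionSplat };

// Direct and swap trade places when the fused order is reversed;
// splat connections do not depend on operand order.
static PairConnection flip(PairConnection C) {
  if (C == PairConnectionDirect) return PairConnectionSwap;
  if (C == PairConnectionSwap)   return PairConnectionDirect;
  return C;
}

int main() {
  std::printf("%d %d %d\n", flip(PairConnectionDirect),
              flip(PairConnectionSwap), flip(PairConnectionSplat));
  return 0;
}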
@@ -3056,6 +3175,7 @@ VectorizeConfig::VectorizeConfig() { MaxCandPairsForCycleCheck = ::MaxCandPairsForCycleCheck; SplatBreaksChain = ::SplatBreaksChain; MaxInsts = ::MaxInsts; + MaxPairs = ::MaxPairs; MaxIter = ::MaxIter; Pow2LenOnly = ::Pow2LenOnly; NoMemOpBoost = ::NoMemOpBoost; diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 9c82cb8..f489393 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -9,10 +9,10 @@ // // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops // and generates target-independent LLVM-IR. Legalization of the IR is done -// in the codegen. However, the vectorizes uses (will use) the codegen +// in the codegen. However, the vectorizer uses (will use) the codegen // interfaces to generate IR that is likely to result in an optimal binary. // -// The loop vectorizer combines consecutive loop iteration into a single +// The loop vectorizer combines consecutive loop iterations into a single // 'wide' iteration. After this transformation the index is incremented // by the SIMD vector width, and not by one. // @@ -32,7 +32,7 @@ // D. Nuzman and R. Henderson. Multi-platform Auto-vectorization. // // Variable uniformity checks are inspired by: -// Karrenberg, R. and Hack, S. Whole Function Vectorization. +// Karrenberg, R. and Hack, S. Whole Function Vectorization. // // Other ideas/concepts are from: // A. Zaks and D. Nuzman. Autovectorization in GCC-two years later. @@ -101,24 +101,20 @@ EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden, cl::desc("Enable if-conversion during vectorization.")); /// We don't vectorize loops with a known constant trip count below this number. -static const unsigned TinyTripCountVectorThreshold = 16; +static cl::opt<unsigned> +TinyTripCountVectorThreshold("vectorizer-min-trip-count", cl::init(16), + cl::Hidden, + cl::desc("Don't vectorize loops with a constant " + "trip count that is smaller than this " + "value.")); /// We don't unroll loops with a known constant trip count below this number. static const unsigned TinyTripCountUnrollThreshold = 128; -/// We don't unroll loops that are larget than this threshold. -static const unsigned MaxLoopSizeThreshold = 32; - /// When performing a runtime memory check, do not check more than this /// number of pointers. Notice that the check is quadratic! static const unsigned RuntimeMemoryCheckThreshold = 4; -/// This is the highest vector width that we try to generate. -static const unsigned MaxVectorSize = 8; - -/// This is the highest Unroll Factor. -static const unsigned MaxUnrollSize = 4; - namespace { // Forward declarations. @@ -169,8 +165,8 @@ private: /// Add code that checks at runtime if the accessed arrays overlap. /// Returns the comparator value or NULL if no check is needed. - Value *addRuntimeCheck(LoopVectorizationLegality *Legal, - Instruction *Loc); + Instruction *addRuntimeCheck(LoopVectorizationLegality *Legal, + Instruction *Loc); /// Create an empty loop, based on the loop ranges of the old loop. void createEmptyLoop(LoopVectorizationLegality *Legal); /// Copy and widen the instructions from the old loop. @@ -196,6 +192,10 @@ private: /// of scalars. void scalarizeInstruction(Instruction *Instr); + /// Vectorize Load and Store instructions, + void vectorizeMemoryInstruction(Instruction *Instr, + LoopVectorizationLegality *Legal); + /// Create a broadcast instruction. 
This method generates a broadcast /// instruction (shuffle) for loop invariant values and for the induction /// value. If this is the induction variable then we extend it to N, N+1, ... @@ -228,31 +228,34 @@ private: ValueMap(unsigned UnrollFactor) : UF(UnrollFactor) {} /// \return True if 'Key' is saved in the Value Map. - bool has(Value *Key) { return MapStoreage.count(Key); } + bool has(Value *Key) const { return MapStorage.count(Key); } /// Initializes a new entry in the map. Sets all of the vector parts to the /// save value in 'Val'. /// \return A reference to a vector with splat values. VectorParts &splat(Value *Key, Value *Val) { - MapStoreage[Key].clear(); - MapStoreage[Key].append(UF, Val); - return MapStoreage[Key]; + VectorParts &Entry = MapStorage[Key]; + Entry.assign(UF, Val); + return Entry; } ///\return A reference to the value that is stored at 'Key'. VectorParts &get(Value *Key) { - if (!has(Key)) - MapStoreage[Key].resize(UF); - return MapStoreage[Key]; + VectorParts &Entry = MapStorage[Key]; + if (Entry.empty()) + Entry.resize(UF); + assert(Entry.size() == UF); + return Entry; } + private: /// The unroll factor. Each entry in the map stores this number of vector /// elements. unsigned UF; /// Map storage. We use std::map and not DenseMap because insertions to a /// dense map invalidates its iterators. - std::map<Value*, VectorParts> MapStoreage; + std::map<Value *, VectorParts> MapStorage; }; /// The original loop. @@ -289,8 +292,8 @@ private: BasicBlock *LoopVectorBody; ///The scalar loop body. BasicBlock *LoopScalarBody; - ///The first bypass block. - BasicBlock *LoopBypassBlock; + /// A list of all bypass blocks. The first block is the entry of the loop. + SmallVector<BasicBlock *, 4> LoopBypassBlocks; /// The new Induction variable which was added to the new block. PHINode *Induction; @@ -316,8 +319,9 @@ private: class LoopVectorizationLegality { public: LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, DataLayout *DL, - DominatorTree *DT) - : TheLoop(L), SE(SE), DL(DL), DT(DT), Induction(0) {} + DominatorTree *DT, TargetTransformInfo* TTI, + AliasAnalysis* AA) + : TheLoop(L), SE(SE), DL(DL), DT(DT), TTI(TTI), AA(AA), Induction(0) {} /// This enum represents the kinds of reductions that we support. enum ReductionKind { @@ -336,7 +340,8 @@ public: IK_NoInduction, ///< Not an induction variable. IK_IntInduction, ///< Integer induction variable. Step = 1. IK_ReverseIntInduction, ///< Reverse int induction variable. Step = -1. - IK_PtrInduction ///< Pointer induction variable. Step = sizeof(elem). + IK_PtrInduction, ///< Pointer induction var. Step = sizeof(elem). + IK_ReversePtrInduction ///< Reverse ptr indvar. Step = - sizeof(elem). }; /// This POD struct holds information about reduction variables. @@ -400,6 +405,11 @@ public: /// induction descriptor. typedef MapVector<PHINode*, InductionInfo> InductionList; + /// Alias(Multi)Map stores the values (GEPs or underlying objects and their + /// respective Store/Load instruction(s) to calculate aliasing. + typedef DenseMap<Value*, Instruction* > AliasMap; + typedef DenseMap<Value*, std::vector<Instruction*> > AliasMultiMap; + /// Returns true if it is legal to vectorize this loop. /// This does not mean that it is profitable to vectorize this /// loop, only that it is legal to do so. @@ -473,6 +483,14 @@ private: InductionKind isInductionVariable(PHINode *Phi); /// Return true if can compute the address bounds of Ptr within the loop. 
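The ValueMap rewrite above (MapStoreage renamed to MapStorage) also changes splat() to reuse a single lookup and get() to lazily size entries. A hedged standalone sketch of that container, with Value as an opaque stand-in and UnrollValueMap a made-up name:

#include <cassert>
#include <map>
#include <vector>

struct Value; // opaque stand-in for llvm::Value

class UnrollValueMap {
  unsigned UF; // unroll factor: number of widened parts per scalar value
  // std::map, not a hashed map: insertion must not invalidate references
  // handed out by earlier splat()/get() calls.
  std::map<Value *, std::vector<Value *> > Storage;

public:
  explicit UnrollValueMap(unsigned UnrollFactor) : UF(UnrollFactor) {}

  // Point all UF parts at the same broadcast value.
  std::vector<Value *> &splat(Value *Key, Value *Val) {
    std::vector<Value *> &Entry = Storage[Key];
    Entry.assign(UF, Val);
    return Entry;
  }

  // Fetch (and lazily size) the parts stored for Key.
  std::vector<Value *> &get(Value *Key) {
    std::vector<Value *> &Entry = Storage[Key];
    if (Entry.empty())
      Entry.resize(UF);
    assert(Entry.size() == UF && "inconsistent unroll factor");
    return Entry;
  }
};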
bool hasComputableBounds(Value *Ptr); + /// Return true if there is the chance of write reorder. + bool hasPossibleGlobalWriteReorder(Value *Object, + Instruction *Inst, + AliasMultiMap &WriteObjects, + unsigned MaxByteWidth); + /// Return the AA location for a load or a store. + AliasAnalysis::Location getLoadStoreLocation(Instruction *Inst); + /// The loop that we evaluate. Loop *TheLoop; @@ -480,8 +498,12 @@ private: ScalarEvolution *SE; /// DataLayout analysis. DataLayout *DL; - // Dominators. + /// Dominators. DominatorTree *DT; + /// Target Info. + TargetTransformInfo *TTI; + /// Alias Analysis. + AliasAnalysis *AA; // --- vectorization state --- // @@ -517,20 +539,34 @@ class LoopVectorizationCostModel { public: LoopVectorizationCostModel(Loop *L, ScalarEvolution *SE, LoopInfo *LI, LoopVectorizationLegality *Legal, - const TargetTransformInfo &TTI) - : TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI) {} - - /// \return The most profitable vectorization factor. + const TargetTransformInfo &TTI, + DataLayout *DL) + : TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI), DL(DL) {} + + /// Information about vectorization costs + struct VectorizationFactor { + unsigned Width; // Vector width with best cost + unsigned Cost; // Cost of the loop with that width + }; + /// \return The most profitable vectorization factor and the cost of that VF. /// This method checks every power of two up to VF. If UserVF is not ZERO /// then this vectorization factor will be selected if vectorization is /// possible. - unsigned selectVectorizationFactor(bool OptForSize, unsigned UserVF); + VectorizationFactor selectVectorizationFactor(bool OptForSize, + unsigned UserVF); + /// \return The size (in bits) of the widest type in the code that + /// needs to be vectorized. We ignore values that remain scalar such as + /// 64 bit loop indices. + unsigned getWidestType(); /// \return The most profitable unroll factor. /// If UserUF is non-zero then this method finds the best unroll-factor /// based on register pressure and other parameters. - unsigned selectUnrollFactor(bool OptForSize, unsigned UserUF); + /// VF and LoopCost are the selected vectorization factor and the cost of the + /// selected VF. + unsigned selectUnrollFactor(bool OptForSize, unsigned UserUF, unsigned VF, + unsigned LoopCost); /// \brief A struct that represents some properties of the register usage /// of a loop. @@ -562,6 +598,10 @@ private: /// the scalar type. static Type* ToVectorTy(Type *Scalar, unsigned VF); + /// Returns whether the instruction is a load or store and will be a emitted + /// as a vector operation. + bool isConsecutiveLoadOrStore(Instruction *I); + /// The loop that we evaluate. Loop *TheLoop; /// Scev analysis. @@ -572,6 +612,8 @@ private: LoopVectorizationLegality *Legal; /// Vector target information. const TargetTransformInfo &TTI; + /// Target data layout information. + DataLayout *DL; }; /// The LoopVectorize Pass. @@ -588,6 +630,7 @@ struct LoopVectorize : public LoopPass { LoopInfo *LI; TargetTransformInfo *TTI; DominatorTree *DT; + AliasAnalysis *AA; virtual bool runOnLoop(Loop *L, LPPassManager &LPM) { // We only vectorize innermost loops. @@ -599,21 +642,22 @@ struct LoopVectorize : public LoopPass { LI = &getAnalysis<LoopInfo>(); TTI = &getAnalysis<TargetTransformInfo>(); DT = &getAnalysis<DominatorTree>(); + AA = getAnalysisIfAvailable<AliasAnalysis>(); DEBUG(dbgs() << "LV: Checking a loop in \"" << L->getHeader()->getParent()->getName() << "\"\n"); // Check if it is legal to vectorize the loop. 
- LoopVectorizationLegality LVL(L, SE, DL, DT); + LoopVectorizationLegality LVL(L, SE, DL, DT, TTI, AA); if (!LVL.canVectorize()) { DEBUG(dbgs() << "LV: Not vectorizing.\n"); return false; } // Use the cost model. - LoopVectorizationCostModel CM(L, SE, LI, &LVL, *TTI); + LoopVectorizationCostModel CM(L, SE, LI, &LVL, *TTI, DL); - // Check the function attribues to find out if this function should be + // Check the function attributes to find out if this function should be // optimized for size. Function *F = L->getHeader()->getParent(); Attribute::AttrKind SzAttr = Attribute::OptimizeForSize; @@ -628,20 +672,24 @@ struct LoopVectorize : public LoopPass { return false; } - unsigned VF = CM.selectVectorizationFactor(OptForSize, VectorizationFactor); - unsigned UF = CM.selectUnrollFactor(OptForSize, VectorizationUnroll); + // Select the optimal vectorization factor. + LoopVectorizationCostModel::VectorizationFactor VF; + VF = CM.selectVectorizationFactor(OptForSize, VectorizationFactor); + // Select the unroll factor. + unsigned UF = CM.selectUnrollFactor(OptForSize, VectorizationUnroll, + VF.Width, VF.Cost); - if (VF == 1) { + if (VF.Width == 1) { DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); return false; } - DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF << ") in "<< + DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF.Width << ") in "<< F->getParent()->getModuleIdentifier()<<"\n"); DEBUG(dbgs() << "LV: Unroll Factor is " << UF << "\n"); - // If we decided that it is *legal* to vectorizer the loop then do it. - InnerLoopVectorizer LB(L, SE, LI, DT, DL, VF, UF); + // If we decided that it is *legal* to vectorize the loop then do it. + InnerLoopVectorizer LB(L, SE, LI, DT, DL, VF.Width, UF); LB.vectorize(&LVL); DEBUG(verifyFunction(*L->getHeader()->getParent())); @@ -730,6 +778,9 @@ Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val, unsigned StartIdx, int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { assert(Ptr->getType()->isPointerTy() && "Unexpected non ptr"); + // Make sure that the pointer does not point to structs. + if (cast<PointerType>(Ptr->getType())->getElementType()->isAggregateType()) + return 0; // If this value is a pointer induction variable we know it is consecutive. PHINode *Phi = dyn_cast_or_null<PHINode>(Ptr); @@ -737,6 +788,8 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { InductionInfo II = Inductions[Phi]; if (IK_PtrInduction == II.IK) return 1; + else if (IK_ReversePtrInduction == II.IK) + return -1; } GetElementPtrInst *Gep = dyn_cast_or_null<GetElementPtrInst>(Ptr); @@ -746,6 +799,29 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { unsigned NumOperands = Gep->getNumOperands(); Value *LastIndex = Gep->getOperand(NumOperands - 1); + Value *GpPtr = Gep->getPointerOperand(); + // If this GEP value is a consecutive pointer induction variable and all of + // the indices are constant then we know it is consecutive. We can + Phi = dyn_cast<PHINode>(GpPtr); + if (Phi && Inductions.count(Phi)) { + + // Make sure that the pointer does not point to structs. + PointerType *GepPtrType = cast<PointerType>(GpPtr->getType()); + if (GepPtrType->getElementType()->isAggregateType()) + return 0; + + // Make sure that all of the index operands are loop invariant. 
+ for (unsigned i = 1; i < NumOperands; ++i) + if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop)) + return 0; + + InductionInfo II = Inductions[Phi]; + if (IK_PtrInduction == II.IK) + return 1; + else if (IK_ReversePtrInduction == II.IK) + return -1; + } + // Check that all of the gep indices are uniform except for the last. for (unsigned i = 0; i < NumOperands - 1; ++i) if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop)) @@ -784,8 +860,7 @@ InnerLoopVectorizer::getVectorValue(Value *V) { // If this scalar is unknown, assume that it is a constant or that it is // loop invariant. Broadcast V and save the value for future uses. Value *B = getBroadcastInstrs(V); - WidenMap.splat(V, B); - return WidenMap.get(V); + return WidenMap.splat(V, B); } Value *InnerLoopVectorizer::reverseVector(Value *Vec) { @@ -799,6 +874,111 @@ Value *InnerLoopVectorizer::reverseVector(Value *Vec) { "reverse"); } + +void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, + LoopVectorizationLegality *Legal) { + // Attempt to issue a wide load. + LoadInst *LI = dyn_cast<LoadInst>(Instr); + StoreInst *SI = dyn_cast<StoreInst>(Instr); + + assert((LI || SI) && "Invalid Load/Store instruction"); + + Type *ScalarDataTy = LI ? LI->getType() : SI->getValueOperand()->getType(); + Type *DataTy = VectorType::get(ScalarDataTy, VF); + Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand(); + unsigned Alignment = LI ? LI->getAlignment() : SI->getAlignment(); + + // If the pointer is loop invariant or if it is non consecutive, + // scalarize the load. + int Stride = Legal->isConsecutivePtr(Ptr); + bool Reverse = Stride < 0; + bool UniformLoad = LI && Legal->isUniform(Ptr); + if (Stride == 0 || UniformLoad) + return scalarizeInstruction(Instr); + + Constant *Zero = Builder.getInt32(0); + VectorParts &Entry = WidenMap.get(Instr); + + // Handle consecutive loads/stores. + GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr); + if (Gep && Legal->isInductionVariable(Gep->getPointerOperand())) { + Value *PtrOperand = Gep->getPointerOperand(); + Value *FirstBasePtr = getVectorValue(PtrOperand)[0]; + FirstBasePtr = Builder.CreateExtractElement(FirstBasePtr, Zero); + + // Create the new GEP with the new induction variable. + GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone()); + Gep2->setOperand(0, FirstBasePtr); + Gep2->setName("gep.indvar.base"); + Ptr = Builder.Insert(Gep2); + } else if (Gep) { + assert(SE->isLoopInvariant(SE->getSCEV(Gep->getPointerOperand()), + OrigLoop) && "Base ptr must be invariant"); + + // The last index does not have to be the induction. It can be + // consecutive and be a function of the index. For example A[I+1]; + unsigned NumOperands = Gep->getNumOperands(); + + Value *LastGepOperand = Gep->getOperand(NumOperands - 1); + VectorParts &GEPParts = getVectorValue(LastGepOperand); + Value *LastIndex = GEPParts[0]; + LastIndex = Builder.CreateExtractElement(LastIndex, Zero); + + // Create the new GEP with the new induction variable. + GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone()); + Gep2->setOperand(NumOperands - 1, LastIndex); + Gep2->setName("gep.indvar.idx"); + Ptr = Builder.Insert(Gep2); + } else { + // Use the induction element ptr. 
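isConsecutivePtr above collapses the pointer analysis into a signed stride: +1 for a forward pointer induction, -1 for the new reverse kind, 0 for everything else (scalarize). A hedged fragment restating the new GEP case; strideForGep is a made-up name, and SE, TheLoop and Inductions are the surrounding class members:

int strideForGep(GetElementPtrInst *Gep) {
  PHINode *Phi = dyn_cast<PHINode>(Gep->getPointerOperand());
  if (!Phi || !Inductions.count(Phi))
    return 0;
  // Every index must be loop-invariant; a varying index breaks the
  // element-by-element access pattern.
  for (unsigned i = 1, e = Gep->getNumOperands(); i != e; ++i)
    if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop))
      return 0;
  InductionInfo II = Inductions[Phi];
  if (II.IK == IK_PtrInduction)
    return 1;  // step = +sizeof(elem)
  if (II.IK == IK_ReversePtrInduction)
    return -1; // step = -sizeof(elem)
  return 0;
}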
+ assert(isa<PHINode>(Ptr) && "Invalid induction ptr"); + VectorParts &PtrVal = getVectorValue(Ptr); + Ptr = Builder.CreateExtractElement(PtrVal[0], Zero); + } + + // Handle Stores: + if (SI) { + assert(!Legal->isUniform(SI->getPointerOperand()) && + "We do not allow storing to uniform addresses"); + + VectorParts &StoredVal = getVectorValue(SI->getValueOperand()); + for (unsigned Part = 0; Part < UF; ++Part) { + // Calculate the pointer for the specific unroll-part. + Value *PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF)); + + if (Reverse) { + // If we store to reverse consecutive memory locations then we need + // to reverse the order of elements in the stored value. + StoredVal[Part] = reverseVector(StoredVal[Part]); + // If the address is consecutive but reversed, then the + // wide store needs to start at the last vector element. + PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF)); + PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF)); + } + + Value *VecPtr = Builder.CreateBitCast(PartPtr, DataTy->getPointerTo()); + Builder.CreateStore(StoredVal[Part], VecPtr)->setAlignment(Alignment); + } + } + + for (unsigned Part = 0; Part < UF; ++Part) { + // Calculate the pointer for the specific unroll-part. + Value *PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF)); + + if (Reverse) { + // If the address is consecutive but reversed, then the + // wide store needs to start at the last vector element. + PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF)); + PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF)); + } + + Value *VecPtr = Builder.CreateBitCast(PartPtr, DataTy->getPointerTo()); + Value *LI = Builder.CreateLoad(VecPtr, "wide.load"); + cast<LoadInst>(LI)->setAlignment(Alignment); + Entry[Part] = Reverse ? reverseVector(LI) : LI; + } +} + void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr) { assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); // Holds vector parameters or scalars, in case of uniform vals. 
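The reverse-access address math above is easy to get wrong, so here is a standalone arithmetic check of the part-pointer offsets (VF and UF values are made up): for a reversed stride, unroll part Part starts at -Part*VF + (1 - VF) elements from the current pointer, and the wide value is then element-reversed.

#include <cstdio>

int main() {
  const int VF = 4, UF = 2;
  for (int Part = 0; Part < UF; ++Part) {
    int Fwd = Part * VF;             // forward: GEP(Ptr, Part*VF)
    int Rev = -Part * VF + (1 - VF); // reverse: GEP(GEP(Ptr,-Part*VF),1-VF)
    std::printf("part %d: forward base %+d, reverse base %+d\n",
                Part, Fwd, Rev);
  }
  // With VF=4: forward bases are +0,+4; reverse bases are -3,-7, so each
  // wide access still covers 4 consecutive elements, ending at Ptr[0].
  return 0;
}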
@@ -870,7 +1050,7 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr) { } } -Value* +Instruction * InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal, Instruction *Loc) { LoopVectorizationLegality::RuntimePointerCheck *PtrRtCheck = @@ -879,7 +1059,7 @@ InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal, if (!PtrRtCheck->Need) return NULL; - Value *MemoryRuntimeCheck = 0; + Instruction *MemoryRuntimeCheck = 0; unsigned NumPointers = PtrRtCheck->Pointers.size(); SmallVector<Value* , 2> Starts; SmallVector<Value* , 2> Ends; @@ -908,28 +1088,23 @@ InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal, } } + IRBuilder<> ChkBuilder(Loc); + for (unsigned i = 0; i < NumPointers; ++i) { for (unsigned j = i+1; j < NumPointers; ++j) { - Instruction::CastOps Op = Instruction::BitCast; - Value *Start0 = CastInst::Create(Op, Starts[i], PtrArithTy, "bc", Loc); - Value *Start1 = CastInst::Create(Op, Starts[j], PtrArithTy, "bc", Loc); - Value *End0 = CastInst::Create(Op, Ends[i], PtrArithTy, "bc", Loc); - Value *End1 = CastInst::Create(Op, Ends[j], PtrArithTy, "bc", Loc); - - Value *Cmp0 = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULE, - Start0, End1, "bound0", Loc); - Value *Cmp1 = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULE, - Start1, End0, "bound1", Loc); - Value *IsConflict = BinaryOperator::Create(Instruction::And, Cmp0, Cmp1, - "found.conflict", Loc); + Value *Start0 = ChkBuilder.CreateBitCast(Starts[i], PtrArithTy, "bc"); + Value *Start1 = ChkBuilder.CreateBitCast(Starts[j], PtrArithTy, "bc"); + Value *End0 = ChkBuilder.CreateBitCast(Ends[i], PtrArithTy, "bc"); + Value *End1 = ChkBuilder.CreateBitCast(Ends[j], PtrArithTy, "bc"); + + Value *Cmp0 = ChkBuilder.CreateICmpULE(Start0, End1, "bound0"); + Value *Cmp1 = ChkBuilder.CreateICmpULE(Start1, End0, "bound1"); + Value *IsConflict = ChkBuilder.CreateAnd(Cmp0, Cmp1, "found.conflict"); if (MemoryRuntimeCheck) - MemoryRuntimeCheck = BinaryOperator::Create(Instruction::Or, - MemoryRuntimeCheck, - IsConflict, - "conflict.rdx", Loc); - else - MemoryRuntimeCheck = IsConflict; + IsConflict = ChkBuilder.CreateOr(MemoryRuntimeCheck, IsConflict, + "conflict.rdx"); + MemoryRuntimeCheck = cast<Instruction>(IsConflict); } } @@ -943,7 +1118,7 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { the vectorized instructions while the old loop will continue to run the scalar remainder. - [ ] <-- vector loop bypass. + [ ] <-- vector loop bypass (may consist of multiple blocks). / | / v | [ ] <-- vector pre header. @@ -1004,10 +1179,7 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { ConstantInt::get(IdxTy, 0); assert(BypassBlock && "Invalid loop structure"); - - // Generate the code that checks in runtime if arrays overlap. - Value *MemoryRuntimeCheck = addRuntimeCheck(Legal, - BypassBlock->getTerminator()); + LoopBypassBlocks.push_back(BypassBlock); // Split the single block loop into the two loop structure described above. BasicBlock *VectorPH = @@ -1019,10 +1191,6 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { BasicBlock *ScalarPH = MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph"); - // This is the location in which we add all of the logic for bypassing - // the new vector loop. - Instruction *Loc = BypassBlock->getTerminator(); - // Use this IR builder to create the loop instructions (Phi, Br, Cmp) // inside the loop. 
Builder.SetInsertPoint(VecBody->getFirstInsertionPt()); @@ -1033,42 +1201,62 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { // times the unroll factor (num of SIMD instructions). Constant *Step = ConstantInt::get(IdxTy, VF * UF); + // This is the IR builder that we use to add all of the logic for bypassing + // the new vector loop. + IRBuilder<> BypassBuilder(BypassBlock->getTerminator()); + // We may need to extend the index in case there is a type mismatch. // We know that the count starts at zero and does not overflow. if (Count->getType() != IdxTy) { // The exit count can be of pointer type. Convert it to the correct // integer type. if (ExitCount->getType()->isPointerTy()) - Count = CastInst::CreatePointerCast(Count, IdxTy, "ptrcnt.to.int", Loc); + Count = BypassBuilder.CreatePointerCast(Count, IdxTy, "ptrcnt.to.int"); else - Count = CastInst::CreateZExtOrBitCast(Count, IdxTy, "zext.cnt", Loc); + Count = BypassBuilder.CreateZExtOrTrunc(Count, IdxTy, "cnt.cast"); } // Add the start index to the loop count to get the new end index. - Value *IdxEnd = BinaryOperator::CreateAdd(Count, StartIdx, "end.idx", Loc); + Value *IdxEnd = BypassBuilder.CreateAdd(Count, StartIdx, "end.idx"); // Now we need to generate the expression for N - (N % VF), which is // the part that the vectorized body will execute. - Value *R = BinaryOperator::CreateURem(Count, Step, "n.mod.vf", Loc); - Value *CountRoundDown = BinaryOperator::CreateSub(Count, R, "n.vec", Loc); - Value *IdxEndRoundDown = BinaryOperator::CreateAdd(CountRoundDown, StartIdx, - "end.idx.rnd.down", Loc); + Value *R = BypassBuilder.CreateURem(Count, Step, "n.mod.vf"); + Value *CountRoundDown = BypassBuilder.CreateSub(Count, R, "n.vec"); + Value *IdxEndRoundDown = BypassBuilder.CreateAdd(CountRoundDown, StartIdx, + "end.idx.rnd.down"); // Now, compare the new count to zero. If it is zero skip the vector loop and // jump to the scalar loop. - Value *Cmp = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, - IdxEndRoundDown, - StartIdx, - "cmp.zero", Loc); - - // If we are using memory runtime checks, include them in. - if (MemoryRuntimeCheck) - Cmp = BinaryOperator::Create(Instruction::Or, Cmp, MemoryRuntimeCheck, - "CntOrMem", Loc); + Value *Cmp = BypassBuilder.CreateICmpEQ(IdxEndRoundDown, StartIdx, + "cmp.zero"); + + BasicBlock *LastBypassBlock = BypassBlock; + + // Generate the code that checks in runtime if arrays overlap. We put the + // checks into a separate block to make the more common case of few elements + // faster. + Instruction *MemRuntimeCheck = addRuntimeCheck(Legal, + BypassBlock->getTerminator()); + if (MemRuntimeCheck) { + // Create a new block containing the memory check. + BasicBlock *CheckBlock = BypassBlock->splitBasicBlock(MemRuntimeCheck, + "vector.memcheck"); + LoopBypassBlocks.push_back(CheckBlock); + + // Replace the branch into the memory check block with a conditional branch + // for the "few elements case". + Instruction *OldTerm = BypassBlock->getTerminator(); + BranchInst::Create(MiddleBlock, CheckBlock, Cmp, OldTerm); + OldTerm->eraseFromParent(); + + Cmp = MemRuntimeCheck; + LastBypassBlock = CheckBlock; + } - BranchInst::Create(MiddleBlock, VectorPH, Cmp, Loc); - // Remove the old terminator. - Loc->eraseFromParent(); + LastBypassBlock->getTerminator()->eraseFromParent(); + BranchInst::Create(MiddleBlock, VectorPH, Cmp, + LastBypassBlock); // We are going to resume the execution of the scalar loop. 
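Before the induction fix-up below, the bypass logic above computes how many iterations the vector body owns: N - (N mod VF*UF), with a branch straight to the scalar loop when that rounds down to zero. A standalone sketch of the same arithmetic:

#include <cassert>

unsigned vectorIterations(unsigned Count, unsigned VF, unsigned UF) {
  unsigned Step = VF * UF;             // one vector-body trip consumes this
  unsigned R = Count % Step;           // n.mod.vf
  unsigned CountRoundDown = Count - R; // n.vec
  return CountRoundDown;               // 0 => take the bypass branch
}

int main() {
  assert(vectorIterations(37, 4, 2) == 32); // 5 iterations left for scalar
  assert(vectorIterations(7, 4, 2) == 0);   // vector loop bypassed entirely
  return 0;
}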
// Go over all of the induction variables that we found and fix the @@ -1108,30 +1296,45 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { Value *CRD = CountRoundDown; if (CRDSize > IISize) CRD = CastInst::Create(Instruction::Trunc, CountRoundDown, - II.StartValue->getType(), - "tr.crd", BypassBlock->getTerminator()); + II.StartValue->getType(), "tr.crd", + LoopBypassBlocks.back()->getTerminator()); else if (CRDSize < IISize) CRD = CastInst::Create(Instruction::SExt, CountRoundDown, II.StartValue->getType(), - "sext.crd", BypassBlock->getTerminator()); + "sext.crd", + LoopBypassBlocks.back()->getTerminator()); // Handle reverse integer induction counter: - EndValue = BinaryOperator::CreateSub(II.StartValue, CRD, "rev.ind.end", - BypassBlock->getTerminator()); + EndValue = + BinaryOperator::CreateSub(II.StartValue, CRD, "rev.ind.end", + LoopBypassBlocks.back()->getTerminator()); break; } case LoopVectorizationLegality::IK_PtrInduction: { // For pointer induction variables, calculate the offset using // the end index. - EndValue = GetElementPtrInst::Create(II.StartValue, CountRoundDown, - "ptr.ind.end", - BypassBlock->getTerminator()); + EndValue = + GetElementPtrInst::Create(II.StartValue, CountRoundDown, "ptr.ind.end", + LoopBypassBlocks.back()->getTerminator()); + break; + } + case LoopVectorizationLegality::IK_ReversePtrInduction: { + // The value at the end of the loop for the reverse pointer is calculated + // by creating a GEP with a negative index starting from the start value. + Value *Zero = ConstantInt::get(CountRoundDown->getType(), 0); + Value *NegIdx = BinaryOperator::CreateSub(Zero, CountRoundDown, + "rev.ind.end", + LoopBypassBlocks.back()->getTerminator()); + EndValue = GetElementPtrInst::Create(II.StartValue, NegIdx, + "rev.ptr.ind.end", + LoopBypassBlocks.back()->getTerminator()); break; } }// end of case // The new PHI merges the original incoming value, in case of a bypass, // or the value at the end of the vectorized loop. - ResumeVal->addIncoming(II.StartValue, BypassBlock); + for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) + ResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]); ResumeVal->addIncoming(EndValue, VecBody); // Fix the scalar body counter (PHI node). @@ -1147,7 +1350,8 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { assert(!ResumeIndex && "Unexpected resume value found"); ResumeIndex = PHINode::Create(IdxTy, 2, "new.indc.resume.val", MiddleBlock->getTerminator()); - ResumeIndex->addIncoming(StartIdx, BypassBlock); + for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) + ResumeIndex->addIncoming(StartIdx, LoopBypassBlocks[I]); ResumeIndex->addIncoming(IdxEndRoundDown, VecBody); } @@ -1187,6 +1391,8 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { // Insert the new loop into the loop nest and register the new basic blocks. 
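The IK_ReversePtrInduction case above derives the loop-exit pointer by negating the round-down count and feeding it to a GEP. The same arithmetic in standalone C++ (reversePtrEnd is a made-up helper):

#include <cassert>

int *reversePtrEnd(int *Start, long CountRoundDown) {
  long NegIdx = 0 - CountRoundDown; // rev.ind.end
  return Start + NegIdx;            // GEP(StartValue, NegIdx)
}

int main() {
  int Buf[16];
  assert(reversePtrEnd(&Buf[15], 8) == &Buf[7]);
  return 0;
}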
if (ParentLoop) { ParentLoop->addChildLoop(Lp); + for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) + ParentLoop->addBasicBlockToLoop(LoopBypassBlocks[I], LI->getBase()); ParentLoop->addBasicBlockToLoop(ScalarPH, LI->getBase()); ParentLoop->addBasicBlockToLoop(VectorPH, LI->getBase()); ParentLoop->addBasicBlockToLoop(MiddleBlock, LI->getBase()); @@ -1203,7 +1409,6 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { LoopExitBlock = ExitBlock; LoopVectorBody = VecBody; LoopScalarBody = OldBasicBlock; - LoopBypassBlock = BypassBlock; } /// This function returns the identity element (or neutral element) for @@ -1295,9 +1500,7 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { // the cost-model. // //===------------------------------------------------===// - BasicBlock &BB = *OrigLoop->getHeader(); - Constant *Zero = - ConstantInt::get(IntegerType::getInt32Ty(BB.getContext()), 0); + Constant *Zero = Builder.getInt32(0); // In order to support reduction variables we need to be able to vectorize // Phi nodes. Phi nodes have cycles, so we need to vectorize them in two @@ -1343,7 +1546,7 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { // To do so, we need to generate the 'identity' vector and overide // one of the elements with the incoming scalar reduction. We need // to do it in the vector-loop preheader. - Builder.SetInsertPoint(LoopBypassBlock->getTerminator()); + Builder.SetInsertPoint(LoopBypassBlocks.back()->getTerminator()); // This is the vector-clone of the value that leaves the loop. VectorParts &VectorExit = getVectorValue(RdxDesc.LoopExitInstr); @@ -1391,7 +1594,8 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { VectorParts &RdxExitVal = getVectorValue(RdxDesc.LoopExitInstr); PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi"); Value *StartVal = (part == 0) ? VectorStart : Identity; - NewPhi->addIncoming(StartVal, LoopBypassBlock); + for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) + NewPhi->addIncoming(StartVal, LoopBypassBlocks[I]); NewPhi->addIncoming(RdxExitVal[part], LoopVectorBody); RdxParts.push_back(NewPhi); } @@ -1533,8 +1737,6 @@ InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) { void InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, BasicBlock *BB, PhiVector *PV) { - Constant *Zero = Builder.getInt32(0); - // For each instruction in the old loop. for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { VectorParts &Entry = WidenMap.get(it); @@ -1568,7 +1770,7 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, // optimizations will clean it up. VectorParts Cond = createEdgeMask(P->getIncomingBlock(0), P->getParent()); - + for (unsigned part = 0; part < UF; ++part) { VectorParts &In0 = getVectorValue(P->getIncomingValue(0)); VectorParts &In1 = getVectorValue(P->getIncomingValue(1)); @@ -1600,6 +1802,7 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, } case LoopVectorizationLegality::IK_ReverseIntInduction: case LoopVectorizationLegality::IK_PtrInduction: + case LoopVectorizationLegality::IK_ReversePtrInduction: // Handle reverse integer and pointer inductions. Value *StartIdx = 0; // If we have a single integer induction variable then use it. @@ -1635,15 +1838,23 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, // Handle the pointer induction variable case. 
assert(P->getType()->isPointerTy() && "Unexpected type."); + // Is this a reverse induction ptr or a consecutive induction ptr. + bool Reverse = (LoopVectorizationLegality::IK_ReversePtrInduction == + II.IK); + // This is the vector of results. Notice that we don't generate // vector geps because scalar geps result in better code. for (unsigned part = 0; part < UF; ++part) { Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF)); for (unsigned int i = 0; i < VF; ++i) { - Constant *Idx = ConstantInt::get(Induction->getType(), - i + part * VF); - Value *GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, - "gep.idx"); + int EltIndex = (i + part * VF) * (Reverse ? -1 : 1); + Constant *Idx = ConstantInt::get(Induction->getType(), EltIndex); + Value *GlobalIdx; + if (!Reverse) + GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, "gep.idx"); + else + GlobalIdx = Builder.CreateSub(Idx, NormalizedIdx, "gep.ridx"); + Value *SclrGep = Builder.CreateGEP(II.StartValue, GlobalIdx, "next.gep"); VecVal = Builder.CreateInsertElement(VecVal, SclrGep, @@ -1684,13 +1895,13 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, for (unsigned Part = 0; Part < UF; ++Part) { Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A[Part], B[Part]); - // Update the NSW, NUW and Exact flags. - BinaryOperator *VecOp = cast<BinaryOperator>(V); - if (isa<OverflowingBinaryOperator>(BinOp)) { + // Update the NSW, NUW and Exact flags. Notice: V can be an Undef. + BinaryOperator *VecOp = dyn_cast<BinaryOperator>(V); + if (VecOp && isa<OverflowingBinaryOperator>(BinOp)) { VecOp->setHasNoSignedWrap(BinOp->hasNoSignedWrap()); VecOp->setHasNoUnsignedWrap(BinOp->hasNoUnsignedWrap()); } - if (isa<PossiblyExactOperator>(VecOp)) + if (VecOp && isa<PossiblyExactOperator>(VecOp)) VecOp->setIsExact(BinOp->isExact()); Entry[Part] = V; @@ -1740,124 +1951,10 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, break; } - case Instruction::Store: { - // Attempt to issue a wide store. - StoreInst *SI = dyn_cast<StoreInst>(it); - Type *StTy = VectorType::get(SI->getValueOperand()->getType(), VF); - Value *Ptr = SI->getPointerOperand(); - unsigned Alignment = SI->getAlignment(); - - assert(!Legal->isUniform(Ptr) && - "We do not allow storing to uniform addresses"); - - - int Stride = Legal->isConsecutivePtr(Ptr); - bool Reverse = Stride < 0; - if (Stride == 0) { - scalarizeInstruction(it); + case Instruction::Store: + case Instruction::Load: + vectorizeMemoryInstruction(it, Legal); break; - } - - // Handle consecutive stores. - - GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr); - if (Gep) { - // The last index does not have to be the induction. It can be - // consecutive and be a function of the index. For example A[I+1]; - unsigned NumOperands = Gep->getNumOperands(); - - Value *LastGepOperand = Gep->getOperand(NumOperands - 1); - VectorParts &GEPParts = getVectorValue(LastGepOperand); - Value *LastIndex = GEPParts[0]; - LastIndex = Builder.CreateExtractElement(LastIndex, Zero); - - // Create the new GEP with the new induction variable. - GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone()); - Gep2->setOperand(NumOperands - 1, LastIndex); - Ptr = Builder.Insert(Gep2); - } else { - // Use the induction element ptr. 
- assert(isa<PHINode>(Ptr) && "Invalid induction ptr"); - VectorParts &PtrVal = getVectorValue(Ptr); - Ptr = Builder.CreateExtractElement(PtrVal[0], Zero); - } - - VectorParts &StoredVal = getVectorValue(SI->getValueOperand()); - for (unsigned Part = 0; Part < UF; ++Part) { - // Calculate the pointer for the specific unroll-part. - Value *PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF)); - - if (Reverse) { - // If we store to reverse consecutive memory locations then we need - // to reverse the order of elements in the stored value. - StoredVal[Part] = reverseVector(StoredVal[Part]); - // If the address is consecutive but reversed, then the - // wide store needs to start at the last vector element. - PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF)); - PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF)); - } - - Value *VecPtr = Builder.CreateBitCast(PartPtr, StTy->getPointerTo()); - Builder.CreateStore(StoredVal[Part], VecPtr)->setAlignment(Alignment); - } - break; - } - case Instruction::Load: { - // Attempt to issue a wide load. - LoadInst *LI = dyn_cast<LoadInst>(it); - Type *RetTy = VectorType::get(LI->getType(), VF); - Value *Ptr = LI->getPointerOperand(); - unsigned Alignment = LI->getAlignment(); - - // If the pointer is loop invariant or if it is non consecutive, - // scalarize the load. - int Stride = Legal->isConsecutivePtr(Ptr); - bool Reverse = Stride < 0; - if (Legal->isUniform(Ptr) || Stride == 0) { - scalarizeInstruction(it); - break; - } - - GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr); - if (Gep) { - // The last index does not have to be the induction. It can be - // consecutive and be a function of the index. For example A[I+1]; - unsigned NumOperands = Gep->getNumOperands(); - - Value *LastGepOperand = Gep->getOperand(NumOperands - 1); - VectorParts &GEPParts = getVectorValue(LastGepOperand); - Value *LastIndex = GEPParts[0]; - LastIndex = Builder.CreateExtractElement(LastIndex, Zero); - - // Create the new GEP with the new induction variable. - GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone()); - Gep2->setOperand(NumOperands - 1, LastIndex); - Ptr = Builder.Insert(Gep2); - } else { - // Use the induction element ptr. - assert(isa<PHINode>(Ptr) && "Invalid induction ptr"); - VectorParts &PtrVal = getVectorValue(Ptr); - Ptr = Builder.CreateExtractElement(PtrVal[0], Zero); - } - - for (unsigned Part = 0; Part < UF; ++Part) { - // Calculate the pointer for the specific unroll-part. - Value *PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF)); - - if (Reverse) { - // If the address is consecutive but reversed, then the - // wide store needs to start at the last vector element. - PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF)); - PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF)); - } - - Value *VecPtr = Builder.CreateBitCast(PartPtr, RetTy->getPointerTo()); - Value *LI = Builder.CreateLoad(VecPtr, "wide.load"); - cast<LoadInst>(LI)->setAlignment(Alignment); - Entry[Part] = Reverse ? reverseVector(LI) : LI; - } - break; - } case Instruction::ZExt: case Instruction::SExt: case Instruction::FPToUI: @@ -1924,12 +2021,14 @@ void InnerLoopVectorizer::updateAnalysis() { SE->forgetLoop(OrigLoop); // Update the dominator tree information. 
- assert(DT->properlyDominates(LoopBypassBlock, LoopExitBlock) && + assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) && "Entry does not dominate exit."); - DT->addNewBlock(LoopVectorPreHeader, LoopBypassBlock); + for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) + DT->addNewBlock(LoopBypassBlocks[I], LoopBypassBlocks[I-1]); + DT->addNewBlock(LoopVectorPreHeader, LoopBypassBlocks.back()); DT->addNewBlock(LoopVectorBody, LoopVectorPreHeader); - DT->addNewBlock(LoopMiddleBlock, LoopBypassBlock); + DT->addNewBlock(LoopMiddleBlock, LoopBypassBlocks.front()); DT->addNewBlock(LoopScalarPreHeader, LoopMiddleBlock); DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader); DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); @@ -2196,7 +2295,51 @@ void LoopVectorizationLegality::collectLoopUniforms() { } } +AliasAnalysis::Location +LoopVectorizationLegality::getLoadStoreLocation(Instruction *Inst) { + if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) + return AA->getLocation(Store); + else if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) + return AA->getLocation(Load); + + llvm_unreachable("Should be either load or store instruction"); +} + +bool +LoopVectorizationLegality::hasPossibleGlobalWriteReorder( + Value *Object, + Instruction *Inst, + AliasMultiMap& WriteObjects, + unsigned MaxByteWidth) { + + AliasAnalysis::Location ThisLoc = getLoadStoreLocation(Inst); + + std::vector<Instruction*>::iterator + it = WriteObjects[Object].begin(), + end = WriteObjects[Object].end(); + + for (; it != end; ++it) { + Instruction* I = *it; + if (I == Inst) + continue; + + AliasAnalysis::Location ThatLoc = getLoadStoreLocation(I); + if (AA->alias(ThisLoc.getWithNewSize(MaxByteWidth), + ThatLoc.getWithNewSize(MaxByteWidth))) + return true; + } + return false; +} + bool LoopVectorizationLegality::canVectorizeMemory() { + + if (TheLoop->isAnnotatedParallel()) { + DEBUG(dbgs() + << "LV: A loop annotated parallel, ignore memory dependency " + << "checks.\n"); + return true; + } + typedef SmallVector<Value*, 16> ValueVector; typedef SmallPtrSet<Value*, 16> ValueSet; // Holds the Load and Store *instructions*. @@ -2250,9 +2393,10 @@ bool LoopVectorizationLegality::canVectorizeMemory() { return true; } - // Holds the read and read-write *pointers* that we find. - ValueVector Reads; - ValueVector ReadWrites; + // Holds the read and read-write *pointers* that we find. These maps hold + // unique values for pointers (so no need for multi-map). + AliasMap Reads; + AliasMap ReadWrites; // Holds the analyzed pointers. We don't want to call GetUnderlyingObjects // multiple times on the same object. If the ptr is accessed twice, once @@ -2274,7 +2418,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() { // If we did *not* see this pointer before, insert it to // the read-write list. At this phase it is only a 'write' list. if (Seen.insert(Ptr)) - ReadWrites.push_back(Ptr); + ReadWrites.insert(std::make_pair(Ptr, ST)); } for (I = Loads.begin(), IE = Loads.end(); I != IE; ++I) { @@ -2289,7 +2433,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() { // read a few words, modify, and write a few words, and some of the // words may be written to the same address. if (Seen.insert(Ptr) || 0 == isConsecutivePtr(Ptr)) - Reads.push_back(Ptr); + Reads.insert(std::make_pair(Ptr, LD)); } // If we write (or read-write) to a single destination and there are no @@ -2302,22 +2446,27 @@ bool LoopVectorizationLegality::canVectorizeMemory() { // Find pointers with computable bounds. 
We are going to use this information // to place a runtime bound check. bool CanDoRT = true; - for (I = ReadWrites.begin(), IE = ReadWrites.end(); I != IE; ++I) - if (hasComputableBounds(*I)) { - PtrRtCheck.insert(SE, TheLoop, *I); - DEBUG(dbgs() << "LV: Found a runtime check ptr:" << **I <<"\n"); + AliasMap::iterator MI, ME; + for (MI = ReadWrites.begin(), ME = ReadWrites.end(); MI != ME; ++MI) { + Value *V = (*MI).first; + if (hasComputableBounds(V)) { + PtrRtCheck.insert(SE, TheLoop, V); + DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *V <<"\n"); } else { CanDoRT = false; break; } - for (I = Reads.begin(), IE = Reads.end(); I != IE; ++I) - if (hasComputableBounds(*I)) { - PtrRtCheck.insert(SE, TheLoop, *I); - DEBUG(dbgs() << "LV: Found a runtime check ptr:" << **I <<"\n"); + } + for (MI = Reads.begin(), ME = Reads.end(); MI != ME; ++MI) { + Value *V = (*MI).first; + if (hasComputableBounds(V)) { + PtrRtCheck.insert(SE, TheLoop, V); + DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *V <<"\n"); } else { CanDoRT = false; break; } + } // Check that we did not collect too many pointers or found a // unsizeable pointer. @@ -2332,47 +2481,104 @@ bool LoopVectorizationLegality::canVectorizeMemory() { bool NeedRTCheck = false; + // Biggest vectorized access possible, vector width * unroll factor. + // TODO: We're being very pessimistic here, find a way to know the + // real access width before getting here. + unsigned MaxByteWidth = (TTI->getRegisterBitWidth(true) / 8) * + TTI->getMaximumUnrollFactor(); // Now that the pointers are in two lists (Reads and ReadWrites), we // can check that there are no conflicts between each of the writes and // between the writes to the reads. - ValueSet WriteObjects; + // Note that WriteObjects duplicates the stores (indexed now by underlying + // objects) to avoid pointing to elements inside ReadWrites. + // TODO: Maybe create a new type where they can interact without duplication. + AliasMultiMap WriteObjects; ValueVector TempObjects; // Check that the read-writes do not conflict with other read-write // pointers. bool AllWritesIdentified = true; - for (I = ReadWrites.begin(), IE = ReadWrites.end(); I != IE; ++I) { - GetUnderlyingObjects(*I, TempObjects, DL); - for (ValueVector::iterator it=TempObjects.begin(), e=TempObjects.end(); - it != e; ++it) { - if (!isIdentifiedObject(*it)) { - DEBUG(dbgs() << "LV: Found an unidentified write ptr:"<< **it <<"\n"); + for (MI = ReadWrites.begin(), ME = ReadWrites.end(); MI != ME; ++MI) { + Value *Val = (*MI).first; + Instruction *Inst = (*MI).second; + + GetUnderlyingObjects(Val, TempObjects, DL); + for (ValueVector::iterator UI=TempObjects.begin(), UE=TempObjects.end(); + UI != UE; ++UI) { + if (!isIdentifiedObject(*UI)) { + DEBUG(dbgs() << "LV: Found an unidentified write ptr:"<< **UI <<"\n"); NeedRTCheck = true; AllWritesIdentified = false; } - if (!WriteObjects.insert(*it)) { + + // Never seen it before, can't alias. + if (WriteObjects[*UI].empty()) { + DEBUG(dbgs() << "LV: Adding Underlying value:" << **UI <<"\n"); + WriteObjects[*UI].push_back(Inst); + continue; + } + // Direct alias found. + if (!AA || dyn_cast<GlobalValue>(*UI) == NULL) { + DEBUG(dbgs() << "LV: Found a possible write-write reorder:" + << **UI <<"\n"); + return false; + } + DEBUG(dbgs() << "LV: Found a conflicting global value:" + << **UI <<"\n"); + DEBUG(dbgs() << "LV: While examining store:" << *Inst <<"\n"); + DEBUG(dbgs() << "LV: On value:" << *Val <<"\n"); + + // If global alias, make sure they do alias. 
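hasPossibleGlobalWriteReorder above widens every alias query to the largest access that vectorization could create, because the real access width is not known at this point. A hedged fragment of that query shape, using only the calls appearing in the hunk; SomeInst and OtherInst are placeholders for the store/load pair under test:

unsigned MaxByteWidth = (TTI->getRegisterBitWidth(true) / 8) *
                        TTI->getMaximumUnrollFactor();

AliasAnalysis::Location LocA = getLoadStoreLocation(SomeInst);
AliasAnalysis::Location LocB = getLoadStoreLocation(OtherInst);
// Conservative: compare the two accesses as if each were MaxByteWidth
// bytes wide; any overlap at that width blocks vectorization.
bool MayReorder = AA->alias(LocA.getWithNewSize(MaxByteWidth),
                            LocB.getWithNewSize(MaxByteWidth));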
+ if (hasPossibleGlobalWriteReorder(*UI, + Inst, + WriteObjects, + MaxByteWidth)) { DEBUG(dbgs() << "LV: Found a possible write-write reorder:" - << **it <<"\n"); + << *UI <<"\n"); return false; } + + // Didn't alias, insert into map for further reference. + WriteObjects[*UI].push_back(Inst); } TempObjects.clear(); } /// Check that the reads don't conflict with the read-writes. - for (I = Reads.begin(), IE = Reads.end(); I != IE; ++I) { - GetUnderlyingObjects(*I, TempObjects, DL); - for (ValueVector::iterator it=TempObjects.begin(), e=TempObjects.end(); - it != e; ++it) { + for (MI = Reads.begin(), ME = Reads.end(); MI != ME; ++MI) { + Value *Val = (*MI).first; + GetUnderlyingObjects(Val, TempObjects, DL); + for (ValueVector::iterator UI=TempObjects.begin(), UE=TempObjects.end(); + UI != UE; ++UI) { // If all of the writes are identified then we don't care if the read // pointer is identified or not. - if (!AllWritesIdentified && !isIdentifiedObject(*it)) { - DEBUG(dbgs() << "LV: Found an unidentified read ptr:"<< **it <<"\n"); + if (!AllWritesIdentified && !isIdentifiedObject(*UI)) { + DEBUG(dbgs() << "LV: Found an unidentified read ptr:"<< **UI <<"\n"); NeedRTCheck = true; } - if (WriteObjects.count(*it)) { - DEBUG(dbgs() << "LV: Found a possible read/write reorder:" - << **it <<"\n"); + + // Never seen it before, can't alias. + if (WriteObjects[*UI].empty()) + continue; + // Direct alias found. + if (!AA || dyn_cast<GlobalValue>(*UI) == NULL) { + DEBUG(dbgs() << "LV: Found a possible write-write reorder:" + << **UI <<"\n"); + return false; + } + DEBUG(dbgs() << "LV: Found a global value: " + << **UI <<"\n"); + Instruction *Inst = (*MI).second; + DEBUG(dbgs() << "LV: While examining load:" << *Inst <<"\n"); + DEBUG(dbgs() << "LV: On value:" << *Val <<"\n"); + + // If global alias, make sure they do alias. + if (hasPossibleGlobalWriteReorder(*UI, + Inst, + WriteObjects, + MaxByteWidth)) { + DEBUG(dbgs() << "LV: Found a possible read-write reorder:" + << *UI <<"\n"); return false; } } @@ -2535,7 +2741,7 @@ LoopVectorizationLegality::isInductionVariable(PHINode *Phi) { if (!PhiTy->isIntegerTy() && !PhiTy->isPointerTy()) return IK_NoInduction; - // Check that the PHI is consecutive and starts at zero. + // Check that the PHI is consecutive. const SCEV *PhiScev = SE->getSCEV(Phi); const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev); if (!AR) { @@ -2562,6 +2768,8 @@ LoopVectorizationLegality::isInductionVariable(PHINode *Phi) { uint64_t Size = DL->getTypeAllocSize(PhiTy->getPointerElementType()); if (C->getValue()->equalsInt(Size)) return IK_PtrInduction; + else if (C->getValue()->equalsInt(0 - Size)) + return IK_ReversePtrInduction; return IK_NoInduction; } @@ -2612,18 +2820,34 @@ bool LoopVectorizationLegality::hasComputableBounds(Value *Ptr) { return AR->isAffine(); } -unsigned +LoopVectorizationCostModel::VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize, unsigned UserVF) { + // Width 1 means no vectorize + VectorizationFactor Factor = { 1U, 0U }; if (OptForSize && Legal->getRuntimePointerCheck()->Need) { DEBUG(dbgs() << "LV: Aborting. Runtime ptr check is required in Os.\n"); - return 1; + return Factor; } // Find the trip count. 
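A side note on the equalsInt(0 - Size) test added above for reverse pointer inductions: 0 - Size is computed in unsigned arithmetic, so it matches the two's-complement bit pattern of a negative step of the same width. A standalone check of that identity, assuming a 64-bit index type:

#include <cassert>
#include <stdint.h>

int main() {
  uint64_t Size = 4;                  // sizeof(element)
  int64_t Step = -4;                  // SCEV step of a reverse pointer IV
  assert((uint64_t)Step == 0 - Size); // the pattern equalsInt() compares
  return 0;
}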
unsigned TC = SE->getSmallConstantTripCount(TheLoop, TheLoop->getLoopLatch()); DEBUG(dbgs() << "LV: Found trip count:"<<TC<<"\n"); + unsigned WidestType = getWidestType(); + unsigned WidestRegister = TTI.getRegisterBitWidth(true); + unsigned MaxVectorSize = WidestRegister / WidestType; + DEBUG(dbgs() << "LV: The Widest type: " << WidestType << " bits.\n"); + DEBUG(dbgs() << "LV: The Widest register is:" << WidestRegister << "bits.\n"); + + if (MaxVectorSize == 0) { + DEBUG(dbgs() << "LV: The target has no vector registers.\n"); + MaxVectorSize = 1; + } + + assert(MaxVectorSize <= 32 && "Did not expect to pack so many elements" + " into one vector!"); + unsigned VF = MaxVectorSize; // If we optimize the program for size, avoid creating the tail loop. @@ -2631,7 +2855,7 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize, // If we are unable to calculate the trip count then don't try to vectorize. if (TC < 2) { DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n"); - return 1; + return Factor; } // Find the maximum SIMD width that can fit within the trip count. @@ -2644,7 +2868,7 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize, // zero then we require a tail. if (VF < 2) { DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n"); - return 1; + return Factor; } } @@ -2652,7 +2876,8 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize, assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two"); DEBUG(dbgs() << "LV: Using user VF "<<UserVF<<".\n"); - return UserVF; + Factor.Width = UserVF; + return Factor; } float Cost = expectedCost(1); @@ -2672,12 +2897,70 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize, } DEBUG(dbgs() << "LV: Selecting VF = : "<< Width << ".\n"); - return Width; + Factor.Width = Width; + Factor.Cost = Width * Cost; + return Factor; +} + +unsigned LoopVectorizationCostModel::getWidestType() { + unsigned MaxWidth = 8; + + // For each block. + for (Loop::block_iterator bb = TheLoop->block_begin(), + be = TheLoop->block_end(); bb != be; ++bb) { + BasicBlock *BB = *bb; + + // For each instruction in the loop. + for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { + Type *T = it->getType(); + + // Only examine Loads, Stores and PHINodes. + if (!isa<LoadInst>(it) && !isa<StoreInst>(it) && !isa<PHINode>(it)) + continue; + + // Examine PHI nodes that are reduction variables. + if (PHINode *PN = dyn_cast<PHINode>(it)) + if (!Legal->getReductionVars()->count(PN)) + continue; + + // Examine the stored values. + if (StoreInst *ST = dyn_cast<StoreInst>(it)) + T = ST->getValueOperand()->getType(); + + // Ignore loaded pointer types and stored pointer types that are not + // consecutive. However, we do want to take consecutive stores/loads of + // pointer vectors into account. + if (T->isPointerTy() && !isConsecutiveLoadOrStore(it)) + continue; + + MaxWidth = std::max(MaxWidth, + (unsigned)DL->getTypeSizeInBits(T->getScalarType())); + } + } + + return MaxWidth; } unsigned LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize, - unsigned UserUF) { + unsigned UserUF, + unsigned VF, + unsigned LoopCost) { + + // -- The unroll heuristics -- + // We unroll the loop in order to expose ILP and reduce the loop overhead. + // There are many micro-architectural considerations that we can't predict + // at this level. For example frontend pressure (on decode or fetch) due to + // code size, or the number and capabilities of the execution ports. 
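selectVectorizationFactor above caps the candidate width at WidestRegister / WidestType and then picks the power of two with the lowest per-lane cost. A standalone sketch with made-up register width and per-lane costs standing in for expectedCost():

#include <cstdio>

int main() {
  unsigned WidestRegister = 128; // e.g. one 128-bit SIMD register
  unsigned WidestType = 32;      // widest vectorized load/store/phi, in bits
  unsigned MaxVectorSize = WidestRegister / WidestType; // = 4 lanes
  // Hypothetical per-lane costs for VF = 1, 2, 4; lower is better.
  float CostPerLane[3] = {1.0f, 0.6f, 0.45f};
  unsigned BestWidth = 1;
  float BestCost = CostPerLane[0];
  unsigned Idx = 0;
  for (unsigned W = 1; W <= MaxVectorSize; W *= 2, ++Idx)
    if (CostPerLane[Idx] < BestCost) {
      BestCost = CostPerLane[Idx];
      BestWidth = W;
    }
  std::printf("VF = %u, loop cost = %.2f\n", BestWidth, BestWidth * BestCost);
  return 0;
}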
+ // + // We use the following heuristics to select the unroll factor: + // 1. If the code has reductions the we unroll in order to break the cross + // iteration dependency. + // 2. If the loop is really small then we unroll in order to reduce the loop + // overhead. + // 3. We don't unroll if we think that we will spill registers to memory due + // to the increased register pressure. + // Use the user preference, unless 'auto' is selected. if (UserUF != 0) return UserUF; @@ -2710,17 +2993,39 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize, // fit without causing spills. unsigned UF = (TargetVectorRegisters - R.LoopInvariantRegs) / R.MaxLocalUsers; - // We don't want to unroll the loops to the point where they do not fit into - // the decoded cache. Assume that we only allow 32 IR instructions. - UF = std::min(UF, (MaxLoopSizeThreshold / R.NumInstructions)); - // Clamp the unroll factor ranges to reasonable factors. + unsigned MaxUnrollSize = TTI.getMaximumUnrollFactor(); + + // If we did not calculate the cost for VF (because the user selected the VF) + // then we calculate the cost of VF here. + if (LoopCost == 0) + LoopCost = expectedCost(VF); + + // Clamp the calculated UF to be between the 1 and the max unroll factor + // that the target allows. if (UF > MaxUnrollSize) UF = MaxUnrollSize; else if (UF < 1) UF = 1; - return UF; + if (Legal->getReductionVars()->size()) { + DEBUG(dbgs() << "LV: Unrolling because of reductions. \n"); + return UF; + } + + // We want to unroll tiny loops in order to reduce the loop overhead. + // We assume that the cost overhead is 1 and we use the cost model + // to estimate the cost of the loop and unroll until the cost of the + // loop overhead is about 5% of the cost of the loop. + DEBUG(dbgs() << "LV: Loop cost is "<< LoopCost <<" \n"); + if (LoopCost < 20) { + DEBUG(dbgs() << "LV: Unrolling to reduce branch cost. \n"); + unsigned NewUF = 20/LoopCost + 1; + return std::min(NewUF, UF); + } + + DEBUG(dbgs() << "LV: Not Unrolling. \n"); + return 1; } LoopVectorizationCostModel::RegisterUsage @@ -2878,9 +3183,10 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { // TODO: We need to estimate the cost of intrinsic calls. switch (I->getOpcode()) { case Instruction::GetElementPtr: - // We mark this instruction as zero-cost because scalar GEPs are usually - // lowered to the intruction addressing mode. At the moment we don't - // generate vector geps. + // We mark this instruction as zero-cost because the cost of GEPs in + // vectorized code depends on whether the corresponding memory instruction + // is scalarized or not. Therefore, we handle GEPs with the memory + // instruction cost. return 0; case Instruction::Br: { return TTI.getCFInstrCost(I->getOpcode()); @@ -2923,83 +3229,59 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { VectorTy = ToVectorTy(ValTy, VF); return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy); } - case Instruction::Store: { - StoreInst *SI = cast<StoreInst>(I); - Type *ValTy = SI->getValueOperand()->getType(); + case Instruction::Store: + case Instruction::Load: { + StoreInst *SI = dyn_cast<StoreInst>(I); + LoadInst *LI = dyn_cast<LoadInst>(I); + Type *ValTy = (SI ? SI->getValueOperand()->getType() : + LI->getType()); VectorTy = ToVectorTy(ValTy, VF); + unsigned Alignment = SI ? SI->getAlignment() : LI->getAlignment(); + unsigned AS = SI ? SI->getPointerAddressSpace() : + LI->getPointerAddressSpace(); + Value *Ptr = SI ? 
SI->getPointerOperand() : LI->getPointerOperand(); + // We add the cost of address computation here instead of with the gep + // instruction because only here we know whether the operation is + // scalarized. if (VF == 1) - return TTI.getMemoryOpCost(I->getOpcode(), VectorTy, - SI->getAlignment(), - SI->getPointerAddressSpace()); + return TTI.getAddressComputationCost(VectorTy) + + TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS); - // Scalarized stores. - int Stride = Legal->isConsecutivePtr(SI->getPointerOperand()); + // Scalarized loads/stores. + int Stride = Legal->isConsecutivePtr(Ptr); bool Reverse = Stride < 0; if (0 == Stride) { unsigned Cost = 0; - // The cost of extracting from the value vector and pointer vector. - Type *PtrTy = ToVectorTy(I->getOperand(0)->getType(), VF); + Type *PtrTy = ToVectorTy(Ptr->getType(), VF); for (unsigned i = 0; i < VF; ++i) { - Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, - i); + // The cost of extracting the pointer operand. Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, PtrTy, i); + // In case of STORE, the cost of ExtractElement from the vector. + // In case of LOAD, the cost of InsertElement into the returned + // vector. + Cost += TTI.getVectorInstrCost(SI ? Instruction::ExtractElement : + Instruction::InsertElement, + VectorTy, i); } - // The cost of the scalar stores. + // The cost of the scalar loads/stores. + Cost += VF * TTI.getAddressComputationCost(ValTy->getScalarType()); Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), - SI->getAlignment(), - SI->getPointerAddressSpace()); + Alignment, AS); return Cost; } - // Wide stores. - unsigned Cost = TTI.getMemoryOpCost(I->getOpcode(), VectorTy, - SI->getAlignment(), - SI->getPointerAddressSpace()); + // Wide load/stores. + unsigned Cost = TTI.getAddressComputationCost(VectorTy); + Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS); + if (Reverse) Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); return Cost; } - case Instruction::Load: { - LoadInst *LI = cast<LoadInst>(I); - - if (VF == 1) - return TTI.getMemoryOpCost(I->getOpcode(), VectorTy, LI->getAlignment(), - LI->getPointerAddressSpace()); - - // Scalarized loads. - int Stride = Legal->isConsecutivePtr(LI->getPointerOperand()); - bool Reverse = Stride < 0; - if (0 == Stride) { - unsigned Cost = 0; - Type *PtrTy = ToVectorTy(I->getOperand(0)->getType(), VF); - - // The cost of extracting from the pointer vector. - for (unsigned i = 0; i < VF; ++i) - Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, PtrTy, i); - - // The cost of inserting data to the result vector. - for (unsigned i = 0; i < VF; ++i) - Cost += TTI.getVectorInstrCost(Instruction::InsertElement, VectorTy, i); - - // The cost of the scalar stores. - Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), RetTy->getScalarType(), - LI->getAlignment(), - LI->getPointerAddressSpace()); - return Cost; - } - - // Wide loads. - unsigned Cost = TTI.getMemoryOpCost(I->getOpcode(), VectorTy, - LI->getAlignment(), - LI->getPointerAddressSpace()); - if (Reverse) - Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); - return Cost; - } case Instruction::ZExt: case Instruction::SExt: case Instruction::FPToUI: @@ -3077,4 +3359,14 @@ namespace llvm { } } +bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) { + // Check for a store. 
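The merged load/store cost case above charges, per lane, an extract of the pointer plus an extract (store) or insert (load) of the data, and then VF scalar address computations and memory operations. The same accounting in standalone form, with unit costs standing in for the TTI queries:

#include <cstdio>

int main() {
  const unsigned VF = 4;
  const unsigned ExtractCost = 1; // TTI.getVectorInstrCost(ExtractElement, ..)
  const unsigned InsertCost = 1;  // TTI.getVectorInstrCost(InsertElement, ..)
  const unsigned AddrCost = 1;    // TTI.getAddressComputationCost(scalar)
  const unsigned MemOpCost = 1;   // TTI.getMemoryOpCost(scalar)

  bool IsStore = false; // a scalarized load in this example
  unsigned Cost = 0;
  for (unsigned i = 0; i < VF; ++i) {
    Cost += ExtractCost;                        // pull lane i's pointer out
    Cost += IsStore ? ExtractCost : InsertCost; // move lane i's data
  }
  Cost += VF * (AddrCost + MemOpCost);          // VF scalar accesses
  std::printf("scalarized cost = %u\n", Cost);  // 4*2 + 4*2 = 16
  return 0;
}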
+ if (StoreInst *ST = dyn_cast<StoreInst>(Inst)) + return Legal->isConsecutivePtr(ST->getPointerOperand()) != 0; + + // Check for a load. + if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) + return Legal->isConsecutivePtr(LI->getPointerOperand()) != 0; + return false; +} diff --git a/projects/sample/Makefile.llvm.rules b/projects/sample/Makefile.llvm.rules index 89b9e56..30f54c4 100644 --- a/projects/sample/Makefile.llvm.rules +++ b/projects/sample/Makefile.llvm.rules @@ -501,16 +501,24 @@ ifeq ($(HOST_OS),Darwin) LoadableModuleOptions := -Wl,-flat_namespace -Wl,-undefined,suppress SharedLinkOptions := -dynamiclib - ifneq ($(ARCH),ARM) - SharedLinkOptions += -mmacosx-version-min=$(DARWIN_VERSION) + ifdef DEPLOYMENT_TARGET + SharedLinkOptions += $(DEPLOYMENT_TARGET) + else + ifneq ($(ARCH),ARM) + SharedLinkOptions += -mmacosx-version-min=$(DARWIN_VERSION) + endif endif else SharedLinkOptions=-shared endif ifeq ($(TARGET_OS),Darwin) - ifneq ($(ARCH),ARM) - TargetCommonOpts += -mmacosx-version-min=$(DARWIN_VERSION) + ifdef DEPLOYMENT_TARGET + TargetCommonOpts += $(DEPLOYMENT_TARGET) + else + ifneq ($(ARCH),ARM) + TargetCommonOpts += -mmacosx-version-min=$(DARWIN_VERSION) + endif endif endif @@ -745,7 +753,7 @@ ObjectsBC := $(BaseNameSources:%=$(ObjDir)/%.bc) #---------------------------------------------------------- ifeq (-mingw32,$(findstring -mingw32,$(BUILD_TRIPLE))) - ECHOPATH := $(Verb)python -u -c "import sys;print ' '.join(sys.argv[1:])" + ECHOPATH := $(Verb)$(PYTHON) -u -c "import sys;print ' '.join(sys.argv[1:])" else ECHOPATH := $(Verb)$(ECHO) endif diff --git a/projects/sample/autoconf/config.sub b/projects/sample/autoconf/config.sub index 9942491..9d22c1e 100755 --- a/projects/sample/autoconf/config.sub +++ b/projects/sample/autoconf/config.sub @@ -251,7 +251,8 @@ case $basic_machine in | alpha64 | alpha64ev[4-8] | alpha64ev56 | alpha64ev6[78] | alpha64pca5[67] \ | am33_2.0 \ | arc | arm | arm[bl]e | arme[lb] | armv[2345] | armv[345][lb] | avr | avr32 \ - | be32 | be64 \ + | be32 | be64 \ + | aarch64 \ | bfin \ | c4x | clipper \ | d10v | d30v | dlx | dsp16xx \ @@ -359,6 +360,7 @@ case $basic_machine in | alpha64-* | alpha64ev[4-8]-* | alpha64ev56-* | alpha64ev6[78]-* \ | alphapca5[67]-* | alpha64pca5[67]-* | arc-* \ | arm-* | armbe-* | armle-* | armeb-* | armv*-* \ + | aarch64-* \ | avr-* | avr32-* \ | be32-* | be64-* \ | bfin-* | bs2000-* \ diff --git a/projects/sample/autoconf/configure.ac b/projects/sample/autoconf/configure.ac index 1763ea2..283bc12 100644 --- a/projects/sample/autoconf/configure.ac +++ b/projects/sample/autoconf/configure.ac @@ -304,6 +304,7 @@ AC_CACHE_CHECK([target architecture],[llvm_cv_target_arch], sparc*-*) llvm_cv_target_arch="Sparc" ;; powerpc*-*) llvm_cv_target_arch="PowerPC" ;; arm*-*) llvm_cv_target_arch="ARM" ;; + aarch64*-*) llvm_cv_target_arch="AArch64" ;; mips-* | mips64-*) llvm_cv_target_arch="Mips" ;; mipsel-* | mips64el-*) llvm_cv_target_arch="Mips" ;; xcore-*) llvm_cv_target_arch="XCore" ;; @@ -474,6 +475,7 @@ else PowerPC) AC_SUBST(TARGET_HAS_JIT,1) ;; x86_64) AC_SUBST(TARGET_HAS_JIT,1) ;; ARM) AC_SUBST(TARGET_HAS_JIT,1) ;; + AArch64) AC_SUBST(TARGET_HAS_JIT,0) ;; Mips) AC_SUBST(TARGET_HAS_JIT,1) ;; XCore) AC_SUBST(TARGET_HAS_JIT,0) ;; MSP430) AC_SUBST(TARGET_HAS_JIT,0) ;; @@ -596,7 +598,7 @@ if test "$enableval" = host-only ; then enableval=host fi case "$enableval" in - all) TARGETS_TO_BUILD="X86 Sparc PowerPC ARM Mips XCore MSP430 Hexagon CppBackend MBlaze NVPTX" ;; + all) TARGETS_TO_BUILD="X86 Sparc PowerPC ARM AArch64 Mips XCore MSP430 
Hexagon CppBackend MBlaze NVPTX" ;; *)for a_target in `echo $enableval|sed -e 's/,/ /g' ` ; do case "$a_target" in x86) TARGETS_TO_BUILD="X86 $TARGETS_TO_BUILD" ;; @@ -604,6 +606,7 @@ case "$enableval" in sparc) TARGETS_TO_BUILD="Sparc $TARGETS_TO_BUILD" ;; powerpc) TARGETS_TO_BUILD="PowerPC $TARGETS_TO_BUILD" ;; arm) TARGETS_TO_BUILD="ARM $TARGETS_TO_BUILD" ;; + aarch64) TARGETS_TO_BUILD="AArch64 $TARGETS_TO_BUILD" ;; mips) TARGETS_TO_BUILD="Mips $TARGETS_TO_BUILD" ;; xcore) TARGETS_TO_BUILD="XCore $TARGETS_TO_BUILD" ;; msp430) TARGETS_TO_BUILD="MSP430 $TARGETS_TO_BUILD" ;; @@ -617,6 +620,7 @@ case "$enableval" in Sparc) TARGETS_TO_BUILD="Sparc $TARGETS_TO_BUILD" ;; PowerPC) TARGETS_TO_BUILD="PowerPC $TARGETS_TO_BUILD" ;; ARM) TARGETS_TO_BUILD="ARM $TARGETS_TO_BUILD" ;; + AArch64) TARGETS_TO_BUILD="AArch64 $TARGETS_TO_BUILD" ;; Mips) TARGETS_TO_BUILD="Mips $TARGETS_TO_BUILD" ;; MBlaze) TARGETS_TO_BUILD="MBlaze $TARGETS_TO_BUILD" ;; XCore) TARGETS_TO_BUILD="XCore $TARGETS_TO_BUILD" ;; diff --git a/projects/sample/configure b/projects/sample/configure index 94e931f..a8fc4bf 100755 --- a/projects/sample/configure +++ b/projects/sample/configure @@ -3844,6 +3844,7 @@ else sparc*-*) llvm_cv_target_arch="Sparc" ;; powerpc*-*) llvm_cv_target_arch="PowerPC" ;; arm*-*) llvm_cv_target_arch="ARM" ;; + aarch64*-*) llvm_cv_target_arch="AArch64" ;; mips-* | mips64-*) llvm_cv_target_arch="Mips" ;; mipsel-* | mips64el-*) llvm_cv_target_arch="Mips" ;; xcore-*) llvm_cv_target_arch="XCore" ;; @@ -5101,6 +5102,8 @@ else ;; ARM) TARGET_HAS_JIT=1 ;; + AArch64) TARGET_HAS_JIT=0 + ;; Mips) TARGET_HAS_JIT=1 ;; XCore) TARGET_HAS_JIT=0 @@ -5297,7 +5300,7 @@ if test "$enableval" = host-only ; then enableval=host fi case "$enableval" in - all) TARGETS_TO_BUILD="X86 Sparc PowerPC ARM Mips XCore MSP430 Hexagon CppBackend MBlaze NVPTX" ;; + all) TARGETS_TO_BUILD="X86 Sparc PowerPC ARM AArch64 Mips XCore MSP430 Hexagon CppBackend MBlaze NVPTX" ;; *)for a_target in `echo $enableval|sed -e 's/,/ /g' ` ; do case "$a_target" in x86) TARGETS_TO_BUILD="X86 $TARGETS_TO_BUILD" ;; @@ -5305,6 +5308,7 @@ case "$enableval" in sparc) TARGETS_TO_BUILD="Sparc $TARGETS_TO_BUILD" ;; powerpc) TARGETS_TO_BUILD="PowerPC $TARGETS_TO_BUILD" ;; arm) TARGETS_TO_BUILD="ARM $TARGETS_TO_BUILD" ;; + aarch64) TARGETS_TO_BUILD="AArch64 $TARGETS_TO_BUILD" ;; mips) TARGETS_TO_BUILD="Mips $TARGETS_TO_BUILD" ;; xcore) TARGETS_TO_BUILD="XCore $TARGETS_TO_BUILD" ;; msp430) TARGETS_TO_BUILD="MSP430 $TARGETS_TO_BUILD" ;; @@ -5318,6 +5322,7 @@ case "$enableval" in Sparc) TARGETS_TO_BUILD="Sparc $TARGETS_TO_BUILD" ;; PowerPC) TARGETS_TO_BUILD="PowerPC $TARGETS_TO_BUILD" ;; ARM) TARGETS_TO_BUILD="ARM $TARGETS_TO_BUILD" ;; + AArch64) TARGETS_TO_BUILD="AArch64 $TARGETS_TO_BUILD" ;; Mips) TARGETS_TO_BUILD="Mips $TARGETS_TO_BUILD" ;; MBlaze) TARGETS_TO_BUILD="MBlaze $TARGETS_TO_BUILD" ;; XCore) TARGETS_TO_BUILD="XCore $TARGETS_TO_BUILD" ;; @@ -10348,7 +10353,7 @@ else lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2 lt_status=$lt_dlunknown cat > conftest.$ac_ext <<EOF -#line 10351 "configure" +#line 10356 "configure" #include "confdefs.h" #if HAVE_DLFCN_H diff --git a/runtime/libprofile/Makefile b/runtime/libprofile/Makefile index 6e92253..44ea204 100644 --- a/runtime/libprofile/Makefile +++ b/runtime/libprofile/Makefile @@ -50,7 +50,7 @@ ifeq ($(HOST_OS),Darwin) endif # If we're doing an Apple-style build, add the LTO object path. 
- ifeq ($(RC_BUILDIT),YES) + ifeq ($(RC_XBS),YES) TempFile := $(shell mkdir -p ${OBJROOT}/dSYMs ; mktemp ${OBJROOT}/dSYMs/profile_rt-lto.XXXXXX) LLVMLibsOptions := $(LLVMLibsOptions) \ -Wl,-object_path_lto -Wl,$(TempFile) diff --git a/test/Analysis/BasicAA/intrinsics.ll b/test/Analysis/BasicAA/intrinsics.ll index 59725cf..c1cf587 100644 --- a/test/Analysis/BasicAA/intrinsics.ll +++ b/test/Analysis/BasicAA/intrinsics.ll @@ -7,7 +7,7 @@ target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32- ; CHECK: define <8 x i16> @test0(i8* noalias %p, i8* noalias %q, <8 x i16> %y) { ; CHECK-NEXT: entry: -; CHECK-NEXT: %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) nounwind +; CHECK-NEXT: %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) [[ATTR:#[0-9]+]] ; CHECK-NEXT: call void @llvm.arm.neon.vst1.v8i16(i8* %q, <8 x i16> %y, i32 16) ; CHECK-NEXT: %c = add <8 x i16> %a, %a define <8 x i16> @test0(i8* noalias %p, i8* noalias %q, <8 x i16> %y) { @@ -22,7 +22,7 @@ entry: ; CHECK: define <8 x i16> @test1(i8* %p, <8 x i16> %y) { ; CHECK-NEXT: entry: ; CHECK-NEXT: %q = getelementptr i8* %p, i64 16 -; CHECK-NEXT: %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) nounwind +; CHECK-NEXT: %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) [[ATTR]] ; CHECK-NEXT: call void @llvm.arm.neon.vst1.v8i16(i8* %q, <8 x i16> %y, i32 16) ; CHECK-NEXT: %c = add <8 x i16> %a, %a define <8 x i16> @test1(i8* %p, <8 x i16> %y) { @@ -37,3 +37,6 @@ entry: declare <8 x i16> @llvm.arm.neon.vld1.v8i16(i8*, i32) nounwind readonly declare void @llvm.arm.neon.vst1.v8i16(i8*, <8 x i16>, i32) nounwind + +; CHECK: attributes #0 = { nounwind readonly } +; CHECK: attributes [[ATTR]] = { nounwind } diff --git a/test/Analysis/BasicAA/pure-const-dce.ll b/test/Analysis/BasicAA/pure-const-dce.ll index 266e607..e489928 100644 --- a/test/Analysis/BasicAA/pure-const-dce.ll +++ b/test/Analysis/BasicAA/pure-const-dce.ll @@ -4,11 +4,11 @@ ; CHECK: @test ; CHECK: entry -; CHECK: %tmp0 = call i32 @TestConst(i32 5) readnone -; CHECK-NEXT: %tmp1 = call i32 @TestPure(i32 6) readonly +; CHECK: %tmp0 = call i32 @TestConst(i32 5) [[READNONE:#[0-9]+]] +; CHECK-NEXT: %tmp1 = call i32 @TestPure(i32 6) [[READONLY:#[0-9]+]] ; CHECK-NEXT: %tmp2 = call i32 @TestNone(i32 7) ; CHECK-NEXT: store i32 1, i32* @g -; CHECK-NEXT: %tmp5 = call i32 @TestPure(i32 6) readonly +; CHECK-NEXT: %tmp5 = call i32 @TestPure(i32 6) [[READONLY]] ; CHECK-NEXT: %tmp7 = call i32 @TestNone(i32 7) ; CHECK-NEXT: %tmp8 = call i32 @TestNone(i32 7) ; CHECK-NEXT: %sum0 = add i32 %tmp0, %tmp1 @@ -49,3 +49,6 @@ declare i32 @TestConst(i32) readnone declare i32 @TestPure(i32) readonly declare i32 @TestNone(i32) + +; CHECK: attributes [[READNONE]] = { readnone } +; CHECK: attributes [[READONLY]] = { readonly } diff --git a/test/Analysis/CostModel/ARM/cast.ll b/test/Analysis/CostModel/ARM/cast.ll new file mode 100644 index 0000000..464b6ec --- /dev/null +++ b/test/Analysis/CostModel/ARM/cast.ll @@ -0,0 +1,158 @@ +; RUN: opt < %s -cost-model -analyze -mtriple=thumbv7-apple-ios6.0.0 -mcpu=swift | FileCheck %s +target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32" +target triple = "thumbv7-apple-ios6.0.0" + +define i32 @casts() { + + ; -- scalars -- + ; CHECK: cost of 1 {{.*}} sext + %r0 = sext i1 undef to i8 + ; CHECK: cost of 1 {{.*}} zext + %r1 = zext i1 undef to i8 + ; CHECK: cost of 1 {{.*}} sext + %r2 = sext i1 undef to i16 + ; CHECK: cost of 1 
{{.*}} zext + %r3 = zext i1 undef to i16 + ; CHECK: cost of 1 {{.*}} sext + %r4 = sext i1 undef to i32 + ; CHECK: cost of 1 {{.*}} zext + %r5 = zext i1 undef to i32 + ; CHECK: cost of 1 {{.*}} sext + %r6 = sext i1 undef to i64 + ; CHECK: cost of 1 {{.*}} zext + %r7 = zext i1 undef to i64 + ; CHECK: cost of 0 {{.*}} trunc + %r8 = trunc i8 undef to i1 + ; CHECK: cost of 1 {{.*}} sext + %r9 = sext i8 undef to i16 + ; CHECK: cost of 1 {{.*}} zext + %r10 = zext i8 undef to i16 + ; CHECK: cost of 1 {{.*}} sext + %r11 = sext i8 undef to i32 + ; CHECK: cost of 1 {{.*}} zext + %r12 = zext i8 undef to i32 + ; CHECK: cost of 1 {{.*}} sext + %r13 = sext i8 undef to i64 + ; CHECK: cost of 1 {{.*}} zext + %r14 = zext i8 undef to i64 + ; CHECK: cost of 0 {{.*}} trunc + %r15 = trunc i16 undef to i1 + ; CHECK: cost of 0 {{.*}} trunc + %r16 = trunc i16 undef to i8 + ; CHECK: cost of 1 {{.*}} sext + %r17 = sext i16 undef to i32 + ; CHECK: cost of 1 {{.*}} zext + %r18 = zext i16 undef to i32 + ; CHECK: cost of 2 {{.*}} sext + %r19 = sext i16 undef to i64 + ; CHECK: cost of 1 {{.*}} zext + %r20 = zext i16 undef to i64 + ; CHECK: cost of 0 {{.*}} trunc + %r21 = trunc i32 undef to i1 + ; CHECK: cost of 0 {{.*}} trunc + %r22 = trunc i32 undef to i8 + ; CHECK: cost of 0 {{.*}} trunc + %r23 = trunc i32 undef to i16 + ; CHECK: cost of 1 {{.*}} sext + %r24 = sext i32 undef to i64 + ; CHECK: cost of 1 {{.*}} zext + %r25 = zext i32 undef to i64 + ; CHECK: cost of 0 {{.*}} trunc + %r26 = trunc i64 undef to i1 + ; CHECK: cost of 0 {{.*}} trunc + %r27 = trunc i64 undef to i8 + ; CHECK: cost of 0 {{.*}} trunc + %r28 = trunc i64 undef to i16 + ; CHECK: cost of 0 {{.*}} trunc + %r29 = trunc i64 undef to i32 + + ; -- floating point conversions -- + ; Moves between scalar and NEON registers. 
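+ ; (Illustrative, not from the original test: a cost of 2 corresponds to a + ; two-instruction lowering such as vcvt.s32.f32 s0, s0 followed by + ; vmov r0, s0 to move the result between register files; the cost-10 + ; conversions involving i64 are lowered as runtime library calls.)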
+ ; CHECK: cost of 2 {{.*}} fptoui + %r30 = fptoui float undef to i1 + ; CHECK: cost of 2 {{.*}} fptosi + %r31 = fptosi float undef to i1 + ; CHECK: cost of 2 {{.*}} fptoui + %r32 = fptoui float undef to i8 + ; CHECK: cost of 2 {{.*}} fptosi + %r33 = fptosi float undef to i8 + ; CHECK: cost of 2 {{.*}} fptoui + %r34 = fptoui float undef to i16 + ; CHECK: cost of 2 {{.*}} fptosi + %r35 = fptosi float undef to i16 + ; CHECK: cost of 2 {{.*}} fptoui + %r36 = fptoui float undef to i32 + ; CHECK: cost of 2 {{.*}} fptosi + %r37 = fptosi float undef to i32 + ; CHECK: cost of 10 {{.*}} fptoui + %r38 = fptoui float undef to i64 + ; CHECK: cost of 10 {{.*}} fptosi + %r39 = fptosi float undef to i64 + ; CHECK: cost of 2 {{.*}} fptoui + %r40 = fptoui double undef to i1 + ; CHECK: cost of 2 {{.*}} fptosi + %r41 = fptosi double undef to i1 + ; CHECK: cost of 2 {{.*}} fptoui + %r42 = fptoui double undef to i8 + ; CHECK: cost of 2 {{.*}} fptosi + %r43 = fptosi double undef to i8 + ; CHECK: cost of 2 {{.*}} fptoui + %r44 = fptoui double undef to i16 + ; CHECK: cost of 2 {{.*}} fptosi + %r45 = fptosi double undef to i16 + ; CHECK: cost of 2 {{.*}} fptoui + %r46 = fptoui double undef to i32 + ; CHECK: cost of 2 {{.*}} fptosi + %r47 = fptosi double undef to i32 + ; Function call + ; CHECK: cost of 10 {{.*}} fptoui + %r48 = fptoui double undef to i64 + ; CHECK: cost of 10 {{.*}} fptosi + %r49 = fptosi double undef to i64 + + ; CHECK: cost of 2 {{.*}} sitofp + %r50 = sitofp i1 undef to float + ; CHECK: cost of 2 {{.*}} uitofp + %r51 = uitofp i1 undef to float + ; CHECK: cost of 2 {{.*}} sitofp + %r52 = sitofp i1 undef to double + ; CHECK: cost of 2 {{.*}} uitofp + %r53 = uitofp i1 undef to double + ; CHECK: cost of 2 {{.*}} sitofp + %r54 = sitofp i8 undef to float + ; CHECK: cost of 2 {{.*}} uitofp + %r55 = uitofp i8 undef to float + ; CHECK: cost of 2 {{.*}} sitofp + %r56 = sitofp i8 undef to double + ; CHECK: cost of 2 {{.*}} uitofp + %r57 = uitofp i8 undef to double + ; CHECK: cost of 2 {{.*}} sitofp + %r58 = sitofp i16 undef to float + ; CHECK: cost of 2 {{.*}} uitofp + %r59 = uitofp i16 undef to float + ; CHECK: cost of 2 {{.*}} sitofp + %r60 = sitofp i16 undef to double + ; CHECK: cost of 2 {{.*}} uitofp + %r61 = uitofp i16 undef to double + ; CHECK: cost of 2 {{.*}} sitofp + %r62 = sitofp i32 undef to float + ; CHECK: cost of 2 {{.*}} uitofp + %r63 = uitofp i32 undef to float + ; CHECK: cost of 2 {{.*}} sitofp + %r64 = sitofp i32 undef to double + ; CHECK: cost of 2 {{.*}} uitofp + %r65 = uitofp i32 undef to double + ; Function call + ; CHECK: cost of 10 {{.*}} sitofp + %r66 = sitofp i64 undef to float + ; CHECK: cost of 10 {{.*}} uitofp + %r67 = uitofp i64 undef to float + ; CHECK: cost of 10 {{.*}} sitofp + %r68 = sitofp i64 undef to double + ; CHECK: cost of 10 {{.*}} uitofp + %r69 = uitofp i64 undef to double + + ;CHECK: cost of 0 {{.*}} ret + ret i32 undef +} + diff --git a/test/Analysis/CostModel/ARM/gep.ll b/test/Analysis/CostModel/ARM/gep.ll new file mode 100644 index 0000000..a63b87d --- /dev/null +++ b/test/Analysis/CostModel/ARM/gep.ll @@ -0,0 +1,43 @@ +; RUN: opt -cost-model -analyze -mtriple=thumbv7-apple-ios6.0.0 -mcpu=swift < %s | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32" +target triple = "thumbv7-apple-ios6.0.0" + +define void @test_geps() { + ; Cost of scalar integer geps should be one. We can't always expect it to be + ; folded into the instruction addressing mode. 
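+ ; (Illustrative: a gep feeding exactly one scalar access can fold, as in + ; ldr r0, [r1, r2, lsl #2], but a reused pointer, or one feeding a NEON + ; vld1/vst1, needs its own address arithmetic, hence the conservative + ; cost of 1 assumed here.)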
+;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i8* + %a0 = getelementptr inbounds i8* undef, i32 0 +;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i16* + %a1 = getelementptr inbounds i16* undef, i32 0 +;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i32* + %a2 = getelementptr inbounds i32* undef, i32 0 + +;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i64* + %a3 = getelementptr inbounds i64* undef, i32 0 + + ; Cost of scalar floating point geps should be one. We cannot fold the address + ; computation. +;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds float* + %a4 = getelementptr inbounds float* undef, i32 0 +;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds double* + %a5 = getelementptr inbounds double* undef, i32 0 + + + ; Cost of vector geps should be one. We cannot fold the address computation. +;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds <4 x i8>* + %a7 = getelementptr inbounds <4 x i8>* undef, i32 0 +;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds <4 x i16>* + %a8 = getelementptr inbounds <4 x i16>* undef, i32 0 +;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds <4 x i32>* + %a9 = getelementptr inbounds <4 x i32>* undef, i32 0 +;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds <4 x i64>* + %a10 = getelementptr inbounds <4 x i64>* undef, i32 0 +;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds <4 x float>* + %a11 = getelementptr inbounds <4 x float>* undef, i32 0 +;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds <4 x double>* + %a12 = getelementptr inbounds <4 x double>* undef, i32 0 + + + ret void +} diff --git a/test/Analysis/CostModel/ARM/insertelement.ll b/test/Analysis/CostModel/ARM/insertelement.ll new file mode 100644 index 0000000..f951b08 --- /dev/null +++ b/test/Analysis/CostModel/ARM/insertelement.ll @@ -0,0 +1,46 @@ +; RUN: opt -cost-model -analyze -mtriple=thumbv7-apple-ios6.0.0 -mcpu=swift < %s | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32" +target triple = "thumbv7-apple-ios6.0.0" + +; Multiple insert elements from loads into d subregisters are expensive on swift +; due to renaming constraints. 
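+; (Illustrative: each lane write, e.g. vmov.8 d0[1], r0, reads and rewrites +; the whole d register, so back-to-back inserts form a dependency chain that +; register renaming cannot break; the estimated cost of 3 below approximates +; that penalty.)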
+%T_i8v = type <8 x i8> +%T_i8 = type i8 +; CHECK: insertelement_i8 +define void @insertelement_i8(%T_i8* %saddr, + %T_i8v* %vaddr) { + %v0 = load %T_i8v* %vaddr + %v1 = load %T_i8* %saddr +;CHECK: estimated cost of 3 for {{.*}} insertelement <8 x i8> + %v2 = insertelement %T_i8v %v0, %T_i8 %v1, i32 1 + store %T_i8v %v2, %T_i8v* %vaddr + ret void +} + + +%T_i16v = type <4 x i16> +%T_i16 = type i16 +; CHECK: insertelement_i16 +define void @insertelement_i16(%T_i16* %saddr, + %T_i16v* %vaddr) { + %v0 = load %T_i16v* %vaddr + %v1 = load %T_i16* %saddr +;CHECK: estimated cost of 3 for {{.*}} insertelement <4 x i16> + %v2 = insertelement %T_i16v %v0, %T_i16 %v1, i32 1 + store %T_i16v %v2, %T_i16v* %vaddr + ret void +} + +%T_i32v = type <2 x i32> +%T_i32 = type i32 +; CHECK: insertelement_i32 +define void @insertelement_i32(%T_i32* %saddr, + %T_i32v* %vaddr) { + %v0 = load %T_i32v* %vaddr + %v1 = load %T_i32* %saddr +;CHECK: estimated cost of 3 for {{.*}} insertelement <2 x i32> + %v2 = insertelement %T_i32v %v0, %T_i32 %v1, i32 1 + store %T_i32v %v2, %T_i32v* %vaddr + ret void +} diff --git a/test/Analysis/CostModel/ARM/lit.local.cfg b/test/Analysis/CostModel/ARM/lit.local.cfg new file mode 100644 index 0000000..cb77b09 --- /dev/null +++ b/test/Analysis/CostModel/ARM/lit.local.cfg @@ -0,0 +1,6 @@ +config.suffixes = ['.ll', '.c', '.cpp'] + +targets = set(config.root.targets_to_build.split()) +if not 'ARM' in targets: + config.unsupported = True + diff --git a/test/Analysis/CostModel/ARM/select.ll b/test/Analysis/CostModel/ARM/select.ll new file mode 100644 index 0000000..96afccf --- /dev/null +++ b/test/Analysis/CostModel/ARM/select.ll @@ -0,0 +1,54 @@ +; RUN: opt < %s -cost-model -analyze -mtriple=thumbv7-apple-ios6.0.0 -mcpu=swift | FileCheck %s +target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32" +target triple = "thumbv7-apple-ios6.0.0" + +; CHECK: casts +define void @casts() { + ; Scalar values + ; CHECK: cost of 1 {{.*}} select + %v1 = select i1 undef, i8 undef, i8 undef + ; CHECK: cost of 1 {{.*}} select + %v2 = select i1 undef, i16 undef, i16 undef + ; CHECK: cost of 1 {{.*}} select + %v3 = select i1 undef, i32 undef, i32 undef + ; CHECK: cost of 2 {{.*}} select + %v4 = select i1 undef, i64 undef, i64 undef + ; CHECK: cost of 1 {{.*}} select + %v5 = select i1 undef, float undef, float undef + ; CHECK: cost of 1 {{.*}} select + %v6 = select i1 undef, double undef, double undef + + ; Vector values + ; CHECK: cost of 1 {{.*}} select + %v7 = select <2 x i1> undef, <2 x i8> undef, <2 x i8> undef + ; CHECK: cost of 1 {{.*}} select + %v8 = select <4 x i1> undef, <4 x i8> undef, <4 x i8> undef + ; CHECK: cost of 1 {{.*}} select + %v9 = select <8 x i1> undef, <8 x i8> undef, <8 x i8> undef + ; CHECK: cost of 1 {{.*}} select + %v10 = select <16 x i1> undef, <16 x i8> undef, <16 x i8> undef + + ; CHECK: cost of 1 {{.*}} select + %v11 = select <2 x i1> undef, <2 x i16> undef, <2 x i16> undef + ; CHECK: cost of 1 {{.*}} select + %v12 = select <4 x i1> undef, <4 x i16> undef, <4 x i16> undef + ; CHECK: cost of 1 {{.*}} select + %v13 = select <8 x i1> undef, <8 x i16> undef, <8 x i16> undef + + ; CHECK: cost of 1 {{.*}} select + %v14 = select <2 x i1> undef, <2 x i32> undef, <2 x i32> undef + ; CHECK: cost of 1 {{.*}} select + %v15 = select <4 x i1> undef, <4 x i32> undef, <4 x i32> undef + ; CHECK: cost of 1 {{.*}} select + %v16 = select <2 x i1> undef, <2 x i64> undef, <2 x i64> undef + + ; CHECK: cost of 1 {{.*}} 
select + %v17 = select <2 x i1> undef, <2 x float> undef, <2 x float> undef + ; CHECK: cost of 1 {{.*}} select + %v18 = select <4 x i1> undef, <4 x float> undef, <4 x float> undef + + ; CHECK: cost of 1 {{.*}} select + %v19 = select <2 x i1> undef, <2 x double> undef, <2 x double> undef + + ret void +} diff --git a/test/Analysis/CostModel/ARM/shuffle.ll b/test/Analysis/CostModel/ARM/shuffle.ll new file mode 100644 index 0000000..c92d668 --- /dev/null +++ b/test/Analysis/CostModel/ARM/shuffle.ll @@ -0,0 +1,40 @@ +; RUN: opt < %s -cost-model -analyze -mtriple=thumbv7-apple-ios6.0.0 -mcpu=swift | FileCheck %s +target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32" +target triple = "thumbv7-apple-ios6.0.0" + +; CHECK: shuffle +define void @shuffle() { + + + ;; Reverse shuffles should be lowered to vrev and possibly a vext (for + ;; quadwords) + + ; Vector values + ; CHECK: cost of 1 {{.*}} shuffle + %v7 = shufflevector <2 x i8> undef, <2 x i8>undef, <2 x i32> <i32 1, i32 0> + ; CHECK: cost of 1 {{.*}} shuffle + %v8 = shufflevector <4 x i8> undef, <4 x i8>undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> + ; CHECK: cost of 1 {{.*}} shuffle + %v9 = shufflevector <8 x i8> undef, <8 x i8>undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> + ; CHECK: cost of 2 {{.*}} shuffle + %v10 = shufflevector <16 x i8> undef, <16 x i8>undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> + + ; CHECK: cost of 1 {{.*}} shuffle + %v11 = shufflevector <2 x i16> undef, <2 x i16>undef, <2 x i32> <i32 1, i32 0> + ; CHECK: cost of 1 {{.*}} shuffle + %v12 = shufflevector <4 x i16> undef, <4 x i16>undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> + ; CHECK: cost of 2 {{.*}} shuffle + %v13 = shufflevector <8 x i16> undef, <8 x i16>undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> + + ; CHECK: cost of 1 {{.*}} shuffle + %v14 = shufflevector <2 x i32> undef, <2 x i32>undef, <2 x i32> <i32 1, i32 0> + ; CHECK: cost of 2 {{.*}} shuffle + %v15 = shufflevector <4 x i32> undef, <4 x i32>undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> + + ; CHECK: cost of 1 {{.*}} shuffle + %v16 = shufflevector <2 x float> undef, <2 x float>undef, <2 x i32> <i32 1, i32 0> + ; CHECK: cost of 2 {{.*}} shuffle + %v17 = shufflevector <4 x float> undef, <4 x float>undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> + + ret void +} diff --git a/test/Analysis/CostModel/PowerPC/insert_extract.ll b/test/Analysis/CostModel/PowerPC/insert_extract.ll new file mode 100644 index 0000000..f51963d --- /dev/null +++ b/test/Analysis/CostModel/PowerPC/insert_extract.ll @@ -0,0 +1,16 @@ +; RUN: opt < %s -cost-model -analyze -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 | FileCheck %s +target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" +target triple = "powerpc64-unknown-linux-gnu" + +define i32 @insert(i32 %arg) { + ; CHECK: cost of 13 {{.*}} insertelement + %x = insertelement <4 x i32> undef, i32 %arg, i32 0 + ret i32 undef +} + +define i32 @extract(<4 x i32> %arg) { + ; CHECK: cost of 13 {{.*}} extractelement + %x = extractelement <4 x i32> %arg, i32 0 + ret i32 %x +} + diff --git a/test/Analysis/CostModel/PowerPC/lit.local.cfg b/test/Analysis/CostModel/PowerPC/lit.local.cfg new file mode 100644 index 0000000..4019eca --- /dev/null +++ b/test/Analysis/CostModel/PowerPC/lit.local.cfg @@ 
-0,0 +1,6 @@ +config.suffixes = ['.ll', '.c', '.cpp'] + +targets = set(config.root.targets_to_build.split()) +if not 'PowerPC' in targets: + config.unsupported = True + diff --git a/test/Analysis/CostModel/PowerPC/load_store.ll b/test/Analysis/CostModel/PowerPC/load_store.ll new file mode 100644 index 0000000..c77cce9 --- /dev/null +++ b/test/Analysis/CostModel/PowerPC/load_store.ll @@ -0,0 +1,34 @@ +; RUN: opt < %s -cost-model -analyze -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 | FileCheck %s +target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" +target triple = "powerpc64-unknown-linux-gnu" + +define i32 @stores(i32 %arg) { + + ; CHECK: cost of 1 {{.*}} store + store i8 undef, i8* undef, align 4 + ; CHECK: cost of 1 {{.*}} store + store i16 undef, i16* undef, align 4 + ; CHECK: cost of 1 {{.*}} store + store i32 undef, i32* undef, align 4 + ; CHECK: cost of 2 {{.*}} store + store i64 undef, i64* undef, align 4 + ; CHECK: cost of 4 {{.*}} store + store i128 undef, i128* undef, align 4 + + ret i32 undef +} +define i32 @loads(i32 %arg) { + ; CHECK: cost of 1 {{.*}} load + load i8* undef, align 4 + ; CHECK: cost of 1 {{.*}} load + load i16* undef, align 4 + ; CHECK: cost of 1 {{.*}} load + load i32* undef, align 4 + ; CHECK: cost of 2 {{.*}} load + load i64* undef, align 4 + ; CHECK: cost of 4 {{.*}} load + load i128* undef, align 4 + + ret i32 undef +} + diff --git a/test/Analysis/CostModel/X86/cast.ll b/test/Analysis/CostModel/X86/cast.ll index cedc682..bacc778 100644 --- a/test/Analysis/CostModel/X86/cast.ll +++ b/test/Analysis/CostModel/X86/cast.ll @@ -44,6 +44,10 @@ define i32 @zext_sext(<8 x i1> %in) { %B = zext <8 x i16> undef to <8 x i32> ;CHECK: cost of 1 {{.*}} sext %C = sext <4 x i32> undef to <4 x i64> + ;CHECK: cost of 8 {{.*}} sext + %C1 = sext <4 x i8> undef to <4 x i64> + ;CHECK: cost of 8 {{.*}} sext + %C2 = sext <4 x i16> undef to <4 x i64> ;CHECK: cost of 1 {{.*}} zext %D = zext <4 x i32> undef to <4 x i64> @@ -59,7 +63,7 @@ define i32 @zext_sext(<8 x i1> %in) { ret i32 undef } -define i32 @masks(<8 x i1> %in) { +define i32 @masks8(<8 x i1> %in) { ;CHECK: cost of 6 {{.*}} zext %Z = zext <8 x i1> %in to <8 x i32> ;CHECK: cost of 9 {{.*}} sext @@ -67,3 +71,9 @@ define i32 @masks(<8 x i1> %in) { ret i32 undef } +define i32 @masks4(<4 x i1> %in) { + ;CHECK: cost of 8 {{.*}} sext + %S = sext <4 x i1> %in to <4 x i64> + ret i32 undef +} + diff --git a/test/Analysis/CostModel/X86/gep.ll b/test/Analysis/CostModel/X86/gep.ll new file mode 100644 index 0000000..877184a --- /dev/null +++ b/test/Analysis/CostModel/X86/gep.ll @@ -0,0 +1,40 @@ +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + + +define void @test_geps() { + ; Cost should be zero. We expect it to be folded into + ; the instruction addressing mode.
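+ ; (Illustrative: an x86 addressing mode encodes base + index*scale + + ; displacement directly, as in movl (%rdi,%rax,4), %ecx, so these geps + ; need no separate instruction.)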
+;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i8* + %a0 = getelementptr inbounds i8* undef, i32 0 +;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i16* + %a1 = getelementptr inbounds i16* undef, i32 0 +;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i32* + %a2 = getelementptr inbounds i32* undef, i32 0 +;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i64* + %a3 = getelementptr inbounds i64* undef, i32 0 + +;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds float* + %a4 = getelementptr inbounds float* undef, i32 0 +;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds double* + %a5 = getelementptr inbounds double* undef, i32 0 + + ; Vector geps should also have zero cost. +;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds <4 x i8>* + %a7 = getelementptr inbounds <4 x i8>* undef, i32 0 +;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds <4 x i16>* + %a8 = getelementptr inbounds <4 x i16>* undef, i32 0 +;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds <4 x i32>* + %a9 = getelementptr inbounds <4 x i32>* undef, i32 0 +;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds <4 x i64>* + %a10 = getelementptr inbounds <4 x i64>* undef, i32 0 +;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds <4 x float>* + %a11 = getelementptr inbounds <4 x float>* undef, i32 0 +;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds <4 x double>* + %a12 = getelementptr inbounds <4 x double>* undef, i32 0 + + + ret void +} diff --git a/test/Analysis/Profiling/lit.local.cfg b/test/Analysis/Profiling/lit.local.cfg index 19eebc0..d507d3f 100644 --- a/test/Analysis/Profiling/lit.local.cfg +++ b/test/Analysis/Profiling/lit.local.cfg @@ -1 +1,13 @@ config.suffixes = ['.ll', '.c', '.cpp'] + +def getRoot(config): + if not config.parent: + return config + return getRoot(config.parent) + +root = getRoot(config) + +# Most profiling tests rely on a JIT being present to gather their data; AArch64 +# doesn't have any JIT at present so they will fail when run there. +if root.host_arch in ['AArch64']: + config.unsupported = True diff --git a/test/Analysis/TypeBasedAliasAnalysis/functionattrs.ll b/test/Analysis/TypeBasedAliasAnalysis/functionattrs.ll index 1ac5927..c6cc26a 100644 --- a/test/Analysis/TypeBasedAliasAnalysis/functionattrs.ll +++ b/test/Analysis/TypeBasedAliasAnalysis/functionattrs.ll @@ -9,13 +9,13 @@ ; invalid, as it's possible that this only happens after optimization on a ; code path which isn't ever executed. -; CHECK: define void @test0_yes(i32* nocapture %p) nounwind readnone { +; CHECK: define void @test0_yes(i32* nocapture %p) #0 { define void @test0_yes(i32* %p) nounwind { store i32 0, i32* %p, !tbaa !1 ret void } -; CHECK: define void @test0_no(i32* nocapture %p) nounwind { +; CHECK: define void @test0_no(i32* nocapture %p) #1 { define void @test0_no(i32* %p) nounwind { store i32 0, i32* %p, !tbaa !2 ret void @@ -24,13 +24,13 @@ define void @test0_no(i32* %p) nounwind { ; Add the readonly attribute, since there's just a call to a function which ; TBAA says doesn't modify any memory. 
-; CHECK: define void @test1_yes(i32* nocapture %p) nounwind readonly { +; CHECK: define void @test1_yes(i32* nocapture %p) #2 { define void @test1_yes(i32* %p) nounwind { call void @callee(i32* %p), !tbaa !1 ret void } -; CHECK: define void @test1_no(i32* %p) nounwind { +; CHECK: define void @test1_no(i32* %p) #1 { define void @test1_no(i32* %p) nounwind { call void @callee(i32* %p), !tbaa !2 ret void @@ -43,13 +43,13 @@ define void @test1_no(i32* %p) nounwind { ; This is unusual, since the function is memcpy, but as above, this ; isn't necessarily invalid. -; CHECK: define void @test2_yes(i8* nocapture %p, i8* nocapture %q, i64 %n) nounwind readnone { +; CHECK: define void @test2_yes(i8* nocapture %p, i8* nocapture %q, i64 %n) #0 { define void @test2_yes(i8* %p, i8* %q, i64 %n) nounwind { call void @llvm.memcpy.p0i8.p0i8.i64(i8* %p, i8* %q, i64 %n, i32 1, i1 false), !tbaa !1 ret void } -; CHECK: define void @test2_no(i8* nocapture %p, i8* nocapture %q, i64 %n) nounwind { +; CHECK: define void @test2_no(i8* nocapture %p, i8* nocapture %q, i64 %n) #1 { define void @test2_no(i8* %p, i8* %q, i64 %n) nounwind { call void @llvm.memcpy.p0i8.p0i8.i64(i8* %p, i8* %q, i64 %n, i32 1, i1 false), !tbaa !2 ret void @@ -57,13 +57,13 @@ define void @test2_no(i8* %p, i8* %q, i64 %n) nounwind { ; Similar to the others, va_arg only accesses memory through its operand. -; CHECK: define i32 @test3_yes(i8* nocapture %p) nounwind readnone { +; CHECK: define i32 @test3_yes(i8* nocapture %p) #0 { define i32 @test3_yes(i8* %p) nounwind { %t = va_arg i8* %p, i32, !tbaa !1 ret i32 %t } -; CHECK: define i32 @test3_no(i8* nocapture %p) nounwind { +; CHECK: define i32 @test3_no(i8* nocapture %p) #1 { define i32 @test3_no(i8* %p) nounwind { %t = va_arg i8* %p, i32, !tbaa !2 ret i32 %t @@ -72,6 +72,10 @@ define i32 @test3_no(i8* %p) nounwind { declare void @callee(i32* %p) nounwind declare void @llvm.memcpy.p0i8.p0i8.i64(i8*, i8*, i64, i32, i1) nounwind +; CHECK: attributes #0 = { nounwind readnone } +; CHECK: attributes #1 = { nounwind } +; CHECK: attributes #2 = { nounwind readonly } + ; Root note. 
!0 = metadata !{ } diff --git a/test/Analysis/TypeBasedAliasAnalysis/intrinsics.ll b/test/Analysis/TypeBasedAliasAnalysis/intrinsics.ll index 8f080e2..6f1c22d 100644 --- a/test/Analysis/TypeBasedAliasAnalysis/intrinsics.ll +++ b/test/Analysis/TypeBasedAliasAnalysis/intrinsics.ll @@ -7,7 +7,7 @@ target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32- ; CHECK: define <8 x i16> @test0(i8* %p, i8* %q, <8 x i16> %y) { ; CHECK-NEXT: entry: -; CHECK-NEXT: %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) nounwind +; CHECK-NEXT: %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) [[NUW:#[0-9]+]] ; CHECK-NEXT: call void @llvm.arm.neon.vst1.v8i16(i8* %q, <8 x i16> %y, i32 16) ; CHECK-NEXT: %c = add <8 x i16> %a, %a define <8 x i16> @test0(i8* %p, i8* %q, <8 x i16> %y) { @@ -22,6 +22,9 @@ entry: declare <8 x i16> @llvm.arm.neon.vld1.v8i16(i8*, i32) nounwind readonly declare void @llvm.arm.neon.vst1.v8i16(i8*, <8 x i16>, i32) nounwind +; CHECK: attributes #0 = { nounwind readonly } +; CHECK: attributes [[NUW]] = { nounwind } + !0 = metadata !{metadata !"tbaa root", null} !1 = metadata !{metadata !"A", metadata !0} !2 = metadata !{metadata !"B", metadata !0} diff --git a/test/Assembler/2008-09-02-FunctionNotes.ll b/test/Assembler/2008-09-02-FunctionNotes.ll index 761c91e..11a0411 100644 --- a/test/Assembler/2008-09-02-FunctionNotes.ll +++ b/test/Assembler/2008-09-02-FunctionNotes.ll @@ -1,14 +1,21 @@ ; Test function attributes -; RUN: llvm-as < %s | llvm-dis | grep inline | count 2 +; RUN: llvm-as < %s | llvm-dis | FileCheck %s +; CHECK: define void @fn1() #0 define void @fn1() alwaysinline { ret void } +; CHECK: define void @fn2() #1 define void @fn2() noinline { ret void } +; CHECK: define void @fn3() +; CHECK-NOT: define void @fn3() #{{.*}} define void @fn3() { ret void } + +; CHECK: attributes #0 = { alwaysinline } +; CHECK: attributes #1 = { noinline } diff --git a/test/Assembler/ConstantExprNoFold.ll b/test/Assembler/ConstantExprNoFold.ll new file mode 100644 index 0000000..83e8909 --- /dev/null +++ b/test/Assembler/ConstantExprNoFold.ll @@ -0,0 +1,23 @@ +; This test checks to make sure that constant exprs don't fold in some simple +; situations + +; RUN: llvm-as < %s | llvm-dis | FileCheck %s + +; Even give it a datalayout, to tempt folding as much as possible. +target datalayout = "p:32:32" + +@A = global i64 0 +@B = global i64 0 + +; Don't fold this. @A might really be allocated next to @B, in which case the +; icmp should return true. It's not valid to *dereference* in @B from a pointer +; based on @A, but icmp isn't a dereference. + +; CHECK: @C = global i1 icmp eq (i64* getelementptr inbounds (i64* @A, i64 1), i64* @B) +@C = global i1 icmp eq (i64* getelementptr inbounds (i64* @A, i64 1), i64* @B) + +; Don't fold this completely away either. In theory this could be simplified +; to only use a gep on one side of the icmp though. 
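+; (Sketch of that simplification: with i64 elements the offsets are 8 and 16 +; bytes, and @A + 8 == @B + 16 iff @A == @B + 8, i.e. +; icmp eq (i64* @A, i64* getelementptr inbounds (i64* @B, i64 1)); +; the test only requires that the expression is not folded away entirely.)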
+ +; CHECK: @D = global i1 icmp eq (i64* getelementptr inbounds (i64* @A, i64 1), i64* getelementptr inbounds (i64* @B, i64 2)) +@D = global i1 icmp eq (i64* getelementptr inbounds (i64* @A, i64 1), i64* getelementptr inbounds (i64* @B, i64 2)) diff --git a/test/Assembler/externally-initialized.ll b/test/Assembler/externally-initialized.ll new file mode 100644 index 0000000..4be6e62 --- /dev/null +++ b/test/Assembler/externally-initialized.ll @@ -0,0 +1,5 @@ +; RUN: llvm-as < %s | llvm-dis | llvm-as | llvm-dis | FileCheck %s + +; CHECK: @G = externally_initialized global i32 0 + +@G = externally_initialized global i32 0 diff --git a/test/Assembler/unnamed-addr.ll b/test/Assembler/unnamed-addr.ll index 3c94ca2..35b3b39 100644 --- a/test/Assembler/unnamed-addr.ll +++ b/test/Assembler/unnamed-addr.ll @@ -15,4 +15,6 @@ declare i32 @zed(%struct.foobar*, %struct.foobar*) ; CHECK: @bar.d = internal unnamed_addr constant %struct.foobar zeroinitializer, align 4 ; CHECK: @foo.d = internal constant %struct.foobar zeroinitializer, align 4 -; CHECK: define i32 @main() unnamed_addr nounwind ssp { +; CHECK: define i32 @main() unnamed_addr #0 { + +; CHECK: attributes #0 = { nounwind ssp } diff --git a/test/Bindings/Ocaml/vmcore.ml b/test/Bindings/Ocaml/vmcore.ml index 61be4b7..b49bab9 100644 --- a/test/Bindings/Ocaml/vmcore.ml +++ b/test/Bindings/Ocaml/vmcore.ml @@ -860,7 +860,8 @@ let test_builder () = group "function attribute"; begin ignore (add_function_attr fn Attribute.UWTable); - (* RUN: grep "X7.*uwtable" < %t.ll + (* RUN: grep "X7.*#0" < %t.ll + * RUN: grep "attributes #0 = .*uwtable.*" < %t.ll *) insist ([Attribute.UWTable] = function_attr fn); end; diff --git a/test/Bitcode/attributes.ll b/test/Bitcode/attributes.ll index 502e967..6c46e94 100644 --- a/test/Bitcode/attributes.ll +++ b/test/Bitcode/attributes.ll @@ -14,7 +14,7 @@ define void @f2(i8 signext) } define void @f3() noreturn -; CHECK: define void @f3() noreturn +; CHECK: define void @f3() #0 { ret void; } @@ -32,7 +32,7 @@ define void @f5(i8* sret) } define void @f6() nounwind -; CHECK: define void @f6() nounwind +; CHECK: define void @f6() #1 { ret void; } @@ -56,43 +56,43 @@ define void @f9(i8* nest) } define void @f10() readnone -; CHECK: define void @f10() readnone +; CHECK: define void @f10() #2 { ret void; } define void @f11() readonly -; CHECK: define void @f11() readonly +; CHECK: define void @f11() #3 { ret void; } define void @f12() noinline -; CHECK: define void @f12() noinline +; CHECK: define void @f12() #4 { ret void; } define void @f13() alwaysinline -; CHECK: define void @f13() alwaysinline +; CHECK: define void @f13() #5 { ret void; } define void @f14() optsize -; CHECK: define void @f14() optsize +; CHECK: define void @f14() #6 { ret void; } define void @f15() ssp -; CHECK: define void @f15() ssp +; CHECK: define void @f15() #7 { ret void; } define void @f16() sspreq -; CHECK: define void @f16() sspreq +; CHECK: define void @f16() #8 { ret void; } @@ -110,55 +110,93 @@ define void @f18(i8* nocapture) } define void @f19() noredzone -; CHECK: define void @f19() noredzone +; CHECK: define void @f19() #9 { ret void; } define void @f20() noimplicitfloat -; CHECK: define void @f20() noimplicitfloat +; CHECK: define void @f20() #10 { ret void; } define void @f21() naked -; CHECK: define void @f21() naked +; CHECK: define void @f21() #11 { ret void; } define void @f22() inlinehint -; CHECK: define void @f22() inlinehint +; CHECK: define void @f22() #12 { ret void; } define void @f23() alignstack(4) -; CHECK: define void 
@f23() alignstack(4) +; CHECK: define void @f23() #13 { ret void; } define void @f24() returns_twice -; CHECK: define void @f24() returns_twice +; CHECK: define void @f24() #14 { ret void; } define void @f25() uwtable -; CHECK: define void @f25() uwtable +; CHECK: define void @f25() #15 { ret void; } define void @f26() nonlazybind -; CHECK: define void @f26() nonlazybind +; CHECK: define void @f26() #16 { ret void; } -define void @f27() address_safety -; CHECK: define void @f27() address_safety +define void @f27() sanitize_address +; CHECK: define void @f27() #17 { ret void; } +define void @f28() sanitize_thread +; CHECK: define void @f28() #18 +{ + ret void; +} +define void @f29() sanitize_memory +; CHECK: define void @f29() #19 +{ + ret void; +} + +define void @f30() "cpu"="cortex-a8" +; CHECK: define void @f30() #20 +{ + ret void; +} + +; CHECK: attributes #0 = { noreturn } +; CHECK: attributes #1 = { nounwind } +; CHECK: attributes #2 = { readnone } +; CHECK: attributes #3 = { readonly } +; CHECK: attributes #4 = { noinline } +; CHECK: attributes #5 = { alwaysinline } +; CHECK: attributes #6 = { optsize } +; CHECK: attributes #7 = { ssp } +; CHECK: attributes #8 = { sspreq } +; CHECK: attributes #9 = { noredzone } +; CHECK: attributes #10 = { noimplicitfloat } +; CHECK: attributes #11 = { naked } +; CHECK: attributes #12 = { inlinehint } +; CHECK: attributes #13 = { alignstack=4 } +; CHECK: attributes #14 = { returns_twice } +; CHECK: attributes #15 = { uwtable } +; CHECK: attributes #16 = { nonlazybind } +; CHECK: attributes #17 = { sanitize_address } +; CHECK: attributes #18 = { sanitize_thread } +; CHECK: attributes #19 = { sanitize_memory } +; CHECK: attributes #20 = { "cpu"="cortex-a8" } diff --git a/test/Bitcode/ptest-new.ll b/test/Bitcode/ptest-new.ll index 276fb7a..735cc9c 100644 --- a/test/Bitcode/ptest-new.ll +++ b/test/Bitcode/ptest-new.ll @@ -13,10 +13,13 @@ entry: ret i32 %add2 } -; CHECK: declare i32 @llvm.x86.sse41.ptestc(<2 x i64>, <2 x i64>) nounwind readnone -; CHECK: declare i32 @llvm.x86.sse41.ptestz(<2 x i64>, <2 x i64>) nounwind readnone -; CHECK: declare i32 @llvm.x86.sse41.ptestnzc(<2 x i64>, <2 x i64>) nounwind readnone +; CHECK: declare i32 @llvm.x86.sse41.ptestc(<2 x i64>, <2 x i64>) #1 +; CHECK: declare i32 @llvm.x86.sse41.ptestz(<2 x i64>, <2 x i64>) #1 +; CHECK: declare i32 @llvm.x86.sse41.ptestnzc(<2 x i64>, <2 x i64>) #1 declare i32 @llvm.x86.sse41.ptestc(<2 x i64>, <2 x i64>) nounwind readnone declare i32 @llvm.x86.sse41.ptestz(<2 x i64>, <2 x i64>) nounwind readnone declare i32 @llvm.x86.sse41.ptestnzc(<2 x i64>, <2 x i64>) nounwind readnone + +; CHECK: attributes #0 = { nounwind } +; CHECK: attributes #1 = { nounwind readnone } diff --git a/test/Bitcode/ptest-old.ll b/test/Bitcode/ptest-old.ll index fc6ed8e..fbe962f 100644 --- a/test/Bitcode/ptest-old.ll +++ b/test/Bitcode/ptest-old.ll @@ -13,10 +13,13 @@ entry: ret i32 %add2 } -; CHECK: declare i32 @llvm.x86.sse41.ptestc(<2 x i64>, <2 x i64>) nounwind readnone -; CHECK: declare i32 @llvm.x86.sse41.ptestz(<2 x i64>, <2 x i64>) nounwind readnone -; CHECK: declare i32 @llvm.x86.sse41.ptestnzc(<2 x i64>, <2 x i64>) nounwind readnone +; CHECK: declare i32 @llvm.x86.sse41.ptestc(<2 x i64>, <2 x i64>) #1 +; CHECK: declare i32 @llvm.x86.sse41.ptestz(<2 x i64>, <2 x i64>) #1 +; CHECK: declare i32 @llvm.x86.sse41.ptestnzc(<2 x i64>, <2 x i64>) #1 declare i32 @llvm.x86.sse41.ptestc(<4 x float>, <4 x float>) nounwind readnone declare i32 @llvm.x86.sse41.ptestz(<4 x float>, <4 x float>) nounwind readnone declare 
i32 @llvm.x86.sse41.ptestnzc(<4 x float>, <4 x float>) nounwind readnone + +; CHECK: attributes #0 = { nounwind } +; CHECK: attributes #1 = { nounwind readnone } diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index e146ae1..3da7c18 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -24,6 +24,7 @@ set(LLVM_TEST_DEPENDS UnitTests llvm-nm llvm-objdump llvm-readobj + llvm-rtdyld macho-dump opt profile_rt-shared FileCheck count not diff --git a/test/CodeGen/AArch64/adc.ll b/test/CodeGen/AArch64/adc.ll new file mode 100644 index 0000000..7cb3732 --- /dev/null +++ b/test/CodeGen/AArch64/adc.ll @@ -0,0 +1,54 @@ +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s + +define i128 @test_simple(i128 %a, i128 %b, i128 %c) { +; CHECK: test_simple: + + %valadd = add i128 %a, %b +; CHECK: adds [[ADDLO:x[0-9]+]], x0, x2 +; CHECK-NEXT: adcs [[ADDHI:x[0-9]+]], x1, x3 + + %valsub = sub i128 %valadd, %c +; CHECK: subs x0, [[ADDLO]], x4 +; CHECK: sbcs x1, [[ADDHI]], x5 + + ret i128 %valsub +; CHECK: ret +} + +define i128 @test_imm(i128 %a) { +; CHECK: test_imm: + + %val = add i128 %a, 12 +; CHECK: adds x0, x0, #12 +; CHECK: adcs x1, x1, {{x[0-9]|xzr}} + + ret i128 %val +; CHECK: ret +} + +define i128 @test_shifted(i128 %a, i128 %b) { +; CHECK: test_shifted: + + %rhs = shl i128 %b, 45 + + %val = add i128 %a, %rhs +; CHECK: adds x0, x0, x2, lsl #45 +; CHECK: adcs x1, x1, {{x[0-9]}} + + ret i128 %val +; CHECK: ret +} + +define i128 @test_extended(i128 %a, i16 %b) { +; CHECK: test_extended: + + %ext = sext i16 %b to i128 + %rhs = shl i128 %ext, 3 + + %val = add i128 %a, %rhs +; CHECK: adds x0, x0, w2, sxth #3 +; CHECK: adcs x1, x1, {{x[0-9]}} + + ret i128 %val +; CHECK: ret +} diff --git a/test/CodeGen/AArch64/addsub-shifted.ll b/test/CodeGen/AArch64/addsub-shifted.ll new file mode 100644 index 0000000..f2c74f6 --- /dev/null +++ b/test/CodeGen/AArch64/addsub-shifted.ll @@ -0,0 +1,295 @@ +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s + +@var32 = global i32 0 +@var64 = global i64 0 + +define void @test_lsl_arith(i32 %lhs32, i32 %rhs32, i64 %lhs64, i64 %rhs64) { +; CHECK: test_lsl_arith: + + %rhs1 = load volatile i32* @var32 + %shift1 = shl i32 %rhs1, 18 + %val1 = add i32 %lhs32, %shift1 + store volatile i32 %val1, i32* @var32 +; CHECK: add {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, lsl #18 + + %rhs2 = load volatile i32* @var32 + %shift2 = shl i32 %rhs2, 31 + %val2 = add i32 %shift2, %lhs32 + store volatile i32 %val2, i32* @var32 +; CHECK: add {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, lsl #31 + + %rhs3 = load volatile i32* @var32 + %shift3 = shl i32 %rhs3, 5 + %val3 = sub i32 %lhs32, %shift3 + store volatile i32 %val3, i32* @var32 +; CHECK: sub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, lsl #5 + +; Subtraction is not commutative! 
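+; (The underlying reason: the AArch64 "sub (shifted register)" form shifts +; only its second source operand, so a shifted minuend cannot use it; the +; negated variants below still match because wzr/xzr supplies the first +; operand.)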
+ %rhs4 = load volatile i32* @var32 + %shift4 = shl i32 %rhs4, 19 + %val4 = sub i32 %shift4, %lhs32 + store volatile i32 %val4, i32* @var32 +; CHECK-NOT: sub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, lsl #19 + + %lhs4a = load volatile i32* @var32 + %shift4a = shl i32 %lhs4a, 15 + %val4a = sub i32 0, %shift4a + store volatile i32 %val4a, i32* @var32 +; CHECK: sub {{w[0-9]+}}, wzr, {{w[0-9]+}}, lsl #15 + + %rhs5 = load volatile i64* @var64 + %shift5 = shl i64 %rhs5, 18 + %val5 = add i64 %lhs64, %shift5 + store volatile i64 %val5, i64* @var64 +; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}, lsl #18 + + %rhs6 = load volatile i64* @var64 + %shift6 = shl i64 %rhs6, 31 + %val6 = add i64 %shift6, %lhs64 + store volatile i64 %val6, i64* @var64 +; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}, lsl #31 + + %rhs7 = load volatile i64* @var64 + %shift7 = shl i64 %rhs7, 5 + %val7 = sub i64 %lhs64, %shift7 + store volatile i64 %val7, i64* @var64 +; CHECK: sub {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}, lsl #5 + +; Subtraction is not commutative! + %rhs8 = load volatile i64* @var64 + %shift8 = shl i64 %rhs8, 19 + %val8 = sub i64 %shift8, %lhs64 + store volatile i64 %val8, i64* @var64 +; CHECK-NOT: sub {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}, lsl #19 + + %lhs8a = load volatile i64* @var64 + %shift8a = shl i64 %lhs8a, 60 + %val8a = sub i64 0, %shift8a + store volatile i64 %val8a, i64* @var64 +; CHECK: sub {{x[0-9]+}}, xzr, {{x[0-9]+}}, lsl #60 + + ret void +; CHECK: ret +} + +define void @test_lsr_arith(i32 %lhs32, i32 %rhs32, i64 %lhs64, i64 %rhs64) { +; CHECK: test_lsr_arith: + + %shift1 = lshr i32 %rhs32, 18 + %val1 = add i32 %lhs32, %shift1 + store volatile i32 %val1, i32* @var32 +; CHECK: add {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, lsr #18 + + %shift2 = lshr i32 %rhs32, 31 + %val2 = add i32 %shift2, %lhs32 + store volatile i32 %val2, i32* @var32 +; CHECK: add {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, lsr #31 + + %shift3 = lshr i32 %rhs32, 5 + %val3 = sub i32 %lhs32, %shift3 + store volatile i32 %val3, i32* @var32 +; CHECK: sub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, lsr #5 + +; Subtraction is not commutative! + %shift4 = lshr i32 %rhs32, 19 + %val4 = sub i32 %shift4, %lhs32 + store volatile i32 %val4, i32* @var32 +; CHECK-NOT: sub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, lsr #19 + + %shift4a = lshr i32 %lhs32, 15 + %val4a = sub i32 0, %shift4a + store volatile i32 %val4a, i32* @var32 +; CHECK: sub {{w[0-9]+}}, wzr, {{w[0-9]+}}, lsr #15 + + %shift5 = lshr i64 %rhs64, 18 + %val5 = add i64 %lhs64, %shift5 + store volatile i64 %val5, i64* @var64 +; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}, lsr #18 + + %shift6 = lshr i64 %rhs64, 31 + %val6 = add i64 %shift6, %lhs64 + store volatile i64 %val6, i64* @var64 +; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}, lsr #31 + + %shift7 = lshr i64 %rhs64, 5 + %val7 = sub i64 %lhs64, %shift7 + store volatile i64 %val7, i64* @var64 +; CHECK: sub {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}, lsr #5 + +; Subtraction is not commutative! 
+ %shift8 = lshr i64 %rhs64, 19 + %val8 = sub i64 %shift8, %lhs64 + store volatile i64 %val8, i64* @var64 +; CHECK-NOT: sub {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}, lsr #19 + + %shift8a = lshr i64 %lhs64, 45 + %val8a = sub i64 0, %shift8a + store volatile i64 %val8a, i64* @var64 +; CHECK: sub {{x[0-9]+}}, xzr, {{x[0-9]+}}, lsr #45 + + ret void +; CHECK: ret +} + +define void @test_asr_arith(i32 %lhs32, i32 %rhs32, i64 %lhs64, i64 %rhs64) { +; CHECK: test_asr_arith: + + %shift1 = ashr i32 %rhs32, 18 + %val1 = add i32 %lhs32, %shift1 + store volatile i32 %val1, i32* @var32 +; CHECK: add {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, asr #18 + + %shift2 = ashr i32 %rhs32, 31 + %val2 = add i32 %shift2, %lhs32 + store volatile i32 %val2, i32* @var32 +; CHECK: add {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, asr #31 + + %shift3 = ashr i32 %rhs32, 5 + %val3 = sub i32 %lhs32, %shift3 + store volatile i32 %val3, i32* @var32 +; CHECK: sub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, asr #5 + +; Subtraction is not commutative! + %shift4 = ashr i32 %rhs32, 19 + %val4 = sub i32 %shift4, %lhs32 + store volatile i32 %val4, i32* @var32 +; CHECK-NOT: sub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, asr #19 + + %shift4a = ashr i32 %lhs32, 15 + %val4a = sub i32 0, %shift4a + store volatile i32 %val4a, i32* @var32 +; CHECK: sub {{w[0-9]+}}, wzr, {{w[0-9]+}}, asr #15 + + %shift5 = ashr i64 %rhs64, 18 + %val5 = add i64 %lhs64, %shift5 + store volatile i64 %val5, i64* @var64 +; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}, asr #18 + + %shift6 = ashr i64 %rhs64, 31 + %val6 = add i64 %shift6, %lhs64 + store volatile i64 %val6, i64* @var64 +; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}, asr #31 + + %shift7 = ashr i64 %rhs64, 5 + %val7 = sub i64 %lhs64, %shift7 + store volatile i64 %val7, i64* @var64 +; CHECK: sub {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}, asr #5 + +; Subtraction is not commutative! 
+ %shift8 = ashr i64 %rhs64, 19 + %val8 = sub i64 %shift8, %lhs64 + store volatile i64 %val8, i64* @var64 +; CHECK-NOT: sub {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}, asr #19 + + %shift8a = ashr i64 %lhs64, 45 + %val8a = sub i64 0, %shift8a + store volatile i64 %val8a, i64* @var64 +; CHECK: sub {{x[0-9]+}}, xzr, {{x[0-9]+}}, asr #45 + + ret void +; CHECK: ret +} + +define i32 @test_cmp(i32 %lhs32, i32 %rhs32, i64 %lhs64, i64 %rhs64) { +; CHECK: test_cmp: + + %shift1 = shl i32 %rhs32, 13 + %tst1 = icmp uge i32 %lhs32, %shift1 + br i1 %tst1, label %t2, label %end +; CHECK: cmp {{w[0-9]+}}, {{w[0-9]+}}, lsl #13 + +t2: + %shift2 = lshr i32 %rhs32, 20 + %tst2 = icmp ne i32 %lhs32, %shift2 + br i1 %tst2, label %t3, label %end +; CHECK: cmp {{w[0-9]+}}, {{w[0-9]+}}, lsr #20 + +t3: + %shift3 = ashr i32 %rhs32, 9 + %tst3 = icmp ne i32 %lhs32, %shift3 + br i1 %tst3, label %t4, label %end +; CHECK: cmp {{w[0-9]+}}, {{w[0-9]+}}, asr #9 + +t4: + %shift4 = shl i64 %rhs64, 43 + %tst4 = icmp uge i64 %lhs64, %shift4 + br i1 %tst4, label %t5, label %end +; CHECK: cmp {{x[0-9]+}}, {{x[0-9]+}}, lsl #43 + +t5: + %shift5 = lshr i64 %rhs64, 20 + %tst5 = icmp ne i64 %lhs64, %shift5 + br i1 %tst5, label %t6, label %end +; CHECK: cmp {{x[0-9]+}}, {{x[0-9]+}}, lsr #20 + +t6: + %shift6 = ashr i64 %rhs64, 59 + %tst6 = icmp ne i64 %lhs64, %shift6 + br i1 %tst6, label %t7, label %end +; CHECK: cmp {{x[0-9]+}}, {{x[0-9]+}}, asr #59 + +t7: + ret i32 1 +end: + + ret i32 0 +; CHECK: ret +} + +define i32 @test_cmn(i32 %lhs32, i32 %rhs32, i64 %lhs64, i64 %rhs64) { +; CHECK: test_cmn: + + %shift1 = shl i32 %rhs32, 13 + %val1 = sub i32 0, %shift1 + %tst1 = icmp uge i32 %lhs32, %val1 + br i1 %tst1, label %t2, label %end + ; Important that this isn't lowered to a cmn instruction because if %rhs32 == + ; 0 then the results will differ. +; CHECK: sub [[RHS:w[0-9]+]], wzr, {{w[0-9]+}}, lsl #13 +; CHECK: cmp {{w[0-9]+}}, [[RHS]] + +t2: + %shift2 = lshr i32 %rhs32, 20 + %val2 = sub i32 0, %shift2 + %tst2 = icmp ne i32 %lhs32, %val2 + br i1 %tst2, label %t3, label %end +; CHECK: cmn {{w[0-9]+}}, {{w[0-9]+}}, lsr #20 + +t3: + %shift3 = ashr i32 %rhs32, 9 + %val3 = sub i32 0, %shift3 + %tst3 = icmp eq i32 %lhs32, %val3 + br i1 %tst3, label %t4, label %end +; CHECK: cmn {{w[0-9]+}}, {{w[0-9]+}}, asr #9 + +t4: + %shift4 = shl i64 %rhs64, 43 + %val4 = sub i64 0, %shift4 + %tst4 = icmp slt i64 %lhs64, %val4 + br i1 %tst4, label %t5, label %end + ; Again, it's important that cmn isn't used here in case %rhs64 == 0. +; CHECK: sub [[RHS:x[0-9]+]], xzr, {{x[0-9]+}}, lsl #43 +; CHECK: cmp {{x[0-9]+}}, [[RHS]] + +t5: + %shift5 = lshr i64 %rhs64, 20 + %val5 = sub i64 0, %shift5 + %tst5 = icmp ne i64 %lhs64, %val5 + br i1 %tst5, label %t6, label %end +; CHECK: cmn {{x[0-9]+}}, {{x[0-9]+}}, lsr #20 + +t6: + %shift6 = ashr i64 %rhs64, 59 + %val6 = sub i64 0, %shift6 + %tst6 = icmp ne i64 %lhs64, %val6 + br i1 %tst6, label %t7, label %end +; CHECK: cmn {{x[0-9]+}}, {{x[0-9]+}}, asr #59 + +t7: + ret i32 1 +end: + + ret i32 0 +; CHECK: ret +} + diff --git a/test/CodeGen/AArch64/addsub.ll b/test/CodeGen/AArch64/addsub.ll new file mode 100644 index 0000000..5148807 --- /dev/null +++ b/test/CodeGen/AArch64/addsub.ll @@ -0,0 +1,127 @@ +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s + +; Note that this should be refactored (for efficiency if nothing else) +; when the PCS is implemented so we don't have to worry about the +; loads and stores. 
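+; (Encoding note: AArch64 add/sub immediates are 12 bits, optionally shifted +; left by 12, so the constants below are chosen as 4095 = 0xfff, +; 14610432 = 3567 << 12 and 16773120 = 4095 << 12.)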
+ +@var_i32 = global i32 42 +@var_i64 = global i64 0 + +; Add pure 12-bit immediates: +define void @add_small() { +; CHECK: add_small: + +; CHECK: add {{w[0-9]+}}, {{w[0-9]+}}, #4095 + %val32 = load i32* @var_i32 + %newval32 = add i32 %val32, 4095 + store i32 %newval32, i32* @var_i32 + +; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, #52 + %val64 = load i64* @var_i64 + %newval64 = add i64 %val64, 52 + store i64 %newval64, i64* @var_i64 + + ret void +} + +; Add 12-bit immediates, shifted left by 12 bits +define void @add_med() { +; CHECK: add_med: + +; CHECK: add {{w[0-9]+}}, {{w[0-9]+}}, #3567, lsl #12 + %val32 = load i32* @var_i32 + %newval32 = add i32 %val32, 14610432 ; =0xdef000 + store i32 %newval32, i32* @var_i32 + +; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, #4095, lsl #12 + %val64 = load i64* @var_i64 + %newval64 = add i64 %val64, 16773120 ; =0xfff000 + store i64 %newval64, i64* @var_i64 + + ret void +} + +; Subtract 12-bit immediates +define void @sub_small() { +; CHECK: sub_small: + +; CHECK: sub {{w[0-9]+}}, {{w[0-9]+}}, #4095 + %val32 = load i32* @var_i32 + %newval32 = sub i32 %val32, 4095 + store i32 %newval32, i32* @var_i32 + +; CHECK: sub {{x[0-9]+}}, {{x[0-9]+}}, #52 + %val64 = load i64* @var_i64 + %newval64 = sub i64 %val64, 52 + store i64 %newval64, i64* @var_i64 + + ret void +} + +; Subtract 12-bit immediates, shifted left by 12 bits +define void @sub_med() { +; CHECK: sub_med: + +; CHECK: sub {{w[0-9]+}}, {{w[0-9]+}}, #3567, lsl #12 + %val32 = load i32* @var_i32 + %newval32 = sub i32 %val32, 14610432 ; =0xdef000 + store i32 %newval32, i32* @var_i32 + +; CHECK: sub {{x[0-9]+}}, {{x[0-9]+}}, #4095, lsl #12 + %val64 = load i64* @var_i64 + %newval64 = sub i64 %val64, 16773120 ; =0xfff000 + store i64 %newval64, i64* @var_i64 + + ret void +} + +define void @testing() { +; CHECK: testing: + %val = load i32* @var_i32 + +; CHECK: cmp {{w[0-9]+}}, #4095 +; CHECK: b.ne .LBB4_6 + %cmp_pos_small = icmp ne i32 %val, 4095 + br i1 %cmp_pos_small, label %ret, label %test2 + +test2: +; CHECK: cmp {{w[0-9]+}}, #3567, lsl #12 +; CHECK: b.lo .LBB4_6 + %newval2 = add i32 %val, 1 + store i32 %newval2, i32* @var_i32 + %cmp_pos_big = icmp ult i32 %val, 14610432 + br i1 %cmp_pos_big, label %ret, label %test3 + +test3: +; CHECK: cmp {{w[0-9]+}}, #123 +; CHECK: b.lt .LBB4_6 + %newval3 = add i32 %val, 2 + store i32 %newval3, i32* @var_i32 + %cmp_pos_slt = icmp slt i32 %val, 123 + br i1 %cmp_pos_slt, label %ret, label %test4 + +test4: +; CHECK: cmp {{w[0-9]+}}, #321 +; CHECK: b.gt .LBB4_6 + %newval4 = add i32 %val, 3 + store i32 %newval4, i32* @var_i32 + %cmp_pos_sgt = icmp sgt i32 %val, 321 + br i1 %cmp_pos_sgt, label %ret, label %test5 + +test5: +; CHECK: cmn {{w[0-9]+}}, #444 +; CHECK: b.gt .LBB4_6 + %newval5 = add i32 %val, 4 + store i32 %newval5, i32* @var_i32 + %cmp_neg_uge = icmp sgt i32 %val, -444 + br i1 %cmp_neg_uge, label %ret, label %test6 + +test6: + %newval6 = add i32 %val, 5 + store i32 %newval6, i32* @var_i32 + ret void + +ret: + ret void +} +; TODO: adds/subs diff --git a/test/CodeGen/AArch64/addsub_ext.ll b/test/CodeGen/AArch64/addsub_ext.ll new file mode 100644 index 0000000..2dd1662 --- /dev/null +++ b/test/CodeGen/AArch64/addsub_ext.ll @@ -0,0 +1,189 @@ +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s + +@var8 = global i8 0 +@var16 = global i16 0 +@var32 = global i32 0 +@var64 = global i64 0 + +define void @addsub_i8rhs() { +; CHECK: addsub_i8rhs: + %val8_tmp = load i8* @var8 + %lhs32 = load i32* @var32 + %lhs64 = load i64* @var64 + + ; Need this to prevent 
extension upon load and give a vanilla i8 operand. + %val8 = add i8 %val8_tmp, 123 + + +; Zero-extending to 32-bits + %rhs32_zext = zext i8 %val8 to i32 + %res32_zext = add i32 %lhs32, %rhs32_zext + store volatile i32 %res32_zext, i32* @var32 +; CHECK: add {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, uxtb + + %rhs32_zext_shift = shl i32 %rhs32_zext, 3 + %res32_zext_shift = add i32 %lhs32, %rhs32_zext_shift + store volatile i32 %res32_zext_shift, i32* @var32 +; CHECK: add {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, uxtb #3 + + +; Zero-extending to 64-bits + %rhs64_zext = zext i8 %val8 to i64 + %res64_zext = add i64 %lhs64, %rhs64_zext + store volatile i64 %res64_zext, i64* @var64 +; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, {{w[0-9]+}}, uxtb + + %rhs64_zext_shift = shl i64 %rhs64_zext, 1 + %res64_zext_shift = add i64 %lhs64, %rhs64_zext_shift + store volatile i64 %res64_zext_shift, i64* @var64 +; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, {{w[0-9]+}}, uxtb #1 + +; Sign-extending to 32-bits + %rhs32_sext = sext i8 %val8 to i32 + %res32_sext = add i32 %lhs32, %rhs32_sext + store volatile i32 %res32_sext, i32* @var32 +; CHECK: add {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, sxtb + + %rhs32_sext_shift = shl i32 %rhs32_sext, 1 + %res32_sext_shift = add i32 %lhs32, %rhs32_sext_shift + store volatile i32 %res32_sext_shift, i32* @var32 +; CHECK: add {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, sxtb #1 + +; Sign-extending to 64-bits + %rhs64_sext = sext i8 %val8 to i64 + %res64_sext = add i64 %lhs64, %rhs64_sext + store volatile i64 %res64_sext, i64* @var64 +; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, {{w[0-9]+}}, sxtb + + %rhs64_sext_shift = shl i64 %rhs64_sext, 4 + %res64_sext_shift = add i64 %lhs64, %rhs64_sext_shift + store volatile i64 %res64_sext_shift, i64* @var64 +; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, {{w[0-9]+}}, sxtb #4 + + +; CMP variants + %tst = icmp slt i32 %lhs32, %rhs32_zext + br i1 %tst, label %end, label %test2 +; CHECK: cmp {{w[0-9]+}}, {{w[0-9]+}}, uxtb + +test2: + %cmp_sext = sext i8 %val8 to i64 + %tst2 = icmp eq i64 %lhs64, %cmp_sext + br i1 %tst2, label %other, label %end +; CHECK: cmp {{x[0-9]+}}, {{w[0-9]+}}, sxtb + +other: + store volatile i32 %lhs32, i32* @var32 + ret void + +end: + ret void +} + +define void @addsub_i16rhs() { +; CHECK: addsub_i16rhs: + %val16_tmp = load i16* @var16 + %lhs32 = load i32* @var32 + %lhs64 = load i64* @var64 + + ; Need this to prevent extension upon load and give a vanilla i16 operand. 
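+ ; (Without this add, the extension would fold into the load as an ldrh or + ; ldrsh, and the extended-register add/cmp patterns under test would never + ; be formed.)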
+ %val16 = add i16 %val16_tmp, 123 + + +; Zero-extending to 32-bits + %rhs32_zext = zext i16 %val16 to i32 + %res32_zext = add i32 %lhs32, %rhs32_zext + store volatile i32 %res32_zext, i32* @var32 +; CHECK: add {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, uxth + + %rhs32_zext_shift = shl i32 %rhs32_zext, 3 + %res32_zext_shift = add i32 %lhs32, %rhs32_zext_shift + store volatile i32 %res32_zext_shift, i32* @var32 +; CHECK: add {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, uxth #3 + + +; Zero-extending to 64-bits + %rhs64_zext = zext i16 %val16 to i64 + %res64_zext = add i64 %lhs64, %rhs64_zext + store volatile i64 %res64_zext, i64* @var64 +; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, {{w[0-9]+}}, uxth + + %rhs64_zext_shift = shl i64 %rhs64_zext, 1 + %res64_zext_shift = add i64 %lhs64, %rhs64_zext_shift + store volatile i64 %res64_zext_shift, i64* @var64 +; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, {{w[0-9]+}}, uxth #1 + +; Sign-extending to 32-bits + %rhs32_sext = sext i16 %val16 to i32 + %res32_sext = add i32 %lhs32, %rhs32_sext + store volatile i32 %res32_sext, i32* @var32 +; CHECK: add {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, sxth + + %rhs32_sext_shift = shl i32 %rhs32_sext, 1 + %res32_sext_shift = add i32 %lhs32, %rhs32_sext_shift + store volatile i32 %res32_sext_shift, i32* @var32 +; CHECK: add {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, sxth #1 + +; Sign-extending to 64-bits + %rhs64_sext = sext i16 %val16 to i64 + %res64_sext = add i64 %lhs64, %rhs64_sext + store volatile i64 %res64_sext, i64* @var64 +; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, {{w[0-9]+}}, sxth + + %rhs64_sext_shift = shl i64 %rhs64_sext, 4 + %res64_sext_shift = add i64 %lhs64, %rhs64_sext_shift + store volatile i64 %res64_sext_shift, i64* @var64 +; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, {{w[0-9]+}}, sxth #4 + + +; CMP variants + %tst = icmp slt i32 %lhs32, %rhs32_zext + br i1 %tst, label %end, label %test2 +; CHECK: cmp {{w[0-9]+}}, {{w[0-9]+}}, uxth + +test2: + %cmp_sext = sext i16 %val16 to i64 + %tst2 = icmp eq i64 %lhs64, %cmp_sext + br i1 %tst2, label %other, label %end +; CHECK: cmp {{x[0-9]+}}, {{w[0-9]+}}, sxth + +other: + store volatile i32 %lhs32, i32* @var32 + ret void + +end: + ret void +} + +; N.b. we could probably check more here ("add w2, w3, w1, uxtw" for +; example), but the remaining instructions are probably not idiomatic +; in the face of "add/sub (shifted register)" so I don't intend to. +define void @addsub_i32rhs() { +; CHECK: addsub_i32rhs: + %val32_tmp = load i32* @var32 + %lhs64 = load i64* @var64 + + %val32 = add i32 %val32_tmp, 123 + + %rhs64_zext = zext i32 %val32 to i64 + %res64_zext = add i64 %lhs64, %rhs64_zext + store volatile i64 %res64_zext, i64* @var64 +; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, {{w[0-9]+}}, uxtw + + %rhs64_zext_shift = shl i64 %rhs64_zext, 2 + %res64_zext_shift = add i64 %lhs64, %rhs64_zext_shift + store volatile i64 %res64_zext_shift, i64* @var64 +; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, {{w[0-9]+}}, uxtw #2 + + %rhs64_sext = sext i32 %val32 to i64 + %res64_sext = add i64 %lhs64, %rhs64_sext + store volatile i64 %res64_sext, i64* @var64 +; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, {{w[0-9]+}}, sxtw + + %rhs64_sext_shift = shl i64 %rhs64_sext, 2 + %res64_sext_shift = add i64 %lhs64, %rhs64_sext_shift + store volatile i64 %res64_sext_shift, i64* @var64 +; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, {{w[0-9]+}}, sxtw #2 + + ret void +}
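
; N.b. (a sketch for illustration, not compiled as part of this test): the
; extended-register forms of add/sub that the checks above pin down only
; accept a left shift of 0-4 on the extended operand, so an extend followed
; by a larger shift cannot fold into one instruction. Hypothetical input
; illustrating the boundary:
;
;   define i64 @fold_limit(i64 %acc, i8 %small) {
;     %ext = zext i8 %small to i64
;     %shifted = shl i64 %ext, 5   ; 5 > 4: no "uxtb #5" operand exists
;     %res = add i64 %acc, %shifted
;     ret i64 %res
;   }
;
; Shifts of 0-4, by contrast, match the single-instruction patterns checked
; above (e.g. "add {{x[0-9]+}}, {{x[0-9]+}}, {{w[0-9]+}}, uxtb #3").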
\ No newline at end of file diff --git a/test/CodeGen/AArch64/adrp-relocation.ll b/test/CodeGen/AArch64/adrp-relocation.ll new file mode 100644 index 0000000..c33b442 --- /dev/null +++ b/test/CodeGen/AArch64/adrp-relocation.ll @@ -0,0 +1,35 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs -filetype=obj < %s | elf-dump | FileCheck %s + +define i64 @testfn() nounwind { +entry: + ret i64 0 +} + +define i64 @foo() nounwind { +entry: + %bar = alloca i64 ()*, align 8 + store i64 ()* @testfn, i64 ()** %bar, align 8 + %call = call i64 @testfn() + ret i64 %call +} + +; The above should produce an ADRP/ADD pair to calculate the address of +; testfn. The important point is that LLVM shouldn't think it can deal with the +; relocation on the ADRP itself (even though it knows everything about the +; relative offsets of testfn and foo) because its value depends on where this +; object file's .text section gets relocated in memory. + +; CHECK: .rela.text + +; CHECK: # Relocation 0 +; CHECK-NEXT: (('r_offset', 0x0000000000000010) +; CHECK-NEXT: ('r_sym', 0x00000007) +; CHECK-NEXT: ('r_type', 0x00000113) +; CHECK-NEXT: ('r_addend', 0x0000000000000000) +; CHECK-NEXT: ), +; CHECK-NEXT: Relocation 1 +; CHECK-NEXT: (('r_offset', 0x0000000000000014) +; CHECK-NEXT: ('r_sym', 0x00000007) +; CHECK-NEXT: ('r_type', 0x00000115) +; CHECK-NEXT: ('r_addend', 0x0000000000000000) +; CHECK-NEXT: ), diff --git a/test/CodeGen/AArch64/alloca.ll b/test/CodeGen/AArch64/alloca.ll new file mode 100644 index 0000000..6421769 --- /dev/null +++ b/test/CodeGen/AArch64/alloca.ll @@ -0,0 +1,134 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s + +declare void @use_addr(i8*) + +define void @test_simple_alloca(i64 %n) { +; CHECK: test_simple_alloca: + + %buf = alloca i8, i64 %n + ; Make sure we align the stack change to 16 bytes: +; CHECK: add [[SPDELTA:x[0-9]+]], x0, #15 +; CHECK: and x0, [[SPDELTA]], #0xfffffffffffffff0 + + ; Make sure we change SP. It would be surprising if anything but x0 were used + ; for the final sp, but it could be if it was then moved into x0. +; CHECK: mov [[TMP:x[0-9]+]], sp +; CHECK: sub x0, [[TMP]], [[SPDELTA]] +; CHECK: mov sp, x0 + + call void @use_addr(i8* %buf) +; CHECK: bl use_addr + + ret void + ; Make sure epilogue restores sp from fp +; CHECK: sub sp, x29, #16 +; CHECK: ldp x29, x30, [sp, #16] +; CHECK: add sp, sp, #32 +; CHECK: ret +} + +declare void @use_addr_loc(i8*, i64*) + +define i64 @test_alloca_with_local(i64 %n) { +; CHECK: test_alloca_with_local: +; CHECK: sub sp, sp, #32 +; CHECK: stp x29, x30, [sp, #16] + + %loc = alloca i64 + %buf = alloca i8, i64 %n + ; Make sure we align the stack change to 16 bytes: +; CHECK: add [[SPDELTA:x[0-9]+]], x0, #15 +; CHECK: and x0, [[SPDELTA]], #0xfffffffffffffff0 + + ; Make sure we change SP. It would be surprising if anything but x0 were used + ; for the final sp, but it could be if it was then moved into x0. 
+; CHECK: mov [[TMP:x[0-9]+]], sp
+; CHECK: sub x0, [[TMP]], [[SPDELTA]]
+; CHECK: mov sp, x0
+
+ ; Obviously suboptimal code here, but it's needed to get &local in x1
+; CHECK: sub [[TMP:x[0-9]+]], x29, [[LOC_FROM_FP:#[0-9]+]]
+; CHECK: add x1, [[TMP]], #0
+
+ call void @use_addr_loc(i8* %buf, i64* %loc)
+; CHECK: bl use_addr
+
+ %val = load i64* %loc
+; CHECK: sub x[[TMP:[0-9]+]], x29, [[LOC_FROM_FP]]
+; CHECK: ldr x0, [x[[TMP]]]
+
+ ret i64 %val
+ ; Make sure epilogue restores sp from fp
+; CHECK: sub sp, x29, #16
+; CHECK: ldp x29, x30, [sp, #16]
+; CHECK: add sp, sp, #32
+; CHECK: ret
+}
+
+define void @test_variadic_alloca(i64 %n, ...) {
+; CHECK: test_variadic_alloca:
+
+; CHECK: sub sp, sp, #208
+; CHECK: stp x29, x30, [sp, #192]
+; CHECK: add x29, sp, #192
+; CHECK: sub x9, x29, #192
+; CHECK: add x8, x9, #0
+; CHECK: str q7, [x8, #112]
+; [...]
+; CHECK: str q1, [x8, #16]
+
+ %addr = alloca i8, i64 %n
+
+ call void @use_addr(i8* %addr)
+; CHECK: bl use_addr
+
+ ret void
+; CHECK: sub sp, x29, #192
+; CHECK: ldp x29, x30, [sp, #192]
+; CHECK: add sp, sp, #208
+}
+
+define void @test_alloca_large_frame(i64 %n) {
+; CHECK: test_alloca_large_frame:
+
+; CHECK: sub sp, sp, #496
+; CHECK: stp x29, x30, [sp, #480]
+; CHECK: add x29, sp, #480
+; CHECK: sub sp, sp, #48
+; CHECK: sub sp, sp, #1953, lsl #12
+
+ %addr1 = alloca i8, i64 %n
+ %addr2 = alloca i64, i64 1000000
+
+ call void @use_addr_loc(i8* %addr1, i64* %addr2)
+
+ ret void
+; CHECK: sub sp, x29, #480
+; CHECK: ldp x29, x30, [sp, #480]
+; CHECK: add sp, sp, #496
+}
+
+declare i8* @llvm.stacksave()
+declare void @llvm.stackrestore(i8*)
+
+define void @test_scoped_alloca(i64 %n) {
+; CHECK: test_scoped_alloca
+; CHECK: sub sp, sp, #32
+
+ %sp = call i8* @llvm.stacksave()
+; CHECK: mov [[SAVED_SP:x[0-9]+]], sp
+
+ %addr = alloca i8, i64 %n
+; CHECK: and [[SPDELTA:x[0-9]+]], {{x[0-9]+}}, #0xfffffffffffffff0
+; CHECK: mov [[OLDSP:x[0-9]+]], sp
+; CHECK: sub [[NEWSP:x[0-9]+]], [[OLDSP]], [[SPDELTA]]
+; CHECK: mov sp, [[NEWSP]]
+
+ call void @use_addr(i8* %addr)
+; CHECK: bl use_addr
+
+ call void @llvm.stackrestore(i8* %sp)
+; CHECK: mov sp, [[SAVED_SP]]
+
+ ret void
+}
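
; N.b. (a worked note for illustration, not part of the test): the add/and
; pair these checks expect is the usual round-up-to-16 idiom applied to the
; dynamic size:
;
;   aligned = (n + 15) & ~15
;
;   n = 1  -> (1 + 15)  & ~15 = 16
;   n = 16 -> (16 + 15) & ~15 = 16
;   n = 17 -> (17 + 15) & ~15 = 32
;
; which is what "add ..., #15" followed by "and ..., #0xfffffffffffffff0"
; computes, keeping sp 16-byte aligned as the ABI requires.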
\ No newline at end of file diff --git a/test/CodeGen/AArch64/analyze-branch.ll b/test/CodeGen/AArch64/analyze-branch.ll new file mode 100644 index 0000000..e10bbb0 --- /dev/null +++ b/test/CodeGen/AArch64/analyze-branch.ll @@ -0,0 +1,231 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s
+
+; This test checks that LLVM can do basic stripping and reapplying of branches
+; to basic blocks.
+
+declare void @test_true()
+declare void @test_false()
+
+; !0 corresponds to a branch being taken, !1 to not being taken.
+!0 = metadata !{metadata !"branch_weights", i32 64, i32 4}
+!1 = metadata !{metadata !"branch_weights", i32 4, i32 64}
+
+define void @test_Bcc_fallthrough_taken(i32 %in) nounwind {
+; CHECK: test_Bcc_fallthrough_taken:
+ %tst = icmp eq i32 %in, 42
+ br i1 %tst, label %true, label %false, !prof !0
+
+; CHECK: cmp {{w[0-9]+}}, #42
+
+; CHECK: b.ne [[FALSE:.LBB[0-9]+_[0-9]+]]
+; CHECK-NEXT: // BB#
+; CHECK-NEXT: bl test_true
+
+; CHECK: [[FALSE]]:
+; CHECK: bl test_false
+
+true:
+ call void @test_true()
+ ret void
+
+false:
+ call void @test_false()
+ ret void
+}
+
+define void @test_Bcc_fallthrough_nottaken(i32 %in) nounwind {
+; CHECK: test_Bcc_fallthrough_nottaken:
+ %tst = icmp eq i32 %in, 42
+ br i1 %tst, label %true, label %false, !prof !1
+
+; CHECK: cmp {{w[0-9]+}}, #42
+
+; CHECK: b.eq [[TRUE:.LBB[0-9]+_[0-9]+]]
+; CHECK-NEXT: // BB#
+; CHECK-NEXT: bl test_false
+
+; CHECK: [[TRUE]]:
+; CHECK: bl test_true
+
+true:
+ call void @test_true()
+ ret void
+
+false:
+ call void @test_false()
+ ret void
+}
+
+define void @test_CBZ_fallthrough_taken(i32 %in) nounwind {
+; CHECK: test_CBZ_fallthrough_taken:
+ %tst = icmp eq i32 %in, 0
+ br i1 %tst, label %true, label %false, !prof !0
+
+; CHECK: cbnz {{w[0-9]+}}, [[FALSE:.LBB[0-9]+_[0-9]+]]
+; CHECK-NEXT: // BB#
+; CHECK-NEXT: bl test_true
+
+; CHECK: [[FALSE]]:
+; CHECK: bl test_false
+
+true:
+ call void @test_true()
+ ret void
+
+false:
+ call void @test_false()
+ ret void
+}
+
+define void @test_CBZ_fallthrough_nottaken(i64 %in) nounwind {
+; CHECK: test_CBZ_fallthrough_nottaken:
+ %tst = icmp eq i64 %in, 0
+ br i1 %tst, label %true, label %false, !prof !1
+
+; CHECK: cbz {{x[0-9]+}}, [[TRUE:.LBB[0-9]+_[0-9]+]]
+; CHECK-NEXT: // BB#
+; CHECK-NEXT: bl test_false
+
+; CHECK: [[TRUE]]:
+; CHECK: bl test_true
+
+true:
+ call void @test_true()
+ ret void
+
+false:
+ call void @test_false()
+ ret void
+}
+
+define void @test_CBNZ_fallthrough_taken(i32 %in) nounwind {
+; CHECK: test_CBNZ_fallthrough_taken:
+ %tst = icmp ne i32 %in, 0
+ br i1 %tst, label %true, label %false, !prof !0
+
+; CHECK: cbz {{w[0-9]+}}, [[FALSE:.LBB[0-9]+_[0-9]+]]
+; CHECK-NEXT: // BB#
+; CHECK-NEXT: bl test_true
+
+; CHECK: [[FALSE]]:
+; CHECK: bl test_false
+
+true:
+ call void @test_true()
+ ret void
+
+false:
+ call void @test_false()
+ ret void
+}
+
+define void @test_CBNZ_fallthrough_nottaken(i64 %in) nounwind {
+; CHECK: test_CBNZ_fallthrough_nottaken:
+ %tst = icmp ne i64 %in, 0
+ br i1 %tst, label %true, label %false, !prof !1
+
+; CHECK: cbnz {{x[0-9]+}}, [[TRUE:.LBB[0-9]+_[0-9]+]]
+; CHECK-NEXT: // BB#
+; CHECK-NEXT: bl test_false
+
+; CHECK: [[TRUE]]:
+; CHECK: bl test_true
+
+true:
+ call void @test_true()
+ ret void
+
+false:
+ call void @test_false()
+ ret void
+}
+
+define void @test_TBZ_fallthrough_taken(i32 %in) nounwind {
+; CHECK: test_TBZ_fallthrough_taken:
+ %bit = and i32 %in, 32768
+ %tst = icmp eq i32 %bit, 0
+ br i1 %tst, label %true, label %false, !prof !0
+
+; CHECK: tbnz {{w[0-9]+}}, #15, [[FALSE:.LBB[0-9]+_[0-9]+]]
+; CHECK-NEXT: // BB#
+; CHECK-NEXT: bl test_true
+
+; CHECK: [[FALSE]]:
+; CHECK: bl test_false
+
+true:
+ call void @test_true()
+ ret void
+
+false:
+ call void @test_false()
+ ret void
+}
+
+define void @test_TBZ_fallthrough_nottaken(i64 %in) nounwind {
+; CHECK: test_TBZ_fallthrough_nottaken:
+ %bit = and i64 %in, 32768
+ %tst = icmp eq i64 %bit, 0
+ br i1 %tst, label %true, label %false, !prof !1
+
+; CHECK: tbz {{x[0-9]+}}, #15, [[TRUE:.LBB[0-9]+_[0-9]+]]
+; CHECK-NEXT: // BB#
+; CHECK-NEXT: bl test_false
+
+; CHECK: [[TRUE]]:
+; CHECK: bl test_true
+
+true:
+ call void @test_true()
+ ret void
+
+false:
+ call void @test_false()
+ ret void
+}
+
+
+define void @test_TBNZ_fallthrough_taken(i32 %in) nounwind {
+; CHECK: test_TBNZ_fallthrough_taken:
+ %bit = and i32 %in, 32768
+ %tst = icmp ne i32 %bit, 0
+ br i1 %tst, label %true, label %false, !prof !0
+
+; CHECK: tbz {{w[0-9]+}}, #15, [[FALSE:.LBB[0-9]+_[0-9]+]]
+; CHECK-NEXT: // BB#
+; CHECK-NEXT: bl test_true
+
+; CHECK: [[FALSE]]:
+; CHECK: bl test_false
+
+true:
+ call void @test_true()
+ ret void
+
+false:
+ call void @test_false()
+ ret void
+}
+
+define void @test_TBNZ_fallthrough_nottaken(i64 %in) nounwind {
+; CHECK: test_TBNZ_fallthrough_nottaken:
+ %bit = and i64 %in, 32768
+ %tst = icmp ne i64 %bit, 0
+ br i1 %tst, label %true, label %false, !prof !1
+
+; CHECK: tbnz {{x[0-9]+}}, #15, [[TRUE:.LBB[0-9]+_[0-9]+]]
+; CHECK-NEXT: // BB#
+; CHECK-NEXT: bl test_false
+
+; CHECK: [[TRUE]]:
+; CHECK: bl test_true
+
+true:
+ call void @test_true()
+ ret void
+
+false:
+ call void @test_false()
+ ret void
+}
+
 diff --git a/test/CodeGen/AArch64/atomic-ops-not-barriers.ll b/test/CodeGen/AArch64/atomic-ops-not-barriers.ll new file mode 100644 index 0000000..3c03e47 --- /dev/null +++ b/test/CodeGen/AArch64/atomic-ops-not-barriers.ll @@ -0,0 +1,24 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s
+
+define i32 @foo(i32* %var, i1 %cond) {
+; CHECK: foo:
+ br i1 %cond, label %atomic_ver, label %simple_ver
+simple_ver:
+ %oldval = load i32* %var
+ %newval = add nsw i32 %oldval, -1
+ store i32 %newval, i32* %var
+ br label %somewhere
+atomic_ver:
+ %val = atomicrmw add i32* %var, i32 -1 seq_cst
+ br label %somewhere
+; CHECK: dmb
+; CHECK: ldxr
+; CHECK: dmb
+ ; The key point here is that the second dmb isn't immediately followed by the
+ ; simple_ver basic block, which is what LLVM tried to do when DMB had been
+ ; marked with isBarrier. For now, look for something that looks like "somewhere".
+; CHECK-NEXT: mov
+somewhere:
+ %combined = phi i32 [ %val, %atomic_ver ], [ %newval, %simple_ver]
+ ret i32 %combined
+} diff --git a/test/CodeGen/AArch64/atomic-ops.ll b/test/CodeGen/AArch64/atomic-ops.ll new file mode 100644 index 0000000..bcb1a6f --- /dev/null +++ b/test/CodeGen/AArch64/atomic-ops.ll @@ -0,0 +1,1099 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s
+
+@var8 = global i8 0
+@var16 = global i16 0
+@var32 = global i32 0
+@var64 = global i64 0
+
+define i8 @test_atomic_load_add_i8(i8 %offset) nounwind {
+; CHECK: test_atomic_load_add_i8:
+ %old = atomicrmw add i8* @var8, i8 %offset seq_cst
+; CHECK: dmb ish
+; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var8
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK-NEXT: ldxrb w[[OLD:[0-9]+]], [x[[ADDR]]]
+ ; w0 below is a reasonable guess but could change: it certainly comes into the
+ ; function there.
+; CHECK-NEXT: add [[NEW:w[0-9]+]], w[[OLD]], w0 +; CHECK-NEXT: stxrb [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]] +; CHECK-NEXT: cmp [[STATUS]], #0 +; CHECK-NEXT: b.ne .LBB{{[0-9]+}}_1 +; CHECK: dmb ish + +; CHECK: mov x0, x[[OLD]] + ret i8 %old +} + +define i16 @test_atomic_load_add_i16(i16 %offset) nounwind { +; CHECK: test_atomic_load_add_i16: + %old = atomicrmw add i16* @var16, i16 %offset seq_cst +; CHECK: dmb ish +; CHECK: adrp [[TMPADDR:x[0-9]+]], var16 +; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var16 + +; CHECK: .LBB{{[0-9]+}}_1: +; CHECK-NEXT: ldxrh w[[OLD:[0-9]+]], [x[[ADDR]]] + ; w0 below is a reasonable guess but could change: it certainly comes into the + ; function there. +; CHECK-NEXT: add [[NEW:w[0-9]+]], w[[OLD]], w0 +; CHECK-NEXT: stxrh [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]] +; CHECK-NEXT: cmp [[STATUS]], #0 +; CHECK-NEXT: b.ne .LBB{{[0-9]+}}_1 +; CHECK: dmb ish + +; CHECK: mov x0, x[[OLD]] + ret i16 %old +} + +define i32 @test_atomic_load_add_i32(i32 %offset) nounwind { +; CHECK: test_atomic_load_add_i32: + %old = atomicrmw add i32* @var32, i32 %offset seq_cst +; CHECK: dmb ish +; CHECK: adrp [[TMPADDR:x[0-9]+]], var32 +; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var32 + +; CHECK: .LBB{{[0-9]+}}_1: +; CHECK-NEXT: ldxr w[[OLD:[0-9]+]], [x[[ADDR]]] + ; w0 below is a reasonable guess but could change: it certainly comes into the + ; function there. +; CHECK-NEXT: add [[NEW:w[0-9]+]], w[[OLD]], w0 +; CHECK-NEXT: stxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]] +; CHECK-NEXT: cmp [[STATUS]], #0 +; CHECK-NEXT: b.ne .LBB{{[0-9]+}}_1 +; CHECK: dmb ish + +; CHECK: mov x0, x[[OLD]] + ret i32 %old +} + +define i64 @test_atomic_load_add_i64(i64 %offset) nounwind { +; CHECK: test_atomic_load_add_i64: + %old = atomicrmw add i64* @var64, i64 %offset seq_cst +; CHECK: dmb ish +; CHECK: adrp [[TMPADDR:x[0-9]+]], var64 +; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var64 + +; CHECK: .LBB{{[0-9]+}}_1: +; CHECK-NEXT: ldxr x[[OLD:[0-9]+]], [x[[ADDR]]] + ; x0 below is a reasonable guess but could change: it certainly comes into the + ; function there. +; CHECK-NEXT: add [[NEW:x[0-9]+]], x[[OLD]], x0 +; CHECK-NEXT: stxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]] +; CHECK-NEXT: cmp [[STATUS]], #0 +; CHECK-NEXT: b.ne .LBB{{[0-9]+}}_1 +; CHECK: dmb ish + +; CHECK: mov x0, x[[OLD]] + ret i64 %old +} + +define i8 @test_atomic_load_sub_i8(i8 %offset) nounwind { +; CHECK: test_atomic_load_sub_i8: + %old = atomicrmw sub i8* @var8, i8 %offset seq_cst +; CHECK: dmb ish +; CHECK: adrp [[TMPADDR:x[0-9]+]], var8 +; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var8 + +; CHECK: .LBB{{[0-9]+}}_1: +; CHECK-NEXT: ldxrb w[[OLD:[0-9]+]], [x[[ADDR]]] + ; w0 below is a reasonable guess but could change: it certainly comes into the + ; function there. +; CHECK-NEXT: sub [[NEW:w[0-9]+]], w[[OLD]], w0 +; CHECK-NEXT: stxrb [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]] +; CHECK-NEXT: cmp [[STATUS]], #0 +; CHECK-NEXT: b.ne .LBB{{[0-9]+}}_1 +; CHECK: dmb ish + +; CHECK: mov x0, x[[OLD]] + ret i8 %old +} + +define i16 @test_atomic_load_sub_i16(i16 %offset) nounwind { +; CHECK: test_atomic_load_sub_i16: + %old = atomicrmw sub i16* @var16, i16 %offset seq_cst +; CHECK: dmb ish +; CHECK: adrp [[TMPADDR:x[0-9]+]], var16 +; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var16 + +; CHECK: .LBB{{[0-9]+}}_1: +; CHECK-NEXT: ldxrh w[[OLD:[0-9]+]], [x[[ADDR]]] + ; w0 below is a reasonable guess but could change: it certainly comes into the + ; function there. 
+; CHECK-NEXT: sub [[NEW:w[0-9]+]], w[[OLD]], w0 +; CHECK-NEXT: stxrh [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]] +; CHECK-NEXT: cmp [[STATUS]], #0 +; CHECK-NEXT: b.ne .LBB{{[0-9]+}}_1 +; CHECK: dmb ish + +; CHECK: mov x0, x[[OLD]] + ret i16 %old +} + +define i32 @test_atomic_load_sub_i32(i32 %offset) nounwind { +; CHECK: test_atomic_load_sub_i32: + %old = atomicrmw sub i32* @var32, i32 %offset seq_cst +; CHECK: dmb ish +; CHECK: adrp [[TMPADDR:x[0-9]+]], var32 +; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var32 + +; CHECK: .LBB{{[0-9]+}}_1: +; CHECK-NEXT: ldxr w[[OLD:[0-9]+]], [x[[ADDR]]] + ; w0 below is a reasonable guess but could change: it certainly comes into the + ; function there. +; CHECK-NEXT: sub [[NEW:w[0-9]+]], w[[OLD]], w0 +; CHECK-NEXT: stxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]] +; CHECK-NEXT: cmp [[STATUS]], #0 +; CHECK-NEXT: b.ne .LBB{{[0-9]+}}_1 +; CHECK: dmb ish + +; CHECK: mov x0, x[[OLD]] + ret i32 %old +} + +define i64 @test_atomic_load_sub_i64(i64 %offset) nounwind { +; CHECK: test_atomic_load_sub_i64: + %old = atomicrmw sub i64* @var64, i64 %offset seq_cst +; CHECK: dmb ish +; CHECK: adrp [[TMPADDR:x[0-9]+]], var64 +; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var64 + +; CHECK: .LBB{{[0-9]+}}_1: +; CHECK-NEXT: ldxr x[[OLD:[0-9]+]], [x[[ADDR]]] + ; x0 below is a reasonable guess but could change: it certainly comes into the + ; function there. +; CHECK-NEXT: sub [[NEW:x[0-9]+]], x[[OLD]], x0 +; CHECK-NEXT: stxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]] +; CHECK-NEXT: cmp [[STATUS]], #0 +; CHECK-NEXT: b.ne .LBB{{[0-9]+}}_1 +; CHECK: dmb ish + +; CHECK: mov x0, x[[OLD]] + ret i64 %old +} + +define i8 @test_atomic_load_and_i8(i8 %offset) nounwind { +; CHECK: test_atomic_load_and_i8: + %old = atomicrmw and i8* @var8, i8 %offset seq_cst +; CHECK: dmb ish +; CHECK: adrp [[TMPADDR:x[0-9]+]], var8 +; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var8 + +; CHECK: .LBB{{[0-9]+}}_1: +; CHECK-NEXT: ldxrb w[[OLD:[0-9]+]], [x[[ADDR]]] + ; w0 below is a reasonable guess but could change: it certainly comes into the + ; function there. +; CHECK-NEXT: and [[NEW:w[0-9]+]], w[[OLD]], w0 +; CHECK-NEXT: stxrb [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]] +; CHECK-NEXT: cmp [[STATUS]], #0 +; CHECK-NEXT: b.ne .LBB{{[0-9]+}}_1 +; CHECK: dmb ish + +; CHECK: mov x0, x[[OLD]] + ret i8 %old +} + +define i16 @test_atomic_load_and_i16(i16 %offset) nounwind { +; CHECK: test_atomic_load_and_i16: + %old = atomicrmw and i16* @var16, i16 %offset seq_cst +; CHECK: dmb ish +; CHECK: adrp [[TMPADDR:x[0-9]+]], var16 +; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var16 + +; CHECK: .LBB{{[0-9]+}}_1: +; CHECK-NEXT: ldxrh w[[OLD:[0-9]+]], [x[[ADDR]]] + ; w0 below is a reasonable guess but could change: it certainly comes into the + ; function there. +; CHECK-NEXT: and [[NEW:w[0-9]+]], w[[OLD]], w0 +; CHECK-NEXT: stxrh [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]] +; CHECK-NEXT: cmp [[STATUS]], #0 +; CHECK-NEXT: b.ne .LBB{{[0-9]+}}_1 +; CHECK: dmb ish + +; CHECK: mov x0, x[[OLD]] + ret i16 %old +} + +define i32 @test_atomic_load_and_i32(i32 %offset) nounwind { +; CHECK: test_atomic_load_and_i32: + %old = atomicrmw and i32* @var32, i32 %offset seq_cst +; CHECK: dmb ish +; CHECK: adrp [[TMPADDR:x[0-9]+]], var32 +; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var32 + +; CHECK: .LBB{{[0-9]+}}_1: +; CHECK-NEXT: ldxr w[[OLD:[0-9]+]], [x[[ADDR]]] + ; w0 below is a reasonable guess but could change: it certainly comes into the + ; function there. 
+; CHECK-NEXT: and [[NEW:w[0-9]+]], w[[OLD]], w0 +; CHECK-NEXT: stxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]] +; CHECK-NEXT: cmp [[STATUS]], #0 +; CHECK-NEXT: b.ne .LBB{{[0-9]+}}_1 +; CHECK: dmb ish + +; CHECK: mov x0, x[[OLD]] + ret i32 %old +} + +define i64 @test_atomic_load_and_i64(i64 %offset) nounwind { +; CHECK: test_atomic_load_and_i64: + %old = atomicrmw and i64* @var64, i64 %offset seq_cst +; CHECK: dmb ish +; CHECK: adrp [[TMPADDR:x[0-9]+]], var64 +; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var64 + +; CHECK: .LBB{{[0-9]+}}_1: +; CHECK-NEXT: ldxr x[[OLD:[0-9]+]], [x[[ADDR]]] + ; x0 below is a reasonable guess but could change: it certainly comes into the + ; function there. +; CHECK-NEXT: and [[NEW:x[0-9]+]], x[[OLD]], x0 +; CHECK-NEXT: stxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]] +; CHECK-NEXT: cmp [[STATUS]], #0 +; CHECK-NEXT: b.ne .LBB{{[0-9]+}}_1 +; CHECK: dmb ish + +; CHECK: mov x0, x[[OLD]] + ret i64 %old +} + +define i8 @test_atomic_load_or_i8(i8 %offset) nounwind { +; CHECK: test_atomic_load_or_i8: + %old = atomicrmw or i8* @var8, i8 %offset seq_cst +; CHECK: dmb ish +; CHECK: adrp [[TMPADDR:x[0-9]+]], var8 +; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var8 + +; CHECK: .LBB{{[0-9]+}}_1: +; CHECK-NEXT: ldxrb w[[OLD:[0-9]+]], [x[[ADDR]]] + ; w0 below is a reasonable guess but could change: it certainly comes into the + ; function there. +; CHECK-NEXT: orr [[NEW:w[0-9]+]], w[[OLD]], w0 +; CHECK-NEXT: stxrb [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]] +; CHECK-NEXT: cmp [[STATUS]], #0 +; CHECK-NEXT: b.ne .LBB{{[0-9]+}}_1 +; CHECK: dmb ish + +; CHECK: mov x0, x[[OLD]] + ret i8 %old +} + +define i16 @test_atomic_load_or_i16(i16 %offset) nounwind { +; CHECK: test_atomic_load_or_i16: + %old = atomicrmw or i16* @var16, i16 %offset seq_cst +; CHECK: dmb ish +; CHECK: adrp [[TMPADDR:x[0-9]+]], var16 +; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var16 + +; CHECK: .LBB{{[0-9]+}}_1: +; CHECK-NEXT: ldxrh w[[OLD:[0-9]+]], [x[[ADDR]]] + ; w0 below is a reasonable guess but could change: it certainly comes into the + ; function there. +; CHECK-NEXT: orr [[NEW:w[0-9]+]], w[[OLD]], w0 +; CHECK-NEXT: stxrh [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]] +; CHECK-NEXT: cmp [[STATUS]], #0 +; CHECK-NEXT: b.ne .LBB{{[0-9]+}}_1 +; CHECK: dmb ish + +; CHECK: mov x0, x[[OLD]] + ret i16 %old +} + +define i32 @test_atomic_load_or_i32(i32 %offset) nounwind { +; CHECK: test_atomic_load_or_i32: + %old = atomicrmw or i32* @var32, i32 %offset seq_cst +; CHECK: dmb ish +; CHECK: adrp [[TMPADDR:x[0-9]+]], var32 +; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var32 + +; CHECK: .LBB{{[0-9]+}}_1: +; CHECK-NEXT: ldxr w[[OLD:[0-9]+]], [x[[ADDR]]] + ; w0 below is a reasonable guess but could change: it certainly comes into the + ; function there. +; CHECK-NEXT: orr [[NEW:w[0-9]+]], w[[OLD]], w0 +; CHECK-NEXT: stxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]] +; CHECK-NEXT: cmp [[STATUS]], #0 +; CHECK-NEXT: b.ne .LBB{{[0-9]+}}_1 +; CHECK: dmb ish + +; CHECK: mov x0, x[[OLD]] + ret i32 %old +} + +define i64 @test_atomic_load_or_i64(i64 %offset) nounwind { +; CHECK: test_atomic_load_or_i64: + %old = atomicrmw or i64* @var64, i64 %offset seq_cst +; CHECK: dmb ish +; CHECK: adrp [[TMPADDR:x[0-9]+]], var64 +; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var64 + +; CHECK: .LBB{{[0-9]+}}_1: +; CHECK-NEXT: ldxr x[[OLD:[0-9]+]], [x[[ADDR]]] + ; x0 below is a reasonable guess but could change: it certainly comes into the + ; function there. 
+; CHECK-NEXT: orr [[NEW:x[0-9]+]], x[[OLD]], x0 +; CHECK-NEXT: stxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]] +; CHECK-NEXT: cmp [[STATUS]], #0 +; CHECK-NEXT: b.ne .LBB{{[0-9]+}}_1 +; CHECK: dmb ish + +; CHECK: mov x0, x[[OLD]] + ret i64 %old +} + +define i8 @test_atomic_load_xor_i8(i8 %offset) nounwind { +; CHECK: test_atomic_load_xor_i8: + %old = atomicrmw xor i8* @var8, i8 %offset seq_cst +; CHECK: dmb ish +; CHECK: adrp [[TMPADDR:x[0-9]+]], var8 +; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var8 + +; CHECK: .LBB{{[0-9]+}}_1: +; CHECK-NEXT: ldxrb w[[OLD:[0-9]+]], [x[[ADDR]]] + ; w0 below is a reasonable guess but could change: it certainly comes into the + ; function there. +; CHECK-NEXT: eor [[NEW:w[0-9]+]], w[[OLD]], w0 +; CHECK-NEXT: stxrb [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]] +; CHECK-NEXT: cmp [[STATUS]], #0 +; CHECK-NEXT: b.ne .LBB{{[0-9]+}}_1 +; CHECK: dmb ish + +; CHECK: mov x0, x[[OLD]] + ret i8 %old +} + +define i16 @test_atomic_load_xor_i16(i16 %offset) nounwind { +; CHECK: test_atomic_load_xor_i16: + %old = atomicrmw xor i16* @var16, i16 %offset seq_cst +; CHECK: dmb ish +; CHECK: adrp [[TMPADDR:x[0-9]+]], var16 +; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var16 + +; CHECK: .LBB{{[0-9]+}}_1: +; CHECK-NEXT: ldxrh w[[OLD:[0-9]+]], [x[[ADDR]]] + ; w0 below is a reasonable guess but could change: it certainly comes into the + ; function there. +; CHECK-NEXT: eor [[NEW:w[0-9]+]], w[[OLD]], w0 +; CHECK-NEXT: stxrh [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]] +; CHECK-NEXT: cmp [[STATUS]], #0 +; CHECK-NEXT: b.ne .LBB{{[0-9]+}}_1 +; CHECK: dmb ish + +; CHECK: mov x0, x[[OLD]] + ret i16 %old +} + +define i32 @test_atomic_load_xor_i32(i32 %offset) nounwind { +; CHECK: test_atomic_load_xor_i32: + %old = atomicrmw xor i32* @var32, i32 %offset seq_cst +; CHECK: dmb ish +; CHECK: adrp [[TMPADDR:x[0-9]+]], var32 +; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var32 + +; CHECK: .LBB{{[0-9]+}}_1: +; CHECK-NEXT: ldxr w[[OLD:[0-9]+]], [x[[ADDR]]] + ; w0 below is a reasonable guess but could change: it certainly comes into the + ; function there. +; CHECK-NEXT: eor [[NEW:w[0-9]+]], w[[OLD]], w0 +; CHECK-NEXT: stxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]] +; CHECK-NEXT: cmp [[STATUS]], #0 +; CHECK-NEXT: b.ne .LBB{{[0-9]+}}_1 +; CHECK: dmb ish + +; CHECK: mov x0, x[[OLD]] + ret i32 %old +} + +define i64 @test_atomic_load_xor_i64(i64 %offset) nounwind { +; CHECK: test_atomic_load_xor_i64: + %old = atomicrmw xor i64* @var64, i64 %offset seq_cst +; CHECK: dmb ish +; CHECK: adrp [[TMPADDR:x[0-9]+]], var64 +; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var64 + +; CHECK: .LBB{{[0-9]+}}_1: +; CHECK-NEXT: ldxr x[[OLD:[0-9]+]], [x[[ADDR]]] + ; x0 below is a reasonable guess but could change: it certainly comes into the + ; function there. +; CHECK-NEXT: eor [[NEW:x[0-9]+]], x[[OLD]], x0 +; CHECK-NEXT: stxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]] +; CHECK-NEXT: cmp [[STATUS]], #0 +; CHECK-NEXT: b.ne .LBB{{[0-9]+}}_1 +; CHECK: dmb ish + +; CHECK: mov x0, x[[OLD]] + ret i64 %old +} + +define i8 @test_atomic_load_xchg_i8(i8 %offset) nounwind { +; CHECK: test_atomic_load_xchg_i8: + %old = atomicrmw xchg i8* @var8, i8 %offset seq_cst +; CHECK: dmb ish +; CHECK: adrp [[TMPADDR:x[0-9]+]], var8 +; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var8 + +; CHECK: .LBB{{[0-9]+}}_1: +; CHECK-NEXT: ldxrb w[[OLD:[0-9]+]], [x[[ADDR]]] + ; w0 below is a reasonable guess but could change: it certainly comes into the + ; function there. 
+; CHECK-NEXT: stxrb [[STATUS:w[0-9]+]], w0, [x[[ADDR]]] +; CHECK-NEXT: cmp [[STATUS]], #0 +; CHECK-NEXT: b.ne .LBB{{[0-9]+}}_1 +; CHECK: dmb ish + +; CHECK: mov x0, x[[OLD]] + ret i8 %old +} + +define i16 @test_atomic_load_xchg_i16(i16 %offset) nounwind { +; CHECK: test_atomic_load_xchg_i16: + %old = atomicrmw xchg i16* @var16, i16 %offset seq_cst +; CHECK: dmb ish +; CHECK: adrp [[TMPADDR:x[0-9]+]], var16 +; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var16 + +; CHECK: .LBB{{[0-9]+}}_1: +; CHECK-NEXT: ldxrh w[[OLD:[0-9]+]], [x[[ADDR]]] + ; w0 below is a reasonable guess but could change: it certainly comes into the + ; function there. +; CHECK-NEXT: stxrh [[STATUS:w[0-9]+]], w0, [x[[ADDR]]] +; CHECK-NEXT: cmp [[STATUS]], #0 +; CHECK-NEXT: b.ne .LBB{{[0-9]+}}_1 +; CHECK: dmb ish + +; CHECK: mov x0, x[[OLD]] + ret i16 %old +} + +define i32 @test_atomic_load_xchg_i32(i32 %offset) nounwind { +; CHECK: test_atomic_load_xchg_i32: + %old = atomicrmw xchg i32* @var32, i32 %offset seq_cst +; CHECK: dmb ish +; CHECK: adrp [[TMPADDR:x[0-9]+]], var32 +; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var32 + +; CHECK: .LBB{{[0-9]+}}_1: +; CHECK-NEXT: ldxr w[[OLD:[0-9]+]], [x[[ADDR]]] + ; w0 below is a reasonable guess but could change: it certainly comes into the + ; function there. +; CHECK-NEXT: stxr [[STATUS:w[0-9]+]], w0, [x[[ADDR]]] +; CHECK-NEXT: cmp [[STATUS]], #0 +; CHECK-NEXT: b.ne .LBB{{[0-9]+}}_1 +; CHECK: dmb ish + +; CHECK: mov x0, x[[OLD]] + ret i32 %old +} + +define i64 @test_atomic_load_xchg_i64(i64 %offset) nounwind { +; CHECK: test_atomic_load_xchg_i64: + %old = atomicrmw xchg i64* @var64, i64 %offset seq_cst +; CHECK: dmb ish +; CHECK: adrp [[TMPADDR:x[0-9]+]], var64 +; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var64 + +; CHECK: .LBB{{[0-9]+}}_1: +; CHECK-NEXT: ldxr x[[OLD:[0-9]+]], [x[[ADDR]]] + ; x0 below is a reasonable guess but could change: it certainly comes into the + ; function there. +; CHECK-NEXT: stxr [[STATUS:w[0-9]+]], x0, [x[[ADDR]]] +; CHECK-NEXT: cmp [[STATUS]], #0 +; CHECK-NEXT: b.ne .LBB{{[0-9]+}}_1 +; CHECK: dmb ish + +; CHECK: mov x0, x[[OLD]] + ret i64 %old +} + + +define i8 @test_atomic_load_min_i8(i8 %offset) nounwind { +; CHECK: test_atomic_load_min_i8: + %old = atomicrmw min i8* @var8, i8 %offset seq_cst +; CHECK: dmb ish +; CHECK: adrp [[TMPADDR:x[0-9]+]], var8 +; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var8 + +; CHECK: .LBB{{[0-9]+}}_1: +; CHECK-NEXT: ldxrb w[[OLD:[0-9]+]], [x[[ADDR]]] + ; w0 below is a reasonable guess but could change: it certainly comes into the + ; function there. +; CHECK-NEXT: cmp w0, w[[OLD]], sxtb +; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, gt +; CHECK-NEXT: stxrb [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]] +; CHECK-NEXT: cmp [[STATUS]], #0 +; CHECK-NEXT: b.ne .LBB{{[0-9]+}}_1 +; CHECK: dmb ish + +; CHECK: mov x0, x[[OLD]] + ret i8 %old +} + +define i16 @test_atomic_load_min_i16(i16 %offset) nounwind { +; CHECK: test_atomic_load_min_i16: + %old = atomicrmw min i16* @var16, i16 %offset seq_cst +; CHECK: dmb ish +; CHECK: adrp [[TMPADDR:x[0-9]+]], var16 +; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var16 + +; CHECK: .LBB{{[0-9]+}}_1: +; CHECK-NEXT: ldxrh w[[OLD:[0-9]+]], [x[[ADDR]]] + ; w0 below is a reasonable guess but could change: it certainly comes into the + ; function there. 
+; CHECK-NEXT: cmp w0, w[[OLD]], sxth +; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, gt +; CHECK-NEXT: stxrh [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]] +; CHECK-NEXT: cmp [[STATUS]], #0 +; CHECK-NEXT: b.ne .LBB{{[0-9]+}}_1 +; CHECK: dmb ish + +; CHECK: mov x0, x[[OLD]] + ret i16 %old +} + +define i32 @test_atomic_load_min_i32(i32 %offset) nounwind { +; CHECK: test_atomic_load_min_i32: + %old = atomicrmw min i32* @var32, i32 %offset seq_cst +; CHECK: dmb ish +; CHECK: adrp [[TMPADDR:x[0-9]+]], var32 +; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var32 + +; CHECK: .LBB{{[0-9]+}}_1: +; CHECK-NEXT: ldxr w[[OLD:[0-9]+]], [x[[ADDR]]] + ; w0 below is a reasonable guess but could change: it certainly comes into the + ; function there. +; CHECK-NEXT: cmp w0, w[[OLD]] +; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, gt +; CHECK-NEXT: stxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]] +; CHECK-NEXT: cmp [[STATUS]], #0 +; CHECK-NEXT: b.ne .LBB{{[0-9]+}}_1 +; CHECK: dmb ish + +; CHECK: mov x0, x[[OLD]] + ret i32 %old +} + +define i64 @test_atomic_load_min_i64(i64 %offset) nounwind { +; CHECK: test_atomic_load_min_i64: + %old = atomicrmw min i64* @var64, i64 %offset seq_cst +; CHECK: dmb ish +; CHECK: adrp [[TMPADDR:x[0-9]+]], var64 +; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var64 + +; CHECK: .LBB{{[0-9]+}}_1: +; CHECK-NEXT: ldxr x[[OLD:[0-9]+]], [x[[ADDR]]] + ; x0 below is a reasonable guess but could change: it certainly comes into the + ; function there. +; CHECK-NEXT: cmp x0, x[[OLD]] +; CHECK-NEXT: csel [[NEW:x[0-9]+]], x[[OLD]], x0, gt +; CHECK-NEXT: stxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]] +; CHECK-NEXT: cmp [[STATUS]], #0 +; CHECK-NEXT: b.ne .LBB{{[0-9]+}}_1 +; CHECK: dmb ish + +; CHECK: mov x0, x[[OLD]] + ret i64 %old +} + +define i8 @test_atomic_load_max_i8(i8 %offset) nounwind { +; CHECK: test_atomic_load_max_i8: + %old = atomicrmw max i8* @var8, i8 %offset seq_cst +; CHECK: dmb ish +; CHECK: adrp [[TMPADDR:x[0-9]+]], var8 +; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var8 + +; CHECK: .LBB{{[0-9]+}}_1: +; CHECK-NEXT: ldxrb w[[OLD:[0-9]+]], [x[[ADDR]]] + ; w0 below is a reasonable guess but could change: it certainly comes into the + ; function there. +; CHECK-NEXT: cmp w0, w[[OLD]], sxtb +; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, lt +; CHECK-NEXT: stxrb [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]] +; CHECK-NEXT: cmp [[STATUS]], #0 +; CHECK-NEXT: b.ne .LBB{{[0-9]+}}_1 +; CHECK: dmb ish + +; CHECK: mov x0, x[[OLD]] + ret i8 %old +} + +define i16 @test_atomic_load_max_i16(i16 %offset) nounwind { +; CHECK: test_atomic_load_max_i16: + %old = atomicrmw max i16* @var16, i16 %offset seq_cst +; CHECK: dmb ish +; CHECK: adrp [[TMPADDR:x[0-9]+]], var16 +; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var16 + +; CHECK: .LBB{{[0-9]+}}_1: +; CHECK-NEXT: ldxrh w[[OLD:[0-9]+]], [x[[ADDR]]] + ; w0 below is a reasonable guess but could change: it certainly comes into the + ; function there. 
+; CHECK-NEXT: cmp w0, w[[OLD]], sxth +; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, lt +; CHECK-NEXT: stxrh [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]] +; CHECK-NEXT: cmp [[STATUS]], #0 +; CHECK-NEXT: b.ne .LBB{{[0-9]+}}_1 +; CHECK: dmb ish + +; CHECK: mov x0, x[[OLD]] + ret i16 %old +} + +define i32 @test_atomic_load_max_i32(i32 %offset) nounwind { +; CHECK: test_atomic_load_max_i32: + %old = atomicrmw max i32* @var32, i32 %offset seq_cst +; CHECK: dmb ish +; CHECK: adrp [[TMPADDR:x[0-9]+]], var32 +; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var32 + +; CHECK: .LBB{{[0-9]+}}_1: +; CHECK-NEXT: ldxr w[[OLD:[0-9]+]], [x[[ADDR]]] + ; w0 below is a reasonable guess but could change: it certainly comes into the + ; function there. +; CHECK-NEXT: cmp w0, w[[OLD]] +; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, lt +; CHECK-NEXT: stxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]] +; CHECK-NEXT: cmp [[STATUS]], #0 +; CHECK-NEXT: b.ne .LBB{{[0-9]+}}_1 +; CHECK: dmb ish + +; CHECK: mov x0, x[[OLD]] + ret i32 %old +} + +define i64 @test_atomic_load_max_i64(i64 %offset) nounwind { +; CHECK: test_atomic_load_max_i64: + %old = atomicrmw max i64* @var64, i64 %offset seq_cst +; CHECK: dmb ish +; CHECK: adrp [[TMPADDR:x[0-9]+]], var64 +; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var64 + +; CHECK: .LBB{{[0-9]+}}_1: +; CHECK-NEXT: ldxr x[[OLD:[0-9]+]], [x[[ADDR]]] + ; x0 below is a reasonable guess but could change: it certainly comes into the + ; function there. +; CHECK-NEXT: cmp x0, x[[OLD]] +; CHECK-NEXT: csel [[NEW:x[0-9]+]], x[[OLD]], x0, lt +; CHECK-NEXT: stxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]] +; CHECK-NEXT: cmp [[STATUS]], #0 +; CHECK-NEXT: b.ne .LBB{{[0-9]+}}_1 +; CHECK: dmb ish + +; CHECK: mov x0, x[[OLD]] + ret i64 %old +} + +define i8 @test_atomic_load_umin_i8(i8 %offset) nounwind { +; CHECK: test_atomic_load_umin_i8: + %old = atomicrmw umin i8* @var8, i8 %offset seq_cst +; CHECK: dmb ish +; CHECK: adrp [[TMPADDR:x[0-9]+]], var8 +; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var8 + +; CHECK: .LBB{{[0-9]+}}_1: +; CHECK-NEXT: ldxrb w[[OLD:[0-9]+]], [x[[ADDR]]] + ; w0 below is a reasonable guess but could change: it certainly comes into the + ; function there. +; CHECK-NEXT: cmp w0, w[[OLD]], uxtb +; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, hi +; CHECK-NEXT: stxrb [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]] +; CHECK-NEXT: cmp [[STATUS]], #0 +; CHECK-NEXT: b.ne .LBB{{[0-9]+}}_1 +; CHECK: dmb ish + +; CHECK: mov x0, x[[OLD]] + ret i8 %old +} + +define i16 @test_atomic_load_umin_i16(i16 %offset) nounwind { +; CHECK: test_atomic_load_umin_i16: + %old = atomicrmw umin i16* @var16, i16 %offset seq_cst +; CHECK: dmb ish +; CHECK: adrp [[TMPADDR:x[0-9]+]], var16 +; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var16 + +; CHECK: .LBB{{[0-9]+}}_1: +; CHECK-NEXT: ldxrh w[[OLD:[0-9]+]], [x[[ADDR]]] + ; w0 below is a reasonable guess but could change: it certainly comes into the + ; function there. 
+; CHECK-NEXT: cmp w0, w[[OLD]], uxth +; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, hi +; CHECK-NEXT: stxrh [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]] +; CHECK-NEXT: cmp [[STATUS]], #0 +; CHECK-NEXT: b.ne .LBB{{[0-9]+}}_1 +; CHECK: dmb ish + +; CHECK: mov x0, x[[OLD]] + ret i16 %old +} + +define i32 @test_atomic_load_umin_i32(i32 %offset) nounwind { +; CHECK: test_atomic_load_umin_i32: + %old = atomicrmw umin i32* @var32, i32 %offset seq_cst +; CHECK: dmb ish +; CHECK: adrp [[TMPADDR:x[0-9]+]], var32 +; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var32 + +; CHECK: .LBB{{[0-9]+}}_1: +; CHECK-NEXT: ldxr w[[OLD:[0-9]+]], [x[[ADDR]]] + ; w0 below is a reasonable guess but could change: it certainly comes into the + ; function there. +; CHECK-NEXT: cmp w0, w[[OLD]] +; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, hi +; CHECK-NEXT: stxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]] +; CHECK-NEXT: cmp [[STATUS]], #0 +; CHECK-NEXT: b.ne .LBB{{[0-9]+}}_1 +; CHECK: dmb ish + +; CHECK: mov x0, x[[OLD]] + ret i32 %old +} + +define i64 @test_atomic_load_umin_i64(i64 %offset) nounwind { +; CHECK: test_atomic_load_umin_i64: + %old = atomicrmw umin i64* @var64, i64 %offset seq_cst +; CHECK: dmb ish +; CHECK: adrp [[TMPADDR:x[0-9]+]], var64 +; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var64 + +; CHECK: .LBB{{[0-9]+}}_1: +; CHECK-NEXT: ldxr x[[OLD:[0-9]+]], [x[[ADDR]]] + ; x0 below is a reasonable guess but could change: it certainly comes into the + ; function there. +; CHECK-NEXT: cmp x0, x[[OLD]] +; CHECK-NEXT: csel [[NEW:x[0-9]+]], x[[OLD]], x0, hi +; CHECK-NEXT: stxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]] +; CHECK-NEXT: cmp [[STATUS]], #0 +; CHECK-NEXT: b.ne .LBB{{[0-9]+}}_1 +; CHECK: dmb ish + +; CHECK: mov x0, x[[OLD]] + ret i64 %old +} + +define i8 @test_atomic_load_umax_i8(i8 %offset) nounwind { +; CHECK: test_atomic_load_umax_i8: + %old = atomicrmw umax i8* @var8, i8 %offset seq_cst +; CHECK: dmb ish +; CHECK: adrp [[TMPADDR:x[0-9]+]], var8 +; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var8 + +; CHECK: .LBB{{[0-9]+}}_1: +; CHECK-NEXT: ldxrb w[[OLD:[0-9]+]], [x[[ADDR]]] + ; w0 below is a reasonable guess but could change: it certainly comes into the + ; function there. +; CHECK-NEXT: cmp w0, w[[OLD]], uxtb +; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, lo +; CHECK-NEXT: stxrb [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]] +; CHECK-NEXT: cmp [[STATUS]], #0 +; CHECK-NEXT: b.ne .LBB{{[0-9]+}}_1 +; CHECK: dmb ish + +; CHECK: mov x0, x[[OLD]] + ret i8 %old +} + +define i16 @test_atomic_load_umax_i16(i16 %offset) nounwind { +; CHECK: test_atomic_load_umax_i16: + %old = atomicrmw umax i16* @var16, i16 %offset seq_cst +; CHECK: dmb ish +; CHECK: adrp [[TMPADDR:x[0-9]+]], var16 +; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var16 + +; CHECK: .LBB{{[0-9]+}}_1: +; CHECK-NEXT: ldxrh w[[OLD:[0-9]+]], [x[[ADDR]]] + ; w0 below is a reasonable guess but could change: it certainly comes into the + ; function there. 
+; CHECK-NEXT: cmp w0, w[[OLD]], uxth +; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, lo +; CHECK-NEXT: stxrh [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]] +; CHECK-NEXT: cmp [[STATUS]], #0 +; CHECK-NEXT: b.ne .LBB{{[0-9]+}}_1 +; CHECK: dmb ish + +; CHECK: mov x0, x[[OLD]] + ret i16 %old +} + +define i32 @test_atomic_load_umax_i32(i32 %offset) nounwind { +; CHECK: test_atomic_load_umax_i32: + %old = atomicrmw umax i32* @var32, i32 %offset seq_cst +; CHECK: dmb ish +; CHECK: adrp [[TMPADDR:x[0-9]+]], var32 +; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var32 + +; CHECK: .LBB{{[0-9]+}}_1: +; CHECK-NEXT: ldxr w[[OLD:[0-9]+]], [x[[ADDR]]] + ; w0 below is a reasonable guess but could change: it certainly comes into the + ; function there. +; CHECK-NEXT: cmp w0, w[[OLD]] +; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, lo +; CHECK-NEXT: stxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]] +; CHECK-NEXT: cmp [[STATUS]], #0 +; CHECK-NEXT: b.ne .LBB{{[0-9]+}}_1 +; CHECK: dmb ish + +; CHECK: mov x0, x[[OLD]] + ret i32 %old +} + +define i64 @test_atomic_load_umax_i64(i64 %offset) nounwind { +; CHECK: test_atomic_load_umax_i64: + %old = atomicrmw umax i64* @var64, i64 %offset seq_cst +; CHECK: dmb ish +; CHECK: adrp [[TMPADDR:x[0-9]+]], var64 +; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var64 + +; CHECK: .LBB{{[0-9]+}}_1: +; CHECK-NEXT: ldxr x[[OLD:[0-9]+]], [x[[ADDR]]] + ; x0 below is a reasonable guess but could change: it certainly comes into the + ; function there. +; CHECK-NEXT: cmp x0, x[[OLD]] +; CHECK-NEXT: csel [[NEW:x[0-9]+]], x[[OLD]], x0, lo +; CHECK-NEXT: stxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]] +; CHECK-NEXT: cmp [[STATUS]], #0 +; CHECK-NEXT: b.ne .LBB{{[0-9]+}}_1 +; CHECK: dmb ish + +; CHECK: mov x0, x[[OLD]] + ret i64 %old +} + +define i8 @test_atomic_cmpxchg_i8(i8 %wanted, i8 %new) nounwind { +; CHECK: test_atomic_cmpxchg_i8: + %old = cmpxchg i8* @var8, i8 %wanted, i8 %new seq_cst +; CHECK: dmb ish +; CHECK: adrp [[TMPADDR:x[0-9]+]], var8 +; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var8 + +; CHECK: [[STARTAGAIN:.LBB[0-9]+_[0-9]+]]: +; CHECK-NEXT: ldxrb w[[OLD:[0-9]+]], [x[[ADDR]]] + ; w0 below is a reasonable guess but could change: it certainly comes into the + ; function there. +; CHECK-NEXT: cmp w[[OLD]], w0 +; CHECK-NEXT: b.ne [[GET_OUT:.LBB[0-9]+_[0-9]+]] + ; As above, w1 is a reasonable guess. +; CHECK: stxrb [[STATUS:w[0-9]+]], w1, [x[[ADDR]]] +; CHECK-NEXT: cmp [[STATUS]], #0 +; CHECK-NEXT: b.ne [[STARTAGAIN]] +; CHECK: dmb ish + +; CHECK: mov x0, x[[OLD]] + ret i8 %old +} + +define i16 @test_atomic_cmpxchg_i16(i16 %wanted, i16 %new) nounwind { +; CHECK: test_atomic_cmpxchg_i16: + %old = cmpxchg i16* @var16, i16 %wanted, i16 %new seq_cst +; CHECK: dmb ish +; CHECK: adrp [[TMPADDR:x[0-9]+]], var16 +; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var16 + +; CHECK: [[STARTAGAIN:.LBB[0-9]+_[0-9]+]]: +; CHECK-NEXT: ldxrh w[[OLD:[0-9]+]], [x[[ADDR]]] + ; w0 below is a reasonable guess but could change: it certainly comes into the + ; function there. +; CHECK-NEXT: cmp w[[OLD]], w0 +; CHECK-NEXT: b.ne [[GET_OUT:.LBB[0-9]+_[0-9]+]] + ; As above, w1 is a reasonable guess. 
+; CHECK: stxrh [[STATUS:w[0-9]+]], w1, [x[[ADDR]]] +; CHECK-NEXT: cmp [[STATUS]], #0 +; CHECK-NEXT: b.ne [[STARTAGAIN]] +; CHECK: dmb ish + +; CHECK: mov x0, x[[OLD]] + ret i16 %old +} + +define i32 @test_atomic_cmpxchg_i32(i32 %wanted, i32 %new) nounwind { +; CHECK: test_atomic_cmpxchg_i32: + %old = cmpxchg i32* @var32, i32 %wanted, i32 %new seq_cst +; CHECK: dmb ish +; CHECK: adrp [[TMPADDR:x[0-9]+]], var32 +; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var32 + +; CHECK: [[STARTAGAIN:.LBB[0-9]+_[0-9]+]]: +; CHECK-NEXT: ldxr w[[OLD:[0-9]+]], [x[[ADDR]]] + ; w0 below is a reasonable guess but could change: it certainly comes into the + ; function there. +; CHECK-NEXT: cmp w[[OLD]], w0 +; CHECK-NEXT: b.ne [[GET_OUT:.LBB[0-9]+_[0-9]+]] + ; As above, w1 is a reasonable guess. +; CHECK: stxr [[STATUS:w[0-9]+]], w1, [x[[ADDR]]] +; CHECK-NEXT: cmp [[STATUS]], #0 +; CHECK-NEXT: b.ne [[STARTAGAIN]] +; CHECK: dmb ish + +; CHECK: mov x0, x[[OLD]] + ret i32 %old +} + +define i64 @test_atomic_cmpxchg_i64(i64 %wanted, i64 %new) nounwind { +; CHECK: test_atomic_cmpxchg_i64: + %old = cmpxchg i64* @var64, i64 %wanted, i64 %new seq_cst +; CHECK: dmb ish +; CHECK: adrp [[TMPADDR:x[0-9]+]], var64 +; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var64 + +; CHECK: [[STARTAGAIN:.LBB[0-9]+_[0-9]+]]: +; CHECK-NEXT: ldxr x[[OLD:[0-9]+]], [x[[ADDR]]] + ; w0 below is a reasonable guess but could change: it certainly comes into the + ; function there. +; CHECK-NEXT: cmp x[[OLD]], x0 +; CHECK-NEXT: b.ne [[GET_OUT:.LBB[0-9]+_[0-9]+]] + ; As above, w1 is a reasonable guess. +; CHECK: stxr [[STATUS:w[0-9]+]], x1, [x[[ADDR]]] +; CHECK-NEXT: cmp [[STATUS]], #0 +; CHECK-NEXT: b.ne [[STARTAGAIN]] +; CHECK: dmb ish + +; CHECK: mov x0, x[[OLD]] + ret i64 %old +} + +define i8 @test_atomic_load_monotonic_i8() nounwind { +; CHECK: test_atomic_load_monotonic_i8: + %val = load atomic i8* @var8 monotonic, align 1 +; CHECK-NOT: dmb +; CHECK: adrp x[[HIADDR:[0-9]+]], var8 +; CHECK: ldrb w0, [x[[HIADDR]], #:lo12:var8] +; CHECK-NOT: dmb + + ret i8 %val +} + +define i8 @test_atomic_load_monotonic_regoff_i8(i64 %base, i64 %off) nounwind { +; CHECK: test_atomic_load_monotonic_regoff_i8: + %addr_int = add i64 %base, %off + %addr = inttoptr i64 %addr_int to i8* + + %val = load atomic i8* %addr monotonic, align 1 +; CHECK-NOT: dmb +; CHECK: ldrb w0, [x0, x1] +; CHECK-NOT: dmb + + ret i8 %val +} + +define i8 @test_atomic_load_acquire_i8() nounwind { +; CHECK: test_atomic_load_acquire_i8: + %val = load atomic i8* @var8 acquire, align 1 +; CHECK: adrp [[TMPADDR:x[0-9]+]], var8 +; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var8 + +; CHECK: ldarb w0, [x[[ADDR]]] + ret i8 %val +} + +define i8 @test_atomic_load_seq_cst_i8() nounwind { +; CHECK: test_atomic_load_seq_cst_i8: + %val = load atomic i8* @var8 seq_cst, align 1 +; CHECK: adrp x[[HIADDR:[0-9]+]], var8 +; CHECK: ldrb w0, [x[[HIADDR]], #:lo12:var8] +; CHECK: dmb ish + ret i8 %val +} + +define i16 @test_atomic_load_monotonic_i16() nounwind { +; CHECK: test_atomic_load_monotonic_i16: + %val = load atomic i16* @var16 monotonic, align 2 +; CHECK-NOT: dmb +; CHECK: adrp x[[HIADDR:[0-9]+]], var16 +; CHECK: ldrh w0, [x[[HIADDR]], #:lo12:var16] +; CHECK-NOT: dmb + + ret i16 %val +} + +define i32 @test_atomic_load_monotonic_regoff_i32(i64 %base, i64 %off) nounwind { +; CHECK: test_atomic_load_monotonic_regoff_i32: + %addr_int = add i64 %base, %off + %addr = inttoptr i64 %addr_int to i32* + + %val = load atomic i32* %addr monotonic, align 4 +; CHECK-NOT: dmb +; CHECK: ldr w0, [x0, x1] +; 
CHECK-NOT: dmb + + ret i32 %val +} + +define i64 @test_atomic_load_seq_cst_i64() nounwind { +; CHECK: test_atomic_load_seq_cst_i64: + %val = load atomic i64* @var64 seq_cst, align 8 +; CHECK: adrp x[[HIADDR:[0-9]+]], var64 +; CHECK: ldr x0, [x[[HIADDR]], #:lo12:var64] +; CHECK: dmb ish + ret i64 %val +} + +define void @test_atomic_store_monotonic_i8(i8 %val) nounwind { +; CHECK: test_atomic_store_monotonic_i8: + store atomic i8 %val, i8* @var8 monotonic, align 1 +; CHECK: adrp x[[HIADDR:[0-9]+]], var8 +; CHECK: strb w0, [x[[HIADDR]], #:lo12:var8] + + ret void +} + +define void @test_atomic_store_monotonic_regoff_i8(i64 %base, i64 %off, i8 %val) nounwind { +; CHECK: test_atomic_store_monotonic_regoff_i8: + + %addr_int = add i64 %base, %off + %addr = inttoptr i64 %addr_int to i8* + + store atomic i8 %val, i8* %addr monotonic, align 1 +; CHECK: strb w2, [x0, x1] + + ret void +} +define void @test_atomic_store_release_i8(i8 %val) nounwind { +; CHECK: test_atomic_store_release_i8: + store atomic i8 %val, i8* @var8 release, align 1 +; CHECK: adrp [[HIADDR:x[0-9]+]], var8 +; CHECK: add x[[ADDR:[0-9]+]], [[HIADDR]], #:lo12:var8 +; CHECK: stlrb w0, [x[[ADDR]]] + + ret void +} + +define void @test_atomic_store_seq_cst_i8(i8 %val) nounwind { +; CHECK: test_atomic_store_seq_cst_i8: + store atomic i8 %val, i8* @var8 seq_cst, align 1 +; CHECK: adrp [[HIADDR:x[0-9]+]], var8 +; CHECK: add x[[ADDR:[0-9]+]], [[HIADDR]], #:lo12:var8 +; CHECK: stlrb w0, [x[[ADDR]]] +; CHECK: dmb ish + + ret void +} + +define void @test_atomic_store_monotonic_i16(i16 %val) nounwind { +; CHECK: test_atomic_store_monotonic_i16: + store atomic i16 %val, i16* @var16 monotonic, align 2 +; CHECK: adrp x[[HIADDR:[0-9]+]], var16 +; CHECK: strh w0, [x[[HIADDR]], #:lo12:var16] + + ret void +} + +define void @test_atomic_store_monotonic_regoff_i32(i64 %base, i64 %off, i32 %val) nounwind { +; CHECK: test_atomic_store_monotonic_regoff_i32: + + %addr_int = add i64 %base, %off + %addr = inttoptr i64 %addr_int to i32* + + store atomic i32 %val, i32* %addr monotonic, align 4 +; CHECK: str w2, [x0, x1] + + ret void +} + +define void @test_atomic_store_release_i64(i64 %val) nounwind { +; CHECK: test_atomic_store_release_i64: + store atomic i64 %val, i64* @var64 release, align 8 +; CHECK: adrp [[HIADDR:x[0-9]+]], var64 +; CHECK: add x[[ADDR:[0-9]+]], [[HIADDR]], #:lo12:var64 +; CHECK: stlr x0, [x[[ADDR]]] + + ret void +} diff --git a/test/CodeGen/AArch64/basic-pic.ll b/test/CodeGen/AArch64/basic-pic.ll new file mode 100644 index 0000000..da94041 --- /dev/null +++ b/test/CodeGen/AArch64/basic-pic.ll @@ -0,0 +1,70 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs -relocation-model=pic %s -o - | FileCheck %s +; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs -relocation-model=pic -filetype=obj %s -o -| llvm-objdump -r - | FileCheck --check-prefix=CHECK-ELF %s + +@var = global i32 0 + +; CHECK-ELF: RELOCATION RECORDS FOR [.text] + +define i32 @get_globalvar() { +; CHECK: get_globalvar: + + %val = load i32* @var +; CHECK: adrp x[[GOTHI:[0-9]+]], :got:var +; CHECK: ldr x[[GOTLOC:[0-9]+]], [x[[GOTHI]], #:got_lo12:var] +; CHECK: ldr w0, [x[[GOTLOC]]] + +; CHECK-ELF: R_AARCH64_ADR_GOT_PAGE var +; CHECK-ELF: R_AARCH64_LD64_GOT_LO12_NC var + ret i32 %val +} + +define i32* @get_globalvaraddr() { +; CHECK: get_globalvaraddr: + + %val = load i32* @var +; CHECK: adrp x[[GOTHI:[0-9]+]], :got:var +; CHECK: ldr x0, [x[[GOTHI]], #:got_lo12:var] + +; CHECK-ELF: R_AARCH64_ADR_GOT_PAGE var +; CHECK-ELF: R_AARCH64_LD64_GOT_LO12_NC var 
+ ret i32* @var
+}
+
+@hiddenvar = hidden global i32 0
+
+define i32 @get_hiddenvar() {
+; CHECK: get_hiddenvar:
+
+ %val = load i32* @hiddenvar
+; CHECK: adrp x[[HI:[0-9]+]], hiddenvar
+; CHECK: ldr w0, [x[[HI]], #:lo12:hiddenvar]
+
+; CHECK-ELF: R_AARCH64_ADR_PREL_PG_HI21 hiddenvar
+; CHECK-ELF: R_AARCH64_LDST32_ABS_LO12_NC hiddenvar
+ ret i32 %val
+}
+
+define i32* @get_hiddenvaraddr() {
+; CHECK: get_hiddenvaraddr:
+
+ %val = load i32* @hiddenvar
+; CHECK: adrp [[HI:x[0-9]+]], hiddenvar
+; CHECK: add x0, [[HI]], #:lo12:hiddenvar
+
+; CHECK-ELF: R_AARCH64_ADR_PREL_PG_HI21 hiddenvar
+; CHECK-ELF: R_AARCH64_ADD_ABS_LO12_NC hiddenvar
+ ret i32* @hiddenvar
+}
+
+define void()* @get_func() {
+; CHECK: get_func:
+
+ ret void()* bitcast(void()*()* @get_func to void()*)
+; CHECK: adrp x[[GOTHI:[0-9]+]], :got:get_func
+; CHECK: ldr x0, [x[[GOTHI]], #:got_lo12:get_func]
+
+ ; Particularly important that the ADRP gets a relocation: LLVM tends to think
+ ; it can relax it because it knows where get_func is. It can't!
+; CHECK-ELF: R_AARCH64_ADR_GOT_PAGE get_func
+; CHECK-ELF: R_AARCH64_LD64_GOT_LO12_NC get_func
+}
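
; N.b. (a hedged sketch, not part of the test): the GOT-indirect adrp/ldr
; sequence should apply to any symbol the linker may preempt, an external
; global for instance, while the direct adrp/add pair is only usable for
; non-preemptible (here, hidden) symbols. Hypothetical input, with the
; sequence expected by analogy with @var above:
;
;   @extvar = external global i32
;
;   define i32* @get_extvaraddr() {
;     ; expect: adrp x[[GOT:[0-9]+]], :got:extvar
;     ;         ldr x0, [x[[GOT]], #:got_lo12:extvar]
;     ret i32* @extvar
;   }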
\ No newline at end of file diff --git a/test/CodeGen/AArch64/bitfield-insert-0.ll b/test/CodeGen/AArch64/bitfield-insert-0.ll new file mode 100644 index 0000000..d1191f6 --- /dev/null +++ b/test/CodeGen/AArch64/bitfield-insert-0.ll @@ -0,0 +1,19 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu -filetype=obj < %s | llvm-objdump -disassemble - | FileCheck %s + +; The encoding of lsb -> immr in the CGed bitfield instructions was wrong at one +; point, in the edge case where lsb = 0. Just make sure. + +define void @test_bfi0(i32* %existing, i32* %new) { +; CHECK: bfxil {{w[0-9]+}}, {{w[0-9]+}}, #0, #18 + + %oldval = load volatile i32* %existing + %oldval_keep = and i32 %oldval, 4294705152 ; 0xfffc_0000 + + %newval = load volatile i32* %new + %newval_masked = and i32 %newval, 262143 ; = 0x0003_ffff + + %combined = or i32 %newval_masked, %oldval_keep + store volatile i32 %combined, i32* %existing + + ret void +}
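
; N.b. (a worked note, not part of the test): the two masks determine the
; bfxil operands. 0xfffc0000 keeps bits 18-31 of the old value and
; 0x0003ffff keeps bits 0-17 of the new one, so the OR is an insert of
; width 18 at lsb 0, i.e. "bfxil wD, wS, #0, #18" (wD/wS stand for any
; w registers). For bfxil the encoding is simply immr = lsb, but bfi
; encodes immr = (32 - lsb) % 32, where a naive "32 - lsb" computation
; overflows the field at lsb = 0; plausibly the historical bug this test
; guards against.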
\ No newline at end of file diff --git a/test/CodeGen/AArch64/bitfield-insert.ll b/test/CodeGen/AArch64/bitfield-insert.ll new file mode 100644 index 0000000..3e871b9 --- /dev/null +++ b/test/CodeGen/AArch64/bitfield-insert.ll @@ -0,0 +1,193 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s + +; First, a simple example from Clang. The registers could plausibly be +; different, but probably won't be. + +%struct.foo = type { i8, [2 x i8], i8 } + +define [1 x i64] @from_clang([1 x i64] %f.coerce, i32 %n) nounwind readnone { +; CHECK: from_clang: +; CHECK: bfi w0, w1, #3, #4 +; CHECK-NEXT: ret + +entry: + %f.coerce.fca.0.extract = extractvalue [1 x i64] %f.coerce, 0 + %tmp.sroa.0.0.extract.trunc = trunc i64 %f.coerce.fca.0.extract to i32 + %bf.value = shl i32 %n, 3 + %0 = and i32 %bf.value, 120 + %f.sroa.0.0.insert.ext.masked = and i32 %tmp.sroa.0.0.extract.trunc, 135 + %1 = or i32 %f.sroa.0.0.insert.ext.masked, %0 + %f.sroa.0.0.extract.trunc = zext i32 %1 to i64 + %tmp1.sroa.1.1.insert.insert = and i64 %f.coerce.fca.0.extract, 4294967040 + %tmp1.sroa.0.0.insert.insert = or i64 %f.sroa.0.0.extract.trunc, %tmp1.sroa.1.1.insert.insert + %.fca.0.insert = insertvalue [1 x i64] undef, i64 %tmp1.sroa.0.0.insert.insert, 0 + ret [1 x i64] %.fca.0.insert +} + +define void @test_whole32(i32* %existing, i32* %new) { +; CHECK: test_whole32: +; CHECK: bfi {{w[0-9]+}}, {{w[0-9]+}}, #26, #5 + + %oldval = load volatile i32* %existing + %oldval_keep = and i32 %oldval, 2214592511 ; =0x83ffffff + + %newval = load volatile i32* %new + %newval_shifted = shl i32 %newval, 26 + %newval_masked = and i32 %newval_shifted, 2080374784 ; = 0x7c000000 + + %combined = or i32 %oldval_keep, %newval_masked + store volatile i32 %combined, i32* %existing + + ret void +} + +define void @test_whole64(i64* %existing, i64* %new) { +; CHECK: test_whole64: +; CHECK: bfi {{x[0-9]+}}, {{x[0-9]+}}, #26, #14 +; CHECK-NOT: and +; CHECK: ret + + %oldval = load volatile i64* %existing + %oldval_keep = and i64 %oldval, 18446742974265032703 ; = 0xffffff0003ffffffL + + %newval = load volatile i64* %new + %newval_shifted = shl i64 %newval, 26 + %newval_masked = and i64 %newval_shifted, 1099444518912 ; = 0xfffc000000 + + %combined = or i64 %oldval_keep, %newval_masked + store volatile i64 %combined, i64* %existing + + ret void +} + +define void @test_whole32_from64(i64* %existing, i64* %new) { +; CHECK: test_whole32_from64: +; CHECK: bfi {{w[0-9]+}}, {{w[0-9]+}}, #{{0|16}}, #16 +; CHECK-NOT: and +; CHECK: ret + + %oldval = load volatile i64* %existing + %oldval_keep = and i64 %oldval, 4294901760 ; = 0xffff0000 + + %newval = load volatile i64* %new + %newval_masked = and i64 %newval, 65535 ; = 0xffff + + %combined = or i64 %oldval_keep, %newval_masked + store volatile i64 %combined, i64* %existing + + ret void +} + +define void @test_32bit_masked(i32 *%existing, i32 *%new) { +; CHECK: test_32bit_masked: +; CHECK: bfi [[INSERT:w[0-9]+]], {{w[0-9]+}}, #3, #4 +; CHECK: and {{w[0-9]+}}, [[INSERT]], #0xff + + %oldval = load volatile i32* %existing + %oldval_keep = and i32 %oldval, 135 ; = 0x87 + + %newval = load volatile i32* %new + %newval_shifted = shl i32 %newval, 3 + %newval_masked = and i32 %newval_shifted, 120 ; = 0x78 + + %combined = or i32 %oldval_keep, %newval_masked + store volatile i32 %combined, i32* %existing + + ret void +} + +define void @test_64bit_masked(i64 *%existing, i64 *%new) { +; CHECK: test_64bit_masked: +; CHECK: bfi [[INSERT:x[0-9]+]], {{x[0-9]+}}, #40, #8 +; CHECK: and {{x[0-9]+}}, [[INSERT]], #0xffff00000000 
+
+  %oldval = load volatile i64* %existing
+  %oldval_keep = and i64 %oldval, 1095216660480 ; = 0xff_0000_0000
+
+  %newval = load volatile i64* %new
+  %newval_shifted = shl i64 %newval, 40
+  %newval_masked = and i64 %newval_shifted, 280375465082880 ; = 0xff00_0000_0000
+
+  %combined = or i64 %newval_masked, %oldval_keep
+  store volatile i64 %combined, i64* %existing
+
+  ret void
+}
+
+; Mask is too complicated for literal ANDwwi, make sure other avenues are tried.
+define void @test_32bit_complexmask(i32 *%existing, i32 *%new) {
+; CHECK: test_32bit_complexmask:
+; CHECK: bfi {{w[0-9]+}}, {{w[0-9]+}}, #3, #4
+; CHECK: and {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+
+  %oldval = load volatile i32* %existing
+  %oldval_keep = and i32 %oldval, 647 ; = 0x287
+
+  %newval = load volatile i32* %new
+  %newval_shifted = shl i32 %newval, 3
+  %newval_masked = and i32 %newval_shifted, 120 ; = 0x78
+
+  %combined = or i32 %oldval_keep, %newval_masked
+  store volatile i32 %combined, i32* %existing
+
+  ret void
+}
+
+; Neither mask is a contiguous set of 1s, so BFI can't be used.
+define void @test_32bit_badmask(i32 *%existing, i32 *%new) {
+; CHECK: test_32bit_badmask:
+; CHECK-NOT: bfi
+; CHECK: ret
+
+  %oldval = load volatile i32* %existing
+  %oldval_keep = and i32 %oldval, 135 ; = 0x87
+
+  %newval = load volatile i32* %new
+  %newval_shifted = shl i32 %newval, 3
+  %newval_masked = and i32 %newval_shifted, 632 ; = 0x278
+
+  %combined = or i32 %oldval_keep, %newval_masked
+  store volatile i32 %combined, i32* %existing
+
+  ret void
+}
+
+; Ditto
+define void @test_64bit_badmask(i64 *%existing, i64 *%new) {
+; CHECK: test_64bit_badmask:
+; CHECK-NOT: bfi
+; CHECK: ret
+
+  %oldval = load volatile i64* %existing
+  %oldval_keep = and i64 %oldval, 135 ; = 0x87
+
+  %newval = load volatile i64* %new
+  %newval_shifted = shl i64 %newval, 3
+  %newval_masked = and i64 %newval_shifted, 664 ; = 0x298
+
+  %combined = or i64 %oldval_keep, %newval_masked
+  store volatile i64 %combined, i64* %existing
+
+  ret void
+}
+
+; Bitfield insert where there's a left-over shr needed at the beginning
+; (e.g. result of str.bf1 = str.bf2)
+define void @test_32bit_with_shr(i32* %existing, i32* %new) {
+; CHECK: test_32bit_with_shr:
+
+  %oldval = load volatile i32* %existing
+  %oldval_keep = and i32 %oldval, 2214592511 ; = 0x83ffffff
+
+  %newval = load i32* %new
+  %newval_shifted = shl i32 %newval, 12
+  %newval_masked = and i32 %newval_shifted, 2080374784 ; = 0x7c000000
+
+  %combined = or i32 %oldval_keep, %newval_masked
+  store volatile i32 %combined, i32* %existing
+; CHECK: lsr [[BIT:w[0-9]+]], {{w[0-9]+}}, #14
+; CHECK: bfi {{w[0-9]+}}, [[BIT]], #26, #5
+
+  ret void
+}
+
diff --git a/test/CodeGen/AArch64/bitfield.ll b/test/CodeGen/AArch64/bitfield.ll
new file mode 100644
index 0000000..36d337e
--- /dev/null
+++ b/test/CodeGen/AArch64/bitfield.ll
@@ -0,0 +1,218 @@
+
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+
+@var32 = global i32 0
+@var64 = global i64 0
+
+define void @test_extendb(i8 %var) {
+; CHECK: test_extendb:
+
+  %sxt32 = sext i8 %var to i32
+  store volatile i32 %sxt32, i32* @var32
+; CHECK: sxtb {{w[0-9]+}}, {{w[0-9]+}}
+
+  %sxt64 = sext i8 %var to i64
+  store volatile i64 %sxt64, i64* @var64
+; CHECK: sxtb {{x[0-9]+}}, {{w[0-9]+}}
+
+; N.b. this doesn't actually produce a bitfield instruction at the
+; moment, but it's still a good test to have and the semantics are
+; correct.
+ %uxt32 = zext i8 %var to i32 + store volatile i32 %uxt32, i32* @var32 +; CHECK: and {{w[0-9]+}}, {{w[0-9]+}}, #0xff + + %uxt64 = zext i8 %var to i64 + store volatile i64 %uxt64, i64* @var64 +; CHECK: uxtb {{x[0-9]+}}, {{w[0-9]+}} + ret void +} + +define void @test_extendh(i16 %var) { +; CHECK: test_extendh: + + %sxt32 = sext i16 %var to i32 + store volatile i32 %sxt32, i32* @var32 +; CHECK: sxth {{w[0-9]+}}, {{w[0-9]+}} + + %sxt64 = sext i16 %var to i64 + store volatile i64 %sxt64, i64* @var64 +; CHECK: sxth {{x[0-9]+}}, {{w[0-9]+}} + +; N.b. this doesn't actually produce a bitfield instruction at the +; moment, but it's still a good test to have and the semantics are +; correct. + %uxt32 = zext i16 %var to i32 + store volatile i32 %uxt32, i32* @var32 +; CHECK: and {{w[0-9]+}}, {{w[0-9]+}}, #0xffff + + %uxt64 = zext i16 %var to i64 + store volatile i64 %uxt64, i64* @var64 +; CHECK: uxth {{x[0-9]+}}, {{w[0-9]+}} + ret void +} + +define void @test_extendw(i32 %var) { +; CHECK: test_extendw: + + %sxt64 = sext i32 %var to i64 + store volatile i64 %sxt64, i64* @var64 +; CHECK: sxtw {{x[0-9]+}}, {{w[0-9]+}} + + %uxt64 = zext i32 %var to i64 + store volatile i64 %uxt64, i64* @var64 +; CHECK: ubfx {{w[0-9]+}}, {{w[0-9]+}}, #0, #32 + ret void +} + +define void @test_shifts(i32 %val32, i64 %val64) { +; CHECK: test_shifts: + + %shift1 = ashr i32 %val32, 31 + store volatile i32 %shift1, i32* @var32 +; CHECK: asr {{w[0-9]+}}, {{w[0-9]+}}, #31 + + %shift2 = lshr i32 %val32, 8 + store volatile i32 %shift2, i32* @var32 +; CHECK: lsr {{w[0-9]+}}, {{w[0-9]+}}, #8 + + %shift3 = shl i32 %val32, 1 + store volatile i32 %shift3, i32* @var32 +; CHECK: lsl {{w[0-9]+}}, {{w[0-9]+}}, #1 + + %shift4 = ashr i64 %val64, 31 + store volatile i64 %shift4, i64* @var64 +; CHECK: asr {{x[0-9]+}}, {{x[0-9]+}}, #31 + + %shift5 = lshr i64 %val64, 8 + store volatile i64 %shift5, i64* @var64 +; CHECK: lsr {{x[0-9]+}}, {{x[0-9]+}}, #8 + + %shift6 = shl i64 %val64, 63 + store volatile i64 %shift6, i64* @var64 +; CHECK: lsl {{x[0-9]+}}, {{x[0-9]+}}, #63 + + %shift7 = ashr i64 %val64, 63 + store volatile i64 %shift7, i64* @var64 +; CHECK: asr {{x[0-9]+}}, {{x[0-9]+}}, #63 + + %shift8 = lshr i64 %val64, 63 + store volatile i64 %shift8, i64* @var64 +; CHECK: lsr {{x[0-9]+}}, {{x[0-9]+}}, #63 + + %shift9 = lshr i32 %val32, 31 + store volatile i32 %shift9, i32* @var32 +; CHECK: lsr {{w[0-9]+}}, {{w[0-9]+}}, #31 + + %shift10 = shl i32 %val32, 31 + store volatile i32 %shift10, i32* @var32 +; CHECK: lsl {{w[0-9]+}}, {{w[0-9]+}}, #31 + + ret void +} + +; LLVM can produce in-register extensions taking place entirely with +; 64-bit registers too. +define void @test_sext_inreg_64(i64 %in) { +; CHECK: test_sext_inreg_64: + +; i1 doesn't have an official alias, but crops up and is handled by +; the bitfield ops. 
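+; (For reference: sbfx Xd, Xn, #0, #1 reads bit 0 as a 1-bit signed field and
+; sign-extends it, so the result is either 0 or all-ones. That is exactly the
+; semantics of sext i1, which is why the lowering checked below is correct.)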
+ %trunc_i1 = trunc i64 %in to i1 + %sext_i1 = sext i1 %trunc_i1 to i64 + store volatile i64 %sext_i1, i64* @var64 +; CHECK: sbfx {{x[0-9]+}}, {{x[0-9]+}}, #0, #1 + + %trunc_i8 = trunc i64 %in to i8 + %sext_i8 = sext i8 %trunc_i8 to i64 + store volatile i64 %sext_i8, i64* @var64 +; CHECK: sxtb {{x[0-9]+}}, {{w[0-9]+}} + + %trunc_i16 = trunc i64 %in to i16 + %sext_i16 = sext i16 %trunc_i16 to i64 + store volatile i64 %sext_i16, i64* @var64 +; CHECK: sxth {{x[0-9]+}}, {{w[0-9]+}} + + %trunc_i32 = trunc i64 %in to i32 + %sext_i32 = sext i32 %trunc_i32 to i64 + store volatile i64 %sext_i32, i64* @var64 +; CHECK: sxtw {{x[0-9]+}}, {{w[0-9]+}} + ret void +} + +; These instructions don't actually select to official bitfield +; operations, but it's important that we select them somehow: +define void @test_zext_inreg_64(i64 %in) { +; CHECK: test_zext_inreg_64: + + %trunc_i8 = trunc i64 %in to i8 + %zext_i8 = zext i8 %trunc_i8 to i64 + store volatile i64 %zext_i8, i64* @var64 +; CHECK: and {{x[0-9]+}}, {{x[0-9]+}}, #0xff + + %trunc_i16 = trunc i64 %in to i16 + %zext_i16 = zext i16 %trunc_i16 to i64 + store volatile i64 %zext_i16, i64* @var64 +; CHECK: and {{x[0-9]+}}, {{x[0-9]+}}, #0xffff + + %trunc_i32 = trunc i64 %in to i32 + %zext_i32 = zext i32 %trunc_i32 to i64 + store volatile i64 %zext_i32, i64* @var64 +; CHECK: and {{x[0-9]+}}, {{x[0-9]+}}, #0xffffffff + + ret void +} + +define i64 @test_sext_inreg_from_32(i32 %in) { +; CHECK: test_sext_inreg_from_32: + + %small = trunc i32 %in to i1 + %ext = sext i1 %small to i64 + + ; Different registers are of course, possible, though suboptimal. This is + ; making sure that a 64-bit "(sext_inreg (anyext GPR32), i1)" uses the 64-bit + ; sbfx rather than just 32-bits. +; CHECK: sbfx x0, x0, #0, #1 + ret i64 %ext +} + + +define i32 @test_ubfx32(i32* %addr) { +; CHECK: test_ubfx32: +; CHECK: ubfx {{w[0-9]+}}, {{w[0-9]+}}, #23, #3 + + %fields = load i32* %addr + %shifted = lshr i32 %fields, 23 + %masked = and i32 %shifted, 7 + ret i32 %masked +} + +define i64 @test_ubfx64(i64* %addr) { +; CHECK: test_ubfx64: +; CHECK: ubfx {{x[0-9]+}}, {{x[0-9]+}}, #25, #10 + + %fields = load i64* %addr + %shifted = lshr i64 %fields, 25 + %masked = and i64 %shifted, 1023 + ret i64 %masked +} + +define i32 @test_sbfx32(i32* %addr) { +; CHECK: test_sbfx32: +; CHECK: sbfx {{w[0-9]+}}, {{w[0-9]+}}, #6, #3 + + %fields = load i32* %addr + %shifted = shl i32 %fields, 23 + %extended = ashr i32 %shifted, 29 + ret i32 %extended +} + +define i64 @test_sbfx64(i64* %addr) { +; CHECK: test_sbfx64: +; CHECK: sbfx {{x[0-9]+}}, {{x[0-9]+}}, #0, #63 + + %fields = load i64* %addr + %shifted = shl i64 %fields, 1 + %extended = ashr i64 %shifted, 1 + ret i64 %extended +} diff --git a/test/CodeGen/AArch64/blockaddress.ll b/test/CodeGen/AArch64/blockaddress.ll new file mode 100644 index 0000000..3d0a5cf --- /dev/null +++ b/test/CodeGen/AArch64/blockaddress.ll @@ -0,0 +1,18 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s + +@addr = global i8* null + +define void @test_blockaddress() { +; CHECK: test_blockaddress: + store volatile i8* blockaddress(@test_blockaddress, %block), i8** @addr + %val = load volatile i8** @addr + indirectbr i8* %val, [label %block] +; CHECK: adrp [[DEST_HI:x[0-9]+]], [[DEST_LBL:.Ltmp[0-9]+]] +; CHECK: add [[DEST:x[0-9]+]], [[DEST_HI]], #:lo12:[[DEST_LBL]] +; CHECK: str [[DEST]], +; CHECK: ldr [[NEWDEST:x[0-9]+]] +; CHECK: br [[NEWDEST]] + +block: + ret void +} diff --git a/test/CodeGen/AArch64/bool-loads.ll 
b/test/CodeGen/AArch64/bool-loads.ll
new file mode 100644
index 0000000..5c7640b
--- /dev/null
+++ b/test/CodeGen/AArch64/bool-loads.ll
@@ -0,0 +1,55 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s
+
+@var = global i1 0
+
+define i32 @test_sextloadi32() {
+; CHECK: test_sextloadi32
+
+  %val = load i1* @var
+  %ret = sext i1 %val to i32
+; CHECK: ldrb {{w[0-9]+}}, [{{x[0-9]+}}, #:lo12:var]
+; CHECK: sbfx {{x[0-9]+}}, {{x[0-9]+}}, #0, #1
+
+  ret i32 %ret
+; CHECK: ret
+}
+
+define i64 @test_sextloadi64() {
+; CHECK: test_sextloadi64
+
+  %val = load i1* @var
+  %ret = sext i1 %val to i64
+; CHECK: ldrb {{w[0-9]+}}, [{{x[0-9]+}}, #:lo12:var]
+; CHECK: sbfx {{x[0-9]+}}, {{x[0-9]+}}, #0, #1
+
+  ret i64 %ret
+; CHECK: ret
+}
+
+define i32 @test_zextloadi32() {
+; CHECK: test_zextloadi32
+
+; It's not actually necessary that "ret" is next, but as far as LLVM
+; is concerned only 0 or 1 should be loadable so no extension is
+; necessary.
+  %val = load i1* @var
+  %ret = zext i1 %val to i32
+; CHECK: ldrb {{w[0-9]+}}, [{{x[0-9]+}}, #:lo12:var]
+
+  ret i32 %ret
+; CHECK-NEXT: ret
+}
+
+define i64 @test_zextloadi64() {
+; CHECK: test_zextloadi64
+
+; It's not actually necessary that "ret" is next, but as far as LLVM
+; is concerned only 0 or 1 should be loadable so no extension is
+; necessary.
+  %val = load i1* @var
+  %ret = zext i1 %val to i64
+; CHECK: ldrb {{w[0-9]+}}, [{{x[0-9]+}}, #:lo12:var]
+
+  ret i64 %ret
+; CHECK-NEXT: ret
+}
diff --git a/test/CodeGen/AArch64/breg.ll b/test/CodeGen/AArch64/breg.ll
new file mode 100644
index 0000000..38ed473
--- /dev/null
+++ b/test/CodeGen/AArch64/breg.ll
@@ -0,0 +1,17 @@
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+
+@stored_label = global i8* null
+
+define void @foo() {
+; CHECK: foo:
+  %lab = load i8** @stored_label
+  indirectbr i8* %lab, [label %otherlab, label %retlab]
+; CHECK: adrp {{x[0-9]+}}, stored_label
+; CHECK: ldr {{x[0-9]+}}, [{{x[0-9]+}}, #:lo12:stored_label]
+; CHECK: br {{x[0-9]+}}
+
+otherlab:
+  ret void
+retlab:
+  ret void
+}
diff --git a/test/CodeGen/AArch64/callee-save.ll b/test/CodeGen/AArch64/callee-save.ll
new file mode 100644
index 0000000..9dddf74
--- /dev/null
+++ b/test/CodeGen/AArch64/callee-save.ll
@@ -0,0 +1,86 @@
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+
+@var = global float 0.0
+
+define void @foo() {
+; CHECK: foo:
+
+; CHECK: stp d14, d15, [sp
+; CHECK: stp d12, d13, [sp
+; CHECK: stp d10, d11, [sp
+; CHECK: stp d8, d9, [sp
+
+  ; Create lots of live variables to exhaust the supply of
+  ; caller-saved registers
+  %val1 = load volatile float* @var
+  %val2 = load volatile float* @var
+  %val3 = load volatile float* @var
+  %val4 = load volatile float* @var
+  %val5 = load volatile float* @var
+  %val6 = load volatile float* @var
+  %val7 = load volatile float* @var
+  %val8 = load volatile float* @var
+  %val9 = load volatile float* @var
+  %val10 = load volatile float* @var
+  %val11 = load volatile float* @var
+  %val12 = load volatile float* @var
+  %val13 = load volatile float* @var
+  %val14 = load volatile float* @var
+  %val15 = load volatile float* @var
+  %val16 = load volatile float* @var
+  %val17 = load volatile float* @var
+  %val18 = load volatile float* @var
+  %val19 = load volatile float* @var
+  %val20 = load volatile float* @var
+  %val21 = load volatile float* @var
+  %val22 = load volatile float* @var
+  %val23 = load volatile float* @var
+  %val24 = load volatile float* @var
+  %val25 = load volatile float* @var
+  %val26 =
load volatile float* @var + %val27 = load volatile float* @var + %val28 = load volatile float* @var + %val29 = load volatile float* @var + %val30 = load volatile float* @var + %val31 = load volatile float* @var + %val32 = load volatile float* @var + + store volatile float %val1, float* @var + store volatile float %val2, float* @var + store volatile float %val3, float* @var + store volatile float %val4, float* @var + store volatile float %val5, float* @var + store volatile float %val6, float* @var + store volatile float %val7, float* @var + store volatile float %val8, float* @var + store volatile float %val9, float* @var + store volatile float %val10, float* @var + store volatile float %val11, float* @var + store volatile float %val12, float* @var + store volatile float %val13, float* @var + store volatile float %val14, float* @var + store volatile float %val15, float* @var + store volatile float %val16, float* @var + store volatile float %val17, float* @var + store volatile float %val18, float* @var + store volatile float %val19, float* @var + store volatile float %val20, float* @var + store volatile float %val21, float* @var + store volatile float %val22, float* @var + store volatile float %val23, float* @var + store volatile float %val24, float* @var + store volatile float %val25, float* @var + store volatile float %val26, float* @var + store volatile float %val27, float* @var + store volatile float %val28, float* @var + store volatile float %val29, float* @var + store volatile float %val30, float* @var + store volatile float %val31, float* @var + store volatile float %val32, float* @var + +; CHECK: ldp d8, d9, [sp +; CHECK: ldp d10, d11, [sp +; CHECK: ldp d12, d13, [sp +; CHECK: ldp d14, d15, [sp + ret void +} diff --git a/test/CodeGen/AArch64/compare-branch.ll b/test/CodeGen/AArch64/compare-branch.ll new file mode 100644 index 0000000..4213110 --- /dev/null +++ b/test/CodeGen/AArch64/compare-branch.ll @@ -0,0 +1,38 @@ +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s + +@var32 = global i32 0 +@var64 = global i64 0 + +define void @foo() { +; CHECK: foo: + + %val1 = load volatile i32* @var32 + %tst1 = icmp eq i32 %val1, 0 + br i1 %tst1, label %end, label %test2 +; CHECK: cbz {{w[0-9]+}}, .LBB + +test2: + %val2 = load volatile i32* @var32 + %tst2 = icmp ne i32 %val2, 0 + br i1 %tst2, label %end, label %test3 +; CHECK: cbnz {{w[0-9]+}}, .LBB + +test3: + %val3 = load volatile i64* @var64 + %tst3 = icmp eq i64 %val3, 0 + br i1 %tst3, label %end, label %test4 +; CHECK: cbz {{x[0-9]+}}, .LBB + +test4: + %val4 = load volatile i64* @var64 + %tst4 = icmp ne i64 %val4, 0 + br i1 %tst4, label %end, label %test5 +; CHECK: cbnz {{x[0-9]+}}, .LBB + +test5: + store volatile i64 %val4, i64* @var64 + ret void + +end: + ret void +}
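+; As a sketch of the boundary condition (not part of the original tests, and
+; the function and label names here are illustrative assumptions): cbz/cbnz
+; only compare against zero, so an equality test against any other immediate
+; should instead lower to a cmp followed by a conditional branch, roughly
+; "cmp w0, #42" then "b.eq".
+define void @cbz_nonzero_sketch(i32 %val) {
+  %tst = icmp eq i32 %val, 42
+  br i1 %tst, label %target, label %fallthrough
+
+fallthrough:
+  ret void
+
+target:
+  ret void
+}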
\ No newline at end of file diff --git a/test/CodeGen/AArch64/cond-sel.ll b/test/CodeGen/AArch64/cond-sel.ll new file mode 100644 index 0000000..3051cf5 --- /dev/null +++ b/test/CodeGen/AArch64/cond-sel.ll @@ -0,0 +1,213 @@ +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s + +@var32 = global i32 0 +@var64 = global i64 0 + +define void @test_csel(i32 %lhs32, i32 %rhs32, i64 %lhs64) { +; CHECK: test_csel: + + %tst1 = icmp ugt i32 %lhs32, %rhs32 + %val1 = select i1 %tst1, i32 42, i32 52 + store i32 %val1, i32* @var32 +; CHECK: movz [[W52:w[0-9]+]], #52 +; CHECK: movz [[W42:w[0-9]+]], #42 +; CHECK: csel {{w[0-9]+}}, [[W42]], [[W52]], hi + + %rhs64 = sext i32 %rhs32 to i64 + %tst2 = icmp sle i64 %lhs64, %rhs64 + %val2 = select i1 %tst2, i64 %lhs64, i64 %rhs64 + store i64 %val2, i64* @var64 +; CHECK: cmp [[LHS:x[0-9]+]], [[RHS:w[0-9]+]], sxtw +; CHECK: sxtw [[EXT_RHS:x[0-9]+]], [[RHS]] +; CHECK: csel {{x[0-9]+}}, [[LHS]], [[EXT_RHS]], le + + ret void +; CHECK: ret +} + +define void @test_floatcsel(float %lhs32, float %rhs32, double %lhs64, double %rhs64) { +; CHECK: test_floatcsel: + + %tst1 = fcmp one float %lhs32, %rhs32 +; CHECK: fcmp {{s[0-9]+}}, {{s[0-9]+}} + %val1 = select i1 %tst1, i32 42, i32 52 + store i32 %val1, i32* @var32 +; CHECK: movz [[W52:w[0-9]+]], #52 +; CHECK: movz [[W42:w[0-9]+]], #42 +; CHECK: csel [[MAYBETRUE:w[0-9]+]], [[W42]], [[W52]], mi +; CHECK: csel {{w[0-9]+}}, [[W42]], [[MAYBETRUE]], gt + + + %tst2 = fcmp ueq double %lhs64, %rhs64 +; CHECK: fcmp {{d[0-9]+}}, {{d[0-9]+}} + %val2 = select i1 %tst2, i64 9, i64 15 + store i64 %val2, i64* @var64 +; CHECK: movz [[CONST15:x[0-9]+]], #15 +; CHECK: movz [[CONST9:x[0-9]+]], #9 +; CHECK: csel [[MAYBETRUE:x[0-9]+]], [[CONST9]], [[CONST15]], eq +; CHECK: csel {{x[0-9]+}}, [[CONST9]], [[MAYBETRUE]], vs + + ret void +; CHECK: ret +} + + +define void @test_csinc(i32 %lhs32, i32 %rhs32, i64 %lhs64) { +; CHECK: test_csinc: + +; Note that commuting rhs and lhs in the select changes ugt to ule (i.e. hi to ls). + %tst1 = icmp ugt i32 %lhs32, %rhs32 + %inc1 = add i32 %rhs32, 1 + %val1 = select i1 %tst1, i32 %inc1, i32 %lhs32 + store volatile i32 %val1, i32* @var32 +; CHECK: cmp [[LHS:w[0-9]+]], [[RHS:w[0-9]+]] +; CHECK: csinc {{w[0-9]+}}, [[LHS]], [[RHS]], ls + + %rhs2 = add i32 %rhs32, 42 + %tst2 = icmp sle i32 %lhs32, %rhs2 + %inc2 = add i32 %rhs32, 1 + %val2 = select i1 %tst2, i32 %lhs32, i32 %inc2 + store volatile i32 %val2, i32* @var32 +; CHECK: cmp [[LHS:w[0-9]+]], {{w[0-9]+}} +; CHECK: csinc {{w[0-9]+}}, [[LHS]], {{w[0-9]+}}, le + +; Note that commuting rhs and lhs in the select changes ugt to ule (i.e. hi to ls). + %rhs3 = sext i32 %rhs32 to i64 + %tst3 = icmp ugt i64 %lhs64, %rhs3 + %inc3 = add i64 %rhs3, 1 + %val3 = select i1 %tst3, i64 %inc3, i64 %lhs64 + store volatile i64 %val3, i64* @var64 +; CHECK: cmp [[LHS:x[0-9]+]], {{w[0-9]+}} +; CHECK: csinc {{x[0-9]+}}, [[LHS]], {{x[0-9]+}}, ls + + %rhs4 = zext i32 %rhs32 to i64 + %tst4 = icmp sle i64 %lhs64, %rhs4 + %inc4 = add i64 %rhs4, 1 + %val4 = select i1 %tst4, i64 %lhs64, i64 %inc4 + store volatile i64 %val4, i64* @var64 +; CHECK: cmp [[LHS:x[0-9]+]], {{w[0-9]+}} +; CHECK: csinc {{x[0-9]+}}, [[LHS]], {{x[0-9]+}}, le + + ret void +; CHECK: ret +} + +define void @test_csinv(i32 %lhs32, i32 %rhs32, i64 %lhs64) { +; CHECK: test_csinv: + +; Note that commuting rhs and lhs in the select changes ugt to ule (i.e. hi to ls). 
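+; (Spelling the commutation out: csinc Rd, Rn, Rm, cond yields Rn when cond
+; holds and Rm+1 otherwise. The selects below put the incremented value in
+; the "true" slot, so the condition the instruction tests must be the
+; inverse of the one in the icmp.)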
+ %tst1 = icmp ugt i32 %lhs32, %rhs32 + %inc1 = xor i32 -1, %rhs32 + %val1 = select i1 %tst1, i32 %inc1, i32 %lhs32 + store volatile i32 %val1, i32* @var32 +; CHECK: cmp [[LHS:w[0-9]+]], [[RHS:w[0-9]+]] +; CHECK: csinv {{w[0-9]+}}, [[LHS]], [[RHS]], ls + + %rhs2 = add i32 %rhs32, 42 + %tst2 = icmp sle i32 %lhs32, %rhs2 + %inc2 = xor i32 -1, %rhs32 + %val2 = select i1 %tst2, i32 %lhs32, i32 %inc2 + store volatile i32 %val2, i32* @var32 +; CHECK: cmp [[LHS:w[0-9]+]], {{w[0-9]+}} +; CHECK: csinv {{w[0-9]+}}, [[LHS]], {{w[0-9]+}}, le + +; Note that commuting rhs and lhs in the select changes ugt to ule (i.e. hi to ls). + %rhs3 = sext i32 %rhs32 to i64 + %tst3 = icmp ugt i64 %lhs64, %rhs3 + %inc3 = xor i64 -1, %rhs3 + %val3 = select i1 %tst3, i64 %inc3, i64 %lhs64 + store volatile i64 %val3, i64* @var64 +; CHECK: cmp [[LHS:x[0-9]+]], {{w[0-9]+}} +; CHECK: csinv {{x[0-9]+}}, [[LHS]], {{x[0-9]+}}, ls + + %rhs4 = zext i32 %rhs32 to i64 + %tst4 = icmp sle i64 %lhs64, %rhs4 + %inc4 = xor i64 -1, %rhs4 + %val4 = select i1 %tst4, i64 %lhs64, i64 %inc4 + store volatile i64 %val4, i64* @var64 +; CHECK: cmp [[LHS:x[0-9]+]], {{w[0-9]+}} +; CHECK: csinv {{x[0-9]+}}, [[LHS]], {{x[0-9]+}}, le + + ret void +; CHECK: ret +} + +define void @test_csneg(i32 %lhs32, i32 %rhs32, i64 %lhs64) { +; CHECK: test_csneg: + +; Note that commuting rhs and lhs in the select changes ugt to ule (i.e. hi to ls). + %tst1 = icmp ugt i32 %lhs32, %rhs32 + %inc1 = sub i32 0, %rhs32 + %val1 = select i1 %tst1, i32 %inc1, i32 %lhs32 + store volatile i32 %val1, i32* @var32 +; CHECK: cmp [[LHS:w[0-9]+]], [[RHS:w[0-9]+]] +; CHECK: csneg {{w[0-9]+}}, [[LHS]], [[RHS]], ls + + %rhs2 = add i32 %rhs32, 42 + %tst2 = icmp sle i32 %lhs32, %rhs2 + %inc2 = sub i32 0, %rhs32 + %val2 = select i1 %tst2, i32 %lhs32, i32 %inc2 + store volatile i32 %val2, i32* @var32 +; CHECK: cmp [[LHS:w[0-9]+]], {{w[0-9]+}} +; CHECK: csneg {{w[0-9]+}}, [[LHS]], {{w[0-9]+}}, le + +; Note that commuting rhs and lhs in the select changes ugt to ule (i.e. hi to ls). + %rhs3 = sext i32 %rhs32 to i64 + %tst3 = icmp ugt i64 %lhs64, %rhs3 + %inc3 = sub i64 0, %rhs3 + %val3 = select i1 %tst3, i64 %inc3, i64 %lhs64 + store volatile i64 %val3, i64* @var64 +; CHECK: cmp [[LHS:x[0-9]+]], {{w[0-9]+}} +; CHECK: csneg {{x[0-9]+}}, [[LHS]], {{x[0-9]+}}, ls + + %rhs4 = zext i32 %rhs32 to i64 + %tst4 = icmp sle i64 %lhs64, %rhs4 + %inc4 = sub i64 0, %rhs4 + %val4 = select i1 %tst4, i64 %lhs64, i64 %inc4 + store volatile i64 %val4, i64* @var64 +; CHECK: cmp [[LHS:x[0-9]+]], {{w[0-9]+}} +; CHECK: csneg {{x[0-9]+}}, [[LHS]], {{x[0-9]+}}, le + + ret void +; CHECK: ret +} + +define void @test_cset(i32 %lhs, i32 %rhs, i64 %lhs64) { +; CHECK: test_cset: + +; N.b. 
code is not optimal here (32-bit csinc would be better) but +; incoming DAG is too complex + %tst1 = icmp eq i32 %lhs, %rhs + %val1 = zext i1 %tst1 to i32 + store i32 %val1, i32* @var32 +; CHECK: cmp {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: csinc {{w[0-9]+}}, wzr, wzr, ne + + %rhs64 = sext i32 %rhs to i64 + %tst2 = icmp ule i64 %lhs64, %rhs64 + %val2 = zext i1 %tst2 to i64 + store i64 %val2, i64* @var64 +; CHECK: csinc {{w[0-9]+}}, wzr, wzr, hi + + ret void +; CHECK: ret +} + +define void @test_csetm(i32 %lhs, i32 %rhs, i64 %lhs64) { +; CHECK: test_csetm: + + %tst1 = icmp eq i32 %lhs, %rhs + %val1 = sext i1 %tst1 to i32 + store i32 %val1, i32* @var32 +; CHECK: cmp {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: csinv {{w[0-9]+}}, wzr, wzr, ne + + %rhs64 = sext i32 %rhs to i64 + %tst2 = icmp ule i64 %lhs64, %rhs64 + %val2 = sext i1 %tst2 to i64 + store i64 %val2, i64* @var64 +; CHECK: csinv {{x[0-9]+}}, xzr, xzr, hi + + ret void +; CHECK: ret +} diff --git a/test/CodeGen/AArch64/directcond.ll b/test/CodeGen/AArch64/directcond.ll new file mode 100644 index 0000000..f5d5759 --- /dev/null +++ b/test/CodeGen/AArch64/directcond.ll @@ -0,0 +1,84 @@ +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s + +define i32 @test_select_i32(i1 %bit, i32 %a, i32 %b) { +; CHECK: test_select_i32: + %val = select i1 %bit, i32 %a, i32 %b +; CHECK: movz [[ONE:w[0-9]+]], #1 +; CHECK: tst w0, [[ONE]] +; CHECK-NEXT: csel w0, w1, w2, ne + + ret i32 %val +} + +define i64 @test_select_i64(i1 %bit, i64 %a, i64 %b) { +; CHECK: test_select_i64: + %val = select i1 %bit, i64 %a, i64 %b +; CHECK: movz [[ONE:w[0-9]+]], #1 +; CHECK: tst w0, [[ONE]] +; CHECK-NEXT: csel x0, x1, x2, ne + + ret i64 %val +} + +define float @test_select_float(i1 %bit, float %a, float %b) { +; CHECK: test_select_float: + %val = select i1 %bit, float %a, float %b +; CHECK: movz [[ONE:w[0-9]+]], #1 +; CHECK: tst w0, [[ONE]] +; CHECK-NEXT: fcsel s0, s0, s1, ne + + ret float %val +} + +define double @test_select_double(i1 %bit, double %a, double %b) { +; CHECK: test_select_double: + %val = select i1 %bit, double %a, double %b +; CHECK: movz [[ONE:w[0-9]+]], #1 +; CHECK: tst w0, [[ONE]] +; CHECK-NEXT: fcsel d0, d0, d1, ne + + ret double %val +} + +define i32 @test_brcond(i1 %bit) { +; CHECK: test_brcond: + br i1 %bit, label %true, label %false +; CHECK: tbz {{w[0-9]+}}, #0, .LBB + +true: + ret i32 0 +false: + ret i32 42 +} + +define i1 @test_setcc_float(float %lhs, float %rhs) { +; CHECK: test_setcc_float + %val = fcmp oeq float %lhs, %rhs +; CHECK: fcmp s0, s1 +; CHECK: csinc w0, wzr, wzr, ne + ret i1 %val +} + +define i1 @test_setcc_double(double %lhs, double %rhs) { +; CHECK: test_setcc_double + %val = fcmp oeq double %lhs, %rhs +; CHECK: fcmp d0, d1 +; CHECK: csinc w0, wzr, wzr, ne + ret i1 %val +} + +define i1 @test_setcc_i32(i32 %lhs, i32 %rhs) { +; CHECK: test_setcc_i32 + %val = icmp ugt i32 %lhs, %rhs +; CHECK: cmp w0, w1 +; CHECK: csinc w0, wzr, wzr, ls + ret i1 %val +} + +define i1 @test_setcc_i64(i64 %lhs, i64 %rhs) { +; CHECK: test_setcc_i64 + %val = icmp ne i64 %lhs, %rhs +; CHECK: cmp x0, x1 +; CHECK: csinc w0, wzr, wzr, eq + ret i1 %val +} diff --git a/test/CodeGen/AArch64/dp-3source.ll b/test/CodeGen/AArch64/dp-3source.ll new file mode 100644 index 0000000..c40d393 --- /dev/null +++ b/test/CodeGen/AArch64/dp-3source.ll @@ -0,0 +1,163 @@ +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s + +define i32 @test_madd32(i32 %val0, i32 %val1, i32 %val2) { +; CHECK: test_madd32: + %mid = mul i32 
%val1, %val2 + %res = add i32 %val0, %mid +; CHECK: madd {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + ret i32 %res +} + +define i64 @test_madd64(i64 %val0, i64 %val1, i64 %val2) { +; CHECK: test_madd64: + %mid = mul i64 %val1, %val2 + %res = add i64 %val0, %mid +; CHECK: madd {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}} + ret i64 %res +} + +define i32 @test_msub32(i32 %val0, i32 %val1, i32 %val2) { +; CHECK: test_msub32: + %mid = mul i32 %val1, %val2 + %res = sub i32 %val0, %mid +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + ret i32 %res +} + +define i64 @test_msub64(i64 %val0, i64 %val1, i64 %val2) { +; CHECK: test_msub64: + %mid = mul i64 %val1, %val2 + %res = sub i64 %val0, %mid +; CHECK: msub {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}} + ret i64 %res +} + +define i64 @test_smaddl(i64 %acc, i32 %val1, i32 %val2) { +; CHECK: test_smaddl: + %ext1 = sext i32 %val1 to i64 + %ext2 = sext i32 %val2 to i64 + %prod = mul i64 %ext1, %ext2 + %res = add i64 %acc, %prod +; CHECK: smaddl {{x[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{x[0-9]+}} + ret i64 %res +} + +define i64 @test_smsubl(i64 %acc, i32 %val1, i32 %val2) { +; CHECK: test_smsubl: + %ext1 = sext i32 %val1 to i64 + %ext2 = sext i32 %val2 to i64 + %prod = mul i64 %ext1, %ext2 + %res = sub i64 %acc, %prod +; CHECK: smsubl {{x[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{x[0-9]+}} + ret i64 %res +} + +define i64 @test_umaddl(i64 %acc, i32 %val1, i32 %val2) { +; CHECK: test_umaddl: + %ext1 = zext i32 %val1 to i64 + %ext2 = zext i32 %val2 to i64 + %prod = mul i64 %ext1, %ext2 + %res = add i64 %acc, %prod +; CHECK: umaddl {{x[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{x[0-9]+}} + ret i64 %res +} + +define i64 @test_umsubl(i64 %acc, i32 %val1, i32 %val2) { +; CHECK: test_umsubl: + %ext1 = zext i32 %val1 to i64 + %ext2 = zext i32 %val2 to i64 + %prod = mul i64 %ext1, %ext2 + %res = sub i64 %acc, %prod +; CHECK: umsubl {{x[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{x[0-9]+}} + ret i64 %res +} + +define i64 @test_smulh(i64 %lhs, i64 %rhs) { +; CHECK: test_smulh: + %ext1 = sext i64 %lhs to i128 + %ext2 = sext i64 %rhs to i128 + %res = mul i128 %ext1, %ext2 + %high = lshr i128 %res, 64 + %val = trunc i128 %high to i64 +; CHECK: smulh {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}} + ret i64 %val +} + +define i64 @test_umulh(i64 %lhs, i64 %rhs) { +; CHECK: test_umulh: + %ext1 = zext i64 %lhs to i128 + %ext2 = zext i64 %rhs to i128 + %res = mul i128 %ext1, %ext2 + %high = lshr i128 %res, 64 + %val = trunc i128 %high to i64 +; CHECK: umulh {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}} + ret i64 %val +} + +define i32 @test_mul32(i32 %lhs, i32 %rhs) { +; CHECK: test_mul32: + %res = mul i32 %lhs, %rhs +; CHECK: mul {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + ret i32 %res +} + +define i64 @test_mul64(i64 %lhs, i64 %rhs) { +; CHECK: test_mul64: + %res = mul i64 %lhs, %rhs +; CHECK: mul {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}} + ret i64 %res +} + +define i32 @test_mneg32(i32 %lhs, i32 %rhs) { +; CHECK: test_mneg32: + %prod = mul i32 %lhs, %rhs + %res = sub i32 0, %prod +; CHECK: mneg {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + ret i32 %res +} + +define i64 @test_mneg64(i64 %lhs, i64 %rhs) { +; CHECK: test_mneg64: + %prod = mul i64 %lhs, %rhs + %res = sub i64 0, %prod +; CHECK: mneg {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}} + ret i64 %res +} + +define i64 @test_smull(i32 %lhs, i32 %rhs) { +; CHECK: test_smull: + %ext1 = sext i32 %lhs to i64 + %ext2 = sext i32 %rhs to i64 + %res = mul i64 %ext1, %ext2 +; CHECK: smull {{x[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + ret i64 %res +} + 
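+; For reference, the instructions checked in this file compute:
+;   madd Rd, Rn, Rm, Ra  : Rd = Ra + Rn * Rm
+;   msub Rd, Rn, Rm, Ra  : Rd = Ra - Rn * Rm
+;   smaddl Xd, Wn, Wm, Xa: Xd = Xa + sext(Wn) * sext(Wm)
+;   umaddl Xd, Wn, Wm, Xa: Xd = Xa + zext(Wn) * zext(Wm)
+;   smulh/umulh Xd, Xn, Xm: Xd = top 64 bits of the 128-bit product
+; mul, mneg, smull, smnegl, umull and umnegl are the same operations with
+; the accumulator register fixed to wzr/xzr.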
+define i64 @test_umull(i32 %lhs, i32 %rhs) { +; CHECK: test_umull: + %ext1 = zext i32 %lhs to i64 + %ext2 = zext i32 %rhs to i64 + %res = mul i64 %ext1, %ext2 +; CHECK: umull {{x[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + ret i64 %res +} + +define i64 @test_smnegl(i32 %lhs, i32 %rhs) { +; CHECK: test_smnegl: + %ext1 = sext i32 %lhs to i64 + %ext2 = sext i32 %rhs to i64 + %prod = mul i64 %ext1, %ext2 + %res = sub i64 0, %prod +; CHECK: smnegl {{x[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + ret i64 %res +} + +define i64 @test_umnegl(i32 %lhs, i32 %rhs) { +; CHECK: test_umnegl: + %ext1 = zext i32 %lhs to i64 + %ext2 = zext i32 %rhs to i64 + %prod = mul i64 %ext1, %ext2 + %res = sub i64 0, %prod +; CHECK: umnegl {{x[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + ret i64 %res +} diff --git a/test/CodeGen/AArch64/dp1.ll b/test/CodeGen/AArch64/dp1.ll new file mode 100644 index 0000000..83aa8b4 --- /dev/null +++ b/test/CodeGen/AArch64/dp1.ll @@ -0,0 +1,152 @@ +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s + +@var32 = global i32 0 +@var64 = global i64 0 + +define void @rev_i32() { +; CHECK: rev_i32: + %val0_tmp = load i32* @var32 + %val1_tmp = call i32 @llvm.bswap.i32(i32 %val0_tmp) +; CHECK: rev {{w[0-9]+}}, {{w[0-9]+}} + store volatile i32 %val1_tmp, i32* @var32 + ret void +} + +define void @rev_i64() { +; CHECK: rev_i64: + %val0_tmp = load i64* @var64 + %val1_tmp = call i64 @llvm.bswap.i64(i64 %val0_tmp) +; CHECK: rev {{x[0-9]+}}, {{x[0-9]+}} + store volatile i64 %val1_tmp, i64* @var64 + ret void +} + +define void @rev32_i64() { +; CHECK: rev32_i64: + %val0_tmp = load i64* @var64 + %val1_tmp = shl i64 %val0_tmp, 32 + %val5_tmp = sub i64 64, 32 + %val2_tmp = lshr i64 %val0_tmp, %val5_tmp + %val3_tmp = or i64 %val1_tmp, %val2_tmp + %val4_tmp = call i64 @llvm.bswap.i64(i64 %val3_tmp) +; CHECK: rev32 {{x[0-9]+}}, {{x[0-9]+}} + store volatile i64 %val4_tmp, i64* @var64 + ret void +} + +define void @rev16_i32() { +; CHECK: rev16_i32: + %val0_tmp = load i32* @var32 + %val1_tmp = shl i32 %val0_tmp, 16 + %val2_tmp = lshr i32 %val0_tmp, 16 + %val3_tmp = or i32 %val1_tmp, %val2_tmp + %val4_tmp = call i32 @llvm.bswap.i32(i32 %val3_tmp) +; CHECK: rev16 {{w[0-9]+}}, {{w[0-9]+}} + store volatile i32 %val4_tmp, i32* @var32 + ret void +} + +define void @clz_zerodef_i32() { +; CHECK: clz_zerodef_i32: + %val0_tmp = load i32* @var32 + %val4_tmp = call i32 @llvm.ctlz.i32(i32 %val0_tmp, i1 0) +; CHECK: clz {{w[0-9]+}}, {{w[0-9]+}} + store volatile i32 %val4_tmp, i32* @var32 + ret void +} + +define void @clz_zerodef_i64() { +; CHECK: clz_zerodef_i64: + %val0_tmp = load i64* @var64 + %val4_tmp = call i64 @llvm.ctlz.i64(i64 %val0_tmp, i1 0) +; CHECK: clz {{x[0-9]+}}, {{x[0-9]+}} + store volatile i64 %val4_tmp, i64* @var64 + ret void +} + +define void @clz_zeroundef_i32() { +; CHECK: clz_zeroundef_i32: + %val0_tmp = load i32* @var32 + %val4_tmp = call i32 @llvm.ctlz.i32(i32 %val0_tmp, i1 1) +; CHECK: clz {{w[0-9]+}}, {{w[0-9]+}} + store volatile i32 %val4_tmp, i32* @var32 + ret void +} + +define void @clz_zeroundef_i64() { +; CHECK: clz_zeroundef_i64: + %val0_tmp = load i64* @var64 + %val4_tmp = call i64 @llvm.ctlz.i64(i64 %val0_tmp, i1 1) +; CHECK: clz {{x[0-9]+}}, {{x[0-9]+}} + store volatile i64 %val4_tmp, i64* @var64 + ret void +} + +define void @cttz_zerodef_i32() { +; CHECK: cttz_zerodef_i32: + %val0_tmp = load i32* @var32 + %val4_tmp = call i32 @llvm.cttz.i32(i32 %val0_tmp, i1 0) +; CHECK: rbit [[REVERSED:w[0-9]+]], {{w[0-9]+}} +; CHECK: clz {{w[0-9]+}}, [[REVERSED]] + store volatile i32 %val4_tmp, 
i32* @var32 + ret void +} + +define void @cttz_zerodef_i64() { +; CHECK: cttz_zerodef_i64: + %val0_tmp = load i64* @var64 + %val4_tmp = call i64 @llvm.cttz.i64(i64 %val0_tmp, i1 0) +; CHECK: rbit [[REVERSED:x[0-9]+]], {{x[0-9]+}} +; CHECK: clz {{x[0-9]+}}, [[REVERSED]] + store volatile i64 %val4_tmp, i64* @var64 + ret void +} + +define void @cttz_zeroundef_i32() { +; CHECK: cttz_zeroundef_i32: + %val0_tmp = load i32* @var32 + %val4_tmp = call i32 @llvm.cttz.i32(i32 %val0_tmp, i1 1) +; CHECK: rbit [[REVERSED:w[0-9]+]], {{w[0-9]+}} +; CHECK: clz {{w[0-9]+}}, [[REVERSED]] + store volatile i32 %val4_tmp, i32* @var32 + ret void +} + +define void @cttz_zeroundef_i64() { +; CHECK: cttz_zeroundef_i64: + %val0_tmp = load i64* @var64 + %val4_tmp = call i64 @llvm.cttz.i64(i64 %val0_tmp, i1 1) +; CHECK: rbit [[REVERSED:x[0-9]+]], {{x[0-9]+}} +; CHECK: clz {{x[0-9]+}}, [[REVERSED]] + store volatile i64 %val4_tmp, i64* @var64 + ret void +} + +; These two are just compilation tests really: the operation's set to Expand in +; ISelLowering. +define void @ctpop_i32() { +; CHECK: ctpop_i32: + %val0_tmp = load i32* @var32 + %val4_tmp = call i32 @llvm.ctpop.i32(i32 %val0_tmp) + store volatile i32 %val4_tmp, i32* @var32 + ret void +} + +define void @ctpop_i64() { +; CHECK: ctpop_i64: + %val0_tmp = load i64* @var64 + %val4_tmp = call i64 @llvm.ctpop.i64(i64 %val0_tmp) + store volatile i64 %val4_tmp, i64* @var64 + ret void +} + + +declare i32 @llvm.bswap.i32(i32) +declare i64 @llvm.bswap.i64(i64) +declare i32 @llvm.ctlz.i32 (i32, i1) +declare i64 @llvm.ctlz.i64 (i64, i1) +declare i32 @llvm.cttz.i32 (i32, i1) +declare i64 @llvm.cttz.i64 (i64, i1) +declare i32 @llvm.ctpop.i32 (i32) +declare i64 @llvm.ctpop.i64 (i64) + diff --git a/test/CodeGen/AArch64/dp2.ll b/test/CodeGen/AArch64/dp2.ll new file mode 100644 index 0000000..4c740f6 --- /dev/null +++ b/test/CodeGen/AArch64/dp2.ll @@ -0,0 +1,169 @@ +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s + +@var32_0 = global i32 0 +@var32_1 = global i32 0 +@var64_0 = global i64 0 +@var64_1 = global i64 0 + +define void @rorv_i64() { +; CHECK: rorv_i64: + %val0_tmp = load i64* @var64_0 + %val1_tmp = load i64* @var64_1 + %val2_tmp = sub i64 64, %val1_tmp + %val3_tmp = shl i64 %val0_tmp, %val2_tmp + %val4_tmp = lshr i64 %val0_tmp, %val1_tmp + %val5_tmp = or i64 %val3_tmp, %val4_tmp +; CHECK: ror {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}} + store volatile i64 %val5_tmp, i64* @var64_0 + ret void +} + +define void @asrv_i64() { +; CHECK: asrv_i64: + %val0_tmp = load i64* @var64_0 + %val1_tmp = load i64* @var64_1 + %val4_tmp = ashr i64 %val0_tmp, %val1_tmp +; CHECK: asr {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}} + store volatile i64 %val4_tmp, i64* @var64_1 + ret void +} + +define void @lsrv_i64() { +; CHECK: lsrv_i64: + %val0_tmp = load i64* @var64_0 + %val1_tmp = load i64* @var64_1 + %val4_tmp = lshr i64 %val0_tmp, %val1_tmp +; CHECK: lsr {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}} + store volatile i64 %val4_tmp, i64* @var64_0 + ret void +} + +define void @lslv_i64() { +; CHECK: lslv_i64: + %val0_tmp = load i64* @var64_0 + %val1_tmp = load i64* @var64_1 + %val4_tmp = shl i64 %val0_tmp, %val1_tmp +; CHECK: lsl {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}} + store volatile i64 %val4_tmp, i64* @var64_1 + ret void +} + +define void @udiv_i64() { +; CHECK: udiv_i64: + %val0_tmp = load i64* @var64_0 + %val1_tmp = load i64* @var64_1 + %val4_tmp = udiv i64 %val0_tmp, %val1_tmp +; CHECK: udiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}} + store volatile i64 %val4_tmp, i64* @var64_0 + 
ret void +} + +define void @sdiv_i64() { +; CHECK: sdiv_i64: + %val0_tmp = load i64* @var64_0 + %val1_tmp = load i64* @var64_1 + %val4_tmp = sdiv i64 %val0_tmp, %val1_tmp +; CHECK: sdiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}} + store volatile i64 %val4_tmp, i64* @var64_1 + ret void +} + + +define void @lsrv_i32() { +; CHECK: lsrv_i32: + %val0_tmp = load i32* @var32_0 + %val1_tmp = load i32* @var32_1 + %val2_tmp = add i32 1, %val1_tmp + %val4_tmp = lshr i32 %val0_tmp, %val2_tmp +; CHECK: lsr {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + store volatile i32 %val4_tmp, i32* @var32_0 + ret void +} + +define void @lslv_i32() { +; CHECK: lslv_i32: + %val0_tmp = load i32* @var32_0 + %val1_tmp = load i32* @var32_1 + %val2_tmp = add i32 1, %val1_tmp + %val4_tmp = shl i32 %val0_tmp, %val2_tmp +; CHECK: lsl {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + store volatile i32 %val4_tmp, i32* @var32_1 + ret void +} + +define void @rorv_i32() { +; CHECK: rorv_i32: + %val0_tmp = load i32* @var32_0 + %val6_tmp = load i32* @var32_1 + %val1_tmp = add i32 1, %val6_tmp + %val2_tmp = sub i32 32, %val1_tmp + %val3_tmp = shl i32 %val0_tmp, %val2_tmp + %val4_tmp = lshr i32 %val0_tmp, %val1_tmp + %val5_tmp = or i32 %val3_tmp, %val4_tmp +; CHECK: ror {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + store volatile i32 %val5_tmp, i32* @var32_0 + ret void +} + +define void @asrv_i32() { +; CHECK: asrv_i32: + %val0_tmp = load i32* @var32_0 + %val1_tmp = load i32* @var32_1 + %val2_tmp = add i32 1, %val1_tmp + %val4_tmp = ashr i32 %val0_tmp, %val2_tmp +; CHECK: asr {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + store volatile i32 %val4_tmp, i32* @var32_1 + ret void +} + +define void @sdiv_i32() { +; CHECK: sdiv_i32: + %val0_tmp = load i32* @var32_0 + %val1_tmp = load i32* @var32_1 + %val4_tmp = sdiv i32 %val0_tmp, %val1_tmp +; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + store volatile i32 %val4_tmp, i32* @var32_1 + ret void +} + +define void @udiv_i32() { +; CHECK: udiv_i32: + %val0_tmp = load i32* @var32_0 + %val1_tmp = load i32* @var32_1 + %val4_tmp = udiv i32 %val0_tmp, %val1_tmp +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + store volatile i32 %val4_tmp, i32* @var32_0 + ret void +} + +; The point of this test is that we may not actually see (shl GPR32:$Val, (zext GPR32:$Val2)) +; in the DAG (the RHS may be natively 64-bit), but we should still use the lsl instructions. +define i32 @test_lsl32() { +; CHECK: test_lsl32: + + %val = load i32* @var32_0 + %ret = shl i32 1, %val +; CHECK: lsl {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + + ret i32 %ret +} + +define i32 @test_lsr32() { +; CHECK: test_lsr32: + + %val = load i32* @var32_0 + %ret = lshr i32 1, %val +; CHECK: lsr {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + + ret i32 %ret +} + +define i32 @test_asr32(i32 %in) { +; CHECK: test_asr32: + + %val = load i32* @var32_0 + %ret = ashr i32 %in, %val +; CHECK: asr {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + + ret i32 %ret +} diff --git a/test/CodeGen/AArch64/elf-extern.ll b/test/CodeGen/AArch64/elf-extern.ll new file mode 100644 index 0000000..ee89d8d --- /dev/null +++ b/test/CodeGen/AArch64/elf-extern.ll @@ -0,0 +1,21 @@ +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -filetype=obj | elf-dump | FileCheck %s + +; External symbols are a different concept to global variables but should still +; get relocations and so on when used. 
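+; (Assuming the standard AArch64 ELF relocation numbering: r_type 0x11b is
+; 283, i.e. R_AARCH64_CALL26, the relocation expected on the bl to the
+; external memcpy symbol that the memcpy intrinsic expands to.)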
+
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8*, i8*, i32, i32, i1)
+
+define i32 @check_extern() {
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* undef, i8* undef, i32 undef, i32 4, i1 0)
+  ret i32 0
+}
+
+; CHECK: .rela.text
+; CHECK: ('r_sym', 0x00000009)
+; CHECK-NEXT: ('r_type', 0x0000011b)
+
+; CHECK: .symtab
+; CHECK: Symbol 9
+; CHECK-NEXT: memcpy
+
+
diff --git a/test/CodeGen/AArch64/extern-weak.ll b/test/CodeGen/AArch64/extern-weak.ll
new file mode 100644
index 0000000..2989776
--- /dev/null
+++ b/test/CodeGen/AArch64/extern-weak.ll
@@ -0,0 +1,14 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu -o - < %s | FileCheck %s
+
+declare extern_weak i32 @var()
+
+define i32()* @foo() {
+; The usual ADRP/ADD pair can't be used for a weak reference because it must
+; evaluate to 0 if the symbol is undefined. We use a litpool entry.
+  ret i32()* @var
+; CHECK: .LCPI0_0:
+; CHECK-NEXT: .xword var
+
+; CHECK: ldr x0, [{{x[0-9]+}}, #:lo12:.LCPI0_0]
+
+}
diff --git a/test/CodeGen/AArch64/extract.ll b/test/CodeGen/AArch64/extract.ll
new file mode 100644
index 0000000..0626781
--- /dev/null
+++ b/test/CodeGen/AArch64/extract.ll
@@ -0,0 +1,57 @@
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+
+define i64 @ror_i64(i64 %in) {
+; CHECK: ror_i64:
+  %left = shl i64 %in, 19
+  %right = lshr i64 %in, 45
+  %val5 = or i64 %left, %right
+; CHECK: extr {{x[0-9]+}}, x0, x0, #45
+  ret i64 %val5
+}
+
+define i32 @ror_i32(i32 %in) {
+; CHECK: ror_i32:
+  %left = shl i32 %in, 9
+  %right = lshr i32 %in, 23
+  %val5 = or i32 %left, %right
+; CHECK: extr {{w[0-9]+}}, w0, w0, #23
+  ret i32 %val5
+}
+
+define i32 @extr_i32(i32 %lhs, i32 %rhs) {
+; CHECK: extr_i32:
+  %left = shl i32 %lhs, 6
+  %right = lshr i32 %rhs, 26
+  %val = or i32 %left, %right
+  ; Order of lhs and rhs matters here. Regalloc would have to be very odd to use
+  ; something other than w0 and w1.
+; CHECK: extr {{w[0-9]+}}, w0, w1, #26
+
+  ret i32 %val
+}
+
+define i64 @extr_i64(i64 %lhs, i64 %rhs) {
+; CHECK: extr_i64:
+  %right = lshr i64 %rhs, 40
+  %left = shl i64 %lhs, 24
+  %val = or i64 %right, %left
+  ; Order of lhs and rhs matters here. Regalloc would have to be very odd to use
+  ; something other than x0 and x1.
+; CHECK: extr {{x[0-9]+}}, x0, x1, #40
+
+  ret i64 %val
+}
+
+; Regression test: a bad experimental pattern crept into git which optimised
+; this pattern to a single EXTR.
+define i32 @extr_regress(i32 %a, i32 %b) {
+; CHECK: extr_regress:
+
+  %sh1 = shl i32 %a, 14
+  %sh2 = lshr i32 %b, 14
+  %val = or i32 %sh2, %sh1
+; CHECK-NOT: extr {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, #{{[0-9]+}}
+
+  ret i32 %val
+; CHECK: ret
+}
diff --git a/test/CodeGen/AArch64/fastcc-reserved.ll b/test/CodeGen/AArch64/fastcc-reserved.ll
new file mode 100644
index 0000000..1a114a5
--- /dev/null
+++ b/test/CodeGen/AArch64/fastcc-reserved.ll
@@ -0,0 +1,58 @@
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -tailcallopt | FileCheck %s
+
+; This test is designed to be run in the situation where the
+; call-frame is not reserved (hence disable-fp-elim), but where
+; callee-pop can occur (hence tailcallopt).
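+; (Background assumption on the argument shapes used below: the [8 x i32]
+; filler occupies all eight integer argument registers w0-w7, which is what
+; forces the trailing i32 onto the stack; its slot is padded to 16 bytes so
+; the stack pointer stays 16-byte aligned, hence the #16 adjustments.)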
+ +declare fastcc void @will_pop([8 x i32], i32 %val) + +define fastcc void @foo(i32 %in) { +; CHECK: foo: + + %addr = alloca i8, i32 %in + +; Normal frame setup stuff: +; CHECK: sub sp, sp, +; CHECK stp x29, x30 + +; Reserve space for call-frame: +; CHECK: sub sp, sp, #16 + + call fastcc void @will_pop([8 x i32] undef, i32 42) +; CHECK: bl will_pop + +; Since @will_pop is fastcc with tailcallopt, it will put the stack +; back where it needs to be, we shouldn't duplicate that +; CHECK-NOT: sub sp, sp, #16 +; CHECK-NOT: add sp, sp, + +; CHECK: ldp x29, x30 +; CHECK: add sp, sp, + ret void +} + +declare void @wont_pop([8 x i32], i32 %val) + +define void @foo1(i32 %in) { +; CHECK: foo1: + + %addr = alloca i8, i32 %in +; Normal frame setup again +; CHECK sub sp, sp, +; CHECK stp x29, x30 + +; Reserve space for call-frame +; CHECK sub sp, sp, #16 + + call void @wont_pop([8 x i32] undef, i32 42) +; CHECK bl wont_pop + +; This time we *do* need to unreserve the call-frame +; CHECK add sp, sp, #16 + +; Check for epilogue (primarily to make sure sp spotted above wasn't +; part of it). +; CHECK: ldp x29, x30 +; CHECK: add sp, sp, + ret void +} diff --git a/test/CodeGen/AArch64/fastcc.ll b/test/CodeGen/AArch64/fastcc.ll new file mode 100644 index 0000000..41cde94 --- /dev/null +++ b/test/CodeGen/AArch64/fastcc.ll @@ -0,0 +1,123 @@ +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -tailcallopt | FileCheck %s -check-prefix CHECK-TAIL +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s + +; Without tailcallopt fastcc still means the caller cleans up the +; stack, so try to make sure this is respected. + +define fastcc void @func_stack0() { +; CHECK: func_stack0: +; CHECK: sub sp, sp, #48 + +; CHECK-TAIL: func_stack0: +; CHECK-TAIL: sub sp, sp, #48 + + + call fastcc void @func_stack8([8 x i32] undef, i32 42) +; CHECK: bl func_stack8 +; CHECK-NOT: sub sp, sp, + +; CHECK-TAIL: bl func_stack8 +; CHECK-TAIL: sub sp, sp, #16 + + + call fastcc void @func_stack32([8 x i32] undef, i128 0, i128 9) +; CHECK: bl func_stack32 +; CHECK-NOT: sub sp, sp, + +; CHECK-TAIL: bl func_stack32 +; CHECK-TAIL: sub sp, sp, #32 + + + call fastcc void @func_stack0() +; CHECK: bl func_stack0 +; CHECK-NOT: sub sp, sp + +; CHECK-TAIL: bl func_stack0 +; CHECK-TAIL-NOT: sub sp, sp + + ret void +; CHECK: add sp, sp, #48 +; CHECK-NEXT: ret + +; CHECK-TAIL: add sp, sp, #48 +; CHECK-TAIL-NEXT: ret + +} + +define fastcc void @func_stack8([8 x i32], i32 %stacked) { +; CHECK: func_stack8: +; CHECK: sub sp, sp, #48 + +; CHECK-TAIL: func_stack8: +; CHECK-TAIL: sub sp, sp, #48 + + + call fastcc void @func_stack8([8 x i32] undef, i32 42) +; CHECK: bl func_stack8 +; CHECK-NOT: sub sp, sp, + +; CHECK-TAIL: bl func_stack8 +; CHECK-TAIL: sub sp, sp, #16 + + + call fastcc void @func_stack32([8 x i32] undef, i128 0, i128 9) +; CHECK: bl func_stack32 +; CHECK-NOT: sub sp, sp, + +; CHECK-TAIL: bl func_stack32 +; CHECK-TAIL: sub sp, sp, #32 + + + call fastcc void @func_stack0() +; CHECK: bl func_stack0 +; CHECK-NOT: sub sp, sp + +; CHECK-TAIL: bl func_stack0 +; CHECK-TAIL-NOT: sub sp, sp + + ret void +; CHECK: add sp, sp, #48 +; CHECK-NEXT: ret + +; CHECK-TAIL: add sp, sp, #64 +; CHECK-TAIL-NEXT: ret +} + +define fastcc void @func_stack32([8 x i32], i128 %stacked0, i128 %stacked1) { +; CHECK: func_stack32: +; CHECK: sub sp, sp, #48 + +; CHECK-TAIL: func_stack32: +; CHECK-TAIL: sub sp, sp, #48 + + + call fastcc void @func_stack8([8 x i32] undef, i32 42) +; CHECK: bl func_stack8 +; CHECK-NOT: sub sp, 
sp, + +; CHECK-TAIL: bl func_stack8 +; CHECK-TAIL: sub sp, sp, #16 + + + call fastcc void @func_stack32([8 x i32] undef, i128 0, i128 9) +; CHECK: bl func_stack32 +; CHECK-NOT: sub sp, sp, + +; CHECK-TAIL: bl func_stack32 +; CHECK-TAIL: sub sp, sp, #32 + + + call fastcc void @func_stack0() +; CHECK: bl func_stack0 +; CHECK-NOT: sub sp, sp + +; CHECK-TAIL: bl func_stack0 +; CHECK-TAIL-NOT: sub sp, sp + + ret void +; CHECK: add sp, sp, #48 +; CHECK-NEXT: ret + +; CHECK-TAIL: add sp, sp, #80 +; CHECK-TAIL-NEXT: ret +} diff --git a/test/CodeGen/AArch64/fcmp.ll b/test/CodeGen/AArch64/fcmp.ll new file mode 100644 index 0000000..ad4a903 --- /dev/null +++ b/test/CodeGen/AArch64/fcmp.ll @@ -0,0 +1,81 @@ +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s + +declare void @bar(i32) + +define void @test_float(float %a, float %b) { +; CHECK: test_float: + + %tst1 = fcmp oeq float %a, %b + br i1 %tst1, label %end, label %t2 +; CHECK: fcmp {{s[0-9]+}}, {{s[0-9]+}} +; CHECK: b.eq .L + +t2: + %tst2 = fcmp une float %b, 0.0 + br i1 %tst2, label %t3, label %end +; CHECK: fcmp {{s[0-9]+}}, #0.0 +; CHECK: b.eq .L + + +t3: +; This test can't be implemented with just one A64 conditional +; branch. LLVM converts "ordered and not equal" to "unordered or +; equal" before instruction selection, which is what we currently +; test. Obviously, other sequences are valid. + %tst3 = fcmp one float %a, %b + br i1 %tst3, label %t4, label %end +; CHECK: fcmp {{s[0-9]+}}, {{s[0-9]+}} +; CHECK-NEXT: b.eq .[[T4:LBB[0-9]+_[0-9]+]] +; CHECK-NEXT: b.vs .[[T4]] +t4: + %tst4 = fcmp uge float %a, -0.0 + br i1 %tst4, label %t5, label %end +; CHECK-NOT: fcmp {{s[0-9]+}}, #0.0 +; CHECK: b.mi .LBB + +t5: + call void @bar(i32 0) + ret void +end: + ret void + +} + +define void @test_double(double %a, double %b) { +; CHECK: test_double: + + %tst1 = fcmp oeq double %a, %b + br i1 %tst1, label %end, label %t2 +; CHECK: fcmp {{d[0-9]+}}, {{d[0-9]+}} +; CHECK: b.eq .L + +t2: + %tst2 = fcmp une double %b, 0.0 + br i1 %tst2, label %t3, label %end +; CHECK: fcmp {{d[0-9]+}}, #0.0 +; CHECK: b.eq .L + + +t3: +; This test can't be implemented with just one A64 conditional +; branch. LLVM converts "ordered and not equal" to "unordered or +; equal" before instruction selection, which is what we currently +; test. Obviously, other sequences are valid. 
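+; (Spelled out: "one" is ordered-and-not-equal, so its inverse "ueq" is
+; equal-or-unordered. That pair of conditions is what the two consecutive
+; branches check: b.eq catches equality and b.vs catches the unordered case,
+; which fcmp reports by setting the V flag.)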
+ %tst3 = fcmp one double %a, %b + br i1 %tst3, label %t4, label %end +; CHECK: fcmp {{d[0-9]+}}, {{d[0-9]+}} +; CHECK-NEXT: b.eq .[[T4:LBB[0-9]+_[0-9]+]] +; CHECK-NEXT: b.vs .[[T4]] +t4: + %tst4 = fcmp uge double %a, -0.0 + br i1 %tst4, label %t5, label %end +; CHECK-NOT: fcmp {{d[0-9]+}}, #0.0 +; CHECK: b.mi .LBB + +t5: + call void @bar(i32 0) + ret void +end: + ret void + +} diff --git a/test/CodeGen/AArch64/fcvt-fixed.ll b/test/CodeGen/AArch64/fcvt-fixed.ll new file mode 100644 index 0000000..0f7b95b --- /dev/null +++ b/test/CodeGen/AArch64/fcvt-fixed.ll @@ -0,0 +1,191 @@ +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -O0 | FileCheck %s + +@var32 = global i32 0 +@var64 = global i64 0 + +define void @test_fcvtzs(float %flt, double %dbl) { +; CHECK: test_fcvtzs: + + %fix1 = fmul float %flt, 128.0 + %cvt1 = fptosi float %fix1 to i32 +; CHECK: fcvtzs {{w[0-9]+}}, {{s[0-9]+}}, #7 + store volatile i32 %cvt1, i32* @var32 + + %fix2 = fmul float %flt, 4294967296.0 + %cvt2 = fptosi float %fix2 to i32 +; CHECK: fcvtzs {{w[0-9]+}}, {{s[0-9]+}}, #32 + store volatile i32 %cvt2, i32* @var32 + + %fix3 = fmul float %flt, 128.0 + %cvt3 = fptosi float %fix3 to i64 +; CHECK: fcvtzs {{x[0-9]+}}, {{s[0-9]+}}, #7 + store volatile i64 %cvt3, i64* @var64 + + %fix4 = fmul float %flt, 18446744073709551616.0 + %cvt4 = fptosi float %fix4 to i64 +; CHECK: fcvtzs {{x[0-9]+}}, {{s[0-9]+}}, #64 + store volatile i64 %cvt4, i64* @var64 + + %fix5 = fmul double %dbl, 128.0 + %cvt5 = fptosi double %fix5 to i32 +; CHECK: fcvtzs {{w[0-9]+}}, {{d[0-9]+}}, #7 + store volatile i32 %cvt5, i32* @var32 + + %fix6 = fmul double %dbl, 4294967296.0 + %cvt6 = fptosi double %fix6 to i32 +; CHECK: fcvtzs {{w[0-9]+}}, {{d[0-9]+}}, #32 + store volatile i32 %cvt6, i32* @var32 + + %fix7 = fmul double %dbl, 128.0 + %cvt7 = fptosi double %fix7 to i64 +; CHECK: fcvtzs {{x[0-9]+}}, {{d[0-9]+}}, #7 + store volatile i64 %cvt7, i64* @var64 + + %fix8 = fmul double %dbl, 18446744073709551616.0 + %cvt8 = fptosi double %fix8 to i64 +; CHECK: fcvtzs {{x[0-9]+}}, {{d[0-9]+}}, #64 + store volatile i64 %cvt8, i64* @var64 + + ret void +} + +define void @test_fcvtzu(float %flt, double %dbl) { +; CHECK: test_fcvtzu: + + %fix1 = fmul float %flt, 128.0 + %cvt1 = fptoui float %fix1 to i32 +; CHECK: fcvtzu {{w[0-9]+}}, {{s[0-9]+}}, #7 + store volatile i32 %cvt1, i32* @var32 + + %fix2 = fmul float %flt, 4294967296.0 + %cvt2 = fptoui float %fix2 to i32 +; CHECK: fcvtzu {{w[0-9]+}}, {{s[0-9]+}}, #32 + store volatile i32 %cvt2, i32* @var32 + + %fix3 = fmul float %flt, 128.0 + %cvt3 = fptoui float %fix3 to i64 +; CHECK: fcvtzu {{x[0-9]+}}, {{s[0-9]+}}, #7 + store volatile i64 %cvt3, i64* @var64 + + %fix4 = fmul float %flt, 18446744073709551616.0 + %cvt4 = fptoui float %fix4 to i64 +; CHECK: fcvtzu {{x[0-9]+}}, {{s[0-9]+}}, #64 + store volatile i64 %cvt4, i64* @var64 + + %fix5 = fmul double %dbl, 128.0 + %cvt5 = fptoui double %fix5 to i32 +; CHECK: fcvtzu {{w[0-9]+}}, {{d[0-9]+}}, #7 + store volatile i32 %cvt5, i32* @var32 + + %fix6 = fmul double %dbl, 4294967296.0 + %cvt6 = fptoui double %fix6 to i32 +; CHECK: fcvtzu {{w[0-9]+}}, {{d[0-9]+}}, #32 + store volatile i32 %cvt6, i32* @var32 + + %fix7 = fmul double %dbl, 128.0 + %cvt7 = fptoui double %fix7 to i64 +; CHECK: fcvtzu {{x[0-9]+}}, {{d[0-9]+}}, #7 + store volatile i64 %cvt7, i64* @var64 + + %fix8 = fmul double %dbl, 18446744073709551616.0 + %cvt8 = fptoui double %fix8 to i64 +; CHECK: fcvtzu {{x[0-9]+}}, {{d[0-9]+}}, #64 + store volatile i64 %cvt8, i64* @var64 + + ret void +} + +@varfloat 
= global float 0.0 +@vardouble = global double 0.0 + +define void @test_scvtf(i32 %int, i64 %long) { +; CHECK: test_scvtf: + + %cvt1 = sitofp i32 %int to float + %fix1 = fdiv float %cvt1, 128.0 +; CHECK: scvtf {{s[0-9]+}}, {{w[0-9]+}}, #7 + store volatile float %fix1, float* @varfloat + + %cvt2 = sitofp i32 %int to float + %fix2 = fdiv float %cvt2, 4294967296.0 +; CHECK: scvtf {{s[0-9]+}}, {{w[0-9]+}}, #32 + store volatile float %fix2, float* @varfloat + + %cvt3 = sitofp i64 %long to float + %fix3 = fdiv float %cvt3, 128.0 +; CHECK: scvtf {{s[0-9]+}}, {{x[0-9]+}}, #7 + store volatile float %fix3, float* @varfloat + + %cvt4 = sitofp i64 %long to float + %fix4 = fdiv float %cvt4, 18446744073709551616.0 +; CHECK: scvtf {{s[0-9]+}}, {{x[0-9]+}}, #64 + store volatile float %fix4, float* @varfloat + + %cvt5 = sitofp i32 %int to double + %fix5 = fdiv double %cvt5, 128.0 +; CHECK: scvtf {{d[0-9]+}}, {{w[0-9]+}}, #7 + store volatile double %fix5, double* @vardouble + + %cvt6 = sitofp i32 %int to double + %fix6 = fdiv double %cvt6, 4294967296.0 +; CHECK: scvtf {{d[0-9]+}}, {{w[0-9]+}}, #32 + store volatile double %fix6, double* @vardouble + + %cvt7 = sitofp i64 %long to double + %fix7 = fdiv double %cvt7, 128.0 +; CHECK: scvtf {{d[0-9]+}}, {{x[0-9]+}}, #7 + store volatile double %fix7, double* @vardouble + + %cvt8 = sitofp i64 %long to double + %fix8 = fdiv double %cvt8, 18446744073709551616.0 +; CHECK: scvtf {{d[0-9]+}}, {{x[0-9]+}}, #64 + store volatile double %fix8, double* @vardouble + + ret void +} + +define void @test_ucvtf(i32 %int, i64 %long) { +; CHECK: test_ucvtf: + + %cvt1 = uitofp i32 %int to float + %fix1 = fdiv float %cvt1, 128.0 +; CHECK: ucvtf {{s[0-9]+}}, {{w[0-9]+}}, #7 + store volatile float %fix1, float* @varfloat + + %cvt2 = uitofp i32 %int to float + %fix2 = fdiv float %cvt2, 4294967296.0 +; CHECK: ucvtf {{s[0-9]+}}, {{w[0-9]+}}, #32 + store volatile float %fix2, float* @varfloat + + %cvt3 = uitofp i64 %long to float + %fix3 = fdiv float %cvt3, 128.0 +; CHECK: ucvtf {{s[0-9]+}}, {{x[0-9]+}}, #7 + store volatile float %fix3, float* @varfloat + + %cvt4 = uitofp i64 %long to float + %fix4 = fdiv float %cvt4, 18446744073709551616.0 +; CHECK: ucvtf {{s[0-9]+}}, {{x[0-9]+}}, #64 + store volatile float %fix4, float* @varfloat + + %cvt5 = uitofp i32 %int to double + %fix5 = fdiv double %cvt5, 128.0 +; CHECK: ucvtf {{d[0-9]+}}, {{w[0-9]+}}, #7 + store volatile double %fix5, double* @vardouble + + %cvt6 = uitofp i32 %int to double + %fix6 = fdiv double %cvt6, 4294967296.0 +; CHECK: ucvtf {{d[0-9]+}}, {{w[0-9]+}}, #32 + store volatile double %fix6, double* @vardouble + + %cvt7 = uitofp i64 %long to double + %fix7 = fdiv double %cvt7, 128.0 +; CHECK: ucvtf {{d[0-9]+}}, {{x[0-9]+}}, #7 + store volatile double %fix7, double* @vardouble + + %cvt8 = uitofp i64 %long to double + %fix8 = fdiv double %cvt8, 18446744073709551616.0 +; CHECK: ucvtf {{d[0-9]+}}, {{x[0-9]+}}, #64 + store volatile double %fix8, double* @vardouble + + ret void +} diff --git a/test/CodeGen/AArch64/fcvt-int.ll b/test/CodeGen/AArch64/fcvt-int.ll new file mode 100644 index 0000000..c771d68 --- /dev/null +++ b/test/CodeGen/AArch64/fcvt-int.ll @@ -0,0 +1,151 @@ +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s + +define i32 @test_floattoi32(float %in) { +; CHECK: test_floattoi32: + + %signed = fptosi float %in to i32 + %unsigned = fptoui float %in to i32 +; CHECK: fcvtzu [[UNSIG:w[0-9]+]], {{s[0-9]+}} +; CHECK: fcvtzs [[SIG:w[0-9]+]], {{s[0-9]+}} + + %res = sub i32 %signed, %unsigned +; 
CHECK: sub {{w[0-9]+}}, [[SIG]], [[UNSIG]]
+
+  ret i32 %res
+; CHECK: ret
+}
+
+define i32 @test_doubletoi32(double %in) {
+; CHECK: test_doubletoi32:
+
+  %signed = fptosi double %in to i32
+  %unsigned = fptoui double %in to i32
+; CHECK: fcvtzu [[UNSIG:w[0-9]+]], {{d[0-9]+}}
+; CHECK: fcvtzs [[SIG:w[0-9]+]], {{d[0-9]+}}
+
+  %res = sub i32 %signed, %unsigned
+; CHECK: sub {{w[0-9]+}}, [[SIG]], [[UNSIG]]
+
+  ret i32 %res
+; CHECK: ret
+}
+
+define i64 @test_floattoi64(float %in) {
+; CHECK: test_floattoi64:
+
+  %signed = fptosi float %in to i64
+  %unsigned = fptoui float %in to i64
+; CHECK: fcvtzu [[UNSIG:x[0-9]+]], {{s[0-9]+}}
+; CHECK: fcvtzs [[SIG:x[0-9]+]], {{s[0-9]+}}
+
+  %res = sub i64 %signed, %unsigned
+; CHECK: sub {{x[0-9]+}}, [[SIG]], [[UNSIG]]
+
+  ret i64 %res
+; CHECK: ret
+}
+
+define i64 @test_doubletoi64(double %in) {
+; CHECK: test_doubletoi64:
+
+  %signed = fptosi double %in to i64
+  %unsigned = fptoui double %in to i64
+; CHECK: fcvtzu [[UNSIG:x[0-9]+]], {{d[0-9]+}}
+; CHECK: fcvtzs [[SIG:x[0-9]+]], {{d[0-9]+}}
+
+  %res = sub i64 %signed, %unsigned
+; CHECK: sub {{x[0-9]+}}, [[SIG]], [[UNSIG]]
+
+  ret i64 %res
+; CHECK: ret
+}
+
+define float @test_i32tofloat(i32 %in) {
+; CHECK: test_i32tofloat:
+
+  %signed = sitofp i32 %in to float
+  %unsigned = uitofp i32 %in to float
+; CHECK: ucvtf [[UNSIG:s[0-9]+]], {{w[0-9]+}}
+; CHECK: scvtf [[SIG:s[0-9]+]], {{w[0-9]+}}
+
+  %res = fsub float %signed, %unsigned
+; CHECK: fsub {{s[0-9]+}}, [[SIG]], [[UNSIG]]
+  ret float %res
+; CHECK: ret
+}
+
+define double @test_i32todouble(i32 %in) {
+; CHECK: test_i32todouble:
+
+  %signed = sitofp i32 %in to double
+  %unsigned = uitofp i32 %in to double
+; CHECK: ucvtf [[UNSIG:d[0-9]+]], {{w[0-9]+}}
+; CHECK: scvtf [[SIG:d[0-9]+]], {{w[0-9]+}}
+
+  %res = fsub double %signed, %unsigned
+; CHECK: fsub {{d[0-9]+}}, [[SIG]], [[UNSIG]]
+  ret double %res
+; CHECK: ret
+}
+
+define float @test_i64tofloat(i64 %in) {
+; CHECK: test_i64tofloat:
+
+  %signed = sitofp i64 %in to float
+  %unsigned = uitofp i64 %in to float
+; CHECK: ucvtf [[UNSIG:s[0-9]+]], {{x[0-9]+}}
+; CHECK: scvtf [[SIG:s[0-9]+]], {{x[0-9]+}}
+
+  %res = fsub float %signed, %unsigned
+; CHECK: fsub {{s[0-9]+}}, [[SIG]], [[UNSIG]]
+  ret float %res
+; CHECK: ret
+}
+
+define double @test_i64todouble(i64 %in) {
+; CHECK: test_i64todouble:
+
+  %signed = sitofp i64 %in to double
+  %unsigned = uitofp i64 %in to double
+; CHECK: ucvtf [[UNSIG:d[0-9]+]], {{x[0-9]+}}
+; CHECK: scvtf [[SIG:d[0-9]+]], {{x[0-9]+}}
+
+  %res = fsub double %signed, %unsigned
+; CHECK: fsub {{d[0-9]+}}, [[SIG]], [[UNSIG]]
+  ret double %res
+; CHECK: ret
+}
+
+define i32 @test_bitcastfloattoi32(float %in) {
+; CHECK: test_bitcastfloattoi32:
+
+  %res = bitcast float %in to i32
+; CHECK: fmov {{w[0-9]+}}, {{s[0-9]+}}
+  ret i32 %res
+}
+
+define i64 @test_bitcastdoubletoi64(double %in) {
+; CHECK: test_bitcastdoubletoi64:
+
+  %res = bitcast double %in to i64
+; CHECK: fmov {{x[0-9]+}}, {{d[0-9]+}}
+  ret i64 %res
+}
+
+define float @test_bitcasti32tofloat(i32 %in) {
+; CHECK: test_bitcasti32tofloat:
+
+  %res = bitcast i32 %in to float
+; CHECK: fmov {{s[0-9]+}}, {{w[0-9]+}}
+  ret float %res
+
+}
+
+define double @test_bitcasti64todouble(i64 %in) {
+; CHECK: test_bitcasti64todouble:
+
+  %res = bitcast i64 %in to double
+; CHECK: fmov {{d[0-9]+}}, {{x[0-9]+}}
+  ret double %res
+
+}
diff --git a/test/CodeGen/AArch64/flags-multiuse.ll b/test/CodeGen/AArch64/flags-multiuse.ll
new file mode 100644
index 0000000..940c146
--- /dev/null
+++ b/test/CodeGen/AArch64/flags-multiuse.ll
@@
-0,0 +1,35 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s + +; LLVM should be able to cope with multiple uses of the same flag-setting +; instruction at different points of a routine. Either by rematerializing the +; compare or by saving and restoring the flag register. + +declare void @bar() + +@var = global i32 0 + +define i32 @test_multiflag(i32 %n, i32 %m, i32 %o) { +; CHECK: test_multiflag: + + %test = icmp ne i32 %n, %m +; CHECK: cmp [[LHS:w[0-9]+]], [[RHS:w[0-9]+]] + + %val = zext i1 %test to i32 +; CHECK: csinc {{[xw][0-9]+}}, {{xzr|wzr}}, {{xzr|wzr}}, eq + + store i32 %val, i32* @var + + call void @bar() +; CHECK: bl bar + + ; Currently, the comparison is emitted again. An MSR/MRS pair would also be + ; acceptable, but assuming the call preserves NZCV is not. + br i1 %test, label %iftrue, label %iffalse +; CHECK: cmp [[LHS]], [[RHS]] +; CHECK: b.eq + +iftrue: + ret i32 42 +iffalse: + ret i32 0 +} diff --git a/test/CodeGen/AArch64/floatdp_1source.ll b/test/CodeGen/AArch64/floatdp_1source.ll new file mode 100644 index 0000000..c94ba9b --- /dev/null +++ b/test/CodeGen/AArch64/floatdp_1source.ll @@ -0,0 +1,138 @@ +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s + +@varhalf = global half 0.0 +@varfloat = global float 0.0 +@vardouble = global double 0.0 + +declare float @fabsf(float) readonly +declare double @fabs(double) readonly + +declare float @llvm.sqrt.f32(float %Val) +declare double @llvm.sqrt.f64(double %Val) + +declare float @ceilf(float) readonly +declare double @ceil(double) readonly + +declare float @floorf(float) readonly +declare double @floor(double) readonly + +declare float @truncf(float) readonly +declare double @trunc(double) readonly + +declare float @rintf(float) readonly +declare double @rint(double) readonly + +declare float @nearbyintf(float) readonly +declare double @nearbyint(double) readonly + +define void @simple_float() { +; CHECK: simple_float: + %val1 = load volatile float* @varfloat + + %valabs = call float @fabsf(float %val1) + store volatile float %valabs, float* @varfloat +; CHECK: fabs {{s[0-9]+}}, {{s[0-9]+}} + + %valneg = fsub float -0.0, %val1 + store volatile float %valneg, float* @varfloat +; CHECK: fneg {{s[0-9]+}}, {{s[0-9]+}} + + %valsqrt = call float @llvm.sqrt.f32(float %val1) + store volatile float %valsqrt, float* @varfloat +; CHECK: fsqrt {{s[0-9]+}}, {{s[0-9]+}} + + %valceil = call float @ceilf(float %val1) + store volatile float %valceil, float* @varfloat +; CHECK: frintp {{s[0-9]+}}, {{s[0-9]+}} + + %valfloor = call float @floorf(float %val1) + store volatile float %valfloor, float* @varfloat +; CHECK: frintm {{s[0-9]+}}, {{s[0-9]+}} + + %valtrunc = call float @truncf(float %val1) + store volatile float %valtrunc, float* @varfloat +; CHECK: frintz {{s[0-9]+}}, {{s[0-9]+}} + + %valrint = call float @rintf(float %val1) + store volatile float %valrint, float* @varfloat +; CHECK: frintx {{s[0-9]+}}, {{s[0-9]+}} + + %valnearbyint = call float @nearbyintf(float %val1) + store volatile float %valnearbyint, float* @varfloat +; CHECK: frinti {{s[0-9]+}}, {{s[0-9]+}} + + ret void +} + +define void @simple_double() { +; CHECK: simple_double: + %val1 = load volatile double* @vardouble + + %valabs = call double @fabs(double %val1) + store volatile double %valabs, double* @vardouble +; CHECK: fabs {{d[0-9]+}}, {{d[0-9]+}} + + %valneg = fsub double -0.0, %val1 + store volatile double %valneg, double* @vardouble +; CHECK: fneg {{d[0-9]+}}, {{d[0-9]+}} + + %valsqrt = call 
double @llvm.sqrt.f64(double %val1) + store volatile double %valsqrt, double* @vardouble +; CHECK: fsqrt {{d[0-9]+}}, {{d[0-9]+}} + + %valceil = call double @ceil(double %val1) + store volatile double %valceil, double* @vardouble +; CHECK: frintp {{d[0-9]+}}, {{d[0-9]+}} + + %valfloor = call double @floor(double %val1) + store volatile double %valfloor, double* @vardouble +; CHECK: frintm {{d[0-9]+}}, {{d[0-9]+}} + + %valtrunc = call double @trunc(double %val1) + store volatile double %valtrunc, double* @vardouble +; CHECK: frintz {{d[0-9]+}}, {{d[0-9]+}} + + %valrint = call double @rint(double %val1) + store volatile double %valrint, double* @vardouble +; CHECK: frintx {{d[0-9]+}}, {{d[0-9]+}} + + %valnearbyint = call double @nearbyint(double %val1) + store volatile double %valnearbyint, double* @vardouble +; CHECK: frinti {{d[0-9]+}}, {{d[0-9]+}} + + ret void +} + +define void @converts() { +; CHECK: converts: + + %val16 = load volatile half* @varhalf + %val32 = load volatile float* @varfloat + %val64 = load volatile double* @vardouble + + %val16to32 = fpext half %val16 to float + store volatile float %val16to32, float* @varfloat +; CHECK: fcvt {{s[0-9]+}}, {{h[0-9]+}} + + %val16to64 = fpext half %val16 to double + store volatile double %val16to64, double* @vardouble +; CHECK: fcvt {{d[0-9]+}}, {{h[0-9]+}} + + %val32to16 = fptrunc float %val32 to half + store volatile half %val32to16, half* @varhalf +; CHECK: fcvt {{h[0-9]+}}, {{s[0-9]+}} + + %val32to64 = fpext float %val32 to double + store volatile double %val32to64, double* @vardouble +; CHECK: fcvt {{d[0-9]+}}, {{s[0-9]+}} + + %val64to16 = fptrunc double %val64 to half + store volatile half %val64to16, half* @varhalf +; CHECK: fcvt {{h[0-9]+}}, {{d[0-9]+}} + + %val64to32 = fptrunc double %val64 to float + store volatile float %val64to32, float* @varfloat +; CHECK: fcvt {{s[0-9]+}}, {{d[0-9]+}} + + ret void +} diff --git a/test/CodeGen/AArch64/floatdp_2source.ll b/test/CodeGen/AArch64/floatdp_2source.ll new file mode 100644 index 0000000..b2256b3 --- /dev/null +++ b/test/CodeGen/AArch64/floatdp_2source.ll @@ -0,0 +1,60 @@ +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s + +@varfloat = global float 0.0 +@vardouble = global double 0.0 + +define void @testfloat() { +; CHECK: testfloat: + %val1 = load float* @varfloat + + %val2 = fadd float %val1, %val1 +; CHECK: fadd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} + + %val3 = fmul float %val2, %val1 +; CHECK: fmul {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} + + %val4 = fdiv float %val3, %val1 +; CHECK: fdiv {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} + + %val5 = fsub float %val4, %val2 +; CHECK: fsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} + + store volatile float %val5, float* @varfloat + +; These will be enabled with the implementation of floating-point litpool entries. 
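+; (A sketch of the fold being exercised below, assuming ISel merges the negate +; into the multiply rather than emitting two instructions: +; fmul s2, s0, s1 then fneg s2, s2 ==> fnmul s2, s0, s1.)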
+ %val6 = fmul float %val1, %val2 + %val7 = fsub float -0.0, %val6 +; CHECK: fnmul {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} + + store volatile float %val7, float* @varfloat + + ret void +} + +define void @testdouble() { +; CHECK: testdouble: + %val1 = load double* @vardouble + + %val2 = fadd double %val1, %val1 +; CHECK: fadd {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} + + %val3 = fmul double %val2, %val1 +; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} + + %val4 = fdiv double %val3, %val1 +; CHECK: fdiv {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} + + %val5 = fsub double %val4, %val2 +; CHECK: fsub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} + + store volatile double %val5, double* @vardouble + +; These will be enabled with the implementation of floating-point litpool entries. + %val6 = fmul double %val1, %val2 + %val7 = fsub double -0.0, %val6 +; CHECK: fnmul {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} + + store volatile double %val7, double* @vardouble + + ret void +} diff --git a/test/CodeGen/AArch64/fp-cond-sel.ll b/test/CodeGen/AArch64/fp-cond-sel.ll new file mode 100644 index 0000000..56e8f16 --- /dev/null +++ b/test/CodeGen/AArch64/fp-cond-sel.ll @@ -0,0 +1,26 @@ +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s + +@varfloat = global float 0.0 +@vardouble = global double 0.0 + +define void @test_csel(i32 %lhs32, i32 %rhs32, i64 %lhs64) { +; CHECK: test_csel: + + %tst1 = icmp ugt i32 %lhs32, %rhs32 + %val1 = select i1 %tst1, float 0.0, float 1.0 + store float %val1, float* @varfloat +; CHECK: ldr [[FLT0:s[0-9]+]], [{{x[0-9]+}}, #:lo12:.LCPI +; CHECK: fmov [[FLT1:s[0-9]+]], #1.0 +; CHECK: fcsel {{s[0-9]+}}, [[FLT0]], [[FLT1]], hi + + %rhs64 = sext i32 %rhs32 to i64 + %tst2 = icmp sle i64 %lhs64, %rhs64 + %val2 = select i1 %tst2, double 1.0, double 0.0 + store double %val2, double* @vardouble +; CHECK: ldr [[FLT0:d[0-9]+]], [{{x[0-9]+}}, #:lo12:.LCPI +; CHECK: fmov [[FLT1:d[0-9]+]], #1.0 +; CHECK: fcsel {{d[0-9]+}}, [[FLT1]], [[FLT0]], le + + ret void +; CHECK: ret +} diff --git a/test/CodeGen/AArch64/fp-dp3.ll b/test/CodeGen/AArch64/fp-dp3.ll new file mode 100644 index 0000000..39db9be --- /dev/null +++ b/test/CodeGen/AArch64/fp-dp3.ll @@ -0,0 +1,102 @@ +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -fp-contract=fast | FileCheck %s + +declare float @llvm.fma.f32(float, float, float) +declare double @llvm.fma.f64(double, double, double) + +define float @test_fmadd(float %a, float %b, float %c) { +; CHECK: test_fmadd: + %val = call float @llvm.fma.f32(float %a, float %b, float %c) +; CHECK: fmadd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} + ret float %val +} + +define float @test_fmsub(float %a, float %b, float %c) { +; CHECK: test_fmsub: + %nega = fsub float -0.0, %a + %val = call float @llvm.fma.f32(float %nega, float %b, float %c) +; CHECK: fmsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} + ret float %val +} + +define float @test_fnmadd(float %a, float %b, float %c) { +; CHECK: test_fnmadd: + %negc = fsub float -0.0, %c + %val = call float @llvm.fma.f32(float %a, float %b, float %negc) +; CHECK: fnmadd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} + ret float %val +} + +define float @test_fnmsub(float %a, float %b, float %c) { +; CHECK: test_fnmsub: + %nega = fsub float -0.0, %a + %negc = fsub float -0.0, %c + %val = call float @llvm.fma.f32(float %nega, float %b, float %negc) +; CHECK: fnmsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} + ret float %val +} + +define double @testd_fmadd(double %a, double %b, double %c)
{ +; CHECK: testd_fmadd: + %val = call double @llvm.fma.f64(double %a, double %b, double %c) +; CHECK: fmadd {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} + ret double %val +} + +define double @testd_fmsub(double %a, double %b, double %c) { +; CHECK: testd_fmsub: + %nega = fsub double -0.0, %a + %val = call double @llvm.fma.f64(double %nega, double %b, double %c) +; CHECK: fmsub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} + ret double %val +} + +define double @testd_fnmadd(double %a, double %b, double %c) { +; CHECK: testd_fnmadd: + %negc = fsub double -0.0, %c + %val = call double @llvm.fma.f64(double %a, double %b, double %negc) +; CHECK: fnmadd {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} + ret double %val +} + +define double @testd_fnmsub(double %a, double %b, double %c) { +; CHECK: testd_fnmsub: + %nega = fsub double -0.0, %a + %negc = fsub double -0.0, %c + %val = call double @llvm.fma.f64(double %nega, double %b, double %negc) +; CHECK: fnmsub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} + ret double %val +} + +define float @test_fmadd_unfused(float %a, float %b, float %c) { +; CHECK: test_fmadd_unfused: + %prod = fmul float %b, %c + %sum = fadd float %a, %prod +; CHECK: fmadd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} + ret float %sum +} + +define float @test_fmsub_unfused(float %a, float %b, float %c) { +; CHECK: test_fmsub_unfused: + %prod = fmul float %b, %c + %diff = fsub float %a, %prod +; CHECK: fmsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} + ret float %diff +} + +define float @test_fnmadd_unfused(float %a, float %b, float %c) { +; CHECK: test_fnmadd_unfused: + %nega = fsub float -0.0, %a + %prod = fmul float %b, %c + %sum = fadd float %nega, %prod +; CHECK: fnmadd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} + ret float %sum +} + +define float @test_fnmsub_unfused(float %a, float %b, float %c) { +; CHECK: test_fnmsub_unfused: + %nega = fsub float -0.0, %a + %prod = fmul float %b, %c + %diff = fsub float %nega, %prod +; CHECK: fnmsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} + ret float %diff +} diff --git a/test/CodeGen/AArch64/fp128-folding.ll b/test/CodeGen/AArch64/fp128-folding.ll new file mode 100644 index 0000000..b5bdcf4 --- /dev/null +++ b/test/CodeGen/AArch64/fp128-folding.ll @@ -0,0 +1,17 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s +declare void @bar(i8*, i8*, i32*) + +; SelectionDAG used to try to fold some fp128 operations using the ppc128 type, +; which is not supported. + +define fp128 @test_folding() { +; CHECK: test_folding: + %l = alloca i32 + store i32 42, i32* %l + %val = load i32* %l + %fpval = sitofp i32 %val to fp128 + ; If the value is loaded from a constant pool into an fp128, it's been folded + ; successfully. +; CHECK: ldr {{q[0-9]+}}, [{{x[0-9]+}}, #:lo12:.LCPI + ret fp128 %fpval +}
\ No newline at end of file diff --git a/test/CodeGen/AArch64/fp128.ll b/test/CodeGen/AArch64/fp128.ll new file mode 100644 index 0000000..258d34b --- /dev/null +++ b/test/CodeGen/AArch64/fp128.ll @@ -0,0 +1,280 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s + +@lhs = global fp128 zeroinitializer +@rhs = global fp128 zeroinitializer + +define fp128 @test_add() { +; CHECK: test_add: + + %lhs = load fp128* @lhs + %rhs = load fp128* @rhs +; CHECK: ldr q0, [{{x[0-9]+}}, #:lo12:lhs] +; CHECK: ldr q1, [{{x[0-9]+}}, #:lo12:rhs] + + %val = fadd fp128 %lhs, %rhs +; CHECK: bl __addtf3 + ret fp128 %val +} + +define fp128 @test_sub() { +; CHECK: test_sub: + + %lhs = load fp128* @lhs + %rhs = load fp128* @rhs +; CHECK: ldr q0, [{{x[0-9]+}}, #:lo12:lhs] +; CHECK: ldr q1, [{{x[0-9]+}}, #:lo12:rhs] + + %val = fsub fp128 %lhs, %rhs +; CHECK: bl __subtf3 + ret fp128 %val +} + +define fp128 @test_mul() { +; CHECK: test_mul: + + %lhs = load fp128* @lhs + %rhs = load fp128* @rhs +; CHECK: ldr q0, [{{x[0-9]+}}, #:lo12:lhs] +; CHECK: ldr q1, [{{x[0-9]+}}, #:lo12:rhs] + + %val = fmul fp128 %lhs, %rhs +; CHECK: bl __multf3 + ret fp128 %val +} + +define fp128 @test_div() { +; CHECK: test_div: + + %lhs = load fp128* @lhs + %rhs = load fp128* @rhs +; CHECK: ldr q0, [{{x[0-9]+}}, #:lo12:lhs] +; CHECK: ldr q1, [{{x[0-9]+}}, #:lo12:rhs] + + %val = fdiv fp128 %lhs, %rhs +; CHECK: bl __divtf3 + ret fp128 %val +} + +@var32 = global i32 0 +@var64 = global i64 0 + +define void @test_fptosi() { +; CHECK: test_fptosi: + %val = load fp128* @lhs + + %val32 = fptosi fp128 %val to i32 + store i32 %val32, i32* @var32 +; CHECK: bl __fixtfsi + + %val64 = fptosi fp128 %val to i64 + store i64 %val64, i64* @var64 +; CHECK: bl __fixtfdi + + ret void +} + +define void @test_fptoui() { +; CHECK: test_fptoui: + %val = load fp128* @lhs + + %val32 = fptoui fp128 %val to i32 + store i32 %val32, i32* @var32 +; CHECK: bl __fixunstfsi + + %val64 = fptoui fp128 %val to i64 + store i64 %val64, i64* @var64 +; CHECK: bl __fixunstfdi + + ret void +} + +define void @test_sitofp() { +; CHECK: test_sitofp: + + %src32 = load i32* @var32 + %val32 = sitofp i32 %src32 to fp128 + store volatile fp128 %val32, fp128* @lhs +; CHECK: bl __floatsitf + + %src64 = load i64* @var64 + %val64 = sitofp i64 %src64 to fp128 + store volatile fp128 %val64, fp128* @lhs +; CHECK: bl __floatditf + + ret void +} + +define void @test_uitofp() { +; CHECK: test_uitofp: + + %src32 = load i32* @var32 + %val32 = uitofp i32 %src32 to fp128 + store volatile fp128 %val32, fp128* @lhs +; CHECK: bl __floatunsitf + + %src64 = load i64* @var64 + %val64 = uitofp i64 %src64 to fp128 + store volatile fp128 %val64, fp128* @lhs +; CHECK: bl __floatunditf + + ret void +} + +define i1 @test_setcc1() { +; CHECK: test_setcc1: + + %lhs = load fp128* @lhs + %rhs = load fp128* @rhs +; CHECK: ldr q0, [{{x[0-9]+}}, #:lo12:lhs] +; CHECK: ldr q1, [{{x[0-9]+}}, #:lo12:rhs] + +; Technically, everything after the call to __letf2 is redundant, but we'll let +; LLVM have its fun for now. + %val = fcmp ole fp128 %lhs, %rhs +; CHECK: bl __letf2 +; CHECK: cmp w0, #0 +; CHECK: csinc w0, wzr, wzr, gt + + ret i1 %val +; CHECK: ret +} + +define i1 @test_setcc2() { +; CHECK: test_setcc2: + + %lhs = load fp128* @lhs + %rhs = load fp128* @rhs +; CHECK: ldr q0, [{{x[0-9]+}}, #:lo12:lhs] +; CHECK: ldr q1, [{{x[0-9]+}}, #:lo12:rhs] + +; Unlike the ole case above, no single libcall computes ugt, so everything +; below really is needed.
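+; (Expansion sketch: ugt(a, b) == unord(a, b) || gt(a, b) for fp128, hence the +; two libcalls and the orr that combines their boolean results below.)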
+ %val = fcmp ugt fp128 %lhs, %rhs +; CHECK: bl __unordtf2 +; CHECK: mov x[[UNORDERED:[0-9]+]], x0 + +; CHECK: bl __gttf2 +; CHECK: cmp w0, #0 +; CHECK: csinc [[GT:w[0-9]+]], wzr, wzr, le +; CHECK: cmp w[[UNORDERED]], #0 +; CHECK: csinc [[UNORDERED:w[0-9]+]], wzr, wzr, eq +; CHECK: orr w0, [[UNORDERED]], [[GT]] + + ret i1 %val +; CHECK: ret +} + +define i32 @test_br_cc() { +; CHECK: test_br_cc: + + %lhs = load fp128* @lhs + %rhs = load fp128* @rhs +; CHECK: ldr q0, [{{x[0-9]+}}, #:lo12:lhs] +; CHECK: ldr q1, [{{x[0-9]+}}, #:lo12:rhs] + + ; olt == !uge, which LLVM unfortunately "optimizes" this to. + %cond = fcmp olt fp128 %lhs, %rhs +; CHECK: bl __unordtf2 +; CHECK: mov x[[UNORDERED:[0-9]+]], x0 + +; CHECK: bl __getf2 +; CHECK: cmp w0, #0 + +; CHECK: csinc [[OGE:w[0-9]+]], wzr, wzr, lt +; CHECK: cmp w[[UNORDERED]], #0 +; CHECK: csinc [[UNORDERED:w[0-9]+]], wzr, wzr, eq +; CHECK: orr [[UGE:w[0-9]+]], [[UNORDERED]], [[OGE]] +; CHECK: cbnz [[UGE]], [[RET29:.LBB[0-9]+_[0-9]+]] + br i1 %cond, label %iftrue, label %iffalse + +iftrue: + ret i32 42 +; CHECK-NEXT: BB# +; CHECK-NEXT: movz x0, #42 +; CHECK-NEXT: b [[REALRET:.LBB[0-9]+_[0-9]+]] + +iffalse: + ret i32 29 +; CHECK: [[RET29]]: +; CHECK-NEXT: movz x0, #29 +; CHECK-NEXT: [[REALRET]]: +; CHECK: ret +} + +define void @test_select(i1 %cond, fp128 %lhs, fp128 %rhs) { +; CHECK: test_select: + + %val = select i1 %cond, fp128 %lhs, fp128 %rhs + store fp128 %val, fp128* @lhs +; CHECK: cmp w0, #0 +; CHECK: str q1, [sp] +; CHECK-NEXT: b.eq [[IFFALSE:.LBB[0-9]+_[0-9]+]] +; CHECK-NEXT: BB# +; CHECK-NEXT: str q0, [sp] +; CHECK-NEXT: [[IFFALSE]]: +; CHECK-NEXT: ldr q0, [sp] +; CHECK: str q0, [{{x[0-9]+}}, #:lo12:lhs] + ret void +; CHECK: ret +} + +@varfloat = global float 0.0 +@vardouble = global double 0.0 + +define void @test_round() { +; CHECK: test_round: + + %val = load fp128* @lhs + + %float = fptrunc fp128 %val to float + store float %float, float* @varfloat +; CHECK: bl __trunctfsf2 +; CHECK: str s0, [{{x[0-9]+}}, #:lo12:varfloat] + + %double = fptrunc fp128 %val to double + store double %double, double* @vardouble +; CHECK: bl __trunctfdf2 +; CHECK: str d0, [{{x[0-9]+}}, #:lo12:vardouble] + + ret void +} + +define void @test_extend() { +; CHECK: test_extend: + + %val = load fp128* @lhs + + %float = load float* @varfloat + %fromfloat = fpext float %float to fp128 + store volatile fp128 %fromfloat, fp128* @lhs +; CHECK: bl __extendsftf2 +; CHECK: str q0, [{{x[0-9]+}}, #:lo12:lhs] + + %double = load double* @vardouble + %fromdouble = fpext double %double to fp128 + store volatile fp128 %fromdouble, fp128* @lhs +; CHECK: bl __extenddftf2 +; CHECK: str q0, [{{x[0-9]+}}, #:lo12:lhs] + + ret void +; CHECK: ret +} + +define fp128 @test_neg(fp128 %in) { +; CHECK: [[MINUS0:.LCPI[0-9]+_0]]: +; Make sure the weird hex constant below *is* -0.0 +; CHECK-NEXT: fp128 -0 + +; CHECK: test_neg: + + ; Could in principle be optimized to fneg which we can't select, this makes + ; sure that doesn't happen. 
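+; (Sketch of the point: an fp128 fneg would only flip bit 127 of the value, +; but with no fneg selectable the expected lowering is the -0.0 literal load +; and the __subtf3 call checked below.)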
+ %ret = fsub fp128 0xL00000000000000008000000000000000, %in +; CHECK: str q0, [sp, #-16] +; CHECK-NEXT: ldr q1, [sp], #16 +; CHECK: ldr q0, [{{x[0-9]+}}, #:lo12:[[MINUS0]]] +; CHECK: bl __subtf3 + + ret fp128 %ret +; CHECK: ret +} diff --git a/test/CodeGen/AArch64/fpimm.ll b/test/CodeGen/AArch64/fpimm.ll new file mode 100644 index 0000000..fd28aee --- /dev/null +++ b/test/CodeGen/AArch64/fpimm.ll @@ -0,0 +1,34 @@ +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s + +@varf32 = global float 0.0 +@varf64 = global double 0.0 + +define void @check_float() { +; CHECK: check_float: + + %val = load float* @varf32 + %newval1 = fadd float %val, 8.5 + store volatile float %newval1, float* @varf32 +; CHECK: fmov {{s[0-9]+}}, #8.5 + + %newval2 = fadd float %val, 128.0 + store volatile float %newval2, float* @varf32 +; CHECK: ldr {{s[0-9]+}}, [{{x[0-9]+}}, #:lo12:.LCPI0_0 + + ret void +} + +define void @check_double() { +; CHECK: check_double: + + %val = load double* @varf64 + %newval1 = fadd double %val, 8.5 + store volatile double %newval1, double* @varf64 +; CHECK: fmov {{d[0-9]+}}, #8.5 + + %newval2 = fadd double %val, 128.0 + store volatile double %newval2, double* @varf64 +; CHECK: ldr {{d[0-9]+}}, [{{x[0-9]+}}, #:lo12:.LCPI1_0 + + ret void +} diff --git a/test/CodeGen/AArch64/func-argpassing.ll b/test/CodeGen/AArch64/func-argpassing.ll new file mode 100644 index 0000000..5675e5a --- /dev/null +++ b/test/CodeGen/AArch64/func-argpassing.ll @@ -0,0 +1,193 @@ +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s + +%myStruct = type { i64 , i8, i32 } + +@var8 = global i8 0 +@var32 = global i32 0 +@var64 = global i64 0 +@var128 = global i128 0 +@varfloat = global float 0.0 +@vardouble = global double 0.0 +@varstruct = global %myStruct zeroinitializer + +define void @take_i8s(i8 %val1, i8 %val2) { +; CHECK: take_i8s: + store i8 %val2, i8* @var8 + ; Not using w1 may be technically allowed, but it would indicate a + ; problem in itself. +; CHECK: strb w1, [{{x[0-9]+}}, #:lo12:var8] + ret void +} + +define void @add_floats(float %val1, float %val2) { +; CHECK: add_floats: + %newval = fadd float %val1, %val2 +; CHECK: fadd [[ADDRES:s[0-9]+]], s0, s1 + store float %newval, float* @varfloat +; CHECK: str [[ADDRES]], [{{x[0-9]+}}, #:lo12:varfloat] + ret void +} + +; byval pointers should be allocated to the stack and copied as if +; with memcpy. 
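+; (For illustration only, with hypothetical %src8/%tmp8 names: a caller would +; be lowered roughly as +; %tmp = alloca %myStruct +; call void @llvm.memcpy.p0i8.p0i8.i32(i8* %tmp8, i8* %src8, i32 16, i32 8, i1 0) +; call void @take_struct(%myStruct* byval %tmp) +; so the callee below can treat its copy as private stack memory.)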
+define void @take_struct(%myStruct* byval %structval) { +; CHECK: take_struct: + %addr0 = getelementptr %myStruct* %structval, i64 0, i32 2 + %addr1 = getelementptr %myStruct* %structval, i64 0, i32 0 + + %val0 = load i32* %addr0 + ; Some weird move means x0 is used for one access +; CHECK: ldr [[REG32:w[0-9]+]], [{{x[0-9]+|sp}}, #12] + store i32 %val0, i32* @var32 +; CHECK: str [[REG32]], [{{x[0-9]+}}, #:lo12:var32] + + %val1 = load i64* %addr1 +; CHECK: ldr [[REG64:x[0-9]+]], [{{x[0-9]+|sp}}] + store i64 %val1, i64* @var64 +; CHECK: str [[REG64]], [{{x[0-9]+}}, #:lo12:var64] + + ret void +} + +; %structval should be at sp + 16 +define void @check_byval_align(i32* byval %ignore, %myStruct* byval align 16 %structval) { +; CHECK: check_byval_align: + + %addr0 = getelementptr %myStruct* %structval, i64 0, i32 2 + %addr1 = getelementptr %myStruct* %structval, i64 0, i32 0 + + %val0 = load i32* %addr0 + ; Some weird move means x0 is used for one access +; CHECK: add x[[STRUCTVAL_ADDR:[0-9]+]], sp, #16 +; CHECK: ldr [[REG32:w[0-9]+]], [x[[STRUCTVAL_ADDR]], #12] + store i32 %val0, i32* @var32 +; CHECK: str [[REG32]], [{{x[0-9]+}}, #:lo12:var32] + + %val1 = load i64* %addr1 +; CHECK: ldr [[REG64:x[0-9]+]], [sp, #16] + store i64 %val1, i64* @var64 +; CHECK: str [[REG64]], [{{x[0-9]+}}, #:lo12:var64] + + ret void +} + +define i32 @return_int() { +; CHECK: return_int: + %val = load i32* @var32 + ret i32 %val +; CHECK: ldr w0, [{{x[0-9]+}}, #:lo12:var32] + ; Make sure epilogue follows +; CHECK-NEXT: ret +} + +define double @return_double() { +; CHECK: return_double: + ret double 3.14 +; CHECK: ldr d0, [{{x[0-9]+}}, #:lo12:.LCPI +} + +; This is the kind of IR clang will produce for returning a struct +; small enough to go into registers. Not all that pretty, but it +; works. +define [2 x i64] @return_struct() { +; CHECK: return_struct: + %addr = bitcast %myStruct* @varstruct to [2 x i64]* + %val = load [2 x i64]* %addr + ret [2 x i64] %val +; CHECK: ldr x0, [{{x[0-9]+}}, #:lo12:varstruct] + ; Odd register regex below disallows x0 which we want to be live now. +; CHECK: add {{x[1-9][0-9]*}}, {{x[1-9][0-9]*}}, #:lo12:varstruct +; CHECK-NEXT: ldr x1, [{{x[1-9][0-9]*}}, #8] + ; Make sure epilogue immediately follows +; CHECK-NEXT: ret +} + +; Large structs are passed by reference (storage allocated by caller +; to preserve value semantics) in x8. Strictly this only applies to +; structs larger than 16 bytes, but C semantics can still be provided +; if LLVM does it to %myStruct too. So this is the simplest check +define void @return_large_struct(%myStruct* sret %retval) { +; CHECK: return_large_struct: + %addr0 = getelementptr %myStruct* %retval, i64 0, i32 0 + %addr1 = getelementptr %myStruct* %retval, i64 0, i32 1 + %addr2 = getelementptr %myStruct* %retval, i64 0, i32 2 + + store i64 42, i64* %addr0 + store i8 2, i8* %addr1 + store i32 9, i32* %addr2 +; CHECK: str {{x[0-9]+}}, [x8] +; CHECK: strb {{w[0-9]+}}, [x8, #8] +; CHECK: str {{w[0-9]+}}, [x8, #12] + + ret void +} + +; This struct is just too far along the argument list to go into registers +; (only x7 is available, but it needs two). Also make sure that %stacked +; doesn't sneak into x7 behind it.
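+; (Expected AAPCS64 assignment for the signature below, as a sketch: %var0-%var3 +; take w0-x3, %var45 takes the pair x4/x5, %var6 takes x6; the 16-byte byval +; %struct would need two registers with only x7 left, so it and %stacked go on +; the stack while %notstacked travels in d0.)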
+define i32 @struct_on_stack(i8 %var0, i16 %var1, i32 %var2, i64 %var3, i128 %var45, + i32* %var6, %myStruct* byval %struct, i32* byval %stacked, + double %notstacked) { +; CHECK: struct_on_stack: + %addr = getelementptr %myStruct* %struct, i64 0, i32 0 + %val64 = load i64* %addr + store i64 %val64, i64* @var64 + ; Currently nothing on local stack, so struct should be at sp +; CHECK: ldr [[VAL64:x[0-9]+]], [sp] +; CHECK: str [[VAL64]], [{{x[0-9]+}}, #:lo12:var64] + + store double %notstacked, double* @vardouble +; CHECK-NOT: ldr d0 +; CHECK: str d0, [{{x[0-9]+}}, #:lo12:vardouble + + %retval = load i32* %stacked + ret i32 %retval +; CHECK: ldr w0, [sp, #16] +} + +define void @stacked_fpu(float %var0, double %var1, float %var2, float %var3, + float %var4, float %var5, float %var6, float %var7, + float %var8) { +; CHECK: stacked_fpu: + store float %var8, float* @varfloat + ; Beware as above: the offset would be different on big-endian + ; machines if the first ldr were changed to use s-registers. +; CHECK: ldr d[[VALFLOAT:[0-9]+]], [sp] +; CHECK: str s[[VALFLOAT]], [{{x[0-9]+}}, #:lo12:varfloat] + + ret void +} + +; 128-bit integer types should be passed in xEVEN, xODD rather than +; the reverse. In this case x2 and x3. Nothing should use x1. +define i32 @check_i128_regalign(i32 %val0, i128 %val1, i32 %val2) { +; CHECK: check_i128_regalign + store i128 %val1, i128* @var128 +; CHECK: str x2, [{{x[0-9]+}}, #:lo12:var128] +; CHECK: str x3, [{{x[0-9]+}}, #8] + + ret i32 %val2 +; CHECK: mov x0, x4 +} + +define void @check_i128_stackalign(i32 %val0, i32 %val1, i32 %val2, i32 %val3, + i32 %val4, i32 %val5, i32 %val6, i32 %val7, + i32 %stack1, i128 %stack2) { +; CHECK: check_i128_stackalign + store i128 %stack2, i128* @var128 + ; Nothing local on stack in current codegen, so first stack is 16 away +; CHECK: ldr {{x[0-9]+}}, [sp, #16] + ; Important point is that we address sp+24 for second dword +; CHECK: add [[REG:x[0-9]+]], sp, #16 +; CHECK: ldr {{x[0-9]+}}, {{\[}}[[REG]], #8] + ret void +} + +declare void @llvm.memcpy.p0i8.p0i8.i32(i8*, i8*, i32, i32, i1) + +define i32 @test_extern() { +; CHECK: test_extern: + call void @llvm.memcpy.p0i8.p0i8.i32(i8* undef, i8* undef, i32 undef, i32 4, i1 0) +; CHECK: bl memcpy + ret i32 0 +} diff --git a/test/CodeGen/AArch64/func-calls.ll b/test/CodeGen/AArch64/func-calls.ll new file mode 100644 index 0000000..abb09a5 --- /dev/null +++ b/test/CodeGen/AArch64/func-calls.ll @@ -0,0 +1,140 @@ +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s + +%myStruct = type { i64 , i8, i32 } + +@var8 = global i8 0 +@var8_2 = global i8 0 +@var32 = global i32 0 +@var64 = global i64 0 +@var128 = global i128 0 +@varfloat = global float 0.0 +@varfloat_2 = global float 0.0 +@vardouble = global double 0.0 +@varstruct = global %myStruct zeroinitializer +@varsmallstruct = global [2 x i64] zeroinitializer + +declare void @take_i8s(i8 %val1, i8 %val2) +declare void @take_floats(float %val1, float %val2) + +define void @simple_args() { +; CHECK: simple_args: + %char1 = load i8* @var8 + %char2 = load i8* @var8_2 + call void @take_i8s(i8 %char1, i8 %char2) +; CHECK: ldrb w0, [{{x[0-9]+}}, #:lo12:var8] +; CHECK: ldrb w1, [{{x[0-9]+}}, #:lo12:var8_2] +; CHECK: bl take_i8s + + %float1 = load float* @varfloat + %float2 = load float* @varfloat_2 + call void @take_floats(float %float1, float %float2) +; CHECK: ldr s1, [{{x[0-9]+}}, #:lo12:varfloat_2] +; CHECK: ldr s0, [{{x[0-9]+}}, #:lo12:varfloat] +; CHECK: bl take_floats + + ret void +} + +declare i32 
@return_int() +declare double @return_double() +declare [2 x i64] @return_smallstruct() +declare void @return_large_struct(%myStruct* sret %retval) + +define void @simple_rets() { +; CHECK: simple_rets: + + %int = call i32 @return_int() + store i32 %int, i32* @var32 +; CHECK: bl return_int +; CHECK: str w0, [{{x[0-9]+}}, #:lo12:var32] + + %dbl = call double @return_double() + store double %dbl, double* @vardouble +; CHECK: bl return_double +; CHECK: str d0, [{{x[0-9]+}}, #:lo12:vardouble] + + %arr = call [2 x i64] @return_smallstruct() + store [2 x i64] %arr, [2 x i64]* @varsmallstruct +; CHECK: bl return_smallstruct +; CHECK: str x1, [{{x[0-9]+}}, #8] +; CHECK: str x0, [{{x[0-9]+}}, #:lo12:varsmallstruct] + + call void @return_large_struct(%myStruct* sret @varstruct) +; CHECK: add x8, {{x[0-9]+}}, #:lo12:varstruct +; CHECK: bl return_large_struct + + ret void +} + + +declare i32 @struct_on_stack(i8 %var0, i16 %var1, i32 %var2, i64 %var3, i128 %var45, + i32* %var6, %myStruct* byval %struct, i32 %stacked, + double %notstacked) +declare void @stacked_fpu(float %var0, double %var1, float %var2, float %var3, + float %var4, float %var5, float %var6, float %var7, + float %var8) + +define void @check_stack_args() { + call i32 @struct_on_stack(i8 0, i16 12, i32 42, i64 99, i128 1, + i32* @var32, %myStruct* byval @varstruct, + i32 999, double 1.0) + ; Want to check that the final double is passed in registers and + ; that varstruct is passed on the stack. Rather dependent on how a + ; memcpy gets created, but the following works for now. +; CHECK: mov x0, sp +; CHECK: str {{w[0-9]+}}, [x0] +; CHECK: str {{w[0-9]+}}, [x0, #12] +; CHECK: fmov d0, +; CHECK: bl struct_on_stack + + call void @stacked_fpu(float -1.0, double 1.0, float 4.0, float 2.0, + float -2.0, float -8.0, float 16.0, float 1.0, + float 64.0) +; CHECK: ldr s[[STACKEDREG:[0-9]+]], [{{x[0-9]+}}, #:lo12:.LCPI +; CHECK: mov x0, sp +; CHECK: str d[[STACKEDREG]], [x0] +; CHECK: bl stacked_fpu + ret void +} + + +declare void @check_i128_stackalign(i32 %val0, i32 %val1, i32 %val2, i32 %val3, + i32 %val4, i32 %val5, i32 %val6, i32 %val7, + i32 %stack1, i128 %stack2) + +declare void @check_i128_regalign(i32 %val0, i128 %val1) + + +define void @check_i128_align() { +; CHECK: check_i128_align: + %val = load i128* @var128 + call void @check_i128_stackalign(i32 0, i32 1, i32 2, i32 3, + i32 4, i32 5, i32 6, i32 7, + i32 42, i128 %val) +; CHECK: ldr [[I128LO:x[0-9]+]], [{{x[0-9]+}}, #:lo12:var128] +; CHECK: ldr [[I128HI:x[0-9]+]], [{{x[0-9]+}}, #8] +; CHECK: mov x[[SPREG:[0-9]+]], sp +; CHECK: str [[I128HI]], [x[[SPREG]], #24] +; CHECK: str [[I128LO]], [x[[SPREG]], #16] +; CHECK: bl check_i128_stackalign + + call void @check_i128_regalign(i32 0, i128 42) +; CHECK-NOT: mov x1 +; CHECK: movz x2, #42 +; CHECK: mov x3, xzr +; CHECK: bl check_i128_regalign + + ret void +} + +@fptr = global void()* null + +define void @check_indirect_call() { +; CHECK: check_indirect_call: + %func = load void()** @fptr + call void %func() +; CHECK: ldr [[FPTR:x[0-9]+]], [{{x[0-9]+}}, #:lo12:fptr] +; CHECK: blr [[FPTR]] + + ret void +} diff --git a/test/CodeGen/AArch64/global-alignment.ll b/test/CodeGen/AArch64/global-alignment.ll new file mode 100644 index 0000000..8ed6e55 --- /dev/null +++ b/test/CodeGen/AArch64/global-alignment.ll @@ -0,0 +1,69 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s + +@var32 = global [3 x i32] zeroinitializer +@var64 = global [3 x i64] zeroinitializer +@var32_align64 = global [3 x i32] zeroinitializer,
align 8 +define i64 @test_align32() { +; CHECK: test_align32: + %addr = bitcast [3 x i32]* @var32 to i64* + + ; Since @var32 is only guaranteed to be aligned to 32-bits, it's invalid to + ; emit an "LDR x0, [x0, #:lo12:var32]" instruction to implement this load. + %val = load i64* %addr +; CHECK: adrp [[HIBITS:x[0-9]+]], var32 +; CHECK: add x[[ADDR:[0-9]+]], [[HIBITS]], #:lo12:var32 +; CHECK: ldr x0, [x[[ADDR]]] + + ret i64 %val +} + +define i64 @test_align64() { +; CHECK: test_align64: + %addr = bitcast [3 x i64]* @var64 to i64* + + ; However, var64 *is* properly aligned and emitting an adrp/add/ldr would be + ; inefficient. + %val = load i64* %addr +; CHECK: adrp x[[HIBITS:[0-9]+]], var64 +; CHECK-NOT: add x[[HIBITS]] +; CHECK: ldr x0, [x[[HIBITS]], #:lo12:var64] + + ret i64 %val +} + +define i64 @test_var32_align64() { +; CHECK: test_var32_align64: + %addr = bitcast [3 x i32]* @var32_align64 to i64* + + ; @var32_align64 carries an explicit align 8, so here it *is* valid to fold + ; the lo12 offset straight into the load, just as for @var64. + %val = load i64* %addr +; CHECK: adrp x[[HIBITS:[0-9]+]], var32_align64 +; CHECK-NOT: add x[[HIBITS]] +; CHECK: ldr x0, [x[[HIBITS]], #:lo12:var32_align64] + + ret i64 %val +} + +@yet_another_var = external global {i32, i32} + +define i64 @test_yet_another_var() { +; CHECK: test_yet_another_var: + + ; @yet_another_var has a preferred alignment of 8, but that's not enough if + ; we're going to be linking against other things. Its ABI alignment is only 4 + ; so we can't fold the load. + %val = load i64* bitcast({i32, i32}* @yet_another_var to i64*) +; CHECK: adrp [[HIBITS:x[0-9]+]], yet_another_var +; CHECK: add x[[ADDR:[0-9]+]], [[HIBITS]], #:lo12:yet_another_var +; CHECK: ldr x0, [x[[ADDR]]] + ret i64 %val +} + +define i64()* @test_functions() { +; CHECK: test_functions: + ret i64()* @test_yet_another_var +; CHECK: adrp [[HIBITS:x[0-9]+]], test_yet_another_var +; CHECK: add x0, [[HIBITS]], #:lo12:test_yet_another_var +} diff --git a/test/CodeGen/AArch64/got-abuse.ll b/test/CodeGen/AArch64/got-abuse.ll new file mode 100644 index 0000000..c474e58 --- /dev/null +++ b/test/CodeGen/AArch64/got-abuse.ll @@ -0,0 +1,23 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu -relocation-model=pic < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-none-linux-gnu -relocation-model=pic -filetype=obj < %s + +; LLVM gives well-defined semantics to this horrible construct (though C says +; it's undefined). Regardless, we shouldn't crash. The important feature here is +; that in general the only way to access a GOT symbol is via a 64-bit +; load.
Neither of these alternatives has the ELF relocations required to +; support it: +; + ldr wD, [xN, #:got_lo12:func] +; + add xD, xN, #:got_lo12:func + +declare void @consume(i32) +declare void @func() + +define void @foo() nounwind { +; CHECK: foo: +entry: + call void @consume(i32 ptrtoint (void ()* @func to i32)) +; CHECK: adrp x[[ADDRHI:[0-9]+]], :got:func +; CHECK: ldr {{x[0-9]+}}, [x[[ADDRHI]], #:got_lo12:func] + ret void +} + diff --git a/test/CodeGen/AArch64/i128-align.ll b/test/CodeGen/AArch64/i128-align.ll new file mode 100644 index 0000000..f019ea0 --- /dev/null +++ b/test/CodeGen/AArch64/i128-align.ll @@ -0,0 +1,29 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s + +%struct = type { i32, i128, i8 } + +@var = global %struct zeroinitializer + +define i64 @check_size() { +; CHECK: check_size: + %starti = ptrtoint %struct* @var to i64 + + %endp = getelementptr %struct* @var, i64 1 + %endi = ptrtoint %struct* %endp to i64 + + %diff = sub i64 %endi, %starti + ret i64 %diff +; CHECK: movz x0, #48 +} + +define i64 @check_field() { +; CHECK: check_field: + %starti = ptrtoint %struct* @var to i64 + + %endp = getelementptr %struct* @var, i64 0, i32 1 + %endi = ptrtoint i128* %endp to i64 + + %diff = sub i64 %endi, %starti + ret i64 %diff +; CHECK: movz x0, #16 +}
\ No newline at end of file diff --git a/test/CodeGen/AArch64/illegal-float-ops.ll b/test/CodeGen/AArch64/illegal-float-ops.ll new file mode 100644 index 0000000..446151b --- /dev/null +++ b/test/CodeGen/AArch64/illegal-float-ops.ll @@ -0,0 +1,221 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s + +@varfloat = global float 0.0 +@vardouble = global double 0.0 +@varfp128 = global fp128 zeroinitializer + +declare float @llvm.cos.f32(float) +declare double @llvm.cos.f64(double) +declare fp128 @llvm.cos.f128(fp128) + +define void @test_cos(float %float, double %double, fp128 %fp128) { +; CHECK: test_cos: + + %cosfloat = call float @llvm.cos.f32(float %float) + store float %cosfloat, float* @varfloat +; CHECK: bl cosf + + %cosdouble = call double @llvm.cos.f64(double %double) + store double %cosdouble, double* @vardouble +; CHECK: bl cos + + %cosfp128 = call fp128 @llvm.cos.f128(fp128 %fp128) + store fp128 %cosfp128, fp128* @varfp128 +; CHECK: bl cosl + + ret void +} + +declare float @llvm.exp.f32(float) +declare double @llvm.exp.f64(double) +declare fp128 @llvm.exp.f128(fp128) + +define void @test_exp(float %float, double %double, fp128 %fp128) { +; CHECK: test_exp: + + %expfloat = call float @llvm.exp.f32(float %float) + store float %expfloat, float* @varfloat +; CHECK: bl expf + + %expdouble = call double @llvm.exp.f64(double %double) + store double %expdouble, double* @vardouble +; CHECK: bl exp + + %expfp128 = call fp128 @llvm.exp.f128(fp128 %fp128) + store fp128 %expfp128, fp128* @varfp128 +; CHECK: bl expl + + ret void +} + +declare float @llvm.exp2.f32(float) +declare double @llvm.exp2.f64(double) +declare fp128 @llvm.exp2.f128(fp128) + +define void @test_exp2(float %float, double %double, fp128 %fp128) { +; CHECK: test_exp2: + + %exp2float = call float @llvm.exp2.f32(float %float) + store float %exp2float, float* @varfloat +; CHECK: bl exp2f + + %exp2double = call double @llvm.exp2.f64(double %double) + store double %exp2double, double* @vardouble +; CHECK: bl exp2 + + %exp2fp128 = call fp128 @llvm.exp2.f128(fp128 %fp128) + store fp128 %exp2fp128, fp128* @varfp128 +; CHECK: bl exp2l + ret void + +} + +declare float @llvm.log.f32(float) +declare double @llvm.log.f64(double) +declare fp128 @llvm.log.f128(fp128) + +define void @test_log(float %float, double %double, fp128 %fp128) { +; CHECK: test_log: + + %logfloat = call float @llvm.log.f32(float %float) + store float %logfloat, float* @varfloat +; CHECK: bl logf + + %logdouble = call double @llvm.log.f64(double %double) + store double %logdouble, double* @vardouble +; CHECK: bl log + + %logfp128 = call fp128 @llvm.log.f128(fp128 %fp128) + store fp128 %logfp128, fp128* @varfp128 +; CHECK: bl logl + + ret void +} + +declare float @llvm.log2.f32(float) +declare double @llvm.log2.f64(double) +declare fp128 @llvm.log2.f128(fp128) + +define void @test_log2(float %float, double %double, fp128 %fp128) { +; CHECK: test_log2: + + %log2float = call float @llvm.log2.f32(float %float) + store float %log2float, float* @varfloat +; CHECK: bl log2f + + %log2double = call double @llvm.log2.f64(double %double) + store double %log2double, double* @vardouble +; CHECK: bl log2 + + %log2fp128 = call fp128 @llvm.log2.f128(fp128 %fp128) + store fp128 %log2fp128, fp128* @varfp128 +; CHECK: bl log2l + ret void + +} + +declare float @llvm.log10.f32(float) +declare double @llvm.log10.f64(double) +declare fp128 @llvm.log10.f128(fp128) + +define void @test_log10(float %float, double %double, fp128 %fp128) { +; CHECK: 
test_log10: + + %log10float = call float @llvm.log10.f32(float %float) + store float %log10float, float* @varfloat +; CHECK: bl log10f + + %log10double = call double @llvm.log10.f64(double %double) + store double %log10double, double* @vardouble +; CHECK: bl log10 + + %log10fp128 = call fp128 @llvm.log10.f128(fp128 %fp128) + store fp128 %log10fp128, fp128* @varfp128 +; CHECK: bl log10l + + ret void +} + +declare float @llvm.sin.f32(float) +declare double @llvm.sin.f64(double) +declare fp128 @llvm.sin.f128(fp128) + +define void @test_sin(float %float, double %double, fp128 %fp128) { +; CHECK: test_sin: + + %sinfloat = call float @llvm.sin.f32(float %float) + store float %sinfloat, float* @varfloat +; CHECK: bl sinf + + %sindouble = call double @llvm.sin.f64(double %double) + store double %sindouble, double* @vardouble +; CHECK: bl sin + + %sinfp128 = call fp128 @llvm.sin.f128(fp128 %fp128) + store fp128 %sinfp128, fp128* @varfp128 +; CHECK: bl sinl + ret void + +} + +declare float @llvm.pow.f32(float, float) +declare double @llvm.pow.f64(double, double) +declare fp128 @llvm.pow.f128(fp128, fp128) + +define void @test_pow(float %float, double %double, fp128 %fp128) { +; CHECK: test_pow: + + %powfloat = call float @llvm.pow.f32(float %float, float %float) + store float %powfloat, float* @varfloat +; CHECK: bl powf + + %powdouble = call double @llvm.pow.f64(double %double, double %double) + store double %powdouble, double* @vardouble +; CHECK: bl pow + + %powfp128 = call fp128 @llvm.pow.f128(fp128 %fp128, fp128 %fp128) + store fp128 %powfp128, fp128* @varfp128 +; CHECK: bl powl + + ret void +} + +declare float @llvm.powi.f32(float, i32) +declare double @llvm.powi.f64(double, i32) +declare fp128 @llvm.powi.f128(fp128, i32) + +define void @test_powi(float %float, double %double, i32 %exponent, fp128 %fp128) { +; CHECK: test_powi: + + %powifloat = call float @llvm.powi.f32(float %float, i32 %exponent) + store float %powifloat, float* @varfloat +; CHECK: bl __powisf2 + + %powidouble = call double @llvm.powi.f64(double %double, i32 %exponent) + store double %powidouble, double* @vardouble +; CHECK: bl __powidf2 + + %powifp128 = call fp128 @llvm.powi.f128(fp128 %fp128, i32 %exponent) + store fp128 %powifp128, fp128* @varfp128 +; CHECK: bl __powitf2 + ret void + +} + +define void @test_frem(float %float, double %double, fp128 %fp128) { +; CHECK: test_frem: + + %fremfloat = frem float %float, %float + store float %fremfloat, float* @varfloat +; CHECK: bl fmodf + + %fremdouble = frem double %double, %double + store double %fremdouble, double* @vardouble +; CHECK: bl fmod + + %fremfp128 = frem fp128 %fp128, %fp128 + store fp128 %fremfp128, fp128* @varfp128 +; CHECK: bl fmodl + + ret void +} diff --git a/test/CodeGen/AArch64/init-array.ll b/test/CodeGen/AArch64/init-array.ll new file mode 100644 index 0000000..d80be8f --- /dev/null +++ b/test/CodeGen/AArch64/init-array.ll @@ -0,0 +1,9 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs -use-init-array < %s | FileCheck %s + +define internal void @_GLOBAL__I_a() section ".text.startup" { + ret void +} + +@llvm.global_ctors = appending global [1 x { i32, void ()* }] [{ i32, void ()* } { i32 65535, void ()* @_GLOBAL__I_a }] + +; CHECK: .section .init_array
\ No newline at end of file diff --git a/test/CodeGen/AArch64/inline-asm-constraints-badI.ll b/test/CodeGen/AArch64/inline-asm-constraints-badI.ll new file mode 100644 index 0000000..c39c57f --- /dev/null +++ b/test/CodeGen/AArch64/inline-asm-constraints-badI.ll @@ -0,0 +1,7 @@ +; RUN: not llc -mtriple=aarch64-none-linux-gnu < %s + +define void @foo() { + ; Out of range immediate for I. + call void asm sideeffect "add x0, x0, $0", "I"(i32 4096) + ret void +}
\ No newline at end of file diff --git a/test/CodeGen/AArch64/inline-asm-constraints-badK.ll b/test/CodeGen/AArch64/inline-asm-constraints-badK.ll new file mode 100644 index 0000000..47c5f98 --- /dev/null +++ b/test/CodeGen/AArch64/inline-asm-constraints-badK.ll @@ -0,0 +1,7 @@ +; RUN: not llc -mtriple=aarch64-none-linux-gnu < %s + +define void @foo() { + ; 32-bit bitpattern ending in 1101 can't be produced. + call void asm sideeffect "and w0, w0, $0", "K"(i32 13) + ret void +}
\ No newline at end of file diff --git a/test/CodeGen/AArch64/inline-asm-constraints-badK2.ll b/test/CodeGen/AArch64/inline-asm-constraints-badK2.ll new file mode 100644 index 0000000..7a5b99e --- /dev/null +++ b/test/CodeGen/AArch64/inline-asm-constraints-badK2.ll @@ -0,0 +1,7 @@ +; RUN: not llc -mtriple=aarch64-none-linux-gnu < %s + +define void @foo() { + ; 4294967296 (= 0x100000000) doesn't fit in the 32-bit K constraint. + call void asm sideeffect "and w0, w0, $0", "K"(i64 4294967296) + ret void +}
\ No newline at end of file diff --git a/test/CodeGen/AArch64/inline-asm-constraints-badL.ll b/test/CodeGen/AArch64/inline-asm-constraints-badL.ll new file mode 100644 index 0000000..4f00398 --- /dev/null +++ b/test/CodeGen/AArch64/inline-asm-constraints-badL.ll @@ -0,0 +1,7 @@ +; RUN: not llc -mtriple=aarch64-none-linux-gnu < %s + +define void @foo() { + ; The L constraint wants a 64-bit logical immediate; an i32 operand is rejected. + call void asm sideeffect "and x0, x0, $0", "L"(i32 13) + ret void +}
\ No newline at end of file diff --git a/test/CodeGen/AArch64/inline-asm-constraints.ll b/test/CodeGen/AArch64/inline-asm-constraints.ll new file mode 100644 index 0000000..c232f32 --- /dev/null +++ b/test/CodeGen/AArch64/inline-asm-constraints.ll @@ -0,0 +1,117 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s + +define i64 @test_inline_constraint_r(i64 %base, i32 %offset) { +; CHECK: test_inline_constraint_r: + %val = call i64 asm "add $0, $1, $2, sxtw", "=r,r,r"(i64 %base, i32 %offset) +; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, {{w[0-9]+}}, sxtw + ret i64 %val +} + +define i16 @test_small_reg(i16 %lhs, i16 %rhs) { +; CHECK: test_small_reg: + %val = call i16 asm sideeffect "add $0, $1, $2, sxth", "=r,r,r"(i16 %lhs, i16 %rhs) +; CHECK: add {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, sxth + ret i16 %val +} + +define i64 @test_inline_constraint_r_imm(i64 %base, i32 %offset) { +; CHECK: test_inline_constraint_r_imm: + %val = call i64 asm "add $0, $1, $2, sxtw", "=r,r,r"(i64 4, i32 12) +; CHECK: movz [[FOUR:x[0-9]+]], #4 +; CHECK: movz [[TWELVE:w[0-9]+]], #12 +; CHECK: add {{x[0-9]+}}, [[FOUR]], [[TWELVE]], sxtw + ret i64 %val +} + +; m is permitted to have a base/offset form. We don't do that +; currently though. +define i32 @test_inline_constraint_m(i32 *%ptr) { +; CHECK: test_inline_constraint_m: + %val = call i32 asm "ldr $0, $1", "=r,m"(i32 *%ptr) +; CHECK: ldr {{w[0-9]+}}, [{{x[0-9]+}}] + ret i32 %val +} + +@arr = global [8 x i32] zeroinitializer + +; Q should *never* have base/offset form even if given the chance. +define i32 @test_inline_constraint_Q(i32 *%ptr) { +; CHECK: test_inline_constraint_Q: + %val = call i32 asm "ldr $0, $1", "=r,Q"(i32* getelementptr([8 x i32]* @arr, i32 0, i32 1)) +; CHECK: ldr {{w[0-9]+}}, [{{x[0-9]+}}] + ret i32 %val +} + +@dump = global fp128 zeroinitializer + +define void @test_inline_constraint_I() { +; CHECK: test_inline_constraint_I: + call void asm sideeffect "add x0, x0, $0", "I"(i32 0) + call void asm sideeffect "add x0, x0, $0", "I"(i64 4095) +; CHECK: add x0, x0, #0 +; CHECK: add x0, x0, #4095 + + ret void +} + +; Skip J because it's useless + +define void @test_inline_constraint_K() { +; CHECK: test_inline_constraint_K: + call void asm sideeffect "and w0, w0, $0", "K"(i32 2863311530) ; = 0xaaaaaaaa + call void asm sideeffect "and w0, w0, $0", "K"(i32 65535) +; CHECK: and w0, w0, #-1431655766 +; CHECK: and w0, w0, #65535 + + ret void +} + +define void @test_inline_constraint_L() { +; CHECK: test_inline_constraint_L: + call void asm sideeffect "and x0, x0, $0", "L"(i64 4294967296) ; = 0x100000000 + call void asm sideeffect "and x0, x0, $0", "L"(i64 65535) +; CHECK: and x0, x0, #4294967296 +; CHECK: and x0, x0, #65535 + + ret void +} + +; Skip M and N because we don't support MOV pseudo-instructions yet.
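+; (For reference, a summary consistent with the tests above and the bad* files +; alongside: I is an ADD immediate in [0, 4095], K a valid 32-bit logical +; immediate, L a valid 64-bit logical immediate.)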
+ +@var = global i32 0 + +define void @test_inline_constraint_S() { +; CHECK: test_inline_constraint_S: + call void asm sideeffect "adrp x0, $0", "S"(i32* @var) + call void asm sideeffect "adrp x0, ${0:A}", "S"(i32* @var) + call void asm sideeffect "add x0, x0, ${0:L}", "S"(i32* @var) +; CHECK: adrp x0, var +; CHECK: adrp x0, var +; CHECK: add x0, x0, #:lo12:var + ret void +} + +define i32 @test_inline_constraint_S_label(i1 %in) { +; CHECK: test_inline_constraint_S_label: + call void asm sideeffect "adr x0, $0", "S"(i8* blockaddress(@test_inline_constraint_S_label, %loc)) +; CHECK: adr x0, .Ltmp{{[0-9]+}} + br i1 %in, label %loc, label %loc2 +loc: + ret i32 0 +loc2: + ret i32 42 +} + +define void @test_inline_constraint_Y() { +; CHECK: test_inline_constraint_Y: + call void asm sideeffect "fcmp s0, $0", "Y"(float 0.0) +; CHECK: fcmp s0, #0.0 + ret void +} + +define void @test_inline_constraint_Z() { +; CHECK: test_inline_constraint_Z: + call void asm sideeffect "cmp w0, $0", "Z"(i32 0) +; CHECK: cmp w0, #0 + ret void +}
\ No newline at end of file diff --git a/test/CodeGen/AArch64/inline-asm-modifiers.ll b/test/CodeGen/AArch64/inline-asm-modifiers.ll new file mode 100644 index 0000000..3b55945 --- /dev/null +++ b/test/CodeGen/AArch64/inline-asm-modifiers.ll @@ -0,0 +1,125 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu -relocation-model=pic < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-none-linux-gnu -relocation-model=pic -filetype=obj < %s | llvm-objdump -r - | FileCheck --check-prefix=CHECK-ELF %s + +@var_simple = hidden global i32 0 +@var_got = global i32 0 +@var_tlsgd = thread_local global i32 0 +@var_tlsld = thread_local(localdynamic) global i32 0 +@var_tlsie = thread_local(initialexec) global i32 0 +@var_tlsle = thread_local(localexec) global i32 0 + +define void @test_inline_modifier_L() nounwind { +; CHECK: test_inline_modifier_L: + call void asm sideeffect "add x0, x0, ${0:L}", "S,~{x0}"(i32* @var_simple) + call void asm sideeffect "ldr x0, [x0, ${0:L}]", "S,~{x0}"(i32* @var_got) + call void asm sideeffect "add x0, x0, ${0:L}", "S,~{x0}"(i32* @var_tlsgd) + call void asm sideeffect "add x0, x0, ${0:L}", "S,~{x0}"(i32* @var_tlsld) + call void asm sideeffect "ldr x0, [x0, ${0:L}]", "S,~{x0}"(i32* @var_tlsie) + call void asm sideeffect "add x0, x0, ${0:L}", "S,~{x0}"(i32* @var_tlsle) +; CHECK: add x0, x0, #:lo12:var_simple +; CHECK: ldr x0, [x0, #:got_lo12:var_got] +; CHECK: add x0, x0, #:tlsdesc_lo12:var_tlsgd +; CHECK: add x0, x0, #:dtprel_lo12:var_tlsld +; CHECK: ldr x0, [x0, #:gottprel_lo12:var_tlsie] +; CHECK: add x0, x0, #:tprel_lo12:var_tlsle + +; CHECK-ELF: R_AARCH64_ADD_ABS_LO12_NC var_simple +; CHECK-ELF: R_AARCH64_LD64_GOT_LO12_NC var_got +; CHECK-ELF: R_AARCH64_TLSDESC_ADD_LO12_NC var_tlsgd +; CHECK-ELF: R_AARCH64_TLSLD_ADD_DTPREL_LO12 var_tlsld +; CHECK-ELF: R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC var_tlsie +; CHECK-ELF: R_AARCH64_TLSLE_ADD_TPREL_LO12 var_tlsle + + ret void +} + +define void @test_inline_modifier_G() nounwind { +; CHECK: test_inline_modifier_G: + call void asm sideeffect "add x0, x0, ${0:G}, lsl #12", "S,~{x0}"(i32* @var_tlsld) + call void asm sideeffect "add x0, x0, ${0:G}, lsl #12", "S,~{x0}"(i32* @var_tlsle) +; CHECK: add x0, x0, #:dtprel_hi12:var_tlsld, lsl #12 +; CHECK: add x0, x0, #:tprel_hi12:var_tlsle, lsl #12 + +; CHECK-ELF: R_AARCH64_TLSLD_ADD_DTPREL_HI12 var_tlsld +; CHECK-ELF: R_AARCH64_TLSLE_ADD_TPREL_HI12 var_tlsle + + ret void +} + +define void @test_inline_modifier_A() nounwind { +; CHECK: test_inline_modifier_A: + call void asm sideeffect "adrp x0, ${0:A}", "S,~{x0}"(i32* @var_simple) + call void asm sideeffect "adrp x0, ${0:A}", "S,~{x0}"(i32* @var_got) + call void asm sideeffect "adrp x0, ${0:A}", "S,~{x0}"(i32* @var_tlsgd) + call void asm sideeffect "adrp x0, ${0:A}", "S,~{x0}"(i32* @var_tlsie) + ; N.b. All tprel and dtprel relocs are modified: lo12 or granules. 
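+; (That is, ${0:A} prints the page-level symbol appropriate to each model, e.g. +; "adrp x0, :tlsdesc:var_tlsgd"; tprel/dtprel relocations exist only in those +; lo12/hi12-granule forms, so var_tlsld and var_tlsle have no adrp variant to +; test here.)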
+; CHECK: adrp x0, var_simple +; CHECK: adrp x0, :got:var_got +; CHECK: adrp x0, :tlsdesc:var_tlsgd +; CHECK: adrp x0, :gottprel:var_tlsie + +; CHECK-ELF: R_AARCH64_ADR_PREL_PG_HI21 var_simple +; CHECK-ELF: R_AARCH64_ADR_GOT_PAGE var_got +; CHECK-ELF: R_AARCH64_TLSDESC_ADR_PAGE var_tlsgd +; CHECK-ELF: R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21 var_tlsie + + ret void +} + +define void @test_inline_modifier_wx(i32 %small, i64 %big) nounwind { +; CHECK: test_inline_modifier_wx: + call i32 asm sideeffect "add $0, $0, $0", "=r,0"(i32 %small) + call i32 asm sideeffect "add ${0:w}, ${0:w}, ${0:w}", "=r,0"(i32 %small) + call i32 asm sideeffect "add ${0:x}, ${0:x}, ${0:x}", "=r,0"(i32 %small) +; CHECK: //APP +; CHECK: add {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: add {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}} + + call i64 asm sideeffect "add $0, $0, $0", "=r,0"(i64 %big) + call i64 asm sideeffect "add ${0:w}, ${0:w}, ${0:w}", "=r,0"(i64 %big) + call i64 asm sideeffect "add ${0:x}, ${0:x}, ${0:x}", "=r,0"(i64 %big) +; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}} +; CHECK: add {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}} + + call i32 asm sideeffect "add ${0:w}, ${1:w}, ${1:w}", "=r,r"(i32 0) + call i32 asm sideeffect "add ${0:x}, ${1:x}, ${1:x}", "=r,r"(i32 0) +; CHECK: add {{w[0-9]+}}, wzr, wzr +; CHECK: add {{x[0-9]+}}, xzr, xzr + ret void +} + +define void @test_inline_modifier_bhsdq() nounwind { +; CHECK: test_inline_modifier_bhsdq: + call float asm sideeffect "ldr ${0:b}, [sp]", "=w"() + call float asm sideeffect "ldr ${0:h}, [sp]", "=w"() + call float asm sideeffect "ldr ${0:s}, [sp]", "=w"() + call float asm sideeffect "ldr ${0:d}, [sp]", "=w"() + call float asm sideeffect "ldr ${0:q}, [sp]", "=w"() +; CHECK: ldr b0, [sp] +; CHECK: ldr h0, [sp] +; CHECK: ldr s0, [sp] +; CHECK: ldr d0, [sp] +; CHECK: ldr q0, [sp] + + call double asm sideeffect "ldr ${0:b}, [sp]", "=w"() + call double asm sideeffect "ldr ${0:h}, [sp]", "=w"() + call double asm sideeffect "ldr ${0:s}, [sp]", "=w"() + call double asm sideeffect "ldr ${0:d}, [sp]", "=w"() + call double asm sideeffect "ldr ${0:q}, [sp]", "=w"() +; CHECK: ldr b0, [sp] +; CHECK: ldr h0, [sp] +; CHECK: ldr s0, [sp] +; CHECK: ldr d0, [sp] +; CHECK: ldr q0, [sp] + ret void +} + +define void @test_inline_modifier_c() nounwind { +; CHECK: test_inline_modifier_c: + call void asm sideeffect "adr x0, ${0:c}", "i"(i32 3) +; CHECK: adr x0, 3 + + ret void +}
\ No newline at end of file diff --git a/test/CodeGen/AArch64/jump-table.ll b/test/CodeGen/AArch64/jump-table.ll new file mode 100644 index 0000000..dcf9f4e --- /dev/null +++ b/test/CodeGen/AArch64/jump-table.ll @@ -0,0 +1,56 @@ +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -filetype=obj | elf-dump | FileCheck %s -check-prefix=CHECK-ELF + +define i32 @test_jumptable(i32 %in) { +; CHECK: test_jumptable + + switch i32 %in, label %def [ + i32 0, label %lbl1 + i32 1, label %lbl2 + i32 2, label %lbl3 + i32 4, label %lbl4 + ] +; CHECK: adrp [[JTPAGE:x[0-9]+]], .LJTI0_0 +; CHECK: add x[[JT:[0-9]+]], [[JTPAGE]], #:lo12:.LJTI0_0 +; CHECK: ldr [[DEST:x[0-9]+]], [x[[JT]], {{x[0-9]+}}, lsl #3] +; CHECK: br [[DEST]] + +def: + ret i32 0 + +lbl1: + ret i32 1 + +lbl2: + ret i32 2 + +lbl3: + ret i32 4 + +lbl4: + ret i32 8 + +} + +; CHECK: .rodata + +; CHECK: .LJTI0_0: +; CHECK-NEXT: .xword +; CHECK-NEXT: .xword +; CHECK-NEXT: .xword +; CHECK-NEXT: .xword +; CHECK-NEXT: .xword + +; ELF tests: + +; First make sure we get a page/lo12 pair in .text to pick up the jump-table +; CHECK-ELF: .rela.text +; CHECK-ELF: ('r_sym', 0x00000008) +; CHECK-ELF-NEXT: ('r_type', 0x00000113) +; CHECK-ELF: ('r_sym', 0x00000008) +; CHECK-ELF-NEXT: ('r_type', 0x00000115) + +; Also check the targets in .rodata are relocated +; CHECK-ELF: .rela.rodata +; CHECK-ELF: ('r_sym', 0x00000005) +; CHECK-ELF-NEXT: ('r_type', 0x00000101)
\ No newline at end of file diff --git a/test/CodeGen/AArch64/large-frame.ll b/test/CodeGen/AArch64/large-frame.ll new file mode 100644 index 0000000..2b2e129 --- /dev/null +++ b/test/CodeGen/AArch64/large-frame.ll @@ -0,0 +1,114 @@ +; RUN: llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s +declare void @use_addr(i8*) + +@addr = global i8* null + +define void @test_bigframe() { +; CHECK: test_bigframe: + + %var1 = alloca i8, i32 20000000 + %var2 = alloca i8, i32 16 + %var3 = alloca i8, i32 20000000 +; CHECK: sub sp, sp, #496 +; CHECK: str x30, [sp, #488] + ; Total adjust is 39999536 +; CHECK: movz [[SUBCONST:x[0-9]+]], #22576 +; CHECK: movk [[SUBCONST]], #610, lsl #16 +; CHECK: sub sp, sp, [[SUBCONST]] + + ; Total offset is 20000024 +; CHECK: movz [[VAR1OFFSET:x[0-9]+]], #11544 +; CHECK: movk [[VAR1OFFSET]], #305, lsl #16 +; CHECK: add {{x[0-9]+}}, sp, [[VAR1OFFSET]] + store volatile i8* %var1, i8** @addr + + %var1plus2 = getelementptr i8* %var1, i32 2 + store volatile i8* %var1plus2, i8** @addr + +; CHECK: movz [[VAR2OFFSET:x[0-9]+]], #11528 +; CHECK: movk [[VAR2OFFSET]], #305, lsl #16 +; CHECK: add {{x[0-9]+}}, sp, [[VAR2OFFSET]] + store volatile i8* %var2, i8** @addr + + %var2plus2 = getelementptr i8* %var2, i32 2 + store volatile i8* %var2plus2, i8** @addr + + store volatile i8* %var3, i8** @addr + + %var3plus2 = getelementptr i8* %var3, i32 2 + store volatile i8* %var3plus2, i8** @addr + +; CHECK: movz [[ADDCONST:x[0-9]+]], #22576 +; CHECK: movk [[ADDCONST]], #610, lsl #16 +; CHECK: add sp, sp, [[ADDCONST]] + ret void +} + +define void @test_mediumframe() { +; CHECK: test_mediumframe: + %var1 = alloca i8, i32 1000000 + %var2 = alloca i8, i32 16 + %var3 = alloca i8, i32 1000000 +; CHECK: sub sp, sp, #496 +; CHECK: str x30, [sp, #488] +; CHECK: sub sp, sp, #688 +; CHECK-NEXT: sub sp, sp, #488, lsl #12 + + store volatile i8* %var1, i8** @addr +; CHECK: add [[VAR1ADDR:x[0-9]+]], sp, #600 +; CHECK: add [[VAR1ADDR]], [[VAR1ADDR]], #244, lsl #12 + + %var1plus2 = getelementptr i8* %var1, i32 2 + store volatile i8* %var1plus2, i8** @addr +; CHECK: add [[VAR1PLUS2:x[0-9]+]], {{x[0-9]+}}, #2 + + store volatile i8* %var2, i8** @addr +; CHECK: add [[VAR2ADDR:x[0-9]+]], sp, #584 +; CHECK: add [[VAR2ADDR]], [[VAR2ADDR]], #244, lsl #12 + + %var2plus2 = getelementptr i8* %var2, i32 2 + store volatile i8* %var2plus2, i8** @addr +; CHECK: add [[VAR2PLUS2:x[0-9]+]], {{x[0-9]+}}, #2 + + store volatile i8* %var3, i8** @addr + + %var3plus2 = getelementptr i8* %var3, i32 2 + store volatile i8* %var3plus2, i8** @addr + +; CHECK: add sp, sp, #688 +; CHECK: add sp, sp, #488, lsl #12 +; CHECK: ldr x30, [sp, #488] +; CHECK: add sp, sp, #496 + ret void +} + + +@bigspace = global [8 x i64] zeroinitializer + +; If temporary registers are allocated for adjustment, they should *not* clobber +; argument registers. 
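+; (Sketch of the failure mode guarded against, with hypothetical bad codegen: +; using an argument register as scratch for the adjustment, e.g. +; "movz x0, #...; movk x0, #..., lsl #16; sub sp, sp, x0", would corrupt the +; [8 x i64] argument arriving in x0-x7, hence the CHECK-NOTs below.)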
+define void @test_tempallocation([8 x i64] %val) nounwind { +; CHECK: test_tempallocation: + %var = alloca i8, i32 1000000 +; CHECK: sub sp, sp, + +; Make sure the prologue is reasonably efficient +; CHECK-NEXT: stp x29, x30, [sp, +; CHECK-NEXT: stp x25, x26, [sp, +; CHECK-NEXT: stp x23, x24, [sp, +; CHECK-NEXT: stp x21, x22, [sp, +; CHECK-NEXT: stp x19, x20, [sp, + +; Make sure we don't trash an argument register +; CHECK-NOT: movz {{x[0-7],}} +; CHECK: sub sp, sp, + +; CHECK-NOT: movz {{x[0-7],}} + +; CHECK: bl use_addr + call void @use_addr(i8* %var) + + store [8 x i64] %val, [8 x i64]* @bigspace + ret void +; CHECK: ret +} diff --git a/test/CodeGen/AArch64/ldst-regoffset.ll b/test/CodeGen/AArch64/ldst-regoffset.ll new file mode 100644 index 0000000..4593512 --- /dev/null +++ b/test/CodeGen/AArch64/ldst-regoffset.ll @@ -0,0 +1,333 @@ +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s + +@var_8bit = global i8 0 +@var_16bit = global i16 0 +@var_32bit = global i32 0 +@var_64bit = global i64 0 + +@var_float = global float 0.0 +@var_double = global double 0.0 + +define void @ldst_8bit(i8* %base, i32 %off32, i64 %off64) { +; CHECK: ldst_8bit: + + %addr8_sxtw = getelementptr i8* %base, i32 %off32 + %val8_sxtw = load volatile i8* %addr8_sxtw + %val32_signed = sext i8 %val8_sxtw to i32 + store volatile i32 %val32_signed, i32* @var_32bit +; CHECK: ldrsb {{w[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw] + + %addr_lsl = getelementptr i8* %base, i64 %off64 + %val8_lsl = load volatile i8* %addr_lsl + %val32_unsigned = zext i8 %val8_lsl to i32 + store volatile i32 %val32_unsigned, i32* @var_32bit +; CHECK: ldrb {{w[0-9]+}}, [{{x[0-9]+}}, {{x[0-9]+}}] + + %addrint_uxtw = ptrtoint i8* %base to i64 + %offset_uxtw = zext i32 %off32 to i64 + %addrint1_uxtw = add i64 %addrint_uxtw, %offset_uxtw + %addr_uxtw = inttoptr i64 %addrint1_uxtw to i8* + %val8_uxtw = load volatile i8* %addr_uxtw + %newval8 = add i8 %val8_uxtw, 1 + store volatile i8 %newval8, i8* @var_8bit +; CHECK: ldrb {{w[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, uxtw] + + ret void +} + + +define void @ldst_16bit(i16* %base, i32 %off32, i64 %off64) { +; CHECK: ldst_16bit: + + %addr8_sxtwN = getelementptr i16* %base, i32 %off32 + %val8_sxtwN = load volatile i16* %addr8_sxtwN + %val32_signed = sext i16 %val8_sxtwN to i32 + store volatile i32 %val32_signed, i32* @var_32bit +; CHECK: ldrsh {{w[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw #1] + + %addr_lslN = getelementptr i16* %base, i64 %off64 + %val8_lslN = load volatile i16* %addr_lslN + %val32_unsigned = zext i16 %val8_lslN to i32 + store volatile i32 %val32_unsigned, i32* @var_32bit +; CHECK: ldrh {{w[0-9]+}}, [{{x[0-9]+}}, {{x[0-9]+}}, lsl #1] + + %addrint_uxtw = ptrtoint i16* %base to i64 + %offset_uxtw = zext i32 %off32 to i64 + %addrint1_uxtw = add i64 %addrint_uxtw, %offset_uxtw + %addr_uxtw = inttoptr i64 %addrint1_uxtw to i16* + %val8_uxtw = load volatile i16* %addr_uxtw + %newval8 = add i16 %val8_uxtw, 1 + store volatile i16 %newval8, i16* @var_16bit +; CHECK: ldrh {{w[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, uxtw] + + %base_sxtw = ptrtoint i16* %base to i64 + %offset_sxtw = sext i32 %off32 to i64 + %addrint_sxtw = add i64 %base_sxtw, %offset_sxtw + %addr_sxtw = inttoptr i64 %addrint_sxtw to i16* + %val16_sxtw = load volatile i16* %addr_sxtw + %val64_signed = sext i16 %val16_sxtw to i64 + store volatile i64 %val64_signed, i64* @var_64bit +; CHECK: ldrsh {{x[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw] + + + %base_lsl = ptrtoint i16* %base to i64 + %addrint_lsl = add i64 
%base_lsl, %off64 + %addr_lsl = inttoptr i64 %addrint_lsl to i16* + %val16_lsl = load volatile i16* %addr_lsl + %val64_unsigned = zext i16 %val16_lsl to i64 + store volatile i64 %val64_unsigned, i64* @var_64bit +; CHECK: ldrh {{w[0-9]+}}, [{{x[0-9]+}}, {{x[0-9]+}}] + + %base_uxtwN = ptrtoint i16* %base to i64 + %offset_uxtwN = zext i32 %off32 to i64 + %offset2_uxtwN = shl i64 %offset_uxtwN, 1 + %addrint_uxtwN = add i64 %base_uxtwN, %offset2_uxtwN + %addr_uxtwN = inttoptr i64 %addrint_uxtwN to i16* + %val32 = load volatile i32* @var_32bit + %val16_trunc32 = trunc i32 %val32 to i16 + store volatile i16 %val16_trunc32, i16* %addr_uxtwN +; CHECK: strh {{w[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, uxtw #1] + ret void +} + +define void @ldst_32bit(i32* %base, i32 %off32, i64 %off64) { +; CHECK: ldst_32bit: + + %addr_sxtwN = getelementptr i32* %base, i32 %off32 + %val_sxtwN = load volatile i32* %addr_sxtwN + store volatile i32 %val_sxtwN, i32* @var_32bit +; CHECK: ldr {{w[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw #2] + + %addr_lslN = getelementptr i32* %base, i64 %off64 + %val_lslN = load volatile i32* %addr_lslN + store volatile i32 %val_lslN, i32* @var_32bit +; CHECK: ldr {{w[0-9]+}}, [{{x[0-9]+}}, {{x[0-9]+}}, lsl #2] + + %addrint_uxtw = ptrtoint i32* %base to i64 + %offset_uxtw = zext i32 %off32 to i64 + %addrint1_uxtw = add i64 %addrint_uxtw, %offset_uxtw + %addr_uxtw = inttoptr i64 %addrint1_uxtw to i32* + %val_uxtw = load volatile i32* %addr_uxtw + %newval8 = add i32 %val_uxtw, 1 + store volatile i32 %newval8, i32* @var_32bit +; CHECK: ldr {{w[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, uxtw] + + + %base_sxtw = ptrtoint i32* %base to i64 + %offset_sxtw = sext i32 %off32 to i64 + %addrint_sxtw = add i64 %base_sxtw, %offset_sxtw + %addr_sxtw = inttoptr i64 %addrint_sxtw to i32* + %val16_sxtw = load volatile i32* %addr_sxtw + %val64_signed = sext i32 %val16_sxtw to i64 + store volatile i64 %val64_signed, i64* @var_64bit +; CHECK: ldrsw {{x[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw] + + + %base_lsl = ptrtoint i32* %base to i64 + %addrint_lsl = add i64 %base_lsl, %off64 + %addr_lsl = inttoptr i64 %addrint_lsl to i32* + %val16_lsl = load volatile i32* %addr_lsl + %val64_unsigned = zext i32 %val16_lsl to i64 + store volatile i64 %val64_unsigned, i64* @var_64bit +; CHECK: ldr {{w[0-9]+}}, [{{x[0-9]+}}, {{x[0-9]+}}] + + %base_uxtwN = ptrtoint i32* %base to i64 + %offset_uxtwN = zext i32 %off32 to i64 + %offset2_uxtwN = shl i64 %offset_uxtwN, 2 + %addrint_uxtwN = add i64 %base_uxtwN, %offset2_uxtwN + %addr_uxtwN = inttoptr i64 %addrint_uxtwN to i32* + %val32 = load volatile i32* @var_32bit + store volatile i32 %val32, i32* %addr_uxtwN +; CHECK: str {{w[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, uxtw #2] + ret void +} + +define void @ldst_64bit(i64* %base, i32 %off32, i64 %off64) { +; CHECK: ldst_64bit: + + %addr_sxtwN = getelementptr i64* %base, i32 %off32 + %val_sxtwN = load volatile i64* %addr_sxtwN + store volatile i64 %val_sxtwN, i64* @var_64bit +; CHECK: ldr {{x[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw #3] + + %addr_lslN = getelementptr i64* %base, i64 %off64 + %val_lslN = load volatile i64* %addr_lslN + store volatile i64 %val_lslN, i64* @var_64bit +; CHECK: ldr {{x[0-9]+}}, [{{x[0-9]+}}, {{x[0-9]+}}, lsl #3] + + %addrint_uxtw = ptrtoint i64* %base to i64 + %offset_uxtw = zext i32 %off32 to i64 + %addrint1_uxtw = add i64 %addrint_uxtw, %offset_uxtw + %addr_uxtw = inttoptr i64 %addrint1_uxtw to i64* + %val8_uxtw = load volatile i64* %addr_uxtw + %newval8 = add i64 %val8_uxtw, 1 + store volatile i64 %newval8, i64* 
@var_64bit +; CHECK: ldr {{x[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, uxtw] + + %base_sxtw = ptrtoint i64* %base to i64 + %offset_sxtw = sext i32 %off32 to i64 + %addrint_sxtw = add i64 %base_sxtw, %offset_sxtw + %addr_sxtw = inttoptr i64 %addrint_sxtw to i64* + %val64_sxtw = load volatile i64* %addr_sxtw + store volatile i64 %val64_sxtw, i64* @var_64bit +; CHECK: ldr {{x[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw] + + %base_lsl = ptrtoint i64* %base to i64 + %addrint_lsl = add i64 %base_lsl, %off64 + %addr_lsl = inttoptr i64 %addrint_lsl to i64* + %val64_lsl = load volatile i64* %addr_lsl + store volatile i64 %val64_lsl, i64* @var_64bit +; CHECK: ldr {{x[0-9]+}}, [{{x[0-9]+}}, {{x[0-9]+}}] + + %base_uxtwN = ptrtoint i64* %base to i64 + %offset_uxtwN = zext i32 %off32 to i64 + %offset2_uxtwN = shl i64 %offset_uxtwN, 3 + %addrint_uxtwN = add i64 %base_uxtwN, %offset2_uxtwN + %addr_uxtwN = inttoptr i64 %addrint_uxtwN to i64* + %val64 = load volatile i64* @var_64bit + store volatile i64 %val64, i64* %addr_uxtwN +; CHECK: str {{x[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, uxtw #3] + ret void +} + +define void @ldst_float(float* %base, i32 %off32, i64 %off64) { +; CHECK: ldst_float: + + %addr_sxtwN = getelementptr float* %base, i32 %off32 + %val_sxtwN = load volatile float* %addr_sxtwN + store volatile float %val_sxtwN, float* @var_float +; CHECK: ldr {{s[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw #2] + + %addr_lslN = getelementptr float* %base, i64 %off64 + %val_lslN = load volatile float* %addr_lslN + store volatile float %val_lslN, float* @var_float +; CHECK: ldr {{s[0-9]+}}, [{{x[0-9]+}}, {{x[0-9]+}}, lsl #2] + + %addrint_uxtw = ptrtoint float* %base to i64 + %offset_uxtw = zext i32 %off32 to i64 + %addrint1_uxtw = add i64 %addrint_uxtw, %offset_uxtw + %addr_uxtw = inttoptr i64 %addrint1_uxtw to float* + %val_uxtw = load volatile float* %addr_uxtw + store volatile float %val_uxtw, float* @var_float +; CHECK: ldr {{s[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, uxtw] + + %base_sxtw = ptrtoint float* %base to i64 + %offset_sxtw = sext i32 %off32 to i64 + %addrint_sxtw = add i64 %base_sxtw, %offset_sxtw + %addr_sxtw = inttoptr i64 %addrint_sxtw to float* + %val64_sxtw = load volatile float* %addr_sxtw + store volatile float %val64_sxtw, float* @var_float +; CHECK: ldr {{s[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw] + + %base_lsl = ptrtoint float* %base to i64 + %addrint_lsl = add i64 %base_lsl, %off64 + %addr_lsl = inttoptr i64 %addrint_lsl to float* + %val64_lsl = load volatile float* %addr_lsl + store volatile float %val64_lsl, float* @var_float +; CHECK: ldr {{s[0-9]+}}, [{{x[0-9]+}}, {{x[0-9]+}}] + + %base_uxtwN = ptrtoint float* %base to i64 + %offset_uxtwN = zext i32 %off32 to i64 + %offset2_uxtwN = shl i64 %offset_uxtwN, 2 + %addrint_uxtwN = add i64 %base_uxtwN, %offset2_uxtwN + %addr_uxtwN = inttoptr i64 %addrint_uxtwN to float* + %val64 = load volatile float* @var_float + store volatile float %val64, float* %addr_uxtwN +; CHECK: str {{s[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, uxtw #2] + ret void +} + +define void @ldst_double(double* %base, i32 %off32, i64 %off64) { +; CHECK: ldst_double: + + %addr_sxtwN = getelementptr double* %base, i32 %off32 + %val_sxtwN = load volatile double* %addr_sxtwN + store volatile double %val_sxtwN, double* @var_double +; CHECK: ldr {{d[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw #3] + + %addr_lslN = getelementptr double* %base, i64 %off64 + %val_lslN = load volatile double* %addr_lslN + store volatile double %val_lslN, double* @var_double +; CHECK: ldr {{d[0-9]+}}, [{{x[0-9]+}}, 
{{x[0-9]+}}, lsl #3] + + %addrint_uxtw = ptrtoint double* %base to i64 + %offset_uxtw = zext i32 %off32 to i64 + %addrint1_uxtw = add i64 %addrint_uxtw, %offset_uxtw + %addr_uxtw = inttoptr i64 %addrint1_uxtw to double* + %val_uxtw = load volatile double* %addr_uxtw + store volatile double %val_uxtw, double* @var_double +; CHECK: ldr {{d[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, uxtw] + + %base_sxtw = ptrtoint double* %base to i64 + %offset_sxtw = sext i32 %off32 to i64 + %addrint_sxtw = add i64 %base_sxtw, %offset_sxtw + %addr_sxtw = inttoptr i64 %addrint_sxtw to double* + %val64_sxtw = load volatile double* %addr_sxtw + store volatile double %val64_sxtw, double* @var_double +; CHECK: ldr {{d[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw] + + %base_lsl = ptrtoint double* %base to i64 + %addrint_lsl = add i64 %base_lsl, %off64 + %addr_lsl = inttoptr i64 %addrint_lsl to double* + %val64_lsl = load volatile double* %addr_lsl + store volatile double %val64_lsl, double* @var_double +; CHECK: ldr {{d[0-9]+}}, [{{x[0-9]+}}, {{x[0-9]+}}] + + %base_uxtwN = ptrtoint double* %base to i64 + %offset_uxtwN = zext i32 %off32 to i64 + %offset2_uxtwN = shl i64 %offset_uxtwN, 3 + %addrint_uxtwN = add i64 %base_uxtwN, %offset2_uxtwN + %addr_uxtwN = inttoptr i64 %addrint_uxtwN to double* + %val64 = load volatile double* @var_double + store volatile double %val64, double* %addr_uxtwN +; CHECK: str {{d[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, uxtw #3] + ret void +} + + +define void @ldst_128bit(fp128* %base, i32 %off32, i64 %off64) { +; CHECK: ldst_128bit: + + %addr_sxtwN = getelementptr fp128* %base, i32 %off32 + %val_sxtwN = load volatile fp128* %addr_sxtwN + store volatile fp128 %val_sxtwN, fp128* %base +; CHECK: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw #4] + + %addr_lslN = getelementptr fp128* %base, i64 %off64 + %val_lslN = load volatile fp128* %addr_lslN + store volatile fp128 %val_lslN, fp128* %base +; CHECK: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{x[0-9]+}}, lsl #4] + + %addrint_uxtw = ptrtoint fp128* %base to i64 + %offset_uxtw = zext i32 %off32 to i64 + %addrint1_uxtw = add i64 %addrint_uxtw, %offset_uxtw + %addr_uxtw = inttoptr i64 %addrint1_uxtw to fp128* + %val_uxtw = load volatile fp128* %addr_uxtw + store volatile fp128 %val_uxtw, fp128* %base +; CHECK: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, uxtw] + + %base_sxtw = ptrtoint fp128* %base to i64 + %offset_sxtw = sext i32 %off32 to i64 + %addrint_sxtw = add i64 %base_sxtw, %offset_sxtw + %addr_sxtw = inttoptr i64 %addrint_sxtw to fp128* + %val64_sxtw = load volatile fp128* %addr_sxtw + store volatile fp128 %val64_sxtw, fp128* %base +; CHECK: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw] + + %base_lsl = ptrtoint fp128* %base to i64 + %addrint_lsl = add i64 %base_lsl, %off64 + %addr_lsl = inttoptr i64 %addrint_lsl to fp128* + %val64_lsl = load volatile fp128* %addr_lsl + store volatile fp128 %val64_lsl, fp128* %base +; CHECK: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{x[0-9]+}}] + + %base_uxtwN = ptrtoint fp128* %base to i64 + %offset_uxtwN = zext i32 %off32 to i64 + %offset2_uxtwN = shl i64 %offset_uxtwN, 4 + %addrint_uxtwN = add i64 %base_uxtwN, %offset2_uxtwN + %addr_uxtwN = inttoptr i64 %addrint_uxtwN to fp128* + %val64 = load volatile fp128* %base + store volatile fp128 %val64, fp128* %addr_uxtwN +; CHECK: str {{q[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, uxtw #4] + ret void +} diff --git a/test/CodeGen/AArch64/ldst-unscaledimm.ll b/test/CodeGen/AArch64/ldst-unscaledimm.ll new file mode 100644 index 0000000..78a3c83 --- /dev/null +++ 
b/test/CodeGen/AArch64/ldst-unscaledimm.ll
@@ -0,0 +1,218 @@
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+
+@var_8bit = global i8 0
+@var_16bit = global i16 0
+@var_32bit = global i32 0
+@var_64bit = global i64 0
+
+@var_float = global float 0.0
+@var_double = global double 0.0
+
+@varptr = global i8* null
+
+define void @ldst_8bit() {
+; CHECK: ldst_8bit:
+
+; No architectural support for loads to 16-bit or 8-bit since we
+; promote i8 during lowering.
+ %addr_8bit = load i8** @varptr
+
+; match a sign-extending load 8-bit -> 32-bit
+ %addr_sext32 = getelementptr i8* %addr_8bit, i64 -256
+ %val8_sext32 = load volatile i8* %addr_sext32
+ %val32_signed = sext i8 %val8_sext32 to i32
+ store volatile i32 %val32_signed, i32* @var_32bit
+; CHECK: ldursb {{w[0-9]+}}, [{{x[0-9]+}}, #-256]
+
+; match a zero-extending load volatile 8-bit -> 32-bit
+ %addr_zext32 = getelementptr i8* %addr_8bit, i64 -12
+ %val8_zext32 = load volatile i8* %addr_zext32
+ %val32_unsigned = zext i8 %val8_zext32 to i32
+ store volatile i32 %val32_unsigned, i32* @var_32bit
+; CHECK: ldurb {{w[0-9]+}}, [{{x[0-9]+}}, #-12]
+
+; match an any-extending load volatile 8-bit -> 32-bit
+ %addr_anyext = getelementptr i8* %addr_8bit, i64 -1
+ %val8_anyext = load volatile i8* %addr_anyext
+ %newval8 = add i8 %val8_anyext, 1
+ store volatile i8 %newval8, i8* @var_8bit
+; CHECK: ldurb {{w[0-9]+}}, [{{x[0-9]+}}, #-1]
+
+; match a sign-extending load volatile 8-bit -> 64-bit
+ %addr_sext64 = getelementptr i8* %addr_8bit, i64 -5
+ %val8_sext64 = load volatile i8* %addr_sext64
+ %val64_signed = sext i8 %val8_sext64 to i64
+ store volatile i64 %val64_signed, i64* @var_64bit
+; CHECK: ldursb {{x[0-9]+}}, [{{x[0-9]+}}, #-5]
+
+; match a zero-extending load volatile 8-bit -> 64-bit.
+; This uses the fact that ldrb w0, [x0] will zero out the high 32-bits
+; of x0 so it's identical to loading to 32-bits.
+ %addr_zext64 = getelementptr i8* %addr_8bit, i64 -9
+ %val8_zext64 = load volatile i8* %addr_zext64
+ %val64_unsigned = zext i8 %val8_zext64 to i64
+ store volatile i64 %val64_unsigned, i64* @var_64bit
+; CHECK: ldurb {{w[0-9]+}}, [{{x[0-9]+}}, #-9]
+
+; truncating store volatile 32-bits to 8-bits
+ %addr_trunc32 = getelementptr i8* %addr_8bit, i64 -256
+ %val32 = load volatile i32* @var_32bit
+ %val8_trunc32 = trunc i32 %val32 to i8
+ store volatile i8 %val8_trunc32, i8* %addr_trunc32
+; CHECK: sturb {{w[0-9]+}}, [{{x[0-9]+}}, #-256]
+
+; truncating store volatile 64-bits to 8-bits
+ %addr_trunc64 = getelementptr i8* %addr_8bit, i64 -1
+ %val64 = load volatile i64* @var_64bit
+ %val8_trunc64 = trunc i64 %val64 to i8
+ store volatile i8 %val8_trunc64, i8* %addr_trunc64
+; CHECK: sturb {{w[0-9]+}}, [{{x[0-9]+}}, #-1]
+
+ ret void
+}
+
+define void @ldst_16bit() {
+; CHECK: ldst_16bit:
+
+; No architectural support for loads to 16-bit since we promote i16
+; during lowering.
+ %addr_8bit = load i8** @varptr
+
+; match a sign-extending load 16-bit -> 32-bit
+ %addr8_sext32 = getelementptr i8* %addr_8bit, i64 -256
+ %addr_sext32 = bitcast i8* %addr8_sext32 to i16*
+ %val16_sext32 = load volatile i16* %addr_sext32
+ %val32_signed = sext i16 %val16_sext32 to i32
+ store volatile i32 %val32_signed, i32* @var_32bit
+; CHECK: ldursh {{w[0-9]+}}, [{{x[0-9]+}}, #-256]
+
+; match a zero-extending load volatile 16-bit -> 32-bit, with an offset that
+; would be unaligned for a scaled-immediate access.
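+; (ldurh and the other unscaled forms take a signed 9-bit byte offset, so the
+; odd #15 offset below is still directly encodable.)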
+ %addr8_zext32 = getelementptr i8* %addr_8bit, i64 15
+ %addr_zext32 = bitcast i8* %addr8_zext32 to i16*
+ %val16_zext32 = load volatile i16* %addr_zext32
+ %val32_unsigned = zext i16 %val16_zext32 to i32
+ store volatile i32 %val32_unsigned, i32* @var_32bit
+; CHECK: ldurh {{w[0-9]+}}, [{{x[0-9]+}}, #15]
+
+; match an any-extending load volatile 16-bit -> 32-bit
+ %addr8_anyext = getelementptr i8* %addr_8bit, i64 -1
+ %addr_anyext = bitcast i8* %addr8_anyext to i16*
+ %val16_anyext = load volatile i16* %addr_anyext
+ %newval16 = add i16 %val16_anyext, 1
+ store volatile i16 %newval16, i16* @var_16bit
+; CHECK: ldurh {{w[0-9]+}}, [{{x[0-9]+}}, #-1]
+
+; match a sign-extending load volatile 16-bit -> 64-bit
+ %addr8_sext64 = getelementptr i8* %addr_8bit, i64 -5
+ %addr_sext64 = bitcast i8* %addr8_sext64 to i16*
+ %val16_sext64 = load volatile i16* %addr_sext64
+ %val64_signed = sext i16 %val16_sext64 to i64
+ store volatile i64 %val64_signed, i64* @var_64bit
+; CHECK: ldursh {{x[0-9]+}}, [{{x[0-9]+}}, #-5]
+
+; match a zero-extending load volatile 16-bit -> 64-bit.
+; This uses the fact that ldrh w0, [x0] will zero out the high 32-bits
+; of x0 so it's identical to loading to 32-bits.
+ %addr8_zext64 = getelementptr i8* %addr_8bit, i64 9
+ %addr_zext64 = bitcast i8* %addr8_zext64 to i16*
+ %val16_zext64 = load volatile i16* %addr_zext64
+ %val64_unsigned = zext i16 %val16_zext64 to i64
+ store volatile i64 %val64_unsigned, i64* @var_64bit
+; CHECK: ldurh {{w[0-9]+}}, [{{x[0-9]+}}, #9]
+
+; truncating store volatile 32-bits to 16-bits
+ %addr8_trunc32 = getelementptr i8* %addr_8bit, i64 -256
+ %addr_trunc32 = bitcast i8* %addr8_trunc32 to i16*
+ %val32 = load volatile i32* @var_32bit
+ %val16_trunc32 = trunc i32 %val32 to i16
+ store volatile i16 %val16_trunc32, i16* %addr_trunc32
+; CHECK: sturh {{w[0-9]+}}, [{{x[0-9]+}}, #-256]
+
+; truncating store volatile 64-bits to 16-bits
+ %addr8_trunc64 = getelementptr i8* %addr_8bit, i64 -1
+ %addr_trunc64 = bitcast i8* %addr8_trunc64 to i16*
+ %val64 = load volatile i64* @var_64bit
+ %val16_trunc64 = trunc i64 %val64 to i16
+ store volatile i16 %val16_trunc64, i16* %addr_trunc64
+; CHECK: sturh {{w[0-9]+}}, [{{x[0-9]+}}, #-1]
+
+ ret void
+}
+
+define void @ldst_32bit() {
+; CHECK: ldst_32bit:
+
+ %addr_8bit = load i8** @varptr
+
+; Straight 32-bit load/store
+ %addr32_8_noext = getelementptr i8* %addr_8bit, i64 1
+ %addr32_noext = bitcast i8* %addr32_8_noext to i32*
+ %val32_noext = load volatile i32* %addr32_noext
+ store volatile i32 %val32_noext, i32* %addr32_noext
+; CHECK: ldur {{w[0-9]+}}, [{{x[0-9]+}}, #1]
+; CHECK: stur {{w[0-9]+}}, [{{x[0-9]+}}, #1]
+
+; Zero-extension to 64-bits
+ %addr32_8_zext = getelementptr i8* %addr_8bit, i64 -256
+ %addr32_zext = bitcast i8* %addr32_8_zext to i32*
+ %val32_zext = load volatile i32* %addr32_zext
+ %val64_unsigned = zext i32 %val32_zext to i64
+ store volatile i64 %val64_unsigned, i64* @var_64bit
+; CHECK: ldur {{w[0-9]+}}, [{{x[0-9]+}}, #-256]
+; CHECK: str {{x[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_64bit]
+
+; Sign-extension to 64-bits
+ %addr32_8_sext = getelementptr i8* %addr_8bit, i64 -12
+ %addr32_sext = bitcast i8* %addr32_8_sext to i32*
+ %val32_sext = load volatile i32* %addr32_sext
+ %val64_signed = sext i32 %val32_sext to i64
+ store volatile i64 %val64_signed, i64* @var_64bit
+; CHECK: ldursw {{x[0-9]+}}, [{{x[0-9]+}}, #-12]
+; CHECK: str {{x[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_64bit]
+
+; Truncation from 64-bits
+ %addr64_8_trunc = getelementptr i8* %addr_8bit, i64 255
+ %addr64_trunc = bitcast i8* %addr64_8_trunc to i64*
+ %addr32_8_trunc = getelementptr i8* %addr_8bit, i64 -20
+ %addr32_trunc = bitcast i8* %addr32_8_trunc to i32*
+
+ %val64_trunc = load volatile i64* %addr64_trunc
+ %val32_trunc = trunc i64 %val64_trunc to i32
+ store volatile i32 %val32_trunc, i32* %addr32_trunc
+; CHECK: ldur {{x[0-9]+}}, [{{x[0-9]+}}, #255]
+; CHECK: stur {{w[0-9]+}}, [{{x[0-9]+}}, #-20]
+
+ ret void
+}
+
+define void @ldst_float() {
+; CHECK: ldst_float:
+
+ %addr_8bit = load i8** @varptr
+ %addrfp_8 = getelementptr i8* %addr_8bit, i64 -5
+ %addrfp = bitcast i8* %addrfp_8 to float*
+
+ %valfp = load volatile float* %addrfp
+; CHECK: ldur {{s[0-9]+}}, [{{x[0-9]+}}, #-5]
+
+ store volatile float %valfp, float* %addrfp
+; CHECK: stur {{s[0-9]+}}, [{{x[0-9]+}}, #-5]
+
+ ret void
+}
+
+define void @ldst_double() {
+; CHECK: ldst_double:
+
+ %addr_8bit = load i8** @varptr
+ %addrfp_8 = getelementptr i8* %addr_8bit, i64 4
+ %addrfp = bitcast i8* %addrfp_8 to double*
+
+ %valfp = load volatile double* %addrfp
+; CHECK: ldur {{d[0-9]+}}, [{{x[0-9]+}}, #4]
+
+ store volatile double %valfp, double* %addrfp
+; CHECK: stur {{d[0-9]+}}, [{{x[0-9]+}}, #4]
+
+ ret void
+}
diff --git a/test/CodeGen/AArch64/ldst-unsignedimm.ll b/test/CodeGen/AArch64/ldst-unsignedimm.ll
new file mode 100644
index 0000000..1e7540d
--- /dev/null
+++ b/test/CodeGen/AArch64/ldst-unsignedimm.ll
@@ -0,0 +1,251 @@
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+
+@var_8bit = global i8 0
+@var_16bit = global i16 0
+@var_32bit = global i32 0
+@var_64bit = global i64 0
+
+@var_float = global float 0.0
+@var_double = global double 0.0
+
+define void @ldst_8bit() {
+; CHECK: ldst_8bit:
+
+; No architectural support for loads to 16-bit or 8-bit since we
+; promote i8 during lowering.
+
+; match a sign-extending load 8-bit -> 32-bit
+ %val8_sext32 = load volatile i8* @var_8bit
+ %val32_signed = sext i8 %val8_sext32 to i32
+ store volatile i32 %val32_signed, i32* @var_32bit
+; CHECK: adrp {{x[0-9]+}}, var_8bit
+; CHECK: ldrsb {{w[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_8bit]
+
+; match a zero-extending load volatile 8-bit -> 32-bit
+ %val8_zext32 = load volatile i8* @var_8bit
+ %val32_unsigned = zext i8 %val8_zext32 to i32
+ store volatile i32 %val32_unsigned, i32* @var_32bit
+; CHECK: ldrb {{w[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_8bit]
+
+; match an any-extending load volatile 8-bit -> 32-bit
+ %val8_anyext = load volatile i8* @var_8bit
+ %newval8 = add i8 %val8_anyext, 1
+ store volatile i8 %newval8, i8* @var_8bit
+; CHECK: ldrb {{w[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_8bit]
+
+; match a sign-extending load volatile 8-bit -> 64-bit
+ %val8_sext64 = load volatile i8* @var_8bit
+ %val64_signed = sext i8 %val8_sext64 to i64
+ store volatile i64 %val64_signed, i64* @var_64bit
+; CHECK: ldrsb {{x[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_8bit]
+
+; match a zero-extending load volatile 8-bit -> 64-bit.
+; This uses the fact that ldrb w0, [x0] will zero out the high 32-bits
+; of x0 so it's identical to loading to 32-bits.
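+; (There is no ldrb form that writes an x-register, so zero-extension to
+; 64 bits simply reuses the 32-bit variant.)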
+ %val8_zext64 = load volatile i8* @var_8bit
+ %val64_unsigned = zext i8 %val8_zext64 to i64
+ store volatile i64 %val64_unsigned, i64* @var_64bit
+; CHECK: ldrb {{w[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_8bit]
+
+; truncating store volatile 32-bits to 8-bits
+ %val32 = load volatile i32* @var_32bit
+ %val8_trunc32 = trunc i32 %val32 to i8
+ store volatile i8 %val8_trunc32, i8* @var_8bit
+; CHECK: strb {{w[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_8bit]
+
+; truncating store volatile 64-bits to 8-bits
+ %val64 = load volatile i64* @var_64bit
+ %val8_trunc64 = trunc i64 %val64 to i8
+ store volatile i8 %val8_trunc64, i8* @var_8bit
+; CHECK: strb {{w[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_8bit]
+
+ ret void
+}
+
+define void @ldst_16bit() {
+; CHECK: ldst_16bit:
+
+; No architectural support for loads to 16-bit, so we promote i16 during
+; lowering.
+
+; match a sign-extending load volatile 16-bit -> 32-bit
+ %val16_sext32 = load volatile i16* @var_16bit
+ %val32_signed = sext i16 %val16_sext32 to i32
+ store volatile i32 %val32_signed, i32* @var_32bit
+; CHECK: adrp {{x[0-9]+}}, var_16bit
+; CHECK: ldrsh {{w[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_16bit]
+
+; match a zero-extending load volatile 16-bit -> 32-bit
+ %val16_zext32 = load volatile i16* @var_16bit
+ %val32_unsigned = zext i16 %val16_zext32 to i32
+ store volatile i32 %val32_unsigned, i32* @var_32bit
+; CHECK: ldrh {{w[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_16bit]
+
+; match an any-extending load volatile 16-bit -> 32-bit
+ %val16_anyext = load volatile i16* @var_16bit
+ %newval16 = add i16 %val16_anyext, 1
+ store volatile i16 %newval16, i16* @var_16bit
+; CHECK: ldrh {{w[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_16bit]
+
+; match a sign-extending load volatile 16-bit -> 64-bit
+ %val16_sext64 = load volatile i16* @var_16bit
+ %val64_signed = sext i16 %val16_sext64 to i64
+ store volatile i64 %val64_signed, i64* @var_64bit
+; CHECK: ldrsh {{x[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_16bit]
+
+; match a zero-extending load volatile 16-bit -> 64-bit.
+; This uses the fact that ldrh w0, [x0] will zero out the high 32-bits
+; of x0 so it's identical to loading to 32-bits.
+ %val16_zext64 = load volatile i16* @var_16bit + %val64_unsigned = zext i16 %val16_zext64 to i64 + store volatile i64 %val64_unsigned, i64* @var_64bit +; CHECK: ldrh {{w[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_16bit] + +; truncating store volatile 32-bits to 16-bits + %val32 = load volatile i32* @var_32bit + %val16_trunc32 = trunc i32 %val32 to i16 + store volatile i16 %val16_trunc32, i16* @var_16bit +; CHECK: strh {{w[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_16bit] + +; truncating store volatile 64-bits to 16-bits + %val64 = load volatile i64* @var_64bit + %val16_trunc64 = trunc i64 %val64 to i16 + store volatile i16 %val16_trunc64, i16* @var_16bit +; CHECK: strh {{w[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_16bit] + + ret void +} + +define void @ldst_32bit() { +; CHECK: ldst_32bit: + +; Straight 32-bit load/store + %val32_noext = load volatile i32* @var_32bit + store volatile i32 %val32_noext, i32* @var_32bit +; CHECK: adrp {{x[0-9]+}}, var_32bit +; CHECK: ldr {{w[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_32bit] +; CHECK: str {{w[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_32bit] + +; Zero-extension to 64-bits + %val32_zext = load volatile i32* @var_32bit + %val64_unsigned = zext i32 %val32_zext to i64 + store volatile i64 %val64_unsigned, i64* @var_64bit +; CHECK: ldr {{w[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_32bit] +; CHECK: str {{x[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_64bit] + +; Sign-extension to 64-bits + %val32_sext = load volatile i32* @var_32bit + %val64_signed = sext i32 %val32_sext to i64 + store volatile i64 %val64_signed, i64* @var_64bit +; CHECK: ldrsw {{x[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_32bit] +; CHECK: str {{x[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_64bit] + +; Truncation from 64-bits + %val64_trunc = load volatile i64* @var_64bit + %val32_trunc = trunc i64 %val64_trunc to i32 + store volatile i32 %val32_trunc, i32* @var_32bit +; CHECK: ldr {{x[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_64bit] +; CHECK: str {{w[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_32bit] + + ret void +} + +@arr8 = global i8* null +@arr16 = global i16* null +@arr32 = global i32* null +@arr64 = global i64* null + +; Now check that our selection copes with accesses more complex than a +; single symbol. Permitted offsets should be folded into the loads and +; stores. Since all forms use the same Operand it's only necessary to +; check the various access-sizes involved. 
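+; (The unsigned 12-bit immediate in these forms is scaled by the access size,
+; so e.g. an i16 access at element 4095 folds to the byte offset #8190
+; checked below.)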
+ +define void @ldst_complex_offsets() { +; CHECK: ldst_complex_offsets + %arr8_addr = load volatile i8** @arr8 +; CHECK: adrp {{x[0-9]+}}, arr8 +; CHECK: ldr {{x[0-9]+}}, [{{x[0-9]+}}, #:lo12:arr8] + + %arr8_sub1_addr = getelementptr i8* %arr8_addr, i64 1 + %arr8_sub1 = load volatile i8* %arr8_sub1_addr + store volatile i8 %arr8_sub1, i8* @var_8bit +; CHECK: ldrb {{w[0-9]+}}, [{{x[0-9]+}}, #1] + + %arr8_sub4095_addr = getelementptr i8* %arr8_addr, i64 4095 + %arr8_sub4095 = load volatile i8* %arr8_sub4095_addr + store volatile i8 %arr8_sub4095, i8* @var_8bit +; CHECK: ldrb {{w[0-9]+}}, [{{x[0-9]+}}, #4095] + + + %arr16_addr = load volatile i16** @arr16 +; CHECK: adrp {{x[0-9]+}}, arr16 +; CHECK: ldr {{x[0-9]+}}, [{{x[0-9]+}}, #:lo12:arr16] + + %arr16_sub1_addr = getelementptr i16* %arr16_addr, i64 1 + %arr16_sub1 = load volatile i16* %arr16_sub1_addr + store volatile i16 %arr16_sub1, i16* @var_16bit +; CHECK: ldrh {{w[0-9]+}}, [{{x[0-9]+}}, #2] + + %arr16_sub4095_addr = getelementptr i16* %arr16_addr, i64 4095 + %arr16_sub4095 = load volatile i16* %arr16_sub4095_addr + store volatile i16 %arr16_sub4095, i16* @var_16bit +; CHECK: ldrh {{w[0-9]+}}, [{{x[0-9]+}}, #8190] + + + %arr32_addr = load volatile i32** @arr32 +; CHECK: adrp {{x[0-9]+}}, arr32 +; CHECK: ldr {{x[0-9]+}}, [{{x[0-9]+}}, #:lo12:arr32] + + %arr32_sub1_addr = getelementptr i32* %arr32_addr, i64 1 + %arr32_sub1 = load volatile i32* %arr32_sub1_addr + store volatile i32 %arr32_sub1, i32* @var_32bit +; CHECK: ldr {{w[0-9]+}}, [{{x[0-9]+}}, #4] + + %arr32_sub4095_addr = getelementptr i32* %arr32_addr, i64 4095 + %arr32_sub4095 = load volatile i32* %arr32_sub4095_addr + store volatile i32 %arr32_sub4095, i32* @var_32bit +; CHECK: ldr {{w[0-9]+}}, [{{x[0-9]+}}, #16380] + + + %arr64_addr = load volatile i64** @arr64 +; CHECK: adrp {{x[0-9]+}}, arr64 +; CHECK: ldr {{x[0-9]+}}, [{{x[0-9]+}}, #:lo12:arr64] + + %arr64_sub1_addr = getelementptr i64* %arr64_addr, i64 1 + %arr64_sub1 = load volatile i64* %arr64_sub1_addr + store volatile i64 %arr64_sub1, i64* @var_64bit +; CHECK: ldr {{x[0-9]+}}, [{{x[0-9]+}}, #8] + + %arr64_sub4095_addr = getelementptr i64* %arr64_addr, i64 4095 + %arr64_sub4095 = load volatile i64* %arr64_sub4095_addr + store volatile i64 %arr64_sub4095, i64* @var_64bit +; CHECK: ldr {{x[0-9]+}}, [{{x[0-9]+}}, #32760] + + ret void +} + +define void @ldst_float() { +; CHECK: ldst_float: + + %valfp = load volatile float* @var_float +; CHECK: adrp {{x[0-9]+}}, var_float +; CHECK: ldr {{s[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_float] + + store volatile float %valfp, float* @var_float +; CHECK: str {{s[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_float] + + ret void +} + +define void @ldst_double() { +; CHECK: ldst_double: + + %valfp = load volatile double* @var_double +; CHECK: adrp {{x[0-9]+}}, var_double +; CHECK: ldr {{d[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_double] + + store volatile double %valfp, double* @var_double +; CHECK: str {{d[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_double] + + ret void +} diff --git a/test/CodeGen/AArch64/lit.local.cfg b/test/CodeGen/AArch64/lit.local.cfg new file mode 100644 index 0000000..c5ce241 --- /dev/null +++ b/test/CodeGen/AArch64/lit.local.cfg @@ -0,0 +1,6 @@ +config.suffixes = ['.ll', '.c', '.cpp'] + +targets = set(config.root.targets_to_build.split()) +if not 'AArch64' in targets: + config.unsupported = True + diff --git a/test/CodeGen/AArch64/literal_pools.ll b/test/CodeGen/AArch64/literal_pools.ll new file mode 100644 index 0000000..e090841 --- /dev/null +++ b/test/CodeGen/AArch64/literal_pools.ll @@ -0,0 
+1,55 @@
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+
+@var32 = global i32 0
+@var64 = global i64 0
+
+define void @foo() {
+; CHECK: foo:
+ %val32 = load i32* @var32
+ %val64 = load i64* @var64
+
+ %val32_lit32 = and i32 %val32, 123456785
+ store volatile i32 %val32_lit32, i32* @var32
+; CHECK: adrp x[[LITBASE:[0-9]+]], [[CURLIT:.LCPI0_[0-9]+]]
+; CHECK: ldr {{w[0-9]+}}, [x[[LITBASE]], #:lo12:[[CURLIT]]]
+
+ %val64_lit32 = and i64 %val64, 305402420
+ store volatile i64 %val64_lit32, i64* @var64
+; CHECK: adrp x[[LITBASE:[0-9]+]], [[CURLIT:.LCPI0_[0-9]+]]
+; CHECK: ldr {{w[0-9]+}}, [x[[LITBASE]], #:lo12:[[CURLIT]]]
+
+ %val64_lit32signed = and i64 %val64, -12345678
+ store volatile i64 %val64_lit32signed, i64* @var64
+; CHECK: adrp x[[LITBASE:[0-9]+]], [[CURLIT:.LCPI0_[0-9]+]]
+; CHECK: ldrsw {{x[0-9]+}}, [x[[LITBASE]], #:lo12:[[CURLIT]]]
+
+ %val64_lit64 = and i64 %val64, 1234567898765432
+ store volatile i64 %val64_lit64, i64* @var64
+; CHECK: adrp x[[LITBASE:[0-9]+]], [[CURLIT:.LCPI0_[0-9]+]]
+; CHECK: ldr {{x[0-9]+}}, [x[[LITBASE]], #:lo12:[[CURLIT]]]
+
+ ret void
+}
+
+@varfloat = global float 0.0
+@vardouble = global double 0.0
+
+define void @floating_lits() {
+; CHECK: floating_lits:
+
+ %floatval = load float* @varfloat
+ %newfloat = fadd float %floatval, 128.0
+; CHECK: adrp x[[LITBASE:[0-9]+]], [[CURLIT:.LCPI1_[0-9]+]]
+; CHECK: ldr {{s[0-9]+}}, [x[[LITBASE]], #:lo12:[[CURLIT]]]
+; CHECK: fadd
+ store float %newfloat, float* @varfloat
+
+ %doubleval = load double* @vardouble
+ %newdouble = fadd double %doubleval, 129.0
+; CHECK: adrp x[[LITBASE:[0-9]+]], [[CURLIT:.LCPI1_[0-9]+]]
+; CHECK: ldr {{d[0-9]+}}, [x[[LITBASE]], #:lo12:[[CURLIT]]]
+; CHECK: fadd
+ store double %newdouble, double* @vardouble
+
+ ret void
+}
diff --git a/test/CodeGen/AArch64/local_vars.ll b/test/CodeGen/AArch64/local_vars.ll
new file mode 100644
index 0000000..5cbf5a3
--- /dev/null
+++ b/test/CodeGen/AArch64/local_vars.ll
@@ -0,0 +1,57 @@
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -O0 | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -O0 -disable-fp-elim | FileCheck -check-prefix CHECK-WITHFP %s
+
+; Make sure a reasonably sane prologue and epilogue are
+; generated. This test is not robust in the face of the frame-handling
+; code evolving, but it still has value for unrelated changes, I
+; believe.
+;
+; In particular, it will fail when ldp/stp are used for frame setup,
+; when FP-elim is implemented, and when addressing from FP is
+; implemented.
+
+@var = global i64 0
+@local_addr = global i64* null
+
+declare void @foo()
+
+define void @trivial_func() nounwind {
+; CHECK: trivial_func: // @trivial_func
+; CHECK-NEXT: // BB#0
+; CHECK-NEXT: ret
+
+ ret void
+}
+
+define void @trivial_fp_func() {
+; CHECK-WITHFP: trivial_fp_func:
+
+; CHECK-WITHFP: sub sp, sp, #16
+; CHECK-WITHFP: stp x29, x30, [sp]
+; CHECK-WITHFP-NEXT: mov x29, sp
+
+; Don't really care, but it would be a Bad Thing if this came after the epilogue.
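+; (The epilogue's ldp reloads x29 and x30, so a call scheduled after it would
+; clobber the freshly-restored return address.)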
+; CHECK: bl foo + call void @foo() + ret void + +; CHECK-WITHFP: ldp x29, x30, [sp] +; CHECK-WITHFP: add sp, sp, #16 + +; CHECK-WITHFP: ret +} + +define void @stack_local() { + %local_var = alloca i64 +; CHECK: stack_local: +; CHECK: sub sp, sp, #16 + + %val = load i64* @var + store i64 %val, i64* %local_var +; CHECK: str {{x[0-9]+}}, [sp, #{{[0-9]+}}] + + store i64* %local_var, i64** @local_addr +; CHECK: add {{x[0-9]+}}, sp, #{{[0-9]+}} + + ret void +} diff --git a/test/CodeGen/AArch64/logical-imm.ll b/test/CodeGen/AArch64/logical-imm.ll new file mode 100644 index 0000000..5f3f4da --- /dev/null +++ b/test/CodeGen/AArch64/logical-imm.ll @@ -0,0 +1,84 @@ +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s + +@var32 = global i32 0 +@var64 = global i64 0 + +define void @test_and(i32 %in32, i64 %in64) { +; CHECK: test_and: + + %val0 = and i32 %in32, 2863311530 + store volatile i32 %val0, i32* @var32 +; CHECK: and {{w[0-9]+}}, {{w[0-9]+}}, #0xaaaaaaaa + + %val1 = and i32 %in32, 4293984240 + store volatile i32 %val1, i32* @var32 +; CHECK: and {{w[0-9]+}}, {{w[0-9]+}}, #0xfff0fff0 + + %val2 = and i64 %in64, 9331882296111890817 + store volatile i64 %val2, i64* @var64 +; CHECK: and {{x[0-9]+}}, {{x[0-9]+}}, #0x8181818181818181 + + %val3 = and i64 %in64, 18429855317404942275 + store volatile i64 %val3, i64* @var64 +; CHECK: and {{x[0-9]+}}, {{x[0-9]+}}, #0xffc3ffc3ffc3ffc3 + + ret void +} + +define void @test_orr(i32 %in32, i64 %in64) { +; CHECK: test_orr: + + %val0 = or i32 %in32, 2863311530 + store volatile i32 %val0, i32* @var32 +; CHECK: orr {{w[0-9]+}}, {{w[0-9]+}}, #0xaaaaaaaa + + %val1 = or i32 %in32, 4293984240 + store volatile i32 %val1, i32* @var32 +; CHECK: orr {{w[0-9]+}}, {{w[0-9]+}}, #0xfff0fff0 + + %val2 = or i64 %in64, 9331882296111890817 + store volatile i64 %val2, i64* @var64 +; CHECK: orr {{x[0-9]+}}, {{x[0-9]+}}, #0x8181818181818181 + + %val3 = or i64 %in64, 18429855317404942275 + store volatile i64 %val3, i64* @var64 +; CHECK: orr {{x[0-9]+}}, {{x[0-9]+}}, #0xffc3ffc3ffc3ffc3 + + ret void +} + +define void @test_eor(i32 %in32, i64 %in64) { +; CHECK: test_eor: + + %val0 = xor i32 %in32, 2863311530 + store volatile i32 %val0, i32* @var32 +; CHECK: eor {{w[0-9]+}}, {{w[0-9]+}}, #0xaaaaaaaa + + %val1 = xor i32 %in32, 4293984240 + store volatile i32 %val1, i32* @var32 +; CHECK: eor {{w[0-9]+}}, {{w[0-9]+}}, #0xfff0fff0 + + %val2 = xor i64 %in64, 9331882296111890817 + store volatile i64 %val2, i64* @var64 +; CHECK: eor {{x[0-9]+}}, {{x[0-9]+}}, #0x8181818181818181 + + %val3 = xor i64 %in64, 18429855317404942275 + store volatile i64 %val3, i64* @var64 +; CHECK: eor {{x[0-9]+}}, {{x[0-9]+}}, #0xffc3ffc3ffc3ffc3 + + ret void +} + +define void @test_mov(i32 %in32, i64 %in64) { +; CHECK: test_mov: + %val0 = add i32 %in32, 2863311530 + store i32 %val0, i32* @var32 +; CHECK: orr {{w[0-9]+}}, wzr, #0xaaaaaaaa + + %val1 = add i64 %in64, 11068046444225730969 + store i64 %val1, i64* @var64 +; CHECK: orr {{x[0-9]+}}, xzr, #0x9999999999999999 + + ret void +; CHECK: ret +} diff --git a/test/CodeGen/AArch64/logical_shifted_reg.ll b/test/CodeGen/AArch64/logical_shifted_reg.ll new file mode 100644 index 0000000..bbbfcc1 --- /dev/null +++ b/test/CodeGen/AArch64/logical_shifted_reg.ll @@ -0,0 +1,224 @@ +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -O0 | FileCheck %s + +@var1_32 = global i32 0 +@var2_32 = global i32 0 + +@var1_64 = global i64 0 +@var2_64 = global i64 0 + +define void @logical_32bit() { +; CHECK: logical_32bit: + %val1 = load 
i32* @var1_32 + %val2 = load i32* @var2_32 + + ; First check basic and/bic/or/orn/eor/eon patterns with no shift + %neg_val2 = xor i32 -1, %val2 + + %and_noshift = and i32 %val1, %val2 +; CHECK: and {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + store volatile i32 %and_noshift, i32* @var1_32 + %bic_noshift = and i32 %neg_val2, %val1 +; CHECK: bic {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + store volatile i32 %bic_noshift, i32* @var1_32 + + %or_noshift = or i32 %val1, %val2 +; CHECK: orr {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + store volatile i32 %or_noshift, i32* @var1_32 + %orn_noshift = or i32 %neg_val2, %val1 +; CHECK: orn {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + store volatile i32 %orn_noshift, i32* @var1_32 + + %xor_noshift = xor i32 %val1, %val2 +; CHECK: eor {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + store volatile i32 %xor_noshift, i32* @var1_32 + %xorn_noshift = xor i32 %neg_val2, %val1 +; CHECK: eon {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + store volatile i32 %xorn_noshift, i32* @var1_32 + + ; Check the maximum shift on each + %operand_lsl31 = shl i32 %val2, 31 + %neg_operand_lsl31 = xor i32 -1, %operand_lsl31 + + %and_lsl31 = and i32 %val1, %operand_lsl31 +; CHECK: and {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, lsl #31 + store volatile i32 %and_lsl31, i32* @var1_32 + %bic_lsl31 = and i32 %val1, %neg_operand_lsl31 +; CHECK: bic {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, lsl #31 + store volatile i32 %bic_lsl31, i32* @var1_32 + + %or_lsl31 = or i32 %val1, %operand_lsl31 +; CHECK: orr {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, lsl #31 + store volatile i32 %or_lsl31, i32* @var1_32 + %orn_lsl31 = or i32 %val1, %neg_operand_lsl31 +; CHECK: orn {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, lsl #31 + store volatile i32 %orn_lsl31, i32* @var1_32 + + %xor_lsl31 = xor i32 %val1, %operand_lsl31 +; CHECK: eor {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, lsl #31 + store volatile i32 %xor_lsl31, i32* @var1_32 + %xorn_lsl31 = xor i32 %val1, %neg_operand_lsl31 +; CHECK: eon {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, lsl #31 + store volatile i32 %xorn_lsl31, i32* @var1_32 + + ; Check other shifts on a subset + %operand_asr10 = ashr i32 %val2, 10 + %neg_operand_asr10 = xor i32 -1, %operand_asr10 + + %bic_asr10 = and i32 %val1, %neg_operand_asr10 +; CHECK: bic {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, asr #10 + store volatile i32 %bic_asr10, i32* @var1_32 + %xor_asr10 = xor i32 %val1, %operand_asr10 +; CHECK: eor {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, asr #10 + store volatile i32 %xor_asr10, i32* @var1_32 + + %operand_lsr1 = lshr i32 %val2, 1 + %neg_operand_lsr1 = xor i32 -1, %operand_lsr1 + + %orn_lsr1 = or i32 %val1, %neg_operand_lsr1 +; CHECK: orn {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, lsr #1 + store volatile i32 %orn_lsr1, i32* @var1_32 + %xor_lsr1 = xor i32 %val1, %operand_lsr1 +; CHECK: eor {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, lsr #1 + store volatile i32 %xor_lsr1, i32* @var1_32 + + %operand_ror20_big = shl i32 %val2, 12 + %operand_ror20_small = lshr i32 %val2, 20 + %operand_ror20 = or i32 %operand_ror20_big, %operand_ror20_small + %neg_operand_ror20 = xor i32 -1, %operand_ror20 + + %xorn_ror20 = xor i32 %val1, %neg_operand_ror20 +; CHECK: eon {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, ror #20 + store volatile i32 %xorn_ror20, i32* @var1_32 + %and_ror20 = and i32 %val1, %operand_ror20 +; CHECK: and {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, ror #20 + store volatile i32 %and_ror20, i32* @var1_32 + + ret void +} + +define void @logical_64bit() { +; CHECK: logical_64bit: + %val1 = load i64* @var1_64 + %val2 = load i64* @var2_64 + + ; First check basic 
and/bic/or/orn/eor/eon patterns with no shift + %neg_val2 = xor i64 -1, %val2 + + %and_noshift = and i64 %val1, %val2 +; CHECK: and {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}} + store volatile i64 %and_noshift, i64* @var1_64 + %bic_noshift = and i64 %neg_val2, %val1 +; CHECK: bic {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}} + store volatile i64 %bic_noshift, i64* @var1_64 + + %or_noshift = or i64 %val1, %val2 +; CHECK: orr {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}} + store volatile i64 %or_noshift, i64* @var1_64 + %orn_noshift = or i64 %neg_val2, %val1 +; CHECK: orn {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}} + store volatile i64 %orn_noshift, i64* @var1_64 + + %xor_noshift = xor i64 %val1, %val2 +; CHECK: eor {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}} + store volatile i64 %xor_noshift, i64* @var1_64 + %xorn_noshift = xor i64 %neg_val2, %val1 +; CHECK: eon {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}} + store volatile i64 %xorn_noshift, i64* @var1_64 + + ; Check the maximum shift on each + %operand_lsl63 = shl i64 %val2, 63 + %neg_operand_lsl63 = xor i64 -1, %operand_lsl63 + + %and_lsl63 = and i64 %val1, %operand_lsl63 +; CHECK: and {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}, lsl #63 + store volatile i64 %and_lsl63, i64* @var1_64 + %bic_lsl63 = and i64 %val1, %neg_operand_lsl63 +; CHECK: bic {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}, lsl #63 + store volatile i64 %bic_lsl63, i64* @var1_64 + + %or_lsl63 = or i64 %val1, %operand_lsl63 +; CHECK: orr {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}, lsl #63 + store volatile i64 %or_lsl63, i64* @var1_64 + %orn_lsl63 = or i64 %val1, %neg_operand_lsl63 +; CHECK: orn {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}, lsl #63 + store volatile i64 %orn_lsl63, i64* @var1_64 + + %xor_lsl63 = xor i64 %val1, %operand_lsl63 +; CHECK: eor {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}, lsl #63 + store volatile i64 %xor_lsl63, i64* @var1_64 + %xorn_lsl63 = xor i64 %val1, %neg_operand_lsl63 +; CHECK: eon {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}, lsl #63 + store volatile i64 %xorn_lsl63, i64* @var1_64 + + ; Check other shifts on a subset + %operand_asr10 = ashr i64 %val2, 10 + %neg_operand_asr10 = xor i64 -1, %operand_asr10 + + %bic_asr10 = and i64 %val1, %neg_operand_asr10 +; CHECK: bic {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}, asr #10 + store volatile i64 %bic_asr10, i64* @var1_64 + %xor_asr10 = xor i64 %val1, %operand_asr10 +; CHECK: eor {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}, asr #10 + store volatile i64 %xor_asr10, i64* @var1_64 + + %operand_lsr1 = lshr i64 %val2, 1 + %neg_operand_lsr1 = xor i64 -1, %operand_lsr1 + + %orn_lsr1 = or i64 %val1, %neg_operand_lsr1 +; CHECK: orn {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}, lsr #1 + store volatile i64 %orn_lsr1, i64* @var1_64 + %xor_lsr1 = xor i64 %val1, %operand_lsr1 +; CHECK: eor {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}, lsr #1 + store volatile i64 %xor_lsr1, i64* @var1_64 + + ; Construct a rotate-right from a bunch of other logical + ; operations. 
DAGCombiner should ensure we the ROTR during + ; selection + %operand_ror20_big = shl i64 %val2, 44 + %operand_ror20_small = lshr i64 %val2, 20 + %operand_ror20 = or i64 %operand_ror20_big, %operand_ror20_small + %neg_operand_ror20 = xor i64 -1, %operand_ror20 + + %xorn_ror20 = xor i64 %val1, %neg_operand_ror20 +; CHECK: eon {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}, ror #20 + store volatile i64 %xorn_ror20, i64* @var1_64 + %and_ror20 = and i64 %val1, %operand_ror20 +; CHECK: and {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}, ror #20 + store volatile i64 %and_ror20, i64* @var1_64 + + ret void +} + +define void @flag_setting() { +; CHECK: flag_setting: + %val1 = load i64* @var1_64 + %val2 = load i64* @var2_64 + +; CHECK: tst {{x[0-9]+}}, {{x[0-9]+}} +; CHECK: b.gt .L + %simple_and = and i64 %val1, %val2 + %tst1 = icmp sgt i64 %simple_and, 0 + br i1 %tst1, label %ret, label %test2 + +test2: +; CHECK: tst {{x[0-9]+}}, {{x[0-9]+}}, lsl #63 +; CHECK: b.lt .L + %shifted_op = shl i64 %val2, 63 + %shifted_and = and i64 %val1, %shifted_op + %tst2 = icmp slt i64 %shifted_and, 0 + br i1 %tst2, label %ret, label %test3 + +test3: +; CHECK: tst {{x[0-9]+}}, {{x[0-9]+}}, asr #12 +; CHECK: b.gt .L + %asr_op = ashr i64 %val2, 12 + %asr_and = and i64 %asr_op, %val1 + %tst3 = icmp sgt i64 %asr_and, 0 + br i1 %tst3, label %ret, label %other_exit + +other_exit: + store volatile i64 %val1, i64* @var1_64 + ret void +ret: + ret void +} diff --git a/test/CodeGen/AArch64/logical_shifted_reg.s b/test/CodeGen/AArch64/logical_shifted_reg.s new file mode 100644 index 0000000..89aea58 --- /dev/null +++ b/test/CodeGen/AArch64/logical_shifted_reg.s @@ -0,0 +1,208 @@ + .file "/home/timnor01/a64-trunk/llvm/test/CodeGen/AArch64/logical_shifted_reg.ll" + .text + .globl logical_32bit + .type logical_32bit,@function +logical_32bit: // @logical_32bit + .cfi_startproc +// BB#0: + adrp x0, var1_32 + ldr w1, [x0, #:lo12:var1_32] + adrp x0, var2_32 + ldr w2, [x0, #:lo12:var2_32] + and w3, w1, w2 + adrp x0, var1_32 + str w3, [x0, #:lo12:var1_32] + bic w3, w1, w2 + adrp x0, var1_32 + str w3, [x0, #:lo12:var1_32] + orr w3, w1, w2 + adrp x0, var1_32 + str w3, [x0, #:lo12:var1_32] + orn w3, w1, w2 + adrp x0, var1_32 + str w3, [x0, #:lo12:var1_32] + eor w3, w1, w2 + adrp x0, var1_32 + str w3, [x0, #:lo12:var1_32] + eon w3, w2, w1 + adrp x0, var1_32 + str w3, [x0, #:lo12:var1_32] + and w3, w1, w2, lsl #31 + adrp x0, var1_32 + str w3, [x0, #:lo12:var1_32] + bic w3, w1, w2, lsl #31 + adrp x0, var1_32 + str w3, [x0, #:lo12:var1_32] + orr w3, w1, w2, lsl #31 + adrp x0, var1_32 + str w3, [x0, #:lo12:var1_32] + orn w3, w1, w2, lsl #31 + adrp x0, var1_32 + str w3, [x0, #:lo12:var1_32] + eor w3, w1, w2, lsl #31 + adrp x0, var1_32 + str w3, [x0, #:lo12:var1_32] + eon w3, w1, w2, lsl #31 + adrp x0, var1_32 + str w3, [x0, #:lo12:var1_32] + bic w3, w1, w2, asr #10 + adrp x0, var1_32 + str w3, [x0, #:lo12:var1_32] + eor w3, w1, w2, asr #10 + adrp x0, var1_32 + str w3, [x0, #:lo12:var1_32] + orn w3, w1, w2, lsr #1 + adrp x0, var1_32 + str w3, [x0, #:lo12:var1_32] + eor w3, w1, w2, lsr #1 + adrp x0, var1_32 + str w3, [x0, #:lo12:var1_32] + eon w3, w1, w2, ror #20 + adrp x0, var1_32 + str w3, [x0, #:lo12:var1_32] + and w1, w1, w2, ror #20 + adrp x0, var1_32 + str w1, [x0, #:lo12:var1_32] + ret +.Ltmp0: + .size logical_32bit, .Ltmp0-logical_32bit + .cfi_endproc + + .globl logical_64bit + .type logical_64bit,@function +logical_64bit: // @logical_64bit + .cfi_startproc +// BB#0: + adrp x0, var1_64 + ldr x0, [x0, #:lo12:var1_64] + adrp x1, var2_64 + ldr x1, [x1, 
#:lo12:var2_64] + and x2, x0, x1 + adrp x3, var1_64 + str x2, [x3, #:lo12:var1_64] + bic x2, x0, x1 + adrp x3, var1_64 + str x2, [x3, #:lo12:var1_64] + orr x2, x0, x1 + adrp x3, var1_64 + str x2, [x3, #:lo12:var1_64] + orn x2, x0, x1 + adrp x3, var1_64 + str x2, [x3, #:lo12:var1_64] + eor x2, x0, x1 + adrp x3, var1_64 + str x2, [x3, #:lo12:var1_64] + eon x2, x1, x0 + adrp x3, var1_64 + str x2, [x3, #:lo12:var1_64] + and x2, x0, x1, lsl #63 + adrp x3, var1_64 + str x2, [x3, #:lo12:var1_64] + bic x2, x0, x1, lsl #63 + adrp x3, var1_64 + str x2, [x3, #:lo12:var1_64] + orr x2, x0, x1, lsl #63 + adrp x3, var1_64 + str x2, [x3, #:lo12:var1_64] + orn x2, x0, x1, lsl #63 + adrp x3, var1_64 + str x2, [x3, #:lo12:var1_64] + eor x2, x0, x1, lsl #63 + adrp x3, var1_64 + str x2, [x3, #:lo12:var1_64] + eon x2, x0, x1, lsl #63 + adrp x3, var1_64 + str x2, [x3, #:lo12:var1_64] + bic x2, x0, x1, asr #10 + adrp x3, var1_64 + str x2, [x3, #:lo12:var1_64] + eor x2, x0, x1, asr #10 + adrp x3, var1_64 + str x2, [x3, #:lo12:var1_64] + orn x2, x0, x1, lsr #1 + adrp x3, var1_64 + str x2, [x3, #:lo12:var1_64] + eor x2, x0, x1, lsr #1 + adrp x3, var1_64 + str x2, [x3, #:lo12:var1_64] + eon x2, x0, x1, ror #20 + adrp x3, var1_64 + str x2, [x3, #:lo12:var1_64] + and x0, x0, x1, ror #20 + adrp x1, var1_64 + str x0, [x1, #:lo12:var1_64] + ret +.Ltmp1: + .size logical_64bit, .Ltmp1-logical_64bit + .cfi_endproc + + .globl flag_setting + .type flag_setting,@function +flag_setting: // @flag_setting + .cfi_startproc +// BB#0: + sub sp, sp, #16 + adrp x0, var1_64 + ldr x0, [x0, #:lo12:var1_64] + adrp x1, var2_64 + ldr x1, [x1, #:lo12:var2_64] + tst x0, x1 + str x0, [sp, #8] // 8-byte Folded Spill + str x1, [sp] // 8-byte Folded Spill + b.gt .LBB2_4 + b .LBB2_1 +.LBB2_1: // %test2 + ldr x0, [sp, #8] // 8-byte Folded Reload + ldr x1, [sp] // 8-byte Folded Reload + tst x0, x1, lsl #63 + b.lt .LBB2_4 + b .LBB2_2 +.LBB2_2: // %test3 + ldr x0, [sp, #8] // 8-byte Folded Reload + ldr x1, [sp] // 8-byte Folded Reload + tst x0, x1, asr #12 + b.gt .LBB2_4 + b .LBB2_3 +.LBB2_3: // %other_exit + adrp x0, var1_64 + ldr x1, [sp, #8] // 8-byte Folded Reload + str x1, [x0, #:lo12:var1_64] + add sp, sp, #16 + ret +.LBB2_4: // %ret + add sp, sp, #16 + ret +.Ltmp2: + .size flag_setting, .Ltmp2-flag_setting + .cfi_endproc + + .type var1_32,@object // @var1_32 + .bss + .globl var1_32 + .align 2 +var1_32: + .word 0 // 0x0 + .size var1_32, 4 + + .type var2_32,@object // @var2_32 + .globl var2_32 + .align 2 +var2_32: + .word 0 // 0x0 + .size var2_32, 4 + + .type var1_64,@object // @var1_64 + .globl var1_64 + .align 3 +var1_64: + .xword 0 // 0x0 + .size var1_64, 8 + + .type var2_64,@object // @var2_64 + .globl var2_64 + .align 3 +var2_64: + .xword 0 // 0x0 + .size var2_64, 8 + + diff --git a/test/CodeGen/AArch64/movw-consts.ll b/test/CodeGen/AArch64/movw-consts.ll new file mode 100644 index 0000000..afdf681 --- /dev/null +++ b/test/CodeGen/AArch64/movw-consts.ll @@ -0,0 +1,124 @@ +; RUN: llc -verify-machineinstrs -O0 < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s + +define i64 @test0() { +; CHECK: test0: +; Not produced by move wide instructions, but good to make sure we can return 0 anyway: +; CHECK: mov x0, xzr + ret i64 0 +} + +define i64 @test1() { +; CHECK: test1: +; CHECK: movz x0, #1 + ret i64 1 +} + +define i64 @test2() { +; CHECK: test2: +; CHECK: movz x0, #65535 + ret i64 65535 +} + +define i64 @test3() { +; CHECK: test3: +; CHECK: movz x0, #1, lsl #16 + ret i64 65536 +} + +define i64 @test4() { +; CHECK: test4: +; CHECK: movz x0, 
#65535, lsl #16
+ ret i64 4294901760
+}
+
+define i64 @test5() {
+; CHECK: test5:
+; CHECK: movz x0, #1, lsl #32
+ ret i64 4294967296
+}
+
+define i64 @test6() {
+; CHECK: test6:
+; CHECK: movz x0, #65535, lsl #32
+ ret i64 281470681743360
+}
+
+define i64 @test7() {
+; CHECK: test7:
+; CHECK: movz x0, #1, lsl #48
+ ret i64 281474976710656
+}
+
+; A 32-bit MOVN can generate some 64-bit patterns that a 64-bit one
+; couldn't, so it's useful even for i64.
+define i64 @test8() {
+; CHECK: test8:
+; CHECK: movn w0, #60875
+ ret i64 4294906420
+}
+
+define i64 @test9() {
+; CHECK: test9:
+; CHECK: movn x0, #0
+ ret i64 -1
+}
+
+define i64 @test10() {
+; CHECK: test10:
+; CHECK: movn x0, #60875, lsl #16
+ ret i64 18446744069720047615
+}
+
+; For reasonably legitimate reasons returning an i32 results in the
+; selection of an i64 constant, so we need a different idiom to test that
+; selection.
+@var32 = global i32 0
+
+define void @test11() {
+; CHECK: test11:
+; CHECK: movz {{w[0-9]+}}, #0
+ store i32 0, i32* @var32
+ ret void
+}
+
+define void @test12() {
+; CHECK: test12:
+; CHECK: movz {{w[0-9]+}}, #1
+ store i32 1, i32* @var32
+ ret void
+}
+
+define void @test13() {
+; CHECK: test13:
+; CHECK: movz {{w[0-9]+}}, #65535
+ store i32 65535, i32* @var32
+ ret void
+}
+
+define void @test14() {
+; CHECK: test14:
+; CHECK: movz {{w[0-9]+}}, #1, lsl #16
+ store i32 65536, i32* @var32
+ ret void
+}
+
+define void @test15() {
+; CHECK: test15:
+; CHECK: movz {{w[0-9]+}}, #65535, lsl #16
+ store i32 4294901760, i32* @var32
+ ret void
+}
+
+define void @test16() {
+; CHECK: test16:
+; CHECK: movn {{w[0-9]+}}, #0
+ store i32 -1, i32* @var32
+ ret void
+}
+
+define i64 @test17() {
+; CHECK: test17:
+
+ ; Mustn't MOVN w0 here.
+; CHECK: movn x0, #2
+ ret i64 -3
+}
diff --git a/test/CodeGen/AArch64/pic-eh-stubs.ll b/test/CodeGen/AArch64/pic-eh-stubs.ll
new file mode 100644
index 0000000..77bf691
--- /dev/null
+++ b/test/CodeGen/AArch64/pic-eh-stubs.ll
@@ -0,0 +1,60 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu -relocation-model=pic -o - %s | FileCheck %s
+
+; Make sure exception-handling PIC code can be linked correctly. An alternative
+; to the sequence described below would have .gcc_except_table itself writable
+; and not use the indirection, but this isn't what LLVM does right now.
+
+ ; There should be a read-only .gcc_except_table section...
+; CHECK: .section .gcc_except_table,"a"
+
+ ; ... referring indirectly to stubs for its typeinfo ...
+; CHECK: // @TType Encoding = indirect pcrel sdata8
+ ; ... one of which is "int"'s typeinfo
+; CHECK: .Ltmp9:
+; CHECK-NEXT: .xword .L_ZTIi.DW.stub-.Ltmp9
+
+ ; ... and which is properly defined (in a writable section for the dynamic loader) later.
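+ ; (indirect pcrel sdata8 corresponds to the DWARF encoding byte
+ ; DW_EH_PE_indirect|DW_EH_PE_pcrel|DW_EH_PE_sdata8 = 0x9c.)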
+; CHECK: .section .data.rel,"aw" +; CHECK: .L_ZTIi.DW.stub: +; CHECK-NEXT: .xword _ZTIi + +@_ZTIi = external constant i8* + +define i32 @_Z3barv() { +entry: + invoke void @_Z3foov() + to label %return unwind label %lpad + +lpad: ; preds = %entry + %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) + catch i8* bitcast (i8** @_ZTIi to i8*) + %1 = extractvalue { i8*, i32 } %0, 1 + %2 = tail call i32 @llvm.eh.typeid.for(i8* bitcast (i8** @_ZTIi to i8*)) nounwind + %matches = icmp eq i32 %1, %2 + br i1 %matches, label %catch, label %eh.resume + +catch: ; preds = %lpad + %3 = extractvalue { i8*, i32 } %0, 0 + %4 = tail call i8* @__cxa_begin_catch(i8* %3) nounwind + %5 = bitcast i8* %4 to i32* + %exn.scalar = load i32* %5, align 4 + tail call void @__cxa_end_catch() nounwind + br label %return + +return: ; preds = %entry, %catch + %retval.0 = phi i32 [ %exn.scalar, %catch ], [ 42, %entry ] + ret i32 %retval.0 + +eh.resume: ; preds = %lpad + resume { i8*, i32 } %0 +} + +declare void @_Z3foov() + +declare i32 @__gxx_personality_v0(...) + +declare i32 @llvm.eh.typeid.for(i8*) nounwind readnone + +declare i8* @__cxa_begin_catch(i8*) + +declare void @__cxa_end_catch()
\ No newline at end of file diff --git a/test/CodeGen/AArch64/regress-bitcast-formals.ll b/test/CodeGen/AArch64/regress-bitcast-formals.ll new file mode 100644 index 0000000..28dc9a7 --- /dev/null +++ b/test/CodeGen/AArch64/regress-bitcast-formals.ll @@ -0,0 +1,11 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s + +; CallingConv.td requires a bitcast for vector arguments. Make sure we're +; actually capable of that (the test was omitted from LowerFormalArguments). + +define void @test_bitcast_lower(<2 x i32> %a) { +; CHECK: test_bitcast_lower: + + ret void +; CHECK: ret +} diff --git a/test/CodeGen/AArch64/regress-f128csel-flags.ll b/test/CodeGen/AArch64/regress-f128csel-flags.ll new file mode 100644 index 0000000..b35185c --- /dev/null +++ b/test/CodeGen/AArch64/regress-f128csel-flags.ll @@ -0,0 +1,27 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s + +; We used to not mark NZCV as being used in the continuation basic-block +; when lowering a 128-bit "select" to branches. This meant a subsequent use +; of the same flags gave an internal fault here. + +declare void @foo(fp128) + +define double @test_f128csel_flags(i32 %lhs, fp128 %a, fp128 %b) nounwind { +; CHECK: test_f128csel_flags + + %tst = icmp ne i32 %lhs, 42 + %val = select i1 %tst, fp128 %a, fp128 %b +; CHECK: cmp w0, #42 +; CHECK: b.eq .LBB0 + + call void @foo(fp128 %val) + %retval = select i1 %tst, double 4.0, double 5.0 + + ; It's also reasonably important that the actual fcsel comes before the + ; function call since bl may corrupt NZCV. We were doing the right thing anyway, + ; but just as well test it while we're here. +; CHECK: fcsel {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}, ne +; CHECK: bl foo + + ret double %retval +} diff --git a/test/CodeGen/AArch64/regress-tail-livereg.ll b/test/CodeGen/AArch64/regress-tail-livereg.ll new file mode 100644 index 0000000..8d5485c --- /dev/null +++ b/test/CodeGen/AArch64/regress-tail-livereg.ll @@ -0,0 +1,19 @@ +; RUN: llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s +@var = global void()* zeroinitializer + +declare void @bar() + +define void @foo() { +; CHECK: foo: + %func = load void()** @var + + ; Calling a function encourages @foo to use a callee-saved register, + ; which makes it a natural choice for the tail call itself. But we don't + ; want that: the final "br xN" has to use a temporary or argument + ; register. + call void @bar() + + tail call void %func() +; CHECK: br {{x([0-79]|1[0-8])}} + ret void +}
\ No newline at end of file
diff --git a/test/CodeGen/AArch64/regress-tblgen-chains.ll b/test/CodeGen/AArch64/regress-tblgen-chains.ll
new file mode 100644
index 0000000..e54552f
--- /dev/null
+++ b/test/CodeGen/AArch64/regress-tblgen-chains.ll
@@ -0,0 +1,36 @@
+; RUN: llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s
+
+; When generating DAG selection tables, TableGen used to only flag an
+; instruction as needing a chain on its own account if it had a built-in pattern
+; which used the chain. This meant that the AArch64 load/stores weren't
+; recognised and so both loads from %locvar below were coalesced into a single
+; LS8_LDR instruction (same operands other than the non-existent chain) and the
+; increment was lost at return.
+
+; This was obviously a Bad Thing.
+
+declare void @bar(i8*)
+
+define i64 @test_chains() {
+; CHECK: test_chains:
+
+  %locvar = alloca i8
+
+  call void @bar(i8* %locvar)
+; CHECK: bl bar
+
+  %inc.1 = load i8* %locvar
+  %inc.2 = zext i8 %inc.1 to i64
+  %inc.3 = add i64 %inc.2, 1
+  %inc.4 = trunc i64 %inc.3 to i8
+  store i8 %inc.4, i8* %locvar
+; CHECK: ldrb {{w[0-9]+}}, [sp, [[LOCADDR:#[0-9]+]]]
+; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, #1
+; CHECK: strb {{w[0-9]+}}, [sp, [[LOCADDR]]]
+; CHECK: ldrb {{w[0-9]+}}, [sp, [[LOCADDR]]]
+
+  %ret.1 = load i8* %locvar
+  %ret.2 = zext i8 %ret.1 to i64
+  ret i64 %ret.2
+; CHECK: ret
+}
diff --git a/test/CodeGen/AArch64/regress-w29-reserved-with-fp.ll b/test/CodeGen/AArch64/regress-w29-reserved-with-fp.ll
new file mode 100644
index 0000000..5c97a02
--- /dev/null
+++ b/test/CodeGen/AArch64/regress-w29-reserved-with-fp.ll
@@ -0,0 +1,37 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu -disable-fp-elim < %s | FileCheck %s
+@var = global i32 0
+
+declare void @bar()
+
+define void @test_w29_reserved() {
+; CHECK: test_w29_reserved:
+; CHECK: add x29, sp, #{{[0-9]+}}
+
+  %val1 = load volatile i32* @var
+  %val2 = load volatile i32* @var
+  %val3 = load volatile i32* @var
+  %val4 = load volatile i32* @var
+  %val5 = load volatile i32* @var
+  %val6 = load volatile i32* @var
+  %val7 = load volatile i32* @var
+  %val8 = load volatile i32* @var
+  %val9 = load volatile i32* @var
+
+; CHECK-NOT: ldr w29,
+
+  ; Call to prevent fp-elim that occurs regardless in leaf functions.
+  call void @bar()
+
+  store volatile i32 %val1, i32* @var
+  store volatile i32 %val2, i32* @var
+  store volatile i32 %val3, i32* @var
+  store volatile i32 %val4, i32* @var
+  store volatile i32 %val5, i32* @var
+  store volatile i32 %val6, i32* @var
+  store volatile i32 %val7, i32* @var
+  store volatile i32 %val8, i32* @var
+  store volatile i32 %val9, i32* @var
+
+  ret void
+; CHECK: ret
+}
diff --git a/test/CodeGen/AArch64/regress-wzr-allocatable.ll b/test/CodeGen/AArch64/regress-wzr-allocatable.ll
new file mode 100644
index 0000000..764d2bc
--- /dev/null
+++ b/test/CodeGen/AArch64/regress-wzr-allocatable.ll
@@ -0,0 +1,41 @@
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -O0
+
+; When WZR wasn't marked as reserved, this function tried to allocate
+; it at O0 and then generated an internal fault (mostly incidentally)
+; when it discovered that it was already in use for a multiplication.
+
+; I'm not really convinced this is a good test since it could easily
+; stop testing what it does now with no-one any the wiser. However, I
+; can't think of a better way to force the allocator to use WZR
+; specifically.
+
+define void @test() nounwind {
+entry:
+  br label %for.cond
+
+for.cond: ; preds = %for.body, %entry
+  br i1 undef, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+  br label %for.cond
+
+for.end: ; preds = %for.cond
+  br label %for.cond6
+
+for.cond6: ; preds = %for.body9, %for.end
+  br i1 undef, label %for.body9, label %while.cond30
+
+for.body9: ; preds = %for.cond6
+  store i16 0, i16* undef, align 2
+  %0 = load i32* undef, align 4
+  %1 = load i32* undef, align 4
+  %mul15 = mul i32 %0, %1
+  %add16 = add i32 %mul15, 32768
+  %div = udiv i32 %add16, 65535
+  %add17 = add i32 %div, 1
+  store i32 %add17, i32* undef, align 4
+  br label %for.cond6
+
+while.cond30: ; preds = %for.cond6
+  ret void
+}
diff --git a/test/CodeGen/AArch64/setcc-takes-i32.ll b/test/CodeGen/AArch64/setcc-takes-i32.ll
new file mode 100644
index 0000000..d2eb77a
--- /dev/null
+++ b/test/CodeGen/AArch64/setcc-takes-i32.ll
@@ -0,0 +1,22 @@
+; RUN: llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s
+
+; The most important point here is that the promotion of the i1 works
+; correctly. Previously LLVM thought that i64 was the appropriate SetCC output,
+; which meant it proceeded in two steps and produced an i64 -> i64 any_ext which
+; couldn't be selected and faulted.
+
+; It was expecting the smallest legal promotion of i1 to be the preferred SetCC
+; type, so we'll satisfy it (this actually arguably gives better code anyway,
+; with flag-manipulation operations allowed to use W-registers).
+
+declare {i64, i1} @llvm.umul.with.overflow.i64(i64, i64)
+
+define i64 @test_select(i64 %lhs, i64 %rhs) {
+; CHECK: test_select:
+
+  %res = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %lhs, i64 %rhs)
+  %flag = extractvalue {i64, i1} %res, 1
+  %retval = select i1 %flag, i64 %lhs, i64 %rhs
+  ret i64 %retval
+; CHECK: ret
+}
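The IR in test_select is roughly what Clang produces for the multiply-overflow builtin, sketched below under that assumption (the original source is not part of the patch); the builtin's boolean result is the i1 whose promotion the comment above discusses.

unsigned long long test_select(unsigned long long lhs,
                               unsigned long long rhs) {
  unsigned long long prod;
  // Lowers to @llvm.umul.with.overflow.i64; the bool is the i1 value
  // that should be promoted into a W-register, not an X-register.
  bool overflow = __builtin_umulll_overflow(lhs, rhs, &prod);
  return overflow ? lhs : rhs;
}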
\ No newline at end of file diff --git a/test/CodeGen/AArch64/sibling-call.ll b/test/CodeGen/AArch64/sibling-call.ll new file mode 100644 index 0000000..a1ec618 --- /dev/null +++ b/test/CodeGen/AArch64/sibling-call.ll @@ -0,0 +1,97 @@ +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s + +declare void @callee_stack0() +declare void @callee_stack8([8 x i32], i64) +declare void @callee_stack16([8 x i32], i64, i64) + +define void @caller_to0_from0() nounwind { +; CHECK: caller_to0_from0: +; CHECK-NEXT: // BB + tail call void @callee_stack0() + ret void +; CHECK-NEXT: b callee_stack0 +} + +define void @caller_to0_from8([8 x i32], i64) nounwind{ +; CHECK: caller_to0_from8: +; CHECK-NEXT: // BB + + tail call void @callee_stack0() + ret void +; CHECK-NEXT: b callee_stack0 +} + +define void @caller_to8_from0() { +; CHECK: caller_to8_from0: + +; Caller isn't going to clean up any extra stack we allocate, so it +; can't be a tail call. + tail call void @callee_stack8([8 x i32] undef, i64 42) + ret void +; CHECK: bl callee_stack8 +} + +define void @caller_to8_from8([8 x i32], i64 %a) { +; CHECK: caller_to8_from8: +; CHECK-NOT: sub sp, sp, + +; This should reuse our stack area for the 42 + tail call void @callee_stack8([8 x i32] undef, i64 42) + ret void +; CHECK: str {{x[0-9]+}}, [sp] +; CHECK-NEXT: b callee_stack8 +} + +define void @caller_to16_from8([8 x i32], i64 %a) { +; CHECK: caller_to16_from8: + +; Shouldn't be a tail call: we can't use SP+8 because our caller might +; have something there. This may sound obvious but implementation does +; some funky aligning. + tail call void @callee_stack16([8 x i32] undef, i64 undef, i64 undef) +; CHECK: bl callee_stack16 + ret void +} + +define void @caller_to8_from24([8 x i32], i64 %a, i64 %b, i64 %c) { +; CHECK: caller_to8_from24: +; CHECK-NOT: sub sp, sp + +; Reuse our area, putting "42" at incoming sp + tail call void @callee_stack8([8 x i32] undef, i64 42) + ret void +; CHECK: str {{x[0-9]+}}, [sp] +; CHECK-NEXT: b callee_stack8 +} + +define void @caller_to16_from16([8 x i32], i64 %a, i64 %b) { +; CHECK: caller_to16_from16: +; CHECK-NOT: sub sp, sp, + +; Here we want to make sure that both loads happen before the stores: +; otherwise either %a or %b will be wrongly clobbered. + tail call void @callee_stack16([8 x i32] undef, i64 %b, i64 %a) + ret void + +; CHECK: ldr x0, +; CHECK: ldr x1, +; CHECK: str x1, +; CHECK: str x0, + +; CHECK-NOT: add sp, sp, +; CHECK: b callee_stack16 +} + +@func = global void(i32)* null + +define void @indirect_tail() { +; CHECK: indirect_tail: +; CHECK-NOT: sub sp, sp + + %fptr = load void(i32)** @func + tail call void %fptr(i32 42) + ret void +; CHECK: movz w0, #42 +; CHECK: ldr [[FPTR:x[1-9]+]], [{{x[0-9]+}}, #:lo12:func] +; CHECK: br [[FPTR]] +}
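A minimal source-level picture of @indirect_tail above (hypothetical C++, assuming the compiler performs sibling-call optimization here): no stack adjustment is needed, so the pointer is loaded and branched through rather than called with bl.

extern void (*func)(int);

void indirect_tail() {
  func(42);  // expected lowering: movz w0, #42; ldr xN, func; br xN
}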
\ No newline at end of file
diff --git a/test/CodeGen/AArch64/tail-call.ll b/test/CodeGen/AArch64/tail-call.ll
new file mode 100644
index 0000000..f323b15
--- /dev/null
+++ b/test/CodeGen/AArch64/tail-call.ll
@@ -0,0 +1,94 @@
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -tailcallopt | FileCheck %s
+
+declare fastcc void @callee_stack0()
+declare fastcc void @callee_stack8([8 x i32], i64)
+declare fastcc void @callee_stack16([8 x i32], i64, i64)
+
+define fastcc void @caller_to0_from0() nounwind {
+; CHECK: caller_to0_from0:
+; CHECK-NEXT: // BB
+  tail call fastcc void @callee_stack0()
+  ret void
+; CHECK-NEXT: b callee_stack0
+}
+
+define fastcc void @caller_to0_from8([8 x i32], i64) {
+; CHECK: caller_to0_from8:
+
+  tail call fastcc void @callee_stack0()
+  ret void
+; CHECK: add sp, sp, #16
+; CHECK-NEXT: b callee_stack0
+}
+
+define fastcc void @caller_to8_from0() {
+; CHECK: caller_to8_from0:
+; CHECK: sub sp, sp, #32
+
+; Key point is that the "42" should go #16 below incoming stack
+; pointer (we didn't have arg space to reuse).
+  tail call fastcc void @callee_stack8([8 x i32] undef, i64 42)
+  ret void
+; CHECK: str {{x[0-9]+}}, [sp, #16]
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: b callee_stack8
+}
+
+define fastcc void @caller_to8_from8([8 x i32], i64 %a) {
+; CHECK: caller_to8_from8:
+; CHECK: sub sp, sp, #16
+
+; Key point is that the "42" should go where "%a" was: at SP on entry.
+  tail call fastcc void @callee_stack8([8 x i32] undef, i64 42)
+  ret void
+; CHECK: str {{x[0-9]+}}, [sp, #16]
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: b callee_stack8
+}
+
+define fastcc void @caller_to16_from8([8 x i32], i64 %a) {
+; CHECK: caller_to16_from8:
+; CHECK: sub sp, sp, #16
+
+; The important point is that the call reuses the "dead" argument space
+; above %a on the stack. If it tries to go below incoming-SP then the
+; callee will not deallocate the space, even in fastcc.
+  tail call fastcc void @callee_stack16([8 x i32] undef, i64 42, i64 2)
+; CHECK: str {{x[0-9]+}}, [sp, #24]
+; CHECK: str {{x[0-9]+}}, [sp, #16]
+; CHECK: add sp, sp, #16
+; CHECK: b callee_stack16
+  ret void
+}
+
+
+define fastcc void @caller_to8_from24([8 x i32], i64 %a, i64 %b, i64 %c) {
+; CHECK: caller_to8_from24:
+; CHECK: sub sp, sp, #16
+
+; Key point is that the "42" should go #16 above SP on entry.
+  tail call fastcc void @callee_stack8([8 x i32] undef, i64 42)
+  ret void
+; CHECK: str {{x[0-9]+}}, [sp, #32]
+; CHECK-NEXT: add sp, sp, #32
+; CHECK-NEXT: b callee_stack8
+}
+
+
+define fastcc void @caller_to16_from16([8 x i32], i64 %a, i64 %b) {
+; CHECK: caller_to16_from16:
+; CHECK: sub sp, sp, #16
+
+; Here we want to make sure that both loads happen before the stores:
+; otherwise either %a or %b will be wrongly clobbered.
+  tail call fastcc void @callee_stack16([8 x i32] undef, i64 %b, i64 %a)
+  ret void
+
+; CHECK: ldr x0,
+; CHECK: ldr x1,
+; CHECK: str x1,
+; CHECK: str x0,
+
+; CHECK: add sp, sp, #16
+; CHECK: b callee_stack16
+}
diff --git a/test/CodeGen/AArch64/tls-dynamic-together.ll b/test/CodeGen/AArch64/tls-dynamic-together.ll
new file mode 100644
index 0000000..bad2298
--- /dev/null
+++ b/test/CodeGen/AArch64/tls-dynamic-together.ll
@@ -0,0 +1,18 @@
+; RUN: llc -O0 -mtriple=aarch64-none-linux-gnu -relocation-model=pic -verify-machineinstrs < %s | FileCheck %s
+
+; If the .tlsdesccall and blr parts are emitted completely separately (even with
+; glue) then LLVM will separate them quite happily (with a spill at O0, hence
+; the option).
This is definitely wrong, so we make sure they are emitted +; together. + +@general_dynamic_var = external thread_local global i32 + +define i32 @test_generaldynamic() { +; CHECK: test_generaldynamic: + + %val = load i32* @general_dynamic_var + ret i32 %val + +; CHECK: .tlsdesccall general_dynamic_var +; CHECK-NEXT: blr {{x[0-9]+}} +} diff --git a/test/CodeGen/AArch64/tls-dynamics.ll b/test/CodeGen/AArch64/tls-dynamics.ll new file mode 100644 index 0000000..cdfd117 --- /dev/null +++ b/test/CodeGen/AArch64/tls-dynamics.ll @@ -0,0 +1,121 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu -relocation-model=pic -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-none-linux-gnu -relocation-model=pic -filetype=obj < %s | llvm-objdump -r - | FileCheck --check-prefix=CHECK-RELOC %s + +@general_dynamic_var = external thread_local global i32 + +define i32 @test_generaldynamic() { +; CHECK: test_generaldynamic: + + %val = load i32* @general_dynamic_var + ret i32 %val + +; CHECK: adrp x[[TLSDESC_HI:[0-9]+]], :tlsdesc:general_dynamic_var +; CHECK: add x0, x[[TLSDESC_HI]], #:tlsdesc_lo12:general_dynamic_var +; CHECK: ldr [[CALLEE:x[0-9]+]], [x[[TLSDESC_HI]], #:tlsdesc_lo12:general_dynamic_var] +; CHECK: .tlsdesccall general_dynamic_var +; CHECK-NEXT: blr [[CALLEE]] + +; CHECK: mrs x[[TP:[0-9]+]], tpidr_el0 +; CHECK: ldr w0, [x[[TP]], x0] + +; CHECK-RELOC: R_AARCH64_TLSDESC_ADR_PAGE +; CHECK-RELOC: R_AARCH64_TLSDESC_ADD_LO12_NC +; CHECK-RELOC: R_AARCH64_TLSDESC_LD64_LO12_NC +; CHECK-RELOC: R_AARCH64_TLSDESC_CALL + +} + +define i32* @test_generaldynamic_addr() { +; CHECK: test_generaldynamic_addr: + + ret i32* @general_dynamic_var + +; CHECK: adrp x[[TLSDESC_HI:[0-9]+]], :tlsdesc:general_dynamic_var +; CHECK: add x0, x[[TLSDESC_HI]], #:tlsdesc_lo12:general_dynamic_var +; CHECK: ldr [[CALLEE:x[0-9]+]], [x[[TLSDESC_HI]], #:tlsdesc_lo12:general_dynamic_var] +; CHECK: .tlsdesccall general_dynamic_var +; CHECK-NEXT: blr [[CALLEE]] + +; CHECK: mrs [[TP:x[0-9]+]], tpidr_el0 +; CHECK: add x0, [[TP]], x0 + +; CHECK-RELOC: R_AARCH64_TLSDESC_ADR_PAGE +; CHECK-RELOC: R_AARCH64_TLSDESC_ADD_LO12_NC +; CHECK-RELOC: R_AARCH64_TLSDESC_LD64_LO12_NC +; CHECK-RELOC: R_AARCH64_TLSDESC_CALL + +} + +@local_dynamic_var = external thread_local(localdynamic) global i32 + +define i32 @test_localdynamic() { +; CHECK: test_localdynamic: + + %val = load i32* @local_dynamic_var + ret i32 %val + +; CHECK: adrp x[[TLSDESC_HI:[0-9]+]], :tlsdesc:_TLS_MODULE_BASE_ +; CHECK: add x0, x[[TLSDESC_HI]], #:tlsdesc_lo12:_TLS_MODULE_BASE_ +; CHECK: ldr [[CALLEE:x[0-9]+]], [x[[TLSDESC_HI]], #:tlsdesc_lo12:_TLS_MODULE_BASE_] +; CHECK: .tlsdesccall _TLS_MODULE_BASE_ +; CHECK-NEXT: blr [[CALLEE]] + +; CHECK: movz [[DTP_OFFSET:x[0-9]+]], #:dtprel_g1:local_dynamic_var +; CHECK: movk [[DTP_OFFSET]], #:dtprel_g0_nc:local_dynamic_var + +; CHECK: ldr w0, [x0, [[DTP_OFFSET]]] + +; CHECK-RELOC: R_AARCH64_TLSDESC_ADR_PAGE +; CHECK-RELOC: R_AARCH64_TLSDESC_ADD_LO12_NC +; CHECK-RELOC: R_AARCH64_TLSDESC_LD64_LO12_NC +; CHECK-RELOC: R_AARCH64_TLSDESC_CALL + +} + +define i32* @test_localdynamic_addr() { +; CHECK: test_localdynamic_addr: + + ret i32* @local_dynamic_var + +; CHECK: adrp x[[TLSDESC_HI:[0-9]+]], :tlsdesc:_TLS_MODULE_BASE_ +; CHECK: add x0, x[[TLSDESC_HI]], #:tlsdesc_lo12:_TLS_MODULE_BASE_ +; CHECK: ldr [[CALLEE:x[0-9]+]], [x[[TLSDESC_HI]], #:tlsdesc_lo12:_TLS_MODULE_BASE_] +; CHECK: .tlsdesccall _TLS_MODULE_BASE_ +; CHECK-NEXT: blr [[CALLEE]] + +; CHECK: movz [[DTP_OFFSET:x[0-9]+]], #:dtprel_g1:local_dynamic_var +; CHECK: movk 
[[DTP_OFFSET]], #:dtprel_g0_nc:local_dynamic_var + +; CHECK: add x0, x0, [[DTP_OFFSET]] + +; CHECK-RELOC: R_AARCH64_TLSDESC_ADR_PAGE +; CHECK-RELOC: R_AARCH64_TLSDESC_ADD_LO12_NC +; CHECK-RELOC: R_AARCH64_TLSDESC_LD64_LO12_NC +; CHECK-RELOC: R_AARCH64_TLSDESC_CALL + +} + +; The entire point of the local-dynamic access model is to have a single call to +; the expensive resolver. Make sure we achieve that goal. + +@local_dynamic_var2 = external thread_local(localdynamic) global i32 + +define i32 @test_localdynamic_deduplicate() { +; CHECK: test_localdynamic_deduplicate: + + %val = load i32* @local_dynamic_var + %val2 = load i32* @local_dynamic_var2 + + %sum = add i32 %val, %val2 + ret i32 %sum + +; CHECK: adrp x[[TLSDESC_HI:[0-9]+]], :tlsdesc:_TLS_MODULE_BASE_ +; CHECK: add x0, x[[TLSDESC_HI]], #:tlsdesc_lo12:_TLS_MODULE_BASE_ +; CHECK: ldr [[CALLEE:x[0-9]+]], [x[[TLSDESC_HI]], #:tlsdesc_lo12:_TLS_MODULE_BASE_] +; CHECK: .tlsdesccall _TLS_MODULE_BASE_ +; CHECK-NEXT: blr [[CALLEE]] + +; CHECK-NOT: _TLS_MODULE_BASE_ + +; CHECK: ret +} diff --git a/test/CodeGen/AArch64/tls-execs.ll b/test/CodeGen/AArch64/tls-execs.ll new file mode 100644 index 0000000..a665884 --- /dev/null +++ b/test/CodeGen/AArch64/tls-execs.ll @@ -0,0 +1,63 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-none-linux-gnu -filetype=obj < %s | llvm-objdump -r - | FileCheck --check-prefix=CHECK-RELOC %s + +@initial_exec_var = external thread_local(initialexec) global i32 + +define i32 @test_initial_exec() { +; CHECK: test_initial_exec: + %val = load i32* @initial_exec_var + +; CHECK: adrp x[[GOTADDR:[0-9]+]], :gottprel:initial_exec_var +; CHECK: ldr x[[TP_OFFSET:[0-9]+]], [x[[GOTADDR]], #:gottprel_lo12:initial_exec_var] +; CHECK: mrs x[[TP:[0-9]+]], tpidr_el0 +; CHECK: ldr w0, [x[[TP]], x[[TP_OFFSET]]] + +; CHECK-RELOC: R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21 +; CHECK-RELOC: R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC + + ret i32 %val +} + +define i32* @test_initial_exec_addr() { +; CHECK: test_initial_exec_addr: + ret i32* @initial_exec_var + +; CHECK: adrp x[[GOTADDR:[0-9]+]], :gottprel:initial_exec_var +; CHECK: ldr [[TP_OFFSET:x[0-9]+]], [x[[GOTADDR]], #:gottprel_lo12:initial_exec_var] +; CHECK: mrs [[TP:x[0-9]+]], tpidr_el0 +; CHECK: add x0, [[TP]], [[TP_OFFSET]] + +; CHECK-RELOC: R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21 +; CHECK-RELOC: R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC + +} + +@local_exec_var = thread_local(initialexec) global i32 0 + +define i32 @test_local_exec() { +; CHECK: test_local_exec: + %val = load i32* @local_exec_var + +; CHECK: movz [[TP_OFFSET:x[0-9]+]], #:tprel_g1:local_exec_var +; CHECK: movk [[TP_OFFSET]], #:tprel_g0_nc:local_exec_var +; CHECK: mrs x[[TP:[0-9]+]], tpidr_el0 +; CHECK: ldr w0, [x[[TP]], [[TP_OFFSET]]] + +; CHECK-RELOC: R_AARCH64_TLSLE_MOVW_TPREL_G1 +; CHECK-RELOC: R_AARCH64_TLSLE_MOVW_TPREL_G0_NC + + ret i32 %val +} + +define i32* @test_local_exec_addr() { +; CHECK: test_local_exec_addr: + ret i32* @local_exec_var + +; CHECK: movz [[TP_OFFSET:x[0-9]+]], #:tprel_g1:local_exec_var +; CHECK: movk [[TP_OFFSET]], #:tprel_g0_nc:local_exec_var +; CHECK: mrs [[TP:x[0-9]+]], tpidr_el0 +; CHECK: add x0, [[TP]], [[TP_OFFSET]] + +; CHECK-RELOC: R_AARCH64_TLSLE_MOVW_TPREL_G1 +; CHECK-RELOC: R_AARCH64_TLSLE_MOVW_TPREL_G0_NC +} diff --git a/test/CodeGen/AArch64/tst-br.ll b/test/CodeGen/AArch64/tst-br.ll new file mode 100644 index 0000000..65c1fda --- /dev/null +++ b/test/CodeGen/AArch64/tst-br.ll @@ -0,0 +1,48 @@ +; RUN: llc -verify-machineinstrs < 
%s -mtriple=aarch64-none-linux-gnu | FileCheck %s + +; We've got the usual issues with LLVM reordering blocks here. The +; tests are correct for the current order, but who knows when that +; will change. Beware! +@var32 = global i32 0 +@var64 = global i64 0 + +define i32 @test_tbz() { +; CHECK: test_tbz: + + %val = load i32* @var32 + %val64 = load i64* @var64 + + %tbit0 = and i32 %val, 32768 + %tst0 = icmp ne i32 %tbit0, 0 + br i1 %tst0, label %test1, label %end1 +; CHECK: tbz {{w[0-9]+}}, #15, [[LBL_end1:.LBB0_[0-9]+]] + +test1: + %tbit1 = and i32 %val, 4096 + %tst1 = icmp ne i32 %tbit1, 0 + br i1 %tst1, label %test2, label %end1 +; CHECK: tbz {{w[0-9]+}}, #12, [[LBL_end1]] + +test2: + %tbit2 = and i64 %val64, 32768 + %tst2 = icmp ne i64 %tbit2, 0 + br i1 %tst2, label %test3, label %end1 +; CHECK: tbz {{x[0-9]+}}, #15, [[LBL_end1]] + +test3: + %tbit3 = and i64 %val64, 4096 + %tst3 = icmp ne i64 %tbit3, 0 + br i1 %tst3, label %end2, label %end1 +; CHECK: tbz {{x[0-9]+}}, #12, [[LBL_end1]] + +end2: +; CHECK: movz x0, #1 +; CHECK-NEXT: ret + ret i32 1 + +end1: +; CHECK: [[LBL_end1]]: +; CHECK-NEXT: mov x0, xzr +; CHECK-NEXT: ret + ret i32 0 +} diff --git a/test/CodeGen/AArch64/variadic.ll b/test/CodeGen/AArch64/variadic.ll new file mode 100644 index 0000000..c5d319e --- /dev/null +++ b/test/CodeGen/AArch64/variadic.ll @@ -0,0 +1,144 @@ +; RUN: llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s + +%va_list = type {i8*, i8*, i8*, i32, i32} + +@var = global %va_list zeroinitializer + +declare void @llvm.va_start(i8*) + +define void @test_simple(i32 %n, ...) { +; CHECK: test_simple: +; CHECK: sub sp, sp, #[[STACKSIZE:[0-9]+]] +; CHECK: mov x[[FPRBASE:[0-9]+]], sp +; CHECK: str q7, [x[[FPRBASE]], #112] +; CHECK: add x[[GPRBASE:[0-9]+]], sp, #[[GPRFROMSP:[0-9]+]] +; CHECK: str x7, [x[[GPRBASE]], #48] + +; Omit the middle ones + +; CHECK: str q0, [sp] +; CHECK: str x1, [sp, #[[GPRFROMSP]]] + + %addr = bitcast %va_list* @var to i8* + call void @llvm.va_start(i8* %addr) +; CHECK: add x[[VA_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:var +; CHECK: movn [[VR_OFFS:w[0-9]+]], #127 +; CHECK: str [[VR_OFFS]], [x[[VA_LIST]], #28] +; CHECK: movn [[GR_OFFS:w[0-9]+]], #55 +; CHECK: str [[GR_OFFS]], [x[[VA_LIST]], #24] +; CHECK: add [[VR_TOP:x[0-9]+]], x[[FPRBASE]], #128 +; CHECK: str [[VR_TOP]], [x[[VA_LIST]], #16] +; CHECK: add [[GR_TOP:x[0-9]+]], x[[GPRBASE]], #56 +; CHECK: str [[GR_TOP]], [x[[VA_LIST]], #8] +; CHECK: add [[STACK:x[0-9]+]], sp, #[[STACKSIZE]] +; CHECK: str [[STACK]], [{{x[0-9]+}}, #:lo12:var] + + ret void +} + +define void @test_fewargs(i32 %n, i32 %n1, i32 %n2, float %m, ...) 
{ +; CHECK: test_fewargs: +; CHECK: sub sp, sp, #[[STACKSIZE:[0-9]+]] +; CHECK: mov x[[FPRBASE:[0-9]+]], sp +; CHECK: str q7, [x[[FPRBASE]], #96] +; CHECK: add x[[GPRBASE:[0-9]+]], sp, #[[GPRFROMSP:[0-9]+]] +; CHECK: str x7, [x[[GPRBASE]], #32] + +; Omit the middle ones + +; CHECK: str q1, [sp] +; CHECK: str x3, [sp, #[[GPRFROMSP]]] + + %addr = bitcast %va_list* @var to i8* + call void @llvm.va_start(i8* %addr) +; CHECK: add x[[VA_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:var +; CHECK: movn [[VR_OFFS:w[0-9]+]], #111 +; CHECK: str [[VR_OFFS]], [x[[VA_LIST]], #28] +; CHECK: movn [[GR_OFFS:w[0-9]+]], #39 +; CHECK: str [[GR_OFFS]], [x[[VA_LIST]], #24] +; CHECK: add [[VR_TOP:x[0-9]+]], x[[FPRBASE]], #112 +; CHECK: str [[VR_TOP]], [x[[VA_LIST]], #16] +; CHECK: add [[GR_TOP:x[0-9]+]], x[[GPRBASE]], #40 +; CHECK: str [[GR_TOP]], [x[[VA_LIST]], #8] +; CHECK: add [[STACK:x[0-9]+]], sp, #[[STACKSIZE]] +; CHECK: str [[STACK]], [{{x[0-9]+}}, #:lo12:var] + + ret void +} + +define void @test_nospare([8 x i64], [8 x float], ...) { +; CHECK: test_nospare: + + %addr = bitcast %va_list* @var to i8* + call void @llvm.va_start(i8* %addr) +; CHECK-NOT: sub sp, sp +; CHECK: mov [[STACK:x[0-9]+]], sp +; CHECK: str [[STACK]], [{{x[0-9]+}}, #:lo12:var] + + ret void +} + +; If there are non-variadic arguments on the stack (here two i64s) then the +; __stack field should point just past them. +define void @test_offsetstack([10 x i64], [3 x float], ...) { +; CHECK: test_offsetstack: +; CHECK: sub sp, sp, #80 +; CHECK: mov x[[FPRBASE:[0-9]+]], sp +; CHECK: str q7, [x[[FPRBASE]], #64] + +; CHECK-NOT: str x{{[0-9]+}}, +; Omit the middle ones + +; CHECK: str q3, [sp] + + %addr = bitcast %va_list* @var to i8* + call void @llvm.va_start(i8* %addr) +; CHECK: add x[[VA_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:var +; CHECK: movn [[VR_OFFS:w[0-9]+]], #79 +; CHECK: str [[VR_OFFS]], [x[[VA_LIST]], #28] +; CHECK: str wzr, [x[[VA_LIST]], #24] +; CHECK: add [[VR_TOP:x[0-9]+]], x[[FPRBASE]], #80 +; CHECK: str [[VR_TOP]], [x[[VA_LIST]], #16] +; CHECK: add [[STACK:x[0-9]+]], sp, #96 +; CHECK: str [[STACK]], [{{x[0-9]+}}, #:lo12:var] + + ret void +} + +declare void @llvm.va_end(i8*) + +define void @test_va_end() nounwind { +; CHECK: test_va_end: +; CHECK-NEXT: BB#0 + + %addr = bitcast %va_list* @var to i8* + call void @llvm.va_end(i8* %addr) + + ret void +; CHECK-NEXT: ret +} + +declare void @llvm.va_copy(i8* %dest, i8* %src) + +@second_list = global %va_list zeroinitializer + +define void @test_va_copy() { +; CHECK: test_va_copy: + %srcaddr = bitcast %va_list* @var to i8* + %dstaddr = bitcast %va_list* @second_list to i8* + call void @llvm.va_copy(i8* %dstaddr, i8* %srcaddr) + +; Check beginning and end again: + +; CHECK: ldr [[BLOCK:x[0-9]+]], [{{x[0-9]+}}, #:lo12:var] +; CHECK: str [[BLOCK]], [{{x[0-9]+}}, #:lo12:second_list] + +; CHECK: add x[[DEST_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:second_list +; CHECK: add x[[SRC_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:var + +; CHECK: ldr [[BLOCK:x[0-9]+]], [x[[SRC_LIST]], #24] +; CHECK: str [[BLOCK]], [x[[DEST_LIST]], #24] + + ret void +; CHECK: ret +} diff --git a/test/CodeGen/AArch64/zero-reg.ll b/test/CodeGen/AArch64/zero-reg.ll new file mode 100644 index 0000000..fef0437 --- /dev/null +++ b/test/CodeGen/AArch64/zero-reg.ll @@ -0,0 +1,31 @@ +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s + +@var32 = global i32 0 +@var64 = global i64 0 + +define void @test_zr() { +; CHECK: test_zr: + + store i32 0, i32* @var32 +; CHECK: str wzr, [{{x[0-9]+}}, #:lo12:var32] + store i64 0, 
i64* @var64 +; CHECK: str xzr, [{{x[0-9]+}}, #:lo12:var64] + + ret void +; CHECK: ret +} + +define void @test_sp(i32 %val) { +; CHECK: test_sp: + +; Important correctness point here is that LLVM doesn't try to use xzr +; as an addressing register: "str w0, [xzr]" is not a valid A64 +; instruction (0b11111 in the Rn field would mean "sp"). + %addr = getelementptr i32* null, i64 0 + store i32 %val, i32* %addr +; CHECK: mov x[[NULL:[0-9]+]], xzr +; CHECK: str {{w[0-9]+}}, [x[[NULL]]] + + ret void +; CHECK: ret +}
\ No newline at end of file diff --git a/test/CodeGen/ARM/2011-11-29-128bitArithmetics.ll b/test/CodeGen/ARM/2011-11-29-128bitArithmetics.ll index f9ede74..0d0d03b 100644 --- a/test/CodeGen/ARM/2011-11-29-128bitArithmetics.ll +++ b/test/CodeGen/ARM/2011-11-29-128bitArithmetics.ll @@ -8,7 +8,7 @@ define void @test_sqrt(<4 x float>* %X) nounwind { ; CHECK: movw r1, :lower16:{{.*}} ; CHECK: movt r1, :upper16:{{.*}} -; CHECK: vld1.64 {{.*}}, [r1, :128] +; CHECK: vld1.64 {{.*}}, [r1:128] ; CHECK: vsqrt.f32 {{s[0-9]+}}, {{s[0-9]+}} ; CHECK: vsqrt.f32 {{s[0-9]+}}, {{s[0-9]+}} ; CHECK: vsqrt.f32 {{s[0-9]+}}, {{s[0-9]+}} @@ -252,7 +252,7 @@ define void @test_powi(<4 x float>* %X) nounwind { ; CHECK: movw [[reg0:r[0-9]+]], :lower16:{{.*}} ; CHECK: movt [[reg0]], :upper16:{{.*}} -; CHECK: vld1.64 {{.*}}, :128 +; CHECK: vld1.64 {{.*}}:128 ; CHECK: vmul.f32 {{.*}} ; CHECK: vst1.64 diff --git a/test/CodeGen/ARM/2012-08-09-neon-extload.ll b/test/CodeGen/ARM/2012-08-09-neon-extload.ll index b55f1ca..764c58f 100644 --- a/test/CodeGen/ARM/2012-08-09-neon-extload.ll +++ b/test/CodeGen/ARM/2012-08-09-neon-extload.ll @@ -18,7 +18,7 @@ define void @test_v2i8tov2i32() { %i32val = sext <2 x i8> %i8val to <2 x i32> store <2 x i32> %i32val, <2 x i32>* @var_v2i32 -; CHECK: vld1.16 {d[[LOAD:[0-9]+]][0]}, [{{r[0-9]+}}, :16] +; CHECK: vld1.16 {d[[LOAD:[0-9]+]][0]}, [{{r[0-9]+}}:16] ; CHECK: vmovl.s8 {{q[0-9]+}}, d[[LOAD]] ; CHECK: vmovl.s16 {{q[0-9]+}}, {{d[0-9]+}} @@ -32,7 +32,7 @@ define void @test_v2i8tov2i64() { %i64val = sext <2 x i8> %i8val to <2 x i64> store <2 x i64> %i64val, <2 x i64>* @var_v2i64 -; CHECK: vld1.16 {d{{[0-9]+}}[0]}, [{{r[0-9]+}}, :16] +; CHECK: vld1.16 {d{{[0-9]+}}[0]}, [{{r[0-9]+}}:16] ; CHECK: vmovl.s8 {{q[0-9]+}}, d[[LOAD]] ; CHECK: vmovl.s16 {{q[0-9]+}}, {{d[0-9]+}} ; CHECK: vmovl.s32 {{q[0-9]+}}, {{d[0-9]+}} @@ -50,7 +50,7 @@ define void @test_v4i8tov4i16() { %i16val = sext <4 x i8> %i8val to <4 x i16> store <4 x i16> %i16val, <4 x i16>* @var_v4i16 -; CHECK: vld1.32 {d[[LOAD:[0-9]+]][0]}, [{{r[0-9]+}}, :32] +; CHECK: vld1.32 {d[[LOAD:[0-9]+]][0]}, [{{r[0-9]+}}:32] ; CHECK: vmovl.s8 {{q[0-9]+}}, d[[LOAD]] ; CHECK-NOT: vmovl.s16 @@ -65,7 +65,7 @@ define void @test_v4i8tov4i32() { %i16val = sext <4 x i8> %i8val to <4 x i32> store <4 x i32> %i16val, <4 x i32>* @var_v4i32 -; CHECK: vld1.32 {d[[LOAD:[0-9]+]][0]}, [{{r[0-9]+}}, :32] +; CHECK: vld1.32 {d[[LOAD:[0-9]+]][0]}, [{{r[0-9]+}}:32] ; CHECK: vmovl.s8 {{q[0-9]+}}, d[[LOAD]] ; CHECK: vmovl.s16 {{q[0-9]+}}, {{d[0-9]+}} @@ -79,7 +79,7 @@ define void @test_v2i16tov2i32() { %i32val = sext <2 x i16> %i16val to <2 x i32> store <2 x i32> %i32val, <2 x i32>* @var_v2i32 -; CHECK: vld1.32 {d[[LOAD:[0-9]+]][0]}, [{{r[0-9]+}}, :32] +; CHECK: vld1.32 {d[[LOAD:[0-9]+]][0]}, [{{r[0-9]+}}:32] ; CHECK: vmovl.s16 {{q[0-9]+}}, d[[LOAD]] ; CHECK-NOT: vmovl @@ -94,7 +94,7 @@ define void @test_v2i16tov2i64() { %i64val = sext <2 x i16> %i16val to <2 x i64> store <2 x i64> %i64val, <2 x i64>* @var_v2i64 -; CHECK: vld1.32 {d[[LOAD:[0-9]+]][0]}, [{{r[0-9]+}}, :32] +; CHECK: vld1.32 {d[[LOAD:[0-9]+]][0]}, [{{r[0-9]+}}:32] ; CHECK: vmovl.s16 {{q[0-9]+}}, d[[LOAD]] ; CHECK: vmovl.s32 {{q[0-9]+}}, d[[LOAD]] diff --git a/test/CodeGen/ARM/2013-01-21-PR14992.ll b/test/CodeGen/ARM/2013-01-21-PR14992.ll new file mode 100644 index 0000000..38b9e0e --- /dev/null +++ b/test/CodeGen/ARM/2013-01-21-PR14992.ll @@ -0,0 +1,28 @@ +;PR14492 - Tablegen incorrectly converts ARM tLDMIA_UPD pseudo to tLDMIA +;RUN: llc -mtriple=thumbv7 < %s | FileCheck -check-prefix=EXPECTED %s +;RUN: llc 
-mtriple=thumbv7 < %s | FileCheck %s + +;EXPECTED: foo: +;CHECK: foo: +define i32 @foo(i32* %a) nounwind optsize { +entry: + %0 = load i32* %a, align 4, !tbaa !0 + %arrayidx1 = getelementptr inbounds i32* %a, i32 1 + %1 = load i32* %arrayidx1, align 4, !tbaa !0 + %arrayidx2 = getelementptr inbounds i32* %a, i32 2 + %2 = load i32* %arrayidx2, align 4, !tbaa !0 + %add.ptr = getelementptr inbounds i32* %a, i32 3 +;Make sure we do not have a duplicated register in the front of the reg list +;EXPECTED: ldm [[BASE:r[0-9]+]]!, {[[REG:r[0-9]+]], {{r[0-9]+}}, +;CHECK-NOT: ldm [[BASE:r[0-9]+]]!, {[[REG:r[0-9]+]], [[REG]], + tail call void @bar(i32* %add.ptr) nounwind optsize + %add = add nsw i32 %1, %0 + %add3 = add nsw i32 %add, %2 + ret i32 %add3 +} + +declare void @bar(i32*) optsize + +!0 = metadata !{metadata !"int", metadata !1} +!1 = metadata !{metadata !"omnipotent char", metadata !2} +!2 = metadata !{metadata !"Simple C/C++ TBAA"} diff --git a/test/CodeGen/ARM/arm-modifier.ll b/test/CodeGen/ARM/arm-modifier.ll index 5e12d8e..c747016 100644 --- a/test/CodeGen/ARM/arm-modifier.ll +++ b/test/CodeGen/ARM/arm-modifier.ll @@ -61,8 +61,7 @@ ret void define i64 @f4(i64* %val) nounwind { entry: ;CHECK: f4 - ;CHECK: ldrexd [[REG1:(r[0-9]?[02468])]], {{r[0-9]?[13579]}}, [r0] - ;CHECK: mov r0, [[REG1]] + ;CHECK: ldrexd [[REG1:(r[0-9]?[02468])]], {{r[0-9]?[13579]}}, [r{{[0-9]+}}] %0 = tail call i64 asm sideeffect "ldrexd $0, ${0:H}, [$1]", "=&r,r,*Qo"(i64* %val, i64* %val) nounwind ret i64 %0 } diff --git a/test/CodeGen/ARM/atomic-64bit.ll b/test/CodeGen/ARM/atomic-64bit.ll index 69da622..f2c7305 100644 --- a/test/CodeGen/ARM/atomic-64bit.ll +++ b/test/CodeGen/ARM/atomic-64bit.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -mtriple=armv7-apple-ios | FileCheck %s +; RUN: llc < %s -mtriple=thumbv7-none-linux-gnueabihf | FileCheck %s --check-prefix=CHECK-THUMB define i64 @test1(i64* %ptr, i64 %val) { ; CHECK: test1: @@ -10,6 +11,17 @@ define i64 @test1(i64* %ptr, i64 %val) { ; CHECK: cmp ; CHECK: bne ; CHECK: dmb ish + +; CHECK-THUMB: test1: +; CHECK-THUMB: dmb ish +; CHECK-THUMB: ldrexd [[REG1:[a-z0-9]+]], [[REG2:[a-z0-9]+]] +; CHECK-THUMB: adds.w [[REG3:[a-z0-9]+]], [[REG1]] +; CHECK-THUMB: adc.w [[REG4:[a-z0-9]+]], [[REG2]] +; CHECK-THUMB: strexd {{[a-z0-9]+}}, [[REG3]], [[REG4]] +; CHECK-THUMB: cmp +; CHECK-THUMB: bne +; CHECK-THUMB: dmb ish + %r = atomicrmw add i64* %ptr, i64 %val seq_cst ret i64 %r } @@ -24,6 +36,17 @@ define i64 @test2(i64* %ptr, i64 %val) { ; CHECK: cmp ; CHECK: bne ; CHECK: dmb ish + +; CHECK-THUMB: test2: +; CHECK-THUMB: dmb ish +; CHECK-THUMB: ldrexd [[REG1:[a-z0-9]+]], [[REG2:[a-z0-9]+]] +; CHECK-THUMB: subs.w [[REG3:[a-z0-9]+]], [[REG1]] +; CHECK-THUMB: sbc.w [[REG4:[a-z0-9]+]], [[REG2]] +; CHECK-THUMB: strexd {{[a-z0-9]+}}, [[REG3]], [[REG4]] +; CHECK-THUMB: cmp +; CHECK-THUMB: bne +; CHECK-THUMB: dmb ish + %r = atomicrmw sub i64* %ptr, i64 %val seq_cst ret i64 %r } @@ -38,6 +61,17 @@ define i64 @test3(i64* %ptr, i64 %val) { ; CHECK: cmp ; CHECK: bne ; CHECK: dmb ish + +; CHECK-THUMB: test3: +; CHECK-THUMB: dmb ish +; CHECK-THUMB: ldrexd [[REG1:[a-z0-9]+]], [[REG2:[a-z0-9]+]] +; CHECK-THUMB: and.w [[REG3:[a-z0-9]+]], [[REG1]] +; CHECK-THUMB: and.w [[REG4:[a-z0-9]+]], [[REG2]] +; CHECK-THUMB: strexd {{[a-z0-9]+}}, [[REG3]], [[REG4]] +; CHECK-THUMB: cmp +; CHECK-THUMB: bne +; CHECK-THUMB: dmb ish + %r = atomicrmw and i64* %ptr, i64 %val seq_cst ret i64 %r } @@ -52,6 +86,17 @@ define i64 @test4(i64* %ptr, i64 %val) { ; CHECK: cmp ; CHECK: bne ; CHECK: dmb ish + +; CHECK-THUMB: test4: +; 
CHECK-THUMB: dmb ish +; CHECK-THUMB: ldrexd [[REG1:[a-z0-9]+]], [[REG2:[a-z0-9]+]] +; CHECK-THUMB: orr.w [[REG3:[a-z0-9]+]], [[REG1]] +; CHECK-THUMB: orr.w [[REG4:[a-z0-9]+]], [[REG2]] +; CHECK-THUMB: strexd {{[a-z0-9]+}}, [[REG3]], [[REG4]] +; CHECK-THUMB: cmp +; CHECK-THUMB: bne +; CHECK-THUMB: dmb ish + %r = atomicrmw or i64* %ptr, i64 %val seq_cst ret i64 %r } @@ -66,6 +111,17 @@ define i64 @test5(i64* %ptr, i64 %val) { ; CHECK: cmp ; CHECK: bne ; CHECK: dmb ish + +; CHECK-THUMB: test5: +; CHECK-THUMB: dmb ish +; CHECK-THUMB: ldrexd [[REG1:[a-z0-9]+]], [[REG2:[a-z0-9]+]] +; CHECK-THUMB: eor.w [[REG3:[a-z0-9]+]], [[REG1]] +; CHECK-THUMB: eor.w [[REG4:[a-z0-9]+]], [[REG2]] +; CHECK-THUMB: strexd {{[a-z0-9]+}}, [[REG3]], [[REG4]] +; CHECK-THUMB: cmp +; CHECK-THUMB: bne +; CHECK-THUMB: dmb ish + %r = atomicrmw xor i64* %ptr, i64 %val seq_cst ret i64 %r } @@ -78,6 +134,15 @@ define i64 @test6(i64* %ptr, i64 %val) { ; CHECK: cmp ; CHECK: bne ; CHECK: dmb ish + +; CHECK-THUMB: test6: +; CHECK-THUMB: dmb ish +; CHECK-THUMB: ldrexd [[REG1:[a-z0-9]+]], [[REG2:[a-z0-9]+]] +; CHECK-THUMB: strexd {{[a-z0-9]+}}, {{[a-z0-9]+}}, {{[a-z0-9]+}} +; CHECK-THUMB: cmp +; CHECK-THUMB: bne +; CHECK-THUMB: dmb ish + %r = atomicrmw xchg i64* %ptr, i64 %val seq_cst ret i64 %r } @@ -93,6 +158,19 @@ define i64 @test7(i64* %ptr, i64 %val1, i64 %val2) { ; CHECK: cmp ; CHECK: bne ; CHECK: dmb ish + +; CHECK-THUMB: test7: +; CHECK-THUMB: dmb ish +; CHECK-THUMB: ldrexd [[REG1:[a-z0-9]+]], [[REG2:[a-z0-9]+]] +; CHECK-THUMB: cmp [[REG1]] +; CHECK-THUMB: it eq +; CHECK-THUMB: cmpeq [[REG2]] +; CHECK-THUMB: bne +; CHECK-THUMB: strexd {{[a-z0-9]+}}, {{[a-z0-9]+}}, {{[a-z0-9]+}} +; CHECK-THUMB: cmp +; CHECK-THUMB: bne +; CHECK-THUMB: dmb ish + %r = cmpxchg i64* %ptr, i64 %val1, i64 %val2 seq_cst ret i64 %r } @@ -109,6 +187,18 @@ define i64 @test8(i64* %ptr) { ; CHECK: cmp ; CHECK: bne ; CHECK: dmb ish + +; CHECK-THUMB: test8: +; CHECK-THUMB: ldrexd [[REG1:[a-z0-9]+]], [[REG2:[a-z0-9]+]] +; CHECK-THUMB: cmp [[REG1]] +; CHECK-THUMB: it eq +; CHECK-THUMB: cmpeq [[REG2]] +; CHECK-THUMB: bne +; CHECK-THUMB: strexd {{[a-z0-9]+}}, {{[a-z0-9]+}}, {{[a-z0-9]+}} +; CHECK-THUMB: cmp +; CHECK-THUMB: bne +; CHECK-THUMB: dmb ish + %r = load atomic i64* %ptr seq_cst, align 8 ret i64 %r } @@ -123,6 +213,15 @@ define void @test9(i64* %ptr, i64 %val) { ; CHECK: cmp ; CHECK: bne ; CHECK: dmb ish + +; CHECK-THUMB: test9: +; CHECK-THUMB: dmb ish +; CHECK-THUMB: ldrexd [[REG1:[a-z0-9]+]], [[REG2:[a-z0-9]+]] +; CHECK-THUMB: strexd {{[a-z0-9]+}}, {{[a-z0-9]+}}, {{[a-z0-9]+}} +; CHECK-THUMB: cmp +; CHECK-THUMB: bne +; CHECK-THUMB: dmb ish + store atomic i64 %val, i64* %ptr seq_cst, align 8 ret void } @@ -133,11 +232,23 @@ define i64 @test10(i64* %ptr, i64 %val) { ; CHECK: ldrexd [[REG1:(r[0-9]?[02468])]], [[REG2:(r[0-9]?[13579])]] ; CHECK: subs {{[a-z0-9]+}}, [[REG1]], [[REG3:(r[0-9]?[02468])]] ; CHECK: sbcs {{[a-z0-9]+}}, [[REG2]], [[REG4:(r[0-9]?[13579])]] -; CHECK: ble +; CHECK: blt ; CHECK: strexd {{[a-z0-9]+}}, [[REG3]], [[REG4]] ; CHECK: cmp ; CHECK: bne ; CHECK: dmb ish + +; CHECK-THUMB: test10: +; CHECK-THUMB: dmb ish +; CHECK-THUMB: ldrexd [[REG1:[a-z0-9]+]], [[REG2:[a-z0-9]+]] +; CHECK-THUMB: subs.w {{[a-z0-9]+}}, [[REG1]], [[REG3:[a-z0-9]+]] +; CHECK-THUMB: sbcs.w {{[a-z0-9]+}}, [[REG2]], [[REG4:[a-z0-9]+]] +; CHECK-THUMB: blt +; CHECK-THUMB: strexd {{[a-z0-9]+}}, [[REG3]], [[REG4]] +; CHECK-THUMB: cmp +; CHECK-THUMB: bne +; CHECK-THUMB: dmb ish + %r = atomicrmw min i64* %ptr, i64 %val seq_cst ret i64 %r } @@ -148,11 +259,24 @@ define i64 
@test11(i64* %ptr, i64 %val) { ; CHECK: ldrexd [[REG1:(r[0-9]?[02468])]], [[REG2:(r[0-9]?[13579])]] ; CHECK: subs {{[a-z0-9]+}}, [[REG1]], [[REG3:(r[0-9]?[02468])]] ; CHECK: sbcs {{[a-z0-9]+}}, [[REG2]], [[REG4:(r[0-9]?[13579])]] -; CHECK: bls +; CHECK: blo ; CHECK: strexd {{[a-z0-9]+}}, [[REG3]], [[REG4]] ; CHECK: cmp ; CHECK: bne ; CHECK: dmb ish + + +; CHECK-THUMB: test11: +; CHECK-THUMB: dmb ish +; CHECK-THUMB: ldrexd [[REG1:[a-z0-9]+]], [[REG2:[a-z0-9]+]] +; CHECK-THUMB: subs.w {{[a-z0-9]+}}, [[REG1]], [[REG3:[a-z0-9]+]] +; CHECK-THUMB: sbcs.w {{[a-z0-9]+}}, [[REG2]], [[REG4:[a-z0-9]+]] +; CHECK-THUMB: blo +; CHECK-THUMB: strexd {{[a-z0-9]+}}, [[REG3]], [[REG4]] +; CHECK-THUMB: cmp +; CHECK-THUMB: bne +; CHECK-THUMB: dmb ish + %r = atomicrmw umin i64* %ptr, i64 %val seq_cst ret i64 %r } @@ -168,6 +292,18 @@ define i64 @test12(i64* %ptr, i64 %val) { ; CHECK: cmp ; CHECK: bne ; CHECK: dmb ish + +; CHECK-THUMB: test12: +; CHECK-THUMB: dmb ish +; CHECK-THUMB: ldrexd [[REG1:[a-z0-9]+]], [[REG2:[a-z0-9]+]] +; CHECK-THUMB: subs.w {{[a-z0-9]+}}, [[REG1]], [[REG3:[a-z0-9]+]] +; CHECK-THUMB: sbcs.w {{[a-z0-9]+}}, [[REG2]], [[REG4:[a-z0-9]+]] +; CHECK-THUMB: bge +; CHECK-THUMB: strexd {{[a-z0-9]+}}, [[REG3]], [[REG4]] +; CHECK-THUMB: cmp +; CHECK-THUMB: bne +; CHECK-THUMB: dmb ish + %r = atomicrmw max i64* %ptr, i64 %val seq_cst ret i64 %r } @@ -183,6 +319,17 @@ define i64 @test13(i64* %ptr, i64 %val) { ; CHECK: cmp ; CHECK: bne ; CHECK: dmb ish + +; CHECK-THUMB: test13: +; CHECK-THUMB: dmb ish +; CHECK-THUMB: ldrexd [[REG1:[a-z0-9]+]], [[REG2:[a-z0-9]+]] +; CHECK-THUMB: subs.w {{[a-z0-9]+}}, [[REG1]], [[REG3:[a-z0-9]+]] +; CHECK-THUMB: sbcs.w {{[a-z0-9]+}}, [[REG2]], [[REG4:[a-z0-9]+]] +; CHECK-THUMB: bhs +; CHECK-THUMB: strexd {{[a-z0-9]+}}, [[REG3]], [[REG4]] +; CHECK-THUMB: cmp +; CHECK-THUMB: bne +; CHECK-THUMB: dmb ish %r = atomicrmw umax i64* %ptr, i64 %val seq_cst ret i64 %r } diff --git a/test/CodeGen/ARM/ehabi-mc-cantunwind.ll b/test/CodeGen/ARM/ehabi-mc-cantunwind.ll new file mode 100644 index 0000000..698d76e --- /dev/null +++ b/test/CodeGen/ARM/ehabi-mc-cantunwind.ll @@ -0,0 +1,14 @@ +; RUN: llc -mtriple arm-unknown-linux-gnueabi \ +; RUN: -arm-enable-ehabi -arm-enable-ehabi-descriptors \ +; RUN: -filetype=obj -o - %s \ +; RUN: | llvm-objdump -s - \ +; RUN: | FileCheck %s + +define void @test() nounwind { +entry: + ret void +} + +; CHECK: section .text +; CHECK: section .ARM.exidx +; CHECK-NEXT: 0000 00000000 01000000 diff --git a/test/CodeGen/ARM/ehabi-mc-section-group.ll b/test/CodeGen/ARM/ehabi-mc-section-group.ll new file mode 100644 index 0000000..5e4b509 --- /dev/null +++ b/test/CodeGen/ARM/ehabi-mc-section-group.ll @@ -0,0 +1,79 @@ +; Test section group of the function with linkonce_odr + +; The instantiation of C++ function template will come with linkonce_odr, +; which indicates that the linker can remove the duplicated instantiation. +; However, to make this feature work, we have to group the section properly. +; .text, .ARM.extab, and .ARM.exidx should be grouped together. 
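For reference, the '_section_data' bytes checked at the end of this test decode as a standard ELF SHT_GROUP payload; a hedged sketch of that layout follows (the struct name is illustrative, the constant is as in <elf.h>):

#include <cstdint>

// One flags word, then the header-table indices of the grouped sections.
struct ComdatGroupPayload {
  uint32_t flags;       // 0x00000001 == GRP_COMDAT
  uint32_t members[3];  // {10, 12, 14}: the .text, .ARM.extab and
                        // .ARM.exidx sections named in the CHECK lines below
};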
+ +; RUN: llc -mtriple arm-unknown-linux-gnueabi \ +; RUN: -arm-enable-ehabi -arm-enable-ehabi-descriptors \ +; RUN: -filetype=obj -o - %s \ +; RUN: | elf-dump --dump-section-data \ +; RUN: | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:64:128-a0:0:64-n32-S64" +target triple = "armv4t--linux-gnueabi" + +define void @_Z11instantiatev() { +entry: + tail call void @_Z4testIidEvT_S0_S0_S0_S0_T0_S1_S1_S1_S1_(i32 1, i32 2, i32 3, i32 4, i32 5, double 1.000000e-01, double 2.000000e-01, double 3.000000e-01, double 4.000000e-01, double 5.000000e-01) + ret void +} + +define linkonce_odr void @_Z4testIidEvT_S0_S0_S0_S0_T0_S1_S1_S1_S1_(i32 %u1, i32 %u2, i32 %u3, i32 %u4, i32 %u5, double %v1, double %v2, double %v3, double %v4, double %v5) { +entry: + invoke void @_Z5printiiiii(i32 %u1, i32 %u2, i32 %u3, i32 %u4, i32 %u5) + to label %try.cont unwind label %lpad + +lpad: ; preds = %entry + %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) + catch i8* null + %1 = extractvalue { i8*, i32 } %0, 0 + %2 = tail call i8* @__cxa_begin_catch(i8* %1) nounwind + invoke void @_Z5printddddd(double %v1, double %v2, double %v3, double %v4, double %v5) + to label %invoke.cont2 unwind label %lpad1 + +invoke.cont2: ; preds = %lpad + tail call void @__cxa_end_catch() + br label %try.cont + +try.cont: ; preds = %entry, %invoke.cont2 + ret void + +lpad1: ; preds = %lpad + %3 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) + cleanup + invoke void @__cxa_end_catch() + to label %eh.resume unwind label %terminate.lpad + +eh.resume: ; preds = %lpad1 + resume { i8*, i32 } %3 + +terminate.lpad: ; preds = %lpad1 + %4 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) + catch i8* null + tail call void @_ZSt9terminatev() noreturn nounwind + unreachable +} + +declare void @_Z5printiiiii(i32, i32, i32, i32, i32) + +declare i32 @__gxx_personality_v0(...) 
+ +declare i8* @__cxa_begin_catch(i8*) + +declare void @_Z5printddddd(double, double, double, double, double) + +declare void @__cxa_end_catch() + +declare void @_ZSt9terminatev() + +; CHECK: # Section 1 +; CHECK-NEXT: (('sh_name', 0x0000002f) # '.group' +; CHECK: ('_section_data', '01000000 0a000000 0c000000 0e000000') +; CHECK: # Section 10 +; CHECK-NEXT: (('sh_name', 0x000000e1) # '.text._Z4testIidEvT_S0_S0_S0_S0_T0_S1_S1_S1_S1_' +; CHECK: # Section 12 +; CHECK-NEXT: (('sh_name', 0x000000d7) # '.ARM.extab.text._Z4testIidEvT_S0_S0_S0_S0_T0_S1_S1_S1_S1_' +; CHECK: # Section 14 +; CHECK-NEXT: (('sh_name', 0x00000065) # '.ARM.exidx.text._Z4testIidEvT_S0_S0_S0_S0_T0_S1_S1_S1_S1_' diff --git a/test/CodeGen/ARM/ehabi-mc-section.ll b/test/CodeGen/ARM/ehabi-mc-section.ll new file mode 100644 index 0000000..fc51b24 --- /dev/null +++ b/test/CodeGen/ARM/ehabi-mc-section.ll @@ -0,0 +1,59 @@ +; RUN: llc -mtriple arm-unknown-linux-gnueabi \ +; RUN: -arm-enable-ehabi -arm-enable-ehabi-descriptors \ +; RUN: -filetype=obj -o - %s \ +; RUN: | llvm-objdump -s - \ +; RUN: | FileCheck %s + +define void @_Z4testiiiiiddddd(i32 %u1, i32 %u2, i32 %u3, i32 %u4, i32 %u5, double %v1, double %v2, double %v3, double %v4, double %v5) section ".test_section" { +entry: + invoke void @_Z5printiiiii(i32 %u1, i32 %u2, i32 %u3, i32 %u4, i32 %u5) + to label %try.cont unwind label %lpad + +lpad: ; preds = %entry + %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) + catch i8* null + %1 = extractvalue { i8*, i32 } %0, 0 + %2 = tail call i8* @__cxa_begin_catch(i8* %1) nounwind + invoke void @_Z5printddddd(double %v1, double %v2, double %v3, double %v4, double %v5) + to label %invoke.cont2 unwind label %lpad1 + +invoke.cont2: ; preds = %lpad + tail call void @__cxa_end_catch() + br label %try.cont + +try.cont: ; preds = %entry, %invoke.cont2 + ret void + +lpad1: ; preds = %lpad + %3 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) + cleanup + invoke void @__cxa_end_catch() + to label %eh.resume unwind label %terminate.lpad + +eh.resume: ; preds = %lpad1 + resume { i8*, i32 } %3 + +terminate.lpad: ; preds = %lpad1 + %4 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) + catch i8* null + tail call void @_ZSt9terminatev() noreturn nounwind + unreachable +} + +declare void @_Z5printiiiii(i32, i32, i32, i32, i32) + +declare i32 @__gxx_personality_v0(...) + +declare i8* @__cxa_begin_catch(i8*) + +declare void @_Z5printddddd(double, double, double, double, double) + +declare void @__cxa_end_catch() + +declare void @_ZSt9terminatev() + +; CHECK: section .test_section +; CHECK: section .ARM.extab.test_section +; CHECK-NEXT: 0000 00000000 b0b0b000 +; CHECK: section .ARM.exidx.test_section +; CHECK-NEXT: 0000 00000000 00000000 diff --git a/test/CodeGen/ARM/ehabi-mc-sh_link.ll b/test/CodeGen/ARM/ehabi-mc-sh_link.ll new file mode 100644 index 0000000..f90e5f3 --- /dev/null +++ b/test/CodeGen/ARM/ehabi-mc-sh_link.ll @@ -0,0 +1,47 @@ +; Test the sh_link in Elf32_Shdr. + +; The .ARM.exidx section should be linked with corresponding text section. +; The sh_link in Elf32_Shdr should be filled with the section index of +; the text section. 
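For reference, the Elf32_Shdr layout the comment refers to (field names and widths as in <elf.h>; the mirror struct below is illustrative). The tests that follow check that sh_link of each .ARM.exidx header holds the header-table index of its text section.

#include <cstdint>

struct Elf32SectionHeaderMirror {
  uint32_t sh_name;       // offset into .shstrtab
  uint32_t sh_type;       // 0x70000001 == SHT_ARM_EXIDX below
  uint32_t sh_flags;      // 0x82 == SHF_ALLOC | SHF_LINK_ORDER
  uint32_t sh_addr;
  uint32_t sh_offset;
  uint32_t sh_size;
  uint32_t sh_link;       // section index of the associated text section
  uint32_t sh_info;
  uint32_t sh_addralign;
  uint32_t sh_entsize;
};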
+ +; RUN: llc -mtriple arm-unknown-linux-gnueabi \ +; RUN: -arm-enable-ehabi -arm-enable-ehabi-descriptors \ +; RUN: -filetype=obj -o - %s \ +; RUN: | elf-dump --dump-section-data \ +; RUN: | FileCheck %s + +define void @test1() nounwind { +entry: + ret void +} + +define void @test2() nounwind section ".test_section" { +entry: + ret void +} + +; CHECK: # Section 1 +; CHECK-NEXT: (('sh_name', 0x00000010) # '.text' + +; CHECK: (('sh_name', 0x00000005) # '.ARM.exidx' +; CHECK-NEXT: ('sh_type', 0x70000001) +; CHECK-NEXT: ('sh_flags', 0x00000082) +; CHECK-NEXT: ('sh_addr', 0x00000000) +; CHECK-NEXT: ('sh_offset', 0x0000005c) +; CHECK-NEXT: ('sh_size', 0x00000008) +; CHECK-NEXT: ('sh_link', 0x00000001) +; CHECK-NEXT: ('sh_info', 0x00000000) +; CHECK-NEXT: ('sh_addralign', 0x00000004) + +; CHECK: # Section 7 +; CHECK-NEXT: (('sh_name', 0x00000039) # '.test_section' + +; CHECK: (('sh_name', 0x0000002f) # '.ARM.exidx.test_section' +; CHECK-NEXT: ('sh_type', 0x70000001) +; CHECK-NEXT: ('sh_flags', 0x00000082) +; CHECK-NEXT: ('sh_addr', 0x00000000) +; CHECK-NEXT: ('sh_offset', 0x00000068) +; CHECK-NEXT: ('sh_size', 0x00000008) +; CHECK-NEXT: ('sh_link', 0x00000007) +; CHECK-NEXT: ('sh_info', 0x00000000) +; CHECK-NEXT: ('sh_addralign', 0x00000004) diff --git a/test/CodeGen/ARM/ehabi-mc.ll b/test/CodeGen/ARM/ehabi-mc.ll new file mode 100644 index 0000000..0dc2ef7 --- /dev/null +++ b/test/CodeGen/ARM/ehabi-mc.ll @@ -0,0 +1,59 @@ +; RUN: llc -mtriple arm-unknown-linux-gnueabi \ +; RUN: -arm-enable-ehabi -arm-enable-ehabi-descriptors \ +; RUN: -filetype=obj -o - %s \ +; RUN: | llvm-objdump -s - \ +; RUN: | FileCheck %s + +define void @_Z4testiiiiiddddd(i32 %u1, i32 %u2, i32 %u3, i32 %u4, i32 %u5, double %v1, double %v2, double %v3, double %v4, double %v5) { +entry: + invoke void @_Z5printiiiii(i32 %u1, i32 %u2, i32 %u3, i32 %u4, i32 %u5) + to label %try.cont unwind label %lpad + +lpad: ; preds = %entry + %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) + catch i8* null + %1 = extractvalue { i8*, i32 } %0, 0 + %2 = tail call i8* @__cxa_begin_catch(i8* %1) nounwind + invoke void @_Z5printddddd(double %v1, double %v2, double %v3, double %v4, double %v5) + to label %invoke.cont2 unwind label %lpad1 + +invoke.cont2: ; preds = %lpad + tail call void @__cxa_end_catch() + br label %try.cont + +try.cont: ; preds = %entry, %invoke.cont2 + ret void + +lpad1: ; preds = %lpad + %3 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) + cleanup + invoke void @__cxa_end_catch() + to label %eh.resume unwind label %terminate.lpad + +eh.resume: ; preds = %lpad1 + resume { i8*, i32 } %3 + +terminate.lpad: ; preds = %lpad1 + %4 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) + catch i8* null + tail call void @_ZSt9terminatev() noreturn nounwind + unreachable +} + +declare void @_Z5printiiiii(i32, i32, i32, i32, i32) + +declare i32 @__gxx_personality_v0(...) 
+ +declare i8* @__cxa_begin_catch(i8*) + +declare void @_Z5printddddd(double, double, double, double, double) + +declare void @__cxa_end_catch() + +declare void @_ZSt9terminatev() + +; CHECK: section .text +; CHECK: section .ARM.extab +; CHECK-NEXT: 0000 00000000 b0b0b000 +; CHECK: section .ARM.exidx +; CHECK-NEXT: 0000 00000000 00000000 diff --git a/test/CodeGen/ARM/fabss.ll b/test/CodeGen/ARM/fabss.ll index 46c2f1c..c3e00ce 100644 --- a/test/CodeGen/ARM/fabss.ll +++ b/test/CodeGen/ARM/fabss.ll @@ -14,12 +14,12 @@ entry: declare float @fabsf(float) ; VFP2: test: -; VFP2: vabs.f32 s2, s2 +; VFP2: vabs.f32 s ; NFP1: test: -; NFP1: vabs.f32 d1, d1 +; NFP1: vabs.f32 d ; NFP0: test: -; NFP0: vabs.f32 s2, s2 +; NFP0: vabs.f32 s ; CORTEXA8: test: ; CORTEXA8: vadd.f32 [[D1:d[0-9]+]] diff --git a/test/CodeGen/ARM/fast-isel-intrinsic.ll b/test/CodeGen/ARM/fast-isel-intrinsic.ll index 7d38cc2..4108978 100644 --- a/test/CodeGen/ARM/fast-isel-intrinsic.ll +++ b/test/CodeGen/ARM/fast-isel-intrinsic.ll @@ -231,3 +231,10 @@ define void @t6() nounwind ssp { call void @llvm.memcpy.p0i8.p0i8.i32(i8* getelementptr inbounds ([60 x i8]* @temp, i32 0, i32 4), i8* getelementptr inbounds ([60 x i8]* @temp, i32 0, i32 16), i32 10, i32 1, i1 false) ret void } + +; rdar://13202135 +define void @t7() nounwind ssp { +; Just make sure this doesn't assert when we have an odd length and an alignment of 2. + call void @llvm.memcpy.p0i8.p0i8.i32(i8* getelementptr inbounds ([60 x i8]* @temp, i32 0, i32 4), i8* getelementptr inbounds ([60 x i8]* @temp, i32 0, i32 16), i32 3, i32 2, i1 false) + ret void +} diff --git a/test/CodeGen/ARM/fdivs.ll b/test/CodeGen/ARM/fdivs.ll index 8fab002..8f13f39 100644 --- a/test/CodeGen/ARM/fdivs.ll +++ b/test/CodeGen/ARM/fdivs.ll @@ -10,14 +10,14 @@ entry: } ; VFP2: test: -; VFP2: vdiv.f32 s0, s2, s0 +; VFP2: vdiv.f32 s{{.}}, s{{.}}, s{{.}} ; NFP1: test: -; NFP1: vdiv.f32 s0, s2, s0 +; NFP1: vdiv.f32 s{{.}}, s{{.}}, s{{.}} ; NFP0: test: -; NFP0: vdiv.f32 s0, s2, s0 +; NFP0: vdiv.f32 s{{.}}, s{{.}}, s{{.}} ; CORTEXA8: test: -; CORTEXA8: vdiv.f32 s0, s2, s0 +; CORTEXA8: vdiv.f32 s{{.}}, s{{.}}, s{{.}} ; CORTEXA9: test: ; CORTEXA9: vdiv.f32 s{{.}}, s{{.}}, s{{.}} diff --git a/test/CodeGen/ARM/fnmscs.ll b/test/CodeGen/ARM/fnmscs.ll index 6081712..9ce9b7a 100644 --- a/test/CodeGen/ARM/fnmscs.ll +++ b/test/CodeGen/ARM/fnmscs.ll @@ -46,8 +46,8 @@ entry: ; NEON: vnmla.f64 ; A8: t3: -; A8: vnmul.f64 d1{{[67]}}, d1{{[67]}}, d1{{[67]}} -; A8: vsub.f64 d1{{[67]}}, d1{{[67]}}, d1{{[67]}} +; A8: vnmul.f64 d +; A8: vsub.f64 d %0 = fmul double %a, %b %1 = fsub double -0.0, %0 %2 = fsub double %1, %acc @@ -63,8 +63,8 @@ entry: ; NEON: vnmla.f64 ; A8: t4: -; A8: vnmul.f64 d1{{[67]}}, d1{{[67]}}, d1{{[67]}} -; A8: vsub.f64 d1{{[67]}}, d1{{[67]}}, d1{{[67]}} +; A8: vnmul.f64 d +; A8: vsub.f64 d %0 = fmul double %a, %b %1 = fmul double -1.0, %0 %2 = fsub double %1, %acc diff --git a/test/CodeGen/ARM/fp128.ll b/test/CodeGen/ARM/fp128.ll deleted file mode 100644 index bdeb547..0000000 --- a/test/CodeGen/ARM/fp128.ll +++ /dev/null @@ -1,10 +0,0 @@ -; RUN: llc -mtriple=arm-none-linux < %s | FileCheck --check-prefix=LITTLEENDIAN %s - -@var = global fp128 0xL00000000000000008000000000000000 - -; CHECK-LITTLEENDIAN: var: -; CHECK-LITTLEENDIAN-NEXT: .long 0 @ fp128 -0 -; CHECK-LITTLEENDIAN-NEXT: .long 0 -; CHECK-LITTLEENDIAN-NEXT: .long 0 -; CHECK-LITTLEENDIAN-NEXT: .long 2147483648 - diff --git a/test/CodeGen/ARM/inlineasm-64bit.ll b/test/CodeGen/ARM/inlineasm-64bit.ll new file mode 100644 index 0000000..be5eb81 --- 
/dev/null
+++ b/test/CodeGen/ARM/inlineasm-64bit.ll
@@ -0,0 +1,54 @@
+; RUN: llc < %s -O3 -mtriple=arm-linux-gnueabi | FileCheck %s
+
+; check that regs are passed correctly
+define void @i64_write(i64* %p, i64 %val) nounwind {
+; CHECK: i64_write:
+; CHECK: ldrexd [[REG1:(r[0-9]?[02468])]], {{r[0-9]?[13579]}}, [r{{[0-9]+}}]
+; CHECK: strexd [[REG1]], {{r[0-9]?[02468]}}, {{r[0-9]?[13579]}}
+  %1 = tail call i64 asm sideeffect "1: ldrexd $0, ${0:H}, [$2]\0A strexd $0, $3, ${3:H}, [$2]\0A teq $0, #0\0A bne 1b", "=&r,=*Qo,r,r,~{cc}"(i64* %p, i64* %p, i64 %val) nounwind
+  ret void
+}
+
+; check if register allocation can reuse the registers
+define void @multi_writes(i64* %p, i64 %val1, i64 %val2, i64 %val3, i64 %val4, i64 %val5, i64 %val6) nounwind {
entry:
+; CHECK: multi_writes:
+; CHECK: strexd {{r[0-9]?[02468]}}, {{r[0-9]?[13579]}}, [r{{[0-9]+}}]
+; CHECK: strexd {{r[0-9]?[02468]}}, {{r[0-9]?[13579]}}, [r{{[0-9]+}}]
+; CHECK: strexd {{r[0-9]?[02468]}}, {{r[0-9]?[13579]}}, [r{{[0-9]+}}]
+; CHECK: strexd {{r[0-9]?[02468]}}, {{r[0-9]?[13579]}}, [r{{[0-9]+}}]
+; CHECK: strexd {{r[0-9]?[02468]}}, {{r[0-9]?[13579]}}, [r{{[0-9]+}}]
+; CHECK: strexd {{r[0-9]?[02468]}}, {{r[0-9]?[13579]}}, [r{{[0-9]+}}]
+
+; CHECK: strexd {{r[0-9]?[02468]}}, {{r[0-9]?[13579]}}, [r{{[0-9]+}}]
+; CHECK: strexd {{r[0-9]?[02468]}}, {{r[0-9]?[13579]}}, [r{{[0-9]+}}]
+; CHECK: strexd {{r[0-9]?[02468]}}, {{r[0-9]?[13579]}}, [r{{[0-9]+}}]
+; CHECK: strexd {{r[0-9]?[02468]}}, {{r[0-9]?[13579]}}, [r{{[0-9]+}}]
+; CHECK: strexd {{r[0-9]?[02468]}}, {{r[0-9]?[13579]}}, [r{{[0-9]+}}]
+; CHECK: strexd {{r[0-9]?[02468]}}, {{r[0-9]?[13579]}}, [r{{[0-9]+}}]
+
+; CHECK: strexd {{r[0-9]?[02468]}}, {{r[0-9]?[13579]}}, [r{{[0-9]+}}]
+; CHECK: strexd {{r[0-9]?[02468]}}, {{r[0-9]?[13579]}}, [r{{[0-9]+}}]
+; CHECK: strexd {{r[0-9]?[02468]}}, {{r[0-9]?[13579]}}, [r{{[0-9]+}}]
+; CHECK: strexd {{r[0-9]?[02468]}}, {{r[0-9]?[13579]}}, [r{{[0-9]+}}]
+; CHECK: strexd {{r[0-9]?[02468]}}, {{r[0-9]?[13579]}}, [r{{[0-9]+}}]
+; CHECK: strexd {{r[0-9]?[02468]}}, {{r[0-9]?[13579]}}, [r{{[0-9]+}}]
+
+  tail call void asm sideeffect " strexd $1, ${1:H}, [$0]\0A strexd $2, ${2:H}, [$0]\0A strexd $3, ${3:H}, [$0]\0A strexd $4, ${4:H}, [$0]\0A strexd $5, ${5:H}, [$0]\0A strexd $6, ${6:H}, [$0]\0A", "r,r,r,r,r,r,r"(i64* %p, i64 %val1, i64 %val2, i64 %val3, i64 %val4, i64 %val5, i64 %val6) nounwind
+  %incdec.ptr = getelementptr inbounds i64* %p, i32 1
+  tail call void asm sideeffect " strexd $1, ${1:H}, [$0]\0A strexd $2, ${2:H}, [$0]\0A strexd $3, ${3:H}, [$0]\0A strexd $4, ${4:H}, [$0]\0A strexd $5, ${5:H}, [$0]\0A strexd $6, ${6:H}, [$0]\0A", "r,r,r,r,r,r,r"(i64* %incdec.ptr, i64 %val1, i64 %val2, i64 %val3, i64 %val4, i64 %val5, i64 %val6) nounwind
+  tail call void asm sideeffect " strexd $1, ${1:H}, [$0]\0A strexd $2, ${2:H}, [$0]\0A strexd $3, ${3:H}, [$0]\0A strexd $4, ${4:H}, [$0]\0A strexd $5, ${5:H}, [$0]\0A strexd $6, ${6:H}, [$0]\0A", "r,r,r,r,r,r,r"(i64* %incdec.ptr, i64 %val1, i64 %val2, i64 %val3, i64 %val4, i64 %val5, i64 %val6) nounwind
+  ret void
+}
+
+
+; check if callee-saved registers used by inline asm are saved/restored
+define void @foo(i64* %p, i64 %i) nounwind {
+; CHECK: foo:
+; CHECK: push {{{r[4-9]|r10|r11}}
+; CHECK: ldrexd [[REG1:(r[0-9]?[02468])]], {{r[0-9]?[13579]}}, [r{{[0-9]+}}]
+; CHECK: strexd [[REG1]], {{r[0-9]?[02468]}}, {{r[0-9]?[13579]}}
+; CHECK: pop {{{r[4-9]|r10|r11}}
+  %1 = tail call { i64, i64 } asm sideeffect "@ atomic64_set\0A1: ldrexd $0, ${0:H}, [$3]\0Aldrexd $1, ${1:H}, [$3]\0A strexd $0, $4, ${4:H}, [$3]\0A
teq $0, #0\0A bne 1b", "=&r,=&r,=*Qo,r,r,~{cc}"(i64* %p, i64* %p, i64 %i) nounwind + ret void +} diff --git a/test/CodeGen/ARM/inlineasm3.ll b/test/CodeGen/ARM/inlineasm3.ll index 2fcc45f..390a44e 100644 --- a/test/CodeGen/ARM/inlineasm3.ll +++ b/test/CodeGen/ARM/inlineasm3.ll @@ -30,7 +30,7 @@ entry: define hidden void @conv4_8_E() nounwind { entry: -%asmtmp31 = call %0 asm "vld1.u8 {$0}, [$1, :128]!\0A", "=w,=r,1"(<16 x i8>* undef) nounwind +%asmtmp31 = call %0 asm "vld1.u8 {$0}, [$1:128]!\0A", "=w,=r,1"(<16 x i8>* undef) nounwind unreachable } diff --git a/test/CodeGen/ARM/neon_cmp.ll b/test/CodeGen/ARM/neon_cmp.ll new file mode 100644 index 0000000..046b5da --- /dev/null +++ b/test/CodeGen/ARM/neon_cmp.ll @@ -0,0 +1,15 @@ +; RUN: llc < %s -march=arm -mcpu=cortex-a9 | FileCheck %s +; bug 15283 +; radar://13191881 +; CHECK: vfcmp +define void @vfcmp(<2 x double>* %a, <2 x double>* %b) { + %wide.load = load <2 x double>* %a, align 4 + %wide.load2 = load <2 x double>* %b, align 4 +; CHECK-NOT: vdup.32 +; CHECK-NOT: vmovn.i64 + %v1 = fcmp olt <2 x double> %wide.load, %wide.load2 + %v2 = zext <2 x i1> %v1 to <2 x i32> + %v3 = sitofp <2 x i32> %v2 to <2 x double> + store <2 x double> %v3, <2 x double>* %b, align 4 + ret void +} diff --git a/test/CodeGen/ARM/neon_fpconv.ll b/test/CodeGen/ARM/neon_fpconv.ll index 1948ad8..149f4c7 100644 --- a/test/CodeGen/ARM/neon_fpconv.ll +++ b/test/CodeGen/ARM/neon_fpconv.ll @@ -15,3 +15,28 @@ define <2 x double> @vextend(<2 x float> %a) { ret <2 x double> %ve } +; We used to generate vmovs between scalar and vfp/neon registers. +; CHECK: vsitofp_double +define void @vsitofp_double(<2 x i32>* %loadaddr, + <2 x double>* %storeaddr) { + %v0 = load <2 x i32>* %loadaddr +; CHECK: vldr +; CHECK-NEXT: vcvt.f64.s32 +; CHECK-NEXT: vcvt.f64.s32 +; CHECK-NEXT: vst + %r = sitofp <2 x i32> %v0 to <2 x double> + store <2 x double> %r, <2 x double>* %storeaddr + ret void +} +; CHECK: vuitofp_double +define void @vuitofp_double(<2 x i32>* %loadaddr, + <2 x double>* %storeaddr) { + %v0 = load <2 x i32>* %loadaddr +; CHECK: vldr +; CHECK-NEXT: vcvt.f64.u32 +; CHECK-NEXT: vcvt.f64.u32 +; CHECK-NEXT: vst + %r = uitofp <2 x i32> %v0 to <2 x double> + store <2 x double> %r, <2 x double>* %storeaddr + ret void +} diff --git a/test/CodeGen/ARM/neon_ld2.ll b/test/CodeGen/ARM/neon_ld2.ll index 497619e..25a670b 100644 --- a/test/CodeGen/ARM/neon_ld2.ll +++ b/test/CodeGen/ARM/neon_ld2.ll @@ -7,10 +7,10 @@ ; CHECK: vadd.i64 q ; CHECK: vst1.64 ; SWIFT: t1 -; SWIFT: vld1.64 {{.d[0-9]+, d[0-9]+}, \[r[0-9]+, :128\]}} -; SWIFT: vld1.64 {{.d[0-9]+, d[0-9]+}, \[r[0-9]+, :128\]}} +; SWIFT: vld1.64 {{.d[0-9]+, d[0-9]+}, \[r[0-9]+:128\]}} +; SWIFT: vld1.64 {{.d[0-9]+, d[0-9]+}, \[r[0-9]+:128\]}} ; SWIFT: vadd.i64 q -; SWIFT: vst1.64 {{.d[0-9]+, d[0-9]+}, \[r[0-9]+, :128\]}} +; SWIFT: vst1.64 {{.d[0-9]+, d[0-9]+}, \[r[0-9]+:128\]}} define void @t1(<4 x i32>* %r, <2 x i64>* %a, <2 x i64>* %b) nounwind { entry: %0 = load <2 x i64>* %a, align 16 ; <<2 x i64>> [#uses=1] @@ -28,8 +28,8 @@ entry: ; CHECK: vmov r0, r1, d ; CHECK: vmov r2, r3, d ; SWIFT: t2 -; SWIFT: vld1.64 {{.d[0-9]+, d[0-9]+}, \[r[0-9]+, :128\]}} -; SWIFT: vld1.64 {{.d[0-9]+, d[0-9]+}, \[r[0-9]+, :128\]}} +; SWIFT: vld1.64 {{.d[0-9]+, d[0-9]+}, \[r[0-9]+:128\]}} +; SWIFT: vld1.64 {{.d[0-9]+, d[0-9]+}, \[r[0-9]+:128\]}} ; SWIFT: vsub.i64 q ; SWIFT: vmov r0, r1, d ; SWIFT: vmov r2, r3, d diff --git a/test/CodeGen/ARM/reg_sequence.ll b/test/CodeGen/ARM/reg_sequence.ll index 6d6586e..fd2083c 100644 --- a/test/CodeGen/ARM/reg_sequence.ll 
+++ b/test/CodeGen/ARM/reg_sequence.ll @@ -242,8 +242,8 @@ define arm_aapcs_vfpcc float @t9(%0* nocapture, %3* nocapture) nounwind { ; CHECK: vldr ; CHECK-NOT: vmov d{{.*}}, d16 ; CHECK: vmov.i32 d17 -; CHECK-NEXT: vst1.64 {d16, d17}, [r0, :128] -; CHECK-NEXT: vst1.64 {d16, d17}, [r0, :128] +; CHECK-NEXT: vst1.64 {d16, d17}, [r0:128] +; CHECK-NEXT: vst1.64 {d16, d17}, [r0:128] %3 = bitcast double 0.000000e+00 to <2 x float> ; <<2 x float>> [#uses=2] %4 = shufflevector <2 x float> %3, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; <<4 x float>> [#uses=1] store <4 x float> %4, <4 x float>* undef, align 16 diff --git a/test/CodeGen/ARM/sjlj-prepare-critical-edge.ll b/test/CodeGen/ARM/sjlj-prepare-critical-edge.ll new file mode 100644 index 0000000..d8241d0 --- /dev/null +++ b/test/CodeGen/ARM/sjlj-prepare-critical-edge.ll @@ -0,0 +1,67 @@ +; RUN: llc < %s -O1 -mtriple thumbv7-apple-ios6 +; Just make sure no one tries to make the assumption that the normal edge of an +; invoke is never a critical edge. Previously, this code would assert. + +%struct.__CFString = type opaque + +declare void @bar(%struct.__CFString*, %struct.__CFString*) + +define noalias i8* @foo(i8* nocapture %inRefURL) noreturn ssp { +entry: + %call = tail call %struct.__CFString* @bar3() + %call2 = invoke i8* @bar2() + to label %for.cond unwind label %lpad + +for.cond: ; preds = %entry, %for.cond + invoke void @bar(%struct.__CFString* undef, %struct.__CFString* null) + to label %for.cond unwind label %lpad5 + +lpad: ; preds = %entry + %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*) + cleanup + %1 = extractvalue { i8*, i32 } %0, 0 + %2 = extractvalue { i8*, i32 } %0, 1 + br label %ehcleanup + +lpad5: ; preds = %for.cond + %3 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*) + cleanup + %4 = extractvalue { i8*, i32 } %3, 0 + %5 = extractvalue { i8*, i32 } %3, 1 + invoke void @release(i8* %call2) + to label %ehcleanup unwind label %terminate.lpad.i.i16 + +terminate.lpad.i.i16: ; preds = %lpad5 + %6 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*) + catch i8* null + tail call void @terminatev() noreturn nounwind + unreachable + +ehcleanup: ; preds = %lpad5, %lpad + %exn.slot.0 = phi i8* [ %1, %lpad ], [ %4, %lpad5 ] + %ehselector.slot.0 = phi i32 [ %2, %lpad ], [ %5, %lpad5 ] + %7 = bitcast %struct.__CFString* %call to i8* + invoke void @release(i8* %7) + to label %_ZN5SmartIPK10__CFStringED1Ev.exit unwind label %terminate.lpad.i.i + +terminate.lpad.i.i: ; preds = %ehcleanup + %8 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*) + catch i8* null + tail call void @terminatev() noreturn nounwind + unreachable + +_ZN5SmartIPK10__CFStringED1Ev.exit: ; preds = %ehcleanup + %lpad.val = insertvalue { i8*, i32 } undef, i8* %exn.slot.0, 0 + %lpad.val12 = insertvalue { i8*, i32 } %lpad.val, i32 %ehselector.slot.0, 1 + resume { i8*, i32 } %lpad.val12 +} + +declare %struct.__CFString* @bar3() + +declare i8* @bar2() + +declare i32 @__gxx_personality_sj0(...) 
+ +declare void @release(i8*) + +declare void @terminatev() diff --git a/test/CodeGen/ARM/spill-q.ll b/test/CodeGen/ARM/spill-q.ll index 057ea11..e93cdbc 100644 --- a/test/CodeGen/ARM/spill-q.ll +++ b/test/CodeGen/ARM/spill-q.ll @@ -12,8 +12,8 @@ declare <4 x float> @llvm.arm.neon.vld1.v4f32(i8*, i32) nounwind readonly define void @aaa(%quuz* %this, i8* %block) { ; CHECK: aaa: ; CHECK: bic {{.*}}, #15 -; CHECK: vst1.64 {{.*}}sp, :128 -; CHECK: vld1.64 {{.*}}sp, :128 +; CHECK: vst1.64 {{.*}}sp:128 +; CHECK: vld1.64 {{.*}}sp:128 entry: %aligned_vec = alloca <4 x float>, align 16 %"alloca point" = bitcast i32 0 to i32 diff --git a/test/CodeGen/ARM/trap.ll b/test/CodeGen/ARM/trap.ll index 21865f8..a4e3c3c 100644 --- a/test/CodeGen/ARM/trap.ll +++ b/test/CodeGen/ARM/trap.ll @@ -1,5 +1,23 @@ ; RUN: llc < %s -mtriple=arm-apple-darwin | FileCheck %s -check-prefix=INSTR ; RUN: llc < %s -mtriple=arm-apple-darwin -trap-func=_trap | FileCheck %s -check-prefix=FUNC +; RUN: llc -mtriple=armv7-unknown-nacl -filetype=obj %s -o - \ +; RUN: | llvm-objdump -disassemble -triple armv7-unknown-nacl - \ +; RUN: | FileCheck %s -check-prefix=ENCODING-NACL +; RUN: llc -mtriple=armv7-unknown-nacl -filetype=obj %s -o - \ +; RUN: | llvm-objdump -disassemble -triple armv7 -mattr=+nacl-trap - \ +; RUN: | FileCheck %s -check-prefix=ENCODING-NACL +; RUN: llc -mtriple=armv7 -mattr=+nacl-trap -filetype=obj %s -o - \ +; RUN: | llvm-objdump -disassemble -triple armv7 -mattr=+nacl-trap - \ +; RUN: | FileCheck %s -check-prefix=ENCODING-NACL +; RUN: llc -fast-isel -mtriple=armv7-unknown-nacl -filetype=obj %s -o - \ +; RUN: | llvm-objdump -disassemble -triple armv7-unknown-nacl - \ +; RUN: | FileCheck %s -check-prefix=ENCODING-NACL +; RUN: llc -mtriple=armv7 -filetype=obj %s -o - \ +; RUN: | llvm-objdump -disassemble -triple armv7 - \ +; RUN: | FileCheck %s -check-prefix=ENCODING-ALL +; RUN: llc -fast-isel -mtriple=armv7 -filetype=obj %s -o - \ +; RUN: | llvm-objdump -disassemble -triple armv7 - \ +; RUN: | FileCheck %s -check-prefix=ENCODING-ALL ; rdar://7961298 ; rdar://9249183 @@ -10,6 +28,11 @@ entry: ; FUNC: t: ; FUNC: bl __trap + +; ENCODING-NACL: f0 de fe e7 + +; ENCODING-ALL: fe de ff e7 + call void @llvm.trap() unreachable } @@ -21,6 +44,11 @@ entry: ; FUNC: t2: ; FUNC: bl __trap + +; ENCODING-NACL: f0 de fe e7 + +; ENCODING-ALL: fe de ff e7 + call void @llvm.debugtrap() unreachable } diff --git a/test/CodeGen/ARM/vector-DAGCombine.ll b/test/CodeGen/ARM/vector-DAGCombine.ll index a38a0fe..42964de 100644 --- a/test/CodeGen/ARM/vector-DAGCombine.ll +++ b/test/CodeGen/ARM/vector-DAGCombine.ll @@ -133,3 +133,30 @@ define i16 @foldBuildVectors() { %3 = extractelement <8 x i16> %2, i32 0 ret i16 %3 } + +; Test that we are generating vrev and vext for reverse shuffles of v8i16 +; vectors. +; CHECK: reverse_v8i16 +define void @reverse_v8i16(<8 x i16>* %loadaddr, <8 x i16>* %storeaddr) { + %v0 = load <8 x i16>* %loadaddr + ; CHECK: vrev64.16 + ; CHECK: vext.16 + %v1 = shufflevector <8 x i16> %v0, <8 x i16> undef, + <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> + store <8 x i16> %v1, <8 x i16>* %storeaddr + ret void +} + +; Test that we are generating vrev and vext for reverse shuffles of v16i8 +; vectors. 
+; CHECK: reverse_v16i8 +define void @reverse_v16i8(<16 x i8>* %loadaddr, <16 x i8>* %storeaddr) { + %v0 = load <16 x i8>* %loadaddr + ; CHECK: vrev64.8 + ; CHECK: vext.8 + %v1 = shufflevector <16 x i8> %v0, <16 x i8> undef, + <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, + i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> + store <16 x i8> %v1, <16 x i8>* %storeaddr + ret void +} diff --git a/test/CodeGen/ARM/vld1.ll b/test/CodeGen/ARM/vld1.ll index e524395..994f05d 100644 --- a/test/CodeGen/ARM/vld1.ll +++ b/test/CodeGen/ARM/vld1.ll @@ -4,7 +4,7 @@ define <8 x i8> @vld1i8(i8* %A) nounwind { ;CHECK: vld1i8: ;Check the alignment value. Max for this instruction is 64 bits: -;CHECK: vld1.8 {d16}, [r0, :64] +;CHECK: vld1.8 {d16}, [r0:64] %tmp1 = call <8 x i8> @llvm.arm.neon.vld1.v8i8(i8* %A, i32 16) ret <8 x i8> %tmp1 } @@ -68,7 +68,7 @@ define <1 x i64> @vld1i64(i64* %A) nounwind { define <16 x i8> @vld1Qi8(i8* %A) nounwind { ;CHECK: vld1Qi8: ;Check the alignment value. Max for this instruction is 128 bits: -;CHECK: vld1.8 {d16, d17}, [r0, :64] +;CHECK: vld1.8 {d16, d17}, [r0:64] %tmp1 = call <16 x i8> @llvm.arm.neon.vld1.v16i8(i8* %A, i32 8) ret <16 x i8> %tmp1 } @@ -76,7 +76,7 @@ define <16 x i8> @vld1Qi8(i8* %A) nounwind { ;Check for a post-increment updating load. define <16 x i8> @vld1Qi8_update(i8** %ptr) nounwind { ;CHECK: vld1Qi8_update: -;CHECK: vld1.8 {d16, d17}, [{{r[0-9]+}}, :64]! +;CHECK: vld1.8 {d16, d17}, [{{r[0-9]+}}:64]! %A = load i8** %ptr %tmp1 = call <16 x i8> @llvm.arm.neon.vld1.v16i8(i8* %A, i32 8) %tmp2 = getelementptr i8* %A, i32 16 @@ -87,7 +87,7 @@ define <16 x i8> @vld1Qi8_update(i8** %ptr) nounwind { define <8 x i16> @vld1Qi16(i16* %A) nounwind { ;CHECK: vld1Qi16: ;Check the alignment value. Max for this instruction is 128 bits: -;CHECK: vld1.16 {d16, d17}, [r0, :128] +;CHECK: vld1.16 {d16, d17}, [r0:128] %tmp0 = bitcast i16* %A to i8* %tmp1 = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %tmp0, i32 32) ret <8 x i16> %tmp1 diff --git a/test/CodeGen/ARM/vld2.ll b/test/CodeGen/ARM/vld2.ll index 29b3794..caa016e 100644 --- a/test/CodeGen/ARM/vld2.ll +++ b/test/CodeGen/ARM/vld2.ll @@ -14,7 +14,7 @@ define <8 x i8> @vld2i8(i8* %A) nounwind { ;CHECK: vld2i8: ;Check the alignment value. Max for this instruction is 128 bits: -;CHECK: vld2.8 {d16, d17}, [r0, :64] +;CHECK: vld2.8 {d16, d17}, [r0:64] %tmp1 = call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2.v8i8(i8* %A, i32 8) %tmp2 = extractvalue %struct.__neon_int8x8x2_t %tmp1, 0 %tmp3 = extractvalue %struct.__neon_int8x8x2_t %tmp1, 1 @@ -25,7 +25,7 @@ define <8 x i8> @vld2i8(i8* %A) nounwind { define <4 x i16> @vld2i16(i16* %A) nounwind { ;CHECK: vld2i16: ;Check the alignment value. Max for this instruction is 128 bits: -;CHECK: vld2.16 {d16, d17}, [r0, :128] +;CHECK: vld2.16 {d16, d17}, [r0:128] %tmp0 = bitcast i16* %A to i8* %tmp1 = call %struct.__neon_int16x4x2_t @llvm.arm.neon.vld2.v4i16(i8* %tmp0, i32 32) %tmp2 = extractvalue %struct.__neon_int16x4x2_t %tmp1, 0 @@ -74,7 +74,7 @@ define <2 x float> @vld2f_update(float** %ptr) nounwind { define <1 x i64> @vld2i64(i64* %A) nounwind { ;CHECK: vld2i64: ;Check the alignment value. 
Max for this instruction is 128 bits: -;CHECK: vld1.64 {d16, d17}, [r0, :128] +;CHECK: vld1.64 {d16, d17}, [r0:128] %tmp0 = bitcast i64* %A to i8* %tmp1 = call %struct.__neon_int64x1x2_t @llvm.arm.neon.vld2.v1i64(i8* %tmp0, i32 32) %tmp2 = extractvalue %struct.__neon_int64x1x2_t %tmp1, 0 @@ -86,7 +86,7 @@ define <1 x i64> @vld2i64(i64* %A) nounwind { define <16 x i8> @vld2Qi8(i8* %A) nounwind { ;CHECK: vld2Qi8: ;Check the alignment value. Max for this instruction is 256 bits: -;CHECK: vld2.8 {d16, d17, d18, d19}, [r0, :64] +;CHECK: vld2.8 {d16, d17, d18, d19}, [r0:64] %tmp1 = call %struct.__neon_int8x16x2_t @llvm.arm.neon.vld2.v16i8(i8* %A, i32 8) %tmp2 = extractvalue %struct.__neon_int8x16x2_t %tmp1, 0 %tmp3 = extractvalue %struct.__neon_int8x16x2_t %tmp1, 1 @@ -97,7 +97,7 @@ define <16 x i8> @vld2Qi8(i8* %A) nounwind { ;Check for a post-increment updating load with register increment. define <16 x i8> @vld2Qi8_update(i8** %ptr, i32 %inc) nounwind { ;CHECK: vld2Qi8_update: -;CHECK: vld2.8 {d16, d17, d18, d19}, [r2, :128], r1 +;CHECK: vld2.8 {d16, d17, d18, d19}, [r2:128], r1 %A = load i8** %ptr %tmp1 = call %struct.__neon_int8x16x2_t @llvm.arm.neon.vld2.v16i8(i8* %A, i32 16) %tmp2 = extractvalue %struct.__neon_int8x16x2_t %tmp1, 0 @@ -111,7 +111,7 @@ define <16 x i8> @vld2Qi8_update(i8** %ptr, i32 %inc) nounwind { define <8 x i16> @vld2Qi16(i16* %A) nounwind { ;CHECK: vld2Qi16: ;Check the alignment value. Max for this instruction is 256 bits: -;CHECK: vld2.16 {d16, d17, d18, d19}, [r0, :128] +;CHECK: vld2.16 {d16, d17, d18, d19}, [r0:128] %tmp0 = bitcast i16* %A to i8* %tmp1 = call %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2.v8i16(i8* %tmp0, i32 16) %tmp2 = extractvalue %struct.__neon_int16x8x2_t %tmp1, 0 @@ -123,7 +123,7 @@ define <8 x i16> @vld2Qi16(i16* %A) nounwind { define <4 x i32> @vld2Qi32(i32* %A) nounwind { ;CHECK: vld2Qi32: ;Check the alignment value. Max for this instruction is 256 bits: -;CHECK: vld2.32 {d16, d17, d18, d19}, [r0, :256] +;CHECK: vld2.32 {d16, d17, d18, d19}, [r0:256] %tmp0 = bitcast i32* %A to i8* %tmp1 = call %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2.v4i32(i8* %tmp0, i32 64) %tmp2 = extractvalue %struct.__neon_int32x4x2_t %tmp1, 0 diff --git a/test/CodeGen/ARM/vld3.ll b/test/CodeGen/ARM/vld3.ll index b495319..ad63e1f 100644 --- a/test/CodeGen/ARM/vld3.ll +++ b/test/CodeGen/ARM/vld3.ll @@ -15,7 +15,7 @@ define <8 x i8> @vld3i8(i8* %A) nounwind { ;CHECK: vld3i8: ;Check the alignment value. Max for this instruction is 64 bits: -;CHECK: vld3.8 {d16, d17, d18}, [r0, :64] +;CHECK: vld3.8 {d16, d17, d18}, [r0:64] %tmp1 = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A, i32 32) %tmp2 = extractvalue %struct.__neon_int8x8x3_t %tmp1, 0 %tmp3 = extractvalue %struct.__neon_int8x8x3_t %tmp1, 2 @@ -74,7 +74,7 @@ define <2 x float> @vld3f(float* %A) nounwind { define <1 x i64> @vld3i64(i64* %A) nounwind { ;CHECK: vld3i64: ;Check the alignment value. Max for this instruction is 64 bits: -;CHECK: vld1.64 {d16, d17, d18}, [r0, :64] +;CHECK: vld1.64 {d16, d17, d18}, [r0:64] %tmp0 = bitcast i64* %A to i8* %tmp1 = call %struct.__neon_int64x1x3_t @llvm.arm.neon.vld3.v1i64(i8* %tmp0, i32 16) %tmp2 = extractvalue %struct.__neon_int64x1x3_t %tmp1, 0 @@ -86,8 +86,8 @@ define <1 x i64> @vld3i64(i64* %A) nounwind { define <16 x i8> @vld3Qi8(i8* %A) nounwind { ;CHECK: vld3Qi8: ;Check the alignment value. Max for this instruction is 64 bits: -;CHECK: vld3.8 {d16, d18, d20}, [r0, :64]! 
-;CHECK: vld3.8 {d17, d19, d21}, [r0, :64] +;CHECK: vld3.8 {d16, d18, d20}, [r0:64]! +;CHECK: vld3.8 {d17, d19, d21}, [r0:64] %tmp1 = call %struct.__neon_int8x16x3_t @llvm.arm.neon.vld3.v16i8(i8* %A, i32 32) %tmp2 = extractvalue %struct.__neon_int8x16x3_t %tmp1, 0 %tmp3 = extractvalue %struct.__neon_int8x16x3_t %tmp1, 2 diff --git a/test/CodeGen/ARM/vld4.ll b/test/CodeGen/ARM/vld4.ll index 59a73db..9ee5fe4 100644 --- a/test/CodeGen/ARM/vld4.ll +++ b/test/CodeGen/ARM/vld4.ll @@ -14,7 +14,7 @@ define <8 x i8> @vld4i8(i8* %A) nounwind { ;CHECK: vld4i8: ;Check the alignment value. Max for this instruction is 256 bits: -;CHECK: vld4.8 {d16, d17, d18, d19}, [r0, :64] +;CHECK: vld4.8 {d16, d17, d18, d19}, [r0:64] %tmp1 = call %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4.v8i8(i8* %A, i32 8) %tmp2 = extractvalue %struct.__neon_int8x8x4_t %tmp1, 0 %tmp3 = extractvalue %struct.__neon_int8x8x4_t %tmp1, 2 @@ -25,7 +25,7 @@ define <8 x i8> @vld4i8(i8* %A) nounwind { ;Check for a post-increment updating load with register increment. define <8 x i8> @vld4i8_update(i8** %ptr, i32 %inc) nounwind { ;CHECK: vld4i8_update: -;CHECK: vld4.8 {d16, d17, d18, d19}, [r2, :128], r1 +;CHECK: vld4.8 {d16, d17, d18, d19}, [r2:128], r1 %A = load i8** %ptr %tmp1 = call %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4.v8i8(i8* %A, i32 16) %tmp2 = extractvalue %struct.__neon_int8x8x4_t %tmp1, 0 @@ -39,7 +39,7 @@ define <8 x i8> @vld4i8_update(i8** %ptr, i32 %inc) nounwind { define <4 x i16> @vld4i16(i16* %A) nounwind { ;CHECK: vld4i16: ;Check the alignment value. Max for this instruction is 256 bits: -;CHECK: vld4.16 {d16, d17, d18, d19}, [r0, :128] +;CHECK: vld4.16 {d16, d17, d18, d19}, [r0:128] %tmp0 = bitcast i16* %A to i8* %tmp1 = call %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4.v4i16(i8* %tmp0, i32 16) %tmp2 = extractvalue %struct.__neon_int16x4x4_t %tmp1, 0 @@ -51,7 +51,7 @@ define <4 x i16> @vld4i16(i16* %A) nounwind { define <2 x i32> @vld4i32(i32* %A) nounwind { ;CHECK: vld4i32: ;Check the alignment value. Max for this instruction is 256 bits: -;CHECK: vld4.32 {d16, d17, d18, d19}, [r0, :256] +;CHECK: vld4.32 {d16, d17, d18, d19}, [r0:256] %tmp0 = bitcast i32* %A to i8* %tmp1 = call %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4.v2i32(i8* %tmp0, i32 32) %tmp2 = extractvalue %struct.__neon_int32x2x4_t %tmp1, 0 @@ -74,7 +74,7 @@ define <2 x float> @vld4f(float* %A) nounwind { define <1 x i64> @vld4i64(i64* %A) nounwind { ;CHECK: vld4i64: ;Check the alignment value. Max for this instruction is 256 bits: -;CHECK: vld1.64 {d16, d17, d18, d19}, [r0, :256] +;CHECK: vld1.64 {d16, d17, d18, d19}, [r0:256] %tmp0 = bitcast i64* %A to i8* %tmp1 = call %struct.__neon_int64x1x4_t @llvm.arm.neon.vld4.v1i64(i8* %tmp0, i32 64) %tmp2 = extractvalue %struct.__neon_int64x1x4_t %tmp1, 0 @@ -86,8 +86,8 @@ define <1 x i64> @vld4i64(i64* %A) nounwind { define <16 x i8> @vld4Qi8(i8* %A) nounwind { ;CHECK: vld4Qi8: ;Check the alignment value. Max for this instruction is 256 bits: -;CHECK: vld4.8 {d16, d18, d20, d22}, [r0, :256]! -;CHECK: vld4.8 {d17, d19, d21, d23}, [r0, :256] +;CHECK: vld4.8 {d16, d18, d20, d22}, [r0:256]! +;CHECK: vld4.8 {d17, d19, d21, d23}, [r0:256] %tmp1 = call %struct.__neon_int8x16x4_t @llvm.arm.neon.vld4.v16i8(i8* %A, i32 64) %tmp2 = extractvalue %struct.__neon_int8x16x4_t %tmp1, 0 %tmp3 = extractvalue %struct.__neon_int8x16x4_t %tmp1, 2 @@ -111,8 +111,8 @@ define <8 x i16> @vld4Qi16(i16* %A) nounwind { ;Check for a post-increment updating load. 
define <8 x i16> @vld4Qi16_update(i16** %ptr) nounwind { ;CHECK: vld4Qi16_update: -;CHECK: vld4.16 {d16, d18, d20, d22}, [r1, :64]! -;CHECK: vld4.16 {d17, d19, d21, d23}, [r1, :64]! +;CHECK: vld4.16 {d16, d18, d20, d22}, [r1:64]! +;CHECK: vld4.16 {d17, d19, d21, d23}, [r1:64]! %A = load i16** %ptr %tmp0 = bitcast i16* %A to i8* %tmp1 = call %struct.__neon_int16x8x4_t @llvm.arm.neon.vld4.v8i16(i8* %tmp0, i32 8) diff --git a/test/CodeGen/ARM/vlddup.ll b/test/CodeGen/ARM/vlddup.ll index c69473f..7c7319c 100644 --- a/test/CodeGen/ARM/vlddup.ll +++ b/test/CodeGen/ARM/vlddup.ll @@ -13,7 +13,7 @@ define <8 x i8> @vld1dupi8(i8* %A) nounwind { define <4 x i16> @vld1dupi16(i16* %A) nounwind { ;CHECK: vld1dupi16: ;Check the alignment value. Max for this instruction is 16 bits: -;CHECK: vld1.16 {d16[]}, [r0, :16] +;CHECK: vld1.16 {d16[]}, [r0:16] %tmp1 = load i16* %A, align 8 %tmp2 = insertelement <4 x i16> undef, i16 %tmp1, i32 0 %tmp3 = shufflevector <4 x i16> %tmp2, <4 x i16> undef, <4 x i32> zeroinitializer @@ -23,7 +23,7 @@ define <4 x i16> @vld1dupi16(i16* %A) nounwind { define <2 x i32> @vld1dupi32(i32* %A) nounwind { ;CHECK: vld1dupi32: ;Check the alignment value. Max for this instruction is 32 bits: -;CHECK: vld1.32 {d16[]}, [r0, :32] +;CHECK: vld1.32 {d16[]}, [r0:32] %tmp1 = load i32* %A, align 8 %tmp2 = insertelement <2 x i32> undef, i32 %tmp1, i32 0 %tmp3 = shufflevector <2 x i32> %tmp2, <2 x i32> undef, <2 x i32> zeroinitializer @@ -32,7 +32,7 @@ define <2 x i32> @vld1dupi32(i32* %A) nounwind { define <2 x float> @vld1dupf(float* %A) nounwind { ;CHECK: vld1dupf: -;CHECK: vld1.32 {d16[]}, [r0, :32] +;CHECK: vld1.32 {d16[]}, [r0:32] %tmp0 = load float* %A %tmp1 = insertelement <2 x float> undef, float %tmp0, i32 0 %tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> zeroinitializer @@ -51,7 +51,7 @@ define <16 x i8> @vld1dupQi8(i8* %A) nounwind { define <4 x float> @vld1dupQf(float* %A) nounwind { ;CHECK: vld1dupQf: -;CHECK: vld1.32 {d16[], d17[]}, [r0, :32] +;CHECK: vld1.32 {d16[], d17[]}, [r0:32] %tmp0 = load float* %A %tmp1 = insertelement <4 x float> undef, float %tmp0, i32 0 %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> zeroinitializer @@ -109,7 +109,7 @@ define <4 x i16> @vld2dupi16_update(i16** %ptr) nounwind { define <2 x i32> @vld2dupi32(i8* %A) nounwind { ;CHECK: vld2dupi32: ;Check the alignment value. Max for this instruction is 64 bits: -;CHECK: vld2.32 {d16[], d17[]}, [r0, :64] +;CHECK: vld2.32 {d16[], d17[]}, [r0:64] %tmp0 = tail call %struct.__neon_int2x32x2_t @llvm.arm.neon.vld2lane.v2i32(i8* %A, <2 x i32> undef, <2 x i32> undef, i32 0, i32 16) %tmp1 = extractvalue %struct.__neon_int2x32x2_t %tmp0, 0 %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> zeroinitializer @@ -194,7 +194,7 @@ define <2 x i32> @vld4dupi32(i8* %A) nounwind { ;CHECK: vld4dupi32: ;Check the alignment value. An 8-byte alignment is allowed here even though ;it is smaller than the total size of the memory being loaded. 
-;CHECK: vld4.32 {d16[], d17[], d18[], d19[]}, [r0, :64] +;CHECK: vld4.32 {d16[], d17[], d18[], d19[]}, [r0:64] %tmp0 = tail call %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32(i8* %A, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, i32 0, i32 8) %tmp1 = extractvalue %struct.__neon_int32x2x4_t %tmp0, 0 %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> zeroinitializer diff --git a/test/CodeGen/ARM/vldlane.ll b/test/CodeGen/ARM/vldlane.ll index 7bd0cbd..f35fa92 100644 --- a/test/CodeGen/ARM/vldlane.ll +++ b/test/CodeGen/ARM/vldlane.ll @@ -14,7 +14,7 @@ define <8 x i8> @vld1lanei8(i8* %A, <8 x i8>* %B) nounwind { define <4 x i16> @vld1lanei16(i16* %A, <4 x i16>* %B) nounwind { ;CHECK: vld1lanei16: ;Check the alignment value. Max for this instruction is 16 bits: -;CHECK: vld1.16 {d16[2]}, [r0, :16] +;CHECK: vld1.16 {d16[2]}, [r0:16] %tmp1 = load <4 x i16>* %B %tmp2 = load i16* %A, align 8 %tmp3 = insertelement <4 x i16> %tmp1, i16 %tmp2, i32 2 @@ -24,7 +24,7 @@ define <4 x i16> @vld1lanei16(i16* %A, <4 x i16>* %B) nounwind { define <2 x i32> @vld1lanei32(i32* %A, <2 x i32>* %B) nounwind { ;CHECK: vld1lanei32: ;Check the alignment value. Max for this instruction is 32 bits: -;CHECK: vld1.32 {d16[1]}, [r0, :32] +;CHECK: vld1.32 {d16[1]}, [r0:32] %tmp1 = load <2 x i32>* %B %tmp2 = load i32* %A, align 8 %tmp3 = insertelement <2 x i32> %tmp1, i32 %tmp2, i32 1 @@ -34,7 +34,7 @@ define <2 x i32> @vld1lanei32(i32* %A, <2 x i32>* %B) nounwind { define <2 x i32> @vld1lanei32a32(i32* %A, <2 x i32>* %B) nounwind { ;CHECK: vld1lanei32a32: ;Check the alignment value. Legal values are none or :32. -;CHECK: vld1.32 {d16[1]}, [r0, :32] +;CHECK: vld1.32 {d16[1]}, [r0:32] %tmp1 = load <2 x i32>* %B %tmp2 = load i32* %A, align 4 %tmp3 = insertelement <2 x i32> %tmp1, i32 %tmp2, i32 1 @@ -43,7 +43,7 @@ define <2 x i32> @vld1lanei32a32(i32* %A, <2 x i32>* %B) nounwind { define <2 x float> @vld1lanef(float* %A, <2 x float>* %B) nounwind { ;CHECK: vld1lanef: -;CHECK: vld1.32 {d16[1]}, [r0, :32] +;CHECK: vld1.32 {d16[1]}, [r0:32] %tmp1 = load <2 x float>* %B %tmp2 = load float* %A, align 4 %tmp3 = insertelement <2 x float> %tmp1, float %tmp2, i32 1 @@ -61,7 +61,7 @@ define <16 x i8> @vld1laneQi8(i8* %A, <16 x i8>* %B) nounwind { define <8 x i16> @vld1laneQi16(i16* %A, <8 x i16>* %B) nounwind { ;CHECK: vld1laneQi16: -;CHECK: vld1.16 {d17[1]}, [r0, :16] +;CHECK: vld1.16 {d17[1]}, [r0:16] %tmp1 = load <8 x i16>* %B %tmp2 = load i16* %A, align 8 %tmp3 = insertelement <8 x i16> %tmp1, i16 %tmp2, i32 5 @@ -70,7 +70,7 @@ define <8 x i16> @vld1laneQi16(i16* %A, <8 x i16>* %B) nounwind { define <4 x i32> @vld1laneQi32(i32* %A, <4 x i32>* %B) nounwind { ;CHECK: vld1laneQi32: -;CHECK: vld1.32 {d17[1]}, [r0, :32] +;CHECK: vld1.32 {d17[1]}, [r0:32] %tmp1 = load <4 x i32>* %B %tmp2 = load i32* %A, align 8 %tmp3 = insertelement <4 x i32> %tmp1, i32 %tmp2, i32 3 @@ -79,7 +79,7 @@ define <4 x i32> @vld1laneQi32(i32* %A, <4 x i32>* %B) nounwind { define <4 x float> @vld1laneQf(float* %A, <4 x float>* %B) nounwind { ;CHECK: vld1laneQf: -;CHECK: vld1.32 {d16[0]}, [r0, :32] +;CHECK: vld1.32 {d16[0]}, [r0:32] %tmp1 = load <4 x float>* %B %tmp2 = load float* %A %tmp3 = insertelement <4 x float> %tmp1, float %tmp2, i32 0 @@ -98,7 +98,7 @@ define <4 x float> @vld1laneQf(float* %A, <4 x float>* %B) nounwind { define <8 x i8> @vld2lanei8(i8* %A, <8 x i8>* %B) nounwind { ;CHECK: vld2lanei8: ;Check the alignment value. 
Max for this instruction is 16 bits: -;CHECK: vld2.8 {d16[1], d17[1]}, [r0, :16] +;CHECK: vld2.8 {d16[1], d17[1]}, [r0:16] %tmp1 = load <8 x i8>* %B %tmp2 = call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 4) %tmp3 = extractvalue %struct.__neon_int8x8x2_t %tmp2, 0 @@ -110,7 +110,7 @@ define <8 x i8> @vld2lanei8(i8* %A, <8 x i8>* %B) nounwind { define <4 x i16> @vld2lanei16(i16* %A, <4 x i16>* %B) nounwind { ;CHECK: vld2lanei16: ;Check the alignment value. Max for this instruction is 32 bits: -;CHECK: vld2.16 {d16[1], d17[1]}, [r0, :32] +;CHECK: vld2.16 {d16[1], d17[1]}, [r0:32] %tmp0 = bitcast i16* %A to i8* %tmp1 = load <4 x i16>* %B %tmp2 = call %struct.__neon_int16x4x2_t @llvm.arm.neon.vld2lane.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 8) @@ -176,7 +176,7 @@ define <8 x i16> @vld2laneQi16(i16* %A, <8 x i16>* %B) nounwind { define <4 x i32> @vld2laneQi32(i32* %A, <4 x i32>* %B) nounwind { ;CHECK: vld2laneQi32: ;Check the alignment value. Max for this instruction is 64 bits: -;CHECK: vld2.32 {d17[0], d19[0]}, [{{r[0-9]+}}, :64] +;CHECK: vld2.32 {d17[0], d19[0]}, [{{r[0-9]+}}:64] %tmp0 = bitcast i32* %A to i8* %tmp1 = load <4 x i32>* %B %tmp2 = call %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2lane.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 2, i32 16) @@ -354,7 +354,7 @@ declare %struct.__neon_float32x4x3_t @llvm.arm.neon.vld3lane.v4f32(i8*, <4 x flo define <8 x i8> @vld4lanei8(i8* %A, <8 x i8>* %B) nounwind { ;CHECK: vld4lanei8: ;Check the alignment value. Max for this instruction is 32 bits: -;CHECK: vld4.8 {d{{.*}}[1], d{{.*}}[1], d{{.*}}[1], d{{.*}}[1]}, [{{r[0-9]+}}, :32] +;CHECK: vld4.8 {d{{.*}}[1], d{{.*}}[1], d{{.*}}[1], d{{.*}}[1]}, [{{r[0-9]+}}:32] %tmp1 = load <8 x i8>* %B %tmp2 = call %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 8) %tmp3 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 0 @@ -370,7 +370,7 @@ define <8 x i8> @vld4lanei8(i8* %A, <8 x i8>* %B) nounwind { ;Check for a post-increment updating load. define <8 x i8> @vld4lanei8_update(i8** %ptr, <8 x i8>* %B) nounwind { ;CHECK: vld4lanei8_update: -;CHECK: vld4.8 {d16[1], d17[1], d18[1], d19[1]}, [{{r[0-9]+}}, :32]! +;CHECK: vld4.8 {d16[1], d17[1], d18[1], d19[1]}, [{{r[0-9]+}}:32]! %A = load i8** %ptr %tmp1 = load <8 x i8>* %B %tmp2 = call %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 8) @@ -408,7 +408,7 @@ define <2 x i32> @vld4lanei32(i32* %A, <2 x i32>* %B) nounwind { ;CHECK: vld4lanei32: ;Check the alignment value. An 8-byte alignment is allowed here even though ;it is smaller than the total size of the memory being loaded. -;CHECK: vld4.32 {d16[1], d17[1], d18[1], d19[1]}, [{{r[0-9]+}}, :64] +;CHECK: vld4.32 {d16[1], d17[1], d18[1], d19[1]}, [{{r[0-9]+}}:64] %tmp0 = bitcast i32* %A to i8* %tmp1 = load <2 x i32>* %B %tmp2 = call %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 8) @@ -441,7 +441,7 @@ define <2 x float> @vld4lanef(float* %A, <2 x float>* %B) nounwind { define <8 x i16> @vld4laneQi16(i16* %A, <8 x i16>* %B) nounwind { ;CHECK: vld4laneQi16: ;Check the alignment value. 
Max for this instruction is 64 bits: -;CHECK: vld4.16 {d16[1], d18[1], d20[1], d22[1]}, [{{r[0-9]+}}, :64] +;CHECK: vld4.16 {d16[1], d18[1], d20[1], d22[1]}, [{{r[0-9]+}}:64] %tmp0 = bitcast i16* %A to i8* %tmp1 = load <8 x i16>* %B %tmp2 = call %struct.__neon_int16x8x4_t @llvm.arm.neon.vld4lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1, i32 16) diff --git a/test/CodeGen/ARM/vst1.ll b/test/CodeGen/ARM/vst1.ll index 364d44b..e1f3e88 100644 --- a/test/CodeGen/ARM/vst1.ll +++ b/test/CodeGen/ARM/vst1.ll @@ -3,7 +3,7 @@ define void @vst1i8(i8* %A, <8 x i8>* %B) nounwind { ;CHECK: vst1i8: ;Check the alignment value. Max for this instruction is 64 bits: -;CHECK: vst1.8 {d16}, [r0, :64] +;CHECK: vst1.8 {d16}, [r0:64] %tmp1 = load <8 x i8>* %B call void @llvm.arm.neon.vst1.v8i8(i8* %A, <8 x i8> %tmp1, i32 16) ret void @@ -61,7 +61,7 @@ define void @vst1i64(i64* %A, <1 x i64>* %B) nounwind { define void @vst1Qi8(i8* %A, <16 x i8>* %B) nounwind { ;CHECK: vst1Qi8: ;Check the alignment value. Max for this instruction is 128 bits: -;CHECK: vst1.8 {d16, d17}, [r0, :64] +;CHECK: vst1.8 {d16, d17}, [r0:64] %tmp1 = load <16 x i8>* %B call void @llvm.arm.neon.vst1.v16i8(i8* %A, <16 x i8> %tmp1, i32 8) ret void @@ -70,7 +70,7 @@ define void @vst1Qi8(i8* %A, <16 x i8>* %B) nounwind { define void @vst1Qi16(i16* %A, <8 x i16>* %B) nounwind { ;CHECK: vst1Qi16: ;Check the alignment value. Max for this instruction is 128 bits: -;CHECK: vst1.16 {d16, d17}, [r0, :128] +;CHECK: vst1.16 {d16, d17}, [r0:128] %tmp0 = bitcast i16* %A to i8* %tmp1 = load <8 x i16>* %B call void @llvm.arm.neon.vst1.v8i16(i8* %tmp0, <8 x i16> %tmp1, i32 32) @@ -80,7 +80,7 @@ define void @vst1Qi16(i16* %A, <8 x i16>* %B) nounwind { ;Check for a post-increment updating store with register increment. define void @vst1Qi16_update(i16** %ptr, <8 x i16>* %B, i32 %inc) nounwind { ;CHECK: vst1Qi16_update: -;CHECK: vst1.16 {d16, d17}, [r1, :64], r2 +;CHECK: vst1.16 {d16, d17}, [r1:64], r2 %A = load i16** %ptr %tmp0 = bitcast i16* %A to i8* %tmp1 = load <8 x i16>* %B diff --git a/test/CodeGen/ARM/vst2.ll b/test/CodeGen/ARM/vst2.ll index fb05a20..a31f863 100644 --- a/test/CodeGen/ARM/vst2.ll +++ b/test/CodeGen/ARM/vst2.ll @@ -3,7 +3,7 @@ define void @vst2i8(i8* %A, <8 x i8>* %B) nounwind { ;CHECK: vst2i8: ;Check the alignment value. Max for this instruction is 128 bits: -;CHECK: vst2.8 {d16, d17}, [r0, :64] +;CHECK: vst2.8 {d16, d17}, [r0:64] %tmp1 = load <8 x i8>* %B call void @llvm.arm.neon.vst2.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 8) ret void @@ -24,7 +24,7 @@ define void @vst2i8_update(i8** %ptr, <8 x i8>* %B, i32 %inc) nounwind { define void @vst2i16(i16* %A, <4 x i16>* %B) nounwind { ;CHECK: vst2i16: ;Check the alignment value. Max for this instruction is 128 bits: -;CHECK: vst2.16 {d16, d17}, [r0, :128] +;CHECK: vst2.16 {d16, d17}, [r0:128] %tmp0 = bitcast i16* %A to i8* %tmp1 = load <4 x i16>* %B call void @llvm.arm.neon.vst2.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 32) @@ -52,7 +52,7 @@ define void @vst2f(float* %A, <2 x float>* %B) nounwind { define void @vst2i64(i64* %A, <1 x i64>* %B) nounwind { ;CHECK: vst2i64: ;Check the alignment value. 
Max for this instruction is 128 bits: -;CHECK: vst1.64 {d16, d17}, [r0, :128] +;CHECK: vst1.64 {d16, d17}, [r0:128] %tmp0 = bitcast i64* %A to i8* %tmp1 = load <1 x i64>* %B call void @llvm.arm.neon.vst2.v1i64(i8* %tmp0, <1 x i64> %tmp1, <1 x i64> %tmp1, i32 32) @@ -62,7 +62,7 @@ define void @vst2i64(i64* %A, <1 x i64>* %B) nounwind { ;Check for a post-increment updating store. define void @vst2i64_update(i64** %ptr, <1 x i64>* %B) nounwind { ;CHECK: vst2i64_update: -;CHECK: vst1.64 {d16, d17}, [r1, :64]! +;CHECK: vst1.64 {d16, d17}, [r1:64]! %A = load i64** %ptr %tmp0 = bitcast i64* %A to i8* %tmp1 = load <1 x i64>* %B @@ -75,7 +75,7 @@ define void @vst2i64_update(i64** %ptr, <1 x i64>* %B) nounwind { define void @vst2Qi8(i8* %A, <16 x i8>* %B) nounwind { ;CHECK: vst2Qi8: ;Check the alignment value. Max for this instruction is 256 bits: -;CHECK: vst2.8 {d16, d17, d18, d19}, [r0, :64] +;CHECK: vst2.8 {d16, d17, d18, d19}, [r0:64] %tmp1 = load <16 x i8>* %B call void @llvm.arm.neon.vst2.v16i8(i8* %A, <16 x i8> %tmp1, <16 x i8> %tmp1, i32 8) ret void @@ -84,7 +84,7 @@ define void @vst2Qi8(i8* %A, <16 x i8>* %B) nounwind { define void @vst2Qi16(i16* %A, <8 x i16>* %B) nounwind { ;CHECK: vst2Qi16: ;Check the alignment value. Max for this instruction is 256 bits: -;CHECK: vst2.16 {d16, d17, d18, d19}, [r0, :128] +;CHECK: vst2.16 {d16, d17, d18, d19}, [r0:128] %tmp0 = bitcast i16* %A to i8* %tmp1 = load <8 x i16>* %B call void @llvm.arm.neon.vst2.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 16) @@ -94,7 +94,7 @@ define void @vst2Qi16(i16* %A, <8 x i16>* %B) nounwind { define void @vst2Qi32(i32* %A, <4 x i32>* %B) nounwind { ;CHECK: vst2Qi32: ;Check the alignment value. Max for this instruction is 256 bits: -;CHECK: vst2.32 {d16, d17, d18, d19}, [r0, :256] +;CHECK: vst2.32 {d16, d17, d18, d19}, [r0:256] %tmp0 = bitcast i32* %A to i8* %tmp1 = load <4 x i32>* %B call void @llvm.arm.neon.vst2.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 64) diff --git a/test/CodeGen/ARM/vst3.ll b/test/CodeGen/ARM/vst3.ll index f117ab2..281bb73 100644 --- a/test/CodeGen/ARM/vst3.ll +++ b/test/CodeGen/ARM/vst3.ll @@ -4,7 +4,7 @@ define void @vst3i8(i8* %A, <8 x i8>* %B) nounwind { ;CHECK: vst3i8: ;Check the alignment value. Max for this instruction is 64 bits: ;This test runs at -O0 so do not check for specific register numbers. -;CHECK: vst3.8 {d{{.*}}, d{{.*}}, d{{.*}}}, [r{{.*}}, :64] +;CHECK: vst3.8 {d{{.*}}, d{{.*}}, d{{.*}}}, [r{{.*}}:64] %tmp1 = load <8 x i8>* %B call void @llvm.arm.neon.vst3.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 32) ret void @@ -54,7 +54,7 @@ define void @vst3i64(i64* %A, <1 x i64>* %B) nounwind { ;CHECK: vst3i64: ;Check the alignment value. Max for this instruction is 64 bits: ;This test runs at -O0 so do not check for specific register numbers. -;CHECK: vst1.64 {d{{.*}}, d{{.*}}, d{{.*}}}, [r{{.*}}, :64] +;CHECK: vst1.64 {d{{.*}}, d{{.*}}, d{{.*}}}, [r{{.*}}:64] %tmp0 = bitcast i64* %A to i8* %tmp1 = load <1 x i64>* %B call void @llvm.arm.neon.vst3.v1i64(i8* %tmp0, <1 x i64> %tmp1, <1 x i64> %tmp1, <1 x i64> %tmp1, i32 16) @@ -65,8 +65,8 @@ define void @vst3Qi8(i8* %A, <16 x i8>* %B) nounwind { ;CHECK: vst3Qi8: ;Check the alignment value. Max for this instruction is 64 bits: ;This test runs at -O0 so do not check for specific register numbers. -;CHECK: vst3.8 {d{{.*}}, d{{.*}}, d{{.*}}}, [r{{.*}}, :64]! -;CHECK: vst3.8 {d{{.*}}, d{{.*}}, d{{.*}}}, [r{{.*}}, :64] +;CHECK: vst3.8 {d{{.*}}, d{{.*}}, d{{.*}}}, [r{{.*}}:64]! 
+;CHECK: vst3.8 {d{{.*}}, d{{.*}}, d{{.*}}}, [r{{.*}}:64] %tmp1 = load <16 x i8>* %B call void @llvm.arm.neon.vst3.v16i8(i8* %A, <16 x i8> %tmp1, <16 x i8> %tmp1, <16 x i8> %tmp1, i32 32) ret void diff --git a/test/CodeGen/ARM/vst4.ll b/test/CodeGen/ARM/vst4.ll index e94acb6..7dedb2f 100644 --- a/test/CodeGen/ARM/vst4.ll +++ b/test/CodeGen/ARM/vst4.ll @@ -3,7 +3,7 @@ define void @vst4i8(i8* %A, <8 x i8>* %B) nounwind { ;CHECK: vst4i8: ;Check the alignment value. Max for this instruction is 256 bits: -;CHECK: vst4.8 {d16, d17, d18, d19}, [r0, :64] +;CHECK: vst4.8 {d16, d17, d18, d19}, [r0:64] %tmp1 = load <8 x i8>* %B call void @llvm.arm.neon.vst4.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 8) ret void @@ -12,7 +12,7 @@ define void @vst4i8(i8* %A, <8 x i8>* %B) nounwind { ;Check for a post-increment updating store with register increment. define void @vst4i8_update(i8** %ptr, <8 x i8>* %B, i32 %inc) nounwind { ;CHECK: vst4i8_update: -;CHECK: vst4.8 {d16, d17, d18, d19}, [r1, :128], r2 +;CHECK: vst4.8 {d16, d17, d18, d19}, [r1:128], r2 %A = load i8** %ptr %tmp1 = load <8 x i8>* %B call void @llvm.arm.neon.vst4.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 16) @@ -24,7 +24,7 @@ define void @vst4i8_update(i8** %ptr, <8 x i8>* %B, i32 %inc) nounwind { define void @vst4i16(i16* %A, <4 x i16>* %B) nounwind { ;CHECK: vst4i16: ;Check the alignment value. Max for this instruction is 256 bits: -;CHECK: vst4.16 {d16, d17, d18, d19}, [r0, :128] +;CHECK: vst4.16 {d16, d17, d18, d19}, [r0:128] %tmp0 = bitcast i16* %A to i8* %tmp1 = load <4 x i16>* %B call void @llvm.arm.neon.vst4.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 16) @@ -34,7 +34,7 @@ define void @vst4i16(i16* %A, <4 x i16>* %B) nounwind { define void @vst4i32(i32* %A, <2 x i32>* %B) nounwind { ;CHECK: vst4i32: ;Check the alignment value. Max for this instruction is 256 bits: -;CHECK: vst4.32 {d16, d17, d18, d19}, [r0, :256] +;CHECK: vst4.32 {d16, d17, d18, d19}, [r0:256] %tmp0 = bitcast i32* %A to i8* %tmp1 = load <2 x i32>* %B call void @llvm.arm.neon.vst4.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 32) @@ -53,7 +53,7 @@ define void @vst4f(float* %A, <2 x float>* %B) nounwind { define void @vst4i64(i64* %A, <1 x i64>* %B) nounwind { ;CHECK: vst4i64: ;Check the alignment value. Max for this instruction is 256 bits: -;CHECK: vst1.64 {d16, d17, d18, d19}, [r0, :256] +;CHECK: vst1.64 {d16, d17, d18, d19}, [r0:256] %tmp0 = bitcast i64* %A to i8* %tmp1 = load <1 x i64>* %B call void @llvm.arm.neon.vst4.v1i64(i8* %tmp0, <1 x i64> %tmp1, <1 x i64> %tmp1, <1 x i64> %tmp1, <1 x i64> %tmp1, i32 64) @@ -63,8 +63,8 @@ define void @vst4i64(i64* %A, <1 x i64>* %B) nounwind { define void @vst4Qi8(i8* %A, <16 x i8>* %B) nounwind { ;CHECK: vst4Qi8: ;Check the alignment value. Max for this instruction is 256 bits: -;CHECK: vst4.8 {d16, d18, d20, d22}, [r0, :256]! -;CHECK: vst4.8 {d17, d19, d21, d23}, [r0, :256] +;CHECK: vst4.8 {d16, d18, d20, d22}, [r0:256]! 
+;CHECK: vst4.8 {d17, d19, d21, d23}, [r0:256] %tmp1 = load <16 x i8>* %B call void @llvm.arm.neon.vst4.v16i8(i8* %A, <16 x i8> %tmp1, <16 x i8> %tmp1, <16 x i8> %tmp1, <16 x i8> %tmp1, i32 64) ret void diff --git a/test/CodeGen/ARM/vstlane.ll b/test/CodeGen/ARM/vstlane.ll index 758b355..67f251f 100644 --- a/test/CodeGen/ARM/vstlane.ll +++ b/test/CodeGen/ARM/vstlane.ll @@ -26,7 +26,7 @@ define void @vst1lanei8_update(i8** %ptr, <8 x i8>* %B) nounwind { define void @vst1lanei16(i16* %A, <4 x i16>* %B) nounwind { ;CHECK: vst1lanei16: ;Check the alignment value. Max for this instruction is 16 bits: -;CHECK: vst1.16 {d16[2]}, [r0, :16] +;CHECK: vst1.16 {d16[2]}, [r0:16] %tmp1 = load <4 x i16>* %B %tmp2 = extractelement <4 x i16> %tmp1, i32 2 store i16 %tmp2, i16* %A, align 8 @@ -36,7 +36,7 @@ define void @vst1lanei16(i16* %A, <4 x i16>* %B) nounwind { define void @vst1lanei32(i32* %A, <2 x i32>* %B) nounwind { ;CHECK: vst1lanei32: ;Check the alignment value. Max for this instruction is 32 bits: -;CHECK: vst1.32 {d16[1]}, [r0, :32] +;CHECK: vst1.32 {d16[1]}, [r0:32] %tmp1 = load <2 x i32>* %B %tmp2 = extractelement <2 x i32> %tmp1, i32 1 store i32 %tmp2, i32* %A, align 8 @@ -45,7 +45,7 @@ define void @vst1lanei32(i32* %A, <2 x i32>* %B) nounwind { define void @vst1lanef(float* %A, <2 x float>* %B) nounwind { ;CHECK: vst1lanef: -;CHECK: vst1.32 {d16[1]}, [r0, :32] +;CHECK: vst1.32 {d16[1]}, [r0:32] %tmp1 = load <2 x float>* %B %tmp2 = extractelement <2 x float> %tmp1, i32 1 store float %tmp2, float* %A @@ -64,7 +64,7 @@ define void @vst1laneQi8(i8* %A, <16 x i8>* %B) nounwind { define void @vst1laneQi16(i16* %A, <8 x i16>* %B) nounwind { ;CHECK: vst1laneQi16: -;CHECK: vst1.16 {d17[1]}, [r0, :16] +;CHECK: vst1.16 {d17[1]}, [r0:16] %tmp1 = load <8 x i16>* %B %tmp2 = extractelement <8 x i16> %tmp1, i32 5 store i16 %tmp2, i16* %A, align 8 @@ -74,7 +74,7 @@ define void @vst1laneQi16(i16* %A, <8 x i16>* %B) nounwind { define void @vst1laneQi32(i32* %A, <4 x i32>* %B) nounwind { ;CHECK: vst1laneQi32: ; // Can use scalar load. No need to use vectors. -; // CHE-CK: vst1.32 {d17[1]}, [r0, :32] +; // CHE-CK: vst1.32 {d17[1]}, [r0:32] %tmp1 = load <4 x i32>* %B %tmp2 = extractelement <4 x i32> %tmp1, i32 3 store i32 %tmp2, i32* %A, align 8 @@ -85,7 +85,7 @@ define void @vst1laneQi32(i32* %A, <4 x i32>* %B) nounwind { define void @vst1laneQi32_update(i32** %ptr, <4 x i32>* %B) nounwind { ;CHECK: vst1laneQi32_update: ; // Can use scalar load. No need to use vectors. -; // CHE-CK: vst1.32 {d17[1]}, [r1, :32]! +; // CHE-CK: vst1.32 {d17[1]}, [r1:32]! %A = load i32** %ptr %tmp1 = load <4 x i32>* %B %tmp2 = extractelement <4 x i32> %tmp1, i32 3 @@ -108,7 +108,7 @@ define void @vst1laneQf(float* %A, <4 x float>* %B) nounwind { define void @vst2lanei8(i8* %A, <8 x i8>* %B) nounwind { ;CHECK: vst2lanei8: ;Check the alignment value. Max for this instruction is 16 bits: -;CHECK: vst2.8 {d16[1], d17[1]}, [r0, :16] +;CHECK: vst2.8 {d16[1], d17[1]}, [r0:16] %tmp1 = load <8 x i8>* %B call void @llvm.arm.neon.vst2lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 4) ret void @@ -117,7 +117,7 @@ define void @vst2lanei8(i8* %A, <8 x i8>* %B) nounwind { define void @vst2lanei16(i16* %A, <4 x i16>* %B) nounwind { ;CHECK: vst2lanei16: ;Check the alignment value. 
Max for this instruction is 32 bits: -;CHECK: vst2.16 {d16[1], d17[1]}, [r0, :32] +;CHECK: vst2.16 {d16[1], d17[1]}, [r0:32] %tmp0 = bitcast i16* %A to i8* %tmp1 = load <4 x i16>* %B call void @llvm.arm.neon.vst2lane.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 8) @@ -168,7 +168,7 @@ define void @vst2laneQi16(i16* %A, <8 x i16>* %B) nounwind { define void @vst2laneQi32(i32* %A, <4 x i32>* %B) nounwind { ;CHECK: vst2laneQi32: ;Check the alignment value. Max for this instruction is 64 bits: -;CHECK: vst2.32 {d17[0], d19[0]}, [r0, :64] +;CHECK: vst2.32 {d17[0], d19[0]}, [r0:64] %tmp0 = bitcast i32* %A to i8* %tmp1 = load <4 x i32>* %B call void @llvm.arm.neon.vst2lane.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 2, i32 16) @@ -283,7 +283,7 @@ declare void @llvm.arm.neon.vst3lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x f define void @vst4lanei8(i8* %A, <8 x i8>* %B) nounwind { ;CHECK: vst4lanei8: ;Check the alignment value. Max for this instruction is 32 bits: -;CHECK: vst4.8 {d16[1], d17[1], d18[1], d19[1]}, [r0, :32] +;CHECK: vst4.8 {d16[1], d17[1], d18[1], d19[1]}, [r0:32] %tmp1 = load <8 x i8>* %B call void @llvm.arm.neon.vst4lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 8) ret void @@ -292,7 +292,7 @@ define void @vst4lanei8(i8* %A, <8 x i8>* %B) nounwind { ;Check for a post-increment updating store. define void @vst4lanei8_update(i8** %ptr, <8 x i8>* %B) nounwind { ;CHECK: vst4lanei8_update: -;CHECK: vst4.8 {d16[1], d17[1], d18[1], d19[1]}, [r1, :32]! +;CHECK: vst4.8 {d16[1], d17[1], d18[1], d19[1]}, [r1:32]! %A = load i8** %ptr %tmp1 = load <8 x i8>* %B call void @llvm.arm.neon.vst4lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 8) @@ -313,7 +313,7 @@ define void @vst4lanei16(i16* %A, <4 x i16>* %B) nounwind { define void @vst4lanei32(i32* %A, <2 x i32>* %B) nounwind { ;CHECK: vst4lanei32: ;Check the alignment value. Max for this instruction is 128 bits: -;CHECK: vst4.32 {d16[1], d17[1], d18[1], d19[1]}, [r0, :128] +;CHECK: vst4.32 {d16[1], d17[1], d18[1], d19[1]}, [r0:128] %tmp0 = bitcast i32* %A to i8* %tmp1 = load <2 x i32>* %B call void @llvm.arm.neon.vst4lane.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 16) @@ -332,7 +332,7 @@ define void @vst4lanef(float* %A, <2 x float>* %B) nounwind { define void @vst4laneQi16(i16* %A, <8 x i16>* %B) nounwind { ;CHECK: vst4laneQi16: ;Check the alignment value. 
Max for this instruction is 64 bits: -;CHECK: vst4.16 {d17[3], d19[3], d21[3], d23[3]}, [r0, :64] +;CHECK: vst4.16 {d17[3], d19[3], d21[3], d23[3]}, [r0:64] %tmp0 = bitcast i16* %A to i8* %tmp1 = load <8 x i16>* %B call void @llvm.arm.neon.vst4lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 7, i32 16) diff --git a/test/CodeGen/Generic/inline-asm-mem-clobber.ll b/test/CodeGen/Generic/inline-asm-mem-clobber.ll new file mode 100644 index 0000000..e523d03 --- /dev/null +++ b/test/CodeGen/Generic/inline-asm-mem-clobber.ll @@ -0,0 +1,21 @@ +; RUN: llc -O2 < %s | FileCheck %s + +@G = common global i32 0, align 4 + +define i32 @foo(i8* %p) nounwind uwtable { +entry: + %p.addr = alloca i8*, align 8 + %rv = alloca i32, align 4 + store i8* %p, i8** %p.addr, align 8 + store i32 0, i32* @G, align 4 + %0 = load i8** %p.addr, align 8 +; CHECK: blah + %1 = call i32 asm "blah", "=r,r,~{memory}"(i8* %0) nounwind +; CHECK: @G + store i32 %1, i32* %rv, align 4 + %2 = load i32* %rv, align 4 + %3 = load i32* @G, align 4 + %add = add nsw i32 %2, %3 + ret i32 %add +} + diff --git a/test/CodeGen/Hexagon/absaddr-store.ll b/test/CodeGen/Hexagon/absaddr-store.ll new file mode 100644 index 0000000..5c2554d --- /dev/null +++ b/test/CodeGen/Hexagon/absaddr-store.ll @@ -0,0 +1,46 @@ +; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s +; Check that we generate store instructions with absolute addressing mode. + +@a = external global i32 +@b = external global i8 +@c = external global i16 +@d = external global i64 + +define zeroext i8 @absStoreByte() nounwind { +; CHECK: memb(##b){{ *}}={{ *}}r{{[0-9]+}} +entry: + %0 = load i8* @b, align 1 + %conv = zext i8 %0 to i32 + %mul = mul nsw i32 100, %conv + %conv1 = trunc i32 %mul to i8 + store i8 %conv1, i8* @b, align 1 + ret i8 %conv1 +} + +define signext i16 @absStoreHalf() nounwind { +; CHECK: memh(##c){{ *}}={{ *}}r{{[0-9]+}} +entry: + %0 = load i16* @c, align 2 + %conv = sext i16 %0 to i32 + %mul = mul nsw i32 100, %conv + %conv1 = trunc i32 %mul to i16 + store i16 %conv1, i16* @c, align 2 + ret i16 %conv1 +} + +define i32 @absStoreWord() nounwind { +; CHECK: memw(##a){{ *}}={{ *}}r{{[0-9]+}} +entry: + %0 = load i32* @a, align 4 + %mul = mul nsw i32 100, %0 + store i32 %mul, i32* @a, align 4 + ret i32 %mul +} + +define void @absStoreDouble() nounwind { +; CHECK: memd(##d){{ *}}={{ *}}r{{[0-9]+}}:{{[0-9]+}} +entry: + store i64 100, i64* @d, align 8 + ret void +} + diff --git a/test/CodeGen/Hexagon/cext-check.ll b/test/CodeGen/Hexagon/cext-check.ll new file mode 100644 index 0000000..7c4b19e --- /dev/null +++ b/test/CodeGen/Hexagon/cext-check.ll @@ -0,0 +1,57 @@ +; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s +; Check that we constant-extend instructions only when necessary. 
+ +define i32 @cext_test1(i32* %a) nounwind { +; CHECK: r{{[0-9]+}}{{ *}}={{ *}}memw(r{{[0-9]+}}+##8000) +; CHECK: r{{[0-9]+}}{{ *}}={{ *}}add(r{{[0-9]+}}{{ *}},{{ *}}##300000) +; CHECK-NOT: r{{[0-9]+}}{{ *}}={{ *}}memw(r{{[0-9]+}}+##4092) +; CHECK-NOT: r{{[0-9]+}}{{ *}}={{ *}}add(r{{[0-9]+}}{{ *}},{{ *}}##300) +entry: + %0 = load i32* %a, align 4 + %tobool = icmp ne i32 %0, 0 + br i1 %tobool, label %if.then, label %if.end + +if.then: + %arrayidx1 = getelementptr inbounds i32* %a, i32 2000 + %1 = load i32* %arrayidx1, align 4 + %add = add nsw i32 %1, 300000 + br label %return + +if.end: + %arrayidx2 = getelementptr inbounds i32* %a, i32 1023 + %2 = load i32* %arrayidx2, align 4 + %add3 = add nsw i32 %2, 300 + br label %return + +return: + %retval.0 = phi i32 [ %add, %if.then ], [ %add3, %if.end ] + ret i32 %retval.0 +} + +define i32 @cext_test2(i8* %a) nounwind { +; CHECK-NOT: r{{[0-9]+}}{{ *}}={{ *}}memub(r{{[0-9]+}}+##1023) +; CHECK: r{{[0-9]+}}{{ *}}={{ *}}add(r{{[0-9]+}}{{ *}},{{ *}}##300000) +; CHECK: r{{[0-9]+}}{{ *}}={{ *}}memub(r{{[0-9]+}}+##1024) +; CHECK-NOT: r{{[0-9]+}}{{ *}}={{ *}}add(r{{[0-9]+}}{{ *}},{{ *}}##6000) +entry: + %tobool = icmp ne i8* %a, null + br i1 %tobool, label %if.then, label %if.end + +if.then: + %arrayidx = getelementptr inbounds i8* %a, i32 1023 + %0 = load i8* %arrayidx, align 1 + %conv = zext i8 %0 to i32 + %add = add nsw i32 %conv, 300000 + br label %return + +if.end: + %arrayidx1 = getelementptr inbounds i8* %a, i32 1024 + %1 = load i8* %arrayidx1, align 1 + %conv2 = zext i8 %1 to i32 + %add3 = add nsw i32 %conv2, 6000 + br label %return + +return: + %retval.0 = phi i32 [ %add, %if.then ], [ %add3, %if.end ] + ret i32 %retval.0 +} diff --git a/test/CodeGen/Hexagon/cmp-to-genreg.ll b/test/CodeGen/Hexagon/cmp-to-genreg.ll new file mode 100644 index 0000000..97cf51c --- /dev/null +++ b/test/CodeGen/Hexagon/cmp-to-genreg.ll @@ -0,0 +1,34 @@ +; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s +; Check that we generate compare instructions that write to a general register. + +define i32 @compare1(i32 %a) nounwind { +; CHECK: r{{[0-9]+}}{{ *}}={{ *}}cmp.eq(r{{[0-9]+}},{{ *}}#120) +entry: + %cmp = icmp eq i32 %a, 120 + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +define i32 @compare2(i32 %a) nounwind readnone { +; CHECK: r{{[0-9]+}}{{ *}}={{ *}}!cmp.eq(r{{[0-9]+}},{{ *}}#120) +entry: + %cmp = icmp ne i32 %a, 120 + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +define i32 @compare3(i32 %a, i32 %b) nounwind readnone { +; CHECK: r{{[0-9]+}}{{ *}}={{ *}}cmp.eq(r{{[0-9]+}},{{ *}}r{{[0-9]+}}) +entry: + %cmp = icmp eq i32 %a, %b + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +define i32 @compare4(i32 %a, i32 %b) nounwind readnone { +; CHECK: r{{[0-9]+}}{{ *}}={{ *}}!cmp.eq(r{{[0-9]+}},{{ *}}r{{[0-9]+}}) +entry: + %cmp = icmp ne i32 %a, %b + %conv = zext i1 %cmp to i32 + ret i32 %conv +} diff --git a/test/CodeGen/Hexagon/cmp-to-predreg.ll b/test/CodeGen/Hexagon/cmp-to-predreg.ll new file mode 100644 index 0000000..d430b90 --- /dev/null +++ b/test/CodeGen/Hexagon/cmp-to-predreg.ll @@ -0,0 +1,43 @@ +; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s +; Check that we generate compare instructions that write to a predicate register. 
+ +define i32 @compare1(i32 %a, i32 %b) nounwind { +; CHECK: p{{[0-3]}}{{ *}}={{ *}}!cmp.eq(r{{[0-9]+}},{{ *}}r{{[0-9]+}}) +entry: + %cmp = icmp ne i32 %a, %b + %add = add nsw i32 %a, %b + %sub = sub nsw i32 %a, %b + %add.sub = select i1 %cmp, i32 %add, i32 %sub + ret i32 %add.sub +} + +define i32 @compare2(i32 %a) nounwind { +; CHECK: p{{[0-3]}}{{ *}}={{ *}}!cmp.eq(r{{[0-9]+}},{{ *}}#10) +entry: + %cmp = icmp ne i32 %a, 10 + %add = add nsw i32 %a, 10 + %sub = sub nsw i32 %a, 10 + %add.sub = select i1 %cmp, i32 %add, i32 %sub + ret i32 %add.sub +} + +define i32 @compare3(i32 %a, i32 %b) nounwind { +; CHECK: p{{[0-3]}}{{ *}}={{ *}}cmp.gt(r{{[0-9]+}},{{ *}}r{{[0-9]+}}) +entry: + %cmp = icmp sgt i32 %a, %b + %sub = sub nsw i32 %a, %b + %add = add nsw i32 %a, %b + %sub.add = select i1 %cmp, i32 %sub, i32 %add + ret i32 %sub.add +} + +define i32 @compare4(i32 %a) nounwind { +; CHECK: p{{[0-3]}}{{ *}}={{ *}}cmp.gt(r{{[0-9]+}},{{ *}}#10) +entry: + %cmp = icmp sgt i32 %a, 10 + %sub = sub nsw i32 %a, 10 + %add = add nsw i32 %a, 10 + %sub.add = select i1 %cmp, i32 %sub, i32 %add + ret i32 %sub.add +} + diff --git a/test/CodeGen/Hexagon/cmp_pred.ll b/test/CodeGen/Hexagon/cmp_pred.ll new file mode 100644 index 0000000..37db3b4 --- /dev/null +++ b/test/CodeGen/Hexagon/cmp_pred.ll @@ -0,0 +1,115 @@ +; RUN: llc -march=hexagon -mcpu=hexagonv5 < %s | FileCheck %s +; Generate various compare instructions followed by if (p0) .. if (!p0)... +target triple = "hexagon" + +define i32 @Func_3Ugt(i32 %Enum_Par_Val, i32 %pv2) nounwind readnone { +entry: +; CHECK-NOT: mux + %cmp = icmp ugt i32 %Enum_Par_Val, %pv2 + %selv = zext i1 %cmp to i32 + ret i32 %selv +} + +define i32 @Func_3Uge(i32 %Enum_Par_Val, i32 %pv2) nounwind readnone { +entry: +; CHECK-NOT: mux + %cmp = icmp uge i32 %Enum_Par_Val, %pv2 + %selv = zext i1 %cmp to i32 + ret i32 %selv +} + +define i32 @Func_3Ult(i32 %Enum_Par_Val, i32 %pv2) nounwind readnone { +entry: +; CHECK-NOT: mux + %cmp = icmp ult i32 %Enum_Par_Val, %pv2 + %selv = zext i1 %cmp to i32 + ret i32 %selv +} + +define i32 @Func_3Ule(i32 %Enum_Par_Val, i32 %pv2) nounwind readnone { +entry: +; CHECK-NOT: mux + %cmp = icmp ule i32 %Enum_Par_Val, %pv2 + %selv = zext i1 %cmp to i32 + ret i32 %selv +} + +define i32 @Func_3Ueq(i32 %Enum_Par_Val, i32 %pv2) nounwind readnone { +entry: +; CHECK-NOT: mux + %cmp = icmp eq i32 %Enum_Par_Val, %pv2 + %selv = zext i1 %cmp to i32 + ret i32 %selv +} + +define i32 @Func_3Une(i32 %Enum_Par_Val, i32 %pv2) nounwind readnone { +entry: +; CHECK-NOT: mux + %cmp = icmp ne i32 %Enum_Par_Val, %pv2 + %selv = zext i1 %cmp to i32 + ret i32 %selv +} + +define i32 @Func_3UneC(i32 %Enum_Par_Val) nounwind readnone { +entry: +; CHECK-NOT: mux + %cmp = icmp ne i32 %Enum_Par_Val, 122 + %selv = zext i1 %cmp to i32 + ret i32 %selv +} + +define i32 @Func_3gt(i32 %Enum_Par_Val, i32 %pv2) nounwind readnone { +entry: +; CHECK: mux + %cmp = icmp sgt i32 %Enum_Par_Val, %pv2 + %selv = zext i1 %cmp to i32 + ret i32 %selv +} + +define i32 @Func_3ge(i32 %Enum_Par_Val, i32 %pv2) nounwind readnone { +entry: +; CHECK-NOT: mux + %cmp = icmp sge i32 %Enum_Par_Val, %pv2 + %selv = zext i1 %cmp to i32 + ret i32 %selv +} + +define i32 @Func_3lt(i32 %Enum_Par_Val, i32 %pv2) nounwind readnone { +entry: +; CHECK-NOT: mux + %cmp = icmp slt i32 %Enum_Par_Val, %pv2 + %selv = zext i1 %cmp to i32 + ret i32 %selv +} + +define i32 @Func_3le(i32 %Enum_Par_Val, i32 %pv2) nounwind readnone { +entry: +; CHECK-NOT: mux + %cmp = icmp sle i32 %Enum_Par_Val, %pv2 + %selv = zext i1 %cmp to i32 + ret i32 %selv +} + 
+define i32 @Func_3eq(i32 %Enum_Par_Val, i32 %pv2) nounwind readnone { +entry: +; CHECK-NOT: mux + %cmp = icmp eq i32 %Enum_Par_Val, %pv2 + %selv = zext i1 %cmp to i32 + ret i32 %selv +} + +define i32 @Func_3ne(i32 %Enum_Par_Val, i32 %pv2) nounwind readnone { +entry: +; CHECK-NOT: mux + %cmp = icmp ne i32 %Enum_Par_Val, %pv2 + %selv = zext i1 %cmp to i32 + ret i32 %selv +} + +define i32 @Func_3neC(i32 %Enum_Par_Val) nounwind readnone { +entry: +; CHECK-NOT: mux + %cmp = icmp ne i32 %Enum_Par_Val, 122 + %selv = zext i1 %cmp to i32 + ret i32 %selv +} diff --git a/test/CodeGen/Hexagon/cmp_pred_reg.ll b/test/CodeGen/Hexagon/cmp_pred_reg.ll new file mode 100644 index 0000000..37db3b4 --- /dev/null +++ b/test/CodeGen/Hexagon/cmp_pred_reg.ll @@ -0,0 +1,115 @@ +; RUN: llc -march=hexagon -mcpu=hexagonv5 < %s | FileCheck %s +; Generate various compare instructions followed by if (p0) .. if (!p0)... +target triple = "hexagon" + +define i32 @Func_3Ugt(i32 %Enum_Par_Val, i32 %pv2) nounwind readnone { +entry: +; CHECK-NOT: mux + %cmp = icmp ugt i32 %Enum_Par_Val, %pv2 + %selv = zext i1 %cmp to i32 + ret i32 %selv +} + +define i32 @Func_3Uge(i32 %Enum_Par_Val, i32 %pv2) nounwind readnone { +entry: +; CHECK-NOT: mux + %cmp = icmp uge i32 %Enum_Par_Val, %pv2 + %selv = zext i1 %cmp to i32 + ret i32 %selv +} + +define i32 @Func_3Ult(i32 %Enum_Par_Val, i32 %pv2) nounwind readnone { +entry: +; CHECK-NOT: mux + %cmp = icmp ult i32 %Enum_Par_Val, %pv2 + %selv = zext i1 %cmp to i32 + ret i32 %selv +} + +define i32 @Func_3Ule(i32 %Enum_Par_Val, i32 %pv2) nounwind readnone { +entry: +; CHECK-NOT: mux + %cmp = icmp ule i32 %Enum_Par_Val, %pv2 + %selv = zext i1 %cmp to i32 + ret i32 %selv +} + +define i32 @Func_3Ueq(i32 %Enum_Par_Val, i32 %pv2) nounwind readnone { +entry: +; CHECK-NOT: mux + %cmp = icmp eq i32 %Enum_Par_Val, %pv2 + %selv = zext i1 %cmp to i32 + ret i32 %selv +} + +define i32 @Func_3Une(i32 %Enum_Par_Val, i32 %pv2) nounwind readnone { +entry: +; CHECK-NOT: mux + %cmp = icmp ne i32 %Enum_Par_Val, %pv2 + %selv = zext i1 %cmp to i32 + ret i32 %selv +} + +define i32 @Func_3UneC(i32 %Enum_Par_Val) nounwind readnone { +entry: +; CHECK-NOT: mux + %cmp = icmp ne i32 %Enum_Par_Val, 122 + %selv = zext i1 %cmp to i32 + ret i32 %selv +} + +define i32 @Func_3gt(i32 %Enum_Par_Val, i32 %pv2) nounwind readnone { +entry: +; CHECK: mux + %cmp = icmp sgt i32 %Enum_Par_Val, %pv2 + %selv = zext i1 %cmp to i32 + ret i32 %selv +} + +define i32 @Func_3ge(i32 %Enum_Par_Val, i32 %pv2) nounwind readnone { +entry: +; CHECK-NOT: mux + %cmp = icmp sge i32 %Enum_Par_Val, %pv2 + %selv = zext i1 %cmp to i32 + ret i32 %selv +} + +define i32 @Func_3lt(i32 %Enum_Par_Val, i32 %pv2) nounwind readnone { +entry: +; CHECK-NOT: mux + %cmp = icmp slt i32 %Enum_Par_Val, %pv2 + %selv = zext i1 %cmp to i32 + ret i32 %selv +} + +define i32 @Func_3le(i32 %Enum_Par_Val, i32 %pv2) nounwind readnone { +entry: +; CHECK-NOT: mux + %cmp = icmp sle i32 %Enum_Par_Val, %pv2 + %selv = zext i1 %cmp to i32 + ret i32 %selv +} + +define i32 @Func_3eq(i32 %Enum_Par_Val, i32 %pv2) nounwind readnone { +entry: +; CHECK-NOT: mux + %cmp = icmp eq i32 %Enum_Par_Val, %pv2 + %selv = zext i1 %cmp to i32 + ret i32 %selv +} + +define i32 @Func_3ne(i32 %Enum_Par_Val, i32 %pv2) nounwind readnone { +entry: +; CHECK-NOT: mux + %cmp = icmp ne i32 %Enum_Par_Val, %pv2 + %selv = zext i1 %cmp to i32 + ret i32 %selv +} + +define i32 @Func_3neC(i32 %Enum_Par_Val) nounwind readnone { +entry: +; CHECK-NOT: mux + %cmp = icmp ne i32 %Enum_Par_Val, 122 + %selv = zext i1 %cmp to i32 + ret i32 
%selv +} diff --git a/test/CodeGen/Hexagon/cmpb_pred.ll b/test/CodeGen/Hexagon/cmpb_pred.ll new file mode 100644 index 0000000..1e61447 --- /dev/null +++ b/test/CodeGen/Hexagon/cmpb_pred.ll @@ -0,0 +1,92 @@ +; RUN: llc -march=hexagon -mcpu=hexagonv5 < %s | FileCheck %s +; Generate various cmpb instructions followed by if (p0) .. if (!p0)... +target triple = "hexagon" + +@Enum_global = external global i8 + +define i32 @Func_3(i32) nounwind readnone { +entry: +; CHECK-NOT: mux + %conv = and i32 %0, 255 + %cmp = icmp eq i32 %conv, 2 + %selv = zext i1 %cmp to i32 + ret i32 %selv +} + +define i32 @Func_3b(i32) nounwind readonly { +entry: +; CHECK-NOT: mux + %1 = load i8* @Enum_global, align 1, !tbaa !0 + %2 = trunc i32 %0 to i8 + %cmp = icmp ne i8 %1, %2 + %selv = zext i1 %cmp to i32 + ret i32 %selv +} + +define i32 @Func_3c(i32) nounwind readnone { +entry: +; CHECK-NOT: mux + %conv = and i32 %0, 255 + %cmp = icmp eq i32 %conv, 2 + %selv = zext i1 %cmp to i32 + ret i32 %selv +} + +define i32 @Func_3d(i32) nounwind readonly { +entry: +; CHECK-NOT: mux + %1 = load i8* @Enum_global, align 1, !tbaa !0 + %2 = trunc i32 %0 to i8 + %cmp = icmp eq i8 %1, %2 + %selv = zext i1 %cmp to i32 + ret i32 %selv +} + +define i32 @Func_3e(i32) nounwind readonly { +entry: +; CHECK-NOT: mux + %1 = load i8* @Enum_global, align 1, !tbaa !0 + %2 = trunc i32 %0 to i8 + %cmp = icmp eq i8 %1, %2 + %selv = zext i1 %cmp to i32 + ret i32 %selv +} + +define i32 @Func_3f(i32) nounwind readnone { +entry: +; CHECK-NOT: mux + %conv = and i32 %0, 255 + %cmp = icmp ugt i32 %conv, 2 + %selv = zext i1 %cmp to i32 + ret i32 %selv +} + +define i32 @Func_3g(i32) nounwind readnone { +entry: +; CHECK: mux + %conv = and i32 %0, 255 + %cmp = icmp ult i32 %conv, 3 + %selv = zext i1 %cmp to i32 + ret i32 %selv +} + +define i32 @Func_3h(i32) nounwind readnone { +entry: +; CHECK-NOT: mux + %conv = and i32 %0, 254 + %cmp = icmp ult i32 %conv, 2 + %selv = zext i1 %cmp to i32 + ret i32 %selv +} + +define i32 @Func_3i(i32) nounwind readnone { +entry: +; CHECK-NOT: mux + %conv = and i32 %0, 254 + %cmp = icmp ugt i32 %conv, 1 + %selv = zext i1 %cmp to i32 + ret i32 %selv +} + +!0 = metadata !{metadata !"omnipotent char", metadata !1} +!1 = metadata !{metadata !"Simple C/C++ TBAA"} diff --git a/test/CodeGen/Hexagon/combine_ir.ll b/test/CodeGen/Hexagon/combine_ir.ll new file mode 100644 index 0000000..921ce99 --- /dev/null +++ b/test/CodeGen/Hexagon/combine_ir.ll @@ -0,0 +1,55 @@ +; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s +; CHECK: word +; CHECK: combine(#0 + +define void @word(i32* nocapture %a) nounwind { +entry: + %0 = load i32* %a, align 4, !tbaa !0 + %1 = zext i32 %0 to i64 + %add.ptr = getelementptr inbounds i32* %a, i32 1 + %2 = load i32* %add.ptr, align 4, !tbaa !0 + %3 = zext i32 %2 to i64 + %4 = shl nuw i64 %3, 32 + %ins = or i64 %4, %1 + tail call void @bar(i64 %ins) nounwind + ret void +} + +declare void @bar(i64) + +; CHECK: halfword +; CHECK: combine(#0 + +define void @halfword(i16* nocapture %a) nounwind { +entry: + %0 = load i16* %a, align 2, !tbaa !3 + %1 = zext i16 %0 to i64 + %add.ptr = getelementptr inbounds i16* %a, i32 1 + %2 = load i16* %add.ptr, align 2, !tbaa !3 + %3 = zext i16 %2 to i64 + %4 = shl nuw nsw i64 %3, 16 + %ins = or i64 %4, %1 + tail call void @bar(i64 %ins) nounwind + ret void +} + +; CHECK: byte +; CHECK: combine(#0 + +define void @byte(i8* nocapture %a) nounwind { +entry: + %0 = load i8* %a, align 1, !tbaa !1 + %1 = zext i8 %0 to i64 + %add.ptr = getelementptr inbounds i8* %a, i32 1 + %2 = 
load i8* %add.ptr, align 1, !tbaa !1 + %3 = zext i8 %2 to i64 + %4 = shl nuw nsw i64 %3, 8 + %ins = or i64 %4, %1 + tail call void @bar(i64 %ins) nounwind + ret void +} + +!0 = metadata !{metadata !"int", metadata !1} +!1 = metadata !{metadata !"omnipotent char", metadata !2} +!2 = metadata !{metadata !"Simple C/C++ TBAA"} +!3 = metadata !{metadata !"short", metadata !1} diff --git a/test/CodeGen/Hexagon/ctlz-cttz-ctpop.ll b/test/CodeGen/Hexagon/ctlz-cttz-ctpop.ll new file mode 100644 index 0000000..e942f8d --- /dev/null +++ b/test/CodeGen/Hexagon/ctlz-cttz-ctpop.ll @@ -0,0 +1,34 @@ +; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s + +; CHECK: r{{[0-9]+}}:{{[0-9]+}} |= lsr(r{{[0-9]+}}:{{[0-9]+}}, #4) +; CHECK: r{{[0-9]+}}:{{[0-9]+}} &= lsr(r{{[0-9]+}}:{{[0-9]+}}, #2) +; CHECK: r{{[0-9]+}} += lsr(r{{[0-9]+}}, #4) + +define i32 @foo(i64 %a, i32 %b) nounwind { +entry: + %tmp0 = tail call i64 @llvm.ctlz.i64( i64 %a, i1 true ) + %tmp1 = tail call i64 @llvm.cttz.i64( i64 %a, i1 true ) + %tmp2 = tail call i32 @llvm.ctlz.i32( i32 %b, i1 true ) + %tmp3 = tail call i32 @llvm.cttz.i32( i32 %b, i1 true ) + %tmp4 = tail call i64 @llvm.ctpop.i64( i64 %a ) + %tmp5 = tail call i32 @llvm.ctpop.i32( i32 %b ) + + + %tmp6 = trunc i64 %tmp0 to i32 + %tmp7 = trunc i64 %tmp1 to i32 + %tmp8 = trunc i64 %tmp4 to i32 + %tmp9 = add i32 %tmp6, %tmp7 + %tmp10 = add i32 %tmp9, %tmp8 + %tmp11 = add i32 %tmp10, %tmp2 + %tmp12 = add i32 %tmp11, %tmp3 + %tmp13 = add i32 %tmp12, %tmp5 + + ret i32 %tmp13 +} + +declare i64 @llvm.ctlz.i64(i64, i1) nounwind readnone +declare i32 @llvm.ctlz.i32(i32, i1) nounwind readnone +declare i64 @llvm.cttz.i64(i64, i1) nounwind readnone +declare i32 @llvm.cttz.i32(i32, i1) nounwind readnone +declare i64 @llvm.ctpop.i64(i64) nounwind readnone +declare i32 @llvm.ctpop.i32(i32) nounwind readnone diff --git a/test/CodeGen/Hexagon/dualstore.ll b/test/CodeGen/Hexagon/dualstore.ll index 0674995..3bf6019 100644 --- a/test/CodeGen/Hexagon/dualstore.ll +++ b/test/CodeGen/Hexagon/dualstore.ll @@ -1,8 +1,8 @@ ; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s ; Check that we generate dual stores in one packet in V4 -; CHECK: memw(r{{[0-9]+}}{{ *}}+{{ *}}#{{[0-9]+}}){{ *}}={{ *}}#100000 -; CHECK-NEXT: memw(r{{[0-9]+}}{{ *}}+{{ *}}#{{[0-9]+}}){{ *}}={{ *}}#500000 +; CHECK: memw(r{{[0-9]+}}{{ *}}+{{ *}}#{{[0-9]+}}){{ *}}={{ *}}##100000 +; CHECK-NEXT: memw(r{{[0-9]+}}{{ *}}+{{ *}}#{{[0-9]+}}){{ *}}={{ *}}##500000 ; CHECK-NEXT: } @Reg = global i32 0, align 4 diff --git a/test/CodeGen/Hexagon/gp-plus-offset-load.ll b/test/CodeGen/Hexagon/gp-plus-offset-load.ll new file mode 100644 index 0000000..a1b80a6 --- /dev/null +++ b/test/CodeGen/Hexagon/gp-plus-offset-load.ll @@ -0,0 +1,51 @@ +; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s +; Check that we generate load instructions with global + offset + +%struct.struc = type { i8, i8, i16, i32 } + +@foo = common global %struct.struc zeroinitializer, align 4 + +define void @loadWord(i32 %val1, i32 %val2, i32* nocapture %ival) nounwind { +; CHECK: r{{[0-9]+}}{{ *}}={{ *}}memw(##foo{{ *}}+{{ *}}4) +entry: + %cmp = icmp sgt i32 %val1, %val2 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %0 = load i32* getelementptr inbounds (%struct.struc* @foo, i32 0, i32 3), align 4 + store i32 %0, i32* %ival, align 4 + br label %if.end + +if.end: ; preds = %if.then, %entry + ret void +} + +define void @loadByte(i32 %val1, i32 %val2, i8* nocapture %ival) nounwind { +; CHECK: r{{[0-9]+}}{{ *}}={{ *}}memb(##foo{{ *}}+{{ 
*}}1) +entry: + %cmp = icmp sgt i32 %val1, %val2 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %0 = load i8* getelementptr inbounds (%struct.struc* @foo, i32 0, i32 1), align 1 + store i8 %0, i8* %ival, align 1 + br label %if.end + +if.end: ; preds = %if.then, %entry + ret void +} + +define void @loadHWord(i32 %val1, i32 %val2, i16* %ival) nounwind { +; CHECK: r{{[0-9]+}}{{ *}}={{ *}}memh(##foo{{ *}}+{{ *}}2) +entry: + %cmp = icmp sgt i32 %val1, %val2 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %0 = load i16* getelementptr inbounds (%struct.struc* @foo, i32 0, i32 2), align 2 + store i16 %0, i16* %ival, align 2 + br label %if.end + +if.end: ; preds = %if.then, %entry + ret void +} diff --git a/test/CodeGen/Hexagon/gp-plus-offset-store.ll b/test/CodeGen/Hexagon/gp-plus-offset-store.ll new file mode 100644 index 0000000..c782b30 --- /dev/null +++ b/test/CodeGen/Hexagon/gp-plus-offset-store.ll @@ -0,0 +1,35 @@ +; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s +; Check that we generate store instructions with global + offset + +%struct.struc = type { i8, i8, i16, i32 } + +@foo = common global %struct.struc zeroinitializer, align 4 + +define void @storeByte(i32 %val1, i32 %val2, i8 zeroext %ival) nounwind { +; CHECK: memb(##foo{{ *}}+{{ *}}1){{ *}}={{ *}}r{{[0-9]+}} +entry: + %cmp = icmp sgt i32 %val1, %val2 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + store i8 %ival, i8* getelementptr inbounds (%struct.struc* @foo, i32 0, i32 1), align 1 + br label %if.end + +if.end: ; preds = %if.then, %entry + ret void +} + +define void @storeHW(i32 %val1, i32 %val2, i16 signext %ival) nounwind { +; CHECK: memh(##foo{{ *}}+{{ *}}2){{ *}}={{ *}}r{{[0-9]+}} +entry: + %cmp = icmp sgt i32 %val1, %val2 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + store i16 %ival, i16* getelementptr inbounds (%struct.struc* @foo, i32 0, i32 2), align 2 + br label %if.end + +if.end: ; preds = %if.then, %entry + ret void +} + diff --git a/test/CodeGen/Hexagon/hwloop-cleanup.ll b/test/CodeGen/Hexagon/hwloop-cleanup.ll new file mode 100644 index 0000000..6456ebf --- /dev/null +++ b/test/CodeGen/Hexagon/hwloop-cleanup.ll @@ -0,0 +1,86 @@ +; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s +; Check that we remove the compare and induction variable instructions +; after generating hardware loops. +; Bug 6685. + +; CHECK: loop0 +; CHECK-NOT: r{{[0-9]+}}{{.}}={{.}}add(r{{[0-9]+}},{{.}}#-1) +; CHECK-NOT: cmp.eq +; CHECK: endloop0 + +define i32 @test1(i32* nocapture %b, i32 %n) nounwind readonly { +entry: + %cmp1 = icmp sgt i32 %n, 0 + br i1 %cmp1, label %for.body.preheader, label %for.end + +for.body.preheader: + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %sum.03 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ] + %arrayidx.phi = phi i32* [ %arrayidx.inc, %for.body ], [ %b, %for.body.preheader ] + %i.02 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] + %0 = load i32* %arrayidx.phi, align 4 + %add = add nsw i32 %0, %sum.03 + %inc = add nsw i32 %i.02, 1 + %exitcond = icmp eq i32 %inc, %n + %arrayidx.inc = getelementptr i32* %arrayidx.phi, i32 1 + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: + br label %for.end + +for.end: + %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.end.loopexit ] + ret i32 %sum.0.lcssa +} + +; This test checks that the initial loop count value is removed.
+; CHECK-NOT: ={{.}}#40 +; CHECK: loop0 +; CHECK-NOT: r{{[0-9]+}}{{.}}={{.}}add(r{{[0-9]+}},{{.}}#-1) +; CHECK-NOT: cmp.eq +; CHECK: endloop0 + +define i32 @test2(i32* nocapture %b) nounwind readonly { +entry: + br label %for.body + +for.body: + %sum.02 = phi i32 [ 0, %entry ], [ %add, %for.body ] + %arrayidx.phi = phi i32* [ %b, %entry ], [ %arrayidx.inc, %for.body ] + %i.01 = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %0 = load i32* %arrayidx.phi, align 4 + %add = add nsw i32 %0, %sum.02 + %inc = add nsw i32 %i.01, 1 + %exitcond = icmp eq i32 %inc, 40 + %arrayidx.inc = getelementptr i32* %arrayidx.phi, i32 1 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret i32 %add +} + +; This test checks that we don't remove the induction variable since it's used. +; CHECK: loop0 +; CHECK: r{{[0-9]+}}{{.}}={{.}}add(r{{[0-9]+}},{{.}}#1) +; CHECK-NOT: cmp.eq +; CHECK: endloop0 +define i32 @test3(i32* nocapture %b) nounwind { +entry: + br label %for.body + +for.body: + %arrayidx.phi = phi i32* [ %b, %entry ], [ %arrayidx.inc, %for.body ] + %i.01 = phi i32 [ 0, %entry ], [ %inc, %for.body ] + store i32 %i.01, i32* %arrayidx.phi, align 4 + %inc = add nsw i32 %i.01, 1 + %exitcond = icmp eq i32 %inc, 40 + %arrayidx.inc = getelementptr i32* %arrayidx.phi, i32 1 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret i32 0 +} + + diff --git a/test/CodeGen/Hexagon/hwloop-const.ll b/test/CodeGen/Hexagon/hwloop-const.ll new file mode 100644 index 0000000..a621c58 --- /dev/null +++ b/test/CodeGen/Hexagon/hwloop-const.ll @@ -0,0 +1,31 @@ +; RUN: llc -march=hexagon -mcpu=hexagonv4 -O2 < %s | FileCheck %s +; ModuleID = 'hwloop-const.c' +target datalayout = "e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-i1:32:32-f64:64:64-f32:32:32-v64:64:64-v32:32:32-a0:0-n16:32" +target triple = "hexagon-unknown-linux-gnu" + +@b = common global [25000 x i32] zeroinitializer, align 8 +@a = common global [25000 x i32] zeroinitializer, align 8 +@c = common global [25000 x i32] zeroinitializer, align 8 + +define i32 @hwloop_bug() nounwind { +entry: + br label %for.body + +; CHECK: endloop +for.body: ; preds = %for.body, %entry + %i.02 = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %arrayidx = getelementptr inbounds [25000 x i32]* @b, i32 0, i32 %i.02 + store i32 %i.02, i32* %arrayidx, align 4, !tbaa !0 + %arrayidx1 = getelementptr inbounds [25000 x i32]* @a, i32 0, i32 %i.02 + store i32 %i.02, i32* %arrayidx1, align 4, !tbaa !0 + %inc = add nsw i32 %i.02, 1 + %exitcond = icmp eq i32 %inc, 25000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret i32 0 +} + +!0 = metadata !{metadata !"int", metadata !1} +!1 = metadata !{metadata !"omnipotent char", metadata !2} +!2 = metadata !{metadata !"Simple C/C++ TBAA"} diff --git a/test/CodeGen/Hexagon/hwloop-dbg.ll b/test/CodeGen/Hexagon/hwloop-dbg.ll new file mode 100644 index 0000000..eaffa07 --- /dev/null +++ b/test/CodeGen/Hexagon/hwloop-dbg.ll @@ -0,0 +1,65 @@ +; RUN: llc < %s -march=hexagon -mcpu=hexagonv4 -O2 -disable-lsr | FileCheck %s +; ModuleID = 'hwloop-dbg.o' +target datalayout = "e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-i1:32:32-f64:64:64-f32:32:32-v64:64:64-v32:32:32-a0:0-n16:32" +target triple = "hexagon" + +define void @foo(i32* nocapture %a, i32* nocapture %b) nounwind { +entry: + tail call void @llvm.dbg.value(metadata !{i32* %a}, i64 0, metadata !13), !dbg !17 + tail call void @llvm.dbg.value(metadata !{i32* %b}, i64 0, metadata !14), !dbg !18 + tail call void @llvm.dbg.value(metadata !2, i64 0, metadata 
!15), !dbg !19 + br label %for.body, !dbg !19 + +for.body: ; preds = %for.body, %entry +; CHECK: loop0( +; CHECK-NOT: add({{r[0-9]*}}, # +; CHECK: endloop0 + %arrayidx.phi = phi i32* [ %a, %entry ], [ %arrayidx.inc, %for.body ] + %i.02 = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %b.addr.01 = phi i32* [ %b, %entry ], [ %incdec.ptr, %for.body ] + %incdec.ptr = getelementptr inbounds i32* %b.addr.01, i32 1, !dbg !21 + tail call void @llvm.dbg.value(metadata !{i32* %incdec.ptr}, i64 0, metadata !14), !dbg !21 + %0 = load i32* %b.addr.01, align 4, !dbg !21, !tbaa !23 + store i32 %0, i32* %arrayidx.phi, align 4, !dbg !21, !tbaa !23 + %inc = add nsw i32 %i.02, 1, !dbg !26 + tail call void @llvm.dbg.value(metadata !{i32 %inc}, i64 0, metadata !15), !dbg !26 + %exitcond = icmp eq i32 %inc, 10, !dbg !19 + %arrayidx.inc = getelementptr i32* %arrayidx.phi, i32 1 + br i1 %exitcond, label %for.end, label %for.body, !dbg !19 + +for.end: ; preds = %for.body + ret void, !dbg !27 +} + +declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone + +!llvm.dbg.cu = !{!0} + +!0 = metadata !{i32 786449, i32 0, i32 12, metadata !"hwloop-dbg.c", metadata !"/usr2/kparzysz/s.hex/t", metadata !"QuIC LLVM Hexagon Clang version 6.1-pre-unknown, (git://git-hexagon-aus.quicinc.com/llvm/clang-mainline.git e9382867661454cdf44addb39430741578e9765c) (llvm/llvm-mainline.git 36412bb1fcf03ed426d4437b41198bae066675ac)", i1 true, i1 true, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1} ; [ DW_TAG_compile_unit ] [/usr2/kparzysz/s.hex/t/hwloop-dbg.c] [DW_LANG_C99] +!1 = metadata !{metadata !2} +!2 = metadata !{i32 0} +!3 = metadata !{metadata !4} +!4 = metadata !{metadata !5} +!5 = metadata !{i32 786478, i32 0, metadata !6, metadata !"foo", metadata !"foo", metadata !"", metadata !6, i32 1, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, void (i32*, i32*)* @foo, null, null, metadata !11, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [foo] +!6 = metadata !{i32 786473, metadata !"hwloop-dbg.c", metadata !"/usr2/kparzysz/s.hex/t", null} ; [ DW_TAG_file_type ] +!7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] +!8 = metadata !{null, metadata !9, metadata !9} +!9 = metadata !{i32 786447, null, metadata !"", null, i32 0, i64 32, i64 32, i64 0, i32 0, metadata !10} ; [ DW_TAG_pointer_type ] [line 0, size 32, align 32, offset 0] [from int] +!10 = metadata !{i32 786468, null, metadata !"int", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed] +!11 = metadata !{metadata !12} +!12 = metadata !{metadata !13, metadata !14, metadata !15} +!13 = metadata !{i32 786689, metadata !5, metadata !"a", metadata !6, i32 16777217, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [a] [line 1] +!14 = metadata !{i32 786689, metadata !5, metadata !"b", metadata !6, i32 33554433, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [b] [line 1] +!15 = metadata !{i32 786688, metadata !16, metadata !"i", metadata !6, i32 2, metadata !10, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 2] +!16 = metadata !{i32 786443, metadata !5, i32 1, i32 26, metadata !6, i32 0} ; [ DW_TAG_lexical_block ] [/usr2/kparzysz/s.hex/t/hwloop-dbg.c] +!17 = metadata !{i32 1, i32 15, metadata !5, null} +!18 = metadata !{i32 1, i32 23, metadata !5, null} +!19 = metadata !{i32 3, i32 8, 
metadata !20, null} +!20 = metadata !{i32 786443, metadata !16, i32 3, i32 3, metadata !6, i32 1} ; [ DW_TAG_lexical_block ] [/usr2/kparzysz/s.hex/t/hwloop-dbg.c] +!21 = metadata !{i32 4, i32 5, metadata !22, null} +!22 = metadata !{i32 786443, metadata !20, i32 3, i32 28, metadata !6, i32 2} ; [ DW_TAG_lexical_block ] [/usr2/kparzysz/s.hex/t/hwloop-dbg.c] +!23 = metadata !{metadata !"int", metadata !24} +!24 = metadata !{metadata !"omnipotent char", metadata !25} +!25 = metadata !{metadata !"Simple C/C++ TBAA"} +!26 = metadata !{i32 3, i32 23, metadata !20, null} +!27 = metadata !{i32 6, i32 1, metadata !16, null} diff --git a/test/CodeGen/Hexagon/hwloop-le.ll b/test/CodeGen/Hexagon/hwloop-le.ll new file mode 100644 index 0000000..9c8cec7 --- /dev/null +++ b/test/CodeGen/Hexagon/hwloop-le.ll @@ -0,0 +1,438 @@ +; RUN: llc -march=hexagon -mcpu=hexagonv4 -O3 < %s | FileCheck %s + + +; CHECK: test_pos1_ir_sle +; CHECK: loop0 +; a < b +define void @test_pos1_ir_sle(i8* nocapture %p, i32 %a, i32 %b) nounwind { +entry: + %cmp3 = icmp sle i32 28395, %b + br i1 %cmp3, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.lr.ph, %for.body + %i.04 = phi i32 [ 28395, %for.body.lr.ph ], [ %inc, %for.body ] + %arrayidx = getelementptr inbounds i8* %p, i32 %i.04 + %0 = load i8* %arrayidx, align 1 + %conv = zext i8 %0 to i32 + %add = add nsw i32 %conv, 1 + %conv1 = trunc i32 %add to i8 + store i8 %conv1, i8* %arrayidx, align 1 + %inc = add nsw i32 %i.04, 1 + %cmp = icmp sle i32 %inc, %b + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body, %entry + ret void +} + + + +; CHECK: test_pos2_ir_sle +; CHECK: loop0 +; a < b +define void @test_pos2_ir_sle(i8* nocapture %p, i32 %a, i32 %b) nounwind { +entry: + %cmp3 = icmp sle i32 9073, %b + br i1 %cmp3, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.lr.ph, %for.body + %i.04 = phi i32 [ 9073, %for.body.lr.ph ], [ %inc, %for.body ] + %arrayidx = getelementptr inbounds i8* %p, i32 %i.04 + %0 = load i8* %arrayidx, align 1 + %conv = zext i8 %0 to i32 + %add = add nsw i32 %conv, 1 + %conv1 = trunc i32 %add to i8 + store i8 %conv1, i8* %arrayidx, align 1 + %inc = add nsw i32 %i.04, 2 + %cmp = icmp sle i32 %inc, %b + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body, %entry + ret void +} + + + +; CHECK: test_pos4_ir_sle +; CHECK: loop0 +; a < b +define void @test_pos4_ir_sle(i8* nocapture %p, i32 %a, i32 %b) nounwind { +entry: + %cmp3 = icmp sle i32 21956, %b + br i1 %cmp3, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.lr.ph, %for.body + %i.04 = phi i32 [ 21956, %for.body.lr.ph ], [ %inc, %for.body ] + %arrayidx = getelementptr inbounds i8* %p, i32 %i.04 + %0 = load i8* %arrayidx, align 1 + %conv = zext i8 %0 to i32 + %add = add nsw i32 %conv, 1 + %conv1 = trunc i32 %add to i8 + store i8 %conv1, i8* %arrayidx, align 1 + %inc = add nsw i32 %i.04, 4 + %cmp = icmp sle i32 %inc, %b + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body, %entry + ret void +} + + + +; CHECK: test_pos8_ir_sle +; CHECK: loop0 +; a < b +define void @test_pos8_ir_sle(i8* nocapture %p, i32 %a, i32 %b) nounwind { +entry: + %cmp3 = icmp sle i32 16782, %b + br i1 %cmp3, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: ; preds = %entry + br label %for.body + +for.body: ; preds = 
%for.body.lr.ph, %for.body + %i.04 = phi i32 [ 16782, %for.body.lr.ph ], [ %inc, %for.body ] + %arrayidx = getelementptr inbounds i8* %p, i32 %i.04 + %0 = load i8* %arrayidx, align 1 + %conv = zext i8 %0 to i32 + %add = add nsw i32 %conv, 1 + %conv1 = trunc i32 %add to i8 + store i8 %conv1, i8* %arrayidx, align 1 + %inc = add nsw i32 %i.04, 8 + %cmp = icmp sle i32 %inc, %b + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body, %entry + ret void +} + + + +; CHECK: test_pos16_ir_sle +; CHECK: loop0 +; a < b +define void @test_pos16_ir_sle(i8* nocapture %p, i32 %a, i32 %b) nounwind { +entry: + %cmp3 = icmp sle i32 19097, %b + br i1 %cmp3, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.lr.ph, %for.body + %i.04 = phi i32 [ 19097, %for.body.lr.ph ], [ %inc, %for.body ] + %arrayidx = getelementptr inbounds i8* %p, i32 %i.04 + %0 = load i8* %arrayidx, align 1 + %conv = zext i8 %0 to i32 + %add = add nsw i32 %conv, 1 + %conv1 = trunc i32 %add to i8 + store i8 %conv1, i8* %arrayidx, align 1 + %inc = add nsw i32 %i.04, 16 + %cmp = icmp sle i32 %inc, %b + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body, %entry + ret void +} + + + +; CHECK: test_pos1_ri_sle +; CHECK: loop0 +; a < b +define void @test_pos1_ri_sle(i8* nocapture %p, i32 %a, i32 %b) nounwind { +entry: + %cmp3 = icmp sle i32 %a, 14040 + br i1 %cmp3, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.lr.ph, %for.body + %i.04 = phi i32 [ %a, %for.body.lr.ph ], [ %inc, %for.body ] + %arrayidx = getelementptr inbounds i8* %p, i32 %i.04 + %0 = load i8* %arrayidx, align 1 + %conv = zext i8 %0 to i32 + %add = add nsw i32 %conv, 1 + %conv1 = trunc i32 %add to i8 + store i8 %conv1, i8* %arrayidx, align 1 + %inc = add nsw i32 %i.04, 1 + %cmp = icmp sle i32 %inc, 14040 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body, %entry + ret void +} + + + +; CHECK: test_pos2_ri_sle +; CHECK: loop0 +; a < b +define void @test_pos2_ri_sle(i8* nocapture %p, i32 %a, i32 %b) nounwind { +entry: + %cmp3 = icmp sle i32 %a, 13710 + br i1 %cmp3, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.lr.ph, %for.body + %i.04 = phi i32 [ %a, %for.body.lr.ph ], [ %inc, %for.body ] + %arrayidx = getelementptr inbounds i8* %p, i32 %i.04 + %0 = load i8* %arrayidx, align 1 + %conv = zext i8 %0 to i32 + %add = add nsw i32 %conv, 1 + %conv1 = trunc i32 %add to i8 + store i8 %conv1, i8* %arrayidx, align 1 + %inc = add nsw i32 %i.04, 2 + %cmp = icmp sle i32 %inc, 13710 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body, %entry + ret void +} + + + +; CHECK: test_pos4_ri_sle +; CHECK: loop0 +; a < b +define void @test_pos4_ri_sle(i8* nocapture %p, i32 %a, i32 %b) nounwind { +entry: + %cmp3 = icmp sle i32 %a, 9920 + br i1 %cmp3, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.lr.ph, %for.body + %i.04 = phi i32 [ %a, %for.body.lr.ph ], [ %inc, %for.body ] + %arrayidx = getelementptr inbounds i8* %p, i32 %i.04 + %0 = load i8* %arrayidx, align 1 + %conv = zext i8 %0 to i32 + %add = add nsw i32 %conv, 1 + %conv1 = trunc i32 %add to i8 + store i8 %conv1, i8* %arrayidx, align 1 + %inc = add nsw i32 %i.04, 4 + %cmp = icmp sle i32 %inc, 9920 + br i1 %cmp, label %for.body, label 
%for.end + +for.end: ; preds = %for.body, %entry + ret void +} + + + +; CHECK: test_pos8_ri_sle +; CHECK: loop0 +; a < b +define void @test_pos8_ri_sle(i8* nocapture %p, i32 %a, i32 %b) nounwind { +entry: + %cmp3 = icmp sle i32 %a, 18924 + br i1 %cmp3, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.lr.ph, %for.body + %i.04 = phi i32 [ %a, %for.body.lr.ph ], [ %inc, %for.body ] + %arrayidx = getelementptr inbounds i8* %p, i32 %i.04 + %0 = load i8* %arrayidx, align 1 + %conv = zext i8 %0 to i32 + %add = add nsw i32 %conv, 1 + %conv1 = trunc i32 %add to i8 + store i8 %conv1, i8* %arrayidx, align 1 + %inc = add nsw i32 %i.04, 8 + %cmp = icmp sle i32 %inc, 18924 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body, %entry + ret void +} + + + +; CHECK: test_pos16_ri_sle +; CHECK: loop0 +; a < b +define void @test_pos16_ri_sle(i8* nocapture %p, i32 %a, i32 %b) nounwind { +entry: + %cmp3 = icmp sle i32 %a, 11812 + br i1 %cmp3, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.lr.ph, %for.body + %i.04 = phi i32 [ %a, %for.body.lr.ph ], [ %inc, %for.body ] + %arrayidx = getelementptr inbounds i8* %p, i32 %i.04 + %0 = load i8* %arrayidx, align 1 + %conv = zext i8 %0 to i32 + %add = add nsw i32 %conv, 1 + %conv1 = trunc i32 %add to i8 + store i8 %conv1, i8* %arrayidx, align 1 + %inc = add nsw i32 %i.04, 16 + %cmp = icmp sle i32 %inc, 11812 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body, %entry + ret void +} + + + +; CHECK: test_pos1_rr_sle +; CHECK: loop0 +; a < b +define void @test_pos1_rr_sle(i8* nocapture %p, i32 %a, i32 %b) nounwind { +entry: + %cmp3 = icmp sle i32 %a, %b + br i1 %cmp3, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.lr.ph, %for.body + %i.04 = phi i32 [ %a, %for.body.lr.ph ], [ %inc, %for.body ] + %arrayidx = getelementptr inbounds i8* %p, i32 %i.04 + %0 = load i8* %arrayidx, align 1 + %conv = zext i8 %0 to i32 + %add = add nsw i32 %conv, 1 + %conv1 = trunc i32 %add to i8 + store i8 %conv1, i8* %arrayidx, align 1 + %inc = add nsw i32 %i.04, 1 + %cmp = icmp sle i32 %inc, %b + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body, %entry + ret void +} + + + +; CHECK: test_pos2_rr_sle +; CHECK: loop0 +; a < b +define void @test_pos2_rr_sle(i8* nocapture %p, i32 %a, i32 %b) nounwind { +entry: + %cmp3 = icmp sle i32 %a, %b + br i1 %cmp3, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.lr.ph, %for.body + %i.04 = phi i32 [ %a, %for.body.lr.ph ], [ %inc, %for.body ] + %arrayidx = getelementptr inbounds i8* %p, i32 %i.04 + %0 = load i8* %arrayidx, align 1 + %conv = zext i8 %0 to i32 + %add = add nsw i32 %conv, 1 + %conv1 = trunc i32 %add to i8 + store i8 %conv1, i8* %arrayidx, align 1 + %inc = add nsw i32 %i.04, 2 + %cmp = icmp sle i32 %inc, %b + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body, %entry + ret void +} + + + +; CHECK: test_pos4_rr_sle +; CHECK: loop0 +; a < b +define void @test_pos4_rr_sle(i8* nocapture %p, i32 %a, i32 %b) nounwind { +entry: + %cmp3 = icmp sle i32 %a, %b + br i1 %cmp3, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.lr.ph, %for.body + %i.04 = phi i32 [ %a, 
%for.body.lr.ph ], [ %inc, %for.body ] + %arrayidx = getelementptr inbounds i8* %p, i32 %i.04 + %0 = load i8* %arrayidx, align 1 + %conv = zext i8 %0 to i32 + %add = add nsw i32 %conv, 1 + %conv1 = trunc i32 %add to i8 + store i8 %conv1, i8* %arrayidx, align 1 + %inc = add nsw i32 %i.04, 4 + %cmp = icmp sle i32 %inc, %b + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body, %entry + ret void +} + + + +; CHECK: test_pos8_rr_sle +; CHECK: loop0 +; a < b +define void @test_pos8_rr_sle(i8* nocapture %p, i32 %a, i32 %b) nounwind { +entry: + %cmp3 = icmp sle i32 %a, %b + br i1 %cmp3, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.lr.ph, %for.body + %i.04 = phi i32 [ %a, %for.body.lr.ph ], [ %inc, %for.body ] + %arrayidx = getelementptr inbounds i8* %p, i32 %i.04 + %0 = load i8* %arrayidx, align 1 + %conv = zext i8 %0 to i32 + %add = add nsw i32 %conv, 1 + %conv1 = trunc i32 %add to i8 + store i8 %conv1, i8* %arrayidx, align 1 + %inc = add nsw i32 %i.04, 8 + %cmp = icmp sle i32 %inc, %b + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body, %entry + ret void +} + + + +; CHECK: test_pos16_rr_sle +; CHECK: loop0 +; a < b +define void @test_pos16_rr_sle(i8* nocapture %p, i32 %a, i32 %b) nounwind { +entry: + %cmp3 = icmp sle i32 %a, %b + br i1 %cmp3, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.lr.ph, %for.body + %i.04 = phi i32 [ %a, %for.body.lr.ph ], [ %inc, %for.body ] + %arrayidx = getelementptr inbounds i8* %p, i32 %i.04 + %0 = load i8* %arrayidx, align 1 + %conv = zext i8 %0 to i32 + %add = add nsw i32 %conv, 1 + %conv1 = trunc i32 %add to i8 + store i8 %conv1, i8* %arrayidx, align 1 + %inc = add nsw i32 %i.04, 16 + %cmp = icmp sle i32 %inc, %b + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body, %entry + ret void +} + + + diff --git a/test/CodeGen/Hexagon/hwloop-lt.ll b/test/CodeGen/Hexagon/hwloop-lt.ll new file mode 100644 index 0000000..7e43733 --- /dev/null +++ b/test/CodeGen/Hexagon/hwloop-lt.ll @@ -0,0 +1,438 @@ +; RUN: llc -march=hexagon -mcpu=hexagonv4 -O3 < %s | FileCheck %s + + +; CHECK: test_pos1_ir_slt +; CHECK: loop0 +; a < b +define void @test_pos1_ir_slt(i8* nocapture %p, i32 %a, i32 %b) nounwind { +entry: + %cmp3 = icmp slt i32 8531, %b + br i1 %cmp3, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.lr.ph, %for.body + %i.04 = phi i32 [ 8531, %for.body.lr.ph ], [ %inc, %for.body ] + %arrayidx = getelementptr inbounds i8* %p, i32 %i.04 + %0 = load i8* %arrayidx, align 1 + %conv = zext i8 %0 to i32 + %add = add nsw i32 %conv, 1 + %conv1 = trunc i32 %add to i8 + store i8 %conv1, i8* %arrayidx, align 1 + %inc = add nsw i32 %i.04, 1 + %cmp = icmp slt i32 %inc, %b + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body, %entry + ret void +} + + + +; CHECK: test_pos2_ir_slt +; CHECK: loop0 +; a < b +define void @test_pos2_ir_slt(i8* nocapture %p, i32 %a, i32 %b) nounwind { +entry: + %cmp3 = icmp slt i32 9152, %b + br i1 %cmp3, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.lr.ph, %for.body + %i.04 = phi i32 [ 9152, %for.body.lr.ph ], [ %inc, %for.body ] + %arrayidx = getelementptr inbounds i8* %p, i32 %i.04 + %0 = load i8* %arrayidx, align 1 + %conv = zext i8 %0 to i32 + 
%add = add nsw i32 %conv, 1 + %conv1 = trunc i32 %add to i8 + store i8 %conv1, i8* %arrayidx, align 1 + %inc = add nsw i32 %i.04, 2 + %cmp = icmp slt i32 %inc, %b + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body, %entry + ret void +} + + + +; CHECK: test_pos4_ir_slt +; CHECK: loop0 +; a < b +define void @test_pos4_ir_slt(i8* nocapture %p, i32 %a, i32 %b) nounwind { +entry: + %cmp3 = icmp slt i32 18851, %b + br i1 %cmp3, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.lr.ph, %for.body + %i.04 = phi i32 [ 18851, %for.body.lr.ph ], [ %inc, %for.body ] + %arrayidx = getelementptr inbounds i8* %p, i32 %i.04 + %0 = load i8* %arrayidx, align 1 + %conv = zext i8 %0 to i32 + %add = add nsw i32 %conv, 1 + %conv1 = trunc i32 %add to i8 + store i8 %conv1, i8* %arrayidx, align 1 + %inc = add nsw i32 %i.04, 4 + %cmp = icmp slt i32 %inc, %b + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body, %entry + ret void +} + + + +; CHECK: test_pos8_ir_slt +; CHECK: loop0 +; a < b +define void @test_pos8_ir_slt(i8* nocapture %p, i32 %a, i32 %b) nounwind { +entry: + %cmp3 = icmp slt i32 25466, %b + br i1 %cmp3, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.lr.ph, %for.body + %i.04 = phi i32 [ 25466, %for.body.lr.ph ], [ %inc, %for.body ] + %arrayidx = getelementptr inbounds i8* %p, i32 %i.04 + %0 = load i8* %arrayidx, align 1 + %conv = zext i8 %0 to i32 + %add = add nsw i32 %conv, 1 + %conv1 = trunc i32 %add to i8 + store i8 %conv1, i8* %arrayidx, align 1 + %inc = add nsw i32 %i.04, 8 + %cmp = icmp slt i32 %inc, %b + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body, %entry + ret void +} + + + +; CHECK: test_pos16_ir_slt +; CHECK: loop0 +; a < b +define void @test_pos16_ir_slt(i8* nocapture %p, i32 %a, i32 %b) nounwind { +entry: + %cmp3 = icmp slt i32 9295, %b + br i1 %cmp3, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.lr.ph, %for.body + %i.04 = phi i32 [ 9295, %for.body.lr.ph ], [ %inc, %for.body ] + %arrayidx = getelementptr inbounds i8* %p, i32 %i.04 + %0 = load i8* %arrayidx, align 1 + %conv = zext i8 %0 to i32 + %add = add nsw i32 %conv, 1 + %conv1 = trunc i32 %add to i8 + store i8 %conv1, i8* %arrayidx, align 1 + %inc = add nsw i32 %i.04, 16 + %cmp = icmp slt i32 %inc, %b + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body, %entry + ret void +} + + + +; CHECK: test_pos1_ri_slt +; CHECK: loop0 +; a < b +define void @test_pos1_ri_slt(i8* nocapture %p, i32 %a, i32 %b) nounwind { +entry: + %cmp3 = icmp slt i32 %a, 31236 + br i1 %cmp3, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.lr.ph, %for.body + %i.04 = phi i32 [ %a, %for.body.lr.ph ], [ %inc, %for.body ] + %arrayidx = getelementptr inbounds i8* %p, i32 %i.04 + %0 = load i8* %arrayidx, align 1 + %conv = zext i8 %0 to i32 + %add = add nsw i32 %conv, 1 + %conv1 = trunc i32 %add to i8 + store i8 %conv1, i8* %arrayidx, align 1 + %inc = add nsw i32 %i.04, 1 + %cmp = icmp slt i32 %inc, 31236 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body, %entry + ret void +} + + + +; CHECK: test_pos2_ri_slt +; CHECK: loop0 +; a < b +define void @test_pos2_ri_slt(i8* nocapture %p, i32 %a, i32 %b) nounwind { +entry: + %cmp3 = 
icmp slt i32 %a, 22653 + br i1 %cmp3, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.lr.ph, %for.body + %i.04 = phi i32 [ %a, %for.body.lr.ph ], [ %inc, %for.body ] + %arrayidx = getelementptr inbounds i8* %p, i32 %i.04 + %0 = load i8* %arrayidx, align 1 + %conv = zext i8 %0 to i32 + %add = add nsw i32 %conv, 1 + %conv1 = trunc i32 %add to i8 + store i8 %conv1, i8* %arrayidx, align 1 + %inc = add nsw i32 %i.04, 2 + %cmp = icmp slt i32 %inc, 22653 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body, %entry + ret void +} + + + +; CHECK: test_pos4_ri_slt +; CHECK: loop0 +; a < b +define void @test_pos4_ri_slt(i8* nocapture %p, i32 %a, i32 %b) nounwind { +entry: + %cmp3 = icmp slt i32 %a, 1431 + br i1 %cmp3, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.lr.ph, %for.body + %i.04 = phi i32 [ %a, %for.body.lr.ph ], [ %inc, %for.body ] + %arrayidx = getelementptr inbounds i8* %p, i32 %i.04 + %0 = load i8* %arrayidx, align 1 + %conv = zext i8 %0 to i32 + %add = add nsw i32 %conv, 1 + %conv1 = trunc i32 %add to i8 + store i8 %conv1, i8* %arrayidx, align 1 + %inc = add nsw i32 %i.04, 4 + %cmp = icmp slt i32 %inc, 1431 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body, %entry + ret void +} + + + +; CHECK: test_pos8_ri_slt +; CHECK: loop0 +; a < b +define void @test_pos8_ri_slt(i8* nocapture %p, i32 %a, i32 %b) nounwind { +entry: + %cmp3 = icmp slt i32 %a, 22403 + br i1 %cmp3, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.lr.ph, %for.body + %i.04 = phi i32 [ %a, %for.body.lr.ph ], [ %inc, %for.body ] + %arrayidx = getelementptr inbounds i8* %p, i32 %i.04 + %0 = load i8* %arrayidx, align 1 + %conv = zext i8 %0 to i32 + %add = add nsw i32 %conv, 1 + %conv1 = trunc i32 %add to i8 + store i8 %conv1, i8* %arrayidx, align 1 + %inc = add nsw i32 %i.04, 8 + %cmp = icmp slt i32 %inc, 22403 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body, %entry + ret void +} + + + +; CHECK: test_pos16_ri_slt +; CHECK: loop0 +; a < b +define void @test_pos16_ri_slt(i8* nocapture %p, i32 %a, i32 %b) nounwind { +entry: + %cmp3 = icmp slt i32 %a, 21715 + br i1 %cmp3, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.lr.ph, %for.body + %i.04 = phi i32 [ %a, %for.body.lr.ph ], [ %inc, %for.body ] + %arrayidx = getelementptr inbounds i8* %p, i32 %i.04 + %0 = load i8* %arrayidx, align 1 + %conv = zext i8 %0 to i32 + %add = add nsw i32 %conv, 1 + %conv1 = trunc i32 %add to i8 + store i8 %conv1, i8* %arrayidx, align 1 + %inc = add nsw i32 %i.04, 16 + %cmp = icmp slt i32 %inc, 21715 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body, %entry + ret void +} + + + +; CHECK: test_pos1_rr_slt +; CHECK: loop0 +; a < b +define void @test_pos1_rr_slt(i8* nocapture %p, i32 %a, i32 %b) nounwind { +entry: + %cmp3 = icmp slt i32 %a, %b + br i1 %cmp3, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.lr.ph, %for.body + %i.04 = phi i32 [ %a, %for.body.lr.ph ], [ %inc, %for.body ] + %arrayidx = getelementptr inbounds i8* %p, i32 %i.04 + %0 = load i8* %arrayidx, align 1 + %conv = zext i8 %0 to i32 + %add = add nsw i32 %conv, 1 + %conv1 = trunc i32 %add 
to i8 + store i8 %conv1, i8* %arrayidx, align 1 + %inc = add nsw i32 %i.04, 1 + %cmp = icmp slt i32 %inc, %b + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body, %entry + ret void +} + + + +; CHECK: test_pos2_rr_slt +; CHECK: loop0 +; a < b +define void @test_pos2_rr_slt(i8* nocapture %p, i32 %a, i32 %b) nounwind { +entry: + %cmp3 = icmp slt i32 %a, %b + br i1 %cmp3, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.lr.ph, %for.body + %i.04 = phi i32 [ %a, %for.body.lr.ph ], [ %inc, %for.body ] + %arrayidx = getelementptr inbounds i8* %p, i32 %i.04 + %0 = load i8* %arrayidx, align 1 + %conv = zext i8 %0 to i32 + %add = add nsw i32 %conv, 1 + %conv1 = trunc i32 %add to i8 + store i8 %conv1, i8* %arrayidx, align 1 + %inc = add nsw i32 %i.04, 2 + %cmp = icmp slt i32 %inc, %b + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body, %entry + ret void +} + + + +; CHECK: test_pos4_rr_slt +; CHECK: loop0 +; a < b +define void @test_pos4_rr_slt(i8* nocapture %p, i32 %a, i32 %b) nounwind { +entry: + %cmp3 = icmp slt i32 %a, %b + br i1 %cmp3, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.lr.ph, %for.body + %i.04 = phi i32 [ %a, %for.body.lr.ph ], [ %inc, %for.body ] + %arrayidx = getelementptr inbounds i8* %p, i32 %i.04 + %0 = load i8* %arrayidx, align 1 + %conv = zext i8 %0 to i32 + %add = add nsw i32 %conv, 1 + %conv1 = trunc i32 %add to i8 + store i8 %conv1, i8* %arrayidx, align 1 + %inc = add nsw i32 %i.04, 4 + %cmp = icmp slt i32 %inc, %b + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body, %entry + ret void +} + + + +; CHECK: test_pos8_rr_slt +; CHECK: loop0 +; a < b +define void @test_pos8_rr_slt(i8* nocapture %p, i32 %a, i32 %b) nounwind { +entry: + %cmp3 = icmp slt i32 %a, %b + br i1 %cmp3, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.lr.ph, %for.body + %i.04 = phi i32 [ %a, %for.body.lr.ph ], [ %inc, %for.body ] + %arrayidx = getelementptr inbounds i8* %p, i32 %i.04 + %0 = load i8* %arrayidx, align 1 + %conv = zext i8 %0 to i32 + %add = add nsw i32 %conv, 1 + %conv1 = trunc i32 %add to i8 + store i8 %conv1, i8* %arrayidx, align 1 + %inc = add nsw i32 %i.04, 8 + %cmp = icmp slt i32 %inc, %b + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body, %entry + ret void +} + + + +; CHECK: test_pos16_rr_slt +; CHECK: loop0 +; a < b +define void @test_pos16_rr_slt(i8* nocapture %p, i32 %a, i32 %b) nounwind { +entry: + %cmp3 = icmp slt i32 %a, %b + br i1 %cmp3, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.lr.ph, %for.body + %i.04 = phi i32 [ %a, %for.body.lr.ph ], [ %inc, %for.body ] + %arrayidx = getelementptr inbounds i8* %p, i32 %i.04 + %0 = load i8* %arrayidx, align 1 + %conv = zext i8 %0 to i32 + %add = add nsw i32 %conv, 1 + %conv1 = trunc i32 %add to i8 + store i8 %conv1, i8* %arrayidx, align 1 + %inc = add nsw i32 %i.04, 16 + %cmp = icmp slt i32 %inc, %b + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body, %entry + ret void +} + + + diff --git a/test/CodeGen/Hexagon/hwloop-lt1.ll b/test/CodeGen/Hexagon/hwloop-lt1.ll new file mode 100644 index 0000000..cf58740 --- /dev/null +++ b/test/CodeGen/Hexagon/hwloop-lt1.ll @@ -0,0 +1,32 @@ +; RUN: llc 
-march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s +; Check that we generate a hardware loop instruction. +; CHECK: endloop0 + +@A = common global [400 x i8] zeroinitializer, align 8 +@B = common global [400 x i8] zeroinitializer, align 8 +@C = common global [400 x i8] zeroinitializer, align 8 + +define void @run() nounwind { +entry: + br label %polly.loop_body + +polly.loop_after: ; preds = %polly.loop_body + ret void + +polly.loop_body: ; preds = %entry, %polly.loop_body + %polly.loopiv16 = phi i32 [ 0, %entry ], [ %polly.next_loopiv, %polly.loop_body ] + %polly.next_loopiv = add i32 %polly.loopiv16, 4 + %p_vector_iv14 = or i32 %polly.loopiv16, 1 + %p_vector_iv3 = add i32 %p_vector_iv14, 1 + %p_vector_iv415 = or i32 %polly.loopiv16, 3 + %p_arrayidx = getelementptr [400 x i8]* @A, i32 0, i32 %polly.loopiv16 + %p_arrayidx5 = getelementptr [400 x i8]* @A, i32 0, i32 %p_vector_iv14 + %p_arrayidx6 = getelementptr [400 x i8]* @A, i32 0, i32 %p_vector_iv3 + %p_arrayidx7 = getelementptr [400 x i8]* @A, i32 0, i32 %p_vector_iv415 + store i8 123, i8* %p_arrayidx, align 1 + store i8 123, i8* %p_arrayidx5, align 1 + store i8 123, i8* %p_arrayidx6, align 1 + store i8 123, i8* %p_arrayidx7, align 1 + %0 = icmp slt i32 %polly.next_loopiv, 400 + br i1 %0, label %polly.loop_body, label %polly.loop_after +} diff --git a/test/CodeGen/Hexagon/hwloop-ne.ll b/test/CodeGen/Hexagon/hwloop-ne.ll new file mode 100644 index 0000000..bceef2a --- /dev/null +++ b/test/CodeGen/Hexagon/hwloop-ne.ll @@ -0,0 +1,438 @@ +; RUN: llc -march=hexagon -mcpu=hexagonv4 -O3 < %s | FileCheck %s + + +; CHECK: test_pos1_ir_ne +; CHECK: loop0 +; a < b +define void @test_pos1_ir_ne(i8* nocapture %p, i32 %a, i32 %b) nounwind { +entry: + %cmp3 = icmp slt i32 32623, %b + br i1 %cmp3, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.lr.ph, %for.body + %i.04 = phi i32 [ 32623, %for.body.lr.ph ], [ %inc, %for.body ] + %arrayidx = getelementptr inbounds i8* %p, i32 %i.04 + %0 = load i8* %arrayidx, align 1 + %conv = zext i8 %0 to i32 + %add = add nsw i32 %conv, 1 + %conv1 = trunc i32 %add to i8 + store i8 %conv1, i8* %arrayidx, align 1 + %inc = add nsw i32 %i.04, 1 + %cmp = icmp ne i32 %inc, %b + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body, %entry + ret void +} + + + +; CHECK: test_pos2_ir_ne +; CHECK: loop0 +; a < b +define void @test_pos2_ir_ne(i8* nocapture %p, i32 %a, i32 %b) nounwind { +entry: + %cmp3 = icmp slt i32 29554, %b + br i1 %cmp3, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.lr.ph, %for.body + %i.04 = phi i32 [ 29554, %for.body.lr.ph ], [ %inc, %for.body ] + %arrayidx = getelementptr inbounds i8* %p, i32 %i.04 + %0 = load i8* %arrayidx, align 1 + %conv = zext i8 %0 to i32 + %add = add nsw i32 %conv, 1 + %conv1 = trunc i32 %add to i8 + store i8 %conv1, i8* %arrayidx, align 1 + %inc = add nsw i32 %i.04, 2 + %cmp = icmp ne i32 %inc, %b + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body, %entry + ret void +} + + + +; CHECK: test_pos4_ir_ne +; CHECK: loop0 +; a < b +define void @test_pos4_ir_ne(i8* nocapture %p, i32 %a, i32 %b) nounwind { +entry: + %cmp3 = icmp slt i32 15692, %b + br i1 %cmp3, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.lr.ph, %for.body + %i.04 = phi i32 [ 15692, %for.body.lr.ph ], [ %inc, %for.body ] + %arrayidx = 
getelementptr inbounds i8* %p, i32 %i.04 + %0 = load i8* %arrayidx, align 1 + %conv = zext i8 %0 to i32 + %add = add nsw i32 %conv, 1 + %conv1 = trunc i32 %add to i8 + store i8 %conv1, i8* %arrayidx, align 1 + %inc = add nsw i32 %i.04, 4 + %cmp = icmp ne i32 %inc, %b + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body, %entry + ret void +} + + + +; CHECK: test_pos8_ir_ne +; CHECK: loop0 +; a < b +define void @test_pos8_ir_ne(i8* nocapture %p, i32 %a, i32 %b) nounwind { +entry: + %cmp3 = icmp slt i32 10449, %b + br i1 %cmp3, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.lr.ph, %for.body + %i.04 = phi i32 [ 10449, %for.body.lr.ph ], [ %inc, %for.body ] + %arrayidx = getelementptr inbounds i8* %p, i32 %i.04 + %0 = load i8* %arrayidx, align 1 + %conv = zext i8 %0 to i32 + %add = add nsw i32 %conv, 1 + %conv1 = trunc i32 %add to i8 + store i8 %conv1, i8* %arrayidx, align 1 + %inc = add nsw i32 %i.04, 8 + %cmp = icmp ne i32 %inc, %b + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body, %entry + ret void +} + + + +; CHECK: test_pos16_ir_ne +; CHECK: loop0 +; a < b +define void @test_pos16_ir_ne(i8* nocapture %p, i32 %a, i32 %b) nounwind { +entry: + %cmp3 = icmp slt i32 32087, %b + br i1 %cmp3, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.lr.ph, %for.body + %i.04 = phi i32 [ 32087, %for.body.lr.ph ], [ %inc, %for.body ] + %arrayidx = getelementptr inbounds i8* %p, i32 %i.04 + %0 = load i8* %arrayidx, align 1 + %conv = zext i8 %0 to i32 + %add = add nsw i32 %conv, 1 + %conv1 = trunc i32 %add to i8 + store i8 %conv1, i8* %arrayidx, align 1 + %inc = add nsw i32 %i.04, 16 + %cmp = icmp ne i32 %inc, %b + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body, %entry + ret void +} + + + +; CHECK: test_pos1_ri_ne +; CHECK: loop0 +; a < b +define void @test_pos1_ri_ne(i8* nocapture %p, i32 %a, i32 %b) nounwind { +entry: + %cmp3 = icmp slt i32 %a, 3472 + br i1 %cmp3, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.lr.ph, %for.body + %i.04 = phi i32 [ %a, %for.body.lr.ph ], [ %inc, %for.body ] + %arrayidx = getelementptr inbounds i8* %p, i32 %i.04 + %0 = load i8* %arrayidx, align 1 + %conv = zext i8 %0 to i32 + %add = add nsw i32 %conv, 1 + %conv1 = trunc i32 %add to i8 + store i8 %conv1, i8* %arrayidx, align 1 + %inc = add nsw i32 %i.04, 1 + %cmp = icmp ne i32 %inc, 3472 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body, %entry + ret void +} + + + +; CHECK: test_pos2_ri_ne +; CHECK: loop0 +; a < b +define void @test_pos2_ri_ne(i8* nocapture %p, i32 %a, i32 %b) nounwind { +entry: + %cmp3 = icmp slt i32 %a, 8730 + br i1 %cmp3, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.lr.ph, %for.body + %i.04 = phi i32 [ %a, %for.body.lr.ph ], [ %inc, %for.body ] + %arrayidx = getelementptr inbounds i8* %p, i32 %i.04 + %0 = load i8* %arrayidx, align 1 + %conv = zext i8 %0 to i32 + %add = add nsw i32 %conv, 1 + %conv1 = trunc i32 %add to i8 + store i8 %conv1, i8* %arrayidx, align 1 + %inc = add nsw i32 %i.04, 2 + %cmp = icmp ne i32 %inc, 8730 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body, %entry + ret void +} + + + +; CHECK: test_pos4_ri_ne +; CHECK: loop0 +; a < b 
+define void @test_pos4_ri_ne(i8* nocapture %p, i32 %a, i32 %b) nounwind { +entry: + %cmp3 = icmp slt i32 %a, 1493 + br i1 %cmp3, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.lr.ph, %for.body + %i.04 = phi i32 [ %a, %for.body.lr.ph ], [ %inc, %for.body ] + %arrayidx = getelementptr inbounds i8* %p, i32 %i.04 + %0 = load i8* %arrayidx, align 1 + %conv = zext i8 %0 to i32 + %add = add nsw i32 %conv, 1 + %conv1 = trunc i32 %add to i8 + store i8 %conv1, i8* %arrayidx, align 1 + %inc = add nsw i32 %i.04, 4 + %cmp = icmp ne i32 %inc, 1493 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body, %entry + ret void +} + + + +; CHECK: test_pos8_ri_ne +; CHECK: loop0 +; a < b +define void @test_pos8_ri_ne(i8* nocapture %p, i32 %a, i32 %b) nounwind { +entry: + %cmp3 = icmp slt i32 %a, 1706 + br i1 %cmp3, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.lr.ph, %for.body + %i.04 = phi i32 [ %a, %for.body.lr.ph ], [ %inc, %for.body ] + %arrayidx = getelementptr inbounds i8* %p, i32 %i.04 + %0 = load i8* %arrayidx, align 1 + %conv = zext i8 %0 to i32 + %add = add nsw i32 %conv, 1 + %conv1 = trunc i32 %add to i8 + store i8 %conv1, i8* %arrayidx, align 1 + %inc = add nsw i32 %i.04, 8 + %cmp = icmp ne i32 %inc, 1706 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body, %entry + ret void +} + + + +; CHECK: test_pos16_ri_ne +; CHECK: loop0 +; a < b +define void @test_pos16_ri_ne(i8* nocapture %p, i32 %a, i32 %b) nounwind { +entry: + %cmp3 = icmp slt i32 %a, 1886 + br i1 %cmp3, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.lr.ph, %for.body + %i.04 = phi i32 [ %a, %for.body.lr.ph ], [ %inc, %for.body ] + %arrayidx = getelementptr inbounds i8* %p, i32 %i.04 + %0 = load i8* %arrayidx, align 1 + %conv = zext i8 %0 to i32 + %add = add nsw i32 %conv, 1 + %conv1 = trunc i32 %add to i8 + store i8 %conv1, i8* %arrayidx, align 1 + %inc = add nsw i32 %i.04, 16 + %cmp = icmp ne i32 %inc, 1886 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body, %entry + ret void +} + + + +; CHECK: test_pos1_rr_ne +; CHECK: loop0 +; a < b +define void @test_pos1_rr_ne(i8* nocapture %p, i32 %a, i32 %b) nounwind { +entry: + %cmp3 = icmp slt i32 %a, %b + br i1 %cmp3, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.lr.ph, %for.body + %i.04 = phi i32 [ %a, %for.body.lr.ph ], [ %inc, %for.body ] + %arrayidx = getelementptr inbounds i8* %p, i32 %i.04 + %0 = load i8* %arrayidx, align 1 + %conv = zext i8 %0 to i32 + %add = add nsw i32 %conv, 1 + %conv1 = trunc i32 %add to i8 + store i8 %conv1, i8* %arrayidx, align 1 + %inc = add nsw i32 %i.04, 1 + %cmp = icmp ne i32 %inc, %b + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body, %entry + ret void +} + + + +; CHECK: test_pos2_rr_ne +; CHECK: loop0 +; a < b +define void @test_pos2_rr_ne(i8* nocapture %p, i32 %a, i32 %b) nounwind { +entry: + %cmp3 = icmp slt i32 %a, %b + br i1 %cmp3, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.lr.ph, %for.body + %i.04 = phi i32 [ %a, %for.body.lr.ph ], [ %inc, %for.body ] + %arrayidx = getelementptr inbounds i8* %p, i32 %i.04 + %0 = load i8* %arrayidx, align 1 + %conv = 
zext i8 %0 to i32 + %add = add nsw i32 %conv, 1 + %conv1 = trunc i32 %add to i8 + store i8 %conv1, i8* %arrayidx, align 1 + %inc = add nsw i32 %i.04, 2 + %cmp = icmp ne i32 %inc, %b + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body, %entry + ret void +} + + + +; CHECK: test_pos4_rr_ne +; CHECK: loop0 +; a < b +define void @test_pos4_rr_ne(i8* nocapture %p, i32 %a, i32 %b) nounwind { +entry: + %cmp3 = icmp slt i32 %a, %b + br i1 %cmp3, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.lr.ph, %for.body + %i.04 = phi i32 [ %a, %for.body.lr.ph ], [ %inc, %for.body ] + %arrayidx = getelementptr inbounds i8* %p, i32 %i.04 + %0 = load i8* %arrayidx, align 1 + %conv = zext i8 %0 to i32 + %add = add nsw i32 %conv, 1 + %conv1 = trunc i32 %add to i8 + store i8 %conv1, i8* %arrayidx, align 1 + %inc = add nsw i32 %i.04, 4 + %cmp = icmp ne i32 %inc, %b + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body, %entry + ret void +} + + + +; CHECK: test_pos8_rr_ne +; CHECK: loop0 +; a < b +define void @test_pos8_rr_ne(i8* nocapture %p, i32 %a, i32 %b) nounwind { +entry: + %cmp3 = icmp slt i32 %a, %b + br i1 %cmp3, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.lr.ph, %for.body + %i.04 = phi i32 [ %a, %for.body.lr.ph ], [ %inc, %for.body ] + %arrayidx = getelementptr inbounds i8* %p, i32 %i.04 + %0 = load i8* %arrayidx, align 1 + %conv = zext i8 %0 to i32 + %add = add nsw i32 %conv, 1 + %conv1 = trunc i32 %add to i8 + store i8 %conv1, i8* %arrayidx, align 1 + %inc = add nsw i32 %i.04, 8 + %cmp = icmp ne i32 %inc, %b + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body, %entry + ret void +} + + + +; CHECK: test_pos16_rr_ne +; CHECK: loop0 +; a < b +define void @test_pos16_rr_ne(i8* nocapture %p, i32 %a, i32 %b) nounwind { +entry: + %cmp3 = icmp slt i32 %a, %b + br i1 %cmp3, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.lr.ph, %for.body + %i.04 = phi i32 [ %a, %for.body.lr.ph ], [ %inc, %for.body ] + %arrayidx = getelementptr inbounds i8* %p, i32 %i.04 + %0 = load i8* %arrayidx, align 1 + %conv = zext i8 %0 to i32 + %add = add nsw i32 %conv, 1 + %conv1 = trunc i32 %add to i8 + store i8 %conv1, i8* %arrayidx, align 1 + %inc = add nsw i32 %i.04, 16 + %cmp = icmp ne i32 %inc, %b + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body, %entry + ret void +} + + + diff --git a/test/CodeGen/Hexagon/idxload-with-zero-offset.ll b/test/CodeGen/Hexagon/idxload-with-zero-offset.ll new file mode 100644 index 0000000..ca6df88 --- /dev/null +++ b/test/CodeGen/Hexagon/idxload-with-zero-offset.ll @@ -0,0 +1,70 @@ +; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s +; Check that we generate load instructions with (base + register offset << 0) + +; load word + +define i32 @load_w(i32* nocapture %a, i32 %n) nounwind { +; CHECK: r{{[0-9]+}}{{ *}}={{ *}}memw(r{{[0-9]+}}+r{{[0-9]+}}<<#0) +entry: + %tmp = shl i32 %n, 4 + %scevgep9 = getelementptr i32* %a, i32 %tmp + %val = load i32* %scevgep9, align 4 + ret i32 %val +} + +; load unsigned half word + +define i16 @load_uh(i16* nocapture %a, i32 %n) nounwind { +; CHECK: r{{[0-9]+}}{{ *}}={{ *}}memuh(r{{[0-9]+}}+r{{[0-9]+}}<<#0) +entry: + %tmp = shl i32 %n, 4 + %scevgep9 = getelementptr i16* %a, i32 %tmp + %val = load i16* %scevgep9, align 2 +
ret i16 %val +} + +; load signed half word + +define i32 @load_h(i16* nocapture %a, i32 %n) nounwind { +; CHECK: r{{[0-9]+}}{{ *}}={{ *}}memh(r{{[0-9]+}}+r{{[0-9]+}}<<#0) +entry: + %tmp = shl i32 %n, 4 + %scevgep9 = getelementptr i16* %a, i32 %tmp + %val = load i16* %scevgep9, align 2 + %conv = sext i16 %val to i32 + ret i32 %conv +} + +; load unsigned byte + +define i8 @load_ub(i8* nocapture %a, i32 %n) nounwind { +; CHECK: r{{[0-9]+}}{{ *}}={{ *}}memub(r{{[0-9]+}}+r{{[0-9]+}}<<#0) +entry: + %tmp = shl i32 %n, 4 + %scevgep9 = getelementptr i8* %a, i32 %tmp + %val = load i8* %scevgep9, align 1 + ret i8 %val +} + +; load signed byte + +define i32 @foo_2(i8* nocapture %a, i32 %n) nounwind { +; CHECK: r{{[0-9]+}}{{ *}}={{ *}}memb(r{{[0-9]+}}+r{{[0-9]+}}<<#0) +entry: + %tmp = shl i32 %n, 4 + %scevgep9 = getelementptr i8* %a, i32 %tmp + %val = load i8* %scevgep9, align 1 + %conv = sext i8 %val to i32 + ret i32 %conv +} + +; load doubleword + +define i64 @load_d(i64* nocapture %a, i32 %n) nounwind { +; CHECK: r{{[0-9]+}}:{{[0-9]+}}{{ *}}={{ *}}memd(r{{[0-9]+}}+r{{[0-9]+}}<<#0) +entry: + %tmp = shl i32 %n, 4 + %scevgep9 = getelementptr i64* %a, i32 %tmp + %val = load i64* %scevgep9, align 8 + ret i64 %val +} diff --git a/test/CodeGen/Hexagon/postinc-store.ll b/test/CodeGen/Hexagon/postinc-store.ll new file mode 100644 index 0000000..99a3a58 --- /dev/null +++ b/test/CodeGen/Hexagon/postinc-store.ll @@ -0,0 +1,29 @@ +; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s + +; Check that post-increment store instructions are being generated. +; CHECK: memw(r{{[0-9]+}}{{ *}}++{{ *}}#4{{ *}}){{ *}}={{ *}}r{{[0-9]+}} + +define i32 @sum(i32* nocapture %a, i16* nocapture %b, i32 %n) nounwind { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %lsr.iv = phi i32 [ %lsr.iv.next, %for.body ], [ 10, %entry ] + %arrayidx.phi = phi i32* [ %a, %entry ], [ %arrayidx.inc, %for.body ] + %arrayidx1.phi = phi i16* [ %b, %entry ], [ %arrayidx1.inc, %for.body ] + %0 = load i32* %arrayidx.phi, align 4 + %1 = load i16* %arrayidx1.phi, align 2 + %conv = sext i16 %1 to i32 + %factor = mul i32 %0, 2 + %add3 = add i32 %factor, %conv + store i32 %add3, i32* %arrayidx.phi, align 4 + + %arrayidx.inc = getelementptr i32* %arrayidx.phi, i32 1 + %arrayidx1.inc = getelementptr i16* %arrayidx1.phi, i32 1 + %lsr.iv.next = add i32 %lsr.iv, -1 + %exitcond = icmp eq i32 %lsr.iv.next, 0 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret i32 0 +} diff --git a/test/CodeGen/Hexagon/pred-absolute-store.ll b/test/CodeGen/Hexagon/pred-absolute-store.ll new file mode 100644 index 0000000..b1b09f4 --- /dev/null +++ b/test/CodeGen/Hexagon/pred-absolute-store.ll @@ -0,0 +1,19 @@ +; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s +; Check that we are able to predicate instructions with absolute +; addressing mode.
+ +; CHECK: if{{ *}}(p{{[0-3]+}}){{ *}}memw(##gvar){{ *}}={{ *}}r{{[0-9]+}} + +@gvar = external global i32 +define i32 @test2(i32 %a, i32 %b) nounwind { +entry: + %cmp = icmp eq i32 %a, %b + br i1 %cmp, label %if.then, label %if.end + +if.then: + store i32 %a, i32* @gvar, align 4 + br label %if.end + +if.end: + ret i32 %b +} diff --git a/test/CodeGen/Hexagon/predicate-copy.ll b/test/CodeGen/Hexagon/predicate-copy.ll new file mode 100644 index 0000000..552b687 --- /dev/null +++ b/test/CodeGen/Hexagon/predicate-copy.ll @@ -0,0 +1,8 @@ +; RUN: llc -march=hexagon -mcpu=hexagonv4 -O3 < %s | FileCheck %s + +; CHECK: r{{[0-9]+}} = p{{[0-9]+}} +define i1 @foo() { +entry: + ret i1 false +} + diff --git a/test/CodeGen/Hexagon/struct_args.ll b/test/CodeGen/Hexagon/struct_args.ll index e488f33..f91300b 100644 --- a/test/CodeGen/Hexagon/struct_args.ll +++ b/test/CodeGen/Hexagon/struct_args.ll @@ -1,5 +1,5 @@ ; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s -; CHECK: r{{[0-9]}}:{{[0-9]}} = combine(r{{[0-9]}}, r{{[0-9]}}) +; CHECK: r{{[0-9]}}:{{[0-9]}} = combine({{r[0-9]|#0}}, r{{[0-9]}}) ; CHECK: r{{[0-9]}}:{{[0-9]}} |= asl(r{{[0-9]}}:{{[0-9]}}, #32) %struct.small = type { i32, i32 } diff --git a/test/CodeGen/Hexagon/validate-offset.ll b/test/CodeGen/Hexagon/validate-offset.ll new file mode 100644 index 0000000..9e7d0aa --- /dev/null +++ b/test/CodeGen/Hexagon/validate-offset.ll @@ -0,0 +1,36 @@ +; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s -O0 + +; This is a regression test which makes sure that the offset check +; is available for the STRiw_indexed instruction. This is required +; by the 'Hexagon Expand Predicate Spill Code' pass. + +define i32 @f(i32 %a, i32 %b) nounwind { +entry: + %retval = alloca i32, align 4 + %a.addr = alloca i32, align 4 + %b.addr = alloca i32, align 4 + store i32 %a, i32* %a.addr, align 4 + store i32 %b, i32* %b.addr, align 4 + %0 = load i32* %a.addr, align 4 + %1 = load i32* %b.addr, align 4 + %cmp = icmp sgt i32 %0, %1 + br i1 %cmp, label %if.then, label %if.else + +if.then: + %2 = load i32* %a.addr, align 4 + %3 = load i32* %b.addr, align 4 + %add = add nsw i32 %2, %3 + store i32 %add, i32* %retval + br label %return + +if.else: + %4 = load i32* %a.addr, align 4 + %5 = load i32* %b.addr, align 4 + %sub = sub nsw i32 %4, %5 + store i32 %sub, i32* %retval + br label %return + +return: + %6 = load i32* %retval + ret i32 %6 +} diff --git a/test/CodeGen/Mips/addi.ll b/test/CodeGen/Mips/addi.ll new file mode 100644 index 0000000..8f70a46 --- /dev/null +++ b/test/CodeGen/Mips/addi.ll @@ -0,0 +1,30 @@ +; RUN: llc -march=mipsel -mcpu=mips16 -mips16-hard-float -soft-float -relocation-model=static < %s | FileCheck %s -check-prefix=16 + +@i = global i32 6, align 4 +@j = global i32 12, align 4 +@k = global i32 15, align 4 +@l = global i32 20, align 4 +@.str = private unnamed_addr constant [13 x i8] c"%i %i %i %i\0A\00", align 1 + +define void @foo() nounwind { +entry: + %0 = load i32* @i, align 4 + %add = add nsw i32 %0, 5 + store i32 %add, i32* @i, align 4 + %1 = load i32* @j, align 4 + %sub = sub nsw i32 %1, 5 + store i32 %sub, i32* @j, align 4 + %2 = load i32* @k, align 4 + %add1 = add nsw i32 %2, 10000 + store i32 %add1, i32* @k, align 4 + %3 = load i32* @l, align 4 + %sub2 = sub nsw i32 %3, 10000 + store i32 %sub2, i32* @l, align 4 +; 16: addiu ${{[0-9]+}}, 5 # 16 bit inst +; 16: addiu ${{[0-9]+}}, -5 # 16 bit inst +; 16: addiu ${{[0-9]+}}, 10000 +; 16: addiu ${{[0-9]+}}, -10000 + ret void +} + + diff --git a/test/CodeGen/Mips/align16.ll b/test/CodeGen/Mips/align16.ll new
file mode 100644 index 0000000..99139ab --- /dev/null +++ b/test/CodeGen/Mips/align16.ll @@ -0,0 +1,31 @@ +; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=static < %s | FileCheck %s -check-prefix=16 + +@i = global i32 25, align 4 +@.str = private unnamed_addr constant [5 x i8] c"%i \0A\00", align 1 + +define void @p(i32* %i) nounwind { +entry: + ret void +} + + +define void @foo() nounwind { +entry: + %y = alloca [512 x i32], align 4 + %x = alloca i32, align 8 + %zz = alloca i32, align 4 + %z = alloca i32, align 4 + %0 = load i32* @i, align 4 + %arrayidx = getelementptr inbounds [512 x i32]* %y, i32 0, i32 10 + store i32 %0, i32* %arrayidx, align 4 + %1 = load i32* @i, align 4 + store i32 %1, i32* %x, align 8 + call void @p(i32* %x) + %arrayidx1 = getelementptr inbounds [512 x i32]* %y, i32 0, i32 10 + call void @p(i32* %arrayidx1) + ret void +} +; 16: save $ra, $s0, $s1, 2040 +; 16: addiu $sp, -48 # 16 bit inst +; 16: addiu $sp, 48 # 16 bit inst +; 16: restore $ra, $s0, $s1, 2040
\ No newline at end of file diff --git a/test/CodeGen/Mips/br-jmp.ll b/test/CodeGen/Mips/br-jmp.ll index 1b5513a..9ca8d15 100644 --- a/test/CodeGen/Mips/br-jmp.ll +++ b/test/CodeGen/Mips/br-jmp.ll @@ -1,5 +1,7 @@ ; RUN: llc -march=mipsel -relocation-model=pic < %s | FileCheck %s -check-prefix=CHECK-PIC ; RUN: llc -march=mipsel -relocation-model=static < %s | FileCheck %s -check-prefix=CHECK-STATIC +; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic < %s | FileCheck %s -check-prefix=CHECK-PIC16 +; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=static < %s | FileCheck %s -check-prefix=CHECK-STATIC16 define void @count(i32 %x, i32 %y, i32 %z) noreturn nounwind readnone { entry: @@ -11,3 +13,6 @@ bosco: ; preds = %bosco, %entry ; CHECK-PIC: b $BB0_1 ; CHECK-STATIC: j $BB0_1 +; CHECK-PIC16: b $BB0_1 +; CHECK-STATIC16: b $BB0_1 + diff --git a/test/CodeGen/Mips/brdelayslot.ll b/test/CodeGen/Mips/brdelayslot.ll index 897fc97..c3f483a 100644 --- a/test/CodeGen/Mips/brdelayslot.ll +++ b/test/CodeGen/Mips/brdelayslot.ll @@ -67,3 +67,16 @@ if.end: ret void } +; Check that delay slot filler can place mov.s or mov.d in delay slot. +; +; Default: foo6: +; Default-NOT: nop + +define void @foo6(float %a0, double %a1) nounwind { +entry: + tail call void @foo7(double %a1, float %a0) nounwind + ret void +} + +declare void @foo7(double, float) + diff --git a/test/CodeGen/Mips/eh-return32.ll b/test/CodeGen/Mips/eh-return32.ll new file mode 100644 index 0000000..cf18fde --- /dev/null +++ b/test/CodeGen/Mips/eh-return32.ll @@ -0,0 +1,81 @@ +; RUN: llc -march=mipsel -mcpu=mips32 < %s | FileCheck %s + +declare void @llvm.eh.return.i32(i32, i8*) +declare void @foo(...) + +define i8* @f1(i32 %offset, i8* %handler) { +entry: + call void (...)* @foo() + call void @llvm.eh.return.i32(i32 %offset, i8* %handler) + unreachable + +; CHECK: f1 +; CHECK: addiu $sp, $sp, -[[spoffset:[0-9]+]] + +; check that $a0-$a3 are saved on stack. +; CHECK: sw $4, [[offset0:[0-9]+]]($sp) +; CHECK: sw $5, [[offset1:[0-9]+]]($sp) +; CHECK: sw $6, [[offset2:[0-9]+]]($sp) +; CHECK: sw $7, [[offset3:[0-9]+]]($sp) + +; check that .cfi_offset directives are emitted for $a0-$a3. +; CHECK: .cfi_offset 4, +; CHECK: .cfi_offset 5, +; CHECK: .cfi_offset 6, +; CHECK: .cfi_offset 7, + +; check that stack adjustment and handler are put in $v1 and $v0. +; CHECK: or $[[R0:[a-z0-9]+]], $5, $zero +; CHECK: or $[[R1:[a-z0-9]+]], $4, $zero +; CHECK: or $3, $[[R1]], $zero +; CHECK: or $2, $[[R0]], $zero + +; check that $a0-$a3 are restored from stack. +; CHECK: lw $4, [[offset0]]($sp) +; CHECK: lw $5, [[offset1]]($sp) +; CHECK: lw $6, [[offset2]]($sp) +; CHECK: lw $7, [[offset3]]($sp) + +; check that stack is adjusted by $v1 and that code returns to address in $v0 +; CHECK: addiu $sp, $sp, [[spoffset]] +; CHECK: or $ra, $2, $zero +; CHECK: jr $ra +; CHECK: addu $sp, $sp, $3 +} + +define i8* @f2(i32 %offset, i8* %handler) { +entry: + call void @llvm.eh.return.i32(i32 %offset, i8* %handler) + unreachable + +; CHECK: f2 +; CHECK: addiu $sp, $sp, -[[spoffset:[0-9]+]] + +; check that $a0-$a3 are saved on stack. +; CHECK: sw $4, [[offset0:[0-9]+]]($sp) +; CHECK: sw $5, [[offset1:[0-9]+]]($sp) +; CHECK: sw $6, [[offset2:[0-9]+]]($sp) +; CHECK: sw $7, [[offset3:[0-9]+]]($sp) + +; check that .cfi_offset directives are emitted for $a0-$a3. +; CHECK: .cfi_offset 4, +; CHECK: .cfi_offset 5, +; CHECK: .cfi_offset 6, +; CHECK: .cfi_offset 7, + +; check that stack adjustment and handler are put in $v1 and $v0. 
+; CHECK: or $3, $4, $zero +; CHECK: or $2, $5, $zero + +; check that $a0-$a3 are restored from stack. +; CHECK: lw $4, [[offset0]]($sp) +; CHECK: lw $5, [[offset1]]($sp) +; CHECK: lw $6, [[offset2]]($sp) +; CHECK: lw $7, [[offset3]]($sp) + +; check that stack is adjusted by $v1 and that code returns to address in $v0 +; CHECK: addiu $sp, $sp, [[spoffset]] +; CHECK: or $ra, $2, $zero +; CHECK: jr $ra +; CHECK: addu $sp, $sp, $3 +} diff --git a/test/CodeGen/Mips/eh-return64.ll b/test/CodeGen/Mips/eh-return64.ll new file mode 100644 index 0000000..c410e1c --- /dev/null +++ b/test/CodeGen/Mips/eh-return64.ll @@ -0,0 +1,83 @@ +; RUN: llc -march=mips64el -mcpu=mips64 < %s | FileCheck %s + +declare void @llvm.eh.return.i64(i64, i8*) +declare void @foo(...) + +define void @f1(i64 %offset, i8* %handler) { +entry: + call void (...)* @foo() + call void @llvm.eh.return.i64(i64 %offset, i8* %handler) + unreachable + +; CHECK: f1 +; CHECK: daddiu $sp, $sp, -[[spoffset:[0-9]+]] + +; check that $a0-$a3 are saved on stack. +; CHECK: sd $4, [[offset0:[0-9]+]]($sp) +; CHECK: sd $5, [[offset1:[0-9]+]]($sp) +; CHECK: sd $6, [[offset2:[0-9]+]]($sp) +; CHECK: sd $7, [[offset3:[0-9]+]]($sp) + +; check that .cfi_offset directives are emitted for $a0-$a3. +; CHECK: .cfi_offset 4, +; CHECK: .cfi_offset 5, +; CHECK: .cfi_offset 6, +; CHECK: .cfi_offset 7, + +; check that stack adjustment and handler are put in $v1 and $v0. +; CHECK: or $[[R0:[a-z0-9]+]], $5, $zero +; CHECK: or $[[R1:[a-z0-9]+]], $4, $zero +; CHECK: or $3, $[[R1]], $zero +; CHECK: or $2, $[[R0]], $zero + +; check that $a0-$a3 are restored from stack. +; CHECK: ld $4, [[offset0]]($sp) +; CHECK: ld $5, [[offset1]]($sp) +; CHECK: ld $6, [[offset2]]($sp) +; CHECK: ld $7, [[offset3]]($sp) + +; check that stack is adjusted by $v1 and that code returns to address in $v0 +; CHECK: daddiu $sp, $sp, [[spoffset]] +; CHECK: or $ra, $2, $zero +; CHECK: jr $ra +; CHECK: daddu $sp, $sp, $3 + +} + +define void @f2(i64 %offset, i8* %handler) { +entry: + call void @llvm.eh.return.i64(i64 %offset, i8* %handler) + unreachable + +; CHECK: f2 +; CHECK: daddiu $sp, $sp, -[[spoffset:[0-9]+]] + +; check that $a0-$a3 are saved on stack. +; CHECK: sd $4, [[offset0:[0-9]+]]($sp) +; CHECK: sd $5, [[offset1:[0-9]+]]($sp) +; CHECK: sd $6, [[offset2:[0-9]+]]($sp) +; CHECK: sd $7, [[offset3:[0-9]+]]($sp) + +; check that .cfi_offset directives are emitted for $a0-$a3. +; CHECK: .cfi_offset 4, +; CHECK: .cfi_offset 5, +; CHECK: .cfi_offset 6, +; CHECK: .cfi_offset 7, + +; check that stack adjustment and handler are put in $v1 and $v0. +; CHECK: or $3, $4, $zero +; CHECK: or $2, $5, $zero + +; check that $a0-$a3 are restored from stack. 
+; CHECK: ld $4, [[offset0]]($sp) +; CHECK: ld $5, [[offset1]]($sp) +; CHECK: ld $6, [[offset2]]($sp) +; CHECK: ld $7, [[offset3]]($sp) + +; check that stack is adjusted by $v1 and that code returns to address in $v0 +; CHECK: daddiu $sp, $sp, [[spoffset]] +; CHECK: or $ra, $2, $zero +; CHECK: jr $ra +; CHECK: daddu $sp, $sp, $3 + +} diff --git a/test/CodeGen/Mips/fp16static.ll b/test/CodeGen/Mips/fp16static.ll new file mode 100644 index 0000000..240ec75 --- /dev/null +++ b/test/CodeGen/Mips/fp16static.ll @@ -0,0 +1,13 @@ +; RUN: llc -march=mipsel -mcpu=mips16 -mips16-hard-float -soft-float -relocation-model=static < %s | FileCheck %s -check-prefix=CHECK-STATIC16 + +@x = common global float 0.000000e+00, align 4 + +define void @foo() nounwind { +entry: + %0 = load float* @x, align 4 + %1 = load float* @x, align 4 + %mul = fmul float %0, %1 + store float %mul, float* @x, align 4 +; CHECK-STATIC16: jal __mips16_mulsf3 + ret void +} diff --git a/test/CodeGen/Mips/frame-address.ll b/test/CodeGen/Mips/frame-address.ll index e64e6d8..9b9ee21 100644 --- a/test/CodeGen/Mips/frame-address.ll +++ b/test/CodeGen/Mips/frame-address.ll @@ -7,6 +7,6 @@ entry: %0 = call i8* @llvm.frameaddress(i32 0) ret i8* %0 -; CHECK: addu $fp, $sp, $zero -; CHECK: or $2, $fp, $zero +; CHECK: move $fp, $sp +; CHECK: or $2, $fp, $zero } diff --git a/test/CodeGen/Mips/helloworld.ll b/test/CodeGen/Mips/helloworld.ll index aee58b6..56ee607 100644 --- a/test/CodeGen/Mips/helloworld.ll +++ b/test/CodeGen/Mips/helloworld.ll @@ -1,9 +1,11 @@ ; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=C1 ; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=C2 ; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=PE +; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=static -O3 < %s | FileCheck %s -check-prefix=ST1 +; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=static -O3 < %s | FileCheck %s -check-prefix=ST2 ; -; re-enable this when mips16's jalr is fixed. -; DISABLED: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=SR +; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=SR +; RUN: llc -march=mipsel -mcpu=mips32 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=SR32 @.str = private unnamed_addr constant [13 x i8] c"hello world\0A\00", align 1 @@ -15,7 +17,15 @@ entry: ; SR: .set mips16 # @main -; SR: save $ra, [[FS:[0-9]+]] +; SR32: .set nomips16 +; SR32: .ent main +; SR-NOT: .set noreorder +; SR-NOT: .set nomacro +; SR-NOT: .set noat +; SR32: .set noreorder +; SR32: .set nomacro +; SR32: .set noat +; SR: save $ra, $s0, $s1, [[FS:[0-9]+]] ; PE: li $[[T1:[0-9]+]], %hi(_gp_disp) ; PE: addiu $[[T2:[0-9]+]], $pc, %lo(_gp_disp) ; PE: sll $[[T3:[0-9]+]], $[[T1]], 16 @@ -25,10 +35,23 @@ entry: ; C2: move $25, ${{[0-9]+}} ; C1: move $gp, ${{[0-9]+}} ; C1: jalrc ${{[0-9]+}} -; SR: restore $ra, [[FS]] +; SR: restore $ra, $s0, $s1, [[FS]] ; PE: li $2, 0 ; PE: jrc $ra +; ST1: li ${{[0-9]+}}, %hi($.str) +; ST1: sll ${{[0-9]+}}, ${{[0-9]+}}, 16 +; ST1: addiu ${{[0-9]+}}, %lo($.str) +; ST2: li ${{[0-9]+}}, %hi($.str) +; ST2: jal printf } +; SR-NOT: .set at +; SR-NOT: .set macro +; SR-NOT: .set reorder +; SR32: .set at +; SR32: .set macro +; SR32: .set reorder +; SR: .end main +; SR32: .end main declare i32 @printf(i8*, ...) 
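As a reading aid for the hf16_1.ll test that follows: it checks that mips16 hard-float code reaches float/double-signature callees through __mips16_call_stub_* helpers in PIC mode. Judging from the checks themselves, the numeric stub suffix appears to encode the argument signature in base-4 digits (1 = float, 2 = double, first argument in the least-significant digit), and the sf_/df_/sc_/dc_ name prefixes encode the return type; a (double, float) signature, for instance, yields 2 + 4*1 = 6. Below is a minimal sketch of the pattern under those assumptions, not part of the patch: the callee name @takes_df_sf is hypothetical, and the RUN line mirrors the one in hf16_1.ll.

; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -soft-float -mips16-hard-float -O3 < %s | FileCheck %s -check-prefix=1

declare void @takes_df_sf(double, float)

define void @caller(double %d, float %f) nounwind {
entry:
  ; mips16 code cannot use FP instructions itself, so the call is expected
  ; to go through a helper stub that moves the GPR-passed float/double
  ; arguments into FP registers per the hard-float ABI before reaching
  ; the real callee.
  call void @takes_df_sf(double %d, float %f)
  ret void
}
; 1: lw ${{[0-9]+}}, %got(__mips16_call_stub_6)(${{[0-9]+}})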
diff --git a/test/CodeGen/Mips/hf16_1.ll b/test/CodeGen/Mips/hf16_1.ll new file mode 100644 index 0000000..c7454ee --- /dev/null +++ b/test/CodeGen/Mips/hf16_1.ll @@ -0,0 +1,256 @@ +; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -soft-float -mips16-hard-float -O3 < %s | FileCheck %s -check-prefix=1 +; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -soft-float -mips16-hard-float -O3 < %s | FileCheck %s -check-prefix=2 + + +@x = common global float 0.000000e+00, align 4 +@xd = common global double 0.000000e+00, align 8 +@y = common global float 0.000000e+00, align 4 +@yd = common global double 0.000000e+00, align 8 +@xy = common global { float, float } zeroinitializer, align 4 +@xyd = common global { double, double } zeroinitializer, align 8 + +define void @foo() nounwind { +entry: + %0 = load float* @x, align 4 + call void @v_sf(float %0) + %1 = load double* @xd, align 8 + call void @v_df(double %1) + %2 = load float* @x, align 4 + %3 = load float* @y, align 4 + call void @v_sf_sf(float %2, float %3) + %4 = load double* @xd, align 8 + %5 = load float* @x, align 4 + call void @v_df_sf(double %4, float %5) + %6 = load double* @xd, align 8 + %7 = load double* @yd, align 8 + call void @v_df_df(double %6, double %7) + %call = call float @sf_v() + %8 = load float* @x, align 4 + %call1 = call float @sf_sf(float %8) + %9 = load double* @xd, align 8 + %call2 = call float @sf_df(double %9) + %10 = load float* @x, align 4 + %11 = load float* @y, align 4 + %call3 = call float @sf_sf_sf(float %10, float %11) + %12 = load double* @xd, align 8 + %13 = load float* @x, align 4 + %call4 = call float @sf_df_sf(double %12, float %13) + %14 = load double* @xd, align 8 + %15 = load double* @yd, align 8 + %call5 = call float @sf_df_df(double %14, double %15) + %call6 = call double @df_v() + %16 = load float* @x, align 4 + %call7 = call double @df_sf(float %16) + %17 = load double* @xd, align 8 + %call8 = call double @df_df(double %17) + %18 = load float* @x, align 4 + %19 = load float* @y, align 4 + %call9 = call double @df_sf_sf(float %18, float %19) + %20 = load double* @xd, align 8 + %21 = load float* @x, align 4 + %call10 = call double @df_df_sf(double %20, float %21) + %22 = load double* @xd, align 8 + %23 = load double* @yd, align 8 + %call11 = call double @df_df_df(double %22, double %23) + %call12 = call { float, float } @sc_v() + %24 = extractvalue { float, float } %call12, 0 + %25 = extractvalue { float, float } %call12, 1 + %26 = load float* @x, align 4 + %call13 = call { float, float } @sc_sf(float %26) + %27 = extractvalue { float, float } %call13, 0 + %28 = extractvalue { float, float } %call13, 1 + %29 = load double* @xd, align 8 + %call14 = call { float, float } @sc_df(double %29) + %30 = extractvalue { float, float } %call14, 0 + %31 = extractvalue { float, float } %call14, 1 + %32 = load float* @x, align 4 + %33 = load float* @y, align 4 + %call15 = call { float, float } @sc_sf_sf(float %32, float %33) + %34 = extractvalue { float, float } %call15, 0 + %35 = extractvalue { float, float } %call15, 1 + %36 = load double* @xd, align 8 + %37 = load float* @x, align 4 + %call16 = call { float, float } @sc_df_sf(double %36, float %37) + %38 = extractvalue { float, float } %call16, 0 + %39 = extractvalue { float, float } %call16, 1 + %40 = load double* @xd, align 8 + %41 = load double* @yd, align 8 + %call17 = call { float, float } @sc_df_df(double %40, double %41) + %42 = extractvalue { float, float } %call17, 0 + %43 = extractvalue { float, float } %call17, 1 + %call18 = call 
{ double, double } @dc_v() + %44 = extractvalue { double, double } %call18, 0 + %45 = extractvalue { double, double } %call18, 1 + %46 = load float* @x, align 4 + %call19 = call { double, double } @dc_sf(float %46) + %47 = extractvalue { double, double } %call19, 0 + %48 = extractvalue { double, double } %call19, 1 + %49 = load double* @xd, align 8 + %call20 = call { double, double } @dc_df(double %49) + %50 = extractvalue { double, double } %call20, 0 + %51 = extractvalue { double, double } %call20, 1 + %52 = load float* @x, align 4 + %53 = load float* @y, align 4 + %call21 = call { double, double } @dc_sf_sf(float %52, float %53) + %54 = extractvalue { double, double } %call21, 0 + %55 = extractvalue { double, double } %call21, 1 + %56 = load double* @xd, align 8 + %57 = load float* @x, align 4 + %call22 = call { double, double } @dc_df_sf(double %56, float %57) + %58 = extractvalue { double, double } %call22, 0 + %59 = extractvalue { double, double } %call22, 1 + %60 = load double* @xd, align 8 + %61 = load double* @yd, align 8 + %call23 = call { double, double } @dc_df_df(double %60, double %61) + %62 = extractvalue { double, double } %call23, 0 + %63 = extractvalue { double, double } %call23, 1 + ret void +} + +declare void @v_sf(float) + +declare void @v_df(double) + +declare void @v_sf_sf(float, float) + +declare void @v_df_sf(double, float) + +declare void @v_df_df(double, double) + +declare float @sf_v() + +declare float @sf_sf(float) + +declare float @sf_df(double) + +declare float @sf_sf_sf(float, float) + +declare float @sf_df_sf(double, float) + +declare float @sf_df_df(double, double) + +declare double @df_v() + +declare double @df_sf(float) + +declare double @df_df(double) + +declare double @df_sf_sf(float, float) + +declare double @df_df_sf(double, float) + +declare double @df_df_df(double, double) + +declare { float, float } @sc_v() + +declare { float, float } @sc_sf(float) + +declare { float, float } @sc_df(double) + +declare { float, float } @sc_sf_sf(float, float) + +declare { float, float } @sc_df_sf(double, float) + +declare { float, float } @sc_df_df(double, double) + +declare { double, double } @dc_v() + +declare { double, double } @dc_sf(float) + +declare { double, double } @dc_df(double) + +declare { double, double } @dc_sf_sf(float, float) + +declare { double, double } @dc_df_sf(double, float) + +declare { double, double } @dc_df_df(double, double) + +; 1: lw ${{[0-9]+}}, %got(__mips16_call_stub_1)(${{[0-9]+}}) +; 2: lw ${{[0-9]+}}, %call16(v_sf)(${{[0-9]+}}) + +; 1: lw ${{[0-9]+}}, %got(__mips16_call_stub_2)(${{[0-9]+}}) +; 2: lw ${{[0-9]+}}, %call16(v_df)(${{[0-9]+}}) + +; 1: lw ${{[0-9]+}}, %got(__mips16_call_stub_5)(${{[0-9]+}}) +; 2: lw ${{[0-9]+}}, %call16(v_sf_sf)(${{[0-9]+}}) + +; 1: lw ${{[0-9]+}}, %got(__mips16_call_stub_6)(${{[0-9]+}}) +; 2: lw ${{[0-9]+}}, %call16(v_df_sf)(${{[0-9]+}}) + +; 1: lw ${{[0-9]+}}, %got(__mips16_call_stub_10)(${{[0-9]+}}) +; 2: lw ${{[0-9]+}}, %call16(v_df_df)(${{[0-9]+}}) + +; 1: lw ${{[0-9]+}}, %got(__mips16_call_stub_sf_0)(${{[0-9]+}}) +; 2: lw ${{[0-9]+}}, %call16(sf_v)(${{[0-9]+}}) + +; 1: lw ${{[0-9]+}}, %got(__mips16_call_stub_sf_1)(${{[0-9]+}}) +; 2: lw ${{[0-9]+}}, %call16(sf_sf)(${{[0-9]+}}) + +; 1: lw ${{[0-9]+}}, %got(__mips16_call_stub_sf_2)(${{[0-9]+}}) +; 2: lw ${{[0-9]+}}, %call16(sf_df)(${{[0-9]+}}) + +; 1: lw ${{[0-9]+}}, %got(__mips16_call_stub_sf_5)(${{[0-9]+}}) +; 2: lw ${{[0-9]+}}, %call16(sf_sf_sf)(${{[0-9]+}}) + +; 1: lw ${{[0-9]+}}, %got(__mips16_call_stub_sf_6)(${{[0-9]+}}) +; 2: lw ${{[0-9]+}}, 
%call16(sf_df_sf)(${{[0-9]+}}) + +; 1: lw ${{[0-9]+}}, %got(__mips16_call_stub_sf_10)(${{[0-9]+}}) +; 2: lw ${{[0-9]+}}, %call16(sf_df_df)(${{[0-9]+}}) + +; 1: lw ${{[0-9]+}}, %got(__mips16_call_stub_df_0)(${{[0-9]+}}) +; 2: lw ${{[0-9]+}}, %call16(df_v)(${{[0-9]+}}) + +; 1: lw ${{[0-9]+}}, %got(__mips16_call_stub_df_1)(${{[0-9]+}}) +; 2: lw ${{[0-9]+}}, %call16(df_sf)(${{[0-9]+}}) + +; 1: lw ${{[0-9]+}}, %got(__mips16_call_stub_df_2)(${{[0-9]+}}) +; 2: lw ${{[0-9]+}}, %call16(df_df)(${{[0-9]+}}) + +; 1: lw ${{[0-9]+}}, %got(__mips16_call_stub_df_5)(${{[0-9]+}}) +; 2: lw ${{[0-9]+}}, %call16(df_sf_sf)(${{[0-9]+}}) + +; 1: lw ${{[0-9]+}}, %got(__mips16_call_stub_df_6)(${{[0-9]+}}) +; 2: lw ${{[0-9]+}}, %call16(df_df_sf)(${{[0-9]+}}) + +; 1: lw ${{[0-9]+}}, %got(__mips16_call_stub_df_10)(${{[0-9]+}}) +; 2: lw ${{[0-9]+}}, %call16(df_df_df)(${{[0-9]+}}) + +; 1: lw ${{[0-9]+}}, %got(__mips16_call_stub_sc_0)(${{[0-9]+}}) +; 2: lw ${{[0-9]+}}, %call16(sc_v)(${{[0-9]+}}) + +; 1: lw ${{[0-9]+}}, %got(__mips16_call_stub_sc_1)(${{[0-9]+}}) +; 2: lw ${{[0-9]+}}, %call16(sc_sf)(${{[0-9]+}}) + +; 1: lw ${{[0-9]+}}, %got(__mips16_call_stub_sc_2)(${{[0-9]+}}) +; 2: lw ${{[0-9]+}}, %call16(sc_df)(${{[0-9]+}}) + +; 1: lw ${{[0-9]+}}, %got(__mips16_call_stub_sc_5)(${{[0-9]+}}) +; 2: lw ${{[0-9]+}}, %call16(sc_sf_sf)(${{[0-9]+}}) + +; 1: lw ${{[0-9]+}}, %got(__mips16_call_stub_sc_6)(${{[0-9]+}}) +; 2: lw ${{[0-9]+}}, %call16(sc_df_sf)(${{[0-9]+}}) + +; 1: lw ${{[0-9]+}}, %got(__mips16_call_stub_sc_10)(${{[0-9]+}}) +; 2: lw ${{[0-9]+}}, %call16(sc_df_df)(${{[0-9]+}}) + +; 1: lw ${{[0-9]+}}, %got(__mips16_call_stub_dc_0)(${{[0-9]+}}) +; 2: lw ${{[0-9]+}}, %call16(dc_v)(${{[0-9]+}}) + +; 1: lw ${{[0-9]+}}, %got(__mips16_call_stub_dc_1)(${{[0-9]+}}) +; 2: lw ${{[0-9]+}}, %call16(dc_sf)(${{[0-9]+}}) + +; 1: lw ${{[0-9]+}}, %got(__mips16_call_stub_dc_2)(${{[0-9]+}}) +; 2: lw ${{[0-9]+}}, %call16(dc_df)(${{[0-9]+}}) + +; 1: lw ${{[0-9]+}}, %got(__mips16_call_stub_dc_5)(${{[0-9]+}}) +; 2: lw ${{[0-9]+}}, %call16(dc_sf_sf)(${{[0-9]+}}) + +; 1: lw ${{[0-9]+}}, %got(__mips16_call_stub_dc_6)(${{[0-9]+}}) +; 2: lw ${{[0-9]+}}, %call16(dc_df_sf)(${{[0-9]+}}) + +; 1: lw ${{[0-9]+}}, %got(__mips16_call_stub_dc_10)(${{[0-9]+}}) +; 2: lw ${{[0-9]+}}, %call16(dc_df_df)(${{[0-9]+}}) + + + diff --git a/test/CodeGen/Mips/i64arg.ll b/test/CodeGen/Mips/i64arg.ll index e16e126..2012524 100644 --- a/test/CodeGen/Mips/i64arg.ll +++ b/test/CodeGen/Mips/i64arg.ll @@ -17,9 +17,9 @@ entry: ; CHECK: jalr $25 tail call void @ff2(i64 %ll, double 3.000000e+00) nounwind %sub = add nsw i32 %i, -1 +; CHECK: lw $25, %call16(ff3) ; CHECK: sw $[[R1]], 28($sp) ; CHECK: sw $[[R0]], 24($sp) -; CHECK: lw $25, %call16(ff3) ; CHECK: or $6, $[[R2]], $zero ; CHECK: or $7, $[[R3]], $zero ; CHECK: jalr $25 diff --git a/test/CodeGen/Mips/jtstat.ll b/test/CodeGen/Mips/jtstat.ll new file mode 100644 index 0000000..6c1eb8d --- /dev/null +++ b/test/CodeGen/Mips/jtstat.ll @@ -0,0 +1,71 @@ +; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=static < %s | FileCheck %s -check-prefix=CHECK-STATIC16 + +@s = global i8 115, align 1 +@c = common global i8 0, align 1 +@.str = private unnamed_addr constant [5 x i8] c"%c \0A\00", align 1 + +define void @test(i32 %i) nounwind { +entry: + %i.addr = alloca i32, align 4 + store i32 %i, i32* %i.addr, align 4 + %0 = load i32* %i.addr, align 4 + switch i32 %0, label %sw.epilog [ + i32 115, label %sw.bb + i32 105, label %sw.bb1 + i32 100, label %sw.bb2 + i32 108, label %sw.bb3 + i32 99, label %sw.bb4 + i32 68, label %sw.bb5 + 
i32 81, label %sw.bb6 + i32 76, label %sw.bb7 + ] + +sw.bb: ; preds = %entry + store i8 115, i8* @c, align 1 + br label %sw.epilog + +sw.bb1: ; preds = %entry + store i8 105, i8* @c, align 1 + br label %sw.epilog + +sw.bb2: ; preds = %entry + store i8 100, i8* @c, align 1 + br label %sw.epilog + +sw.bb3: ; preds = %entry + store i8 108, i8* @c, align 1 + br label %sw.epilog + +sw.bb4: ; preds = %entry + store i8 99, i8* @c, align 1 + br label %sw.epilog + +sw.bb5: ; preds = %entry + store i8 68, i8* @c, align 1 + br label %sw.epilog + +sw.bb6: ; preds = %entry + store i8 81, i8* @c, align 1 + br label %sw.epilog + +sw.bb7: ; preds = %entry + store i8 76, i8* @c, align 1 + br label %sw.epilog + +sw.epilog: ; preds = %entry, %sw.bb7, %sw.bb6, %sw.bb5, %sw.bb4, %sw.bb3, %sw.bb2, %sw.bb1, %sw.bb + ret void +} + +; CHECK-STATIC16: li ${{[0-9]+}}, %hi($JTI{{[0-9]+}}_{{[0-9]+}}) +; CHECK-STATIC16: lw ${{[0-9]+}}, %lo($JTI{{[0-9]+}}_{{[0-9]+}})(${{[0-9]+}}) +; CHECK-STATIC16: $JTI{{[0-9]+}}_{{[0-9]+}}: +; CHECK-STATIC16: .4byte ($BB0_{{[0-9]+}}) +; CHECK-STATIC16: .4byte ($BB0_{{[0-9]+}}) +; CHECK-STATIC16: .4byte ($BB0_{{[0-9]+}}) +; CHECK-STATIC16: .4byte ($BB0_{{[0-9]+}}) +; CHECK-STATIC16: .4byte ($BB0_{{[0-9]+}}) +; CHECK-STATIC16: .4byte ($BB0_{{[0-9]+}}) +; CHECK-STATIC16: .4byte ($BB0_{{[0-9]+}}) +; CHECK-STATIC16: .4byte ($BB0_{{[0-9]+}}) +; CHECK-STATIC16: .4byte ($BB0_{{[0-9]+}}) +; CHECK-STATIC16: .4byte ($BB0_{{[0-9]+}}) diff --git a/test/CodeGen/Mips/largefr1.ll b/test/CodeGen/Mips/largefr1.ll new file mode 100644 index 0000000..0fe89f7 --- /dev/null +++ b/test/CodeGen/Mips/largefr1.ll @@ -0,0 +1,61 @@ +; RUN: llc -march=mipsel -mcpu=mips16 -mips16-hard-float -soft-float -relocation-model=static < %s | FileCheck %s -check-prefix=1 + +@i = common global i32 0, align 4 +@j = common global i32 0, align 4 +@.str = private unnamed_addr constant [8 x i8] c"%i %i \0A\00", align 1 + +define void @foo(i32* %p, i32 %i, i32 %j) nounwind { +entry: + %p.addr = alloca i32*, align 4 + %i.addr = alloca i32, align 4 + %j.addr = alloca i32, align 4 + store i32* %p, i32** %p.addr, align 4 + store i32 %i, i32* %i.addr, align 4 + store i32 %j, i32* %j.addr, align 4 + %0 = load i32* %j.addr, align 4 + %1 = load i32** %p.addr, align 4 + %2 = load i32* %i.addr, align 4 + %add.ptr = getelementptr inbounds i32* %1, i32 %2 + store i32 %0, i32* %add.ptr, align 4 + ret void +} + +define i32 @main() nounwind { +entry: +; 1: main: +; 1: 1: .word -797992 +; 1: li ${{[0-9]+}}, 12 +; 1: sll ${{[0-9]+}}, ${{[0-9]+}}, 16 +; 1: addu ${{[0-9]+}}, ${{[0-9]+}}, ${{[0-9]+}} +; 2: move $sp, ${{[0-9]+}} +; 2: addu ${{[0-9]+}}, ${{[0-9]+}}, ${{[0-9]+}} +; 1: li ${{[0-9]+}}, 6 +; 1: sll ${{[0-9]+}}, ${{[0-9]+}}, 16 +; 1: addu ${{[0-9]+}}, ${{[0-9]+}}, ${{[0-9]+}} +; 2: move $sp, ${{[0-9]+}} +; 2: addu ${{[0-9]+}}, ${{[0-9]+}}, ${{[0-9]+}} +; 1: addiu ${{[0-9]+}}, ${{[0-9]+}}, 6800 +; 1: li ${{[0-9]+}}, 1 +; 1: sll ${{[0-9]+}}, ${{[0-9]+}}, 16 +; 2: li ${{[0-9]+}}, 34463 + %retval = alloca i32, align 4 + %one = alloca [100000 x i32], align 4 + %two = alloca [100000 x i32], align 4 + store i32 0, i32* %retval + %arrayidx = getelementptr inbounds [100000 x i32]* %one, i32 0, i32 0 + call void @foo(i32* %arrayidx, i32 50, i32 9999) + %arrayidx1 = getelementptr inbounds [100000 x i32]* %two, i32 0, i32 0 + call void @foo(i32* %arrayidx1, i32 99999, i32 5555) + %arrayidx2 = getelementptr inbounds [100000 x i32]* %one, i32 0, i32 50 + %0 = load i32* %arrayidx2, align 4 + store i32 %0, i32* @i, align 4 + %arrayidx3 = getelementptr
inbounds [100000 x i32]* %two, i32 0, i32 99999 + %1 = load i32* %arrayidx3, align 4 + store i32 %1, i32* @j, align 4 + %2 = load i32* @i, align 4 + %3 = load i32* @j, align 4 + %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([8 x i8]* @.str, i32 0, i32 0), i32 %2, i32 %3) + ret i32 0 +} + +declare i32 @printf(i8*, ...) diff --git a/test/CodeGen/Mips/mips64-libcall.ll b/test/CodeGen/Mips/mips64-libcall.ll new file mode 100644 index 0000000..c53ccfd --- /dev/null +++ b/test/CodeGen/Mips/mips64-libcall.ll @@ -0,0 +1,29 @@ +; RUN: llc -march=mips64el -mcpu=mips64r2 -O3 < %s |\ +; RUN: FileCheck %s -check-prefix=HARD +; RUN: llc -march=mips64el -mcpu=mips64r2 -soft-float < %s |\ +; RUN: FileCheck %s -check-prefix=SOFT + +; Check that %add is not passed in an integer register. +; +; HARD: callfloor: +; HARD-NOT: dmfc1 $4 + +define double @callfloor(double %d) nounwind readnone { +entry: + %add = fadd double %d, 1.000000e+00 + %call = tail call double @floor(double %add) nounwind readnone + ret double %call +} + +declare double @floor(double) nounwind readnone + +; Check call16. +; +; SOFT: f64add: +; SOFT: ld $25, %call16(__adddf3) + +define double @f64add(double %a, double %b) { +entry: + %add = fadd double %a, %b + ret double %add +} diff --git a/test/CodeGen/Mips/o32_cc_byval.ll b/test/CodeGen/Mips/o32_cc_byval.ll index 5558ba6..0a8f85f 100644 --- a/test/CodeGen/Mips/o32_cc_byval.ll +++ b/test/CodeGen/Mips/o32_cc_byval.ll @@ -12,20 +12,20 @@ define void @f1() nounwind { entry: ; CHECK: lw $[[R1:[0-9]+]], %got(f1.s1) ; CHECK: addiu $[[R0:[0-9]+]], $[[R1]], %lo(f1.s1) +; CHECK: lw $[[R7:[0-9]+]], 12($[[R0]]) +; CHECK: lw $[[R3:[0-9]+]], 16($[[R0]]) +; CHECK: lw $[[R4:[0-9]+]], 20($[[R0]]) +; CHECK: lw $[[R5:[0-9]+]], 24($[[R0]]) ; CHECK: lw $[[R6:[0-9]+]], 28($[[R0]]) ; CHECK: sw $[[R6]], 36($sp) -; CHECK: lw $[[R5:[0-9]+]], 24($[[R0]]) ; CHECK: sw $[[R5]], 32($sp) -; CHECK: lw $[[R4:[0-9]+]], 20($[[R0]]) ; CHECK: sw $[[R4]], 28($sp) -; CHECK: lw $[[R3:[0-9]+]], 16($[[R0]]) ; CHECK: sw $[[R3]], 24($sp) -; CHECK: lw $[[R7:[0-9]+]], 12($[[R0]]) ; CHECK: sw $[[R7]], 20($sp) ; CHECK: lw $[[R2:[0-9]+]], 8($[[R0]]) ; CHECK: sw $[[R2]], 16($sp) -; CHECK: lw $7, 4($[[R0]]) ; CHECK: lw $6, %lo(f1.s1)($[[R1]]) +; CHECK: lw $7, 4($[[R0]]) %agg.tmp10 = alloca %struct.S3, align 4 call void @callee1(float 2.000000e+01, %struct.S1* byval bitcast (%0* @f1.s1 to %struct.S1*)) nounwind call void @callee2(%struct.S2* byval @f1.s2) nounwind diff --git a/test/CodeGen/Mips/selTBteqzCmpi.ll b/test/CodeGen/Mips/selTBteqzCmpi.ll new file mode 100644 index 0000000..9cb8227 --- /dev/null +++ b/test/CodeGen/Mips/selTBteqzCmpi.ll @@ -0,0 +1,26 @@ +; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic < %s | FileCheck %s -check-prefix=16 + +@i = global i32 1, align 4 +@j = global i32 2, align 4 +@a = global i32 5, align 4 +@.str = private unnamed_addr constant [8 x i8] c"%i = 2\0A\00", align 1 +@k = common global i32 0, align 4 + +define void @t() nounwind "target-cpu"="mips16" "target-features"="+mips16,+o32" { +entry: + %0 = load i32* @a, align 4 + %cmp = icmp eq i32 %0, 10 + %1 = load i32* @i, align 4 + %2 = load i32* @j, align 4 + %cond = select i1 %cmp, i32 %1, i32 %2 + store i32 %cond, i32* @i, align 4 + ret void +} + +attributes #0 = { nounwind "target-cpu"="mips16" "target-features"="+mips16,+o32" } + + +; 16: cmpi ${{[0-9]+}}, 10 +; 16: bteqz $BB{{[0-9]+}}_{{[0-9]}} + + diff --git a/test/CodeGen/Mips/selTBtnezCmpi.ll b/test/CodeGen/Mips/selTBtnezCmpi.ll new file mode 100644 index
0000000..bd334f5 --- /dev/null +++ b/test/CodeGen/Mips/selTBtnezCmpi.ll @@ -0,0 +1,26 @@ +; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic < %s | FileCheck %s -check-prefix=16 + +@i = global i32 1, align 4 +@j = global i32 2, align 4 +@a = global i32 5, align 4 +@.str = private unnamed_addr constant [8 x i8] c"%i = 1\0A\00", align 1 +@k = common global i32 0, align 4 + +define void @t() nounwind "target-cpu"="mips16" "target-features"="+mips16,+o32" { +entry: + %0 = load i32* @a, align 4 + %cmp = icmp ne i32 %0, 10 + %1 = load i32* @i, align 4 + %2 = load i32* @j, align 4 + %cond = select i1 %cmp, i32 %1, i32 %2 + store i32 %cond, i32* @i, align 4 + ret void +} + +; 16: cmpi ${{[0-9]+}}, 10 +; 16: btnez $BB{{[0-9]+}}_{{[0-9]}} + + +attributes #0 = { nounwind "target-cpu"="mips16" "target-features"="+mips16,+o32" } + + diff --git a/test/CodeGen/Mips/selTBtnezSlti.ll b/test/CodeGen/Mips/selTBtnezSlti.ll new file mode 100644 index 0000000..593f6f2 --- /dev/null +++ b/test/CodeGen/Mips/selTBtnezSlti.ll @@ -0,0 +1,25 @@ +; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic < %s | FileCheck %s -check-prefix=16 + +@i = global i32 1, align 4 +@j = global i32 2, align 4 +@a = global i32 5, align 4 +@.str = private unnamed_addr constant [9 x i8] c"%i = 2 \0A\00", align 1 +@k = common global i32 0, align 4 + +define void @t() nounwind "target-cpu"="mips16" "target-features"="+mips16,+o32" { +entry: + %0 = load i32* @a, align 4 + %cmp = icmp slt i32 %0, 10 + %1 = load i32* @j, align 4 + %2 = load i32* @i, align 4 + %cond = select i1 %cmp, i32 %1, i32 %2 + store i32 %cond, i32* @i, align 4 + ret void +} + +attributes #0 = { nounwind "target-cpu"="mips16" "target-features"="+mips16,+o32" } + +; 16: slti ${{[0-9]+}}, 10 +; 16: btnez $BB{{[0-9]+}}_{{[0-9]}} + + diff --git a/test/CodeGen/Mips/seleq.ll b/test/CodeGen/Mips/seleq.ll new file mode 100644 index 0000000..190baad --- /dev/null +++ b/test/CodeGen/Mips/seleq.ll @@ -0,0 +1,95 @@ +; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic < %s | FileCheck %s -check-prefix=16 + +@t = global i32 10, align 4 +@f = global i32 199, align 4 +@a = global i32 1, align 4 +@b = global i32 10, align 4 +@c = global i32 1, align 4 +@z1 = common global i32 0, align 4 +@z2 = common global i32 0, align 4 +@z3 = common global i32 0, align 4 +@z4 = common global i32 0, align 4 + +define void @calc_seleq() nounwind "target-cpu"="mips32" "target-features"="+o32,+mips32" { +entry: + %0 = load i32* @a, align 4 + %1 = load i32* @b, align 4 + %cmp = icmp eq i32 %0, %1 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %entry + %2 = load i32* @f, align 4 + br label %cond.end + +cond.false: ; preds = %entry + %3 = load i32* @t, align 4 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ %2, %cond.true ], [ %3, %cond.false ] + store i32 %cond, i32* @z1, align 4 + %4 = load i32* @b, align 4 + %5 = load i32* @a, align 4 + %cmp1 = icmp eq i32 %4, %5 + br i1 %cmp1, label %cond.true2, label %cond.false3 + +cond.true2: ; preds = %cond.end + %6 = load i32* @f, align 4 + br label %cond.end4 + +cond.false3: ; preds = %cond.end + %7 = load i32* @t, align 4 + br label %cond.end4 + +cond.end4: ; preds = %cond.false3, %cond.true2 + %cond5 = phi i32 [ %6, %cond.true2 ], [ %7, %cond.false3 ] + store i32 %cond5, i32* @z2, align 4 + %8 = load i32* @c, align 4 + %9 = load i32* @a, align 4 + %cmp6 = icmp eq i32 %8, %9 + br i1 %cmp6, label %cond.true7, label %cond.false8 + +cond.true7: ; preds = %cond.end4 + %10 = load 
i32* @t, align 4 + br label %cond.end9 + +cond.false8: ; preds = %cond.end4 + %11 = load i32* @f, align 4 + br label %cond.end9 + +cond.end9: ; preds = %cond.false8, %cond.true7 + %cond10 = phi i32 [ %10, %cond.true7 ], [ %11, %cond.false8 ] + store i32 %cond10, i32* @z3, align 4 + %12 = load i32* @a, align 4 + %13 = load i32* @c, align 4 + %cmp11 = icmp eq i32 %12, %13 + br i1 %cmp11, label %cond.true12, label %cond.false13 + +cond.true12: ; preds = %cond.end9 + %14 = load i32* @t, align 4 + br label %cond.end14 + +cond.false13: ; preds = %cond.end9 + %15 = load i32* @f, align 4 + br label %cond.end14 + +cond.end14: ; preds = %cond.false13, %cond.true12 + %cond15 = phi i32 [ %14, %cond.true12 ], [ %15, %cond.false13 ] + store i32 %cond15, i32* @z4, align 4 + ret void +} + +attributes #0 = { nounwind "target-cpu"="mips32" "target-features"="+o32,+mips32" } + +; 16: cmp ${{[0-9]+}}, ${{[0-9]+}} +; 16: btnez $BB{{[0-9]+}}_{{[0-9]}} + +; 16: cmp ${{[0-9]+}}, ${{[0-9]+}} +; 16: btnez $BB{{[0-9]+}}_{{[0-9]}} + +; 16: cmp ${{[0-9]+}}, ${{[0-9]+}} +; 16: btnez $BB{{[0-9]+}}_{{[0-9]}} + +; 16: cmp ${{[0-9]+}}, ${{[0-9]+}} +; 16: btnez $BB{{[0-9]+}}_{{[0-9]}} + diff --git a/test/CodeGen/Mips/seleqk.ll b/test/CodeGen/Mips/seleqk.ll new file mode 100644 index 0000000..3ca622d --- /dev/null +++ b/test/CodeGen/Mips/seleqk.ll @@ -0,0 +1,91 @@ +; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic < %s | FileCheck %s -check-prefix=16 + +@t = global i32 10, align 4 +@f = global i32 199, align 4 +@a = global i32 1, align 4 +@b = global i32 1000, align 4 +@z1 = common global i32 0, align 4 +@z2 = common global i32 0, align 4 +@z3 = common global i32 0, align 4 +@z4 = common global i32 0, align 4 +@.str = private unnamed_addr constant [5 x i8] c"%i \0A\00", align 1 + +define void @calc_seleqk() nounwind "target-cpu"="mips16" "target-features"="+mips16,+o32" { +entry: + %0 = load i32* @a, align 4 + %cmp = icmp eq i32 %0, 1 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %entry + %1 = load i32* @t, align 4 + br label %cond.end + +cond.false: ; preds = %entry + %2 = load i32* @f, align 4 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ %1, %cond.true ], [ %2, %cond.false ] + store i32 %cond, i32* @z1, align 4 + %3 = load i32* @a, align 4 + %cmp1 = icmp eq i32 %3, 1000 + br i1 %cmp1, label %cond.true2, label %cond.false3 + +cond.true2: ; preds = %cond.end + %4 = load i32* @f, align 4 + br label %cond.end4 + +cond.false3: ; preds = %cond.end + %5 = load i32* @t, align 4 + br label %cond.end4 + +cond.end4: ; preds = %cond.false3, %cond.true2 + %cond5 = phi i32 [ %4, %cond.true2 ], [ %5, %cond.false3 ] + store i32 %cond5, i32* @z2, align 4 + %6 = load i32* @b, align 4 + %cmp6 = icmp eq i32 %6, 3 + br i1 %cmp6, label %cond.true7, label %cond.false8 + +cond.true7: ; preds = %cond.end4 + %7 = load i32* @f, align 4 + br label %cond.end9 + +cond.false8: ; preds = %cond.end4 + %8 = load i32* @t, align 4 + br label %cond.end9 + +cond.end9: ; preds = %cond.false8, %cond.true7 + %cond10 = phi i32 [ %7, %cond.true7 ], [ %8, %cond.false8 ] + store i32 %cond10, i32* @z3, align 4 + %9 = load i32* @b, align 4 + %cmp11 = icmp eq i32 %9, 1000 + br i1 %cmp11, label %cond.true12, label %cond.false13 + +cond.true12: ; preds = %cond.end9 + %10 = load i32* @t, align 4 + br label %cond.end14 + +cond.false13: ; preds = %cond.end9 + %11 = load i32* @f, align 4 + br label %cond.end14 + +cond.end14: ; preds = %cond.false13, %cond.true12 + %cond15 = phi i32 [ %10, 
%cond.true12 ], [ %11, %cond.false13 ] + store i32 %cond15, i32* @z4, align 4 + ret void +} + +attributes #0 = { nounwind "target-cpu"="mips16" "target-features"="+mips16,+o32" } +attributes #1 = { "target-cpu"="mips16" "target-features"="+mips16,+o32" } + +; 16: cmpi ${{[0-9]+}}, 1 # 16 bit inst +; 16: btnez $BB{{[0-9]+}}_{{[0-9]}} + +; 16: cmpi ${{[0-9]+}}, 1000 +; 16: btnez $BB{{[0-9]+}}_{{[0-9]}} + +; 16: cmpi ${{[0-9]+}}, 3 # 16 bit inst +; 16: btnez $BB{{[0-9]+}}_{{[0-9]}} + +; 16: cmpi ${{[0-9]+}}, 1000 +; 16: btnez $BB{{[0-9]+}}_{{[0-9]}} diff --git a/test/CodeGen/Mips/selgek.ll b/test/CodeGen/Mips/selgek.ll new file mode 100644 index 0000000..8ab4046 --- /dev/null +++ b/test/CodeGen/Mips/selgek.ll @@ -0,0 +1,94 @@ +; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic < %s | FileCheck %s -check-prefix=16 + +@t = global i32 10, align 4 +@f = global i32 199, align 4 +@a = global i32 2, align 4 +@b = global i32 1000, align 4 +@c = global i32 2, align 4 +@z1 = common global i32 0, align 4 +@z2 = common global i32 0, align 4 +@z3 = common global i32 0, align 4 +@z4 = common global i32 0, align 4 +@.str = private unnamed_addr constant [5 x i8] c"%i \0A\00", align 1 + +define void @calc_z() nounwind "target-cpu"="mips16" "target-features"="+mips16,+o32" { +entry: + %0 = load i32* @a, align 4 + %cmp = icmp sge i32 %0, 1000 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %entry + %1 = load i32* @f, align 4 + br label %cond.end + +cond.false: ; preds = %entry + %2 = load i32* @t, align 4 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ %1, %cond.true ], [ %2, %cond.false ] + store i32 %cond, i32* @z1, align 4 + %3 = load i32* @b, align 4 + %cmp1 = icmp sge i32 %3, 1 + br i1 %cmp1, label %cond.true2, label %cond.false3 + +cond.true2: ; preds = %cond.end + %4 = load i32* @t, align 4 + br label %cond.end4 + +cond.false3: ; preds = %cond.end + %5 = load i32* @f, align 4 + br label %cond.end4 + +cond.end4: ; preds = %cond.false3, %cond.true2 + %cond5 = phi i32 [ %4, %cond.true2 ], [ %5, %cond.false3 ] + store i32 %cond5, i32* @z2, align 4 + %6 = load i32* @c, align 4 + %cmp6 = icmp sge i32 %6, 2 + br i1 %cmp6, label %cond.true7, label %cond.false8 + +cond.true7: ; preds = %cond.end4 + %7 = load i32* @t, align 4 + br label %cond.end9 + +cond.false8: ; preds = %cond.end4 + %8 = load i32* @f, align 4 + br label %cond.end9 + +cond.end9: ; preds = %cond.false8, %cond.true7 + %cond10 = phi i32 [ %7, %cond.true7 ], [ %8, %cond.false8 ] + store i32 %cond10, i32* @z3, align 4 + %9 = load i32* @a, align 4 + %cmp11 = icmp sge i32 %9, 2 + br i1 %cmp11, label %cond.true12, label %cond.false13 + +cond.true12: ; preds = %cond.end9 + %10 = load i32* @t, align 4 + br label %cond.end14 + +cond.false13: ; preds = %cond.end9 + %11 = load i32* @f, align 4 + br label %cond.end14 + +cond.end14: ; preds = %cond.false13, %cond.true12 + %cond15 = phi i32 [ %10, %cond.true12 ], [ %11, %cond.false13 ] + store i32 %cond15, i32* @z4, align 4 + ret void +} + +attributes #0 = { nounwind "target-cpu"="mips16" "target-features"="+mips16,+o32" } +attributes #1 = { "target-cpu"="mips16" "target-features"="+mips16,+o32" } + +; 16: slti ${{[0-9]+}}, 1000 +; 16: btnez $BB{{[0-9]+}}_{{[0-9]}} + +; 16: slti ${{[0-9]+}}, 1 # 16 bit inst +; 16: btnez $BB{{[0-9]+}}_{{[0-9]}} + +; 16: slti ${{[0-9]+}}, 2 # 16 bit inst +; 16: btnez $BB{{[0-9]+}}_{{[0-9]}} + +; 16: slti ${{[0-9]+}}, 2 # 16 bit inst +; 16: btnez $BB{{[0-9]+}}_{{[0-9]}} + + diff --git 
a/test/CodeGen/Mips/selgt.ll b/test/CodeGen/Mips/selgt.ll new file mode 100644 index 0000000..67b9b49 --- /dev/null +++ b/test/CodeGen/Mips/selgt.ll @@ -0,0 +1,98 @@ +; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic < %s | FileCheck %s -check-prefix=16 + +@t = global i32 10, align 4 +@f = global i32 199, align 4 +@a = global i32 1, align 4 +@b = global i32 10, align 4 +@c = global i32 1, align 4 +@z1 = common global i32 0, align 4 +@z2 = common global i32 0, align 4 +@z3 = common global i32 0, align 4 +@z4 = common global i32 0, align 4 +@.str = private unnamed_addr constant [9 x i8] c"%i = %i\0A\00", align 1 + +define i32 @calc_z() nounwind "target-cpu"="mips16" "target-features"="+mips16,+o32" { +entry: + %retval = alloca i32, align 4 + %0 = load i32* @a, align 4 + %1 = load i32* @b, align 4 + %cmp = icmp sgt i32 %0, %1 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %entry + %2 = load i32* @f, align 4 + br label %cond.end + +cond.false: ; preds = %entry + %3 = load i32* @t, align 4 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ %2, %cond.true ], [ %3, %cond.false ] + store i32 %cond, i32* @z1, align 4 + %4 = load i32* @b, align 4 + %5 = load i32* @a, align 4 + %cmp1 = icmp sgt i32 %4, %5 + br i1 %cmp1, label %cond.true2, label %cond.false3 + +cond.true2: ; preds = %cond.end + %6 = load i32* @t, align 4 + br label %cond.end4 + +cond.false3: ; preds = %cond.end + %7 = load i32* @f, align 4 + br label %cond.end4 + +cond.end4: ; preds = %cond.false3, %cond.true2 + %cond5 = phi i32 [ %6, %cond.true2 ], [ %7, %cond.false3 ] + store i32 %cond5, i32* @z2, align 4 + %8 = load i32* @c, align 4 + %9 = load i32* @a, align 4 + %cmp6 = icmp sgt i32 %8, %9 + br i1 %cmp6, label %cond.true7, label %cond.false8 + +cond.true7: ; preds = %cond.end4 + %10 = load i32* @f, align 4 + br label %cond.end9 + +cond.false8: ; preds = %cond.end4 + %11 = load i32* @t, align 4 + br label %cond.end9 + +cond.end9: ; preds = %cond.false8, %cond.true7 + %cond10 = phi i32 [ %10, %cond.true7 ], [ %11, %cond.false8 ] + store i32 %cond10, i32* @z3, align 4 + %12 = load i32* @a, align 4 + %13 = load i32* @c, align 4 + %cmp11 = icmp sgt i32 %12, %13 + br i1 %cmp11, label %cond.true12, label %cond.false13 + +cond.true12: ; preds = %cond.end9 + %14 = load i32* @f, align 4 + br label %cond.end14 + +cond.false13: ; preds = %cond.end9 + %15 = load i32* @t, align 4 + br label %cond.end14 + +cond.end14: ; preds = %cond.false13, %cond.true12 + %cond15 = phi i32 [ %14, %cond.true12 ], [ %15, %cond.false13 ] + store i32 %cond15, i32* @z4, align 4 + %16 = load i32* %retval + ret i32 %16 +} + +; 16: slt ${{[0-9]+}}, ${{[0-9]+}} +; 16: bteqz $BB{{[0-9]+}}_{{[0-9]}} + +; 16: slt ${{[0-9]+}}, ${{[0-9]+}} +; 16: bteqz $BB{{[0-9]+}}_{{[0-9]}} + +; 16: slt ${{[0-9]+}}, ${{[0-9]+}} +; 16: bteqz $BB{{[0-9]+}}_{{[0-9]}} + +; 16: slt ${{[0-9]+}}, ${{[0-9]+}} +; 16: bteqz $BB{{[0-9]+}}_{{[0-9]}} + +attributes #0 = { nounwind "target-cpu"="mips16" "target-features"="+mips16,+o32" } +attributes #1 = { "target-cpu"="mips16" "target-features"="+mips16,+o32" } diff --git a/test/CodeGen/Mips/selle.ll b/test/CodeGen/Mips/selle.ll new file mode 100644 index 0000000..b27df45 --- /dev/null +++ b/test/CodeGen/Mips/selle.ll @@ -0,0 +1,96 @@ +; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic < %s | FileCheck %s -check-prefix=16 + +@t = global i32 10, align 4 +@f = global i32 199, align 4 +@a = global i32 1, align 4 +@b = global i32 10, align 4 +@c = global i32 1, align 4 +@z1 
= common global i32 0, align 4 +@z2 = common global i32 0, align 4 +@z3 = common global i32 0, align 4 +@z4 = common global i32 0, align 4 +@.str = private unnamed_addr constant [5 x i8] c"%i \0A\00", align 1 + +define void @calc_z() nounwind "target-cpu"="mips16" "target-features"="+mips16,+o32" { +entry: + %0 = load i32* @a, align 4 + %1 = load i32* @b, align 4 + %cmp = icmp sle i32 %0, %1 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %entry + %2 = load i32* @t, align 4 + br label %cond.end + +cond.false: ; preds = %entry + %3 = load i32* @f, align 4 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ %2, %cond.true ], [ %3, %cond.false ] + store i32 %cond, i32* @z1, align 4 + %4 = load i32* @b, align 4 + %5 = load i32* @a, align 4 + %cmp1 = icmp sle i32 %4, %5 + br i1 %cmp1, label %cond.true2, label %cond.false3 + +cond.true2: ; preds = %cond.end + %6 = load i32* @f, align 4 + br label %cond.end4 + +cond.false3: ; preds = %cond.end + %7 = load i32* @t, align 4 + br label %cond.end4 + +cond.end4: ; preds = %cond.false3, %cond.true2 + %cond5 = phi i32 [ %6, %cond.true2 ], [ %7, %cond.false3 ] + store i32 %cond5, i32* @z2, align 4 + %8 = load i32* @c, align 4 + %9 = load i32* @a, align 4 + %cmp6 = icmp sle i32 %8, %9 + br i1 %cmp6, label %cond.true7, label %cond.false8 + +cond.true7: ; preds = %cond.end4 + %10 = load i32* @t, align 4 + br label %cond.end9 + +cond.false8: ; preds = %cond.end4 + %11 = load i32* @f, align 4 + br label %cond.end9 + +cond.end9: ; preds = %cond.false8, %cond.true7 + %cond10 = phi i32 [ %10, %cond.true7 ], [ %11, %cond.false8 ] + store i32 %cond10, i32* @z3, align 4 + %12 = load i32* @a, align 4 + %13 = load i32* @c, align 4 + %cmp11 = icmp sle i32 %12, %13 + br i1 %cmp11, label %cond.true12, label %cond.false13 + +cond.true12: ; preds = %cond.end9 + %14 = load i32* @t, align 4 + br label %cond.end14 + +cond.false13: ; preds = %cond.end9 + %15 = load i32* @f, align 4 + br label %cond.end14 + +cond.end14: ; preds = %cond.false13, %cond.true12 + %cond15 = phi i32 [ %14, %cond.true12 ], [ %15, %cond.false13 ] + store i32 %cond15, i32* @z4, align 4 + ret void +} + +; 16: slt ${{[0-9]+}}, ${{[0-9]+}} +; 16: btnez $BB{{[0-9]+}}_{{[0-9]}} + +; 16: slt ${{[0-9]+}}, ${{[0-9]+}} +; 16: btnez $BB{{[0-9]+}}_{{[0-9]}} + +; 16: slt ${{[0-9]+}}, ${{[0-9]+}} +; 16: btnez $BB{{[0-9]+}}_{{[0-9]}} + +; 16: slt ${{[0-9]+}}, ${{[0-9]+}} +; 16: btnez $BB{{[0-9]+}}_{{[0-9]}} + +attributes #0 = { nounwind "target-cpu"="mips16" "target-features"="+mips16,+o32" } +attributes #1 = { "target-cpu"="mips16" "target-features"="+mips16,+o32" } diff --git a/test/CodeGen/Mips/selltk.ll b/test/CodeGen/Mips/selltk.ll new file mode 100644 index 0000000..1471b89 --- /dev/null +++ b/test/CodeGen/Mips/selltk.ll @@ -0,0 +1,93 @@ +; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic < %s | FileCheck %s -check-prefix=16 + +@t = global i32 10, align 4 +@f = global i32 199, align 4 +@a = global i32 2, align 4 +@b = global i32 1000, align 4 +@c = global i32 2, align 4 +@z1 = common global i32 0, align 4 +@z2 = common global i32 0, align 4 +@z3 = common global i32 0, align 4 +@z4 = common global i32 0, align 4 +@.str = private unnamed_addr constant [5 x i8] c"%i \0A\00", align 1 + +define void @calc_selltk() nounwind "target-cpu"="mips16" "target-features"="+mips16,+o32" { +entry: + %0 = load i32* @a, align 4 + %cmp = icmp slt i32 %0, 1000 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %entry + %1 = load i32* @t, 
align 4 + br label %cond.end + +cond.false: ; preds = %entry + %2 = load i32* @f, align 4 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ %1, %cond.true ], [ %2, %cond.false ] + store i32 %cond, i32* @z1, align 4 + %3 = load i32* @b, align 4 + %cmp1 = icmp slt i32 %3, 2 + br i1 %cmp1, label %cond.true2, label %cond.false3 + +cond.true2: ; preds = %cond.end + %4 = load i32* @f, align 4 + br label %cond.end4 + +cond.false3: ; preds = %cond.end + %5 = load i32* @t, align 4 + br label %cond.end4 + +cond.end4: ; preds = %cond.false3, %cond.true2 + %cond5 = phi i32 [ %4, %cond.true2 ], [ %5, %cond.false3 ] + store i32 %cond5, i32* @z2, align 4 + %6 = load i32* @c, align 4 + %cmp6 = icmp sgt i32 %6, 2 + br i1 %cmp6, label %cond.true7, label %cond.false8 + +cond.true7: ; preds = %cond.end4 + %7 = load i32* @f, align 4 + br label %cond.end9 + +cond.false8: ; preds = %cond.end4 + %8 = load i32* @t, align 4 + br label %cond.end9 + +cond.end9: ; preds = %cond.false8, %cond.true7 + %cond10 = phi i32 [ %7, %cond.true7 ], [ %8, %cond.false8 ] + store i32 %cond10, i32* @z3, align 4 + %9 = load i32* @a, align 4 + %cmp11 = icmp sgt i32 %9, 2 + br i1 %cmp11, label %cond.true12, label %cond.false13 + +cond.true12: ; preds = %cond.end9 + %10 = load i32* @f, align 4 + br label %cond.end14 + +cond.false13: ; preds = %cond.end9 + %11 = load i32* @t, align 4 + br label %cond.end14 + +cond.end14: ; preds = %cond.false13, %cond.true12 + %cond15 = phi i32 [ %10, %cond.true12 ], [ %11, %cond.false13 ] + store i32 %cond15, i32* @z4, align 4 + ret void +} + +attributes #0 = { nounwind "target-cpu"="mips16" "target-features"="+mips16,+o32" } +attributes #1 = { "target-cpu"="mips16" "target-features"="+mips16,+o32" } + +; 16: slt ${{[0-9]+}}, ${{[0-9]+}} +; 16: btnez $BB{{[0-9]+}}_{{[0-9]}} + +; 16: slt ${{[0-9]+}}, ${{[0-9]+}} +; 16: btnez $BB{{[0-9]+}}_{{[0-9]}} + +; 16: slti ${{[0-9]+}}, 3 # 16 bit inst +; 16: btnez $BB{{[0-9]+}}_{{[0-9]}} + +; 16: slti ${{[0-9]+}}, 3 # 16 bit inst +; 16: btnez $BB{{[0-9]+}}_{{[0-9]}} + diff --git a/test/CodeGen/Mips/selne.ll b/test/CodeGen/Mips/selne.ll new file mode 100644 index 0000000..e3d82b8 --- /dev/null +++ b/test/CodeGen/Mips/selne.ll @@ -0,0 +1,97 @@ +; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic < %s | FileCheck %s -check-prefix=16 + +@t = global i32 10, align 4 +@f = global i32 199, align 4 +@a = global i32 1, align 4 +@b = global i32 10, align 4 +@c = global i32 1, align 4 +@z1 = common global i32 0, align 4 +@z2 = common global i32 0, align 4 +@z3 = common global i32 0, align 4 +@z4 = common global i32 0, align 4 +@.str = private unnamed_addr constant [5 x i8] c"%i \0A\00", align 1 + +define void @calc_seleq() nounwind "target-cpu"="mips16" "target-features"="+mips16,+o32" { +entry: + %0 = load i32* @a, align 4 + %1 = load i32* @b, align 4 + %cmp = icmp ne i32 %0, %1 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %entry + %2 = load i32* @f, align 4 + br label %cond.end + +cond.false: ; preds = %entry + %3 = load i32* @t, align 4 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ %2, %cond.true ], [ %3, %cond.false ] + store i32 %cond, i32* @z1, align 4 + %4 = load i32* @b, align 4 + %5 = load i32* @a, align 4 + %cmp1 = icmp ne i32 %4, %5 + br i1 %cmp1, label %cond.true2, label %cond.false3 + +cond.true2: ; preds = %cond.end + %6 = load i32* @f, align 4 + br label %cond.end4 + +cond.false3: ; preds = %cond.end + %7 = load i32* @t, align 4 + br label 
%cond.end4 + +cond.end4: ; preds = %cond.false3, %cond.true2 + %cond5 = phi i32 [ %6, %cond.true2 ], [ %7, %cond.false3 ] + store i32 %cond5, i32* @z2, align 4 + %8 = load i32* @c, align 4 + %9 = load i32* @a, align 4 + %cmp6 = icmp ne i32 %8, %9 + br i1 %cmp6, label %cond.true7, label %cond.false8 + +cond.true7: ; preds = %cond.end4 + %10 = load i32* @t, align 4 + br label %cond.end9 + +cond.false8: ; preds = %cond.end4 + %11 = load i32* @f, align 4 + br label %cond.end9 + +cond.end9: ; preds = %cond.false8, %cond.true7 + %cond10 = phi i32 [ %10, %cond.true7 ], [ %11, %cond.false8 ] + store i32 %cond10, i32* @z3, align 4 + %12 = load i32* @a, align 4 + %13 = load i32* @c, align 4 + %cmp11 = icmp ne i32 %12, %13 + br i1 %cmp11, label %cond.true12, label %cond.false13 + +cond.true12: ; preds = %cond.end9 + %14 = load i32* @t, align 4 + br label %cond.end14 + +cond.false13: ; preds = %cond.end9 + %15 = load i32* @f, align 4 + br label %cond.end14 + +cond.end14: ; preds = %cond.false13, %cond.true12 + %cond15 = phi i32 [ %14, %cond.true12 ], [ %15, %cond.false13 ] + store i32 %cond15, i32* @z4, align 4 + ret void +} + +attributes #0 = { nounwind "target-cpu"="mips16" "target-features"="+mips16,+o32" } +attributes #1 = { "target-cpu"="mips16" "target-features"="+mips16,+o32" } + +; 16: cmp ${{[0-9]+}}, ${{[0-9]+}} +; 16: bteqz $BB{{[0-9]+}}_{{[0-9]}} + +; 16: cmp ${{[0-9]+}}, ${{[0-9]+}} +; 16: bteqz $BB{{[0-9]+}}_{{[0-9]}} + +; 16: cmp ${{[0-9]+}}, ${{[0-9]+}} +; 16: bteqz $BB{{[0-9]+}}_{{[0-9]}} + +; 16: cmp ${{[0-9]+}}, ${{[0-9]+}} +; 16: bteqz $BB{{[0-9]+}}_{{[0-9]}} + diff --git a/test/CodeGen/Mips/selnek.ll b/test/CodeGen/Mips/selnek.ll new file mode 100644 index 0000000..2601552 --- /dev/null +++ b/test/CodeGen/Mips/selnek.ll @@ -0,0 +1,107 @@ +; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic < %s | FileCheck %s -check-prefix=16 + +@t = global i32 10, align 4 +@f = global i32 199, align 4 +@a = global i32 1, align 4 +@b = global i32 1000, align 4 +@z1 = common global i32 0, align 4 +@z2 = common global i32 0, align 4 +@z3 = common global i32 0, align 4 +@z4 = common global i32 0, align 4 +@.str = private unnamed_addr constant [5 x i8] c"%i \0A\00", align 1 + +define void @calc_z() nounwind "target-cpu"="mips16" "target-features"="+mips16,+o32" { +entry: + %0 = load i32* @a, align 4 + %cmp = icmp ne i32 %0, 1 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %entry + %1 = load i32* @f, align 4 + br label %cond.end + +cond.false: ; preds = %entry + %2 = load i32* @t, align 4 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ %1, %cond.true ], [ %2, %cond.false ] + store i32 %cond, i32* @z1, align 4 + %3 = load i32* @a, align 4 + %cmp1 = icmp ne i32 %3, 1000 + br i1 %cmp1, label %cond.true2, label %cond.false3 + +cond.true2: ; preds = %cond.end + %4 = load i32* @t, align 4 + br label %cond.end4 + +cond.false3: ; preds = %cond.end + %5 = load i32* @f, align 4 + br label %cond.end4 + +cond.end4: ; preds = %cond.false3, %cond.true2 + %cond5 = phi i32 [ %4, %cond.true2 ], [ %5, %cond.false3 ] + store i32 %cond5, i32* @z2, align 4 + %6 = load i32* @b, align 4 + %cmp6 = icmp ne i32 %6, 3 + br i1 %cmp6, label %cond.true7, label %cond.false8 + +cond.true7: ; preds = %cond.end4 + %7 = load i32* @t, align 4 + br label %cond.end9 + +cond.false8: ; preds = %cond.end4 + %8 = load i32* @f, align 4 + br label %cond.end9 + +cond.end9: ; preds = %cond.false8, %cond.true7 + %cond10 = phi i32 [ %7, %cond.true7 ], [ %8, %cond.false8 ] + 
store i32 %cond10, i32* @z3, align 4 + %9 = load i32* @b, align 4 + %cmp11 = icmp ne i32 %9, 1000 + br i1 %cmp11, label %cond.true12, label %cond.false13 + +cond.true12: ; preds = %cond.end9 + %10 = load i32* @f, align 4 + br label %cond.end14 + +cond.false13: ; preds = %cond.end9 + %11 = load i32* @t, align 4 + br label %cond.end14 + +cond.end14: ; preds = %cond.false13, %cond.true12 + %cond15 = phi i32 [ %10, %cond.true12 ], [ %11, %cond.false13 ] + store i32 %cond15, i32* @z4, align 4 + ret void +} + +define i32 @main() nounwind "target-cpu"="mips16" "target-features"="+mips16,+o32" { +entry: + call void @calc_z() "target-cpu"="mips16" "target-features"="+mips16,+o32" + %0 = load i32* @z1, align 4 + %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([5 x i8]* @.str, i32 0, i32 0), i32 %0) "target-cpu"="mips16" "target-features"="+mips16,+o32" + %1 = load i32* @z2, align 4 + %call1 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([5 x i8]* @.str, i32 0, i32 0), i32 %1) "target-cpu"="mips16" "target-features"="+mips16,+o32" + %2 = load i32* @z3, align 4 + %call2 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([5 x i8]* @.str, i32 0, i32 0), i32 %2) "target-cpu"="mips16" "target-features"="+mips16,+o32" + %3 = load i32* @z4, align 4 + %call3 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([5 x i8]* @.str, i32 0, i32 0), i32 %3) "target-cpu"="mips16" "target-features"="+mips16,+o32" + ret i32 0 +} + +declare i32 @printf(i8*, ...) "target-cpu"="mips16" "target-features"="+mips16,+o32" + +attributes #0 = { nounwind "target-cpu"="mips16" "target-features"="+mips16,+o32" } +attributes #1 = { "target-cpu"="mips16" "target-features"="+mips16,+o32" } + +; 16: cmpi ${{[0-9]+}}, 1 # 16 bit inst +; 16: bteqz $BB{{[0-9]+}}_{{[0-9]}} + +; 16: cmpi ${{[0-9]+}}, 1000 +; 16: bteqz $BB{{[0-9]+}}_{{[0-9]}} + +; 16: cmpi ${{[0-9]+}}, 3 # 16 bit inst +; 16: bteqz $BB{{[0-9]+}}_{{[0-9]}} + +; 16: cmpi ${{[0-9]+}}, 1000 +; 16: bteqz $BB{{[0-9]+}}_{{[0-9]}}
\ No newline at end of file diff --git a/test/CodeGen/Mips/selpat.ll b/test/CodeGen/Mips/selpat.ll index cda0c96..8eda8de 100644 --- a/test/CodeGen/Mips/selpat.ll +++ b/test/CodeGen/Mips/selpat.ll @@ -20,7 +20,7 @@ entry: %cond = select i1 %cmp, i32 %2, i32 %3 store i32 %cond, i32* @z1, align 4 ; 16: cmp ${{[0-9]+}}, ${{[0-9]+}} -; 16: bteqz .+4 +; 16: bteqz $BB{{[0-9]+}}_{{[0-9]}} ; 16: move ${{[0-9]+}}, ${{[0-9]+}} store i32 %cond, i32* @z2, align 4 %4 = load i32* @c, align 4 @@ -41,7 +41,7 @@ entry: %cond = select i1 %cmp, i32 %1, i32 %2 store i32 %cond, i32* @z1, align 4 ; 16: cmpi ${{[0-9]+}}, 1 -; 16: bteqz .+4 +; 16: bteqz $BB{{[0-9]+}}_{{[0-9]}} ; 16: move ${{[0-9]+}}, ${{[0-9]+}} %cmp1 = icmp eq i32 %0, 10 %cond5 = select i1 %cmp1, i32 %2, i32 %1 @@ -51,7 +51,7 @@ entry: %cond10 = select i1 %cmp6, i32 %2, i32 %1 store i32 %cond10, i32* @z3, align 4 ; 16: cmpi ${{[0-9]+}}, 10 -; 16: bteqz .+4 +; 16: bteqz $BB{{[0-9]+}}_{{[0-9]}} ; 16: move ${{[0-9]+}}, ${{[0-9]+}} %cmp11 = icmp eq i32 %3, 10 %cond15 = select i1 %cmp11, i32 %1, i32 %2 @@ -67,7 +67,7 @@ entry: %2 = load i32* @f, align 4 %cond = select i1 %cmp, i32 %1, i32 %2 store i32 %cond, i32* @z1, align 4 -; 16: beqz ${{[0-9]+}}, .+4 +; 16: beqz ${{[0-9]+}}, $BB{{[0-9]+}}_{{[0-9]}} ; 16: move ${{[0-9]+}}, ${{[0-9]+}} %3 = load i32* @b, align 4 %cmp1 = icmp eq i32 %3, 0 @@ -91,7 +91,7 @@ entry: %cond = select i1 %cmp, i32 %2, i32 %3 store i32 %cond, i32* @z1, align 4 ; 16: slt ${{[0-9]+}}, ${{[0-9]+}} -; 16: bteqz .+4 +; 16: bteqz $BB{{[0-9]+}}_{{[0-9]}} ; 16: move ${{[0-9]+}}, ${{[0-9]+}} %cmp1 = icmp sge i32 %1, %0 %cond5 = select i1 %cmp1, i32 %3, i32 %2 @@ -112,7 +112,7 @@ entry: %1 = load i32* @b, align 4 %cmp = icmp sgt i32 %0, %1 ; 16: slt ${{[0-9]+}}, ${{[0-9]+}} -; 16: btnez .+4 +; 16: btnez $BB{{[0-9]+}}_{{[0-9]}} ; 16: move ${{[0-9]+}}, ${{[0-9]+}} %2 = load i32* @f, align 4 %3 = load i32* @t, align 4 @@ -141,7 +141,7 @@ entry: %cond = select i1 %cmp, i32 %2, i32 %3 store i32 %cond, i32* @z1, align 4 ; 16: slt ${{[0-9]+}}, ${{[0-9]+}} -; 16: bteqz .+4 +; 16: bteqz $BB{{[0-9]+}}_{{[0-9]}} ; 16: move ${{[0-9]+}}, ${{[0-9]+}} %cmp1 = icmp sle i32 %1, %0 %cond5 = select i1 %cmp1, i32 %3, i32 %2 @@ -165,7 +165,7 @@ entry: %cond = select i1 %cmp, i32 %1, i32 %2 store i32 %cond, i32* @z1, align 4 ; 16: slti ${{[0-9]+}}, {{[0-9]+}} -; 16: btnez .+4 +; 16: btnez $BB{{[0-9]+}}_{{[0-9]}} ; 16: move ${{[0-9]+}}, ${{[0-9]+}} %3 = load i32* @b, align 4 %cmp1 = icmp slt i32 %3, 2 @@ -192,7 +192,7 @@ entry: %cond = select i1 %cmp, i32 %2, i32 %3 store i32 %cond, i32* @z1, align 4 ; 16: cmp ${{[0-9]+}}, ${{[0-9]+}} -; 16: btnez .+4 +; 16: btnez $BB{{[0-9]+}}_{{[0-9]}} ; 16: move ${{[0-9]+}}, ${{[0-9]+}} store i32 %cond, i32* @z2, align 4 %4 = load i32* @c, align 4 @@ -212,7 +212,7 @@ entry: %cond = select i1 %cmp, i32 %1, i32 %2 store i32 %cond, i32* @z1, align 4 ; 16: cmpi ${{[0-9]+}}, 1 -; 16: btnez .+4 +; 16: btnez $BB{{[0-9]+}}_{{[0-9]}} ; 16: move ${{[0-9]+}}, ${{[0-9]+}} %cmp1 = icmp ne i32 %0, 10 %cond5 = select i1 %cmp1, i32 %2, i32 %1 @@ -222,7 +222,7 @@ entry: %cond10 = select i1 %cmp6, i32 %2, i32 %1 store i32 %cond10, i32* @z3, align 4 ; 16: cmpi ${{[0-9]+}}, 10 -; 16: btnez .+4 +; 16: btnez $BB{{[0-9]+}}_{{[0-9]}} ; 16: move ${{[0-9]+}}, ${{[0-9]+}} %cmp11 = icmp ne i32 %3, 10 %cond15 = select i1 %cmp11, i32 %1, i32 %2 @@ -238,7 +238,7 @@ entry: %2 = load i32* @t, align 4 %cond = select i1 %cmp, i32 %1, i32 %2 store i32 %cond, i32* @z1, align 4 -; 16: bnez ${{[0-9]+}}, .+4 +; 16: bnez ${{[0-9]+}}, $BB{{[0-9]+}}_{{[0-9]}} ; 
16: move ${{[0-9]+}}, ${{[0-9]+}} %3 = load i32* @b, align 4 %cmp1 = icmp ne i32 %3, 0 @@ -260,7 +260,7 @@ entry: %2 = load i32* @t, align 4 %cond = select i1 %tobool, i32 %1, i32 %2 store i32 %cond, i32* @z1, align 4 -; 16: bnez ${{[0-9]+}}, .+4 +; 16: bnez ${{[0-9]+}}, $BB{{[0-9]+}}_{{[0-9]}} ; 16: move ${{[0-9]+}}, ${{[0-9]+}} %3 = load i32* @b, align 4 %tobool1 = icmp ne i32 %3, 0 @@ -284,7 +284,7 @@ entry: %cond = select i1 %cmp, i32 %2, i32 %3 store i32 %cond, i32* @z1, align 4 ; 16: sltu ${{[0-9]+}}, ${{[0-9]+}} -; 16: bteqz .+4 +; 16: bteqz $BB{{[0-9]+}}_{{[0-9]}} ; 16: move ${{[0-9]+}}, ${{[0-9]+}} %cmp1 = icmp uge i32 %1, %0 %cond5 = select i1 %cmp1, i32 %3, i32 %2 @@ -309,7 +309,7 @@ entry: %cond = select i1 %cmp, i32 %2, i32 %3 store i32 %cond, i32* @z1, align 4 ; 16: sltu ${{[0-9]+}}, ${{[0-9]+}} -; 16: btnez .+4 +; 16: btnez $BB{{[0-9]+}}_{{[0-9]}} ; 16: move ${{[0-9]+}}, ${{[0-9]+}} %cmp1 = icmp ugt i32 %1, %0 %cond5 = select i1 %cmp1, i32 %3, i32 %2 @@ -334,7 +334,7 @@ entry: %cond = select i1 %cmp, i32 %2, i32 %3 store i32 %cond, i32* @z1, align 4 ; 16: sltu ${{[0-9]+}}, ${{[0-9]+}} -; 16: bteqz .+4 +; 16: bteqz $BB{{[0-9]+}}_{{[0-9]}} ; 16: move ${{[0-9]+}}, ${{[0-9]+}} %cmp1 = icmp ule i32 %1, %0 %cond5 = select i1 %cmp1, i32 %3, i32 %2 diff --git a/test/CodeGen/Mips/seteq.ll b/test/CodeGen/Mips/seteq.ll index da840c8..5fadf78 100644 --- a/test/CodeGen/Mips/seteq.ll +++ b/test/CodeGen/Mips/seteq.ll @@ -15,7 +15,7 @@ entry: store i32 %conv, i32* @r1, align 4 ; 16: xor $[[REGISTER:[0-9A-Ba-b_]+]], ${{[0-9]+}} ; 16: sltiu $[[REGISTER:[0-9A-Ba-b_]+]], 1 -; 16: move ${{[0-9]+}}, $t8 +; 16: move ${{[0-9]+}}, $24 ret void } diff --git a/test/CodeGen/Mips/seteqz.ll b/test/CodeGen/Mips/seteqz.ll index d445be6..80dc312 100644 --- a/test/CodeGen/Mips/seteqz.ll +++ b/test/CodeGen/Mips/seteqz.ll @@ -12,13 +12,13 @@ entry: %conv = zext i1 %cmp to i32 store i32 %conv, i32* @r1, align 4 ; 16: sltiu ${{[0-9]+}}, 1 -; 16: move ${{[0-9]+}}, $t8 +; 16: move ${{[0-9]+}}, $24 %1 = load i32* @j, align 4 %cmp1 = icmp eq i32 %1, 99 %conv2 = zext i1 %cmp1 to i32 store i32 %conv2, i32* @r2, align 4 ; 16: xor $[[REGISTER:[0-9A-Ba-b_]+]], ${{[0-9]+}} ; 16: sltiu $[[REGISTER:[0-9A-Ba-b_]+]], 1 -; 16: move ${{[0-9]+}}, $t8 +; 16: move ${{[0-9]+}}, $24 ret void } diff --git a/test/CodeGen/Mips/setge.ll b/test/CodeGen/Mips/setge.ll index 94b499b..8869eb8 100644 --- a/test/CodeGen/Mips/setge.ll +++ b/test/CodeGen/Mips/setge.ll @@ -17,7 +17,7 @@ entry: %conv = zext i1 %cmp to i32 store i32 %conv, i32* @r1, align 4 ; 16: slt ${{[0-9]+}}, ${{[0-9]+}} -; 16: move $[[REGISTER:[0-9]+]], $t8 +; 16: move $[[REGISTER:[0-9]+]], $24 ; 16: xor $[[REGISTER]], ${{[0-9]+}} %2 = load i32* @m, align 4 %cmp1 = icmp sge i32 %0, %2 diff --git a/test/CodeGen/Mips/setgek.ll b/test/CodeGen/Mips/setgek.ll index b6bae09..18a0fcf 100644 --- a/test/CodeGen/Mips/setgek.ll +++ b/test/CodeGen/Mips/setgek.ll @@ -12,7 +12,7 @@ entry: %conv = zext i1 %cmp to i32 store i32 %conv, i32* @r1, align 4 ; 16: slti ${{[0-9]+}}, -32768 -; 16: move ${{[0-9]+}}, $t8 +; 16: move ${{[0-9]+}}, $24 ; 16: xor ${{[0-9]+}}, ${{[0-9]+}} ret void } diff --git a/test/CodeGen/Mips/setle.ll b/test/CodeGen/Mips/setle.ll index f36fb43..2df6774 100644 --- a/test/CodeGen/Mips/setle.ll +++ b/test/CodeGen/Mips/setle.ll @@ -16,7 +16,7 @@ entry: %conv = zext i1 %cmp to i32 store i32 %conv, i32* @r1, align 4 ; 16: slt ${{[0-9]+}}, ${{[0-9]+}} -; 16: move $[[REGISTER:[0-9]+]], $t8 +; 16: move $[[REGISTER:[0-9]+]], $24 ; 16: xor $[[REGISTER]], ${{[0-9]+}} %2 = load i32* 
@m, align 4 %cmp1 = icmp sle i32 %2, %1 diff --git a/test/CodeGen/Mips/setlt.ll b/test/CodeGen/Mips/setlt.ll index 435be8e..3dac74b 100644 --- a/test/CodeGen/Mips/setlt.ll +++ b/test/CodeGen/Mips/setlt.ll @@ -16,6 +16,6 @@ entry: %conv = zext i1 %cmp to i32 store i32 %conv, i32* @r1, align 4 ; 16: slt ${{[0-9]+}}, ${{[0-9]+}} -; 16: move ${{[0-9]+}}, $t8 +; 16: move ${{[0-9]+}}, $24 ret void } diff --git a/test/CodeGen/Mips/setltk.ll b/test/CodeGen/Mips/setltk.ll index c0b610e..ecebc7e 100644 --- a/test/CodeGen/Mips/setltk.ll +++ b/test/CodeGen/Mips/setltk.ll @@ -15,6 +15,6 @@ entry: %conv = zext i1 %cmp to i32 store i32 %conv, i32* @r1, align 4 ; 16: slti $[[REGISTER:[0-9]+]], 10 -; 16: move $[[REGISTER]], $t8 +; 16: move $[[REGISTER]], $24 ret void } diff --git a/test/CodeGen/Mips/setne.ll b/test/CodeGen/Mips/setne.ll index 6460c83..9e66901 100644 --- a/test/CodeGen/Mips/setne.ll +++ b/test/CodeGen/Mips/setne.ll @@ -15,6 +15,6 @@ entry: store i32 %conv, i32* @r1, align 4 ; 16: xor $[[REGISTER:[0-9]+]], ${{[0-9]+}} ; 16: sltu ${{[0-9]+}}, $[[REGISTER]] -; 16: move ${{[0-9]+}}, $t8 +; 16: move ${{[0-9]+}}, $24 ret void } diff --git a/test/CodeGen/Mips/setuge.ll b/test/CodeGen/Mips/setuge.ll index ac72b66..1c9b5bb 100644 --- a/test/CodeGen/Mips/setuge.ll +++ b/test/CodeGen/Mips/setuge.ll @@ -16,7 +16,7 @@ entry: %conv = zext i1 %cmp to i32 store i32 %conv, i32* @r1, align 4 ; 16: sltu ${{[0-9]+}}, ${{[0-9]+}} -; 16: move $[[REGISTER:[0-9]+]], $t8 +; 16: move $[[REGISTER:[0-9]+]], $24 ; 16: xor $[[REGISTER]], ${{[0-9]+}} %2 = load i32* @m, align 4 %cmp1 = icmp uge i32 %0, %2 diff --git a/test/CodeGen/Mips/setugt.ll b/test/CodeGen/Mips/setugt.ll index 328f0e3..f10b47a 100644 --- a/test/CodeGen/Mips/setugt.ll +++ b/test/CodeGen/Mips/setugt.ll @@ -16,6 +16,6 @@ entry: %conv = zext i1 %cmp to i32 store i32 %conv, i32* @r1, align 4 ; 16: sltu ${{[0-9]+}}, ${{[0-9]+}} -; 16: move ${{[0-9]+}}, $t8 +; 16: move ${{[0-9]+}}, $24 ret void } diff --git a/test/CodeGen/Mips/setule.ll b/test/CodeGen/Mips/setule.ll index 792f2ae..a6d6bf0 100644 --- a/test/CodeGen/Mips/setule.ll +++ b/test/CodeGen/Mips/setule.ll @@ -16,7 +16,7 @@ entry: %conv = zext i1 %cmp to i32 store i32 %conv, i32* @r1, align 4 ; 16: sltu ${{[0-9]+}}, ${{[0-9]+}} -; 16: move $[[REGISTER:[0-9]+]], $t8 +; 16: move $[[REGISTER:[0-9]+]], $24 ; 16: xor $[[REGISTER]], ${{[0-9]+}} %2 = load i32* @m, align 4 %cmp1 = icmp ule i32 %2, %1 diff --git a/test/CodeGen/Mips/setult.ll b/test/CodeGen/Mips/setult.ll index 56d2e8d..00ee437 100644 --- a/test/CodeGen/Mips/setult.ll +++ b/test/CodeGen/Mips/setult.ll @@ -16,6 +16,6 @@ entry: %conv = zext i1 %cmp to i32 store i32 %conv, i32* @r1, align 4 ; 16: sltu ${{[0-9]+}}, ${{[0-9]+}} -; 16: move ${{[0-9]+}}, $t8 +; 16: move ${{[0-9]+}}, $24 ret void } diff --git a/test/CodeGen/Mips/setultk.ll b/test/CodeGen/Mips/setultk.ll index 75b270e..eb9edba 100644 --- a/test/CodeGen/Mips/setultk.ll +++ b/test/CodeGen/Mips/setultk.ll @@ -14,7 +14,7 @@ entry: %cmp = icmp ult i32 %0, 10 %conv = zext i1 %cmp to i32 store i32 %conv, i32* @r1, align 4 -; 16: sltiu $[[REGISTER:[0-9]+]], 10 -; 16: move $[[REGISTER]], $t8 +; 16: sltiu ${{[0-9]+}}, 10 # 16 bit inst +; 16: move ${{[0-9]+}}, $24 ret void } diff --git a/test/CodeGen/Mips/tls.ll b/test/CodeGen/Mips/tls.ll index 72d30dc..b86d25e 100644 --- a/test/CodeGen/Mips/tls.ll +++ b/test/CodeGen/Mips/tls.ll @@ -21,9 +21,9 @@ entry: ; PIC: jalr $25 ; PIC: lw $2, 0($2) -; STATIC: rdhwr $3, $29 ; STATIC: lui $[[R0:[0-9]+]], %tprel_hi(t1) ; STATIC: addiu $[[R1:[0-9]+]], $[[R0]], 
%tprel_lo(t1) +; STATIC: rdhwr $3, $29 ; STATIC: addu $[[R2:[0-9]+]], $3, $[[R1]] ; STATIC: lw $2, 0($[[R2]]) } diff --git a/test/CodeGen/NVPTX/intrin-nocapture.ll b/test/CodeGen/NVPTX/intrin-nocapture.ll new file mode 100644 index 0000000..55781bb --- /dev/null +++ b/test/CodeGen/NVPTX/intrin-nocapture.ll @@ -0,0 +1,21 @@ +; RUN: opt < %s -O3 -S | FileCheck %s + +; Address space intrinsics were erroneously marked NoCapture, leading to bad +; optimizations (such as the store below being eliminated as dead code). This +; test makes sure we don't regress. + +declare void @foo(i32 addrspace(1)*) + +declare i32 addrspace(1)* @llvm.nvvm.ptr.gen.to.global.p1i32.p0i32(i32*) + +; CHECK: @bar +define void @bar() { + %t1 = alloca i32 +; CHECK: call i32 addrspace(1)* @llvm.nvvm.ptr.gen.to.global.p1i32.p0i32(i32* %t1) +; CHECK-NEXT: store i32 10, i32* %t1 + %t2 = call i32 addrspace(1)* @llvm.nvvm.ptr.gen.to.global.p1i32.p0i32(i32* %t1) + store i32 10, i32* %t1 + call void @foo(i32 addrspace(1)* %t2) + ret void +} + diff --git a/test/CodeGen/NVPTX/vector-loads.ll b/test/CodeGen/NVPTX/vector-loads.ll new file mode 100644 index 0000000..f5a1795 --- /dev/null +++ b/test/CodeGen/NVPTX/vector-loads.ll @@ -0,0 +1,66 @@ +; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s + +; Even though general vector types are not supported in PTX, we can still +; optimize loads/stores with pseudo-vector instructions of the form: +; +; ld.v2.f32 {%f0, %f1}, [%r0] +; +; which will load two floats at once into scalar registers. + +define void @foo(<2 x float>* %a) { +; CHECK: .func foo +; CHECK: ld.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}, [%r{{[0-9]+}}]; + %t1 = load <2 x float>* %a + %t2 = fmul <2 x float> %t1, %t1 + store <2 x float> %t2, <2 x float>* %a + ret void +} + +define void @foo2(<4 x float>* %a) { +; CHECK: .func foo2 +; CHECK: ld.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [%r{{[0-9]+}}]; + %t1 = load <4 x float>* %a + %t2 = fmul <4 x float> %t1, %t1 + store <4 x float> %t2, <4 x float>* %a + ret void +} + +define void @foo3(<8 x float>* %a) { +; CHECK: .func foo3 +; CHECK: ld.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [%r{{[0-9]+}}]; +; CHECK-NEXT: ld.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [%r{{[0-9]+}}+16]; + %t1 = load <8 x float>* %a + %t2 = fmul <8 x float> %t1, %t1 + store <8 x float> %t2, <8 x float>* %a + ret void +} + + + +define void @foo4(<2 x i32>* %a) { +; CHECK: .func foo4 +; CHECK: ld.v2.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}}, [%r{{[0-9]+}}]; + %t1 = load <2 x i32>* %a + %t2 = mul <2 x i32> %t1, %t1 + store <2 x i32> %t2, <2 x i32>* %a + ret void +} + +define void @foo5(<4 x i32>* %a) { +; CHECK: .func foo5 +; CHECK: ld.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}, [%r{{[0-9]+}}]; + %t1 = load <4 x i32>* %a + %t2 = mul <4 x i32> %t1, %t1 + store <4 x i32> %t2, <4 x i32>* %a + ret void +} + +define void @foo6(<8 x i32>* %a) { +; CHECK: .func foo6 +; CHECK: ld.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}, [%r{{[0-9]+}}]; +; CHECK-NEXT: ld.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}, [%r{{[0-9]+}}+16]; + %t1 = load <8 x i32>* %a + %t2 = mul <8 x i32> %t1, %t1 + store <8 x i32> %t2, <8 x i32>* %a + ret void +} diff --git a/test/CodeGen/PowerPC/2008-07-10-SplatMiscompile.ll b/test/CodeGen/PowerPC/2008-07-10-SplatMiscompile.ll index 8802b97..00a402e 100644 --- a/test/CodeGen/PowerPC/2008-07-10-SplatMiscompile.ll +++ b/test/CodeGen/PowerPC/2008-07-10-SplatMiscompile.ll @@ -1,5 +1,6 @@ 
; RUN: llc < %s -march=ppc32 -mcpu=g5 | grep vadduhm ; RUN: llc < %s -march=ppc32 -mcpu=g5 | grep vsubuhm +; XFAIL: * define <4 x i32> @test() nounwind { ret <4 x i32> < i32 4293066722, i32 4293066722, i32 4293066722, i32 4293066722> diff --git a/test/CodeGen/PowerPC/a2q-stackalign.ll b/test/CodeGen/PowerPC/a2q-stackalign.ll new file mode 100644 index 0000000..00c3291 --- /dev/null +++ b/test/CodeGen/PowerPC/a2q-stackalign.ll @@ -0,0 +1,23 @@ +; RUN: llc < %s -march=ppc64 -mcpu=a2 | FileCheck -check-prefix=CHECK-A2 %s +; RUN: llc < %s -march=ppc64 -mcpu=a2q | FileCheck -check-prefix=CHECK-A2Q %s +; RUN: llc < %s -march=ppc64 -mtriple=powerpc64-bgq-linux -mcpu=a2 | FileCheck -check-prefix=CHECK-BGQ %s +target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" +target triple = "powerpc64-unknown-linux-gnu" + +declare i32 @bar(i8* %a) nounwind; +define i32 @foo() nounwind { + %p = alloca i8, i8 115 + store i8 0, i8* %p + %r = call i32 @bar(i8* %p) + ret i32 %r +} + +; Without QPX, the allocated stack frame is 240 bytes, but with QPX +; (because we require 32-byte alignment), it is 256 bytes. +; CHECK-A2: @foo +; CHECK-A2: stdu 1, -240(1) +; CHECK-A2Q: @foo +; CHECK-A2Q: stdu 1, -256(1) +; CHECK-BGQ: @foo +; CHECK-BGQ: stdu 1, -256(1) + diff --git a/test/CodeGen/PowerPC/a2q.ll b/test/CodeGen/PowerPC/a2q.ll new file mode 100644 index 0000000..b26480f --- /dev/null +++ b/test/CodeGen/PowerPC/a2q.ll @@ -0,0 +1,10 @@ +; RUN: llc < %s -march=ppc64 -mcpu=a2q | FileCheck %s +; RUN: llc < %s -march=ppc64 -mcpu=a2 -mattr=+qpx | FileCheck %s + +define void @foo() { +entry: + ret void +} + +; CHECK: @foo + diff --git a/test/CodeGen/PowerPC/anon_aggr.ll b/test/CodeGen/PowerPC/anon_aggr.ll new file mode 100644 index 0000000..52587e2 --- /dev/null +++ b/test/CodeGen/PowerPC/anon_aggr.ll @@ -0,0 +1,99 @@ +; RUN: llc -O0 -mcpu=pwr7 -mtriple=powerpc64-unknown-linux-gnu < %s | FileCheck %s + +; Test case for PR 14779: anonymous aggregates are not handled correctly. +; The bug is triggered by passing a byval structure after an anonymous +; aggregate. 
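For orientation, a sketch of the calling convention the CHECK lines below pin down (register numbers and the offset 64 are illustrative, echoing func2): the anonymous aggregate { i64, i8* } arrives in parameter GPRs, while a byval struct that follows it is reached through its slot in the caller's parameter save area:
; addi 9, 1, 64    ; compute the byval struct's address in the parameter save area
; ld   9, 8(9)     ; load its i8* field at offset 8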
+ +%tarray = type { i64, i8* } + +define i8* @func1({ i64, i8* } %array, i8* %ptr) { +entry: + %array_ptr = extractvalue {i64, i8* } %array, 1 + %cond = icmp eq i8* %array_ptr, %ptr + br i1 %cond, label %equal, label %unequal +equal: + ret i8* %array_ptr +unequal: + ret i8* %ptr +} + +; CHECK: func1: +; CHECK: cmpld {{[0-9]+}}, 4, 5 +; CHECK: std 4, -[[OFFSET1:[0-9]+]] +; CHECK: std 5, -[[OFFSET2:[0-9]+]] +; CHECK: ld 3, -[[OFFSET1]](1) +; CHECK: ld 3, -[[OFFSET2]](1) + + +define i8* @func2({ i64, i8* } %array1, %tarray* byval %array2) { +entry: + %array1_ptr = extractvalue {i64, i8* } %array1, 1 + %tmp = getelementptr inbounds %tarray* %array2, i32 0, i32 1 + %array2_ptr = load i8** %tmp + %cond = icmp eq i8* %array1_ptr, %array2_ptr + br i1 %cond, label %equal, label %unequal +equal: + ret i8* %array1_ptr +unequal: + ret i8* %array2_ptr +} + +; CHECK: func2: +; CHECK: addi [[REG1:[0-9]+]], 1, 64 +; CHECK: ld [[REG2:[0-9]+]], 8([[REG1]]) +; CHECK: cmpld {{[0-9]+}}, 4, [[REG2]] +; CHECK: std [[REG2]], -[[OFFSET1:[0-9]+]] +; CHECK: std 4, -[[OFFSET2:[0-9]+]] +; CHECK: ld 3, -[[OFFSET2]](1) +; CHECK: ld 3, -[[OFFSET1]](1) + +define i8* @func3({ i64, i8* }* byval %array1, %tarray* byval %array2) { +entry: + %tmp1 = getelementptr inbounds { i64, i8* }* %array1, i32 0, i32 1 + %array1_ptr = load i8** %tmp1 + %tmp2 = getelementptr inbounds %tarray* %array2, i32 0, i32 1 + %array2_ptr = load i8** %tmp2 + %cond = icmp eq i8* %array1_ptr, %array2_ptr + br i1 %cond, label %equal, label %unequal +equal: + ret i8* %array1_ptr +unequal: + ret i8* %array2_ptr +} + +; CHECK: func3: +; CHECK: addi [[REG1:[0-9]+]], 1, 64 +; CHECK: addi [[REG2:[0-9]+]], 1, 48 +; CHECK: ld [[REG3:[0-9]+]], 8([[REG1]]) +; CHECK: ld [[REG4:[0-9]+]], 8([[REG2]]) +; CHECK: cmpld {{[0-9]+}}, [[REG4]], [[REG3]] +; CHECK: std [[REG3]], -[[OFFSET1:[0-9]+]](1) +; CHECK: std [[REG4]], -[[OFFSET2:[0-9]+]](1) +; CHECK: ld 3, -[[OFFSET2]](1) +; CHECK: ld 3, -[[OFFSET1]](1) + +define i8* @func4(i64 %p1, i64 %p2, i64 %p3, i64 %p4, + i64 %p5, i64 %p6, i64 %p7, i64 %p8, + { i64, i8* } %array1, %tarray* byval %array2) { +entry: + %array1_ptr = extractvalue {i64, i8* } %array1, 1 + %tmp = getelementptr inbounds %tarray* %array2, i32 0, i32 1 + %array2_ptr = load i8** %tmp + %cond = icmp eq i8* %array1_ptr, %array2_ptr + br i1 %cond, label %equal, label %unequal +equal: + ret i8* %array1_ptr +unequal: + ret i8* %array2_ptr +} + +; CHECK: func4: +; CHECK: addi [[REG1:[0-9]+]], 1, 128 +; CHECK: ld [[REG2:[0-9]+]], 120(1) +; CHECK: ld [[REG3:[0-9]+]], 8([[REG1]]) +; CHECK: cmpld {{[0-9]+}}, [[REG2]], [[REG3]] +; CHECK: std [[REG2]], -[[OFFSET1:[0-9]+]](1) +; CHECK: std [[REG3]], -[[OFFSET2:[0-9]+]](1) +; CHECK: ld 3, -[[OFFSET1]](1) +; CHECK: ld 3, -[[OFFSET2]](1) + diff --git a/test/CodeGen/PowerPC/complex-return.ll b/test/CodeGen/PowerPC/complex-return.ll new file mode 100644 index 0000000..f12152f --- /dev/null +++ b/test/CodeGen/PowerPC/complex-return.ll @@ -0,0 +1,55 @@ +; RUN: llc -mcpu=pwr7 -O0 < %s | FileCheck %s + +target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" +target triple = "powerpc64-unknown-linux-gnu" + +define { ppc_fp128, ppc_fp128 } @foo() nounwind { +entry: + %retval = alloca { ppc_fp128, ppc_fp128 }, align 16 + %x = alloca { ppc_fp128, ppc_fp128 }, align 16 + %real = getelementptr inbounds { ppc_fp128, ppc_fp128 }* %x, i32 0, i32 0 + %imag = getelementptr inbounds { ppc_fp128, ppc_fp128 }* %x, i32 0, i32 1 + store ppc_fp128 
0xM400C0000000000000000000000000000, ppc_fp128* %real + store ppc_fp128 0xMC00547AE147AE1483CA47AE147AE147A, ppc_fp128* %imag + %x.realp = getelementptr inbounds { ppc_fp128, ppc_fp128 }* %x, i32 0, i32 0 + %x.real = load ppc_fp128* %x.realp + %x.imagp = getelementptr inbounds { ppc_fp128, ppc_fp128 }* %x, i32 0, i32 1 + %x.imag = load ppc_fp128* %x.imagp + %real1 = getelementptr inbounds { ppc_fp128, ppc_fp128 }* %retval, i32 0, i32 0 + %imag2 = getelementptr inbounds { ppc_fp128, ppc_fp128 }* %retval, i32 0, i32 1 + store ppc_fp128 %x.real, ppc_fp128* %real1 + store ppc_fp128 %x.imag, ppc_fp128* %imag2 + %0 = load { ppc_fp128, ppc_fp128 }* %retval + ret { ppc_fp128, ppc_fp128 } %0 +} + +; CHECK: foo: +; CHECK: lfd 3 +; CHECK: lfd 4 +; CHECK: lfd 2 +; CHECK: lfd 1 + +define { float, float } @oof() nounwind { +entry: + %retval = alloca { float, float }, align 4 + %x = alloca { float, float }, align 4 + %real = getelementptr inbounds { float, float }* %x, i32 0, i32 0 + %imag = getelementptr inbounds { float, float }* %x, i32 0, i32 1 + store float 3.500000e+00, float* %real + store float 0xC00547AE20000000, float* %imag + %x.realp = getelementptr inbounds { float, float }* %x, i32 0, i32 0 + %x.real = load float* %x.realp + %x.imagp = getelementptr inbounds { float, float }* %x, i32 0, i32 1 + %x.imag = load float* %x.imagp + %real1 = getelementptr inbounds { float, float }* %retval, i32 0, i32 0 + %imag2 = getelementptr inbounds { float, float }* %retval, i32 0, i32 1 + store float %x.real, float* %real1 + store float %x.imag, float* %imag2 + %0 = load { float, float }* %retval + ret { float, float } %0 +} + +; CHECK: oof: +; CHECK: lfs 2 +; CHECK: lfs 1 + diff --git a/test/CodeGen/PowerPC/dbg.ll b/test/CodeGen/PowerPC/dbg.ll index e161cb0..8d87cf7 100644 --- a/test/CodeGen/PowerPC/dbg.ll +++ b/test/CodeGen/PowerPC/dbg.ll @@ -17,10 +17,8 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone !llvm.dbg.cu = !{!0} !0 = metadata !{i32 720913, i32 0, i32 12, metadata !"dbg.c", metadata !"/src", metadata !"clang version 3.1", i1 true, i1 true, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1} ; [ DW_TAG_compile_unit ] -!1 = metadata !{metadata !2} -!2 = metadata !{i32 0} -!3 = metadata !{metadata !4} -!4 = metadata !{metadata !5} +!1 = metadata !{i32 0} +!3 = metadata !{metadata !5} !5 = metadata !{i32 720942, i32 0, metadata !6, metadata !"main", metadata !"main", metadata !"", metadata !6, i32 1, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i32, i8**)* @main, null, null, metadata !13} ; [ DW_TAG_subprogram ] !6 = metadata !{i32 720937, metadata !"dbg.c", metadata !"/src", null} ; [ DW_TAG_file_type ] !7 = metadata !{i32 720917, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] diff --git a/test/CodeGen/PowerPC/float-asmprint.ll b/test/CodeGen/PowerPC/float-asmprint.ll new file mode 100644 index 0000000..c9dc028 --- /dev/null +++ b/test/CodeGen/PowerPC/float-asmprint.ll @@ -0,0 +1,34 @@ +; RUN: llc -mtriple=powerpc64-none-linux < %s | FileCheck %s + +; Check that all current floating-point types are correctly emitted to assembly +; on a big-endian target. x86_fp80 can't actually print for unrelated reasons, +; but that's not really a problem. 
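The integer constants these CHECK lines expect are simply the IEEE bit patterns of negative zero, printed at the width of each directive. A worked decoding (added here for illustration, not part of the patch):
; half   -0.0 -> 0x8000             -> .short 32768
; float  -0.0 -> 0x80000000         -> .long  2147483648
; double -0.0 -> 0x8000000000000000 -> .quad  -9223372036854775808 (printed as signed i64)
For the 128-bit types on a big-endian target, the sign-carrying quadword is emitted first, hence one negative .quad followed by .quad 0.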
+ +@var128 = global fp128 0xL00000000000000008000000000000000, align 16 +@varppc128 = global ppc_fp128 0xM80000000000000000000000000000000, align 16 +@var64 = global double -0.0, align 8 +@var32 = global float -0.0, align 4 +@var16 = global half -0.0, align 2 + +; CHECK: var128: +; CHECK-NEXT: .quad -9223372036854775808 # fp128 -0 +; CHECK-NEXT: .quad 0 +; CHECK-NEXT: .size + +; CHECK: varppc128: +; CHECK-NEXT: .quad -9223372036854775808 # ppc_fp128 -0 +; CHECK-NEXT: .quad 0 +; CHECK-NEXT: .size + +; CHECK: var64: +; CHECK-NEXT: .quad -9223372036854775808 # double -0 +; CHECK-NEXT: .size + +; CHECK: var32: +; CHECK-NEXT: .long 2147483648 # float -0 +; CHECK-NEXT: .size + +; CHECK: var16: +; CHECK-NEXT: .short 32768 # half -0 +; CHECK-NEXT: .size + diff --git a/test/CodeGen/PowerPC/fp128.ll b/test/CodeGen/PowerPC/fp128.ll deleted file mode 100644 index a0b06a4..0000000 --- a/test/CodeGen/PowerPC/fp128.ll +++ /dev/null @@ -1,8 +0,0 @@ -; RUN: llc -mtriple=powerpc64-none-linux < %s | FileCheck --check-prefix=BIGENDIAN %s - -@var = global fp128 0xL00000000000000008000000000000000 - -; CHECK-BIGENDIAN: var: -; CHECK-BIGENDIAN-NEXT: .quad -9223372036854775808 # fp128 -0 -; CHECK-BIGENDIAN-NEXT: .quad 0 - diff --git a/test/CodeGen/PowerPC/load-shift-combine.ll b/test/CodeGen/PowerPC/load-shift-combine.ll new file mode 100644 index 0000000..a5d1224 --- /dev/null +++ b/test/CodeGen/PowerPC/load-shift-combine.ll @@ -0,0 +1,34 @@ +; RUN: llc < %s + +; This used to cause a crash. A standard load is converted to a pre-increment +; load. Later the pre-increment load is combined with a subsequent SRL to +; produce a smaller load. This transform invalidly created a standard load +; and propagated the produced value into uses of both produced values of the +; pre-increment load. The result was a crash when attempting to process an +; add with a token-chain operand. 
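To make "both produced values" concrete: a pre-increment load defines two results, the loaded value and the updated base address, plus a chain. On PowerPC this is the update form, e.g. (sketch, registers illustrative):
; lbzu 4, 1(3)   ; r4 <- mem[r3+1] and, as a second def, r3 <- r3+1
A plain load has only the value result, so rewriting the combined node back into one silently drops the updated-pointer def, which is roughly how the crash described above arose.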
+ +%struct.Info = type { i32, i32, i8*, i8*, i8*, [32 x i8*], i64, [32 x i64], i64, i64, i64, [32 x i64] } +%struct.S1847 = type { [12 x i8], [4 x i8], [8 x i8], [4 x i8], [8 x i8], [2 x i8], i8, [4 x i64], i8, [3 x i8], [4 x i8], i8, i16, [4 x %struct.anon.76], i16, i8, i8* } +%struct.anon.76 = type { i32 } +@info = common global %struct.Info zeroinitializer, align 8 +@fails = common global i32 0, align 4 +@a1847 = external global [5 x %struct.S1847] +define void @test1847() nounwind { +entry: + %j = alloca i32, align 4 + %0 = load i64* getelementptr inbounds (%struct.Info* @info, i32 0, i32 8), align 8 + %1 = load i32* @fails, align 4 + %bf.load1 = load i96* bitcast (%struct.S1847* getelementptr inbounds ([5 x %struct.S1847]* @a1847, i32 0, i64 2) to i96*), align 8 + %bf.clear2 = and i96 %bf.load1, 302231454903657293676543 + %bf.set3 = or i96 %bf.clear2, -38383394772764476296921088 + store i96 %bf.set3, i96* bitcast (%struct.S1847* getelementptr inbounds ([5 x %struct.S1847]* @a1847, i32 0, i64 2) to i96*), align 8 + %2 = load i32* %j, align 4 + %3 = load i32* %j, align 4 + %inc11 = add nsw i32 %3, 1 + store i32 %inc11, i32* %j, align 4 + %bf.load15 = load i96* bitcast (%struct.S1847* getelementptr inbounds ([5 x %struct.S1847]* @a1847, i32 0, i64 2) to i96*), align 8 + %bf.clear16 = and i96 %bf.load15, -18446744069414584321 + %bf.set17 = or i96 %bf.clear16, 18446743532543672320 + store i96 %bf.set17, i96* bitcast (%struct.S1847* getelementptr inbounds ([5 x %struct.S1847]* @a1847, i32 0, i64 2) to i96*), align 8 + ret void +} diff --git a/test/CodeGen/PowerPC/mcm-1.ll b/test/CodeGen/PowerPC/mcm-1.ll index 62fe88c..a57fb9d 100644 --- a/test/CodeGen/PowerPC/mcm-1.ll +++ b/test/CodeGen/PowerPC/mcm-1.ll @@ -1,6 +1,7 @@ ; RUN: llc -mcpu=pwr7 -O0 -code-model=medium <%s | FileCheck %s +; RUN: llc -mcpu=pwr7 -O0 -code-model=large <%s | FileCheck %s -; Test correct code generation for medium code model (32-bit TOC offsets) +; Test correct code generation for medium and large code model ; for loading and storing an external variable. target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" diff --git a/test/CodeGen/PowerPC/mcm-10.ll b/test/CodeGen/PowerPC/mcm-10.ll new file mode 100644 index 0000000..4bec3e1 --- /dev/null +++ b/test/CodeGen/PowerPC/mcm-10.ll @@ -0,0 +1,25 @@ +; RUN: llc -mcpu=pwr7 -O1 -code-model=medium <%s | FileCheck %s + +; Test peephole optimization for medium code model (32-bit TOC offsets) +; for loading and storing a static variable scoped to a function. 
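Concretely, the -O1 peephole under test folds the low half of the TOC-relative address into the memory operand's displacement; a sketch with illustrative register numbers:
; -O0 medium model (see mcm-2.ll):
;   addis 3, 2, si@toc@ha
;   addi  3, 3, si@toc@l
;   lwz   4, 0(3)
; -O1 after the peephole (this test):
;   addis 3, 2, si@toc@ha
;   lwz   4, si@toc@l(3)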
+ +target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" +target triple = "powerpc64-unknown-linux-gnu" + +@test_fn_static.si = internal global i32 0, align 4 + +define signext i32 @test_fn_static() nounwind { +entry: + %0 = load i32* @test_fn_static.si, align 4 + %inc = add nsw i32 %0, 1 + store i32 %inc, i32* @test_fn_static.si, align 4 + ret i32 %0 +} + +; CHECK: test_fn_static: +; CHECK: addis [[REG1:[0-9]+]], 2, [[VAR:[a-z0-9A-Z_.]+]]@toc@ha +; CHECK: lwz {{[0-9]+}}, [[VAR]]@toc@l([[REG1]]) +; CHECK: stw {{[0-9]+}}, [[VAR]]@toc@l([[REG1]]) +; CHECK: .type [[VAR]],@object +; CHECK: .local [[VAR]] +; CHECK: .comm [[VAR]],4,4 diff --git a/test/CodeGen/PowerPC/mcm-11.ll b/test/CodeGen/PowerPC/mcm-11.ll new file mode 100644 index 0000000..f2bc4c9 --- /dev/null +++ b/test/CodeGen/PowerPC/mcm-11.ll @@ -0,0 +1,27 @@ +; RUN: llc -mcpu=pwr7 -O1 -code-model=medium <%s | FileCheck %s + +; Test peephole optimization for medium code model (32-bit TOC offsets) +; for loading and storing a file-scope static variable. + +target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" +target triple = "powerpc64-unknown-linux-gnu" + +@gi = global i32 5, align 4 + +define signext i32 @test_file_static() nounwind { +entry: + %0 = load i32* @gi, align 4 + %inc = add nsw i32 %0, 1 + store i32 %inc, i32* @gi, align 4 + ret i32 %0 +} + +; CHECK: test_file_static: +; CHECK: addis [[REG1:[0-9]+]], 2, [[VAR:[a-z0-9A-Z_.]+]]@toc@ha +; CHECK: lwz {{[0-9]+}}, [[VAR]]@toc@l([[REG1]]) +; CHECK: stw {{[0-9]+}}, [[VAR]]@toc@l([[REG1]]) +; CHECK: .type [[VAR]],@object +; CHECK: .data +; CHECK: .globl [[VAR]] +; CHECK: [[VAR]]: +; CHECK: .long 5 diff --git a/test/CodeGen/PowerPC/mcm-12.ll b/test/CodeGen/PowerPC/mcm-12.ll new file mode 100644 index 0000000..911305d --- /dev/null +++ b/test/CodeGen/PowerPC/mcm-12.ll @@ -0,0 +1,18 @@ +; RUN: llc -mcpu=pwr7 -O1 -code-model=medium <%s | FileCheck %s + +; Test peephole optimization for medium code model (32-bit TOC offsets) +; for loading a value from the constant pool (TOC-relative). + +target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" +target triple = "powerpc64-unknown-linux-gnu" + +define double @test_double_const() nounwind { +entry: + ret double 0x3F4FD4920B498CF0 +} + +; CHECK: [[VAR:[a-z0-9A-Z_.]+]]: +; CHECK: .quad 4562098671269285104 +; CHECK: test_double_const: +; CHECK: addis [[REG1:[0-9]+]], 2, [[VAR]]@toc@ha +; CHECK: lfd {{[0-9]+}}, [[VAR]]@toc@l([[REG1]]) diff --git a/test/CodeGen/PowerPC/mcm-2.ll b/test/CodeGen/PowerPC/mcm-2.ll index 45df0ab..f0dff4c 100644 --- a/test/CodeGen/PowerPC/mcm-2.ll +++ b/test/CodeGen/PowerPC/mcm-2.ll @@ -1,6 +1,7 @@ -; RUN: llc -mcpu=pwr7 -O0 -code-model=medium <%s | FileCheck %s +; RUN: llc -mcpu=pwr7 -O0 -code-model=medium <%s | FileCheck -check-prefix=MEDIUM %s +; RUN: llc -mcpu=pwr7 -O0 -code-model=large <%s | FileCheck -check-prefix=LARGE %s -; Test correct code generation for medium code model (32-bit TOC offsets) +; Test correct code generation for medium and large code model ; for loading and storing a static variable scoped to a function. 
target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" @@ -16,11 +17,21 @@ entry: ret i32 %0 } -; CHECK: test_fn_static: -; CHECK: addis [[REG1:[0-9]+]], 2, [[VAR:[a-z0-9A-Z_.]+]]@toc@ha -; CHECK: addi [[REG2:[0-9]+]], [[REG1]], [[VAR]]@toc@l -; CHECK: lwz {{[0-9]+}}, 0([[REG2]]) -; CHECK: stw {{[0-9]+}}, 0([[REG2]]) -; CHECK: .type [[VAR]],@object -; CHECK: .local [[VAR]] -; CHECK: .comm [[VAR]],4,4 +; MEDIUM: test_fn_static: +; MEDIUM: addis [[REG1:[0-9]+]], 2, [[VAR:[a-z0-9A-Z_.]+]]@toc@ha +; MEDIUM: addi [[REG2:[0-9]+]], [[REG1]], [[VAR]]@toc@l +; MEDIUM: lwz {{[0-9]+}}, 0([[REG2]]) +; MEDIUM: stw {{[0-9]+}}, 0([[REG2]]) +; MEDIUM: .type [[VAR]],@object +; MEDIUM: .local [[VAR]] +; MEDIUM: .comm [[VAR]],4,4 + +; LARGE: test_fn_static: +; LARGE: addis [[REG1:[0-9]+]], 2, [[VAR:[a-z0-9A-Z_.]+]]@toc@ha +; LARGE: ld [[REG2:[0-9]+]], [[VAR]]@toc@l([[REG1]]) +; LARGE: lwz {{[0-9]+}}, 0([[REG2]]) +; LARGE: stw {{[0-9]+}}, 0([[REG2]]) +; LARGE: .type [[VAR]],@object +; LARGE: .local [[VAR]] +; LARGE: .comm [[VAR]],4,4 + diff --git a/test/CodeGen/PowerPC/mcm-3.ll b/test/CodeGen/PowerPC/mcm-3.ll index 0e7bbe7..b790550 100644 --- a/test/CodeGen/PowerPC/mcm-3.ll +++ b/test/CodeGen/PowerPC/mcm-3.ll @@ -1,6 +1,7 @@ -; RUN: llc -mcpu=pwr7 -O0 -code-model=medium <%s | FileCheck %s +; RUN: llc -mcpu=pwr7 -O0 -code-model=medium <%s | FileCheck -check-prefix=MEDIUM %s +; RUN: llc -mcpu=pwr7 -O0 -code-model=large <%s | FileCheck -check-prefix=LARGE %s -; Test correct code generation for medium code model (32-bit TOC offsets) +; Test correct code generation for medium and large code model ; for loading and storing a file-scope static variable. target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" @@ -16,13 +17,25 @@ entry: ret i32 %0 } -; CHECK: test_file_static: -; CHECK: addis [[REG1:[0-9]+]], 2, [[VAR:[a-z0-9A-Z_.]+]]@toc@ha -; CHECK: addi [[REG2:[0-9]+]], [[REG1]], [[VAR]]@toc@l -; CHECK: lwz {{[0-9]+}}, 0([[REG2]]) -; CHECK: stw {{[0-9]+}}, 0([[REG2]]) -; CHECK: .type [[VAR]],@object -; CHECK: .data -; CHECK: .globl [[VAR]] -; CHECK: [[VAR]]: -; CHECK: .long 5 +; MEDIUM: test_file_static: +; MEDIUM: addis [[REG1:[0-9]+]], 2, [[VAR:[a-z0-9A-Z_.]+]]@toc@ha +; MEDIUM: addi [[REG2:[0-9]+]], [[REG1]], [[VAR]]@toc@l +; MEDIUM: lwz {{[0-9]+}}, 0([[REG2]]) +; MEDIUM: stw {{[0-9]+}}, 0([[REG2]]) +; MEDIUM: .type [[VAR]],@object +; MEDIUM: .data +; MEDIUM: .globl [[VAR]] +; MEDIUM: [[VAR]]: +; MEDIUM: .long 5 + +; LARGE: test_file_static: +; LARGE: addis [[REG1:[0-9]+]], 2, [[VAR:[a-z0-9A-Z_.]+]]@toc@ha +; LARGE: ld [[REG2:[0-9]+]], [[VAR]]@toc@l([[REG1]]) +; LARGE: lwz {{[0-9]+}}, 0([[REG2]]) +; LARGE: stw {{[0-9]+}}, 0([[REG2]]) +; LARGE: .type [[VAR]],@object +; LARGE: .data +; LARGE: .globl [[VAR]] +; LARGE: [[VAR]]: +; LARGE: .long 5 + diff --git a/test/CodeGen/PowerPC/mcm-4.ll b/test/CodeGen/PowerPC/mcm-4.ll index db36d0b..47c60c9 100644 --- a/test/CodeGen/PowerPC/mcm-4.ll +++ b/test/CodeGen/PowerPC/mcm-4.ll @@ -1,6 +1,7 @@ -; RUN: llc -mcpu=pwr7 -O0 -code-model=medium <%s | FileCheck %s +; RUN: llc -mcpu=pwr7 -O0 -code-model=medium <%s | FileCheck -check-prefix=MEDIUM %s +; RUN: llc -mcpu=pwr7 -O0 -code-model=large <%s | FileCheck -check-prefix=LARGE %s -; Test correct code generation for medium code model (32-bit TOC offsets) +; Test correct code generation for medium and large code model ; for loading a value from the constant pool (TOC-relative). 
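The medium/large distinction that the MEDIUM and LARGE prefixes encode comes down to a single instruction: medium computes the address from a 32-bit TOC offset, while large loads a full 64-bit address out of a TOC entry. A sketch (register numbers illustrative):
; MEDIUM: addis 3, 2, sym@toc@ha
;         addi  3, 3, sym@toc@l     ; address = TOC pointer + 32-bit offset
; LARGE:  addis 3, 2, sym@toc@ha
;         ld    3, sym@toc@l(3)     ; address loaded from the TOC entry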
target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" @@ -11,9 +12,16 @@ entry: ret double 0x3F4FD4920B498CF0 } -; CHECK: [[VAR:[a-z0-9A-Z_.]+]]: -; CHECK: .quad 4562098671269285104 -; CHECK: test_double_const: -; CHECK: addis [[REG1:[0-9]+]], 2, [[VAR]]@toc@ha -; CHECK: addi [[REG2:[0-9]+]], [[REG1]], [[VAR]]@toc@l -; CHECK: lfd {{[0-9]+}}, 0([[REG2]]) +; MEDIUM: [[VAR:[a-z0-9A-Z_.]+]]: +; MEDIUM: .quad 4562098671269285104 +; MEDIUM: test_double_const: +; MEDIUM: addis [[REG1:[0-9]+]], 2, [[VAR]]@toc@ha +; MEDIUM: addi [[REG2:[0-9]+]], [[REG1]], [[VAR]]@toc@l +; MEDIUM: lfd {{[0-9]+}}, 0([[REG2]]) + +; LARGE: [[VAR:[a-z0-9A-Z_.]+]]: +; LARGE: .quad 4562098671269285104 +; LARGE: test_double_const: +; LARGE: addis [[REG1:[0-9]+]], 2, [[VAR]]@toc@ha +; LARGE: ld [[REG2:[0-9]+]], [[VAR]]@toc@l([[REG1]]) +; LARGE: lfd {{[0-9]+}}, 0([[REG2]]) diff --git a/test/CodeGen/PowerPC/mcm-5.ll b/test/CodeGen/PowerPC/mcm-5.ll index 10d89f5..1be27b7 100644 --- a/test/CodeGen/PowerPC/mcm-5.ll +++ b/test/CodeGen/PowerPC/mcm-5.ll @@ -1,6 +1,7 @@ ; RUN: llc -mcpu=pwr7 -O0 -code-model=medium <%s | FileCheck %s +; RUN: llc -mcpu=pwr7 -O0 -code-model=large <%s | FileCheck %s -; Test correct code generation for medium code model (32-bit TOC offsets) +; Test correct code generation for medium and large code model ; for loading the address of a jump table from the TOC. target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" diff --git a/test/CodeGen/PowerPC/mcm-6.ll b/test/CodeGen/PowerPC/mcm-6.ll index 0a7fa76..35efaaa 100644 --- a/test/CodeGen/PowerPC/mcm-6.ll +++ b/test/CodeGen/PowerPC/mcm-6.ll @@ -1,6 +1,7 @@ ; RUN: llc -mcpu=pwr7 -O0 -code-model=medium < %s | FileCheck %s +; RUN: llc -mcpu=pwr7 -O0 -code-model=large < %s | FileCheck %s -; Test correct code generation for medium code model (32-bit TOC offsets) +; Test correct code generation for medium and large code model ; for loading and storing a tentatively defined variable. target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" diff --git a/test/CodeGen/PowerPC/mcm-7.ll b/test/CodeGen/PowerPC/mcm-7.ll index 0e9fa2b..0dd39ee 100644 --- a/test/CodeGen/PowerPC/mcm-7.ll +++ b/test/CodeGen/PowerPC/mcm-7.ll @@ -1,6 +1,7 @@ ; RUN: llc -mcpu=pwr7 -O0 -code-model=medium < %s | FileCheck %s +; RUN: llc -mcpu=pwr7 -O0 -code-model=large < %s | FileCheck %s -; Test correct code generation for medium code model (32-bit TOC offsets) +; Test correct code generation for medium and large code model ; for loading a function address. target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" diff --git a/test/CodeGen/PowerPC/mcm-8.ll b/test/CodeGen/PowerPC/mcm-8.ll index 9381a97..3ece786 100644 --- a/test/CodeGen/PowerPC/mcm-8.ll +++ b/test/CodeGen/PowerPC/mcm-8.ll @@ -1,6 +1,7 @@ ; RUN: llc -mcpu=pwr7 -O0 -code-model=medium < %s | FileCheck %s +; RUN: llc -mcpu=pwr7 -O0 -code-model=large < %s | FileCheck %s -; Test correct code generation for medium code model (32-bit TOC offsets) +; Test correct code generation for medium and large code model ; for loading a variable with available-externally linkage. 
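Note that mcm-1 and mcm-5 through mcm-9 reuse one CHECK prefix for both RUN lines: for jump tables and for external, available-externally, tentatively defined, and aliased symbols, the medium model already fetches the address from the TOC with an ld, so the large model emits the identical sequence, e.g. (illustrative):
; addis 3, 2, sym@toc@ha
; ld    3, sym@toc@l(3)
Only the tests where medium can use addi instead (mcm-2 through mcm-4 and their -O1 variants) need separate MEDIUM and LARGE prefixes.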
target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" diff --git a/test/CodeGen/PowerPC/mcm-9.ll b/test/CodeGen/PowerPC/mcm-9.ll index 422607c..f366f45 100644 --- a/test/CodeGen/PowerPC/mcm-9.ll +++ b/test/CodeGen/PowerPC/mcm-9.ll @@ -1,6 +1,7 @@ ; RUN: llc -mcpu=pwr7 -O0 -code-model=medium <%s | FileCheck %s +; RUN: llc -mcpu=pwr7 -O0 -code-model=large <%s | FileCheck %s -; Test correct code generation for medium code model (32-bit TOC offsets) +; Test correct code generation for medium and large code model ; for loading and storing an aliased external variable. target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" diff --git a/test/CodeGen/PowerPC/mcm-obj-2.ll b/test/CodeGen/PowerPC/mcm-obj-2.ll new file mode 100644 index 0000000..2dd1718 --- /dev/null +++ b/test/CodeGen/PowerPC/mcm-obj-2.ll @@ -0,0 +1,77 @@ +; RUN: llc -O1 -mcpu=pwr7 -code-model=medium -filetype=obj %s -o - | \ +; RUN: elf-dump --dump-section-data | FileCheck %s + +; FIXME: When asm-parse is available, could make this an assembly test. + +target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" +target triple = "powerpc64-unknown-linux-gnu" + +@test_fn_static.si = internal global i32 0, align 4 + +define signext i32 @test_fn_static() nounwind { +entry: + %0 = load i32* @test_fn_static.si, align 4 + %inc = add nsw i32 %0, 1 + store i32 %inc, i32* @test_fn_static.si, align 4 + ret i32 %0 +} + +; Verify generation of R_PPC64_TOC16_HA and R_PPC64_TOC16_LO for +; accessing function-scoped variable si. +; +; CHECK: Relocation 0 +; CHECK-NEXT: 'r_offset' +; CHECK-NEXT: 'r_sym', 0x[[SYM2:[0-9]+]] +; CHECK-NEXT: 'r_type', 0x00000032 +; CHECK: Relocation 1 +; CHECK-NEXT: 'r_offset' +; CHECK-NEXT: 'r_sym', 0x[[SYM2]] +; CHECK-NEXT: 'r_type', 0x00000030 +; CHECK: Relocation 2 +; CHECK-NEXT: 'r_offset' +; CHECK-NEXT: 'r_sym', 0x[[SYM2]] +; CHECK-NEXT: 'r_type', 0x00000030 + +@gi = global i32 5, align 4 + +define signext i32 @test_file_static() nounwind { +entry: + %0 = load i32* @gi, align 4 + %inc = add nsw i32 %0, 1 + store i32 %inc, i32* @gi, align 4 + ret i32 %0 +} + +; Verify generation of R_PPC64_TOC16_HA and R_PPC64_TOC16_LO for +; accessing file-scope variable gi. +; +; CHECK: Relocation 3 +; CHECK-NEXT: 'r_offset' +; CHECK-NEXT: 'r_sym', 0x[[SYM3:[0-9]+]] +; CHECK-NEXT: 'r_type', 0x00000032 +; CHECK: Relocation 4 +; CHECK-NEXT: 'r_offset' +; CHECK-NEXT: 'r_sym', 0x[[SYM3]] +; CHECK-NEXT: 'r_type', 0x00000030 +; CHECK: Relocation 5 +; CHECK-NEXT: 'r_offset' +; CHECK-NEXT: 'r_sym', 0x[[SYM3]] +; CHECK-NEXT: 'r_type', 0x00000030 + +define double @test_double_const() nounwind { +entry: + ret double 0x3F4FD4920B498CF0 +} + +; Verify generation of R_PPC64_TOC16_HA and R_PPC64_TOC16_LO for +; accessing a constant. 
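When reading the r_type values in these elf-dump checks, the hex constants decode, per the 64-bit PowerPC ELF ABI, as:
; 0x00000030 = 48 = R_PPC64_TOC16_LO     (addi/lwz displacement form)
; 0x00000032 = 50 = R_PPC64_TOC16_HA     (addis high-adjusted half)
; 0x00000040 = 64 = R_PPC64_TOC16_LO_DS  (ld, DS-form displacement)
so a 0x32/0x30 pair is the medium-model addis/addi (or addis/lwz) sequence, and a 0x32/0x40 pair is the ld-based form used by the large model and for external symbols.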
+; +; CHECK: Relocation 6 +; CHECK-NEXT: 'r_offset' +; CHECK-NEXT: 'r_sym', 0x[[SYM4:[0-9]+]] +; CHECK-NEXT: 'r_type', 0x00000032 +; CHECK: Relocation 7 +; CHECK-NEXT: 'r_offset' +; CHECK-NEXT: 'r_sym', 0x[[SYM4]] +; CHECK-NEXT: 'r_type', 0x00000030 + diff --git a/test/CodeGen/PowerPC/mcm-obj.ll b/test/CodeGen/PowerPC/mcm-obj.ll index ec1b7b0..117c3b3 100644 --- a/test/CodeGen/PowerPC/mcm-obj.ll +++ b/test/CodeGen/PowerPC/mcm-obj.ll @@ -1,5 +1,7 @@ ; RUN: llc -O0 -mcpu=pwr7 -code-model=medium -filetype=obj %s -o - | \ -; RUN: elf-dump --dump-section-data | FileCheck %s +; RUN: elf-dump --dump-section-data | FileCheck -check-prefix=MEDIUM %s +; RUN: llc -O0 -mcpu=pwr7 -code-model=large -filetype=obj %s -o - | \ +; RUN: elf-dump --dump-section-data | FileCheck -check-prefix=LARGE %s ; FIXME: When asm-parse is available, could make this an assembly test. @@ -19,15 +21,25 @@ entry: ; Verify generation of R_PPC64_TOC16_HA and R_PPC64_TOC16_LO_DS for ; accessing external variable ei. ; -; CHECK: '.rela.text' -; CHECK: Relocation 0 -; CHECK-NEXT: 'r_offset' -; CHECK-NEXT: 'r_sym', 0x[[SYM1:[0-9]+]] -; CHECK-NEXT: 'r_type', 0x00000032 -; CHECK: Relocation 1 -; CHECK-NEXT: 'r_offset' -; CHECK-NEXT: 'r_sym', 0x[[SYM1]] -; CHECK-NEXT: 'r_type', 0x00000040 +; MEDIUM: '.rela.text' +; MEDIUM: Relocation 0 +; MEDIUM-NEXT: 'r_offset' +; MEDIUM-NEXT: 'r_sym', 0x[[SYM1:[0-9]+]] +; MEDIUM-NEXT: 'r_type', 0x00000032 +; MEDIUM: Relocation 1 +; MEDIUM-NEXT: 'r_offset' +; MEDIUM-NEXT: 'r_sym', 0x[[SYM1]] +; MEDIUM-NEXT: 'r_type', 0x00000040 +; +; LARGE: '.rela.text' +; LARGE: Relocation 0 +; LARGE-NEXT: 'r_offset' +; LARGE-NEXT: 'r_sym', 0x[[SYM1:[0-9]+]] +; LARGE-NEXT: 'r_type', 0x00000032 +; LARGE: Relocation 1 +; LARGE-NEXT: 'r_offset' +; LARGE-NEXT: 'r_sym', 0x[[SYM1]] +; LARGE-NEXT: 'r_type', 0x00000040 @test_fn_static.si = internal global i32 0, align 4 @@ -42,14 +54,26 @@ entry: ; Verify generation of R_PPC64_TOC16_HA and R_PPC64_TOC16_LO for ; accessing function-scoped variable si. ; -; CHECK: Relocation 2 -; CHECK-NEXT: 'r_offset' -; CHECK-NEXT: 'r_sym', 0x[[SYM2:[0-9]+]] -; CHECK-NEXT: 'r_type', 0x00000032 -; CHECK: Relocation 3 -; CHECK-NEXT: 'r_offset' -; CHECK-NEXT: 'r_sym', 0x[[SYM2]] -; CHECK-NEXT: 'r_type', 0x00000030 +; MEDIUM: Relocation 2 +; MEDIUM-NEXT: 'r_offset' +; MEDIUM-NEXT: 'r_sym', 0x[[SYM2:[0-9]+]] +; MEDIUM-NEXT: 'r_type', 0x00000032 +; MEDIUM: Relocation 3 +; MEDIUM-NEXT: 'r_offset' +; MEDIUM-NEXT: 'r_sym', 0x[[SYM2]] +; MEDIUM-NEXT: 'r_type', 0x00000030 +; +; Verify generation of R_PPC64_TOC16_HA and R_PPC64_TOC16_LO_DS for +; accessing function-scoped variable si. +; +; LARGE: Relocation 2 +; LARGE-NEXT: 'r_offset' +; LARGE-NEXT: 'r_sym', 0x[[SYM2:[0-9]+]] +; LARGE-NEXT: 'r_type', 0x00000032 +; LARGE: Relocation 3 +; LARGE-NEXT: 'r_offset' +; LARGE-NEXT: 'r_sym', 0x[[SYM2]] +; LARGE-NEXT: 'r_type', 0x00000040 @gi = global i32 5, align 4 @@ -64,14 +88,26 @@ entry: ; Verify generation of R_PPC64_TOC16_HA and R_PPC64_TOC16_LO for ; accessing file-scope variable gi. 
; -; CHECK: Relocation 4 -; CHECK-NEXT: 'r_offset' -; CHECK-NEXT: 'r_sym', 0x[[SYM3:[0-9]+]] -; CHECK-NEXT: 'r_type', 0x00000032 -; CHECK: Relocation 5 -; CHECK-NEXT: 'r_offset' -; CHECK-NEXT: 'r_sym', 0x[[SYM3]] -; CHECK-NEXT: 'r_type', 0x00000030 +; MEDIUM: Relocation 4 +; MEDIUM-NEXT: 'r_offset' +; MEDIUM-NEXT: 'r_sym', 0x[[SYM3:[0-9]+]] +; MEDIUM-NEXT: 'r_type', 0x00000032 +; MEDIUM: Relocation 5 +; MEDIUM-NEXT: 'r_offset' +; MEDIUM-NEXT: 'r_sym', 0x[[SYM3]] +; MEDIUM-NEXT: 'r_type', 0x00000030 +; +; Verify generation of R_PPC64_TOC16_HA and R_PPC64_TOC16_LO_DS for +; accessing file-scope variable gi. +; +; LARGE: Relocation 4 +; LARGE-NEXT: 'r_offset' +; LARGE-NEXT: 'r_sym', 0x[[SYM3:[0-9]+]] +; LARGE-NEXT: 'r_type', 0x00000032 +; LARGE: Relocation 5 +; LARGE-NEXT: 'r_offset' +; LARGE-NEXT: 'r_sym', 0x[[SYM3]] +; LARGE-NEXT: 'r_type', 0x00000040 define double @test_double_const() nounwind { entry: @@ -81,14 +117,26 @@ entry: ; Verify generation of R_PPC64_TOC16_HA and R_PPC64_TOC16_LO for ; accessing a constant. ; -; CHECK: Relocation 6 -; CHECK-NEXT: 'r_offset' -; CHECK-NEXT: 'r_sym', 0x[[SYM4:[0-9]+]] -; CHECK-NEXT: 'r_type', 0x00000032 -; CHECK: Relocation 7 -; CHECK-NEXT: 'r_offset' -; CHECK-NEXT: 'r_sym', 0x[[SYM4]] -; CHECK-NEXT: 'r_type', 0x00000030 +; MEDIUM: Relocation 6 +; MEDIUM-NEXT: 'r_offset' +; MEDIUM-NEXT: 'r_sym', 0x[[SYM4:[0-9]+]] +; MEDIUM-NEXT: 'r_type', 0x00000032 +; MEDIUM: Relocation 7 +; MEDIUM-NEXT: 'r_offset' +; MEDIUM-NEXT: 'r_sym', 0x[[SYM4]] +; MEDIUM-NEXT: 'r_type', 0x00000030 +; +; Verify generation of R_PPC64_TOC16_HA and R_PPC64_TOC16_LO_DS for +; accessing a constant. +; +; LARGE: Relocation 6 +; LARGE-NEXT: 'r_offset' +; LARGE-NEXT: 'r_sym', 0x[[SYM4:[0-9]+]] +; LARGE-NEXT: 'r_type', 0x00000032 +; LARGE: Relocation 7 +; LARGE-NEXT: 'r_offset' +; LARGE-NEXT: 'r_sym', 0x[[SYM4]] +; LARGE-NEXT: 'r_type', 0x00000040 define signext i32 @test_jump_table(i32 signext %i) nounwind { entry: @@ -137,14 +185,23 @@ sw.epilog: ; preds = %sw.bb3, %sw.default ; Verify generation of R_PPC64_TOC16_HA and R_PPC64_TOC16_LO_DS for ; accessing a jump table address. ; -; CHECK: Relocation 8 -; CHECK-NEXT: 'r_offset' -; CHECK-NEXT: 'r_sym', 0x[[SYM5:[0-9]+]] -; CHECK-NEXT: 'r_type', 0x00000032 -; CHECK: Relocation 9 -; CHECK-NEXT: 'r_offset' -; CHECK-NEXT: 'r_sym', 0x[[SYM5]] -; CHECK-NEXT: 'r_type', 0x00000040 +; MEDIUM: Relocation 8 +; MEDIUM-NEXT: 'r_offset' +; MEDIUM-NEXT: 'r_sym', 0x[[SYM5:[0-9]+]] +; MEDIUM-NEXT: 'r_type', 0x00000032 +; MEDIUM: Relocation 9 +; MEDIUM-NEXT: 'r_offset' +; MEDIUM-NEXT: 'r_sym', 0x[[SYM5]] +; MEDIUM-NEXT: 'r_type', 0x00000040 +; +; LARGE: Relocation 8 +; LARGE-NEXT: 'r_offset' +; LARGE-NEXT: 'r_sym', 0x[[SYM5:[0-9]+]] +; LARGE-NEXT: 'r_type', 0x00000032 +; LARGE: Relocation 9 +; LARGE-NEXT: 'r_offset' +; LARGE-NEXT: 'r_sym', 0x[[SYM5]] +; LARGE-NEXT: 'r_type', 0x00000040 @ti = common global i32 0, align 4 @@ -159,14 +216,23 @@ entry: ; Verify generation of R_PPC64_TOC16_HA and R_PPC64_TOC16_LO_DS for ; accessing tentatively declared variable ti. 
; -; CHECK: Relocation 10 -; CHECK-NEXT: 'r_offset' -; CHECK-NEXT: 'r_sym', 0x[[SYM6:[0-9]+]] -; CHECK-NEXT: 'r_type', 0x00000032 -; CHECK: Relocation 11 -; CHECK-NEXT: 'r_offset' -; CHECK-NEXT: 'r_sym', 0x[[SYM6]] -; CHECK-NEXT: 'r_type', 0x00000040 +; MEDIUM: Relocation 10 +; MEDIUM-NEXT: 'r_offset' +; MEDIUM-NEXT: 'r_sym', 0x[[SYM6:[0-9]+]] +; MEDIUM-NEXT: 'r_type', 0x00000032 +; MEDIUM: Relocation 11 +; MEDIUM-NEXT: 'r_offset' +; MEDIUM-NEXT: 'r_sym', 0x[[SYM6]] +; MEDIUM-NEXT: 'r_type', 0x00000040 +; +; LARGE: Relocation 10 +; LARGE-NEXT: 'r_offset' +; LARGE-NEXT: 'r_sym', 0x[[SYM6:[0-9]+]] +; LARGE-NEXT: 'r_type', 0x00000032 +; LARGE: Relocation 11 +; LARGE-NEXT: 'r_offset' +; LARGE-NEXT: 'r_sym', 0x[[SYM6]] +; LARGE-NEXT: 'r_type', 0x00000040 define i8* @test_fnaddr() nounwind { entry: @@ -182,12 +248,21 @@ declare signext i32 @foo(i32 signext) ; Verify generation of R_PPC64_TOC16_HA and R_PPC64_TOC16_LO_DS for ; accessing function address foo. ; -; CHECK: Relocation 12 -; CHECK-NEXT: 'r_offset' -; CHECK-NEXT: 'r_sym', 0x[[SYM7:[0-9]+]] -; CHECK-NEXT: 'r_type', 0x00000032 -; CHECK: Relocation 13 -; CHECK-NEXT: 'r_offset' -; CHECK-NEXT: 'r_sym', 0x[[SYM7]] -; CHECK-NEXT: 'r_type', 0x00000040 +; MEDIUM: Relocation 12 +; MEDIUM-NEXT: 'r_offset' +; MEDIUM-NEXT: 'r_sym', 0x[[SYM7:[0-9]+]] +; MEDIUM-NEXT: 'r_type', 0x00000032 +; MEDIUM: Relocation 13 +; MEDIUM-NEXT: 'r_offset' +; MEDIUM-NEXT: 'r_sym', 0x[[SYM7]] +; MEDIUM-NEXT: 'r_type', 0x00000040 +; +; LARGE: Relocation 12 +; LARGE-NEXT: 'r_offset' +; LARGE-NEXT: 'r_sym', 0x[[SYM7:[0-9]+]] +; LARGE-NEXT: 'r_type', 0x00000032 +; LARGE: Relocation 13 +; LARGE-NEXT: 'r_offset' +; LARGE-NEXT: 'r_sym', 0x[[SYM7]] +; LARGE-NEXT: 'r_type', 0x00000040 diff --git a/test/CodeGen/ARM/misched-inorder-latency.ll b/test/CodeGen/PowerPC/misched-inorder-latency.ll index 8c06b4c..8fae7ad 100644 --- a/test/CodeGen/ARM/misched-inorder-latency.ll +++ b/test/CodeGen/PowerPC/misched-inorder-latency.ll @@ -1,15 +1,15 @@ -; RUN: llc < %s -enable-misched -march=thumb -mcpu=swift \ -; RUN: -pre-RA-sched=source -scheditins=false -ilp-window=0 \ +; RUN: llc < %s -enable-misched -pre-RA-sched=source -scheditins=false \ ; RUN: -disable-ifcvt-triangle-false -disable-post-ra | FileCheck %s ; -; For these tests, we set -ilp-window=0 to simulate in order processor. +target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" +target triple = "powerpc64-bgq-linux" -; %val1 is a 3-cycle load live out of %entry. It should be hoisted +; %val1 is a load live out of %entry. It should be hoisted ; above the add. -; CHECK: @testload +; CHECK: testload: ; CHECK: %entry -; CHECK: ldr -; CHECK: adds +; CHECK: lwz +; CHECK: addi ; CHECK: bne ; CHECK: %true define i32 @testload(i32 *%ptr, i32 %sumin) { @@ -34,15 +34,22 @@ end: ; The prefetch gets a default latency of 3 cycles and should be hoisted ; above the add. 
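As a reading aid for the rewritten testprefetch below: the operands of the prefetch intrinsic are address, rw, locality, and cache type, so the call in this test requests a read (rw = 0) of maximal temporal locality (3) against the data cache (1):
; declare void @llvm.prefetch(i8* <addr>, i32 <rw: 0=read, 1=write>, i32 <locality: 0..3>, i32 <cache: 0=instruction, 1=data>)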
; -; CHECK: @testprefetch +; CHECK: testprefetch: ; CHECK: %entry -; CHECK: pld -; CHECK: adds -; CHECK: bx +; CHECK: dcbt +; CHECK: addi +; CHECK: blr define i32 @testprefetch(i8 *%ptr, i32 %i) { entry: - %tmp = add i32 %i, 1 + %val1 = add i32 %i, 1 tail call void @llvm.prefetch( i8* %ptr, i32 0, i32 3, i32 1 ) - ret i32 %tmp + %p = icmp eq i32 %i, 0 + br i1 %p, label %true, label %end +true: + %val2 = add i32 %val1, 1 + br label %end +end: + %valmerge = phi i32 [ %val1, %entry], [ %val2, %true ] + ret i32 %valmerge } declare void @llvm.prefetch(i8*, i32, i32, i32) nounwind diff --git a/test/CodeGen/PowerPC/pr15031.ll b/test/CodeGen/PowerPC/pr15031.ll new file mode 100644 index 0000000..5ccf941 --- /dev/null +++ b/test/CodeGen/PowerPC/pr15031.ll @@ -0,0 +1,370 @@ +; RUN: llc -mcpu=pwr7 -O3 < %s | FileCheck %s + +; Test case derived from bug report 15031. The code in the post-RA +; scheduler to break critical anti-dependencies was failing to check +; whether an instruction had more than one definition, and ensuring +; that any additional definitions interfered with the choice of a new +; register. As a result, this test originally caused this to be +; generated: +; +; lbzu 3, 1(3) +; +; which is illegal, since it requires register 3 to both receive the +; loaded value and receive the updated address. With the fix to bug +; 15031, a different register is chosen to receive the loaded value. + +target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" +target triple = "powerpc64-unknown-linux-gnu" + +%"class.llvm::MachineMemOperand" = type { %"struct.llvm::MachinePointerInfo", i64, i32, %"class.llvm::MDNode"*, %"class.llvm::MDNode"* } +%"struct.llvm::MachinePointerInfo" = type { %"class.llvm::Value"*, i64 } +%"class.llvm::Value" = type { i32 (...)**, i8, i8, i16, %"class.llvm::Type"*, %"class.llvm::Use"*, %"class.llvm::StringMapEntry"* } +%"class.llvm::Type" = type { %"class.llvm::LLVMContext"*, i32, i32, %"class.llvm::Type"** } +%"class.llvm::LLVMContext" = type { %"class.llvm::LLVMContextImpl"* } +%"class.llvm::LLVMContextImpl" = type opaque +%"class.llvm::Use" = type { %"class.llvm::Value"*, %"class.llvm::Use"*, %"class.llvm::PointerIntPair" } +%"class.llvm::PointerIntPair" = type { i64 } +%"class.llvm::StringMapEntry" = type opaque +%"class.llvm::MDNode" = type { %"class.llvm::Value", %"class.llvm::FoldingSetImpl::Node", i32, i32 } +%"class.llvm::FoldingSetImpl::Node" = type { i8* } +%"class.llvm::MachineInstr" = type { %"class.llvm::ilist_node", %"class.llvm::MCInstrDesc"*, %"class.llvm::MachineBasicBlock"*, %"class.llvm::MachineOperand"*, i32, %"class.llvm::ArrayRecycler<llvm::MachineOperand, 8>::Capacity", i8, i8, i8, %"class.llvm::MachineMemOperand"**, %"class.llvm::DebugLoc" } +%"class.llvm::ilist_node" = type { %"class.llvm::ilist_half_node", %"class.llvm::MachineInstr"* } +%"class.llvm::ilist_half_node" = type { %"class.llvm::MachineInstr"* } +%"class.llvm::MCInstrDesc" = type { i16, i16, i16, i16, i16, i32, i64, i16*, i16*, %"class.llvm::MCOperandInfo"* } +%"class.llvm::MCOperandInfo" = type { i16, i8, i8, i32 } +%"class.llvm::MachineBasicBlock" = type { %"class.llvm::ilist_node.0", %"struct.llvm::ilist", %"class.llvm::BasicBlock"*, i32, %"class.llvm::MachineFunction"*, %"class.std::vector.163", %"class.std::vector.163", %"class.std::vector.123", %"class.std::vector.123", i32, i8, i8 } +%"class.llvm::ilist_node.0" = type { %"class.llvm::ilist_half_node.1", %"class.llvm::MachineBasicBlock"* } 
+%"class.llvm::ilist_half_node.1" = type { %"class.llvm::MachineBasicBlock"* } +%"struct.llvm::ilist" = type { %"class.llvm::iplist" } +%"class.llvm::iplist" = type { %"struct.llvm::ilist_traits", %"class.llvm::MachineInstr"* } +%"struct.llvm::ilist_traits" = type { %"class.llvm::ilist_half_node", %"class.llvm::MachineBasicBlock"* } +%"class.llvm::BasicBlock" = type { %"class.llvm::Value", %"class.llvm::ilist_node.2", %"class.llvm::iplist.4", %"class.llvm::Function"* } +%"class.llvm::ilist_node.2" = type { %"class.llvm::ilist_half_node.3", %"class.llvm::BasicBlock"* } +%"class.llvm::ilist_half_node.3" = type { %"class.llvm::BasicBlock"* } +%"class.llvm::iplist.4" = type { %"struct.llvm::ilist_traits.5", %"class.llvm::Instruction"* } +%"struct.llvm::ilist_traits.5" = type { %"class.llvm::ilist_half_node.10" } +%"class.llvm::ilist_half_node.10" = type { %"class.llvm::Instruction"* } +%"class.llvm::Instruction" = type { %"class.llvm::User", %"class.llvm::ilist_node.193", %"class.llvm::BasicBlock"*, %"class.llvm::DebugLoc" } +%"class.llvm::User" = type { %"class.llvm::Value", %"class.llvm::Use"*, i32 } +%"class.llvm::ilist_node.193" = type { %"class.llvm::ilist_half_node.10", %"class.llvm::Instruction"* } +%"class.llvm::DebugLoc" = type { i32, i32 } +%"class.llvm::Function" = type { %"class.llvm::GlobalValue", %"class.llvm::ilist_node.27", %"class.llvm::iplist.47", %"class.llvm::iplist.54", %"class.llvm::ValueSymbolTable"*, %"class.llvm::AttributeSet" } +%"class.llvm::GlobalValue" = type { [52 x i8], [4 x i8], %"class.llvm::Module"*, %"class.std::basic_string" } +%"class.llvm::Module" = type { %"class.llvm::LLVMContext"*, %"class.llvm::iplist.11", %"class.llvm::iplist.20", %"class.llvm::iplist.29", %"struct.llvm::ilist.38", %"class.std::basic_string", %"class.llvm::ValueSymbolTable"*, %"class.llvm::OwningPtr", %"class.std::basic_string", %"class.std::basic_string", %"class.std::basic_string", i8* } +%"class.llvm::iplist.11" = type { %"struct.llvm::ilist_traits.12", %"class.llvm::GlobalVariable"* } +%"struct.llvm::ilist_traits.12" = type { %"class.llvm::ilist_node.18" } +%"class.llvm::ilist_node.18" = type { %"class.llvm::ilist_half_node.19", %"class.llvm::GlobalVariable"* } +%"class.llvm::ilist_half_node.19" = type { %"class.llvm::GlobalVariable"* } +%"class.llvm::GlobalVariable" = type { %"class.llvm::GlobalValue", %"class.llvm::ilist_node.18", i8 } +%"class.llvm::iplist.20" = type { %"struct.llvm::ilist_traits.21", %"class.llvm::Function"* } +%"struct.llvm::ilist_traits.21" = type { %"class.llvm::ilist_node.27" } +%"class.llvm::ilist_node.27" = type { %"class.llvm::ilist_half_node.28", %"class.llvm::Function"* } +%"class.llvm::ilist_half_node.28" = type { %"class.llvm::Function"* } +%"class.llvm::iplist.29" = type { %"struct.llvm::ilist_traits.30", %"class.llvm::GlobalAlias"* } +%"struct.llvm::ilist_traits.30" = type { %"class.llvm::ilist_node.36" } +%"class.llvm::ilist_node.36" = type { %"class.llvm::ilist_half_node.37", %"class.llvm::GlobalAlias"* } +%"class.llvm::ilist_half_node.37" = type { %"class.llvm::GlobalAlias"* } +%"class.llvm::GlobalAlias" = type { %"class.llvm::GlobalValue", %"class.llvm::ilist_node.36" } +%"struct.llvm::ilist.38" = type { %"class.llvm::iplist.39" } +%"class.llvm::iplist.39" = type { %"struct.llvm::ilist_traits.40", %"class.llvm::NamedMDNode"* } +%"struct.llvm::ilist_traits.40" = type { %"class.llvm::ilist_node.45" } +%"class.llvm::ilist_node.45" = type { %"class.llvm::ilist_half_node.46", %"class.llvm::NamedMDNode"* } +%"class.llvm::ilist_half_node.46" = type { 
%"class.llvm::NamedMDNode"* } +%"class.llvm::NamedMDNode" = type { %"class.llvm::ilist_node.45", %"class.std::basic_string", %"class.llvm::Module"*, i8* } +%"class.std::basic_string" = type { %"struct.std::basic_string<char, std::char_traits<char>, std::allocator<char> >::_Alloc_hider" } +%"struct.std::basic_string<char, std::char_traits<char>, std::allocator<char> >::_Alloc_hider" = type { i8* } +%"class.llvm::ValueSymbolTable" = type opaque +%"class.llvm::OwningPtr" = type { %"class.llvm::GVMaterializer"* } +%"class.llvm::GVMaterializer" = type opaque +%"class.llvm::iplist.47" = type { %"struct.llvm::ilist_traits.48", %"class.llvm::BasicBlock"* } +%"struct.llvm::ilist_traits.48" = type { %"class.llvm::ilist_half_node.3" } +%"class.llvm::iplist.54" = type { %"struct.llvm::ilist_traits.55", %"class.llvm::Argument"* } +%"struct.llvm::ilist_traits.55" = type { %"class.llvm::ilist_half_node.61" } +%"class.llvm::ilist_half_node.61" = type { %"class.llvm::Argument"* } +%"class.llvm::Argument" = type { %"class.llvm::Value", %"class.llvm::ilist_node.192", %"class.llvm::Function"* } +%"class.llvm::ilist_node.192" = type { %"class.llvm::ilist_half_node.61", %"class.llvm::Argument"* } +%"class.llvm::AttributeSet" = type { %"class.llvm::AttributeSetImpl"* } +%"class.llvm::AttributeSetImpl" = type opaque +%"class.llvm::MachineFunction" = type { %"class.llvm::Function"*, %"class.llvm::TargetMachine"*, %"class.llvm::MCContext"*, %"class.llvm::MachineModuleInfo"*, %"class.llvm::GCModuleInfo"*, %"class.llvm::MachineRegisterInfo"*, %"struct.llvm::MachineFunctionInfo"*, %"class.llvm::MachineFrameInfo"*, %"class.llvm::MachineConstantPool"*, %"class.llvm::MachineJumpTableInfo"*, %"class.std::vector.163", %"class.llvm::BumpPtrAllocator", %"class.llvm::Recycler", %"class.llvm::ArrayRecycler", %"class.llvm::Recycler.180", %"struct.llvm::ilist.181", i32, i32, i8 } +%"class.llvm::TargetMachine" = type { i32 (...)**, %"class.llvm::Target"*, %"class.std::basic_string", %"class.std::basic_string", %"class.std::basic_string", %"class.llvm::MCCodeGenInfo"*, %"class.llvm::MCAsmInfo"*, i8, %"class.llvm::TargetOptions" } +%"class.llvm::Target" = type opaque +%"class.llvm::MCCodeGenInfo" = type opaque +%"class.llvm::MCAsmInfo" = type opaque +%"class.llvm::TargetOptions" = type { [2 x i8], i32, i8, i32, i8, %"class.std::basic_string", i32, i32 } +%"class.llvm::MCContext" = type { %"class.llvm::SourceMgr"*, %"class.llvm::MCAsmInfo"*, %"class.llvm::MCRegisterInfo"*, %"class.llvm::MCObjectFileInfo"*, %"class.llvm::BumpPtrAllocator", %"class.llvm::StringMap", %"class.llvm::StringMap.62", i32, %"class.llvm::DenseMap.63", i8*, %"class.llvm::raw_ostream"*, i8, %"class.std::basic_string", %"class.std::basic_string", %"class.std::vector", %"class.std::vector.70", %"class.llvm::MCDwarfLoc", i8, i8, i32, %"class.llvm::MCSection"*, %"class.llvm::MCSymbol"*, %"class.llvm::MCSymbol"*, %"class.std::vector.75", %"class.llvm::StringRef", %"class.llvm::StringRef", i8, %"class.llvm::DenseMap.80", %"class.std::vector.84", i8*, i8*, i8*, i8 } +%"class.llvm::SourceMgr" = type opaque +%"class.llvm::MCRegisterInfo" = type { %"struct.llvm::MCRegisterDesc"*, i32, i32, i32, %"class.llvm::MCRegisterClass"*, i32, i32, [2 x i16]*, i16*, i8*, i16*, i32, i16*, i32, i32, i32, i32, %"struct.llvm::MCRegisterInfo::DwarfLLVMRegPair"*, %"struct.llvm::MCRegisterInfo::DwarfLLVMRegPair"*, %"struct.llvm::MCRegisterInfo::DwarfLLVMRegPair"*, %"struct.llvm::MCRegisterInfo::DwarfLLVMRegPair"*, %"class.llvm::DenseMap" } +%"struct.llvm::MCRegisterDesc" = type { i32, i32, 
i32, i32, i32, i32 } +%"class.llvm::MCRegisterClass" = type { i8*, i16*, i8*, i16, i16, i16, i16, i16, i8, i8 } +%"struct.llvm::MCRegisterInfo::DwarfLLVMRegPair" = type { i32, i32 } +%"class.llvm::DenseMap" = type { %"struct.std::pair"*, i32, i32, i32 } +%"struct.std::pair" = type { i32, i32 } +%"class.llvm::MCObjectFileInfo" = type opaque +%"class.llvm::BumpPtrAllocator" = type { i64, i64, %"class.llvm::SlabAllocator"*, %"class.llvm::MemSlab"*, i8*, i8*, i64 } +%"class.llvm::SlabAllocator" = type { i32 (...)** } +%"class.llvm::MemSlab" = type { i64, %"class.llvm::MemSlab"* } +%"class.llvm::StringMap" = type { %"class.llvm::StringMapImpl", %"class.llvm::BumpPtrAllocator"* } +%"class.llvm::StringMapImpl" = type { %"class.llvm::StringMapEntryBase"**, i32, i32, i32, i32 } +%"class.llvm::StringMapEntryBase" = type { i32 } +%"class.llvm::StringMap.62" = type { %"class.llvm::StringMapImpl", %"class.llvm::BumpPtrAllocator"* } +%"class.llvm::DenseMap.63" = type { %"struct.std::pair.66"*, i32, i32, i32 } +%"struct.std::pair.66" = type opaque +%"class.llvm::raw_ostream" = type { i32 (...)**, i8*, i8*, i8*, i32 } +%"class.std::vector" = type { %"struct.std::_Vector_base" } +%"struct.std::_Vector_base" = type { %"struct.std::_Vector_base<llvm::MCDwarfFile *, std::allocator<llvm::MCDwarfFile *> >::_Vector_impl" } +%"struct.std::_Vector_base<llvm::MCDwarfFile *, std::allocator<llvm::MCDwarfFile *> >::_Vector_impl" = type { %"class.llvm::MCDwarfFile"**, %"class.llvm::MCDwarfFile"**, %"class.llvm::MCDwarfFile"** } +%"class.llvm::MCDwarfFile" = type { %"class.llvm::StringRef", i32 } +%"class.llvm::StringRef" = type { i8*, i64 } +%"class.std::vector.70" = type { %"struct.std::_Vector_base.71" } +%"struct.std::_Vector_base.71" = type { %"struct.std::_Vector_base<llvm::StringRef, std::allocator<llvm::StringRef> >::_Vector_impl" } +%"struct.std::_Vector_base<llvm::StringRef, std::allocator<llvm::StringRef> >::_Vector_impl" = type { %"class.llvm::StringRef"*, %"class.llvm::StringRef"*, %"class.llvm::StringRef"* } +%"class.llvm::MCDwarfLoc" = type { i32, i32, i32, i32, i32, i32 } +%"class.llvm::MCSection" = type opaque +%"class.llvm::MCSymbol" = type { %"class.llvm::StringRef", %"class.llvm::MCSection"*, %"class.llvm::MCExpr"*, i8 } +%"class.llvm::MCExpr" = type opaque +%"class.std::vector.75" = type { %"struct.std::_Vector_base.76" } +%"struct.std::_Vector_base.76" = type { %"struct.std::_Vector_base<const llvm::MCGenDwarfLabelEntry *, std::allocator<const llvm::MCGenDwarfLabelEntry *> >::_Vector_impl" } +%"struct.std::_Vector_base<const llvm::MCGenDwarfLabelEntry *, std::allocator<const llvm::MCGenDwarfLabelEntry *> >::_Vector_impl" = type { %"class.llvm::MCGenDwarfLabelEntry"**, %"class.llvm::MCGenDwarfLabelEntry"**, %"class.llvm::MCGenDwarfLabelEntry"** } +%"class.llvm::MCGenDwarfLabelEntry" = type { %"class.llvm::StringRef", i32, i32, %"class.llvm::MCSymbol"* } +%"class.llvm::DenseMap.80" = type { %"struct.std::pair.83"*, i32, i32, i32 } +%"struct.std::pair.83" = type { %"class.llvm::MCSection"*, %"class.llvm::MCLineSection"* } +%"class.llvm::MCLineSection" = type { %"class.std::vector.215" } +%"class.std::vector.215" = type { %"struct.std::_Vector_base.216" } +%"struct.std::_Vector_base.216" = type { %"struct.std::_Vector_base<llvm::MCLineEntry, std::allocator<llvm::MCLineEntry> >::_Vector_impl" } +%"struct.std::_Vector_base<llvm::MCLineEntry, std::allocator<llvm::MCLineEntry> >::_Vector_impl" = type { %"class.llvm::MCLineEntry"*, %"class.llvm::MCLineEntry"*, %"class.llvm::MCLineEntry"* } 
+%"class.llvm::MCLineEntry" = type { %"class.llvm::MCDwarfLoc", %"class.llvm::MCSymbol"* } +%"class.std::vector.84" = type { %"struct.std::_Vector_base.85" } +%"struct.std::_Vector_base.85" = type { %"struct.std::_Vector_base<const llvm::MCSection *, std::allocator<const llvm::MCSection *> >::_Vector_impl" } +%"struct.std::_Vector_base<const llvm::MCSection *, std::allocator<const llvm::MCSection *> >::_Vector_impl" = type { %"class.llvm::MCSection"**, %"class.llvm::MCSection"**, %"class.llvm::MCSection"** } +%"class.llvm::MachineModuleInfo" = type { %"class.llvm::ImmutablePass", %"class.llvm::MCContext", %"class.llvm::Module"*, %"class.llvm::MachineModuleInfoImpl"*, %"class.std::vector.95", i32, %"class.std::vector.100", %"class.llvm::DenseMap.110", %"class.llvm::DenseMap.114", i32, %"class.std::vector.118", %"class.std::vector.123", %"class.std::vector.123", %"class.std::vector.128", %"class.llvm::SmallPtrSet", %"class.llvm::MMIAddrLabelMap"*, i8, i8, i8, i8, %"class.llvm::SmallVector.133" } +%"class.llvm::ImmutablePass" = type { %"class.llvm::ModulePass" } +%"class.llvm::ModulePass" = type { %"class.llvm::Pass" } +%"class.llvm::Pass" = type { i32 (...)**, %"class.llvm::AnalysisResolver"*, i8*, i32 } +%"class.llvm::AnalysisResolver" = type { %"class.std::vector.89", %"class.llvm::PMDataManager"* } +%"class.std::vector.89" = type { %"struct.std::_Vector_base.90" } +%"struct.std::_Vector_base.90" = type { %"struct.std::_Vector_base<std::pair<const void *, llvm::Pass *>, std::allocator<std::pair<const void *, llvm::Pass *> > >::_Vector_impl" } +%"struct.std::_Vector_base<std::pair<const void *, llvm::Pass *>, std::allocator<std::pair<const void *, llvm::Pass *> > >::_Vector_impl" = type { %"struct.std::pair.94"*, %"struct.std::pair.94"*, %"struct.std::pair.94"* } +%"struct.std::pair.94" = type { i8*, %"class.llvm::Pass"* } +%"class.llvm::PMDataManager" = type opaque +%"class.llvm::MachineModuleInfoImpl" = type { i32 (...)** } +%"class.std::vector.95" = type { %"struct.std::_Vector_base.96" } +%"struct.std::_Vector_base.96" = type { %"struct.std::_Vector_base<llvm::MachineMove, std::allocator<llvm::MachineMove> >::_Vector_impl" } +%"struct.std::_Vector_base<llvm::MachineMove, std::allocator<llvm::MachineMove> >::_Vector_impl" = type { %"class.llvm::MachineMove"*, %"class.llvm::MachineMove"*, %"class.llvm::MachineMove"* } +%"class.llvm::MachineMove" = type { %"class.llvm::MCSymbol"*, %"class.llvm::MachineLocation", %"class.llvm::MachineLocation" } +%"class.llvm::MachineLocation" = type { i8, i32, i32 } +%"class.std::vector.100" = type { %"struct.std::_Vector_base.101" } +%"struct.std::_Vector_base.101" = type { %"struct.std::_Vector_base<llvm::LandingPadInfo, std::allocator<llvm::LandingPadInfo> >::_Vector_impl" } +%"struct.std::_Vector_base<llvm::LandingPadInfo, std::allocator<llvm::LandingPadInfo> >::_Vector_impl" = type { %"struct.llvm::LandingPadInfo"*, %"struct.llvm::LandingPadInfo"*, %"struct.llvm::LandingPadInfo"* } +%"struct.llvm::LandingPadInfo" = type { %"class.llvm::MachineBasicBlock"*, %"class.llvm::SmallVector", %"class.llvm::SmallVector", %"class.llvm::MCSymbol"*, %"class.llvm::Function"*, %"class.std::vector.105" } +%"class.llvm::SmallVector" = type { %"class.llvm::SmallVectorImpl", %"struct.llvm::SmallVectorStorage" } +%"class.llvm::SmallVectorImpl" = type { %"class.llvm::SmallVectorTemplateBase" } +%"class.llvm::SmallVectorTemplateBase" = type { %"class.llvm::SmallVectorTemplateCommon" } +%"class.llvm::SmallVectorTemplateCommon" = type { %"class.llvm::SmallVectorBase", 
%"struct.llvm::AlignedCharArrayUnion" } +%"class.llvm::SmallVectorBase" = type { i8*, i8*, i8* } +%"struct.llvm::AlignedCharArrayUnion" = type { %"struct.llvm::AlignedCharArray" } +%"struct.llvm::AlignedCharArray" = type { [8 x i8] } +%"struct.llvm::SmallVectorStorage" = type { i8 } +%"class.std::vector.105" = type { %"struct.std::_Vector_base.106" } +%"struct.std::_Vector_base.106" = type { %"struct.std::_Vector_base<int, std::allocator<int> >::_Vector_impl" } +%"struct.std::_Vector_base<int, std::allocator<int> >::_Vector_impl" = type { i32*, i32*, i32* } +%"class.llvm::DenseMap.110" = type { %"struct.std::pair.113"*, i32, i32, i32 } +%"struct.std::pair.113" = type { %"class.llvm::MCSymbol"*, %"class.llvm::SmallVector.206" } +%"class.llvm::SmallVector.206" = type { [28 x i8], %"struct.llvm::SmallVectorStorage.207" } +%"struct.llvm::SmallVectorStorage.207" = type { [3 x %"struct.llvm::AlignedCharArrayUnion.198"] } +%"struct.llvm::AlignedCharArrayUnion.198" = type { %"struct.llvm::AlignedCharArray.199" } +%"struct.llvm::AlignedCharArray.199" = type { [4 x i8] } +%"class.llvm::DenseMap.114" = type { %"struct.std::pair.117"*, i32, i32, i32 } +%"struct.std::pair.117" = type { %"class.llvm::MCSymbol"*, i32 } +%"class.std::vector.118" = type { %"struct.std::_Vector_base.119" } +%"struct.std::_Vector_base.119" = type { %"struct.std::_Vector_base<const llvm::GlobalVariable *, std::allocator<const llvm::GlobalVariable *> >::_Vector_impl" } +%"struct.std::_Vector_base<const llvm::GlobalVariable *, std::allocator<const llvm::GlobalVariable *> >::_Vector_impl" = type { %"class.llvm::GlobalVariable"**, %"class.llvm::GlobalVariable"**, %"class.llvm::GlobalVariable"** } +%"class.std::vector.123" = type { %"struct.std::_Vector_base.124" } +%"struct.std::_Vector_base.124" = type { %"struct.std::_Vector_base<unsigned int, std::allocator<unsigned int> >::_Vector_impl" } +%"struct.std::_Vector_base<unsigned int, std::allocator<unsigned int> >::_Vector_impl" = type { i32*, i32*, i32* } +%"class.std::vector.128" = type { %"struct.std::_Vector_base.129" } +%"struct.std::_Vector_base.129" = type { %"struct.std::_Vector_base<const llvm::Function *, std::allocator<const llvm::Function *> >::_Vector_impl" } +%"struct.std::_Vector_base<const llvm::Function *, std::allocator<const llvm::Function *> >::_Vector_impl" = type { %"class.llvm::Function"**, %"class.llvm::Function"**, %"class.llvm::Function"** } +%"class.llvm::SmallPtrSet" = type { %"class.llvm::SmallPtrSetImpl", [33 x i8*] } +%"class.llvm::SmallPtrSetImpl" = type { i8**, i8**, i32, i32, i32 } +%"class.llvm::MMIAddrLabelMap" = type opaque +%"class.llvm::SmallVector.133" = type { %"class.llvm::SmallVectorImpl.134", %"struct.llvm::SmallVectorStorage.139" } +%"class.llvm::SmallVectorImpl.134" = type { %"class.llvm::SmallVectorTemplateBase.135" } +%"class.llvm::SmallVectorTemplateBase.135" = type { %"class.llvm::SmallVectorTemplateCommon.136" } +%"class.llvm::SmallVectorTemplateCommon.136" = type { %"class.llvm::SmallVectorBase", %"struct.llvm::AlignedCharArrayUnion.137" } +%"struct.llvm::AlignedCharArrayUnion.137" = type { %"struct.llvm::AlignedCharArray.138" } +%"struct.llvm::AlignedCharArray.138" = type { [40 x i8] } +%"struct.llvm::SmallVectorStorage.139" = type { [3 x %"struct.llvm::AlignedCharArrayUnion.137"] } +%"class.llvm::GCModuleInfo" = type opaque +%"class.llvm::MachineRegisterInfo" = type { %"class.llvm::TargetRegisterInfo"*, i8, i8, %"class.llvm::IndexedMap", %"class.llvm::IndexedMap.146", %"class.llvm::MachineOperand"**, %"class.llvm::BitVector", 
%"class.llvm::BitVector", %"class.llvm::BitVector", %"class.std::vector.147", %"class.std::vector.123" } +%"class.llvm::TargetRegisterInfo" = type { i32 (...)**, %"class.llvm::MCRegisterInfo", %"struct.llvm::TargetRegisterInfoDesc"*, i8**, i32*, %"class.llvm::TargetRegisterClass"**, %"class.llvm::TargetRegisterClass"** } +%"struct.llvm::TargetRegisterInfoDesc" = type { i32, i8 } +%"class.llvm::TargetRegisterClass" = type { %"class.llvm::MCRegisterClass"*, i32*, i32*, i16*, %"class.llvm::TargetRegisterClass"**, void (%"class.llvm::ArrayRef"*, %"class.llvm::MachineFunction"*)* } +%"class.llvm::ArrayRef" = type { i16*, i64 } +%"class.llvm::IndexedMap" = type { %"class.std::vector.140", %"struct.std::pair.145", %"struct.llvm::VirtReg2IndexFunctor" } +%"class.std::vector.140" = type { %"struct.std::_Vector_base.141" } +%"struct.std::_Vector_base.141" = type { %"struct.std::_Vector_base<std::pair<const llvm::TargetRegisterClass *, llvm::MachineOperand *>, std::allocator<std::pair<const llvm::TargetRegisterClass *, llvm::MachineOperand *> > >::_Vector_impl" } +%"struct.std::_Vector_base<std::pair<const llvm::TargetRegisterClass *, llvm::MachineOperand *>, std::allocator<std::pair<const llvm::TargetRegisterClass *, llvm::MachineOperand *> > >::_Vector_impl" = type { %"struct.std::pair.145"*, %"struct.std::pair.145"*, %"struct.std::pair.145"* } +%"struct.std::pair.145" = type { %"class.llvm::TargetRegisterClass"*, %"class.llvm::MachineOperand"* } +%"class.llvm::MachineOperand" = type { i8, [3 x i8], %union.anon, %"class.llvm::MachineInstr"*, %union.anon.188 } +%union.anon = type { i32 } +%union.anon.188 = type { %struct.anon } +%struct.anon = type { %"class.llvm::MachineOperand"*, %"class.llvm::MachineOperand"* } +%"struct.llvm::VirtReg2IndexFunctor" = type { i8 } +%"class.llvm::IndexedMap.146" = type { %"class.std::vector.147", %"struct.std::pair.152", %"struct.llvm::VirtReg2IndexFunctor" } +%"class.std::vector.147" = type { %"struct.std::_Vector_base.148" } +%"struct.std::_Vector_base.148" = type { %"struct.std::_Vector_base<std::pair<unsigned int, unsigned int>, std::allocator<std::pair<unsigned int, unsigned int> > >::_Vector_impl" } +%"struct.std::_Vector_base<std::pair<unsigned int, unsigned int>, std::allocator<std::pair<unsigned int, unsigned int> > >::_Vector_impl" = type { %"struct.std::pair.152"*, %"struct.std::pair.152"*, %"struct.std::pair.152"* } +%"struct.std::pair.152" = type { i32, i32 } +%"class.llvm::BitVector" = type { i64*, i32, i32 } +%"struct.llvm::MachineFunctionInfo" = type { i32 (...)** } +%"class.llvm::MachineFrameInfo" = type opaque +%"class.llvm::MachineConstantPool" = type { %"class.llvm::DataLayout"*, i32, %"class.std::vector.153", %"class.llvm::DenseSet" } +%"class.llvm::DataLayout" = type opaque +%"class.std::vector.153" = type { %"struct.std::_Vector_base.154" } +%"struct.std::_Vector_base.154" = type { %"struct.std::_Vector_base<llvm::MachineConstantPoolEntry, std::allocator<llvm::MachineConstantPoolEntry> >::_Vector_impl" } +%"struct.std::_Vector_base<llvm::MachineConstantPoolEntry, std::allocator<llvm::MachineConstantPoolEntry> >::_Vector_impl" = type { %"class.llvm::MachineConstantPoolEntry"*, %"class.llvm::MachineConstantPoolEntry"*, %"class.llvm::MachineConstantPoolEntry"* } +%"class.llvm::MachineConstantPoolEntry" = type { %union.anon.158, i32 } +%union.anon.158 = type { %"class.llvm::Constant"* } +%"class.llvm::Constant" = type { %"class.llvm::User" } +%"class.llvm::DenseSet" = type { %"class.llvm::DenseMap.159" } +%"class.llvm::DenseMap.159" = type { 
%"struct.std::pair.162"*, i32, i32, i32 } +%"struct.std::pair.162" = type { %"class.llvm::MachineConstantPoolValue"*, i8 } +%"class.llvm::MachineConstantPoolValue" = type { i32 (...)**, %"class.llvm::Type"* } +%"class.llvm::MachineJumpTableInfo" = type opaque +%"class.std::vector.163" = type { %"struct.std::_Vector_base.164" } +%"struct.std::_Vector_base.164" = type { %"struct.std::_Vector_base<llvm::MachineBasicBlock *, std::allocator<llvm::MachineBasicBlock *> >::_Vector_impl" } +%"struct.std::_Vector_base<llvm::MachineBasicBlock *, std::allocator<llvm::MachineBasicBlock *> >::_Vector_impl" = type { %"class.llvm::MachineBasicBlock"**, %"class.llvm::MachineBasicBlock"**, %"class.llvm::MachineBasicBlock"** } +%"class.llvm::Recycler" = type { %"class.llvm::iplist.168" } +%"class.llvm::iplist.168" = type { %"struct.llvm::ilist_traits.169", %"struct.llvm::RecyclerStruct"* } +%"struct.llvm::ilist_traits.169" = type { %"struct.llvm::RecyclerStruct" } +%"struct.llvm::RecyclerStruct" = type { %"struct.llvm::RecyclerStruct"*, %"struct.llvm::RecyclerStruct"* } +%"class.llvm::ArrayRecycler" = type { %"class.llvm::SmallVector.174" } +%"class.llvm::SmallVector.174" = type { %"class.llvm::SmallVectorImpl.175", %"struct.llvm::SmallVectorStorage.179" } +%"class.llvm::SmallVectorImpl.175" = type { %"class.llvm::SmallVectorTemplateBase.176" } +%"class.llvm::SmallVectorTemplateBase.176" = type { %"class.llvm::SmallVectorTemplateCommon.177" } +%"class.llvm::SmallVectorTemplateCommon.177" = type { %"class.llvm::SmallVectorBase", %"struct.llvm::AlignedCharArrayUnion.178" } +%"struct.llvm::AlignedCharArrayUnion.178" = type { %"struct.llvm::AlignedCharArray" } +%"struct.llvm::SmallVectorStorage.179" = type { [7 x %"struct.llvm::AlignedCharArrayUnion.178"] } +%"class.llvm::Recycler.180" = type { %"class.llvm::iplist.168" } +%"struct.llvm::ilist.181" = type { %"class.llvm::iplist.182" } +%"class.llvm::iplist.182" = type { %"struct.llvm::ilist_traits.183", %"class.llvm::MachineBasicBlock"* } +%"struct.llvm::ilist_traits.183" = type { %"class.llvm::ilist_half_node.1" } +%"class.llvm::ArrayRecycler<llvm::MachineOperand, 8>::Capacity" = type { i8 } +%"class.llvm::ConstantInt" = type { %"class.llvm::Constant", %"class.llvm::APInt" } +%"class.llvm::APInt" = type { i32, %union.anon.189 } +%union.anon.189 = type { i64 } +%"class.llvm::ConstantFP" = type { %"class.llvm::Constant", %"class.llvm::APFloat" } +%"class.llvm::APFloat" = type { %"struct.llvm::fltSemantics"*, %"union.llvm::APFloat::Significand", i16, i8 } +%"struct.llvm::fltSemantics" = type opaque +%"union.llvm::APFloat::Significand" = type { i64 } +%"class.llvm::BlockAddress" = type { %"class.llvm::Constant" } +%"class.llvm::hash_code" = type { i64 } +%"struct.llvm::hashing::detail::hash_combine_recursive_helper" = type { [64 x i8], %"struct.llvm::hashing::detail::hash_state", i64 } +%"struct.llvm::hashing::detail::hash_state" = type { i64, i64, i64, i64, i64, i64, i64, i64 } +%"class.llvm::PrintReg" = type { %"class.llvm::TargetRegisterInfo"*, i32, i32 } +%"class.llvm::PseudoSourceValue" = type { %"class.llvm::Value" } +%"class.llvm::FoldingSetNodeID" = type { %"class.llvm::SmallVector.194" } +%"class.llvm::SmallVector.194" = type { [28 x i8], %"struct.llvm::SmallVectorStorage.200" } +%"struct.llvm::SmallVectorStorage.200" = type { [31 x %"struct.llvm::AlignedCharArrayUnion.198"] } +%"struct.llvm::ArrayRecycler<llvm::MachineOperand, 8>::FreeList" = type { %"struct.llvm::ArrayRecycler<llvm::MachineOperand, 8>::FreeList"* } +%"class.llvm::ilist_iterator.202" = 
type { %"class.llvm::MachineInstr"* } +%"class.llvm::TargetInstrInfo" = type { i32 (...)**, [28 x i8], i32, i32 } +%"struct.std::pair.203" = type { i8, i8 } +%"class.llvm::SmallVectorImpl.195" = type { %"class.llvm::SmallVectorTemplateBase.196" } +%"class.llvm::SmallVectorTemplateBase.196" = type { %"class.llvm::SmallVectorTemplateCommon.197" } +%"class.llvm::SmallVectorTemplateCommon.197" = type { %"class.llvm::SmallVectorBase", %"struct.llvm::AlignedCharArrayUnion.198" } +%"class.llvm::AliasAnalysis" = type { i32 (...)**, %"class.llvm::DataLayout"*, %"class.llvm::TargetLibraryInfo"*, %"class.llvm::AliasAnalysis"* } +%"class.llvm::TargetLibraryInfo" = type opaque +%"struct.llvm::AliasAnalysis::Location" = type { %"class.llvm::Value"*, i64, %"class.llvm::MDNode"* } +%"class.llvm::DIVariable" = type { %"class.llvm::DIDescriptor" } +%"class.llvm::DIDescriptor" = type { %"class.llvm::MDNode"* } +%"class.llvm::DIScope" = type { %"class.llvm::DIDescriptor" } +%"class.llvm::ArrayRef.208" = type { i32*, i64 } +%"class.llvm::SmallVector.209" = type { %"class.llvm::SmallVectorImpl.210", %"struct.llvm::SmallVectorStorage.214" } +%"class.llvm::SmallVectorImpl.210" = type { %"class.llvm::SmallVectorTemplateBase.211" } +%"class.llvm::SmallVectorTemplateBase.211" = type { %"class.llvm::SmallVectorTemplateCommon.212" } +%"class.llvm::SmallVectorTemplateCommon.212" = type { %"class.llvm::SmallVectorBase", %"struct.llvm::AlignedCharArrayUnion.213" } +%"struct.llvm::AlignedCharArrayUnion.213" = type { %"struct.llvm::AlignedCharArray" } +%"struct.llvm::SmallVectorStorage.214" = type { [7 x %"struct.llvm::AlignedCharArrayUnion.213"] } +%"class.llvm::Twine" = type { %"union.llvm::Twine::Child", %"union.llvm::Twine::Child", i8, i8 } +%"union.llvm::Twine::Child" = type { %"class.llvm::Twine"* } +%"struct.std::random_access_iterator_tag" = type { i8 } + +declare void @_ZN4llvm19MachineRegisterInfo27removeRegOperandFromUseListEPNS_14MachineOperandE(%"class.llvm::MachineRegisterInfo"*, %"class.llvm::MachineOperand"*) + +declare void @_ZN4llvm19MachineRegisterInfo22addRegOperandToUseListEPNS_14MachineOperandE(%"class.llvm::MachineRegisterInfo"*, %"class.llvm::MachineOperand"*) + +declare zeroext i32 @_ZNK4llvm14MCRegisterInfo9getSubRegEjj(%"class.llvm::MCRegisterInfo"*, i32 zeroext, i32 zeroext) + +define void @_ZN4llvm14MachineOperand12substPhysRegEjRKNS_18TargetRegisterInfoE(%"class.llvm::MachineOperand"* %this, i32 zeroext %Reg, %"class.llvm::TargetRegisterInfo"* %TRI) align 2 { +entry: + %SubReg_TargetFlags.i = getelementptr inbounds %"class.llvm::MachineOperand"* %this, i64 0, i32 1 + %0 = bitcast [3 x i8]* %SubReg_TargetFlags.i to i24* + %bf.load.i = load i24* %0, align 1 + %bf.lshr.i = lshr i24 %bf.load.i, 12 + %tobool = icmp eq i24 %bf.lshr.i, 0 + br i1 %tobool, label %if.end, label %if.then + +if.then: ; preds = %entry + %bf.cast.i = zext i24 %bf.lshr.i to i32 + %add.ptr = getelementptr inbounds %"class.llvm::TargetRegisterInfo"* %TRI, i64 0, i32 1 + %call3 = tail call zeroext i32 @_ZNK4llvm14MCRegisterInfo9getSubRegEjj(%"class.llvm::MCRegisterInfo"* %add.ptr, i32 zeroext %Reg, i32 zeroext %bf.cast.i) + %bf.load.i10 = load i24* %0, align 1 + %bf.clear.i = and i24 %bf.load.i10, 4095 + store i24 %bf.clear.i, i24* %0, align 1 + br label %if.end + +if.end: ; preds = %entry, %if.then + %Reg.addr.0 = phi i32 [ %call3, %if.then ], [ %Reg, %entry ] + %RegNo.i.i = getelementptr inbounds %"class.llvm::MachineOperand"* %this, i64 0, i32 2, i32 0 + %1 = load i32* %RegNo.i.i, align 4, !tbaa !0 + %cmp.i = icmp eq i32 %1, 
%Reg.addr.0 + br i1 %cmp.i, label %_ZN4llvm14MachineOperand6setRegEj.exit, label %if.end.i + +if.end.i: ; preds = %if.end + %ParentMI.i.i = getelementptr inbounds %"class.llvm::MachineOperand"* %this, i64 0, i32 3 + %2 = load %"class.llvm::MachineInstr"** %ParentMI.i.i, align 8, !tbaa !3 + %tobool.i = icmp eq %"class.llvm::MachineInstr"* %2, null + br i1 %tobool.i, label %if.end13.i, label %if.then3.i + +if.then3.i: ; preds = %if.end.i + %Parent.i.i = getelementptr inbounds %"class.llvm::MachineInstr"* %2, i64 0, i32 2 + %3 = load %"class.llvm::MachineBasicBlock"** %Parent.i.i, align 8, !tbaa !3 + %tobool5.i = icmp eq %"class.llvm::MachineBasicBlock"* %3, null + br i1 %tobool5.i, label %if.end13.i, label %if.then6.i + +if.then6.i: ; preds = %if.then3.i + %xParent.i.i = getelementptr inbounds %"class.llvm::MachineBasicBlock"* %3, i64 0, i32 4 + %4 = load %"class.llvm::MachineFunction"** %xParent.i.i, align 8, !tbaa !3 + %tobool8.i = icmp eq %"class.llvm::MachineFunction"* %4, null + br i1 %tobool8.i, label %if.end13.i, label %if.then9.i + +if.then9.i: ; preds = %if.then6.i + %RegInfo.i.i = getelementptr inbounds %"class.llvm::MachineFunction"* %4, i64 0, i32 5 + %5 = load %"class.llvm::MachineRegisterInfo"** %RegInfo.i.i, align 8, !tbaa !3 + tail call void @_ZN4llvm19MachineRegisterInfo27removeRegOperandFromUseListEPNS_14MachineOperandE(%"class.llvm::MachineRegisterInfo"* %5, %"class.llvm::MachineOperand"* %this) + store i32 %Reg.addr.0, i32* %RegNo.i.i, align 4, !tbaa !0 + tail call void @_ZN4llvm19MachineRegisterInfo22addRegOperandToUseListEPNS_14MachineOperandE(%"class.llvm::MachineRegisterInfo"* %5, %"class.llvm::MachineOperand"* %this) + br label %_ZN4llvm14MachineOperand6setRegEj.exit + +if.end13.i: ; preds = %if.then6.i, %if.then3.i, %if.end.i + store i32 %Reg.addr.0, i32* %RegNo.i.i, align 4, !tbaa !0 + br label %_ZN4llvm14MachineOperand6setRegEj.exit + +_ZN4llvm14MachineOperand6setRegEj.exit: ; preds = %if.end, %if.then9.i, %if.end13.i + ret void +} + +!0 = metadata !{metadata !"int", metadata !1} +!1 = metadata !{metadata !"omnipotent char", metadata !2} +!2 = metadata !{metadata !"Simple C/C++ TBAA"} +!3 = metadata !{metadata !"any pointer", metadata !1} +!4 = metadata !{metadata !"vtable pointer", metadata !2} +!5 = metadata !{metadata !"long", metadata !1} +!6 = metadata !{i64 0, i64 8, metadata !3, i64 8, i64 8, metadata !5} +!7 = metadata !{metadata !"short", metadata !1} +!8 = metadata !{i64 0, i64 1, metadata !1, i64 1, i64 4, metadata !0, i64 2, i64 1, metadata !1, i64 3, i64 1, metadata !9, i64 3, i64 1, metadata !9, i64 3, i64 1, metadata !9, i64 3, i64 1, metadata !9, i64 3, i64 1, metadata !9, i64 3, i64 1, metadata !9, i64 3, i64 1, metadata !9, i64 3, i64 1, metadata !9, i64 4, i64 4, metadata !0, i64 4, i64 4, metadata !0, i64 8, i64 8, metadata !3, i64 16, i64 8, metadata !3, i64 16, i64 8, metadata !3, i64 16, i64 8, metadata !3, i64 16, i64 8, metadata !5, i64 16, i64 8, metadata !3, i64 16, i64 8, metadata !3, i64 16, i64 8, metadata !3, i64 16, i64 8, metadata !3, i64 24, i64 8, metadata !3, i64 16, i64 4, metadata !0, i64 16, i64 8, metadata !3, i64 16, i64 8, metadata !3, i64 16, i64 8, metadata !3, i64 24, i64 4, metadata !0} +!9 = metadata !{metadata !"bool", metadata !1} +!10 = metadata !{i8 0, i8 2} + +; CHECK-NOT: lbzu 3, 1(3) diff --git a/test/CodeGen/PowerPC/pr15359.ll b/test/CodeGen/PowerPC/pr15359.ll new file mode 100644 index 0000000..12fa3e5 --- /dev/null +++ b/test/CodeGen/PowerPC/pr15359.ll @@ -0,0 +1,20 @@ +; RUN: llc -O0 -mcpu=pwr7 
-filetype=obj %s -o - | \ +; RUN: elf-dump --dump-section-data | FileCheck %s + +target datalayout = "E-p:64:64:64-S0-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-n32:64" +target triple = "powerpc64-unknown-linux-gnu" + +@nextIdx = external thread_local global i32 + +define fastcc void @func() nounwind { +entry: + store i32 42, i32* @nextIdx + ret void +} + +; Verify that nextIdx has symbol type TLS. +; +; CHECK: '.symtab' +; CHECK: 'nextIdx' +; CHECK: 'st_type', 0x6 + diff --git a/test/CodeGen/PowerPC/pwr3-6x.ll b/test/CodeGen/PowerPC/pwr3-6x.ll new file mode 100644 index 0000000..a9cfe41 --- /dev/null +++ b/test/CodeGen/PowerPC/pwr3-6x.ll @@ -0,0 +1,14 @@ +; Test basic support for some older processors. + +;RUN: llc < %s -march=ppc64 -mcpu=pwr3 | FileCheck %s +;RUN: llc < %s -march=ppc64 -mcpu=pwr4 | FileCheck %s +;RUN: llc < %s -march=ppc64 -mcpu=pwr5 | FileCheck %s +;RUN: llc < %s -march=ppc64 -mcpu=pwr5x | FileCheck %s +;RUN: llc < %s -march=ppc64 -mcpu=pwr6x | FileCheck %s + +define void @foo() { +entry: + ret void +} + +; CHECK: @foo diff --git a/test/CodeGen/PowerPC/quadint-return.ll b/test/CodeGen/PowerPC/quadint-return.ll new file mode 100644 index 0000000..0349991 --- /dev/null +++ b/test/CodeGen/PowerPC/quadint-return.ll @@ -0,0 +1,19 @@ +; REQUIRES: asserts +; RUN: llc -O0 -debug -o - < %s 2>&1 | FileCheck %s + +target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" +target triple = "powerpc64-unknown-linux-gnu" + +define i128 @foo() nounwind { +entry: + %x = alloca i128, align 16 + store i128 27, i128* %x, align 16 + %0 = load i128* %x, align 16 + ret i128 %0 +} + +; CHECK: ********** Function: foo +; CHECK: ********** FAST REGISTER ALLOCATION ********** +; CHECK: %X3<def> = COPY %vreg +; CHECK-NEXT: %X4<def> = COPY %vreg +; CHECK-NEXT: BLR diff --git a/test/CodeGen/PowerPC/stdux-constuse.ll b/test/CodeGen/PowerPC/stdux-constuse.ll new file mode 100644 index 0000000..e62d438 --- /dev/null +++ b/test/CodeGen/PowerPC/stdux-constuse.ll @@ -0,0 +1,47 @@ +; RUN: llc -mcpu=a2 -disable-lsr < %s | FileCheck %s +target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64" +target triple = "powerpc64-unknown-linux-gnu" + +define i32 @test1(i64 %add, i64* %ptr) nounwind { +entry: + %p1 = getelementptr i64* %ptr, i64 144115188075855 + br label %for.cond2.preheader + +for.cond2.preheader: + %nl.018 = phi i32 [ 0, %entry ], [ %inc9, %for.end ] + br label %for.body4 + +for.body4: + %lsr.iv = phi i32 [ %lsr.iv.next, %for.body4 ], [ 16000, %for.cond2.preheader ] + %i0 = phi i64* [ %p1, %for.cond2.preheader ], [ %i6, %for.body4 ] + %i6 = getelementptr i64* %i0, i64 400000 + %i7 = getelementptr i64* %i6, i64 300000 + %i8 = getelementptr i64* %i6, i64 200000 + %i9 = getelementptr i64* %i6, i64 100000 + store i64 %add, i64* %i6, align 32 + store i64 %add, i64* %i7, align 32 + store i64 %add, i64* %i8, align 32 + store i64 %add, i64* %i9, align 32 + %lsr.iv.next = add i32 %lsr.iv, -16 + %exitcond.15 = icmp eq i32 %lsr.iv.next, 0 + br i1 %exitcond.15, label %for.end, label %for.body4 + +; Make sure that we generate the most compact form of this loop with no +; unnecessary moves +; CHECK: @test1 +; CHECK: mtctr +; CHECK: stdux +; CHECK-NEXT: stdx +; CHECK-NEXT: stdx +; CHECK-NEXT: stdx +; CHECK-NEXT: bdnz + +for.end: + %inc9 = add nsw i32 %nl.018, 1 + %exitcond = icmp eq i32 %inc9, 400000 + br 
i1 %exitcond, label %for.end10, label %for.cond2.preheader + +for.end10: + ret i32 0 +} + diff --git a/test/CodeGen/PowerPC/svr4-redzone.ll b/test/CodeGen/PowerPC/svr4-redzone.ll new file mode 100644 index 0000000..91ff579 --- /dev/null +++ b/test/CodeGen/PowerPC/svr4-redzone.ll @@ -0,0 +1,39 @@ +; RUN: llc -mtriple="powerpc-unknown-linux-gnu" < %s | FileCheck %s --check-prefix=PPC32 +; RUN: llc -mtriple="powerpc64-unknown-linux-gnu" < %s | FileCheck %s --check-prefix=PPC64 +; PR15332 + +define void @regalloc() nounwind { +entry: + %0 = add i32 1, 2 + ret void +} +; PPC32: regalloc: +; PPC32-NOT: stwu 1, -{{[0-9]+}}(1) +; PPC32: blr + +; PPC64: regalloc: +; PPC64-NOT: stdu 1, -{{[0-9]+}}(1) +; PPC64: blr + +define void @smallstack() nounwind { +entry: + %0 = alloca i8, i32 4 + ret void +} +; PPC32: smallstack: +; PPC32: stwu 1, -16(1) + +; PPC64: smallstack: +; PPC64-NOT: stdu 1, -{{[0-9]+}}(1) +; PPC64: blr + +define void @bigstack() nounwind { +entry: + %0 = alloca i8, i32 230 + ret void +} +; PPC32: bigstack: +; PPC32: stwu 1, -240(1) + +; PPC64: bigstack: +; PPC64: stdu 1, -352(1) diff --git a/test/CodeGen/PowerPC/tls-2.ll b/test/CodeGen/PowerPC/tls-2.ll new file mode 100644 index 0000000..20d8fe4 --- /dev/null +++ b/test/CodeGen/PowerPC/tls-2.ll @@ -0,0 +1,15 @@ +target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64" +target triple = "powerpc64-unknown-freebsd10.0" +; RUN: llc -O1 < %s -march=ppc64 | FileCheck %s + +@a = thread_local global i32 0, align 4 + +;CHECK: localexec: +define i32 @localexec() nounwind { +entry: +;CHECK: addis [[REG1:[0-9]+]], 13, a@tprel@ha +;CHECK-NEXT: li [[REG2:[0-9]+]], 42 +;CHECK-NEXT: stw [[REG2]], a@tprel@l([[REG1]]) + store i32 42, i32* @a, align 4 + ret i32 0 +} diff --git a/test/CodeGen/PowerPC/tls-ld-2.ll b/test/CodeGen/PowerPC/tls-ld-2.ll new file mode 100644 index 0000000..4954afe --- /dev/null +++ b/test/CodeGen/PowerPC/tls-ld-2.ll @@ -0,0 +1,24 @@ +; RUN: llc -mcpu=pwr7 -O1 -relocation-model=pic < %s | FileCheck %s + +; Test peephole optimization for thread-local storage using the +; local dynamic model. 
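; For context: in the local dynamic TLS model a single __tls_get_addr call
; returns the base of the module's TLS block, and each variable then lives
; at a DTP-relative offset from that base. A rough sketch of the sequence
; before the peephole (an assumed illustration, not llc output from this
; test):
;
;   addis 4, 3, a@dtprel@ha
;   addi  4, 4, a@dtprel@l
;   lwa   3, 0(4)
;
; The peephole under test folds the low part of the offset into the load,
; giving the a@dtprel@l([[REG2]]) form that the CHECK lines below expect.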
+ +target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" +target triple = "powerpc64-unknown-linux-gnu" + +@a = hidden thread_local global i32 0, align 4 + +define signext i32 @main() nounwind { +entry: + %retval = alloca i32, align 4 + store i32 0, i32* %retval + %0 = load i32* @a, align 4 + ret i32 %0 +} + +; CHECK: addis [[REG:[0-9]+]], 2, a@got@tlsld@ha +; CHECK-NEXT: addi 3, [[REG]], a@got@tlsld@l +; CHECK-NEXT: bl __tls_get_addr(a@tlsld) +; CHECK-NEXT: nop +; CHECK-NEXT: addis [[REG2:[0-9]+]], 3, a@dtprel@ha +; CHECK-NEXT: lwa {{[0-9]+}}, a@dtprel@l([[REG2]]) diff --git a/test/CodeGen/PowerPC/tls.ll b/test/CodeGen/PowerPC/tls.ll index 713893b..151b4b7 100644 --- a/test/CodeGen/PowerPC/tls.ll +++ b/test/CodeGen/PowerPC/tls.ll @@ -1,16 +1,21 @@ target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64" target triple = "powerpc64-unknown-freebsd10.0" -; RUN: llc < %s -march=ppc64 | FileCheck %s +; RUN: llc -O0 < %s -march=ppc64 | FileCheck -check-prefix=OPT0 %s +; RUN: llc -O1 < %s -march=ppc64 | FileCheck -check-prefix=OPT1 %s @a = thread_local global i32 0, align 4 -;CHECK: localexec: +;OPT0: localexec: +;OPT1: localexec: define i32 @localexec() nounwind { entry: -;CHECK: addis [[REG1:[0-9]+]], 13, a@tprel@ha -;CHECK-NEXT: li [[REG2:[0-9]+]], 42 -;CHECK-NEXT: addi [[REG1]], [[REG1]], a@tprel@l -;CHECK-NEXT: stw [[REG2]], 0([[REG1]]) +;OPT0: addis [[REG1:[0-9]+]], 13, a@tprel@ha +;OPT0-NEXT: li [[REG2:[0-9]+]], 42 +;OPT0-NEXT: addi [[REG1]], [[REG1]], a@tprel@l +;OPT0-NEXT: stw [[REG2]], 0([[REG1]]) +;OPT1: addis [[REG1:[0-9]+]], 13, a@tprel@ha +;OPT1-NEXT: li [[REG2:[0-9]+]], 42 +;OPT1-NEXT: stw [[REG2]], a@tprel@l([[REG1]]) store i32 42, i32* @a, align 4 ret i32 0 } diff --git a/test/CodeGen/PowerPC/vaddsplat.ll b/test/CodeGen/PowerPC/vaddsplat.ll new file mode 100644 index 0000000..e65148a --- /dev/null +++ b/test/CodeGen/PowerPC/vaddsplat.ll @@ -0,0 +1,149 @@ +; RUN: llc -O0 -mcpu=pwr7 <%s | FileCheck %s + +; Test optimizations of build_vector for 6-bit immediates. 
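; The vspltis[bhw] instructions can only splat a 5-bit signed immediate in
; [-16,15]; the cases below extend that to 6-bit constants. An even value
; 2x in range is built as splat(x) + splat(x); an odd value is built
; against a splat of -16. Worked examples mirroring the CHECK lines that
; follow (register numbers illustrative): 18 = 9 + 9, so
;
;   vspltisw 2, 9
;   vadduwm  3, 2, 2
;
; and 27 = 11 - (-16), so
;
;   vspltisw 2, -16
;   vspltisw 3, 11
;   vsubuwm  4, 3, 2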
+ +target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" +target triple = "powerpc64-unknown-linux-gnu" + +%v4i32 = type <4 x i32> +%v8i16 = type <8 x i16> +%v16i8 = type <16 x i8> + +define void @test_v4i32_pos_even(%v4i32* %P, %v4i32* %S) { + %p = load %v4i32* %P + %r = add %v4i32 %p, < i32 18, i32 18, i32 18, i32 18 > + store %v4i32 %r, %v4i32* %S + ret void +} + +; CHECK: test_v4i32_pos_even: +; CHECK: vspltisw [[REG1:[0-9]+]], 9 +; CHECK: vadduwm {{[0-9]+}}, [[REG1]], [[REG1]] + +define void @test_v4i32_neg_even(%v4i32* %P, %v4i32* %S) { + %p = load %v4i32* %P + %r = add %v4i32 %p, < i32 -28, i32 -28, i32 -28, i32 -28 > + store %v4i32 %r, %v4i32* %S + ret void +} + +; CHECK: test_v4i32_neg_even: +; CHECK: vspltisw [[REG1:[0-9]+]], -14 +; CHECK: vadduwm {{[0-9]+}}, [[REG1]], [[REG1]] + +define void @test_v8i16_pos_even(%v8i16* %P, %v8i16* %S) { + %p = load %v8i16* %P + %r = add %v8i16 %p, < i16 30, i16 30, i16 30, i16 30, i16 30, i16 30, i16 30, i16 30 > + store %v8i16 %r, %v8i16* %S + ret void +} + +; CHECK: test_v8i16_pos_even: +; CHECK: vspltish [[REG1:[0-9]+]], 15 +; CHECK: vadduhm {{[0-9]+}}, [[REG1]], [[REG1]] + +define void @test_v8i16_neg_even(%v8i16* %P, %v8i16* %S) { + %p = load %v8i16* %P + %r = add %v8i16 %p, < i16 -32, i16 -32, i16 -32, i16 -32, i16 -32, i16 -32, i16 -32, i16 -32 > + store %v8i16 %r, %v8i16* %S + ret void +} + +; CHECK: test_v8i16_neg_even: +; CHECK: vspltish [[REG1:[0-9]+]], -16 +; CHECK: vadduhm {{[0-9]+}}, [[REG1]], [[REG1]] + +define void @test_v16i8_pos_even(%v16i8* %P, %v16i8* %S) { + %p = load %v16i8* %P + %r = add %v16i8 %p, < i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16 > + store %v16i8 %r, %v16i8* %S + ret void +} + +; CHECK: test_v16i8_pos_even: +; CHECK: vspltisb [[REG1:[0-9]+]], 8 +; CHECK: vaddubm {{[0-9]+}}, [[REG1]], [[REG1]] + +define void @test_v16i8_neg_even(%v16i8* %P, %v16i8* %S) { + %p = load %v16i8* %P + %r = add %v16i8 %p, < i8 -18, i8 -18, i8 -18, i8 -18, i8 -18, i8 -18, i8 -18, i8 -18, i8 -18, i8 -18, i8 -18, i8 -18, i8 -18, i8 -18, i8 -18, i8 -18 > + store %v16i8 %r, %v16i8* %S + ret void +} + +; CHECK: test_v16i8_neg_even: +; CHECK: vspltisb [[REG1:[0-9]+]], -9 +; CHECK: vaddubm {{[0-9]+}}, [[REG1]], [[REG1]] + +define void @test_v4i32_pos_odd(%v4i32* %P, %v4i32* %S) { + %p = load %v4i32* %P + %r = add %v4i32 %p, < i32 27, i32 27, i32 27, i32 27 > + store %v4i32 %r, %v4i32* %S + ret void +} + +; CHECK: test_v4i32_pos_odd: +; CHECK: vspltisw [[REG2:[0-9]+]], -16 +; CHECK: vspltisw [[REG1:[0-9]+]], 11 +; CHECK: vsubuwm {{[0-9]+}}, [[REG1]], [[REG2]] + +define void @test_v4i32_neg_odd(%v4i32* %P, %v4i32* %S) { + %p = load %v4i32* %P + %r = add %v4i32 %p, < i32 -27, i32 -27, i32 -27, i32 -27 > + store %v4i32 %r, %v4i32* %S + ret void +} + +; CHECK: test_v4i32_neg_odd: +; CHECK: vspltisw [[REG2:[0-9]+]], -16 +; CHECK: vspltisw [[REG1:[0-9]+]], -11 +; CHECK: vadduwm {{[0-9]+}}, [[REG1]], [[REG2]] + +define void @test_v8i16_pos_odd(%v8i16* %P, %v8i16* %S) { + %p = load %v8i16* %P + %r = add %v8i16 %p, < i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31 > + store %v8i16 %r, %v8i16* %S + ret void +} + +; CHECK: test_v8i16_pos_odd: +; CHECK: vspltish [[REG2:[0-9]+]], -16 +; CHECK: vspltish [[REG1:[0-9]+]], 15 +; CHECK: vsubuhm {{[0-9]+}}, [[REG1]], [[REG2]] + +define void @test_v8i16_neg_odd(%v8i16* %P, %v8i16* %S) { + %p = load %v8i16* %P + %r = add %v8i16 %p, < i16 -31, i16 -31, i16 
-31, i16 -31, i16 -31, i16 -31, i16 -31, i16 -31 > + store %v8i16 %r, %v8i16* %S + ret void +} + +; CHECK: test_v8i16_neg_odd: +; CHECK: vspltish [[REG2:[0-9]+]], -16 +; CHECK: vspltish [[REG1:[0-9]+]], -15 +; CHECK: vadduhm {{[0-9]+}}, [[REG1]], [[REG2]] + +define void @test_v16i8_pos_odd(%v16i8* %P, %v16i8* %S) { + %p = load %v16i8* %P + %r = add %v16i8 %p, < i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17 > + store %v16i8 %r, %v16i8* %S + ret void +} + +; CHECK: test_v16i8_pos_odd: +; CHECK: vspltisb [[REG2:[0-9]+]], -16 +; CHECK: vspltisb [[REG1:[0-9]+]], 1 +; CHECK: vsububm {{[0-9]+}}, [[REG1]], [[REG2]] + +define void @test_v16i8_neg_odd(%v16i8* %P, %v16i8* %S) { + %p = load %v16i8* %P + %r = add %v16i8 %p, < i8 -17, i8 -17, i8 -17, i8 -17, i8 -17, i8 -17, i8 -17, i8 -17, i8 -17, i8 -17, i8 -17, i8 -17, i8 -17, i8 -17, i8 -17, i8 -17 > + store %v16i8 %r, %v16i8* %S + ret void +} + +; CHECK: test_v16i8_neg_odd: +; CHECK: vspltisb [[REG2:[0-9]+]], -16 +; CHECK: vspltisb [[REG1:[0-9]+]], -1 +; CHECK: vaddubm {{[0-9]+}}, [[REG1]], [[REG2]] + diff --git a/test/CodeGen/PowerPC/vec_constants.ll b/test/CodeGen/PowerPC/vec_constants.ll index 399f19f..e4799e5 100644 --- a/test/CodeGen/PowerPC/vec_constants.ll +++ b/test/CodeGen/PowerPC/vec_constants.ll @@ -1,4 +1,7 @@ -; RUN: llc < %s -march=ppc32 -mcpu=g5 | not grep CPI +; RUN: llc -O0 -mcpu=pwr7 < %s | FileCheck %s + +target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" +target triple = "powerpc64-unknown-linux-gnu" define void @test1(<4 x i32>* %P1, <4 x i32>* %P2, <4 x float>* %P3) nounwind { %tmp = load <4 x i32>* %P1 ; <<4 x i32>> [#uses=1] @@ -13,32 +16,71 @@ define void @test1(<4 x i32>* %P1, <4 x i32>* %P2, <4 x float>* %P3) nounwind { %tmp13 = bitcast <4 x i32> %tmp12 to <4 x float> ; <<4 x float>> [#uses=1] store <4 x float> %tmp13, <4 x float>* %P3 ret void + +; CHECK: test1: +; CHECK-NOT: CPI } define <4 x i32> @test_30() nounwind { ret <4 x i32> < i32 30, i32 30, i32 30, i32 30 > + +; CHECK: test_30: +; CHECK: vspltisw +; CHECK-NEXT: vadduwm +; CHECK-NEXT: blr } define <4 x i32> @test_29() nounwind { ret <4 x i32> < i32 29, i32 29, i32 29, i32 29 > + +; CHECK: test_29: +; CHECK: vspltisw +; CHECK-NEXT: vspltisw +; CHECK-NEXT: vsubuwm +; CHECK-NEXT: blr } define <8 x i16> @test_n30() nounwind { ret <8 x i16> < i16 -30, i16 -30, i16 -30, i16 -30, i16 -30, i16 -30, i16 -30, i16 -30 > + +; CHECK: test_n30: +; CHECK: vspltish +; CHECK-NEXT: vadduhm +; CHECK-NEXT: blr } define <16 x i8> @test_n104() nounwind { ret <16 x i8> < i8 -104, i8 -104, i8 -104, i8 -104, i8 -104, i8 -104, i8 -104, i8 -104, i8 -104, i8 -104, i8 -104, i8 -104, i8 -104, i8 -104, i8 -104, i8 -104 > + +; CHECK: test_n104: +; CHECK: vspltisb +; CHECK-NEXT: vslb +; CHECK-NEXT: blr } define <4 x i32> @test_vsldoi() nounwind { ret <4 x i32> < i32 512, i32 512, i32 512, i32 512 > + +; CHECK: test_vsldoi: +; CHECK: vspltisw +; CHECK-NEXT: vsldoi +; CHECK-NEXT: blr } define <8 x i16> @test_vsldoi_65023() nounwind { ret <8 x i16> < i16 65023, i16 65023,i16 65023,i16 65023,i16 65023,i16 65023,i16 65023,i16 65023 > + +; CHECK: test_vsldoi_65023: +; CHECK: vspltish +; CHECK-NEXT: vsldoi +; CHECK-NEXT: blr } define <4 x i32> @test_rol() nounwind { ret <4 x i32> < i32 -11534337, i32 -11534337, i32 -11534337, i32 -11534337 > + +; CHECK: test_rol: +; CHECK: vspltisw +; CHECK-NEXT: vrlw +; CHECK-NEXT: blr } diff --git 
a/test/CodeGen/PowerPC/vec_extload.ll b/test/CodeGen/PowerPC/vec_extload.ll index 15a3f9f..998645d 100644 --- a/test/CodeGen/PowerPC/vec_extload.ll +++ b/test/CodeGen/PowerPC/vec_extload.ll @@ -15,55 +15,9 @@ define <16 x i8> @v16si8_sext_in_reg(<16 x i8> %a) { ret <16 x i8> %c } ; CHECK: v16si8_sext_in_reg: -; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}} -; CHECK: lbz -; CHECK: stb -; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}} -; CHECK: lbz -; CHECK: stb -; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}} -; CHECK: lbz -; CHECK: stb -; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}} -; CHECK: lbz -; CHECK: stb -; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}} -; CHECK: lbz -; CHECK: stb -; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}} -; CHECK: lbz -; CHECK: stb -; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}} -; CHECK: lbz -; CHECK: stb -; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}} -; CHECK: lbz -; CHECK: stb -; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}} -; CHECK: lbz -; CHECK: stb -; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}} -; CHECK: lbz -; CHECK: stb -; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}} -; CHECK: lbz -; CHECK: stb -; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}} -; CHECK: lbz -; CHECK: stb -; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}} -; CHECK: lbz -; CHECK: stb -; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}} -; CHECK: lbz -; CHECK: stb -; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}} -; CHECK: lbz -; CHECK: stb -; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}} -; CHECK: lbz -; CHECK: stb -; CHECK: lvx 2, {{[0-9]+}}, {{[0-9]+}} +; CHECK: vslb +; CHECK: vsrab +; CHECK: blr ; The zero extend uses a more clever logic: a vector splat ; and a logic and to set higher bits to 0. @@ -83,31 +37,9 @@ define <8 x i16> @v8si16_sext_in_reg(<8 x i16> %a) { ret <8 x i16> %c } ; CHECK: v8si16_sext_in_reg: -; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}} -; CHECK: lhz -; CHECK: sth -; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}} -; CHECK: lhz -; CHECK: sth -; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}} -; CHECK: lhz -; CHECK: sth -; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}} -; CHECK: lhz -; CHECK: sth -; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}} -; CHECK: lhz -; CHECK: sth -; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}} -; CHECK: lhz -; CHECK: sth -; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}} -; CHECK: lhz -; CHECK: sth -; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}} -; CHECK: lhz -; CHECK: sth -; CHECK: lvx 2, {{[0-9]+}}, {{[0-9]+}} +; CHECK: vslh +; CHECK: vsrah +; CHECK: blr ; Same as v8si16_sext_in_reg, but instead of creating the mask ; with a splat, loads it from memory. @@ -129,19 +61,9 @@ define <4 x i32> @v4si32_sext_in_reg(<4 x i32> %a) { ret <4 x i32> %c } ; CHECK: v4si32_sext_in_reg: -; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}} -; CHECK: lha -; CHECK: stw -; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}} -; CHECK: lha -; CHECK: stw -; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}} -; CHECK: lha -; CHECK: stw -; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}} -; CHECK: lha -; CHECK: stw -; CHECK: lvx 2, {{[0-9]+}}, {{[0-9]+}} +; CHECK: vslw +; CHECK: vsraw +; CHECK: blr ; Same as v8si16_sext_in_reg. 
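; Note on the hunks above: the old lowering bounced each lane through
; memory (stvx, then a scalar lbz/lha plus stb/sth per element, then lvx);
; the new one stays in vector registers, because sign-extending the low N
; bits of a lane is just a left shift by (lane width - N) followed by an
; arithmetic right shift by the same amount. A sketch for byte lanes
; (illustrative registers, assuming N = 4):
;
;   vspltisb 3, 4      ; splat the shift amount
;   vslb     4, 2, 3   ; move the low bits to the top
;   vsrab    2, 4, 3   ; shift back down, replicating the sign bit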
define <4 x i32> @v4si32_zext_in_reg(<4 x i32> %a) { diff --git a/test/CodeGen/R600/128bit-kernel-args.ll b/test/CodeGen/R600/128bit-kernel-args.ll new file mode 100644 index 0000000..114f9e7 --- /dev/null +++ b/test/CodeGen/R600/128bit-kernel-args.ll @@ -0,0 +1,18 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +; CHECK: @v4i32_kernel_arg +; CHECK: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 40 + +define void @v4i32_kernel_arg(<4 x i32> addrspace(1)* %out, <4 x i32> %in) { +entry: + store <4 x i32> %in, <4 x i32> addrspace(1)* %out + ret void +} + +; CHECK: @v4f32_kernel_arg +; CHECK: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 40 define void @v4f32_kernel_arg(<4 x float> addrspace(1)* %out, <4 x float> %in) { +entry: + store <4 x float> %in, <4 x float> addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/R600/dagcombiner-bug-illegal-vec4-int-to-fp.ll b/test/CodeGen/R600/dagcombiner-bug-illegal-vec4-int-to-fp.ll index 1acf905..fd958b3 100644 --- a/test/CodeGen/R600/dagcombiner-bug-illegal-vec4-int-to-fp.ll +++ b/test/CodeGen/R600/dagcombiner-bug-illegal-vec4-int-to-fp.ll @@ -1,13 +1,15 @@ ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s -;CHECK: INT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - ; This test is for a bug in ; DAGCombiner::reduceBuildVecConvertToConvertBuildVec() where ; the wrong type was being passed to ; TargetLowering::getOperationAction() when checking the legality of ; ISD::UINT_TO_FP and ISD::SINT_TO_FP opcodes. + +; CHECK: @sint +; CHECK: INT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + define void @sint(<4 x float> addrspace(1)* %out, i32 addrspace(1)* %in) { entry: %ptr = getelementptr i32 addrspace(1)* %in, i32 1 @@ -19,6 +21,7 @@ entry: ret void } +;CHECK: @uint ;CHECK: UINT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} define void @uint(<4 x float> addrspace(1)* %out, i32 addrspace(1)* %in) { diff --git a/test/CodeGen/R600/disconnected-predset-break-bug.ll b/test/CodeGen/R600/disconnected-predset-break-bug.ll new file mode 100644 index 0000000..a586742 --- /dev/null +++ b/test/CodeGen/R600/disconnected-predset-break-bug.ll @@ -0,0 +1,28 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +; PRED_SET* instructions must be tied to any instruction that uses their +; result. This tests that there are no instructions between the PRED_SET* +; and the PREDICATE_BREAK in this loop.
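; A sketch of the bad schedule this test guards against (assumed, not
; actual llc output): an unrelated instruction slipping between the pair,
;
;   PRED_SETGT_INT Pred, T0.X, 0
;   ADD_INT        T1.X, T1.X, 1    ; separates the predicate from its user
;   PREDICATED_BREAK
;
; hence the CHECK-NEXT requirement below that PREDICATED_BREAK immediately
; follow PRED_SET.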
+ +; CHECK: @loop_ge +; CHECK: WHILE +; CHECK: PRED_SET +; CHECK-NEXT: PREDICATED_BREAK +define void @loop_ge(i32 addrspace(1)* nocapture %out, i32 %iterations) nounwind { +entry: + %cmp5 = icmp sgt i32 %iterations, 0 + br i1 %cmp5, label %for.body, label %for.end + +for.body: ; preds = %for.body, %entry + %i.07.in = phi i32 [ %i.07, %for.body ], [ %iterations, %entry ] + %ai.06 = phi i32 [ %add, %for.body ], [ 0, %entry ] + %i.07 = add nsw i32 %i.07.in, -1 + %arrayidx = getelementptr inbounds i32 addrspace(1)* %out, i32 %ai.06 + store i32 %i.07, i32 addrspace(1)* %arrayidx, align 4 + %add = add nsw i32 %ai.06, 1 + %exitcond = icmp eq i32 %add, %iterations + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} diff --git a/test/CodeGen/R600/fcmp.ll b/test/CodeGen/R600/fcmp.ll index 1dcd07c..89f5e9e 100644 --- a/test/CodeGen/R600/fcmp.ll +++ b/test/CodeGen/R600/fcmp.ll @@ -1,8 +1,6 @@ ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s -;CHECK: SETE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;CHECK: MOV T{{[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}} -;CHECK: FLT_TO_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;CHECK: SETE_DX10 T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} define void @test(i32 addrspace(1)* %out, float addrspace(1)* %in) { entry: diff --git a/test/CodeGen/R600/fmad.ll b/test/CodeGen/R600/fmad.ll new file mode 100644 index 0000000..a3d4d0f --- /dev/null +++ b/test/CodeGen/R600/fmad.ll @@ -0,0 +1,19 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +;CHECK: MULADD_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +define void @test() { + %r0 = call float @llvm.R600.load.input(i32 0) + %r1 = call float @llvm.R600.load.input(i32 1) + %r2 = call float @llvm.R600.load.input(i32 2) + %r3 = fmul float %r0, %r1 + %r4 = fadd float %r3, %r2 + call void @llvm.AMDGPU.store.output(float %r4, i32 0) + ret void +} + +declare float @llvm.R600.load.input(i32) readnone + +declare void @llvm.AMDGPU.store.output(float, i32) + +declare float @fabs(float) readnone diff --git a/test/CodeGen/R600/fsub.ll b/test/CodeGen/R600/fsub.ll index 0ec1c37..591aa52 100644 --- a/test/CodeGen/R600/fsub.ll +++ b/test/CodeGen/R600/fsub.ll @@ -1,7 +1,6 @@ ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s -; CHECK: MOV T{{[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}} -; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}} define void @test() { %r0 = call float @llvm.R600.load.input(i32 0) diff --git a/test/CodeGen/R600/kcache-fold.ll b/test/CodeGen/R600/kcache-fold.ll new file mode 100644 index 0000000..382f78c --- /dev/null +++ b/test/CodeGen/R600/kcache-fold.ll @@ -0,0 +1,52 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +; CHECK: MOV T{{[0-9]+\.[XYZW], CBuf0\[[0-9]+\]\.[XYZW]}} + +define void @main() { +main_body: + %0 = load <4 x float> addrspace(9)* null + %1 = extractelement <4 x float> %0, i32 0 + %2 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1) + %3 = extractelement <4 x float> %2, i32 0 + %4 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2) + %5 = extractelement <4 x float> %4, i32 0 + %6 = fcmp ult float %1, 0.000000e+00 + %7 = select i1 %6, float %3, float %5 + %8 = load <4 x float> addrspace(9)* null + %9 = extractelement <4 x float> %8, i32 1 + %10 = load <4 x float> addrspace(9)* getelementptr ([1024 x 
<4 x float>] addrspace(9)* null, i64 0, i32 1) + %11 = extractelement <4 x float> %10, i32 1 + %12 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2) + %13 = extractelement <4 x float> %12, i32 1 + %14 = fcmp ult float %9, 0.000000e+00 + %15 = select i1 %14, float %11, float %13 + %16 = load <4 x float> addrspace(9)* null + %17 = extractelement <4 x float> %16, i32 2 + %18 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1) + %19 = extractelement <4 x float> %18, i32 2 + %20 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2) + %21 = extractelement <4 x float> %20, i32 2 + %22 = fcmp ult float %17, 0.000000e+00 + %23 = select i1 %22, float %19, float %21 + %24 = load <4 x float> addrspace(9)* null + %25 = extractelement <4 x float> %24, i32 3 + %26 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1) + %27 = extractelement <4 x float> %26, i32 3 + %28 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2) + %29 = extractelement <4 x float> %28, i32 3 + %30 = fcmp ult float %25, 0.000000e+00 + %31 = select i1 %30, float %27, float %29 + %32 = call float @llvm.AMDIL.clamp.(float %7, float 0.000000e+00, float 1.000000e+00) + %33 = call float @llvm.AMDIL.clamp.(float %15, float 0.000000e+00, float 1.000000e+00) + %34 = call float @llvm.AMDIL.clamp.(float %23, float 0.000000e+00, float 1.000000e+00) + %35 = call float @llvm.AMDIL.clamp.(float %31, float 0.000000e+00, float 1.000000e+00) + %36 = insertelement <4 x float> undef, float %32, i32 0 + %37 = insertelement <4 x float> %36, float %33, i32 1 + %38 = insertelement <4 x float> %37, float %34, i32 2 + %39 = insertelement <4 x float> %38, float %35, i32 3 + call void @llvm.R600.store.swizzle(<4 x float> %39, i32 0, i32 0) + ret void +} + +declare float @llvm.AMDIL.clamp.(float, float, float) readnone +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) diff --git a/test/CodeGen/R600/literals.ll b/test/CodeGen/R600/literals.ll index 4c731b2..be62342 100644 --- a/test/CodeGen/R600/literals.ll +++ b/test/CodeGen/R600/literals.ll @@ -6,6 +6,7 @@ ; or ; ADD_INT literal.x REG, 5 +; CHECK: @i32_literal ; CHECK: ADD_INT {{[A-Z0-9,. ]*}}literal.x,{{[A-Z0-9,. ]*}} 5 define void @i32_literal(i32 addrspace(1)* %out, i32 %in) { entry: @@ -20,6 +21,7 @@ entry: ; or ; ADD literal.x REG, 5.0 +; CHECK: @float_literal ; CHECK: ADD {{[A-Z0-9,. ]*}}literal.x,{{[A-Z0-9,. ]*}} {{[0-9]+}}(5.0 define void @float_literal(float addrspace(1)* %out, float %in) { entry: diff --git a/test/CodeGen/R600/llvm.AMDGPU.tex.ll b/test/CodeGen/R600/llvm.AMDGPU.tex.ll new file mode 100644 index 0000000..74331fa --- /dev/null +++ b/test/CodeGen/R600/llvm.AMDGPU.tex.ll @@ -0,0 +1,42 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}}, 0, 0, 1 +;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}}, 0, 0, 2 +;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}}, 0, 0, 3 +;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}}, 0, 0, 4 +;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}}, 0, 0, 5 +;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}}, 0, 0, 6 +;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}}, 0, 0, 7 +;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}}, 0, 0, 8 +;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}}, 0, 0, 9 +;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}}, 0, 0, 10 +;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}}, 0, 0, 11 +;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}}, 0, 0, 12 +;CHECK: TEX_SAMPLE_C T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}}, 0, 0, 13 +;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}}, 0, 0, 14 +;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}}, 0, 0, 15 +;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}}, 0, 0, 16 + +define void @test(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { + %addr = load <4 x float> addrspace(1)* %in + %res1 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %addr, i32 0, i32 0, i32 1) + %res2 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res1, i32 0, i32 0, i32 2) + %res3 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res2, i32 0, i32 0, i32 3) + %res4 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res3, i32 0, i32 0, i32 4) + %res5 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res4, i32 0, i32 0, i32 5) + %res6 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res5, i32 0, i32 0, i32 6) + %res7 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res6, i32 0, i32 0, i32 7) + %res8 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res7, i32 0, i32 0, i32 8) + %res9 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res8, i32 0, i32 0, i32 9) + %res10 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res9, i32 0, i32 0, i32 10) + %res11 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res10, i32 0, i32 0, i32 11) + %res12 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res11, i32 0, i32 0, i32 12) + %res13 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res12, i32 0, i32 0, i32 13) + %res14 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res13, i32 0, i32 0, i32 14) + %res15 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res14, i32 0, i32 0, i32 15) + %res16 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %res15, i32 0, i32 0, i32 16) + store <4 x float> %res16, <4 x float> addrspace(1)* %out + ret void +} + +declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) readnone diff --git a/test/CodeGen/R600/llvm.SI.fs.interp.constant.ll b/test/CodeGen/R600/llvm.SI.fs.interp.constant.ll new file mode 100644 index 0000000..0c19f14 --- /dev/null +++ b/test/CodeGen/R600/llvm.SI.fs.interp.constant.ll @@ -0,0 +1,23 @@ +;RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s + +;CHECK: S_MOV_B32 +;CHECK-NEXT: V_INTERP_MOV_F32 + +define void @main() { +main_body: + call void @llvm.AMDGPU.shader.type(i32 0) + %0 = load i32 addrspace(8)* inttoptr (i32 6 to i32 addrspace(8)*) + %1 = call float @llvm.SI.fs.interp.constant(i32 0, i32 0,
i32 %0) + %2 = call i32 @llvm.SI.packf16(float %1, float %1) + %3 = bitcast i32 %2 to float + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %3, float %3, float %3, float %3) + ret void +} + +declare void @llvm.AMDGPU.shader.type(i32) + +declare float @llvm.SI.fs.interp.constant(i32, i32, i32) readonly + +declare i32 @llvm.SI.packf16(float, float) readnone + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) diff --git a/test/CodeGen/R600/llvm.SI.sample.ll b/test/CodeGen/R600/llvm.SI.sample.ll new file mode 100644 index 0000000..34d1935 --- /dev/null +++ b/test/CodeGen/R600/llvm.SI.sample.ll @@ -0,0 +1,71 @@ +;RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s + +;CHECK: IMAGE_SAMPLE +;CHECK-NEXT: S_WAITCNT 1792 +;CHECK-NEXT: IMAGE_SAMPLE +;CHECK-NEXT: S_WAITCNT 1792 +;CHECK-NEXT: IMAGE_SAMPLE +;CHECK-NEXT: S_WAITCNT 1792 +;CHECK-NEXT: IMAGE_SAMPLE +;CHECK-NEXT: S_WAITCNT 1792 +;CHECK-NEXT: IMAGE_SAMPLE +;CHECK-NEXT: S_WAITCNT 1792 +;CHECK-NEXT: IMAGE_SAMPLE_C +;CHECK-NEXT: S_WAITCNT 1792 +;CHECK-NEXT: IMAGE_SAMPLE_C +;CHECK-NEXT: S_WAITCNT 1792 +;CHECK-NEXT: IMAGE_SAMPLE_C +;CHECK-NEXT: S_WAITCNT 1792 +;CHECK-NEXT: IMAGE_SAMPLE +;CHECK-NEXT: S_WAITCNT 1792 +;CHECK-NEXT: IMAGE_SAMPLE +;CHECK-NEXT: S_WAITCNT 1792 +;CHECK-NEXT: IMAGE_SAMPLE_C +;CHECK-NEXT: S_WAITCNT 1792 +;CHECK-NEXT: IMAGE_SAMPLE_C +;CHECK-NEXT: S_WAITCNT 1792 +;CHECK-NEXT: IMAGE_SAMPLE_C +;CHECK-NEXT: S_WAITCNT 1792 +;CHECK-NEXT: IMAGE_SAMPLE +;CHECK-NEXT: S_WAITCNT 1792 +;CHECK-NEXT: IMAGE_SAMPLE +;CHECK-NEXT: S_WAITCNT 1792 +;CHECK-NEXT: IMAGE_SAMPLE + +define void @test() { + %res1 = call <4 x float> @llvm.SI.sample.(i32 15, <4 x i32> undef, + <8 x i32> undef, <4 x i32> undef, i32 1) + %res2 = call <4 x float> @llvm.SI.sample.(i32 15, <4 x i32> undef, + <8 x i32> undef, <4 x i32> undef, i32 2) + %res3 = call <4 x float> @llvm.SI.sample.(i32 15, <4 x i32> undef, + <8 x i32> undef, <4 x i32> undef, i32 3) + %res4 = call <4 x float> @llvm.SI.sample.(i32 15, <4 x i32> undef, + <8 x i32> undef, <4 x i32> undef, i32 4) + %res5 = call <4 x float> @llvm.SI.sample.(i32 15, <4 x i32> undef, + <8 x i32> undef, <4 x i32> undef, i32 5) + %res6 = call <4 x float> @llvm.SI.sample.(i32 15, <4 x i32> undef, + <8 x i32> undef, <4 x i32> undef, i32 6) + %res7 = call <4 x float> @llvm.SI.sample.(i32 15, <4 x i32> undef, + <8 x i32> undef, <4 x i32> undef, i32 7) + %res8 = call <4 x float> @llvm.SI.sample.(i32 15, <4 x i32> undef, + <8 x i32> undef, <4 x i32> undef, i32 8) + %res9 = call <4 x float> @llvm.SI.sample.(i32 15, <4 x i32> undef, + <8 x i32> undef, <4 x i32> undef, i32 9) + %res10 = call <4 x float> @llvm.SI.sample.(i32 15, <4 x i32> undef, + <8 x i32> undef, <4 x i32> undef, i32 10) + %res11 = call <4 x float> @llvm.SI.sample.(i32 15, <4 x i32> undef, + <8 x i32> undef, <4 x i32> undef, i32 11) + %res12 = call <4 x float> @llvm.SI.sample.(i32 15, <4 x i32> undef, + <8 x i32> undef, <4 x i32> undef, i32 12) + %res13 = call <4 x float> @llvm.SI.sample.(i32 15, <4 x i32> undef, + <8 x i32> undef, <4 x i32> undef, i32 13) + %res14 = call <4 x float> @llvm.SI.sample.(i32 15, <4 x i32> undef, + <8 x i32> undef, <4 x i32> undef, i32 14) + %res15 = call <4 x float> @llvm.SI.sample.(i32 15, <4 x i32> undef, + <8 x i32> undef, <4 x i32> undef, i32 15) + %res16 = call <4 x float> @llvm.SI.sample.(i32 15, <4 x i32> undef, + <8 x i32> undef, <4 x i32> undef, i32 16) + ret void +} + +declare <4 x float> @llvm.SI.sample.(i32, <4 x i32>, <8 x i32>, <4 x i32>, i32) diff --git 
a/test/CodeGen/R600/predicates.ll b/test/CodeGen/R600/predicates.ll new file mode 100644 index 0000000..18895a4 --- /dev/null +++ b/test/CodeGen/R600/predicates.ll @@ -0,0 +1,100 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +; These tests make sure the compiler is optimizing branches using predicates +; when it is legal to do so. + +; CHECK: @simple_if +; CHECK: PRED_SET{{[EGN][ET]*}}_INT Pred, +; CHECK: LSHL T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, 0(0.000000e+00) Pred_sel +define void @simple_if(i32 addrspace(1)* %out, i32 %in) { +entry: + %0 = icmp sgt i32 %in, 0 + br i1 %0, label %IF, label %ENDIF + +IF: + %1 = shl i32 %in, 1 + br label %ENDIF + +ENDIF: + %2 = phi i32 [ %in, %entry ], [ %1, %IF ] + store i32 %2, i32 addrspace(1)* %out + ret void +} + +; CHECK: @simple_if_else +; CHECK: PRED_SET{{[EGN][ET]*}}_INT Pred, +; CHECK: LSH{{[LR] T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, 0(0.000000e+00) Pred_sel +; CHECK: LSH{{[LR] T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, 0(0.000000e+00) Pred_sel +define void @simple_if_else(i32 addrspace(1)* %out, i32 %in) { +entry: + %0 = icmp sgt i32 %in, 0 + br i1 %0, label %IF, label %ELSE + +IF: + %1 = shl i32 %in, 1 + br label %ENDIF + +ELSE: + %2 = lshr i32 %in, 1 + br label %ENDIF + +ENDIF: + %3 = phi i32 [ %1, %IF ], [ %2, %ELSE ] + store i32 %3, i32 addrspace(1)* %out + ret void +} + +; CHECK: @nested_if +; CHECK: IF_PREDICATE_SET +; CHECK: PRED_SET{{[EGN][ET]*}}_INT Pred, +; CHECK: LSHL T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, 0(0.000000e+00) Pred_sel +; CHECK: ENDIF +define void @nested_if(i32 addrspace(1)* %out, i32 %in) { +entry: + %0 = icmp sgt i32 %in, 0 + br i1 %0, label %IF0, label %ENDIF + +IF0: + %1 = add i32 %in, 10 + %2 = icmp sgt i32 %1, 0 + br i1 %2, label %IF1, label %ENDIF + +IF1: + %3 = shl i32 %1, 1 + br label %ENDIF + +ENDIF: + %4 = phi i32 [%in, %entry], [%1, %IF0], [%3, %IF1] + store i32 %4, i32 addrspace(1)* %out + ret void +} + +; CHECK: @nested_if_else +; CHECK: IF_PREDICATE_SET +; CHECK: PRED_SET{{[EGN][ET]*}}_INT Pred, +; CHECK: LSH{{[LR] T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, 0(0.000000e+00) Pred_sel +; CHECK: LSH{{[LR] T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, 0(0.000000e+00) Pred_sel +; CHECK: ENDIF +define void @nested_if_else(i32 addrspace(1)* %out, i32 %in) { +entry: + %0 = icmp sgt i32 %in, 0 + br i1 %0, label %IF0, label %ENDIF + +IF0: + %1 = add i32 %in, 10 + %2 = icmp sgt i32 %1, 0 + br i1 %2, label %IF1, label %ELSE1 + +IF1: + %3 = shl i32 %1, 1 + br label %ENDIF + +ELSE1: + %4 = lshr i32 %in, 1 + br label %ENDIF + +ENDIF: + %5 = phi i32 [%in, %entry], [%3, %IF1], [%4, %ELSE1] + store i32 %5, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/R600/selectcc-icmp-select-float.ll b/test/CodeGen/R600/selectcc-icmp-select-float.ll index f65a300..359ca1e 100644 --- a/test/CodeGen/R600/selectcc-icmp-select-float.ll +++ b/test/CodeGen/R600/selectcc-icmp-select-float.ll @@ -2,7 +2,7 @@ ; Note additional optimizations may cause this SGT to be replaced with a ; CND* instruction. 
-; CHECK: SGT_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], literal.x, -1}} +; CHECK: SETGT_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], literal.x, -1}} ; Test a selectcc with i32 LHS/RHS and float True/False define void @test(float addrspace(1)* %out, i32 addrspace(1)* %in) { diff --git a/test/CodeGen/R600/set-dx10.ll b/test/CodeGen/R600/set-dx10.ll new file mode 100644 index 0000000..54febcf --- /dev/null +++ b/test/CodeGen/R600/set-dx10.ll @@ -0,0 +1,137 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +; These tests check that floating point comparisons which are used by select +; to store integer true (-1) and false (0) values are lowered to one of the +; SET*DX10 instructions. + +; CHECK: @fcmp_une_select_fptosi +; CHECK: SETNE_DX10 T{{[0-9]+\.[XYZW]}}, T{{[0-9]+\.[XYZW]}}, literal.x, 1084227584(5.000000e+00) +define void @fcmp_une_select_fptosi(i32 addrspace(1)* %out, float %in) { +entry: + %0 = fcmp une float %in, 5.0 + %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 + %2 = fsub float -0.000000e+00, %1 + %3 = fptosi float %2 to i32 + store i32 %3, i32 addrspace(1)* %out + ret void +} + +; CHECK: @fcmp_une_select_i32 +; CHECK: SETNE_DX10 T{{[0-9]+\.[XYZW]}}, T{{[0-9]+\.[XYZW]}}, literal.x, 1084227584(5.000000e+00) +define void @fcmp_une_select_i32(i32 addrspace(1)* %out, float %in) { +entry: + %0 = fcmp une float %in, 5.0 + %1 = select i1 %0, i32 -1, i32 0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; CHECK: @fcmp_ueq_select_fptosi +; CHECK: SETE_DX10 T{{[0-9]+\.[XYZW]}}, T{{[0-9]+\.[XYZW]}}, literal.x, 1084227584(5.000000e+00) +define void @fcmp_ueq_select_fptosi(i32 addrspace(1)* %out, float %in) { +entry: + %0 = fcmp ueq float %in, 5.0 + %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 + %2 = fsub float -0.000000e+00, %1 + %3 = fptosi float %2 to i32 + store i32 %3, i32 addrspace(1)* %out + ret void +} + +; CHECK: @fcmp_ueq_select_i32 +; CHECK: SETE_DX10 T{{[0-9]+\.[XYZW]}}, T{{[0-9]+\.[XYZW]}}, literal.x, 1084227584(5.000000e+00) +define void @fcmp_ueq_select_i32(i32 addrspace(1)* %out, float %in) { +entry: + %0 = fcmp ueq float %in, 5.0 + %1 = select i1 %0, i32 -1, i32 0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; CHECK: @fcmp_ugt_select_fptosi +; CHECK: SETGT_DX10 T{{[0-9]+\.[XYZW]}}, T{{[0-9]+\.[XYZW]}}, literal.x, 1084227584(5.000000e+00) +define void @fcmp_ugt_select_fptosi(i32 addrspace(1)* %out, float %in) { +entry: + %0 = fcmp ugt float %in, 5.0 + %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 + %2 = fsub float -0.000000e+00, %1 + %3 = fptosi float %2 to i32 + store i32 %3, i32 addrspace(1)* %out + ret void +} + +; CHECK: @fcmp_ugt_select_i32 +; CHECK: SETGT_DX10 T{{[0-9]+\.[XYZW]}}, T{{[0-9]+\.[XYZW]}}, literal.x, 1084227584(5.000000e+00) +define void @fcmp_ugt_select_i32(i32 addrspace(1)* %out, float %in) { +entry: + %0 = fcmp ugt float %in, 5.0 + %1 = select i1 %0, i32 -1, i32 0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; CHECK: @fcmp_uge_select_fptosi +; CHECK: SETGE_DX10 T{{[0-9]+\.[XYZW]}}, T{{[0-9]+\.[XYZW]}}, literal.x, 1084227584(5.000000e+00) +define void @fcmp_uge_select_fptosi(i32 addrspace(1)* %out, float %in) { +entry: + %0 = fcmp uge float %in, 5.0 + %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 + %2 = fsub float -0.000000e+00, %1 + %3 = fptosi float %2 to i32 + store i32 %3, i32 addrspace(1)* %out + ret void +} + +; CHECK: @fcmp_uge_select_i32 +; CHECK: SETGE_DX10 T{{[0-9]+\.[XYZW]}}, T{{[0-9]+\.[XYZW]}}, literal.x, 1084227584(5.000000e+00) +define void 
@fcmp_uge_select_i32(i32 addrspace(1)* %out, float %in) { +entry: + %0 = fcmp uge float %in, 5.0 + %1 = select i1 %0, i32 -1, i32 0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; CHECK: @fcmp_ule_select_fptosi +; CHECK: SETGE_DX10 T{{[0-9]+\.[XYZW]}}, literal.x, T{{[0-9]+\.[XYZW]}}, 1084227584(5.000000e+00) +define void @fcmp_ule_select_fptosi(i32 addrspace(1)* %out, float %in) { +entry: + %0 = fcmp ule float %in, 5.0 + %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 + %2 = fsub float -0.000000e+00, %1 + %3 = fptosi float %2 to i32 + store i32 %3, i32 addrspace(1)* %out + ret void +} + +; CHECK: @fcmp_ule_select_i32 +; CHECK: SETGE_DX10 T{{[0-9]+\.[XYZW]}}, literal.x, T{{[0-9]+\.[XYZW]}}, 1084227584(5.000000e+00) +define void @fcmp_ule_select_i32(i32 addrspace(1)* %out, float %in) { +entry: + %0 = fcmp ule float %in, 5.0 + %1 = select i1 %0, i32 -1, i32 0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; CHECK: @fcmp_ult_select_fptosi +; CHECK: SETGT_DX10 T{{[0-9]+\.[XYZW]}}, literal.x, T{{[0-9]+\.[XYZW]}}, 1084227584(5.000000e+00) +define void @fcmp_ult_select_fptosi(i32 addrspace(1)* %out, float %in) { +entry: + %0 = fcmp ult float %in, 5.0 + %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 + %2 = fsub float -0.000000e+00, %1 + %3 = fptosi float %2 to i32 + store i32 %3, i32 addrspace(1)* %out + ret void +} + +; CHECK: @fcmp_ult_select_i32 +; CHECK: SETGT_DX10 T{{[0-9]+\.[XYZW]}}, literal.x, T{{[0-9]+\.[XYZW]}}, 1084227584(5.000000e+00) +define void @fcmp_ult_select_i32(i32 addrspace(1)* %out, float %in) { +entry: + %0 = fcmp ult float %in, 5.0 + %1 = select i1 %0, i32 -1, i32 0 + store i32 %1, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/R600/short-args.ll b/test/CodeGen/R600/short-args.ll index 1070250..b69e327 100644 --- a/test/CodeGen/R600/short-args.ll +++ b/test/CodeGen/R600/short-args.ll @@ -1,5 +1,6 @@ ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s +; CHECK: @i8_arg ; CHECK: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}} define void @i8_arg(i32 addrspace(1)* nocapture %out, i8 %in) nounwind { @@ -9,6 +10,7 @@ entry: ret void } +; CHECK: @i8_zext_arg ; CHECK: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}} define void @i8_zext_arg(i32 addrspace(1)* nocapture %out, i8 zeroext %in) nounwind { @@ -18,6 +20,7 @@ entry: ret void } +; CHECK: @i16_arg ; CHECK: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}} define void @i16_arg(i32 addrspace(1)* nocapture %out, i16 %in) nounwind { @@ -27,6 +30,7 @@ entry: ret void } +; CHECK: @i16_zext_arg ; CHECK: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}} define void @i16_zext_arg(i32 addrspace(1)* nocapture %out, i16 zeroext %in) nounwind { diff --git a/test/CodeGen/R600/unsupported-cc.ll b/test/CodeGen/R600/unsupported-cc.ll new file mode 100644 index 0000000..b48c591 --- /dev/null +++ b/test/CodeGen/R600/unsupported-cc.ll @@ -0,0 +1,83 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +; These tests are for condition codes that are not supported by the hardware + +; CHECK: @slt +; CHECK: SETGT_INT T{{[0-9]+\.[XYZW]}}, literal.x, {{T[0-9]+\.[XYZW]}}, 5(7.006492e-45) +define void @slt(i32 addrspace(1)* %out, i32 %in) { +entry: + %0 = icmp slt i32 %in, 5 + %1 = select i1 %0, i32 -1, i32 0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; CHECK: @ult_i32 +; CHECK: SETGT_UINT T{{[0-9]+\.[XYZW]}}, literal.x, {{T[0-9]+\.[XYZW]}}, 5(7.006492e-45) +define void @ult_i32(i32 addrspace(1)* %out, i32 %in) { +entry: + %0 = icmp ult i32 %in, 5 + %1 = select i1 %0, i32 -1, i32 0 + store i32 %1, i32 
addrspace(1)* %out + ret void +} + +; CHECK: @ult_float +; CHECK: SETGT T{{[0-9]+\.[XYZW]}}, literal.x, {{T[0-9]+\.[XYZW]}}, 1084227584(5.000000e+00) +define void @ult_float(float addrspace(1)* %out, float %in) { +entry: + %0 = fcmp ult float %in, 5.0 + %1 = select i1 %0, float 1.0, float 0.0 + store float %1, float addrspace(1)* %out + ret void +} + +; CHECK: @olt +; CHECK: SETGT T{{[0-9]+\.[XYZW]}}, literal.x, {{T[0-9]+\.[XYZW]}}, 1084227584(5.000000e+00) +define void @olt(float addrspace(1)* %out, float %in) { +entry: + %0 = fcmp olt float %in, 5.0 + %1 = select i1 %0, float 1.0, float 0.0 + store float %1, float addrspace(1)* %out + ret void +} + +; CHECK: @sle +; CHECK: SETGT_INT T{{[0-9]+\.[XYZW]}}, literal.x, {{T[0-9]+\.[XYZW]}}, 6(8.407791e-45) +define void @sle(i32 addrspace(1)* %out, i32 %in) { +entry: + %0 = icmp sle i32 %in, 5 + %1 = select i1 %0, i32 -1, i32 0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; CHECK: @ule_i32 +; CHECK: SETGT_UINT T{{[0-9]+\.[XYZW]}}, literal.x, {{T[0-9]+\.[XYZW]}}, 6(8.407791e-45) +define void @ule_i32(i32 addrspace(1)* %out, i32 %in) { +entry: + %0 = icmp ule i32 %in, 5 + %1 = select i1 %0, i32 -1, i32 0 + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; CHECK: @ule_float +; CHECK: SETGE T{{[0-9]+\.[XYZW]}}, literal.x, {{T[0-9]+\.[XYZW]}}, 1084227584(5.000000e+00) +define void @ule_float(float addrspace(1)* %out, float %in) { +entry: + %0 = fcmp ule float %in, 5.0 + %1 = select i1 %0, float 1.0, float 0.0 + store float %1, float addrspace(1)* %out + ret void +} + +; CHECK: @ole +; CHECK: SETGE T{{[0-9]+\.[XYZW]}}, literal.x, {{T[0-9]+\.[XYZW]}}, 1084227584(5.000000e+00) +define void @ole(float addrspace(1)* %out, float %in) { +entry: + %0 = fcmp ole float %in, 5.0 + %1 = select i1 %0, float 1.0, float 0.0 + store float %1, float addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/R600/vec4-expand.ll b/test/CodeGen/R600/vec4-expand.ll index c61f6e2..8f62bc6 100644 --- a/test/CodeGen/R600/vec4-expand.ll +++ b/test/CodeGen/R600/vec4-expand.ll @@ -1,5 +1,6 @@ ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s +; CHECK: @fp_to_sint ; CHECK: FLT_TO_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; CHECK: FLT_TO_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; CHECK: FLT_TO_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} @@ -12,6 +13,7 @@ define void @fp_to_sint(<4 x i32> addrspace(1)* %out, <4 x float> addrspace(1)* ret void } +; CHECK: @fp_to_uint ; CHECK: FLT_TO_UINT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; CHECK: FLT_TO_UINT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; CHECK: FLT_TO_UINT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} @@ -24,6 +26,7 @@ define void @fp_to_uint(<4 x i32> addrspace(1)* %out, <4 x float> addrspace(1)* ret void } +; CHECK: @sint_to_fp ; CHECK: INT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; CHECK: INT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; CHECK: INT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} @@ -36,6 +39,7 @@ define void @sint_to_fp(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* ret void } +; CHECK: @uint_to_fp ; CHECK: UINT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; CHECK: UINT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; CHECK: UINT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} diff --git a/test/CodeGen/Thumb/stack-coloring-without-frame-ptr.ll b/test/CodeGen/Thumb/stack-coloring-without-frame-ptr.ll new file mode 100644 index 0000000..3f6407a --- /dev/null +++ b/test/CodeGen/Thumb/stack-coloring-without-frame-ptr.ll @@ -0,0 +1,29 @@ +; RUN: llc < %s -march=thumb -mcpu=arm1022e + +%iterator = type { i8**, 
i8**, i8**, i8*** } +%insert_iterator = type { %deque*, %iterator } +%deque = type { %iterator, %iterator, i8***, i32 } + +define i32 @test_thumbv5e_fp_elim() nounwind optsize { +entry: + %var1 = alloca %iterator, align 4 + %var2 = alloca %insert_iterator, align 4 + %var3 = alloca %deque, align 4 + + %0 = bitcast %deque* %var3 to i8* + %1 = bitcast %iterator* %var1 to i8* + call void @llvm.lifetime.start(i64 16, i8* %1) nounwind + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %1, i8* %0, i32 16, i32 4, i1 false) + call void @llvm.lifetime.end(i64 16, i8* %1) nounwind + + %2 = bitcast %insert_iterator* %var2 to i8* + call void @llvm.lifetime.start(i64 20, i8* %2) nounwind + + ret i32 0 +} + +declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind + +declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind + +declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind diff --git a/test/CodeGen/Thumb2/2013-02-19-tail-call-register-hint.ll b/test/CodeGen/Thumb2/2013-02-19-tail-call-register-hint.ll new file mode 100644 index 0000000..502b138 --- /dev/null +++ b/test/CodeGen/Thumb2/2013-02-19-tail-call-register-hint.ll @@ -0,0 +1,53 @@ +; RUN: llc < %s -mtriple=thumbv7s-apple-ios6.0.0 -verify-machineinstrs + +; Check to make sure the tail-call return at the end doesn't use a +; callee-saved register. Register hinting from t2LDRDri was getting this +; wrong. The intervening call will force allocation to try a high register +; first, so the hint will attempt to fire, but must be rejected due to +; not being in the allocation order for the tcGPR register class. +; The machine instruction verifier will make sure that all actually worked +; out the way it's supposed to. + +%"myclass" = type { %struct.foo } +%struct.foo = type { i32, [40 x i8] } + +define hidden void @func(i8* %Data) nounwind ssp { + %1 = getelementptr inbounds i8* %Data, i32 12 + %2 = bitcast i8* %1 to %"myclass"* + tail call void @abc(%"myclass"* %2) nounwind + tail call void @def(%"myclass"* %2) nounwind + %3 = getelementptr inbounds i8* %Data, i32 8 + %4 = bitcast i8* %3 to i8** + %5 = load i8** %4, align 4, !tbaa !0 + tail call void @ghi(i8* %5) nounwind + %6 = bitcast i8* %Data to void (i8*)** + %7 = load void (i8*)** %6, align 4, !tbaa !0 + %8 = getelementptr inbounds i8* %Data, i32 4 + %9 = bitcast i8* %8 to i8** + %10 = load i8** %9, align 4, !tbaa !0 + %11 = icmp eq i8* %Data, null + br i1 %11, label %14, label %12 + +; <label>:12 ; preds = %0 + %13 = tail call %"myclass"* @jkl(%"myclass"* %2) nounwind + tail call void @mno(i8* %Data) nounwind + br label %14 + +; <label>:14 ; preds = %12, %0 + tail call void %7(i8* %10) nounwind + ret void +} + +declare void @mno(i8*) + +declare void @def(%"myclass"*) + +declare void @abc(%"myclass"*) + +declare void @ghi(i8*) + +declare %"myclass"* @jkl(%"myclass"*) nounwind + +!0 = metadata !{metadata !"any pointer", metadata !1} +!1 = metadata !{metadata !"omnipotent char", metadata !2} +!2 = metadata !{metadata !"Simple C/C++ TBAA"} diff --git a/test/CodeGen/Thumb2/aligned-spill.ll b/test/CodeGen/Thumb2/aligned-spill.ll index c98ca80..3a2803f 100644 --- a/test/CodeGen/Thumb2/aligned-spill.ll +++ b/test/CodeGen/Thumb2/aligned-spill.ll @@ -26,8 +26,8 @@ entry: ; NEON: bic r4, r4, #15 ; Stack pointer must be updated before the spills. ; NEON: mov sp, r4 -; NEON: vst1.64 {d8, d9, d10, d11}, [r4, :128]! -; NEON: vst1.64 {d12, d13, d14, d15}, [r4, :128] +; NEON: vst1.64 {d8, d9, d10, d11}, [r4:128]! 
+; NEON: vst1.64 {d12, d13, d14, d15}, [r4:128] ; Stack pointer adjustment for the stack frame contents. ; This could legally happen before the spills. ; Since the spill slot is only 8 bytes, technically it would be fine to only @@ -36,8 +36,8 @@ entry: ; NEON: sub sp, #16 ; The epilog is free to use another scratch register than r4. ; NEON: add r[[R4:[0-9]+]], sp, #16 -; NEON: vld1.64 {d8, d9, d10, d11}, [r[[R4]], :128]! -; NEON: vld1.64 {d12, d13, d14, d15}, [r[[R4]], :128] +; NEON: vld1.64 {d8, d9, d10, d11}, [r[[R4]]:128]! +; NEON: vld1.64 {d12, d13, d14, d15}, [r[[R4]]:128] ; The stack pointer restore must happen after the reloads. ; NEON: mov sp, ; NEON: pop @@ -57,8 +57,8 @@ entry: ; NEON: bic r4, r4, #15 ; Stack pointer must be updated before the spills. ; NEON: mov sp, r4 -; NEON: vst1.64 {d8, d9, d10, d11}, [r4, :128]! -; NEON: vst1.64 {d12, d13}, [r4, :128] +; NEON: vst1.64 {d8, d9, d10, d11}, [r4:128]! +; NEON: vst1.64 {d12, d13}, [r4:128] ; NEON: vstr d14, [r4, #16] ; Epilog ; NEON: vld1.64 {d8, d9, d10, d11}, @@ -84,7 +84,7 @@ entry: ; NEON: bic r4, r4, #15 ; Stack pointer must be updated before the spills. ; NEON: mov sp, r4 -; NEON: vst1.64 {d8, d9}, [r4, :128] +; NEON: vst1.64 {d8, d9}, [r4:128] ; NEON: vstr d10, [r4, #16] ; Epilog ; NEON: vld1.64 {d8, d9}, diff --git a/test/CodeGen/Thumb2/cortex-fp.ll b/test/CodeGen/Thumb2/cortex-fp.ll index b7df2fb..f6cea72 100644 --- a/test/CodeGen/Thumb2/cortex-fp.ll +++ b/test/CodeGen/Thumb2/cortex-fp.ll @@ -7,7 +7,7 @@ define float @foo(float %a, float %b) { entry: ; CHECK: foo ; CORTEXM3: blx ___mulsf3 -; CORTEXM4: vmul.f32 s0, s2, s0 +; CORTEXM4: vmul.f32 s ; CORTEXA8: vmul.f32 d %0 = fmul float %a, %b ret float %0 diff --git a/test/CodeGen/Thumb2/crash.ll b/test/CodeGen/Thumb2/crash.ll index cb4d080..6ce0b82 100644 --- a/test/CodeGen/Thumb2/crash.ll +++ b/test/CodeGen/Thumb2/crash.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=cortex-a8 -verify-machineinstrs +; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=cortex-a8 -verify-machineinstrs -O0 target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:32-n32" target triple = "thumbv7-apple-darwin10" @@ -76,3 +77,11 @@ entry: store i32 %num, i32* %p2, align 4 ret void } + +; Check RAFast handling of inline assembly with many dense clobbers. +; The large tuple aliases of the vector registers can cause problems. 
+define void @rdar13249625(double* nocapture %p) nounwind { + %1 = tail call double asm sideeffect "@ $0", "=w,~{d0},~{q1},~{q2},~{q3},~{q4},~{q5},~{q6},~{q7},~{q8},~{q9},~{q10},~{q11},~{q12},~{q13},~{q14},~{q15}"() nounwind + store double %1, double* %p, align 4 + ret void +} diff --git a/test/CodeGen/Thumb2/thumb2-ldr_post.ll b/test/CodeGen/Thumb2/thumb2-ldr_post.ll index 2178eec..bce8474 100644 --- a/test/CodeGen/Thumb2/thumb2-ldr_post.ll +++ b/test/CodeGen/Thumb2/thumb2-ldr_post.ll @@ -1,5 +1,4 @@ -; RUN: llc < %s -march=thumb -mattr=+thumb2 | \ -; RUN: grep "ldr.*\[.*\]," | count 1 +; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s define i32 @test(i32 %a, i32 %b, i32 %c) { %tmp1 = mul i32 %a, %b ; <i32> [#uses=2] @@ -9,4 +8,5 @@ define i32 @test(i32 %a, i32 %b, i32 %c) { %tmp5 = mul i32 %tmp4, %tmp3 ; <i32> [#uses=1] ret i32 %tmp5 } +; CHECK: ldr r{{.*}}, [{{.*}}], diff --git a/test/CodeGen/Thumb2/thumb2-spill-q.ll b/test/CodeGen/Thumb2/thumb2-spill-q.ll index d9a0617..5bff268 100644 --- a/test/CodeGen/Thumb2/thumb2-spill-q.ll +++ b/test/CodeGen/Thumb2/thumb2-spill-q.ll @@ -12,8 +12,8 @@ declare <4 x float> @llvm.arm.neon.vld1.v4f32(i8*, i32) nounwind readonly define void @aaa(%quuz* %this, i8* %block) { ; CHECK: aaa: ; CHECK: bic r4, r4, #15 -; CHECK: vst1.64 {{.*}}[{{.*}}, :128] -; CHECK: vld1.64 {{.*}}[{{.*}}, :128] +; CHECK: vst1.64 {{.*}}[{{.*}}:128] +; CHECK: vld1.64 {{.*}}[{{.*}}:128] entry: %aligned_vec = alloca <4 x float>, align 16 %"alloca point" = bitcast i32 0 to i32 diff --git a/test/CodeGen/X86/2008-08-31-EH_RETURN64.ll b/test/CodeGen/X86/2008-08-31-EH_RETURN64.ll index d423bfc..496779c 100644 --- a/test/CodeGen/X86/2008-08-31-EH_RETURN64.ll +++ b/test/CodeGen/X86/2008-08-31-EH_RETURN64.ll @@ -1,10 +1,15 @@ ; Check that eh_return & unwind_init were properly lowered -; RUN: llc < %s | grep %rbp | count 7 -; RUN: llc < %s | grep %rcx | count 3 +; RUN: llc < %s -verify-machineinstrs | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128" target triple = "x86_64-unknown-linux-gnu" +; CHECK: test +; CHECK: pushq %rbp +; CHECK: movq %rsp, %rbp +; CHECK: popq %rbp +; CHECK: movq %rcx, %rsp +; CHECK: ret # eh_return, addr: %rcx define i8* @test(i64 %a, i8* %b) { entry: call void @llvm.eh.unwind.init() @@ -15,3 +20,36 @@ entry: declare void @llvm.eh.return.i64(i64, i8*) declare void @llvm.eh.unwind.init() + +@b = common global i32 0, align 4 +@a = common global i32 0, align 4 + +; PR14750 +; This function contains a normal return as well as eh_return. +; CHECK: _Unwind_Resume_or_Rethrow +define i32 @_Unwind_Resume_or_Rethrow() nounwind uwtable ssp { +entry: + %0 = load i32* @b, align 4 + %tobool = icmp eq i32 %0, 0 + br i1 %tobool, label %if.end, label %if.then + +if.then: ; preds = %entry + ret i32 0 + +if.end: ; preds = %entry + %call = tail call i32 (...)* @_Unwind_ForcedUnwind_Phase2() nounwind + store i32 %call, i32* @a, align 4 + %tobool1 = icmp eq i32 %call, 0 + br i1 %tobool1, label %cond.end, label %cond.true + +cond.true: ; preds = %if.end + tail call void @abort() noreturn nounwind + unreachable + +cond.end: ; preds = %if.end + tail call void @llvm.eh.return.i64(i64 0, i8* null) + unreachable +} + +declare i32 @_Unwind_ForcedUnwind_Phase2(...) 
+declare void @abort() noreturn diff --git a/test/CodeGen/X86/2010-12-02-MC-Set.ll b/test/CodeGen/X86/2010-12-02-MC-Set.ll index 3144678..cf40624 100644 --- a/test/CodeGen/X86/2010-12-02-MC-Set.ll +++ b/test/CodeGen/X86/2010-12-02-MC-Set.ll @@ -18,5 +18,5 @@ entry: ; CHECK: .subsections_via_symbols ; CHECK-NEXT: __debug_line -; CHECK-NEXT: Ltmp +; CHECK-NEXT: Lline_table_start0 ; CHECK-NEXT: Ltmp{{[0-9]}} = (Ltmp diff --git a/test/CodeGen/X86/2011-11-30-or.ll b/test/CodeGen/X86/2011-11-30-or.ll index f66248b..8ac4632 100644 --- a/test/CodeGen/X86/2011-11-30-or.ll +++ b/test/CodeGen/X86/2011-11-30-or.ll @@ -8,9 +8,9 @@ target triple = "x86_64-apple-macosx10.6.6" ; CHECK: pblendvb %xmm1, %xmm2 ; CHECK: ret -define void @select_func() { +define void @select_func(<8 x i16> %in) { entry: - %c.lobit.i.i.i = ashr <8 x i16> <i16 17, i16 5, i16 1, i16 15, i16 19, i16 15, i16 4, i16 1> , <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15> + %c.lobit.i.i.i = ashr <8 x i16> %in, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15> %and.i56.i.i.i = and <8 x i16> %c.lobit.i.i.i, <i16 25, i16 8, i16 65, i16 25, i16 8, i16 95, i16 15, i16 45> %and.i5.i.i.i = bitcast <8 x i16> %and.i56.i.i.i to <2 x i64> %neg.i.i.i.i = xor <8 x i16> %c.lobit.i.i.i, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1> diff --git a/test/CodeGen/X86/2011-12-28-vselecti8.ll b/test/CodeGen/X86/2011-12-28-vselecti8.ll index 1a9d46d..dbc122a 100644 --- a/test/CodeGen/X86/2011-12-28-vselecti8.ll +++ b/test/CodeGen/X86/2011-12-28-vselecti8.ll @@ -5,7 +5,7 @@ target triple = "x86_64-apple-darwin11.2.0" ; CHECK: @foo8 ; CHECK: psll -; CHECK-NOT: psraw +; CHECK: psraw ; CHECK: pblendvb ; CHECK: ret define void @foo8(float* nocapture %RET) nounwind { diff --git a/test/CodeGen/X86/2012-01-11-split-cv.ll b/test/CodeGen/X86/2012-01-11-split-cv.ll index 6b90072..7e91498 100644 --- a/test/CodeGen/X86/2012-01-11-split-cv.ll +++ b/test/CodeGen/X86/2012-01-11-split-cv.ll @@ -2,7 +2,7 @@ ;CHECK: add18i16 define void @add18i16(<18 x i16>* nocapture sret %ret, <18 x i16>* %bp) nounwind { -;CHECK: vmovups +;CHECK: vmovaps %b = load <18 x i16>* %bp, align 16 %x = add <18 x i16> zeroinitializer, %b store <18 x i16> %x, <18 x i16>* %ret, align 16 diff --git a/test/CodeGen/X86/2013-01-09-DAGCombineBug.ll b/test/CodeGen/X86/2013-01-09-DAGCombineBug.ll new file mode 100644 index 0000000..db7ec8a --- /dev/null +++ b/test/CodeGen/X86/2013-01-09-DAGCombineBug.ll @@ -0,0 +1,41 @@ +; RUN: llc -mtriple=x86_64-apple-macosx10.5.0 < %s + +; rdar://12968664 + +define void @t() nounwind uwtable ssp { + br label %4 + +; <label>:1 ; preds = %4, %2 + ret void + +; <label>:2 ; preds = %6, %5, %3, %2 + switch i32 undef, label %2 [ + i32 1090573978, label %1 + i32 1090573938, label %3 + i32 1090573957, label %5 + ] + +; <label>:3 ; preds = %4, %2 + br i1 undef, label %2, label %4 + +; <label>:4 ; preds = %6, %5, %3, %0 + switch i32 undef, label %11 [ + i32 1090573938, label %3 + i32 1090573957, label %5 + i32 1090573978, label %1 + i32 165205179, label %6 + ] + +; <label>:5 ; preds = %4, %2 + br i1 undef, label %2, label %4 + +; <label>:6 ; preds = %4 + %7 = icmp eq i32 undef, 590901838 + %8 = or i1 false, %7 + %9 = or i1 true, %8 + %10 = xor i1 %8, %9 + br i1 %10, label %4, label %2 + +; <label>:11 ; preds = %11, %4 + br label %11 +} diff --git a/test/CodeGen/X86/2013-02-12-ShuffleToZext.ll b/test/CodeGen/X86/2013-02-12-ShuffleToZext.ll new file mode 100644 index 0000000..614ccda --- /dev/null +++ 
b/test/CodeGen/X86/2013-02-12-ShuffleToZext.ll @@ -0,0 +1,14 @@ +; RUN: llc < %s -march=x86-64 -mcpu=corei7-avx -mtriple=x86_64-pc-win32 | FileCheck %s + +; CHECK: test +; CHECK: vpmovzxwd +; CHECK: vpmovzxwd +define void @test(<4 x i64> %a, <4 x i16>* %buf) { + %ex1 = extractelement <4 x i64> %a, i32 0 + %ex2 = extractelement <4 x i64> %a, i32 1 + %x1 = bitcast i64 %ex1 to <4 x i16> + %x2 = bitcast i64 %ex2 to <4 x i16> + %Sh = shufflevector <4 x i16> %x1, <4 x i16> %x2, <4 x i32> <i32 0, i32 1, i32 4, i32 5> + store <4 x i16> %Sh, <4 x i16>* %buf, align 1 + ret void +} diff --git a/test/CodeGen/X86/Atomics-64.ll b/test/CodeGen/X86/Atomics-64.ll index 8e93762..8b0a349 100644 --- a/test/CodeGen/X86/Atomics-64.ll +++ b/test/CodeGen/X86/Atomics-64.ll @@ -1,5 +1,5 @@ -; RUN: llc < %s -march=x86-64 > %t -; RUN: llc < %s -march=x86 > %t +; RUN: llc < %s -march=x86-64 > %t.x86-64 +; RUN: llc < %s -march=x86 > %t.x86 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128" target triple = "x86_64-apple-darwin8" diff --git a/test/CodeGen/X86/GC/ocaml-gc.ll b/test/CodeGen/X86/GC/ocaml-gc.ll new file mode 100644 index 0000000..44241a9 --- /dev/null +++ b/test/CodeGen/X86/GC/ocaml-gc.ll @@ -0,0 +1,31 @@ +; RUN: llc < %s -mtriple=x86_64-linux-gnu | FileCheck %s + +define i32 @main(i32 %x) nounwind gc "ocaml" { +; CHECK: .text +; CHECK-NEXT: .globl caml_3C_stdin_3E___code_begin +; CHECK-NEXT: caml_3C_stdin_3E___code_begin: +; CHECK-NEXT: .data +; CHECK-NEXT: .globl caml_3C_stdin_3E___data_begin +; CHECK-NEXT: caml_3C_stdin_3E___data_begin: + + %puts = tail call i32 @foo(i32 %x) + ret i32 0 + +; CHECK: .globl caml_3C_stdin_3E___code_end +; CHECK-NEXT: caml_3C_stdin_3E___code_end: +; CHECK-NEXT: .data +; CHECK-NEXT: .globl caml_3C_stdin_3E___data_end +; CHECK-NEXT: caml_3C_stdin_3E___data_end: +; CHECK-NEXT: .quad 0 +; CHECK-NEXT: .globl caml_3C_stdin_3E___frametable +; CHECK-NEXT: caml_3C_stdin_3E___frametable: +; CHECK-NEXT: .short 1 +; CHECK-NEXT: .align 8 +; CHECK-NEXT: # live roots for main +; CHECK-NEXT: .quad .Ltmp0 +; CHECK-NEXT: .short 8 +; CHECK-NEXT: .short 0 +; CHECK-NEXT: .align 8 +} + +declare i32 @foo(i32) diff --git a/test/CodeGen/X86/MergeConsecutiveStores.ll b/test/CodeGen/X86/MergeConsecutiveStores.ll index 64825ba..fbe8879 100644 --- a/test/CodeGen/X86/MergeConsecutiveStores.ll +++ b/test/CodeGen/X86/MergeConsecutiveStores.ll @@ -40,9 +40,43 @@ define void @merge_const_store(i32 %count, %struct.A* nocapture %p) nounwind uwt ret void } +; No vectors because we use noimplicitfloat +; CHECK: merge_const_store_no_vec +; CHECK-NOT: vmovups +; CHECK: ret +define void @merge_const_store_no_vec(i32 %count, %struct.B* nocapture %p) noimplicitfloat{ + %1 = icmp sgt i32 %count, 0 + br i1 %1, label %.lr.ph, label %._crit_edge +.lr.ph: + %i.02 = phi i32 [ %10, %.lr.ph ], [ 0, %0 ] + %.01 = phi %struct.B* [ %11, %.lr.ph ], [ %p, %0 ] + %2 = getelementptr inbounds %struct.B* %.01, i64 0, i32 0 + store i32 0, i32* %2, align 4 + %3 = getelementptr inbounds %struct.B* %.01, i64 0, i32 1 + store i32 0, i32* %3, align 4 + %4 = getelementptr inbounds %struct.B* %.01, i64 0, i32 2 + store i32 0, i32* %4, align 4 + %5 = getelementptr inbounds %struct.B* %.01, i64 0, i32 3 + store i32 0, i32* %5, align 4 + %6 = getelementptr inbounds %struct.B* %.01, i64 0, i32 4 + store i32 0, i32* %6, align 4 + %7 = getelementptr inbounds %struct.B* %.01, i64 0, i32 5 + store i32 0, i32* %7, align 4 + %8 = getelementptr inbounds 
%struct.B* %.01, i64 0, i32 6 + store i32 0, i32* %8, align 4 + %9 = getelementptr inbounds %struct.B* %.01, i64 0, i32 7 + store i32 0, i32* %9, align 4 + %10 = add nsw i32 %i.02, 1 + %11 = getelementptr inbounds %struct.B* %.01, i64 1 + %exitcond = icmp eq i32 %10, %count + br i1 %exitcond, label %._crit_edge, label %.lr.ph +._crit_edge: + ret void +} + ; Move the constants using a single vector store. ; CHECK: merge_const_store_vec -; CHECK: vmovups %ymm0, (%rsi) +; CHECK: vmovups ; CHECK: ret define void @merge_const_store_vec(i32 %count, %struct.B* nocapture %p) nounwind uwtable noinline ssp { %1 = icmp sgt i32 %count, 0 diff --git a/test/CodeGen/X86/atom-pad-short-functions.ll b/test/CodeGen/X86/atom-pad-short-functions.ll index 54af17d..b9a39e0 100644 --- a/test/CodeGen/X86/atom-pad-short-functions.ll +++ b/test/CodeGen/X86/atom-pad-short-functions.ll @@ -22,6 +22,13 @@ define i32 @test_optsize(i32 %a) nounwind optsize { ret i32 %a } +define i32 @test_minsize(i32 %a) nounwind minsize { +; CHECK: test_minsize +; CHECK: movl +; CHECK-NEXT: ret + ret i32 %a +} + define i32 @test_add(i32 %a, i32 %b) nounwind { ; CHECK: test_add ; CHECK: addl @@ -76,3 +83,21 @@ if.end: ret void } + +define void @test_branch_to_same_bb(i32 %x, i32 %y) nounwind { +; CHECK: @test_branch_to_same_bb + %cmp = icmp sgt i32 %x, 0 + br i1 %cmp, label %while.cond, label %while.end + +while.cond: + br label %while.cond + +; CHECK: nop +; CHECK: nop +; CHECK: nop +; CHECK: nop +; CHECK: ret +while.end: + ret void +} + diff --git a/test/CodeGen/X86/atomic-dagsched.ll b/test/CodeGen/X86/atomic-dagsched.ll new file mode 100644 index 0000000..00891d6 --- /dev/null +++ b/test/CodeGen/X86/atomic-dagsched.ll @@ -0,0 +1,110 @@ +; RUN: llc < %s -march=x86-64 -mcpu=corei7 | FileCheck %s + +define void @test(i8** %a, i64* %b, i64 %c, i64 %d) nounwind { +entry: + %ptrtoarg4 = load i8** %a, align 8 + %brglist1 = getelementptr i8** %a, i64 1 + %ptrtoarg25 = load i8** %brglist1, align 8 + %0 = load i64* %b, align 8 + %1 = mul i64 %0, 4 + %scevgep = getelementptr i8* %ptrtoarg25, i64 %1 + %2 = mul i64 %d, 4 + br label %loop.cond + +loop.cond: ; preds = %test.exit, %entry + %asr.iv6 = phi i8* [ %29, %test.exit ], [ %scevgep, %entry ] + %iv = phi i64 [ %0, %entry ], [ %28, %test.exit ] + %3 = icmp eq i64 %iv, %c + br i1 %3, label %return, label %loop + +loop: ; preds = %loop.cond + %4 = load i64* addrspace(256)* inttoptr (i64 264 to i64* addrspace(256)*), align 8, !tbaa !0 + %5 = load i64* %4, align 8, !tbaa !3 + %vector.size.i = ashr i64 %5, 3 + %num.vector.wi.i = shl i64 %vector.size.i, 3 + %6 = icmp eq i64 %vector.size.i, 0 + br i1 %6, label %scalarIf.i, label %dim_0_vector_pre_head.i + +dim_0_vector_pre_head.i: ; preds = %loop + %7 = trunc i64 %5 to i32 + %tempvector_func.i = insertelement <8 x i32> undef, i32 %7, i32 0 + %vectorvector_func.i = shufflevector <8 x i32> %tempvector_func.i, <8 x i32> undef, <8 x i32> zeroinitializer + br label %vector_kernel_entry.i + +vector_kernel_entry.i: ; preds = %vector_kernel_entry.i, %dim_0_vector_pre_head.i + %asr.iv9 = phi i8* [ %scevgep10, %vector_kernel_entry.i ], [ %asr.iv6, %dim_0_vector_pre_head.i ] + %asr.iv = phi i64 [ %asr.iv.next, %vector_kernel_entry.i ], [ %vector.size.i, %dim_0_vector_pre_head.i ] + %8 = bitcast i8* %ptrtoarg4 to i32 addrspace(1)* + %asr.iv911 = bitcast i8* %asr.iv9 to <8 x i32> addrspace(1)* + %9 = load <8 x i32> addrspace(1)* %asr.iv911, align 4 + %extract8vector_func.i = extractelement <8 x i32> %9, i32 0 + %extract9vector_func.i = extractelement <8 x i32> 
%9, i32 1 + %extract10vector_func.i = extractelement <8 x i32> %9, i32 2 + %extract11vector_func.i = extractelement <8 x i32> %9, i32 3 + %extract12vector_func.i = extractelement <8 x i32> %9, i32 4 + %extract13vector_func.i = extractelement <8 x i32> %9, i32 5 + %extract14vector_func.i = extractelement <8 x i32> %9, i32 6 + %extract15vector_func.i = extractelement <8 x i32> %9, i32 7 + %10 = atomicrmw min i32 addrspace(1)* %8, i32 %extract8vector_func.i seq_cst + %11 = atomicrmw min i32 addrspace(1)* %8, i32 %extract9vector_func.i seq_cst + %12 = atomicrmw min i32 addrspace(1)* %8, i32 %extract10vector_func.i seq_cst + %13 = atomicrmw min i32 addrspace(1)* %8, i32 %extract11vector_func.i seq_cst + %14 = atomicrmw min i32 addrspace(1)* %8, i32 %extract12vector_func.i seq_cst + %15 = atomicrmw min i32 addrspace(1)* %8, i32 %extract13vector_func.i seq_cst + %16 = atomicrmw min i32 addrspace(1)* %8, i32 %extract14vector_func.i seq_cst + %17 = atomicrmw min i32 addrspace(1)* %8, i32 %extract15vector_func.i seq_cst + store <8 x i32> %vectorvector_func.i, <8 x i32> addrspace(1)* %asr.iv911, align 4 + %asr.iv.next = add i64 %asr.iv, -1 + %scevgep10 = getelementptr i8* %asr.iv9, i64 32 + %dim_0_vector_cmp.to.max.i = icmp eq i64 %asr.iv.next, 0 + br i1 %dim_0_vector_cmp.to.max.i, label %scalarIf.i, label %vector_kernel_entry.i + +scalarIf.i: ; preds = %vector_kernel_entry.i, %loop + %exec_wi.i = phi i64 [ 0, %loop ], [ %num.vector.wi.i, %vector_kernel_entry.i ] + %18 = icmp eq i64 %exec_wi.i, %5 + br i1 %18, label %test.exit, label %dim_0_pre_head.i + +dim_0_pre_head.i: ; preds = %scalarIf.i + %19 = load i64* addrspace(256)* inttoptr (i64 264 to i64* addrspace(256)*), align 8, !tbaa !0 + %20 = load i64* %19, align 8, !tbaa !3 + %21 = trunc i64 %20 to i32 + %22 = mul i64 %vector.size.i, 8 + br label %scalar_kernel_entry.i + +scalar_kernel_entry.i: ; preds = %scalar_kernel_entry.i, %dim_0_pre_head.i + %asr.iv12 = phi i64 [ %asr.iv.next13, %scalar_kernel_entry.i ], [ %22, %dim_0_pre_head.i ] + %23 = bitcast i8* %asr.iv6 to i32 addrspace(1)* + %24 = bitcast i8* %ptrtoarg4 to i32 addrspace(1)* + %scevgep16 = getelementptr i32 addrspace(1)* %23, i64 %asr.iv12 + %25 = load i32 addrspace(1)* %scevgep16, align 4, !tbaa !4 + %26 = atomicrmw min i32 addrspace(1)* %24, i32 %25 seq_cst + %scevgep15 = getelementptr i32 addrspace(1)* %23, i64 %asr.iv12 + store i32 %21, i32 addrspace(1)* %scevgep15, align 4, !tbaa !4 + %asr.iv.next13 = add i64 %asr.iv12, 1 + %dim_0_cmp.to.max.i = icmp eq i64 %5, %asr.iv.next13 + br i1 %dim_0_cmp.to.max.i, label %test.exit, label %scalar_kernel_entry.i + +test.exit: ; preds = %scalar_kernel_entry.i, %scalarIf.i + %27 = bitcast i8* %asr.iv6 to i1* + %28 = add i64 %iv, %d + store i64 %28, i64* %b, align 8 + %scevgep8 = getelementptr i1* %27, i64 %2 + %29 = bitcast i1* %scevgep8 to i8* + br label %loop.cond + +return: ; preds = %loop.cond + store i64 %0, i64* %b, align 8 + ret void +} + +!0 = metadata !{metadata !"any pointer", metadata !1} +!1 = metadata !{metadata !"omnipotent char", metadata !2} +!2 = metadata !{metadata !"Simple C/C++ TBAA"} +!3 = metadata !{metadata !"long", metadata !1} +!4 = metadata !{metadata !"int", metadata !1} + +; CHECK: test +; CHECK: decq +; CHECK-NOT: cmpxchgl +; CHECK: jne +; CHECK: ret diff --git a/test/CodeGen/X86/avx-intel-ocl.ll b/test/CodeGen/X86/avx-intel-ocl.ll index 0fec965..0550720 100644 --- a/test/CodeGen/X86/avx-intel-ocl.ll +++ b/test/CodeGen/X86/avx-intel-ocl.ll @@ -127,3 +127,43 @@ define i32 @test_int(i32 %a, i32 %b) nounwind { %c = 
add i32 %c2, %b ret i32 %c } + +; WIN64: test_float4 +; WIN64-NOT: vzeroupper +; WIN64: call +; WIN64-NOT: vzeroupper +; WIN64: call +; WIN64: ret + +; X64: test_float4 +; X64-NOT: vzeroupper +; X64: call +; X64-NOT: vzeroupper +; X64: call +; X64: ret + +; X32: test_float4 +; X32: vzeroupper +; X32: call +; X32: vzeroupper +; X32: call +; X32: ret + +declare <4 x float> @func_float4(<4 x float>, <4 x float>, <4 x float>) + +define <8 x float> @test_float4(<8 x float> %a, <8 x float> %b, <8 x float> %c) nounwind readnone { +entry: + %0 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %1 = shufflevector <8 x float> %b, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %2 = shufflevector <8 x float> %c, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %call.i = tail call intel_ocl_bicc <4 x float> @func_float4(<4 x float> %0, <4 x float> %1, <4 x float> %2) nounwind + %3 = shufflevector <4 x float> %call.i, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef> + %4 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %5 = shufflevector <8 x float> %b, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %6 = shufflevector <8 x float> %c, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %call.i2 = tail call intel_ocl_bicc <4 x float> @func_float4(<4 x float> %4, <4 x float> %5, <4 x float> %6) nounwind + %7 = shufflevector <4 x float> %call.i2, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef> + %8 = shufflevector <8 x float> %3, <8 x float> %7, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11> + ret <8 x float> %8 +} + diff --git a/test/CodeGen/X86/avx-load-store.ll b/test/CodeGen/X86/avx-load-store.ll index c9fc66a..77a7c4f 100644 --- a/test/CodeGen/X86/avx-load-store.ll +++ b/test/CodeGen/X86/avx-load-store.ll @@ -53,19 +53,24 @@ define void @storev16i16(<16 x i16> %a) nounwind { unreachable } -; CHECK: vmovups %ymm +; CHECK: storev16i16_01 +; CHECK: vextractf128 +; CHECK: vmovaps %xmm define void @storev16i16_01(<16 x i16> %a) nounwind { store <16 x i16> %a, <16 x i16>* undef, align 4 unreachable } +; CHECK: storev32i8 ; CHECK: vmovaps %ymm define void @storev32i8(<32 x i8> %a) nounwind { store <32 x i8> %a, <32 x i8>* undef, align 32 unreachable } -; CHECK: vmovups %ymm +; CHECK: storev32i8_01 +; CHECK: vextractf128 +; CHECK: vmovups %xmm define void @storev32i8_01(<32 x i8> %a) nounwind { store <32 x i8> %a, <32 x i8>* undef, align 4 unreachable @@ -76,7 +81,7 @@ define void @storev32i8_01(<32 x i8> %a) nounwind { ; CHECK: _double_save ; CHECK-NOT: vinsertf128 $1 ; CHECK-NOT: vinsertf128 $0 -; CHECK: vmovaps %xmm +; CHECK: vmovups %xmm ; CHECK: vmovaps %xmm define void @double_save(<4 x i32> %A, <4 x i32> %B, <8 x i32>* %P) nounwind ssp { entry: diff --git a/test/CodeGen/X86/avx-sext.ll b/test/CodeGen/X86/avx-sext.ll index 8d7d79d..7ae0d36 100755 --- a/test/CodeGen/X86/avx-sext.ll +++ b/test/CodeGen/X86/avx-sext.ll @@ -142,3 +142,26 @@ define <8 x i16> @load_sext_test6(<8 x i8> *%ptr) { %Y = sext <8 x i8> %X to <8 x i16> ret <8 x i16>%Y } + +; AVX: sext_4i1_to_4i64 +; AVX: vpslld $31 +; AVX: vpsrad $31 +; AVX: vpmovsxdq +; AVX: vpmovsxdq +; AVX: ret +define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) { + %extmask = sext <4 x i1> %mask to <4 x i64> + ret <4 x i64> %extmask +} + +; AVX: sext_4i8_to_4i64 +; AVX: vpslld $24 +; AVX: vpsrad $24 +; AVX: 
vpmovsxdq +; AVX: vpmovsxdq +; AVX: ret +define <4 x i64> @sext_4i8_to_4i64(<4 x i8> %mask) { + %extmask = sext <4 x i8> %mask to <4 x i64> + ret <4 x i64> %extmask +} + diff --git a/test/CodeGen/X86/avx-shift.ll b/test/CodeGen/X86/avx-shift.ll index 681747b..b0bff45 100644 --- a/test/CodeGen/X86/avx-shift.ll +++ b/test/CodeGen/X86/avx-shift.ll @@ -112,6 +112,16 @@ define <8 x i32> @vshift08(<8 x i32> %a) nounwind { ret <8 x i32> %bitop } +; PR15141 +; CHECK: _vshift13: +; CHECK-NOT: vpsll +; CHECK: vcvttps2dq +; CHECK-NEXT: vpmulld +define <4 x i32> @vshift13(<4 x i32> %in) { + %T = shl <4 x i32> %in, <i32 0, i32 1, i32 2, i32 4> + ret <4 x i32> %T +} + ;;; Uses shifts for sign extension ; CHECK: _sext_v16i16 ; CHECK: vpsllw diff --git a/test/CodeGen/X86/avx-shuffle.ll b/test/CodeGen/X86/avx-shuffle.ll index 65685a3..73faa1f 100644 --- a/test/CodeGen/X86/avx-shuffle.ll +++ b/test/CodeGen/X86/avx-shuffle.ll @@ -98,8 +98,8 @@ define i32 @test10(<4 x i32> %a) nounwind { } define <4 x float> @test11(<4 x float> %a) nounwind { -; check: test11 -; check: vpermilps $27 +; CHECK: test11 +; CHECK: vpshufd $27 %tmp1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> ret <4 x float> %tmp1 } @@ -113,8 +113,8 @@ define <4 x float> @test12(<4 x float>* %a) nounwind { } define <4 x i32> @test13(<4 x i32> %a) nounwind { -; check: test13 -; check: vpshufd $27 +; CHECK: test13 +; CHECK: vpshufd $27 %tmp1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> ret <4 x i32> %tmp1 } diff --git a/test/CodeGen/X86/avx-splat.ll b/test/CodeGen/X86/avx-splat.ll index 67e4b40..5c01c2c 100644 --- a/test/CodeGen/X86/avx-splat.ll +++ b/test/CodeGen/X86/avx-splat.ll @@ -3,8 +3,8 @@ ; CHECK: vpunpcklbw %xmm ; CHECK-NEXT: vpunpckhbw %xmm +; CHECK-NEXT: vpshufd $85 ; CHECK-NEXT: vinsertf128 $1 -; CHECK-NEXT: vpermilps $85 define <32 x i8> @funcA(<32 x i8> %a) nounwind uwtable readnone ssp { entry: %shuffle = shufflevector <32 x i8> %a, <32 x i8> undef, <32 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5> @@ -12,8 +12,8 @@ entry: } ; CHECK: vpunpckhwd %xmm +; CHECK-NEXT: vpshufd $85 ; CHECK-NEXT: vinsertf128 $1 -; CHECK-NEXT: vpermilps $85 define <16 x i16> @funcB(<16 x i16> %a) nounwind uwtable readnone ssp { entry: %shuffle = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5> diff --git a/test/CodeGen/X86/blend-msb.ll b/test/CodeGen/X86/blend-msb.ll index 3444542..e565da7 100644 --- a/test/CodeGen/X86/blend-msb.ll +++ b/test/CodeGen/X86/blend-msb.ll @@ -5,7 +5,8 @@ ; shifting the needed bit to the MSB, and not using shl+sra. ;CHECK: vsel_float -;CHECK: pslld +;CHECK: movl $-2147483648 +;CHECK-NEXT: movd ;CHECK-NEXT: blendvps ;CHECK: ret define <4 x float> @vsel_float(<4 x float> %v1, <4 x float> %v2) { @@ -14,7 +15,8 @@ define <4 x float> @vsel_float(<4 x float> %v1, <4 x float> %v2) { } ;CHECK: vsel_4xi8 -;CHECK: pslld +;CHECK: movl $-2147483648 +;CHECK-NEXT: movd ;CHECK-NEXT: blendvps ;CHECK: ret define <4 x i8> @vsel_4xi8(<4 x i8> %v1, <4 x i8> %v2) { @@ -28,7 +30,7 @@ define <4 x i8> @vsel_4xi8(<4 x i8> %v1, <4 x i8> %v2) { ; reduce the mask in this case. 
;CHECK: vsel_8xi16 ;CHECK: psllw -;CHECK-NOT: psraw +;CHECK: psraw ;CHECK: pblendvb ;CHECK: ret define <8 x i16> @vsel_8xi16(<8 x i16> %v1, <8 x i16> %v2) { diff --git a/test/CodeGen/X86/cas.ll b/test/CodeGen/X86/cas.ll new file mode 100644 index 0000000..c2dd05e --- /dev/null +++ b/test/CodeGen/X86/cas.ll @@ -0,0 +1,73 @@ +; RUN: llc -mtriple=x86_64-pc-linux-gnu %s -o - | FileCheck %s + +; C code this came from +;bool cas(float volatile *p, float *expected, float desired) { +; bool success; +; __asm__ __volatile__("lock; cmpxchg %[desired], %[mem]; " +; "mov %[expected], %[expected_out]; " +; "sete %[success]" +; : [success] "=a" (success), +; [expected_out] "=rm" (*expected) +; : [expected] "a" (*expected), +; [desired] "q" (desired), +; [mem] "m" (*p) +; : "memory", "cc"); +; return success; +;} + +define zeroext i1 @cas(float* %p, float* %expected, float %desired) nounwind { +entry: + %p.addr = alloca float*, align 8 + %expected.addr = alloca float*, align 8 + %desired.addr = alloca float, align 4 + %success = alloca i8, align 1 + store float* %p, float** %p.addr, align 8 + store float* %expected, float** %expected.addr, align 8 + store float %desired, float* %desired.addr, align 4 + %0 = load float** %expected.addr, align 8 + %1 = load float** %expected.addr, align 8 + %2 = load float* %1, align 4 + %3 = load float* %desired.addr, align 4 + %4 = load float** %p.addr, align 8 + %5 = call i8 asm sideeffect "lock; cmpxchg $3, $4; mov $2, $1; sete $0", "={ax},=*rm,{ax},q,*m,~{memory},~{cc},~{dirflag},~{fpsr},~{flags}"(float* %0, float %2, float %3, float* %4) nounwind + store i8 %5, i8* %success, align 1 + %6 = load i8* %success, align 1 + %tobool = trunc i8 %6 to i1 + ret i1 %tobool +} + +; CHECK: @cas +; Make sure we're emitting a move from eax. +; CHECK: #APP +; CHECK-NEXT: lock;{{.*}}mov %eax,{{.*}} +; CHECK-NEXT: #NO_APP + +define zeroext i1 @cas2(i8* %p, i8* %expected, i1 zeroext %desired) nounwind { +entry: + %p.addr = alloca i8*, align 8 + %expected.addr = alloca i8*, align 8 + %desired.addr = alloca i8, align 1 + %success = alloca i8, align 1 + store i8* %p, i8** %p.addr, align 8 + store i8* %expected, i8** %expected.addr, align 8 + %frombool = zext i1 %desired to i8 + store i8 %frombool, i8* %desired.addr, align 1 + %0 = load i8** %expected.addr, align 8 + %1 = load i8** %expected.addr, align 8 + %2 = load i8* %1, align 1 + %tobool = trunc i8 %2 to i1 + %3 = load i8* %desired.addr, align 1 + %tobool1 = trunc i8 %3 to i1 + %4 = load i8** %p.addr, align 8 + %5 = call i8 asm sideeffect "lock; cmpxchg $3, $4; mov $2, $1; sete $0", "={ax},=*rm,{ax},q,*m,~{memory},~{cc},~{dirflag},~{fpsr},~{flags}"(i8* %0, i1 %tobool, i1 %tobool1, i8* %4) nounwind + store i8 %5, i8* %success, align 1 + %6 = load i8* %success, align 1 + %tobool2 = trunc i8 %6 to i1 + ret i1 %tobool2 +} + +; CHECK: @cas2 +; Make sure we're emitting a move from %al here. 
+; CHECK: #APP +; CHECK-NEXT: lock;{{.*}}mov %al,{{.*}} +; CHECK-NEXT: #NO_APP diff --git a/test/CodeGen/X86/coldcc64.ll b/test/CodeGen/X86/coldcc64.ll new file mode 100644 index 0000000..4db56bb --- /dev/null +++ b/test/CodeGen/X86/coldcc64.ll @@ -0,0 +1,24 @@ +; RUN: llc < %s | FileCheck %s + +target triple = "x86_64-linux-gnu" + +define coldcc void @foo() { +; CHECK: pushq %rbp +; CHECK: pushq %r15 +; CHECK: pushq %r14 +; CHECK: pushq %r13 +; CHECK: pushq %r12 +; CHECK: pushq %r11 +; CHECK: pushq %r10 +; CHECK: pushq %r9 +; CHECK: pushq %r8 +; CHECK: pushq %rdi +; CHECK: pushq %rsi +; CHECK: pushq %rdx +; CHECK: pushq %rcx +; CHECK: pushq %rbx +; CHECK: movaps %xmm15 +; CHECK: movaps %xmm0 + call void asm sideeffect "", "~{xmm15},~{xmm0},~{rbp},~{r15},~{r14},~{r13},~{r12},~{r11},~{r10},~{r9},~{r8},~{rdi},~{rsi},~{rdx},~{rcx},~{rbx}"() + ret void +} diff --git a/test/CodeGen/X86/crash.ll b/test/CodeGen/X86/crash.ll index 276d0db..6d21962 100644 --- a/test/CodeGen/X86/crash.ll +++ b/test/CodeGen/X86/crash.ll @@ -431,7 +431,7 @@ return: ; preds = %entry ; uitofp expands to an FCMOV instruction which splits the basic block. ; Make sure the live range of %AL isn't split. @.str = private unnamed_addr constant { [1 x i8], [63 x i8] } zeroinitializer, align 32 -define void @pr13188(i64* nocapture %this) uwtable ssp address_safety align 2 { +define void @pr13188(i64* nocapture %this) uwtable ssp sanitize_address align 2 { entry: %x7 = load i64* %this, align 8 %sub = add i64 %x7, -1 diff --git a/test/CodeGen/X86/dagcombine_unsafe_math.ll b/test/CodeGen/X86/dagcombine_unsafe_math.ll new file mode 100644 index 0000000..a3221de --- /dev/null +++ b/test/CodeGen/X86/dagcombine_unsafe_math.ll @@ -0,0 +1,42 @@ +; RUN: llc < %s -enable-unsafe-fp-math -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck %s + + +; rdar://13126763 +; Expression "x + x*x" was mistakenly transformed into "x * 3.0f". 
+ +define float @test1(float %x) { + %t1 = fmul fast float %x, %x + %t2 = fadd fast float %t1, %x + ret float %t2 +; CHECK: test1 +; CHECK: vaddss +} + +; (x + x) + x => x * 3.0 +define float @test2(float %x) { + %t1 = fadd fast float %x, %x + %t2 = fadd fast float %t1, %x + ret float %t2 +; CHECK: .long 1077936128 +; CHECK: test2 +; CHECK: vmulss LCPI1_0(%rip), %xmm0, %xmm0 +} + +; x + (x + x) => x * 3.0 +define float @test3(float %x) { + %t1 = fadd fast float %x, %x + %t2 = fadd fast float %t1, %x + ret float %t2 +; CHECK: .long 1077936128 +; CHECK: test3 +; CHECK: vmulss LCPI2_0(%rip), %xmm0, %xmm0 +} + +; (y + x) + x != x * 3.0 +define float @test4(float %x, float %y) { + %t1 = fadd fast float %x, %y + %t2 = fadd fast float %t1, %x + ret float %t2 +; CHECK: test4 +; CHECK: vaddss +} diff --git a/test/CodeGen/X86/dbg-declare.ll b/test/CodeGen/X86/dbg-declare.ll index b73e310..9d13de1 100644 --- a/test/CodeGen/X86/dbg-declare.ll +++ b/test/CodeGen/X86/dbg-declare.ll @@ -30,10 +30,8 @@ declare void @llvm.stackrestore(i8*) nounwind !llvm.dbg.cu = !{!0} !0 = metadata !{i32 786449, i32 0, i32 12, metadata !"20020104-2.c", metadata !"/Volumes/Sandbox/llvm", metadata !"clang version 3.1 (trunk 153698)", i1 true, i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1} ; [ DW_TAG_compile_unit ] -!1 = metadata !{metadata !2} -!2 = metadata !{i32 0} -!3 = metadata !{metadata !4} -!4 = metadata !{metadata !5} +!1 = metadata !{i32 0} +!3 = metadata !{metadata !5} !5 = metadata !{i32 786478, i32 0, metadata !6, metadata !"foo", metadata !"foo", metadata !"", metadata !6, i32 6, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (i32*)* @foo, null, null, metadata !12} ; [ DW_TAG_subprogram ] !6 = metadata !{i32 786473, metadata !"20020104-2.c", metadata !"/Volumes/Sandbox/llvm", null} ; [ DW_TAG_file_type ] !7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] diff --git a/test/CodeGen/X86/dbg-subrange.ll b/test/CodeGen/X86/dbg-subrange.ll index 0efb50e..c9be972 100644 --- a/test/CodeGen/X86/dbg-subrange.ll +++ b/test/CodeGen/X86/dbg-subrange.ll @@ -15,18 +15,15 @@ entry: !llvm.dbg.cu = !{!0} !0 = metadata !{i32 720913, i32 0, i32 12, metadata !"small.c", metadata !"/private/tmp", metadata !"clang version 3.1 (trunk 144833)", i1 true, i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !11} ; [ DW_TAG_compile_unit ] -!1 = metadata !{metadata !2} -!2 = metadata !{i32 0} -!3 = metadata !{metadata !4} -!4 = metadata !{metadata !5} +!1 = metadata !{i32 0} +!3 = metadata !{metadata !5} !5 = metadata !{i32 720942, i32 0, metadata !6, metadata !"bar", metadata !"bar", metadata !"", metadata !6, i32 4, metadata !7, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 false, void ()* @bar, null, null, metadata !9} ; [ DW_TAG_subprogram ] !6 = metadata !{i32 720937, metadata !"small.c", metadata !"/private/tmp", null} ; [ DW_TAG_file_type ] !7 = metadata !{i32 720917, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] !8 = metadata !{null} !9 = metadata !{metadata !10} !10 = metadata !{i32 720932} ; [ DW_TAG_base_type ] -!11 = metadata !{metadata !12} -!12 = metadata !{metadata !13} +!11 = metadata !{metadata !13} !13 = metadata !{i32 720948, i32 0, null, metadata !"s", metadata !"s", metadata !"", metadata !6, i32 2, metadata !14, i32 0, i32 1, [4294967296 x i8]* 
@s} ; [ DW_TAG_variable ] !14 = metadata !{i32 720897, null, metadata !"", null, i32 0, i64 34359738368, i64 8, i32 0, i32 0, metadata !15, metadata !16, i32 0, i32 0} ; [ DW_TAG_array_type ] !15 = metadata !{i32 720932, null, metadata !"char", null, i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ] diff --git a/test/CodeGen/X86/divide-by-constant.ll b/test/CodeGen/X86/divide-by-constant.ll index 8e7c13d..9669d97 100644 --- a/test/CodeGen/X86/divide-by-constant.ll +++ b/test/CodeGen/X86/divide-by-constant.ll @@ -56,9 +56,9 @@ entry: %div = sdiv i16 %x, 10 ret i16 %div ; CHECK: test6: -; CHECK: imull $26215, %eax, %eax -; CHECK: shrl $31, %ecx -; CHECK: sarl $18, %eax +; CHECK: imull $26215, %eax, %ecx +; CHECK: sarl $18, %ecx +; CHECK: shrl $15, %eax } define i32 @test7(i32 %x) nounwind { diff --git a/test/CodeGen/X86/dwarf-comp-dir.ll b/test/CodeGen/X86/dwarf-comp-dir.ll index c64752c..63e6167 100644 --- a/test/CodeGen/X86/dwarf-comp-dir.ll +++ b/test/CodeGen/X86/dwarf-comp-dir.ll @@ -1,5 +1,5 @@ ; RUN: llc %s -o %t -filetype=obj -; RUN: llvm-dwarfdump %t | FileCheck %s +; RUN: llvm-dwarfdump -debug-dump=line %t | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/test/CodeGen/X86/fast-isel-args-fail.ll b/test/CodeGen/X86/fast-isel-args-fail.ll new file mode 100644 index 0000000..4995baa --- /dev/null +++ b/test/CodeGen/X86/fast-isel-args-fail.ll @@ -0,0 +1,10 @@ +; RUN: llc < %s -fast-isel -verify-machineinstrs -mtriple=x86_64-apple-darwin10 +; Requires: Asserts + +; Previously, this would cause an assert. +define i31 @t1(i31 %a, i31 %b, i31 %c) { +entry: + %add = add nsw i31 %b, %a + %add1 = add nsw i31 %add, %c + ret i31 %add1 +} diff --git a/test/CodeGen/X86/fast-isel-args.ll b/test/CodeGen/X86/fast-isel-args.ll new file mode 100644 index 0000000..0f36265 --- /dev/null +++ b/test/CodeGen/X86/fast-isel-args.ll @@ -0,0 +1,25 @@ +; RUN: llc < %s -fast-isel -fast-isel-abort -fast-isel-abort-args -verify-machineinstrs -mtriple=x86_64-apple-darwin10 + +; Just make sure these don't abort when lowering the arguments. +define i32 @t1(i32 %a, i32 %b, i32 %c) { +entry: + %add = add nsw i32 %b, %a + %add1 = add nsw i32 %add, %c + ret i32 %add1 +} + +define i64 @t2(i64 %a, i64 %b, i64 %c) { +entry: + %add = add nsw i64 %b, %a + %add1 = add nsw i64 %add, %c + ret i64 %add1 +} + +define i64 @t3(i32 %a, i64 %b, i32 %c) { +entry: + %conv = sext i32 %a to i64 + %add = add nsw i64 %conv, %b + %conv1 = sext i32 %c to i64 + %add2 = add nsw i64 %add, %conv1 + ret i64 %add2 +} diff --git a/test/CodeGen/X86/fast-isel-constant.ll b/test/CodeGen/X86/fast-isel-constant.ll new file mode 100644 index 0000000..6f9240a --- /dev/null +++ b/test/CodeGen/X86/fast-isel-constant.ll @@ -0,0 +1,24 @@ +; RUN: llc < %s -march=x86-64 -O0 | FileCheck %s +; Make sure fast-isel doesn't reset the materialised constant map +; across an intrinsic call. 
+ +; CHECK: movl $100000 +; CHECK-NOT: movl $100000 +define i1 @test1(i32 %v1, i32 %v2, i32* %X) nounwind { +entry: + %a = shl i32 100000, %v1 + %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %a, i32 %v2) + %ext = extractvalue {i32, i1} %t, 0 + %sum = shl i32 100000, %ext + %obit = extractvalue {i32, i1} %t, 1 + br i1 %obit, label %overflow, label %normal + +normal: + store i32 %sum, i32* %X + br label %overflow + +overflow: + ret i1 false +} + +declare {i32, i1} @llvm.sadd.with.overflow.i32(i32, i32) diff --git a/test/CodeGen/X86/float-asmprint.ll b/test/CodeGen/X86/float-asmprint.ll new file mode 100644 index 0000000..4aeae7f --- /dev/null +++ b/test/CodeGen/X86/float-asmprint.ll @@ -0,0 +1,40 @@ +; RUN: llc -mtriple=x86_64-none-linux < %s | FileCheck %s + +; Check that all current floating-point types are correctly emitted to assembly +; on a little-endian target. + +@var128 = global fp128 0xL00000000000000008000000000000000, align 16 +@varppc128 = global ppc_fp128 0xM80000000000000000000000000000000, align 16 +@var80 = global x86_fp80 0xK80000000000000000000, align 16 +@var64 = global double -0.0, align 8 +@var32 = global float -0.0, align 4 +@var16 = global half -0.0, align 2 + +; CHECK: var128: +; CHECK-NEXT: .quad 0 # fp128 -0 +; CHECK-NEXT: .quad -9223372036854775808 +; CHECK-NEXT: .size + +; CHECK: varppc128: +; CHECK-NEXT: .quad 0 # ppc_fp128 -0 +; CHECK-NEXT: .quad -9223372036854775808 +; CHECK-NEXT: .size + +; CHECK: var80: +; CHECK-NEXT: .quad 0 # x86_fp80 -0 +; CHECK-NEXT: .short 32768 +; CHECK-NEXT: .zero 6 +; CHECK-NEXT: .size + +; CHECK: var64: +; CHECK-NEXT: .quad -9223372036854775808 # double -0 +; CHECK-NEXT: .size + +; CHECK: var32: +; CHECK-NEXT: .long 2147483648 # float -0 +; CHECK-NEXT: .size + +; CHECK: var16: +; CHECK-NEXT: .short 32768 # half -0 +; CHECK-NEXT: .size + diff --git a/test/CodeGen/X86/fp-load-trunc.ll b/test/CodeGen/X86/fp-load-trunc.ll index 2ae65c9..a973bef 100644 --- a/test/CodeGen/X86/fp-load-trunc.ll +++ b/test/CodeGen/X86/fp-load-trunc.ll @@ -49,8 +49,8 @@ define <8 x float> @test4(<8 x double>* %p) nounwind { ; CHECK: movlhps ; CHECK: ret ; AVX: test4 -; AVX: vcvtpd2psy {{[0-9]*}}(%{{.*}}) -; AVX: vcvtpd2psy {{[0-9]*}}(%{{.*}}) +; AVX: vcvtpd2psy +; AVX: vcvtpd2psy ; AVX: vinsertf128 ; AVX: ret %x = load <8 x double>* %p diff --git a/test/CodeGen/X86/handle-move.ll b/test/CodeGen/X86/handle-move.ll index e9f7a96..ba96275 100644 --- a/test/CodeGen/X86/handle-move.ll +++ b/test/CodeGen/X86/handle-move.ll @@ -16,7 +16,7 @@ ; DL: [0B,16r:0)[128r,144r:2)[144r,144d:1) 0@0B-phi 1@144r 2@128r ; --> [0B,16r:0)[128r,180r:2)[180r,180d:1) 0@0B-phi 1@180r 2@128r ; -define i32 @f1(i32 %a, i32 %b, i32 %c, i32 %d) nounwind uwtable readnone ssp { +define i32 @f1(i32 %a, i32 %b, i32 %c) nounwind uwtable readnone ssp { entry: %y = add i32 %c, 1 %x = udiv i32 %b, %a @@ -50,7 +50,7 @@ entry: ; %vreg5: [16r,112r:0) 0@16r ; --> [16r,120r:0) 0@16r ; -define i32 @f3(i32 %a, i32 %b, i32 %c, i32 %d) nounwind uwtable readnone ssp { +define i32 @f3(i32 %a, i32 %b) nounwind uwtable readnone ssp { entry: %y = sub i32 %a, %b %x = add i32 %a, %b diff --git a/test/CodeGen/X86/hipe-prologue.ll b/test/CodeGen/X86/hipe-prologue.ll new file mode 100644 index 0000000..ff3c5c8 --- /dev/null +++ b/test/CodeGen/X86/hipe-prologue.ll @@ -0,0 +1,67 @@ +; RUN: llc < %s -mcpu=generic -mtriple=i686-linux -verify-machineinstrs | FileCheck %s -check-prefix=X32-Linux +; RUN: llc < %s -mtriple=x86_64-linux-gnu -verify-machineinstrs | FileCheck %s -check-prefix=X64-Linux + +; The HiPE 
compiler (i.e., the native code compiler of the Erlang/OTP system) +; adds a custom assembly prologue in order to efficiently manipulate the stack +; at runtime. + +; Just to prevent the alloca from being optimized away. +declare void @dummy_use(i32*, i32) + +define {i32, i32} @test_basic(i32 %hp, i32 %p) { + ; X32-Linux: test_basic: + ; X32-Linux-NOT: calll inc_stack_0 + + ; X64-Linux: test_basic: + ; X64-Linux-NOT: callq inc_stack_0 + + %mem = alloca i32, i32 10 + call void @dummy_use (i32* %mem, i32 10) + %1 = insertvalue {i32, i32} undef, i32 %hp, 0 + %2 = insertvalue {i32, i32} %1, i32 %p, 1 + ret {i32, i32} %1 +} + +define cc 11 {i32, i32} @test_basic_hipecc(i32 %hp, i32 %p) { + ; X32-Linux: test_basic_hipecc: + ; X32-Linux: leal -156(%esp), %ebx + ; X32-Linux-NEXT: cmpl 76(%ebp), %ebx + ; X32-Linux-NEXT: jb .LBB1_1 + + ; X32-Linux: ret + + ; X32-Linux: .LBB1_1: + ; X32-Linux-NEXT: calll inc_stack_0 + + ; X64-Linux: test_basic_hipecc: + ; X64-Linux: leaq -232(%rsp), %r14 + ; X64-Linux-NEXT: cmpq 144(%rbp), %r14 + ; X64-Linux-NEXT: jb .LBB1_1 + + ; X64-Linux: ret + + ; X64-Linux: .LBB1_1: + ; X64-Linux-NEXT: callq inc_stack_0 + + %mem = alloca i32, i32 10 + call void @dummy_use (i32* %mem, i32 10) + %1 = insertvalue {i32, i32} undef, i32 %hp, 0 + %2 = insertvalue {i32, i32} %1, i32 %p, 1 + ret {i32, i32} %2 +} + +define cc 11 {i32,i32,i32} @test_nocall_hipecc(i32 %hp,i32 %p,i32 %x,i32 %y) { + ; X32-Linux: test_nocall_hipecc: + ; X32-Linux-NOT: calll inc_stack_0 + + ; X64-Linux: test_nocall_hipecc: + ; X64-Linux-NOT: callq inc_stack_0 + + %1 = add i32 %x, %y + %2 = mul i32 42, %1 + %3 = sub i32 24, %2 + %4 = insertvalue {i32, i32, i32} undef, i32 %hp, 0 + %5 = insertvalue {i32, i32, i32} %4, i32 %p, 1 + %6 = insertvalue {i32, i32, i32} %5, i32 %p, 2 + ret {i32, i32, i32} %6 +} diff --git a/test/CodeGen/X86/imul-lea-2.ll b/test/CodeGen/X86/imul-lea-2.ll index 1cb54b3..7b79d06 100644 --- a/test/CodeGen/X86/imul-lea-2.ll +++ b/test/CodeGen/X86/imul-lea-2.ll @@ -1,15 +1,19 @@ -; RUN: llc < %s -march=x86-64 | grep lea | count 3 -; RUN: llc < %s -march=x86-64 | grep shl | count 1 -; RUN: llc < %s -march=x86-64 | not grep imul +; RUN: llc < %s -march=x86-64 | FileCheck %s + +; CHECK-NOT: imul define i64 @t1(i64 %a) nounwind readnone { entry: - %0 = mul i64 %a, 81 ; <i64> [#uses=1] - ret i64 %0 + %0 = mul i64 %a, 81 +; CHECK: lea +; CHECK: lea + ret i64 %0 } define i64 @t2(i64 %a) nounwind readnone { entry: - %0 = mul i64 %a, 40 ; <i64> [#uses=1] - ret i64 %0 + %0 = mul i64 %a, 40 +; CHECK: shl +; CHECK: lea + ret i64 %0 } diff --git a/test/CodeGen/X86/imul-lea.ll b/test/CodeGen/X86/imul-lea.ll index 4e8e2af..d55ece7 100644 --- a/test/CodeGen/X86/imul-lea.ll +++ b/test/CodeGen/X86/imul-lea.ll @@ -1,10 +1,12 @@ -; RUN: llc < %s -march=x86 | grep lea +; RUN: llc < %s -march=x86 | FileCheck %s declare i32 @foo() define i32 @test() { - %tmp.0 = tail call i32 @foo( ) ; <i32> [#uses=1] - %tmp.1 = mul i32 %tmp.0, 9 ; <i32> [#uses=1] - ret i32 %tmp.1 + %tmp.0 = tail call i32 @foo( ) + %tmp.1 = mul i32 %tmp.0, 9 +; CHECK-NOT: mul +; CHECK: lea + ret i32 %tmp.1 } diff --git a/test/CodeGen/X86/imul64-lea.ll b/test/CodeGen/X86/imul64-lea.ll new file mode 100644 index 0000000..047c129 --- /dev/null +++ b/test/CodeGen/X86/imul64-lea.ll @@ -0,0 +1,25 @@ +; RUN: llc < %s -mtriple=x86_64-pc-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-pc-linux-gnux32 | FileCheck %s + +; Test that 64-bit LEAs are generated for both LP64 and ILP32 in 64-bit mode. 
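; A multiply by 9 decomposes as x + 8*x, which one scaled-index lea can
; express, e.g. `leaq (%rax,%rax,8), %rax` (rax = rax + 8*rax), so no imul
; is needed. gnux32 is the ILP32 variant of x86-64; the point of running
; both triples is that i64 math still gets leaq and i32 math gets leal.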
+declare i64 @foo64() + +define i64 @test64() { + %tmp.0 = tail call i64 @foo64( ) + %tmp.1 = mul i64 %tmp.0, 9 +; CHECK-NOT: mul +; CHECK: leaq + ret i64 %tmp.1 +} + +; Test that 32-bit LEAs are generated for both LP64 and ILP32 in 64-bit mode. +declare i32 @foo32() + +define i32 @test32() { + %tmp.0 = tail call i32 @foo32( ) + %tmp.1 = mul i32 %tmp.0, 9 +; CHECK-NOT: mul +; CHECK: leal + ret i32 %tmp.1 +} + diff --git a/test/CodeGen/X86/insertelement-copytoregs.ll b/test/CodeGen/X86/insertelement-copytoregs.ll index 34a29ca..88ff4da 100644 --- a/test/CodeGen/X86/insertelement-copytoregs.ll +++ b/test/CodeGen/X86/insertelement-copytoregs.ll @@ -1,4 +1,5 @@ -; RUN: llc < %s -march=x86-64 | grep -v IMPLICIT_DEF +; RUN: llc < %s -march=x86-64 | FileCheck %s +; CHECK-NOT: IMPLICIT_DEF define void @foo(<2 x float>* %p) { %t = insertelement <2 x float> undef, float 0.0, i32 0 diff --git a/test/CodeGen/X86/lea-2.ll b/test/CodeGen/X86/lea-2.ll index 43f69b0..2112809 100644 --- a/test/CodeGen/X86/lea-2.ll +++ b/test/CodeGen/X86/lea-2.ll @@ -1,13 +1,15 @@ -; RUN: llc < %s -march=x86 -x86-asm-syntax=intel | \ -; RUN: grep "lea EAX, DWORD PTR \[... + 4\*... - 5\]" -; RUN: llc < %s -march=x86 -x86-asm-syntax=intel | \ -; RUN: not grep add +; RUN: llc < %s -march=x86 -x86-asm-syntax=intel | FileCheck %s define i32 @test1(i32 %A, i32 %B) { - %tmp1 = shl i32 %A, 2 ; <i32> [#uses=1] - %tmp3 = add i32 %B, -5 ; <i32> [#uses=1] - %tmp4 = add i32 %tmp3, %tmp1 ; <i32> [#uses=1] - ret i32 %tmp4 + %tmp1 = shl i32 %A, 2 + %tmp3 = add i32 %B, -5 + %tmp4 = add i32 %tmp3, %tmp1 +; The above computation of %tmp4 should match a single lea, without using +; actual add instructions. +; CHECK-NOT: add +; CHECK: lea {{[A-Z]+}}, DWORD PTR [{{[A-Z]+}} + 4*{{[A-Z]+}} - 5] + + ret i32 %tmp4 } diff --git a/test/CodeGen/X86/lea-4.ll b/test/CodeGen/X86/lea-4.ll index 2171204..cef4726 100644 --- a/test/CodeGen/X86/lea-4.ll +++ b/test/CodeGen/X86/lea-4.ll @@ -1,19 +1,21 @@ -; RUN: llc < %s -march=x86-64 | grep lea | count 2 +; RUN: llc < %s -march=x86-64 | FileCheck %s define zeroext i16 @t1(i32 %on_off) nounwind { entry: - %0 = sub i32 %on_off, 1 - %1 = mul i32 %0, 2 - %2 = trunc i32 %1 to i16 - %3 = zext i16 %2 to i32 - %4 = trunc i32 %3 to i16 - ret i16 %4 + %0 = sub i32 %on_off, 1 + %1 = mul i32 %0, 2 + %2 = trunc i32 %1 to i16 + %3 = zext i16 %2 to i32 + %4 = trunc i32 %3 to i16 +; CHECK: lea + ret i16 %4 } define i32 @t2(i32 %on_off) nounwind { entry: - %0 = sub i32 %on_off, 1 - %1 = mul i32 %0, 2 - %2 = and i32 %1, 65535 - ret i32 %2 + %0 = sub i32 %on_off, 1 + %1 = mul i32 %0, 2 + %2 = and i32 %1, 65535 +; CHECK: lea + ret i32 %2 } diff --git a/test/CodeGen/X86/legalize-shift-64.ll b/test/CodeGen/X86/legalize-shift-64.ll index c9f2fc2..71ef2d3 100644 --- a/test/CodeGen/X86/legalize-shift-64.ll +++ b/test/CodeGen/X86/legalize-shift-64.ll @@ -54,3 +54,14 @@ define i64 @test4(i64 %xx, i32 %test) nounwind { ; CHECK: orl %esi, %eax ; CHECK: sarl %cl, %edx } + +; PR14668 +define <2 x i64> @test5(<2 x i64> %A, <2 x i64> %B) { + %shl = shl <2 x i64> %A, %B + ret <2 x i64> %shl +; CHECK: test5 +; CHECK: shl +; CHECK: shldl +; CHECK: shl +; CHECK: shldl +} diff --git a/test/CodeGen/X86/memcpy-2.ll b/test/CodeGen/X86/memcpy-2.ll index 949d6a4..630c0ed 100644 --- a/test/CodeGen/X86/memcpy-2.ll +++ b/test/CodeGen/X86/memcpy-2.ll @@ -1,5 +1,5 @@ -; RUN: llc < %s -mattr=+sse2 -mtriple=i686-apple-darwin -mcpu=core2 | FileCheck %s -check-prefix=SSE2 -; RUN: llc < %s -mattr=+sse2 -mtriple=i686-pc-mingw32 -mcpu=core2 | FileCheck %s 
-check-prefix=SSE2 +; RUN: llc < %s -mattr=+sse2 -mtriple=i686-apple-darwin -mcpu=core2 | FileCheck %s -check-prefix=SSE2-Darwin +; RUN: llc < %s -mattr=+sse2 -mtriple=i686-pc-mingw32 -mcpu=core2 | FileCheck %s -check-prefix=SSE2-Mingw32 ; RUN: llc < %s -mattr=+sse,-sse2 -mtriple=i686-apple-darwin -mcpu=core2 | FileCheck %s -check-prefix=SSE1 ; RUN: llc < %s -mattr=-sse -mtriple=i686-apple-darwin -mcpu=core2 | FileCheck %s -check-prefix=NOSSE ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core2 | FileCheck %s -check-prefix=X86-64 @@ -9,12 +9,19 @@ define void @t1(i32 %argc, i8** %argv) nounwind { entry: -; SSE2: t1: -; SSE2: movsd _.str+16, %xmm0 -; SSE2: movsd %xmm0, 16(%esp) -; SSE2: movaps _.str, %xmm0 -; SSE2: movaps %xmm0 -; SSE2: movb $0, 24(%esp) +; SSE2-Darwin: t1: +; SSE2-Darwin: movsd _.str+16, %xmm0 +; SSE2-Darwin: movsd %xmm0, 16(%esp) +; SSE2-Darwin: movaps _.str, %xmm0 +; SSE2-Darwin: movaps %xmm0 +; SSE2-Darwin: movb $0, 24(%esp) + +; SSE2-Mingw32: t1: +; SSE2-Mingw32: movsd _.str+16, %xmm0 +; SSE2-Mingw32: movsd %xmm0, 16(%esp) +; SSE2-Mingw32: movaps _.str, %xmm0 +; SSE2-Mingw32: movups %xmm0 +; SSE2-Mingw32: movb $0, 24(%esp) ; SSE1: t1: ; SSE1: movaps _.str, %xmm0 @@ -48,9 +55,13 @@ entry: define void @t2(%struct.s0* nocapture %a, %struct.s0* nocapture %b) nounwind ssp { entry: -; SSE2: t2: -; SSE2: movaps (%eax), %xmm0 -; SSE2: movaps %xmm0, (%eax) +; SSE2-Darwin: t2: +; SSE2-Darwin: movaps (%eax), %xmm0 +; SSE2-Darwin: movaps %xmm0, (%eax) + +; SSE2-Mingw32: t2: +; SSE2-Mingw32: movaps (%eax), %xmm0 +; SSE2-Mingw32: movaps %xmm0, (%eax) ; SSE1: t2: ; SSE1: movaps (%eax), %xmm0 @@ -79,11 +90,17 @@ entry: define void @t3(%struct.s0* nocapture %a, %struct.s0* nocapture %b) nounwind ssp { entry: -; SSE2: t3: -; SSE2: movsd (%eax), %xmm0 -; SSE2: movsd 8(%eax), %xmm1 -; SSE2: movsd %xmm1, 8(%eax) -; SSE2: movsd %xmm0, (%eax) +; SSE2-Darwin: t3: +; SSE2-Darwin: movsd (%eax), %xmm0 +; SSE2-Darwin: movsd 8(%eax), %xmm1 +; SSE2-Darwin: movsd %xmm1, 8(%eax) +; SSE2-Darwin: movsd %xmm0, (%eax) + +; SSE2-Mingw32: t3: +; SSE2-Mingw32: movsd (%eax), %xmm0 +; SSE2-Mingw32: movsd 8(%eax), %xmm1 +; SSE2-Mingw32: movsd %xmm1, 8(%eax) +; SSE2-Mingw32: movsd %xmm0, (%eax) ; SSE1: t3: ; SSE1: movl @@ -122,15 +139,25 @@ entry: define void @t4() nounwind { entry: -; SSE2: t4: -; SSE2: movw $120 -; SSE2: movl $2021161080 -; SSE2: movl $2021161080 -; SSE2: movl $2021161080 -; SSE2: movl $2021161080 -; SSE2: movl $2021161080 -; SSE2: movl $2021161080 -; SSE2: movl $2021161080 +; SSE2-Darwin: t4: +; SSE2-Darwin: movw $120 +; SSE2-Darwin: movl $2021161080 +; SSE2-Darwin: movl $2021161080 +; SSE2-Darwin: movl $2021161080 +; SSE2-Darwin: movl $2021161080 +; SSE2-Darwin: movl $2021161080 +; SSE2-Darwin: movl $2021161080 +; SSE2-Darwin: movl $2021161080 + +; SSE2-Mingw32: t4: +; SSE2-Mingw32: movw $120 +; SSE2-Mingw32: movl $2021161080 +; SSE2-Mingw32: movl $2021161080 +; SSE2-Mingw32: movl $2021161080 +; SSE2-Mingw32: movl $2021161080 +; SSE2-Mingw32: movl $2021161080 +; SSE2-Mingw32: movl $2021161080 +; SSE2-Mingw32: movl $2021161080 ; SSE1: t4: ; SSE1: movw $120 diff --git a/test/CodeGen/X86/memcpy.ll b/test/CodeGen/X86/memcpy.ll index 39c7fba..3372a4a 100644 --- a/test/CodeGen/X86/memcpy.ll +++ b/test/CodeGen/X86/memcpy.ll @@ -87,8 +87,34 @@ entry: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([30 x i8]* @.str, i64 0, i64 0), i64 16, i32 1, i1 false) ret void +; DARWIN: test5: ; DARWIN: movabsq $7016996765293437281 ; DARWIN: movabsq $7016996765293437184 } +; 
PR14896 +@.str2 = private unnamed_addr constant [2 x i8] c"x\00", align 1 + +define void @test6() nounwind uwtable { +entry: +; DARWIN: test6 +; DARWIN: movw $0, 8 +; DARWIN: movq $120, 0 + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* null, i8* getelementptr inbounds ([2 x i8]* @.str2, i64 0, i64 0), i64 10, i32 1, i1 false) + ret void +} + +define void @PR15348(i8* %a, i8* %b) { +; Ensure that alignment of '0' in an @llvm.memcpy intrinsic results in +; unaligned loads and stores. +; LINUX: PR15348 +; LINUX: movb +; LINUX: movb +; LINUX: movq +; LINUX: movq +; LINUX: movq +; LINUX: movq + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a, i8* %b, i64 17, i32 0, i1 false) + ret void +} diff --git a/test/CodeGen/X86/memset.ll b/test/CodeGen/X86/memset.ll index b35f261..0d479f0 100644 --- a/test/CodeGen/X86/memset.ll +++ b/test/CodeGen/X86/memset.ll @@ -20,15 +20,18 @@ entry: ; X86: movl $0, ; X86: movl $0, ; X86-NOT: movl $0, +; X86: ret ; XMM: xorps %xmm{{[0-9]+}}, [[Z:%xmm[0-9]+]] ; XMM: movaps [[Z]], ; XMM: movaps [[Z]], ; XMM-NOT: movaps +; XMM: ret ; YMM: vxorps %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, [[Z:%ymm[0-9]+]] ; YMM: vmovaps [[Z]], ; YMM-NOT: movaps +; YMM: ret call void @foo( %struct.x* %up_mvd116 ) nounwind ret void @@ -37,3 +40,16 @@ entry: declare void @foo(%struct.x*) declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind + +define void @PR15348(i8* %a) { +; Ensure that alignment of '0' in an @llvm.memset intrinsic results in +; unaligned loads and stores. +; XMM: PR15348 +; XMM: movb $0, +; XMM: movl $0, +; XMM: movl $0, +; XMM: movl $0, +; XMM: movl $0, + call void @llvm.memset.p0i8.i64(i8* %a, i8 0, i64 17, i32 0, i1 false) + ret void +} diff --git a/test/CodeGen/X86/ms-inline-asm.ll b/test/CodeGen/X86/ms-inline-asm.ll index 24d28ad..5048a93 100644 --- a/test/CodeGen/X86/ms-inline-asm.ll +++ b/test/CodeGen/X86/ms-inline-asm.ll @@ -1,10 +1,11 @@ -; RUN: llc < %s -march=x86 | FileCheck %s +; RUN: llc < %s -march=x86 -mcpu=core2 | FileCheck %s define i32 @t1() nounwind { entry: %0 = tail call i32 asm sideeffect inteldialect "mov eax, $1\0A\09mov $0, eax", "=r,r,~{eax},~{dirflag},~{fpsr},~{flags}"(i32 1) nounwind ret i32 %0 ; CHECK: t1 +; CHECK: movl %esp, %ebp ; CHECK: {{## InlineAsm Start|#APP}} ; CHECK: .intel_syntax ; CHECK: mov eax, ecx @@ -18,6 +19,7 @@ entry: call void asm sideeffect inteldialect "mov eax, $$1", "~{eax},~{dirflag},~{fpsr},~{flags}"() nounwind ret void ; CHECK: t2 +; CHECK: movl %esp, %ebp ; CHECK: {{## InlineAsm Start|#APP}} ; CHECK: .intel_syntax ; CHECK: mov eax, 1 @@ -32,6 +34,7 @@ entry: call void asm sideeffect inteldialect "mov eax, DWORD PTR [$0]", "*m,~{eax},~{dirflag},~{fpsr},~{flags}"(i32* %V.addr) nounwind ret void ; CHECK: t3 +; CHECK: movl %esp, %ebp ; CHECK: {{## InlineAsm Start|#APP}} ; CHECK: .intel_syntax ; CHECK: mov eax, DWORD PTR {{[[esp]}} @@ -53,6 +56,7 @@ entry: %0 = load i32* %b1, align 4 ret i32 %0 ; CHECK: t18 +; CHECK: movl %esp, %ebp ; CHECK: {{## InlineAsm Start|#APP}} ; CHECK: .intel_syntax ; CHECK: lea ebx, foo @@ -61,3 +65,46 @@ entry: ; CHECK: .att_syntax ; CHECK: {{## InlineAsm End|#NO_APP}} } + +define void @t19_helper() nounwind { +entry: + ret void +} + +define void @t19() nounwind { +entry: + call void asm sideeffect inteldialect "call $0", "r,~{dirflag},~{fpsr},~{flags}"(void ()* @t19_helper) nounwind + ret void +; CHECK: t19: +; CHECK: movl %esp, %ebp +; CHECK: movl ${{_?}}t19_helper, %eax +; CHECK: {{## InlineAsm Start|#APP}} +; CHECK: .intel_syntax +; CHECK: call eax +; CHECK: .att_syntax +; CHECK: 
{{## InlineAsm End|#NO_APP}} +} + +@results = global [2 x i32] [i32 3, i32 2], align 4 + +define i32* @t30() nounwind ssp { +entry: + %res = alloca i32*, align 4 + call void asm sideeffect inteldialect "lea edi, dword ptr $0", "*m,~{edi},~{dirflag},~{fpsr},~{flags}"([2 x i32]* @results) nounwind + call void asm sideeffect inteldialect "mov dword ptr $0, edi", "=*m,~{dirflag},~{fpsr},~{flags}"(i32** %res) nounwind + %0 = load i32** %res, align 4 + ret i32* %0 +; CHECK: t30: +; CHECK: movl %esp, %ebp +; CHECK: {{## InlineAsm Start|#APP}} +; CHECK: .intel_syntax +; CHECK: lea edi, dword ptr [{{_?}}results] +; CHECK: .att_syntax +; CHECK: {{## InlineAsm End|#NO_APP}} +; CHECK: {{## InlineAsm Start|#APP}} +; CHECK: .intel_syntax +; CHECK: mov dword ptr [esi], edi +; CHECK: .att_syntax +; CHECK: {{## InlineAsm End|#NO_APP}} +; CHECK: movl (%esi), %eax +} diff --git a/test/CodeGen/X86/no-cmov.ll b/test/CodeGen/X86/no-cmov.ll new file mode 100644 index 0000000..62d73b0 --- /dev/null +++ b/test/CodeGen/X86/no-cmov.ll @@ -0,0 +1,11 @@ +; RUN: llc -march=x86 -mcpu=i486 < %s | FileCheck %s + +define i32 @test1(i32 %g, i32* %j) { + %tobool = icmp eq i32 %g, 0 + %cmp = load i32* %j, align 4 + %retval.0 = select i1 %tobool, i32 1, i32 %cmp + ret i32 %retval.0 + +; CHECK: test1: +; CHECK-NOT: cmov +} diff --git a/test/CodeGen/X86/pmovsx-inreg.ll b/test/CodeGen/X86/pmovsx-inreg.ll new file mode 100644 index 0000000..d8c27f2 --- /dev/null +++ b/test/CodeGen/X86/pmovsx-inreg.ll @@ -0,0 +1,176 @@ +; RUN: llc < %s -march=x86-64 -mcpu=penryn | FileCheck -check-prefix=SSE41 %s +; RUN: llc < %s -march=x86-64 -mcpu=corei7-avx | FileCheck -check-prefix=AVX1 %s +; RUN: llc < %s -march=x86-64 -mcpu=core-avx2 | FileCheck -check-prefix=AVX2 %s + +; PR14887 +; These tests inject a store into the chain to test the inreg versions of pmovsx + +define void @test1(<2 x i8>* %in, <2 x i64>* %out) nounwind { + %wide.load35 = load <2 x i8>* %in, align 1 + %sext = sext <2 x i8> %wide.load35 to <2 x i64> + store <2 x i64> zeroinitializer, <2 x i64>* undef, align 8 + store <2 x i64> %sext, <2 x i64>* %out, align 8 + ret void + +; SSE41: test1: +; SSE41: pmovsxbq + +; AVX1: test1: +; AVX1: vpmovsxbq + +; AVX2: test1: +; AVX2: vpmovsxbq +} + +define void @test2(<4 x i8>* %in, <4 x i64>* %out) nounwind { + %wide.load35 = load <4 x i8>* %in, align 1 + %sext = sext <4 x i8> %wide.load35 to <4 x i64> + store <4 x i64> zeroinitializer, <4 x i64>* undef, align 8 + store <4 x i64> %sext, <4 x i64>* %out, align 8 + ret void + +; AVX2: test2: +; AVX2: vpmovsxbq +} + +define void @test3(<4 x i8>* %in, <4 x i32>* %out) nounwind { + %wide.load35 = load <4 x i8>* %in, align 1 + %sext = sext <4 x i8> %wide.load35 to <4 x i32> + store <4 x i32> zeroinitializer, <4 x i32>* undef, align 8 + store <4 x i32> %sext, <4 x i32>* %out, align 8 + ret void + +; SSE41: test3: +; SSE41: pmovsxbd + +; AVX1: test3: +; AVX1: vpmovsxbd + +; AVX2: test3: +; AVX2: vpmovsxbd +} + +define void @test4(<8 x i8>* %in, <8 x i32>* %out) nounwind { + %wide.load35 = load <8 x i8>* %in, align 1 + %sext = sext <8 x i8> %wide.load35 to <8 x i32> + store <8 x i32> zeroinitializer, <8 x i32>* undef, align 8 + store <8 x i32> %sext, <8 x i32>* %out, align 8 + ret void + +; AVX2: test4: +; AVX2: vpmovsxbd +} + +define void @test5(<8 x i8>* %in, <8 x i16>* %out) nounwind { + %wide.load35 = load <8 x i8>* %in, align 1 + %sext = sext <8 x i8> %wide.load35 to <8 x i16> + store <8 x i16> zeroinitializer, <8 x i16>* undef, align 8 + store <8 x i16> %sext, <8 x i16>* %out, align 8 + ret 
void + +; SSE41: test5: +; SSE41: pmovsxbw + +; AVX1: test5: +; AVX1: vpmovsxbw + +; AVX2: test5: +; AVX2: vpmovsxbw +} + +define void @test6(<16 x i8>* %in, <16 x i16>* %out) nounwind { + %wide.load35 = load <16 x i8>* %in, align 1 + %sext = sext <16 x i8> %wide.load35 to <16 x i16> + store <16 x i16> zeroinitializer, <16 x i16>* undef, align 8 + store <16 x i16> %sext, <16 x i16>* %out, align 8 + ret void + +; AVX2: test6: +; FIXME: v16i8 -> v16i16 is scalarized. +; AVX2-NOT: pmovsx +} + +define void @test7(<2 x i16>* %in, <2 x i64>* %out) nounwind { + %wide.load35 = load <2 x i16>* %in, align 1 + %sext = sext <2 x i16> %wide.load35 to <2 x i64> + store <2 x i64> zeroinitializer, <2 x i64>* undef, align 8 + store <2 x i64> %sext, <2 x i64>* %out, align 8 + ret void + + +; SSE41: test7: +; SSE41: pmovsxwq + +; AVX1: test7: +; AVX1: vpmovsxwq + +; AVX2: test7: +; AVX2: vpmovsxwq +} + +define void @test8(<4 x i16>* %in, <4 x i64>* %out) nounwind { + %wide.load35 = load <4 x i16>* %in, align 1 + %sext = sext <4 x i16> %wide.load35 to <4 x i64> + store <4 x i64> zeroinitializer, <4 x i64>* undef, align 8 + store <4 x i64> %sext, <4 x i64>* %out, align 8 + ret void + +; AVX2: test8: +; AVX2: vpmovsxwq +} + +define void @test9(<4 x i16>* %in, <4 x i32>* %out) nounwind { + %wide.load35 = load <4 x i16>* %in, align 1 + %sext = sext <4 x i16> %wide.load35 to <4 x i32> + store <4 x i32> zeroinitializer, <4 x i32>* undef, align 8 + store <4 x i32> %sext, <4 x i32>* %out, align 8 + ret void + +; SSE41: test9: +; SSE41: pmovsxwd + +; AVX1: test9: +; AVX1: vpmovsxwd + +; AVX2: test9: +; AVX2: vpmovsxwd +} + +define void @test10(<8 x i16>* %in, <8 x i32>* %out) nounwind { + %wide.load35 = load <8 x i16>* %in, align 1 + %sext = sext <8 x i16> %wide.load35 to <8 x i32> + store <8 x i32> zeroinitializer, <8 x i32>* undef, align 8 + store <8 x i32> %sext, <8 x i32>* %out, align 8 + ret void + +; AVX2: test10: +; AVX2: vpmovsxwd +} + +define void @test11(<2 x i32>* %in, <2 x i64>* %out) nounwind { + %wide.load35 = load <2 x i32>* %in, align 1 + %sext = sext <2 x i32> %wide.load35 to <2 x i64> + store <2 x i64> zeroinitializer, <2 x i64>* undef, align 8 + store <2 x i64> %sext, <2 x i64>* %out, align 8 + ret void + +; SSE41: test11: +; SSE41: pmovsxdq + +; AVX1: test11: +; AVX1: vpmovsxdq + +; AVX2: test11: +; AVX2: vpmovsxdq +} + +define void @test12(<4 x i32>* %in, <4 x i64>* %out) nounwind { + %wide.load35 = load <4 x i32>* %in, align 1 + %sext = sext <4 x i32> %wide.load35 to <4 x i64> + store <4 x i64> zeroinitializer, <4 x i64>* undef, align 8 + store <4 x i64> %sext, <4 x i64>* %out, align 8 + ret void + +; AVX2: test12: +; AVX2: vpmovsxdq +} diff --git a/test/CodeGen/X86/pr10499.ll b/test/CodeGen/X86/pr10499.ll new file mode 100644 index 0000000..f9cc747 --- /dev/null +++ b/test/CodeGen/X86/pr10499.ll @@ -0,0 +1,14 @@ +; RUN: llc < %s -mtriple=x86_64-pc-linux -mcpu=corei7-avx -mattr=-sse2 + +; No check as PR10499 is a crashing bug. 
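; With no FileCheck in the RUN line, lit judges the test purely by llc's
; exit status, the usual pattern for crash-only regression tests: the test
; passes as soon as compilation completes without asserting or segfaulting.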
+ +define void @autogen_24438_500() { +BB: + %I = insertelement <8 x i32> undef, i32 -1, i32 4 + %BC = bitcast <8 x i32> %I to <8 x float> + br label %CF + +CF: ; preds = %CF, %BB + %ZE = fpext <8 x float> %BC to <8 x double> + br label %CF +} diff --git a/test/CodeGen/X86/pr14562.ll b/test/CodeGen/X86/pr14562.ll new file mode 100644 index 0000000..e66f175 --- /dev/null +++ b/test/CodeGen/X86/pr14562.ll @@ -0,0 +1,15 @@ +; RUN: llc < %s -march=x86 | FileCheck %s + +@temp1 = global i64 -77129852189294865, align 8 + +define void @foo() nounwind { + %x = load i64* @temp1, align 8 + %s = shl i64 %x, 32 + %t = trunc i64 %s to i32 + %z = zext i32 %t to i64 + store i64 %z, i64* @temp1, align 8 +; CHECK: movl $0, {{_?}}temp1+4 +; CHECK: movl $0, {{_?}}temp1 + ret void +} + diff --git a/test/CodeGen/X86/pr15267.ll b/test/CodeGen/X86/pr15267.ll new file mode 100644 index 0000000..c8aaf32 --- /dev/null +++ b/test/CodeGen/X86/pr15267.ll @@ -0,0 +1,66 @@ +; RUN: llc < %s -mtriple=x86_64-pc-linux -mcpu=corei7-avx | FileCheck %s + +define <4 x i3> @test1(<4 x i3>* %in) nounwind { + %ret = load <4 x i3>* %in, align 1 + ret <4 x i3> %ret +} + +; CHECK: test1 +; CHECK: movzwl +; CHECK: shrl $3 +; CHECK: andl $7 +; CHECK: andl $7 +; CHECK: vmovd +; CHECK: pinsrd $1 +; CHECK: shrl $6 +; CHECK: andl $7 +; CHECK: pinsrd $2 +; CHECK: shrl $9 +; CHECK: andl $7 +; CHECK: pinsrd $3 +; CHECK: ret + +define <4 x i1> @test2(<4 x i1>* %in) nounwind { + %ret = load <4 x i1>* %in, align 1 + ret <4 x i1> %ret +} + +; CHECK: test2 +; CHECK: movzbl +; CHECK: shrl +; CHECK: andl $1 +; CHECK: andl $1 +; CHECK: vmovd +; CHECK: pinsrd $1 +; CHECK: shrl $2 +; CHECK: andl $1 +; CHECK: pinsrd $2 +; CHECK: shrl $3 +; CHECK: andl $1 +; CHECK: pinsrd $3 +; CHECK: ret + +define <4 x i64> @test3(<4 x i1>* %in) nounwind { + %wide.load35 = load <4 x i1>* %in, align 1 + %sext = sext <4 x i1> %wide.load35 to <4 x i64> + ret <4 x i64> %sext +} + +; CHECK: test3 +; CHECK: movzbl +; CHECK: shrl +; CHECK: andl $1 +; CHECK: andl $1 +; CHECK: vmovd +; CHECK: pinsrd $1 +; CHECK: shrl $2 +; CHECK: andl $1 +; CHECK: pinsrd $2 +; CHECK: shrl $3 +; CHECK: andl $1 +; CHECK: pinsrd $3 +; CHECK: pslld +; CHECK: psrad +; CHECK: pmovsxdq +; CHECK: pmovsxdq +; CHECK: ret diff --git a/test/CodeGen/X86/pre-ra-sched.ll b/test/CodeGen/X86/pre-ra-sched.ll new file mode 100644 index 0000000..b792ffa --- /dev/null +++ b/test/CodeGen/X86/pre-ra-sched.ll @@ -0,0 +1,56 @@ +; RUN: llc < %s -mtriple=x86_64-apple-macosx -debug-only=pre-RA-sched \ +; RUN: 2>&1 | FileCheck %s +; REQUIRES: asserts +; +; rdar:13279013: pre-RA-sched should not check all interferences and +; repush them on the ready queue after scheduling each instruction. 
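; The patterns below match the scheduler's debug trace, which only exists
; in builds with assertions enabled (hence REQUIRES: asserts) and is
; written to stderr (hence the 2>&1 before the pipe into FileCheck).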
+; +; CHECK: *** List Scheduling +; CHECK: Interfering reg EFLAGS +; CHECK: Repushing +; CHECK: Repushing +; CHECK: Repushing +; CHECK-NOT: Repushing +; CHECK: *** Final schedule +define i32 @test(i8* %pin) #0 { + %g0 = getelementptr inbounds i8* %pin, i64 0 + %l0 = load i8* %g0, align 1 + + %g1a = getelementptr inbounds i8* %pin, i64 1 + %l1a = load i8* %g1a, align 1 + %z1a = zext i8 %l1a to i32 + %g1b = getelementptr inbounds i8* %pin, i64 2 + %l1b = load i8* %g1b, align 1 + %z1b = zext i8 %l1b to i32 + %c1 = icmp ne i8 %l0, 0 + %x1 = xor i32 %z1a, %z1b + %s1 = select i1 %c1, i32 %z1a, i32 %x1 + + %g2a = getelementptr inbounds i8* %pin, i64 3 + %l2a = load i8* %g2a, align 1 + %z2a = zext i8 %l2a to i32 + %g2b = getelementptr inbounds i8* %pin, i64 4 + %l2b = load i8* %g2b, align 1 + %z2b = zext i8 %l2b to i32 + %x2 = xor i32 %z2a, %z2b + %s2 = select i1 %c1, i32 %z2a, i32 %x2 + + %g3a = getelementptr inbounds i8* %pin, i64 5 + %l3a = load i8* %g3a, align 1 + %z3a = zext i8 %l3a to i32 + %g3b = getelementptr inbounds i8* %pin, i64 6 + %l3b = load i8* %g3b, align 1 + %z3b = zext i8 %l3b to i32 + %x3 = xor i32 %z3a, %z3b + %s3 = select i1 %c1, i32 %z3a, i32 %x3 + + %c3 = icmp ne i8 %l1a, 0 + %c4 = icmp ne i8 %l2a, 0 + + %s4 = select i1 %c3, i32 %s1, i32 %s2 + %s5 = select i1 %c4, i32 %s4, i32 %s3 + + ret i32 %s5 +} + +attributes #0 = { nounwind ssp uwtable } diff --git a/test/CodeGen/X86/rip-rel-lea.ll b/test/CodeGen/X86/rip-rel-lea.ll new file mode 100644 index 0000000..71dacf6 --- /dev/null +++ b/test/CodeGen/X86/rip-rel-lea.ll @@ -0,0 +1,16 @@ +; RUN: llc < %s -mtriple=x86_64-pc-linux-gnu -relocation-model=pic | FileCheck %s -check-prefix=PIC64 +; RUN: llc < %s -mtriple=x86_64-pc-linux-gnux32 -relocation-model=pic | FileCheck %s -check-prefix=PICX32 +; RUN: llc < %s -mtriple=i686-pc-linux-gnu -relocation-model=pic | FileCheck %s -check-prefix=PIC32 + +; Use %rip-relative addressing even in static mode on x86-64, because +; it has a smaller encoding. + +@a = internal global double 3.4 +define double* @foo() nounwind { + %a = getelementptr double* @a, i64 0 + ret double* %a + +; PIC64: leaq a(%rip) +; PICX32: leal a(%rip) +; PIC32: leal a@GOTOFF(%eax) +} diff --git a/test/CodeGen/X86/sandybridge-loads.ll b/test/CodeGen/X86/sandybridge-loads.ll new file mode 100644 index 0000000..5a23cf1 --- /dev/null +++ b/test/CodeGen/X86/sandybridge-loads.ll @@ -0,0 +1,39 @@ +; RUN: llc -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -o - < %s | FileCheck %s + +;CHECK: wideloads +;CHECK: vmovaps +;CHECK: vinsertf128 +;CHECK: vmovaps +;CHECK-NOT: vinsertf128 +;CHECK: ret + +define void @wideloads(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwind uwtable noinline ssp { + %v0 = load <8 x float>* %a, align 16 ; <---- unaligned! + %v1 = load <8 x float>* %b, align 32 ; <---- aligned! + %m0 = fcmp olt <8 x float> %v1, %v0 + %v2 = load <8 x float>* %c, align 32 ; <---- aligned! 
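; On corei7-avx (Sandy Bridge) the backend avoids unaligned 32-byte memory
; ops: the align-16 load of %a above is expected to split into a 16-byte
; vmovaps plus a vinsertf128, while the align-32 loads stay whole, hence
; the CHECK-NOT on a second vinsertf128.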
+ %m1 = fcmp olt <8 x float> %v2, %v0 + %mand = and <8 x i1> %m1, %m0 + %r = zext <8 x i1> %mand to <8 x i32> + store <8 x i32> %r, <8 x i32>* undef, align 32 + ret void +} + +; CHECK: widestores +; loads: +; CHECK: vmovaps +; CHECK: vmovaps +; stores: +; CHECK: vmovaps +; CHECK: vextractf128 +; CHECK: vmovaps +;CHECK: ret + +define void @widestores(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwind uwtable noinline ssp { + %v0 = load <8 x float>* %a, align 32 + %v1 = load <8 x float>* %b, align 32 + store <8 x float> %v0, <8 x float>* %b, align 32 ; <--- aligned + store <8 x float> %v1, <8 x float>* %a, align 16 ; <--- unaligned + ret void +} + diff --git a/test/CodeGen/X86/sincos-opt.ll b/test/CodeGen/X86/sincos-opt.ll new file mode 100644 index 0000000..f364d1f --- /dev/null +++ b/test/CodeGen/X86/sincos-opt.ll @@ -0,0 +1,66 @@ +; RUN: llc < %s -mtriple=x86_64-apple-macosx10.9.0 -mcpu=core2 | FileCheck %s --check-prefix=OSX_SINCOS +; RUN: llc < %s -mtriple=x86_64-apple-macosx10.8.0 -mcpu=core2 | FileCheck %s --check-prefix=OSX_NOOPT +; RUN: llc < %s -mtriple=x86_64-pc-linux-gnu -mcpu=core2 -enable-unsafe-fp-math | FileCheck %s --check-prefix=GNU_SINCOS + +; Combine sin / cos into a single call. +; rdar://13087969 + +define float @test1(float %x) nounwind { +entry: +; GNU_SINCOS: test1: +; GNU_SINCOS: callq sincosf +; GNU_SINCOS: movss 4(%rsp), %xmm0 +; GNU_SINCOS: addss (%rsp), %xmm0 + +; OSX_SINCOS: test1: +; OSX_SINCOS: callq ___sincosf_stret +; OSX_SINCOS: addss %xmm1, %xmm0 + +; OSX_NOOPT: test1 +; OSX_NOOPT: callq _cosf +; OSX_NOOPT: callq _sinf + %call = tail call float @sinf(float %x) nounwind readnone + %call1 = tail call float @cosf(float %x) nounwind readnone + %add = fadd float %call, %call1 + ret float %add +} + +define double @test2(double %x) nounwind { +entry: +; GNU_SINCOS: test2: +; GNU_SINCOS: callq sincos +; GNU_SINCOS: movsd 16(%rsp), %xmm0 +; GNU_SINCOS: addsd 8(%rsp), %xmm0 + +; OSX_SINCOS: test2: +; OSX_SINCOS: callq ___sincos_stret +; OSX_SINCOS: addsd %xmm1, %xmm0 + +; OSX_NOOPT: test2 +; OSX_NOOPT: callq _cos +; OSX_NOOPT: callq _sin + %call = tail call double @sin(double %x) nounwind readnone + %call1 = tail call double @cos(double %x) nounwind readnone + %add = fadd double %call, %call1 + ret double %add +} + +define x86_fp80 @test3(x86_fp80 %x) nounwind { +entry: +; GNU_SINCOS: test3: +; GNU_SINCOS: callq sinl +; GNU_SINCOS: callq cosl +; GNU_SINCOS: ret + %call = tail call x86_fp80 @sinl(x86_fp80 %x) nounwind + %call1 = tail call x86_fp80 @cosl(x86_fp80 %x) nounwind + %add = fadd x86_fp80 %call, %call1 + ret x86_fp80 %add +} + +declare float @sinf(float) readonly +declare double @sin(double) readonly +declare float @cosf(float) readonly +declare double @cos(double) readonly + +declare x86_fp80 @sinl(x86_fp80) +declare x86_fp80 @cosl(x86_fp80) diff --git a/test/CodeGen/X86/sse2-blend.ll b/test/CodeGen/X86/sse2-blend.ll index 67ce1be..30a0fbe 100644 --- a/test/CodeGen/X86/sse2-blend.ll +++ b/test/CodeGen/X86/sse2-blend.ll @@ -29,7 +29,6 @@ define void@vsel_i32(<4 x i32>* %v1, <4 x i32>* %v2) { ; Without forcing instructions, fall back to the preferred PS domain. ; CHECK: vsel_i64 ; CHECK: andnps -; CHECK: andps ; CHECK: orps ; CHECK: ret @@ -44,7 +43,6 @@ define void@vsel_i64(<2 x i64>* %v1, <2 x i64>* %v2) { ; Without forcing instructions, fall back to the preferred PS domain. 
; CHECK: vsel_double ; CHECK: andnps -; CHECK: andps ; CHECK: orps ; CHECK: ret diff --git a/test/CodeGen/X86/stack-align-memcpy.ll b/test/CodeGen/X86/stack-align-memcpy.ll new file mode 100644 index 0000000..74945e5 --- /dev/null +++ b/test/CodeGen/X86/stack-align-memcpy.ll @@ -0,0 +1,18 @@ +; RUN: llc < %s -force-align-stack -mtriple i386-apple-darwin -mcpu=i486 | FileCheck %s + +%struct.foo = type { [88 x i8] } + +; PR15249 +; We can't use rep;movsl here because it clobbers the base pointer in %esi. +define void @test1(%struct.foo* nocapture %x, i32 %y) nounwind { + %dynalloc = alloca i8, i32 %y, align 1 + call void @bar(i8* %dynalloc, %struct.foo* align 4 byval %x) + ret void + +; CHECK: test1: +; CHECK: andl $-16, %esp +; CHECK: movl %esp, %esi +; CHECK-NOT: rep;movsl +} + +declare void @bar(i8* nocapture, %struct.foo* align 4 byval) nounwind diff --git a/test/CodeGen/X86/stack-protector.ll b/test/CodeGen/X86/stack-protector.ll index c075114..1e9ca1d 100644 --- a/test/CodeGen/X86/stack-protector.ll +++ b/test/CodeGen/X86/stack-protector.ll @@ -1,28 +1,3141 @@ -; RUN: llc -mtriple=i386-pc-linux-gnu < %s -o - | grep %gs: -; RUN: llc -mtriple=x86_64-pc-linux-gnu < %s -o - | grep %fs: -; RUN: llc -code-model=kernel -mtriple=x86_64-pc-linux-gnu < %s -o - | grep %gs: -; RUN: llc -mtriple=x86_64-apple-darwin < %s -o - | grep "__stack_chk_guard" -; RUN: llc -mtriple=x86_64-apple-darwin < %s -o - | grep "__stack_chk_fail" +; RUN: llc -mtriple=i386-pc-linux-gnu < %s -o - | FileCheck --check-prefix=LINUX-I386 %s +; RUN: llc -mtriple=x86_64-pc-linux-gnu < %s -o - | FileCheck --check-prefix=LINUX-X64 %s +; RUN: llc -code-model=kernel -mtriple=x86_64-pc-linux-gnu < %s -o - | FileCheck --check-prefix=LINUX-KERNEL-X64 %s +; RUN: llc -mtriple=x86_64-apple-darwin < %s -o - | FileCheck --check-prefix=DARWIN-X64 %s -@"\01LC" = internal constant [11 x i8] c"buf == %s\0A\00" ; <[11 x i8]*> [#uses=1] +%struct.foo = type { [16 x i8] } +%struct.foo.0 = type { [4 x i8] } +%struct.pair = type { i32, i32 } +%struct.nest = type { %struct.pair, %struct.pair } +%struct.vec = type { <4 x i32> } +%class.A = type { [2 x i8] } +%struct.deep = type { %union.anon } +%union.anon = type { %struct.anon } +%struct.anon = type { %struct.anon.0 } +%struct.anon.0 = type { %union.anon.1 } +%union.anon.1 = type { [2 x i8] } +%struct.small = type { i8 } -define void @test(i8* %a) nounwind ssp { +@.str = private unnamed_addr constant [4 x i8] c"%s\0A\00", align 1 + +; test1a: array of [16 x i8] +; no ssp attribute +; Requires no protector. +define void @test1a(i8* %a) nounwind uwtable { +entry: +; LINUX-I386: test1a: +; LINUX-I386-NOT: calll __stack_chk_fail +; LINUX-I386: .cfi_endproc + +; LINUX-X64: test1a: +; LINUX-X64-NOT: callq __stack_chk_fail +; LINUX-X64: .cfi_endproc + +; LINUX-KERNEL-X64: test1a: +; LINUX-KERNEL-X64-NOT: callq __stack_chk_fail +; LINUX-KERNEL-X64: .cfi_endproc + +; DARWIN-X64: test1a: +; DARWIN-X64-NOT: callq ___stack_chk_fail +; DARWIN-X64: .cfi_endproc + %a.addr = alloca i8*, align 8 + %buf = alloca [16 x i8], align 16 + store i8* %a, i8** %a.addr, align 8 + %arraydecay = getelementptr inbounds [16 x i8]* %buf, i32 0, i32 0 + %0 = load i8** %a.addr, align 8 + %call = call i8* @strcpy(i8* %arraydecay, i8* %0) + %arraydecay1 = getelementptr inbounds [16 x i8]* %buf, i32 0, i32 0 + %call2 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay1) + ret void +} + +; test1b: array of [16 x i8] +; ssp attribute +; Requires protector. 
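; The guard lowering these CHECK lines look for is roughly: in the
; prologue, load the canary (from %fs:0x28 on x86-64 Linux, from %gs: on
; i386 and in the kernel code model, from the ___stack_chk_guard global on
; Darwin) and spill it to a slot placed above the local buffers; in the
; epilogue, reload the slot, compare it with the canary again, and branch
; to __stack_chk_fail on any mismatch.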
+define void @test1b(i8* %a) nounwind uwtable ssp { +entry: +; LINUX-I386: test1b: +; LINUX-I386: mov{{l|q}} %gs: +; LINUX-I386: calll __stack_chk_fail + +; LINUX-X64: test1b: +; LINUX-X64: mov{{l|q}} %fs: +; LINUX-X64: callq __stack_chk_fail + +; LINUX-KERNEL-X64: test1b: +; LINUX-KERNEL-X64: mov{{l|q}} %gs: +; LINUX-KERNEL-X64: callq __stack_chk_fail + +; DARWIN-X64: test1b: +; DARWIN-X64: mov{{l|q}} ___stack_chk_guard +; DARWIN-X64: callq ___stack_chk_fail + %a.addr = alloca i8*, align 8 + %buf = alloca [16 x i8], align 16 + store i8* %a, i8** %a.addr, align 8 + %arraydecay = getelementptr inbounds [16 x i8]* %buf, i32 0, i32 0 + %0 = load i8** %a.addr, align 8 + %call = call i8* @strcpy(i8* %arraydecay, i8* %0) + %arraydecay1 = getelementptr inbounds [16 x i8]* %buf, i32 0, i32 0 + %call2 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay1) + ret void +} + +; test1c: array of [16 x i8] +; sspstrong attribute +; Requires protector. +define void @test1c(i8* %a) nounwind uwtable sspstrong { +entry: +; LINUX-I386: test1c: +; LINUX-I386: mov{{l|q}} %gs: +; LINUX-I386: calll __stack_chk_fail + +; LINUX-X64: test1c: +; LINUX-X64: mov{{l|q}} %fs: +; LINUX-X64: callq __stack_chk_fail + +; LINUX-KERNEL-X64: test1c: +; LINUX-KERNEL-X64: mov{{l|q}} %gs: +; LINUX-KERNEL-X64: callq __stack_chk_fail + +; DARWIN-X64: test1c: +; DARWIN-X64: mov{{l|q}} ___stack_chk_guard +; DARWIN-X64: callq ___stack_chk_fail + %a.addr = alloca i8*, align 8 + %buf = alloca [16 x i8], align 16 + store i8* %a, i8** %a.addr, align 8 + %arraydecay = getelementptr inbounds [16 x i8]* %buf, i32 0, i32 0 + %0 = load i8** %a.addr, align 8 + %call = call i8* @strcpy(i8* %arraydecay, i8* %0) + %arraydecay1 = getelementptr inbounds [16 x i8]* %buf, i32 0, i32 0 + %call2 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay1) + ret void +} + +; test1d: array of [16 x i8] +; sspreq attribute +; Requires protector. +define void @test1d(i8* %a) nounwind uwtable sspreq { +entry: +; LINUX-I386: test1d: +; LINUX-I386: mov{{l|q}} %gs: +; LINUX-I386: calll __stack_chk_fail + +; LINUX-X64: test1d: +; LINUX-X64: mov{{l|q}} %fs: +; LINUX-X64: callq __stack_chk_fail + +; LINUX-KERNEL-X64: test1d: +; LINUX-KERNEL-X64: mov{{l|q}} %gs: +; LINUX-KERNEL-X64: callq __stack_chk_fail + +; DARWIN-X64: test1d: +; DARWIN-X64: mov{{l|q}} ___stack_chk_guard +; DARWIN-X64: callq ___stack_chk_fail + %a.addr = alloca i8*, align 8 + %buf = alloca [16 x i8], align 16 + store i8* %a, i8** %a.addr, align 8 + %arraydecay = getelementptr inbounds [16 x i8]* %buf, i32 0, i32 0 + %0 = load i8** %a.addr, align 8 + %call = call i8* @strcpy(i8* %arraydecay, i8* %0) + %arraydecay1 = getelementptr inbounds [16 x i8]* %buf, i32 0, i32 0 + %call2 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay1) + ret void +} + +; test2a: struct { [16 x i8] } +; no ssp attribute +; Requires no protector. 
+define void @test2a(i8* %a) nounwind uwtable { +entry: +; LINUX-I386: test2a: +; LINUX-I386-NOT: calll __stack_chk_fail +; LINUX-I386: .cfi_endproc + +; LINUX-X64: test2a: +; LINUX-X64-NOT: callq __stack_chk_fail +; LINUX-X64: .cfi_endproc + +; LINUX-KERNEL-X64: test2a: +; LINUX-KERNEL-X64-NOT: callq __stack_chk_fail +; LINUX-KERNEL-X64: .cfi_endproc + +; DARWIN-X64: test2a: +; DARWIN-X64-NOT: callq ___stack_chk_fail +; DARWIN-X64: .cfi_endproc + %a.addr = alloca i8*, align 8 + %b = alloca %struct.foo, align 1 + store i8* %a, i8** %a.addr, align 8 + %buf = getelementptr inbounds %struct.foo* %b, i32 0, i32 0 + %arraydecay = getelementptr inbounds [16 x i8]* %buf, i32 0, i32 0 + %0 = load i8** %a.addr, align 8 + %call = call i8* @strcpy(i8* %arraydecay, i8* %0) + %buf1 = getelementptr inbounds %struct.foo* %b, i32 0, i32 0 + %arraydecay2 = getelementptr inbounds [16 x i8]* %buf1, i32 0, i32 0 + %call3 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay2) + ret void +} + +; test2b: struct { [16 x i8] } +; ssp attribute +; Requires protector. +define void @test2b(i8* %a) nounwind uwtable ssp { +entry: +; LINUX-I386: test2b: +; LINUX-I386: mov{{l|q}} %gs: +; LINUX-I386: calll __stack_chk_fail + +; LINUX-X64: test2b: +; LINUX-X64: mov{{l|q}} %fs: +; LINUX-X64: callq __stack_chk_fail + +; LINUX-KERNEL-X64: test2b: +; LINUX-KERNEL-X64: mov{{l|q}} %gs: +; LINUX-KERNEL-X64: callq __stack_chk_fail + +; DARWIN-X64: test2b: +; DARWIN-X64: mov{{l|q}} ___stack_chk_guard +; DARWIN-X64: callq ___stack_chk_fail + %a.addr = alloca i8*, align 8 + %b = alloca %struct.foo, align 1 + store i8* %a, i8** %a.addr, align 8 + %buf = getelementptr inbounds %struct.foo* %b, i32 0, i32 0 + %arraydecay = getelementptr inbounds [16 x i8]* %buf, i32 0, i32 0 + %0 = load i8** %a.addr, align 8 + %call = call i8* @strcpy(i8* %arraydecay, i8* %0) + %buf1 = getelementptr inbounds %struct.foo* %b, i32 0, i32 0 + %arraydecay2 = getelementptr inbounds [16 x i8]* %buf1, i32 0, i32 0 + %call3 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay2) + ret void +} + +; test2c: struct { [16 x i8] } +; sspstrong attribute +; Requires protector. +define void @test2c(i8* %a) nounwind uwtable sspstrong { +entry: +; LINUX-I386: test2c: +; LINUX-I386: mov{{l|q}} %gs: +; LINUX-I386: calll __stack_chk_fail + +; LINUX-X64: test2c: +; LINUX-X64: mov{{l|q}} %fs: +; LINUX-X64: callq __stack_chk_fail + +; LINUX-KERNEL-X64: test2c: +; LINUX-KERNEL-X64: mov{{l|q}} %gs: +; LINUX-KERNEL-X64: callq __stack_chk_fail + +; DARWIN-X64: test2c: +; DARWIN-X64: mov{{l|q}} ___stack_chk_guard +; DARWIN-X64: callq ___stack_chk_fail + %a.addr = alloca i8*, align 8 + %b = alloca %struct.foo, align 1 + store i8* %a, i8** %a.addr, align 8 + %buf = getelementptr inbounds %struct.foo* %b, i32 0, i32 0 + %arraydecay = getelementptr inbounds [16 x i8]* %buf, i32 0, i32 0 + %0 = load i8** %a.addr, align 8 + %call = call i8* @strcpy(i8* %arraydecay, i8* %0) + %buf1 = getelementptr inbounds %struct.foo* %b, i32 0, i32 0 + %arraydecay2 = getelementptr inbounds [16 x i8]* %buf1, i32 0, i32 0 + %call3 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay2) + ret void +} + +; test2d: struct { [16 x i8] } +; sspreq attribute +; Requires protector. 
+define void @test2d(i8* %a) nounwind uwtable sspreq { +entry: +; LINUX-I386: test2d: +; LINUX-I386: mov{{l|q}} %gs: +; LINUX-I386: calll __stack_chk_fail + +; LINUX-X64: test2d: +; LINUX-X64: mov{{l|q}} %fs: +; LINUX-X64: callq __stack_chk_fail + +; LINUX-KERNEL-X64: test2d: +; LINUX-KERNEL-X64: mov{{l|q}} %gs: +; LINUX-KERNEL-X64: callq __stack_chk_fail + +; DARWIN-X64: test2d: +; DARWIN-X64: mov{{l|q}} ___stack_chk_guard +; DARWIN-X64: callq ___stack_chk_fail + %a.addr = alloca i8*, align 8 + %b = alloca %struct.foo, align 1 + store i8* %a, i8** %a.addr, align 8 + %buf = getelementptr inbounds %struct.foo* %b, i32 0, i32 0 + %arraydecay = getelementptr inbounds [16 x i8]* %buf, i32 0, i32 0 + %0 = load i8** %a.addr, align 8 + %call = call i8* @strcpy(i8* %arraydecay, i8* %0) + %buf1 = getelementptr inbounds %struct.foo* %b, i32 0, i32 0 + %arraydecay2 = getelementptr inbounds [16 x i8]* %buf1, i32 0, i32 0 + %call3 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay2) + ret void +} + +; test3a: array of [4 x i8] +; no ssp attribute +; Requires no protector. +define void @test3a(i8* %a) nounwind uwtable { +entry: +; LINUX-I386: test3a: +; LINUX-I386-NOT: calll __stack_chk_fail +; LINUX-I386: .cfi_endproc + +; LINUX-X64: test3a: +; LINUX-X64-NOT: callq __stack_chk_fail +; LINUX-X64: .cfi_endproc + +; LINUX-KERNEL-X64: test3a: +; LINUX-KERNEL-X64-NOT: callq __stack_chk_fail +; LINUX-KERNEL-X64: .cfi_endproc + +; DARWIN-X64: test3a: +; DARWIN-X64-NOT: callq ___stack_chk_fail +; DARWIN-X64: .cfi_endproc + %a.addr = alloca i8*, align 8 + %buf = alloca [4 x i8], align 1 + store i8* %a, i8** %a.addr, align 8 + %arraydecay = getelementptr inbounds [4 x i8]* %buf, i32 0, i32 0 + %0 = load i8** %a.addr, align 8 + %call = call i8* @strcpy(i8* %arraydecay, i8* %0) + %arraydecay1 = getelementptr inbounds [4 x i8]* %buf, i32 0, i32 0 + %call2 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay1) + ret void +} + +; test3b: array [4 x i8] +; ssp attribute +; Requires no protector. +define void @test3b(i8* %a) nounwind uwtable ssp { +entry: +; LINUX-I386: test3b: +; LINUX-I386-NOT: calll __stack_chk_fail +; LINUX-I386: .cfi_endproc + +; LINUX-X64: test3b: +; LINUX-X64-NOT: callq __stack_chk_fail +; LINUX-X64: .cfi_endproc + +; LINUX-KERNEL-X64: test3b: +; LINUX-KERNEL-X64-NOT: callq __stack_chk_fail +; LINUX-KERNEL-X64: .cfi_endproc + +; DARWIN-X64: test3b: +; DARWIN-X64-NOT: callq ___stack_chk_fail +; DARWIN-X64: .cfi_endproc + %a.addr = alloca i8*, align 8 + %buf = alloca [4 x i8], align 1 + store i8* %a, i8** %a.addr, align 8 + %arraydecay = getelementptr inbounds [4 x i8]* %buf, i32 0, i32 0 + %0 = load i8** %a.addr, align 8 + %call = call i8* @strcpy(i8* %arraydecay, i8* %0) + %arraydecay1 = getelementptr inbounds [4 x i8]* %buf, i32 0, i32 0 + %call2 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay1) + ret void +} + +; test3c: array of [4 x i8] +; sspstrong attribute +; Requires protector. 
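; Plain ssp only triggers on character buffers of at least the
; ssp-buffer-size threshold (8 bytes by default), which is why the 4-byte
; array in test3b stays unprotected; sspstrong guards stack arrays of any
; size, so test3c and test3d both get a canary.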
+define void @test3c(i8* %a) nounwind uwtable sspstrong { +entry: +; LINUX-I386: test3c: +; LINUX-I386: mov{{l|q}} %gs: +; LINUX-I386: calll __stack_chk_fail + +; LINUX-X64: test3c: +; LINUX-X64: mov{{l|q}} %fs: +; LINUX-X64: callq __stack_chk_fail + +; LINUX-KERNEL-X64: test3c: +; LINUX-KERNEL-X64: mov{{l|q}} %gs: +; LINUX-KERNEL-X64: callq __stack_chk_fail + +; DARWIN-X64: test3c: +; DARWIN-X64: mov{{l|q}} ___stack_chk_guard +; DARWIN-X64: callq ___stack_chk_fail + %a.addr = alloca i8*, align 8 + %buf = alloca [4 x i8], align 1 + store i8* %a, i8** %a.addr, align 8 + %arraydecay = getelementptr inbounds [4 x i8]* %buf, i32 0, i32 0 + %0 = load i8** %a.addr, align 8 + %call = call i8* @strcpy(i8* %arraydecay, i8* %0) + %arraydecay1 = getelementptr inbounds [4 x i8]* %buf, i32 0, i32 0 + %call2 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay1) + ret void +} + +; test3d: array of [4 x i8] +; sspreq attribute +; Requires protector. +define void @test3d(i8* %a) nounwind uwtable sspreq { +entry: +; LINUX-I386: test3d: +; LINUX-I386: mov{{l|q}} %gs: +; LINUX-I386: calll __stack_chk_fail + +; LINUX-X64: test3d: +; LINUX-X64: mov{{l|q}} %fs: +; LINUX-X64: callq __stack_chk_fail + +; LINUX-KERNEL-X64: test3d: +; LINUX-KERNEL-X64: mov{{l|q}} %gs: +; LINUX-KERNEL-X64: callq __stack_chk_fail + +; DARWIN-X64: test3d: +; DARWIN-X64: mov{{l|q}} ___stack_chk_guard +; DARWIN-X64: callq ___stack_chk_fail + %a.addr = alloca i8*, align 8 + %buf = alloca [4 x i8], align 1 + store i8* %a, i8** %a.addr, align 8 + %arraydecay = getelementptr inbounds [4 x i8]* %buf, i32 0, i32 0 + %0 = load i8** %a.addr, align 8 + %call = call i8* @strcpy(i8* %arraydecay, i8* %0) + %arraydecay1 = getelementptr inbounds [4 x i8]* %buf, i32 0, i32 0 + %call2 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay1) + ret void +} + +; test4a: struct { [4 x i8] } +; no ssp attribute +; Requires no protector. +define void @test4a(i8* %a) nounwind uwtable { +entry: +; LINUX-I386: test4a: +; LINUX-I386-NOT: calll __stack_chk_fail +; LINUX-I386: .cfi_endproc + +; LINUX-X64: test4a: +; LINUX-X64-NOT: callq __stack_chk_fail +; LINUX-X64: .cfi_endproc + +; LINUX-KERNEL-X64: test4a: +; LINUX-KERNEL-X64-NOT: callq __stack_chk_fail +; LINUX-KERNEL-X64: .cfi_endproc + +; DARWIN-X64: test4a: +; DARWIN-X64-NOT: callq ___stack_chk_fail +; DARWIN-X64: .cfi_endproc + %a.addr = alloca i8*, align 8 + %b = alloca %struct.foo.0, align 1 + store i8* %a, i8** %a.addr, align 8 + %buf = getelementptr inbounds %struct.foo.0* %b, i32 0, i32 0 + %arraydecay = getelementptr inbounds [4 x i8]* %buf, i32 0, i32 0 + %0 = load i8** %a.addr, align 8 + %call = call i8* @strcpy(i8* %arraydecay, i8* %0) + %buf1 = getelementptr inbounds %struct.foo.0* %b, i32 0, i32 0 + %arraydecay2 = getelementptr inbounds [4 x i8]* %buf1, i32 0, i32 0 + %call3 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay2) + ret void +} + +; test4b: struct { [4 x i8] } +; ssp attribute +; Requires no protector. 
+define void @test4b(i8* %a) nounwind uwtable ssp { +entry: +; LINUX-I386: test4b: +; LINUX-I386-NOT: calll __stack_chk_fail +; LINUX-I386: .cfi_endproc + +; LINUX-X64: test4b: +; LINUX-X64-NOT: callq __stack_chk_fail +; LINUX-X64: .cfi_endproc + +; LINUX-KERNEL-X64: test4b: +; LINUX-KERNEL-X64-NOT: callq __stack_chk_fail +; LINUX-KERNEL-X64: .cfi_endproc + +; DARWIN-X64: test4b: +; DARWIN-X64-NOT: callq ___stack_chk_fail +; DARWIN-X64: .cfi_endproc + %a.addr = alloca i8*, align 8 + %b = alloca %struct.foo.0, align 1 + store i8* %a, i8** %a.addr, align 8 + %buf = getelementptr inbounds %struct.foo.0* %b, i32 0, i32 0 + %arraydecay = getelementptr inbounds [4 x i8]* %buf, i32 0, i32 0 + %0 = load i8** %a.addr, align 8 + %call = call i8* @strcpy(i8* %arraydecay, i8* %0) + %buf1 = getelementptr inbounds %struct.foo.0* %b, i32 0, i32 0 + %arraydecay2 = getelementptr inbounds [4 x i8]* %buf1, i32 0, i32 0 + %call3 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay2) + ret void +} + +; test4c: struct { [4 x i8] } +; sspstrong attribute +; Requires protector. +define void @test4c(i8* %a) nounwind uwtable sspstrong { +entry: +; LINUX-I386: test4c: +; LINUX-I386: mov{{l|q}} %gs: +; LINUX-I386: calll __stack_chk_fail + +; LINUX-X64: test4c: +; LINUX-X64: mov{{l|q}} %fs: +; LINUX-X64: callq __stack_chk_fail + +; LINUX-KERNEL-X64: test4c: +; LINUX-KERNEL-X64: mov{{l|q}} %gs: +; LINUX-KERNEL-X64: callq __stack_chk_fail + +; DARWIN-X64: test4c: +; DARWIN-X64: mov{{l|q}} ___stack_chk_guard +; DARWIN-X64: callq ___stack_chk_fail + %a.addr = alloca i8*, align 8 + %b = alloca %struct.foo.0, align 1 + store i8* %a, i8** %a.addr, align 8 + %buf = getelementptr inbounds %struct.foo.0* %b, i32 0, i32 0 + %arraydecay = getelementptr inbounds [4 x i8]* %buf, i32 0, i32 0 + %0 = load i8** %a.addr, align 8 + %call = call i8* @strcpy(i8* %arraydecay, i8* %0) + %buf1 = getelementptr inbounds %struct.foo.0* %b, i32 0, i32 0 + %arraydecay2 = getelementptr inbounds [4 x i8]* %buf1, i32 0, i32 0 + %call3 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay2) + ret void +} + +; test4d: struct { [4 x i8] } +; sspreq attribute +; Requires protector. +define void @test4d(i8* %a) nounwind uwtable sspreq { +entry: +; LINUX-I386: test4d: +; LINUX-I386: mov{{l|q}} %gs: +; LINUX-I386: calll __stack_chk_fail + +; LINUX-X64: test4d: +; LINUX-X64: mov{{l|q}} %fs: +; LINUX-X64: callq __stack_chk_fail + +; LINUX-KERNEL-X64: test4d: +; LINUX-KERNEL-X64: mov{{l|q}} %gs: +; LINUX-KERNEL-X64: callq __stack_chk_fail + +; DARWIN-X64: test4d: +; DARWIN-X64: mov{{l|q}} ___stack_chk_guard +; DARWIN-X64: callq ___stack_chk_fail + %a.addr = alloca i8*, align 8 + %b = alloca %struct.foo.0, align 1 + store i8* %a, i8** %a.addr, align 8 + %buf = getelementptr inbounds %struct.foo.0* %b, i32 0, i32 0 + %arraydecay = getelementptr inbounds [4 x i8]* %buf, i32 0, i32 0 + %0 = load i8** %a.addr, align 8 + %call = call i8* @strcpy(i8* %arraydecay, i8* %0) + %buf1 = getelementptr inbounds %struct.foo.0* %b, i32 0, i32 0 + %arraydecay2 = getelementptr inbounds [4 x i8]* %buf1, i32 0, i32 0 + %call3 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay2) + ret void +} + +; test5a: no arrays / no nested arrays +; no ssp attribute +; Requires no protector. 
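; With only scalar locals there is no buffer for a linear overflow to
; smash, so ssp and sspstrong both skip the guard (test5a-test5c); sspreq
; is the one level that inserts it unconditionally (test5d).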
+define void @test5a(i8* %a) nounwind uwtable { +entry: +; LINUX-I386: test5a: +; LINUX-I386-NOT: calll __stack_chk_fail +; LINUX-I386: .cfi_endproc + +; LINUX-X64: test5a: +; LINUX-X64-NOT: callq __stack_chk_fail +; LINUX-X64: .cfi_endproc + +; LINUX-KERNEL-X64: test5a: +; LINUX-KERNEL-X64-NOT: callq __stack_chk_fail +; LINUX-KERNEL-X64: .cfi_endproc + +; DARWIN-X64: test5a: +; DARWIN-X64-NOT: callq ___stack_chk_fail +; DARWIN-X64: .cfi_endproc + %a.addr = alloca i8*, align 8 + store i8* %a, i8** %a.addr, align 8 + %0 = load i8** %a.addr, align 8 + %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i8* %0) + ret void +} + +; test5b: no arrays / no nested arrays +; ssp attribute +; Requires no protector. +define void @test5b(i8* %a) nounwind uwtable ssp { +entry: +; LINUX-I386: test5b: +; LINUX-I386-NOT: calll __stack_chk_fail +; LINUX-I386: .cfi_endproc + +; LINUX-X64: test5b: +; LINUX-X64-NOT: callq __stack_chk_fail +; LINUX-X64: .cfi_endproc + +; LINUX-KERNEL-X64: test5b: +; LINUX-KERNEL-X64-NOT: callq __stack_chk_fail +; LINUX-KERNEL-X64: .cfi_endproc + +; DARWIN-X64: test5b: +; DARWIN-X64-NOT: callq ___stack_chk_fail +; DARWIN-X64: .cfi_endproc + %a.addr = alloca i8*, align 8 + store i8* %a, i8** %a.addr, align 8 + %0 = load i8** %a.addr, align 8 + %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i8* %0) + ret void +} + +; test5c: no arrays / no nested arrays +; sspstrong attribute +; Requires no protector. +define void @test5c(i8* %a) nounwind uwtable sspstrong { +entry: +; LINUX-I386: test5c: +; LINUX-I386-NOT: calll __stack_chk_fail +; LINUX-I386: .cfi_endproc + +; LINUX-X64: test5c: +; LINUX-X64-NOT: callq __stack_chk_fail +; LINUX-X64: .cfi_endproc + +; LINUX-KERNEL-X64: test5c: +; LINUX-KERNEL-X64-NOT: callq __stack_chk_fail +; LINUX-KERNEL-X64: .cfi_endproc + +; DARWIN-X64: test5c: +; DARWIN-X64-NOT: callq ___stack_chk_fail +; DARWIN-X64: .cfi_endproc + %a.addr = alloca i8*, align 8 + store i8* %a, i8** %a.addr, align 8 + %0 = load i8** %a.addr, align 8 + %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i8* %0) + ret void +} + +; test5d: no arrays / no nested arrays +; sspreq attribute +; Requires protector. +define void @test5d(i8* %a) nounwind uwtable sspreq { +entry: +; LINUX-I386: test5d: +; LINUX-I386: mov{{l|q}} %gs: +; LINUX-I386: calll __stack_chk_fail + +; LINUX-X64: test5d: +; LINUX-X64: mov{{l|q}} %fs: +; LINUX-X64: callq __stack_chk_fail + +; LINUX-KERNEL-X64: test5d: +; LINUX-KERNEL-X64: mov{{l|q}} %gs: +; LINUX-KERNEL-X64: callq __stack_chk_fail + +; DARWIN-X64: test5d: +; DARWIN-X64: mov{{l|q}} ___stack_chk_guard +; DARWIN-X64: callq ___stack_chk_fail + %a.addr = alloca i8*, align 8 + store i8* %a, i8** %a.addr, align 8 + %0 = load i8** %a.addr, align 8 + %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i8* %0) + ret void +} + +; test6a: Address-of local taken (j = &a) +; no ssp attribute +; Requires no protector. 
+define void @test6a() nounwind uwtable { +entry: +; LINUX-I386: test6a: +; LINUX-I386-NOT: calll __stack_chk_fail +; LINUX-I386: .cfi_endproc + +; LINUX-X64: test6a: +; LINUX-X64-NOT: callq __stack_chk_fail +; LINUX-X64: .cfi_endproc + +; LINUX-KERNEL-X64: test6a: +; LINUX-KERNEL-X64-NOT: callq __stack_chk_fail +; LINUX-KERNEL-X64: .cfi_endproc + +; DARWIN-X64: test6a: +; DARWIN-X64-NOT: callq ___stack_chk_fail +; DARWIN-X64: .cfi_endproc + %retval = alloca i32, align 4 + %a = alloca i32, align 4 + %j = alloca i32*, align 8 + store i32 0, i32* %retval + %0 = load i32* %a, align 4 + %add = add nsw i32 %0, 1 + store i32 %add, i32* %a, align 4 + store i32* %a, i32** %j, align 8 + ret void +} + +; test6b: Address-of local taken (j = &a) +; ssp attribute +; Requires no protector. +define void @test6b() nounwind uwtable ssp { +entry: +; LINUX-I386: test6b: +; LINUX-I386-NOT: calll __stack_chk_fail +; LINUX-I386: .cfi_endproc + +; LINUX-X64: test6b: +; LINUX-X64-NOT: callq __stack_chk_fail +; LINUX-X64: .cfi_endproc + +; LINUX-KERNEL-X64: test6b: +; LINUX-KERNEL-X64-NOT: callq __stack_chk_fail +; LINUX-KERNEL-X64: .cfi_endproc + +; DARWIN-X64: test6b: +; DARWIN-X64-NOT: callq ___stack_chk_fail +; DARWIN-X64: .cfi_endproc + %retval = alloca i32, align 4 + %a = alloca i32, align 4 + %j = alloca i32*, align 8 + store i32 0, i32* %retval + %0 = load i32* %a, align 4 + %add = add nsw i32 %0, 1 + store i32 %add, i32* %a, align 4 + store i32* %a, i32** %j, align 8 + ret void +} + +; test6c: Address-of local taken (j = &a) +; sspstrong attribute +; Requires protector. +define void @test6c() nounwind uwtable sspstrong { +entry: +; LINUX-I386: test6c: +; LINUX-I386: mov{{l|q}} %gs: +; LINUX-I386: calll __stack_chk_fail + +; LINUX-X64: test6c: +; LINUX-X64: mov{{l|q}} %fs: +; LINUX-X64: callq __stack_chk_fail + +; LINUX-KERNEL-X64: test6c: +; LINUX-KERNEL-X64: mov{{l|q}} %gs: +; LINUX-KERNEL-X64: callq __stack_chk_fail + +; DARWIN-X64: test6c: +; DARWIN-X64: mov{{l|q}} ___stack_chk_guard +; DARWIN-X64: callq ___stack_chk_fail + %retval = alloca i32, align 4 + %a = alloca i32, align 4 + %j = alloca i32*, align 8 + store i32 0, i32* %retval + %0 = load i32* %a, align 4 + %add = add nsw i32 %0, 1 + store i32 %add, i32* %a, align 4 + store i32* %a, i32** %j, align 8 + ret void +} + +; test6d: Address-of local taken (j = &a) +; sspreq attribute +; Requires protector. +define void @test6d() nounwind uwtable sspreq { +entry: +; LINUX-I386: test6d: +; LINUX-I386: mov{{l|q}} %gs: +; LINUX-I386: calll __stack_chk_fail + +; LINUX-X64: test6d: +; LINUX-X64: mov{{l|q}} %fs: +; LINUX-X64: callq __stack_chk_fail + +; LINUX-KERNEL-X64: test6d: +; LINUX-KERNEL-X64: mov{{l|q}} %gs: +; LINUX-KERNEL-X64: callq __stack_chk_fail + +; DARWIN-X64: test6d: +; DARWIN-X64: mov{{l|q}} ___stack_chk_guard +; DARWIN-X64: callq ___stack_chk_fail + %retval = alloca i32, align 4 + %a = alloca i32, align 4 + %j = alloca i32*, align 8 + store i32 0, i32* %retval + %0 = load i32* %a, align 4 + %add = add nsw i32 %0, 1 + store i32 %add, i32* %a, align 4 + store i32* %a, i32** %j, align 8 + ret void +} + +; test7a: PtrToInt Cast +; no ssp attribute +; Requires no protector. 
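; sspstrong treats a ptrtoint of an alloca like taking its address: once
; the slot can escape, the frame gets a guard (test7c, test7d), while plain
; ssp still sees no qualifying buffer and leaves it unprotected (test7b).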
+define void @test7a() nounwind uwtable readnone { +entry: +; LINUX-I386: test7a: +; LINUX-I386-NOT: calll __stack_chk_fail +; LINUX-I386: .cfi_endproc + +; LINUX-X64: test7a: +; LINUX-X64-NOT: callq __stack_chk_fail +; LINUX-X64: .cfi_endproc + +; LINUX-KERNEL-X64: test7a: +; LINUX-KERNEL-X64-NOT: callq __stack_chk_fail +; LINUX-KERNEL-X64: .cfi_endproc + +; DARWIN-X64: test7a: +; DARWIN-X64-NOT: callq ___stack_chk_fail +; DARWIN-X64: .cfi_endproc + %a = alloca i32, align 4 + %0 = ptrtoint i32* %a to i64 + %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i64 %0) + ret void +} + +; test7b: PtrToInt Cast +; ssp attribute +; Requires no protector. +define void @test7b() nounwind uwtable readnone ssp { +entry: +; LINUX-I386: test7b: +; LINUX-I386-NOT: calll __stack_chk_fail +; LINUX-I386: .cfi_endproc + +; LINUX-X64: test7b: +; LINUX-X64-NOT: callq __stack_chk_fail +; LINUX-X64: .cfi_endproc + +; LINUX-KERNEL-X64: test7b: +; LINUX-KERNEL-X64-NOT: callq __stack_chk_fail +; LINUX-KERNEL-X64: .cfi_endproc + +; DARWIN-X64: test7b: +; DARWIN-X64-NOT: callq ___stack_chk_fail +; DARWIN-X64: .cfi_endproc + %a = alloca i32, align 4 + %0 = ptrtoint i32* %a to i64 + %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i64 %0) + ret void +} + +; test7c: PtrToInt Cast +; sspstrong attribute +; Requires protector. +define void @test7c() nounwind uwtable readnone sspstrong { +entry: +; LINUX-I386: test7c: +; LINUX-I386: mov{{l|q}} %gs: +; LINUX-I386: calll __stack_chk_fail + +; LINUX-X64: test7c: +; LINUX-X64: mov{{l|q}} %fs: +; LINUX-X64: callq __stack_chk_fail + +; LINUX-KERNEL-X64: test7c: +; LINUX-KERNEL-X64: mov{{l|q}} %gs: +; LINUX-KERNEL-X64: callq __stack_chk_fail + +; DARWIN-X64: test7c: +; DARWIN-X64: mov{{l|q}} ___stack_chk_guard +; DARWIN-X64: callq ___stack_chk_fail + %a = alloca i32, align 4 + %0 = ptrtoint i32* %a to i64 + %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i64 %0) + ret void +} + +; test7d: PtrToInt Cast +; sspreq attribute +; Requires protector. +define void @test7d() nounwind uwtable readnone sspreq { +entry: +; LINUX-I386: test7d: +; LINUX-I386: mov{{l|q}} %gs: +; LINUX-I386: calll __stack_chk_fail + +; LINUX-X64: test7d: +; LINUX-X64: mov{{l|q}} %fs: +; LINUX-X64: callq __stack_chk_fail + +; LINUX-KERNEL-X64: test7d: +; LINUX-KERNEL-X64: mov{{l|q}} %gs: +; LINUX-KERNEL-X64: callq __stack_chk_fail + +; DARWIN-X64: test7d: +; DARWIN-X64: mov{{l|q}} ___stack_chk_guard +; DARWIN-X64: callq ___stack_chk_fail + %a = alloca i32, align 4 + %0 = ptrtoint i32* %a to i64 + %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i64 %0) + ret void +} + +; test8a: Passing addr-of to function call +; no ssp attribute +; Requires no protector. +define void @test8a() nounwind uwtable { +entry: +; LINUX-I386: test8a: +; LINUX-I386-NOT: calll __stack_chk_fail +; LINUX-I386: .cfi_endproc + +; LINUX-X64: test8a: +; LINUX-X64-NOT: callq __stack_chk_fail +; LINUX-X64: .cfi_endproc + +; LINUX-KERNEL-X64: test8a: +; LINUX-KERNEL-X64-NOT: callq __stack_chk_fail +; LINUX-KERNEL-X64: .cfi_endproc + +; DARWIN-X64: test8a: +; DARWIN-X64-NOT: callq ___stack_chk_fail +; DARWIN-X64: .cfi_endproc + %b = alloca i32, align 4 + call void @funcall(i32* %b) nounwind + ret void +} + +; test8b: Passing addr-of to function call +; ssp attribute +; Requires no protector. 
+define void @test8b() nounwind uwtable ssp { +entry: +; LINUX-I386: test8b: +; LINUX-I386-NOT: calll __stack_chk_fail +; LINUX-I386: .cfi_endproc + +; LINUX-X64: test8b: +; LINUX-X64-NOT: callq __stack_chk_fail +; LINUX-X64: .cfi_endproc + +; LINUX-KERNEL-X64: test8b: +; LINUX-KERNEL-X64-NOT: callq __stack_chk_fail +; LINUX-KERNEL-X64: .cfi_endproc + +; DARWIN-X64: test8b: +; DARWIN-X64-NOT: callq ___stack_chk_fail +; DARWIN-X64: .cfi_endproc + %b = alloca i32, align 4 + call void @funcall(i32* %b) nounwind + ret void +} + +; test8c: Passing addr-of to function call +; sspstrong attribute +; Requires protector. +define void @test8c() nounwind uwtable sspstrong { +entry: +; LINUX-I386: test8c: +; LINUX-I386: mov{{l|q}} %gs: +; LINUX-I386: calll __stack_chk_fail + +; LINUX-X64: test8c: +; LINUX-X64: mov{{l|q}} %fs: +; LINUX-X64: callq __stack_chk_fail + +; LINUX-KERNEL-X64: test8c: +; LINUX-KERNEL-X64: mov{{l|q}} %gs: +; LINUX-KERNEL-X64: callq __stack_chk_fail + +; DARWIN-X64: test8c: +; DARWIN-X64: mov{{l|q}} ___stack_chk_guard +; DARWIN-X64: callq ___stack_chk_fail + %b = alloca i32, align 4 + call void @funcall(i32* %b) nounwind + ret void +} + +; test8d: Passing addr-of to function call +; sspreq attribute +; Requires protector. +define void @test8d() nounwind uwtable sspreq { +entry: +; LINUX-I386: test8d: +; LINUX-I386: mov{{l|q}} %gs: +; LINUX-I386: calll __stack_chk_fail + +; LINUX-X64: test8d: +; LINUX-X64: mov{{l|q}} %fs: +; LINUX-X64: callq __stack_chk_fail + +; LINUX-KERNEL-X64: test8d: +; LINUX-KERNEL-X64: mov{{l|q}} %gs: +; LINUX-KERNEL-X64: callq __stack_chk_fail + +; DARWIN-X64: test8d: +; DARWIN-X64: mov{{l|q}} ___stack_chk_guard +; DARWIN-X64: callq ___stack_chk_fail + %b = alloca i32, align 4 + call void @funcall(i32* %b) nounwind + ret void +} + +; test9a: Addr-of in select instruction +; no ssp attribute +; Requires no protector. +define void @test9a() nounwind uwtable { +entry: +; LINUX-I386: test9a: +; LINUX-I386-NOT: calll __stack_chk_fail +; LINUX-I386: .cfi_endproc + +; LINUX-X64: test9a: +; LINUX-X64-NOT: callq __stack_chk_fail +; LINUX-X64: .cfi_endproc + +; LINUX-KERNEL-X64: test9a: +; LINUX-KERNEL-X64-NOT: callq __stack_chk_fail +; LINUX-KERNEL-X64: .cfi_endproc + +; DARWIN-X64: test9a: +; DARWIN-X64-NOT: callq ___stack_chk_fail +; DARWIN-X64: .cfi_endproc + %x = alloca double, align 8 + %call = call double @testi_aux() nounwind + store double %call, double* %x, align 8 + %cmp2 = fcmp ogt double %call, 0.000000e+00 + %y.1 = select i1 %cmp2, double* %x, double* null + %call2 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), double* %y.1) + ret void +} + +; test9b: Addr-of in select instruction +; ssp attribute +; Requires no protector. 
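+; For illustration, a hypothetical C++ source for the test9 pattern (sketch
+; only; names are assumptions):
+;   void test9() {
+;     double x = testi_aux();
+;     double *y = (x > 0.0) ? &x : nullptr;  // address escapes through a select
+;     printf("%p\n", (void *)y);
+;   }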
+define void @test9b() nounwind uwtable ssp { +entry: +; LINUX-I386: test9b: +; LINUX-I386-NOT: calll __stack_chk_fail +; LINUX-I386: .cfi_endproc + +; LINUX-X64: test9b: +; LINUX-X64-NOT: callq __stack_chk_fail +; LINUX-X64: .cfi_endproc + +; LINUX-KERNEL-X64: test9b: +; LINUX-KERNEL-X64-NOT: callq __stack_chk_fail +; LINUX-KERNEL-X64: .cfi_endproc + +; DARWIN-X64: test9b: +; DARWIN-X64-NOT: callq ___stack_chk_fail +; DARWIN-X64: .cfi_endproc + %x = alloca double, align 8 + %call = call double @testi_aux() nounwind + store double %call, double* %x, align 8 + %cmp2 = fcmp ogt double %call, 0.000000e+00 + %y.1 = select i1 %cmp2, double* %x, double* null + %call2 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), double* %y.1) + ret void +} + +; test9c: Addr-of in select instruction +; sspstrong attribute +; Requires protector. +define void @test9c() nounwind uwtable sspstrong { +entry: +; LINUX-I386: test9c: +; LINUX-I386: mov{{l|q}} %gs: +; LINUX-I386: calll __stack_chk_fail + +; LINUX-X64: test9c: +; LINUX-X64: mov{{l|q}} %fs: +; LINUX-X64: callq __stack_chk_fail + +; LINUX-KERNEL-X64: test9c: +; LINUX-KERNEL-X64: mov{{l|q}} %gs: +; LINUX-KERNEL-X64: callq __stack_chk_fail + +; DARWIN-X64: test9c: +; DARWIN-X64: mov{{l|q}} ___stack_chk_guard +; DARWIN-X64: callq ___stack_chk_fail + %x = alloca double, align 8 + %call = call double @testi_aux() nounwind + store double %call, double* %x, align 8 + %cmp2 = fcmp ogt double %call, 0.000000e+00 + %y.1 = select i1 %cmp2, double* %x, double* null + %call2 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), double* %y.1) + ret void +} + +; test9d: Addr-of in select instruction +; sspreq attribute +; Requires protector. +define void @test9d() nounwind uwtable sspreq { +entry: +; LINUX-I386: test9d: +; LINUX-I386: mov{{l|q}} %gs: +; LINUX-I386: calll __stack_chk_fail + +; LINUX-X64: test9d: +; LINUX-X64: mov{{l|q}} %fs: +; LINUX-X64: callq __stack_chk_fail + +; LINUX-KERNEL-X64: test9d: +; LINUX-KERNEL-X64: mov{{l|q}} %gs: +; LINUX-KERNEL-X64: callq __stack_chk_fail + +; DARWIN-X64: test9d: +; DARWIN-X64: mov{{l|q}} ___stack_chk_guard +; DARWIN-X64: callq ___stack_chk_fail + %x = alloca double, align 8 + %call = call double @testi_aux() nounwind + store double %call, double* %x, align 8 + %cmp2 = fcmp ogt double %call, 0.000000e+00 + %y.1 = select i1 %cmp2, double* %x, double* null + %call2 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), double* %y.1) + ret void +} + +; test10a: Addr-of in phi instruction +; no ssp attribute +; Requires no protector. 
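+; For illustration, a hypothetical C++ source for the test10 pattern (sketch
+; only; names are assumptions):
+;   void test10() {
+;     double x = testi_aux(), *y = nullptr;
+;     if (x > 3.14)
+;       x = testi_aux();   // y stays null on this path
+;     else if (x > 1.0)
+;       y = &x;            // address reaches the merge point through a phi
+;     printf("%p\n", (void *)y);
+;   }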
+define void @test10a() nounwind uwtable { +entry: +; LINUX-I386: test10a: +; LINUX-I386-NOT: calll __stack_chk_fail +; LINUX-I386: .cfi_endproc + +; LINUX-X64: test10a: +; LINUX-X64-NOT: callq __stack_chk_fail +; LINUX-X64: .cfi_endproc + +; LINUX-KERNEL-X64: test10a: +; LINUX-KERNEL-X64-NOT: callq __stack_chk_fail +; LINUX-KERNEL-X64: .cfi_endproc + +; DARWIN-X64: test10a: +; DARWIN-X64-NOT: callq ___stack_chk_fail +; DARWIN-X64: .cfi_endproc + %x = alloca double, align 8 + %call = call double @testi_aux() nounwind + store double %call, double* %x, align 8 + %cmp = fcmp ogt double %call, 3.140000e+00 + br i1 %cmp, label %if.then, label %if.else + +if.then: ; preds = %entry + %call1 = call double @testi_aux() nounwind + store double %call1, double* %x, align 8 + br label %if.end4 + +if.else: ; preds = %entry + %cmp2 = fcmp ogt double %call, 1.000000e+00 + br i1 %cmp2, label %if.then3, label %if.end4 + +if.then3: ; preds = %if.else + br label %if.end4 + +if.end4: ; preds = %if.else, %if.then3, %if.then + %y.0 = phi double* [ null, %if.then ], [ %x, %if.then3 ], [ null, %if.else ] + %call5 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), double* %y.0) nounwind + ret void +} + +; test10b: Addr-of in phi instruction +; ssp attribute +; Requires no protector. +define void @test10b() nounwind uwtable ssp { +entry: +; LINUX-I386: test10b: +; LINUX-I386-NOT: calll __stack_chk_fail +; LINUX-I386: .cfi_endproc + +; LINUX-X64: test10b: +; LINUX-X64-NOT: callq __stack_chk_fail +; LINUX-X64: .cfi_endproc + +; LINUX-KERNEL-X64: test10b: +; LINUX-KERNEL-X64-NOT: callq __stack_chk_fail +; LINUX-KERNEL-X64: .cfi_endproc + +; DARWIN-X64: test10b: +; DARWIN-X64-NOT: callq ___stack_chk_fail +; DARWIN-X64: .cfi_endproc + %x = alloca double, align 8 + %call = call double @testi_aux() nounwind + store double %call, double* %x, align 8 + %cmp = fcmp ogt double %call, 3.140000e+00 + br i1 %cmp, label %if.then, label %if.else + +if.then: ; preds = %entry + %call1 = call double @testi_aux() nounwind + store double %call1, double* %x, align 8 + br label %if.end4 + +if.else: ; preds = %entry + %cmp2 = fcmp ogt double %call, 1.000000e+00 + br i1 %cmp2, label %if.then3, label %if.end4 + +if.then3: ; preds = %if.else + br label %if.end4 + +if.end4: ; preds = %if.else, %if.then3, %if.then + %y.0 = phi double* [ null, %if.then ], [ %x, %if.then3 ], [ null, %if.else ] + %call5 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), double* %y.0) nounwind + ret void +} + +; test10c: Addr-of in phi instruction +; sspstrong attribute +; Requires protector. 
+define void @test10c() nounwind uwtable sspstrong { +entry: +; LINUX-I386: test10c: +; LINUX-I386: mov{{l|q}} %gs: +; LINUX-I386: calll __stack_chk_fail + +; LINUX-X64: test10c: +; LINUX-X64: mov{{l|q}} %fs: +; LINUX-X64: callq __stack_chk_fail + +; LINUX-KERNEL-X64: test10c: +; LINUX-KERNEL-X64: mov{{l|q}} %gs: +; LINUX-KERNEL-X64: callq __stack_chk_fail + +; DARWIN-X64: test10c: +; DARWIN-X64: mov{{l|q}} ___stack_chk_guard +; DARWIN-X64: callq ___stack_chk_fail + %x = alloca double, align 8 + %call = call double @testi_aux() nounwind + store double %call, double* %x, align 8 + %cmp = fcmp ogt double %call, 3.140000e+00 + br i1 %cmp, label %if.then, label %if.else + +if.then: ; preds = %entry + %call1 = call double @testi_aux() nounwind + store double %call1, double* %x, align 8 + br label %if.end4 + +if.else: ; preds = %entry + %cmp2 = fcmp ogt double %call, 1.000000e+00 + br i1 %cmp2, label %if.then3, label %if.end4 + +if.then3: ; preds = %if.else + br label %if.end4 + +if.end4: ; preds = %if.else, %if.then3, %if.then + %y.0 = phi double* [ null, %if.then ], [ %x, %if.then3 ], [ null, %if.else ] + %call5 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), double* %y.0) nounwind + ret void +} + +; test10d: Addr-of in phi instruction +; sspreq attribute +; Requires protector. +define void @test10d() nounwind uwtable sspreq { +entry: +; LINUX-I386: test10d: +; LINUX-I386: mov{{l|q}} %gs: +; LINUX-I386: calll __stack_chk_fail + +; LINUX-X64: test10d: +; LINUX-X64: mov{{l|q}} %fs: +; LINUX-X64: callq __stack_chk_fail + +; LINUX-KERNEL-X64: test10d: +; LINUX-KERNEL-X64: mov{{l|q}} %gs: +; LINUX-KERNEL-X64: callq __stack_chk_fail + +; DARWIN-X64: test10d: +; DARWIN-X64: mov{{l|q}} ___stack_chk_guard +; DARWIN-X64: callq ___stack_chk_fail + %x = alloca double, align 8 + %call = call double @testi_aux() nounwind + store double %call, double* %x, align 8 + %cmp = fcmp ogt double %call, 3.140000e+00 + br i1 %cmp, label %if.then, label %if.else + +if.then: ; preds = %entry + %call1 = call double @testi_aux() nounwind + store double %call1, double* %x, align 8 + br label %if.end4 + +if.else: ; preds = %entry + %cmp2 = fcmp ogt double %call, 1.000000e+00 + br i1 %cmp2, label %if.then3, label %if.end4 + +if.then3: ; preds = %if.else + br label %if.end4 + +if.end4: ; preds = %if.else, %if.then3, %if.then + %y.0 = phi double* [ null, %if.then ], [ %x, %if.then3 ], [ null, %if.else ] + %call5 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), double* %y.0) nounwind + ret void +} + +; test11a: Addr-of struct element. (GEP followed by store). +; no ssp attribute +; Requires no protector. +define void @test11a() nounwind uwtable { +entry: +; LINUX-I386: test11a: +; LINUX-I386-NOT: calll __stack_chk_fail +; LINUX-I386: .cfi_endproc + +; LINUX-X64: test11a: +; LINUX-X64-NOT: callq __stack_chk_fail +; LINUX-X64: .cfi_endproc + +; LINUX-KERNEL-X64: test11a: +; LINUX-KERNEL-X64-NOT: callq __stack_chk_fail +; LINUX-KERNEL-X64: .cfi_endproc + +; DARWIN-X64: test11a: +; DARWIN-X64-NOT: callq ___stack_chk_fail +; DARWIN-X64: .cfi_endproc + %c = alloca %struct.pair, align 4 + %b = alloca i32*, align 8 + %y = getelementptr inbounds %struct.pair* %c, i32 0, i32 1 + store i32* %y, i32** %b, align 8 + %0 = load i32** %b, align 8 + %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i32* %0) + ret void +} + +; test11b: Addr-of struct element. (GEP followed by store). 
+; ssp attribute +; Requires no protector. +define void @test11b() nounwind uwtable ssp { +entry: +; LINUX-I386: test11b: +; LINUX-I386-NOT: calll __stack_chk_fail +; LINUX-I386: .cfi_endproc + +; LINUX-X64: test11b: +; LINUX-X64-NOT: callq __stack_chk_fail +; LINUX-X64: .cfi_endproc + +; LINUX-KERNEL-X64: test11b: +; LINUX-KERNEL-X64-NOT: callq __stack_chk_fail +; LINUX-KERNEL-X64: .cfi_endproc + +; DARWIN-X64: test11b: +; DARWIN-X64-NOT: callq ___stack_chk_fail +; DARWIN-X64: .cfi_endproc + %c = alloca %struct.pair, align 4 + %b = alloca i32*, align 8 + %y = getelementptr inbounds %struct.pair* %c, i32 0, i32 1 + store i32* %y, i32** %b, align 8 + %0 = load i32** %b, align 8 + %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i32* %0) + ret void +} + +; test11c: Addr-of struct element. (GEP followed by store). +; sspstrong attribute +; Requires protector. +define void @test11c() nounwind uwtable sspstrong { +entry: +; LINUX-I386: test11c: +; LINUX-I386: mov{{l|q}} %gs: +; LINUX-I386: calll __stack_chk_fail + +; LINUX-X64: test11c: +; LINUX-X64: mov{{l|q}} %fs: +; LINUX-X64: callq __stack_chk_fail + +; LINUX-KERNEL-X64: test11c: +; LINUX-KERNEL-X64: mov{{l|q}} %gs: +; LINUX-KERNEL-X64: callq __stack_chk_fail + +; DARWIN-X64: test11c: +; DARWIN-X64: mov{{l|q}} ___stack_chk_guard +; DARWIN-X64: callq ___stack_chk_fail + %c = alloca %struct.pair, align 4 + %b = alloca i32*, align 8 + %y = getelementptr inbounds %struct.pair* %c, i32 0, i32 1 + store i32* %y, i32** %b, align 8 + %0 = load i32** %b, align 8 + %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i32* %0) + ret void +} + +; test11d: Addr-of struct element. (GEP followed by store). +; sspreq attribute +; Requires protector. +define void @test11d() nounwind uwtable sspreq { +entry: +; LINUX-I386: test11d: +; LINUX-I386: mov{{l|q}} %gs: +; LINUX-I386: calll __stack_chk_fail + +; LINUX-X64: test11d: +; LINUX-X64: mov{{l|q}} %fs: +; LINUX-X64: callq __stack_chk_fail + +; LINUX-KERNEL-X64: test11d: +; LINUX-KERNEL-X64: mov{{l|q}} %gs: +; LINUX-KERNEL-X64: callq __stack_chk_fail + +; DARWIN-X64: test11d: +; DARWIN-X64: mov{{l|q}} ___stack_chk_guard +; DARWIN-X64: callq ___stack_chk_fail + %c = alloca %struct.pair, align 4 + %b = alloca i32*, align 8 + %y = getelementptr inbounds %struct.pair* %c, i32 0, i32 1 + store i32* %y, i32** %b, align 8 + %0 = load i32** %b, align 8 + %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i32* %0) + ret void +} + +; test12a: Addr-of struct element, GEP followed by ptrtoint. +; no ssp attribute +; Requires no protector. +define void @test12a() nounwind uwtable { +entry: +; LINUX-I386: test12a: +; LINUX-I386-NOT: calll __stack_chk_fail +; LINUX-I386: .cfi_endproc + +; LINUX-X64: test12a: +; LINUX-X64-NOT: callq __stack_chk_fail +; LINUX-X64: .cfi_endproc + +; LINUX-KERNEL-X64: test12a: +; LINUX-KERNEL-X64-NOT: callq __stack_chk_fail +; LINUX-KERNEL-X64: .cfi_endproc + +; DARWIN-X64: test12a: +; DARWIN-X64-NOT: callq ___stack_chk_fail +; DARWIN-X64: .cfi_endproc + %c = alloca %struct.pair, align 4 + %b = alloca i32*, align 8 + %y = getelementptr inbounds %struct.pair* %c, i32 0, i32 1 + %0 = ptrtoint i32* %y to i64 + %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i64 %0) + ret void +} + +; test12b: Addr-of struct element, GEP followed by ptrtoint. +; ssp attribute +; Requires no protector. 
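+; For illustration, a hypothetical C++ source for the test12 pattern (sketch
+; only; names are assumptions):
+;   void test12() {
+;     pair c;
+;     printf("%zu\n", (size_t)&c.y);  // member address converted to an integer
+;   }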
+define void @test12b() nounwind uwtable ssp { +entry: +; LINUX-I386: test12b: +; LINUX-I386-NOT: calll __stack_chk_fail +; LINUX-I386: .cfi_endproc + +; LINUX-X64: test12b: +; LINUX-X64-NOT: callq __stack_chk_fail +; LINUX-X64: .cfi_endproc + +; LINUX-KERNEL-X64: test12b: +; LINUX-KERNEL-X64-NOT: callq __stack_chk_fail +; LINUX-KERNEL-X64: .cfi_endproc + +; DARWIN-X64: test12b: +; DARWIN-X64-NOT: callq ___stack_chk_fail +; DARWIN-X64: .cfi_endproc + %c = alloca %struct.pair, align 4 + %b = alloca i32*, align 8 + %y = getelementptr inbounds %struct.pair* %c, i32 0, i32 1 + %0 = ptrtoint i32* %y to i64 + %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i64 %0) + ret void +} + +; test12c: Addr-of struct element, GEP followed by ptrtoint. +; sspstrong attribute +; Requires protector. +define void @test12c() nounwind uwtable sspstrong { +entry: +; LINUX-I386: test12c: +; LINUX-I386: mov{{l|q}} %gs: +; LINUX-I386: calll __stack_chk_fail + +; LINUX-X64: test12c: +; LINUX-X64: mov{{l|q}} %fs: +; LINUX-X64: callq __stack_chk_fail + +; LINUX-KERNEL-X64: test12c: +; LINUX-KERNEL-X64: mov{{l|q}} %gs: +; LINUX-KERNEL-X64: callq __stack_chk_fail + +; DARWIN-X64: test12c: +; DARWIN-X64: mov{{l|q}} ___stack_chk_guard +; DARWIN-X64: callq ___stack_chk_fail + %c = alloca %struct.pair, align 4 + %b = alloca i32*, align 8 + %y = getelementptr inbounds %struct.pair* %c, i32 0, i32 1 + %0 = ptrtoint i32* %y to i64 + %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i64 %0) + ret void +} + +; test12d: Addr-of struct element, GEP followed by ptrtoint. +; sspreq attribute +; Requires protector. +define void @test12d() nounwind uwtable sspreq { +entry: +; LINUX-I386: test12d: +; LINUX-I386: mov{{l|q}} %gs: +; LINUX-I386: calll __stack_chk_fail + +; LINUX-X64: test12d: +; LINUX-X64: mov{{l|q}} %fs: +; LINUX-X64: callq __stack_chk_fail + +; LINUX-KERNEL-X64: test12d: +; LINUX-KERNEL-X64: mov{{l|q}} %gs: +; LINUX-KERNEL-X64: callq __stack_chk_fail + +; DARWIN-X64: test12d: +; DARWIN-X64: mov{{l|q}} ___stack_chk_guard +; DARWIN-X64: callq ___stack_chk_fail + %c = alloca %struct.pair, align 4 + %b = alloca i32*, align 8 + %y = getelementptr inbounds %struct.pair* %c, i32 0, i32 1 + %0 = ptrtoint i32* %y to i64 + %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i64 %0) + ret void +} + +; test13a: Addr-of struct element, GEP followed by callinst. +; no ssp attribute +; Requires no protector. +define void @test13a() nounwind uwtable { +entry: +; LINUX-I386: test13a: +; LINUX-I386-NOT: calll __stack_chk_fail +; LINUX-I386: .cfi_endproc + +; LINUX-X64: test13a: +; LINUX-X64-NOT: callq __stack_chk_fail +; LINUX-X64: .cfi_endproc + +; LINUX-KERNEL-X64: test13a: +; LINUX-KERNEL-X64-NOT: callq __stack_chk_fail +; LINUX-KERNEL-X64: .cfi_endproc + +; DARWIN-X64: test13a: +; DARWIN-X64-NOT: callq ___stack_chk_fail +; DARWIN-X64: .cfi_endproc + %c = alloca %struct.pair, align 4 + %y = getelementptr inbounds %struct.pair* %c, i64 0, i32 1 + %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32* %y) nounwind + ret void +} + +; test13b: Addr-of struct element, GEP followed by callinst. +; ssp attribute +; Requires no protector. 
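+; For illustration, a hypothetical C++ source for the test13 pattern (sketch
+; only; names are assumptions):
+;   void test13() {
+;     pair c;
+;     printf("%p\n", (void *)&c.y);  // member address passed straight to a call
+;   }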
+define void @test13b() nounwind uwtable ssp { +entry: +; LINUX-I386: test13b: +; LINUX-I386-NOT: calll __stack_chk_fail +; LINUX-I386: .cfi_endproc + +; LINUX-X64: test13b: +; LINUX-X64-NOT: callq __stack_chk_fail +; LINUX-X64: .cfi_endproc + +; LINUX-KERNEL-X64: test13b: +; LINUX-KERNEL-X64-NOT: callq __stack_chk_fail +; LINUX-KERNEL-X64: .cfi_endproc + +; DARWIN-X64: test13b: +; DARWIN-X64-NOT: callq ___stack_chk_fail +; DARWIN-X64: .cfi_endproc + %c = alloca %struct.pair, align 4 + %y = getelementptr inbounds %struct.pair* %c, i64 0, i32 1 + %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32* %y) nounwind + ret void +} + +; test13c: Addr-of struct element, GEP followed by callinst. +; sspstrong attribute +; Requires protector. +define void @test13c() nounwind uwtable sspstrong { +entry: +; LINUX-I386: test13c: +; LINUX-I386: mov{{l|q}} %gs: +; LINUX-I386: calll __stack_chk_fail + +; LINUX-X64: test13c: +; LINUX-X64: mov{{l|q}} %fs: +; LINUX-X64: callq __stack_chk_fail + +; LINUX-KERNEL-X64: test13c: +; LINUX-KERNEL-X64: mov{{l|q}} %gs: +; LINUX-KERNEL-X64: callq __stack_chk_fail + +; DARWIN-X64: test13c: +; DARWIN-X64: mov{{l|q}} ___stack_chk_guard +; DARWIN-X64: callq ___stack_chk_fail + %c = alloca %struct.pair, align 4 + %y = getelementptr inbounds %struct.pair* %c, i64 0, i32 1 + %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32* %y) nounwind + ret void +} + +; test13d: Addr-of struct element, GEP followed by callinst. +; sspreq attribute +; Requires protector. +define void @test13d() nounwind uwtable sspreq { entry: - %a_addr = alloca i8* ; <i8**> [#uses=2] - %buf = alloca [8 x i8] ; <[8 x i8]*> [#uses=2] - %"alloca point" = bitcast i32 0 to i32 ; <i32> [#uses=0] - store i8* %a, i8** %a_addr - %buf1 = bitcast [8 x i8]* %buf to i8* ; <i8*> [#uses=1] - %0 = load i8** %a_addr, align 4 ; <i8*> [#uses=1] - %1 = call i8* @strcpy(i8* %buf1, i8* %0) nounwind ; <i8*> [#uses=0] - %buf2 = bitcast [8 x i8]* %buf to i8* ; <i8*> [#uses=1] - %2 = call i32 (i8*, ...)* @printf(i8* getelementptr ([11 x i8]* @"\01LC", i32 0, i32 0), i8* %buf2) nounwind ; <i32> [#uses=0] - br label %return +; LINUX-I386: test13d: +; LINUX-I386: mov{{l|q}} %gs: +; LINUX-I386: calll __stack_chk_fail + +; LINUX-X64: test13d: +; LINUX-X64: mov{{l|q}} %fs: +; LINUX-X64: callq __stack_chk_fail -return: ; preds = %entry - ret void +; LINUX-KERNEL-X64: test13d: +; LINUX-KERNEL-X64: mov{{l|q}} %gs: +; LINUX-KERNEL-X64: callq __stack_chk_fail + +; DARWIN-X64: test13d: +; DARWIN-X64: mov{{l|q}} ___stack_chk_guard +; DARWIN-X64: callq ___stack_chk_fail + %c = alloca %struct.pair, align 4 + %y = getelementptr inbounds %struct.pair* %c, i64 0, i32 1 + %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32* %y) nounwind + ret void } -declare i8* @strcpy(i8*, i8*) nounwind +; test14a: Addr-of a local, optimized into a GEP (e.g., &a - 12) +; no ssp attribute +; Requires no protector. 
+define void @test14a() nounwind uwtable { +entry: +; LINUX-I386: test14a: +; LINUX-I386-NOT: calll __stack_chk_fail +; LINUX-I386: .cfi_endproc + +; LINUX-X64: test14a: +; LINUX-X64-NOT: callq __stack_chk_fail +; LINUX-X64: .cfi_endproc + +; LINUX-KERNEL-X64: test14a: +; LINUX-KERNEL-X64-NOT: callq __stack_chk_fail +; LINUX-KERNEL-X64: .cfi_endproc + +; DARWIN-X64: test14a: +; DARWIN-X64-NOT: callq ___stack_chk_fail +; DARWIN-X64: .cfi_endproc + %a = alloca i32, align 4 + %add.ptr5 = getelementptr inbounds i32* %a, i64 -12 + %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32* %add.ptr5) nounwind + ret void +} + +; test14b: Addr-of a local, optimized into a GEP (e.g., &a - 12) +; ssp attribute +; Requires no protector. +define void @test14b() nounwind uwtable ssp { +entry: +; LINUX-I386: test14b: +; LINUX-I386-NOT: calll __stack_chk_fail +; LINUX-I386: .cfi_endproc + +; LINUX-X64: test14b: +; LINUX-X64-NOT: callq __stack_chk_fail +; LINUX-X64: .cfi_endproc + +; LINUX-KERNEL-X64: test14b: +; LINUX-KERNEL-X64-NOT: callq __stack_chk_fail +; LINUX-KERNEL-X64: .cfi_endproc + +; DARWIN-X64: test14b: +; DARWIN-X64-NOT: callq ___stack_chk_fail +; DARWIN-X64: .cfi_endproc + %a = alloca i32, align 4 + %add.ptr5 = getelementptr inbounds i32* %a, i64 -12 + %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32* %add.ptr5) nounwind + ret void +} + +; test14c: Addr-of a local, optimized into a GEP (e.g., &a - 12) +; sspstrong attribute +; Requires protector. +define void @test14c() nounwind uwtable sspstrong { +entry: +; LINUX-I386: test14c: +; LINUX-I386: mov{{l|q}} %gs: +; LINUX-I386: calll __stack_chk_fail + +; LINUX-X64: test14c: +; LINUX-X64: mov{{l|q}} %fs: +; LINUX-X64: callq __stack_chk_fail + +; LINUX-KERNEL-X64: test14c: +; LINUX-KERNEL-X64: mov{{l|q}} %gs: +; LINUX-KERNEL-X64: callq __stack_chk_fail + +; DARWIN-X64: test14c: +; DARWIN-X64: mov{{l|q}} ___stack_chk_guard +; DARWIN-X64: callq ___stack_chk_fail + %a = alloca i32, align 4 + %add.ptr5 = getelementptr inbounds i32* %a, i64 -12 + %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32* %add.ptr5) nounwind + ret void +} + +; test14d: Addr-of a local, optimized into a GEP (e.g., &a - 12) +; sspreq attribute +; Requires protector. +define void @test14d() nounwind uwtable sspreq { +entry: +; LINUX-I386: test14d: +; LINUX-I386: mov{{l|q}} %gs: +; LINUX-I386: calll __stack_chk_fail + +; LINUX-X64: test14d: +; LINUX-X64: mov{{l|q}} %fs: +; LINUX-X64: callq __stack_chk_fail + +; LINUX-KERNEL-X64: test14d: +; LINUX-KERNEL-X64: mov{{l|q}} %gs: +; LINUX-KERNEL-X64: callq __stack_chk_fail + +; DARWIN-X64: test14d: +; DARWIN-X64: mov{{l|q}} ___stack_chk_guard +; DARWIN-X64: callq ___stack_chk_fail + %a = alloca i32, align 4 + %add.ptr5 = getelementptr inbounds i32* %a, i64 -12 + %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32* %add.ptr5) nounwind + ret void +} + +; test15a: Addr-of a local cast to a ptr of a different type +; (e.g., int a; ... ; float *b = &a;) +; no ssp attribute +; Requires no protector. 
+define void @test15a() nounwind uwtable { +entry: +; LINUX-I386: test15a: +; LINUX-I386-NOT: calll __stack_chk_fail +; LINUX-I386: .cfi_endproc + +; LINUX-X64: test15a: +; LINUX-X64-NOT: callq __stack_chk_fail +; LINUX-X64: .cfi_endproc + +; LINUX-KERNEL-X64: test15a: +; LINUX-KERNEL-X64-NOT: callq __stack_chk_fail +; LINUX-KERNEL-X64: .cfi_endproc + +; DARWIN-X64: test15a: +; DARWIN-X64-NOT: callq ___stack_chk_fail +; DARWIN-X64: .cfi_endproc + %a = alloca i32, align 4 + %b = alloca float*, align 8 + store i32 0, i32* %a, align 4 + %0 = bitcast i32* %a to float* + store float* %0, float** %b, align 8 + %1 = load float** %b, align 8 + %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), float* %1) + ret void +} + +; test15b: Addr-of a local cast to a ptr of a different type +; (e.g., int a; ... ; float *b = &a;) +; ssp attribute +; Requires no protector. +define void @test15b() nounwind uwtable ssp { +entry: +; LINUX-I386: test15b: +; LINUX-I386-NOT: calll __stack_chk_fail +; LINUX-I386: .cfi_endproc + +; LINUX-X64: test15b: +; LINUX-X64-NOT: callq __stack_chk_fail +; LINUX-X64: .cfi_endproc + +; LINUX-KERNEL-X64: test15b: +; LINUX-KERNEL-X64-NOT: callq __stack_chk_fail +; LINUX-KERNEL-X64: .cfi_endproc + +; DARWIN-X64: test15b: +; DARWIN-X64-NOT: callq ___stack_chk_fail +; DARWIN-X64: .cfi_endproc + %a = alloca i32, align 4 + %b = alloca float*, align 8 + store i32 0, i32* %a, align 4 + %0 = bitcast i32* %a to float* + store float* %0, float** %b, align 8 + %1 = load float** %b, align 8 + %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), float* %1) + ret void +} + +; test15c: Addr-of a local cast to a ptr of a different type +; (e.g., int a; ... ; float *b = &a;) +; sspstrong attribute +; Requires protector. +define void @test15c() nounwind uwtable sspstrong { +entry: +; LINUX-I386: test15c: +; LINUX-I386: mov{{l|q}} %gs: +; LINUX-I386: calll __stack_chk_fail + +; LINUX-X64: test15c: +; LINUX-X64: mov{{l|q}} %fs: +; LINUX-X64: callq __stack_chk_fail + +; LINUX-KERNEL-X64: test15c: +; LINUX-KERNEL-X64: mov{{l|q}} %gs: +; LINUX-KERNEL-X64: callq __stack_chk_fail + +; DARWIN-X64: test15c: +; DARWIN-X64: mov{{l|q}} ___stack_chk_guard +; DARWIN-X64: callq ___stack_chk_fail + %a = alloca i32, align 4 + %b = alloca float*, align 8 + store i32 0, i32* %a, align 4 + %0 = bitcast i32* %a to float* + store float* %0, float** %b, align 8 + %1 = load float** %b, align 8 + %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), float* %1) + ret void +} + +; test15d: Addr-of a local cast to a ptr of a different type +; (e.g., int a; ... ; float *b = &a;) +; sspreq attribute +; Requires protector. 
+define void @test15d() nounwind uwtable sspreq { +entry: +; LINUX-I386: test15d: +; LINUX-I386: mov{{l|q}} %gs: +; LINUX-I386: calll __stack_chk_fail + +; LINUX-X64: test15d: +; LINUX-X64: mov{{l|q}} %fs: +; LINUX-X64: callq __stack_chk_fail + +; LINUX-KERNEL-X64: test15d: +; LINUX-KERNEL-X64: mov{{l|q}} %gs: +; LINUX-KERNEL-X64: callq __stack_chk_fail + +; DARWIN-X64: test15d: +; DARWIN-X64: mov{{l|q}} ___stack_chk_guard +; DARWIN-X64: callq ___stack_chk_fail + %a = alloca i32, align 4 + %b = alloca float*, align 8 + store i32 0, i32* %a, align 4 + %0 = bitcast i32* %a to float* + store float* %0, float** %b, align 8 + %1 = load float** %b, align 8 + %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), float* %1) + ret void +} + +; test16a: Addr-of a local cast to a ptr of a different type (optimized) +; (e.g., int a; ... ; float *b = &a;) +; no ssp attribute +; Requires no protector. +define void @test16a() nounwind uwtable { +entry: +; LINUX-I386: test16a: +; LINUX-I386-NOT: calll __stack_chk_fail +; LINUX-I386: .cfi_endproc + +; LINUX-X64: test16a: +; LINUX-X64-NOT: callq __stack_chk_fail +; LINUX-X64: .cfi_endproc + +; LINUX-KERNEL-X64: test16a: +; LINUX-KERNEL-X64-NOT: callq __stack_chk_fail +; LINUX-KERNEL-X64: .cfi_endproc + +; DARWIN-X64: test16a: +; DARWIN-X64-NOT: callq ___stack_chk_fail +; DARWIN-X64: .cfi_endproc + %a = alloca i32, align 4 + store i32 0, i32* %a, align 4 + %0 = bitcast i32* %a to float* + call void @funfloat(float* %0) nounwind + ret void +} + +; test16b: Addr-of a local cast to a ptr of a different type (optimized) +; (e.g., int a; ... ; float *b = &a;) +; ssp attribute +; Requires no protector. +define void @test16b() nounwind uwtable ssp { +entry: +; LINUX-I386: test16b: +; LINUX-I386-NOT: calll __stack_chk_fail +; LINUX-I386: .cfi_endproc + +; LINUX-X64: test16b: +; LINUX-X64-NOT: callq __stack_chk_fail +; LINUX-X64: .cfi_endproc + +; LINUX-KERNEL-X64: test16b: +; LINUX-KERNEL-X64-NOT: callq __stack_chk_fail +; LINUX-KERNEL-X64: .cfi_endproc + +; DARWIN-X64: test16b: +; DARWIN-X64-NOT: callq ___stack_chk_fail +; DARWIN-X64: .cfi_endproc + %a = alloca i32, align 4 + store i32 0, i32* %a, align 4 + %0 = bitcast i32* %a to float* + call void @funfloat(float* %0) nounwind + ret void +} + +; test16c: Addr-of a local cast to a ptr of a different type (optimized) +; (e.g., int a; ... ; float *b = &a;) +; sspstrong attribute +; Requires protector. +define void @test16c() nounwind uwtable sspstrong { +entry: +; LINUX-I386: test16c: +; LINUX-I386: mov{{l|q}} %gs: +; LINUX-I386: calll __stack_chk_fail + +; LINUX-X64: test16c: +; LINUX-X64: mov{{l|q}} %fs: +; LINUX-X64: callq __stack_chk_fail + +; LINUX-KERNEL-X64: test16c: +; LINUX-KERNEL-X64: mov{{l|q}} %gs: +; LINUX-KERNEL-X64: callq __stack_chk_fail + +; DARWIN-X64: test16c: +; DARWIN-X64: mov{{l|q}} ___stack_chk_guard +; DARWIN-X64: callq ___stack_chk_fail + %a = alloca i32, align 4 + store i32 0, i32* %a, align 4 + %0 = bitcast i32* %a to float* + call void @funfloat(float* %0) nounwind + ret void +} + +; test16d: Addr-of a local cast to a ptr of a different type (optimized) +; (e.g., int a; ... ; float *b = &a;) +; sspreq attribute +; Requires protector. 
+define void @test16d() nounwind uwtable sspreq { +entry: +; LINUX-I386: test16d: +; LINUX-I386: mov{{l|q}} %gs: +; LINUX-I386: calll __stack_chk_fail + +; LINUX-X64: test16d: +; LINUX-X64: mov{{l|q}} %fs: +; LINUX-X64: callq __stack_chk_fail + +; LINUX-KERNEL-X64: test16d: +; LINUX-KERNEL-X64: mov{{l|q}} %gs: +; LINUX-KERNEL-X64: callq __stack_chk_fail + +; DARWIN-X64: test16d: +; DARWIN-X64: mov{{l|q}} ___stack_chk_guard +; DARWIN-X64: callq ___stack_chk_fail + %a = alloca i32, align 4 + store i32 0, i32* %a, align 4 + %0 = bitcast i32* %a to float* + call void @funfloat(float* %0) nounwind + ret void +} + +; test17a: Addr-of a vector nested in a struct +; no ssp attribute +; Requires no protector. +define void @test17a() nounwind uwtable { +entry: +; LINUX-I386: test17a: +; LINUX-I386-NOT: calll __stack_chk_fail +; LINUX-I386: .cfi_endproc + +; LINUX-X64: test17a: +; LINUX-X64-NOT: callq __stack_chk_fail +; LINUX-X64: .cfi_endproc + +; LINUX-KERNEL-X64: test17a: +; LINUX-KERNEL-X64-NOT: callq __stack_chk_fail +; LINUX-KERNEL-X64: .cfi_endproc + +; DARWIN-X64: test17a: +; DARWIN-X64-NOT: callq ___stack_chk_fail +; DARWIN-X64: .cfi_endproc + %c = alloca %struct.vec, align 16 + %y = getelementptr inbounds %struct.vec* %c, i64 0, i32 0 + %add.ptr = getelementptr inbounds <4 x i32>* %y, i64 -12 + %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), <4 x i32>* %add.ptr) nounwind + ret void +} + +; test17b: Addr-of a vector nested in a struct +; ssp attribute +; Requires no protector. +define void @test17b() nounwind uwtable ssp { +entry: +; LINUX-I386: test17b: +; LINUX-I386-NOT: calll __stack_chk_fail +; LINUX-I386: .cfi_endproc + +; LINUX-X64: test17b: +; LINUX-X64-NOT: callq __stack_chk_fail +; LINUX-X64: .cfi_endproc + +; LINUX-KERNEL-X64: test17b: +; LINUX-KERNEL-X64-NOT: callq __stack_chk_fail +; LINUX-KERNEL-X64: .cfi_endproc + +; DARWIN-X64: test17b: +; DARWIN-X64-NOT: callq ___stack_chk_fail +; DARWIN-X64: .cfi_endproc + %c = alloca %struct.vec, align 16 + %y = getelementptr inbounds %struct.vec* %c, i64 0, i32 0 + %add.ptr = getelementptr inbounds <4 x i32>* %y, i64 -12 + %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), <4 x i32>* %add.ptr) nounwind + ret void +} + +; test17c: Addr-of a vector nested in a struct +; sspstrong attribute +; Requires protector. +define void @test17c() nounwind uwtable sspstrong { +entry: +; LINUX-I386: test17c: +; LINUX-I386: mov{{l|q}} %gs: +; LINUX-I386: calll __stack_chk_fail + +; LINUX-X64: test17c: +; LINUX-X64: mov{{l|q}} %fs: +; LINUX-X64: callq __stack_chk_fail + +; LINUX-KERNEL-X64: test17c: +; LINUX-KERNEL-X64: mov{{l|q}} %gs: +; LINUX-KERNEL-X64: callq __stack_chk_fail + +; DARWIN-X64: test17c: +; DARWIN-X64: mov{{l|q}} ___stack_chk_guard +; DARWIN-X64: callq ___stack_chk_fail + %c = alloca %struct.vec, align 16 + %y = getelementptr inbounds %struct.vec* %c, i64 0, i32 0 + %add.ptr = getelementptr inbounds <4 x i32>* %y, i64 -12 + %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), <4 x i32>* %add.ptr) nounwind + ret void +} + +; test17d: Addr-of a vector nested in a struct +; sspreq attribute +; Requires protector. 
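+; For illustration, a hypothetical C++ source for the test17 pattern (sketch
+; only; the vector extension and names are assumptions):
+;   typedef int v4si __attribute__((vector_size(16)));
+;   struct vec { v4si x; };
+;   void test17() {
+;     vec c;
+;     printf("%p\n", (void *)(&c.x - 12));  // offset address of a nested vector
+;   }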
+define void @test17d() nounwind uwtable sspreq { +entry: +; LINUX-I386: test17d: +; LINUX-I386: mov{{l|q}} %gs: +; LINUX-I386: calll __stack_chk_fail + +; LINUX-X64: test17d: +; LINUX-X64: mov{{l|q}} %fs: +; LINUX-X64: callq __stack_chk_fail + +; LINUX-KERNEL-X64: test17d: +; LINUX-KERNEL-X64: mov{{l|q}} %gs: +; LINUX-KERNEL-X64: callq __stack_chk_fail + +; DARWIN-X64: test17d: +; DARWIN-X64: mov{{l|q}} ___stack_chk_guard +; DARWIN-X64: callq ___stack_chk_fail + %c = alloca %struct.vec, align 16 + %y = getelementptr inbounds %struct.vec* %c, i64 0, i32 0 + %add.ptr = getelementptr inbounds <4 x i32>* %y, i64 -12 + %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), <4 x i32>* %add.ptr) nounwind + ret void +} + +; test18a: Addr-of a variable passed into an invoke instruction. +; no ssp attribute +; Requires no protector. +define i32 @test18a() uwtable { +entry: +; LINUX-I386: test18a: +; LINUX-I386-NOT: calll __stack_chk_fail +; LINUX-I386: .cfi_endproc + +; LINUX-X64: test18a: +; LINUX-X64-NOT: callq __stack_chk_fail +; LINUX-X64: .cfi_endproc + +; LINUX-KERNEL-X64: test18a: +; LINUX-KERNEL-X64-NOT: callq __stack_chk_fail +; LINUX-KERNEL-X64: .cfi_endproc + +; DARWIN-X64: test18a: +; DARWIN-X64-NOT: callq ___stack_chk_fail +; DARWIN-X64: .cfi_endproc + %a = alloca i32, align 4 + %exn.slot = alloca i8* + %ehselector.slot = alloca i32 + store i32 0, i32* %a, align 4 + invoke void @_Z3exceptPi(i32* %a) + to label %invoke.cont unwind label %lpad + +invoke.cont: + ret i32 0 + +lpad: + %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) + catch i8* null + ret i32 0 +} + +; test18b: Addr-of a variable passed into an invoke instruction. +; ssp attribute +; Requires no protector. +define i32 @test18b() uwtable ssp { +entry: +; LINUX-I386: test18b: +; LINUX-I386-NOT: calll __stack_chk_fail +; LINUX-I386: .cfi_endproc + +; LINUX-X64: test18b: +; LINUX-X64-NOT: callq __stack_chk_fail +; LINUX-X64: .cfi_endproc + +; LINUX-KERNEL-X64: test18b: +; LINUX-KERNEL-X64-NOT: callq __stack_chk_fail +; LINUX-KERNEL-X64: .cfi_endproc + +; DARWIN-X64: test18b: +; DARWIN-X64-NOT: callq ___stack_chk_fail +; DARWIN-X64: .cfi_endproc + %a = alloca i32, align 4 + %exn.slot = alloca i8* + %ehselector.slot = alloca i32 + store i32 0, i32* %a, align 4 + invoke void @_Z3exceptPi(i32* %a) + to label %invoke.cont unwind label %lpad + +invoke.cont: + ret i32 0 + +lpad: + %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) + catch i8* null + ret i32 0 +} + +; test18c: Addr-of a variable passed into an invoke instruction. +; sspstrong attribute +; Requires protector. 
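+; For illustration, a hypothetical C++ source for the test18 pattern (sketch
+; only; names are assumptions):
+;   int test18() {
+;     int a = 0;
+;     try { except(&a); } catch (...) {}  // address escapes through an invoke
+;     return 0;
+;   }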
+define i32 @test18c() uwtable sspstrong { +entry: +; LINUX-I386: test18c: +; LINUX-I386: mov{{l|q}} %gs: +; LINUX-I386: calll __stack_chk_fail + +; LINUX-X64: test18c: +; LINUX-X64: mov{{l|q}} %fs: +; LINUX-X64: callq __stack_chk_fail + +; LINUX-KERNEL-X64: test18c: +; LINUX-KERNEL-X64: mov{{l|q}} %gs: +; LINUX-KERNEL-X64: callq __stack_chk_fail + +; DARWIN-X64: test18c: +; DARWIN-X64: mov{{l|q}} ___stack_chk_guard +; DARWIN-X64: callq ___stack_chk_fail + %a = alloca i32, align 4 + %exn.slot = alloca i8* + %ehselector.slot = alloca i32 + store i32 0, i32* %a, align 4 + invoke void @_Z3exceptPi(i32* %a) + to label %invoke.cont unwind label %lpad + +invoke.cont: + ret i32 0 + +lpad: + %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) + catch i8* null + ret i32 0 +} + +; test18d: Addr-of a variable passed into an invoke instruction. +; sspreq attribute +; Requires protector. +define i32 @test18d() uwtable sspreq { +entry: +; LINUX-I386: test18d: +; LINUX-I386: mov{{l|q}} %gs: +; LINUX-I386: calll __stack_chk_fail + +; LINUX-X64: test18d: +; LINUX-X64: mov{{l|q}} %fs: +; LINUX-X64: callq __stack_chk_fail + +; LINUX-KERNEL-X64: test18d: +; LINUX-KERNEL-X64: mov{{l|q}} %gs: +; LINUX-KERNEL-X64: callq __stack_chk_fail + +; DARWIN-X64: test18d: +; DARWIN-X64: mov{{l|q}} ___stack_chk_guard +; DARWIN-X64: callq ___stack_chk_fail + %a = alloca i32, align 4 + %exn.slot = alloca i8* + %ehselector.slot = alloca i32 + store i32 0, i32* %a, align 4 + invoke void @_Z3exceptPi(i32* %a) + to label %invoke.cont unwind label %lpad + +invoke.cont: + ret i32 0 + +lpad: + %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) + catch i8* null + ret i32 0 +} + +; test19a: Addr-of a struct element passed into an invoke instruction. +; (GEP followed by an invoke) +; no ssp attribute +; Requires no protector. +define i32 @test19a() uwtable { +entry: +; LINUX-I386: test19a: +; LINUX-I386-NOT: calll __stack_chk_fail +; LINUX-I386: .cfi_endproc + +; LINUX-X64: test19a: +; LINUX-X64-NOT: callq __stack_chk_fail +; LINUX-X64: .cfi_endproc + +; LINUX-KERNEL-X64: test19a: +; LINUX-KERNEL-X64-NOT: callq __stack_chk_fail +; LINUX-KERNEL-X64: .cfi_endproc + +; DARWIN-X64: test19a: +; DARWIN-X64-NOT: callq ___stack_chk_fail +; DARWIN-X64: .cfi_endproc + %c = alloca %struct.pair, align 4 + %exn.slot = alloca i8* + %ehselector.slot = alloca i32 + %a = getelementptr inbounds %struct.pair* %c, i32 0, i32 0 + store i32 0, i32* %a, align 4 + %a1 = getelementptr inbounds %struct.pair* %c, i32 0, i32 0 + invoke void @_Z3exceptPi(i32* %a1) + to label %invoke.cont unwind label %lpad + +invoke.cont: + ret i32 0 + +lpad: + %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) + catch i8* null + ret i32 0 +} + +; test19b: Addr-of a struct element passed into an invoke instruction. +; (GEP followed by an invoke) +; ssp attribute +; Requires no protector. 
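+; For illustration, a hypothetical C++ source for the test19 pattern (sketch
+; only; names are assumptions):
+;   int test19() {
+;     pair c;
+;     c.a = 0;
+;     try { except(&c.a); } catch (...) {}  // member address escapes via invoke
+;     return 0;
+;   }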
+define i32 @test19b() uwtable ssp { +entry: +; LINUX-I386: test19b: +; LINUX-I386-NOT: calll __stack_chk_fail +; LINUX-I386: .cfi_endproc + +; LINUX-X64: test19b: +; LINUX-X64-NOT: callq __stack_chk_fail +; LINUX-X64: .cfi_endproc + +; LINUX-KERNEL-X64: test19b: +; LINUX-KERNEL-X64-NOT: callq __stack_chk_fail +; LINUX-KERNEL-X64: .cfi_endproc + +; DARWIN-X64: test19b: +; DARWIN-X64-NOT: callq ___stack_chk_fail +; DARWIN-X64: .cfi_endproc + %c = alloca %struct.pair, align 4 + %exn.slot = alloca i8* + %ehselector.slot = alloca i32 + %a = getelementptr inbounds %struct.pair* %c, i32 0, i32 0 + store i32 0, i32* %a, align 4 + %a1 = getelementptr inbounds %struct.pair* %c, i32 0, i32 0 + invoke void @_Z3exceptPi(i32* %a1) + to label %invoke.cont unwind label %lpad + +invoke.cont: + ret i32 0 + +lpad: + %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) + catch i8* null + ret i32 0 +} + +; test19c: Addr-of a struct element passed into an invoke instruction. +; (GEP followed by an invoke) +; sspstrong attribute +; Requires protector. +define i32 @test19c() uwtable sspstrong { +entry: +; LINUX-I386: test19c: +; LINUX-I386: mov{{l|q}} %gs: +; LINUX-I386: calll __stack_chk_fail + +; LINUX-X64: test19c: +; LINUX-X64: mov{{l|q}} %fs: +; LINUX-X64: callq __stack_chk_fail + +; LINUX-KERNEL-X64: test19c: +; LINUX-KERNEL-X64: mov{{l|q}} %gs: +; LINUX-KERNEL-X64: callq __stack_chk_fail + +; DARWIN-X64: test19c: +; DARWIN-X64: mov{{l|q}} ___stack_chk_guard +; DARWIN-X64: callq ___stack_chk_fail + %c = alloca %struct.pair, align 4 + %exn.slot = alloca i8* + %ehselector.slot = alloca i32 + %a = getelementptr inbounds %struct.pair* %c, i32 0, i32 0 + store i32 0, i32* %a, align 4 + %a1 = getelementptr inbounds %struct.pair* %c, i32 0, i32 0 + invoke void @_Z3exceptPi(i32* %a1) + to label %invoke.cont unwind label %lpad + +invoke.cont: + ret i32 0 + +lpad: + %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) + catch i8* null + ret i32 0 +} + +; test19d: Addr-of a struct element passed into an invoke instruction. +; (GEP followed by an invoke) +; sspreq attribute +; Requires protector. +define i32 @test19d() uwtable sspreq { +entry: +; LINUX-I386: test19d: +; LINUX-I386: mov{{l|q}} %gs: +; LINUX-I386: calll __stack_chk_fail + +; LINUX-X64: test19d: +; LINUX-X64: mov{{l|q}} %fs: +; LINUX-X64: callq __stack_chk_fail + +; LINUX-KERNEL-X64: test19d: +; LINUX-KERNEL-X64: mov{{l|q}} %gs: +; LINUX-KERNEL-X64: callq __stack_chk_fail + +; DARWIN-X64: test19d: +; DARWIN-X64: mov{{l|q}} ___stack_chk_guard +; DARWIN-X64: callq ___stack_chk_fail + %c = alloca %struct.pair, align 4 + %exn.slot = alloca i8* + %ehselector.slot = alloca i32 + %a = getelementptr inbounds %struct.pair* %c, i32 0, i32 0 + store i32 0, i32* %a, align 4 + %a1 = getelementptr inbounds %struct.pair* %c, i32 0, i32 0 + invoke void @_Z3exceptPi(i32* %a1) + to label %invoke.cont unwind label %lpad + +invoke.cont: + ret i32 0 + +lpad: + %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) + catch i8* null + ret i32 0 +} + +; test20a: Addr-of a pointer +; no ssp attribute +; Requires no protector. 
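+; For illustration, a hypothetical C++ source for the test20 pattern (sketch
+; only; names are assumptions):
+;   void test20() {
+;     int *a = getp();
+;     int **b = &a;   // only the address of a pointer-typed local escapes
+;     funcall2(b);
+;   }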
+define void @test20a() nounwind uwtable { +entry: +; LINUX-I386: test20a: +; LINUX-I386-NOT: calll __stack_chk_fail +; LINUX-I386: .cfi_endproc + +; LINUX-X64: test20a: +; LINUX-X64-NOT: callq __stack_chk_fail +; LINUX-X64: .cfi_endproc + +; LINUX-KERNEL-X64: test20a: +; LINUX-KERNEL-X64-NOT: callq __stack_chk_fail +; LINUX-KERNEL-X64: .cfi_endproc + +; DARWIN-X64: test20a: +; DARWIN-X64-NOT: callq ___stack_chk_fail +; DARWIN-X64: .cfi_endproc + %a = alloca i32*, align 8 + %b = alloca i32**, align 8 + %call = call i32* @getp() + store i32* %call, i32** %a, align 8 + store i32** %a, i32*** %b, align 8 + %0 = load i32*** %b, align 8 + call void @funcall2(i32** %0) + ret void +} + +; test20b: Addr-of a pointer +; ssp attribute +; Requires no protector. +define void @test20b() nounwind uwtable ssp { +entry: +; LINUX-I386: test20b: +; LINUX-I386-NOT: calll __stack_chk_fail +; LINUX-I386: .cfi_endproc + +; LINUX-X64: test20b: +; LINUX-X64-NOT: callq __stack_chk_fail +; LINUX-X64: .cfi_endproc + +; LINUX-KERNEL-X64: test20b: +; LINUX-KERNEL-X64-NOT: callq __stack_chk_fail +; LINUX-KERNEL-X64: .cfi_endproc + +; DARWIN-X64: test20b: +; DARWIN-X64-NOT: callq ___stack_chk_fail +; DARWIN-X64: .cfi_endproc + %a = alloca i32*, align 8 + %b = alloca i32**, align 8 + %call = call i32* @getp() + store i32* %call, i32** %a, align 8 + store i32** %a, i32*** %b, align 8 + %0 = load i32*** %b, align 8 + call void @funcall2(i32** %0) + ret void +} + +; test20c: Addr-of a pointer +; sspstrong attribute +; Requires protector. +define void @test20c() nounwind uwtable sspstrong { +entry: +; LINUX-I386: test20c: +; LINUX-I386: mov{{l|q}} %gs: +; LINUX-I386: calll __stack_chk_fail + +; LINUX-X64: test20c: +; LINUX-X64: mov{{l|q}} %fs: +; LINUX-X64: callq __stack_chk_fail + +; LINUX-KERNEL-X64: test20c: +; LINUX-KERNEL-X64: mov{{l|q}} %gs: +; LINUX-KERNEL-X64: callq __stack_chk_fail + +; DARWIN-X64: test20c: +; DARWIN-X64: mov{{l|q}} ___stack_chk_guard +; DARWIN-X64: callq ___stack_chk_fail + %a = alloca i32*, align 8 + %b = alloca i32**, align 8 + %call = call i32* @getp() + store i32* %call, i32** %a, align 8 + store i32** %a, i32*** %b, align 8 + %0 = load i32*** %b, align 8 + call void @funcall2(i32** %0) + ret void +} + +; test20d: Addr-of a pointer +; sspreq attribute +; Requires protector. +define void @test20d() nounwind uwtable sspreq { +entry: +; LINUX-I386: test20d: +; LINUX-I386: mov{{l|q}} %gs: +; LINUX-I386: calll __stack_chk_fail + +; LINUX-X64: test20d: +; LINUX-X64: mov{{l|q}} %fs: +; LINUX-X64: callq __stack_chk_fail + +; LINUX-KERNEL-X64: test20d: +; LINUX-KERNEL-X64: mov{{l|q}} %gs: +; LINUX-KERNEL-X64: callq __stack_chk_fail + +; DARWIN-X64: test20d: +; DARWIN-X64: mov{{l|q}} ___stack_chk_guard +; DARWIN-X64: callq ___stack_chk_fail + %a = alloca i32*, align 8 + %b = alloca i32**, align 8 + %call = call i32* @getp() + store i32* %call, i32** %a, align 8 + store i32** %a, i32*** %b, align 8 + %0 = load i32*** %b, align 8 + call void @funcall2(i32** %0) + ret void +} + +; test21a: Addr-of a casted pointer +; no ssp attribute +; Requires no protector. 
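+; For illustration, a hypothetical C++ source for the test21 pattern (sketch
+; only; names are assumptions):
+;   void test21() {
+;     int *a = getp();
+;     float **b = (float **)&a;  // the pointer's address escapes through a cast
+;     funfloat2(b);
+;   }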
+define void @test21a() nounwind uwtable { +entry: +; LINUX-I386: test21a: +; LINUX-I386-NOT: calll __stack_chk_fail +; LINUX-I386: .cfi_endproc + +; LINUX-X64: test21a: +; LINUX-X64-NOT: callq __stack_chk_fail +; LINUX-X64: .cfi_endproc + +; LINUX-KERNEL-X64: test21a: +; LINUX-KERNEL-X64-NOT: callq __stack_chk_fail +; LINUX-KERNEL-X64: .cfi_endproc + +; DARWIN-X64: test21a: +; DARWIN-X64-NOT: callq ___stack_chk_fail +; DARWIN-X64: .cfi_endproc + %a = alloca i32*, align 8 + %b = alloca float**, align 8 + %call = call i32* @getp() + store i32* %call, i32** %a, align 8 + %0 = bitcast i32** %a to float** + store float** %0, float*** %b, align 8 + %1 = load float*** %b, align 8 + call void @funfloat2(float** %1) + ret void +} + +; test21b: Addr-of a casted pointer +; ssp attribute +; Requires no protector. +define void @test21b() nounwind uwtable ssp { +entry: +; LINUX-I386: test21b: +; LINUX-I386-NOT: calll __stack_chk_fail +; LINUX-I386: .cfi_endproc + +; LINUX-X64: test21b: +; LINUX-X64-NOT: callq __stack_chk_fail +; LINUX-X64: .cfi_endproc + +; LINUX-KERNEL-X64: test21b: +; LINUX-KERNEL-X64-NOT: callq __stack_chk_fail +; LINUX-KERNEL-X64: .cfi_endproc + +; DARWIN-X64: test21b: +; DARWIN-X64-NOT: callq ___stack_chk_fail +; DARWIN-X64: .cfi_endproc + %a = alloca i32*, align 8 + %b = alloca float**, align 8 + %call = call i32* @getp() + store i32* %call, i32** %a, align 8 + %0 = bitcast i32** %a to float** + store float** %0, float*** %b, align 8 + %1 = load float*** %b, align 8 + call void @funfloat2(float** %1) + ret void +} + +; test21c: Addr-of a casted pointer +; sspstrong attribute +; Requires protector. +define void @test21c() nounwind uwtable sspstrong { +entry: +; LINUX-I386: test21c: +; LINUX-I386: mov{{l|q}} %gs: +; LINUX-I386: calll __stack_chk_fail + +; LINUX-X64: test21c: +; LINUX-X64: mov{{l|q}} %fs: +; LINUX-X64: callq __stack_chk_fail + +; LINUX-KERNEL-X64: test21c: +; LINUX-KERNEL-X64: mov{{l|q}} %gs: +; LINUX-KERNEL-X64: callq __stack_chk_fail + +; DARWIN-X64: test21c: +; DARWIN-X64: mov{{l|q}} ___stack_chk_guard +; DARWIN-X64: callq ___stack_chk_fail + %a = alloca i32*, align 8 + %b = alloca float**, align 8 + %call = call i32* @getp() + store i32* %call, i32** %a, align 8 + %0 = bitcast i32** %a to float** + store float** %0, float*** %b, align 8 + %1 = load float*** %b, align 8 + call void @funfloat2(float** %1) + ret void +} + +; test21d: Addr-of a casted pointer +; sspreq attribute +; Requires protector. +define void @test21d() nounwind uwtable sspreq { +entry: +; LINUX-I386: test21d: +; LINUX-I386: mov{{l|q}} %gs: +; LINUX-I386: calll __stack_chk_fail + +; LINUX-X64: test21d: +; LINUX-X64: mov{{l|q}} %fs: +; LINUX-X64: callq __stack_chk_fail + +; LINUX-KERNEL-X64: test21d: +; LINUX-KERNEL-X64: mov{{l|q}} %gs: +; LINUX-KERNEL-X64: callq __stack_chk_fail + +; DARWIN-X64: test21d: +; DARWIN-X64: mov{{l|q}} ___stack_chk_guard +; DARWIN-X64: callq ___stack_chk_fail + %a = alloca i32*, align 8 + %b = alloca float**, align 8 + %call = call i32* @getp() + store i32* %call, i32** %a, align 8 + %0 = bitcast i32** %a to float** + store float** %0, float*** %b, align 8 + %1 = load float*** %b, align 8 + call void @funfloat2(float** %1) + ret void +} + +; test22a: [2 x i8] in a class +; no ssp attribute +; Requires no protector. 
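+; For illustration, a hypothetical C++ source for the test22 pattern (sketch
+; only; names are assumptions):
+;   class A { public: char a[2]; };
+;   char test22() {
+;     A x;
+;     return x.a[0];  // [2 x i8] reached through a class member
+;   }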
+define signext i8 @test22a() nounwind uwtable { +entry: +; LINUX-I386: test22a: +; LINUX-I386-NOT: calll __stack_chk_fail +; LINUX-I386: .cfi_endproc + +; LINUX-X64: test22a: +; LINUX-X64-NOT: callq __stack_chk_fail +; LINUX-X64: .cfi_endproc + +; LINUX-KERNEL-X64: test22a: +; LINUX-KERNEL-X64-NOT: callq __stack_chk_fail +; LINUX-KERNEL-X64: .cfi_endproc + +; DARWIN-X64: test22a: +; DARWIN-X64-NOT: callq ___stack_chk_fail +; DARWIN-X64: .cfi_endproc + %a = alloca %class.A, align 1 + %array = getelementptr inbounds %class.A* %a, i32 0, i32 0 + %arrayidx = getelementptr inbounds [2 x i8]* %array, i32 0, i64 0 + %0 = load i8* %arrayidx, align 1 + ret i8 %0 +} + +; test22b: [2 x i8] in a class +; ssp attribute +; Requires no protector. +define signext i8 @test22b() nounwind uwtable ssp { +entry: +; LINUX-I386: test22b: +; LINUX-I386-NOT: calll __stack_chk_fail +; LINUX-I386: .cfi_endproc + +; LINUX-X64: test22b: +; LINUX-X64-NOT: callq __stack_chk_fail +; LINUX-X64: .cfi_endproc + +; LINUX-KERNEL-X64: test22b: +; LINUX-KERNEL-X64-NOT: callq __stack_chk_fail +; LINUX-KERNEL-X64: .cfi_endproc + +; DARWIN-X64: test22b: +; DARWIN-X64-NOT: callq ___stack_chk_fail +; DARWIN-X64: .cfi_endproc + %a = alloca %class.A, align 1 + %array = getelementptr inbounds %class.A* %a, i32 0, i32 0 + %arrayidx = getelementptr inbounds [2 x i8]* %array, i32 0, i64 0 + %0 = load i8* %arrayidx, align 1 + ret i8 %0 +} + +; test22c: [2 x i8] in a class +; sspstrong attribute +; Requires protector. +define signext i8 @test22c() nounwind uwtable sspstrong { +entry: +; LINUX-I386: test22c: +; LINUX-I386: mov{{l|q}} %gs: +; LINUX-I386: calll __stack_chk_fail + +; LINUX-X64: test22c: +; LINUX-X64: mov{{l|q}} %fs: +; LINUX-X64: callq __stack_chk_fail + +; LINUX-KERNEL-X64: test22c: +; LINUX-KERNEL-X64: mov{{l|q}} %gs: +; LINUX-KERNEL-X64: callq __stack_chk_fail + +; DARWIN-X64: test22c: +; DARWIN-X64: mov{{l|q}} ___stack_chk_guard +; DARWIN-X64: callq ___stack_chk_fail + %a = alloca %class.A, align 1 + %array = getelementptr inbounds %class.A* %a, i32 0, i32 0 + %arrayidx = getelementptr inbounds [2 x i8]* %array, i32 0, i64 0 + %0 = load i8* %arrayidx, align 1 + ret i8 %0 +} + +; test22d: [2 x i8] in a class +; sspreq attribute +; Requires protector. +define signext i8 @test22d() nounwind uwtable sspreq { +entry: +; LINUX-I386: test22d: +; LINUX-I386: mov{{l|q}} %gs: +; LINUX-I386: calll __stack_chk_fail + +; LINUX-X64: test22d: +; LINUX-X64: mov{{l|q}} %fs: +; LINUX-X64: callq __stack_chk_fail + +; LINUX-KERNEL-X64: test22d: +; LINUX-KERNEL-X64: mov{{l|q}} %gs: +; LINUX-KERNEL-X64: callq __stack_chk_fail + +; DARWIN-X64: test22d: +; DARWIN-X64: mov{{l|q}} ___stack_chk_guard +; DARWIN-X64: callq ___stack_chk_fail + %a = alloca %class.A, align 1 + %array = getelementptr inbounds %class.A* %a, i32 0, i32 0 + %arrayidx = getelementptr inbounds [2 x i8]* %array, i32 0, i64 0 + %0 = load i8* %arrayidx, align 1 + ret i8 %0 +} + +; test23a: [2 x i8] nested in several layers of structs and unions +; no ssp attribute +; Requires no protector. 
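+; For illustration, a hypothetical C++ source for the test23 pattern (sketch
+; only; the exact nesting and names are assumptions):
+;   struct deep { union { struct { struct { union { char a[2]; } e; } d; } c; } b; };
+;   char test23() {
+;     deep x;
+;     return x.b.c.d.e.a[0];  // char array buried under structs and unions
+;   }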
+define signext i8 @test23a() nounwind uwtable { +entry: +; LINUX-I386: test23a: +; LINUX-I386-NOT: calll __stack_chk_fail +; LINUX-I386: .cfi_endproc + +; LINUX-X64: test23a: +; LINUX-X64-NOT: callq __stack_chk_fail +; LINUX-X64: .cfi_endproc + +; LINUX-KERNEL-X64: test23a: +; LINUX-KERNEL-X64-NOT: callq __stack_chk_fail +; LINUX-KERNEL-X64: .cfi_endproc + +; DARWIN-X64: test23a: +; DARWIN-X64-NOT: callq ___stack_chk_fail +; DARWIN-X64: .cfi_endproc + %x = alloca %struct.deep, align 1 + %b = getelementptr inbounds %struct.deep* %x, i32 0, i32 0 + %c = bitcast %union.anon* %b to %struct.anon* + %d = getelementptr inbounds %struct.anon* %c, i32 0, i32 0 + %e = getelementptr inbounds %struct.anon.0* %d, i32 0, i32 0 + %array = bitcast %union.anon.1* %e to [2 x i8]* + %arrayidx = getelementptr inbounds [2 x i8]* %array, i32 0, i64 0 + %0 = load i8* %arrayidx, align 1 + ret i8 %0 +} + +; test23b: [2 x i8] nested in several layers of structs and unions +; ssp attribute +; Requires no protector. +define signext i8 @test23b() nounwind uwtable ssp { +entry: +; LINUX-I386: test23b: +; LINUX-I386-NOT: calll __stack_chk_fail +; LINUX-I386: .cfi_endproc + +; LINUX-X64: test23b: +; LINUX-X64-NOT: callq __stack_chk_fail +; LINUX-X64: .cfi_endproc + +; LINUX-KERNEL-X64: test23b: +; LINUX-KERNEL-X64-NOT: callq __stack_chk_fail +; LINUX-KERNEL-X64: .cfi_endproc + +; DARWIN-X64: test23b: +; DARWIN-X64-NOT: callq ___stack_chk_fail +; DARWIN-X64: .cfi_endproc + %x = alloca %struct.deep, align 1 + %b = getelementptr inbounds %struct.deep* %x, i32 0, i32 0 + %c = bitcast %union.anon* %b to %struct.anon* + %d = getelementptr inbounds %struct.anon* %c, i32 0, i32 0 + %e = getelementptr inbounds %struct.anon.0* %d, i32 0, i32 0 + %array = bitcast %union.anon.1* %e to [2 x i8]* + %arrayidx = getelementptr inbounds [2 x i8]* %array, i32 0, i64 0 + %0 = load i8* %arrayidx, align 1 + ret i8 %0 +} + +; test23c: [2 x i8] nested in several layers of structs and unions +; sspstrong attribute +; Requires protector. +define signext i8 @test23c() nounwind uwtable sspstrong { +entry: +; LINUX-I386: test23c: +; LINUX-I386: mov{{l|q}} %gs: +; LINUX-I386: calll __stack_chk_fail + +; LINUX-X64: test23c: +; LINUX-X64: mov{{l|q}} %fs: +; LINUX-X64: callq __stack_chk_fail + +; LINUX-KERNEL-X64: test23c: +; LINUX-KERNEL-X64: mov{{l|q}} %gs: +; LINUX-KERNEL-X64: callq __stack_chk_fail + +; DARWIN-X64: test23c: +; DARWIN-X64: mov{{l|q}} ___stack_chk_guard +; DARWIN-X64: callq ___stack_chk_fail + %x = alloca %struct.deep, align 1 + %b = getelementptr inbounds %struct.deep* %x, i32 0, i32 0 + %c = bitcast %union.anon* %b to %struct.anon* + %d = getelementptr inbounds %struct.anon* %c, i32 0, i32 0 + %e = getelementptr inbounds %struct.anon.0* %d, i32 0, i32 0 + %array = bitcast %union.anon.1* %e to [2 x i8]* + %arrayidx = getelementptr inbounds [2 x i8]* %array, i32 0, i64 0 + %0 = load i8* %arrayidx, align 1 + ret i8 %0 +} + +; test23d: [2 x i8] nested in several layers of structs and unions +; sspreq attribute +; Requires protector. 
+define signext i8 @test23d() nounwind uwtable sspreq { +entry: +; LINUX-I386: test23d: +; LINUX-I386: mov{{l|q}} %gs: +; LINUX-I386: calll __stack_chk_fail + +; LINUX-X64: test23d: +; LINUX-X64: mov{{l|q}} %fs: +; LINUX-X64: callq __stack_chk_fail + +; LINUX-KERNEL-X64: test23d: +; LINUX-KERNEL-X64: mov{{l|q}} %gs: +; LINUX-KERNEL-X64: callq __stack_chk_fail + +; DARWIN-X64: test23d: +; DARWIN-X64: mov{{l|q}} ___stack_chk_guard +; DARWIN-X64: callq ___stack_chk_fail + %x = alloca %struct.deep, align 1 + %b = getelementptr inbounds %struct.deep* %x, i32 0, i32 0 + %c = bitcast %union.anon* %b to %struct.anon* + %d = getelementptr inbounds %struct.anon* %c, i32 0, i32 0 + %e = getelementptr inbounds %struct.anon.0* %d, i32 0, i32 0 + %array = bitcast %union.anon.1* %e to [2 x i8]* + %arrayidx = getelementptr inbounds [2 x i8]* %array, i32 0, i64 0 + %0 = load i8* %arrayidx, align 1 + ret i8 %0 +} + +; test24a: Variable sized alloca +; no ssp attribute +; Requires no protector. +define void @test24a(i32 %n) nounwind uwtable { +entry: +; LINUX-I386: test24a: +; LINUX-I386-NOT: calll __stack_chk_fail +; LINUX-I386: .cfi_endproc + +; LINUX-X64: test24a: +; LINUX-X64-NOT: callq __stack_chk_fail +; LINUX-X64: .cfi_endproc + +; LINUX-KERNEL-X64: test24a: +; LINUX-KERNEL-X64-NOT: callq __stack_chk_fail +; LINUX-KERNEL-X64: .cfi_endproc + +; DARWIN-X64: test24a: +; DARWIN-X64-NOT: callq ___stack_chk_fail +; DARWIN-X64: .cfi_endproc + %n.addr = alloca i32, align 4 + %a = alloca i32*, align 8 + store i32 %n, i32* %n.addr, align 4 + %0 = load i32* %n.addr, align 4 + %conv = sext i32 %0 to i64 + %1 = alloca i8, i64 %conv + %2 = bitcast i8* %1 to i32* + store i32* %2, i32** %a, align 8 + ret void +} + +; test24b: Variable sized alloca +; ssp attribute +; Requires protector. +define void @test24b(i32 %n) nounwind uwtable ssp { +entry: +; LINUX-I386: test24b: +; LINUX-I386: mov{{l|q}} %gs: +; LINUX-I386: calll __stack_chk_fail + +; LINUX-X64: test24b: +; LINUX-X64: mov{{l|q}} %fs: +; LINUX-X64: callq __stack_chk_fail + +; LINUX-KERNEL-X64: test24b: +; LINUX-KERNEL-X64: mov{{l|q}} %gs: +; LINUX-KERNEL-X64: callq __stack_chk_fail + +; DARWIN-X64: test24b: +; DARWIN-X64: mov{{l|q}} ___stack_chk_guard +; DARWIN-X64: callq ___stack_chk_fail + %n.addr = alloca i32, align 4 + %a = alloca i32*, align 8 + store i32 %n, i32* %n.addr, align 4 + %0 = load i32* %n.addr, align 4 + %conv = sext i32 %0 to i64 + %1 = alloca i8, i64 %conv + %2 = bitcast i8* %1 to i32* + store i32* %2, i32** %a, align 8 + ret void +} + +; test24c: Variable sized alloca +; sspstrong attribute +; Requires protector. +define void @test24c(i32 %n) nounwind uwtable sspstrong { +entry: +; LINUX-I386: test24c: +; LINUX-I386: mov{{l|q}} %gs: +; LINUX-I386: calll __stack_chk_fail + +; LINUX-X64: test24c: +; LINUX-X64: mov{{l|q}} %fs: +; LINUX-X64: callq __stack_chk_fail + +; LINUX-KERNEL-X64: test24c: +; LINUX-KERNEL-X64: mov{{l|q}} %gs: +; LINUX-KERNEL-X64: callq __stack_chk_fail + +; DARWIN-X64: test24c: +; DARWIN-X64: mov{{l|q}} ___stack_chk_guard +; DARWIN-X64: callq ___stack_chk_fail + %n.addr = alloca i32, align 4 + %a = alloca i32*, align 8 + store i32 %n, i32* %n.addr, align 4 + %0 = load i32* %n.addr, align 4 + %conv = sext i32 %0 to i64 + %1 = alloca i8, i64 %conv + %2 = bitcast i8* %1 to i32* + store i32* %2, i32** %a, align 8 + ret void +} + +; test24d: Variable sized alloca +; sspreq attribute +; Requires protector. 
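+; For illustration, a hypothetical C++ source for the test24 pattern (sketch
+; only; VLAs are a GNU extension in C++):
+;   void test24(int n) {
+;     char buf[n];          // lowers to a variable-sized alloca
+;     int *a = (int *)buf;
+;     (void)a;
+;   }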
+define void @test24d(i32 %n) nounwind uwtable sspreq { +entry: +; LINUX-I386: test24d: +; LINUX-I386: mov{{l|q}} %gs: +; LINUX-I386: calll __stack_chk_fail + +; LINUX-X64: test24d: +; LINUX-X64: mov{{l|q}} %fs: +; LINUX-X64: callq __stack_chk_fail + +; LINUX-KERNEL-X64: test24d: +; LINUX-KERNEL-X64: mov{{l|q}} %gs: +; LINUX-KERNEL-X64: callq __stack_chk_fail + +; DARWIN-X64: test24d: +; DARWIN-X64: mov{{l|q}} ___stack_chk_guard +; DARWIN-X64: callq ___stack_chk_fail + %n.addr = alloca i32, align 4 + %a = alloca i32*, align 8 + store i32 %n, i32* %n.addr, align 4 + %0 = load i32* %n.addr, align 4 + %conv = sext i32 %0 to i64 + %1 = alloca i8, i64 %conv + %2 = bitcast i8* %1 to i32* + store i32* %2, i32** %a, align 8 + ret void +} + +; test25a: array of [4 x i32] +; no ssp attribute +; Requires no protector. +define i32 @test25a() nounwind uwtable { +entry: +; LINUX-I386: test25a: +; LINUX-I386-NOT: calll __stack_chk_fail +; LINUX-I386: .cfi_endproc + +; LINUX-X64: test25a: +; LINUX-X64-NOT: callq __stack_chk_fail +; LINUX-X64: .cfi_endproc + +; LINUX-KERNEL-X64: test25a: +; LINUX-KERNEL-X64-NOT: callq __stack_chk_fail +; LINUX-KERNEL-X64: .cfi_endproc + +; DARWIN-X64: test25a: +; DARWIN-X64-NOT: callq ___stack_chk_fail +; DARWIN-X64: .cfi_endproc + %a = alloca [4 x i32], align 16 + %arrayidx = getelementptr inbounds [4 x i32]* %a, i32 0, i64 0 + %0 = load i32* %arrayidx, align 4 + ret i32 %0 +} + +; test25b: array of [4 x i32] +; ssp attribute +; Requires no protector, except for Darwin which _does_ require a protector. +define i32 @test25b() nounwind uwtable ssp { +entry: +; LINUX-I386: test25b: +; LINUX-I386-NOT: calll __stack_chk_fail +; LINUX-I386: .cfi_endproc + +; LINUX-X64: test25b: +; LINUX-X64-NOT: callq __stack_chk_fail +; LINUX-X64: .cfi_endproc + +; LINUX-KERNEL-X64: test25b: +; LINUX-KERNEL-X64-NOT: callq __stack_chk_fail +; LINUX-KERNEL-X64: .cfi_endproc + +; DARWIN-X64: test25b: +; DARWIN-X64: mov{{l|q}} ___stack_chk_guard +; DARWIN-X64: callq ___stack_chk_fail + %a = alloca [4 x i32], align 16 + %arrayidx = getelementptr inbounds [4 x i32]* %a, i32 0, i64 0 + %0 = load i32* %arrayidx, align 4 + ret i32 %0 +} + +; test25c: array of [4 x i32] +; sspstrong attribute +; Requires protector. +define i32 @test25c() nounwind uwtable sspstrong { +entry: +; LINUX-I386: test25c: +; LINUX-I386: mov{{l|q}} %gs: +; LINUX-I386: calll __stack_chk_fail + +; LINUX-X64: test25c: +; LINUX-X64: mov{{l|q}} %fs: +; LINUX-X64: callq __stack_chk_fail + +; LINUX-KERNEL-X64: test25c: +; LINUX-KERNEL-X64: mov{{l|q}} %gs: +; LINUX-KERNEL-X64: callq __stack_chk_fail + +; DARWIN-X64: test25c: +; DARWIN-X64: mov{{l|q}} ___stack_chk_guard +; DARWIN-X64: callq ___stack_chk_fail + %a = alloca [4 x i32], align 16 + %arrayidx = getelementptr inbounds [4 x i32]* %a, i32 0, i64 0 + %0 = load i32* %arrayidx, align 4 + ret i32 %0 +} + +; test25d: array of [4 x i32] +; sspreq attribute +; Requires protector. 
+define i32 @test25d() nounwind uwtable sspreq { +entry: +; LINUX-I386: test25d: +; LINUX-I386: mov{{l|q}} %gs: +; LINUX-I386: calll __stack_chk_fail + +; LINUX-X64: test25d: +; LINUX-X64: mov{{l|q}} %fs: +; LINUX-X64: callq __stack_chk_fail + +; LINUX-KERNEL-X64: test25d: +; LINUX-KERNEL-X64: mov{{l|q}} %gs: +; LINUX-KERNEL-X64: callq __stack_chk_fail + +; DARWIN-X64: test25d: +; DARWIN-X64: mov{{l|q}} ___stack_chk_guard +; DARWIN-X64: callq ___stack_chk_fail + %a = alloca [4 x i32], align 16 + %arrayidx = getelementptr inbounds [4 x i32]* %a, i32 0, i64 0 + %0 = load i32* %arrayidx, align 4 + ret i32 %0 +} + +; test26: Nested structure, no arrays, no address-of expressions. +; Verify that the resulting gep-of-gep does not incorrectly trigger +; a stack protector. +; sspstrong attribute +; Requires no protector. +define void @test26() nounwind uwtable sspstrong { +entry: +; LINUX-I386: test26: +; LINUX-I386-NOT: calll __stack_chk_fail +; LINUX-I386: .cfi_endproc + +; LINUX-X64: test26: +; LINUX-X64-NOT: callq __stack_chk_fail +; LINUX-X64: .cfi_endproc + +; LINUX-KERNEL-X64: test26: +; LINUX-KERNEL-X64-NOT: callq __stack_chk_fail +; LINUX-KERNEL-X64: .cfi_endproc + +; DARWIN-X64: test26: +; DARWIN-X64-NOT: callq ___stack_chk_fail +; DARWIN-X64: .cfi_endproc + %c = alloca %struct.nest, align 4 + %b = getelementptr inbounds %struct.nest* %c, i32 0, i32 1 + %_a = getelementptr inbounds %struct.pair* %b, i32 0, i32 0 + %0 = load i32* %_a, align 4 + %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i32 %0) + ret void +} + +; test27: Address-of a structure taken in a function with a loop where +; the alloca is an incoming value to a PHI node and a use of that PHI +; node is also an incoming value. +; Verify that the address-of analysis does not get stuck in infinite +; recursion when chasing the alloca through the PHI nodes. +; Requires protector.
+define i32 @test27(i32 %arg) nounwind uwtable sspstrong { +bb: +; LINUX-I386: test27: +; LINUX-I386: mov{{l|q}} %gs: +; LINUX-I386: calll __stack_chk_fail + +; LINUX-X64: test27: +; LINUX-X64: mov{{l|q}} %fs: +; LINUX-X64: callq __stack_chk_fail + +; LINUX-KERNEL-X64: test27: +; LINUX-KERNEL-X64: mov{{l|q}} %gs: +; LINUX-KERNEL-X64: callq __stack_chk_fail + +; DARWIN-X64: test27: +; DARWIN-X64: mov{{l|q}} ___stack_chk_guard +; DARWIN-X64: callq ___stack_chk_fail + %tmp = alloca %struct.small*, align 8 + %tmp1 = call i32 (...)* @dummy(%struct.small** %tmp) nounwind + %tmp2 = load %struct.small** %tmp, align 8 + %tmp3 = ptrtoint %struct.small* %tmp2 to i64 + %tmp4 = trunc i64 %tmp3 to i32 + %tmp5 = icmp sgt i32 %tmp4, 0 + br i1 %tmp5, label %bb6, label %bb21 + +bb6: ; preds = %bb17, %bb + %tmp7 = phi %struct.small* [ %tmp19, %bb17 ], [ %tmp2, %bb ] + %tmp8 = phi i64 [ %tmp20, %bb17 ], [ 1, %bb ] + %tmp9 = phi i32 [ %tmp14, %bb17 ], [ %tmp1, %bb ] + %tmp10 = getelementptr inbounds %struct.small* %tmp7, i64 0, i32 0 + %tmp11 = load i8* %tmp10, align 1 + %tmp12 = icmp eq i8 %tmp11, 1 + %tmp13 = add nsw i32 %tmp9, 8 + %tmp14 = select i1 %tmp12, i32 %tmp13, i32 %tmp9 + %tmp15 = trunc i64 %tmp8 to i32 + %tmp16 = icmp eq i32 %tmp15, %tmp4 + br i1 %tmp16, label %bb21, label %bb17 + +bb17: ; preds = %bb6 + %tmp18 = getelementptr inbounds %struct.small** %tmp, i64 %tmp8 + %tmp19 = load %struct.small** %tmp18, align 8 + %tmp20 = add i64 %tmp8, 1 + br label %bb6 + +bb21: ; preds = %bb6, %bb + %tmp22 = phi i32 [ %tmp1, %bb ], [ %tmp14, %bb6 ] + %tmp23 = call i32 (...)* @dummy(i32 %tmp22) nounwind + ret i32 undef +} -declare i32 @printf(i8*, ...) nounwind +declare double @testi_aux() +declare i8* @strcpy(i8*, i8*) +declare i32 @printf(i8*, ...) +declare void @funcall(i32*) +declare void @funcall2(i32**) +declare void @funfloat(float*) +declare void @funfloat2(float**) +declare void @_Z3exceptPi(i32*) +declare i32 @__gxx_personality_v0(...) +declare i32* @getp() +declare i32 @dummy(...) diff --git a/test/CodeGen/X86/stack-update-frame-opcode.ll b/test/CodeGen/X86/stack-update-frame-opcode.ll new file mode 100644 index 0000000..9a5a242 --- /dev/null +++ b/test/CodeGen/X86/stack-update-frame-opcode.ll @@ -0,0 +1,31 @@ +; RUN: llc -mtriple=x86_64-pc-linux -mcpu=corei7 < %s | FileCheck -check-prefix=CORE_LP64 %s +; RUN: llc -mtriple=x86_64-pc-linux -mcpu=atom < %s | FileCheck -check-prefix=ATOM_LP64 %s +; RUN: llc -mtriple=x86_64-pc-linux-gnux32 -mcpu=corei7 < %s | FileCheck -check-prefix=CORE_ILP32 %s +; RUN: llc -mtriple=x86_64-pc-linux-gnux32 -mcpu=atom < %s | FileCheck -check-prefix=ATOM_ILP32 %s + +define i32 @bar(i32 %a) nounwind { +entry: + %arr = alloca [400 x i32], align 16 + +; There is a 2x2 variation matrix here: +; Atoms use LEA to update the SP. Opcode bitness depends on data model. +; Cores use sub/add to update the SP. Opcode bitness depends on data model. 
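+; (A sketch of where 1608 comes from, assuming the obvious frame layout: the +; [400 x i32] array needs 400 * 4 = 1600 bytes, and 8 more bytes of padding +; keep sp 16-byte aligned once the 8-byte return address has been pushed.)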
+ +; CORE_LP64: subq $1608 +; CORE_ILP32: subl $1608 +; ATOM_LP64: leaq -1608 +; ATOM_ILP32: leal -1608 + + %arraydecay = getelementptr inbounds [400 x i32]* %arr, i64 0, i64 0 + %call = call i32 @foo(i32 %a, i32* %arraydecay) nounwind + ret i32 %call + +; CORE_LP64: addq $1608 +; CORE_ILP32: addl $1608 +; ATOM_LP64: leaq 1608 +; ATOM_ILP32: leal 1608 + +} + +declare i32 @foo(i32, i32*) + diff --git a/test/CodeGen/X86/subtarget-feature-change.ll b/test/CodeGen/X86/subtarget-feature-change.ll new file mode 100644 index 0000000..cd67729 --- /dev/null +++ b/test/CodeGen/X86/subtarget-feature-change.ll @@ -0,0 +1,66 @@ +; RUN: llc < %s -march=x86-64 | FileCheck %s + +; This should not generate SSE instructions: +; +; CHECK: without.sse: +; CHECK: flds +; CHECK: fmuls +; CHECK: fstps +define void @without.sse(float* nocapture %a, float* nocapture %b, float* nocapture %c, i32 %n) #0 { +entry: + %cmp9 = icmp sgt i32 %n, 0 + br i1 %cmp9, label %for.body, label %for.end + +for.body: + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds float* %b, i64 %indvars.iv + %0 = load float* %arrayidx, align 4, !tbaa !0 + %arrayidx2 = getelementptr inbounds float* %c, i64 %indvars.iv + %1 = load float* %arrayidx2, align 4, !tbaa !0 + %mul = fmul float %0, %1 + %arrayidx4 = getelementptr inbounds float* %a, i64 %indvars.iv + store float %mul, float* %arrayidx4, align 4, !tbaa !0 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +; This should generate SSE instructions: +; +; CHECK: with.sse +; CHECK: movss +; CHECK: mulss +; CHECK: movss +define void @with.sse(float* nocapture %a, float* nocapture %b, float* nocapture %c, i32 %n) #1 { +entry: + %cmp9 = icmp sgt i32 %n, 0 + br i1 %cmp9, label %for.body, label %for.end + +for.body: + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds float* %b, i64 %indvars.iv + %0 = load float* %arrayidx, align 4, !tbaa !0 + %arrayidx2 = getelementptr inbounds float* %c, i64 %indvars.iv + %1 = load float* %arrayidx2, align 4, !tbaa !0 + %mul = fmul float %0, %1 + %arrayidx4 = getelementptr inbounds float* %a, i64 %indvars.iv + store float %mul, float* %arrayidx4, align 4, !tbaa !0 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +attributes #0 = { nounwind optsize ssp uwtable "target-cpu"="core2" "target-features"="-sse4a,-avx2,-xop,-fma4,-bmi2,-3dnow,-3dnowa,-pclmul,-sse,-avx,-sse41,-ssse3,+mmx,-rtm,-sse42,-lzcnt,-f16c,-popcnt,-bmi,-aes,-fma,-rdrand,-sse2,-sse3" } +attributes #1 = { nounwind optsize ssp uwtable "target-cpu"="core2" "target-features"="-sse4a,-avx2,-xop,-fma4,-bmi2,-3dnow,-3dnowa,-pclmul,+sse,-avx,-sse41,+ssse3,+mmx,-rtm,-sse42,-lzcnt,-f16c,-popcnt,-bmi,-aes,-fma,-rdrand,+sse2,+sse3" } + +!0 = metadata !{metadata !"float", metadata !1} +!1 = metadata !{metadata !"omnipotent char", metadata !2} +!2 = metadata !{metadata !"Simple C/C++ TBAA"} diff --git a/test/CodeGen/X86/tailcall-structret.ll b/test/CodeGen/X86/tailcall-structret.ll index d8be4b2..dcfefe8 100644 --- a/test/CodeGen/X86/tailcall-structret.ll +++ b/test/CodeGen/X86/tailcall-structret.ll @@ -1,6 +1,7 @@ -; RUN: llc < %s -march=x86 -tailcallopt | grep TAILCALL +; 
RUN: llc < %s -mtriple=i686-unknown-linux -tailcallopt | FileCheck %s define fastcc { { i8*, i8* }*, i8*} @init({ { i8*, i8* }*, i8*}, i32) { entry: %2 = tail call fastcc { { i8*, i8* }*, i8* } @init({ { i8*, i8*}*, i8*} %0, i32 %1) ret { { i8*, i8* }*, i8*} %2 +; CHECK: jmp init } diff --git a/test/CodeGen/X86/tailcallbyval.ll b/test/CodeGen/X86/tailcallbyval.ll index 118eee6..9a0b57c 100644 --- a/test/CodeGen/X86/tailcallbyval.ll +++ b/test/CodeGen/X86/tailcallbyval.ll @@ -1,5 +1,4 @@ -; RUN: llc < %s -march=x86 -tailcallopt | grep TAILCALL -; RUN: llc < %s -march=x86 -tailcallopt | grep "movl[[:space:]]*4(%esp), %eax" | count 1 +; RUN: llc < %s -mtriple=i686-unknown-linux -tailcallopt | FileCheck %s %struct.s = type {i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } @@ -9,10 +8,14 @@ entry: %tmp2 = getelementptr %struct.s* %a, i32 0, i32 0 %tmp3 = load i32* %tmp2 ret i32 %tmp3 +; CHECK: tailcallee +; CHECK: movl 4(%esp), %eax } define fastcc i32 @tailcaller(%struct.s* byval %a) nounwind { entry: %tmp4 = tail call fastcc i32 @tailcallee(%struct.s* byval %a ) ret i32 %tmp4 +; CHECK: tailcaller +; CHECK: jmp tailcallee } diff --git a/test/CodeGen/X86/tailcallfp.ll b/test/CodeGen/X86/tailcallfp.ll index c0b609a..22a7930 100644 --- a/test/CodeGen/X86/tailcallfp.ll +++ b/test/CodeGen/X86/tailcallfp.ll @@ -1,5 +1,6 @@ -; RUN: llc < %s -march=x86 -x86-asm-syntax=intel -tailcallopt | not grep call +; RUN: llc < %s -march=x86 -tailcallopt | FileCheck %s define fastcc i32 @bar(i32 %X, i32(double, i32) *%FP) { %Y = tail call fastcc i32 %FP(double 0.0, i32 %X) ret i32 %Y +; CHECK: jmpl } diff --git a/test/CodeGen/X86/tailcallpic1.ll b/test/CodeGen/X86/tailcallpic1.ll index 60e3be5..ff590a1 100644 --- a/test/CodeGen/X86/tailcallpic1.ll +++ b/test/CodeGen/X86/tailcallpic1.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -tailcallopt -mtriple=i686-pc-linux-gnu -relocation-model=pic | grep TAILCALL +; RUN: llc < %s -tailcallopt -mtriple=i686-pc-linux-gnu -relocation-model=pic | FileCheck %s define protected fastcc i32 @tailcallee(i32 %a1, i32 %a2, i32 %a3, i32 %a4) { entry: @@ -9,4 +9,5 @@ define fastcc i32 @tailcaller(i32 %in1, i32 %in2) { entry: %tmp11 = tail call fastcc i32 @tailcallee( i32 %in1, i32 %in2, i32 %in1, i32 %in2 ) ; <i32> [#uses=1] ret i32 %tmp11 +; CHECK: jmp tailcallee } diff --git a/test/CodeGen/X86/tailcallpic2.ll b/test/CodeGen/X86/tailcallpic2.ll index eaa7631..1b6bdb7 100644 --- a/test/CodeGen/X86/tailcallpic2.ll +++ b/test/CodeGen/X86/tailcallpic2.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -tailcallopt -mtriple=i686-pc-linux-gnu -relocation-model=pic | grep -v TAILCALL +; RUN: llc < %s -tailcallopt -mtriple=i686-pc-linux-gnu -relocation-model=pic | FileCheck %s define fastcc i32 @tailcallee(i32 %a1, i32 %a2, i32 %a3, i32 %a4) { entry: @@ -9,4 +9,7 @@ define fastcc i32 @tailcaller(i32 %in1, i32 %in2) { entry: %tmp11 = tail call fastcc i32 @tailcallee( i32 %in1, i32 %in2, i32 %in1, i32 %in2 ) ; <i32> [#uses=1] ret i32 %tmp11 +; CHECK: movl tailcallee@GOT +; CHECK: jmpl } + diff --git a/test/CodeGen/X86/v8i1-masks.ll b/test/CodeGen/X86/v8i1-masks.ll index abb4b39..8cbfb5d 100644 --- a/test/CodeGen/X86/v8i1-masks.ll +++ b/test/CodeGen/X86/v8i1-masks.ll @@ -1,12 +1,12 @@ ; RUN: llc -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -o - < %s | FileCheck %s ;CHECK: and_masks -;CHECK: vmovups +;CHECK: vmovaps ;CHECK: vcmpltp ;CHECK: vcmpltp ;CHECK: vandps ;CHECK: vandps -;CHECK: vmovups +;CHECK: vmovaps ;CHECK: ret define void 
@and_masks(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwind uwtable noinline ssp { @@ -17,7 +17,7 @@ define void @and_masks(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwi %m1 = fcmp olt <8 x float> %v2, %v0 %mand = and <8 x i1> %m1, %m0 %r = zext <8 x i1> %mand to <8 x i32> - store <8 x i32> %r, <8 x i32>* undef, align 16 + store <8 x i32> %r, <8 x i32>* undef, align 32 ret void } @@ -25,7 +25,7 @@ define void @and_masks(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwi ;CHECK: vcmpltps ;CHECK: vxorps ;CHECK: vandps -;CHECK: vmovups +;CHECK: vmovaps ;CHECK: ret define void @neg_masks(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwind uwtable noinline ssp { %v0 = load <8 x float>* %a, align 16 @@ -33,7 +33,7 @@ define void @neg_masks(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwi %m0 = fcmp olt <8 x float> %v1, %v0 %mand = xor <8 x i1> %m0, <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1> %r = zext <8 x i1> %mand to <8 x i32> - store <8 x i32> %r, <8 x i32>* undef, align 16 + store <8 x i32> %r, <8 x i32>* undef, align 32 ret void } diff --git a/test/CodeGen/X86/vec_fpext.ll b/test/CodeGen/X86/vec_fpext.ll index dc0464f..e4a8f46 100644 --- a/test/CodeGen/X86/vec_fpext.ll +++ b/test/CodeGen/X86/vec_fpext.ll @@ -29,8 +29,8 @@ entry: ; CHECK: cvtps2pd 8(%{{.+}}), %xmm{{[0-9]+}} ; CHECK: cvtps2pd 16(%{{.+}}), %xmm{{[0-9]+}} ; CHECK: cvtps2pd 24(%{{.+}}), %xmm{{[0-9]+}} -; AVX: vcvtps2pd (%{{.+}}), %ymm{{[0-9]+}} ; AVX: vcvtps2pd 16(%{{.+}}), %ymm{{[0-9]+}} +; AVX: vcvtps2pd (%{{.+}}), %ymm{{[0-9]+}} %0 = load <8 x float>* %in %1 = fpext <8 x float> %0 to <8 x double> store <8 x double> %1, <8 x double>* %out, align 1 diff --git a/test/CodeGen/X86/vec_sdiv_to_shift.ll b/test/CodeGen/X86/vec_sdiv_to_shift.ll new file mode 100644 index 0000000..35e052d --- /dev/null +++ b/test/CodeGen/X86/vec_sdiv_to_shift.ll @@ -0,0 +1,72 @@ +; RUN: llc < %s -march=x86-64 -mcpu=penryn -mattr=+avx2 | FileCheck %s + + +define <8 x i16> @sdiv_vec8x16(<8 x i16> %var) { +entry: +; CHECK: sdiv_vec8x16 +; CHECK: psraw $15 +; CHECK: vpsrlw $11 +; CHECK: vpaddw +; CHECK: vpsraw $5 +; CHECK: ret + %0 = sdiv <8 x i16> %var, <i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32> + ret <8 x i16> %0 +} + +define <4 x i32> @sdiv_zero(<4 x i32> %var) { +entry: +; CHECK: sdiv_zero +; CHECK-NOT: sra +; CHECK: ret + %0 = sdiv <4 x i32> %var, <i32 0, i32 0, i32 0, i32 0> + ret <4 x i32> %0 +} + +define <4 x i32> @sdiv_vec4x32(<4 x i32> %var) { +entry: +; CHECK: sdiv_vec4x32 +; CHECK: vpsrad $31 +; CHECK: vpsrld $28 +; CHECK: vpaddd +; CHECK: vpsrad $4 +; CHECK: ret +%0 = sdiv <4 x i32> %var, <i32 16, i32 16, i32 16, i32 16> +ret <4 x i32> %0 +} + +define <4 x i32> @sdiv_negative(<4 x i32> %var) { +entry: +; CHECK: sdiv_negative +; CHECK: vpsrad $31 +; CHECK: vpsrld $28 +; CHECK: vpaddd +; CHECK: vpsrad $4 +; CHECK: vpsubd +; CHECK: ret +%0 = sdiv <4 x i32> %var, <i32 -16, i32 -16, i32 -16, i32 -16> +ret <4 x i32> %0 +} + +define <8 x i32> @sdiv8x32(<8 x i32> %var) { +entry: +; CHECK: sdiv8x32 +; CHECK: vpsrad $31 +; CHECK: vpsrld $26 +; CHECK: vpaddd +; CHECK: vpsrad $6 +; CHECK: ret +%0 = sdiv <8 x i32> %var, <i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64> +ret <8 x i32> %0 +} + +define <16 x i16> @sdiv16x16(<16 x i16> %var) { +entry: +; CHECK: sdiv16x16 +; CHECK: vpsraw $15 +; CHECK: vpsrlw $14 +; CHECK: vpaddw +; CHECK: vpsraw $2 +; CHECK: ret + %a0 = sdiv <16 x i16> %var, <i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 
4, i16 4, i16 4, i16 4, i16 4> + ret <16 x i16> %a0 +} diff --git a/test/CodeGen/X86/vec_splat-2.ll b/test/CodeGen/X86/vec_splat-2.ll index f105de4..5c668b7 100644 --- a/test/CodeGen/X86/vec_splat-2.ll +++ b/test/CodeGen/X86/vec_splat-2.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=x86 -mcpu=penryn -mattr=+sse2 | grep pshufd | count 1 +; RUN: llc < %s -march=x86 -mcpu=pentium4 -mattr=+sse2 | FileCheck %s define void @test(<2 x i64>* %P, i8 %x) nounwind { %tmp = insertelement <16 x i8> zeroinitializer, i8 %x, i32 0 ; <<16 x i8>> [#uses=1] @@ -23,4 +23,11 @@ define void @test(<2 x i64>* %P, i8 %x) nounwind { %tmp73.upgrd.1 = bitcast <16 x i8> %tmp73 to <2 x i64> ; <<2 x i64>> [#uses=1] store <2 x i64> %tmp73.upgrd.1, <2 x i64>* %P ret void + +; CHECK: test: +; CHECK-NOT: pshufd +; CHECK: punpcklbw +; CHECK: punpcklbw +; CHECK: pshufd $0 +; CHECK-NOT: pshufd } diff --git a/test/CodeGen/X86/vec_splat-3.ll b/test/CodeGen/X86/vec_splat-3.ll index feacc42..cf0ecf4 100644 --- a/test/CodeGen/X86/vec_splat-3.ll +++ b/test/CodeGen/X86/vec_splat-3.ll @@ -1,55 +1,230 @@ -; RUN: llc < %s -march=x86 -mcpu=penryn -mattr=sse41 -o %t -; RUN: grep punpcklwd %t | count 4 -; RUN: grep punpckhwd %t | count 4 -; RUN: grep "pshufd" %t | count 8 +; RUN: llc <%s -march=x86 -mcpu=penryn -mattr=sse41 | FileCheck %s ; Splat test for v8i16 -; Should generate with pshufd with masks $0, $85, $170, $255 (each mask is used twice) define <8 x i16> @shuf_8i16_0(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { -entry: - %tmp6 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 0, i32 undef, i32 undef, i32 0, i32 undef, i32 undef, i32 undef , i32 undef > + %tmp6 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> <i32 0, i32 undef, i32 undef, i32 0, i32 undef, i32 undef, i32 undef, i32 undef> ret <8 x i16> %tmp6 + +; CHECK: shuf_8i16_0: +; CHECK: pshuflw $0 } define <8 x i16> @shuf_8i16_1(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { -entry: - %tmp6 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 1, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef , i32 undef > + %tmp6 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> <i32 1, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ret <8 x i16> %tmp6 + +; CHECK: shuf_8i16_1: +; CHECK: pshuflw $5 } define <8 x i16> @shuf_8i16_2(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { -entry: - %tmp6 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 2, i32 undef, i32 undef, i32 2, i32 undef, i32 2, i32 undef , i32 undef > + %tmp6 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> <i32 2, i32 undef, i32 undef, i32 2, i32 undef, i32 2, i32 undef, i32 undef> ret <8 x i16> %tmp6 + +; CHECK: shuf_8i16_2: +; CHECK: punpcklwd +; CHECK-NEXT: pshufd $-86 } define <8 x i16> @shuf_8i16_3(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { -entry: - %tmp6 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 3, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef , i32 undef > + %tmp6 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> <i32 3, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ret <8 x i16> %tmp6 + +; CHECK: shuf_8i16_3: +; CHECK: pshuflw $15 } define <8 x i16> @shuf_8i16_4(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { -entry: - %tmp6 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 4, i32 undef, i32 undef, i32 undef, i32 4, i32 undef, i32 undef , i32 undef > + %tmp6 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> <i32 4, i32 undef, i32 
undef, i32 undef, i32 4, i32 undef, i32 undef, i32 undef> ret <8 x i16> %tmp6 + +; CHECK: shuf_8i16_4: +; CHECK: movhlps } define <8 x i16> @shuf_8i16_5(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { -entry: - %tmp6 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 5, i32 undef, i32 undef, i32 5, i32 undef, i32 undef, i32 undef , i32 undef > + %tmp6 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> <i32 5, i32 undef, i32 undef, i32 5, i32 undef, i32 undef, i32 undef, i32 undef> ret <8 x i16> %tmp6 + +; CHECK: shuf_8i16_5: +; CHECK: punpckhwd +; CHECK-NEXT: pshufd $85 } define <8 x i16> @shuf_8i16_6(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { -entry: - %tmp6 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 6, i32 6, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef , i32 undef > + %tmp6 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> <i32 6, i32 6, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ret <8 x i16> %tmp6 -} +; CHECK: shuf_8i16_6: +; CHECK: punpckhwd +; CHECK-NEXT: pshufd $-86 +} define <8 x i16> @shuf_8i16_7(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { -entry: - %tmp6 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 7, i32 undef, i32 undef, i32 7, i32 undef, i32 undef, i32 undef , i32 undef > + %tmp6 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> <i32 7, i32 undef, i32 undef, i32 7, i32 undef, i32 undef, i32 undef, i32 undef> ret <8 x i16> %tmp6 + +; CHECK: shuf_8i16_7: +; CHECK: punpckhwd +; CHECK-NEXT: pshufd $-1 +} + +; Splat test for v16i8 +define <16 x i8> @shuf_16i8_8(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { + %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 0, i32 undef, i32 undef, i32 0, i32 undef, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <16 x i8> %tmp6 + +; CHECK: shuf_16i8_8: +; CHECK: punpcklbw +; CHECK-NEXT: punpcklbw +; CHECK-NEXT: pshufd $0 +} + +define <16 x i8> @shuf_16i8_9(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { + %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 1, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef > + ret <16 x i8> %tmp6 + +; CHECK: shuf_16i8_9: +; CHECK: punpcklbw +; CHECK-NEXT: punpcklbw +; CHECK-NEXT: pshufd $85 +} + +define <16 x i8> @shuf_16i8_10(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { + %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 2, i32 undef, i32 undef, i32 2, i32 undef, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2> + ret <16 x i8> %tmp6 + +; CHECK: shuf_16i8_10: +; CHECK: punpcklbw +; CHECK-NEXT: punpcklbw +; CHECK-NEXT: pshufd $-86 +} + +define <16 x i8> @shuf_16i8_11(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { + %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 3, i32 undef, i32 undef, i32 3, i32 undef, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> + ret <16 x i8> %tmp6 + +; CHECK: shuf_16i8_11: +; CHECK: punpcklbw +; CHECK-NEXT: punpcklbw +; CHECK-NEXT: pshufd $-1 +} + + +define <16 x i8> @shuf_16i8_12(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { + %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 4, i32 undef, i32 undef, i32 undef, i32 4, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef > + ret <16 x i8> %tmp6 + +; CHECK: 
shuf_16i8_12: +; CHECK: pshufd $5 +} + +define <16 x i8> @shuf_16i8_13(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { + %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 5, i32 undef, i32 undef, i32 5, i32 undef, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5> + ret <16 x i8> %tmp6 + +; CHECK: shuf_16i8_13: +; CHECK: punpcklbw +; CHECK-NEXT: punpckhbw +; CHECK-NEXT: pshufd $85 +} + +define <16 x i8> @shuf_16i8_14(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { + %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 6, i32 undef, i32 undef, i32 6, i32 undef, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6> + ret <16 x i8> %tmp6 + +; CHECK: shuf_16i8_14: +; CHECK: punpcklbw +; CHECK-NEXT: punpckhbw +; CHECK-NEXT: pshufd $-86 +} + +define <16 x i8> @shuf_16i8_15(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { + %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 7, i32 undef, i32 undef, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef > + ret <16 x i8> %tmp6 + +; CHECK: shuf_16i8_15: +; CHECK: punpcklbw +; CHECK-NEXT: punpckhbw +; CHECK-NEXT: pshufd $-1 +} + +define <16 x i8> @shuf_16i8_16(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { + %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 8, i32 undef, i32 undef, i32 8, i32 undef, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8> + ret <16 x i8> %tmp6 + +; CHECK: shuf_16i8_16: +; CHECK: punpckhbw +; CHECK-NEXT: punpcklbw +; CHECK-NEXT: pshufd $0 +} + +define <16 x i8> @shuf_16i8_17(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { + %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 9, i32 undef, i32 undef, i32 9, i32 undef, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9> + ret <16 x i8> %tmp6 + +; CHECK: shuf_16i8_17: +; CHECK: punpckhbw +; CHECK-NEXT: punpcklbw +; CHECK-NEXT: pshufd $85 +} + +define <16 x i8> @shuf_16i8_18(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { + %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 10, i32 undef, i32 undef, i32 10, i32 undef, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10> + ret <16 x i8> %tmp6 + +; CHECK: shuf_16i8_18: +; CHECK: punpckhbw +; CHECK-NEXT: punpcklbw +; CHECK-NEXT: pshufd $-86 +} + +define <16 x i8> @shuf_16i8_19(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { + %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 11, i32 undef, i32 undef, i32 11, i32 undef, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11> + ret <16 x i8> %tmp6 + +; CHECK: shuf_16i8_19: +; CHECK: punpckhbw +; CHECK-NEXT: punpcklbw +; CHECK-NEXT: pshufd $-1 +} + +define <16 x i8> @shuf_16i8_20(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { + %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 12, i32 undef, i32 undef, i32 12, i32 undef, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12> + ret <16 x i8> %tmp6 + +; CHECK: shuf_16i8_20: +; CHECK: punpckhbw +; CHECK-NEXT: punpckhbw +; CHECK-NEXT: pshufd $0 +} + +define <16 x i8> @shuf_16i8_21(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { + %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 13, i32 undef, i32 undef, i32 13, i32 undef, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, 
i32 13, i32 13, i32 13> + ret <16 x i8> %tmp6 + +; CHECK: shuf_16i8_21: +; CHECK: punpckhbw +; CHECK-NEXT: punpckhbw +; CHECK-NEXT: pshufd $85 +} + +define <16 x i8> @shuf_16i8_22(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { + %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 14, i32 undef, i32 undef, i32 14, i32 undef, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14> + ret <16 x i8> %tmp6 + +; CHECK: shuf_16i8_22: +; CHECK: punpckhbw +; CHECK-NEXT: punpckhbw +; CHECK-NEXT: pshufd $-86 +} + +define <16 x i8> @shuf_16i8_23(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { + %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 15, i32 undef, i32 undef, i32 15, i32 undef, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15> + ret <16 x i8> %tmp6 + +; CHECK: shuf_16i8_23: +; CHECK: punpckhbw +; CHECK-NEXT: punpckhbw +; CHECK-NEXT: pshufd $-1 } diff --git a/test/CodeGen/X86/vec_splat-4.ll b/test/CodeGen/X86/vec_splat-4.ll deleted file mode 100644 index 374acfa..0000000 --- a/test/CodeGen/X86/vec_splat-4.ll +++ /dev/null @@ -1,104 +0,0 @@ -; RUN: llc < %s -march=x86 -mcpu=penryn -mattr=sse41 -o %t -; RUN: grep punpcklbw %t | count 16 -; RUN: grep punpckhbw %t | count 16 -; RUN: grep "pshufd" %t | count 16 - -; Should generate with pshufd with masks $0, $85, $170, $255 (each mask is used 4 times) - -; Splat test for v16i8 -define <16 x i8 > @shuf_16i8_0(<16 x i8 > %T0, <16 x i8 > %T1) nounwind readnone { -entry: - %tmp6 = shufflevector <16 x i8 > %T0, <16 x i8 > %T1, <16 x i32> < i32 0, i32 undef, i32 undef, i32 0, i32 undef, i32 0, i32 0 , i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 > - ret <16 x i8 > %tmp6 -} - -define <16 x i8 > @shuf_16i8_1(<16 x i8 > %T0, <16 x i8 > %T1) nounwind readnone { -entry: - %tmp6 = shufflevector <16 x i8 > %T0, <16 x i8 > %T1, <16 x i32> < i32 1, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef , i32 undef, i32 undef, i32 undef, i32 undef, i32 undef , i32 undef, i32 undef, i32 undef, i32 undef > - ret <16 x i8 > %tmp6 -} - -define <16 x i8 > @shuf_16i8_2(<16 x i8 > %T0, <16 x i8 > %T1) nounwind readnone { -entry: - %tmp6 = shufflevector <16 x i8 > %T0, <16 x i8 > %T1, <16 x i32> < i32 2, i32 undef, i32 undef, i32 2, i32 undef, i32 2, i32 2 , i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2 > - ret <16 x i8 > %tmp6 -} - -define <16 x i8 > @shuf_16i8_3(<16 x i8 > %T0, <16 x i8 > %T1) nounwind readnone { -entry: - %tmp6 = shufflevector <16 x i8 > %T0, <16 x i8 > %T1, <16 x i32> < i32 3, i32 undef, i32 undef, i32 3, i32 undef, i32 3, i32 3 , i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3 > - ret <16 x i8 > %tmp6 -} - - -define <16 x i8 > @shuf_16i8_4(<16 x i8 > %T0, <16 x i8 > %T1) nounwind readnone { -entry: - %tmp6 = shufflevector <16 x i8 > %T0, <16 x i8 > %T1, <16 x i32> < i32 4, i32 undef, i32 undef, i32 undef, i32 4, i32 undef, i32 undef , i32 undef, i32 undef, i32 undef, i32 undef , i32 undef, i32 undef, i32 undef, i32 undef , i32 undef > - ret <16 x i8 > %tmp6 -} - -define <16 x i8 > @shuf_16i8_5(<16 x i8 > %T0, <16 x i8 > %T1) nounwind readnone { -entry: - %tmp6 = shufflevector <16 x i8 > %T0, <16 x i8 > %T1, <16 x i32> < i32 5, i32 undef, i32 undef, i32 5, i32 undef, i32 5, i32 5 , i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5 > - ret <16 x i8 > %tmp6 -} - -define <16 x i8 > @shuf_16i8_6(<16 x i8 > %T0, <16 x i8 > %T1) nounwind readnone { -entry: - %tmp6 = shufflevector <16 x 
i8 > %T0, <16 x i8 > %T1, <16 x i32> < i32 6, i32 undef, i32 undef, i32 6, i32 undef, i32 6, i32 6 , i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6 > - ret <16 x i8 > %tmp6 -} - -define <16 x i8 > @shuf_16i8_7(<16 x i8 > %T0, <16 x i8 > %T1) nounwind readnone { -entry: - %tmp6 = shufflevector <16 x i8 > %T0, <16 x i8 > %T1, <16 x i32> < i32 7, i32 undef, i32 undef, i32 7, i32 undef, i32 undef, i32 undef , i32 undef, i32 undef, i32 undef, i32 undef , i32 undef , i32 undef, i32 undef, i32 undef , i32 undef > - ret <16 x i8 > %tmp6 -} - -define <16 x i8 > @shuf_16i8_8(<16 x i8 > %T0, <16 x i8 > %T1) nounwind readnone { -entry: - %tmp6 = shufflevector <16 x i8 > %T0, <16 x i8 > %T1, <16 x i32> < i32 8, i32 undef, i32 undef, i32 8, i32 undef, i32 8, i32 8 , i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8 > - ret <16 x i8 > %tmp6 -} - -define <16 x i8 > @shuf_16i8_9(<16 x i8 > %T0, <16 x i8 > %T1) nounwind readnone { -entry: - %tmp6 = shufflevector <16 x i8 > %T0, <16 x i8 > %T1, <16 x i32> < i32 9, i32 undef, i32 undef, i32 9, i32 undef, i32 9, i32 9 , i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9 > - ret <16 x i8 > %tmp6 -} - -define <16 x i8 > @shuf_16i8_10(<16 x i8 > %T0, <16 x i8 > %T1) nounwind readnone { -entry: - %tmp6 = shufflevector <16 x i8 > %T0, <16 x i8 > %T1, <16 x i32> < i32 10, i32 undef, i32 undef, i32 10, i32 undef, i32 10, i32 10 , i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10 > - ret <16 x i8 > %tmp6 -} - -define <16 x i8 > @shuf_16i8_11(<16 x i8 > %T0, <16 x i8 > %T1) nounwind readnone { -entry: - %tmp6 = shufflevector <16 x i8 > %T0, <16 x i8 > %T1, <16 x i32> < i32 11, i32 undef, i32 undef, i32 11, i32 undef, i32 11, i32 11 , i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11 > - ret <16 x i8 > %tmp6 -} - -define <16 x i8 > @shuf_16i8_12(<16 x i8 > %T0, <16 x i8 > %T1) nounwind readnone { -entry: - %tmp6 = shufflevector <16 x i8 > %T0, <16 x i8 > %T1, <16 x i32> < i32 12, i32 undef, i32 undef, i32 12, i32 undef, i32 12, i32 12 , i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12 > - ret <16 x i8 > %tmp6 -} - -define <16 x i8 > @shuf_16i8_13(<16 x i8 > %T0, <16 x i8 > %T1) nounwind readnone { -entry: - %tmp6 = shufflevector <16 x i8 > %T0, <16 x i8 > %T1, <16 x i32> < i32 13, i32 undef, i32 undef, i32 13, i32 undef, i32 13, i32 13 , i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13 > - ret <16 x i8 > %tmp6 -} - -define <16 x i8 > @shuf_16i8_14(<16 x i8 > %T0, <16 x i8 > %T1) nounwind readnone { -entry: - %tmp6 = shufflevector <16 x i8 > %T0, <16 x i8 > %T1, <16 x i32> < i32 14, i32 undef, i32 undef, i32 14, i32 undef, i32 14, i32 14 , i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14 > - ret <16 x i8 > %tmp6 -} - -define <16 x i8 > @shuf_16i8_15(<16 x i8 > %T0, <16 x i8 > %T1) nounwind readnone { -entry: - %tmp6 = shufflevector <16 x i8 > %T0, <16 x i8 > %T1, <16 x i32> < i32 15, i32 undef, i32 undef, i32 15, i32 undef, i32 15, i32 15 , i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15 > - ret <16 x i8 > %tmp6 -} diff --git a/test/CodeGen/X86/vec_splat.ll b/test/CodeGen/X86/vec_splat.ll index 24d8487..deedee8 100644 --- a/test/CodeGen/X86/vec_splat.ll +++ b/test/CodeGen/X86/vec_splat.ll @@ -1,5 +1,5 @@ -; RUN: llc < %s -march=x86 -mcpu=penryn -mattr=+sse2 | grep pshufd -; RUN: llc < %s -march=x86 -mcpu=penryn -mattr=+sse3 | grep movddup +; RUN: llc < %s -march=x86 -mcpu=pentium4 -mattr=+sse2 | FileCheck 
%s -check-prefix=SSE2 +; RUN: llc < %s -march=x86 -mcpu=pentium4 -mattr=+sse3 | FileCheck %s -check-prefix=SSE3 define void @test_v4sf(<4 x float>* %P, <4 x float>* %Q, float %X) nounwind { %tmp = insertelement <4 x float> zeroinitializer, float %X, i32 0 ; <<4 x float>> [#uses=1] @@ -10,6 +10,12 @@ define void @test_v4sf(<4 x float>* %P, <4 x float>* %Q, float %X) nounwind { %tmp10 = fmul <4 x float> %tmp8, %tmp6 ; <<4 x float>> [#uses=1] store <4 x float> %tmp10, <4 x float>* %P ret void + +; SSE2: test_v4sf: +; SSE2: pshufd $0 + +; SSE3: test_v4sf: +; SSE3: pshufd $0 } define void @test_v2sd(<2 x double>* %P, <2 x double>* %Q, double %X) nounwind { @@ -19,4 +25,10 @@ define void @test_v2sd(<2 x double>* %P, <2 x double>* %Q, double %X) nounwind { %tmp6 = fmul <2 x double> %tmp4, %tmp2 ; <<2 x double>> [#uses=1] store <2 x double> %tmp6, <2 x double>* %P ret void + +; SSE2: test_v2sd: +; SSE2: shufpd $0 + +; SSE3: test_v2sd: +; SSE3: movddup } diff --git a/test/CodeGen/X86/vector-gep.ll b/test/CodeGen/X86/vector-gep.ll index d08e2a0..ec93ce0 100644 --- a/test/CodeGen/X86/vector-gep.ll +++ b/test/CodeGen/X86/vector-gep.ll @@ -8,10 +8,8 @@ entry: %vecinit2.i = insertelement <4 x i32*> %vecinit.i, i32* %ptr, i32 1 %vecinit4.i = insertelement <4 x i32*> %vecinit2.i, i32* %ptr, i32 2 %vecinit6.i = insertelement <4 x i32*> %vecinit4.i, i32* %ptr, i32 3 -;CHECK: pslld $2 ;CHECK: padd %A2 = getelementptr <4 x i32*> %vecinit6.i, <4 x i32> <i32 1, i32 2, i32 3, i32 4> -;CHECK: pslld $2 ;CHECK: padd %A3 = getelementptr <4 x i32*> %A2, <4 x i32> <i32 10, i32 14, i32 19, i32 233> ret <4 x i32*> %A3 @@ -21,7 +19,6 @@ entry: ;CHECK: AGEP1: define i32 @AGEP1(<4 x i32*> %param) nounwind { entry: -;CHECK: pslld $2 ;CHECK: padd %A2 = getelementptr <4 x i32*> %param, <4 x i32> <i32 1, i32 2, i32 3, i32 4> %k = extractelement <4 x i32*> %A2, i32 3 diff --git a/test/CodeGen/X86/win_ftol2.ll b/test/CodeGen/X86/win_ftol2.ll index 596b426..7f8ae07 100644 --- a/test/CodeGen/X86/win_ftol2.ll +++ b/test/CodeGen/X86/win_ftol2.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -mtriple=i686-pc-win32 | FileCheck %s -check-prefix=FTOL +; RUN: llc < %s -mtriple=i686-pc-win32 -mcpu=generic | FileCheck %s -check-prefix=FTOL ; RUN: llc < %s -mtriple=i686-pc-mingw32 | FileCheck %s -check-prefix=COMPILERRT ; RUN: llc < %s -mtriple=i686-pc-linux | FileCheck %s -check-prefix=COMPILERRT ; RUN: llc < %s -mtriple=x86_64-pc-win32 | FileCheck %s -check-prefix=COMPILERRT diff --git a/test/CodeGen/X86/x86-64-ptr-arg-simple.ll b/test/CodeGen/X86/x86-64-ptr-arg-simple.ll new file mode 100644 index 0000000..6d46663 --- /dev/null +++ b/test/CodeGen/X86/x86-64-ptr-arg-simple.ll @@ -0,0 +1,29 @@ +; RUN: llc -mtriple=x86_64-pc-linux < %s | FileCheck %s +; RUN: llc -mtriple=x86_64-pc-linux-gnux32 < %s | FileCheck -check-prefix=X32ABI %s + +; %in is kept in %esi for both ABIs. 
But the pointer will be passed in %edi +; for x32, not %rdi + +; CHECK: movl %esi, (%rdi) +; X32ABI: movl %esi, (%edi) + +define void @foo(i32* nocapture %out, i32 %in) nounwind { +entry: + store i32 %in, i32* %out, align 4 + ret void +} + +; CHECK: bar +; CHECK: movl (%rsi), %eax + +; Similarly here, but for loading +; X32ABI: bar +; X32ABI: movl (%esi), %eax + +define void @bar(i32* nocapture %pOut, i32* nocapture %pIn) nounwind { +entry: + %0 = load i32* %pIn, align 4 + store i32 %0, i32* %pOut, align 4 + ret void +} + diff --git a/test/CodeGen/X86/x86-64-sret-return.ll b/test/CodeGen/X86/x86-64-sret-return.ll index 7b5f189..bc8a543 100644 --- a/test/CodeGen/X86/x86-64-sret-return.ll +++ b/test/CodeGen/X86/x86-64-sret-return.ll @@ -1,11 +1,16 @@ -; RUN: llc < %s | FileCheck %s +; RUN: llc -mtriple=x86_64-apple-darwin8 < %s | FileCheck %s +; RUN: llc -mtriple=x86_64-pc-linux < %s | FileCheck %s +; RUN: llc -mtriple=x86_64-pc-linux-gnux32 < %s | FileCheck -check-prefix=X32ABI %s -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128" -target triple = "x86_64-apple-darwin8" - %struct.foo = type { [4 x i64] } +%struct.foo = type { [4 x i64] } ; CHECK: bar: ; CHECK: movq %rdi, %rax + +; For the x32 ABI, pointers are 32-bit so 32-bit instructions will be used +; X32ABI: bar: +; X32ABI: movl %edi, %eax + define void @bar(%struct.foo* noalias sret %agg.result, %struct.foo* %d) nounwind { entry: %d_addr = alloca %struct.foo* ; <%struct.foo**> [#uses=2] @@ -57,6 +62,11 @@ return: ; preds = %entry ; CHECK: foo: ; CHECK: movq %rdi, %rax + +; For the x32 ABI, pointers are 32-bit so 32-bit instructions will be used +; X32ABI: foo: +; X32ABI: movl %edi, %eax + define void @foo({ i64 }* noalias nocapture sret %agg.result) nounwind { store { i64 } { i64 0 }, { i64 }* %agg.result ret void diff --git a/test/DebugInfo/2009-11-05-DeadGlobalVariable.ll b/test/DebugInfo/2009-11-05-DeadGlobalVariable.ll index e0371d6..6efce1a 100644 --- a/test/DebugInfo/2009-11-05-DeadGlobalVariable.ll +++ b/test/DebugInfo/2009-11-05-DeadGlobalVariable.ll @@ -10,10 +10,8 @@ entry: !llvm.dbg.cu = !{!0} !0 = metadata !{i32 720913, i32 0, i32 12, metadata !"fb.c", metadata !"/private/tmp", metadata !"clang version 3.0 (trunk 139632)", i1 true, i1 true, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !12} ; [ DW_TAG_compile_unit ] -!1 = metadata !{metadata !2} -!2 = metadata !{i32 0} -!3 = metadata !{metadata !4} -!4 = metadata !{metadata !5} +!1 = metadata !{i32 0} +!3 = metadata !{metadata !5} !5 = metadata !{i32 720942, i32 0, metadata !6, metadata !"foo", metadata !"foo", metadata !"", metadata !6, i32 1, metadata !7, i1 false, i1 true, i32 0, i32 0, i32 0, i32 0, i1 true, i32 ()* @foo, null, null, metadata !10} ; [ DW_TAG_subprogram ] !6 = metadata !{i32 720937, metadata !"fb.c", metadata !"/private/tmp", null} ; [ DW_TAG_file_type ] !7 = metadata !{i32 720917, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] diff --git a/test/DebugInfo/2009-11-10-CurrentFn.ll b/test/DebugInfo/2009-11-10-CurrentFn.ll index 01db617..19be3f2 100644 --- a/test/DebugInfo/2009-11-10-CurrentFn.ll +++ b/test/DebugInfo/2009-11-10-CurrentFn.ll @@ -13,10 +13,8 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone !llvm.dbg.cu = !{!0} !0 = metadata !{i32 720913, i32 0, i32 12, metadata !"cf.c", metadata !"/private/tmp", metadata !"clang 
version 3.0 (trunk 139632)", i1 true, i1 true, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1} ; [ DW_TAG_compile_unit ] -!1 = metadata !{metadata !2} -!2 = metadata !{i32 0} -!3 = metadata !{metadata !4} -!4 = metadata !{metadata !5} +!1 = metadata !{i32 0} +!3 = metadata !{metadata !5} !5 = metadata !{i32 720942, i32 0, metadata !6, metadata !"bar", metadata !"bar", metadata !"", metadata !6, i32 3, metadata !7, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, void (i32)* @bar, null, null, metadata !9} ; [ DW_TAG_subprogram ] !6 = metadata !{i32 720937, metadata !"cf.c", metadata !"/private/tmp", null} ; [ DW_TAG_file_type ] !7 = metadata !{i32 720917, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] diff --git a/test/DebugInfo/2010-06-29-InlinedFnLocalVar.ll b/test/DebugInfo/2010-06-29-InlinedFnLocalVar.ll index 2557c9c..c1a88e1 100644 --- a/test/DebugInfo/2010-06-29-InlinedFnLocalVar.ll +++ b/test/DebugInfo/2010-06-29-InlinedFnLocalVar.ll @@ -49,6 +49,5 @@ entry: !21 = metadata !{i32 9, i32 0, metadata !11, metadata !17} !22 = metadata !{i32 11, i32 0, metadata !11, metadata !17} !23 = metadata !{i32 16, i32 0, metadata !18, null} -!24 = metadata !{metadata !25} -!25 = metadata !{metadata !9, metadata !10} +!24 = metadata !{metadata !9, metadata !10} diff --git a/test/DebugInfo/AArch64/cfi-frame.ll b/test/DebugInfo/AArch64/cfi-frame.ll new file mode 100644 index 0000000..7290ddf --- /dev/null +++ b/test/DebugInfo/AArch64/cfi-frame.ll @@ -0,0 +1,58 @@ +; RUN: llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s +; RUN: llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -disable-fp-elim < %s | FileCheck %s --check-prefix=CHECK-WITH-FP + +@bigspace = global [8 x i64] zeroinitializer + +declare void @use_addr(i8*) + +define void @test_frame([8 x i64] %val) { +; CHECK: test_frame: +; CHECK: .cfi_startproc + + %var = alloca i8, i32 1000000 +; CHECK: sub sp, sp, #[[SP_INIT_ADJ:[0-9]+]] +; CHECK-NEXT: .Ltmp +; CHECK-NEXT: .cfi_def_cfa sp, [[SP_INIT_ADJ]] + +; Make sure the prologue is reasonably efficient +; CHECK-NEXT: stp x29, x30, [sp, +; CHECK-NEXT: stp x25, x26, [sp, +; CHECK-NEXT: stp x23, x24, [sp, +; CHECK-NEXT: stp x21, x22, [sp, +; CHECK-NEXT: stp x19, x20, [sp, +; CHECK-NEXT: sub sp, sp, #160 +; CHECK-NEXT: sub sp, sp, #244, lsl #12 +; CHECK-NEXT: .Ltmp +; CHECK-NEXT: .cfi_def_cfa sp, 1000080 +; CHECK-NEXT: .Ltmp +; CHECK-NEXT: .cfi_offset x30, -8 +; CHECK-NEXT: .Ltmp +; CHECK-NEXT: .cfi_offset x29, -16 +; [...] +; CHECK: .cfi_offset x19, -80 + +; CHECK: bl use_addr + call void @use_addr(i8* %var) + + store [8 x i64] %val, [8 x i64]* @bigspace + ret void +; CHECK: ret +; CHECK: .cfi_endproc +} + +; CHECK-WITH-FP: test_frame: + +; CHECK-WITH-FP: sub sp, sp, #[[SP_INIT_ADJ:[0-9]+]] +; CHECK-WITH-FP-NEXT: .Ltmp +; CHECK-WITH-FP-NEXT: .cfi_def_cfa sp, [[SP_INIT_ADJ]] + +; CHECK-WITH-FP: stp x29, x30, [sp, [[OFFSET:#[0-9]+]]] +; CHECK-WITH-FP-NEXT: add x29, sp, [[OFFSET]] +; CHECK-WITH-FP-NEXT: .Ltmp +; CHECK-WITH-FP-NEXT: .cfi_def_cfa x29, 16 + + ; We shouldn't emit any kind of update for the second stack adjustment if the + ; FP is in use. 
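+ ; (Once the CFA has been re-expressed as x29 plus a fixed offset, later sp + ; adjustments no longer move the CFA, which is why no .cfi_def_cfa_offset + ; update is expected below.)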
+; CHECK-WITH-FP-NOT: .cfi_def_cfa_offset + +; CHECK-WITH-FP: bl use_addr diff --git a/test/DebugInfo/AArch64/dwarfdump.ll b/test/DebugInfo/AArch64/dwarfdump.ll new file mode 100644 index 0000000..b94f775 --- /dev/null +++ b/test/DebugInfo/AArch64/dwarfdump.ll @@ -0,0 +1,33 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu < %s -filetype=obj | llvm-dwarfdump - | FileCheck %s + +; We're mostly checking that relocations are applied correctly +; here. Currently R_AARCH64_ABS32 is used for references to debug data +; and R_AARCH64_ABS64 is used for program addresses. + +; A couple of ABS32s, both at 0 and elsewhere, interpreted correctly: + +; CHECK: DW_AT_producer [DW_FORM_strp] ( .debug_str[0x00000000] = "clang version 3.3 ") +; CHECK: DW_AT_name [DW_FORM_strp] ( .debug_str[0x00000013] = "tmp.c") + +; A couple of ABS64s similarly: + +; CHECK: DW_AT_low_pc [DW_FORM_addr] (0x0000000000000000) +; CHECK: DW_AT_high_pc [DW_FORM_addr] (0x0000000000000008) + +define i32 @main() nounwind { + ret i32 0, !dbg !8 +} + +attributes #0 = { nounwind } + +!llvm.dbg.cu = !{!0} + +!0 = metadata !{i32 786449, i32 0, i32 12, metadata !"tmp.c", metadata !"/home/tim/llvm/build", metadata !"clang version 3.3 ", i1 true, i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !2, metadata !1} ; [ DW_TAG_compile_unit ] [/home/timnor01/llvm/build/tmp.c] [DW_LANG_C99] +!1 = metadata !{i32 0} +!2 = metadata !{metadata !3} +!3 = metadata !{i32 786478, i32 0, metadata !4, metadata !"main", metadata !"main", metadata !"", metadata !4, i32 1, metadata !5, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, i32 ()* @main, null, null, metadata !1, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [main] +!4 = metadata !{i32 786473, metadata !"tmp.c", metadata !"/home/tim/llvm/build", null} ; [ DW_TAG_file_type ] +!5 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !6, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] +!6 = metadata !{metadata !7} +!7 = metadata !{i32 786468, null, metadata !"int", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed] +!8 = metadata !{i32 2, i32 0, metadata !3, null} diff --git a/test/DebugInfo/AArch64/eh_frame.ll b/test/DebugInfo/AArch64/eh_frame.ll new file mode 100644 index 0000000..2539c56 --- /dev/null +++ b/test/DebugInfo/AArch64/eh_frame.ll @@ -0,0 +1,51 @@ +; RUN: llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu %s -filetype=obj -o %t +; RUN: llvm-objdump -s %t | FileCheck %s +@var = global i32 0 + +declare void @bar() + +define i64 @check_largest_class(i32 %in) { + %res = load i32* @var + call void @bar() + %ext = zext i32 %res to i64 + ret i64 %ext +} + +; The really key points we're checking here are: +; * Return register is x30. +; * Pointer format is 0x1b (GNU doesn't appear to understand others). + +; The rest is largely incidental, but not expected to change regularly. + +; Output is: + +; CHECK: Contents of section .eh_frame: +; CHECK-NEXT: 0000 10000000 00000000 017a5200 017c1e01 .........zR..|.. +; CHECK-NEXT: 0010 1b0c1f00 18000000 18000000 00000000 ................ + + +; Won't check the rest, it's rather incidental. +; 0020 24000000 00440c1f 10449e02 93040000 $....D...D......
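+ +; (A cross-check for the byte-by-byte readings below: a one-byte signed LEB128 +; with bit 6 set decodes as the byte minus 0x80, so 0x7c is 0x7c - 0x80 = -4, +; while 0x78 would be -8.)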
+ + +; The first CIE: +; ------------------- +; 10000000: length of first CIE = 0x10 +; 00000000: This is a CIE +; 01: version = 0x1 +; 7a 52 00: augmentation string "zR" -- pointer format is specified +; 01: code alignment factor 1 +; 7c: data alignment factor -4 +; 1e: return address register 30 (== x30). +; 01: 1 byte of augmentation +; 1b: pointer format 1b: DW_EH_PE_pcrel | DW_EH_PE_sdata4 +; 0c 1f 00: initial instructions: "DW_CFA_def_cfa x31 ofs 0" in this case + +; Next the FDE: +; ------------- +; 18000000: FDE length 0x18 +; 18000000: Uses CIE 0x18 backwards (only coincidentally same as above) +; 00000000: PC begin for this FDE is at 00000000 (relocation is applied here) +; 24000000: FDE applies up to PC begin+0x24 +; 00: Augmentation string length 0 for this FDE +; Rest: call frame instructions diff --git a/test/DebugInfo/AArch64/eh_frame_personality.ll b/test/DebugInfo/AArch64/eh_frame_personality.ll new file mode 100644 index 0000000..d35f2a2 --- /dev/null +++ b/test/DebugInfo/AArch64/eh_frame_personality.ll @@ -0,0 +1,46 @@ +; RUN: llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu %s -filetype=obj -o %t +; RUN: llvm-objdump -s %t | FileCheck %s + +declare i32 @__gxx_personality_v0(...) + +declare void @bar() + +define i64 @foo(i64 %lhs, i64 %rhs) { + invoke void @bar() to label %end unwind label %clean +end: + ret i64 0 + +clean: + %tst = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) cleanup + ret i64 42 +} + +; CHECK: Contents of section .eh_frame: +; CHECK: 0000 1c000000 00000000 017a504c 5200017c .........zPLR..| +; CHECK: 0010 1e0b0000 00000000 00000000 1b0c1f00 ................ + +; Don't really care about the rest: + +; 0020 1c000000 24000000 00000000 24000000 ....$.......$... +; 0030 08000000 00000000 00440c1f 10449e02 .........D...D.. + +; The key test here is that the personality routine is sanely encoded (under the +; small memory model it must be an 8-byte value for full generality: code+data < +; 4GB, but you might need both +4GB and -4GB depending on where things end +; up). However, for completeness: + +; First CIE: +; ---------- +; 1c000000: Length = 0x1c +; 00000000: This is a CIE +; 01: Version 1 +; 7a 50 4c 52 00: Augmentation string "zPLR" (personality routine, language-specific data, pointer format) +; 01: Code alignment factor 1 +; 7c: Data alignment factor: -4 +; 1e: Return address in x30 +; 0b: Augmentation data 0xb bytes (this is key!) +; 00: Personality encoding is DW_EH_PE_absptr +; 00 00 00 00 00 00 00 00: First part of aug (personality routine). Relocated, obviously +; 00: Second part of aug (language-specific data): absolute pointer format used +; 1b: pointer format: pc-relative signed 4-byte. Just like GNU.
+; 0c 1f 00: Initial instructions ("DW_CFA_def_cfa x31 ofs 0" in this case) diff --git a/test/DebugInfo/AArch64/lit.local.cfg b/test/DebugInfo/AArch64/lit.local.cfg new file mode 100644 index 0000000..c5ce241 --- /dev/null +++ b/test/DebugInfo/AArch64/lit.local.cfg @@ -0,0 +1,6 @@ +config.suffixes = ['.ll', '.c', '.cpp'] + +targets = set(config.root.targets_to_build.split()) +if not 'AArch64' in targets: + config.unsupported = True + diff --git a/test/DebugInfo/AArch64/variable-loc.ll b/test/DebugInfo/AArch64/variable-loc.ll new file mode 100644 index 0000000..9a7f7db --- /dev/null +++ b/test/DebugInfo/AArch64/variable-loc.ll @@ -0,0 +1,97 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu -disable-fp-elim < %s | FileCheck %s + +; This is a regression test making sure the location of variables is correct in +; debugging information, even if they're addressed via the frame pointer. + +; In case it needs regenerating, the following suffices: +; int printf(const char *, ...); +; void populate_array(int *, int); +; int sum_array(int *, int); + +; int main() { +; int main_arr[100], val; +; populate_array(main_arr, 100); +; val = sum_array(main_arr, 100); +; printf("Total is %d\n", val); +; return 0; +; } + + ; First make sure main_arr is where we expect it: sp + 12 == x29 - 420: +; CHECK: main: +; CHECK: sub sp, sp, #448 +; CHECK: stp x29, x30, [sp, #432] +; CHECK: add x29, sp, #432 +; CHECK: add {{x[0-9]+}}, sp, #12 + + ; Now check the debugging information reflects this: +; CHECK: DW_TAG_variable +; CHECK-NEXT: .word .Linfo_string7 + + ; Rather hard-coded, but 145 => DW_OP_fbreg and the .ascii is LEB128 encoded -420. +; CHECK: DW_AT_location +; CHECK-NEXT: .byte 145 +; CHECK-NEXT: .ascii "\334|" + +; CHECK: .Linfo_string7: +; CHECK-NEXT: main_arr + + +target datalayout = "e-p:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-f128:128:128-n32:64-S128" +target triple = "aarch64-none-linux-gnu" + +@.str = private unnamed_addr constant [13 x i8] c"Total is %d\0A\00", align 1 + +declare void @populate_array(i32*, i32) nounwind + +declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone + +declare i32 @sum_array(i32*, i32) nounwind + +define i32 @main() nounwind { +entry: + %retval = alloca i32, align 4 + %main_arr = alloca [100 x i32], align 4 + %val = alloca i32, align 4 + store i32 0, i32* %retval + call void @llvm.dbg.declare(metadata !{[100 x i32]* %main_arr}, metadata !17), !dbg !22 + call void @llvm.dbg.declare(metadata !{i32* %val}, metadata !23), !dbg !24 + %arraydecay = getelementptr inbounds [100 x i32]* %main_arr, i32 0, i32 0, !dbg !25 + call void @populate_array(i32* %arraydecay, i32 100), !dbg !25 + %arraydecay1 = getelementptr inbounds [100 x i32]* %main_arr, i32 0, i32 0, !dbg !26 + %call = call i32 @sum_array(i32* %arraydecay1, i32 100), !dbg !26 + store i32 %call, i32* %val, align 4, !dbg !26 + %0 = load i32* %val, align 4, !dbg !27 + %call2 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([13 x i8]* @.str, i32 0, i32 0), i32 %0), !dbg !27 + ret i32 0, !dbg !28 +} + +declare i32 @printf(i8*, ...) 
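+ +; (Checking the LEB128 claim above: "\334" is octal for 0xdc and "|" is 0x7c, +; so the operand bytes are dc 7c; SLEB128 decoding gives (0xdc & 0x7f) + +; (0x7c << 7) = 92 + 15872 = 15964, and subtracting 2^14 for the sign bit +; yields -420, i.e. x29 - 420 as expected.)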
+ +!llvm.dbg.cu = !{!0} + +!0 = metadata !{i32 786449, i32 0, i32 12, metadata !"simple.c", metadata !"/home/timnor01/a64-trunk/build", metadata !"clang version 3.2 ", i1 true, i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1} ; [ DW_TAG_compile_unit ] [/home/timnor01/a64-trunk/build/simple.c] [DW_LANG_C99] +!1 = metadata !{i32 0} +!3 = metadata !{metadata !5, metadata !11, metadata !14} +!5 = metadata !{i32 786478, i32 0, metadata !6, metadata !"populate_array", metadata !"populate_array", metadata !"", metadata !6, i32 4, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (i32*, i32)* @populate_array, null, null, metadata !1, i32 4} ; [ DW_TAG_subprogram ] [line 4] [def] [populate_array] +!6 = metadata !{i32 786473, metadata !"simple.c", metadata !"/home/timnor01/a64-trunk/build", null} ; [ DW_TAG_file_type ] +!7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] +!8 = metadata !{null, metadata !9, metadata !10} +!9 = metadata !{i32 786447, null, metadata !"", null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !10} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from int] +!10 = metadata !{i32 786468, null, metadata !"int", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed] +!11 = metadata !{i32 786478, i32 0, metadata !6, metadata !"sum_array", metadata !"sum_array", metadata !"", metadata !6, i32 9, metadata !12, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (i32*, i32)* @sum_array, null, null, metadata !1, i32 9} ; [ DW_TAG_subprogram ] [line 9] [def] [sum_array] +!12 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !13, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] +!13 = metadata !{metadata !10, metadata !9, metadata !10} +!14 = metadata !{i32 786478, i32 0, metadata !6, metadata !"main", metadata !"main", metadata !"", metadata !6, i32 18, metadata !15, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @main, null, null, metadata !1, i32 18} ; [ DW_TAG_subprogram ] [line 18] [def] [main] +!15 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !16, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] +!16 = metadata !{metadata !10} +!17 = metadata !{i32 786688, metadata !18, metadata !"main_arr", metadata !6, i32 19, metadata !19, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [main_arr] [line 19] +!18 = metadata !{i32 786443, metadata !14, i32 18, i32 16, metadata !6, i32 4} ; [ DW_TAG_lexical_block ] [/home/timnor01/a64-trunk/build/simple.c] +!19 = metadata !{i32 786433, null, metadata !"", null, i32 0, i64 3200, i64 32, i32 0, i32 0, metadata !10, metadata !20, i32 0, i32 0} ; [ DW_TAG_array_type ] [line 0, size 3200, align 32, offset 0] [from int] +!20 = metadata !{i32 786465, i64 0, i64 99} ; [ DW_TAG_subrange_type ] [0, 99] +!22 = metadata !{i32 19, i32 7, metadata !18, null} +!23 = metadata !{i32 786688, metadata !18, metadata !"val", metadata !6, i32 20, metadata !10, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [val] [line 20] +!24 = metadata !{i32 20, i32 7, metadata !18, null} +!25 = metadata !{i32 22, i32 3, metadata !18, null} +!26 = metadata !{i32 23, i32 9, metadata !18, 
null} +!27 = metadata !{i32 24, i32 3, metadata !18, null} +!28 = metadata !{i32 26, i32 3, metadata !18, null} diff --git a/test/DebugInfo/Inputs/dwarfdump-inl-test.cc b/test/DebugInfo/Inputs/dwarfdump-inl-test.cc new file mode 100644 index 0000000..8ffbb52 --- /dev/null +++ b/test/DebugInfo/Inputs/dwarfdump-inl-test.cc @@ -0,0 +1,15 @@ +#include "dwarfdump-inl-test.h" +static inline int inlined_f() { + volatile int x = inlined_g(); + return x; +} + +int main() { + return inlined_f(); +} + +// Built with Clang 3.2 +// $ mkdir -p /tmp/dbginfo +// $ cp dwarfdump-inl-test.* /tmp/dbginfo +// $ cd /tmp/dbginfo +// $ clang++ -O2 -gline-tables-only -fsanitize=address -fPIC -shared dwarfdump-inl-test.cc -o <output> diff --git a/test/DebugInfo/Inputs/dwarfdump-inl-test.elf-x86-64 b/test/DebugInfo/Inputs/dwarfdump-inl-test.elf-x86-64 Binary files differindex 9a1d538..6df03da 100755 --- a/test/DebugInfo/Inputs/dwarfdump-inl-test.elf-x86-64 +++ b/test/DebugInfo/Inputs/dwarfdump-inl-test.elf-x86-64 diff --git a/test/DebugInfo/Inputs/dwarfdump-inl-test.h b/test/DebugInfo/Inputs/dwarfdump-inl-test.h new file mode 100644 index 0000000..ecc2aaa --- /dev/null +++ b/test/DebugInfo/Inputs/dwarfdump-inl-test.h @@ -0,0 +1,9 @@ +inline int inlined_h() { + volatile int z = 0; + return z; +} + +inline int inlined_g() { + volatile int y = inlined_h(); + return y; +} diff --git a/test/DebugInfo/Inputs/dwarfdump-pubnames.cc b/test/DebugInfo/Inputs/dwarfdump-pubnames.cc new file mode 100644 index 0000000..284755b --- /dev/null +++ b/test/DebugInfo/Inputs/dwarfdump-pubnames.cc @@ -0,0 +1,32 @@ +// Object file built using: +// clang -g -mllvm -generate-dwarf-pubnames -o dwarfdump-pubnames.elf-x86_64 \ +// dwarfdump-pubnames.cc -c + +struct C { + void member_function(); + static int static_member_function(); + static int static_member_variable; +}; + +int C::static_member_variable = 0; + +void C::member_function() { + static_member_variable = 0; +} + +int C::static_member_function() { + return static_member_variable; +} + +C global_variable; + +int global_function() { + return -1; +} + +namespace ns { + void global_namespace_function() { + global_variable.member_function(); + } + int global_namespace_variable = 1; +} diff --git a/test/DebugInfo/Inputs/dwarfdump-pubnames.elf-x86-64 b/test/DebugInfo/Inputs/dwarfdump-pubnames.elf-x86-64 Binary files differnew file mode 100644 index 0000000..3c9c1ad --- /dev/null +++ b/test/DebugInfo/Inputs/dwarfdump-pubnames.elf-x86-64 diff --git a/test/DebugInfo/Inputs/dwarfdump-test-32bit.elf.c b/test/DebugInfo/Inputs/dwarfdump-test-32bit.elf.c new file mode 100644 index 0000000..708e037 --- /dev/null +++ b/test/DebugInfo/Inputs/dwarfdump-test-32bit.elf.c @@ -0,0 +1,14 @@ +// clang -c -g -o dwarfdump-test-32bit.elf.o -m32 dwarfdump-test-32bit.elf.c + +extern int glob; + +int foo(int arg) { + int a = arg * 2; + return a + glob; +} + +int bar(int arg) { + int a = foo(arg) * foo(arg * 2); + return glob - foo(a); +} + diff --git a/test/DebugInfo/Inputs/dwarfdump-test-32bit.elf.o b/test/DebugInfo/Inputs/dwarfdump-test-32bit.elf.o Binary files differnew file mode 100644 index 0000000..817665e --- /dev/null +++ b/test/DebugInfo/Inputs/dwarfdump-test-32bit.elf.o diff --git a/test/DebugInfo/Inputs/dwarfdump-test.cc b/test/DebugInfo/Inputs/dwarfdump-test.cc new file mode 100644 index 0000000..4089998 --- /dev/null +++ b/test/DebugInfo/Inputs/dwarfdump-test.cc @@ -0,0 +1,23 @@ +class DummyClass { + int a_; + public: + DummyClass(int a) : a_(a) {} + int add(int b) { + return a_ + b; + } +}; + +int 
f(int a, int b) { + DummyClass c(a); + return c.add(b); +} + +int main() { + return f(2, 3); +} + +// Built with Clang 3.2: +// $ mkdir -p /tmp/dbginfo +// $ cp dwarfdump-test.cc /tmp/dbginfo +// $ cd /tmp/dbginfo +// $ clang++ -g dwarfdump-test.cc -o <output> diff --git a/test/DebugInfo/Inputs/dwarfdump-test.elf-x86-64 b/test/DebugInfo/Inputs/dwarfdump-test.elf-x86-64 Binary files differindex fe20c8e..455dd1c 100755 --- a/test/DebugInfo/Inputs/dwarfdump-test.elf-x86-64 +++ b/test/DebugInfo/Inputs/dwarfdump-test.elf-x86-64 diff --git a/test/DebugInfo/Inputs/dwarfdump-test2-helper.cc b/test/DebugInfo/Inputs/dwarfdump-test2-helper.cc new file mode 100644 index 0000000..7d92640 --- /dev/null +++ b/test/DebugInfo/Inputs/dwarfdump-test2-helper.cc @@ -0,0 +1,3 @@ +extern "C" int a() { + return 0; +} diff --git a/test/DebugInfo/Inputs/dwarfdump-test2-main.cc b/test/DebugInfo/Inputs/dwarfdump-test2-main.cc new file mode 100644 index 0000000..b327674 --- /dev/null +++ b/test/DebugInfo/Inputs/dwarfdump-test2-main.cc @@ -0,0 +1,11 @@ +extern "C" int a(); + +int main() { + return a(); +} + +// Built with gcc 4.6.3 +// $ mkdir -p /tmp/dbginfo +// $ cp dwarfdump-test2-helper.cc dwarfdump-test2-main.cc /tmp/dbginfo/ +// $ cd /tmp/dbginfo +// $ g++ -g dwarfdump-test2-helper.cc dwarfdump-test2-main.cc -o <output> diff --git a/test/DebugInfo/Inputs/dwarfdump-test2.elf-x86-64 b/test/DebugInfo/Inputs/dwarfdump-test2.elf-x86-64 Binary files differindex ce4af7f..6f362ad 100755 --- a/test/DebugInfo/Inputs/dwarfdump-test2.elf-x86-64 +++ b/test/DebugInfo/Inputs/dwarfdump-test2.elf-x86-64 diff --git a/test/DebugInfo/Inputs/dwarfdump-test3-decl.h b/test/DebugInfo/Inputs/dwarfdump-test3-decl.h new file mode 100644 index 0000000..4a79e95 --- /dev/null +++ b/test/DebugInfo/Inputs/dwarfdump-test3-decl.h @@ -0,0 +1,7 @@ +#include "dwarfdump-test3-decl2.h" + +class C { + explicit C(bool a = false, bool b = false); +}; + +void do1() {} diff --git a/test/DebugInfo/Inputs/dwarfdump-test3-decl2.h b/test/DebugInfo/Inputs/dwarfdump-test3-decl2.h new file mode 100644 index 0000000..9c92d56 --- /dev/null +++ b/test/DebugInfo/Inputs/dwarfdump-test3-decl2.h @@ -0,0 +1 @@ +void do2() { } diff --git a/test/DebugInfo/Inputs/dwarfdump-test3.cc b/test/DebugInfo/Inputs/dwarfdump-test3.cc new file mode 100644 index 0000000..7b4d7ea --- /dev/null +++ b/test/DebugInfo/Inputs/dwarfdump-test3.cc @@ -0,0 +1,12 @@ +#include "dwarfdump-test3-decl.h" + +C::C(bool a, bool b) {} + +// Built with gcc 4.6.3 +// $ mkdir -p /tmp/dbginfo/include +// $ mkdir -p /tmp/include +// $ cp dwarfdump-test3.cc /tmp/dbginfo +// $ cp dwarfdump-test3-decl.h /tmp/include +// $ cp dwarfdump-test3-decl2.h /tmp/dbginfo/include +// $ cd /tmp/dbginfo +// $ gcc dwarfdump-test3.cc -g -I/tmp/include -Iinclude -fPIC -shared -o <output> diff --git a/test/DebugInfo/Inputs/dwarfdump-test3.elf-x86-64 b/test/DebugInfo/Inputs/dwarfdump-test3.elf-x86-64 Binary files differindex 7c17304..7330cd8 100755 --- a/test/DebugInfo/Inputs/dwarfdump-test3.elf-x86-64 +++ b/test/DebugInfo/Inputs/dwarfdump-test3.elf-x86-64 diff --git a/test/DebugInfo/Inputs/dwarfdump-test4-decl.h b/test/DebugInfo/Inputs/dwarfdump-test4-decl.h new file mode 100644 index 0000000..9abd875 --- /dev/null +++ b/test/DebugInfo/Inputs/dwarfdump-test4-decl.h @@ -0,0 +1 @@ +inline void a(){} diff --git a/test/DebugInfo/Inputs/dwarfdump-test4-part1.cc b/test/DebugInfo/Inputs/dwarfdump-test4-part1.cc new file mode 100644 index 0000000..94a818c --- /dev/null +++ b/test/DebugInfo/Inputs/dwarfdump-test4-part1.cc @@ -0,0 +1,8 @@ 
+#include "dwarfdump-test4-decl.h" +int c(){a();} + +// Built with gcc 4.6.3 +// $ mkdir -p /tmp/dbginfo +// $ cp dwarfdump-test4-*.* /tmp/dbginfo +// $ cd /tmp/dbginfo +// $ gcc -fPIC -shared -g dwarfdump-test4-part*.cc -o <output> diff --git a/test/DebugInfo/Inputs/dwarfdump-test4-part2.cc b/test/DebugInfo/Inputs/dwarfdump-test4-part2.cc new file mode 100644 index 0000000..2a1936f --- /dev/null +++ b/test/DebugInfo/Inputs/dwarfdump-test4-part2.cc @@ -0,0 +1,2 @@ +#include "dwarfdump-test4-decl.h" +int d(){a();} diff --git a/test/DebugInfo/Inputs/dwarfdump-test4.elf-x86-64 b/test/DebugInfo/Inputs/dwarfdump-test4.elf-x86-64 Binary files differindex 8848708..a1dd8b9 100755 --- a/test/DebugInfo/Inputs/dwarfdump-test4.elf-x86-64 +++ b/test/DebugInfo/Inputs/dwarfdump-test4.elf-x86-64 diff --git a/test/DebugInfo/Inputs/lit.local.cfg b/test/DebugInfo/Inputs/lit.local.cfg new file mode 100644 index 0000000..e6f55ee --- /dev/null +++ b/test/DebugInfo/Inputs/lit.local.cfg @@ -0,0 +1 @@ +config.suffixes = [] diff --git a/test/DebugInfo/Inputs/test-inline.o b/test/DebugInfo/Inputs/test-inline.o Binary files differnew file mode 100644 index 0000000..a650c91 --- /dev/null +++ b/test/DebugInfo/Inputs/test-inline.o diff --git a/test/DebugInfo/Inputs/test-parameters.o b/test/DebugInfo/Inputs/test-parameters.o Binary files differnew file mode 100644 index 0000000..7f4b670 --- /dev/null +++ b/test/DebugInfo/Inputs/test-parameters.o diff --git a/test/DebugInfo/X86/2010-08-10-DbgConstant.ll b/test/DebugInfo/X86/2010-08-10-DbgConstant.ll index 78f8750..e440df7 100644 --- a/test/DebugInfo/X86/2010-08-10-DbgConstant.ll +++ b/test/DebugInfo/X86/2010-08-10-DbgConstant.ll @@ -1,5 +1,5 @@ ; RUN: llc -mtriple=i686-linux -O0 -filetype=obj -o %t %s -; RUN: llvm-dwarfdump %t | FileCheck %s +; RUN: llvm-dwarfdump -debug-dump=info %t | FileCheck %s ; CHECK: DW_TAG_constant [4] ; CHECK-NEXT: DW_AT_name [DW_FORM_strp] ( .debug_str[0x0000002c] = "ro") diff --git a/test/DebugInfo/X86/2011-09-26-GlobalVarContext.ll b/test/DebugInfo/X86/2011-09-26-GlobalVarContext.ll index e514493..d682ab2 100644 --- a/test/DebugInfo/X86/2011-09-26-GlobalVarContext.ll +++ b/test/DebugInfo/X86/2011-09-26-GlobalVarContext.ll @@ -1,5 +1,5 @@ ; RUN: llc -mtriple=x86_64-pc-linux-gnu %s -o %t -filetype=obj -; RUN: llvm-dwarfdump %t | FileCheck %s +; RUN: llvm-dwarfdump -debug-dump=info %t | FileCheck %s ; ModuleID = 'test.c' @@ -19,10 +19,8 @@ declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone !llvm.dbg.cu = !{!0} !0 = metadata !{i32 720913, i32 0, i32 12, metadata !"test.c", metadata !"/work/llvm/vanilla/test/DebugInfo", metadata !"clang version 3.0 (trunk)", i1 true, i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !12} ; [ DW_TAG_compile_unit ] -!1 = metadata !{metadata !2} -!2 = metadata !{i32 0} -!3 = metadata !{metadata !4} -!4 = metadata !{metadata !5} +!1 = metadata !{i32 0} +!3 = metadata !{metadata !5} !5 = metadata !{i32 720942, i32 0, metadata !6, metadata !"f", metadata !"f", metadata !"", metadata !6, i32 3, metadata !7, i1 false, i1 true, i32 0, i32 0, i32 0, i32 0, i1 false, i32 ()* @f, null, null, metadata !10} ; [ DW_TAG_subprogram ] !6 = metadata !{i32 720937, metadata !"test.c", metadata !"/work/llvm/vanilla/test/DebugInfo", null} ; [ DW_TAG_file_type ] !7 = metadata !{i32 720917, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] @@ -30,8 +28,7 @@ declare void @llvm.dbg.declare(metadata, metadata) nounwind 
readnone !9 = metadata !{i32 720932, null, metadata !"int", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] !10 = metadata !{metadata !11} !11 = metadata !{i32 720932} ; [ DW_TAG_base_type ] -!12 = metadata !{metadata !13} -!13 = metadata !{metadata !14} +!12 = metadata !{metadata !14} !14 = metadata !{i32 720948, i32 0, null, metadata !"GLB", metadata !"GLB", metadata !"", metadata !6, i32 1, metadata !9, i32 0, i32 1, i32* @GLB} ; [ DW_TAG_variable ] !15 = metadata !{i32 721152, metadata !16, metadata !"LOC", metadata !6, i32 4, metadata !9, i32 0, i32 0} ; [ DW_TAG_auto_variable ] !16 = metadata !{i32 720907, metadata !5, i32 3, i32 9, metadata !6, i32 0} ; [ DW_TAG_lexical_block ] diff --git a/test/DebugInfo/X86/2011-12-16-BadStructRef.ll b/test/DebugInfo/X86/2011-12-16-BadStructRef.ll index 6e20169..ba8a763 100644 --- a/test/DebugInfo/X86/2011-12-16-BadStructRef.ll +++ b/test/DebugInfo/X86/2011-12-16-BadStructRef.ll @@ -1,5 +1,5 @@ ; RUN: llc -mtriple=x86_64-apple-macosx10.7 %s -o %t -filetype=obj -; RUN: llvm-dwarfdump %t | FileCheck %s +; RUN: llvm-dwarfdump -debug-dump=info %t | FileCheck %s ; CHECK: b_ref ; CHECK-NOT: AT_bit_size @@ -89,10 +89,8 @@ entry: !llvm.dbg.cu = !{!0} !0 = metadata !{i32 720913, i32 0, i32 4, metadata !"main.cpp", metadata !"/Users/echristo/tmp/bad-struct-ref", metadata !"clang version 3.1 (trunk 146596)", i1 true, i1 false, metadata !"", i32 0, metadata !1, metadata !3, metadata !27, metadata !1} ; [ DW_TAG_compile_unit ] -!1 = metadata !{metadata !2} -!2 = metadata !{i32 0} -!3 = metadata !{metadata !4} -!4 = metadata !{metadata !5, metadata !9} +!1 = metadata !{i32 0} +!3 = metadata !{metadata !5, metadata !9} !5 = metadata !{i32 720898, null, metadata !"bar", metadata !6, i32 9, i64 128, i64 64, i32 0, i32 0, null, metadata !7, i32 0, null, null} ; [ DW_TAG_class_type ] !6 = metadata !{i32 720937, metadata !"main.cpp", metadata !"/Users/echristo/tmp/bad-struct-ref", null} ; [ DW_TAG_file_type ] !7 = metadata !{metadata !8, metadata !19, metadata !21} @@ -115,8 +113,7 @@ entry: !24 = metadata !{i32 720911, i32 0, metadata !"", i32 0, i32 0, i64 64, i64 64, i64 0, i32 64, metadata !5} ; [ DW_TAG_pointer_type ] !25 = metadata !{metadata !26} !26 = metadata !{i32 720932} ; [ DW_TAG_base_type ] -!27 = metadata !{metadata !28} -!28 = metadata !{metadata !29, metadata !37, metadata !40, metadata !43, metadata !46} +!27 = metadata !{metadata !29, metadata !37, metadata !40, metadata !43, metadata !46} !29 = metadata !{i32 720942, i32 0, metadata !6, metadata !"main", metadata !"main", metadata !"", metadata !6, i32 17, metadata !30, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 false, i32 (i32, i8**)* @main, null, null, metadata !35} ; [ DW_TAG_subprogram ] !30 = metadata !{i32 720917, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !31, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] !31 = metadata !{metadata !12, metadata !12, metadata !32} diff --git a/test/DebugInfo/X86/DW_AT_byte_size.ll b/test/DebugInfo/X86/DW_AT_byte_size.ll index 25b5f00..ce3cf00 100644 --- a/test/DebugInfo/X86/DW_AT_byte_size.ll +++ b/test/DebugInfo/X86/DW_AT_byte_size.ll @@ -1,5 +1,5 @@ ; RUN: llc -mtriple=x86_64-apple-darwin %s -o %t -filetype=obj -; RUN: llvm-dwarfdump %t | FileCheck %s +; RUN: llvm-dwarfdump -debug-dump=all %t | FileCheck %s ; Checks that we don't emit a size for a pointer type. 
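; A note on the recurring RUN-line change in this patch: llvm-dwarfdump is now
; invoked as llvm-dwarfdump -debug-dump=<section>, so FileCheck only sees the
; one section a test actually cares about rather than a dump of every section.
; The values used across these tests are info, line, and all. A minimal sketch
; of the updated idiom (the triple and flags here are illustrative, not copied
; from any single test):
;   RUN: llc -mtriple=x86_64-apple-darwin %s -o %t -filetype=obj
;   RUN: llvm-dwarfdump -debug-dump=info %t | FileCheck %s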
; CHECK: DW_TAG_pointer_type @@ -25,10 +25,8 @@ declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone !llvm.dbg.cu = !{!0} !0 = metadata !{i32 786449, i32 0, i32 4, metadata !"foo.cpp", metadata !"/Users/echristo", metadata !"clang version 3.1 (trunk 150996)", i1 true, i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1} ; [ DW_TAG_compile_unit ] -!1 = metadata !{metadata !2} -!2 = metadata !{i32 0} -!3 = metadata !{metadata !4} -!4 = metadata !{metadata !5} +!1 = metadata !{i32 0} +!3 = metadata !{metadata !5} !5 = metadata !{i32 786478, i32 0, metadata !6, metadata !"foo", metadata !"foo", metadata !"_Z3fooP1A", metadata !6, i32 3, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (%struct.A*)* @_Z3fooP1A, null, null, metadata !14} ; [ DW_TAG_subprogram ] !6 = metadata !{i32 786473, metadata !"foo.cpp", metadata !"/Users/echristo", null} ; [ DW_TAG_file_type ] !7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] diff --git a/test/DebugInfo/X86/DW_AT_object_pointer.ll b/test/DebugInfo/X86/DW_AT_object_pointer.ll index b1fbbf7..789fceb 100644 --- a/test/DebugInfo/X86/DW_AT_object_pointer.ll +++ b/test/DebugInfo/X86/DW_AT_object_pointer.ll @@ -1,5 +1,5 @@ ; RUN: llc -mtriple=x86_64-apple-darwin %s -o %t -filetype=obj -; RUN: llvm-dwarfdump %t | FileCheck %s +; RUN: llvm-dwarfdump -debug-dump=info %t | FileCheck %s ; CHECK: DW_TAG_formal_parameter [ ; CHECK: DW_TAG_class_type @@ -48,10 +48,8 @@ entry: !llvm.dbg.cu = !{!0} !0 = metadata !{i32 786449, i32 0, i32 4, metadata !"bar.cpp", metadata !"/Users/echristo/debug-tests", metadata !"clang version 3.2 (trunk 163586) (llvm/trunk 163570)", i1 true, i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1} ; [ DW_TAG_compile_unit ] [/Users/echristo/debug-tests/bar.cpp] [DW_LANG_C_plus_plus] -!1 = metadata !{metadata !2} -!2 = metadata !{i32 0} -!3 = metadata !{metadata !4} -!4 = metadata !{metadata !5, metadata !10, metadata !20} +!1 = metadata !{i32 0} +!3 = metadata !{metadata !5, metadata !10, metadata !20} !5 = metadata !{i32 786478, i32 0, metadata !6, metadata !"foo", metadata !"foo", metadata !"_Z3fooi", metadata !6, i32 7, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (i32)* @_Z3fooi, null, null, metadata !1, i32 7} ; [ DW_TAG_subprogram ] [line 7] [def] [foo] !6 = metadata !{i32 786473, metadata !"bar.cpp", metadata !"/Users/echristo/debug-tests", null} ; [ DW_TAG_file_type ] !7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] diff --git a/test/DebugInfo/X86/DW_AT_specification.ll b/test/DebugInfo/X86/DW_AT_specification.ll index 078b740..67cfb47 100644 --- a/test/DebugInfo/X86/DW_AT_specification.ll +++ b/test/DebugInfo/X86/DW_AT_specification.ll @@ -1,5 +1,5 @@ ; RUN: llc -mtriple=x86_64-apple-darwin %s -o %t -filetype=obj -; RUN: llvm-dwarfdump %t | FileCheck %s +; RUN: llvm-dwarfdump -debug-dump=info %t | FileCheck %s ; test that the DW_AT_specification is a back edge in the file. 
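; For context on the "back edge" wording: a DW_AT_specification on an
; out-of-line member definition refers back to the declaration DIE inside the
; class, so the referenced offset precedes the referring DIE in .debug_info.
; A sketch with made-up offsets (not taken from this test's output):
;   0x0000002a:  DW_TAG_subprogram                       ; in-class declaration
;   ...
;   0x00000045:  DW_TAG_subprogram                       ; out-of-line definition
;                  DW_AT_specification => {0x0000002a}   ; points backwards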
@@ -17,10 +17,8 @@ entry: !llvm.dbg.cu = !{!0} !0 = metadata !{i32 720913, i32 0, i32 4, metadata !"<unknown>", metadata !"/Users/espindola/mozilla-central/obj-x86_64-apple-darwin11.2.0/toolkit/library", metadata !"clang version 3.0 ()", i1 true, i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !18} ; [ DW_TAG_compile_unit ] -!1 = metadata !{metadata !2} -!2 = metadata !{i32 0} -!3 = metadata !{metadata !4} -!4 = metadata !{metadata !5} +!1 = metadata !{i32 0} +!3 = metadata !{metadata !5} !5 = metadata !{i32 720942, i32 0, null, metadata !"bar", metadata !"bar", metadata !"_ZN3foo3barEv", metadata !6, i32 4, metadata !7, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 false, void ()* @_ZN3foo3barEv, null, metadata !11, metadata !16} ; [ DW_TAG_subprogram ] !6 = metadata !{i32 720937, metadata !"nsNativeAppSupportBase.ii", metadata !"/Users/espindola/mozilla-central/obj-x86_64-apple-darwin11.2.0/toolkit/library", null} ; [ DW_TAG_file_type ] !7 = metadata !{i32 720917, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] @@ -34,8 +32,7 @@ entry: !15 = metadata !{i32 720932} ; [ DW_TAG_base_type ] !16 = metadata !{metadata !17} !17 = metadata !{i32 720932} ; [ DW_TAG_base_type ] -!18 = metadata !{metadata !19} -!19 = metadata !{metadata !20} +!18 = metadata !{metadata !20} !20 = metadata !{i32 720948, i32 0, metadata !5, metadata !"x", metadata !"x", metadata !"", metadata !6, i32 5, metadata !21, i32 1, i32 1, i32* @_ZZN3foo3barEvE1x} ; [ DW_TAG_variable ] !21 = metadata !{i32 720934, null, metadata !"", null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !22} ; [ DW_TAG_const_type ] !22 = metadata !{i32 720932, null, metadata !"int", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] diff --git a/test/DebugInfo/X86/DW_TAG_friend.ll b/test/DebugInfo/X86/DW_TAG_friend.ll index a0dcec3..0671a40 100644 --- a/test/DebugInfo/X86/DW_TAG_friend.ll +++ b/test/DebugInfo/X86/DW_TAG_friend.ll @@ -1,5 +1,5 @@ ; RUN: llc -mtriple=x86_64-apple-darwin %s -o %t -filetype=obj -; RUN: llvm-dwarfdump %t | FileCheck %s +; RUN: llvm-dwarfdump -debug-dump=info %t | FileCheck %s ; Check that the friend tag is there and is followed by a DW_AT_friend that has a reference back. 
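; The metadata hunks below, like the ones in the other tests in this patch,
; all apply one mechanical cleanup: a level of indirection is removed from the
; compile unit's list operands, so a node that used to point at a wrapper node
; now holds the list members directly. Using this test's own nodes:
;   before:  !1 = metadata !{metadata !2}
;            !2 = metadata !{i32 0}
;            !3 = metadata !{metadata !4}
;            !4 = metadata !{metadata !5, metadata !17}
;   after:   !1 = metadata !{i32 0}
;            !3 = metadata !{metadata !5, metadata !17}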
@@ -18,10 +18,8 @@ !llvm.dbg.cu = !{!0} !0 = metadata !{i32 786449, i32 0, i32 4, metadata !"foo.cpp", metadata !"/Users/echristo/tmp", metadata !"clang version 3.1 (trunk 153413) (llvm/trunk 153428)", i1 true, i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !1, metadata !3} ; [ DW_TAG_compile_unit ] -!1 = metadata !{metadata !2} -!2 = metadata !{i32 0} -!3 = metadata !{metadata !4} -!4 = metadata !{metadata !5, metadata !17} +!1 = metadata !{i32 0} +!3 = metadata !{metadata !5, metadata !17} !5 = metadata !{i32 786484, i32 0, null, metadata !"a", metadata !"a", metadata !"", metadata !6, i32 10, metadata !7, i32 0, i32 1, %class.A* @a} ; [ DW_TAG_variable ] !6 = metadata !{i32 786473, metadata !"foo.cpp", metadata !"/Users/echristo/tmp", null} ; [ DW_TAG_file_type ] !7 = metadata !{i32 786434, null, metadata !"A", metadata !6, i32 1, i64 32, i64 32, i32 0, i32 0, null, metadata !8, i32 0, null, null} ; [ DW_TAG_class_type ] diff --git a/test/DebugInfo/X86/aligned_stack_var.ll b/test/DebugInfo/X86/aligned_stack_var.ll index 9e6c7ff..85027aa 100644 --- a/test/DebugInfo/X86/aligned_stack_var.ll +++ b/test/DebugInfo/X86/aligned_stack_var.ll @@ -1,5 +1,5 @@ ; RUN: llc %s -mtriple=x86_64-pc-linux-gnu -O0 -filetype=obj -o %t -; RUN: llvm-dwarfdump %t | FileCheck %s +; RUN: llvm-dwarfdump -debug-dump=info %t | FileCheck %s ; If stack is realigned, we shouldn't describe locations of local ; variables by giving offset from the frame pointer (%rbp): @@ -27,10 +27,8 @@ declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone !llvm.dbg.cu = !{!0} !0 = metadata !{i32 786449, i32 0, i32 4, metadata !"test.cc", metadata !"/home/samsonov/debuginfo", metadata !"clang version 3.2 (trunk 155696:155697) (llvm/trunk 155696)", i1 true, i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1} ; [ DW_TAG_compile_unit ] -!1 = metadata !{metadata !2} -!2 = metadata !{i32 0} -!3 = metadata !{metadata !4} -!4 = metadata !{metadata !5} +!1 = metadata !{i32 0} +!3 = metadata !{metadata !5} !5 = metadata !{i32 786478, i32 0, metadata !6, metadata !"run", metadata !"run", metadata !"_Z3runv", metadata !6, i32 1, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void ()* @_Z3runv, null, null, metadata !1, i32 1} ; [ DW_TAG_subprogram ] !6 = metadata !{i32 786473, metadata !"test.cc", metadata !"/home/samsonov/debuginfo", null} ; [ DW_TAG_file_type ] !7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] diff --git a/test/DebugInfo/X86/block-capture.ll b/test/DebugInfo/X86/block-capture.ll index 4953c42..a1ff6ac 100644 --- a/test/DebugInfo/X86/block-capture.ll +++ b/test/DebugInfo/X86/block-capture.ll @@ -1,5 +1,5 @@ ; RUN: llc -mtriple=x86_64-apple-darwin %s -o %t -filetype=obj -; RUN: llvm-dwarfdump %t | FileCheck %s +; RUN: llvm-dwarfdump -debug-dump=info %t | FileCheck %s ; Checks that we emit debug info for the block variable declare. ; CHECK: 0x00000030: DW_TAG_subprogram [3] @@ -63,10 +63,8 @@ declare i32 @__objc_personality_v0(...) 
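; On the aligned_stack_var.ll test above: when a local needs more alignment
; than the ABI stack guarantees, the prologue realigns the stack, the distance
; from %rbp to the variable stops being a compile-time constant, and an
; rbp-relative DW_AT_location would be wrong. A sketch of the kind of source
; that forces realignment (an assumption for illustration, not the test's
; actual source):
;   void run() {
;     int x __attribute__((aligned(64)));   // over-aligned local
;   }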
!llvm.module.flags = !{!35, !36, !37, !38} !0 = metadata !{i32 786449, i32 0, i32 16, metadata !"foo.m", metadata !"/Users/echristo", metadata !"clang version 3.1 (trunk 151227)", i1 true, i1 false, metadata !"", i32 2, metadata !1, metadata !1, metadata !3, metadata !1} ; [ DW_TAG_compile_unit ] -!1 = metadata !{metadata !2} -!2 = metadata !{i32 0} -!3 = metadata !{metadata !4} -!4 = metadata !{metadata !5, metadata !28, metadata !31, metadata !34} +!1 = metadata !{i32 0} +!3 = metadata !{metadata !5, metadata !28, metadata !31, metadata !34} !5 = metadata !{i32 786478, i32 0, metadata !6, metadata !"foo", metadata !"foo", metadata !"", metadata !6, i32 5, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, null, null, null, metadata !26} ; [ DW_TAG_subprogram ] !6 = metadata !{i32 786473, metadata !"foo.m", metadata !"/Users/echristo", null} ; [ DW_TAG_file_type ] !7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] diff --git a/test/DebugInfo/X86/concrete_out_of_line.ll b/test/DebugInfo/X86/concrete_out_of_line.ll index 58fb055..ef83a51 100644 --- a/test/DebugInfo/X86/concrete_out_of_line.ll +++ b/test/DebugInfo/X86/concrete_out_of_line.ll @@ -1,5 +1,5 @@ ; RUN: llc -mtriple=x86_64-linux %s -o %t -filetype=obj -; RUN: llvm-dwarfdump %t | FileCheck %s +; RUN: llvm-dwarfdump -debug-dump=info %t | FileCheck %s ; test that we add DW_AT_inline even when we only have concrete out of line ; instances. @@ -35,10 +35,8 @@ declare void @_Z8moz_freePv(i8*) !llvm.dbg.cu = !{!0} !0 = metadata !{i32 720913, i32 0, i32 4, metadata !"nsAutoRefCnt.cpp", metadata !"/Users/espindola/mozilla-central/obj-x86_64-apple-darwin11.2.0/netwerk/base/src", metadata !"clang version 3.1 ()", i1 true, i1 true, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !47} ; [ DW_TAG_compile_unit ] -!1 = metadata !{metadata !2} -!2 = metadata !{i32 0} -!3 = metadata !{metadata !4} -!4 = metadata !{metadata !5, metadata !23, metadata !27, metadata !31} +!1 = metadata !{i32 0} +!3 = metadata !{metadata !5, metadata !23, metadata !27, metadata !31} !5 = metadata !{i32 720942, i32 0, null, metadata !"Release", metadata !"Release", metadata !"_ZN17nsAutoRefCnt7ReleaseEv", metadata !6, i32 14, metadata !7, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, i32* null, null, metadata !12, metadata !20} ; [ DW_TAG_subprogram ] !6 = metadata !{i32 720937, metadata !"nsAutoRefCnt.ii", metadata !"/Users/espindola/mozilla-central/obj-x86_64-apple-darwin11.2.0/netwerk/base/src", null} ; [ DW_TAG_file_type ] !7 = metadata !{i32 720917, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] @@ -52,18 +50,14 @@ declare void @_Z8moz_freePv(i8*) !15 = metadata !{i32 720942, i32 0, metadata !13, metadata !"~nsAutoRefCnt", metadata !"~nsAutoRefCnt", metadata !"", metadata !6, i32 12, metadata !16, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 true, null, null, i32 0, metadata !18} ; [ DW_TAG_subprogram ] !16 = metadata !{i32 720917, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !17, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] !17 = metadata !{null, metadata !10} -!18 = metadata !{metadata !19} -!19 = metadata !{i32 720932} ; [ DW_TAG_base_type ] -!20 = metadata !{metadata !21} -!21 = metadata !{metadata !22} +!18 = metadata !{i32 720932} ; [ DW_TAG_base_type ] +!20 = metadata !{metadata !22} !22 = 
metadata !{i32 721153, metadata !5, metadata !"this", metadata !6, i32 16777230, metadata !10, i32 64, i32 0} ; [ DW_TAG_arg_variable ] !23 = metadata !{i32 720942, i32 0, null, metadata !"~nsAutoRefCnt", metadata !"~nsAutoRefCnt", metadata !"_ZN17nsAutoRefCntD1Ev", metadata !6, i32 18, metadata !16, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, i32* null, null, metadata !15, metadata !24} ; [ DW_TAG_subprogram ] -!24 = metadata !{metadata !25} -!25 = metadata !{metadata !26} +!24 = metadata !{metadata !26} !26 = metadata !{i32 721153, metadata !23, metadata !"this", metadata !6, i32 16777234, metadata !10, i32 64, i32 0} ; [ DW_TAG_arg_variable ] !27 = metadata !{i32 720942, i32 0, null, metadata !"~nsAutoRefCnt", metadata !"~nsAutoRefCnt", metadata !"_ZN17nsAutoRefCntD2Ev", metadata !6, i32 18, metadata !16, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, i32* null, null, metadata !15, metadata !28} ; [ DW_TAG_subprogram ] -!28 = metadata !{metadata !29} -!29 = metadata !{metadata !30} +!28 = metadata !{metadata !30} !30 = metadata !{i32 721153, metadata !27, metadata !"this", metadata !6, i32 16777234, metadata !10, i32 64, i32 0} ; [ DW_TAG_arg_variable ] !31 = metadata !{i32 720942, i32 0, null, metadata !"operator=", metadata !"operator=", metadata !"_ZN12nsAutoRefCntaSEi", metadata !6, i32 4, metadata !32, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, null, null, metadata !36, metadata !43} ; [ DW_TAG_subprogram ] !32 = metadata !{i32 720917, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !33, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] @@ -77,12 +71,10 @@ declare void @_Z8moz_freePv(i8*) !40 = metadata !{i32 720942, i32 0, metadata !37, metadata !"nsAutoRefCnt", metadata !"nsAutoRefCnt", metadata !"", metadata !6, i32 3, metadata !41, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 true, null, null, i32 0, metadata !18} ; [ DW_TAG_subprogram ] !41 = metadata !{i32 720917, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !42, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] !42 = metadata !{null, metadata !34} -!43 = metadata !{metadata !44} -!44 = metadata !{metadata !45, metadata !46} +!43 = metadata !{metadata !45, metadata !46} !45 = metadata !{i32 721153, metadata !31, metadata !"this", metadata !6, i32 16777220, metadata !34, i32 64, i32 0} ; [ DW_TAG_arg_variable ] !46 = metadata !{i32 721153, metadata !31, metadata !"aValue", metadata !6, i32 33554436, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ] -!47 = metadata !{metadata !48} -!48 = metadata !{metadata !49} +!47 = metadata !{metadata !49} !49 = metadata !{i32 720948, i32 0, null, metadata !"mRefCnt", metadata !"mRefCnt", metadata !"", metadata !6, i32 9, metadata !37, i32 0, i32 1, i32* null} ; [ DW_TAG_variable ] !50 = metadata !{i32 5, i32 5, metadata !51, metadata !52} !51 = metadata !{i32 720907, metadata !31, i32 4, i32 29, metadata !6, i32 2} ; [ DW_TAG_lexical_block ] diff --git a/test/DebugInfo/X86/dbg-value-inlined-parameter.ll b/test/DebugInfo/X86/dbg-value-inlined-parameter.ll index a09a7ea..c6052b1 100644 --- a/test/DebugInfo/X86/dbg-value-inlined-parameter.ll +++ b/test/DebugInfo/X86/dbg-value-inlined-parameter.ll @@ -1,7 +1,7 @@ ; RUN: llc -mtriple=x86_64-apple-darwin %s -filetype=obj -o %t -; RUN: llvm-dwarfdump %t | FileCheck %s +; RUN: llvm-dwarfdump -debug-dump=info %t | FileCheck %s ; RUN: llc -mtriple=x86_64-apple-darwin -regalloc=basic %s -filetype=obj -o %t -; RUN: llvm-dwarfdump %t | FileCheck %s +; 
RUN: llvm-dwarfdump -debug-dump=info %t | FileCheck %s ;CHECK: DW_TAG_inlined_subroutine [12] ;CHECK-NEXT: DW_AT_abstract_origin diff --git a/test/DebugInfo/X86/debug-info-static-member.ll b/test/DebugInfo/X86/debug-info-static-member.ll new file mode 100644 index 0000000..d7a6578 --- /dev/null +++ b/test/DebugInfo/X86/debug-info-static-member.ll @@ -0,0 +1,171 @@ +; RUN: llc %s -o %t -filetype=obj -O0 -mtriple=x86_64-unknown-linux-gnu +; RUN: llvm-dwarfdump -debug-dump=info %t | FileCheck %s -check-prefix=PRESENT +; RUN: llvm-dwarfdump -debug-dump=info %t | FileCheck %s -check-prefix=ABSENT +; Verify that attributes we do want are PRESENT; +; verify that attributes we don't want are ABSENT. +; It's a lot easier to do this in two passes than in one. +; PR14471 + +; LLVM IR generated using: clang -emit-llvm -S -g +; (with the Clang part of this patch applied). +; +; class C +; { +; static int a; +; const static bool const_a = true; +; protected: +; static int b; +; const static float const_b = 3.14; +; public: +; static int c; +; const static int const_c = 18; +; int d; +; }; +; +; int C::a = 4; +; int C::b = 2; +; int C::c = 1; +; +; int main() +; { +; C instance_C; +; instance_C.d = 8; +; return C::c; +; } + +%class.C = type { i32 } + +@_ZN1C1aE = global i32 4, align 4 +@_ZN1C1bE = global i32 2, align 4 +@_ZN1C1cE = global i32 1, align 4 + +define i32 @main() nounwind uwtable { +entry: + %retval = alloca i32, align 4 + %instance_C = alloca %class.C, align 4 + store i32 0, i32* %retval + call void @llvm.dbg.declare(metadata !{%class.C* %instance_C}, metadata !29), !dbg !30 + %d = getelementptr inbounds %class.C* %instance_C, i32 0, i32 0, !dbg !31 + store i32 8, i32* %d, align 4, !dbg !31 + %0 = load i32* @_ZN1C1cE, align 4, !dbg !32 + ret i32 %0, !dbg !32 +} + +declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone + +!llvm.dbg.cu = !{!0} + +!0 = metadata !{i32 786449, i32 0, i32 4, metadata !"debug-info-static-member.cpp", metadata !"/home/probinson/projects/upstream/static-member/test", metadata !"clang version 3.3 (trunk 171914)", i1 true, i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !10} ; [ DW_TAG_compile_unit ] [/home/probinson/projects/upstream/static-member/test/debug-info-static-member.cpp] [DW_LANG_C_plus_plus] +!1 = metadata !{i32 0} +!3 = metadata !{metadata !5} +!5 = metadata !{i32 786478, i32 0, metadata !6, metadata !"main", metadata !"main", metadata !"", metadata !6, i32 18, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @main, null, null, metadata !1, i32 23} ; [ DW_TAG_subprogram ] [line 18] [def] [scope 23] [main] +!6 = metadata !{i32 786473, metadata !"/usr/local/google/home/blaikie/Development/llvm/src/tools/clang/test/CodeGenCXX/debug-info-static-member.cpp", metadata !"/home/blaikie/local/Development/llvm/build/clang/x86-64/Debug/llvm", null} ; [ DW_TAG_file_type ] +!7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] +!8 = metadata !{metadata !9} +!9 = metadata !{i32 786468, null, metadata !"int", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed] +!10 = metadata !{metadata !12, metadata !27, metadata !28} +!12 = metadata !{i32 786484, i32 0, metadata !13, metadata !"a", metadata !"a", metadata !"_ZN1C1aE", metadata !6, i32 14, metadata !9, i32 0, i32 1, i32* 
@_ZN1C1aE, metadata !15} ; [ DW_TAG_variable ] [a] [line 14] [def] +!13 = metadata !{i32 786434, null, metadata !"C", metadata !6, i32 1, i64 32, i64 32, i32 0, i32 0, null, metadata !14, i32 0, null, null} ; [ DW_TAG_class_type ] [C] [line 1, size 32, align 32, offset 0] [from ] +!14 = metadata !{metadata !15, metadata !16, metadata !19, metadata !20, metadata !23, metadata !24, metadata !26} +!15 = metadata !{i32 786445, metadata !13, metadata !"a", metadata !6, i32 3, i64 0, i64 0, i64 0, i32 4097, metadata !9, null} ; [ DW_TAG_member ] [a] [line 3, size 0, align 0, offset 0] [private] [static] [from int] +!16 = metadata !{i32 786445, metadata !13, metadata !"const_a", metadata !6, i32 4, i64 0, i64 0, i64 0, i32 4097, metadata !17, i1 true} ; [ DW_TAG_member ] [const_a] [line 4, size 0, align 0, offset 0] [private] [static] [from ] +!17 = metadata !{i32 786470, null, metadata !"", null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !18} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from bool] +!18 = metadata !{i32 786468, null, metadata !"bool", null, i32 0, i64 8, i64 8, i64 0, i32 0, i32 2} ; [ DW_TAG_base_type ] [bool] [line 0, size 8, align 8, offset 0, enc DW_ATE_boolean] +!19 = metadata !{i32 786445, metadata !13, metadata !"b", metadata !6, i32 6, i64 0, i64 0, i64 0, i32 4098, metadata !9, null} ; [ DW_TAG_member ] [b] [line 6, size 0, align 0, offset 0] [protected] [static] [from int] +!20 = metadata !{i32 786445, metadata !13, metadata !"const_b", metadata !6, i32 7, i64 0, i64 0, i64 0, i32 4098, metadata !21, float 0x40091EB860000000} ; [ DW_TAG_member ] [const_b] [line 7, size 0, align 0, offset 0] [protected] [static] [from ] +!21 = metadata !{i32 786470, null, metadata !"", null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !22} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from float] +!22 = metadata !{i32 786468, null, metadata !"float", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ] [float] [line 0, size 32, align 32, offset 0, enc DW_ATE_float] +!23 = metadata !{i32 786445, metadata !13, metadata !"c", metadata !6, i32 9, i64 0, i64 0, i64 0, i32 4096, metadata !9, null} ; [ DW_TAG_member ] [c] [line 9, size 0, align 0, offset 0] [static] [from int] +!24 = metadata !{i32 786445, metadata !13, metadata !"const_c", metadata !6, i32 10, i64 0, i64 0, i64 0, i32 4096, metadata !25, i32 18} ; [ DW_TAG_member ] [const_c] [line 10, size 0, align 0, offset 0] [static] [from ] +!25 = metadata !{i32 786470, null, metadata !"", null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !9} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from int] +!26 = metadata !{i32 786445, metadata !13, metadata !"d", metadata !6, i32 11, i64 32, i64 32, i64 0, i32 0, metadata !9} ; [ DW_TAG_member ] [d] [line 11, size 32, align 32, offset 0] [from int] +!27 = metadata !{i32 786484, i32 0, metadata !13, metadata !"b", metadata !"b", metadata !"_ZN1C1bE", metadata !6, i32 15, metadata !9, i32 0, i32 1, i32* @_ZN1C1bE, metadata !19} ; [ DW_TAG_variable ] [b] [line 15] [def] +!28 = metadata !{i32 786484, i32 0, metadata !13, metadata !"c", metadata !"c", metadata !"_ZN1C1cE", metadata !6, i32 16, metadata !9, i32 0, i32 1, i32* @_ZN1C1cE, metadata !23} ; [ DW_TAG_variable ] [c] [line 16] [def] +!29 = metadata !{i32 786688, metadata !5, metadata !"instance_C", metadata !6, i32 20, metadata !13, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [instance_C] [line 20] +!30 = metadata !{i32 20, i32 0, metadata !5, null} +!31 = metadata !{i32 21, i32 
0, metadata !5, null} +!32 = metadata !{i32 22, i32 0, metadata !5, null} +; PRESENT verifies that static member declarations have these attributes: +; external, declaration, accessibility, and either DW_AT_MIPS_linkage_name +; (for variables) or DW_AT_const_value (for constants). +; +; PRESENT: .debug_info contents: +; PRESENT: DW_TAG_class_type +; PRESENT-NEXT: DW_AT_name {{.*}} "C" +; PRESENT: 0x[[DECL_A:[0-9a-f]+]]: DW_TAG_member +; PRESENT-NEXT: DW_AT_name {{.*}} "a" +; PRESENT: DW_AT_external +; PRESENT: DW_AT_declaration +; PRESENT: DW_AT_accessibility [DW_FORM_data1] (0x03) +; PRESENT: DW_AT_MIPS_linkage_name {{.*}} "_ZN1C1aE" +; PRESENT: DW_TAG_member +; PRESENT-NEXT: DW_AT_name {{.*}} "const_a" +; PRESENT: DW_AT_external +; PRESENT: DW_AT_declaration +; PRESENT: DW_AT_accessibility [DW_FORM_data1] (0x03) +; PRESENT: DW_AT_const_value {{.*}} (1) +; PRESENT: 0x[[DECL_B:[0-9a-f]+]]: DW_TAG_member +; PRESENT-NEXT: DW_AT_name {{.*}} "b" +; PRESENT: DW_AT_accessibility [DW_FORM_data1] (0x02) +; PRESENT: DW_AT_MIPS_linkage_name {{.*}} "_ZN1C1bE" +; PRESENT: DW_TAG_member +; PRESENT-NEXT: DW_AT_name {{.*}} "const_b" +; PRESENT: DW_AT_accessibility [DW_FORM_data1] (0x02) +; PRESENT: DW_AT_const_value {{.*}} (0x4048f5c3) +; PRESENT: 0x[[DECL_C:[0-9a-f]+]]: DW_TAG_member +; PRESENT-NEXT: DW_AT_name {{.*}} "c" +; PRESENT: DW_AT_accessibility [DW_FORM_data1] (0x01) +; PRESENT: DW_AT_MIPS_linkage_name {{.*}} "_ZN1C1cE" +; PRESENT: DW_TAG_member +; PRESENT-NEXT: DW_AT_name {{.*}} "const_c" +; PRESENT: DW_AT_accessibility [DW_FORM_data1] (0x01) +; PRESENT: DW_AT_const_value {{.*}} (0x00000012) +; While we're here, a normal member has data_member_location and +; accessibility attributes. +; PRESENT: DW_TAG_member +; PRESENT-NEXT: DW_AT_name {{.*}} "d" +; PRESENT: DW_AT_data_member_location +; PRESENT: DW_AT_accessibility [DW_FORM_data1] (0x01) +; PRESENT: NULL +; Definitions point back to their declarations, and have a location. +; PRESENT: DW_TAG_variable +; PRESENT-NEXT: DW_AT_specification {{.*}} {0x[[DECL_A]]} +; PRESENT-NEXT: DW_AT_location +; PRESENT: DW_TAG_variable +; PRESENT-NEXT: DW_AT_specification {{.*}} {0x[[DECL_B]]} +; PRESENT-NEXT: DW_AT_location +; PRESENT: DW_TAG_variable +; PRESENT-NEXT: DW_AT_specification {{.*}} {0x[[DECL_C]]} +; PRESENT-NEXT: DW_AT_location + +; ABSENT verifies that static member declarations do not have either +; DW_AT_location or DW_AT_data_member_location; also, variables do not +; have DW_AT_const_value and constants do not have DW_AT_MIPS_linkage_name. +; +; ABSENT: .debug_info contents: +; ABSENT: DW_TAG_member +; ABSENT: DW_AT_name {{.*}} "a" +; ABSENT-NOT: DW_AT_const_value +; ABSENT-NOT: location +; ABSENT: DW_AT_name {{.*}} "const_a" +; ABSENT-NOT: DW_AT_MIPS_linkage_name +; ABSENT-NOT: location +; ABSENT: DW_AT_name {{.*}} "b" +; ABSENT-NOT: DW_AT_const_value +; ABSENT-NOT: location +; ABSENT: DW_AT_name {{.*}} "const_b" +; ABSENT-NOT: DW_AT_MIPS_linkage_name +; ABSENT-NOT: location +; ABSENT: DW_AT_name {{.*}} "c" +; ABSENT-NOT: DW_AT_const_value +; ABSENT-NOT: location +; ABSENT: DW_AT_name {{.*}} "const_c" +; ABSENT-NOT: DW_AT_MIPS_linkage_name +; ABSENT-NOT: location +; While we're here, a normal member does not have a linkage name, constant +; value, or DW_AT_location. 
+; ABSENT: DW_AT_name {{.*}} "d" +; ABSENT-NOT: DW_AT_MIPS_linkage_name +; ABSENT-NOT: DW_AT_const_value +; ABSENT-NOT: DW_AT_location +; ABSENT: NULL diff --git a/test/DebugInfo/X86/elf-names.ll b/test/DebugInfo/X86/elf-names.ll index b6a263d..f4df0b7 100644 --- a/test/DebugInfo/X86/elf-names.ll +++ b/test/DebugInfo/X86/elf-names.ll @@ -1,5 +1,5 @@ ; RUN: llc -mtriple=x86_64-unknown-linux-gnu %s -o %t -filetype=obj -; RUN: llvm-dwarfdump %t | FileCheck %s +; RUN: llvm-dwarfdump -debug-dump=info %t | FileCheck %s ; RUN: llvm-as < %s | llvm-dis | FileCheck --check-prefix=CHECK-DIS %s ; CHECK: 0x0000000b: DW_TAG_compile_unit @@ -59,10 +59,8 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone !llvm.dbg.cu = !{!0} !0 = metadata !{i32 786449, i32 0, i32 4, metadata !"foo.cpp", metadata !"/usr/local/google/home/echristo", metadata !"clang version 3.2 (trunk 167506) (llvm/trunk 167505)", i1 true, i1 true, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/foo.cpp] [DW_LANG_C_plus_plus] -!1 = metadata !{metadata !2} -!2 = metadata !{i32 0} -!3 = metadata !{metadata !4} -!4 = metadata !{metadata !5, metadata !31} +!1 = metadata !{i32 0} +!3 = metadata !{metadata !5, metadata !31} !5 = metadata !{i32 786478, i32 0, null, metadata !"D", metadata !"D", metadata !"_ZN1DC2Ev", metadata !6, i32 12, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, void (%class.D*)* @_ZN1DC2Ev, null, metadata !17, metadata !27, i32 12} ; [ DW_TAG_subprogram ] [line 12] [def] [D] !6 = metadata !{i32 786473, metadata !"foo.cpp", metadata !"/usr/local/google/home/echristo", null} ; [ DW_TAG_file_type ] !7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] diff --git a/test/DebugInfo/X86/empty-and-one-elem-array.ll b/test/DebugInfo/X86/empty-and-one-elem-array.ll index 0744c6b..c3bdbc4 100644 --- a/test/DebugInfo/X86/empty-and-one-elem-array.ll +++ b/test/DebugInfo/X86/empty-and-one-elem-array.ll @@ -1,5 +1,5 @@ ; RUN: llc -mtriple=x86_64-apple-darwin -O0 -filetype=obj -o %t < %s -; RUN: llvm-dwarfdump %t | FileCheck %s +; RUN: llvm-dwarfdump -debug-dump=info %t | FileCheck %s ; <rdar://problem/12566646> %struct.foo = type { i32, [1 x i32] } @@ -60,10 +60,8 @@ declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone !llvm.dbg.cu = !{!0} !0 = metadata !{i32 786449, i32 0, i32 12, metadata !"test.c", metadata !"/Volumes/Sandbox/llvm", metadata !"clang version 3.3 (trunk 169136)", i1 true, i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1} ; [ DW_TAG_compile_unit ] [/Volumes/Sandbox/llvm/test.c] [DW_LANG_C99] -!1 = metadata !{metadata !2} -!2 = metadata !{i32 0} -!3 = metadata !{metadata !4} -!4 = metadata !{metadata !5} +!1 = metadata !{i32 0} +!3 = metadata !{metadata !5} !5 = metadata !{i32 786478, i32 0, metadata !6, metadata !"func", metadata !"func", metadata !"", metadata !6, i32 11, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, i32 ()* @func, null, null, metadata !1, i32 11} ; [ DW_TAG_subprogram ] [line 11] [def] [func] !6 = metadata !{i32 786473, metadata !"test.c", metadata !"/Volumes/Sandbox/llvm", null} ; [ DW_TAG_file_type ] !7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, 
size 0, align 0, offset 0] [from ] diff --git a/test/DebugInfo/X86/empty-array.ll b/test/DebugInfo/X86/empty-array.ll index dd5c636..b4621fb 100644 --- a/test/DebugInfo/X86/empty-array.ll +++ b/test/DebugInfo/X86/empty-array.ll @@ -1,5 +1,5 @@ ; RUN: llc -mtriple=x86_64-apple-darwin -O0 -filetype=obj -o %t < %s -; RUN: llvm-dwarfdump %t | FileCheck %s +; RUN: llvm-dwarfdump -debug-dump=info %t | FileCheck %s ; <rdar://problem/12566646> %class.A = type { [0 x i32] } @@ -25,10 +25,8 @@ !llvm.dbg.cu = !{!0} !0 = metadata !{i32 786449, i32 0, i32 4, metadata !"t.cpp", metadata !"/Volumes/Sandbox/llvm", metadata !"clang version 3.3 (trunk 169136)", i1 true, i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !1, metadata !3} ; [ DW_TAG_compile_unit ] [/Volumes/Sandbox/llvm/t.cpp] [DW_LANG_C_plus_plus] -!1 = metadata !{metadata !2} -!2 = metadata !{i32 0} -!3 = metadata !{metadata !4} -!4 = metadata !{metadata !5} +!1 = metadata !{i32 0} +!3 = metadata !{metadata !5} !5 = metadata !{i32 786484, i32 0, null, metadata !"a", metadata !"a", metadata !"", metadata !6, i32 1, metadata !7, i32 0, i32 1, %class.A* @a} ; [ DW_TAG_variable ] [a] [line 1] [def] !6 = metadata !{i32 786473, metadata !"t.cpp", metadata !"/Volumes/Sandbox/llvm", null} ; [ DW_TAG_file_type ] !7 = metadata !{i32 786434, null, metadata !"A", metadata !6, i32 1, i64 0, i64 32, i32 0, i32 0, null, metadata !8, i32 0, null, null} ; [ DW_TAG_class_type ] [A] [line 1, size 0, align 32, offset 0] [from ] diff --git a/test/DebugInfo/X86/ending-run.ll b/test/DebugInfo/X86/ending-run.ll index 6935c47..3813e1f 100644 --- a/test/DebugInfo/X86/ending-run.ll +++ b/test/DebugInfo/X86/ending-run.ll @@ -1,5 +1,5 @@ ; RUN: llc -mtriple=x86_64-apple-darwin %s -o %t -filetype=obj -; RUN: llvm-dwarfdump %t | FileCheck %s +; RUN: llvm-dwarfdump -debug-dump=line %t | FileCheck %s ; Check that the line table starts at 7, not 4, but that the first ; statement isn't until line 8. 
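; A note on the two-pass idiom in debug-info-static-member.ll above: the same
; dump is piped through FileCheck twice with different -check-prefix values,
; so the positive checks (PRESENT) and the negative checks (ABSENT plus
; ABSENT-NOT) can each be written as a simple linear pattern instead of one
; interleaved pass. The pattern, in sketch form:
;   RUN: llvm-dwarfdump -debug-dump=info %t | FileCheck %s -check-prefix=PRESENT
;   RUN: llvm-dwarfdump -debug-dump=info %t | FileCheck %s -check-prefix=ABSENT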
@@ -29,10 +29,8 @@ declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone !llvm.dbg.cu = !{!0} !0 = metadata !{i32 786449, i32 0, i32 12, metadata !"ending-run.c", metadata !"/Users/echristo/tmp", metadata !"clang version 3.1 (trunk 153921) (llvm/trunk 153916)", i1 true, i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1} ; [ DW_TAG_compile_unit ] -!1 = metadata !{metadata !2} -!2 = metadata !{i32 0} -!3 = metadata !{metadata !4} -!4 = metadata !{metadata !5} +!1 = metadata !{i32 0} +!3 = metadata !{metadata !5} !5 = metadata !{i32 786478, i32 0, metadata !6, metadata !"callee", metadata !"callee", metadata !"", metadata !6, i32 4, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, i32 (i32)* @callee, null, null, metadata !10, i32 7} ; [ DW_TAG_subprogram ] !6 = metadata !{i32 786473, metadata !"ending-run.c", metadata !"/Users/echristo/tmp", null} ; [ DW_TAG_file_type ] !7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] diff --git a/test/DebugInfo/X86/enum-class.ll b/test/DebugInfo/X86/enum-class.ll index 6eb715d..d129603 100644 --- a/test/DebugInfo/X86/enum-class.ll +++ b/test/DebugInfo/X86/enum-class.ll @@ -1,5 +1,5 @@ ; RUN: llc -O0 -mtriple=x86_64-apple-darwin %s -o %t -filetype=obj -; RUN: llvm-dwarfdump %t | FileCheck %s +; RUN: llvm-dwarfdump -debug-dump=info %t | FileCheck %s @a = global i32 0, align 4 @b = global i64 0, align 8 @@ -8,8 +8,7 @@ !llvm.dbg.cu = !{!0} !0 = metadata !{i32 786449, i32 0, i32 4, metadata !"foo.cpp", metadata !"/Users/echristo/tmp", metadata !"clang version 3.2 (trunk 157269) (llvm/trunk 157264)", i1 true, i1 false, metadata !"", i32 0, metadata !1, metadata !15, metadata !15, metadata !17} ; [ DW_TAG_compile_unit ] -!1 = metadata !{metadata !2} -!2 = metadata !{metadata !3, metadata !8, metadata !12} +!1 = metadata !{metadata !3, metadata !8, metadata !12} !3 = metadata !{i32 786436, null, metadata !"A", metadata !4, i32 1, i64 32, i64 32, i32 0, i32 0, metadata !5, metadata !6, i32 0, i32 0} ; [ DW_TAG_enumeration_type ] !4 = metadata !{i32 786473, metadata !"foo.cpp", metadata !"/Users/echristo/tmp", null} ; [ DW_TAG_file_type ] !5 = metadata !{i32 786468, null, metadata !"int", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] @@ -22,10 +21,8 @@ !12 = metadata !{i32 786436, null, metadata !"C", metadata !4, i32 3, i64 32, i64 32, i32 0, i32 0, null, metadata !13, i32 0, i32 0} ; [ DW_TAG_enumeration_type ] !13 = metadata !{metadata !14} !14 = metadata !{i32 786472, metadata !"C1", i64 1} ; [ DW_TAG_enumerator ] -!15 = metadata !{metadata !16} -!16 = metadata !{i32 0} -!17 = metadata !{metadata !18} -!18 = metadata !{metadata !19, metadata !20, metadata !21} +!15 = metadata !{i32 0} +!17 = metadata !{metadata !19, metadata !20, metadata !21} !19 = metadata !{i32 786484, i32 0, null, metadata !"a", metadata !"a", metadata !"", metadata !4, i32 4, metadata !3, i32 0, i32 1, i32* @a} ; [ DW_TAG_variable ] !20 = metadata !{i32 786484, i32 0, null, metadata !"b", metadata !"b", metadata !"", metadata !4, i32 5, metadata !8, i32 0, i32 1, i64* @b} ; [ DW_TAG_variable ] !21 = metadata !{i32 786484, i32 0, null, metadata !"c", metadata !"c", metadata !"", metadata !4, i32 6, metadata !12, i32 0, i32 1, i32* @c} ; [ DW_TAG_variable ] diff --git a/test/DebugInfo/X86/enum-fwd-decl.ll b/test/DebugInfo/X86/enum-fwd-decl.ll index 0902430..04ac8e4 100644 --- 
a/test/DebugInfo/X86/enum-fwd-decl.ll +++ b/test/DebugInfo/X86/enum-fwd-decl.ll @@ -1,15 +1,13 @@ ; RUN: llc -O0 -mtriple=x86_64-apple-darwin %s -o %t -filetype=obj -; RUN: llvm-dwarfdump %t | FileCheck %s +; RUN: llvm-dwarfdump -debug-dump=info %t | FileCheck %s @e = global i16 0, align 2 !llvm.dbg.cu = !{!0} !0 = metadata !{i32 786449, i32 0, i32 4, metadata !"foo.cpp", metadata !"/tmp", metadata !"clang version 3.2 (trunk 165274) (llvm/trunk 165272)", i1 true, i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !1, metadata !3} ; [ DW_TAG_compile_unit ] [/tmp/foo.cpp] [DW_LANG_C_plus_plus] -!1 = metadata !{metadata !2} -!2 = metadata !{i32 0} -!3 = metadata !{metadata !4} -!4 = metadata !{metadata !5} +!1 = metadata !{i32 0} +!3 = metadata !{metadata !5} !5 = metadata !{i32 786484, i32 0, null, metadata !"e", metadata !"e", metadata !"", metadata !6, i32 2, metadata !7, i32 0, i32 1, i16* @e} ; [ DW_TAG_variable ] [e] [line 2] [def] !6 = metadata !{i32 786473, metadata !"foo.cpp", metadata !"/tmp", null} ; [ DW_TAG_file_type ] !7 = metadata !{i32 786436, null, metadata !"E", metadata !6, i32 1, i64 16, i64 16, i32 0, i32 4, null, null, i32 0} ; [ DW_TAG_enumeration_type ] [E] [line 1, size 16, align 16, offset 0] [fwd] [from ] diff --git a/test/DebugInfo/X86/fission-cu.ll b/test/DebugInfo/X86/fission-cu.ll index 3ada3ef..d0ae6c7 100644 --- a/test/DebugInfo/X86/fission-cu.ll +++ b/test/DebugInfo/X86/fission-cu.ll @@ -1,15 +1,13 @@ ; RUN: llc -split-dwarf=Enable -O0 %s -mtriple=x86_64-unknown-linux-gnu -filetype=obj -o %t -; RUN: llvm-dwarfdump %t | FileCheck %s +; RUN: llvm-dwarfdump -debug-dump=all %t | FileCheck %s @a = common global i32 0, align 4 !llvm.dbg.cu = !{!0} -!0 = metadata !{i32 786449, i32 0, i32 12, metadata !"baz.c", metadata !"/usr/local/google/home/echristo/tmp", metadata !"clang version 3.3 (trunk 169021) (llvm/trunk 169020)", i1 true, i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !1, metadata !3} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/tmp/baz.c] [DW_LANG_C99] -!1 = metadata !{metadata !2} -!2 = metadata !{i32 0} -!3 = metadata !{metadata !4} -!4 = metadata !{metadata !5} +!0 = metadata !{i32 786449, i32 0, i32 12, metadata !"baz.c", metadata !"/usr/local/google/home/echristo/tmp", metadata !"clang version 3.3 (trunk 169021) (llvm/trunk 169020)", i1 true, i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !1, metadata !3, metadata !"baz.dwo"} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/tmp/baz.c] [DW_LANG_C99] +!1 = metadata !{i32 0} +!3 = metadata !{metadata !5} !5 = metadata !{i32 786484, i32 0, null, metadata !"a", metadata !"a", metadata !"", metadata !6, i32 1, metadata !7, i32 0, i32 1, i32* @a} ; [ DW_TAG_variable ] [a] [line 1] [def] !6 = metadata !{i32 786473, metadata !"baz.c", metadata !"/usr/local/google/home/echristo/tmp", null} ; [ DW_TAG_file_type ] !7 = metadata !{i32 786468, null, metadata !"int", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed] @@ -19,21 +17,83 @@ ; DW_AT_low_pc, DW_AT_high_pc, DW_AT_ranges, DW_AT_dwo_name, DW_AT_dwo_id, ; DW_AT_ranges_base, DW_AT_addr_base. 
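; In the split-DWARF (fission) layout the next test exercises, the object file
; keeps only a skeleton compile unit in .debug_info: enough to locate and
; validate the .dwo (DW_AT_GNU_dwo_name, DW_AT_GNU_dwo_id, DW_AT_GNU_addr_base)
; plus DW_AT_low_pc, DW_AT_stmt_list, and DW_AT_comp_dir. The full DIE tree
; moves to the .dwo sections, where strings and addresses are referenced
; indirectly. The same DW_AT_GNU_dwo_id appears in both units so a consumer
; can confirm the .dwo it found matches the skeleton. Roughly:
;   .debug_info        skeleton CU  (dwo_name "baz.dwo", dwo_id, addr_base, ...)
;   .debug_info.dwo    full CU      (producer, name, variables, types)
;   .debug_str.dwo / .debug_str_offsets.dwo   indexed string storage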
+; CHECK: .debug_abbrev contents: +; CHECK: Abbrev table for offset: 0x00000000 +; CHECK: [1] DW_TAG_compile_unit DW_CHILDREN_no +; CHECK: DW_AT_GNU_dwo_name DW_FORM_strp +; CHECK: DW_AT_GNU_dwo_id DW_FORM_data8 +; CHECK: DW_AT_GNU_addr_base DW_FORM_sec_offset +; CHECK: DW_AT_low_pc DW_FORM_addr +; CHECK: DW_AT_stmt_list DW_FORM_sec_offset +; CHECK: DW_AT_comp_dir DW_FORM_strp + ; CHECK: .debug_info contents: ; CHECK: DW_TAG_compile_unit -; CHECK: DW_AT_GNU_dwo_name [DW_FORM_strp] ( .debug_str[0x00000000] = "baz.c") +; CHECK: DW_AT_GNU_dwo_name [DW_FORM_strp] ( .debug_str[0x00000000] = "baz.dwo") +; CHECK: DW_AT_GNU_dwo_id [DW_FORM_data8] (0x0000000000000000) +; CHECK: DW_AT_GNU_addr_base [DW_FORM_sec_offset] (0x00000000) ; CHECK: DW_AT_low_pc [DW_FORM_addr] (0x0000000000000000) -; CHECK: DW_AT_stmt_list [DW_FORM_data4] (0x00000000) -; CHECK: DW_AT_comp_dir [DW_FORM_strp] ( .debug_str[0x00000006] = "/usr/local/google/home/echristo/tmp") +; CHECK: DW_AT_stmt_list [DW_FORM_sec_offset] (0x00000000) +; CHECK: DW_AT_comp_dir [DW_FORM_strp] ( .debug_str[0x00000008] = "/usr/local/google/home/echristo/tmp") + +; CHECK: .debug_str contents: +; CHECK: 0x00000000: "baz.dwo" +; CHECK: 0x00000008: "/usr/local/google/home/echristo/tmp" + +; Check that we're using the right forms. +; CHECK: .debug_abbrev.dwo contents: +; CHECK: Abbrev table for offset: 0x00000000 +; CHECK: [1] DW_TAG_compile_unit DW_CHILDREN_yes +; CHECK: DW_AT_producer DW_FORM_GNU_str_index +; CHECK: DW_AT_language DW_FORM_data2 +; CHECK: DW_AT_name DW_FORM_GNU_str_index +; CHECK: DW_AT_low_pc DW_FORM_GNU_addr_index +; CHECK: DW_AT_stmt_list DW_FORM_data4 +; CHECK: DW_AT_comp_dir DW_FORM_GNU_str_index +; CHECK: DW_AT_GNU_dwo_id DW_FORM_data8 + +; CHECK: [2] DW_TAG_base_type DW_CHILDREN_no +; CHECK: DW_AT_name DW_FORM_GNU_str_index +; CHECK: DW_AT_encoding DW_FORM_data1 +; CHECK: DW_AT_byte_size DW_FORM_data1 + +; CHECK: [3] DW_TAG_variable DW_CHILDREN_no +; CHECK: DW_AT_name DW_FORM_GNU_str_index +; CHECK: DW_AT_type DW_FORM_ref4 +; CHECK: DW_AT_external DW_FORM_flag_present +; CHECK: DW_AT_decl_file DW_FORM_data1 +; CHECK: DW_AT_decl_line DW_FORM_data1 +; CHECK: DW_AT_location DW_FORM_block1 ; Check that the rest of the compile units have information. -; FIXME: Strings will ultimately be a different form. 
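; How DW_FORM_GNU_str_index resolves, worked through with the values the
; checks below expect (assuming the usual two-step lookup with 4-byte offset
; entries): the form stores an index; entry index*4 in .debug_str_offsets.dwo
; holds an offset into .debug_str.dwo, where the string itself lives.
;   "a"   : index 3 -> str_offsets entry at 0x0c = 0x0000005f -> "a"
;   "int" : index 4 -> str_offsets entry at 0x10 = 0x00000061 -> "int"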
; CHECK: .debug_info.dwo contents: ; CHECK: DW_TAG_compile_unit ; CHECK: DW_AT_producer [DW_FORM_GNU_str_index] ( indexed (00000000) string = "clang version 3.3 (trunk 169021) (llvm/trunk 169020)") ; CHECK: DW_AT_language [DW_FORM_data2] (0x000c) ; CHECK: DW_AT_name [DW_FORM_GNU_str_index] ( indexed (00000001) string = "baz.c") +; CHECK: DW_AT_low_pc [DW_FORM_GNU_addr_index] ( indexed (00000000) address = 0x0000000000000000) +; CHECK: DW_AT_GNU_dwo_id [DW_FORM_data8] (0x0000000000000000) ; CHECK: DW_TAG_base_type ; CHECK: DW_AT_name [DW_FORM_GNU_str_index] ( indexed (00000004) string = "int") ; CHECK: DW_TAG_variable ; CHECK: DW_AT_name [DW_FORM_GNU_str_index] ( indexed (00000003) string = "a") +; CHECK: DW_AT_type [DW_FORM_ref4] (cu + 0x001e => {0x0000001e}) +; CHECK: DW_AT_external [DW_FORM_flag_present] (true) +; CHECK: DW_AT_decl_file [DW_FORM_data1] (0x01) +; CHECK: DW_AT_decl_line [DW_FORM_data1] (0x01) +; CHECK: DW_AT_location [DW_FORM_block1] (<0x02> fb 01 ) + + +; CHECK: .debug_str.dwo contents: +; CHECK: 0x00000000: "clang version 3.3 (trunk 169021) (llvm/trunk 169020)" +; CHECK: 0x00000035: "baz.c" +; CHECK: 0x0000003b: "/usr/local/google/home/echristo/tmp" +; CHECK: 0x0000005f: "a" +; CHECK: 0x00000061: "int" + +; CHECK: .debug_str_offsets.dwo contents: +; CHECK: 0x00000000: 00000000 +; CHECK: 0x00000004: 00000035 +; CHECK: 0x00000008: 0000003b +; CHECK: 0x0000000c: 0000005f +; CHECK: 0x00000010: 00000061 diff --git a/test/DebugInfo/X86/line-info.ll b/test/DebugInfo/X86/line-info.ll new file mode 100644 index 0000000..92dd072 --- /dev/null +++ b/test/DebugInfo/X86/line-info.ll @@ -0,0 +1,52 @@ +; RUN: llc -mtriple=x86_64-apple-darwin -filetype=obj -O0 < %s > %t +; RUN: llvm-dwarfdump %t | FileCheck %s + +; CHECK: 2 0 1 0 is_stmt + +; IR generated from clang -g -emit-llvm with the following source: +; list0.h: +; int foo (int x) { +; return ++x; +; } +; list0.c: +; #include "list0.h" +; int main() { +; } + +define i32 @foo(i32 %x) nounwind uwtable { +entry: + %x.addr = alloca i32, align 4 + store i32 %x, i32* %x.addr, align 4 + call void @llvm.dbg.declare(metadata !{i32* %x.addr}, metadata !14), !dbg !15 + %0 = load i32* %x.addr, align 4, !dbg !16 + %inc = add nsw i32 %0, 1, !dbg !16 + store i32 %inc, i32* %x.addr, align 4, !dbg !16 + ret i32 %inc, !dbg !16 +} + +declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone + +define i32 @main() nounwind uwtable { +entry: + ret i32 0, !dbg !17 +} + +!llvm.dbg.cu = !{!0} + +!0 = metadata !{i32 786449, i32 0, i32 12, metadata !"list0.c", metadata !"/usr/local/google/home/blaikie/dev/scratch/pr14566", metadata !"clang version 3.3 ", i1 true, i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/blaikie/dev/scratch/pr14566/list0.c] [DW_LANG_C99] +!1 = metadata !{i32 0} +!3 = metadata !{metadata !5, metadata !10} +!5 = metadata !{i32 786478, i32 0, metadata !6, metadata !"foo", metadata !"foo", metadata !"", metadata !6, i32 1, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (i32)* @foo, null, null, metadata !1, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [foo] +!6 = metadata !{i32 786473, metadata !"./list0.h", metadata !"/usr/local/google/home/blaikie/dev/scratch/pr14566", null} ; [ DW_TAG_file_type ] +!7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] +!8 = metadata 
!{metadata !9, metadata !9} +!9 = metadata !{i32 786468, null, metadata !"int", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed] +!10 = metadata !{i32 786478, i32 0, metadata !11, metadata !"main", metadata !"main", metadata !"", metadata !11, i32 2, metadata !12, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, i32 ()* @main, null, null, metadata !1, i32 2} ; [ DW_TAG_subprogram ] [line 2] [def] [main] +!11 = metadata !{i32 786473, metadata !"list0.c", metadata !"/usr/local/google/home/blaikie/dev/scratch/pr14566", null} ; [ DW_TAG_file_type ] +!12 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !13, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] +!13 = metadata !{metadata !9} +!14 = metadata !{i32 786689, metadata !5, metadata !"x", metadata !6, i32 16777217, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [x] [line 1] +!15 = metadata !{i32 1, i32 0, metadata !5, null} +!16 = metadata !{i32 2, i32 0, metadata !5, null} +!17 = metadata !{i32 3, i32 0, metadata !18, null} +!18 = metadata !{i32 786443, metadata !10, metadata !11} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/blaikie/dev/scratch/pr14566/list0.c] diff --git a/test/DebugInfo/X86/linkage-name.ll b/test/DebugInfo/X86/linkage-name.ll index b984923..a3c2a9b 100644 --- a/test/DebugInfo/X86/linkage-name.ll +++ b/test/DebugInfo/X86/linkage-name.ll @@ -1,5 +1,5 @@ ; RUN: llc -mtriple=x86_64-macosx -darwin-gdb-compat=Disable %s -o %t -filetype=obj -; RUN: llvm-dwarfdump %t | FileCheck %s +; RUN: llvm-dwarfdump -debug-dump=info %t | FileCheck %s ; CHECK: DW_TAG_subprogram [9] * ; CHECK-NOT: DW_AT_MIPS_linkage_name @@ -27,10 +27,8 @@ declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone !llvm.dbg.cu = !{!0} !0 = metadata !{i32 786449, i32 0, i32 4, metadata !"foo.cpp", metadata !"/Users/echristo", metadata !"clang version 3.1 (trunk 152691) (llvm/trunk 152692)", i1 true, i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !18} ; [ DW_TAG_compile_unit ] -!1 = metadata !{metadata !2} -!2 = metadata !{i32 0} -!3 = metadata !{metadata !4} -!4 = metadata !{metadata !5} +!1 = metadata !{i32 0} +!3 = metadata !{metadata !5} !5 = metadata !{i32 786478, i32 0, null, metadata !"a", metadata !"a", metadata !"_ZN1A1aEi", metadata !6, i32 5, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (%class.A*, i32)* @_ZN1A1aEi, null, metadata !13, metadata !16} ; [ DW_TAG_subprogram ] !6 = metadata !{i32 786473, metadata !"foo.cpp", metadata !"/Users/echristo", null} ; [ DW_TAG_file_type ] !7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] @@ -44,8 +42,7 @@ declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone !15 = metadata !{i32 786468} ; [ DW_TAG_base_type ] !16 = metadata !{metadata !17} !17 = metadata !{i32 786468} ; [ DW_TAG_base_type ] -!18 = metadata !{metadata !19} -!19 = metadata !{metadata !20} +!18 = metadata !{metadata !20} !20 = metadata !{i32 786484, i32 0, null, metadata !"a", metadata !"a", metadata !"", metadata !6, i32 9, metadata !11, i32 0, i32 1, %class.A* @a} ; [ DW_TAG_variable ] !21 = metadata !{i32 786689, metadata !5, metadata !"this", metadata !6, i32 16777221, metadata !22, i32 64, i32 0} ; [ DW_TAG_arg_variable ] !22 = metadata !{i32 786447, null, 
metadata !"", null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !11} ; [ DW_TAG_pointer_type ] diff --git a/test/DebugInfo/X86/low-pc-cu.ll b/test/DebugInfo/X86/low-pc-cu.ll index f9d9b91..2240f36 100644 --- a/test/DebugInfo/X86/low-pc-cu.ll +++ b/test/DebugInfo/X86/low-pc-cu.ll @@ -1,5 +1,5 @@ ; RUN: llc -mtriple=x86_64-apple-darwin %s -o %t -filetype=obj -; RUN: llvm-dwarfdump %t | FileCheck %s +; RUN: llvm-dwarfdump -debug-dump=info %t | FileCheck %s ; Check that we use DW_AT_low_pc @@ -15,10 +15,8 @@ entry: !llvm.dbg.cu = !{!0} !0 = metadata !{i32 786449, i32 0, i32 4, metadata !"foo.cpp", metadata !"/Users/echristo/tmp", metadata !"clang version 3.1 (trunk 153454) (llvm/trunk 153471)", i1 true, i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1} ; [ DW_TAG_compile_unit ] -!1 = metadata !{metadata !2} -!2 = metadata !{i32 0} -!3 = metadata !{metadata !4} -!4 = metadata !{metadata !5, metadata !12} +!1 = metadata !{i32 0} +!3 = metadata !{metadata !5, metadata !12} !5 = metadata !{i32 786478, i32 0, metadata !6, metadata !"q", metadata !"q", metadata !"_Z1qv", metadata !6, i32 5, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @_Z1qv, null, null, metadata !10} ; [ DW_TAG_subprogram ] !6 = metadata !{i32 786473, metadata !"foo.cpp", metadata !"/Users/echristo/tmp", null} ; [ DW_TAG_file_type ] !7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] diff --git a/test/DebugInfo/X86/main-file-name.s b/test/DebugInfo/X86/main-file-name.s index 6817c9e..0369c61 100644 --- a/test/DebugInfo/X86/main-file-name.s +++ b/test/DebugInfo/X86/main-file-name.s @@ -1,5 +1,5 @@ // RUN: llvm-mc -triple x86_64-unknown-linux-gnu -filetype obj -main-file-name foo.S -g -o %t %s -// RUN: llvm-dwarfdump %t | FileCheck %s +// RUN: llvm-dwarfdump -debug-dump=info %t | FileCheck %s // CHECK: DW_TAG_compile_unit [1] // CHECK-NOT: DW_TAG_ diff --git a/test/DebugInfo/X86/misched-dbg-value.ll b/test/DebugInfo/X86/misched-dbg-value.ll new file mode 100644 index 0000000..6c1032e --- /dev/null +++ b/test/DebugInfo/X86/misched-dbg-value.ll @@ -0,0 +1,173 @@ +; RUN: llc %s -mtriple=x86_64-apple-darwin -filetype=obj -o %t -enable-misched +; RUN: llvm-dwarfdump %t | FileCheck %s + +; rdar://13183203 +; Make sure when misched is enabled, we still have location information for +; function parameters. 
+; CHECK: .debug_info contents: +; CHECK: DW_TAG_compile_unit +; CHECK: DW_TAG_subprogram +; CHECK: Proc8 +; CHECK: DW_TAG_formal_parameter +; CHECK: Array1Par +; CHECK: DW_AT_location +; CHECK: DW_TAG_formal_parameter +; CHECK: Array2Par +; CHECK: DW_AT_location +; CHECK: DW_TAG_formal_parameter +; CHECK: IntParI1 +; CHECK: DW_AT_location +; CHECK: DW_TAG_formal_parameter +; CHECK: IntParI2 +; CHECK: DW_AT_location + +%struct.Record = type { %struct.Record*, i32, i32, i32, [31 x i8] } + +@Version = global [4 x i8] c"1.1\00", align 1 +@IntGlob = common global i32 0, align 4 +@BoolGlob = common global i32 0, align 4 +@Char1Glob = common global i8 0, align 1 +@Char2Glob = common global i8 0, align 1 +@Array1Glob = common global [51 x i32] zeroinitializer, align 16 +@Array2Glob = common global [51 x [51 x i32]] zeroinitializer, align 16 +@PtrGlb = common global %struct.Record* null, align 8 +@PtrGlbNext = common global %struct.Record* null, align 8 + +define void @Proc8(i32* nocapture %Array1Par, [51 x i32]* nocapture %Array2Par, i32 %IntParI1, i32 %IntParI2) nounwind optsize { +entry: + tail call void @llvm.dbg.value(metadata !{i32* %Array1Par}, i64 0, metadata !23), !dbg !64 + tail call void @llvm.dbg.value(metadata !{[51 x i32]* %Array2Par}, i64 0, metadata !24), !dbg !65 + tail call void @llvm.dbg.value(metadata !{i32 %IntParI1}, i64 0, metadata !25), !dbg !66 + tail call void @llvm.dbg.value(metadata !{i32 %IntParI2}, i64 0, metadata !26), !dbg !67 + %add = add i32 %IntParI1, 5, !dbg !68 + tail call void @llvm.dbg.value(metadata !{i32 %add}, i64 0, metadata !27), !dbg !68 + %idxprom = sext i32 %add to i64, !dbg !69 + %arrayidx = getelementptr inbounds i32* %Array1Par, i64 %idxprom, !dbg !69 + store i32 %IntParI2, i32* %arrayidx, align 4, !dbg !69, !tbaa !70 + %add3 = add nsw i32 %IntParI1, 6, !dbg !73 + %idxprom4 = sext i32 %add3 to i64, !dbg !73 + %arrayidx5 = getelementptr inbounds i32* %Array1Par, i64 %idxprom4, !dbg !73 + store i32 %IntParI2, i32* %arrayidx5, align 4, !dbg !73, !tbaa !70 + %add6 = add nsw i32 %IntParI1, 35, !dbg !74 + %idxprom7 = sext i32 %add6 to i64, !dbg !74 + %arrayidx8 = getelementptr inbounds i32* %Array1Par, i64 %idxprom7, !dbg !74 + store i32 %add, i32* %arrayidx8, align 4, !dbg !74, !tbaa !70 + tail call void @llvm.dbg.value(metadata !{i32 %add}, i64 0, metadata !28), !dbg !75 + br label %for.body, !dbg !75 + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ %idxprom, %entry ], [ %indvars.iv.next, %for.body ] + %IntIndex.046 = phi i32 [ %add, %entry ], [ %inc, %for.body ] + %arrayidx13 = getelementptr inbounds [51 x i32]* %Array2Par, i64 %idxprom, i64 %indvars.iv, !dbg !77 + store i32 %add, i32* %arrayidx13, align 4, !dbg !77, !tbaa !70 + %inc = add nsw i32 %IntIndex.046, 1, !dbg !75 + tail call void @llvm.dbg.value(metadata !{i32 %inc}, i64 0, metadata !28), !dbg !75 + %cmp = icmp sgt i32 %inc, %add3, !dbg !75 + %indvars.iv.next = add i64 %indvars.iv, 1, !dbg !75 + br i1 %cmp, label %for.end, label %for.body, !dbg !75 + +for.end: ; preds = %for.body + %sub = add nsw i32 %IntParI1, 4, !dbg !78 + %idxprom14 = sext i32 %sub to i64, !dbg !78 + %arrayidx17 = getelementptr inbounds [51 x i32]* %Array2Par, i64 %idxprom, i64 %idxprom14, !dbg !78 + %0 = load i32* %arrayidx17, align 4, !dbg !78, !tbaa !70 + %inc18 = add nsw i32 %0, 1, !dbg !78 + store i32 %inc18, i32* %arrayidx17, align 4, !dbg !78, !tbaa !70 + %1 = load i32* %arrayidx, align 4, !dbg !79, !tbaa !70 + %add22 = add nsw i32 %IntParI1, 25, !dbg !79 + %idxprom23 = sext i32 %add22 to i64, 
!dbg !79 + %arrayidx25 = getelementptr inbounds [51 x i32]* %Array2Par, i64 %idxprom23, i64 %idxprom, !dbg !79 + store i32 %1, i32* %arrayidx25, align 4, !dbg !79, !tbaa !70 + store i32 5, i32* @IntGlob, align 4, !dbg !80, !tbaa !70 + ret void, !dbg !81 +} + +declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone + +attributes #0 = { nounwind optsize ssp uwtable } +attributes #1 = { nounwind readnone } + +!llvm.dbg.cu = !{!0} + +!0 = metadata !{i32 786449, i32 0, i32 12, metadata !"dry.c", metadata !"/Users/manmanren/test-Nov/rdar_13183203/test2", metadata !"clang version 3.3 (trunk 175015)", i1 true, i1 true, metadata !"", i32 0, metadata !1, metadata !10, metadata !11, metadata !29} ; [ DW_TAG_compile_unit ] [/Users/manmanren/test-Nov/rdar_13183203/test2/dry.c] [DW_LANG_C99] +!1 = metadata !{metadata !2} +!2 = metadata !{i32 786436, null, metadata !"", metadata !3, i32 128, i64 32, i64 32, i32 0, i32 0, null, metadata !4, i32 0, i32 0} ; [ DW_TAG_enumeration_type ] [line 128, size 32, align 32, offset 0] [from ] +!3 = metadata !{i32 786473, metadata !"dry.c", metadata !"/Users/manmanren/test-Nov/rdar_13183203/test2", null} ; [ DW_TAG_file_type ] +!4 = metadata !{metadata !5, metadata !6, metadata !7, metadata !8, metadata !9} +!5 = metadata !{i32 786472, metadata !"Ident1", i64 0} ; [ DW_TAG_enumerator ] [Ident1 :: 0] +!6 = metadata !{i32 786472, metadata !"Ident2", i64 10000} ; [ DW_TAG_enumerator ] [Ident2 :: 10000] +!7 = metadata !{i32 786472, metadata !"Ident3", i64 10001} ; [ DW_TAG_enumerator ] [Ident3 :: 10001] +!8 = metadata !{i32 786472, metadata !"Ident4", i64 10002} ; [ DW_TAG_enumerator ] [Ident4 :: 10002] +!9 = metadata !{i32 786472, metadata !"Ident5", i64 10003} ; [ DW_TAG_enumerator ] [Ident5 :: 10003] +!10 = metadata !{i32 0} +!11 = metadata !{metadata !12} +!12 = metadata !{i32 786478, i32 0, metadata !3, metadata !"Proc8", metadata !"Proc8", metadata !"", metadata !3, i32 180, metadata !13, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 true, void (i32*, [51 x i32]*, i32, i32)* @Proc8, null, null, metadata !22, i32 185} ; [ DW_TAG_subprogram ] [line 180] [def] [scope 185] [Proc8] +!13 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !14, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] +!14 = metadata !{null, metadata !15, metadata !17, metadata !21, metadata !21} +!15 = metadata !{i32 786447, null, metadata !"", null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !16} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from int] +!16 = metadata !{i32 786468, null, metadata !"int", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed] +!17 = metadata !{i32 786447, null, metadata !"", null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !18} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ] +!18 = metadata !{i32 786433, null, metadata !"", null, i32 0, i64 1632, i64 32, i32 0, i32 0, metadata !16, metadata !19, i32 0, i32 0} ; [ DW_TAG_array_type ] [line 0, size 1632, align 32, offset 0] [from int] +!19 = metadata !{metadata !20} +!20 = metadata !{i32 786465, i64 0, i64 51} ; [ DW_TAG_subrange_type ] [0, 50] +!21 = metadata !{i32 786454, null, metadata !"OneToFifty", metadata !3, i32 132, i64 0, i64 0, i64 0, i32 0, metadata !16} ; [ DW_TAG_typedef ] [OneToFifty] [line 132, size 0, align 0, offset 0] [from int] +!22 = metadata !{metadata !23, 
metadata !24, metadata !25, metadata !26, metadata !27, metadata !28} +!23 = metadata !{i32 786689, metadata !12, metadata !"Array1Par", metadata !3, i32 16777397, metadata !15, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [Array1Par] [line 181] +!24 = metadata !{i32 786689, metadata !12, metadata !"Array2Par", metadata !3, i32 33554614, metadata !17, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [Array2Par] [line 182] +!25 = metadata !{i32 786689, metadata !12, metadata !"IntParI1", metadata !3, i32 50331831, metadata !21, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [IntParI1] [line 183] +!26 = metadata !{i32 786689, metadata !12, metadata !"IntParI2", metadata !3, i32 67109048, metadata !21, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [IntParI2] [line 184] +!27 = metadata !{i32 786688, metadata !12, metadata !"IntLoc", metadata !3, i32 186, metadata !21, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [IntLoc] [line 186] +!28 = metadata !{i32 786688, metadata !12, metadata !"IntIndex", metadata !3, i32 187, metadata !21, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [IntIndex] [line 187] +!29 = metadata !{metadata !30, metadata !35, metadata !36, metadata !38, metadata !39, metadata !40, metadata !42, metadata !46, metadata !63} +!30 = metadata !{i32 786484, i32 0, null, metadata !"Version", metadata !"Version", metadata !"", metadata !3, i32 111, metadata !31, i32 0, i32 1, [4 x i8]* @Version, null} ; [ DW_TAG_variable ] [Version] [line 111] [def] +!31 = metadata !{i32 786433, null, metadata !"", null, i32 0, i64 32, i64 8, i32 0, i32 0, metadata !32, metadata !33, i32 0, i32 0} ; [ DW_TAG_array_type ] [line 0, size 32, align 8, offset 0] [from char] +!32 = metadata !{i32 786468, null, metadata !"char", null, i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ] [char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char] +!33 = metadata !{metadata !34} +!34 = metadata !{i32 786465, i64 0, i64 4} ; [ DW_TAG_subrange_type ] [0, 3] +!35 = metadata !{i32 786484, i32 0, null, metadata !"IntGlob", metadata !"IntGlob", metadata !"", metadata !3, i32 171, metadata !16, i32 0, i32 1, i32* @IntGlob, null} ; [ DW_TAG_variable ] [IntGlob] [line 171] [def] +!36 = metadata !{i32 786484, i32 0, null, metadata !"BoolGlob", metadata !"BoolGlob", metadata !"", metadata !3, i32 172, metadata !37, i32 0, i32 1, i32* @BoolGlob, null} ; [ DW_TAG_variable ] [BoolGlob] [line 172] [def] +!37 = metadata !{i32 786454, null, metadata !"boolean", metadata !3, i32 149, i64 0, i64 0, i64 0, i32 0, metadata !16} ; [ DW_TAG_typedef ] [boolean] [line 149, size 0, align 0, offset 0] [from int] +!38 = metadata !{i32 786484, i32 0, null, metadata !"Char1Glob", metadata !"Char1Glob", metadata !"", metadata !3, i32 173, metadata !32, i32 0, i32 1, i8* @Char1Glob, null} ; [ DW_TAG_variable ] [Char1Glob] [line 173] [def] +!39 = metadata !{i32 786484, i32 0, null, metadata !"Char2Glob", metadata !"Char2Glob", metadata !"", metadata !3, i32 174, metadata !32, i32 0, i32 1, i8* @Char2Glob, null} ; [ DW_TAG_variable ] [Char2Glob] [line 174] [def] +!40 = metadata !{i32 786484, i32 0, null, metadata !"Array1Glob", metadata !"Array1Glob", metadata !"", metadata !3, i32 175, metadata !41, i32 0, i32 1, [51 x i32]* @Array1Glob, null} ; [ DW_TAG_variable ] [Array1Glob] [line 175] [def] +!41 = metadata !{i32 786454, null, metadata !"Array1Dim", metadata !3, i32 135, i64 0, i64 0, i64 0, i32 0, metadata !18} ; [ DW_TAG_typedef ] [Array1Dim] [line 135, size 0, align 0, offset 0] [from ] +!42 = metadata !{i32 786484, i32 0, null, metadata !"Array2Glob", 
metadata !"Array2Glob", metadata !"", metadata !3, i32 176, metadata !43, i32 0, i32 1, [51 x [51 x i32]]* @Array2Glob, null} ; [ DW_TAG_variable ] [Array2Glob] [line 176] [def] +!43 = metadata !{i32 786454, null, metadata !"Array2Dim", metadata !3, i32 136, i64 0, i64 0, i64 0, i32 0, metadata !44} ; [ DW_TAG_typedef ] [Array2Dim] [line 136, size 0, align 0, offset 0] [from ] +!44 = metadata !{i32 786433, null, metadata !"", null, i32 0, i64 83232, i64 32, i32 0, i32 0, metadata !16, metadata !45, i32 0, i32 0} ; [ DW_TAG_array_type ] [line 0, size 83232, align 32, offset 0] [from int] +!45 = metadata !{metadata !20, metadata !20} +!46 = metadata !{i32 786484, i32 0, null, metadata !"PtrGlb", metadata !"PtrGlb", metadata !"", metadata !3, i32 177, metadata !47, i32 0, i32 1, %struct.Record** @PtrGlb, null} ; [ DW_TAG_variable ] [PtrGlb] [line 177] [def] +!47 = metadata !{i32 786454, null, metadata !"RecordPtr", metadata !3, i32 148, i64 0, i64 0, i64 0, i32 0, metadata !48} ; [ DW_TAG_typedef ] [RecordPtr] [line 148, size 0, align 0, offset 0] [from ] +!48 = metadata !{i32 786447, null, metadata !"", null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !49} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from RecordType] +!49 = metadata !{i32 786454, null, metadata !"RecordType", metadata !3, i32 147, i64 0, i64 0, i64 0, i32 0, metadata !50} ; [ DW_TAG_typedef ] [RecordType] [line 147, size 0, align 0, offset 0] [from Record] +!50 = metadata !{i32 786451, null, metadata !"Record", metadata !3, i32 138, i64 448, i64 64, i32 0, i32 0, null, metadata !51, i32 0, i32 0, i32 0} ; [ DW_TAG_structure_type ] [Record] [line 138, size 448, align 64, offset 0] [from ] +!51 = metadata !{metadata !52, metadata !54, metadata !56, metadata !57, metadata !58} +!52 = metadata !{i32 786445, metadata !50, metadata !"PtrComp", metadata !3, i32 140, i64 64, i64 64, i64 0, i32 0, metadata !53} ; [ DW_TAG_member ] [PtrComp] [line 140, size 64, align 64, offset 0] [from ] +!53 = metadata !{i32 786447, null, metadata !"", null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !50} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from Record] +!54 = metadata !{i32 786445, metadata !50, metadata !"Discr", metadata !3, i32 141, i64 32, i64 32, i64 64, i32 0, metadata !55} ; [ DW_TAG_member ] [Discr] [line 141, size 32, align 32, offset 64] [from Enumeration] +!55 = metadata !{i32 786454, null, metadata !"Enumeration", metadata !3, i32 128, i64 0, i64 0, i64 0, i32 0, metadata !2} ; [ DW_TAG_typedef ] [Enumeration] [line 128, size 0, align 0, offset 0] [from ] +!56 = metadata !{i32 786445, metadata !50, metadata !"EnumComp", metadata !3, i32 142, i64 32, i64 32, i64 96, i32 0, metadata !55} ; [ DW_TAG_member ] [EnumComp] [line 142, size 32, align 32, offset 96] [from Enumeration] +!57 = metadata !{i32 786445, metadata !50, metadata !"IntComp", metadata !3, i32 143, i64 32, i64 32, i64 128, i32 0, metadata !21} ; [ DW_TAG_member ] [IntComp] [line 143, size 32, align 32, offset 128] [from OneToFifty] +!58 = metadata !{i32 786445, metadata !50, metadata !"StringComp", metadata !3, i32 144, i64 248, i64 8, i64 160, i32 0, metadata !59} ; [ DW_TAG_member ] [StringComp] [line 144, size 248, align 8, offset 160] [from String30] +!59 = metadata !{i32 786454, null, metadata !"String30", metadata !3, i32 134, i64 0, i64 0, i64 0, i32 0, metadata !60} ; [ DW_TAG_typedef ] [String30] [line 134, size 0, align 0, offset 0] [from ] +!60 = metadata !{i32 786433, null, metadata !"", null, i32 0, i64 248, 
i64 8, i32 0, i32 0, metadata !32, metadata !61, i32 0, i32 0} ; [ DW_TAG_array_type ] [line 0, size 248, align 8, offset 0] [from char] +!61 = metadata !{metadata !62} +!62 = metadata !{i32 786465, i64 0, i64 31} ; [ DW_TAG_subrange_type ] [0, 30] +!63 = metadata !{i32 786484, i32 0, null, metadata !"PtrGlbNext", metadata !"PtrGlbNext", metadata !"", metadata !3, i32 178, metadata !47, i32 0, i32 1, %struct.Record** @PtrGlbNext, null} ; [ DW_TAG_variable ] [PtrGlbNext] [line 178] [def] +!64 = metadata !{i32 181, i32 0, metadata !12, null} +!65 = metadata !{i32 182, i32 0, metadata !12, null} +!66 = metadata !{i32 183, i32 0, metadata !12, null} +!67 = metadata !{i32 184, i32 0, metadata !12, null} +!68 = metadata !{i32 189, i32 0, metadata !12, null} +!69 = metadata !{i32 190, i32 0, metadata !12, null} +!70 = metadata !{metadata !"int", metadata !71} +!71 = metadata !{metadata !"omnipotent char", metadata !72} +!72 = metadata !{metadata !"Simple C/C++ TBAA"} +!73 = metadata !{i32 191, i32 0, metadata !12, null} +!74 = metadata !{i32 192, i32 0, metadata !12, null} +!75 = metadata !{i32 193, i32 0, metadata !76, null} +!76 = metadata !{i32 786443, metadata !12, i32 193, i32 0, metadata !3, i32 0} ; [ DW_TAG_lexical_block ] [/Users/manmanren/test-Nov/rdar_13183203/test2/dry.c] +!77 = metadata !{i32 194, i32 0, metadata !76, null} +!78 = metadata !{i32 195, i32 0, metadata !12, null} +!79 = metadata !{i32 196, i32 0, metadata !12, null} +!80 = metadata !{i32 197, i32 0, metadata !12, null} +!81 = metadata !{i32 198, i32 0, metadata !12, null} diff --git a/test/DebugInfo/X86/multiple-at-const-val.ll b/test/DebugInfo/X86/multiple-at-const-val.ll new file mode 100644 index 0000000..5f3e0d9 --- /dev/null +++ b/test/DebugInfo/X86/multiple-at-const-val.ll @@ -0,0 +1,59 @@ +; RUN: llc -O0 %s -mtriple=x86_64-apple-darwin -filetype=obj -o %t +; RUN: llvm-dwarfdump %t | FileCheck %s + +; rdar://13071590 +; Check that we are not emitting multiple AT_const_value for a single member.
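; Editorial note: the metadata below models a libstdc++-style std::os_base
; class whose static const member "badbit" carries the enumerator value
; _S_badbit == 1; roughly (approximate reconstruction of the original C++,
; not the exact source):
;   namespace std {
;     class os_base {
;     public:
;       typedef _Ios_Iostate ostate;
;       static const ostate badbit = _S_badbit;   // enumerator value 1
;     };
;   }
; The member is described both as a DW_TAG_member inside the class and as a
; global DW_TAG_variable referring back to it; the bug was emitting
; DW_AT_const_value for both, and the CHECK-NOT below ensures it appears once.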
+; CHECK: .debug_info contents: +; CHECK: DW_TAG_compile_unit +; CHECK: DW_TAG_class_type +; CHECK: DW_TAG_member +; CHECK: badbit +; CHECK: DW_AT_const_value [DW_FORM_data4] (0x00000001) +; CHECK-NOT: DW_AT_const_value +; CHECK: NULL + +%"class.std::basic_ostream" = type { i32 (...)**, %"class.std::basic_os" } +%"class.std::basic_os" = type { %"class.std::os_base", %"class.std::basic_ostream"*, i8, i8 } +%"class.std::os_base" = type { i32 (...)**, i64, i64, i32, i32, i32 } + +@_ZSt4cout = external global %"class.std::basic_ostream" +@.str = private unnamed_addr constant [6 x i8] c"c is \00", align 1 + +define i32 @main() { +entry: + %call1.i = tail call %"class.std::basic_ostream"* @test(%"class.std::basic_ostream"* @_ZSt4cout, i8* getelementptr inbounds ([6 x i8]* @.str, i64 0, i64 0), i64 5) + ret i32 0 +} + +declare %"class.std::basic_ostream"* @test(%"class.std::basic_ostream"*, i8*, i64) + +declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone + +!llvm.dbg.cu = !{!0} + +!0 = metadata !{i32 786449, i32 0, i32 4, metadata !"student2.cpp", metadata !"/privite/tmp", metadata !"clang version 3.3 (trunk 174207)", i1 true, i1 true, metadata !"", i32 0, metadata !1, metadata !955, metadata !956, metadata !1786} ; [ DW_TAG_compile_unit ] [/privite/tmp/student2.cpp] [DW_LANG_C_plus_plus] +!1 = metadata !{metadata !26} +!4 = metadata !{i32 786489, null, metadata !"std", metadata !5, i32 48} ; [ DW_TAG_namespace ] +!5 = metadata !{i32 786473, metadata !"os_base.h", metadata !"/privite/tmp", null} ; [ DW_TAG_file_type ] +!25 = metadata !{i32 786472, metadata !"_S_os_fmtflags_end", i64 65536} ; [ DW_TAG_enumerator ] +!26 = metadata !{i32 786436, metadata !4, metadata !"_Ios_Iostate", metadata !5, i32 146, i64 32, i64 32, i32 0, i32 0, null, metadata !27, i32 0, i32 0} ; [ DW_TAG_enumeration_type ] +!27 = metadata !{metadata !28, metadata !29, metadata !30, metadata !31, metadata !32} +!28 = metadata !{i32 786472, metadata !"_S_goodbit", i64 0} ; [ DW_TAG_enumerator ] [_S_goodbit :: 0] +!29 = metadata !{i32 786472, metadata !"_S_badbit", i64 1} ; [ DW_TAG_enumerator ] [_S_badbit :: 1] +!30 = metadata !{i32 786472, metadata !"_S_eofbit", i64 2} ; [ DW_TAG_enumerator ] [_S_eofbit :: 2] +!31 = metadata !{i32 786472, metadata !"_S_failbit", i64 4} ; [ DW_TAG_enumerator ] [_S_failbit :: 4] +!32 = metadata !{i32 786472, metadata !"_S_os_ostate_end", i64 65536} ; [ DW_TAG_enumerator ] [_S_os_ostate_end :: 65536] +!49 = metadata !{i32 786434, metadata !4, metadata !"os_base", metadata !5, i32 200, i64 1728, i64 64, i32 0, i32 0, null, metadata !50, i32 0, metadata !49, null} ; [ DW_TAG_class_type ] +!50 = metadata !{metadata !77} +!54 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !55, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] +!55 = metadata !{metadata !56} +!56 = metadata !{i32 786468, null, metadata !"int", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] +!77 = metadata !{i32 786445, metadata !49, metadata !"badbit", metadata !5, i32 331, i64 0, i64 0, i64 0, i32 4096, metadata !78, i32 1} ; [ DW_TAG_member ] +!78 = metadata !{i32 786470, null, metadata !"", null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !79} ; [ DW_TAG_const_type ] +!79 = metadata !{i32 786454, metadata !49, metadata !"ostate", metadata !5, i32 327, i64 0, i64 0, i64 0, i32 0, metadata !26} ; [ DW_TAG_typedef ] +!955 = metadata !{i32 0} +!956 = metadata !{metadata !960} +!960 = metadata !{i32 786478, i32 0, metadata !961, metadata 
!"main", metadata !"main", metadata !"", metadata !961, i32 73, metadata !54, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 ()* @main, null, null, metadata !955, i32 73} ; [ DW_TAG_subprogram ] +!961 = metadata !{i32 786473, metadata !"student2.cpp", metadata !"/privite/tmp", null} ; [ DW_TAG_file_type ] +!1786 = metadata !{metadata !1800} +!1800 = metadata !{i32 786484, i32 0, metadata !5, metadata !"badbit", metadata !"badbit", metadata !"badbit", metadata !5, i32 331, metadata !78, i32 1, i32 1, i32 1, metadata !77} ; [ DW_TAG_variable ] diff --git a/test/DebugInfo/X86/nondefault-subrange-array.ll b/test/DebugInfo/X86/nondefault-subrange-array.ll index 6247cc3..fcc2912 100644 --- a/test/DebugInfo/X86/nondefault-subrange-array.ll +++ b/test/DebugInfo/X86/nondefault-subrange-array.ll @@ -1,5 +1,5 @@ ; RUN: llc -mtriple=x86_64-apple-darwin -O0 -filetype=obj -o %t < %s -; RUN: llvm-dwarfdump %t | FileCheck %s +; RUN: llvm-dwarfdump -debug-dump=info %t | FileCheck %s %class.A = type { [42 x i32] } @@ -28,10 +28,8 @@ !llvm.dbg.cu = !{!0} !0 = metadata !{i32 786449, i32 0, i32 4, metadata !"t.cpp", metadata !"/Volumes/Sandbox/llvm", metadata !"clang version 3.3 (trunk 169136)", i1 true, i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !1, metadata !3} ; [ DW_TAG_compile_unit ] [/Volumes/Sandbox/llvm/t.cpp] [DW_LANG_C_plus_plus] -!1 = metadata !{metadata !2} -!2 = metadata !{i32 0} -!3 = metadata !{metadata !4} -!4 = metadata !{metadata !5} +!1 = metadata !{i32 0} +!3 = metadata !{metadata !5} !5 = metadata !{i32 786484, i32 0, null, metadata !"a", metadata !"a", metadata !"", metadata !6, i32 1, metadata !7, i32 0, i32 1, %class.A* @a} ; [ DW_TAG_variable ] [a] [line 1] [def] !6 = metadata !{i32 786473, metadata !"t.cpp", metadata !"/Volumes/Sandbox/llvm", null} ; [ DW_TAG_file_type ] !7 = metadata !{i32 786434, null, metadata !"A", metadata !6, i32 1, i64 0, i64 32, i32 0, i32 0, null, metadata !8, i32 0, null, null} ; [ DW_TAG_class_type ] [A] [line 1, size 0, align 32, offset 0] [from ] diff --git a/test/DebugInfo/X86/objc-fwd-decl.ll b/test/DebugInfo/X86/objc-fwd-decl.ll index 1a815f9..eead9e1 100644 --- a/test/DebugInfo/X86/objc-fwd-decl.ll +++ b/test/DebugInfo/X86/objc-fwd-decl.ll @@ -1,5 +1,5 @@ ; RUN: llc -mtriple=x86_64-macosx %s -o %t -filetype=obj -; RUN: llvm-dwarfdump %t | FileCheck %s +; RUN: llvm-dwarfdump -debug-dump=info %t | FileCheck %s ; CHECK: 0x00000027: DW_TAG_structure_type ; CHECK: 0x0000002c: DW_AT_declaration @@ -13,10 +13,8 @@ !llvm.module.flags = !{!9, !10, !11, !12} !0 = metadata !{i32 786449, i32 0, i32 16, metadata !"foo.m", metadata !"/Users/echristo", metadata !"clang version 3.1 (trunk 152054 trunk 152094)", i1 true, i1 false, metadata !"", i32 2, metadata !1, metadata !1, metadata !1, metadata !3} ; [ DW_TAG_compile_unit ] -!1 = metadata !{metadata !2} -!2 = metadata !{i32 0} -!3 = metadata !{metadata !4} -!4 = metadata !{metadata !5} +!1 = metadata !{i32 0} +!3 = metadata !{metadata !5} !5 = metadata !{i32 786484, i32 0, null, metadata !"a", metadata !"a", metadata !"", metadata !6, i32 3, metadata !7, i32 0, i32 1, %0** @a} ; [ DW_TAG_variable ] !6 = metadata !{i32 786473, metadata !"foo.m", metadata !"/Users/echristo", null} ; [ DW_TAG_file_type ] !7 = metadata !{i32 786447, null, metadata !"", null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !8} ; [ DW_TAG_pointer_type ] diff --git a/test/DebugInfo/X86/op_deref.ll b/test/DebugInfo/X86/op_deref.ll index b0b09e7..13efe21 100644 --- a/test/DebugInfo/X86/op_deref.ll +++ 
b/test/DebugInfo/X86/op_deref.ll @@ -1,5 +1,5 @@ ; RUN: llc -O0 -mtriple=x86_64-apple-darwin %s -o %t -filetype=obj -; RUN: llvm-dwarfdump %t | FileCheck %s +; RUN: llvm-dwarfdump -debug-dump=info %t | FileCheck %s ; CHECK: DW_AT_name [DW_FORM_strp] ( .debug_str[0x00000067] = "vla") ; FIXME: The location here needs to be fixed, but llvm-dwarfdump doesn't handle @@ -60,10 +60,8 @@ declare void @llvm.stackrestore(i8*) nounwind !llvm.dbg.cu = !{!0} !0 = metadata !{i32 786449, i32 0, i32 12, metadata !"bar.c", metadata !"/Users/echristo/tmp", metadata !"clang version 3.2 (trunk 156005) (llvm/trunk 156000)", i1 true, i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1} ; [ DW_TAG_compile_unit ] -!1 = metadata !{metadata !2} -!2 = metadata !{i32 0} -!3 = metadata !{metadata !4} -!4 = metadata !{metadata !5} +!1 = metadata !{i32 0} +!3 = metadata !{metadata !5} !5 = metadata !{i32 786478, i32 0, metadata !6, metadata !"testVLAwithSize", metadata !"testVLAwithSize", metadata !"", metadata !6, i32 1, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (i32)* @testVLAwithSize, null, null, metadata !1, i32 2} ; [ DW_TAG_subprogram ] !6 = metadata !{i32 786473, metadata !"bar.c", metadata !"/Users/echristo/tmp", null} ; [ DW_TAG_file_type ] !7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] diff --git a/test/DebugInfo/X86/pointer-type-size.ll b/test/DebugInfo/X86/pointer-type-size.ll index f11fbe4..8e203c9 100644 --- a/test/DebugInfo/X86/pointer-type-size.ll +++ b/test/DebugInfo/X86/pointer-type-size.ll @@ -1,5 +1,5 @@ ; RUN: llc -mtriple=x86_64-apple-macosx10.7 %s -o %t -filetype=obj -; RUN: llvm-dwarfdump %t | FileCheck %s +; RUN: llvm-dwarfdump -debug-dump=info %t | FileCheck %s ; CHECK: ptr ; CHECK-NOT: AT_bit_size @@ -11,10 +11,8 @@ !llvm.dbg.cu = !{!0} !0 = metadata !{i32 720913, i32 0, i32 12, metadata !"foo.c", metadata !"/Users/echristo/tmp", metadata !"clang version 3.1 (trunk 147882)", i1 true, i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !1, metadata !3} ; [ DW_TAG_compile_unit ] -!1 = metadata !{metadata !2} -!2 = metadata !{i32 0} -!3 = metadata !{metadata !4} -!4 = metadata !{metadata !5} +!1 = metadata !{i32 0} +!3 = metadata !{metadata !5} !5 = metadata !{i32 720948, i32 0, null, metadata !"crass", metadata !"crass", metadata !"", metadata !6, i32 1, metadata !7, i32 0, i32 1, %struct.crass* @crass} ; [ DW_TAG_variable ] !6 = metadata !{i32 720937, metadata !"foo.c", metadata !"/Users/echristo/tmp", null} ; [ DW_TAG_file_type ] !7 = metadata !{i32 720915, null, metadata !"crass", metadata !6, i32 1, i64 64, i64 64, i32 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_structure_type ] diff --git a/test/DebugInfo/X86/pr11300.ll b/test/DebugInfo/X86/pr11300.ll index 5a001ee..b0c8f37 100644 --- a/test/DebugInfo/X86/pr11300.ll +++ b/test/DebugInfo/X86/pr11300.ll @@ -1,5 +1,5 @@ ; RUN: llc -mtriple=x86_64-apple-darwin %s -o %t -filetype=obj -; RUN: llvm-dwarfdump %t | FileCheck %s +; RUN: llvm-dwarfdump -debug-dump=info %t | FileCheck %s ; test that the DW_AT_specification is a back edge in the file. 
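; Editorial note: "back edge" means the out-of-line DW_TAG_subprogram
; definition refers backwards to its already-emitted in-class declaration via
; DW_AT_specification, something like (illustrative offsets only):
;   0x0000002d: DW_TAG_subprogram     (in-class declaration)
;   0x0000005e: DW_TAG_subprogram     (out-of-line definition)
;                 DW_AT_specification [DW_FORM_ref4] (cu + 0x002d => {0x0000002d})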
@@ -32,10 +32,8 @@ entry: !llvm.dbg.cu = !{!0} !0 = metadata !{i32 720913, i32 0, i32 4, metadata !"/home/espindola/llvm/test.cc", metadata !"/home/espindola/tmpfs/build", metadata !"clang version 3.0 ()", i1 true, i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1} ; [ DW_TAG_compile_unit ] -!1 = metadata !{metadata !2} -!2 = metadata !{i32 0} -!3 = metadata !{metadata !4} -!4 = metadata !{metadata !5, metadata !20} +!1 = metadata !{i32 0} +!3 = metadata !{metadata !5, metadata !20} !5 = metadata !{i32 720942, i32 0, metadata !6, metadata !"zed", metadata !"zed", metadata !"_Z3zedP3foo", metadata !6, i32 4, metadata !7, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 false, void (%struct.foo*)* @_Z3zedP3foo, null, null, metadata !18} ; [ DW_TAG_subprogram ] !6 = metadata !{i32 720937, metadata !"/home/espindola/llvm/test.cc", metadata !"/home/espindola/tmpfs/build", null} ; [ DW_TAG_file_type ] !7 = metadata !{i32 720917, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] diff --git a/test/DebugInfo/X86/pr12831.ll b/test/DebugInfo/X86/pr12831.ll index abb946d..3970583 100644 --- a/test/DebugInfo/X86/pr12831.ll +++ b/test/DebugInfo/X86/pr12831.ll @@ -78,10 +78,8 @@ entry: !llvm.dbg.cu = !{!0} !0 = metadata !{i32 786449, i32 0, i32 4, metadata !"BPLFunctionWriter.cpp", metadata !"/home/peter/crashdelta", metadata !"clang version 3.2 ", i1 true, i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !128} ; [ DW_TAG_compile_unit ] -!1 = metadata !{metadata !2} -!2 = metadata !{i32 0} -!3 = metadata !{metadata !4} -!4 = metadata !{metadata !5, metadata !106, metadata !107, metadata !126, metadata !127} +!1 = metadata !{i32 0} +!3 = metadata !{metadata !5, metadata !106, metadata !107, metadata !126, metadata !127} !5 = metadata !{i32 786478, i32 0, null, metadata !"writeExpr", metadata !"writeExpr", metadata !"_ZN17BPLFunctionWriter9writeExprEv", metadata !6, i32 19, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%class.BPLFunctionWriter*)* @_ZN17BPLFunctionWriter9writeExprEv, null, metadata !103, metadata !1, i32 19} ; [ DW_TAG_subprogram ] !6 = metadata !{i32 786473, metadata !"BPLFunctionWriter2.ii", metadata !"/home/peter/crashdelta", null} ; [ DW_TAG_file_type ] !7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] @@ -205,8 +203,7 @@ entry: !125 = metadata !{i32 786468} ; [ DW_TAG_base_type ] !126 = metadata !{i32 786478, i32 0, null, metadata !"function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:20:36> >", metadata !"function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:20:36> >", metadata !"_ZN8functionIFvvEEC2IZN17BPLFunctionWriter9writeExprEvE3$_0EET_", metadata !6, i32 8, metadata !23, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%class.function*)* @"_ZN8functionIFvvEEC2IZN17BPLFunctionWriter9writeExprEvE3$_0EET_", metadata !47, metadata !22, metadata !1, i32 8} ; [ DW_TAG_subprogram ] !127 = metadata !{i32 786478, i32 0, null, metadata !"_M_not_empty_function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:20:36> >", metadata !"_M_not_empty_function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:20:36> >", metadata !"_ZN13_Base_manager21_M_not_empty_functionIZN17BPLFunctionWriter9writeExprEvE3$_0EEvRKT_", metadata !6, i32 3, metadata !117, i1 true, i1 true, i32 0, i32 0, null, i32 
256, i1 false, void (%class.anon*)* @"_ZN13_Base_manager21_M_not_empty_functionIZN17BPLFunctionWriter9writeExprEvE3$_0EEvRKT_", metadata !120, metadata !116, metadata !1, i32 3} ; [ DW_TAG_subprogram ] -!128 = metadata !{metadata !129} -!129 = metadata !{metadata !130} +!128 = metadata !{metadata !130} !130 = metadata !{i32 786484, i32 0, metadata !114, metadata !"__stored_locally", metadata !"__stored_locally", metadata !"__stored_locally", metadata !6, i32 2, metadata !131, i32 1, i32 1, i1 true} ; [ DW_TAG_variable ] !131 = metadata !{i32 786470, null, metadata !"", null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !132} ; [ DW_TAG_const_type ] !132 = metadata !{i32 786468, null, metadata !"bool", null, i32 0, i64 8, i64 8, i64 0, i32 0, i32 2} ; [ DW_TAG_base_type ] diff --git a/test/DebugInfo/X86/pr13303.ll b/test/DebugInfo/X86/pr13303.ll index e820cb5..a72bc9b 100644 --- a/test/DebugInfo/X86/pr13303.ll +++ b/test/DebugInfo/X86/pr13303.ll @@ -1,5 +1,5 @@ ; RUN: llc %s -o %t -filetype=obj -mtriple=x86_64-unknown-linux-gnu -; RUN: llvm-dwarfdump %t | FileCheck %s +; RUN: llvm-dwarfdump -debug-dump=line %t | FileCheck %s ; PR13303 ; Check that the prologue ends with is_stmt here. @@ -15,10 +15,8 @@ entry: !llvm.dbg.cu = !{!0} !0 = metadata !{i32 786449, i32 0, i32 12, metadata !"PR13303.c", metadata !"/home/probinson", metadata !"clang version 3.2 (trunk 160143)", i1 true, i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1} ; [ DW_TAG_compile_unit ] [/home/probinson/PR13303.c] [DW_LANG_C99] -!1 = metadata !{metadata !2} -!2 = metadata !{i32 0} -!3 = metadata !{metadata !4} -!4 = metadata !{metadata !5} +!1 = metadata !{i32 0} +!3 = metadata !{metadata !5} !5 = metadata !{i32 786478, i32 0, metadata !6, metadata !"main", metadata !"main", metadata !"", metadata !6, i32 1, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, i32 ()* @main, null, null, metadata !1, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [main] !6 = metadata !{i32 786473, metadata !"PR13303.c", metadata !"/home/probinson", null} ; [ DW_TAG_file_type ] !7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] diff --git a/test/DebugInfo/X86/prologue-stack.ll b/test/DebugInfo/X86/prologue-stack.ll index 929db51..0651e59 100644 --- a/test/DebugInfo/X86/prologue-stack.ll +++ b/test/DebugInfo/X86/prologue-stack.ll @@ -21,10 +21,8 @@ declare i32 @callme(i32) !llvm.dbg.cu = !{!0} !0 = metadata !{i32 786449, i32 0, i32 12, metadata !"bar.c", metadata !"/usr/local/google/home/echristo/tmp", metadata !"clang version 3.2 (trunk 164980) (llvm/trunk 164979)", i1 true, i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/tmp/bar.c] [DW_LANG_C99] -!1 = metadata !{metadata !2} -!2 = metadata !{i32 0} -!3 = metadata !{metadata !4} -!4 = metadata !{metadata !5} +!1 = metadata !{i32 0} +!3 = metadata !{metadata !5} !5 = metadata !{i32 786478, i32 0, metadata !6, metadata !"isel_line_test2", metadata !"isel_line_test2", metadata !"", metadata !6, i32 3, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, i32 ()* @isel_line_test2, null, null, metadata !1, i32 4} ; [ DW_TAG_subprogram ] [line 3] [def] [scope 4] [isel_line_test2] !6 = metadata !{i32 786473, metadata !"bar.c", metadata !"/usr/local/google/home/echristo/tmp", null} ; [ DW_TAG_file_type ] 
!7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] diff --git a/test/DebugInfo/X86/rvalue-ref.ll b/test/DebugInfo/X86/rvalue-ref.ll index e73869d..136db0e 100644 --- a/test/DebugInfo/X86/rvalue-ref.ll +++ b/test/DebugInfo/X86/rvalue-ref.ll @@ -1,5 +1,5 @@ ; RUN: llc -mtriple=x86_64-apple-darwin %s -o %t -filetype=obj -O0 -; RUN: llvm-dwarfdump %t | FileCheck %s +; RUN: llvm-dwarfdump -debug-dump=info %t | FileCheck %s ; CHECK: DW_TAG_rvalue_reference_type @@ -23,10 +23,8 @@ declare i32 @printf(i8*, ...) !llvm.dbg.cu = !{!0} !0 = metadata !{i32 786449, i32 0, i32 4, metadata !"foo.cpp", metadata !"/Users/echristo/tmp", metadata !"clang version 3.2 (trunk 157054) (llvm/trunk 157060)", i1 true, i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1} ; [ DW_TAG_compile_unit ] -!1 = metadata !{metadata !2} -!2 = metadata !{i32 0} -!3 = metadata !{metadata !4} -!4 = metadata !{metadata !5} +!1 = metadata !{i32 0} +!3 = metadata !{metadata !5} !5 = metadata !{i32 786478, i32 0, metadata !6, metadata !"foo", metadata !"foo", metadata !"_Z3fooOi", metadata !6, i32 4, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (i32*)* @_Z3fooOi, null, null, metadata !1, i32 5} ; [ DW_TAG_subprogram ] !6 = metadata !{i32 786473, metadata !"foo.cpp", metadata !"/Users/echristo/tmp", null} ; [ DW_TAG_file_type ] !7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] diff --git a/test/DebugInfo/X86/stmt-list-multiple-compile-units.ll b/test/DebugInfo/X86/stmt-list-multiple-compile-units.ll new file mode 100644 index 0000000..601d08f --- /dev/null +++ b/test/DebugInfo/X86/stmt-list-multiple-compile-units.ll @@ -0,0 +1,62 @@ +; RUN: llc -O0 %s -mtriple=x86_64-apple-darwin -filetype=obj -o %t +; RUN: llvm-dwarfdump %t | FileCheck %s + +; rdar://13067005 +; CHECK: .debug_info contents: +; CHECK: DW_TAG_compile_unit +; CHECK: DW_AT_low_pc [DW_FORM_addr] (0x0000000000000000) +; CHECK: DW_AT_stmt_list [DW_FORM_data4] (0x00000000) + +; CHECK: DW_TAG_compile_unit +; CHECK: DW_AT_low_pc [DW_FORM_addr] (0x0000000000000000) +; CHECK: DW_AT_stmt_list [DW_FORM_data4] (0x00000049) + +; CHECK: .debug_line contents: +; CHECK-NEXT: Line table prologue: +; CHECK-NEXT: total_length: 0x00000045 +; CHECK: Line table prologue: +; CHECK: total_length: 0x00000047 + +define i32 @test(i32 %a) nounwind uwtable ssp { +entry: + %a.addr = alloca i32, align 4 + store i32 %a, i32* %a.addr, align 4 + call void @llvm.dbg.declare(metadata !{i32* %a.addr}, metadata !15), !dbg !16 + %0 = load i32* %a.addr, align 4, !dbg !17 + %call = call i32 @fn(i32 %0), !dbg !17 + ret i32 %call, !dbg !17 +} + +declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone + +define i32 @fn(i32 %a) nounwind uwtable ssp { +entry: + %a.addr = alloca i32, align 4 + store i32 %a, i32* %a.addr, align 4 + call void @llvm.dbg.declare(metadata !{i32* %a.addr}, metadata !19), !dbg !20 + %0 = load i32* %a.addr, align 4, !dbg !21 + ret i32 %0, !dbg !21 +} + +!llvm.dbg.cu = !{!0, !10} +!0 = metadata !{i32 786449, i32 0, i32 12, metadata !"simple.c", metadata !"/private/tmp", metadata !"clang version 3.3", i1 true, i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1} ; [ DW_TAG_compile_unit ] +!1 = metadata !{metadata !2} +!2 = metadata !{i32 0} +!3 = 
metadata !{metadata !5} +!5 = metadata !{i32 786478, i32 0, metadata !6, metadata !"test", metadata !"test", metadata !"", metadata !6, i32 2, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (i32)* @test, null, null, metadata !1, i32 3} ; [ DW_TAG_subprogram ] [line 2] [def] [scope 3] [test] +!6 = metadata !{i32 786473, metadata !"simple.c", metadata !"/private/tmp", null} ; [ DW_TAG_file_type ] +!7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] +!8 = metadata !{metadata !9, metadata !9} +!9 = metadata !{i32 786468, null, metadata !"int", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed] +!10 = metadata !{i32 786449, i32 0, i32 12, metadata !"simple2.c", metadata !"/private/tmp", metadata !"clang version 3.3 (trunk 172862)", i1 true, i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !11, metadata !1} ; [ DW_TAG_compile_unit ] +!11 = metadata !{metadata !13} +!13 = metadata !{i32 786478, i32 0, metadata !14, metadata !"fn", metadata !"fn", metadata !"", metadata !14, i32 1, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (i32)* @fn, null, null, metadata !1, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [fn] +!14 = metadata !{i32 786473, metadata !"simple2.c", metadata !"/private/tmp", null} ; [ DW_TAG_file_type ] +!15 = metadata !{i32 786689, metadata !5, metadata !"a", metadata !6, i32 16777218, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [a] [line 2] +!16 = metadata !{i32 2, i32 0, metadata !5, null} +!17 = metadata !{i32 4, i32 0, metadata !18, null} +!18 = metadata !{i32 786443, metadata !5, i32 3, i32 0, metadata !6, i32 0} ; [ DW_TAG_lexical_block ] +!19 = metadata !{i32 786689, metadata !13, metadata !"a", metadata !14, i32 16777217, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [a] [line 1] +!20 = metadata !{i32 1, i32 0, metadata !13, null} +!21 = metadata !{i32 2, i32 0, metadata !22, null} +!22 = metadata !{i32 786443, metadata !13, i32 1, i32 0, metadata !14, i32 0} ; [ DW_TAG_lexical_block ] diff --git a/test/DebugInfo/X86/stringpool.ll b/test/DebugInfo/X86/stringpool.ll index 21b0d09..1e08d54 100644 --- a/test/DebugInfo/X86/stringpool.ll +++ b/test/DebugInfo/X86/stringpool.ll @@ -6,10 +6,8 @@ !llvm.dbg.cu = !{!0} !0 = metadata !{i32 720913, i32 0, i32 12, metadata !"z.c", metadata !"/home/nicholas", metadata !"clang version 3.1 (trunk 143009)", i1 true, i1 true, metadata !"", i32 0, metadata !1, metadata !1, metadata !1, metadata !3} ; [ DW_TAG_compile_unit ] -!1 = metadata !{metadata !2} -!2 = metadata !{i32 0} -!3 = metadata !{metadata !4} -!4 = metadata !{metadata !5} +!1 = metadata !{i32 0} +!3 = metadata !{metadata !5} !5 = metadata !{i32 720948, i32 0, null, metadata !"yyyy", metadata !"yyyy", metadata !"", metadata !6, i32 1, metadata !7, i32 0, i32 1, i32* @yyyy} ; [ DW_TAG_variable ] !6 = metadata !{i32 720937, metadata !"z.c", metadata !"/home/nicholas", null} ; [ DW_TAG_file_type ] !7 = metadata !{i32 720932, null, metadata !"int", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] diff --git a/test/DebugInfo/X86/struct-loc.ll b/test/DebugInfo/X86/struct-loc.ll index 9a04738..485aa61 100644 --- a/test/DebugInfo/X86/struct-loc.ll +++ b/test/DebugInfo/X86/struct-loc.ll @@ -1,5 +1,5 @@ ; RUN: llc -mtriple=x86_64-apple-darwin %s -o %t 
-filetype=obj -; RUN: llvm-dwarfdump %t | FileCheck %s +; RUN: llvm-dwarfdump -debug-dump=info %t | FileCheck %s ; Make sure that structures have a decl file and decl line attached. ; CHECK: DW_TAG_structure_type [3] @@ -14,10 +14,8 @@ !llvm.dbg.cu = !{!0} !0 = metadata !{i32 786449, i32 0, i32 12, metadata !"struct_bug.c", metadata !"/Users/echristo/tmp", metadata !"clang version 3.1 (trunk 152837) (llvm/trunk 152845)", i1 true, i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !1, metadata !3} ; [ DW_TAG_compile_unit ] -!1 = metadata !{metadata !2} -!2 = metadata !{i32 0} -!3 = metadata !{metadata !4} -!4 = metadata !{metadata !5} +!1 = metadata !{i32 0} +!3 = metadata !{metadata !5} !5 = metadata !{i32 786484, i32 0, null, metadata !"f", metadata !"f", metadata !"", metadata !6, i32 5, metadata !7, i32 0, i32 1, %struct.foo* @f} ; [ DW_TAG_variable ] !6 = metadata !{i32 786473, metadata !"struct_bug.c", metadata !"/Users/echristo/tmp", null} ; [ DW_TAG_file_type ] !7 = metadata !{i32 786451, null, metadata !"foo", metadata !6, i32 1, i64 32, i64 32, i32 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_structure_type ] diff --git a/test/DebugInfo/X86/subrange-type.ll b/test/DebugInfo/X86/subrange-type.ll index 15202fb..e9d3e77 100644 --- a/test/DebugInfo/X86/subrange-type.ll +++ b/test/DebugInfo/X86/subrange-type.ll @@ -1,5 +1,5 @@ ; RUN: llc -O0 %s -mtriple=x86_64-unknown-linux-gnu -filetype=obj -o %t -; RUN: llvm-dwarfdump %t | FileCheck %s +; RUN: llvm-dwarfdump -debug-dump=info %t | FileCheck %s ; Make sure that the base type from the subrange type has a name. ; CHECK: 0x0000006b: DW_TAG_base_type [6] @@ -21,10 +21,8 @@ declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone !llvm.dbg.cu = !{!0} !0 = metadata !{i32 786449, i32 0, i32 12, metadata !"foo.c", metadata !"/usr/local/google/home/echristo/tmp", metadata !"clang version 3.3 (trunk 171472) (llvm/trunk 171487)", i1 true, i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/tmp/foo.c] [DW_LANG_C99] -!1 = metadata !{metadata !2} -!2 = metadata !{i32 0} -!3 = metadata !{metadata !4} -!4 = metadata !{metadata !5} +!1 = metadata !{i32 0} +!3 = metadata !{metadata !5} !5 = metadata !{i32 786478, i32 0, metadata !6, metadata !"main", metadata !"main", metadata !"", metadata !6, i32 2, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @main, null, null, metadata !1, i32 3} ; [ DW_TAG_subprogram ] [line 2] [def] [scope 3] [main] !6 = metadata !{i32 786473, metadata !"foo.c", metadata !"/usr/local/google/home/echristo/tmp", null} ; [ DW_TAG_file_type ] !7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] diff --git a/test/DebugInfo/X86/vector.ll b/test/DebugInfo/X86/vector.ll index 7b61e76..0ff99cc 100644 --- a/test/DebugInfo/X86/vector.ll +++ b/test/DebugInfo/X86/vector.ll @@ -1,5 +1,5 @@ ; RUN: llc -mtriple=x86_64-linux-gnu -O0 -filetype=obj -o %t %s -; RUN: llvm-dwarfdump %t | FileCheck %s +; RUN: llvm-dwarfdump -debug-dump=info %t | FileCheck %s ; Generated from: ; clang -g -S -emit-llvm -o foo.ll foo.c @@ -12,10 +12,8 @@ !llvm.dbg.cu = !{!0} !0 = metadata !{i32 786449, i32 0, i32 12, metadata !"foo.c", metadata !"/Users/echristo", metadata !"clang version 3.3 (trunk 171825) (llvm/trunk 171822)", i1 true, i1 false, metadata !"", i32 0, 
metadata !1, metadata !1, metadata !1, metadata !3} ; [ DW_TAG_compile_unit ] [/Users/echristo/foo.c] [DW_LANG_C99] -!1 = metadata !{metadata !2} -!2 = metadata !{i32 0} -!3 = metadata !{metadata !4} -!4 = metadata !{metadata !5} +!1 = metadata !{i32 0} +!3 = metadata !{metadata !5} !5 = metadata !{i32 786484, i32 0, null, metadata !"a", metadata !"a", metadata !"", metadata !6, i32 3, metadata !7, i32 0, i32 1, <4 x i32>* @a} ; [ DW_TAG_variable ] [a] [line 3] [def] !6 = metadata !{i32 786473, metadata !"foo.c", metadata !"/Users/echristo", null} ; [ DW_TAG_file_type ] !7 = metadata !{i32 786454, null, metadata !"v4si", metadata !6, i32 1, i64 0, i64 0, i64 0, i32 0, metadata !8} ; [ DW_TAG_typedef ] [v4si] [line 1, size 0, align 0, offset 0] [from ] diff --git a/test/DebugInfo/debuglineinfo.test b/test/DebugInfo/debuglineinfo.test new file mode 100644 index 0000000..14d2f82 --- /dev/null +++ b/test/DebugInfo/debuglineinfo.test @@ -0,0 +1,49 @@ +RUN: llvm-rtdyld -printline %p/Inputs/test-inline.o \ +RUN: | FileCheck %s -check-prefix TEST_INLINE +RUN: llvm-rtdyld -printline %p/Inputs/test-parameters.o \ +RUN: | FileCheck %s -check-prefix TEST_PARAMETERS + +; This test verifies that relocations are correctly applied to the +; .debug_line section and exercises DIContext::getLineInfoForAddressRange(). +; If relocations are not applied the first two functions will be reported as +; both starting at address zero in the; line number table. +TEST_INLINE: Function: _Z15test_parametersPfPA2_dR11char_structPPitm, Size = 170 +TEST_INLINE-NEXT: Line info @ 0: test-inline.cpp, line:33 +TEST_INLINE-NEXT: Line info @ 35: test-inline.cpp, line:34 +TEST_INLINE-NEXT: Line info @ 165: test-inline.cpp, line:35 +TEST_INLINE-NEXT: Function: _Z3foov, Size = 3 +TEST_INLINE-NEXT: Line info @ 0: test-inline.cpp, line:28 +TEST_INLINE-NEXT: Line info @ 2: test-inline.cpp, line:29 +TEST_INLINE-NEXT: Line info @ 3: test-inline.cpp, line:29 +TEST_INLINE-NEXT: Function: main, Size = 146 +TEST_INLINE-NEXT: Line info @ 0: test-inline.cpp, line:39 +TEST_INLINE-NEXT: Line info @ 21: test-inline.cpp, line:41 +TEST_INLINE-NEXT: Line info @ 39: test-inline.cpp, line:42 +TEST_INLINE-NEXT: Line info @ 60: test-inline.cpp, line:44 +TEST_INLINE-NEXT: Line info @ 80: test-inline.cpp, line:48 +TEST_INLINE-NEXT: Line info @ 90: test-inline.cpp, line:45 +TEST_INLINE-NEXT: Line info @ 95: test-inline.cpp, line:46 +TEST_INLINE-NEXT: Line info @ 114: test-inline.cpp, line:48 +TEST_INLINE-NEXT: Line info @ 141: test-inline.cpp, line:49 +TEST_INLINE-NEXT: Line info @ 146: test-inline.cpp, line:49 + +; This test checks the case where all code is in a single section. 
+TEST_PARAMETERS: Function: _Z15test_parametersPfPA2_dR11char_structPPitm, Size = 170 +TEST_PARAMETERS-NEXT: Line info @ 0: test-parameters.cpp, line:33 +TEST_PARAMETERS-NEXT: Line info @ 35: test-parameters.cpp, line:34 +TEST_PARAMETERS-NEXT: Line info @ 165: test-parameters.cpp, line:35 +TEST_PARAMETERS-NEXT: Function: _Z3foov, Size = 3 +TEST_PARAMETERS-NEXT: Line info @ 0: test-parameters.cpp, line:28 +TEST_PARAMETERS-NEXT: Line info @ 2: test-parameters.cpp, line:29 +TEST_PARAMETERS-NEXT: Function: main, Size = 146 +TEST_PARAMETERS-NEXT: Line info @ 0: test-parameters.cpp, line:39 +TEST_PARAMETERS-NEXT: Line info @ 21: test-parameters.cpp, line:41 +TEST_PARAMETERS-NEXT: Line info @ 39: test-parameters.cpp, line:42 +TEST_PARAMETERS-NEXT: Line info @ 60: test-parameters.cpp, line:44 +TEST_PARAMETERS-NEXT: Line info @ 80: test-parameters.cpp, line:48 +TEST_PARAMETERS-NEXT: Line info @ 90: test-parameters.cpp, line:45 +TEST_PARAMETERS-NEXT: Line info @ 95: test-parameters.cpp, line:46 +TEST_PARAMETERS-NEXT: Line info @ 114: test-parameters.cpp, line:48 +TEST_PARAMETERS-NEXT: Line info @ 141: test-parameters.cpp, line:49 +TEST_PARAMETERS-NEXT: Line info @ 146: test-parameters.cpp, line:49 + diff --git a/test/DebugInfo/dwarf-public-names.ll b/test/DebugInfo/dwarf-public-names.ll new file mode 100644 index 0000000..4ef4197 --- /dev/null +++ b/test/DebugInfo/dwarf-public-names.ll @@ -0,0 +1,124 @@ +; RUN: llc -generate-dwarf-pubnames -filetype=obj -o %t.o < %s +; RUN: llvm-dwarfdump -debug-dump=pubnames %t.o | FileCheck %s +; +; ModuleID = 'dwarf-public-names.cpp' +; +; Generated from: +; +; struct C { +; void member_function(); +; static int static_member_function(); +; static int static_member_variable; +; }; +; +; int C::static_member_variable = 0; +; +; void C::member_function() { +; static_member_variable = 0; +; } +; +; int C::static_member_function() { +; return static_member_variable; +; } +; +; C global_variable; +; +; int global_function() { +; return -1; +; } +; +; namespace ns { +; void global_namespace_function() { +; global_variable.member_function(); +; } +; int global_namespace_variable = 1; +; } + +; Skip the output to the header of the pubnames section. +; CHECK: debug_pubnames + +; Check for each name in the output. 
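; Editorial note: each .debug_pubnames entry pairs a DIE offset within the
; compile unit with a public name, so the dumped section reads roughly like
; (illustrative offsets only):
;   0x00000026 "global_namespace_variable"
;   0x00000076 "global_function"
; which is why matching on the bare names below is sufficient.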
+; CHECK: global_namespace_variable +; CHECK: global_namespace_function +; CHECK: static_member_function +; CHECK: global_variable +; CHECK: global_function +; CHECK: member_function + +%struct.C = type { i8 } + +@_ZN1C22static_member_variableE = global i32 0, align 4 +@global_variable = global %struct.C zeroinitializer, align 1 +@_ZN2ns25global_namespace_variableE = global i32 1, align 4 + +define void @_ZN1C15member_functionEv(%struct.C* %this) nounwind uwtable align 2 { +entry: + %this.addr = alloca %struct.C*, align 8 + store %struct.C* %this, %struct.C** %this.addr, align 8 + call void @llvm.dbg.declare(metadata !{%struct.C** %this.addr}, metadata !28), !dbg !30 + %this1 = load %struct.C** %this.addr + store i32 0, i32* @_ZN1C22static_member_variableE, align 4, !dbg !31 + ret void, !dbg !32 +} + +declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone + +define i32 @_ZN1C22static_member_functionEv() nounwind uwtable align 2 { +entry: + %0 = load i32* @_ZN1C22static_member_variableE, align 4, !dbg !33 + ret i32 %0, !dbg !33 +} + +define i32 @_Z15global_functionv() nounwind uwtable { +entry: + ret i32 -1, !dbg !34 +} + +define void @_ZN2ns25global_namespace_functionEv() nounwind uwtable { +entry: + call void @_ZN1C15member_functionEv(%struct.C* @global_variable), !dbg !35 + ret void, !dbg !36 +} + +attributes #0 = { nounwind uwtable } +attributes #1 = { nounwind readnone } + +!llvm.dbg.cu = !{!0} + +!0 = metadata !{i32 786449, i32 0, i32 4, metadata !"dwarf-public-names.cpp", metadata !"/usr2/kparzysz/s.hex/t", metadata !"clang version 3.3 (http://llvm.org/git/clang.git a09cd8103a6a719cb2628cdf0c91682250a17bd2) (http://llvm.org/git/llvm.git 47d03cec0afca0c01ae42b82916d1d731716cd20)", i1 true, i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !2, metadata !24} ; [ DW_TAG_compile_unit ] [/usr2/kparzysz/s.hex/t/dwarf-public-names.cpp] [DW_LANG_C_plus_plus] +!1 = metadata !{i32 0} +!2 = metadata !{metadata !3, metadata !18, metadata !19, metadata !20} +!3 = metadata !{i32 786478, i32 0, null, metadata !"member_function", metadata !"member_function", metadata !"_ZN1C15member_functionEv", metadata !4, i32 9, metadata !5, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%struct.C*)* @_ZN1C15member_functionEv, null, metadata !12, metadata !1, i32 9} ; [ DW_TAG_subprogram ] [line 9] [def] [member_function] +!4 = metadata !{i32 786473, metadata !"dwarf-public-names.cpp", metadata !"/usr2/kparzysz/s.hex/t", null} ; [ DW_TAG_file_type ] +!5 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !6, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] +!6 = metadata !{null, metadata !7} +!7 = metadata !{i32 786447, i32 0, metadata !"", i32 0, i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !8} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from C] +!8 = metadata !{i32 786451, null, metadata !"C", metadata !4, i32 1, i64 8, i64 8, i32 0, i32 0, null, metadata !9, i32 0, null, null} ; [ DW_TAG_structure_type ] [C] [line 1, size 8, align 8, offset 0] [from ] +!9 = metadata !{metadata !10, metadata !12, metadata !14} +!10 = metadata !{i32 786445, metadata !8, metadata !"static_member_variable", metadata !4, i32 4, i64 0, i64 0, i64 0, i32 4096, metadata !11, null} ; [ DW_TAG_member ] [static_member_variable] [line 4, size 0, align 0, offset 0] [static] [from int] +!11 = metadata !{i32 786468, null, metadata !"int", null, i32 0, i64 32, i64 32, i64 
0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed] +!12 = metadata !{i32 786478, i32 0, metadata !8, metadata !"member_function", metadata !"member_function", metadata !"_ZN1C15member_functionEv", metadata !4, i32 2, metadata !5, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, null, i32 0, metadata !13, i32 2} ; [ DW_TAG_subprogram ] [line 2] [member_function] +!13 = metadata !{i32 786468} ; [ DW_TAG_base_type ] [line 0, size 0, align 0, offset 0] +!14 = metadata !{i32 786478, i32 0, metadata !8, metadata !"static_member_function", metadata !"static_member_function", metadata !"_ZN1C22static_member_functionEv", metadata !4, i32 3, metadata !15, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, null, i32 0, metadata !17, i32 3} ; [ DW_TAG_subprogram ] [line 3] [static_member_function] +!15 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !16, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] +!16 = metadata !{metadata !11} +!17 = metadata !{i32 786468} ; [ DW_TAG_base_type ] [line 0, size 0, align 0, offset 0] +!18 = metadata !{i32 786478, i32 0, null, metadata !"static_member_function", metadata !"static_member_function", metadata !"_ZN1C22static_member_functionEv", metadata !4, i32 13, metadata !15, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @_ZN1C22static_member_functionEv, null, metadata !14, metadata !1, i32 13} ; [ DW_TAG_subprogram ] [line 13] [def] [static_member_function] +!19 = metadata !{i32 786478, i32 0, metadata !4, metadata !"global_function", metadata !"global_function", metadata !"_Z15global_functionv", metadata !4, i32 19, metadata !15, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @_Z15global_functionv, null, null, metadata !1, i32 19} ; [ DW_TAG_subprogram ] [line 19] [def] [global_function] +!20 = metadata !{i32 786478, i32 0, metadata !21, metadata !"global_namespace_function", metadata !"global_namespace_function", metadata !"_ZN2ns25global_namespace_functionEv", metadata !4, i32 24, metadata !22, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void ()* @_ZN2ns25global_namespace_functionEv, null, null, metadata !1, i32 24} ; [ DW_TAG_subprogram ] [line 24] [def] [global_namespace_function] +!21 = metadata !{i32 786489, null, metadata !"ns", metadata !4, i32 23} ; [ DW_TAG_namespace ] [/usr2/kparzysz/s.hex/t/dwarf-public-names.cpp] +!22 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !23, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] +!23 = metadata !{null} +!24 = metadata !{metadata !25, metadata !26, metadata !27} +!25 = metadata !{i32 786484, i32 0, metadata !8, metadata !"static_member_variable", metadata !"static_member_variable", metadata !"_ZN1C22static_member_variableE", metadata !4, i32 7, metadata !11, i32 0, i32 1, i32* @_ZN1C22static_member_variableE, metadata !10} ; [ DW_TAG_variable ] [static_member_variable] [line 7] [def] +!26 = metadata !{i32 786484, i32 0, null, metadata !"global_variable", metadata !"global_variable", metadata !"", metadata !4, i32 17, metadata !8, i32 0, i32 1, %struct.C* @global_variable, null} ; [ DW_TAG_variable ] [global_variable] [line 17] [def] +!27 = metadata !{i32 786484, i32 0, metadata !21, metadata !"global_namespace_variable", metadata !"global_namespace_variable", metadata 
!"_ZN2ns25global_namespace_variableE", metadata !4, i32 27, metadata !11, i32 0, i32 1, i32* @_ZN2ns25global_namespace_variableE, null} ; [ DW_TAG_variable ] [global_namespace_variable] [line 27] [def] +!28 = metadata !{i32 786689, metadata !3, metadata !"this", metadata !4, i32 16777225, metadata !29, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [this] [line 9] +!29 = metadata !{i32 786447, null, metadata !"", null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !8} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from C] +!30 = metadata !{i32 9, i32 0, metadata !3, null} +!31 = metadata !{i32 10, i32 0, metadata !3, null} +!32 = metadata !{i32 11, i32 0, metadata !3, null} +!33 = metadata !{i32 14, i32 0, metadata !18, null} +!34 = metadata !{i32 20, i32 0, metadata !19, null} +!35 = metadata !{i32 25, i32 0, metadata !20, null} +!36 = metadata !{i32 26, i32 0, metadata !20, null} diff --git a/test/DebugInfo/dwarfdump-debug-frame-simple.test b/test/DebugInfo/dwarfdump-debug-frame-simple.test new file mode 100644 index 0000000..c2427d8 --- /dev/null +++ b/test/DebugInfo/dwarfdump-debug-frame-simple.test @@ -0,0 +1,28 @@ +; RUN: llvm-dwarfdump %p/Inputs/dwarfdump-test-32bit.elf.o -debug-dump=frames | FileCheck %s -check-prefix FRAMES +; Note: the input file was generated from Inputs/dwarfdump-test-32bit.elf.c + +; FRAMES: .debug_frame +; FRAMES-NOT: .eh_frame + +; FRAMES: 00000000 00000010 ffffffff CIE +; FRAMES: Version: 1 +; FRAMES: DW_CFA_def_cfa +; FRAMES-NEXT: DW_CFA_offset +; FRAMES-NEXT: DW_CFA_nop +; FRAMES-NEXT: DW_CFA_nop + +; FRAMES: 00000014 00000010 00000000 FDE cie=00000000 pc=00000000...00000022 +; FRAMES: DW_CFA_advance_loc +; FRAMES-NEXT: DW_CFA_def_cfa_offset +; FRAMES-NEXT: DW_CFA_nop + +; FRAMES: 00000028 00000014 00000000 FDE cie=00000000 pc=00000030...00000080 +; FRAMES: DW_CFA_advance_loc +; FRAMES-NEXT: DW_CFA_def_cfa_offset +; FRAMES-NEXT: DW_CFA_offset +; FRAMES-NEXT: DW_CFA_advance_loc +; FRAMES-NEXT: DW_CFA_def_cfa_register + +; FRAMES-NOT: CIE +; FRAMES-NOT: FDE + diff --git a/test/DebugInfo/dwarfdump-dump-flags.test b/test/DebugInfo/dwarfdump-dump-flags.test new file mode 100644 index 0000000..92b2d50 --- /dev/null +++ b/test/DebugInfo/dwarfdump-dump-flags.test @@ -0,0 +1,13 @@ +; RUN: llvm-dwarfdump %p/Inputs/dwarfdump-test.elf-x86-64 -debug-dump=all | FileCheck %s -check-prefix DUMP_ALL +; RUN: llvm-dwarfdump %p/Inputs/dwarfdump-test.elf-x86-64 -debug-dump=info | FileCheck %s -check-prefix DUMP_INFO +; RUN: llvm-dwarfdump %p/Inputs/dwarfdump-test.elf-x86-64 -debug-dump=ranges | FileCheck %s -check-prefix DUMP_RANGES + +; DUMP_ALL: .debug_info +; DUMP_ALL: .debug_ranges + +; DUMP_INFO: .debug_info +; DUMP_INFO-NOT: .debug_ranges + +; DUMP_RANGES-NOT: .debug_info +; DUMP_RANGES: .debug_ranges + diff --git a/test/DebugInfo/dwarfdump-inlining.test b/test/DebugInfo/dwarfdump-inlining.test index d3a7e12..e926634 100644 --- a/test/DebugInfo/dwarfdump-inlining.test +++ b/test/DebugInfo/dwarfdump-inlining.test @@ -1,28 +1,28 @@ -RUN: llvm-dwarfdump %p/Inputs/dwarfdump-inl-test.elf-x86-64 --address=0x613 \ +RUN: llvm-dwarfdump %p/Inputs/dwarfdump-inl-test.elf-x86-64 --address=0x710 \ RUN: --inlining --functions | FileCheck %s -check-prefix DEEP_STACK -RUN: llvm-dwarfdump %p/Inputs/dwarfdump-inl-test.elf-x86-64 --address=0x6de \ +RUN: llvm-dwarfdump %p/Inputs/dwarfdump-inl-test.elf-x86-64 --address=0x7d1 \ RUN: --inlining | FileCheck %s -check-prefix SHORTER_STACK -RUN: llvm-dwarfdump %p/Inputs/dwarfdump-inl-test.elf-x86-64 --address=0x685 \ +RUN: 
llvm-dwarfdump %p/Inputs/dwarfdump-inl-test.elf-x86-64 --address=0x785 \ RUN: --inlining | FileCheck %s -check-prefix SHORT_STACK -RUN: llvm-dwarfdump %p/Inputs/dwarfdump-inl-test.elf-x86-64 --address=0x640 \ +RUN: llvm-dwarfdump %p/Inputs/dwarfdump-inl-test.elf-x86-64 --address=0x737 \ RUN: --functions | FileCheck %s -check-prefix INL_FUNC_NAME DEEP_STACK: inlined_h -DEEP_STACK-NEXT: header.h:2:21 +DEEP_STACK-NEXT: dwarfdump-inl-test.h:2 DEEP_STACK-NEXT: inlined_g -DEEP_STACK-NEXT: header.h:7 +DEEP_STACK-NEXT: dwarfdump-inl-test.h:7 DEEP_STACK-NEXT: inlined_f -DEEP_STACK-NEXT: main.cc:3 +DEEP_STACK-NEXT: dwarfdump-inl-test.cc:3 DEEP_STACK-NEXT: main -DEEP_STACK-NEXT: main.cc:8 +DEEP_STACK-NEXT: dwarfdump-inl-test.cc:8 -SHORTER_STACK: header.h:7:20 -SHORTER_STACK-NEXT: main.cc:3 -SHORTER_STACK-NEXT: main.cc:8 +SHORTER_STACK: dwarfdump-inl-test.h:7 +SHORTER_STACK-NEXT: dwarfdump-inl-test.cc:3 +SHORTER_STACK-NEXT: dwarfdump-inl-test.cc:8 -SHORT_STACK: main.cc:3:20 -SHORT_STACK-NEXT: main.cc:8 +SHORT_STACK: dwarfdump-inl-test.cc:3 +SHORT_STACK-NEXT: dwarfdump-inl-test.cc:8 INL_FUNC_NAME: inlined_g -INL_FUNC_NAME-NEXT: header.h:7:20 +INL_FUNC_NAME-NEXT: dwarfdump-inl-test.h:7 diff --git a/test/DebugInfo/dwarfdump-pubnames.test b/test/DebugInfo/dwarfdump-pubnames.test new file mode 100644 index 0000000..e1b16c2 --- /dev/null +++ b/test/DebugInfo/dwarfdump-pubnames.test @@ -0,0 +1,16 @@ +RUN: llvm-dwarfdump %p/Inputs/dwarfdump-pubnames.elf-x86-64 \ +RUN: -debug-dump=pubnames | FileCheck %s + +CHECK: .debug_pubnames contents: +CHECK: Length: 161 +CHECK: Version: 2 +CHECK: Offset in .debug_info: 0 +CHECK: Size: 321 + +CHECK: Offset Name +CHECK: 98 global_namespace_variable +CHECK: a7 global_namespace_function +CHECK: ec static_member_function +CHECK: 7c global_variable +CHECK: 103 global_function +CHECK: c2 member_function diff --git a/test/DebugInfo/dwarfdump-test.test b/test/DebugInfo/dwarfdump-test.test index 973c344..355445e 100644 --- a/test/DebugInfo/dwarfdump-test.test +++ b/test/DebugInfo/dwarfdump-test.test @@ -1,56 +1,56 @@ RUN: llvm-dwarfdump %p/Inputs/dwarfdump-test.elf-x86-64 \ -RUN: --address=0x400589 --functions | FileCheck %s -check-prefix MAIN +RUN: --address=0x400559 --functions | FileCheck %s -check-prefix MAIN RUN: llvm-dwarfdump %p/Inputs/dwarfdump-test.elf-x86-64 \ -RUN: --address=0x400558 --functions | FileCheck %s -check-prefix FUNCTION +RUN: --address=0x400528 --functions | FileCheck %s -check-prefix FUNCTION RUN: llvm-dwarfdump %p/Inputs/dwarfdump-test.elf-x86-64 \ -RUN: --address=0x4005b6 --functions | FileCheck %s -check-prefix CTOR_WITH_SPEC +RUN: --address=0x400586 --functions | FileCheck %s -check-prefix CTOR_WITH_SPEC RUN: llvm-dwarfdump %p/Inputs/dwarfdump-test2.elf-x86-64 \ -RUN: --address=0x4004b8 --functions | FileCheck %s -check-prefix MANY_CU_1 +RUN: --address=0x4004e8 --functions | FileCheck %s -check-prefix MANY_CU_1 RUN: llvm-dwarfdump %p/Inputs/dwarfdump-test2.elf-x86-64 \ -RUN: --address=0x4004c4 --functions | FileCheck %s -check-prefix MANY_CU_2 +RUN: --address=0x4004f4 --functions | FileCheck %s -check-prefix MANY_CU_2 RUN: llvm-dwarfdump %p/Inputs/dwarfdump-test3.elf-x86-64 \ -RUN: --address=0x580 --functions | FileCheck %s -check-prefix ABS_ORIGIN_1 +RUN: --address=0x640 --functions | FileCheck %s -check-prefix ABS_ORIGIN_1 RUN: llvm-dwarfdump %p/Inputs/dwarfdump-test3.elf-x86-64 \ -RUN: --address=0x573 --functions | FileCheck %s -check-prefix INCLUDE_TEST_1 +RUN: --address=0x633 --functions | FileCheck %s -check-prefix INCLUDE_TEST_1 RUN: 
llvm-dwarfdump %p/Inputs/dwarfdump-test3.elf-x86-64 \ -RUN: --address=0x56d --functions | FileCheck %s -check-prefix INCLUDE_TEST_2 +RUN: --address=0x62d --functions | FileCheck %s -check-prefix INCLUDE_TEST_2 RUN: llvm-dwarfdump %p/Inputs/dwarfdump-test4.elf-x86-64 \ -RUN: --address=0x55c --functions \ +RUN: --address=0x62c --functions \ RUN: | FileCheck %s -check-prefix MANY_SEQ_IN_LINE_TABLE RUN: llvm-dwarfdump %p/Inputs/dwarfdump-test4.elf-x86-64 \ RUN: | FileCheck %s -check-prefix DEBUG_RANGES MAIN: main -MAIN-NEXT: /tmp/dbginfo{{[/\\]}}dwarfdump-test.cc:16:10 +MAIN-NEXT: /tmp/dbginfo{{[/\\]}}dwarfdump-test.cc:16 FUNCTION: _Z1fii -FUNCTION-NEXT: /tmp/dbginfo{{[/\\]}}dwarfdump-test.cc:11:18 +FUNCTION-NEXT: /tmp/dbginfo{{[/\\]}}dwarfdump-test.cc:11 -CTOR_WITH_SPEC: _ZN10DummyClassC1Ei -CTOR_WITH_SPEC-NEXT: /tmp/dbginfo{{[/\\]}}dwarfdump-test.cc:4:30 +CTOR_WITH_SPEC: DummyClass +CTOR_WITH_SPEC-NEXT: /tmp/dbginfo{{[/\\]}}dwarfdump-test.cc:4 MANY_CU_1: a -MANY_CU_1-NEXT: /tmp/dbginfo{{[/\\]}}a.cc:2:0 +MANY_CU_1-NEXT: /tmp/dbginfo{{[/\\]}}dwarfdump-test2-helper.cc:2 MANY_CU_2: main -MANY_CU_2-NEXT: /tmp/dbginfo{{[/\\]}}main.cc:4:0 +MANY_CU_2-NEXT: /tmp/dbginfo{{[/\\]}}dwarfdump-test2-main.cc:4 ABS_ORIGIN_1: C -ABS_ORIGIN_1-NEXT: /tmp/dbginfo{{[/\\]}}def2.cc:4:0 +ABS_ORIGIN_1-NEXT: /tmp/dbginfo{{[/\\]}}dwarfdump-test3.cc:3 -INCLUDE_TEST_1: _Z3do2v -INCLUDE_TEST_1-NEXT: /tmp/dbginfo{{[/\\]}}include{{[/\\]}}decl2.h:1:0 +INCLUDE_TEST_1: _Z3do1v +INCLUDE_TEST_1-NEXT: /tmp/include{{[/\\]}}dwarfdump-test3-decl.h:7 -INCLUDE_TEST_2: _Z3do1v -INCLUDE_TEST_2-NEXT: /tmp/include{{[/\\]}}decl.h:5:0 +INCLUDE_TEST_2: _Z3do2v +INCLUDE_TEST_2-NEXT: /tmp/dbginfo{{[/\\]}}include{{[/\\]}}dwarfdump-test3-decl2.h:1 MANY_SEQ_IN_LINE_TABLE: _Z1cv -MANY_SEQ_IN_LINE_TABLE-NEXT: /tmp/dbginfo/sequences{{[/\\]}}c.cc:2:0 +MANY_SEQ_IN_LINE_TABLE-NEXT: /tmp/dbginfo{{[/\\]}}dwarfdump-test4-part1.cc:2 DEBUG_RANGES: .debug_ranges contents: -DEBUG_RANGES-NEXT: 00000000 000000000000055c 0000000000000567 -DEBUG_RANGES-NEXT: 00000000 0000000000000567 000000000000056d +DEBUG_RANGES-NEXT: 00000000 000000000000062c 0000000000000637 +DEBUG_RANGES-NEXT: 00000000 0000000000000637 000000000000063d DEBUG_RANGES-NEXT: 00000000 <End of list> -DEBUG_RANGES-NEXT: 00000030 0000000000000570 000000000000057b -DEBUG_RANGES-NEXT: 00000030 0000000000000567 000000000000056d +DEBUG_RANGES-NEXT: 00000030 0000000000000640 000000000000064b +DEBUG_RANGES-NEXT: 00000030 0000000000000637 000000000000063d DEBUG_RANGES-NEXT: 00000030 <End of list> diff --git a/test/DebugInfo/inlined-vars.ll b/test/DebugInfo/inlined-vars.ll index ed4e7da..b25f3fa 100644 --- a/test/DebugInfo/inlined-vars.ll +++ b/test/DebugInfo/inlined-vars.ll @@ -4,8 +4,8 @@ define i32 @main() uwtable { entry: - tail call void @llvm.dbg.value(metadata !2, i64 0, metadata !18), !dbg !21 - tail call void @llvm.dbg.value(metadata !2, i64 0, metadata !22), !dbg !23 + tail call void @llvm.dbg.value(metadata !1, i64 0, metadata !18), !dbg !21 + tail call void @llvm.dbg.value(metadata !1, i64 0, metadata !22), !dbg !23 tail call void @smth(i32 0), !dbg !24 tail call void @smth(i32 0), !dbg !25 ret i32 0, !dbg !19 @@ -18,10 +18,8 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone !llvm.dbg.cu = !{!0} !0 = metadata !{i32 786449, i32 0, i32 4, metadata !"inline-bug.cc", metadata !"/tmp/dbginfo/pr13202", metadata !"clang version 3.2 (trunk 159419)", i1 true, i1 true, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1} ; [ DW_TAG_compile_unit ] -!1 = metadata 
!{metadata !2} -!2 = metadata !{i32 0} -!3 = metadata !{metadata !4} -!4 = metadata !{metadata !5, metadata !10} +!1 = metadata !{i32 0} +!3 = metadata !{metadata !5, metadata !10} !5 = metadata !{i32 786478, i32 0, metadata !6, metadata !"main", metadata !"main", metadata !"", metadata !6, i32 10, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 ()* @main, null, null, metadata !1, i32 10} ; [ DW_TAG_subprogram ] !6 = metadata !{i32 786473, metadata !"inline-bug.cc", metadata !"/tmp/dbginfo/pr13202", null} ; [ DW_TAG_file_type ] !7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] @@ -39,19 +37,17 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone ; ARGUMENT: {{.*Abbrev.*DW_TAG_formal_parameter}} ; ARGUMENT-NOT: {{.*Abbrev.*DW_TAG_formal_parameter}} -!16 = metadata !{i32 786688, metadata !17, metadata !"local", metadata !6, i32 4, metadata !9, i32 0, i32 0} ; [ DW_TAG_auto_variable ] +!16 = metadata !{i32 786688, metadata !10, metadata !"local", metadata !6, i32 4, metadata !9, i32 0, i32 0} ; [ DW_TAG_auto_variable ] ; Two DW_TAG_variable: one abstract and one inlined. ; VARIABLE: {{.*Abbrev.*DW_TAG_variable}} ; VARIABLE: {{.*Abbrev.*DW_TAG_variable}} ; VARIABLE-NOT: {{.*Abbrev.*DW_TAG_variable}} -!17 = metadata !{i32 786443, metadata !10, i32 3, i32 35, metadata !6, i32 1} ; [ DW_TAG_lexical_block ] !18 = metadata !{i32 786689, metadata !10, metadata !"argument", metadata !6, i32 16777219, metadata !9, i32 0, metadata !19} ; [ DW_TAG_arg_variable ] -!19 = metadata !{i32 11, i32 10, metadata !20, null} -!20 = metadata !{i32 786443, metadata !5, i32 10, i32 12, metadata !6, i32 0} ; [ DW_TAG_lexical_block ] +!19 = metadata !{i32 11, i32 10, metadata !5, null} !21 = metadata !{i32 3, i32 25, metadata !10, metadata !19} -!22 = metadata !{i32 786688, metadata !17, metadata !"local", metadata !6, i32 4, metadata !9, i32 0, metadata !19} ; [ DW_TAG_auto_variable ] -!23 = metadata !{i32 4, i32 16, metadata !17, metadata !19} -!24 = metadata !{i32 5, i32 3, metadata !17, metadata !19} -!25 = metadata !{i32 6, i32 3, metadata !17, metadata !19} +!22 = metadata !{i32 786688, metadata !10, metadata !"local", metadata !6, i32 4, metadata !9, i32 0, metadata !19} ; [ DW_TAG_auto_variable ] +!23 = metadata !{i32 4, i32 16, metadata !10, metadata !19} +!24 = metadata !{i32 5, i32 3, metadata !10, metadata !19} +!25 = metadata !{i32 6, i32 3, metadata !10, metadata !19} diff --git a/test/DebugInfo/member-pointers.ll b/test/DebugInfo/member-pointers.ll index 47874d9..1dbadf2 100644 --- a/test/DebugInfo/member-pointers.ll +++ b/test/DebugInfo/member-pointers.ll @@ -1,7 +1,12 @@ ; RUN: llc -filetype=obj -O0 < %s > %t -; RUN: llvm-dwarfdump %t | FileCheck %s +; RUN: llvm-dwarfdump -debug-dump=info %t | FileCheck %s ; CHECK: DW_TAG_ptr_to_member_type +; CHECK: [[TYPE:.*]]: DW_TAG_subroutine_type +; CHECK: DW_TAG_formal_parameter +; CHECK-NEXT: DW_AT_type +; CHECK-NEXT: DW_AT_artificial [DW_FORM_flag_present] ; CHECK: DW_TAG_ptr_to_member_type +; CHECK-NEXT: DW_AT_type [DW_FORM_ref4] (cu + {{.*}} => {[[TYPE]]}) ; IR generated from clang -g with the following source: ; struct S { ; }; @@ -15,16 +20,15 @@ !llvm.dbg.cu = !{!0} !0 = metadata !{i32 786449, i32 0, i32 4, metadata !"simple.cpp", metadata !"/home/blaikie/Development/scratch", metadata !"clang version 3.3 ", i1 true, i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !1, metadata !3} ; [ 
DW_TAG_compile_unit ] [/home/blaikie/Development/scratch/simple.cpp] [DW_LANG_C_plus_plus] -!1 = metadata !{metadata !2} -!2 = metadata !{i32 0} -!3 = metadata !{metadata !4} -!4 = metadata !{metadata !5, metadata !10} -!5 = metadata !{i32 786484, i32 0, null, metadata !"x", metadata !"x", metadata !"", metadata !6, i32 2, metadata !7, i32 0, i32 1, i64* @x} ; [ DW_TAG_variable ] [x] [line 2] [def] +!1 = metadata !{i32 0} +!3 = metadata !{metadata !5, metadata !10} +!5 = metadata !{i32 786484, i32 0, null, metadata !"x", metadata !"x", metadata !"", metadata !6, i32 4, metadata !7, i32 0, i32 1, i64* @x, null} ; [ DW_TAG_variable ] [x] [line 4] [def] !6 = metadata !{i32 786473, metadata !"simple.cpp", metadata !"/home/blaikie/Development/scratch", null} ; [ DW_TAG_file_type ] !7 = metadata !{i32 786463, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !8, metadata !9} ; [ DW_TAG_ptr_to_member_type ] [line 0, size 0, align 0, offset 0] [from int] !8 = metadata !{i32 786468, null, metadata !"int", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed] -!9 = metadata !{i32 786451, null, metadata !"S", metadata !6, i32 1, i64 8, i64 8, i32 0, i32 0, null, metadata !2, i32 0, null, null} ; [ DW_TAG_structure_type ] [S] [line 1, size 8, align 8, offset 0] [from ] -!10 = metadata !{i32 786484, i32 0, null, metadata !"y", metadata !"y", metadata !"", metadata !6, i32 3, metadata !11, i32 0, i32 1, { i64, i64 }* @y} ; [ DW_TAG_variable ] [y] [line 3] [def] +!9 = metadata !{i32 786451, null, metadata !"S", metadata !6, i32 1, i64 8, i64 8, i32 0, i32 0, null, metadata !1, i32 0, null, null} ; [ DW_TAG_structure_type ] [S] [line 1, size 8, align 8, offset 0] [from ] +!10 = metadata !{i32 786484, i32 0, null, metadata !"y", metadata !"y", metadata !"", metadata !6, i32 5, metadata !11, i32 0, i32 1, { i64, i64 }* @y, null} ; [ DW_TAG_variable ] [y] [line 5] [def] !11 = metadata !{i32 786463, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !12, metadata !9} ; [ DW_TAG_ptr_to_member_type ] [line 0, size 0, align 0, offset 0] [from ] !12 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !13, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] -!13 = metadata !{null, metadata !8} +!13 = metadata !{null, metadata !14, metadata !8} +!14 = metadata !{i32 786447, i32 0, metadata !"", i32 0, i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !9} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from S] diff --git a/test/DebugInfo/two-cus-from-same-file.ll b/test/DebugInfo/two-cus-from-same-file.ll index d01aeea..fe50920 100644 --- a/test/DebugInfo/two-cus-from-same-file.ll +++ b/test/DebugInfo/two-cus-from-same-file.ll @@ -4,7 +4,7 @@ ; ; RUN: llc %s -o %t -filetype=obj -O0 -; RUN: llvm-dwarfdump %t | FileCheck %s +; RUN: llvm-dwarfdump -debug-dump=info %t | FileCheck %s ; ModuleID = 'test.bc' @@ -33,17 +33,14 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone !llvm.dbg.cu = !{!0, !9} !0 = metadata !{i32 786449, i32 0, i32 12, metadata !"foo.c", metadata !"/tmp", metadata !"clang version 3.2 (trunk 156513)", i1 true, i1 true, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1} ; [ DW_TAG_compile_unit ] -!1 = metadata !{metadata !2} -!2 = metadata !{i32 0} -!3 = metadata !{metadata !4} -!4 = metadata !{metadata !5} +!1 = metadata !{i32 0} +!3 = 
metadata !{metadata !5} !5 = metadata !{i32 786478, i32 0, metadata !6, metadata !"foo", metadata !"foo", metadata !"", metadata !6, i32 5, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, void ()* @foo, null, null, metadata !1, i32 5} ; [ DW_TAG_subprogram ] !6 = metadata !{i32 786473, metadata !"foo.c", metadata !"/tmp", null} ; [ DW_TAG_file_type ] !7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] !8 = metadata !{null} !9 = metadata !{i32 786449, i32 0, i32 12, metadata !"foo.c", metadata !"/tmp", metadata !"clang version 3.2 (trunk 156513)", i1 true, i1 true, metadata !"", i32 0, metadata !1, metadata !1, metadata !10, metadata !1} ; [ DW_TAG_compile_unit ] -!10 = metadata !{metadata !11} -!11 = metadata !{metadata !12} +!10 = metadata !{metadata !12} !12 = metadata !{i32 786478, i32 0, metadata !6, metadata !"main", metadata !"main", metadata !"", metadata !6, i32 11, metadata !13, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i32, i8**)* @main, null, null, metadata !19, i32 11} ; [ DW_TAG_subprogram ] !13 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !14, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] !14 = metadata !{metadata !15, metadata !15, metadata !16} diff --git a/test/ExecutionEngine/2002-12-16-ArgTest.ll b/test/ExecutionEngine/2002-12-16-ArgTest.ll index 4c03519..e04bf03 100644 --- a/test/ExecutionEngine/2002-12-16-ArgTest.ll +++ b/test/ExecutionEngine/2002-12-16-ArgTest.ll @@ -1,5 +1,5 @@ ; RUN: %lli %s > /dev/null -; XFAIL: arm +; XFAIL: armv7 @.LC0 = internal global [10 x i8] c"argc: %d\0A\00" ; <[10 x i8]*> [#uses=1] diff --git a/test/ExecutionEngine/MCJIT/2002-12-16-ArgTest.ll b/test/ExecutionEngine/MCJIT/2002-12-16-ArgTest.ll index 28cc54a..babd8f6 100644 --- a/test/ExecutionEngine/MCJIT/2002-12-16-ArgTest.ll +++ b/test/ExecutionEngine/MCJIT/2002-12-16-ArgTest.ll @@ -1,4 +1,4 @@ -; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null +; RUN: %lli_mcjit %s > /dev/null @.LC0 = internal global [10 x i8] c"argc: %d\0A\00" ; <[10 x i8]*> [#uses=1] diff --git a/test/ExecutionEngine/MCJIT/2003-01-04-ArgumentBug.ll b/test/ExecutionEngine/MCJIT/2003-01-04-ArgumentBug.ll index 9f89598..bbb81b8 100644 --- a/test/ExecutionEngine/MCJIT/2003-01-04-ArgumentBug.ll +++ b/test/ExecutionEngine/MCJIT/2003-01-04-ArgumentBug.ll @@ -1,4 +1,4 @@ -; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null +; RUN: %lli_mcjit %s > /dev/null define i32 @foo(i32 %X, i32 %Y, double %A) { %cond212 = fcmp une double %A, 1.000000e+00 ; <i1> [#uses=1] diff --git a/test/ExecutionEngine/MCJIT/2003-01-04-LoopTest.ll b/test/ExecutionEngine/MCJIT/2003-01-04-LoopTest.ll index 997b2a9..7574267 100644 --- a/test/ExecutionEngine/MCJIT/2003-01-04-LoopTest.ll +++ b/test/ExecutionEngine/MCJIT/2003-01-04-LoopTest.ll @@ -1,4 +1,4 @@ -; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null +; RUN: %lli_mcjit %s > /dev/null define i32 @main() { call i32 @mylog( i32 4 ) ; <i32>:1 [#uses=0] diff --git a/test/ExecutionEngine/MCJIT/2003-01-04-PhiTest.ll b/test/ExecutionEngine/MCJIT/2003-01-04-PhiTest.ll index ba35b5b..261939a 100644 --- a/test/ExecutionEngine/MCJIT/2003-01-04-PhiTest.ll +++ b/test/ExecutionEngine/MCJIT/2003-01-04-PhiTest.ll @@ -1,4 +1,4 @@ -; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null +; RUN: %lli_mcjit %s > /dev/null define i32 @main() { ; <label>:0 diff --git 
a/test/ExecutionEngine/MCJIT/2003-01-09-SARTest.ll b/test/ExecutionEngine/MCJIT/2003-01-09-SARTest.ll index f3c88ad..f76f998 100644 --- a/test/ExecutionEngine/MCJIT/2003-01-09-SARTest.ll +++ b/test/ExecutionEngine/MCJIT/2003-01-09-SARTest.ll @@ -1,4 +1,4 @@ -; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null +; RUN: %lli_mcjit %s > /dev/null ; We were accidentally inverting the signedness of right shifts. Whoops. diff --git a/test/ExecutionEngine/MCJIT/2003-01-10-FUCOM.ll b/test/ExecutionEngine/MCJIT/2003-01-10-FUCOM.ll index f925e79..2b83bb9 100644 --- a/test/ExecutionEngine/MCJIT/2003-01-10-FUCOM.ll +++ b/test/ExecutionEngine/MCJIT/2003-01-10-FUCOM.ll @@ -1,4 +1,4 @@ -; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null +; RUN: %lli_mcjit %s > /dev/null define i32 @main() { %X = fadd double 0.000000e+00, 1.000000e+00 ; <double> [#uses=1] diff --git a/test/ExecutionEngine/MCJIT/2003-01-15-AlignmentTest.ll b/test/ExecutionEngine/MCJIT/2003-01-15-AlignmentTest.ll index 5b426f6..d1ca2be 100644 --- a/test/ExecutionEngine/MCJIT/2003-01-15-AlignmentTest.ll +++ b/test/ExecutionEngine/MCJIT/2003-01-15-AlignmentTest.ll @@ -1,4 +1,4 @@ -; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null +; RUN: %lli_mcjit %s > /dev/null define i32 @bar(i8* %X) { ; pointer should be 4 byte aligned! diff --git a/test/ExecutionEngine/MCJIT/2003-05-06-LivenessClobber.ll b/test/ExecutionEngine/MCJIT/2003-05-06-LivenessClobber.ll index c0a7393..20ef0ff 100644 --- a/test/ExecutionEngine/MCJIT/2003-05-06-LivenessClobber.ll +++ b/test/ExecutionEngine/MCJIT/2003-05-06-LivenessClobber.ll @@ -1,6 +1,6 @@ ; This testcase should return with an exit code of 1. ; -; RUN: not %lli -mtriple=%mcjit_triple -use-mcjit %s +; RUN: not %lli_mcjit %s @test = global i64 0 ; <i64*> [#uses=1] diff --git a/test/ExecutionEngine/MCJIT/2003-05-07-ArgumentTest.ll b/test/ExecutionEngine/MCJIT/2003-05-07-ArgumentTest.ll index d3e6204..c7bcc54 100644 --- a/test/ExecutionEngine/MCJIT/2003-05-07-ArgumentTest.ll +++ b/test/ExecutionEngine/MCJIT/2003-05-07-ArgumentTest.ll @@ -1,4 +1,4 @@ -; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s test +; RUN: %lli_mcjit %s test declare i32 @puts(i8*) diff --git a/test/ExecutionEngine/MCJIT/2003-05-11-PHIRegAllocBug.ll b/test/ExecutionEngine/MCJIT/2003-05-11-PHIRegAllocBug.ll index 55a1697..0512575 100644 --- a/test/ExecutionEngine/MCJIT/2003-05-11-PHIRegAllocBug.ll +++ b/test/ExecutionEngine/MCJIT/2003-05-11-PHIRegAllocBug.ll @@ -1,4 +1,4 @@ -; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null +; RUN: %lli_mcjit %s > /dev/null target datalayout = "e-p:32:32" diff --git a/test/ExecutionEngine/MCJIT/2003-06-04-bzip2-bug.ll b/test/ExecutionEngine/MCJIT/2003-06-04-bzip2-bug.ll index 79c6e7f..c292a81 100644 --- a/test/ExecutionEngine/MCJIT/2003-06-04-bzip2-bug.ll +++ b/test/ExecutionEngine/MCJIT/2003-06-04-bzip2-bug.ll @@ -1,4 +1,4 @@ -; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null +; RUN: %lli_mcjit %s > /dev/null ; Testcase distilled from 256.bzip2. diff --git a/test/ExecutionEngine/MCJIT/2003-06-05-PHIBug.ll b/test/ExecutionEngine/MCJIT/2003-06-05-PHIBug.ll index ffd6df6..c0a83f5 100644 --- a/test/ExecutionEngine/MCJIT/2003-06-05-PHIBug.ll +++ b/test/ExecutionEngine/MCJIT/2003-06-05-PHIBug.ll @@ -1,4 +1,4 @@ -; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null +; RUN: %lli_mcjit %s > /dev/null ; Testcase distilled from 256.bzip2. 
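(All of the MCJIT RUN-line changes in this patch follow one pattern: the spelled-out invocation "lli -mtriple=%mcjit_triple -use-mcjit" collapses into the single lit substitution "%lli_mcjit". The hunks show only the call sites; the substitution itself is registered in the suite's lit configuration, which is not part of this excerpt, and presumably expands to the same lli invocation with the MCJIT flags and triple. Centralizing the flags means a later change, such as dropping -use-mcjit once MCJIT becomes the default, touches one configuration line instead of every RUN line.)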
diff --git a/test/ExecutionEngine/MCJIT/2003-08-15-AllocaAssertion.ll b/test/ExecutionEngine/MCJIT/2003-08-15-AllocaAssertion.ll index 90839e9..55ce689 100644 --- a/test/ExecutionEngine/MCJIT/2003-08-15-AllocaAssertion.ll +++ b/test/ExecutionEngine/MCJIT/2003-08-15-AllocaAssertion.ll @@ -1,4 +1,4 @@ -; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null +; RUN: %lli_mcjit %s > /dev/null ; This testcase failed to work because two variable sized allocas confused the ; local register allocator. diff --git a/test/ExecutionEngine/MCJIT/2003-08-21-EnvironmentTest.ll b/test/ExecutionEngine/MCJIT/2003-08-21-EnvironmentTest.ll index 29ef2c5..2e99996 100644 --- a/test/ExecutionEngine/MCJIT/2003-08-21-EnvironmentTest.ll +++ b/test/ExecutionEngine/MCJIT/2003-08-21-EnvironmentTest.ll @@ -1,4 +1,4 @@ -; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null +; RUN: %lli_mcjit %s > /dev/null ; ; Regression Test: EnvironmentTest.ll diff --git a/test/ExecutionEngine/MCJIT/2003-08-23-RegisterAllocatePhysReg.ll b/test/ExecutionEngine/MCJIT/2003-08-23-RegisterAllocatePhysReg.ll index 2adb608..659901b 100644 --- a/test/ExecutionEngine/MCJIT/2003-08-23-RegisterAllocatePhysReg.ll +++ b/test/ExecutionEngine/MCJIT/2003-08-23-RegisterAllocatePhysReg.ll @@ -1,4 +1,4 @@ -; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null +; RUN: %lli_mcjit %s > /dev/null ; This testcase exposes a bug in the local register allocator where it runs out ; of registers (due to too many overlapping live ranges), but then attempts to diff --git a/test/ExecutionEngine/MCJIT/2003-10-18-PHINode-ConstantExpr-CondCode-Failure.ll b/test/ExecutionEngine/MCJIT/2003-10-18-PHINode-ConstantExpr-CondCode-Failure.ll index 91bde46..68e31a7 100644 --- a/test/ExecutionEngine/MCJIT/2003-10-18-PHINode-ConstantExpr-CondCode-Failure.ll +++ b/test/ExecutionEngine/MCJIT/2003-10-18-PHINode-ConstantExpr-CondCode-Failure.ll @@ -1,4 +1,4 @@ -; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null +; RUN: %lli_mcjit %s > /dev/null @A = global i32 0 ; <i32*> [#uses=1] diff --git a/test/ExecutionEngine/MCJIT/2005-12-02-TailCallBug.ll b/test/ExecutionEngine/MCJIT/2005-12-02-TailCallBug.ll index a7462d9..69f4ec8 100644 --- a/test/ExecutionEngine/MCJIT/2005-12-02-TailCallBug.ll +++ b/test/ExecutionEngine/MCJIT/2005-12-02-TailCallBug.ll @@ -1,6 +1,6 @@ ; PR672 -; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s -; XFAIL: mcjit-ia32 +; RUN: %lli_mcjit %s +; XFAIL: mcjit-ia32, armv5 define i32 @main() { %f = bitcast i32 (i32, i32*, i32)* @check_tail to i32* ; <i32*> [#uses=1] diff --git a/test/ExecutionEngine/MCJIT/2007-12-10-APIntLoadStore.ll b/test/ExecutionEngine/MCJIT/2007-12-10-APIntLoadStore.ll index 2406596..43188f2 100644 --- a/test/ExecutionEngine/MCJIT/2007-12-10-APIntLoadStore.ll +++ b/test/ExecutionEngine/MCJIT/2007-12-10-APIntLoadStore.ll @@ -1,4 +1,4 @@ -; RUN: %lli -mtriple=%mcjit_triple -use-mcjit -force-interpreter %s +; RUN: %lli_mcjit -force-interpreter %s ; PR1836 define i32 @main() { diff --git a/test/ExecutionEngine/MCJIT/2008-06-05-APInt-OverAShr.ll b/test/ExecutionEngine/MCJIT/2008-06-05-APInt-OverAShr.ll index d429d51..0912897 100644 --- a/test/ExecutionEngine/MCJIT/2008-06-05-APInt-OverAShr.ll +++ b/test/ExecutionEngine/MCJIT/2008-06-05-APInt-OverAShr.ll @@ -1,4 +1,4 @@ -; RUN: %lli -mtriple=%mcjit_triple -use-mcjit -force-interpreter=true %s | grep 1 +; RUN: %lli_mcjit -force-interpreter=true %s | grep 1 target datalayout = 
"e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32" target triple = "i686-pc-linux-gnu" diff --git a/test/ExecutionEngine/MCJIT/2010-01-15-UndefValue.ll b/test/ExecutionEngine/MCJIT/2010-01-15-UndefValue.ll index a6d18e7..7ed0e38 100644 --- a/test/ExecutionEngine/MCJIT/2010-01-15-UndefValue.ll +++ b/test/ExecutionEngine/MCJIT/2010-01-15-UndefValue.ll @@ -1,4 +1,4 @@ -; RUN: %lli -mtriple=%mcjit_triple -use-mcjit -force-interpreter=true %s > /dev/null +; RUN: %lli_mcjit -force-interpreter=true %s > /dev/null define i32 @main() { %a = add i32 0, undef diff --git a/test/ExecutionEngine/MCJIT/fpbitcast.ll b/test/ExecutionEngine/MCJIT/fpbitcast.ll index bb4957e..fb5ab6f 100644 --- a/test/ExecutionEngine/MCJIT/fpbitcast.ll +++ b/test/ExecutionEngine/MCJIT/fpbitcast.ll @@ -1,4 +1,4 @@ -; RUN: %lli -mtriple=%mcjit_triple -use-mcjit -force-interpreter=true %s | grep 40091eb8 +; RUN: %lli_mcjit -force-interpreter=true %s | grep 40091eb8 ; define i32 @test(double %x) { entry: diff --git a/test/ExecutionEngine/MCJIT/hello.ll b/test/ExecutionEngine/MCJIT/hello.ll index ceb9c12..b744707 100644 --- a/test/ExecutionEngine/MCJIT/hello.ll +++ b/test/ExecutionEngine/MCJIT/hello.ll @@ -1,4 +1,4 @@ -; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null +; RUN: %lli_mcjit %s > /dev/null @.LC0 = internal global [12 x i8] c"Hello World\00" ; <[12 x i8]*> [#uses=1] diff --git a/test/ExecutionEngine/MCJIT/hello2.ll b/test/ExecutionEngine/MCJIT/hello2.ll index 756fcad..cd033d5 100644 --- a/test/ExecutionEngine/MCJIT/hello2.ll +++ b/test/ExecutionEngine/MCJIT/hello2.ll @@ -1,4 +1,4 @@ -; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null +; RUN: %lli_mcjit %s > /dev/null @X = global i32 7 ; <i32*> [#uses=0] @msg = internal global [13 x i8] c"Hello World\0A\00" ; <[13 x i8]*> [#uses=1] diff --git a/test/ExecutionEngine/MCJIT/pr13727.ll b/test/ExecutionEngine/MCJIT/pr13727.ll index c33bf32..1c719c5 100644 --- a/test/ExecutionEngine/MCJIT/pr13727.ll +++ b/test/ExecutionEngine/MCJIT/pr13727.ll @@ -1,4 +1,4 @@ -; RUN: %lli -mtriple=%mcjit_triple -use-mcjit -O0 -disable-lazy-compilation=false %s +; RUN: %lli_mcjit -O0 -disable-lazy-compilation=false %s ; The intention of this test is to verify that symbols mapped to COMMON in ELF ; work as expected. 
diff --git a/test/ExecutionEngine/MCJIT/simplesttest.ll b/test/ExecutionEngine/MCJIT/simplesttest.ll index 02ad006..318baf4 100644 --- a/test/ExecutionEngine/MCJIT/simplesttest.ll +++ b/test/ExecutionEngine/MCJIT/simplesttest.ll @@ -1,4 +1,4 @@ -; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null +; RUN: %lli_mcjit %s > /dev/null define i32 @main() { ret i32 0 diff --git a/test/ExecutionEngine/MCJIT/simpletest-remote.ll b/test/ExecutionEngine/MCJIT/simpletest-remote.ll index 272204c..9ceaf54 100644 --- a/test/ExecutionEngine/MCJIT/simpletest-remote.ll +++ b/test/ExecutionEngine/MCJIT/simpletest-remote.ll @@ -1,4 +1,4 @@ -; RUN: %lli -mtriple=%mcjit_triple -use-mcjit -remote-mcjit %s > /dev/null +; RUN: %lli_mcjit -remote-mcjit %s > /dev/null ; XFAIL: arm, mips define i32 @bar() { diff --git a/test/ExecutionEngine/MCJIT/simpletest.ll b/test/ExecutionEngine/MCJIT/simpletest.ll index 958b783..5b0f2dd 100644 --- a/test/ExecutionEngine/MCJIT/simpletest.ll +++ b/test/ExecutionEngine/MCJIT/simpletest.ll @@ -1,4 +1,4 @@ -; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null +; RUN: %lli_mcjit %s > /dev/null define i32 @bar() { ret i32 0 diff --git a/test/ExecutionEngine/MCJIT/stubs-remote.ll b/test/ExecutionEngine/MCJIT/stubs-remote.ll index 4c7684f..15cb5d0 100644 --- a/test/ExecutionEngine/MCJIT/stubs-remote.ll +++ b/test/ExecutionEngine/MCJIT/stubs-remote.ll @@ -1,4 +1,4 @@ -; RUN: %lli -mtriple=%mcjit_triple -use-mcjit -remote-mcjit -disable-lazy-compilation=false %s +; RUN: %lli_mcjit -remote-mcjit -disable-lazy-compilation=false %s ; XFAIL: arm, mips define i32 @main() nounwind { diff --git a/test/ExecutionEngine/MCJIT/stubs.ll b/test/ExecutionEngine/MCJIT/stubs.ll index 9e5d5b2..f4aac33 100644 --- a/test/ExecutionEngine/MCJIT/stubs.ll +++ b/test/ExecutionEngine/MCJIT/stubs.ll @@ -1,4 +1,4 @@ -; RUN: %lli -mtriple=%mcjit_triple -use-mcjit -disable-lazy-compilation=false %s +; RUN: %lli_mcjit -disable-lazy-compilation=false %s define i32 @main() nounwind { entry: diff --git a/test/ExecutionEngine/MCJIT/test-arith.ll b/test/ExecutionEngine/MCJIT/test-arith.ll index b73227f..e1cc23b 100644 --- a/test/ExecutionEngine/MCJIT/test-arith.ll +++ b/test/ExecutionEngine/MCJIT/test-arith.ll @@ -1,4 +1,4 @@ -; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null +; RUN: %lli_mcjit %s > /dev/null define i32 @main() { %A = add i8 0, 12 ; <i8> [#uses=1] diff --git a/test/ExecutionEngine/MCJIT/test-branch.ll b/test/ExecutionEngine/MCJIT/test-branch.ll index 8f3c727..cdf1035 100644 --- a/test/ExecutionEngine/MCJIT/test-branch.ll +++ b/test/ExecutionEngine/MCJIT/test-branch.ll @@ -1,4 +1,4 @@ -; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null +; RUN: %lli_mcjit %s > /dev/null ; test unconditional branch define i32 @main() { diff --git a/test/ExecutionEngine/MCJIT/test-call-no-external-funcs.ll b/test/ExecutionEngine/MCJIT/test-call-no-external-funcs.ll index 20150b2..8a36cf2 100644 --- a/test/ExecutionEngine/MCJIT/test-call-no-external-funcs.ll +++ b/test/ExecutionEngine/MCJIT/test-call-no-external-funcs.ll @@ -1,4 +1,4 @@ -; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null +; RUN: %lli_mcjit %s > /dev/null define i32 @_Z14func_exit_codev() nounwind uwtable { entry: diff --git a/test/ExecutionEngine/MCJIT/test-call.ll b/test/ExecutionEngine/MCJIT/test-call.ll index 51d19fe..ed593e3 100644 --- a/test/ExecutionEngine/MCJIT/test-call.ll +++ b/test/ExecutionEngine/MCJIT/test-call.ll @@ -1,4 +1,5 @@ -; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null +; RUN: 
%lli_mcjit %s > /dev/null +; XFAIL: armv5 declare void @exit(i32) diff --git a/test/ExecutionEngine/MCJIT/test-cast.ll b/test/ExecutionEngine/MCJIT/test-cast.ll index dcc97f4..335ec50 100644 --- a/test/ExecutionEngine/MCJIT/test-cast.ll +++ b/test/ExecutionEngine/MCJIT/test-cast.ll @@ -1,4 +1,4 @@ -; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null +; RUN: %lli_mcjit %s > /dev/null define i32 @foo() { ret i32 0 diff --git a/test/ExecutionEngine/MCJIT/test-common-symbols-alignment.ll b/test/ExecutionEngine/MCJIT/test-common-symbols-alignment.ll index d666a2a..989a473 100644 --- a/test/ExecutionEngine/MCJIT/test-common-symbols-alignment.ll +++ b/test/ExecutionEngine/MCJIT/test-common-symbols-alignment.ll @@ -1,4 +1,4 @@ -; RUN: %lli -mtriple=%mcjit_triple -use-mcjit -O0 %s +; RUN: %lli_mcjit -O0 %s ; This test checks that common symbols have been allocated addresses honouring ; the alignment requirement. diff --git a/test/ExecutionEngine/MCJIT/test-common-symbols-remote.ll b/test/ExecutionEngine/MCJIT/test-common-symbols-remote.ll index 285ce5c..3b8ee9d 100644 --- a/test/ExecutionEngine/MCJIT/test-common-symbols-remote.ll +++ b/test/ExecutionEngine/MCJIT/test-common-symbols-remote.ll @@ -1,4 +1,4 @@ -; RUN: %lli -mtriple=%mcjit_triple -use-mcjit -remote-mcjit -O0 -disable-lazy-compilation=false %s +; RUN: %lli_mcjit -remote-mcjit -O0 -disable-lazy-compilation=false %s ; XFAIL: arm, mips ; The intention of this test is to verify that symbols mapped to COMMON in ELF diff --git a/test/ExecutionEngine/MCJIT/test-common-symbols.ll b/test/ExecutionEngine/MCJIT/test-common-symbols.ll index 8c81902..13ee06a 100644 --- a/test/ExecutionEngine/MCJIT/test-common-symbols.ll +++ b/test/ExecutionEngine/MCJIT/test-common-symbols.ll @@ -1,4 +1,4 @@ -; RUN: %lli -mtriple=%mcjit_triple -use-mcjit -O0 -disable-lazy-compilation=false %s +; RUN: %lli_mcjit -O0 -disable-lazy-compilation=false %s ; The intention of this test is to verify that symbols mapped to COMMON in ELF ; work as expected. diff --git a/test/ExecutionEngine/MCJIT/test-constantexpr.ll b/test/ExecutionEngine/MCJIT/test-constantexpr.ll index 56c1290..8f15cbd 100644 --- a/test/ExecutionEngine/MCJIT/test-constantexpr.ll +++ b/test/ExecutionEngine/MCJIT/test-constantexpr.ll @@ -1,4 +1,4 @@ -; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null +; RUN: %lli_mcjit %s > /dev/null ; This tests to make sure that we can evaluate weird constant expressions diff --git a/test/ExecutionEngine/MCJIT/test-data-align-remote.ll b/test/ExecutionEngine/MCJIT/test-data-align-remote.ll index a1591d0..9daf168 100644 --- a/test/ExecutionEngine/MCJIT/test-data-align-remote.ll +++ b/test/ExecutionEngine/MCJIT/test-data-align-remote.ll @@ -1,5 +1,5 @@ -; RUN: %lli -mtriple=%mcjit_triple -use-mcjit -remote-mcjit -O0 %s -; XFAIL: arm, mips +; RUN: %lli_mcjit -remote-mcjit -O0 %s +; XFAIL: armv7, mips ; Check that a variable is always aligned as specified. diff --git a/test/ExecutionEngine/MCJIT/test-data-align.ll b/test/ExecutionEngine/MCJIT/test-data-align.ll index 0493cba..2472d95 100644 --- a/test/ExecutionEngine/MCJIT/test-data-align.ll +++ b/test/ExecutionEngine/MCJIT/test-data-align.ll @@ -1,4 +1,4 @@ -; RUN: %lli -mtriple=%mcjit_triple -use-mcjit -O0 %s +; RUN: %lli_mcjit -O0 %s ; Check that a variable is always aligned as specified. 
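; "Aligned as specified" means the JIT memory manager must honor the
; IR-level align attribute rather than just the type's natural alignment.
; A minimal hypothetical example (not one of this test's actual globals):
;   @overaligned = global i32 0, align 32
; must be placed on a 32-byte boundary even though an i32 naturally needs
; only 4 bytes.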
diff --git a/test/ExecutionEngine/MCJIT/test-fp-no-external-funcs-remote.ll b/test/ExecutionEngine/MCJIT/test-fp-no-external-funcs-remote.ll index 69c73b9..847d225 100644 --- a/test/ExecutionEngine/MCJIT/test-fp-no-external-funcs-remote.ll +++ b/test/ExecutionEngine/MCJIT/test-fp-no-external-funcs-remote.ll @@ -1,4 +1,4 @@ -; RUN: %lli -mtriple=%mcjit_triple -use-mcjit -remote-mcjit %s > /dev/null +; RUN: %lli_mcjit -remote-mcjit %s > /dev/null ; XFAIL: arm, mips define double @test(double* %DP, double %Arg) { diff --git a/test/ExecutionEngine/MCJIT/test-fp-no-external-funcs.ll b/test/ExecutionEngine/MCJIT/test-fp-no-external-funcs.ll index 7af1d8b..f094f3d 100644 --- a/test/ExecutionEngine/MCJIT/test-fp-no-external-funcs.ll +++ b/test/ExecutionEngine/MCJIT/test-fp-no-external-funcs.ll @@ -1,4 +1,4 @@ -; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null +; RUN: %lli_mcjit %s > /dev/null define double @test(double* %DP, double %Arg) { %D = load double* %DP ; <double> [#uses=1] diff --git a/test/ExecutionEngine/MCJIT/test-fp.ll b/test/ExecutionEngine/MCJIT/test-fp.ll index f7e6fb9..b10e9d6 100644 --- a/test/ExecutionEngine/MCJIT/test-fp.ll +++ b/test/ExecutionEngine/MCJIT/test-fp.ll @@ -1,4 +1,4 @@ -; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null +; RUN: %lli_mcjit %s > /dev/null define double @test(double* %DP, double %Arg) { %D = load double* %DP ; <double> [#uses=1] diff --git a/test/ExecutionEngine/MCJIT/test-global-ctors.ll b/test/ExecutionEngine/MCJIT/test-global-ctors.ll index fbe9118..4510d9b 100644 --- a/test/ExecutionEngine/MCJIT/test-global-ctors.ll +++ b/test/ExecutionEngine/MCJIT/test-global-ctors.ll @@ -1,4 +1,4 @@ -; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null +; RUN: %lli_mcjit %s > /dev/null @var = global i32 1, align 4 @llvm.global_ctors = appending global [1 x { i32, void ()* }] [{ i32, void ()* } { i32 65535, void ()* @ctor_func }] @llvm.global_dtors = appending global [1 x { i32, void ()* }] [{ i32, void ()* } { i32 65535, void ()* @dtor_func }] diff --git a/test/ExecutionEngine/MCJIT/test-global-init-nonzero-remote.ll b/test/ExecutionEngine/MCJIT/test-global-init-nonzero-remote.ll index 8b7c83e..b8d94b5 100644 --- a/test/ExecutionEngine/MCJIT/test-global-init-nonzero-remote.ll +++ b/test/ExecutionEngine/MCJIT/test-global-init-nonzero-remote.ll @@ -1,4 +1,4 @@ -; RUN: %lli -mtriple=%mcjit_triple -use-mcjit -remote-mcjit %s > /dev/null +; RUN: %lli_mcjit -remote-mcjit %s > /dev/null ; XFAIL: arm, mips @count = global i32 1, align 4 diff --git a/test/ExecutionEngine/MCJIT/test-global-init-nonzero.ll b/test/ExecutionEngine/MCJIT/test-global-init-nonzero.ll index ec6cbad..b9f74b8 100644 --- a/test/ExecutionEngine/MCJIT/test-global-init-nonzero.ll +++ b/test/ExecutionEngine/MCJIT/test-global-init-nonzero.ll @@ -1,4 +1,4 @@ -; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null +; RUN: %lli_mcjit %s > /dev/null @count = global i32 1, align 4 diff --git a/test/ExecutionEngine/MCJIT/test-global.ll b/test/ExecutionEngine/MCJIT/test-global.ll index e7972f9..6a8c042 100644 --- a/test/ExecutionEngine/MCJIT/test-global.ll +++ b/test/ExecutionEngine/MCJIT/test-global.ll @@ -1,4 +1,4 @@ -; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null +; RUN: %lli_mcjit %s > /dev/null @count = global i32 0, align 4 diff --git a/test/ExecutionEngine/MCJIT/test-loadstore.ll b/test/ExecutionEngine/MCJIT/test-loadstore.ll index f450d0a..9038194 100644 --- a/test/ExecutionEngine/MCJIT/test-loadstore.ll +++ 
b/test/ExecutionEngine/MCJIT/test-loadstore.ll @@ -1,4 +1,4 @@ -; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null +; RUN: %lli_mcjit %s > /dev/null define void @test(i8* %P, i16* %P.upgrd.1, i32* %P.upgrd.2, i64* %P.upgrd.3) { %V = load i8* %P ; <i8> [#uses=1] diff --git a/test/ExecutionEngine/MCJIT/test-local.ll b/test/ExecutionEngine/MCJIT/test-local.ll index d4e9f44..d7c1734 100644 --- a/test/ExecutionEngine/MCJIT/test-local.ll +++ b/test/ExecutionEngine/MCJIT/test-local.ll @@ -1,4 +1,4 @@ -; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null +; RUN: %lli_mcjit %s > /dev/null define i32 @main() nounwind uwtable { entry: diff --git a/test/ExecutionEngine/MCJIT/test-logical.ll b/test/ExecutionEngine/MCJIT/test-logical.ll index 32f45ef..a03833e 100644 --- a/test/ExecutionEngine/MCJIT/test-logical.ll +++ b/test/ExecutionEngine/MCJIT/test-logical.ll @@ -1,4 +1,4 @@ -; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null +; RUN: %lli_mcjit %s > /dev/null define i32 @main() { %A = and i8 4, 8 ; <i8> [#uses=2] diff --git a/test/ExecutionEngine/MCJIT/test-loop.ll b/test/ExecutionEngine/MCJIT/test-loop.ll index ebc6896..5ed8c40 100644 --- a/test/ExecutionEngine/MCJIT/test-loop.ll +++ b/test/ExecutionEngine/MCJIT/test-loop.ll @@ -1,4 +1,4 @@ -; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null +; RUN: %lli_mcjit %s > /dev/null define i32 @main() { ; <label>:0 diff --git a/test/ExecutionEngine/MCJIT/test-phi.ll b/test/ExecutionEngine/MCJIT/test-phi.ll index 1408533..4245cca 100644 --- a/test/ExecutionEngine/MCJIT/test-phi.ll +++ b/test/ExecutionEngine/MCJIT/test-phi.ll @@ -1,4 +1,4 @@ -; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null +; RUN: %lli_mcjit %s > /dev/null ; test phi node @Y = global i32 6 ; <i32*> [#uses=1] diff --git a/test/ExecutionEngine/MCJIT/test-ptr-reloc-remote.ll b/test/ExecutionEngine/MCJIT/test-ptr-reloc-remote.ll index 773e4a1..f2c2cd6 100644 --- a/test/ExecutionEngine/MCJIT/test-ptr-reloc-remote.ll +++ b/test/ExecutionEngine/MCJIT/test-ptr-reloc-remote.ll @@ -1,4 +1,4 @@ -; RUN: %lli -mtriple=%mcjit_triple -use-mcjit -remote-mcjit -O0 %s +; RUN: %lli_mcjit -remote-mcjit -O0 %s ; XFAIL: arm, mips @.str = private unnamed_addr constant [6 x i8] c"data1\00", align 1 diff --git a/test/ExecutionEngine/MCJIT/test-ptr-reloc.ll b/test/ExecutionEngine/MCJIT/test-ptr-reloc.ll index 93b6a6d..871d8bf 100644 --- a/test/ExecutionEngine/MCJIT/test-ptr-reloc.ll +++ b/test/ExecutionEngine/MCJIT/test-ptr-reloc.ll @@ -1,4 +1,4 @@ -; RUN: %lli -mtriple=%mcjit_triple -use-mcjit -O0 %s +; RUN: %lli_mcjit -O0 %s @.str = private unnamed_addr constant [6 x i8] c"data1\00", align 1 @ptr = global i8* getelementptr inbounds ([6 x i8]* @.str, i32 0, i32 0), align 4 diff --git a/test/ExecutionEngine/MCJIT/test-ret.ll b/test/ExecutionEngine/MCJIT/test-ret.ll index af28292..6bfc480 100644 --- a/test/ExecutionEngine/MCJIT/test-ret.ll +++ b/test/ExecutionEngine/MCJIT/test-ret.ll @@ -1,4 +1,4 @@ -; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null +; RUN: %lli_mcjit %s > /dev/null ; test return instructions define void @test1() { diff --git a/test/ExecutionEngine/MCJIT/test-return.ll b/test/ExecutionEngine/MCJIT/test-return.ll index 67f7107..4db1c3f 100644 --- a/test/ExecutionEngine/MCJIT/test-return.ll +++ b/test/ExecutionEngine/MCJIT/test-return.ll @@ -1,4 +1,4 @@ -; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null +; RUN: %lli_mcjit %s > /dev/null define i32 @main() nounwind uwtable { entry: diff --git 
a/test/ExecutionEngine/MCJIT/test-setcond-fp.ll b/test/ExecutionEngine/MCJIT/test-setcond-fp.ll index a8f4bd8..b4367d0 100644 --- a/test/ExecutionEngine/MCJIT/test-setcond-fp.ll +++ b/test/ExecutionEngine/MCJIT/test-setcond-fp.ll @@ -1,4 +1,4 @@ -; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null +; RUN: %lli_mcjit %s > /dev/null define i32 @main() { diff --git a/test/ExecutionEngine/MCJIT/test-setcond-int.ll b/test/ExecutionEngine/MCJIT/test-setcond-int.ll index ed52b50..8c7d815 100644 --- a/test/ExecutionEngine/MCJIT/test-setcond-int.ll +++ b/test/ExecutionEngine/MCJIT/test-setcond-int.ll @@ -1,4 +1,4 @@ -; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null +; RUN: %lli_mcjit %s > /dev/null define i32 @main() { %int1 = add i32 0, 0 ; <i32> [#uses=6] diff --git a/test/ExecutionEngine/MCJIT/test-shift.ll b/test/ExecutionEngine/MCJIT/test-shift.ll index 5a5c10d..8d9a94e 100644 --- a/test/ExecutionEngine/MCJIT/test-shift.ll +++ b/test/ExecutionEngine/MCJIT/test-shift.ll @@ -1,4 +1,4 @@ -; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null +; RUN: %lli_mcjit %s > /dev/null define i32 @main() { %shamt = add i8 0, 1 ; <i8> [#uses=8] diff --git a/test/ExecutionEngine/lit.local.cfg b/test/ExecutionEngine/lit.local.cfg index f034326..dd6a5bb 100644 --- a/test/ExecutionEngine/lit.local.cfg +++ b/test/ExecutionEngine/lit.local.cfg @@ -7,6 +7,6 @@ def getRoot(config): root = getRoot(config) -if root.host_arch in ['PowerPC']: +if root.host_arch in ['PowerPC', 'AArch64']: config.unsupported = True diff --git a/test/ExecutionEngine/test-call.ll b/test/ExecutionEngine/test-call.ll index 3fd39fe..563d486 100644 --- a/test/ExecutionEngine/test-call.ll +++ b/test/ExecutionEngine/test-call.ll @@ -1,5 +1,5 @@ ; RUN: %lli %s > /dev/null -; XFAIL: arm +; XFAIL: armv7 declare void @exit(i32) diff --git a/test/ExecutionEngine/test-fp-no-external-funcs.ll b/test/ExecutionEngine/test-fp-no-external-funcs.ll index 139b2ef..92cc0d6 100644 --- a/test/ExecutionEngine/test-fp-no-external-funcs.ll +++ b/test/ExecutionEngine/test-fp-no-external-funcs.ll @@ -1,5 +1,5 @@ ; RUN: %lli %s > /dev/null -; XFAIL: arm +; XFAIL: armv7 define double @test(double* %DP, double %Arg) { %D = load double* %DP ; <double> [#uses=1] diff --git a/test/ExecutionEngine/test-fp.ll b/test/ExecutionEngine/test-fp.ll index c906450..68a8182 100644 --- a/test/ExecutionEngine/test-fp.ll +++ b/test/ExecutionEngine/test-fp.ll @@ -1,5 +1,5 @@ ; RUN: %lli %s > /dev/null -; XFAIL: arm +; XFAIL: armv7 define double @test(double* %DP, double %Arg) { %D = load double* %DP ; <double> [#uses=1] diff --git a/test/Feature/attributes.ll b/test/Feature/attributes.ll new file mode 100644 index 0000000..7707d82 --- /dev/null +++ b/test/Feature/attributes.ll @@ -0,0 +1,15 @@ +; RUN: llvm-as < %s | llvm-dis > %t1.ll +; RUN: llvm-as %t1.ll -o - | llvm-dis > %t2.ll +; RUN: diff %t1.ll %t2.ll + +@.str = private unnamed_addr constant [14 x i8] c"hello world!\0A\00", align 1 + +define void @foo() #0 { +entry: + %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([14 x i8]* @.str, i32 0, i32 0)) + ret void +} + +declare i32 @printf(i8*, ...) + +attributes #0 = { nounwind ssp uwtable } diff --git a/test/Feature/intrinsics.ll b/test/Feature/intrinsics.ll index 9e7dc6d..28be053 100644 --- a/test/Feature/intrinsics.ll +++ b/test/Feature/intrinsics.ll @@ -61,10 +61,14 @@ define void @libm() { ; FIXME: test ALL the intrinsics in this file. 
; rdar://11542750 -; CHECK: declare void @llvm.trap() noreturn nounwind +; CHECK: declare void @llvm.trap() #2 declare void @llvm.trap() define void @trap() { call void @llvm.trap() ret void } + +; CHECK: attributes #0 = { nounwind readnone } +; CHECK: attributes #1 = { nounwind readonly } +; CHECK: attributes #2 = { noreturn nounwind } diff --git a/test/Feature/minsize_attr.ll b/test/Feature/minsize_attr.ll index 51b133c..1f915b3 100644 --- a/test/Feature/minsize_attr.ll +++ b/test/Feature/minsize_attr.ll @@ -1,7 +1,8 @@ ; RUN: llvm-as < %s | llvm-dis | FileCheck %s define void @test1() minsize { -; CHECK: define void @test1() minsize +; CHECK: define void @test1() #0 ret void } +; CHECK: attributes #0 = { minsize } diff --git a/test/FileCheck/dos-style-eol.txt b/test/FileCheck/dos-style-eol.txt new file mode 100644 index 0000000..4252aad --- /dev/null +++ b/test/FileCheck/dos-style-eol.txt @@ -0,0 +1,11 @@ +// Test for using FileCheck on DOS style end-of-line
+// This test was deliberately committed with DOS style end of line.
+// Don't change line endings!
+// RUN: FileCheck -input-file %s %s
+// RUN: FileCheck --strict-whitespace -input-file %s %s
+
+LINE 1
+; CHECK: {{^}}LINE 1{{$}}
+
+LINE 2
+; CHECK: {{^}}LINE 2{{$}}
\ No newline at end of file diff --git a/test/Instrumentation/AddressSanitizer/X86/bug_11395.ll b/test/Instrumentation/AddressSanitizer/X86/bug_11395.ll index 35c5c4a..38168fc 100644 --- a/test/Instrumentation/AddressSanitizer/X86/bug_11395.ll +++ b/test/Instrumentation/AddressSanitizer/X86/bug_11395.ll @@ -36,14 +36,14 @@ target triple = "i386-unknown-linux-gnu" @ff_mlp_firorder_7 = external global i8 @ff_mlp_firorder_8 = external global i8 -define void @ff_mlp_init_x86(%struct.DSPContext* nocapture %c, %struct.AVCodecContext* nocapture %avctx) nounwind address_safety { +define void @ff_mlp_init_x86(%struct.DSPContext* nocapture %c, %struct.AVCodecContext* nocapture %avctx) nounwind sanitize_address { entry: %mlp_filter_channel = getelementptr inbounds %struct.DSPContext* %c, i32 0, i32 131 store void (i32*, i32*, i32, i32, i32, i32, i32, i32*)* @mlp_filter_channel_x86, void (i32*, i32*, i32, i32, i32, i32, i32, i32*)** %mlp_filter_channel, align 4, !tbaa !0 ret void } -define internal void @mlp_filter_channel_x86(i32* %state, i32* %coeff, i32 %firorder, i32 %iirorder, i32 %filter_shift, i32 %mask, i32 %blocksize, i32* %sample_buffer) nounwind address_safety { +define internal void @mlp_filter_channel_x86(i32* %state, i32* %coeff, i32 %firorder, i32 %iirorder, i32 %filter_shift, i32 %mask, i32 %blocksize, i32* %sample_buffer) nounwind sanitize_address { entry: %filter_shift.addr = alloca i32, align 4 %mask.addr = alloca i32, align 4 diff --git a/test/Instrumentation/AddressSanitizer/adaptive_global_redzones.ll b/test/Instrumentation/AddressSanitizer/adaptive_global_redzones.ll new file mode 100644 index 0000000..6a60d1c --- /dev/null +++ b/test/Instrumentation/AddressSanitizer/adaptive_global_redzones.ll @@ -0,0 +1,57 @@ +; RUN: opt < %s -asan -asan-module -S | FileCheck %s +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" +target triple = "x86_64-unknown-linux-gnu" + +; Here we check that the global redzone sizes grow with the object size. 
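+; The expected sizes below are consistent with the following rule, inferred
+; here from the CHECK values rather than quoted from the pass source: the
+; right redzone is roughly size/4, rounded down to a multiple of the
+; 32-byte minimum redzone, clamped to [32, 2^18], and the object+redzone
+; total is then padded up to the next multiple of 32.
+; Worked example: for G33, 33/4 falls below the 32-byte minimum, so the
+; base redzone is 32; 33+32 = 65 pads up to 96, which yields the 63-byte
+; redzone checked below.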
+ +@G10 = global [10 x i8] zeroinitializer, align 1 +; CHECK: @G10 = global { [10 x i8], [54 x i8] } + +@G31 = global [31 x i8] zeroinitializer, align 1 +@G32 = global [32 x i8] zeroinitializer, align 1 +@G33 = global [33 x i8] zeroinitializer, align 1 +; CHECK: @G31 = global { [31 x i8], [33 x i8] } +; CHECK: @G32 = global { [32 x i8], [32 x i8] } +; CHECK: @G33 = global { [33 x i8], [63 x i8] } + +@G63 = global [63 x i8] zeroinitializer, align 1 +@G64 = global [64 x i8] zeroinitializer, align 1 +@G65 = global [65 x i8] zeroinitializer, align 1 +; CHECK: @G63 = global { [63 x i8], [33 x i8] } +; CHECK: @G64 = global { [64 x i8], [32 x i8] } +; CHECK: @G65 = global { [65 x i8], [63 x i8] } + +@G127 = global [127 x i8] zeroinitializer, align 1 +@G128 = global [128 x i8] zeroinitializer, align 1 +@G129 = global [129 x i8] zeroinitializer, align 1 +; CHECK: @G127 = global { [127 x i8], [33 x i8] } +; CHECK: @G128 = global { [128 x i8], [32 x i8] } +; CHECK: @G129 = global { [129 x i8], [63 x i8] } + +@G255 = global [255 x i8] zeroinitializer, align 1 +@G256 = global [256 x i8] zeroinitializer, align 1 +@G257 = global [257 x i8] zeroinitializer, align 1 +; CHECK: @G255 = global { [255 x i8], [33 x i8] } +; CHECK: @G256 = global { [256 x i8], [64 x i8] } +; CHECK: @G257 = global { [257 x i8], [95 x i8] } + +@G511 = global [511 x i8] zeroinitializer, align 1 +@G512 = global [512 x i8] zeroinitializer, align 1 +@G513 = global [513 x i8] zeroinitializer, align 1 +; CHECK: @G511 = global { [511 x i8], [97 x i8] } +; CHECK: @G512 = global { [512 x i8], [128 x i8] } +; CHECK: @G513 = global { [513 x i8], [159 x i8] } + +@G1023 = global [1023 x i8] zeroinitializer, align 1 +@G1024 = global [1024 x i8] zeroinitializer, align 1 +@G1025 = global [1025 x i8] zeroinitializer, align 1 +; CHECK: @G1023 = global { [1023 x i8], [225 x i8] } +; CHECK: @G1024 = global { [1024 x i8], [256 x i8] } +; CHECK: @G1025 = global { [1025 x i8], [287 x i8] } + +@G1000000 = global [1000000 x i8] zeroinitializer, align 1 +@G10000000 = global [10000000 x i8] zeroinitializer, align 1 +@G100000000 = global [100000000 x i8] zeroinitializer, align 1 +; CHECK: @G1000000 = global { [1000000 x i8], [249984 x i8] } +; CHECK: @G10000000 = global { [10000000 x i8], [262144 x i8] } +; CHECK: @G100000000 = global { [100000000 x i8], [262144 x i8] } diff --git a/test/Instrumentation/AddressSanitizer/asan-vs-gvn.ll b/test/Instrumentation/AddressSanitizer/asan-vs-gvn.ll index c0fe15e..da8f541 100644 --- a/test/Instrumentation/AddressSanitizer/asan-vs-gvn.ll +++ b/test/Instrumentation/AddressSanitizer/asan-vs-gvn.ll @@ -11,9 +11,9 @@ target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f3 @f = global %struct_of_7_bytes_4_aligned zeroinitializer, align 4 -; Accessing bytes 4 and 6, not ok to widen to i32 if address_safety is set. +; Accessing bytes 4 and 6, not ok to widen to i32 if sanitize_address is set. -define i32 @test_widening_bad(i8* %P) nounwind ssp noredzone address_safety { +define i32 @test_widening_bad(i8* %P) nounwind ssp noredzone sanitize_address { entry: %tmp = load i8* getelementptr inbounds (%struct_of_7_bytes_4_aligned* @f, i64 0, i32 1), align 4 %conv = zext i8 %tmp to i32 @@ -36,7 +36,7 @@ define void @end_test_widening_bad() { ;; Accessing bytes 4 and 5. Ok to widen to i16. 
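; Why widening matters under ASan: %struct_of_7_bytes_4_aligned occupies
; bytes 0-6, so widening the loads of bytes 4 and 6 into a single i32 load
; of bytes 4-7 would also read byte 7, one past the object and inside the
; poisoned redzone, and the shadow check would flag it. Widening bytes 4
; and 5 to an i16 stays entirely in bounds, so it remains legal.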
-define i32 @test_widening_ok(i8* %P) nounwind ssp noredzone address_safety { +define i32 @test_widening_ok(i8* %P) nounwind ssp noredzone sanitize_address { entry: %tmp = load i8* getelementptr inbounds (%struct_of_7_bytes_4_aligned* @f, i64 0, i32 1), align 4 %conv = zext i8 %tmp to i32 diff --git a/test/Instrumentation/AddressSanitizer/basic.ll b/test/Instrumentation/AddressSanitizer/basic.ll index 655f69c..c477b19 100644 --- a/test/Instrumentation/AddressSanitizer/basic.ll +++ b/test/Instrumentation/AddressSanitizer/basic.ll @@ -5,12 +5,12 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" target triple = "x86_64-unknown-linux-gnu" -define i32 @test_load(i32* %a) address_safety { +define i32 @test_load(i32* %a) sanitize_address { ; CHECK: @test_load ; CHECK-NOT: load ; CHECK: %[[LOAD_ADDR:[^ ]*]] = ptrtoint i32* %a to i64 ; CHECK: lshr i64 %[[LOAD_ADDR]], 3 -; CHECK: or i64 +; CHECK: {{or|add}} ; CHECK: %[[LOAD_SHADOW_PTR:[^ ]*]] = inttoptr ; CHECK: %[[LOAD_SHADOW:[^ ]*]] = load i8* %[[LOAD_SHADOW_PTR]] ; CHECK: icmp ne i8 @@ -38,12 +38,12 @@ entry: ret i32 %tmp1 } -define void @test_store(i32* %a) address_safety { +define void @test_store(i32* %a) sanitize_address { ; CHECK: @test_store ; CHECK-NOT: store ; CHECK: %[[STORE_ADDR:[^ ]*]] = ptrtoint i32* %a to i64 ; CHECK: lshr i64 %[[STORE_ADDR]], 3 -; CHECK: or i64 +; CHECK: {{or|add}} ; CHECK: %[[STORE_SHADOW_PTR:[^ ]*]] = inttoptr ; CHECK: %[[STORE_SHADOW:[^ ]*]] = load i8* %[[STORE_SHADOW_PTR]] ; CHECK: icmp ne i8 @@ -73,7 +73,7 @@ entry: ; Check that asan leaves just one alloca. declare void @alloca_test_use([10 x i8]*) -define void @alloca_test() address_safety { +define void @alloca_test() sanitize_address { entry: %x = alloca [10 x i8], align 1 %y = alloca [10 x i8], align 1 @@ -89,3 +89,42 @@ entry: ; CHECK-NOT: = alloca ; CHECK: ret void +define void @LongDoubleTest(x86_fp80* nocapture %a) nounwind uwtable sanitize_address { +entry: + store x86_fp80 0xK3FFF8000000000000000, x86_fp80* %a, align 16 + ret void +} + +; CHECK: LongDoubleTest +; CHECK: __asan_report_store_n +; CHECK: __asan_report_store_n +; CHECK: ret void + + +define void @i40test(i40* %a, i40* %b) nounwind uwtable sanitize_address { + entry: + %t = load i40* %a + store i40 %t, i40* %b, align 8 + ret void +} + +; CHECK: i40test +; CHECK: __asan_report_load_n{{.*}}, i64 5) +; CHECK: __asan_report_load_n{{.*}}, i64 5) +; CHECK: __asan_report_store_n{{.*}}, i64 5) +; CHECK: __asan_report_store_n{{.*}}, i64 5) +; CHECK: ret void + +define void @i80test(i80* %a, i80* %b) nounwind uwtable sanitize_address { + entry: + %t = load i80* %a + store i80 %t, i80* %b, align 8 + ret void +} + +; CHECK: i80test +; CHECK: __asan_report_load_n{{.*}}, i64 10) +; CHECK: __asan_report_load_n{{.*}}, i64 10) +; CHECK: __asan_report_store_n{{.*}}, i64 10) +; CHECK: __asan_report_store_n{{.*}}, i64 10) +; CHECK: ret void diff --git a/test/Instrumentation/AddressSanitizer/debug_info.ll b/test/Instrumentation/AddressSanitizer/debug_info.ll index f686ac1..7822fd0 100644 --- a/test/Instrumentation/AddressSanitizer/debug_info.ll +++ b/test/Instrumentation/AddressSanitizer/debug_info.ll @@ -6,7 +6,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" -define i32 @_Z3zzzi(i32 %p) nounwind uwtable address_safety { +define 
i32 @_Z3zzzi(i32 %p) nounwind uwtable sanitize_address { entry: %p.addr = alloca i32, align 4 %r = alloca i32, align 4 diff --git a/test/Instrumentation/AddressSanitizer/different_scale_and_offset.ll b/test/Instrumentation/AddressSanitizer/different_scale_and_offset.ll new file mode 100644 index 0000000..b037176 --- /dev/null +++ b/test/Instrumentation/AddressSanitizer/different_scale_and_offset.ll @@ -0,0 +1,41 @@ +; Test non-default shadow mapping scale and offset. +; +; RUN: opt < %s -asan -asan-mapping-scale=2 -asan-mapping-offset-log=0 -S | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" +target triple = "x86_64-unknown-linux-gnu" + +; Test that ASan reports the scale and offset to the runtime. +; CHECK: @__asan_mapping_offset = linkonce_odr constant i64 0 +; CHECK: @__asan_mapping_scale = linkonce_odr constant i64 2 + +define i32 @test_load(i32* %a) sanitize_address { +; CHECK: @test_load +; CHECK-NOT: load +; CHECK: %[[LOAD_ADDR:[^ ]*]] = ptrtoint i32* %a to i64 +; CHECK: lshr i64 %[[LOAD_ADDR]], 2 + +; No or/add of the offset is needed when the offset is zero. +; CHECK-NOT: or i64 + +; CHECK: %[[LOAD_SHADOW_PTR:[^ ]*]] = inttoptr +; CHECK: %[[LOAD_SHADOW:[^ ]*]] = load i8* %[[LOAD_SHADOW_PTR]] +; CHECK: icmp ne i8 +; CHECK: br i1 %{{.*}}, label %{{.*}}, label %{{.*}} + +; No slow path is needed for an i32 access when the mapping scale is 2. +; CHECK-NOT: and i64 %[[LOAD_ADDR]] +; +; The crash block reports the error. +; CHECK: call void @__asan_report_load4(i64 %[[LOAD_ADDR]]) +; CHECK: unreachable +; +; The actual load. +; CHECK: %tmp1 = load i32* %a +; CHECK: ret i32 %tmp1 + +entry: + %tmp1 = load i32* %a + ret i32 %tmp1 +} + diff --git a/test/Instrumentation/AddressSanitizer/do-not-instrument-internal-globals.ll b/test/Instrumentation/AddressSanitizer/do-not-instrument-internal-globals.ll index 28d4ac0..2efd6b1 100644 --- a/test/Instrumentation/AddressSanitizer/do-not-instrument-internal-globals.ll +++ b/test/Instrumentation/AddressSanitizer/do-not-instrument-internal-globals.ll @@ -5,7 +5,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" -define void @_Z3barv() uwtable address_safety { +define void @_Z3barv() uwtable sanitize_address { entry: %a = alloca i32, align 4 call void @_Z3fooPi(i32* %a) diff --git a/test/Instrumentation/AddressSanitizer/instrument-no-return.ll b/test/Instrumentation/AddressSanitizer/instrument-no-return.ll index e8f62b5..2d835a3 100644 --- a/test/Instrumentation/AddressSanitizer/instrument-no-return.ll +++ b/test/Instrumentation/AddressSanitizer/instrument-no-return.ll @@ -1,13 +1,13 @@ ; RUN: opt < %s -asan -S | FileCheck %s ; AddressSanitizer must insert __asan_handle_no_return -; before every noreturn call. +; before every noreturn call or invoke. target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" target triple = "x86_64-unknown-linux-gnu" declare void @MyNoReturnFunc(i32) noreturn -define i32 @Call1(i8* nocapture %arg) uwtable address_safety { +define i32 @Call1(i8* nocapture %arg) uwtable sanitize_address { entry: call void @MyNoReturnFunc(i32 1) noreturn ; The call insn has noreturn attr.
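(An aside on the two mapping tests above, before @Call1's CHECK lines resume.) The instrumented IR computes a shadow address from a scale and an offset; a minimal sketch of that arithmetic, assuming the or/add forms the tests accept are interchangeable for the offsets involved:

    #include <cstdint>

    // Shadow address shape matched by the CHECK patterns: shift right by the
    // mapping scale, then fold in the mapping offset. `or` and `add` agree
    // whenever (Addr >> Scale) and Offset share no set bits, which is the
    // assumption these offsets rely on.
    uint64_t shadowAddr(uint64_t Addr, unsigned Scale, uint64_t Offset) {
      return (Addr >> Scale) + Offset;
    }

    // basic.ll and test64.ll use the defaults (Scale = 3, Offset = 2^44 or
    // 0x7fff8000 depending on the layout); with -asan-mapping-offset-log=0
    // the offset is zero and the or/add disappears entirely.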
; CHECK: @Call1 @@ -17,7 +17,7 @@ entry: unreachable } -define i32 @Call2(i8* nocapture %arg) uwtable address_safety { +define i32 @Call2(i8* nocapture %arg) uwtable sanitize_address { entry: call void @MyNoReturnFunc(i32 1) ; No noreturn attribute on the call. ; CHECK: @Call2 @@ -26,3 +26,24 @@ entry: ; CHECK-NEXT: unreachable unreachable } + +declare i32 @__gxx_personality_v0(...) + +define i64 @Invoke1(i8** %esc) nounwind uwtable ssp sanitize_address { +entry: + invoke void @MyNoReturnFunc(i32 1) + to label %invoke.cont unwind label %lpad + +invoke.cont: + ret i64 0 + +lpad: + %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) + filter [0 x i8*] zeroinitializer + ret i64 1 +} +; CHECK: @Invoke1 +; CHECK: call void @__asan_handle_no_return +; CHECK-NEXT: invoke void @MyNoReturnFunc +; CHECK: ret i64 0 +; CHECK: ret i64 1 diff --git a/test/Instrumentation/AddressSanitizer/instrument_initializer_metadata.ll b/test/Instrumentation/AddressSanitizer/instrument_initializer_metadata.ll index 042c06b..584db37 100644 --- a/test/Instrumentation/AddressSanitizer/instrument_initializer_metadata.ll +++ b/test/Instrumentation/AddressSanitizer/instrument_initializer_metadata.ll @@ -23,7 +23,7 @@ entry: ret void } -define internal void @_GLOBAL__I_a() address_safety section ".text.startup" { +define internal void @_GLOBAL__I_a() sanitize_address section ".text.startup" { entry: call void @__cxx_global_var_init() ret void @@ -40,7 +40,7 @@ entry: ; CHECK: ret ; Check that xxx is instrumented. -define void @touch_xxx() address_safety { +define void @touch_xxx() sanitize_address { store i32 0, i32 *@xxx, align 4 ret void ; CHECK: define void @touch_xxx @@ -49,7 +49,7 @@ define void @touch_xxx() address_safety { } ; Check that XXX is instrumented. -define void @touch_XXX() address_safety { +define void @touch_XXX() sanitize_address { store i32 0, i32 *@XXX, align 4 ret void ; CHECK: define void @touch_XXX @@ -59,7 +59,7 @@ define void @touch_XXX() address_safety { ; Check that yyy is NOT instrumented (as it does not have a dynamic initializer). -define void @touch_yyy() address_safety { +define void @touch_yyy() sanitize_address { store i32 0, i32 *@yyy, align 4 ret void ; CHECK: define void @touch_yyy @@ -68,7 +68,7 @@ define void @touch_yyy() address_safety { } ; Check that YYY is NOT instrumented (as it does not have a dynamic initializer).
-define void @touch_YYY() address_safety { +define void @touch_YYY() sanitize_address { store i32 0, i32 *@YYY, align 4 ret void ; CHECK: define void @touch_YYY diff --git a/test/Instrumentation/AddressSanitizer/instrument_load_then_store.ll b/test/Instrumentation/AddressSanitizer/instrument_load_then_store.ll index 633bf9a..23cf6d2 100644 --- a/test/Instrumentation/AddressSanitizer/instrument_load_then_store.ll +++ b/test/Instrumentation/AddressSanitizer/instrument_load_then_store.ll @@ -4,7 +4,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" target triple = "x86_64-unknown-linux-gnu" -define void @IncrementMe(i32* %a) address_safety { +define void @IncrementMe(i32* %a) sanitize_address { entry: %tmp1 = load i32* %a, align 4 %tmp2 = add i32 %tmp1, 1 diff --git a/test/Instrumentation/AddressSanitizer/lifetime.ll b/test/Instrumentation/AddressSanitizer/lifetime.ll index 982ad08..3348728 100644 --- a/test/Instrumentation/AddressSanitizer/lifetime.ll +++ b/test/Instrumentation/AddressSanitizer/lifetime.ll @@ -7,7 +7,7 @@ target triple = "x86_64-unknown-linux-gnu" declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind -define void @lifetime_no_size() address_safety { +define void @lifetime_no_size() sanitize_address { entry: %i = alloca i32, align 4 %i.ptr = bitcast i32* %i to i8* @@ -23,7 +23,7 @@ entry: } ; Generic case of lifetime analysis. -define void @lifetime() address_safety { +define void @lifetime() sanitize_address { ; CHECK: @lifetime ; Regular variable lifetime intrinsics. @@ -61,7 +61,7 @@ define void @lifetime() address_safety { } ; Check that arguments of lifetime may come from phi nodes. 
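(A note on the lifetime tests above, before the phi-args case that follows.) llvm.lifetime.start and llvm.lifetime.end bracket the live range of a stack object, which gives ASan precise points at which to poison and unpoison its shadow. A hedged C++ analogue of the scoping that typically produces such markers (frontend behavior varies; this is exposition, not the pass's logic):

    int sumFirstByte() {
      int total = 0;
      { // llvm.lifetime.start(64, buf) at scope entry
        char buf[64] = {1};
        total += buf[0];
      } // llvm.lifetime.end(64, buf): the slot can be re-poisoned here, so a
        // stale pointer into buf would be reported as a use after scope
      return total;
    }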
-define void @phi_args(i1 %x) address_safety { +define void @phi_args(i1 %x) sanitize_address { ; CHECK: @phi_args entry: diff --git a/test/Instrumentation/AddressSanitizer/test64.ll b/test/Instrumentation/AddressSanitizer/test64.ll index d544d77..6aa5c28 100644 --- a/test/Instrumentation/AddressSanitizer/test64.ll +++ b/test/Instrumentation/AddressSanitizer/test64.ll @@ -1,7 +1,7 @@ ; RUN: opt < %s -asan -S | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" target triple = "x86_64-unknown-linux-gnu" -define i32 @read_4_bytes(i32* %a) address_safety { +define i32 @read_4_bytes(i32* %a) sanitize_address { entry: %tmp1 = load i32* %a, align 4 ret i32 %tmp1 @@ -9,11 +9,11 @@ entry: ; CHECK: @read_4_bytes ; CHECK-NOT: ret ; CHECK: lshr {{.*}} 3 -; Check for ASAN's Offset for 64-bit (2^44) -; CHECK-NEXT: 17592186044416 +; Check for ASAN's Offset for 64-bit (2^44 or 0x7fff8000) +; CHECK-NEXT: {{17592186044416|2147450880}} ; CHECK: ret -define void @example_atomicrmw(i64* %ptr) nounwind uwtable address_safety { +define void @example_atomicrmw(i64* %ptr) nounwind uwtable sanitize_address { entry: %0 = atomicrmw add i64* %ptr, i64 1 seq_cst ret void @@ -24,7 +24,7 @@ entry: ; CHECK: atomicrmw ; CHECK: ret -define void @example_cmpxchg(i64* %ptr, i64 %compare_to, i64 %new_value) nounwind uwtable address_safety { +define void @example_cmpxchg(i64* %ptr, i64 %compare_to, i64 %new_value) nounwind uwtable sanitize_address { entry: %0 = cmpxchg i64* %ptr, i64 %compare_to, i64 %new_value seq_cst ret void diff --git a/test/Instrumentation/MemorySanitizer/msan_basic.ll b/test/Instrumentation/MemorySanitizer/msan_basic.ll index 20957fb..60f946f 100644 --- a/test/Instrumentation/MemorySanitizer/msan_basic.ll +++ b/test/Instrumentation/MemorySanitizer/msan_basic.ll @@ -362,6 +362,41 @@ define zeroext i1 @ICmpSLE(i32 %x) nounwind uwtable readnone { ; CHECK: ret i1 + +; Check that we propagate shadow for x<0, x>=0, etc. (i.e. sign-bit tests) +; of the vector arguments. + +define <2 x i1> @ICmpSLT_vector(<2 x i32*> %x) nounwind uwtable readnone { + %1 = icmp slt <2 x i32*> %x, zeroinitializer + ret <2 x i1> %1 +} + +; CHECK: @ICmpSLT_vector +; CHECK: icmp slt <2 x i64> +; CHECK-NOT: call void @__msan_warning +; CHECK: icmp slt <2 x i32*> +; CHECK-NOT: call void @__msan_warning +; CHECK: ret <2 x i1> + + +; Check that we propagate shadow for unsigned relational comparisons with +; constants. + +define zeroext i1 @ICmpUGTConst(i32 %x) nounwind uwtable readnone { +entry: + %cmp = icmp ugt i32 %x, 7 + ret i1 %cmp +} + +; CHECK: @ICmpUGTConst +; CHECK: icmp ugt i32 +; CHECK-NOT: call void @__msan_warning +; CHECK: icmp ugt i32 +; CHECK-NOT: call void @__msan_warning +; CHECK: icmp ugt i32 +; CHECK-NOT: call void @__msan_warning +; CHECK: ret i1 + + ; Check that loads of shadow have the same alignment as the original loads. ; Check that loads of origin have the alignment of max(4, original alignment). @@ -534,3 +569,30 @@ define <8 x i8*> @VectorOfPointers(<8 x i8*>* %p) nounwind uwtable { ; CHECK: load <8 x i8*>* ; CHECK: store <8 x i64> {{.*}} @__msan_retval_tls ; CHECK: ret <8 x i8*> + +; Test handling of va_copy.
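As background for the test below (a hedged note): va_copy leaves the destination va_list fully initialized, so MSan can simply zero its shadow; on x86-64 SysV a va_list occupies 24 bytes, which is exactly the memset the CHECK matches. A sketch of that shadow-side effect (assumed intent; the helper name is illustrative):

    #include <cstring>

    // Clear the destination va_list's shadow so no stale uninitialized bits
    // are reported; 24 == sizeof(va_list) on x86-64 SysV (assumption).
    void vaCopyShadow(void *DstShadow) {
      std::memset(DstShadow, 0, 24);
    }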
+ +declare void @llvm.va_copy(i8*, i8*) nounwind + +define void @VACopy(i8* %p1, i8* %p2) nounwind uwtable { + call void @llvm.va_copy(i8* %p1, i8* %p2) nounwind + ret void +} + +; CHECK: @VACopy +; CHECK: call void @llvm.memset.p0i8.i64({{.*}}, i8 0, i64 24, i32 8, i1 false) +; CHECK: ret void + + +; Test handling of volatile stores. +; Check that MemorySanitizer does not add a check of the value being stored. + +define void @VolatileStore(i32* nocapture %p, i32 %x) nounwind uwtable { +entry: + store volatile i32 %x, i32* %p, align 4 + ret void +} + +; CHECK: @VolatileStore +; CHECK-NOT: @__msan_warning +; CHECK: ret void diff --git a/test/Instrumentation/ThreadSanitizer/tsan-vs-gvn.ll b/test/Instrumentation/ThreadSanitizer/tsan-vs-gvn.ll new file mode 100644 index 0000000..a83a274 --- /dev/null +++ b/test/Instrumentation/ThreadSanitizer/tsan-vs-gvn.ll @@ -0,0 +1,26 @@ +; RUN: opt < %s -basicaa -gvn -tsan -S | FileCheck %s +; TSAN conflicts with load widening. Make sure the load widening is off with -tsan. + +; 32-bit little endian target. +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32" + +%struct_of_8_bytes_4_aligned = type { i32, i8, i8, i8, i8} + +@f = global %struct_of_8_bytes_4_aligned zeroinitializer, align 4 + +; Accessing bytes 4 and 6, not ok to widen to i32 if sanitize_thread is set. + +define i32 @test_widening_bad(i8* %P) nounwind ssp noredzone sanitize_thread { +entry: + %tmp = load i8* getelementptr inbounds (%struct_of_8_bytes_4_aligned* @f, i64 0, i32 1), align 4 + %conv = zext i8 %tmp to i32 + %tmp1 = load i8* getelementptr inbounds (%struct_of_8_bytes_4_aligned* @f, i64 0, i32 3), align 1 + %conv2 = zext i8 %tmp1 to i32 + %add = add nsw i32 %conv, %conv2 + ret i32 %add +; CHECK: @test_widening_bad +; CHECK: call void @__tsan_read1 +; CHECK: call void @__tsan_read1 +; CHECK-NOT: call void @__tsan_read4 +; CHECK: ret i32 +} diff --git a/test/JitListener/test-inline.ll b/test/JitListener/test-inline.ll index 8bfaeaf..ca5d8d6 100644 --- a/test/JitListener/test-inline.ll +++ b/test/JitListener/test-inline.ll @@ -1,13 +1,29 @@ ; RUN: llvm-jitlistener %s | FileCheck %s -; CHECK: Method load [1]: _Z15test_parametersPfPA2_dR11char_structPPitm, Size = 165 -; CHECK: Method load [2]: _Z3food, Size = 39 +; CHECK: Method load [1]: _Z15test_parametersPfPA2_dR11char_structPPitm, Size = 170 +; CHECK: Line info @ 0: test-inline.cpp, line 33 +; CHECK: Line info @ 35: test-inline.cpp, line 34 +; CHECK: Line info @ 165: test-inline.cpp, line 35 +; CHECK: Method load [2]: _Z3foov, Size = 3 +; CHECK: Line info @ 0: test-inline.cpp, line 28 +; CHECK: Line info @ 2: test-inline.cpp, line 29 +; CHECK: Line info @ 3: test-inline.cpp, line 29 ; CHECK: Method load [3]: main, Size = 146 +; CHECK: Line info @ 0: test-inline.cpp, line 39 +; CHECK: Line info @ 21: test-inline.cpp, line 41 +; CHECK: Line info @ 39: test-inline.cpp, line 42 +; CHECK: Line info @ 60: test-inline.cpp, line 44 +; CHECK: Line info @ 80: test-inline.cpp, line 48 +; CHECK: Line info @ 90: test-inline.cpp, line 45 +; CHECK: Line info @ 95: test-inline.cpp, line 46 +; CHECK: Line info @ 114: test-inline.cpp, line 48 +; CHECK: Line info @ 141: test-inline.cpp, line 49 +; CHECK: Line info @ 146: test-inline.cpp, line 49 ; CHECK: Method unload [1] ; CHECK: Method unload [2] ; CHECK: Method unload [3] -; ModuleID = 'test-inline.bc' +; ModuleID = 'test-inline.cpp' target datalayout = 
"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" @@ -25,64 +41,54 @@ entry: %us.addr = alloca i16, align 2 %l.addr = alloca i64, align 8 %result = alloca double, align 8 - %result2 = alloca i32, align 4 store float* %pf, float** %pf.addr, align 8 - call void @llvm.dbg.declare(metadata !{float** %pf.addr}, metadata !32), !dbg !35 + call void @llvm.dbg.declare(metadata !{float** %pf.addr}, metadata !46), !dbg !47 store [2 x double]* %ppd, [2 x double]** %ppd.addr, align 8 - call void @llvm.dbg.declare(metadata !{[2 x double]** %ppd.addr}, metadata !36), !dbg !39 + call void @llvm.dbg.declare(metadata !{[2 x double]** %ppd.addr}, metadata !48), !dbg !47 store %struct.char_struct* %s, %struct.char_struct** %s.addr, align 8 - call void @llvm.dbg.declare(metadata !{%struct.char_struct** %s.addr}, metadata !40), !dbg !42 + call void @llvm.dbg.declare(metadata !{%struct.char_struct** %s.addr}, metadata !49), !dbg !47 store i32** %ppn, i32*** %ppn.addr, align 8 - call void @llvm.dbg.declare(metadata !{i32*** %ppn.addr}, metadata !43), !dbg !46 + call void @llvm.dbg.declare(metadata !{i32*** %ppn.addr}, metadata !50), !dbg !47 store i16 %us, i16* %us.addr, align 2 - call void @llvm.dbg.declare(metadata !{i16* %us.addr}, metadata !47), !dbg !49 + call void @llvm.dbg.declare(metadata !{i16* %us.addr}, metadata !51), !dbg !47 store i64 %l, i64* %l.addr, align 8 - call void @llvm.dbg.declare(metadata !{i64* %l.addr}, metadata !50), !dbg !53 - call void @llvm.dbg.declare(metadata !{double* %result}, metadata !54), !dbg !56 - %0 = load float** %pf.addr, align 8, !dbg !57 - %arrayidx = getelementptr inbounds float* %0, i64 0, !dbg !57 - %1 = load float* %arrayidx, !dbg !57 - %conv = fpext float %1 to double, !dbg !57 - %2 = load [2 x double]** %ppd.addr, align 8, !dbg !57 - %arrayidx1 = getelementptr inbounds [2 x double]* %2, i64 1, !dbg !57 - %arrayidx2 = getelementptr inbounds [2 x double]* %arrayidx1, i32 0, i64 1, !dbg !57 - %3 = load double* %arrayidx2, !dbg !57 - %mul = fmul double %conv, %3, !dbg !57 - %4 = load %struct.char_struct** %s.addr, !dbg !57 - %c = getelementptr inbounds %struct.char_struct* %4, i32 0, i32 0, !dbg !57 - %5 = load i8* %c, align 1, !dbg !57 - %conv3 = sext i8 %5 to i32, !dbg !57 - %conv4 = sitofp i32 %conv3 to double, !dbg !57 - %mul5 = fmul double %mul, %conv4, !dbg !57 - %6 = load i16* %us.addr, align 2, !dbg !57 - %conv6 = zext i16 %6 to i32, !dbg !57 - %conv7 = sitofp i32 %conv6 to double, !dbg !57 - %mul8 = fmul double %mul5, %conv7, !dbg !57 - %7 = load i64* %l.addr, align 8, !dbg !57 - %conv9 = uitofp i64 %7 to double, !dbg !57 - %mul10 = fmul double %mul8, %conv9, !dbg !57 - store double %mul10, double* %result, align 8, !dbg !57 - call void @llvm.dbg.declare(metadata !{i32* %result2}, metadata !58), !dbg !59 - %8 = load double* %result, align 8, !dbg !60 - %call = call i32 @_Z3food(double %8), !dbg !60 - store i32 %call, i32* %result2, align 4, !dbg !60 - %9 = load i32* %result2, align 4, !dbg !61 - %conv11 = sitofp i32 %9 to double, !dbg !61 - ret double %conv11, !dbg !61 + call void @llvm.dbg.declare(metadata !{i64* %l.addr}, metadata !52), !dbg !47 + call void @llvm.dbg.declare(metadata !{double* %result}, metadata !53), !dbg !55 + %0 = load float** %pf.addr, align 8, !dbg !55 + %arrayidx = getelementptr inbounds float* %0, i64 0, !dbg !55 + %1 = load float* %arrayidx, align 4, !dbg !55 + %conv = fpext float %1 to double, 
!dbg !55 + %2 = load [2 x double]** %ppd.addr, align 8, !dbg !55 + %arrayidx1 = getelementptr inbounds [2 x double]* %2, i64 1, !dbg !55 + %arrayidx2 = getelementptr inbounds [2 x double]* %arrayidx1, i32 0, i64 1, !dbg !55 + %3 = load double* %arrayidx2, align 8, !dbg !55 + %mul = fmul double %conv, %3, !dbg !55 + %4 = load %struct.char_struct** %s.addr, align 8, !dbg !55 + %c = getelementptr inbounds %struct.char_struct* %4, i32 0, i32 0, !dbg !55 + %5 = load i8* %c, align 1, !dbg !55 + %conv3 = sext i8 %5 to i32, !dbg !55 + %conv4 = sitofp i32 %conv3 to double, !dbg !55 + %mul5 = fmul double %mul, %conv4, !dbg !55 + %6 = load i16* %us.addr, align 2, !dbg !55 + %conv6 = zext i16 %6 to i32, !dbg !55 + %conv7 = sitofp i32 %conv6 to double, !dbg !55 + %mul8 = fmul double %mul5, %conv7, !dbg !55 + %7 = load i64* %l.addr, align 8, !dbg !55 + %conv9 = uitofp i64 %7 to double, !dbg !55 + %mul10 = fmul double %mul8, %conv9, !dbg !55 + %call = call i32 @_Z3foov(), !dbg !55 + %conv11 = sitofp i32 %call to double, !dbg !55 + %add = fadd double %mul10, %conv11, !dbg !55 + store double %add, double* %result, align 8, !dbg !55 + %8 = load double* %result, align 8, !dbg !56 + ret double %8, !dbg !56 } declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone -define linkonce_odr i32 @_Z3food(double %input) nounwind uwtable inlinehint { +define linkonce_odr i32 @_Z3foov() nounwind uwtable inlinehint { entry: - %input.addr = alloca double, align 8 - store double %input, double* %input.addr, align 8 - call void @llvm.dbg.declare(metadata !{double* %input.addr}, metadata !62), !dbg !63 - %0 = load double* %input.addr, align 8, !dbg !64 - %div = fdiv double %0, 3.000000e+00, !dbg !64 - %add = fadd double %div, 1.000000e+00, !dbg !64 - %conv = fptosi double %add to i32, !dbg !64 - ret i32 %conv, !dbg !64 + ret i32 0, !dbg !57 } define i32 @main(i32 %argc, i8** %argv) uwtable { @@ -96,124 +102,111 @@ entry: %result = alloca double, align 8 store i32 0, i32* %retval store i32 %argc, i32* %argc.addr, align 4 - call void @llvm.dbg.declare(metadata !{i32* %argc.addr}, metadata !66), !dbg !67 + call void @llvm.dbg.declare(metadata !{i32* %argc.addr}, metadata !59), !dbg !60 store i8** %argv, i8*** %argv.addr, align 8 - call void @llvm.dbg.declare(metadata !{i8*** %argv.addr}, metadata !68), !dbg !71 - call void @llvm.dbg.declare(metadata !{%struct.char_struct* %s}, metadata !72), !dbg !74 - call void @llvm.dbg.declare(metadata !{float* %f}, metadata !75), !dbg !76 - store float 0.000000e+00, float* %f, align 4, !dbg !77 - call void @llvm.dbg.declare(metadata !{[2 x [2 x double]]* %d}, metadata !78), !dbg !81 - %0 = bitcast [2 x [2 x double]]* %d to i8*, !dbg !82 - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* bitcast ([2 x [2 x double]]* @_ZZ4mainE1d to i8*), i64 32, i32 16, i1 false), !dbg !82 - %c = getelementptr inbounds %struct.char_struct* %s, i32 0, i32 0, !dbg !83 - store i8 97, i8* %c, align 1, !dbg !83 - %c2 = getelementptr inbounds %struct.char_struct* %s, i32 0, i32 1, !dbg !84 - %arrayidx = getelementptr inbounds [2 x i8]* %c2, i32 0, i64 0, !dbg !84 - store i8 48, i8* %arrayidx, align 1, !dbg !84 - %c21 = getelementptr inbounds %struct.char_struct* %s, i32 0, i32 1, !dbg !85 - %arrayidx2 = getelementptr inbounds [2 x i8]* %c21, i32 0, i64 1, !dbg !85 - store i8 49, i8* %arrayidx2, align 1, !dbg !85 - call void @llvm.dbg.declare(metadata !{double* %result}, metadata !86), !dbg !87 - %arraydecay = getelementptr inbounds [2 x [2 x double]]* %d, i32 0, i32 0, !dbg !88 - %call = call double 
@_Z15test_parametersPfPA2_dR11char_structPPitm(float* %f, [2 x double]* %arraydecay, %struct.char_struct* %s, i32** null, i16 zeroext 10, i64 42), !dbg !88 - store double %call, double* %result, align 8, !dbg !88 - %1 = load double* %result, align 8, !dbg !89 - %cmp = fcmp oeq double %1, 0.000000e+00, !dbg !89 - %cond = select i1 %cmp, i32 0, i32 -1, !dbg !89 - ret i32 %cond, !dbg !89 + call void @llvm.dbg.declare(metadata !{i8*** %argv.addr}, metadata !61), !dbg !60 + call void @llvm.dbg.declare(metadata !{%struct.char_struct* %s}, metadata !62), !dbg !64 + call void @llvm.dbg.declare(metadata !{float* %f}, metadata !65), !dbg !66 + store float 0.000000e+00, float* %f, align 4, !dbg !66 + call void @llvm.dbg.declare(metadata !{[2 x [2 x double]]* %d}, metadata !67), !dbg !70 + %0 = bitcast [2 x [2 x double]]* %d to i8*, !dbg !70 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* bitcast ([2 x [2 x double]]* @_ZZ4mainE1d to i8*), i64 32, i32 16, i1 false), !dbg !70 + %c = getelementptr inbounds %struct.char_struct* %s, i32 0, i32 0, !dbg !71 + store i8 97, i8* %c, align 1, !dbg !71 + %c2 = getelementptr inbounds %struct.char_struct* %s, i32 0, i32 1, !dbg !72 + %arrayidx = getelementptr inbounds [2 x i8]* %c2, i32 0, i64 0, !dbg !72 + store i8 48, i8* %arrayidx, align 1, !dbg !72 + %c21 = getelementptr inbounds %struct.char_struct* %s, i32 0, i32 1, !dbg !73 + %arrayidx2 = getelementptr inbounds [2 x i8]* %c21, i32 0, i64 1, !dbg !73 + store i8 49, i8* %arrayidx2, align 1, !dbg !73 + call void @llvm.dbg.declare(metadata !{double* %result}, metadata !74), !dbg !75 + %arraydecay = getelementptr inbounds [2 x [2 x double]]* %d, i32 0, i32 0, !dbg !75 + %call = call double @_Z15test_parametersPfPA2_dR11char_structPPitm(float* %f, [2 x double]* %arraydecay, %struct.char_struct* %s, i32** null, i16 zeroext 10, i64 42), !dbg !75 + store double %call, double* %result, align 8, !dbg !75 + %1 = load double* %result, align 8, !dbg !76 + %cmp = fcmp oeq double %1, 0.000000e+00, !dbg !76 + %cond = select i1 %cmp, i32 0, i32 -1, !dbg !76 + ret i32 %cond, !dbg !76 } declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind !llvm.dbg.cu = !{!0} -!0 = metadata !{i32 720913, i32 0, i32 4, metadata !"test-inline.cpp", metadata !"/home/athirumurthi/dev/opencl-mc/build/RH64/Debug/backend/llvm", metadata !"clang version 3.0 (branches/release_30 36797)", i1 true, i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !17} ; [ DW_TAG_compile_unit ] +!0 = metadata !{i32 786449, i32 0, i32 4, metadata !"test-inline.cpp", metadata !"/home/akaylor/dev", metadata !"clang version 3.3 (ssh://akaylor@git-amr-1.devtools.intel.com:29418/ssg_llvm-clang2 gitosis@miro.kw.intel.com:clang.git 39450d0469e0d5589ad39fd0b20b5742750619a0) (ssh://akaylor@git-amr-1.devtools.intel.com:29418/ssg_llvm-llvm gitosis@miro.kw.intel.com:llvm.git 376642ed620ecae05b68c7bc81f79aeb2065abe0)", i1 true, i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !43} ; [ DW_TAG_compile_unit ] [/home/akaylor/dev/test-inline.cpp] [DW_LANG_C_plus_plus] !1 = metadata !{metadata !2} !2 = metadata !{i32 0} !3 = metadata !{metadata !4} -!4 = metadata !{metadata !5, metadata !12, metadata !16} -!5 = metadata !{i32 720942, i32 0, metadata !6, metadata !"test_parameters", metadata !"test_parameters", metadata !"_Z15test_parametersPfPA2_dR11char_structPPitm", metadata !6, i32 33, metadata !7, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 false, double (float*, [2 x double]*, 
%struct.char_struct*, i32**, i16, i64)* @_Z15test_parametersPfPA2_dR11char_structPPitm, null, null, metadata !10} ; [ DW_TAG_subprogram ] -!6 = metadata !{i32 720937, metadata !"test-inline.cpp", metadata !"/home/athirumurthi/dev/opencl-mc/build/RH64/Debug/backend/llvm", null} ; [ DW_TAG_file_type ] -!7 = metadata !{i32 720917, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] -!8 = metadata !{metadata !9} -!9 = metadata !{i32 720932, null, metadata !"double", null, i32 0, i64 64, i64 64, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ] -!10 = metadata !{metadata !11} -!11 = metadata !{i32 720932} ; [ DW_TAG_base_type ] -!12 = metadata !{i32 720942, i32 0, metadata !6, metadata !"main", metadata !"main", metadata !"", metadata !6, i32 40, metadata !13, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 false, i32 (i32, i8**)* @main, null, null, metadata !10} ; [ DW_TAG_subprogram ] -!13 = metadata !{i32 720917, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !14, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] +!4 = metadata !{metadata !5, metadata !35, metadata !40} +!5 = metadata !{i32 786478, i32 0, metadata !6, metadata !"test_parameters", metadata !"test_parameters", metadata !"_Z15test_parametersPfPA2_dR11char_structPPitm", metadata !6, i32 32, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, double (float*, [2 x double]*, %struct.char_struct*, i32**, i16, i64)* @_Z15test_parametersPfPA2_dR11char_structPPitm, null, null, metadata !1, i32 33} ; [ DW_TAG_subprogram ] [line 32] [def] [scope 33] [test_parameters] +!6 = metadata !{i32 786473, metadata !"test-inline.cpp", metadata !"/home/akaylor/dev", null} ; [ DW_TAG_file_type ] +!7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] +!8 = metadata !{metadata !9, metadata !10, metadata !12, metadata !16, metadata !29, metadata !32, metadata !33} +!9 = metadata !{i32 786468, null, metadata !"double", null, i32 0, i64 64, i64 64, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ] [double] [line 0, size 64, align 64, offset 0, enc DW_ATE_float] +!10 = metadata !{i32 786447, null, metadata !"", null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !11} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from float] +!11 = metadata !{i32 786468, null, metadata !"float", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ] [float] [line 0, size 32, align 32, offset 0, enc DW_ATE_float] +!12 = metadata !{i32 786447, null, metadata !"", null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !13} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ] +!13 = metadata !{i32 786433, null, metadata !"", null, i32 0, i64 128, i64 64, i32 0, i32 0, metadata !9, metadata !14, i32 0, i32 0} ; [ DW_TAG_array_type ] [line 0, size 128, align 64, offset 0] [from double] !14 = metadata !{metadata !15} -!15 = metadata !{i32 720932, null, metadata !"int", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] -!16 = metadata !{i32 720942, i32 0, metadata !6, metadata !"foo", metadata !"foo", metadata !"_Z3food", metadata !6, i32 28, metadata !13, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 false, i32 (double)* @_Z3food, null, null, metadata !10} ; [ DW_TAG_subprogram ] -!17 = metadata !{metadata !18} -!18 = metadata !{metadata !19} -!19 = 
metadata !{i32 720948, i32 0, null, metadata !"compound_char", metadata !"compound_char", metadata !"", metadata !6, i32 25, metadata !20, i32 0, i32 1, %struct.char_struct* @compound_char} ; [ DW_TAG_variable ] -!20 = metadata !{i32 720898, null, metadata !"char_struct", metadata !6, i32 22, i64 24, i64 8, i32 0, i32 0, null, metadata !21, i32 0, null, null} ; [ DW_TAG_class_type ] -!21 = metadata !{metadata !22, metadata !24, metadata !28} -!22 = metadata !{i32 720909, metadata !20, metadata !"c", metadata !6, i32 23, i64 8, i64 8, i64 0, i32 0, metadata !23} ; [ DW_TAG_member ] -!23 = metadata !{i32 720932, null, metadata !"char", null, i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ] -!24 = metadata !{i32 720909, metadata !20, metadata !"c2", metadata !6, i32 24, i64 16, i64 8, i64 8, i32 0, metadata !25} ; [ DW_TAG_member ] -!25 = metadata !{i32 720897, null, metadata !"", null, i32 0, i64 16, i64 8, i32 0, i32 0, metadata !23, metadata !26, i32 0, i32 0} ; [ DW_TAG_array_type ] -!26 = metadata !{metadata !27} -!27 = metadata !{i32 720929, i64 0, i64 2} ; [ DW_TAG_subrange_type ] -!28 = metadata !{i32 720942, i32 0, metadata !20, metadata !"char_struct", metadata !"char_struct", metadata !"", metadata !6, i32 22, metadata !29, i1 false, i1 false, i32 0, i32 0, null, i32 320, i1 false, null, null, i32 0, metadata !10} ; [ DW_TAG_subprogram ] -!29 = metadata !{i32 720917, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !30, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] -!30 = metadata !{null, metadata !31} -!31 = metadata !{i32 720911, i32 0, metadata !"", i32 0, i32 0, i64 64, i64 64, i64 0, i32 64, metadata !20} ; [ DW_TAG_pointer_type ] -!32 = metadata !{i32 721153, metadata !5, metadata !"pf", metadata !6, i32 16777248, metadata !33, i32 0, i32 0} ; [ DW_TAG_arg_variable ] -!33 = metadata !{i32 720911, null, metadata !"", null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !34} ; [ DW_TAG_pointer_type ] -!34 = metadata !{i32 720932, null, metadata !"float", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ] -!35 = metadata !{i32 32, i32 31, metadata !5, null} -!36 = metadata !{i32 721153, metadata !5, metadata !"ppd", metadata !6, i32 33554464, metadata !37, i32 0, i32 0} ; [ DW_TAG_arg_variable ] -!37 = metadata !{i32 720911, null, metadata !"", null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !38} ; [ DW_TAG_pointer_type ] -!38 = metadata !{i32 720897, null, metadata !"", null, i32 0, i64 128, i64 64, i32 0, i32 0, metadata !9, metadata !26, i32 0, i32 0} ; [ DW_TAG_array_type ] -!39 = metadata !{i32 32, i32 42, metadata !5, null} -!40 = metadata !{i32 721153, metadata !5, metadata !"s", metadata !6, i32 50331680, metadata !41, i32 0, i32 0} ; [ DW_TAG_arg_variable ] -!41 = metadata !{i32 720912, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !20} ; [ DW_TAG_reference_type ] -!42 = metadata !{i32 32, i32 72, metadata !5, null} -!43 = metadata !{i32 721153, metadata !5, metadata !"ppn", metadata !6, i32 67108896, metadata !44, i32 0, i32 0} ; [ DW_TAG_arg_variable ] -!44 = metadata !{i32 720911, null, metadata !"", null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !45} ; [ DW_TAG_pointer_type ] -!45 = metadata !{i32 720911, null, metadata !"", null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !15} ; [ DW_TAG_pointer_type ] -!46 = metadata !{i32 32, i32 81, metadata !5, null} -!47 = metadata !{i32 721153, metadata !5, metadata !"us", metadata !6, i32 83886112, metadata !48, i32 0, i32 0} ; [ 
DW_TAG_arg_variable ] -!48 = metadata !{i32 720932, null, metadata !"unsigned short", null, i32 0, i64 16, i64 16, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ] -!49 = metadata !{i32 32, i32 105, metadata !5, null} -!50 = metadata !{i32 721153, metadata !5, metadata !"l", metadata !6, i32 100663328, metadata !51, i32 0, i32 0} ; [ DW_TAG_arg_variable ] -!51 = metadata !{i32 720934, null, metadata !"", null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !52} ; [ DW_TAG_const_type ] -!52 = metadata !{i32 720932, null, metadata !"long unsigned int", null, i32 0, i64 64, i64 64, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ] -!53 = metadata !{i32 32, i32 135, metadata !5, null} -!54 = metadata !{i32 721152, metadata !55, metadata !"result", metadata !6, i32 34, metadata !9, i32 0, i32 0} ; [ DW_TAG_auto_variable ] -!55 = metadata !{i32 720907, metadata !5, i32 33, i32 1, metadata !6, i32 0} ; [ DW_TAG_lexical_block ] -!56 = metadata !{i32 34, i32 10, metadata !55, null} -!57 = metadata !{i32 34, i32 51, metadata !55, null} -!58 = metadata !{i32 721152, metadata !55, metadata !"result2", metadata !6, i32 35, metadata !15, i32 0, i32 0} ; [ DW_TAG_auto_variable ] -!59 = metadata !{i32 35, i32 7, metadata !55, null} -!60 = metadata !{i32 35, i32 17, metadata !55, null} -!61 = metadata !{i32 36, i32 3, metadata !55, null} -!62 = metadata !{i32 721153, metadata !16, metadata !"input", metadata !6, i32 16777243, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ] -!63 = metadata !{i32 27, i32 23, metadata !16, null} -!64 = metadata !{i32 29, i32 3, metadata !65, null} -!65 = metadata !{i32 720907, metadata !16, i32 28, i32 1, metadata !6, i32 2} ; [ DW_TAG_lexical_block ] -!66 = metadata !{i32 721153, metadata !12, metadata !"argc", metadata !6, i32 16777255, metadata !15, i32 0, i32 0} ; [ DW_TAG_arg_variable ] -!67 = metadata !{i32 39, i32 14, metadata !12, null} -!68 = metadata !{i32 721153, metadata !12, metadata !"argv", metadata !6, i32 33554471, metadata !69, i32 0, i32 0} ; [ DW_TAG_arg_variable ] -!69 = metadata !{i32 720911, null, metadata !"", null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !70} ; [ DW_TAG_pointer_type ] -!70 = metadata !{i32 720911, null, metadata !"", null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !23} ; [ DW_TAG_pointer_type ] -!71 = metadata !{i32 39, i32 26, metadata !12, null} -!72 = metadata !{i32 721152, metadata !73, metadata !"s", metadata !6, i32 41, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] -!73 = metadata !{i32 720907, metadata !12, i32 40, i32 1, metadata !6, i32 1} ; [ DW_TAG_lexical_block ] -!74 = metadata !{i32 41, i32 22, metadata !73, null} -!75 = metadata !{i32 721152, metadata !73, metadata !"f", metadata !6, i32 42, metadata !34, i32 0, i32 0} ; [ DW_TAG_auto_variable ] -!76 = metadata !{i32 42, i32 9, metadata !73, null} -!77 = metadata !{i32 42, i32 16, metadata !73, null} -!78 = metadata !{i32 721152, metadata !73, metadata !"d", metadata !6, i32 43, metadata !79, i32 0, i32 0} ; [ DW_TAG_auto_variable ] -!79 = metadata !{i32 720897, null, metadata !"", null, i32 0, i64 256, i64 64, i32 0, i32 0, metadata !9, metadata !80, i32 0, i32 0} ; [ DW_TAG_array_type ] -!80 = metadata !{metadata !27, metadata !27} -!81 = metadata !{i32 43, i32 10, metadata !73, null} -!82 = metadata !{i32 43, i32 38, metadata !73, null} -!83 = metadata !{i32 45, i32 3, metadata !73, null} -!84 = metadata !{i32 46, i32 3, metadata !73, null} -!85 = metadata !{i32 47, i32 3, metadata !73, null} -!86 = metadata !{i32 721152, metadata !73, metadata !"result", 
metadata !6, i32 49, metadata !9, i32 0, i32 0} ; [ DW_TAG_auto_variable ] -!87 = metadata !{i32 49, i32 10, metadata !73, null} -!88 = metadata !{i32 49, i32 19, metadata !73, null} -!89 = metadata !{i32 50, i32 3, metadata !73, null} +!15 = metadata !{i32 786465, i64 0, i64 2} ; [ DW_TAG_subrange_type ] [0, 1] +!16 = metadata !{i32 786448, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !17} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from char_struct] +!17 = metadata !{i32 786451, null, metadata !"char_struct", metadata !6, i32 22, i64 24, i64 8, i32 0, i32 0, null, metadata !18, i32 0, null, null} ; [ DW_TAG_structure_type ] [char_struct] [line 22, size 24, align 8, offset 0] [from ] +!18 = metadata !{metadata !19, metadata !21, metadata !23} +!19 = metadata !{i32 786445, metadata !17, metadata !"c", metadata !6, i32 23, i64 8, i64 8, i64 0, i32 0, metadata !20} ; [ DW_TAG_member ] [c] [line 23, size 8, align 8, offset 0] [from char] +!20 = metadata !{i32 786468, null, metadata !"char", null, i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ] [char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char] +!21 = metadata !{i32 786445, metadata !17, metadata !"c2", metadata !6, i32 24, i64 16, i64 8, i64 8, i32 0, metadata !22} ; [ DW_TAG_member ] [c2] [line 24, size 16, align 8, offset 8] [from ] +!22 = metadata !{i32 786433, null, metadata !"", null, i32 0, i64 16, i64 8, i32 0, i32 0, metadata !20, metadata !14, i32 0, i32 0} ; [ DW_TAG_array_type ] [line 0, size 16, align 8, offset 0] [from char] +!23 = metadata !{i32 786478, i32 0, metadata !17, metadata !"char_struct", metadata !"char_struct", metadata !"", metadata !6, i32 22, metadata !24, i1 false, i1 false, i32 0, i32 0, null, i32 320, i1 false, null, null, i32 0, metadata !27, i32 22} ; [ DW_TAG_subprogram ] [line 22] [char_struct] +!24 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !25, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] +!25 = metadata !{null, metadata !26} +!26 = metadata !{i32 786447, i32 0, metadata !"", i32 0, i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !17} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from char_struct] +!27 = metadata !{metadata !28} +!28 = metadata !{i32 786468} ; [ DW_TAG_base_type ] [line 0, size 0, align 0, offset 0] +!29 = metadata !{i32 786447, null, metadata !"", null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !30} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ] +!30 = metadata !{i32 786447, null, metadata !"", null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !31} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from int] +!31 = metadata !{i32 786468, null, metadata !"int", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed] +!32 = metadata !{i32 786468, null, metadata !"unsigned short", null, i32 0, i64 16, i64 16, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ] [unsigned short] [line 0, size 16, align 16, offset 0, enc DW_ATE_unsigned] +!33 = metadata !{i32 786470, null, metadata !"", null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !34} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from long unsigned int] +!34 = metadata !{i32 786468, null, metadata !"long unsigned int", null, i32 0, i64 64, i64 64, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ] [long unsigned int] [line 0, 
size 64, align 64, offset 0, enc DW_ATE_unsigned] +!35 = metadata !{i32 786478, i32 0, metadata !6, metadata !"main", metadata !"main", metadata !"", metadata !6, i32 38, metadata !36, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (i32, i8**)* @main, null, null, metadata !1, i32 39} ; [ DW_TAG_subprogram ] [line 38] [def] [scope 39] [main] +!36 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !37, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] +!37 = metadata !{metadata !31, metadata !31, metadata !38} +!38 = metadata !{i32 786447, null, metadata !"", null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !39} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ] +!39 = metadata !{i32 786447, null, metadata !"", null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !20} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from char] +!40 = metadata !{i32 786478, i32 0, metadata !6, metadata !"foo", metadata !"foo", metadata !"_Z3foov", metadata !6, i32 27, metadata !41, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @_Z3foov, null, null, metadata !1, i32 28} ; [ DW_TAG_subprogram ] [line 27] [def] [scope 28] [foo] +!41 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !42, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] +!42 = metadata !{metadata !31} +!43 = metadata !{metadata !44} +!44 = metadata !{metadata !45} +!45 = metadata !{i32 786484, i32 0, null, metadata !"compound_char", metadata !"compound_char", metadata !"", metadata !6, i32 25, metadata !17, i32 0, i32 1, %struct.char_struct* @compound_char} ; [ DW_TAG_variable ] [compound_char] [line 25] [def] +!46 = metadata !{i32 786689, metadata !5, metadata !"pf", metadata !6, i32 16777248, metadata !10, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [pf] [line 32] +!47 = metadata !{i32 32, i32 0, metadata !5, null} +!48 = metadata !{i32 786689, metadata !5, metadata !"ppd", metadata !6, i32 33554464, metadata !12, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [ppd] [line 32] +!49 = metadata !{i32 786689, metadata !5, metadata !"s", metadata !6, i32 50331680, metadata !16, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [s] [line 32] +!50 = metadata !{i32 786689, metadata !5, metadata !"ppn", metadata !6, i32 67108896, metadata !29, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [ppn] [line 32] +!51 = metadata !{i32 786689, metadata !5, metadata !"us", metadata !6, i32 83886112, metadata !32, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [us] [line 32] +!52 = metadata !{i32 786689, metadata !5, metadata !"l", metadata !6, i32 100663328, metadata !33, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [l] [line 32] +!53 = metadata !{i32 786688, metadata !54, metadata !"result", metadata !6, i32 34, metadata !9, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [result] [line 34] +!54 = metadata !{i32 786443, metadata !5, i32 33, i32 0, metadata !6, i32 0} ; [ DW_TAG_lexical_block ] [/home/akaylor/dev/test-inline.cpp] +!55 = metadata !{i32 34, i32 0, metadata !54, null} +!56 = metadata !{i32 35, i32 0, metadata !54, null} +!57 = metadata !{i32 29, i32 0, metadata !58, null} +!58 = metadata !{i32 786443, metadata !40, i32 28, i32 0, metadata !6, i32 2} ; [ DW_TAG_lexical_block ] [/home/akaylor/dev/test-inline.cpp] +!59 = metadata !{i32 786689, metadata !35, metadata !"argc", metadata !6, i32 16777254, metadata !31, i32 0, i32 0} ; [ 
DW_TAG_arg_variable ] [argc] [line 38] +!60 = metadata !{i32 38, i32 0, metadata !35, null} +!61 = metadata !{i32 786689, metadata !35, metadata !"argv", metadata !6, i32 33554470, metadata !38, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [argv] [line 38] +!62 = metadata !{i32 786688, metadata !63, metadata !"s", metadata !6, i32 40, metadata !17, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [s] [line 40] +!63 = metadata !{i32 786443, metadata !35, i32 39, i32 0, metadata !6, i32 1} ; [ DW_TAG_lexical_block ] [/home/akaylor/dev/test-inline.cpp] +!64 = metadata !{i32 40, i32 0, metadata !63, null} +!65 = metadata !{i32 786688, metadata !63, metadata !"f", metadata !6, i32 41, metadata !11, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [f] [line 41] +!66 = metadata !{i32 41, i32 0, metadata !63, null} +!67 = metadata !{i32 786688, metadata !63, metadata !"d", metadata !6, i32 42, metadata !68, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [d] [line 42] +!68 = metadata !{i32 786433, null, metadata !"", null, i32 0, i64 256, i64 64, i32 0, i32 0, metadata !9, metadata !69, i32 0, i32 0} ; [ DW_TAG_array_type ] [line 0, size 256, align 64, offset 0] [from double] +!69 = metadata !{metadata !15, metadata !15} +!70 = metadata !{i32 42, i32 0, metadata !63, null} +!71 = metadata !{i32 44, i32 0, metadata !63, null} +!72 = metadata !{i32 45, i32 0, metadata !63, null} +!73 = metadata !{i32 46, i32 0, metadata !63, null} +!74 = metadata !{i32 786688, metadata !63, metadata !"result", metadata !6, i32 48, metadata !9, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [result] [line 48] +!75 = metadata !{i32 48, i32 0, metadata !63, null} +!76 = metadata !{i32 49, i32 0, metadata !63, null} diff --git a/test/JitListener/test-parameters.ll b/test/JitListener/test-parameters.ll index 0c437a8..1e2a2b3 100644 --- a/test/JitListener/test-parameters.ll +++ b/test/JitListener/test-parameters.ll @@ -1,13 +1,28 @@ ; RUN: llvm-jitlistener %s | FileCheck %s ; CHECK: Method load [1]: _Z15test_parametersPfPA2_dR11char_structPPitm, Size = 170 +; CHECK: Line info @ 0: test-parameters.cpp, line 33 +; CHECK: Line info @ 35: test-parameters.cpp, line 34 +; CHECK: Line info @ 165: test-parameters.cpp, line 35 ; CHECK: Method load [2]: _Z3foov, Size = 3 +; CHECK: Line info @ 0: test-parameters.cpp, line 28 +; CHECK: Line info @ 2: test-parameters.cpp, line 29 ; CHECK: Method load [3]: main, Size = 146 +; CHECK: Line info @ 0: test-parameters.cpp, line 39 +; CHECK: Line info @ 21: test-parameters.cpp, line 41 +; CHECK: Line info @ 39: test-parameters.cpp, line 42 +; CHECK: Line info @ 60: test-parameters.cpp, line 44 +; CHECK: Line info @ 80: test-parameters.cpp, line 48 +; CHECK: Line info @ 90: test-parameters.cpp, line 45 +; CHECK: Line info @ 95: test-parameters.cpp, line 46 +; CHECK: Line info @ 114: test-parameters.cpp, line 48 +; CHECK: Line info @ 141: test-parameters.cpp, line 49 +; CHECK: Line info @ 146: test-parameters.cpp, line 49 ; CHECK: Method unload [1] ; CHECK: Method unload [2] ; CHECK: Method unload [3] -; ModuleID = 'test-parameters.bc' +; ModuleID = 'test-parameters.cpp' target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" @@ -18,7 +33,7 @@ target triple = "x86_64-unknown-linux-gnu" define i32 @_Z3foov() nounwind uwtable { entry: - ret i32 0, !dbg !32 + ret i32 0, !dbg !46 } define double @_Z15test_parametersPfPA2_dR11char_structPPitm(float* %pf, [2 x double]* %ppd, 
%struct.char_struct* %s, i32** %ppn, i16 zeroext %us, i64 %l) nounwind uwtable { @@ -31,46 +46,46 @@ entry: %l.addr = alloca i64, align 8 %result = alloca double, align 8 store float* %pf, float** %pf.addr, align 8 - call void @llvm.dbg.declare(metadata !{float** %pf.addr}, metadata !34), !dbg !37 + call void @llvm.dbg.declare(metadata !{float** %pf.addr}, metadata !48), !dbg !49 store [2 x double]* %ppd, [2 x double]** %ppd.addr, align 8 - call void @llvm.dbg.declare(metadata !{[2 x double]** %ppd.addr}, metadata !38), !dbg !41 + call void @llvm.dbg.declare(metadata !{[2 x double]** %ppd.addr}, metadata !50), !dbg !49 store %struct.char_struct* %s, %struct.char_struct** %s.addr, align 8 - call void @llvm.dbg.declare(metadata !{%struct.char_struct** %s.addr}, metadata !42), !dbg !44 + call void @llvm.dbg.declare(metadata !{%struct.char_struct** %s.addr}, metadata !51), !dbg !49 store i32** %ppn, i32*** %ppn.addr, align 8 - call void @llvm.dbg.declare(metadata !{i32*** %ppn.addr}, metadata !45), !dbg !48 + call void @llvm.dbg.declare(metadata !{i32*** %ppn.addr}, metadata !52), !dbg !49 store i16 %us, i16* %us.addr, align 2 - call void @llvm.dbg.declare(metadata !{i16* %us.addr}, metadata !49), !dbg !51 + call void @llvm.dbg.declare(metadata !{i16* %us.addr}, metadata !53), !dbg !49 store i64 %l, i64* %l.addr, align 8 - call void @llvm.dbg.declare(metadata !{i64* %l.addr}, metadata !52), !dbg !55 - call void @llvm.dbg.declare(metadata !{double* %result}, metadata !56), !dbg !58 - %0 = load float** %pf.addr, align 8, !dbg !59 - %arrayidx = getelementptr inbounds float* %0, i64 0, !dbg !59 - %1 = load float* %arrayidx, !dbg !59 - %conv = fpext float %1 to double, !dbg !59 - %2 = load [2 x double]** %ppd.addr, align 8, !dbg !59 - %arrayidx1 = getelementptr inbounds [2 x double]* %2, i64 1, !dbg !59 - %arrayidx2 = getelementptr inbounds [2 x double]* %arrayidx1, i32 0, i64 1, !dbg !59 - %3 = load double* %arrayidx2, !dbg !59 - %mul = fmul double %conv, %3, !dbg !59 - %4 = load %struct.char_struct** %s.addr, !dbg !59 - %c = getelementptr inbounds %struct.char_struct* %4, i32 0, i32 0, !dbg !59 - %5 = load i8* %c, align 1, !dbg !59 - %conv3 = sext i8 %5 to i32, !dbg !59 - %conv4 = sitofp i32 %conv3 to double, !dbg !59 - %mul5 = fmul double %mul, %conv4, !dbg !59 - %6 = load i16* %us.addr, align 2, !dbg !59 - %conv6 = zext i16 %6 to i32, !dbg !59 - %conv7 = sitofp i32 %conv6 to double, !dbg !59 - %mul8 = fmul double %mul5, %conv7, !dbg !59 - %7 = load i64* %l.addr, align 8, !dbg !59 - %conv9 = uitofp i64 %7 to double, !dbg !59 - %mul10 = fmul double %mul8, %conv9, !dbg !59 - %call = call i32 @_Z3foov(), !dbg !60 - %conv11 = sitofp i32 %call to double, !dbg !60 - %add = fadd double %mul10, %conv11, !dbg !60 - store double %add, double* %result, align 8, !dbg !60 - %8 = load double* %result, align 8, !dbg !61 - ret double %8, !dbg !61 + call void @llvm.dbg.declare(metadata !{i64* %l.addr}, metadata !54), !dbg !49 + call void @llvm.dbg.declare(metadata !{double* %result}, metadata !55), !dbg !57 + %0 = load float** %pf.addr, align 8, !dbg !57 + %arrayidx = getelementptr inbounds float* %0, i64 0, !dbg !57 + %1 = load float* %arrayidx, align 4, !dbg !57 + %conv = fpext float %1 to double, !dbg !57 + %2 = load [2 x double]** %ppd.addr, align 8, !dbg !57 + %arrayidx1 = getelementptr inbounds [2 x double]* %2, i64 1, !dbg !57 + %arrayidx2 = getelementptr inbounds [2 x double]* %arrayidx1, i32 0, i64 1, !dbg !57 + %3 = load double* %arrayidx2, align 8, !dbg !57 + %mul = fmul double %conv, %3, !dbg !57 + 
%4 = load %struct.char_struct** %s.addr, align 8, !dbg !57 + %c = getelementptr inbounds %struct.char_struct* %4, i32 0, i32 0, !dbg !57 + %5 = load i8* %c, align 1, !dbg !57 + %conv3 = sext i8 %5 to i32, !dbg !57 + %conv4 = sitofp i32 %conv3 to double, !dbg !57 + %mul5 = fmul double %mul, %conv4, !dbg !57 + %6 = load i16* %us.addr, align 2, !dbg !57 + %conv6 = zext i16 %6 to i32, !dbg !57 + %conv7 = sitofp i32 %conv6 to double, !dbg !57 + %mul8 = fmul double %mul5, %conv7, !dbg !57 + %7 = load i64* %l.addr, align 8, !dbg !57 + %conv9 = uitofp i64 %7 to double, !dbg !57 + %mul10 = fmul double %mul8, %conv9, !dbg !57 + %call = call i32 @_Z3foov(), !dbg !57 + %conv11 = sitofp i32 %call to double, !dbg !57 + %add = fadd double %mul10, %conv11, !dbg !57 + store double %add, double* %result, align 8, !dbg !57 + %8 = load double* %result, align 8, !dbg !58 + ret double %8, !dbg !58 } declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone @@ -86,120 +101,111 @@ entry: %result = alloca double, align 8 store i32 0, i32* %retval store i32 %argc, i32* %argc.addr, align 4 - call void @llvm.dbg.declare(metadata !{i32* %argc.addr}, metadata !62), !dbg !63 + call void @llvm.dbg.declare(metadata !{i32* %argc.addr}, metadata !59), !dbg !60 store i8** %argv, i8*** %argv.addr, align 8 - call void @llvm.dbg.declare(metadata !{i8*** %argv.addr}, metadata !64), !dbg !67 - call void @llvm.dbg.declare(metadata !{%struct.char_struct* %s}, metadata !68), !dbg !70 - call void @llvm.dbg.declare(metadata !{float* %f}, metadata !71), !dbg !72 - store float 0.000000e+00, float* %f, align 4, !dbg !73 - call void @llvm.dbg.declare(metadata !{[2 x [2 x double]]* %d}, metadata !74), !dbg !77 - %0 = bitcast [2 x [2 x double]]* %d to i8*, !dbg !78 - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* bitcast ([2 x [2 x double]]* @_ZZ4mainE1d to i8*), i64 32, i32 16, i1 false), !dbg !78 - %c = getelementptr inbounds %struct.char_struct* %s, i32 0, i32 0, !dbg !79 - store i8 97, i8* %c, align 1, !dbg !79 - %c2 = getelementptr inbounds %struct.char_struct* %s, i32 0, i32 1, !dbg !80 - %arrayidx = getelementptr inbounds [2 x i8]* %c2, i32 0, i64 0, !dbg !80 - store i8 48, i8* %arrayidx, align 1, !dbg !80 - %c21 = getelementptr inbounds %struct.char_struct* %s, i32 0, i32 1, !dbg !81 - %arrayidx2 = getelementptr inbounds [2 x i8]* %c21, i32 0, i64 1, !dbg !81 - store i8 49, i8* %arrayidx2, align 1, !dbg !81 - call void @llvm.dbg.declare(metadata !{double* %result}, metadata !82), !dbg !83 - %arraydecay = getelementptr inbounds [2 x [2 x double]]* %d, i32 0, i32 0, !dbg !84 - %call = call double @_Z15test_parametersPfPA2_dR11char_structPPitm(float* %f, [2 x double]* %arraydecay, %struct.char_struct* %s, i32** null, i16 zeroext 10, i64 42), !dbg !84 - store double %call, double* %result, align 8, !dbg !84 - %1 = load double* %result, align 8, !dbg !85 - %cmp = fcmp oeq double %1, 0.000000e+00, !dbg !85 - %cond = select i1 %cmp, i32 0, i32 -1, !dbg !85 - ret i32 %cond, !dbg !85 + call void @llvm.dbg.declare(metadata !{i8*** %argv.addr}, metadata !61), !dbg !60 + call void @llvm.dbg.declare(metadata !{%struct.char_struct* %s}, metadata !62), !dbg !64 + call void @llvm.dbg.declare(metadata !{float* %f}, metadata !65), !dbg !66 + store float 0.000000e+00, float* %f, align 4, !dbg !66 + call void @llvm.dbg.declare(metadata !{[2 x [2 x double]]* %d}, metadata !67), !dbg !70 + %0 = bitcast [2 x [2 x double]]* %d to i8*, !dbg !70 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* bitcast ([2 x [2 x double]]* @_ZZ4mainE1d to i8*), 
i64 32, i32 16, i1 false), !dbg !70 + %c = getelementptr inbounds %struct.char_struct* %s, i32 0, i32 0, !dbg !71 + store i8 97, i8* %c, align 1, !dbg !71 + %c2 = getelementptr inbounds %struct.char_struct* %s, i32 0, i32 1, !dbg !72 + %arrayidx = getelementptr inbounds [2 x i8]* %c2, i32 0, i64 0, !dbg !72 + store i8 48, i8* %arrayidx, align 1, !dbg !72 + %c21 = getelementptr inbounds %struct.char_struct* %s, i32 0, i32 1, !dbg !73 + %arrayidx2 = getelementptr inbounds [2 x i8]* %c21, i32 0, i64 1, !dbg !73 + store i8 49, i8* %arrayidx2, align 1, !dbg !73 + call void @llvm.dbg.declare(metadata !{double* %result}, metadata !74), !dbg !75 + %arraydecay = getelementptr inbounds [2 x [2 x double]]* %d, i32 0, i32 0, !dbg !75 + %call = call double @_Z15test_parametersPfPA2_dR11char_structPPitm(float* %f, [2 x double]* %arraydecay, %struct.char_struct* %s, i32** null, i16 zeroext 10, i64 42), !dbg !75 + store double %call, double* %result, align 8, !dbg !75 + %1 = load double* %result, align 8, !dbg !76 + %cmp = fcmp oeq double %1, 0.000000e+00, !dbg !76 + %cond = select i1 %cmp, i32 0, i32 -1, !dbg !76 + ret i32 %cond, !dbg !76 } declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind !llvm.dbg.cu = !{!0} -!0 = metadata !{i32 720913, i32 0, i32 4, metadata !"test-parameters.cpp", metadata !"/home/athirumurthi/dev/opencl-mc/build/RH64/Debug/backend/llvm", metadata !"clang version 3.0 (branches/release_30 36797)", i1 true, i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !17} ; [ DW_TAG_compile_unit ] +!0 = metadata !{i32 786449, i32 0, i32 4, metadata !"test-parameters.cpp", metadata !"/home/akaylor/dev", metadata !"clang version 3.3 (ssh://akaylor@git-amr-1.devtools.intel.com:29418/ssg_llvm-clang2 gitosis@miro.kw.intel.com:clang.git 39450d0469e0d5589ad39fd0b20b5742750619a0) (ssh://akaylor@git-amr-1.devtools.intel.com:29418/ssg_llvm-llvm gitosis@miro.kw.intel.com:llvm.git 376642ed620ecae05b68c7bc81f79aeb2065abe0)", i1 true, i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !43} ; [ DW_TAG_compile_unit ] [/home/akaylor/dev/test-parameters.cpp] [DW_LANG_C_plus_plus] !1 = metadata !{metadata !2} !2 = metadata !{i32 0} !3 = metadata !{metadata !4} -!4 = metadata !{metadata !5, metadata !12, metadata !16} -!5 = metadata !{i32 720942, i32 0, metadata !6, metadata !"foo", metadata !"foo", metadata !"_Z3foov", metadata !6, i32 28, metadata !7, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 false, i32 ()* @_Z3foov, null, null, metadata !10} ; [ DW_TAG_subprogram ] -!6 = metadata !{i32 720937, metadata !"test-parameters.cpp", metadata !"/home/athirumurthi/dev/opencl-mc/build/RH64/Debug/backend/llvm", null} ; [ DW_TAG_file_type ] -!7 = metadata !{i32 720917, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] +!4 = metadata !{metadata !5, metadata !10, metadata !38} +!5 = metadata !{i32 786478, i32 0, metadata !6, metadata !"foo", metadata !"foo", metadata !"_Z3foov", metadata !6, i32 27, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @_Z3foov, null, null, metadata !1, i32 28} ; [ DW_TAG_subprogram ] [line 27] [def] [scope 28] [foo] +!6 = metadata !{i32 786473, metadata !"test-parameters.cpp", metadata !"/home/akaylor/dev", null} ; [ DW_TAG_file_type ] +!7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type 
] [line 0, size 0, align 0, offset 0] [from ] !8 = metadata !{metadata !9} -!9 = metadata !{i32 720932, null, metadata !"int", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] -!10 = metadata !{metadata !11} -!11 = metadata !{i32 720932} ; [ DW_TAG_base_type ] -!12 = metadata !{i32 720942, i32 0, metadata !6, metadata !"test_parameters", metadata !"test_parameters", metadata !"_Z15test_parametersPfPA2_dR11char_structPPitm", metadata !6, i32 33, metadata !13, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 false, double (float*, [2 x double]*, %struct.char_struct*, i32**, i16, i64)* @_Z15test_parametersPfPA2_dR11char_structPPitm, null, null, metadata !10} ; [ DW_TAG_subprogram ] -!13 = metadata !{i32 720917, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !14, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] -!14 = metadata !{metadata !15} -!15 = metadata !{i32 720932, null, metadata !"double", null, i32 0, i64 64, i64 64, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ] -!16 = metadata !{i32 720942, i32 0, metadata !6, metadata !"main", metadata !"main", metadata !"", metadata !6, i32 39, metadata !7, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 false, i32 (i32, i8**)* @main, null, null, metadata !10} ; [ DW_TAG_subprogram ] -!17 = metadata !{metadata !18} +!9 = metadata !{i32 786468, null, metadata !"int", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed] +!10 = metadata !{i32 786478, i32 0, metadata !6, metadata !"test_parameters", metadata !"test_parameters", metadata !"_Z15test_parametersPfPA2_dR11char_structPPitm", metadata !6, i32 32, metadata !11, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, double (float*, [2 x double]*, %struct.char_struct*, i32**, i16, i64)* @_Z15test_parametersPfPA2_dR11char_structPPitm, null, null, metadata !1, i32 33} ; [ DW_TAG_subprogram ] [line 32] [def] [scope 33] [test_parameters] +!11 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !12, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] +!12 = metadata !{metadata !13, metadata !14, metadata !16, metadata !20, metadata !33, metadata !35, metadata !36} +!13 = metadata !{i32 786468, null, metadata !"double", null, i32 0, i64 64, i64 64, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ] [double] [line 0, size 64, align 64, offset 0, enc DW_ATE_float] +!14 = metadata !{i32 786447, null, metadata !"", null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !15} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from float] +!15 = metadata !{i32 786468, null, metadata !"float", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ] [float] [line 0, size 32, align 32, offset 0, enc DW_ATE_float] +!16 = metadata !{i32 786447, null, metadata !"", null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !17} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ] +!17 = metadata !{i32 786433, null, metadata !"", null, i32 0, i64 128, i64 64, i32 0, i32 0, metadata !13, metadata !18, i32 0, i32 0} ; [ DW_TAG_array_type ] [line 0, size 128, align 64, offset 0] [from double] !18 = metadata !{metadata !19} -!19 = metadata !{i32 720948, i32 0, null, metadata !"compound_char", metadata !"compound_char", metadata !"", metadata !6, i32 25, metadata !20, i32 0, i32 1, %struct.char_struct* @compound_char} ; [ DW_TAG_variable ] -!20 = metadata !{i32 
720898, null, metadata !"char_struct", metadata !6, i32 22, i64 24, i64 8, i32 0, i32 0, null, metadata !21, i32 0, null, null} ; [ DW_TAG_class_type ] -!21 = metadata !{metadata !22, metadata !24, metadata !28} -!22 = metadata !{i32 720909, metadata !20, metadata !"c", metadata !6, i32 23, i64 8, i64 8, i64 0, i32 0, metadata !23} ; [ DW_TAG_member ] -!23 = metadata !{i32 720932, null, metadata !"char", null, i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ] -!24 = metadata !{i32 720909, metadata !20, metadata !"c2", metadata !6, i32 24, i64 16, i64 8, i64 8, i32 0, metadata !25} ; [ DW_TAG_member ] -!25 = metadata !{i32 720897, null, metadata !"", null, i32 0, i64 16, i64 8, i32 0, i32 0, metadata !23, metadata !26, i32 0, i32 0} ; [ DW_TAG_array_type ] -!26 = metadata !{metadata !27} -!27 = metadata !{i32 720929, i64 0, i64 2} ; [ DW_TAG_subrange_type ] -!28 = metadata !{i32 720942, i32 0, metadata !20, metadata !"char_struct", metadata !"char_struct", metadata !"", metadata !6, i32 22, metadata !29, i1 false, i1 false, i32 0, i32 0, null, i32 320, i1 false, null, null, i32 0, metadata !10} ; [ DW_TAG_subprogram ] -!29 = metadata !{i32 720917, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !30, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] -!30 = metadata !{null, metadata !31} -!31 = metadata !{i32 720911, i32 0, metadata !"", i32 0, i32 0, i64 64, i64 64, i64 0, i32 64, metadata !20} ; [ DW_TAG_pointer_type ] -!32 = metadata !{i32 29, i32 3, metadata !33, null} -!33 = metadata !{i32 720907, metadata !5, i32 28, i32 1, metadata !6, i32 0} ; [ DW_TAG_lexical_block ] -!34 = metadata !{i32 721153, metadata !12, metadata !"pf", metadata !6, i32 16777248, metadata !35, i32 0, i32 0} ; [ DW_TAG_arg_variable ] -!35 = metadata !{i32 720911, null, metadata !"", null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !36} ; [ DW_TAG_pointer_type ] -!36 = metadata !{i32 720932, null, metadata !"float", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ] -!37 = metadata !{i32 32, i32 31, metadata !12, null} -!38 = metadata !{i32 721153, metadata !12, metadata !"ppd", metadata !6, i32 33554464, metadata !39, i32 0, i32 0} ; [ DW_TAG_arg_variable ] -!39 = metadata !{i32 720911, null, metadata !"", null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !40} ; [ DW_TAG_pointer_type ] -!40 = metadata !{i32 720897, null, metadata !"", null, i32 0, i64 128, i64 64, i32 0, i32 0, metadata !15, metadata !26, i32 0, i32 0} ; [ DW_TAG_array_type ] -!41 = metadata !{i32 32, i32 42, metadata !12, null} -!42 = metadata !{i32 721153, metadata !12, metadata !"s", metadata !6, i32 50331680, metadata !43, i32 0, i32 0} ; [ DW_TAG_arg_variable ] -!43 = metadata !{i32 720912, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !20} ; [ DW_TAG_reference_type ] -!44 = metadata !{i32 32, i32 72, metadata !12, null} -!45 = metadata !{i32 721153, metadata !12, metadata !"ppn", metadata !6, i32 67108896, metadata !46, i32 0, i32 0} ; [ DW_TAG_arg_variable ] -!46 = metadata !{i32 720911, null, metadata !"", null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !47} ; [ DW_TAG_pointer_type ] -!47 = metadata !{i32 720911, null, metadata !"", null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !9} ; [ DW_TAG_pointer_type ] -!48 = metadata !{i32 32, i32 81, metadata !12, null} -!49 = metadata !{i32 721153, metadata !12, metadata !"us", metadata !6, i32 83886112, metadata !50, i32 0, i32 0} ; [ DW_TAG_arg_variable ] -!50 = metadata !{i32 720932, null, metadata 
!"unsigned short", null, i32 0, i64 16, i64 16, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ] -!51 = metadata !{i32 32, i32 105, metadata !12, null} -!52 = metadata !{i32 721153, metadata !12, metadata !"l", metadata !6, i32 100663328, metadata !53, i32 0, i32 0} ; [ DW_TAG_arg_variable ] -!53 = metadata !{i32 720934, null, metadata !"", null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !54} ; [ DW_TAG_const_type ] -!54 = metadata !{i32 720932, null, metadata !"long unsigned int", null, i32 0, i64 64, i64 64, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ] -!55 = metadata !{i32 32, i32 135, metadata !12, null} -!56 = metadata !{i32 721152, metadata !57, metadata !"result", metadata !6, i32 34, metadata !15, i32 0, i32 0} ; [ DW_TAG_auto_variable ] -!57 = metadata !{i32 720907, metadata !12, i32 33, i32 1, metadata !6, i32 1} ; [ DW_TAG_lexical_block ] -!58 = metadata !{i32 34, i32 10, metadata !57, null} -!59 = metadata !{i32 34, i32 59, metadata !57, null} -!60 = metadata !{i32 34, i32 54, metadata !57, null} -!61 = metadata !{i32 35, i32 3, metadata !57, null} -!62 = metadata !{i32 721153, metadata !16, metadata !"argc", metadata !6, i32 16777254, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ] -!63 = metadata !{i32 38, i32 14, metadata !16, null} -!64 = metadata !{i32 721153, metadata !16, metadata !"argv", metadata !6, i32 33554470, metadata !65, i32 0, i32 0} ; [ DW_TAG_arg_variable ] -!65 = metadata !{i32 720911, null, metadata !"", null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !66} ; [ DW_TAG_pointer_type ] -!66 = metadata !{i32 720911, null, metadata !"", null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !23} ; [ DW_TAG_pointer_type ] -!67 = metadata !{i32 38, i32 26, metadata !16, null} -!68 = metadata !{i32 721152, metadata !69, metadata !"s", metadata !6, i32 40, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] -!69 = metadata !{i32 720907, metadata !16, i32 39, i32 1, metadata !6, i32 2} ; [ DW_TAG_lexical_block ] -!70 = metadata !{i32 40, i32 22, metadata !69, null} -!71 = metadata !{i32 721152, metadata !69, metadata !"f", metadata !6, i32 41, metadata !36, i32 0, i32 0} ; [ DW_TAG_auto_variable ] -!72 = metadata !{i32 41, i32 9, metadata !69, null} -!73 = metadata !{i32 41, i32 16, metadata !69, null} -!74 = metadata !{i32 721152, metadata !69, metadata !"d", metadata !6, i32 42, metadata !75, i32 0, i32 0} ; [ DW_TAG_auto_variable ] -!75 = metadata !{i32 720897, null, metadata !"", null, i32 0, i64 256, i64 64, i32 0, i32 0, metadata !15, metadata !76, i32 0, i32 0} ; [ DW_TAG_array_type ] -!76 = metadata !{metadata !27, metadata !27} -!77 = metadata !{i32 42, i32 10, metadata !69, null} -!78 = metadata !{i32 42, i32 38, metadata !69, null} -!79 = metadata !{i32 44, i32 3, metadata !69, null} -!80 = metadata !{i32 45, i32 3, metadata !69, null} -!81 = metadata !{i32 46, i32 3, metadata !69, null} -!82 = metadata !{i32 721152, metadata !69, metadata !"result", metadata !6, i32 48, metadata !15, i32 0, i32 0} ; [ DW_TAG_auto_variable ] -!83 = metadata !{i32 48, i32 10, metadata !69, null} -!84 = metadata !{i32 48, i32 19, metadata !69, null} -!85 = metadata !{i32 49, i32 3, metadata !69, null} +!19 = metadata !{i32 786465, i64 0, i64 2} ; [ DW_TAG_subrange_type ] [0, 1] +!20 = metadata !{i32 786448, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !21} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from char_struct] +!21 = metadata !{i32 786451, null, metadata !"char_struct", metadata !6, i32 22, i64 24, i64 8, i32 0, i32 0, null, 
metadata !22, i32 0, null, null} ; [ DW_TAG_structure_type ] [char_struct] [line 22, size 24, align 8, offset 0] [from ] +!22 = metadata !{metadata !23, metadata !25, metadata !27} +!23 = metadata !{i32 786445, metadata !21, metadata !"c", metadata !6, i32 23, i64 8, i64 8, i64 0, i32 0, metadata !24} ; [ DW_TAG_member ] [c] [line 23, size 8, align 8, offset 0] [from char] +!24 = metadata !{i32 786468, null, metadata !"char", null, i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ] [char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char] +!25 = metadata !{i32 786445, metadata !21, metadata !"c2", metadata !6, i32 24, i64 16, i64 8, i64 8, i32 0, metadata !26} ; [ DW_TAG_member ] [c2] [line 24, size 16, align 8, offset 8] [from ] +!26 = metadata !{i32 786433, null, metadata !"", null, i32 0, i64 16, i64 8, i32 0, i32 0, metadata !24, metadata !18, i32 0, i32 0} ; [ DW_TAG_array_type ] [line 0, size 16, align 8, offset 0] [from char] +!27 = metadata !{i32 786478, i32 0, metadata !21, metadata !"char_struct", metadata !"char_struct", metadata !"", metadata !6, i32 22, metadata !28, i1 false, i1 false, i32 0, i32 0, null, i32 320, i1 false, null, null, i32 0, metadata !31, i32 22} ; [ DW_TAG_subprogram ] [line 22] [char_struct] +!28 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !29, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] +!29 = metadata !{null, metadata !30} +!30 = metadata !{i32 786447, i32 0, metadata !"", i32 0, i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !21} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from char_struct] +!31 = metadata !{metadata !32} +!32 = metadata !{i32 786468} ; [ DW_TAG_base_type ] [line 0, size 0, align 0, offset 0] +!33 = metadata !{i32 786447, null, metadata !"", null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !34} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ] +!34 = metadata !{i32 786447, null, metadata !"", null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !9} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from int] +!35 = metadata !{i32 786468, null, metadata !"unsigned short", null, i32 0, i64 16, i64 16, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ] [unsigned short] [line 0, size 16, align 16, offset 0, enc DW_ATE_unsigned] +!36 = metadata !{i32 786470, null, metadata !"", null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !37} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from long unsigned int] +!37 = metadata !{i32 786468, null, metadata !"long unsigned int", null, i32 0, i64 64, i64 64, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ] [long unsigned int] [line 0, size 64, align 64, offset 0, enc DW_ATE_unsigned] +!38 = metadata !{i32 786478, i32 0, metadata !6, metadata !"main", metadata !"main", metadata !"", metadata !6, i32 38, metadata !39, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (i32, i8**)* @main, null, null, metadata !1, i32 39} ; [ DW_TAG_subprogram ] [line 38] [def] [scope 39] [main] +!39 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !40, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] +!40 = metadata !{metadata !9, metadata !9, metadata !41} +!41 = metadata !{i32 786447, null, metadata !"", null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !42} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ] +!42 
= metadata !{i32 786447, null, metadata !"", null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !24} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from char] +!43 = metadata !{metadata !44} +!44 = metadata !{metadata !45} +!45 = metadata !{i32 786484, i32 0, null, metadata !"compound_char", metadata !"compound_char", metadata !"", metadata !6, i32 25, metadata !21, i32 0, i32 1, %struct.char_struct* @compound_char} ; [ DW_TAG_variable ] [compound_char] [line 25] [def] +!46 = metadata !{i32 29, i32 0, metadata !47, null} +!47 = metadata !{i32 786443, metadata !5, i32 28, i32 0, metadata !6, i32 0} ; [ DW_TAG_lexical_block ] [/home/akaylor/dev/test-parameters.cpp] +!48 = metadata !{i32 786689, metadata !10, metadata !"pf", metadata !6, i32 16777248, metadata !14, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [pf] [line 32] +!49 = metadata !{i32 32, i32 0, metadata !10, null} +!50 = metadata !{i32 786689, metadata !10, metadata !"ppd", metadata !6, i32 33554464, metadata !16, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [ppd] [line 32] +!51 = metadata !{i32 786689, metadata !10, metadata !"s", metadata !6, i32 50331680, metadata !20, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [s] [line 32] +!52 = metadata !{i32 786689, metadata !10, metadata !"ppn", metadata !6, i32 67108896, metadata !33, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [ppn] [line 32] +!53 = metadata !{i32 786689, metadata !10, metadata !"us", metadata !6, i32 83886112, metadata !35, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [us] [line 32] +!54 = metadata !{i32 786689, metadata !10, metadata !"l", metadata !6, i32 100663328, metadata !36, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [l] [line 32] +!55 = metadata !{i32 786688, metadata !56, metadata !"result", metadata !6, i32 34, metadata !13, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [result] [line 34] +!56 = metadata !{i32 786443, metadata !10, i32 33, i32 0, metadata !6, i32 1} ; [ DW_TAG_lexical_block ] [/home/akaylor/dev/test-parameters.cpp] +!57 = metadata !{i32 34, i32 0, metadata !56, null} +!58 = metadata !{i32 35, i32 0, metadata !56, null} +!59 = metadata !{i32 786689, metadata !38, metadata !"argc", metadata !6, i32 16777254, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [argc] [line 38] +!60 = metadata !{i32 38, i32 0, metadata !38, null} +!61 = metadata !{i32 786689, metadata !38, metadata !"argv", metadata !6, i32 33554470, metadata !41, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [argv] [line 38] +!62 = metadata !{i32 786688, metadata !63, metadata !"s", metadata !6, i32 40, metadata !21, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [s] [line 40] +!63 = metadata !{i32 786443, metadata !38, i32 39, i32 0, metadata !6, i32 2} ; [ DW_TAG_lexical_block ] [/home/akaylor/dev/test-parameters.cpp] +!64 = metadata !{i32 40, i32 0, metadata !63, null} +!65 = metadata !{i32 786688, metadata !63, metadata !"f", metadata !6, i32 41, metadata !15, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [f] [line 41] +!66 = metadata !{i32 41, i32 0, metadata !63, null} +!67 = metadata !{i32 786688, metadata !63, metadata !"d", metadata !6, i32 42, metadata !68, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [d] [line 42] +!68 = metadata !{i32 786433, null, metadata !"", null, i32 0, i64 256, i64 64, i32 0, i32 0, metadata !13, metadata !69, i32 0, i32 0} ; [ DW_TAG_array_type ] [line 0, size 256, align 64, offset 0] [from double] +!69 = metadata !{metadata !19, metadata !19} +!70 = metadata !{i32 42, i32 0, metadata !63, null} +!71 = metadata !{i32 44, i32 0, metadata !63, null} +!72 = metadata !{i32 45, 
i32 0, metadata !63, null} +!73 = metadata !{i32 46, i32 0, metadata !63, null} +!74 = metadata !{i32 786688, metadata !63, metadata !"result", metadata !6, i32 48, metadata !13, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [result] [line 48] +!75 = metadata !{i32 48, i32 0, metadata !63, null} +!76 = metadata !{i32 49, i32 0, metadata !63, null} diff --git a/test/Linker/DbgDeclare.ll b/test/Linker/DbgDeclare.ll new file mode 100644 index 0000000..7f64f95 --- /dev/null +++ b/test/Linker/DbgDeclare.ll @@ -0,0 +1,58 @@ +; RUN: llvm-link %s %p/DbgDeclare2.ll -o %t.bc +; RUN: llvm-dis < %t.bc | FileCheck %s +; Test if metadata in dbg.declare is mapped properly or not. + +; rdar://13089880 +; CHECK: define i32 @main(i32 %argc, i8** %argv) +; CHECK: call void @llvm.dbg.declare(metadata !{i32* %argc.addr}, metadata !{{[0-9]+}}) +; CHECK: call void @llvm.dbg.declare(metadata !{i8*** %argv.addr}, metadata !{{[0-9]+}}) +; CHECK: define void @test(i32 %argc, i8** %argv) +; CHECK: call void @llvm.dbg.declare(metadata !{i32* %argc.addr}, metadata !{{[0-9]+}}) +; CHECK: call void @llvm.dbg.declare(metadata !{i8*** %argv.addr}, metadata !{{[0-9]+}}) +; CHECK: call void @llvm.dbg.declare(metadata !{i32* %i}, metadata !{{[0-9]+}}) + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.9.0" + +define i32 @main(i32 %argc, i8** %argv) uwtable ssp { +entry: + %retval = alloca i32, align 4 + %argc.addr = alloca i32, align 4 + %argv.addr = alloca i8**, align 8 + store i32 0, i32* %retval + store i32 %argc, i32* %argc.addr, align 4 + call void @llvm.dbg.declare(metadata !{i32* %argc.addr}, metadata !14), !dbg !15 + store i8** %argv, i8*** %argv.addr, align 8 + call void @llvm.dbg.declare(metadata !{i8*** %argv.addr}, metadata !16), !dbg !15 + %0 = load i32* %argc.addr, align 4, !dbg !17 + %1 = load i8*** %argv.addr, align 8, !dbg !17 + call void @test(i32 %0, i8** %1), !dbg !17 + ret i32 0, !dbg !19 +} + +declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone + +declare void @test(i32, i8**) + +!llvm.dbg.cu = !{!0} + +!0 = metadata !{i32 786449, i32 0, i32 4, metadata !"main.cpp", metadata !"/private/tmp", metadata !"clang version 3.3 (trunk 173515)", i1 true, i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1} ; [ DW_TAG_compile_unit ] +!1 = metadata !{metadata !2} +!2 = metadata !{i32 0} +!3 = metadata !{metadata !4} +!4 = metadata !{metadata !5} +!5 = metadata !{i32 786478, i32 0, metadata !6, metadata !"main", metadata !"main", metadata !"", metadata !6, i32 3, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (i32, i8**)* @main, null, null, metadata !1, i32 4} ; [ DW_TAG_subprogram ] +!6 = metadata !{i32 786473, metadata !"main.cpp", metadata !"/private/tmp", null} ; [ DW_TAG_file_type ] +!7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] +!8 = metadata !{metadata !9, metadata !9, metadata !10} +!9 = metadata !{i32 786468, null, metadata !"int", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] +!10 = metadata !{i32 786447, null, metadata !"", null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !11} ; [ DW_TAG_pointer_type ] +!11 = metadata !{i32 786447, null, metadata !"", null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !12} ; [ DW_TAG_pointer_type ] +!12 = 
metadata !{i32 786470, null, metadata !"", null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !13} ; [ DW_TAG_const_type ] +!13 = metadata !{i32 786468, null, metadata !"char", null, i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ] +!14 = metadata !{i32 786689, metadata !5, metadata !"argc", metadata !6, i32 16777219, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ] +!15 = metadata !{i32 3, i32 0, metadata !5, null} +!16 = metadata !{i32 786689, metadata !5, metadata !"argv", metadata !6, i32 33554435, metadata !10, i32 0, i32 0} ; [ DW_TAG_arg_variable ] +!17 = metadata !{i32 5, i32 0, metadata !18, null} +!18 = metadata !{i32 786443, metadata !5, i32 4, i32 0, metadata !6, i32 0} ; [ DW_TAG_lexical_block ] +!19 = metadata !{i32 6, i32 0, metadata !18, null} diff --git a/test/Linker/DbgDeclare2.ll b/test/Linker/DbgDeclare2.ll new file mode 100644 index 0000000..e2e56b2 --- /dev/null +++ b/test/Linker/DbgDeclare2.ll @@ -0,0 +1,76 @@ +; This file is used by DbgDeclare.ll, so it doesn't actually do anything itself +; +; RUN: true + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.9.0" + +define void @test(i32 %argc, i8** %argv) uwtable ssp { +entry: + %argc.addr = alloca i32, align 4 + %argv.addr = alloca i8**, align 8 + %i = alloca i32, align 4 + store i32 %argc, i32* %argc.addr, align 4 + call void @llvm.dbg.declare(metadata !{i32* %argc.addr}, metadata !14), !dbg !15 + store i8** %argv, i8*** %argv.addr, align 8 + call void @llvm.dbg.declare(metadata !{i8*** %argv.addr}, metadata !16), !dbg !15 + call void @llvm.dbg.declare(metadata !{i32* %i}, metadata !17), !dbg !20 + store i32 0, i32* %i, align 4, !dbg !20 + br label %for.cond, !dbg !20 + +for.cond: ; preds = %for.inc, %entry + %0 = load i32* %i, align 4, !dbg !20 + %1 = load i32* %argc.addr, align 4, !dbg !20 + %cmp = icmp slt i32 %0, %1, !dbg !20 + br i1 %cmp, label %for.body, label %for.end, !dbg !20 + +for.body: ; preds = %for.cond + %2 = load i32* %i, align 4, !dbg !21 + %idxprom = sext i32 %2 to i64, !dbg !21 + %3 = load i8*** %argv.addr, align 8, !dbg !21 + %arrayidx = getelementptr inbounds i8** %3, i64 %idxprom, !dbg !21 + %4 = load i8** %arrayidx, align 8, !dbg !21 + %call = call i32 @puts(i8* %4), !dbg !21 + br label %for.inc, !dbg !23 + +for.inc: ; preds = %for.body + %5 = load i32* %i, align 4, !dbg !20 + %inc = add nsw i32 %5, 1, !dbg !20 + store i32 %inc, i32* %i, align 4, !dbg !20 + br label %for.cond, !dbg !20 + +for.end: ; preds = %for.cond + ret void, !dbg !24 +} + +declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone + +declare i32 @puts(i8*) + +!llvm.dbg.cu = !{!0} + +!0 = metadata !{i32 786449, i32 0, i32 4, metadata !"main.cpp", metadata !"/private/tmp", metadata !"clang version 3.3 (trunk 173515)", i1 true, i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1} ; [ DW_TAG_compile_unit ] +!1 = metadata !{metadata !2} +!2 = metadata !{i32 0} +!3 = metadata !{metadata !4} +!4 = metadata !{metadata !5} +!5 = metadata !{i32 786478, i32 0, metadata !6, metadata !"print_args", metadata !"print_args", metadata !"test", metadata !6, i32 4, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (i32, i8**)* @test, null, null, metadata !1, i32 5} ; [ DW_TAG_subprogram ] +!6 = metadata !{i32 786473, metadata !"test.cpp", metadata !"/private/tmp", null} ; [ DW_TAG_file_type ]
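+; Note on the encoded line fields in the DW_TAG_arg_variable entries here and
+; in DbgDeclare.ll above: this vintage of the metadata format packs the
+; argument position into the top byte of the line field, i.e.
+; value = (argNo << 24) | line. So the 16777220 in !14 below is 0x01000004
+; (argument 1, declared at line 4) and the 33554436 in !16 is 0x02000004
+; (argument 2, line 4).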
+!7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] +!8 = metadata !{null, metadata !9, metadata !10} +!9 = metadata !{i32 786468, null, metadata !"int", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] +!10 = metadata !{i32 786447, null, metadata !"", null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !11} ; [ DW_TAG_pointer_type ] +!11 = metadata !{i32 786447, null, metadata !"", null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !12} ; [ DW_TAG_pointer_type ] +!12 = metadata !{i32 786470, null, metadata !"", null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !13} ; [ DW_TAG_const_type ] +!13 = metadata !{i32 786468, null, metadata !"char", null, i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ] +!14 = metadata !{i32 786689, metadata !5, metadata !"argc", metadata !6, i32 16777220, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ] +!15 = metadata !{i32 4, i32 0, metadata !5, null} +!16 = metadata !{i32 786689, metadata !5, metadata !"argv", metadata !6, i32 33554436, metadata !10, i32 0, i32 0} ; [ DW_TAG_arg_variable ] +!17 = metadata !{i32 786688, metadata !18, metadata !"i", metadata !6, i32 6, metadata !9, i32 0, i32 0} ; [ DW_TAG_auto_variable ] +!18 = metadata !{i32 786443, metadata !19, i32 6, i32 0, metadata !6, i32 1} ; [ DW_TAG_lexical_block ] +!19 = metadata !{i32 786443, metadata !5, i32 5, i32 0, metadata !6, i32 0} ; [ DW_TAG_lexical_block ] +!20 = metadata !{i32 6, i32 0, metadata !18, null} +!21 = metadata !{i32 8, i32 0, metadata !22, null} +!22 = metadata !{i32 786443, metadata !18, i32 7, i32 0, metadata !6, i32 2} ; [ DW_TAG_lexical_block ] +!23 = metadata !{i32 9, i32 0, metadata !22, null} +!24 = metadata !{i32 10, i32 0, metadata !19, null} diff --git a/test/Linker/module-flags-1-a.ll b/test/Linker/module-flags-1-a.ll index 973aa80..32f189c 100644 --- a/test/Linker/module-flags-1-a.ll +++ b/test/Linker/module-flags-1-a.ll @@ -3,10 +3,10 @@ ; Test basic functionality of module flags. ; CHECK: !0 = metadata !{i32 1, metadata !"foo", i32 37} -; CHECK: !1 = metadata !{i32 1, metadata !"qux", i32 42} +; CHECK: !1 = metadata !{i32 2, metadata !"bar", i32 42} ; CHECK: !2 = metadata !{i32 1, metadata !"mux", metadata !3} ; CHECK: !3 = metadata !{metadata !"hello world", i32 927} -; CHECK: !4 = metadata !{i32 2, metadata !"bar", i32 42} +; CHECK: !4 = metadata !{i32 1, metadata !"qux", i32 42} ; CHECK: !llvm.module.flags = !{!0, !1, !2, !4} !0 = metadata !{ i32 1, metadata !"foo", i32 37 } diff --git a/test/Linker/module-flags-3-a.ll b/test/Linker/module-flags-3-a.ll index 4233a0a..e7a720e 100644 --- a/test/Linker/module-flags-3-a.ll +++ b/test/Linker/module-flags-3-a.ll @@ -3,10 +3,10 @@ ; Test 'require' behavior. 
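+; Behavior IDs used across these module-flags tests, per the LangRef of this
+; vintage: 1 = Error, 2 = Warning, 3 = Require, 4 = Override, 5 = Append,
+; 6 = AppendUnique. A require flag's value is a {name, value} pair naming
+; another flag that must be present with exactly that value after linking; the
+; pair {!"bar", i32 42} in the CHECK lines below is that requirement surviving
+; the link.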
; CHECK: !0 = metadata !{i32 1, metadata !"foo", i32 37} -; CHECK: !1 = metadata !{i32 3, metadata !"foo", metadata !2} -; CHECK: !2 = metadata !{metadata !"bar", i32 42} -; CHECK: !3 = metadata !{i32 1, metadata !"bar", i32 42} -; CHECK: !llvm.module.flags = !{!0, !1, !3} +; CHECK: !1 = metadata !{i32 1, metadata !"bar", i32 42} +; CHECK: !2 = metadata !{i32 3, metadata !"foo", metadata !3} +; CHECK: !3 = metadata !{metadata !"bar", i32 42} +; CHECK: !llvm.module.flags = !{!0, !1, !2} !0 = metadata !{ i32 1, metadata !"foo", i32 37 } !1 = metadata !{ i32 1, metadata !"bar", i32 42 } diff --git a/test/Linker/module-flags-7-a.ll b/test/Linker/module-flags-7-a.ll new file mode 100644 index 0000000..976c8fe --- /dev/null +++ b/test/Linker/module-flags-7-a.ll @@ -0,0 +1,9 @@ +; RUN: not llvm-link %s %p/module-flags-7-b.ll -S -o - 2>&1 | FileCheck %s + +; Test module flags error messages. + +; CHECK: linking module flags 'foo': IDs have conflicting behaviors + +!0 = metadata !{ i32 1, metadata !"foo", i32 37 } + +!llvm.module.flags = !{ !0 } diff --git a/test/Linker/module-flags-7-b.ll b/test/Linker/module-flags-7-b.ll new file mode 100644 index 0000000..2bc7250 --- /dev/null +++ b/test/Linker/module-flags-7-b.ll @@ -0,0 +1,6 @@ +; This file is used with module-flags-7-a.ll +; RUN: true + +!0 = metadata !{ i32 2, metadata !"foo", i32 37 } + +!llvm.module.flags = !{ !0 } diff --git a/test/Linker/module-flags-8-a.ll b/test/Linker/module-flags-8-a.ll new file mode 100644 index 0000000..146cae7 --- /dev/null +++ b/test/Linker/module-flags-8-a.ll @@ -0,0 +1,14 @@ +; RUN: llvm-link %s %p/module-flags-8-b.ll -S -o - | sort | FileCheck %s + +; Test append-type module flags. + +; CHECK: !0 = metadata !{i32 5, metadata !"flag-0", metadata !1} +; CHECK: !1 = metadata !{i32 0, i32 0, i32 1} +; CHECK: !2 = metadata !{i32 6, metadata !"flag-1", metadata !3} +; CHECK: !3 = metadata !{i32 0, i32 1, i32 2} +; CHECK: !llvm.module.flags = !{!0, !2} + +!0 = metadata !{ i32 5, metadata !"flag-0", metadata !{ i32 0 } } +!1 = metadata !{ i32 6, metadata !"flag-1", metadata !{ i32 0, i32 1 } } + +!llvm.module.flags = !{ !0, !1 } diff --git a/test/Linker/module-flags-8-b.ll b/test/Linker/module-flags-8-b.ll new file mode 100644 index 0000000..08f9bc4 --- /dev/null +++ b/test/Linker/module-flags-8-b.ll @@ -0,0 +1,7 @@ +; This file is used with module-flags-8-a.ll +; RUN: true + +!0 = metadata !{ i32 5, metadata !"flag-0", metadata !{ i32 0, i32 1 } } +!1 = metadata !{ i32 6, metadata !"flag-1", metadata !{ i32 1, i32 2 } } + +!llvm.module.flags = !{ !0, !1 } diff --git a/test/Linker/testlink1.ll b/test/Linker/testlink1.ll index a874637..6ba6fd5 100644 --- a/test/Linker/testlink1.ll +++ b/test/Linker/testlink1.ll @@ -13,6 +13,10 @@ ; The uses of intlist in the other file should be remapped.
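+; (Sketched with hypothetical types, not taken from this test: if this module
+; defines %T = type { i32 } and the other module's %T is type { i64 }, the
+; linker keeps one %T, renames the other to %T.0 and rewrites its uses. The
+; %VecSize lines added below exercise the same machinery for vectors of
+; different widths, which the @VecSizeCrash declaration suggests used to
+; crash the linker.)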
; CHECK-NOT: {{%intlist.[0-9]}} +; CHECK: %VecSize = type { <5 x i32> } +; CHECK: %VecSize.{{[0-9]}} = type { <10 x i32> } +%VecSize = type { <5 x i32> } + %Struct1 = type opaque @S1GV = external global %Struct1* @@ -93,3 +97,5 @@ define internal void @Testintern() { define void @testIntern() { ret void } + +declare void @VecSizeCrash(%VecSize) diff --git a/test/Linker/testlink2.ll b/test/Linker/testlink2.ll index 1798e31..ff8e529 100644 --- a/test/Linker/testlink2.ll +++ b/test/Linker/testlink2.ll @@ -8,6 +8,8 @@ %Ty1 = type { %Ty2* } %Ty2 = type opaque +%VecSize = type { <10 x i32> } + @GVTy1 = global %Ty1* null @GVTy2 = external global %Ty2* @@ -53,3 +55,4 @@ define internal void @testIntern() { ret void } +declare void @VecSizeCrash1(%VecSize) diff --git a/test/MC/AArch64/basic-a64-diagnostics.s b/test/MC/AArch64/basic-a64-diagnostics.s new file mode 100644 index 0000000..1e9024c --- /dev/null +++ b/test/MC/AArch64/basic-a64-diagnostics.s @@ -0,0 +1,3713 @@ +// RUN: not llvm-mc -triple=aarch64 < %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-ERROR < %t %s + +//------------------------------------------------------------------------------ +// Add/sub (extended register) +//------------------------------------------------------------------------------ + + // Mismatched final register and extend + add x2, x3, x5, sxtb + add x2, x4, w2, uxtx + add w5, w7, x9, sxtx +// CHECK-ERROR: error: expected 'sxtx' 'uxtx' or 'lsl' with optional integer in range [0, 4] +// CHECK-ERROR: add x2, x3, x5, sxtb +// CHECK-ERROR: ^ +// CHECK-ERROR: error: expected '[su]xt[bhw]' or 'lsl' with optional integer in range [0, 4] +// CHECK-ERROR: add x2, x4, w2, uxtx +// CHECK-ERROR: ^ +// CHECK-ERROR: error: expected compatible register, symbol or integer in range [0, 4095] +// CHECK-ERROR: add w5, w7, x9, sxtx +// CHECK-ERROR: ^ + + // Out of range extends + add x9, x10, w11, uxtb #-1 + add x3, x5, w7, uxtb #5 + sub x9, x15, x2, uxth #5 +// CHECK-ERROR: error: expected integer shift amount +// CHECK-ERROR: add x9, x10, w11, uxtb #-1 +// CHECK-ERROR: ^ +// CHECK-ERROR: error: expected '[su]xt[bhw]' or 'lsl' with optional integer in range [0, 4] +// CHECK-ERROR: add x3, x5, w7, uxtb #5 +// CHECK-ERROR: ^ +// CHECK-ERROR: error: expected 'sxtx' 'uxtx' or 'lsl' with optional integer in range [0, 4] +// CHECK-ERROR: sub x9, x15, x2, uxth #5 +// CHECK-ERROR: ^ + + // Wrong registers on normal variants + add xzr, x3, x5, uxtx + sub x3, xzr, w9, sxth #1 + add x1, x2, sp, uxtx +// CHECK-ERROR: error: expected 'lsl', 'lsr' or 'asr' with optional integer in range [0, 63] +// CHECK-ERROR: add xzr, x3, x5, uxtx +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: sub x3, xzr, w9, sxth #1 +// CHECK-ERROR: ^ +// CHECK-ERROR: error: expected compatible register, symbol or integer in range [0, 4095] +// CHECK-ERROR: add x1, x2, sp, uxtx +// CHECK-ERROR: ^ + + // Wrong registers on flag-setting variants + adds sp, x3, w2, uxtb + adds x3, xzr, x9, uxtx + subs x2, x1, sp, uxtx + adds x2, x1, sp, uxtb #2 +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: adds sp, x3, w2, uxtb +// CHECK-ERROR: ^ +// CHECK-ERROR: error: expected 'lsl', 'lsr' or 'asr' with optional integer in range [0, 63] +// CHECK-ERROR: adds x3, xzr, x9, uxtx +// CHECK-ERROR: ^ +// CHECK-ERROR: error: expected compatible register, symbol or integer in range [0, 4095] +// CHECK-ERROR: subs x2, x1, sp, uxtx +// CHECK-ERROR: ^ +// CHECK-ERROR: error: expected compatible register, symbol or integer in range 
[0, 4095] +// CHECK-ERROR: adds x2, x1, sp, uxtb #2 +// CHECK-ERROR: ^ + + // Amount not optional if lsl valid and used + add sp, x5, x7, lsl +// CHECK-ERROR: error: expected #imm after shift specifier +// CHECK-ERROR: add sp, x5, x7, lsl +// CHECK-ERROR: ^ + +//------------------------------------------------------------------------------ +// Add/sub (immediate) +//------------------------------------------------------------------------------ + +// Out of range immediates: < 0 or more than 12 bits + add w4, w5, #-1 + add w5, w6, #0x1000 + add w4, w5, #-1, lsl #12 + add w5, w6, #0x1000, lsl #12 +// CHECK-ERROR: error: expected compatible register, symbol or integer in range [0, 4095] +// CHECK-ERROR-NEXT: add w4, w5, #-1 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected compatible register, symbol or integer in range [0, 4095] +// CHECK-ERROR-NEXT: add w5, w6, #0x1000 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected compatible register, symbol or integer in range [0, 4095] +// CHECK-ERROR-NEXT: add w4, w5, #-1, lsl #12 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected compatible register, symbol or integer in range [0, 4095] +// CHECK-ERROR-NEXT: add w5, w6, #0x1000, lsl #12 +// CHECK-ERROR-NEXT: ^ + +// Only lsl #0 and lsl #12 are allowed + add w2, w3, #0x1, lsl #1 + add w5, w17, #0xfff, lsl #13 + add w17, w20, #0x1000, lsl #12 + sub xsp, x34, #0x100, lsl #-1 +// CHECK-ERROR: error: expected compatible register, symbol or integer in range [0, 4095] +// CHECK-ERROR-NEXT: add w2, w3, #0x1, lsl #1 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected compatible register, symbol or integer in range [0, 4095] +// CHECK-ERROR-NEXT: add w5, w17, #0xfff, lsl #13 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected compatible register, symbol or integer in range [0, 4095] +// CHECK-ERROR-NEXT: add w17, w20, #0x1000, lsl #12 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: only 'lsl #+N' valid after immediate +// CHECK-ERROR-NEXT: sub xsp, x34, #0x100, lsl #-1 +// CHECK-ERROR-NEXT: ^ + +// Incorrect registers (w31 doesn't exist at all, and 31 decodes to sp for these). 
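+// (For contrast, a sketch of the accepted counterparts; these are not
+// assembled by this test: because operand encoding 31 means the stack pointer
+// in add/sub-immediate, 'add wsp, w20, #1234' and 'add w20, wsp, #0x321' are
+// valid, while the wzr/xzr forms below must be rejected.)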
+ add w31, w20, #1234 + add wzr, w20, #0x123 + add w20, wzr, #0x321 + add wzr, wzr, #0xfff +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR-NEXT: add w31, w20, #1234 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: add wzr, w20, #0x123 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: add w20, wzr, #0x321 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: add wzr, wzr, #0xfff +// CHECK-ERROR-NEXT: ^ + +// Mixed register classes + add xsp, w2, #123 + sub w2, x30, #32 +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR-NEXT: add xsp, w2, #123 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: sub w2, x30, #32 +// CHECK-ERROR-NEXT: ^ + +// Out of range immediate + adds w0, w5, #0x10000 +// CHECK-ERROR: error: expected compatible register, symbol or integer in range [0, 4095] +// CHECK-ERROR-NEXT: adds w0, w5, #0x10000 +// CHECK-ERROR-NEXT: ^ + +// Wn|WSP should be in second place + adds w4, wzr, #0x123 +// ...but wzr is the 31 destination + subs wsp, w5, #123 + subs x5, xzr, #0x456, lsl #12 +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR-NEXT: adds w4, wzr, #0x123 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: subs wsp, w5, #123 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: subs x5, xzr, #0x456, lsl #12 +// CHECK-ERROR-NEXT: ^ + + // MOV alias should not accept any fiddling + mov x2, xsp, #123 + mov wsp, w27, #0xfff, lsl #12 +// CHECK-ERROR: error: expected compatible register or logical immediate +// CHECK-ERROR-NEXT: mov x2, xsp, #123 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: mov wsp, w27, #0xfff, lsl #12 +// CHECK-ERROR-NEXT: ^ + + // A relocation should be provided for symbols + add x3, x9, #variable +// CHECK-ERROR: error: expected compatible register, symbol or integer in range [0, 4095] +// CHECK-ERROR-NEXT: add x3, x9, #variable +// CHECK-ERROR-NEXT: ^ + + +//------------------------------------------------------------------------------ +// Add-subtract (shifted register) +//------------------------------------------------------------------------------ + + add wsp, w1, w2, lsr #3 + add x4, sp, x9, asr #5 + add x9, x10, x5, ror #3 +// CHECK-ERROR: error: expected 'sxtx' 'uxtx' or 'lsl' with optional integer in range [0, 4] +// CHECK-ERROR-NEXT: add wsp, w1, w2, lsr #3 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected 'sxtx' 'uxtx' or 'lsl' with optional integer in range [0, 4] +// CHECK-ERROR-NEXT: add x4, sp, x9, asr #5 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected 'sxtx' 'uxtx' or 'lsl' with optional integer in range [0, 4] +// CHECK-ERROR-NEXT: add x9, x10, x5, ror #3 +// CHECK-ERROR-NEXT: ^ + + add w1, w2, w3, lsl #-1 + add w1, w2, w3, lsl #32 + add w1, w2, w3, lsr #-1 + add w1, w2, w3, lsr #32 + add w1, w2, w3, asr #-1 + add w1, w2, w3, asr #32 + add x1, x2, x3, lsl #-1 + add x1, x2, x3, lsl #64 + add x1, x2, x3, lsr #-1 + add x1, x2, x3, lsr #64 + add x1, x2, x3, asr #-1 + add x1, x2, x3, asr #64 +// CHECK-ERROR: error: expected integer shift amount +// CHECK-ERROR-NEXT: add w1, w2, w3, lsl #-1 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected 'sxtx' 'uxtx' or 'lsl' 
with optional integer in range [0, 4] +// CHECK-ERROR-NEXT: add w1, w2, w3, lsl #32 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer shift amount +// CHECK-ERROR-NEXT: add w1, w2, w3, lsr #-1 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected 'sxtx' 'uxtx' or 'lsl' with optional integer in range [0, 4] +// CHECK-ERROR-NEXT: add w1, w2, w3, lsr #32 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer shift amount +// CHECK-ERROR-NEXT: add w1, w2, w3, asr #-1 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected 'sxtx' 'uxtx' or 'lsl' with optional integer in range [0, 4] +// CHECK-ERROR-NEXT: add w1, w2, w3, asr #32 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer shift amount +// CHECK-ERROR-NEXT: add x1, x2, x3, lsl #-1 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected 'sxtx' 'uxtx' or 'lsl' with optional integer in range [0, 4] +// CHECK-ERROR-NEXT: add x1, x2, x3, lsl #64 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer shift amount +// CHECK-ERROR-NEXT: add x1, x2, x3, lsr #-1 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected 'sxtx' 'uxtx' or 'lsl' with optional integer in range [0, 4] +// CHECK-ERROR-NEXT: add x1, x2, x3, lsr #64 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer shift amount +// CHECK-ERROR-NEXT: add x1, x2, x3, asr #-1 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected 'sxtx' 'uxtx' or 'lsl' with optional integer in range [0, 4] +// CHECK-ERROR-NEXT: add x1, x2, x3, asr #64 +// CHECK-ERROR-NEXT: ^ + + adds w1, w2, w3, lsl #-1 + adds w1, w2, w3, lsl #32 + adds w1, w2, w3, lsr #-1 + adds w1, w2, w3, lsr #32 + adds w1, w2, w3, asr #-1 + adds w1, w2, w3, asr #32 + adds x1, x2, x3, lsl #-1 + adds x1, x2, x3, lsl #64 + adds x1, x2, x3, lsr #-1 + adds x1, x2, x3, lsr #64 + adds x1, x2, x3, asr #-1 + adds x1, x2, x3, asr #64 +// CHECK-ERROR: error: expected integer shift amount +// CHECK-ERROR-NEXT: adds w1, w2, w3, lsl #-1 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected 'sxtx' 'uxtx' or 'lsl' with optional integer in range [0, 4] +// CHECK-ERROR-NEXT: adds w1, w2, w3, lsl #32 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer shift amount +// CHECK-ERROR-NEXT: adds w1, w2, w3, lsr #-1 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected 'sxtx' 'uxtx' or 'lsl' with optional integer in range [0, 4] +// CHECK-ERROR-NEXT: adds w1, w2, w3, lsr #32 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer shift amount +// CHECK-ERROR-NEXT: adds w1, w2, w3, asr #-1 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected 'sxtx' 'uxtx' or 'lsl' with optional integer in range [0, 4] +// CHECK-ERROR-NEXT: adds w1, w2, w3, asr #32 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer shift amount +// CHECK-ERROR-NEXT: adds x1, x2, x3, lsl #-1 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected 'sxtx' 'uxtx' or 'lsl' with optional integer in range [0, 4] +// CHECK-ERROR-NEXT: adds x1, x2, x3, lsl #64 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer shift amount +// CHECK-ERROR-NEXT: adds x1, x2, x3, lsr #-1 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected 'sxtx' 'uxtx' or 'lsl' with optional integer in range [0, 4] +// CHECK-ERROR-NEXT: adds x1, x2, x3, lsr #64 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer shift amount +// CHECK-ERROR-NEXT: adds x1, x2, x3, asr #-1 +// CHECK-ERROR-NEXT: ^ +// 
CHECK-ERROR-NEXT: error: expected 'sxtx' 'uxtx' or 'lsl' with optional integer in range [0, 4] +// CHECK-ERROR-NEXT: adds x1, x2, x3, asr #64 +// CHECK-ERROR-NEXT: ^ + + sub w1, w2, w3, lsl #-1 + sub w1, w2, w3, lsl #32 + sub w1, w2, w3, lsr #-1 + sub w1, w2, w3, lsr #32 + sub w1, w2, w3, asr #-1 + sub w1, w2, w3, asr #32 + sub x1, x2, x3, lsl #-1 + sub x1, x2, x3, lsl #64 + sub x1, x2, x3, lsr #-1 + sub x1, x2, x3, lsr #64 + sub x1, x2, x3, asr #-1 + sub x1, x2, x3, asr #64 +// CHECK-ERROR: error: expected integer shift amount +// CHECK-ERROR-NEXT: sub w1, w2, w3, lsl #-1 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected 'sxtx' 'uxtx' or 'lsl' with optional integer in range [0, 4] +// CHECK-ERROR-NEXT: sub w1, w2, w3, lsl #32 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer shift amount +// CHECK-ERROR-NEXT: sub w1, w2, w3, lsr #-1 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected 'sxtx' 'uxtx' or 'lsl' with optional integer in range [0, 4] +// CHECK-ERROR-NEXT: sub w1, w2, w3, lsr #32 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer shift amount +// CHECK-ERROR-NEXT: sub w1, w2, w3, asr #-1 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected 'sxtx' 'uxtx' or 'lsl' with optional integer in range [0, 4] +// CHECK-ERROR-NEXT: sub w1, w2, w3, asr #32 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer shift amount +// CHECK-ERROR-NEXT: sub x1, x2, x3, lsl #-1 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected 'sxtx' 'uxtx' or 'lsl' with optional integer in range [0, 4] +// CHECK-ERROR-NEXT: sub x1, x2, x3, lsl #64 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer shift amount +// CHECK-ERROR-NEXT: sub x1, x2, x3, lsr #-1 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected 'sxtx' 'uxtx' or 'lsl' with optional integer in range [0, 4] +// CHECK-ERROR-NEXT: sub x1, x2, x3, lsr #64 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer shift amount +// CHECK-ERROR-NEXT: sub x1, x2, x3, asr #-1 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected 'sxtx' 'uxtx' or 'lsl' with optional integer in range [0, 4] +// CHECK-ERROR-NEXT: sub x1, x2, x3, asr #64 +// CHECK-ERROR-NEXT: ^ + + subs w1, w2, w3, lsl #-1 + subs w1, w2, w3, lsl #32 + subs w1, w2, w3, lsr #-1 + subs w1, w2, w3, lsr #32 + subs w1, w2, w3, asr #-1 + subs w1, w2, w3, asr #32 + subs x1, x2, x3, lsl #-1 + subs x1, x2, x3, lsl #64 + subs x1, x2, x3, lsr #-1 + subs x1, x2, x3, lsr #64 + subs x1, x2, x3, asr #-1 + subs x1, x2, x3, asr #64 +// CHECK-ERROR: error: expected integer shift amount +// CHECK-ERROR-NEXT: subs w1, w2, w3, lsl #-1 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected 'sxtx' 'uxtx' or 'lsl' with optional integer in range [0, 4] +// CHECK-ERROR-NEXT: subs w1, w2, w3, lsl #32 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer shift amount +// CHECK-ERROR-NEXT: subs w1, w2, w3, lsr #-1 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected 'sxtx' 'uxtx' or 'lsl' with optional integer in range [0, 4] +// CHECK-ERROR-NEXT: subs w1, w2, w3, lsr #32 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer shift amount +// CHECK-ERROR-NEXT: subs w1, w2, w3, asr #-1 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected 'sxtx' 'uxtx' or 'lsl' with optional integer in range [0, 4] +// CHECK-ERROR-NEXT: subs w1, w2, w3, asr #32 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer shift amount +// 
CHECK-ERROR-NEXT: subs x1, x2, x3, lsl #-1 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected 'sxtx' 'uxtx' or 'lsl' with optional integer in range [0, 4] +// CHECK-ERROR-NEXT: subs x1, x2, x3, lsl #64 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer shift amount +// CHECK-ERROR-NEXT: subs x1, x2, x3, lsr #-1 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected 'sxtx' 'uxtx' or 'lsl' with optional integer in range [0, 4] +// CHECK-ERROR-NEXT: subs x1, x2, x3, lsr #64 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer shift amount +// CHECK-ERROR-NEXT: subs x1, x2, x3, asr #-1 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected 'sxtx' 'uxtx' or 'lsl' with optional integer in range [0, 4] +// CHECK-ERROR-NEXT: subs x1, x2, x3, asr #64 +// CHECK-ERROR-NEXT: ^ + + cmn w9, w10, lsl #-1 + cmn w9, w10, lsl #32 + cmn w11, w12, lsr #-1 + cmn w11, w12, lsr #32 + cmn w19, wzr, asr #-1 + cmn wzr, wzr, asr #32 + cmn x9, x10, lsl #-1 + cmn x9, x10, lsl #64 + cmn x11, x12, lsr #-1 + cmn x11, x12, lsr #64 + cmn x19, xzr, asr #-1 + cmn xzr, xzr, asr #64 +// CHECK-ERROR: error: expected integer shift amount +// CHECK-ERROR-NEXT: cmn w9, w10, lsl #-1 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected 'sxtx' 'uxtx' or 'lsl' with optional integer in range [0, 4] +// CHECK-ERROR-NEXT: cmn w9, w10, lsl #32 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer shift amount +// CHECK-ERROR-NEXT: cmn w11, w12, lsr #-1 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected 'sxtx' 'uxtx' or 'lsl' with optional integer in range [0, 4] +// CHECK-ERROR-NEXT: cmn w11, w12, lsr #32 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer shift amount +// CHECK-ERROR-NEXT: cmn w19, wzr, asr #-1 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected 'lsl', 'lsr' or 'asr' with optional integer in range [0, 31] +// CHECK-ERROR-NEXT: cmn wzr, wzr, asr #32 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer shift amount +// CHECK-ERROR-NEXT: cmn x9, x10, lsl #-1 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected 'sxtx' 'uxtx' or 'lsl' with optional integer in range [0, 4] +// CHECK-ERROR-NEXT: cmn x9, x10, lsl #64 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer shift amount +// CHECK-ERROR-NEXT: cmn x11, x12, lsr #-1 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected 'sxtx' 'uxtx' or 'lsl' with optional integer in range [0, 4] +// CHECK-ERROR-NEXT: cmn x11, x12, lsr #64 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer shift amount +// CHECK-ERROR-NEXT: cmn x19, xzr, asr #-1 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected 'lsl', 'lsr' or 'asr' with optional integer in range [0, 63] +// CHECK-ERROR-NEXT: cmn xzr, xzr, asr #64 +// CHECK-ERROR-NEXT: ^ + + cmp w9, w10, lsl #-1 + cmp w9, w10, lsl #32 + cmp w11, w12, lsr #-1 + cmp w11, w12, lsr #32 + cmp w19, wzr, asr #-1 + cmp wzr, wzr, asr #32 + cmp x9, x10, lsl #-1 + cmp x9, x10, lsl #64 + cmp x11, x12, lsr #-1 + cmp x11, x12, lsr #64 + cmp x19, xzr, asr #-1 + cmp xzr, xzr, asr #64 +// CHECK-ERROR: error: expected integer shift amount +// CHECK-ERROR-NEXT: cmp w9, w10, lsl #-1 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected 'sxtx' 'uxtx' or 'lsl' with optional integer in range [0, 4] +// CHECK-ERROR-NEXT: cmp w9, w10, lsl #32 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer shift amount +// CHECK-ERROR-NEXT: cmp w11, w12, lsr #-1 
+// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected 'sxtx' 'uxtx' or 'lsl' with optional integer in range [0, 4] +// CHECK-ERROR-NEXT: cmp w11, w12, lsr #32 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer shift amount +// CHECK-ERROR-NEXT: cmp w19, wzr, asr #-1 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected 'lsl', 'lsr' or 'asr' with optional integer in range [0, 31] +// CHECK-ERROR-NEXT: cmp wzr, wzr, asr #32 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer shift amount +// CHECK-ERROR-NEXT: cmp x9, x10, lsl #-1 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected 'sxtx' 'uxtx' or 'lsl' with optional integer in range [0, 4] +// CHECK-ERROR-NEXT: cmp x9, x10, lsl #64 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer shift amount +// CHECK-ERROR-NEXT: cmp x11, x12, lsr #-1 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected 'sxtx' 'uxtx' or 'lsl' with optional integer in range [0, 4] +// CHECK-ERROR-NEXT: cmp x11, x12, lsr #64 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer shift amount +// CHECK-ERROR-NEXT: cmp x19, xzr, asr #-1 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected 'lsl', 'lsr' or 'asr' with optional integer in range [0, 63] +// CHECK-ERROR-NEXT: cmp xzr, xzr, asr #64 +// CHECK-ERROR-NEXT: ^ + + neg w9, w10, lsl #-1 + neg w9, w10, lsl #32 + neg w11, w12, lsr #-1 + neg w11, w12, lsr #32 + neg w19, wzr, asr #-1 + neg wzr, wzr, asr #32 + neg x9, x10, lsl #-1 + neg x9, x10, lsl #64 + neg x11, x12, lsr #-1 + neg x11, x12, lsr #64 + neg x19, xzr, asr #-1 + neg xzr, xzr, asr #64 +// CHECK-ERROR: error: expected integer shift amount +// CHECK-ERROR-NEXT: neg w9, w10, lsl #-1 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected 'lsl', 'lsr' or 'asr' with optional integer in range [0, 31] +// CHECK-ERROR-NEXT: neg w9, w10, lsl #32 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer shift amount +// CHECK-ERROR-NEXT: neg w11, w12, lsr #-1 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected 'lsl', 'lsr' or 'asr' with optional integer in range [0, 31] +// CHECK-ERROR-NEXT: neg w11, w12, lsr #32 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer shift amount +// CHECK-ERROR-NEXT: neg w19, wzr, asr #-1 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected 'lsl', 'lsr' or 'asr' with optional integer in range [0, 31] +// CHECK-ERROR-NEXT: neg wzr, wzr, asr #32 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer shift amount +// CHECK-ERROR-NEXT: neg x9, x10, lsl #-1 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected 'lsl', 'lsr' or 'asr' with optional integer in range [0, 63] +// CHECK-ERROR-NEXT: neg x9, x10, lsl #64 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer shift amount +// CHECK-ERROR-NEXT: neg x11, x12, lsr #-1 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected 'lsl', 'lsr' or 'asr' with optional integer in range [0, 63] +// CHECK-ERROR-NEXT: neg x11, x12, lsr #64 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer shift amount +// CHECK-ERROR-NEXT: neg x19, xzr, asr #-1 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected 'lsl', 'lsr' or 'asr' with optional integer in range [0, 63] +// CHECK-ERROR-NEXT: neg xzr, xzr, asr #64 +// CHECK-ERROR-NEXT: ^ + + negs w9, w10, lsl #-1 + negs w9, w10, lsl #32 + negs w11, w12, lsr #-1 + negs w11, w12, lsr #32 + negs w19, wzr, asr #-1 + negs wzr, wzr, asr 
#32 + negs x9, x10, lsl #-1 + negs x9, x10, lsl #64 + negs x11, x12, lsr #-1 + negs x11, x12, lsr #64 + negs x19, xzr, asr #-1 + negs xzr, xzr, asr #64 +// CHECK-ERROR: error: expected integer shift amount +// CHECK-ERROR-NEXT: negs w9, w10, lsl #-1 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected 'lsl', 'lsr' or 'asr' with optional integer in range [0, 31] +// CHECK-ERROR-NEXT: negs w9, w10, lsl #32 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer shift amount +// CHECK-ERROR-NEXT: negs w11, w12, lsr #-1 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected 'lsl', 'lsr' or 'asr' with optional integer in range [0, 31] +// CHECK-ERROR-NEXT: negs w11, w12, lsr #32 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer shift amount +// CHECK-ERROR-NEXT: negs w19, wzr, asr #-1 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected 'lsl', 'lsr' or 'asr' with optional integer in range [0, 31] +// CHECK-ERROR-NEXT: negs wzr, wzr, asr #32 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer shift amount +// CHECK-ERROR-NEXT: negs x9, x10, lsl #-1 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected 'lsl', 'lsr' or 'asr' with optional integer in range [0, 63] +// CHECK-ERROR-NEXT: negs x9, x10, lsl #64 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer shift amount +// CHECK-ERROR-NEXT: negs x11, x12, lsr #-1 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected 'lsl', 'lsr' or 'asr' with optional integer in range [0, 63] +// CHECK-ERROR-NEXT: negs x11, x12, lsr #64 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer shift amount +// CHECK-ERROR-NEXT: negs x19, xzr, asr #-1 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected 'lsl', 'lsr' or 'asr' with optional integer in range [0, 63] +// CHECK-ERROR-NEXT: negs xzr, xzr, asr #64 +// CHECK-ERROR-NEXT: ^ + +//------------------------------------------------------------------------------ +// Add-subtract (with carry) +//------------------------------------------------------------------------------ + + adc wsp, w3, w5 + adc w1, wsp, w2 + adc w0, w10, wsp +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR-NEXT: adc wsp, w3, w5 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: adc w1, wsp, w2 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: adc w0, w10, wsp +// CHECK-ERROR-NEXT: ^ + + adc sp, x3, x5 + adc x1, sp, x2 + adc x0, x10, sp +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR-NEXT: adc sp, x3, x5 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: adc x1, sp, x2 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: adc x0, x10, sp +// CHECK-ERROR-NEXT: ^ + + adcs wsp, w3, w5 + adcs w1, wsp, w2 + adcs w0, w10, wsp +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR-NEXT: adcs wsp, w3, w5 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: adcs w1, wsp, w2 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: adcs w0, w10, wsp +// CHECK-ERROR-NEXT: ^ + + adcs sp, x3, x5 + adcs x1, sp, x2 + adcs x0, x10, sp +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR-NEXT: adcs sp, x3, x5 +//
CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: adcs x1, sp, x2 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: adcs x0, x10, sp +// CHECK-ERROR-NEXT: ^ + + sbc wsp, w3, w5 + sbc w1, wsp, w2 + sbc w0, w10, wsp +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR-NEXT: sbc wsp, w3, w5 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: sbc w1, wsp, w2 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: sbc w0, w10, wsp +// CHECK-ERROR-NEXT: ^ + + sbc sp, x3, x5 + sbc x1, sp, x2 + sbc x0, x10, sp +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR-NEXT: sbc sp, x3, x5 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: sbc x1, sp, x2 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: sbc x0, x10, sp +// CHECK-ERROR-NEXT: ^ + + sbcs wsp, w3, w5 + sbcs w1, wsp, w2 + sbcs w0, w10, wsp +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR-NEXT: sbcs wsp, w3, w5 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: sbcs w1, wsp, w2 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: sbcs w0, w10, wsp +// CHECK-ERROR-NEXT: ^ + + sbcs sp, x3, x5 + sbcs x1, sp, x2 + sbcs x0, x10, sp +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR-NEXT: sbcs sp, x3, x5 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: sbcs x1, sp, x2 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: sbcs x0, x10, sp +// CHECK-ERROR-NEXT: ^ + + ngc wsp, w3 + ngc w9, wsp + ngc sp, x9 + ngc x2, sp +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR-NEXT: ngc wsp, w3 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: ngc w9, wsp +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: ngc sp, x9 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: ngc x2, sp +// CHECK-ERROR-NEXT: ^ + + ngcs wsp, w3 + ngcs w9, wsp + ngcs sp, x9 + ngcs x2, sp +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR-NEXT: ngcs wsp, w3 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: ngcs w9, wsp +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: ngcs sp, x9 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: ngcs x2, sp +// CHECK-ERROR-NEXT: ^ + +//------------------------------------------------------------------------------ +// Bitfield +//------------------------------------------------------------------------------ + + sbfm x3, w13, #0, #0 + sbfm w12, x9, #0, #0 + sbfm sp, x3, #3, #5 + sbfm w3, wsp, #1, #9 + sbfm x9, x5, #-1, #0 + sbfm x9, x5, #0, #-1 +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR-NEXT: sbfm x3, w13, #0, #0 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: sbfm w12, x9, #0, 
#0 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: sbfm sp, x3, #3, #5 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: sbfm w3, wsp, #1, #9 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [0, 63] +// CHECK-ERROR-NEXT: sbfm x9, x5, #-1, #0 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [0, 63] +// CHECK-ERROR-NEXT: sbfm x9, x5, #0, #-1 +// CHECK-ERROR-NEXT: ^ + + sbfm w3, w5, #32, #1 + sbfm w7, w11, #19, #32 + sbfm x29, x30, #64, #0 + sbfm x10, x20, #63, #64 +// CHECK-ERROR: error: expected integer in range [0, 31] +// CHECK-ERROR-NEXT: sbfm w3, w5, #32, #1 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [0, 31] +// CHECK-ERROR-NEXT: sbfm w7, w11, #19, #32 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [0, 63] +// CHECK-ERROR-NEXT: sbfm x29, x30, #64, #0 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [0, 63] +// CHECK-ERROR-NEXT: sbfm x10, x20, #63, #64 +// CHECK-ERROR-NEXT: ^ + + ubfm w3, w5, #32, #1 + ubfm w7, w11, #19, #32 + ubfm x29, x30, #64, #0 + ubfm x10, x20, #63, #64 +// CHECK-ERROR: error: expected integer in range [0, 31] +// CHECK-ERROR-NEXT: ubfm w3, w5, #32, #1 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [0, 31] +// CHECK-ERROR-NEXT: ubfm w7, w11, #19, #32 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [0, 63] +// CHECK-ERROR-NEXT: ubfm x29, x30, #64, #0 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [0, 63] +// CHECK-ERROR-NEXT: ubfm x10, x20, #63, #64 +// CHECK-ERROR-NEXT: ^ + + bfm w3, w5, #32, #1 + bfm w7, w11, #19, #32 + bfm x29, x30, #64, #0 + bfm x10, x20, #63, #64 +// CHECK-ERROR: error: expected integer in range [0, 31] +// CHECK-ERROR-NEXT: bfm w3, w5, #32, #1 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [0, 31] +// CHECK-ERROR-NEXT: bfm w7, w11, #19, #32 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [0, 63] +// CHECK-ERROR-NEXT: bfm x29, x30, #64, #0 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [0, 63] +// CHECK-ERROR-NEXT: bfm x10, x20, #63, #64 +// CHECK-ERROR-NEXT: ^ + + sxtb x3, x2 + sxth xzr, xzr + sxtw x3, x5 +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR-NEXT: sxtb x3, x2 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: sxth xzr, xzr +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: sxtw x3, x5 +// CHECK-ERROR-NEXT: ^ + + uxtb x3, x12 + uxth x5, x9 + uxtw x3, x5 + uxtb x2, sp + uxtb sp, xzr +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR-NEXT: uxtb x3, x12 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: uxth x5, x9 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid instruction +// CHECK-ERROR-NEXT: uxtw x3, x5 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: uxtb x2, sp +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: uxtb sp, xzr +// CHECK-ERROR-NEXT: ^ + + asr x3, w2, #1 + asr sp, x2, #1 + asr x25, x26, #-1 + asr x25, x26, #64 + asr w9, w8, #32 +// 
CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR-NEXT: asr x3, w2, #1 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: asr sp, x2, #1 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [0, 63] +// CHECK-ERROR-NEXT: asr x25, x26, #-1 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [0, 63] +// CHECK-ERROR-NEXT: asr x25, x26, #64 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [0, 31] +// CHECK-ERROR-NEXT: asr w9, w8, #32 +// CHECK-ERROR-NEXT: ^ + + sbfiz w1, w2, #0, #0 + sbfiz wsp, w9, #0, #1 + sbfiz w9, w10, #32, #1 + sbfiz w11, w12, #32, #0 + sbfiz w9, w10, #10, #23 + sbfiz x3, x5, #12, #53 + sbfiz sp, x3, #5, #6 + sbfiz w3, wsp, #7, #8 +// CHECK-ERROR: error: expected integer in range [<lsb>, 31] +// CHECK-ERROR-NEXT: sbfiz w1, w2, #0, #0 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: sbfiz wsp, w9, #0, #1 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [0, 31] +// CHECK-ERROR-NEXT: sbfiz w9, w10, #32, #1 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [0, 31] +// CHECK-ERROR-NEXT: sbfiz w11, w12, #32, #0 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: requested insert overflows register +// CHECK-ERROR-NEXT: sbfiz w9, w10, #10, #23 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: requested insert overflows register +// CHECK-ERROR-NEXT: sbfiz x3, x5, #12, #53 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: sbfiz sp, x3, #5, #6 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: sbfiz w3, wsp, #7, #8 +// CHECK-ERROR-NEXT: ^ + + sbfx w1, w2, #0, #0 + sbfx wsp, w9, #0, #1 + sbfx w9, w10, #32, #1 + sbfx w11, w12, #32, #0 + sbfx w9, w10, #10, #23 + sbfx x3, x5, #12, #53 + sbfx sp, x3, #5, #6 + sbfx w3, wsp, #7, #8 +// CHECK-ERROR: error: expected integer in range [<lsb>, 31] +// CHECK-ERROR-NEXT: sbfx w1, w2, #0, #0 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: sbfx wsp, w9, #0, #1 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [0, 31] +// CHECK-ERROR-NEXT: sbfx w9, w10, #32, #1 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [0, 31] +// CHECK-ERROR-NEXT: sbfx w11, w12, #32, #0 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: requested extract overflows register +// CHECK-ERROR-NEXT: sbfx w9, w10, #10, #23 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: requested extract overflows register +// CHECK-ERROR-NEXT: sbfx x3, x5, #12, #53 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: sbfx sp, x3, #5, #6 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: sbfx w3, wsp, #7, #8 +// CHECK-ERROR-NEXT: ^ + + bfi w1, w2, #0, #0 + bfi wsp, w9, #0, #1 + bfi w9, w10, #32, #1 + bfi w11, w12, #32, #0 + bfi w9, w10, #10, #23 + bfi x3, x5, #12, #53 + bfi sp, x3, #5, #6 + bfi w3, wsp, #7, #8 +// CHECK-ERROR: error: expected integer in range [<lsb>, 31] +// CHECK-ERROR-NEXT: bfi w1, w2, #0, #0 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: bfi wsp, w9, #0, #1 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: 
error: expected integer in range [0, 31] +// CHECK-ERROR-NEXT: bfi w9, w10, #32, #1 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [0, 31] +// CHECK-ERROR-NEXT: bfi w11, w12, #32, #0 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: requested insert overflows register +// CHECK-ERROR-NEXT: bfi w9, w10, #10, #23 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: requested insert overflows register +// CHECK-ERROR-NEXT: bfi x3, x5, #12, #53 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: bfi sp, x3, #5, #6 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: bfi w3, wsp, #7, #8 +// CHECK-ERROR-NEXT: ^ + + bfxil w1, w2, #0, #0 + bfxil wsp, w9, #0, #1 + bfxil w9, w10, #32, #1 + bfxil w11, w12, #32, #0 + bfxil w9, w10, #10, #23 + bfxil x3, x5, #12, #53 + bfxil sp, x3, #5, #6 + bfxil w3, wsp, #7, #8 +// CHECK-ERROR: error: expected integer in range [<lsb>, 31] +// CHECK-ERROR-NEXT: bfxil w1, w2, #0, #0 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: bfxil wsp, w9, #0, #1 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [0, 31] +// CHECK-ERROR-NEXT: bfxil w9, w10, #32, #1 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [0, 31] +// CHECK-ERROR-NEXT: bfxil w11, w12, #32, #0 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: requested extract overflows register +// CHECK-ERROR-NEXT: bfxil w9, w10, #10, #23 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: requested extract overflows register +// CHECK-ERROR-NEXT: bfxil x3, x5, #12, #53 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: bfxil sp, x3, #5, #6 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: bfxil w3, wsp, #7, #8 +// CHECK-ERROR-NEXT: ^ + + ubfiz w1, w2, #0, #0 + ubfiz wsp, w9, #0, #1 + ubfiz w9, w10, #32, #1 + ubfiz w11, w12, #32, #0 + ubfiz w9, w10, #10, #23 + ubfiz x3, x5, #12, #53 + ubfiz sp, x3, #5, #6 + ubfiz w3, wsp, #7, #8 +// CHECK-ERROR: error: expected integer in range [<lsb>, 31] +// CHECK-ERROR-NEXT: ubfiz w1, w2, #0, #0 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: ubfiz wsp, w9, #0, #1 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [0, 31] +// CHECK-ERROR-NEXT: ubfiz w9, w10, #32, #1 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [0, 31] +// CHECK-ERROR-NEXT: ubfiz w11, w12, #32, #0 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: requested insert overflows register +// CHECK-ERROR-NEXT: ubfiz w9, w10, #10, #23 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: requested insert overflows register +// CHECK-ERROR-NEXT: ubfiz x3, x5, #12, #53 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: ubfiz sp, x3, #5, #6 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: ubfiz w3, wsp, #7, #8 +// CHECK-ERROR-NEXT: ^ + + ubfx w1, w2, #0, #0 + ubfx wsp, w9, #0, #1 + ubfx w9, w10, #32, #1 + ubfx w11, w12, #32, #0 + ubfx w9, w10, #10, #23 + ubfx x3, x5, #12, #53 + ubfx sp, x3, #5, #6 + ubfx w3, wsp, #7, #8 +// CHECK-ERROR: error: expected integer in range [<lsb>, 31] +// CHECK-ERROR-NEXT: ubfx w1, w2, #0, #0 +// 
CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: ubfx wsp, w9, #0, #1 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [0, 31] +// CHECK-ERROR-NEXT: ubfx w9, w10, #32, #1 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [0, 31] +// CHECK-ERROR-NEXT: ubfx w11, w12, #32, #0 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: requested extract overflows register +// CHECK-ERROR-NEXT: ubfx w9, w10, #10, #23 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: requested extract overflows register +// CHECK-ERROR-NEXT: ubfx x3, x5, #12, #53 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: ubfx sp, x3, #5, #6 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: ubfx w3, wsp, #7, #8 +// CHECK-ERROR-NEXT: ^ + +//------------------------------------------------------------------------------ +// Compare & branch (immediate) +//------------------------------------------------------------------------------ + + cbnz wsp, lbl + cbz sp, lbl + cbz x3, x5 +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR-NEXT: cbnz wsp, lbl +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: cbz sp, lbl +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected label or encodable integer pc offset +// CHECK-ERROR-NEXT: cbz x3, x5 +// CHECK-ERROR-NEXT: ^ + + cbz w20, #1048576 + cbnz xzr, #-1048580 + cbz x29, #1 +// CHECK-ERROR: error: expected label or encodable integer pc offset +// CHECK-ERROR-NEXT: cbz w20, #1048576 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected label or encodable integer pc offset +// CHECK-ERROR-NEXT: cbnz xzr, #-1048580 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected label or encodable integer pc offset +// CHECK-ERROR-NEXT: cbz x29, #1 +// CHECK-ERROR-NEXT: ^ + +//------------------------------------------------------------------------------ +// Conditional branch (immediate) +//------------------------------------------------------------------------------ + + b.zf lbl +// CHECK-ERROR: error: invalid condition code +// CHECK-ERROR-NEXT: b.zf lbl +// CHECK-ERROR-NEXT: ^ + + b.eq #1048576 + b.ge #-1048580 + b.cc #1 +// CHECK-ERROR: error: expected label or encodable integer pc offset +// CHECK-ERROR-NEXT: b.eq #1048576 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected label or encodable integer pc offset +// CHECK-ERROR-NEXT: b.ge #-1048580 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected label or encodable integer pc offset +// CHECK-ERROR-NEXT: b.cc #1 +// CHECK-ERROR-NEXT: ^ + +//------------------------------------------------------------------------------ +// Conditional compare (immediate) +//------------------------------------------------------------------------------ + + ccmp wsp, #4, #2, ne + ccmp w25, #-1, #15, hs + ccmp w3, #32, #0, ge + ccmp w19, #5, #-1, lt + ccmp w20, #7, #16, hs +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR-NEXT: ccmp wsp, #4, #2, ne +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [0, 31] +// CHECK-ERROR-NEXT: ccmp w25, #-1, #15, hs +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [0, 31] +// CHECK-ERROR-NEXT: ccmp w3, #32, #0, ge +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [0, 15] +// 
CHECK-ERROR-NEXT: ccmp w19, #5, #-1, lt +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [0, 15] +// CHECK-ERROR-NEXT: ccmp w20, #7, #16, hs +// CHECK-ERROR-NEXT: ^ + + ccmp sp, #4, #2, ne + ccmp x25, #-1, #15, hs + ccmp x3, #32, #0, ge + ccmp x19, #5, #-1, lt + ccmp x20, #7, #16, hs +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR-NEXT: ccmp sp, #4, #2, ne +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [0, 31] +// CHECK-ERROR-NEXT: ccmp x25, #-1, #15, hs +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [0, 31] +// CHECK-ERROR-NEXT: ccmp x3, #32, #0, ge +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [0, 15] +// CHECK-ERROR-NEXT: ccmp x19, #5, #-1, lt +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [0, 15] +// CHECK-ERROR-NEXT: ccmp x20, #7, #16, hs +// CHECK-ERROR-NEXT: ^ + + ccmn wsp, #4, #2, ne + ccmn w25, #-1, #15, hs + ccmn w3, #32, #0, ge + ccmn w19, #5, #-1, lt + ccmn w20, #7, #16, hs +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR-NEXT: ccmn wsp, #4, #2, ne +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [0, 31] +// CHECK-ERROR-NEXT: ccmn w25, #-1, #15, hs +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [0, 31] +// CHECK-ERROR-NEXT: ccmn w3, #32, #0, ge +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [0, 15] +// CHECK-ERROR-NEXT: ccmn w19, #5, #-1, lt +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [0, 15] +// CHECK-ERROR-NEXT: ccmn w20, #7, #16, hs +// CHECK-ERROR-NEXT: ^ + + ccmn sp, #4, #2, ne + ccmn x25, #-1, #15, hs + ccmn x3, #32, #0, ge + ccmn x19, #5, #-1, lt + ccmn x20, #7, #16, hs +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR-NEXT: ccmn sp, #4, #2, ne +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [0, 31] +// CHECK-ERROR-NEXT: ccmn x25, #-1, #15, hs +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [0, 31] +// CHECK-ERROR-NEXT: ccmn x3, #32, #0, ge +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [0, 15] +// CHECK-ERROR-NEXT: ccmn x19, #5, #-1, lt +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [0, 15] +// CHECK-ERROR-NEXT: ccmn x20, #7, #16, hs +// CHECK-ERROR-NEXT: ^ + +//------------------------------------------------------------------------------ +// Conditional compare (register) +//------------------------------------------------------------------------------ + + ccmp wsp, w4, #2, ne + ccmp w3, wsp, #0, ge + ccmp w19, w5, #-1, lt + ccmp w20, w7, #16, hs +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR-NEXT: ccmp wsp, w4, #2, ne +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [0, 31] +// CHECK-ERROR-NEXT: ccmp w3, wsp, #0, ge +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [0, 15] +// CHECK-ERROR-NEXT: ccmp w19, w5, #-1, lt +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [0, 15] +// CHECK-ERROR-NEXT: ccmp w20, w7, #16, hs +// CHECK-ERROR-NEXT: ^ + + ccmp sp, x4, #2, ne + ccmp x25, sp, #15, hs + ccmp x19, x5, #-1, lt + ccmp x20, x7, #16, hs +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR-NEXT: ccmp sp, x4, #2, ne +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: 
expected integer in range [0, 31] +// CHECK-ERROR-NEXT: ccmp x25, sp, #15, hs +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [0, 15] +// CHECK-ERROR-NEXT: ccmp x19, x5, #-1, lt +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [0, 15] +// CHECK-ERROR-NEXT: ccmp x20, x7, #16, hs +// CHECK-ERROR-NEXT: ^ + + ccmn wsp, w4, #2, ne + ccmn w25, wsp, #15, hs + ccmn w19, w5, #-1, lt + ccmn w20, w7, #16, hs +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR-NEXT: ccmn wsp, w4, #2, ne +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [0, 31] +// CHECK-ERROR-NEXT: ccmn w25, wsp, #15, hs +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [0, 15] +// CHECK-ERROR-NEXT: ccmn w19, w5, #-1, lt +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [0, 15] +// CHECK-ERROR-NEXT: ccmn w20, w7, #16, hs +// CHECK-ERROR-NEXT: ^ + + ccmn sp, x4, #2, ne + ccmn x25, sp, #15, hs + ccmn x19, x5, #-1, lt + ccmn x20, x7, #16, hs +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR-NEXT: ccmn sp, x4, #2, ne +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [0, 31] +// CHECK-ERROR-NEXT: ccmn x25, sp, #15, hs +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [0, 15] +// CHECK-ERROR-NEXT: ccmn x19, x5, #-1, lt +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [0, 15] +// CHECK-ERROR-NEXT: ccmn x20, x7, #16, hs +// CHECK-ERROR-NEXT: ^ + +//------------------------------------------------------------------------------ +// Conditional select +//------------------------------------------------------------------------------ + + csel w4, wsp, w9, eq + csel wsp, w2, w3, ne + csel w10, w11, wsp, ge + csel w1, w2, w3, #3 + csel x4, sp, x9, eq + csel sp, x2, x3, ne + csel x10, x11, sp, ge + csel x1, x2, x3, #3 +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR-NEXT: csel w4, wsp, w9, eq +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: csel wsp, w2, w3, ne +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: csel w10, w11, wsp, ge +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected AArch64 condition code +// CHECK-ERROR-NEXT: csel w1, w2, w3, #3 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: csel x4, sp, x9, eq +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: csel sp, x2, x3, ne +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: csel x10, x11, sp, ge +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected AArch64 condition code +// CHECK-ERROR-NEXT: csel x1, x2, x3, #3 +// CHECK-ERROR-NEXT: ^ + + csinc w20, w21, wsp, mi + csinc sp, x30, x29, eq +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR-NEXT: csinc w20, w21, wsp, mi +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: csinc sp, x30, x29, eq +// CHECK-ERROR-NEXT: ^ + + csinv w20, wsp, wsp, mi + csinv sp, x30, x29, le +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR-NEXT: csinv w20, wsp, wsp, mi +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: 
csinv sp, x30, x29, le +// CHECK-ERROR-NEXT: ^ + + csneg w20, w21, wsp, mi + csneg x0, sp, x29, le +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR-NEXT: csneg w20, w21, wsp, mi +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: csneg x0, sp, x29, le +// CHECK-ERROR-NEXT: ^ + + cset wsp, lt + csetm sp, ge +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR-NEXT: cset wsp, lt +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: csetm sp, ge +// CHECK-ERROR-NEXT: ^ + + cinc w3, wsp, ne + cinc sp, x9, eq +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR-NEXT: cinc w3, wsp, ne +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: cinc sp, x9, eq +// CHECK-ERROR-NEXT: ^ + + cinv w3, wsp, ne + cinv sp, x9, eq +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR-NEXT: cinv w3, wsp, ne +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: cinv sp, x9, eq +// CHECK-ERROR-NEXT: ^ + + cneg w3, wsp, ne + cneg sp, x9, eq +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR-NEXT: cneg w3, wsp, ne +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: cneg sp, x9, eq +// CHECK-ERROR-NEXT: ^ + +//------------------------------------------------------------------------------ +// Data Processing (1 source) +//------------------------------------------------------------------------------ + rbit x23, w2 +//CHECK-ERROR: error: invalid operand for instruction +//CHECK-ERROR-NEXT: rbit x23, w2 + + cls sp, x2 +//CHECK-ERROR: error: invalid operand for instruction +//CHECK-ERROR-NEXT: cls sp, x2 + + clz wsp, w3 +//CHECK-ERROR: error: invalid operand for instruction +//CHECK-ERROR-NEXT: clz wsp, w3 + +//------------------------------------------------------------------------------ +// Data Processing (2 sources) +//------------------------------------------------------------------------------ + udiv x23, w2, x18 +//CHECK-ERROR: error: invalid operand for instruction +//CHECK-ERROR-NEXT: udiv x23, w2, x18 + + lsl sp, x2, x4 +//CHECK-ERROR: error: invalid operand for instruction +//CHECK-ERROR-NEXT: lsl sp, x2, x4 + + asr wsp, w3, w9 +//CHECK-ERROR: error: invalid operand for instruction +//CHECK-ERROR-NEXT: asr wsp, w3, w9 + +//------------------------------------------------------------------------------ +// Data Processing (3 sources) +//------------------------------------------------------------------------------ + + madd sp, x3, x9, x10 +//CHECK-ERROR: error: invalid operand for instruction +//CHECK-ERROR-NEXT: madd sp, x3, x9, x10 + +//------------------------------------------------------------------------------ +// Exception generation +//------------------------------------------------------------------------------ + svc #-1 + hlt #65536 + dcps4 #43 + dcps4 +// CHECK-ERROR: error: expected integer in range [0, 65535] +// CHECK-ERROR-NEXT: svc #-1 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [0, 65535] +// CHECK-ERROR-NEXT: hlt #65536 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid instruction +// CHECK-ERROR-NEXT: dcps4 #43 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid instruction +// CHECK-ERROR-NEXT: dcps4 +// CHECK-ERROR-NEXT: ^ + 
+//------------------------------------------------------------------------------ +// Extract (immediate) +//------------------------------------------------------------------------------ + + extr w2, w20, w30, #-1 + extr w9, w19, w20, #32 +// CHECK-ERROR-NEXT: error: expected integer in range [0, 31] +// CHECK-ERROR-NEXT: extr w2, w20, w30, #-1 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [0, 31] +// CHECK-ERROR-NEXT: extr w9, w19, w20, #32 +// CHECK-ERROR-NEXT: ^ + + extr x10, x15, x20, #-1 + extr x20, x25, x30, #64 +// CHECK-ERROR-NEXT: error: expected integer in range [0, 63] +// CHECK-ERROR-NEXT: extr x10, x15, x20, #-1 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [0, 63] +// CHECK-ERROR-NEXT: extr x20, x25, x30, #64 +// CHECK-ERROR-NEXT: ^ + + ror w9, w10, #32 + ror x10, x11, #64 +// CHECK-ERROR-NEXT: error: expected integer in range [0, 31] +// CHECK-ERROR-NEXT: ror w9, w10, #32 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [0, 63] +// CHECK-ERROR-NEXT: ror x10, x11, #64 +// CHECK-ERROR-NEXT: ^ + +//------------------------------------------------------------------------------ +// Floating-point compare +//------------------------------------------------------------------------------ + + fcmp s3, d2 +// CHECK-ERROR: error: expected floating-point constant #0.0 +// CHECK-ERROR-NEXT: fcmp s3, d2 +// CHECK-ERROR-NEXT: ^ + + fcmp s9, #-0.0 + fcmp d3, #-0.0 + fcmp s1, #1.0 + fcmpe s30, #-0.0 +// CHECK-ERROR: error: expected floating-point constant #0.0 +// CHECK-ERROR-NEXT: fcmp s9, #-0.0 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected floating-point constant #0.0 +// CHECK-ERROR-NEXT: fcmp d3, #-0.0 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected floating-point constant #0.0 +// CHECK-ERROR-NEXT: fcmp s1, #1.0 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected floating-point constant #0.0 +// CHECK-ERROR-NEXT: fcmpe s30, #-0.0 +// CHECK-ERROR-NEXT: ^ + +//------------------------------------------------------------------------------ +// Floating-point conditional compare +//------------------------------------------------------------------------------ + + fccmp s19, s5, #-1, lt + fccmp s20, s7, #16, hs +// CHECK-ERROR-NEXT: error: expected integer in range [0, 15] +// CHECK-ERROR-NEXT: fccmp s19, s5, #-1, lt +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [0, 15] +// CHECK-ERROR-NEXT: fccmp s20, s7, #16, hs +// CHECK-ERROR-NEXT: ^ + + fccmp d19, d5, #-1, lt + fccmp d20, d7, #16, hs +// CHECK-ERROR-NEXT: error: expected integer in range [0, 15] +// CHECK-ERROR-NEXT: fccmp d19, d5, #-1, lt +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [0, 15] +// CHECK-ERROR-NEXT: fccmp d20, d7, #16, hs +// CHECK-ERROR-NEXT: ^ + + fccmpe s19, s5, #-1, lt + fccmpe s20, s7, #16, hs +// CHECK-ERROR-NEXT: error: expected integer in range [0, 15] +// CHECK-ERROR-NEXT: fccmpe s19, s5, #-1, lt +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [0, 15] +// CHECK-ERROR-NEXT: fccmpe s20, s7, #16, hs +// CHECK-ERROR-NEXT: ^ + + fccmpe d19, d5, #-1, lt + fccmpe d20, d7, #16, hs +// CHECK-ERROR-NEXT: error: expected integer in range [0, 15] +// CHECK-ERROR-NEXT: fccmpe d19, d5, #-1, lt +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [0, 15] +// CHECK-ERROR-NEXT: fccmpe d20, d7, #16, hs +// CHECK-ERROR-NEXT: ^ + 
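In each fccmp/fccmpe rejection above, the offending operand is the fallback NZCV value: a 4-bit field holding the flags to install when the condition fails, hence the [0, 15] range. Two accepted forms for contrast (registers and values arbitrary, illustrative only):

        fccmp s19, s5, #15, lt     // #15 sets N, Z, C and V when lt is false
        fccmpe d20, d7, #0, hs     // #0 clears all four flags when hs is false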
+//------------------------------------------------------------------------------ +// Floating-point conditional select +//------------------------------------------------------------------------------ + + fcsel q3, q20, q9, pl + fcsel h9, h10, h11, mi + fcsel b9, b10, b11, mi +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR-NEXT: fcsel q3, q20, q9, pl +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: fcsel h9, h10, h11, mi +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: fcsel b9, b10, b11, mi +// CHECK-ERROR-NEXT: ^ + +//------------------------------------------------------------------------------ +// Floating-point data-processing (1 source) +//------------------------------------------------------------------------------ + + fmov d0, s3 + fcvt d0, d1 +// CHECK-ERROR: error: expected compatible register or floating-point constant +// CHECK-ERROR-NEXT: fmov d0, s3 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: fcvt d0, d1 +// CHECK-ERROR-NEXT: ^ + + +//------------------------------------------------------------------------------ +// Floating-point data-processing (2 sources) +//------------------------------------------------------------------------------ + + fadd s0, d3, d7 + fmaxnm d3, s19, d12 + fnmul d1, d9, s18 +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR-NEXT: fadd s0, d3, d7 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: fmaxnm d3, s19, d12 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: fnmul d1, d9, s18 +// CHECK-ERROR-NEXT: ^ + +//------------------------------------------------------------------------------ +// Floating-point data-processing (3 sources) +//------------------------------------------------------------------------------ + + fmadd b3, b4, b5, b6 + fmsub h1, h2, h3, h4 + fnmadd q3, q5, q6, q7 + fnmsub s2, s4, d5, h9 +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR-NEXT: fmadd b3, b4, b5, b6 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: fmsub h1, h2, h3, h4 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: fnmadd q3, q5, q6, q7 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: fnmsub s2, s4, d5, h9 +// CHECK-ERROR-NEXT: ^ + +//------------------------------------------------------------------------------ +// Floating-point <-> fixed-point conversion +//------------------------------------------------------------------------------ + + fcvtzs w13, s31, #0 + fcvtzs w19, s20, #33 + fcvtzs wsp, s19, #14 +// CHECK-ERROR-NEXT: error: expected integer in range [1, 32] +// CHECK-ERROR-NEXT: fcvtzs w13, s31, #0 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [1, 32] +// CHECK-ERROR-NEXT: fcvtzs w19, s20, #33 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: fcvtzs wsp, s19, #14 +// CHECK-ERROR-NEXT: ^ + + fcvtzs x13, s31, #0 + fcvtzs x19, s20, #65 + fcvtzs sp, s19, #14 +// CHECK-ERROR-NEXT: error: expected integer in range [1, 64] +// CHECK-ERROR-NEXT: fcvtzs x13, s31, #0 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [1, 64] +//
CHECK-ERROR-NEXT: fcvtzs x19, s20, #65 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: fcvtzs sp, s19, #14 +// CHECK-ERROR-NEXT: ^ + + fcvtzu w13, s31, #0 + fcvtzu w19, s20, #33 + fcvtzu wsp, s19, #14 +// CHECK-ERROR-NEXT: error: expected integer in range [1, 32] +// CHECK-ERROR-NEXT: fcvtzu w13, s31, #0 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [1, 32] +// CHECK-ERROR-NEXT: fcvtzu w19, s20, #33 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: fcvtzu wsp, s19, #14 +// CHECK-ERROR-NEXT: ^ + + fcvtzu x13, s31, #0 + fcvtzu x19, s20, #65 + fcvtzu sp, s19, #14 +// CHECK-ERROR-NEXT: error: expected integer in range [1, 64] +// CHECK-ERROR-NEXT: fcvtzu x13, s31, #0 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [1, 64] +// CHECK-ERROR-NEXT: fcvtzu x19, s20, #65 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: fcvtzu sp, s19, #14 +// CHECK-ERROR-NEXT: ^ + + scvtf w13, s31, #0 + scvtf w19, s20, #33 + scvtf wsp, s19, #14 +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: scvtf w13, s31, #0 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: scvtf w19, s20, #33 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: scvtf wsp, s19, #14 +// CHECK-ERROR-NEXT: ^ + + scvtf x13, s31, #0 + scvtf x19, s20, #65 + scvtf sp, s19, #14 +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: scvtf x13, s31, #0 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: scvtf x19, s20, #65 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: scvtf sp, s19, #14 +// CHECK-ERROR-NEXT: ^ + + ucvtf w13, s31, #0 + ucvtf w19, s20, #33 + ucvtf wsp, s19, #14 +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: ucvtf w13, s31, #0 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: ucvtf w19, s20, #33 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: ucvtf wsp, s19, #14 +// CHECK-ERROR-NEXT: ^ + + ucvtf x13, s31, #0 + ucvtf x19, s20, #65 + ucvtf sp, s19, #14 +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: ucvtf x13, s31, #0 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: ucvtf x19, s20, #65 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: ucvtf sp, s19, #14 +// CHECK-ERROR-NEXT: ^ + +//------------------------------------------------------------------------------ +// Floating-point immediate +//------------------------------------------------------------------------------ + ;; Exponent too large + fmov d3, #0.0625 + fmov s2, #32.0 +// CHECK-ERROR: error: expected compatible register or floating-point constant +// CHECK-ERROR-NEXT: fmov d3, #0.0625 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected compatible register or floating-point constant +// CHECK-ERROR-NEXT: fmov s2, #32.0 +// CHECK-ERROR-NEXT: ^ + + ;; Fraction too precise + fmov s9, #1.03125 + fmov s28, #1.96875 +// CHECK-ERROR: error: expected compatible register or 
floating-point constant +// CHECK-ERROR-NEXT: fmov s9, #1.03125 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected compatible register or floating-point constant +// CHECK-ERROR-NEXT: fmov s28, #1.96875 +// CHECK-ERROR-NEXT: ^ + + ;; No particular reason, but a striking omission + fmov d0, #0.0 +// CHECK-ERROR: error: expected compatible register or floating-point constant +// CHECK-ERROR-NEXT: fmov d0, #0.0 +// CHECK-ERROR-NEXT: ^ + +//------------------------------------------------------------------------------ +// Floating-point <-> integer conversion +//------------------------------------------------------------------------------ + + fmov x3, v0.d[0] + fmov v29.1d[1], x2 + fmov x7, v0.d[2] + fcvtns sp, s5 + scvtf s6, wsp +// CHECK-ERROR: error: expected lane specifier '[1]' +// CHECK-ERROR-NEXT: fmov x3, v0.d[0] +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: lane number incompatible with layout +// CHECK-ERROR-NEXT: fmov v29.1d[1], x2 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: lane number incompatible with layout +// CHECK-ERROR-NEXT: fmov x7, v0.d[2] +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: fcvtns sp, s5 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: scvtf s6, wsp +// CHECK-ERROR-NEXT: ^ + +//------------------------------------------------------------------------------ +// Load-register (literal) +//------------------------------------------------------------------------------ + + ldr sp, some_label + ldrsw w3, somewhere +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR-NEXT: ldr sp, some_label +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: ldrsw w3, somewhere +// CHECK-ERROR-NEXT: ^ + + ldrsw x2, #1048576 + ldr q0, #-1048580 + ldr x0, #2 +// CHECK-ERROR: error: expected label or encodable integer pc offset +// CHECK-ERROR-NEXT: ldrsw x2, #1048576 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected label or encodable integer pc offset +// CHECK-ERROR-NEXT: ldr q0, #-1048580 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected label or encodable integer pc offset +// CHECK-ERROR-NEXT: ldr x0, #2 +// CHECK-ERROR-NEXT: ^ + +//------------------------------------------------------------------------------ +// Load/store exclusive +//------------------------------------------------------------------------------ + + stxrb w2, x3, [x4, #20] + stlxrh w10, w11, [w2] +// CHECK-ERROR: error: expected '#0' +// CHECK-ERROR-NEXT: stxrb w2, x3, [x4, #20] +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR-NEXT: stlxrh w10, w11, [w2] +// CHECK-ERROR-NEXT: ^ + + stlxr x20, w21, [sp] +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR-NEXT: stlxr x20, w21, [sp] +// CHECK-ERROR-NEXT: ^ + + ldxr sp, [sp] +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR-NEXT: ldxr sp, [sp] +// CHECK-ERROR-NEXT: ^ + + stxp x1, x2, x3, [x4] +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR-NEXT: stxp x1, x2, x3, [x4] +// CHECK-ERROR-NEXT: ^ + + stlxp w5, x1, w4, [x5] +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR-NEXT: stlxp w5, x1, w4, [x5] +// CHECK-ERROR-NEXT: ^ + + stlxp w17, w6, x7, [x22] +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR-NEXT: stlxp w17, w6, x7, [x22] +// CHECK-ERROR-NEXT: ^ + 
+//------------------------------------------------------------------------------ +// Load/store (unscaled immediate) +//------------------------------------------------------------------------------ + + ldurb w2, [sp, #256] + sturh w17, [x1, #256] + ldursw x20, [x1, #256] + ldur x12, [sp, #256] +// CHECK-ERROR: error: expected integer in range [-256, 255] +// CHECK-ERROR-NEXT: ldurb w2, [sp, #256] +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [-256, 255] +// CHECK-ERROR-NEXT: sturh w17, [x1, #256] +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [-256, 255] +// CHECK-ERROR-NEXT: ldursw x20, [x1, #256] +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [-256, 255] +// CHECK-ERROR-NEXT: ldur x12, [sp, #256] +// CHECK-ERROR-NEXT: ^ + + stur h2, [x2, #-257] + stur b2, [x2, #-257] + ldursb x9, [sp, #-257] + ldur w2, [x30, #-257] + stur q9, [x20, #-257] +// CHECK-ERROR: error: expected integer in range [-256, 255] +// CHECK-ERROR-NEXT: stur h2, [x2, #-257] +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [-256, 255] +// CHECK-ERROR-NEXT: stur b2, [x2, #-257] +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [-256, 255] +// CHECK-ERROR-NEXT: ldursb x9, [sp, #-257] +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [-256, 255] +// CHECK-ERROR-NEXT: ldur w2, [x30, #-257] +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [-256, 255] +// CHECK-ERROR-NEXT: stur q9, [x20, #-257] +// CHECK-ERROR-NEXT: ^ + + prfum pstl3strm, [xzr] +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR-NEXT: prfum pstl3strm, [xzr] +// CHECK-ERROR-NEXT: ^ + +//------------------------------------------------------------------------------ +// Load-store register (immediate post-indexed) +//------------------------------------------------------------------------------ + ldr x3, [x4, #25], #0 + ldr x4, [x9, #0], #4 +// CHECK-ERROR: error: expected symbolic reference or integer in range [0, 32760] +// CHECK-ERROR-NEXT: ldr x3, [x4, #25], #0 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: ldr x4, [x9, #0], #4 +// CHECK-ERROR-NEXT: ^ + + strb w1, [x19], #256 + strb w9, [sp], #-257 + strh w1, [x19], #256 + strh w9, [sp], #-257 + str w1, [x19], #256 + str w9, [sp], #-257 +// CHECK-ERROR: error: expected integer in range [-256, 255] +// CHECK-ERROR-NEXT: strb w1, [x19], #256 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [-256, 255] +// CHECK-ERROR-NEXT: strb w9, [sp], #-257 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [-256, 255] +// CHECK-ERROR-NEXT: strh w1, [x19], #256 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [-256, 255] +// CHECK-ERROR-NEXT: strh w9, [sp], #-257 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [-256, 255] +// CHECK-ERROR-NEXT: str w1, [x19], #256 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [-256, 255] +// CHECK-ERROR-NEXT: str w9, [sp], #-257 +// CHECK-ERROR-NEXT: ^ + + ldrb w1, [x19], #256 + ldrb w9, [sp], #-257 + ldrh w1, [x19], #256 + ldrh w9, [sp], #-257 + ldr w1, [x19], #256 + ldr w9, [sp], #-257 +// CHECK-ERROR: error: expected integer in range [-256, 255] +// CHECK-ERROR-NEXT: ldrb w1, [x19], #256 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected 
integer in range [-256, 255] +// CHECK-ERROR-NEXT: ldrb w9, [sp], #-257 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [-256, 255] +// CHECK-ERROR-NEXT: ldrh w1, [x19], #256 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [-256, 255] +// CHECK-ERROR-NEXT: ldrh w9, [sp], #-257 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [-256, 255] +// CHECK-ERROR-NEXT: ldr w1, [x19], #256 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [-256, 255] +// CHECK-ERROR-NEXT: ldr w9, [sp], #-257 +// CHECK-ERROR-NEXT: ^ + + ldrsb x2, [x3], #256 + ldrsb x22, [x13], #-257 + ldrsh x2, [x3], #256 + ldrsh x22, [x13], #-257 + ldrsw x2, [x3], #256 + ldrsw x22, [x13], #-257 +// CHECK-ERROR: error: expected integer in range [-256, 255] +// CHECK-ERROR-NEXT: ldrsb x2, [x3], #256 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [-256, 255] +// CHECK-ERROR-NEXT: ldrsb x22, [x13], #-257 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [-256, 255] +// CHECK-ERROR-NEXT: ldrsh x2, [x3], #256 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [-256, 255] +// CHECK-ERROR-NEXT: ldrsh x22, [x13], #-257 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [-256, 255] +// CHECK-ERROR-NEXT: ldrsw x2, [x3], #256 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [-256, 255] +// CHECK-ERROR-NEXT: ldrsw x22, [x13], #-257 +// CHECK-ERROR-NEXT: ^ + + ldrsb w2, [x3], #256 + ldrsb w22, [x13], #-257 + ldrsh w2, [x3], #256 + ldrsh w22, [x13], #-257 +// CHECK-ERROR: error: expected integer in range [-256, 255] +// CHECK-ERROR-NEXT: ldrsb w2, [x3], #256 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [-256, 255] +// CHECK-ERROR-NEXT: ldrsb w22, [x13], #-257 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [-256, 255] +// CHECK-ERROR-NEXT: ldrsh w2, [x3], #256 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [-256, 255] +// CHECK-ERROR-NEXT: ldrsh w22, [x13], #-257 +// CHECK-ERROR-NEXT: ^ + + str b3, [x3], #256 + str b3, [x13], #-257 + str h3, [x3], #256 + str h3, [x13], #-257 + str s3, [x3], #256 + str s3, [x13], #-257 + str d3, [x3], #256 + str d3, [x13], #-257 + str q3, [x3], #256 + str q3, [x13], #-257 +// CHECK-ERROR: error: expected integer in range [-256, 255] +// CHECK-ERROR-NEXT: str b3, [x3], #256 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [-256, 255] +// CHECK-ERROR-NEXT: str b3, [x13], #-257 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [-256, 255] +// CHECK-ERROR-NEXT: str h3, [x3], #256 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [-256, 255] +// CHECK-ERROR-NEXT: str h3, [x13], #-257 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [-256, 255] +// CHECK-ERROR-NEXT: str s3, [x3], #256 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [-256, 255] +// CHECK-ERROR-NEXT: str s3, [x13], #-257 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [-256, 255] +// CHECK-ERROR-NEXT: str d3, [x3], #256 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [-256, 255] +// CHECK-ERROR-NEXT: str d3, [x13], #-257 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in 
range [-256, 255] +// CHECK-ERROR-NEXT: str q3, [x3], #256 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [-256, 255] +// CHECK-ERROR-NEXT: str q3, [x13], #-257 +// CHECK-ERROR-NEXT: ^ + + ldr b3, [x3], #256 + ldr b3, [x13], #-257 + ldr h3, [x3], #256 + ldr h3, [x13], #-257 + ldr s3, [x3], #256 + ldr s3, [x13], #-257 + ldr d3, [x3], #256 + ldr d3, [x13], #-257 + ldr q3, [x3], #256 + ldr q3, [x13], #-257 +// CHECK-ERROR: error: expected integer in range [-256, 255] +// CHECK-ERROR-NEXT: ldr b3, [x3], #256 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [-256, 255] +// CHECK-ERROR-NEXT: ldr b3, [x13], #-257 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [-256, 255] +// CHECK-ERROR-NEXT: ldr h3, [x3], #256 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [-256, 255] +// CHECK-ERROR-NEXT: ldr h3, [x13], #-257 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [-256, 255] +// CHECK-ERROR-NEXT: ldr s3, [x3], #256 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [-256, 255] +// CHECK-ERROR-NEXT: ldr s3, [x13], #-257 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [-256, 255] +// CHECK-ERROR-NEXT: ldr d3, [x3], #256 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [-256, 255] +// CHECK-ERROR-NEXT: ldr d3, [x13], #-257 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [-256, 255] +// CHECK-ERROR-NEXT: ldr q3, [x3], #256 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [-256, 255] +// CHECK-ERROR-NEXT: ldr q3, [x13], #-257 +// CHECK-ERROR-NEXT: ^ + +//------------------------------------------------------------------------------ +// Load-store register (immediate pre-indexed) +//------------------------------------------------------------------------------ + + ldr x3, [x4]! +// CHECK-ERROR: error: +// CHECK-ERROR-NEXT: ldr x3, [x4]! +// CHECK-ERROR-NEXT: ^ + + strb w1, [x19, #256]! + strb w9, [sp, #-257]! + strh w1, [x19, #256]! + strh w9, [sp, #-257]! + str w1, [x19, #256]! + str w9, [sp, #-257]! +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR-NEXT: strb w1, [x19, #256]! +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [-256, 255] +// CHECK-ERROR-NEXT: strb w9, [sp, #-257]! +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: strh w1, [x19, #256]! +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [-256, 255] +// CHECK-ERROR-NEXT: strh w9, [sp, #-257]! +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: str w1, [x19, #256]! +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [-256, 255] +// CHECK-ERROR-NEXT: str w9, [sp, #-257]! +// CHECK-ERROR-NEXT: ^ + + ldrb w1, [x19, #256]! + ldrb w9, [sp, #-257]! + ldrh w1, [x19, #256]! + ldrh w9, [sp, #-257]! + ldr w1, [x19, #256]! + ldr w9, [sp, #-257]! +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR-NEXT: ldrb w1, [x19, #256]! +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [-256, 255] +// CHECK-ERROR-NEXT: ldrb w9, [sp, #-257]! +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: ldrh w1, [x19, #256]! 
+// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [-256, 255] +// CHECK-ERROR-NEXT: ldrh w9, [sp, #-257]! +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: ldr w1, [x19, #256]! +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [-256, 255] +// CHECK-ERROR-NEXT: ldr w9, [sp, #-257]! +// CHECK-ERROR-NEXT: ^ + + ldrsb x2, [x3, #256]! + ldrsb x22, [x13, #-257]! + ldrsh x2, [x3, #256]! + ldrsh x22, [x13, #-257]! + ldrsw x2, [x3, #256]! + ldrsw x22, [x13, #-257]! +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR-NEXT: ldrsb x2, [x3, #256]! +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [-256, 255] +// CHECK-ERROR-NEXT: ldrsb x22, [x13, #-257]! +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: ldrsh x2, [x3, #256]! +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [-256, 255] +// CHECK-ERROR-NEXT: ldrsh x22, [x13, #-257]! +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: ldrsw x2, [x3, #256]! +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [-256, 255] +// CHECK-ERROR-NEXT: ldrsw x22, [x13, #-257]! +// CHECK-ERROR-NEXT: ^ + + ldrsb w2, [x3, #256]! + ldrsb w22, [x13, #-257]! + ldrsh w2, [x3, #256]! + ldrsh w22, [x13, #-257]! +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR-NEXT: ldrsb w2, [x3, #256]! +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [-256, 255] +// CHECK-ERROR-NEXT: ldrsb w22, [x13, #-257]! +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: ldrsh w2, [x3, #256]! +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [-256, 255] +// CHECK-ERROR-NEXT: ldrsh w22, [x13, #-257]! +// CHECK-ERROR-NEXT: ^ + + str b3, [x3, #256]! + str b3, [x13, #-257]! + str h3, [x3, #256]! + str h3, [x13, #-257]! + str s3, [x3, #256]! + str s3, [x13, #-257]! + str d3, [x3, #256]! + str d3, [x13, #-257]! +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR-NEXT: str b3, [x3, #256]! +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [-256, 255] +// CHECK-ERROR-NEXT: str b3, [x13, #-257]! +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: str h3, [x3, #256]! +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [-256, 255] +// CHECK-ERROR-NEXT: str h3, [x13, #-257]! +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: str s3, [x3, #256]! +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [-256, 255] +// CHECK-ERROR-NEXT: str s3, [x13, #-257]! +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: str d3, [x3, #256]! +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [-256, 255] +// CHECK-ERROR-NEXT: str d3, [x13, #-257]! +// CHECK-ERROR-NEXT: ^ + + ldr b3, [x3, #256]! + ldr b3, [x13, #-257]! + ldr h3, [x3, #256]! + ldr h3, [x13, #-257]! + ldr s3, [x3, #256]! + ldr s3, [x13, #-257]! + ldr d3, [x3, #256]! + ldr d3, [x13, #-257]! +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR-NEXT: ldr b3, [x3, #256]! 
+// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [-256, 255] +// CHECK-ERROR-NEXT: ldr b3, [x13, #-257]! +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: ldr h3, [x3, #256]! +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [-256, 255] +// CHECK-ERROR-NEXT: ldr h3, [x13, #-257]! +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: ldr s3, [x3, #256]! +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [-256, 255] +// CHECK-ERROR-NEXT: ldr s3, [x13, #-257]! +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: ldr d3, [x3, #256]! +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [-256, 255] +// CHECK-ERROR-NEXT: ldr d3, [x13, #-257]! +// CHECK-ERROR-NEXT: ^ + +//------------------------------------------------------------------------------ +// Load/store (unprivileged) +//------------------------------------------------------------------------------ + + ldtrb w2, [sp, #256] + sttrh w17, [x1, #256] + ldtrsw x20, [x1, #256] + ldtr x12, [sp, #256] +// CHECK-ERROR: error: expected integer in range [-256, 255] +// CHECK-ERROR-NEXT: ldtrb w2, [sp, #256] +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [-256, 255] +// CHECK-ERROR-NEXT: sttrh w17, [x1, #256] +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [-256, 255] +// CHECK-ERROR-NEXT: ldtrsw x20, [x1, #256] +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [-256, 255] +// CHECK-ERROR-NEXT: ldtr x12, [sp, #256] +// CHECK-ERROR-NEXT: ^ + + sttr h2, [x2, #-257] + sttr b2, [x2, #-257] + ldtrsb x9, [sp, #-257] + ldtr w2, [x30, #-257] + sttr q9, [x20, #-257] +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR-NEXT: sttr h2, [x2, #-257] +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: sttr b2, [x2, #-257] +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [-256, 255] +// CHECK-ERROR-NEXT: ldtrsb x9, [sp, #-257] +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [-256, 255] +// CHECK-ERROR-NEXT: ldtr w2, [x30, #-257] +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: sttr q9, [x20, #-257] +// CHECK-ERROR-NEXT: ^ + + +//------------------------------------------------------------------------------ +// Load/store (unsigned immediate) +//------------------------------------------------------------------------------ + +//// Out of range immediates + ldr q0, [x11, #65536] + ldr x0, [sp, #32768] + ldr w0, [x4, #16384] + ldrh w2, [x21, #8192] + ldrb w3, [x12, #4096] +// CHECK-ERROR: error: expected integer in range [-256, 255] +// CHECK-ERROR-NEXT: ldr q0, [x11, #65536] +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [-256, 255] +// CHECK-ERROR-NEXT: ldr x0, [sp, #32768] +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [-256, 255] +// CHECK-ERROR-NEXT: ldr w0, [x4, #16384] +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [-256, 255] +// CHECK-ERROR-NEXT: ldrh w2, [x21, #8192] +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [-256, 255] +// CHECK-ERROR-NEXT: ldrb w3, [x12, #4096] +// 
CHECK-ERROR-NEXT: ^ + +//// Misaligned addresses + ldr w0, [x0, #2] + ldrsh w2, [x0, #123] + str q0, [x0, #8] +// CHECK-ERROR: error: too few operands for instruction +// CHECK-ERROR-NEXT: ldr w0, [x0, #2] +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: too few operands for instruction +// CHECK-ERROR-NEXT: ldrsh w2, [x0, #123] +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: too few operands for instruction +// CHECK-ERROR-NEXT: str q0, [x0, #8] +// CHECK-ERROR-NEXT: ^ + +//// 32-bit addresses + ldr w0, [w20] + ldrsh x3, [wsp] +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR-NEXT: ldr w0, [w20] +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: ldrsh x3, [wsp] +// CHECK-ERROR-NEXT: ^ + +//// Store things + strb w0, [wsp] + strh w31, [x23, #1] + str x5, [x22, #12] + str w7, [x12, #16384] +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR-NEXT: strb w0, [wsp] +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR-NEXT: strh w31, [x23, #1] +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: too few operands for instruction +// CHECK-ERROR-NEXT: str x5, [x22, #12] +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [-256, 255] +// CHECK-ERROR-NEXT: str w7, [x12, #16384] +// CHECK-ERROR-NEXT: ^ + +//// Bad PRFMs + prfm #-1, [sp] + prfm #32, [sp, #8] + prfm pldl1strm, [w3, #8] + prfm wibble, [sp] +// CHECK-ERROR: error: Invalid immediate for instruction +// CHECK-ERROR-NEXT: prfm #-1, [sp] +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: Invalid immediate for instruction +// CHECK-ERROR-NEXT: prfm #32, [sp, #8] +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: prfm pldl1strm, [w3, #8] +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: operand specifier not recognised +// CHECK-ERROR-NEXT: prfm wibble, [sp] +// CHECK-ERROR-NEXT: ^ + +//------------------------------------------------------------------------------ +// Load/store register (register offset) +//------------------------------------------------------------------------------ + + ldr w3, [xzr, x3] + ldr w4, [x0, x4, lsl] + ldr w9, [x5, x5, uxtw] + ldr w10, [x6, x9, sxtw #2] + ldr w11, [x7, w2, lsl #2] + ldr w12, [x8, w1, sxtx] +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: ldr w3, [xzr, x3] +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected #imm after shift specifier +// CHECK-ERROR-NEXT: ldr w4, [x0, x4, lsl] +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected 'lsl' or 'sxtx' with optional shift of #0 or #2 +// CHECK-ERROR-NEXT: ldr w9, [x5, x5, uxtw] +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected 'lsl' or 'sxtx' with optional shift of #0 or #2 +// CHECK-ERROR-NEXT: ldr w10, [x6, x9, sxtw #2] +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected 'uxtw' or 'sxtw' with optional shift of #0 or #2 +// CHECK-ERROR-NEXT: ldr w11, [x7, w2, lsl #2] +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected 'uxtw' or 'sxtw' with optional shift of #0 or #2 +// CHECK-ERROR-NEXT: ldr w12, [x8, w1, sxtx] +// CHECK-ERROR-NEXT: ^ + + ldrsb w9, [x4, x2, lsl #-1] + strb w9, [x4, x2, lsl #1] +// CHECK-ERROR-NEXT: error: expected integer shift amount +// CHECK-ERROR-NEXT: ldrsb w9, [x4, x2, lsl #-1] +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected 'lsl' or 'sxtx' with optional shift of #0 +// CHECK-ERROR-NEXT: strb w9, [x4, 
x2, lsl #1] +// CHECK-ERROR-NEXT: ^ + + ldrsh w9, [x4, x2, lsl #-1] + ldr h13, [x4, w2, uxtw #2] +// CHECK-ERROR-NEXT: error: expected integer shift amount +// CHECK-ERROR-NEXT: ldrsh w9, [x4, x2, lsl #-1] +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected 'uxtw' or 'sxtw' with optional shift of #0 or #1 +// CHECK-ERROR-NEXT: ldr h13, [x4, w2, uxtw #2] +// CHECK-ERROR-NEXT: ^ + + str w9, [x5, w9, sxtw #-1] + str s3, [sp, w9, uxtw #1] + ldrsw x9, [x15, x4, sxtx #3] +// CHECK-ERROR-NEXT: error: expected integer shift amount +// CHECK-ERROR-NEXT: str w9, [x5, w9, sxtw #-1] +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected 'uxtw' or 'sxtw' with optional shift of #0 or #2 +// CHECK-ERROR-NEXT: str s3, [sp, w9, uxtw #1] +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected 'lsl' or 'sxtx' with optional shift of #0 or #2 +// CHECK-ERROR-NEXT: ldrsw x9, [x15, x4, sxtx #3] +// CHECK-ERROR-NEXT: ^ + + str xzr, [x5, x9, sxtx #-1] + prfm pldl3keep, [sp, x20, lsl #2] + ldr d3, [x20, wzr, uxtw #4] +// CHECK-ERROR-NEXT: error: expected integer shift amount +// CHECK-ERROR-NEXT: str xzr, [x5, x9, sxtx #-1] +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected 'lsl' or 'sxtx' with optional shift of #0 or #3 +// CHECK-ERROR-NEXT: prfm pldl3keep, [sp, x20, lsl #2] +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected 'uxtw' or 'sxtw' with optional shift of #0 or #3 +// CHECK-ERROR-NEXT: ldr d3, [x20, wzr, uxtw #4] +// CHECK-ERROR-NEXT: ^ + + ldr q5, [sp, x2, lsl #-1] + ldr q10, [x20, w4, uxtw #2] + str q21, [x20, w4, uxtw #5] +// CHECK-ERROR-NEXT: error: expected integer shift amount +// CHECK-ERROR-NEXT: ldr q5, [sp, x2, lsl #-1] +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected 'lsl' or 'sxtw' with optional shift of #0 or #4 +// CHECK-ERROR-NEXT: ldr q10, [x20, w4, uxtw #2] +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected 'lsl' or 'sxtw' with optional shift of #0 or #4 +// CHECK-ERROR-NEXT: str q21, [x20, w4, uxtw #5] +// CHECK-ERROR-NEXT: ^ + +//------------------------------------------------------------------------------ +// Load/store register pair (offset) +//------------------------------------------------------------------------------ + ldp w3, w2, [x4, #1] + stp w1, w2, [x3, #253] + stp w9, w10, [x5, #256] + ldp w11, w12, [x9, #-260] + stp wsp, w9, [sp] +// CHECK-ERROR: error: expected integer multiple of 4 in range [-256, 252] +// CHECK-ERROR-NEXT: ldp w3, w2, [x4, #1] +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer multiple of 4 in range [-256, 252] +// CHECK-ERROR-NEXT: stp w1, w2, [x3, #253] +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer multiple of 4 in range [-256, 252] +// CHECK-ERROR-NEXT: stp w9, w10, [x5, #256] +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer multiple of 4 in range [-256, 252] +// CHECK-ERROR-NEXT: ldp w11, w12, [x9, #-260] +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: stp wsp, w9, [sp] +// CHECK-ERROR-NEXT: ^ + + ldpsw x9, x2, [sp, #2] + ldpsw x1, x2, [x10, #256] + ldpsw x3, x4, [x11, #-260] +// CHECK-ERROR: error: expected integer multiple of 4 in range [-256, 252] +// CHECK-ERROR-NEXT: ldpsw x9, x2, [sp, #2] +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer multiple of 4 in range [-256, 252] +// CHECK-ERROR-NEXT: ldpsw x1, x2, [x10, #256] +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer multiple of 4 in range [-256, 252] +// 
CHECK-ERROR-NEXT: ldpsw x3, x4, [x11, #-260] +// CHECK-ERROR-NEXT: ^ + + ldp x2, x5, [sp, #4] + ldp x5, x6, [x9, #512] + stp x7, x8, [x10, #-520] +// CHECK-ERROR: error: expected integer multiple of 8 in range [-512, 508] +// CHECK-ERROR-NEXT: ldp x2, x5, [sp, #4] +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer multiple of 8 in range [-512, 508] +// CHECK-ERROR-NEXT: ldp x5, x6, [x9, #512] +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer multiple of 8 in range [-512, 508] +// CHECK-ERROR-NEXT: stp x7, x8, [x10, #-520] +// CHECK-ERROR-NEXT: ^ + + ldp sp, x3, [x10] + stp x3, sp, [x9] +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR-NEXT: ldp sp, x3, [x10] +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: stp x3, sp, [x9] +// CHECK-ERROR-NEXT: ^ + + stp s3, s5, [sp, #-2] + ldp s6, s26, [x4, #-260] + stp s13, s19, [x5, #256] +// CHECK-ERROR: error: expected integer multiple of 4 in range [-256, 252] +// CHECK-ERROR-NEXT: stp s3, s5, [sp, #-2] +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer multiple of 4 in range [-256, 252] +// CHECK-ERROR-NEXT: ldp s6, s26, [x4, #-260] +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer multiple of 4 in range [-256, 252] +// CHECK-ERROR-NEXT: stp s13, s19, [x5, #256] +// CHECK-ERROR-NEXT: ^ + + ldp d3, d4, [xzr] + ldp d5, d6, [x0, #512] + stp d7, d8, [x0, #-520] +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR-NEXT: ldp d3, d4, [xzr] +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer multiple of 8 in range [-512, 508] +// CHECK-ERROR-NEXT: ldp d5, d6, [x0, #512] +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer multiple of 8 in range [-512, 508] +// CHECK-ERROR-NEXT: stp d7, d8, [x0, #-520] +// CHECK-ERROR-NEXT: ^ + + ldp d3, q2, [sp] + ldp q3, q5, [sp, #8] + stp q20, q25, [x5, #1024] + ldp q30, q15, [x23, #-1040] +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR-NEXT: ldp d3, q2, [sp] +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer multiple of 16 in range [-1024, 1016] +// CHECK-ERROR-NEXT: ldp q3, q5, [sp, #8] +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer multiple of 16 in range [-1024, 1016] +// CHECK-ERROR-NEXT: stp q20, q25, [x5, #1024] +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer multiple of 16 in range [-1024, 1016] +// CHECK-ERROR-NEXT: ldp q30, q15, [x23, #-1040] +// CHECK-ERROR-NEXT: ^ + +//------------------------------------------------------------------------------ +// Load/store register pair (post-indexed) +//------------------------------------------------------------------------------ + + ldp w3, w2, [x4], #1 + stp w1, w2, [x3], #253 + stp w9, w10, [x5], #256 + ldp w11, w12, [x9], #-260 + stp wsp, w9, [sp], #0 +// CHECK-ERROR: error: expected integer multiple of 4 in range [-256, 252] +// CHECK-ERROR-NEXT: ldp w3, w2, [x4], #1 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer multiple of 4 in range [-256, 252] +// CHECK-ERROR-NEXT: stp w1, w2, [x3], #253 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer multiple of 4 in range [-256, 252] +// CHECK-ERROR-NEXT: stp w9, w10, [x5], #256 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer multiple of 4 in range [-256, 252] +// CHECK-ERROR-NEXT: ldp w11, w12, [x9], #-260 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: 
invalid operand for instruction +// CHECK-ERROR-NEXT: stp wsp, w9, [sp], #0 +// CHECK-ERROR-NEXT: ^ + + ldpsw x9, x2, [sp], #2 + ldpsw x1, x2, [x10], #256 + ldpsw x3, x4, [x11], #-260 +// CHECK-ERROR: error: expected integer multiple of 4 in range [-256, 252] +// CHECK-ERROR-NEXT: ldpsw x9, x2, [sp], #2 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer multiple of 4 in range [-256, 252] +// CHECK-ERROR-NEXT: ldpsw x1, x2, [x10], #256 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer multiple of 4 in range [-256, 252] +// CHECK-ERROR-NEXT: ldpsw x3, x4, [x11], #-260 +// CHECK-ERROR-NEXT: ^ + + ldp x2, x5, [sp], #4 + ldp x5, x6, [x9], #512 + stp x7, x8, [x10], #-520 +// CHECK-ERROR: error: expected integer multiple of 8 in range [-512, 508] +// CHECK-ERROR-NEXT: ldp x2, x5, [sp], #4 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer multiple of 8 in range [-512, 508] +// CHECK-ERROR-NEXT: ldp x5, x6, [x9], #512 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer multiple of 8 in range [-512, 508] +// CHECK-ERROR-NEXT: stp x7, x8, [x10], #-520 +// CHECK-ERROR-NEXT: ^ + + ldp sp, x3, [x10], #0 + stp x3, sp, [x9], #0 +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR-NEXT: ldp sp, x3, [x10], #0 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: stp x3, sp, [x9], #0 +// CHECK-ERROR-NEXT: ^ + + stp s3, s5, [sp], #-2 + ldp s6, s26, [x4], #-260 + stp s13, s19, [x5], #256 +// CHECK-ERROR: error: expected integer multiple of 4 in range [-256, 252] +// CHECK-ERROR-NEXT: stp s3, s5, [sp], #-2 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer multiple of 4 in range [-256, 252] +// CHECK-ERROR-NEXT: ldp s6, s26, [x4], #-260 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer multiple of 4 in range [-256, 252] +// CHECK-ERROR-NEXT: stp s13, s19, [x5], #256 +// CHECK-ERROR-NEXT: ^ + + ldp d3, d4, [xzr], #0 + ldp d5, d6, [x0], #512 + stp d7, d8, [x0], #-520 +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR-NEXT: ldp d3, d4, [xzr], #0 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer multiple of 8 in range [-512, 508] +// CHECK-ERROR-NEXT: ldp d5, d6, [x0], #512 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer multiple of 8 in range [-512, 508] +// CHECK-ERROR-NEXT: stp d7, d8, [x0], #-520 +// CHECK-ERROR-NEXT: ^ + + ldp d3, q2, [sp], #0 + ldp q3, q5, [sp], #8 + stp q20, q25, [x5], #1024 + ldp q30, q15, [x23], #-1040 +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR-NEXT: ldp d3, q2, [sp], #0 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer multiple of 16 in range [-1024, 1016] +// CHECK-ERROR-NEXT: ldp q3, q5, [sp], #8 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer multiple of 16 in range [-1024, 1016] +// CHECK-ERROR-NEXT: stp q20, q25, [x5], #1024 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer multiple of 16 in range [-1024, 1016] +// CHECK-ERROR-NEXT: ldp q30, q15, [x23], #-1040 +// CHECK-ERROR-NEXT: ^ + +//------------------------------------------------------------------------------ +// Load/store register pair (pre-indexed) +//------------------------------------------------------------------------------ + + ldp w3, w2, [x4, #1]! + stp w1, w2, [x3, #253]! + stp w9, w10, [x5, #256]! + ldp w11, w12, [x9, #-260]! + stp wsp, w9, [sp, #0]! 
+// CHECK-ERROR: error: expected integer multiple of 4 in range [-256, 252] +// CHECK-ERROR-NEXT: ldp w3, w2, [x4, #1]! +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer multiple of 4 in range [-256, 252] +// CHECK-ERROR-NEXT: stp w1, w2, [x3, #253]! +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer multiple of 4 in range [-256, 252] +// CHECK-ERROR-NEXT: stp w9, w10, [x5, #256]! +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer multiple of 4 in range [-256, 252] +// CHECK-ERROR-NEXT: ldp w11, w12, [x9, #-260]! +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: stp wsp, w9, [sp, #0]! +// CHECK-ERROR-NEXT: ^ + + ldpsw x9, x2, [sp, #2]! + ldpsw x1, x2, [x10, #256]! + ldpsw x3, x4, [x11, #-260]! +// CHECK-ERROR: error: expected integer multiple of 4 in range [-256, 252] +// CHECK-ERROR-NEXT: ldpsw x9, x2, [sp, #2]! +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer multiple of 4 in range [-256, 252] +// CHECK-ERROR-NEXT: ldpsw x1, x2, [x10, #256]! +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer multiple of 4 in range [-256, 252] +// CHECK-ERROR-NEXT: ldpsw x3, x4, [x11, #-260]! +// CHECK-ERROR-NEXT: ^ + + ldp x2, x5, [sp, #4]! + ldp x5, x6, [x9, #512]! + stp x7, x8, [x10, #-520]! +// CHECK-ERROR: error: expected integer multiple of 8 in range [-512, 508] +// CHECK-ERROR-NEXT: ldp x2, x5, [sp, #4]! +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer multiple of 8 in range [-512, 508] +// CHECK-ERROR-NEXT: ldp x5, x6, [x9, #512]! +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer multiple of 8 in range [-512, 508] +// CHECK-ERROR-NEXT: stp x7, x8, [x10, #-520]! +// CHECK-ERROR-NEXT: ^ + + ldp sp, x3, [x10, #0]! + stp x3, sp, [x9, #0]! +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR-NEXT: ldp sp, x3, [x10, #0]! +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: stp x3, sp, [x9, #0]! +// CHECK-ERROR-NEXT: ^ + + stp s3, s5, [sp, #-2]! + ldp s6, s26, [x4, #-260]! + stp s13, s19, [x5, #256]! +// CHECK-ERROR: error: expected integer multiple of 4 in range [-256, 252] +// CHECK-ERROR-NEXT: stp s3, s5, [sp, #-2]! +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer multiple of 4 in range [-256, 252] +// CHECK-ERROR-NEXT: ldp s6, s26, [x4, #-260]! +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer multiple of 4 in range [-256, 252] +// CHECK-ERROR-NEXT: stp s13, s19, [x5, #256]! +// CHECK-ERROR-NEXT: ^ + + ldp d3, d4, [xzr, #0]! + ldp d5, d6, [x0, #512]! + stp d7, d8, [x0, #-520]! +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR-NEXT: ldp d3, d4, [xzr, #0]! +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer multiple of 8 in range [-512, 508] +// CHECK-ERROR-NEXT: ldp d5, d6, [x0, #512]! +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer multiple of 8 in range [-512, 508] +// CHECK-ERROR-NEXT: stp d7, d8, [x0, #-520]! +// CHECK-ERROR-NEXT: ^ + + ldp d3, q2, [sp, #0]! + ldp q3, q5, [sp, #8]! + stp q20, q25, [x5, #1024]! + ldp q30, q15, [x23, #-1040]! +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR-NEXT: ldp d3, q2, [sp, #0]! +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer multiple of 16 in range [-1024, 1016] +// CHECK-ERROR-NEXT: ldp q3, q5, [sp, #8]! 
+// CHECK-ERROR-NEXT: ^
+// CHECK-ERROR-NEXT: error: expected integer multiple of 16 in range [-1024, 1016]
+// CHECK-ERROR-NEXT: stp q20, q25, [x5, #1024]!
+// CHECK-ERROR-NEXT: ^
+// CHECK-ERROR-NEXT: error: expected integer multiple of 16 in range [-1024, 1016]
+// CHECK-ERROR-NEXT: ldp q30, q15, [x23, #-1040]!
+// CHECK-ERROR-NEXT: ^
+
+//------------------------------------------------------------------------------
+// Load/store non-temporal register pair (offset)
+//------------------------------------------------------------------------------
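+// As an illustrative aside (not checked by this test): the scaled ranges
+// exercised below all follow from the pair instructions' signed 7-bit
+// immediate multiplied by the access size, e.g.
+//   32-bit registers:   4 * [-64, 63] -> [-256, 252]
+//   64-bit registers:   8 * [-64, 63] -> [-512, 508]
+//   128-bit registers: 16 * [-64, 63] -> [-1024, 1016]
+// so a boundary-valid counterpart of the rejected forms would be something
+// like ldnp w1, w2, [x3, #252].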
+ ldnp w3, w2, [x4, #1]
+ stnp w1, w2, [x3, #253]
+ stnp w9, w10, [x5, #256]
+ ldnp w11, w12, [x9, #-260]
+ stnp wsp, w9, [sp]
+// CHECK-ERROR: error: expected integer multiple of 4 in range [-256, 252]
+// CHECK-ERROR-NEXT: ldnp w3, w2, [x4, #1]
+// CHECK-ERROR-NEXT: ^
+// CHECK-ERROR-NEXT: error: expected integer multiple of 4 in range [-256, 252]
+// CHECK-ERROR-NEXT: stnp w1, w2, [x3, #253]
+// CHECK-ERROR-NEXT: ^
+// CHECK-ERROR-NEXT: error: expected integer multiple of 4 in range [-256, 252]
+// CHECK-ERROR-NEXT: stnp w9, w10, [x5, #256]
+// CHECK-ERROR-NEXT: ^
+// CHECK-ERROR-NEXT: error: expected integer multiple of 4 in range [-256, 252]
+// CHECK-ERROR-NEXT: ldnp w11, w12, [x9, #-260]
+// CHECK-ERROR-NEXT: ^
+// CHECK-ERROR-NEXT: error: invalid operand for instruction
+// CHECK-ERROR-NEXT: stnp wsp, w9, [sp]
+// CHECK-ERROR-NEXT: ^
+
+ ldnp x2, x5, [sp, #4]
+ ldnp x5, x6, [x9, #512]
+ stnp x7, x8, [x10, #-520]
+// CHECK-ERROR: error: expected integer multiple of 8 in range [-512, 508]
+// CHECK-ERROR-NEXT: ldnp x2, x5, [sp, #4]
+// CHECK-ERROR-NEXT: ^
+// CHECK-ERROR-NEXT: error: expected integer multiple of 8 in range [-512, 508]
+// CHECK-ERROR-NEXT: ldnp x5, x6, [x9, #512]
+// CHECK-ERROR-NEXT: ^
+// CHECK-ERROR-NEXT: error: expected integer multiple of 8 in range [-512, 508]
+// CHECK-ERROR-NEXT: stnp x7, x8, [x10, #-520]
+// CHECK-ERROR-NEXT: ^
+
+ ldnp sp, x3, [x10]
+ stnp x3, sp, [x9]
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR-NEXT: ldnp sp, x3, [x10]
+// CHECK-ERROR-NEXT: ^
+// CHECK-ERROR-NEXT: error: invalid operand for instruction
+// CHECK-ERROR-NEXT: stnp x3, sp, [x9]
+// CHECK-ERROR-NEXT: ^
+
+ stnp s3, s5, [sp, #-2]
+ ldnp s6, s26, [x4, #-260]
+ stnp s13, s19, [x5, #256]
+// CHECK-ERROR: error: expected integer multiple of 4 in range [-256, 252]
+// CHECK-ERROR-NEXT: stnp s3, s5, [sp, #-2]
+// CHECK-ERROR-NEXT: ^
+// CHECK-ERROR-NEXT: error: expected integer multiple of 4 in range [-256, 252]
+// CHECK-ERROR-NEXT: ldnp s6, s26, [x4, #-260]
+// CHECK-ERROR-NEXT: ^
+// CHECK-ERROR-NEXT: error: expected integer multiple of 4 in range [-256, 252]
+// CHECK-ERROR-NEXT: stnp s13, s19, [x5, #256]
+// CHECK-ERROR-NEXT: ^
+
+ ldnp d3, d4, [xzr]
+ ldnp d5, d6, [x0, #512]
+ stnp d7, d8, [x0, #-520]
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR-NEXT: ldnp d3, d4, [xzr]
+// CHECK-ERROR-NEXT: ^
+// CHECK-ERROR-NEXT: error: expected integer multiple of 8 in range [-512, 508]
+// CHECK-ERROR-NEXT: ldnp d5, d6, [x0, #512]
+// CHECK-ERROR-NEXT: ^
+// CHECK-ERROR-NEXT: error: expected integer multiple of 8 in range [-512, 508]
+// CHECK-ERROR-NEXT: stnp d7, d8, [x0, #-520]
+// CHECK-ERROR-NEXT: ^
+
+ ldnp d3, q2, [sp]
+ ldnp q3, q5, [sp, #8]
+ stnp q20, q25, [x5, #1024]
+ ldnp q30, q15, [x23, #-1040]
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR-NEXT: ldnp d3, q2, [sp]
+// CHECK-ERROR-NEXT: ^
+// CHECK-ERROR-NEXT: error: expected integer multiple of 16 in range [-1024, 1016]
+// CHECK-ERROR-NEXT: ldnp q3, q5, [sp, #8]
+// CHECK-ERROR-NEXT: ^
+// CHECK-ERROR-NEXT: error: expected integer multiple of 16 in range [-1024, 1016]
+// CHECK-ERROR-NEXT: stnp q20, q25, [x5, #1024]
+// CHECK-ERROR-NEXT: ^
+// CHECK-ERROR-NEXT: error: expected integer multiple of 16 in range [-1024, 1016]
+// CHECK-ERROR-NEXT: ldnp q30, q15, [x23, #-1040]
+// CHECK-ERROR-NEXT: ^
+
+//------------------------------------------------------------------------------
+// Logical (immediate)
+//------------------------------------------------------------------------------
+ orr w0, w1, #0xffffffff
+ and x3, x5, #0xffffffffffffffff
+// CHECK-ERROR: error: expected compatible register or logical immediate
+// CHECK-ERROR-NEXT: orr w0, w1, #0xffffffff
+// CHECK-ERROR-NEXT: ^
+// CHECK-ERROR-NEXT: error: expected compatible register or logical immediate
+// CHECK-ERROR-NEXT: and x3, x5, #0xffffffffffffffff
+// CHECK-ERROR-NEXT: ^
+
+ ands w3, w9, #0x0
+ eor x2, x0, #0x0
+// CHECK-ERROR: error: expected compatible register or logical immediate
+// CHECK-ERROR-NEXT: ands w3, w9, #0x0
+// CHECK-ERROR-NEXT: ^
+// CHECK-ERROR-NEXT: error: expected compatible register or logical immediate
+// CHECK-ERROR-NEXT: eor x2, x0, #0x0
+// CHECK-ERROR-NEXT: ^
+
+ eor w3, w5, #0x83
+ eor x9, x20, #0x1234
+// CHECK-ERROR: error: expected compatible register or logical immediate
+// CHECK-ERROR-NEXT: eor w3, w5, #0x83
+// CHECK-ERROR-NEXT: ^
+// CHECK-ERROR-NEXT: error: expected compatible register or logical immediate
+// CHECK-ERROR-NEXT: eor x9, x20, #0x1234
+// CHECK-ERROR-NEXT: ^
+
+ and wzr, w4, 0xffff0000
+ eor xzr, x9, #0xffff0000ffff0000
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR-NEXT: and wzr, w4, 0xffff0000
+// CHECK-ERROR-NEXT: ^
+// CHECK-ERROR-NEXT: error: invalid operand for instruction
+// CHECK-ERROR-NEXT: eor xzr, x9, #0xffff0000ffff0000
+// CHECK-ERROR-NEXT: ^
+
+ orr w3, wsp, #0xf0f0f0f0
+ ands x3, sp, #0xaaaaaaaaaaaaaaaa
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR-NEXT: orr w3, wsp, #0xf0f0f0f0
+// CHECK-ERROR-NEXT: ^
+// CHECK-ERROR-NEXT: error: invalid operand for instruction
+// CHECK-ERROR-NEXT: ands x3, sp, #0xaaaaaaaaaaaaaaaa
+// CHECK-ERROR-NEXT: ^
+
+ tst sp, #0xe0e0e0e0e0e0e0e0
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR-NEXT: tst sp, #0xe0e0e0e0e0e0e0e0
+// CHECK-ERROR-NEXT: ^
+
+ // movi has been removed from the specification. Make sure it's really gone.
+ movi wzr, #0x44444444 + movi w3, #0xffff + movi x9, #0x0000ffff00000000 +// CHECK-ERROR: error: invalid instruction +// CHECK-ERROR-NEXT: movi wzr, #0x44444444 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR: error: invalid instruction +// CHECK-ERROR-NEXT: movi w3, #0xffff +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR: error: invalid instruction +// CHECK-ERROR-NEXT: movi x9, #0x0000ffff00000000 +// CHECK-ERROR-NEXT: ^ + +//------------------------------------------------------------------------------ +// Logical (shifted register) +//------------------------------------------------------------------------------ + + //// Out of range shifts + and w2, w24, w6, lsl #-1 + and w4, w6, w12, lsl #32 + and x4, x6, x12, lsl #64 + and x2, x5, x11, asr +// CHECK-ERROR: error: expected integer shift amount +// CHECK-ERROR-NEXT: and w2, w24, w6, lsl #-1 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected 'lsl', 'lsr' or 'asr' with optional integer in range [0, 31] +// CHECK-ERROR-NEXT: and w4, w6, w12, lsl #32 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected 'lsl', 'lsr' or 'asr' with optional integer in range [0, 63] +// CHECK-ERROR-NEXT: and x4, x6, x12, lsl #64 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected #imm after shift specifier +// CHECK-ERROR-NEXT: and x2, x5, x11, asr +// CHECK-ERROR-NEXT: ^ + + //// sp not allowed + orn wsp, w3, w5 + bics x20, sp, x9, lsr #0 + orn x2, x6, sp, lsl #3 +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR-NEXT: orn wsp, w3, w5 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: bics x20, sp, x9, lsr #0 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: orn x2, x6, sp, lsl #3 +// CHECK-ERROR-NEXT: ^ + + //// Mismatched registers + and x3, w2, w1 + ands w1, x12, w2 + and x4, x5, w6, lsl #12 + orr w2, w5, x7, asr #0 +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR-NEXT: and x3, w2, w1 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: ands w1, x12, w2 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected compatible register or logical immediate +// CHECK-ERROR-NEXT: and x4, x5, w6, lsl #12 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected compatible register or logical immediate +// CHECK-ERROR-NEXT: orr w2, w5, x7, asr #0 +// CHECK-ERROR-NEXT: ^ + + //// Shifts should not be allowed on mov + mov w3, w7, lsl #13 +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR-NEXT: mov w3, w7, lsl #13 +// CHECK-ERROR-NEXT: ^ + +//------------------------------------------------------------------------------ +// Move wide (immediate) +//------------------------------------------------------------------------------ + + movz w3, #65536, lsl #0 + movz w4, #65536 + movn w1, #2, lsl #1 + movk w3, #0, lsl #-1 + movn w2, #-1, lsl #0 + movz x3, #-1 + movk w3, #1, lsl #32 + movn x2, #12, lsl #64 +// CHECK-ERROR: error: expected relocated symbol or integer in range [0, 65535] +// CHECK-ERROR-NEXT: movz w3, #65536, lsl #0 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected relocated symbol or integer in range [0, 65535] +// CHECK-ERROR-NEXT: movz w4, #65536 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected relocated symbol or integer in range [0, 65535] +// CHECK-ERROR-NEXT: movn w1, #2, lsl #1 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: only 'lsl #+N' valid after immediate +// 
CHECK-ERROR-NEXT: movk w3, #0, lsl #-1 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected relocated symbol or integer in range [0, 65535] +// CHECK-ERROR-NEXT: movn w2, #-1, lsl #0 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected relocated symbol or integer in range [0, 65535] +// CHECK-ERROR-NEXT: movz x3, #-1 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected relocated symbol or integer in range [0, 65535] +// CHECK-ERROR-NEXT: movk w3, #1, lsl #32 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected relocated symbol or integer in range [0, 65535] +// CHECK-ERROR-NEXT: movn x2, #12, lsl #64 +// CHECK-ERROR-NEXT: ^ + + movz x12, #:abs_g0:sym, lsl #16 + movz x12, #:abs_g0:sym, lsl #0 + movn x2, #:abs_g0:sym + movk w3, #:abs_g0:sym + movz x3, #:abs_g0_nc:sym + movn x4, #:abs_g0_nc:sym +// CHECK-ERROR: error: expected relocated symbol or integer in range [0, 65535] +// CHECK-ERROR-NEXT: movz x12, #:abs_g0:sym, lsl #16 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected relocated symbol or integer in range [0, 65535] +// CHECK-ERROR-NEXT: movz x12, #:abs_g0:sym, lsl #0 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected relocated symbol or integer in range [0, 65535] +// CHECK-ERROR-NEXT: movn x2, #:abs_g0:sym +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected relocated symbol or integer in range [0, 65535] +// CHECK-ERROR-NEXT: movk w3, #:abs_g0:sym +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected relocated symbol or integer in range [0, 65535] +// CHECK-ERROR-NEXT: movz x3, #:abs_g0_nc:sym +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected relocated symbol or integer in range [0, 65535] +// CHECK-ERROR-NEXT: movn x4, #:abs_g0_nc:sym +// CHECK-ERROR-NEXT: ^ + + movn x2, #:abs_g1:sym + movk w3, #:abs_g1:sym + movz x3, #:abs_g1_nc:sym + movn x4, #:abs_g1_nc:sym +// CHECK-ERROR: error: expected relocated symbol or integer in range [0, 65535] +// CHECK-ERROR-NEXT: movn x2, #:abs_g1:sym +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected relocated symbol or integer in range [0, 65535] +// CHECK-ERROR-NEXT: movk w3, #:abs_g1:sym +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected relocated symbol or integer in range [0, 65535] +// CHECK-ERROR-NEXT: movz x3, #:abs_g1_nc:sym +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected relocated symbol or integer in range [0, 65535] +// CHECK-ERROR-NEXT: movn x4, #:abs_g1_nc:sym +// CHECK-ERROR-NEXT: ^ + + movz w12, #:abs_g2:sym + movn x12, #:abs_g2:sym + movk x13, #:abs_g2:sym + movk w3, #:abs_g2_nc:sym + movz x13, #:abs_g2_nc:sym + movn x24, #:abs_g2_nc:sym +// CHECK-ERROR: error: expected relocated symbol or integer in range [0, 65535] +// CHECK-ERROR-NEXT: movz w12, #:abs_g2:sym +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected relocated symbol or integer in range [0, 65535] +// CHECK-ERROR-NEXT: movn x12, #:abs_g2:sym +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected relocated symbol or integer in range [0, 65535] +// CHECK-ERROR-NEXT: movk x13, #:abs_g2:sym +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected relocated symbol or integer in range [0, 65535] +// CHECK-ERROR-NEXT: movk w3, #:abs_g2_nc:sym +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected relocated symbol or integer in range [0, 65535] +// CHECK-ERROR-NEXT: movz x13, #:abs_g2_nc:sym +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected relocated symbol or integer in range [0, 65535] +// CHECK-ERROR-NEXT: movn 
x24, #:abs_g2_nc:sym +// CHECK-ERROR-NEXT: ^ + + movn x19, #:abs_g3:sym + movz w20, #:abs_g3:sym + movk w21, #:abs_g3:sym +// CHECK-ERROR: error: expected relocated symbol or integer in range [0, 65535] +// CHECK-ERROR-NEXT: movn x19, #:abs_g3:sym +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected relocated symbol or integer in range [0, 65535] +// CHECK-ERROR-NEXT: movz w20, #:abs_g3:sym +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected relocated symbol or integer in range [0, 65535] +// CHECK-ERROR-NEXT: movk w21, #:abs_g3:sym +// CHECK-ERROR-NEXT: ^ + + movk x19, #:abs_g0_s:sym + movk w23, #:abs_g0_s:sym +// CHECK-ERROR: error: expected relocated symbol or integer in range [0, 65535] +// CHECK-ERROR-NEXT: movk x19, #:abs_g0_s:sym +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected relocated symbol or integer in range [0, 65535] +// CHECK-ERROR-NEXT: movk w23, #:abs_g0_s:sym +// CHECK-ERROR-NEXT: ^ + + movk x19, #:abs_g1_s:sym + movk w23, #:abs_g1_s:sym +// CHECK-ERROR: error: expected relocated symbol or integer in range [0, 65535] +// CHECK-ERROR-NEXT: movk x19, #:abs_g1_s:sym +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected relocated symbol or integer in range [0, 65535] +// CHECK-ERROR-NEXT: movk w23, #:abs_g1_s:sym +// CHECK-ERROR-NEXT: ^ + + movz w2, #:abs_g2_s:sym + movn w29, #:abs_g2_s:sym + movk x19, #:abs_g2_s:sym + movk w23, #:abs_g2_s:sym +// CHECK-ERROR: error: expected relocated symbol or integer in range [0, 65535] +// CHECK-ERROR-NEXT: movz w2, #:abs_g2_s:sym +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected relocated symbol or integer in range [0, 65535] +// CHECK-ERROR-NEXT: movn w29, #:abs_g2_s:sym +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected relocated symbol or integer in range [0, 65535] +// CHECK-ERROR-NEXT: movk x19, #:abs_g2_s:sym +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected relocated symbol or integer in range [0, 65535] +// CHECK-ERROR-NEXT: movk w23, #:abs_g2_s:sym +// CHECK-ERROR-NEXT: ^ + +//------------------------------------------------------------------------------ +// PC-relative addressing +//------------------------------------------------------------------------------ + + adr sp, loc // expects xzr + adrp x3, #20 // Immediate unaligned + adrp w2, loc // 64-bit register needed +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR-NEXT: adr sp, loc +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected label or encodable integer pc offset +// CHECK-ERROR-NEXT: adrp x3, #20 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: invalid operand for instruction +// CHECK-ERROR-NEXT: adrp w2, loc +// CHECK-ERROR-NEXT: ^ + + adr x9, #1048576 + adr x2, #-1048577 + adrp x9, #4294967296 + adrp x20, #-4294971392 +// CHECK-ERROR: error: expected label or encodable integer pc offset +// CHECK-ERROR-NEXT: adr x9, #1048576 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected label or encodable integer pc offset +// CHECK-ERROR-NEXT: adr x2, #-1048577 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected label or encodable integer pc offset +// CHECK-ERROR-NEXT: adrp x9, #4294967296 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected label or encodable integer pc offset +// CHECK-ERROR-NEXT: adrp x20, #-4294971392 +// CHECK-ERROR-NEXT: ^ + +//------------------------------------------------------------------------------ +// System +//------------------------------------------------------------------------------ + + hint #-1 + 
hint #128 +// CHECK-ERROR: error: expected integer in range [0, 127] +// CHECK-ERROR-NEXT: hint #-1 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [0, 127] +// CHECK-ERROR-NEXT: hint #128 +// CHECK-ERROR-NEXT: ^ + + clrex #-1 + clrex #16 +// CHECK-ERROR-NEXT: error: expected integer in range [0, 15] +// CHECK-ERROR-NEXT: clrex #-1 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [0, 15] +// CHECK-ERROR-NEXT: clrex #16 +// CHECK-ERROR-NEXT: ^ + + dsb #-1 + dsb #16 + dmb #-1 + dmb #16 +// CHECK-ERROR-NEXT: error: Invalid immediate for instruction +// CHECK-ERROR-NEXT: dsb #-1 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: Invalid immediate for instruction +// CHECK-ERROR-NEXT: dsb #16 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: Invalid immediate for instruction +// CHECK-ERROR-NEXT: dmb #-1 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: Invalid immediate for instruction +// CHECK-ERROR-NEXT: dmb #16 +// CHECK-ERROR-NEXT: ^ + + isb #-1 + isb #16 +// CHECK-ERROR-NEXT: error: Invalid immediate for instruction +// CHECK-ERROR-NEXT: isb #-1 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: Invalid immediate for instruction +// CHECK-ERROR-NEXT: isb #16 +// CHECK-ERROR-NEXT: ^ + + msr daifset, x4 + msr spsel, #-1 + msr spsel #-1 + msr daifclr, #16 +// CHECK-ERROR-NEXT: error: expected integer in range [0, 15] +// CHECK-ERROR-NEXT: msr daifset, x4 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [0, 15] +// CHECK-ERROR-NEXT: msr spsel, #-1 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected comma before next operand +// CHECK-ERROR-NEXT: msr spsel #-1 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [0, 15] +// CHECK-ERROR-NEXT: msr daifclr, #16 +// CHECK-ERROR-NEXT: ^ + + sys #8, c1, c2, #7, x9 + sys #3, c16, c2, #3, x10 + sys #2, c11, c16, #5 + sys #4, c9, c8, #8, xzr + sysl x11, #8, c1, c2, #7 + sysl x13, #3, c16, c2, #3 + sysl x9, #2, c11, c16, #5 + sysl x4, #4, c9, c8, #8 +// CHECK-ERROR-NEXT: error: expected integer in range [0, 7] +// CHECK-ERROR-NEXT: sys #8, c1, c2, #7, x9 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: Expected cN operand where 0 <= N <= 15 +// CHECK-ERROR-NEXT: sys #3, c16, c2, #3, x10 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: Expected cN operand where 0 <= N <= 15 +// CHECK-ERROR-NEXT: sys #2, c11, c16, #5 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [0, 7] +// CHECK-ERROR-NEXT: sys #4, c9, c8, #8, xzr +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [0, 7] +// CHECK-ERROR-NEXT: sysl x11, #8, c1, c2, #7 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: Expected cN operand where 0 <= N <= 15 +// CHECK-ERROR-NEXT: sysl x13, #3, c16, c2, #3 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: Expected cN operand where 0 <= N <= 15 +// CHECK-ERROR-NEXT: sysl x9, #2, c11, c16, #5 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected integer in range [0, 7] +// CHECK-ERROR-NEXT: sysl x4, #4, c9, c8, #8 +// CHECK-ERROR-NEXT: ^ + + ic ialluis, x2 + ic allu, x7 + ic ivau +// CHECK-ERROR-NEXT: error: specified IC op does not use a register +// CHECK-ERROR-NEXT: ic ialluis, x2 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: operand specifier not recognised +// CHECK-ERROR-NEXT: ic allu, x7 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: specified IC op requires a register +// CHECK-ERROR-NEXT: ic ivau +// CHECK-ERROR-NEXT: ^ + + 
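// As an illustrative aside (not checked by this test): the IC and TLBI
+// diagnostics below hinge on whether the named op takes a virtual-address
+// register operand. Accepted counterparts of the rejected forms would be:
+// ic ialluis // whole-I-cache op: no register allowed
+// ic ivau, x9 // by-VA op: register required
+// tlbi VMALLE1IS // all-entries op: no register allowed
+// tlbi VAE1IS, x12 // by-VA op: register required
+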
tlbi IPAS2E1IS + tlbi IPAS2LE1IS + tlbi VMALLE1IS, x12 + tlbi ALLE2IS, x11 + tlbi ALLE3IS, x20 + tlbi VAE1IS + tlbi VAE2IS + tlbi VAE3IS + tlbi ASIDE1IS + tlbi VAAE1IS + tlbi ALLE1IS, x0 + tlbi VALE1IS + tlbi VALE2IS + tlbi VALE3IS + tlbi VMALLS12E1IS, xzr + tlbi VAALE1IS + tlbi IPAS2E1 + tlbi IPAS2LE1 + tlbi VMALLE1, x9 + tlbi ALLE2, x10 + tlbi ALLE3, x11 + tlbi VAE1 + tlbi VAE2 + tlbi VAE3 + tlbi ASIDE1 + tlbi VAAE1 + tlbi ALLE1, x25 + tlbi VALE1 + tlbi VALE2 + tlbi VALE3 + tlbi VMALLS12E1, x15 + tlbi VAALE1 +// CHECK-ERROR-NEXT: error: specified TLBI op requires a register +// CHECK-ERROR-NEXT: tlbi IPAS2E1IS +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: specified TLBI op requires a register +// CHECK-ERROR-NEXT: tlbi IPAS2LE1IS +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: specified TLBI op does not use a register +// CHECK-ERROR-NEXT: tlbi VMALLE1IS, x12 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: specified TLBI op does not use a register +// CHECK-ERROR-NEXT: tlbi ALLE2IS, x11 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: specified TLBI op does not use a register +// CHECK-ERROR-NEXT: tlbi ALLE3IS, x20 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: specified TLBI op requires a register +// CHECK-ERROR-NEXT: tlbi VAE1IS +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: specified TLBI op requires a register +// CHECK-ERROR-NEXT: tlbi VAE2IS +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: specified TLBI op requires a register +// CHECK-ERROR-NEXT: tlbi VAE3IS +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: specified TLBI op requires a register +// CHECK-ERROR-NEXT: tlbi ASIDE1IS +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: specified TLBI op requires a register +// CHECK-ERROR-NEXT: tlbi VAAE1IS +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: specified TLBI op does not use a register +// CHECK-ERROR-NEXT: tlbi ALLE1IS, x0 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: specified TLBI op requires a register +// CHECK-ERROR-NEXT: tlbi VALE1IS +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: specified TLBI op requires a register +// CHECK-ERROR-NEXT: tlbi VALE2IS +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: specified TLBI op requires a register +// CHECK-ERROR-NEXT: tlbi VALE3IS +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: specified TLBI op does not use a register +// CHECK-ERROR-NEXT: tlbi VMALLS12E1IS, xzr +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: specified TLBI op requires a register +// CHECK-ERROR-NEXT: tlbi VAALE1IS +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: specified TLBI op requires a register +// CHECK-ERROR-NEXT: tlbi IPAS2E1 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: specified TLBI op requires a register +// CHECK-ERROR-NEXT: tlbi IPAS2LE1 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: specified TLBI op does not use a register +// CHECK-ERROR-NEXT: tlbi VMALLE1, x9 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: specified TLBI op does not use a register +// CHECK-ERROR-NEXT: tlbi ALLE2, x10 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: specified TLBI op does not use a register +// CHECK-ERROR-NEXT: tlbi ALLE3, x11 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: specified TLBI op requires a register +// CHECK-ERROR-NEXT: tlbi VAE1 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: specified TLBI op requires a register +// CHECK-ERROR-NEXT: tlbi VAE2 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: specified TLBI op requires a register 
+// CHECK-ERROR-NEXT: tlbi VAE3 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: specified TLBI op requires a register +// CHECK-ERROR-NEXT: tlbi ASIDE1 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: specified TLBI op requires a register +// CHECK-ERROR-NEXT: tlbi VAAE1 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: specified TLBI op does not use a register +// CHECK-ERROR-NEXT: tlbi ALLE1, x25 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: specified TLBI op requires a register +// CHECK-ERROR-NEXT: tlbi VALE1 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: specified TLBI op requires a register +// CHECK-ERROR-NEXT: tlbi VALE2 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: specified TLBI op requires a register +// CHECK-ERROR-NEXT: tlbi VALE3 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: specified TLBI op does not use a register +// CHECK-ERROR-NEXT: tlbi VMALLS12E1, x15 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: specified TLBI op requires a register +// CHECK-ERROR-NEXT: tlbi VAALE1 +// CHECK-ERROR-NEXT: ^ + +// For the MSR/MRS instructions, first make sure read-only and +// write-only registers actually are. + msr MDCCSR_EL0, x12 + msr DBGDTRRX_EL0, x12 + msr MDRAR_EL1, x12 + msr OSLSR_EL1, x12 + msr DBGAUTHSTATUS_EL1, x12 + msr MIDR_EL1, x12 + msr CCSIDR_EL1, x12 + msr CLIDR_EL1, x12 + msr CTR_EL0, x12 + msr MPIDR_EL1, x12 + msr REVIDR_EL1, x12 + msr AIDR_EL1, x12 + msr DCZID_EL0, x12 + msr ID_PFR0_EL1, x12 + msr ID_PFR1_EL1, x12 + msr ID_DFR0_EL1, x12 + msr ID_AFR0_EL1, x12 + msr ID_MMFR0_EL1, x12 + msr ID_MMFR1_EL1, x12 + msr ID_MMFR2_EL1, x12 + msr ID_MMFR3_EL1, x12 + msr ID_ISAR0_EL1, x12 + msr ID_ISAR1_EL1, x12 + msr ID_ISAR2_EL1, x12 + msr ID_ISAR3_EL1, x12 + msr ID_ISAR4_EL1, x12 + msr ID_ISAR5_EL1, x12 + msr MVFR0_EL1, x12 + msr MVFR1_EL1, x12 + msr MVFR2_EL1, x12 + msr ID_AA64PFR0_EL1, x12 + msr ID_AA64PFR1_EL1, x12 + msr ID_AA64DFR0_EL1, x12 + msr ID_AA64DFR1_EL1, x12 + msr ID_AA64AFR0_EL1, x12 + msr ID_AA64AFR1_EL1, x12 + msr ID_AA64ISAR0_EL1, x12 + msr ID_AA64ISAR1_EL1, x12 + msr ID_AA64MMFR0_EL1, x12 + msr ID_AA64MMFR1_EL1, x12 + msr PMCEID0_EL0, x12 + msr PMCEID1_EL0, x12 + msr RVBAR_EL1, x12 + msr RVBAR_EL2, x12 + msr RVBAR_EL3, x12 + msr ISR_EL1, x12 + msr CNTPCT_EL0, x12 + msr CNTVCT_EL0, x12 + msr PMEVCNTR31_EL0, x12 + msr PMEVTYPER31_EL0, x12 +// CHECK-ERROR: error: expected writable system register or pstate +// CHECK-ERROR-NEXT: msr MDCCSR_EL0, x12 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected writable system register or pstate +// CHECK-ERROR-NEXT: msr DBGDTRRX_EL0, x12 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected writable system register or pstate +// CHECK-ERROR-NEXT: msr MDRAR_EL1, x12 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected writable system register or pstate +// CHECK-ERROR-NEXT: msr OSLSR_EL1, x12 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected writable system register or pstate +// CHECK-ERROR-NEXT: msr DBGAUTHSTATUS_EL1, x12 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected writable system register or pstate +// CHECK-ERROR-NEXT: msr MIDR_EL1, x12 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected writable system register or pstate +// CHECK-ERROR-NEXT: msr CCSIDR_EL1, x12 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected writable system register or pstate +// CHECK-ERROR-NEXT: msr CLIDR_EL1, x12 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected writable system register or pstate +// 
CHECK-ERROR-NEXT: msr CTR_EL0, x12 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected writable system register or pstate +// CHECK-ERROR-NEXT: msr MPIDR_EL1, x12 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected writable system register or pstate +// CHECK-ERROR-NEXT: msr REVIDR_EL1, x12 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected writable system register or pstate +// CHECK-ERROR-NEXT: msr AIDR_EL1, x12 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected writable system register or pstate +// CHECK-ERROR-NEXT: msr DCZID_EL0, x12 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected writable system register or pstate +// CHECK-ERROR-NEXT: msr ID_PFR0_EL1, x12 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected writable system register or pstate +// CHECK-ERROR-NEXT: msr ID_PFR1_EL1, x12 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected writable system register or pstate +// CHECK-ERROR-NEXT: msr ID_DFR0_EL1, x12 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected writable system register or pstate +// CHECK-ERROR-NEXT: msr ID_AFR0_EL1, x12 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected writable system register or pstate +// CHECK-ERROR-NEXT: msr ID_MMFR0_EL1, x12 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected writable system register or pstate +// CHECK-ERROR-NEXT: msr ID_MMFR1_EL1, x12 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected writable system register or pstate +// CHECK-ERROR-NEXT: msr ID_MMFR2_EL1, x12 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected writable system register or pstate +// CHECK-ERROR-NEXT: msr ID_MMFR3_EL1, x12 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected writable system register or pstate +// CHECK-ERROR-NEXT: msr ID_ISAR0_EL1, x12 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected writable system register or pstate +// CHECK-ERROR-NEXT: msr ID_ISAR1_EL1, x12 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected writable system register or pstate +// CHECK-ERROR-NEXT: msr ID_ISAR2_EL1, x12 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected writable system register or pstate +// CHECK-ERROR-NEXT: msr ID_ISAR3_EL1, x12 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected writable system register or pstate +// CHECK-ERROR-NEXT: msr ID_ISAR4_EL1, x12 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected writable system register or pstate +// CHECK-ERROR-NEXT: msr ID_ISAR5_EL1, x12 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected writable system register or pstate +// CHECK-ERROR-NEXT: msr MVFR0_EL1, x12 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected writable system register or pstate +// CHECK-ERROR-NEXT: msr MVFR1_EL1, x12 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected writable system register or pstate +// CHECK-ERROR-NEXT: msr MVFR2_EL1, x12 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected writable system register or pstate +// CHECK-ERROR-NEXT: msr ID_AA64PFR0_EL1, x12 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected writable system register or pstate +// CHECK-ERROR-NEXT: msr ID_AA64PFR1_EL1, x12 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected writable system register or pstate +// CHECK-ERROR-NEXT: msr ID_AA64DFR0_EL1, x12 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected writable system register or pstate +// CHECK-ERROR-NEXT: msr ID_AA64DFR1_EL1, x12 +// 
CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected writable system register or pstate +// CHECK-ERROR-NEXT: msr ID_AA64AFR0_EL1, x12 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected writable system register or pstate +// CHECK-ERROR-NEXT: msr ID_AA64AFR1_EL1, x12 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected writable system register or pstate +// CHECK-ERROR-NEXT: msr ID_AA64ISAR0_EL1, x12 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected writable system register or pstate +// CHECK-ERROR-NEXT: msr ID_AA64ISAR1_EL1, x12 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected writable system register or pstate +// CHECK-ERROR-NEXT: msr ID_AA64MMFR0_EL1, x12 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected writable system register or pstate +// CHECK-ERROR-NEXT: msr ID_AA64MMFR1_EL1, x12 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected writable system register or pstate +// CHECK-ERROR-NEXT: msr PMCEID0_EL0, x12 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected writable system register or pstate +// CHECK-ERROR-NEXT: msr PMCEID1_EL0, x12 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected writable system register or pstate +// CHECK-ERROR-NEXT: msr RVBAR_EL1, x12 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected writable system register or pstate +// CHECK-ERROR-NEXT: msr RVBAR_EL2, x12 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected writable system register or pstate +// CHECK-ERROR-NEXT: msr RVBAR_EL3, x12 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected writable system register or pstate +// CHECK-ERROR-NEXT: msr ISR_EL1, x12 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected writable system register or pstate +// CHECK-ERROR-NEXT: msr CNTPCT_EL0, x12 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected writable system register or pstate +// CHECK-ERROR-NEXT: msr CNTVCT_EL0, x12 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected writable system register or pstate +// CHECK-ERROR-NEXT: msr PMEVCNTR31_EL0, x12 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected writable system register or pstate +// CHECK-ERROR-NEXT: msr PMEVTYPER31_EL0, x12 +// CHECK-ERROR-NEXT: ^ + + mrs x9, DBGDTRTX_EL0 + mrs x9, OSLAR_EL1 + mrs x9, PMSWINC_EL0 + mrs x9, PMEVCNTR31_EL0 + mrs x9, PMEVTYPER31_EL0 +// CHECK-ERROR: error: expected readable system register +// CHECK-ERROR-NEXT: mrs x9, DBGDTRTX_EL0 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected readable system register +// CHECK-ERROR-NEXT: mrs x9, OSLAR_EL1 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected readable system register +// CHECK-ERROR-NEXT: mrs x9, PMSWINC_EL0 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected readable system register +// CHECK-ERROR-NEXT: mrs x9, PMEVCNTR31_EL0 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected readable system register +// CHECK-ERROR-NEXT: mrs x9, PMEVTYPER31_EL0 +// CHECK-ERROR-NEXT: ^ + +// Now check some invalid generic names + mrs xzr, s2_5_c11_c13_2 + mrs x12, s3_8_c11_c13_2 + mrs x13, s3_3_c12_c13_2 + mrs x19, s3_2_c15_c16_2 + mrs x30, s3_2_c15_c1_8 +// CHECK-ERROR-NEXT: error: expected readable system register +// CHECK-ERROR-NEXT: mrs xzr, s2_5_c11_c13_2 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected readable system register +// CHECK-ERROR-NEXT: mrs x12, s3_8_c11_c13_2 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected readable system register +// 
CHECK-ERROR-NEXT: mrs x13, s3_3_c12_c13_2
+// CHECK-ERROR-NEXT: ^
+// CHECK-ERROR-NEXT: error: expected readable system register
+// CHECK-ERROR-NEXT: mrs x19, s3_2_c15_c16_2
+// CHECK-ERROR-NEXT: ^
+// CHECK-ERROR-NEXT: error: expected readable system register
+// CHECK-ERROR-NEXT: mrs x30, s3_2_c15_c1_8
+// CHECK-ERROR-NEXT: ^
+
+//------------------------------------------------------------------------------
+// Test and branch (immediate)
+//------------------------------------------------------------------------------
+
+ tbz w3, #-1, addr
+ tbz w3, #32, nowhere
+ tbz x9, #-1, there
+ tbz x20, #64, dont
+// CHECK-ERROR: error: expected integer in range [0, 31]
+// CHECK-ERROR-NEXT: tbz w3, #-1, addr
+// CHECK-ERROR-NEXT: ^
+// CHECK-ERROR-NEXT: error: expected integer in range [0, 31]
+// CHECK-ERROR-NEXT: tbz w3, #32, nowhere
+// CHECK-ERROR-NEXT: ^
+// CHECK-ERROR-NEXT: error: expected integer in range [0, 63]
+// CHECK-ERROR-NEXT: tbz x9, #-1, there
+// CHECK-ERROR-NEXT: ^
+// CHECK-ERROR-NEXT: error: expected integer in range [0, 63]
+// CHECK-ERROR-NEXT: tbz x20, #64, dont
+// CHECK-ERROR-NEXT: ^
+
+ tbnz w3, #-1, addr
+ tbnz w3, #32, nowhere
+ tbnz x9, #-1, there
+ tbnz x20, #64, dont
+// CHECK-ERROR-NEXT: error: expected integer in range [0, 31]
+// CHECK-ERROR-NEXT: tbnz w3, #-1, addr
+// CHECK-ERROR-NEXT: ^
+// CHECK-ERROR-NEXT: error: expected integer in range [0, 31]
+// CHECK-ERROR-NEXT: tbnz w3, #32, nowhere
+// CHECK-ERROR-NEXT: ^
+// CHECK-ERROR-NEXT: error: expected integer in range [0, 63]
+// CHECK-ERROR-NEXT: tbnz x9, #-1, there
+// CHECK-ERROR-NEXT: ^
+// CHECK-ERROR-NEXT: error: expected integer in range [0, 63]
+// CHECK-ERROR-NEXT: tbnz x20, #64, dont
+// CHECK-ERROR-NEXT: ^
+
+//------------------------------------------------------------------------------
+// Unconditional branch (immediate)
+//------------------------------------------------------------------------------
+
+ b #134217728
+ b #-134217732
+ b #1
+// CHECK-ERROR: error: expected label or encodable integer pc offset
+// CHECK-ERROR-NEXT: b #134217728
+// CHECK-ERROR-NEXT: ^
+// CHECK-ERROR-NEXT: error: expected label or encodable integer pc offset
+// CHECK-ERROR-NEXT: b #-134217732
+// CHECK-ERROR-NEXT: ^
+// CHECK-ERROR-NEXT: error: expected label or encodable integer pc offset
+// CHECK-ERROR-NEXT: b #1
+// CHECK-ERROR-NEXT: ^
+
+//------------------------------------------------------------------------------
+// Unconditional branch (register)
+//------------------------------------------------------------------------------
+
+ br w2
+ br sp
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR-NEXT: br w2
+// CHECK-ERROR-NEXT: ^
+// CHECK-ERROR-NEXT: error: invalid operand for instruction
+// CHECK-ERROR-NEXT: br sp
+// CHECK-ERROR-NEXT: ^
+
+ //// These ones shouldn't allow any registers
+ eret x2
+ drps x2
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR-NEXT: eret x2
+// CHECK-ERROR-NEXT: ^
+// CHECK-ERROR-NEXT: error: invalid operand for instruction
+// CHECK-ERROR-NEXT: drps x2
+// CHECK-ERROR-NEXT: ^
+
diff --git a/test/MC/AArch64/basic-a64-instructions.s b/test/MC/AArch64/basic-a64-instructions.s
new file mode 100644
index 0000000..ad3064e
--- /dev/null
+++ b/test/MC/AArch64/basic-a64-instructions.s
@@ -0,0 +1,4819 @@
+// RUN: llvm-mc -triple=aarch64 -show-encoding < %s | FileCheck %s
+ .globl _func
+
+// Check that the assembler can handle the documented syntax from the ARM ARM.
+// For complex constructs like shifter operands, check them thoroughly once,
+// then spot-check that subsequent instructions accept the form generally.
+// This gives good coverage while keeping the overall size of the test
+// reasonable.
+
+
+_func:
+// CHECK: _func
+
+//------------------------------------------------------------------------------
+// Add/sub (extended register)
+//------------------------------------------------------------------------------
+        // Basic extends, 64-bit ops
+        add x2, x4, w5, uxtb
+        add x20, sp, w19, uxth
+        add x12, x1, w20, uxtw
+        add x20, x3, x13, uxtx
+        add x17, x25, w20, sxtb
+        add x18, x13, w19, sxth
+        add sp, x2, w3, sxtw
+        add x3, x5, x9, sxtx
+// CHECK: add x2, x4, w5, uxtb // encoding: [0x82,0x00,0x25,0x8b]
+// CHECK: add x20, sp, w19, uxth // encoding: [0xf4,0x23,0x33,0x8b]
+// CHECK: add x12, x1, w20, uxtw // encoding: [0x2c,0x40,0x34,0x8b]
+// CHECK: add x20, x3, x13, uxtx // encoding: [0x74,0x60,0x2d,0x8b]
+// CHECK: add x17, x25, w20, sxtb // encoding: [0x31,0x83,0x34,0x8b]
+// CHECK: add x18, x13, w19, sxth // encoding: [0xb2,0xa1,0x33,0x8b]
+// CHECK: add sp, x2, w3, sxtw // encoding: [0x5f,0xc0,0x23,0x8b]
+// CHECK: add x3, x5, x9, sxtx // encoding: [0xa3,0xe0,0x29,0x8b]
+
+        // Basic extends, 32-bit ops
+        add w2, w5, w7, uxtb
+        add w21, w15, w17, uxth
+        add w30, w29, wzr, uxtw
+        add w19, w17, w1, uxtx // Goodness knows what this means
+        add w2, w5, w1, sxtb
+        add w26, w17, w19, sxth
+        add w0, w2, w3, sxtw
+        add w2, w3, w5, sxtx
+// CHECK: add w2, w5, w7, uxtb // encoding: [0xa2,0x00,0x27,0x0b]
+// CHECK: add w21, w15, w17, uxth // encoding: [0xf5,0x21,0x31,0x0b]
+// CHECK: add w30, w29, wzr, uxtw // encoding: [0xbe,0x43,0x3f,0x0b]
+// CHECK: add w19, w17, w1, uxtx // encoding: [0x33,0x62,0x21,0x0b]
+// CHECK: add w2, w5, w1, sxtb // encoding: [0xa2,0x80,0x21,0x0b]
+// CHECK: add w26, w17, w19, sxth // encoding: [0x3a,0xa2,0x33,0x0b]
+// CHECK: add w0, w2, w3, sxtw // encoding: [0x40,0xc0,0x23,0x0b]
+// CHECK: add w2, w3, w5, sxtx // encoding: [0x62,0xe0,0x25,0x0b]
+
+        // Nonzero shift amounts
+        add x2, x3, w5, sxtb #0
+        add x7, x11, w13, uxth #4
+        add w17, w19, w23, uxtw #2
+        add w29, w23, w17, uxtx #1
+// CHECK: add x2, x3, w5, sxtb // encoding: [0x62,0x80,0x25,0x8b]
+// CHECK: add x7, x11, w13, uxth #4 // encoding: [0x67,0x31,0x2d,0x8b]
+// CHECK: add w17, w19, w23, uxtw #2 // encoding: [0x71,0x4a,0x37,0x0b]
+// CHECK: add w29, w23, w17, uxtx #1 // encoding: [0xfd,0x66,0x31,0x0b]
+
+        // Sub
+        sub x2, x4, w5, uxtb #2
+        sub x20, sp, w19, uxth #4
+        sub x12, x1, w20, uxtw
+        sub x20, x3, x13, uxtx #0
+        sub x17, x25, w20, sxtb
+        sub x18, x13, w19, sxth
+        sub sp, x2, w3, sxtw
+        sub x3, x5, x9, sxtx
+// CHECK: sub x2, x4, w5, uxtb #2 // encoding: [0x82,0x08,0x25,0xcb]
+// CHECK: sub x20, sp, w19, uxth #4 // encoding: [0xf4,0x33,0x33,0xcb]
+// CHECK: sub x12, x1, w20, uxtw // encoding: [0x2c,0x40,0x34,0xcb]
+// CHECK: sub x20, x3, x13, uxtx // encoding: [0x74,0x60,0x2d,0xcb]
+// CHECK: sub x17, x25, w20, sxtb // encoding: [0x31,0x83,0x34,0xcb]
+// CHECK: sub x18, x13, w19, sxth // encoding: [0xb2,0xa1,0x33,0xcb]
+// CHECK: sub sp, x2, w3, sxtw // encoding: [0x5f,0xc0,0x23,0xcb]
+// CHECK: sub x3, x5, x9, sxtx // encoding: [0xa3,0xe0,0x29,0xcb]
+
+        sub w2, w5, w7, uxtb
+        sub w21, w15, w17, uxth
+        sub w30, w29, wzr, uxtw
+        sub w19, w17, w1, uxtx // Goodness knows what this means
+        sub w2, w5, w1, sxtb
+        sub w26, wsp, w19, sxth
+        sub wsp, w2, w3, sxtw
+        sub w2, w3, w5, sxtx
+// CHECK: sub w2, w5, w7, uxtb // encoding:
[0xa2,0x00,0x27,0x4b] +// CHECK: sub w21, w15, w17, uxth // encoding: [0xf5,0x21,0x31,0x4b] +// CHECK: sub w30, w29, wzr, uxtw // encoding: [0xbe,0x43,0x3f,0x4b] +// CHECK: sub w19, w17, w1, uxtx // encoding: [0x33,0x62,0x21,0x4b] +// CHECK: sub w2, w5, w1, sxtb // encoding: [0xa2,0x80,0x21,0x4b] +// CHECK: sub w26, wsp, w19, sxth // encoding: [0xfa,0xa3,0x33,0x4b] +// CHECK: sub wsp, w2, w3, sxtw // encoding: [0x5f,0xc0,0x23,0x4b] +// CHECK: sub w2, w3, w5, sxtx // encoding: [0x62,0xe0,0x25,0x4b] + + // Adds + adds x2, x4, w5, uxtb #2 + adds x20, sp, w19, uxth #4 + adds x12, x1, w20, uxtw + adds x20, x3, x13, uxtx #0 + adds xzr, x25, w20, sxtb #3 + adds x18, sp, w19, sxth + adds xzr, x2, w3, sxtw + adds x3, x5, x9, sxtx #2 +// CHECK: adds x2, x4, w5, uxtb #2 // encoding: [0x82,0x08,0x25,0xab] +// CHECK: adds x20, sp, w19, uxth #4 // encoding: [0xf4,0x33,0x33,0xab] +// CHECK: adds x12, x1, w20, uxtw // encoding: [0x2c,0x40,0x34,0xab] +// CHECK: adds x20, x3, x13, uxtx // encoding: [0x74,0x60,0x2d,0xab] +// CHECK: adds xzr, x25, w20, sxtb #3 // encoding: [0x3f,0x8f,0x34,0xab] +// CHECK: adds x18, sp, w19, sxth // encoding: [0xf2,0xa3,0x33,0xab] +// CHECK: adds xzr, x2, w3, sxtw // encoding: [0x5f,0xc0,0x23,0xab] +// CHECK: adds x3, x5, x9, sxtx #2 // encoding: [0xa3,0xe8,0x29,0xab] + + adds w2, w5, w7, uxtb + adds w21, w15, w17, uxth + adds w30, w29, wzr, uxtw + adds w19, w17, w1, uxtx // Goodness knows what this means + adds w2, w5, w1, sxtb #1 + adds w26, wsp, w19, sxth + adds wzr, w2, w3, sxtw + adds w2, w3, w5, sxtx +// CHECK: adds w2, w5, w7, uxtb // encoding: [0xa2,0x00,0x27,0x2b] +// CHECK: adds w21, w15, w17, uxth // encoding: [0xf5,0x21,0x31,0x2b] +// CHECK: adds w30, w29, wzr, uxtw // encoding: [0xbe,0x43,0x3f,0x2b] +// CHECK: adds w19, w17, w1, uxtx // encoding: [0x33,0x62,0x21,0x2b] +// CHECK: adds w2, w5, w1, sxtb #1 // encoding: [0xa2,0x84,0x21,0x2b] +// CHECK: adds w26, wsp, w19, sxth // encoding: [0xfa,0xa3,0x33,0x2b] +// CHECK: adds wzr, w2, w3, sxtw // encoding: [0x5f,0xc0,0x23,0x2b] +// CHECK: adds w2, w3, w5, sxtx // encoding: [0x62,0xe0,0x25,0x2b] + + // subs + subs x2, x4, w5, uxtb #2 + subs x20, sp, w19, uxth #4 + subs x12, x1, w20, uxtw + subs x20, x3, x13, uxtx #0 + subs xzr, x25, w20, sxtb #3 + subs x18, sp, w19, sxth + subs xzr, x2, w3, sxtw + subs x3, x5, x9, sxtx #2 +// CHECK: subs x2, x4, w5, uxtb #2 // encoding: [0x82,0x08,0x25,0xeb] +// CHECK: subs x20, sp, w19, uxth #4 // encoding: [0xf4,0x33,0x33,0xeb] +// CHECK: subs x12, x1, w20, uxtw // encoding: [0x2c,0x40,0x34,0xeb] +// CHECK: subs x20, x3, x13, uxtx // encoding: [0x74,0x60,0x2d,0xeb] +// CHECK: subs xzr, x25, w20, sxtb #3 // encoding: [0x3f,0x8f,0x34,0xeb] +// CHECK: subs x18, sp, w19, sxth // encoding: [0xf2,0xa3,0x33,0xeb] +// CHECK: subs xzr, x2, w3, sxtw // encoding: [0x5f,0xc0,0x23,0xeb] +// CHECK: subs x3, x5, x9, sxtx #2 // encoding: [0xa3,0xe8,0x29,0xeb] + + subs w2, w5, w7, uxtb + subs w21, w15, w17, uxth + subs w30, w29, wzr, uxtw + subs w19, w17, w1, uxtx // Goodness knows what this means + subs w2, w5, w1, sxtb #1 + subs w26, wsp, w19, sxth + subs wzr, w2, w3, sxtw + subs w2, w3, w5, sxtx +// CHECK: subs w2, w5, w7, uxtb // encoding: [0xa2,0x00,0x27,0x6b] +// CHECK: subs w21, w15, w17, uxth // encoding: [0xf5,0x21,0x31,0x6b] +// CHECK: subs w30, w29, wzr, uxtw // encoding: [0xbe,0x43,0x3f,0x6b] +// CHECK: subs w19, w17, w1, uxtx // encoding: [0x33,0x62,0x21,0x6b] +// CHECK: subs w2, w5, w1, sxtb #1 // encoding: [0xa2,0x84,0x21,0x6b] +// CHECK: subs w26, wsp, w19, sxth // encoding: 
[0xfa,0xa3,0x33,0x6b] +// CHECK: subs wzr, w2, w3, sxtw // encoding: [0x5f,0xc0,0x23,0x6b] +// CHECK: subs w2, w3, w5, sxtx // encoding: [0x62,0xe0,0x25,0x6b] + + // cmp + cmp x4, w5, uxtb #2 + cmp sp, w19, uxth #4 + cmp x1, w20, uxtw + cmp x3, x13, uxtx #0 + cmp x25, w20, sxtb #3 + cmp sp, w19, sxth + cmp x2, w3, sxtw + cmp x5, x9, sxtx #2 +// CHECK: cmp x4, w5, uxtb #2 // encoding: [0x9f,0x08,0x25,0xeb] +// CHECK: cmp sp, w19, uxth #4 // encoding: [0xff,0x33,0x33,0xeb] +// CHECK: cmp x1, w20, uxtw // encoding: [0x3f,0x40,0x34,0xeb] +// CHECK: cmp x3, x13, uxtx // encoding: [0x7f,0x60,0x2d,0xeb] +// CHECK: cmp x25, w20, sxtb #3 // encoding: [0x3f,0x8f,0x34,0xeb] +// CHECK: cmp sp, w19, sxth // encoding: [0xff,0xa3,0x33,0xeb] +// CHECK: cmp x2, w3, sxtw // encoding: [0x5f,0xc0,0x23,0xeb] +// CHECK: cmp x5, x9, sxtx #2 // encoding: [0xbf,0xe8,0x29,0xeb] + + cmp w5, w7, uxtb + cmp w15, w17, uxth + cmp w29, wzr, uxtw + cmp w17, w1, uxtx // Goodness knows what this means + cmp w5, w1, sxtb #1 + cmp wsp, w19, sxth + cmp w2, w3, sxtw + cmp w3, w5, sxtx +// CHECK: cmp w5, w7, uxtb // encoding: [0xbf,0x00,0x27,0x6b] +// CHECK: cmp w15, w17, uxth // encoding: [0xff,0x21,0x31,0x6b] +// CHECK: cmp w29, wzr, uxtw // encoding: [0xbf,0x43,0x3f,0x6b] +// CHECK: cmp w17, w1, uxtx // encoding: [0x3f,0x62,0x21,0x6b] +// CHECK: cmp w5, w1, sxtb #1 // encoding: [0xbf,0x84,0x21,0x6b] +// CHECK: cmp wsp, w19, sxth // encoding: [0xff,0xa3,0x33,0x6b] +// CHECK: cmp w2, w3, sxtw // encoding: [0x5f,0xc0,0x23,0x6b] +// CHECK: cmp w3, w5, sxtx // encoding: [0x7f,0xe0,0x25,0x6b] + + + // cmn + cmn x4, w5, uxtb #2 + cmn sp, w19, uxth #4 + cmn x1, w20, uxtw + cmn x3, x13, uxtx #0 + cmn x25, w20, sxtb #3 + cmn sp, w19, sxth + cmn x2, w3, sxtw + cmn x5, x9, sxtx #2 +// CHECK: cmn x4, w5, uxtb #2 // encoding: [0x9f,0x08,0x25,0xab] +// CHECK: cmn sp, w19, uxth #4 // encoding: [0xff,0x33,0x33,0xab] +// CHECK: cmn x1, w20, uxtw // encoding: [0x3f,0x40,0x34,0xab] +// CHECK: cmn x3, x13, uxtx // encoding: [0x7f,0x60,0x2d,0xab] +// CHECK: cmn x25, w20, sxtb #3 // encoding: [0x3f,0x8f,0x34,0xab] +// CHECK: cmn sp, w19, sxth // encoding: [0xff,0xa3,0x33,0xab] +// CHECK: cmn x2, w3, sxtw // encoding: [0x5f,0xc0,0x23,0xab] +// CHECK: cmn x5, x9, sxtx #2 // encoding: [0xbf,0xe8,0x29,0xab] + + cmn w5, w7, uxtb + cmn w15, w17, uxth + cmn w29, wzr, uxtw + cmn w17, w1, uxtx // Goodness knows what this means + cmn w5, w1, sxtb #1 + cmn wsp, w19, sxth + cmn w2, w3, sxtw + cmn w3, w5, sxtx +// CHECK: cmn w5, w7, uxtb // encoding: [0xbf,0x00,0x27,0x2b] +// CHECK: cmn w15, w17, uxth // encoding: [0xff,0x21,0x31,0x2b] +// CHECK: cmn w29, wzr, uxtw // encoding: [0xbf,0x43,0x3f,0x2b] +// CHECK: cmn w17, w1, uxtx // encoding: [0x3f,0x62,0x21,0x2b] +// CHECK: cmn w5, w1, sxtb #1 // encoding: [0xbf,0x84,0x21,0x2b] +// CHECK: cmn wsp, w19, sxth // encoding: [0xff,0xa3,0x33,0x2b] +// CHECK: cmn w2, w3, sxtw // encoding: [0x5f,0xc0,0x23,0x2b] +// CHECK: cmn w3, w5, sxtx // encoding: [0x7f,0xe0,0x25,0x2b] + + // operands for cmp + cmp x20, w29, uxtb #3 + cmp x12, x13, uxtx #4 + cmp wsp, w1, uxtb + cmn wsp, wzr, sxtw +// CHECK: cmp x20, w29, uxtb #3 // encoding: [0x9f,0x0e,0x3d,0xeb] +// CHECK: cmp x12, x13, uxtx #4 // encoding: [0x9f,0x71,0x2d,0xeb] +// CHECK: cmp wsp, w1, uxtb // encoding: [0xff,0x03,0x21,0x6b] +// CHECK: cmn wsp, wzr, sxtw // encoding: [0xff,0xc3,0x3f,0x2b] + + // LSL variant if sp involved + sub sp, x3, x7, lsl #4 + add w2, wsp, w3, lsl #1 + cmp wsp, w9, lsl #0 + adds wzr, wsp, w3, lsl #4 + subs x3, sp, x9, lsl #2 +// CHECK: sub sp, 
x3, x7, lsl #4 // encoding: [0x7f,0x70,0x27,0xcb]
+// CHECK: add w2, wsp, w3, lsl #1 // encoding: [0xe2,0x47,0x23,0x0b]
+// CHECK: cmp wsp, w9 // encoding: [0xff,0x43,0x29,0x6b]
+// CHECK: adds wzr, wsp, w3, lsl #4 // encoding: [0xff,0x53,0x23,0x2b]
+// CHECK: subs x3, sp, x9, lsl #2 // encoding: [0xe3,0x6b,0x29,0xeb]
+
+//------------------------------------------------------------------------------
+// Add/sub (immediate)
+//------------------------------------------------------------------------------
+
+// Check basic immediate values: an unsigned 12-bit immediate, optionally
+// shifted left by 12 bits.
+        add w4, w5, #0x0
+        add w2, w3, #4095
+        add w30, w29, #1, lsl #12
+        add w13, w5, #4095, lsl #12
+        add x5, x7, #1638
+// CHECK: add w4, w5, #0 // encoding: [0xa4,0x00,0x00,0x11]
+// CHECK: add w2, w3, #4095 // encoding: [0x62,0xfc,0x3f,0x11]
+// CHECK: add w30, w29, #1, lsl #12 // encoding: [0xbe,0x07,0x40,0x11]
+// CHECK: add w13, w5, #4095, lsl #12 // encoding: [0xad,0xfc,0x7f,0x11]
+// CHECK: add x5, x7, #1638 // encoding: [0xe5,0x98,0x19,0x91]
+
+// For all register operands of the non-S variants, number 31 encodes sp rather than zr
+        add w20, wsp, #801, lsl #0
+        add wsp, wsp, #1104
+        add wsp, w30, #4084
+// CHECK: add w20, wsp, #801 // encoding: [0xf4,0x87,0x0c,0x11]
+// CHECK: add wsp, wsp, #1104 // encoding: [0xff,0x43,0x11,0x11]
+// CHECK: add wsp, w30, #4084 // encoding: [0xdf,0xd3,0x3f,0x11]
+
+// A few checks on the sanity of 64-bit versions
+        add x0, x24, #291
+        add x3, x24, #4095, lsl #12
+        add x8, sp, #1074
+        add sp, x29, #3816
+// CHECK: add x0, x24, #291 // encoding: [0x00,0x8f,0x04,0x91]
+// CHECK: add x3, x24, #4095, lsl #12 // encoding: [0x03,0xff,0x7f,0x91]
+// CHECK: add x8, sp, #1074 // encoding: [0xe8,0xcb,0x10,0x91]
+// CHECK: add sp, x29, #3816 // encoding: [0xbf,0xa3,0x3b,0x91]
+
+// And on sub
+        sub w0, wsp, #4077
+        sub w4, w20, #546, lsl #12
+        sub sp, sp, #288
+        sub wsp, w19, #16
+// CHECK: sub w0, wsp, #4077 // encoding: [0xe0,0xb7,0x3f,0x51]
+// CHECK: sub w4, w20, #546, lsl #12 // encoding: [0x84,0x8a,0x48,0x51]
+// CHECK: sub sp, sp, #288 // encoding: [0xff,0x83,0x04,0xd1]
+// CHECK: sub wsp, w19, #16 // encoding: [0x7f,0x42,0x00,0x51]
+
+// ADDS/SUBS accept zr in the Rd position but sp in the Rn position
+        adds w13, w23, #291, lsl #12
+        adds wzr, w2, #4095 // FIXME: canonically should be cmn
+        adds w20, wsp, #0x0
+        adds xzr, x3, #0x1, lsl #12 // FIXME: canonically should be cmn
+// CHECK: adds w13, w23, #291, lsl #12 // encoding: [0xed,0x8e,0x44,0x31]
+// CHECK: adds wzr, w2, #4095 // encoding: [0x5f,0xfc,0x3f,0x31]
+// CHECK: adds w20, wsp, #0 // encoding: [0xf4,0x03,0x00,0x31]
+// CHECK: adds xzr, x3, #1, lsl #12 // encoding: [0x7f,0x04,0x40,0xb1]
+
+// Checks for subs
+        subs xzr, sp, #20, lsl #12 // FIXME: canonically should be cmp
+        subs xzr, x30, #4095, lsl #0 // FIXME: canonically should be cmp
+        subs x4, sp, #3822
+// CHECK: subs xzr, sp, #20, lsl #12 // encoding: [0xff,0x53,0x40,0xf1]
+// CHECK: subs xzr, x30, #4095 // encoding: [0xdf,0xff,0x3f,0xf1]
+// CHECK: subs x4, sp, #3822 // encoding: [0xe4,0xbb,0x3b,0xf1]
+
+// cmn is an alias for adds zr, ...
+        cmn w3, #291, lsl #12
+        cmn wsp, #1365, lsl #0
+        cmn sp, #1092, lsl #12
+// CHECK: cmn w3, #291, lsl #12 // encoding: [0x7f,0x8c,0x44,0x31]
+// CHECK: cmn wsp, #1365 // encoding: [0xff,0x57,0x15,0x31]
+// CHECK: cmn sp, #1092, lsl #12 // encoding: [0xff,0x13,0x51,0xb1]
+
+// cmp is an alias for subs zr, ... (FIXME: should always disassemble as such too).
+ cmp x4, #300, lsl #12 + cmp wsp, #500 + cmp sp, #200, lsl #0 +// CHECK: cmp x4, #300, lsl #12 // encoding: [0x9f,0xb0,0x44,0xf1] +// CHECK: cmp wsp, #500 // encoding: [0xff,0xd3,0x07,0x71] +// CHECK: cmp sp, #200 // encoding: [0xff,0x23,0x03,0xf1] + +// A "MOV" involving sp is encoded in this manner: add Reg, Reg, #0 + mov sp, x30 + mov wsp, w20 + mov x11, sp + mov w24, wsp +// CHECK: mov sp, x30 // encoding: [0xdf,0x03,0x00,0x91] +// CHECK: mov wsp, w20 // encoding: [0x9f,0x02,0x00,0x11] +// CHECK: mov x11, sp // encoding: [0xeb,0x03,0x00,0x91] +// CHECK: mov w24, wsp // encoding: [0xf8,0x03,0x00,0x11] + +// A relocation check (default to lo12, which is the only sane relocation anyway really) + add x0, x4, #:lo12:var +// CHECK: add x0, x4, #:lo12:var // encoding: [0x80'A',A,A,0x91'A'] +// CHECK: // fixup A - offset: 0, value: :lo12:var, kind: fixup_a64_add_lo12 + +//------------------------------------------------------------------------------ +// Add-sub (shifted register) +//------------------------------------------------------------------------------ + +// As usual, we don't print the canonical forms of many instructions. + + add w3, w5, w7 + add wzr, w3, w5 + add w20, wzr, w4 + add w4, w6, wzr +// CHECK: add w3, w5, w7 // encoding: [0xa3,0x00,0x07,0x0b] +// CHECK: add wzr, w3, w5 // encoding: [0x7f,0x00,0x05,0x0b] +// CHECK: add w20, wzr, w4 // encoding: [0xf4,0x03,0x04,0x0b] +// CHECK: add w4, w6, wzr // encoding: [0xc4,0x00,0x1f,0x0b] + + add w11, w13, w15, lsl #0 + add w9, w3, wzr, lsl #10 + add w17, w29, w20, lsl #31 +// CHECK: add w11, w13, w15 // encoding: [0xab,0x01,0x0f,0x0b] +// CHECK: add w9, w3, wzr, lsl #10 // encoding: [0x69,0x28,0x1f,0x0b] +// CHECK: add w17, w29, w20, lsl #31 // encoding: [0xb1,0x7f,0x14,0x0b] + + add w21, w22, w23, lsr #0 + add w24, w25, w26, lsr #18 + add w27, w28, w29, lsr #31 +// CHECK: add w21, w22, w23, lsr #0 // encoding: [0xd5,0x02,0x57,0x0b] +// CHECK: add w24, w25, w26, lsr #18 // encoding: [0x38,0x4b,0x5a,0x0b] +// CHECK: add w27, w28, w29, lsr #31 // encoding: [0x9b,0x7f,0x5d,0x0b] + + add w2, w3, w4, asr #0 + add w5, w6, w7, asr #21 + add w8, w9, w10, asr #31 +// CHECK: add w2, w3, w4, asr #0 // encoding: [0x62,0x00,0x84,0x0b] +// CHECK: add w5, w6, w7, asr #21 // encoding: [0xc5,0x54,0x87,0x0b] +// CHECK: add w8, w9, w10, asr #31 // encoding: [0x28,0x7d,0x8a,0x0b] + + add x3, x5, x7 + add xzr, x3, x5 + add x20, xzr, x4 + add x4, x6, xzr +// CHECK: add x3, x5, x7 // encoding: [0xa3,0x00,0x07,0x8b] +// CHECK: add xzr, x3, x5 // encoding: [0x7f,0x00,0x05,0x8b] +// CHECK: add x20, xzr, x4 // encoding: [0xf4,0x03,0x04,0x8b] +// CHECK: add x4, x6, xzr // encoding: [0xc4,0x00,0x1f,0x8b] + + add x11, x13, x15, lsl #0 + add x9, x3, xzr, lsl #10 + add x17, x29, x20, lsl #63 +// CHECK: add x11, x13, x15 // encoding: [0xab,0x01,0x0f,0x8b] +// CHECK: add x9, x3, xzr, lsl #10 // encoding: [0x69,0x28,0x1f,0x8b] +// CHECK: add x17, x29, x20, lsl #63 // encoding: [0xb1,0xff,0x14,0x8b] + + add x21, x22, x23, lsr #0 + add x24, x25, x26, lsr #18 + add x27, x28, x29, lsr #63 +// CHECK: add x21, x22, x23, lsr #0 // encoding: [0xd5,0x02,0x57,0x8b] +// CHECK: add x24, x25, x26, lsr #18 // encoding: [0x38,0x4b,0x5a,0x8b] +// CHECK: add x27, x28, x29, lsr #63 // encoding: [0x9b,0xff,0x5d,0x8b] + + add x2, x3, x4, asr #0 + add x5, x6, x7, asr #21 + add x8, x9, x10, asr #63 +// CHECK: add x2, x3, x4, asr #0 // encoding: [0x62,0x00,0x84,0x8b] +// CHECK: add x5, x6, x7, asr #21 // encoding: [0xc5,0x54,0x87,0x8b] +// CHECK: add x8, x9, x10, asr #63 // encoding: 
[0x28,0xfd,0x8a,0x8b] + + adds w3, w5, w7 + adds wzr, w3, w5 + adds w20, wzr, w4 + adds w4, w6, wzr +// CHECK: adds w3, w5, w7 // encoding: [0xa3,0x00,0x07,0x2b] +// CHECK: adds wzr, w3, w5 // encoding: [0x7f,0x00,0x05,0x2b] +// CHECK: adds w20, wzr, w4 // encoding: [0xf4,0x03,0x04,0x2b] +// CHECK: adds w4, w6, wzr // encoding: [0xc4,0x00,0x1f,0x2b] + + adds w11, w13, w15, lsl #0 + adds w9, w3, wzr, lsl #10 + adds w17, w29, w20, lsl #31 +// CHECK: adds w11, w13, w15 // encoding: [0xab,0x01,0x0f,0x2b] +// CHECK: adds w9, w3, wzr, lsl #10 // encoding: [0x69,0x28,0x1f,0x2b] +// CHECK: adds w17, w29, w20, lsl #31 // encoding: [0xb1,0x7f,0x14,0x2b] + + adds w21, w22, w23, lsr #0 + adds w24, w25, w26, lsr #18 + adds w27, w28, w29, lsr #31 +// CHECK: adds w21, w22, w23, lsr #0 // encoding: [0xd5,0x02,0x57,0x2b] +// CHECK: adds w24, w25, w26, lsr #18 // encoding: [0x38,0x4b,0x5a,0x2b] +// CHECK: adds w27, w28, w29, lsr #31 // encoding: [0x9b,0x7f,0x5d,0x2b] + + adds w2, w3, w4, asr #0 + adds w5, w6, w7, asr #21 + adds w8, w9, w10, asr #31 +// CHECK: adds w2, w3, w4, asr #0 // encoding: [0x62,0x00,0x84,0x2b] +// CHECK: adds w5, w6, w7, asr #21 // encoding: [0xc5,0x54,0x87,0x2b] +// CHECK: adds w8, w9, w10, asr #31 // encoding: [0x28,0x7d,0x8a,0x2b] + + adds x3, x5, x7 + adds xzr, x3, x5 + adds x20, xzr, x4 + adds x4, x6, xzr +// CHECK: adds x3, x5, x7 // encoding: [0xa3,0x00,0x07,0xab] +// CHECK: adds xzr, x3, x5 // encoding: [0x7f,0x00,0x05,0xab] +// CHECK: adds x20, xzr, x4 // encoding: [0xf4,0x03,0x04,0xab] +// CHECK: adds x4, x6, xzr // encoding: [0xc4,0x00,0x1f,0xab] + + adds x11, x13, x15, lsl #0 + adds x9, x3, xzr, lsl #10 + adds x17, x29, x20, lsl #63 +// CHECK: adds x11, x13, x15 // encoding: [0xab,0x01,0x0f,0xab] +// CHECK: adds x9, x3, xzr, lsl #10 // encoding: [0x69,0x28,0x1f,0xab] +// CHECK: adds x17, x29, x20, lsl #63 // encoding: [0xb1,0xff,0x14,0xab] + + adds x21, x22, x23, lsr #0 + adds x24, x25, x26, lsr #18 + adds x27, x28, x29, lsr #63 +// CHECK: adds x21, x22, x23, lsr #0 // encoding: [0xd5,0x02,0x57,0xab] +// CHECK: adds x24, x25, x26, lsr #18 // encoding: [0x38,0x4b,0x5a,0xab] +// CHECK: adds x27, x28, x29, lsr #63 // encoding: [0x9b,0xff,0x5d,0xab] + + adds x2, x3, x4, asr #0 + adds x5, x6, x7, asr #21 + adds x8, x9, x10, asr #63 +// CHECK: adds x2, x3, x4, asr #0 // encoding: [0x62,0x00,0x84,0xab] +// CHECK: adds x5, x6, x7, asr #21 // encoding: [0xc5,0x54,0x87,0xab] +// CHECK: adds x8, x9, x10, asr #63 // encoding: [0x28,0xfd,0x8a,0xab] + + sub w3, w5, w7 + sub wzr, w3, w5 + sub w20, wzr, w4 + sub w4, w6, wzr +// CHECK: sub w3, w5, w7 // encoding: [0xa3,0x00,0x07,0x4b] +// CHECK: sub wzr, w3, w5 // encoding: [0x7f,0x00,0x05,0x4b] +// CHECK: sub w20, wzr, w4 // encoding: [0xf4,0x03,0x04,0x4b] +// CHECK: sub w4, w6, wzr // encoding: [0xc4,0x00,0x1f,0x4b] + + sub w11, w13, w15, lsl #0 + sub w9, w3, wzr, lsl #10 + sub w17, w29, w20, lsl #31 +// CHECK: sub w11, w13, w15 // encoding: [0xab,0x01,0x0f,0x4b] +// CHECK: sub w9, w3, wzr, lsl #10 // encoding: [0x69,0x28,0x1f,0x4b] +// CHECK: sub w17, w29, w20, lsl #31 // encoding: [0xb1,0x7f,0x14,0x4b] + + sub w21, w22, w23, lsr #0 + sub w24, w25, w26, lsr #18 + sub w27, w28, w29, lsr #31 +// CHECK: sub w21, w22, w23, lsr #0 // encoding: [0xd5,0x02,0x57,0x4b] +// CHECK: sub w24, w25, w26, lsr #18 // encoding: [0x38,0x4b,0x5a,0x4b] +// CHECK: sub w27, w28, w29, lsr #31 // encoding: [0x9b,0x7f,0x5d,0x4b] + + sub w2, w3, w4, asr #0 + sub w5, w6, w7, asr #21 + sub w8, w9, w10, asr #31 +// CHECK: sub w2, w3, w4, asr #0 // encoding: 
[0x62,0x00,0x84,0x4b] +// CHECK: sub w5, w6, w7, asr #21 // encoding: [0xc5,0x54,0x87,0x4b] +// CHECK: sub w8, w9, w10, asr #31 // encoding: [0x28,0x7d,0x8a,0x4b] + + sub x3, x5, x7 + sub xzr, x3, x5 + sub x20, xzr, x4 + sub x4, x6, xzr +// CHECK: sub x3, x5, x7 // encoding: [0xa3,0x00,0x07,0xcb] +// CHECK: sub xzr, x3, x5 // encoding: [0x7f,0x00,0x05,0xcb] +// CHECK: sub x20, xzr, x4 // encoding: [0xf4,0x03,0x04,0xcb] +// CHECK: sub x4, x6, xzr // encoding: [0xc4,0x00,0x1f,0xcb] + + sub x11, x13, x15, lsl #0 + sub x9, x3, xzr, lsl #10 + sub x17, x29, x20, lsl #63 +// CHECK: sub x11, x13, x15 // encoding: [0xab,0x01,0x0f,0xcb] +// CHECK: sub x9, x3, xzr, lsl #10 // encoding: [0x69,0x28,0x1f,0xcb] +// CHECK: sub x17, x29, x20, lsl #63 // encoding: [0xb1,0xff,0x14,0xcb] + + sub x21, x22, x23, lsr #0 + sub x24, x25, x26, lsr #18 + sub x27, x28, x29, lsr #63 +// CHECK: sub x21, x22, x23, lsr #0 // encoding: [0xd5,0x02,0x57,0xcb] +// CHECK: sub x24, x25, x26, lsr #18 // encoding: [0x38,0x4b,0x5a,0xcb] +// CHECK: sub x27, x28, x29, lsr #63 // encoding: [0x9b,0xff,0x5d,0xcb] + + sub x2, x3, x4, asr #0 + sub x5, x6, x7, asr #21 + sub x8, x9, x10, asr #63 +// CHECK: sub x2, x3, x4, asr #0 // encoding: [0x62,0x00,0x84,0xcb] +// CHECK: sub x5, x6, x7, asr #21 // encoding: [0xc5,0x54,0x87,0xcb] +// CHECK: sub x8, x9, x10, asr #63 // encoding: [0x28,0xfd,0x8a,0xcb] + + subs w3, w5, w7 + subs wzr, w3, w5 + subs w20, wzr, w4 + subs w4, w6, wzr +// CHECK: subs w3, w5, w7 // encoding: [0xa3,0x00,0x07,0x6b] +// CHECK: subs wzr, w3, w5 // encoding: [0x7f,0x00,0x05,0x6b] +// CHECK: subs w20, wzr, w4 // encoding: [0xf4,0x03,0x04,0x6b] +// CHECK: subs w4, w6, wzr // encoding: [0xc4,0x00,0x1f,0x6b] + + subs w11, w13, w15, lsl #0 + subs w9, w3, wzr, lsl #10 + subs w17, w29, w20, lsl #31 +// CHECK: subs w11, w13, w15 // encoding: [0xab,0x01,0x0f,0x6b] +// CHECK: subs w9, w3, wzr, lsl #10 // encoding: [0x69,0x28,0x1f,0x6b] +// CHECK: subs w17, w29, w20, lsl #31 // encoding: [0xb1,0x7f,0x14,0x6b] + + subs w21, w22, w23, lsr #0 + subs w24, w25, w26, lsr #18 + subs w27, w28, w29, lsr #31 +// CHECK: subs w21, w22, w23, lsr #0 // encoding: [0xd5,0x02,0x57,0x6b] +// CHECK: subs w24, w25, w26, lsr #18 // encoding: [0x38,0x4b,0x5a,0x6b] +// CHECK: subs w27, w28, w29, lsr #31 // encoding: [0x9b,0x7f,0x5d,0x6b] + + subs w2, w3, w4, asr #0 + subs w5, w6, w7, asr #21 + subs w8, w9, w10, asr #31 +// CHECK: subs w2, w3, w4, asr #0 // encoding: [0x62,0x00,0x84,0x6b] +// CHECK: subs w5, w6, w7, asr #21 // encoding: [0xc5,0x54,0x87,0x6b] +// CHECK: subs w8, w9, w10, asr #31 // encoding: [0x28,0x7d,0x8a,0x6b] + + subs x3, x5, x7 + subs xzr, x3, x5 + subs x20, xzr, x4 + subs x4, x6, xzr +// CHECK: subs x3, x5, x7 // encoding: [0xa3,0x00,0x07,0xeb] +// CHECK: subs xzr, x3, x5 // encoding: [0x7f,0x00,0x05,0xeb] +// CHECK: subs x20, xzr, x4 // encoding: [0xf4,0x03,0x04,0xeb] +// CHECK: subs x4, x6, xzr // encoding: [0xc4,0x00,0x1f,0xeb] + + subs x11, x13, x15, lsl #0 + subs x9, x3, xzr, lsl #10 + subs x17, x29, x20, lsl #63 +// CHECK: subs x11, x13, x15 // encoding: [0xab,0x01,0x0f,0xeb] +// CHECK: subs x9, x3, xzr, lsl #10 // encoding: [0x69,0x28,0x1f,0xeb] +// CHECK: subs x17, x29, x20, lsl #63 // encoding: [0xb1,0xff,0x14,0xeb] + + subs x21, x22, x23, lsr #0 + subs x24, x25, x26, lsr #18 + subs x27, x28, x29, lsr #63 +// CHECK: subs x21, x22, x23, lsr #0 // encoding: [0xd5,0x02,0x57,0xeb] +// CHECK: subs x24, x25, x26, lsr #18 // encoding: [0x38,0x4b,0x5a,0xeb] +// CHECK: subs x27, x28, x29, lsr #63 // encoding: [0x9b,0xff,0x5d,0xeb] 
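+
+// For reference, a sketch of how one of the expected encodings above decodes
+// (field layout per the ARM ARM; commentary only, not checked output). The
+// add-sub (shifted register) word is laid out sf op S 01011 shift(2) 0 Rm imm6 Rn Rd,
+// so [0x9b,0xff,0x5d,0xeb] is the little-endian word
+//   0xeb5dff9b = 111 01011 01 0 11101 111111 11100 11011
+// i.e. sf=1 op=1 S=1 (64-bit subs), shift=01 (lsr), Rm=29, imm6=63, Rn=28,
+// Rd=27: exactly the "subs x27, x28, x29, lsr #63" checked above.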
+ + subs x2, x3, x4, asr #0 + subs x5, x6, x7, asr #21 + subs x8, x9, x10, asr #63 +// CHECK: subs x2, x3, x4, asr #0 // encoding: [0x62,0x00,0x84,0xeb] +// CHECK: subs x5, x6, x7, asr #21 // encoding: [0xc5,0x54,0x87,0xeb] +// CHECK: subs x8, x9, x10, asr #63 // encoding: [0x28,0xfd,0x8a,0xeb] + + cmn w0, w3 + cmn wzr, w4 + cmn w5, wzr +// CHECK: cmn w0, w3 // encoding: [0x1f,0x00,0x03,0x2b] +// CHECK: cmn wzr, w4 // encoding: [0xff,0x03,0x04,0x2b] +// CHECK: cmn w5, wzr // encoding: [0xbf,0x00,0x1f,0x2b] + + cmn w6, w7, lsl #0 + cmn w8, w9, lsl #15 + cmn w10, w11, lsl #31 +// CHECK: cmn w6, w7 // encoding: [0xdf,0x00,0x07,0x2b] +// CHECK: cmn w8, w9, lsl #15 // encoding: [0x1f,0x3d,0x09,0x2b] +// CHECK: cmn w10, w11, lsl #31 // encoding: [0x5f,0x7d,0x0b,0x2b] + + cmn w12, w13, lsr #0 + cmn w14, w15, lsr #21 + cmn w16, w17, lsr #31 +// CHECK: cmn w12, w13, lsr #0 // encoding: [0x9f,0x01,0x4d,0x2b] +// CHECK: cmn w14, w15, lsr #21 // encoding: [0xdf,0x55,0x4f,0x2b] +// CHECK: cmn w16, w17, lsr #31 // encoding: [0x1f,0x7e,0x51,0x2b] + + cmn w18, w19, asr #0 + cmn w20, w21, asr #22 + cmn w22, w23, asr #31 +// CHECK: cmn w18, w19, asr #0 // encoding: [0x5f,0x02,0x93,0x2b] +// CHECK: cmn w20, w21, asr #22 // encoding: [0x9f,0x5a,0x95,0x2b] +// CHECK: cmn w22, w23, asr #31 // encoding: [0xdf,0x7e,0x97,0x2b] + + cmn x0, x3 + cmn xzr, x4 + cmn x5, xzr +// CHECK: cmn x0, x3 // encoding: [0x1f,0x00,0x03,0xab] +// CHECK: cmn xzr, x4 // encoding: [0xff,0x03,0x04,0xab] +// CHECK: cmn x5, xzr // encoding: [0xbf,0x00,0x1f,0xab] + + cmn x6, x7, lsl #0 + cmn x8, x9, lsl #15 + cmn x10, x11, lsl #63 +// CHECK: cmn x6, x7 // encoding: [0xdf,0x00,0x07,0xab] +// CHECK: cmn x8, x9, lsl #15 // encoding: [0x1f,0x3d,0x09,0xab] +// CHECK: cmn x10, x11, lsl #63 // encoding: [0x5f,0xfd,0x0b,0xab] + + cmn x12, x13, lsr #0 + cmn x14, x15, lsr #41 + cmn x16, x17, lsr #63 +// CHECK: cmn x12, x13, lsr #0 // encoding: [0x9f,0x01,0x4d,0xab] +// CHECK: cmn x14, x15, lsr #41 // encoding: [0xdf,0xa5,0x4f,0xab] +// CHECK: cmn x16, x17, lsr #63 // encoding: [0x1f,0xfe,0x51,0xab] + + cmn x18, x19, asr #0 + cmn x20, x21, asr #55 + cmn x22, x23, asr #63 +// CHECK: cmn x18, x19, asr #0 // encoding: [0x5f,0x02,0x93,0xab] +// CHECK: cmn x20, x21, asr #55 // encoding: [0x9f,0xde,0x95,0xab] +// CHECK: cmn x22, x23, asr #63 // encoding: [0xdf,0xfe,0x97,0xab] + + cmp w0, w3 + cmp wzr, w4 + cmp w5, wzr +// CHECK: cmp w0, w3 // encoding: [0x1f,0x00,0x03,0x6b] +// CHECK: cmp wzr, w4 // encoding: [0xff,0x03,0x04,0x6b] +// CHECK: cmp w5, wzr // encoding: [0xbf,0x00,0x1f,0x6b] + + cmp w6, w7, lsl #0 + cmp w8, w9, lsl #15 + cmp w10, w11, lsl #31 +// CHECK: cmp w6, w7 // encoding: [0xdf,0x00,0x07,0x6b] +// CHECK: cmp w8, w9, lsl #15 // encoding: [0x1f,0x3d,0x09,0x6b] +// CHECK: cmp w10, w11, lsl #31 // encoding: [0x5f,0x7d,0x0b,0x6b] + + cmp w12, w13, lsr #0 + cmp w14, w15, lsr #21 + cmp w16, w17, lsr #31 +// CHECK: cmp w12, w13, lsr #0 // encoding: [0x9f,0x01,0x4d,0x6b] +// CHECK: cmp w14, w15, lsr #21 // encoding: [0xdf,0x55,0x4f,0x6b] +// CHECK: cmp w16, w17, lsr #31 // encoding: [0x1f,0x7e,0x51,0x6b] + + cmp w18, w19, asr #0 + cmp w20, w21, asr #22 + cmp w22, w23, asr #31 +// CHECK: cmp w18, w19, asr #0 // encoding: [0x5f,0x02,0x93,0x6b] +// CHECK: cmp w20, w21, asr #22 // encoding: [0x9f,0x5a,0x95,0x6b] +// CHECK: cmp w22, w23, asr #31 // encoding: [0xdf,0x7e,0x97,0x6b] + + cmp x0, x3 + cmp xzr, x4 + cmp x5, xzr +// CHECK: cmp x0, x3 // encoding: [0x1f,0x00,0x03,0xeb] +// CHECK: cmp xzr, x4 // encoding: [0xff,0x03,0x04,0xeb] +// CHECK: cmp 
x5, xzr // encoding: [0xbf,0x00,0x1f,0xeb] + + cmp x6, x7, lsl #0 + cmp x8, x9, lsl #15 + cmp x10, x11, lsl #63 +// CHECK: cmp x6, x7 // encoding: [0xdf,0x00,0x07,0xeb] +// CHECK: cmp x8, x9, lsl #15 // encoding: [0x1f,0x3d,0x09,0xeb] +// CHECK: cmp x10, x11, lsl #63 // encoding: [0x5f,0xfd,0x0b,0xeb] + + cmp x12, x13, lsr #0 + cmp x14, x15, lsr #41 + cmp x16, x17, lsr #63 +// CHECK: cmp x12, x13, lsr #0 // encoding: [0x9f,0x01,0x4d,0xeb] +// CHECK: cmp x14, x15, lsr #41 // encoding: [0xdf,0xa5,0x4f,0xeb] +// CHECK: cmp x16, x17, lsr #63 // encoding: [0x1f,0xfe,0x51,0xeb] + + cmp x18, x19, asr #0 + cmp x20, x21, asr #55 + cmp x22, x23, asr #63 +// CHECK: cmp x18, x19, asr #0 // encoding: [0x5f,0x02,0x93,0xeb] +// CHECK: cmp x20, x21, asr #55 // encoding: [0x9f,0xde,0x95,0xeb] +// CHECK: cmp x22, x23, asr #63 // encoding: [0xdf,0xfe,0x97,0xeb] + + neg w29, w30 + neg w30, wzr + neg wzr, w0 +// CHECK: sub w29, wzr, w30 // encoding: [0xfd,0x03,0x1e,0x4b] +// CHECK: sub w30, wzr, wzr // encoding: [0xfe,0x03,0x1f,0x4b] +// CHECK: sub wzr, wzr, w0 // encoding: [0xff,0x03,0x00,0x4b] + + neg w28, w27, lsl #0 + neg w26, w25, lsl #29 + neg w24, w23, lsl #31 +// CHECK: sub w28, wzr, w27 // encoding: [0xfc,0x03,0x1b,0x4b] +// CHECK: sub w26, wzr, w25, lsl #29 // encoding: [0xfa,0x77,0x19,0x4b] +// CHECK: sub w24, wzr, w23, lsl #31 // encoding: [0xf8,0x7f,0x17,0x4b] + + neg w22, w21, lsr #0 + neg w20, w19, lsr #1 + neg w18, w17, lsr #31 +// CHECK: sub w22, wzr, w21, lsr #0 // encoding: [0xf6,0x03,0x55,0x4b] +// CHECK: sub w20, wzr, w19, lsr #1 // encoding: [0xf4,0x07,0x53,0x4b] +// CHECK: sub w18, wzr, w17, lsr #31 // encoding: [0xf2,0x7f,0x51,0x4b] + + neg w16, w15, asr #0 + neg w14, w13, asr #12 + neg w12, w11, asr #31 +// CHECK: sub w16, wzr, w15, asr #0 // encoding: [0xf0,0x03,0x8f,0x4b] +// CHECK: sub w14, wzr, w13, asr #12 // encoding: [0xee,0x33,0x8d,0x4b] +// CHECK: sub w12, wzr, w11, asr #31 // encoding: [0xec,0x7f,0x8b,0x4b] + + neg x29, x30 + neg x30, xzr + neg xzr, x0 +// CHECK: sub x29, xzr, x30 // encoding: [0xfd,0x03,0x1e,0xcb] +// CHECK: sub x30, xzr, xzr // encoding: [0xfe,0x03,0x1f,0xcb] +// CHECK: sub xzr, xzr, x0 // encoding: [0xff,0x03,0x00,0xcb] + + neg x28, x27, lsl #0 + neg x26, x25, lsl #29 + neg x24, x23, lsl #31 +// CHECK: sub x28, xzr, x27 // encoding: [0xfc,0x03,0x1b,0xcb] +// CHECK: sub x26, xzr, x25, lsl #29 // encoding: [0xfa,0x77,0x19,0xcb] +// CHECK: sub x24, xzr, x23, lsl #31 // encoding: [0xf8,0x7f,0x17,0xcb] + + neg x22, x21, lsr #0 + neg x20, x19, lsr #1 + neg x18, x17, lsr #31 +// CHECK: sub x22, xzr, x21, lsr #0 // encoding: [0xf6,0x03,0x55,0xcb] +// CHECK: sub x20, xzr, x19, lsr #1 // encoding: [0xf4,0x07,0x53,0xcb] +// CHECK: sub x18, xzr, x17, lsr #31 // encoding: [0xf2,0x7f,0x51,0xcb] + + neg x16, x15, asr #0 + neg x14, x13, asr #12 + neg x12, x11, asr #31 +// CHECK: sub x16, xzr, x15, asr #0 // encoding: [0xf0,0x03,0x8f,0xcb] +// CHECK: sub x14, xzr, x13, asr #12 // encoding: [0xee,0x33,0x8d,0xcb] +// CHECK: sub x12, xzr, x11, asr #31 // encoding: [0xec,0x7f,0x8b,0xcb] + + negs w29, w30 + negs w30, wzr + negs wzr, w0 +// CHECK: subs w29, wzr, w30 // encoding: [0xfd,0x03,0x1e,0x6b] +// CHECK: subs w30, wzr, wzr // encoding: [0xfe,0x03,0x1f,0x6b] +// CHECK: subs wzr, wzr, w0 // encoding: [0xff,0x03,0x00,0x6b] + + negs w28, w27, lsl #0 + negs w26, w25, lsl #29 + negs w24, w23, lsl #31 +// CHECK: subs w28, wzr, w27 // encoding: [0xfc,0x03,0x1b,0x6b] +// CHECK: subs w26, wzr, w25, lsl #29 // encoding: [0xfa,0x77,0x19,0x6b] +// CHECK: subs w24, wzr, w23, lsl #31 // 
encoding: [0xf8,0x7f,0x17,0x6b]
+
+        negs w22, w21, lsr #0
+        negs w20, w19, lsr #1
+        negs w18, w17, lsr #31
+// CHECK: subs w22, wzr, w21, lsr #0 // encoding: [0xf6,0x03,0x55,0x6b]
+// CHECK: subs w20, wzr, w19, lsr #1 // encoding: [0xf4,0x07,0x53,0x6b]
+// CHECK: subs w18, wzr, w17, lsr #31 // encoding: [0xf2,0x7f,0x51,0x6b]
+
+        negs w16, w15, asr #0
+        negs w14, w13, asr #12
+        negs w12, w11, asr #31
+// CHECK: subs w16, wzr, w15, asr #0 // encoding: [0xf0,0x03,0x8f,0x6b]
+// CHECK: subs w14, wzr, w13, asr #12 // encoding: [0xee,0x33,0x8d,0x6b]
+// CHECK: subs w12, wzr, w11, asr #31 // encoding: [0xec,0x7f,0x8b,0x6b]
+
+        negs x29, x30
+        negs x30, xzr
+        negs xzr, x0
+// CHECK: subs x29, xzr, x30 // encoding: [0xfd,0x03,0x1e,0xeb]
+// CHECK: subs x30, xzr, xzr // encoding: [0xfe,0x03,0x1f,0xeb]
+// CHECK: subs xzr, xzr, x0 // encoding: [0xff,0x03,0x00,0xeb]
+
+        negs x28, x27, lsl #0
+        negs x26, x25, lsl #29
+        negs x24, x23, lsl #31
+// CHECK: subs x28, xzr, x27 // encoding: [0xfc,0x03,0x1b,0xeb]
+// CHECK: subs x26, xzr, x25, lsl #29 // encoding: [0xfa,0x77,0x19,0xeb]
+// CHECK: subs x24, xzr, x23, lsl #31 // encoding: [0xf8,0x7f,0x17,0xeb]
+
+        negs x22, x21, lsr #0
+        negs x20, x19, lsr #1
+        negs x18, x17, lsr #31
+// CHECK: subs x22, xzr, x21, lsr #0 // encoding: [0xf6,0x03,0x55,0xeb]
+// CHECK: subs x20, xzr, x19, lsr #1 // encoding: [0xf4,0x07,0x53,0xeb]
+// CHECK: subs x18, xzr, x17, lsr #31 // encoding: [0xf2,0x7f,0x51,0xeb]
+
+        negs x16, x15, asr #0
+        negs x14, x13, asr #12
+        negs x12, x11, asr #31
+// CHECK: subs x16, xzr, x15, asr #0 // encoding: [0xf0,0x03,0x8f,0xeb]
+// CHECK: subs x14, xzr, x13, asr #12 // encoding: [0xee,0x33,0x8d,0xeb]
+// CHECK: subs x12, xzr, x11, asr #31 // encoding: [0xec,0x7f,0x8b,0xeb]
+
+//------------------------------------------------------------------------------
+// Add-sub (with carry)
+//------------------------------------------------------------------------------
+        adc w29, w27, w25
+        adc wzr, w3, w4
+        adc w9, wzr, w10
+        adc w20, w0, wzr
+// CHECK: adc w29, w27, w25 // encoding: [0x7d,0x03,0x19,0x1a]
+// CHECK: adc wzr, w3, w4 // encoding: [0x7f,0x00,0x04,0x1a]
+// CHECK: adc w9, wzr, w10 // encoding: [0xe9,0x03,0x0a,0x1a]
+// CHECK: adc w20, w0, wzr // encoding: [0x14,0x00,0x1f,0x1a]
+
+        adc x29, x27, x25
+        adc xzr, x3, x4
+        adc x9, xzr, x10
+        adc x20, x0, xzr
+// CHECK: adc x29, x27, x25 // encoding: [0x7d,0x03,0x19,0x9a]
+// CHECK: adc xzr, x3, x4 // encoding: [0x7f,0x00,0x04,0x9a]
+// CHECK: adc x9, xzr, x10 // encoding: [0xe9,0x03,0x0a,0x9a]
+// CHECK: adc x20, x0, xzr // encoding: [0x14,0x00,0x1f,0x9a]
+
+        adcs w29, w27, w25
+        adcs wzr, w3, w4
+        adcs w9, wzr, w10
+        adcs w20, w0, wzr
+// CHECK: adcs w29, w27, w25 // encoding: [0x7d,0x03,0x19,0x3a]
+// CHECK: adcs wzr, w3, w4 // encoding: [0x7f,0x00,0x04,0x3a]
+// CHECK: adcs w9, wzr, w10 // encoding: [0xe9,0x03,0x0a,0x3a]
+// CHECK: adcs w20, w0, wzr // encoding: [0x14,0x00,0x1f,0x3a]
+
+        adcs x29, x27, x25
+        adcs xzr, x3, x4
+        adcs x9, xzr, x10
+        adcs x20, x0, xzr
+// CHECK: adcs x29, x27, x25 // encoding: [0x7d,0x03,0x19,0xba]
+// CHECK: adcs xzr, x3, x4 // encoding: [0x7f,0x00,0x04,0xba]
+// CHECK: adcs x9, xzr, x10 // encoding: [0xe9,0x03,0x0a,0xba]
+// CHECK: adcs x20, x0, xzr // encoding: [0x14,0x00,0x1f,0xba]
+
+        sbc w29, w27, w25
+        sbc wzr, w3, w4
+        sbc w9, wzr, w10
+        sbc w20, w0, wzr
+// CHECK: sbc w29, w27, w25 // encoding: [0x7d,0x03,0x19,0x5a]
+// CHECK: sbc wzr, w3, w4 // encoding: [0x7f,0x00,0x04,0x5a]
+// CHECK: ngc w9, w10 // encoding: [0xe9,0x03,0x0a,0x5a]
+// CHECK:
sbc w20, w0, wzr // encoding: [0x14,0x00,0x1f,0x5a] + + sbc x29, x27, x25 + sbc xzr, x3, x4 + sbc x9, xzr, x10 + sbc x20, x0, xzr +// CHECK: sbc x29, x27, x25 // encoding: [0x7d,0x03,0x19,0xda] +// CHECK: sbc xzr, x3, x4 // encoding: [0x7f,0x00,0x04,0xda] +// CHECK: ngc x9, x10 // encoding: [0xe9,0x03,0x0a,0xda] +// CHECK: sbc x20, x0, xzr // encoding: [0x14,0x00,0x1f,0xda] + + sbcs w29, w27, w25 + sbcs wzr, w3, w4 + sbcs w9, wzr, w10 + sbcs w20, w0, wzr +// CHECK: sbcs w29, w27, w25 // encoding: [0x7d,0x03,0x19,0x7a] +// CHECK: sbcs wzr, w3, w4 // encoding: [0x7f,0x00,0x04,0x7a] +// CHECK: ngcs w9, w10 // encoding: [0xe9,0x03,0x0a,0x7a] +// CHECK: sbcs w20, w0, wzr // encoding: [0x14,0x00,0x1f,0x7a] + + sbcs x29, x27, x25 + sbcs xzr, x3, x4 + sbcs x9, xzr, x10 + sbcs x20, x0, xzr +// CHECK: sbcs x29, x27, x25 // encoding: [0x7d,0x03,0x19,0xfa] +// CHECK: sbcs xzr, x3, x4 // encoding: [0x7f,0x00,0x04,0xfa] +// CHECK: ngcs x9, x10 // encoding: [0xe9,0x03,0x0a,0xfa] +// CHECK: sbcs x20, x0, xzr // encoding: [0x14,0x00,0x1f,0xfa] + + ngc w3, w12 + ngc wzr, w9 + ngc w23, wzr +// CHECK: ngc w3, w12 // encoding: [0xe3,0x03,0x0c,0x5a] +// CHECK: ngc wzr, w9 // encoding: [0xff,0x03,0x09,0x5a] +// CHECK: ngc w23, wzr // encoding: [0xf7,0x03,0x1f,0x5a] + + ngc x29, x30 + ngc xzr, x0 + ngc x0, xzr +// CHECK: ngc x29, x30 // encoding: [0xfd,0x03,0x1e,0xda] +// CHECK: ngc xzr, x0 // encoding: [0xff,0x03,0x00,0xda] +// CHECK: ngc x0, xzr // encoding: [0xe0,0x03,0x1f,0xda] + + ngcs w3, w12 + ngcs wzr, w9 + ngcs w23, wzr +// CHECK: ngcs w3, w12 // encoding: [0xe3,0x03,0x0c,0x7a] +// CHECK: ngcs wzr, w9 // encoding: [0xff,0x03,0x09,0x7a] +// CHECK: ngcs w23, wzr // encoding: [0xf7,0x03,0x1f,0x7a] + + ngcs x29, x30 + ngcs xzr, x0 + ngcs x0, xzr +// CHECK: ngcs x29, x30 // encoding: [0xfd,0x03,0x1e,0xfa] +// CHECK: ngcs xzr, x0 // encoding: [0xff,0x03,0x00,0xfa] +// CHECK: ngcs x0, xzr // encoding: [0xe0,0x03,0x1f,0xfa] + +//------------------------------------------------------------------------------ +// Bitfield +//------------------------------------------------------------------------------ + + sbfm x1, x2, #3, #4 + sbfm x3, x4, #63, #63 + sbfm wzr, wzr, #31, #31 + sbfm w12, w9, #0, #0 +// CHECK: sbfm x1, x2, #3, #4 // encoding: [0x41,0x10,0x43,0x93] +// CHECK: sbfm x3, x4, #63, #63 // encoding: [0x83,0xfc,0x7f,0x93] +// CHECK: sbfm wzr, wzr, #31, #31 // encoding: [0xff,0x7f,0x1f,0x13] +// CHECK: sbfm w12, w9, #0, #0 // encoding: [0x2c,0x01,0x00,0x13] + + ubfm x4, x5, #12, #10 + ubfm xzr, x4, #0, #0 + ubfm x4, xzr, #63, #5 + ubfm x5, x6, #12, #63 +// CHECK: ubfm x4, x5, #12, #10 // encoding: [0xa4,0x28,0x4c,0xd3] +// CHECK: ubfm xzr, x4, #0, #0 // encoding: [0x9f,0x00,0x40,0xd3] +// CHECK: ubfm x4, xzr, #63, #5 // encoding: [0xe4,0x17,0x7f,0xd3] +// CHECK: ubfm x5, x6, #12, #63 // encoding: [0xc5,0xfc,0x4c,0xd3] + + bfm x4, x5, #12, #10 + bfm xzr, x4, #0, #0 + bfm x4, xzr, #63, #5 + bfm x5, x6, #12, #63 +// CHECK: bfm x4, x5, #12, #10 // encoding: [0xa4,0x28,0x4c,0xb3] +// CHECK: bfm xzr, x4, #0, #0 // encoding: [0x9f,0x00,0x40,0xb3] +// CHECK: bfm x4, xzr, #63, #5 // encoding: [0xe4,0x17,0x7f,0xb3] +// CHECK: bfm x5, x6, #12, #63 // encoding: [0xc5,0xfc,0x4c,0xb3] + + sxtb w1, w2 + sxtb xzr, w3 + sxth w9, w10 + sxth x0, w1 + sxtw x3, w30 +// CHECK: sxtb w1, w2 // encoding: [0x41,0x1c,0x00,0x13] +// CHECK: sxtb xzr, w3 // encoding: [0x7f,0x1c,0x40,0x93] +// CHECK: sxth w9, w10 // encoding: [0x49,0x3d,0x00,0x13] +// CHECK: sxth x0, w1 // encoding: [0x20,0x3c,0x40,0x93] +// CHECK: sxtw x3, w30 // 
encoding: [0xc3,0x7f,0x40,0x93] + + uxtb w1, w2 + uxtb xzr, w3 + uxth w9, w10 + uxth x0, w1 +// CHECK: uxtb w1, w2 // encoding: [0x41,0x1c,0x00,0x53] +// CHECK: uxtb xzr, w3 // encoding: [0x7f,0x1c,0x00,0x53] +// CHECK: uxth w9, w10 // encoding: [0x49,0x3d,0x00,0x53] +// CHECK: uxth x0, w1 // encoding: [0x20,0x3c,0x00,0x53] + + asr w3, w2, #0 + asr w9, w10, #31 + asr x20, x21, #63 + asr w1, wzr, #3 +// CHECK: asr w3, w2, #0 // encoding: [0x43,0x7c,0x00,0x13] +// CHECK: asr w9, w10, #31 // encoding: [0x49,0x7d,0x1f,0x13] +// CHECK: asr x20, x21, #63 // encoding: [0xb4,0xfe,0x7f,0x93] +// CHECK: asr w1, wzr, #3 // encoding: [0xe1,0x7f,0x03,0x13] + + lsr w3, w2, #0 + lsr w9, w10, #31 + lsr x20, x21, #63 + lsr wzr, wzr, #3 +// CHECK: lsr w3, w2, #0 // encoding: [0x43,0x7c,0x00,0x53] +// CHECK: lsr w9, w10, #31 // encoding: [0x49,0x7d,0x1f,0x53] +// CHECK: lsr x20, x21, #63 // encoding: [0xb4,0xfe,0x7f,0xd3] +// CHECK: lsr wzr, wzr, #3 // encoding: [0xff,0x7f,0x03,0x53] + + lsl w3, w2, #0 + lsl w9, w10, #31 + lsl x20, x21, #63 + lsl w1, wzr, #3 +// CHECK: lsl w3, w2, #0 // encoding: [0x43,0x7c,0x00,0x53] +// CHECK: lsl w9, w10, #31 // encoding: [0x49,0x01,0x01,0x53] +// CHECK: lsl x20, x21, #63 // encoding: [0xb4,0x02,0x41,0xd3] +// CHECK: lsl w1, wzr, #3 // encoding: [0xe1,0x73,0x1d,0x53] + + sbfiz w9, w10, #0, #1 + sbfiz x2, x3, #63, #1 + sbfiz x19, x20, #0, #64 + sbfiz x9, x10, #5, #59 + sbfiz w9, w10, #0, #32 + sbfiz w11, w12, #31, #1 + sbfiz w13, w14, #29, #3 + sbfiz xzr, xzr, #10, #11 +// CHECK: sbfiz w9, w10, #0, #1 // encoding: [0x49,0x01,0x00,0x13] +// CHECK: sbfiz x2, x3, #63, #1 // encoding: [0x62,0x00,0x41,0x93] +// CHECK: sbfiz x19, x20, #0, #64 // encoding: [0x93,0xfe,0x40,0x93] +// CHECK: sbfiz x9, x10, #5, #59 // encoding: [0x49,0xe9,0x7b,0x93] +// CHECK: sbfiz w9, w10, #0, #32 // encoding: [0x49,0x7d,0x00,0x13] +// CHECK: sbfiz w11, w12, #31, #1 // encoding: [0x8b,0x01,0x01,0x13] +// CHECK: sbfiz w13, w14, #29, #3 // encoding: [0xcd,0x09,0x03,0x13] +// CHECK: sbfiz xzr, xzr, #10, #11 // encoding: [0xff,0x2b,0x76,0x93] + + sbfx w9, w10, #0, #1 + sbfx x2, x3, #63, #1 + sbfx x19, x20, #0, #64 + sbfx x9, x10, #5, #59 + sbfx w9, w10, #0, #32 + sbfx w11, w12, #31, #1 + sbfx w13, w14, #29, #3 + sbfx xzr, xzr, #10, #11 +// CHECK: sbfx w9, w10, #0, #1 // encoding: [0x49,0x01,0x00,0x13] +// CHECK: sbfx x2, x3, #63, #1 // encoding: [0x62,0xfc,0x7f,0x93] +// CHECK: sbfx x19, x20, #0, #64 // encoding: [0x93,0xfe,0x40,0x93] +// CHECK: sbfx x9, x10, #5, #59 // encoding: [0x49,0xfd,0x45,0x93] +// CHECK: sbfx w9, w10, #0, #32 // encoding: [0x49,0x7d,0x00,0x13] +// CHECK: sbfx w11, w12, #31, #1 // encoding: [0x8b,0x7d,0x1f,0x13] +// CHECK: sbfx w13, w14, #29, #3 // encoding: [0xcd,0x7d,0x1d,0x13] +// CHECK: sbfx xzr, xzr, #10, #11 // encoding: [0xff,0x53,0x4a,0x93] + + bfi w9, w10, #0, #1 + bfi x2, x3, #63, #1 + bfi x19, x20, #0, #64 + bfi x9, x10, #5, #59 + bfi w9, w10, #0, #32 + bfi w11, w12, #31, #1 + bfi w13, w14, #29, #3 + bfi xzr, xzr, #10, #11 +// CHECK: bfi w9, w10, #0, #1 // encoding: [0x49,0x01,0x00,0x33] +// CHECK: bfi x2, x3, #63, #1 // encoding: [0x62,0x00,0x41,0xb3] +// CHECK: bfi x19, x20, #0, #64 // encoding: [0x93,0xfe,0x40,0xb3] +// CHECK: bfi x9, x10, #5, #59 // encoding: [0x49,0xe9,0x7b,0xb3] +// CHECK: bfi w9, w10, #0, #32 // encoding: [0x49,0x7d,0x00,0x33] +// CHECK: bfi w11, w12, #31, #1 // encoding: [0x8b,0x01,0x01,0x33] +// CHECK: bfi w13, w14, #29, #3 // encoding: [0xcd,0x09,0x03,0x33] +// CHECK: bfi xzr, xzr, #10, #11 // encoding: [0xff,0x2b,0x76,0xb3] + + bfxil w9, 
w10, #0, #1 + bfxil x2, x3, #63, #1 + bfxil x19, x20, #0, #64 + bfxil x9, x10, #5, #59 + bfxil w9, w10, #0, #32 + bfxil w11, w12, #31, #1 + bfxil w13, w14, #29, #3 + bfxil xzr, xzr, #10, #11 +// CHECK: bfxil w9, w10, #0, #1 // encoding: [0x49,0x01,0x00,0x33] +// CHECK: bfxil x2, x3, #63, #1 // encoding: [0x62,0xfc,0x7f,0xb3] +// CHECK: bfxil x19, x20, #0, #64 // encoding: [0x93,0xfe,0x40,0xb3] +// CHECK: bfxil x9, x10, #5, #59 // encoding: [0x49,0xfd,0x45,0xb3] +// CHECK: bfxil w9, w10, #0, #32 // encoding: [0x49,0x7d,0x00,0x33] +// CHECK: bfxil w11, w12, #31, #1 // encoding: [0x8b,0x7d,0x1f,0x33] +// CHECK: bfxil w13, w14, #29, #3 // encoding: [0xcd,0x7d,0x1d,0x33] +// CHECK: bfxil xzr, xzr, #10, #11 // encoding: [0xff,0x53,0x4a,0xb3] + + ubfiz w9, w10, #0, #1 + ubfiz x2, x3, #63, #1 + ubfiz x19, x20, #0, #64 + ubfiz x9, x10, #5, #59 + ubfiz w9, w10, #0, #32 + ubfiz w11, w12, #31, #1 + ubfiz w13, w14, #29, #3 + ubfiz xzr, xzr, #10, #11 +// CHECK: ubfiz w9, w10, #0, #1 // encoding: [0x49,0x01,0x00,0x53] +// CHECK: ubfiz x2, x3, #63, #1 // encoding: [0x62,0x00,0x41,0xd3] +// CHECK: ubfiz x19, x20, #0, #64 // encoding: [0x93,0xfe,0x40,0xd3] +// CHECK: ubfiz x9, x10, #5, #59 // encoding: [0x49,0xe9,0x7b,0xd3] +// CHECK: ubfiz w9, w10, #0, #32 // encoding: [0x49,0x7d,0x00,0x53] +// CHECK: ubfiz w11, w12, #31, #1 // encoding: [0x8b,0x01,0x01,0x53] +// CHECK: ubfiz w13, w14, #29, #3 // encoding: [0xcd,0x09,0x03,0x53] +// CHECK: ubfiz xzr, xzr, #10, #11 // encoding: [0xff,0x2b,0x76,0xd3] + + ubfx w9, w10, #0, #1 + ubfx x2, x3, #63, #1 + ubfx x19, x20, #0, #64 + ubfx x9, x10, #5, #59 + ubfx w9, w10, #0, #32 + ubfx w11, w12, #31, #1 + ubfx w13, w14, #29, #3 + ubfx xzr, xzr, #10, #11 +// CHECK: ubfx w9, w10, #0, #1 // encoding: [0x49,0x01,0x00,0x53] +// CHECK: ubfx x2, x3, #63, #1 // encoding: [0x62,0xfc,0x7f,0xd3] +// CHECK: ubfx x19, x20, #0, #64 // encoding: [0x93,0xfe,0x40,0xd3] +// CHECK: ubfx x9, x10, #5, #59 // encoding: [0x49,0xfd,0x45,0xd3] +// CHECK: ubfx w9, w10, #0, #32 // encoding: [0x49,0x7d,0x00,0x53] +// CHECK: ubfx w11, w12, #31, #1 // encoding: [0x8b,0x7d,0x1f,0x53] +// CHECK: ubfx w13, w14, #29, #3 // encoding: [0xcd,0x7d,0x1d,0x53] +// CHECK: ubfx xzr, xzr, #10, #11 // encoding: [0xff,0x53,0x4a,0xd3] + +//------------------------------------------------------------------------------ +// Compare & branch (immediate) +//------------------------------------------------------------------------------ + + cbz w5, lbl + cbz x5, lbl + cbnz x2, lbl + cbnz x26, lbl +// CHECK: cbz w5, lbl // encoding: [0x05'A',A,A,0x34'A'] +// CHECK: // fixup A - offset: 0, value: lbl, kind: fixup_a64_condbr +// CHECK: cbz x5, lbl // encoding: [0x05'A',A,A,0xb4'A'] +// CHECK: // fixup A - offset: 0, value: lbl, kind: fixup_a64_condbr +// CHECK: cbnz x2, lbl // encoding: [0x02'A',A,A,0xb5'A'] +// CHECK: // fixup A - offset: 0, value: lbl, kind: fixup_a64_condbr +// CHECK: cbnz x26, lbl // encoding: [0x1a'A',A,A,0xb5'A'] +// CHECK: // fixup A - offset: 0, value: lbl, kind: fixup_a64_condbr + + cbz wzr, lbl + cbnz xzr, lbl +// CHECK: cbz wzr, lbl // encoding: [0x1f'A',A,A,0x34'A'] +// CHECK: // fixup A - offset: 0, value: lbl, kind: fixup_a64_condbr +// CHECK: cbnz xzr, lbl // encoding: [0x1f'A',A,A,0xb5'A'] +// CHECK: // fixup A - offset: 0, value: lbl, kind: fixup_a64_condbr + + cbz w5, #0 + cbnz x3, #-4 + cbz w20, #1048572 + cbnz xzr, #-1048576 +// CHECK: cbz w5, #0 // encoding: [0x05,0x00,0x00,0x34] +// CHECK: cbnz x3, #-4 // encoding: [0xe3,0xff,0xff,0xb5] +// CHECK: cbz w20, #1048572 // encoding: 
[0xf4,0xff,0x7f,0x34] +// CHECK: cbnz xzr, #-1048576 // encoding: [0x1f,0x00,0x80,0xb5] + +//------------------------------------------------------------------------------ +// Conditional branch (immediate) +//------------------------------------------------------------------------------ + + b.eq lbl + b.ne lbl + b.cs lbl + b.hs lbl + b.lo lbl + b.cc lbl + b.mi lbl + b.pl lbl + b.vs lbl + b.vc lbl + b.hi lbl + b.ls lbl + b.ge lbl + b.lt lbl + b.gt lbl + b.le lbl + b.al lbl +// CHECK: b.eq lbl // encoding: [A,A,A,0x54'A'] +// CHECK: // fixup A - offset: 0, value: lbl, kind: fixup_a64_condbr +// CHECK: b.ne lbl // encoding: [0x01'A',A,A,0x54'A'] +// CHECK: // fixup A - offset: 0, value: lbl, kind: fixup_a64_condbr +// CHECK: b.hs lbl // encoding: [0x02'A',A,A,0x54'A'] +// CHECK: // fixup A - offset: 0, value: lbl, kind: fixup_a64_condbr +// CHECK: b.hs lbl // encoding: [0x02'A',A,A,0x54'A'] +// CHECK: // fixup A - offset: 0, value: lbl, kind: fixup_a64_condbr +// CHECK: b.lo lbl // encoding: [0x03'A',A,A,0x54'A'] +// CHECK: // fixup A - offset: 0, value: lbl, kind: fixup_a64_condbr +// CHECK: b.lo lbl // encoding: [0x03'A',A,A,0x54'A'] +// CHECK: // fixup A - offset: 0, value: lbl, kind: fixup_a64_condbr +// CHECK: b.mi lbl // encoding: [0x04'A',A,A,0x54'A'] +// CHECK: // fixup A - offset: 0, value: lbl, kind: fixup_a64_condbr +// CHECK: b.pl lbl // encoding: [0x05'A',A,A,0x54'A'] +// CHECK: // fixup A - offset: 0, value: lbl, kind: fixup_a64_condbr +// CHECK: b.vs lbl // encoding: [0x06'A',A,A,0x54'A'] +// CHECK: // fixup A - offset: 0, value: lbl, kind: fixup_a64_condbr +// CHECK: b.vc lbl // encoding: [0x07'A',A,A,0x54'A'] +// CHECK: // fixup A - offset: 0, value: lbl, kind: fixup_a64_condbr +// CHECK: b.hi lbl // encoding: [0x08'A',A,A,0x54'A'] +// CHECK: // fixup A - offset: 0, value: lbl, kind: fixup_a64_condbr +// CHECK: b.ls lbl // encoding: [0x09'A',A,A,0x54'A'] +// CHECK: // fixup A - offset: 0, value: lbl, kind: fixup_a64_condbr +// CHECK: b.ge lbl // encoding: [0x0a'A',A,A,0x54'A'] +// CHECK: // fixup A - offset: 0, value: lbl, kind: fixup_a64_condbr +// CHECK: b.lt lbl // encoding: [0x0b'A',A,A,0x54'A'] +// CHECK: // fixup A - offset: 0, value: lbl, kind: fixup_a64_condbr +// CHECK: b.gt lbl // encoding: [0x0c'A',A,A,0x54'A'] +// CHECK: // fixup A - offset: 0, value: lbl, kind: fixup_a64_condbr +// CHECK: b.le lbl // encoding: [0x0d'A',A,A,0x54'A'] +// CHECK: // fixup A - offset: 0, value: lbl, kind: fixup_a64_condbr +// CHECK: b.al lbl // encoding: [0x0e'A',A,A,0x54'A'] +// CHECK: // fixup A - offset: 0, value: lbl, kind: fixup_a64_condbr + + b.eq #0 + b.lt #-4 + b.cc #1048572 +// CHECK: b.eq #0 // encoding: [0x00,0x00,0x00,0x54] +// CHECK: b.lt #-4 // encoding: [0xeb,0xff,0xff,0x54] +// CHECK: b.lo #1048572 // encoding: [0xe3,0xff,0x7f,0x54] + +//------------------------------------------------------------------------------ +// Conditional compare (immediate) +//------------------------------------------------------------------------------ + + ccmp w1, #31, #0, eq + ccmp w3, #0, #15, hs + ccmp wzr, #15, #13, cs +// CHECK: ccmp w1, #31, #0, eq // encoding: [0x20,0x08,0x5f,0x7a] +// CHECK: ccmp w3, #0, #15, hs // encoding: [0x6f,0x28,0x40,0x7a] +// CHECK: ccmp wzr, #15, #13, hs // encoding: [0xed,0x2b,0x4f,0x7a] + + ccmp x9, #31, #0, le + ccmp x3, #0, #15, gt + ccmp xzr, #5, #7, ne +// CHECK: ccmp x9, #31, #0, le // encoding: [0x20,0xd9,0x5f,0xfa] +// CHECK: ccmp x3, #0, #15, gt // encoding: [0x6f,0xc8,0x40,0xfa] +// CHECK: ccmp xzr, #5, #7, ne // encoding: [0xe7,0x1b,0x45,0xfa] 
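+
+// For reference, a sketch of the conditional compare (immediate) behaviour and
+// layout (per the ARM ARM; commentary only, not checked output): if cond holds,
+// NZCV is set as for "subs wzr, Wn, #imm5"; otherwise NZCV is loaded from the
+// literal #nzcv field. The word is laid out sf op S 11010010 imm5 cond 1 0 Rn 0 nzcv,
+// so [0x20,0x08,0x5f,0x7a] above is
+//   0x7a5f0820 = 0 1 1 11010010 11111 0000 1 0 00001 0 0000
+// i.e. "ccmp w1, #31, #0, eq". ccmn uses the same layout with op=0.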
+ + ccmn w1, #31, #0, eq + ccmn w3, #0, #15, hs + ccmn wzr, #15, #13, cs +// CHECK: ccmn w1, #31, #0, eq // encoding: [0x20,0x08,0x5f,0x3a] +// CHECK: ccmn w3, #0, #15, hs // encoding: [0x6f,0x28,0x40,0x3a] +// CHECK: ccmn wzr, #15, #13, hs // encoding: [0xed,0x2b,0x4f,0x3a] + + ccmn x9, #31, #0, le + ccmn x3, #0, #15, gt + ccmn xzr, #5, #7, ne +// CHECK: ccmn x9, #31, #0, le // encoding: [0x20,0xd9,0x5f,0xba] +// CHECK: ccmn x3, #0, #15, gt // encoding: [0x6f,0xc8,0x40,0xba] +// CHECK: ccmn xzr, #5, #7, ne // encoding: [0xe7,0x1b,0x45,0xba] + +//------------------------------------------------------------------------------ +// Conditional compare (register) +//------------------------------------------------------------------------------ + + ccmp w1, wzr, #0, eq + ccmp w3, w0, #15, hs + ccmp wzr, w15, #13, cs +// CHECK: ccmp w1, wzr, #0, eq // encoding: [0x20,0x00,0x5f,0x7a] +// CHECK: ccmp w3, w0, #15, hs // encoding: [0x6f,0x20,0x40,0x7a] +// CHECK: ccmp wzr, w15, #13, hs // encoding: [0xed,0x23,0x4f,0x7a] + + ccmp x9, xzr, #0, le + ccmp x3, x0, #15, gt + ccmp xzr, x5, #7, ne +// CHECK: ccmp x9, xzr, #0, le // encoding: [0x20,0xd1,0x5f,0xfa] +// CHECK: ccmp x3, x0, #15, gt // encoding: [0x6f,0xc0,0x40,0xfa] +// CHECK: ccmp xzr, x5, #7, ne // encoding: [0xe7,0x13,0x45,0xfa] + + ccmn w1, wzr, #0, eq + ccmn w3, w0, #15, hs + ccmn wzr, w15, #13, cs +// CHECK: ccmn w1, wzr, #0, eq // encoding: [0x20,0x00,0x5f,0x3a] +// CHECK: ccmn w3, w0, #15, hs // encoding: [0x6f,0x20,0x40,0x3a] +// CHECK: ccmn wzr, w15, #13, hs // encoding: [0xed,0x23,0x4f,0x3a] + + ccmn x9, xzr, #0, le + ccmn x3, x0, #15, gt + ccmn xzr, x5, #7, ne +// CHECK: ccmn x9, xzr, #0, le // encoding: [0x20,0xd1,0x5f,0xba] +// CHECK: ccmn x3, x0, #15, gt // encoding: [0x6f,0xc0,0x40,0xba] +// CHECK: ccmn xzr, x5, #7, ne // encoding: [0xe7,0x13,0x45,0xba] + +//------------------------------------------------------------------------------ +// Conditional select +//------------------------------------------------------------------------------ + csel w1, w0, w19, ne + csel wzr, w5, w9, eq + csel w9, wzr, w30, gt + csel w1, w28, wzr, mi +// CHECK: csel w1, w0, w19, ne // encoding: [0x01,0x10,0x93,0x1a] +// CHECK: csel wzr, w5, w9, eq // encoding: [0xbf,0x00,0x89,0x1a] +// CHECK: csel w9, wzr, w30, gt // encoding: [0xe9,0xc3,0x9e,0x1a] +// CHECK: csel w1, w28, wzr, mi // encoding: [0x81,0x43,0x9f,0x1a] + + csel x19, x23, x29, lt + csel xzr, x3, x4, ge + csel x5, xzr, x6, cs + csel x7, x8, xzr, cc +// CHECK: csel x19, x23, x29, lt // encoding: [0xf3,0xb2,0x9d,0x9a] +// CHECK: csel xzr, x3, x4, ge // encoding: [0x7f,0xa0,0x84,0x9a] +// CHECK: csel x5, xzr, x6, hs // encoding: [0xe5,0x23,0x86,0x9a] +// CHECK: csel x7, x8, xzr, lo // encoding: [0x07,0x31,0x9f,0x9a] + + csinc w1, w0, w19, ne + csinc wzr, w5, w9, eq + csinc w9, wzr, w30, gt + csinc w1, w28, wzr, mi +// CHECK: csinc w1, w0, w19, ne // encoding: [0x01,0x14,0x93,0x1a] +// CHECK: csinc wzr, w5, w9, eq // encoding: [0xbf,0x04,0x89,0x1a] +// CHECK: csinc w9, wzr, w30, gt // encoding: [0xe9,0xc7,0x9e,0x1a] +// CHECK: csinc w1, w28, wzr, mi // encoding: [0x81,0x47,0x9f,0x1a] + + csinc x19, x23, x29, lt + csinc xzr, x3, x4, ge + csinc x5, xzr, x6, cs + csinc x7, x8, xzr, cc +// CHECK: csinc x19, x23, x29, lt // encoding: [0xf3,0xb6,0x9d,0x9a] +// CHECK: csinc xzr, x3, x4, ge // encoding: [0x7f,0xa4,0x84,0x9a] +// CHECK: csinc x5, xzr, x6, hs // encoding: [0xe5,0x27,0x86,0x9a] +// CHECK: csinc x7, x8, xzr, lo // encoding: [0x07,0x35,0x9f,0x9a] + + csinv w1, w0, w19, ne + csinv wzr, w5, 
w9, eq + csinv w9, wzr, w30, gt + csinv w1, w28, wzr, mi +// CHECK: csinv w1, w0, w19, ne // encoding: [0x01,0x10,0x93,0x5a] +// CHECK: csinv wzr, w5, w9, eq // encoding: [0xbf,0x00,0x89,0x5a] +// CHECK: csinv w9, wzr, w30, gt // encoding: [0xe9,0xc3,0x9e,0x5a] +// CHECK: csinv w1, w28, wzr, mi // encoding: [0x81,0x43,0x9f,0x5a] + + csinv x19, x23, x29, lt + csinv xzr, x3, x4, ge + csinv x5, xzr, x6, cs + csinv x7, x8, xzr, cc +// CHECK: csinv x19, x23, x29, lt // encoding: [0xf3,0xb2,0x9d,0xda] +// CHECK: csinv xzr, x3, x4, ge // encoding: [0x7f,0xa0,0x84,0xda] +// CHECK: csinv x5, xzr, x6, hs // encoding: [0xe5,0x23,0x86,0xda] +// CHECK: csinv x7, x8, xzr, lo // encoding: [0x07,0x31,0x9f,0xda] + + csneg w1, w0, w19, ne + csneg wzr, w5, w9, eq + csneg w9, wzr, w30, gt + csneg w1, w28, wzr, mi +// CHECK: csneg w1, w0, w19, ne // encoding: [0x01,0x14,0x93,0x5a] +// CHECK: csneg wzr, w5, w9, eq // encoding: [0xbf,0x04,0x89,0x5a] +// CHECK: csneg w9, wzr, w30, gt // encoding: [0xe9,0xc7,0x9e,0x5a] +// CHECK: csneg w1, w28, wzr, mi // encoding: [0x81,0x47,0x9f,0x5a] + + csneg x19, x23, x29, lt + csneg xzr, x3, x4, ge + csneg x5, xzr, x6, cs + csneg x7, x8, xzr, cc +// CHECK: csneg x19, x23, x29, lt // encoding: [0xf3,0xb6,0x9d,0xda] +// CHECK: csneg xzr, x3, x4, ge // encoding: [0x7f,0xa4,0x84,0xda] +// CHECK: csneg x5, xzr, x6, hs // encoding: [0xe5,0x27,0x86,0xda] +// CHECK: csneg x7, x8, xzr, lo // encoding: [0x07,0x35,0x9f,0xda] + + cset w3, eq + cset x9, pl +// CHECK: csinc w3, wzr, wzr, ne // encoding: [0xe3,0x17,0x9f,0x1a] +// CHECK: csinc x9, xzr, xzr, mi // encoding: [0xe9,0x47,0x9f,0x9a] + + csetm w20, ne + csetm x30, ge +// CHECK: csinv w20, wzr, wzr, eq // encoding: [0xf4,0x03,0x9f,0x5a] +// CHECK: csinv x30, xzr, xzr, lt // encoding: [0xfe,0xb3,0x9f,0xda] + + cinc w3, w5, gt + cinc wzr, w4, le + cinc w9, wzr, lt +// CHECK: csinc w3, w5, w5, le // encoding: [0xa3,0xd4,0x85,0x1a] +// CHECK: csinc wzr, w4, w4, gt // encoding: [0x9f,0xc4,0x84,0x1a] +// CHECK: csinc w9, wzr, wzr, ge // encoding: [0xe9,0xa7,0x9f,0x1a] + + cinc x3, x5, gt + cinc xzr, x4, le + cinc x9, xzr, lt +// CHECK: csinc x3, x5, x5, le // encoding: [0xa3,0xd4,0x85,0x9a] +// CHECK: csinc xzr, x4, x4, gt // encoding: [0x9f,0xc4,0x84,0x9a] +// CHECK: csinc x9, xzr, xzr, ge // encoding: [0xe9,0xa7,0x9f,0x9a] + + cinv w3, w5, gt + cinv wzr, w4, le + cinv w9, wzr, lt +// CHECK: csinv w3, w5, w5, le // encoding: [0xa3,0xd0,0x85,0x5a] +// CHECK: csinv wzr, w4, w4, gt // encoding: [0x9f,0xc0,0x84,0x5a] +// CHECK: csinv w9, wzr, wzr, ge // encoding: [0xe9,0xa3,0x9f,0x5a] + + cinv x3, x5, gt + cinv xzr, x4, le + cinv x9, xzr, lt +// CHECK: csinv x3, x5, x5, le // encoding: [0xa3,0xd0,0x85,0xda] +// CHECK: csinv xzr, x4, x4, gt // encoding: [0x9f,0xc0,0x84,0xda] +// CHECK: csinv x9, xzr, xzr, ge // encoding: [0xe9,0xa3,0x9f,0xda] + + cneg w3, w5, gt + cneg wzr, w4, le + cneg w9, wzr, lt +// CHECK: csneg w3, w5, w5, le // encoding: [0xa3,0xd4,0x85,0x5a] +// CHECK: csneg wzr, w4, w4, gt // encoding: [0x9f,0xc4,0x84,0x5a] +// CHECK: csneg w9, wzr, wzr, ge // encoding: [0xe9,0xa7,0x9f,0x5a] + + cneg x3, x5, gt + cneg xzr, x4, le + cneg x9, xzr, lt +// CHECK: csneg x3, x5, x5, le // encoding: [0xa3,0xd4,0x85,0xda] +// CHECK: csneg xzr, x4, x4, gt // encoding: [0x9f,0xc4,0x84,0xda] +// CHECK: csneg x9, xzr, xzr, ge // encoding: [0xe9,0xa7,0x9f,0xda] + +//------------------------------------------------------------------------------ +// Data-processing (1 source) 
+//------------------------------------------------------------------------------ + + rbit w0, w7 + rbit x18, x3 + rev16 w17, w1 + rev16 x5, x2 + rev w18, w0 + rev32 x20, x1 + rev32 x20, xzr +// CHECK: rbit w0, w7 // encoding: [0xe0,0x00,0xc0,0x5a] +// CHECK: rbit x18, x3 // encoding: [0x72,0x00,0xc0,0xda] +// CHECK: rev16 w17, w1 // encoding: [0x31,0x04,0xc0,0x5a] +// CHECK: rev16 x5, x2 // encoding: [0x45,0x04,0xc0,0xda] +// CHECK: rev w18, w0 // encoding: [0x12,0x08,0xc0,0x5a] +// CHECK: rev32 x20, x1 // encoding: [0x34,0x08,0xc0,0xda] +// CHECK: rev32 x20, xzr // encoding: [0xf4,0x0b,0xc0,0xda] + + rev x22, x2 + rev x18, xzr + rev w7, wzr + clz w24, w3 + clz x26, x4 + cls w3, w5 + cls x20, x5 +// CHECK: rev x22, x2 // encoding: [0x56,0x0c,0xc0,0xda] +// CHECK: rev x18, xzr // encoding: [0xf2,0x0f,0xc0,0xda] +// CHECK: rev w7, wzr // encoding: [0xe7,0x0b,0xc0,0x5a] +// CHECK: clz w24, w3 // encoding: [0x78,0x10,0xc0,0x5a] +// CHECK: clz x26, x4 // encoding: [0x9a,0x10,0xc0,0xda] +// CHECK: cls w3, w5 // encoding: [0xa3,0x14,0xc0,0x5a] +// CHECK: cls x20, x5 // encoding: [0xb4,0x14,0xc0,0xda] + + clz w24, wzr + rev x22, xzr +// CHECK: clz w24, wzr // encoding: [0xf8,0x13,0xc0,0x5a] +// CHECK: rev x22, xzr // encoding: [0xf6,0x0f,0xc0,0xda] + +//------------------------------------------------------------------------------ +// Data-processing (2 source) +//------------------------------------------------------------------------------ + + crc32b w5, w7, w20 + crc32h w28, wzr, w30 + crc32w w0, w1, w2 + crc32x w7, w9, x20 + crc32cb w9, w5, w4 + crc32ch w13, w17, w25 + crc32cw wzr, w3, w5 + crc32cx w18, w16, xzr +// CHECK: crc32b w5, w7, w20 // encoding: [0xe5,0x40,0xd4,0x1a] +// CHECK: crc32h w28, wzr, w30 // encoding: [0xfc,0x47,0xde,0x1a] +// CHECK: crc32w w0, w1, w2 // encoding: [0x20,0x48,0xc2,0x1a] +// CHECK: crc32x w7, w9, x20 // encoding: [0x27,0x4d,0xd4,0x9a] +// CHECK: crc32cb w9, w5, w4 // encoding: [0xa9,0x50,0xc4,0x1a] +// CHECK: crc32ch w13, w17, w25 // encoding: [0x2d,0x56,0xd9,0x1a] +// CHECK: crc32cw wzr, w3, w5 // encoding: [0x7f,0x58,0xc5,0x1a] +// CHECK: crc32cx w18, w16, xzr // encoding: [0x12,0x5e,0xdf,0x9a] + + udiv w0, w7, w10 + udiv x9, x22, x4 + sdiv w12, w21, w0 + sdiv x13, x2, x1 + lslv w11, w12, w13 + lslv x14, x15, x16 + lsrv w17, w18, w19 + lsrv x20, x21, x22 + asrv w23, w24, w25 + asrv x26, x27, x28 + rorv w0, w1, w2 + rorv x3, x4, x5 + + +// CHECK: udiv w0, w7, w10 // encoding: [0xe0,0x08,0xca,0x1a] +// CHECK: udiv x9, x22, x4 // encoding: [0xc9,0x0a,0xc4,0x9a] +// CHECK: sdiv w12, w21, w0 // encoding: [0xac,0x0e,0xc0,0x1a] +// CHECK: sdiv x13, x2, x1 // encoding: [0x4d,0x0c,0xc1,0x9a] +// CHECK: lsl w11, w12, w13 // encoding: [0x8b,0x21,0xcd,0x1a] +// CHECK: lsl x14, x15, x16 // encoding: [0xee,0x21,0xd0,0x9a] +// CHECK: lsr w17, w18, w19 // encoding: [0x51,0x26,0xd3,0x1a] +// CHECK: lsr x20, x21, x22 // encoding: [0xb4,0x26,0xd6,0x9a] +// CHECK: asr w23, w24, w25 // encoding: [0x17,0x2b,0xd9,0x1a] +// CHECK: asr x26, x27, x28 // encoding: [0x7a,0x2b,0xdc,0x9a] +// CHECK: ror w0, w1, w2 // encoding: [0x20,0x2c,0xc2,0x1a] +// CHECK: ror x3, x4, x5 // encoding: [0x83,0x2c,0xc5,0x9a] + + + lsl w6, w7, w8 + lsl x9, x10, x11 + lsr w12, w13, w14 + lsr x15, x16, x17 + asr w18, w19, w20 + asr x21, x22, x23 + ror w24, w25, w26 + ror x27, x28, x29 +// CHECK: lsl w6, w7, w8 // encoding: [0xe6,0x20,0xc8,0x1a] +// CHECK: lsl x9, x10, x11 // encoding: [0x49,0x21,0xcb,0x9a] +// CHECK: lsr w12, w13, w14 // encoding: [0xac,0x25,0xce,0x1a] +// CHECK: lsr x15, x16, x17 // 
encoding: [0x0f,0x26,0xd1,0x9a] +// CHECK: asr w18, w19, w20 // encoding: [0x72,0x2a,0xd4,0x1a] +// CHECK: asr x21, x22, x23 // encoding: [0xd5,0x2a,0xd7,0x9a] +// CHECK: ror w24, w25, w26 // encoding: [0x38,0x2f,0xda,0x1a] +// CHECK: ror x27, x28, x29 // encoding: [0x9b,0x2f,0xdd,0x9a] + + madd w1, w3, w7, w4 + madd wzr, w0, w9, w11 + madd w13, wzr, w4, w4 + madd w19, w30, wzr, w29 + madd w4, w5, w6, wzr +// CHECK: madd w1, w3, w7, w4 // encoding: [0x61,0x10,0x07,0x1b] +// CHECK: madd wzr, w0, w9, w11 // encoding: [0x1f,0x2c,0x09,0x1b] +// CHECK: madd w13, wzr, w4, w4 // encoding: [0xed,0x13,0x04,0x1b] +// CHECK: madd w19, w30, wzr, w29 // encoding: [0xd3,0x77,0x1f,0x1b] +// CHECK: mul w4, w5, w6 // encoding: [0xa4,0x7c,0x06,0x1b] + + madd x1, x3, x7, x4 + madd xzr, x0, x9, x11 + madd x13, xzr, x4, x4 + madd x19, x30, xzr, x29 + madd x4, x5, x6, xzr +// CHECK: madd x1, x3, x7, x4 // encoding: [0x61,0x10,0x07,0x9b] +// CHECK: madd xzr, x0, x9, x11 // encoding: [0x1f,0x2c,0x09,0x9b] +// CHECK: madd x13, xzr, x4, x4 // encoding: [0xed,0x13,0x04,0x9b] +// CHECK: madd x19, x30, xzr, x29 // encoding: [0xd3,0x77,0x1f,0x9b] +// CHECK: mul x4, x5, x6 // encoding: [0xa4,0x7c,0x06,0x9b] + + msub w1, w3, w7, w4 + msub wzr, w0, w9, w11 + msub w13, wzr, w4, w4 + msub w19, w30, wzr, w29 + msub w4, w5, w6, wzr +// CHECK: msub w1, w3, w7, w4 // encoding: [0x61,0x90,0x07,0x1b] +// CHECK: msub wzr, w0, w9, w11 // encoding: [0x1f,0xac,0x09,0x1b] +// CHECK: msub w13, wzr, w4, w4 // encoding: [0xed,0x93,0x04,0x1b] +// CHECK: msub w19, w30, wzr, w29 // encoding: [0xd3,0xf7,0x1f,0x1b] +// CHECK: mneg w4, w5, w6 // encoding: [0xa4,0xfc,0x06,0x1b] + + msub x1, x3, x7, x4 + msub xzr, x0, x9, x11 + msub x13, xzr, x4, x4 + msub x19, x30, xzr, x29 + msub x4, x5, x6, xzr +// CHECK: msub x1, x3, x7, x4 // encoding: [0x61,0x90,0x07,0x9b] +// CHECK: msub xzr, x0, x9, x11 // encoding: [0x1f,0xac,0x09,0x9b] +// CHECK: msub x13, xzr, x4, x4 // encoding: [0xed,0x93,0x04,0x9b] +// CHECK: msub x19, x30, xzr, x29 // encoding: [0xd3,0xf7,0x1f,0x9b] +// CHECK: mneg x4, x5, x6 // encoding: [0xa4,0xfc,0x06,0x9b] + + smaddl x3, w5, w2, x9 + smaddl xzr, w10, w11, x12 + smaddl x13, wzr, w14, x15 + smaddl x16, w17, wzr, x18 + smaddl x19, w20, w21, xzr +// CHECK: smaddl x3, w5, w2, x9 // encoding: [0xa3,0x24,0x22,0x9b] +// CHECK: smaddl xzr, w10, w11, x12 // encoding: [0x5f,0x31,0x2b,0x9b] +// CHECK: smaddl x13, wzr, w14, x15 // encoding: [0xed,0x3f,0x2e,0x9b] +// CHECK: smaddl x16, w17, wzr, x18 // encoding: [0x30,0x4a,0x3f,0x9b] +// CHECK: smull x19, w20, w21 // encoding: [0x93,0x7e,0x35,0x9b] + + smsubl x3, w5, w2, x9 + smsubl xzr, w10, w11, x12 + smsubl x13, wzr, w14, x15 + smsubl x16, w17, wzr, x18 + smsubl x19, w20, w21, xzr +// CHECK: smsubl x3, w5, w2, x9 // encoding: [0xa3,0xa4,0x22,0x9b] +// CHECK: smsubl xzr, w10, w11, x12 // encoding: [0x5f,0xb1,0x2b,0x9b] +// CHECK: smsubl x13, wzr, w14, x15 // encoding: [0xed,0xbf,0x2e,0x9b] +// CHECK: smsubl x16, w17, wzr, x18 // encoding: [0x30,0xca,0x3f,0x9b] +// CHECK: smnegl x19, w20, w21 // encoding: [0x93,0xfe,0x35,0x9b] + + umaddl x3, w5, w2, x9 + umaddl xzr, w10, w11, x12 + umaddl x13, wzr, w14, x15 + umaddl x16, w17, wzr, x18 + umaddl x19, w20, w21, xzr +// CHECK: umaddl x3, w5, w2, x9 // encoding: [0xa3,0x24,0xa2,0x9b] +// CHECK: umaddl xzr, w10, w11, x12 // encoding: [0x5f,0x31,0xab,0x9b] +// CHECK: umaddl x13, wzr, w14, x15 // encoding: [0xed,0x3f,0xae,0x9b] +// CHECK: umaddl x16, w17, wzr, x18 // encoding: [0x30,0x4a,0xbf,0x9b] +// CHECK: umull x19, w20, w21 // encoding: 
[0x93,0x7e,0xb5,0x9b] + + + + umsubl x3, w5, w2, x9 + umsubl xzr, w10, w11, x12 + umsubl x13, wzr, w14, x15 + umsubl x16, w17, wzr, x18 + umsubl x19, w20, w21, xzr +// CHECK: umsubl x3, w5, w2, x9 // encoding: [0xa3,0xa4,0xa2,0x9b] +// CHECK: umsubl xzr, w10, w11, x12 // encoding: [0x5f,0xb1,0xab,0x9b] +// CHECK: umsubl x13, wzr, w14, x15 // encoding: [0xed,0xbf,0xae,0x9b] +// CHECK: umsubl x16, w17, wzr, x18 // encoding: [0x30,0xca,0xbf,0x9b] +// CHECK: umnegl x19, w20, w21 // encoding: [0x93,0xfe,0xb5,0x9b] + + smulh x30, x29, x28 + smulh xzr, x27, x26 + smulh x25, xzr, x24 + smulh x23, x22, xzr +// CHECK: smulh x30, x29, x28 // encoding: [0xbe,0x7f,0x5c,0x9b] +// CHECK: smulh xzr, x27, x26 // encoding: [0x7f,0x7f,0x5a,0x9b] +// CHECK: smulh x25, xzr, x24 // encoding: [0xf9,0x7f,0x58,0x9b] +// CHECK: smulh x23, x22, xzr // encoding: [0xd7,0x7e,0x5f,0x9b] + + umulh x30, x29, x28 + umulh xzr, x27, x26 + umulh x25, xzr, x24 + umulh x23, x22, xzr +// CHECK: umulh x30, x29, x28 // encoding: [0xbe,0x7f,0xdc,0x9b] +// CHECK: umulh xzr, x27, x26 // encoding: [0x7f,0x7f,0xda,0x9b] +// CHECK: umulh x25, xzr, x24 // encoding: [0xf9,0x7f,0xd8,0x9b] +// CHECK: umulh x23, x22, xzr // encoding: [0xd7,0x7e,0xdf,0x9b] + + mul w3, w4, w5 + mul wzr, w6, w7 + mul w8, wzr, w9 + mul w10, w11, wzr + + mul x12, x13, x14 + mul xzr, x15, x16 + mul x17, xzr, x18 + mul x19, x20, xzr + + mneg w21, w22, w23 + mneg wzr, w24, w25 + mneg w26, wzr, w27 + mneg w28, w29, wzr + + smull x11, w13, w17 + umull x11, w13, w17 + smnegl x11, w13, w17 + umnegl x11, w13, w17 +// CHECK: mul w3, w4, w5 // encoding: [0x83,0x7c,0x05,0x1b] +// CHECK: mul wzr, w6, w7 // encoding: [0xdf,0x7c,0x07,0x1b] +// CHECK: mul w8, wzr, w9 // encoding: [0xe8,0x7f,0x09,0x1b] +// CHECK: mul w10, w11, wzr // encoding: [0x6a,0x7d,0x1f,0x1b] +// CHECK: mul x12, x13, x14 // encoding: [0xac,0x7d,0x0e,0x9b] +// CHECK: mul xzr, x15, x16 // encoding: [0xff,0x7d,0x10,0x9b] +// CHECK: mul x17, xzr, x18 // encoding: [0xf1,0x7f,0x12,0x9b] +// CHECK: mul x19, x20, xzr // encoding: [0x93,0x7e,0x1f,0x9b] +// CHECK: mneg w21, w22, w23 // encoding: [0xd5,0xfe,0x17,0x1b] +// CHECK: mneg wzr, w24, w25 // encoding: [0x1f,0xff,0x19,0x1b] +// CHECK: mneg w26, wzr, w27 // encoding: [0xfa,0xff,0x1b,0x1b] +// CHECK: mneg w28, w29, wzr // encoding: [0xbc,0xff,0x1f,0x1b] +// CHECK: smull x11, w13, w17 // encoding: [0xab,0x7d,0x31,0x9b] +// CHECK: umull x11, w13, w17 // encoding: [0xab,0x7d,0xb1,0x9b] +// CHECK: smnegl x11, w13, w17 // encoding: [0xab,0xfd,0x31,0x9b] +// CHECK: umnegl x11, w13, w17 // encoding: [0xab,0xfd,0xb1,0x9b] + +//------------------------------------------------------------------------------ +// Exception generation +//------------------------------------------------------------------------------ + svc #0 + svc #65535 +// CHECK: svc #0 // encoding: [0x01,0x00,0x00,0xd4] +// CHECK: svc #65535 // encoding: [0xe1,0xff,0x1f,0xd4] + + hvc #1 + smc #12000 + brk #12 + hlt #123 +// CHECK: hvc #1 // encoding: [0x22,0x00,0x00,0xd4] +// CHECK: smc #12000 // encoding: [0x03,0xdc,0x05,0xd4] +// CHECK: brk #12 // encoding: [0x80,0x01,0x20,0xd4] +// CHECK: hlt #123 // encoding: [0x60,0x0f,0x40,0xd4] + + dcps1 #42 + dcps2 #9 + dcps3 #1000 +// CHECK: dcps1 #42 // encoding: [0x41,0x05,0xa0,0xd4] +// CHECK: dcps2 #9 // encoding: [0x22,0x01,0xa0,0xd4] +// CHECK: dcps3 #1000 // encoding: [0x03,0x7d,0xa0,0xd4] + + dcps1 + dcps2 + dcps3 +// CHECK: dcps1 // encoding: [0x01,0x00,0xa0,0xd4] +// CHECK: dcps2 // encoding: [0x02,0x00,0xa0,0xd4] +// CHECK: dcps3 // encoding: 
[0x03,0x00,0xa0,0xd4] + +//------------------------------------------------------------------------------ +// Extract (immediate) +//------------------------------------------------------------------------------ + + extr w3, w5, w7, #0 + extr w11, w13, w17, #31 +// CHECK: extr w3, w5, w7, #0 // encoding: [0xa3,0x00,0x87,0x13] +// CHECK: extr w11, w13, w17, #31 // encoding: [0xab,0x7d,0x91,0x13] + + extr x3, x5, x7, #15 + extr x11, x13, x17, #63 +// CHECK: extr x3, x5, x7, #15 // encoding: [0xa3,0x3c,0xc7,0x93] +// CHECK: extr x11, x13, x17, #63 // encoding: [0xab,0xfd,0xd1,0x93] + + ror x19, x23, #24 + ror x29, xzr, #63 +// CHECK: extr x19, x23, x23, #24 // encoding: [0xf3,0x62,0xd7,0x93] +// CHECK: extr x29, xzr, xzr, #63 // encoding: [0xfd,0xff,0xdf,0x93] + + ror w9, w13, #31 +// CHECK: extr w9, w13, w13, #31 // encoding: [0xa9,0x7d,0x8d,0x13] + +//------------------------------------------------------------------------------ +// Floating-point compare +//------------------------------------------------------------------------------ + + fcmp s3, s5 + fcmp s31, #0.0 +// CHECK: fcmp s3, s5 // encoding: [0x60,0x20,0x25,0x1e] +// CHECK: fcmp s31, #0.0 // encoding: [0xe8,0x23,0x20,0x1e] + + fcmpe s29, s30 + fcmpe s15, #0.0 +// CHECK: fcmpe s29, s30 // encoding: [0xb0,0x23,0x3e,0x1e] +// CHECK: fcmpe s15, #0.0 // encoding: [0xf8,0x21,0x20,0x1e] + + fcmp d4, d12 + fcmp d23, #0.0 +// CHECK: fcmp d4, d12 // encoding: [0x80,0x20,0x6c,0x1e] +// CHECK: fcmp d23, #0.0 // encoding: [0xe8,0x22,0x60,0x1e] + + fcmpe d26, d22 + fcmpe d29, #0.0 +// CHECK: fcmpe d26, d22 // encoding: [0x50,0x23,0x76,0x1e] +// CHECK: fcmpe d29, #0.0 // encoding: [0xb8,0x23,0x60,0x1e] + +//------------------------------------------------------------------------------ +// Floating-point conditional compare +//------------------------------------------------------------------------------ + + fccmp s1, s31, #0, eq + fccmp s3, s0, #15, hs + fccmp s31, s15, #13, cs +// CHECK: fccmp s1, s31, #0, eq // encoding: [0x20,0x04,0x3f,0x1e] +// CHECK: fccmp s3, s0, #15, hs // encoding: [0x6f,0x24,0x20,0x1e] +// CHECK: fccmp s31, s15, #13, hs // encoding: [0xed,0x27,0x2f,0x1e] + + fccmp d9, d31, #0, le + fccmp d3, d0, #15, gt + fccmp d31, d5, #7, ne +// CHECK: fccmp d9, d31, #0, le // encoding: [0x20,0xd5,0x7f,0x1e] +// CHECK: fccmp d3, d0, #15, gt // encoding: [0x6f,0xc4,0x60,0x1e] +// CHECK: fccmp d31, d5, #7, ne // encoding: [0xe7,0x17,0x65,0x1e] + + fccmpe s1, s31, #0, eq + fccmpe s3, s0, #15, hs + fccmpe s31, s15, #13, cs +// CHECK: fccmpe s1, s31, #0, eq // encoding: [0x30,0x04,0x3f,0x1e] +// CHECK: fccmpe s3, s0, #15, hs // encoding: [0x7f,0x24,0x20,0x1e] +// CHECK: fccmpe s31, s15, #13, hs // encoding: [0xfd,0x27,0x2f,0x1e] + + fccmpe d9, d31, #0, le + fccmpe d3, d0, #15, gt + fccmpe d31, d5, #7, ne +// CHECK: fccmpe d9, d31, #0, le // encoding: [0x30,0xd5,0x7f,0x1e] +// CHECK: fccmpe d3, d0, #15, gt // encoding: [0x7f,0xc4,0x60,0x1e] +// CHECK: fccmpe d31, d5, #7, ne // encoding: [0xf7,0x17,0x65,0x1e] + +//------------------------------------------------------------------------------ +// Floating-point conditional select +//------------------------------------------------------------------------------ + + fcsel s3, s20, s9, pl + fcsel d9, d10, d11, mi +// CHECK: fcsel s3, s20, s9, pl // encoding: [0x83,0x5e,0x29,0x1e] +// CHECK: fcsel d9, d10, d11, mi // encoding: [0x49,0x4d,0x6b,0x1e] + +//------------------------------------------------------------------------------ +// Floating-point data-processing (1 source)
+//------------------------------------------------------------------------------ + + fmov s0, s1 + fabs s2, s3 + fneg s4, s5 + fsqrt s6, s7 + fcvt d8, s9 + fcvt h10, s11 + frintn s12, s13 + frintp s14, s15 + frintm s16, s17 + frintz s18, s19 + frinta s20, s21 + frintx s22, s23 + frinti s24, s25 +// CHECK: fmov s0, s1 // encoding: [0x20,0x40,0x20,0x1e] +// CHECK: fabs s2, s3 // encoding: [0x62,0xc0,0x20,0x1e] +// CHECK: fneg s4, s5 // encoding: [0xa4,0x40,0x21,0x1e] +// CHECK: fsqrt s6, s7 // encoding: [0xe6,0xc0,0x21,0x1e] +// CHECK: fcvt d8, s9 // encoding: [0x28,0xc1,0x22,0x1e] +// CHECK: fcvt h10, s11 // encoding: [0x6a,0xc1,0x23,0x1e] +// CHECK: frintn s12, s13 // encoding: [0xac,0x41,0x24,0x1e] +// CHECK: frintp s14, s15 // encoding: [0xee,0xc1,0x24,0x1e] +// CHECK: frintm s16, s17 // encoding: [0x30,0x42,0x25,0x1e] +// CHECK: frintz s18, s19 // encoding: [0x72,0xc2,0x25,0x1e] +// CHECK: frinta s20, s21 // encoding: [0xb4,0x42,0x26,0x1e] +// CHECK: frintx s22, s23 // encoding: [0xf6,0x42,0x27,0x1e] +// CHECK: frinti s24, s25 // encoding: [0x38,0xc3,0x27,0x1e] + + fmov d0, d1 + fabs d2, d3 + fneg d4, d5 + fsqrt d6, d7 + fcvt s8, d9 + fcvt h10, d11 + frintn d12, d13 + frintp d14, d15 + frintm d16, d17 + frintz d18, d19 + frinta d20, d21 + frintx d22, d23 + frinti d24, d25 +// CHECK: fmov d0, d1 // encoding: [0x20,0x40,0x60,0x1e] +// CHECK: fabs d2, d3 // encoding: [0x62,0xc0,0x60,0x1e] +// CHECK: fneg d4, d5 // encoding: [0xa4,0x40,0x61,0x1e] +// CHECK: fsqrt d6, d7 // encoding: [0xe6,0xc0,0x61,0x1e] +// CHECK: fcvt s8, d9 // encoding: [0x28,0x41,0x62,0x1e] +// CHECK: fcvt h10, d11 // encoding: [0x6a,0xc1,0x63,0x1e] +// CHECK: frintn d12, d13 // encoding: [0xac,0x41,0x64,0x1e] +// CHECK: frintp d14, d15 // encoding: [0xee,0xc1,0x64,0x1e] +// CHECK: frintm d16, d17 // encoding: [0x30,0x42,0x65,0x1e] +// CHECK: frintz d18, d19 // encoding: [0x72,0xc2,0x65,0x1e] +// CHECK: frinta d20, d21 // encoding: [0xb4,0x42,0x66,0x1e] +// CHECK: frintx d22, d23 // encoding: [0xf6,0x42,0x67,0x1e] +// CHECK: frinti d24, d25 // encoding: [0x38,0xc3,0x67,0x1e] + + fcvt s26, h27 + fcvt d28, h29 +// CHECK: fcvt s26, h27 // encoding: [0x7a,0x43,0xe2,0x1e] +// CHECK: fcvt d28, h29 // encoding: [0xbc,0xc3,0xe2,0x1e] + +//------------------------------------------------------------------------------ +// Floating-point data-processing (2 sources) +//------------------------------------------------------------------------------ + + fmul s20, s19, s17 + fdiv s1, s2, s3 + fadd s4, s5, s6 + fsub s7, s8, s9 + fmax s10, s11, s12 + fmin s13, s14, s15 + fmaxnm s16, s17, s18 + fminnm s19, s20, s21 + fnmul s22, s23, s24 +// CHECK: fmul s20, s19, s17 // encoding: [0x74,0x0a,0x31,0x1e] +// CHECK: fdiv s1, s2, s3 // encoding: [0x41,0x18,0x23,0x1e] +// CHECK: fadd s4, s5, s6 // encoding: [0xa4,0x28,0x26,0x1e] +// CHECK: fsub s7, s8, s9 // encoding: [0x07,0x39,0x29,0x1e] +// CHECK: fmax s10, s11, s12 // encoding: [0x6a,0x49,0x2c,0x1e] +// CHECK: fmin s13, s14, s15 // encoding: [0xcd,0x59,0x2f,0x1e] +// CHECK: fmaxnm s16, s17, s18 // encoding: [0x30,0x6a,0x32,0x1e] +// CHECK: fminnm s19, s20, s21 // encoding: [0x93,0x7a,0x35,0x1e] +// CHECK: fnmul s22, s23, s24 // encoding: [0xf6,0x8a,0x38,0x1e] + + fmul d20, d19, d17 + fdiv d1, d2, d3 + fadd d4, d5, d6 + fsub d7, d8, d9 + fmax d10, d11, d12 + fmin d13, d14, d15 + fmaxnm d16, d17, d18 + fminnm d19, d20, d21 + fnmul d22, d23, d24 +// CHECK: fmul d20, d19, d17 // encoding: [0x74,0x0a,0x71,0x1e] +// CHECK: fdiv d1, d2, d3 // encoding: [0x41,0x18,0x63,0x1e] +// CHECK: fadd d4, d5, 
d6 // encoding: [0xa4,0x28,0x66,0x1e] +// CHECK: fsub d7, d8, d9 // encoding: [0x07,0x39,0x69,0x1e] +// CHECK: fmax d10, d11, d12 // encoding: [0x6a,0x49,0x6c,0x1e] +// CHECK: fmin d13, d14, d15 // encoding: [0xcd,0x59,0x6f,0x1e] +// CHECK: fmaxnm d16, d17, d18 // encoding: [0x30,0x6a,0x72,0x1e] +// CHECK: fminnm d19, d20, d21 // encoding: [0x93,0x7a,0x75,0x1e] +// CHECK: fnmul d22, d23, d24 // encoding: [0xf6,0x8a,0x78,0x1e] + +//------------------------------------------------------------------------------ +// Floating-point data-processing (3 sources) +//------------------------------------------------------------------------------ + + fmadd s3, s5, s6, s31 + fmadd d3, d13, d0, d23 + fmsub s3, s5, s6, s31 + fmsub d3, d13, d0, d23 + fnmadd s3, s5, s6, s31 + fnmadd d3, d13, d0, d23 + fnmsub s3, s5, s6, s31 + fnmsub d3, d13, d0, d23 +// CHECK: fmadd s3, s5, s6, s31 // encoding: [0xa3,0x7c,0x06,0x1f] +// CHECK: fmadd d3, d13, d0, d23 // encoding: [0xa3,0x5d,0x40,0x1f] +// CHECK: fmsub s3, s5, s6, s31 // encoding: [0xa3,0xfc,0x06,0x1f] +// CHECK: fmsub d3, d13, d0, d23 // encoding: [0xa3,0xdd,0x40,0x1f] +// CHECK: fnmadd s3, s5, s6, s31 // encoding: [0xa3,0x7c,0x26,0x1f] +// CHECK: fnmadd d3, d13, d0, d23 // encoding: [0xa3,0x5d,0x60,0x1f] +// CHECK: fnmsub s3, s5, s6, s31 // encoding: [0xa3,0xfc,0x26,0x1f] +// CHECK: fnmsub d3, d13, d0, d23 // encoding: [0xa3,0xdd,0x60,0x1f] + +//------------------------------------------------------------------------------ +// Floating-point <-> fixed-point conversion +//------------------------------------------------------------------------------ + + fcvtzs w3, s5, #1 + fcvtzs wzr, s20, #13 + fcvtzs w19, s0, #32 +// CHECK: fcvtzs w3, s5, #1 // encoding: [0xa3,0xfc,0x18,0x1e] +// CHECK: fcvtzs wzr, s20, #13 // encoding: [0x9f,0xce,0x18,0x1e] +// CHECK: fcvtzs w19, s0, #32 // encoding: [0x13,0x80,0x18,0x1e] + + fcvtzs x3, s5, #1 + fcvtzs x12, s30, #45 + fcvtzs x19, s0, #64 +// CHECK: fcvtzs x3, s5, #1 // encoding: [0xa3,0xfc,0x18,0x9e] +// CHECK: fcvtzs x12, s30, #45 // encoding: [0xcc,0x4f,0x18,0x9e] +// CHECK: fcvtzs x19, s0, #64 // encoding: [0x13,0x00,0x18,0x9e] + + fcvtzs w3, d5, #1 + fcvtzs wzr, d20, #13 + fcvtzs w19, d0, #32 +// CHECK: fcvtzs w3, d5, #1 // encoding: [0xa3,0xfc,0x58,0x1e] +// CHECK: fcvtzs wzr, d20, #13 // encoding: [0x9f,0xce,0x58,0x1e] +// CHECK: fcvtzs w19, d0, #32 // encoding: [0x13,0x80,0x58,0x1e] + + fcvtzs x3, d5, #1 + fcvtzs x12, d30, #45 + fcvtzs x19, d0, #64 +// CHECK: fcvtzs x3, d5, #1 // encoding: [0xa3,0xfc,0x58,0x9e] +// CHECK: fcvtzs x12, d30, #45 // encoding: [0xcc,0x4f,0x58,0x9e] +// CHECK: fcvtzs x19, d0, #64 // encoding: [0x13,0x00,0x58,0x9e] + + fcvtzu w3, s5, #1 + fcvtzu wzr, s20, #13 + fcvtzu w19, s0, #32 +// CHECK: fcvtzu w3, s5, #1 // encoding: [0xa3,0xfc,0x19,0x1e] +// CHECK: fcvtzu wzr, s20, #13 // encoding: [0x9f,0xce,0x19,0x1e] +// CHECK: fcvtzu w19, s0, #32 // encoding: [0x13,0x80,0x19,0x1e] + + fcvtzu x3, s5, #1 + fcvtzu x12, s30, #45 + fcvtzu x19, s0, #64 +// CHECK: fcvtzu x3, s5, #1 // encoding: [0xa3,0xfc,0x19,0x9e] +// CHECK: fcvtzu x12, s30, #45 // encoding: [0xcc,0x4f,0x19,0x9e] +// CHECK: fcvtzu x19, s0, #64 // encoding: [0x13,0x00,0x19,0x9e] + + fcvtzu w3, d5, #1 + fcvtzu wzr, d20, #13 + fcvtzu w19, d0, #32 +// CHECK: fcvtzu w3, d5, #1 // encoding: [0xa3,0xfc,0x59,0x1e] +// CHECK: fcvtzu wzr, d20, #13 // encoding: [0x9f,0xce,0x59,0x1e] +// CHECK: fcvtzu w19, d0, #32 // encoding: [0x13,0x80,0x59,0x1e] + + fcvtzu x3, d5, #1 + fcvtzu x12, d30, #45 + fcvtzu x19, d0, #64 +// CHECK: fcvtzu x3, d5, #1 
// encoding: [0xa3,0xfc,0x59,0x9e] +// CHECK: fcvtzu x12, d30, #45 // encoding: [0xcc,0x4f,0x59,0x9e] +// CHECK: fcvtzu x19, d0, #64 // encoding: [0x13,0x00,0x59,0x9e] + + scvtf s23, w19, #1 + scvtf s31, wzr, #20 + scvtf s14, w0, #32 +// CHECK: scvtf s23, w19, #1 // encoding: [0x77,0xfe,0x02,0x1e] +// CHECK: scvtf s31, wzr, #20 // encoding: [0xff,0xb3,0x02,0x1e] +// CHECK: scvtf s14, w0, #32 // encoding: [0x0e,0x80,0x02,0x1e] + + scvtf s23, x19, #1 + scvtf s31, xzr, #20 + scvtf s14, x0, #64 +// CHECK: scvtf s23, x19, #1 // encoding: [0x77,0xfe,0x02,0x9e] +// CHECK: scvtf s31, xzr, #20 // encoding: [0xff,0xb3,0x02,0x9e] +// CHECK: scvtf s14, x0, #64 // encoding: [0x0e,0x00,0x02,0x9e] + + scvtf d23, w19, #1 + scvtf d31, wzr, #20 + scvtf d14, w0, #32 +// CHECK: scvtf d23, w19, #1 // encoding: [0x77,0xfe,0x42,0x1e] +// CHECK: scvtf d31, wzr, #20 // encoding: [0xff,0xb3,0x42,0x1e] +// CHECK: scvtf d14, w0, #32 // encoding: [0x0e,0x80,0x42,0x1e] + + scvtf d23, x19, #1 + scvtf d31, xzr, #20 + scvtf d14, x0, #64 +// CHECK: scvtf d23, x19, #1 // encoding: [0x77,0xfe,0x42,0x9e] +// CHECK: scvtf d31, xzr, #20 // encoding: [0xff,0xb3,0x42,0x9e] +// CHECK: scvtf d14, x0, #64 // encoding: [0x0e,0x00,0x42,0x9e] + + ucvtf s23, w19, #1 + ucvtf s31, wzr, #20 + ucvtf s14, w0, #32 +// CHECK: ucvtf s23, w19, #1 // encoding: [0x77,0xfe,0x03,0x1e] +// CHECK: ucvtf s31, wzr, #20 // encoding: [0xff,0xb3,0x03,0x1e] +// CHECK: ucvtf s14, w0, #32 // encoding: [0x0e,0x80,0x03,0x1e] + + ucvtf s23, x19, #1 + ucvtf s31, xzr, #20 + ucvtf s14, x0, #64 +// CHECK: ucvtf s23, x19, #1 // encoding: [0x77,0xfe,0x03,0x9e] +// CHECK: ucvtf s31, xzr, #20 // encoding: [0xff,0xb3,0x03,0x9e] +// CHECK: ucvtf s14, x0, #64 // encoding: [0x0e,0x00,0x03,0x9e] + + ucvtf d23, w19, #1 + ucvtf d31, wzr, #20 + ucvtf d14, w0, #32 +// CHECK: ucvtf d23, w19, #1 // encoding: [0x77,0xfe,0x43,0x1e] +// CHECK: ucvtf d31, wzr, #20 // encoding: [0xff,0xb3,0x43,0x1e] +// CHECK: ucvtf d14, w0, #32 // encoding: [0x0e,0x80,0x43,0x1e] + + ucvtf d23, x19, #1 + ucvtf d31, xzr, #20 + ucvtf d14, x0, #64 +// CHECK: ucvtf d23, x19, #1 // encoding: [0x77,0xfe,0x43,0x9e] +// CHECK: ucvtf d31, xzr, #20 // encoding: [0xff,0xb3,0x43,0x9e] +// CHECK: ucvtf d14, x0, #64 // encoding: [0x0e,0x00,0x43,0x9e] + +//------------------------------------------------------------------------------ +// Floating-point <-> integer conversion +//------------------------------------------------------------------------------ + fcvtns w3, s31 + fcvtns xzr, s12 + fcvtnu wzr, s12 + fcvtnu x0, s0 +// CHECK: fcvtns w3, s31 // encoding: [0xe3,0x03,0x20,0x1e] +// CHECK: fcvtns xzr, s12 // encoding: [0x9f,0x01,0x20,0x9e] +// CHECK: fcvtnu wzr, s12 // encoding: [0x9f,0x01,0x21,0x1e] +// CHECK: fcvtnu x0, s0 // encoding: [0x00,0x00,0x21,0x9e] + + fcvtps wzr, s9 + fcvtps x12, s20 + fcvtpu w30, s23 + fcvtpu x29, s3 +// CHECK: fcvtps wzr, s9 // encoding: [0x3f,0x01,0x28,0x1e] +// CHECK: fcvtps x12, s20 // encoding: [0x8c,0x02,0x28,0x9e] +// CHECK: fcvtpu w30, s23 // encoding: [0xfe,0x02,0x29,0x1e] +// CHECK: fcvtpu x29, s3 // encoding: [0x7d,0x00,0x29,0x9e] + + fcvtms w2, s3 + fcvtms x4, s5 + fcvtmu w6, s7 + fcvtmu x8, s9 +// CHECK: fcvtms w2, s3 // encoding: [0x62,0x00,0x30,0x1e] +// CHECK: fcvtms x4, s5 // encoding: [0xa4,0x00,0x30,0x9e] +// CHECK: fcvtmu w6, s7 // encoding: [0xe6,0x00,0x31,0x1e] +// CHECK: fcvtmu x8, s9 // encoding: [0x28,0x01,0x31,0x9e] + + fcvtzs w10, s11 + fcvtzs x12, s13 + fcvtzu w14, s15 + fcvtzu x15, s16 +// CHECK: fcvtzs w10, s11 // encoding: [0x6a,0x01,0x38,0x1e] +// 
CHECK: fcvtzs x12, s13 // encoding: [0xac,0x01,0x38,0x9e] +// CHECK: fcvtzu w14, s15 // encoding: [0xee,0x01,0x39,0x1e] +// CHECK: fcvtzu x15, s16 // encoding: [0x0f,0x02,0x39,0x9e] + + scvtf s17, w18 + scvtf s19, x20 + ucvtf s21, w22 + scvtf s23, x24 +// CHECK: scvtf s17, w18 // encoding: [0x51,0x02,0x22,0x1e] +// CHECK: scvtf s19, x20 // encoding: [0x93,0x02,0x22,0x9e] +// CHECK: ucvtf s21, w22 // encoding: [0xd5,0x02,0x23,0x1e] +// CHECK: scvtf s23, x24 // encoding: [0x17,0x03,0x22,0x9e] + + fcvtas w25, s26 + fcvtas x27, s28 + fcvtau w29, s30 + fcvtau xzr, s0 +// CHECK: fcvtas w25, s26 // encoding: [0x59,0x03,0x24,0x1e] +// CHECK: fcvtas x27, s28 // encoding: [0x9b,0x03,0x24,0x9e] +// CHECK: fcvtau w29, s30 // encoding: [0xdd,0x03,0x25,0x1e] +// CHECK: fcvtau xzr, s0 // encoding: [0x1f,0x00,0x25,0x9e] + + fcvtns w3, d31 + fcvtns xzr, d12 + fcvtnu wzr, d12 + fcvtnu x0, d0 +// CHECK: fcvtns w3, d31 // encoding: [0xe3,0x03,0x60,0x1e] +// CHECK: fcvtns xzr, d12 // encoding: [0x9f,0x01,0x60,0x9e] +// CHECK: fcvtnu wzr, d12 // encoding: [0x9f,0x01,0x61,0x1e] +// CHECK: fcvtnu x0, d0 // encoding: [0x00,0x00,0x61,0x9e] + + fcvtps wzr, d9 + fcvtps x12, d20 + fcvtpu w30, d23 + fcvtpu x29, d3 +// CHECK: fcvtps wzr, d9 // encoding: [0x3f,0x01,0x68,0x1e] +// CHECK: fcvtps x12, d20 // encoding: [0x8c,0x02,0x68,0x9e] +// CHECK: fcvtpu w30, d23 // encoding: [0xfe,0x02,0x69,0x1e] +// CHECK: fcvtpu x29, d3 // encoding: [0x7d,0x00,0x69,0x9e] + + fcvtms w2, d3 + fcvtms x4, d5 + fcvtmu w6, d7 + fcvtmu x8, d9 +// CHECK: fcvtms w2, d3 // encoding: [0x62,0x00,0x70,0x1e] +// CHECK: fcvtms x4, d5 // encoding: [0xa4,0x00,0x70,0x9e] +// CHECK: fcvtmu w6, d7 // encoding: [0xe6,0x00,0x71,0x1e] +// CHECK: fcvtmu x8, d9 // encoding: [0x28,0x01,0x71,0x9e] + + fcvtzs w10, d11 + fcvtzs x12, d13 + fcvtzu w14, d15 + fcvtzu x15, d16 +// CHECK: fcvtzs w10, d11 // encoding: [0x6a,0x01,0x78,0x1e] +// CHECK: fcvtzs x12, d13 // encoding: [0xac,0x01,0x78,0x9e] +// CHECK: fcvtzu w14, d15 // encoding: [0xee,0x01,0x79,0x1e] +// CHECK: fcvtzu x15, d16 // encoding: [0x0f,0x02,0x79,0x9e] + + scvtf d17, w18 + scvtf d19, x20 + ucvtf d21, w22 + ucvtf d23, x24 +// CHECK: scvtf d17, w18 // encoding: [0x51,0x02,0x62,0x1e] +// CHECK: scvtf d19, x20 // encoding: [0x93,0x02,0x62,0x9e] +// CHECK: ucvtf d21, w22 // encoding: [0xd5,0x02,0x63,0x1e] +// CHECK: ucvtf d23, x24 // encoding: [0x17,0x03,0x63,0x9e] + + fcvtas w25, d26 + fcvtas x27, d28 + fcvtau w29, d30 + fcvtau xzr, d0 +// CHECK: fcvtas w25, d26 // encoding: [0x59,0x03,0x64,0x1e] +// CHECK: fcvtas x27, d28 // encoding: [0x9b,0x03,0x64,0x9e] +// CHECK: fcvtau w29, d30 // encoding: [0xdd,0x03,0x65,0x1e] +// CHECK: fcvtau xzr, d0 // encoding: [0x1f,0x00,0x65,0x9e] + + fmov w3, s9 + fmov s9, w3 +// CHECK: fmov w3, s9 // encoding: [0x23,0x01,0x26,0x1e] +// CHECK: fmov s9, w3 // encoding: [0x69,0x00,0x27,0x1e] + + fmov x20, d31 + fmov d1, x15 +// CHECK: fmov x20, d31 // encoding: [0xf4,0x03,0x66,0x9e] +// CHECK: fmov d1, x15 // encoding: [0xe1,0x01,0x67,0x9e] + + fmov x3, v12.d[1] + fmov v1.d[1], x19 + fmov v3.2d[1], xzr +// CHECK: fmov x3, v12.d[1] // encoding: [0x83,0x01,0xae,0x9e] +// CHECK: fmov v1.d[1], x19 // encoding: [0x61,0x02,0xaf,0x9e] +// CHECK: fmov v3.d[1], xzr // encoding: [0xe3,0x03,0xaf,0x9e] + +//------------------------------------------------------------------------------ +// Floating-point immediate +//------------------------------------------------------------------------------ + + fmov s2, #0.125 + fmov s3, #1.0 + fmov d30, #16.0 +// CHECK: fmov s2, #0.12500000 // 
encoding: [0x02,0x10,0x28,0x1e] +// CHECK: fmov s3, #1.00000000 // encoding: [0x03,0x10,0x2e,0x1e] +// CHECK: fmov d30, #16.00000000 // encoding: [0x1e,0x10,0x66,0x1e] + + fmov s4, #1.0625 + fmov d10, #1.9375 +// CHECK: fmov s4, #1.06250000 // encoding: [0x04,0x30,0x2e,0x1e] +// CHECK: fmov d10, #1.93750000 // encoding: [0x0a,0xf0,0x6f,0x1e] + + fmov s12, #-1.0 +// CHECK: fmov s12, #-1.00000000 // encoding: [0x0c,0x10,0x3e,0x1e] + + fmov d16, #8.5 +// CHECK: fmov d16, #8.50000000 // encoding: [0x10,0x30,0x64,0x1e] + +//------------------------------------------------------------------------------ +// Load-register (literal) +//------------------------------------------------------------------------------ + ldr w3, here + ldr x29, there + ldrsw xzr, everywhere +// CHECK: ldr w3, here // encoding: [0x03'A',A,A,0x18'A'] +// CHECK: // fixup A - offset: 0, value: here, kind: fixup_a64_ld_prel +// CHECK: ldr x29, there // encoding: [0x1d'A',A,A,0x58'A'] +// CHECK: // fixup A - offset: 0, value: there, kind: fixup_a64_ld_prel +// CHECK: ldrsw xzr, everywhere // encoding: [0x1f'A',A,A,0x98'A'] +// CHECK: // fixup A - offset: 0, value: everywhere, kind: fixup_a64_ld_prel + + ldr s0, who_knows + ldr d0, i_dont + ldr q0, there_must_be_a_better_way +// CHECK: ldr s0, who_knows // encoding: [A,A,A,0x1c'A'] +// CHECK: // fixup A - offset: 0, value: who_knows, kind: fixup_a64_ld_prel +// CHECK: ldr d0, i_dont // encoding: [A,A,A,0x5c'A'] +// CHECK: // fixup A - offset: 0, value: i_dont, kind: fixup_a64_ld_prel +// CHECK: ldr q0, there_must_be_a_better_way // encoding: [A,A,A,0x9c'A'] +// CHECK: // fixup A - offset: 0, value: there_must_be_a_better_way, kind: fixup_a64_ld_prel + + ldr w0, #1048572 + ldr x10, #-1048576 +// CHECK: ldr w0, #1048572 // encoding: [0xe0,0xff,0x7f,0x18] +// CHECK: ldr x10, #-1048576 // encoding: [0x0a,0x00,0x80,0x58] + + prfm pldl1strm, nowhere + prfm #22, somewhere +// CHECK: prfm pldl1strm, nowhere // encoding: [0x01'A',A,A,0xd8'A'] +// CHECK: // fixup A - offset: 0, value: nowhere, kind: fixup_a64_ld_prel +// CHECK: prfm #22, somewhere // encoding: [0x16'A',A,A,0xd8'A'] +// CHECK: // fixup A - offset: 0, value: somewhere, kind: fixup_a64_ld_prel + +//------------------------------------------------------------------------------ +// Load/store exclusive +//------------------------------------------------------------------------------ + + stxrb w1, w2, [x3, #0] + stxrh w2, w3, [x4] + stxr wzr, w4, [sp] + stxr w5, x6, [x7] +// CHECK: stxrb w1, w2, [x3] // encoding: [0x62,0x7c,0x01,0x08] +// CHECK: stxrh w2, w3, [x4] // encoding: [0x83,0x7c,0x02,0x48] +// CHECK: stxr wzr, w4, [sp] // encoding: [0xe4,0x7f,0x1f,0x88] +// CHECK: stxr w5, x6, [x7] // encoding: [0xe6,0x7c,0x05,0xc8] +
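+// Note: in the store-exclusive forms above and below, the first W operand is the +// status register, not stored data; it reads back 0 if the exclusive store succeeded and 1 if it failed.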
+ ldxrb w7, [x9] + ldxrh wzr, [x10] + ldxr w9, [sp] + ldxr x10, [x11] +// CHECK: ldxrb w7, [x9] // encoding: [0x27,0x7d,0x5f,0x08] +// CHECK: ldxrh wzr, [x10] // encoding: [0x5f,0x7d,0x5f,0x48] +// CHECK: ldxr w9, [sp] // encoding: [0xe9,0x7f,0x5f,0x88] +// CHECK: ldxr x10, [x11] // encoding: [0x6a,0x7d,0x5f,0xc8] + + stxp w11, w12, w13, [x14] + stxp wzr, x23, x14, [x15] +// CHECK: stxp w11, w12, w13, [x14] // encoding: [0xcc,0x35,0x2b,0x88] +// CHECK: stxp wzr, x23, x14, [x15] // encoding: [0xf7,0x39,0x3f,0xc8] + + ldxp w12, wzr, [sp] + ldxp x13, x14, [x15] +// CHECK: ldxp w12, wzr, [sp] // encoding: [0xec,0x7f,0x7f,0x88] +// CHECK: ldxp x13, x14, [x15] // encoding: [0xed,0x39,0x7f,0xc8] + + stlxrb w14, w15, [x16] + stlxrh w15, w16, [x17,#0] + stlxr wzr, w17, [sp] + stlxr w18, x19, [x20] +// CHECK: stlxrb w14, w15, [x16] // encoding: [0x0f,0xfe,0x0e,0x08] +// CHECK: stlxrh w15, w16, [x17] // encoding: [0x30,0xfe,0x0f,0x48] +// CHECK: stlxr wzr, w17, [sp] // encoding: [0xf1,0xff,0x1f,0x88] +// CHECK: stlxr w18, x19, [x20] // encoding: [0x93,0xfe,0x12,0xc8] + + ldaxrb w19, [x21] + ldaxrh w20, [sp] + ldaxr wzr, [x22] + ldaxr x21, [x23] +// CHECK: ldaxrb w19, [x21] // encoding: [0xb3,0xfe,0x5f,0x08] +// CHECK: ldaxrh w20, [sp] // encoding: [0xf4,0xff,0x5f,0x48] +// CHECK: ldaxr wzr, [x22] // encoding: [0xdf,0xfe,0x5f,0x88] +// CHECK: ldaxr x21, [x23] // encoding: [0xf5,0xfe,0x5f,0xc8] + + stlxp wzr, w22, w23, [x24] + stlxp w25, x26, x27, [sp] +// CHECK: stlxp wzr, w22, w23, [x24] // encoding: [0x16,0xdf,0x3f,0x88] +// CHECK: stlxp w25, x26, x27, [sp] // encoding: [0xfa,0xef,0x39,0xc8] + + ldaxp w26, wzr, [sp] + ldaxp x27, x28, [x30] +// CHECK: ldaxp w26, wzr, [sp] // encoding: [0xfa,0xff,0x7f,0x88] +// CHECK: ldaxp x27, x28, [x30] // encoding: [0xdb,0xf3,0x7f,0xc8] + + stlrb w27, [sp] + stlrh w28, [x0] + stlr wzr, [x1] + stlr x30, [x2] +// CHECK: stlrb w27, [sp] // encoding: [0xfb,0xff,0x9f,0x08] +// CHECK: stlrh w28, [x0] // encoding: [0x1c,0xfc,0x9f,0x48] +// CHECK: stlr wzr, [x1] // encoding: [0x3f,0xfc,0x9f,0x88] +// CHECK: stlr x30, [x2] // encoding: [0x5e,0xfc,0x9f,0xc8] + + ldarb w29, [sp] + ldarh w30, [x0] + ldar wzr, [x1] + ldar x1, [x2] +// CHECK: ldarb w29, [sp] // encoding: [0xfd,0xff,0xdf,0x08] +// CHECK: ldarh w30, [x0] // encoding: [0x1e,0xfc,0xdf,0x48] +// CHECK: ldar wzr, [x1] // encoding: [0x3f,0xfc,0xdf,0x88] +// CHECK: ldar x1, [x2] // encoding: [0x41,0xfc,0xdf,0xc8] + + stlxp wzr, w22, w23, [x24,#0] +// CHECK: stlxp wzr, w22, w23, [x24] // encoding: [0x16,0xdf,0x3f,0x88] + +//------------------------------------------------------------------------------ +// Load/store (unscaled immediate) +//------------------------------------------------------------------------------ + + sturb w9, [sp, #0] + sturh wzr, [x12, #255] + stur w16, [x0, #-256] + stur x28, [x14, #1] +// CHECK: sturb w9, [sp] // encoding: [0xe9,0x03,0x00,0x38] +// CHECK: sturh wzr, [x12, #255] // encoding: [0x9f,0xf1,0x0f,0x78] +// CHECK: stur w16, [x0, #-256] // encoding: [0x10,0x00,0x10,0xb8] +// CHECK: stur x28, [x14, #1] // encoding: [0xdc,0x11,0x00,0xf8] + + ldurb w1, [x20, #255] + ldurh w20, [x1, #255] + ldur w12, [sp, #255] + ldur xzr, [x12, #255] +// CHECK: ldurb w1, [x20, #255] // encoding: [0x81,0xf2,0x4f,0x38] +// CHECK: ldurh w20, [x1, #255] // encoding: [0x34,0xf0,0x4f,0x78] +// CHECK: ldur w12, [sp, #255] // encoding: [0xec,0xf3,0x4f,0xb8] +// CHECK: ldur xzr, [x12, #255] // encoding: [0x9f,0xf1,0x4f,0xf8] + + ldursb x9, [x7, #-256] + ldursh x17, [x19, #-256] + ldursw x20, [x15, #-256] + ldursw x13, [x2] +
prfum pldl2keep, [sp, #-256] + ldursb w19, [x1, #-256] + ldursh w15, [x21, #-256] +// CHECK: ldursb x9, [x7, #-256] // encoding: [0xe9,0x00,0x90,0x38] +// CHECK: ldursh x17, [x19, #-256] // encoding: [0x71,0x02,0x90,0x78] +// CHECK: ldursw x20, [x15, #-256] // encoding: [0xf4,0x01,0x90,0xb8] +// CHECK: ldursw x13, [x2] // encoding: [0x4d,0x00,0x80,0xb8] +// CHECK: prfum pldl2keep, [sp, #-256] // encoding: [0xe2,0x03,0x90,0xf8] +// CHECK: ldursb w19, [x1, #-256] // encoding: [0x33,0x00,0xd0,0x38] +// CHECK: ldursh w15, [x21, #-256] // encoding: [0xaf,0x02,0xd0,0x78] + + stur b0, [sp, #1] + stur h12, [x12, #-1] + stur s15, [x0, #255] + stur d31, [x5, #25] + stur q9, [x5] +// CHECK: stur b0, [sp, #1] // encoding: [0xe0,0x13,0x00,0x3c] +// CHECK: stur h12, [x12, #-1] // encoding: [0x8c,0xf1,0x1f,0x7c] +// CHECK: stur s15, [x0, #255] // encoding: [0x0f,0xf0,0x0f,0xbc] +// CHECK: stur d31, [x5, #25] // encoding: [0xbf,0x90,0x01,0xfc] +// CHECK: stur q9, [x5] // encoding: [0xa9,0x00,0x80,0x3c] + + ldur b3, [sp] + ldur h5, [x4, #-256] + ldur s7, [x12, #-1] + ldur d11, [x19, #4] + ldur q13, [x1, #2] +// CHECK: ldur b3, [sp] // encoding: [0xe3,0x03,0x40,0x3c] +// CHECK: ldur h5, [x4, #-256] // encoding: [0x85,0x00,0x50,0x7c] +// CHECK: ldur s7, [x12, #-1] // encoding: [0x87,0xf1,0x5f,0xbc] +// CHECK: ldur d11, [x19, #4] // encoding: [0x6b,0x42,0x40,0xfc] +// CHECK: ldur q13, [x1, #2] // encoding: [0x2d,0x20,0xc0,0x3c] + +//------------------------------------------------------------------------------ +// Load/store (unsigned immediate) +//------------------------------------------------------------------------------ + +//// Basic addressing mode limits: 8 byte access + ldr x0, [x0] + ldr x4, [x29, #0] + ldr x30, [x12, #32760] + ldr x20, [sp, #8] +// CHECK: ldr x0, [x0] // encoding: [0x00,0x00,0x40,0xf9] +// CHECK: ldr x4, [x29] // encoding: [0xa4,0x03,0x40,0xf9] +// CHECK: ldr x30, [x12, #32760] // encoding: [0x9e,0xfd,0x7f,0xf9] +// CHECK: ldr x20, [sp, #8] // encoding: [0xf4,0x07,0x40,0xf9] + +//// Rt treats 31 as zero-register + ldr xzr, [sp] +// CHECK: ldr xzr, [sp] // encoding: [0xff,0x03,0x40,0xf9] + + //// 4-byte load, check still 64-bit address, limits + ldr w2, [sp] + ldr w17, [sp, #16380] + ldr w13, [x2, #4] +// CHECK: ldr w2, [sp] // encoding: [0xe2,0x03,0x40,0xb9] +// CHECK: ldr w17, [sp, #16380] // encoding: [0xf1,0xff,0x7f,0xb9] +// CHECK: ldr w13, [x2, #4] // encoding: [0x4d,0x04,0x40,0xb9] + +//// Signed 4-byte load. Limits. 
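+//// (The unsigned 12-bit offset is scaled by the access size, so a 4-byte load reaches at most #16380 = 4095 * 4.)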
+ ldrsw x2, [x5,#4] + ldrsw x23, [sp, #16380] +// CHECK: ldrsw x2, [x5, #4] // encoding: [0xa2,0x04,0x80,0xb9] +// CHECK: ldrsw x23, [sp, #16380] // encoding: [0xf7,0xff,0xbf,0xb9] + +//// 2-byte loads + ldrh w2, [x4] + ldrsh w23, [x6, #8190] + ldrsh wzr, [sp, #2] + ldrsh x29, [x2, #2] +// CHECK: ldrh w2, [x4] // encoding: [0x82,0x00,0x40,0x79] +// CHECK: ldrsh w23, [x6, #8190] // encoding: [0xd7,0xfc,0xff,0x79] +// CHECK: ldrsh wzr, [sp, #2] // encoding: [0xff,0x07,0xc0,0x79] +// CHECK: ldrsh x29, [x2, #2] // encoding: [0x5d,0x04,0x80,0x79] + +//// 1-byte loads + ldrb w26, [x3, #121] + ldrb w12, [x2, #0] + ldrsb w27, [sp, #4095] + ldrsb xzr, [x15] +// CHECK: ldrb w26, [x3, #121] // encoding: [0x7a,0xe4,0x41,0x39] +// CHECK: ldrb w12, [x2] // encoding: [0x4c,0x00,0x40,0x39] +// CHECK: ldrsb w27, [sp, #4095] // encoding: [0xfb,0xff,0xff,0x39] +// CHECK: ldrsb xzr, [x15] // encoding: [0xff,0x01,0x80,0x39] + +//// Stores + str x30, [sp] + str w20, [x4, #16380] + strh w20, [x10, #14] + strh w17, [sp, #8190] + strb w23, [x3, #4095] + strb wzr, [x2] +// CHECK: str x30, [sp] // encoding: [0xfe,0x03,0x00,0xf9] +// CHECK: str w20, [x4, #16380] // encoding: [0x94,0xfc,0x3f,0xb9] +// CHECK: strh w20, [x10, #14] // encoding: [0x54,0x1d,0x00,0x79] +// CHECK: strh w17, [sp, #8190] // encoding: [0xf1,0xff,0x3f,0x79] +// CHECK: strb w23, [x3, #4095] // encoding: [0x77,0xfc,0x3f,0x39] +// CHECK: strb wzr, [x2] // encoding: [0x5f,0x00,0x00,0x39] + +//// Relocations + str x15, [x5, #:lo12:sym] + ldrb w15, [x5, #:lo12:sym] + ldrsh x15, [x5, #:lo12:sym] + ldrsw x15, [x5, #:lo12:sym] + ldr x15, [x5, #:lo12:sym] + ldr q3, [x2, #:lo12:sym] +// CHECK: str x15, [x5, #:lo12:sym] // encoding: [0xaf'A',A,A,0xf9'A'] +// CHECK: // fixup A - offset: 0, value: :lo12:sym, kind: fixup_a64_ldst64_lo12 +// CHECK: ldrb w15, [x5, #:lo12:sym] // encoding: [0xaf'A',A,0x40'A',0x39'A'] +// CHECK: // fixup A - offset: 0, value: :lo12:sym, kind: fixup_a64_ldst8_lo12 +// CHECK: ldrsh x15, [x5, #:lo12:sym] // encoding: [0xaf'A',A,0x80'A',0x79'A'] +// CHECK: // fixup A - offset: 0, value: :lo12:sym, kind: fixup_a64_ldst16_lo12 +// CHECK: ldrsw x15, [x5, #:lo12:sym] // encoding: [0xaf'A',A,0x80'A',0xb9'A'] +// CHECK: // fixup A - offset: 0, value: :lo12:sym, kind: fixup_a64_ldst32_lo12 +// CHECK: ldr x15, [x5, #:lo12:sym] // encoding: [0xaf'A',A,0x40'A',0xf9'A'] +// CHECK: // fixup A - offset: 0, value: :lo12:sym, kind: fixup_a64_ldst64_lo12 +// CHECK: ldr q3, [x2, #:lo12:sym] // encoding: [0x43'A',A,0xc0'A',0x3d'A'] +// CHECK: // fixup A - offset: 0, value: :lo12:sym, kind: fixup_a64_ldst128_lo12 + + prfm pldl1keep, [sp, #8] + prfm pldl1strm, [x3] + prfm pldl2keep, [x5,#16] + prfm pldl2strm, [x2] + prfm pldl3keep, [x5] + prfm pldl3strm, [x6] + prfm plil1keep, [sp, #8] + prfm plil1strm, [x3] + prfm plil2keep, [x5,#16] + prfm plil2strm, [x2] + prfm plil3keep, [x5] + prfm plil3strm, [x6] + prfm pstl1keep, [sp, #8] + prfm pstl1strm, [x3] + prfm pstl2keep, [x5,#16] + prfm pstl2strm, [x2] + prfm pstl3keep, [x5] + prfm pstl3strm, [x6] + prfm #15, [sp] +// CHECK: prfm pldl1keep, [sp, #8] // encoding: [0xe0,0x07,0x80,0xf9] +// CHECK: prfm pldl1strm, [x3, #0] // encoding: [0x61,0x00,0x80,0xf9] +// CHECK: prfm pldl2keep, [x5, #16] // encoding: [0xa2,0x08,0x80,0xf9] +// CHECK: prfm pldl2strm, [x2, #0] // encoding: [0x43,0x00,0x80,0xf9] +// CHECK: prfm pldl3keep, [x5, #0] // encoding: [0xa4,0x00,0x80,0xf9] +// CHECK: prfm pldl3strm, [x6, #0] // encoding: [0xc5,0x00,0x80,0xf9] +// CHECK: prfm plil1keep, [sp, #8] // encoding: [0xe8,0x07,0x80,0xf9] +// 
CHECK: prfm plil1strm, [x3, #0] // encoding: [0x69,0x00,0x80,0xf9] +// CHECK: prfm plil2keep, [x5, #16] // encoding: [0xaa,0x08,0x80,0xf9] +// CHECK: prfm plil2strm, [x2, #0] // encoding: [0x4b,0x00,0x80,0xf9] +// CHECK: prfm plil3keep, [x5, #0] // encoding: [0xac,0x00,0x80,0xf9] +// CHECK: prfm plil3strm, [x6, #0] // encoding: [0xcd,0x00,0x80,0xf9] +// CHECK: prfm pstl1keep, [sp, #8] // encoding: [0xf0,0x07,0x80,0xf9] +// CHECK: prfm pstl1strm, [x3, #0] // encoding: [0x71,0x00,0x80,0xf9] +// CHECK: prfm pstl2keep, [x5, #16] // encoding: [0xb2,0x08,0x80,0xf9] +// CHECK: prfm pstl2strm, [x2, #0] // encoding: [0x53,0x00,0x80,0xf9] +// CHECK: prfm pstl3keep, [x5, #0] // encoding: [0xb4,0x00,0x80,0xf9] +// CHECK: prfm pstl3strm, [x6, #0] // encoding: [0xd5,0x00,0x80,0xf9] +// CHECK: prfm #15, [sp, #0] // encoding: [0xef,0x03,0x80,0xf9] + +//// Floating-point versions + + ldr b31, [sp, #4095] + ldr h20, [x2, #8190] + ldr s10, [x19, #16380] + ldr d3, [x10, #32760] + str q12, [sp, #65520] +// CHECK: ldr b31, [sp, #4095] // encoding: [0xff,0xff,0x7f,0x3d] +// CHECK: ldr h20, [x2, #8190] // encoding: [0x54,0xfc,0x7f,0x7d] +// CHECK: ldr s10, [x19, #16380] // encoding: [0x6a,0xfe,0x7f,0xbd] +// CHECK: ldr d3, [x10, #32760] // encoding: [0x43,0xfd,0x7f,0xfd] +// CHECK: str q12, [sp, #65520] // encoding: [0xec,0xff,0xbf,0x3d] + +//------------------------------------------------------------------------------ +// Load/store register (register offset) +//------------------------------------------------------------------------------ + + ldrb w3, [sp, x5] + ldrb w9, [x27, x6, lsl #0] + ldrsb w10, [x30, x7] + ldrb w11, [x29, x3, sxtx] + strb w12, [x28, xzr, sxtx #0] + ldrb w14, [x26, w6, uxtw] + ldrsb w15, [x25, w7, uxtw #0] + ldrb w17, [x23, w9, sxtw] + ldrsb x18, [x22, w10, sxtw #0] +// CHECK: ldrb w3, [sp, x5] // encoding: [0xe3,0x6b,0x65,0x38] +// CHECK: ldrb w9, [x27, x6, lsl #0] // encoding: [0x69,0x7b,0x66,0x38] +// CHECK: ldrsb w10, [x30, x7] // encoding: [0xca,0x6b,0xe7,0x38] +// CHECK: ldrb w11, [x29, x3, sxtx] // encoding: [0xab,0xeb,0x63,0x38] +// CHECK: strb w12, [x28, xzr, sxtx #0] // encoding: [0x8c,0xfb,0x3f,0x38] +// CHECK: ldrb w14, [x26, w6, uxtw] // encoding: [0x4e,0x4b,0x66,0x38] +// CHECK: ldrsb w15, [x25, w7, uxtw #0] // encoding: [0x2f,0x5b,0xe7,0x38] +// CHECK: ldrb w17, [x23, w9, sxtw] // encoding: [0xf1,0xca,0x69,0x38] +// CHECK: ldrsb x18, [x22, w10, sxtw #0] // encoding: [0xd2,0xda,0xaa,0x38] + + ldrsh w3, [sp, x5] + ldrsh w9, [x27, x6, lsl #0] + ldrh w10, [x30, x7, lsl #1] + strh w11, [x29, x3, sxtx] + ldrh w12, [x28, xzr, sxtx #0] + ldrsh x13, [x27, x5, sxtx #1] + ldrh w14, [x26, w6, uxtw] + ldrh w15, [x25, w7, uxtw #0] + ldrsh w16, [x24, w8, uxtw #1] + ldrh w17, [x23, w9, sxtw] + ldrh w18, [x22, w10, sxtw #0] + strh w19, [x21, wzr, sxtw #1] +// CHECK: ldrsh w3, [sp, x5] // encoding: [0xe3,0x6b,0xe5,0x78] +// CHECK: ldrsh w9, [x27, x6] // encoding: [0x69,0x6b,0xe6,0x78] +// CHECK: ldrh w10, [x30, x7, lsl #1] // encoding: [0xca,0x7b,0x67,0x78] +// CHECK: strh w11, [x29, x3, sxtx] // encoding: [0xab,0xeb,0x23,0x78] +// CHECK: ldrh w12, [x28, xzr, sxtx] // encoding: [0x8c,0xeb,0x7f,0x78] +// CHECK: ldrsh x13, [x27, x5, sxtx #1] // encoding: [0x6d,0xfb,0xa5,0x78] +// CHECK: ldrh w14, [x26, w6, uxtw] // encoding: [0x4e,0x4b,0x66,0x78] +// CHECK: ldrh w15, [x25, w7, uxtw] // encoding: [0x2f,0x4b,0x67,0x78] +// CHECK: ldrsh w16, [x24, w8, uxtw #1] // encoding: [0x10,0x5b,0xe8,0x78] +// CHECK: ldrh w17, [x23, w9, sxtw] // encoding: [0xf1,0xca,0x69,0x78] +// CHECK: ldrh w18, [x22, 
w10, sxtw] // encoding: [0xd2,0xca,0x6a,0x78] +// CHECK: strh w19, [x21, wzr, sxtw #1] // encoding: [0xb3,0xda,0x3f,0x78] + + ldr w3, [sp, x5] + ldr s9, [x27, x6, lsl #0] + ldr w10, [x30, x7, lsl #2] + ldr w11, [x29, x3, sxtx] + str s12, [x28, xzr, sxtx #0] + str w13, [x27, x5, sxtx #2] + str w14, [x26, w6, uxtw] + ldr w15, [x25, w7, uxtw #0] + ldr w16, [x24, w8, uxtw #2] + ldrsw x17, [x23, w9, sxtw] + ldr w18, [x22, w10, sxtw #0] + ldrsw x19, [x21, wzr, sxtw #2] +// CHECK: ldr w3, [sp, x5] // encoding: [0xe3,0x6b,0x65,0xb8] +// CHECK: ldr s9, [x27, x6] // encoding: [0x69,0x6b,0x66,0xbc] +// CHECK: ldr w10, [x30, x7, lsl #2] // encoding: [0xca,0x7b,0x67,0xb8] +// CHECK: ldr w11, [x29, x3, sxtx] // encoding: [0xab,0xeb,0x63,0xb8] +// CHECK: str s12, [x28, xzr, sxtx] // encoding: [0x8c,0xeb,0x3f,0xbc] +// CHECK: str w13, [x27, x5, sxtx #2] // encoding: [0x6d,0xfb,0x25,0xb8] +// CHECK: str w14, [x26, w6, uxtw] // encoding: [0x4e,0x4b,0x26,0xb8] +// CHECK: ldr w15, [x25, w7, uxtw] // encoding: [0x2f,0x4b,0x67,0xb8] +// CHECK: ldr w16, [x24, w8, uxtw #2] // encoding: [0x10,0x5b,0x68,0xb8] +// CHECK: ldrsw x17, [x23, w9, sxtw] // encoding: [0xf1,0xca,0xa9,0xb8] +// CHECK: ldr w18, [x22, w10, sxtw] // encoding: [0xd2,0xca,0x6a,0xb8] +// CHECK: ldrsw x19, [x21, wzr, sxtw #2] // encoding: [0xb3,0xda,0xbf,0xb8] + + ldr x3, [sp, x5] + str x9, [x27, x6, lsl #0] + ldr d10, [x30, x7, lsl #3] + str x11, [x29, x3, sxtx] + ldr x12, [x28, xzr, sxtx #0] + ldr x13, [x27, x5, sxtx #3] + prfm pldl1keep, [x26, w6, uxtw] + ldr x15, [x25, w7, uxtw #0] + ldr x16, [x24, w8, uxtw #3] + ldr x17, [x23, w9, sxtw] + ldr x18, [x22, w10, sxtw #0] + str d19, [x21, wzr, sxtw #3] + prfm #6, [x0, x5] +// CHECK: ldr x3, [sp, x5] // encoding: [0xe3,0x6b,0x65,0xf8] +// CHECK: str x9, [x27, x6] // encoding: [0x69,0x6b,0x26,0xf8] +// CHECK: ldr d10, [x30, x7, lsl #3] // encoding: [0xca,0x7b,0x67,0xfc] +// CHECK: str x11, [x29, x3, sxtx] // encoding: [0xab,0xeb,0x23,0xf8] +// CHECK: ldr x12, [x28, xzr, sxtx] // encoding: [0x8c,0xeb,0x7f,0xf8] +// CHECK: ldr x13, [x27, x5, sxtx #3] // encoding: [0x6d,0xfb,0x65,0xf8] +// CHECK: prfm pldl1keep, [x26, w6, uxtw] // encoding: [0x40,0x4b,0xa6,0xf8] +// CHECK: ldr x15, [x25, w7, uxtw] // encoding: [0x2f,0x4b,0x67,0xf8] +// CHECK: ldr x16, [x24, w8, uxtw #3] // encoding: [0x10,0x5b,0x68,0xf8] +// CHECK: ldr x17, [x23, w9, sxtw] // encoding: [0xf1,0xca,0x69,0xf8] +// CHECK: ldr x18, [x22, w10, sxtw] // encoding: [0xd2,0xca,0x6a,0xf8] +// CHECK: str d19, [x21, wzr, sxtw #3] // encoding: [0xb3,0xda,0x3f,0xfc] +// CHECK: prfm #6, [x0, x5, lsl #0] // encoding: [0x06,0x68,0xa5,0xf8] + + ldr q3, [sp, x5] + ldr q9, [x27, x6, lsl #0] + ldr q10, [x30, x7, lsl #4] + str q11, [x29, x3, sxtx] + str q12, [x28, xzr, sxtx #0] + str q13, [x27, x5, sxtx #4] + ldr q14, [x26, w6, uxtw] + ldr q15, [x25, w7, uxtw #0] + ldr q16, [x24, w8, uxtw #4] + ldr q17, [x23, w9, sxtw] + str q18, [x22, w10, sxtw #0] + ldr q19, [x21, wzr, sxtw #4] +// CHECK: ldr q3, [sp, x5] // encoding: [0xe3,0x6b,0xe5,0x3c] +// CHECK: ldr q9, [x27, x6] // encoding: [0x69,0x6b,0xe6,0x3c] +// CHECK: ldr q10, [x30, x7, lsl #4] // encoding: [0xca,0x7b,0xe7,0x3c] +// CHECK: str q11, [x29, x3, sxtx] // encoding: [0xab,0xeb,0xa3,0x3c] +// CHECK: str q12, [x28, xzr, sxtx] // encoding: [0x8c,0xeb,0xbf,0x3c] +// CHECK: str q13, [x27, x5, sxtx #4] // encoding: [0x6d,0xfb,0xa5,0x3c] +// CHECK: ldr q14, [x26, w6, uxtw] // encoding: [0x4e,0x4b,0xe6,0x3c] +// CHECK: ldr q15, [x25, w7, uxtw] // encoding: [0x2f,0x4b,0xe7,0x3c] +// CHECK: ldr q16, [x24, w8, 
uxtw #4] // encoding: [0x10,0x5b,0xe8,0x3c] +// CHECK: ldr q17, [x23, w9, sxtw] // encoding: [0xf1,0xca,0xe9,0x3c] +// CHECK: str q18, [x22, w10, sxtw] // encoding: [0xd2,0xca,0xaa,0x3c] +// CHECK: ldr q19, [x21, wzr, sxtw #4] // encoding: [0xb3,0xda,0xff,0x3c] + +//------------------------------------------------------------------------------ +// Load/store register (immediate post-indexed) +//------------------------------------------------------------------------------ + + strb w9, [x2], #255 + strb w10, [x3], #1 + strb w10, [x3], #-256 + strh w9, [x2], #255 + strh w9, [x2], #1 + strh w10, [x3], #-256 +// CHECK: strb w9, [x2], #255 // encoding: [0x49,0xf4,0x0f,0x38] +// CHECK: strb w10, [x3], #1 // encoding: [0x6a,0x14,0x00,0x38] +// CHECK: strb w10, [x3], #-256 // encoding: [0x6a,0x04,0x10,0x38] +// CHECK: strh w9, [x2], #255 // encoding: [0x49,0xf4,0x0f,0x78] +// CHECK: strh w9, [x2], #1 // encoding: [0x49,0x14,0x00,0x78] +// CHECK: strh w10, [x3], #-256 // encoding: [0x6a,0x04,0x10,0x78] + + str w19, [sp], #255 + str w20, [x30], #1 + str w21, [x12], #-256 + str xzr, [x9], #255 + str x2, [x3], #1 + str x19, [x12], #-256 +// CHECK: str w19, [sp], #255 // encoding: [0xf3,0xf7,0x0f,0xb8] +// CHECK: str w20, [x30], #1 // encoding: [0xd4,0x17,0x00,0xb8] +// CHECK: str w21, [x12], #-256 // encoding: [0x95,0x05,0x10,0xb8] +// CHECK: str xzr, [x9], #255 // encoding: [0x3f,0xf5,0x0f,0xf8] +// CHECK: str x2, [x3], #1 // encoding: [0x62,0x14,0x00,0xf8] +// CHECK: str x19, [x12], #-256 // encoding: [0x93,0x05,0x10,0xf8] + + ldrb w9, [x2], #255 + ldrb w10, [x3], #1 + ldrb w10, [x3], #-256 + ldrh w9, [x2], #255 + ldrh w9, [x2], #1 + ldrh w10, [x3], #-256 +// CHECK: ldrb w9, [x2], #255 // encoding: [0x49,0xf4,0x4f,0x38] +// CHECK: ldrb w10, [x3], #1 // encoding: [0x6a,0x14,0x40,0x38] +// CHECK: ldrb w10, [x3], #-256 // encoding: [0x6a,0x04,0x50,0x38] +// CHECK: ldrh w9, [x2], #255 // encoding: [0x49,0xf4,0x4f,0x78] +// CHECK: ldrh w9, [x2], #1 // encoding: [0x49,0x14,0x40,0x78] +// CHECK: ldrh w10, [x3], #-256 // encoding: [0x6a,0x04,0x50,0x78] + + ldr w19, [sp], #255 + ldr w20, [x30], #1 + ldr w21, [x12], #-256 + ldr xzr, [x9], #255 + ldr x2, [x3], #1 + ldr x19, [x12], #-256 +// CHECK: ldr w19, [sp], #255 // encoding: [0xf3,0xf7,0x4f,0xb8] +// CHECK: ldr w20, [x30], #1 // encoding: [0xd4,0x17,0x40,0xb8] +// CHECK: ldr w21, [x12], #-256 // encoding: [0x95,0x05,0x50,0xb8] +// CHECK: ldr xzr, [x9], #255 // encoding: [0x3f,0xf5,0x4f,0xf8] +// CHECK: ldr x2, [x3], #1 // encoding: [0x62,0x14,0x40,0xf8] +// CHECK: ldr x19, [x12], #-256 // encoding: [0x93,0x05,0x50,0xf8] + + ldrsb xzr, [x9], #255 + ldrsb x2, [x3], #1 + ldrsb x19, [x12], #-256 + ldrsh xzr, [x9], #255 + ldrsh x2, [x3], #1 + ldrsh x19, [x12], #-256 + ldrsw xzr, [x9], #255 + ldrsw x2, [x3], #1 + ldrsw x19, [x12], #-256 +// CHECK: ldrsb xzr, [x9], #255 // encoding: [0x3f,0xf5,0x8f,0x38] +// CHECK: ldrsb x2, [x3], #1 // encoding: [0x62,0x14,0x80,0x38] +// CHECK: ldrsb x19, [x12], #-256 // encoding: [0x93,0x05,0x90,0x38] +// CHECK: ldrsh xzr, [x9], #255 // encoding: [0x3f,0xf5,0x8f,0x78] +// CHECK: ldrsh x2, [x3], #1 // encoding: [0x62,0x14,0x80,0x78] +// CHECK: ldrsh x19, [x12], #-256 // encoding: [0x93,0x05,0x90,0x78] +// CHECK: ldrsw xzr, [x9], #255 // encoding: [0x3f,0xf5,0x8f,0xb8] +// CHECK: ldrsw x2, [x3], #1 // encoding: [0x62,0x14,0x80,0xb8] +// CHECK: ldrsw x19, [x12], #-256 // encoding: [0x93,0x05,0x90,0xb8] + + ldrsb wzr, [x9], #255 + ldrsb w2, [x3], #1 + ldrsb w19, [x12], #-256 + ldrsh wzr, [x9], #255 + ldrsh w2, [x3], #1 + ldrsh 
w19, [x12], #-256 +// CHECK: ldrsb wzr, [x9], #255 // encoding: [0x3f,0xf5,0xcf,0x38] +// CHECK: ldrsb w2, [x3], #1 // encoding: [0x62,0x14,0xc0,0x38] +// CHECK: ldrsb w19, [x12], #-256 // encoding: [0x93,0x05,0xd0,0x38] +// CHECK: ldrsh wzr, [x9], #255 // encoding: [0x3f,0xf5,0xcf,0x78] +// CHECK: ldrsh w2, [x3], #1 // encoding: [0x62,0x14,0xc0,0x78] +// CHECK: ldrsh w19, [x12], #-256 // encoding: [0x93,0x05,0xd0,0x78] + + str b0, [x0], #255 + str b3, [x3], #1 + str b5, [sp], #-256 + str h10, [x10], #255 + str h13, [x23], #1 + str h15, [sp], #-256 + str s20, [x20], #255 + str s23, [x23], #1 + str s25, [x0], #-256 + str d20, [x20], #255 + str d23, [x23], #1 + str d25, [x0], #-256 +// CHECK: str b0, [x0], #255 // encoding: [0x00,0xf4,0x0f,0x3c] +// CHECK: str b3, [x3], #1 // encoding: [0x63,0x14,0x00,0x3c] +// CHECK: str b5, [sp], #-256 // encoding: [0xe5,0x07,0x10,0x3c] +// CHECK: str h10, [x10], #255 // encoding: [0x4a,0xf5,0x0f,0x7c] +// CHECK: str h13, [x23], #1 // encoding: [0xed,0x16,0x00,0x7c] +// CHECK: str h15, [sp], #-256 // encoding: [0xef,0x07,0x10,0x7c] +// CHECK: str s20, [x20], #255 // encoding: [0x94,0xf6,0x0f,0xbc] +// CHECK: str s23, [x23], #1 // encoding: [0xf7,0x16,0x00,0xbc] +// CHECK: str s25, [x0], #-256 // encoding: [0x19,0x04,0x10,0xbc] +// CHECK: str d20, [x20], #255 // encoding: [0x94,0xf6,0x0f,0xfc] +// CHECK: str d23, [x23], #1 // encoding: [0xf7,0x16,0x00,0xfc] +// CHECK: str d25, [x0], #-256 // encoding: [0x19,0x04,0x10,0xfc] + + ldr b0, [x0], #255 + ldr b3, [x3], #1 + ldr b5, [sp], #-256 + ldr h10, [x10], #255 + ldr h13, [x23], #1 + ldr h15, [sp], #-256 + ldr s20, [x20], #255 + ldr s23, [x23], #1 + ldr s25, [x0], #-256 + ldr d20, [x20], #255 + ldr d23, [x23], #1 + ldr d25, [x0], #-256 +// CHECK: ldr b0, [x0], #255 // encoding: [0x00,0xf4,0x4f,0x3c] +// CHECK: ldr b3, [x3], #1 // encoding: [0x63,0x14,0x40,0x3c] +// CHECK: ldr b5, [sp], #-256 // encoding: [0xe5,0x07,0x50,0x3c] +// CHECK: ldr h10, [x10], #255 // encoding: [0x4a,0xf5,0x4f,0x7c] +// CHECK: ldr h13, [x23], #1 // encoding: [0xed,0x16,0x40,0x7c] +// CHECK: ldr h15, [sp], #-256 // encoding: [0xef,0x07,0x50,0x7c] +// CHECK: ldr s20, [x20], #255 // encoding: [0x94,0xf6,0x4f,0xbc] +// CHECK: ldr s23, [x23], #1 // encoding: [0xf7,0x16,0x40,0xbc] +// CHECK: ldr s25, [x0], #-256 // encoding: [0x19,0x04,0x50,0xbc] +// CHECK: ldr d20, [x20], #255 // encoding: [0x94,0xf6,0x4f,0xfc] +// CHECK: ldr d23, [x23], #1 // encoding: [0xf7,0x16,0x40,0xfc] +// CHECK: ldr d25, [x0], #-256 // encoding: [0x19,0x04,0x50,0xfc] + + ldr q20, [x1], #255 + ldr q23, [x9], #1 + ldr q25, [x20], #-256 + str q10, [x1], #255 + str q22, [sp], #1 + str q21, [x20], #-256 +// CHECK: ldr q20, [x1], #255 // encoding: [0x34,0xf4,0xcf,0x3c] +// CHECK: ldr q23, [x9], #1 // encoding: [0x37,0x15,0xc0,0x3c] +// CHECK: ldr q25, [x20], #-256 // encoding: [0x99,0x06,0xd0,0x3c] +// CHECK: str q10, [x1], #255 // encoding: [0x2a,0xf4,0x8f,0x3c] +// CHECK: str q22, [sp], #1 // encoding: [0xf6,0x17,0x80,0x3c] +// CHECK: str q21, [x20], #-256 // encoding: [0x95,0x06,0x90,0x3c] + +//------------------------------------------------------------------------------ +// Load/store register (immediate pre-indexed) +//------------------------------------------------------------------------------ + + ldr x3, [x4, #0]! + ldr xzr, [sp, #0]! +// CHECK: ldr x3, [x4, #0]! // encoding: [0x83,0x0c,0x40,0xf8] +// CHECK: ldr xzr, [sp, #0]! // encoding: [0xff,0x0f,0x40,0xf8] + + strb w9, [x2, #255]! + strb w10, [x3, #1]! + strb w10, [x3, #-256]! + strh w9, [x2, #255]! 
+ strh w9, [x2, #1]!
+ strh w10, [x3, #-256]!
+// CHECK: strb w9, [x2, #255]! // encoding: [0x49,0xfc,0x0f,0x38]
+// CHECK: strb w10, [x3, #1]! // encoding: [0x6a,0x1c,0x00,0x38]
+// CHECK: strb w10, [x3, #-256]! // encoding: [0x6a,0x0c,0x10,0x38]
+// CHECK: strh w9, [x2, #255]! // encoding: [0x49,0xfc,0x0f,0x78]
+// CHECK: strh w9, [x2, #1]! // encoding: [0x49,0x1c,0x00,0x78]
+// CHECK: strh w10, [x3, #-256]! // encoding: [0x6a,0x0c,0x10,0x78]
+
+ str w19, [sp, #255]!
+ str w20, [x30, #1]!
+ str w21, [x12, #-256]!
+ str xzr, [x9, #255]!
+ str x2, [x3, #1]!
+ str x19, [x12, #-256]!
+// CHECK: str w19, [sp, #255]! // encoding: [0xf3,0xff,0x0f,0xb8]
+// CHECK: str w20, [x30, #1]! // encoding: [0xd4,0x1f,0x00,0xb8]
+// CHECK: str w21, [x12, #-256]! // encoding: [0x95,0x0d,0x10,0xb8]
+// CHECK: str xzr, [x9, #255]! // encoding: [0x3f,0xfd,0x0f,0xf8]
+// CHECK: str x2, [x3, #1]! // encoding: [0x62,0x1c,0x00,0xf8]
+// CHECK: str x19, [x12, #-256]! // encoding: [0x93,0x0d,0x10,0xf8]
+
+ ldrb w9, [x2, #255]!
+ ldrb w10, [x3, #1]!
+ ldrb w10, [x3, #-256]!
+ ldrh w9, [x2, #255]!
+ ldrh w9, [x2, #1]!
+ ldrh w10, [x3, #-256]!
+// CHECK: ldrb w9, [x2, #255]! // encoding: [0x49,0xfc,0x4f,0x38]
+// CHECK: ldrb w10, [x3, #1]! // encoding: [0x6a,0x1c,0x40,0x38]
+// CHECK: ldrb w10, [x3, #-256]! // encoding: [0x6a,0x0c,0x50,0x38]
+// CHECK: ldrh w9, [x2, #255]! // encoding: [0x49,0xfc,0x4f,0x78]
+// CHECK: ldrh w9, [x2, #1]! // encoding: [0x49,0x1c,0x40,0x78]
+// CHECK: ldrh w10, [x3, #-256]! // encoding: [0x6a,0x0c,0x50,0x78]
+
+ ldr w19, [sp, #255]!
+ ldr w20, [x30, #1]!
+ ldr w21, [x12, #-256]!
+ ldr xzr, [x9, #255]!
+ ldr x2, [x3, #1]!
+ ldr x19, [x12, #-256]!
+// CHECK: ldr w19, [sp, #255]! // encoding: [0xf3,0xff,0x4f,0xb8]
+// CHECK: ldr w20, [x30, #1]! // encoding: [0xd4,0x1f,0x40,0xb8]
+// CHECK: ldr w21, [x12, #-256]! // encoding: [0x95,0x0d,0x50,0xb8]
+// CHECK: ldr xzr, [x9, #255]! // encoding: [0x3f,0xfd,0x4f,0xf8]
+// CHECK: ldr x2, [x3, #1]! // encoding: [0x62,0x1c,0x40,0xf8]
+// CHECK: ldr x19, [x12, #-256]! // encoding: [0x93,0x0d,0x50,0xf8]
+
+ ldrsb xzr, [x9, #255]!
+ ldrsb x2, [x3, #1]!
+ ldrsb x19, [x12, #-256]!
+ ldrsh xzr, [x9, #255]!
+ ldrsh x2, [x3, #1]!
+ ldrsh x19, [x12, #-256]!
+ ldrsw xzr, [x9, #255]!
+ ldrsw x2, [x3, #1]!
+ ldrsw x19, [x12, #-256]!
+// CHECK: ldrsb xzr, [x9, #255]! // encoding: [0x3f,0xfd,0x8f,0x38]
+// CHECK: ldrsb x2, [x3, #1]! // encoding: [0x62,0x1c,0x80,0x38]
+// CHECK: ldrsb x19, [x12, #-256]! // encoding: [0x93,0x0d,0x90,0x38]
+// CHECK: ldrsh xzr, [x9, #255]! // encoding: [0x3f,0xfd,0x8f,0x78]
+// CHECK: ldrsh x2, [x3, #1]! // encoding: [0x62,0x1c,0x80,0x78]
+// CHECK: ldrsh x19, [x12, #-256]! // encoding: [0x93,0x0d,0x90,0x78]
+// CHECK: ldrsw xzr, [x9, #255]! // encoding: [0x3f,0xfd,0x8f,0xb8]
+// CHECK: ldrsw x2, [x3, #1]! // encoding: [0x62,0x1c,0x80,0xb8]
+// CHECK: ldrsw x19, [x12, #-256]! // encoding: [0x93,0x0d,0x90,0xb8]
+
+ ldrsb wzr, [x9, #255]!
+ ldrsb w2, [x3, #1]!
+ ldrsb w19, [x12, #-256]!
+ ldrsh wzr, [x9, #255]!
+ ldrsh w2, [x3, #1]!
+ ldrsh w19, [x12, #-256]!
+// CHECK: ldrsb wzr, [x9, #255]! // encoding: [0x3f,0xfd,0xcf,0x38]
+// CHECK: ldrsb w2, [x3, #1]! // encoding: [0x62,0x1c,0xc0,0x38]
+// CHECK: ldrsb w19, [x12, #-256]! // encoding: [0x93,0x0d,0xd0,0x38]
+// CHECK: ldrsh wzr, [x9, #255]! // encoding: [0x3f,0xfd,0xcf,0x78]
+// CHECK: ldrsh w2, [x3, #1]! // encoding: [0x62,0x1c,0xc0,0x78]
+// CHECK: ldrsh w19, [x12, #-256]! // encoding: [0x93,0x0d,0xd0,0x78]
+
+ str b0, [x0, #255]!
+ str b3, [x3, #1]!
+ str b5, [sp, #-256]!
+ str h10, [x10, #255]!
+ str h13, [x23, #1]!
+ str h15, [sp, #-256]!
+ str s20, [x20, #255]!
+ str s23, [x23, #1]!
+ str s25, [x0, #-256]!
+ str d20, [x20, #255]!
+ str d23, [x23, #1]!
+ str d25, [x0, #-256]!
+// CHECK: str b0, [x0, #255]! // encoding: [0x00,0xfc,0x0f,0x3c]
+// CHECK: str b3, [x3, #1]! // encoding: [0x63,0x1c,0x00,0x3c]
+// CHECK: str b5, [sp, #-256]! // encoding: [0xe5,0x0f,0x10,0x3c]
+// CHECK: str h10, [x10, #255]! // encoding: [0x4a,0xfd,0x0f,0x7c]
+// CHECK: str h13, [x23, #1]! // encoding: [0xed,0x1e,0x00,0x7c]
+// CHECK: str h15, [sp, #-256]! // encoding: [0xef,0x0f,0x10,0x7c]
+// CHECK: str s20, [x20, #255]! // encoding: [0x94,0xfe,0x0f,0xbc]
+// CHECK: str s23, [x23, #1]! // encoding: [0xf7,0x1e,0x00,0xbc]
+// CHECK: str s25, [x0, #-256]! // encoding: [0x19,0x0c,0x10,0xbc]
+// CHECK: str d20, [x20, #255]! // encoding: [0x94,0xfe,0x0f,0xfc]
+// CHECK: str d23, [x23, #1]! // encoding: [0xf7,0x1e,0x00,0xfc]
+// CHECK: str d25, [x0, #-256]! // encoding: [0x19,0x0c,0x10,0xfc]
+
+ ldr b0, [x0, #255]!
+ ldr b3, [x3, #1]!
+ ldr b5, [sp, #-256]!
+ ldr h10, [x10, #255]!
+ ldr h13, [x23, #1]!
+ ldr h15, [sp, #-256]!
+ ldr s20, [x20, #255]!
+ ldr s23, [x23, #1]!
+ ldr s25, [x0, #-256]!
+ ldr d20, [x20, #255]!
+ ldr d23, [x23, #1]!
+ ldr d25, [x0, #-256]!
+// CHECK: ldr b0, [x0, #255]! // encoding: [0x00,0xfc,0x4f,0x3c]
+// CHECK: ldr b3, [x3, #1]! // encoding: [0x63,0x1c,0x40,0x3c]
+// CHECK: ldr b5, [sp, #-256]! // encoding: [0xe5,0x0f,0x50,0x3c]
+// CHECK: ldr h10, [x10, #255]! // encoding: [0x4a,0xfd,0x4f,0x7c]
+// CHECK: ldr h13, [x23, #1]! // encoding: [0xed,0x1e,0x40,0x7c]
+// CHECK: ldr h15, [sp, #-256]! // encoding: [0xef,0x0f,0x50,0x7c]
+// CHECK: ldr s20, [x20, #255]! // encoding: [0x94,0xfe,0x4f,0xbc]
+// CHECK: ldr s23, [x23, #1]! // encoding: [0xf7,0x1e,0x40,0xbc]
+// CHECK: ldr s25, [x0, #-256]! // encoding: [0x19,0x0c,0x50,0xbc]
+// CHECK: ldr d20, [x20, #255]! // encoding: [0x94,0xfe,0x4f,0xfc]
+// CHECK: ldr d23, [x23, #1]! // encoding: [0xf7,0x1e,0x40,0xfc]
+// CHECK: ldr d25, [x0, #-256]! // encoding: [0x19,0x0c,0x50,0xfc]
+
+ ldr q20, [x1, #255]!
+ ldr q23, [x9, #1]!
+ ldr q25, [x20, #-256]!
+ str q10, [x1, #255]!
+ str q22, [sp, #1]!
+ str q21, [x20, #-256]!
+// CHECK: ldr q20, [x1, #255]! // encoding: [0x34,0xfc,0xcf,0x3c]
+// CHECK: ldr q23, [x9, #1]! // encoding: [0x37,0x1d,0xc0,0x3c]
+// CHECK: ldr q25, [x20, #-256]! // encoding: [0x99,0x0e,0xd0,0x3c]
+// CHECK: str q10, [x1, #255]! // encoding: [0x2a,0xfc,0x8f,0x3c]
+// CHECK: str q22, [sp, #1]! // encoding: [0xf6,0x1f,0x80,0x3c]
+// CHECK: str q21, [x20, #-256]! // encoding: [0x95,0x0e,0x90,0x3c]
+
+//------------------------------------------------------------------------------
+// Load/store (unprivileged)
+//------------------------------------------------------------------------------
+
+ sttrb w9, [sp, #0]
+ sttrh wzr, [x12, #255]
+ sttr w16, [x0, #-256]
+ sttr x28, [x14, #1]
+// CHECK: sttrb w9, [sp] // encoding: [0xe9,0x0b,0x00,0x38]
+// CHECK: sttrh wzr, [x12, #255] // encoding: [0x9f,0xf9,0x0f,0x78]
+// CHECK: sttr w16, [x0, #-256] // encoding: [0x10,0x08,0x10,0xb8]
+// CHECK: sttr x28, [x14, #1] // encoding: [0xdc,0x19,0x00,0xf8]
+
+ ldtrb w1, [x20, #255]
+ ldtrh w20, [x1, #255]
+ ldtr w12, [sp, #255]
+ ldtr xzr, [x12, #255]
+// CHECK: ldtrb w1, [x20, #255] // encoding: [0x81,0xfa,0x4f,0x38]
+// CHECK: ldtrh w20, [x1, #255] // encoding: [0x34,0xf8,0x4f,0x78]
+// CHECK: ldtr w12, [sp, #255] // encoding: [0xec,0xfb,0x4f,0xb8]
+// CHECK: ldtr xzr, [x12, #255] // encoding: [0x9f,0xf9,0x4f,0xf8]
+
+ ldtrsb x9, [x7, #-256]
+ ldtrsh x17, [x19, #-256]
+ ldtrsw x20, [x15, #-256]
+ ldtrsb w19, [x1, #-256]
+ ldtrsh w15, [x21, #-256]
+// CHECK: ldtrsb x9, [x7, #-256] // encoding: [0xe9,0x08,0x90,0x38]
+// CHECK: ldtrsh x17, [x19, #-256] // encoding: [0x71,0x0a,0x90,0x78]
+// CHECK: ldtrsw x20, [x15, #-256] // encoding: [0xf4,0x09,0x90,0xb8]
+// CHECK: ldtrsb w19, [x1, #-256] // encoding: [0x33,0x08,0xd0,0x38]
+// CHECK: ldtrsh w15, [x21, #-256] // encoding: [0xaf,0x0a,0xd0,0x78]
+
+//------------------------------------------------------------------------------
+// Load/store register pair (offset)
+//------------------------------------------------------------------------------
+
+ ldp w3, w5, [sp]
+ stp wzr, w9, [sp, #252]
+ ldp w2, wzr, [sp, #-256]
+ ldp w9, w10, [sp, #4]
+// CHECK: ldp w3, w5, [sp] // encoding: [0xe3,0x17,0x40,0x29]
+// CHECK: stp wzr, w9, [sp, #252] // encoding: [0xff,0xa7,0x1f,0x29]
+// CHECK: ldp w2, wzr, [sp, #-256] // encoding: [0xe2,0x7f,0x60,0x29]
+// CHECK: ldp w9, w10, [sp, #4] // encoding: [0xe9,0xab,0x40,0x29]
+
+ ldpsw x9, x10, [sp, #4]
+ ldpsw x9, x10, [x2, #-256]
+ ldpsw x20, x30, [sp, #252]
+// CHECK: ldpsw x9, x10, [sp, #4] // encoding: [0xe9,0xab,0x40,0x69]
+// CHECK: ldpsw x9, x10, [x2, #-256] // encoding: [0x49,0x28,0x60,0x69]
+// CHECK: ldpsw x20, x30, [sp, #252] // encoding: [0xf4,0xfb,0x5f,0x69]
+
+ ldp x21, x29, [x2, #504]
+ ldp x22, x23, [x3, #-512]
+ ldp x24, x25, [x4, #8]
+// CHECK: ldp x21, x29, [x2, #504] // encoding: [0x55,0xf4,0x5f,0xa9]
+// CHECK: ldp x22, x23, [x3, #-512] // encoding: [0x76,0x5c,0x60,0xa9]
+// CHECK: ldp x24, x25, [x4, #8] // encoding: [0x98,0xe4,0x40,0xa9]
+
+ ldp s29, s28, [sp, #252]
+ stp s27, s26, [sp, #-256]
+ ldp s1, s2, [x3, #44]
+// CHECK: ldp s29, s28, [sp, #252] // encoding: [0xfd,0xf3,0x5f,0x2d]
+// CHECK: stp s27, s26, [sp, #-256] // encoding: [0xfb,0x6b,0x20,0x2d]
+// CHECK: ldp s1, s2, [x3, #44] // encoding: [0x61,0x88,0x45,0x2d]
+
+ stp d3, d5, [x9, #504]
+ stp d7, d11, [x10, #-512]
+ ldp d2, d3, [x30, #-8]
+// CHECK: stp d3, d5, [x9, #504] // encoding: [0x23,0x95,0x1f,0x6d]
+// CHECK: stp d7, d11, [x10, #-512] // encoding: [0x47,0x2d,0x20,0x6d]
+// CHECK: ldp d2, d3, [x30, #-8] // encoding: [0xc2,0x8f,0x7f,0x6d]
+
+ stp q3, q5, [sp]
+ stp q17, q19, [sp, #1008]
+ ldp q23, q29, [x1, #-1024]
+// CHECK: stp q3, q5, [sp] // encoding: [0xe3,0x17,0x00,0xad]
+// CHECK: stp q17, q19, [sp, #1008] // encoding: [0xf1,0xcf,0x1f,0xad]
+// CHECK: ldp q23, q29, [x1, #-1024] // encoding: [0x37,0x74,0x60,0xad]
+
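+// The pair loads and stores above encode their offset as a signed 7-bit
+// immediate scaled by the access size, which is why the extremes exercised
+// here are -256..252 for 32-bit registers (step 4), -512..504 for 64-bit
+// (step 8) and -1024..1008 for 128-bit (step 16); the single-register
+// pre/post-indexed forms earlier instead take an unscaled signed 9-bit
+// immediate (-256..255). A sketch of the arithmetic, with field names taken
+// from the usual LDP signed-offset layout: "ldp x24, x25, [x4, #8]" has
+// imm7 = 8/8 = 1, giving opc=10 101 V=0 010 L=1 imm7=0000001 Rt2=25 Rn=4
+// Rt=24 = 0xa940e498, i.e. the bytes [0x98,0xe4,0x40,0xa9] checked above.
+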
+//------------------------------------------------------------------------------
+// Load/store register pair (post-indexed)
+//------------------------------------------------------------------------------
+
+ ldp w3, w5, [sp], #0
+ stp wzr, w9, [sp], #252
+ ldp w2, wzr, [sp], #-256
+ ldp w9, w10, [sp], #4
+// CHECK: ldp w3, w5, [sp], #0 // encoding: [0xe3,0x17,0xc0,0x28]
+// CHECK: stp wzr, w9, [sp], #252 // encoding: [0xff,0xa7,0x9f,0x28]
+// CHECK: ldp w2, wzr, [sp], #-256 // encoding: [0xe2,0x7f,0xe0,0x28]
+// CHECK: ldp w9, w10, [sp], #4 // encoding: [0xe9,0xab,0xc0,0x28]
+
+ ldpsw x9, x10, [sp], #4
+ ldpsw x9, x10, [x2], #-256
+ ldpsw x20, x30, [sp], #252
+// CHECK: ldpsw x9, x10, [sp], #4 // encoding: [0xe9,0xab,0xc0,0x68]
+// CHECK: ldpsw x9, x10, [x2], #-256 // encoding: [0x49,0x28,0xe0,0x68]
+// CHECK: ldpsw x20, x30, [sp], #252 // encoding: [0xf4,0xfb,0xdf,0x68]
+
+ ldp x21, x29, [x2], #504
+ ldp x22, x23, [x3], #-512
+ ldp x24, x25, [x4], #8
+// CHECK: ldp x21, x29, [x2], #504 // encoding: [0x55,0xf4,0xdf,0xa8]
+// CHECK: ldp x22, x23, [x3], #-512 // encoding: [0x76,0x5c,0xe0,0xa8]
+// CHECK: ldp x24, x25, [x4], #8 // encoding: [0x98,0xe4,0xc0,0xa8]
+
+ ldp s29, s28, [sp], #252
+ stp s27, s26, [sp], #-256
+ ldp s1, s2, [x3], #44
+// CHECK: ldp s29, s28, [sp], #252 // encoding: [0xfd,0xf3,0xdf,0x2c]
+// CHECK: stp s27, s26, [sp], #-256 // encoding: [0xfb,0x6b,0xa0,0x2c]
+// CHECK: ldp s1, s2, [x3], #44 // encoding: [0x61,0x88,0xc5,0x2c]
+
+ stp d3, d5, [x9], #504
+ stp d7, d11, [x10], #-512
+ ldp d2, d3, [x30], #-8
+// CHECK: stp d3, d5, [x9], #504 // encoding: [0x23,0x95,0x9f,0x6c]
+// CHECK: stp d7, d11, [x10], #-512 // encoding: [0x47,0x2d,0xa0,0x6c]
+// CHECK: ldp d2, d3, [x30], #-8 // encoding: [0xc2,0x8f,0xff,0x6c]
+
+ stp q3, q5, [sp], #0
+ stp q17, q19, [sp], #1008
+ ldp q23, q29, [x1], #-1024
+// CHECK: stp q3, q5, [sp], #0 // encoding: [0xe3,0x17,0x80,0xac]
+// CHECK: stp q17, q19, [sp], #1008 // encoding: [0xf1,0xcf,0x9f,0xac]
+// CHECK: ldp q23, q29, [x1], #-1024 // encoding: [0x37,0x74,0xe0,0xac]
+
+//------------------------------------------------------------------------------
+// Load/store register pair (pre-indexed)
+//------------------------------------------------------------------------------
+ ldp w3, w5, [sp, #0]!
+ stp wzr, w9, [sp, #252]!
+ ldp w2, wzr, [sp, #-256]!
+ ldp w9, w10, [sp, #4]!
+// CHECK: ldp w3, w5, [sp, #0]! // encoding: [0xe3,0x17,0xc0,0x29]
+// CHECK: stp wzr, w9, [sp, #252]! // encoding: [0xff,0xa7,0x9f,0x29]
+// CHECK: ldp w2, wzr, [sp, #-256]! // encoding: [0xe2,0x7f,0xe0,0x29]
+// CHECK: ldp w9, w10, [sp, #4]! // encoding: [0xe9,0xab,0xc0,0x29]
+
+ ldpsw x9, x10, [sp, #4]!
+ ldpsw x9, x10, [x2, #-256]!
+ ldpsw x20, x30, [sp, #252]!
+// CHECK: ldpsw x9, x10, [sp, #4]! // encoding: [0xe9,0xab,0xc0,0x69]
+// CHECK: ldpsw x9, x10, [x2, #-256]! // encoding: [0x49,0x28,0xe0,0x69]
+// CHECK: ldpsw x20, x30, [sp, #252]! // encoding: [0xf4,0xfb,0xdf,0x69]
+
+ ldp x21, x29, [x2, #504]!
+ ldp x22, x23, [x3, #-512]!
+ ldp x24, x25, [x4, #8]!
+// CHECK: ldp x21, x29, [x2, #504]! // encoding: [0x55,0xf4,0xdf,0xa9]
+// CHECK: ldp x22, x23, [x3, #-512]! // encoding: [0x76,0x5c,0xe0,0xa9]
+// CHECK: ldp x24, x25, [x4, #8]! // encoding: [0x98,0xe4,0xc0,0xa9]
+
+ ldp s29, s28, [sp, #252]!
+ stp s27, s26, [sp, #-256]!
+ ldp s1, s2, [x3, #44]!
+// CHECK: ldp s29, s28, [sp, #252]! // encoding: [0xfd,0xf3,0xdf,0x2d]
+// CHECK: stp s27, s26, [sp, #-256]! // encoding: [0xfb,0x6b,0xa0,0x2d]
+// CHECK: ldp s1, s2, [x3, #44]! // encoding: [0x61,0x88,0xc5,0x2d]
+
+ stp d3, d5, [x9, #504]!
+ stp d7, d11, [x10, #-512]!
+ ldp d2, d3, [x30, #-8]!
+// CHECK: stp d3, d5, [x9, #504]! // encoding: [0x23,0x95,0x9f,0x6d]
+// CHECK: stp d7, d11, [x10, #-512]! // encoding: [0x47,0x2d,0xa0,0x6d]
+// CHECK: ldp d2, d3, [x30, #-8]! // encoding: [0xc2,0x8f,0xff,0x6d]
+
+ stp q3, q5, [sp, #0]!
+ stp q17, q19, [sp, #1008]!
+ ldp q23, q29, [x1, #-1024]!
+// CHECK: stp q3, q5, [sp, #0]! // encoding: [0xe3,0x17,0x80,0xad]
+// CHECK: stp q17, q19, [sp, #1008]! // encoding: [0xf1,0xcf,0x9f,0xad]
+// CHECK: ldp q23, q29, [x1, #-1024]! // encoding: [0x37,0x74,0xe0,0xad]
+
+//------------------------------------------------------------------------------
+// Load/store non-temporal register pair (offset)
+//------------------------------------------------------------------------------
+
+ ldnp w3, w5, [sp]
+ stnp wzr, w9, [sp, #252]
+ ldnp w2, wzr, [sp, #-256]
+ ldnp w9, w10, [sp, #4]
+// CHECK: ldnp w3, w5, [sp] // encoding: [0xe3,0x17,0x40,0x28]
+// CHECK: stnp wzr, w9, [sp, #252] // encoding: [0xff,0xa7,0x1f,0x28]
+// CHECK: ldnp w2, wzr, [sp, #-256] // encoding: [0xe2,0x7f,0x60,0x28]
+// CHECK: ldnp w9, w10, [sp, #4] // encoding: [0xe9,0xab,0x40,0x28]
+
+ ldnp x21, x29, [x2, #504]
+ ldnp x22, x23, [x3, #-512]
+ ldnp x24, x25, [x4, #8]
+// CHECK: ldnp x21, x29, [x2, #504] // encoding: [0x55,0xf4,0x5f,0xa8]
+// CHECK: ldnp x22, x23, [x3, #-512] // encoding: [0x76,0x5c,0x60,0xa8]
+// CHECK: ldnp x24, x25, [x4, #8] // encoding: [0x98,0xe4,0x40,0xa8]
+
+ ldnp s29, s28, [sp, #252]
+ stnp s27, s26, [sp, #-256]
+ ldnp s1, s2, [x3, #44]
+// CHECK: ldnp s29, s28, [sp, #252] // encoding: [0xfd,0xf3,0x5f,0x2c]
+// CHECK: stnp s27, s26, [sp, #-256] // encoding: [0xfb,0x6b,0x20,0x2c]
+// CHECK: ldnp s1, s2, [x3, #44] // encoding: [0x61,0x88,0x45,0x2c]
+
+ stnp d3, d5, [x9, #504]
+ stnp d7, d11, [x10, #-512]
+ ldnp d2, d3, [x30, #-8]
+// CHECK: stnp d3, d5, [x9, #504] // encoding: [0x23,0x95,0x1f,0x6c]
+// CHECK: stnp d7, d11, [x10, #-512] // encoding: [0x47,0x2d,0x20,0x6c]
+// CHECK: ldnp d2, d3, [x30, #-8] // encoding: [0xc2,0x8f,0x7f,0x6c]
+
+ stnp q3, q5, [sp]
+ stnp q17, q19, [sp, #1008]
+ ldnp q23, q29, [x1, #-1024]
+// CHECK: stnp q3, q5, [sp] // encoding: [0xe3,0x17,0x00,0xac]
+// CHECK: stnp q17, q19, [sp, #1008] // encoding: [0xf1,0xcf,0x1f,0xac]
+// CHECK: ldnp q23, q29, [x1, #-1024] // encoding: [0x37,0x74,0x60,0xac]
+
+//------------------------------------------------------------------------------
+// Logical (immediate)
+//------------------------------------------------------------------------------
+ // 32 bit replication-width
+ orr w3, w9, #0xffff0000
+ orr wsp, w10, #0xe00000ff
+ orr w9, w10, #0x000003ff
+// CHECK: orr w3, w9, #0xffff0000 // encoding: [0x23,0x3d,0x10,0x32]
+// CHECK: orr wsp, w10, #0xe00000ff // encoding: [0x5f,0x29,0x03,0x32]
+// CHECK: orr w9, w10, #0x3ff // encoding: [0x49,0x25,0x00,0x32]
+
+ // 16 bit replication width
+ and w14, w15, #0x80008000
+ and w12, w13, #0xffc3ffc3
+ and w11, wzr, #0x00030003
+// CHECK: and w14, w15, #0x80008000 // encoding: [0xee,0x81,0x01,0x12]
+// CHECK: and w12, w13, #0xffc3ffc3 // encoding: [0xac,0xad,0x0a,0x12]
+// CHECK: and w11, wzr, #0x30003 // encoding: [0xeb,0x87,0x00,0x12]
+
+ // 8 bit replication width
+ eor w3, w6, #0xe0e0e0e0
+ eor wsp, wzr, #0x03030303
+ eor w16, w17, #0x81818181
+// CHECK: eor w3, w6, #0xe0e0e0e0 // encoding: [0xc3,0xc8,0x03,0x52]
+// CHECK: eor wsp, wzr, #0x3030303 // encoding: [0xff,0xc7,0x00,0x52]
+// CHECK: eor w16, w17, #0x81818181 // encoding:
[0x30,0xc6,0x01,0x52] + + // 4 bit replication width + ands wzr, w18, #0xcccccccc + ands w19, w20, #0x33333333 + ands w21, w22, #0x99999999 +// CHECK: ands wzr, w18, #0xcccccccc // encoding: [0x5f,0xe6,0x02,0x72] +// CHECK: ands w19, w20, #0x33333333 // encoding: [0x93,0xe6,0x00,0x72] +// CHECK: ands w21, w22, #0x99999999 // encoding: [0xd5,0xe6,0x01,0x72] + + // 2 bit replication width + tst w3, #0xaaaaaaaa + tst wzr, #0x55555555 +// CHECK: ands wzr, w3, #0xaaaaaaaa // encoding: [0x7f,0xf0,0x01,0x72] +// CHECK: ands wzr, wzr, #0x55555555 // encoding: [0xff,0xf3,0x00,0x72] + + // 64 bit replication-width + eor x3, x5, #0xffffffffc000000 + and x9, x10, #0x00007fffffffffff + orr x11, x12, #0x8000000000000fff +// CHECK: eor x3, x5, #0xffffffffc000000 // encoding: [0xa3,0x84,0x66,0xd2] +// CHECK: and x9, x10, #0x7fffffffffff // encoding: [0x49,0xb9,0x40,0x92] +// CHECK: orr x11, x12, #0x8000000000000fff // encoding: [0x8b,0x31,0x41,0xb2] + + // 32 bit replication-width + orr x3, x9, #0xffff0000ffff0000 + orr sp, x10, #0xe00000ffe00000ff + orr x9, x10, #0x000003ff000003ff +// CHECK: orr x3, x9, #0xffff0000ffff0000 // encoding: [0x23,0x3d,0x10,0xb2] +// CHECK: orr sp, x10, #0xe00000ffe00000ff // encoding: [0x5f,0x29,0x03,0xb2] +// CHECK: orr x9, x10, #0x3ff000003ff // encoding: [0x49,0x25,0x00,0xb2] + + // 16 bit replication-width + and x14, x15, #0x8000800080008000 + and x12, x13, #0xffc3ffc3ffc3ffc3 + and x11, xzr, #0x0003000300030003 +// CHECK: and x14, x15, #0x8000800080008000 // encoding: [0xee,0x81,0x01,0x92] +// CHECK: and x12, x13, #0xffc3ffc3ffc3ffc3 // encoding: [0xac,0xad,0x0a,0x92] +// CHECK: and x11, xzr, #0x3000300030003 // encoding: [0xeb,0x87,0x00,0x92] + + // 8 bit replication-width + eor x3, x6, #0xe0e0e0e0e0e0e0e0 + eor sp, xzr, #0x0303030303030303 + eor x16, x17, #0x8181818181818181 +// CHECK: eor x3, x6, #0xe0e0e0e0e0e0e0e0 // encoding: [0xc3,0xc8,0x03,0xd2] +// CHECK: eor sp, xzr, #0x303030303030303 // encoding: [0xff,0xc7,0x00,0xd2] +// CHECK: eor x16, x17, #0x8181818181818181 // encoding: [0x30,0xc6,0x01,0xd2] + + // 4 bit replication-width + ands xzr, x18, #0xcccccccccccccccc + ands x19, x20, #0x3333333333333333 + ands x21, x22, #0x9999999999999999 +// CHECK: ands xzr, x18, #0xcccccccccccccccc // encoding: [0x5f,0xe6,0x02,0xf2] +// CHECK: ands x19, x20, #0x3333333333333333 // encoding: [0x93,0xe6,0x00,0xf2] +// CHECK: ands x21, x22, #0x9999999999999999 // encoding: [0xd5,0xe6,0x01,0xf2] + + // 2 bit replication-width + tst x3, #0xaaaaaaaaaaaaaaaa + tst xzr, #0x5555555555555555 +// CHECK: ands xzr, x3, #0xaaaaaaaaaaaaaaaa // encoding: [0x7f,0xf0,0x01,0xf2] +// CHECK: ands xzr, xzr, #0x5555555555555555 // encoding: [0xff,0xf3,0x00,0xf2] + + mov w3, #0xf000f + mov x10, #0xaaaaaaaaaaaaaaaa +// CHECK: orr w3, wzr, #0xf000f // encoding: [0xe3,0x8f,0x00,0x32] +// CHECK: orr x10, xzr, #0xaaaaaaaaaaaaaaaa // encoding: [0xea,0xf3,0x01,0xb2] + +//------------------------------------------------------------------------------ +// Logical (shifted register) +//------------------------------------------------------------------------------ + + and w12, w23, w21 + and w16, w15, w1, lsl #1 + and w9, w4, w10, lsl #31 + and w3, w30, w11, lsl #0 + and x3, x5, x7, lsl #63 +// CHECK: and w12, w23, w21 // encoding: [0xec,0x02,0x15,0x0a] +// CHECK: and w16, w15, w1, lsl #1 // encoding: [0xf0,0x05,0x01,0x0a] +// CHECK: and w9, w4, w10, lsl #31 // encoding: [0x89,0x7c,0x0a,0x0a] +// CHECK: and w3, w30, w11 // encoding: [0xc3,0x03,0x0b,0x0a] +// CHECK: and x3, x5, x7, lsl #63 // encoding: 
[0xa3,0xfc,0x07,0x8a] + + and x5, x14, x19, asr #4 + and w3, w17, w19, ror #31 + and w0, w2, wzr, lsr #17 + and w3, w30, w11, asr #0 +// CHECK: and x5, x14, x19, asr #4 // encoding: [0xc5,0x11,0x93,0x8a] +// CHECK: and w3, w17, w19, ror #31 // encoding: [0x23,0x7e,0xd3,0x0a] +// CHECK: and w0, w2, wzr, lsr #17 // encoding: [0x40,0x44,0x5f,0x0a] +// CHECK: and w3, w30, w11, asr #0 // encoding: [0xc3,0x03,0x8b,0x0a] + + and xzr, x4, x26, lsl #0 + and w3, wzr, w20, ror #0 + and x7, x20, xzr, asr #63 +// CHECK: and xzr, x4, x26 // encoding: [0x9f,0x00,0x1a,0x8a] +// CHECK: and w3, wzr, w20, ror #0 // encoding: [0xe3,0x03,0xd4,0x0a] +// CHECK: and x7, x20, xzr, asr #63 // encoding: [0x87,0xfe,0x9f,0x8a] + + bic x13, x20, x14, lsl #47 + bic w2, w7, w9 + orr w2, w7, w0, asr #31 + orr x8, x9, x10, lsl #12 + orn x3, x5, x7, asr #0 + orn w2, w5, w29 +// CHECK: bic x13, x20, x14, lsl #47 // encoding: [0x8d,0xbe,0x2e,0x8a] +// CHECK: bic w2, w7, w9 // encoding: [0xe2,0x00,0x29,0x0a] +// CHECK: orr w2, w7, w0, asr #31 // encoding: [0xe2,0x7c,0x80,0x2a] +// CHECK: orr x8, x9, x10, lsl #12 // encoding: [0x28,0x31,0x0a,0xaa] +// CHECK: orn x3, x5, x7, asr #0 // encoding: [0xa3,0x00,0xa7,0xaa] +// CHECK: orn w2, w5, w29 // encoding: [0xa2,0x00,0x3d,0x2a] + + ands w7, wzr, w9, lsl #1 + ands x3, x5, x20, ror #63 + bics w3, w5, w7, lsl #0 + bics x3, xzr, x3, lsl #1 +// CHECK: ands w7, wzr, w9, lsl #1 // encoding: [0xe7,0x07,0x09,0x6a] +// CHECK: ands x3, x5, x20, ror #63 // encoding: [0xa3,0xfc,0xd4,0xea] +// CHECK: bics w3, w5, w7 // encoding: [0xa3,0x00,0x27,0x6a] +// CHECK: bics x3, xzr, x3, lsl #1 // encoding: [0xe3,0x07,0x23,0xea] + + tst w3, w7, lsl #31 + tst x2, x20, asr #0 +// CHECK: tst w3, w7, lsl #31 // encoding: [0x7f,0x7c,0x07,0x6a] +// CHECK: tst x2, x20, asr #0 // encoding: [0x5f,0x00,0x94,0xea] + + mov x3, x6 + mov x3, xzr + mov wzr, w2 + mov w3, w5 +// CHECK: mov x3, x6 // encoding: [0xe3,0x03,0x06,0xaa] +// CHECK: mov x3, xzr // encoding: [0xe3,0x03,0x1f,0xaa] +// CHECK: mov wzr, w2 // encoding: [0xff,0x03,0x02,0x2a] +// CHECK: mov w3, w5 // encoding: [0xe3,0x03,0x05,0x2a] + +//------------------------------------------------------------------------------ +// Move wide (immediate) +//------------------------------------------------------------------------------ + + movz w1, #65535, lsl #0 + movz w2, #0, lsl #16 + movn w2, #1234, lsl #0 +// CHECK: movz w1, #65535 // encoding: [0xe1,0xff,0x9f,0x52] +// CHECK: movz w2, #0, lsl #16 // encoding: [0x02,0x00,0xa0,0x52] +// CHECK: movn w2, #1234 // encoding: [0x42,0x9a,0x80,0x12] + + movz x2, #1234, lsl #32 + movk xzr, #4321, lsl #48 +// CHECK: movz x2, #1234, lsl #32 // encoding: [0x42,0x9a,0xc0,0xd2] +// CHECK: movk xzr, #4321, lsl #48 // encoding: [0x3f,0x1c,0xe2,0xf2] + + movz x2, #:abs_g0:sym + movk w3, #:abs_g0_nc:sym +// CHECK: movz x2, #:abs_g0:sym // encoding: [0x02'A',A,0x80'A',0xd2'A'] +// CHECK-NEXT: // fixup A - offset: 0, value: :abs_g0:sym, kind: fixup_a64_movw_uabs_g0 +// CHECK: movk w3, #:abs_g0_nc:sym // encoding: [0x03'A',A,0x80'A',0x72'A'] +// CHECK-NEXT: // fixup A - offset: 0, value: :abs_g0_nc:sym, kind: fixup_a64_movw_uabs_g0_nc + + movz x4, #:abs_g1:sym + movk w5, #:abs_g1_nc:sym +// CHECK: movz x4, #:abs_g1:sym // encoding: [0x04'A',A,0xa0'A',0xd2'A'] +// CHECK-NEXT: // fixup A - offset: 0, value: :abs_g1:sym, kind: fixup_a64_movw_uabs_g1 +// CHECK: movk w5, #:abs_g1_nc:sym // encoding: [0x05'A',A,0xa0'A',0x72'A'] +// CHECK-NEXT: // fixup A - offset: 0, value: :abs_g1_nc:sym, kind: fixup_a64_movw_uabs_g1_nc + + movz x6, 
#:abs_g2:sym + movk x7, #:abs_g2_nc:sym +// CHECK: movz x6, #:abs_g2:sym // encoding: [0x06'A',A,0xc0'A',0xd2'A'] +// CHECK-NEXT: // fixup A - offset: 0, value: :abs_g2:sym, kind: fixup_a64_movw_uabs_g2 +// CHECK: movk x7, #:abs_g2_nc:sym // encoding: [0x07'A',A,0xc0'A',0xf2'A'] +// CHECK-NEXT: // fixup A - offset: 0, value: :abs_g2_nc:sym, kind: fixup_a64_movw_uabs_g2_nc + + movz x8, #:abs_g3:sym + movk x9, #:abs_g3:sym +// CHECK: movz x8, #:abs_g3:sym // encoding: [0x08'A',A,0xe0'A',0xd2'A'] +// CHECK-NEXT: // fixup A - offset: 0, value: :abs_g3:sym, kind: fixup_a64_movw_uabs_g3 +// CHECK: movk x9, #:abs_g3:sym // encoding: [0x09'A',A,0xe0'A',0xf2'A'] +// CHECK-NEXT: // fixup A - offset: 0, value: :abs_g3:sym, kind: fixup_a64_movw_uabs_g3 + + movn x30, #:abs_g0_s:sym + movz x19, #:abs_g0_s:sym + movn w10, #:abs_g0_s:sym + movz w25, #:abs_g0_s:sym +// CHECK: movn x30, #:abs_g0_s:sym // encoding: [0x1e'A',A,0x80'A',0x92'A'] +// CHECK-NEXT: // fixup A - offset: 0, value: :abs_g0_s:sym, kind: fixup_a64_movw_sabs_g0 +// CHECK: movz x19, #:abs_g0_s:sym // encoding: [0x13'A',A,0x80'A',0x92'A'] +// CHECK-NEXT: // fixup A - offset: 0, value: :abs_g0_s:sym, kind: fixup_a64_movw_sabs_g0 +// CHECK: movn w10, #:abs_g0_s:sym // encoding: [0x0a'A',A,0x80'A',0x12'A'] +// CHECK-NEXT: // fixup A - offset: 0, value: :abs_g0_s:sym, kind: fixup_a64_movw_sabs_g0 +// CHECK: movz w25, #:abs_g0_s:sym // encoding: [0x19'A',A,0x80'A',0x12'A'] +// CHECK-NEXT: // fixup A - offset: 0, value: :abs_g0_s:sym, kind: fixup_a64_movw_sabs_g0 + + movn x30, #:abs_g1_s:sym + movz x19, #:abs_g1_s:sym + movn w10, #:abs_g1_s:sym + movz w25, #:abs_g1_s:sym +// CHECK: movn x30, #:abs_g1_s:sym // encoding: [0x1e'A',A,0xa0'A',0x92'A'] +// CHECK-NEXT: // fixup A - offset: 0, value: :abs_g1_s:sym, kind: fixup_a64_movw_sabs_g1 +// CHECK: movz x19, #:abs_g1_s:sym // encoding: [0x13'A',A,0xa0'A',0x92'A'] +// CHECK-NEXT: // fixup A - offset: 0, value: :abs_g1_s:sym, kind: fixup_a64_movw_sabs_g1 +// CHECK: movn w10, #:abs_g1_s:sym // encoding: [0x0a'A',A,0xa0'A',0x12'A'] +// CHECK-NEXT: // fixup A - offset: 0, value: :abs_g1_s:sym, kind: fixup_a64_movw_sabs_g1 +// CHECK: movz w25, #:abs_g1_s:sym // encoding: [0x19'A',A,0xa0'A',0x12'A'] +// CHECK-NEXT: // fixup A - offset: 0, value: :abs_g1_s:sym, kind: fixup_a64_movw_sabs_g1 + + movn x30, #:abs_g2_s:sym + movz x19, #:abs_g2_s:sym +// CHECK: movn x30, #:abs_g2_s:sym // encoding: [0x1e'A',A,0xc0'A',0x92'A'] +// CHECK-NEXT: // fixup A - offset: 0, value: :abs_g2_s:sym, kind: fixup_a64_movw_sabs_g2 +// CHECK: movz x19, #:abs_g2_s:sym // encoding: [0x13'A',A,0xc0'A',0x92'A'] +// CHECK-NEXT: // fixup A - offset: 0, value: :abs_g2_s:sym, kind: fixup_a64_movw_sabs_g2 + +//------------------------------------------------------------------------------ +// PC-relative addressing +//------------------------------------------------------------------------------ + + adr x2, loc + adr xzr, loc + // CHECK: adr x2, loc // encoding: [0x02'A',A,A,0x10'A'] + // CHECK: // fixup A - offset: 0, value: loc, kind: fixup_a64_adr_prel + // CHECK: adr xzr, loc // encoding: [0x1f'A',A,A,0x10'A'] + // CHECK: // fixup A - offset: 0, value: loc, kind: fixup_a64_adr_prel + + adrp x29, loc + // CHECK: adrp x29, loc // encoding: [0x1d'A',A,A,0x90'A'] + // CHECK: // fixup A - offset: 0, value: loc, kind: fixup_a64_adr_prel_page + + adrp x30, #4096 + adr x20, #0 + adr x9, #-1 + adr x5, #1048575 +// CHECK: adrp x30, #4096 // encoding: [0x1e,0x00,0x00,0xb0] +// CHECK: adr x20, #0 // encoding: [0x14,0x00,0x00,0x10] +// CHECK: adr 
x9, #-1 // encoding: [0xe9,0xff,0xff,0x70] +// CHECK: adr x5, #1048575 // encoding: [0xe5,0xff,0x7f,0x70] + + adr x9, #1048575 + adr x2, #-1048576 + adrp x9, #4294963200 + adrp x20, #-4294967296 +// CHECK: adr x9, #1048575 // encoding: [0xe9,0xff,0x7f,0x70] +// CHECK: adr x2, #-1048576 // encoding: [0x02,0x00,0x80,0x10] +// CHECK: adrp x9, #4294963200 // encoding: [0xe9,0xff,0x7f,0xf0] +// CHECK: adrp x20, #-4294967296 // encoding: [0x14,0x00,0x80,0x90] + +//------------------------------------------------------------------------------ +// System +//------------------------------------------------------------------------------ + + hint #0 + hint #127 +// CHECK: nop // encoding: [0x1f,0x20,0x03,0xd5] +// CHECK: hint #127 // encoding: [0xff,0x2f,0x03,0xd5] + + nop + yield + wfe + wfi + sev + sevl +// CHECK: nop // encoding: [0x1f,0x20,0x03,0xd5] +// CHECK: yield // encoding: [0x3f,0x20,0x03,0xd5] +// CHECK: wfe // encoding: [0x5f,0x20,0x03,0xd5] +// CHECK: wfi // encoding: [0x7f,0x20,0x03,0xd5] +// CHECK: sev // encoding: [0x9f,0x20,0x03,0xd5] +// CHECK: sevl // encoding: [0xbf,0x20,0x03,0xd5] + + clrex + clrex #0 + clrex #7 + clrex #15 +// CHECK: clrex // encoding: [0x5f,0x3f,0x03,0xd5] +// CHECK: clrex #0 // encoding: [0x5f,0x30,0x03,0xd5] +// CHECK: clrex #7 // encoding: [0x5f,0x37,0x03,0xd5] +// CHECK: clrex // encoding: [0x5f,0x3f,0x03,0xd5] + + dsb #0 + dsb #12 + dsb #15 + dsb oshld + dsb oshst + dsb osh + dsb nshld + dsb nshst + dsb nsh + dsb ishld + dsb ishst + dsb ish + dsb ld + dsb st + dsb sy +// CHECK: dsb #0 // encoding: [0x9f,0x30,0x03,0xd5] +// CHECK: dsb #12 // encoding: [0x9f,0x3c,0x03,0xd5] +// CHECK: dsb sy // encoding: [0x9f,0x3f,0x03,0xd5] +// CHECK: dsb oshld // encoding: [0x9f,0x31,0x03,0xd5] +// CHECK: dsb oshst // encoding: [0x9f,0x32,0x03,0xd5] +// CHECK: dsb osh // encoding: [0x9f,0x33,0x03,0xd5] +// CHECK: dsb nshld // encoding: [0x9f,0x35,0x03,0xd5] +// CHECK: dsb nshst // encoding: [0x9f,0x36,0x03,0xd5] +// CHECK: dsb nsh // encoding: [0x9f,0x37,0x03,0xd5] +// CHECK: dsb ishld // encoding: [0x9f,0x39,0x03,0xd5] +// CHECK: dsb ishst // encoding: [0x9f,0x3a,0x03,0xd5] +// CHECK: dsb ish // encoding: [0x9f,0x3b,0x03,0xd5] +// CHECK: dsb ld // encoding: [0x9f,0x3d,0x03,0xd5] +// CHECK: dsb st // encoding: [0x9f,0x3e,0x03,0xd5] +// CHECK: dsb sy // encoding: [0x9f,0x3f,0x03,0xd5] + + dmb #0 + dmb #12 + dmb #15 + dmb oshld + dmb oshst + dmb osh + dmb nshld + dmb nshst + dmb nsh + dmb ishld + dmb ishst + dmb ish + dmb ld + dmb st + dmb sy +// CHECK: dmb #0 // encoding: [0xbf,0x30,0x03,0xd5] +// CHECK: dmb #12 // encoding: [0xbf,0x3c,0x03,0xd5] +// CHECK: dmb sy // encoding: [0xbf,0x3f,0x03,0xd5] +// CHECK: dmb oshld // encoding: [0xbf,0x31,0x03,0xd5] +// CHECK: dmb oshst // encoding: [0xbf,0x32,0x03,0xd5] +// CHECK: dmb osh // encoding: [0xbf,0x33,0x03,0xd5] +// CHECK: dmb nshld // encoding: [0xbf,0x35,0x03,0xd5] +// CHECK: dmb nshst // encoding: [0xbf,0x36,0x03,0xd5] +// CHECK: dmb nsh // encoding: [0xbf,0x37,0x03,0xd5] +// CHECK: dmb ishld // encoding: [0xbf,0x39,0x03,0xd5] +// CHECK: dmb ishst // encoding: [0xbf,0x3a,0x03,0xd5] +// CHECK: dmb ish // encoding: [0xbf,0x3b,0x03,0xd5] +// CHECK: dmb ld // encoding: [0xbf,0x3d,0x03,0xd5] +// CHECK: dmb st // encoding: [0xbf,0x3e,0x03,0xd5] +// CHECK: dmb sy // encoding: [0xbf,0x3f,0x03,0xd5] + + isb sy + isb + isb #12 +// CHECK: isb // encoding: [0xdf,0x3f,0x03,0xd5] +// CHECK: isb // encoding: [0xdf,0x3f,0x03,0xd5] +// CHECK: isb #12 // encoding: [0xdf,0x3c,0x03,0xd5] + + + msr spsel, #0 + msr daifset, #15 + msr 
daifclr, #12 +// CHECK: msr spsel, #0 // encoding: [0xbf,0x40,0x00,0xd5] +// CHECK: msr daifset, #15 // encoding: [0xdf,0x4f,0x03,0xd5] +// CHECK: msr daifclr, #12 // encoding: [0xff,0x4c,0x03,0xd5] + + sys #7, c5, c9, #7, x5 + sys #0, c15, c15, #2 +// CHECK: sys #7, c5, c9, #7, x5 // encoding: [0xe5,0x59,0x0f,0xd5] +// CHECK: sys #0, c15, c15, #2, xzr // encoding: [0x5f,0xff,0x08,0xd5] + + sysl x9, #7, c5, c9, #7 + sysl x1, #0, c15, c15, #2 +// CHECK: sysl x9, #7, c5, c9, #7 // encoding: [0xe9,0x59,0x2f,0xd5] +// CHECK: sysl x1, #0, c15, c15, #2 // encoding: [0x41,0xff,0x28,0xd5] + + ic ialluis + ic iallu + ic ivau, x9 +// CHECK: ic ialluis // encoding: [0x1f,0x71,0x08,0xd5] +// CHECK: ic iallu // encoding: [0x1f,0x75,0x08,0xd5] +// CHECK: ic ivau, x9 // encoding: [0x29,0x75,0x0b,0xd5] + + dc zva, x12 + dc ivac, xzr + dc isw, x2 + dc cvac, x9 + dc csw, x10 + dc cvau, x0 + dc civac, x3 + dc cisw, x30 +// CHECK: dc zva, x12 // encoding: [0x2c,0x74,0x0b,0xd5] +// CHECK: dc ivac, xzr // encoding: [0x3f,0x76,0x08,0xd5] +// CHECK: dc isw, x2 // encoding: [0x42,0x76,0x08,0xd5] +// CHECK: dc cvac, x9 // encoding: [0x29,0x7a,0x0b,0xd5] +// CHECK: dc csw, x10 // encoding: [0x4a,0x7a,0x08,0xd5] +// CHECK: dc cvau, x0 // encoding: [0x20,0x7b,0x0b,0xd5] +// CHECK: dc civac, x3 // encoding: [0x23,0x7e,0x0b,0xd5] +// CHECK: dc cisw, x30 // encoding: [0x5e,0x7e,0x08,0xd5] + + at S1E1R, x19 + at S1E2R, x19 + at S1E3R, x19 + at S1E1W, x19 + at S1E2W, x19 + at S1E3W, x19 + at S1E0R, x19 + at S1E0W, x19 + at S12E1R, x20 + at S12E1W, x20 + at S12E0R, x20 + at S12E0W, x20 +// CHECK: at s1e1r, x19 // encoding: [0x13,0x78,0x08,0xd5] +// CHECK: at s1e2r, x19 // encoding: [0x13,0x78,0x0c,0xd5] +// CHECK: at s1e3r, x19 // encoding: [0x13,0x78,0x0e,0xd5] +// CHECK: at s1e1w, x19 // encoding: [0x33,0x78,0x08,0xd5] +// CHECK: at s1e2w, x19 // encoding: [0x33,0x78,0x0c,0xd5] +// CHECK: at s1e3w, x19 // encoding: [0x33,0x78,0x0e,0xd5] +// CHECK: at s1e0r, x19 // encoding: [0x53,0x78,0x08,0xd5] +// CHECK: at s1e0w, x19 // encoding: [0x73,0x78,0x08,0xd5] +// CHECK: at s12e1r, x20 // encoding: [0x94,0x78,0x0c,0xd5] +// CHECK: at s12e1w, x20 // encoding: [0xb4,0x78,0x0c,0xd5] +// CHECK: at s12e0r, x20 // encoding: [0xd4,0x78,0x0c,0xd5] +// CHECK: at s12e0w, x20 // encoding: [0xf4,0x78,0x0c,0xd5] + + tlbi IPAS2E1IS, x4 + tlbi IPAS2LE1IS, x9 + tlbi VMALLE1IS + tlbi ALLE2IS + tlbi ALLE3IS + tlbi VAE1IS, x1 + tlbi VAE2IS, x2 + tlbi VAE3IS, x3 + tlbi ASIDE1IS, x5 + tlbi VAAE1IS, x9 + tlbi ALLE1IS + tlbi VALE1IS, x10 + tlbi VALE2IS, x11 + tlbi VALE3IS, x13 + tlbi VMALLS12E1IS + tlbi VAALE1IS, x14 + tlbi IPAS2E1, x15 + tlbi IPAS2LE1, x16 + tlbi VMALLE1 + tlbi ALLE2 + tlbi ALLE3 + tlbi VAE1, x17 + tlbi VAE2, x18 + tlbi VAE3, x19 + tlbi ASIDE1, x20 + tlbi VAAE1, x21 + tlbi ALLE1 + tlbi VALE1, x22 + tlbi VALE2, x23 + tlbi VALE3, x24 + tlbi VMALLS12E1 + tlbi VAALE1, x25 +// CHECK: tlbi ipas2e1is, x4 // encoding: [0x24,0x80,0x0c,0xd5] +// CHECK: tlbi ipas2le1is, x9 // encoding: [0xa9,0x80,0x0c,0xd5] +// CHECK: tlbi vmalle1is // encoding: [0x1f,0x83,0x08,0xd5] +// CHECK: tlbi alle2is // encoding: [0x1f,0x83,0x0c,0xd5] +// CHECK: tlbi alle3is // encoding: [0x1f,0x83,0x0e,0xd5] +// CHECK: tlbi vae1is, x1 // encoding: [0x21,0x83,0x08,0xd5] +// CHECK: tlbi vae2is, x2 // encoding: [0x22,0x83,0x0c,0xd5] +// CHECK: tlbi vae3is, x3 // encoding: [0x23,0x83,0x0e,0xd5] +// CHECK: tlbi aside1is, x5 // encoding: [0x45,0x83,0x08,0xd5] +// CHECK: tlbi vaae1is, x9 // encoding: [0x69,0x83,0x08,0xd5] +// CHECK: tlbi alle1is // encoding: 
[0x9f,0x83,0x0c,0xd5] +// CHECK: tlbi vale1is, x10 // encoding: [0xaa,0x83,0x08,0xd5] +// CHECK: tlbi vale2is, x11 // encoding: [0xab,0x83,0x0c,0xd5] +// CHECK: tlbi vale3is, x13 // encoding: [0xad,0x83,0x0e,0xd5] +// CHECK: tlbi vmalls12e1is // encoding: [0xdf,0x83,0x0c,0xd5] +// CHECK: tlbi vaale1is, x14 // encoding: [0xee,0x83,0x08,0xd5] +// CHECK: tlbi ipas2e1, x15 // encoding: [0x2f,0x84,0x0c,0xd5] +// CHECK: tlbi ipas2le1, x16 // encoding: [0xb0,0x84,0x0c,0xd5] +// CHECK: tlbi vmalle1 // encoding: [0x1f,0x87,0x08,0xd5] +// CHECK: tlbi alle2 // encoding: [0x1f,0x87,0x0c,0xd5] +// CHECK: tlbi alle3 // encoding: [0x1f,0x87,0x0e,0xd5] +// CHECK: tlbi vae1, x17 // encoding: [0x31,0x87,0x08,0xd5] +// CHECK: tlbi vae2, x18 // encoding: [0x32,0x87,0x0c,0xd5] +// CHECK: tlbi vae3, x19 // encoding: [0x33,0x87,0x0e,0xd5] +// CHECK: tlbi aside1, x20 // encoding: [0x54,0x87,0x08,0xd5] +// CHECK: tlbi vaae1, x21 // encoding: [0x75,0x87,0x08,0xd5] +// CHECK: tlbi alle1 // encoding: [0x9f,0x87,0x0c,0xd5] +// CHECK: tlbi vale1, x22 // encoding: [0xb6,0x87,0x08,0xd5] +// CHECK: tlbi vale2, x23 // encoding: [0xb7,0x87,0x0c,0xd5] +// CHECK: tlbi vale3, x24 // encoding: [0xb8,0x87,0x0e,0xd5] +// CHECK: tlbi vmalls12e1 // encoding: [0xdf,0x87,0x0c,0xd5] +// CHECK: tlbi vaale1, x25 // encoding: [0xf9,0x87,0x08,0xd5] + + msr TEECR32_EL1, x12 + msr OSDTRRX_EL1, x12 + msr MDCCINT_EL1, x12 + msr MDSCR_EL1, x12 + msr OSDTRTX_EL1, x12 + msr DBGDTR_EL0, x12 + msr DBGDTRTX_EL0, x12 + msr OSECCR_EL1, x12 + msr DBGVCR32_EL2, x12 + msr DBGBVR0_EL1, x12 + msr DBGBVR1_EL1, x12 + msr DBGBVR2_EL1, x12 + msr DBGBVR3_EL1, x12 + msr DBGBVR4_EL1, x12 + msr DBGBVR5_EL1, x12 + msr DBGBVR6_EL1, x12 + msr DBGBVR7_EL1, x12 + msr DBGBVR8_EL1, x12 + msr DBGBVR9_EL1, x12 + msr DBGBVR10_EL1, x12 + msr DBGBVR11_EL1, x12 + msr DBGBVR12_EL1, x12 + msr DBGBVR13_EL1, x12 + msr DBGBVR14_EL1, x12 + msr DBGBVR15_EL1, x12 + msr DBGBCR0_EL1, x12 + msr DBGBCR1_EL1, x12 + msr DBGBCR2_EL1, x12 + msr DBGBCR3_EL1, x12 + msr DBGBCR4_EL1, x12 + msr DBGBCR5_EL1, x12 + msr DBGBCR6_EL1, x12 + msr DBGBCR7_EL1, x12 + msr DBGBCR8_EL1, x12 + msr DBGBCR9_EL1, x12 + msr DBGBCR10_EL1, x12 + msr DBGBCR11_EL1, x12 + msr DBGBCR12_EL1, x12 + msr DBGBCR13_EL1, x12 + msr DBGBCR14_EL1, x12 + msr DBGBCR15_EL1, x12 + msr DBGWVR0_EL1, x12 + msr DBGWVR1_EL1, x12 + msr DBGWVR2_EL1, x12 + msr DBGWVR3_EL1, x12 + msr DBGWVR4_EL1, x12 + msr DBGWVR5_EL1, x12 + msr DBGWVR6_EL1, x12 + msr DBGWVR7_EL1, x12 + msr DBGWVR8_EL1, x12 + msr DBGWVR9_EL1, x12 + msr DBGWVR10_EL1, x12 + msr DBGWVR11_EL1, x12 + msr DBGWVR12_EL1, x12 + msr DBGWVR13_EL1, x12 + msr DBGWVR14_EL1, x12 + msr DBGWVR15_EL1, x12 + msr DBGWCR0_EL1, x12 + msr DBGWCR1_EL1, x12 + msr DBGWCR2_EL1, x12 + msr DBGWCR3_EL1, x12 + msr DBGWCR4_EL1, x12 + msr DBGWCR5_EL1, x12 + msr DBGWCR6_EL1, x12 + msr DBGWCR7_EL1, x12 + msr DBGWCR8_EL1, x12 + msr DBGWCR9_EL1, x12 + msr DBGWCR10_EL1, x12 + msr DBGWCR11_EL1, x12 + msr DBGWCR12_EL1, x12 + msr DBGWCR13_EL1, x12 + msr DBGWCR14_EL1, x12 + msr DBGWCR15_EL1, x12 + msr TEEHBR32_EL1, x12 + msr OSLAR_EL1, x12 + msr OSDLR_EL1, x12 + msr DBGPRCR_EL1, x12 + msr DBGCLAIMSET_EL1, x12 + msr DBGCLAIMCLR_EL1, x12 + msr CSSELR_EL1, x12 + msr VPIDR_EL2, x12 + msr VMPIDR_EL2, x12 + msr SCTLR_EL1, x12 + msr SCTLR_EL2, x12 + msr SCTLR_EL3, x12 + msr ACTLR_EL1, x12 + msr ACTLR_EL2, x12 + msr ACTLR_EL3, x12 + msr CPACR_EL1, x12 + msr HCR_EL2, x12 + msr SCR_EL3, x12 + msr MDCR_EL2, x12 + msr SDER32_EL3, x12 + msr CPTR_EL2, x12 + msr CPTR_EL3, x12 + msr HSTR_EL2, x12 + msr HACR_EL2, x12 + msr MDCR_EL3, 
x12 + msr TTBR0_EL1, x12 + msr TTBR0_EL2, x12 + msr TTBR0_EL3, x12 + msr TTBR1_EL1, x12 + msr TCR_EL1, x12 + msr TCR_EL2, x12 + msr TCR_EL3, x12 + msr VTTBR_EL2, x12 + msr VTCR_EL2, x12 + msr DACR32_EL2, x12 + msr SPSR_EL1, x12 + msr SPSR_EL2, x12 + msr SPSR_EL3, x12 + msr ELR_EL1, x12 + msr ELR_EL2, x12 + msr ELR_EL3, x12 + msr SP_EL0, x12 + msr SP_EL1, x12 + msr SP_EL2, x12 + msr SPSel, x12 + msr NZCV, x12 + msr DAIF, x12 + msr CurrentEL, x12 + msr SPSR_irq, x12 + msr SPSR_abt, x12 + msr SPSR_und, x12 + msr SPSR_fiq, x12 + msr FPCR, x12 + msr FPSR, x12 + msr DSPSR_EL0, x12 + msr DLR_EL0, x12 + msr IFSR32_EL2, x12 + msr AFSR0_EL1, x12 + msr AFSR0_EL2, x12 + msr AFSR0_EL3, x12 + msr AFSR1_EL1, x12 + msr AFSR1_EL2, x12 + msr AFSR1_EL3, x12 + msr ESR_EL1, x12 + msr ESR_EL2, x12 + msr ESR_EL3, x12 + msr FPEXC32_EL2, x12 + msr FAR_EL1, x12 + msr FAR_EL2, x12 + msr FAR_EL3, x12 + msr HPFAR_EL2, x12 + msr PAR_EL1, x12 + msr PMCR_EL0, x12 + msr PMCNTENSET_EL0, x12 + msr PMCNTENCLR_EL0, x12 + msr PMOVSCLR_EL0, x12 + msr PMSELR_EL0, x12 + msr PMCCNTR_EL0, x12 + msr PMXEVTYPER_EL0, x12 + msr PMXEVCNTR_EL0, x12 + msr PMUSERENR_EL0, x12 + msr PMINTENSET_EL1, x12 + msr PMINTENCLR_EL1, x12 + msr PMOVSSET_EL0, x12 + msr MAIR_EL1, x12 + msr MAIR_EL2, x12 + msr MAIR_EL3, x12 + msr AMAIR_EL1, x12 + msr AMAIR_EL2, x12 + msr AMAIR_EL3, x12 + msr VBAR_EL1, x12 + msr VBAR_EL2, x12 + msr VBAR_EL3, x12 + msr RMR_EL1, x12 + msr RMR_EL2, x12 + msr RMR_EL3, x12 + msr CONTEXTIDR_EL1, x12 + msr TPIDR_EL0, x12 + msr TPIDR_EL2, x12 + msr TPIDR_EL3, x12 + msr TPIDRRO_EL0, x12 + msr TPIDR_EL1, x12 + msr CNTFRQ_EL0, x12 + msr CNTVOFF_EL2, x12 + msr CNTKCTL_EL1, x12 + msr CNTHCTL_EL2, x12 + msr CNTP_TVAL_EL0, x12 + msr CNTHP_TVAL_EL2, x12 + msr CNTPS_TVAL_EL1, x12 + msr CNTP_CTL_EL0, x12 + msr CNTHP_CTL_EL2, x12 + msr CNTPS_CTL_EL1, x12 + msr CNTP_CVAL_EL0, x12 + msr CNTHP_CVAL_EL2, x12 + msr CNTPS_CVAL_EL1, x12 + msr CNTV_TVAL_EL0, x12 + msr CNTV_CTL_EL0, x12 + msr CNTV_CVAL_EL0, x12 + msr PMEVCNTR0_EL0, x12 + msr PMEVCNTR1_EL0, x12 + msr PMEVCNTR2_EL0, x12 + msr PMEVCNTR3_EL0, x12 + msr PMEVCNTR4_EL0, x12 + msr PMEVCNTR5_EL0, x12 + msr PMEVCNTR6_EL0, x12 + msr PMEVCNTR7_EL0, x12 + msr PMEVCNTR8_EL0, x12 + msr PMEVCNTR9_EL0, x12 + msr PMEVCNTR10_EL0, x12 + msr PMEVCNTR11_EL0, x12 + msr PMEVCNTR12_EL0, x12 + msr PMEVCNTR13_EL0, x12 + msr PMEVCNTR14_EL0, x12 + msr PMEVCNTR15_EL0, x12 + msr PMEVCNTR16_EL0, x12 + msr PMEVCNTR17_EL0, x12 + msr PMEVCNTR18_EL0, x12 + msr PMEVCNTR19_EL0, x12 + msr PMEVCNTR20_EL0, x12 + msr PMEVCNTR21_EL0, x12 + msr PMEVCNTR22_EL0, x12 + msr PMEVCNTR23_EL0, x12 + msr PMEVCNTR24_EL0, x12 + msr PMEVCNTR25_EL0, x12 + msr PMEVCNTR26_EL0, x12 + msr PMEVCNTR27_EL0, x12 + msr PMEVCNTR28_EL0, x12 + msr PMEVCNTR29_EL0, x12 + msr PMEVCNTR30_EL0, x12 + msr PMCCFILTR_EL0, x12 + msr PMEVTYPER0_EL0, x12 + msr PMEVTYPER1_EL0, x12 + msr PMEVTYPER2_EL0, x12 + msr PMEVTYPER3_EL0, x12 + msr PMEVTYPER4_EL0, x12 + msr PMEVTYPER5_EL0, x12 + msr PMEVTYPER6_EL0, x12 + msr PMEVTYPER7_EL0, x12 + msr PMEVTYPER8_EL0, x12 + msr PMEVTYPER9_EL0, x12 + msr PMEVTYPER10_EL0, x12 + msr PMEVTYPER11_EL0, x12 + msr PMEVTYPER12_EL0, x12 + msr PMEVTYPER13_EL0, x12 + msr PMEVTYPER14_EL0, x12 + msr PMEVTYPER15_EL0, x12 + msr PMEVTYPER16_EL0, x12 + msr PMEVTYPER17_EL0, x12 + msr PMEVTYPER18_EL0, x12 + msr PMEVTYPER19_EL0, x12 + msr PMEVTYPER20_EL0, x12 + msr PMEVTYPER21_EL0, x12 + msr PMEVTYPER22_EL0, x12 + msr PMEVTYPER23_EL0, x12 + msr PMEVTYPER24_EL0, x12 + msr PMEVTYPER25_EL0, x12 + msr PMEVTYPER26_EL0, x12 + msr PMEVTYPER27_EL0, x12 + 
msr PMEVTYPER28_EL0, x12 + msr PMEVTYPER29_EL0, x12 + msr PMEVTYPER30_EL0, x12 +// CHECK: msr teecr32_el1, x12 // encoding: [0x0c,0x00,0x12,0xd5] +// CHECK: msr osdtrrx_el1, x12 // encoding: [0x4c,0x00,0x10,0xd5] +// CHECK: msr mdccint_el1, x12 // encoding: [0x0c,0x02,0x10,0xd5] +// CHECK: msr mdscr_el1, x12 // encoding: [0x4c,0x02,0x10,0xd5] +// CHECK: msr osdtrtx_el1, x12 // encoding: [0x4c,0x03,0x10,0xd5] +// CHECK: msr dbgdtr_el0, x12 // encoding: [0x0c,0x04,0x13,0xd5] +// CHECK: msr dbgdtrtx_el0, x12 // encoding: [0x0c,0x05,0x13,0xd5] +// CHECK: msr oseccr_el1, x12 // encoding: [0x4c,0x06,0x10,0xd5] +// CHECK: msr dbgvcr32_el2, x12 // encoding: [0x0c,0x07,0x14,0xd5] +// CHECK: msr dbgbvr0_el1, x12 // encoding: [0x8c,0x00,0x10,0xd5] +// CHECK: msr dbgbvr1_el1, x12 // encoding: [0x8c,0x01,0x10,0xd5] +// CHECK: msr dbgbvr2_el1, x12 // encoding: [0x8c,0x02,0x10,0xd5] +// CHECK: msr dbgbvr3_el1, x12 // encoding: [0x8c,0x03,0x10,0xd5] +// CHECK: msr dbgbvr4_el1, x12 // encoding: [0x8c,0x04,0x10,0xd5] +// CHECK: msr dbgbvr5_el1, x12 // encoding: [0x8c,0x05,0x10,0xd5] +// CHECK: msr dbgbvr6_el1, x12 // encoding: [0x8c,0x06,0x10,0xd5] +// CHECK: msr dbgbvr7_el1, x12 // encoding: [0x8c,0x07,0x10,0xd5] +// CHECK: msr dbgbvr8_el1, x12 // encoding: [0x8c,0x08,0x10,0xd5] +// CHECK: msr dbgbvr9_el1, x12 // encoding: [0x8c,0x09,0x10,0xd5] +// CHECK: msr dbgbvr10_el1, x12 // encoding: [0x8c,0x0a,0x10,0xd5] +// CHECK: msr dbgbvr11_el1, x12 // encoding: [0x8c,0x0b,0x10,0xd5] +// CHECK: msr dbgbvr12_el1, x12 // encoding: [0x8c,0x0c,0x10,0xd5] +// CHECK: msr dbgbvr13_el1, x12 // encoding: [0x8c,0x0d,0x10,0xd5] +// CHECK: msr dbgbvr14_el1, x12 // encoding: [0x8c,0x0e,0x10,0xd5] +// CHECK: msr dbgbvr15_el1, x12 // encoding: [0x8c,0x0f,0x10,0xd5] +// CHECK: msr dbgbcr0_el1, x12 // encoding: [0xac,0x00,0x10,0xd5] +// CHECK: msr dbgbcr1_el1, x12 // encoding: [0xac,0x01,0x10,0xd5] +// CHECK: msr dbgbcr2_el1, x12 // encoding: [0xac,0x02,0x10,0xd5] +// CHECK: msr dbgbcr3_el1, x12 // encoding: [0xac,0x03,0x10,0xd5] +// CHECK: msr dbgbcr4_el1, x12 // encoding: [0xac,0x04,0x10,0xd5] +// CHECK: msr dbgbcr5_el1, x12 // encoding: [0xac,0x05,0x10,0xd5] +// CHECK: msr dbgbcr6_el1, x12 // encoding: [0xac,0x06,0x10,0xd5] +// CHECK: msr dbgbcr7_el1, x12 // encoding: [0xac,0x07,0x10,0xd5] +// CHECK: msr dbgbcr8_el1, x12 // encoding: [0xac,0x08,0x10,0xd5] +// CHECK: msr dbgbcr9_el1, x12 // encoding: [0xac,0x09,0x10,0xd5] +// CHECK: msr dbgbcr10_el1, x12 // encoding: [0xac,0x0a,0x10,0xd5] +// CHECK: msr dbgbcr11_el1, x12 // encoding: [0xac,0x0b,0x10,0xd5] +// CHECK: msr dbgbcr12_el1, x12 // encoding: [0xac,0x0c,0x10,0xd5] +// CHECK: msr dbgbcr13_el1, x12 // encoding: [0xac,0x0d,0x10,0xd5] +// CHECK: msr dbgbcr14_el1, x12 // encoding: [0xac,0x0e,0x10,0xd5] +// CHECK: msr dbgbcr15_el1, x12 // encoding: [0xac,0x0f,0x10,0xd5] +// CHECK: msr dbgwvr0_el1, x12 // encoding: [0xcc,0x00,0x10,0xd5] +// CHECK: msr dbgwvr1_el1, x12 // encoding: [0xcc,0x01,0x10,0xd5] +// CHECK: msr dbgwvr2_el1, x12 // encoding: [0xcc,0x02,0x10,0xd5] +// CHECK: msr dbgwvr3_el1, x12 // encoding: [0xcc,0x03,0x10,0xd5] +// CHECK: msr dbgwvr4_el1, x12 // encoding: [0xcc,0x04,0x10,0xd5] +// CHECK: msr dbgwvr5_el1, x12 // encoding: [0xcc,0x05,0x10,0xd5] +// CHECK: msr dbgwvr6_el1, x12 // encoding: [0xcc,0x06,0x10,0xd5] +// CHECK: msr dbgwvr7_el1, x12 // encoding: [0xcc,0x07,0x10,0xd5] +// CHECK: msr dbgwvr8_el1, x12 // encoding: [0xcc,0x08,0x10,0xd5] +// CHECK: msr dbgwvr9_el1, x12 // encoding: [0xcc,0x09,0x10,0xd5] +// CHECK: msr dbgwvr10_el1, x12 // encoding: 
[0xcc,0x0a,0x10,0xd5] +// CHECK: msr dbgwvr11_el1, x12 // encoding: [0xcc,0x0b,0x10,0xd5] +// CHECK: msr dbgwvr12_el1, x12 // encoding: [0xcc,0x0c,0x10,0xd5] +// CHECK: msr dbgwvr13_el1, x12 // encoding: [0xcc,0x0d,0x10,0xd5] +// CHECK: msr dbgwvr14_el1, x12 // encoding: [0xcc,0x0e,0x10,0xd5] +// CHECK: msr dbgwvr15_el1, x12 // encoding: [0xcc,0x0f,0x10,0xd5] +// CHECK: msr dbgwcr0_el1, x12 // encoding: [0xec,0x00,0x10,0xd5] +// CHECK: msr dbgwcr1_el1, x12 // encoding: [0xec,0x01,0x10,0xd5] +// CHECK: msr dbgwcr2_el1, x12 // encoding: [0xec,0x02,0x10,0xd5] +// CHECK: msr dbgwcr3_el1, x12 // encoding: [0xec,0x03,0x10,0xd5] +// CHECK: msr dbgwcr4_el1, x12 // encoding: [0xec,0x04,0x10,0xd5] +// CHECK: msr dbgwcr5_el1, x12 // encoding: [0xec,0x05,0x10,0xd5] +// CHECK: msr dbgwcr6_el1, x12 // encoding: [0xec,0x06,0x10,0xd5] +// CHECK: msr dbgwcr7_el1, x12 // encoding: [0xec,0x07,0x10,0xd5] +// CHECK: msr dbgwcr8_el1, x12 // encoding: [0xec,0x08,0x10,0xd5] +// CHECK: msr dbgwcr9_el1, x12 // encoding: [0xec,0x09,0x10,0xd5] +// CHECK: msr dbgwcr10_el1, x12 // encoding: [0xec,0x0a,0x10,0xd5] +// CHECK: msr dbgwcr11_el1, x12 // encoding: [0xec,0x0b,0x10,0xd5] +// CHECK: msr dbgwcr12_el1, x12 // encoding: [0xec,0x0c,0x10,0xd5] +// CHECK: msr dbgwcr13_el1, x12 // encoding: [0xec,0x0d,0x10,0xd5] +// CHECK: msr dbgwcr14_el1, x12 // encoding: [0xec,0x0e,0x10,0xd5] +// CHECK: msr dbgwcr15_el1, x12 // encoding: [0xec,0x0f,0x10,0xd5] +// CHECK: msr teehbr32_el1, x12 // encoding: [0x0c,0x10,0x12,0xd5] +// CHECK: msr oslar_el1, x12 // encoding: [0x8c,0x10,0x10,0xd5] +// CHECK: msr osdlr_el1, x12 // encoding: [0x8c,0x13,0x10,0xd5] +// CHECK: msr dbgprcr_el1, x12 // encoding: [0x8c,0x14,0x10,0xd5] +// CHECK: msr dbgclaimset_el1, x12 // encoding: [0xcc,0x78,0x10,0xd5] +// CHECK: msr dbgclaimclr_el1, x12 // encoding: [0xcc,0x79,0x10,0xd5] +// CHECK: msr csselr_el1, x12 // encoding: [0x0c,0x00,0x1a,0xd5] +// CHECK: msr vpidr_el2, x12 // encoding: [0x0c,0x00,0x1c,0xd5] +// CHECK: msr vmpidr_el2, x12 // encoding: [0xac,0x00,0x1c,0xd5] +// CHECK: msr sctlr_el1, x12 // encoding: [0x0c,0x10,0x18,0xd5] +// CHECK: msr sctlr_el2, x12 // encoding: [0x0c,0x10,0x1c,0xd5] +// CHECK: msr sctlr_el3, x12 // encoding: [0x0c,0x10,0x1e,0xd5] +// CHECK: msr actlr_el1, x12 // encoding: [0x2c,0x10,0x18,0xd5] +// CHECK: msr actlr_el2, x12 // encoding: [0x2c,0x10,0x1c,0xd5] +// CHECK: msr actlr_el3, x12 // encoding: [0x2c,0x10,0x1e,0xd5] +// CHECK: msr cpacr_el1, x12 // encoding: [0x4c,0x10,0x18,0xd5] +// CHECK: msr hcr_el2, x12 // encoding: [0x0c,0x11,0x1c,0xd5] +// CHECK: msr scr_el3, x12 // encoding: [0x0c,0x11,0x1e,0xd5] +// CHECK: msr mdcr_el2, x12 // encoding: [0x2c,0x11,0x1c,0xd5] +// CHECK: msr sder32_el3, x12 // encoding: [0x2c,0x11,0x1e,0xd5] +// CHECK: msr cptr_el2, x12 // encoding: [0x4c,0x11,0x1c,0xd5] +// CHECK: msr cptr_el3, x12 // encoding: [0x4c,0x11,0x1e,0xd5] +// CHECK: msr hstr_el2, x12 // encoding: [0x6c,0x11,0x1c,0xd5] +// CHECK: msr hacr_el2, x12 // encoding: [0xec,0x11,0x1c,0xd5] +// CHECK: msr mdcr_el3, x12 // encoding: [0x2c,0x13,0x1e,0xd5] +// CHECK: msr ttbr0_el1, x12 // encoding: [0x0c,0x20,0x18,0xd5] +// CHECK: msr ttbr0_el2, x12 // encoding: [0x0c,0x20,0x1c,0xd5] +// CHECK: msr ttbr0_el3, x12 // encoding: [0x0c,0x20,0x1e,0xd5] +// CHECK: msr ttbr1_el1, x12 // encoding: [0x2c,0x20,0x18,0xd5] +// CHECK: msr tcr_el1, x12 // encoding: [0x4c,0x20,0x18,0xd5] +// CHECK: msr tcr_el2, x12 // encoding: [0x4c,0x20,0x1c,0xd5] +// CHECK: msr tcr_el3, x12 // encoding: [0x4c,0x20,0x1e,0xd5] +// CHECK: msr vttbr_el2, x12 
// encoding: [0x0c,0x21,0x1c,0xd5] +// CHECK: msr vtcr_el2, x12 // encoding: [0x4c,0x21,0x1c,0xd5] +// CHECK: msr dacr32_el2, x12 // encoding: [0x0c,0x30,0x1c,0xd5] +// CHECK: msr spsr_el1, x12 // encoding: [0x0c,0x40,0x18,0xd5] +// CHECK: msr spsr_el2, x12 // encoding: [0x0c,0x40,0x1c,0xd5] +// CHECK: msr spsr_el3, x12 // encoding: [0x0c,0x40,0x1e,0xd5] +// CHECK: msr elr_el1, x12 // encoding: [0x2c,0x40,0x18,0xd5] +// CHECK: msr elr_el2, x12 // encoding: [0x2c,0x40,0x1c,0xd5] +// CHECK: msr elr_el3, x12 // encoding: [0x2c,0x40,0x1e,0xd5] +// CHECK: msr sp_el0, x12 // encoding: [0x0c,0x41,0x18,0xd5] +// CHECK: msr sp_el1, x12 // encoding: [0x0c,0x41,0x1c,0xd5] +// CHECK: msr sp_el2, x12 // encoding: [0x0c,0x41,0x1e,0xd5] +// CHECK: msr spsel, x12 // encoding: [0x0c,0x42,0x18,0xd5] +// CHECK: msr nzcv, x12 // encoding: [0x0c,0x42,0x1b,0xd5] +// CHECK: msr daif, x12 // encoding: [0x2c,0x42,0x1b,0xd5] +// CHECK: msr currentel, x12 // encoding: [0x4c,0x42,0x18,0xd5] +// CHECK: msr spsr_irq, x12 // encoding: [0x0c,0x43,0x1c,0xd5] +// CHECK: msr spsr_abt, x12 // encoding: [0x2c,0x43,0x1c,0xd5] +// CHECK: msr spsr_und, x12 // encoding: [0x4c,0x43,0x1c,0xd5] +// CHECK: msr spsr_fiq, x12 // encoding: [0x6c,0x43,0x1c,0xd5] +// CHECK: msr fpcr, x12 // encoding: [0x0c,0x44,0x1b,0xd5] +// CHECK: msr fpsr, x12 // encoding: [0x2c,0x44,0x1b,0xd5] +// CHECK: msr dspsr_el0, x12 // encoding: [0x0c,0x45,0x1b,0xd5] +// CHECK: msr dlr_el0, x12 // encoding: [0x2c,0x45,0x1b,0xd5] +// CHECK: msr ifsr32_el2, x12 // encoding: [0x2c,0x50,0x1c,0xd5] +// CHECK: msr afsr0_el1, x12 // encoding: [0x0c,0x51,0x18,0xd5] +// CHECK: msr afsr0_el2, x12 // encoding: [0x0c,0x51,0x1c,0xd5] +// CHECK: msr afsr0_el3, x12 // encoding: [0x0c,0x51,0x1e,0xd5] +// CHECK: msr afsr1_el1, x12 // encoding: [0x2c,0x51,0x18,0xd5] +// CHECK: msr afsr1_el2, x12 // encoding: [0x2c,0x51,0x1c,0xd5] +// CHECK: msr afsr1_el3, x12 // encoding: [0x2c,0x51,0x1e,0xd5] +// CHECK: msr esr_el1, x12 // encoding: [0x0c,0x52,0x18,0xd5] +// CHECK: msr esr_el2, x12 // encoding: [0x0c,0x52,0x1c,0xd5] +// CHECK: msr esr_el3, x12 // encoding: [0x0c,0x52,0x1e,0xd5] +// CHECK: msr fpexc32_el2, x12 // encoding: [0x0c,0x53,0x1c,0xd5] +// CHECK: msr far_el1, x12 // encoding: [0x0c,0x60,0x18,0xd5] +// CHECK: msr far_el2, x12 // encoding: [0x0c,0x60,0x1c,0xd5] +// CHECK: msr far_el3, x12 // encoding: [0x0c,0x60,0x1e,0xd5] +// CHECK: msr hpfar_el2, x12 // encoding: [0x8c,0x60,0x1c,0xd5] +// CHECK: msr par_el1, x12 // encoding: [0x0c,0x74,0x18,0xd5] +// CHECK: msr pmcr_el0, x12 // encoding: [0x0c,0x9c,0x1b,0xd5] +// CHECK: msr pmcntenset_el0, x12 // encoding: [0x2c,0x9c,0x1b,0xd5] +// CHECK: msr pmcntenclr_el0, x12 // encoding: [0x4c,0x9c,0x1b,0xd5] +// CHECK: msr pmovsclr_el0, x12 // encoding: [0x6c,0x9c,0x1b,0xd5] +// CHECK: msr pmselr_el0, x12 // encoding: [0xac,0x9c,0x1b,0xd5] +// CHECK: msr pmccntr_el0, x12 // encoding: [0x0c,0x9d,0x1b,0xd5] +// CHECK: msr pmxevtyper_el0, x12 // encoding: [0x2c,0x9d,0x1b,0xd5] +// CHECK: msr pmxevcntr_el0, x12 // encoding: [0x4c,0x9d,0x1b,0xd5] +// CHECK: msr pmuserenr_el0, x12 // encoding: [0x0c,0x9e,0x1b,0xd5] +// CHECK: msr pmintenset_el1, x12 // encoding: [0x2c,0x9e,0x18,0xd5] +// CHECK: msr pmintenclr_el1, x12 // encoding: [0x4c,0x9e,0x18,0xd5] +// CHECK: msr pmovsset_el0, x12 // encoding: [0x6c,0x9e,0x1b,0xd5] +// CHECK: msr mair_el1, x12 // encoding: [0x0c,0xa2,0x18,0xd5] +// CHECK: msr mair_el2, x12 // encoding: [0x0c,0xa2,0x1c,0xd5] +// CHECK: msr mair_el3, x12 // encoding: [0x0c,0xa2,0x1e,0xd5] +// CHECK: msr amair_el1, x12 
// encoding: [0x0c,0xa3,0x18,0xd5] +// CHECK: msr amair_el2, x12 // encoding: [0x0c,0xa3,0x1c,0xd5] +// CHECK: msr amair_el3, x12 // encoding: [0x0c,0xa3,0x1e,0xd5] +// CHECK: msr vbar_el1, x12 // encoding: [0x0c,0xc0,0x18,0xd5] +// CHECK: msr vbar_el2, x12 // encoding: [0x0c,0xc0,0x1c,0xd5] +// CHECK: msr vbar_el3, x12 // encoding: [0x0c,0xc0,0x1e,0xd5] +// CHECK: msr rmr_el1, x12 // encoding: [0x4c,0xc0,0x18,0xd5] +// CHECK: msr rmr_el2, x12 // encoding: [0x4c,0xc0,0x1c,0xd5] +// CHECK: msr rmr_el3, x12 // encoding: [0x4c,0xc0,0x1e,0xd5] +// CHECK: msr contextidr_el1, x12 // encoding: [0x2c,0xd0,0x18,0xd5] +// CHECK: msr tpidr_el0, x12 // encoding: [0x4c,0xd0,0x1b,0xd5] +// CHECK: msr tpidr_el2, x12 // encoding: [0x4c,0xd0,0x1c,0xd5] +// CHECK: msr tpidr_el3, x12 // encoding: [0x4c,0xd0,0x1e,0xd5] +// CHECK: msr tpidrro_el0, x12 // encoding: [0x6c,0xd0,0x1b,0xd5] +// CHECK: msr tpidr_el1, x12 // encoding: [0x8c,0xd0,0x18,0xd5] +// CHECK: msr cntfrq_el0, x12 // encoding: [0x0c,0xe0,0x1b,0xd5] +// CHECK: msr cntvoff_el2, x12 // encoding: [0x6c,0xe0,0x1c,0xd5] +// CHECK: msr cntkctl_el1, x12 // encoding: [0x0c,0xe1,0x18,0xd5] +// CHECK: msr cnthctl_el2, x12 // encoding: [0x0c,0xe1,0x1c,0xd5] +// CHECK: msr cntp_tval_el0, x12 // encoding: [0x0c,0xe2,0x1b,0xd5] +// CHECK: msr cnthp_tval_el2, x12 // encoding: [0x0c,0xe2,0x1c,0xd5] +// CHECK: msr cntps_tval_el1, x12 // encoding: [0x0c,0xe2,0x1f,0xd5] +// CHECK: msr cntp_ctl_el0, x12 // encoding: [0x2c,0xe2,0x1b,0xd5] +// CHECK: msr cnthp_ctl_el2, x12 // encoding: [0x2c,0xe2,0x1c,0xd5] +// CHECK: msr cntps_ctl_el1, x12 // encoding: [0x2c,0xe2,0x1f,0xd5] +// CHECK: msr cntp_cval_el0, x12 // encoding: [0x4c,0xe2,0x1b,0xd5] +// CHECK: msr cnthp_cval_el2, x12 // encoding: [0x4c,0xe2,0x1c,0xd5] +// CHECK: msr cntps_cval_el1, x12 // encoding: [0x4c,0xe2,0x1f,0xd5] +// CHECK: msr cntv_tval_el0, x12 // encoding: [0x0c,0xe3,0x1b,0xd5] +// CHECK: msr cntv_ctl_el0, x12 // encoding: [0x2c,0xe3,0x1b,0xd5] +// CHECK: msr cntv_cval_el0, x12 // encoding: [0x4c,0xe3,0x1b,0xd5] +// CHECK: msr pmevcntr0_el0, x12 // encoding: [0x0c,0xe8,0x1b,0xd5] +// CHECK: msr pmevcntr1_el0, x12 // encoding: [0x2c,0xe8,0x1b,0xd5] +// CHECK: msr pmevcntr2_el0, x12 // encoding: [0x4c,0xe8,0x1b,0xd5] +// CHECK: msr pmevcntr3_el0, x12 // encoding: [0x6c,0xe8,0x1b,0xd5] +// CHECK: msr pmevcntr4_el0, x12 // encoding: [0x8c,0xe8,0x1b,0xd5] +// CHECK: msr pmevcntr5_el0, x12 // encoding: [0xac,0xe8,0x1b,0xd5] +// CHECK: msr pmevcntr6_el0, x12 // encoding: [0xcc,0xe8,0x1b,0xd5] +// CHECK: msr pmevcntr7_el0, x12 // encoding: [0xec,0xe8,0x1b,0xd5] +// CHECK: msr pmevcntr8_el0, x12 // encoding: [0x0c,0xe9,0x1b,0xd5] +// CHECK: msr pmevcntr9_el0, x12 // encoding: [0x2c,0xe9,0x1b,0xd5] +// CHECK: msr pmevcntr10_el0, x12 // encoding: [0x4c,0xe9,0x1b,0xd5] +// CHECK: msr pmevcntr11_el0, x12 // encoding: [0x6c,0xe9,0x1b,0xd5] +// CHECK: msr pmevcntr12_el0, x12 // encoding: [0x8c,0xe9,0x1b,0xd5] +// CHECK: msr pmevcntr13_el0, x12 // encoding: [0xac,0xe9,0x1b,0xd5] +// CHECK: msr pmevcntr14_el0, x12 // encoding: [0xcc,0xe9,0x1b,0xd5] +// CHECK: msr pmevcntr15_el0, x12 // encoding: [0xec,0xe9,0x1b,0xd5] +// CHECK: msr pmevcntr16_el0, x12 // encoding: [0x0c,0xea,0x1b,0xd5] +// CHECK: msr pmevcntr17_el0, x12 // encoding: [0x2c,0xea,0x1b,0xd5] +// CHECK: msr pmevcntr18_el0, x12 // encoding: [0x4c,0xea,0x1b,0xd5] +// CHECK: msr pmevcntr19_el0, x12 // encoding: [0x6c,0xea,0x1b,0xd5] +// CHECK: msr pmevcntr20_el0, x12 // encoding: [0x8c,0xea,0x1b,0xd5] +// CHECK: msr pmevcntr21_el0, x12 // encoding: 
[0xac,0xea,0x1b,0xd5] +// CHECK: msr pmevcntr22_el0, x12 // encoding: [0xcc,0xea,0x1b,0xd5] +// CHECK: msr pmevcntr23_el0, x12 // encoding: [0xec,0xea,0x1b,0xd5] +// CHECK: msr pmevcntr24_el0, x12 // encoding: [0x0c,0xeb,0x1b,0xd5] +// CHECK: msr pmevcntr25_el0, x12 // encoding: [0x2c,0xeb,0x1b,0xd5] +// CHECK: msr pmevcntr26_el0, x12 // encoding: [0x4c,0xeb,0x1b,0xd5] +// CHECK: msr pmevcntr27_el0, x12 // encoding: [0x6c,0xeb,0x1b,0xd5] +// CHECK: msr pmevcntr28_el0, x12 // encoding: [0x8c,0xeb,0x1b,0xd5] +// CHECK: msr pmevcntr29_el0, x12 // encoding: [0xac,0xeb,0x1b,0xd5] +// CHECK: msr pmevcntr30_el0, x12 // encoding: [0xcc,0xeb,0x1b,0xd5] +// CHECK: msr pmccfiltr_el0, x12 // encoding: [0xec,0xef,0x1b,0xd5] +// CHECK: msr pmevtyper0_el0, x12 // encoding: [0x0c,0xec,0x1b,0xd5] +// CHECK: msr pmevtyper1_el0, x12 // encoding: [0x2c,0xec,0x1b,0xd5] +// CHECK: msr pmevtyper2_el0, x12 // encoding: [0x4c,0xec,0x1b,0xd5] +// CHECK: msr pmevtyper3_el0, x12 // encoding: [0x6c,0xec,0x1b,0xd5] +// CHECK: msr pmevtyper4_el0, x12 // encoding: [0x8c,0xec,0x1b,0xd5] +// CHECK: msr pmevtyper5_el0, x12 // encoding: [0xac,0xec,0x1b,0xd5] +// CHECK: msr pmevtyper6_el0, x12 // encoding: [0xcc,0xec,0x1b,0xd5] +// CHECK: msr pmevtyper7_el0, x12 // encoding: [0xec,0xec,0x1b,0xd5] +// CHECK: msr pmevtyper8_el0, x12 // encoding: [0x0c,0xed,0x1b,0xd5] +// CHECK: msr pmevtyper9_el0, x12 // encoding: [0x2c,0xed,0x1b,0xd5] +// CHECK: msr pmevtyper10_el0, x12 // encoding: [0x4c,0xed,0x1b,0xd5] +// CHECK: msr pmevtyper11_el0, x12 // encoding: [0x6c,0xed,0x1b,0xd5] +// CHECK: msr pmevtyper12_el0, x12 // encoding: [0x8c,0xed,0x1b,0xd5] +// CHECK: msr pmevtyper13_el0, x12 // encoding: [0xac,0xed,0x1b,0xd5] +// CHECK: msr pmevtyper14_el0, x12 // encoding: [0xcc,0xed,0x1b,0xd5] +// CHECK: msr pmevtyper15_el0, x12 // encoding: [0xec,0xed,0x1b,0xd5] +// CHECK: msr pmevtyper16_el0, x12 // encoding: [0x0c,0xee,0x1b,0xd5] +// CHECK: msr pmevtyper17_el0, x12 // encoding: [0x2c,0xee,0x1b,0xd5] +// CHECK: msr pmevtyper18_el0, x12 // encoding: [0x4c,0xee,0x1b,0xd5] +// CHECK: msr pmevtyper19_el0, x12 // encoding: [0x6c,0xee,0x1b,0xd5] +// CHECK: msr pmevtyper20_el0, x12 // encoding: [0x8c,0xee,0x1b,0xd5] +// CHECK: msr pmevtyper21_el0, x12 // encoding: [0xac,0xee,0x1b,0xd5] +// CHECK: msr pmevtyper22_el0, x12 // encoding: [0xcc,0xee,0x1b,0xd5] +// CHECK: msr pmevtyper23_el0, x12 // encoding: [0xec,0xee,0x1b,0xd5] +// CHECK: msr pmevtyper24_el0, x12 // encoding: [0x0c,0xef,0x1b,0xd5] +// CHECK: msr pmevtyper25_el0, x12 // encoding: [0x2c,0xef,0x1b,0xd5] +// CHECK: msr pmevtyper26_el0, x12 // encoding: [0x4c,0xef,0x1b,0xd5] +// CHECK: msr pmevtyper27_el0, x12 // encoding: [0x6c,0xef,0x1b,0xd5] +// CHECK: msr pmevtyper28_el0, x12 // encoding: [0x8c,0xef,0x1b,0xd5] +// CHECK: msr pmevtyper29_el0, x12 // encoding: [0xac,0xef,0x1b,0xd5] +// CHECK: msr pmevtyper30_el0, x12 // encoding: [0xcc,0xef,0x1b,0xd5] + + mrs x9, TEECR32_EL1 + mrs x9, OSDTRRX_EL1 + mrs x9, MDCCSR_EL0 + mrs x9, MDCCINT_EL1 + mrs x9, MDSCR_EL1 + mrs x9, OSDTRTX_EL1 + mrs x9, DBGDTR_EL0 + mrs x9, DBGDTRRX_EL0 + mrs x9, OSECCR_EL1 + mrs x9, DBGVCR32_EL2 + mrs x9, DBGBVR0_EL1 + mrs x9, DBGBVR1_EL1 + mrs x9, DBGBVR2_EL1 + mrs x9, DBGBVR3_EL1 + mrs x9, DBGBVR4_EL1 + mrs x9, DBGBVR5_EL1 + mrs x9, DBGBVR6_EL1 + mrs x9, DBGBVR7_EL1 + mrs x9, DBGBVR8_EL1 + mrs x9, DBGBVR9_EL1 + mrs x9, DBGBVR10_EL1 + mrs x9, DBGBVR11_EL1 + mrs x9, DBGBVR12_EL1 + mrs x9, DBGBVR13_EL1 + mrs x9, DBGBVR14_EL1 + mrs x9, DBGBVR15_EL1 + mrs x9, DBGBCR0_EL1 + mrs x9, DBGBCR1_EL1 + mrs x9, DBGBCR2_EL1 
+ mrs x9, DBGBCR3_EL1 + mrs x9, DBGBCR4_EL1 + mrs x9, DBGBCR5_EL1 + mrs x9, DBGBCR6_EL1 + mrs x9, DBGBCR7_EL1 + mrs x9, DBGBCR8_EL1 + mrs x9, DBGBCR9_EL1 + mrs x9, DBGBCR10_EL1 + mrs x9, DBGBCR11_EL1 + mrs x9, DBGBCR12_EL1 + mrs x9, DBGBCR13_EL1 + mrs x9, DBGBCR14_EL1 + mrs x9, DBGBCR15_EL1 + mrs x9, DBGWVR0_EL1 + mrs x9, DBGWVR1_EL1 + mrs x9, DBGWVR2_EL1 + mrs x9, DBGWVR3_EL1 + mrs x9, DBGWVR4_EL1 + mrs x9, DBGWVR5_EL1 + mrs x9, DBGWVR6_EL1 + mrs x9, DBGWVR7_EL1 + mrs x9, DBGWVR8_EL1 + mrs x9, DBGWVR9_EL1 + mrs x9, DBGWVR10_EL1 + mrs x9, DBGWVR11_EL1 + mrs x9, DBGWVR12_EL1 + mrs x9, DBGWVR13_EL1 + mrs x9, DBGWVR14_EL1 + mrs x9, DBGWVR15_EL1 + mrs x9, DBGWCR0_EL1 + mrs x9, DBGWCR1_EL1 + mrs x9, DBGWCR2_EL1 + mrs x9, DBGWCR3_EL1 + mrs x9, DBGWCR4_EL1 + mrs x9, DBGWCR5_EL1 + mrs x9, DBGWCR6_EL1 + mrs x9, DBGWCR7_EL1 + mrs x9, DBGWCR8_EL1 + mrs x9, DBGWCR9_EL1 + mrs x9, DBGWCR10_EL1 + mrs x9, DBGWCR11_EL1 + mrs x9, DBGWCR12_EL1 + mrs x9, DBGWCR13_EL1 + mrs x9, DBGWCR14_EL1 + mrs x9, DBGWCR15_EL1 + mrs x9, MDRAR_EL1 + mrs x9, TEEHBR32_EL1 + mrs x9, OSLSR_EL1 + mrs x9, OSDLR_EL1 + mrs x9, DBGPRCR_EL1 + mrs x9, DBGCLAIMSET_EL1 + mrs x9, DBGCLAIMCLR_EL1 + mrs x9, DBGAUTHSTATUS_EL1 + mrs x9, MIDR_EL1 + mrs x9, CCSIDR_EL1 + mrs x9, CSSELR_EL1 + mrs x9, VPIDR_EL2 + mrs x9, CLIDR_EL1 + mrs x9, CTR_EL0 + mrs x9, MPIDR_EL1 + mrs x9, VMPIDR_EL2 + mrs x9, REVIDR_EL1 + mrs x9, AIDR_EL1 + mrs x9, DCZID_EL0 + mrs x9, ID_PFR0_EL1 + mrs x9, ID_PFR1_EL1 + mrs x9, ID_DFR0_EL1 + mrs x9, ID_AFR0_EL1 + mrs x9, ID_MMFR0_EL1 + mrs x9, ID_MMFR1_EL1 + mrs x9, ID_MMFR2_EL1 + mrs x9, ID_MMFR3_EL1 + mrs x9, ID_ISAR0_EL1 + mrs x9, ID_ISAR1_EL1 + mrs x9, ID_ISAR2_EL1 + mrs x9, ID_ISAR3_EL1 + mrs x9, ID_ISAR4_EL1 + mrs x9, ID_ISAR5_EL1 + mrs x9, MVFR0_EL1 + mrs x9, MVFR1_EL1 + mrs x9, MVFR2_EL1 + mrs x9, ID_AA64PFR0_EL1 + mrs x9, ID_AA64PFR1_EL1 + mrs x9, ID_AA64DFR0_EL1 + mrs x9, ID_AA64DFR1_EL1 + mrs x9, ID_AA64AFR0_EL1 + mrs x9, ID_AA64AFR1_EL1 + mrs x9, ID_AA64ISAR0_EL1 + mrs x9, ID_AA64ISAR1_EL1 + mrs x9, ID_AA64MMFR0_EL1 + mrs x9, ID_AA64MMFR1_EL1 + mrs x9, SCTLR_EL1 + mrs x9, SCTLR_EL2 + mrs x9, SCTLR_EL3 + mrs x9, ACTLR_EL1 + mrs x9, ACTLR_EL2 + mrs x9, ACTLR_EL3 + mrs x9, CPACR_EL1 + mrs x9, HCR_EL2 + mrs x9, SCR_EL3 + mrs x9, MDCR_EL2 + mrs x9, SDER32_EL3 + mrs x9, CPTR_EL2 + mrs x9, CPTR_EL3 + mrs x9, HSTR_EL2 + mrs x9, HACR_EL2 + mrs x9, MDCR_EL3 + mrs x9, TTBR0_EL1 + mrs x9, TTBR0_EL2 + mrs x9, TTBR0_EL3 + mrs x9, TTBR1_EL1 + mrs x9, TCR_EL1 + mrs x9, TCR_EL2 + mrs x9, TCR_EL3 + mrs x9, VTTBR_EL2 + mrs x9, VTCR_EL2 + mrs x9, DACR32_EL2 + mrs x9, SPSR_EL1 + mrs x9, SPSR_EL2 + mrs x9, SPSR_EL3 + mrs x9, ELR_EL1 + mrs x9, ELR_EL2 + mrs x9, ELR_EL3 + mrs x9, SP_EL0 + mrs x9, SP_EL1 + mrs x9, SP_EL2 + mrs x9, SPSel + mrs x9, NZCV + mrs x9, DAIF + mrs x9, CurrentEL + mrs x9, SPSR_irq + mrs x9, SPSR_abt + mrs x9, SPSR_und + mrs x9, SPSR_fiq + mrs x9, FPCR + mrs x9, FPSR + mrs x9, DSPSR_EL0 + mrs x9, DLR_EL0 + mrs x9, IFSR32_EL2 + mrs x9, AFSR0_EL1 + mrs x9, AFSR0_EL2 + mrs x9, AFSR0_EL3 + mrs x9, AFSR1_EL1 + mrs x9, AFSR1_EL2 + mrs x9, AFSR1_EL3 + mrs x9, ESR_EL1 + mrs x9, ESR_EL2 + mrs x9, ESR_EL3 + mrs x9, FPEXC32_EL2 + mrs x9, FAR_EL1 + mrs x9, FAR_EL2 + mrs x9, FAR_EL3 + mrs x9, HPFAR_EL2 + mrs x9, PAR_EL1 + mrs x9, PMCR_EL0 + mrs x9, PMCNTENSET_EL0 + mrs x9, PMCNTENCLR_EL0 + mrs x9, PMOVSCLR_EL0 + mrs x9, PMSELR_EL0 + mrs x9, PMCEID0_EL0 + mrs x9, PMCEID1_EL0 + mrs x9, PMCCNTR_EL0 + mrs x9, PMXEVTYPER_EL0 + mrs x9, PMXEVCNTR_EL0 + mrs x9, PMUSERENR_EL0 + mrs x9, PMINTENSET_EL1 + mrs x9, PMINTENCLR_EL1 + mrs x9, 
PMOVSSET_EL0 + mrs x9, MAIR_EL1 + mrs x9, MAIR_EL2 + mrs x9, MAIR_EL3 + mrs x9, AMAIR_EL1 + mrs x9, AMAIR_EL2 + mrs x9, AMAIR_EL3 + mrs x9, VBAR_EL1 + mrs x9, VBAR_EL2 + mrs x9, VBAR_EL3 + mrs x9, RVBAR_EL1 + mrs x9, RVBAR_EL2 + mrs x9, RVBAR_EL3 + mrs x9, RMR_EL1 + mrs x9, RMR_EL2 + mrs x9, RMR_EL3 + mrs x9, ISR_EL1 + mrs x9, CONTEXTIDR_EL1 + mrs x9, TPIDR_EL0 + mrs x9, TPIDR_EL2 + mrs x9, TPIDR_EL3 + mrs x9, TPIDRRO_EL0 + mrs x9, TPIDR_EL1 + mrs x9, CNTFRQ_EL0 + mrs x9, CNTPCT_EL0 + mrs x9, CNTVCT_EL0 + mrs x9, CNTVOFF_EL2 + mrs x9, CNTKCTL_EL1 + mrs x9, CNTHCTL_EL2 + mrs x9, CNTP_TVAL_EL0 + mrs x9, CNTHP_TVAL_EL2 + mrs x9, CNTPS_TVAL_EL1 + mrs x9, CNTP_CTL_EL0 + mrs x9, CNTHP_CTL_EL2 + mrs x9, CNTPS_CTL_EL1 + mrs x9, CNTP_CVAL_EL0 + mrs x9, CNTHP_CVAL_EL2 + mrs x9, CNTPS_CVAL_EL1 + mrs x9, CNTV_TVAL_EL0 + mrs x9, CNTV_CTL_EL0 + mrs x9, CNTV_CVAL_EL0 + mrs x9, PMEVCNTR0_EL0 + mrs x9, PMEVCNTR1_EL0 + mrs x9, PMEVCNTR2_EL0 + mrs x9, PMEVCNTR3_EL0 + mrs x9, PMEVCNTR4_EL0 + mrs x9, PMEVCNTR5_EL0 + mrs x9, PMEVCNTR6_EL0 + mrs x9, PMEVCNTR7_EL0 + mrs x9, PMEVCNTR8_EL0 + mrs x9, PMEVCNTR9_EL0 + mrs x9, PMEVCNTR10_EL0 + mrs x9, PMEVCNTR11_EL0 + mrs x9, PMEVCNTR12_EL0 + mrs x9, PMEVCNTR13_EL0 + mrs x9, PMEVCNTR14_EL0 + mrs x9, PMEVCNTR15_EL0 + mrs x9, PMEVCNTR16_EL0 + mrs x9, PMEVCNTR17_EL0 + mrs x9, PMEVCNTR18_EL0 + mrs x9, PMEVCNTR19_EL0 + mrs x9, PMEVCNTR20_EL0 + mrs x9, PMEVCNTR21_EL0 + mrs x9, PMEVCNTR22_EL0 + mrs x9, PMEVCNTR23_EL0 + mrs x9, PMEVCNTR24_EL0 + mrs x9, PMEVCNTR25_EL0 + mrs x9, PMEVCNTR26_EL0 + mrs x9, PMEVCNTR27_EL0 + mrs x9, PMEVCNTR28_EL0 + mrs x9, PMEVCNTR29_EL0 + mrs x9, PMEVCNTR30_EL0 + mrs x9, PMCCFILTR_EL0 + mrs x9, PMEVTYPER0_EL0 + mrs x9, PMEVTYPER1_EL0 + mrs x9, PMEVTYPER2_EL0 + mrs x9, PMEVTYPER3_EL0 + mrs x9, PMEVTYPER4_EL0 + mrs x9, PMEVTYPER5_EL0 + mrs x9, PMEVTYPER6_EL0 + mrs x9, PMEVTYPER7_EL0 + mrs x9, PMEVTYPER8_EL0 + mrs x9, PMEVTYPER9_EL0 + mrs x9, PMEVTYPER10_EL0 + mrs x9, PMEVTYPER11_EL0 + mrs x9, PMEVTYPER12_EL0 + mrs x9, PMEVTYPER13_EL0 + mrs x9, PMEVTYPER14_EL0 + mrs x9, PMEVTYPER15_EL0 + mrs x9, PMEVTYPER16_EL0 + mrs x9, PMEVTYPER17_EL0 + mrs x9, PMEVTYPER18_EL0 + mrs x9, PMEVTYPER19_EL0 + mrs x9, PMEVTYPER20_EL0 + mrs x9, PMEVTYPER21_EL0 + mrs x9, PMEVTYPER22_EL0 + mrs x9, PMEVTYPER23_EL0 + mrs x9, PMEVTYPER24_EL0 + mrs x9, PMEVTYPER25_EL0 + mrs x9, PMEVTYPER26_EL0 + mrs x9, PMEVTYPER27_EL0 + mrs x9, PMEVTYPER28_EL0 + mrs x9, PMEVTYPER29_EL0 + mrs x9, PMEVTYPER30_EL0 +// CHECK: mrs x9, teecr32_el1 // encoding: [0x09,0x00,0x32,0xd5] +// CHECK: mrs x9, osdtrrx_el1 // encoding: [0x49,0x00,0x30,0xd5] +// CHECK: mrs x9, mdccsr_el0 // encoding: [0x09,0x01,0x33,0xd5] +// CHECK: mrs x9, mdccint_el1 // encoding: [0x09,0x02,0x30,0xd5] +// CHECK: mrs x9, mdscr_el1 // encoding: [0x49,0x02,0x30,0xd5] +// CHECK: mrs x9, osdtrtx_el1 // encoding: [0x49,0x03,0x30,0xd5] +// CHECK: mrs x9, dbgdtr_el0 // encoding: [0x09,0x04,0x33,0xd5] +// CHECK: mrs x9, dbgdtrrx_el0 // encoding: [0x09,0x05,0x33,0xd5] +// CHECK: mrs x9, oseccr_el1 // encoding: [0x49,0x06,0x30,0xd5] +// CHECK: mrs x9, dbgvcr32_el2 // encoding: [0x09,0x07,0x34,0xd5] +// CHECK: mrs x9, dbgbvr0_el1 // encoding: [0x89,0x00,0x30,0xd5] +// CHECK: mrs x9, dbgbvr1_el1 // encoding: [0x89,0x01,0x30,0xd5] +// CHECK: mrs x9, dbgbvr2_el1 // encoding: [0x89,0x02,0x30,0xd5] +// CHECK: mrs x9, dbgbvr3_el1 // encoding: [0x89,0x03,0x30,0xd5] +// CHECK: mrs x9, dbgbvr4_el1 // encoding: [0x89,0x04,0x30,0xd5] +// CHECK: mrs x9, dbgbvr5_el1 // encoding: [0x89,0x05,0x30,0xd5] +// CHECK: mrs x9, dbgbvr6_el1 // encoding: 
[0x89,0x06,0x30,0xd5] +// CHECK: mrs x9, dbgbvr7_el1 // encoding: [0x89,0x07,0x30,0xd5] +// CHECK: mrs x9, dbgbvr8_el1 // encoding: [0x89,0x08,0x30,0xd5] +// CHECK: mrs x9, dbgbvr9_el1 // encoding: [0x89,0x09,0x30,0xd5] +// CHECK: mrs x9, dbgbvr10_el1 // encoding: [0x89,0x0a,0x30,0xd5] +// CHECK: mrs x9, dbgbvr11_el1 // encoding: [0x89,0x0b,0x30,0xd5] +// CHECK: mrs x9, dbgbvr12_el1 // encoding: [0x89,0x0c,0x30,0xd5] +// CHECK: mrs x9, dbgbvr13_el1 // encoding: [0x89,0x0d,0x30,0xd5] +// CHECK: mrs x9, dbgbvr14_el1 // encoding: [0x89,0x0e,0x30,0xd5] +// CHECK: mrs x9, dbgbvr15_el1 // encoding: [0x89,0x0f,0x30,0xd5] +// CHECK: mrs x9, dbgbcr0_el1 // encoding: [0xa9,0x00,0x30,0xd5] +// CHECK: mrs x9, dbgbcr1_el1 // encoding: [0xa9,0x01,0x30,0xd5] +// CHECK: mrs x9, dbgbcr2_el1 // encoding: [0xa9,0x02,0x30,0xd5] +// CHECK: mrs x9, dbgbcr3_el1 // encoding: [0xa9,0x03,0x30,0xd5] +// CHECK: mrs x9, dbgbcr4_el1 // encoding: [0xa9,0x04,0x30,0xd5] +// CHECK: mrs x9, dbgbcr5_el1 // encoding: [0xa9,0x05,0x30,0xd5] +// CHECK: mrs x9, dbgbcr6_el1 // encoding: [0xa9,0x06,0x30,0xd5] +// CHECK: mrs x9, dbgbcr7_el1 // encoding: [0xa9,0x07,0x30,0xd5] +// CHECK: mrs x9, dbgbcr8_el1 // encoding: [0xa9,0x08,0x30,0xd5] +// CHECK: mrs x9, dbgbcr9_el1 // encoding: [0xa9,0x09,0x30,0xd5] +// CHECK: mrs x9, dbgbcr10_el1 // encoding: [0xa9,0x0a,0x30,0xd5] +// CHECK: mrs x9, dbgbcr11_el1 // encoding: [0xa9,0x0b,0x30,0xd5] +// CHECK: mrs x9, dbgbcr12_el1 // encoding: [0xa9,0x0c,0x30,0xd5] +// CHECK: mrs x9, dbgbcr13_el1 // encoding: [0xa9,0x0d,0x30,0xd5] +// CHECK: mrs x9, dbgbcr14_el1 // encoding: [0xa9,0x0e,0x30,0xd5] +// CHECK: mrs x9, dbgbcr15_el1 // encoding: [0xa9,0x0f,0x30,0xd5] +// CHECK: mrs x9, dbgwvr0_el1 // encoding: [0xc9,0x00,0x30,0xd5] +// CHECK: mrs x9, dbgwvr1_el1 // encoding: [0xc9,0x01,0x30,0xd5] +// CHECK: mrs x9, dbgwvr2_el1 // encoding: [0xc9,0x02,0x30,0xd5] +// CHECK: mrs x9, dbgwvr3_el1 // encoding: [0xc9,0x03,0x30,0xd5] +// CHECK: mrs x9, dbgwvr4_el1 // encoding: [0xc9,0x04,0x30,0xd5] +// CHECK: mrs x9, dbgwvr5_el1 // encoding: [0xc9,0x05,0x30,0xd5] +// CHECK: mrs x9, dbgwvr6_el1 // encoding: [0xc9,0x06,0x30,0xd5] +// CHECK: mrs x9, dbgwvr7_el1 // encoding: [0xc9,0x07,0x30,0xd5] +// CHECK: mrs x9, dbgwvr8_el1 // encoding: [0xc9,0x08,0x30,0xd5] +// CHECK: mrs x9, dbgwvr9_el1 // encoding: [0xc9,0x09,0x30,0xd5] +// CHECK: mrs x9, dbgwvr10_el1 // encoding: [0xc9,0x0a,0x30,0xd5] +// CHECK: mrs x9, dbgwvr11_el1 // encoding: [0xc9,0x0b,0x30,0xd5] +// CHECK: mrs x9, dbgwvr12_el1 // encoding: [0xc9,0x0c,0x30,0xd5] +// CHECK: mrs x9, dbgwvr13_el1 // encoding: [0xc9,0x0d,0x30,0xd5] +// CHECK: mrs x9, dbgwvr14_el1 // encoding: [0xc9,0x0e,0x30,0xd5] +// CHECK: mrs x9, dbgwvr15_el1 // encoding: [0xc9,0x0f,0x30,0xd5] +// CHECK: mrs x9, dbgwcr0_el1 // encoding: [0xe9,0x00,0x30,0xd5] +// CHECK: mrs x9, dbgwcr1_el1 // encoding: [0xe9,0x01,0x30,0xd5] +// CHECK: mrs x9, dbgwcr2_el1 // encoding: [0xe9,0x02,0x30,0xd5] +// CHECK: mrs x9, dbgwcr3_el1 // encoding: [0xe9,0x03,0x30,0xd5] +// CHECK: mrs x9, dbgwcr4_el1 // encoding: [0xe9,0x04,0x30,0xd5] +// CHECK: mrs x9, dbgwcr5_el1 // encoding: [0xe9,0x05,0x30,0xd5] +// CHECK: mrs x9, dbgwcr6_el1 // encoding: [0xe9,0x06,0x30,0xd5] +// CHECK: mrs x9, dbgwcr7_el1 // encoding: [0xe9,0x07,0x30,0xd5] +// CHECK: mrs x9, dbgwcr8_el1 // encoding: [0xe9,0x08,0x30,0xd5] +// CHECK: mrs x9, dbgwcr9_el1 // encoding: [0xe9,0x09,0x30,0xd5] +// CHECK: mrs x9, dbgwcr10_el1 // encoding: [0xe9,0x0a,0x30,0xd5] +// CHECK: mrs x9, dbgwcr11_el1 // encoding: [0xe9,0x0b,0x30,0xd5] +// CHECK: mrs 
x9, dbgwcr12_el1 // encoding: [0xe9,0x0c,0x30,0xd5] +// CHECK: mrs x9, dbgwcr13_el1 // encoding: [0xe9,0x0d,0x30,0xd5] +// CHECK: mrs x9, dbgwcr14_el1 // encoding: [0xe9,0x0e,0x30,0xd5] +// CHECK: mrs x9, dbgwcr15_el1 // encoding: [0xe9,0x0f,0x30,0xd5] +// CHECK: mrs x9, mdrar_el1 // encoding: [0x09,0x10,0x30,0xd5] +// CHECK: mrs x9, teehbr32_el1 // encoding: [0x09,0x10,0x32,0xd5] +// CHECK: mrs x9, oslsr_el1 // encoding: [0x89,0x11,0x30,0xd5] +// CHECK: mrs x9, osdlr_el1 // encoding: [0x89,0x13,0x30,0xd5] +// CHECK: mrs x9, dbgprcr_el1 // encoding: [0x89,0x14,0x30,0xd5] +// CHECK: mrs x9, dbgclaimset_el1 // encoding: [0xc9,0x78,0x30,0xd5] +// CHECK: mrs x9, dbgclaimclr_el1 // encoding: [0xc9,0x79,0x30,0xd5] +// CHECK: mrs x9, dbgauthstatus_el1 // encoding: [0xc9,0x7e,0x30,0xd5] +// CHECK: mrs x9, midr_el1 // encoding: [0x09,0x00,0x38,0xd5] +// CHECK: mrs x9, ccsidr_el1 // encoding: [0x09,0x00,0x39,0xd5] +// CHECK: mrs x9, csselr_el1 // encoding: [0x09,0x00,0x3a,0xd5] +// CHECK: mrs x9, vpidr_el2 // encoding: [0x09,0x00,0x3c,0xd5] +// CHECK: mrs x9, clidr_el1 // encoding: [0x29,0x00,0x39,0xd5] +// CHECK: mrs x9, ctr_el0 // encoding: [0x29,0x00,0x3b,0xd5] +// CHECK: mrs x9, mpidr_el1 // encoding: [0xa9,0x00,0x38,0xd5] +// CHECK: mrs x9, vmpidr_el2 // encoding: [0xa9,0x00,0x3c,0xd5] +// CHECK: mrs x9, revidr_el1 // encoding: [0xc9,0x00,0x38,0xd5] +// CHECK: mrs x9, aidr_el1 // encoding: [0xe9,0x00,0x39,0xd5] +// CHECK: mrs x9, dczid_el0 // encoding: [0xe9,0x00,0x3b,0xd5] +// CHECK: mrs x9, id_pfr0_el1 // encoding: [0x09,0x01,0x38,0xd5] +// CHECK: mrs x9, id_pfr1_el1 // encoding: [0x29,0x01,0x38,0xd5] +// CHECK: mrs x9, id_dfr0_el1 // encoding: [0x49,0x01,0x38,0xd5] +// CHECK: mrs x9, id_afr0_el1 // encoding: [0x69,0x01,0x38,0xd5] +// CHECK: mrs x9, id_mmfr0_el1 // encoding: [0x89,0x01,0x38,0xd5] +// CHECK: mrs x9, id_mmfr1_el1 // encoding: [0xa9,0x01,0x38,0xd5] +// CHECK: mrs x9, id_mmfr2_el1 // encoding: [0xc9,0x01,0x38,0xd5] +// CHECK: mrs x9, id_mmfr3_el1 // encoding: [0xe9,0x01,0x38,0xd5] +// CHECK: mrs x9, id_isar0_el1 // encoding: [0x09,0x02,0x38,0xd5] +// CHECK: mrs x9, id_isar1_el1 // encoding: [0x29,0x02,0x38,0xd5] +// CHECK: mrs x9, id_isar2_el1 // encoding: [0x49,0x02,0x38,0xd5] +// CHECK: mrs x9, id_isar3_el1 // encoding: [0x69,0x02,0x38,0xd5] +// CHECK: mrs x9, id_isar4_el1 // encoding: [0x89,0x02,0x38,0xd5] +// CHECK: mrs x9, id_isar5_el1 // encoding: [0xa9,0x02,0x38,0xd5] +// CHECK: mrs x9, mvfr0_el1 // encoding: [0x09,0x03,0x38,0xd5] +// CHECK: mrs x9, mvfr1_el1 // encoding: [0x29,0x03,0x38,0xd5] +// CHECK: mrs x9, mvfr2_el1 // encoding: [0x49,0x03,0x38,0xd5] +// CHECK: mrs x9, id_aa64pfr0_el1 // encoding: [0x09,0x04,0x38,0xd5] +// CHECK: mrs x9, id_aa64pfr1_el1 // encoding: [0x29,0x04,0x38,0xd5] +// CHECK: mrs x9, id_aa64dfr0_el1 // encoding: [0x09,0x05,0x38,0xd5] +// CHECK: mrs x9, id_aa64dfr1_el1 // encoding: [0x29,0x05,0x38,0xd5] +// CHECK: mrs x9, id_aa64afr0_el1 // encoding: [0x89,0x05,0x38,0xd5] +// CHECK: mrs x9, id_aa64afr1_el1 // encoding: [0xa9,0x05,0x38,0xd5] +// CHECK: mrs x9, id_aa64isar0_el1 // encoding: [0x09,0x06,0x38,0xd5] +// CHECK: mrs x9, id_aa64isar1_el1 // encoding: [0x29,0x06,0x38,0xd5] +// CHECK: mrs x9, id_aa64mmfr0_el1 // encoding: [0x09,0x07,0x38,0xd5] +// CHECK: mrs x9, id_aa64mmfr1_el1 // encoding: [0x29,0x07,0x38,0xd5] +// CHECK: mrs x9, sctlr_el1 // encoding: [0x09,0x10,0x38,0xd5] +// CHECK: mrs x9, sctlr_el2 // encoding: [0x09,0x10,0x3c,0xd5] +// CHECK: mrs x9, sctlr_el3 // encoding: [0x09,0x10,0x3e,0xd5] +// CHECK: mrs x9, actlr_el1 // 
encoding: [0x29,0x10,0x38,0xd5] +// CHECK: mrs x9, actlr_el2 // encoding: [0x29,0x10,0x3c,0xd5] +// CHECK: mrs x9, actlr_el3 // encoding: [0x29,0x10,0x3e,0xd5] +// CHECK: mrs x9, cpacr_el1 // encoding: [0x49,0x10,0x38,0xd5] +// CHECK: mrs x9, hcr_el2 // encoding: [0x09,0x11,0x3c,0xd5] +// CHECK: mrs x9, scr_el3 // encoding: [0x09,0x11,0x3e,0xd5] +// CHECK: mrs x9, mdcr_el2 // encoding: [0x29,0x11,0x3c,0xd5] +// CHECK: mrs x9, sder32_el3 // encoding: [0x29,0x11,0x3e,0xd5] +// CHECK: mrs x9, cptr_el2 // encoding: [0x49,0x11,0x3c,0xd5] +// CHECK: mrs x9, cptr_el3 // encoding: [0x49,0x11,0x3e,0xd5] +// CHECK: mrs x9, hstr_el2 // encoding: [0x69,0x11,0x3c,0xd5] +// CHECK: mrs x9, hacr_el2 // encoding: [0xe9,0x11,0x3c,0xd5] +// CHECK: mrs x9, mdcr_el3 // encoding: [0x29,0x13,0x3e,0xd5] +// CHECK: mrs x9, ttbr0_el1 // encoding: [0x09,0x20,0x38,0xd5] +// CHECK: mrs x9, ttbr0_el2 // encoding: [0x09,0x20,0x3c,0xd5] +// CHECK: mrs x9, ttbr0_el3 // encoding: [0x09,0x20,0x3e,0xd5] +// CHECK: mrs x9, ttbr1_el1 // encoding: [0x29,0x20,0x38,0xd5] +// CHECK: mrs x9, tcr_el1 // encoding: [0x49,0x20,0x38,0xd5] +// CHECK: mrs x9, tcr_el2 // encoding: [0x49,0x20,0x3c,0xd5] +// CHECK: mrs x9, tcr_el3 // encoding: [0x49,0x20,0x3e,0xd5] +// CHECK: mrs x9, vttbr_el2 // encoding: [0x09,0x21,0x3c,0xd5] +// CHECK: mrs x9, vtcr_el2 // encoding: [0x49,0x21,0x3c,0xd5] +// CHECK: mrs x9, dacr32_el2 // encoding: [0x09,0x30,0x3c,0xd5] +// CHECK: mrs x9, spsr_el1 // encoding: [0x09,0x40,0x38,0xd5] +// CHECK: mrs x9, spsr_el2 // encoding: [0x09,0x40,0x3c,0xd5] +// CHECK: mrs x9, spsr_el3 // encoding: [0x09,0x40,0x3e,0xd5] +// CHECK: mrs x9, elr_el1 // encoding: [0x29,0x40,0x38,0xd5] +// CHECK: mrs x9, elr_el2 // encoding: [0x29,0x40,0x3c,0xd5] +// CHECK: mrs x9, elr_el3 // encoding: [0x29,0x40,0x3e,0xd5] +// CHECK: mrs x9, sp_el0 // encoding: [0x09,0x41,0x38,0xd5] +// CHECK: mrs x9, sp_el1 // encoding: [0x09,0x41,0x3c,0xd5] +// CHECK: mrs x9, sp_el2 // encoding: [0x09,0x41,0x3e,0xd5] +// CHECK: mrs x9, spsel // encoding: [0x09,0x42,0x38,0xd5] +// CHECK: mrs x9, nzcv // encoding: [0x09,0x42,0x3b,0xd5] +// CHECK: mrs x9, daif // encoding: [0x29,0x42,0x3b,0xd5] +// CHECK: mrs x9, currentel // encoding: [0x49,0x42,0x38,0xd5] +// CHECK: mrs x9, spsr_irq // encoding: [0x09,0x43,0x3c,0xd5] +// CHECK: mrs x9, spsr_abt // encoding: [0x29,0x43,0x3c,0xd5] +// CHECK: mrs x9, spsr_und // encoding: [0x49,0x43,0x3c,0xd5] +// CHECK: mrs x9, spsr_fiq // encoding: [0x69,0x43,0x3c,0xd5] +// CHECK: mrs x9, fpcr // encoding: [0x09,0x44,0x3b,0xd5] +// CHECK: mrs x9, fpsr // encoding: [0x29,0x44,0x3b,0xd5] +// CHECK: mrs x9, dspsr_el0 // encoding: [0x09,0x45,0x3b,0xd5] +// CHECK: mrs x9, dlr_el0 // encoding: [0x29,0x45,0x3b,0xd5] +// CHECK: mrs x9, ifsr32_el2 // encoding: [0x29,0x50,0x3c,0xd5] +// CHECK: mrs x9, afsr0_el1 // encoding: [0x09,0x51,0x38,0xd5] +// CHECK: mrs x9, afsr0_el2 // encoding: [0x09,0x51,0x3c,0xd5] +// CHECK: mrs x9, afsr0_el3 // encoding: [0x09,0x51,0x3e,0xd5] +// CHECK: mrs x9, afsr1_el1 // encoding: [0x29,0x51,0x38,0xd5] +// CHECK: mrs x9, afsr1_el2 // encoding: [0x29,0x51,0x3c,0xd5] +// CHECK: mrs x9, afsr1_el3 // encoding: [0x29,0x51,0x3e,0xd5] +// CHECK: mrs x9, esr_el1 // encoding: [0x09,0x52,0x38,0xd5] +// CHECK: mrs x9, esr_el2 // encoding: [0x09,0x52,0x3c,0xd5] +// CHECK: mrs x9, esr_el3 // encoding: [0x09,0x52,0x3e,0xd5] +// CHECK: mrs x9, fpexc32_el2 // encoding: [0x09,0x53,0x3c,0xd5] +// CHECK: mrs x9, far_el1 // encoding: [0x09,0x60,0x38,0xd5] +// CHECK: mrs x9, far_el2 // encoding: [0x09,0x60,0x3c,0xd5] +// 
CHECK: mrs x9, far_el3 // encoding: [0x09,0x60,0x3e,0xd5] +// CHECK: mrs x9, hpfar_el2 // encoding: [0x89,0x60,0x3c,0xd5] +// CHECK: mrs x9, par_el1 // encoding: [0x09,0x74,0x38,0xd5] +// CHECK: mrs x9, pmcr_el0 // encoding: [0x09,0x9c,0x3b,0xd5] +// CHECK: mrs x9, pmcntenset_el0 // encoding: [0x29,0x9c,0x3b,0xd5] +// CHECK: mrs x9, pmcntenclr_el0 // encoding: [0x49,0x9c,0x3b,0xd5] +// CHECK: mrs x9, pmovsclr_el0 // encoding: [0x69,0x9c,0x3b,0xd5] +// CHECK: mrs x9, pmselr_el0 // encoding: [0xa9,0x9c,0x3b,0xd5] +// CHECK: mrs x9, pmceid0_el0 // encoding: [0xc9,0x9c,0x3b,0xd5] +// CHECK: mrs x9, pmceid1_el0 // encoding: [0xe9,0x9c,0x3b,0xd5] +// CHECK: mrs x9, pmccntr_el0 // encoding: [0x09,0x9d,0x3b,0xd5] +// CHECK: mrs x9, pmxevtyper_el0 // encoding: [0x29,0x9d,0x3b,0xd5] +// CHECK: mrs x9, pmxevcntr_el0 // encoding: [0x49,0x9d,0x3b,0xd5] +// CHECK: mrs x9, pmuserenr_el0 // encoding: [0x09,0x9e,0x3b,0xd5] +// CHECK: mrs x9, pmintenset_el1 // encoding: [0x29,0x9e,0x38,0xd5] +// CHECK: mrs x9, pmintenclr_el1 // encoding: [0x49,0x9e,0x38,0xd5] +// CHECK: mrs x9, pmovsset_el0 // encoding: [0x69,0x9e,0x3b,0xd5] +// CHECK: mrs x9, mair_el1 // encoding: [0x09,0xa2,0x38,0xd5] +// CHECK: mrs x9, mair_el2 // encoding: [0x09,0xa2,0x3c,0xd5] +// CHECK: mrs x9, mair_el3 // encoding: [0x09,0xa2,0x3e,0xd5] +// CHECK: mrs x9, amair_el1 // encoding: [0x09,0xa3,0x38,0xd5] +// CHECK: mrs x9, amair_el2 // encoding: [0x09,0xa3,0x3c,0xd5] +// CHECK: mrs x9, amair_el3 // encoding: [0x09,0xa3,0x3e,0xd5] +// CHECK: mrs x9, vbar_el1 // encoding: [0x09,0xc0,0x38,0xd5] +// CHECK: mrs x9, vbar_el2 // encoding: [0x09,0xc0,0x3c,0xd5] +// CHECK: mrs x9, vbar_el3 // encoding: [0x09,0xc0,0x3e,0xd5] +// CHECK: mrs x9, rvbar_el1 // encoding: [0x29,0xc0,0x38,0xd5] +// CHECK: mrs x9, rvbar_el2 // encoding: [0x29,0xc0,0x3c,0xd5] +// CHECK: mrs x9, rvbar_el3 // encoding: [0x29,0xc0,0x3e,0xd5] +// CHECK: mrs x9, rmr_el1 // encoding: [0x49,0xc0,0x38,0xd5] +// CHECK: mrs x9, rmr_el2 // encoding: [0x49,0xc0,0x3c,0xd5] +// CHECK: mrs x9, rmr_el3 // encoding: [0x49,0xc0,0x3e,0xd5] +// CHECK: mrs x9, isr_el1 // encoding: [0x09,0xc1,0x38,0xd5] +// CHECK: mrs x9, contextidr_el1 // encoding: [0x29,0xd0,0x38,0xd5] +// CHECK: mrs x9, tpidr_el0 // encoding: [0x49,0xd0,0x3b,0xd5] +// CHECK: mrs x9, tpidr_el2 // encoding: [0x49,0xd0,0x3c,0xd5] +// CHECK: mrs x9, tpidr_el3 // encoding: [0x49,0xd0,0x3e,0xd5] +// CHECK: mrs x9, tpidrro_el0 // encoding: [0x69,0xd0,0x3b,0xd5] +// CHECK: mrs x9, tpidr_el1 // encoding: [0x89,0xd0,0x38,0xd5] +// CHECK: mrs x9, cntfrq_el0 // encoding: [0x09,0xe0,0x3b,0xd5] +// CHECK: mrs x9, cntpct_el0 // encoding: [0x29,0xe0,0x3b,0xd5] +// CHECK: mrs x9, cntvct_el0 // encoding: [0x49,0xe0,0x3b,0xd5] +// CHECK: mrs x9, cntvoff_el2 // encoding: [0x69,0xe0,0x3c,0xd5] +// CHECK: mrs x9, cntkctl_el1 // encoding: [0x09,0xe1,0x38,0xd5] +// CHECK: mrs x9, cnthctl_el2 // encoding: [0x09,0xe1,0x3c,0xd5] +// CHECK: mrs x9, cntp_tval_el0 // encoding: [0x09,0xe2,0x3b,0xd5] +// CHECK: mrs x9, cnthp_tval_el2 // encoding: [0x09,0xe2,0x3c,0xd5] +// CHECK: mrs x9, cntps_tval_el1 // encoding: [0x09,0xe2,0x3f,0xd5] +// CHECK: mrs x9, cntp_ctl_el0 // encoding: [0x29,0xe2,0x3b,0xd5] +// CHECK: mrs x9, cnthp_ctl_el2 // encoding: [0x29,0xe2,0x3c,0xd5] +// CHECK: mrs x9, cntps_ctl_el1 // encoding: [0x29,0xe2,0x3f,0xd5] +// CHECK: mrs x9, cntp_cval_el0 // encoding: [0x49,0xe2,0x3b,0xd5] +// CHECK: mrs x9, cnthp_cval_el2 // encoding: [0x49,0xe2,0x3c,0xd5] +// CHECK: mrs x9, cntps_cval_el1 // encoding: [0x49,0xe2,0x3f,0xd5] +// CHECK: mrs x9, 
cntv_tval_el0 // encoding: [0x09,0xe3,0x3b,0xd5] +// CHECK: mrs x9, cntv_ctl_el0 // encoding: [0x29,0xe3,0x3b,0xd5] +// CHECK: mrs x9, cntv_cval_el0 // encoding: [0x49,0xe3,0x3b,0xd5] +// CHECK: mrs x9, pmevcntr0_el0 // encoding: [0x09,0xe8,0x3b,0xd5] +// CHECK: mrs x9, pmevcntr1_el0 // encoding: [0x29,0xe8,0x3b,0xd5] +// CHECK: mrs x9, pmevcntr2_el0 // encoding: [0x49,0xe8,0x3b,0xd5] +// CHECK: mrs x9, pmevcntr3_el0 // encoding: [0x69,0xe8,0x3b,0xd5] +// CHECK: mrs x9, pmevcntr4_el0 // encoding: [0x89,0xe8,0x3b,0xd5] +// CHECK: mrs x9, pmevcntr5_el0 // encoding: [0xa9,0xe8,0x3b,0xd5] +// CHECK: mrs x9, pmevcntr6_el0 // encoding: [0xc9,0xe8,0x3b,0xd5] +// CHECK: mrs x9, pmevcntr7_el0 // encoding: [0xe9,0xe8,0x3b,0xd5] +// CHECK: mrs x9, pmevcntr8_el0 // encoding: [0x09,0xe9,0x3b,0xd5] +// CHECK: mrs x9, pmevcntr9_el0 // encoding: [0x29,0xe9,0x3b,0xd5] +// CHECK: mrs x9, pmevcntr10_el0 // encoding: [0x49,0xe9,0x3b,0xd5] +// CHECK: mrs x9, pmevcntr11_el0 // encoding: [0x69,0xe9,0x3b,0xd5] +// CHECK: mrs x9, pmevcntr12_el0 // encoding: [0x89,0xe9,0x3b,0xd5] +// CHECK: mrs x9, pmevcntr13_el0 // encoding: [0xa9,0xe9,0x3b,0xd5] +// CHECK: mrs x9, pmevcntr14_el0 // encoding: [0xc9,0xe9,0x3b,0xd5] +// CHECK: mrs x9, pmevcntr15_el0 // encoding: [0xe9,0xe9,0x3b,0xd5] +// CHECK: mrs x9, pmevcntr16_el0 // encoding: [0x09,0xea,0x3b,0xd5] +// CHECK: mrs x9, pmevcntr17_el0 // encoding: [0x29,0xea,0x3b,0xd5] +// CHECK: mrs x9, pmevcntr18_el0 // encoding: [0x49,0xea,0x3b,0xd5] +// CHECK: mrs x9, pmevcntr19_el0 // encoding: [0x69,0xea,0x3b,0xd5] +// CHECK: mrs x9, pmevcntr20_el0 // encoding: [0x89,0xea,0x3b,0xd5] +// CHECK: mrs x9, pmevcntr21_el0 // encoding: [0xa9,0xea,0x3b,0xd5] +// CHECK: mrs x9, pmevcntr22_el0 // encoding: [0xc9,0xea,0x3b,0xd5] +// CHECK: mrs x9, pmevcntr23_el0 // encoding: [0xe9,0xea,0x3b,0xd5] +// CHECK: mrs x9, pmevcntr24_el0 // encoding: [0x09,0xeb,0x3b,0xd5] +// CHECK: mrs x9, pmevcntr25_el0 // encoding: [0x29,0xeb,0x3b,0xd5] +// CHECK: mrs x9, pmevcntr26_el0 // encoding: [0x49,0xeb,0x3b,0xd5] +// CHECK: mrs x9, pmevcntr27_el0 // encoding: [0x69,0xeb,0x3b,0xd5] +// CHECK: mrs x9, pmevcntr28_el0 // encoding: [0x89,0xeb,0x3b,0xd5] +// CHECK: mrs x9, pmevcntr29_el0 // encoding: [0xa9,0xeb,0x3b,0xd5] +// CHECK: mrs x9, pmevcntr30_el0 // encoding: [0xc9,0xeb,0x3b,0xd5] +// CHECK: mrs x9, pmccfiltr_el0 // encoding: [0xe9,0xef,0x3b,0xd5] +// CHECK: mrs x9, pmevtyper0_el0 // encoding: [0x09,0xec,0x3b,0xd5] +// CHECK: mrs x9, pmevtyper1_el0 // encoding: [0x29,0xec,0x3b,0xd5] +// CHECK: mrs x9, pmevtyper2_el0 // encoding: [0x49,0xec,0x3b,0xd5] +// CHECK: mrs x9, pmevtyper3_el0 // encoding: [0x69,0xec,0x3b,0xd5] +// CHECK: mrs x9, pmevtyper4_el0 // encoding: [0x89,0xec,0x3b,0xd5] +// CHECK: mrs x9, pmevtyper5_el0 // encoding: [0xa9,0xec,0x3b,0xd5] +// CHECK: mrs x9, pmevtyper6_el0 // encoding: [0xc9,0xec,0x3b,0xd5] +// CHECK: mrs x9, pmevtyper7_el0 // encoding: [0xe9,0xec,0x3b,0xd5] +// CHECK: mrs x9, pmevtyper8_el0 // encoding: [0x09,0xed,0x3b,0xd5] +// CHECK: mrs x9, pmevtyper9_el0 // encoding: [0x29,0xed,0x3b,0xd5] +// CHECK: mrs x9, pmevtyper10_el0 // encoding: [0x49,0xed,0x3b,0xd5] +// CHECK: mrs x9, pmevtyper11_el0 // encoding: [0x69,0xed,0x3b,0xd5] +// CHECK: mrs x9, pmevtyper12_el0 // encoding: [0x89,0xed,0x3b,0xd5] +// CHECK: mrs x9, pmevtyper13_el0 // encoding: [0xa9,0xed,0x3b,0xd5] +// CHECK: mrs x9, pmevtyper14_el0 // encoding: [0xc9,0xed,0x3b,0xd5] +// CHECK: mrs x9, pmevtyper15_el0 // encoding: [0xe9,0xed,0x3b,0xd5] +// CHECK: mrs x9, pmevtyper16_el0 // encoding: 
[0x09,0xee,0x3b,0xd5]
+// CHECK: mrs x9, pmevtyper17_el0 // encoding: [0x29,0xee,0x3b,0xd5]
+// CHECK: mrs x9, pmevtyper18_el0 // encoding: [0x49,0xee,0x3b,0xd5]
+// CHECK: mrs x9, pmevtyper19_el0 // encoding: [0x69,0xee,0x3b,0xd5]
+// CHECK: mrs x9, pmevtyper20_el0 // encoding: [0x89,0xee,0x3b,0xd5]
+// CHECK: mrs x9, pmevtyper21_el0 // encoding: [0xa9,0xee,0x3b,0xd5]
+// CHECK: mrs x9, pmevtyper22_el0 // encoding: [0xc9,0xee,0x3b,0xd5]
+// CHECK: mrs x9, pmevtyper23_el0 // encoding: [0xe9,0xee,0x3b,0xd5]
+// CHECK: mrs x9, pmevtyper24_el0 // encoding: [0x09,0xef,0x3b,0xd5]
+// CHECK: mrs x9, pmevtyper25_el0 // encoding: [0x29,0xef,0x3b,0xd5]
+// CHECK: mrs x9, pmevtyper26_el0 // encoding: [0x49,0xef,0x3b,0xd5]
+// CHECK: mrs x9, pmevtyper27_el0 // encoding: [0x69,0xef,0x3b,0xd5]
+// CHECK: mrs x9, pmevtyper28_el0 // encoding: [0x89,0xef,0x3b,0xd5]
+// CHECK: mrs x9, pmevtyper29_el0 // encoding: [0xa9,0xef,0x3b,0xd5]
+// CHECK: mrs x9, pmevtyper30_el0 // encoding: [0xc9,0xef,0x3b,0xd5]
+
+ mrs x12, s3_7_c15_c1_5
+ mrs x13, s3_2_c11_c15_7
+ msr s3_0_c15_c0_0, x12
+ msr s3_7_c11_c13_7, x5
+// CHECK: mrs x12, s3_7_c15_c1_5 // encoding: [0xac,0xf1,0x3f,0xd5]
+// CHECK: mrs x13, s3_2_c11_c15_7 // encoding: [0xed,0xbf,0x3a,0xd5]
+// CHECK: msr s3_0_c15_c0_0, x12 // encoding: [0x0c,0xf0,0x18,0xd5]
+// CHECK: msr s3_7_c11_c13_7, x5 // encoding: [0xe5,0xbd,0x1f,0xd5]
+
+//------------------------------------------------------------------------------
+// Test & branch (immediate)
+//------------------------------------------------------------------------------
+
+ tbz x5, #0, somewhere
+ tbz xzr, #63, elsewhere
+ tbnz x5, #45, nowhere
+// CHECK: tbz x5, #0, somewhere // encoding: [0x05'A',A,A,0x36'A']
+// CHECK: // fixup A - offset: 0, value: somewhere, kind: fixup_a64_tstbr
+// CHECK: tbz xzr, #63, elsewhere // encoding: [0x1f'A',A,0xf8'A',0xb6'A']
+// CHECK: // fixup A - offset: 0, value: elsewhere, kind: fixup_a64_tstbr
+// CHECK: tbnz x5, #45, nowhere // encoding: [0x05'A',A,0x68'A',0xb7'A']
+// CHECK: // fixup A - offset: 0, value: nowhere, kind: fixup_a64_tstbr
+
+ tbnz w3, #2, there
+ tbnz wzr, #31, nowhere
+ tbz w5, #12, anywhere
+// CHECK: tbnz w3, #2, there // encoding: [0x03'A',A,0x10'A',0x37'A']
+// CHECK: // fixup A - offset: 0, value: there, kind: fixup_a64_tstbr
+// CHECK: tbnz wzr, #31, nowhere // encoding: [0x1f'A',A,0xf8'A',0x37'A']
+// CHECK: // fixup A - offset: 0, value: nowhere, kind: fixup_a64_tstbr
+// CHECK: tbz w5, #12, anywhere // encoding: [0x05'A',A,0x60'A',0x36'A']
+// CHECK: // fixup A - offset: 0, value: anywhere, kind: fixup_a64_tstbr
+
+//------------------------------------------------------------------------------
+// Unconditional branch (immediate)
+//------------------------------------------------------------------------------
+
+ b somewhere
+ bl elsewhere
+// CHECK: b somewhere // encoding: [A,A,A,0x14'A']
+// CHECK: // fixup A - offset: 0, value: somewhere, kind: fixup_a64_uncondbr
+// CHECK: bl elsewhere // encoding: [A,A,A,0x94'A']
+// CHECK: // fixup A - offset: 0, value: elsewhere, kind: fixup_a64_call
+
+ b #4
+ bl #0
+ b #134217724
+ bl #-134217728
+// CHECK: b #4 // encoding: [0x01,0x00,0x00,0x14]
+// CHECK: bl #0 // encoding: [0x00,0x00,0x00,0x94]
+// CHECK: b #134217724 // encoding: [0xff,0xff,0xff,0x15]
+// CHECK: bl #-134217728 // encoding: [0x00,0x00,0x00,0x96]
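
Aside, not part of the patch: the fixed encodings asserted above follow directly from the ARMv8 field layouts. A minimal Python sketch — helper names are made up, assuming the generic MRS/MSR form S<op0>_<op1>_<Cn>_<Cm>_<op2> and the 26-bit word-scaled branch immediate — reproduces a few of the CHECK byte lists:

import struct

def sysreg(l, op0, op1, crn, crm, op2, rt):
    # MRS (l=1) / MSR (l=0): 1101 0101 00 L op0(2) op1(3) CRn(4) CRm(4) op2(3) Rt(5)
    return (0xd5000000 | (l << 21) | (op0 << 19) | (op1 << 16)
            | (crn << 12) | (crm << 8) | (op2 << 5) | rt)

def branch(offset, link=False):
    # B/BL: opcode 0b000101 / 0b100101 in bits 31-26, imm26 = byte offset / 4.
    return (0x94000000 if link else 0x14000000) | ((offset >> 2) & 0x3ffffff)

# These match the little-endian byte lists in the CHECK lines above.
assert struct.pack('<I', sysreg(1, 3, 7, 15, 1, 5, 12)) == bytes([0xac, 0xf1, 0x3f, 0xd5])
assert struct.pack('<I', sysreg(0, 3, 0, 15, 0, 0, 12)) == bytes([0x0c, 0xf0, 0x18, 0xd5])
assert struct.pack('<I', branch(134217724)) == bytes([0xff, 0xff, 0xff, 0x15])
assert struct.pack('<I', branch(-134217728, link=True)) == bytes([0x00, 0x00, 0x00, 0x96])
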
+//------------------------------------------------------------------------------
+// Unconditional branch (register)
+//------------------------------------------------------------------------------
+
+ br x20
+ blr xzr
+ ret x10
+// CHECK: br x20 // encoding: [0x80,0x02,0x1f,0xd6]
+// CHECK: blr xzr // encoding: [0xe0,0x03,0x3f,0xd6]
+// CHECK: ret x10 // encoding: [0x40,0x01,0x5f,0xd6]
+
+ ret
+ eret
+ drps
+// CHECK: ret // encoding: [0xc0,0x03,0x5f,0xd6]
+// CHECK: eret // encoding: [0xe0,0x03,0x9f,0xd6]
+// CHECK: drps // encoding: [0xe0,0x03,0xbf,0xd6]
+
diff --git a/test/MC/AArch64/elf-globaladdress.ll b/test/MC/AArch64/elf-globaladdress.ll
new file mode 100644
index 0000000..190439d
--- /dev/null
+++ b/test/MC/AArch64/elf-globaladdress.ll
@@ -0,0 +1,111 @@
+;; RUN: llc -mtriple=aarch64-none-linux-gnu -filetype=obj %s -o - | \
+;; RUN: elf-dump | FileCheck -check-prefix=OBJ %s
+
+; Also take it on a round-trip through llvm-mc to stretch assembly-parsing's legs:
+;; RUN: llc -mtriple=aarch64-none-linux-gnu %s -o - | \
+;; RUN: llvm-mc -arch=aarch64 -filetype=obj -o - | \
+;; RUN: elf-dump | FileCheck -check-prefix=OBJ %s
+
+@var8 = global i8 0
+@var16 = global i16 0
+@var32 = global i32 0
+@var64 = global i64 0
+
+define void @loadstore() {
+ %val8 = load i8* @var8
+ store volatile i8 %val8, i8* @var8
+
+ %val16 = load i16* @var16
+ store volatile i16 %val16, i16* @var16
+
+ %val32 = load i32* @var32
+ store volatile i32 %val32, i32* @var32
+
+ %val64 = load i64* @var64
+ store volatile i64 %val64, i64* @var64
+
+ ret void
+}
+
+@globaddr = global i64* null
+
+define void @address() {
+ store i64* @var64, i64** @globaddr
+ ret void
+}
+
+; Check we're using EM_AARCH64 (0xb7)
+; OBJ: 'e_machine', 0x00b7
+
+; OBJ: .rela.text
+
+; var8
+; R_AARCH64_ADR_PREL_PG_HI21 against var8
+; OBJ: 'r_sym', 0x0000000f
+; OBJ-NEXT: 'r_type', 0x00000113
+
+; R_AARCH64_LDST8_ABS_LO12_NC against var8
+; OBJ: 'r_sym', 0x0000000f
+; OBJ-NEXT: 'r_type', 0x00000116
+
+
+; var16
+; R_AARCH64_ADR_PREL_PG_HI21 against var16
+; OBJ: 'r_sym', 0x0000000c
+; OBJ-NEXT: 'r_type', 0x00000113
+
+; R_AARCH64_LDST16_ABS_LO12_NC against var16
+; OBJ: 'r_sym', 0x0000000c
+; OBJ-NEXT: 'r_type', 0x0000011c
+
+
+; var32
+; R_AARCH64_ADR_PREL_PG_HI21 against var32
+; OBJ: 'r_sym', 0x0000000d
+; OBJ-NEXT: 'r_type', 0x00000113
+
+; R_AARCH64_LDST32_ABS_LO12_NC against var32
+; OBJ: 'r_sym', 0x0000000d
+; OBJ-NEXT: 'r_type', 0x0000011d
+
+
+; var64
+; R_AARCH64_ADR_PREL_PG_HI21 against var64
+; OBJ: 'r_sym', 0x0000000e
+; OBJ-NEXT: 'r_type', 0x00000113
+
+; R_AARCH64_LDST64_ABS_LO12_NC against var64
+; OBJ: 'r_sym', 0x0000000e
+; OBJ-NEXT: 'r_type', 0x0000011e
+
+; This is on the store, so not really important, but it stops the next
+; R_AARCH64_LDST64_ABS_LO12_NC against var64 +; OBJ: 'r_sym', 0x0000000e +; OBJ-NEXT: 'r_type', 0x0000011e + + +; Pure address-calculation against var64 +; R_AARCH64_ADR_PREL_PG_HI21 against var64 +; OBJ: 'r_sym', 0x0000000e +; OBJ-NEXT: 'r_type', 0x00000113 + +; R_AARCH64_ADD_ABS_LO12_NC against var64 +; OBJ: 'r_sym', 0x0000000e +; OBJ-NEXT: 'r_type', 0x00000115 + + +; Make sure the symbols don't move around, otherwise relocation info +; will be wrong: + +; OBJ: Symbol 12 +; OBJ-NEXT: var16 + +; OBJ: Symbol 13 +; OBJ-NEXT: var32 + +; OBJ: Symbol 14 +; OBJ-NEXT: var64 + +; OBJ: Symbol 15 +; OBJ-NEXT: var8 diff --git a/test/MC/AArch64/elf-objdump.s b/test/MC/AArch64/elf-objdump.s new file mode 100644 index 0000000..c5aa5b1 --- /dev/null +++ b/test/MC/AArch64/elf-objdump.s @@ -0,0 +1,5 @@ +// 64 bit little endian +// RUN: llvm-mc -filetype=obj -arch=aarch64 -triple aarch64-none-linux-gnu %s -o - | llvm-objdump -d + +// We just want to see if llvm-objdump works at all. +// CHECK: .text diff --git a/test/MC/AArch64/elf-reloc-addsubimm.s b/test/MC/AArch64/elf-reloc-addsubimm.s new file mode 100644 index 0000000..7fa6e90 --- /dev/null +++ b/test/MC/AArch64/elf-reloc-addsubimm.s @@ -0,0 +1,13 @@ +// RUN: llvm-mc -arch=aarch64 -filetype=obj %s -o - | \ +// RUN: elf-dump | FileCheck -check-prefix=OBJ %s + + add x2, x3, #:lo12:some_label +// OBJ: .rela.text + +// OBJ: 'r_offset', 0x0000000000000000 +// OBJ-NEXT: 'r_sym', 0x00000005 +// OBJ-NEXT: 'r_type', 0x00000115 + +// OBJ: .symtab +// OBJ: Symbol 5 +// OBJ-NEXT: some_label
\ No newline at end of file
diff --git a/test/MC/AArch64/elf-reloc-condbr.s b/test/MC/AArch64/elf-reloc-condbr.s
new file mode 100644
index 0000000..283d3b9
--- /dev/null
+++ b/test/MC/AArch64/elf-reloc-condbr.s
@@ -0,0 +1,13 @@
+// RUN: llvm-mc -arch=aarch64 -filetype=obj %s -o - | \
+// RUN: elf-dump | FileCheck -check-prefix=OBJ %s
+
+ b.eq somewhere
+// OBJ: .rela.text
+
+// OBJ: 'r_offset', 0x0000000000000000
+// OBJ-NEXT: 'r_sym', 0x00000005
+// OBJ-NEXT: 'r_type', 0x00000118
+
+// OBJ: .symtab
+// OBJ: Symbol 5
+// OBJ-NEXT: somewhere
\ No newline at end of file
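
Aside, not part of the patch: the raw 'r_type' numbers these elf-dump tests assert are the ELF-for-AArch64 static relocation codes. A small, hand-assembled Python table (illustrative only, covering just the codes the tests in this directory expect) makes the correspondence explicit:

# Selected AArch64 relocation codes, as asserted by the surrounding tests.
AARCH64_RELOC_NAMES = {
    0x107: "R_AARCH64_MOVW_UABS_G0",      # movz #:abs_g0:
    0x111: "R_AARCH64_LD_PREL_LO19",      # ldr/ldrsw/prfm literal
    0x113: "R_AARCH64_ADR_PREL_PG_HI21",  # adrp
    0x115: "R_AARCH64_ADD_ABS_LO12_NC",   # add #:lo12:
    0x117: "R_AARCH64_TSTBR14",           # tbz/tbnz
    0x118: "R_AARCH64_CONDBR19",          # b.cond (checked just above)
    0x11a: "R_AARCH64_JUMP26",            # b
    0x11b: "R_AARCH64_CALL26",            # bl
}

assert AARCH64_RELOC_NAMES[0x118] == "R_AARCH64_CONDBR19"
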
diff --git a/test/MC/AArch64/elf-reloc-ldrlit.s b/test/MC/AArch64/elf-reloc-ldrlit.s
new file mode 100644
index 0000000..ce9ff49
--- /dev/null
+++ b/test/MC/AArch64/elf-reloc-ldrlit.s
@@ -0,0 +1,28 @@
+// RUN: llvm-mc -arch=aarch64 -filetype=obj %s -o - | \
+// RUN: elf-dump | FileCheck -check-prefix=OBJ %s
+
+ ldr x0, some_label
+ ldr w3, some_label
+ ldrsw x9, some_label
+ prfm pldl3keep, some_label
+// OBJ: .rela.text
+
+// OBJ: 'r_offset', 0x0000000000000000
+// OBJ: 'r_sym', 0x00000005
+// OBJ-NEXT: 'r_type', 0x00000111
+
+// OBJ: 'r_offset', 0x0000000000000004
+// OBJ: 'r_sym', 0x00000005
+// OBJ-NEXT: 'r_type', 0x00000111
+
+// OBJ: 'r_offset', 0x0000000000000008
+// OBJ: 'r_sym', 0x00000005
+// OBJ-NEXT: 'r_type', 0x00000111
+
+// OBJ: 'r_offset', 0x000000000000000c
+// OBJ: 'r_sym', 0x00000005
+// OBJ-NEXT: 'r_type', 0x00000111
+
+// OBJ: .symtab
+// OBJ: Symbol 5
+// OBJ-NEXT: some_label
\ No newline at end of file diff --git a/test/MC/AArch64/elf-reloc-ldstunsimm.s b/test/MC/AArch64/elf-reloc-ldstunsimm.s new file mode 100644 index 0000000..345fc82 --- /dev/null +++ b/test/MC/AArch64/elf-reloc-ldstunsimm.s @@ -0,0 +1,34 @@ +// RUN: llvm-mc -arch=aarch64 -filetype=obj %s -o - | \ +// RUN: elf-dump | FileCheck -check-prefix=OBJ %s + + ldrb w0, [sp, #:lo12:some_label] + ldrh w0, [sp, #:lo12:some_label] + ldr w0, [sp, #:lo12:some_label] + ldr x0, [sp, #:lo12:some_label] + str q0, [sp, #:lo12:some_label] + +// OBJ: .rela.text + +// OBJ: 'r_offset', 0x0000000000000000 +// OBJ-NEXT: 'r_sym', 0x00000005 +// OBJ-NEXT: 'r_type', 0x00000116 + +// OBJ: 'r_offset', 0x0000000000000004 +// OBJ-NEXT: 'r_sym', 0x00000005 +// OBJ-NEXT: 'r_type', 0x0000011c + +// OBJ: 'r_offset', 0x0000000000000008 +// OBJ-NEXT: 'r_sym', 0x00000005 +// OBJ-NEXT: 'r_type', 0x0000011d + +// OBJ: 'r_offset', 0x000000000000000c +// OBJ-NEXT: 'r_sym', 0x00000005 +// OBJ-NEXT: 'r_type', 0x0000011e + +// OBJ: 'r_offset', 0x0000000000000010 +// OBJ-NEXT: 'r_sym', 0x00000005 +// OBJ-NEXT: 'r_type', 0x0000012b + +// OBJ: .symtab +// OBJ: Symbol 5 +// OBJ-NEXT: some_label diff --git a/test/MC/AArch64/elf-reloc-movw.s b/test/MC/AArch64/elf-reloc-movw.s new file mode 100644 index 0000000..cb7dc67 --- /dev/null +++ b/test/MC/AArch64/elf-reloc-movw.s @@ -0,0 +1,98 @@ +// RUN: llvm-mc -arch=aarch64 -filetype=obj %s -o - | \ +// RUN: elf-dump | FileCheck -check-prefix=OBJ %s + + movz x0, #:abs_g0:some_label + movk x0, #:abs_g0_nc:some_label + + movz x3, #:abs_g1:some_label + movk x5, #:abs_g1_nc:some_label + + movz x3, #:abs_g2:some_label + movk x5, #:abs_g2_nc:some_label + + movz x7, #:abs_g3:some_label + movk x11, #:abs_g3:some_label + + movz x13, #:abs_g0_s:some_label + movn x17, #:abs_g0_s:some_label + + movz x19, #:abs_g1_s:some_label + movn x19, #:abs_g1_s:some_label + + movz x19, #:abs_g2_s:some_label + movn x19, #:abs_g2_s:some_label +// OBJ: .rela.text + +// :abs_g0: => R_AARCH64_MOVW_UABS_G0 +// OBJ: 'r_offset', 0x0000000000000000 +// OBJ: 'r_sym', 0x00000005 +// OBJ-NEXT: 'r_type', 0x00000107 + +// :abs_g0_nc: => R_AARCH64_MOVW_UABS_G0_NC +// OBJ: 'r_offset', 0x0000000000000004 +// OBJ: 'r_sym', 0x00000005 +// OBJ-NEXT: 'r_type', 0x00000108 + +// :abs_g1: => R_AARCH64_MOVW_UABS_G1 +// OBJ: 'r_offset', 0x0000000000000008 +// OBJ: 'r_sym', 0x00000005 +// OBJ-NEXT: 'r_type', 0x00000109 + +// :abs_g1_nc: => R_AARCH64_MOVW_UABS_G1_NC +// OBJ: 'r_offset', 0x000000000000000c +// OBJ: 'r_sym', 0x00000005 +// OBJ-NEXT: 'r_type', 0x0000010a + +// :abs_g2: => R_AARCH64_MOVW_UABS_G2 +// OBJ: 'r_offset', 0x0000000000000010 +// OBJ: 'r_sym', 0x00000005 +// OBJ-NEXT: 'r_type', 0x0000010b + +// :abs_g2_nc: => R_AARCH64_MOVW_UABS_G2_NC +// OBJ: 'r_offset', 0x0000000000000014 +// OBJ: 'r_sym', 0x00000005 +// OBJ-NEXT: 'r_type', 0x0000010c + +// :abs_g3: => R_AARCH64_MOVW_UABS_G3 +// OBJ: 'r_offset', 0x0000000000000018 +// OBJ: 'r_sym', 0x00000005 +// OBJ-NEXT: 'r_type', 0x0000010d + +// :abs_g3: => R_AARCH64_MOVW_UABS_G3 +// OBJ: 'r_offset', 0x000000000000001c +// OBJ: 'r_sym', 0x00000005 +// OBJ-NEXT: 'r_type', 0x0000010d + +// :abs_g0_s: => R_AARCH64_MOVW_SABS_G0 +// OBJ: 'r_offset', 0x0000000000000020 +// OBJ: 'r_sym', 0x00000005 +// OBJ-NEXT: 'r_type', 0x0000010e + +// :abs_g0_s: => R_AARCH64_MOVW_SABS_G0 +// OBJ: 'r_offset', 0x0000000000000024 +// OBJ: 'r_sym', 0x00000005 +// OBJ-NEXT: 'r_type', 0x0000010e + +// :abs_g1_s: => R_AARCH64_MOVW_SABS_G1 +// OBJ: 'r_offset', 0x0000000000000028 +// OBJ: 'r_sym', 0x00000005 +// 
OBJ-NEXT: 'r_type', 0x0000010f + +// :abs_g1_s: => R_AARCH64_MOVW_SABS_G1 +// OBJ: 'r_offset', 0x000000000000002c +// OBJ: 'r_sym', 0x00000005 +// OBJ-NEXT: 'r_type', 0x0000010f + +// :abs_g2_s: => R_AARCH64_MOVW_SABS_G2 +// OBJ: 'r_offset', 0x0000000000000030 +// OBJ: 'r_sym', 0x00000005 +// OBJ-NEXT: 'r_type', 0x00000110 + +// :abs_g2_s: => R_AARCH64_MOVW_SABS_G2 +// OBJ: 'r_offset', 0x0000000000000034 +// OBJ: 'r_sym', 0x00000005 +// OBJ-NEXT: 'r_type', 0x00000110 + +// OBJ: .symtab +// OBJ: Symbol 5 +// OBJ-NEXT: some_label diff --git a/test/MC/AArch64/elf-reloc-pcreladdressing.s b/test/MC/AArch64/elf-reloc-pcreladdressing.s new file mode 100644 index 0000000..39a8ba9 --- /dev/null +++ b/test/MC/AArch64/elf-reloc-pcreladdressing.s @@ -0,0 +1,29 @@ +// RUN: llvm-mc -arch=aarch64 -filetype=obj %s -o - | \ +// RUN: elf-dump | FileCheck -check-prefix=OBJ %s + + adr x2, some_label + adrp x5, some_label + + adrp x5, :got:some_label + ldr x0, [x5, #:got_lo12:some_label] +// OBJ: .rela.text + +// OBJ: 'r_offset', 0x0000000000000000 +// OBJ-NEXT: 'r_sym', 0x00000005 +// OBJ-NEXT: 'r_type', 0x00000112 + +// OBJ: 'r_offset', 0x0000000000000004 +// OBJ-NEXT: 'r_sym', 0x00000005 +// OBJ-NEXT: 'r_type', 0x00000113 + +// OBJ: 'r_offset', 0x0000000000000008 +// OBJ-NEXT: 'r_sym', 0x00000005 +// OBJ-NEXT: 'r_type', 0x00000137 + +// OBJ: 'r_offset', 0x000000000000000c +// OBJ-NEXT: 'r_sym', 0x00000005 +// OBJ-NEXT: 'r_type', 0x00000138 + +// OBJ: .symtab +// OBJ: Symbol 5 +// OBJ-NEXT: some_label
\ No newline at end of file diff --git a/test/MC/AArch64/elf-reloc-tstb.s b/test/MC/AArch64/elf-reloc-tstb.s new file mode 100644 index 0000000..c5e2981 --- /dev/null +++ b/test/MC/AArch64/elf-reloc-tstb.s @@ -0,0 +1,18 @@ +// RUN: llvm-mc -arch=aarch64 -filetype=obj %s -o - | \ +// RUN: elf-dump | FileCheck -check-prefix=OBJ %s + + tbz x6, #45, somewhere + tbnz w3, #15, somewhere +// OBJ: .rela.text + +// OBJ: 'r_offset', 0x0000000000000000 +// OBJ-NEXT: 'r_sym', 0x00000005 +// OBJ-NEXT: 'r_type', 0x00000117 + +// OBJ: 'r_offset', 0x0000000000000004 +// OBJ-NEXT: 'r_sym', 0x00000005 +// OBJ-NEXT: 'r_type', 0x00000117 + +// OBJ: .symtab +// OBJ: Symbol 5 +// OBJ-NEXT: somewhere diff --git a/test/MC/AArch64/elf-reloc-uncondbrimm.s b/test/MC/AArch64/elf-reloc-uncondbrimm.s new file mode 100644 index 0000000..0e97bc6 --- /dev/null +++ b/test/MC/AArch64/elf-reloc-uncondbrimm.s @@ -0,0 +1,18 @@ +// RUN: llvm-mc -arch=aarch64 -filetype=obj %s -o - | \ +// RUN: elf-dump | FileCheck -check-prefix=OBJ %s + + b somewhere + bl somewhere +// OBJ: .rela.text + +// OBJ: 'r_offset', 0x0000000000000000 +// OBJ-NEXT: 'r_sym', 0x00000005 +// OBJ-NEXT: 'r_type', 0x0000011a + +// OBJ: 'r_offset', 0x0000000000000004 +// OBJ-NEXT: 'r_sym', 0x00000005 +// OBJ-NEXT: 'r_type', 0x0000011b + +// OBJ: .symtab +// OBJ: Symbol 5 +// OBJ-NEXT: somewhere
\ No newline at end of file diff --git a/test/MC/AArch64/lit.local.cfg b/test/MC/AArch64/lit.local.cfg new file mode 100644 index 0000000..cc02173 --- /dev/null +++ b/test/MC/AArch64/lit.local.cfg @@ -0,0 +1,5 @@ +config.suffixes = ['.ll', '.c', '.cpp', '.s'] + +targets = set(config.root.targets_to_build.split()) +if not 'AArch64' in targets: + config.unsupported = True
\ No newline at end of file
diff --git a/test/MC/AArch64/mapping-across-sections.s b/test/MC/AArch64/mapping-across-sections.s
new file mode 100644
index 0000000..3d32c1d
--- /dev/null
+++ b/test/MC/AArch64/mapping-across-sections.s
@@ -0,0 +1,28 @@
+// RUN: llvm-mc -triple=aarch64-none-linux-gnu -filetype=obj < %s | llvm-objdump -t - | FileCheck %s
+
+ .text
+ add w0, w0, w0
+
+// .wibble should *not* inherit .text's mapping symbol. It's a completely different section.
+ .section .wibble
+ add w0, w0, w0
+
+// A section should be able to start with a $d
+ .section .starts_data
+ .word 42
+
+// Changing back to .text should not emit a redundant $x
+ .text
+ add w0, w0, w0
+
+// With all those constraints, we want:
+// + .text to have $x at 0 and no others
+// + .wibble to have $x at 0
+// + .starts_data to have $d at 0
+
+
+// CHECK: 00000000 .starts_data 00000000 $d
+// CHECK-NEXT: 00000000 .text 00000000 $x
+// CHECK-NEXT: 00000000 .wibble 00000000 $x
+// CHECK-NOT: ${{[adtx]}}
+
diff --git a/test/MC/AArch64/mapping-within-section.s b/test/MC/AArch64/mapping-within-section.s
new file mode 100644
index 0000000..c8bd804
--- /dev/null
+++ b/test/MC/AArch64/mapping-within-section.s
@@ -0,0 +1,23 @@
+// RUN: llvm-mc -triple=aarch64-none-linux-gnu -filetype=obj < %s | llvm-objdump -t - | FileCheck %s
+
+ .text
+// $x at 0x0000
+ add w0, w0, w0
+// $d at 0x0004
+ .ascii "012"
+ .byte 1
+ .hword 2
+ .word 4
+ .xword 8
+ .single 4.0
+ .double 8.0
+ .space 10
+ .zero 3
+ .fill 10, 2, 42
+ .org 100, 12
+// $x at 0x0064
+ add x0, x0, x0
+
+// CHECK: 00000004 .text 00000000 $d
+// CHECK-NEXT: 00000000 .text 00000000 $x
+// CHECK-NEXT: 00000064 .text 00000000 $x
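
Aside, not part of the patch: the tls-relocs.s test that follows drives every TLS relocation modifier through the assembler. The movw-family modifiers (:dtprel_g2:, :dtprel_g1:, :dtprel_g0: and their tprel/gottprel relatives) each select one 16-bit chunk of a thread-local offset, to be materialised with a movz plus movk sequence. A minimal sketch of that chunking (illustrative helper name, not an LLVM API):

def movw_chunks(offset):
    # g2 covers bits [47:32], g1 bits [31:16], g0 bits [15:0].
    return {g: (offset >> shift) & 0xffff
            for g, shift in (("g2", 32), ("g1", 16), ("g0", 0))}

chunks = movw_chunks(0x123456789abc)
assert chunks == {"g2": 0x1234, "g1": 0x5678, "g0": 0x9abc}
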
diff --git a/test/MC/AArch64/tls-relocs.s b/test/MC/AArch64/tls-relocs.s
new file mode 100644
index 0000000..690fa8c
--- /dev/null
+++ b/test/MC/AArch64/tls-relocs.s
@@ -0,0 +1,662 @@
+// RUN: llvm-mc -arch=aarch64 -show-encoding < %s | FileCheck %s
+// RUN: llvm-mc -arch=aarch64 -filetype=obj < %s -o %t
+// RUN: elf-dump %t | FileCheck --check-prefix=CHECK-ELF %s
+// RUN: llvm-objdump -r %t | FileCheck --check-prefix=CHECK-ELF-NAMES %s
+
+// CHECK-ELF: .rela.text
+
+ // TLS local-dynamic forms
+ movz x1, #:dtprel_g2:var
+ movn x2, #:dtprel_g2:var
+ movz x3, #:dtprel_g2:var
+ movn x4, #:dtprel_g2:var
+// CHECK: movz x1, #:dtprel_g2:var // encoding: [0x01'A',A,0xc0'A',0x92'A']
+// CHECK-NEXT: // fixup A - offset: 0, value: :dtprel_g2:var, kind: fixup_a64_movw_dtprel_g2
+// CHECK-NEXT: movn x2, #:dtprel_g2:var // encoding: [0x02'A',A,0xc0'A',0x92'A']
+// CHECK-NEXT: // fixup A - offset: 0, value: :dtprel_g2:var, kind: fixup_a64_movw_dtprel_g2
+// CHECK-NEXT: movz x3, #:dtprel_g2:var // encoding: [0x03'A',A,0xc0'A',0x92'A']
+// CHECK-NEXT: // fixup A - offset: 0, value: :dtprel_g2:var, kind: fixup_a64_movw_dtprel_g2
+// CHECK-NEXT: movn x4, #:dtprel_g2:var // encoding: [0x04'A',A,0xc0'A',0x92'A']
+// CHECK-NEXT: // fixup A - offset: 0, value: :dtprel_g2:var, kind: fixup_a64_movw_dtprel_g2
+
+// CHECK-ELF: # Relocation 0
+// CHECK-ELF-NEXT: (('r_offset', 0x0000000000000000)
+// CHECK-ELF-NEXT: ('r_sym', [[VARSYM:0x[0-9a-f]+]])
+// CHECK-ELF-NEXT: ('r_type', 0x0000020b)
+// CHECK-ELF: # Relocation 1
+// CHECK-ELF-NEXT: (('r_offset', 0x0000000000000004)
+// CHECK-ELF-NEXT: ('r_sym', [[VARSYM]])
+// CHECK-ELF-NEXT: ('r_type', 0x0000020b)
+// CHECK-ELF: # Relocation 2
+// CHECK-ELF-NEXT: (('r_offset', 0x0000000000000008)
+// CHECK-ELF-NEXT: ('r_sym', [[VARSYM]])
+// CHECK-ELF-NEXT: ('r_type', 0x0000020b)
+// CHECK-ELF: # Relocation 3
+// CHECK-ELF-NEXT: (('r_offset', 0x000000000000000c)
+// CHECK-ELF-NEXT: ('r_sym', [[VARSYM]])
+// CHECK-ELF-NEXT: ('r_type', 0x0000020b)
+
+// CHECK-ELF-NAMES: 0 R_AARCH64_TLSLD_MOVW_DTPREL_G2
+// CHECK-ELF-NAMES: 4 R_AARCH64_TLSLD_MOVW_DTPREL_G2
+// CHECK-ELF-NAMES: 8 R_AARCH64_TLSLD_MOVW_DTPREL_G2
+// CHECK-ELF-NAMES: 12 R_AARCH64_TLSLD_MOVW_DTPREL_G2
+
+ movz x5, #:dtprel_g1:var
+ movn x6, #:dtprel_g1:var
+ movz w7, #:dtprel_g1:var
+ movn w8, #:dtprel_g1:var
+// CHECK: movz x5, #:dtprel_g1:var // encoding: [0x05'A',A,0xa0'A',0x92'A']
+// CHECK-NEXT: // fixup A - offset: 0, value: :dtprel_g1:var, kind: fixup_a64_movw_dtprel_g1
+// CHECK-NEXT: movn x6, #:dtprel_g1:var // encoding: [0x06'A',A,0xa0'A',0x92'A']
+// CHECK-NEXT: // fixup A - offset: 0, value: :dtprel_g1:var, kind: fixup_a64_movw_dtprel_g1
+// CHECK-NEXT: movz w7, #:dtprel_g1:var // encoding: [0x07'A',A,0xa0'A',0x12'A']
+// CHECK-NEXT: // fixup A - offset: 0, value: :dtprel_g1:var, kind: fixup_a64_movw_dtprel_g1
+// CHECK-NEXT: movn w8, #:dtprel_g1:var // encoding: [0x08'A',A,0xa0'A',0x12'A']
+// CHECK-NEXT: // fixup A - offset: 0, value: :dtprel_g1:var, kind: fixup_a64_movw_dtprel_g1
+
+// CHECK-ELF: # Relocation 4
+// CHECK-ELF-NEXT: (('r_offset', 0x0000000000000010)
+// CHECK-ELF-NEXT: ('r_sym', [[VARSYM]])
+// CHECK-ELF-NEXT: ('r_type', 0x0000020c)
+// CHECK-ELF: # Relocation 5
+// CHECK-ELF-NEXT: (('r_offset', 0x0000000000000014)
+// CHECK-ELF-NEXT: ('r_sym', [[VARSYM]])
+// CHECK-ELF-NEXT: ('r_type', 0x0000020c)
+// CHECK-ELF: # Relocation 6
+// CHECK-ELF-NEXT: (('r_offset', 0x0000000000000018)
+// CHECK-ELF-NEXT: ('r_sym', [[VARSYM]])
+// CHECK-ELF-NEXT: ('r_type', 0x0000020c)
+// CHECK-ELF: # Relocation 7
+// CHECK-ELF-NEXT: (('r_offset', 0x000000000000001c)
+// CHECK-ELF-NEXT: ('r_sym', [[VARSYM]])
+// CHECK-ELF-NEXT: ('r_type', 0x0000020c)
+
+// CHECK-ELF-NAMES: 16 R_AARCH64_TLSLD_MOVW_DTPREL_G1
+// CHECK-ELF-NAMES: 20 R_AARCH64_TLSLD_MOVW_DTPREL_G1
+// CHECK-ELF-NAMES: 24 R_AARCH64_TLSLD_MOVW_DTPREL_G1
+// CHECK-ELF-NAMES: 28 R_AARCH64_TLSLD_MOVW_DTPREL_G1
+
+ movk x9, #:dtprel_g1_nc:var
+ movk w10, #:dtprel_g1_nc:var
+// CHECK: movk x9, #:dtprel_g1_nc:var // encoding: [0x09'A',A,0xa0'A',0xf2'A']
+// CHECK-NEXT: // fixup A - offset: 0, value: :dtprel_g1_nc:var, kind: fixup_a64_movw_dtprel_g1_nc
+// CHECK-NEXT: movk w10, #:dtprel_g1_nc:var // encoding: [0x0a'A',A,0xa0'A',0x72'A']
+// CHECK-NEXT: // fixup A - offset: 0, value: :dtprel_g1_nc:var, kind: fixup_a64_movw_dtprel_g1_nc
+
+// CHECK-ELF: # Relocation 8
+// CHECK-ELF-NEXT: (('r_offset', 0x0000000000000020)
+// CHECK-ELF-NEXT: ('r_sym', [[VARSYM]])
+// CHECK-ELF-NEXT: ('r_type', 0x0000020d)
+// CHECK-ELF: # Relocation 9
+// CHECK-ELF-NEXT: (('r_offset', 0x0000000000000024)
+// CHECK-ELF-NEXT: ('r_sym', [[VARSYM]])
+// CHECK-ELF-NEXT: ('r_type', 0x0000020d)
+
+// CHECK-ELF-NAMES: 32 R_AARCH64_TLSLD_MOVW_DTPREL_G1_NC
+// CHECK-ELF-NAMES: 36 R_AARCH64_TLSLD_MOVW_DTPREL_G1_NC
+
+ movz x11, #:dtprel_g0:var
+ movn x12, #:dtprel_g0:var
+ movz w13, #:dtprel_g0:var
+ movn w14, #:dtprel_g0:var
+// CHECK: movz x11, #:dtprel_g0:var // encoding: [0x0b'A',A,0x80'A',0x92'A']
+// CHECK-NEXT: // fixup A - offset: 0, value: :dtprel_g0:var, kind: fixup_a64_movw_dtprel_g0
+// CHECK-NEXT: movn x12, #:dtprel_g0:var // encoding: [0x0c'A',A,0x80'A',0x92'A']
+// CHECK-NEXT: // fixup A - offset: 0, value: :dtprel_g0:var, kind: fixup_a64_movw_dtprel_g0
+// CHECK-NEXT: movz w13, #:dtprel_g0:var // encoding: [0x0d'A',A,0x80'A',0x12'A']
+// CHECK-NEXT: // fixup A - offset: 0, value: :dtprel_g0:var, kind: fixup_a64_movw_dtprel_g0
+// CHECK-NEXT: movn w14, #:dtprel_g0:var // encoding: [0x0e'A',A,0x80'A',0x12'A']
+// CHECK-NEXT: // fixup A - offset: 0, value: :dtprel_g0:var, kind: fixup_a64_movw_dtprel_g0
+
+// CHECK-ELF: # Relocation 10
+// CHECK-ELF-NEXT: (('r_offset', 0x0000000000000028)
+// CHECK-ELF-NEXT: ('r_sym', [[VARSYM]])
+// CHECK-ELF-NEXT: ('r_type', 0x0000020e)
+// CHECK-ELF: # Relocation 11
+// CHECK-ELF-NEXT: (('r_offset', 0x000000000000002c)
+// CHECK-ELF-NEXT: ('r_sym', [[VARSYM]])
+// CHECK-ELF-NEXT: ('r_type', 0x0000020e)
+// CHECK-ELF: # Relocation 12
+// CHECK-ELF-NEXT: (('r_offset', 0x0000000000000030)
+// CHECK-ELF-NEXT: ('r_sym', [[VARSYM]])
+// CHECK-ELF-NEXT: ('r_type', 0x0000020e)
+// CHECK-ELF: # Relocation 13
+// CHECK-ELF-NEXT: (('r_offset', 0x0000000000000034)
+// CHECK-ELF-NEXT: ('r_sym', [[VARSYM]])
+// CHECK-ELF-NEXT: ('r_type', 0x0000020e)
+
+// CHECK-ELF-NAMES: 40 R_AARCH64_TLSLD_MOVW_DTPREL_G0
+// CHECK-ELF-NAMES: 44 R_AARCH64_TLSLD_MOVW_DTPREL_G0
+// CHECK-ELF-NAMES: 48 R_AARCH64_TLSLD_MOVW_DTPREL_G0
+// CHECK-ELF-NAMES: 52 R_AARCH64_TLSLD_MOVW_DTPREL_G0
+
+
+ movk x15, #:dtprel_g0_nc:var
+ movk w16, #:dtprel_g0_nc:var
+// CHECK: movk x15, #:dtprel_g0_nc:var // encoding: [0x0f'A',A,0x80'A',0xf2'A']
+// CHECK-NEXT: // fixup A - offset: 0, value: :dtprel_g0_nc:var, kind: fixup_a64_movw_dtprel_g0_nc
+// CHECK-NEXT: movk w16, #:dtprel_g0_nc:var // encoding: [0x10'A',A,0x80'A',0x72'A']
+// CHECK-NEXT: // fixup A - offset: 0, value: :dtprel_g0_nc:var, kind: fixup_a64_movw_dtprel_g0_nc
+
+// CHECK-ELF: # Relocation 14
+// CHECK-ELF-NEXT: (('r_offset', 0x0000000000000038)
+// CHECK-ELF-NEXT: ('r_sym', [[VARSYM]])
+// CHECK-ELF-NEXT: ('r_type', 0x0000020f)
+// CHECK-ELF: # Relocation 15
+// CHECK-ELF-NEXT: (('r_offset', 0x000000000000003c)
+// CHECK-ELF-NEXT: ('r_sym', [[VARSYM]])
+// CHECK-ELF-NEXT: ('r_type', 0x0000020f)
+
+// CHECK-ELF-NAMES: 56 R_AARCH64_TLSLD_MOVW_DTPREL_G0_NC
+// CHECK-ELF-NAMES: 60 R_AARCH64_TLSLD_MOVW_DTPREL_G0_NC
+
+ add x17, x18, #:dtprel_hi12:var, lsl #12
+ add w19, w20, #:dtprel_hi12:var, lsl #12
+// CHECK: add x17, x18, #:dtprel_hi12:var, lsl #12 // encoding: [0x51'A',0x02'A',0x40'A',0x91'A']
+// CHECK-NEXT: // fixup A - offset: 0, value: :dtprel_hi12:var, kind: fixup_a64_add_dtprel_hi12
+// CHECK-NEXT: add w19, w20, #:dtprel_hi12:var, lsl #12 // encoding: [0x93'A',0x02'A',0x40'A',0x11'A']
+// CHECK-NEXT: // fixup A - offset: 0, value: :dtprel_hi12:var, kind: fixup_a64_add_dtprel_hi12
+
+// CHECK-ELF: # Relocation 16
+// CHECK-ELF-NEXT: (('r_offset', 0x0000000000000040)
+// CHECK-ELF-NEXT: ('r_sym', [[VARSYM]])
+// CHECK-ELF-NEXT: ('r_type', 0x00000210)
+// CHECK-ELF: # Relocation 17
+// CHECK-ELF-NEXT: (('r_offset', 0x0000000000000044)
+// CHECK-ELF-NEXT: ('r_sym', [[VARSYM]])
+// CHECK-ELF-NEXT: ('r_type', 0x00000210)
+
+// CHECK-ELF-NAMES: 64 R_AARCH64_TLSLD_ADD_DTPREL_HI12
+// CHECK-ELF-NAMES: 68 R_AARCH64_TLSLD_ADD_DTPREL_HI12
+
+
+ add x21, x22, #:dtprel_lo12:var
+ add w23, w24, #:dtprel_lo12:var
+// CHECK: add x21, x22, #:dtprel_lo12:var // encoding: [0xd5'A',0x02'A',A,0x91'A']
+// CHECK-NEXT: // fixup A - offset: 0, value: :dtprel_lo12:var, kind: fixup_a64_add_dtprel_lo12
+// CHECK-NEXT: add w23, w24, #:dtprel_lo12:var // encoding: [0x17'A',0x03'A',A,0x11'A']
+// CHECK-NEXT: // fixup A - offset: 0, value: :dtprel_lo12:var, kind: fixup_a64_add_dtprel_lo12
+
+// CHECK-ELF: # Relocation 18
+// CHECK-ELF-NEXT: (('r_offset', 0x0000000000000048)
+// CHECK-ELF-NEXT: ('r_sym', [[VARSYM]])
+// CHECK-ELF-NEXT: ('r_type', 0x00000211)
+// CHECK-ELF: # Relocation 19
+// CHECK-ELF-NEXT: (('r_offset', 0x000000000000004c)
+// CHECK-ELF-NEXT: ('r_sym', [[VARSYM]])
+// CHECK-ELF-NEXT: ('r_type', 0x00000211)
+
+// CHECK-ELF-NAMES: 72 R_AARCH64_TLSLD_ADD_DTPREL_LO12
+// CHECK-ELF-NAMES: 76 R_AARCH64_TLSLD_ADD_DTPREL_LO12
+
+ add x25, x26, #:dtprel_lo12_nc:var
+ add w27, w28, #:dtprel_lo12_nc:var
+// CHECK: add x25, x26, #:dtprel_lo12_nc:var // encoding: [0x59'A',0x03'A',A,0x91'A']
+// CHECK-NEXT: // fixup A - offset: 0, value: :dtprel_lo12_nc:var, kind: fixup_a64_add_dtprel_lo12_nc
+// CHECK-NEXT: add w27, w28, #:dtprel_lo12_nc:var // encoding: [0x9b'A',0x03'A',A,0x11'A']
+// CHECK-NEXT: // fixup A - offset: 0, value: :dtprel_lo12_nc:var, kind: fixup_a64_add_dtprel_lo12_nc
+
+// CHECK-ELF: # Relocation 20
+// CHECK-ELF-NEXT: (('r_offset', 0x0000000000000050)
+// CHECK-ELF-NEXT: ('r_sym', [[VARSYM]])
+// CHECK-ELF-NEXT: ('r_type', 0x00000212)
+// CHECK-ELF: # Relocation 21
+// CHECK-ELF-NEXT: (('r_offset', 0x0000000000000054)
+// CHECK-ELF-NEXT: ('r_sym', [[VARSYM]])
+// CHECK-ELF-NEXT: ('r_type', 0x00000212)
+
+// CHECK-ELF-NAMES: 80 R_AARCH64_TLSLD_ADD_DTPREL_LO12_NC
+// CHECK-ELF-NAMES: 84 R_AARCH64_TLSLD_ADD_DTPREL_LO12_NC
+
+ ldrb w29, [x30, #:dtprel_lo12:var]
+ ldrsb x29, [x28, #:dtprel_lo12_nc:var]
+// CHECK: ldrb w29, [x30, #:dtprel_lo12:var] // encoding: [0xdd'A',0x03'A',0x40'A',0x39'A']
+// CHECK-NEXT: // fixup A - offset: 0, value: :dtprel_lo12:var, kind: fixup_a64_ldst8_dtprel_lo12
+// CHECK-NEXT: ldrsb x29, [x28, #:dtprel_lo12_nc:var] // encoding: [0x9d'A',0x03'A',0x80'A',0x39'A']
+// CHECK-NEXT: // fixup A - offset: 0, value: :dtprel_lo12_nc:var, kind: fixup_a64_ldst8_dtprel_lo12_nc
+
+// CHECK-ELF: # Relocation 22
+// CHECK-ELF-NEXT: (('r_offset', 0x0000000000000058)
+// CHECK-ELF-NEXT: ('r_sym', [[VARSYM]])
+// CHECK-ELF-NEXT: ('r_type', 0x00000213)
+// CHECK-ELF: # Relocation 23
+// CHECK-ELF-NEXT: (('r_offset', 0x000000000000005c)
+// CHECK-ELF-NEXT: ('r_sym', [[VARSYM]])
+// CHECK-ELF-NEXT: ('r_type', 0x00000214)
+
+// CHECK-ELF-NAMES: 88 R_AARCH64_TLSLD_LDST8_DTPREL_LO12
+// CHECK-ELF-NAMES: 92 R_AARCH64_TLSLD_LDST8_DTPREL_LO12_NC
+
+ strh w27, [x26, #:dtprel_lo12:var]
+ ldrsh x25, [x24, #:dtprel_lo12_nc:var]
+// CHECK: strh w27, [x26, #:dtprel_lo12:var] // encoding: [0x5b'A',0x03'A',A,0x79'A']
+// CHECK-NEXT: // fixup A - offset: 0, value: :dtprel_lo12:var, kind: fixup_a64_ldst16_dtprel_lo12
+// CHECK-NEXT: ldrsh x25, [x24, #:dtprel_lo12_nc:var] // encoding: [0x19'A',0x03'A',0x80'A',0x79'A']
+// CHECK-NEXT: // fixup A - offset: 0, value: :dtprel_lo12_nc:var, kind: fixup_a64_ldst16_dtprel_lo12_nc
+
+// CHECK-ELF: # Relocation 24
+// CHECK-ELF-NEXT: (('r_offset', 0x0000000000000060)
+// CHECK-ELF-NEXT: ('r_sym', [[VARSYM]])
+// CHECK-ELF-NEXT: ('r_type', 0x00000215)
+// CHECK-ELF: # Relocation 25
+// CHECK-ELF-NEXT: (('r_offset', 0x0000000000000064)
+// CHECK-ELF-NEXT: ('r_sym', [[VARSYM]])
+// CHECK-ELF-NEXT: ('r_type', 0x00000216)
+
+// CHECK-ELF-NAMES: 96 R_AARCH64_TLSLD_LDST16_DTPREL_LO12
+// CHECK-ELF-NAMES: 100 R_AARCH64_TLSLD_LDST16_DTPREL_LO12_NC
+
+ ldr w23, [x22, #:dtprel_lo12:var]
+ ldrsw x21, [x20, #:dtprel_lo12_nc:var]
+// CHECK: ldr w23, [x22, #:dtprel_lo12:var] // encoding: [0xd7'A',0x02'A',0x40'A',0xb9'A']
+// CHECK-NEXT: // fixup A - offset: 0, value: :dtprel_lo12:var, kind: fixup_a64_ldst32_dtprel_lo12
+// CHECK-NEXT: ldrsw x21, [x20, #:dtprel_lo12_nc:var] // encoding: [0x95'A',0x02'A',0x80'A',0xb9'A']
+// CHECK-NEXT: // fixup A - offset: 0, value: :dtprel_lo12_nc:var, kind: fixup_a64_ldst32_dtprel_lo12_nc
+
+// CHECK-ELF: # Relocation 26
+// CHECK-ELF-NEXT:
(('r_offset', 0x0000000000000068) +// CHECK-ELF-NEXT: ('r_sym', [[VARSYM]]) +// CHECK-ELF-NEXT: ('r_type', 0x00000217) +// CHECK-ELF: # Relocation 27 +// CHECK-ELF-NEXT: (('r_offset', 0x000000000000006c) +// CHECK-ELF-NEXT: ('r_sym', [[VARSYM]]) +// CHECK-ELF-NEXT: ('r_type', 0x00000218) + +// CHECK-ELF-NAMES: 104 R_AARCH64_TLSLD_LDST32_DTPREL_LO12 +// CHECK-ELF-NAMES: 108 R_AARCH64_TLSLD_LDST32_DTPREL_LO12_NC + + ldr x19, [x18, #:dtprel_lo12:var] + str x17, [x16, #:dtprel_lo12_nc:var] +// CHECK: ldr x19, [x18, #:dtprel_lo12:var] // encoding: [0x53'A',0x02'A',0x40'A',0xf9'A'] +// CHECK-NEXT: // fixup A - offset: 0, value: :dtprel_lo12:var, kind: fixup_a64_ldst64_dtprel_lo12 +// CHECK-NEXT: str x17, [x16, #:dtprel_lo12_nc:var] // encoding: [0x11'A',0x02'A',A,0xf9'A'] +// CHECK-NEXT: // fixup A - offset: 0, value: :dtprel_lo12_nc:var, kind: fixup_a64_ldst64_dtprel_lo12_nc + + +// CHECK-ELF: # Relocation 28 +// CHECK-ELF-NEXT: (('r_offset', 0x0000000000000070) +// CHECK-ELF-NEXT: ('r_sym', [[VARSYM]]) +// CHECK-ELF-NEXT: ('r_type', 0x00000219) +// CHECK-ELF: # Relocation 29 +// CHECK-ELF-NEXT: (('r_offset', 0x0000000000000074) +// CHECK-ELF-NEXT: ('r_sym', [[VARSYM]]) +// CHECK-ELF-NEXT: ('r_type', 0x0000021a) + +// CHECK-ELF-NAMES: 112 R_AARCH64_TLSLD_LDST64_DTPREL_LO12 +// CHECK-ELF-NAMES: 116 R_AARCH64_TLSLD_LDST64_DTPREL_LO12_NC + + // TLS initial-exec forms + movz x15, #:gottprel_g1:var + movz w14, #:gottprel_g1:var +// CHECK: movz x15, #:gottprel_g1:var // encoding: [0x0f'A',A,0xa0'A',0x92'A'] +// CHECK-NEXT: // fixup A - offset: 0, value: :gottprel_g1:var, kind: fixup_a64_movw_gottprel_g1 +// CHECK-NEXT: movz w14, #:gottprel_g1:var // encoding: [0x0e'A',A,0xa0'A',0x12'A'] +// CHECK-NEXT: // fixup A - offset: 0, value: :gottprel_g1:var, kind: fixup_a64_movw_gottprel_g1 + +// CHECK-ELF: # Relocation 30 +// CHECK-ELF-NEXT: (('r_offset', 0x0000000000000078) +// CHECK-ELF-NEXT: ('r_sym', [[VARSYM]]) +// CHECK-ELF-NEXT: ('r_type', 0x0000021b) +// CHECK-ELF: # Relocation 31 +// CHECK-ELF-NEXT: (('r_offset', 0x000000000000007c) +// CHECK-ELF-NEXT: ('r_sym', [[VARSYM]]) +// CHECK-ELF-NEXT: ('r_type', 0x0000021b) + +// CHECK-ELF-NAMES: 120 R_AARCH64_TLSIE_MOVW_GOTTPREL_G1 +// CHECK-ELF-NAMES: 124 R_AARCH64_TLSIE_MOVW_GOTTPREL_G1 + + movk x13, #:gottprel_g0_nc:var + movk w12, #:gottprel_g0_nc:var +// CHECK: movk x13, #:gottprel_g0_nc:var // encoding: [0x0d'A',A,0x80'A',0xf2'A'] +// CHECK-NEXT: // fixup A - offset: 0, value: :gottprel_g0_nc:var, kind: fixup_a64_movw_gottprel_g0_nc +// CHECK-NEXT: movk w12, #:gottprel_g0_nc:var // encoding: [0x0c'A',A,0x80'A',0x72'A'] +// CHECK-NEXT: // fixup A - offset: 0, value: :gottprel_g0_nc:var, kind: fixup_a64_movw_gottprel_g0_nc + +// CHECK-ELF: # Relocation 32 +// CHECK-ELF-NEXT: (('r_offset', 0x0000000000000080) +// CHECK-ELF-NEXT: ('r_sym', [[VARSYM]]) +// CHECK-ELF-NEXT: ('r_type', 0x0000021c) +// CHECK-ELF: # Relocation 33 +// CHECK-ELF-NEXT: (('r_offset', 0x0000000000000084) +// CHECK-ELF-NEXT: ('r_sym', [[VARSYM]]) +// CHECK-ELF-NEXT: ('r_type', 0x0000021c) + +// CHECK-ELF-NAMES: 128 R_AARCH64_TLSIE_MOVW_GOTTPREL_G0_NC +// CHECK-ELF-NAMES: 132 R_AARCH64_TLSIE_MOVW_GOTTPREL_G0_NC + + adrp x11, :gottprel:var + ldr x10, [x0, #:gottprel_lo12:var] + ldr x9, :gottprel:var +// CHECK: adrp x11, :gottprel:var // encoding: [0x0b'A',A,A,0x90'A'] +// CHECK-NEXT: // fixup A - offset: 0, value: :gottprel:var, kind: fixup_a64_adr_gottprel_page +// CHECK-NEXT: ldr x10, [x0, #:gottprel_lo12:var] // encoding: [0x0a'A',A,0x40'A',0xf9'A'] +// CHECK-NEXT: // fixup A - 
offset: 0, value: :gottprel_lo12:var, kind: fixup_a64_ld64_gottprel_lo12_nc +// CHECK-NEXT: ldr x9, :gottprel:var // encoding: [0x09'A',A,A,0x58'A'] +// CHECK-NEXT: // fixup A - offset: 0, value: :gottprel:var, kind: fixup_a64_ld_gottprel_prel19 + +// CHECK-ELF: # Relocation 34 +// CHECK-ELF-NEXT: (('r_offset', 0x0000000000000088) +// CHECK-ELF-NEXT: ('r_sym', [[VARSYM]]) +// CHECK-ELF-NEXT: ('r_type', 0x0000021d) +// CHECK-ELF: # Relocation 35 +// CHECK-ELF-NEXT: (('r_offset', 0x000000000000008c) +// CHECK-ELF-NEXT: ('r_sym', [[VARSYM]]) +// CHECK-ELF-NEXT: ('r_type', 0x0000021e) +// CHECK-ELF: # Relocation 36 +// CHECK-ELF-NEXT: (('r_offset', 0x0000000000000090) +// CHECK-ELF-NEXT: ('r_sym', [[VARSYM]]) +// CHECK-ELF-NEXT: ('r_type', 0x0000021f) + +// CHECK-ELF-NAMES: 136 R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE +// CHECK-ELF-NAMES: 140 R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC +// CHECK-ELF-NAMES: 144 R_AARCH64_TLSIE_LD_GOTTPREL_PREL19 + + // TLS local-exec forms + movz x3, #:tprel_g2:var + movn x4, #:tprel_g2:var +// CHECK: movz x3, #:tprel_g2:var // encoding: [0x03'A',A,0xc0'A',0x92'A'] +// CHECK-NEXT: // fixup A - offset: 0, value: :tprel_g2:var, kind: fixup_a64_movw_tprel_g2 +// CHECK-NEXT: movn x4, #:tprel_g2:var // encoding: [0x04'A',A,0xc0'A',0x92'A'] +// CHECK-NEXT: // fixup A - offset: 0, value: :tprel_g2:var, kind: fixup_a64_movw_tprel_g2 + +// CHECK-ELF: # Relocation 37 +// CHECK-ELF-NEXT: (('r_offset', 0x0000000000000094) +// CHECK-ELF-NEXT: ('r_sym', [[VARSYM]]) +// CHECK-ELF-NEXT: ('r_type', 0x00000220) +// CHECK-ELF: # Relocation 38 +// CHECK-ELF-NEXT: (('r_offset', 0x0000000000000098) +// CHECK-ELF-NEXT: ('r_sym', [[VARSYM]]) +// CHECK-ELF-NEXT: ('r_type', 0x00000220) + +// CHECK-ELF-NAMES: 148 R_AARCH64_TLSLE_MOVW_TPREL_G2 +// CHECK-ELF-NAMES: 152 R_AARCH64_TLSLE_MOVW_TPREL_G2 + + movz x5, #:tprel_g1:var + movn x6, #:tprel_g1:var + movz w7, #:tprel_g1:var + movn w8, #:tprel_g1:var +// CHECK: movz x5, #:tprel_g1:var // encoding: [0x05'A',A,0xa0'A',0x92'A'] +// CHECK-NEXT: // fixup A - offset: 0, value: :tprel_g1:var, kind: fixup_a64_movw_tprel_g1 +// CHECK-NEXT: movn x6, #:tprel_g1:var // encoding: [0x06'A',A,0xa0'A',0x92'A'] +// CHECK-NEXT: // fixup A - offset: 0, value: :tprel_g1:var, kind: fixup_a64_movw_tprel_g1 +// CHECK-NEXT: movz w7, #:tprel_g1:var // encoding: [0x07'A',A,0xa0'A',0x12'A'] +// CHECK-NEXT: // fixup A - offset: 0, value: :tprel_g1:var, kind: fixup_a64_movw_tprel_g1 +// CHECK-NEXT: movn w8, #:tprel_g1:var // encoding: [0x08'A',A,0xa0'A',0x12'A'] +// CHECK-NEXT: // fixup A - offset: 0, value: :tprel_g1:var, kind: fixup_a64_movw_tprel_g1 + +// CHECK-ELF: # Relocation 39 +// CHECK-ELF-NEXT: (('r_offset', 0x000000000000009c) +// CHECK-ELF-NEXT: ('r_sym', [[VARSYM]]) +// CHECK-ELF-NEXT: ('r_type', 0x00000221) +// CHECK-ELF: # Relocation 40 +// CHECK-ELF-NEXT: (('r_offset', 0x00000000000000a0) +// CHECK-ELF-NEXT: ('r_sym', [[VARSYM]]) +// CHECK-ELF-NEXT: ('r_type', 0x00000221) +// CHECK-ELF: # Relocation 41 +// CHECK-ELF-NEXT: (('r_offset', 0x00000000000000a4) +// CHECK-ELF-NEXT: ('r_sym', [[VARSYM]]) +// CHECK-ELF-NEXT: ('r_type', 0x00000221) +// CHECK-ELF: # Relocation 42 +// CHECK-ELF-NEXT: (('r_offset', 0x00000000000000a8) +// CHECK-ELF-NEXT: ('r_sym', [[VARSYM]]) +// CHECK-ELF-NEXT: ('r_type', 0x00000221) + +// CHECK-ELF-NAMES: 156 R_AARCH64_TLSLE_MOVW_TPREL_G1 +// CHECK-ELF-NAMES: 160 R_AARCH64_TLSLE_MOVW_TPREL_G1 +// CHECK-ELF-NAMES: 164 R_AARCH64_TLSLE_MOVW_TPREL_G1 +// CHECK-ELF-NAMES: 168 R_AARCH64_TLSLE_MOVW_TPREL_G1 + + movk x9, #:tprel_g1_nc:var + movk 
w10, #:tprel_g1_nc:var +// CHECK: movk x9, #:tprel_g1_nc:var // encoding: [0x09'A',A,0xa0'A',0xf2'A'] +// CHECK-NEXT: // fixup A - offset: 0, value: :tprel_g1_nc:var, kind: fixup_a64_movw_tprel_g1_nc +// CHECK-NEXT: movk w10, #:tprel_g1_nc:var // encoding: [0x0a'A',A,0xa0'A',0x72'A'] +// CHECK-NEXT: // fixup A - offset: 0, value: :tprel_g1_nc:var, kind: fixup_a64_movw_tprel_g1_nc + +// CHECK-ELF: # Relocation 43 +// CHECK-ELF-NEXT: (('r_offset', 0x00000000000000ac) +// CHECK-ELF-NEXT: ('r_sym', [[VARSYM]]) +// CHECK-ELF-NEXT: ('r_type', 0x00000222) +// CHECK-ELF: # Relocation 44 +// CHECK-ELF-NEXT: (('r_offset', 0x00000000000000b0) +// CHECK-ELF-NEXT: ('r_sym', [[VARSYM]]) +// CHECK-ELF-NEXT: ('r_type', 0x00000222) + +// CHECK-ELF-NAMES: 172 R_AARCH64_TLSLE_MOVW_TPREL_G1_NC +// CHECK-ELF-NAMES: 176 R_AARCH64_TLSLE_MOVW_TPREL_G1_NC + + movz x11, #:tprel_g0:var + movn x12, #:tprel_g0:var + movz w13, #:tprel_g0:var + movn w14, #:tprel_g0:var +// CHECK: movz x11, #:tprel_g0:var // encoding: [0x0b'A',A,0x80'A',0x92'A'] +// CHECK-NEXT: // fixup A - offset: 0, value: :tprel_g0:var, kind: fixup_a64_movw_tprel_g0 +// CHECK-NEXT: movn x12, #:tprel_g0:var // encoding: [0x0c'A',A,0x80'A',0x92'A'] +// CHECK-NEXT: // fixup A - offset: 0, value: :tprel_g0:var, kind: fixup_a64_movw_tprel_g0 +// CHECK-NEXT: movz w13, #:tprel_g0:var // encoding: [0x0d'A',A,0x80'A',0x12'A'] +// CHECK-NEXT: // fixup A - offset: 0, value: :tprel_g0:var, kind: fixup_a64_movw_tprel_g0 +// CHECK-NEXT: movn w14, #:tprel_g0:var // encoding: [0x0e'A',A,0x80'A',0x12'A'] +// CHECK-NEXT: // fixup A - offset: 0, value: :tprel_g0:var, kind: fixup_a64_movw_tprel_g0 + +// CHECK-ELF: # Relocation 45 +// CHECK-ELF-NEXT: (('r_offset', 0x00000000000000b4) +// CHECK-ELF-NEXT: ('r_sym', [[VARSYM]]) +// CHECK-ELF-NEXT: ('r_type', 0x00000223) +// CHECK-ELF: # Relocation 46 +// CHECK-ELF-NEXT: (('r_offset', 0x00000000000000b8) +// CHECK-ELF-NEXT: ('r_sym', [[VARSYM]]) +// CHECK-ELF-NEXT: ('r_type', 0x00000223) +// CHECK-ELF: # Relocation 47 +// CHECK-ELF-NEXT: (('r_offset', 0x00000000000000bc) +// CHECK-ELF-NEXT: ('r_sym', [[VARSYM]]) +// CHECK-ELF-NEXT: ('r_type', 0x00000223) +// CHECK-ELF: # Relocation 48 +// CHECK-ELF-NEXT: (('r_offset', 0x00000000000000c0) +// CHECK-ELF-NEXT: ('r_sym', [[VARSYM]]) +// CHECK-ELF-NEXT: ('r_type', 0x00000223) + +// CHECK-ELF-NAMES: 180 R_AARCH64_TLSLE_MOVW_TPREL_G0 +// CHECK-ELF-NAMES: 184 R_AARCH64_TLSLE_MOVW_TPREL_G0 +// CHECK-ELF-NAMES: 188 R_AARCH64_TLSLE_MOVW_TPREL_G0 +// CHECK-ELF-NAMES: 192 R_AARCH64_TLSLE_MOVW_TPREL_G0 + + movk x15, #:tprel_g0_nc:var + movk w16, #:tprel_g0_nc:var +// CHECK: movk x15, #:tprel_g0_nc:var // encoding: [0x0f'A',A,0x80'A',0xf2'A'] +// CHECK-NEXT: // fixup A - offset: 0, value: :tprel_g0_nc:var, kind: fixup_a64_movw_tprel_g0_nc +// CHECK-NEXT: movk w16, #:tprel_g0_nc:var // encoding: [0x10'A',A,0x80'A',0x72'A'] +// CHECK-NEXT: // fixup A - offset: 0, value: :tprel_g0_nc:var, kind: fixup_a64_movw_tprel_g0_nc + +// CHECK-ELF: # Relocation 49 +// CHECK-ELF-NEXT: (('r_offset', 0x00000000000000c4) +// CHECK-ELF-NEXT: ('r_sym', [[VARSYM]]) +// CHECK-ELF-NEXT: ('r_type', 0x00000224) +// CHECK-ELF: # Relocation 50 +// CHECK-ELF-NEXT: (('r_offset', 0x00000000000000c8) +// CHECK-ELF-NEXT: ('r_sym', [[VARSYM]]) +// CHECK-ELF-NEXT: ('r_type', 0x00000224) + +// CHECK-ELF-NAMES: 196 R_AARCH64_TLSLE_MOVW_TPREL_G0_NC +// CHECK-ELF-NAMES: 200 R_AARCH64_TLSLE_MOVW_TPREL_G0_NC + + add x17, x18, #:tprel_hi12:var, lsl #12 + add w19, w20, #:tprel_hi12:var, lsl #12 +// CHECK: add x17, x18, 
#:tprel_hi12:var, lsl #12 // encoding: [0x51'A',0x02'A',0x40'A',0x91'A'] +// CHECK-NEXT: // fixup A - offset: 0, value: :tprel_hi12:var, kind: fixup_a64_add_tprel_hi12 +// CHECK-NEXT: add w19, w20, #:tprel_hi12:var, lsl #12 // encoding: [0x93'A',0x02'A',0x40'A',0x11'A'] +// CHECK-NEXT: // fixup A - offset: 0, value: :tprel_hi12:var, kind: fixup_a64_add_tprel_hi12 + +// CHECK-ELF: # Relocation 51 +// CHECK-ELF-NEXT: (('r_offset', 0x00000000000000cc) +// CHECK-ELF-NEXT: ('r_sym', [[VARSYM]]) +// CHECK-ELF-NEXT: ('r_type', 0x00000225) +// CHECK-ELF: # Relocation 52 +// CHECK-ELF-NEXT: (('r_offset', 0x00000000000000d0) +// CHECK-ELF-NEXT: ('r_sym', [[VARSYM]]) +// CHECK-ELF-NEXT: ('r_type', 0x00000225) + +// CHECK-ELF-NAMES: 204 R_AARCH64_TLSLE_ADD_TPREL_HI12 +// CHECK-ELF-NAMES: 208 R_AARCH64_TLSLE_ADD_TPREL_HI12 + + add x21, x22, #:tprel_lo12:var + add w23, w24, #:tprel_lo12:var +// CHECK: add x21, x22, #:tprel_lo12:var // encoding: [0xd5'A',0x02'A',A,0x91'A'] +// CHECK-NEXT: // fixup A - offset: 0, value: :tprel_lo12:var, kind: fixup_a64_add_tprel_lo12 +// CHECK-NEXT: add w23, w24, #:tprel_lo12:var // encoding: [0x17'A',0x03'A',A,0x11'A'] +// CHECK-NEXT: // fixup A - offset: 0, value: :tprel_lo12:var, kind: fixup_a64_add_tprel_lo12 + +// CHECK-ELF: # Relocation 53 +// CHECK-ELF-NEXT: (('r_offset', 0x00000000000000d4) +// CHECK-ELF-NEXT: ('r_sym', [[VARSYM]]) +// CHECK-ELF-NEXT: ('r_type', 0x00000226) +// CHECK-ELF: # Relocation 54 +// CHECK-ELF-NEXT: (('r_offset', 0x00000000000000d8) +// CHECK-ELF-NEXT: ('r_sym', [[VARSYM]]) +// CHECK-ELF-NEXT: ('r_type', 0x00000226) + +// CHECK-ELF-NAMES: 212 R_AARCH64_TLSLE_ADD_TPREL_LO12 +// CHECK-ELF-NAMES: 216 R_AARCH64_TLSLE_ADD_TPREL_LO12 + + add x25, x26, #:tprel_lo12_nc:var + add w27, w28, #:tprel_lo12_nc:var +// CHECK: add x25, x26, #:tprel_lo12_nc:var // encoding: [0x59'A',0x03'A',A,0x91'A'] +// CHECK-NEXT: // fixup A - offset: 0, value: :tprel_lo12_nc:var, kind: fixup_a64_add_tprel_lo12_nc +// CHECK-NEXT: add w27, w28, #:tprel_lo12_nc:var // encoding: [0x9b'A',0x03'A',A,0x11'A'] +// CHECK-NEXT: // fixup A - offset: 0, value: :tprel_lo12_nc:var, kind: fixup_a64_add_tprel_lo12_nc + +// CHECK-ELF: # Relocation 55 +// CHECK-ELF-NEXT: (('r_offset', 0x00000000000000dc) +// CHECK-ELF-NEXT: ('r_sym', [[VARSYM]]) +// CHECK-ELF-NEXT: ('r_type', 0x00000227) +// CHECK-ELF: # Relocation 56 +// CHECK-ELF-NEXT: (('r_offset', 0x00000000000000e0) +// CHECK-ELF-NEXT: ('r_sym', [[VARSYM]]) +// CHECK-ELF-NEXT: ('r_type', 0x00000227) + + +// CHECK-ELF-NAMES: 220 R_AARCH64_TLSLE_ADD_TPREL_LO12_NC +// CHECK-ELF-NAMES: 224 R_AARCH64_TLSLE_ADD_TPREL_LO12_NC + + ldrb w29, [x30, #:tprel_lo12:var] + ldrsb x29, [x28, #:tprel_lo12_nc:var] +// CHECK: ldrb w29, [x30, #:tprel_lo12:var] // encoding: [0xdd'A',0x03'A',0x40'A',0x39'A'] +// CHECK-NEXT: // fixup A - offset: 0, value: :tprel_lo12:var, kind: fixup_a64_ldst8_tprel_lo12 +// CHECK-NEXT: ldrsb x29, [x28, #:tprel_lo12_nc:var] // encoding: [0x9d'A',0x03'A',0x80'A',0x39'A'] +// CHECK-NEXT: // fixup A - offset: 0, value: :tprel_lo12_nc:var, kind: fixup_a64_ldst8_tprel_lo12_nc + +// CHECK-ELF: # Relocation 57 +// CHECK-ELF-NEXT: (('r_offset', 0x00000000000000e4) +// CHECK-ELF-NEXT: ('r_sym', [[VARSYM]]) +// CHECK-ELF-NEXT: ('r_type', 0x00000228) +// CHECK-ELF: # Relocation 58 +// CHECK-ELF-NEXT: (('r_offset', 0x00000000000000e8) +// CHECK-ELF-NEXT: ('r_sym', [[VARSYM]]) +// CHECK-ELF-NEXT: ('r_type', 0x00000229) + +// CHECK-ELF-NAMES: 228 R_AARCH64_TLSLE_LDST8_TPREL_LO12 +// CHECK-ELF-NAMES: 232 
R_AARCH64_TLSLE_LDST8_TPREL_LO12_NC + + strh w27, [x26, #:tprel_lo12:var] + ldrsh x25, [x24, #:tprel_lo12_nc:var] +// CHECK: strh w27, [x26, #:tprel_lo12:var] // encoding: [0x5b'A',0x03'A',A,0x79'A'] +// CHECK-NEXT: // fixup A - offset: 0, value: :tprel_lo12:var, kind: fixup_a64_ldst16_tprel_lo12 +// CHECK-NEXT: ldrsh x25, [x24, #:tprel_lo12_nc:var] // encoding: [0x19'A',0x03'A',0x80'A',0x79'A'] +// CHECK-NEXT: // fixup A - offset: 0, value: :tprel_lo12_nc:var, kind: fixup_a64_ldst16_tprel_lo12_n + +// CHECK-ELF: # Relocation 59 +// CHECK-ELF-NEXT: (('r_offset', 0x00000000000000ec) +// CHECK-ELF-NEXT: ('r_sym', [[VARSYM]]) +// CHECK-ELF-NEXT: ('r_type', 0x0000022a) +// CHECK-ELF: # Relocation 60 +// CHECK-ELF-NEXT: (('r_offset', 0x00000000000000f0) +// CHECK-ELF-NEXT: ('r_sym', [[VARSYM]]) +// CHECK-ELF-NEXT: ('r_type', 0x0000022b) + +// CHECK-ELF-NAMES: 236 R_AARCH64_TLSLE_LDST16_TPREL_LO12 +// CHECK-ELF-NAMES: 240 R_AARCH64_TLSLE_LDST16_TPREL_LO12_NC + + ldr w23, [x22, #:tprel_lo12:var] + ldrsw x21, [x20, #:tprel_lo12_nc:var] +// CHECK: ldr w23, [x22, #:tprel_lo12:var] // encoding: [0xd7'A',0x02'A',0x40'A',0xb9'A'] +// CHECK-NEXT: // fixup A - offset: 0, value: :tprel_lo12:var, kind: fixup_a64_ldst32_tprel_lo12 +// CHECK-NEXT: ldrsw x21, [x20, #:tprel_lo12_nc:var] // encoding: [0x95'A',0x02'A',0x80'A',0xb9'A'] +// CHECK-NEXT: // fixup A - offset: 0, value: :tprel_lo12_nc:var, kind: fixup_a64_ldst32_tprel_lo12_n + +// CHECK-ELF: # Relocation 61 +// CHECK-ELF-NEXT: (('r_offset', 0x00000000000000f4) +// CHECK-ELF-NEXT: ('r_sym', [[VARSYM]]) +// CHECK-ELF-NEXT: ('r_type', 0x0000022c) +// CHECK-ELF: # Relocation 62 +// CHECK-ELF-NEXT: (('r_offset', 0x00000000000000f8) +// CHECK-ELF-NEXT: ('r_sym', [[VARSYM]]) +// CHECK-ELF-NEXT: ('r_type', 0x0000022d) + +// CHECK-ELF-NAMES: 244 R_AARCH64_TLSLE_LDST32_TPREL_LO12 +// CHECK-ELF-NAMES: 248 R_AARCH64_TLSLE_LDST32_TPREL_LO12_NC + + ldr x19, [x18, #:tprel_lo12:var] + str x17, [x16, #:tprel_lo12_nc:var] +// CHECK: ldr x19, [x18, #:tprel_lo12:var] // encoding: [0x53'A',0x02'A',0x40'A',0xf9'A'] +// CHECK-NEXT: // fixup A - offset: 0, value: :tprel_lo12:var, kind: fixup_a64_ldst64_tprel_lo12 +// CHECK-NEXT: str x17, [x16, #:tprel_lo12_nc:var] // encoding: [0x11'A',0x02'A',A,0xf9'A'] +// CHECK-NEXT: // fixup A - offset: 0, value: :tprel_lo12_nc:var, kind: fixup_a64_ldst64_tprel_lo12_nc + +// CHECK-ELF: # Relocation 63 +// CHECK-ELF-NEXT: (('r_offset', 0x00000000000000fc) +// CHECK-ELF-NEXT: ('r_sym', [[VARSYM]]) +// CHECK-ELF-NEXT: ('r_type', 0x0000022e) +// CHECK-ELF: # Relocation 64 +// CHECK-ELF-NEXT: (('r_offset', 0x0000000000000100) +// CHECK-ELF-NEXT: ('r_sym', [[VARSYM]]) +// CHECK-ELF-NEXT: ('r_type', 0x0000022f) + +// CHECK-ELF-NAMES: 252 R_AARCH64_TLSLE_LDST64_TPREL_LO12 +// CHECK-ELF-NAMES: 256 R_AARCH64_TLSLE_LDST64_TPREL_LO12_NC + + // TLS descriptor forms + adrp x8, :tlsdesc:var + ldr x7, [x6, :tlsdesc_lo12:var] + add x5, x4, #:tlsdesc_lo12:var + .tlsdesccall var + blr x3 + +// CHECK: adrp x8, :tlsdesc:var // encoding: [0x08'A',A,A,0x90'A'] +// CHECK-NEXT: // fixup A - offset: 0, value: :tlsdesc:var, kind: fixup_a64_tlsdesc_adr_page +// CHECK-NEXT: ldr x7, [x6, #:tlsdesc_lo12:var] // encoding: [0xc7'A',A,0x40'A',0xf9'A'] +// CHECK-NEXT: // fixup A - offset: 0, value: :tlsdesc_lo12:var, kind: fixup_a64_tlsdesc_ld64_lo12_nc +// CHECK-NEXT: add x5, x4, #:tlsdesc_lo12:var // encoding: [0x85'A',A,A,0x91'A'] +// CHECK-NEXT: // fixup A - offset: 0, value: :tlsdesc_lo12:var, kind: fixup_a64_tlsdesc_add_lo12_nc +// CHECK-NEXT: .tlsdesccall var // 
encoding: [] +// CHECK-NEXT: // fixup A - offset: 0, value: :tlsdesc:var, kind: fixup_a64_tlsdesc_call +// CHECK: blr x3 // encoding: [0x60,0x00,0x3f,0xd6] + + +// CHECK-ELF: # Relocation 65 +// CHECK-ELF-NEXT: (('r_offset', 0x0000000000000104) +// CHECK-ELF-NEXT: ('r_sym', [[VARSYM]]) +// CHECK-ELF-NEXT: ('r_type', 0x00000232) +// CHECK-ELF: # Relocation 66 +// CHECK-ELF-NEXT: (('r_offset', 0x0000000000000108) +// CHECK-ELF-NEXT: ('r_sym', [[VARSYM]]) +// CHECK-ELF-NEXT: ('r_type', 0x00000233) +// CHECK-ELF: # Relocation 67 +// CHECK-ELF-NEXT: (('r_offset', 0x000000000000010c) +// CHECK-ELF-NEXT: ('r_sym', [[VARSYM]]) +// CHECK-ELF-NEXT: ('r_type', 0x00000234) +// CHECK-ELF: # Relocation 68 +// CHECK-ELF-NEXT: (('r_offset', 0x0000000000000110) +// CHECK-ELF-NEXT: ('r_sym', [[VARSYM]]) +// CHECK-ELF-NEXT: ('r_type', 0x00000239) + +// CHECK-ELF-NAMES: 260 R_AARCH64_TLSDESC_ADR_PAGE +// CHECK-ELF-NAMES: 264 R_AARCH64_TLSDESC_LD64_LO12_NC +// CHECK-ELF-NAMES: 268 R_AARCH64_TLSDESC_ADD_LO12_NC +// CHECK-ELF-NAMES: 272 R_AARCH64_TLSDESC_CALL + + +// Make sure symbol 5 has type STT_TLS: + +// CHECK-ELF: # Symbol 5 +// CHECK-ELF-NEXT: (('st_name', 0x00000006) # 'var' +// CHECK-ELF-NEXT: ('st_bind', 0x1) +// CHECK-ELF-NEXT: ('st_type', 0x6) diff --git a/test/MC/ARM/AlignedBundling/group-bundle-arm.s b/test/MC/ARM/AlignedBundling/group-bundle-arm.s index 823d9e0..1d67353 100644 --- a/test/MC/ARM/AlignedBundling/group-bundle-arm.s +++ b/test/MC/ARM/AlignedBundling/group-bundle-arm.s @@ -5,8 +5,8 @@ # instructions should not be inserted. However, for bundle-locked groups # it can be. - .syntax unified - .text + .syntax unified + .text .bundle_align_mode 4 bx lr @@ -35,3 +35,14 @@ # CHECK-NEXT: 2c: nop # CHECK-NEXT: 30: bx + .align 4 +foo: + b foo + .long 3892240112 + .long 3892240112 + .long 3892240112 + .long 3892240112 + .long 3892240112 + .long 3892240112 +# CHECK: 40: b + diff --git a/test/MC/ARM/arm_instructions.s b/test/MC/ARM/arm_instructions.s index ce7e036..a4b6bda 100644 --- a/test/MC/ARM/arm_instructions.s +++ b/test/MC/ARM/arm_instructions.s @@ -1,7 +1,14 @@ -@ RUN: llvm-mc -mcpu=cortex-a8 -triple arm-unknown-unknown -show-encoding %s | FileCheck %s - -@ CHECK: trap -@ CHECK: encoding: [0xfe,0xde,0xff,0xe7] +@ RUN: llvm-mc -mcpu=cortex-a8 -triple arm-unknown-unknown -show-encoding %s \ +@ RUN: | FileCheck %s -check-prefix=ALL +@ RUN: llvm-mc -mcpu=cortex-a9-mp -triple armv7-unknown-nacl -show-encoding %s \ +@ RUN: | FileCheck %s -check-prefix=NACL +@ RUN: llvm-mc -mcpu=cortex-a8 -mattr=+nacl-trap -triple armv7 -show-encoding %s \ +@ RUN: | FileCheck %s -check-prefix=NACL + +@ ALL: trap +@ ALL: encoding: [0xfe,0xde,0xff,0xe7] +@ NACL: trap +@ NACL: encoding: [0xf0,0xde,0xfe,0xe7] trap @ CHECK: bx lr diff --git a/test/MC/ARM/basic-arm-instructions.s b/test/MC/ARM/basic-arm-instructions.s index 5c2a214..45ea278 100644 --- a/test/MC/ARM/basic-arm-instructions.s +++ b/test/MC/ARM/basic-arm-instructions.s @@ -2087,6 +2087,49 @@ Lforward: @ CHECK: srsia sp, #5 @ encoding: [0x05,0x05,0xcd,0xf8] @ CHECK: srsia sp!, #5 @ encoding: [0x05,0x05,0xed,0xf8] +@ Compatibility aliases. + srsda #5 + srsdb #1 + srsia #0 + srsib #15 + + srsda #31! + srsdb #19! + srsia #2! + srsib #14! + + srsfa #11 + srsea #10 + srsfd #9 + srsed #5 + + srsfa #5! + srsea #5! + srsfd #5! + srsed #5! + + srs #5 + srs #5! 
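+@ In short: the full/empty, ascending/descending stack mnemonics are
+@ pure aliases for the increment/decrement forms. As the CHECK lines
+@ below confirm, srsfa==srsda, srsea==srsdb, srsfd==srsia and
+@ srsed==srsib, a bare srs defaults to srsia, and sp is implied
+@ throughout.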
+ +@ CHECK: srsda sp, #5 @ encoding: [0x05,0x05,0x4d,0xf8] +@ CHECK: srsdb sp, #1 @ encoding: [0x01,0x05,0x4d,0xf9] +@ CHECK: srsia sp, #0 @ encoding: [0x00,0x05,0xcd,0xf8] +@ CHECK: srsib sp, #15 @ encoding: [0x0f,0x05,0xcd,0xf9] +@ CHECK: srsda sp!, #31 @ encoding: [0x1f,0x05,0x6d,0xf8] +@ CHECK: srsdb sp!, #19 @ encoding: [0x13,0x05,0x6d,0xf9] +@ CHECK: srsia sp!, #2 @ encoding: [0x02,0x05,0xed,0xf8] +@ CHECK: srsib sp!, #14 @ encoding: [0x0e,0x05,0xed,0xf9] +@ CHECK: srsda sp, #11 @ encoding: [0x0b,0x05,0x4d,0xf8] +@ CHECK: srsdb sp, #10 @ encoding: [0x0a,0x05,0x4d,0xf9] +@ CHECK: srsia sp, #9 @ encoding: [0x09,0x05,0xcd,0xf8] +@ CHECK: srsib sp, #5 @ encoding: [0x05,0x05,0xcd,0xf9] +@ CHECK: srsda sp!, #5 @ encoding: [0x05,0x05,0x6d,0xf8] +@ CHECK: srsdb sp!, #5 @ encoding: [0x05,0x05,0x6d,0xf9] +@ CHECK: srsia sp!, #5 @ encoding: [0x05,0x05,0xed,0xf8] +@ CHECK: srsib sp!, #5 @ encoding: [0x05,0x05,0xed,0xf9] +@ CHECK: srsia sp, #5 @ encoding: [0x05,0x05,0xcd,0xf8] +@ CHECK: srsia sp!, #5 @ encoding: [0x05,0x05,0xed,0xf8] + @------------------------------------------------------------------------------ @ SSAT diff --git a/test/MC/ARM/basic-thumb2-instructions.s b/test/MC/ARM/basic-thumb2-instructions.s index d495c91..9278a2a 100644 --- a/test/MC/ARM/basic-thumb2-instructions.s +++ b/test/MC/ARM/basic-thumb2-instructions.s @@ -2352,6 +2352,32 @@ _func: @ CHECK: srsia sp, #5 @ encoding: [0x8d,0xe9,0x05,0xc0] @ CHECK: srsia sp!, #5 @ encoding: [0xad,0xe9,0x05,0xc0] + srsdb #1 + srsia #0 + + srsdb #19! + srsia #2! + + srsea #10 + srsfd #9 + + srsea #5! + srsfd #5! + + srs #5 + srs #5! + +@ CHECK: srsdb sp, #1 @ encoding: [0x0d,0xe8,0x01,0xc0] +@ CHECK: srsia sp, #0 @ encoding: [0x8d,0xe9,0x00,0xc0] +@ CHECK: srsdb sp!, #19 @ encoding: [0x2d,0xe8,0x13,0xc0] +@ CHECK: srsia sp!, #2 @ encoding: [0xad,0xe9,0x02,0xc0] +@ CHECK: srsdb sp, #10 @ encoding: [0x0d,0xe8,0x0a,0xc0] +@ CHECK: srsia sp, #9 @ encoding: [0x8d,0xe9,0x09,0xc0] +@ CHECK: srsdb sp!, #5 @ encoding: [0x2d,0xe8,0x05,0xc0] +@ CHECK: srsia sp!, #5 @ encoding: [0xad,0xe9,0x05,0xc0] +@ CHECK: srsia sp, #5 @ encoding: [0x8d,0xe9,0x05,0xc0] +@ CHECK: srsia sp!, #5 @ encoding: [0xad,0xe9,0x05,0xc0] + @------------------------------------------------------------------------------ @ SSAT diff --git a/test/MC/ARM/elf-eflags-eabi-cg.ll b/test/MC/ARM/elf-eflags-eabi-cg.ll new file mode 100644 index 0000000..2e86a0f --- /dev/null +++ b/test/MC/ARM/elf-eflags-eabi-cg.ll @@ -0,0 +1,13 @@ +; Codegen version to check for ELF header flags. 
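+; (The EABI version number lives in the top byte of e_flags, so
+; EF_ARM_EABI_VER5 is 5 << 24 == 0x05000000, the value matched at the
+; end of this file.)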
+; +; RUN: llc %s -mtriple=thumbv7-linux-gnueabi -relocation-model=pic \ +; RUN: -filetype=obj -o - | elf-dump --dump-section-data | \ +; RUN: FileCheck %s + +define void @bar() nounwind { +entry: + ret void +} + +; For now the only e_flag set is EF_ARM_EABI_VER5 +;CHECK: 'e_flags', 0x05000000 diff --git a/test/MC/ARM/neon-bitwise-encoding.s b/test/MC/ARM/neon-bitwise-encoding.s index e8c1dd6..8c72288 100644 --- a/test/MC/ARM/neon-bitwise-encoding.s +++ b/test/MC/ARM/neon-bitwise-encoding.s @@ -1,4 +1,5 @@ -@ RUN: llvm-mc -mcpu=cortex-a8 -triple arm-unknown-unknown -show-encoding < %s | FileCheck %s +@ RUN: llvm-mc -mcpu=cortex-a8 -triple arm-unknown-unknown -show-encoding < %s \ +@ RUN: | FileCheck %s vand d16, d17, d16 vand q8, q8, q9 @@ -255,6 +256,42 @@ veor.f q8, q2 veor.i64 q8, q2 + vclt.s16 q5, #0 + vclt.s16 d5, #0 + + vceq.s16 q5, q3 + vceq.s16 d5, d3 + + vcgt.s16 q5, q3 + vcgt.s16 d5, d3 + + vcge.s16 q5, q3 + vcge.s16 d5, d3 + + vcgt.s16 q5, #0 + vcgt.s16 d5, #0 + + vcge.s16 q5, #0 + vcge.s16 d5, #0 + + vceq.s16 q5, #0 + vceq.s16 d5, #0 + + vcle.s16 q5, #0 + vcle.s16 d5, #0 + + vacge.f32 d5, d30 + vacge.f32 q5, q3 + + vacgt.f32 d5, d30 + vacgt.f32 q5, q3 + +@ FIXME: We don't have an alias that reverses the operands +@ vacle.f32 d5, d30 +@ vacle.f32 q5, q3 +@ vaclt.f32 d5, d30 +@ vaclt.f32 q5, q3 + @ CHECK: vand q6, q6, q5 @ encoding: [0x5a,0xc1,0x0c,0xf2] @ CHECK: vand q6, q6, q5 @ encoding: [0x5a,0xc1,0x0c,0xf2] @ CHECK: vand q7, q7, q1 @ encoding: [0x52,0xe1,0x0e,0xf2] @@ -272,3 +309,32 @@ @ CHECK: veor q7, q7, q1 @ encoding: [0x52,0xe1,0x0e,0xf3] @ CHECK: veor q8, q8, q2 @ encoding: [0xd4,0x01,0x40,0xf3] @ CHECK: veor q8, q8, q2 @ encoding: [0xd4,0x01,0x40,0xf3] +@ CHECK: vclt.s16 q5, q5, #0 @ encoding: [0x4a,0xa2,0xb5,0xf3] +@ CHECK: vclt.s16 d5, d5, #0 @ encoding: [0x05,0x52,0xb5,0xf3] + +@ CHECK: vceq.i16 q5, q5, q3 @ encoding: [0x56,0xa8,0x1a,0xf3] +@ CHECK: vceq.i16 d5, d5, d3 @ encoding: [0x13,0x58,0x15,0xf3] + +@ CHECK: vcgt.s16 q5, q5, q3 @ encoding: [0x46,0xa3,0x1a,0xf2] +@ CHECK: vcgt.s16 d5, d5, d3 @ encoding: [0x03,0x53,0x15,0xf2] + +@ CHECK: vcge.s16 q5, q5, q3 @ encoding: [0x56,0xa3,0x1a,0xf2] +@ CHECK: vcge.s16 d5, d5, d3 @ encoding: [0x13,0x53,0x15,0xf2] + +@ CHECK: vcgt.s16 q5, q5, #0 @ encoding: [0x4a,0xa0,0xb5,0xf3] +@ CHECK: vcgt.s16 d5, d5, #0 @ encoding: [0x05,0x50,0xb5,0xf3] + +@ CHECK: vcge.s16 q5, q5, #0 @ encoding: [0xca,0xa0,0xb5,0xf3] +@ CHECK: vcge.s16 d5, d5, #0 @ encoding: [0x85,0x50,0xb5,0xf3] + +@ CHECK: vceq.i16 q5, q5, #0 @ encoding: [0x4a,0xa1,0xb5,0xf3] +@ CHECK: vceq.i16 d5, d5, #0 @ encoding: [0x05,0x51,0xb5,0xf3] + +@ CHECK: vcle.s16 q5, q5, #0 @ encoding: [0xca,0xa1,0xb5,0xf3] +@ CHECK: vcle.s16 d5, d5, #0 @ encoding: [0x85,0x51,0xb5,0xf3] + +@ CHECK: vacge.f32 d5, d5, d30 @ encoding: [0x3e,0x5e,0x05,0xf3] +@ CHECK: vacge.f32 q5, q5, q3 @ encoding: [0x56,0xae,0x0a,0xf3] + +@ CHECK: vacgt.f32 d5, d5, d30 @ encoding: [0x3e,0x5e,0x25,0xf3] +@ CHECK: vacgt.f32 q5, q5, q3 @ encoding: [0x56,0xae,0x2a,0xf3] diff --git a/test/MC/ARM/neon-vld-encoding.s b/test/MC/ARM/neon-vld-encoding.s index 3cc6bf1..648e917 100644 --- a/test/MC/ARM/neon-vld-encoding.s +++ b/test/MC/ARM/neon-vld-encoding.s @@ -1,163 +1,163 @@ @ RUN: llvm-mc -mcpu=cortex-a8 -triple armv7-apple-darwin -show-encoding < %s | FileCheck %s - vld1.8 {d16}, [r0, :64] + vld1.8 {d16}, [r0:64] vld1.16 {d16}, [r0] vld1.32 {d16}, [r0] vld1.64 {d16}, [r0] - vld1.8 {d16, d17}, [r0, :64] - vld1.16 {d16, d17}, [r0, :128] + vld1.8 {d16, d17}, [r0:64] + vld1.16 {d16, d17}, [r0:128] vld1.32 {d16, 
d17}, [r0] vld1.64 {d16, d17}, [r0] vld1.8 {d1, d2, d3}, [r3] - vld1.16 {d4, d5, d6}, [r3, :64] + vld1.16 {d4, d5, d6}, [r3:64] vld1.32 {d5, d6, d7}, [r3] - vld1.64 {d6, d7, d8}, [r3, :64] + vld1.64 {d6, d7, d8}, [r3:64] vld1.8 {d1, d2, d3, d4}, [r3] - vld1.16 {d4, d5, d6, d7}, [r3, :64] + vld1.16 {d4, d5, d6, d7}, [r3:64] vld1.32 {d5, d6, d7, d8}, [r3] - vld1.64 {d6, d7, d8, d9}, [r3, :64] + vld1.64 {d6, d7, d8, d9}, [r3:64] - vld1.8 {d16}, [r0, :64]! + vld1.8 {d16}, [r0:64]! vld1.16 {d16}, [r0]! vld1.32 {d16}, [r0]! vld1.64 {d16}, [r0]! - vld1.8 {d16, d17}, [r0, :64]! - vld1.16 {d16, d17}, [r0, :128]! + vld1.8 {d16, d17}, [r0:64]! + vld1.16 {d16, d17}, [r0:128]! vld1.32 {d16, d17}, [r0]! vld1.64 {d16, d17}, [r0]! - vld1.8 {d16}, [r0, :64], r5 + vld1.8 {d16}, [r0:64], r5 vld1.16 {d16}, [r0], r5 vld1.32 {d16}, [r0], r5 vld1.64 {d16}, [r0], r5 - vld1.8 {d16, d17}, [r0, :64], r5 - vld1.16 {d16, d17}, [r0, :128], r5 + vld1.8 {d16, d17}, [r0:64], r5 + vld1.16 {d16, d17}, [r0:128], r5 vld1.32 {d16, d17}, [r0], r5 vld1.64 {d16, d17}, [r0], r5 vld1.8 {d1, d2, d3}, [r3]! - vld1.16 {d4, d5, d6}, [r3, :64]! + vld1.16 {d4, d5, d6}, [r3:64]! vld1.32 {d5, d6, d7}, [r3]! - vld1.64 {d6, d7, d8}, [r3, :64]! + vld1.64 {d6, d7, d8}, [r3:64]! vld1.8 {d1, d2, d3}, [r3], r6 - vld1.16 {d4, d5, d6}, [r3, :64], r6 + vld1.16 {d4, d5, d6}, [r3:64], r6 vld1.32 {d5, d6, d7}, [r3], r6 - vld1.64 {d6, d7, d8}, [r3, :64], r6 + vld1.64 {d6, d7, d8}, [r3:64], r6 vld1.8 {d1, d2, d3, d4}, [r3]! - vld1.16 {d4, d5, d6, d7}, [r3, :64]! + vld1.16 {d4, d5, d6, d7}, [r3:64]! vld1.32 {d5, d6, d7, d8}, [r3]! - vld1.64 {d6, d7, d8, d9}, [r3, :64]! + vld1.64 {d6, d7, d8, d9}, [r3:64]! vld1.8 {d1, d2, d3, d4}, [r3], r8 - vld1.16 {d4, d5, d6, d7}, [r3, :64], r8 + vld1.16 {d4, d5, d6, d7}, [r3:64], r8 vld1.32 {d5, d6, d7, d8}, [r3], r8 - vld1.64 {d6, d7, d8, d9}, [r3, :64], r8 + vld1.64 {d6, d7, d8, d9}, [r3:64], r8 -@ CHECK: vld1.8 {d16}, [r0, :64] @ encoding: [0x1f,0x07,0x60,0xf4] +@ CHECK: vld1.8 {d16}, [r0:64] @ encoding: [0x1f,0x07,0x60,0xf4] @ CHECK: vld1.16 {d16}, [r0] @ encoding: [0x4f,0x07,0x60,0xf4] @ CHECK: vld1.32 {d16}, [r0] @ encoding: [0x8f,0x07,0x60,0xf4] @ CHECK: vld1.64 {d16}, [r0] @ encoding: [0xcf,0x07,0x60,0xf4] -@ CHECK: vld1.8 {d16, d17}, [r0, :64] @ encoding: [0x1f,0x0a,0x60,0xf4] -@ CHECK: vld1.16 {d16, d17}, [r0, :128] @ encoding: [0x6f,0x0a,0x60,0xf4] +@ CHECK: vld1.8 {d16, d17}, [r0:64] @ encoding: [0x1f,0x0a,0x60,0xf4] +@ CHECK: vld1.16 {d16, d17}, [r0:128] @ encoding: [0x6f,0x0a,0x60,0xf4] @ CHECK: vld1.32 {d16, d17}, [r0] @ encoding: [0x8f,0x0a,0x60,0xf4] @ CHECK: vld1.64 {d16, d17}, [r0] @ encoding: [0xcf,0x0a,0x60,0xf4] @ CHECK: vld1.8 {d1, d2, d3}, [r3] @ encoding: [0x0f,0x16,0x23,0xf4] -@ CHECK: vld1.16 {d4, d5, d6}, [r3, :64] @ encoding: [0x5f,0x46,0x23,0xf4] +@ CHECK: vld1.16 {d4, d5, d6}, [r3:64] @ encoding: [0x5f,0x46,0x23,0xf4] @ CHECK: vld1.32 {d5, d6, d7}, [r3] @ encoding: [0x8f,0x56,0x23,0xf4] -@ CHECK: vld1.64 {d6, d7, d8}, [r3, :64] @ encoding: [0xdf,0x66,0x23,0xf4] +@ CHECK: vld1.64 {d6, d7, d8}, [r3:64] @ encoding: [0xdf,0x66,0x23,0xf4] @ CHECK: vld1.8 {d1, d2, d3, d4}, [r3] @ encoding: [0x0f,0x12,0x23,0xf4] -@ CHECK: vld1.16 {d4, d5, d6, d7}, [r3, :64] @ encoding: [0x5f,0x42,0x23,0xf4] +@ CHECK: vld1.16 {d4, d5, d6, d7}, [r3:64] @ encoding: [0x5f,0x42,0x23,0xf4] @ CHECK: vld1.32 {d5, d6, d7, d8}, [r3] @ encoding: [0x8f,0x52,0x23,0xf4] -@ CHECK: vld1.64 {d6, d7, d8, d9}, [r3, :64] @ encoding: [0xdf,0x62,0x23,0xf4] -@ CHECK: vld1.8 {d16}, [r0, :64]! 
@ encoding: [0x1d,0x07,0x60,0xf4] +@ CHECK: vld1.64 {d6, d7, d8, d9}, [r3:64] @ encoding: [0xdf,0x62,0x23,0xf4] +@ CHECK: vld1.8 {d16}, [r0:64]! @ encoding: [0x1d,0x07,0x60,0xf4] @ CHECK: vld1.16 {d16}, [r0]! @ encoding: [0x4d,0x07,0x60,0xf4] @ CHECK: vld1.32 {d16}, [r0]! @ encoding: [0x8d,0x07,0x60,0xf4] @ CHECK: vld1.64 {d16}, [r0]! @ encoding: [0xcd,0x07,0x60,0xf4] -@ CHECK: vld1.8 {d16, d17}, [r0, :64]! @ encoding: [0x1d,0x0a,0x60,0xf4] -@ CHECK: vld1.16 {d16, d17}, [r0, :128]! @ encoding: [0x6d,0x0a,0x60,0xf4] +@ CHECK: vld1.8 {d16, d17}, [r0:64]! @ encoding: [0x1d,0x0a,0x60,0xf4] +@ CHECK: vld1.16 {d16, d17}, [r0:128]! @ encoding: [0x6d,0x0a,0x60,0xf4] @ CHECK: vld1.32 {d16, d17}, [r0]! @ encoding: [0x8d,0x0a,0x60,0xf4] @ CHECK: vld1.64 {d16, d17}, [r0]! @ encoding: [0xcd,0x0a,0x60,0xf4] -@ CHECK: vld1.8 {d16}, [r0, :64], r5 @ encoding: [0x15,0x07,0x60,0xf4] +@ CHECK: vld1.8 {d16}, [r0:64], r5 @ encoding: [0x15,0x07,0x60,0xf4] @ CHECK: vld1.16 {d16}, [r0], r5 @ encoding: [0x45,0x07,0x60,0xf4] @ CHECK: vld1.32 {d16}, [r0], r5 @ encoding: [0x85,0x07,0x60,0xf4] @ CHECK: vld1.64 {d16}, [r0], r5 @ encoding: [0xc5,0x07,0x60,0xf4] -@ CHECK: vld1.8 {d16, d17}, [r0, :64], r5 @ encoding: [0x15,0x0a,0x60,0xf4] -@ CHECK: vld1.16 {d16, d17}, [r0, :128], r5 @ encoding: [0x65,0x0a,0x60,0xf4] +@ CHECK: vld1.8 {d16, d17}, [r0:64], r5 @ encoding: [0x15,0x0a,0x60,0xf4] +@ CHECK: vld1.16 {d16, d17}, [r0:128], r5 @ encoding: [0x65,0x0a,0x60,0xf4] @ CHECK: vld1.32 {d16, d17}, [r0], r5 @ encoding: [0x85,0x0a,0x60,0xf4] @ CHECK: vld1.64 {d16, d17}, [r0], r5 @ encoding: [0xc5,0x0a,0x60,0xf4] @ CHECK: vld1.8 {d1, d2, d3}, [r3]! @ encoding: [0x0d,0x16,0x23,0xf4] -@ CHECK: vld1.16 {d4, d5, d6}, [r3, :64]! @ encoding: [0x5d,0x46,0x23,0xf4] +@ CHECK: vld1.16 {d4, d5, d6}, [r3:64]! @ encoding: [0x5d,0x46,0x23,0xf4] @ CHECK: vld1.32 {d5, d6, d7}, [r3]! @ encoding: [0x8d,0x56,0x23,0xf4] -@ CHECK: vld1.64 {d6, d7, d8}, [r3, :64]! @ encoding: [0xdd,0x66,0x23,0xf4] +@ CHECK: vld1.64 {d6, d7, d8}, [r3:64]! @ encoding: [0xdd,0x66,0x23,0xf4] @ CHECK: vld1.8 {d1, d2, d3}, [r3], r6 @ encoding: [0x06,0x16,0x23,0xf4] -@ CHECK: vld1.16 {d4, d5, d6}, [r3, :64], r6 @ encoding: [0x56,0x46,0x23,0xf4] +@ CHECK: vld1.16 {d4, d5, d6}, [r3:64], r6 @ encoding: [0x56,0x46,0x23,0xf4] @ CHECK: vld1.32 {d5, d6, d7}, [r3], r6 @ encoding: [0x86,0x56,0x23,0xf4] -@ CHECK: vld1.64 {d6, d7, d8}, [r3, :64], r6 @ encoding: [0xd6,0x66,0x23,0xf4] +@ CHECK: vld1.64 {d6, d7, d8}, [r3:64], r6 @ encoding: [0xd6,0x66,0x23,0xf4] @ CHECK: vld1.8 {d1, d2, d3, d4}, [r3]! @ encoding: [0x0d,0x12,0x23,0xf4] -@ CHECK: vld1.16 {d4, d5, d6, d7}, [r3, :64]! @ encoding: [0x5d,0x42,0x23,0xf4] +@ CHECK: vld1.16 {d4, d5, d6, d7}, [r3:64]! @ encoding: [0x5d,0x42,0x23,0xf4] @ CHECK: vld1.32 {d5, d6, d7, d8}, [r3]! @ encoding: [0x8d,0x52,0x23,0xf4] -@ CHECK: vld1.64 {d6, d7, d8, d9}, [r3, :64]! @ encoding: [0xdd,0x62,0x23,0xf4] +@ CHECK: vld1.64 {d6, d7, d8, d9}, [r3:64]! 
@ encoding: [0xdd,0x62,0x23,0xf4] @ CHECK: vld1.8 {d1, d2, d3, d4}, [r3], r8 @ encoding: [0x08,0x12,0x23,0xf4] -@ CHECK: vld1.16 {d4, d5, d6, d7}, [r3, :64], r8 @ encoding: [0x58,0x42,0x23,0xf4] +@ CHECK: vld1.16 {d4, d5, d6, d7}, [r3:64], r8 @ encoding: [0x58,0x42,0x23,0xf4] @ CHECK: vld1.32 {d5, d6, d7, d8}, [r3], r8 @ encoding: [0x88,0x52,0x23,0xf4] -@ CHECK: vld1.64 {d6, d7, d8, d9}, [r3, :64], r8 @ encoding: [0xd8,0x62,0x23,0xf4] +@ CHECK: vld1.64 {d6, d7, d8, d9}, [r3:64], r8 @ encoding: [0xd8,0x62,0x23,0xf4] - vld2.8 {d16, d17}, [r0, :64] - vld2.16 {d16, d17}, [r0, :128] + vld2.8 {d16, d17}, [r0:64] + vld2.16 {d16, d17}, [r0:128] vld2.32 {d16, d17}, [r0] - vld2.8 {d16, d17, d18, d19}, [r0, :64] - vld2.16 {d16, d17, d18, d19}, [r0, :128] - vld2.32 {d16, d17, d18, d19}, [r0, :256] + vld2.8 {d16, d17, d18, d19}, [r0:64] + vld2.16 {d16, d17, d18, d19}, [r0:128] + vld2.32 {d16, d17, d18, d19}, [r0:256] - vld2.8 {d19, d20}, [r0, :64]! - vld2.16 {d16, d17}, [r0, :128]! + vld2.8 {d19, d20}, [r0:64]! + vld2.16 {d16, d17}, [r0:128]! vld2.32 {q10}, [r0]! - vld2.8 {d4-d7}, [r0, :64]! - vld2.16 {d1, d2, d3, d4}, [r0, :128]! - vld2.32 {q7, q8}, [r0, :256]! + vld2.8 {d4-d7}, [r0:64]! + vld2.16 {d1, d2, d3, d4}, [r0:128]! + vld2.32 {q7, q8}, [r0:256]! - vld2.8 {d19, d20}, [r0, :64], r6 - vld2.16 {d16, d17}, [r0, :128], r6 + vld2.8 {d19, d20}, [r0:64], r6 + vld2.16 {d16, d17}, [r0:128], r6 vld2.32 {q10}, [r0], r6 - vld2.8 {d4-d7}, [r0, :64], r6 - vld2.16 {d1, d2, d3, d4}, [r0, :128], r6 - vld2.32 {q7, q8}, [r0, :256], r6 + vld2.8 {d4-d7}, [r0:64], r6 + vld2.16 {d1, d2, d3, d4}, [r0:128], r6 + vld2.32 {q7, q8}, [r0:256], r6 -@ CHECK: vld2.8 {d16, d17}, [r0, :64] @ encoding: [0x1f,0x08,0x60,0xf4] -@ CHECK: vld2.16 {d16, d17}, [r0, :128] @ encoding: [0x6f,0x08,0x60,0xf4] +@ CHECK: vld2.8 {d16, d17}, [r0:64] @ encoding: [0x1f,0x08,0x60,0xf4] +@ CHECK: vld2.16 {d16, d17}, [r0:128] @ encoding: [0x6f,0x08,0x60,0xf4] @ CHECK: vld2.32 {d16, d17}, [r0] @ encoding: [0x8f,0x08,0x60,0xf4] -@ CHECK: vld2.8 {d16, d17, d18, d19}, [r0, :64] @ encoding: [0x1f,0x03,0x60,0xf4] -@ CHECK: vld2.16 {d16, d17, d18, d19}, [r0, :128] @ encoding: [0x6f,0x03,0x60,0xf4] -@ CHECK: vld2.32 {d16, d17, d18, d19}, [r0, :256] @ encoding: [0xbf,0x03,0x60,0xf4] +@ CHECK: vld2.8 {d16, d17, d18, d19}, [r0:64] @ encoding: [0x1f,0x03,0x60,0xf4] +@ CHECK: vld2.16 {d16, d17, d18, d19}, [r0:128] @ encoding: [0x6f,0x03,0x60,0xf4] +@ CHECK: vld2.32 {d16, d17, d18, d19}, [r0:256] @ encoding: [0xbf,0x03,0x60,0xf4] -@ CHECK: vld2.8 {d19, d20}, [r0, :64]! @ encoding: [0x1d,0x38,0x60,0xf4] -@ CHECK: vld2.16 {d16, d17}, [r0, :128]! @ encoding: [0x6d,0x08,0x60,0xf4] +@ CHECK: vld2.8 {d19, d20}, [r0:64]! @ encoding: [0x1d,0x38,0x60,0xf4] +@ CHECK: vld2.16 {d16, d17}, [r0:128]! @ encoding: [0x6d,0x08,0x60,0xf4] @ CHECK: vld2.32 {d20, d21}, [r0]! @ encoding: [0x8d,0x48,0x60,0xf4] -@ CHECK: vld2.8 {d4, d5, d6, d7}, [r0, :64]! @ encoding: [0x1d,0x43,0x20,0xf4] -@ CHECK: vld2.16 {d1, d2, d3, d4}, [r0, :128]! @ encoding: [0x6d,0x13,0x20,0xf4] -@ CHECK: vld2.32 {d14, d15, d16, d17}, [r0, :256]! @ encoding: [0xbd,0xe3,0x20,0xf4] +@ CHECK: vld2.8 {d4, d5, d6, d7}, [r0:64]! @ encoding: [0x1d,0x43,0x20,0xf4] +@ CHECK: vld2.16 {d1, d2, d3, d4}, [r0:128]! @ encoding: [0x6d,0x13,0x20,0xf4] +@ CHECK: vld2.32 {d14, d15, d16, d17}, [r0:256]! 
@ encoding: [0xbd,0xe3,0x20,0xf4] -@ CHECK: vld2.8 {d19, d20}, [r0, :64], r6 @ encoding: [0x16,0x38,0x60,0xf4] -@ CHECK: vld2.16 {d16, d17}, [r0, :128], r6 @ encoding: [0x66,0x08,0x60,0xf4] +@ CHECK: vld2.8 {d19, d20}, [r0:64], r6 @ encoding: [0x16,0x38,0x60,0xf4] +@ CHECK: vld2.16 {d16, d17}, [r0:128], r6 @ encoding: [0x66,0x08,0x60,0xf4] @ CHECK: vld2.32 {d20, d21}, [r0], r6 @ encoding: [0x86,0x48,0x60,0xf4] -@ CHECK: vld2.8 {d4, d5, d6, d7}, [r0, :64], r6 @ encoding: [0x16,0x43,0x20,0xf4] -@ CHECK: vld2.16 {d1, d2, d3, d4}, [r0, :128], r6 @ encoding: [0x66,0x13,0x20,0xf4] -@ CHECK: vld2.32 {d14, d15, d16, d17}, [r0, :256], r6 @ encoding: [0xb6,0xe3,0x20,0xf4] +@ CHECK: vld2.8 {d4, d5, d6, d7}, [r0:64], r6 @ encoding: [0x16,0x43,0x20,0xf4] +@ CHECK: vld2.16 {d1, d2, d3, d4}, [r0:128], r6 @ encoding: [0x66,0x13,0x20,0xf4] +@ CHECK: vld2.32 {d14, d15, d16, d17}, [r0:256], r6 @ encoding: [0xb6,0xe3,0x20,0xf4] vld3.8 {d16, d17, d18}, [r1] vld3.16 {d6, d7, d8}, [r2] vld3.32 {d1, d2, d3}, [r3] - vld3.8 {d16, d18, d20}, [r0, :64] + vld3.8 {d16, d18, d20}, [r0:64] vld3.u16 {d27, d29, d31}, [r4] vld3.i32 {d6, d8, d10}, [r5] @@ -171,7 +171,7 @@ vld3.p8 {d6, d7, d8}, [r8]! vld3.16 {d9, d10, d11}, [r7]! vld3.f32 {d1, d2, d3}, [r6]! - vld3.8 {d16, d18, d20}, [r0, :64]! + vld3.8 {d16, d18, d20}, [r0:64]! vld3.p16 {d20, d22, d24}, [r5]! vld3.32 {d5, d7, d9}, [r4]! @@ -179,7 +179,7 @@ @ CHECK: vld3.8 {d16, d17, d18}, [r1] @ encoding: [0x0f,0x04,0x61,0xf4] @ CHECK: vld3.16 {d6, d7, d8}, [r2] @ encoding: [0x4f,0x64,0x22,0xf4] @ CHECK: vld3.32 {d1, d2, d3}, [r3] @ encoding: [0x8f,0x14,0x23,0xf4] -@ CHECK: vld3.8 {d16, d18, d20}, [r0, :64] @ encoding: [0x1f,0x05,0x60,0xf4] +@ CHECK: vld3.8 {d16, d18, d20}, [r0:64] @ encoding: [0x1f,0x05,0x60,0xf4] @ CHECK: vld3.16 {d27, d29, d31}, [r4] @ encoding: [0x4f,0xb5,0x64,0xf4] @ CHECK: vld3.32 {d6, d8, d10}, [r5] @ encoding: [0x8f,0x65,0x25,0xf4] @ CHECK: vld3.8 {d12, d13, d14}, [r6], r1 @ encoding: [0x01,0xc4,0x26,0xf4] @@ -191,48 +191,48 @@ @ CHECK: vld3.8 {d6, d7, d8}, [r8]! @ encoding: [0x0d,0x64,0x28,0xf4] @ CHECK: vld3.16 {d9, d10, d11}, [r7]! @ encoding: [0x4d,0x94,0x27,0xf4] @ CHECK: vld3.32 {d1, d2, d3}, [r6]! @ encoding: [0x8d,0x14,0x26,0xf4] -@ CHECK: vld3.8 {d16, d18, d20}, [r0, :64]! @ encoding: [0x1d,0x05,0x60,0xf4] +@ CHECK: vld3.8 {d16, d18, d20}, [r0:64]! @ encoding: [0x1d,0x05,0x60,0xf4] @ CHECK: vld3.16 {d20, d22, d24}, [r5]! @ encoding: [0x4d,0x45,0x65,0xf4] @ CHECK: vld3.32 {d5, d7, d9}, [r4]! @ encoding: [0x8d,0x55,0x24,0xf4] - vld4.8 {d16, d17, d18, d19}, [r1, :64] - vld4.16 {d16, d17, d18, d19}, [r2, :128] - vld4.32 {d16, d17, d18, d19}, [r3, :256] - vld4.8 {d17, d19, d21, d23}, [r5, :256] + vld4.8 {d16, d17, d18, d19}, [r1:64] + vld4.16 {d16, d17, d18, d19}, [r2:128] + vld4.32 {d16, d17, d18, d19}, [r3:256] + vld4.8 {d17, d19, d21, d23}, [r5:256] vld4.16 {d17, d19, d21, d23}, [r7] vld4.32 {d16, d18, d20, d22}, [r8] - vld4.s8 {d16, d17, d18, d19}, [r1, :64]! - vld4.s16 {d16, d17, d18, d19}, [r2, :128]! - vld4.s32 {d16, d17, d18, d19}, [r3, :256]! - vld4.u8 {d17, d19, d21, d23}, [r5, :256]! + vld4.s8 {d16, d17, d18, d19}, [r1:64]! + vld4.s16 {d16, d17, d18, d19}, [r2:128]! + vld4.s32 {d16, d17, d18, d19}, [r3:256]! + vld4.u8 {d17, d19, d21, d23}, [r5:256]! vld4.u16 {d17, d19, d21, d23}, [r7]! vld4.u32 {d16, d18, d20, d22}, [r8]! 
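@ A reminder of what this churn is about: the alignment hint is now
@ spelled as a suffix on the base register, [r0:64], rather than the
@ old comma-separated [r0, :64]. The encodings are untouched, and the
@ check at the end of this file verifies that the old spelling is
@ still accepted for compatibility.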
- vld4.p8 {d16, d17, d18, d19}, [r1, :64], r8 + vld4.p8 {d16, d17, d18, d19}, [r1:64], r8 vld4.p16 {d16, d17, d18, d19}, [r2], r7 - vld4.f32 {d16, d17, d18, d19}, [r3, :64], r5 - vld4.i8 {d16, d18, d20, d22}, [r4, :256], r2 + vld4.f32 {d16, d17, d18, d19}, [r3:64], r5 + vld4.i8 {d16, d18, d20, d22}, [r4:256], r2 vld4.i16 {d16, d18, d20, d22}, [r6], r3 vld4.i32 {d17, d19, d21, d23}, [r9], r4 -@ CHECK: vld4.8 {d16, d17, d18, d19}, [r1, :64] @ encoding: [0x1f,0x00,0x61,0xf4] -@ CHECK: vld4.16 {d16, d17, d18, d19}, [r2, :128] @ encoding: [0x6f,0x00,0x62,0xf4] -@ CHECK: vld4.32 {d16, d17, d18, d19}, [r3, :256] @ encoding: [0xbf,0x00,0x63,0xf4] -@ CHECK: vld4.8 {d17, d19, d21, d23}, [r5, :256] @ encoding: [0x3f,0x11,0x65,0xf4] +@ CHECK: vld4.8 {d16, d17, d18, d19}, [r1:64] @ encoding: [0x1f,0x00,0x61,0xf4] +@ CHECK: vld4.16 {d16, d17, d18, d19}, [r2:128] @ encoding: [0x6f,0x00,0x62,0xf4] +@ CHECK: vld4.32 {d16, d17, d18, d19}, [r3:256] @ encoding: [0xbf,0x00,0x63,0xf4] +@ CHECK: vld4.8 {d17, d19, d21, d23}, [r5:256] @ encoding: [0x3f,0x11,0x65,0xf4] @ CHECK: vld4.16 {d17, d19, d21, d23}, [r7] @ encoding: [0x4f,0x11,0x67,0xf4] @ CHECK: vld4.32 {d16, d18, d20, d22}, [r8] @ encoding: [0x8f,0x01,0x68,0xf4] -@ CHECK: vld4.8 {d16, d17, d18, d19}, [r1, :64]! @ encoding: [0x1d,0x00,0x61,0xf4] -@ CHECK: vld4.16 {d16, d17, d18, d19}, [r2, :128]! @ encoding: [0x6d,0x00,0x62,0xf4] -@ CHECK: vld4.32 {d16, d17, d18, d19}, [r3, :256]! @ encoding: [0xbd,0x00,0x63,0xf4] -@ CHECK: vld4.8 {d17, d19, d21, d23}, [r5, :256]! @ encoding: [0x3d,0x11,0x65,0xf4] +@ CHECK: vld4.8 {d16, d17, d18, d19}, [r1:64]! @ encoding: [0x1d,0x00,0x61,0xf4] +@ CHECK: vld4.16 {d16, d17, d18, d19}, [r2:128]! @ encoding: [0x6d,0x00,0x62,0xf4] +@ CHECK: vld4.32 {d16, d17, d18, d19}, [r3:256]! @ encoding: [0xbd,0x00,0x63,0xf4] +@ CHECK: vld4.8 {d17, d19, d21, d23}, [r5:256]! @ encoding: [0x3d,0x11,0x65,0xf4] @ CHECK: vld4.16 {d17, d19, d21, d23}, [r7]! @ encoding: [0x4d,0x11,0x67,0xf4] @ CHECK: vld4.32 {d16, d18, d20, d22}, [r8]! @ encoding: [0x8d,0x01,0x68,0xf4] -@ CHECK: vld4.8 {d16, d17, d18, d19}, [r1, :64], r8 @ encoding: [0x18,0x00,0x61,0xf4] +@ CHECK: vld4.8 {d16, d17, d18, d19}, [r1:64], r8 @ encoding: [0x18,0x00,0x61,0xf4] @ CHECK: vld4.16 {d16, d17, d18, d19}, [r2], r7 @ encoding: [0x47,0x00,0x62,0xf4] -@ CHECK: vld4.32 {d16, d17, d18, d19}, [r3, :64], r5 @ encoding: [0x95,0x00,0x63,0xf4] -@ CHECK: vld4.8 {d16, d18, d20, d22}, [r4, :256], r2 @ encoding: [0x32,0x01,0x64,0xf4] +@ CHECK: vld4.32 {d16, d17, d18, d19}, [r3:64], r5 @ encoding: [0x95,0x00,0x63,0xf4] +@ CHECK: vld4.8 {d16, d18, d20, d22}, [r4:256], r2 @ encoding: [0x32,0x01,0x64,0xf4] @ CHECK: vld4.16 {d16, d18, d20, d22}, [r6], r3 @ encoding: [0x43,0x01,0x66,0xf4] @ CHECK: vld4.32 {d17, d19, d21, d23}, [r9], r4 @ encoding: [0x84,0x11,0x69,0xf4] @@ -252,28 +252,28 @@ @ CHECK: vld1.8 {d4[], d5[]}, [r1], r3 @ encoding: [0x23,0x4c,0xa1,0xf4] vld1.8 {d16[3]}, [r0] - vld1.16 {d16[2]}, [r0, :16] - vld1.32 {d16[1]}, [r0, :32] + vld1.16 {d16[2]}, [r0:16] + vld1.32 {d16[1]}, [r0:32] vld1.p8 d12[6], [r2]! vld1.i8 d12[6], [r2], r2 vld1.u16 d12[3], [r2]! vld1.16 d12[2], [r2], r2 @ CHECK: vld1.8 {d16[3]}, [r0] @ encoding: [0x6f,0x00,0xe0,0xf4] -@ CHECK: vld1.16 {d16[2]}, [r0, :16] @ encoding: [0x9f,0x04,0xe0,0xf4] -@ CHECK: vld1.32 {d16[1]}, [r0, :32] @ encoding: [0xbf,0x08,0xe0,0xf4] +@ CHECK: vld1.16 {d16[2]}, [r0:16] @ encoding: [0x9f,0x04,0xe0,0xf4] +@ CHECK: vld1.32 {d16[1]}, [r0:32] @ encoding: [0xbf,0x08,0xe0,0xf4] @ CHECK: vld1.8 {d12[6]}, [r2]! 
@ encoding: [0xcd,0xc0,0xa2,0xf4] @ CHECK: vld1.8 {d12[6]}, [r2], r2 @ encoding: [0xc2,0xc0,0xa2,0xf4] @ CHECK: vld1.16 {d12[3]}, [r2]! @ encoding: [0xcd,0xc4,0xa2,0xf4] @ CHECK: vld1.16 {d12[2]}, [r2], r2 @ encoding: [0x82,0xc4,0xa2,0xf4] - vld2.8 {d16[1], d17[1]}, [r0, :16] - vld2.16 {d16[1], d17[1]}, [r0, :32] + vld2.8 {d16[1], d17[1]}, [r0:16] + vld2.16 {d16[1], d17[1]}, [r0:32] vld2.32 {d16[1], d17[1]}, [r0] vld2.16 {d17[1], d19[1]}, [r0] - vld2.32 {d17[0], d19[0]}, [r0, :64] - vld2.32 {d17[0], d19[0]}, [r0, :64]! + vld2.32 {d17[0], d19[0]}, [r0:64] + vld2.32 {d17[0], d19[0]}, [r0:64]! vld2.8 {d2[4], d3[4]}, [r2], r3 vld2.8 {d2[4], d3[4]}, [r2]! vld2.8 {d2[4], d3[4]}, [r2] @@ -284,12 +284,12 @@ vld2.32 {d22[ ],d23[ ]}, [r5], r4 vld2.32 {d22[ ],d24[ ]}, [r6], r4 -@ CHECK: vld2.8 {d16[1], d17[1]}, [r0, :16] @ encoding: [0x3f,0x01,0xe0,0xf4] -@ CHECK: vld2.16 {d16[1], d17[1]}, [r0, :32] @ encoding: [0x5f,0x05,0xe0,0xf4] +@ CHECK: vld2.8 {d16[1], d17[1]}, [r0:16] @ encoding: [0x3f,0x01,0xe0,0xf4] +@ CHECK: vld2.16 {d16[1], d17[1]}, [r0:32] @ encoding: [0x5f,0x05,0xe0,0xf4] @ CHECK: vld2.32 {d16[1], d17[1]}, [r0] @ encoding: [0x8f,0x09,0xe0,0xf4] @ CHECK: vld2.16 {d17[1], d19[1]}, [r0] @ encoding: [0x6f,0x15,0xe0,0xf4] -@ CHECK: vld2.32 {d17[0], d19[0]}, [r0, :64] @ encoding: [0x5f,0x19,0xe0,0xf4] -@ CHECK: vld2.32 {d17[0], d19[0]}, [r0, :64]! @ encoding: [0x5d,0x19,0xe0,0xf4] +@ CHECK: vld2.32 {d17[0], d19[0]}, [r0:64] @ encoding: [0x5f,0x19,0xe0,0xf4] +@ CHECK: vld2.32 {d17[0], d19[0]}, [r0:64]! @ encoding: [0x5d,0x19,0xe0,0xf4] @ CHECK: vld2.8 {d2[4], d3[4]}, [r2], r3 @ encoding: [0x83,0x21,0xa2,0xf4] @ CHECK: vld2.8 {d2[4], d3[4]}, [r2]! @ encoding: [0x8d,0x21,0xa2,0xf4] @ CHECK: vld2.8 {d2[4], d3[4]}, [r2] @ encoding: [0x8f,0x21,0xa2,0xf4] @@ -383,15 +383,15 @@ vld4.16 {d17[1], d19[1], d21[1], d23[1]}, [r7] vld4.32 {d16[1], d18[1], d20[1], d22[1]}, [r8] - vld4.s8 {d16[1], d17[1], d18[1], d19[1]}, [r1, :32]! - vld4.s16 {d16[1], d17[1], d18[1], d19[1]}, [r2, :64]! - vld4.s32 {d16[1], d17[1], d18[1], d19[1]}, [r3, :128]! + vld4.s8 {d16[1], d17[1], d18[1], d19[1]}, [r1:32]! + vld4.s16 {d16[1], d17[1], d18[1], d19[1]}, [r2:64]! + vld4.s32 {d16[1], d17[1], d18[1], d19[1]}, [r3:128]! vld4.u16 {d17[1], d19[1], d21[1], d23[1]}, [r7]! vld4.u32 {d16[1], d18[1], d20[1], d22[1]}, [r8]! - vld4.p8 {d16[1], d17[1], d18[1], d19[1]}, [r1, :32], r8 + vld4.p8 {d16[1], d17[1], d18[1], d19[1]}, [r1:32], r8 vld4.p16 {d16[1], d17[1], d18[1], d19[1]}, [r2], r7 - vld4.f32 {d16[1], d17[1], d18[1], d19[1]}, [r3, :64], r5 + vld4.f32 {d16[1], d17[1], d18[1], d19[1]}, [r3:64], r5 vld4.i16 {d16[1], d18[1], d20[1], d22[1]}, [r6], r3 vld4.i32 {d17[1], d19[1], d21[1], d23[1]}, [r9], r4 @@ -400,14 +400,14 @@ @ CHECK: vld4.32 {d16[1], d17[1], d18[1], d19[1]}, [r3] @ encoding: [0x8f,0x0b,0xe3,0xf4] @ CHECK: vld4.16 {d17[1], d19[1], d21[1], d23[1]}, [r7] @ encoding: [0x6f,0x17,0xe7,0xf4] @ CHECK: vld4.32 {d16[1], d18[1], d20[1], d22[1]}, [r8] @ encoding: [0xcf,0x0b,0xe8,0xf4] -@ CHECK: vld4.8 {d16[1], d17[1], d18[1], d19[1]}, [r1, :32]! @ encoding: [0x3d,0x03,0xe1,0xf4] -@ CHECK: vld4.16 {d16[1], d17[1], d18[1], d19[1]}, [r2, :64]! @ encoding: [0x5d,0x07,0xe2,0xf4] -@ CHECK: vld4.32 {d16[1], d17[1], d18[1], d19[1]}, [r3, :128]! @ encoding: [0xad,0x0b,0xe3,0xf4] +@ CHECK: vld4.8 {d16[1], d17[1], d18[1], d19[1]}, [r1:32]! @ encoding: [0x3d,0x03,0xe1,0xf4] +@ CHECK: vld4.16 {d16[1], d17[1], d18[1], d19[1]}, [r2:64]! @ encoding: [0x5d,0x07,0xe2,0xf4] +@ CHECK: vld4.32 {d16[1], d17[1], d18[1], d19[1]}, [r3:128]! 
@ encoding: [0xad,0x0b,0xe3,0xf4] @ CHECK: vld4.16 {d17[1], d18[1], d19[1], d20[1]}, [r7]! @ encoding: [0x6d,0x17,0xe7,0xf4] @ CHECK: vld4.32 {d16[1], d18[1], d20[1], d22[1]}, [r8]! @ encoding: [0xcd,0x0b,0xe8,0xf4] -@ CHECK: vld4.8 {d16[1], d17[1], d18[1], d19[1]}, [r1, :32], r8 @ encoding: [0x38,0x03,0xe1,0xf4] +@ CHECK: vld4.8 {d16[1], d17[1], d18[1], d19[1]}, [r1:32], r8 @ encoding: [0x38,0x03,0xe1,0xf4] @ CHECK: vld4.16 {d16[1], d17[1], d18[1], d19[1]}, [r2], r7 @ encoding: [0x47,0x07,0xe2,0xf4] -@ CHECK: vld4.32 {d16[1], d17[1], d18[1], d19[1]}, [r3, :64], r5 @ encoding: [0x95,0x0b,0xe3,0xf4] +@ CHECK: vld4.32 {d16[1], d17[1], d18[1], d19[1]}, [r3:64], r5 @ encoding: [0x95,0x0b,0xe3,0xf4] @ CHECK: vld4.16 {d16[1], d18[1], d20[1], d22[1]}, [r6], r3 @ encoding: [0x63,0x07,0xe6,0xf4] @ CHECK: vld4.32 {d17[1], d19[1], d21[1], d23[1]}, [r9], r4 @ encoding: [0xc4,0x1b,0xe9,0xf4] @@ -490,8 +490,17 @@ @ Register lists can use the range syntax, just like VLDM - vld1.f64 {d2-d5}, [r2,:128]! - vld1.f64 {d2,d3,d4,d5}, [r2,:128]! + vld1.f64 {d2-d5}, [r2:128]! + vld1.f64 {d2,d3,d4,d5}, [r2:128]! -@ CHECK: vld1.64 {d2, d3, d4, d5}, [r2, :128]! @ encoding: [0xed,0x22,0x22,0xf4] -@ CHECK: vld1.64 {d2, d3, d4, d5}, [r2, :128]! @ encoding: [0xed,0x22,0x22,0xf4] +@ CHECK: vld1.64 {d2, d3, d4, d5}, [r2:128]! @ encoding: [0xed,0x22,0x22,0xf4] +@ CHECK: vld1.64 {d2, d3, d4, d5}, [r2:128]! @ encoding: [0xed,0x22,0x22,0xf4] + + +@ verify that the old incorrect alignment specifier syntax (", :") +@ still gets accepted. + vld2.8 {d16, d17}, [r0, :64] + vld2.16 {d16, d17}, [r0, :128] + +@ CHECK: vld2.8 {d16, d17}, [r0:64] @ encoding: [0x1f,0x08,0x60,0xf4] +@ CHECK: vld2.16 {d16, d17}, [r0:128] @ encoding: [0x6f,0x08,0x60,0xf4] diff --git a/test/MC/ARM/neon-vst-encoding.s b/test/MC/ARM/neon-vst-encoding.s index f5feca4..ef9f037 100644 --- a/test/MC/ARM/neon-vst-encoding.s +++ b/test/MC/ARM/neon-vst-encoding.s @@ -1,67 +1,67 @@ @ RUN: llvm-mc -mcpu=cortex-a8 -triple armv7-apple-darwin -show-encoding < %s | FileCheck %s - vst1.8 {d16}, [r0, :64] + vst1.8 {d16}, [r0:64] vst1.16 {d16}, [r0] vst1.32 {d16}, [r0] vst1.64 {d16}, [r0] - vst1.8 {d16, d17}, [r0, :64] - vst1.16 {d16, d17}, [r0, :128] + vst1.8 {d16, d17}, [r0:64] + vst1.16 {d16, d17}, [r0:128] vst1.32 {d16, d17}, [r0] vst1.64 {d16, d17}, [r0] - vst1.8 {d16, d17, d18}, [r0, :64] - vst1.8 {d16, d17, d18}, [r0, :64]! + vst1.8 {d16, d17, d18}, [r0:64] + vst1.8 {d16, d17, d18}, [r0:64]! vst1.8 {d16, d17, d18}, [r0], r3 - vst1.8 {d16, d17, d18, d19}, [r0, :64] - vst1.16 {d16, d17, d18, d19}, [r1, :64]! + vst1.8 {d16, d17, d18, d19}, [r0:64] + vst1.16 {d16, d17, d18, d19}, [r1:64]! 
vst1.64 {d16, d17, d18, d19}, [r3], r2 -@ CHECK: vst1.8 {d16}, [r0, :64] @ encoding: [0x1f,0x07,0x40,0xf4] +@ CHECK: vst1.8 {d16}, [r0:64] @ encoding: [0x1f,0x07,0x40,0xf4] @ CHECK: vst1.16 {d16}, [r0] @ encoding: [0x4f,0x07,0x40,0xf4] @ CHECK: vst1.32 {d16}, [r0] @ encoding: [0x8f,0x07,0x40,0xf4] @ CHECK: vst1.64 {d16}, [r0] @ encoding: [0xcf,0x07,0x40,0xf4] -@ CHECK: vst1.8 {d16, d17}, [r0, :64] @ encoding: [0x1f,0x0a,0x40,0xf4] -@ CHECK: vst1.16 {d16, d17}, [r0, :128] @ encoding: [0x6f,0x0a,0x40,0xf4] +@ CHECK: vst1.8 {d16, d17}, [r0:64] @ encoding: [0x1f,0x0a,0x40,0xf4] +@ CHECK: vst1.16 {d16, d17}, [r0:128] @ encoding: [0x6f,0x0a,0x40,0xf4] @ CHECK: vst1.32 {d16, d17}, [r0] @ encoding: [0x8f,0x0a,0x40,0xf4] @ CHECK: vst1.64 {d16, d17}, [r0] @ encoding: [0xcf,0x0a,0x40,0xf4] -@ CHECK: vst1.8 {d16, d17, d18}, [r0, :64] @ encoding: [0x1f,0x06,0x40,0xf4] -@ CHECK: vst1.8 {d16, d17, d18}, [r0, :64]! @ encoding: [0x1d,0x06,0x40,0xf4] +@ CHECK: vst1.8 {d16, d17, d18}, [r0:64] @ encoding: [0x1f,0x06,0x40,0xf4] +@ CHECK: vst1.8 {d16, d17, d18}, [r0:64]! @ encoding: [0x1d,0x06,0x40,0xf4] @ CHECK: vst1.8 {d16, d17, d18}, [r0], r3 @ encoding: [0x03,0x06,0x40,0xf4] -@ CHECK: vst1.8 {d16, d17, d18, d19}, [r0, :64] @ encoding: [0x1f,0x02,0x40,0xf4] -@ CHECK: vst1.16 {d16, d17, d18, d19}, [r1, :64]! @ encoding: [0x5d,0x02,0x41,0xf4] +@ CHECK: vst1.8 {d16, d17, d18, d19}, [r0:64] @ encoding: [0x1f,0x02,0x40,0xf4] +@ CHECK: vst1.16 {d16, d17, d18, d19}, [r1:64]! @ encoding: [0x5d,0x02,0x41,0xf4] @ CHECK: vst1.64 {d16, d17, d18, d19}, [r3], r2 @ encoding: [0xc2,0x02,0x43,0xf4] - vst2.8 {d16, d17}, [r0, :64] - vst2.16 {d16, d17}, [r0, :128] + vst2.8 {d16, d17}, [r0:64] + vst2.16 {d16, d17}, [r0:128] vst2.32 {d16, d17}, [r0] - vst2.8 {d16, d17, d18, d19}, [r0, :64] - vst2.16 {d16, d17, d18, d19}, [r0, :128] - vst2.32 {d16, d17, d18, d19}, [r0, :256] - vst2.8 {d16, d17}, [r0, :64]! - vst2.16 {q15}, [r0, :128]! + vst2.8 {d16, d17, d18, d19}, [r0:64] + vst2.16 {d16, d17, d18, d19}, [r0:128] + vst2.32 {d16, d17, d18, d19}, [r0:256] + vst2.8 {d16, d17}, [r0:64]! + vst2.16 {q15}, [r0:128]! vst2.32 {d14, d15}, [r0]! - vst2.8 {d16, d17, d18, d19}, [r0, :64]! - vst2.16 {d18-d21}, [r0, :128]! - vst2.32 {q4, q5}, [r0, :256]! + vst2.8 {d16, d17, d18, d19}, [r0:64]! + vst2.16 {d18-d21}, [r0:128]! + vst2.32 {q4, q5}, [r0:256]! -@ CHECK: vst2.8 {d16, d17}, [r0, :64] @ encoding: [0x1f,0x08,0x40,0xf4] -@ CHECK: vst2.16 {d16, d17}, [r0, :128] @ encoding: [0x6f,0x08,0x40,0xf4] +@ CHECK: vst2.8 {d16, d17}, [r0:64] @ encoding: [0x1f,0x08,0x40,0xf4] +@ CHECK: vst2.16 {d16, d17}, [r0:128] @ encoding: [0x6f,0x08,0x40,0xf4] @ CHECK: vst2.32 {d16, d17}, [r0] @ encoding: [0x8f,0x08,0x40,0xf4] -@ CHECK: vst2.8 {d16, d17, d18, d19}, [r0, :64] @ encoding: [0x1f,0x03,0x40,0xf4] -@ CHECK: vst2.16 {d16, d17, d18, d19}, [r0, :128] @ encoding: [0x6f,0x03,0x40,0xf4] -@ CHECK: vst2.32 {d16, d17, d18, d19}, [r0, :256] @ encoding: [0xbf,0x03,0x40,0xf4] -@ CHECK: vst2.8 {d16, d17}, [r0, :64]! @ encoding: [0x1d,0x08,0x40,0xf4] -@ CHECK: vst2.16 {d30, d31}, [r0, :128]! @ encoding: [0x6d,0xe8,0x40,0xf4] +@ CHECK: vst2.8 {d16, d17, d18, d19}, [r0:64] @ encoding: [0x1f,0x03,0x40,0xf4] +@ CHECK: vst2.16 {d16, d17, d18, d19}, [r0:128] @ encoding: [0x6f,0x03,0x40,0xf4] +@ CHECK: vst2.32 {d16, d17, d18, d19}, [r0:256] @ encoding: [0xbf,0x03,0x40,0xf4] +@ CHECK: vst2.8 {d16, d17}, [r0:64]! @ encoding: [0x1d,0x08,0x40,0xf4] +@ CHECK: vst2.16 {d30, d31}, [r0:128]! @ encoding: [0x6d,0xe8,0x40,0xf4] @ CHECK: vst2.32 {d14, d15}, [r0]! 
@ encoding: [0x8d,0xe8,0x00,0xf4] -@ CHECK: vst2.8 {d16, d17, d18, d19}, [r0, :64]! @ encoding: [0x1d,0x03,0x40,0xf4] -@ CHECK: vst2.16 {d18, d19, d20, d21}, [r0, :128]! @ encoding: [0x6d,0x23,0x40,0xf4] -@ CHECK: vst2.32 {d8, d9, d10, d11}, [r0, :256]! @ encoding: [0xbd,0x83,0x00,0xf4] +@ CHECK: vst2.8 {d16, d17, d18, d19}, [r0:64]! @ encoding: [0x1d,0x03,0x40,0xf4] +@ CHECK: vst2.16 {d18, d19, d20, d21}, [r0:128]! @ encoding: [0x6d,0x23,0x40,0xf4] +@ CHECK: vst2.32 {d8, d9, d10, d11}, [r0:256]! @ encoding: [0xbd,0x83,0x00,0xf4] vst3.8 {d16, d17, d18}, [r1] vst3.16 {d6, d7, d8}, [r2] vst3.32 {d1, d2, d3}, [r3] - vst3.8 {d16, d18, d20}, [r0, :64] + vst3.8 {d16, d18, d20}, [r0:64] vst3.u16 {d27, d29, d31}, [r4] vst3.i32 {d6, d8, d10}, [r5] @@ -75,14 +75,14 @@ vst3.p8 {d6, d7, d8}, [r8]! vst3.16 {d9, d10, d11}, [r7]! vst3.f32 {d1, d2, d3}, [r6]! - vst3.8 {d16, d18, d20}, [r0, :64]! + vst3.8 {d16, d18, d20}, [r0:64]! vst3.p16 {d20, d22, d24}, [r5]! vst3.32 {d5, d7, d9}, [r4]! @ CHECK: vst3.8 {d16, d17, d18}, [r1] @ encoding: [0x0f,0x04,0x41,0xf4] @ CHECK: vst3.16 {d6, d7, d8}, [r2] @ encoding: [0x4f,0x64,0x02,0xf4] @ CHECK: vst3.32 {d1, d2, d3}, [r3] @ encoding: [0x8f,0x14,0x03,0xf4] -@ CHECK: vst3.8 {d16, d18, d20}, [r0, :64] @ encoding: [0x1f,0x05,0x40,0xf4] +@ CHECK: vst3.8 {d16, d18, d20}, [r0:64] @ encoding: [0x1f,0x05,0x40,0xf4] @ CHECK: vst3.16 {d27, d29, d31}, [r4] @ encoding: [0x4f,0xb5,0x44,0xf4] @ CHECK: vst3.32 {d6, d8, d10}, [r5] @ encoding: [0x8f,0x65,0x05,0xf4] @ CHECK: vst3.8 {d12, d13, d14}, [r6], r1 @ encoding: [0x01,0xc4,0x06,0xf4] @@ -94,85 +94,85 @@ @ CHECK: vst3.8 {d6, d7, d8}, [r8]! @ encoding: [0x0d,0x64,0x08,0xf4] @ CHECK: vst3.16 {d9, d10, d11}, [r7]! @ encoding: [0x4d,0x94,0x07,0xf4] @ CHECK: vst3.32 {d1, d2, d3}, [r6]! @ encoding: [0x8d,0x14,0x06,0xf4] -@ CHECK: vst3.8 {d16, d18, d20}, [r0, :64]! @ encoding: [0x1d,0x05,0x40,0xf4] +@ CHECK: vst3.8 {d16, d18, d20}, [r0:64]! @ encoding: [0x1d,0x05,0x40,0xf4] @ CHECK: vst3.16 {d20, d22, d24}, [r5]! @ encoding: [0x4d,0x45,0x45,0xf4] @ CHECK: vst3.32 {d5, d7, d9}, [r4]! @ encoding: [0x8d,0x55,0x04,0xf4] - vst4.8 {d16, d17, d18, d19}, [r1, :64] - vst4.16 {d16, d17, d18, d19}, [r2, :128] - vst4.32 {d16, d17, d18, d19}, [r3, :256] - vst4.8 {d17, d19, d21, d23}, [r5, :256] + vst4.8 {d16, d17, d18, d19}, [r1:64] + vst4.16 {d16, d17, d18, d19}, [r2:128] + vst4.32 {d16, d17, d18, d19}, [r3:256] + vst4.8 {d17, d19, d21, d23}, [r5:256] vst4.16 {d17, d19, d21, d23}, [r7] vst4.32 {d16, d18, d20, d22}, [r8] - vst4.s8 {d16, d17, d18, d19}, [r1, :64]! - vst4.s16 {d16, d17, d18, d19}, [r2, :128]! - vst4.s32 {d16, d17, d18, d19}, [r3, :256]! - vst4.u8 {d17, d19, d21, d23}, [r5, :256]! + vst4.s8 {d16, d17, d18, d19}, [r1:64]! + vst4.s16 {d16, d17, d18, d19}, [r2:128]! + vst4.s32 {d16, d17, d18, d19}, [r3:256]! + vst4.u8 {d17, d19, d21, d23}, [r5:256]! vst4.u16 {d17, d19, d21, d23}, [r7]! vst4.u32 {d16, d18, d20, d22}, [r8]! 
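@ Each register list is stored through the three addressing variants:
@ [rN] leaves the base register alone, [rN]! writes the advanced
@ address back to the base, and [rN], rM post-increments the base by
@ rM instead.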
- vst4.p8 {d16, d17, d18, d19}, [r1, :64], r8 + vst4.p8 {d16, d17, d18, d19}, [r1:64], r8 vst4.p16 {d16, d17, d18, d19}, [r2], r7 - vst4.f32 {d16, d17, d18, d19}, [r3, :64], r5 - vst4.i8 {d16, d18, d20, d22}, [r4, :256], r2 + vst4.f32 {d16, d17, d18, d19}, [r3:64], r5 + vst4.i8 {d16, d18, d20, d22}, [r4:256], r2 vst4.i16 {d16, d18, d20, d22}, [r6], r3 vst4.i32 {d17, d19, d21, d23}, [r9], r4 -@ CHECK: vst4.8 {d16, d17, d18, d19}, [r1, :64] @ encoding: [0x1f,0x00,0x41,0xf4] -@ CHECK: vst4.16 {d16, d17, d18, d19}, [r2, :128] @ encoding: [0x6f,0x00,0x42,0xf4] -@ CHECK: vst4.32 {d16, d17, d18, d19}, [r3, :256] @ encoding: [0xbf,0x00,0x43,0xf4] -@ CHECK: vst4.8 {d17, d19, d21, d23}, [r5, :256] @ encoding: [0x3f,0x11,0x45,0xf4] +@ CHECK: vst4.8 {d16, d17, d18, d19}, [r1:64] @ encoding: [0x1f,0x00,0x41,0xf4] +@ CHECK: vst4.16 {d16, d17, d18, d19}, [r2:128] @ encoding: [0x6f,0x00,0x42,0xf4] +@ CHECK: vst4.32 {d16, d17, d18, d19}, [r3:256] @ encoding: [0xbf,0x00,0x43,0xf4] +@ CHECK: vst4.8 {d17, d19, d21, d23}, [r5:256] @ encoding: [0x3f,0x11,0x45,0xf4] @ CHECK: vst4.16 {d17, d19, d21, d23}, [r7] @ encoding: [0x4f,0x11,0x47,0xf4] @ CHECK: vst4.32 {d16, d18, d20, d22}, [r8] @ encoding: [0x8f,0x01,0x48,0xf4] -@ CHECK: vst4.8 {d16, d17, d18, d19}, [r1, :64]! @ encoding: [0x1d,0x00,0x41,0xf4] -@ CHECK: vst4.16 {d16, d17, d18, d19}, [r2, :128]! @ encoding: [0x6d,0x00,0x42,0xf4] -@ CHECK: vst4.32 {d16, d17, d18, d19}, [r3, :256]! @ encoding: [0xbd,0x00,0x43,0xf4] -@ CHECK: vst4.8 {d17, d19, d21, d23}, [r5, :256]! @ encoding: [0x3d,0x11,0x45,0xf4] +@ CHECK: vst4.8 {d16, d17, d18, d19}, [r1:64]! @ encoding: [0x1d,0x00,0x41,0xf4] +@ CHECK: vst4.16 {d16, d17, d18, d19}, [r2:128]! @ encoding: [0x6d,0x00,0x42,0xf4] +@ CHECK: vst4.32 {d16, d17, d18, d19}, [r3:256]! @ encoding: [0xbd,0x00,0x43,0xf4] +@ CHECK: vst4.8 {d17, d19, d21, d23}, [r5:256]! @ encoding: [0x3d,0x11,0x45,0xf4] @ CHECK: vst4.16 {d17, d19, d21, d23}, [r7]! @ encoding: [0x4d,0x11,0x47,0xf4] @ CHECK: vst4.32 {d16, d18, d20, d22}, [r8]! @ encoding: [0x8d,0x01,0x48,0xf4] -@ CHECK: vst4.8 {d16, d17, d18, d19}, [r1, :64], r8 @ encoding: [0x18,0x00,0x41,0xf4] +@ CHECK: vst4.8 {d16, d17, d18, d19}, [r1:64], r8 @ encoding: [0x18,0x00,0x41,0xf4] @ CHECK: vst4.16 {d16, d17, d18, d19}, [r2], r7 @ encoding: [0x47,0x00,0x42,0xf4] -@ CHECK: vst4.32 {d16, d17, d18, d19}, [r3, :64], r5 @ encoding: [0x95,0x00,0x43,0xf4] -@ CHECK: vst4.8 {d16, d18, d20, d22}, [r4, :256], r2 @ encoding: [0x32,0x01,0x44,0xf4] +@ CHECK: vst4.32 {d16, d17, d18, d19}, [r3:64], r5 @ encoding: [0x95,0x00,0x43,0xf4] +@ CHECK: vst4.8 {d16, d18, d20, d22}, [r4:256], r2 @ encoding: [0x32,0x01,0x44,0xf4] @ CHECK: vst4.16 {d16, d18, d20, d22}, [r6], r3 @ encoding: [0x43,0x01,0x46,0xf4] @ CHECK: vst4.32 {d17, d19, d21, d23}, [r9], r4 @ encoding: [0x84,0x11,0x49,0xf4] - vst2.8 {d16[1], d17[1]}, [r0, :16] - vst2.p16 {d16[1], d17[1]}, [r0, :32] + vst2.8 {d16[1], d17[1]}, [r0:16] + vst2.p16 {d16[1], d17[1]}, [r0:32] vst2.i32 {d16[1], d17[1]}, [r0] vst2.u16 {d17[1], d19[1]}, [r0] - vst2.f32 {d17[0], d19[0]}, [r0, :64] + vst2.f32 {d17[0], d19[0]}, [r0:64] vst2.8 {d2[4], d3[4]}, [r2], r3 vst2.u8 {d2[4], d3[4]}, [r2]! vst2.p8 {d2[4], d3[4]}, [r2] vst2.16 {d17[1], d19[1]}, [r0] - vst2.32 {d17[0], d19[0]}, [r0, :64] + vst2.32 {d17[0], d19[0]}, [r0:64] vst2.i16 {d7[1], d9[1]}, [r1]! - vst2.32 {d6[0], d8[0]}, [r2, :64]! + vst2.32 {d6[0], d8[0]}, [r2:64]! 
vst2.16 {d2[1], d4[1]}, [r3], r5 - vst2.u32 {d5[0], d7[0]}, [r4, :64], r7 + vst2.u32 {d5[0], d7[0]}, [r4:64], r7 -@ CHECK: vst2.8 {d16[1], d17[1]}, [r0, :16] @ encoding: [0x3f,0x01,0xc0,0xf4] -@ CHECK: vst2.16 {d16[1], d17[1]}, [r0, :32] @ encoding: [0x5f,0x05,0xc0,0xf4] +@ CHECK: vst2.8 {d16[1], d17[1]}, [r0:16] @ encoding: [0x3f,0x01,0xc0,0xf4] +@ CHECK: vst2.16 {d16[1], d17[1]}, [r0:32] @ encoding: [0x5f,0x05,0xc0,0xf4] @ CHECK: vst2.32 {d16[1], d17[1]}, [r0] @ encoding: [0x8f,0x09,0xc0,0xf4] @ CHECK: vst2.16 {d17[1], d19[1]}, [r0] @ encoding: [0x6f,0x15,0xc0,0xf4] -@ CHECK: vst2.32 {d17[0], d19[0]}, [r0, :64] @ encoding: [0x5f,0x19,0xc0,0xf4] +@ CHECK: vst2.32 {d17[0], d19[0]}, [r0:64] @ encoding: [0x5f,0x19,0xc0,0xf4] @ CHECK: vst2.8 {d2[4], d3[4]}, [r2], r3 @ encoding: [0x83,0x21,0x82,0xf4] @ CHECK: vst2.8 {d2[4], d3[4]}, [r2]! @ encoding: [0x8d,0x21,0x82,0xf4] @ CHECK: vst2.8 {d2[4], d3[4]}, [r2] @ encoding: [0x8f,0x21,0x82,0xf4] @ CHECK: vst2.16 {d17[1], d19[1]}, [r0] @ encoding: [0x6f,0x15,0xc0,0xf4] -@ CHECK: vst2.32 {d17[0], d19[0]}, [r0, :64] @ encoding: [0x5f,0x19,0xc0,0xf4] +@ CHECK: vst2.32 {d17[0], d19[0]}, [r0:64] @ encoding: [0x5f,0x19,0xc0,0xf4] @ CHECK: vst2.16 {d7[1], d9[1]}, [r1]! @ encoding: [0x6d,0x75,0x81,0xf4] -@ CHECK: vst2.32 {d6[0], d8[0]}, [r2, :64]! @ encoding: [0x5d,0x69,0x82,0xf4] +@ CHECK: vst2.32 {d6[0], d8[0]}, [r2:64]! @ encoding: [0x5d,0x69,0x82,0xf4] @ CHECK: vst2.16 {d2[1], d4[1]}, [r3], r5 @ encoding: [0x65,0x25,0x83,0xf4] -@ CHECK: vst2.32 {d5[0], d7[0]}, [r4, :64], r7 @ encoding: [0x57,0x59,0x84,0xf4] +@ CHECK: vst2.32 {d5[0], d7[0]}, [r4:64], r7 @ encoding: [0x57,0x59,0x84,0xf4] vst3.8 {d16[1], d17[1], d18[1]}, [r1] @@ -216,15 +216,15 @@ vst4.16 {d17[1], d19[1], d21[1], d23[1]}, [r7] vst4.32 {d16[1], d18[1], d20[1], d22[1]}, [r8] - vst4.s8 {d16[1], d17[1], d18[1], d19[1]}, [r1, :32]! - vst4.s16 {d16[1], d17[1], d18[1], d19[1]}, [r2, :64]! - vst4.s32 {d16[1], d17[1], d18[1], d19[1]}, [r3, :128]! + vst4.s8 {d16[1], d17[1], d18[1], d19[1]}, [r1:32]! + vst4.s16 {d16[1], d17[1], d18[1], d19[1]}, [r2:64]! + vst4.s32 {d16[1], d17[1], d18[1], d19[1]}, [r3:128]! vst4.u16 {d17[1], d19[1], d21[1], d23[1]}, [r7]! vst4.u32 {d16[1], d18[1], d20[1], d22[1]}, [r8]! - vst4.p8 {d16[1], d17[1], d18[1], d19[1]}, [r1, :32], r8 + vst4.p8 {d16[1], d17[1], d18[1], d19[1]}, [r1:32], r8 vst4.p16 {d16[1], d17[1], d18[1], d19[1]}, [r2], r7 - vst4.f32 {d16[1], d17[1], d18[1], d19[1]}, [r3, :64], r5 + vst4.f32 {d16[1], d17[1], d18[1], d19[1]}, [r3:64], r5 vst4.i16 {d16[1], d18[1], d20[1], d22[1]}, [r6], r3 vst4.i32 {d17[1], d19[1], d21[1], d23[1]}, [r9], r4 @@ -233,14 +233,14 @@ @ CHECK: vst4.32 {d16[1], d17[1], d18[1], d19[1]}, [r3] @ encoding: [0x8f,0x0b,0xc3,0xf4] @ CHECK: vst4.16 {d17[1], d19[1], d21[1], d23[1]}, [r7] @ encoding: [0x6f,0x17,0xc7,0xf4] @ CHECK: vst4.32 {d16[1], d18[1], d20[1], d22[1]}, [r8] @ encoding: [0xcf,0x0b,0xc8,0xf4] -@ CHECK: vst4.8 {d16[1], d17[1], d18[1], d19[1]}, [r1, :32]! @ encoding: [0x3d,0x03,0xc1,0xf4] -@ CHECK: vst4.16 {d16[1], d17[1], d18[1], d19[1]}, [r2, :64]! @ encoding: [0x5d,0x07,0xc2,0xf4] -@ CHECK: vst4.32 {d16[1], d17[1], d18[1], d19[1]}, [r3, :128]! @ encoding: [0xad,0x0b,0xc3,0xf4] +@ CHECK: vst4.8 {d16[1], d17[1], d18[1], d19[1]}, [r1:32]! @ encoding: [0x3d,0x03,0xc1,0xf4] +@ CHECK: vst4.16 {d16[1], d17[1], d18[1], d19[1]}, [r2:64]! @ encoding: [0x5d,0x07,0xc2,0xf4] +@ CHECK: vst4.32 {d16[1], d17[1], d18[1], d19[1]}, [r3:128]! @ encoding: [0xad,0x0b,0xc3,0xf4] @ CHECK: vst4.16 {d17[1], d18[1], d19[1], d20[1]}, [r7]! 
@ encoding: [0x6d,0x17,0xc7,0xf4] @ CHECK: vst4.32 {d16[1], d18[1], d20[1], d22[1]}, [r8]! @ encoding: [0xcd,0x0b,0xc8,0xf4] -@ CHECK: vst4.8 {d16[1], d17[1], d18[1], d19[1]}, [r1, :32], r8 @ encoding: [0x38,0x03,0xc1,0xf4] +@ CHECK: vst4.8 {d16[1], d17[1], d18[1], d19[1]}, [r1:32], r8 @ encoding: [0x38,0x03,0xc1,0xf4] @ CHECK: vst4.16 {d16[1], d17[1], d18[1], d19[1]}, [r2], r7 @ encoding: [0x47,0x07,0xc2,0xf4] -@ CHECK: vst4.32 {d16[1], d17[1], d18[1], d19[1]}, [r3, :64], r5 @ encoding: [0x95,0x0b,0xc3,0xf4] +@ CHECK: vst4.32 {d16[1], d17[1], d18[1], d19[1]}, [r3:64], r5 @ encoding: [0x95,0x0b,0xc3,0xf4] @ CHECK: vst4.16 {d16[1], d18[1], d20[1], d22[1]}, [r6], r3 @ encoding: [0x63,0x07,0xc6,0xf4] @ CHECK: vst4.32 {d17[1], d19[1], d21[1], d23[1]}, [r9], r4 @ encoding: [0xc4,0x1b,0xc9,0xf4] @@ -269,10 +269,17 @@ vst2.8 {d8, d10}, [r4] @ CHECK: vst2.8 {d8, d10}, [r4] @ encoding: [0x0f,0x89,0x04,0xf4] - vst1.32 {d9[1]}, [r3, :32] - vst1.32 {d27[1]}, [r9, :32]! - vst1.32 {d27[1]}, [r3, :32], r5 -@ CHECK: vst1.32 {d9[1]}, [r3, :32] @ encoding: [0xbf,0x98,0x83,0xf4] -@ CHECK: vst1.32 {d27[1]}, [r9, :32]! @ encoding: [0xbd,0xb8,0xc9,0xf4] -@ CHECK: vst1.32 {d27[1]}, [r3, :32], r5 @ encoding: [0xb5,0xb8,0xc3,0xf4] + vst1.32 {d9[1]}, [r3:32] + vst1.32 {d27[1]}, [r9:32]! + vst1.32 {d27[1]}, [r3:32], r5 +@ CHECK: vst1.32 {d9[1]}, [r3:32] @ encoding: [0xbf,0x98,0x83,0xf4] +@ CHECK: vst1.32 {d27[1]}, [r9:32]! @ encoding: [0xbd,0xb8,0xc9,0xf4] +@ CHECK: vst1.32 {d27[1]}, [r3:32], r5 @ encoding: [0xb5,0xb8,0xc3,0xf4] +@ verify that the old incorrect alignment specifier syntax (", :") +@ still gets accepted. + vst2.8 {d16, d17}, [r0, :64] + vst2.16 {d16, d17}, [r0, :128] + +@ CHECK: vst2.8 {d16, d17}, [r0:64] @ encoding: [0x1f,0x08,0x40,0xf4] +@ CHECK: vst2.16 {d16, d17}, [r0:128] @ encoding: [0x6f,0x08,0x40,0xf4]
\ No newline at end of file diff --git a/test/MC/ARM/neont2-vld-encoding.s b/test/MC/ARM/neont2-vld-encoding.s index 031205a..7db8552 100644 --- a/test/MC/ARM/neont2-vld-encoding.s +++ b/test/MC/ARM/neont2-vld-encoding.s @@ -3,46 +3,46 @@ .code 16 -@ CHECK: vld1.8 {d16}, [r0, :64] @ encoding: [0x1f,0x07,0x60,0xf9] - vld1.8 {d16}, [r0, :64] +@ CHECK: vld1.8 {d16}, [r0:64] @ encoding: [0x1f,0x07,0x60,0xf9] + vld1.8 {d16}, [r0:64] @ CHECK: vld1.16 {d16}, [r0] @ encoding: [0x4f,0x07,0x60,0xf9] vld1.16 {d16}, [r0] @ CHECK: vld1.32 {d16}, [r0] @ encoding: [0x8f,0x07,0x60,0xf9] vld1.32 {d16}, [r0] @ CHECK: vld1.64 {d16}, [r0] @ encoding: [0xcf,0x07,0x60,0xf9] vld1.64 {d16}, [r0] -@ CHECK: vld1.8 {d16, d17}, [r0, :64] @ encoding: [0x1f,0x0a,0x60,0xf9] - vld1.8 {d16, d17}, [r0, :64] -@ CHECK: vld1.16 {d16, d17}, [r0, :128] @ encoding: [0x6f,0x0a,0x60,0xf9] - vld1.16 {d16, d17}, [r0, :128] +@ CHECK: vld1.8 {d16, d17}, [r0:64] @ encoding: [0x1f,0x0a,0x60,0xf9] + vld1.8 {d16, d17}, [r0:64] +@ CHECK: vld1.16 {d16, d17}, [r0:128] @ encoding: [0x6f,0x0a,0x60,0xf9] + vld1.16 {d16, d17}, [r0:128] @ CHECK: vld1.32 {d16, d17}, [r0] @ encoding: [0x8f,0x0a,0x60,0xf9] vld1.32 {d16, d17}, [r0] @ CHECK: vld1.64 {d16, d17}, [r0] @ encoding: [0xcf,0x0a,0x60,0xf9] vld1.64 {d16, d17}, [r0] -@ CHECK: vld2.8 {d16, d17}, [r0, :64] @ encoding: [0x1f,0x08,0x60,0xf9] - vld2.8 {d16, d17}, [r0, :64] -@ CHECK: vld2.16 {d16, d17}, [r0, :128] @ encoding: [0x6f,0x08,0x60,0xf9] - vld2.16 {d16, d17}, [r0, :128] +@ CHECK: vld2.8 {d16, d17}, [r0:64] @ encoding: [0x1f,0x08,0x60,0xf9] + vld2.8 {d16, d17}, [r0:64] +@ CHECK: vld2.16 {d16, d17}, [r0:128] @ encoding: [0x6f,0x08,0x60,0xf9] + vld2.16 {d16, d17}, [r0:128] @ CHECK: vld2.32 {d16, d17}, [r0] @ encoding: [0x8f,0x08,0x60,0xf9] vld2.32 {d16, d17}, [r0] -@ CHECK: vld2.8 {d16, d17, d18, d19}, [r0, :64] @ encoding: [0x1f,0x03,0x60,0xf9] - vld2.8 {d16, d17, d18, d19}, [r0, :64] -@ CHECK: vld2.16 {d16, d17, d18, d19}, [r0, :128] @ encoding: [0x6f,0x03,0x60,0xf9] - vld2.16 {d16, d17, d18, d19}, [r0, :128] -@ CHECK: vld2.32 {d16, d17, d18, d19}, [r0, :256] @ encoding: [0xbf,0x03,0x60,0xf9] - vld2.32 {d16, d17, d18, d19}, [r0, :256] +@ CHECK: vld2.8 {d16, d17, d18, d19}, [r0:64] @ encoding: [0x1f,0x03,0x60,0xf9] + vld2.8 {d16, d17, d18, d19}, [r0:64] +@ CHECK: vld2.16 {d16, d17, d18, d19}, [r0:128] @ encoding: [0x6f,0x03,0x60,0xf9] + vld2.16 {d16, d17, d18, d19}, [r0:128] +@ CHECK: vld2.32 {d16, d17, d18, d19}, [r0:256] @ encoding: [0xbf,0x03,0x60,0xf9] + vld2.32 {d16, d17, d18, d19}, [r0:256] -@ CHECK: vld3.8 {d16, d17, d18}, [r0, :64] @ encoding: [0x1f,0x04,0x60,0xf9] - vld3.8 {d16, d17, d18}, [r0, :64] +@ CHECK: vld3.8 {d16, d17, d18}, [r0:64] @ encoding: [0x1f,0x04,0x60,0xf9] + vld3.8 {d16, d17, d18}, [r0:64] @ CHECK: vld3.16 {d16, d17, d18}, [r0] @ encoding: [0x4f,0x04,0x60,0xf9] vld3.16 {d16, d17, d18}, [r0] @ CHECK: vld3.32 {d16, d17, d18}, [r0] @ encoding: [0x8f,0x04,0x60,0xf9] vld3.32 {d16, d17, d18}, [r0] -@ CHECK: vld3.8 {d16, d18, d20}, [r0, :64]! @ encoding: [0x1d,0x05,0x60,0xf9] - vld3.8 {d16, d18, d20}, [r0, :64]! -@ CHECK: vld3.8 {d17, d19, d21}, [r0, :64]! @ encoding: [0x1d,0x15,0x60,0xf9] - vld3.8 {d17, d19, d21}, [r0, :64]! +@ CHECK: vld3.8 {d16, d18, d20}, [r0:64]! @ encoding: [0x1d,0x05,0x60,0xf9] + vld3.8 {d16, d18, d20}, [r0:64]! +@ CHECK: vld3.8 {d17, d19, d21}, [r0:64]! @ encoding: [0x1d,0x15,0x60,0xf9] + vld3.8 {d17, d19, d21}, [r0:64]! @ CHECK: vld3.16 {d16, d18, d20}, [r0]! @ encoding: [0x4d,0x05,0x60,0xf9] vld3.16 {d16, d18, d20}, [r0]! 
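@ Note the same mnemonics and the same alignment-suffix rewrite as in
@ the ARM-mode file; under .code 16 these select the Thumb-2 encoding
@ space, visible here as a trailing byte of 0xf9 in the dumps instead
@ of ARM mode's 0xf4.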
@ CHECK: vld3.16 {d17, d19, d21}, [r0]! @ encoding: [0x4d,0x15,0x60,0xf9] @@ -52,16 +52,16 @@ @ CHECK: vld3.32 {d17, d19, d21}, [r0]! @ encoding: [0x8d,0x15,0x60,0xf9] vld3.32 {d17, d19, d21}, [r0]! -@ CHECK: vld4.8 {d16, d17, d18, d19}, [r0, :64] @ encoding: [0x1f,0x00,0x60,0xf9] - vld4.8 {d16, d17, d18, d19}, [r0, :64] -@ CHECK: vld4.16 {d16, d17, d18, d19}, [r0, :128] @ encoding: [0x6f,0x00,0x60,0xf9] - vld4.16 {d16, d17, d18, d19}, [r0, :128] -@ CHECK: vld4.32 {d16, d17, d18, d19}, [r0, :256] @ encoding: [0xbf,0x00,0x60,0xf9] - vld4.32 {d16, d17, d18, d19}, [r0, :256] -@ CHECK: vld4.8 {d16, d18, d20, d22}, [r0, :256]! @ encoding: [0x3d,0x01,0x60,0xf9] - vld4.8 {d16, d18, d20, d22}, [r0, :256]! -@ CHECK: vld4.8 {d17, d19, d21, d23}, [r0, :256]! @ encoding: [0x3d,0x11,0x60,0xf9] - vld4.8 {d17, d19, d21, d23}, [r0, :256]! +@ CHECK: vld4.8 {d16, d17, d18, d19}, [r0:64] @ encoding: [0x1f,0x00,0x60,0xf9] + vld4.8 {d16, d17, d18, d19}, [r0:64] +@ CHECK: vld4.16 {d16, d17, d18, d19}, [r0:128] @ encoding: [0x6f,0x00,0x60,0xf9] + vld4.16 {d16, d17, d18, d19}, [r0:128] +@ CHECK: vld4.32 {d16, d17, d18, d19}, [r0:256] @ encoding: [0xbf,0x00,0x60,0xf9] + vld4.32 {d16, d17, d18, d19}, [r0:256] +@ CHECK: vld4.8 {d16, d18, d20, d22}, [r0:256]! @ encoding: [0x3d,0x01,0x60,0xf9] + vld4.8 {d16, d18, d20, d22}, [r0:256]! +@ CHECK: vld4.8 {d17, d19, d21, d23}, [r0:256]! @ encoding: [0x3d,0x11,0x60,0xf9] + vld4.8 {d17, d19, d21, d23}, [r0:256]! @ CHECK: vld4.16 {d16, d18, d20, d22}, [r0]! @ encoding: [0x4d,0x01,0x60,0xf9] vld4.16 {d16, d18, d20, d22}, [r0]! @ CHECK: vld4.16 {d17, d19, d21, d23}, [r0]! @ encoding: [0x4d,0x11,0x60,0xf9] @@ -73,21 +73,21 @@ @ CHECK: vld1.8 {d16[3]}, [r0] @ encoding: [0x6f,0x00,0xe0,0xf9] vld1.8 {d16[3]}, [r0] -@ CHECK: vld1.16 {d16[2]}, [r0, :16] @ encoding: [0x9f,0x04,0xe0,0xf9] - vld1.16 {d16[2]}, [r0, :16] -@ CHECK: vld1.32 {d16[1]}, [r0, :32] @ encoding: [0xbf,0x08,0xe0,0xf9] - vld1.32 {d16[1]}, [r0, :32] +@ CHECK: vld1.16 {d16[2]}, [r0:16] @ encoding: [0x9f,0x04,0xe0,0xf9] + vld1.16 {d16[2]}, [r0:16] +@ CHECK: vld1.32 {d16[1]}, [r0:32] @ encoding: [0xbf,0x08,0xe0,0xf9] + vld1.32 {d16[1]}, [r0:32] -@ CHECK: vld2.8 {d16[1], d17[1]}, [r0, :16] @ encoding: [0x3f,0x01,0xe0,0xf9] - vld2.8 {d16[1], d17[1]}, [r0, :16] -@ CHECK: vld2.16 {d16[1], d17[1]}, [r0, :32] @ encoding: [0x5f,0x05,0xe0,0xf9] - vld2.16 {d16[1], d17[1]}, [r0, :32] +@ CHECK: vld2.8 {d16[1], d17[1]}, [r0:16] @ encoding: [0x3f,0x01,0xe0,0xf9] + vld2.8 {d16[1], d17[1]}, [r0:16] +@ CHECK: vld2.16 {d16[1], d17[1]}, [r0:32] @ encoding: [0x5f,0x05,0xe0,0xf9] + vld2.16 {d16[1], d17[1]}, [r0:32] @ CHECK: vld2.32 {d16[1], d17[1]}, [r0] @ encoding: [0x8f,0x09,0xe0,0xf9] vld2.32 {d16[1], d17[1]}, [r0] @ CHECK: vld2.16 {d17[1], d19[1]}, [r0] @ encoding: [0x6f,0x15,0xe0,0xf9] vld2.16 {d17[1], d19[1]}, [r0] -@ CHECK: vld2.32 {d17[0], d19[0]}, [r0, :64] @ encoding: [0x5f,0x19,0xe0,0xf9] - vld2.32 {d17[0], d19[0]}, [r0, :64] +@ CHECK: vld2.32 {d17[0], d19[0]}, [r0:64] @ encoding: [0x5f,0x19,0xe0,0xf9] + vld2.32 {d17[0], d19[0]}, [r0:64] @ CHECK: vld3.8 {d16[1], d17[1], d18[1]}, [r0] @ encoding: [0x2f,0x02,0xe0,0xf9] vld3.8 {d16[1], d17[1], d18[1]}, [r0] @@ -100,13 +100,13 @@ @ CHECK: vld3.32 {d17[1], d19[1], d21[1]}, [r0] @ encoding: [0xcf,0x1a,0xe0,0xf9] vld3.32 {d17[1], d19[1], d21[1]}, [r0] -@ CHECK: vld4.8 {d16[1], d17[1], d18[1], d19[1]}, [r0, :32] @ encoding: [0x3f,0x03,0xe0,0xf9] - vld4.8 {d16[1], d17[1], d18[1], d19[1]}, [r0, :32] +@ CHECK: vld4.8 {d16[1], d17[1], d18[1], d19[1]}, [r0:32] @ encoding: 
[0x3f,0x03,0xe0,0xf9] + vld4.8 {d16[1], d17[1], d18[1], d19[1]}, [r0:32] @ CHECK: vld4.16 {d16[1], d17[1], d18[1], d19[1]}, [r0] @ encoding: [0x4f,0x07,0xe0,0xf9] vld4.16 {d16[1], d17[1], d18[1], d19[1]}, [r0] -@ CHECK: vld4.32 {d16[1], d17[1], d18[1], d19[1]}, [r0, :128] @ encoding: [0xaf,0x0b,0xe0,0xf9] - vld4.32 {d16[1], d17[1], d18[1], d19[1]}, [r0, :128] -@ CHECK: vld4.16 {d16[1], d18[1], d20[1], d22[1]}, [r0, :64] @ encoding: [0x7f,0x07,0xe0,0xf9] - vld4.16 {d16[1], d18[1], d20[1], d22[1]}, [r0, :64] +@ CHECK: vld4.32 {d16[1], d17[1], d18[1], d19[1]}, [r0:128] @ encoding: [0xaf,0x0b,0xe0,0xf9] + vld4.32 {d16[1], d17[1], d18[1], d19[1]}, [r0:128] +@ CHECK: vld4.16 {d16[1], d18[1], d20[1], d22[1]}, [r0:64] @ encoding: [0x7f,0x07,0xe0,0xf9] + vld4.16 {d16[1], d18[1], d20[1], d22[1]}, [r0:64] @ CHECK: vld4.32 {d17[0], d19[0], d21[0], d23[0]}, [r0] @ encoding: [0x4f,0x1b,0xe0,0xf9] vld4.32 {d17[0], d19[0], d21[0], d23[0]}, [r0] diff --git a/test/MC/ARM/neont2-vst-encoding.s b/test/MC/ARM/neont2-vst-encoding.s index b50d8b6..9adf751 100644 --- a/test/MC/ARM/neont2-vst-encoding.s +++ b/test/MC/ARM/neont2-vst-encoding.s @@ -3,46 +3,46 @@ .code 16 -@ CHECK: vst1.8 {d16}, [r0, :64] @ encoding: [0x1f,0x07,0x40,0xf9] - vst1.8 {d16}, [r0, :64] +@ CHECK: vst1.8 {d16}, [r0:64] @ encoding: [0x1f,0x07,0x40,0xf9] + vst1.8 {d16}, [r0:64] @ CHECK: vst1.16 {d16}, [r0] @ encoding: [0x4f,0x07,0x40,0xf9] vst1.16 {d16}, [r0] @ CHECK: vst1.32 {d16}, [r0] @ encoding: [0x8f,0x07,0x40,0xf9] vst1.32 {d16}, [r0] @ CHECK: vst1.64 {d16}, [r0] @ encoding: [0xcf,0x07,0x40,0xf9] vst1.64 {d16}, [r0] -@ CHECK: vst1.8 {d16, d17}, [r0, :64] @ encoding: [0x1f,0x0a,0x40,0xf9] - vst1.8 {d16, d17}, [r0, :64] -@ CHECK: vst1.16 {d16, d17}, [r0, :128] @ encoding: [0x6f,0x0a,0x40,0xf9] - vst1.16 {d16, d17}, [r0, :128] +@ CHECK: vst1.8 {d16, d17}, [r0:64] @ encoding: [0x1f,0x0a,0x40,0xf9] + vst1.8 {d16, d17}, [r0:64] +@ CHECK: vst1.16 {d16, d17}, [r0:128] @ encoding: [0x6f,0x0a,0x40,0xf9] + vst1.16 {d16, d17}, [r0:128] @ CHECK: vst1.32 {d16, d17}, [r0] @ encoding: [0x8f,0x0a,0x40,0xf9] vst1.32 {d16, d17}, [r0] @ CHECK: vst1.64 {d16, d17}, [r0] @ encoding: [0xcf,0x0a,0x40,0xf9] vst1.64 {d16, d17}, [r0] -@ CHECK: vst2.8 {d16, d17}, [r0, :64] @ encoding: [0x1f,0x08,0x40,0xf9] - vst2.8 {d16, d17}, [r0, :64] -@ CHECK: vst2.16 {d16, d17}, [r0, :128] @ encoding: [0x6f,0x08,0x40,0xf9] - vst2.16 {d16, d17}, [r0, :128] +@ CHECK: vst2.8 {d16, d17}, [r0:64] @ encoding: [0x1f,0x08,0x40,0xf9] + vst2.8 {d16, d17}, [r0:64] +@ CHECK: vst2.16 {d16, d17}, [r0:128] @ encoding: [0x6f,0x08,0x40,0xf9] + vst2.16 {d16, d17}, [r0:128] @ CHECK: vst2.32 {d16, d17}, [r0] @ encoding: [0x8f,0x08,0x40,0xf9] vst2.32 {d16, d17}, [r0] -@ CHECK: vst2.8 {d16, d17, d18, d19}, [r0, :64] @ encoding: [0x1f,0x03,0x40,0xf9] - vst2.8 {d16, d17, d18, d19}, [r0, :64] -@ CHECK: vst2.16 {d16, d17, d18, d19}, [r0, :128] @ encoding: [0x6f,0x03,0x40,0xf9] - vst2.16 {d16, d17, d18, d19}, [r0, :128] -@ CHECK: vst2.32 {d16, d17, d18, d19}, [r0, :256] @ encoding: [0xbf,0x03,0x40,0xf9] - vst2.32 {d16, d17, d18, d19}, [r0, :256] +@ CHECK: vst2.8 {d16, d17, d18, d19}, [r0:64] @ encoding: [0x1f,0x03,0x40,0xf9] + vst2.8 {d16, d17, d18, d19}, [r0:64] +@ CHECK: vst2.16 {d16, d17, d18, d19}, [r0:128] @ encoding: [0x6f,0x03,0x40,0xf9] + vst2.16 {d16, d17, d18, d19}, [r0:128] +@ CHECK: vst2.32 {d16, d17, d18, d19}, [r0:256] @ encoding: [0xbf,0x03,0x40,0xf9] + vst2.32 {d16, d17, d18, d19}, [r0:256] -@ CHECK: vst3.8 {d16, d17, d18}, [r0, :64] @ encoding: [0x1f,0x04,0x40,0xf9] - vst3.8 {d16, d17, 
d18}, [r0, :64] +@ CHECK: vst3.8 {d16, d17, d18}, [r0:64] @ encoding: [0x1f,0x04,0x40,0xf9] + vst3.8 {d16, d17, d18}, [r0:64] @ CHECK: vst3.16 {d16, d17, d18}, [r0] @ encoding: [0x4f,0x04,0x40,0xf9] vst3.16 {d16, d17, d18}, [r0] @ CHECK: vst3.32 {d16, d17, d18}, [r0] @ encoding: [0x8f,0x04,0x40,0xf9] vst3.32 {d16, d17, d18}, [r0] -@ CHECK: vst3.8 {d16, d18, d20}, [r0, :64]! @ encoding: [0x1d,0x05,0x40,0xf9] - vst3.8 {d16, d18, d20}, [r0, :64]! -@ CHECK: vst3.8 {d17, d19, d21}, [r0, :64]! @ encoding: [0x1d,0x15,0x40,0xf9] - vst3.8 {d17, d19, d21}, [r0, :64]! +@ CHECK: vst3.8 {d16, d18, d20}, [r0:64]! @ encoding: [0x1d,0x05,0x40,0xf9] + vst3.8 {d16, d18, d20}, [r0:64]! +@ CHECK: vst3.8 {d17, d19, d21}, [r0:64]! @ encoding: [0x1d,0x15,0x40,0xf9] + vst3.8 {d17, d19, d21}, [r0:64]! @ CHECK: vst3.16 {d16, d18, d20}, [r0]! @ encoding: [0x4d,0x05,0x40,0xf9] vst3.16 {d16, d18, d20}, [r0]! @ CHECK: vst3.16 {d17, d19, d21}, [r0]! @ encoding: [0x4d,0x15,0x40,0xf9] @@ -52,14 +52,14 @@ @ CHECK: vst3.32 {d17, d19, d21}, [r0]! @ encoding: [0x8d,0x15,0x40,0xf9] vst3.32 {d17, d19, d21}, [r0]! -@ CHECK: vst4.8 {d16, d17, d18, d19}, [r0, :64] @ encoding: [0x1f,0x00,0x40,0xf9] - vst4.8 {d16, d17, d18, d19}, [r0, :64] -@ CHECK: vst4.16 {d16, d17, d18, d19}, [r0, :128] @ encoding: [0x6f,0x00,0x40,0xf9] - vst4.16 {d16, d17, d18, d19}, [r0, :128] -@ CHECK: vst4.8 {d16, d18, d20, d22}, [r0, :256]! @ encoding: [0x3d,0x01,0x40,0xf9] - vst4.8 {d16, d18, d20, d22}, [r0, :256]! -@ CHECK: vst4.8 {d17, d19, d21, d23}, [r0, :256]! @ encoding: [0x3d,0x11,0x40,0xf9] - vst4.8 {d17, d19, d21, d23}, [r0, :256]! +@ CHECK: vst4.8 {d16, d17, d18, d19}, [r0:64] @ encoding: [0x1f,0x00,0x40,0xf9] + vst4.8 {d16, d17, d18, d19}, [r0:64] +@ CHECK: vst4.16 {d16, d17, d18, d19}, [r0:128] @ encoding: [0x6f,0x00,0x40,0xf9] + vst4.16 {d16, d17, d18, d19}, [r0:128] +@ CHECK: vst4.8 {d16, d18, d20, d22}, [r0:256]! @ encoding: [0x3d,0x01,0x40,0xf9] + vst4.8 {d16, d18, d20, d22}, [r0:256]! +@ CHECK: vst4.8 {d17, d19, d21, d23}, [r0:256]! @ encoding: [0x3d,0x11,0x40,0xf9] + vst4.8 {d17, d19, d21, d23}, [r0:256]! @ CHECK: vst4.16 {d16, d18, d20, d22}, [r0]! @ encoding: [0x4d,0x01,0x40,0xf9] vst4.16 {d16, d18, d20, d22}, [r0]! @ CHECK: vst4.16 {d17, d19, d21, d23}, [r0]! @ encoding: [0x4d,0x11,0x40,0xf9] @@ -69,16 +69,16 @@ @ CHECK: vst4.32 {d17, d19, d21, d23}, [r0]! @ encoding: [0x8d,0x11,0x40,0xf9] vst4.32 {d17, d19, d21, d23}, [r0]! 
-@ CHECK: vst2.8 {d16[1], d17[1]}, [r0, :16] @ encoding: [0x3f,0x01,0xc0,0xf9] - vst2.8 {d16[1], d17[1]}, [r0, :16] -@ CHECK: vst2.16 {d16[1], d17[1]}, [r0, :32] @ encoding: [0x5f,0x05,0xc0,0xf9] - vst2.16 {d16[1], d17[1]}, [r0, :32] +@ CHECK: vst2.8 {d16[1], d17[1]}, [r0:16] @ encoding: [0x3f,0x01,0xc0,0xf9] + vst2.8 {d16[1], d17[1]}, [r0:16] +@ CHECK: vst2.16 {d16[1], d17[1]}, [r0:32] @ encoding: [0x5f,0x05,0xc0,0xf9] + vst2.16 {d16[1], d17[1]}, [r0:32] @ CHECK: vst2.32 {d16[1], d17[1]}, [r0] @ encoding: [0x8f,0x09,0xc0,0xf9] vst2.32 {d16[1], d17[1]}, [r0] @ CHECK: vst2.16 {d17[1], d19[1]}, [r0] @ encoding: [0x6f,0x15,0xc0,0xf9] vst2.16 {d17[1], d19[1]}, [r0] -@ CHECK: vst2.32 {d17[0], d19[0]}, [r0, :64] @ encoding: [0x5f,0x19,0xc0,0xf9] - vst2.32 {d17[0], d19[0]}, [r0, :64] +@ CHECK: vst2.32 {d17[0], d19[0]}, [r0:64] @ encoding: [0x5f,0x19,0xc0,0xf9] + vst2.32 {d17[0], d19[0]}, [r0:64] @ CHECK: vst3.8 {d16[1], d17[1], d18[1]}, [r0] @ encoding: [0x2f,0x02,0xc0,0xf9] vst3.8 {d16[1], d17[1], d18[1]}, [r0] @@ -91,14 +91,14 @@ @ CHECK: vst3.32 {d16[0], d18[0], d20[0]}, [r0] @ encoding: [0x4f,0x0a,0xc0,0xf9] vst3.32 {d16[0], d18[0], d20[0]}, [r0] -@ CHECK: vst4.8 {d16[1], d17[1], d18[1], d19[1]}, [r0, :32] @ encoding: [0x3f,0x03,0xc0,0xf9] - vst4.8 {d16[1], d17[1], d18[1], d19[1]}, [r0, :32] +@ CHECK: vst4.8 {d16[1], d17[1], d18[1], d19[1]}, [r0:32] @ encoding: [0x3f,0x03,0xc0,0xf9] + vst4.8 {d16[1], d17[1], d18[1], d19[1]}, [r0:32] @ CHECK: vst4.16 {d16[1], d17[1], d18[1], d19[1]}, [r0] @ encoding: [0x4f,0x07,0xc0,0xf9] vst4.16 {d16[1], d17[1], d18[1], d19[1]}, [r0] -@ CHECK: vst4.32 {d16[1], d17[1], d18[1], d19[1]}, [r0, :128] @ encoding: [0xaf,0x0b,0xc0,0xf9] - vst4.32 {d16[1], d17[1], d18[1], d19[1]}, [r0, :128] -@ CHECK: vst4.16 {d17[3], d19[3], d21[3], d23[3]}, [r0, :64] @ encoding: [0xff,0x17,0xc0,0xf9] - vst4.16 {d17[3], d19[3], d21[3], d23[3]}, [r0, :64] +@ CHECK: vst4.32 {d16[1], d17[1], d18[1], d19[1]}, [r0:128] @ encoding: [0xaf,0x0b,0xc0,0xf9] + vst4.32 {d16[1], d17[1], d18[1], d19[1]}, [r0:128] +@ CHECK: vst4.16 {d17[3], d19[3], d21[3], d23[3]}, [r0:64] @ encoding: [0xff,0x17,0xc0,0xf9] + vst4.16 {d17[3], d19[3], d21[3], d23[3]}, [r0:64] @ CHECK: vst4.32 {d17[0], d19[0], d21[0], d23[0]}, [r0] @ encoding: [0x4f,0x1b,0xc0,0xf9] vst4.32 {d17[0], d19[0], d21[0], d23[0]}, [r0] diff --git a/test/MC/AsmParser/align_invalid.s b/test/MC/AsmParser/align_invalid.s new file mode 100644 index 0000000..0d06d94 --- /dev/null +++ b/test/MC/AsmParser/align_invalid.s @@ -0,0 +1,10 @@ +# RUN: llvm-mc -triple i386-linux-gnu < %s 2>&1 | FileCheck %s -check-prefix=ELF +# RUN: llvm-mc -triple i386-apple-darwin < %s 2>&1 | FileCheck %s -check-prefix=DARWIN + +.align 3 +# ELF: error: alignment must be a power of 2 +# DARWIN-NOT: error + +.align 32 +# ELF-NOT: error +# DARWIN: error: invalid alignment value diff --git a/test/MC/AsmParser/directive_values.s b/test/MC/AsmParser/directive_values.s index 6c79c38..ed932b2 100644 --- a/test/MC/AsmParser/directive_values.s +++ b/test/MC/AsmParser/directive_values.s @@ -63,3 +63,9 @@ TEST7: # CHECK-NEXT: .byte 2 # CHECK-NEXT: .byte 3 # CHECK-NEXT: .byte 4 + +TEST8: + .long 0x200000UL+1 + .long 0x200000L+1 +# CHECK: .long 2097153 +# CHECK: .long 2097153 diff --git a/test/MC/AsmParser/section_names.s b/test/MC/AsmParser/section_names.s new file mode 100644 index 0000000..332cdbe --- /dev/null +++ b/test/MC/AsmParser/section_names.s @@ -0,0 +1,62 @@ +# RUN: llvm-mc -triple i386-pc-linux-gnu -filetype=obj -o %t %s +# RUN: elf-dump --dump-section-data < %t | 
FileCheck %s +.section .nobits +.byte 1 +.section .nobits2 +.byte 1 +.section .nobitsfoo +.byte 1 +.section .init_array +.byte 1 +.section .init_array2 +.byte 1 +.section .init_arrayfoo +.byte 1 +.section .fini_array +.byte 1 +.section .fini_array2 +.byte 1 +.section .fini_arrayfoo +.byte 1 +.section .preinit_array +.byte 1 +.section .preinit_array2 +.byte 1 +.section .preinit_arrayfoo +.byte 1 +.section .note +.byte 1 +.section .note2 +.byte 1 +.section .notefoo +.byte 1 +# CHECK: (('sh_name', 0x00000{{...}}) # '.nobits' +# CHECK-NEXT: ('sh_type', 0x00000001) +# CHECK: (('sh_name', 0x00000{{...}}) # '.nobits2' +# CHECK-NEXT: ('sh_type', 0x00000001) +# CHECK: (('sh_name', 0x00000{{...}}) # '.nobitsfoo' +# CHECK-NEXT: ('sh_type', 0x00000001) +# CHECK: (('sh_name', 0x00000{{...}}) # '.init_array' +# CHECK-NEXT: ('sh_type', 0x0000000e) +# CHECK: (('sh_name', 0x00000{{...}}) # '.init_array2' +# CHECK-NEXT: ('sh_type', 0x00000001) +# CHECK: (('sh_name', 0x00000{{...}}) # '.init_arrayfoo' +# CHECK-NEXT: ('sh_type', 0x00000001) +# CHECK: (('sh_name', 0x00000{{...}}) # '.fini_array' +# CHECK-NEXT: ('sh_type', 0x0000000f) +# CHECK: (('sh_name', 0x00000{{...}}) # '.fini_array2' +# CHECK-NEXT: ('sh_type', 0x00000001) +# CHECK: (('sh_name', 0x00000{{...}}) # '.fini_arrayfoo' +# CHECK-NEXT: ('sh_type', 0x00000001) +# CHECK: (('sh_name', 0x00000{{...}}) # '.preinit_array' +# CHECK-NEXT: ('sh_type', 0x00000010) +# CHECK: (('sh_name', 0x00000{{...}}) # '.preinit_array2' +# CHECK-NEXT: ('sh_type', 0x00000001) +# CHECK: (('sh_name', 0x00000{{...}}) # '.preinit_arrayfoo' +# CHECK-NEXT: ('sh_type', 0x00000001) +# CHECK: (('sh_name', 0x00000{{...}}) # '.note' +# CHECK-NEXT: ('sh_type', 0x00000007) +# CHECK: (('sh_name', 0x00000{{...}}) # '.note2' +# CHECK-NEXT: ('sh_type', 0x00000007) +# CHECK: (('sh_name', 0x00000{{...}}) # '.notefoo' +# CHECK-NEXT: ('sh_type', 0x00000007) diff --git a/test/MC/COFF/symbol-alias.s b/test/MC/COFF/symbol-alias.s index 03f07b2..4b1772c 100644 --- a/test/MC/COFF/symbol-alias.s +++ b/test/MC/COFF/symbol-alias.s @@ -23,8 +23,11 @@ _bar: .long 0 # 0x0 +# Order is important here. Assign _bar_alias_alias before _bar_alias.
.globl _foo_alias _foo_alias = _foo + .globl _bar_alias_alias +_bar_alias_alias = _bar_alias .globl _bar_alias _bar_alias = _bar @@ -52,6 +55,14 @@ _bar_alias = _bar // CHECK-NEXT: StorageClass = [[FOO_STORAGE_CLASS]] // CHECK-NEXT: NumberOfAuxSymbols = [[FOO_NUMBER_OF_AUX_SYMBOLS]] +// CHECK: Name = {{_?}}bar_alias_alias +// CHECK-NEXT: Value = [[BAR_VALUE]] +// CHECK-NEXT: SectionNumber = [[BAR_SECTION_NUMBER]] +// CHECK-NEXT: SimpleType = [[BAR_SIMPLE_TYPE]] +// CHECK-NEXT: ComplexType = [[BAR_COMPLEX_TYPE]] +// CHECK-NEXT: StorageClass = [[BAR_STORAGE_CLASS]] +// CHECK-NEXT: NumberOfAuxSymbols = [[BAR_NUMBER_OF_AUX_SYMBOLS]] + // CHECK: Name = {{_?}}bar_alias // CHECK-NEXT: Value = [[BAR_VALUE]] // CHECK-NEXT: SectionNumber = [[BAR_SECTION_NUMBER]] diff --git a/test/MC/Disassembler/AArch64/basic-a64-instructions.txt b/test/MC/Disassembler/AArch64/basic-a64-instructions.txt new file mode 100644 index 0000000..4fa2d50 --- /dev/null +++ b/test/MC/Disassembler/AArch64/basic-a64-instructions.txt @@ -0,0 +1,4200 @@ +# RUN: llvm-mc -triple=aarch64 -disassemble < %s | FileCheck %s + +#------------------------------------------------------------------------------ +# Add/sub (immediate) +#------------------------------------------------------------------------------ +# CHECK: add w4, w5, #0 +# CHECK: add w2, w3, #4095 +# CHECK: add w30, w29, #1, lsl #12 +# CHECK: add w13, w5, #4095, lsl #12 +# CHECK: add x5, x7, #1638 +0xa4 0x0 0x0 0x11 +0x62 0xfc 0x3f 0x11 +0xbe 0x7 0x40 0x11 +0xad 0xfc 0x7f 0x11 +0xe5 0x98 0x19 0x91 + +# CHECK: add w20, wsp, #801 +# CHECK: add wsp, wsp, #1104 +# CHECK: add wsp, w30, #4084 +0xf4 0x87 0xc 0x11 +0xff 0x43 0x11 0x11 +0xdf 0xd3 0x3f 0x11 + +# CHECK: add x0, x24, #291 +# CHECK: add x3, x24, #4095, lsl #12 +# CHECK: add x8, sp, #1074 +# CHECK: add sp, x29, #3816 +0x0 0x8f 0x4 0x91 +0x3 0xff 0x7f 0x91 +0xe8 0xcb 0x10 0x91 +0xbf 0xa3 0x3b 0x91 + +# CHECK: sub w0, wsp, #4077 +# CHECK: sub w4, w20, #546, lsl #12 +# CHECK: sub sp, sp, #288 +# CHECK: sub wsp, w19, #16 +0xe0 0xb7 0x3f 0x51 +0x84 0x8a 0x48 0x51 +0xff 0x83 0x4 0xd1 +0x7f 0x42 0x0 0x51 + + +# CHECK: adds w13, w23, #291, lsl #12 +# CHECK: cmn w2, #4095 +# CHECK: adds w20, wsp, #0 +# CHECK: cmn x3, #1, lsl #12 +0xed 0x8e 0x44 0x31 +0x5f 0xfc 0x3f 0x31 +0xf4 0x3 0x0 0x31 +0x7f 0x4 0x40 0xb1 + +# CHECK: cmp sp, #20, lsl #12 +# CHECK: cmp x30, #4095 +# CHECK: subs x4, sp, #3822 +0xff 0x53 0x40 0xf1 +0xdf 0xff 0x3f 0xf1 +0xe4 0xbb 0x3b 0xf1 + +# These should really be CMN +# CHECK: cmn w3, #291, lsl #12 +# CHECK: cmn wsp, #1365 +# CHECK: cmn sp, #1092, lsl #12 +0x7f 0x8c 0x44 0x31 +0xff 0x57 0x15 0x31 +0xff 0x13 0x51 0xb1 + +# CHECK: mov sp, x30 +# CHECK: mov wsp, w20 +# CHECK: mov x11, sp +# CHECK: mov w24, wsp +0xdf 0x3 0x0 0x91 +0x9f 0x2 0x0 0x11 +0xeb 0x3 0x0 0x91 +0xf8 0x3 0x0 0x11 + +#------------------------------------------------------------------------------ +# Add-subtract (shifted register) +#------------------------------------------------------------------------------ + +# CHECK: add w3, w5, w7 +# CHECK: add wzr, w3, w5 +# CHECK: add w20, wzr, w4 +# CHECK: add w4, w6, wzr +# CHECK: add w11, w13, w15 +# CHECK: add w9, w3, wzr, lsl #10 +# CHECK: add w17, w29, w20, lsl #31 +# CHECK: add w21, w22, w23, lsr #0 +# CHECK: add w24, w25, w26, lsr #18 +# CHECK: add w27, w28, w29, lsr #31 +# CHECK: add w2, w3, w4, asr #0 +# CHECK: add w5, w6, w7, asr #21 +# CHECK: add w8, w9, w10, asr #31 +0xa3 0x0 0x7 0xb +0x7f 0x0 0x5 0xb +0xf4 0x3 0x4 0xb +0xc4 0x0 0x1f 0xb +0xab 0x1 0xf 0xb +0x69 0x28 0x1f 0xb +0xb1 0x7f 0x14 
0xb +0xd5 0x2 0x57 0xb +0x38 0x4b 0x5a 0xb +0x9b 0x7f 0x5d 0xb +0x62 0x0 0x84 0xb +0xc5 0x54 0x87 0xb +0x28 0x7d 0x8a 0xb + +# CHECK: add x3, x5, x7 +# CHECK: add xzr, x3, x5 +# CHECK: add x20, xzr, x4 +# CHECK: add x4, x6, xzr +# CHECK: add x11, x13, x15 +# CHECK: add x9, x3, xzr, lsl #10 +# CHECK: add x17, x29, x20, lsl #63 +# CHECK: add x21, x22, x23, lsr #0 +# CHECK: add x24, x25, x26, lsr #18 +# CHECK: add x27, x28, x29, lsr #63 +# CHECK: add x2, x3, x4, asr #0 +# CHECK: add x5, x6, x7, asr #21 +# CHECK: add x8, x9, x10, asr #63 +0xa3 0x0 0x7 0x8b +0x7f 0x0 0x5 0x8b +0xf4 0x3 0x4 0x8b +0xc4 0x0 0x1f 0x8b +0xab 0x1 0xf 0x8b +0x69 0x28 0x1f 0x8b +0xb1 0xff 0x14 0x8b +0xd5 0x2 0x57 0x8b +0x38 0x4b 0x5a 0x8b +0x9b 0xff 0x5d 0x8b +0x62 0x0 0x84 0x8b +0xc5 0x54 0x87 0x8b +0x28 0xfd 0x8a 0x8b + +# CHECK: adds w3, w5, w7 +# CHECK: cmn w3, w5 +# CHECK: adds w20, wzr, w4 +# CHECK: adds w4, w6, wzr +# CHECK: adds w11, w13, w15 +# CHECK: adds w9, w3, wzr, lsl #10 +# CHECK: adds w17, w29, w20, lsl #31 +# CHECK: adds w21, w22, w23, lsr #0 +# CHECK: adds w24, w25, w26, lsr #18 +# CHECK: adds w27, w28, w29, lsr #31 +# CHECK: adds w2, w3, w4, asr #0 +# CHECK: adds w5, w6, w7, asr #21 +# CHECK: adds w8, w9, w10, asr #31 +0xa3 0x0 0x7 0x2b +0x7f 0x0 0x5 0x2b +0xf4 0x3 0x4 0x2b +0xc4 0x0 0x1f 0x2b +0xab 0x1 0xf 0x2b +0x69 0x28 0x1f 0x2b +0xb1 0x7f 0x14 0x2b +0xd5 0x2 0x57 0x2b +0x38 0x4b 0x5a 0x2b +0x9b 0x7f 0x5d 0x2b +0x62 0x0 0x84 0x2b +0xc5 0x54 0x87 0x2b +0x28 0x7d 0x8a 0x2b + +# CHECK: adds x3, x5, x7 +# CHECK: cmn x3, x5 +# CHECK: adds x20, xzr, x4 +# CHECK: adds x4, x6, xzr +# CHECK: adds x11, x13, x15 +# CHECK: adds x9, x3, xzr, lsl #10 +# CHECK: adds x17, x29, x20, lsl #63 +# CHECK: adds x21, x22, x23, lsr #0 +# CHECK: adds x24, x25, x26, lsr #18 +# CHECK: adds x27, x28, x29, lsr #63 +# CHECK: adds x2, x3, x4, asr #0 +# CHECK: adds x5, x6, x7, asr #21 +# CHECK: adds x8, x9, x10, asr #63 +0xa3 0x0 0x7 0xab +0x7f 0x0 0x5 0xab +0xf4 0x3 0x4 0xab +0xc4 0x0 0x1f 0xab +0xab 0x1 0xf 0xab +0x69 0x28 0x1f 0xab +0xb1 0xff 0x14 0xab +0xd5 0x2 0x57 0xab +0x38 0x4b 0x5a 0xab +0x9b 0xff 0x5d 0xab +0x62 0x0 0x84 0xab +0xc5 0x54 0x87 0xab +0x28 0xfd 0x8a 0xab + +# CHECK: sub w3, w5, w7 +# CHECK: sub wzr, w3, w5 +# CHECK: sub w20, wzr, w4 +# CHECK: sub w4, w6, wzr +# CHECK: sub w11, w13, w15 +# CHECK: sub w9, w3, wzr, lsl #10 +# CHECK: sub w17, w29, w20, lsl #31 +# CHECK: sub w21, w22, w23, lsr #0 +# CHECK: sub w24, w25, w26, lsr #18 +# CHECK: sub w27, w28, w29, lsr #31 +# CHECK: sub w2, w3, w4, asr #0 +# CHECK: sub w5, w6, w7, asr #21 +# CHECK: sub w8, w9, w10, asr #31 +0xa3 0x0 0x7 0x4b +0x7f 0x0 0x5 0x4b +0xf4 0x3 0x4 0x4b +0xc4 0x0 0x1f 0x4b +0xab 0x1 0xf 0x4b +0x69 0x28 0x1f 0x4b +0xb1 0x7f 0x14 0x4b +0xd5 0x2 0x57 0x4b +0x38 0x4b 0x5a 0x4b +0x9b 0x7f 0x5d 0x4b +0x62 0x0 0x84 0x4b +0xc5 0x54 0x87 0x4b +0x28 0x7d 0x8a 0x4b + +# CHECK: sub x3, x5, x7 +# CHECK: sub xzr, x3, x5 +# CHECK: sub x20, xzr, x4 +# CHECK: sub x4, x6, xzr +# CHECK: sub x11, x13, x15 +# CHECK: sub x9, x3, xzr, lsl #10 +# CHECK: sub x17, x29, x20, lsl #63 +# CHECK: sub x21, x22, x23, lsr #0 +# CHECK: sub x24, x25, x26, lsr #18 +# CHECK: sub x27, x28, x29, lsr #63 +# CHECK: sub x2, x3, x4, asr #0 +# CHECK: sub x5, x6, x7, asr #21 +# CHECK: sub x8, x9, x10, asr #63 +0xa3 0x0 0x7 0xcb +0x7f 0x0 0x5 0xcb +0xf4 0x3 0x4 0xcb +0xc4 0x0 0x1f 0xcb +0xab 0x1 0xf 0xcb +0x69 0x28 0x1f 0xcb +0xb1 0xff 0x14 0xcb +0xd5 0x2 0x57 0xcb +0x38 0x4b 0x5a 0xcb +0x9b 0xff 0x5d 0xcb +0x62 0x0 0x84 0xcb +0xc5 0x54 0x87 0xcb +0x28 0xfd 0x8a 0xcb + +# CHECK: subs 
w3, w5, w7 +# CHECK: cmp w3, w5 +# CHECK: subs w20, wzr, w4 +# CHECK: subs w4, w6, wzr +# CHECK: subs w11, w13, w15 +# CHECK: subs w9, w3, wzr, lsl #10 +# CHECK: subs w17, w29, w20, lsl #31 +# CHECK: subs w21, w22, w23, lsr #0 +# CHECK: subs w24, w25, w26, lsr #18 +# CHECK: subs w27, w28, w29, lsr #31 +# CHECK: subs w2, w3, w4, asr #0 +# CHECK: subs w5, w6, w7, asr #21 +# CHECK: subs w8, w9, w10, asr #31 +0xa3 0x0 0x7 0x6b +0x7f 0x0 0x5 0x6b +0xf4 0x3 0x4 0x6b +0xc4 0x0 0x1f 0x6b +0xab 0x1 0xf 0x6b +0x69 0x28 0x1f 0x6b +0xb1 0x7f 0x14 0x6b +0xd5 0x2 0x57 0x6b +0x38 0x4b 0x5a 0x6b +0x9b 0x7f 0x5d 0x6b +0x62 0x0 0x84 0x6b +0xc5 0x54 0x87 0x6b +0x28 0x7d 0x8a 0x6b + +# CHECK: subs x3, x5, x7 +# CHECK: cmp x3, x5 +# CHECK: subs x20, xzr, x4 +# CHECK: subs x4, x6, xzr +# CHECK: subs x11, x13, x15 +# CHECK: subs x9, x3, xzr, lsl #10 +# CHECK: subs x17, x29, x20, lsl #63 +# CHECK: subs x21, x22, x23, lsr #0 +# CHECK: subs x24, x25, x26, lsr #18 +# CHECK: subs x27, x28, x29, lsr #63 +# CHECK: subs x2, x3, x4, asr #0 +# CHECK: subs x5, x6, x7, asr #21 +# CHECK: subs x8, x9, x10, asr #63 +0xa3 0x0 0x7 0xeb +0x7f 0x0 0x5 0xeb +0xf4 0x3 0x4 0xeb +0xc4 0x0 0x1f 0xeb +0xab 0x1 0xf 0xeb +0x69 0x28 0x1f 0xeb +0xb1 0xff 0x14 0xeb +0xd5 0x2 0x57 0xeb +0x38 0x4b 0x5a 0xeb +0x9b 0xff 0x5d 0xeb +0x62 0x0 0x84 0xeb +0xc5 0x54 0x87 0xeb +0x28 0xfd 0x8a 0xeb + +# CHECK: cmn w0, w3 +# CHECK: cmn wzr, w4 +# CHECK: cmn w5, wzr +# CHECK: cmn w6, w7 +# CHECK: cmn w8, w9, lsl #15 +# CHECK: cmn w10, w11, lsl #31 +# CHECK: cmn w12, w13, lsr #0 +# CHECK: cmn w14, w15, lsr #21 +# CHECK: cmn w16, w17, lsr #31 +# CHECK: cmn w18, w19, asr #0 +# CHECK: cmn w20, w21, asr #22 +# CHECK: cmn w22, w23, asr #31 +0x1f 0x0 0x3 0x2b +0xff 0x3 0x4 0x2b +0xbf 0x0 0x1f 0x2b +0xdf 0x0 0x7 0x2b +0x1f 0x3d 0x9 0x2b +0x5f 0x7d 0xb 0x2b +0x9f 0x1 0x4d 0x2b +0xdf 0x55 0x4f 0x2b +0x1f 0x7e 0x51 0x2b +0x5f 0x2 0x93 0x2b +0x9f 0x5a 0x95 0x2b +0xdf 0x7e 0x97 0x2b + +# CHECK: cmn x0, x3 +# CHECK: cmn xzr, x4 +# CHECK: cmn x5, xzr +# CHECK: cmn x6, x7 +# CHECK: cmn x8, x9, lsl #15 +# CHECK: cmn x10, x11, lsl #63 +# CHECK: cmn x12, x13, lsr #0 +# CHECK: cmn x14, x15, lsr #41 +# CHECK: cmn x16, x17, lsr #63 +# CHECK: cmn x18, x19, asr #0 +# CHECK: cmn x20, x21, asr #55 +# CHECK: cmn x22, x23, asr #63 +0x1f 0x0 0x3 0xab +0xff 0x3 0x4 0xab +0xbf 0x0 0x1f 0xab +0xdf 0x0 0x7 0xab +0x1f 0x3d 0x9 0xab +0x5f 0xfd 0xb 0xab +0x9f 0x1 0x4d 0xab +0xdf 0xa5 0x4f 0xab +0x1f 0xfe 0x51 0xab +0x5f 0x2 0x93 0xab +0x9f 0xde 0x95 0xab +0xdf 0xfe 0x97 0xab + +# CHECK: cmp w0, w3 +# CHECK: cmp wzr, w4 +# CHECK: cmp w5, wzr +# CHECK: cmp w6, w7 +# CHECK: cmp w8, w9, lsl #15 +# CHECK: cmp w10, w11, lsl #31 +# CHECK: cmp w12, w13, lsr #0 +# CHECK: cmp w14, w15, lsr #21 +# CHECK: cmp w16, w17, lsr #31 +# CHECK: cmp w18, w19, asr #0 +# CHECK: cmp w20, w21, asr #22 +# CHECK: cmp w22, w23, asr #31 +0x1f 0x0 0x3 0x6b +0xff 0x3 0x4 0x6b +0xbf 0x0 0x1f 0x6b +0xdf 0x0 0x7 0x6b +0x1f 0x3d 0x9 0x6b +0x5f 0x7d 0xb 0x6b +0x9f 0x1 0x4d 0x6b +0xdf 0x55 0x4f 0x6b +0x1f 0x7e 0x51 0x6b +0x5f 0x2 0x93 0x6b +0x9f 0x5a 0x95 0x6b +0xdf 0x7e 0x97 0x6b + +# CHECK: cmp x0, x3 +# CHECK: cmp xzr, x4 +# CHECK: cmp x5, xzr +# CHECK: cmp x6, x7 +# CHECK: cmp x8, x9, lsl #15 +# CHECK: cmp x10, x11, lsl #63 +# CHECK: cmp x12, x13, lsr #0 +# CHECK: cmp x14, x15, lsr #41 +# CHECK: cmp x16, x17, lsr #63 +# CHECK: cmp x18, x19, asr #0 +# CHECK: cmp x20, x21, asr #55 +# CHECK: cmp x22, x23, asr #63 +0x1f 0x0 0x3 0xeb +0xff 0x3 0x4 0xeb +0xbf 0x0 0x1f 0xeb +0xdf 0x0 0x7 0xeb +0x1f 0x3d 0x9 0xeb +0x5f 0xfd 0xb 
0xeb +0x9f 0x1 0x4d 0xeb +0xdf 0xa5 0x4f 0xeb +0x1f 0xfe 0x51 0xeb +0x5f 0x2 0x93 0xeb +0x9f 0xde 0x95 0xeb +0xdf 0xfe 0x97 0xeb + +# CHECK: sub w29, wzr, w30 +# CHECK: sub w30, wzr, wzr +# CHECK: sub wzr, wzr, w0 +# CHECK: sub w28, wzr, w27 +# CHECK: sub w26, wzr, w25, lsl #29 +# CHECK: sub w24, wzr, w23, lsl #31 +# CHECK: sub w22, wzr, w21, lsr #0 +# CHECK: sub w20, wzr, w19, lsr #1 +# CHECK: sub w18, wzr, w17, lsr #31 +# CHECK: sub w16, wzr, w15, asr #0 +# CHECK: sub w14, wzr, w13, asr #12 +# CHECK: sub w12, wzr, w11, asr #31 +0xfd 0x3 0x1e 0x4b +0xfe 0x3 0x1f 0x4b +0xff 0x3 0x0 0x4b +0xfc 0x3 0x1b 0x4b +0xfa 0x77 0x19 0x4b +0xf8 0x7f 0x17 0x4b +0xf6 0x3 0x55 0x4b +0xf4 0x7 0x53 0x4b +0xf2 0x7f 0x51 0x4b +0xf0 0x3 0x8f 0x4b +0xee 0x33 0x8d 0x4b +0xec 0x7f 0x8b 0x4b + +# CHECK: sub x29, xzr, x30 +# CHECK: sub x30, xzr, xzr +# CHECK: sub xzr, xzr, x0 +# CHECK: sub x28, xzr, x27 +# CHECK: sub x26, xzr, x25, lsl #29 +# CHECK: sub x24, xzr, x23, lsl #31 +# CHECK: sub x22, xzr, x21, lsr #0 +# CHECK: sub x20, xzr, x19, lsr #1 +# CHECK: sub x18, xzr, x17, lsr #31 +# CHECK: sub x16, xzr, x15, asr #0 +# CHECK: sub x14, xzr, x13, asr #12 +# CHECK: sub x12, xzr, x11, asr #31 +0xfd 0x3 0x1e 0xcb +0xfe 0x3 0x1f 0xcb +0xff 0x3 0x0 0xcb +0xfc 0x3 0x1b 0xcb +0xfa 0x77 0x19 0xcb +0xf8 0x7f 0x17 0xcb +0xf6 0x3 0x55 0xcb +0xf4 0x7 0x53 0xcb +0xf2 0x7f 0x51 0xcb +0xf0 0x3 0x8f 0xcb +0xee 0x33 0x8d 0xcb +0xec 0x7f 0x8b 0xcb + +# CHECK: subs w29, wzr, w30 +# CHECK: subs w30, wzr, wzr +# CHECK: cmp wzr, w0 +# CHECK: subs w28, wzr, w27 +# CHECK: subs w26, wzr, w25, lsl #29 +# CHECK: subs w24, wzr, w23, lsl #31 +# CHECK: subs w22, wzr, w21, lsr #0 +# CHECK: subs w20, wzr, w19, lsr #1 +# CHECK: subs w18, wzr, w17, lsr #31 +# CHECK: subs w16, wzr, w15, asr #0 +# CHECK: subs w14, wzr, w13, asr #12 +# CHECK: subs w12, wzr, w11, asr #31 +0xfd 0x3 0x1e 0x6b +0xfe 0x3 0x1f 0x6b +0xff 0x3 0x0 0x6b +0xfc 0x3 0x1b 0x6b +0xfa 0x77 0x19 0x6b +0xf8 0x7f 0x17 0x6b +0xf6 0x3 0x55 0x6b +0xf4 0x7 0x53 0x6b +0xf2 0x7f 0x51 0x6b +0xf0 0x3 0x8f 0x6b +0xee 0x33 0x8d 0x6b +0xec 0x7f 0x8b 0x6b + +# CHECK: subs x29, xzr, x30 +# CHECK: subs x30, xzr, xzr +# CHECK: cmp xzr, x0 +# CHECK: subs x28, xzr, x27 +# CHECK: subs x26, xzr, x25, lsl #29 +# CHECK: subs x24, xzr, x23, lsl #31 +# CHECK: subs x22, xzr, x21, lsr #0 +# CHECK: subs x20, xzr, x19, lsr #1 +# CHECK: subs x18, xzr, x17, lsr #31 +# CHECK: subs x16, xzr, x15, asr #0 +# CHECK: subs x14, xzr, x13, asr #12 +# CHECK: subs x12, xzr, x11, asr #31 +0xfd 0x3 0x1e 0xeb +0xfe 0x3 0x1f 0xeb +0xff 0x3 0x0 0xeb +0xfc 0x3 0x1b 0xeb +0xfa 0x77 0x19 0xeb +0xf8 0x7f 0x17 0xeb +0xf6 0x3 0x55 0xeb +0xf4 0x7 0x53 0xeb +0xf2 0x7f 0x51 0xeb +0xf0 0x3 0x8f 0xeb +0xee 0x33 0x8d 0xeb +0xec 0x7f 0x8b 0xeb + +#------------------------------------------------------------------------------ +# Add-subtract (with carry) +#------------------------------------------------------------------------------ + +# CHECK: adc w29, w27, w25 +# CHECK: adc wzr, w3, w4 +# CHECK: adc w9, wzr, w10 +# CHECK: adc w20, w0, wzr +0x7d 0x3 0x19 0x1a +0x7f 0x0 0x4 0x1a +0xe9 0x3 0xa 0x1a +0x14 0x0 0x1f 0x1a + +# CHECK: adc x29, x27, x25 +# CHECK: adc xzr, x3, x4 +# CHECK: adc x9, xzr, x10 +# CHECK: adc x20, x0, xzr +0x7d 0x3 0x19 0x9a +0x7f 0x0 0x4 0x9a +0xe9 0x3 0xa 0x9a +0x14 0x0 0x1f 0x9a + +# CHECK: adcs w29, w27, w25 +# CHECK: adcs wzr, w3, w4 +# CHECK: adcs w9, wzr, w10 +# CHECK: adcs w20, w0, wzr +0x7d 0x3 0x19 0x3a +0x7f 0x0 0x4 0x3a +0xe9 0x3 0xa 0x3a +0x14 0x0 0x1f 0x3a + +# CHECK: adcs x29, x27, x25 +# CHECK: adcs
xzr, x3, x4 +# CHECK: adcs x9, xzr, x10 +# CHECK: adcs x20, x0, xzr +0x7d 0x3 0x19 0xba +0x7f 0x0 0x4 0xba +0xe9 0x3 0xa 0xba +0x14 0x0 0x1f 0xba + +# CHECK: sbc w29, w27, w25 +# CHECK: sbc wzr, w3, w4 +# CHECK: ngc w9, w10 +# CHECK: sbc w20, w0, wzr +0x7d 0x3 0x19 0x5a +0x7f 0x0 0x4 0x5a +0xe9 0x3 0xa 0x5a +0x14 0x0 0x1f 0x5a + +# CHECK: sbc x29, x27, x25 +# CHECK: sbc xzr, x3, x4 +# CHECK: ngc x9, x10 +# CHECK: sbc x20, x0, xzr +0x7d 0x3 0x19 0xda +0x7f 0x0 0x4 0xda +0xe9 0x3 0xa 0xda +0x14 0x0 0x1f 0xda + +# CHECK: sbcs w29, w27, w25 +# CHECK: sbcs wzr, w3, w4 +# CHECK: ngcs w9, w10 +# CHECK: sbcs w20, w0, wzr +0x7d 0x3 0x19 0x7a +0x7f 0x0 0x4 0x7a +0xe9 0x3 0xa 0x7a +0x14 0x0 0x1f 0x7a + +# CHECK: sbcs x29, x27, x25 +# CHECK: sbcs xzr, x3, x4 +# CHECK: ngcs x9, x10 +# CHECK: sbcs x20, x0, xzr +0x7d 0x3 0x19 0xfa +0x7f 0x0 0x4 0xfa +0xe9 0x3 0xa 0xfa +0x14 0x0 0x1f 0xfa + +# CHECK: ngc w3, w12 +# CHECK: ngc wzr, w9 +# CHECK: ngc w23, wzr +0xe3 0x3 0xc 0x5a +0xff 0x3 0x9 0x5a +0xf7 0x3 0x1f 0x5a + +# CHECK: ngc x29, x30 +# CHECK: ngc xzr, x0 +# CHECK: ngc x0, xzr +0xfd 0x3 0x1e 0xda +0xff 0x3 0x0 0xda +0xe0 0x3 0x1f 0xda + +# CHECK: ngcs w3, w12 +# CHECK: ngcs wzr, w9 +# CHECK: ngcs w23, wzr +0xe3 0x3 0xc 0x7a +0xff 0x3 0x9 0x7a +0xf7 0x3 0x1f 0x7a + +# CHECK: ngcs x29, x30 +# CHECK: ngcs xzr, x0 +# CHECK: ngcs x0, xzr +0xfd 0x3 0x1e 0xfa +0xff 0x3 0x0 0xfa +0xe0 0x3 0x1f 0xfa + +#------------------------------------------------------------------------------ +# Bitfield +#------------------------------------------------------------------------------ + +# CHECK: sbfx x1, x2, #3, #2 +# CHECK: asr x3, x4, #63 +# CHECK: asr wzr, wzr, #31 +# CHECK: sbfx w12, w9, #0, #1 +0x41 0x10 0x43 0x93 +0x83 0xfc 0x7f 0x93 +0xff 0x7f 0x1f 0x13 +0x2c 0x1 0x0 0x13 + +# CHECK: ubfiz x4, x5, #52, #11 +# CHECK: ubfx xzr, x4, #0, #1 +# CHECK: ubfiz x4, xzr, #1, #6 +# CHECK: lsr x5, x6, #12 +0xa4 0x28 0x4c 0xd3 +0x9f 0x0 0x40 0xd3 +0xe4 0x17 0x7f 0xd3 +0xc5 0xfc 0x4c 0xd3 + +# CHECK: bfi x4, x5, #52, #11 +# CHECK: bfxil xzr, x4, #0, #1 +# CHECK: bfi x4, xzr, #1, #6 +# CHECK: bfxil x5, x6, #12, #52 +0xa4 0x28 0x4c 0xb3 +0x9f 0x0 0x40 0xb3 +0xe4 0x17 0x7f 0xb3 +0xc5 0xfc 0x4c 0xb3 + +# CHECK: sxtb w1, w2 +# CHECK: sxtb xzr, w3 +# CHECK: sxth w9, w10 +# CHECK: sxth x0, w1 +# CHECK: sxtw x3, w30 +0x41 0x1c 0x0 0x13 +0x7f 0x1c 0x40 0x93 +0x49 0x3d 0x0 0x13 +0x20 0x3c 0x40 0x93 +0xc3 0x7f 0x40 0x93 + +# CHECK: uxtb w1, w2 +# CHECK: uxth w9, w10 +# CHECK: ubfx x3, x30, #0, #32 +0x41 0x1c 0x0 0x53 +0x49 0x3d 0x0 0x53 +0xc3 0x7f 0x40 0xd3 + +# CHECK: asr w3, w2, #0 +# CHECK: asr w9, w10, #31 +# CHECK: asr x20, x21, #63 +# CHECK: asr w1, wzr, #3 +0x43 0x7c 0x0 0x13 +0x49 0x7d 0x1f 0x13 +0xb4 0xfe 0x7f 0x93 +0xe1 0x7f 0x3 0x13 + +# CHECK: lsr w3, w2, #0 +# CHECK: lsr w9, w10, #31 +# CHECK: lsr x20, x21, #63 +# CHECK: lsr wzr, wzr, #3 +0x43 0x7c 0x0 0x53 +0x49 0x7d 0x1f 0x53 +0xb4 0xfe 0x7f 0xd3 +0xff 0x7f 0x3 0x53 + +# CHECK: lsr w3, w2, #0 +# CHECK: lsl w9, w10, #31 +# CHECK: lsl x20, x21, #63 +# CHECK: lsl w1, wzr, #3 +0x43 0x7c 0x0 0x53 +0x49 0x1 0x1 0x53 +0xb4 0x2 0x41 0xd3 +0xe1 0x73 0x1d 0x53 + +# CHECK: sbfx w9, w10, #0, #1 +# CHECK: sbfiz x2, x3, #63, #1 +# CHECK: asr x19, x20, #0 +# CHECK: sbfiz x9, x10, #5, #59 +# CHECK: asr w9, w10, #0 +# CHECK: sbfiz w11, w12, #31, #1 +# CHECK: sbfiz w13, w14, #29, #3 +# CHECK: sbfiz xzr, xzr, #10, #11 +0x49 0x1 0x0 0x13 +0x62 0x0 0x41 0x93 +0x93 0xfe 0x40 0x93 +0x49 0xe9 0x7b 0x93 +0x49 0x7d 0x0 0x13 +0x8b 0x1 0x1 0x13 +0xcd 0x9 0x3 0x13 +0xff 0x2b 0x76
0x93 + +# CHECK: sbfx w9, w10, #0, #1 +# CHECK: asr x2, x3, #63 +# CHECK: asr x19, x20, #0 +# CHECK: asr x9, x10, #5 +# CHECK: asr w9, w10, #0 +# CHECK: asr w11, w12, #31 +# CHECK: asr w13, w14, #29 +# CHECK: sbfx xzr, xzr, #10, #11 +0x49 0x1 0x0 0x13 +0x62 0xfc 0x7f 0x93 +0x93 0xfe 0x40 0x93 +0x49 0xfd 0x45 0x93 +0x49 0x7d 0x0 0x13 +0x8b 0x7d 0x1f 0x13 +0xcd 0x7d 0x1d 0x13 +0xff 0x53 0x4a 0x93 + +# CHECK: bfxil w9, w10, #0, #1 +# CHECK: bfi x2, x3, #63, #1 +# CHECK: bfxil x19, x20, #0, #64 +# CHECK: bfi x9, x10, #5, #59 +# CHECK: bfxil w9, w10, #0, #32 +# CHECK: bfi w11, w12, #31, #1 +# CHECK: bfi w13, w14, #29, #3 +# CHECK: bfi xzr, xzr, #10, #11 +0x49 0x1 0x0 0x33 +0x62 0x0 0x41 0xb3 +0x93 0xfe 0x40 0xb3 +0x49 0xe9 0x7b 0xb3 +0x49 0x7d 0x0 0x33 +0x8b 0x1 0x1 0x33 +0xcd 0x9 0x3 0x33 +0xff 0x2b 0x76 0xb3 + +# CHECK: bfxil w9, w10, #0, #1 +# CHECK: bfxil x2, x3, #63, #1 +# CHECK: bfxil x19, x20, #0, #64 +# CHECK: bfxil x9, x10, #5, #59 +# CHECK: bfxil w9, w10, #0, #32 +# CHECK: bfxil w11, w12, #31, #1 +# CHECK: bfxil w13, w14, #29, #3 +# CHECK: bfxil xzr, xzr, #10, #11 +0x49 0x1 0x0 0x33 +0x62 0xfc 0x7f 0xb3 +0x93 0xfe 0x40 0xb3 +0x49 0xfd 0x45 0xb3 +0x49 0x7d 0x0 0x33 +0x8b 0x7d 0x1f 0x33 +0xcd 0x7d 0x1d 0x33 +0xff 0x53 0x4a 0xb3 + +# CHECK: ubfx w9, w10, #0, #1 +# CHECK: lsl x2, x3, #63 +# CHECK: lsr x19, x20, #0 +# CHECK: lsl x9, x10, #5 +# CHECK: lsr w9, w10, #0 +# CHECK: lsl w11, w12, #31 +# CHECK: lsl w13, w14, #29 +# CHECK: ubfiz xzr, xzr, #10, #11 +0x49 0x1 0x0 0x53 +0x62 0x0 0x41 0xd3 +0x93 0xfe 0x40 0xd3 +0x49 0xe9 0x7b 0xd3 +0x49 0x7d 0x0 0x53 +0x8b 0x1 0x1 0x53 +0xcd 0x9 0x3 0x53 +0xff 0x2b 0x76 0xd3 + +# CHECK: ubfx w9, w10, #0, #1 +# CHECK: lsr x2, x3, #63 +# CHECK: lsr x19, x20, #0 +# CHECK: lsr x9, x10, #5 +# CHECK: lsr w9, w10, #0 +# CHECK: lsr w11, w12, #31 +# CHECK: lsr w13, w14, #29 +# CHECK: ubfx xzr, xzr, #10, #11 +0x49 0x1 0x0 0x53 +0x62 0xfc 0x7f 0xd3 +0x93 0xfe 0x40 0xd3 +0x49 0xfd 0x45 0xd3 +0x49 0x7d 0x0 0x53 +0x8b 0x7d 0x1f 0x53 +0xcd 0x7d 0x1d 0x53 +0xff 0x53 0x4a 0xd3 + + +#------------------------------------------------------------------------------ +# Compare and branch (immediate) +#------------------------------------------------------------------------------ + +# CHECK: cbz w5, #4 +# CHECK: cbz x5, #0 +# CHECK: cbnz x2, #-4 +# CHECK: cbnz x26, #1048572 +0x25 0x0 0x0 0x34 +0x05 0x0 0x0 0xb4 +0xe2 0xff 0xff 0xb5 +0xfa 0xff 0x7f 0xb5 + +# CHECK: cbz wzr, #0 +# CHECK: cbnz xzr, #0 +0x1f 0x0 0x0 0x34 +0x1f 0x0 0x0 0xb5 + +#------------------------------------------------------------------------------ +# Conditional branch (immediate) +#------------------------------------------------------------------------------ + +# CHECK: b.ne #4 +# CHECK: b.ge #1048572 +# CHECK: b.ge #-4 +0x21 0x00 0x00 0x54 +0xea 0xff 0x7f 0x54 +0xea 0xff 0xff 0x54 + +#------------------------------------------------------------------------------ +# Conditional compare (immediate) +#------------------------------------------------------------------------------ + +# CHECK: ccmp w1, #31, #0, eq +# CHECK: ccmp w3, #0, #15, hs +# CHECK: ccmp wzr, #15, #13, hs +0x20 0x08 0x5f 0x7a +0x6f 0x28 0x40 0x7a +0xed 0x2b 0x4f 0x7a + +# CHECK: ccmp x9, #31, #0, le +# CHECK: ccmp x3, #0, #15, gt +# CHECK: ccmp xzr, #5, #7, ne +0x20 0xd9 0x5f 0xfa +0x6f 0xc8 0x40 0xfa +0xe7 0x1b 0x45 0xfa + +# CHECK: ccmn w1, #31, #0, eq +# CHECK: ccmn w3, #0, #15, hs +# CHECK: ccmn wzr, #15, #13, hs +0x20 0x08 0x5f 0x3a +0x6f 0x28 0x40 0x3a +0xed 0x2b 0x4f 0x3a + +# CHECK: ccmn x9, #31, #0, le +# CHECK: ccmn x3, #0, 
#15, gt +# CHECK: ccmn xzr, #5, #7, ne +0x20 0xd9 0x5f 0xba +0x6f 0xc8 0x40 0xba +0xe7 0x1b 0x45 0xba + +#------------------------------------------------------------------------------ +# Conditional compare (register) +#------------------------------------------------------------------------------ + +# CHECK: ccmp w1, wzr, #0, eq +# CHECK: ccmp w3, w0, #15, hs +# CHECK: ccmp wzr, w15, #13, hs +0x20 0x00 0x5f 0x7a +0x6f 0x20 0x40 0x7a +0xed 0x23 0x4f 0x7a + +# CHECK: ccmp x9, xzr, #0, le +# CHECK: ccmp x3, x0, #15, gt +# CHECK: ccmp xzr, x5, #7, ne +0x20 0xd1 0x5f 0xfa +0x6f 0xc0 0x40 0xfa +0xe7 0x13 0x45 0xfa + +# CHECK: ccmn w1, wzr, #0, eq +# CHECK: ccmn w3, w0, #15, hs +# CHECK: ccmn wzr, w15, #13, hs +0x20 0x00 0x5f 0x3a +0x6f 0x20 0x40 0x3a +0xed 0x23 0x4f 0x3a + +# CHECK: ccmn x9, xzr, #0, le +# CHECK: ccmn x3, x0, #15, gt +# CHECK: ccmn xzr, x5, #7, ne +0x20 0xd1 0x5f 0xba +0x6f 0xc0 0x40 0xba +0xe7 0x13 0x45 0xba + +#------------------------------------------------------------------------------ +# Conditional select +#------------------------------------------------------------------------------ +# CHECK: csel w1, w0, w19, ne +# CHECK: csel wzr, w5, w9, eq +# CHECK: csel w9, wzr, w30, gt +# CHECK: csel w1, w28, wzr, mi +# CHECK: csel x19, x23, x29, lt +# CHECK: csel xzr, x3, x4, ge +# CHECK: csel x5, xzr, x6, hs +# CHECK: csel x7, x8, xzr, lo +0x1 0x10 0x93 0x1a +0xbf 0x0 0x89 0x1a +0xe9 0xc3 0x9e 0x1a +0x81 0x43 0x9f 0x1a +0xf3 0xb2 0x9d 0x9a +0x7f 0xa0 0x84 0x9a +0xe5 0x23 0x86 0x9a +0x7 0x31 0x9f 0x9a + +# CHECK: csinc w1, w0, w19, ne +# CHECK: csinc wzr, w5, w9, eq +# CHECK: csinc w9, wzr, w30, gt +# CHECK: csinc w1, w28, wzr, mi +# CHECK: csinc x19, x23, x29, lt +# CHECK: csinc xzr, x3, x4, ge +# CHECK: csinc x5, xzr, x6, hs +# CHECK: csinc x7, x8, xzr, lo +0x1 0x14 0x93 0x1a +0xbf 0x4 0x89 0x1a +0xe9 0xc7 0x9e 0x1a +0x81 0x47 0x9f 0x1a +0xf3 0xb6 0x9d 0x9a +0x7f 0xa4 0x84 0x9a +0xe5 0x27 0x86 0x9a +0x7 0x35 0x9f 0x9a + +# CHECK: csinv w1, w0, w19, ne +# CHECK: csinv wzr, w5, w9, eq +# CHECK: csinv w9, wzr, w30, gt +# CHECK: csinv w1, w28, wzr, mi +# CHECK: csinv x19, x23, x29, lt +# CHECK: csinv xzr, x3, x4, ge +# CHECK: csinv x5, xzr, x6, hs +# CHECK: csinv x7, x8, xzr, lo +0x1 0x10 0x93 0x5a +0xbf 0x0 0x89 0x5a +0xe9 0xc3 0x9e 0x5a +0x81 0x43 0x9f 0x5a +0xf3 0xb2 0x9d 0xda +0x7f 0xa0 0x84 0xda +0xe5 0x23 0x86 0xda +0x7 0x31 0x9f 0xda + +# CHECK: csneg w1, w0, w19, ne +# CHECK: csneg wzr, w5, w9, eq +# CHECK: csneg w9, wzr, w30, gt +# CHECK: csneg w1, w28, wzr, mi +# CHECK: csneg x19, x23, x29, lt +# CHECK: csneg xzr, x3, x4, ge +# CHECK: csneg x5, xzr, x6, hs +# CHECK: csneg x7, x8, xzr, lo +0x1 0x14 0x93 0x5a +0xbf 0x4 0x89 0x5a +0xe9 0xc7 0x9e 0x5a +0x81 0x47 0x9f 0x5a +0xf3 0xb6 0x9d 0xda +0x7f 0xa4 0x84 0xda +0xe5 0x27 0x86 0xda +0x7 0x35 0x9f 0xda + +# CHECK: csinc w3, wzr, wzr, ne +# CHECK: csinc x9, xzr, xzr, mi +# CHECK: csinv w20, wzr, wzr, eq +# CHECK: csinv x30, xzr, xzr, lt +0xe3 0x17 0x9f 0x1a +0xe9 0x47 0x9f 0x9a +0xf4 0x3 0x9f 0x5a +0xfe 0xb3 0x9f 0xda + +# CHECK: csinc w3, w5, w5, le +# CHECK: csinc wzr, w4, w4, gt +# CHECK: csinc w9, wzr, wzr, ge +# CHECK: csinc x3, x5, x5, le +# CHECK: csinc xzr, x4, x4, gt +# CHECK: csinc x9, xzr, xzr, ge +0xa3 0xd4 0x85 0x1a +0x9f 0xc4 0x84 0x1a +0xe9 0xa7 0x9f 0x1a +0xa3 0xd4 0x85 0x9a +0x9f 0xc4 0x84 0x9a +0xe9 0xa7 0x9f 0x9a + +# CHECK: csinv w3, w5, w5, le +# CHECK: csinv wzr, w4, w4, gt +# CHECK: csinv w9, wzr, wzr, ge +# CHECK: csinv x3, x5, x5, le +# CHECK: csinv xzr, x4, x4, gt +# CHECK: csinv x9,
xzr, xzr, ge +0xa3 0xd0 0x85 0x5a +0x9f 0xc0 0x84 0x5a +0xe9 0xa3 0x9f 0x5a +0xa3 0xd0 0x85 0xda +0x9f 0xc0 0x84 0xda +0xe9 0xa3 0x9f 0xda + +# CHECK: csneg w3, w5, w5, le +# CHECK: csneg wzr, w4, w4, gt +# CHECK: csneg w9, wzr, wzr, ge +# CHECK: csneg x3, x5, x5, le +# CHECK: csneg xzr, x4, x4, gt +# CHECK: csneg x9, xzr, xzr, ge +0xa3 0xd4 0x85 0x5a +0x9f 0xc4 0x84 0x5a +0xe9 0xa7 0x9f 0x5a +0xa3 0xd4 0x85 0xda +0x9f 0xc4 0x84 0xda +0xe9 0xa7 0x9f 0xda + +#------------------------------------------------------------------------------ +# Data-processing (1 source) +#------------------------------------------------------------------------------ + +# CHECK: rbit w0, w7 +# CHECK: rbit x18, x3 +# CHECK: rev16 w17, w1 +# CHECK: rev16 x5, x2 +# CHECK: rev w18, w0 +# CHECK: rev32 x20, x1 +0xe0 0x00 0xc0 0x5a +0x72 0x00 0xc0 0xda +0x31 0x04 0xc0 0x5a +0x45 0x04 0xc0 0xda +0x12 0x08 0xc0 0x5a +0x34 0x08 0xc0 0xda + +# CHECK: rev x22, x2 +# CHECK: clz w24, w3 +# CHECK: clz x26, x4 +# CHECK: cls w3, w5 +# CHECK: cls x20, x5 +0x56 0x0c 0xc0 0xda +0x78 0x10 0xc0 0x5a +0x9a 0x10 0xc0 0xda +0xa3 0x14 0xc0 0x5a +0xb4 0x14 0xc0 0xda + +#------------------------------------------------------------------------------ +# Data-processing (2 source) +#------------------------------------------------------------------------------ + +# CHECK: crc32b w5, w7, w20 +# CHECK: crc32h w28, wzr, w30 +# CHECK: crc32w w0, w1, w2 +# CHECK: crc32x w7, w9, x20 +# CHECK: crc32cb w9, w5, w4 +# CHECK: crc32ch w13, w17, w25 +# CHECK: crc32cw wzr, w3, w5 +# CHECK: crc32cx w18, w16, xzr +0xe5 0x40 0xd4 0x1a +0xfc 0x47 0xde 0x1a +0x20 0x48 0xc2 0x1a +0x27 0x4d 0xd4 0x9a +0xa9 0x50 0xc4 0x1a +0x2d 0x56 0xd9 0x1a +0x7f 0x58 0xc5 0x1a +0x12 0x5e 0xdf 0x9a + +# CHECK: udiv w0, w7, w10 +# CHECK: udiv x9, x22, x4 +# CHECK: sdiv w12, w21, w0 +# CHECK: sdiv x13, x2, x1 +# CHECK: lsl w11, w12, w13 +# CHECK: lsl x14, x15, x16 +# CHECK: lsr w17, w18, w19 +# CHECK: lsr x20, x21, x22 +# CHECK: asr w23, w24, w25 +# CHECK: asr x26, x27, x28 +# CHECK: ror w0, w1, w2 +# CHECK: ror x3, x4, x5 +0xe0 0x08 0xca 0x1a +0xc9 0x0a 0xc4 0x9a +0xac 0x0e 0xc0 0x1a +0x4d 0x0c 0xc1 0x9a +0x8b 0x21 0xcd 0x1a +0xee 0x21 0xd0 0x9a +0x51 0x26 0xd3 0x1a +0xb4 0x26 0xd6 0x9a +0x17 0x2b 0xd9 0x1a +0x7a 0x2b 0xdc 0x9a +0x20 0x2c 0xc2 0x1a +0x83 0x2c 0xc5 0x9a + +# CHECK: lsl w6, w7, w8 +# CHECK: lsl x9, x10, x11 +# CHECK: lsr w12, w13, w14 +# CHECK: lsr x15, x16, x17 +# CHECK: asr w18, w19, w20 +# CHECK: asr x21, x22, x23 +# CHECK: ror w24, w25, w26 +# CHECK: ror x27, x28, x29 +0xe6 0x20 0xc8 0x1a +0x49 0x21 0xcb 0x9a +0xac 0x25 0xce 0x1a +0x0f 0x26 0xd1 0x9a +0x72 0x2a 0xd4 0x1a +0xd5 0x2a 0xd7 0x9a +0x38 0x2f 0xda 0x1a +0x9b 0x2f 0xdd 0x9a + +#------------------------------------------------------------------------------ +# Data-processing (3 sources) +#------------------------------------------------------------------------------ + +# First check some non-canonical encodings where Ra is not 0b11111 (only umulh +# and smulh have them). 
+ +# CHECK: smulh x30, x29, x28 +# CHECK: smulh xzr, x27, x26 +# CHECK: umulh x30, x29, x28 +# CHECK: umulh x23, x30, xzr +0xbe 0x73 0x5c 0x9b +0x7f 0x2f 0x5a 0x9b +0xbe 0x3f 0xdc 0x9b +0xd7 0x77 0xdf 0x9b + +# Now onto the boilerplate stuff + +# CHECK: madd w1, w3, w7, w4 +# CHECK: madd wzr, w0, w9, w11 +# CHECK: madd w13, wzr, w4, w4 +# CHECK: madd w19, w30, wzr, w29 +# CHECK: mul w4, w5, w6 +0x61 0x10 0x7 0x1b +0x1f 0x2c 0x9 0x1b +0xed 0x13 0x4 0x1b +0xd3 0x77 0x1f 0x1b +0xa4 0x7c 0x6 0x1b + +# CHECK: madd x1, x3, x7, x4 +# CHECK: madd xzr, x0, x9, x11 +# CHECK: madd x13, xzr, x4, x4 +# CHECK: madd x19, x30, xzr, x29 +# CHECK: mul x4, x5, x6 +0x61 0x10 0x7 0x9b +0x1f 0x2c 0x9 0x9b +0xed 0x13 0x4 0x9b +0xd3 0x77 0x1f 0x9b +0xa4 0x7c 0x6 0x9b + +# CHECK: msub w1, w3, w7, w4 +# CHECK: msub wzr, w0, w9, w11 +# CHECK: msub w13, wzr, w4, w4 +# CHECK: msub w19, w30, wzr, w29 +# CHECK: mneg w4, w5, w6 +0x61 0x90 0x7 0x1b +0x1f 0xac 0x9 0x1b +0xed 0x93 0x4 0x1b +0xd3 0xf7 0x1f 0x1b +0xa4 0xfc 0x6 0x1b + +# CHECK: msub x1, x3, x7, x4 +# CHECK: msub xzr, x0, x9, x11 +# CHECK: msub x13, xzr, x4, x4 +# CHECK: msub x19, x30, xzr, x29 +# CHECK: mneg x4, x5, x6 +0x61 0x90 0x7 0x9b +0x1f 0xac 0x9 0x9b +0xed 0x93 0x4 0x9b +0xd3 0xf7 0x1f 0x9b +0xa4 0xfc 0x6 0x9b + +# CHECK: smaddl x3, w5, w2, x9 +# CHECK: smaddl xzr, w10, w11, x12 +# CHECK: smaddl x13, wzr, w14, x15 +# CHECK: smaddl x16, w17, wzr, x18 +# CHECK: smull x19, w20, w21 +0xa3 0x24 0x22 0x9b +0x5f 0x31 0x2b 0x9b +0xed 0x3f 0x2e 0x9b +0x30 0x4a 0x3f 0x9b +0x93 0x7e 0x35 0x9b + +# CHECK: smsubl x3, w5, w2, x9 +# CHECK: smsubl xzr, w10, w11, x12 +# CHECK: smsubl x13, wzr, w14, x15 +# CHECK: smsubl x16, w17, wzr, x18 +# CHECK: smnegl x19, w20, w21 +0xa3 0xa4 0x22 0x9b +0x5f 0xb1 0x2b 0x9b +0xed 0xbf 0x2e 0x9b +0x30 0xca 0x3f 0x9b +0x93 0xfe 0x35 0x9b + +# CHECK: umaddl x3, w5, w2, x9 +# CHECK: umaddl xzr, w10, w11, x12 +# CHECK: umaddl x13, wzr, w14, x15 +# CHECK: umaddl x16, w17, wzr, x18 +# CHECK: umull x19, w20, w21 +0xa3 0x24 0xa2 0x9b +0x5f 0x31 0xab 0x9b +0xed 0x3f 0xae 0x9b +0x30 0x4a 0xbf 0x9b +0x93 0x7e 0xb5 0x9b + +# CHECK: umsubl x3, w5, w2, x9 +# CHECK: umsubl xzr, w10, w11, x12 +# CHECK: umsubl x13, wzr, w14, x15 +# CHECK: umsubl x16, w17, wzr, x18 +# CHECK: umnegl x19, w20, w21 +0xa3 0xa4 0xa2 0x9b +0x5f 0xb1 0xab 0x9b +0xed 0xbf 0xae 0x9b +0x30 0xca 0xbf 0x9b +0x93 0xfe 0xb5 0x9b + +# CHECK: smulh x30, x29, x28 +# CHECK: smulh xzr, x27, x26 +# CHECK: smulh x25, xzr, x24 +# CHECK: smulh x23, x22, xzr +0xbe 0x7f 0x5c 0x9b +0x7f 0x7f 0x5a 0x9b +0xf9 0x7f 0x58 0x9b +0xd7 0x7e 0x5f 0x9b + +# CHECK: umulh x30, x29, x28 +# CHECK: umulh xzr, x27, x26 +# CHECK: umulh x25, xzr, x24 +# CHECK: umulh x23, x22, xzr +0xbe 0x7f 0xdc 0x9b +0x7f 0x7f 0xda 0x9b +0xf9 0x7f 0xd8 0x9b +0xd7 0x7e 0xdf 0x9b + +# CHECK: mul w3, w4, w5 +# CHECK: mul wzr, w6, w7 +# CHECK: mul w8, wzr, w9 +# CHECK: mul w10, w11, wzr +# CHECK: mul x12, x13, x14 +# CHECK: mul xzr, x15, x16 +# CHECK: mul x17, xzr, x18 +# CHECK: mul x19, x20, xzr +0x83 0x7c 0x5 0x1b +0xdf 0x7c 0x7 0x1b +0xe8 0x7f 0x9 0x1b +0x6a 0x7d 0x1f 0x1b +0xac 0x7d 0xe 0x9b +0xff 0x7d 0x10 0x9b +0xf1 0x7f 0x12 0x9b +0x93 0x7e 0x1f 0x9b + +# CHECK: mneg w21, w22, w23 +# CHECK: mneg wzr, w24, w25 +# CHECK: mneg w26, wzr, w27 +# CHECK: mneg w28, w29, wzr +0xd5 0xfe 0x17 0x1b +0x1f 0xff 0x19 0x1b +0xfa 0xff 0x1b 0x1b +0xbc 0xff 0x1f 0x1b + +# CHECK: smull x11, w13, w17 +# CHECK: umull x11, w13, w17 +# CHECK: smnegl x11, w13, w17 +# CHECK: umnegl x11, w13, w17 +0xab 0x7d 0x31 0x9b +0xab 0x7d 0xb1 0x9b +0xab 0xfd 
0x31 0x9b +0xab 0xfd 0xb1 0x9b + +#------------------------------------------------------------------------------ +# Exception generation +#------------------------------------------------------------------------------ + +# CHECK: svc #0 +# CHECK: svc #65535 +0x1 0x0 0x0 0xd4 +0xe1 0xff 0x1f 0xd4 + +# CHECK: hvc #1 +# CHECK: smc #12000 +# CHECK: brk #12 +# CHECK: hlt #123 +0x22 0x0 0x0 0xd4 +0x3 0xdc 0x5 0xd4 +0x80 0x1 0x20 0xd4 +0x60 0xf 0x40 0xd4 + +# CHECK: dcps1 #42 +# CHECK: dcps2 #9 +# CHECK: dcps3 #1000 +0x41 0x5 0xa0 0xd4 +0x22 0x1 0xa0 0xd4 +0x3 0x7d 0xa0 0xd4 + +# CHECK: dcps1 +# CHECK: dcps2 +# CHECK: dcps3 +0x1 0x0 0xa0 0xd4 +0x2 0x0 0xa0 0xd4 +0x3 0x0 0xa0 0xd4 + +#------------------------------------------------------------------------------ +# Extract (immediate) +#------------------------------------------------------------------------------ + +# CHECK: extr w3, w5, w7, #0 +# CHECK: extr w11, w13, w17, #31 +0xa3 0x0 0x87 0x13 +0xab 0x7d 0x91 0x13 + +# CHECK: extr x3, x5, x7, #15 +# CHECK: extr x11, x13, x17, #63 +0xa3 0x3c 0xc7 0x93 +0xab 0xfd 0xd1 0x93 + +# CHECK: extr x19, x23, x23, #24 +# CHECK: extr x29, xzr, xzr, #63 +# CHECK: extr w9, w13, w13, #31 +0xf3 0x62 0xd7 0x93 +0xfd 0xff 0xdf 0x93 +0xa9 0x7d 0x8d 0x13 + +#------------------------------------------------------------------------------ +# Floating-point compare +#------------------------------------------------------------------------------ + +# CHECK: fcmp s3, s5 +# CHECK: fcmp s31, #0.0 +# CHECK: fcmp s31, #0.0 +0x60 0x20 0x25 0x1e +0xe8 0x23 0x20 0x1e +0xe8 0x23 0x3f 0x1e + +# CHECK: fcmpe s29, s30 +# CHECK: fcmpe s15, #0.0 +# CHECK: fcmpe s15, #0.0 +0xb0 0x23 0x3e 0x1e +0xf8 0x21 0x20 0x1e +0xf8 0x21 0x2f 0x1e + +# CHECK: fcmp d4, d12 +# CHECK: fcmp d23, #0.0 +# CHECK: fcmp d23, #0.0 +0x80 0x20 0x6c 0x1e +0xe8 0x22 0x60 0x1e +0xe8 0x22 0x77 0x1e + +# CHECK: fcmpe d26, d22 +# CHECK: fcmpe d29, #0.0 +# CHECK: fcmpe d29, #0.0 +0x50 0x23 0x76 0x1e +0xb8 0x23 0x60 0x1e +0xb8 0x23 0x6d 0x1e + +#------------------------------------------------------------------------------ +# Floating-point conditional compare +#------------------------------------------------------------------------------ + +# CHECK: fccmp s1, s31, #0, eq +# CHECK: fccmp s3, s0, #15, hs +# CHECK: fccmp s31, s15, #13, hs +0x20 0x04 0x3f 0x1e +0x6f 0x24 0x20 0x1e +0xed 0x27 0x2f 0x1e + +# CHECK: fccmp d9, d31, #0, le +# CHECK: fccmp d3, d0, #15, gt +# CHECK: fccmp d31, d5, #7, ne +0x20 0xd5 0x7f 0x1e +0x6f 0xc4 0x60 0x1e +0xe7 0x17 0x65 0x1e + +# CHECK: fccmpe s1, s31, #0, eq +# CHECK: fccmpe s3, s0, #15, hs +# CHECK: fccmpe s31, s15, #13, hs +0x30 0x04 0x3f 0x1e +0x7f 0x24 0x20 0x1e +0xfd 0x27 0x2f 0x1e + +# CHECK: fccmpe d9, d31, #0, le +# CHECK: fccmpe d3, d0, #15, gt +# CHECK: fccmpe d31, d5, #7, ne +0x30 0xd5 0x7f 0x1e +0x7f 0xc4 0x60 0x1e +0xf7 0x17 0x65 0x1e + +#------------------------------------------------------------------------------ +# Floating-point conditional select +#------------------------------------------------------------------------------ + +# CHECK: fcsel s3, s20, s9, pl +# CHECK: fcsel d9, d10, d11, mi +0x83 0x5e 0x29 0x1e +0x49 0x4d 0x6b 0x1e + +#------------------------------------------------------------------------------ +# Floating-point data-processing (1 source) +#------------------------------------------------------------------------------ + +# CHECK: fmov s0, s1 +# CHECK: fabs s2, s3 +# CHECK: fneg s4, s5 +# CHECK: fsqrt s6, s7 +# CHECK: fcvt d8, s9 +# CHECK: fcvt h10, s11 +# CHECK: frintn s12, s13 +# CHECK:
frintp s14, s15 +# CHECK: frintm s16, s17 +# CHECK: frintz s18, s19 +# CHECK: frinta s20, s21 +# CHECK: frintx s22, s23 +# CHECK: frinti s24, s25 +0x20 0x40 0x20 0x1e +0x62 0xc0 0x20 0x1e +0xa4 0x40 0x21 0x1e +0xe6 0xc0 0x21 0x1e +0x28 0xc1 0x22 0x1e +0x6a 0xc1 0x23 0x1e +0xac 0x41 0x24 0x1e +0xee 0xc1 0x24 0x1e +0x30 0x42 0x25 0x1e +0x72 0xc2 0x25 0x1e +0xb4 0x42 0x26 0x1e +0xf6 0x42 0x27 0x1e +0x38 0xc3 0x27 0x1e + +# CHECK: fmov d0, d1 +# CHECK: fabs d2, d3 +# CHECK: fneg d4, d5 +# CHECK: fsqrt d6, d7 +# CHECK: fcvt s8, d9 +# CHECK: fcvt h10, d11 +# CHECK: frintn d12, d13 +# CHECK: frintp d14, d15 +# CHECK: frintm d16, d17 +# CHECK: frintz d18, d19 +# CHECK: frinta d20, d21 +# CHECK: frintx d22, d23 +# CHECK: frinti d24, d25 +0x20 0x40 0x60 0x1e +0x62 0xc0 0x60 0x1e +0xa4 0x40 0x61 0x1e +0xe6 0xc0 0x61 0x1e +0x28 0x41 0x62 0x1e +0x6a 0xc1 0x63 0x1e +0xac 0x41 0x64 0x1e +0xee 0xc1 0x64 0x1e +0x30 0x42 0x65 0x1e +0x72 0xc2 0x65 0x1e +0xb4 0x42 0x66 0x1e +0xf6 0x42 0x67 0x1e +0x38 0xc3 0x67 0x1e + +# CHECK: fcvt s26, h27 +# CHECK: fcvt d28, h29 +0x7a 0x43 0xe2 0x1e +0xbc 0xc3 0xe2 0x1e + +#------------------------------------------------------------------------------ +# Floating-point data-processing (2 sources) +#------------------------------------------------------------------------------ + +# CHECK: fmul s20, s19, s17 +# CHECK: fdiv s1, s2, s3 +# CHECK: fadd s4, s5, s6 +# CHECK: fsub s7, s8, s9 +# CHECK: fmax s10, s11, s12 +# CHECK: fmin s13, s14, s15 +# CHECK: fmaxnm s16, s17, s18 +# CHECK: fminnm s19, s20, s21 +# CHECK: fnmul s22, s23, s24 +0x74 0xa 0x31 0x1e +0x41 0x18 0x23 0x1e +0xa4 0x28 0x26 0x1e +0x7 0x39 0x29 0x1e +0x6a 0x49 0x2c 0x1e +0xcd 0x59 0x2f 0x1e +0x30 0x6a 0x32 0x1e +0x93 0x7a 0x35 0x1e +0xf6 0x8a 0x38 0x1e + + +# CHECK: fmul d20, d19, d17 +# CHECK: fdiv d1, d2, d3 +# CHECK: fadd d4, d5, d6 +# CHECK: fsub d7, d8, d9 +# CHECK: fmax d10, d11, d12 +# CHECK: fmin d13, d14, d15 +# CHECK: fmaxnm d16, d17, d18 +# CHECK: fminnm d19, d20, d21 +# CHECK: fnmul d22, d23, d24 +0x74 0xa 0x71 0x1e +0x41 0x18 0x63 0x1e +0xa4 0x28 0x66 0x1e +0x7 0x39 0x69 0x1e +0x6a 0x49 0x6c 0x1e +0xcd 0x59 0x6f 0x1e +0x30 0x6a 0x72 0x1e +0x93 0x7a 0x75 0x1e +0xf6 0x8a 0x78 0x1e + +#------------------------------------------------------------------------------ +# Floating-point data-processing (3 sources) +#------------------------------------------------------------------------------ + +# CHECK: fmadd s3, s5, s6, s31 +# CHECK: fmadd d3, d13, d0, d23 +# CHECK: fmsub s3, s5, s6, s31 +# CHECK: fmsub d3, d13, d0, d23 +# CHECK: fnmadd s3, s5, s6, s31 +# CHECK: fnmadd d3, d13, d0, d23 +# CHECK: fnmsub s3, s5, s6, s31 +# CHECK: fnmsub d3, d13, d0, d23 +0xa3 0x7c 0x06 0x1f +0xa3 0x5d 0x40 0x1f +0xa3 0xfc 0x06 0x1f +0xa3 0xdd 0x40 0x1f +0xa3 0x7c 0x26 0x1f +0xa3 0x5d 0x60 0x1f +0xa3 0xfc 0x26 0x1f +0xa3 0xdd 0x60 0x1f + +#------------------------------------------------------------------------------ +# Floating-point <-> fixed-point conversion +#------------------------------------------------------------------------------ + +# CHECK: fcvtzs w3, s5, #1 +# CHECK: fcvtzs wzr, s20, #13 +# CHECK: fcvtzs w19, s0, #32 +0xa3 0xfc 0x18 0x1e +0x9f 0xce 0x18 0x1e +0x13 0x80 0x18 0x1e + +# CHECK: fcvtzs x3, s5, #1 +# CHECK: fcvtzs x12, s30, #45 +# CHECK: fcvtzs x19, s0, #64 +0xa3 0xfc 0x18 0x9e +0xcc 0x4f 0x18 0x9e +0x13 0x00 0x18 0x9e + +# CHECK: fcvtzs w3, d5, #1 +# CHECK: fcvtzs wzr, d20, #13 +# CHECK: fcvtzs w19, d0, #32 +0xa3 0xfc 0x58 0x1e +0x9f 0xce 0x58 0x1e +0x13 0x80 0x58 0x1e + +# CHECK: fcvtzs x3, d5, #1 +#
CHECK: fcvtzs x12, d30, #45 +# CHECK: fcvtzs x19, d0, #64 +0xa3 0xfc 0x58 0x9e +0xcc 0x4f 0x58 0x9e +0x13 0x00 0x58 0x9e + +# CHECK: fcvtzu w3, s5, #1 +# CHECK: fcvtzu wzr, s20, #13 +# CHECK: fcvtzu w19, s0, #32 +0xa3 0xfc 0x19 0x1e +0x9f 0xce 0x19 0x1e +0x13 0x80 0x19 0x1e + +# CHECK: fcvtzu x3, s5, #1 +# CHECK: fcvtzu x12, s30, #45 +# CHECK: fcvtzu x19, s0, #64 +0xa3 0xfc 0x19 0x9e +0xcc 0x4f 0x19 0x9e +0x13 0x00 0x19 0x9e + +# CHECK: fcvtzu w3, d5, #1 +# CHECK: fcvtzu wzr, d20, #13 +# CHECK: fcvtzu w19, d0, #32 +0xa3 0xfc 0x59 0x1e +0x9f 0xce 0x59 0x1e +0x13 0x80 0x59 0x1e + +# CHECK: fcvtzu x3, d5, #1 +# CHECK: fcvtzu x12, d30, #45 +# CHECK: fcvtzu x19, d0, #64 +0xa3 0xfc 0x59 0x9e +0xcc 0x4f 0x59 0x9e +0x13 0x00 0x59 0x9e + +# CHECK: scvtf s23, w19, #1 +# CHECK: scvtf s31, wzr, #20 +# CHECK: scvtf s14, w0, #32 +0x77 0xfe 0x02 0x1e +0xff 0xb3 0x02 0x1e +0x0e 0x80 0x02 0x1e + +# CHECK: scvtf s23, x19, #1 +# CHECK: scvtf s31, xzr, #20 +# CHECK: scvtf s14, x0, #64 +0x77 0xfe 0x02 0x9e +0xff 0xb3 0x02 0x9e +0x0e 0x00 0x02 0x9e + +# CHECK: scvtf d23, w19, #1 +# CHECK: scvtf d31, wzr, #20 +# CHECK: scvtf d14, w0, #32 +0x77 0xfe 0x42 0x1e +0xff 0xb3 0x42 0x1e +0x0e 0x80 0x42 0x1e + +# CHECK: scvtf d23, x19, #1 +# CHECK: scvtf d31, xzr, #20 +# CHECK: scvtf d14, x0, #64 +0x77 0xfe 0x42 0x9e +0xff 0xb3 0x42 0x9e +0x0e 0x00 0x42 0x9e + +# CHECK: ucvtf s23, w19, #1 +# CHECK: ucvtf s31, wzr, #20 +# CHECK: ucvtf s14, w0, #32 +0x77 0xfe 0x03 0x1e +0xff 0xb3 0x03 0x1e +0x0e 0x80 0x03 0x1e + +# CHECK: ucvtf s23, x19, #1 +# CHECK: ucvtf s31, xzr, #20 +# CHECK: ucvtf s14, x0, #64 +0x77 0xfe 0x03 0x9e +0xff 0xb3 0x03 0x9e +0x0e 0x00 0x03 0x9e + +# CHECK: ucvtf d23, w19, #1 +# CHECK: ucvtf d31, wzr, #20 +# CHECK: ucvtf d14, w0, #32 +0x77 0xfe 0x43 0x1e +0xff 0xb3 0x43 0x1e +0x0e 0x80 0x43 0x1e + +# CHECK: ucvtf d23, x19, #1 +# CHECK: ucvtf d31, xzr, #20 +# CHECK: ucvtf d14, x0, #64 +0x77 0xfe 0x43 0x9e +0xff 0xb3 0x43 0x9e +0x0e 0x00 0x43 0x9e + +#------------------------------------------------------------------------------ +# Floating-point <-> integer conversion +#------------------------------------------------------------------------------ +# CHECK: fcvtns w3, s31 +# CHECK: fcvtns xzr, s12 +# CHECK: fcvtnu wzr, s12 +# CHECK: fcvtnu x0, s0 +0xe3 0x3 0x20 0x1e +0x9f 0x1 0x20 0x9e +0x9f 0x1 0x21 0x1e +0x0 0x0 0x21 0x9e + +# CHECK: fcvtps wzr, s9 +# CHECK: fcvtps x12, s20 +# CHECK: fcvtpu w30, s23 +# CHECK: fcvtpu x29, s3 +0x3f 0x1 0x28 0x1e +0x8c 0x2 0x28 0x9e +0xfe 0x2 0x29 0x1e +0x7d 0x0 0x29 0x9e + +# CHECK: fcvtms w2, s3 +# CHECK: fcvtms x4, s5 +# CHECK: fcvtmu w6, s7 +# CHECK: fcvtmu x8, s9 +0x62 0x0 0x30 0x1e +0xa4 0x0 0x30 0x9e +0xe6 0x0 0x31 0x1e +0x28 0x1 0x31 0x9e + +# CHECK: fcvtzs w10, s11 +# CHECK: fcvtzs x12, s13 +# CHECK: fcvtzu w14, s15 +# CHECK: fcvtzu x15, s16 +0x6a 0x1 0x38 0x1e +0xac 0x1 0x38 0x9e +0xee 0x1 0x39 0x1e +0xf 0x2 0x39 0x9e + +# CHECK: scvtf s17, w18 +# CHECK: scvtf s19, x20 +# CHECK: ucvtf s21, w22 +# CHECK: scvtf s23, x24 +0x51 0x2 0x22 0x1e +0x93 0x2 0x22 0x9e +0xd5 0x2 0x23 0x1e +0x17 0x3 0x22 0x9e + +# CHECK: fcvtas w25, s26 +# CHECK: fcvtas x27, s28 +# CHECK: fcvtau w29, s30 +# CHECK: fcvtau xzr, s0 +0x59 0x3 0x24 0x1e +0x9b 0x3 0x24 0x9e +0xdd 0x3 0x25 0x1e +0x1f 0x0 0x25 0x9e + +# CHECK: fcvtns w3, d31 +# CHECK: fcvtns xzr, d12 +# CHECK: fcvtnu wzr, d12 +# CHECK: fcvtnu x0, d0 +0xe3 0x3 0x60 0x1e +0x9f 0x1 0x60 0x9e +0x9f 0x1 0x61 0x1e +0x0 0x0 0x61 0x9e + +# CHECK: fcvtps wzr, d9 +# CHECK: fcvtps x12, d20 +# CHECK: fcvtpu w30, d23 +# CHECK: fcvtpu x29, d3 +0x3f 
0x1 0x68 0x1e +0x8c 0x2 0x68 0x9e +0xfe 0x2 0x69 0x1e +0x7d 0x0 0x69 0x9e + +# CHECK: fcvtms w2, d3 +# CHECK: fcvtms x4, d5 +# CHECK: fcvtmu w6, d7 +# CHECK: fcvtmu x8, d9 +0x62 0x0 0x70 0x1e +0xa4 0x0 0x70 0x9e +0xe6 0x0 0x71 0x1e +0x28 0x1 0x71 0x9e + +# CHECK: fcvtzs w10, d11 +# CHECK: fcvtzs x12, d13 +# CHECK: fcvtzu w14, d15 +# CHECK: fcvtzu x15, d16 +0x6a 0x1 0x78 0x1e +0xac 0x1 0x78 0x9e +0xee 0x1 0x79 0x1e +0xf 0x2 0x79 0x9e + +# CHECK: scvtf d17, w18 +# CHECK: scvtf d19, x20 +# CHECK: ucvtf d21, w22 +# CHECK: ucvtf d23, x24 +0x51 0x2 0x62 0x1e +0x93 0x2 0x62 0x9e +0xd5 0x2 0x63 0x1e +0x17 0x3 0x63 0x9e + +# CHECK: fcvtas w25, d26 +# CHECK: fcvtas x27, d28 +# CHECK: fcvtau w29, d30 +# CHECK: fcvtau xzr, d0 +0x59 0x3 0x64 0x1e +0x9b 0x3 0x64 0x9e +0xdd 0x3 0x65 0x1e +0x1f 0x0 0x65 0x9e + +# CHECK: fmov w3, s9 +# CHECK: fmov s9, w3 +0x23 0x1 0x26 0x1e +0x69 0x0 0x27 0x1e + +# CHECK: fmov x20, d31 +# CHECK: fmov d1, x15 +0xf4 0x3 0x66 0x9e +0xe1 0x1 0x67 0x9e + +# CHECK: fmov x3, v12.d[1] +# CHECK: fmov v1.d[1], x19 +0x83 0x1 0xae 0x9e +0x61 0x2 0xaf 0x9e + +#------------------------------------------------------------------------------ +# Floating-point immediate +#------------------------------------------------------------------------------ + +# CHECK: fmov s2, #0.12500000 +# CHECK: fmov s3, #1.00000000 +# CHECK: fmov d30, #16.00000000 +0x2 0x10 0x28 0x1e +0x3 0x10 0x2e 0x1e +0x1e 0x10 0x66 0x1e + +# CHECK: fmov s4, #1.06250000 +# CHECK: fmov d10, #1.93750000 +0x4 0x30 0x2e 0x1e +0xa 0xf0 0x6f 0x1e + +# CHECK: fmov s12, #-1.00000000 +0xc 0x10 0x3e 0x1e + +# CHECK: fmov d16, #8.50000000 +0x10 0x30 0x64 0x1e + +#------------------------------------------------------------------------------ +# Load-register (literal) +#------------------------------------------------------------------------------ + +# CHECK: ldr w3, #0 +# CHECK: ldr x29, #4 +# CHECK: ldrsw xzr, #-4 +0x03 0x00 0x00 0x18 +0x3d 0x00 0x00 0x58 +0xff 0xff 0xff 0x98 + +# CHECK: ldr s0, #8 +# CHECK: ldr d0, #1048572 +# CHECK: ldr q0, #-1048576 +0x40 0x00 0x00 0x1c +0xe0 0xff 0x7f 0x5c +0x00 0x00 0x80 0x9c + +# CHECK: prfm pldl1strm, #0 +# CHECK: prfm #22, #0 +0x01 0x00 0x00 0xd8 +0x16 0x00 0x00 0xd8 + +#------------------------------------------------------------------------------ +# Load/store exclusive +#------------------------------------------------------------------------------ + +#CHECK: stxrb w18, w8, [sp] +#CHECK: stxrh w24, w15, [x16] +#CHECK: stxr w5, w6, [x17] +#CHECK: stxr w1, x10, [x21] +#CHECK: stxr w1, x10, [x21] +0xe8 0x7f 0x12 0x08 +0x0f 0x7e 0x18 0x48 +0x26 0x7e 0x05 0x88 +0xaa 0x7e 0x01 0xc8 +0xaa 0x7a 0x01 0xc8 + +#CHECK: ldxrb w30, [x0] +#CHECK: ldxrh w17, [x4] +#CHECK: ldxr w22, [sp] +#CHECK: ldxr x11, [x29] +#CHECK: ldxr x11, [x29] +#CHECK: ldxr x11, [x29] +0x1e 0x7c 0x5f 0x08 +0x91 0x7c 0x5f 0x48 +0xf6 0x7f 0x5f 0x88 +0xab 0x7f 0x5f 0xc8 +0xab 0x6f 0x5f 0xc8 +0xab 0x7f 0x5e 0xc8 + +#CHECK: stxp w12, w11, w10, [sp] +#CHECK: stxp wzr, x27, x9, [x12] +0xeb 0x2b 0x2c 0x88 +0x9b 0x25 0x3f 0xc8 + +#CHECK: ldxp w0, wzr, [sp] +#CHECK: ldxp x17, x0, [x18] +#CHECK: ldxp x17, x0, [x18] +0xe0 0x7f 0x7f 0x88 +0x51 0x02 0x7f 0xc8 +0x51 0x02 0x7e 0xc8 + +#CHECK: stlxrb w12, w22, [x0] +#CHECK: stlxrh w10, w1, [x1] +#CHECK: stlxr w9, w2, [x2] +#CHECK: stlxr w9, x3, [sp] + +0x16 0xfc 0x0c 0x08 +0x21 0xfc 0x0a 0x48 +0x42 0xfc 0x09 0x88 +0xe3 0xff 0x09 0xc8 + +#CHECK: ldaxrb w8, [x4] +#CHECK: ldaxrh w7, [x5] +#CHECK: ldaxr w6, [sp] +#CHECK: ldaxr x5, [x6] +#CHECK: ldaxr x5, [x6] +#CHECK: ldaxr x5, [x6] +0x88 0xfc 0x5f 
0x08 +0xa7 0xfc 0x5f 0x48 +0xe6 0xff 0x5f 0x88 +0xc5 0xfc 0x5f 0xc8 +0xc5 0xec 0x5f 0xc8 +0xc5 0xfc 0x5e 0xc8 + +#CHECK: stlxp w4, w5, w6, [sp] +#CHECK: stlxp wzr, x6, x7, [x1] +0xe5 0x9b 0x24 0x88 +0x26 0x9c 0x3f 0xc8 + +#CHECK: ldaxp w5, w18, [sp] +#CHECK: ldaxp x6, x19, [x22] +#CHECK: ldaxp x6, x19, [x22] +0xe5 0xcb 0x7f 0x88 +0xc6 0xce 0x7f 0xc8 +0xc6 0xce 0x7e 0xc8 + +#CHECK: stlrb w24, [sp] +#CHECK: stlrh w25, [x30] +#CHECK: stlr w26, [x29] +#CHECK: stlr x27, [x28] +#CHECK: stlr x27, [x28] +#CHECK: stlr x27, [x28] +0xf8 0xff 0x9f 0x08 +0xd9 0xff 0x9f 0x48 +0xba 0xff 0x9f 0x88 +0x9b 0xff 0x9f 0xc8 +0x9b 0xef 0x9f 0xc8 +0x9b 0xff 0x9e 0xc8 + +#CHECK: ldarb w23, [sp] +#CHECK: ldarh w22, [x30] +#CHECK: ldar wzr, [x29] +#CHECK: ldar x21, [x28] +#CHECK: ldar x21, [x28] +#CHECK: ldar x21, [x28] +0xf7 0xff 0xdf 0x08 +0xd6 0xff 0xdf 0x48 +0xbf 0xff 0xdf 0x88 +0x95 0xff 0xdf 0xc8 +0x95 0xef 0xdf 0xc8 +0x95 0xff 0xde 0xc8 + +#------------------------------------------------------------------------------ +# Load/store (unscaled immediate) +#------------------------------------------------------------------------------ + +# CHECK: sturb w9, [sp] +# CHECK: sturh wzr, [x12, #255] +# CHECK: stur w16, [x0, #-256] +# CHECK: stur x28, [x14, #1] +0xe9 0x3 0x0 0x38 +0x9f 0xf1 0xf 0x78 +0x10 0x0 0x10 0xb8 +0xdc 0x11 0x0 0xf8 + +# CHECK: ldurb w1, [x20, #255] +# CHECK: ldurh w20, [x1, #255] +# CHECK: ldur w12, [sp, #255] +# CHECK: ldur xzr, [x12, #255] +0x81 0xf2 0x4f 0x38 +0x34 0xf0 0x4f 0x78 +0xec 0xf3 0x4f 0xb8 +0x9f 0xf1 0x4f 0xf8 + +# CHECK: ldursb x9, [x7, #-256] +# CHECK: ldursh x17, [x19, #-256] +# CHECK: ldursw x20, [x15, #-256] +# CHECK: prfum pldl2keep, [sp, #-256] +# CHECK: ldursb w19, [x1, #-256] +# CHECK: ldursh w15, [x21, #-256] +0xe9 0x0 0x90 0x38 +0x71 0x2 0x90 0x78 +0xf4 0x1 0x90 0xb8 +0xe2 0x3 0x90 0xf8 +0x33 0x0 0xd0 0x38 +0xaf 0x2 0xd0 0x78 + +# CHECK: stur b0, [sp, #1] +# CHECK: stur h12, [x12, #-1] +# CHECK: stur s15, [x0, #255] +# CHECK: stur d31, [x5, #25] +# CHECK: stur q9, [x5] +0xe0 0x13 0x0 0x3c +0x8c 0xf1 0x1f 0x7c +0xf 0xf0 0xf 0xbc +0xbf 0x90 0x1 0xfc +0xa9 0x0 0x80 0x3c + +# CHECK: ldur b3, [sp] +# CHECK: ldur h5, [x4, #-256] +# CHECK: ldur s7, [x12, #-1] +# CHECK: ldur d11, [x19, #4] +# CHECK: ldur q13, [x1, #2] +0xe3 0x3 0x40 0x3c +0x85 0x0 0x50 0x7c +0x87 0xf1 0x5f 0xbc +0x6b 0x42 0x40 0xfc +0x2d 0x20 0xc0 0x3c + +#------------------------------------------------------------------------------ +# Load/store (immediate post-indexed) +#------------------------------------------------------------------------------ + +# E.g. 
"str xzr, [sp], #4" is *not* unpredictable +# CHECK-NOT: warning: potentially undefined instruction encoding +0xff 0x47 0x40 0xb8 + +# CHECK: strb w9, [x2], #255 +# CHECK: strb w10, [x3], #1 +# CHECK: strb w10, [x3], #-256 +# CHECK: strh w9, [x2], #255 +# CHECK: strh w9, [x2], #1 +# CHECK: strh w10, [x3], #-256 +0x49 0xf4 0xf 0x38 +0x6a 0x14 0x0 0x38 +0x6a 0x4 0x10 0x38 +0x49 0xf4 0xf 0x78 +0x49 0x14 0x0 0x78 +0x6a 0x4 0x10 0x78 + +# CHECK: str w19, [sp], #255 +# CHECK: str w20, [x30], #1 +# CHECK: str w21, [x12], #-256 +# CHECK: str xzr, [x9], #255 +# CHECK: str x2, [x3], #1 +# CHECK: str x19, [x12], #-256 +0xf3 0xf7 0xf 0xb8 +0xd4 0x17 0x0 0xb8 +0x95 0x5 0x10 0xb8 +0x3f 0xf5 0xf 0xf8 +0x62 0x14 0x0 0xf8 +0x93 0x5 0x10 0xf8 + +# CHECK: ldrb w9, [x2], #255 +# CHECK: ldrb w10, [x3], #1 +# CHECK: ldrb w10, [x3], #-256 +# CHECK: ldrh w9, [x2], #255 +# CHECK: ldrh w9, [x2], #1 +# CHECK: ldrh w10, [x3], #-256 +0x49 0xf4 0x4f 0x38 +0x6a 0x14 0x40 0x38 +0x6a 0x4 0x50 0x38 +0x49 0xf4 0x4f 0x78 +0x49 0x14 0x40 0x78 +0x6a 0x4 0x50 0x78 + +# CHECK: ldr w19, [sp], #255 +# CHECK: ldr w20, [x30], #1 +# CHECK: ldr w21, [x12], #-256 +# CHECK: ldr xzr, [x9], #255 +# CHECK: ldr x2, [x3], #1 +# CHECK: ldr x19, [x12], #-256 +0xf3 0xf7 0x4f 0xb8 +0xd4 0x17 0x40 0xb8 +0x95 0x5 0x50 0xb8 +0x3f 0xf5 0x4f 0xf8 +0x62 0x14 0x40 0xf8 +0x93 0x5 0x50 0xf8 + +# CHECK: ldrsb xzr, [x9], #255 +# CHECK: ldrsb x2, [x3], #1 +# CHECK: ldrsb x19, [x12], #-256 +# CHECK: ldrsh xzr, [x9], #255 +# CHECK: ldrsh x2, [x3], #1 +# CHECK: ldrsh x19, [x12], #-256 +# CHECK: ldrsw xzr, [x9], #255 +# CHECK: ldrsw x2, [x3], #1 +# CHECK: ldrsw x19, [x12], #-256 +0x3f 0xf5 0x8f 0x38 +0x62 0x14 0x80 0x38 +0x93 0x5 0x90 0x38 +0x3f 0xf5 0x8f 0x78 +0x62 0x14 0x80 0x78 +0x93 0x5 0x90 0x78 +0x3f 0xf5 0x8f 0xb8 +0x62 0x14 0x80 0xb8 +0x93 0x5 0x90 0xb8 + +# CHECK: ldrsb wzr, [x9], #255 +# CHECK: ldrsb w2, [x3], #1 +# CHECK: ldrsb w19, [x12], #-256 +# CHECK: ldrsh wzr, [x9], #255 +# CHECK: ldrsh w2, [x3], #1 +# CHECK: ldrsh w19, [x12], #-256 +0x3f 0xf5 0xcf 0x38 +0x62 0x14 0xc0 0x38 +0x93 0x5 0xd0 0x38 +0x3f 0xf5 0xcf 0x78 +0x62 0x14 0xc0 0x78 +0x93 0x5 0xd0 0x78 + +# CHECK: str b0, [x0], #255 +# CHECK: str b3, [x3], #1 +# CHECK: str b5, [sp], #-256 +# CHECK: str h10, [x10], #255 +# CHECK: str h13, [x23], #1 +# CHECK: str h15, [sp], #-256 +# CHECK: str s20, [x20], #255 +# CHECK: str s23, [x23], #1 +# CHECK: str s25, [x0], #-256 +# CHECK: str d20, [x20], #255 +# CHECK: str d23, [x23], #1 +# CHECK: str d25, [x0], #-256 +0x0 0xf4 0xf 0x3c +0x63 0x14 0x0 0x3c +0xe5 0x7 0x10 0x3c +0x4a 0xf5 0xf 0x7c +0xed 0x16 0x0 0x7c +0xef 0x7 0x10 0x7c +0x94 0xf6 0xf 0xbc +0xf7 0x16 0x0 0xbc +0x19 0x4 0x10 0xbc +0x94 0xf6 0xf 0xfc +0xf7 0x16 0x0 0xfc +0x19 0x4 0x10 0xfc + +# CHECK: ldr b0, [x0], #255 +# CHECK: ldr b3, [x3], #1 +# CHECK: ldr b5, [sp], #-256 +# CHECK: ldr h10, [x10], #255 +# CHECK: ldr h13, [x23], #1 +# CHECK: ldr h15, [sp], #-256 +# CHECK: ldr s20, [x20], #255 +# CHECK: ldr s23, [x23], #1 +# CHECK: ldr s25, [x0], #-256 +# CHECK: ldr d20, [x20], #255 +# CHECK: ldr d23, [x23], #1 +# CHECK: ldr d25, [x0], #-256 +0x0 0xf4 0x4f 0x3c +0x63 0x14 0x40 0x3c +0xe5 0x7 0x50 0x3c +0x4a 0xf5 0x4f 0x7c +0xed 0x16 0x40 0x7c +0xef 0x7 0x50 0x7c +0x94 0xf6 0x4f 0xbc +0xf7 0x16 0x40 0xbc +0x19 0x4 0x50 0xbc +0x94 0xf6 0x4f 0xfc +0xf7 0x16 0x40 0xfc +0x19 0x4 0x50 0xfc +0x34 0xf4 0xcf 0x3c + +# CHECK: ldr q20, [x1], #255 +# CHECK: ldr q23, [x9], #1 +# CHECK: ldr q25, [x20], #-256 +# CHECK: str q10, [x1], #255 +# CHECK: str q22, [sp], #1 +# CHECK: str q21, [x20], #-256 +0x37 
0x15 0xc0 0x3c +0x99 0x6 0xd0 0x3c +0x2a 0xf4 0x8f 0x3c +0xf6 0x17 0x80 0x3c +0x95 0x6 0x90 0x3c + +#------------------------------------------------------------------------------- +# Load-store register (immediate pre-indexed) +#------------------------------------------------------------------------------- + +# E.g. "str xzr, [sp, #4]!" is *not* unpredictable +# CHECK-NOT: warning: potentially undefined instruction encoding +0xff 0xf 0x40 0xf8 + +# CHECK: ldr x3, [x4, #0]! +0x83 0xc 0x40 0xf8 + +# CHECK: strb w9, [x2, #255]! +# CHECK: strb w10, [x3, #1]! +# CHECK: strb w10, [x3, #-256]! +# CHECK: strh w9, [x2, #255]! +# CHECK: strh w9, [x2, #1]! +# CHECK: strh w10, [x3, #-256]! +0x49 0xfc 0xf 0x38 +0x6a 0x1c 0x0 0x38 +0x6a 0xc 0x10 0x38 +0x49 0xfc 0xf 0x78 +0x49 0x1c 0x0 0x78 +0x6a 0xc 0x10 0x78 + +# CHECK: str w19, [sp, #255]! +# CHECK: str w20, [x30, #1]! +# CHECK: str w21, [x12, #-256]! +# CHECK: str xzr, [x9, #255]! +# CHECK: str x2, [x3, #1]! +# CHECK: str x19, [x12, #-256]! +0xf3 0xff 0xf 0xb8 +0xd4 0x1f 0x0 0xb8 +0x95 0xd 0x10 0xb8 +0x3f 0xfd 0xf 0xf8 +0x62 0x1c 0x0 0xf8 +0x93 0xd 0x10 0xf8 + +# CHECK: ldrb w9, [x2, #255]! +# CHECK: ldrb w10, [x3, #1]! +# CHECK: ldrb w10, [x3, #-256]! +# CHECK: ldrh w9, [x2, #255]! +# CHECK: ldrh w9, [x2, #1]! +# CHECK: ldrh w10, [x3, #-256]! +0x49 0xfc 0x4f 0x38 +0x6a 0x1c 0x40 0x38 +0x6a 0xc 0x50 0x38 +0x49 0xfc 0x4f 0x78 +0x49 0x1c 0x40 0x78 +0x6a 0xc 0x50 0x78 + +# CHECK: ldr w19, [sp, #255]! +# CHECK: ldr w20, [x30, #1]! +# CHECK: ldr w21, [x12, #-256]! +# CHECK: ldr xzr, [x9, #255]! +# CHECK: ldr x2, [x3, #1]! +# CHECK: ldr x19, [x12, #-256]! +0xf3 0xff 0x4f 0xb8 +0xd4 0x1f 0x40 0xb8 +0x95 0xd 0x50 0xb8 +0x3f 0xfd 0x4f 0xf8 +0x62 0x1c 0x40 0xf8 +0x93 0xd 0x50 0xf8 + +# CHECK: ldrsb xzr, [x9, #255]! +# CHECK: ldrsb x2, [x3, #1]! +# CHECK: ldrsb x19, [x12, #-256]! +# CHECK: ldrsh xzr, [x9, #255]! +# CHECK: ldrsh x2, [x3, #1]! +# CHECK: ldrsh x19, [x12, #-256]! +# CHECK: ldrsw xzr, [x9, #255]! +# CHECK: ldrsw x2, [x3, #1]! +# CHECK: ldrsw x19, [x12, #-256]! +0x3f 0xfd 0x8f 0x38 +0x62 0x1c 0x80 0x38 +0x93 0xd 0x90 0x38 +0x3f 0xfd 0x8f 0x78 +0x62 0x1c 0x80 0x78 +0x93 0xd 0x90 0x78 +0x3f 0xfd 0x8f 0xb8 +0x62 0x1c 0x80 0xb8 +0x93 0xd 0x90 0xb8 + +# CHECK: ldrsb wzr, [x9, #255]! +# CHECK: ldrsb w2, [x3, #1]! +# CHECK: ldrsb w19, [x12, #-256]! +# CHECK: ldrsh wzr, [x9, #255]! +# CHECK: ldrsh w2, [x3, #1]! +# CHECK: ldrsh w19, [x12, #-256]! +0x3f 0xfd 0xcf 0x38 +0x62 0x1c 0xc0 0x38 +0x93 0xd 0xd0 0x38 +0x3f 0xfd 0xcf 0x78 +0x62 0x1c 0xc0 0x78 +0x93 0xd 0xd0 0x78 + +# CHECK: str b0, [x0, #255]! +# CHECK: str b3, [x3, #1]! +# CHECK: str b5, [sp, #-256]! +# CHECK: str h10, [x10, #255]! +# CHECK: str h13, [x23, #1]! +# CHECK: str h15, [sp, #-256]! +# CHECK: str s20, [x20, #255]! +# CHECK: str s23, [x23, #1]! +# CHECK: str s25, [x0, #-256]! +# CHECK: str d20, [x20, #255]! +# CHECK: str d23, [x23, #1]! +# CHECK: str d25, [x0, #-256]! +0x0 0xfc 0xf 0x3c +0x63 0x1c 0x0 0x3c +0xe5 0xf 0x10 0x3c +0x4a 0xfd 0xf 0x7c +0xed 0x1e 0x0 0x7c +0xef 0xf 0x10 0x7c +0x94 0xfe 0xf 0xbc +0xf7 0x1e 0x0 0xbc +0x19 0xc 0x10 0xbc +0x94 0xfe 0xf 0xfc +0xf7 0x1e 0x0 0xfc +0x19 0xc 0x10 0xfc + +# CHECK: ldr b0, [x0, #255]! +# CHECK: ldr b3, [x3, #1]! +# CHECK: ldr b5, [sp, #-256]! +# CHECK: ldr h10, [x10, #255]! +# CHECK: ldr h13, [x23, #1]! +# CHECK: ldr h15, [sp, #-256]! +# CHECK: ldr s20, [x20, #255]! +# CHECK: ldr s23, [x23, #1]! +# CHECK: ldr s25, [x0, #-256]! +# CHECK: ldr d20, [x20, #255]! +# CHECK: ldr d23, [x23, #1]! +# CHECK: ldr d25, [x0, #-256]! 
+0x0 0xfc 0x4f 0x3c +0x63 0x1c 0x40 0x3c +0xe5 0xf 0x50 0x3c +0x4a 0xfd 0x4f 0x7c +0xed 0x1e 0x40 0x7c +0xef 0xf 0x50 0x7c +0x94 0xfe 0x4f 0xbc +0xf7 0x1e 0x40 0xbc +0x19 0xc 0x50 0xbc +0x94 0xfe 0x4f 0xfc +0xf7 0x1e 0x40 0xfc +0x19 0xc 0x50 0xfc + +# CHECK: ldr q20, [x1, #255]! +# CHECK: ldr q23, [x9, #1]! +# CHECK: ldr q25, [x20, #-256]! +# CHECK: str q10, [x1, #255]! +# CHECK: str q22, [sp, #1]! +# CHECK: str q21, [x20, #-256]! +0x34 0xfc 0xcf 0x3c +0x37 0x1d 0xc0 0x3c +0x99 0xe 0xd0 0x3c +0x2a 0xfc 0x8f 0x3c +0xf6 0x1f 0x80 0x3c +0x95 0xe 0x90 0x3c + +#------------------------------------------------------------------------------ +# Load/store (unprivileged) +#------------------------------------------------------------------------------ + +# CHECK: sttrb w9, [sp] +# CHECK: sttrh wzr, [x12, #255] +# CHECK: sttr w16, [x0, #-256] +# CHECK: sttr x28, [x14, #1] +0xe9 0x0b 0x0 0x38 +0x9f 0xf9 0xf 0x78 +0x10 0x08 0x10 0xb8 +0xdc 0x19 0x0 0xf8 + +# CHECK: ldtrb w1, [x20, #255] +# CHECK: ldtrh w20, [x1, #255] +# CHECK: ldtr w12, [sp, #255] +# CHECK: ldtr xzr, [x12, #255] +0x81 0xfa 0x4f 0x38 +0x34 0xf8 0x4f 0x78 +0xec 0xfb 0x4f 0xb8 +0x9f 0xf9 0x4f 0xf8 + +# CHECK: ldtrsb x9, [x7, #-256] +# CHECK: ldtrsh x17, [x19, #-256] +# CHECK: ldtrsw x20, [x15, #-256] +# CHECK: ldtrsb w19, [x1, #-256] +# CHECK: ldtrsh w15, [x21, #-256] +0xe9 0x08 0x90 0x38 +0x71 0x0a 0x90 0x78 +0xf4 0x09 0x90 0xb8 +0x33 0x08 0xd0 0x38 +0xaf 0x0a 0xd0 0x78 + +#------------------------------------------------------------------------------ +# Load/store (unsigned immediate) +#------------------------------------------------------------------------------ + +# CHECK: ldr x0, [x0] +# CHECK: ldr x4, [x29] +# CHECK: ldr x30, [x12, #32760] +# CHECK: ldr x20, [sp, #8] +0x0 0x0 0x40 0xf9 +0xa4 0x3 0x40 0xf9 +0x9e 0xfd 0x7f 0xf9 +0xf4 0x7 0x40 0xf9 + +# CHECK: ldr xzr, [sp] +0xff 0x3 0x40 0xf9 + +# CHECK: ldr w2, [sp] +# CHECK: ldr w17, [sp, #16380] +# CHECK: ldr w13, [x2, #4] +0xe2 0x3 0x40 0xb9 +0xf1 0xff 0x7f 0xb9 +0x4d 0x4 0x40 0xb9 + +# CHECK: ldrsw x2, [x5, #4] +# CHECK: ldrsw x23, [sp, #16380] +0xa2 0x4 0x80 0xb9 +0xf7 0xff 0xbf 0xb9 + +# CHECK: ldrh w2, [x4] +# CHECK: ldrsh w23, [x6, #8190] +# CHECK: ldrsh wzr, [sp, #2] +# CHECK: ldrsh x29, [x2, #2] +0x82 0x0 0x40 0x79 +0xd7 0xfc 0xff 0x79 +0xff 0x7 0xc0 0x79 +0x5d 0x4 0x80 0x79 + +# CHECK: ldrb w26, [x3, #121] +# CHECK: ldrb w12, [x2] +# CHECK: ldrsb w27, [sp, #4095] +# CHECK: ldrsb xzr, [x15] +0x7a 0xe4 0x41 0x39 +0x4c 0x0 0x40 0x39 +0xfb 0xff 0xff 0x39 +0xff 0x1 0x80 0x39 + +# CHECK: str x30, [sp] +# CHECK: str w20, [x4, #16380] +# CHECK: strh w20, [x10, #14] +# CHECK: strh w17, [sp, #8190] +# CHECK: strb w23, [x3, #4095] +# CHECK: strb wzr, [x2] +0xfe 0x3 0x0 0xf9 +0x94 0xfc 0x3f 0xb9 +0x54 0x1d 0x0 0x79 +0xf1 0xff 0x3f 0x79 +0x77 0xfc 0x3f 0x39 +0x5f 0x0 0x0 0x39 + +# CHECK: ldr b31, [sp, #4095] +# CHECK: ldr h20, [x2, #8190] +# CHECK: ldr s10, [x19, #16380] +# CHECK: ldr d3, [x10, #32760] +# CHECK: str q12, [sp, #65520] +0xff 0xff 0x7f 0x3d +0x54 0xfc 0x7f 0x7d +0x6a 0xfe 0x7f 0xbd +0x43 0xfd 0x7f 0xfd +0xec 0xff 0xbf 0x3d + +# CHECK: prfm pldl1keep, [sp, #8] +# CHECK: prfm pldl1strm, [x3, #0] +# CHECK: prfm pldl2keep, [x5, #16] +# CHECK: prfm pldl2strm, [x2, #0] +# CHECK: prfm pldl3keep, [x5, #0] +# CHECK: prfm pldl3strm, [x6, #0] +# CHECK: prfm plil1keep, [sp, #8] +# CHECK: prfm plil1strm, [x3, #0] +# CHECK: prfm plil2keep, [x5, #16] +# CHECK: prfm plil2strm, [x2, #0] +# CHECK: prfm plil3keep, [x5, #0] +# CHECK: prfm plil3strm, [x6, #0] +# CHECK: prfm pstl1keep, [sp, 
#8] +# CHECK: prfm pstl1strm, [x3, #0] +# CHECK: prfm pstl2keep, [x5, #16] +# CHECK: prfm pstl2strm, [x2, #0] +# CHECK: prfm pstl3keep, [x5, #0] +# CHECK: prfm pstl3strm, [x6, #0] +0xe0 0x07 0x80 0xf9 +0x61 0x00 0x80 0xf9 +0xa2 0x08 0x80 0xf9 +0x43 0x00 0x80 0xf9 +0xa4 0x00 0x80 0xf9 +0xc5 0x00 0x80 0xf9 +0xe8 0x07 0x80 0xf9 +0x69 0x00 0x80 0xf9 +0xaa 0x08 0x80 0xf9 +0x4b 0x00 0x80 0xf9 +0xac 0x00 0x80 0xf9 +0xcd 0x00 0x80 0xf9 +0xf0 0x07 0x80 0xf9 +0x71 0x00 0x80 0xf9 +0xb2 0x08 0x80 0xf9 +0x53 0x00 0x80 0xf9 +0xb4 0x00 0x80 0xf9 +0xd5 0x00 0x80 0xf9 + + +#------------------------------------------------------------------------------ +# Load/store (register offset) +#------------------------------------------------------------------------------ + +# CHECK: ldrb w3, [sp, x5] +# CHECK: ldrb w9, [x27, x6] +# CHECK: ldrsb w10, [x30, x7] +# CHECK: ldrb w11, [x29, x3, sxtx] +# CHECK: strb w12, [x28, xzr, sxtx] +# CHECK: ldrb w14, [x26, w6, uxtw] +# CHECK: ldrsb w15, [x25, w7, uxtw] +# CHECK: ldrb w17, [x23, w9, sxtw] +# CHECK: ldrsb x18, [x22, w10, sxtw] +0xe3 0x6b 0x65 0x38 +0x69 0x6b 0x66 0x38 +0xca 0x6b 0xe7 0x38 +0xab 0xeb 0x63 0x38 +0x8c 0xeb 0x3f 0x38 +0x4e 0x4b 0x66 0x38 +0x2f 0x4b 0xe7 0x38 +0xf1 0xca 0x69 0x38 +0xd2 0xca 0xaa 0x38 + +# CHECK: ldrsh w3, [sp, x5] +# CHECK: ldrsh w9, [x27, x6] +# CHECK: ldrh w10, [x30, x7, lsl #1] +# CHECK: strh w11, [x29, x3, sxtx] +# CHECK: ldrh w12, [x28, xzr, sxtx] +# CHECK: ldrsh x13, [x27, x5, sxtx #1] +# CHECK: ldrh w14, [x26, w6, uxtw] +# CHECK: ldrh w15, [x25, w7, uxtw] +# CHECK: ldrsh w16, [x24, w8, uxtw #1] +# CHECK: ldrh w17, [x23, w9, sxtw] +# CHECK: ldrh w18, [x22, w10, sxtw] +# CHECK: strh w19, [x21, wzr, sxtw #1] +0xe3 0x6b 0xe5 0x78 +0x69 0x6b 0xe6 0x78 +0xca 0x7b 0x67 0x78 +0xab 0xeb 0x23 0x78 +0x8c 0xeb 0x7f 0x78 +0x6d 0xfb 0xa5 0x78 +0x4e 0x4b 0x66 0x78 +0x2f 0x4b 0x67 0x78 +0x10 0x5b 0xe8 0x78 +0xf1 0xca 0x69 0x78 +0xd2 0xca 0x6a 0x78 +0xb3 0xda 0x3f 0x78 + +# CHECK: ldr w3, [sp, x5] +# CHECK: ldr s9, [x27, x6] +# CHECK: ldr w10, [x30, x7, lsl #2] +# CHECK: ldr w11, [x29, x3, sxtx] +# CHECK: str s12, [x28, xzr, sxtx] +# CHECK: str w13, [x27, x5, sxtx #2] +# CHECK: str w14, [x26, w6, uxtw] +# CHECK: ldr w15, [x25, w7, uxtw] +# CHECK: ldr w16, [x24, w8, uxtw #2] +# CHECK: ldrsw x17, [x23, w9, sxtw] +# CHECK: ldr w18, [x22, w10, sxtw] +# CHECK: ldrsw x19, [x21, wzr, sxtw #2] +0xe3 0x6b 0x65 0xb8 +0x69 0x6b 0x66 0xbc +0xca 0x7b 0x67 0xb8 +0xab 0xeb 0x63 0xb8 +0x8c 0xeb 0x3f 0xbc +0x6d 0xfb 0x25 0xb8 +0x4e 0x4b 0x26 0xb8 +0x2f 0x4b 0x67 0xb8 +0x10 0x5b 0x68 0xb8 +0xf1 0xca 0xa9 0xb8 +0xd2 0xca 0x6a 0xb8 +0xb3 0xda 0xbf 0xb8 + +# CHECK: ldr x3, [sp, x5] +# CHECK: str x9, [x27, x6] +# CHECK: ldr d10, [x30, x7, lsl #3] +# CHECK: str x11, [x29, x3, sxtx] +# CHECK: ldr x12, [x28, xzr, sxtx] +# CHECK: ldr x13, [x27, x5, sxtx #3] +# CHECK: prfm pldl1keep, [x26, w6, uxtw] +# CHECK: ldr x15, [x25, w7, uxtw] +# CHECK: ldr x16, [x24, w8, uxtw #3] +# CHECK: ldr x17, [x23, w9, sxtw] +# CHECK: ldr x18, [x22, w10, sxtw] +# CHECK: str d19, [x21, wzr, sxtw #3] +0xe3 0x6b 0x65 0xf8 +0x69 0x6b 0x26 0xf8 +0xca 0x7b 0x67 0xfc +0xab 0xeb 0x23 0xf8 +0x8c 0xeb 0x7f 0xf8 +0x6d 0xfb 0x65 0xf8 +0x40 0x4b 0xa6 0xf8 +0x2f 0x4b 0x67 0xf8 +0x10 0x5b 0x68 0xf8 +0xf1 0xca 0x69 0xf8 +0xd2 0xca 0x6a 0xf8 +0xb3 0xda 0x3f 0xfc + +# CHECK: ldr q3, [sp, x5] +# CHECK: ldr q9, [x27, x6] +# CHECK: ldr q10, [x30, x7, lsl #4] +# CHECK: str q11, [x29, x3, sxtx] +# CHECK: str q12, [x28, xzr, sxtx] +# CHECK: str q13, [x27, x5, sxtx #4] +# CHECK: ldr q14, [x26, w6, uxtw] +# CHECK: ldr 
q15, [x25, w7, uxtw] +# CHECK: ldr q16, [x24, w8, uxtw #4] +# CHECK: ldr q17, [x23, w9, sxtw] +# CHECK: str q18, [x22, w10, sxtw] +# CHECK: ldr q19, [x21, wzr, sxtw #4] +0xe3 0x6b 0xe5 0x3c +0x69 0x6b 0xe6 0x3c +0xca 0x7b 0xe7 0x3c +0xab 0xeb 0xa3 0x3c +0x8c 0xeb 0xbf 0x3c +0x6d 0xfb 0xa5 0x3c +0x4e 0x4b 0xe6 0x3c +0x2f 0x4b 0xe7 0x3c +0x10 0x5b 0xe8 0x3c +0xf1 0xca 0xe9 0x3c +0xd2 0xca 0xaa 0x3c +0xb3 0xda 0xff 0x3c + +#------------------------------------------------------------------------------ +# Load/store register pair (offset) +#------------------------------------------------------------------------------ + +# CHECK: ldp w3, w5, [sp] +# CHECK: stp wzr, w9, [sp, #252] +# CHECK: ldp w2, wzr, [sp, #-256] +# CHECK: ldp w9, w10, [sp, #4] +0xe3 0x17 0x40 0x29 +0xff 0xa7 0x1f 0x29 +0xe2 0x7f 0x60 0x29 +0xe9 0xab 0x40 0x29 + +# CHECK: ldpsw x9, x10, [sp, #4] +# CHECK: ldpsw x9, x10, [x2, #-256] +# CHECK: ldpsw x20, x30, [sp, #252] +0xe9 0xab 0x40 0x69 +0x49 0x28 0x60 0x69 +0xf4 0xfb 0x5f 0x69 + +# CHECK: ldp x21, x29, [x2, #504] +# CHECK: ldp x22, x23, [x3, #-512] +# CHECK: ldp x24, x25, [x4, #8] +0x55 0xf4 0x5f 0xa9 +0x76 0x5c 0x60 0xa9 +0x98 0xe4 0x40 0xa9 + +# CHECK: ldp s29, s28, [sp, #252] +# CHECK: stp s27, s26, [sp, #-256] +# CHECK: ldp s1, s2, [x3, #44] +0xfd 0xf3 0x5f 0x2d +0xfb 0x6b 0x20 0x2d +0x61 0x88 0x45 0x2d + +# CHECK: stp d3, d5, [x9, #504] +# CHECK: stp d7, d11, [x10, #-512] +# CHECK: ldp d2, d3, [x30, #-8] +0x23 0x95 0x1f 0x6d +0x47 0x2d 0x20 0x6d +0xc2 0x8f 0x7f 0x6d + +# CHECK: stp q3, q5, [sp] +# CHECK: stp q17, q19, [sp, #1008] +# CHECK: ldp q23, q29, [x1, #-1024] +0xe3 0x17 0x0 0xad +0xf1 0xcf 0x1f 0xad +0x37 0x74 0x60 0xad + +#------------------------------------------------------------------------------ +# Load/store register pair (post-indexed) +#------------------------------------------------------------------------------ + +# CHECK: ldp w3, w5, [sp], #0 +# CHECK: stp wzr, w9, [sp], #252 +# CHECK: ldp w2, wzr, [sp], #-256 +# CHECK: ldp w9, w10, [sp], #4 +0xe3 0x17 0xc0 0x28 +0xff 0xa7 0x9f 0x28 +0xe2 0x7f 0xe0 0x28 +0xe9 0xab 0xc0 0x28 + +# CHECK: ldpsw x9, x10, [sp], #4 +# CHECK: ldpsw x9, x10, [x2], #-256 +# CHECK: ldpsw x20, x30, [sp], #252 +0xe9 0xab 0xc0 0x68 +0x49 0x28 0xe0 0x68 +0xf4 0xfb 0xdf 0x68 + +# CHECK: ldp x21, x29, [x2], #504 +# CHECK: ldp x22, x23, [x3], #-512 +# CHECK: ldp x24, x25, [x4], #8 +0x55 0xf4 0xdf 0xa8 +0x76 0x5c 0xe0 0xa8 +0x98 0xe4 0xc0 0xa8 + +# CHECK: ldp s29, s28, [sp], #252 +# CHECK: stp s27, s26, [sp], #-256 +# CHECK: ldp s1, s2, [x3], #44 +0xfd 0xf3 0xdf 0x2c +0xfb 0x6b 0xa0 0x2c +0x61 0x88 0xc5 0x2c + +# CHECK: stp d3, d5, [x9], #504 +# CHECK: stp d7, d11, [x10], #-512 +# CHECK: ldp d2, d3, [x30], #-8 +0x23 0x95 0x9f 0x6c +0x47 0x2d 0xa0 0x6c +0xc2 0x8f 0xff 0x6c + +# CHECK: stp q3, q5, [sp], #0 +# CHECK: stp q17, q19, [sp], #1008 +# CHECK: ldp q23, q29, [x1], #-1024 +0xe3 0x17 0x80 0xac +0xf1 0xcf 0x9f 0xac +0x37 0x74 0xe0 0xac + +#------------------------------------------------------------------------------ +# Load/store register pair (pre-indexed) +#------------------------------------------------------------------------------ + +# CHECK: ldp w3, w5, [sp, #0]! +# CHECK: stp wzr, w9, [sp, #252]! +# CHECK: ldp w2, wzr, [sp, #-256]! +# CHECK: ldp w9, w10, [sp, #4]! +0xe3 0x17 0xc0 0x29 +0xff 0xa7 0x9f 0x29 +0xe2 0x7f 0xe0 0x29 +0xe9 0xab 0xc0 0x29 + +# CHECK: ldpsw x9, x10, [sp, #4]! +# CHECK: ldpsw x9, x10, [x2, #-256]! +# CHECK: ldpsw x20, x30, [sp, #252]! 
+0xe9 0xab 0xc0 0x69 +0x49 0x28 0xe0 0x69 +0xf4 0xfb 0xdf 0x69 + +# CHECK: ldp x21, x29, [x2, #504]! +# CHECK: ldp x22, x23, [x3, #-512]! +# CHECK: ldp x24, x25, [x4, #8]! +0x55 0xf4 0xdf 0xa9 +0x76 0x5c 0xe0 0xa9 +0x98 0xe4 0xc0 0xa9 + +# CHECK: ldp s29, s28, [sp, #252]! +# CHECK: stp s27, s26, [sp, #-256]! +# CHECK: ldp s1, s2, [x3, #44]! +0xfd 0xf3 0xdf 0x2d +0xfb 0x6b 0xa0 0x2d +0x61 0x88 0xc5 0x2d + +# CHECK: stp d3, d5, [x9, #504]! +# CHECK: stp d7, d11, [x10, #-512]! +# CHECK: ldp d2, d3, [x30, #-8]! +0x23 0x95 0x9f 0x6d +0x47 0x2d 0xa0 0x6d +0xc2 0x8f 0xff 0x6d + +# CHECK: stp q3, q5, [sp, #0]! +# CHECK: stp q17, q19, [sp, #1008]! +# CHECK: ldp q23, q29, [x1, #-1024]! +0xe3 0x17 0x80 0xad +0xf1 0xcf 0x9f 0xad +0x37 0x74 0xe0 0xad + +#------------------------------------------------------------------------------ +# Load/store non-temporal register pair (offset) +#------------------------------------------------------------------------------ + +# CHECK: ldnp w3, w5, [sp] +# CHECK: stnp wzr, w9, [sp, #252] +# CHECK: ldnp w2, wzr, [sp, #-256] +# CHECK: ldnp w9, w10, [sp, #4] +0xe3 0x17 0x40 0x28 +0xff 0xa7 0x1f 0x28 +0xe2 0x7f 0x60 0x28 +0xe9 0xab 0x40 0x28 + +# CHECK: ldnp x21, x29, [x2, #504] +# CHECK: ldnp x22, x23, [x3, #-512] +# CHECK: ldnp x24, x25, [x4, #8] +0x55 0xf4 0x5f 0xa8 +0x76 0x5c 0x60 0xa8 +0x98 0xe4 0x40 0xa8 + +# CHECK: ldnp s29, s28, [sp, #252] +# CHECK: stnp s27, s26, [sp, #-256] +# CHECK: ldnp s1, s2, [x3, #44] +0xfd 0xf3 0x5f 0x2c +0xfb 0x6b 0x20 0x2c +0x61 0x88 0x45 0x2c + +# CHECK: stnp d3, d5, [x9, #504] +# CHECK: stnp d7, d11, [x10, #-512] +# CHECK: ldnp d2, d3, [x30, #-8] +0x23 0x95 0x1f 0x6c +0x47 0x2d 0x20 0x6c +0xc2 0x8f 0x7f 0x6c + +# CHECK: stnp q3, q5, [sp] +# CHECK: stnp q17, q19, [sp, #1008] +# CHECK: ldnp q23, q29, [x1, #-1024] +0xe3 0x17 0x0 0xac +0xf1 0xcf 0x1f 0xac +0x37 0x74 0x60 0xac + +#------------------------------------------------------------------------------ +# Logical (immediate) +#------------------------------------------------------------------------------ +# CHECK: orr w3, w9, #0xffff0000 +# CHECK: orr wsp, w10, #0xe00000ff +# CHECK: orr w9, w10, #0x3ff +0x23 0x3d 0x10 0x32 +0x5f 0x29 0x3 0x32 +0x49 0x25 0x0 0x32 + +# CHECK: and w14, w15, #0x80008000 +# CHECK: and w12, w13, #0xffc3ffc3 +# CHECK: and w11, wzr, #0x30003 +0xee 0x81 0x1 0x12 +0xac 0xad 0xa 0x12 +0xeb 0x87 0x0 0x12 + +# CHECK: eor w3, w6, #0xe0e0e0e0 +# CHECK: eor wsp, wzr, #0x3030303 +# CHECK: eor w16, w17, #0x81818181 +0xc3 0xc8 0x3 0x52 +0xff 0xc7 0x0 0x52 +0x30 0xc6 0x1 0x52 + +# CHECK: ands wzr, w18, #0xcccccccc +# CHECK: ands w19, w20, #0x33333333 +# CHECK: ands w21, w22, #0x99999999 +0x5f 0xe6 0x2 0x72 +0x93 0xe6 0x0 0x72 +0xd5 0xe6 0x1 0x72 + +# CHECK: ands wzr, w3, #0xaaaaaaaa +# CHECK: ands wzr, wzr, #0x55555555 +0x7f 0xf0 0x1 0x72 +0xff 0xf3 0x0 0x72 + +# CHECK: eor x3, x5, #0xffffffffc000000 +# CHECK: and x9, x10, #0x7fffffffffff +# CHECK: orr x11, x12, #0x8000000000000fff +0xa3 0x84 0x66 0xd2 +0x49 0xb9 0x40 0x92 +0x8b 0x31 0x41 0xb2 + +# CHECK: orr x3, x9, #0xffff0000ffff0000 +# CHECK: orr sp, x10, #0xe00000ffe00000ff +# CHECK: orr x9, x10, #0x3ff000003ff +0x23 0x3d 0x10 0xb2 +0x5f 0x29 0x3 0xb2 +0x49 0x25 0x0 0xb2 + +# CHECK: and x14, x15, #0x8000800080008000 +# CHECK: and x12, x13, #0xffc3ffc3ffc3ffc3 +# CHECK: and x11, xzr, #0x3000300030003 +0xee 0x81 0x1 0x92 +0xac 0xad 0xa 0x92 +0xeb 0x87 0x0 0x92 + +# CHECK: eor x3, x6, #0xe0e0e0e0e0e0e0e0 +# CHECK: eor sp, xzr, #0x303030303030303 +# CHECK: eor x16, x17, #0x8181818181818181 +0xc3 0xc8 0x3 0xd2 +0xff 0xc7 0x0 0xd2 +0x30 0xc6 0x1 0xd2 + +# CHECK: ands xzr, x18, #0xcccccccccccccccc +# CHECK: ands x19, x20, #0x3333333333333333 +# CHECK: ands x21, x22, #0x9999999999999999 +0x5f 0xe6 0x2 0xf2 +0x93 0xe6 0x0 0xf2 +0xd5 0xe6 0x1 0xf2 + +# CHECK: ands xzr, x3, #0xaaaaaaaaaaaaaaaa +# CHECK: ands xzr, xzr, #0x5555555555555555 +0x7f 0xf0 0x1 0xf2 +0xff 0xf3 0x0 0xf2 + +# CHECK: orr w3, wzr, #0xf000f +# CHECK: orr x10, xzr, #0xaaaaaaaaaaaaaaaa +0xe3 0x8f 0x0 0x32 +0xea 0xf3 0x1 0xb2 + +# CHECK: orr w3, wzr, #0xffff +# CHECK: orr x9, xzr, #0xffff00000000 +0xe3 0x3f 0x0 0x32 +0xe9 0x3f 0x60 0xb2 +
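The bitmask operands checked in the logical (immediate) section above (#0xffff0000, #0xffffffffc000000 and so on) are never stored literally: AArch64 packs them into the N:immr:imms fields, and the disassembler expands them with the DecodeBitMasks procedure from the ARM ARM. As a reading aid only (not part of the patch), here is a minimal Python sketch of that expansion; decode_bitmask is a hypothetical helper name, and the field values in the asserts were extracted by hand from the encodings above.

def decode_bitmask(n, immr, imms, regsize=64):
    # Simplified DecodeBitMasks: immediate case only, no validity checks.
    combined = (n << 6) | (~imms & 0x3f)
    esize = 1 << (combined.bit_length() - 1)  # element size: 2..64 bits
    s = imms & (esize - 1)                    # (number of set bits) - 1
    r = immr & (esize - 1)                    # rotate-right amount
    welem = (1 << (s + 1)) - 1                # contiguous run of s+1 ones
    mask = (1 << esize) - 1
    rotated = ((welem >> r) | (welem << (esize - r))) & mask
    value = 0
    for i in range(regsize // esize):         # replicate across the register
        value |= rotated << (i * esize)
    return value

# "orr w3, w9, #0xffff0000" encodes N=0, immr=16, imms=15;
# "eor x3, x5, #0xffffffffc000000" encodes N=1, immr=38, imms=33.
assert decode_bitmask(0, 16, 15, regsize=32) == 0xffff0000
assert decode_bitmask(1, 38, 33) == 0x0ffffffffc000000

The replication step is why only rotated, repeated runs of ones are representable here; an arbitrary constant such as #0x12345678 has no N:immr:imms encoding at all.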
+#------------------------------------------------------------------------------ +# Logical (shifted register) +#------------------------------------------------------------------------------ + +# CHECK: and w12, w23, w21 +# CHECK: and w16, w15, w1, lsl #1 +# CHECK: and w9, w4, w10, lsl #31 +# CHECK: and w3, w30, w11 +# CHECK: and x3, x5, x7, lsl #63 +0xec 0x2 0x15 0xa +0xf0 0x5 0x1 0xa +0x89 0x7c 0xa 0xa +0xc3 0x3 0xb 0xa +0xa3 0xfc 0x7 0x8a + +# CHECK: and x5, x14, x19, asr #4 +# CHECK: and w3, w17, w19, ror #31 +# CHECK: and w0, w2, wzr, lsr #17 +# CHECK: and w3, w30, w11, asr +0xc5 0x11 0x93 0x8a +0x23 0x7e 0xd3 0xa +0x40 0x44 0x5f 0xa +0xc3 0x3 0x8b 0xa + +# CHECK: and xzr, x4, x26 +# CHECK: and w3, wzr, w20, ror +# CHECK: and x7, x20, xzr, asr #63 +0x9f 0x0 0x1a 0x8a +0xe3 0x3 0xd4 0xa +0x87 0xfe 0x9f 0x8a + +# CHECK: bic x13, x20, x14, lsl #47 +# CHECK: bic w2, w7, w9 +# CHECK: orr w2, w7, w0, asr #31 +# CHECK: orr x8, x9, x10, lsl #12 +# CHECK: orn x3, x5, x7, asr +# CHECK: orn w2, w5, w29 +0x8d 0xbe 0x2e 0x8a +0xe2 0x0 0x29 0xa +0xe2 0x7c 0x80 0x2a +0x28 0x31 0xa 0xaa +0xa3 0x0 0xa7 0xaa +0xa2 0x0 0x3d 0x2a + +# CHECK: ands w7, wzr, w9, lsl #1 +# CHECK: ands x3, x5, x20, ror #63 +# CHECK: bics w3, w5, w7 +# CHECK: bics x3, xzr, x3, lsl #1 +# CHECK: tst w3, w7, lsl #31 +# CHECK: tst x2, x20, asr +0xe7 0x7 0x9 0x6a +0xa3 0xfc 0xd4 0xea +0xa3 0x0 0x27 0x6a +0xe3 0x7 0x23 0xea +0x7f 0x7c 0x7 0x6a +0x5f 0x0 0x94 0xea + +# CHECK: mov x3, x6 +# CHECK: mov x3, xzr +# CHECK: mov wzr, w2 +# CHECK: mov w3, w5 +0xe3 0x3 0x6 0xaa +0xe3 0x3 0x1f 0xaa +0xff 0x3 0x2 0x2a +0xe3 0x3 0x5 0x2a + +#------------------------------------------------------------------------------ +# Move wide (immediate) +#------------------------------------------------------------------------------ + +# N.b. (FIXME) canonical aliases aren't produced here because of a +# limitation in InstAlias. Lots of the "mov[nz]" instructions should +# be "mov". + +# CHECK: movz w1, #65535 +# CHECK: movz w2, #0, lsl #16 +# CHECK: movn w2, #1234 +0xe1 0xff 0x9f 0x52 +0x2 0x0 0xa0 0x52 +0x42 0x9a 0x80 0x12 + +# CHECK: movz x2, #1234, lsl #32 +# CHECK: movk xzr, #4321, lsl #48 +0x42 0x9a 0xc0 0xd2 +0x3f 0x1c 0xe2 0xf2 + +# CHECK: movz x2, #0 +# CHECK: movk w3, #0 +# CHECK: movz x4, #0, lsl #16 +# CHECK: movk w5, #0, lsl #16 +# CHECK: movz x6, #0, lsl #32 +# CHECK: movk x7, #0, lsl #32 +# CHECK: movz x8, #0, lsl #48 +# CHECK: movk x9, #0, lsl #48 +0x2 0x0 0x80 0xd2 +0x3 0x0 0x80 0x72 +0x4 0x0 0xa0 0xd2 +0x5 0x0 0xa0 0x72 +0x6 0x0 0xc0 0xd2 +0x7 0x0 0xc0 0xf2 +0x8 0x0 0xe0 0xd2 +0x9 0x0 0xe0 0xf2 + +#------------------------------------------------------------------------------ +# PC-relative addressing +#------------------------------------------------------------------------------ + +# It's slightly dodgy using immediates here, but harmless enough when +# it's all that's available. 
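As a concrete illustration of the arithmetic (again, not part of the patch), the ADR/ADRP offsets below can be checked by hand: op sits in bit 31, immlo in bits 30-29, immhi in bits 23-5, and Rd in bits 4-0, with ADRP scaling the result by the 4KiB page size. The sketch below assumes a hypothetical decode_adr helper and ignores sign extension, which none of the offsets tested here need.

def decode_adr(word):
    # Decode an ADR/ADRP instruction word (negative offsets not handled).
    rd = word & 0x1f
    imm = (((word >> 5) & 0x7ffff) << 2) | ((word >> 29) & 0x3)
    if word >> 31:                            # ADRP: offset is in 4KiB pages
        return "adrp x%d, #%d" % (rd, imm << 12)
    return "adr x%d, #%d" % (rd, imm)

# Little-endian bytes "0x02 0x32 0x00 0x10" form the word 0x10003202:
assert decode_adr(0x10003202) == "adr x2, #1600"
assert decode_adr(0x90003215) == "adrp x21, #6553600"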
+ +# CHECK: adr x2, #1600 +# CHECK: adrp x21, #6553600 +# CHECK: adr x0, #262144 +0x02 0x32 0x00 0x10 +0x15 0x32 0x00 0x90 +0x00 0x00 0x20 0x10 + +#------------------------------------------------------------------------------ +# System +#------------------------------------------------------------------------------ + +# CHECK: nop +# CHECK: hint #127 +# CHECK: nop +# CHECK: yield +# CHECK: wfe +# CHECK: wfi +# CHECK: sev +# CHECK: sevl +0x1f 0x20 0x3 0xd5 +0xff 0x2f 0x3 0xd5 +0x1f 0x20 0x3 0xd5 +0x3f 0x20 0x3 0xd5 +0x5f 0x20 0x3 0xd5 +0x7f 0x20 0x3 0xd5 +0x9f 0x20 0x3 0xd5 +0xbf 0x20 0x3 0xd5 + +# CHECK: clrex +# CHECK: clrex #0 +# CHECK: clrex #7 +# CHECK: clrex +0x5f 0x3f 0x3 0xd5 +0x5f 0x30 0x3 0xd5 +0x5f 0x37 0x3 0xd5 +0x5f 0x3f 0x3 0xd5 + +# CHECK: dsb #0 +# CHECK: dsb #12 +# CHECK: dsb sy +# CHECK: dsb oshld +# CHECK: dsb oshst +# CHECK: dsb osh +# CHECK: dsb nshld +# CHECK: dsb nshst +# CHECK: dsb nsh +# CHECK: dsb ishld +# CHECK: dsb ishst +# CHECK: dsb ish +# CHECK: dsb ld +# CHECK: dsb st +# CHECK: dsb sy +0x9f 0x30 0x3 0xd5 +0x9f 0x3c 0x3 0xd5 +0x9f 0x3f 0x3 0xd5 +0x9f 0x31 0x3 0xd5 +0x9f 0x32 0x3 0xd5 +0x9f 0x33 0x3 0xd5 +0x9f 0x35 0x3 0xd5 +0x9f 0x36 0x3 0xd5 +0x9f 0x37 0x3 0xd5 +0x9f 0x39 0x3 0xd5 +0x9f 0x3a 0x3 0xd5 +0x9f 0x3b 0x3 0xd5 +0x9f 0x3d 0x3 0xd5 +0x9f 0x3e 0x3 0xd5 +0x9f 0x3f 0x3 0xd5 + +# CHECK: dmb #0 +# CHECK: dmb #12 +# CHECK: dmb sy +# CHECK: dmb oshld +# CHECK: dmb oshst +# CHECK: dmb osh +# CHECK: dmb nshld +# CHECK: dmb nshst +# CHECK: dmb nsh +# CHECK: dmb ishld +# CHECK: dmb ishst +# CHECK: dmb ish +# CHECK: dmb ld +# CHECK: dmb st +# CHECK: dmb sy +0xbf 0x30 0x3 0xd5 +0xbf 0x3c 0x3 0xd5 +0xbf 0x3f 0x3 0xd5 +0xbf 0x31 0x3 0xd5 +0xbf 0x32 0x3 0xd5 +0xbf 0x33 0x3 0xd5 +0xbf 0x35 0x3 0xd5 +0xbf 0x36 0x3 0xd5 +0xbf 0x37 0x3 0xd5 +0xbf 0x39 0x3 0xd5 +0xbf 0x3a 0x3 0xd5 +0xbf 0x3b 0x3 0xd5 +0xbf 0x3d 0x3 0xd5 +0xbf 0x3e 0x3 0xd5 +0xbf 0x3f 0x3 0xd5 + +# CHECK: isb +# CHECK: isb #12 +0xdf 0x3f 0x3 0xd5 +0xdf 0x3c 0x3 0xd5 + +# CHECK: msr spsel, #0 +# CHECK: msr daifset, #15 +# CHECK: msr daifclr, #12 +0xbf 0x40 0x0 0xd5 +0xdf 0x4f 0x3 0xd5 +0xff 0x4c 0x3 0xd5 + +# CHECK: sys #7, c5, c9, #7, x5 +# CHECK: sys #0, c15, c15, #2 +# CHECK: sysl x9, #7, c5, c9, #7 +# CHECK: sysl x1, #0, c15, c15, #2 +0xe5 0x59 0xf 0xd5 +0x5f 0xff 0x8 0xd5 +0xe9 0x59 0x2f 0xd5 +0x41 0xff 0x28 0xd5 + +# CHECK: sys #0, c7, c1, #0, xzr +# CHECK: sys #0, c7, c5, #0, xzr +# CHECK: sys #3, c7, c5, #1, x9 +0x1f 0x71 0x8 0xd5 +0x1f 0x75 0x8 0xd5 +0x29 0x75 0xb 0xd5 + +# CHECK: sys #3, c7, c4, #1, x12 +# CHECK: sys #0, c7, c6, #1, xzr +# CHECK: sys #0, c7, c6, #2, x2 +# CHECK: sys #3, c7, c10, #1, x9 +# CHECK: sys #0, c7, c10, #2, x10 +# CHECK: sys #3, c7, c11, #1, x0 +# CHECK: sys #3, c7, c14, #1, x3 +# CHECK: sys #0, c7, c14, #2, x30 +0x2c 0x74 0xb 0xd5 +0x3f 0x76 0x8 0xd5 +0x42 0x76 0x8 0xd5 +0x29 0x7a 0xb 0xd5 +0x4a 0x7a 0x8 0xd5 +0x20 0x7b 0xb 0xd5 +0x23 0x7e 0xb 0xd5 +0x5e 0x7e 0x8 0xd5 + + +# CHECK: msr teecr32_el1, x12 +# CHECK: msr osdtrrx_el1, x12 +# CHECK: msr mdccint_el1, x12 +# CHECK: msr mdscr_el1, x12 +# CHECK: msr osdtrtx_el1, x12 +# CHECK: msr dbgdtr_el0, x12 +# CHECK: msr dbgdtrtx_el0, x12 +# CHECK: msr oseccr_el1, x12 +# CHECK: msr dbgvcr32_el2, x12 +# CHECK: msr dbgbvr0_el1, x12 +# CHECK: msr dbgbvr1_el1, x12 +# CHECK: msr dbgbvr2_el1, x12 +# CHECK: msr dbgbvr3_el1, x12 +# CHECK: msr dbgbvr4_el1, x12 +# CHECK: msr dbgbvr5_el1, x12 +# CHECK: msr dbgbvr6_el1, x12 +# CHECK: msr dbgbvr7_el1, x12 +# CHECK: msr dbgbvr8_el1, x12 +# CHECK: msr dbgbvr9_el1, x12 +# CHECK: msr 
dbgbvr10_el1, x12 +# CHECK: msr dbgbvr11_el1, x12 +# CHECK: msr dbgbvr12_el1, x12 +# CHECK: msr dbgbvr13_el1, x12 +# CHECK: msr dbgbvr14_el1, x12 +# CHECK: msr dbgbvr15_el1, x12 +# CHECK: msr dbgbcr0_el1, x12 +# CHECK: msr dbgbcr1_el1, x12 +# CHECK: msr dbgbcr2_el1, x12 +# CHECK: msr dbgbcr3_el1, x12 +# CHECK: msr dbgbcr4_el1, x12 +# CHECK: msr dbgbcr5_el1, x12 +# CHECK: msr dbgbcr6_el1, x12 +# CHECK: msr dbgbcr7_el1, x12 +# CHECK: msr dbgbcr8_el1, x12 +# CHECK: msr dbgbcr9_el1, x12 +# CHECK: msr dbgbcr10_el1, x12 +# CHECK: msr dbgbcr11_el1, x12 +# CHECK: msr dbgbcr12_el1, x12 +# CHECK: msr dbgbcr13_el1, x12 +# CHECK: msr dbgbcr14_el1, x12 +# CHECK: msr dbgbcr15_el1, x12 +# CHECK: msr dbgwvr0_el1, x12 +# CHECK: msr dbgwvr1_el1, x12 +# CHECK: msr dbgwvr2_el1, x12 +# CHECK: msr dbgwvr3_el1, x12 +# CHECK: msr dbgwvr4_el1, x12 +# CHECK: msr dbgwvr5_el1, x12 +# CHECK: msr dbgwvr6_el1, x12 +# CHECK: msr dbgwvr7_el1, x12 +# CHECK: msr dbgwvr8_el1, x12 +# CHECK: msr dbgwvr9_el1, x12 +# CHECK: msr dbgwvr10_el1, x12 +# CHECK: msr dbgwvr11_el1, x12 +# CHECK: msr dbgwvr12_el1, x12 +# CHECK: msr dbgwvr13_el1, x12 +# CHECK: msr dbgwvr14_el1, x12 +# CHECK: msr dbgwvr15_el1, x12 +# CHECK: msr dbgwcr0_el1, x12 +# CHECK: msr dbgwcr1_el1, x12 +# CHECK: msr dbgwcr2_el1, x12 +# CHECK: msr dbgwcr3_el1, x12 +# CHECK: msr dbgwcr4_el1, x12 +# CHECK: msr dbgwcr5_el1, x12 +# CHECK: msr dbgwcr6_el1, x12 +# CHECK: msr dbgwcr7_el1, x12 +# CHECK: msr dbgwcr8_el1, x12 +# CHECK: msr dbgwcr9_el1, x12 +# CHECK: msr dbgwcr10_el1, x12 +# CHECK: msr dbgwcr11_el1, x12 +# CHECK: msr dbgwcr12_el1, x12 +# CHECK: msr dbgwcr13_el1, x12 +# CHECK: msr dbgwcr14_el1, x12 +# CHECK: msr dbgwcr15_el1, x12 +# CHECK: msr teehbr32_el1, x12 +# CHECK: msr oslar_el1, x12 +# CHECK: msr osdlr_el1, x12 +# CHECK: msr dbgprcr_el1, x12 +# CHECK: msr dbgclaimset_el1, x12 +# CHECK: msr dbgclaimclr_el1, x12 +# CHECK: msr csselr_el1, x12 +# CHECK: msr vpidr_el2, x12 +# CHECK: msr vmpidr_el2, x12 +# CHECK: msr sctlr_el1, x12 +# CHECK: msr sctlr_el2, x12 +# CHECK: msr sctlr_el3, x12 +# CHECK: msr actlr_el1, x12 +# CHECK: msr actlr_el2, x12 +# CHECK: msr actlr_el3, x12 +# CHECK: msr cpacr_el1, x12 +# CHECK: msr hcr_el2, x12 +# CHECK: msr scr_el3, x12 +# CHECK: msr mdcr_el2, x12 +# CHECK: msr sder32_el3, x12 +# CHECK: msr cptr_el2, x12 +# CHECK: msr cptr_el3, x12 +# CHECK: msr hstr_el2, x12 +# CHECK: msr hacr_el2, x12 +# CHECK: msr mdcr_el3, x12 +# CHECK: msr ttbr0_el1, x12 +# CHECK: msr ttbr0_el2, x12 +# CHECK: msr ttbr0_el3, x12 +# CHECK: msr ttbr1_el1, x12 +# CHECK: msr tcr_el1, x12 +# CHECK: msr tcr_el2, x12 +# CHECK: msr tcr_el3, x12 +# CHECK: msr vttbr_el2, x12 +# CHECK: msr vtcr_el2, x12 +# CHECK: msr dacr32_el2, x12 +# CHECK: msr spsr_el1, x12 +# CHECK: msr spsr_el2, x12 +# CHECK: msr spsr_el3, x12 +# CHECK: msr elr_el1, x12 +# CHECK: msr elr_el2, x12 +# CHECK: msr elr_el3, x12 +# CHECK: msr sp_el0, x12 +# CHECK: msr sp_el1, x12 +# CHECK: msr sp_el2, x12 +# CHECK: msr spsel, x12 +# CHECK: msr nzcv, x12 +# CHECK: msr daif, x12 +# CHECK: msr currentel, x12 +# CHECK: msr spsr_irq, x12 +# CHECK: msr spsr_abt, x12 +# CHECK: msr spsr_und, x12 +# CHECK: msr spsr_fiq, x12 +# CHECK: msr fpcr, x12 +# CHECK: msr fpsr, x12 +# CHECK: msr dspsr_el0, x12 +# CHECK: msr dlr_el0, x12 +# CHECK: msr ifsr32_el2, x12 +# CHECK: msr afsr0_el1, x12 +# CHECK: msr afsr0_el2, x12 +# CHECK: msr afsr0_el3, x12 +# CHECK: msr afsr1_el1, x12 +# CHECK: msr afsr1_el2, x12 +# CHECK: msr afsr1_el3, x12 +# CHECK: msr esr_el1, x12 +# CHECK: msr esr_el2, x12 +# CHECK: msr esr_el3, x12 +# 
CHECK: msr fpexc32_el2, x12 +# CHECK: msr far_el1, x12 +# CHECK: msr far_el2, x12 +# CHECK: msr far_el3, x12 +# CHECK: msr hpfar_el2, x12 +# CHECK: msr par_el1, x12 +# CHECK: msr pmcr_el0, x12 +# CHECK: msr pmcntenset_el0, x12 +# CHECK: msr pmcntenclr_el0, x12 +# CHECK: msr pmovsclr_el0, x12 +# CHECK: msr pmselr_el0, x12 +# CHECK: msr pmccntr_el0, x12 +# CHECK: msr pmxevtyper_el0, x12 +# CHECK: msr pmxevcntr_el0, x12 +# CHECK: msr pmuserenr_el0, x12 +# CHECK: msr pmintenset_el1, x12 +# CHECK: msr pmintenclr_el1, x12 +# CHECK: msr pmovsset_el0, x12 +# CHECK: msr mair_el1, x12 +# CHECK: msr mair_el2, x12 +# CHECK: msr mair_el3, x12 +# CHECK: msr amair_el1, x12 +# CHECK: msr amair_el2, x12 +# CHECK: msr amair_el3, x12 +# CHECK: msr vbar_el1, x12 +# CHECK: msr vbar_el2, x12 +# CHECK: msr vbar_el3, x12 +# CHECK: msr rmr_el1, x12 +# CHECK: msr rmr_el2, x12 +# CHECK: msr rmr_el3, x12 +# CHECK: msr tpidr_el0, x12 +# CHECK: msr tpidr_el2, x12 +# CHECK: msr tpidr_el3, x12 +# CHECK: msr tpidrro_el0, x12 +# CHECK: msr tpidr_el1, x12 +# CHECK: msr cntfrq_el0, x12 +# CHECK: msr cntvoff_el2, x12 +# CHECK: msr cntkctl_el1, x12 +# CHECK: msr cnthctl_el2, x12 +# CHECK: msr cntp_tval_el0, x12 +# CHECK: msr cnthp_tval_el2, x12 +# CHECK: msr cntps_tval_el1, x12 +# CHECK: msr cntp_ctl_el0, x12 +# CHECK: msr cnthp_ctl_el2, x12 +# CHECK: msr cntps_ctl_el1, x12 +# CHECK: msr cntp_cval_el0, x12 +# CHECK: msr cnthp_cval_el2, x12 +# CHECK: msr cntps_cval_el1, x12 +# CHECK: msr cntv_tval_el0, x12 +# CHECK: msr cntv_ctl_el0, x12 +# CHECK: msr cntv_cval_el0, x12 +# CHECK: msr pmevcntr0_el0, x12 +# CHECK: msr pmevcntr1_el0, x12 +# CHECK: msr pmevcntr2_el0, x12 +# CHECK: msr pmevcntr3_el0, x12 +# CHECK: msr pmevcntr4_el0, x12 +# CHECK: msr pmevcntr5_el0, x12 +# CHECK: msr pmevcntr6_el0, x12 +# CHECK: msr pmevcntr7_el0, x12 +# CHECK: msr pmevcntr8_el0, x12 +# CHECK: msr pmevcntr9_el0, x12 +# CHECK: msr pmevcntr10_el0, x12 +# CHECK: msr pmevcntr11_el0, x12 +# CHECK: msr pmevcntr12_el0, x12 +# CHECK: msr pmevcntr13_el0, x12 +# CHECK: msr pmevcntr14_el0, x12 +# CHECK: msr pmevcntr15_el0, x12 +# CHECK: msr pmevcntr16_el0, x12 +# CHECK: msr pmevcntr17_el0, x12 +# CHECK: msr pmevcntr18_el0, x12 +# CHECK: msr pmevcntr19_el0, x12 +# CHECK: msr pmevcntr20_el0, x12 +# CHECK: msr pmevcntr21_el0, x12 +# CHECK: msr pmevcntr22_el0, x12 +# CHECK: msr pmevcntr23_el0, x12 +# CHECK: msr pmevcntr24_el0, x12 +# CHECK: msr pmevcntr25_el0, x12 +# CHECK: msr pmevcntr26_el0, x12 +# CHECK: msr pmevcntr27_el0, x12 +# CHECK: msr pmevcntr28_el0, x12 +# CHECK: msr pmevcntr29_el0, x12 +# CHECK: msr pmevcntr30_el0, x12 +# CHECK: msr pmccfiltr_el0, x12 +# CHECK: msr pmevtyper0_el0, x12 +# CHECK: msr pmevtyper1_el0, x12 +# CHECK: msr pmevtyper2_el0, x12 +# CHECK: msr pmevtyper3_el0, x12 +# CHECK: msr pmevtyper4_el0, x12 +# CHECK: msr pmevtyper5_el0, x12 +# CHECK: msr pmevtyper6_el0, x12 +# CHECK: msr pmevtyper7_el0, x12 +# CHECK: msr pmevtyper8_el0, x12 +# CHECK: msr pmevtyper9_el0, x12 +# CHECK: msr pmevtyper10_el0, x12 +# CHECK: msr pmevtyper11_el0, x12 +# CHECK: msr pmevtyper12_el0, x12 +# CHECK: msr pmevtyper13_el0, x12 +# CHECK: msr pmevtyper14_el0, x12 +# CHECK: msr pmevtyper15_el0, x12 +# CHECK: msr pmevtyper16_el0, x12 +# CHECK: msr pmevtyper17_el0, x12 +# CHECK: msr pmevtyper18_el0, x12 +# CHECK: msr pmevtyper19_el0, x12 +# CHECK: msr pmevtyper20_el0, x12 +# CHECK: msr pmevtyper21_el0, x12 +# CHECK: msr pmevtyper22_el0, x12 +# CHECK: msr pmevtyper23_el0, x12 +# CHECK: msr pmevtyper24_el0, x12 +# CHECK: msr pmevtyper25_el0, x12 +# CHECK: msr 
pmevtyper26_el0, x12 +# CHECK: msr pmevtyper27_el0, x12 +# CHECK: msr pmevtyper28_el0, x12 +# CHECK: msr pmevtyper29_el0, x12 +# CHECK: msr pmevtyper30_el0, x12 +# CHECK: mrs x9, teecr32_el1 +# CHECK: mrs x9, osdtrrx_el1 +# CHECK: mrs x9, mdccsr_el0 +# CHECK: mrs x9, mdccint_el1 +# CHECK: mrs x9, mdscr_el1 +# CHECK: mrs x9, osdtrtx_el1 +# CHECK: mrs x9, dbgdtr_el0 +# CHECK: mrs x9, dbgdtrrx_el0 +# CHECK: mrs x9, oseccr_el1 +# CHECK: mrs x9, dbgvcr32_el2 +# CHECK: mrs x9, dbgbvr0_el1 +# CHECK: mrs x9, dbgbvr1_el1 +# CHECK: mrs x9, dbgbvr2_el1 +# CHECK: mrs x9, dbgbvr3_el1 +# CHECK: mrs x9, dbgbvr4_el1 +# CHECK: mrs x9, dbgbvr5_el1 +# CHECK: mrs x9, dbgbvr6_el1 +# CHECK: mrs x9, dbgbvr7_el1 +# CHECK: mrs x9, dbgbvr8_el1 +# CHECK: mrs x9, dbgbvr9_el1 +# CHECK: mrs x9, dbgbvr10_el1 +# CHECK: mrs x9, dbgbvr11_el1 +# CHECK: mrs x9, dbgbvr12_el1 +# CHECK: mrs x9, dbgbvr13_el1 +# CHECK: mrs x9, dbgbvr14_el1 +# CHECK: mrs x9, dbgbvr15_el1 +# CHECK: mrs x9, dbgbcr0_el1 +# CHECK: mrs x9, dbgbcr1_el1 +# CHECK: mrs x9, dbgbcr2_el1 +# CHECK: mrs x9, dbgbcr3_el1 +# CHECK: mrs x9, dbgbcr4_el1 +# CHECK: mrs x9, dbgbcr5_el1 +# CHECK: mrs x9, dbgbcr6_el1 +# CHECK: mrs x9, dbgbcr7_el1 +# CHECK: mrs x9, dbgbcr8_el1 +# CHECK: mrs x9, dbgbcr9_el1 +# CHECK: mrs x9, dbgbcr10_el1 +# CHECK: mrs x9, dbgbcr11_el1 +# CHECK: mrs x9, dbgbcr12_el1 +# CHECK: mrs x9, dbgbcr13_el1 +# CHECK: mrs x9, dbgbcr14_el1 +# CHECK: mrs x9, dbgbcr15_el1 +# CHECK: mrs x9, dbgwvr0_el1 +# CHECK: mrs x9, dbgwvr1_el1 +# CHECK: mrs x9, dbgwvr2_el1 +# CHECK: mrs x9, dbgwvr3_el1 +# CHECK: mrs x9, dbgwvr4_el1 +# CHECK: mrs x9, dbgwvr5_el1 +# CHECK: mrs x9, dbgwvr6_el1 +# CHECK: mrs x9, dbgwvr7_el1 +# CHECK: mrs x9, dbgwvr8_el1 +# CHECK: mrs x9, dbgwvr9_el1 +# CHECK: mrs x9, dbgwvr10_el1 +# CHECK: mrs x9, dbgwvr11_el1 +# CHECK: mrs x9, dbgwvr12_el1 +# CHECK: mrs x9, dbgwvr13_el1 +# CHECK: mrs x9, dbgwvr14_el1 +# CHECK: mrs x9, dbgwvr15_el1 +# CHECK: mrs x9, dbgwcr0_el1 +# CHECK: mrs x9, dbgwcr1_el1 +# CHECK: mrs x9, dbgwcr2_el1 +# CHECK: mrs x9, dbgwcr3_el1 +# CHECK: mrs x9, dbgwcr4_el1 +# CHECK: mrs x9, dbgwcr5_el1 +# CHECK: mrs x9, dbgwcr6_el1 +# CHECK: mrs x9, dbgwcr7_el1 +# CHECK: mrs x9, dbgwcr8_el1 +# CHECK: mrs x9, dbgwcr9_el1 +# CHECK: mrs x9, dbgwcr10_el1 +# CHECK: mrs x9, dbgwcr11_el1 +# CHECK: mrs x9, dbgwcr12_el1 +# CHECK: mrs x9, dbgwcr13_el1 +# CHECK: mrs x9, dbgwcr14_el1 +# CHECK: mrs x9, dbgwcr15_el1 +# CHECK: mrs x9, mdrar_el1 +# CHECK: mrs x9, teehbr32_el1 +# CHECK: mrs x9, oslsr_el1 +# CHECK: mrs x9, osdlr_el1 +# CHECK: mrs x9, dbgprcr_el1 +# CHECK: mrs x9, dbgclaimset_el1 +# CHECK: mrs x9, dbgclaimclr_el1 +# CHECK: mrs x9, dbgauthstatus_el1 +# CHECK: mrs x9, midr_el1 +# CHECK: mrs x9, ccsidr_el1 +# CHECK: mrs x9, csselr_el1 +# CHECK: mrs x9, vpidr_el2 +# CHECK: mrs x9, clidr_el1 +# CHECK: mrs x9, ctr_el0 +# CHECK: mrs x9, mpidr_el1 +# CHECK: mrs x9, vmpidr_el2 +# CHECK: mrs x9, revidr_el1 +# CHECK: mrs x9, aidr_el1 +# CHECK: mrs x9, dczid_el0 +# CHECK: mrs x9, id_pfr0_el1 +# CHECK: mrs x9, id_pfr1_el1 +# CHECK: mrs x9, id_dfr0_el1 +# CHECK: mrs x9, id_afr0_el1 +# CHECK: mrs x9, id_mmfr0_el1 +# CHECK: mrs x9, id_mmfr1_el1 +# CHECK: mrs x9, id_mmfr2_el1 +# CHECK: mrs x9, id_mmfr3_el1 +# CHECK: mrs x9, id_isar0_el1 +# CHECK: mrs x9, id_isar1_el1 +# CHECK: mrs x9, id_isar2_el1 +# CHECK: mrs x9, id_isar3_el1 +# CHECK: mrs x9, id_isar4_el1 +# CHECK: mrs x9, id_isar5_el1 +# CHECK: mrs x9, mvfr0_el1 +# CHECK: mrs x9, mvfr1_el1 +# CHECK: mrs x9, mvfr2_el1 +# CHECK: mrs x9, id_aa64pfr0_el1 +# CHECK: mrs x9, id_aa64pfr1_el1 +# CHECK: mrs 
x9, id_aa64dfr0_el1 +# CHECK: mrs x9, id_aa64dfr1_el1 +# CHECK: mrs x9, id_aa64afr0_el1 +# CHECK: mrs x9, id_aa64afr1_el1 +# CHECK: mrs x9, id_aa64isar0_el1 +# CHECK: mrs x9, id_aa64isar1_el1 +# CHECK: mrs x9, id_aa64mmfr0_el1 +# CHECK: mrs x9, id_aa64mmfr1_el1 +# CHECK: mrs x9, sctlr_el1 +# CHECK: mrs x9, sctlr_el2 +# CHECK: mrs x9, sctlr_el3 +# CHECK: mrs x9, actlr_el1 +# CHECK: mrs x9, actlr_el2 +# CHECK: mrs x9, actlr_el3 +# CHECK: mrs x9, cpacr_el1 +# CHECK: mrs x9, hcr_el2 +# CHECK: mrs x9, scr_el3 +# CHECK: mrs x9, mdcr_el2 +# CHECK: mrs x9, sder32_el3 +# CHECK: mrs x9, cptr_el2 +# CHECK: mrs x9, cptr_el3 +# CHECK: mrs x9, hstr_el2 +# CHECK: mrs x9, hacr_el2 +# CHECK: mrs x9, mdcr_el3 +# CHECK: mrs x9, ttbr0_el1 +# CHECK: mrs x9, ttbr0_el2 +# CHECK: mrs x9, ttbr0_el3 +# CHECK: mrs x9, ttbr1_el1 +# CHECK: mrs x9, tcr_el1 +# CHECK: mrs x9, tcr_el2 +# CHECK: mrs x9, tcr_el3 +# CHECK: mrs x9, vttbr_el2 +# CHECK: mrs x9, vtcr_el2 +# CHECK: mrs x9, dacr32_el2 +# CHECK: mrs x9, spsr_el1 +# CHECK: mrs x9, spsr_el2 +# CHECK: mrs x9, spsr_el3 +# CHECK: mrs x9, elr_el1 +# CHECK: mrs x9, elr_el2 +# CHECK: mrs x9, elr_el3 +# CHECK: mrs x9, sp_el0 +# CHECK: mrs x9, sp_el1 +# CHECK: mrs x9, sp_el2 +# CHECK: mrs x9, spsel +# CHECK: mrs x9, nzcv +# CHECK: mrs x9, daif +# CHECK: mrs x9, currentel +# CHECK: mrs x9, spsr_irq +# CHECK: mrs x9, spsr_abt +# CHECK: mrs x9, spsr_und +# CHECK: mrs x9, spsr_fiq +# CHECK: mrs x9, fpcr +# CHECK: mrs x9, fpsr +# CHECK: mrs x9, dspsr_el0 +# CHECK: mrs x9, dlr_el0 +# CHECK: mrs x9, ifsr32_el2 +# CHECK: mrs x9, afsr0_el1 +# CHECK: mrs x9, afsr0_el2 +# CHECK: mrs x9, afsr0_el3 +# CHECK: mrs x9, afsr1_el1 +# CHECK: mrs x9, afsr1_el2 +# CHECK: mrs x9, afsr1_el3 +# CHECK: mrs x9, esr_el1 +# CHECK: mrs x9, esr_el2 +# CHECK: mrs x9, esr_el3 +# CHECK: mrs x9, fpexc32_el2 +# CHECK: mrs x9, far_el1 +# CHECK: mrs x9, far_el2 +# CHECK: mrs x9, far_el3 +# CHECK: mrs x9, hpfar_el2 +# CHECK: mrs x9, par_el1 +# CHECK: mrs x9, pmcr_el0 +# CHECK: mrs x9, pmcntenset_el0 +# CHECK: mrs x9, pmcntenclr_el0 +# CHECK: mrs x9, pmovsclr_el0 +# CHECK: mrs x9, pmselr_el0 +# CHECK: mrs x9, pmceid0_el0 +# CHECK: mrs x9, pmceid1_el0 +# CHECK: mrs x9, pmccntr_el0 +# CHECK: mrs x9, pmxevtyper_el0 +# CHECK: mrs x9, pmxevcntr_el0 +# CHECK: mrs x9, pmuserenr_el0 +# CHECK: mrs x9, pmintenset_el1 +# CHECK: mrs x9, pmintenclr_el1 +# CHECK: mrs x9, pmovsset_el0 +# CHECK: mrs x9, mair_el1 +# CHECK: mrs x9, mair_el2 +# CHECK: mrs x9, mair_el3 +# CHECK: mrs x9, amair_el1 +# CHECK: mrs x9, amair_el2 +# CHECK: mrs x9, amair_el3 +# CHECK: mrs x9, vbar_el1 +# CHECK: mrs x9, vbar_el2 +# CHECK: mrs x9, vbar_el3 +# CHECK: mrs x9, rvbar_el1 +# CHECK: mrs x9, rvbar_el2 +# CHECK: mrs x9, rvbar_el3 +# CHECK: mrs x9, rmr_el1 +# CHECK: mrs x9, rmr_el2 +# CHECK: mrs x9, rmr_el3 +# CHECK: mrs x9, isr_el1 +# CHECK: mrs x9, contextidr_el1 +# CHECK: mrs x9, tpidr_el0 +# CHECK: mrs x9, tpidr_el2 +# CHECK: mrs x9, tpidr_el3 +# CHECK: mrs x9, tpidrro_el0 +# CHECK: mrs x9, tpidr_el1 +# CHECK: mrs x9, cntfrq_el0 +# CHECK: mrs x9, cntpct_el0 +# CHECK: mrs x9, cntvct_el0 +# CHECK: mrs x9, cntvoff_el2 +# CHECK: mrs x9, cntkctl_el1 +# CHECK: mrs x9, cnthctl_el2 +# CHECK: mrs x9, cntp_tval_el0 +# CHECK: mrs x9, cnthp_tval_el2 +# CHECK: mrs x9, cntps_tval_el1 +# CHECK: mrs x9, cntp_ctl_el0 +# CHECK: mrs x9, cnthp_ctl_el2 +# CHECK: mrs x9, cntps_ctl_el1 +# CHECK: mrs x9, cntp_cval_el0 +# CHECK: mrs x9, cnthp_cval_el2 +# CHECK: mrs x9, cntps_cval_el1 +# CHECK: mrs x9, cntv_tval_el0 +# CHECK: mrs x9, cntv_ctl_el0 +# CHECK: mrs x9, 
cntv_cval_el0 +# CHECK: mrs x9, pmevcntr0_el0 +# CHECK: mrs x9, pmevcntr1_el0 +# CHECK: mrs x9, pmevcntr2_el0 +# CHECK: mrs x9, pmevcntr3_el0 +# CHECK: mrs x9, pmevcntr4_el0 +# CHECK: mrs x9, pmevcntr5_el0 +# CHECK: mrs x9, pmevcntr6_el0 +# CHECK: mrs x9, pmevcntr7_el0 +# CHECK: mrs x9, pmevcntr8_el0 +# CHECK: mrs x9, pmevcntr9_el0 +# CHECK: mrs x9, pmevcntr10_el0 +# CHECK: mrs x9, pmevcntr11_el0 +# CHECK: mrs x9, pmevcntr12_el0 +# CHECK: mrs x9, pmevcntr13_el0 +# CHECK: mrs x9, pmevcntr14_el0 +# CHECK: mrs x9, pmevcntr15_el0 +# CHECK: mrs x9, pmevcntr16_el0 +# CHECK: mrs x9, pmevcntr17_el0 +# CHECK: mrs x9, pmevcntr18_el0 +# CHECK: mrs x9, pmevcntr19_el0 +# CHECK: mrs x9, pmevcntr20_el0 +# CHECK: mrs x9, pmevcntr21_el0 +# CHECK: mrs x9, pmevcntr22_el0 +# CHECK: mrs x9, pmevcntr23_el0 +# CHECK: mrs x9, pmevcntr24_el0 +# CHECK: mrs x9, pmevcntr25_el0 +# CHECK: mrs x9, pmevcntr26_el0 +# CHECK: mrs x9, pmevcntr27_el0 +# CHECK: mrs x9, pmevcntr28_el0 +# CHECK: mrs x9, pmevcntr29_el0 +# CHECK: mrs x9, pmevcntr30_el0 +# CHECK: mrs x9, pmccfiltr_el0 +# CHECK: mrs x9, pmevtyper0_el0 +# CHECK: mrs x9, pmevtyper1_el0 +# CHECK: mrs x9, pmevtyper2_el0 +# CHECK: mrs x9, pmevtyper3_el0 +# CHECK: mrs x9, pmevtyper4_el0 +# CHECK: mrs x9, pmevtyper5_el0 +# CHECK: mrs x9, pmevtyper6_el0 +# CHECK: mrs x9, pmevtyper7_el0 +# CHECK: mrs x9, pmevtyper8_el0 +# CHECK: mrs x9, pmevtyper9_el0 +# CHECK: mrs x9, pmevtyper10_el0 +# CHECK: mrs x9, pmevtyper11_el0 +# CHECK: mrs x9, pmevtyper12_el0 +# CHECK: mrs x9, pmevtyper13_el0 +# CHECK: mrs x9, pmevtyper14_el0 +# CHECK: mrs x9, pmevtyper15_el0 +# CHECK: mrs x9, pmevtyper16_el0 +# CHECK: mrs x9, pmevtyper17_el0 +# CHECK: mrs x9, pmevtyper18_el0 +# CHECK: mrs x9, pmevtyper19_el0 +# CHECK: mrs x9, pmevtyper20_el0 +# CHECK: mrs x9, pmevtyper21_el0 +# CHECK: mrs x9, pmevtyper22_el0 +# CHECK: mrs x9, pmevtyper23_el0 +# CHECK: mrs x9, pmevtyper24_el0 +# CHECK: mrs x9, pmevtyper25_el0 +# CHECK: mrs x9, pmevtyper26_el0 +# CHECK: mrs x9, pmevtyper27_el0 +# CHECK: mrs x9, pmevtyper28_el0 +# CHECK: mrs x9, pmevtyper29_el0 +# CHECK: mrs x9, pmevtyper30_el0 + +0xc 0x0 0x12 0xd5 +0x4c 0x0 0x10 0xd5 +0xc 0x2 0x10 0xd5 +0x4c 0x2 0x10 0xd5 +0x4c 0x3 0x10 0xd5 +0xc 0x4 0x13 0xd5 +0xc 0x5 0x13 0xd5 +0x4c 0x6 0x10 0xd5 +0xc 0x7 0x14 0xd5 +0x8c 0x0 0x10 0xd5 +0x8c 0x1 0x10 0xd5 +0x8c 0x2 0x10 0xd5 +0x8c 0x3 0x10 0xd5 +0x8c 0x4 0x10 0xd5 +0x8c 0x5 0x10 0xd5 +0x8c 0x6 0x10 0xd5 +0x8c 0x7 0x10 0xd5 +0x8c 0x8 0x10 0xd5 +0x8c 0x9 0x10 0xd5 +0x8c 0xa 0x10 0xd5 +0x8c 0xb 0x10 0xd5 +0x8c 0xc 0x10 0xd5 +0x8c 0xd 0x10 0xd5 +0x8c 0xe 0x10 0xd5 +0x8c 0xf 0x10 0xd5 +0xac 0x0 0x10 0xd5 +0xac 0x1 0x10 0xd5 +0xac 0x2 0x10 0xd5 +0xac 0x3 0x10 0xd5 +0xac 0x4 0x10 0xd5 +0xac 0x5 0x10 0xd5 +0xac 0x6 0x10 0xd5 +0xac 0x7 0x10 0xd5 +0xac 0x8 0x10 0xd5 +0xac 0x9 0x10 0xd5 +0xac 0xa 0x10 0xd5 +0xac 0xb 0x10 0xd5 +0xac 0xc 0x10 0xd5 +0xac 0xd 0x10 0xd5 +0xac 0xe 0x10 0xd5 +0xac 0xf 0x10 0xd5 +0xcc 0x0 0x10 0xd5 +0xcc 0x1 0x10 0xd5 +0xcc 0x2 0x10 0xd5 +0xcc 0x3 0x10 0xd5 +0xcc 0x4 0x10 0xd5 +0xcc 0x5 0x10 0xd5 +0xcc 0x6 0x10 0xd5 +0xcc 0x7 0x10 0xd5 +0xcc 0x8 0x10 0xd5 +0xcc 0x9 0x10 0xd5 +0xcc 0xa 0x10 0xd5 +0xcc 0xb 0x10 0xd5 +0xcc 0xc 0x10 0xd5 +0xcc 0xd 0x10 0xd5 +0xcc 0xe 0x10 0xd5 +0xcc 0xf 0x10 0xd5 +0xec 0x0 0x10 0xd5 +0xec 0x1 0x10 0xd5 +0xec 0x2 0x10 0xd5 +0xec 0x3 0x10 0xd5 +0xec 0x4 0x10 0xd5 +0xec 0x5 0x10 0xd5 +0xec 0x6 0x10 0xd5 +0xec 0x7 0x10 0xd5 +0xec 0x8 0x10 0xd5 +0xec 0x9 0x10 0xd5 +0xec 0xa 0x10 0xd5 +0xec 0xb 0x10 0xd5 +0xec 0xc 0x10 0xd5 +0xec 0xd 0x10 0xd5 +0xec 0xe 0x10 0xd5 +0xec 0xf 0x10 
0xd5 +0xc 0x10 0x12 0xd5 +0x8c 0x10 0x10 0xd5 +0x8c 0x13 0x10 0xd5 +0x8c 0x14 0x10 0xd5 +0xcc 0x78 0x10 0xd5 +0xcc 0x79 0x10 0xd5 +0xc 0x0 0x1a 0xd5 +0xc 0x0 0x1c 0xd5 +0xac 0x0 0x1c 0xd5 +0xc 0x10 0x18 0xd5 +0xc 0x10 0x1c 0xd5 +0xc 0x10 0x1e 0xd5 +0x2c 0x10 0x18 0xd5 +0x2c 0x10 0x1c 0xd5 +0x2c 0x10 0x1e 0xd5 +0x4c 0x10 0x18 0xd5 +0xc 0x11 0x1c 0xd5 +0xc 0x11 0x1e 0xd5 +0x2c 0x11 0x1c 0xd5 +0x2c 0x11 0x1e 0xd5 +0x4c 0x11 0x1c 0xd5 +0x4c 0x11 0x1e 0xd5 +0x6c 0x11 0x1c 0xd5 +0xec 0x11 0x1c 0xd5 +0x2c 0x13 0x1e 0xd5 +0xc 0x20 0x18 0xd5 +0xc 0x20 0x1c 0xd5 +0xc 0x20 0x1e 0xd5 +0x2c 0x20 0x18 0xd5 +0x4c 0x20 0x18 0xd5 +0x4c 0x20 0x1c 0xd5 +0x4c 0x20 0x1e 0xd5 +0xc 0x21 0x1c 0xd5 +0x4c 0x21 0x1c 0xd5 +0xc 0x30 0x1c 0xd5 +0xc 0x40 0x18 0xd5 +0xc 0x40 0x1c 0xd5 +0xc 0x40 0x1e 0xd5 +0x2c 0x40 0x18 0xd5 +0x2c 0x40 0x1c 0xd5 +0x2c 0x40 0x1e 0xd5 +0xc 0x41 0x18 0xd5 +0xc 0x41 0x1c 0xd5 +0xc 0x41 0x1e 0xd5 +0xc 0x42 0x18 0xd5 +0xc 0x42 0x1b 0xd5 +0x2c 0x42 0x1b 0xd5 +0x4c 0x42 0x18 0xd5 +0xc 0x43 0x1c 0xd5 +0x2c 0x43 0x1c 0xd5 +0x4c 0x43 0x1c 0xd5 +0x6c 0x43 0x1c 0xd5 +0xc 0x44 0x1b 0xd5 +0x2c 0x44 0x1b 0xd5 +0xc 0x45 0x1b 0xd5 +0x2c 0x45 0x1b 0xd5 +0x2c 0x50 0x1c 0xd5 +0xc 0x51 0x18 0xd5 +0xc 0x51 0x1c 0xd5 +0xc 0x51 0x1e 0xd5 +0x2c 0x51 0x18 0xd5 +0x2c 0x51 0x1c 0xd5 +0x2c 0x51 0x1e 0xd5 +0xc 0x52 0x18 0xd5 +0xc 0x52 0x1c 0xd5 +0xc 0x52 0x1e 0xd5 +0xc 0x53 0x1c 0xd5 +0xc 0x60 0x18 0xd5 +0xc 0x60 0x1c 0xd5 +0xc 0x60 0x1e 0xd5 +0x8c 0x60 0x1c 0xd5 +0xc 0x74 0x18 0xd5 +0xc 0x9c 0x1b 0xd5 +0x2c 0x9c 0x1b 0xd5 +0x4c 0x9c 0x1b 0xd5 +0x6c 0x9c 0x1b 0xd5 +0xac 0x9c 0x1b 0xd5 +0xc 0x9d 0x1b 0xd5 +0x2c 0x9d 0x1b 0xd5 +0x4c 0x9d 0x1b 0xd5 +0xc 0x9e 0x1b 0xd5 +0x2c 0x9e 0x18 0xd5 +0x4c 0x9e 0x18 0xd5 +0x6c 0x9e 0x1b 0xd5 +0xc 0xa2 0x18 0xd5 +0xc 0xa2 0x1c 0xd5 +0xc 0xa2 0x1e 0xd5 +0xc 0xa3 0x18 0xd5 +0xc 0xa3 0x1c 0xd5 +0xc 0xa3 0x1e 0xd5 +0xc 0xc0 0x18 0xd5 +0xc 0xc0 0x1c 0xd5 +0xc 0xc0 0x1e 0xd5 +0x4c 0xc0 0x18 0xd5 +0x4c 0xc0 0x1c 0xd5 +0x4c 0xc0 0x1e 0xd5 +0x4c 0xd0 0x1b 0xd5 +0x4c 0xd0 0x1c 0xd5 +0x4c 0xd0 0x1e 0xd5 +0x6c 0xd0 0x1b 0xd5 +0x8c 0xd0 0x18 0xd5 +0xc 0xe0 0x1b 0xd5 +0x6c 0xe0 0x1c 0xd5 +0xc 0xe1 0x18 0xd5 +0xc 0xe1 0x1c 0xd5 +0xc 0xe2 0x1b 0xd5 +0xc 0xe2 0x1c 0xd5 +0xc 0xe2 0x1f 0xd5 +0x2c 0xe2 0x1b 0xd5 +0x2c 0xe2 0x1c 0xd5 +0x2c 0xe2 0x1f 0xd5 +0x4c 0xe2 0x1b 0xd5 +0x4c 0xe2 0x1c 0xd5 +0x4c 0xe2 0x1f 0xd5 +0xc 0xe3 0x1b 0xd5 +0x2c 0xe3 0x1b 0xd5 +0x4c 0xe3 0x1b 0xd5 +0xc 0xe8 0x1b 0xd5 +0x2c 0xe8 0x1b 0xd5 +0x4c 0xe8 0x1b 0xd5 +0x6c 0xe8 0x1b 0xd5 +0x8c 0xe8 0x1b 0xd5 +0xac 0xe8 0x1b 0xd5 +0xcc 0xe8 0x1b 0xd5 +0xec 0xe8 0x1b 0xd5 +0xc 0xe9 0x1b 0xd5 +0x2c 0xe9 0x1b 0xd5 +0x4c 0xe9 0x1b 0xd5 +0x6c 0xe9 0x1b 0xd5 +0x8c 0xe9 0x1b 0xd5 +0xac 0xe9 0x1b 0xd5 +0xcc 0xe9 0x1b 0xd5 +0xec 0xe9 0x1b 0xd5 +0xc 0xea 0x1b 0xd5 +0x2c 0xea 0x1b 0xd5 +0x4c 0xea 0x1b 0xd5 +0x6c 0xea 0x1b 0xd5 +0x8c 0xea 0x1b 0xd5 +0xac 0xea 0x1b 0xd5 +0xcc 0xea 0x1b 0xd5 +0xec 0xea 0x1b 0xd5 +0xc 0xeb 0x1b 0xd5 +0x2c 0xeb 0x1b 0xd5 +0x4c 0xeb 0x1b 0xd5 +0x6c 0xeb 0x1b 0xd5 +0x8c 0xeb 0x1b 0xd5 +0xac 0xeb 0x1b 0xd5 +0xcc 0xeb 0x1b 0xd5 +0xec 0xef 0x1b 0xd5 +0xc 0xec 0x1b 0xd5 +0x2c 0xec 0x1b 0xd5 +0x4c 0xec 0x1b 0xd5 +0x6c 0xec 0x1b 0xd5 +0x8c 0xec 0x1b 0xd5 +0xac 0xec 0x1b 0xd5 +0xcc 0xec 0x1b 0xd5 +0xec 0xec 0x1b 0xd5 +0xc 0xed 0x1b 0xd5 +0x2c 0xed 0x1b 0xd5 +0x4c 0xed 0x1b 0xd5 +0x6c 0xed 0x1b 0xd5 +0x8c 0xed 0x1b 0xd5 +0xac 0xed 0x1b 0xd5 +0xcc 0xed 0x1b 0xd5 +0xec 0xed 0x1b 0xd5 +0xc 0xee 0x1b 0xd5 +0x2c 0xee 0x1b 0xd5 +0x4c 0xee 0x1b 0xd5 +0x6c 0xee 0x1b 0xd5 +0x8c 0xee 0x1b 0xd5 +0xac 0xee 0x1b 0xd5 +0xcc 0xee 0x1b 0xd5 
+0xec 0xee 0x1b 0xd5 +0xc 0xef 0x1b 0xd5 +0x2c 0xef 0x1b 0xd5 +0x4c 0xef 0x1b 0xd5 +0x6c 0xef 0x1b 0xd5 +0x8c 0xef 0x1b 0xd5 +0xac 0xef 0x1b 0xd5 +0xcc 0xef 0x1b 0xd5 +0x9 0x0 0x32 0xd5 +0x49 0x0 0x30 0xd5 +0x9 0x1 0x33 0xd5 +0x9 0x2 0x30 0xd5 +0x49 0x2 0x30 0xd5 +0x49 0x3 0x30 0xd5 +0x9 0x4 0x33 0xd5 +0x9 0x5 0x33 0xd5 +0x49 0x6 0x30 0xd5 +0x9 0x7 0x34 0xd5 +0x89 0x0 0x30 0xd5 +0x89 0x1 0x30 0xd5 +0x89 0x2 0x30 0xd5 +0x89 0x3 0x30 0xd5 +0x89 0x4 0x30 0xd5 +0x89 0x5 0x30 0xd5 +0x89 0x6 0x30 0xd5 +0x89 0x7 0x30 0xd5 +0x89 0x8 0x30 0xd5 +0x89 0x9 0x30 0xd5 +0x89 0xa 0x30 0xd5 +0x89 0xb 0x30 0xd5 +0x89 0xc 0x30 0xd5 +0x89 0xd 0x30 0xd5 +0x89 0xe 0x30 0xd5 +0x89 0xf 0x30 0xd5 +0xa9 0x0 0x30 0xd5 +0xa9 0x1 0x30 0xd5 +0xa9 0x2 0x30 0xd5 +0xa9 0x3 0x30 0xd5 +0xa9 0x4 0x30 0xd5 +0xa9 0x5 0x30 0xd5 +0xa9 0x6 0x30 0xd5 +0xa9 0x7 0x30 0xd5 +0xa9 0x8 0x30 0xd5 +0xa9 0x9 0x30 0xd5 +0xa9 0xa 0x30 0xd5 +0xa9 0xb 0x30 0xd5 +0xa9 0xc 0x30 0xd5 +0xa9 0xd 0x30 0xd5 +0xa9 0xe 0x30 0xd5 +0xa9 0xf 0x30 0xd5 +0xc9 0x0 0x30 0xd5 +0xc9 0x1 0x30 0xd5 +0xc9 0x2 0x30 0xd5 +0xc9 0x3 0x30 0xd5 +0xc9 0x4 0x30 0xd5 +0xc9 0x5 0x30 0xd5 +0xc9 0x6 0x30 0xd5 +0xc9 0x7 0x30 0xd5 +0xc9 0x8 0x30 0xd5 +0xc9 0x9 0x30 0xd5 +0xc9 0xa 0x30 0xd5 +0xc9 0xb 0x30 0xd5 +0xc9 0xc 0x30 0xd5 +0xc9 0xd 0x30 0xd5 +0xc9 0xe 0x30 0xd5 +0xc9 0xf 0x30 0xd5 +0xe9 0x0 0x30 0xd5 +0xe9 0x1 0x30 0xd5 +0xe9 0x2 0x30 0xd5 +0xe9 0x3 0x30 0xd5 +0xe9 0x4 0x30 0xd5 +0xe9 0x5 0x30 0xd5 +0xe9 0x6 0x30 0xd5 +0xe9 0x7 0x30 0xd5 +0xe9 0x8 0x30 0xd5 +0xe9 0x9 0x30 0xd5 +0xe9 0xa 0x30 0xd5 +0xe9 0xb 0x30 0xd5 +0xe9 0xc 0x30 0xd5 +0xe9 0xd 0x30 0xd5 +0xe9 0xe 0x30 0xd5 +0xe9 0xf 0x30 0xd5 +0x9 0x10 0x30 0xd5 +0x9 0x10 0x32 0xd5 +0x89 0x11 0x30 0xd5 +0x89 0x13 0x30 0xd5 +0x89 0x14 0x30 0xd5 +0xc9 0x78 0x30 0xd5 +0xc9 0x79 0x30 0xd5 +0xc9 0x7e 0x30 0xd5 +0x9 0x0 0x38 0xd5 +0x9 0x0 0x39 0xd5 +0x9 0x0 0x3a 0xd5 +0x9 0x0 0x3c 0xd5 +0x29 0x0 0x39 0xd5 +0x29 0x0 0x3b 0xd5 +0xa9 0x0 0x38 0xd5 +0xa9 0x0 0x3c 0xd5 +0xc9 0x0 0x38 0xd5 +0xe9 0x0 0x39 0xd5 +0xe9 0x0 0x3b 0xd5 +0x9 0x1 0x38 0xd5 +0x29 0x1 0x38 0xd5 +0x49 0x1 0x38 0xd5 +0x69 0x1 0x38 0xd5 +0x89 0x1 0x38 0xd5 +0xa9 0x1 0x38 0xd5 +0xc9 0x1 0x38 0xd5 +0xe9 0x1 0x38 0xd5 +0x9 0x2 0x38 0xd5 +0x29 0x2 0x38 0xd5 +0x49 0x2 0x38 0xd5 +0x69 0x2 0x38 0xd5 +0x89 0x2 0x38 0xd5 +0xa9 0x2 0x38 0xd5 +0x9 0x3 0x38 0xd5 +0x29 0x3 0x38 0xd5 +0x49 0x3 0x38 0xd5 +0x9 0x4 0x38 0xd5 +0x29 0x4 0x38 0xd5 +0x9 0x5 0x38 0xd5 +0x29 0x5 0x38 0xd5 +0x89 0x5 0x38 0xd5 +0xa9 0x5 0x38 0xd5 +0x9 0x6 0x38 0xd5 +0x29 0x6 0x38 0xd5 +0x9 0x7 0x38 0xd5 +0x29 0x7 0x38 0xd5 +0x9 0x10 0x38 0xd5 +0x9 0x10 0x3c 0xd5 +0x9 0x10 0x3e 0xd5 +0x29 0x10 0x38 0xd5 +0x29 0x10 0x3c 0xd5 +0x29 0x10 0x3e 0xd5 +0x49 0x10 0x38 0xd5 +0x9 0x11 0x3c 0xd5 +0x9 0x11 0x3e 0xd5 +0x29 0x11 0x3c 0xd5 +0x29 0x11 0x3e 0xd5 +0x49 0x11 0x3c 0xd5 +0x49 0x11 0x3e 0xd5 +0x69 0x11 0x3c 0xd5 +0xe9 0x11 0x3c 0xd5 +0x29 0x13 0x3e 0xd5 +0x9 0x20 0x38 0xd5 +0x9 0x20 0x3c 0xd5 +0x9 0x20 0x3e 0xd5 +0x29 0x20 0x38 0xd5 +0x49 0x20 0x38 0xd5 +0x49 0x20 0x3c 0xd5 +0x49 0x20 0x3e 0xd5 +0x9 0x21 0x3c 0xd5 +0x49 0x21 0x3c 0xd5 +0x9 0x30 0x3c 0xd5 +0x9 0x40 0x38 0xd5 +0x9 0x40 0x3c 0xd5 +0x9 0x40 0x3e 0xd5 +0x29 0x40 0x38 0xd5 +0x29 0x40 0x3c 0xd5 +0x29 0x40 0x3e 0xd5 +0x9 0x41 0x38 0xd5 +0x9 0x41 0x3c 0xd5 +0x9 0x41 0x3e 0xd5 +0x9 0x42 0x38 0xd5 +0x9 0x42 0x3b 0xd5 +0x29 0x42 0x3b 0xd5 +0x49 0x42 0x38 0xd5 +0x9 0x43 0x3c 0xd5 +0x29 0x43 0x3c 0xd5 +0x49 0x43 0x3c 0xd5 +0x69 0x43 0x3c 0xd5 +0x9 0x44 0x3b 0xd5 +0x29 0x44 0x3b 0xd5 +0x9 0x45 0x3b 0xd5 +0x29 0x45 0x3b 0xd5 +0x29 0x50 0x3c 0xd5 +0x9 0x51 
0x38 0xd5 +0x9 0x51 0x3c 0xd5 +0x9 0x51 0x3e 0xd5 +0x29 0x51 0x38 0xd5 +0x29 0x51 0x3c 0xd5 +0x29 0x51 0x3e 0xd5 +0x9 0x52 0x38 0xd5 +0x9 0x52 0x3c 0xd5 +0x9 0x52 0x3e 0xd5 +0x9 0x53 0x3c 0xd5 +0x9 0x60 0x38 0xd5 +0x9 0x60 0x3c 0xd5 +0x9 0x60 0x3e 0xd5 +0x89 0x60 0x3c 0xd5 +0x9 0x74 0x38 0xd5 +0x9 0x9c 0x3b 0xd5 +0x29 0x9c 0x3b 0xd5 +0x49 0x9c 0x3b 0xd5 +0x69 0x9c 0x3b 0xd5 +0xa9 0x9c 0x3b 0xd5 +0xc9 0x9c 0x3b 0xd5 +0xe9 0x9c 0x3b 0xd5 +0x9 0x9d 0x3b 0xd5 +0x29 0x9d 0x3b 0xd5 +0x49 0x9d 0x3b 0xd5 +0x9 0x9e 0x3b 0xd5 +0x29 0x9e 0x38 0xd5 +0x49 0x9e 0x38 0xd5 +0x69 0x9e 0x3b 0xd5 +0x9 0xa2 0x38 0xd5 +0x9 0xa2 0x3c 0xd5 +0x9 0xa2 0x3e 0xd5 +0x9 0xa3 0x38 0xd5 +0x9 0xa3 0x3c 0xd5 +0x9 0xa3 0x3e 0xd5 +0x9 0xc0 0x38 0xd5 +0x9 0xc0 0x3c 0xd5 +0x9 0xc0 0x3e 0xd5 +0x29 0xc0 0x38 0xd5 +0x29 0xc0 0x3c 0xd5 +0x29 0xc0 0x3e 0xd5 +0x49 0xc0 0x38 0xd5 +0x49 0xc0 0x3c 0xd5 +0x49 0xc0 0x3e 0xd5 +0x9 0xc1 0x38 0xd5 +0x29 0xd0 0x38 0xd5 +0x49 0xd0 0x3b 0xd5 +0x49 0xd0 0x3c 0xd5 +0x49 0xd0 0x3e 0xd5 +0x69 0xd0 0x3b 0xd5 +0x89 0xd0 0x38 0xd5 +0x9 0xe0 0x3b 0xd5 +0x29 0xe0 0x3b 0xd5 +0x49 0xe0 0x3b 0xd5 +0x69 0xe0 0x3c 0xd5 +0x9 0xe1 0x38 0xd5 +0x9 0xe1 0x3c 0xd5 +0x9 0xe2 0x3b 0xd5 +0x9 0xe2 0x3c 0xd5 +0x9 0xe2 0x3f 0xd5 +0x29 0xe2 0x3b 0xd5 +0x29 0xe2 0x3c 0xd5 +0x29 0xe2 0x3f 0xd5 +0x49 0xe2 0x3b 0xd5 +0x49 0xe2 0x3c 0xd5 +0x49 0xe2 0x3f 0xd5 +0x9 0xe3 0x3b 0xd5 +0x29 0xe3 0x3b 0xd5 +0x49 0xe3 0x3b 0xd5 +0x9 0xe8 0x3b 0xd5 +0x29 0xe8 0x3b 0xd5 +0x49 0xe8 0x3b 0xd5 +0x69 0xe8 0x3b 0xd5 +0x89 0xe8 0x3b 0xd5 +0xa9 0xe8 0x3b 0xd5 +0xc9 0xe8 0x3b 0xd5 +0xe9 0xe8 0x3b 0xd5 +0x9 0xe9 0x3b 0xd5 +0x29 0xe9 0x3b 0xd5 +0x49 0xe9 0x3b 0xd5 +0x69 0xe9 0x3b 0xd5 +0x89 0xe9 0x3b 0xd5 +0xa9 0xe9 0x3b 0xd5 +0xc9 0xe9 0x3b 0xd5 +0xe9 0xe9 0x3b 0xd5 +0x9 0xea 0x3b 0xd5 +0x29 0xea 0x3b 0xd5 +0x49 0xea 0x3b 0xd5 +0x69 0xea 0x3b 0xd5 +0x89 0xea 0x3b 0xd5 +0xa9 0xea 0x3b 0xd5 +0xc9 0xea 0x3b 0xd5 +0xe9 0xea 0x3b 0xd5 +0x9 0xeb 0x3b 0xd5 +0x29 0xeb 0x3b 0xd5 +0x49 0xeb 0x3b 0xd5 +0x69 0xeb 0x3b 0xd5 +0x89 0xeb 0x3b 0xd5 +0xa9 0xeb 0x3b 0xd5 +0xc9 0xeb 0x3b 0xd5 +0xe9 0xef 0x3b 0xd5 +0x9 0xec 0x3b 0xd5 +0x29 0xec 0x3b 0xd5 +0x49 0xec 0x3b 0xd5 +0x69 0xec 0x3b 0xd5 +0x89 0xec 0x3b 0xd5 +0xa9 0xec 0x3b 0xd5 +0xc9 0xec 0x3b 0xd5 +0xe9 0xec 0x3b 0xd5 +0x9 0xed 0x3b 0xd5 +0x29 0xed 0x3b 0xd5 +0x49 0xed 0x3b 0xd5 +0x69 0xed 0x3b 0xd5 +0x89 0xed 0x3b 0xd5 +0xa9 0xed 0x3b 0xd5 +0xc9 0xed 0x3b 0xd5 +0xe9 0xed 0x3b 0xd5 +0x9 0xee 0x3b 0xd5 +0x29 0xee 0x3b 0xd5 +0x49 0xee 0x3b 0xd5 +0x69 0xee 0x3b 0xd5 +0x89 0xee 0x3b 0xd5 +0xa9 0xee 0x3b 0xd5 +0xc9 0xee 0x3b 0xd5 +0xe9 0xee 0x3b 0xd5 +0x9 0xef 0x3b 0xd5 +0x29 0xef 0x3b 0xd5 +0x49 0xef 0x3b 0xd5 +0x69 0xef 0x3b 0xd5 +0x89 0xef 0x3b 0xd5 +0xa9 0xef 0x3b 0xd5 +0xc9 0xef 0x3b 0xd5 + +# CHECK: mrs x12, s3_7_c15_c1_5 +# CHECK: mrs x13, s3_2_c11_c15_7 +# CHECK: msr s3_0_c15_c0_0, x12 +# CHECK: msr s3_7_c11_c13_7, x5 +0xac 0xf1 0x3f 0xd5 +0xed 0xbf 0x3a 0xd5 +0x0c 0xf0 0x18 0xd5 +0xe5 0xbd 0x1f 0xd5 + +#------------------------------------------------------------------------------ +# Test and branch (immediate) +#------------------------------------------------------------------------------ + +# CHECK: tbz x12, #62, #0 +# CHECK: tbz x12, #62, #4 +# CHECK: tbz x12, #62, #-32768 +# CHECK: tbnz x12, #60, #32764 +0x0c 0x00 0xf0 0xb6 +0x2c 0x00 0xf0 0xb6 +0x0c 0x00 0xf4 0xb6 +0xec 0xff 0xe3 0xb7 + +#------------------------------------------------------------------------------ +# Unconditional branch (immediate) +#------------------------------------------------------------------------------ + +# CHECK: b 
#4 +# CHECK: b #-4 +# CHECK: b #134217724 +0x01 0x00 0x00 0x14 +0xff 0xff 0xff 0x17 +0xff 0xff 0xff 0x15 + +#------------------------------------------------------------------------------ +# Unconditional branch (register) +#------------------------------------------------------------------------------ + +# CHECK: br x20 +# CHECK: blr xzr +# CHECK: ret x10 +0x80 0x2 0x1f 0xd6 +0xe0 0x3 0x3f 0xd6 +0x40 0x1 0x5f 0xd6 + +# CHECK: ret +# CHECK: eret +# CHECK: drps +0xc0 0x3 0x5f 0xd6 +0xe0 0x3 0x9f 0xd6 +0xe0 0x3 0xbf 0xd6 + diff --git a/test/MC/Disassembler/AArch64/basic-a64-undefined.txt b/test/MC/Disassembler/AArch64/basic-a64-undefined.txt new file mode 100644 index 0000000..a17579c --- /dev/null +++ b/test/MC/Disassembler/AArch64/basic-a64-undefined.txt @@ -0,0 +1,43 @@ +# These each spawn another process, so they're rather expensive; hence only a few. + +# Instructions notionally in the add/sub (extended register) sheet, but with +# an invalid shift amount or "opt" field. +# RUN: echo "0x00 0x10 0xa0 0x0b" | llvm-mc -triple=aarch64 -disassemble 2>&1 | FileCheck %s +# RUN: echo "0x00 0x10 0x60 0x0b" | llvm-mc -triple=aarch64 -disassemble 2>&1 | FileCheck %s +# RUN: echo "0x00 0x14 0x20 0x0b" | llvm-mc -triple=aarch64 -disassemble 2>&1 | FileCheck %s + +# Instructions notionally in the add/sub (immediate) sheet, but with an +# invalid "shift" field. +# RUN: echo "0xdf 0x3 0x80 0x91" | llvm-mc -triple=aarch64 -disassemble 2>&1 | FileCheck %s +# RUN: echo "0xed 0x8e 0xc4 0x31" | llvm-mc -triple=aarch64 -disassemble 2>&1 | FileCheck %s +# RUN: echo "0x62 0xfc 0xbf 0x11" | llvm-mc -triple=aarch64 -disassemble 2>&1 | FileCheck %s +# RUN: echo "0x3 0xff 0xff 0x91" | llvm-mc -triple=aarch64 -disassemble 2>&1 | FileCheck %s + +# Instructions notionally in the load/store (unsigned immediate) sheet. +# The only unallocated (int-register) variants are: opc=0b11, size=0b10 or 0b11. +# RUN: echo "0xd7 0xfc 0xff 0xb9" | llvm-mc -triple=aarch64 -disassemble 2>&1 | FileCheck %s +# RUN: echo "0xd7 0xfc 0xcf 0xf9" | llvm-mc -triple=aarch64 -disassemble 2>&1 | FileCheck %s + +# Instructions notionally in the floating-point <-> fixed-point conversion sheet. +# The scale field is 64-<imm>, and <imm> should be 1-32 for a 32-bit int register. +# RUN: echo "0x23 0x01 0x18 0x1e" | llvm-mc -triple=aarch64 -disassemble 2>&1 | FileCheck %s +# RUN: echo "0x23 0x25 0x42 0x1e" | llvm-mc -triple=aarch64 -disassemble 2>&1 | FileCheck %s + +# Instructions notionally in the logical (shifted register) sheet, but with an +# out-of-range shift: w-registers can only have 0-31. +# RUN: echo "0x00 0x80 0x00 0x0a" | llvm-mc -triple=aarch64 -disassemble 2>&1 | FileCheck %s + +# Instructions notionally in the move wide (immediate) sheet, but with an +# out-of-range shift: w-registers can only have 0 or 16.
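+# (Reviewer note, not part of the patch: as a worked example of the first +# move-wide RUN line below, 0x00 0x00 0xc0 0x12 is the word 0x12c00000, a +# w-register MOVN whose hw field (bits 22-21) is 0b10, requesting a #32 shift +# that only the x-register form allows.)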
+# RUN: echo "0x00 0x00 0xc0 0x12" | llvm-mc -triple=aarch64 -disassemble 2>&1 | FileCheck %s +# RUN: echo "0x12 0x34 0xe0 0x52" | llvm-mc -triple=aarch64 -disassemble 2>&1 | FileCheck %s + +# Data-processing instructions are undefined when S=1 and for the 0b0000111 value in opcode:sf +# RUN: echo "0x00 0x00 0xc0 0x5f" | llvm-mc -triple=aarch64 -disassemble 2>&1 | FileCheck %s +# RUN: echo "0x56 0x0c 0xc0 0x5a" | llvm-mc -triple=aarch64 -disassemble 2>&1 | FileCheck %s + +# Data-processing instructions (2 source) are undefined for a value of 0001xx:0:x or 0011xx:0:x for opcode:S:sf +# RUN: echo "0x00 0x30 0xc1 0x1a" | llvm-mc -triple=aarch64 -disassemble 2>&1 | FileCheck %s +# RUN: echo "0x00 0x10 0xc1 0x1a" | llvm-mc -triple=aarch64 -disassemble 2>&1 | FileCheck %s + +# CHECK: invalid instruction encoding diff --git a/test/MC/Disassembler/AArch64/basic-a64-unpredictable.txt b/test/MC/Disassembler/AArch64/basic-a64-unpredictable.txt new file mode 100644 index 0000000..adb8f75 --- /dev/null +++ b/test/MC/Disassembler/AArch64/basic-a64-unpredictable.txt @@ -0,0 +1,96 @@ +# RUN: llvm-mc -triple=aarch64 -disassemble < %s 2>&1 | FileCheck %s + +#------------------------------------------------------------------------------ +# Load-store exclusive +#------------------------------------------------------------------------------ + +#ldxp x14, x14, [sp] +0xee 0x3b 0x7f 0xc8 +#CHECK: warning: potentially undefined instruction encoding +#CHECK-NEXT: 0xee 0x3b 0x7f 0xc8 + +#ldaxp w19, w19, [x1] +0x33 0xcc 0x7f 0x88 +#CHECK: warning: potentially undefined instruction encoding +#CHECK-NEXT: 0x33 0xcc 0x7f 0x88 + +#------------------------------------------------------------------------------ +# Load-store register (immediate post-indexed) +#------------------------------------------------------------------------------ + +0x63 0x44 0x40 0xf8 +#CHECK: warning: potentially undefined instruction encoding +#CHECK-NEXT: 0x63 0x44 0x40 0xf8 + +0x42 0x14 0xc0 0x38 +#CHECK: warning: potentially undefined instruction encoding +#CHECK-NEXT: 0x42 0x14 0xc0 0x38 + +#------------------------------------------------------------------------------ +# Load-store register (immediate pre-indexed) +#------------------------------------------------------------------------------ + +0x63 0x4c 0x40 0xf8 +#CHECK: warning: potentially undefined instruction encoding +#CHECK-NEXT: 0x63 0x4c 0x40 0xf8 + +0x42 0x1c 0xc0 0x38 +#CHECK: warning: potentially undefined instruction encoding +#CHECK-NEXT: 0x42 0x1c 0xc0 0x38 + +#------------------------------------------------------------------------------ +# Load-store register pair (offset) +#------------------------------------------------------------------------------ + +# Unpredictable if Rt == Rt2 on a load. + +0xe3 0x0f 0x40 0xa9 +# CHECK: warning: potentially undefined instruction encoding +# CHECK-NEXT: 0xe3 0x0f 0x40 0xa9 +# CHECK-NEXT: ^ + +0xe2 0x8b 0x41 0x69 +# CHECK: warning: potentially undefined instruction encoding +# CHECK-NEXT: 0xe2 0x8b 0x41 0x69 +# CHECK-NEXT: ^ + +0x82 0x88 0x40 0x2d +# CHECK: warning: potentially undefined instruction encoding +# CHECK-NEXT: 0x82 0x88 0x40 0x2d +# CHECK-NEXT: ^ + +#------------------------------------------------------------------------------ +# Load-store register pair (post-indexed) +#------------------------------------------------------------------------------ + +# Unpredictable if Rt == Rt2 on a load. 
+ +0xe3 0x0f 0xc0 0xa8 +# CHECK: warning: potentially undefined instruction encoding +# CHECK-NEXT: 0xe3 0x0f 0xc0 0xa8 +# CHECK-NEXT: ^ + +0xe2 0x8b 0xc1 0x68 +# CHECK: warning: potentially undefined instruction encoding +# CHECK-NEXT: 0xe2 0x8b 0xc1 0x68 +# CHECK-NEXT: ^ + +0x82 0x88 0xc0 0x2c +# CHECK: warning: potentially undefined instruction encoding +# CHECK-NEXT: 0x82 0x88 0xc0 0x2c +# CHECK-NEXT: ^ + +# Also unpredictable if writeback clashes with either transfer register + +0x63 0x94 0xc0 0xa8 +# CHECK: warning: potentially undefined instruction encoding +# CHECK-NEXT: 0x63 0x94 0xc0 0xa8 + +0x69 0x2d 0x81 0xa8 +# CHECK: warning: potentially undefined instruction encoding +# CHECK-NEXT: 0x69 0x2d 0x81 0xa8 + +0x29 0xad 0xc0 0x28 +# CHECK: warning: potentially undefined instruction encoding +# CHECK-NEXT: 0x29 0xad 0xc0 0x28 + diff --git a/test/MC/Disassembler/AArch64/ldp-offset-predictable.txt b/test/MC/Disassembler/AArch64/ldp-offset-predictable.txt new file mode 100644 index 0000000..7ff495f --- /dev/null +++ b/test/MC/Disassembler/AArch64/ldp-offset-predictable.txt @@ -0,0 +1,7 @@ +# RUN: llvm-mc -triple=aarch64 -disassemble < %s 2>&1 | FileCheck %s + +# Stores are OK. +0xe0 0x83 0x00 0xa9 +# CHECK-NOT: potentially undefined instruction encoding +# CHECK: stp x0, x0, [sp, #8] + diff --git a/test/MC/Disassembler/AArch64/ldp-postind.predictable.txt b/test/MC/Disassembler/AArch64/ldp-postind.predictable.txt new file mode 100644 index 0000000..775660b --- /dev/null +++ b/test/MC/Disassembler/AArch64/ldp-postind.predictable.txt @@ -0,0 +1,17 @@ +# RUN: llvm-mc -triple=aarch64 -disassemble < %s 2>&1 | FileCheck %s + +# None of these instructions should be classified as unpredictable: + +# CHECK-NOT: potentially undefined instruction encoding + +# Stores from duplicated registers should be fine. +0xe3 0x0f 0x80 0xa8 +# CHECK: stp x3, x3, [sp], #0 + +# d5 != x5 so "ldp d5, d6, [x5], #24" is fine. +0xa5 0x98 0xc1 0x6c +# CHECK: ldp d5, d6, [x5], #24 + +# xzr != sp so "stp xzr, xzr, [sp], #8" is fine. +0xff 0xff 0x80 0xa8 +# CHECK: stp xzr, xzr, [sp], #8 diff --git a/test/MC/Disassembler/AArch64/ldp-preind.predictable.txt b/test/MC/Disassembler/AArch64/ldp-preind.predictable.txt new file mode 100644 index 0000000..48ea817 --- /dev/null +++ b/test/MC/Disassembler/AArch64/ldp-preind.predictable.txt @@ -0,0 +1,17 @@ +# RUN: llvm-mc -triple=aarch64 -disassemble < %s 2>&1 | FileCheck %s + +# None of these instructions should be classified as unpredictable: + +# CHECK-NOT: potentially undefined instruction encoding + +# Stores from duplicated registers should be fine. +0xe3 0x0f 0x80 0xa9 +# CHECK: stp x3, x3, [sp, #0]! + +# d5 != x5 so "ldp d5, d6, [x5, #24]!" is fine. +0xa5 0x98 0xc1 0x6d +# CHECK: ldp d5, d6, [x5, #24]! + +# xzr != sp so "stp xzr, xzr, [sp, #8]!" is fine. +0xff 0xff 0x80 0xa9 +# CHECK: stp xzr, xzr, [sp, #8]! 
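+ +# (Reviewer note, not part of the patch: register number 31 encodes sp/wsp in +# the base-register slot but xzr/wzr in the transfer slots, which is why +# Rt = Rt2 = Rn = 31 in the stp above does not count as a writeback clash.)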
diff --git a/test/MC/Disassembler/AArch64/lit.local.cfg b/test/MC/Disassembler/AArch64/lit.local.cfg new file mode 100644 index 0000000..f9df30e --- /dev/null +++ b/test/MC/Disassembler/AArch64/lit.local.cfg @@ -0,0 +1,6 @@ +config.suffixes = ['.txt'] + +targets = set(config.root.targets_to_build.split()) +if not 'AArch64' in targets: + config.unsupported = True + diff --git a/test/MC/Disassembler/ARM/invalid-VST1d8Twb_register-thumb.txt b/test/MC/Disassembler/ARM/invalid-VST1d8Twb_register-thumb.txt index 2d2a628..99da8ce 100644 --- a/test/MC/Disassembler/ARM/invalid-VST1d8Twb_register-thumb.txt +++ b/test/MC/Disassembler/ARM/invalid-VST1d8Twb_register-thumb.txt @@ -7,7 +7,7 @@ # ------------------------------------------------------------------------------------------------- # # A8.6.391 VST1 (multiple single elements) -# This encoding looks like: vst1.8 {d0,d1,d2}, [r0, :128] +# This encoding looks like: vst1.8 {d0,d1,d2}, [r0:128] # But bits 5-4 for the alignment of 128 encoded as align = 0b10, is available only if <list> # contains two or four registers. rdar://11220250 0x00 0xf9 0x2f 0x06 diff --git a/test/MC/Disassembler/ARM/neon-tests.txt b/test/MC/Disassembler/ARM/neon-tests.txt index a7b6b1c..65e9954 100644 --- a/test/MC/Disassembler/ARM/neon-tests.txt +++ b/test/MC/Disassembler/ARM/neon-tests.txt @@ -21,10 +21,10 @@ # CHECK: vld4.8 {d4, d6, d8, d10}, [r2] 0x0f 0x41 0x22 0xf4 -# CHECK: vld1.32 {d3[], d4[]}, [r0, :32]! +# CHECK: vld1.32 {d3[], d4[]}, [r0:32]! 0xbd 0x3c 0xa0 0xf4 -# CHECK: vld4.16 {d3[], d5[], d7[], d9[]}, [r0, :64]! +# CHECK: vld4.16 {d3[], d5[], d7[], d9[]}, [r0:64]! 0x7d 0x3f 0xa0 0xf4 # CHECK: vorr d0, d15, d15 @@ -75,7 +75,7 @@ # CHECK: vbic.i32 q2, #0xa900 0x79 0x43 0x82 0xf3 -# CHECK: vst2.32 {d16, d18}, [r2, :64], r2 +# CHECK: vst2.32 {d16, d18}, [r2:64], r2 0x92 0x9 0x42 0xf4 # CHECK: vmov.s8 r0, d8[1] diff --git a/test/MC/Disassembler/ARM/neon.txt b/test/MC/Disassembler/ARM/neon.txt index 649424a..cd5f418 100644 --- a/test/MC/Disassembler/ARM/neon.txt +++ b/test/MC/Disassembler/ARM/neon.txt @@ -1638,7 +1638,7 @@ 0x1f 0x07 0x60 0xf4 -# CHECK: vld1.8 {d16}, [r0, :64] +# CHECK: vld1.8 {d16}, [r0:64] 0x4f 0x07 0x60 0xf4 # CHECK: vld1.16 {d16}, [r0] 0x8f 0x07 0x60 0xf4 @@ -1646,37 +1646,37 @@ 0xcf 0x07 0x60 0xf4 # CHECK: vld1.64 {d16}, [r0] 0x1f 0x0a 0x60 0xf4 -# CHECK: vld1.8 {d16, d17}, [r0, :64] +# CHECK: vld1.8 {d16, d17}, [r0:64] 0x6f 0x0a 0x60 0xf4 -# CHECK: vld1.16 {d16, d17}, [r0, :128] +# CHECK: vld1.16 {d16, d17}, [r0:128] 0x8f 0x0a 0x60 0xf4 # CHECK: vld1.32 {d16, d17}, [r0] 0xcf 0x0a 0x60 0xf4 # CHECK: vld1.64 {d16, d17}, [r0] 0x1f 0x08 0x60 0xf4 -# CHECK: vld2.8 {d16, d17}, [r0, :64] +# CHECK: vld2.8 {d16, d17}, [r0:64] 0x6f 0x08 0x60 0xf4 -# CHECK: vld2.16 {d16, d17}, [r0, :128] +# CHECK: vld2.16 {d16, d17}, [r0:128] 0x8f 0x08 0x60 0xf4 # CHECK: vld2.32 {d16, d17}, [r0] 0x1f 0x03 0x60 0xf4 -# CHECK: vld2.8 {d16, d17, d18, d19}, [r0, :64] +# CHECK: vld2.8 {d16, d17, d18, d19}, [r0:64] 0x6f 0x03 0x60 0xf4 -# CHECK: vld2.16 {d16, d17, d18, d19}, [r0, :128] +# CHECK: vld2.16 {d16, d17, d18, d19}, [r0:128] 0xbf 0x03 0x60 0xf4 -# CHECK: vld2.32 {d16, d17, d18, d19}, [r0, :256] +# CHECK: vld2.32 {d16, d17, d18, d19}, [r0:256] 0x1f 0x04 0x60 0xf4 -# CHECK: vld3.8 {d16, d17, d18}, [r0, :64] +# CHECK: vld3.8 {d16, d17, d18}, [r0:64] 0x4f 0x04 0x60 0xf4 # CHECK: vld3.16 {d16, d17, d18}, [r0] 0x8f 0x04 0x60 0xf4 # CHECK: vld3.32 {d16, d17, d18}, [r0] 0x1d 0x05 0x60 0xf4 -# CHECK: vld3.8 {d16, d18, d20}, [r0, :64]! +# CHECK: vld3.8 {d16, d18, d20}, [r0:64]! 
0x1d 0x15 0x60 0xf4 -# CHECK: vld3.8 {d17, d19, d21}, [r0, :64]! +# CHECK: vld3.8 {d17, d19, d21}, [r0:64]! 0x4d 0x05 0x60 0xf4 # CHECK: vld3.16 {d16, d18, d20}, [r0]! 0x4d 0x15 0x60 0xf4 @@ -1687,15 +1687,15 @@ # CHECK: vld3.32 {d17, d19, d21}, [r0]! 0x1f 0x00 0x60 0xf4 -# CHECK: vld4.8 {d16, d17, d18, d19}, [r0, :64] +# CHECK: vld4.8 {d16, d17, d18, d19}, [r0:64] 0x6f 0x00 0x60 0xf4 -# CHECK: vld4.16 {d16, d17, d18, d19}, [r0, :128] +# CHECK: vld4.16 {d16, d17, d18, d19}, [r0:128] 0xbf 0x00 0x60 0xf4 -# CHECK: vld4.32 {d16, d17, d18, d19}, [r0, :256] +# CHECK: vld4.32 {d16, d17, d18, d19}, [r0:256] 0x3d 0x01 0x60 0xf4 -# CHECK: vld4.8 {d16, d18, d20, d22}, [r0, :256]! +# CHECK: vld4.8 {d16, d18, d20, d22}, [r0:256]! 0x3d 0x11 0x60 0xf4 -# CHECK: vld4.8 {d17, d19, d21, d23}, [r0, :256]! +# CHECK: vld4.8 {d17, d19, d21, d23}, [r0:256]! 0x4d 0x01 0x60 0xf4 # CHECK: vld4.16 {d16, d18, d20, d22}, [r0]! 0x4d 0x11 0x60 0xf4 @@ -1708,20 +1708,20 @@ 0x6f 0x00 0xe0 0xf4 # CHECK: vld1.8 {d16[3]}, [r0] 0x9f 0x04 0xe0 0xf4 -# CHECK: vld1.16 {d16[2]}, [r0, :16] +# CHECK: vld1.16 {d16[2]}, [r0:16] 0xbf 0x08 0xe0 0xf4 -# CHECK: vld1.32 {d16[1]}, [r0, :32] +# CHECK: vld1.32 {d16[1]}, [r0:32] 0x3f 0x01 0xe0 0xf4 -# CHECK: vld2.8 {d16[1], d17[1]}, [r0, :16] +# CHECK: vld2.8 {d16[1], d17[1]}, [r0:16] 0x5f 0x05 0xe0 0xf4 -# CHECK: vld2.16 {d16[1], d17[1]}, [r0, :32] +# CHECK: vld2.16 {d16[1], d17[1]}, [r0:32] 0x8f 0x09 0xe0 0xf4 # CHECK: vld2.32 {d16[1], d17[1]}, [r0] 0x6f 0x15 0xe0 0xf4 # CHECK: vld2.16 {d17[1], d19[1]}, [r0] 0x5f 0x19 0xe0 0xf4 -# CHECK: vld2.32 {d17[0], d19[0]}, [r0, :64] +# CHECK: vld2.32 {d17[0], d19[0]}, [r0:64] 0x2f 0x02 0xe0 0xf4 # CHECK: vld3.8 {d16[1], d17[1], d18[1]}, [r0] @@ -1754,44 +1754,44 @@ 0xa5 0x0e 0xa4 0xf4 0x3f 0x03 0xe0 0xf4 -# CHECK: vld4.8 {d16[1], d17[1], d18[1], d19[1]}, [r0, :32] +# CHECK: vld4.8 {d16[1], d17[1], d18[1], d19[1]}, [r0:32] 0x4f 0x07 0xe0 0xf4 # CHECK: vld4.16 {d16[1], d17[1], d18[1], d19[1]}, [r0] 0xaf 0x0b 0xe0 0xf4 -# CHECK: vld4.32 {d16[1], d17[1], d18[1], d19[1]}, [r0, :128] +# CHECK: vld4.32 {d16[1], d17[1], d18[1], d19[1]}, [r0:128] 0x7f 0x07 0xe0 0xf4 -# CHECK: vld4.16 {d16[1], d18[1], d20[1], d22[1]}, [r0, :64] +# CHECK: vld4.16 {d16[1], d18[1], d20[1], d22[1]}, [r0:64] 0x4f 0x1b 0xe0 0xf4 # CHECK: vld4.32 {d17[0], d19[0], d21[0], d23[0]}, [r0] 0x0f 0x0f 0xa4 0xf4 # CHECK: vld4.8 {d0[], d1[], d2[], d3[]}, [r4] 0x3f 0x0f 0xa4 0xf4 -# CHECK: vld4.8 {d0[], d2[], d4[], d6[]}, [r4, :32] +# CHECK: vld4.8 {d0[], d2[], d4[], d6[]}, [r4:32] 0x1d 0x0f 0xa4 0xf4 -# CHECK: vld4.8 {d0[], d1[], d2[], d3[]}, [r4, :32]! +# CHECK: vld4.8 {d0[], d1[], d2[], d3[]}, [r4:32]! 0x35 0x0f 0xa4 0xf4 -# CHECK: vld4.8 {d0[], d2[], d4[], d6[]}, [r4, :32], r5 +# CHECK: vld4.8 {d0[], d2[], d4[], d6[]}, [r4:32], r5 0x4f 0x0f 0xa4 0xf4 # CHECK: vld4.16 {d0[], d1[], d2[], d3[]}, [r4] 0x7f 0x0f 0xa4 0xf4 -# CHECK: vld4.16 {d0[], d2[], d4[], d6[]}, [r4, :64] +# CHECK: vld4.16 {d0[], d2[], d4[], d6[]}, [r4:64] 0x5d 0x0f 0xa4 0xf4 -# CHECK: vld4.16 {d0[], d1[], d2[], d3[]}, [r4, :64]! +# CHECK: vld4.16 {d0[], d1[], d2[], d3[]}, [r4:64]! 0x75 0x0f 0xa4 0xf4 -# CHECK: vld4.16 {d0[], d2[], d4[], d6[]}, [r4, :64], r5 +# CHECK: vld4.16 {d0[], d2[], d4[], d6[]}, [r4:64], r5 0x8f 0x0f 0xa4 0xf4 # CHECK: vld4.32 {d0[], d1[], d2[], d3[]}, [r4] 0xbf 0x0f 0xa4 0xf4 -# CHECK: vld4.32 {d0[], d2[], d4[], d6[]}, [r4, :64] +# CHECK: vld4.32 {d0[], d2[], d4[], d6[]}, [r4:64] 0xdd 0x0f 0xa4 0xf4 -# CHECK: vld4.32 {d0[], d1[], d2[], d3[]}, [r4, :128]! 
+# CHECK: vld4.32 {d0[], d1[], d2[], d3[]}, [r4:128]! 0xf5 0x0f 0xa4 0xf4 -# CHECK: vld4.32 {d0[], d2[], d4[], d6[]}, [r4, :128], r5 +# CHECK: vld4.32 {d0[], d2[], d4[], d6[]}, [r4:128], r5 0x1f 0x07 0x40 0xf4 -# CHECK: vst1.8 {d16}, [r0, :64] +# CHECK: vst1.8 {d16}, [r0:64] 0x4f 0x07 0x40 0xf4 # CHECK: vst1.16 {d16}, [r0] 0x8f 0x07 0x40 0xf4 @@ -1799,37 +1799,37 @@ 0xcf 0x07 0x40 0xf4 # CHECK: vst1.64 {d16}, [r0] 0x1f 0x0a 0x40 0xf4 -# CHECK: vst1.8 {d16, d17}, [r0, :64] +# CHECK: vst1.8 {d16, d17}, [r0:64] 0x6f 0x0a 0x40 0xf4 -# CHECK: vst1.16 {d16, d17}, [r0, :128] +# CHECK: vst1.16 {d16, d17}, [r0:128] 0x8f 0x0a 0x40 0xf4 # CHECK: vst1.32 {d16, d17}, [r0] 0xcf 0x0a 0x40 0xf4 # CHECK: vst1.64 {d16, d17}, [r0] 0x1f 0x08 0x40 0xf4 -# CHECK: vst2.8 {d16, d17}, [r0, :64] +# CHECK: vst2.8 {d16, d17}, [r0:64] 0x6f 0x08 0x40 0xf4 -# CHECK: vst2.16 {d16, d17}, [r0, :128] +# CHECK: vst2.16 {d16, d17}, [r0:128] 0x8f 0x08 0x40 0xf4 # CHECK: vst2.32 {d16, d17}, [r0] 0x1f 0x03 0x40 0xf4 -# CHECK: vst2.8 {d16, d17, d18, d19}, [r0, :64] +# CHECK: vst2.8 {d16, d17, d18, d19}, [r0:64] 0x6f 0x03 0x40 0xf4 -# CHECK: vst2.16 {d16, d17, d18, d19}, [r0, :128] +# CHECK: vst2.16 {d16, d17, d18, d19}, [r0:128] 0xbf 0x03 0x40 0xf4 -# CHECK: vst2.32 {d16, d17, d18, d19}, [r0, :256] +# CHECK: vst2.32 {d16, d17, d18, d19}, [r0:256] 0x1f 0x04 0x40 0xf4 -# CHECK: vst3.8 {d16, d17, d18}, [r0, :64] +# CHECK: vst3.8 {d16, d17, d18}, [r0:64] 0x4f 0x04 0x40 0xf4 # CHECK: vst3.16 {d16, d17, d18}, [r0] 0x8f 0x04 0x40 0xf4 # CHECK: vst3.32 {d16, d17, d18}, [r0] 0x1d 0x05 0x40 0xf4 -# CHECK: vst3.8 {d16, d18, d20}, [r0, :64]! +# CHECK: vst3.8 {d16, d18, d20}, [r0:64]! 0x1d 0x15 0x40 0xf4 -# CHECK: vst3.8 {d17, d19, d21}, [r0, :64]! +# CHECK: vst3.8 {d17, d19, d21}, [r0:64]! 0x4d 0x05 0x40 0xf4 # CHECK: vst3.16 {d16, d18, d20}, [r0]! 0x4d 0x15 0x40 0xf4 @@ -1840,13 +1840,13 @@ # CHECK: vst3.32 {d17, d19, d21}, [r0]! 0x1f 0x00 0x40 0xf4 -# CHECK: vst4.8 {d16, d17, d18, d19}, [r0, :64] +# CHECK: vst4.8 {d16, d17, d18, d19}, [r0:64] 0x6f 0x00 0x40 0xf4 -# CHECK: vst4.16 {d16, d17, d18, d19}, [r0, :128] +# CHECK: vst4.16 {d16, d17, d18, d19}, [r0:128] 0x3d 0x01 0x40 0xf4 -# CHECK: vst4.8 {d16, d18, d20, d22}, [r0, :256]! +# CHECK: vst4.8 {d16, d18, d20, d22}, [r0:256]! 0x3d 0x11 0x40 0xf4 -# CHECK: vst4.8 {d17, d19, d21, d23}, [r0, :256]! +# CHECK: vst4.8 {d17, d19, d21, d23}, [r0:256]! 0x4d 0x01 0x40 0xf4 # CHECK: vst4.16 {d16, d18, d20, d22}, [r0]! 0x4d 0x11 0x40 0xf4 @@ -1857,15 +1857,15 @@ # CHECK: vst4.32 {d17, d19, d21, d23}, [r0]! 
0x3f 0x01 0xc0 0xf4 -# CHECK: vst2.8 {d16[1], d17[1]}, [r0, :16] +# CHECK: vst2.8 {d16[1], d17[1]}, [r0:16] 0x5f 0x05 0xc0 0xf4 -# CHECK: vst2.16 {d16[1], d17[1]}, [r0, :32] +# CHECK: vst2.16 {d16[1], d17[1]}, [r0:32] 0x8f 0x09 0xc0 0xf4 # CHECK: vst2.32 {d16[1], d17[1]}, [r0] 0x6f 0x15 0xc0 0xf4 # CHECK: vst2.16 {d17[1], d19[1]}, [r0] 0x5f 0x19 0xc0 0xf4 -# CHECK: vst2.32 {d17[0], d19[0]}, [r0, :64] +# CHECK: vst2.32 {d17[0], d19[0]}, [r0:64] 0x2f 0x02 0xc0 0xf4 # CHECK: vst3.8 {d16[1], d17[1], d18[1]}, [r0] @@ -1879,13 +1879,13 @@ # CHECK: vst3.32 {d16[0], d18[0], d20[0]}, [r0] 0x3f 0x03 0xc0 0xf4 -# CHECK: vst4.8 {d16[1], d17[1], d18[1], d19[1]}, [r0, :32] +# CHECK: vst4.8 {d16[1], d17[1], d18[1], d19[1]}, [r0:32] 0x4f 0x07 0xc0 0xf4 # CHECK: vst4.16 {d16[1], d17[1], d18[1], d19[1]}, [r0] 0xaf 0x0b 0xc0 0xf4 -# CHECK: vst4.32 {d16[1], d17[1], d18[1], d19[1]}, [r0, :128] +# CHECK: vst4.32 {d16[1], d17[1], d18[1], d19[1]}, [r0:128] 0xff 0x17 0xc0 0xf4 -# CHECK: vst4.16 {d17[3], d19[3], d21[3], d23[3]}, [r0, :64] +# CHECK: vst4.16 {d17[3], d19[3], d21[3], d23[3]}, [r0:64] 0x4f 0x1b 0xc0 0xf4 # CHECK: vst4.32 {d17[0], d19[0], d21[0], d23[0]}, [r0] @@ -1920,11 +1920,11 @@ # CHECK: vcvttmi.f32.f16 s2, s19 0x1d 0x76 0x66 0xf4 -# CHECK: vld1.8 {d23, d24, d25}, [r6, :64]! +# CHECK: vld1.8 {d23, d24, d25}, [r6:64]! 0x9d 0x62 0x6f 0xf4 -# CHECK: vld1.32 {d22, d23, d24, d25}, [pc, :64]! +# CHECK: vld1.32 {d22, d23, d24, d25}, [pc:64]! 0x9d 0xaa 0x41 0xf4 -# CHECK: vst1.32 {d26, d27}, [r1, :64]! +# CHECK: vst1.32 {d26, d27}, [r1:64]! 0x10 0x0f 0x83 0xf2 0x50 0x0f 0x83 0xf2 diff --git a/test/MC/Disassembler/ARM/neont-VLD-reencoding.txt b/test/MC/Disassembler/ARM/neont-VLD-reencoding.txt index e53739e..6506143 100644 --- a/test/MC/Disassembler/ARM/neont-VLD-reencoding.txt +++ b/test/MC/Disassembler/ARM/neont-VLD-reencoding.txt @@ -28,13 +28,13 @@ 0xa0 0xf9 0xd0 0x04 # CHECK: vld1.16 {d0[0]}, [r0], r0 @ encoding: [0xa0,0xf9,0x00,0x04] -# CHECK: vld1.16 {d0[0]}, [r0, :16], r0 @ encoding: [0xa0,0xf9,0x10,0x04] +# CHECK: vld1.16 {d0[0]}, [r0:16], r0 @ encoding: [0xa0,0xf9,0x10,0x04] # CHECK: vld1.16 {d0[1]}, [r0], r0 @ encoding: [0xa0,0xf9,0x40,0x04] -# CHECK: vld1.16 {d0[1]}, [r0, :16], r0 @ encoding: [0xa0,0xf9,0x50,0x04] +# CHECK: vld1.16 {d0[1]}, [r0:16], r0 @ encoding: [0xa0,0xf9,0x50,0x04] # CHECK: vld1.16 {d0[2]}, [r0], r0 @ encoding: [0xa0,0xf9,0x80,0x04] -# CHECK: vld1.16 {d0[2]}, [r0, :16], r0 @ encoding: [0xa0,0xf9,0x90,0x04] +# CHECK: vld1.16 {d0[2]}, [r0:16], r0 @ encoding: [0xa0,0xf9,0x90,0x04] # CHECK: vld1.16 {d0[3]}, [r0], r0 @ encoding: [0xa0,0xf9,0xc0,0x04] -# CHECK: vld1.16 {d0[3]}, [r0, :16], r0 @ encoding: [0xa0,0xf9,0xd0,0x04] +# CHECK: vld1.16 {d0[3]}, [r0:16], r0 @ encoding: [0xa0,0xf9,0xd0,0x04] 0xa0 0xf9 0x00 0x08 0xa0 0xf9 0x30 0x08 @@ -42,20 +42,20 @@ 0xa0 0xf9 0xb0 0x08 # CHECK: vld1.32 {d0[0]}, [r0], r0 @ encoding: [0xa0,0xf9,0x00,0x08] -# CHECK: vld1.32 {d0[0]}, [r0, :32], r0 @ encoding: [0xa0,0xf9,0x30,0x08] +# CHECK: vld1.32 {d0[0]}, [r0:32], r0 @ encoding: [0xa0,0xf9,0x30,0x08] # CHECK: vld1.32 {d0[1]}, [r0], r0 @ encoding: [0xa0,0xf9,0x80,0x08] -# CHECK: vld1.32 {d0[1]}, [r0, :32], r0 @ encoding: [0xa0,0xf9,0xb0,0x08] +# CHECK: vld1.32 {d0[1]}, [r0:32], r0 @ encoding: [0xa0,0xf9,0xb0,0x08] 0xa0 0xf9 0x1f 0x04 0xa0 0xf9 0x8f 0x00 -# CHECK: vld1.16 {d0[0]}, [r0, :16] @ encoding: [0xa0,0xf9,0x1f,0x04] +# CHECK: vld1.16 {d0[0]}, [r0:16] @ encoding: [0xa0,0xf9,0x1f,0x04] # CHECK: vld1.8 {d0[4]}, [r0] @ encoding: [0xa0,0xf9,0x8f,0x00] 0xa0 0xf9 0x1d 0x04 0xa0 0xf9 0x8d 0x00 
-# CHECK: vld1.16 {d0[0]}, [r0, :16]! @ encoding: [0xa0,0xf9,0x1d,0x04] +# CHECK: vld1.16 {d0[0]}, [r0:16]! @ encoding: [0xa0,0xf9,0x1d,0x04] # CHECK: vld1.8 {d0[4]}, [r0]! @ encoding: [0xa0,0xf9,0x8d,0x00] 0xa5 0xf9 0x10 0x04 @@ -63,15 +63,15 @@ 0xae 0xf9 0x1a 0x04 0xa5 0xf9 0x1a 0x94 -# CHECK: vld1.16 {d0[0]}, [r5, :16], r0 @ encoding: [0xa5,0xf9,0x10,0x04] -# CHECK: vld1.16 {d0[0]}, [r5, :16], r10 @ encoding: [0xa5,0xf9,0x1a,0x04] -# CHECK: vld1.16 {d0[0]}, [lr, :16], r10 @ encoding: [0xae,0xf9,0x1a,0x04] -# CHECK: vld1.16 {d9[0]}, [r5, :16], r10 @ encoding: [0xa5,0xf9,0x1a,0x94] +# CHECK: vld1.16 {d0[0]}, [r5:16], r0 @ encoding: [0xa5,0xf9,0x10,0x04] +# CHECK: vld1.16 {d0[0]}, [r5:16], r10 @ encoding: [0xa5,0xf9,0x1a,0x04] +# CHECK: vld1.16 {d0[0]}, [lr:16], r10 @ encoding: [0xae,0xf9,0x1a,0x04] +# CHECK: vld1.16 {d9[0]}, [r5:16], r10 @ encoding: [0xa5,0xf9,0x1a,0x94] 0xa0 0xf9 0x20 0x0b 0xa0 0xf9 0x20 0x07 0xa0 0xf9 0x20 0x03 -# CHECK: vld4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0, :128], r0 @ encoding: [0xa0,0xf9,0x20,0x0b] +# CHECK: vld4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0:128], r0 @ encoding: [0xa0,0xf9,0x20,0x0b] # CHECK: vld4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0], r0 @ encoding: [0xa0,0xf9,0x20,0x07] # CHECK: vld4.8 {d0[1], d1[1], d2[1], d3[1]}, [r0], r0 @ encoding: [0xa0,0xf9,0x20,0x03] diff --git a/test/MC/Disassembler/ARM/neont-VST-reencoding.txt b/test/MC/Disassembler/ARM/neont-VST-reencoding.txt index eb3722c..5119d92 100644 --- a/test/MC/Disassembler/ARM/neont-VST-reencoding.txt +++ b/test/MC/Disassembler/ARM/neont-VST-reencoding.txt @@ -28,13 +28,13 @@ 0xc9 0xf9 0xd9 0x94 # CHECK: vst1.16 {d0[0]}, [r0], r0 @ encoding: [0x80,0xf9,0x00,0x04] -# CHECK: vst1.16 {d16[0]}, [r3, :16], r3 @ encoding: [0xc3,0xf9,0x13,0x04] +# CHECK: vst1.16 {d16[0]}, [r3:16], r3 @ encoding: [0xc3,0xf9,0x13,0x04] # CHECK: vst1.16 {d16[1]}, [r4], r3 @ encoding: [0xc4,0xf9,0x43,0x04] -# CHECK: vst1.16 {d16[1]}, [r5, :16], r5 @ encoding: [0xc5,0xf9,0x55,0x04] +# CHECK: vst1.16 {d16[1]}, [r5:16], r5 @ encoding: [0xc5,0xf9,0x55,0x04] # CHECK: vst1.16 {d16[2]}, [r6], r5 @ encoding: [0xc6,0xf9,0x85,0x04] -# CHECK: vst1.16 {d23[2]}, [r7, :16], r5 @ encoding: [0xc7,0xf9,0x95,0x74] +# CHECK: vst1.16 {d23[2]}, [r7:16], r5 @ encoding: [0xc7,0xf9,0x95,0x74] # CHECK: vst1.16 {d24[3]}, [r8], r7 @ encoding: [0xc8,0xf9,0xc7,0x84] -# CHECK: vst1.16 {d25[3]}, [r9, :16], r9 @ encoding: [0xc9,0xf9,0xd9,0x94] +# CHECK: vst1.16 {d25[3]}, [r9:16], r9 @ encoding: [0xc9,0xf9,0xd9,0x94] 0x8a 0xf9 0x01 0xa8 0xcb 0xf9 0x32 0x18 @@ -42,20 +42,20 @@ 0xcd 0xf9 0xb4 0x28 # CHECK: vst1.32 {d10[0]}, [r10], r1 @ encoding: [0x8a,0xf9,0x01,0xa8] -# CHECK: vst1.32 {d17[0]}, [r11, :32], r2 @ encoding: [0xcb,0xf9,0x32,0x18] +# CHECK: vst1.32 {d17[0]}, [r11:32], r2 @ encoding: [0xcb,0xf9,0x32,0x18] # CHECK: vst1.32 {d11[1]}, [r12], r3 @ encoding: [0x8c,0xf9,0x83,0xb8] -# CHECK: vst1.32 {d18[1]}, [sp, :32], r4 @ encoding: [0xcd,0xf9,0xb4,0x28] +# CHECK: vst1.32 {d18[1]}, [sp:32], r4 @ encoding: [0xcd,0xf9,0xb4,0x28] 0x81 0xf9 0x1f 0x44 0x82 0xf9 0x8f 0x30 -# CHECK: vst1.16 {d4[0]}, [r1, :16] @ encoding: [0x81,0xf9,0x1f,0x44] +# CHECK: vst1.16 {d4[0]}, [r1:16] @ encoding: [0x81,0xf9,0x1f,0x44] # CHECK: vst1.8 {d3[4]}, [r2] @ encoding: [0x82,0xf9,0x8f,0x30] 0x83 0xf9 0x1d 0x24 0x84 0xf9 0x8d 0x10 -# CHECK: vst1.16 {d2[0]}, [r3, :16]! @ encoding: [0x83,0xf9,0x1d,0x24] +# CHECK: vst1.16 {d2[0]}, [r3:16]! @ encoding: [0x83,0xf9,0x1d,0x24] # CHECK: vst1.8 {d1[4]}, [r4]! 
@ encoding: [0x84,0xf9,0x8d,0x10] 0x85 0xf9 0x10 0x04 @@ -63,15 +63,15 @@ 0x8e 0xf9 0x1a 0x84 0x85 0xf9 0x1a 0x94 -# CHECK: vst1.16 {d0[0]}, [r5, :16], r0 @ encoding: [0x85,0xf9,0x10,0x04] -# CHECK: vst1.16 {d7[0]}, [r5, :16], r10 @ encoding: [0x85,0xf9,0x1a,0x74] -# CHECK: vst1.16 {d8[0]}, [lr, :16], r10 @ encoding: [0x8e,0xf9,0x1a,0x84] -# CHECK: vst1.16 {d9[0]}, [r5, :16], r10 @ encoding: [0x85,0xf9,0x1a,0x94] +# CHECK: vst1.16 {d0[0]}, [r5:16], r0 @ encoding: [0x85,0xf9,0x10,0x04] +# CHECK: vst1.16 {d7[0]}, [r5:16], r10 @ encoding: [0x85,0xf9,0x1a,0x74] +# CHECK: vst1.16 {d8[0]}, [lr:16], r10 @ encoding: [0x8e,0xf9,0x1a,0x84] +# CHECK: vst1.16 {d9[0]}, [r5:16], r10 @ encoding: [0x85,0xf9,0x1a,0x94] 0x81 0xf9 0x24 0x0b 0x82 0xf9 0x25 0x07 0x83 0xf9 0x26 0x03 -# CHECK: vst4.32 {d0[0], d1[0], d2[0], d3[0]}, [r1, :128], r4 @ encoding: [0x81,0xf9,0x24,0x0b] +# CHECK: vst4.32 {d0[0], d1[0], d2[0], d3[0]}, [r1:128], r4 @ encoding: [0x81,0xf9,0x24,0x0b] # CHECK: vst4.16 {d0[0], d2[0], d4[0], d6[0]}, [r2], r5 @ encoding: [0x82,0xf9,0x25,0x07] # CHECK: vst4.8 {d0[1], d1[1], d2[1], d3[1]}, [r3], r6 @ encoding: [0x83,0xf9,0x26,0x03] diff --git a/test/MC/Disassembler/ARM/neont2.txt b/test/MC/Disassembler/ARM/neont2.txt index 7d7010f..3374578 100644 --- a/test/MC/Disassembler/ARM/neont2.txt +++ b/test/MC/Disassembler/ARM/neont2.txt @@ -1379,7 +1379,7 @@ # CHECK: vtbx.8 d20, {d16, d17, d18, d19}, d21 0x60 0xf9 0x1f 0x07 -# CHECK: vld1.8 {d16}, [r0, :64] +# CHECK: vld1.8 {d16}, [r0:64] 0x60 0xf9 0x4f 0x07 # CHECK: vld1.16 {d16}, [r0] 0x60 0xf9 0x8f 0x07 @@ -1387,37 +1387,37 @@ 0x60 0xf9 0xcf 0x07 # CHECK: vld1.64 {d16}, [r0] 0x60 0xf9 0x1f 0x0a -# CHECK: vld1.8 {d16, d17}, [r0, :64] +# CHECK: vld1.8 {d16, d17}, [r0:64] 0x60 0xf9 0x6f 0x0a -# CHECK: vld1.16 {d16, d17}, [r0, :128] +# CHECK: vld1.16 {d16, d17}, [r0:128] 0x60 0xf9 0x8f 0x0a # CHECK: vld1.32 {d16, d17}, [r0] 0x60 0xf9 0xcf 0x0a # CHECK: vld1.64 {d16, d17}, [r0] 0x60 0xf9 0x1f 0x08 -# CHECK: vld2.8 {d16, d17}, [r0, :64] +# CHECK: vld2.8 {d16, d17}, [r0:64] 0x60 0xf9 0x6f 0x08 -# CHECK: vld2.16 {d16, d17}, [r0, :128] +# CHECK: vld2.16 {d16, d17}, [r0:128] 0x60 0xf9 0x8f 0x08 # CHECK: vld2.32 {d16, d17}, [r0] 0x60 0xf9 0x1f 0x03 -# CHECK: vld2.8 {d16, d17, d18, d19}, [r0, :64] +# CHECK: vld2.8 {d16, d17, d18, d19}, [r0:64] 0x60 0xf9 0x6f 0x03 -# CHECK: vld2.16 {d16, d17, d18, d19}, [r0, :128] +# CHECK: vld2.16 {d16, d17, d18, d19}, [r0:128] 0x60 0xf9 0xbf 0x03 -# CHECK: vld2.32 {d16, d17, d18, d19}, [r0, :256] +# CHECK: vld2.32 {d16, d17, d18, d19}, [r0:256] 0x60 0xf9 0x1f 0x04 -# CHECK: vld3.8 {d16, d17, d18}, [r0, :64] +# CHECK: vld3.8 {d16, d17, d18}, [r0:64] 0x60 0xf9 0x4f 0x04 # CHECK: vld3.16 {d16, d17, d18}, [r0] 0x60 0xf9 0x8f 0x04 # CHECK: vld3.32 {d16, d17, d18}, [r0] 0x60 0xf9 0x1d 0x05 -# CHECK: vld3.8 {d16, d18, d20}, [r0, :64]! +# CHECK: vld3.8 {d16, d18, d20}, [r0:64]! 0x60 0xf9 0x1d 0x15 -# CHECK: vld3.8 {d17, d19, d21}, [r0, :64]! +# CHECK: vld3.8 {d17, d19, d21}, [r0:64]! 0x60 0xf9 0x4d 0x05 # CHECK: vld3.16 {d16, d18, d20}, [r0]! 0x60 0xf9 0x4d 0x15 @@ -1428,15 +1428,15 @@ # CHECK: vld3.32 {d17, d19, d21}, [r0]! 
0x60 0xf9 0x1f 0x00 -# CHECK: vld4.8 {d16, d17, d18, d19}, [r0, :64] +# CHECK: vld4.8 {d16, d17, d18, d19}, [r0:64] 0x60 0xf9 0x6f 0x00 -# CHECK: vld4.16 {d16, d17, d18, d19}, [r0, :128] +# CHECK: vld4.16 {d16, d17, d18, d19}, [r0:128] 0x60 0xf9 0xbf 0x00 -# CHECK: vld4.32 {d16, d17, d18, d19}, [r0, :256] +# CHECK: vld4.32 {d16, d17, d18, d19}, [r0:256] 0x60 0xf9 0x3d 0x01 -# CHECK: vld4.8 {d16, d18, d20, d22}, [r0, :256]! +# CHECK: vld4.8 {d16, d18, d20, d22}, [r0:256]! 0x60 0xf9 0x3d 0x11 -# CHECK: vld4.8 {d17, d19, d21, d23}, [r0, :256]! +# CHECK: vld4.8 {d17, d19, d21, d23}, [r0:256]! 0x60 0xf9 0x4d 0x01 # CHECK: vld4.16 {d16, d18, d20, d22}, [r0]! 0x60 0xf9 0x4d 0x11 @@ -1449,20 +1449,20 @@ 0xe0 0xf9 0x6f 0x00 # CHECK: vld1.8 {d16[3]}, [r0] 0xe0 0xf9 0x9f 0x04 -# CHECK: vld1.16 {d16[2]}, [r0, :16] +# CHECK: vld1.16 {d16[2]}, [r0:16] 0xe0 0xf9 0xbf 0x08 -# CHECK: vld1.32 {d16[1]}, [r0, :32] +# CHECK: vld1.32 {d16[1]}, [r0:32] 0xe0 0xf9 0x3f 0x01 -# CHECK: vld2.8 {d16[1], d17[1]}, [r0, :16] +# CHECK: vld2.8 {d16[1], d17[1]}, [r0:16] 0xe0 0xf9 0x5f 0x05 -# CHECK: vld2.16 {d16[1], d17[1]}, [r0, :32] +# CHECK: vld2.16 {d16[1], d17[1]}, [r0:32] 0xe0 0xf9 0x8f 0x09 # CHECK: vld2.32 {d16[1], d17[1]}, [r0] 0xe0 0xf9 0x6f 0x15 # CHECK: vld2.16 {d17[1], d19[1]}, [r0] 0xe0 0xf9 0x5f 0x19 -# CHECK: vld2.32 {d17[0], d19[0]}, [r0, :64] +# CHECK: vld2.32 {d17[0], d19[0]}, [r0:64] 0xe0 0xf9 0x2f 0x02 # CHECK: vld3.8 {d16[1], d17[1], d18[1]}, [r0] @@ -1495,43 +1495,43 @@ # CHECK: vld3.32 {d0[], d2[], d4[]}, [r4], r5 0xe0 0xf9 0x3f 0x03 -# CHECK: vld4.8 {d16[1], d17[1], d18[1], d19[1]}, [r0, :32] +# CHECK: vld4.8 {d16[1], d17[1], d18[1], d19[1]}, [r0:32] 0xe0 0xf9 0x4f 0x07 # CHECK: vld4.16 {d16[1], d17[1], d18[1], d19[1]}, [r0] 0xe0 0xf9 0xaf 0x0b -# CHECK: vld4.32 {d16[1], d17[1], d18[1], d19[1]}, [r0, :128] +# CHECK: vld4.32 {d16[1], d17[1], d18[1], d19[1]}, [r0:128] 0xe0 0xf9 0x7f 0x07 -# CHECK: vld4.16 {d16[1], d18[1], d20[1], d22[1]}, [r0, :64] +# CHECK: vld4.16 {d16[1], d18[1], d20[1], d22[1]}, [r0:64] 0xe0 0xf9 0x4f 0x1b # CHECK: vld4.32 {d17[0], d19[0], d21[0], d23[0]}, [r0] 0xa4 0xf9 0x0f 0x0f # CHECK: vld4.8 {d0[], d1[], d2[], d3[]}, [r4] 0xa4 0xf9 0x3f 0x0f -# CHECK: vld4.8 {d0[], d2[], d4[], d6[]}, [r4, :32] +# CHECK: vld4.8 {d0[], d2[], d4[], d6[]}, [r4:32] 0xa4 0xf9 0x1d 0x0f -# CHECK: vld4.8 {d0[], d1[], d2[], d3[]}, [r4, :32]! +# CHECK: vld4.8 {d0[], d1[], d2[], d3[]}, [r4:32]! 0xa4 0xf9 0x35 0x0f -# CHECK: vld4.8 {d0[], d2[], d4[], d6[]}, [r4, :32], r5 +# CHECK: vld4.8 {d0[], d2[], d4[], d6[]}, [r4:32], r5 0xa4 0xf9 0x4f 0x0f # CHECK: vld4.16 {d0[], d1[], d2[], d3[]}, [r4] 0xa4 0xf9 0x7f 0x0f -# CHECK: vld4.16 {d0[], d2[], d4[], d6[]}, [r4, :64] +# CHECK: vld4.16 {d0[], d2[], d4[], d6[]}, [r4:64] 0xa4 0xf9 0x5d 0x0f -# CHECK: vld4.16 {d0[], d1[], d2[], d3[]}, [r4, :64]! +# CHECK: vld4.16 {d0[], d1[], d2[], d3[]}, [r4:64]! 0xa4 0xf9 0x75 0x0f -# CHECK: vld4.16 {d0[], d2[], d4[], d6[]}, [r4, :64], r5 +# CHECK: vld4.16 {d0[], d2[], d4[], d6[]}, [r4:64], r5 0xa4 0xf9 0x8f 0x0f # CHECK: vld4.32 {d0[], d1[], d2[], d3[]}, [r4] 0xa4 0xf9 0xbf 0x0f -# CHECK: vld4.32 {d0[], d2[], d4[], d6[]}, [r4, :64] +# CHECK: vld4.32 {d0[], d2[], d4[], d6[]}, [r4:64] 0xa4 0xf9 0xdd 0x0f -# CHECK: vld4.32 {d0[], d1[], d2[], d3[]}, [r4, :128]! +# CHECK: vld4.32 {d0[], d1[], d2[], d3[]}, [r4:128]! 
0xa4 0xf9 0xf5 0x0f -# CHECK: vld4.32 {d0[], d2[], d4[], d6[]}, [r4, :128], r5 +# CHECK: vld4.32 {d0[], d2[], d4[], d6[]}, [r4:128], r5 0x40 0xf9 0x1f 0x07 -# CHECK: vst1.8 {d16}, [r0, :64] +# CHECK: vst1.8 {d16}, [r0:64] 0x40 0xf9 0x4f 0x07 # CHECK: vst1.16 {d16}, [r0] 0x40 0xf9 0x8f 0x07 @@ -1539,37 +1539,37 @@ 0x40 0xf9 0xcf 0x07 # CHECK: vst1.64 {d16}, [r0] 0x40 0xf9 0x1f 0x0a -# CHECK: vst1.8 {d16, d17}, [r0, :64] +# CHECK: vst1.8 {d16, d17}, [r0:64] 0x40 0xf9 0x6f 0x0a -# CHECK: vst1.16 {d16, d17}, [r0, :128] +# CHECK: vst1.16 {d16, d17}, [r0:128] 0x40 0xf9 0x8f 0x0a # CHECK: vst1.32 {d16, d17}, [r0] 0x40 0xf9 0xcf 0x0a # CHECK: vst1.64 {d16, d17}, [r0] 0x40 0xf9 0x1f 0x08 -# CHECK: vst2.8 {d16, d17}, [r0, :64] +# CHECK: vst2.8 {d16, d17}, [r0:64] 0x40 0xf9 0x6f 0x08 -# CHECK: vst2.16 {d16, d17}, [r0, :128] +# CHECK: vst2.16 {d16, d17}, [r0:128] 0x40 0xf9 0x8f 0x08 # CHECK: vst2.32 {d16, d17}, [r0] 0x40 0xf9 0x1f 0x03 -# CHECK: vst2.8 {d16, d17, d18, d19}, [r0, :64] +# CHECK: vst2.8 {d16, d17, d18, d19}, [r0:64] 0x40 0xf9 0x6f 0x03 -# CHECK: vst2.16 {d16, d17, d18, d19}, [r0, :128] +# CHECK: vst2.16 {d16, d17, d18, d19}, [r0:128] 0x40 0xf9 0xbf 0x03 -# CHECK: vst2.32 {d16, d17, d18, d19}, [r0, :256] +# CHECK: vst2.32 {d16, d17, d18, d19}, [r0:256] 0x40 0xf9 0x1f 0x04 -# CHECK: vst3.8 {d16, d17, d18}, [r0, :64] +# CHECK: vst3.8 {d16, d17, d18}, [r0:64] 0x40 0xf9 0x4f 0x04 # CHECK: vst3.16 {d16, d17, d18}, [r0] 0x40 0xf9 0x8f 0x04 # CHECK: vst3.32 {d16, d17, d18}, [r0] 0x40 0xf9 0x1d 0x05 -# CHECK: vst3.8 {d16, d18, d20}, [r0, :64]! +# CHECK: vst3.8 {d16, d18, d20}, [r0:64]! 0x40 0xf9 0x1d 0x15 -# CHECK: vst3.8 {d17, d19, d21}, [r0, :64]! +# CHECK: vst3.8 {d17, d19, d21}, [r0:64]! 0x40 0xf9 0x4d 0x05 # CHECK: vst3.16 {d16, d18, d20}, [r0]! 0x40 0xf9 0x4d 0x15 @@ -1580,13 +1580,13 @@ # CHECK: vst3.32 {d17, d19, d21}, [r0]! 0x40 0xf9 0x1f 0x00 -# CHECK: vst4.8 {d16, d17, d18, d19}, [r0, :64] +# CHECK: vst4.8 {d16, d17, d18, d19}, [r0:64] 0x40 0xf9 0x6f 0x00 -# CHECK: vst4.16 {d16, d17, d18, d19}, [r0, :128] +# CHECK: vst4.16 {d16, d17, d18, d19}, [r0:128] 0x40 0xf9 0x3d 0x01 -# CHECK: vst4.8 {d16, d18, d20, d22}, [r0, :256]! +# CHECK: vst4.8 {d16, d18, d20, d22}, [r0:256]! 0x40 0xf9 0x3d 0x11 -# CHECK: vst4.8 {d17, d19, d21, d23}, [r0, :256]! +# CHECK: vst4.8 {d17, d19, d21, d23}, [r0:256]! 0x40 0xf9 0x4d 0x01 # CHECK: vst4.16 {d16, d18, d20, d22}, [r0]! 0x40 0xf9 0x4d 0x11 @@ -1597,15 +1597,15 @@ # CHECK: vst4.32 {d17, d19, d21, d23}, [r0]! 
0xc0 0xf9 0x3f 0x01 -# CHECK: vst2.8 {d16[1], d17[1]}, [r0, :16] +# CHECK: vst2.8 {d16[1], d17[1]}, [r0:16] 0xc0 0xf9 0x5f 0x05 -# CHECK: vst2.16 {d16[1], d17[1]}, [r0, :32] +# CHECK: vst2.16 {d16[1], d17[1]}, [r0:32] 0xc0 0xf9 0x8f 0x09 # CHECK: vst2.32 {d16[1], d17[1]}, [r0] 0xc0 0xf9 0x6f 0x15 # CHECK: vst2.16 {d17[1], d19[1]}, [r0] 0xc0 0xf9 0x5f 0x19 -# CHECK: vst2.32 {d17[0], d19[0]}, [r0, :64] +# CHECK: vst2.32 {d17[0], d19[0]}, [r0:64] 0xc0 0xf9 0x2f 0x02 # CHECK: vst3.8 {d16[1], d17[1], d18[1]}, [r0] @@ -1619,26 +1619,26 @@ # CHECK: vst3.32 {d16[0], d18[0], d20[0]}, [r0] 0xc0 0xf9 0x3f 0x03 -# CHECK: vst4.8 {d16[1], d17[1], d18[1], d19[1]}, [r0, :32] +# CHECK: vst4.8 {d16[1], d17[1], d18[1], d19[1]}, [r0:32] 0xc0 0xf9 0x4f 0x07 # CHECK: vst4.16 {d16[1], d17[1], d18[1], d19[1]}, [r0] 0xc0 0xf9 0xaf 0x0b -# CHECK: vst4.32 {d16[1], d17[1], d18[1], d19[1]}, [r0, :128] +# CHECK: vst4.32 {d16[1], d17[1], d18[1], d19[1]}, [r0:128] 0xc0 0xf9 0xff 0x17 -# CHECK: vst4.16 {d17[3], d19[3], d21[3], d23[3]}, [r0, :64] +# CHECK: vst4.16 {d17[3], d19[3], d21[3], d23[3]}, [r0:64] 0xc0 0xf9 0x4f 0x1b # CHECK: vst4.32 {d17[0], d19[0], d21[0], d23[0]}, [r0] 0x63 0xf9 0x37 0xc9 -# CHECK: vld2.8 {d28, d30}, [r3, :256], r7 +# CHECK: vld2.8 {d28, d30}, [r3:256], r7 # rdar://10798451 0xe7 0xf9 0x32 0x1d -# CHECK vld2.8 {d17[], d19[]}, [r7, :16], r2 +# CHECK vld2.8 {d17[], d19[]}, [r7:16], r2 0xe7 0xf9 0x3d 0x1d -# CHECK vld2.8 {d17[], d19[]}, [r7, :16]! +# CHECK vld2.8 {d17[], d19[]}, [r7:16]! 0xe7 0xf9 0x3f 0x1d -# CHECK vld2.8 {d17[], d19[]}, [r7, :16] +# CHECK vld2.8 {d17[], d19[]}, [r7:16] # rdar://11034702 0x04 0xf9 0x0d 0x87 @@ -2046,9 +2046,9 @@ # rdar://10798451 0xe7 0xf9 0x32 0x1d -# CHECK: vld2.8 {d17[], d19[]}, [r7, :16], r2 +# CHECK: vld2.8 {d17[], d19[]}, [r7:16], r2 0xe7 0xf9 0x3d 0x1d -# CHECK: vld2.8 {d17[], d19[]}, [r7, :16]! +# CHECK: vld2.8 {d17[], d19[]}, [r7:16]! 
0xe7 0xf9 0x3f 0x1d -# CHECK: vld2.8 {d17[], d19[]}, [r7, :16] +# CHECK: vld2.8 {d17[], d19[]}, [r7:16] diff --git a/test/MC/Disassembler/Mips/mips32.txt b/test/MC/Disassembler/Mips/mips32.txt index a193319..7022486 100644 --- a/test/MC/Disassembler/Mips/mips32.txt +++ b/test/MC/Disassembler/Mips/mips32.txt @@ -404,3 +404,9 @@ # CHECK: xori $9, $6, 17767 0x38 0xc9 0x45 0x67 + +# CHECK: .set push +# CHECK: .set mips32r2 +# CHECK: rdhwr $5, $29 +# CHECK: .set pop +0x7c 0x05 0xe8 0x3b diff --git a/test/MC/Disassembler/Mips/mips32_le.txt b/test/MC/Disassembler/Mips/mips32_le.txt index 08b3672..48fa8e2 100644 --- a/test/MC/Disassembler/Mips/mips32_le.txt +++ b/test/MC/Disassembler/Mips/mips32_le.txt @@ -404,3 +404,9 @@ # CHECK: xori $9, $6, 17767 0x67 0x45 0xc9 0x38 + +# CHECK: .set push +# CHECK: .set mips32r2 +# CHECK: rdhwr $5, $29 +# CHECK: .set pop +0x3b 0xe8 0x05 0x7c diff --git a/test/MC/Disassembler/X86/intel-syntax-32.txt b/test/MC/Disassembler/X86/intel-syntax-32.txt new file mode 100644 index 0000000..08bae6e --- /dev/null +++ b/test/MC/Disassembler/X86/intel-syntax-32.txt @@ -0,0 +1,13 @@ +# RUN: llvm-mc --disassemble %s -triple=i386 --output-asm-variant=1 | FileCheck %s + +# CHECK: sgdt +0x0f 0x01 0x00 + +# CHECK: sidt +0x0f 0x01 0x08 + +# CHECK: lgdt +0x0f 0x01 0x10 + +# CHECK: lidt +0x0f 0x01 0x18 diff --git a/test/MC/Disassembler/X86/x86-32.txt b/test/MC/Disassembler/X86/x86-32.txt index 99d49943..76d67d3 100644 --- a/test/MC/Disassembler/X86/x86-32.txt +++ b/test/MC/Disassembler/X86/x86-32.txt @@ -630,3 +630,21 @@ # CHECK: movntss %xmm0, (%edi) 0xf3 0x0f 0x2b 0x07 + +# CHECK: prefetch (%eax) +0x0f 0x0d 0x00 + +# CHECK: prefetchw (%eax) +0x0f 0x0d 0x08 + +# CHECK: adcxl %eax, %eax +0x66 0x0f 0x38 0xf6 0xc0 + +# CHECK: adcxl (%eax), %eax +0x66 0x0f 0x38 0xf6 0x00 + +# CHECK: adoxl %eax, %eax +0xf3 0x0f 0x38 0xf6 0xc0 + +# CHECK: adoxl (%eax), %eax +0xf3 0x0f 0x38 0xf6 0x00 diff --git a/test/MC/Disassembler/X86/x86-64.txt b/test/MC/Disassembler/X86/x86-64.txt index df449a4..1345741 100644 --- a/test/MC/Disassembler/X86/x86-64.txt +++ b/test/MC/Disassembler/X86/x86-64.txt @@ -2,64 +2,64 @@ # Coverage -# CHECK: vcmptrue_usps +# CHECK: vcmptrue_usps 0xc5 0x04 0xc2 0xc7 0x1f -# CHECK: vcmptrue_uspd +# CHECK: vcmptrue_uspd 0xc5 0x05 0xc2 0xc7 0x1f -# CHECK: vcmptrue_usss +# CHECK: vcmptrue_usss 0xc5 0x06 0xc2 0xc7 0x1f -# CHECK: vcmptrue_ussd +# CHECK: vcmptrue_ussd 0xc5 0x07 0xc2 0xc7 0x1f -# CHECK: vcmpeq_uqps +# CHECK: vcmpeq_uqps 0xc5 0x04 0xc2 0xc7 0x08 -# CHECK: vcmpeq_uqpd +# CHECK: vcmpeq_uqpd 0xc5 0x05 0xc2 0xc7 0x08 -# CHECK: vcmpeq_uqss +# CHECK: vcmpeq_uqss 0xc5 0x06 0xc2 0xc7 0x08 -# CHECK: vcmpeq_uqsd +# CHECK: vcmpeq_uqsd 0xc5 0x07 0xc2 0xc7 0x08 -# CHECK: vcmpeqps +# CHECK: vcmpeqps 0xc5 0x04 0xc2 0xc7 0x00 -# CHECK: vcmpeqpd +# CHECK: vcmpeqpd 0xc5 0x05 0xc2 0xc7 0x00 -# CHECK: vcmpeqss +# CHECK: vcmpeqss 0xc5 0x06 0xc2 0xc7 0x00 -# CHECK: vcmpeqsd +# CHECK: vcmpeqsd 0xc5 0x07 0xc2 0xc7 0x00 -# CHECK: cmpeqps +# CHECK: cmpeqps 0x0f 0xc2 0xc7 0x00 -# CHECK: cmpeqpd +# CHECK: cmpeqpd 0x66 0x0f 0xc2 0xc7 0x00 -# CHECK: cmpeqss +# CHECK: cmpeqss 0xf3 0x0f 0xc2 0xc7 0x00 -# CHECK: cmpeqsd +# CHECK: cmpeqsd 0xf2 0x0f 0xc2 0xc7 0x00 -# CHECK: cmpordps +# CHECK: cmpordps 0x0f 0xc2 0xc7 0x07 -# CHECK: cmpordpd +# CHECK: cmpordpd 0x66 0x0f 0xc2 0xc7 0x07 -# CHECK: cmpordss +# CHECK: cmpordss 0xf3 0x0f 0xc2 0xc7 0x07 -# CHECK: cmpordsd +# CHECK: cmpordsd 0xf2 0x0f 0xc2 0xc7 0x07 # CHECK: extrq $2, $3, %xmm0 @@ -79,3 +79,27 @@ # CHECK: movntss %xmm0, (%rdi) 0xf3 0x0f 0x2b 0x07 + 
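+# (Reviewer note, not part of the patch: the ADCX and ADOX checks below share +# the 0F 38 F6 opcode; a 0x66 prefix selects adcx, 0xf3 selects adox, and a +# REX.W prefix (0x48) widens them to 64-bit operands.)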
+# CHECK: adcxl %eax, %eax +0x66 0x0f 0x38 0xf6 0xc0 + +# CHECK: adcxl (%rax), %eax +0x66 0x0f 0x38 0xf6 0x00 + +# CHECK: adcxq %rax, %rax +0x66 0x48 0x0f 0x38 0xf6 0xc0 + +# CHECK: adcxq (%rax), %rax +0x66 0x48 0x0f 0x38 0xf6 0x00 + +# CHECK: adoxl %eax, %eax +0xf3 0x0f 0x38 0xf6 0xc0 + +# CHECK: adoxl (%rax), %eax +0xf3 0x0f 0x38 0xf6 0x00 + +# CHECK: adoxq %rax, %rax +0xf3 0x48 0x0f 0x38 0xf6 0xc0 + +# CHECK: adoxq (%rax), %rax +0xf3 0x48 0x0f 0x38 0xf6 0x00 diff --git a/test/MC/Disassembler/XCore/xcore.txt b/test/MC/Disassembler/XCore/xcore.txt index f6b9c90..8ad7588 100644 --- a/test/MC/Disassembler/XCore/xcore.txt +++ b/test/MC/Disassembler/XCore/xcore.txt @@ -21,6 +21,57 @@ # CHECK: waiteu 0xec 0x07 +# CHECK: dcall +0xfc 0x07 + +# CHECK: dentsp +0xec 0x17 + +# CHECK: drestsp +0xed 0x17 + +# CHECK: dret +0xfe 0x07 + +# CHECK: freet +0xef 0x07 + +# CHECK: get r11, kep +0xef 0x17 + +# CHECK: get r11, ksp +0xfc 0x17 + +# CHECK: kret +0xfd 0x07 + +# CHECK: ldw et, sp[4] +0xfe 0x17 + +# CHECK: ldw sed, sp[3] +0xfd 0x17 + +# CHECK: ldw spc, sp[1] +0xec 0x0f + +# CHECK: ldw ssr, sp[2] +0xee 0x0f + +# CHECK: set kep, r11 +0xff 0x07 + +# CHECK: stw et, sp[4] +0xfd 0x0f + +# CHECK: stw sed, sp[3] +0xfc 0x0f + +# CHECK: stw spc, sp[1] +0xed 0x0f + +# CHECK: stw ssr, sp[2] +0xef 0x0f + # 1r instructions # CHECK: msync res[r0] @@ -59,6 +110,33 @@ # CHECK: eeu res[r11] 0xfb 0x07 +# CHECK: set dp, r5 +0xe5 0x37 + +# CHECK: set cp, r0 +0xf0 0x37 + +# CHECK: dgetreg r11 +0xeb 0x3f + +# CHECK: edu res[r8] +0xe8 0x07 + +# CHECK: kcall r2 +0xe2 0x47 + +# CHECK: waitef r10 +0xfa 0x0f + +# CHECK: waitet r7 +0xe7 0x0f + +# CHECK: start t[r4] +0xe4 0x1f + +# CHECK: clrpt res[r9] +0xe9 0x87 + # 2r instructions # CHECK: not r1, r8 @@ -139,6 +217,15 @@ # CHECK: sext r9, r1 0x45 0x37 +# CHECK: tsetmr r7, r3 +0x1f 0x1f + +# CHECK: eef r1, res[r6] +0x96 0x2f + +# CHECK: eet r11, res[r0] +0x5c 0x27 + # rus instructions # CHECK: chkct res[r1], 8 @@ -196,3 +283,359 @@ # CHECK: settw res[r7], r2 0x9b 0xff 0xec 0x27 + +# CHECK: getd r8, res[r3] +0x53 0xff 0xec 0x1f + +# CHECK: getn r10, res[r11] +0xbb 0xff 0xec 0x37 + +# CHECK: testlcl r2, res[r0] +0xc8 0xfe 0xec 0x27 + +# CHECK: setn res[r9], r7 +0x6d 0xff 0xec 0x37 + +# 3r instructions + +# CHECK: add r1, r2, r3 +0x1b 0x10 + +# CHECK: and r11, r10, r9 +0xb9 0x3e + +# CHECK: eq r6, r1, r2 +0x66 0x30 + +# CHECK: ld16s r8, r3[r4] +0xcc 0x82 + +# CHECK: ld8u r9, r1[r10] +0x16 0x8d + +# CHECK: ldw r9, r4[r5] +0x91 0x4b + +# CHECK: lss r7, r3, r0 +0x7c 0xc0 + +# CHECK: lsu r5, r8, r6 +0x12 0xcc + +# CHECK: or r1, r3, r2 +0x1e 0x40 + +# CHECK: shl r8, r2, r4 +0xc8 0x22 + +# CHECK: shr r9, r7, r1 +0x5d 0x29 + +# CHECK: sub r4, r2, r5 +0x89 0x1a + +# CHECK: set t[r0]:r1, r2 +0x18 0xb8 + +# 2rus instructions + +# CHECK: add r10, r2, 5 +0xe9 0x92 + +# CHECK: eq r2, r1, 0 +0x24 0xb0 + +# CHECK: ldw r5, r6[1] +0x19 0x09 + +# CHECK: shl r6, r5, 24 +0xa6 0xa5 + +# CHECK: shr r3, r8, 5 +0xf1 0xab + +# CHECK: stw r3, r2[0] +0x38 0x00 + +# CHECK: sub r2, r4, 11 +0x63 0x9d + +# l3r instructions + +# CHECK: ashr r5, r1, r11 +0xd7 0xfc 0xec 0x17 + +# CHECK: crc32 r5, r6, r1 +0x19 0xf9 0xec 0xaf + +# CHECK: divu r9, r1, r3 +0x97 0xf8 0xec 0x4f + +# CHECK: divs r6, r7, r2 +0x2e 0xf9 0xec 0x47 + +# CHECK: lda16 r11, r2[r1] +0xb9 0xf8 0xec 0x2f + +# CHECK: lda16 r9, r3[-r11] +0x1f 0xfd 0xec 0x37 + +# CHECK: ldaw r9, r1[r2] +0x96 0xf8 0xec 0x1f + +# CHECK: ldaw r8, r7[r11] +0xcf 0xfd 0xec 0x1f + +# CHECK: mul r0, r4, r2 +0xc2 0xf8 0xec 0x3f + +# CHECK: remu r1, r2, r3 +0x1b 0xf8 0xec 0xcf + 
+# CHECK: rems r11, r10, r9 +0xb9 0xfe 0xec 0xc7 + +# CHECK: st16 r5, r3[r8] +0xdc 0xfc 0xec 0x87 + +# CHECK: stw r7, r10[r1] +0xf9 0xf9 0xec 0x07 + +# CHECK: xor r4, r3, r9 +0xcd 0xfc 0xec 0x0f + +# l2rus instructions + +# CHECK: ashr r5, r1, 3 +0x57 0xf8 0xec 0x97 + +# CHECK: ldaw r11, r10[6] +0x7a 0xfc 0xec 0x9f + +# CHECK: ldaw r8, r2[-9] +0x09 0xfd 0xec 0xa7 + +# CHECK: inpw r6, res[r1], 8 +0xe4 0xfc 0xee 0x97 + +# CHECK: outpw res[r3], r0, 2 +0x0e 0xf8 0xed 0x97 + +# ru6 / lru6 instructions + +# CHECK: bt r6, -5 +0x85 0x75 + +# CHECK: bt r10, -451 +0x07 0xf0 0x83 0x76 + +# CHECK: bt r8, 10 +0x0a 0x72 + +# CHECK: bt r1, 6451 +0x64 0xf0 0x73 0x70 + +# CHECK: bf r5, 8 +0x48 0x79 + +# CHECK: bf r6, 65 +0x01 0xf0 0x81 0x79 + +# CHECK: bf r1, 53 +0x75 0x78 + +# CHECK: bf r10, 101 +0x01 0xf0 0xa5 0x7a + +# CHECK: ldaw r11, dp[63] +0xff 0x62 + +# CHECK: ldaw r1, dp[456] +0x07 0xf0 0x48 0x60 + +# CHECK: ldaw r3, sp[2] +0xc2 0x64 + +# CHECK: ldaw r8, sp[65535] +0xff 0xf3 0x3f 0x66 + +# CHECK: ldc r3, 30 +0xde 0x68 + +# CHECK: ldc r11, 1000 +0x0f 0xf0 0xe8 0x6a + +# CHECK: ldw r0, cp[4] +0x04 0x6c + +# CHECK: ldw r1, cp[32345] +0xf9 0xf1 0x59 0x6c + +# CHECK: ldw r10, dp[16] +0x90 0x5a + +# CHECK: ldw r10, dp[76] +0x01 0xf0 0x8c 0x5a + +# CHECK: ldw r8, sp[51] +0x33 0x5e + +# CHECK: ldw r8, sp[1225] +0x13 0xf0 0x09 0x5e + +# CHECK: setc res[r5], 36 +0x64 0xe9 + +# CHECK: setc res[r2], 40312 +0x75 0xf2 0xb8 0xe8 + +# CHECK: stw r8, dp[14] +0x0e 0x52 + +# CHECK: stw r9, dp[654] +0x0a 0xf0 0x4e 0x52 + +# CHECK: stw r1, sp[32] +0x60 0x54 + +# CHECK: stw r0, sp[8761] +0x88 0xf0 0x39 0x54 + +# u6 / lu6 instructions + +# CHECK: bu -20 +0x14 0x77 + +# CHECK: bu -1000 +0x0f 0xf0 0x28 0x77 + +# CHECK: bu 24 +0x18 0x73 + +# CHECK: bu 2231 +0x22 0xf0 0x37 0x73 + +# CHECK: extsp 9 +0x89 0x77 + +# CHECK: extsp 5721 +0x59 0xf0 0x99 0x77 + +# CHECK: clrsr 60 +0x3c 0x7b + +# CHECK: clrsr 64391 +0xee 0xf3 0x07 0x7b + +# CHECK: entsp 1 +0x41 0x77 + +# CHECK: entsp 70 +0x01 0xf0 0x46 0x77 + +# CHECK: ldaw r11, cp[5] +0x45 0x7f + +# CHECK: ldaw r11, cp[33000] +0x03 0xf2 0x68 0x7f + +# CHECK: retsp 40 +0xe8 0x77 + +# CHECK: retsp 52010 +0x2c 0xf3 0xea 0x77 + +# CHECK: setsr 42 +0x6a 0x7b + +# CHECK: setsr 21863 +0x55 0xf1 0x67 0x7b + +# CHECK: extdp 4 +0x84 0x73 + +# CHECK: extdp 554 +0x08 0xf0 0xaa 0x73 + +# CHECK: blat 9 +0x49 0x73 + +# CHECK: blat 61212 +0xbc 0xf3 0x5c 0x73 + +# CHECK: getsr r11, 54 +0x36 0x7f + +# CHECK: getsr r11, 442 +0x06 0xf0 0x3a 0x7f + +# CHECK: kcall 11 +0xcb 0x73 + +# CHECK: kcall 4001 +0x3e 0xf0 0xe1 0x73 + +# CHECK: kentsp 22 +0x96 0x7b + +# CHECK: kentsp 8793 +0x89 0xf0 0x99 0x7b + +# CHECK: krestsp 0 +0xc0 0x7b + +# CHECK: krestsp 55312 +0x60 0xf3 0xd0 0x7b + +# u10 / lu10 instructions + +# CHECK: ldap r11, 40 +0x28 0xd8 + +# CHECK: ldap r11, 53112 +0x33 0xf0 0x78 0xdb + +# CHECK: bl 8 +0x08 0xd0 + +# CHECK: bl 38631 +0x25 0xf0 0xe7 0xd2 + +# CHECK: bla cp[500] +0xf4 0xe1 + +# CHECK: bla cp[413742] +0x94 0xf1 0x2e 0xe0 + +# CHECK: ldw r11, cp[132] +0x84 0xe4 + +# CHECK: ldw r11, cp[3444] +0x35 0xf0 0xf4 0x6e + +# l6r instructions + +# CHECK: lmul r11, r0, r2, r5, r8, r10 +0xf9 0xfa 0x02 0x06 + +# l5r instructions + +# CHECK: ladd r10, r2, r5, r1, r7 +0xe5 0xf8 0xfb 0x06 + +# CHECK: ldivu r5, r6, r3, r9, r8 +0x54 0xfe 0x0b 0x07 + +# CHECK: lsub r1, r8, r7, r11, r5 +0xcf 0xfd 0x85 0x0f + +# l4r instructions + +# CHECK: crc8 r6, r3, r4, r11 +0x73 0xfd 0xe6 0x07 + +# CHECK: maccs r11, r8, r2, r4 +0xf8 0xfa 0xe8 0x0f + +# CHECK: maccu r0, r2, r5, r8 +0x44 0xfd 0xf2 0x07 diff --git 
a/test/MC/ELF/comp-dir.s b/test/MC/ELF/comp-dir.s index 50d10eb..59e3d7d 100644 --- a/test/MC/ELF/comp-dir.s +++ b/test/MC/ELF/comp-dir.s @@ -1,5 +1,5 @@ // RUN: llvm-mc -triple=x86_64-linux-unknown -g -fdebug-compilation-dir=/test/comp/dir %s -filetype=obj -o %t.o -// RUN: llvm-dwarfdump %t.o | FileCheck %s +// RUN: llvm-dwarfdump -debug-dump=info %t.o | FileCheck %s // CHECK: DW_AT_comp_dir [DW_FORM_string] ("{{([A-Za-z]:.*)?}}/test/comp/dir") diff --git a/test/MC/MachO/bad-dollar.s b/test/MC/MachO/bad-dollar.s new file mode 100644 index 0000000..fd72ed0 --- /dev/null +++ b/test/MC/MachO/bad-dollar.s @@ -0,0 +1,5 @@ +// RUN: not llvm-mc -triple x86_64-apple-darwin10 %s 2> %t.err > %t +// RUN: FileCheck --check-prefix=CHECK-ERROR < %t.err %s + +.long $1 +// CHECK-ERROR: 4:7: error: invalid token in expression diff --git a/test/MC/MachO/bad-macro.s b/test/MC/MachO/bad-macro.s new file mode 100644 index 0000000..0aaba09 --- /dev/null +++ b/test/MC/MachO/bad-macro.s @@ -0,0 +1,14 @@ +// RUN: llvm-mc -triple x86_64-apple-darwin10 %s 2> %t.err > %t +// RUN: FileCheck --check-prefix=CHECK-OUTPUT < %t %s +// RUN: FileCheck --check-prefix=CHECK-ERROR < %t.err %s + +.macro test_macro reg1, reg2 +mov $1, %eax +mov $2, %eax +.endmacro +test_macro %ebx, %ecx + +// CHECK-ERROR: 5:1: warning: macro defined with named parameters which are not used in macro body, possible positional parameter found in body which will have no effect + +// CHECK-OUTPUT: movl $1, %eax +// CHECK-OUTPUT: movl $2, %eax diff --git a/test/MC/MachO/gen-dwarf-cpp.s b/test/MC/MachO/gen-dwarf-cpp.s index cb749f4..e42a63a 100644 --- a/test/MC/MachO/gen-dwarf-cpp.s +++ b/test/MC/MachO/gen-dwarf-cpp.s @@ -1,5 +1,5 @@ // RUN: llvm-mc -g -triple i386-apple-darwin10 %s -filetype=obj -o %t -// RUN: llvm-dwarfdump %t | FileCheck %s +// RUN: llvm-dwarfdump -debug-dump=line %t | FileCheck %s # 100 "t.s" 1 .globl _bar diff --git a/test/MC/MachO/gen-dwarf-macro-cpp.s b/test/MC/MachO/gen-dwarf-macro-cpp.s index 05a449b..6177814 100644 --- a/test/MC/MachO/gen-dwarf-macro-cpp.s +++ b/test/MC/MachO/gen-dwarf-macro-cpp.s @@ -1,5 +1,5 @@ // RUN: llvm-mc -g -triple i386-apple-darwin10 %s -filetype=obj -o %t -// RUN: llvm-dwarfdump %t | FileCheck %s +// RUN: llvm-dwarfdump -debug-dump=line %t | FileCheck %s # 1 "foo.S" 2 .macro switcher diff --git a/test/MC/MachO/gen-dwarf-producer.s b/test/MC/MachO/gen-dwarf-producer.s new file mode 100644 index 0000000..f7388db --- /dev/null +++ b/test/MC/MachO/gen-dwarf-producer.s @@ -0,0 +1,8 @@ +// RUN: env DEBUG_PRODUCER="my producer" llvm-mc -g -triple i386-apple-darwin10 %s -filetype=obj -o %t +// RUN: llvm-dwarfdump -debug-dump=info %t | FileCheck %s + +.globl _bar +_bar: + ret + +// CHECK: DW_AT_producer [DW_FORM_string] ("my producer") diff --git a/test/MC/MachO/gen-dwarf.s b/test/MC/MachO/gen-dwarf.s index cf2d1db..d763dd1 100644 --- a/test/MC/MachO/gen-dwarf.s +++ b/test/MC/MachO/gen-dwarf.s @@ -1,5 +1,5 @@ // RUN: llvm-mc -g -triple i386-apple-darwin10 %s -filetype=obj -o %t -// RUN: llvm-dwarfdump %t | FileCheck %s +// RUN: llvm-dwarfdump -debug-dump=all %t | FileCheck %s .globl _bar _bar: diff --git a/test/MC/MachO/linker-option-1.s b/test/MC/MachO/linker-option-1.s new file mode 100644 index 0000000..a01cab7 --- /dev/null +++ b/test/MC/MachO/linker-option-1.s @@ -0,0 +1,21 @@ +// RUN: not llvm-mc -triple x86_64-apple-darwin10 %s 2> %t.err > %t +// RUN: FileCheck --check-prefix=CHECK-OUTPUT < %t %s +// RUN: FileCheck --check-prefix=CHECK-ERROR < %t.err %s + +// CHECK-OUTPUT: .linker_option "a" 
+.linker_option "a" +// CHECK-OUTPUT: .linker_option "a", "b" +.linker_option "a", "b" +// CHECK-OUTPUT-NOT: .linker_option +// CHECK-ERROR: expected string in '.linker_option' directive +// CHECK-ERROR: .linker_option 10 +// CHECK-ERROR: ^ +.linker_option 10 +// CHECK-ERROR: expected string in '.linker_option' directive +// CHECK-ERROR: .linker_option "a", +// CHECK-ERROR: ^ +.linker_option "a", +// CHECK-ERROR: unexpected token in '.linker_option' directive +// CHECK-ERROR: .linker_option "a" "b" +// CHECK-ERROR: ^ +.linker_option "a" "b" diff --git a/test/MC/MachO/linker-option-2.s b/test/MC/MachO/linker-option-2.s new file mode 100644 index 0000000..bb5966b --- /dev/null +++ b/test/MC/MachO/linker-option-2.s @@ -0,0 +1,25 @@ +// RUN: llvm-mc -n -triple x86_64-apple-darwin10 %s -filetype=obj | macho-dump | FileCheck %s + +// CHECK: ('load_commands_size', 104) +// CHECK: ('load_commands', [ +// CHECK: # Load Command 1 +// CHECK: (('command', 45) +// CHECK: ('size', 16) +// CHECK: ('count', 1) +// CHECK: ('_strings', [ +// CHECK: "a", +// CHECK: ]) +// CHECK: ), +// CHECK: # Load Command 2 +// CHECK: (('command', 45) +// CHECK: ('size', 16) +// CHECK: ('count', 2) +// CHECK: ('_strings', [ +// CHECK: "a", +// CHECK: "b", +// CHECK: ]) +// CHECK: ), +// CHECK: ]) + +.linker_option "a" +.linker_option "a", "b" diff --git a/test/MC/MachO/linker-options.ll b/test/MC/MachO/linker-options.ll new file mode 100644 index 0000000..827adfd --- /dev/null +++ b/test/MC/MachO/linker-options.ll @@ -0,0 +1,43 @@ +; RUN: llc -O0 -mtriple=x86_64-apple-darwin -o - %s > %t +; RUN: FileCheck --check-prefix=CHECK-ASM < %t %s + +; CHECK-ASM: .linker_option "-lz" +; CHECK-ASM-NEXT: .linker_option "-framework", "Cocoa" + +; RUN: llc -O0 -mtriple=x86_64-apple-darwin -filetype=obj -o - %s | macho-dump > %t +; RUN: FileCheck --check-prefix=CHECK-OBJ < %t %s + +; CHECK-OBJ: ('load_commands', [ +; CHECK-OBJ: # Load Command 1 +; CHECK-OBJ: (('command', 45) +; CHECK-OBJ: ('size', 16) +; CHECK-OBJ: ('count', 1) +; CHECK-OBJ: ('_strings', [ +; CHECK-OBJ: "-lz", +; CHECK-OBJ: ]) +; CHECK-OBJ: ), +; CHECK-OBJ: # Load Command 2 +; CHECK-OBJ: (('command', 45) +; CHECK-OBJ: ('size', 32) +; CHECK-OBJ: ('count', 2) +; CHECK-OBJ: ('_strings', [ +; CHECK-OBJ: "-framework", +; CHECK-OBJ: "Cocoa", +; CHECK-OBJ: ]) +; CHECK-OBJ: # Load Command 3 +; CHECK-OBJ: (('command', 45) +; CHECK-OBJ: ('size', 24) +; CHECK-OBJ: ('count', 1) +; CHECK-OBJ: ('_strings', [ +; CHECK-OBJ: "-lmath", +; CHECK-OBJ: ]) +; CHECK-OBJ: ), +; CHECK-OBJ: ]) + +!0 = metadata !{ i32 6, metadata !"Linker Options", + metadata !{ + metadata !{ metadata !"-lz" }, + metadata !{ metadata !"-framework", metadata !"Cocoa" }, + metadata !{ metadata !"-lmath" } } } + +!llvm.module.flags = !{ !0 } diff --git a/test/MC/Mips/elf-gprel-32-64.ll b/test/MC/Mips/elf-gprel-32-64.ll new file mode 100644 index 0000000..b946822 --- /dev/null +++ b/test/MC/Mips/elf-gprel-32-64.ll @@ -0,0 +1,37 @@ +; RUN: llc -filetype=obj -march=mips64el -mcpu=mips64 %s -o - \ +; RUN: | elf-dump --dump-section-data \ +; RUN: | FileCheck %s + +define i32 @test(i32 %c) nounwind { +entry: + switch i32 %c, label %sw.default [ + i32 0, label %sw.bb + i32 1, label %sw.bb2 + i32 2, label %sw.bb5 + i32 3, label %sw.bb8 + ] + +sw.bb: + br label %return +sw.bb2: + br label %return +sw.bb5: + br label %return +sw.bb8: + br label %return +sw.default: + br label %return + +return: + %retval.0 = phi i32 [ -1, %sw.default ], [ 7, %sw.bb8 ], [ 2, %sw.bb5 ], [ 3, %sw.bb2 ], [ 1, %sw.bb ] + ret i32 %retval.0 +} + +; 
Check that the appropriate relocations were created. + +; R_MIPS_GPREL32/R_MIPS_64/R_MIPS_NONE +; CHECK: (('sh_name', 0x{{[a-z0-9]+}}) # '.rela.rodata' +; CHECK: ('r_type3', 0x00) +; CHECK-NEXT: ('r_type2', 0x12) +; CHECK-NEXT: ('r_type', 0x0c) + diff --git a/test/MC/Mips/elf-reginfo.ll b/test/MC/Mips/elf-reginfo.ll new file mode 100644 index 0000000..1d7a188 --- /dev/null +++ b/test/MC/Mips/elf-reginfo.ll @@ -0,0 +1,31 @@ + ; RUN: llc -filetype=obj -march=mips64el -mcpu=mips64 %s -o - \ + ; RUN: | elf-dump --dump-section-data | FileCheck --check-prefix=CHECK_64 %s + ; RUN: llc -filetype=obj -march=mipsel -mcpu=mips32 %s -o - \ + ; RUN: | elf-dump --dump-section-data | FileCheck --check-prefix=CHECK_32 %s + +; Check for register information sections. +; + +@str = private unnamed_addr constant [12 x i8] c"hello world\00" + +define i32 @main() nounwind { +entry: +; Check that the appropriate register information sections were created. + +; check for .MIPS.options +; CHECK_64: (('sh_name', 0x{{[0-9|a-f]+}}) # '.MIPS.options' +; CHECK_64-NEXT: ('sh_type', 0x7000000d) +; CHECK_64-NEXT: ('sh_flags', 0x0000000008000002) + +; check for .reginfo +; CHECK_32: (('sh_name', 0x{{[0-9|a-f]+}}) # '.reginfo' +; CHECK_32-NEXT: ('sh_type', 0x70000006) +; CHECK_32-NEXT: ('sh_flags', 0x00000002) + + + %puts = tail call i32 @puts(i8* getelementptr inbounds ([12 x i8]* @str, i64 0, i64 0)) + ret i32 0 + +} +declare i32 @puts(i8* nocapture) nounwind + diff --git a/test/MC/Mips/elf_eflags.ll b/test/MC/Mips/elf_eflags.ll new file mode 100644 index 0000000..315cb81 --- /dev/null +++ b/test/MC/Mips/elf_eflags.ll @@ -0,0 +1,66 @@ +; This tests ELF e_flags setting with direct object emission. +; When the assembler is ready, a .s file for it will +; be created. + +; Non-shared (static) means the absence of PIC and/or CPIC.
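+
+; The e_flags values checked below are bitwise ORs of the flags listed
+; next. As a hypothetical worked decomposition (the 0x00001000 bit is
+; presumably the O32 ABI flag, which this list omits):
+;   0x70001003 = 0x70000000 (EF_MIPS_ARCH_32R2)
+;              | 0x00001000 (O32 ABI, assumed)
+;              | 0x00000002 (EF_MIPS_PIC)
+;              | 0x00000001 (EF_MIPS_NOREORDER)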
+ +; EF_MIPS_NOREORDER (0x00000001) is always on by default currently +; EF_MIPS_PIC (0x00000002) +; EF_MIPS_CPIC (0x00000004) - not tested yet +; EF_MIPS_ABI2 (0x00000020) - n32 not tested yet +; EF_MIPS_ARCH_32 (0x50000000) +; EF_MIPS_ARCH_64 (0x60000000) +; EF_MIPS_ARCH_32R2 (0x70000000) +; EF_MIPS_ARCH_64R2 (0x80000000) + +; RUN: llc -filetype=obj -mtriple mipsel-unknown-linux -mcpu=mips32 -relocation-model=static %s -o - | elf-dump --dump-section-data | FileCheck -check-prefix=CHECK-BE32 %s +; RUN: llc -filetype=obj -mtriple mipsel-unknown-linux -mcpu=mips32 %s -o - | elf-dump --dump-section-data | FileCheck -check-prefix=CHECK-BE32_PIC %s +; RUN: llc -filetype=obj -mtriple mipsel-unknown-linux -mcpu=mips32r2 -relocation-model=static %s -o - | elf-dump --dump-section-data | FileCheck -check-prefix=CHECK-BE32R2 %s +; RUN: llc -filetype=obj -mtriple mipsel-unknown-linux -mcpu=mips32r2 %s -o - | elf-dump --dump-section-data | FileCheck -check-prefix=CHECK-BE32R2_PIC %s +; RUN: llc -filetype=obj -mtriple mipsel-unknown-linux -mcpu=mips32r2 -mattr=+micromips -relocation-model=static %s -o - | elf-dump --dump-section-data | FileCheck -check-prefix=CHECK-BE32R2-MICROMIPS %s +; RUN: llc -filetype=obj -mtriple mipsel-unknown-linux -mcpu=mips32r2 -mattr=+micromips %s -o - | elf-dump --dump-section-data | FileCheck -check-prefix=CHECK-BE32R2-MICROMIPS_PIC %s + +; RUN: llc -filetype=obj -mtriple mipsel-unknown-linux -mcpu=mips64 -relocation-model=static %s -o - | elf-dump --dump-section-data | FileCheck -check-prefix=CHECK-BE64 %s +; RUN: llc -filetype=obj -mtriple mipsel-unknown-linux -mcpu=mips64 %s -o - | elf-dump --dump-section-data | FileCheck -check-prefix=CHECK-BE64_PIC %s +; RUN: llc -filetype=obj -mtriple mipsel-unknown-linux -mcpu=mips64r2 -relocation-model=static %s -o - | elf-dump --dump-section-data | FileCheck -check-prefix=CHECK-BE64R2 %s +; RUN: llc -filetype=obj -mtriple mipsel-unknown-linux -mcpu=mips64r2 %s -o - | elf-dump --dump-section-data | FileCheck -check-prefix=CHECK-BE64R2_PIC %s + +; RUN: llc -filetype=obj -mtriple mipsel-unknown-linux -mcpu=mips32r2 -mattr=+mips16 -relocation-model=pic %s -o - | elf-dump --dump-section-data | FileCheck -check-prefix=CHECK-LE32R2-MIPS16 %s + +; 32(R1) bit with NO_REORDER and static +; CHECK-BE32: ('e_flags', 0x50001001) +; +; 32(R1) bit with NO_REORDER and PIC +; CHECK-BE32_PIC: ('e_flags', 0x50001003) +; +; 32R2 bit with NO_REORDER and static +; CHECK-BE32R2: ('e_flags', 0x70001001) +; +; 32R2 bit with NO_REORDER and PIC +; CHECK-BE32R2_PIC: ('e_flags', 0x70001003) +; +; 32R2 bit MICROMIPS with NO_REORDER and static +; CHECK-BE32R2-MICROMIPS: ('e_flags', 0x72001001) +; +; 32R2 bit MICROMIPS with NO_REORDER and PIC +;CHECK-BE32R2-MICROMIPS_PIC: ('e_flags', 0x72001003) +; +; 64(R1) bit with NO_REORDER and static +; CHECK-BE64: ('e_flags', 0x60000001) +; +; 64(R1) bit with NO_REORDER and PIC +; CHECK-BE64_PIC: ('e_flags', 0x60000003) +; +; 64R2 bit with NO_REORDER and static +; CHECK-BE64R2: ('e_flags', 0x80000001) +; +; 64R2 bit with NO_REORDER and PIC +; CHECK-BE64R2_PIC: ('e_flags', 0x80000003) +; +; 32R2 bit MIPS16 with PIC +; CHECK-LE32R2-MIPS16: ('e_flags', 0x74001002) + +define i32 @main() nounwind { +entry: + ret i32 0 +} diff --git a/test/MC/Mips/elf_st_other.ll b/test/MC/Mips/elf_st_other.ll new file mode 100644 index 0000000..f188ce7 --- /dev/null +++ b/test/MC/Mips/elf_st_other.ll @@ -0,0 +1,13 @@ +; This tests value of ELF st_other field for function symbol table entries. 
+; For microMIPS value should be equal to STO_MIPS_MICROMIPS. + +; RUN: llc -filetype=obj -mtriple mipsel-unknown-linux -mcpu=mips32r2 -mattr=+micromips %s -o - | elf-dump --dump-section-data | FileCheck %s + +define i32 @main() nounwind { +entry: + ret i32 0 +} + +; CHECK: 'main' +; CHECK: ('st_other', 0x80) + diff --git a/test/MC/Mips/hilo-addressing.s b/test/MC/Mips/hilo-addressing.s new file mode 100644 index 0000000..28459c2 --- /dev/null +++ b/test/MC/Mips/hilo-addressing.s @@ -0,0 +1,11 @@ +# RUN: llvm-mc -show-encoding -triple mips-unknown-unknown %s | FileCheck %s + + .ent hilo_test + .equ addr, 0xdeadbeef +# CHECK: # encoding: [0x3c,0x04,0xde,0xae] + lui $4,%hi(addr) +# CHECK: # encoding: [0x03,0xe0,0x00,0x08] + jr $31 +# CHECK: # encoding: [0x80,0x82,0xbe,0xef] + lb $2,%lo(addr)($4) + .end hilo_test diff --git a/test/MC/Mips/mips-alu-instructions.s b/test/MC/Mips/mips-alu-instructions.s index 2997782..816138e 100644 --- a/test/MC/Mips/mips-alu-instructions.s +++ b/test/MC/Mips/mips-alu-instructions.s @@ -31,7 +31,7 @@ # CHECK: xori $9, $6, 17767 # encoding: [0x67,0x45,0xc9,0x38] # CHECK: xori $9, $6, 17767 # encoding: [0x67,0x45,0xc9,0x38] # CHECK: wsbh $6, $7 # encoding: [0xa0,0x30,0x07,0x7c] -# CHECK: nor $7, $8, $zero # encoding: [0x27,0x38,0x00,0x01] +# CHECK: not $7, $8 # encoding: [0x27,0x38,0x00,0x01] and $9, $6, $7 and $9, $6, 17767 andi $9, $6, 17767 @@ -78,9 +78,13 @@ # CHECK: multu $3, $5 # encoding: [0x19,0x00,0x65,0x00] # CHECK: sub $9, $6, $7 # encoding: [0x22,0x48,0xc7,0x00] # CHECK: subu $4, $3, $5 # encoding: [0x23,0x20,0x65,0x00] -# CHECK: sub $6, $zero, $7 # encoding: [0x22,0x30,0x07,0x00] -# CHECK: subu $6, $zero, $7 # encoding: [0x23,0x30,0x07,0x00] -# CHECK: add $7, $8, $zero # encoding: [0x20,0x38,0x00,0x01] +# CHECK: neg $6, $7 # encoding: [0x22,0x30,0x07,0x00] +# CHECK: negu $6, $7 # encoding: [0x23,0x30,0x07,0x00] +# CHECK: move $7, $8 # encoding: [0x21,0x38,0x00,0x01] +# CHECK: .set push +# CHECK: .set mips32r2 +# CHECK: rdhwr $5, $29 +# CHECK: .set pop # encoding: [0x3b,0xe8,0x05,0x7c] add $9,$6,$7 add $9,$6,17767 addu $9,$6,-15001 @@ -98,3 +102,4 @@ neg $6,$7 negu $6,$7 move $7,$8 + rdhwr $5, $29 diff --git a/test/MC/Mips/mips-coprocessor-encodings.s b/test/MC/Mips/mips-coprocessor-encodings.s index bad9163..3d638c3 100644 --- a/test/MC/Mips/mips-coprocessor-encodings.s +++ b/test/MC/Mips/mips-coprocessor-encodings.s @@ -1,4 +1,5 @@ -# RUN: llvm-mc %s -triple=mips64-unknown-freebsd -show-encoding | FileCheck --check-prefix=MIPS64 %s +# RUN: llvm-mc %s -triple=mips64-unknown-freebsd -show-encoding \ +# RUN:| FileCheck --check-prefix=MIPS64 %s # MIPS64: dmtc0 $12, $16, 2 # encoding: [0x40,0xac,0x80,0x02] # MIPS64: dmtc0 $12, $16, 0 # encoding: [0x40,0xac,0x80,0x00] diff --git a/test/MC/Mips/mips-jump-instructions.s b/test/MC/Mips/mips-jump-instructions.s index 58250f3..bc2d720 100644 --- a/test/MC/Mips/mips-jump-instructions.s +++ b/test/MC/Mips/mips-jump-instructions.s @@ -56,6 +56,10 @@ end_of_code: # CHECK: nop # encoding: [0x00,0x00,0x00,0x00] # CHECK: jalr $6 # encoding: [0x09,0xf8,0xc0,0x00] # CHECK: nop # encoding: [0x00,0x00,0x00,0x00] +# CHECK: jalr $25 # encoding: [0x09,0xf8,0x20,0x03] +# CHECK: nop # encoding: [0x00,0x00,0x00,0x00] +# CHECK: jalr $10, $11 # encoding: [0x09,0x50,0x60,0x01] +# CHECK: nop # encoding: [0x00,0x00,0x00,0x00] # CHECK: jr $7 # encoding: [0x08,0x00,0xe0,0x00] # CHECK: nop # encoding: [0x00,0x00,0x00,0x00] # CHECK: jr $7 # encoding: [0x08,0x00,0xe0,0x00] @@ -67,6 +71,10 @@ end_of_code: nop jalr $6 nop + jalr $31, $25 
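+# (A note on the two-operand form: jalr $31, $25 spells out the default
+# link register, so it round-trips as plain jalr $25 in the CHECK lines
+# above, while jalr $10, $11 keeps both operands.)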
+ nop + jalr $10, $11 + nop jr $7 nop j $7 diff --git a/test/MC/Mips/mips64-alu-instructions.s b/test/MC/Mips/mips64-alu-instructions.s new file mode 100644 index 0000000..1b4ebdf --- /dev/null +++ b/test/MC/Mips/mips64-alu-instructions.s @@ -0,0 +1,100 @@ +# RUN: llvm-mc %s -triple=mipsel-unknown-linux -show-encoding -mcpu=mips64r2 | FileCheck %s +# Check that the assembler can handle the documented syntax +# for arithmetic and logical instructions. +# CHECK: .section __TEXT,__text,regular,pure_instructions +#------------------------------------------------------------------------------ +# Logical instructions +#------------------------------------------------------------------------------ +# CHECK: and $9, $6, $7 # encoding: [0x24,0x48,0xc7,0x00] +# CHECK: andi $9, $6, 17767 # encoding: [0x67,0x45,0xc9,0x30] +# CHECK: andi $9, $6, 17767 # encoding: [0x67,0x45,0xc9,0x30] +# CHECK: clo $6, $7 # encoding: [0x21,0x30,0xe6,0x70] +# CHECK: clz $6, $7 # encoding: [0x20,0x30,0xe6,0x70] +# CHECK: ins $19, $9, 6, 7 # encoding: [0x84,0x61,0x33,0x7d] +# CHECK: nor $9, $6, $7 # encoding: [0x27,0x48,0xc7,0x00] +# CHECK: or $3, $3, $5 # encoding: [0x25,0x18,0x65,0x00] +# CHECK: ori $9, $6, 17767 # encoding: [0x67,0x45,0xc9,0x34] +# CHECK: rotr $9, $6, 7 # encoding: [0xc2,0x49,0x26,0x00] +# CHECK: rotrv $9, $6, $7 # encoding: [0x46,0x48,0xe6,0x00] +# CHECK: sll $4, $3, 7 # encoding: [0xc0,0x21,0x03,0x00] +# CHECK: sllv $2, $3, $5 # encoding: [0x04,0x10,0xa3,0x00] +# CHECK: slt $3, $3, $5 # encoding: [0x2a,0x18,0x65,0x00] +# CHECK: slti $3, $3, 103 # encoding: [0x67,0x00,0x63,0x28] +# CHECK: slti $3, $3, 103 # encoding: [0x67,0x00,0x63,0x28] +# CHECK: sltiu $3, $3, 103 # encoding: [0x67,0x00,0x63,0x2c] +# CHECK: sltu $3, $3, $5 # encoding: [0x2b,0x18,0x65,0x00] +# CHECK: sra $4, $3, 7 # encoding: [0xc3,0x21,0x03,0x00] +# CHECK: srav $2, $3, $5 # encoding: [0x07,0x10,0xa3,0x00] +# CHECK: srl $4, $3, 7 # encoding: [0xc2,0x21,0x03,0x00] +# CHECK: srlv $2, $3, $5 # encoding: [0x06,0x10,0xa3,0x00] +# CHECK: xor $3, $3, $5 # encoding: [0x26,0x18,0x65,0x00] +# CHECK: xori $9, $6, 17767 # encoding: [0x67,0x45,0xc9,0x38] +# CHECK: xori $9, $6, 17767 # encoding: [0x67,0x45,0xc9,0x38] +# CHECK: wsbh $6, $7 # encoding: [0xa0,0x30,0x07,0x7c] +# CHECK: not $7, $8 # encoding: [0x27,0x38,0x00,0x01] + and $9, $6, $7 + and $9, $6, 17767 + andi $9, $6, 17767 + clo $6, $7 + clz $6, $7 + ins $19, $9, 6,7 + nor $9, $6, $7 + or $3, $3, $5 + ori $9, $6, 17767 + rotr $9, $6, 7 + rotrv $9, $6, $7 + sll $4, $3, 7 + sllv $2, $3, $5 + slt $3, $3, $5 + slt $3, $3, 103 + slti $3, $3, 103 + sltiu $3, $3, 103 + sltu $3, $3, $5 + sra $4, $3, 7 + srav $2, $3, $5 + srl $4, $3, 7 + srlv $2, $3, $5 + xor $3, $3, $5 + xor $9, $6, 17767 + xori $9, $6, 17767 + wsbh $6, $7 + not $7 ,$8 + +#------------------------------------------------------------------------------ +# Arithmetic instructions +#------------------------------------------------------------------------------ + +# CHECK: dadd $9, $6, $7 # encoding: [0x2c,0x48,0xc7,0x00] +# CHECK: daddi $9, $6, 17767 # encoding: [0x67,0x45,0xc9,0x60] +# CHECK: daddiu $9, $6, -15001 # encoding: [0x67,0xc5,0xc9,0x64] +# CHECK: daddi $9, $6, 17767 # encoding: [0x67,0x45,0xc9,0x60] +# CHECK: daddiu $9, $6, -15001 # encoding: [0x67,0xc5,0xc9,0x64] +# CHECK: daddu $9, $6, $7 # encoding: [0x2d,0x48,0xc7,0x00] +# CHECK: madd $6, $7 # encoding: [0x00,0x00,0xc7,0x70] +# CHECK: maddu $6, $7 # encoding: [0x01,0x00,0xc7,0x70] +# CHECK: msub $6, $7 # encoding: [0x04,0x00,0xc7,0x70] +# CHECK: msubu $6, $7 # 
encoding: [0x05,0x00,0xc7,0x70] +# CHECK: mult $3, $5 # encoding: [0x18,0x00,0x65,0x00] +# CHECK: multu $3, $5 # encoding: [0x19,0x00,0x65,0x00] +# CHECK: dsubu $4, $3, $5 # encoding: [0x2f,0x20,0x65,0x00] +# CHECK: move $7, $8 # encoding: [0x2d,0x38,0x00,0x01] +# CHECK: .set push +# CHECK: .set mips32r2 +# CHECK: rdhwr $5, $29 +# CHECK: .set pop # encoding: [0x3b,0xe8,0x05,0x7c] + + dadd $9,$6,$7 + dadd $9,$6,17767 + daddu $9,$6,-15001 + daddi $9,$6,17767 + daddiu $9,$6,-15001 + daddu $9,$6,$7 + madd $6,$7 + maddu $6,$7 + msub $6,$7 + msubu $6,$7 + mult $3,$5 + multu $3,$5 + dsubu $4,$3,$5 + move $7,$8 + rdhwr $5, $29 diff --git a/test/MC/Mips/mips_directives.s b/test/MC/Mips/mips_directives.s index e2f75a8..65d584d 100644 --- a/test/MC/Mips/mips_directives.s +++ b/test/MC/Mips/mips_directives.s @@ -1,16 +1,19 @@ -# RUN: llvm-mc -triple mips-unknown-unknown %s -#this test produces no output so there isS no FileCheck call +# RUN: llvm-mc -show-encoding -triple mips-unknown-unknown %s | FileCheck %s +# $BB0_2: .ent directives_test - .frame $sp,0,$ra - .mask 0x00000000,0 - .fmask 0x00000000,0 - .set noreorder - .set nomacro - .set noat + .frame $sp,0,$ra + .mask 0x00000000,0 + .fmask 0x00000000,0 + .set noreorder + .set nomacro + .set noat $JTI0_0: - .gpword ($BB0_2) - .set at=$12 - .set macro - .set reorder - .end directives_test + .gpword ($BB0_2) + .word 0x77fffffc +# CHECK: $JTI0_0: +# CHECK-NEXT: .4byte 2013265916 + .set at=$12 + .set macro + .set reorder + .set at=$a0 diff --git a/test/MC/Mips/nabi-regs.s b/test/MC/Mips/nabi-regs.s new file mode 100644 index 0000000..9371208 --- /dev/null +++ b/test/MC/Mips/nabi-regs.s @@ -0,0 +1,36 @@ +# OABI (o32, o64) have a different symbolic register +# set for the A and T registers because the NABI allows +# for 4 more register parameters (A registers) offsetting +# the T registers. +# +# For now just check N64 +# RUN: llvm-mc %s -triple=mipsel-unknown-linux -show-encoding \ +# RUN: -mcpu=mips64r2 -arch=mips64 | \ +# RUN: FileCheck %s + +# CHECK: .section __TEXT,__text,regular,pure_instructions + .text +foo: + +# CHECK: add $16, $16, $4 # encoding: [0x02,0x04,0x80,0x20] + add $s0,$s0,$a0 +# CHECK: add $16, $16, $6 # encoding: [0x02,0x06,0x80,0x20] + add $s0,$s0,$a2 +# CHECK: add $16, $16, $7 # encoding: [0x02,0x07,0x80,0x20] + add $s0,$s0,$a3 +# CHECK: add $16, $16, $8 # encoding: [0x02,0x08,0x80,0x20] + add $s0,$s0,$a4 +# CHECK: add $16, $16, $9 # encoding: [0x02,0x09,0x80,0x20] + add $s0,$s0,$a5 +# CHECK: add $16, $16, $10 # encoding: [0x02,0x0a,0x80,0x20] + add $s0,$s0,$a6 +# CHECK: add $16, $16, $11 # encoding: [0x02,0x0b,0x80,0x20] + add $s0,$s0,$a7 +# CHECK: add $16, $16, $12 # encoding: [0x02,0x0c,0x80,0x20] + add $s0,$s0,$t0 +# CHECK: add $16, $16, $13 # encoding: [0x02,0x0d,0x80,0x20] + add $s0,$s0,$t1 +# CHECK: add $16, $16, $14 # encoding: [0x02,0x0e,0x80,0x20] + add $s0,$s0,$t2 +# CHECK: add $16, $16, $15 # encoding: [0x02,0x0f,0x80,0x20] + add $s0,$s0,$t3 diff --git a/test/MC/Mips/set-at-directive.s b/test/MC/Mips/set-at-directive.s new file mode 100644 index 0000000..98a3a35 --- /dev/null +++ b/test/MC/Mips/set-at-directive.s @@ -0,0 +1,132 @@ +# RUN: llvm-mc %s -triple=mipsel-unknown-linux -show-encoding -mcpu=mips32r2 | \ +# RUN: FileCheck %s +# Check that the assembler can handle the documented syntax +# for ".set at" and set the correct value. 
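+# A rough sketch of what each case below exercises: .set at=$reg retargets
+# the symbolic register $at, and jr $at then encodes the chosen register in
+# the rs field (bits 25:21) of the jr encoding. For example:
+#   jr $1 -> (1 << 21) | 0x08 = 0x00200008 (bytes 08 00 20 00, little endian)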
+ +# CHECK: .section __TEXT,__text,regular,pure_instructions + .text +foo: +# CHECK: jr $1 # encoding: [0x08,0x00,0x20,0x00] + .set at=$1 + jr $at + nop +# CHECK: jr $2 # encoding: [0x08,0x00,0x40,0x00] + .set at=$2 + jr $at + nop +# CHECK: jr $3 # encoding: [0x08,0x00,0x60,0x00] + .set at=$3 + jr $at + nop +# CHECK: jr $4 # encoding: [0x08,0x00,0x80,0x00] + .set at=$a0 + jr $at + nop +# CHECK: jr $5 # encoding: [0x08,0x00,0xa0,0x00] + .set at=$a1 + jr $at + nop +# CHECK: jr $6 # encoding: [0x08,0x00,0xc0,0x00] + .set at=$a2 + jr $at + nop +# CHECK: jr $7 # encoding: [0x08,0x00,0xe0,0x00] + .set at=$a3 + jr $at + nop +# CHECK: jr $8 # encoding: [0x08,0x00,0x00,0x01] + .set at=$8 + jr $at + nop +# CHECK: jr $9 # encoding: [0x08,0x00,0x20,0x01] + .set at=$9 + jr $at + nop +# CHECK: jr $10 # encoding: [0x08,0x00,0x40,0x01] + .set at=$10 + jr $at + nop +# CHECK: jr $11 # encoding: [0x08,0x00,0x60,0x01] + .set at=$11 + jr $at + nop +# CHECK: jr $12 # encoding: [0x08,0x00,0x80,0x01] + .set at=$12 + jr $at + nop +# CHECK: jr $13 # encoding: [0x08,0x00,0xa0,0x01] + .set at=$13 + jr $at + nop +# CHECK: jr $14 # encoding: [0x08,0x00,0xc0,0x01] + .set at=$14 + jr $at + nop +# CHECK: jr $15 # encoding: [0x08,0x00,0xe0,0x01] + .set at=$15 + jr $at + nop +# CHECK: jr $16 # encoding: [0x08,0x00,0x00,0x02] + .set at=$s0 + jr $at + nop +# CHECK: jr $17 # encoding: [0x08,0x00,0x20,0x02] + .set at=$s1 + jr $at + nop +# CHECK: jr $18 # encoding: [0x08,0x00,0x40,0x02] + .set at=$s2 + jr $at + nop +# CHECK: jr $19 # encoding: [0x08,0x00,0x60,0x02] + .set at=$s3 + jr $at + nop +# CHECK: jr $20 # encoding: [0x08,0x00,0x80,0x02] + .set at=$s4 + jr $at + nop +# CHECK: jr $21 # encoding: [0x08,0x00,0xa0,0x02] + .set at=$s5 + jr $at + nop +# CHECK: jr $22 # encoding: [0x08,0x00,0xc0,0x02] + .set at=$s6 + jr $at + nop +# CHECK: jr $23 # encoding: [0x08,0x00,0xe0,0x02] + .set at=$s7 + jr $at + nop +# CHECK: jr $24 # encoding: [0x08,0x00,0x00,0x03] + .set at=$24 + jr $at + nop +# CHECK: jr $25 # encoding: [0x08,0x00,0x20,0x03] + .set at=$25 + jr $at + nop +# CHECK: jr $26 # encoding: [0x08,0x00,0x40,0x03] + .set at=$26 + jr $at + nop +# CHECK: jr $27 # encoding: [0x08,0x00,0x60,0x03] + .set at=$27 + jr $at + nop +# CHECK: jr $gp # encoding: [0x08,0x00,0x80,0x03] + .set at=$gp + jr $at + nop +# CHECK: jr $fp # encoding: [0x08,0x00,0xc0,0x03] + .set at=$fp + jr $at + nop +# CHECK: jr $sp # encoding: [0x08,0x00,0xa0,0x03] + .set at=$sp + jr $at + nop +# CHECK: jr $ra # encoding: [0x08,0x00,0xe0,0x03] + .set at=$ra + jr $at + nop diff --git a/test/MC/PowerPC/ppc64-initial-cfa.ll b/test/MC/PowerPC/ppc64-initial-cfa.ll index 0e36fb7..16236c9 100644 --- a/test/MC/PowerPC/ppc64-initial-cfa.ll +++ b/test/MC/PowerPC/ppc64-initial-cfa.ll @@ -20,7 +20,7 @@ entry: ; STATIC-NEXT: ('sh_info', 0x00000000) ; STATIC-NEXT: ('sh_addralign', 0x0000000000000008) ; STATIC-NEXT: ('sh_entsize', 0x0000000000000000) -; STATIC-NEXT: ('_section_data', '00000010 00000000 017a5200 01784101 0b0c0100 00000010 00000018 00000000 00000010 00000000') +; STATIC-NEXT: ('_section_data', '00000010 00000000 017a5200 01784101 1b0c0100 00000010 00000018 00000000 00000010 00000000') ; STATIC: ('sh_name', 0x{{.*}}) # '.rela.eh_frame' ; STATIC-NEXT: ('sh_type', 0x00000004) @@ -34,11 +34,11 @@ entry: ; STATIC-NEXT: ('sh_entsize', 0x0000000000000018) ; STATIC-NEXT: ('_relocations', [ -; Static build should create R_PPC64_ADDR32 relocations +; Static build should create R_PPC64_REL32 relocations ; STATIC-NEXT: # Relocation 0 ; STATIC-NEXT: (('r_offset', 
0x000000000000001c) ; STATIC-NEXT: ('r_sym', 0x{{.*}}) -; STATIC-NEXT: ('r_type', 0x00000001) +; STATIC-NEXT: ('r_type', 0x0000001a) ; STATIC-NEXT: ('r_addend', 0x0000000000000000) ; STATIC-NEXT: ), ; STATIC-NEXT: ]) diff --git a/test/MC/X86/AlignedBundling/autogen-inst-offset-align-to-end.s b/test/MC/X86/AlignedBundling/autogen-inst-offset-align-to-end.s index 7fbb71b..fbf5b52 100644 --- a/test/MC/X86/AlignedBundling/autogen-inst-offset-align-to-end.s +++ b/test/MC/X86/AlignedBundling/autogen-inst-offset-align-to-end.s @@ -354,6 +354,7 @@ INSTRLEN_2_OFFSET_15: .endr .bundle_unlock # CHECK: 3ef: nop +# CHECK: 3f0: nop # CHECK: 3fe: incl .align 32, 0x90 @@ -517,6 +518,7 @@ INSTRLEN_3_OFFSET_14: .endr .bundle_unlock # CHECK: 5ce: nop +# CHECK: 5d0: nop # CHECK: 5dd: incl .align 32, 0x90 @@ -528,6 +530,7 @@ INSTRLEN_3_OFFSET_15: .endr .bundle_unlock # CHECK: 5ef: nop +# CHECK: 5f0: nop # CHECK: 5fd: incl .align 32, 0x90 @@ -680,6 +683,7 @@ INSTRLEN_4_OFFSET_13: .endr .bundle_unlock # CHECK: 7ad: nop +# CHECK: 7b0: nop # CHECK: 7bc: incl .align 32, 0x90 @@ -691,6 +695,7 @@ INSTRLEN_4_OFFSET_14: .endr .bundle_unlock # CHECK: 7ce: nop +# CHECK: 7d0: nop # CHECK: 7dc: incl .align 32, 0x90 @@ -702,6 +707,7 @@ INSTRLEN_4_OFFSET_15: .endr .bundle_unlock # CHECK: 7ef: nop +# CHECK: 7f0: nop # CHECK: 7fc: incl .align 32, 0x90 @@ -843,6 +849,7 @@ INSTRLEN_5_OFFSET_12: .endr .bundle_unlock # CHECK: 98c: nop +# CHECK: 990: nop # CHECK: 99b: incl .align 32, 0x90 @@ -854,6 +861,7 @@ INSTRLEN_5_OFFSET_13: .endr .bundle_unlock # CHECK: 9ad: nop +# CHECK: 9b0: nop # CHECK: 9bb: incl .align 32, 0x90 @@ -865,6 +873,7 @@ INSTRLEN_5_OFFSET_14: .endr .bundle_unlock # CHECK: 9ce: nop +# CHECK: 9d0: nop # CHECK: 9db: incl .align 32, 0x90 @@ -876,6 +885,7 @@ INSTRLEN_5_OFFSET_15: .endr .bundle_unlock # CHECK: 9ef: nop +# CHECK: 9f0: nop # CHECK: 9fb: incl .align 32, 0x90 @@ -1006,6 +1016,7 @@ INSTRLEN_6_OFFSET_11: .endr .bundle_unlock # CHECK: b6b: nop +# CHECK: b70: nop # CHECK: b7a: incl .align 32, 0x90 @@ -1017,6 +1028,7 @@ INSTRLEN_6_OFFSET_12: .endr .bundle_unlock # CHECK: b8c: nop +# CHECK: b90: nop # CHECK: b9a: incl .align 32, 0x90 @@ -1028,6 +1040,7 @@ INSTRLEN_6_OFFSET_13: .endr .bundle_unlock # CHECK: bad: nop +# CHECK: bb0: nop # CHECK: bba: incl .align 32, 0x90 @@ -1039,6 +1052,7 @@ INSTRLEN_6_OFFSET_14: .endr .bundle_unlock # CHECK: bce: nop +# CHECK: bd0: nop # CHECK: bda: incl .align 32, 0x90 @@ -1050,6 +1064,7 @@ INSTRLEN_6_OFFSET_15: .endr .bundle_unlock # CHECK: bef: nop +# CHECK: bf0: nop # CHECK: bfa: incl .align 32, 0x90 @@ -1169,6 +1184,7 @@ INSTRLEN_7_OFFSET_10: .endr .bundle_unlock # CHECK: d4a: nop +# CHECK: d50: nop # CHECK: d59: incl .align 32, 0x90 @@ -1180,6 +1196,7 @@ INSTRLEN_7_OFFSET_11: .endr .bundle_unlock # CHECK: d6b: nop +# CHECK: d70: nop # CHECK: d79: incl .align 32, 0x90 @@ -1191,6 +1208,7 @@ INSTRLEN_7_OFFSET_12: .endr .bundle_unlock # CHECK: d8c: nop +# CHECK: d90: nop # CHECK: d99: incl .align 32, 0x90 @@ -1202,6 +1220,7 @@ INSTRLEN_7_OFFSET_13: .endr .bundle_unlock # CHECK: dad: nop +# CHECK: db0: nop # CHECK: db9: incl .align 32, 0x90 @@ -1213,6 +1232,7 @@ INSTRLEN_7_OFFSET_14: .endr .bundle_unlock # CHECK: dce: nop +# CHECK: dd0: nop # CHECK: dd9: incl .align 32, 0x90 @@ -1224,6 +1244,7 @@ INSTRLEN_7_OFFSET_15: .endr .bundle_unlock # CHECK: def: nop +# CHECK: df0: nop # CHECK: df9: incl .align 32, 0x90 @@ -1332,6 +1353,7 @@ INSTRLEN_8_OFFSET_9: .endr .bundle_unlock # CHECK: f29: nop +# CHECK: f30: nop # CHECK: f38: incl .align 32, 0x90 @@ -1343,6 +1365,7 @@ 
INSTRLEN_8_OFFSET_10: .endr .bundle_unlock # CHECK: f4a: nop +# CHECK: f50: nop # CHECK: f58: incl .align 32, 0x90 @@ -1354,6 +1377,7 @@ INSTRLEN_8_OFFSET_11: .endr .bundle_unlock # CHECK: f6b: nop +# CHECK: f70: nop # CHECK: f78: incl .align 32, 0x90 @@ -1365,6 +1389,7 @@ INSTRLEN_8_OFFSET_12: .endr .bundle_unlock # CHECK: f8c: nop +# CHECK: f90: nop # CHECK: f98: incl .align 32, 0x90 @@ -1376,6 +1401,7 @@ INSTRLEN_8_OFFSET_13: .endr .bundle_unlock # CHECK: fad: nop +# CHECK: fb0: nop # CHECK: fb8: incl .align 32, 0x90 @@ -1387,6 +1413,7 @@ INSTRLEN_8_OFFSET_14: .endr .bundle_unlock # CHECK: fce: nop +# CHECK: fd0: nop # CHECK: fd8: incl .align 32, 0x90 @@ -1398,6 +1425,7 @@ INSTRLEN_8_OFFSET_15: .endr .bundle_unlock # CHECK: fef: nop +# CHECK: ff0: nop # CHECK: ff8: incl .align 32, 0x90 @@ -1495,6 +1523,7 @@ INSTRLEN_9_OFFSET_8: .endr .bundle_unlock # CHECK: 1108: nop +# CHECK: 1110: nop # CHECK: 1117: incl .align 32, 0x90 @@ -1506,6 +1535,7 @@ INSTRLEN_9_OFFSET_9: .endr .bundle_unlock # CHECK: 1129: nop +# CHECK: 1130: nop # CHECK: 1137: incl .align 32, 0x90 @@ -1517,6 +1547,7 @@ INSTRLEN_9_OFFSET_10: .endr .bundle_unlock # CHECK: 114a: nop +# CHECK: 1150: nop # CHECK: 1157: incl .align 32, 0x90 @@ -1528,6 +1559,7 @@ INSTRLEN_9_OFFSET_11: .endr .bundle_unlock # CHECK: 116b: nop +# CHECK: 1170: nop # CHECK: 1177: incl .align 32, 0x90 @@ -1539,6 +1571,7 @@ INSTRLEN_9_OFFSET_12: .endr .bundle_unlock # CHECK: 118c: nop +# CHECK: 1190: nop # CHECK: 1197: incl .align 32, 0x90 @@ -1550,6 +1583,7 @@ INSTRLEN_9_OFFSET_13: .endr .bundle_unlock # CHECK: 11ad: nop +# CHECK: 11b0: nop # CHECK: 11b7: incl .align 32, 0x90 @@ -1561,6 +1595,7 @@ INSTRLEN_9_OFFSET_14: .endr .bundle_unlock # CHECK: 11ce: nop +# CHECK: 11d0: nop # CHECK: 11d7: incl .align 32, 0x90 @@ -1572,6 +1607,7 @@ INSTRLEN_9_OFFSET_15: .endr .bundle_unlock # CHECK: 11ef: nop +# CHECK: 11f0: nop # CHECK: 11f7: incl .align 32, 0x90 @@ -1658,6 +1694,7 @@ INSTRLEN_10_OFFSET_7: .endr .bundle_unlock # CHECK: 12e7: nop +# CHECK: 12f0: nop # CHECK: 12f6: incl .align 32, 0x90 @@ -1669,6 +1706,7 @@ INSTRLEN_10_OFFSET_8: .endr .bundle_unlock # CHECK: 1308: nop +# CHECK: 1310: nop # CHECK: 1316: incl .align 32, 0x90 @@ -1680,6 +1718,7 @@ INSTRLEN_10_OFFSET_9: .endr .bundle_unlock # CHECK: 1329: nop +# CHECK: 1330: nop # CHECK: 1336: incl .align 32, 0x90 @@ -1691,6 +1730,7 @@ INSTRLEN_10_OFFSET_10: .endr .bundle_unlock # CHECK: 134a: nop +# CHECK: 1350: nop # CHECK: 1356: incl .align 32, 0x90 @@ -1702,6 +1742,7 @@ INSTRLEN_10_OFFSET_11: .endr .bundle_unlock # CHECK: 136b: nop +# CHECK: 1370: nop # CHECK: 1376: incl .align 32, 0x90 @@ -1713,6 +1754,7 @@ INSTRLEN_10_OFFSET_12: .endr .bundle_unlock # CHECK: 138c: nop +# CHECK: 1390: nop # CHECK: 1396: incl .align 32, 0x90 @@ -1724,6 +1766,7 @@ INSTRLEN_10_OFFSET_13: .endr .bundle_unlock # CHECK: 13ad: nop +# CHECK: 13b0: nop # CHECK: 13b6: incl .align 32, 0x90 @@ -1735,6 +1778,7 @@ INSTRLEN_10_OFFSET_14: .endr .bundle_unlock # CHECK: 13ce: nop +# CHECK: 13d0: nop # CHECK: 13d6: incl .align 32, 0x90 @@ -1746,6 +1790,7 @@ INSTRLEN_10_OFFSET_15: .endr .bundle_unlock # CHECK: 13ef: nop +# CHECK: 13f0: nop # CHECK: 13f6: incl .align 32, 0x90 @@ -1821,6 +1866,7 @@ INSTRLEN_11_OFFSET_6: .endr .bundle_unlock # CHECK: 14c6: nop +# CHECK: 14d0: nop # CHECK: 14d5: incl .align 32, 0x90 @@ -1832,6 +1878,7 @@ INSTRLEN_11_OFFSET_7: .endr .bundle_unlock # CHECK: 14e7: nop +# CHECK: 14f0: nop # CHECK: 14f5: incl .align 32, 0x90 @@ -1843,6 +1890,7 @@ INSTRLEN_11_OFFSET_8: .endr .bundle_unlock # CHECK: 1508: nop +# 
CHECK: 1510: nop # CHECK: 1515: incl .align 32, 0x90 @@ -1854,6 +1902,7 @@ INSTRLEN_11_OFFSET_9: .endr .bundle_unlock # CHECK: 1529: nop +# CHECK: 1530: nop # CHECK: 1535: incl .align 32, 0x90 @@ -1865,6 +1914,7 @@ INSTRLEN_11_OFFSET_10: .endr .bundle_unlock # CHECK: 154a: nop +# CHECK: 1550: nop # CHECK: 1555: incl .align 32, 0x90 @@ -1876,6 +1926,7 @@ INSTRLEN_11_OFFSET_11: .endr .bundle_unlock # CHECK: 156b: nop +# CHECK: 1570: nop # CHECK: 1575: incl .align 32, 0x90 @@ -1887,6 +1938,7 @@ INSTRLEN_11_OFFSET_12: .endr .bundle_unlock # CHECK: 158c: nop +# CHECK: 1590: nop # CHECK: 1595: incl .align 32, 0x90 @@ -1898,6 +1950,7 @@ INSTRLEN_11_OFFSET_13: .endr .bundle_unlock # CHECK: 15ad: nop +# CHECK: 15b0: nop # CHECK: 15b5: incl .align 32, 0x90 @@ -1909,6 +1962,7 @@ INSTRLEN_11_OFFSET_14: .endr .bundle_unlock # CHECK: 15ce: nop +# CHECK: 15d0: nop # CHECK: 15d5: incl .align 32, 0x90 @@ -1920,6 +1974,7 @@ INSTRLEN_11_OFFSET_15: .endr .bundle_unlock # CHECK: 15ef: nop +# CHECK: 15f0: nop # CHECK: 15f5: incl .align 32, 0x90 @@ -1984,6 +2039,7 @@ INSTRLEN_12_OFFSET_5: .endr .bundle_unlock # CHECK: 16a5: nop +# CHECK: 16b0: nop # CHECK: 16b4: incl .align 32, 0x90 @@ -1995,6 +2051,7 @@ INSTRLEN_12_OFFSET_6: .endr .bundle_unlock # CHECK: 16c6: nop +# CHECK: 16d0: nop # CHECK: 16d4: incl .align 32, 0x90 @@ -2006,6 +2063,7 @@ INSTRLEN_12_OFFSET_7: .endr .bundle_unlock # CHECK: 16e7: nop +# CHECK: 16f0: nop # CHECK: 16f4: incl .align 32, 0x90 @@ -2017,6 +2075,7 @@ INSTRLEN_12_OFFSET_8: .endr .bundle_unlock # CHECK: 1708: nop +# CHECK: 1710: nop # CHECK: 1714: incl .align 32, 0x90 @@ -2028,6 +2087,7 @@ INSTRLEN_12_OFFSET_9: .endr .bundle_unlock # CHECK: 1729: nop +# CHECK: 1730: nop # CHECK: 1734: incl .align 32, 0x90 @@ -2039,6 +2099,7 @@ INSTRLEN_12_OFFSET_10: .endr .bundle_unlock # CHECK: 174a: nop +# CHECK: 1750: nop # CHECK: 1754: incl .align 32, 0x90 @@ -2050,6 +2111,7 @@ INSTRLEN_12_OFFSET_11: .endr .bundle_unlock # CHECK: 176b: nop +# CHECK: 1770: nop # CHECK: 1774: incl .align 32, 0x90 @@ -2061,6 +2123,7 @@ INSTRLEN_12_OFFSET_12: .endr .bundle_unlock # CHECK: 178c: nop +# CHECK: 1790: nop # CHECK: 1794: incl .align 32, 0x90 @@ -2072,6 +2135,7 @@ INSTRLEN_12_OFFSET_13: .endr .bundle_unlock # CHECK: 17ad: nop +# CHECK: 17b0: nop # CHECK: 17b4: incl .align 32, 0x90 @@ -2083,6 +2147,7 @@ INSTRLEN_12_OFFSET_14: .endr .bundle_unlock # CHECK: 17ce: nop +# CHECK: 17d0: nop # CHECK: 17d4: incl .align 32, 0x90 @@ -2094,6 +2159,7 @@ INSTRLEN_12_OFFSET_15: .endr .bundle_unlock # CHECK: 17ef: nop +# CHECK: 17f0: nop # CHECK: 17f4: incl .align 32, 0x90 @@ -2147,6 +2213,7 @@ INSTRLEN_13_OFFSET_4: .endr .bundle_unlock # CHECK: 1884: nop +# CHECK: 1890: nop # CHECK: 1893: incl .align 32, 0x90 @@ -2158,6 +2225,7 @@ INSTRLEN_13_OFFSET_5: .endr .bundle_unlock # CHECK: 18a5: nop +# CHECK: 18b0: nop # CHECK: 18b3: incl .align 32, 0x90 @@ -2169,6 +2237,7 @@ INSTRLEN_13_OFFSET_6: .endr .bundle_unlock # CHECK: 18c6: nop +# CHECK: 18d0: nop # CHECK: 18d3: incl .align 32, 0x90 @@ -2180,6 +2249,7 @@ INSTRLEN_13_OFFSET_7: .endr .bundle_unlock # CHECK: 18e7: nop +# CHECK: 18f0: nop # CHECK: 18f3: incl .align 32, 0x90 @@ -2191,6 +2261,7 @@ INSTRLEN_13_OFFSET_8: .endr .bundle_unlock # CHECK: 1908: nop +# CHECK: 1910: nop # CHECK: 1913: incl .align 32, 0x90 @@ -2202,6 +2273,7 @@ INSTRLEN_13_OFFSET_9: .endr .bundle_unlock # CHECK: 1929: nop +# CHECK: 1930: nop # CHECK: 1933: incl .align 32, 0x90 @@ -2213,6 +2285,7 @@ INSTRLEN_13_OFFSET_10: .endr .bundle_unlock # CHECK: 194a: nop +# CHECK: 1950: nop # CHECK: 1953: incl 
.align 32, 0x90 @@ -2224,6 +2297,7 @@ INSTRLEN_13_OFFSET_11: .endr .bundle_unlock # CHECK: 196b: nop +# CHECK: 1970: nop # CHECK: 1973: incl .align 32, 0x90 @@ -2235,6 +2309,7 @@ INSTRLEN_13_OFFSET_12: .endr .bundle_unlock # CHECK: 198c: nop +# CHECK: 1990: nop # CHECK: 1993: incl .align 32, 0x90 @@ -2246,6 +2321,7 @@ INSTRLEN_13_OFFSET_13: .endr .bundle_unlock # CHECK: 19ad: nop +# CHECK: 19b0: nop # CHECK: 19b3: incl .align 32, 0x90 @@ -2257,6 +2333,7 @@ INSTRLEN_13_OFFSET_14: .endr .bundle_unlock # CHECK: 19ce: nop +# CHECK: 19d0: nop # CHECK: 19d3: incl .align 32, 0x90 @@ -2268,6 +2345,7 @@ INSTRLEN_13_OFFSET_15: .endr .bundle_unlock # CHECK: 19ef: nop +# CHECK: 19f0: nop # CHECK: 19f3: incl .align 32, 0x90 @@ -2310,6 +2388,7 @@ INSTRLEN_14_OFFSET_3: .endr .bundle_unlock # CHECK: 1a63: nop +# CHECK: 1a70: nop # CHECK: 1a72: incl .align 32, 0x90 @@ -2321,6 +2400,7 @@ INSTRLEN_14_OFFSET_4: .endr .bundle_unlock # CHECK: 1a84: nop +# CHECK: 1a90: nop # CHECK: 1a92: incl .align 32, 0x90 @@ -2332,6 +2412,7 @@ INSTRLEN_14_OFFSET_5: .endr .bundle_unlock # CHECK: 1aa5: nop +# CHECK: 1ab0: nop # CHECK: 1ab2: incl .align 32, 0x90 @@ -2343,6 +2424,7 @@ INSTRLEN_14_OFFSET_6: .endr .bundle_unlock # CHECK: 1ac6: nop +# CHECK: 1ad0: nop # CHECK: 1ad2: incl .align 32, 0x90 @@ -2354,6 +2436,7 @@ INSTRLEN_14_OFFSET_7: .endr .bundle_unlock # CHECK: 1ae7: nop +# CHECK: 1af0: nop # CHECK: 1af2: incl .align 32, 0x90 @@ -2365,6 +2448,7 @@ INSTRLEN_14_OFFSET_8: .endr .bundle_unlock # CHECK: 1b08: nop +# CHECK: 1b10: nop # CHECK: 1b12: incl .align 32, 0x90 @@ -2376,6 +2460,7 @@ INSTRLEN_14_OFFSET_9: .endr .bundle_unlock # CHECK: 1b29: nop +# CHECK: 1b30: nop # CHECK: 1b32: incl .align 32, 0x90 @@ -2387,6 +2472,7 @@ INSTRLEN_14_OFFSET_10: .endr .bundle_unlock # CHECK: 1b4a: nop +# CHECK: 1b50: nop # CHECK: 1b52: incl .align 32, 0x90 @@ -2398,6 +2484,7 @@ INSTRLEN_14_OFFSET_11: .endr .bundle_unlock # CHECK: 1b6b: nop +# CHECK: 1b70: nop # CHECK: 1b72: incl .align 32, 0x90 @@ -2409,6 +2496,7 @@ INSTRLEN_14_OFFSET_12: .endr .bundle_unlock # CHECK: 1b8c: nop +# CHECK: 1b90: nop # CHECK: 1b92: incl .align 32, 0x90 @@ -2420,6 +2508,7 @@ INSTRLEN_14_OFFSET_13: .endr .bundle_unlock # CHECK: 1bad: nop +# CHECK: 1bb0: nop # CHECK: 1bb2: incl .align 32, 0x90 @@ -2431,6 +2520,7 @@ INSTRLEN_14_OFFSET_14: .endr .bundle_unlock # CHECK: 1bce: nop +# CHECK: 1bd0: nop # CHECK: 1bd2: incl .align 32, 0x90 @@ -2442,6 +2532,7 @@ INSTRLEN_14_OFFSET_15: .endr .bundle_unlock # CHECK: 1bef: nop +# CHECK: 1bf0: nop # CHECK: 1bf2: incl .align 32, 0x90 @@ -2473,6 +2564,7 @@ INSTRLEN_15_OFFSET_2: .endr .bundle_unlock # CHECK: 1c42: nop +# CHECK: 1c50: nop # CHECK: 1c51: incl .align 32, 0x90 @@ -2484,6 +2576,7 @@ INSTRLEN_15_OFFSET_3: .endr .bundle_unlock # CHECK: 1c63: nop +# CHECK: 1c70: nop # CHECK: 1c71: incl .align 32, 0x90 @@ -2495,6 +2588,7 @@ INSTRLEN_15_OFFSET_4: .endr .bundle_unlock # CHECK: 1c84: nop +# CHECK: 1c90: nop # CHECK: 1c91: incl .align 32, 0x90 @@ -2506,6 +2600,7 @@ INSTRLEN_15_OFFSET_5: .endr .bundle_unlock # CHECK: 1ca5: nop +# CHECK: 1cb0: nop # CHECK: 1cb1: incl .align 32, 0x90 @@ -2517,6 +2612,7 @@ INSTRLEN_15_OFFSET_6: .endr .bundle_unlock # CHECK: 1cc6: nop +# CHECK: 1cd0: nop # CHECK: 1cd1: incl .align 32, 0x90 @@ -2528,6 +2624,7 @@ INSTRLEN_15_OFFSET_7: .endr .bundle_unlock # CHECK: 1ce7: nop +# CHECK: 1cf0: nop # CHECK: 1cf1: incl .align 32, 0x90 @@ -2539,6 +2636,7 @@ INSTRLEN_15_OFFSET_8: .endr .bundle_unlock # CHECK: 1d08: nop +# CHECK: 1d10: nop # CHECK: 1d11: incl .align 32, 0x90 @@ -2550,6 +2648,7 @@ 
INSTRLEN_15_OFFSET_9: .endr .bundle_unlock # CHECK: 1d29: nop +# CHECK: 1d30: nop # CHECK: 1d31: incl .align 32, 0x90 @@ -2561,6 +2660,7 @@ INSTRLEN_15_OFFSET_10: .endr .bundle_unlock # CHECK: 1d4a: nop +# CHECK: 1d50: nop # CHECK: 1d51: incl .align 32, 0x90 @@ -2572,6 +2672,7 @@ INSTRLEN_15_OFFSET_11: .endr .bundle_unlock # CHECK: 1d6b: nop +# CHECK: 1d70: nop # CHECK: 1d71: incl .align 32, 0x90 @@ -2583,6 +2684,7 @@ INSTRLEN_15_OFFSET_12: .endr .bundle_unlock # CHECK: 1d8c: nop +# CHECK: 1d90: nop # CHECK: 1d91: incl .align 32, 0x90 @@ -2594,6 +2696,7 @@ INSTRLEN_15_OFFSET_13: .endr .bundle_unlock # CHECK: 1dad: nop +# CHECK: 1db0: nop # CHECK: 1db1: incl .align 32, 0x90 @@ -2605,6 +2708,7 @@ INSTRLEN_15_OFFSET_14: .endr .bundle_unlock # CHECK: 1dce: nop +# CHECK: 1dd0: nop # CHECK: 1dd1: incl .align 32, 0x90 @@ -2616,6 +2720,7 @@ INSTRLEN_15_OFFSET_15: .endr .bundle_unlock # CHECK: 1def: nop +# CHECK: 1df0: nop # CHECK: 1df1: incl .align 32, 0x90 diff --git a/test/MC/X86/AlignedBundling/pad-align-to-bundle-end.s b/test/MC/X86/AlignedBundling/pad-align-to-bundle-end.s index 7cb6d06..6ca4046 100644 --- a/test/MC/X86/AlignedBundling/pad-align-to-bundle-end.s +++ b/test/MC/X86/AlignedBundling/pad-align-to-bundle-end.s @@ -26,7 +26,8 @@ foo: # Here we have to pad until the end of the *next* boundary because # otherwise the group crosses a boundary. # CHECK: 1a: nop +# The nop sequence may be implemented as one instruction or many, but if +# it's one instruction, that instruction cannot itself cross the boundary. +# CHECK: 20: nop # CHECK-NEXT: 26: callq # CHECK-NEXT: 2b: callq - - diff --git a/test/MC/X86/AlignedBundling/relax-at-bundle-end.s b/test/MC/X86/AlignedBundling/relax-at-bundle-end.s new file mode 100644 index 0000000..ab4affb --- /dev/null +++ b/test/MC/X86/AlignedBundling/relax-at-bundle-end.s @@ -0,0 +1,16 @@ +# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o - \ +# RUN: | llvm-objdump -disassemble -no-show-raw-insn - | FileCheck %s + +# Test that an instruction near a bundle end gets properly padded +# after it is relaxed. +.text +foo: + .bundle_align_mode 5 + .rept 29 + push %rax + .endr +# CHECK: 1c: push +# CHECK: 1d: nop +# CHECK: 20: jne + jne 0x100 + diff --git a/test/MC/X86/gnux32-dwarf-gen.s b/test/MC/X86/gnux32-dwarf-gen.s new file mode 100644 index 0000000..6603125 --- /dev/null +++ b/test/MC/X86/gnux32-dwarf-gen.s @@ -0,0 +1,24 @@ +# RUN: llvm-mc -g -filetype=obj -triple x86_64-pc-linux-gnu %s -o %t.64 +# RUN: llvm-dwarfdump -debug-dump=info %t.64 | FileCheck -check-prefix=DEFAULTABI %s + +# RUN: llvm-mc -g -filetype=obj -triple x86_64-pc-linux-gnux32 %s -o %t.32 +# RUN: llvm-dwarfdump -debug-dump=info %t.32 | FileCheck -check-prefix=X32ABI %s + +# This test checks the dwarf info section emitted to the output object by the +# assembler, looking at the difference between the x32 ABI and default x86-64 +# ABI. + +# DEFAULTABI: addr_size = 0x08 +# X32ABI: addr_size = 0x04 + +.globl _bar +_bar: + movl $0, %eax +L1: leave + ret +_foo: +_baz: + nop +.data +_x: .long 1 + diff --git a/test/MC/X86/intel-syntax-hex.s b/test/MC/X86/intel-syntax-hex.s new file mode 100644 index 0000000..b3a19fb --- /dev/null +++ b/test/MC/X86/intel-syntax-hex.s @@ -0,0 +1,26 @@ +// RUN: llvm-mc -triple x86_64-unknown-unknown -x86-asm-syntax=intel %s | FileCheck %s +// rdar://12470373 + +// Checks to make sure we parse the hexadecimal suffix properly. 
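+// A rough statement of the rule these cases suggest: a trailing 'h' or 'H'
+// marks a hexadecimal literal, such a literal must begin with a decimal
+// digit (hence 0ffffffffh rather than ffffffffh), and a redundant trailing
+// 'h' on an 0x-prefixed literal is also accepted (0xffffffffh). For
+// example, 0a2h = 0xa2 = 162, while 2a2h = 0x2a2 = 674.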
+// CHECK: movl $10, %eax + mov eax, 10 +// CHECK: movl $16, %eax + mov eax, 10h +// CHECK: movl $16, %eax + mov eax, 10H +// CHECK: movl $4294967295, %eax + mov eax, 0ffffffffh +// CHECK: movl $4294967295, %eax + mov eax, 0xffffffff +// CHECK: movl $4294967295, %eax + mov eax, 0xffffffffh +// CHECK: movl $15, %eax + mov eax, 0fh +// CHECK: movl $162, %eax + mov eax, 0a2h +// CHECK: movl $162, %eax + mov eax, 0xa2 +// CHECK: movl $162, %eax + mov eax, 0xa2h +// CHECK: movl $674, %eax + mov eax, 2a2h diff --git a/test/MC/X86/intel-syntax.s b/test/MC/X86/intel-syntax.s index 7edd26a..8bfa58a 100644 --- a/test/MC/X86/intel-syntax.s +++ b/test/MC/X86/intel-syntax.s @@ -56,13 +56,195 @@ _main: // CHECK: fld %st(0) fld ST(0) // CHECK: movl %fs:(%rdi), %eax - mov EAX, DWORD PTR FS:[RDI] -// CHECK: leal (,%rdi,4), %r8d - lea R8D, DWORD PTR [4*RDI] -// CHECK: movl _fnan(,%ecx,4), %ecx - mov ECX, DWORD PTR [4*ECX + _fnan] -// CHECK: movq %fs:320, %rax - mov RAX, QWORD PTR FS:[320] -// CHECK: vpgatherdd %xmm8, (%r15,%xmm9,2), %xmm1 - vpgatherdd XMM10, DWORD PTR [R15 + 2*XMM9], XMM8 + mov EAX, DWORD PTR FS:[RDI] +// CHECK: leal (,%rdi,4), %r8d + lea R8D, DWORD PTR [4*RDI] +// CHECK: movl _fnan(,%ecx,4), %ecx + mov ECX, DWORD PTR [4*ECX + _fnan] +// CHECK: movq %fs:320, %rax + mov RAX, QWORD PTR FS:[320] +// CHECK: vpgatherdd %xmm8, (%r15,%xmm9,2), %xmm1 + vpgatherdd XMM10, DWORD PTR [R15 + 2*XMM9], XMM8 +// CHECK: movsd -8, %xmm5 + movsd XMM5, QWORD PTR [-8] +// CHECK: movl %ecx, (%eax) + mov [eax], ecx +// CHECK: movl %ecx, (,%ebx,4) + mov [4*ebx], ecx + // CHECK: movl %ecx, (,%ebx,4) + mov [ebx*4], ecx +// CHECK: movl %ecx, 1024 + mov [1024], ecx +// CHECK: movl %ecx, 4132 + mov [0x1024], ecx +// CHECK: movl %ecx, 32 + mov [16 + 16], ecx +// CHECK: movl %ecx, 0 + mov [16 - 16], ecx +// CHECK: movl %ecx, 32 + mov [16][16], ecx +// CHECK: movl %ecx, (%eax,%ebx,4) + mov [eax + 4*ebx], ecx +// CHECK: movl %ecx, (%eax,%ebx,4) + mov [eax + ebx*4], ecx +// CHECK: movl %ecx, (%eax,%ebx,4) + mov [4*ebx + eax], ecx +// CHECK: movl %ecx, (%eax,%ebx,4) + mov [ebx*4 + eax], ecx +// CHECK: movl %ecx, (%eax,%ebx,4) + mov [eax][4*ebx], ecx +// CHECK: movl %ecx, (%eax,%ebx,4) + mov [eax][ebx*4], ecx +// CHECK: movl %ecx, (%eax,%ebx,4) + mov [4*ebx][eax], ecx +// CHECK: movl %ecx, (%eax,%ebx,4) + mov [ebx*4][eax], ecx +// CHECK: movl %ecx, 12(%eax) + mov [eax + 12], ecx +// CHECK: movl %ecx, 12(%eax) + mov [12 + eax], ecx +// CHECK: movl %ecx, 32(%eax) + mov [eax + 16 + 16], ecx +// CHECK: movl %ecx, 32(%eax) + mov [16 + eax + 16], ecx +// CHECK: movl %ecx, 32(%eax) + mov [16 + 16 + eax], ecx +// CHECK: movl %ecx, 12(%eax) + mov [eax][12], ecx +// CHECK: movl %ecx, 12(%eax) + mov [12][eax], ecx +// CHECK: movl %ecx, 32(%eax) + mov [eax][16 + 16], ecx +// CHECK: movl %ecx, 32(%eax) + mov [eax + 16][16], ecx +// CHECK: movl %ecx, 32(%eax) + mov [eax][16][16], ecx +// CHECK: movl %ecx, 32(%eax) + mov [16][eax + 16], ecx +// CHECK: movl %ecx, 32(%eax) + mov [16 + eax][16], ecx +// CHECK: movl %ecx, 32(%eax) + mov [16][16 + eax], ecx +// CHECK: movl %ecx, 32(%eax) + mov [16 + 16][eax], ecx +// CHECK: movl %ecx, 32(%eax) + mov [eax][16][16], ecx +// CHECK: movl %ecx, 32(%eax) + mov [16][eax][16], ecx +// CHECK: movl %ecx, 32(%eax) + mov [16][16][eax], ecx +// CHECK: movl %ecx, 16(,%ebx,4) + mov [4*ebx + 16], ecx +// CHECK: movl %ecx, 16(,%ebx,4) + mov [ebx*4 + 16], ecx +// CHECK: movl %ecx, 16(,%ebx,4) + mov [4*ebx][16], ecx +// CHECK: movl %ecx, 16(,%ebx,4) + mov [ebx*4][16], ecx +// CHECK: movl %ecx, 16(,%ebx,4) + mov 
[16 + 4*ebx], ecx +// CHECK: movl %ecx, 16(,%ebx,4) + mov [16 + ebx*4], ecx +// CHECK: movl %ecx, 16(,%ebx,4) + mov [16][4*ebx], ecx +// CHECK: movl %ecx, 16(,%ebx,4) + mov [16][ebx*4], ecx +// CHECK: movl %ecx, 16(%eax,%ebx,4) + mov [eax + 4*ebx + 16], ecx +// CHECK: movl %ecx, 16(%eax,%ebx,4) + mov [eax + 16 + 4*ebx], ecx +// CHECK: movl %ecx, 16(%eax,%ebx,4) + mov [4*ebx + eax + 16], ecx +// CHECK: movl %ecx, 16(%eax,%ebx,4) + mov [4*ebx + 16 + eax], ecx +// CHECK: movl %ecx, 16(%eax,%ebx,4) + mov [16 + eax + 4*ebx], ecx +// CHECK: movl %ecx, 16(%eax,%ebx,4) + mov [16 + eax + 4*ebx], ecx +// CHECK: movl %ecx, 16(%eax,%ebx,4) + mov [eax][4*ebx + 16], ecx +// CHECK: movl %ecx, 16(%eax,%ebx,4) + mov [eax][16 + 4*ebx], ecx +// CHECK: movl %ecx, 16(%eax,%ebx,4) + mov [4*ebx][eax + 16], ecx +// CHECK: movl %ecx, 16(%eax,%ebx,4) + mov [4*ebx][16 + eax], ecx +// CHECK: movl %ecx, 16(%eax,%ebx,4) + mov [16][eax + 4*ebx], ecx +// CHECK: movl %ecx, 16(%eax,%ebx,4) + mov [16][eax + 4*ebx], ecx +// CHECK: movl %ecx, 16(%eax,%ebx,4) + mov [eax + 4*ebx][16], ecx +// CHECK: movl %ecx, 16(%eax,%ebx,4) + mov [eax + 16][4*ebx], ecx +// CHECK: movl %ecx, 16(%eax,%ebx,4) + mov [4*ebx + eax][16], ecx +// CHECK: movl %ecx, 16(%eax,%ebx,4) + mov [4*ebx + 16][eax], ecx +// CHECK: movl %ecx, 16(%eax,%ebx,4) + mov [16 + eax][4*ebx], ecx +// CHECK: movl %ecx, 16(%eax,%ebx,4) + mov [16 + eax][4*ebx], ecx +// CHECK: movl %ecx, 16(%eax,%ebx,4) + mov [eax][4*ebx][16], ecx +// CHECK: movl %ecx, 16(%eax,%ebx,4) + mov [eax][16][4*ebx], ecx +// CHECK: movl %ecx, 16(%eax,%ebx,4) + mov [4*ebx][eax][16], ecx +// CHECK: movl %ecx, 16(%eax,%ebx,4) + mov [4*ebx][16][eax], ecx +// CHECK: movl %ecx, 16(%eax,%ebx,4) + mov [16][eax][4*ebx], ecx +// CHECK: movl %ecx, 16(%eax,%ebx,4) + mov [16][eax][4*ebx], ecx +// CHECK: movl %ecx, 16(%eax,%ebx,4) + mov [eax + ebx*4 + 16], ecx +// CHECK: movl %ecx, 16(%eax,%ebx,4) + mov [eax + 16 + ebx*4], ecx +// CHECK: movl %ecx, 16(%eax,%ebx,4) + mov [ebx*4 + eax + 16], ecx +// CHECK: movl %ecx, 16(%eax,%ebx,4) + mov [ebx*4 + 16 + eax], ecx +// CHECK: movl %ecx, 16(%eax,%ebx,4) + mov [16 + eax + ebx*4], ecx +// CHECK: movl %ecx, 16(%eax,%ebx,4) + mov [16 + eax + ebx*4], ecx +// CHECK: movl %ecx, 16(%eax,%ebx,4) + mov [eax][ebx*4 + 16], ecx +// CHECK: movl %ecx, 16(%eax,%ebx,4) + mov [eax][16 + ebx*4], ecx +// CHECK: movl %ecx, 16(%eax,%ebx,4) + mov [ebx*4][eax + 16], ecx +// CHECK: movl %ecx, 16(%eax,%ebx,4) + mov [ebx*4][16 + eax], ecx +// CHECK: movl %ecx, 16(%eax,%ebx,4) + mov [16][eax + ebx*4], ecx +// CHECK: movl %ecx, 16(%eax,%ebx,4) + mov [16][eax + ebx*4], ecx +// CHECK: movl %ecx, 16(%eax,%ebx,4) + mov [eax + ebx*4][16], ecx +// CHECK: movl %ecx, 16(%eax,%ebx,4) + mov [eax + 16][ebx*4], ecx +// CHECK: movl %ecx, 16(%eax,%ebx,4) + mov [ebx*4 + eax][16], ecx +// CHECK: movl %ecx, 16(%eax,%ebx,4) + mov [ebx*4 + 16][eax], ecx +// CHECK: movl %ecx, 16(%eax,%ebx,4) + mov [16 + eax][ebx*4], ecx +// CHECK: movl %ecx, 16(%eax,%ebx,4) + mov [16 + eax][ebx*4], ecx +// CHECK: movl %ecx, 16(%eax,%ebx,4) + mov [eax][ebx*4][16], ecx +// CHECK: movl %ecx, 16(%eax,%ebx,4) + mov [eax][16][ebx*4], ecx +// CHECK: movl %ecx, 16(%eax,%ebx,4) + mov [ebx*4][eax][16], ecx +// CHECK: movl %ecx, 16(%eax,%ebx,4) + mov [ebx*4][16][eax], ecx +// CHECK: movl %ecx, 16(%eax,%ebx,4) + mov [16][eax][ebx*4], ecx +// CHECK: movl %ecx, 16(%eax,%ebx,4) + mov [16][eax][ebx*4], ecx +// CHECK: movl %ecx, -16(%eax,%ebx,4) + mov [eax][ebx*4 - 16], ecx ret diff --git a/test/MC/X86/shuffle-comments.s b/test/MC/X86/shuffle-comments.s 
new file mode 100644 index 0000000..20fd4eb --- /dev/null +++ b/test/MC/X86/shuffle-comments.s @@ -0,0 +1,271 @@ +# RUN: llvm-mc %s -triple=x86_64-unknown-unknown | FileCheck %s + +palignr $8, %xmm0, %xmm1 +# CHECK: xmm1 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +palignr $8, (%rax), %xmm1 +# CHECK: xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] + +palignr $16, %xmm0, %xmm1 +# CHECK: xmm1 = xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +palignr $16, (%rax), %xmm1 +# CHECK: xmm1 = xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] + +palignr $0, %xmm0, %xmm1 +# CHECK: xmm1 = xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +palignr $0, (%rax), %xmm1 +# CHECK: xmm1 = mem[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] + +vpalignr $8, %xmm0, %xmm1, %xmm2 +# CHECK: xmm2 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +vpalignr $8, (%rax), %xmm1, %xmm2 +# CHECK: xmm2 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] + +vpalignr $16, %xmm0, %xmm1, %xmm2 +# CHECK: xmm2 = xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +vpalignr $16, (%rax), %xmm1, %xmm2 +# CHECK: xmm2 = xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] + +vpalignr $0, %xmm0, %xmm1, %xmm2 +# CHECK: xmm2 = xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +vpalignr $0, (%rax), %xmm1, %xmm2 +# CHECK: xmm2 = mem[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] + +vpalignr $8, %ymm0, %ymm1, %ymm2 +# CHECK: ymm2 = ymm0[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm0[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +vpalignr $8, (%rax), %ymm1, %ymm2 +# CHECK: ymm2 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] + +vpalignr $16, %ymm0, %ymm1, %ymm2 +# CHECK: ymm2 = ymm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31] +vpalignr $16, (%rax), %ymm1, %ymm2 +# CHECK: ymm2 = ymm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31] + +vpalignr $0, %ymm0, %ymm1, %ymm2 +# CHECK: ymm2 = ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31] +vpalignr $0, (%rax), %ymm1, %ymm2 +# CHECK: ymm2 = mem[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31] + +pshufd $27, %xmm0, %xmm1 +# CHECK: xmm1 = xmm0[3,2,1,0] +pshufd $27, (%rax), %xmm1 +# CHECK: xmm1 = mem[3,2,1,0] + +vpshufd $27, %xmm0, %xmm1 +# CHECK: xmm1 = xmm0[3,2,1,0] +vpshufd $27, (%rax), %xmm1 +# CHECK: xmm1 = mem[3,2,1,0] + +vpshufd $27, %ymm0, %ymm1 +# CHECK: ymm1 = ymm0[3,2,1,0,7,6,5,4] +vpshufd $27, (%rax), %ymm1 +# CHECK: ymm1 = mem[3,2,1,0,7,6,5,4] + +punpcklbw %xmm0, %xmm1 +# CHECK: xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +punpcklbw (%rax), %xmm1 +# CHECK: xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] + +vpunpcklbw %xmm0, %xmm1, %xmm2 +# CHECK: xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +vpunpcklbw (%rax), %xmm1, %xmm2 +# CHECK: xmm2 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] + +vpunpcklbw %ymm0, %ymm1, %ymm2 +# CHECK: ymm2 = 
ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] +vpunpcklbw (%rax), %ymm1, %ymm2 +# CHECK: ymm2 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[2],mem[2],ymm1[3],mem[3],ymm1[4],mem[4],ymm1[5],mem[5],ymm1[6],mem[6],ymm1[7],mem[7],ymm1[16],mem[16],ymm1[17],mem[17],ymm1[18],mem[18],ymm1[19],mem[19],ymm1[20],mem[20],ymm1[21],mem[21],ymm1[22],mem[22],ymm1[23],mem[23] + +punpckhbw %xmm0, %xmm1 +# CHECK: xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +punpckhbw (%rax), %xmm1 +# CHECK: xmm1 = xmm1[8],mem[8],xmm1[9],mem[9],xmm1[10],mem[10],xmm1[11],mem[11],xmm1[12],mem[12],xmm1[13],mem[13],xmm1[14],mem[14],xmm1[15],mem[15] + +vpunpckhbw %xmm0, %xmm1, %xmm2 +# CHECK: xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +vpunpckhbw (%rax), %xmm1, %xmm2 +# CHECK: xmm2 = xmm1[8],mem[8],xmm1[9],mem[9],xmm1[10],mem[10],xmm1[11],mem[11],xmm1[12],mem[12],xmm1[13],mem[13],xmm1[14],mem[14],xmm1[15],mem[15] + +vpunpckhbw %ymm0, %ymm1, %ymm2 +# CHECK: ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] +vpunpckhbw (%rax), %ymm1, %ymm2 +# CHECK: ymm2 = ymm1[8],mem[8],ymm1[9],mem[9],ymm1[10],mem[10],ymm1[11],mem[11],ymm1[12],mem[12],ymm1[13],mem[13],ymm1[14],mem[14],ymm1[15],mem[15],ymm1[24],mem[24],ymm1[25],mem[25],ymm1[26],mem[26],ymm1[27],mem[27],ymm1[28],mem[28],ymm1[29],mem[29],ymm1[30],mem[30],ymm1[31],mem[31] + +punpcklwd %xmm0, %xmm1 +# CHECK: xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +punpcklwd (%rax), %xmm1 +# CHECK: xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] + +vpunpcklwd %xmm0, %xmm1, %xmm2 +# CHECK: xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +vpunpcklwd (%rax), %xmm1, %xmm2 +# CHECK: xmm2 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] + +vpunpcklwd %ymm0, %ymm1, %ymm2 +# CHECK: ymm2 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] +vpunpcklwd (%rax), %ymm1, %ymm2 +# CHECK: ymm2 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[2],mem[2],ymm1[3],mem[3],ymm1[8],mem[8],ymm1[9],mem[9],ymm1[10],mem[10],ymm1[11],mem[11] + +punpckhwd %xmm0, %xmm1 +# CHECK: xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +punpckhwd (%rax), %xmm1 +# CHECK: xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] + +vpunpckhwd %xmm0, %xmm1, %xmm2 +# CHECK: xmm2 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +vpunpckhwd (%rax), %xmm1, %xmm2 +# CHECK: xmm2 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] + +vpunpckhwd %ymm0, %ymm1, %ymm2 +# CHECK: ymm2 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] +vpunpckhwd (%rax), %ymm1, %ymm2 +# CHECK: ymm2 = ymm1[4],mem[4],ymm1[5],mem[5],ymm1[6],mem[6],ymm1[7],mem[7],ymm1[12],mem[12],ymm1[13],mem[13],ymm1[14],mem[14],ymm1[15],mem[15] + 
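+# (In these decodings each bracketed index names a lane of the given
+# source, so for punpcklwd result word 2*i is xmm1[i] and word 2*i+1 is
+# xmm0[i] over the low four words. A hypothetical concrete run:
+#   xmm1 = [a0,a1,a2,a3,...], xmm0 = [b0,b1,b2,b3,...]
+#   punpcklwd %xmm0, %xmm1 => xmm1 = [a0,b0,a1,b1,a2,b2,a3,b3])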
+punpckldq %xmm0, %xmm1 +# CHECK: xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +punpckldq (%rax), %xmm1 +# CHECK: xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] + +vpunpckldq %xmm0, %xmm1, %xmm2 +# CHECK: xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +vpunpckldq (%rax), %xmm1, %xmm2 +# CHECK: xmm2 = xmm1[0],mem[0],xmm1[1],mem[1] + +vpunpckldq %ymm0, %ymm1, %ymm2 +# CHECK: ymm2 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] +vpunpckldq (%rax), %ymm1, %ymm2 +# CHECK: ymm2 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] + +punpckhdq %xmm0, %xmm1 +# CHECK: xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +punpckhdq (%rax), %xmm1 +# CHECK: xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] + +vpunpckhdq %xmm0, %xmm1, %xmm2 +# CHECK: xmm2 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +vpunpckhdq (%rax), %xmm1, %xmm2 +# CHECK: xmm2 = xmm1[2],mem[2],xmm1[3],mem[3] + +vpunpckhdq %ymm0, %ymm1, %ymm2 +# CHECK: ymm2 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +vpunpckhdq (%rax), %ymm1, %ymm2 +# CHECK: ymm2 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] + +punpcklqdq %xmm0, %xmm1 +# CHECK: xmm1 = xmm1[0],xmm0[0] +punpcklqdq (%rax), %xmm1 +# CHECK: xmm1 = xmm1[0],mem[0] + +vpunpcklqdq %xmm0, %xmm1, %xmm2 +# CHECK: xmm2 = xmm1[0],xmm0[0] +vpunpcklqdq (%rax), %xmm1, %xmm2 +# CHECK: xmm2 = xmm1[0],mem[0] + +vpunpcklqdq %ymm0, %ymm1, %ymm2 +# CHECK: ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +vpunpcklqdq (%rax), %ymm1, %ymm2 +# CHECK: ymm2 = ymm1[0],mem[0],ymm1[2],mem[2] + +punpckhqdq %xmm0, %xmm1 +# CHECK: xmm1 = xmm1[1],xmm0[1] +punpckhqdq (%rax), %xmm1 +# CHECK: xmm1 = xmm1[1],mem[1] + +vpunpckhqdq %xmm0, %xmm1, %xmm2 +# CHECK: xmm2 = xmm1[1],xmm0[1] +vpunpckhqdq (%rax), %xmm1, %xmm2 +# CHECK: xmm2 = xmm1[1],mem[1] + +vpunpckhqdq %ymm0, %ymm1, %ymm2 +# CHECK: ymm2 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +vpunpckhqdq (%rax), %ymm1, %ymm2 +# CHECK: ymm2 = ymm1[1],mem[1],ymm1[3],mem[3] + +unpcklps %xmm0, %xmm1 +# CHECK: xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +unpcklps (%rax), %xmm1 +# CHECK: xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] + +vunpcklps %xmm0, %xmm1, %xmm2 +# CHECK: xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +vunpcklps (%rax), %xmm1, %xmm2 +# CHECK: xmm2 = xmm1[0],mem[0],xmm1[1],mem[1] + +vunpcklps %ymm0, %ymm1, %ymm2 +# CHECK: ymm2 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5] +vunpcklps (%rax), %ymm1, %ymm2 +# CHECK: ymm2 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] + +unpckhps %xmm0, %xmm1 +# CHECK: xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +unpckhps (%rax), %xmm1 +# CHECK: xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] + +vunpckhps %xmm0, %xmm1, %xmm2 +# CHECK: xmm2 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] +vunpckhps (%rax), %xmm1, %xmm2 +# CHECK: xmm2 = xmm1[2],mem[2],xmm1[3],mem[3] + +vunpckhps %ymm0, %ymm1, %ymm2 +# CHECK: ymm2 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] +vunpckhps (%rax), %ymm1, %ymm2 +# CHECK: ymm2 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] + +unpcklpd %xmm0, %xmm1 +# CHECK: xmm1 = xmm1[0],xmm0[0] +unpcklpd (%rax), %xmm1 +# CHECK: xmm1 = xmm1[0],mem[0] + +vunpcklpd %xmm0, %xmm1, %xmm2 +# CHECK: xmm2 = xmm1[0],xmm0[0] +vunpcklpd (%rax), %xmm1, %xmm2 +# CHECK: xmm2 = xmm1[0],mem[0] + +vunpcklpd %ymm0, %ymm1, %ymm2 +# CHECK: ymm2 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +vunpcklpd (%rax), %ymm1, %ymm2 +# CHECK: ymm2 = ymm1[0],mem[0],ymm1[2],mem[2] + +unpckhpd %xmm0, %xmm1 +# CHECK: xmm1 = xmm1[1],xmm0[1] +unpckhpd (%rax), %xmm1 +# CHECK: xmm1 = xmm1[1],mem[1] + +vunpckhpd %xmm0, %xmm1, %xmm2 +# CHECK: xmm2 = xmm1[1],xmm0[1] 
+vunpckhpd (%rax), %xmm1, %xmm2 +# CHECK: xmm2 = xmm1[1],mem[1] + +vunpckhpd %ymm0, %ymm1, %ymm2 +# CHECK: ymm2 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +vunpckhpd (%rax), %ymm1, %ymm2 +# CHECK: ymm2 = ymm1[1],mem[1],ymm1[3],mem[3] + +shufps $27, %xmm0, %xmm1 +# CHECK: xmm1 = xmm1[3,2],xmm0[1,0] +shufps $27, (%rax), %xmm1 +# CHECK: xmm1 = xmm1[3,2],mem[1,0] + +vshufps $27, %xmm0, %xmm1, %xmm2 +# CHECK: xmm2 = xmm1[3,2],xmm0[1,0] +vshufps $27, (%rax), %xmm1, %xmm2 +# CHECK: xmm2 = xmm1[3,2],mem[1,0] + +vshufps $27, %ymm0, %ymm1, %ymm2 +# CHECK: ymm2 = ymm1[3,2],ymm0[1,0],ymm1[7,6],ymm0[5,4] +vshufps $27, (%rax), %ymm1, %ymm2 +# CHECK: ymm2 = ymm1[3,2],mem[1,0],ymm1[7,6],mem[5,4] + +shufpd $3, %xmm0, %xmm1 +# CHECK: xmm1 = xmm1[1],xmm0[1] +shufpd $3, (%rax), %xmm1 +# CHECK: xmm1 = xmm1[1],mem[1] + +vshufpd $3, %xmm0, %xmm1, %xmm2 +# CHECK: xmm2 = xmm1[1],xmm0[1] +vshufpd $3, (%rax), %xmm1, %xmm2 +# CHECK: xmm2 = xmm1[1],mem[1] + +vshufpd $11, %ymm0, %ymm1, %ymm2 +# CHECK: ymm2 = ymm1[1],ymm0[1],ymm1[2],ymm0[3] +vshufpd $11, (%rax), %ymm1, %ymm2 +# CHECK: ymm2 = ymm1[1],mem[1],ymm1[2],mem[3] diff --git a/test/MC/X86/x86-32-ms-inline-asm.s b/test/MC/X86/x86-32-ms-inline-asm.s index 73d5878..5524c70 100644 --- a/test/MC/X86/x86-32-ms-inline-asm.s +++ b/test/MC/X86/x86-32-ms-inline-asm.s @@ -57,4 +57,17 @@ _t21: ## @t21 // CHECK: movl 4(%esi,%eax,2), %eax // CHECK: # encoding: [0x8b,0x44,0x46,0x04] + pusha +// CHECK: pushal +// CHECK: # encoding: [0x60] + popa +// CHECK: popal +// CHECK: # encoding: [0x61] + pushad +// CHECK: pushal +// CHECK: # encoding: [0x60] + popad +// CHECK: popal +// CHECK: # encoding: [0x61] + ret diff --git a/test/MC/X86/x86_64-fma4-encoding.s b/test/MC/X86/x86_64-fma4-encoding.s index 805fc23..f7ee351 100644 --- a/test/MC/X86/x86_64-fma4-encoding.s +++ b/test/MC/X86/x86_64-fma4-encoding.s @@ -73,6 +73,67 @@ // CHECK: encoding: [0xc4,0xe3,0xfd,0x69,0xc2,0x10] vfmaddpd %ymm2, %ymm1, %ymm0, %ymm0 +// PR15040 +// CHECK: vfmaddss foo(%rip), %xmm1, %xmm0, %xmm0 +// CHECK: encoding: [0xc4,0xe3,0xf9,0x6a,0x05,A,A,A,A,0x10] +// CHECK: fixup A - offset: 5, value: foo-5, kind: reloc_riprel_4byte + vfmaddss foo(%rip), %xmm1, %xmm0, %xmm0 + +// CHECK: vfmaddss %xmm1, foo(%rip), %xmm0, %xmm0 +// CHECK: encoding: [0xc4,0xe3,0x79,0x6a,0x05,A,A,A,A,0x10] +// CHECK: fixup A - offset: 5, value: foo-5, kind: reloc_riprel_4byte + vfmaddss %xmm1, foo(%rip),%xmm0, %xmm0 + +// CHECK: vfmaddsd foo(%rip), %xmm1, %xmm0, %xmm0 +// CHECK: encoding: [0xc4,0xe3,0xf9,0x6b,0x05,A,A,A,A,0x10] +// CHECK: fixup A - offset: 5, value: foo-5, kind: reloc_riprel_4byte + vfmaddsd foo(%rip), %xmm1, %xmm0, %xmm0 + +// CHECK: vfmaddsd %xmm1, foo(%rip), %xmm0, %xmm0 +// CHECK: encoding: [0xc4,0xe3,0x79,0x6b,0x05,A,A,A,A,0x10] +// CHECK: fixup A - offset: 5, value: foo-5, kind: reloc_riprel_4byte + vfmaddsd %xmm1, foo(%rip),%xmm0, %xmm0 + +// CHECK: vfmaddps foo(%rip), %xmm1, %xmm0, %xmm0 +// CHECK: encoding: [0xc4,0xe3,0xf9,0x68,0x05,A,A,A,A,0x10] +// CHECK: fixup A - offset: 5, value: foo-5, kind: reloc_riprel_4byte + vfmaddps foo(%rip), %xmm1, %xmm0, %xmm0 + +// CHECK: vfmaddps %xmm1, foo(%rip), %xmm0, %xmm0 +// CHECK: encoding: [0xc4,0xe3,0x79,0x68,0x05,A,A,A,A,0x10] +// CHECK: fixup A - offset: 5, value: foo-5, kind: reloc_riprel_4byte + vfmaddps %xmm1, foo(%rip),%xmm0, %xmm0 + +// CHECK: vfmaddpd foo(%rip), %xmm1, %xmm0, %xmm0 +// CHECK: encoding: [0xc4,0xe3,0xf9,0x69,0x05,A,A,A,A,0x10] +// CHECK: fixup A - offset: 5, value: foo-5, kind: reloc_riprel_4byte + vfmaddpd foo(%rip), %xmm1, %xmm0, %xmm0 + +// CHECK: 
vfmaddpd %xmm1, foo(%rip), %xmm0, %xmm0 +// CHECK: encoding: [0xc4,0xe3,0x79,0x69,0x05,A,A,A,A,0x10] +// CHECK: fixup A - offset: 5, value: foo-5, kind: reloc_riprel_4byte + vfmaddpd %xmm1, foo(%rip),%xmm0, %xmm0 + +// CHECK: vfmaddps foo(%rip), %ymm1, %ymm0, %ymm0 +// CHECK: encoding: [0xc4,0xe3,0xfd,0x68,0x05,A,A,A,A,0x10] +// CHECK: fixup A - offset: 5, value: foo-5, kind: reloc_riprel_4byte + vfmaddps foo(%rip), %ymm1, %ymm0, %ymm0 + +// CHECK: vfmaddps %ymm1, foo(%rip), %ymm0, %ymm0 +// CHECK: encoding: [0xc4,0xe3,0x7d,0x68,0x05,A,A,A,A,0x10] +// CHECK: fixup A - offset: 5, value: foo-5, kind: reloc_riprel_4byte + vfmaddps %ymm1, foo(%rip),%ymm0, %ymm0 + +// CHECK: vfmaddpd foo(%rip), %ymm1, %ymm0, %ymm0 +// CHECK: encoding: [0xc4,0xe3,0xfd,0x69,0x05,A,A,A,A,0x10] +// CHECK: fixup A - offset: 5, value: foo-5, kind: reloc_riprel_4byte + vfmaddpd foo(%rip), %ymm1, %ymm0, %ymm0 + +// CHECK: vfmaddpd %ymm1, foo(%rip), %ymm0, %ymm0 +// CHECK: encoding: [0xc4,0xe3,0x7d,0x69,0x05,A,A,A,A,0x10] +// CHECK: fixup A - offset: 5, value: foo-5, kind: reloc_riprel_4byte + vfmaddpd %ymm1, foo(%rip),%ymm0, %ymm0 + // vfmsub // CHECK: vfmsubss (%rcx), %xmm1, %xmm0, %xmm0 // CHECK: encoding: [0xc4,0xe3,0xf9,0x6e,0x01,0x10] diff --git a/test/Makefile b/test/Makefile index 4e690cc..fc8ec08 100644 --- a/test/Makefile +++ b/test/Makefile @@ -87,14 +87,14 @@ endif # SunOS check-local:: lit.site.cfg Unit/lit.site.cfg ( $(ULIMIT) \ - $(LLVM_SRC_ROOT)/utils/lit/lit.py $(LIT_ARGS) $(LIT_TESTSUITE) ) + $(PYTHON) $(LLVM_SRC_ROOT)/utils/lit/lit.py $(LIT_ARGS) $(LIT_TESTSUITE) ) # This is a legacy alias dating from when both DejaGNU and lit were in use. check-local-lit:: check-local check-local-all:: lit.site.cfg Unit/lit.site.cfg extra-site-cfgs ( $(ULIMIT) \ - $(LLVM_SRC_ROOT)/utils/lit/lit.py $(LIT_ARGS) $(LIT_ALL_TESTSUITES) ) + $(PYTHON) $(LLVM_SRC_ROOT)/utils/lit/lit.py $(LIT_ARGS) $(LIT_ALL_TESTSUITES) ) clean:: $(RM) -rf `find $(LLVM_OBJ_ROOT)/test -name Output -type d -print` @@ -131,13 +131,14 @@ endif lit.site.cfg: FORCE @echo "Making LLVM 'lit.site.cfg' file..." 
- @$(ECHOPATH) s=@TARGET_TRIPLE@=$(TARGET_TRIPLE)=g > lit.tmp + @$(ECHOPATH) s=@LLVM_HOSTTRIPLE@=$(HOST_TRIPLE)=g > lit.tmp + @$(ECHOPATH) s=@TARGET_TRIPLE@=$(TARGET_TRIPLE)=g >> lit.tmp @$(ECHOPATH) s=@LLVM_SOURCE_DIR@=$(LLVM_SRC_ROOT)=g >> lit.tmp @$(ECHOPATH) s=@LLVM_BINARY_DIR@=$(LLVM_OBJ_ROOT)=g >> lit.tmp @$(ECHOPATH) s=@LLVM_TOOLS_DIR@=$(ToolDir)=g >> lit.tmp @$(ECHOPATH) s=@SHLIBDIR@=$(SharedLibDir)=g >> lit.tmp @$(ECHOPATH) s=@SHLIBEXT@=$(SHLIBEXT)=g >> lit.tmp - @$(ECHOPATH) s=@PYTHON_EXECUTABLE@=python=g >> lit.tmp + @$(ECHOPATH) s=@PYTHON_EXECUTABLE@=$(PYTHON)=g >> lit.tmp @$(ECHOPATH) s=@OCAMLOPT@=$(OCAMLOPT) -cc $(subst *,'\\\"',*$(subst =,"\\=",$(CXX_FOR_OCAMLOPT))*) -I $(LibDir)/ocaml=g >> lit.tmp @$(ECHOPATH) s=@ENABLE_SHARED@=$(ENABLE_SHARED)=g >> lit.tmp @$(ECHOPATH) s=@ENABLE_ASSERTIONS@=$(ENABLE_ASSERTIONS)=g >> lit.tmp diff --git a/test/Object/objdump-sectionheaders.test b/test/Object/objdump-sectionheaders.test index a417d07..bc2478c 100644 --- a/test/Object/objdump-sectionheaders.test +++ b/test/Object/objdump-sectionheaders.test @@ -6,11 +6,11 @@ ; CHECK: Sections: ; CHECK: Idx Name Size Address Type -; CHECK: 0 000000000 00000000000000000 -; CHECK: 1 .text 000000026 00000000000000000 TEXT DATA -; CHECK: 2 .rodata.str1.1 00000000d 00000000000000026 DATA -; CHECK: 3 .note.GNU-stack 000000000 00000000000000033 -; CHECK: 4 .rela.text 000000048 00000000000000038 -; CHECK: 5 .symtab 0000000c0 00000000000000080 -; CHECK: 6 .strtab 000000033 00000000000000140 -; CHECK: 7 .shstrtab 00000004b 00000000000000173 +; CHECK: 0 00000000 0000000000000000 +; CHECK: 1 .text 00000026 0000000000000000 TEXT DATA +; CHECK: 2 .rodata.str1.1 0000000d 0000000000000026 DATA +; CHECK: 3 .note.GNU-stack 00000000 0000000000000033 +; CHECK: 4 .rela.text 00000048 0000000000000038 +; CHECK: 5 .symtab 000000c0 0000000000000080 +; CHECK: 6 .strtab 00000033 0000000000000140 +; CHECK: 7 .shstrtab 0000004b 0000000000000173 diff --git a/test/Object/readobj-shared-object.test b/test/Object/readobj-shared-object.test index 3065c6f..2c0b54d 100644 --- a/test/Object/readobj-shared-object.test +++ b/test/Object/readobj-shared-object.test @@ -71,6 +71,19 @@ ELF: .symtab {{[0-9a-f]+}} {{[0-9a-f]+}} {{[0-9a-f]+}} r ELF: .strtab {{[0-9a-f]+}} {{[0-9a-f]+}} {{[0-9a-f]+}} rodata ELF: Total: 14 +ELF:Dynamic section contains 9 entries +ELF: Tag Type Name/Value +ELF: 00000001 (NEEDED) Shared library: [libc.so.6] +ELF: 00000001 (NEEDED) Shared library: [libm.so.6] +ELF: 0000000e (SONAME) Library soname: [libfoo.so] +ELF: 00000004 (HASH) {{[0-9a-f]+}} +ELF: 00000005 (STRTAB) {{[0-9a-f]+}} +ELF: 00000006 (SYMTAB) {{[0-9a-f]+}} +ELF: 0000000a (STRSZ) {{[0-9]+}} (bytes) +ELF: 0000000b (SYMENT) {{[0-9]+}} (bytes) +ELF: 00000000 (NULL) 0x0 +ELF: Total: 9 + ELF:Libraries needed: ELF: libc.so.6 ELF: libm.so.6 diff --git a/test/Object/readobj.test b/test/Object/readobj.test new file mode 100644 index 0000000..e29f404 --- /dev/null +++ b/test/Object/readobj.test @@ -0,0 +1,2 @@ +// Don't crash while reading non-dynamic files. +RUN: llvm-readobj %p/Inputs/trivial-object-test.elf-x86-64 diff --git a/test/Other/close-stderr.ll b/test/Other/close-stderr.ll index 1d207c7..6e180cd 100644 --- a/test/Other/close-stderr.ll +++ b/test/Other/close-stderr.ll @@ -1,9 +1,16 @@ ; RUN: sh -c 'opt --reject-this-option 2>&-; echo $?; opt -o /dev/null /dev/null 2>&-; echo $?;' \ ; RUN: | FileCheck %s + ; CHECK: {{^1$}} +; On valgrind, we got 127 here. 
+; XFAIL: valgrind + ; CHECK: {{^0$}} ; XFAIL: vg_leak ; REQUIRES: shell +; opt will fail to open /dev/null on native win32. +; XFAIL: win32 + ; Test that the error handling when writing to stderr fails exits the ; program cleanly rather than aborting. diff --git a/test/Other/constant-fold-gep.ll b/test/Other/constant-fold-gep.ll index eafb16e..0224e9f 100644 --- a/test/Other/constant-fold-gep.ll +++ b/test/Other/constant-fold-gep.ll @@ -118,64 +118,64 @@ ; Duplicate all of the above as function return values rather than ; global initializers. -; PLAIN: define i8* @goo8() nounwind { +; PLAIN: define i8* @goo8() #0 { ; PLAIN: %t = bitcast i8* getelementptr (i8* inttoptr (i32 1 to i8*), i32 -1) to i8* ; PLAIN: ret i8* %t ; PLAIN: } -; PLAIN: define i1* @goo1() nounwind { +; PLAIN: define i1* @goo1() #0 { ; PLAIN: %t = bitcast i1* getelementptr (i1* inttoptr (i32 1 to i1*), i32 -1) to i1* ; PLAIN: ret i1* %t ; PLAIN: } -; PLAIN: define i8* @foo8() nounwind { +; PLAIN: define i8* @foo8() #0 { ; PLAIN: %t = bitcast i8* getelementptr (i8* inttoptr (i32 1 to i8*), i32 -2) to i8* ; PLAIN: ret i8* %t ; PLAIN: } -; PLAIN: define i1* @foo1() nounwind { +; PLAIN: define i1* @foo1() #0 { ; PLAIN: %t = bitcast i1* getelementptr (i1* inttoptr (i32 1 to i1*), i32 -2) to i1* ; PLAIN: ret i1* %t ; PLAIN: } -; PLAIN: define i8* @hoo8() nounwind { +; PLAIN: define i8* @hoo8() #0 { ; PLAIN: %t = bitcast i8* getelementptr (i8* null, i32 -1) to i8* ; PLAIN: ret i8* %t ; PLAIN: } -; PLAIN: define i1* @hoo1() nounwind { +; PLAIN: define i1* @hoo1() #0 { ; PLAIN: %t = bitcast i1* getelementptr (i1* null, i32 -1) to i1* ; PLAIN: ret i1* %t ; PLAIN: } -; OPT: define i8* @goo8() nounwind { +; OPT: define i8* @goo8() #0 { ; OPT: ret i8* getelementptr (i8* inttoptr (i32 1 to i8*), i32 -1) ; OPT: } -; OPT: define i1* @goo1() nounwind { +; OPT: define i1* @goo1() #0 { ; OPT: ret i1* getelementptr (i1* inttoptr (i32 1 to i1*), i32 -1) ; OPT: } -; OPT: define i8* @foo8() nounwind { +; OPT: define i8* @foo8() #0 { ; OPT: ret i8* getelementptr (i8* inttoptr (i32 1 to i8*), i32 -2) ; OPT: } -; OPT: define i1* @foo1() nounwind { +; OPT: define i1* @foo1() #0 { ; OPT: ret i1* getelementptr (i1* inttoptr (i32 1 to i1*), i32 -2) ; OPT: } -; OPT: define i8* @hoo8() nounwind { +; OPT: define i8* @hoo8() #0 { ; OPT: ret i8* getelementptr (i8* null, i32 -1) ; OPT: } -; OPT: define i1* @hoo1() nounwind { +; OPT: define i1* @hoo1() #0 { ; OPT: ret i1* getelementptr (i1* null, i32 -1) ; OPT: } -; TO: define i8* @goo8() nounwind { +; TO: define i8* @goo8() #0 { ; TO: ret i8* null ; TO: } -; TO: define i1* @goo1() nounwind { +; TO: define i1* @goo1() #0 { ; TO: ret i1* null ; TO: } -; TO: define i8* @foo8() nounwind { +; TO: define i8* @foo8() #0 { ; TO: ret i8* inttoptr (i64 -1 to i8*) ; TO: } -; TO: define i1* @foo1() nounwind { +; TO: define i1* @foo1() #0 { ; TO: ret i1* inttoptr (i64 -1 to i1*) ; TO: } -; TO: define i8* @hoo8() nounwind { +; TO: define i8* @hoo8() #0 { ; TO: ret i8* inttoptr (i64 -1 to i8*) ; TO: } -; TO: define i1* @hoo1() nounwind { +; TO: define i1* @hoo1() #0 { ; TO: ret i1* inttoptr (i64 -1 to i1*) ; TO: } ; SCEV: Classifying expressions for: @goo8 @@ -220,94 +220,94 @@ define i1* @hoo1() nounwind { ret i1* %t } -; PLAIN: define i64 @fa() nounwind { +; PLAIN: define i64 @fa() #0 { ; PLAIN: %t = bitcast i64 mul (i64 ptrtoint (double* getelementptr (double* null, i32 1) to i64), i64 2310) to i64 ; PLAIN: ret i64 %t ; PLAIN: } -; PLAIN: define i64 @fb() nounwind { +; PLAIN: define i64 @fb() #0 { ; PLAIN: %t = 
bitcast i64 ptrtoint (double* getelementptr ({ i1, double }* null, i64 0, i32 1) to i64) to i64 ; PLAIN: ret i64 %t ; PLAIN: } -; PLAIN: define i64 @fc() nounwind { +; PLAIN: define i64 @fc() #0 { ; PLAIN: %t = bitcast i64 mul nuw (i64 ptrtoint (double* getelementptr (double* null, i32 1) to i64), i64 2) to i64 ; PLAIN: ret i64 %t ; PLAIN: } -; PLAIN: define i64 @fd() nounwind { +; PLAIN: define i64 @fd() #0 { ; PLAIN: %t = bitcast i64 mul nuw (i64 ptrtoint (double* getelementptr (double* null, i32 1) to i64), i64 11) to i64 ; PLAIN: ret i64 %t ; PLAIN: } -; PLAIN: define i64 @fe() nounwind { +; PLAIN: define i64 @fe() #0 { ; PLAIN: %t = bitcast i64 ptrtoint (double* getelementptr ({ double, float, double, double }* null, i64 0, i32 2) to i64) to i64 ; PLAIN: ret i64 %t ; PLAIN: } -; PLAIN: define i64 @ff() nounwind { +; PLAIN: define i64 @ff() #0 { ; PLAIN: %t = bitcast i64 1 to i64 ; PLAIN: ret i64 %t ; PLAIN: } -; PLAIN: define i64 @fg() nounwind { +; PLAIN: define i64 @fg() #0 { ; PLAIN: %t = bitcast i64 ptrtoint (double* getelementptr ({ i1, double }* null, i64 0, i32 1) to i64) to i64 ; PLAIN: ret i64 %t ; PLAIN: } -; PLAIN: define i64 @fh() nounwind { +; PLAIN: define i64 @fh() #0 { ; PLAIN: %t = bitcast i64 ptrtoint (i1** getelementptr (i1** null, i32 1) to i64) to i64 ; PLAIN: ret i64 %t ; PLAIN: } -; PLAIN: define i64 @fi() nounwind { +; PLAIN: define i64 @fi() #0 { ; PLAIN: %t = bitcast i64 ptrtoint (i1** getelementptr ({ i1, i1* }* null, i64 0, i32 1) to i64) to i64 ; PLAIN: ret i64 %t ; PLAIN: } -; OPT: define i64 @fa() nounwind { +; OPT: define i64 @fa() #0 { ; OPT: ret i64 mul (i64 ptrtoint (double* getelementptr (double* null, i32 1) to i64), i64 2310) ; OPT: } -; OPT: define i64 @fb() nounwind { +; OPT: define i64 @fb() #0 { ; OPT: ret i64 ptrtoint (double* getelementptr ({ i1, double }* null, i64 0, i32 1) to i64) ; OPT: } -; OPT: define i64 @fc() nounwind { +; OPT: define i64 @fc() #0 { ; OPT: ret i64 mul (i64 ptrtoint (double* getelementptr (double* null, i32 1) to i64), i64 2) ; OPT: } -; OPT: define i64 @fd() nounwind { +; OPT: define i64 @fd() #0 { ; OPT: ret i64 mul (i64 ptrtoint (double* getelementptr (double* null, i32 1) to i64), i64 11) ; OPT: } -; OPT: define i64 @fe() nounwind { +; OPT: define i64 @fe() #0 { ; OPT: ret i64 ptrtoint (double* getelementptr ({ double, float, double, double }* null, i64 0, i32 2) to i64) ; OPT: } -; OPT: define i64 @ff() nounwind { +; OPT: define i64 @ff() #0 { ; OPT: ret i64 1 ; OPT: } -; OPT: define i64 @fg() nounwind { +; OPT: define i64 @fg() #0 { ; OPT: ret i64 ptrtoint (double* getelementptr ({ i1, double }* null, i64 0, i32 1) to i64) ; OPT: } -; OPT: define i64 @fh() nounwind { +; OPT: define i64 @fh() #0 { ; OPT: ret i64 ptrtoint (i1** getelementptr (i1** null, i32 1) to i64) ; OPT: } -; OPT: define i64 @fi() nounwind { +; OPT: define i64 @fi() #0 { ; OPT: ret i64 ptrtoint (i1** getelementptr ({ i1, i1* }* null, i64 0, i32 1) to i64) ; OPT: } -; TO: define i64 @fa() nounwind { +; TO: define i64 @fa() #0 { ; TO: ret i64 18480 ; TO: } -; TO: define i64 @fb() nounwind { +; TO: define i64 @fb() #0 { ; TO: ret i64 8 ; TO: } -; TO: define i64 @fc() nounwind { +; TO: define i64 @fc() #0 { ; TO: ret i64 16 ; TO: } -; TO: define i64 @fd() nounwind { +; TO: define i64 @fd() #0 { ; TO: ret i64 88 ; TO: } -; TO: define i64 @fe() nounwind { +; TO: define i64 @fe() #0 { ; TO: ret i64 16 ; TO: } -; TO: define i64 @ff() nounwind { +; TO: define i64 @ff() #0 { ; TO: ret i64 1 ; TO: } -; TO: define i64 @fg() nounwind { +; TO: define i64 
@fg() #0 { ; TO: ret i64 8 ; TO: } -; TO: define i64 @fh() nounwind { +; TO: define i64 @fh() #0 { ; TO: ret i64 8 ; TO: } -; TO: define i64 @fi() nounwind { +; TO: define i64 @fi() #0 { ; TO: ret i64 8 ; TO: } ; SCEV: Classifying expressions for: @fa @@ -375,34 +375,34 @@ define i64 @fi() nounwind { ret i64 %t } -; PLAIN: define i64* @fM() nounwind { +; PLAIN: define i64* @fM() #0 { ; PLAIN: %t = bitcast i64* getelementptr (i64* null, i32 1) to i64* ; PLAIN: ret i64* %t ; PLAIN: } -; PLAIN: define i64* @fN() nounwind { +; PLAIN: define i64* @fN() #0 { ; PLAIN: %t = bitcast i64* getelementptr ({ i64, i64 }* null, i32 0, i32 1) to i64* ; PLAIN: ret i64* %t ; PLAIN: } -; PLAIN: define i64* @fO() nounwind { +; PLAIN: define i64* @fO() #0 { ; PLAIN: %t = bitcast i64* getelementptr ([2 x i64]* null, i32 0, i32 1) to i64* ; PLAIN: ret i64* %t ; PLAIN: } -; OPT: define i64* @fM() nounwind { +; OPT: define i64* @fM() #0 { ; OPT: ret i64* getelementptr (i64* null, i32 1) ; OPT: } -; OPT: define i64* @fN() nounwind { +; OPT: define i64* @fN() #0 { ; OPT: ret i64* getelementptr ({ i64, i64 }* null, i32 0, i32 1) ; OPT: } -; OPT: define i64* @fO() nounwind { +; OPT: define i64* @fO() #0 { ; OPT: ret i64* getelementptr ([2 x i64]* null, i32 0, i32 1) ; OPT: } -; TO: define i64* @fM() nounwind { +; TO: define i64* @fM() #0 { ; TO: ret i64* inttoptr (i64 8 to i64*) ; TO: } -; TO: define i64* @fN() nounwind { +; TO: define i64* @fN() #0 { ; TO: ret i64* inttoptr (i64 8 to i64*) ; TO: } -; TO: define i64* @fO() nounwind { +; TO: define i64* @fO() #0 { ; TO: ret i64* inttoptr (i64 8 to i64*) ; TO: } ; SCEV: Classifying expressions for: @fM @@ -428,14 +428,14 @@ define i64* @fO() nounwind { ret i64* %t } -; PLAIN: define i32* @fZ() nounwind { +; PLAIN: define i32* @fZ() #0 { ; PLAIN: %t = bitcast i32* getelementptr inbounds (i32* getelementptr inbounds ([3 x { i32, i32 }]* @ext, i64 0, i64 1, i32 0), i64 1) to i32* ; PLAIN: ret i32* %t ; PLAIN: } -; OPT: define i32* @fZ() nounwind { +; OPT: define i32* @fZ() #0 { ; OPT: ret i32* getelementptr (i32* getelementptr inbounds ([3 x { i32, i32 }]* @ext, i64 0, i64 1, i32 0), i64 1) ; OPT: } -; TO: define i32* @fZ() nounwind { +; TO: define i32* @fZ() #0 { ; TO: ret i32* getelementptr inbounds ([3 x { i32, i32 }]* @ext, i64 0, i64 1, i32 1) ; TO: } ; SCEV: Classifying expressions for: @fZ @@ -446,3 +446,5 @@ define i32* @fZ() nounwind { %t = bitcast i32* getelementptr inbounds (i32* getelementptr inbounds ([3 x { i32, i32 }]* @ext, i64 0, i64 1, i32 0), i64 1) to i32* ret i32* %t } + +; CHECK: attributes #0 = { nounwind } diff --git a/test/Scripts/coff-dump.py.bat b/test/Scripts/coff-dump.py.bat index 56428e1..56428e1 100644..100755 --- a/test/Scripts/coff-dump.py.bat +++ b/test/Scripts/coff-dump.py.bat diff --git a/test/Scripts/elf-dump.bat b/test/Scripts/elf-dump.bat index 9c70808..9c70808 100644..100755 --- a/test/Scripts/elf-dump.bat +++ b/test/Scripts/elf-dump.bat diff --git a/test/TableGen/Slice.td b/test/TableGen/Slice.td index cec9fb6..7a35d31 100644 --- a/test/TableGen/Slice.td +++ b/test/TableGen/Slice.td @@ -1,5 +1,4 @@ -// RUN: llvm-tblgen %s | grep "\[(set" | count 2 -// RUN: llvm-tblgen %s | grep "\[\]" | count 2 +// RUN: llvm-tblgen %s | FileCheck %s class ValueType<int size, int value> { int Size = size; @@ -85,3 +84,8 @@ multiclass myscalar<bits<8> opcode, string asmstr = "", list<list<dag>> patterns vscalar<opcode, asmstr, patterns>; defm NOT : myscalar<0x10, "not", [[], [(set FR32:$dst, (f32 (not FR32:$src)))]]>; + +// CHECK: Pattern = [(set 
FR32:$dst, (f32 (not FR32:$src)))]; +// CHECK: Pattern = []; +// CHECK: Pattern = [(set FR32:$dst, (f32 (not FR32:$src)))]; +// CHECK: Pattern = []; diff --git a/test/TableGen/math.td b/test/TableGen/math.td new file mode 100644 index 0000000..bde267a --- /dev/null +++ b/test/TableGen/math.td @@ -0,0 +1,18 @@ +// RUN: llvm-tblgen %s | FileCheck %s + +class Int<int value> { + int Value = value; +} + +def v1024 : Int<1024>; +// CHECK: def v1024 +// CHECK: Value = 1024 + +def v1025 : Int<!add(v1024.Value, 1)>; +// CHECK: def v1025 +// CHECK: Value = 1025 + +def v2048 : Int<!add(v1024.Value, v1024.Value)>; +// CHECK: def v2048 +// CHECK: Value = 2048 + diff --git a/test/Transforms/ArgumentPromotion/2008-02-01-ReturnAttrs.ll b/test/Transforms/ArgumentPromotion/2008-02-01-ReturnAttrs.ll index e740b29..1226b98 100644 --- a/test/Transforms/ArgumentPromotion/2008-02-01-ReturnAttrs.ll +++ b/test/Transforms/ArgumentPromotion/2008-02-01-ReturnAttrs.ll @@ -1,15 +1,19 @@ -; RUN: opt < %s -argpromotion -S | grep nounwind | count 2 +; RUN: opt < %s -argpromotion -S | FileCheck %s +; CHECK: define internal i32 @deref(i32 %x.val) #0 { define internal i32 @deref(i32* %x) nounwind { entry: - %tmp2 = load i32* %x, align 4 ; <i32> [#uses=1] - ret i32 %tmp2 + %tmp2 = load i32* %x, align 4 + ret i32 %tmp2 } define i32 @f(i32 %x) { entry: - %x_addr = alloca i32 ; <i32*> [#uses=2] - store i32 %x, i32* %x_addr, align 4 - %tmp1 = call i32 @deref( i32* %x_addr ) nounwind ; <i32> [#uses=1] - ret i32 %tmp1 + %x_addr = alloca i32 + store i32 %x, i32* %x_addr, align 4 +; CHECK: %tmp1 = call i32 @deref(i32 %x_addr.val) [[NUW:#[0-9]+]] + %tmp1 = call i32 @deref( i32* %x_addr ) nounwind + ret i32 %tmp1 } + +; CHECK: attributes [[NUW]] = { nounwind } diff --git a/test/Transforms/BBVectorize/X86/pr15289.ll b/test/Transforms/BBVectorize/X86/pr15289.ll new file mode 100644 index 0000000..07cc5d8 --- /dev/null +++ b/test/Transforms/BBVectorize/X86/pr15289.ll @@ -0,0 +1,98 @@ +; RUN: opt < %s -basicaa -bb-vectorize -disable-output +; This is a bugpoint-reduced test case. It did not always assert, but does reproduce the bug +; and running under valgrind (or some similar tool) will catch the error. 
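+; A note on the RUN line above: there are no FileCheck patterns because the
+; only pass/fail signal here is that -bb-vectorize finishes cleanly (or,
+; under valgrind, without invalid accesses); the IR output itself is
+; irrelevant, hence -disable-output.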
+ +target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" +target triple = "x86_64-apple-darwin12.2.0" + +%0 = type { [10 x { float, float }], [10 x { float, float }], [10 x { float, float }], [10 x { float, float }], [10 x { float, float }] } +%1 = type { [10 x [8 x i8]] } +%2 = type { i64, i64 } +%3 = type { [10 x i64], i64, i64, i64, i64, i64 } +%4 = type { i64, i64, i64, i64, i64, i64 } +%5 = type { [10 x i64] } +%6 = type { [10 x float], [10 x float], [10 x float], [10 x float] } +%struct.__st_parameter_dt.1.3.5.7 = type { %struct.__st_parameter_common.0.2.4.6, i64, i64*, i64*, i8*, i8*, i32, i32, i8*, i8*, i32, i32, i8*, [256 x i8], i32*, i64, i8*, i32, i32, i8*, i8*, i32, i32, i8*, i8*, i32, i32, i8*, i8*, i32, [4 x i8] } +%struct.__st_parameter_common.0.2.4.6 = type { i32, i32, i8*, i32, i32, i8*, i32* } + +@cctenso_ = external unnamed_addr global %0, align 32 +@ctenso_ = external unnamed_addr global %1, align 32 +@i_dim_ = external unnamed_addr global %2, align 16 +@itenso1_ = external unnamed_addr global %3, align 32 +@itenso2_ = external unnamed_addr global %4, align 32 +@ltenso_ = external unnamed_addr global %5, align 32 +@rtenso_ = external unnamed_addr global %6, align 32 +@.cst = external unnamed_addr constant [8 x i8], align 8 +@.cst1 = external unnamed_addr constant [3 x i8], align 8 +@.cst2 = external unnamed_addr constant [29 x i8], align 8 +@.cst3 = external unnamed_addr constant [32 x i8], align 64 + +define void @cart_to_dc2y_(double* noalias nocapture %xx, double* noalias nocapture %yy, double* noalias nocapture %zz, [5 x { double, double }]* noalias nocapture %c2ten) nounwind uwtable { +entry: + %0 = fmul double undef, undef + %1 = fmul double undef, undef + %2 = fadd double undef, undef + %3 = fmul double undef, 0x3FE8B8B76E3E9919 + %4 = fsub double %0, %1 + %5 = fsub double -0.000000e+00, undef + %6 = fmul double undef, undef + %7 = fmul double %4, %6 + %8 = fmul double undef, 2.000000e+00 + %9 = fmul double %8, undef + %10 = fmul double undef, %9 + %11 = fmul double %10, undef + %12 = fsub double undef, %7 + %13 = fmul double %3, %12 + %14 = fmul double %3, undef + %15 = getelementptr inbounds [5 x { double, double }]* %c2ten, i64 0, i64 0, i32 0 + store double %13, double* %15, align 8, !tbaa !0 + %16 = getelementptr inbounds [5 x { double, double }]* %c2ten, i64 0, i64 0, i32 1 + %17 = fmul double undef, %8 + %18 = fmul double %17, undef + %19 = fmul double undef, %18 + %20 = fadd double undef, undef + %21 = fmul double %3, %19 + %22 = fsub double -0.000000e+00, %21 + %23 = getelementptr inbounds [5 x { double, double }]* %c2ten, i64 0, i64 1, i32 0 + store double %22, double* %23, align 8, !tbaa !0 + %24 = getelementptr inbounds [5 x { double, double }]* %c2ten, i64 0, i64 1, i32 1 + %25 = fmul double undef, 0x3FE42F601A8C6794 + %26 = fmul double undef, 2.000000e+00 + %27 = fsub double %26, %0 + %28 = fmul double %6, undef + %29 = fsub double undef, %28 + %30 = getelementptr inbounds [5 x { double, double }]* %c2ten, i64 0, i64 2, i32 0 + store double undef, double* %30, align 8, !tbaa !0 + %31 = getelementptr inbounds [5 x { double, double }]* %c2ten, i64 0, i64 2, i32 1 + %32 = fmul double undef, %17 + %33 = fmul double undef, %17 + %34 = fmul double undef, %32 + %35 = fmul double undef, %33 + %36 = fsub double undef, %35 + %37 = fmul double %3, %34 + %38 = getelementptr inbounds [5 x { double, double }]* %c2ten, i64 0, i64 
3, i32 0 + store double %37, double* %38, align 8, !tbaa !0 + %39 = getelementptr inbounds [5 x { double, double }]* %c2ten, i64 0, i64 3, i32 1 + %40 = fmul double undef, %8 + %41 = fmul double undef, %40 + %42 = fmul double undef, %41 + %43 = fsub double undef, %42 + %44 = fmul double %3, %43 + %45 = getelementptr inbounds [5 x { double, double }]* %c2ten, i64 0, i64 4, i32 0 + store double %13, double* %45, align 8, !tbaa !0 + %46 = getelementptr inbounds [5 x { double, double }]* %c2ten, i64 0, i64 4, i32 1 + %47 = fsub double -0.000000e+00, %14 + store double %47, double* %16, align 8, !tbaa !0 + store double undef, double* %24, align 8, !tbaa !0 + store double -0.000000e+00, double* %31, align 8, !tbaa !0 + store double undef, double* %39, align 8, !tbaa !0 + store double undef, double* %46, align 8, !tbaa !0 + ret void +} + +attributes #0 = { nounwind uwtable } +attributes #1 = { nounwind readnone } +attributes #2 = { nounwind } + +!0 = metadata !{metadata !"alias set 17: real(kind=8)", metadata !1} +!1 = metadata !{metadata !1} diff --git a/test/Transforms/BBVectorize/simple-int.ll b/test/Transforms/BBVectorize/simple-int.ll index d7b7d6b..e4d5152 100644 --- a/test/Transforms/BBVectorize/simple-int.ll +++ b/test/Transforms/BBVectorize/simple-int.ll @@ -124,8 +124,10 @@ define double @test4(double %A1, double %A2, double %B1, double %B2, i32 %P) { ; CHECK: ret double %R } -; CHECK: declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) nounwind readnone -; CHECK: declare <2 x double> @llvm.fmuladd.v2f64(<2 x double>, <2 x double>, <2 x double>) nounwind readnone -; CHECK: declare <2 x double> @llvm.cos.v2f64(<2 x double>) nounwind readonly -; CHECK: declare <2 x double> @llvm.powi.v2f64(<2 x double>, i32) nounwind readonly +; CHECK: declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) #0 +; CHECK: declare <2 x double> @llvm.fmuladd.v2f64(<2 x double>, <2 x double>, <2 x double>) #0 +; CHECK: declare <2 x double> @llvm.cos.v2f64(<2 x double>) #1 +; CHECK: declare <2 x double> @llvm.powi.v2f64(<2 x double>, i32) #1 +; CHECK: attributes #0 = { nounwind readnone } +; CHECK: attributes #1 = { nounwind readonly } diff --git a/test/Transforms/ConstProp/2007-11-23-cttz.ll b/test/Transforms/ConstProp/2007-11-23-cttz.ll index 6d34cb1..c5ee70c 100644 --- a/test/Transforms/ConstProp/2007-11-23-cttz.ll +++ b/test/Transforms/ConstProp/2007-11-23-cttz.ll @@ -3,6 +3,6 @@ declare i13 @llvm.cttz.i13(i13, i1) define i13 @test() { - %X = call i13 @llvm.cttz.i13(i13 0, i1 true) + %X = call i13 @llvm.cttz.i13(i13 0, i1 false) ret i13 %X } diff --git a/test/Transforms/CorrelatedValuePropagation/basic.ll b/test/Transforms/CorrelatedValuePropagation/basic.ll index 475cd8d..39c437c 100644 --- a/test/Transforms/CorrelatedValuePropagation/basic.ll +++ b/test/Transforms/CorrelatedValuePropagation/basic.ll @@ -81,6 +81,26 @@ LessThanOrEqualToTwo: ret i32 0 } +declare i32* @f(i32*) +define void @test5(i32* %x, i32* %y) { +; CHECK: @test5 +entry: + %pre = icmp eq i32* %x, null + br i1 %pre, label %return, label %loop + +loop: + %phi = phi i32* [ %sel, %loop ], [ %x, %entry ] +; CHECK: %phi = phi i32* [ %f, %loop ], [ %x, %entry ] + %f = tail call i32* @f(i32* %phi) + %cmp1 = icmp ne i32* %f, %y + %sel = select i1 %cmp1, i32* %f, i32* null + %cmp2 = icmp eq i32* %sel, null + br i1 %cmp2, label %return, label %loop + +return: + ret void +} + define i32 @switch1(i32 %s) { ; CHECK: @switch1 entry: @@ -105,7 +125,7 @@ negative: ] out: - %p = phi i32 [ 1, %entry ], [ -1, 
%negative ], [ -1, %negative ], [ -1, %negative ], [ -1, %negative ] + %p = phi i32 [ 1, %entry ], [ -1, %negative ], [ -1, %negative ], [ -1, %negative ], [ -1, %negative ], [ -1, %negative ] ret i32 %p next: diff --git a/test/Transforms/DeadArgElim/2007-12-20-ParamAttrs.ll b/test/Transforms/DeadArgElim/2007-12-20-ParamAttrs.ll index 7c6c575..f049265 100644 --- a/test/Transforms/DeadArgElim/2007-12-20-ParamAttrs.ll +++ b/test/Transforms/DeadArgElim/2007-12-20-ParamAttrs.ll @@ -1,20 +1,20 @@ -; RUN: opt < %s -deadargelim -S > %t -; RUN: cat %t | grep nounwind | count 2 -; RUN: cat %t | grep signext | count 2 -; RUN: cat %t | not grep inreg -; RUN: cat %t | not grep zeroext -; RUN: cat %t | not grep byval +; RUN: opt < %s -deadargelim -S | FileCheck %s - %struct = type { } +%struct = type { } @g = global i8 0 +; CHECK: define internal void @foo(i8 signext %y) [[NUW:#[0-9]+]] + define internal zeroext i8 @foo(i8* inreg %p, i8 signext %y, ... ) nounwind { - store i8 %y, i8* @g - ret i8 0 + store i8 %y, i8* @g + ret i8 0 } define i32 @bar() { - %A = call zeroext i8(i8*, i8, ...)* @foo(i8* inreg null, i8 signext 1, %struct* byval null ) nounwind - ret i32 0 +; CHECK: call void @foo(i8 signext 1) [[NUW]] + %A = call zeroext i8(i8*, i8, ...)* @foo(i8* inreg null, i8 signext 1, %struct* byval null ) nounwind + ret i32 0 } + +; CHECK: attributes [[NUW]] = { nounwind } diff --git a/test/Transforms/DeadArgElim/2010-04-30-DbgInfo.ll b/test/Transforms/DeadArgElim/2010-04-30-DbgInfo.ll index 2f820ba..f5d2588 100644 --- a/test/Transforms/DeadArgElim/2010-04-30-DbgInfo.ll +++ b/test/Transforms/DeadArgElim/2010-04-30-DbgInfo.ll @@ -8,14 +8,14 @@ entry: call void @llvm.dbg.value(metadata !{i32 %len}, i64 0, metadata !10) call void @llvm.dbg.value(metadata !{i32 %hash}, i64 0, metadata !11) call void @llvm.dbg.value(metadata !{i32 %flags}, i64 0, metadata !12) -; CHECK: call fastcc i8* @add_name_internal(i8* %name, i32 %hash) nounwind, !dbg !13 +; CHECK: call fastcc i8* @add_name_internal(i8* %name, i32 %hash) [[NUW:#[0-9]+]], !dbg !13 %0 = call fastcc i8* @add_name_internal(i8* %name, i32 %len, i32 %hash, i8 zeroext 0, i32 %flags) nounwind, !dbg !13 ; <i8*> [#uses=1] ret i8* %0, !dbg !13 } declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone -define internal fastcc i8* @add_name_internal(i8* %name, i32 %len, i32 %hash, i8 zeroext %extra, i32 %flags) nounwind noinline ssp { +define internal fastcc i8* @add_name_internal(i8* %name, i32 %len, i32 %hash, i8 zeroext %extra, i32 %flags) noinline nounwind ssp { entry: call void @llvm.dbg.value(metadata !{i8* %name}, i64 0, metadata !15) call void @llvm.dbg.value(metadata !{i32 %len}, i64 0, metadata !20) @@ -38,6 +38,11 @@ bb2: ; preds = %bb1, %bb declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone +; CHECK: attributes #0 = { nounwind ssp } +; CHECK: attributes #1 = { nounwind readnone } +; CHECK: attributes #2 = { noinline nounwind ssp } +; CHECK: attributes [[NUW]] = { nounwind } + !0 = metadata !{i32 524545, metadata !1, metadata !"name", metadata !2, i32 8, metadata !6} ; [ DW_TAG_arg_variable ] !1 = metadata !{i32 524334, i32 0, metadata !2, metadata !"vfs_addname", metadata !"vfs_addname", metadata !"vfs_addname", metadata !2, i32 12, metadata !4, i1 false, i1 true, i32 0, i32 0, null, i1 false} ; [ DW_TAG_subprogram ] !2 = metadata !{i32 524329, metadata !"tail.c", metadata !"/Users/echeng/LLVM/radars/r7927803/", metadata !3} ; [ DW_TAG_file_type ] diff --git a/test/Transforms/DeadArgElim/dbginfo.ll 
b/test/Transforms/DeadArgElim/dbginfo.ll index b07b60d..59eb458 100644 --- a/test/Transforms/DeadArgElim/dbginfo.ll +++ b/test/Transforms/DeadArgElim/dbginfo.ll @@ -37,13 +37,11 @@ entry: !llvm.dbg.cu = !{!0} !0 = metadata !{i32 786449, i32 0, i32 4, metadata !"test.cc", metadata !"/home/samsonov/tmp/clang-di", metadata !"clang version 3.2 (trunk 165305)", i1 true, i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1} ; [ DW_TAG_compile_unit ] [/home/samsonov/tmp/clang-di/test.cc] [DW_LANG_C_plus_plus] -!1 = metadata !{metadata !2} -!2 = metadata !{i32 0} -!3 = metadata !{metadata !4} -!4 = metadata !{metadata !5, metadata !8, metadata !9} +!1 = metadata !{i32 0} +!3 = metadata !{metadata !5, metadata !8, metadata !9} !5 = metadata !{i32 786478, i32 0, metadata !6, metadata !"run", metadata !"run", metadata !"", metadata !6, i32 8, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void ()* @_Z3runv, null, null, metadata !1, i32 8} ; [ DW_TAG_subprogram ] [line 8] [def] [run] !6 = metadata !{i32 786473, metadata !"test.cc", metadata !"/home/samsonov/tmp/clang-di", null} ; [ DW_TAG_file_type ] -!7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !2, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] +!7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !1, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] !8 = metadata !{i32 786478, i32 0, metadata !6, metadata !"dead_vararg", metadata !"dead_vararg", metadata !"", metadata !6, i32 5, metadata !7, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (...)* @_ZN12_GLOBAL__N_111dead_varargEz, null, null, metadata !1, i32 5} ; [ DW_TAG_subprogram ] [line 5] [local] [def] [dead_vararg] ; CHECK: metadata !"dead_vararg"{{.*}}void ()* @_ZN12_GLOBAL__N_111dead_varargEz diff --git a/test/Transforms/DeadArgElim/keepalive.ll b/test/Transforms/DeadArgElim/keepalive.ll index dc92dc9..e41110c 100644 --- a/test/Transforms/DeadArgElim/keepalive.ll +++ b/test/Transforms/DeadArgElim/keepalive.ll @@ -1,6 +1,4 @@ -; RUN: opt < %s -deadargelim -S > %t -; RUN: grep "define internal zeroext i32 @test1() nounwind" %t -; RUN: grep "define internal <{ i32, i32 }> @test2" %t +; RUN: opt < %s -deadargelim -S | FileCheck %s %Ty = type <{ i32, i32 }> @@ -9,11 +7,13 @@ ; the function and then changing too much. 
; This checks if the return value attributes are not removed +; CHECK: define internal zeroext i32 @test1() #0 define internal zeroext i32 @test1(i32 %DEADARG1) nounwind { ret i32 1 } ; This checks if the struct doesn't get non-packed +; CHECK: define internal <{ i32, i32 }> @test2 define internal <{ i32, i32 }> @test2(i32 %DEADARG1) { ret <{ i32, i32 }> <{ i32 1, i32 2 }> } @@ -28,3 +28,4 @@ define void @caller() { ret void } +; CHECK: attributes #0 = { nounwind } diff --git a/test/Transforms/FunctionAttrs/2008-09-03-ReadNone.ll b/test/Transforms/FunctionAttrs/2008-09-03-ReadNone.ll index 946453f..36a7658 100644 --- a/test/Transforms/FunctionAttrs/2008-09-03-ReadNone.ll +++ b/test/Transforms/FunctionAttrs/2008-09-03-ReadNone.ll @@ -1,18 +1,24 @@ -; RUN: opt < %s -basicaa -functionattrs -S | grep readnone | count 4 +; RUN: opt < %s -basicaa -functionattrs -S | FileCheck %s @x = global i32 0 +; CHECK: declare i32 @e() #0 declare i32 @e() readnone +; CHECK: define i32 @f() #0 define i32 @f() { %tmp = call i32 @e( ) ; <i32> [#uses=1] ret i32 %tmp } +; CHECK: define i32 @g() #0 define i32 @g() readonly { ret i32 0 } +; CHECK: define i32 @h() #0 define i32 @h() readnone { %tmp = load i32* @x ; <i32> [#uses=1] ret i32 %tmp } + +; CHECK: attributes #0 = { readnone } diff --git a/test/Transforms/FunctionAttrs/2008-09-03-ReadOnly.ll b/test/Transforms/FunctionAttrs/2008-09-03-ReadOnly.ll index 22eca13..d8256ae 100644 --- a/test/Transforms/FunctionAttrs/2008-09-03-ReadOnly.ll +++ b/test/Transforms/FunctionAttrs/2008-09-03-ReadOnly.ll @@ -1,9 +1,13 @@ -; RUN: opt < %s -basicaa -functionattrs -S | grep readonly | count 2 +; RUN: opt < %s -basicaa -functionattrs -S | FileCheck %s +; CHECK: define i32 @f() #0 define i32 @f() { entry: - %tmp = call i32 @e( ) ; <i32> [#uses=1] - ret i32 %tmp + %tmp = call i32 @e( ) + ret i32 %tmp } +; CHECK: declare i32 @e() #0 declare i32 @e() readonly + +; CHECK: attributes #0 = { readonly } diff --git a/test/Transforms/FunctionAttrs/atomic.ll b/test/Transforms/FunctionAttrs/atomic.ll index 7c2bff7..027ee0f 100644 --- a/test/Transforms/FunctionAttrs/atomic.ll +++ b/test/Transforms/FunctionAttrs/atomic.ll @@ -3,7 +3,7 @@ ; Atomic load/store to local doesn't affect whether a function is ; readnone/readonly. define i32 @test1(i32 %x) uwtable ssp { -; CHECK: define i32 @test1(i32 %x) uwtable readnone ssp { +; CHECK: define i32 @test1(i32 %x) #0 { entry: %x.addr = alloca i32, align 4 store atomic i32 %x, i32* %x.addr seq_cst, align 4 @@ -13,9 +13,11 @@ entry: ; A function with an Acquire load is not readonly. 
define i32 @test2(i32* %x) uwtable ssp { -; CHECK: define i32 @test2(i32* nocapture %x) uwtable ssp { +; CHECK: define i32 @test2(i32* nocapture %x) #1 { entry: %r = load atomic i32* %x seq_cst, align 4 ret i32 %r } +; CHECK: attributes #0 = { readnone ssp uwtable } +; CHECK: attributes #1 = { ssp uwtable } diff --git a/test/Transforms/FunctionAttrs/noreturn.ll b/test/Transforms/FunctionAttrs/noreturn.ll new file mode 100644 index 0000000..470ebcb --- /dev/null +++ b/test/Transforms/FunctionAttrs/noreturn.ll @@ -0,0 +1,18 @@ +; RUN: opt < %s -functionattrs -instcombine -S | FileCheck %s + +define void @endless_loop() noreturn nounwind readnone ssp uwtable { +entry: + br label %while.body + +while.body: + br label %while.body +} +;CHECK: @main +;CHECK: endless_loop +;CHECK: ret +define i32 @main() noreturn nounwind ssp uwtable { +entry: + tail call void @endless_loop() + unreachable +} + diff --git a/test/Transforms/GlobalOpt/externally-initialized-global-ctr.ll b/test/Transforms/GlobalOpt/externally-initialized-global-ctr.ll new file mode 100644 index 0000000..9295c20 --- /dev/null +++ b/test/Transforms/GlobalOpt/externally-initialized-global-ctr.ll @@ -0,0 +1,35 @@ +; RUN: opt < %s -globalopt -S | FileCheck %s +; rdar://12580965. +; ObjC++ test case. + +%struct.ButtonInitData = type { i8* } + +@_ZL14buttonInitData = internal global [1 x %struct.ButtonInitData] zeroinitializer, align 4 + +@"\01L_OBJC_METH_VAR_NAME_40" = internal global [7 x i8] c"print:\00", section "__TEXT,__objc_methname,cstring_literals", align 1 +@"\01L_OBJC_SELECTOR_REFERENCES_41" = internal externally_initialized global i8* getelementptr inbounds ([7 x i8]* @"\01L_OBJC_METH_VAR_NAME_40", i32 0, i32 0), section "__DATA, __objc_selrefs, literal_pointers, no_dead_strip" + +@llvm.global_ctors = appending global [1 x { i32, void ()* }] [{ i32, void ()* } { i32 65535, void ()* @_GLOBAL__I_a }] +@llvm.used = appending global [2 x i8*] [i8* getelementptr inbounds ([7 x i8]* @"\01L_OBJC_METH_VAR_NAME_40", i32 0, i32 0), i8* bitcast (i8** @"\01L_OBJC_SELECTOR_REFERENCES_41" to i8*)] + +define internal void @__cxx_global_var_init() section "__TEXT,__StaticInit,regular,pure_instructions" { + %1 = load i8** @"\01L_OBJC_SELECTOR_REFERENCES_41", !invariant.load !2009 + store i8* %1, i8** getelementptr inbounds ([1 x %struct.ButtonInitData]* @_ZL14buttonInitData, i32 0, i32 0, i32 0), align 4 + ret void +} + +define internal void @_GLOBAL__I_a() section "__TEXT,__StaticInit,regular,pure_instructions" { + call void @__cxx_global_var_init() + ret void +} + +declare void @test(i8*) + +define void @print() { +; CHECK: %1 = load i8** getelementptr inbounds ([1 x %struct.ButtonInitData]* @_ZL14buttonInitData, i32 0, i32 0, i32 0), align 4 + %1 = load i8** getelementptr inbounds ([1 x %struct.ButtonInitData]* @_ZL14buttonInitData, i32 0, i32 0, i32 0), align 4 + call void @test(i8* %1) + ret void +} + +!2009 = metadata !{} diff --git a/test/Transforms/GlobalOpt/integer-bool.ll b/test/Transforms/GlobalOpt/integer-bool.ll index 5a34a9c..cf025ec 100644 --- a/test/Transforms/GlobalOpt/integer-bool.ll +++ b/test/Transforms/GlobalOpt/integer-bool.ll @@ -1,23 +1,28 @@ -; RUN: opt < %s -globalopt -instcombine | \ -; RUN: llvm-dis | grep "ret i1 true" - +; RUN: opt < %s -S -globalopt -instcombine | FileCheck %s ;; check that global opt turns integers that only hold 0 or 1 into bools. 
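;; A rough sketch of the rewrite being checked (names taken from the CHECK
;; lines below): @G becomes a shadow boolean @G.b in the same address space,
;; stores of the two observed values become i1 stores, e.g.
;;   store i32 1, i32 addrspace(1)* @G  -->  store i1 true (to @G.b)
;; and each load is rebuilt from the i1, so instcombine can prove that
;; "icmp slt i32 %A, 2" is always true.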
-@G = internal global i32 0 ; <i32*> [#uses=3] +@G = internal addrspace(1) global i32 0 +; CHECK @G.b +; CHECK addrspace(1) +; CHECK global i1 0 define void @set1() { - store i32 0, i32* @G - ret void + store i32 0, i32 addrspace(1)* @G +; CHECK: store i1 false + ret void } define void @set2() { - store i32 1, i32* @G - ret void + store i32 1, i32 addrspace(1)* @G +; CHECK: store i1 true + ret void } define i1 @get() { - %A = load i32* @G ; <i32> [#uses=1] - %C = icmp slt i32 %A, 2 ; <i1> [#uses=1] - ret i1 %C +; CHECK @get + %A = load i32 addrspace(1) * @G + %C = icmp slt i32 %A, 2 + ret i1 %C +; CHECK: ret i1 true } diff --git a/test/Transforms/IPConstantProp/user-with-multiple-uses.ll b/test/Transforms/IPConstantProp/user-with-multiple-uses.ll index 402ea41..9687180 100644 --- a/test/Transforms/IPConstantProp/user-with-multiple-uses.ll +++ b/test/Transforms/IPConstantProp/user-with-multiple-uses.ll @@ -4,9 +4,9 @@ ; IPSCCP should propagate the 0 argument, eliminate the switch, and propagate ; the result. -; CHECK: define i32 @main() noreturn nounwind { +; CHECK: define i32 @main() #0 { ; CHECK-NEXT: entry: -; CHECK-NEXT: %call2 = tail call i32 @wwrite(i64 0) nounwind +; CHECK-NEXT: %call2 = tail call i32 @wwrite(i64 0) [[NUW:#[0-9]+]] ; CHECK-NEXT: ret i32 123 define i32 @main() noreturn nounwind { @@ -28,3 +28,7 @@ sw.default: return: ret i32 0 } + +; CHECK: attributes #0 = { noreturn nounwind } +; CHECK: attributes #1 = { nounwind readnone } +; CHECK: attributes [[NUW]] = { nounwind } diff --git a/test/Transforms/Inline/inline_invoke.ll b/test/Transforms/Inline/inline_invoke.ll index 9f5f670..c53bb5a 100644 --- a/test/Transforms/Inline/inline_invoke.ll +++ b/test/Transforms/Inline/inline_invoke.ll @@ -330,7 +330,7 @@ terminate: ; CHECK-NEXT: br label %[[JOIN]] ; CHECK: [[JOIN]]: ; CHECK-NEXT: phi { i8*, i32 } -; CHECK-NEXT: call void @opaque() nounwind +; CHECK-NEXT: call void @opaque() [[NUW:#[0-9]+]] ; CHECK-NEXT: br label %[[FIX:[^\s]+]] ; CHECK: lpad: ; CHECK-NEXT: landingpad { i8*, i32 } personality i32 (...)* @__gxx_personality_v0 @@ -340,3 +340,8 @@ terminate: ; CHECK-NEXT: [[T1:%.*]] = phi i32 [ 0, %[[JOIN]] ], [ 1, %lpad ] ; CHECK-NEXT: call void @use(i32 [[T1]]) ; CHECK-NEXT: call void @_ZSt9terminatev() + +; CHECK: attributes [[NUW]] = { nounwind } +; CHECK: attributes #1 = { nounwind readnone } +; CHECK: attributes #2 = { ssp uwtable } +; CHECK: attributes #3 = { noreturn nounwind } diff --git a/test/Transforms/Inline/inline_ssp.ll b/test/Transforms/Inline/inline_ssp.ll new file mode 100644 index 0000000..a4b43a7 --- /dev/null +++ b/test/Transforms/Inline/inline_ssp.ll @@ -0,0 +1,160 @@ +; RUN: opt -inline %s -S | FileCheck %s +; Ensure SSP attributes are propagated correctly when inlining. + +@.str = private unnamed_addr constant [11 x i8] c"fun_nossp\0A\00", align 1 +@.str1 = private unnamed_addr constant [9 x i8] c"fun_ssp\0A\00", align 1 +@.str2 = private unnamed_addr constant [15 x i8] c"fun_sspstrong\0A\00", align 1 +@.str3 = private unnamed_addr constant [12 x i8] c"fun_sspreq\0A\00", align 1 + +; These first four functions (@fun_sspreq, @fun_sspstrong, @fun_ssp, @fun_nossp) +; are used by the remaining functions to ensure that the SSP attributes are +; propagated correctly. 
The caller should have its SSP attribute set as: +; strictest(caller-ssp-attr, callee-ssp-attr), where strictness is ordered as: +; sspreq > sspstrong > ssp > [no ssp] +define internal void @fun_sspreq() nounwind sspreq uwtable { +entry: + %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([12 x i8]* @.str3, i32 0, i32 0)) + ret void +} + +define internal void @fun_sspstrong() nounwind sspstrong uwtable { +entry: + %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([15 x i8]* @.str2, i32 0, i32 0)) + ret void +} + +define internal void @fun_ssp() nounwind ssp uwtable { +entry: + %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([9 x i8]* @.str1, i32 0, i32 0)) + ret void +} + +define internal void @fun_nossp() nounwind uwtable { +entry: + %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([11 x i8]* @.str, i32 0, i32 0)) + ret void +} + +; Tests start below + +define void @inline_req_req() nounwind sspreq uwtable { +entry: +; CHECK: @inline_req_req() #0 + call void @fun_sspreq() + ret void +} + +define void @inline_req_strong() nounwind sspstrong uwtable { +entry: +; CHECK: @inline_req_strong() #0 + call void @fun_sspreq() + ret void +} + +define void @inline_req_ssp() nounwind ssp uwtable { +entry: +; CHECK: @inline_req_ssp() #0 + call void @fun_sspreq() + ret void +} + +define void @inline_req_nossp() nounwind uwtable { +entry: +; CHECK: @inline_req_nossp() #0 + call void @fun_sspreq() + ret void +} + +define void @inline_strong_req() nounwind sspreq uwtable { +entry: +; CHECK: @inline_strong_req() #0 + call void @fun_sspstrong() + ret void +} + + +define void @inline_strong_strong() nounwind sspstrong uwtable { +entry: +; CHECK: @inline_strong_strong() #1 + call void @fun_sspstrong() + ret void +} + +define void @inline_strong_ssp() nounwind ssp uwtable { +entry: +; CHECK: @inline_strong_ssp() #1 + call void @fun_sspstrong() + ret void +} + +define void @inline_strong_nossp() nounwind uwtable { +entry: +; CHECK: @inline_strong_nossp() #1 + call void @fun_sspstrong() + ret void +} + +define void @inline_ssp_req() nounwind sspreq uwtable { +entry: +; CHECK: @inline_ssp_req() #0 + call void @fun_ssp() + ret void +} + + +define void @inline_ssp_strong() nounwind sspstrong uwtable { +entry: +; CHECK: @inline_ssp_strong() #1 + call void @fun_ssp() + ret void +} + +define void @inline_ssp_ssp() nounwind ssp uwtable { +entry: +; CHECK: @inline_ssp_ssp() #2 + call void @fun_ssp() + ret void +} + +define void @inline_ssp_nossp() nounwind uwtable { +entry: +; CHECK: @inline_ssp_nossp() #2 + call void @fun_ssp() + ret void +} + +define void @inline_nossp_req() nounwind uwtable sspreq { +entry: +; CHECK: @inline_nossp_req() #0 + call void @fun_nossp() + ret void +} + + +define void @inline_nossp_strong() nounwind sspstrong uwtable { +entry: +; CHECK: @inline_nossp_strong() #1 + call void @fun_nossp() + ret void +} + +define void @inline_nossp_ssp() nounwind ssp uwtable { +entry: +; CHECK: @inline_nossp_ssp() #2 + call void @fun_nossp() + ret void +} + +define void @inline_nossp_nossp() nounwind uwtable { +entry: +; CHECK: @inline_nossp_nossp() #3 + call void @fun_nossp() + ret void +} + +declare i32 @printf(i8*, ...) 
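+; A worked instance of the strictest() rule: @inline_req_nossp above is
+; defined with no ssp attribute at all, yet it inlines the sspreq callee
+; @fun_sspreq, and strictest(no-ssp, sspreq) = sspreq, so the caller must
+; match attribute group #0 ({ nounwind sspreq uwtable }) rather than #3.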
+ +; CHECK: attributes #0 = { nounwind sspreq uwtable } +; CHECK: attributes #1 = { nounwind sspstrong uwtable } +; CHECK: attributes #2 = { nounwind ssp uwtable } +; CHECK: attributes #3 = { nounwind uwtable } diff --git a/test/Transforms/InstCombine/2012-04-23-Neon-Intrinsics.ll b/test/Transforms/InstCombine/2012-04-23-Neon-Intrinsics.ll index 0907c49..2dedd44 100644 --- a/test/Transforms/InstCombine/2012-04-23-Neon-Intrinsics.ll +++ b/test/Transforms/InstCombine/2012-04-23-Neon-Intrinsics.ll @@ -50,7 +50,7 @@ entry: %b = add <4 x i32> zeroinitializer, %a ret <4 x i32> %b ; CHECK: entry: -; CHECK-NEXT: %a = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> <i16 2, i16 2, i16 2, i16 2>, <4 x i16> %x) nounwind +; CHECK-NEXT: %a = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> <i16 2, i16 2, i16 2, i16 2>, <4 x i16> %x) [[NUW:#[0-9]+]] ; CHECK-NEXT: ret <4 x i32> %a } @@ -66,3 +66,7 @@ entry: declare <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16>, <4 x i16>) nounwind readnone declare <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16>, <4 x i16>) nounwind readnone + +; CHECK: attributes #0 = { nounwind readnone ssp } +; CHECK: attributes #1 = { nounwind readnone } +; CHECK: attributes [[NUW]] = { nounwind } diff --git a/test/Transforms/InstCombine/bitcast-vector-fold.ll b/test/Transforms/InstCombine/bitcast-vector-fold.ll index 8feec22..8fd7f35 100644 --- a/test/Transforms/InstCombine/bitcast-vector-fold.ll +++ b/test/Transforms/InstCombine/bitcast-vector-fold.ll @@ -31,3 +31,8 @@ define <4 x i32> @test6() { %tmp3 = bitcast <2 x double> <double 0.5, double 1.0> to <4 x i32> ret <4 x i32> %tmp3 } + +define i32 @test7() { + %tmp3 = bitcast <2 x half> <half 0xH1100, half 0xH0011> to i32 + ret i32 %tmp3 +}
\ No newline at end of file diff --git a/test/Transforms/InstCombine/bitcast.ll b/test/Transforms/InstCombine/bitcast.ll index 8f6ae7d..1e61132 100644 --- a/test/Transforms/InstCombine/bitcast.ll +++ b/test/Transforms/InstCombine/bitcast.ll @@ -11,7 +11,7 @@ define i32 @test1(i64 %a) { %t3 = xor <2 x i32> %t1, %t2 %t4 = extractelement <2 x i32> %t3, i32 0 ret i32 %t4 - + ; CHECK: @test1 ; CHECK: ret i32 0 } @@ -30,7 +30,7 @@ define float @test2(<2 x float> %A, <2 x i32> %B) { %add = fadd float %tmp24, %tmp4 ret float %add - + ; CHECK: @test2 ; CHECK-NEXT: %tmp24 = extractelement <2 x float> %A, i32 0 ; CHECK-NEXT: bitcast <2 x i32> %B to <2 x float> @@ -55,7 +55,7 @@ define float @test3(<2 x float> %A, <2 x i64> %B) { %add = fadd float %tmp24, %tmp4 ret float %add - + ; CHECK: @test3 ; CHECK-NEXT: %tmp24 = extractelement <2 x float> %A, i32 1 ; CHECK-NEXT: bitcast <2 x i64> %B to <4 x float> @@ -75,7 +75,7 @@ define <2 x i32> @test4(i32 %A, i32 %B){ ; CHECK: @test4 ; CHECK-NEXT: insertelement <2 x i32> undef, i32 %A, i32 0 ; CHECK-NEXT: insertelement <2 x i32> {{.*}}, i32 %B, i32 1 - ; CHECK-NEXT: ret <2 x i32> + ; CHECK-NEXT: ret <2 x i32> } @@ -92,7 +92,7 @@ define <2 x float> @test5(float %A, float %B) { ; CHECK: @test5 ; CHECK-NEXT: insertelement <2 x float> undef, float %A, i32 0 ; CHECK-NEXT: insertelement <2 x float> {{.*}}, float %B, i32 1 - ; CHECK-NEXT: ret <2 x float> + ; CHECK-NEXT: ret <2 x float> } define <2 x float> @test6(float %A){ @@ -123,7 +123,7 @@ define i64 @Vec2(i64 %in) { } define i64 @All11(i64 %in) { - %out = and i64 %in, xor (i64 bitcast (<2 x float> bitcast (i64 -1 to <2 x float>) to i64), i64 -1) + %out = and i64 %in, xor (i64 bitcast (<2 x float> bitcast (i64 -1 to <2 x float>) to i64), i64 -1) ret i64 %out ; CHECK: @All11 ; CHECK: ret i64 0 @@ -131,9 +131,16 @@ define i64 @All11(i64 %in) { define i32 @All111(i32 %in) { - %out = and i32 %in, xor (i32 bitcast (<1 x float> bitcast (i32 -1 to <1 x float>) to i32), i32 -1) + %out = and i32 %in, xor (i32 bitcast (<1 x float> bitcast (i32 -1 to <1 x float>) to i32), i32 -1) ret i32 %out ; CHECK: @All111 ; CHECK: ret i32 0 } +define <2 x i16> @BitcastInsert(i32 %a) { + %v = insertelement <1 x i32> undef, i32 %a, i32 0 + %r = bitcast <1 x i32> %v to <2 x i16> + ret <2 x i16> %r +; CHECK: @BitcastInsert +; CHECK: bitcast i32 %a to <2 x i16> +} diff --git a/test/Transforms/InstCombine/constant-expr-datalayout.ll b/test/Transforms/InstCombine/constant-expr-datalayout.ll new file mode 100644 index 0000000..9a72c77 --- /dev/null +++ b/test/Transforms/InstCombine/constant-expr-datalayout.ll @@ -0,0 +1,12 @@ +; RUN: opt -instcombine %s -S -o - | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%test1.struct = type { i32, i32 } +@test1.aligned_glbl = global %test1.struct zeroinitializer, align 4 +define void @test1(i64 *%ptr) { + store i64 and (i64 ptrtoint (i32* getelementptr (%test1.struct* @test1.aligned_glbl, i32 0, i32 1) to i64), i64 3), i64* %ptr +; CHECK: store i64 0, i64* %ptr + ret void +} diff --git a/test/Transforms/InstCombine/exact.ll b/test/Transforms/InstCombine/exact.ll index 14741e3..88ca88c 100644 --- a/test/Transforms/InstCombine/exact.ll +++ b/test/Transforms/InstCombine/exact.ll @@ -99,9 +99,9 @@ define i1 @ashr_icmp2(i64 %X) nounwind { ; PR9998 ; Make sure we don't transform the ashr here into an sdiv ; CHECK: @pr9998 -; 
CHECK: = and i32 %V, 1 -; CHECK: %Z = icmp ne -; CHECK: ret i1 %Z +; CHECK: [[BIT:%[A-Za-z0-9.]+]] = and i32 %V, 1 +; CHECK-NEXT: [[CMP:%[A-Za-z0-9.]+]] = icmp ne i32 [[BIT]], 0 +; CHECK-NEXT: ret i1 [[CMP]] define i1 @pr9998(i32 %V) nounwind { entry: %W = shl i32 %V, 31 @@ -112,6 +112,7 @@ entry: } + ; CHECK: @udiv_icmp1 ; CHECK: icmp ne i64 %X, 0 define i1 @udiv_icmp1(i64 %X) nounwind { diff --git a/test/Transforms/InstCombine/fast-math.ll b/test/Transforms/InstCombine/fast-math.ll index 5d40d71..c97bd28 100644 --- a/test/Transforms/InstCombine/fast-math.ll +++ b/test/Transforms/InstCombine/fast-math.ll @@ -7,7 +7,7 @@ define float @fold(float %a) { %mul1 = fmul fast float %mul, 0x4002666660000000 ret float %mul1 ; CHECK: @fold -; CHECK: fmul float %a, 0x4006147AE0000000 +; CHECK: fmul fast float %a, 0x4006147AE0000000 } ; Same testing-case as the one used in fold() except that the operators have @@ -22,7 +22,7 @@ define float @notfold(float %a) { define float @fold2(float %a) { ; CHECK: @fold2 -; CHECK: fmul float %a, 0x4006147AE0000000 +; CHECK: fmul fast float %a, 0x4006147AE0000000 %mul = fmul float %a, 0x3FF3333340000000 %mul1 = fmul fast float %mul, 0x4002666660000000 ret float %mul1 @@ -54,7 +54,7 @@ define float @fold5(float %f1, float %f2) { %add1 = fadd fast float %add, 5.000000e+00 ret float %add1 ; CHECK: @fold5 -; CHECK: fadd float %f1, 9.000000e+00 +; CHECK: fadd fast float %f1, 9.000000e+00 } ; (X + X) + X => 3.0 * X @@ -97,17 +97,17 @@ define float @fold9(float %f1, float %f2) { } ; Let C3 = C1 + C2. (f1 + C1) + (f2 + C2) => (f1 + f2) + C3 instead of -; "(f1 + C3) + f2" or "(f2 + C3) + f1". Placing constant-addend at the +; "(f1 + C3) + f2" or "(f2 + C3) + f1". Placing constant-addend at the ; top of resulting simplified expression tree may potentially reveal some ; optimization opportunities in the super-expression trees. -; +; define float @fold10(float %f1, float %f2) { %t1 = fadd fast float 2.000000e+00, %f1 %t2 = fsub fast float %f2, 3.000000e+00 %t3 = fadd fast float %t1, %t2 ret float %t3 ; CHECK: @fold10 -; CHECK: %t3 = fadd float %t2, -1.000000e+00 +; CHECK: %t3 = fadd fast float %t2, -1.000000e+00 ; CHECK: ret float %t3 } @@ -130,37 +130,6 @@ define double @fail2(double %f1, double %f2) { ; CHECK: ret } -; rdar://12753946: x * cond ? 1.0 : 0.0 => cond ? 
x : 0.0 -define double @select1(i32 %cond, double %x, double %y) { - %tobool = icmp ne i32 %cond, 0 - %cond1 = select i1 %tobool, double 1.000000e+00, double 0.000000e+00 - %mul = fmul nnan nsz double %cond1, %x - %add = fadd double %mul, %y - ret double %add -; CHECK: @select1 -; CHECK: select i1 %tobool, double %x, double 0.000000e+00 -} - -define double @select2(i32 %cond, double %x, double %y) { - %tobool = icmp ne i32 %cond, 0 - %cond1 = select i1 %tobool, double 0.000000e+00, double 1.000000e+00 - %mul = fmul nnan nsz double %cond1, %x - %add = fadd double %mul, %y - ret double %add -; CHECK: @select2 -; CHECK: select i1 %tobool, double 0.000000e+00, double %x -} - -define double @select3(i32 %cond, double %x, double %y) { - %tobool = icmp ne i32 %cond, 0 - %cond1 = select i1 %tobool, double 0.000000e+00, double 2.000000e+00 - %mul = fmul nnan nsz double %cond1, %x - %add = fadd double %mul, %y - ret double %add -; CHECK: @select3 -; CHECK: fmul nnan nsz double %cond1, %x -} - ; ========================================================================= ; ; Testing-cases about fmul begin @@ -172,7 +141,7 @@ define float @fmul_distribute1(float %f1) { %t1 = fmul float %f1, 6.0e+3 %t2 = fadd float %t1, 2.0e+3 %t3 = fmul fast float %t2, 5.0e+3 - ret float %t3 + ret float %t3 ; CHECK: @fmul_distribute1 ; CHECK: %1 = fmul fast float %f1, 3.000000e+07 ; CHECK: %t3 = fadd fast float %1, 1.000000e+07 @@ -205,9 +174,9 @@ define double @fmul_distribute3(double %f1) { ; C1/X * C2 => (C1*C2) / X define float @fmul2(float %f1) { - %t1 = fdiv float 2.0e+3, %f1 + %t1 = fdiv float 2.0e+3, %f1 %t3 = fmul fast float %t1, 6.0e+3 - ret float %t3 + ret float %t3 ; CHECK: @fmul2 ; CHECK: fdiv fast float 1.200000e+07, %f1 } @@ -216,7 +185,7 @@ define float @fmul2(float %f1) { define float @fmul3(float %f1, float %f2) { %t1 = fdiv float %f1, 2.0e+3 %t3 = fmul fast float %t1, 6.0e+3 - ret float %t3 + ret float %t3 ; CHECK: @fmul3 ; CHECK: fmul fast float %f1, 3.000000e+00 } @@ -227,21 +196,146 @@ define float @fmul3(float %f1, float %f2) { define float @fmul4(float %f1, float %f2) { %t1 = fdiv float %f1, 2.0e+3 %t3 = fmul fast float %t1, 0x3810000000000000 - ret float %t3 + ret float %t3 ; CHECK: @fmul4 ; CHECK: fmul fast float %t1, 0x3810000000000000 } -; X / C1 * C2 => X / (C2/C1) if C1/C2 is either a special value of a denormal, +; X / C1 * C2 => X / (C2/C1) if C1/C2 is either a special value of a denormal, ; and C2/C1 is a normal value. 
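; Concretely for @fmul5 below: the fmul constant 0x3810000000000000 is
; single-precision FLT_MIN (2^-126), and 3.0 divided by it is
; 3.0 * 2^126 = 0x47E8000000000000, the normal-valued divisor in the CHECK.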
-; +; define float @fmul5(float %f1, float %f2) { %t1 = fdiv float %f1, 3.0e+0 %t3 = fmul fast float %t1, 0x3810000000000000 - ret float %t3 + ret float %t3 ; CHECK: @fmul5 ; CHECK: fdiv fast float %f1, 0x47E8000000000000 } +; (X*Y) * X => (X*X) * Y +define float @fmul6(float %f1, float %f2) { + %mul = fmul float %f1, %f2 + %mul1 = fmul fast float %mul, %f1 + ret float %mul1 +; CHECK: @fmul6 +; CHECK: fmul fast float %f1, %f1 +} + +; "(X*Y) * X => (X*X) * Y" is disabled if "X*Y" has multiple uses +define float @fmul7(float %f1, float %f2) { + %mul = fmul float %f1, %f2 + %mul1 = fmul fast float %mul, %f1 + %add = fadd float %mul1, %mul + ret float %add +; CHECK: @fmul7 +; CHECK: fmul fast float %mul, %f1 +} + +; ========================================================================= +; +; Testing-cases about negation +; +; ========================================================================= +define float @fneg1(float %f1, float %f2) { + %sub = fsub float -0.000000e+00, %f1 + %sub1 = fsub nsz float 0.000000e+00, %f2 + %mul = fmul float %sub, %sub1 + ret float %mul +; CHECK: @fneg1 +; CHECK: fmul float %f1, %f2 +} + +; ========================================================================= +; +; Testing-cases about div +; +; ========================================================================= + +; X/C1 / C2 => X * (1/(C2*C1)) +define float @fdiv1(float %x) { + %div = fdiv float %x, 0x3FF3333340000000 + %div1 = fdiv fast float %div, 0x4002666660000000 + ret float %div1 +; 0x3FF3333340000000 = 1.2f +; 0x4002666660000000 = 2.3f +; 0x3FD7303B60000000 = 0.36231884057971014492 +; CHECK: @fdiv1 +; CHECK: fmul fast float %x, 0x3FD7303B60000000 +} + +; X*C1 / C2 => X * (C1/C2) +define float @fdiv2(float %x) { + %mul = fmul float %x, 0x3FF3333340000000 + %div1 = fdiv fast float %mul, 0x4002666660000000 + ret float %div1 + +; 0x3FF3333340000000 = 1.2f +; 0x4002666660000000 = 2.3f +; 0x3FE0B21660000000 = 0.52173918485641479492 +; CHECK: @fdiv2 +; CHECK: fmul fast float %x, 0x3FE0B21660000000 +} + +; "X/C1 / C2 => X * (1/(C2*C1))" is disabled (for now) is C2/C1 is a denormal +; +define float @fdiv3(float %x) { + %div = fdiv float %x, 0x47EFFFFFE0000000 + %div1 = fdiv fast float %div, 0x4002666660000000 + ret float %div1 +; CHECK: @fdiv3 +; CHECK: fdiv float %x, 0x47EFFFFFE0000000 +} + +; "X*C1 / C2 => X * (C1/C2)" is disabled if C1/C2 is a denormal +define float @fdiv4(float %x) { + %mul = fmul float %x, 0x47EFFFFFE0000000 + %div = fdiv float %mul, 0x3FC99999A0000000 + ret float %div +; CHECK: @fdiv4 +; CHECK: fmul float %x, 0x47EFFFFFE0000000 +} + +; (X/Y)/Z = > X/(Y*Z) +define float @fdiv5(float %f1, float %f2, float %f3) { + %t1 = fdiv float %f1, %f2 + %t2 = fdiv fast float %t1, %f3 + ret float %t2 +; CHECK: @fdiv5 +; CHECK: fmul float %f2, %f3 +} +; Z/(X/Y) = > (Z*Y)/X +define float @fdiv6(float %f1, float %f2, float %f3) { + %t1 = fdiv float %f1, %f2 + %t2 = fdiv fast float %f3, %t1 + ret float %t2 +; CHECK: @fdiv6 +; CHECK: fmul float %f3, %f2 +} +; C1/(X*C2) => (C1/C2) / X +define float @fdiv7(float %x) { + %t1 = fmul float %x, 3.0e0 + %t2 = fdiv fast float 15.0e0, %t1 + ret float %t2 +; CHECK: @fdiv7 +; CHECK: fdiv fast float 5.000000e+00, %x +} + +; C1/(X/C2) => (C1*C2) / X +define float @fdiv8(float %x) { + %t1 = fdiv float %x, 3.0e0 + %t2 = fdiv fast float 15.0e0, %t1 + ret float %t2 +; CHECK: @fdiv8 +; CHECK: fdiv fast float 4.500000e+01, %x +} + +; C1/(C2/X) => (C1/C2) * X +define float @fdiv9(float %x) { + %t1 = fdiv float 3.0e0, %x + %t2 = fdiv fast float 15.0e0, %t1 + ret 
float %t2 +; CHECK: @fdiv9 +; CHECK: fmul fast float %x, 5.000000e+00 +} diff --git a/test/Transforms/InstCombine/fmul.ll b/test/Transforms/InstCombine/fmul.ll new file mode 100644 index 0000000..3671b4c --- /dev/null +++ b/test/Transforms/InstCombine/fmul.ll @@ -0,0 +1,72 @@ +; RUN: opt -S -instcombine < %s | FileCheck %s + +; (-0.0 - X) * C => X * -C +define float @test1(float %x) { + %sub = fsub float -0.000000e+00, %x + %mul = fmul float %sub, 2.0e+1 + ret float %mul + +; CHECK: @test1 +; CHECK: fmul float %x, -2.000000e+01 +} + +; (0.0 - X) * C => X * -C +define float @test2(float %x) { + %sub = fsub nsz float 0.000000e+00, %x + %mul = fmul float %sub, 2.0e+1 + ret float %mul + +; CHECK: @test2 +; CHECK: fmul float %x, -2.000000e+01 +} + +; (-0.0 - X) * (-0.0 - Y) => X * Y +define float @test3(float %x, float %y) { + %sub1 = fsub float -0.000000e+00, %x + %sub2 = fsub float -0.000000e+00, %y + %mul = fmul float %sub1, %sub2 + ret float %mul +; CHECK: @test3 +; CHECK: fmul float %x, %y +} + +; (0.0 - X) * (0.0 - Y) => X * Y +define float @test4(float %x, float %y) { + %sub1 = fsub nsz float 0.000000e+00, %x + %sub2 = fsub nsz float 0.000000e+00, %y + %mul = fmul float %sub1, %sub2 + ret float %mul +; CHECK: @test4 +; CHECK: fmul float %x, %y +} + +; (-0.0 - X) * Y => -0.0 - (X * Y) +define float @test5(float %x, float %y) { + %sub1 = fsub float -0.000000e+00, %x + %mul = fmul float %sub1, %y + ret float %mul +; CHECK: @test5 +; CHECK: %1 = fmul float %x, %y +; CHECK: %mul = fsub float -0.000000e+00, %1 +} + +; (0.0 - X) * Y => 0.0 - (X * Y) +define float @test6(float %x, float %y) { + %sub1 = fsub nsz float 0.000000e+00, %x + %mul = fmul float %sub1, %y + ret float %mul +; CHECK: @test6 +; CHECK: %1 = fmul float %x, %y +; CHECK: %mul = fsub float -0.000000e+00, %1 +} + +; "(-0.0 - X) * Y => -0.0 - (X * Y)" is disabled if expression "-0.0 - X" +; has multiple uses. 
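+; (The usual rationale: when "-0.0 - X" has another use, the fsub must be
+; kept anyway, so rewriting the fmul would add an instruction rather than
+; remove one.)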
+define float @test7(float %x, float %y) { + %sub1 = fsub float -0.000000e+00, %x + %mul = fmul float %sub1, %y + %mul2 = fmul float %mul, %sub1 + ret float %mul2 +; CHECK: @test7 +; CHECK: fsub float -0.000000e+00, %x +} diff --git a/test/Transforms/InstCombine/fpcast.ll b/test/Transforms/InstCombine/fpcast.ll index bc6aa0a..09f0532 100644 --- a/test/Transforms/InstCombine/fpcast.ll +++ b/test/Transforms/InstCombine/fpcast.ll @@ -13,3 +13,22 @@ define i8 @test2() { ; CHECK: ret i8 -1 } +; CHECK: test3 +define half @test3(float %a) { +; CHECK: fptrunc +; CHECK: llvm.fabs.f16 + %b = call float @llvm.fabs.f32(float %a) + %c = fptrunc float %b to half + ret half %c +} + +; CHECK: test4 +define half @test4(float %a) { +; CHECK: fptrunc +; CHECK: fsub + %b = fsub float -0.0, %a + %c = fptrunc float %b to half + ret half %c +} + +declare float @llvm.fabs.f32(float) nounwind readonly diff --git a/test/Transforms/InstCombine/getelementptr.ll b/test/Transforms/InstCombine/getelementptr.ll index 1c120ec..bb07736 100644 --- a/test/Transforms/InstCombine/getelementptr.ll +++ b/test/Transforms/InstCombine/getelementptr.ll @@ -424,7 +424,7 @@ define i32 @test35() nounwind { i8* getelementptr (%t1* bitcast (%t0* @s to %t1*), i32 0, i32 1, i32 0)) nounwind ret i32 0 ; CHECK: @test35 -; CHECK: call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([17 x i8]* @"\01LC8", i64 0, i64 0), i8* getelementptr inbounds (%t0* @s, i64 0, i32 1, i64 0)) nounwind +; CHECK: call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([17 x i8]* @"\01LC8", i64 0, i64 0), i8* getelementptr inbounds (%t0* @s, i64 0, i32 1, i64 0)) [[NUW:#[0-9]+]] } ; Instcombine should constant-fold the GEP so that indices that have @@ -492,3 +492,21 @@ define void @three_gep_f(%three_gep_t2* %x) { declare void @three_gep_g(i32*) declare void @three_gep_h(%three_gep_t2*) + +%struct.ham = type { i32, %struct.zot*, %struct.zot*, %struct.zot* } +%struct.zot = type { i64, i8 } + +define void @test39(%struct.ham* %arg, i8 %arg1) nounwind { + %tmp = getelementptr inbounds %struct.ham* %arg, i64 0, i32 2 + %tmp2 = load %struct.zot** %tmp, align 8 + %tmp3 = bitcast %struct.zot* %tmp2 to i8* + %tmp4 = getelementptr inbounds i8* %tmp3, i64 -8 + store i8 %arg1, i8* %tmp4, align 8 + ret void + +; CHECK: @test39 +; CHECK: getelementptr inbounds %struct.ham* %arg, i64 0, i32 2 +; CHECK: getelementptr inbounds i8* %tmp3, i64 -8 +} + +; CHECK: attributes [[NUW]] = { nounwind } diff --git a/test/Transforms/InstCombine/icmp.ll b/test/Transforms/InstCombine/icmp.ll index 8fb6144..331eb3f 100644 --- a/test/Transforms/InstCombine/icmp.ll +++ b/test/Transforms/InstCombine/icmp.ll @@ -706,3 +706,41 @@ define i1 @test69(i32 %c) nounwind uwtable { %3 = or i1 %1, %2 ret i1 %3 } + +; CHECK: @icmp_sext16trunc +; CHECK-NEXT: %1 = trunc i32 %x to i16 +; CHECK-NEXT: %cmp = icmp slt i16 %1, 36 +define i1 @icmp_sext16trunc(i32 %x) { + %trunc = trunc i32 %x to i16 + %sext = sext i16 %trunc to i32 + %cmp = icmp slt i32 %sext, 36 + ret i1 %cmp +} + +; CHECK: @icmp_sext8trunc +; CHECK-NEXT: %1 = trunc i32 %x to i8 +; CHECK-NEXT: %cmp = icmp slt i8 %1, 36 +define i1 @icmp_sext8trunc(i32 %x) { + %trunc = trunc i32 %x to i8 + %sext = sext i8 %trunc to i32 + %cmp = icmp slt i32 %sext, 36 + ret i1 %cmp +} + +; CHECK: @icmp_shl16 +; CHECK-NEXT: %1 = trunc i32 %x to i16 +; CHECK-NEXT: %cmp = icmp slt i16 %1, 36 +define i1 @icmp_shl16(i32 %x) { + %shl = shl i32 %x, 16 + %cmp = icmp slt i32 %shl, 2359296 + ret i1 %cmp +} + +; CHECK: @icmp_shl24 +; CHECK-NEXT: %1 = trunc i32 %x to i8 +; 
CHECK-NEXT: %cmp = icmp slt i8 %1, 36 +define i1 @icmp_shl24(i32 %x) { + %shl = shl i32 %x, 24 + %cmp = icmp slt i32 %shl, 603979776 + ret i1 %cmp +} diff --git a/test/Transforms/InstCombine/intrinsics.ll b/test/Transforms/InstCombine/intrinsics.ll index 93f0a95..f334b3b 100644 --- a/test/Transforms/InstCombine/intrinsics.ll +++ b/test/Transforms/InstCombine/intrinsics.ll @@ -220,3 +220,39 @@ define i32 @cttz_simplify1b(i32 %x) nounwind readnone ssp { ; CHECK: @cttz_simplify1b ; CHECK-NEXT: ret i32 0 } + +define i32 @ctlz_undef(i32 %Value) nounwind { + %ctlz = call i32 @llvm.ctlz.i32(i32 0, i1 true) + ret i32 %ctlz + +; CHECK: @ctlz_undef +; CHECK-NEXT: ret i32 undef +} + +define i32 @cttz_undef(i32 %Value) nounwind { + %cttz = call i32 @llvm.cttz.i32(i32 0, i1 true) + ret i32 %cttz + +; CHECK: @cttz_undef +; CHECK-NEXT: ret i32 undef +} + +define i32 @ctlz_select(i32 %Value) nounwind { + %tobool = icmp ne i32 %Value, 0 + %ctlz = call i32 @llvm.ctlz.i32(i32 %Value, i1 true) + %s = select i1 %tobool, i32 %ctlz, i32 32 + ret i32 %s + +; CHECK: @ctlz_select +; CHECK: select i1 %tobool, i32 %ctlz, i32 32 +} + +define i32 @cttz_select(i32 %Value) nounwind { + %tobool = icmp ne i32 %Value, 0 + %cttz = call i32 @llvm.cttz.i32(i32 %Value, i1 true) + %s = select i1 %tobool, i32 %cttz, i32 32 + ret i32 %s + +; CHECK: @cttz_select +; CHECK: select i1 %tobool, i32 %cttz, i32 32 +} diff --git a/test/Transforms/InstCombine/load3.ll b/test/Transforms/InstCombine/load3.ll index 35398e1..db74426 100644 --- a/test/Transforms/InstCombine/load3.ll +++ b/test/Transforms/InstCombine/load3.ll @@ -1,6 +1,6 @@ ; RUN: opt < %s -instcombine -S | FileCheck %s -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" -target triple = "x86_64-apple-darwin10.0.0" +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32-S128" +target triple = "i386-apple-macosx10.0.0" ; Instcombine should be able to do trivial CSE of loads. @@ -24,4 +24,23 @@ define float @test2() { ; CHECK: @test2 ; CHECK: ret float 0x3806965600000000 -}
\ No newline at end of file +} + +@rslts32 = global [36 x i32] zeroinitializer, align 4 + +@expect32 = internal constant [36 x i32][ i32 1, i32 2, i32 0, i32 100, i32 3, +i32 4, i32 0, i32 -7, i32 4, i32 4, i32 8, i32 8, i32 1, i32 3, i32 8, i32 3, +i32 4, i32 -2, i32 2, i32 8, i32 83, i32 77, i32 8, i32 17, i32 77, i32 88, i32 +22, i32 33, i32 44, i32 88, i32 77, i32 4, i32 4, i32 7, i32 -7, i32 -8] , +align 4 + +; PR14986 +define void @test3() nounwind { +; This is a weird way of computing zero. + %l = load i32* getelementptr ([36 x i32]* @expect32, i32 29826161, i32 28), align 4 + store i32 %l, i32* getelementptr ([36 x i32]* @rslts32, i32 29826161, i32 28), align 4 + ret void + +; CHECK: @test3 +; CHECK: store i32 1, i32* getelementptr inbounds ([36 x i32]* @rslts32, i32 0, i32 0) +} diff --git a/test/Transforms/InstCombine/logical-select.ll b/test/Transforms/InstCombine/logical-select.ll index bb59817..f8c0676 100644 --- a/test/Transforms/InstCombine/logical-select.ll +++ b/test/Transforms/InstCombine/logical-select.ll @@ -10,8 +10,8 @@ define i32 @foo(i32 %a, i32 %b, i32 %c, i32 %d) nounwind { %j = or i32 %g, %i ret i32 %j ; CHECK: %e = icmp slt i32 %a, %b -; CHECK: %j = select i1 %e, i32 %c, i32 %d -; CHECK: ret i32 %j +; CHECK-NEXT: [[result:%.*]] = select i1 %e, i32 %c, i32 %d +; CHECK-NEXT: ret i32 [[result]] } define i32 @bar(i32 %a, i32 %b, i32 %c, i32 %d) nounwind { %e = icmp slt i32 %a, %b @@ -22,8 +22,8 @@ define i32 @bar(i32 %a, i32 %b, i32 %c, i32 %d) nounwind { %j = or i32 %i, %g ret i32 %j ; CHECK: %e = icmp slt i32 %a, %b -; CHECK: %j = select i1 %e, i32 %c, i32 %d -; CHECK: ret i32 %j +; CHECK-NEXT: [[result:%.*]] = select i1 %e, i32 %c, i32 %d +; CHECK-NEXT: ret i32 [[result]] } define i32 @goo(i32 %a, i32 %b, i32 %c, i32 %d) nounwind { @@ -36,8 +36,8 @@ entry: %3 = or i32 %1, %2 ret i32 %3 ; CHECK: %0 = icmp slt i32 %a, %b -; CHECK: %1 = select i1 %0, i32 %c, i32 %d -; CHECK: ret i32 %1 +; CHECK-NEXT: [[result:%.*]] = select i1 %0, i32 %c, i32 %d +; CHECK-NEXT: ret i32 [[result]] } define i32 @poo(i32 %a, i32 %b, i32 %c, i32 %d) nounwind { entry: @@ -49,8 +49,8 @@ entry: %3 = or i32 %1, %2 ret i32 %3 ; CHECK: %0 = icmp slt i32 %a, %b -; CHECK: %1 = select i1 %0, i32 %c, i32 %d -; CHECK: ret i32 %1 +; CHECK-NEXT: [[result:%.*]] = select i1 %0, i32 %c, i32 %d +; CHECK-NEXT: ret i32 [[result]] } define i32 @par(i32 %a, i32 %b, i32 %c, i32 %d) nounwind { @@ -63,6 +63,6 @@ entry: %3 = or i32 %1, %2 ret i32 %3 ; CHECK: %0 = icmp slt i32 %a, %b -; CHECK: %1 = select i1 %0, i32 %c, i32 %d -; CHECK: ret i32 %1 +; CHECK-NEXT: [[result:%.*]] = select i1 %0, i32 %c, i32 %d +; CHECK-NEXT: ret i32 [[result]] } diff --git a/test/Transforms/InstCombine/mul.ll b/test/Transforms/InstCombine/mul.ll index bbc70fe..16213b8 100644 --- a/test/Transforms/InstCombine/mul.ll +++ b/test/Transforms/InstCombine/mul.ll @@ -138,10 +138,8 @@ define i32 @test16(i32 %b, i1 %c) { ; e = b & (a >> 31) %e = mul i32 %d, %b ; <i32> [#uses=1] ret i32 %e -; CHECK: [[TEST16:%.*]] = zext i1 %c to i32 -; CHECK-NEXT: %1 = sub i32 0, [[TEST16]] -; CHECK-NEXT: %e = and i32 %1, %b -; CHECK-NEXT: ret i32 %e +; CHECK: [[TEST16:%.*]] = select i1 %c, i32 %b, i32 0 +; CHECK-NEXT: ret i32 [[TEST16]] } ; X * Y (when Y is 0 or 1) --> x & (0-Y) diff --git a/test/Transforms/InstCombine/objsize.ll b/test/Transforms/InstCombine/objsize.ll index 0ead9d1..31a3cb4 100644 --- a/test/Transforms/InstCombine/objsize.ll +++ b/test/Transforms/InstCombine/objsize.ll @@ -256,131 +256,3 @@ xpto: return: ret i32 7 } - -declare noalias i8* 
@valloc(i32) nounwind - -; CHECK: @test14 -; CHECK: ret i32 6 -define i32 @test14(i32 %a) nounwind { - switch i32 %a, label %sw.default [ - i32 1, label %sw.bb - i32 2, label %sw.bb1 - ] - -sw.bb: - %call = tail call noalias i8* @malloc(i32 6) nounwind - br label %sw.epilog - -sw.bb1: - %call2 = tail call noalias i8* @calloc(i32 3, i32 2) nounwind - br label %sw.epilog - -sw.default: - %call3 = tail call noalias i8* @valloc(i32 6) nounwind - br label %sw.epilog - -sw.epilog: - %b.0 = phi i8* [ %call3, %sw.default ], [ %call2, %sw.bb1 ], [ %call, %sw.bb ] - %1 = tail call i32 @llvm.objectsize.i32(i8* %b.0, i1 false) - ret i32 %1 -} - -; CHECK: @test15 -; CHECK: llvm.objectsize -define i32 @test15(i32 %a) nounwind { - switch i32 %a, label %sw.default [ - i32 1, label %sw.bb - i32 2, label %sw.bb1 - ] - -sw.bb: - %call = tail call noalias i8* @malloc(i32 3) nounwind - br label %sw.epilog - -sw.bb1: - %call2 = tail call noalias i8* @calloc(i32 2, i32 1) nounwind - br label %sw.epilog - -sw.default: - %call3 = tail call noalias i8* @valloc(i32 3) nounwind - br label %sw.epilog - -sw.epilog: - %b.0 = phi i8* [ %call3, %sw.default ], [ %call2, %sw.bb1 ], [ %call, %sw.bb ] - %1 = tail call i32 @llvm.objectsize.i32(i8* %b.0, i1 false) - ret i32 %1 -} - -; CHECK: @test16 -; CHECK: llvm.objectsize -define i32 @test16(i8* %a, i32 %n) nounwind { - %b = alloca [5 x i8], align 1 - %c = alloca [5 x i8], align 1 - switch i32 %n, label %sw.default [ - i32 1, label %sw.bb - i32 2, label %sw.bb1 - ] - -sw.bb: - %bp = bitcast [5 x i8]* %b to i8* - br label %sw.epilog - -sw.bb1: - %cp = bitcast [5 x i8]* %c to i8* - br label %sw.epilog - -sw.default: - br label %sw.epilog - -sw.epilog: - %phi = phi i8* [ %a, %sw.default ], [ %cp, %sw.bb1 ], [ %bp, %sw.bb ] - %sz = call i32 @llvm.objectsize.i32(i8* %phi, i1 false) - ret i32 %sz -} - -; CHECK: @test17 -; CHECK: ret i32 5 -define i32 @test17(i32 %n) nounwind { - %b = alloca [5 x i8], align 1 - %c = alloca [5 x i8], align 1 - %bp = bitcast [5 x i8]* %b to i8* - switch i32 %n, label %sw.default [ - i32 1, label %sw.bb - i32 2, label %sw.bb1 - ] - -sw.bb: - br label %sw.epilog - -sw.bb1: - %cp = bitcast [5 x i8]* %c to i8* - br label %sw.epilog - -sw.default: - br label %sw.epilog - -sw.epilog: - %phi = phi i8* [ %bp, %sw.default ], [ %cp, %sw.bb1 ], [ %bp, %sw.bb ] - %sz = call i32 @llvm.objectsize.i32(i8* %phi, i1 false) - ret i32 %sz -} - -@globalalias = alias internal [60 x i8]* @a - -; CHECK: @test18 -; CHECK-NEXT: ret i32 60 -define i32 @test18() { - %bc = bitcast [60 x i8]* @globalalias to i8* - %1 = call i32 @llvm.objectsize.i32(i8* %bc, i1 false) - ret i32 %1 -} - -@globalalias2 = alias weak [60 x i8]* @a - -; CHECK: @test19 -; CHECK: llvm.objectsize -define i32 @test19() { - %bc = bitcast [60 x i8]* @globalalias2 to i8* - %1 = call i32 @llvm.objectsize.i32(i8* %bc, i1 false) - ret i32 %1 -} diff --git a/test/Transforms/InstCombine/or.ll b/test/Transforms/InstCombine/or.ll index c0bb28d..bde2a54 100644 --- a/test/Transforms/InstCombine/or.ll +++ b/test/Transforms/InstCombine/or.ll @@ -344,10 +344,9 @@ define <4 x i32> @test32(<4 x i1> %and.i1352, <4 x i32> %vecinit6.i176, <4 x i32 %and.i = and <4 x i32> %vecinit6.i191, %neg.i ; <<4 x i32>> [#uses=1] %or.i = or <4 x i32> %and.i, %and.i129 ; <<4 x i32>> [#uses=1] ret <4 x i32> %or.i -; Don't turn this into a vector select until codegen matures to handle them -; better. +; codegen is mature enough to handle vector selects. 
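; (The or/and sequence above is the classic masked-select idiom,
; (A & M) | (B & ~M) with M sign-extended from a vector of i1, which
; instcombine now folds straight to the select the CHECK line expects.)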
; CHECK: @test32 -; CHECK: or <4 x i32> %and.i, %and.i129 +; CHECK: select <4 x i1> %and.i1352, <4 x i32> %vecinit6.i176, <4 x i32> %vecinit6.i191 } define i1 @test33(i1 %X, i1 %Y) { diff --git a/test/Transforms/InstCombine/pow-1.ll b/test/Transforms/InstCombine/pow-1.ll index c8e5f38..8a311f0 100644 --- a/test/Transforms/InstCombine/pow-1.ll +++ b/test/Transforms/InstCombine/pow-1.ll @@ -30,7 +30,7 @@ define double @test_simplify2(double %x) { define float @test_simplify3(float %x) { ; CHECK: @test_simplify3 %retval = call float @powf(float 2.0, float %x) -; CHECK-NEXT: [[EXP2F:%[a-z0-9]+]] = call float @exp2f(float %x) nounwind readonly +; CHECK-NEXT: [[EXP2F:%[a-z0-9]+]] = call float @exp2f(float %x) [[NUW_RO:#[0-9]+]] ret float %retval ; CHECK-NEXT: ret float [[EXP2F]] } @@ -38,7 +38,7 @@ define float @test_simplify3(float %x) { define double @test_simplify4(double %x) { ; CHECK: @test_simplify4 %retval = call double @pow(double 2.0, double %x) -; CHECK-NEXT: [[EXP2:%[a-z0-9]+]] = call double @exp2(double %x) nounwind readonly +; CHECK-NEXT: [[EXP2:%[a-z0-9]+]] = call double @exp2(double %x) [[NUW_RO]] ret double %retval ; CHECK-NEXT: ret double [[EXP2]] } @@ -64,8 +64,8 @@ define double @test_simplify6(double %x) { define float @test_simplify7(float %x) { ; CHECK: @test_simplify7 %retval = call float @powf(float %x, float 0.5) -; CHECK-NEXT: [[SQRTF:%[a-z0-9]+]] = call float @sqrtf(float %x) nounwind readonly -; CHECK-NEXT: [[FABSF:%[a-z0-9]+]] = call float @fabsf(float [[SQRTF]]) nounwind readonly +; CHECK-NEXT: [[SQRTF:%[a-z0-9]+]] = call float @sqrtf(float %x) [[NUW_RO]] +; CHECK-NEXT: [[FABSF:%[a-z0-9]+]] = call float @fabsf(float [[SQRTF]]) [[NUW_RO]] ; CHECK-NEXT: [[FCMP:%[a-z0-9]+]] = fcmp oeq float %x, 0xFFF0000000000000 ; CHECK-NEXT: [[SELECT:%[a-z0-9]+]] = select i1 [[FCMP]], float 0x7FF0000000000000, float [[FABSF]] ret float %retval @@ -75,8 +75,8 @@ define float @test_simplify7(float %x) { define double @test_simplify8(double %x) { ; CHECK: @test_simplify8 %retval = call double @pow(double %x, double 0.5) -; CHECK-NEXT: [[SQRT:%[a-z0-9]+]] = call double @sqrt(double %x) nounwind readonly -; CHECK-NEXT: [[FABS:%[a-z0-9]+]] = call double @fabs(double [[SQRT]]) nounwind readonly +; CHECK-NEXT: [[SQRT:%[a-z0-9]+]] = call double @sqrt(double %x) [[NUW_RO]] +; CHECK-NEXT: [[FABS:%[a-z0-9]+]] = call double @fabs(double [[SQRT]]) [[NUW_RO]] ; CHECK-NEXT: [[FCMP:%[a-z0-9]+]] = fcmp oeq double %x, 0xFFF0000000000000 ; CHECK-NEXT: [[SELECT:%[a-z0-9]+]] = select i1 [[FCMP]], double 0x7FF0000000000000, double [[FABS]] ret double %retval @@ -150,3 +150,5 @@ define double @test_simplify16(double %x) { ret double %retval ; CHECK-NEXT: ret double [[RECIPROCAL]] } + +; CHECK: attributes [[NUW_RO]] = { nounwind readonly } diff --git a/test/Transforms/InstCombine/ptr-int-cast.ll b/test/Transforms/InstCombine/ptr-int-cast.ll index 9524d44..7a6ecff 100644 --- a/test/Transforms/InstCombine/ptr-int-cast.ll +++ b/test/Transforms/InstCombine/ptr-int-cast.ll @@ -27,3 +27,34 @@ define i64 @f0(i32 %a0) nounwind { ret i64 %t1 } +define <4 x i32> @test4(<4 x i8*> %arg) nounwind { +; CHECK: @test4 +; CHECK: ptrtoint <4 x i8*> %arg to <4 x i64> +; CHECK: trunc <4 x i64> %1 to <4 x i32> + %p1 = ptrtoint <4 x i8*> %arg to <4 x i32> + ret <4 x i32> %p1 +} + +define <4 x i128> @test5(<4 x i8*> %arg) nounwind { +; CHECK: @test5 +; CHECK: ptrtoint <4 x i8*> %arg to <4 x i64> +; CHECK: zext <4 x i64> %1 to <4 x i128> + %p1 = ptrtoint <4 x i8*> %arg to <4 x i128> + ret <4 x i128> %p1 +} + +define <4 x i8*> 
@test6(<4 x i32> %arg) nounwind { +; CHECK: @test6 +; CHECK: zext <4 x i32> %arg to <4 x i64> +; CHECK: inttoptr <4 x i64> %1 to <4 x i8*> + %p1 = inttoptr <4 x i32> %arg to <4 x i8*> + ret <4 x i8*> %p1 +} + +define <4 x i8*> @test7(<4 x i128> %arg) nounwind { +; CHECK: @test7 +; CHECK: trunc <4 x i128> %arg to <4 x i64> +; CHECK: inttoptr <4 x i64> %1 to <4 x i8*> + %p1 = inttoptr <4 x i128> %arg to <4 x i8*> + ret <4 x i8*> %p1 +} diff --git a/test/Transforms/InstCombine/sext.ll b/test/Transforms/InstCombine/sext.ll index f198797..968f37c 100644 --- a/test/Transforms/InstCombine/sext.ll +++ b/test/Transforms/InstCombine/sext.ll @@ -184,3 +184,12 @@ define i32 @test16(i16 %x) nounwind { ; CHECK-NEXT: %ext = sext i16 %sext to i32 ; CHECK-NEXT: ret i32 %ext } + +define i32 @test17(i1 %x) nounwind { + %c1 = sext i1 %x to i32 + %c2 = sub i32 0, %c1 + ret i32 %c2 +; CHECK: @test17 +; CHECK-NEXT: [[TEST17:%.*]] = zext i1 %x to i32 +; CHECK-NEXT: ret i32 [[TEST17]] +} diff --git a/test/Transforms/InstCombine/vec_extract_elt.ll b/test/Transforms/InstCombine/vec_extract_elt.ll index 63e4ee2..166066a 100644 --- a/test/Transforms/InstCombine/vec_extract_elt.ll +++ b/test/Transforms/InstCombine/vec_extract_elt.ll @@ -7,3 +7,13 @@ define i32 @test(float %f) { ret i32 %tmp19 } +define i64 @test2(i64 %in) { + %vec = insertelement <8 x i64> undef, i64 %in, i32 0 + %splat = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> zeroinitializer + %add = add <8 x i64> %splat, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7> + %scl1 = extractelement <8 x i64> %add, i32 0 + %scl2 = extractelement <8 x i64> %add, i32 0 + %r = add i64 %scl1, %scl2 + ret i64 %r +} + diff --git a/test/Transforms/InstCombine/vector-casts.ll b/test/Transforms/InstCombine/vector-casts.ll index 7bbf53c..2f2990b 100644 --- a/test/Transforms/InstCombine/vector-casts.ll +++ b/test/Transforms/InstCombine/vector-casts.ll @@ -64,7 +64,8 @@ entry: ; CHECK: @test5 ; CHECK: sext <4 x i1> %cmp to <4 x i32> -; CHECK: sext <4 x i1> %cmp4 to <4 x i32> +; The sext-and pair is canonicalized to a select. 
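+; (Per lane, sext <4 x i1> yields all-ones or all-zeros, so the and keeps
+; either the other operand or nothing, i.e. a select against
+; zeroinitializer.)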
+; CHECK: select <4 x i1> %cmp4, <4 x i32> %sext, <4 x i32> zeroinitializer } diff --git a/test/Transforms/InstCombine/zext-bool-add-sub.ll b/test/Transforms/InstCombine/zext-bool-add-sub.ll index 78bcedb..b531057 100644 --- a/test/Transforms/InstCombine/zext-bool-add-sub.ll +++ b/test/Transforms/InstCombine/zext-bool-add-sub.ll @@ -4,9 +4,9 @@ define i32 @a(i1 zeroext %x, i1 zeroext %y) { entry: ; CHECK: @a -; CHECK: [[TMP1:%.*]] = zext i1 %y to i32 +; CHECK: [[TMP1:%.*]] = sext i1 %y to i32 ; CHECK: [[TMP2:%.*]] = select i1 %x, i32 2, i32 1 -; CHECK-NEXT: sub i32 [[TMP2]], [[TMP1]] +; CHECK-NEXT: add i32 [[TMP2]], [[TMP1]] %conv = zext i1 %x to i32 %conv3 = zext i1 %y to i32 %conv3.neg = sub i32 0, %conv3 diff --git a/test/Transforms/InstSimplify/call-callconv.ll b/test/Transforms/InstSimplify/call-callconv.ll new file mode 100644 index 0000000..e475be7 --- /dev/null +++ b/test/Transforms/InstSimplify/call-callconv.ll @@ -0,0 +1,48 @@ +; RUN: opt < %s -instcombine -S | FileCheck %s +; Verify that the non-default calling conv doesn't prevent the libcall simplification + +@.str = private unnamed_addr constant [4 x i8] c"abc\00", align 1 + +define arm_aapcscc i32 @_abs(i32 %i) nounwind readnone { +; CHECK: _abs + %call = tail call arm_aapcscc i32 @abs(i32 %i) nounwind readnone + ret i32 %call +; CHECK: %[[ISPOS:.*]] = icmp sgt i32 %i, -1 +; CHECK: %[[NEG:.*]] = sub i32 0, %i +; CHECK: %[[RET:.*]] = select i1 %[[ISPOS]], i32 %i, i32 %[[NEG]] +; CHECK: ret i32 %[[RET]] +} + +declare arm_aapcscc i32 @abs(i32) nounwind readnone + +define arm_aapcscc i32 @_labs(i32 %i) nounwind readnone { +; CHECK: _labs + %call = tail call arm_aapcscc i32 @labs(i32 %i) nounwind readnone + ret i32 %call +; CHECK: %[[ISPOS:.*]] = icmp sgt i32 %i, -1 +; CHECK: %[[NEG:.*]] = sub i32 0, %i +; CHECK: %[[RET:.*]] = select i1 %[[ISPOS]], i32 %i, i32 %[[NEG]] +; CHECK: ret i32 %[[RET]] +} + +declare arm_aapcscc i32 @labs(i32) nounwind readnone + +define arm_aapcscc i32 @_strlen1() { +; CHECK: _strlen1 + %call = tail call arm_aapcscc i32 @strlen(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0)) + ret i32 %call +; CHECK: ret i32 3 +} + +declare arm_aapcscc i32 @strlen(i8*) + +define arm_aapcscc zeroext i1 @_strlen2(i8* %str) { +; CHECK: _strlen2 + %call = tail call arm_aapcscc i32 @strlen(i8* %str) + %cmp = icmp ne i32 %call, 0 + ret i1 %cmp + +; CHECK: %[[STRLENFIRST:.*]] = load i8* %str +; CHECK: %[[CMP:.*]] = icmp ne i8 %[[STRLENFIRST]], 0 +; CHECK: ret i1 %[[CMP]] +} diff --git a/test/Transforms/InstSimplify/call.ll b/test/Transforms/InstSimplify/call.ll index 1a8d0c2..cf2f847 100644 --- a/test/Transforms/InstSimplify/call.ll +++ b/test/Transforms/InstSimplify/call.ll @@ -50,3 +50,54 @@ define float @test_fabs_libcall() { ret float %x ; CHECK-NEXT: ret float 4.2{{0+}}e+01 } + + +declare float @llvm.fabs.f32(float) nounwind readnone +declare float @llvm.floor.f32(float) nounwind readnone +declare float @llvm.ceil.f32(float) nounwind readnone +declare float @llvm.trunc.f32(float) nounwind readnone +declare float @llvm.rint.f32(float) nounwind readnone +declare float @llvm.nearbyint.f32(float) nounwind readnone + +; Test idempotent intrinsics +define float @test_idempotence(float %a) { +; CHECK: @test_idempotence + +; CHECK: fabs +; CHECK-NOT: fabs + %a0 = call float @llvm.fabs.f32(float %a) + %a1 = call float @llvm.fabs.f32(float %a0) + +; CHECK: floor +; CHECK-NOT: floor + %b0 = call float @llvm.floor.f32(float %a) + %b1 = call float @llvm.floor.f32(float %b0) + +; CHECK: ceil +; CHECK-NOT: ceil + %c0 = call 
float @llvm.ceil.f32(float %a) + %c1 = call float @llvm.ceil.f32(float %c0) + +; CHECK: trunc +; CHECK-NOT: trunc + %d0 = call float @llvm.trunc.f32(float %a) + %d1 = call float @llvm.trunc.f32(float %d0) + +; CHECK: rint +; CHECK-NOT: rint + %e0 = call float @llvm.rint.f32(float %a) + %e1 = call float @llvm.rint.f32(float %e0) + +; CHECK: nearbyint +; CHECK-NOT: nearbyint + %f0 = call float @llvm.nearbyint.f32(float %a) + %f1 = call float @llvm.nearbyint.f32(float %f0) + + %r0 = fadd float %a1, %b1 + %r1 = fadd float %r0, %c1 + %r2 = fadd float %r1, %d1 + %r3 = fadd float %r2, %e1 + %r4 = fadd float %r3, %f1 + + ret float %r4 +} diff --git a/test/Transforms/InstSimplify/compare.ll b/test/Transforms/InstSimplify/compare.ll index 56627b9..0ecfb1f 100644 --- a/test/Transforms/InstSimplify/compare.ll +++ b/test/Transforms/InstSimplify/compare.ll @@ -647,3 +647,38 @@ unreachableblock: %Y = icmp eq i32* %X, null ret i1 %Y } + +; It's not valid to fold a comparison of an argument with an alloca, even though +; that's tempting. An argument can't *alias* an alloca, however the aliasing rule +; relies on restrictions against guessing an object's address and dereferencing. +; There are no restrictions against guessing an object's address and comparing. + +define i1 @alloca_argument_compare(i64* %arg) { + %alloc = alloca i64 + %cmp = icmp eq i64* %arg, %alloc + ret i1 %cmp + ; CHECK: alloca_argument_compare + ; CHECK: ret i1 %cmp +} + +; As above, but with the operands reversed. + +define i1 @alloca_argument_compare_swapped(i64* %arg) { + %alloc = alloca i64 + %cmp = icmp eq i64* %alloc, %arg + ret i1 %cmp + ; CHECK: alloca_argument_compare_swapped + ; CHECK: ret i1 %cmp +} + +; Don't assume that a noalias argument isn't equal to a global variable's +; address. This is an example where AliasAnalysis' NoAlias concept is +; different from actual pointer inequality. + +@y = external global i32 +define zeroext i1 @external_compare(i32* noalias %x) { + %cmp = icmp eq i32* %x, @y + ret i1 %cmp + ; CHECK: external_compare + ; CHECK: ret i1 %cmp +} diff --git a/test/Transforms/InstSimplify/past-the-end.ll b/test/Transforms/InstSimplify/past-the-end.ll new file mode 100644 index 0000000..075da4a --- /dev/null +++ b/test/Transforms/InstSimplify/past-the-end.ll @@ -0,0 +1,77 @@ +; RUN: opt < %s -instsimplify -S | FileCheck %s +target datalayout = "p:32:32" + +; Check some past-the-end subtleties. + +@opte_a = global i32 0 +@opte_b = global i32 0 + +; Comparing base addresses of two distinct globals. Never equal. + +define zeroext i1 @no_offsets() { + %t = icmp eq i32* @opte_a, @opte_b + ret i1 %t + ; CHECK: no_offsets( + ; CHECK: ret i1 false +} + +; Comparing past-the-end addresses of two distinct globals. Never equal. + +define zeroext i1 @both_past_the_end() { + %x = getelementptr i32* @opte_a, i32 1 + %y = getelementptr i32* @opte_b, i32 1 + %t = icmp eq i32* %x, %y + ret i1 %t + ; CHECK: both_past_the_end( + ; CHECK-NOT: ret i1 true + ; TODO: refine this +} + +; Comparing past-the-end addresses of one global to the base address +; of another. Can't fold this. + +define zeroext i1 @just_one_past_the_end() { + %x = getelementptr i32* @opte_a, i32 1 + %t = icmp eq i32* %x, @opte_b + ret i1 %t + ; CHECK: just_one_past_the_end( + ; CHECK: ret i1 icmp eq (i32* getelementptr inbounds (i32* @opte_a, i32 1), i32* @opte_b) +} + +; Comparing base addresses of two distinct allocas. Never equal. 
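+; (Both objects here are locals created by the function itself, so unlike
+; the argument cases above their distinctness is known and the compare
+; folds to false.)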
+ +define zeroext i1 @no_alloca_offsets() { + %m = alloca i32 + %n = alloca i32 + %t = icmp eq i32* %m, %n + ret i1 %t + ; CHECK: no_alloca_offsets( + ; CHECK: ret i1 false +} + +; Comparing past-the-end addresses of two distinct allocas. Never equal. + +define zeroext i1 @both_past_the_end_alloca() { + %m = alloca i32 + %n = alloca i32 + %x = getelementptr i32* %m, i32 1 + %y = getelementptr i32* %n, i32 1 + %t = icmp eq i32* %x, %y + ret i1 %t + ; CHECK: both_past_the_end_alloca( + ; CHECK-NOT: ret i1 true + ; TODO: refine this +} + +; Comparing past-the-end addresses of one alloca to the base address +; of another. Can't fold this. + +define zeroext i1 @just_one_past_the_end_alloca() { + %m = alloca i32 + %n = alloca i32 + %x = getelementptr i32* %m, i32 1 + %t = icmp eq i32* %x, %n + ret i1 %t + ; CHECK: just_one_past_the_end_alloca( + ; CHECK: ret i1 %t +} diff --git a/test/Transforms/InstSimplify/ptr_diff.ll b/test/Transforms/InstSimplify/ptr_diff.ll index 1eb1fd4..8b4aa79 100644 --- a/test/Transforms/InstSimplify/ptr_diff.ll +++ b/test/Transforms/InstSimplify/ptr_diff.ll @@ -46,3 +46,33 @@ define i64 @ptrdiff3(i8* %ptr) { %diff = sub i64 %last.int, %first.int ret i64 %diff } + +define <4 x i32> @ptrdiff4(<4 x i8*> %arg) nounwind { +; Handle simple cases of vectors of pointers. +; CHECK: @ptrdiff4 +; CHECK: ret <4 x i32> zeroinitializer + %p1 = ptrtoint <4 x i8*> %arg to <4 x i32> + %bc = bitcast <4 x i8*> %arg to <4 x i32*> + %p2 = ptrtoint <4 x i32*> %bc to <4 x i32> + %sub = sub <4 x i32> %p1, %p2 + ret <4 x i32> %sub +} + +%struct.ham = type { i32, [2 x [2 x i32]] } + +@global = internal global %struct.ham zeroinitializer, align 4 + +define i32 @ptrdiff5() nounwind { +bb: + %tmp = getelementptr inbounds %struct.ham* @global, i32 0, i32 1 + %tmp1 = getelementptr inbounds [2 x [2 x i32]]* %tmp, i32 0, i32 0 + %tmp2 = bitcast [2 x i32]* %tmp1 to i32* + %tmp3 = ptrtoint i32* %tmp2 to i32 + %tmp4 = getelementptr inbounds %struct.ham* @global, i32 0, i32 1 + %tmp5 = getelementptr inbounds [2 x [2 x i32]]* %tmp4, i32 0, i32 0 + %tmp6 = ptrtoint [2 x i32]* %tmp5 to i32 + %tmp7 = sub i32 %tmp3, %tmp6 + ret i32 %tmp7 +; CHECK: @ptrdiff5 +; CHECK: ret i32 0 +} diff --git a/test/Transforms/JumpThreading/basic.ll b/test/Transforms/JumpThreading/basic.ll index 93fa29b..fe3dc77 100644 --- a/test/Transforms/JumpThreading/basic.ll +++ b/test/Transforms/JumpThreading/basic.ll @@ -497,8 +497,8 @@ l2: br label %l3 l3: -; CHECK: call void @g() noduplicate -; CHECK-NOT: call void @g() noduplicate +; CHECK: call void @g() [[NOD:#[0-9]+]] +; CHECK-NOT: call void @g() [[NOD]] call void @g() noduplicate %y = icmp ult i32 %p, 5 br i1 %y, label %l4, label %l5 @@ -512,3 +512,5 @@ l5: ret void ; CHECK: } } + +; CHECK: attributes [[NOD]] = { noduplicate } diff --git a/test/Transforms/LICM/2003-12-11-SinkingToPHI.ll b/test/Transforms/LICM/2003-12-11-SinkingToPHI.ll index fe8d445..2bf2604 100644 --- a/test/Transforms/LICM/2003-12-11-SinkingToPHI.ll +++ b/test/Transforms/LICM/2003-12-11-SinkingToPHI.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -licm | lli %defaultjit +; RUN: opt < %s -licm | lli -force-interpreter define i32 @main() { entry: diff --git a/test/Transforms/LICM/hoisting.ll b/test/Transforms/LICM/hoisting.ll index 98f9334..1ca377e 100644 --- a/test/Transforms/LICM/hoisting.ll +++ b/test/Transforms/LICM/hoisting.ll @@ -90,3 +90,29 @@ for.end: ; preds = %for.body declare void @foo_may_call_exit(i32) +; PR14854 +; CHECK: @test5 +; CHECK: extractvalue +; CHECK: br label %tailrecurse +; CHECK: tailrecurse: +; 
CHECK: ifend: +; CHECK: insertvalue +define { i32*, i32 } @test5(i32 %i, { i32*, i32 } %e) { +entry: + br label %tailrecurse + +tailrecurse: ; preds = %then, %entry + %i.tr = phi i32 [ %i, %entry ], [ %cmp2, %then ] + %out = extractvalue { i32*, i32 } %e, 1 + %d = insertvalue { i32*, i32 } %e, i32* null, 0 + %cmp1 = icmp sgt i32 %out, %i.tr + br i1 %cmp1, label %then, label %ifend + +then: ; preds = %tailrecurse + call void @foo() + %cmp2 = add i32 %i.tr, 1 + br label %tailrecurse + +ifend: ; preds = %tailrecurse + ret { i32*, i32 } %d +} diff --git a/test/Transforms/LoopDeletion/simplify-then-delete.ll b/test/Transforms/LoopDeletion/simplify-then-delete.ll index 5a21672..4278ef1 100644 --- a/test/Transforms/LoopDeletion/simplify-then-delete.ll +++ b/test/Transforms/LoopDeletion/simplify-then-delete.ll @@ -4,7 +4,7 @@ ; Indvars and loop deletion should be able to eliminate all looping ; in this testcase. -; CHECK: define i32 @pmat(i32 %m, i32 %n, double* %y) nounwind { +; CHECK: define i32 @pmat(i32 %m, i32 %n, double* %y) #0 { ; CHECK-NEXT: entry: ; CHECK-NEXT: ret i32 0 ; CHECK-NEXT: } @@ -63,3 +63,5 @@ w.e: w.e12: ret i32 0 } + +; CHECK: attributes #0 = { nounwind } diff --git a/test/Transforms/LoopIdiom/X86/popcnt.ll b/test/Transforms/LoopIdiom/X86/popcnt.ll index 2f458fb..25df93d 100644 --- a/test/Transforms/LoopIdiom/X86/popcnt.ll +++ b/test/Transforms/LoopIdiom/X86/popcnt.ll @@ -118,3 +118,23 @@ while.end: ; preds = %while.body, %entry %c.0.lcssa = phi i32 [ 0, %entry ], [ %inc, %while.body ] ret i32 %c.0.lcssa } + +define i32 @PopCntCrash3(i64 %a, i32 %x) { +entry: + %tobool3 = icmp eq i64 %a, 0 + %cmp = icmp eq i32 %x, 0 + br i1 %tobool3, label %while.end, label %while.body + +while.body: ; preds = %entry, %while.body + %c.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ] + %a.addr.04 = phi i64 [ %and, %while.body ], [ %a, %entry ] + %inc = add nsw i32 %c.05, 1 + %sub = add i64 %a.addr.04, -1 + %and = and i64 %sub, %a.addr.04 + %tobool = icmp eq i64 %and, 0 + br i1 %cmp, label %while.end, label %while.body + +while.end: ; preds = %while.body, %entry + %c.0.lcssa = phi i32 [ 0, %entry ], [ %inc, %while.body ] + ret i32 %c.0.lcssa +} diff --git a/test/Transforms/LoopStrengthReduce/2012-07-18-LimitReassociate.ll b/test/Transforms/LoopStrengthReduce/2012-07-18-LimitReassociate.ll index 53da462..9524be3 100644 --- a/test/Transforms/LoopStrengthReduce/2012-07-18-LimitReassociate.ll +++ b/test/Transforms/LoopStrengthReduce/2012-07-18-LimitReassociate.ll @@ -5,7 +5,7 @@ ; PR13361: LSR + SCEV "hangs" on reasonably sized test with sequence of loops ; ; Without limits on CollectSubexpr, we have thousands of formulae for -; the use that crosses loops. With limits we have six. +; the use that crosses loops. With limits we have five. 
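; (Each "reg({...})" CHECK line below matches one surviving formula; the
; CHECK-NOT then requires that no further register formulae appear before
; the "Filtering for use" line.)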
; CHECK: LSR on loop %bb221: ; CHECK: After generating reuse formulae: ; CHECK: LSR is examining the following uses: @@ -15,11 +15,8 @@ ; CHECK: {{.*reg\(\{.*\{.*\{.*\{.*\{.*\{.*\{.*\{.*\{}} ; CHECK: {{.*reg\(\{.*\{.*\{.*\{.*\{.*\{.*\{.*\{.*\{}} ; CHECK: {{.*reg\(\{.*\{.*\{.*\{.*\{.*\{.*\{.*\{.*\{}} -; CHECK: {{.*reg\(\{.*\{.*\{.*\{.*\{.*\{.*\{.*\{.*\{}} ; CHECK-NOT:reg ; CHECK: Filtering for use -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-freebsd9" %struct.snork = type { %struct.fuga, i32, i32, i32, i32, i32, i32 } %struct.fuga = type { %struct.gork, i64 } diff --git a/test/Transforms/LoopStrengthReduce/2013-01-14-ReuseCast.ll b/test/Transforms/LoopStrengthReduce/2013-01-14-ReuseCast.ll new file mode 100644 index 0000000..8fbddf8 --- /dev/null +++ b/test/Transforms/LoopStrengthReduce/2013-01-14-ReuseCast.ll @@ -0,0 +1,84 @@ +; RUN: opt -loop-reduce -S < %s | FileCheck %s +; +; LTO of clang, which mistakenly uses no TargetLoweringInfo, causes a +; miscompile. ReuseOrCreateCast replace ptrtoint operand with undef. +; Reproducing the miscompile requires no triple, hence no "TTI". +; rdar://13007381 + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" + +; Verify that nothing uses the "dead" ptrtoint from "undef". +; CHECK: @VerifyDiagnosticConsumerTest +; CHECK: bb: +; CHECK: %0 = ptrtoint i8* undef to i64 +; CHECK-NOT: %0 +; CHECK: .lr.ph +; CHECK-NOT: %0 +; CHECK: sub i64 %7, %tmp6 +; CHECK-NOT: %0 +; CHECK: ret void +define void @VerifyDiagnosticConsumerTest() unnamed_addr nounwind uwtable align 2 { +bb: + %tmp3 = call i8* @getCharData() nounwind + %tmp4 = call i8* @getCharData() nounwind + %tmp5 = ptrtoint i8* %tmp4 to i64 + %tmp6 = ptrtoint i8* %tmp3 to i64 + %tmp7 = sub i64 %tmp5, %tmp6 + br i1 undef, label %bb87, label %.preheader + +.preheader: ; preds = %bb10, %bb + br i1 undef, label %_ZNK4llvm9StringRef4findEcm.exit42.thread, label %bb10 + +bb10: ; preds = %.preheader + br i1 undef, label %_ZNK4llvm9StringRef4findEcm.exit42, label %.preheader + +_ZNK4llvm9StringRef4findEcm.exit42: ; preds = %bb10 + br i1 undef, label %_ZNK4llvm9StringRef4findEcm.exit42.thread, label %.lr.ph + +_ZNK4llvm9StringRef4findEcm.exit42.thread: ; preds = %_ZNK4llvm9StringRef4findEcm.exit42, %.preheader + unreachable + +.lr.ph: ; preds = %_ZNK4llvm9StringRef4findEcm.exit42 + br label %bb36 + +_ZNK4llvm9StringRef4findEcm.exit.loopexit: ; preds = %bb63 + %tmp21 = icmp eq i64 %i.0.i, -1 + br i1 %tmp21, label %_ZNK4llvm9StringRef4findEcm.exit._crit_edge, label %bb36 + +_ZNK4llvm9StringRef4findEcm.exit._crit_edge: ; preds = %bb61, %_ZNK4llvm9StringRef4findEcm.exit.loopexit + unreachable + +bb36: ; preds = %_ZNK4llvm9StringRef4findEcm.exit.loopexit, %.lr.ph + %loc.063 = phi i64 [ undef, %.lr.ph ], [ %i.0.i, %_ZNK4llvm9StringRef4findEcm.exit.loopexit ] + switch i8 undef, label %bb57 [ + i8 10, label %bb48 + i8 13, label %bb48 + ] + +bb48: ; preds = %bb36, %bb36 + br label %bb58 + +bb57: ; preds = %bb36 + br label %bb58 + +bb58: ; preds = %bb57, %bb48 + %tmp59 = icmp ugt i64 %tmp7, undef + %tmp60 = select i1 %tmp59, i64 undef, i64 %tmp7 + br label %bb61 + +bb61: ; preds = %bb63, %bb58 + %i.0.i = phi i64 [ %tmp60, %bb58 ], [ %tmp67, %bb63 ] + %tmp62 = icmp eq i64 %i.0.i, %tmp7 + br i1 %tmp62, label %_ZNK4llvm9StringRef4findEcm.exit._crit_edge, 
label %bb63 + +bb63: ; preds = %bb61 + %tmp64 = getelementptr inbounds i8* %tmp3, i64 %i.0.i + %tmp65 = load i8* %tmp64, align 1 + %tmp67 = add i64 %i.0.i, 1 + br i1 undef, label %_ZNK4llvm9StringRef4findEcm.exit.loopexit, label %bb61 + +bb87: ; preds = %bb + ret void +} + +declare i8* @getCharData() diff --git a/test/Transforms/LoopStrengthReduce/ARM/ivchain-ARM.ll b/test/Transforms/LoopStrengthReduce/ARM/ivchain-ARM.ll index 9189d79..ee3cc4d 100644 --- a/test/Transforms/LoopStrengthReduce/ARM/ivchain-ARM.ll +++ b/test/Transforms/LoopStrengthReduce/ARM/ivchain-ARM.ll @@ -205,18 +205,18 @@ for.end: ; preds = %for.body ; post-increment addressing, no add's or add.w's beyond the three ; mentioned. Most importantly, there should be no spills or reloads! ; -; CHECK: testNeon: -; CHECK: %.lr.ph -; CHECK-NOT: lsl.w -; CHECK-NOT: {{ldr|str|adds|add r}} -; CHECK: add.w r -; CHECK-NOT: {{ldr|str|adds|add r}} -; CHECK: add.w r -; CHECK-NOT: {{ldr|str|adds|add r}} -; CHECK: add.w r -; CHECK-NOT: {{ldr|str|adds|add r}} -; CHECK-NOT: add.w r -; CHECK: bne +; A9: testNeon: +; A9: %.lr.ph +; A9-NOT: lsl.w +; A9-NOT: {{ldr|str|adds|add r}} +; A9: add.w r +; A9-NOT: {{ldr|str|adds|add r}} +; A9: add.w r +; A9-NOT: {{ldr|str|adds|add r}} +; A9: add.w r +; A9-NOT: {{ldr|str|adds|add r}} +; A9-NOT: add.w r +; A9: bne define hidden void @testNeon(i8* %ref_data, i32 %ref_stride, i32 %limit, <16 x i8>* nocapture %data) nounwind optsize { %1 = icmp sgt i32 %limit, 0 br i1 %1, label %.lr.ph, label %45 @@ -290,3 +290,80 @@ define hidden void @testNeon(i8* %ref_data, i32 %ref_stride, i32 %limit, <16 x i } declare <1 x i64> @llvm.arm.neon.vld1.v1i64(i8*, i32) nounwind readonly + +; Handle chains in which the same offset is used for both loads and +; stores to the same array. +; rdar://11410078. 
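+; (The A9 lines below expect every vld1/vst1 in the loop to share one
+; post-incremented base register [[BASE]] stepped by a common stride
+; [[INC]], rather than LSR materializing a separate address per access.)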
+; +; A9: @testReuse +; A9: %for.body +; A9: vld1.8 {d{{[0-9]+}}}, [[BASE:[r[0-9]+]]], [[INC:r[0-9]]] +; A9: vld1.8 {d{{[0-9]+}}}, [[BASE]], [[INC]] +; A9: vld1.8 {d{{[0-9]+}}}, [[BASE]], [[INC]] +; A9: vld1.8 {d{{[0-9]+}}}, [[BASE]], [[INC]] +; A9: vld1.8 {d{{[0-9]+}}}, [[BASE]], [[INC]] +; A9: vld1.8 {d{{[0-9]+}}}, [[BASE]], [[INC]] +; A9: vld1.8 {d{{[0-9]+}}}, [[BASE]], [[INC]] +; A9: vld1.8 {d{{[0-9]+}}}, [[BASE]], {{r[0-9]}} +; A9: vst1.8 {d{{[0-9]+}}}, [[BASE]], [[INC]] +; A9: vst1.8 {d{{[0-9]+}}}, [[BASE]], [[INC]] +; A9: vst1.8 {d{{[0-9]+}}}, [[BASE]], [[INC]] +; A9: vst1.8 {d{{[0-9]+}}}, [[BASE]], [[INC]] +; A9: vst1.8 {d{{[0-9]+}}}, [[BASE]], [[INC]] +; A9: vst1.8 {d{{[0-9]+}}}, [[BASE]] +; A9: bne +define void @testReuse(i8* %src, i32 %stride) nounwind ssp { +entry: + %mul = shl nsw i32 %stride, 2 + %idx.neg = sub i32 0, %mul + %mul1 = mul nsw i32 %stride, 3 + %idx.neg2 = sub i32 0, %mul1 + %mul5 = shl nsw i32 %stride, 1 + %idx.neg6 = sub i32 0, %mul5 + %idx.neg10 = sub i32 0, %stride + br label %for.body + +for.body: ; preds = %for.body, %entry + %i.0110 = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %src.addr = phi i8* [ %src, %entry ], [ %add.ptr45, %for.body ] + %add.ptr = getelementptr inbounds i8* %src.addr, i32 %idx.neg + %vld1 = tail call <8 x i8> @llvm.arm.neon.vld1.v8i8(i8* %add.ptr, i32 1) + %add.ptr3 = getelementptr inbounds i8* %src.addr, i32 %idx.neg2 + %vld2 = tail call <8 x i8> @llvm.arm.neon.vld1.v8i8(i8* %add.ptr3, i32 1) + %add.ptr7 = getelementptr inbounds i8* %src.addr, i32 %idx.neg6 + %vld3 = tail call <8 x i8> @llvm.arm.neon.vld1.v8i8(i8* %add.ptr7, i32 1) + %add.ptr11 = getelementptr inbounds i8* %src.addr, i32 %idx.neg10 + %vld4 = tail call <8 x i8> @llvm.arm.neon.vld1.v8i8(i8* %add.ptr11, i32 1) + %vld5 = tail call <8 x i8> @llvm.arm.neon.vld1.v8i8(i8* %src.addr, i32 1) + %add.ptr17 = getelementptr inbounds i8* %src.addr, i32 %stride + %vld6 = tail call <8 x i8> @llvm.arm.neon.vld1.v8i8(i8* %add.ptr17, i32 1) + %add.ptr20 = getelementptr inbounds i8* %src.addr, i32 %mul5 + %vld7 = tail call <8 x i8> @llvm.arm.neon.vld1.v8i8(i8* %add.ptr20, i32 1) + %add.ptr23 = getelementptr inbounds i8* %src.addr, i32 %mul1 + %vld8 = tail call <8 x i8> @llvm.arm.neon.vld1.v8i8(i8* %add.ptr23, i32 1) + %vadd1 = tail call <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8> %vld1, <8 x i8> %vld2) nounwind + %vadd2 = tail call <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8> %vld2, <8 x i8> %vld3) nounwind + %vadd3 = tail call <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8> %vld3, <8 x i8> %vld4) nounwind + %vadd4 = tail call <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8> %vld4, <8 x i8> %vld5) nounwind + %vadd5 = tail call <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8> %vld5, <8 x i8> %vld6) nounwind + %vadd6 = tail call <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8> %vld6, <8 x i8> %vld7) nounwind + tail call void @llvm.arm.neon.vst1.v8i8(i8* %add.ptr3, <8 x i8> %vadd1, i32 1) + tail call void @llvm.arm.neon.vst1.v8i8(i8* %add.ptr7, <8 x i8> %vadd2, i32 1) + tail call void @llvm.arm.neon.vst1.v8i8(i8* %add.ptr11, <8 x i8> %vadd3, i32 1) + tail call void @llvm.arm.neon.vst1.v8i8(i8* %src.addr, <8 x i8> %vadd4, i32 1) + tail call void @llvm.arm.neon.vst1.v8i8(i8* %add.ptr17, <8 x i8> %vadd5, i32 1) + tail call void @llvm.arm.neon.vst1.v8i8(i8* %add.ptr20, <8 x i8> %vadd6, i32 1) + %inc = add nsw i32 %i.0110, 1 + %add.ptr45 = getelementptr inbounds i8* %src.addr, i32 8 + %exitcond = icmp eq i32 %inc, 4 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret 
void +} + +declare <8 x i8> @llvm.arm.neon.vld1.v8i8(i8*, i32) nounwind readonly + +declare void @llvm.arm.neon.vst1.v8i8(i8*, <8 x i8>, i32) nounwind + +declare <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone diff --git a/test/Transforms/LoopStrengthReduce/X86/2011-12-04-loserreg.ll b/test/Transforms/LoopStrengthReduce/X86/2011-12-04-loserreg.ll index 5108650..eedfc20 100644 --- a/test/Transforms/LoopStrengthReduce/X86/2011-12-04-loserreg.ll +++ b/test/Transforms/LoopStrengthReduce/X86/2011-12-04-loserreg.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s | FileCheck %s +; RUN: opt < %s -loop-reduce -S | FileCheck %s ; ; Test LSR's ability to prune formulae that refer to nonexistant ; AddRecs in other loops. @@ -15,13 +15,10 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 target triple = "x86_64-apple-darwin" ; CHECK: @test -; CHECK: # %for.body{{$}} -; dummyiv copy should be removed -; CHECK-NOT: movq -; CHECK: # %for.cond19.preheader -; dummycnt should be removed -; CHECK-NOT: incq -; CHECK: # %for.body23{{$}} +; CHECK: for.body: +; CHECK: %lsr.iv +; CHECK-NOT: %dummyout +; CHECK: ret define i64 @test(i64 %count, float* nocapture %srcrow, i32* nocapture %destrow) nounwind uwtable ssp { entry: %cmp34 = icmp eq i64 %count, 0 diff --git a/test/Transforms/LoopStrengthReduce/post-inc-icmpzero.ll b/test/Transforms/LoopStrengthReduce/post-inc-icmpzero.ll index 9e02d92..45aeb4e 100644 --- a/test/Transforms/LoopStrengthReduce/post-inc-icmpzero.ll +++ b/test/Transforms/LoopStrengthReduce/post-inc-icmpzero.ll @@ -4,18 +4,17 @@ ; LSR should properly handle the post-inc offset when folding the ; non-IV operand of an icmp into the IV. -; CHECK: %3 = sub i64 %sub.ptr.lhs.cast, %sub.ptr.rhs.cast -; CHECK: %4 = lshr i64 %3, 1 -; CHECK: %5 = mul i64 %4, 2 +; CHECK: [[r1:%[a-z0-9]+]] = sub i64 %sub.ptr.lhs.cast, %sub.ptr.rhs.cast +; CHECK: [[r2:%[a-z0-9]+]] = lshr i64 [[r1]], 1 +; CHECK: [[r3:%[a-z0-9]+]] = mul i64 [[r2]], 2 ; CHECK: br label %for.body ; CHECK: for.body: -; CHECK: %lsr.iv2 = phi i64 [ %lsr.iv.next, %for.body ], [ %5, %for.body.lr.ph ] +; CHECK: %lsr.iv2 = phi i64 [ %lsr.iv.next, %for.body ], [ [[r3]], %for.body.lr.ph ] ; CHECK: %lsr.iv.next = add i64 %lsr.iv2, -2 ; CHECK: %lsr.iv.next3 = inttoptr i64 %lsr.iv.next to i16* ; CHECK: %cmp27 = icmp eq i16* %lsr.iv.next3, null target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" -target triple = "x86_64-unknown-linux-gnu" %struct.Vector2 = type { i16*, [64 x i16], i32 } diff --git a/test/Transforms/LoopUnswitch/2011-11-18-SimpleSwitch.ll b/test/Transforms/LoopUnswitch/2011-11-18-SimpleSwitch.ll index 59a8236..bde52da 100644 --- a/test/Transforms/LoopUnswitch/2011-11-18-SimpleSwitch.ll +++ b/test/Transforms/LoopUnswitch/2011-11-18-SimpleSwitch.ll @@ -19,7 +19,7 @@ ; CHECK-NEXT: i32 1, label %inc.us ; CHECK: inc.us: ; preds = %loop_begin.us -; CHECK-NEXT: call void @incf() noreturn nounwind +; CHECK-NEXT: call void @incf() [[NOR_NUW:#[0-9]+]] ; CHECK-NEXT: br label %loop_begin.backedge.us ; CHECK: .split: ; preds = %..split_crit_edge @@ -40,7 +40,7 @@ ; CHECK-NEXT: ] ; CHECK: dec.us3: ; preds = %loop_begin.us1 -; CHECK-NEXT: call void @decf() noreturn nounwind +; CHECK-NEXT: call void @decf() [[NOR_NUW]] ; CHECK-NEXT: br label %loop_begin.backedge.us5 ; CHECK: .split.split: ; preds = %.split..split.split_crit_edge @@ -89,3 +89,6 @@ loop_exit: declare void @incf() noreturn declare void @decf() 
noreturn + +; CHECK: attributes #0 = { noreturn } +; CHECK: attributes [[NOR_NUW]] = { noreturn nounwind } diff --git a/test/Transforms/LoopUnswitch/2011-11-18-TwoSwitches-Threshold.ll b/test/Transforms/LoopUnswitch/2011-11-18-TwoSwitches-Threshold.ll index 67982fe..c3bf596 100644 --- a/test/Transforms/LoopUnswitch/2011-11-18-TwoSwitches-Threshold.ll +++ b/test/Transforms/LoopUnswitch/2011-11-18-TwoSwitches-Threshold.ll @@ -25,7 +25,7 @@ ; CHECK-NEXT: ] ; CHECK: inc.us: ; preds = %second_switch.us, %loop_begin.us -; CHECK-NEXT: call void @incf() noreturn nounwind +; CHECK-NEXT: call void @incf() [[NOR_NUW:#[0-9]+]] ; CHECK-NEXT: br label %loop_begin.backedge.us ; CHECK: .split: ; preds = %..split_crit_edge @@ -45,7 +45,7 @@ ; CHECK-NEXT: ] ; CHECK: inc: ; preds = %loop_begin.inc_crit_edge, %second_switch -; CHECK-NEXT: call void @incf() noreturn nounwind +; CHECK-NEXT: call void @incf() [[NOR_NUW]] ; CHECK-NEXT: br label %loop_begin.backedge define i32 @test(i32* %var) { @@ -82,3 +82,6 @@ loop_exit: declare void @incf() noreturn declare void @decf() noreturn + +; CHECK: attributes #0 = { noreturn } +; CHECK: attributes [[NOR_NUW]] = { noreturn nounwind } diff --git a/test/Transforms/LoopUnswitch/2011-11-18-TwoSwitches.ll b/test/Transforms/LoopUnswitch/2011-11-18-TwoSwitches.ll index 36b7eff..9530333 100644 --- a/test/Transforms/LoopUnswitch/2011-11-18-TwoSwitches.ll +++ b/test/Transforms/LoopUnswitch/2011-11-18-TwoSwitches.ll @@ -30,7 +30,7 @@ ; CHECK-NEXT: i32 1, label %inc.us.us ; CHECK: inc.us.us: ; preds = %second_switch.us.us, %loop_begin.us.us -; CHECK-NEXT: call void @incf() noreturn nounwind +; CHECK-NEXT: call void @incf() [[NOR_NUW:#[0-9]+]] ; CHECK-NEXT: br label %loop_begin.backedge.us.us ; CHECK: .split.us.split: ; preds = %.split.us..split.us.split_crit_edge @@ -50,7 +50,7 @@ ; CHECK-NEXT: br i1 true, label %us-unreachable8, label %inc.us ; CHECK: inc.us: ; preds = %second_switch.us.inc.us_crit_edge, %loop_begin.us -; CHECK-NEXT: call void @incf() noreturn nounwind +; CHECK-NEXT: call void @incf() [[NOR_NUW]] ; CHECK-NEXT: br label %loop_begin.backedge.us ; CHECK: .split: ; preds = %..split_crit_edge @@ -75,7 +75,7 @@ ; CHECK-NEXT: ] ; CHECK: inc.us4: ; preds = %loop_begin.inc_crit_edge.us, %second_switch.us3 -; CHECK-NEXT: call void @incf() noreturn nounwind +; CHECK-NEXT: call void @incf() [[NOR_NUW]] ; CHECK-NEXT: br label %loop_begin.backedge.us6 ; CHECK: loop_begin.inc_crit_edge.us: ; preds = %loop_begin.us1 @@ -136,3 +136,6 @@ loop_exit: declare void @incf() noreturn declare void @decf() noreturn + +; CHECK: attributes #0 = { noreturn } +; CHECK: attributes [[NOR_NUW]] = { noreturn nounwind } diff --git a/test/Transforms/LoopUnswitch/infinite-loop.ll b/test/Transforms/LoopUnswitch/infinite-loop.ll index 73391ca..f3fba64 100644 --- a/test/Transforms/LoopUnswitch/infinite-loop.ll +++ b/test/Transforms/LoopUnswitch/infinite-loop.ll @@ -21,11 +21,11 @@ ; CHECK-NEXT: br label %cond.end.us ; CHECK: abort0.split: -; CHECK-NEXT: call void @end0() noreturn nounwind +; CHECK-NEXT: call void @end0() [[NOR_NUW:#[0-9]+]] ; CHECK-NEXT: unreachable ; CHECK: abort1: -; CHECK-NEXT: call void @end1() noreturn nounwind +; CHECK-NEXT: call void @end1() [[NOR_NUW]] ; CHECK-NEXT: unreachable ; CHECK: } @@ -51,3 +51,7 @@ abort1: declare void @end0() noreturn declare void @end1() noreturn + +; CHECK: attributes #0 = { nounwind } +; CHECK: attributes #1 = { noreturn } +; CHECK: attributes [[NOR_NUW]] = { noreturn nounwind } diff --git a/test/Transforms/LoopVectorize/12-12-11-if-conv.ll 
b/test/Transforms/LoopVectorize/12-12-11-if-conv.ll index f285887..2dd7fe3 100644 --- a/test/Transforms/LoopVectorize/12-12-11-if-conv.ll +++ b/test/Transforms/LoopVectorize/12-12-11-if-conv.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -enable-if-conversion -dce -instcombine -licm -S | FileCheck %s +; RUN: opt < %s -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -enable-if-conversion -dce -instcombine -S | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.8.0" diff --git a/test/Transforms/LoopVectorize/ARM/arm-unroll.ll b/test/Transforms/LoopVectorize/ARM/arm-unroll.ll new file mode 100644 index 0000000..c8d307f --- /dev/null +++ b/test/Transforms/LoopVectorize/ARM/arm-unroll.ll @@ -0,0 +1,32 @@ +; RUN: opt < %s -loop-vectorize -mtriple=thumbv7-apple-ios3.0.0 -S | FileCheck %s +; RUN: opt < %s -loop-vectorize -mtriple=thumbv7-apple-ios3.0.0 -mcpu=swift -S | FileCheck %s --check-prefix=SWIFT + +target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32" +target triple = "thumbv7-apple-ios3.0.0" + +;CHECK: @foo +;CHECK: load <4 x i32> +;CHECK-NOT: load <4 x i32> +;CHECK: ret +;SWIFT: @foo +;SWIFT: load <4 x i32> +;SWIFT: load <4 x i32> +;SWIFT: ret +define i32 @foo(i32* nocapture %A, i32 %n) nounwind readonly ssp { + %1 = icmp sgt i32 %n, 0 + br i1 %1, label %.lr.ph, label %._crit_edge + +.lr.ph: ; preds = %0, %.lr.ph + %i.02 = phi i32 [ %5, %.lr.ph ], [ 0, %0 ] + %sum.01 = phi i32 [ %4, %.lr.ph ], [ 0, %0 ] + %2 = getelementptr inbounds i32* %A, i32 %i.02 + %3 = load i32* %2, align 4 + %4 = add nsw i32 %3, %sum.01 + %5 = add nsw i32 %i.02, 1 + %exitcond = icmp eq i32 %5, %n + br i1 %exitcond, label %._crit_edge, label %.lr.ph + +._crit_edge: ; preds = %.lr.ph, %0 + %sum.0.lcssa = phi i32 [ 0, %0 ], [ %4, %.lr.ph ] + ret i32 %sum.0.lcssa +} diff --git a/test/Transforms/LoopVectorize/ARM/gcc-examples.ll b/test/Transforms/LoopVectorize/ARM/gcc-examples.ll new file mode 100644 index 0000000..6a68e81 --- /dev/null +++ b/test/Transforms/LoopVectorize/ARM/gcc-examples.ll @@ -0,0 +1,60 @@ +; RUN: opt < %s -loop-vectorize -mtriple=thumbv7-apple-ios3.0.0 -mcpu=swift -S -dce | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32" +target triple = "thumbv7-apple-ios3.0.0" + +@b = common global [2048 x i32] zeroinitializer, align 16 +@c = common global [2048 x i32] zeroinitializer, align 16 +@a = common global [2048 x i32] zeroinitializer, align 16 + +; Select VF = 8; +;CHECK: @example1 +;CHECK: load <4 x i32> +;CHECK: add nsw <4 x i32> +;CHECK: store <4 x i32> +;CHECK: ret void +define void @example1() nounwind uwtable ssp { + br label %1 + +; <label>:1 ; preds = %1, %0 + %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ] + %2 = getelementptr inbounds [2048 x i32]* @b, i64 0, i64 %indvars.iv + %3 = load i32* %2, align 4 + %4 = getelementptr inbounds [2048 x i32]* @c, i64 0, i64 %indvars.iv + %5 = load i32* %4, align 4 + %6 = add nsw i32 %5, %3 + %7 = getelementptr inbounds [2048 x i32]* @a, i64 0, i64 %indvars.iv + store i32 %6, i32* %7, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 256 + br 
i1 %exitcond, label %8, label %1 + +; <label>:8 ; preds = %1 + ret void +} + +;CHECK: @example10b +;CHECK: load <4 x i16> +;CHECK: sext <4 x i16> +;CHECK: store <4 x i32> +;CHECK: ret void +define void @example10b(i16* noalias nocapture %sa, i16* noalias nocapture %sb, i16* noalias nocapture %sc, i32* noalias nocapture %ia, i32* noalias nocapture %ib, i32* noalias nocapture %ic) nounwind uwtable ssp { + br label %1 + +; <label>:1 ; preds = %1, %0 + %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ] + %2 = getelementptr inbounds i16* %sb, i64 %indvars.iv + %3 = load i16* %2, align 2 + %4 = sext i16 %3 to i32 + %5 = getelementptr inbounds i32* %ia, i64 %indvars.iv + store i32 %4, i32* %5, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 1024 + br i1 %exitcond, label %6, label %1 + +; <label>:6 ; preds = %1 + ret void +} + diff --git a/test/Transforms/LoopVectorize/ARM/lit.local.cfg b/test/Transforms/LoopVectorize/ARM/lit.local.cfg new file mode 100644 index 0000000..cb77b09 --- /dev/null +++ b/test/Transforms/LoopVectorize/ARM/lit.local.cfg @@ -0,0 +1,6 @@ +config.suffixes = ['.ll', '.c', '.cpp'] + +targets = set(config.root.targets_to_build.split()) +if not 'ARM' in targets: + config.unsupported = True + diff --git a/test/Transforms/LoopVectorize/ARM/mul-cast-vect.ll b/test/Transforms/LoopVectorize/ARM/mul-cast-vect.ll new file mode 100644 index 0000000..d2e3de2 --- /dev/null +++ b/test/Transforms/LoopVectorize/ARM/mul-cast-vect.ll @@ -0,0 +1,114 @@ +; RUN: opt < %s -cost-model -analyze -mtriple=armv7-linux-gnueabihf -mcpu=cortex-a9 | FileCheck --check-prefix=COST %s +; To see the assembly output: llc -mcpu=cortex-a9 < %s | FileCheck --check-prefix=ASM %s +; ASM lines below are only for reference, tests on that direction should go to tests/CodeGen/ARM + +; ModuleID = 'arm.ll' +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:64:128-a0:0:64-n32-S64" +target triple = "armv7--linux-gnueabihf" + +%T216 = type <2 x i16> +%T232 = type <2 x i32> +%T264 = type <2 x i64> + +%T416 = type <4 x i16> +%T432 = type <4 x i32> +%T464 = type <4 x i64> + +define void @direct(%T432* %loadaddr, %T432* %loadaddr2, %T432* %storeaddr) { +; COST: function 'direct': + %v0 = load %T432* %loadaddr +; ASM: vld1.64 + %v1 = load %T432* %loadaddr2 +; ASM: vld1.64 + %r3 = mul %T432 %v0, %v1 +; COST: cost of 2 for instruction: {{.*}} mul <4 x i32> +; ASM: vmul.i32 + store %T432 %r3, %T432* %storeaddr +; ASM: vst1.64 + ret void +} + +define void @ups1632(%T416* %loadaddr, %T416* %loadaddr2, %T432* %storeaddr) { +; COST: function 'ups1632': + %v0 = load %T416* %loadaddr +; ASM: vldr + %v1 = load %T416* %loadaddr2 +; ASM: vldr + %r1 = sext %T416 %v0 to %T432 + %r2 = sext %T416 %v1 to %T432 +; COST: cost of 0 for instruction: {{.*}} sext <4 x i16> {{.*}} to <4 x i32> + %r3 = mul %T432 %r1, %r2 +; COST: cost of 2 for instruction: {{.*}} mul <4 x i32> +; ASM: vmull.s16 + store %T432 %r3, %T432* %storeaddr +; ASM: vst1.64 + ret void +} + +define void @upu1632(%T416* %loadaddr, %T416* %loadaddr2, %T432* %storeaddr) { +; COST: function 'upu1632': + %v0 = load %T416* %loadaddr +; ASM: vldr + %v1 = load %T416* %loadaddr2 +; ASM: vldr + %r1 = zext %T416 %v0 to %T432 + %r2 = zext %T416 %v1 to %T432 +; COST: cost of 0 for instruction: {{.*}} zext <4 x i16> {{.*}} to <4 x i32> + %r3 = mul %T432 %r1, %r2 +; COST: cost of 2 for instruction: {{.*}} mul <4 x i32> +; ASM: 
vmull.u16 + store %T432 %r3, %T432* %storeaddr +; ASM: vst1.64 + ret void +} + +define void @ups3264(%T232* %loadaddr, %T232* %loadaddr2, %T264* %storeaddr) { +; COST: function 'ups3264': + %v0 = load %T232* %loadaddr +; ASM: vldr + %v1 = load %T232* %loadaddr2 +; ASM: vldr + %r3 = mul %T232 %v0, %v1 +; ASM: vmul.i32 +; COST: cost of 1 for instruction: {{.*}} mul <2 x i32> + %st = sext %T232 %r3 to %T264 +; ASM: vmovl.s32 +; COST: cost of 1 for instruction: {{.*}} sext <2 x i32> {{.*}} to <2 x i64> + store %T264 %st, %T264* %storeaddr +; ASM: vst1.64 + ret void +} + +define void @upu3264(%T232* %loadaddr, %T232* %loadaddr2, %T264* %storeaddr) { +; COST: function 'upu3264': + %v0 = load %T232* %loadaddr +; ASM: vldr + %v1 = load %T232* %loadaddr2 +; ASM: vldr + %r3 = mul %T232 %v0, %v1 +; ASM: vmul.i32 +; COST: cost of 1 for instruction: {{.*}} mul <2 x i32> + %st = zext %T232 %r3 to %T264 +; ASM: vmovl.u32 +; COST: cost of 1 for instruction: {{.*}} zext <2 x i32> {{.*}} to <2 x i64> + store %T264 %st, %T264* %storeaddr +; ASM: vst1.64 + ret void +} + +define void @dn3216(%T432* %loadaddr, %T432* %loadaddr2, %T416* %storeaddr) { +; COST: function 'dn3216': + %v0 = load %T432* %loadaddr +; ASM: vld1.64 + %v1 = load %T432* %loadaddr2 +; ASM: vld1.64 + %r3 = mul %T432 %v0, %v1 +; ASM: vmul.i32 +; COST: cost of 2 for instruction: {{.*}} mul <4 x i32> + %st = trunc %T432 %r3 to %T416 +; ASM: vmovn.i32 +; COST: cost of 1 for instruction: {{.*}} trunc <4 x i32> {{.*}} to <4 x i16> + store %T416 %st, %T416* %storeaddr +; ASM: vstr + ret void +} diff --git a/test/Transforms/LoopVectorize/ARM/width-detect.ll b/test/Transforms/LoopVectorize/ARM/width-detect.ll new file mode 100644 index 0000000..c0795b6 --- /dev/null +++ b/test/Transforms/LoopVectorize/ARM/width-detect.ll @@ -0,0 +1,52 @@ +; RUN: opt < %s -loop-vectorize -mtriple=thumbv7-apple-ios3.0.0 -S | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32" +target triple = "thumbv7-apple-ios3.0.0" + +;CHECK:foo_F64 +;CHECK: <2 x double> +;CHECK:ret +define double @foo_F64(double* nocapture %A, i32 %n) nounwind uwtable readonly ssp { + %1 = icmp sgt i32 %n, 0 + br i1 %1, label %.lr.ph, label %._crit_edge + +.lr.ph: ; preds = %0, %.lr.ph + %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ] + %prod.01 = phi double [ %4, %.lr.ph ], [ 0.000000e+00, %0 ] + %2 = getelementptr inbounds double* %A, i64 %indvars.iv + %3 = load double* %2, align 8 + %4 = fmul fast double %prod.01, %3 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %._crit_edge, label %.lr.ph + +._crit_edge: ; preds = %.lr.ph, %0 + %prod.0.lcssa = phi double [ 0.000000e+00, %0 ], [ %4, %.lr.ph ] + ret double %prod.0.lcssa +} + +;CHECK:foo_I8 +;CHECK: xor <16 x i8> +;CHECK:ret +define signext i8 @foo_I8(i8* nocapture %A, i32 %n) nounwind uwtable readonly ssp { + %1 = icmp sgt i32 %n, 0 + br i1 %1, label %.lr.ph, label %._crit_edge + +.lr.ph: ; preds = %0, %.lr.ph + %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ] + %red.01 = phi i8 [ %4, %.lr.ph ], [ 0, %0 ] + %2 = getelementptr inbounds i8* %A, i64 %indvars.iv + %3 = load i8* %2, align 1 + %4 = xor i8 %3, %red.01 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %._crit_edge, label %.lr.ph + 
+._crit_edge: ; preds = %.lr.ph, %0 + %red.0.lcssa = phi i8 [ 0, %0 ], [ %4, %.lr.ph ] + ret i8 %red.0.lcssa +} + + diff --git a/test/Transforms/LoopVectorize/X86/avx1.ll b/test/Transforms/LoopVectorize/X86/avx1.ll index a2d176a..a85c6fe 100644 --- a/test/Transforms/LoopVectorize/X86/avx1.ll +++ b/test/Transforms/LoopVectorize/X86/avx1.ll @@ -27,7 +27,7 @@ define i32 @read_mod_write_single_ptr(float* nocapture %a, i32 %n) nounwind uwta ;CHECK: @read_mod_i64 -;CHECK: load <8 x i64> +;CHECK: load <4 x i64> ;CHECK: ret i32 define i32 @read_mod_i64(i64* nocapture %a, i32 %n) nounwind uwtable ssp { %1 = icmp sgt i32 %n, 0 diff --git a/test/Transforms/LoopVectorize/X86/conversion-cost.ll b/test/Transforms/LoopVectorize/X86/conversion-cost.ll index 60c742e..23d9233 100644 --- a/test/Transforms/LoopVectorize/X86/conversion-cost.ll +++ b/test/Transforms/LoopVectorize/X86/conversion-cost.ll @@ -4,7 +4,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 target triple = "x86_64-apple-macosx10.8.0" ;CHECK: @conversion_cost1 -;CHECK: store <8 x i8> +;CHECK: store <32 x i8> ;CHECK: ret define i32 @conversion_cost1(i32 %n, i8* nocapture %A, float* nocapture %B) nounwind uwtable ssp { %1 = icmp sgt i32 %n, 3 diff --git a/test/Transforms/LoopVectorize/X86/gcc-examples.ll b/test/Transforms/LoopVectorize/X86/gcc-examples.ll index 0f21ba6..d2d0eac 100644 --- a/test/Transforms/LoopVectorize/X86/gcc-examples.ll +++ b/test/Transforms/LoopVectorize/X86/gcc-examples.ll @@ -1,5 +1,5 @@ -; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 -dce -instcombine -licm -S | FileCheck %s -; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 -force-vector-unroll=0 -dce -instcombine -licm -S | FileCheck %s -check-prefix=UNROLL +; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 -dce -instcombine -S | FileCheck %s +; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 -force-vector-unroll=0 -dce -instcombine -S | FileCheck %s -check-prefix=UNROLL target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.8.0" @@ -53,8 +53,6 @@ define void @example1() nounwind uwtable ssp { ;UNROLL: @example10b ;UNROLL: load <4 x i16> ;UNROLL: load <4 x i16> -;UNROLL: load <4 x i16> -;UNROLL: store <4 x i32> ;UNROLL: store <4 x i32> ;UNROLL: store <4 x i32> ;UNROLL: ret void diff --git a/test/Transforms/LoopVectorize/X86/min-trip-count-switch.ll b/test/Transforms/LoopVectorize/X86/min-trip-count-switch.ll new file mode 100644 index 0000000..186fba8 --- /dev/null +++ b/test/Transforms/LoopVectorize/X86/min-trip-count-switch.ll @@ -0,0 +1,28 @@ +; RUN: opt < %s -loop-vectorize -force-vector-unroll=1 -vectorizer-min-trip-count=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; CHECK: <4 x float> +define void @trivial_loop(float* nocapture %a) nounwind uwtable optsize { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds float* %a, i64 %indvars.iv + %0 = load float* %arrayidx, align 4, !tbaa !0 + %add = fadd float 
%0, 1.000000e+00 + store float %add, float* %arrayidx, align 4, !tbaa !0 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 8 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +!0 = metadata !{metadata !"float", metadata !1} +!1 = metadata !{metadata !"omnipotent char", metadata !2} +!2 = metadata !{metadata !"Simple C/C++ TBAA"} diff --git a/test/Transforms/LoopVectorize/X86/parallel-loops-after-reg2mem.ll b/test/Transforms/LoopVectorize/X86/parallel-loops-after-reg2mem.ll new file mode 100644 index 0000000..452d0df --- /dev/null +++ b/test/Transforms/LoopVectorize/X86/parallel-loops-after-reg2mem.ll @@ -0,0 +1,52 @@ +; RUN: opt < %s -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; The parallel loop has been invalidated by the new memory accesses introduced +; by reg2mem (Loop::isParallel() starts to return false). Ensure the loop is +; now non-vectorizable. + +;CHECK-NOT: <4 x i32> +define void @parallel_loop(i32* nocapture %a, i32* nocapture %b) nounwind uwtable { +entry: + %indvars.iv.next.reg2mem = alloca i64 + %indvars.iv.reg2mem = alloca i64 + %"reg2mem alloca point" = bitcast i32 0 to i32 + store i64 0, i64* %indvars.iv.reg2mem + br label %for.body + +for.body: ; preds = %for.body.for.body_crit_edge, %entry + %indvars.iv.reload = load i64* %indvars.iv.reg2mem + %arrayidx = getelementptr inbounds i32* %b, i64 %indvars.iv.reload + %0 = load i32* %arrayidx, align 4, !tbaa !0, !llvm.mem.parallel_loop_access !3 + %arrayidx2 = getelementptr inbounds i32* %a, i64 %indvars.iv.reload + %1 = load i32* %arrayidx2, align 4, !tbaa !0, !llvm.mem.parallel_loop_access !3 + %idxprom3 = sext i32 %1 to i64 + %arrayidx4 = getelementptr inbounds i32* %a, i64 %idxprom3 + store i32 %0, i32* %arrayidx4, align 4, !tbaa !0, !llvm.mem.parallel_loop_access !3 + %indvars.iv.next = add i64 %indvars.iv.reload, 1 + ; A new store without the parallel metadata here: + store i64 %indvars.iv.next, i64* %indvars.iv.next.reg2mem + %indvars.iv.next.reload1 = load i64* %indvars.iv.next.reg2mem + %arrayidx6 = getelementptr inbounds i32* %b, i64 %indvars.iv.next.reload1 + %2 = load i32* %arrayidx6, align 4, !tbaa !0, !llvm.mem.parallel_loop_access !3 + store i32 %2, i32* %arrayidx2, align 4, !tbaa !0, !llvm.mem.parallel_loop_access !3 + %indvars.iv.next.reload = load i64* %indvars.iv.next.reg2mem + %lftr.wideiv = trunc i64 %indvars.iv.next.reload to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 512 + br i1 %exitcond, label %for.end, label %for.body.for.body_crit_edge, !llvm.loop.parallel !3 + +for.body.for.body_crit_edge: ; preds = %for.body + %indvars.iv.next.reload2 = load i64* %indvars.iv.next.reg2mem + store i64 %indvars.iv.next.reload2, i64* %indvars.iv.reg2mem + br label %for.body + +for.end: ; preds = %for.body + ret void +} + +!0 = metadata !{metadata !"int", metadata !1} +!1 = metadata !{metadata !"omnipotent char", metadata !2} +!2 = metadata !{metadata !"Simple C/C++ TBAA"} +!3 = metadata !{metadata !3} diff --git a/test/Transforms/LoopVectorize/X86/parallel-loops.ll b/test/Transforms/LoopVectorize/X86/parallel-loops.ll new file mode 100644 index 0000000..f648722 --- /dev/null +++ 
b/test/Transforms/LoopVectorize/X86/parallel-loops.ll
@@ -0,0 +1,114 @@
+; RUN: opt < %s -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; A tricky loop:
+;
+; void loop(int *a, int *b) {
+;   for (int i = 0; i < 512; ++i) {
+;     a[a[i]] = b[i];
+;     a[i] = b[i+1];
+;   }
+; }
+
+;CHECK: @loop
+;CHECK-NOT: <4 x i32>
+define void @loop(i32* nocapture %a, i32* nocapture %b) nounwind uwtable {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i32* %b, i64 %indvars.iv
+ %0 = load i32* %arrayidx, align 4, !tbaa !0
+ %arrayidx2 = getelementptr inbounds i32* %a, i64 %indvars.iv
+ %1 = load i32* %arrayidx2, align 4, !tbaa !0
+ %idxprom3 = sext i32 %1 to i64
+ %arrayidx4 = getelementptr inbounds i32* %a, i64 %idxprom3
+ store i32 %0, i32* %arrayidx4, align 4, !tbaa !0
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %arrayidx6 = getelementptr inbounds i32* %b, i64 %indvars.iv.next
+ %2 = load i32* %arrayidx6, align 4, !tbaa !0
+ store i32 %2, i32* %arrayidx2, align 4, !tbaa !0
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 512
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+; The same loop with parallel loop metadata added to the loop branch
+; and the memory instructions.
+
+;CHECK: @parallel_loop
+;CHECK: <4 x i32>
+define void @parallel_loop(i32* nocapture %a, i32* nocapture %b) nounwind uwtable {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i32* %b, i64 %indvars.iv
+ %0 = load i32* %arrayidx, align 4, !tbaa !0, !llvm.mem.parallel_loop_access !3
+ %arrayidx2 = getelementptr inbounds i32* %a, i64 %indvars.iv
+ %1 = load i32* %arrayidx2, align 4, !tbaa !0, !llvm.mem.parallel_loop_access !3
+ %idxprom3 = sext i32 %1 to i64
+ %arrayidx4 = getelementptr inbounds i32* %a, i64 %idxprom3
+ ; This store might have originated from inlining a function with a parallel
+ ; loop. It refers to a metadata list that also includes the "original loop
+ ; reference" (!4).
+ store i32 %0, i32* %arrayidx4, align 4, !tbaa !0, !llvm.mem.parallel_loop_access !5
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %arrayidx6 = getelementptr inbounds i32* %b, i64 %indvars.iv.next
+ %2 = load i32* %arrayidx6, align 4, !tbaa !0, !llvm.mem.parallel_loop_access !3
+ store i32 %2, i32* %arrayidx2, align 4, !tbaa !0, !llvm.mem.parallel_loop_access !3
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 512
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop.parallel !3
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+; The same loop with illegal parallel loop metadata: the memory
+; accesses refer to a different loop's identifier.
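+;
+; A C-level sketch of the situation (our illustration, not part of the
+; original test): the same loop as @parallel_loop above, except that the
+; troublesome store is annotated as belonging to a different, hypothetical
+; loop:
+;
+; void mixed_metadata(int *a, int *b) {
+;   for (int i = 0; i < 512; ++i) {  // the loop branch carries loop id !6
+;     a[a[i]] = b[i];                // this store is tagged with foreign id !7
+;     a[i] = b[i+1];
+;   }
+; }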
+ +;CHECK: @mixed_metadata +;CHECK-NOT: <4 x i32> + +define void @mixed_metadata(i32* nocapture %a, i32* nocapture %b) nounwind uwtable { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32* %b, i64 %indvars.iv + %0 = load i32* %arrayidx, align 4, !tbaa !0, !llvm.mem.parallel_loop_access !6 + %arrayidx2 = getelementptr inbounds i32* %a, i64 %indvars.iv + %1 = load i32* %arrayidx2, align 4, !tbaa !0, !llvm.mem.parallel_loop_access !6 + %idxprom3 = sext i32 %1 to i64 + %arrayidx4 = getelementptr inbounds i32* %a, i64 %idxprom3 + ; This refers to the loop marked with !7 which we are not in at the moment. + ; It should prevent detecting as a parallel loop. + store i32 %0, i32* %arrayidx4, align 4, !tbaa !0, !llvm.mem.parallel_loop_access !7 + %indvars.iv.next = add i64 %indvars.iv, 1 + %arrayidx6 = getelementptr inbounds i32* %b, i64 %indvars.iv.next + %2 = load i32* %arrayidx6, align 4, !tbaa !0, !llvm.mem.parallel_loop_access !6 + store i32 %2, i32* %arrayidx2, align 4, !tbaa !0, !llvm.mem.parallel_loop_access !6 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 512 + br i1 %exitcond, label %for.end, label %for.body, !llvm.loop.parallel !6 + +for.end: ; preds = %for.body + ret void +} + +!0 = metadata !{metadata !"int", metadata !1} +!1 = metadata !{metadata !"omnipotent char", metadata !2} +!2 = metadata !{metadata !"Simple C/C++ TBAA"} +!3 = metadata !{metadata !3} +!4 = metadata !{metadata !4} +!5 = metadata !{metadata !3, metadata !4} +!6 = metadata !{metadata !6} +!7 = metadata !{metadata !7} diff --git a/test/Transforms/LoopVectorize/small-size.ll b/test/Transforms/LoopVectorize/X86/small-size.ll index 35b91bb..f390b33 100644 --- a/test/Transforms/LoopVectorize/small-size.ll +++ b/test/Transforms/LoopVectorize/X86/small-size.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s +; RUN: opt < %s -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.8.0" diff --git a/test/Transforms/LoopVectorize/X86/unroll-small-loops.ll b/test/Transforms/LoopVectorize/X86/unroll-small-loops.ll index 2075986..ef63a14 100644 --- a/test/Transforms/LoopVectorize/X86/unroll-small-loops.ll +++ b/test/Transforms/LoopVectorize/X86/unroll-small-loops.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx2 -force-vector-width=4 -force-vector-unroll=0 -dce -S | FileCheck %s +; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -force-vector-width=4 -force-vector-unroll=0 -dce -S | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.8.0" diff --git a/test/Transforms/LoopVectorize/X86/unroll_selection.ll b/test/Transforms/LoopVectorize/X86/unroll_selection.ll new file mode 100644 index 0000000..2d7b663 --- /dev/null +++ b/test/Transforms/LoopVectorize/X86/unroll_selection.ll @@ -0,0 +1,71 @@ +; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx 
-force-vector-width=4 -force-vector-unroll=0 -dce -S | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +; Don't unroll when we have register pressure. +;CHECK: reg_pressure +;CHECK: load <4 x double> +;CHECK-NOT: load <4 x double> +;CHECK: store <4 x double> +;CHECK-NOT: store <4 x double> +;CHECK: ret +define void @reg_pressure(double* nocapture %A, i32 %n) nounwind uwtable ssp { + %1 = sext i32 %n to i64 + br label %2 + +; <label>:2 ; preds = %2, %0 + %indvars.iv = phi i64 [ %indvars.iv.next, %2 ], [ %1, %0 ] + %3 = getelementptr inbounds double* %A, i64 %indvars.iv + %4 = load double* %3, align 8 + %5 = fadd double %4, 3.000000e+00 + %6 = fmul double %4, 2.000000e+00 + %7 = fadd double %5, %6 + %8 = fadd double %7, 2.000000e+00 + %9 = fmul double %8, 5.000000e-01 + %10 = fadd double %6, %9 + %11 = fsub double %10, %5 + %12 = fadd double %4, %11 + %13 = fdiv double %8, %12 + %14 = fmul double %13, %8 + %15 = fmul double %6, %14 + %16 = fmul double %5, %15 + %17 = fadd double %16, -3.000000e+00 + %18 = fsub double %4, %5 + %19 = fadd double %6, %18 + %20 = fadd double %13, %19 + %21 = fadd double %20, %17 + %22 = fadd double %21, 3.000000e+00 + %23 = fmul double %4, %22 + store double %23, double* %3, align 8 + %indvars.iv.next = add i64 %indvars.iv, -1 + %24 = trunc i64 %indvars.iv to i32 + %25 = icmp eq i32 %24, 0 + br i1 %25, label %26, label %2 + +; <label>:26 ; preds = %2 + ret void +} + +; This is a small loop. Unroll it twice. +;CHECK: small_loop +;CHECK: xor +;CHECK: xor +;CHECK: ret +define void @small_loop(i16* nocapture %A, i64 %n) nounwind uwtable ssp { + %1 = icmp eq i64 %n, 0 + br i1 %1, label %._crit_edge, label %.lr.ph + +.lr.ph: ; preds = %0, %.lr.ph + %i.01 = phi i64 [ %5, %.lr.ph ], [ 0, %0 ] + %2 = getelementptr inbounds i16* %A, i64 %i.01 + %3 = load i16* %2, align 2 + %4 = xor i16 %3, 3 + store i16 %4, i16* %2, align 2 + %5 = add i64 %i.01, 1 + %exitcond = icmp eq i64 %5, %n + br i1 %exitcond, label %._crit_edge, label %.lr.ph + +._crit_edge: ; preds = %.lr.ph, %0 + ret void +} diff --git a/test/Transforms/LoopVectorize/X86/vector_ptr_load_store.ll b/test/Transforms/LoopVectorize/X86/vector_ptr_load_store.ll new file mode 100644 index 0000000..59bb8d0 --- /dev/null +++ b/test/Transforms/LoopVectorize/X86/vector_ptr_load_store.ll @@ -0,0 +1,150 @@ +; RUN: opt -loop-vectorize -mcpu=corei7-avx -debug -S < %s 2>&1 | FileCheck %s +; REQUIRES: asserts + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +%0 = type { %0*, %1 } +%1 = type { i8*, i32 } + +@p = global [2048 x [8 x i32*]] zeroinitializer, align 16 +@q = global [2048 x i16] zeroinitializer, align 16 +@r = global [2048 x i16] zeroinitializer, align 16 + +; Tests for widest type +; Ensure that we count the pointer store in the first test case. We have a +; consecutive vector of pointers store, therefore we should count it towards the +; widest vector count. 
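+;
+; A rough C equivalent of the first test case below (our reconstruction, not
+; part of the original commit; 'S' stands for the recursive struct %0 above):
+; every iteration stores one pointer value into consecutive slots of a pointer
+; array, so the 64-bit pointer type is the widest type the vectorizer should
+; see.
+;
+; void test_consecutive_store(S **begin, S **end, S **val) {
+;   S *v = *val;
+;   for (S **p = begin; p != end; ++p)
+;     *p = v;   // consecutive 64-bit pointer stores
+; }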
+;
+; CHECK: test_consecutive_store
+; CHECK: The Widest type: 64 bits
+define void @test_consecutive_store(%0**, %0**, %0** nocapture) nounwind ssp uwtable align 2 {
+ %4 = load %0** %2, align 8
+ %5 = icmp eq %0** %0, %1
+ br i1 %5, label %12, label %6
+
+; <label>:6 ; preds = %3
+ br label %7
+
+; <label>:7 ; preds = %7, %6
+ %8 = phi %0** [ %0, %6 ], [ %9, %7 ]
+ store %0* %4, %0** %8, align 8
+ %9 = getelementptr inbounds %0** %8, i64 1
+ %10 = icmp eq %0** %9, %1
+ br i1 %10, label %11, label %7
+
+; <label>:11 ; preds = %7
+ br label %12
+
+; <label>:12 ; preds = %11, %3
+ ret void
+}
+
+; However, if the store of a set of pointers is not to consecutive memory, we do
+; NOT count the store towards the widest vector type.
+; In the test case below we convert i16 values to pointers and store them into
+; an array of pointers, so the widest type should be i16.
+; int* p[2048][8];
+; short q[2048];
+; for (int y = 0; y < 8; ++y)
+;   for (int i = 0; i < 1024; ++i) {
+;     p[i][y] = (int*) (1 + q[i]);
+;   }
+; CHECK: test_nonconsecutive_store
+; CHECK: The Widest type: 16 bits
+define void @test_nonconsecutive_store() nounwind ssp uwtable {
+ br label %1
+
+; <label>:1 ; preds = %14, %0
+ %2 = phi i64 [ 0, %0 ], [ %15, %14 ]
+ br label %3
+
+; <label>:3 ; preds = %3, %1
+ %4 = phi i64 [ 0, %1 ], [ %11, %3 ]
+ %5 = getelementptr inbounds [2048 x i16]* @q, i64 0, i64 %4
+ %6 = load i16* %5, align 2
+ %7 = sext i16 %6 to i64
+ %8 = add i64 %7, 1
+ %9 = inttoptr i64 %8 to i32*
+ %10 = getelementptr inbounds [2048 x [8 x i32*]]* @p, i64 0, i64 %4, i64 %2
+ store i32* %9, i32** %10, align 8
+ %11 = add i64 %4, 1
+ %12 = trunc i64 %11 to i32
+ %13 = icmp ne i32 %12, 1024
+ br i1 %13, label %3, label %14
+
+; <label>:14 ; preds = %3
+ %15 = add i64 %2, 1
+ %16 = trunc i64 %15 to i32
+ %17 = icmp ne i32 %16, 8
+ br i1 %17, label %1, label %18
+
+; <label>:18 ; preds = %14
+ ret void
+}
+
+
+@ia = global [1024 x i32*] zeroinitializer, align 16
+@ib = global [1024 x i32] zeroinitializer, align 16
+@ic = global [1024 x i8] zeroinitializer, align 16
+@p2 = global [2048 x [8 x i32*]] zeroinitializer, align 16
+@q2 = global [2048 x i16] zeroinitializer, align 16
+
+;; Now we check the same rules for loads. We should take consecutive loads of
+;; pointer types into account.
+; CHECK: test_consecutive_ptr_load
+; CHECK: The Widest type: 64 bits
+define i8 @test_consecutive_ptr_load() nounwind readonly ssp uwtable {
+ br label %1
+
+; <label>:1 ; preds = %1, %0
+ %2 = phi i64 [ 0, %0 ], [ %10, %1 ]
+ %3 = phi i8 [ 0, %0 ], [ %9, %1 ]
+ %4 = getelementptr inbounds [1024 x i32*]* @ia, i32 0, i64 %2
+ %5 = load i32** %4, align 4
+ %6 = ptrtoint i32* %5 to i64
+ %7 = trunc i64 %6 to i8
+ %8 = add i8 %3, 1
+ %9 = add i8 %7, %8
+ %10 = add i64 %2, 1
+ %11 = icmp ne i64 %10, 1024
+ br i1 %11, label %1, label %12
+
+; <label>:12 ; preds = %1
+ %13 = phi i8 [ %9, %1 ]
+ ret i8 %13
+}
+
+;; However, we should not take non-consecutive loads of pointers into account.
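+; An assumed C equivalent (for illustration only; it mirrors the store test
+; above with the direction of the copy reversed):
+;
+; int *p2[2048][8];
+; short q2[2048];
+; for (int y = 0; y < 8; ++y)
+;   for (int i = 0; i < 1024; ++i)
+;     q2[i] = (short)(long)p2[i][y];  // strided, non-consecutive pointer loads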
+; CHECK: test_nonconsecutive_ptr_load +; CHECK: The Widest type: 16 bits +define void @test_nonconsecutive_ptr_load() nounwind ssp uwtable { + br label %1 + +; <label>:1 ; preds = %13, %0 + %2 = phi i64 [ 0, %0 ], [ %14, %13 ] + br label %3 + +; <label>:3 ; preds = %3, %1 + %4 = phi i64 [ 0, %1 ], [ %10, %3 ] + %5 = getelementptr inbounds [2048 x [8 x i32*]]* @p2, i64 0, i64 %4, i64 %2 + %6 = getelementptr inbounds [2048 x i16]* @q2, i64 0, i64 %4 + %7 = load i32** %5, align 2 + %8 = ptrtoint i32* %7 to i64 + %9 = trunc i64 %8 to i16 + store i16 %9, i16* %6, align 8 + %10 = add i64 %4, 1 + %11 = trunc i64 %10 to i32 + %12 = icmp ne i32 %11, 1024 + br i1 %12, label %3, label %13 + +; <label>:13 ; preds = %3 + %14 = add i64 %2, 1 + %15 = trunc i64 %14 to i32 + %16 = icmp ne i32 %15, 8 + br i1 %16, label %1, label %17 + +; <label>:17 ; preds = %13 + ret void +} + diff --git a/test/Transforms/LoopVectorize/bzip_reverse_loops.ll b/test/Transforms/LoopVectorize/bzip_reverse_loops.ll new file mode 100644 index 0000000..431e422 --- /dev/null +++ b/test/Transforms/LoopVectorize/bzip_reverse_loops.ll @@ -0,0 +1,71 @@ +; RUN: opt < %s -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S -enable-if-conversion | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +;CHECK: fc +;CHECK: load <4 x i16> +;CHECK-NEXT: shufflevector <4 x i16> +;CHECK: select <4 x i1> +;CHECK: store <4 x i16> +;CHECK: ret +define void @fc(i16* nocapture %p, i32 %n, i32 %size) nounwind uwtable ssp { +entry: + br label %do.body + +do.body: ; preds = %cond.end, %entry + %n.addr.0 = phi i32 [ %n, %entry ], [ %dec, %cond.end ] + %p.addr.0 = phi i16* [ %p, %entry ], [ %incdec.ptr, %cond.end ] + %incdec.ptr = getelementptr inbounds i16* %p.addr.0, i64 -1 + %0 = load i16* %incdec.ptr, align 2, !tbaa !0 + %conv = zext i16 %0 to i32 + %cmp = icmp ult i32 %conv, %size + br i1 %cmp, label %cond.end, label %cond.true + +cond.true: ; preds = %do.body + %sub = sub i32 %conv, %size + %phitmp = trunc i32 %sub to i16 + br label %cond.end + +cond.end: ; preds = %do.body, %cond.true + %cond = phi i16 [ %phitmp, %cond.true ], [ 0, %do.body ] + store i16 %cond, i16* %incdec.ptr, align 2, !tbaa !0 + %dec = add i32 %n.addr.0, -1 + %tobool = icmp eq i32 %dec, 0 + br i1 %tobool, label %do.end, label %do.body + +do.end: ; preds = %cond.end + ret void +} + +;CHECK: example1 +;CHECK: load <4 x i32> +;CHECK-NEXT: shufflevector <4 x i32> +;CHECK: select <4 x i1> +;CHECK: store <4 x i32> +;CHECK: ret +define void @example1(i32* nocapture %a, i32 %n, i32 %wsize) nounwind uwtable ssp { +entry: + br label %do.body + +do.body: ; preds = %do.body, %entry + %n.addr.0 = phi i32 [ %n, %entry ], [ %dec, %do.body ] + %p.0 = phi i32* [ %a, %entry ], [ %incdec.ptr, %do.body ] + %incdec.ptr = getelementptr inbounds i32* %p.0, i64 -1 + %0 = load i32* %incdec.ptr, align 4, !tbaa !3 + %cmp = icmp slt i32 %0, %wsize + %sub = sub nsw i32 %0, %wsize + %cond = select i1 %cmp, i32 0, i32 %sub + store i32 %cond, i32* %incdec.ptr, align 4, !tbaa !3 + %dec = add nsw i32 %n.addr.0, -1 + %tobool = icmp eq i32 %dec, 0 + br i1 %tobool, label %do.end, label %do.body + +do.end: ; preds = %do.body + ret void +} + +!0 = metadata !{metadata !"short", metadata !1} +!1 = metadata !{metadata !"omnipotent char", metadata !2} +!2 = metadata !{metadata !"Simple C/C++ TBAA"} +!3 = metadata 
!{metadata !"int", metadata !1} diff --git a/test/Transforms/LoopVectorize/calloc.ll b/test/Transforms/LoopVectorize/calloc.ll index 55c1eba..08c84ef 100644 --- a/test/Transforms/LoopVectorize/calloc.ll +++ b/test/Transforms/LoopVectorize/calloc.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s +; RUN: opt < %s -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.9.0" diff --git a/test/Transforms/LoopVectorize/cast-induction.ll b/test/Transforms/LoopVectorize/cast-induction.ll index 5c090aa..2aa29ed 100644 --- a/test/Transforms/LoopVectorize/cast-induction.ll +++ b/test/Transforms/LoopVectorize/cast-induction.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s +; RUN: opt < %s -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s ; rdar://problem/12848162 diff --git a/test/Transforms/LoopVectorize/cpp-new-array.ll b/test/Transforms/LoopVectorize/cpp-new-array.ll index 7cd608d..da0fb05 100644 --- a/test/Transforms/LoopVectorize/cpp-new-array.ll +++ b/test/Transforms/LoopVectorize/cpp-new-array.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s +; RUN: opt < %s -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.8.0" diff --git a/test/Transforms/LoopVectorize/flags.ll b/test/Transforms/LoopVectorize/flags.ll index b7f3815..656912e 100644 --- a/test/Transforms/LoopVectorize/flags.ll +++ b/test/Transforms/LoopVectorize/flags.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s +; RUN: opt < %s -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.8.0" diff --git a/test/Transforms/LoopVectorize/gcc-examples.ll b/test/Transforms/LoopVectorize/gcc-examples.ll index b8b125f..f335557 100644 --- a/test/Transforms/LoopVectorize/gcc-examples.ll +++ b/test/Transforms/LoopVectorize/gcc-examples.ll @@ -1,5 +1,5 @@ -; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-unroll=1 -dce -instcombine -licm -S | FileCheck %s -; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-unroll=4 -dce -instcombine -licm -S | FileCheck %s -check-prefix=UNROLL +; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-unroll=1 -dce -instcombine -S | FileCheck %s +; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-unroll=4 -dce -instcombine -S | FileCheck %s -check-prefix=UNROLL target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" 
target triple = "x86_64-apple-macosx10.8.0"
diff --git a/test/Transforms/LoopVectorize/global_alias.ll b/test/Transforms/LoopVectorize/global_alias.ll
new file mode 100644
index 0000000..24e698b
--- /dev/null
+++ b/test/Transforms/LoopVectorize/global_alias.ll
@@ -0,0 +1,1078 @@
+; RUN: opt < %s -O3 -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:64:128-a0:0:64-n32-S64"
+
+%struct.anon = type { [100 x i32], i32, [100 x i32] }
+%struct.anon.0 = type { [100 x [100 x i32]], i32, [100 x [100 x i32]] }
+
+@Foo = common global %struct.anon zeroinitializer, align 4
+@Bar = common global %struct.anon.0 zeroinitializer, align 4
+
+@PB = external global i32*
+@PA = external global i32*
+
+
+;; === First, the tests that should always vectorize, either statically or by adding run-time checks ===
+
+
+; /// Different objects, positive induction, constant distance
+; int noAlias01 (int a) {
+;   int i;
+;   for (i=0; i<SIZE; i++)
+;     Foo.A[i] = Foo.B[i] + a;
+;   return Foo.A[a];
+; }
+; CHECK: define i32 @noAlias01
+; CHECK: add nsw <4 x i32>
+; CHECK ret
+
+define i32 @noAlias01(i32 %a) nounwind {
+entry:
+ %a.addr = alloca i32, align 4
+ %i = alloca i32, align 4
+ store i32 %a, i32* %a.addr, align 4
+ store i32 0, i32* %i, align 4
+ br label %for.cond
+
+for.cond: ; preds = %for.inc, %entry
+ %0 = load i32* %i, align 4
+ %cmp = icmp slt i32 %0, 100
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ %1 = load i32* %i, align 4
+ %arrayidx = getelementptr inbounds [100 x i32]* getelementptr inbounds (%struct.anon* @Foo, i32 0, i32 2), i32 0, i32 %1
+ %2 = load i32* %arrayidx, align 4
+ %3 = load i32* %a.addr, align 4
+ %add = add nsw i32 %2, %3
+ %4 = load i32* %i, align 4
+ %arrayidx1 = getelementptr inbounds [100 x i32]* getelementptr inbounds (%struct.anon* @Foo, i32 0, i32 0), i32 0, i32 %4
+ store i32 %add, i32* %arrayidx1, align 4
+ br label %for.inc
+
+for.inc: ; preds = %for.body
+ %5 = load i32* %i, align 4
+ %inc = add nsw i32 %5, 1
+ store i32 %inc, i32* %i, align 4
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ %6 = load i32* %a.addr, align 4
+ %arrayidx2 = getelementptr inbounds [100 x i32]* getelementptr inbounds (%struct.anon* @Foo, i32 0, i32 0), i32 0, i32 %6
+ %7 = load i32* %arrayidx2, align 4
+ ret i32 %7
+}
+
+; /// Different objects, positive induction with widening slide
+; int noAlias02 (int a) {
+;   int i;
+;   for (i=0; i<SIZE-10; i++)
+;     Foo.A[i] = Foo.B[i+10] + a;
+;   return Foo.A[a];
+; }
+; CHECK: define i32 @noAlias02
+; CHECK: add nsw <4 x i32>
+; CHECK ret
+
+define i32 @noAlias02(i32 %a) {
+entry:
+ %a.addr = alloca i32, align 4
+ %i = alloca i32, align 4
+ store i32 %a, i32* %a.addr, align 4
+ store i32 0, i32* %i, align 4
+ br label %for.cond
+
+for.cond: ; preds = %for.inc, %entry
+ %0 = load i32* %i, align 4
+ %cmp = icmp slt i32 %0, 90
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ %1 = load i32* %i, align 4
+ %add = add nsw i32 %1, 10
+ %arrayidx = getelementptr inbounds [100 x i32]* getelementptr inbounds (%struct.anon* @Foo, i32 0, i32 2), i32 0, i32 %add
+ %2 = load i32* %arrayidx, align 4
+ %3 = load i32* %a.addr, align 4
+ %add1 = add nsw i32 %2, %3
+ %4 = load i32* %i, align 4
+ %arrayidx2 = getelementptr inbounds [100 x i32]* getelementptr inbounds (%struct.anon* @Foo, i32 0, i32 0), i32 0, i32 %4
+ store i32 %add1, i32* %arrayidx2, 
align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %5 = load i32* %i, align 4 + %inc = add nsw i32 %5, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + %6 = load i32* %a.addr, align 4 + %arrayidx3 = getelementptr inbounds [100 x i32]* getelementptr inbounds (%struct.anon* @Foo, i32 0, i32 0), i32 0, i32 %6 + %7 = load i32* %arrayidx3, align 4 + ret i32 %7 +} + +; /// Different objects, positive induction with shortening slide +; int noAlias03 (int a) { +; int i; +; for (i=0; i<SIZE; i++) +; Foo.A[i+10] = Foo.B[i] + a; +; return Foo.A[a]; +; } +; CHECK: define i32 @noAlias03 +; CHECK: add nsw <4 x i32> +; CHECK ret + +define i32 @noAlias03(i32 %a) { +entry: + %a.addr = alloca i32, align 4 + %i = alloca i32, align 4 + store i32 %a, i32* %a.addr, align 4 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32* %i, align 4 + %cmp = icmp slt i32 %0, 100 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %1 = load i32* %i, align 4 + %arrayidx = getelementptr inbounds [100 x i32]* getelementptr inbounds (%struct.anon* @Foo, i32 0, i32 2), i32 0, i32 %1 + %2 = load i32* %arrayidx, align 4 + %3 = load i32* %a.addr, align 4 + %add = add nsw i32 %2, %3 + %4 = load i32* %i, align 4 + %add1 = add nsw i32 %4, 10 + %arrayidx2 = getelementptr inbounds [100 x i32]* getelementptr inbounds (%struct.anon* @Foo, i32 0, i32 0), i32 0, i32 %add1 + store i32 %add, i32* %arrayidx2, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %5 = load i32* %i, align 4 + %inc = add nsw i32 %5, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + %6 = load i32* %a.addr, align 4 + %arrayidx3 = getelementptr inbounds [100 x i32]* getelementptr inbounds (%struct.anon* @Foo, i32 0, i32 0), i32 0, i32 %6 + %7 = load i32* %arrayidx3, align 4 + ret i32 %7 +} + +; /// Pointer access, positive stride, run-time check added +; int noAlias04 (int a) { +; int i; +; for (i=0; i<SIZE; i++) +; *(PA+i) = *(PB+i) + a; +; return *(PA+a); +; } +; CHECK: define i32 @noAlias04 +; CHECK-NOT: add nsw <4 x i32> +; CHECK ret +; +; TODO: This test vectorizes (with run-time check) on real targets with -O3) +; Check why it's not being vectorized even when forcing vectorization + +define i32 @noAlias04(i32 %a) #0 { +entry: + %a.addr = alloca i32, align 4 + %i = alloca i32, align 4 + store i32 %a, i32* %a.addr, align 4 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32* %i, align 4 + %cmp = icmp slt i32 %0, 100 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %1 = load i32** @PB, align 4 + %2 = load i32* %i, align 4 + %add.ptr = getelementptr inbounds i32* %1, i32 %2 + %3 = load i32* %add.ptr, align 4 + %4 = load i32* %a.addr, align 4 + %add = add nsw i32 %3, %4 + %5 = load i32** @PA, align 4 + %6 = load i32* %i, align 4 + %add.ptr1 = getelementptr inbounds i32* %5, i32 %6 + store i32 %add, i32* %add.ptr1, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %7 = load i32* %i, align 4 + %inc = add nsw i32 %7, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + %8 = load i32** @PA, align 4 + %9 = load i32* %a.addr, align 4 + %add.ptr2 = getelementptr inbounds i32* %8, i32 %9 + %10 = load i32* %add.ptr2, align 4 + ret i32 %10 +} + +; /// Different objects, positive induction, multi-array +; int noAlias05 (int a) { +; int i, N=10; +; for 
(i=0; i<SIZE; i++) +; Bar.A[N][i] = Bar.B[N][i] + a; +; return Bar.A[N][a]; +; } +; CHECK: define i32 @noAlias05 +; CHECK: add nsw <4 x i32> +; CHECK ret + +define i32 @noAlias05(i32 %a) #0 { +entry: + %a.addr = alloca i32, align 4 + %i = alloca i32, align 4 + %N = alloca i32, align 4 + store i32 %a, i32* %a.addr, align 4 + store i32 10, i32* %N, align 4 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32* %i, align 4 + %cmp = icmp slt i32 %0, 100 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %1 = load i32* %i, align 4 + %2 = load i32* %N, align 4 + %arrayidx = getelementptr inbounds [100 x [100 x i32]]* getelementptr inbounds (%struct.anon.0* @Bar, i32 0, i32 2), i32 0, i32 %2 + %arrayidx1 = getelementptr inbounds [100 x i32]* %arrayidx, i32 0, i32 %1 + %3 = load i32* %arrayidx1, align 4 + %4 = load i32* %a.addr, align 4 + %add = add nsw i32 %3, %4 + %5 = load i32* %i, align 4 + %6 = load i32* %N, align 4 + %arrayidx2 = getelementptr inbounds [100 x [100 x i32]]* getelementptr inbounds (%struct.anon.0* @Bar, i32 0, i32 0), i32 0, i32 %6 + %arrayidx3 = getelementptr inbounds [100 x i32]* %arrayidx2, i32 0, i32 %5 + store i32 %add, i32* %arrayidx3, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %7 = load i32* %i, align 4 + %inc = add nsw i32 %7, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + %8 = load i32* %a.addr, align 4 + %9 = load i32* %N, align 4 + %arrayidx4 = getelementptr inbounds [100 x [100 x i32]]* getelementptr inbounds (%struct.anon.0* @Bar, i32 0, i32 0), i32 0, i32 %9 + %arrayidx5 = getelementptr inbounds [100 x i32]* %arrayidx4, i32 0, i32 %8 + %10 = load i32* %arrayidx5, align 4 + ret i32 %10 +} + +; /// Same objects, positive induction, multi-array, different sub-elements +; int noAlias06 (int a) { +; int i, N=10; +; for (i=0; i<SIZE; i++) +; Bar.A[N][i] = Bar.A[N+1][i] + a; +; return Bar.A[N][a]; +; } +; CHECK: define i32 @noAlias06 +; CHECK: add nsw <4 x i32> +; CHECK ret + +define i32 @noAlias06(i32 %a) #0 { +entry: + %a.addr = alloca i32, align 4 + %i = alloca i32, align 4 + %N = alloca i32, align 4 + store i32 %a, i32* %a.addr, align 4 + store i32 10, i32* %N, align 4 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32* %i, align 4 + %cmp = icmp slt i32 %0, 100 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %1 = load i32* %i, align 4 + %2 = load i32* %N, align 4 + %add = add nsw i32 %2, 1 + %arrayidx = getelementptr inbounds [100 x [100 x i32]]* getelementptr inbounds (%struct.anon.0* @Bar, i32 0, i32 0), i32 0, i32 %add + %arrayidx1 = getelementptr inbounds [100 x i32]* %arrayidx, i32 0, i32 %1 + %3 = load i32* %arrayidx1, align 4 + %4 = load i32* %a.addr, align 4 + %add2 = add nsw i32 %3, %4 + %5 = load i32* %i, align 4 + %6 = load i32* %N, align 4 + %arrayidx3 = getelementptr inbounds [100 x [100 x i32]]* getelementptr inbounds (%struct.anon.0* @Bar, i32 0, i32 0), i32 0, i32 %6 + %arrayidx4 = getelementptr inbounds [100 x i32]* %arrayidx3, i32 0, i32 %5 + store i32 %add2, i32* %arrayidx4, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %7 = load i32* %i, align 4 + %inc = add nsw i32 %7, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + %8 = load i32* %a.addr, align 4 + %9 = load i32* %N, align 4 + %arrayidx5 = getelementptr inbounds [100 x [100 x i32]]* getelementptr 
inbounds (%struct.anon.0* @Bar, i32 0, i32 0), i32 0, i32 %9 + %arrayidx6 = getelementptr inbounds [100 x i32]* %arrayidx5, i32 0, i32 %8 + %10 = load i32* %arrayidx6, align 4 + ret i32 %10 +} + +; /// Different objects, negative induction, constant distance +; int noAlias07 (int a) { +; int i; +; for (i=0; i<SIZE; i++) +; Foo.A[SIZE-i-1] = Foo.B[SIZE-i-1] + a; +; return Foo.A[a]; +; } +; CHECK: define i32 @noAlias07 +; CHECK: sub nsw <4 x i32> +; CHECK ret + +define i32 @noAlias07(i32 %a) #0 { +entry: + %a.addr = alloca i32, align 4 + %i = alloca i32, align 4 + store i32 %a, i32* %a.addr, align 4 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32* %i, align 4 + %cmp = icmp slt i32 %0, 100 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %1 = load i32* %i, align 4 + %sub = sub nsw i32 100, %1 + %sub1 = sub nsw i32 %sub, 1 + %arrayidx = getelementptr inbounds [100 x i32]* getelementptr inbounds (%struct.anon* @Foo, i32 0, i32 2), i32 0, i32 %sub1 + %2 = load i32* %arrayidx, align 4 + %3 = load i32* %a.addr, align 4 + %add = add nsw i32 %2, %3 + %4 = load i32* %i, align 4 + %sub2 = sub nsw i32 100, %4 + %sub3 = sub nsw i32 %sub2, 1 + %arrayidx4 = getelementptr inbounds [100 x i32]* getelementptr inbounds (%struct.anon* @Foo, i32 0, i32 0), i32 0, i32 %sub3 + store i32 %add, i32* %arrayidx4, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %5 = load i32* %i, align 4 + %inc = add nsw i32 %5, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + %6 = load i32* %a.addr, align 4 + %arrayidx5 = getelementptr inbounds [100 x i32]* getelementptr inbounds (%struct.anon* @Foo, i32 0, i32 0), i32 0, i32 %6 + %7 = load i32* %arrayidx5, align 4 + ret i32 %7 +} + +; /// Different objects, negative induction, shortening slide +; int noAlias08 (int a) { +; int i; +; for (i=0; i<SIZE-10; i++) +; Foo.A[SIZE-i-1] = Foo.B[SIZE-i-10] + a; +; return Foo.A[a]; +; } +; CHECK: define i32 @noAlias08 +; CHECK: sub nsw <4 x i32> +; CHECK ret + +define i32 @noAlias08(i32 %a) #0 { +entry: + %a.addr = alloca i32, align 4 + %i = alloca i32, align 4 + store i32 %a, i32* %a.addr, align 4 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32* %i, align 4 + %cmp = icmp slt i32 %0, 90 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %1 = load i32* %i, align 4 + %sub = sub nsw i32 100, %1 + %sub1 = sub nsw i32 %sub, 10 + %arrayidx = getelementptr inbounds [100 x i32]* getelementptr inbounds (%struct.anon* @Foo, i32 0, i32 2), i32 0, i32 %sub1 + %2 = load i32* %arrayidx, align 4 + %3 = load i32* %a.addr, align 4 + %add = add nsw i32 %2, %3 + %4 = load i32* %i, align 4 + %sub2 = sub nsw i32 100, %4 + %sub3 = sub nsw i32 %sub2, 1 + %arrayidx4 = getelementptr inbounds [100 x i32]* getelementptr inbounds (%struct.anon* @Foo, i32 0, i32 0), i32 0, i32 %sub3 + store i32 %add, i32* %arrayidx4, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %5 = load i32* %i, align 4 + %inc = add nsw i32 %5, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + %6 = load i32* %a.addr, align 4 + %arrayidx5 = getelementptr inbounds [100 x i32]* getelementptr inbounds (%struct.anon* @Foo, i32 0, i32 0), i32 0, i32 %6 + %7 = load i32* %arrayidx5, align 4 + ret i32 %7 +} + +; /// Different objects, negative induction, widening slide +; int noAlias09 (int a) { +; int i; +; for (i=0; 
i<SIZE; i++) +; Foo.A[SIZE-i-10] = Foo.B[SIZE-i-1] + a; +; return Foo.A[a]; +; } +; CHECK: define i32 @noAlias09 +; CHECK: sub nsw <4 x i32> +; CHECK ret + +define i32 @noAlias09(i32 %a) #0 { +entry: + %a.addr = alloca i32, align 4 + %i = alloca i32, align 4 + store i32 %a, i32* %a.addr, align 4 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32* %i, align 4 + %cmp = icmp slt i32 %0, 100 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %1 = load i32* %i, align 4 + %sub = sub nsw i32 100, %1 + %sub1 = sub nsw i32 %sub, 1 + %arrayidx = getelementptr inbounds [100 x i32]* getelementptr inbounds (%struct.anon* @Foo, i32 0, i32 2), i32 0, i32 %sub1 + %2 = load i32* %arrayidx, align 4 + %3 = load i32* %a.addr, align 4 + %add = add nsw i32 %2, %3 + %4 = load i32* %i, align 4 + %sub2 = sub nsw i32 100, %4 + %sub3 = sub nsw i32 %sub2, 10 + %arrayidx4 = getelementptr inbounds [100 x i32]* getelementptr inbounds (%struct.anon* @Foo, i32 0, i32 0), i32 0, i32 %sub3 + store i32 %add, i32* %arrayidx4, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %5 = load i32* %i, align 4 + %inc = add nsw i32 %5, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + %6 = load i32* %a.addr, align 4 + %arrayidx5 = getelementptr inbounds [100 x i32]* getelementptr inbounds (%struct.anon* @Foo, i32 0, i32 0), i32 0, i32 %6 + %7 = load i32* %arrayidx5, align 4 + ret i32 %7 +} + +; /// Pointer access, negative stride, run-time check added +; int noAlias10 (int a) { +; int i; +; for (i=0; i<SIZE; i++) +; *(PA+SIZE-i-1) = *(PB+SIZE-i-1) + a; +; return *(PA+a); +; } +; CHECK: define i32 @noAlias10 +; CHECK-NOT: sub nsw <4 x i32> +; CHECK ret +; +; TODO: This test vectorizes (with run-time check) on real targets with -O3) +; Check why it's not being vectorized even when forcing vectorization + +define i32 @noAlias10(i32 %a) #0 { +entry: + %a.addr = alloca i32, align 4 + %i = alloca i32, align 4 + store i32 %a, i32* %a.addr, align 4 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32* %i, align 4 + %cmp = icmp slt i32 %0, 100 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %1 = load i32** @PB, align 4 + %add.ptr = getelementptr inbounds i32* %1, i32 100 + %2 = load i32* %i, align 4 + %idx.neg = sub i32 0, %2 + %add.ptr1 = getelementptr inbounds i32* %add.ptr, i32 %idx.neg + %add.ptr2 = getelementptr inbounds i32* %add.ptr1, i32 -1 + %3 = load i32* %add.ptr2, align 4 + %4 = load i32* %a.addr, align 4 + %add = add nsw i32 %3, %4 + %5 = load i32** @PA, align 4 + %add.ptr3 = getelementptr inbounds i32* %5, i32 100 + %6 = load i32* %i, align 4 + %idx.neg4 = sub i32 0, %6 + %add.ptr5 = getelementptr inbounds i32* %add.ptr3, i32 %idx.neg4 + %add.ptr6 = getelementptr inbounds i32* %add.ptr5, i32 -1 + store i32 %add, i32* %add.ptr6, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %7 = load i32* %i, align 4 + %inc = add nsw i32 %7, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + %8 = load i32** @PA, align 4 + %9 = load i32* %a.addr, align 4 + %add.ptr7 = getelementptr inbounds i32* %8, i32 %9 + %10 = load i32* %add.ptr7, align 4 + ret i32 %10 +} + +; /// Different objects, negative induction, multi-array +; int noAlias11 (int a) { +; int i, N=10; +; for (i=0; i<SIZE; i++) +; Bar.A[N][SIZE-i-1] = Bar.B[N][SIZE-i-1] + a; +; return Bar.A[N][a]; +; } +; 
CHECK: define i32 @noAlias11 +; CHECK: sub nsw <4 x i32> +; CHECK ret + +define i32 @noAlias11(i32 %a) #0 { +entry: + %a.addr = alloca i32, align 4 + %i = alloca i32, align 4 + %N = alloca i32, align 4 + store i32 %a, i32* %a.addr, align 4 + store i32 10, i32* %N, align 4 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32* %i, align 4 + %cmp = icmp slt i32 %0, 100 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %1 = load i32* %i, align 4 + %sub = sub nsw i32 100, %1 + %sub1 = sub nsw i32 %sub, 1 + %2 = load i32* %N, align 4 + %arrayidx = getelementptr inbounds [100 x [100 x i32]]* getelementptr inbounds (%struct.anon.0* @Bar, i32 0, i32 2), i32 0, i32 %2 + %arrayidx2 = getelementptr inbounds [100 x i32]* %arrayidx, i32 0, i32 %sub1 + %3 = load i32* %arrayidx2, align 4 + %4 = load i32* %a.addr, align 4 + %add = add nsw i32 %3, %4 + %5 = load i32* %i, align 4 + %sub3 = sub nsw i32 100, %5 + %sub4 = sub nsw i32 %sub3, 1 + %6 = load i32* %N, align 4 + %arrayidx5 = getelementptr inbounds [100 x [100 x i32]]* getelementptr inbounds (%struct.anon.0* @Bar, i32 0, i32 0), i32 0, i32 %6 + %arrayidx6 = getelementptr inbounds [100 x i32]* %arrayidx5, i32 0, i32 %sub4 + store i32 %add, i32* %arrayidx6, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %7 = load i32* %i, align 4 + %inc = add nsw i32 %7, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + %8 = load i32* %a.addr, align 4 + %9 = load i32* %N, align 4 + %arrayidx7 = getelementptr inbounds [100 x [100 x i32]]* getelementptr inbounds (%struct.anon.0* @Bar, i32 0, i32 0), i32 0, i32 %9 + %arrayidx8 = getelementptr inbounds [100 x i32]* %arrayidx7, i32 0, i32 %8 + %10 = load i32* %arrayidx8, align 4 + ret i32 %10 +} + +; /// Same objects, negative induction, multi-array, different sub-elements +; int noAlias12 (int a) { +; int i, N=10; +; for (i=0; i<SIZE; i++) +; Bar.A[N][SIZE-i-1] = Bar.A[N+1][SIZE-i-1] + a; +; return Bar.A[N][a]; +; } +; CHECK: define i32 @noAlias12 +; CHECK: sub nsw <4 x i32> +; CHECK ret + +define i32 @noAlias12(i32 %a) #0 { +entry: + %a.addr = alloca i32, align 4 + %i = alloca i32, align 4 + %N = alloca i32, align 4 + store i32 %a, i32* %a.addr, align 4 + store i32 10, i32* %N, align 4 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32* %i, align 4 + %cmp = icmp slt i32 %0, 100 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %1 = load i32* %i, align 4 + %sub = sub nsw i32 100, %1 + %sub1 = sub nsw i32 %sub, 1 + %2 = load i32* %N, align 4 + %add = add nsw i32 %2, 1 + %arrayidx = getelementptr inbounds [100 x [100 x i32]]* getelementptr inbounds (%struct.anon.0* @Bar, i32 0, i32 0), i32 0, i32 %add + %arrayidx2 = getelementptr inbounds [100 x i32]* %arrayidx, i32 0, i32 %sub1 + %3 = load i32* %arrayidx2, align 4 + %4 = load i32* %a.addr, align 4 + %add3 = add nsw i32 %3, %4 + %5 = load i32* %i, align 4 + %sub4 = sub nsw i32 100, %5 + %sub5 = sub nsw i32 %sub4, 1 + %6 = load i32* %N, align 4 + %arrayidx6 = getelementptr inbounds [100 x [100 x i32]]* getelementptr inbounds (%struct.anon.0* @Bar, i32 0, i32 0), i32 0, i32 %6 + %arrayidx7 = getelementptr inbounds [100 x i32]* %arrayidx6, i32 0, i32 %sub5 + store i32 %add3, i32* %arrayidx7, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %7 = load i32* %i, align 4 + %inc = add nsw i32 %7, 1 + store i32 %inc, i32* %i, align 4 + br label 
%for.cond + +for.end: ; preds = %for.cond + %8 = load i32* %a.addr, align 4 + %9 = load i32* %N, align 4 + %arrayidx8 = getelementptr inbounds [100 x [100 x i32]]* getelementptr inbounds (%struct.anon.0* @Bar, i32 0, i32 0), i32 0, i32 %9 + %arrayidx9 = getelementptr inbounds [100 x i32]* %arrayidx8, i32 0, i32 %8 + %10 = load i32* %arrayidx9, align 4 + ret i32 %10 +} + +; /// Same objects, positive induction, constant distance, just enough for vector size +; int noAlias13 (int a) { +; int i; +; for (i=0; i<SIZE; i++) +; Foo.A[i] = Foo.A[i+4] + a; +; return Foo.A[a]; +; } +; CHECK: define i32 @noAlias13 +; CHECK: add nsw <4 x i32> +; CHECK ret + +define i32 @noAlias13(i32 %a) #0 { +entry: + %a.addr = alloca i32, align 4 + %i = alloca i32, align 4 + store i32 %a, i32* %a.addr, align 4 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32* %i, align 4 + %cmp = icmp slt i32 %0, 100 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %1 = load i32* %i, align 4 + %add = add nsw i32 %1, 4 + %arrayidx = getelementptr inbounds [100 x i32]* getelementptr inbounds (%struct.anon* @Foo, i32 0, i32 0), i32 0, i32 %add + %2 = load i32* %arrayidx, align 4 + %3 = load i32* %a.addr, align 4 + %add1 = add nsw i32 %2, %3 + %4 = load i32* %i, align 4 + %arrayidx2 = getelementptr inbounds [100 x i32]* getelementptr inbounds (%struct.anon* @Foo, i32 0, i32 0), i32 0, i32 %4 + store i32 %add1, i32* %arrayidx2, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %5 = load i32* %i, align 4 + %inc = add nsw i32 %5, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + %6 = load i32* %a.addr, align 4 + %arrayidx3 = getelementptr inbounds [100 x i32]* getelementptr inbounds (%struct.anon* @Foo, i32 0, i32 0), i32 0, i32 %6 + %7 = load i32* %arrayidx3, align 4 + ret i32 %7 +} + +; /// Same objects, negative induction, constant distance, just enough for vector size +; int noAlias14 (int a) { +; int i; +; for (i=0; i<SIZE; i++) +; Foo.A[SIZE-i-1] = Foo.A[SIZE-i-5] + a; +; return Foo.A[a]; +; } +; CHECK: define i32 @noAlias14 +; CHECK: sub nsw <4 x i32> +; CHECK ret + +define i32 @noAlias14(i32 %a) #0 { +entry: + %a.addr = alloca i32, align 4 + %i = alloca i32, align 4 + store i32 %a, i32* %a.addr, align 4 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32* %i, align 4 + %cmp = icmp slt i32 %0, 100 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %1 = load i32* %i, align 4 + %sub = sub nsw i32 100, %1 + %sub1 = sub nsw i32 %sub, 5 + %arrayidx = getelementptr inbounds [100 x i32]* getelementptr inbounds (%struct.anon* @Foo, i32 0, i32 0), i32 0, i32 %sub1 + %2 = load i32* %arrayidx, align 4 + %3 = load i32* %a.addr, align 4 + %add = add nsw i32 %2, %3 + %4 = load i32* %i, align 4 + %sub2 = sub nsw i32 100, %4 + %sub3 = sub nsw i32 %sub2, 1 + %arrayidx4 = getelementptr inbounds [100 x i32]* getelementptr inbounds (%struct.anon* @Foo, i32 0, i32 0), i32 0, i32 %sub3 + store i32 %add, i32* %arrayidx4, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %5 = load i32* %i, align 4 + %inc = add nsw i32 %5, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + %6 = load i32* %a.addr, align 4 + %arrayidx5 = getelementptr inbounds [100 x i32]* getelementptr inbounds (%struct.anon* @Foo, i32 0, i32 0), i32 0, i32 %6 + %7 = load i32* %arrayidx5, align 4 + ret i32 %7 
+}
+
+
+;; === Now, the tests that we could vectorize with induction changes or run-time checks ===
+
+
+; /// Different objects, swapped induction, alias at the end
+; int mayAlias01 (int a) {
+;   int i;
+;   for (i=0; i<SIZE; i++)
+;     Foo.A[i] = Foo.B[SIZE-i-1] + a;
+;   return Foo.A[a];
+; }
+; CHECK: define i32 @mayAlias01
+; CHECK-NOT: add nsw <4 x i32>
+; CHECK: ret
+
+define i32 @mayAlias01(i32 %a) nounwind {
+entry:
+  %a.addr = alloca i32, align 4
+  %i = alloca i32, align 4
+  store i32 %a, i32* %a.addr, align 4
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32* %i, align 4
+  %cmp = icmp slt i32 %0, 100
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %1 = load i32* %i, align 4
+  %sub = sub nsw i32 100, %1
+  %sub1 = sub nsw i32 %sub, 1
+  %arrayidx = getelementptr inbounds [100 x i32]* getelementptr inbounds (%struct.anon* @Foo, i32 0, i32 2), i32 0, i32 %sub1
+  %2 = load i32* %arrayidx, align 4
+  %3 = load i32* %a.addr, align 4
+  %add = add nsw i32 %2, %3
+  %4 = load i32* %i, align 4
+  %arrayidx2 = getelementptr inbounds [100 x i32]* getelementptr inbounds (%struct.anon* @Foo, i32 0, i32 0), i32 0, i32 %4
+  store i32 %add, i32* %arrayidx2, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %5 = load i32* %i, align 4
+  %inc = add nsw i32 %5, 1
+  store i32 %inc, i32* %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  %6 = load i32* %a.addr, align 4
+  %arrayidx3 = getelementptr inbounds [100 x i32]* getelementptr inbounds (%struct.anon* @Foo, i32 0, i32 0), i32 0, i32 %6
+  %7 = load i32* %arrayidx3, align 4
+  ret i32 %7
+}
+
+; /// Different objects, swapped induction, alias at the beginning
+; int mayAlias02 (int a) {
+;   int i;
+;   for (i=0; i<SIZE; i++)
+;     Foo.A[SIZE-i-1] = Foo.B[i] + a;
+;   return Foo.A[a];
+; }
+; CHECK: define i32 @mayAlias02
+; CHECK-NOT: add nsw <4 x i32>
+; CHECK: ret
+
+define i32 @mayAlias02(i32 %a) nounwind {
+entry:
+  %a.addr = alloca i32, align 4
+  %i = alloca i32, align 4
+  store i32 %a, i32* %a.addr, align 4
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32* %i, align 4
+  %cmp = icmp slt i32 %0, 100
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %1 = load i32* %i, align 4
+  %arrayidx = getelementptr inbounds [100 x i32]* getelementptr inbounds (%struct.anon* @Foo, i32 0, i32 2), i32 0, i32 %1
+  %2 = load i32* %arrayidx, align 4
+  %3 = load i32* %a.addr, align 4
+  %add = add nsw i32 %2, %3
+  %4 = load i32* %i, align 4
+  %sub = sub nsw i32 100, %4
+  %sub1 = sub nsw i32 %sub, 1
+  %arrayidx2 = getelementptr inbounds [100 x i32]* getelementptr inbounds (%struct.anon* @Foo, i32 0, i32 0), i32 0, i32 %sub1
+  store i32 %add, i32* %arrayidx2, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %5 = load i32* %i, align 4
+  %inc = add nsw i32 %5, 1
+  store i32 %inc, i32* %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  %6 = load i32* %a.addr, align 4
+  %arrayidx3 = getelementptr inbounds [100 x i32]* getelementptr inbounds (%struct.anon* @Foo, i32 0, i32 0), i32 0, i32 %6
+  %7 = load i32* %arrayidx3, align 4
+  ret i32 %7
+}
+
+; /// Pointer access, run-time check added
+; int mayAlias03 (int a) {
+;   int i;
+;   for (i=0; i<SIZE; i++)
+;     *(PA+i) = *(PB+SIZE-i-1) + a;
+;   return *(PA+a);
+; }
+; CHECK: define i32 @mayAlias03
+; CHECK-NOT: add nsw <4 x i32>
+; CHECK: ret
+
+define i32 @mayAlias03(i32 %a) nounwind {
+entry:
+  %a.addr = alloca i32, align 4
+  %i = alloca i32, align 4
+  store i32 %a, i32* %a.addr, align 4
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32* %i, align 4
+  %cmp = icmp slt i32 %0, 100
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %1 = load i32** @PB, align 4
+  %add.ptr = getelementptr inbounds i32* %1, i32 100
+  %2 = load i32* %i, align 4
+  %idx.neg = sub i32 0, %2
+  %add.ptr1 = getelementptr inbounds i32* %add.ptr, i32 %idx.neg
+  %add.ptr2 = getelementptr inbounds i32* %add.ptr1, i32 -1
+  %3 = load i32* %add.ptr2, align 4
+  %4 = load i32* %a.addr, align 4
+  %add = add nsw i32 %3, %4
+  %5 = load i32** @PA, align 4
+  %6 = load i32* %i, align 4
+  %add.ptr3 = getelementptr inbounds i32* %5, i32 %6
+  store i32 %add, i32* %add.ptr3, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %7 = load i32* %i, align 4
+  %inc = add nsw i32 %7, 1
+  store i32 %inc, i32* %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  %8 = load i32** @PA, align 4
+  %9 = load i32* %a.addr, align 4
+  %add.ptr4 = getelementptr inbounds i32* %8, i32 %9
+  %10 = load i32* %add.ptr4, align 4
+  ret i32 %10
+}
+
+
+;; === Finally, the tests that should only vectorize with care (or only if we ignore undefined behaviour altogether) ===
+
+
+; int mustAlias01 (int a) {
+;   int i;
+;   for (i=0; i<SIZE; i++)
+;     Foo.A[i+10] = Foo.B[SIZE-i-1] + a;
+;   return Foo.A[a];
+; }
+; CHECK: define i32 @mustAlias01
+; CHECK-NOT: add nsw <4 x i32>
+; CHECK: ret
+
+define i32 @mustAlias01(i32 %a) nounwind {
+entry:
+  %a.addr = alloca i32, align 4
+  %i = alloca i32, align 4
+  store i32 %a, i32* %a.addr, align 4
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32* %i, align 4
+  %cmp = icmp slt i32 %0, 100
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %1 = load i32* %i, align 4
+  %sub = sub nsw i32 100, %1
+  %sub1 = sub nsw i32 %sub, 1
+  %arrayidx = getelementptr inbounds [100 x i32]* getelementptr inbounds (%struct.anon* @Foo, i32 0, i32 2), i32 0, i32 %sub1
+  %2 = load i32* %arrayidx, align 4
+  %3 = load i32* %a.addr, align 4
+  %add = add nsw i32 %2, %3
+  %4 = load i32* %i, align 4
+  %add2 = add nsw i32 %4, 10
+  %arrayidx3 = getelementptr inbounds [100 x i32]* getelementptr inbounds (%struct.anon* @Foo, i32 0, i32 0), i32 0, i32 %add2
+  store i32 %add, i32* %arrayidx3, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %5 = load i32* %i, align 4
+  %inc = add nsw i32 %5, 1
+  store i32 %inc, i32* %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  %6 = load i32* %a.addr, align 4
+  %arrayidx4 = getelementptr inbounds [100 x i32]* getelementptr inbounds (%struct.anon* @Foo, i32 0, i32 0), i32 0, i32 %6
+  %7 = load i32* %arrayidx4, align 4
+  ret i32 %7
+}
+
+; int mustAlias02 (int a) {
+;   int i;
+;   for (i=0; i<SIZE; i++)
+;     Foo.A[i] = Foo.B[SIZE-i-10] + a;
+;   return Foo.A[a];
+; }
+; CHECK: define i32 @mustAlias02
+; CHECK-NOT: add nsw <4 x i32>
+; CHECK: ret
+
+define i32 @mustAlias02(i32 %a) nounwind {
+entry:
+  %a.addr = alloca i32, align 4
+  %i = alloca i32, align 4
+  store i32 %a, i32* %a.addr, align 4
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32* %i, align 4
+  %cmp = icmp slt i32 %0, 100
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %1 = load i32* %i, align 4
+  %sub = sub nsw i32 100, %1
+  %sub1 = sub nsw i32 %sub, 10
+  %arrayidx = getelementptr inbounds [100 x i32]* getelementptr inbounds (%struct.anon* @Foo, i32 0, i32 2), i32 0, i32 %sub1
+  %2 = load i32* %arrayidx, align 4
+  %3 = load i32* %a.addr, align 4
+  %add = add nsw i32 %2, %3
+  %4 = load i32* %i, align 4
+  %arrayidx2 = getelementptr inbounds [100 x i32]* getelementptr inbounds (%struct.anon* @Foo, i32 0, i32 0), i32 0, i32 %4
+  store i32 %add, i32* %arrayidx2, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %5 = load i32* %i, align 4
+  %inc = add nsw i32 %5, 1
+  store i32 %inc, i32* %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  %6 = load i32* %a.addr, align 4
+  %arrayidx3 = getelementptr inbounds [100 x i32]* getelementptr inbounds (%struct.anon* @Foo, i32 0, i32 0), i32 0, i32 %6
+  %7 = load i32* %arrayidx3, align 4
+  ret i32 %7
+}
+
+; int mustAlias03 (int a) {
+;   int i;
+;   for (i=0; i<SIZE; i++)
+;     Foo.A[i+10] = Foo.B[SIZE-i-10] + a;
+;   return Foo.A[a];
+; }
+; CHECK: define i32 @mustAlias03
+; CHECK-NOT: add nsw <4 x i32>
+; CHECK: ret
+
+define i32 @mustAlias03(i32 %a) nounwind {
+entry:
+  %a.addr = alloca i32, align 4
+  %i = alloca i32, align 4
+  store i32 %a, i32* %a.addr, align 4
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32* %i, align 4
+  %cmp = icmp slt i32 %0, 100
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %1 = load i32* %i, align 4
+  %sub = sub nsw i32 100, %1
+  %sub1 = sub nsw i32 %sub, 10
+  %arrayidx = getelementptr inbounds [100 x i32]* getelementptr inbounds (%struct.anon* @Foo, i32 0, i32 2), i32 0, i32 %sub1
+  %2 = load i32* %arrayidx, align 4
+  %3 = load i32* %a.addr, align 4
+  %add = add nsw i32 %2, %3
+  %4 = load i32* %i, align 4
+  %add2 = add nsw i32 %4, 10
+  %arrayidx3 = getelementptr inbounds [100 x i32]* getelementptr inbounds (%struct.anon* @Foo, i32 0, i32 0), i32 0, i32 %add2
+  store i32 %add, i32* %arrayidx3, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %5 = load i32* %i, align 4
+  %inc = add nsw i32 %5, 1
+  store i32 %inc, i32* %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  %6 = load i32* %a.addr, align 4
+  %arrayidx4 = getelementptr inbounds [100 x i32]* getelementptr inbounds (%struct.anon* @Foo, i32 0, i32 0), i32 0, i32 %6
+  %7 = load i32* %arrayidx4, align 4
+  ret i32 %7
+}
diff --git a/test/Transforms/LoopVectorize/i8-induction.ll b/test/Transforms/LoopVectorize/i8-induction.ll
new file mode 100644
index 0000000..7759b70
--- /dev/null
+++ b/test/Transforms/LoopVectorize/i8-induction.ll
@@ -0,0 +1,35 @@
+; RUN: opt < %s -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+@a = common global i8 0, align 1
+@b = common global i8 0, align 1
+
+define void @f() nounwind uwtable ssp {
+scalar.ph:
+  store i8 0, i8* inttoptr (i64 1 to i8*), align 1, !tbaa !0
+  %0 = load i8* @a, align 1, !tbaa !0
+  br label %for.body
+
+for.body:
+  %mul16 = phi i8 [ 0, %scalar.ph ], [ %mul, %for.body ]   ; <------- i8 induction var.
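+; (Illustrative only, not checked by FileCheck: widening this recurrence
+; would have to produce a vector multiply along the lines of
+;   %vec.mul = mul <4 x i8> %vec.phi, %broadcast
+; where %vec.phi and %broadcast are hypothetical names; the RUN line above
+; only verifies that opt completes without crashing on the narrow i8 type.)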
+ %c.015 = phi i8 [ undef, %scalar.ph ], [ %conv8, %for.body ] + %conv2 = sext i8 %c.015 to i32 + %tobool = icmp ne i8 %c.015, 0 + %.sink = select i1 %tobool, i8 %c.015, i8 %0 + %mul = mul i8 %mul16, %.sink + %add = add nsw i32 %conv2, 1 + %conv8 = trunc i32 %add to i8 + %sext = shl i32 %add, 24 + %phitmp14 = icmp slt i32 %sext, 268435456 + br i1 %phitmp14, label %for.body, label %for.end + +for.end: ; preds = %for.body + store i8 %mul, i8* @b, align 1, !tbaa !0 + ret void +} + +!0 = metadata !{metadata !"omnipotent char", metadata !1} +!1 = metadata !{metadata !"Simple C/C++ TBAA"} + diff --git a/test/Transforms/LoopVectorize/if-conversion-reduction.ll b/test/Transforms/LoopVectorize/if-conversion-reduction.ll index c6dc5d7..3a2d82e 100644 --- a/test/Transforms/LoopVectorize/if-conversion-reduction.ll +++ b/test/Transforms/LoopVectorize/if-conversion-reduction.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -enable-if-conversion -dce -instcombine -licm -S | FileCheck %s +; RUN: opt < %s -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -enable-if-conversion -dce -instcombine -S | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.9.0" diff --git a/test/Transforms/LoopVectorize/if-conversion.ll b/test/Transforms/LoopVectorize/if-conversion.ll index 28407dc..6e7c03a 100644 --- a/test/Transforms/LoopVectorize/if-conversion.ll +++ b/test/Transforms/LoopVectorize/if-conversion.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -enable-if-conversion -dce -instcombine -licm -S | FileCheck %s +; RUN: opt < %s -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -enable-if-conversion -dce -instcombine -S | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.9.0" diff --git a/test/Transforms/LoopVectorize/increment.ll b/test/Transforms/LoopVectorize/increment.ll index e24fb39..3fa6b19 100644 --- a/test/Transforms/LoopVectorize/increment.ll +++ b/test/Transforms/LoopVectorize/increment.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s +; RUN: opt < %s -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.8.0" diff --git a/test/Transforms/LoopVectorize/intrinsic.ll b/test/Transforms/LoopVectorize/intrinsic.ll index 49c8ecb..7d5a5d7 100644 --- a/test/Transforms/LoopVectorize/intrinsic.ll +++ b/test/Transforms/LoopVectorize/intrinsic.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s +; RUN: opt < %s -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git 
a/test/Transforms/LoopVectorize/no_int_induction.ll b/test/Transforms/LoopVectorize/no_int_induction.ll
index 6eab799..45aa8c7 100644
--- a/test/Transforms/LoopVectorize/no_int_induction.ll
+++ b/test/Transforms/LoopVectorize/no_int_induction.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s
+; RUN: opt < %s -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s

 ; int __attribute__((noinline)) sum_array(int *A, int n) {
 ;   return std::accumulate(A, A + n, 0);
diff --git a/test/Transforms/LoopVectorize/nofloat.ll b/test/Transforms/LoopVectorize/nofloat.ll
index dbdec33..de23bf0 100644
--- a/test/Transforms/LoopVectorize/nofloat.ll
+++ b/test/Transforms/LoopVectorize/nofloat.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s
+; RUN: opt < %s -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s

 ; Make sure that we don't vectorize functions with 'noimplicitfloat' attributes.
diff --git a/test/Transforms/LoopVectorize/non-const-n.ll b/test/Transforms/LoopVectorize/non-const-n.ll
index 7e4cee4..8262a18 100644
--- a/test/Transforms/LoopVectorize/non-const-n.ll
+++ b/test/Transforms/LoopVectorize/non-const-n.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s
+; RUN: opt < %s -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s

 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"
diff --git a/test/Transforms/LoopVectorize/nsw-crash.ll b/test/Transforms/LoopVectorize/nsw-crash.ll
new file mode 100644
index 0000000..e5fad14
--- /dev/null
+++ b/test/Transforms/LoopVectorize/nsw-crash.ll
@@ -0,0 +1,25 @@
+; RUN: opt < %s -loop-vectorize -force-vector-unroll=1 -force-vector-width=4
+
+target datalayout =
+"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.7.0"
+
+define void @test() {
+entry:
+  br i1 undef, label %while.end, label %while.body.lr.ph
+
+while.body.lr.ph:
+  br label %while.body
+
+while.body:
+  %it.sroa.0.091 = phi i32* [ undef, %while.body.lr.ph ], [ %incdec.ptr.i, %while.body ]
+  %incdec.ptr.i = getelementptr inbounds i32* %it.sroa.0.091, i64 1
+  %inc32 = add i32 undef, 1   ; <------------- Make sure we don't set the NSW flag on this add of undef.
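+; (Illustrative sketch of the hazard, using hypothetical value names: if the
+; widened increment were emitted as
+;   %inc32.vec = add nsw <4 x i32> %vec.phi, <i32 1, i32 1, i32 1, i32 1>
+; the undef starting value could make the no-signed-wrap assumption unsound,
+; so the vectorizer must drop the flag here.)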
+ %cmp.i11 = icmp eq i32* %incdec.ptr.i, undef + br i1 %cmp.i11, label %while.end, label %while.body + +while.end: + ret void +} + + diff --git a/test/Transforms/LoopVectorize/ptr_loops.ll b/test/Transforms/LoopVectorize/ptr_loops.ll new file mode 100644 index 0000000..25599f8 --- /dev/null +++ b/test/Transforms/LoopVectorize/ptr_loops.ll @@ -0,0 +1,74 @@ +; RUN: opt < %s -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S -enable-if-conversion | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +@A = global [36 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35], align 16 +@B = global [36 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35], align 16 + +;CHECK:_Z5test1v +;CHECK: load <4 x i32> +;CHECK: shufflevector <4 x i32> +;CHECK: store <4 x i32> +;CHECK: ret +define i32 @_Z5test1v() nounwind uwtable ssp { + br label %1 + +; <label>:1 ; preds = %0, %1 + %p.02 = phi i32* [ getelementptr inbounds ([36 x i32]* @A, i64 0, i64 18), %0 ], [ %4, %1 ] + %b.01 = phi i32* [ getelementptr inbounds ([36 x i32]* @B, i64 0, i64 0), %0 ], [ %5, %1 ] + %2 = load i32* %b.01, align 4 + %3 = shl nsw i32 %2, 1 + store i32 %3, i32* %p.02, align 4 + %4 = getelementptr inbounds i32* %p.02, i64 -1 + %5 = getelementptr inbounds i32* %b.01, i64 1 + %6 = icmp eq i32* %4, getelementptr ([36 x i32]* @A, i64 128102389400760775, i64 3) + br i1 %6, label %7, label %1 + +; <label>:7 ; preds = %1 + ret i32 0 +} + +;CHECK:_Z5test2v +;CHECK: load <4 x i32> +;CHECK: shufflevector <4 x i32> +;CHECK: store <4 x i32> +;CHECK: ret +define i32 @_Z5test2v() nounwind uwtable ssp { + br label %1 + +; <label>:1 ; preds = %0, %1 + %p.02 = phi i32* [ getelementptr inbounds ([36 x i32]* @A, i64 0, i64 25), %0 ], [ %3, %1 ] + %b.01 = phi i32* [ getelementptr inbounds ([36 x i32]* @B, i64 0, i64 2), %0 ], [ %4, %1 ] + %2 = load i32* %b.01, align 4 + store i32 %2, i32* %p.02, align 4 + %3 = getelementptr inbounds i32* %p.02, i64 -1 + %4 = getelementptr inbounds i32* %b.01, i64 1 + %5 = icmp eq i32* %4, getelementptr inbounds ([36 x i32]* @A, i64 0, i64 18) + br i1 %5, label %6, label %1 + +; <label>:6 ; preds = %1 + ret i32 0 +} + +;CHECK:_Z5test3v +;CHECK: load <4 x i32> +;CHECK: shufflevector <4 x i32> +;CHECK: store <4 x i32> +;CHECK: ret +define i32 @_Z5test3v() nounwind uwtable ssp { + br label %1 + +; <label>:1 ; preds = %0, %1 + %p.02 = phi i32* [ getelementptr inbounds ([36 x i32]* @A, i64 0, i64 29), %0 ], [ %3, %1 ] + %b.01 = phi i32* [ getelementptr inbounds ([36 x i32]* @B, i64 0, i64 5), %0 ], [ %4, %1 ] + %2 = load i32* %b.01, align 4 + store i32 %2, i32* %p.02, align 4 + %3 = getelementptr inbounds i32* %p.02, i64 -1 + %4 = getelementptr inbounds i32* %b.01, i64 1 + %5 = icmp eq i32* %3, getelementptr ([36 x i32]* @A, i64 128102389400760775, i64 3) + br i1 %5, label %6, label %1 + +; <label>:6 ; preds = %1 + ret i32 0 +} diff --git 
a/test/Transforms/LoopVectorize/read-only.ll b/test/Transforms/LoopVectorize/read-only.ll index c3c9035..bfaa6d4 100644 --- a/test/Transforms/LoopVectorize/read-only.ll +++ b/test/Transforms/LoopVectorize/read-only.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s +; RUN: opt < %s -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.8.0" diff --git a/test/Transforms/LoopVectorize/reduction.ll b/test/Transforms/LoopVectorize/reduction.ll index 129c20d..08b7b27 100644 --- a/test/Transforms/LoopVectorize/reduction.ll +++ b/test/Transforms/LoopVectorize/reduction.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s +; RUN: opt < %s -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.8.0" diff --git a/test/Transforms/LoopVectorize/runtime-check.ll b/test/Transforms/LoopVectorize/runtime-check.ll index 2852684..86098a6 100644 --- a/test/Transforms/LoopVectorize/runtime-check.ll +++ b/test/Transforms/LoopVectorize/runtime-check.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s +; RUN: opt < %s -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.9.0" @@ -9,6 +9,10 @@ target triple = "x86_64-apple-macosx10.9.0" ; a[i] = b[i] * 3; ; } +;CHECK: for.body.preheader: +;CHECK: br i1 %cmp.zero, label %middle.block, label %vector.memcheck +;CHECK: vector.memcheck: +;CHECK: br i1 %found.conflict, label %middle.block, label %vector.ph ;CHECK: load <4 x float> define i32 @foo(float* nocapture %a, float* nocapture %b, i32 %n) nounwind uwtable ssp { entry: diff --git a/test/Transforms/LoopVectorize/same-base-access.ll b/test/Transforms/LoopVectorize/same-base-access.ll index 2a1f19d..1573893 100644 --- a/test/Transforms/LoopVectorize/same-base-access.ll +++ b/test/Transforms/LoopVectorize/same-base-access.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -licm -S -enable-if-conversion | FileCheck %s +; RUN: opt < %s -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S -enable-if-conversion | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.9.0" diff --git a/test/Transforms/LoopVectorize/scalar-select.ll b/test/Transforms/LoopVectorize/scalar-select.ll index d72cd14..7a14d24 100644 --- a/test/Transforms/LoopVectorize/scalar-select.ll +++ b/test/Transforms/LoopVectorize/scalar-select.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -loop-vectorize 
-force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s +; RUN: opt < %s -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.8.0" diff --git a/test/Transforms/LoopVectorize/simple-unroll.ll b/test/Transforms/LoopVectorize/simple-unroll.ll index 9825764..7e2dd5f 100644 --- a/test/Transforms/LoopVectorize/simple-unroll.ll +++ b/test/Transforms/LoopVectorize/simple-unroll.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-unroll=2 -dce -instcombine -licm -S | FileCheck %s +; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-unroll=2 -dce -instcombine -S | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.8.0" diff --git a/test/Transforms/LoopVectorize/small-loop.ll b/test/Transforms/LoopVectorize/small-loop.ll index ae784b3..fa83dba 100644 --- a/test/Transforms/LoopVectorize/small-loop.ll +++ b/test/Transforms/LoopVectorize/small-loop.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s +; RUN: opt < %s -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.8.0" diff --git a/test/Transforms/LoopVectorize/struct_access.ll b/test/Transforms/LoopVectorize/struct_access.ll new file mode 100644 index 0000000..de65d0d --- /dev/null +++ b/test/Transforms/LoopVectorize/struct_access.ll @@ -0,0 +1,50 @@ +; RUN: opt < %s -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -instcombine -S | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.9.0" + +%struct.coordinate = type { i32, i32 } + +; Make sure that we don't generate a wide load when accessing the struct. 
+; struct coordinate { +; int x; +; int y; +; }; +; +; +; int foo(struct coordinate *A, int n) { +; +; int sum = 0; +; for (int i = 0; i < n; ++i) +; sum += A[i].x; +; +; return sum; +; } + +;CHECK: @foo +;CHECK-NOT: load <4 x i32> +;CHECK: ret +define i32 @foo(%struct.coordinate* nocapture %A, i32 %n) nounwind uwtable readonly ssp { +entry: + %cmp4 = icmp sgt i32 %n, 0 + br i1 %cmp4, label %for.body, label %for.end + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %sum.05 = phi i32 [ %add, %for.body ], [ 0, %entry ] + %x = getelementptr inbounds %struct.coordinate* %A, i64 %indvars.iv, i32 0 + %0 = load i32* %x, align 4, !tbaa !0 + %add = add nsw i32 %0, %sum.05 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ] + ret i32 %sum.0.lcssa +} + +!0 = metadata !{metadata !"int", metadata !1} +!1 = metadata !{metadata !"omnipotent char", metadata !2} +!2 = metadata !{metadata !"Simple C/C++ TBAA"} diff --git a/test/Transforms/LoopVectorize/write-only.ll b/test/Transforms/LoopVectorize/write-only.ll index b42122b..54cbe8d 100644 --- a/test/Transforms/LoopVectorize/write-only.ll +++ b/test/Transforms/LoopVectorize/write-only.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s +; RUN: opt < %s -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.8.0" diff --git a/test/Transforms/MemCpyOpt/memcpy.ll b/test/Transforms/MemCpyOpt/memcpy.ll index 3fa1628..582a57b 100644 --- a/test/Transforms/MemCpyOpt/memcpy.ll +++ b/test/Transforms/MemCpyOpt/memcpy.ll @@ -70,20 +70,20 @@ define void @test4(i8 *%P) { %A = alloca %1 %a = bitcast %1* %A to i8* call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a, i8* %P, i64 8, i32 4, i1 false) - call void @test4a(i8* byval align 1 %a) + call void @test4a(i8* align 1 byval %a) ret void ; CHECK: @test4 ; CHECK-NEXT: call void @test4a( } -declare void @test4a(i8* byval align 1) +declare void @test4a(i8* align 1 byval) declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind %struct.S = type { i128, [4 x i8]} @sS = external global %struct.S, align 16 -declare void @test5a(%struct.S* byval align 16) nounwind ssp +declare void @test5a(%struct.S* align 16 byval) nounwind ssp ; rdar://8713376 - This memcpy can't be eliminated. @@ -94,7 +94,7 @@ entry: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %tmp, i8* bitcast (%struct.S* @sS to i8*), i64 32, i32 16, i1 false) %a = getelementptr %struct.S* %y, i64 0, i32 1, i64 0 store i8 4, i8* %a - call void @test5a(%struct.S* byval align 16 %y) + call void @test5a(%struct.S* align 16 byval %y) ret i32 0 ; CHECK: @test5( ; CHECK: store i8 4 @@ -114,19 +114,19 @@ define void @test6(i8 *%P) { ; isn't itself 8 byte aligned. 
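; (Illustrative, hypothetical shape of the byval forwarding exercised below:
; given
;   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %tmp, i8* %src, i64 48, i32 4, i1 false)
;   call i32 @g(%struct.p* byval align 8 %agg.tmp)
; the temporary may be bypassed and %src passed directly only when %src
; satisfies the byval alignment requirement.)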
%struct.p = type { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } -define i32 @test7(%struct.p* nocapture byval align 8 %q) nounwind ssp { +define i32 @test7(%struct.p* nocapture align 8 byval %q) nounwind ssp { entry: %agg.tmp = alloca %struct.p, align 4 %tmp = bitcast %struct.p* %agg.tmp to i8* %tmp1 = bitcast %struct.p* %q to i8* call void @llvm.memcpy.p0i8.p0i8.i64(i8* %tmp, i8* %tmp1, i64 48, i32 4, i1 false) - %call = call i32 @g(%struct.p* byval align 8 %agg.tmp) nounwind + %call = call i32 @g(%struct.p* align 8 byval %agg.tmp) nounwind ret i32 %call ; CHECK: @test7 -; CHECK: call i32 @g(%struct.p* byval align 8 %q) nounwind +; CHECK: call i32 @g(%struct.p* byval align 8 %q) [[NUW:#[0-9]+]] } -declare i32 @g(%struct.p* byval align 8) +declare i32 @g(%struct.p* align 8 byval) declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind @@ -152,7 +152,7 @@ declare noalias i8* @malloc(i32) ; rdar://11341081 %struct.big = type { [50 x i32] } -define void @test9() nounwind uwtable ssp { +define void @test9() nounwind ssp uwtable { entry: ; CHECK: test9 ; CHECK: f1 @@ -170,3 +170,7 @@ entry: declare void @f1(%struct.big* sret) declare void @f2(%struct.big*) + +; CHECK: attributes [[NUW]] = { nounwind } +; CHECK: attributes #1 = { nounwind ssp } +; CHECK: attributes #2 = { nounwind ssp uwtable } diff --git a/test/Transforms/MergeFunc/2013-01-10-MergeFuncAssert.ll b/test/Transforms/MergeFunc/2013-01-10-MergeFuncAssert.ll new file mode 100644 index 0000000..3f6a5ba --- /dev/null +++ b/test/Transforms/MergeFunc/2013-01-10-MergeFuncAssert.ll @@ -0,0 +1,36 @@ +; RUN: opt -mergefunc -disable-output < %s +; This used to trigger a ConstantExpr::getBitCast assertion. + +define void @t1() unnamed_addr uwtable ssp align 2 { +entry: + switch i32 undef, label %sw.bb12 [ + i32 127, label %sw.bb + i32 126, label %sw.bb4 + ] + +sw.bb: ; preds = %entry + unreachable + +sw.bb4: ; preds = %entry + unreachable + +sw.bb12: ; preds = %entry + ret void +} + +define void @t2() unnamed_addr uwtable ssp align 2 { +entry: + switch i32 undef, label %sw.bb8 [ + i32 4, label %sw.bb + i32 3, label %sw.bb4 + ] + +sw.bb: ; preds = %entry + unreachable + +sw.bb4: ; preds = %entry + ret void + +sw.bb8: ; preds = %entry + unreachable +} diff --git a/test/Transforms/ObjCARC/apelim.ll b/test/Transforms/ObjCARC/apelim.ll index 8c7b5b1..4541b3f 100644 --- a/test/Transforms/ObjCARC/apelim.ll +++ b/test/Transforms/ObjCARC/apelim.ll @@ -38,8 +38,8 @@ entry: } ; CHECK: define internal void @_GLOBAL__I_y() -; CHECK: %0 = call i8* @objc_autoreleasePoolPush() nounwind -; CHECK: call void @objc_autoreleasePoolPop(i8* %0) nounwind +; CHECK: %0 = call i8* @objc_autoreleasePoolPush() [[NUW:#[0-9]+]] +; CHECK: call void @objc_autoreleasePoolPop(i8* %0) [[NUW]] ; CHECK: } define internal void @_GLOBAL__I_y() { entry: @@ -51,3 +51,5 @@ entry: declare i8* @objc_autoreleasePoolPush() declare void @objc_autoreleasePoolPop(i8*) + +; CHECK: attributes #0 = { nounwind } diff --git a/test/Transforms/ObjCARC/basic.ll b/test/Transforms/ObjCARC/basic.ll index 7b64b1b..4c24ebf 100644 --- a/test/Transforms/ObjCARC/basic.ll +++ b/test/Transforms/ObjCARC/basic.ll @@ -92,10 +92,10 @@ alt_return: ; CHECK: define void @test1b( ; CHECK: entry: -; CHECK: tail call i8* @objc_retain(i8* %x) nounwind +; CHECK: tail call i8* @objc_retain(i8* %x) [[NUW:#[0-9]+]] ; CHECK-NOT: @objc_ ; CHECK: if.end5: -; CHECK: tail call void @objc_release(i8* %x) nounwind, !clang.imprecise_release !0 +; CHECK: tail call void 
@objc_release(i8* %x) [[NUW]], !clang.imprecise_release !0 ; CHECK-NOT: @objc_ ; CHECK: } define void @test1b(i8* %x, i1 %p, i1 %q) { @@ -404,8 +404,8 @@ entry: ; a stack argument. ; CHECK: define void @test11( -; CHECK: tail call i8* @objc_retain(i8* %x) nounwind -; CHECK: tail call i8* @objc_autorelease(i8* %0) nounwind +; CHECK: tail call i8* @objc_retain(i8* %x) [[NUW]] +; CHECK: call i8* @objc_autorelease(i8* %0) [[NUW]] ; CHECK: } define void @test11(i8* %x) nounwind { entry: @@ -431,8 +431,8 @@ entry: ; Same as test11 but the value is returned. Do an RV optimization. ; CHECK: define i8* @test11b( -; CHECK: tail call i8* @objc_retain(i8* %x) nounwind -; CHECK: tail call i8* @objc_autoreleaseReturnValue(i8* %0) nounwind +; CHECK: tail call i8* @objc_retain(i8* %x) [[NUW]] +; CHECK: tail call i8* @objc_autoreleaseReturnValue(i8* %0) [[NUW]] ; CHECK: } define i8* @test11b(i8* %x) nounwind { entry: @@ -462,10 +462,10 @@ entry: ; Trivial retain,autorelease pair. Don't delete! ; CHECK: define void @test13( -; CHECK: tail call i8* @objc_retain(i8* %x) nounwind -; CHECK: tail call i8* @objc_retain(i8* %x) nounwind +; CHECK: tail call i8* @objc_retain(i8* %x) [[NUW]] +; CHECK: tail call i8* @objc_retain(i8* %x) [[NUW]] ; CHECK: @use_pointer(i8* %x) -; CHECK: tail call i8* @objc_autorelease(i8* %x) nounwind +; CHECK: call i8* @objc_autorelease(i8* %x) [[NUW]] ; CHECK: } define void @test13(i8* %x, i64 %n) { entry: @@ -716,7 +716,7 @@ entry: ; Bitcast insertion ; CHECK: define void @test20( -; CHECK: %tmp1 = tail call i8* @objc_retain(i8* %tmp) nounwind +; CHECK: %tmp1 = tail call i8* @objc_retain(i8* %tmp) [[NUW]] ; CHECK-NEXT: invoke define void @test20(double* %self) { if.then12: @@ -980,7 +980,7 @@ done: ; CHECK: call i8* @objc_retain( ; CHECK: call void @callee() ; CHECK: store -; CHECK: call void @objc_release(i8* %p) nounwind, !clang.imprecise_release +; CHECK: call void @objc_release(i8* %p) [[NUW]], !clang.imprecise_release ; CHECK: done: ; CHECK-NOT: @objc_ ; CHECK: } @@ -1450,9 +1450,9 @@ define void @test45(i8** %pp, i8** %qq) { ; Don't delete retain and autorelease here. 
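; (Illustrative: in the pattern below the retain executes unconditionally
; while the autorelease runs only on one branch, e.g.
;   tail call i8* @objc_retain(i8* %p)
;   br i1 %a, label %true, label %false
; so the pair is not control-equivalent, and removing it would unbalance the
; reference count on one of the paths.)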
; CHECK: define void @test46( -; CHECK: tail call i8* @objc_retain(i8* %p) nounwind +; CHECK: tail call i8* @objc_retain(i8* %p) [[NUW]] ; CHECK: true: -; CHECK: tail call i8* @objc_autorelease(i8* %p) nounwind +; CHECK: call i8* @objc_autorelease(i8* %p) [[NUW]] define void @test46(i8* %p, i1 %a) { entry: call i8* @objc_retain(i8* %p) @@ -1565,7 +1565,7 @@ define void @test53(void ()** %zz, i8** %pp) { ; CHECK: define void @test54( ; CHECK: call i8* @returner() -; CHECK-NEXT: call void @objc_release(i8* %t) nounwind, !clang.imprecise_release !0 +; CHECK-NEXT: call void @objc_release(i8* %t) [[NUW]], !clang.imprecise_release !0 ; CHECK-NEXT: ret void define void @test54() { %t = call i8* @returner() @@ -1595,10 +1595,10 @@ entry: ; CHECK: define void @test56( ; CHECK-NOT: @objc ; CHECK: if.then: -; CHECK-NEXT: %0 = tail call i8* @objc_retain(i8* %x) nounwind +; CHECK-NEXT: %0 = tail call i8* @objc_retain(i8* %x) [[NUW]] ; CHECK-NEXT: tail call void @use_pointer(i8* %x) ; CHECK-NEXT: tail call void @use_pointer(i8* %x) -; CHECK-NEXT: tail call void @objc_release(i8* %x) nounwind, !clang.imprecise_release !0 +; CHECK-NEXT: tail call void @objc_release(i8* %x) [[NUW]], !clang.imprecise_release !0 ; CHECK-NEXT: br label %if.end ; CHECK-NOT: @objc ; CHECK: } @@ -1630,10 +1630,10 @@ if.end: ; preds = %entry, %if.then ; CHECK-NEXT: entry: ; CHECK-NEXT: call void @use_pointer(i8* %x) ; CHECK-NEXT: call void @use_pointer(i8* %x) -; CHECK-NEXT: %0 = tail call i8* @objc_retain(i8* %x) nounwind +; CHECK-NEXT: %0 = tail call i8* @objc_retain(i8* %x) [[NUW]] ; CHECK-NEXT: call void @use_pointer(i8* %x) ; CHECK-NEXT: call void @use_pointer(i8* %x) -; CHECK-NEXT: call void @objc_release(i8* %x) nounwind +; CHECK-NEXT: call void @objc_release(i8* %x) [[NUW]] ; CHECK-NEXT: ret void ; CHECK-NEXT: } define void @test57(i8* %x) nounwind { @@ -1673,10 +1673,10 @@ entry: ; CHECK: define void @test59( ; CHECK-NEXT: entry: -; CHECK-NEXT: %0 = tail call i8* @objc_retain(i8* %x) nounwind +; CHECK-NEXT: %0 = tail call i8* @objc_retain(i8* %x) [[NUW]] ; CHECK-NEXT: call void @use_pointer(i8* %x) ; CHECK-NEXT: call void @use_pointer(i8* %x) -; CHECK-NEXT: call void @objc_release(i8* %x) nounwind +; CHECK-NEXT: call void @objc_release(i8* %x) [[NUW]] ; CHECK-NEXT: ret void ; CHECK-NEXT: } define void @test59(i8* %x) nounwind { @@ -1875,8 +1875,8 @@ return: ; preds = %if.then, %entry ; rdar://11931823 ; CHECK: define void @test66( -; CHECK: %tmp7 = tail call i8* @objc_retain(i8* %cond) nounwind -; CHECK: tail call void @objc_release(i8* %cond) nounwind +; CHECK: %tmp7 = tail call i8* @objc_retain(i8* %cond) [[NUW]] +; CHECK: tail call void @objc_release(i8* %cond) [[NUW]] ; CHECK: } define void @test66(i8* %tmp5, i8* %bar, i1 %tobool, i1 %tobool1, i8* %call) { entry: @@ -2224,3 +2224,6 @@ end: ; preds = %if.end125, %if.end1 !0 = metadata !{} declare i32 @__gxx_personality_v0(...) + +; CHECK: attributes #0 = { nounwind readnone } +; CHECK: attributes [[NUW]] = { nounwind } diff --git a/test/Transforms/ObjCARC/cfg-hazards.ll b/test/Transforms/ObjCARC/cfg-hazards.ll index 1519423..899298b 100644 --- a/test/Transforms/ObjCARC/cfg-hazards.ll +++ b/test/Transforms/ObjCARC/cfg-hazards.ll @@ -86,9 +86,9 @@ for.end: ; preds = %for.body ; Delete nested retain+release pairs around loops. 
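; (Illustrative shape of the nesting, assuming the usual ARC entry points:
;   tail call i8* @objc_retain(i8* %a)     ; outer
;   tail call i8* @objc_retain(i8* %a)     ; inner
;   ; ... loop that neither uses nor escapes %a ...
;   call void @objc_release(i8* %a)        ; inner
;   call void @objc_release(i8* %a)        ; outer
; the redundant inner pair can be deleted while the outer pair is kept.)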
-; CHECK: define void @test3(i8* %a) nounwind { +; CHECK: define void @test3(i8* %a) #0 { ; CHECK-NEXT: entry: -; CHECK-NEXT: tail call i8* @objc_retain(i8* %a) nounwind +; CHECK-NEXT: tail call i8* @objc_retain(i8* %a) [[NUW:#[0-9]+]] ; CHECK-NEXT: br label %loop ; CHECK-NOT: @objc_ ; CHECK: exit: @@ -112,9 +112,9 @@ exit: ret void } -; CHECK: define void @test4(i8* %a) nounwind { +; CHECK: define void @test4(i8* %a) #0 { ; CHECK-NEXT: entry: -; CHECK-NEXT: tail call i8* @objc_retain(i8* %a) nounwind +; CHECK-NEXT: tail call i8* @objc_retain(i8* %a) [[NUW]] ; CHECK-NEXT: br label %loop ; CHECK-NOT: @objc_ ; CHECK: exit: @@ -142,9 +142,9 @@ exit: ret void } -; CHECK: define void @test5(i8* %a) nounwind { +; CHECK: define void @test5(i8* %a) #0 { ; CHECK-NEXT: entry: -; CHECK-NEXT: tail call i8* @objc_retain(i8* %a) nounwind +; CHECK-NEXT: tail call i8* @objc_retain(i8* %a) [[NUW]] ; CHECK-NEXT: call void @callee() ; CHECK-NEXT: br label %loop ; CHECK-NOT: @objc_ @@ -176,9 +176,9 @@ exit: ret void } -; CHECK: define void @test6(i8* %a) nounwind { +; CHECK: define void @test6(i8* %a) #0 { ; CHECK-NEXT: entry: -; CHECK-NEXT: tail call i8* @objc_retain(i8* %a) nounwind +; CHECK-NEXT: tail call i8* @objc_retain(i8* %a) [[NUW]] ; CHECK-NEXT: br label %loop ; CHECK-NOT: @objc_ ; CHECK: exit: @@ -209,9 +209,9 @@ exit: ret void } -; CHECK: define void @test7(i8* %a) nounwind { +; CHECK: define void @test7(i8* %a) #0 { ; CHECK-NEXT: entry: -; CHECK-NEXT: tail call i8* @objc_retain(i8* %a) nounwind +; CHECK-NEXT: tail call i8* @objc_retain(i8* %a) [[NUW]] ; CHECK-NEXT: call void @callee() ; CHECK-NEXT: br label %loop ; CHECK-NOT: @objc_ @@ -242,9 +242,9 @@ exit: ret void } -; CHECK: define void @test8(i8* %a) nounwind { +; CHECK: define void @test8(i8* %a) #0 { ; CHECK-NEXT: entry: -; CHECK-NEXT: tail call i8* @objc_retain(i8* %a) nounwind +; CHECK-NEXT: tail call i8* @objc_retain(i8* %a) [[NUW]] ; CHECK-NEXT: br label %loop ; CHECK-NOT: @objc_ ; CHECK: exit: @@ -274,7 +274,7 @@ exit: ret void } -; CHECK: define void @test9(i8* %a) nounwind { +; CHECK: define void @test9(i8* %a) #0 { ; CHECK-NEXT: entry: ; CHECK-NEXT: br label %loop ; CHECK-NOT: @objc_ @@ -303,7 +303,7 @@ exit: ret void } -; CHECK: define void @test10(i8* %a) nounwind { +; CHECK: define void @test10(i8* %a) #0 { ; CHECK-NEXT: entry: ; CHECK-NEXT: br label %loop ; CHECK-NOT: @objc_ @@ -332,7 +332,7 @@ exit: ret void } -; CHECK: define void @test11(i8* %a) nounwind { +; CHECK: define void @test11(i8* %a) #0 { ; CHECK-NEXT: entry: ; CHECK-NEXT: br label %loop ; CHECK-NOT: @objc_ @@ -362,15 +362,15 @@ exit: ; Don't delete anything if they're not balanced. 
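; (Illustrative: "balanced" means each retain of the pointer is matched by
; exactly one release on every path; a hypothetical shape like
;   %outer = tail call i8* @objc_retain(i8* %a)
;   loop:  call void @objc_release(i8* %a)   ; may run 0..N times
; cannot be paired up, so nothing may be removed.)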
-; CHECK: define void @test12(i8* %a) nounwind { +; CHECK: define void @test12(i8* %a) #0 { ; CHECK-NEXT: entry: -; CHECK-NEXT: %outer = tail call i8* @objc_retain(i8* %a) nounwind -; CHECK-NEXT: %inner = tail call i8* @objc_retain(i8* %a) nounwind +; CHECK-NEXT: %outer = tail call i8* @objc_retain(i8* %a) [[NUW]] +; CHECK-NEXT: %inner = tail call i8* @objc_retain(i8* %a) [[NUW]] ; CHECK-NEXT: br label %loop ; CHECK-NOT: @objc_ ; CHECK: exit: -; CHECK-NEXT: call void @objc_release(i8* %a) nounwind -; CHECK-NEXT: call void @objc_release(i8* %a) nounwind, !clang.imprecise_release !0 +; CHECK-NEXT: call void @objc_release(i8* %a) [[NUW]] +; CHECK-NEXT: call void @objc_release(i8* %a) [[NUW]], !clang.imprecise_release !0 ; CHECK-NEXT: ret void ; CHECK-NEXT: } define void @test12(i8* %a) nounwind { @@ -394,4 +394,6 @@ exit: ret void } +; CHECK: attributes [[NUW]] = { nounwind } + !0 = metadata !{} diff --git a/test/Transforms/ObjCARC/contract-marker.ll b/test/Transforms/ObjCARC/contract-marker.ll index 01d978a..01fd1e7 100644 --- a/test/Transforms/ObjCARC/contract-marker.ll +++ b/test/Transforms/ObjCARC/contract-marker.ll @@ -3,7 +3,7 @@ ; CHECK: %call = tail call i32* @qux() ; CHECK-NEXT: %tcall = bitcast i32* %call to i8* ; CHECK-NEXT: call void asm sideeffect "mov\09r7, r7\09\09@ marker for objc_retainAutoreleaseReturnValue", ""() -; CHECK-NEXT: %0 = tail call i8* @objc_retainAutoreleasedReturnValue(i8* %tcall) nounwind +; CHECK-NEXT: %0 = tail call i8* @objc_retainAutoreleasedReturnValue(i8* %tcall) [[NUW:#[0-9]+]] define void @foo() { entry: @@ -21,3 +21,5 @@ declare void @bar(i8*) !clang.arc.retainAutoreleasedReturnValueMarker = !{!0} !0 = metadata !{metadata !"mov\09r7, r7\09\09@ marker for objc_retainAutoreleaseReturnValue"} + +; CHECK: attributes [[NUW]] = { nounwind } diff --git a/test/Transforms/ObjCARC/contract-storestrong.ll b/test/Transforms/ObjCARC/contract-storestrong.ll index 2922f81..6999237 100644 --- a/test/Transforms/ObjCARC/contract-storestrong.ll +++ b/test/Transforms/ObjCARC/contract-storestrong.ll @@ -10,7 +10,7 @@ declare void @use_pointer(i8*) ; CHECK: define void @test0( ; CHECK: entry: -; CHECK-NEXT: tail call void @objc_storeStrong(i8** @x, i8* %p) nounwind +; CHECK-NEXT: tail call void @objc_storeStrong(i8** @x, i8* %p) [[NUW:#[0-9]+]] ; CHECK-NEXT: ret void define void @test0(i8* %p) { entry: @@ -25,10 +25,10 @@ entry: ; CHECK: define void @test1(i8* %p) { ; CHECK-NEXT: entry: -; CHECK-NEXT: %0 = tail call i8* @objc_retain(i8* %p) nounwind +; CHECK-NEXT: %0 = tail call i8* @objc_retain(i8* %p) [[NUW]] ; CHECK-NEXT: %tmp = load volatile i8** @x, align 8 ; CHECK-NEXT: store i8* %0, i8** @x, align 8 -; CHECK-NEXT: tail call void @objc_release(i8* %tmp) nounwind +; CHECK-NEXT: tail call void @objc_release(i8* %tmp) [[NUW]] ; CHECK-NEXT: ret void ; CHECK-NEXT: } define void @test1(i8* %p) { @@ -44,10 +44,10 @@ entry: ; CHECK: define void @test2(i8* %p) { ; CHECK-NEXT: entry: -; CHECK-NEXT: %0 = tail call i8* @objc_retain(i8* %p) nounwind +; CHECK-NEXT: %0 = tail call i8* @objc_retain(i8* %p) [[NUW]] ; CHECK-NEXT: %tmp = load i8** @x, align 8 ; CHECK-NEXT: store volatile i8* %0, i8** @x, align 8 -; CHECK-NEXT: tail call void @objc_release(i8* %tmp) nounwind +; CHECK-NEXT: tail call void @objc_release(i8* %tmp) [[NUW]] ; CHECK-NEXT: ret void ; CHECK-NEXT: } define void @test2(i8* %p) { @@ -64,11 +64,11 @@ entry: ; CHECK: define void @test3(i8* %newValue) { ; CHECK-NEXT: entry: -; CHECK-NEXT: %x0 = tail call i8* @objc_retain(i8* %newValue) nounwind +; CHECK-NEXT: %x0 = 
tail call i8* @objc_retain(i8* %newValue) [[NUW]] ; CHECK-NEXT: %x1 = load i8** @x, align 8 ; CHECK-NEXT: store i8* %x0, i8** @x, align 8 ; CHECK-NEXT: tail call void @use_pointer(i8* %x1), !clang.arc.no_objc_arc_exceptions !0 -; CHECK-NEXT: tail call void @objc_release(i8* %x1) nounwind, !clang.imprecise_release !0 +; CHECK-NEXT: tail call void @objc_release(i8* %x1) [[NUW]], !clang.imprecise_release !0 ; CHECK-NEXT: ret void ; CHECK-NEXT: } define void @test3(i8* %newValue) { @@ -85,11 +85,11 @@ entry: ; CHECK: define i1 @test4(i8* %newValue, i8* %foo) { ; CHECK-NEXT: entry: -; CHECK-NEXT: %x0 = tail call i8* @objc_retain(i8* %newValue) nounwind +; CHECK-NEXT: %x0 = tail call i8* @objc_retain(i8* %newValue) [[NUW]] ; CHECK-NEXT: %x1 = load i8** @x, align 8 ; CHECK-NEXT: store i8* %x0, i8** @x, align 8 ; CHECK-NEXT: %t = icmp eq i8* %x1, %foo -; CHECK-NEXT: tail call void @objc_release(i8* %x1) nounwind, !clang.imprecise_release !0 +; CHECK-NEXT: tail call void @objc_release(i8* %x1) [[NUW]], !clang.imprecise_release !0 ; CHECK-NEXT: ret i1 %t ; CHECK-NEXT: } define i1 @test4(i8* %newValue, i8* %foo) { @@ -106,7 +106,7 @@ entry: ; CHECK: define i1 @test5(i8* %newValue, i8* %foo) { ; CHECK: %t = icmp eq i8* %x1, %foo -; CHECK: tail call void @objc_storeStrong(i8** @x, i8* %newValue) nounwind +; CHECK: tail call void @objc_storeStrong(i8** @x, i8* %newValue) [[NUW]] define i1 @test5(i8* %newValue, i8* %foo) { entry: %x0 = tail call i8* @objc_retain(i8* %newValue) nounwind @@ -121,7 +121,7 @@ entry: ; CHECK: define i1 @test6(i8* %newValue, i8* %foo) { ; CHECK: %t = icmp eq i8* %x1, %foo -; CHECK: tail call void @objc_storeStrong(i8** @x, i8* %newValue) nounwind +; CHECK: tail call void @objc_storeStrong(i8** @x, i8* %newValue) [[NUW]] define i1 @test6(i8* %newValue, i8* %foo) { entry: %x0 = tail call i8* @objc_retain(i8* %newValue) nounwind @@ -136,9 +136,9 @@ entry: ; CHECK: define void @test7( ; CHECK-NEXT: entry: -; CHECK-NEXT: %0 = tail call i8* @objc_retain(i8* %p) nounwind +; CHECK-NEXT: %0 = tail call i8* @objc_retain(i8* %p) [[NUW]] ; CHECK-NEXT: %tmp = load i8** @x, align 8 -; CHECK-NEXT: tail call void @objc_release(i8* %tmp) nounwind +; CHECK-NEXT: tail call void @objc_release(i8* %tmp) [[NUW]] ; CHECK-NEXT: ret void ; CHECK-NEXT: } define void @test7(i8* %p) { @@ -155,7 +155,7 @@ entry: ; CHECK-NEXT: entry: ; CHECK-NEXT: %tmp = load i8** @x, align 8 ; CHECK-NEXT: store i8* %p, i8** @x, align 8 -; CHECK-NEXT: tail call void @objc_release(i8* %tmp) nounwind +; CHECK-NEXT: tail call void @objc_release(i8* %tmp) [[NUW]] ; CHECK-NEXT: ret void ; CHECK-NEXT: } define void @test8(i8* %p) { @@ -167,3 +167,5 @@ entry: } !0 = metadata !{} + +; CHECK: attributes [[NUW]] = { nounwind } diff --git a/test/Transforms/ObjCARC/contract-testcases.ll b/test/Transforms/ObjCARC/contract-testcases.ll index 1510ed0..85b03be 100644 --- a/test/Transforms/ObjCARC/contract-testcases.ll +++ b/test/Transforms/ObjCARC/contract-testcases.ll @@ -69,7 +69,7 @@ bb7: ; preds = %bb6, %bb6, %bb5 ; CHECK: define void @_Z6doTestP8NSString() { ; CHECK: invoke.cont: ; preds = %entry ; CHECK-NEXT: call void asm sideeffect "mov\09r7, r7\09\09@ marker for objc_retainAutoreleaseReturnValue", ""() -; CHECK-NEXT: %tmp = tail call i8* @objc_retainAutoreleasedReturnValue(i8* %call) nounwind +; CHECK-NEXT: %tmp = tail call i8* @objc_retainAutoreleasedReturnValue(i8* %call) [[NUW:#[0-9]+]] define void @_Z6doTestP8NSString() { entry: %call = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* ()*)() @@ -88,3 +88,6 @@ 
lpad: ; preds = %entry !clang.arc.retainAutoreleasedReturnValueMarker = !{!0} !0 = metadata !{metadata !"mov\09r7, r7\09\09@ marker for objc_retainAutoreleaseReturnValue"} + +; CHECK: attributes #0 = { optsize } +; CHECK: attributes [[NUW]] = { nounwind } diff --git a/test/Transforms/ObjCARC/contract.ll b/test/Transforms/ObjCARC/contract.ll index c48f8a5..b6fba59 100644 --- a/test/Transforms/ObjCARC/contract.ll +++ b/test/Transforms/ObjCARC/contract.ll @@ -34,12 +34,12 @@ entry: ; Merge objc_retain and objc_autorelease into objc_retainAutorelease. ; CHECK: define void @test2( -; CHECK: tail call i8* @objc_retainAutorelease(i8* %x) nounwind +; CHECK: tail call i8* @objc_retainAutorelease(i8* %x) [[NUW:#[0-9]+]] ; CHECK: } define void @test2(i8* %x) nounwind { entry: %0 = tail call i8* @objc_retain(i8* %x) nounwind - tail call i8* @objc_autorelease(i8* %0) nounwind + call i8* @objc_autorelease(i8* %0) nounwind call void @use_pointer(i8* %x) ret void } @@ -47,7 +47,7 @@ entry: ; Same as test2 but the value is returned. Do an RV optimization. ; CHECK: define i8* @test2b( -; CHECK: tail call i8* @objc_retainAutoreleaseReturnValue(i8* %x) nounwind +; CHECK: tail call i8* @objc_retainAutoreleaseReturnValue(i8* %x) [[NUW]] ; CHECK: } define i8* @test2b(i8* %x) nounwind { entry: @@ -59,14 +59,14 @@ entry: ; Merge a retain,autorelease pair around a call. ; CHECK: define void @test3( -; CHECK: tail call i8* @objc_retainAutorelease(i8* %x) nounwind +; CHECK: tail call i8* @objc_retainAutorelease(i8* %x) [[NUW]] ; CHECK: @use_pointer(i8* %0) ; CHECK: } define void @test3(i8* %x, i64 %n) { entry: tail call i8* @objc_retain(i8* %x) nounwind call void @use_pointer(i8* %x) - tail call i8* @objc_autorelease(i8* %x) nounwind + call i8* @objc_autorelease(i8* %x) nounwind ret void } @@ -75,7 +75,7 @@ entry: ; CHECK: define void @test4( ; CHECK-NEXT: entry: -; CHECK-NEXT: @objc_retainAutorelease(i8* %x) nounwind +; CHECK-NEXT: @objc_retainAutorelease(i8* %x) [[NUW]] ; CHECK-NEXT: @use_pointer ; CHECK-NEXT: @objc_release ; CHECK-NEXT: ret void @@ -84,7 +84,7 @@ define void @test4(i8* %x, i64 %n) { entry: tail call i8* @objc_retain(i8* %x) nounwind call void @use_pointer(i8* %x) - tail call i8* @objc_autorelease(i8* %x) nounwind + call i8* @objc_autorelease(i8* %x) nounwind tail call void @objc_release(i8* %x) nounwind ret void } @@ -92,9 +92,9 @@ entry: ; Don't merge retain and autorelease if they're not control-equivalent. ; CHECK: define void @test5( -; CHECK: tail call i8* @objc_retain(i8* %p) nounwind +; CHECK: tail call i8* @objc_retain(i8* %p) [[NUW]] ; CHECK: true: -; CHECK: tail call i8* @objc_autorelease(i8* %0) nounwind +; CHECK: call i8* @objc_autorelease(i8* %0) [[NUW]] ; CHECK: } define void @test5(i8* %p, i1 %a) { entry: @@ -102,7 +102,7 @@ entry: br i1 %a, label %true, label %false true: - tail call i8* @objc_autorelease(i8* %p) nounwind + call i8* @objc_autorelease(i8* %p) nounwind call void @use_pointer(i8* %p) ret void @@ -119,8 +119,8 @@ false: ; Those entrypoints don't exist yet though. 
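; (For reference, the return-value pattern in question, as in @test6 below:
;   %p = call i8* @returner()
;   %1 = call i8* @objc_retainAutoreleasedReturnValue(i8* %p)
;   %t = tail call i8* @objc_autoreleaseReturnValue(i8* %1)
; a combined entry point would fuse the last two calls into one.)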
; CHECK: define i8* @test6( -; CHECK: call i8* @objc_retainAutoreleasedReturnValue(i8* %p) nounwind -; CHECK: %t = tail call i8* @objc_autoreleaseReturnValue(i8* %1) nounwind +; CHECK: call i8* @objc_retainAutoreleasedReturnValue(i8* %p) [[NUW]] +; CHECK: %t = tail call i8* @objc_autoreleaseReturnValue(i8* %1) [[NUW]] ; CHECK: } define i8* @test6() { %p = call i8* @returner() @@ -161,3 +161,5 @@ return: ; preds = %if.then, %entry %retval = phi i8* [ %c, %if.then ], [ null, %entry ] ret i8* %retval } + +; CHECK: attributes [[NUW]] = { nounwind } diff --git a/test/Transforms/ObjCARC/dont-infinite-loop-during-block-escape-analysis.ll b/test/Transforms/ObjCARC/dont-infinite-loop-during-block-escape-analysis.ll new file mode 100644 index 0000000..bdee2be --- /dev/null +++ b/test/Transforms/ObjCARC/dont-infinite-loop-during-block-escape-analysis.ll @@ -0,0 +1,87 @@ +; RUN: opt -S -objc-arc < %s +; bugzilla://14551 +; rdar://12851911 + +; Make sure that we do not hang clang during escape analysis. + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-darwin" + +%struct.__block_descriptor = type { i64, i64 } +%struct.__block_byref_foo = type { i8*, %struct.__block_byref_foo*, i32, i32, i32 } + +@_NSConcreteGlobalBlock = external global i8* +@.str = private unnamed_addr constant [6 x i8] c"v8@?0\00", align 1 +@__block_descriptor_tmp = internal constant { i64, i64, i8*, i8* } { i64 0, i64 32, i8* getelementptr inbounds ([6 x i8]* @.str, i32 0, i32 0), i8* null } +@__block_literal_global = internal constant { i8**, i32, i32, i8*, %struct.__block_descriptor* } { i8** @_NSConcreteGlobalBlock, i32 1342177280, i32 0, i8* bitcast (void (i8*)* @__hang_clang_block_invoke to i8*), %struct.__block_descriptor* bitcast ({ i64, i64, i8*, i8* }* @__block_descriptor_tmp to %struct.__block_descriptor*) }, align 8 + +define void @hang_clang() uwtable optsize ssp { +entry: + %foo = alloca %struct.__block_byref_foo, align 8 + %byref.isa = getelementptr inbounds %struct.__block_byref_foo* %foo, i64 0, i32 0 + store i8* null, i8** %byref.isa, align 8 + %byref.forwarding = getelementptr inbounds %struct.__block_byref_foo* %foo, i64 0, i32 1 + store %struct.__block_byref_foo* %foo, %struct.__block_byref_foo** %byref.forwarding, align 8 + %byref.flags = getelementptr inbounds %struct.__block_byref_foo* %foo, i64 0, i32 2 + store i32 536870912, i32* %byref.flags, align 8 + %byref.size = getelementptr inbounds %struct.__block_byref_foo* %foo, i64 0, i32 3 + store i32 32, i32* %byref.size, align 4 + %foo1 = getelementptr inbounds %struct.__block_byref_foo* %foo, i64 0, i32 4 + store i32 0, i32* %foo1, align 8, !tbaa !4 + br label %for.body + +for.body: ; preds = %for.inc.for.body_crit_edge, %entry + %0 = phi i1 [ true, %entry ], [ %phitmp, %for.inc.for.body_crit_edge ] + %i.06 = phi i32 [ 1, %entry ], [ %phitmp8, %for.inc.for.body_crit_edge ] + %block.05 = phi void (...)* [ null, %entry ], [ %block.1, %for.inc.for.body_crit_edge ] + br i1 %0, label %for.inc, label %if.then + +if.then: ; preds = %for.body + %1 = call i8* @objc_retainBlock(i8* bitcast ({ i8**, i32, i32, i8*, %struct.__block_descriptor* }* @__block_literal_global to i8*)) nounwind, !clang.arc.copy_on_escape !7 + %2 = bitcast i8* %1 to void (...)* + %3 = bitcast void (...)* %block.05 to i8* + call void @objc_release(i8* %3) nounwind, !clang.imprecise_release !7 + br label %for.inc + +for.inc: ; preds = %for.body, %if.then + 
%block.1 = phi void (...)* [ %2, %if.then ], [ %block.05, %for.body ] + %exitcond = icmp eq i32 %i.06, 10 + br i1 %exitcond, label %for.end, label %for.inc.for.body_crit_edge + +for.inc.for.body_crit_edge: ; preds = %for.inc + %.pre = load %struct.__block_byref_foo** %byref.forwarding, align 8 + %foo2.phi.trans.insert = getelementptr inbounds %struct.__block_byref_foo* %.pre, i64 0, i32 4 + %.pre7 = load i32* %foo2.phi.trans.insert, align 4, !tbaa !4 + %phitmp = icmp eq i32 %.pre7, 0 + %phitmp8 = add i32 %i.06, 1 + br label %for.body + +for.end: ; preds = %for.inc + %4 = bitcast %struct.__block_byref_foo* %foo to i8* + call void @_Block_object_dispose(i8* %4, i32 8) + %5 = bitcast void (...)* %block.1 to i8* + call void @objc_release(i8* %5) nounwind, !clang.imprecise_release !7 + ret void +} + +define internal void @__hang_clang_block_invoke(i8* nocapture %.block_descriptor) nounwind uwtable readnone optsize ssp { +entry: + ret void +} + +declare i8* @objc_retainBlock(i8*) + +declare void @objc_release(i8*) nonlazybind + +declare void @_Block_object_dispose(i8*, i32) + +!llvm.module.flags = !{!0, !1, !2, !3} + +!0 = metadata !{i32 1, metadata !"Objective-C Version", i32 2} +!1 = metadata !{i32 1, metadata !"Objective-C Image Info Version", i32 0} +!2 = metadata !{i32 1, metadata !"Objective-C Image Info Section", metadata !"__DATA, __objc_imageinfo, regular, no_dead_strip"} +!3 = metadata !{i32 4, metadata !"Objective-C Garbage Collection", i32 0} +!4 = metadata !{metadata !"int", metadata !5} +!5 = metadata !{metadata !"omnipotent char", metadata !6} +!6 = metadata !{metadata !"Simple C/C++ TBAA"} +!7 = metadata !{} diff --git a/test/Transforms/ObjCARC/ensure-that-exception-unwind-path-is-visited.ll b/test/Transforms/ObjCARC/ensure-that-exception-unwind-path-is-visited.ll new file mode 100644 index 0000000..05257d1 --- /dev/null +++ b/test/Transforms/ObjCARC/ensure-that-exception-unwind-path-is-visited.ll @@ -0,0 +1,174 @@ +; RUN: opt -objc-arc -S < %s | FileCheck %s +; rdar://11744105 +; bugzilla://14584 + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.9.0" + +%0 = type opaque +%struct._class_t = type { %struct._class_t*, %struct._class_t*, %struct._objc_cache*, i8* (i8*, i8*)**, %struct._class_ro_t* } +%struct._objc_cache = type opaque +%struct._class_ro_t = type { i32, i32, i32, i8*, i8*, %struct.__method_list_t*, %struct._objc_protocol_list*, %struct._ivar_list_t*, i8*, %struct._prop_list_t* } +%struct.__method_list_t = type { i32, i32, [0 x %struct._objc_method] } +%struct._objc_method = type { i8*, i8*, i8* } +%struct._objc_protocol_list = type { i64, [0 x %struct._protocol_t*] } +%struct._protocol_t = type { i8*, i8*, %struct._objc_protocol_list*, %struct.__method_list_t*, %struct.__method_list_t*, %struct.__method_list_t*, %struct.__method_list_t*, %struct._prop_list_t*, i32, i32, i8** } +%struct._prop_list_t = type { i32, i32, [0 x %struct._prop_t] } +%struct._prop_t = type { i8*, i8* } +%struct._ivar_list_t = type { i32, i32, [0 x %struct._ivar_t] } +%struct._ivar_t = type { i64*, i8*, i8*, i32, i32 } +%struct.NSConstantString = type { i32*, i32, i8*, i64 } + +@"OBJC_CLASS_$_NSObject" = external global %struct._class_t +@"\01L_OBJC_CLASSLIST_REFERENCES_$_" = internal global %struct._class_t* @"OBJC_CLASS_$_NSObject", section "__DATA, __objc_classrefs, regular, no_dead_strip", align 8 +@"\01L_OBJC_METH_VAR_NAME_" = 
internal global [4 x i8] c"new\00", section "__TEXT,__objc_methname,cstring_literals", align 1 +@"\01L_OBJC_SELECTOR_REFERENCES_" = internal global i8* getelementptr inbounds ([4 x i8]* @"\01L_OBJC_METH_VAR_NAME_", i64 0, i64 0), section "__DATA, __objc_selrefs, literal_pointers, no_dead_strip" +@__CFConstantStringClassReference = external global [0 x i32] +@.str = linker_private unnamed_addr constant [11 x i8] c"Failed: %@\00", align 1 +@_unnamed_cfstring_ = private constant %struct.NSConstantString { i32* getelementptr inbounds ([0 x i32]* @__CFConstantStringClassReference, i32 0, i32 0), i32 1992, i8* getelementptr inbounds ([11 x i8]* @.str, i32 0, i32 0), i64 10 }, section "__DATA,__cfstring" +@"OBJC_CLASS_$_NSException" = external global %struct._class_t +@"\01L_OBJC_CLASSLIST_REFERENCES_$_1" = internal global %struct._class_t* @"OBJC_CLASS_$_NSException", section "__DATA, __objc_classrefs, regular, no_dead_strip", align 8 +@.str2 = linker_private unnamed_addr constant [4 x i8] c"Foo\00", align 1 +@_unnamed_cfstring_3 = private constant %struct.NSConstantString { i32* getelementptr inbounds ([0 x i32]* @__CFConstantStringClassReference, i32 0, i32 0), i32 1992, i8* getelementptr inbounds ([4 x i8]* @.str2, i32 0, i32 0), i64 3 }, section "__DATA,__cfstring" +@"\01L_OBJC_METH_VAR_NAME_4" = internal global [14 x i8] c"raise:format:\00", section "__TEXT,__objc_methname,cstring_literals", align 1 +@"\01L_OBJC_SELECTOR_REFERENCES_5" = internal global i8* getelementptr inbounds ([14 x i8]* @"\01L_OBJC_METH_VAR_NAME_4", i64 0, i64 0), section "__DATA, __objc_selrefs, literal_pointers, no_dead_strip" +@llvm.used = appending global [6 x i8*] [i8* bitcast (%struct._class_t** @"\01L_OBJC_CLASSLIST_REFERENCES_$_" to i8*), i8* getelementptr inbounds ([4 x i8]* @"\01L_OBJC_METH_VAR_NAME_", i32 0, i32 0), i8* bitcast (i8** @"\01L_OBJC_SELECTOR_REFERENCES_" to i8*), i8* bitcast (%struct._class_t** @"\01L_OBJC_CLASSLIST_REFERENCES_$_1" to i8*), i8* getelementptr inbounds ([14 x i8]* @"\01L_OBJC_METH_VAR_NAME_4", i32 0, i32 0), i8* bitcast (i8** @"\01L_OBJC_SELECTOR_REFERENCES_5" to i8*)], section "llvm.metadata" + +define i32 @main() uwtable ssp { +entry: + %tmp = load %struct._class_t** @"\01L_OBJC_CLASSLIST_REFERENCES_$_", align 8, !dbg !37 + %tmp1 = load i8** @"\01L_OBJC_SELECTOR_REFERENCES_", align 8, !dbg !37, !invariant.load !38 + %tmp2 = bitcast %struct._class_t* %tmp to i8*, !dbg !37 +; CHECK: call i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*)*)(i8* %tmp2, i8* %tmp1) + %call = call i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*)*)(i8* %tmp2, i8* %tmp1), !dbg !37, !clang.arc.no_objc_arc_exceptions !38 + call void @llvm.dbg.value(metadata !{i8* %call}, i64 0, metadata !12), !dbg !37 +; CHECK: call i8* @objc_retain(i8* %call) [[NUW:#[0-9]+]] + %tmp3 = call i8* @objc_retain(i8* %call) nounwind, !dbg !39 + call void @llvm.dbg.value(metadata !{i8* %call}, i64 0, metadata !25), !dbg !39 + invoke fastcc void @ThrowFunc(i8* %call) + to label %eh.cont unwind label %lpad, !dbg !40, !clang.arc.no_objc_arc_exceptions !38 + +eh.cont: ; preds = %entry +; CHECK: call void @objc_release(i8* %call) + call void @objc_release(i8* %call) nounwind, !dbg !42, !clang.imprecise_release !38 + br label %if.end, !dbg !43 + +lpad: ; preds = %entry + %tmp4 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__objc_personality_v0 to i8*) + catch i8* null, !dbg !40 + %tmp5 = extractvalue { i8*, i32 } %tmp4, 0, !dbg !40 + %exn.adjusted = call i8* @objc_begin_catch(i8* %tmp5) 
nounwind, !dbg !44 + call void @llvm.dbg.value(metadata !45, i64 0, metadata !21), !dbg !46 + call void @objc_end_catch(), !dbg !49, !clang.arc.no_objc_arc_exceptions !38 +; CHECK: call void @objc_release(i8* %call) + call void @objc_release(i8* %call) nounwind, !dbg !42, !clang.imprecise_release !38 + call void (i8*, ...)* @NSLog(i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring_ to i8*), i8* %call), !dbg !50, !clang.arc.no_objc_arc_exceptions !38 + br label %if.end, !dbg !52 + +if.end: ; preds = %lpad, %eh.cont + call void (i8*, ...)* @NSLog(i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring_ to i8*), i8* %call), !dbg !53, !clang.arc.no_objc_arc_exceptions !38 +; CHECK: call void @objc_release(i8* %call) + call void @objc_release(i8* %call) nounwind, !dbg !54, !clang.imprecise_release !38 + ret i32 0, !dbg !54 +} + +declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone + +declare i8* @objc_msgSend(i8*, i8*, ...) nonlazybind + +declare i8* @objc_retain(i8*) nonlazybind + +declare i8* @objc_begin_catch(i8*) + +declare void @objc_end_catch() + +declare void @objc_exception_rethrow() + +define internal fastcc void @ThrowFunc(i8* %obj) uwtable noinline ssp { +entry: + %tmp = call i8* @objc_retain(i8* %obj) nounwind + call void @llvm.dbg.value(metadata !{i8* %obj}, i64 0, metadata !32), !dbg !55 + %tmp1 = load %struct._class_t** @"\01L_OBJC_CLASSLIST_REFERENCES_$_1", align 8, !dbg !56 + %tmp2 = load i8** @"\01L_OBJC_SELECTOR_REFERENCES_5", align 8, !dbg !56, !invariant.load !38 + %tmp3 = bitcast %struct._class_t* %tmp1 to i8*, !dbg !56 + call void (i8*, i8*, %0*, %0*, ...)* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to void (i8*, i8*, %0*, %0*, ...)*)(i8* %tmp3, i8* %tmp2, %0* bitcast (%struct.NSConstantString* @_unnamed_cfstring_3 to %0*), %0* bitcast (%struct.NSConstantString* @_unnamed_cfstring_3 to %0*)), !dbg !56, !clang.arc.no_objc_arc_exceptions !38 + call void @objc_release(i8* %obj) nounwind, !dbg !58, !clang.imprecise_release !38 + ret void, !dbg !58 +} + +declare i32 @__objc_personality_v0(...) + +declare void @objc_release(i8*) nonlazybind + +declare void @NSLog(i8*, ...) 
+ +declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone + +; CHECK: attributes #0 = { ssp uwtable } +; CHECK: attributes #1 = { nounwind readnone } +; CHECK: attributes #2 = { nonlazybind } +; CHECK: attributes #3 = { noinline ssp uwtable } +; CHECK: attributes [[NUW]] = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!33, !34, !35, !36} + +!0 = metadata !{i32 786449, i32 0, i32 16, metadata !"test.m", metadata !"/Volumes/Files/gottesmmcab/Radar/12906997", metadata !"clang version 3.3 ", i1 true, i1 true, metadata !"", i32 2, metadata !1, metadata !1, metadata !3, metadata !1} ; [ DW_TAG_compile_unit ] [/Volumes/Files/gottesmmcab/Radar/12906997/test.m] [DW_LANG_ObjC] +!1 = metadata !{metadata !2} +!2 = metadata !{i32 0} +!3 = metadata !{metadata !4} +!4 = metadata !{metadata !5, metadata !27} +!5 = metadata !{i32 786478, i32 0, metadata !6, metadata !"main", metadata !"main", metadata !"", metadata !6, i32 9, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 true, i32 ()* @main, null, null, metadata !10, i32 10} ; [ DW_TAG_subprogram ] [line 9] [def] [scope 10] [main] +!6 = metadata !{i32 786473, metadata !"test.m", metadata !"/Volumes/Files/gottesmmcab/Radar/12906997", null} ; [ DW_TAG_file_type ] +!7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] +!8 = metadata !{metadata !9} +!9 = metadata !{i32 786468, null, metadata !"int", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed] +!10 = metadata !{metadata !11} +!11 = metadata !{metadata !12, metadata !21, metadata !25} +!12 = metadata !{i32 786688, metadata !13, metadata !"obj", metadata !6, i32 11, metadata !14, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [obj] [line 11] +!13 = metadata !{i32 786443, metadata !5, i32 10, i32 0, metadata !6, i32 0} ; [ DW_TAG_lexical_block ] [/Volumes/Files/gottesmmcab/Radar/12906997/test.m] +!14 = metadata !{i32 786454, null, metadata !"id", metadata !6, i32 11, i64 0, i64 0, i64 0, i32 0, metadata !15} ; [ DW_TAG_typedef ] [id] [line 11, size 0, align 0, offset 0] [from ] +!15 = metadata !{i32 786447, null, metadata !"", null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !16} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from objc_object] +!16 = metadata !{i32 786451, null, metadata !"objc_object", metadata !6, i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !17, i32 0, i32 0, i32 0} ; [ DW_TAG_structure_type ] [objc_object] [line 0, size 0, align 0, offset 0] [from ] +!17 = metadata !{metadata !18} +!18 = metadata !{i32 786445, metadata !16, metadata !"isa", metadata !6, i32 0, i64 64, i64 0, i64 0, i32 0, metadata !19} ; [ DW_TAG_member ] [isa] [line 0, size 64, align 0, offset 0] [from ] +!19 = metadata !{i32 786447, null, metadata !"", null, i32 0, i64 64, i64 0, i64 0, i32 0, metadata !20} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 0, offset 0] [from objc_class] +!20 = metadata !{i32 786451, null, metadata !"objc_class", metadata !6, i32 0, i64 0, i64 0, i32 0, i32 4, null, null, i32 0} ; [ DW_TAG_structure_type ] [objc_class] [line 0, size 0, align 0, offset 0] [fwd] [from ] +!21 = metadata !{i32 786688, metadata !22, metadata !"ok", metadata !6, i32 13, metadata !23, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [ok] [line 13] +!22 = metadata !{i32 786443, metadata !13, i32 12, i32 0, metadata !6, i32 1} ; 
[ DW_TAG_lexical_block ] [/Volumes/Files/gottesmmcab/Radar/12906997/test.m] +!23 = metadata !{i32 786454, null, metadata !"BOOL", metadata !6, i32 62, i64 0, i64 0, i64 0, i32 0, metadata !24} ; [ DW_TAG_typedef ] [BOOL] [line 62, size 0, align 0, offset 0] [from signed char] +!24 = metadata !{i32 786468, null, metadata !"signed char", null, i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ] [signed char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char] +!25 = metadata !{i32 786688, metadata !26, metadata !"obj2", metadata !6, i32 15, metadata !14, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [obj2] [line 15] +!26 = metadata !{i32 786443, metadata !22, i32 14, i32 0, metadata !6, i32 2} ; [ DW_TAG_lexical_block ] [/Volumes/Files/gottesmmcab/Radar/12906997/test.m] +!27 = metadata !{i32 786478, i32 0, metadata !6, metadata !"ThrowFunc", metadata !"ThrowFunc", metadata !"", metadata !6, i32 4, metadata !28, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, void (i8*)* @ThrowFunc, null, null, metadata !30, i32 5} ; [ DW_TAG_subprogram ] [line 4] [local] [def] [scope 5] [ThrowFunc] +!28 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !29, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] +!29 = metadata !{null, metadata !14} +!30 = metadata !{metadata !31} +!31 = metadata !{metadata !32} +!32 = metadata !{i32 786689, metadata !27, metadata !"obj", metadata !6, i32 16777220, metadata !14, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [obj] [line 4] +!33 = metadata !{i32 1, metadata !"Objective-C Version", i32 2} +!34 = metadata !{i32 1, metadata !"Objective-C Image Info Version", i32 0} +!35 = metadata !{i32 1, metadata !"Objective-C Image Info Section", metadata !"__DATA, __objc_imageinfo, regular, no_dead_strip"} +!36 = metadata !{i32 4, metadata !"Objective-C Garbage Collection", i32 0} +!37 = metadata !{i32 11, i32 0, metadata !13, null} +!38 = metadata !{} +!39 = metadata !{i32 15, i32 0, metadata !26, null} +!40 = metadata !{i32 17, i32 0, metadata !41, null} +!41 = metadata !{i32 786443, metadata !26, i32 16, i32 0, metadata !6, i32 3} ; [ DW_TAG_lexical_block ] [/Volumes/Files/gottesmmcab/Radar/12906997/test.m] +!42 = metadata !{i32 22, i32 0, metadata !26, null} +!43 = metadata !{i32 23, i32 0, metadata !22, null} +!44 = metadata !{i32 19, i32 0, metadata !41, null} +!45 = metadata !{i8 0} +!46 = metadata !{i32 20, i32 0, metadata !47, null} +!47 = metadata !{i32 786443, metadata !48, i32 19, i32 0, metadata !6, i32 5} ; [ DW_TAG_lexical_block ] [/Volumes/Files/gottesmmcab/Radar/12906997/test.m] +!48 = metadata !{i32 786443, metadata !26, i32 19, i32 0, metadata !6, i32 4} ; [ DW_TAG_lexical_block ] [/Volumes/Files/gottesmmcab/Radar/12906997/test.m] +!49 = metadata !{i32 21, i32 0, metadata !47, null} +!50 = metadata !{i32 24, i32 0, metadata !51, null} +!51 = metadata !{i32 786443, metadata !22, i32 23, i32 0, metadata !6, i32 6} ; [ DW_TAG_lexical_block ] [/Volumes/Files/gottesmmcab/Radar/12906997/test.m] +!52 = metadata !{i32 25, i32 0, metadata !51, null} +!53 = metadata !{i32 27, i32 0, metadata !13, null} +!54 = metadata !{i32 28, i32 0, metadata !13, null} +!55 = metadata !{i32 4, i32 0, metadata !27, null} +!56 = metadata !{i32 6, i32 0, metadata !57, null} +!57 = metadata !{i32 786443, metadata !27, i32 5, i32 0, metadata !6, i32 7} ; [ DW_TAG_lexical_block ] [/Volumes/Files/gottesmmcab/Radar/12906997/test.m] +!58 = metadata !{i32 7, i32 0, metadata !57, null} diff 
--git a/test/Transforms/ObjCARC/escape.ll b/test/Transforms/ObjCARC/escape.ll index 3f694cf..8f252a0 100644 --- a/test/Transforms/ObjCARC/escape.ll +++ b/test/Transforms/ObjCARC/escape.ll @@ -10,8 +10,8 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 ; with the objc_storeWeak call. ; CHECK: define void @test0( -; CHECK: %tmp7 = call i8* @objc_retainBlock(i8* %tmp6) nounwind, !clang.arc.copy_on_escape !0 -; CHECK: call void @objc_release(i8* %tmp7) nounwind, !clang.imprecise_release !0 +; CHECK: %tmp7 = call i8* @objc_retainBlock(i8* %tmp6) [[NUW:#[0-9]+]], !clang.arc.copy_on_escape !0 +; CHECK: call void @objc_release(i8* %tmp7) [[NUW]], !clang.imprecise_release !0 ; CHECK: } define void @test0() nounwind { entry: @@ -129,3 +129,6 @@ declare i8* @not_really_objc_storeWeak(i8**, i8*) declare void @objc_release(i8*) !0 = metadata !{} + +; CHECK: attributes [[NUW]] = { nounwind } +; CHECK: attributes #1 = { nounwind ssp } diff --git a/test/Transforms/ObjCARC/gvn.ll b/test/Transforms/ObjCARC/gvn.ll index 6917b02..3648866 100644 --- a/test/Transforms/ObjCARC/gvn.ll +++ b/test/Transforms/ObjCARC/gvn.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -basicaa -objc-arc -gvn < %s | FileCheck %s +; RUN: opt -S -basicaa -objc-arc-aa -gvn < %s | FileCheck %s @x = common global i8* null, align 8 diff --git a/test/Transforms/ObjCARC/invoke.ll b/test/Transforms/ObjCARC/invoke.ll index 1a58e34..f528b4a 100644 --- a/test/Transforms/ObjCARC/invoke.ll +++ b/test/Transforms/ObjCARC/invoke.ll @@ -12,10 +12,10 @@ declare i8* @returner() ; CHECK: define void @test0( ; CHECK: invoke.cont: -; CHECK: call void @objc_release(i8* %zipFile) nounwind, !clang.imprecise_release !0 +; CHECK: call void @objc_release(i8* %zipFile) [[NUW:#[0-9]+]], !clang.imprecise_release !0 ; CHECK: ret void ; CHECK: lpad: -; CHECK: call void @objc_release(i8* %zipFile) nounwind, !clang.imprecise_release !0 +; CHECK: call void @objc_release(i8* %zipFile) [[NUW]], !clang.imprecise_release !0 ; CHECK: ret void define void @test0(i8* %zipFile) { entry: @@ -39,11 +39,11 @@ lpad: ; preds = %entry ; CHECK: define void @test1( ; CHECK: invoke.cont: -; CHECK: call void @objc_release(i8* %zipFile) nounwind, !clang.imprecise_release !0 +; CHECK: call void @objc_release(i8* %zipFile) [[NUW]], !clang.imprecise_release !0 ; CHECK: call void @callee() ; CHECK: br label %done ; CHECK: lpad: -; CHECK: call void @objc_release(i8* %zipFile) nounwind, !clang.imprecise_release !0 +; CHECK: call void @objc_release(i8* %zipFile) [[NUW]], !clang.imprecise_release !0 ; CHECK: call void @callee() ; CHECK: br label %done ; CHECK: done: @@ -108,7 +108,7 @@ finally.rethrow: ; preds = %invoke.cont, %entry ; CHECK: define void @test3( ; CHECK: if.end: -; CHECK-NEXT: call void @objc_release(i8* %p) nounwind +; CHECK-NEXT: call void @objc_release(i8* %p) [[NUW]] ; CHECK-NEXT: ret void define void @test3(i8* %p, i1 %b) { entry: @@ -140,10 +140,10 @@ if.end: ; CHECK: lpad: ; CHECK-NEXT: %r = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__objc_personality_v0 to i8*) ; CHECK-NEXT: cleanup -; CHECK-NEXT: call void @objc_release(i8* %p) nounwind +; CHECK-NEXT: call void @objc_release(i8* %p) [[NUW]] ; CHECK-NEXT: ret void ; CHECK: if.end: -; CHECK-NEXT: call void @objc_release(i8* %p) nounwind +; CHECK-NEXT: call void @objc_release(i8* %p) [[NUW]] ; CHECK-NEXT: ret void define void @test4(i8* %p, i1 %b) { entry: @@ -215,4 +215,6 @@ if.end: declare i32 @__gxx_personality_v0(...) declare i32 @__objc_personality_v0(...) 
+; CHECK: attributes [[NUW]] = { nounwind } + !0 = metadata !{} diff --git a/test/Transforms/ObjCARC/move-and-form-retain-autorelease.ll b/test/Transforms/ObjCARC/move-and-form-retain-autorelease.ll index 170d0a9..5d05825 100644 --- a/test/Transforms/ObjCARC/move-and-form-retain-autorelease.ll +++ b/test/Transforms/ObjCARC/move-and-form-retain-autorelease.ll @@ -4,7 +4,7 @@ ; and various scary looking things and fold it into an objc_retainAutorelease. ; CHECK: bb57: -; CHECK: tail call i8* @objc_retainAutorelease(i8* %tmp71x) nounwind +; CHECK: tail call i8* @objc_retainAutorelease(i8* %tmp71x) [[NUW:#[0-9]+]] ; CHECK: bb99: target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" @@ -212,10 +212,12 @@ bb99: ; preds = %bb57 br label %bb104 bb104: ; preds = %bb99, %bb57 - %tmp105 = tail call i8* @objc_autorelease(i8* %tmp72) nounwind + %tmp105 = call i8* @objc_autorelease(i8* %tmp72) nounwind %tmp106 = bitcast i8* %tmp105 to %14* tail call void @objc_release(i8* %tmp85) nounwind %tmp107 = bitcast %18* %tmp47 to i8* tail call void @objc_release(i8* %tmp107) nounwind ret %14* %tmp106 } + +; CHECK: attributes [[NUW]] = { nounwind } diff --git a/test/Transforms/ObjCARC/nested.ll b/test/Transforms/ObjCARC/nested.ll index 32be03e..ca9c58b 100644 --- a/test/Transforms/ObjCARC/nested.ll +++ b/test/Transforms/ObjCARC/nested.ll @@ -770,9 +770,9 @@ forcoll.empty: @__block_d_tmp5 = external hidden constant { i64, i64, i8*, i8*, i8*, i8* } ; CHECK: define void @test11( -; CHECK: tail call i8* @objc_retain(i8* %call) nounwind -; CHECK: tail call i8* @objc_retain(i8* %call) nounwind -; CHECK: call void @objc_release(i8* %call) nounwind, !clang.imprecise_release !0 +; CHECK: tail call i8* @objc_retain(i8* %call) [[NUW:#[0-9]+]] +; CHECK: tail call i8* @objc_retain(i8* %call) [[NUW]] +; CHECK: call void @objc_release(i8* %call) [[NUW]], !clang.imprecise_release !0 ; CHECK: } define void @test11() { entry: @@ -820,3 +820,6 @@ entry: call void @objc_release(i8* %call) nounwind, !clang.imprecise_release !0 ret void } + +; CHECK: attributes [[NUW]] = { nounwind } +; CHECK: attributes #1 = { nonlazybind } diff --git a/test/Transforms/ObjCARC/retain-block-alloca.ll b/test/Transforms/ObjCARC/retain-block-alloca.ll index 01f2087..f40be23 100644 --- a/test/Transforms/ObjCARC/retain-block-alloca.ll +++ b/test/Transforms/ObjCARC/retain-block-alloca.ll @@ -9,7 +9,7 @@ @"\01L_OBJC_SELECTOR_REFERENCES_" = external hidden global i8*, section "__DATA, __objc_selrefs, literal_pointers, no_dead_strip" ; CHECK: define void @test( -; CHECK: %3 = call i8* @objc_retainBlock(i8* %2) nounwind +; CHECK: %3 = call i8* @objc_retainBlock(i8* %2) [[NUW:#[0-9]+]] ; CHECK: @objc_msgSend ; CHECK-NEXT: @objc_release(i8* %3) define void @test(%0* %array) uwtable { @@ -87,4 +87,8 @@ declare i8* @objc_msgSend(i8*, i8*, ...) nonlazybind declare void @objc_release(i8*) +; CHECK: attributes #0 = { uwtable } +; CHECK: attributes #1 = { nonlazybind } +; CHECK: attributes [[NUW]] = { nounwind } + !0 = metadata !{} diff --git a/test/Transforms/ObjCARC/retain-block-side-effects.ll b/test/Transforms/ObjCARC/retain-block-side-effects.ll index e84d48f..7fa73cb 100644 --- a/test/Transforms/ObjCARC/retain-block-side-effects.ll +++ b/test/Transforms/ObjCARC/retain-block-side-effects.ll @@ -4,7 +4,7 @@ ; objc_retainBlock stores into %repeater so the load from after the ; call isn't forwardable from the store before the call. 
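The comment above is a mod/ref fact: forwarding the stored value to the later load is only legal if nothing between the two instructions may write the loaded location, and objc_retainBlock may. A minimal sketch of that legality check against the AliasAnalysis interface of this era (the helper and its caller are hypothetical; getLocation and getModRefInfo are the real API):

#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// Can the value stored by SI be forwarded to LI? Assumes SI and LI are
// in the same basic block with SI first; scans every instruction in
// between for a potential write to the loaded location.
static bool canForwardStoreToLoad(StoreInst *SI, LoadInst *LI,
                                  AliasAnalysis &AA) {
  AliasAnalysis::Location Loc = AA.getLocation(LI);
  for (BasicBlock::iterator I = SI, E = LI; ++I != E; )
    if (AA.getModRefInfo(I, Loc) & AliasAnalysis::Mod)
      return false; // e.g. objc_retainBlock may store into *%repeater
  return true;
}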
-; CHECK: %tmp16 = call i8* @objc_retainBlock(i8* %tmp15) nounwind +; CHECK: %tmp16 = call i8* @objc_retainBlock(i8* %tmp15) [[NUW:#[0-9]+]] ; CHECK: %tmp17 = bitcast i8* %tmp16 to void ()* ; CHECK: %tmp18 = load %struct.__block_byref_repeater** %byref.forwarding, align 8 ; CHECK: %repeater12 = getelementptr inbounds %struct.__block_byref_repeater* %tmp18, i64 0, i32 6 @@ -37,3 +37,6 @@ entry: } declare i8* @objc_retainBlock(i8*) + +; CHECK: attributes #0 = { noreturn } +; CHECK: attributes [[NUW]] = { nounwind } diff --git a/test/Transforms/ObjCARC/retain-block.ll b/test/Transforms/ObjCARC/retain-block.ll index b3b62d3..ee57049 100644 --- a/test/Transforms/ObjCARC/retain-block.ll +++ b/test/Transforms/ObjCARC/retain-block.ll @@ -28,8 +28,8 @@ entry: ; optimization possible. ; CHECK: define void @test0_no_metadata(i8* %tmp) { -; CHECK: %tmp2 = tail call i8* @objc_retainBlock(i8* %tmp) nounwind -; CHECK: tail call void @objc_release(i8* %tmp2) nounwind, !clang.imprecise_release !0 +; CHECK: %tmp2 = tail call i8* @objc_retainBlock(i8* %tmp) [[NUW:#[0-9]+]] +; CHECK: tail call void @objc_release(i8* %tmp2) [[NUW]], !clang.imprecise_release !0 ; CHECK: } define void @test0_no_metadata(i8* %tmp) { entry: @@ -43,8 +43,8 @@ entry: ; optimization possible. ; CHECK: define void @test0_escape(i8* %tmp, i8** %z) { -; CHECK: %tmp2 = tail call i8* @objc_retainBlock(i8* %tmp) nounwind, !clang.arc.copy_on_escape !0 -; CHECK: tail call void @objc_release(i8* %tmp2) nounwind, !clang.imprecise_release !0 +; CHECK: %tmp2 = tail call i8* @objc_retainBlock(i8* %tmp) [[NUW]], !clang.arc.copy_on_escape !0 +; CHECK: tail call void @objc_release(i8* %tmp2) [[NUW]], !clang.imprecise_release !0 ; CHECK: } define void @test0_escape(i8* %tmp, i8** %z) { entry: @@ -58,8 +58,8 @@ entry: ; Same as test0_escape, but there's no intervening call. 
; CHECK: define void @test0_just_escape(i8* %tmp, i8** %z) { -; CHECK: %tmp2 = tail call i8* @objc_retainBlock(i8* %tmp) nounwind, !clang.arc.copy_on_escape !0 -; CHECK: tail call void @objc_release(i8* %tmp2) nounwind, !clang.imprecise_release !0 +; CHECK: %tmp2 = tail call i8* @objc_retainBlock(i8* %tmp) [[NUW]], !clang.arc.copy_on_escape !0 +; CHECK: tail call void @objc_release(i8* %tmp2) [[NUW]], !clang.imprecise_release !0 ; CHECK: } define void @test0_just_escape(i8* %tmp, i8** %z) { entry: @@ -73,9 +73,9 @@ entry: ; CHECK: define void @test1(i8* %tmp) { ; CHECK-NOT: @objc -; CHECK: tail call i8* @objc_retain(i8* %tmp) nounwind +; CHECK: tail call i8* @objc_retain(i8* %tmp) [[NUW]] ; CHECK-NOT: @objc -; CHECK: tail call void @objc_release(i8* %tmp) nounwind, !clang.imprecise_release !0 +; CHECK: tail call void @objc_release(i8* %tmp) [[NUW]], !clang.imprecise_release !0 ; CHECK-NOT: @objc ; CHECK: } define void @test1(i8* %tmp) { @@ -95,10 +95,10 @@ entry: ; CHECK: define void @test1_no_metadata(i8* %tmp) { ; CHECK-NEXT: entry: -; CHECK-NEXT: tail call i8* @objc_retainBlock(i8* %tmp) nounwind +; CHECK-NEXT: tail call i8* @objc_retainBlock(i8* %tmp) [[NUW]] ; CHECK-NEXT: @use_pointer(i8* %tmp2) ; CHECK-NEXT: @use_pointer(i8* %tmp2) -; CHECK-NEXT: tail call void @objc_release(i8* %tmp) nounwind, !clang.imprecise_release !0 +; CHECK-NEXT: tail call void @objc_release(i8* %tmp) [[NUW]], !clang.imprecise_release !0 ; CHECK-NOT: @objc ; CHECK: } define void @test1_no_metadata(i8* %tmp) { @@ -118,11 +118,11 @@ entry: ; CHECK: define void @test1_escape(i8* %tmp, i8** %z) { ; CHECK-NEXT: entry: -; CHECK-NEXT: %tmp2 = tail call i8* @objc_retainBlock(i8* %tmp) nounwind, !clang.arc.copy_on_escape !0 +; CHECK-NEXT: %tmp2 = tail call i8* @objc_retainBlock(i8* %tmp) [[NUW]], !clang.arc.copy_on_escape !0 ; CHECK-NEXT: store i8* %tmp2, i8** %z ; CHECK-NEXT: @use_pointer(i8* %tmp2) ; CHECK-NEXT: @use_pointer(i8* %tmp2) -; CHECK-NEXT: tail call void @objc_release(i8* %tmp) nounwind, !clang.imprecise_release !0 +; CHECK-NEXT: tail call void @objc_release(i8* %tmp) [[NUW]], !clang.imprecise_release !0 ; CHECK-NOT: @objc ; CHECK: } define void @test1_escape(i8* %tmp, i8** %z) { @@ -136,3 +136,5 @@ entry: tail call void @objc_release(i8* %tmp) nounwind, !clang.imprecise_release !0 ret void } + +; CHECK: attributes [[NUW]] = { nounwind } diff --git a/test/Transforms/ObjCARC/retain-not-declared.ll b/test/Transforms/ObjCARC/retain-not-declared.ll index f876e51..e834179 100644 --- a/test/Transforms/ObjCARC/retain-not-declared.ll +++ b/test/Transforms/ObjCARC/retain-not-declared.ll @@ -13,7 +13,7 @@ declare void @objc_release(i8*) ; CHECK: define i8* @test0(i8* %p) { ; CHECK-NEXT: entry: -; CHECK-NEXT: %0 = tail call i8* @objc_retainAutoreleaseReturnValue(i8* %p) nounwind +; CHECK-NEXT: %0 = tail call i8* @objc_retainAutoreleaseReturnValue(i8* %p) [[NUW:#[0-9]+]] ; CHECK-NEXT: ret i8* %0 ; CHECK-NEXT: } @@ -65,3 +65,5 @@ lpad100: ; preds = %invoke.cont93 declare i32 @__gxx_personality_v0(...) 
!0 = metadata !{} + +; CHECK: attributes [[NUW]] = { nounwind } diff --git a/test/Transforms/ObjCARC/rle-s2l.ll b/test/Transforms/ObjCARC/rle-s2l.ll index 8f8d5c0..2865c94 100644 --- a/test/Transforms/ObjCARC/rle-s2l.ll +++ b/test/Transforms/ObjCARC/rle-s2l.ll @@ -57,7 +57,7 @@ define void @test2(i8** %p) { ; CHECK: define void @test3(i8** %p) { ; CHECK-NEXT: %x = call i8* @objc_loadWeak(i8** %p) -; CHECK-NEXT: call void @use_pointer(i8* %x) readonly +; CHECK-NEXT: call void @use_pointer(i8* %x) [[RO:#[0-9]+]] ; CHECK-NEXT: %1 = tail call i8* @objc_retain(i8* %x) ; CHECK-NEXT: call void @use_pointer(i8* %x) ; CHECK-NEXT: ret void @@ -74,7 +74,7 @@ define void @test3(i8** %p) { ; CHECK: define void @test4(i8** %p) { ; CHECK-NEXT: %x = call i8* @objc_loadWeak(i8** %p) -; CHECK-NEXT: call void @use_pointer(i8* %x) readonly +; CHECK-NEXT: call void @use_pointer(i8* %x) [[RO]] ; CHECK-NEXT: call void @callee() ; CHECK-NEXT: %y = call i8* @objc_loadWeak(i8** %p) ; CHECK-NEXT: call void @use_pointer(i8* %y) @@ -133,3 +133,6 @@ define void @test7(i8** %p, i8* %n, i8** %q, i8* %m) { call void @use_pointer(i8* %y) ret void } + +; CHECK: attributes #0 = { nounwind } +; CHECK: attributes [[RO]] = { readonly } diff --git a/test/Transforms/ObjCARC/rv.ll b/test/Transforms/ObjCARC/rv.ll index 9353a19..a2fef96 100644 --- a/test/Transforms/ObjCARC/rv.ll +++ b/test/Transforms/ObjCARC/rv.ll @@ -29,7 +29,7 @@ declare i8* @returner() ; CHECK: define void @test0( ; CHECK-NEXT: entry: ; CHECK-NEXT: %x = call i8* @returner -; CHECK-NEXT: %0 = tail call i8* @objc_retainAutoreleasedReturnValue(i8* %x) nounwind +; CHECK-NEXT: %0 = tail call i8* @objc_retainAutoreleasedReturnValue(i8* %x) [[NUW:#[0-9]+]] ; CHECK: t: ; CHECK-NOT: @objc_ ; CHECK: return: @@ -150,7 +150,7 @@ define void @test8() { ; Don't apply the RV optimization to autorelease if there's no retain. ; CHECK: define i8* @test9(i8* %p) -; CHECK: tail call i8* @objc_autorelease(i8* %p) +; CHECK: call i8* @objc_autorelease(i8* %p) define i8* @test9(i8* %p) { call i8* @objc_autorelease(i8* %p) ret i8* %p @@ -159,8 +159,8 @@ define i8* @test9(i8* %p) { ; Apply the RV optimization. ; CHECK: define i8* @test10(i8* %p) -; CHECK: tail call i8* @objc_retain(i8* %p) nounwind -; CHECK: tail call i8* @objc_autoreleaseReturnValue(i8* %p) nounwind +; CHECK: tail call i8* @objc_retain(i8* %p) [[NUW]] +; CHECK: tail call i8* @objc_autoreleaseReturnValue(i8* %p) [[NUW]] ; CHECK-NEXT: ret i8* %p define i8* @test10(i8* %p) { %1 = call i8* @objc_retain(i8* %p) @@ -174,7 +174,7 @@ define i8* @test10(i8* %p) { ; CHECK: define i8* @test11(i8* %p) ; CHECK: tail call i8* @objc_retain(i8* %p) ; CHECK-NEXT: call void @use_pointer(i8* %p) -; CHECK: tail call i8* @objc_autorelease(i8* %p) +; CHECK: call i8* @objc_autorelease(i8* %p) ; CHECK-NEXT: ret i8* %p define i8* @test11(i8* %p) { %1 = call i8* @objc_retain(i8* %p) @@ -201,7 +201,7 @@ define i8* @test12(i8* %p) { ; CHECK: define i8* @test13( ; CHECK: tail call i8* @objc_retainAutoreleasedReturnValue(i8* %p) -; CHECK: tail call i8* @objc_autorelease(i8* %p) +; CHECK: call i8* @objc_autorelease(i8* %p) ; CHECK: ret i8* %p define i8* @test13() { %p = call i8* @returner() @@ -215,7 +215,7 @@ define i8* @test13() { ; argument is not a return value. 
; CHECK: define void @test14( -; CHECK-NEXT: tail call i8* @objc_retain(i8* %p) nounwind +; CHECK-NEXT: tail call i8* @objc_retain(i8* %p) [[NUW]] ; CHECK-NEXT: ret void define void @test14(i8* %p) { call i8* @objc_retainAutoreleasedReturnValue(i8* %p) @@ -227,7 +227,7 @@ define void @test14(i8* %p) { ; CHECK: define void @test15( ; CHECK-NEXT: %y = call i8* @returner() -; CHECK-NEXT: tail call i8* @objc_retainAutoreleasedReturnValue(i8* %y) nounwind +; CHECK-NEXT: tail call i8* @objc_retainAutoreleasedReturnValue(i8* %y) [[NUW]] ; CHECK-NEXT: ret void define void @test15() { %y = call i8* @returner() @@ -240,7 +240,7 @@ define void @test15() { ; CHECK: define void @test16( ; CHECK-NEXT: %y = call i8* @returner() -; CHECK-NEXT: tail call i8* @objc_retainAutoreleasedReturnValue(i8* %y) nounwind +; CHECK-NEXT: tail call i8* @objc_retainAutoreleasedReturnValue(i8* %y) [[NUW]] ; CHECK-NEXT: ret void define void @test16() { %y = call i8* @returner() @@ -252,7 +252,7 @@ define void @test16() { ; argument is not a return value. ; CHECK: define void @test17( -; CHECK-NEXT: tail call i8* @objc_retain(i8* %y) nounwind +; CHECK-NEXT: tail call i8* @objc_retain(i8* %y) [[NUW]] ; CHECK-NEXT: ret void define void @test17(i8* %y) { call i8* @objc_retain(i8* %y) @@ -265,7 +265,7 @@ define void @test17(i8* %y) { ; CHECK: define void @test18( ; CHECK-NEXT: %y = call i8* @returner() ; CHECK-NEXT: call void @callee() -; CHECK-NEXT: tail call i8* @objc_retain(i8* %y) nounwind +; CHECK-NEXT: tail call i8* @objc_retain(i8* %y) [[NUW]] ; CHECK-NEXT: ret void define void @test18() { %y = call i8* @returner() @@ -323,7 +323,7 @@ define i8* @test22(i8* %p) { ; Convert autoreleaseRV to autorelease. ; CHECK: define void @test23( -; CHECK: tail call i8* @objc_autorelease(i8* %p) nounwind +; CHECK: call i8* @objc_autorelease(i8* %p) [[NUW]] define void @test23(i8* %p) { store i8 0, i8* %p call i8* @objc_autoreleaseReturnValue(i8* %p) @@ -340,3 +340,5 @@ define {}* @test24(i8* %p) { %s = bitcast i8* %p to {}* ret {}* %s } + +; CHECK: attributes [[NUW]] = { nounwind } diff --git a/test/Transforms/ObjCARC/split-backedge.ll b/test/Transforms/ObjCARC/split-backedge.ll index 08e2dce..5ac278a 100644 --- a/test/Transforms/ObjCARC/split-backedge.ll +++ b/test/Transforms/ObjCARC/split-backedge.ll @@ -4,12 +4,12 @@ ; rdar://11256239 ; CHECK: define void @test0 -; CHECK: call i8* @objc_retain(i8* %call) nounwind -; CHECK: call i8* @objc_retain(i8* %call) nounwind -; CHECK: call i8* @objc_retain(i8* %cond) nounwind -; CHECK: call void @objc_release(i8* %call) nounwind -; CHECK: call void @objc_release(i8* %call) nounwind -; CHECK: call void @objc_release(i8* %cond) nounwind +; CHECK: call i8* @objc_retain(i8* %call) [[NUW:#[0-9]+]] +; CHECK: call i8* @objc_retain(i8* %call) [[NUW]] +; CHECK: call i8* @objc_retain(i8* %cond) [[NUW]] +; CHECK: call void @objc_release(i8* %call) [[NUW]] +; CHECK: call void @objc_release(i8* %call) [[NUW]] +; CHECK: call void @objc_release(i8* %cond) [[NUW]] define void @test0() { entry: br label %while.body @@ -46,3 +46,5 @@ declare i8* @objc_retain(i8*) declare void @use_pointer(i8*) !0 = metadata !{} + +; CHECK: attributes [[NUW]] = { nounwind } diff --git a/test/Transforms/ObjCARC/tail-call-invariant-enforcement.ll b/test/Transforms/ObjCARC/tail-call-invariant-enforcement.ll new file mode 100644 index 0000000..74ac97c --- /dev/null +++ b/test/Transforms/ObjCARC/tail-call-invariant-enforcement.ll @@ -0,0 +1,84 @@ +; RUN: opt -objc-arc -S < %s | FileCheck %s + +declare i8* @objc_release(i8* %x) 
+declare i8* @objc_retain(i8* %x) +declare i8* @objc_autorelease(i8* %x) +declare i8* @objc_autoreleaseReturnValue(i8* %x) +declare i8* @objc_retainAutoreleasedReturnValue(i8* %x) + +; Never tail call objc_autorelease. +define i8* @test0(i8* %x) { +entry: + ; CHECK: %tmp0 = call i8* @objc_autorelease(i8* %x) + %tmp0 = call i8* @objc_autorelease(i8* %x) + ; CHECK: %tmp1 = call i8* @objc_autorelease(i8* %x) + %tmp1 = tail call i8* @objc_autorelease(i8* %x) + + ret i8* %x +} + +; Always tail call autoreleaseReturnValue. +define i8* @test1(i8* %x) { +entry: + ; CHECK: %tmp0 = tail call i8* @objc_autoreleaseReturnValue(i8* %x) + %tmp0 = call i8* @objc_autoreleaseReturnValue(i8* %x) + ; CHECK: %tmp1 = tail call i8* @objc_autoreleaseReturnValue(i8* %x) + %tmp1 = tail call i8* @objc_autoreleaseReturnValue(i8* %x) + ret i8* %x +} + +; Always tail call objc_retain. +define i8* @test2(i8* %x) { +entry: + ; CHECK: %tmp0 = tail call i8* @objc_retain(i8* %x) + %tmp0 = call i8* @objc_retain(i8* %x) + ; CHECK: %tmp1 = tail call i8* @objc_retain(i8* %x) + %tmp1 = tail call i8* @objc_retain(i8* %x) + ret i8* %x +} + +define i8* @tmp(i8* %x) { + ret i8* %x +} + +; Always tail call objc_retainAutoreleasedReturnValue. +define i8* @test3(i8* %x) { +entry: + %y = call i8* @tmp(i8* %x) + ; CHECK: %tmp0 = tail call i8* @objc_retainAutoreleasedReturnValue(i8* %y) + %tmp0 = call i8* @objc_retainAutoreleasedReturnValue(i8* %y) + %z = call i8* @tmp(i8* %x) + ; CHECK: %tmp1 = tail call i8* @objc_retainAutoreleasedReturnValue(i8* %z) + %tmp1 = tail call i8* @objc_retainAutoreleasedReturnValue(i8* %z) + ret i8* %x +} + +; By itself, we should never change whether or not objc_release is tail called. +define i8* @test4(i8* %x) { +entry: + ; CHECK: %tmp0 = call i8* @objc_release(i8* %x) + %tmp0 = call i8* @objc_release(i8* %x) + ; CHECK: %tmp1 = tail call i8* @objc_release(i8* %x) + %tmp1 = tail call i8* @objc_release(i8* %x) + ret i8* %x +} + +; If we convert a tail called @objc_autoreleaseReturnValue to an +; @objc_autorelease, ensure that the tail call is removed. +define i8* @test5(i8* %x) { +entry: + ; CHECK: %tmp0 = call i8* @objc_autorelease(i8* %x) + %tmp0 = tail call i8* @objc_autoreleaseReturnValue(i8* %x) + ret i8* %tmp0 +} + +; If we convert a called @objc_autorelease to an @objc_autoreleaseReturnValue, +; ensure that the tail call is added. +define i8* @test6(i8* %x) { +entry: + ; CHECK: %tmp0 = tail call i8* @objc_retain(i8* %x) + %tmp0 = tail call i8* @objc_retain(i8* %x) + ; CHECK: %tmp1 = tail call i8* @objc_autoreleaseReturnValue(i8* %x) + %tmp1 = call i8* @objc_autorelease(i8* %x) + ret i8* %x +} diff --git a/test/Transforms/ObjCARC/weak-copies.ll b/test/Transforms/ObjCARC/weak-copies.ll index e1a94bb..5dab4e0 100644 --- a/test/Transforms/ObjCARC/weak-copies.ll +++ b/test/Transforms/ObjCARC/weak-copies.ll @@ -19,7 +19,7 @@ target triple = "x86_64-apple-darwin11.0.0" ; CHECK: define void @foo() { ; CHECK-NEXT: entry: ; CHECK-NEXT: %call = call i8* @bar() -; CHECK-NEXT: call void @use(i8* %call) nounwind +; CHECK-NEXT: call void @use(i8* %call) [[NUW:#[0-9]+]] ; CHECK-NEXT: ret void ; CHECK-NEXT: } define void @foo() { @@ -39,7 +39,7 @@ entry: ; Eliminate unnecessary weak pointer copies in a block initialization. 
-; CHECK: define void @qux(i8* %me) nounwind { +; CHECK: define void @qux(i8* %me) #0 { ; CHECK-NEXT: entry: ; CHECK-NEXT: %block = alloca %1, align 8 ; CHECK-NOT: alloca @@ -84,4 +84,6 @@ declare i8* @objc_loadWeak(i8**) declare void @use(i8*) nounwind declare void @objc_destroyWeak(i8**) +; CHECK: attributes [[NUW]] = { nounwind } + !0 = metadata !{} diff --git a/test/Transforms/ScalarRepl/2003-09-12-IncorrectPromote.ll b/test/Transforms/ScalarRepl/2003-09-12-IncorrectPromote.ll index 0b5e415..3f28cb1 100644 --- a/test/Transforms/ScalarRepl/2003-09-12-IncorrectPromote.ll +++ b/test/Transforms/ScalarRepl/2003-09-12-IncorrectPromote.ll @@ -1,7 +1,6 @@ ; Scalar replacement was incorrectly promoting this alloca!! ; -; RUN: opt < %s -scalarrepl -S | \ -; RUN: sed "s/;.*//g" | grep "\[" +; RUN: opt < %s -scalarrepl -S | FileCheck %s define i8* @test() { %A = alloca [30 x i8] ; <[30 x i8]*> [#uses=1] @@ -10,4 +9,4 @@ define i8* @test() { store i8 0, i8* %B ret i8* %C } - +; CHECK: alloca [ diff --git a/test/Transforms/ScalarRepl/phi-cycle.ll b/test/Transforms/ScalarRepl/phi-cycle.ll index cb5101c..05d9382 100644 --- a/test/Transforms/ScalarRepl/phi-cycle.ll +++ b/test/Transforms/ScalarRepl/phi-cycle.ll @@ -67,7 +67,7 @@ while.cond.backedge.i: ; preds = %if.end.i, %while.bo ; CHECK: func.exit: ; CHECK-NOT: load -; CHECK: %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([6 x i8]* @.str, i64 0, i64 0), i32 %tmp) nounwind +; CHECK: %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([6 x i8]* @.str, i64 0, i64 0), i32 %tmp) [[NUW:#[0-9]+]] func.exit: ; preds = %while.body.i.func.exit_crit_edge, %while.cond.i.func.exit_crit_edge %tmp3 = load i32* %x.i, align 4 %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([6 x i8]* @.str, i64 0, i64 0), i32 %tmp3) nounwind @@ -75,3 +75,6 @@ func.exit: ; preds = %while.body.i.func.e } declare i32 @printf(i8* nocapture, ...) nounwind + +; CHECK: attributes #0 = { nounwind uwtable } +; CHECK: attributes [[NUW]] = { nounwind } diff --git a/test/Transforms/ScalarRepl/volatile.ll b/test/Transforms/ScalarRepl/volatile.ll index 056526c..d506cdf 100644 --- a/test/Transforms/ScalarRepl/volatile.ll +++ b/test/Transforms/ScalarRepl/volatile.ll @@ -1,12 +1,13 @@ -; RUN: opt < %s -scalarrepl -S | grep "load volatile" -; RUN: opt < %s -scalarrepl -S | grep "store volatile" +; RUN: opt < %s -scalarrepl -S | FileCheck %s define i32 @voltest(i32 %T) { %A = alloca {i32, i32} %B = getelementptr {i32,i32}* %A, i32 0, i32 0 store volatile i32 %T, i32* %B +; CHECK: store volatile %C = getelementptr {i32,i32}* %A, i32 0, i32 1 %X = load volatile i32* %C +; CHECK: load volatile ret i32 %X } diff --git a/test/Transforms/SimplifyCFG/2002-05-05-EmptyBlockMerge.ll b/test/Transforms/SimplifyCFG/EmptyBlockMerge.ll index feffb4e..aba08dc 100644 --- a/test/Transforms/SimplifyCFG/2002-05-05-EmptyBlockMerge.ll +++ b/test/Transforms/SimplifyCFG/EmptyBlockMerge.ll @@ -1,8 +1,6 @@ ; Basic block #2 should not be merged into BB #3! 
; -; RUN: opt < %s -simplifycfg -S | \ -; RUN: grep "br label" -; +; RUN: opt < %s -simplifycfg -S | FileCheck %s declare void @foo() @@ -13,6 +11,7 @@ bb0: br i1 %cond218, label %bb3, label %bb2 bb2: ; preds = %bb0 call void @foo( ) +; CHECK: br label %bb3 br label %bb3 bb3: ; preds = %bb2, %bb0 %reg117 = phi i32 [ 110, %bb2 ], [ %reg108, %bb0 ] ; <i32> [#uses=1] diff --git a/test/Transforms/SimplifyCFG/2002-06-24-PHINode.ll b/test/Transforms/SimplifyCFG/PHINode.ll index 88f32bc..25a242a 100644 --- a/test/Transforms/SimplifyCFG/2002-06-24-PHINode.ll +++ b/test/Transforms/SimplifyCFG/PHINode.ll @@ -1,10 +1,11 @@ ; -simplifycfg is not folding blocks if there is a PHI node involved. This ; should be fixed eventually -; RUN: opt < %s -simplifycfg -S | not grep br +; RUN: opt < %s -simplifycfg -S | FileCheck %s define i32 @main(i32 %argc) { ; <label>:0 +; CHECK-NOT: br label %InlinedFunctionReturnNode br label %InlinedFunctionReturnNode InlinedFunctionReturnNode: ; preds = %0 %X = phi i32 [ 7, %0 ] ; <i32> [#uses=1] diff --git a/test/Transforms/SimplifyCFG/SpeculativeExec.ll b/test/Transforms/SimplifyCFG/SpeculativeExec.ll index a61867f..dd2e5d1 100644 --- a/test/Transforms/SimplifyCFG/SpeculativeExec.ll +++ b/test/Transforms/SimplifyCFG/SpeculativeExec.ll @@ -44,3 +44,44 @@ join: ret i8 %c } +define i8* @test4(i1* %dummy, i8* %a, i8* %b) { +; Test that we don't speculate an arbitrarily large number of unfolded constant +; expressions. +; CHECK: @test4 + +entry: + %cond1 = load volatile i1* %dummy + br i1 %cond1, label %if, label %end + +if: + %cond2 = load volatile i1* %dummy + br i1 %cond2, label %then, label %end + +then: + br label %end + +end: + %x1 = phi i8* [ %a, %entry ], [ %b, %if ], [ inttoptr (i64 1 to i8*), %then ] + %x2 = phi i8* [ %a, %entry ], [ %b, %if ], [ inttoptr (i64 2 to i8*), %then ] + %x3 = phi i8* [ %a, %entry ], [ %b, %if ], [ inttoptr (i64 3 to i8*), %then ] + %x4 = phi i8* [ %a, %entry ], [ %b, %if ], [ inttoptr (i64 4 to i8*), %then ] + %x5 = phi i8* [ %a, %entry ], [ %b, %if ], [ inttoptr (i64 5 to i8*), %then ] + %x6 = phi i8* [ %a, %entry ], [ %b, %if ], [ inttoptr (i64 6 to i8*), %then ] + %x7 = phi i8* [ %a, %entry ], [ %b, %if ], [ inttoptr (i64 7 to i8*), %then ] + %x8 = phi i8* [ %a, %entry ], [ %b, %if ], [ inttoptr (i64 8 to i8*), %then ] + %x9 = phi i8* [ %a, %entry ], [ %b, %if ], [ inttoptr (i64 9 to i8*), %then ] + %x10 = phi i8* [ %a, %entry ], [ %b, %if ], [ inttoptr (i64 10 to i8*), %then ] +; CHECK-NOT: select +; CHECK: phi i8* +; CHECK: phi i8* +; CHECK: phi i8* +; CHECK: phi i8* +; CHECK: phi i8* +; CHECK: phi i8* +; CHECK: phi i8* +; CHECK: phi i8* +; CHECK: phi i8* +; CHECK: phi i8* + + ret i8* %x10 +} diff --git a/test/Transforms/SimplifyCFG/switch-on-const-select.ll b/test/Transforms/SimplifyCFG/switch-on-const-select.ll index 673a62b..9cd709f 100644 --- a/test/Transforms/SimplifyCFG/switch-on-const-select.ll +++ b/test/Transforms/SimplifyCFG/switch-on-const-select.ll @@ -35,7 +35,7 @@ define i32 @bar(i64 %x, i64 %y) nounwind { ; CHECK: @bar entry: ; CHECK-NEXT: entry: -; CHECK-NEXT: tail call void @bees.a() nounwind +; CHECK-NEXT: tail call void @bees.a() [[NUW:#[0-9]+]] ; CHECK-NEXT: ret i32 0 %lt = icmp slt i64 %x, %y %qux = select i1 %lt, i32 0, i32 2 @@ -61,7 +61,7 @@ define void @bazz(i64 %x, i64 %y) nounwind { ; CHECK: @bazz entry: ; CHECK-NEXT: entry: -; CHECK-NEXT: tail call void @bees.b() nounwind +; CHECK-NEXT: tail call void @bees.b() [[NUW]] ; CHECK-NEXT: ret void %lt = icmp slt i64 %x, %y %qux = select i1 %lt, i32 10, i32 12 @@ 
-86,7 +86,7 @@ define void @quux(i64 %x, i64 %y) nounwind { ; CHECK: @quux entry: ; CHECK-NEXT: entry: -; CHECK-NEXT: tail call void @bees.a() nounwind +; CHECK-NEXT: tail call void @bees.a() [[NUW]] ; CHECK-NEXT: ret void %lt = icmp slt i64 %x, %y %qux = select i1 %lt, i32 0, i32 0 @@ -136,3 +136,6 @@ bees: declare void @llvm.trap() nounwind noreturn declare void @bees.a() nounwind declare void @bees.b() nounwind + +; CHECK: attributes [[NUW]] = { nounwind } +; CHECK: attributes #1 = { noreturn nounwind } diff --git a/test/Transforms/SimplifyLibCalls/2009-01-04-Annotate.ll b/test/Transforms/SimplifyLibCalls/2009-01-04-Annotate.ll index 73eb05b..16791e2 100644 --- a/test/Transforms/SimplifyLibCalls/2009-01-04-Annotate.ll +++ b/test/Transforms/SimplifyLibCalls/2009-01-04-Annotate.ll @@ -1,12 +1,21 @@ -; RUN: opt < %s -simplify-libcalls -S > %t -; RUN: grep noalias %t | count 2 -; RUN: grep nocapture %t | count 3 -; RUN: grep nounwind %t | count 3 -; RUN: grep readonly %t | count 1 +; RUN: opt < %s -simplify-libcalls -S | FileCheck %s +; CHECK: declare noalias i8* @fopen(i8* nocapture, i8* nocapture) #0 declare i8* @fopen(i8*, i8*) + +; CHECK: declare i8 @strlen(i8* nocapture) #1 declare i8 @strlen(i8*) + +; CHECK: declare noalias i32* @realloc(i32* nocapture, i32) #0 declare i32* @realloc(i32*, i32) ; Test deliberately wrong declaration declare i32 @strcpy(...) + +; CHECK-NOT: strcpy{{.*}}noalias +; CHECK-NOT: strcpy{{.*}}nocapture +; CHECK-NOT: strcpy{{.*}}nounwind +; CHECK-NOT: strcpy{{.*}}readonly + +; CHECK: attributes #0 = { nounwind } +; CHECK: attributes #1 = { nounwind readonly } diff --git a/test/Transforms/TailCallElim/dont-tce-tail-marked-call.ll b/test/Transforms/TailCallElim/dont-tce-tail-marked-call.ll index e4f8b48..97e67b2 100644 --- a/test/Transforms/TailCallElim/dont-tce-tail-marked-call.ll +++ b/test/Transforms/TailCallElim/dont-tce-tail-marked-call.ll @@ -1,5 +1,4 @@ -; RUN: opt < %s -tailcallelim -S | \ -; RUN: grep "call i32 @foo" +; RUN: opt < %s -tailcallelim -S | FileCheck %s declare void @bar(i32*) @@ -7,6 +6,7 @@ define i32 @foo(i32 %N) { %A = alloca i32, i32 %N ; <i32*> [#uses=2] store i32 17, i32* %A call void @bar( i32* %A ) +; CHECK: tail call i32 @foo %X = tail call i32 @foo( i32 %N ) ; <i32> [#uses=1] ret i32 %X } diff --git a/test/Transforms/TailCallElim/intervening-inst.ll b/test/Transforms/TailCallElim/intervening-inst.ll index 0c40bd5..10dffbd 100644 --- a/test/Transforms/TailCallElim/intervening-inst.ll +++ b/test/Transforms/TailCallElim/intervening-inst.ll @@ -1,5 +1,5 @@ ; This function contains intervening instructions which should be moved out of the way -; RUN: opt < %s -tailcallelim -S | not grep call +; RUN: opt < %s -tailcallelim -S | FileCheck %s define i32 @Test(i32 %X) { entry: @@ -10,6 +10,7 @@ then.0: ; preds = %entry ret i32 %tmp.4 endif.0: ; preds = %entry %tmp.10 = add i32 %X, -1 ; <i32> [#uses=1] +; CHECK-NOT: call %tmp.8 = call i32 @Test( i32 %tmp.10 ) ; <i32> [#uses=1] %DUMMY = add i32 %X, 1 ; <i32> [#uses=0] ret i32 %tmp.8 diff --git a/test/Transforms/TailCallElim/reorder_load.ll b/test/Transforms/TailCallElim/reorder_load.ll index 7f5c36e..53c65da 100644 --- a/test/Transforms/TailCallElim/reorder_load.ll +++ b/test/Transforms/TailCallElim/reorder_load.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -tailcallelim -S | not grep call +; RUN: opt < %s -tailcallelim -S | FileCheck %s ; PR4323 ; Several cases where tail call elimination should move the load above the call, @@ -21,6 +21,7 @@ if: ; preds = %entry else: ; preds = %entry %tmp7 = add i32 
%start_arg, 1 ; <i32> [#uses=1] +; CHECK-NOT: call %tmp8 = call fastcc i32 @raise_load_1(i32* %a_arg, i32 %a_len_arg, i32 %tmp7) ; <i32> [#uses=1] %tmp9 = load i32* %a_arg ; <i32> [#uses=1] %tmp10 = add i32 %tmp9, %tmp8 ; <i32> [#uses=1] @@ -47,6 +48,7 @@ unwind: ; preds = %else recurse: ; preds = %else %tmp7 = add i32 %start_arg, 1 ; <i32> [#uses=1] +; CHECK-NOT: call %tmp8 = call fastcc i32 @raise_load_2(i32* %a_arg, i32 %a_len_arg, i32 %tmp7) ; <i32> [#uses=1] %tmp9 = load i32* @global ; <i32> [#uses=1] %tmp10 = add i32 %tmp9, %tmp8 ; <i32> [#uses=1] @@ -66,6 +68,7 @@ if: ; preds = %entry else: ; preds = %entry %tmp7 = add i32 %start_arg, 1 ; <i32> [#uses=1] +; CHECK-NOT: call %tmp8 = call fastcc i32 @raise_load_3(i32* %a_arg, i32 %a_len_arg, i32 %tmp7) ; <i32> [#uses=1] %tmp9 = load i32* @extern_weak_global ; <i32> [#uses=1] %tmp10 = add i32 %tmp9, %tmp8 ; <i32> [#uses=1] @@ -94,6 +97,7 @@ unwind: ; preds = %else recurse: ; preds = %else %tmp7 = add i32 %start_arg, 1 ; <i32> [#uses=1] %first = load i32* %a_arg ; <i32> [#uses=1] +; CHECK-NOT: call %tmp8 = call fastcc i32 @raise_load_4(i32* %a_arg, i32 %first, i32 %tmp7) ; <i32> [#uses=1] %second = load i32* %a_arg ; <i32> [#uses=1] %tmp10 = add i32 %second, %tmp8 ; <i32> [#uses=1] diff --git a/test/Transforms/TailCallElim/return_constant.ll b/test/Transforms/TailCallElim/return_constant.ll index 48e5641..e99e57e 100644 --- a/test/Transforms/TailCallElim/return_constant.ll +++ b/test/Transforms/TailCallElim/return_constant.ll @@ -1,7 +1,7 @@ ; Though this case seems to be fairly unlikely to occur in the wild, someone ; plunked it into the demo script, so maybe they care about it. ; -; RUN: opt < %s -tailcallelim -S | not grep call +; RUN: opt < %s -tailcallelim -S | FileCheck %s define i32 @aaa(i32 %c) { entry: @@ -9,6 +9,7 @@ entry: br i1 %tmp.1, label %return, label %else else: ; preds = %entry %tmp.5 = add i32 %c, -1 ; <i32> [#uses=1] +; CHECK-NOT: call %tmp.3 = call i32 @aaa( i32 %tmp.5 ) ; <i32> [#uses=0] ret i32 0 return: ; preds = %entry diff --git a/test/Transforms/TailCallElim/trivial_codegen_tailcall.ll b/test/Transforms/TailCallElim/trivial_codegen_tailcall.ll index 3d01d17..7049e4d 100644 --- a/test/Transforms/TailCallElim/trivial_codegen_tailcall.ll +++ b/test/Transforms/TailCallElim/trivial_codegen_tailcall.ll @@ -1,11 +1,11 @@ -; RUN: opt < %s -tailcallelim -S | \ -; RUN: grep "tail call void @foo" +; RUN: opt < %s -tailcallelim -S | FileCheck %s declare void @foo() define void @bar() { - call void @foo( ) +; CHECK: tail call void @foo() + call void @foo() ret void } diff --git a/test/Verifier/module-flags-1.ll b/test/Verifier/module-flags-1.ll new file mode 100644 index 0000000..e5feaf3 --- /dev/null +++ b/test/Verifier/module-flags-1.ll @@ -0,0 +1,60 @@ +; RUN: not llvm-as < %s -o /dev/null 2>&1 | FileCheck %s + +; Check that module flags are structurally correct. 
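Each flag is a metadata triple { i32 behavior, metadata !"ID", value }, and the cases below probe each structural rule in turn. For reference, the well-formed shape is what Module::addModuleFlag emits; a small sketch under that era's C++ API (the function name is hypothetical; addModuleFlag and the behavior constants are real):

#include "llvm/IR/Constants.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
using namespace llvm;

// Emit structurally valid module flags: an Error-behavior flag and a
// Require-behavior flag whose value is the {ID, value} pair it demands.
static void addExampleFlags(Module &M) {
  M.addModuleFlag(Module::Error, "Objective-C Version", 2);
  Value *Pair[] = {
    MDString::get(M.getContext(), "Objective-C Version"),
    ConstantInt::get(Type::getInt32Ty(M.getContext()), 2)
  };
  M.addModuleFlag(Module::Require, "needs-objc-2",
                  MDNode::get(M.getContext(), Pair));
}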
+; +; CHECK: incorrect number of operands in module flag +; CHECK: metadata !0 +!0 = metadata !{ i32 1 } +; CHECK: invalid behavior operand in module flag (expected constant integer) +; CHECK: metadata !"foo" +!1 = metadata !{ metadata !"foo", metadata !"foo", i32 42 } +; CHECK: invalid behavior operand in module flag (unexpected constant) +; CHECK: i32 999 +!2 = metadata !{ i32 999, metadata !"foo", i32 43 } +; CHECK: invalid ID operand in module flag (expected metadata string) +; CHECK: i32 1 +!3 = metadata !{ i32 1, i32 1, i32 44 } +; CHECK: invalid value for 'require' module flag (expected metadata pair) +; CHECK: i32 45 +!4 = metadata !{ i32 3, metadata !"bla", i32 45 } +; CHECK: invalid value for 'require' module flag (expected metadata pair) +; CHECK: metadata ! +!5 = metadata !{ i32 3, metadata !"bla", metadata !{ i32 46 } } +; CHECK: invalid value for 'require' module flag (first value operand should be a string) +; CHECK: i32 47 +!6 = metadata !{ i32 3, metadata !"bla", metadata !{ i32 47, i32 48 } } + +; Check that module flags only have unique IDs. +; +; CHECK: module flag identifiers must be unique (or of 'require' type) +!7 = metadata !{ i32 1, metadata !"foo", i32 49 } +!8 = metadata !{ i32 2, metadata !"foo", i32 50 } +; CHECK-NOT: module flag identifiers must be unique +!9 = metadata !{ i32 2, metadata !"bar", i32 51 } +!10 = metadata !{ i32 3, metadata !"bar", metadata !{ metadata !"bar", i32 51 } } + +; Check that any 'append'-type module flags are valid. +; CHECK: invalid value for 'append'-type module flag (expected a metadata node) +!16 = metadata !{ i32 5, metadata !"flag-2", i32 56 } +; CHECK: invalid value for 'append'-type module flag (expected a metadata node) +!17 = metadata !{ i32 5, metadata !"flag-3", i32 57 } +; CHECK-NOT: invalid value for 'append'-type module flag (expected a metadata node) +!18 = metadata !{ i32 5, metadata !"flag-4", metadata !{ i32 57 } } + +; Check that any 'require' module flags are valid. +; CHECK: invalid requirement on flag, flag is not present in module +!11 = metadata !{ i32 3, metadata !"bar", + metadata !{ metadata !"no-such-flag", i32 52 } } +; CHECK: invalid requirement on flag, flag does not have the required value +!12 = metadata !{ i32 1, metadata !"flag-0", i32 53 } +!13 = metadata !{ i32 3, metadata !"bar", + metadata !{ metadata !"flag-0", i32 54 } } +; CHECK-NOT: invalid requirement on flag, flag is not present in module +; CHECK-NOT: invalid requirement on flag, flag does not have the required value +!14 = metadata !{ i32 1, metadata !"flag-1", i32 55 } +!15 = metadata !{ i32 3, metadata !"bar", + metadata !{ metadata !"flag-1", i32 55 } } + +!llvm.module.flags = !{ + !0, !1, !2, !3, !4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15, + !16, !17, !18 } diff --git a/test/lit.cfg b/test/lit.cfg index 5a4cced..128bbe9 100644 --- a/test/lit.cfg +++ b/test/lit.cfg @@ -140,12 +140,16 @@ if config.test_exec_root is None: ### -# Provide a target triple for mcjit tests -mcjit_triple = config.target_triple -# Force ELF format on Windows -if re.search(r'cygwin|mingw32|win32', mcjit_triple): - mcjit_triple += "-elf" -config.substitutions.append( ('%mcjit_triple', mcjit_triple) ) +# Provide a command line for mcjit tests +lli_mcjit = 'lli -use-mcjit' +# The target triple used by default by lli is the process target triple (some +# triple appropriate for generating code for the current process) but because +# we don't support COFF in MCJIT well enough for the tests, force ELF format on +# Windows. 
FIXME: the process target triple should be used here, but this is +# difficult to obtain on Windows. +if re.search(r'cygwin|mingw32|win32', config.host_triple): + lli_mcjit += ' -mtriple='+config.host_triple+'-elf' +config.substitutions.append( ('%lli_mcjit', lli_mcjit) ) # Provide a substition for those tests that need to run the jit to obtain data # but simply want use the currently considered most reliable jit for platform diff --git a/test/lit.site.cfg.in b/test/lit.site.cfg.in index 7a328f0..bfd901a 100644 --- a/test/lit.site.cfg.in +++ b/test/lit.site.cfg.in @@ -1,5 +1,6 @@ ## Autogenerated by LLVM/Clang configuration. # Do not edit! +config.host_triple = "@LLVM_HOSTTRIPLE@" config.target_triple = "@TARGET_TRIPLE@" config.llvm_src_root = "@LLVM_SOURCE_DIR@" config.llvm_obj_root = "@LLVM_BINARY_DIR@" diff --git a/tools/bugpoint-passes/CMakeLists.txt b/tools/bugpoint-passes/CMakeLists.txt index b2f1bb5..05f190a 100644 --- a/tools/bugpoint-passes/CMakeLists.txt +++ b/tools/bugpoint-passes/CMakeLists.txt @@ -1,3 +1,7 @@ +if( NOT LLVM_BUILD_TOOLS ) + set(EXCLUDE_FROM_ALL ON) +endif() + add_llvm_loadable_module( BugpointPasses TestPasses.cpp ) diff --git a/tools/bugpoint/CMakeLists.txt b/tools/bugpoint/CMakeLists.txt index ee2235b..3c5e64f 100644 --- a/tools/bugpoint/CMakeLists.txt +++ b/tools/bugpoint/CMakeLists.txt @@ -1,5 +1,5 @@ set(LLVM_LINK_COMPONENTS asmparser instrumentation scalaropts ipo - linker bitreader bitwriter vectorize) + linker bitreader bitwriter vectorize objcarcopts) add_llvm_tool(bugpoint BugDriver.cpp diff --git a/tools/bugpoint/ExecutionDriver.cpp b/tools/bugpoint/ExecutionDriver.cpp index 218a559..da36045 100644 --- a/tools/bugpoint/ExecutionDriver.cpp +++ b/tools/bugpoint/ExecutionDriver.cpp @@ -230,7 +230,7 @@ bool BugDriver::initializeExecutionEnvironment() { } if (!SafeInterpreter) { SafeInterpreterSel = AutoPick; - Message = "Sorry, I can't automatically select an interpreter!\n"; + Message = "Sorry, I can't automatically select a safe interpreter!\n"; } break; case RunLLC: diff --git a/tools/bugpoint/LLVMBuild.txt b/tools/bugpoint/LLVMBuild.txt index 549d9d0..e03c594 100644 --- a/tools/bugpoint/LLVMBuild.txt +++ b/tools/bugpoint/LLVMBuild.txt @@ -19,4 +19,4 @@ type = Tool name = bugpoint parent = Tools -required_libraries = AsmParser BitReader BitWriter IPO Instrumentation Linker Scalar +required_libraries = AsmParser BitReader BitWriter IPO Instrumentation Linker Scalar ObjCARC diff --git a/tools/bugpoint/Makefile b/tools/bugpoint/Makefile index 34f4bdd..65ffc13 100644 --- a/tools/bugpoint/Makefile +++ b/tools/bugpoint/Makefile @@ -10,6 +10,6 @@ LEVEL := ../.. 
TOOLNAME := bugpoint LINK_COMPONENTS := asmparser instrumentation scalaropts ipo linker bitreader \ - bitwriter vectorize + bitwriter vectorize objcarcopts include $(LEVEL)/Makefile.common diff --git a/tools/bugpoint/ToolRunner.cpp b/tools/bugpoint/ToolRunner.cpp index e7d3978..735061d 100644 --- a/tools/bugpoint/ToolRunner.cpp +++ b/tools/bugpoint/ToolRunner.cpp @@ -531,12 +531,12 @@ LLC *AbstractInterpreter::createLLC(const char *Argv0, return 0; } - Message = "Found llc: " + LLCPath + "\n"; GCC *gcc = GCC::create(Message, GCCBinary, GCCArgs); if (!gcc) { errs() << Message << "\n"; exit(1); } + Message = "Found llc: " + LLCPath + "\n"; return new LLC(LLCPath, gcc, Args, UseIntegratedAssembler); } diff --git a/tools/bugpoint/bugpoint.cpp b/tools/bugpoint/bugpoint.cpp index 3049421..5e8fdd1 100644 --- a/tools/bugpoint/bugpoint.cpp +++ b/tools/bugpoint/bugpoint.cpp @@ -16,8 +16,8 @@ #include "BugDriver.h" #include "ToolRunner.h" #include "llvm/IR/LLVMContext.h" +#include "llvm/LinkAllIR.h" #include "llvm/LinkAllPasses.h" -#include "llvm/LinkAllVMCore.h" #include "llvm/PassManager.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ManagedStatic.h" @@ -120,6 +120,7 @@ int main(int argc, char **argv) { PassRegistry &Registry = *PassRegistry::getPassRegistry(); initializeCore(Registry); initializeScalarOpts(Registry); + initializeObjCARCOpts(Registry); initializeVectorization(Registry); initializeIPO(Registry); initializeAnalysis(Registry); diff --git a/tools/lli/CMakeLists.txt b/tools/lli/CMakeLists.txt index ed479f5..356233f 100644 --- a/tools/lli/CMakeLists.txt +++ b/tools/lli/CMakeLists.txt @@ -11,7 +11,9 @@ endif( LLVM_USE_OPROFILE ) if( LLVM_USE_INTEL_JITEVENTS ) set(LLVM_LINK_COMPONENTS ${LLVM_LINK_COMPONENTS} + DebugInfo IntelJITEvents + Object ) endif( LLVM_USE_INTEL_JITEVENTS ) diff --git a/tools/lli/Makefile b/tools/lli/Makefile index 31f3ab8..85ac6b4 100644 --- a/tools/lli/Makefile +++ b/tools/lli/Makefile @@ -17,7 +17,7 @@ LINK_COMPONENTS := mcjit jit interpreter nativecodegen bitreader asmparser selec # If Intel JIT Events support is configured, link against the LLVM Intel JIT # Events interface library ifeq ($(USE_INTEL_JITEVENTS), 1) - LINK_COMPONENTS += inteljitevents + LINK_COMPONENTS += debuginfo inteljitevents object endif # If oprofile support is configured, link against the LLVM oprofile interface diff --git a/tools/lli/RecordingMemoryManager.cpp b/tools/lli/RecordingMemoryManager.cpp index 75cb978..e4d992d 100644 --- a/tools/lli/RecordingMemoryManager.cpp +++ b/tools/lli/RecordingMemoryManager.cpp @@ -15,16 +15,26 @@ #include "RecordingMemoryManager.h" using namespace llvm; +RecordingMemoryManager::~RecordingMemoryManager() { + for (SmallVectorImpl<Allocation>::iterator + I = AllocatedCodeMem.begin(), E = AllocatedCodeMem.end(); + I != E; ++I) + sys::Memory::releaseMappedMemory(I->first); + for (SmallVectorImpl<Allocation>::iterator + I = AllocatedDataMem.begin(), E = AllocatedDataMem.end(); + I != E; ++I) + sys::Memory::releaseMappedMemory(I->first); +} + uint8_t *RecordingMemoryManager:: allocateCodeSection(uintptr_t Size, unsigned Alignment, unsigned SectionID) { // The recording memory manager is just a local copy of the remote target. // The alignment requirement is just stored here for later use. Regular - // heap storage is sufficient here. - void *Addr = malloc(Size); - assert(Addr && "malloc() failure!"); - sys::MemoryBlock Block(Addr, Size); + // heap storage is sufficient here, but we're using mapped memory to work + // around a bug in MCJIT.
+ sys::MemoryBlock Block = allocateSection(Size); AllocatedCodeMem.push_back(Allocation(Block, Alignment)); - return (uint8_t*)Addr; + return (uint8_t*)Block.base(); } uint8_t *RecordingMemoryManager:: @@ -32,13 +42,30 @@ allocateDataSection(uintptr_t Size, unsigned Alignment, unsigned SectionID, bool IsReadOnly) { // The recording memory manager is just a local copy of the remote target. // The alignment requirement is just stored here for later use. Regular - // heap storage is sufficient here. - void *Addr = malloc(Size); - assert(Addr && "malloc() failure!"); - sys::MemoryBlock Block(Addr, Size); + // heap storage is sufficient here, but we're using mapped memory to work + // around a bug in MCJIT. + sys::MemoryBlock Block = allocateSection(Size); AllocatedDataMem.push_back(Allocation(Block, Alignment)); - return (uint8_t*)Addr; + return (uint8_t*)Block.base(); +} + +sys::MemoryBlock RecordingMemoryManager::allocateSection(uintptr_t Size) { + error_code ec; + sys::MemoryBlock MB = sys::Memory::allocateMappedMemory(Size, + &Near, + sys::Memory::MF_READ | + sys::Memory::MF_WRITE, + ec); + assert(!ec && MB.base()); + + // FIXME: This is part of a work around to keep sections near one another + // when MCJIT performs relocations after code emission but before + // the generated code is moved to the remote target. + // Save this address as the basis for our next request + Near = MB; + return MB; } + void RecordingMemoryManager::setMemoryWritable() { llvm_unreachable("Unexpected!"); } void RecordingMemoryManager::setMemoryExecutable() { llvm_unreachable("Unexpected!"); } void RecordingMemoryManager::setPoisonMemory(bool poison) { llvm_unreachable("Unexpected!"); } diff --git a/tools/lli/RecordingMemoryManager.h b/tools/lli/RecordingMemoryManager.h index 20fd0c2..991f535 100644 --- a/tools/lli/RecordingMemoryManager.h +++ b/tools/lli/RecordingMemoryManager.h @@ -31,9 +31,15 @@ private: SmallVector<Allocation, 16> AllocatedDataMem; SmallVector<Allocation, 16> AllocatedCodeMem; + // FIXME: This is part of a work around to keep sections near one another + // when MCJIT performs relocations after code emission but before + // the generated code is moved to the remote target. 
+ sys::MemoryBlock Near; + sys::MemoryBlock allocateSection(uintptr_t Size); + public: RecordingMemoryManager() {} - virtual ~RecordingMemoryManager() {} + virtual ~RecordingMemoryManager(); typedef SmallVectorImpl<Allocation>::const_iterator const_data_iterator; typedef SmallVectorImpl<Allocation>::const_iterator const_code_iterator; diff --git a/tools/llvm-bcanalyzer/llvm-bcanalyzer.cpp b/tools/llvm-bcanalyzer/llvm-bcanalyzer.cpp index 46b687e..99479a4 100644 --- a/tools/llvm-bcanalyzer/llvm-bcanalyzer.cpp +++ b/tools/llvm-bcanalyzer/llvm-bcanalyzer.cpp @@ -98,16 +98,17 @@ static const char *GetBlockName(unsigned BlockID, if (CurStreamType != LLVMIRBitstream) return 0; switch (BlockID) { - default: return 0; - case bitc::MODULE_BLOCK_ID: return "MODULE_BLOCK"; - case bitc::PARAMATTR_BLOCK_ID: return "PARAMATTR_BLOCK"; - case bitc::TYPE_BLOCK_ID_NEW: return "TYPE_BLOCK_ID"; - case bitc::CONSTANTS_BLOCK_ID: return "CONSTANTS_BLOCK"; - case bitc::FUNCTION_BLOCK_ID: return "FUNCTION_BLOCK"; - case bitc::VALUE_SYMTAB_BLOCK_ID: return "VALUE_SYMTAB"; - case bitc::METADATA_BLOCK_ID: return "METADATA_BLOCK"; - case bitc::METADATA_ATTACHMENT_ID: return "METADATA_ATTACHMENT_BLOCK"; - case bitc::USELIST_BLOCK_ID: return "USELIST_BLOCK_ID"; + default: return 0; + case bitc::MODULE_BLOCK_ID: return "MODULE_BLOCK"; + case bitc::PARAMATTR_BLOCK_ID: return "PARAMATTR_BLOCK"; + case bitc::PARAMATTR_GROUP_BLOCK_ID: return "PARAMATTR_GROUP_BLOCK_ID"; + case bitc::TYPE_BLOCK_ID_NEW: return "TYPE_BLOCK_ID"; + case bitc::CONSTANTS_BLOCK_ID: return "CONSTANTS_BLOCK"; + case bitc::FUNCTION_BLOCK_ID: return "FUNCTION_BLOCK"; + case bitc::VALUE_SYMTAB_BLOCK_ID: return "VALUE_SYMTAB"; + case bitc::METADATA_BLOCK_ID: return "METADATA_BLOCK"; + case bitc::METADATA_ATTACHMENT_ID: return "METADATA_ATTACHMENT_BLOCK"; + case bitc::USELIST_BLOCK_ID: return "USELIST_BLOCK_ID"; } } @@ -159,7 +160,9 @@ static const char *GetCodeName(unsigned CodeID, unsigned BlockID, case bitc::PARAMATTR_BLOCK_ID: switch (CodeID) { default: return 0; - case bitc::PARAMATTR_CODE_ENTRY: return "ENTRY"; + case bitc::PARAMATTR_CODE_ENTRY_OLD: return "ENTRY"; + case bitc::PARAMATTR_CODE_ENTRY: return "ENTRY"; + case bitc::PARAMATTR_GRP_CODE_ENTRY: return "ENTRY"; } case bitc::TYPE_BLOCK_ID_NEW: switch (CodeID) { @@ -318,10 +321,10 @@ static bool Error(const std::string &Err) { } /// ParseBlock - Read a block, updating statistics, etc. -static bool ParseBlock(BitstreamCursor &Stream, unsigned IndentLevel) { +static bool ParseBlock(BitstreamCursor &Stream, unsigned BlockID, + unsigned IndentLevel) { std::string Indent(IndentLevel*2, ' '); uint64_t BlockBitStart = Stream.GetCurrentBitNo(); - unsigned BlockID = Stream.ReadSubBlockID(); // Get the statistics for this BlockID. PerBlockIDStats &BlockStats = BlockIDStats[BlockID]; @@ -354,7 +357,7 @@ static bool ParseBlock(BitstreamCursor &Stream, unsigned IndentLevel) { outs() << " BlockID=" << BlockID; outs() << " NumWords=" << NumWords - << " BlockCodeSize=" << Stream.GetAbbrevIDWidth() << ">\n"; + << " BlockCodeSize=" << Stream.getAbbrevIDWidth() << ">\n"; } SmallVector<uint64_t, 64> Record; @@ -366,12 +369,13 @@ static bool ParseBlock(BitstreamCursor &Stream, unsigned IndentLevel) { uint64_t RecordStartBit = Stream.GetCurrentBitNo(); - // Read the code for this record. 
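The '-' lines at this point read a raw abbrev code and switched on it by hand; the '+' replacement below instead classifies the next stream position through the new BitstreamCursor entry API. Reduced to a skeleton, the new loop shape looks like this (a sketch built only from interfaces this patch introduces; skipBlockBody is a hypothetical helper that consumes one block):

#include "llvm/ADT/SmallVector.h"
#include "llvm/Bitcode/BitstreamReader.h"
using namespace llvm;

// Walk one block with the BitstreamEntry-based API. Plain advance()
// auto-processes DEFINE_ABBREV records; llvm-bcanalyzer passes
// AF_DontAutoprocessAbbrevs instead so it can count them itself.
static bool skipBlockBody(BitstreamCursor &Stream) {
  while (1) {
    BitstreamEntry Entry = Stream.advance();
    switch (Entry.Kind) {
    case BitstreamEntry::Error:
      return true;                         // malformed bitcode
    case BitstreamEntry::EndBlock:
      return false;                        // END_BLOCK already consumed
    case BitstreamEntry::SubBlock:
      if (Stream.SkipBlock())              // Entry.ID names the block
        return true;
      continue;
    case BitstreamEntry::Record:
      break;
    }
    SmallVector<uint64_t, 64> Record;
    Stream.readRecord(Entry.ID, Record);   // decode and discard operands
  }
}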
- unsigned AbbrevID = Stream.ReadCode(); - switch (AbbrevID) { - case bitc::END_BLOCK: { - if (Stream.ReadBlockEnd()) - return Error("Error at end of block"); + BitstreamEntry Entry = + Stream.advance(BitstreamCursor::AF_DontAutoprocessAbbrevs); + + switch (Entry.Kind) { + case BitstreamEntry::Error: + return Error("malformed bitcode file"); + case BitstreamEntry::EndBlock: { uint64_t BlockBitEnd = Stream.GetCurrentBitNo(); BlockStats.NumBits += BlockBitEnd-BlockBitStart; if (Dump) { @@ -383,80 +387,81 @@ static bool ParseBlock(BitstreamCursor &Stream, unsigned IndentLevel) { } return false; } - case bitc::ENTER_SUBBLOCK: { + + case BitstreamEntry::SubBlock: { uint64_t SubBlockBitStart = Stream.GetCurrentBitNo(); - if (ParseBlock(Stream, IndentLevel+1)) + if (ParseBlock(Stream, Entry.ID, IndentLevel+1)) return true; ++BlockStats.NumSubBlocks; uint64_t SubBlockBitEnd = Stream.GetCurrentBitNo(); - + // Don't include subblock sizes in the size of this block. BlockBitStart += SubBlockBitEnd-SubBlockBitStart; + continue; + } + case BitstreamEntry::Record: + // The interesting case. break; } - case bitc::DEFINE_ABBREV: + + if (Entry.ID == bitc::DEFINE_ABBREV) { Stream.ReadAbbrevRecord(); ++BlockStats.NumAbbrevs; - break; - default: - Record.clear(); - - ++BlockStats.NumRecords; - if (AbbrevID != bitc::UNABBREV_RECORD) - ++BlockStats.NumAbbreviatedRecords; - - const char *BlobStart = 0; - unsigned BlobLen = 0; - unsigned Code = Stream.ReadRecord(AbbrevID, Record, BlobStart, BlobLen); - - - - // Increment the # occurrences of this code. - if (BlockStats.CodeFreq.size() <= Code) - BlockStats.CodeFreq.resize(Code+1); - BlockStats.CodeFreq[Code].NumInstances++; - BlockStats.CodeFreq[Code].TotalBits += - Stream.GetCurrentBitNo()-RecordStartBit; - if (AbbrevID != bitc::UNABBREV_RECORD) - BlockStats.CodeFreq[Code].NumAbbrev++; + continue; + } + + Record.clear(); + + ++BlockStats.NumRecords; + + StringRef Blob; + unsigned Code = Stream.readRecord(Entry.ID, Record, &Blob); + + // Increment the # occurrences of this code. 
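// Illustrative sketch (editorial, not part of the patch): the BitstreamEntry
// interface adopted here replaces the manual ReadCode() dispatch being
// removed. A reader that does not need to observe abbrev definitions
// (bcanalyzer passes AF_DontAutoprocessAbbrevs only so it can count them)
// typically loops like this:
//
//   SmallVector<uint64_t, 64> Record;
//   while (1) {
//     BitstreamEntry Entry = Stream.advance();
//     switch (Entry.Kind) {
//     case BitstreamEntry::Error:    return true;   // malformed stream
//     case BitstreamEntry::EndBlock: return false;  // block finished
//     case BitstreamEntry::SubBlock: /* recurse or Stream.SkipBlock() */ break;
//     case BitstreamEntry::Record: {
//       Record.clear();
//       unsigned Code = Stream.readRecord(Entry.ID, Record);
//       /* interpret Code and Record here */
//       break;
//     }
//     }
//   }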
+ if (BlockStats.CodeFreq.size() <= Code) + BlockStats.CodeFreq.resize(Code+1); + BlockStats.CodeFreq[Code].NumInstances++; + BlockStats.CodeFreq[Code].TotalBits += + Stream.GetCurrentBitNo()-RecordStartBit; + if (Entry.ID != bitc::UNABBREV_RECORD) { + BlockStats.CodeFreq[Code].NumAbbrev++; + ++BlockStats.NumAbbreviatedRecords; + } - if (Dump) { - outs() << Indent << " <"; - if (const char *CodeName = - GetCodeName(Code, BlockID, *Stream.getBitStreamReader())) - outs() << CodeName; - else - outs() << "UnknownCode" << Code; - if (NonSymbolic && + if (Dump) { + outs() << Indent << " <"; + if (const char *CodeName = GetCodeName(Code, BlockID, *Stream.getBitStreamReader())) - outs() << " codeid=" << Code; - if (AbbrevID != bitc::UNABBREV_RECORD) - outs() << " abbrevid=" << AbbrevID; - - for (unsigned i = 0, e = Record.size(); i != e; ++i) - outs() << " op" << i << "=" << (int64_t)Record[i]; - - outs() << "/>"; - - if (BlobStart) { - outs() << " blob data = "; - bool BlobIsPrintable = true; - for (unsigned i = 0; i != BlobLen; ++i) - if (!isprint(BlobStart[i])) { - BlobIsPrintable = false; - break; - } - - if (BlobIsPrintable) - outs() << "'" << std::string(BlobStart, BlobStart+BlobLen) <<"'"; - else - outs() << "unprintable, " << BlobLen << " bytes."; - } - - outs() << "\n"; + outs() << CodeName; + else + outs() << "UnknownCode" << Code; + if (NonSymbolic && + GetCodeName(Code, BlockID, *Stream.getBitStreamReader())) + outs() << " codeid=" << Code; + if (Entry.ID != bitc::UNABBREV_RECORD) + outs() << " abbrevid=" << Entry.ID; + + for (unsigned i = 0, e = Record.size(); i != e; ++i) + outs() << " op" << i << "=" << (int64_t)Record[i]; + + outs() << "/>"; + + if (Blob.data()) { + outs() << " blob data = "; + bool BlobIsPrintable = true; + for (unsigned i = 0, e = Blob.size(); i != e; ++i) + if (!isprint(static_cast<unsigned char>(Blob[i]))) { + BlobIsPrintable = false; + break; + } + + if (BlobIsPrintable) + outs() << "'" << Blob << "'"; + else + outs() << "unprintable, " << Blob.size() << " bytes."; } - break; + outs() << "\n"; } } } @@ -519,7 +524,9 @@ static int AnalyzeBitcode() { if (Code != bitc::ENTER_SUBBLOCK) return Error("Invalid record at top-level"); - if (ParseBlock(Stream, 0)) + unsigned BlockID = Stream.ReadSubBlockID(); + + if (ParseBlock(Stream, BlockID, 0)) return true; ++NumTopBlocks; } diff --git a/tools/llvm-dwarfdump/llvm-dwarfdump.cpp b/tools/llvm-dwarfdump/llvm-dwarfdump.cpp index 34302fa..8094856 100644 --- a/tools/llvm-dwarfdump/llvm-dwarfdump.cpp +++ b/tools/llvm-dwarfdump/llvm-dwarfdump.cpp @@ -52,6 +52,25 @@ static cl::opt<bool> PrintInlining("inlining", cl::init(false), cl::desc("Print all inlined frames for a given address")); +static cl::opt<DIDumpType> +DumpType("debug-dump", cl::init(DIDT_All), + cl::desc("Dump of debug sections:"), + cl::values( + clEnumValN(DIDT_All, "all", "Dump all debug sections"), + clEnumValN(DIDT_Abbrev, "abbrev", ".debug_abbrev"), + clEnumValN(DIDT_AbbrevDwo, "abbrev.dwo", ".debug_abbrev.dwo"), + clEnumValN(DIDT_Aranges, "aranges", ".debug_aranges"), + clEnumValN(DIDT_Info, "info", ".debug_info"), + clEnumValN(DIDT_InfoDwo, "info.dwo", ".debug_info.dwo"), + clEnumValN(DIDT_Line, "line", ".debug_line"), + clEnumValN(DIDT_Frames, "frames", ".debug_frame"), + clEnumValN(DIDT_Ranges, "ranges", ".debug_ranges"), + clEnumValN(DIDT_Pubnames, "pubnames", ".debug_pubnames"), + clEnumValN(DIDT_Str, "str", ".debug_str"), + clEnumValN(DIDT_StrDwo, "str.dwo", ".debug_str.dwo"), + clEnumValN(DIDT_StrOffsetsDwo, "str_offsets.dwo", 
".debug_str_offsets.dwo"), + clEnumValEnd)); + static void PrintDILineInfo(DILineInfo dli) { if (PrintFunctions) outs() << (dli.getFunctionName() ? dli.getFunctionName() : "<unknown>") @@ -69,13 +88,18 @@ static void DumpInput(const StringRef &Filename) { } OwningPtr<ObjectFile> Obj(ObjectFile::createObjectFile(Buff.take())); - OwningPtr<DIContext> dictx(DIContext::getDWARFContext(Obj.get())); + if (!Obj) { + errs() << Filename << ": Unknown object file format\n"; + return; + } + + OwningPtr<DIContext> DICtx(DIContext::getDWARFContext(Obj.get())); if (Address == -1ULL) { outs() << Filename << ":\tfile format " << Obj->getFileFormatName() << "\n\n"; // Dump the complete DWARF structure. - dictx->dump(outs()); + DICtx->dump(outs(), DumpType); } else { // Print line info for the specified address. int SpecFlags = DILineInfoSpecifier::FileLineInfo | @@ -84,7 +108,7 @@ static void DumpInput(const StringRef &Filename) { SpecFlags |= DILineInfoSpecifier::FunctionName; if (PrintInlining) { DIInliningInfo InliningInfo = - dictx->getInliningInfoForAddress(Address, SpecFlags); + DICtx->getInliningInfoForAddress(Address, SpecFlags); uint32_t n = InliningInfo.getNumberOfFrames(); if (n == 0) { // Print one empty debug line info in any case. @@ -96,7 +120,7 @@ static void DumpInput(const StringRef &Filename) { } } } else { - DILineInfo dli = dictx->getLineInfoForAddress(Address, SpecFlags); + DILineInfo dli = DICtx->getLineInfoForAddress(Address, SpecFlags); PrintDILineInfo(dli); } } diff --git a/tools/llvm-jitlistener/CMakeLists.txt b/tools/llvm-jitlistener/CMakeLists.txt index 57a4a0c..d429af9 100644 --- a/tools/llvm-jitlistener/CMakeLists.txt +++ b/tools/llvm-jitlistener/CMakeLists.txt @@ -6,6 +6,7 @@ include_directories( ${LLVM_INTEL_JITEVENTS_INCDIR} ) set(LLVM_LINK_COMPONENTS
asmparser
bitreader
+ debuginfo
inteljitevents
interpreter
jit
diff --git a/tools/llvm-jitlistener/Makefile b/tools/llvm-jitlistener/Makefile index 0971e6a..3018235 100644 --- a/tools/llvm-jitlistener/Makefile +++ b/tools/llvm-jitlistener/Makefile @@ -18,7 +18,7 @@ LINK_COMPONENTS := mcjit jit interpreter nativecodegen bitreader asmparser selec # Events interface library. If not, this tool will do nothing useful, but it
# will build correctly.
ifeq ($(USE_INTEL_JITEVENTS), 1)
- LINK_COMPONENTS += inteljitevents
+ LINK_COMPONENTS += debuginfo inteljitevents
endif
# This tool has no plugins, optimize startup time.
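Both build-system hunks above add the debuginfo component for the same reason: the Intel JIT events support now pulls in LLVM's DebugInfo library, whose DIContext interface this patch also exercises in llvm-rtdyld and the new llvm-symbolizer. A minimal sketch of that interface, mirroring the calls made elsewhere in this patch (the file name and address here are invented):

    OwningPtr<MemoryBuffer> Buf;
    if (error_code ec = MemoryBuffer::getFile("example.o", Buf))
      return;  // error_code converts to true on failure
    OwningPtr<ObjectFile> Obj(ObjectFile::createObjectFile(Buf.take()));
    OwningPtr<DIContext> Ctx(DIContext::getDWARFContext(Obj.get()));
    DILineInfo Info = Ctx->getLineInfoForAddress(
        0x4004f0, DILineInfoSpecifier::FileLineInfo |
                  DILineInfoSpecifier::FunctionName);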
diff --git a/tools/llvm-jitlistener/llvm-jitlistener.cpp b/tools/llvm-jitlistener/llvm-jitlistener.cpp index 2f72e42..d6f5032 100644 --- a/tools/llvm-jitlistener/llvm-jitlistener.cpp +++ b/tools/llvm-jitlistener/llvm-jitlistener.cpp @@ -137,7 +137,7 @@ protected: // Override the triple to generate ELF on Windows since that's supported Triple Tuple(TheModule->getTargetTriple()); if (Tuple.getTriple().empty()) - Tuple.setTriple(LLVM_HOSTTRIPLE); + Tuple.setTriple(sys::getProcessTriple()); if (Tuple.isOSWindows() && Triple::ELF != Tuple.getEnvironment()) { Tuple.setEnvironment(Triple::ELF); diff --git a/tools/llvm-mc/llvm-mc.cpp b/tools/llvm-mc/llvm-mc.cpp index e77f27e..243899b 100644 --- a/tools/llvm-mc/llvm-mc.cpp +++ b/tools/llvm-mc/llvm-mc.cpp @@ -22,7 +22,6 @@ #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCObjectFileInfo.h" #include "llvm/MC/MCParser/AsmLexer.h" -#include "llvm/MC/MCParser/MCAsmLexer.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSectionMachO.h" #include "llvm/MC/MCStreamer.h" @@ -236,6 +235,13 @@ static void setDwarfDebugFlags(int argc, char **argv) { } } +static std::string DwarfDebugProducer; +static void setDwarfDebugProducer(void) { + if(!getenv("DEBUG_PRODUCER")) + return; + DwarfDebugProducer += getenv("DEBUG_PRODUCER"); +} + static int AsLexInput(SourceMgr &SrcMgr, MCAsmInfo &MAI, tool_output_file *Out) { AsmLexer Lexer(MAI); @@ -353,6 +359,8 @@ int main(int argc, char **argv) { TripleName = Triple::normalize(TripleName); setDwarfDebugFlags(argc, argv); + setDwarfDebugProducer(); + const char *ProgName = argv[0]; const Target *TheTarget = GetTarget(ProgName); if (!TheTarget) @@ -374,7 +382,6 @@ int main(int argc, char **argv) { // it later. SrcMgr.setIncludeDirs(IncludeDirs); - llvm::OwningPtr<MCAsmInfo> MAI(TheTarget->createMCAsmInfo(TripleName)); assert(MAI && "Unable to create target asm info!"); @@ -393,6 +400,8 @@ int main(int argc, char **argv) { Ctx.setGenDwarfForAssembly(GenDwarfForAssembly); if (!DwarfDebugFlags.empty()) Ctx.setDwarfDebugFlags(StringRef(DwarfDebugFlags)); + if (!DwarfDebugProducer.empty()) + Ctx.setDwarfDebugProducer(StringRef(DwarfDebugProducer)); if (!DebugCompilationDir.empty()) Ctx.setCompilationDir(DebugCompilationDir); if (!MainFileName.empty()) @@ -418,7 +427,7 @@ int main(int argc, char **argv) { OwningPtr<MCSubtargetInfo> STI(TheTarget->createMCSubtargetInfo(TripleName, MCPU, FeaturesStr)); - MCInstPrinter *IP; + MCInstPrinter *IP = NULL; if (FileType == OFT_AssemblyFile) { IP = TheTarget->createMCInstPrinter(OutputAsmVariant, *MAI, *MCII, *MRI, *STI); @@ -456,10 +465,12 @@ int main(int argc, char **argv) { Res = AssembleInput(ProgName, TheTarget, SrcMgr, Ctx, *Str, *MAI, *STI); break; case AC_MDisassemble: + assert(IP && "Expected assembly output"); IP->setUseMarkup(1); disassemble = true; break; case AC_HDisassemble: + assert(IP && "Expected assembly output"); IP->setPrintImmHex(1); disassemble = true; break; diff --git a/tools/llvm-nm/llvm-nm.cpp b/tools/llvm-nm/llvm-nm.cpp index 056fd35..a24aae6 100644 --- a/tools/llvm-nm/llvm-nm.cpp +++ b/tools/llvm-nm/llvm-nm.cpp @@ -384,7 +384,9 @@ static void DumpSymbolNamesFromFile(std::string &Filename) { OwningPtr<Binary> child; if (i->getAsBinary(child)) { // Try opening it as a bitcode file. 
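// Editorial note (not part of the patch): setDwarfDebugProducer() above lets
// an environment variable supply the DWARF producer string when llvm-mc
// generates debug info for assembly. A hypothetical invocation:
//
//   DEBUG_PRODUCER="my-as 1.0" llvm-mc -g -filetype=obj foo.s -o foo.o
//
// The value reaches the MCContext through Ctx.setDwarfDebugProducer(), where
// DWARF-for-assembly emission can use it for the compile unit's producer
// attribute.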
- OwningPtr<MemoryBuffer> buff(i->getBuffer()); + OwningPtr<MemoryBuffer> buff; + if (error(i->getMemoryBuffer(buff))) + return; Module *Result = 0; if (buff) Result = ParseBitcodeFile(buff.get(), Context, &ErrorMessage); diff --git a/tools/llvm-objdump/ELFDump.cpp b/tools/llvm-objdump/ELFDump.cpp index a635fef..72c512e 100644 --- a/tools/llvm-objdump/ELFDump.cpp +++ b/tools/llvm-objdump/ELFDump.cpp @@ -13,7 +13,6 @@ //===----------------------------------------------------------------------===// #include "llvm-objdump.h" - #include "llvm/Object/ELF.h" #include "llvm/Support/Format.h" #include "llvm/Support/MathExtras.h" @@ -22,10 +21,10 @@ using namespace llvm; using namespace llvm::object; -template<endianness target_endianness, std::size_t max_alignment, bool is64Bits> +template<class ELFT> void printProgramHeaders( - const ELFObjectFile<target_endianness, max_alignment, is64Bits> *o) { - typedef ELFObjectFile<target_endianness, max_alignment, is64Bits> ELFO; + const ELFObjectFile<ELFT> *o) { + typedef ELFObjectFile<ELFT> ELFO; outs() << "Program Header:\n"; for (typename ELFO::Elf_Phdr_Iter pi = o->begin_program_headers(), pe = o->end_program_headers(); @@ -40,11 +39,20 @@ void printProgramHeaders( case ELF::PT_GNU_EH_FRAME: outs() << "EH_FRAME "; break; + case ELF::PT_INTERP: + outs() << " INTERP "; + break; + case ELF::PT_DYNAMIC: + outs() << " DYNAMIC "; + break; + case ELF::PT_PHDR: + outs() << " PHDR "; + break; default: outs() << " UNKNOWN "; } - const char *Fmt = is64Bits ? "0x%016" PRIx64 " " : "0x%08" PRIx64 " "; + const char *Fmt = ELFT::Is64Bits ? "0x%016" PRIx64 " " : "0x%08" PRIx64 " "; outs() << "off " << format(Fmt, (uint64_t)pi->p_offset) @@ -68,22 +76,22 @@ void printProgramHeaders( void llvm::printELFFileHeader(const object::ObjectFile *Obj) { // Little-endian 32-bit - if (const ELFObjectFile<support::little, 4, false> *ELFObj = - dyn_cast<ELFObjectFile<support::little, 4, false> >(Obj)) + if (const ELFObjectFile<ELFType<support::little, 4, false> > *ELFObj = + dyn_cast<ELFObjectFile<ELFType<support::little, 4, false> > >(Obj)) printProgramHeaders(ELFObj); // Big-endian 32-bit - if (const ELFObjectFile<support::big, 4, false> *ELFObj = - dyn_cast<ELFObjectFile<support::big, 4, false> >(Obj)) + if (const ELFObjectFile<ELFType<support::big, 4, false> > *ELFObj = + dyn_cast<ELFObjectFile<ELFType<support::big, 4, false> > >(Obj)) printProgramHeaders(ELFObj); // Little-endian 64-bit - if (const ELFObjectFile<support::little, 8, true> *ELFObj = - dyn_cast<ELFObjectFile<support::little, 8, true> >(Obj)) + if (const ELFObjectFile<ELFType<support::little, 8, true> > *ELFObj = + dyn_cast<ELFObjectFile<ELFType<support::little, 8, true> > >(Obj)) printProgramHeaders(ELFObj); // Big-endian 64-bit - if (const ELFObjectFile<support::big, 8, true> *ELFObj = - dyn_cast<ELFObjectFile<support::big, 8, true> >(Obj)) + if (const ELFObjectFile<ELFType<support::big, 8, true> > *ELFObj = + dyn_cast<ELFObjectFile<ELFType<support::big, 8, true> > >(Obj)) printProgramHeaders(ELFObj); } diff --git a/tools/llvm-objdump/llvm-objdump.cpp b/tools/llvm-objdump/llvm-objdump.cpp index 9958dad..322bd21 100644 --- a/tools/llvm-objdump/llvm-objdump.cpp +++ b/tools/llvm-objdump/llvm-objdump.cpp @@ -11,6 +11,9 @@ // dumps out a plethora of information about an object file depending on the // flags. // +// The flags and output of this program should be near identical to those of +// binutils objdump. 
+// //===----------------------------------------------------------------------===// #include "llvm-objdump.h" @@ -28,7 +31,6 @@ #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/Object/Archive.h" #include "llvm/Object/COFF.h" -#include "llvm/Object/ELF.h" #include "llvm/Object/MachO.h" #include "llvm/Object/ObjectFile.h" #include "llvm/Support/Casting.h" @@ -442,7 +444,7 @@ static void PrintSectionHeaders(const ObjectFile *o) { if (error(si->isBSS(BSS))) return; std::string Type = (std::string(Text ? "TEXT " : "") + (Data ? "DATA " : "") + (BSS ? "BSS" : "")); - outs() << format("%3d %-13s %09" PRIx64 " %017" PRIx64 " %s\n", + outs() << format("%3d %-13s %08" PRIx64 " %016" PRIx64 " %s\n", i, Name.str().c_str(), Size, Address, Type.c_str()); ++i; } @@ -479,7 +481,7 @@ static void PrintSectionContents(const ObjectFile *o) { // Print ascii. outs() << " "; for (std::size_t i = 0; i < 16 && addr + i < end; ++i) { - if (std::isprint(Contents[addr + i] & 0xFF)) + if (std::isprint(static_cast<unsigned char>(Contents[addr + i]) & 0xFF)) outs() << Contents[addr + i]; else outs() << "."; @@ -572,7 +574,10 @@ static void PrintSymbolTable(const ObjectFile *o) { else if (Type == SymbolRef::ST_Function) FileFunc = 'F'; - outs() << format("%08" PRIx64, Address) << " " + const char *Fmt = o->getBytesInAddress() > 4 ? "%016" PRIx64 : + "%08" PRIx64; + + outs() << format(Fmt, Address) << " " << GlobLoc // Local -> 'l', Global -> 'g', Neither -> ' ' << (Weak ? 'w' : ' ') // Weak? << ' ' // Constructor. Not supported yet. diff --git a/tools/llvm-readobj/CMakeLists.txt b/tools/llvm-readobj/CMakeLists.txt index be80469..676c23d 100644 --- a/tools/llvm-readobj/CMakeLists.txt +++ b/tools/llvm-readobj/CMakeLists.txt @@ -1,5 +1,6 @@ set(LLVM_LINK_COMPONENTS archive bitreader object) add_llvm_tool(llvm-readobj + ELF.cpp llvm-readobj.cpp ) diff --git a/tools/llvm-readobj/ELF.cpp b/tools/llvm-readobj/ELF.cpp new file mode 100644 index 0000000..07f15b3 --- /dev/null +++ b/tools/llvm-readobj/ELF.cpp @@ -0,0 +1,196 @@ +//===- llvm-readobj/ELF.cpp - ELF Specific Dumper -------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm-readobj.h" + +#include "llvm/Object/ELF.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/Format.h" + +namespace llvm { +using namespace object; +using namespace ELF; + +const char *getTypeString(uint64_t Type) { + switch (Type) { + case DT_BIND_NOW: + return "(BIND_NOW)"; + case DT_DEBUG: + return "(DEBUG)"; + case DT_FINI: + return "(FINI)"; + case DT_FINI_ARRAY: + return "(FINI_ARRAY)"; + case DT_FINI_ARRAYSZ: + return "(FINI_ARRAYSZ)"; + case DT_FLAGS: + return "(FLAGS)"; + case DT_HASH: + return "(HASH)"; + case DT_INIT: + return "(INIT)"; + case DT_INIT_ARRAY: + return "(INIT_ARRAY)"; + case DT_INIT_ARRAYSZ: + return "(INIT_ARRAYSZ)"; + case DT_PREINIT_ARRAY: + return "(PREINIT_ARRAY)"; + case DT_PREINIT_ARRAYSZ: + return "(PREINIT_ARRAYSZ)"; + case DT_JMPREL: + return "(JMPREL)"; + case DT_NEEDED: + return "(NEEDED)"; + case DT_NULL: + return "(NULL)"; + case DT_PLTGOT: + return "(PLTGOT)"; + case DT_PLTREL: + return "(PLTREL)"; + case DT_PLTRELSZ: + return "(PLTRELSZ)"; + case DT_REL: + return "(REL)"; + case DT_RELA: + return "(RELA)"; + case DT_RELENT: + return "(RELENT)"; + case DT_RELSZ: + return "(RELSZ)"; + case DT_RELAENT: + return "(RELAENT)"; + case DT_RELASZ: + return "(RELASZ)"; + case DT_RPATH: + return "(RPATH)"; + case DT_RUNPATH: + return "(RUNPATH)"; + case DT_SONAME: + return "(SONAME)"; + case DT_STRSZ: + return "(STRSZ)"; + case DT_STRTAB: + return "(STRTAB)"; + case DT_SYMBOLIC: + return "(SYMBOLIC)"; + case DT_SYMENT: + return "(SYMENT)"; + case DT_SYMTAB: + return "(SYMTAB)"; + case DT_TEXTREL: + return "(TEXTREL)"; + default: + return "unknown"; + } +} + +template <class ELFT> +void printValue(const ELFObjectFile<ELFT> *O, uint64_t Type, uint64_t Value, + bool Is64, raw_ostream &OS) { + switch (Type) { + case DT_PLTREL: + if (Value == DT_REL) { + OS << "REL"; + break; + } else if (Value == DT_RELA) { + OS << "RELA"; + break; + } + // Fallthrough. + case DT_PLTGOT: + case DT_HASH: + case DT_STRTAB: + case DT_SYMTAB: + case DT_RELA: + case DT_INIT: + case DT_FINI: + case DT_REL: + case DT_JMPREL: + case DT_INIT_ARRAY: + case DT_FINI_ARRAY: + case DT_PREINIT_ARRAY: + case DT_DEBUG: + case DT_NULL: + OS << format("0x%" PRIx64, Value); + break; + case DT_PLTRELSZ: + case DT_RELASZ: + case DT_RELAENT: + case DT_STRSZ: + case DT_SYMENT: + case DT_RELSZ: + case DT_RELENT: + case DT_INIT_ARRAYSZ: + case DT_FINI_ARRAYSZ: + case DT_PREINIT_ARRAYSZ: + OS << Value << " (bytes)"; + break; + case DT_NEEDED: + OS << "Shared library: [" + << O->getString(O->getDynamicStringTableSectionHeader(), Value) << "]"; + break; + case DT_SONAME: + OS << "Library soname: [" + << O->getString(O->getDynamicStringTableSectionHeader(), Value) << "]"; + break; + } +} + +template <class ELFT> +ErrorOr<void> dumpDynamicTable(const ELFObjectFile<ELFT> *O, raw_ostream &OS) { + typedef ELFObjectFile<ELFT> ELFO; + typedef typename ELFO::Elf_Dyn_iterator EDI; + EDI Start = O->begin_dynamic_table(), + End = O->end_dynamic_table(true); + + if (Start == End) + return error_code::success(); + + ptrdiff_t Total = std::distance(Start, End); + OS << "Dynamic section contains " << Total << " entries\n"; + + bool Is64 = O->getBytesInAddress() == 8; + + OS << " Tag" << (Is64 ? " " : " ") << "Type" + << " " << "Name/Value\n"; + for (; Start != End; ++Start) { + OS << " " + << format(Is64 ? 
"0x%016" PRIx64 : "0x%08" PRIx64, Start->getTag()) + << " " << format("%-21s", getTypeString(Start->getTag())); + printValue(O, Start->getTag(), Start->getVal(), Is64, OS); + OS << "\n"; + } + + OS << " Total: " << Total << "\n\n"; + return error_code::success(); +} + +ErrorOr<void> dumpELFDynamicTable(ObjectFile *O, raw_ostream &OS) { + // Little-endian 32-bit + if (const ELFObjectFile<ELFType<support::little, 4, false> > *ELFObj = + dyn_cast<ELFObjectFile<ELFType<support::little, 4, false> > >(O)) + return dumpDynamicTable(ELFObj, OS); + + // Big-endian 32-bit + if (const ELFObjectFile<ELFType<support::big, 4, false> > *ELFObj = + dyn_cast<ELFObjectFile<ELFType<support::big, 4, false> > >(O)) + return dumpDynamicTable(ELFObj, OS); + + // Little-endian 64-bit + if (const ELFObjectFile<ELFType<support::little, 8, true> > *ELFObj = + dyn_cast<ELFObjectFile<ELFType<support::little, 8, true> > >(O)) + return dumpDynamicTable(ELFObj, OS); + + // Big-endian 64-bit + if (const ELFObjectFile<ELFType<support::big, 8, true> > *ELFObj = + dyn_cast<ELFObjectFile<ELFType<support::big, 8, true> > >(O)) + return dumpDynamicTable(ELFObj, OS); + return error_code(object_error::invalid_file_type); +} +} // end namespace llvm diff --git a/tools/llvm-readobj/llvm-readobj.cpp b/tools/llvm-readobj/llvm-readobj.cpp index d22ecd1..8f0917f 100644 --- a/tools/llvm-readobj/llvm-readobj.cpp +++ b/tools/llvm-readobj/llvm-readobj.cpp @@ -7,15 +7,20 @@ // //===----------------------------------------------------------------------===// // -// This program is a utility that works like traditional Unix "readelf", -// except that it can handle any type of object file recognized by lib/Object. +// This is a tool similar to readelf, except it works on multiple object file +// formats. The main purpose of this tool is to provide detailed output suitable +// for FileCheck. // -// It makes use of the generic ObjectFile interface. +// Flags should be similar to readelf where supported, but the output format +// does not need to be identical. The point is to not make users learn yet +// another set of flags. // -// Caution: This utility is new, experimental, unsupported, and incomplete. +// Output should be specialized for each format where appropriate. 
// //===----------------------------------------------------------------------===// +#include "llvm-readobj.h" + #include "llvm/ADT/Triple.h" #include "llvm/Analysis/Verifier.h" #include "llvm/Object/ELF.h" @@ -237,7 +242,8 @@ int main(int argc, char** argv) { return 1; } - ObjectFile *obj = ObjectFile::createObjectFile(File.take()); + OwningPtr<ObjectFile> o(ObjectFile::createObjectFile(File.take())); + ObjectFile *obj = o.get(); if (!obj) { errs() << InputFilename << ": Object type not recognized\n"; } @@ -259,6 +265,13 @@ int main(int argc, char** argv) { dump(obj, &dumpSection, obj->begin_sections(), obj->end_sections(), "Section iteration failed"); + if (obj->isELF()) { + if (ErrorOr<void> e = dumpELFDynamicTable(obj, outs())) + ; + else + errs() << "InputFilename" << ": " << error_code(e).message() << "\n"; + } + outs() << "Libraries needed:\n"; dump(obj, &dumpLibrary, obj->begin_libraries_needed(), obj->end_libraries_needed(), "Needed libraries iteration failed"); diff --git a/tools/llvm-readobj/llvm-readobj.h b/tools/llvm-readobj/llvm-readobj.h new file mode 100644 index 0000000..cf492b2 --- /dev/null +++ b/tools/llvm-readobj/llvm-readobj.h @@ -0,0 +1,22 @@ +//===- llvm-readobj.h - Dump contents of an Object File -------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_READ_OBJ_H +#define LLVM_TOOLS_READ_OBJ_H + +#include "llvm/Support/ErrorOr.h" + +namespace llvm { +namespace object { class ObjectFile; } +class raw_ostream; + +ErrorOr<void> dumpELFDynamicTable(object::ObjectFile *O, raw_ostream &OS); +} // end namespace llvm + +#endif diff --git a/tools/llvm-rtdyld/CMakeLists.txt b/tools/llvm-rtdyld/CMakeLists.txt index 17e2c3e..8d161d3 100644 --- a/tools/llvm-rtdyld/CMakeLists.txt +++ b/tools/llvm-rtdyld/CMakeLists.txt @@ -1,4 +1,4 @@ -set(LLVM_LINK_COMPONENTS ${LLVM_TARGETS_TO_BUILD} support MC object RuntimeDyld JIT) +set(LLVM_LINK_COMPONENTS ${LLVM_TARGETS_TO_BUILD} support MC object RuntimeDyld JIT debuginfo) add_llvm_tool(llvm-rtdyld llvm-rtdyld.cpp diff --git a/tools/llvm-rtdyld/Makefile b/tools/llvm-rtdyld/Makefile index 30fbee0..fabdd68 100644 --- a/tools/llvm-rtdyld/Makefile +++ b/tools/llvm-rtdyld/Makefile @@ -9,7 +9,7 @@ LEVEL := ../.. TOOLNAME := llvm-rtdyld -LINK_COMPONENTS := all-targets support MC object RuntimeDyld JIT +LINK_COMPONENTS := all-targets support MC object RuntimeDyld JIT debuginfo # This tool has no plugins, optimize startup time. 
TOOL_NO_EXPORTS := 1 diff --git a/tools/llvm-rtdyld/llvm-rtdyld.cpp b/tools/llvm-rtdyld/llvm-rtdyld.cpp index ec63c9b..4d8d345 100644 --- a/tools/llvm-rtdyld/llvm-rtdyld.cpp +++ b/tools/llvm-rtdyld/llvm-rtdyld.cpp @@ -13,6 +13,7 @@ #include "llvm/ADT/OwningPtr.h" #include "llvm/ADT/StringMap.h" +#include "llvm/DebugInfo/DIContext.h" #include "llvm/ExecutionEngine/ObjectBuffer.h" #include "llvm/ExecutionEngine/ObjectImage.h" #include "llvm/ExecutionEngine/RuntimeDyld.h" @@ -31,7 +32,8 @@ InputFileList(cl::Positional, cl::ZeroOrMore, cl::desc("<input file>")); enum ActionType { - AC_Execute + AC_Execute, + AC_PrintLineInfo }; static cl::opt<ActionType> @@ -39,6 +41,8 @@ Action(cl::desc("Action to perform:"), cl::init(AC_Execute), cl::values(clEnumValN(AC_Execute, "execute", "Load, link, and execute the inputs."), + clEnumValN(AC_PrintLineInfo, "printline", + "Load, link, and print line information for each function."), clEnumValEnd)); static cl::opt<std::string> @@ -114,6 +118,66 @@ static int Error(const Twine &Msg) { /* *** */ +static int printLineInfoForInput() { + // If we don't have any input files, read from stdin. + if (!InputFileList.size()) + InputFileList.push_back("-"); + for(unsigned i = 0, e = InputFileList.size(); i != e; ++i) { + // Instantiate a dynamic linker. + TrivialMemoryManager *MemMgr = new TrivialMemoryManager; + RuntimeDyld Dyld(MemMgr); + + // Load the input memory buffer. + OwningPtr<MemoryBuffer> InputBuffer; + OwningPtr<ObjectImage> LoadedObject; + if (error_code ec = MemoryBuffer::getFileOrSTDIN(InputFileList[i], + InputBuffer)) + return Error("unable to read input: '" + ec.message() + "'"); + + // Load the object file + LoadedObject.reset(Dyld.loadObject(new ObjectBuffer(InputBuffer.take()))); + if (!LoadedObject) { + return Error(Dyld.getErrorString()); + } + + // Resolve all the relocations we can. + Dyld.resolveRelocations(); + + OwningPtr<DIContext> Context(DIContext::getDWARFContext(LoadedObject->getObjectFile())); + + // Use symbol info to iterate functions in the object. + error_code ec; + for (object::symbol_iterator I = LoadedObject->begin_symbols(), + E = LoadedObject->end_symbols(); + I != E && !ec; + I.increment(ec)) { + object::SymbolRef::Type SymType; + if (I->getType(SymType)) continue; + if (SymType == object::SymbolRef::ST_Function) { + StringRef Name; + uint64_t Addr; + uint64_t Size; + if (I->getName(Name)) continue; + if (I->getAddress(Addr)) continue; + if (I->getSize(Size)) continue; + + outs() << "Function: " << Name << ", Size = " << Size << "\n"; + + DILineInfoTable Lines = Context->getLineInfoForAddressRange(Addr, Size); + DILineInfoTable::iterator Begin = Lines.begin(); + DILineInfoTable::iterator End = Lines.end(); + for (DILineInfoTable::iterator It = Begin; It != End; ++It) { + outs() << " Line info @ " << It->first - Addr << ": " + << It->second.getFileName() + << ", line:" << It->second.getLine() << "\n"; + } + } + } + } + + return 0; +} + static int executeInput() { // Instantiate a dynamic linker. 
TrivialMemoryManager *MemMgr = new TrivialMemoryManager; @@ -180,5 +244,7 @@ int main(int argc, char **argv) { switch (Action) { case AC_Execute: return executeInput(); + case AC_PrintLineInfo: + return printLineInfoForInput(); } } diff --git a/tools/llvm-stress/llvm-stress.cpp b/tools/llvm-stress/llvm-stress.cpp index bb15c6b..fbda1b7 100644 --- a/tools/llvm-stress/llvm-stress.cpp +++ b/tools/llvm-stress/llvm-stress.cpp @@ -12,6 +12,7 @@ // //===----------------------------------------------------------------------===// #include "llvm/IR/LLVMContext.h" +#include "llvm/ADT/OwningPtr.h" #include "llvm/Analysis/CallGraphSCCPass.h" #include "llvm/Analysis/Verifier.h" #include "llvm/Assembly/PrintModulePass.h" @@ -26,7 +27,6 @@ #include "llvm/Support/PrettyStackTrace.h" #include "llvm/Support/ToolOutputFile.h" #include <algorithm> -#include <memory> #include <set> #include <sstream> #include <vector> @@ -379,9 +379,7 @@ struct ConstModifier: public Modifier { RandomBits[i] = Ran->Rand64(); APInt RandomInt(Ty->getPrimitiveSizeInBits(), makeArrayRef(RandomBits)); - - bool isIEEE = !Ty->isX86_FP80Ty() && !Ty->isPPC_FP128Ty(); - APFloat RandomFloat(RandomInt, isIEEE); + APFloat RandomFloat(Ty->getFltSemantics(), RandomInt); if (Ran->Rand() & 1) return PT->push_back(ConstantFP::getNullValue(Ty)); @@ -624,15 +622,15 @@ void FillFunction(Function *F, Random &R) { // List of modifiers which add new random instructions. std::vector<Modifier*> Modifiers; - std::auto_ptr<Modifier> LM(new LoadModifier(BB, &PT, &R)); - std::auto_ptr<Modifier> SM(new StoreModifier(BB, &PT, &R)); - std::auto_ptr<Modifier> EE(new ExtractElementModifier(BB, &PT, &R)); - std::auto_ptr<Modifier> SHM(new ShuffModifier(BB, &PT, &R)); - std::auto_ptr<Modifier> IE(new InsertElementModifier(BB, &PT, &R)); - std::auto_ptr<Modifier> BM(new BinModifier(BB, &PT, &R)); - std::auto_ptr<Modifier> CM(new CastModifier(BB, &PT, &R)); - std::auto_ptr<Modifier> SLM(new SelectModifier(BB, &PT, &R)); - std::auto_ptr<Modifier> PM(new CmpModifier(BB, &PT, &R)); + OwningPtr<Modifier> LM(new LoadModifier(BB, &PT, &R)); + OwningPtr<Modifier> SM(new StoreModifier(BB, &PT, &R)); + OwningPtr<Modifier> EE(new ExtractElementModifier(BB, &PT, &R)); + OwningPtr<Modifier> SHM(new ShuffModifier(BB, &PT, &R)); + OwningPtr<Modifier> IE(new InsertElementModifier(BB, &PT, &R)); + OwningPtr<Modifier> BM(new BinModifier(BB, &PT, &R)); + OwningPtr<Modifier> CM(new CastModifier(BB, &PT, &R)); + OwningPtr<Modifier> SLM(new SelectModifier(BB, &PT, &R)); + OwningPtr<Modifier> PM(new CmpModifier(BB, &PT, &R)); Modifiers.push_back(LM.get()); Modifiers.push_back(SM.get()); Modifiers.push_back(EE.get()); @@ -686,7 +684,7 @@ int main(int argc, char **argv) { cl::ParseCommandLineOptions(argc, argv, "llvm codegen stress-tester\n"); llvm_shutdown_obj Y; - std::auto_ptr<Module> M(new Module("/tmp/autogen.bc", getGlobalContext())); + OwningPtr<Module> M(new Module("/tmp/autogen.bc", getGlobalContext())); Function *F = GenEmptyFunction(M.get()); // Pick an initial seed value diff --git a/tools/llvm-symbolizer/CMakeLists.txt b/tools/llvm-symbolizer/CMakeLists.txt index 5e27463..5967b89 100644 --- a/tools/llvm-symbolizer/CMakeLists.txt +++ b/tools/llvm-symbolizer/CMakeLists.txt @@ -9,5 +9,6 @@ set(LLVM_LINK_COMPONENTS ) add_llvm_tool(llvm-symbolizer + LLVMSymbolize.cpp llvm-symbolizer.cpp ) diff --git a/tools/llvm-symbolizer/LLVMSymbolize.cpp b/tools/llvm-symbolizer/LLVMSymbolize.cpp new file mode 100644 index 0000000..86ea34b --- /dev/null +++ 
b/tools/llvm-symbolizer/LLVMSymbolize.cpp @@ -0,0 +1,287 @@ +//===-- LLVMSymbolize.cpp -------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Implementation for LLVM symbolization library. +// +//===----------------------------------------------------------------------===// + +#include "LLVMSymbolize.h" +#include "llvm/Object/MachO.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/Path.h" + +#include <sstream> + +namespace llvm { +namespace symbolize { + +static bool error(error_code ec) { + if (!ec) + return false; + errs() << "LLVMSymbolizer: error reading file: " << ec.message() << ".\n"; + return true; +} + +static uint32_t +getDILineInfoSpecifierFlags(const LLVMSymbolizer::Options &Opts) { + uint32_t Flags = llvm::DILineInfoSpecifier::FileLineInfo | + llvm::DILineInfoSpecifier::AbsoluteFilePath; + if (Opts.PrintFunctions) + Flags |= llvm::DILineInfoSpecifier::FunctionName; + return Flags; +} + +static void patchFunctionNameInDILineInfo(const std::string &NewFunctionName, + DILineInfo &LineInfo) { + std::string FileName = LineInfo.getFileName(); + LineInfo = DILineInfo(StringRef(FileName), StringRef(NewFunctionName), + LineInfo.getLine(), LineInfo.getColumn()); +} + +ModuleInfo::ModuleInfo(ObjectFile *Obj, DIContext *DICtx) + : Module(Obj), DebugInfoContext(DICtx) { + error_code ec; + for (symbol_iterator si = Module->begin_symbols(), se = Module->end_symbols(); + si != se; si.increment(ec)) { + if (error(ec)) + return; + SymbolRef::Type SymbolType; + if (error(si->getType(SymbolType))) + continue; + if (SymbolType != SymbolRef::ST_Function && + SymbolType != SymbolRef::ST_Data) + continue; + uint64_t SymbolAddress; + if (error(si->getAddress(SymbolAddress)) || + SymbolAddress == UnknownAddressOrSize) + continue; + uint64_t SymbolSize; + if (error(si->getSize(SymbolSize)) || SymbolSize == UnknownAddressOrSize) + continue; + StringRef SymbolName; + if (error(si->getName(SymbolName))) + continue; + // FIXME: If a function has alias, there are two entries in symbol table + // with same address size. Make sure we choose the correct one. + SymbolMapTy &M = SymbolType == SymbolRef::ST_Function ? Functions : Objects; + SymbolDesc SD = { SymbolAddress, SymbolAddress + SymbolSize }; + M.insert(std::make_pair(SD, SymbolName)); + } +} + +bool ModuleInfo::getNameFromSymbolTable(SymbolRef::Type Type, uint64_t Address, + std::string &Name, uint64_t &Addr, + uint64_t &Size) const { + const SymbolMapTy &M = Type == SymbolRef::ST_Function ? Functions : Objects; + SymbolDesc SD = { Address, Address + 1 }; + SymbolMapTy::const_iterator it = M.find(SD); + if (it == M.end()) + return false; + if (Address < it->first.Addr || Address >= it->first.AddrEnd) + return false; + Name = it->second.str(); + Addr = it->first.Addr; + Size = it->first.AddrEnd - it->first.Addr; + return true; +} + +DILineInfo ModuleInfo::symbolizeCode( + uint64_t ModuleOffset, const LLVMSymbolizer::Options &Opts) const { + DILineInfo LineInfo; + if (DebugInfoContext) { + LineInfo = DebugInfoContext->getLineInfoForAddress( + ModuleOffset, getDILineInfoSpecifierFlags(Opts)); + } + // Override function name from symbol table if necessary. 
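// Editorial note (not part of the patch): the Functions/Objects maps above
// behave as interval maps. With the ordering the header defines for
// SymbolDesc (s1.AddrEnd <= s2.Addr), two ranges compare equivalent exactly
// when they overlap, so the one-byte probe { Address, Address + 1 } makes
// map::find() return the symbol whose [Addr, AddrEnd) contains Address; this
// remains a valid strict weak ordering as long as stored ranges are
// disjoint. A standalone sketch of the trick (addresses invented):
//
//   struct Range { uint64_t Begin, End; };  // half-open [Begin, End)
//   bool operator<(const Range &A, const Range &B) { return A.End <= B.Begin; }
//
//   std::map<Range, const char *> Syms;
//   Range Main = { 0x400000, 0x400080 };
//   Syms.insert(std::make_pair(Main, "main"));
//   Range Probe = { 0x400010, 0x400011 };
//   assert(Syms.find(Probe) != Syms.end());  // probe overlaps main's range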
+ if (Opts.PrintFunctions && Opts.UseSymbolTable) { + std::string FunctionName; + uint64_t Start, Size; + if (getNameFromSymbolTable(SymbolRef::ST_Function, ModuleOffset, + FunctionName, Start, Size)) { + patchFunctionNameInDILineInfo(FunctionName, LineInfo); + } + } + return LineInfo; +} + +DIInliningInfo ModuleInfo::symbolizeInlinedCode( + uint64_t ModuleOffset, const LLVMSymbolizer::Options &Opts) const { + DIInliningInfo InlinedContext; + if (DebugInfoContext) { + InlinedContext = DebugInfoContext->getInliningInfoForAddress( + ModuleOffset, getDILineInfoSpecifierFlags(Opts)); + } + // Make sure there is at least one frame in context. + if (InlinedContext.getNumberOfFrames() == 0) { + InlinedContext.addFrame(DILineInfo()); + } + // Override the function name in lower frame with name from symbol table. + if (Opts.PrintFunctions && Opts.UseSymbolTable) { + DIInliningInfo PatchedInlinedContext; + for (uint32_t i = 0, n = InlinedContext.getNumberOfFrames(); i < n; i++) { + DILineInfo LineInfo = InlinedContext.getFrame(i); + if (i == n - 1) { + std::string FunctionName; + uint64_t Start, Size; + if (getNameFromSymbolTable(SymbolRef::ST_Function, ModuleOffset, + FunctionName, Start, Size)) { + patchFunctionNameInDILineInfo(FunctionName, LineInfo); + } + } + PatchedInlinedContext.addFrame(LineInfo); + } + InlinedContext = PatchedInlinedContext; + } + return InlinedContext; +} + +bool ModuleInfo::symbolizeData(uint64_t ModuleOffset, std::string &Name, + uint64_t &Start, uint64_t &Size) const { + return getNameFromSymbolTable(SymbolRef::ST_Data, ModuleOffset, Name, Start, + Size); +} + +const char LLVMSymbolizer::kBadString[] = "??"; + +std::string LLVMSymbolizer::symbolizeCode(const std::string &ModuleName, + uint64_t ModuleOffset) { + ModuleInfo *Info = getOrCreateModuleInfo(ModuleName); + if (Info == 0) + return printDILineInfo(DILineInfo()); + if (Opts.PrintInlining) { + DIInliningInfo InlinedContext = + Info->symbolizeInlinedCode(ModuleOffset, Opts); + uint32_t FramesNum = InlinedContext.getNumberOfFrames(); + assert(FramesNum > 0); + std::string Result; + for (uint32_t i = 0; i < FramesNum; i++) { + DILineInfo LineInfo = InlinedContext.getFrame(i); + Result += printDILineInfo(LineInfo); + } + return Result; + } + DILineInfo LineInfo = Info->symbolizeCode(ModuleOffset, Opts); + return printDILineInfo(LineInfo); +} + +std::string LLVMSymbolizer::symbolizeData(const std::string &ModuleName, + uint64_t ModuleOffset) { + std::string Name = kBadString; + uint64_t Start = 0; + uint64_t Size = 0; + if (Opts.UseSymbolTable) { + if (ModuleInfo *Info = getOrCreateModuleInfo(ModuleName)) { + if (Info->symbolizeData(ModuleOffset, Name, Start, Size)) + DemangleName(Name); + } + } + std::stringstream ss; + ss << Name << "\n" << Start << " " << Size << "\n"; + return ss.str(); +} + +// Returns true if the object endianness is known. +static bool getObjectEndianness(const ObjectFile *Obj, bool &IsLittleEndian) { + // FIXME: Implement this when libLLVMObject allows to do it easily. 
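// Editorial note (not part of the patch): symbolizeData() above answers with
// exactly two lines, "<name>" then "<start> <size>" in decimal; e.g. for an
// invented global:
//
//   g_counter
//   6296128 4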
+ IsLittleEndian = true; + return true; +} + +static ObjectFile *getObjectFile(const std::string &Path) { + OwningPtr<MemoryBuffer> Buff; + if (error_code ec = MemoryBuffer::getFile(Path, Buff)) + error(ec); + return ObjectFile::createObjectFile(Buff.take()); +} + +static std::string getDarwinDWARFResourceForModule(const std::string &Path) { + StringRef Basename = sys::path::filename(Path); + const std::string &DSymDirectory = Path + ".dSYM"; + SmallString<16> ResourceName = StringRef(DSymDirectory); + sys::path::append(ResourceName, "Contents", "Resources", "DWARF"); + sys::path::append(ResourceName, Basename); + return ResourceName.str(); +} + +ModuleInfo * +LLVMSymbolizer::getOrCreateModuleInfo(const std::string &ModuleName) { + ModuleMapTy::iterator I = Modules.find(ModuleName); + if (I != Modules.end()) + return I->second; + + ObjectFile *Obj = getObjectFile(ModuleName); + if (Obj == 0) { + // Module name doesn't point to a valid object file. + Modules.insert(make_pair(ModuleName, (ModuleInfo *)0)); + return 0; + } + + DIContext *Context = 0; + bool IsLittleEndian; + if (getObjectEndianness(Obj, IsLittleEndian)) { + // On Darwin we may find DWARF in separate object file in + // resource directory. + ObjectFile *DbgObj = Obj; + if (isa<MachOObjectFile>(Obj)) { + const std::string &ResourceName = + getDarwinDWARFResourceForModule(ModuleName); + ObjectFile *ResourceObj = getObjectFile(ResourceName); + if (ResourceObj != 0) + DbgObj = ResourceObj; + } + Context = DIContext::getDWARFContext(DbgObj); + assert(Context); + } + + ModuleInfo *Info = new ModuleInfo(Obj, Context); + Modules.insert(make_pair(ModuleName, Info)); + return Info; +} + +std::string LLVMSymbolizer::printDILineInfo(DILineInfo LineInfo) const { + // By default, DILineInfo contains "<invalid>" for function/filename it + // cannot fetch. We replace it to "??" to make our output closer to addr2line. + static const std::string kDILineInfoBadString = "<invalid>"; + std::stringstream Result; + if (Opts.PrintFunctions) { + std::string FunctionName = LineInfo.getFunctionName(); + if (FunctionName == kDILineInfoBadString) + FunctionName = kBadString; + DemangleName(FunctionName); + Result << FunctionName << "\n"; + } + std::string Filename = LineInfo.getFileName(); + if (Filename == kDILineInfoBadString) + Filename = kBadString; + Result << Filename << ":" << LineInfo.getLine() << ":" << LineInfo.getColumn() + << "\n"; + return Result.str(); +} + +#if !defined(_MSC_VER) +// Assume that __cxa_demangle is provided by libcxxabi (except for Windows). +extern "C" char *__cxa_demangle(const char *mangled_name, char *output_buffer, + size_t *length, int *status); +#endif + +void LLVMSymbolizer::DemangleName(std::string &Name) const { +#if !defined(_MSC_VER) + if (!Opts.Demangle) + return; + int status = 0; + char *DemangledName = __cxa_demangle(Name.c_str(), 0, 0, &status); + if (status != 0) + return; + Name = DemangledName; + free(DemangledName); +#endif +} + +} // namespace symbolize +} // namespace llvm diff --git a/tools/llvm-symbolizer/LLVMSymbolize.h b/tools/llvm-symbolizer/LLVMSymbolize.h new file mode 100644 index 0000000..e6220aa --- /dev/null +++ b/tools/llvm-symbolizer/LLVMSymbolize.h @@ -0,0 +1,97 @@ +//===-- LLVMSymbolize.h ----------------------------------------- C++ -----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// Header for LLVM symbolization library. +// +//===----------------------------------------------------------------------===// +#ifndef LLVM_SYMBOLIZE_H +#define LLVM_SYMBOLIZE_H + +#include "llvm/ADT/OwningPtr.h" +#include "llvm/DebugInfo/DIContext.h" +#include "llvm/Object/ObjectFile.h" +#include "llvm/Support/MemoryBuffer.h" +#include <map> +#include <string> + +namespace llvm { + +using namespace object; + +namespace symbolize { + +class ModuleInfo; + +class LLVMSymbolizer { +public: + struct Options { + bool UseSymbolTable : 1; + bool PrintFunctions : 1; + bool PrintInlining : 1; + bool Demangle : 1; + Options(bool UseSymbolTable = true, bool PrintFunctions = true, + bool PrintInlining = true, bool Demangle = true) + : UseSymbolTable(UseSymbolTable), PrintFunctions(PrintFunctions), + PrintInlining(PrintInlining), Demangle(Demangle) { + } + }; + + LLVMSymbolizer(const Options &Opts = Options()) : Opts(Opts) {} + + // Returns the result of symbolization for module name/offset as + // a string (possibly containing newlines). + std::string + symbolizeCode(const std::string &ModuleName, uint64_t ModuleOffset); + std::string + symbolizeData(const std::string &ModuleName, uint64_t ModuleOffset); +private: + ModuleInfo *getOrCreateModuleInfo(const std::string &ModuleName); + std::string printDILineInfo(DILineInfo LineInfo) const; + void DemangleName(std::string &Name) const; + + typedef std::map<std::string, ModuleInfo *> ModuleMapTy; + ModuleMapTy Modules; + Options Opts; + static const char kBadString[]; +}; + +class ModuleInfo { +public: + ModuleInfo(ObjectFile *Obj, DIContext *DICtx); + + DILineInfo symbolizeCode(uint64_t ModuleOffset, + const LLVMSymbolizer::Options &Opts) const; + DIInliningInfo symbolizeInlinedCode( + uint64_t ModuleOffset, const LLVMSymbolizer::Options &Opts) const; + bool symbolizeData(uint64_t ModuleOffset, std::string &Name, uint64_t &Start, + uint64_t &Size) const; + +private: + bool getNameFromSymbolTable(SymbolRef::Type Type, uint64_t Address, + std::string &Name, uint64_t &Addr, + uint64_t &Size) const; + OwningPtr<ObjectFile> Module; + OwningPtr<DIContext> DebugInfoContext; + + struct SymbolDesc { + uint64_t Addr; + uint64_t AddrEnd; + friend bool operator<(const SymbolDesc &s1, const SymbolDesc &s2) { + return s1.AddrEnd <= s2.Addr; + } + }; + typedef std::map<SymbolDesc, StringRef> SymbolMapTy; + SymbolMapTy Functions; + SymbolMapTy Objects; +}; + +} // namespace symbolize +} // namespace llvm + +#endif // LLVM_SYMBOLIZE_H diff --git a/tools/llvm-symbolizer/llvm-symbolizer.cpp b/tools/llvm-symbolizer/llvm-symbolizer.cpp index 6a5eb4a..d039ec6 100644 --- a/tools/llvm-symbolizer/llvm-symbolizer.cpp +++ b/tools/llvm-symbolizer/llvm-symbolizer.cpp @@ -15,292 +15,69 @@ // //===----------------------------------------------------------------------===// -#include "llvm/ADT/OwningPtr.h" +#include "LLVMSymbolize.h" #include "llvm/ADT/StringRef.h" -#include "llvm/DebugInfo/DIContext.h" -#include "llvm/Object/MachO.h" -#include "llvm/Object/ObjectFile.h" -#include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ManagedStatic.h" -#include "llvm/Support/MemoryBuffer.h" -#include "llvm/Support/Path.h" #include "llvm/Support/PrettyStackTrace.h" #include "llvm/Support/Signals.h" #include "llvm/Support/raw_ostream.h" #include <cstdio> #include <cstring> -#include <map> #include <string> using namespace llvm; -using 
namespace object; +using namespace symbolize; static cl::opt<bool> -UseSymbolTable("use-symbol-table", cl::init(true), - cl::desc("Prefer names in symbol table to names " - "in debug info")); +ClUseSymbolTable("use-symbol-table", cl::init(true), + cl::desc("Prefer names in symbol table to names " + "in debug info")); static cl::opt<bool> -PrintFunctions("functions", cl::init(true), - cl::desc("Print function names as well as line " - "information for a given address")); +ClPrintFunctions("functions", cl::init(true), + cl::desc("Print function names as well as line " + "information for a given address")); static cl::opt<bool> -PrintInlining("inlining", cl::init(true), - cl::desc("Print all inlined frames for a given address")); +ClPrintInlining("inlining", cl::init(true), + cl::desc("Print all inlined frames for a given address")); static cl::opt<bool> -Demangle("demangle", cl::init(true), - cl::desc("Demangle function names")); - -static StringRef ToolInvocationPath; - -static bool error(error_code ec) { - if (!ec) return false; - errs() << ToolInvocationPath << ": error reading file: " - << ec.message() << ".\n"; - return true; -} - -static uint32_t getDILineInfoSpecifierFlags() { - uint32_t Flags = llvm::DILineInfoSpecifier::FileLineInfo | - llvm::DILineInfoSpecifier::AbsoluteFilePath; - if (PrintFunctions) - Flags |= llvm::DILineInfoSpecifier::FunctionName; - return Flags; -} - -static void patchFunctionNameInDILineInfo(const std::string &NewFunctionName, - DILineInfo &LineInfo) { - std::string FileName = LineInfo.getFileName(); - LineInfo = DILineInfo(StringRef(FileName), StringRef(NewFunctionName), - LineInfo.getLine(), LineInfo.getColumn()); -} - -namespace { -class ModuleInfo { - OwningPtr<ObjectFile> Module; - OwningPtr<DIContext> DebugInfoContext; - public: - ModuleInfo(ObjectFile *Obj, DIContext *DICtx) - : Module(Obj), DebugInfoContext(DICtx) {} - - DILineInfo symbolizeCode(uint64_t ModuleOffset) const { - DILineInfo LineInfo; - if (DebugInfoContext) { - LineInfo = DebugInfoContext->getLineInfoForAddress( - ModuleOffset, getDILineInfoSpecifierFlags()); - } - // Override function name from symbol table if necessary. - if (PrintFunctions && UseSymbolTable) { - std::string Function; - if (getFunctionNameFromSymbolTable(ModuleOffset, Function)) { - patchFunctionNameInDILineInfo(Function, LineInfo); - } - } - return LineInfo; - } - - DIInliningInfo symbolizeInlinedCode(uint64_t ModuleOffset) const { - DIInliningInfo InlinedContext; - if (DebugInfoContext) { - InlinedContext = DebugInfoContext->getInliningInfoForAddress( - ModuleOffset, getDILineInfoSpecifierFlags()); - } - // Make sure there is at least one frame in context. - if (InlinedContext.getNumberOfFrames() == 0) { - InlinedContext.addFrame(DILineInfo()); - } - // Override the function name in lower frame with name from symbol table. 
- if (PrintFunctions && UseSymbolTable) { - DIInliningInfo PatchedInlinedContext; - for (uint32_t i = 0, n = InlinedContext.getNumberOfFrames(); - i != n; i++) { - DILineInfo LineInfo = InlinedContext.getFrame(i); - if (i == n - 1) { - std::string Function; - if (getFunctionNameFromSymbolTable(ModuleOffset, Function)) { - patchFunctionNameInDILineInfo(Function, LineInfo); - } - } - PatchedInlinedContext.addFrame(LineInfo); - } - InlinedContext = PatchedInlinedContext; - } - return InlinedContext; - } - - private: - bool getFunctionNameFromSymbolTable(uint64_t Address, - std::string &FunctionName) const { - assert(Module); - error_code ec; - for (symbol_iterator si = Module->begin_symbols(), - se = Module->end_symbols(); - si != se; si.increment(ec)) { - if (error(ec)) return false; - uint64_t SymbolAddress; - uint64_t SymbolSize; - SymbolRef::Type SymbolType; - if (error(si->getAddress(SymbolAddress)) || - SymbolAddress == UnknownAddressOrSize) continue; - if (error(si->getSize(SymbolSize)) || - SymbolSize == UnknownAddressOrSize) continue; - if (error(si->getType(SymbolType))) continue; - // FIXME: If a function has alias, there are two entries in symbol table - // with same address size. Make sure we choose the correct one. - if (SymbolAddress <= Address && Address < SymbolAddress + SymbolSize && - SymbolType == SymbolRef::ST_Function) { - StringRef Name; - if (error(si->getName(Name))) continue; - FunctionName = Name.str(); - return true; - } - } - return false; - } -}; - -typedef std::map<std::string, ModuleInfo*> ModuleMapTy; -typedef ModuleMapTy::iterator ModuleMapIter; -} // namespace - -static ModuleMapTy Modules; - -// Returns true if the object endianness is known. -static bool getObjectEndianness(const ObjectFile *Obj, - bool &IsLittleEndian) { - // FIXME: Implement this when libLLVMObject allows to do it easily. - IsLittleEndian = true; - return true; -} - -static ObjectFile *getObjectFile(const std::string &Path) { - OwningPtr<MemoryBuffer> Buff; - MemoryBuffer::getFile(Path, Buff); - return ObjectFile::createObjectFile(Buff.take()); -} - -static std::string getDarwinDWARFResourceForModule(const std::string &Path) { - StringRef Basename = sys::path::filename(Path); - const std::string &DSymDirectory = Path + ".dSYM"; - SmallString<16> ResourceName = StringRef(DSymDirectory); - sys::path::append(ResourceName, "Contents", "Resources", "DWARF"); - sys::path::append(ResourceName, Basename); - return ResourceName.str(); -} - -static ModuleInfo *getOrCreateModuleInfo(const std::string &ModuleName) { - ModuleMapIter I = Modules.find(ModuleName); - if (I != Modules.end()) - return I->second; - - ObjectFile *Obj = getObjectFile(ModuleName); - ObjectFile *DbgObj = Obj; - if (Obj == 0) { - // Module name doesn't point to a valid object file. - Modules.insert(make_pair(ModuleName, (ModuleInfo*)0)); - return 0; - } - - DIContext *Context = 0; - bool IsLittleEndian; - if (getObjectEndianness(Obj, IsLittleEndian)) { - // On Darwin we may find DWARF in separate object file in - // resource directory. 
- if (isa<MachOObjectFile>(Obj)) { - const std::string &ResourceName = getDarwinDWARFResourceForModule( - ModuleName); - ObjectFile *ResourceObj = getObjectFile(ResourceName); - if (ResourceObj != 0) - DbgObj = ResourceObj; - } - Context = DIContext::getDWARFContext(DbgObj); - assert(Context); - } - - ModuleInfo *Info = new ModuleInfo(Obj, Context); - Modules.insert(make_pair(ModuleName, Info)); - return Info; -} - -#if !defined(_MSC_VER) -// Assume that __cxa_demangle is provided by libcxxabi (except for Windows). -extern "C" char *__cxa_demangle(const char *mangled_name, char *output_buffer, - size_t *length, int *status); -#endif - -static void printDILineInfo(DILineInfo LineInfo) { - // By default, DILineInfo contains "<invalid>" for function/filename it - // cannot fetch. We replace it to "??" to make our output closer to addr2line. - static const std::string kDILineInfoBadString = "<invalid>"; - static const std::string kSymbolizerBadString = "??"; - if (PrintFunctions) { - std::string FunctionName = LineInfo.getFunctionName(); - if (FunctionName == kDILineInfoBadString) - FunctionName = kSymbolizerBadString; -#if !defined(_MSC_VER) - if (Demangle) { - int status = 0; - char *DemangledName = __cxa_demangle( - FunctionName.c_str(), 0, 0, &status); - if (status == 0) { - FunctionName = DemangledName; - free(DemangledName); - } - } -#endif - outs() << FunctionName << "\n"; - } - std::string Filename = LineInfo.getFileName(); - if (Filename == kDILineInfoBadString) - Filename = kSymbolizerBadString; - outs() << Filename << - ":" << LineInfo.getLine() << - ":" << LineInfo.getColumn() << - "\n"; -} - -static void symbolize(std::string ModuleName, std::string ModuleOffsetStr) { - ModuleInfo *Info = getOrCreateModuleInfo(ModuleName); - uint64_t Offset = 0; - if (Info == 0 || - StringRef(ModuleOffsetStr).getAsInteger(0, Offset)) { - printDILineInfo(DILineInfo()); - } else if (PrintInlining) { - DIInliningInfo InlinedContext = Info->symbolizeInlinedCode(Offset); - uint32_t FramesNum = InlinedContext.getNumberOfFrames(); - assert(FramesNum > 0); - for (uint32_t i = 0; i < FramesNum; i++) { - DILineInfo LineInfo = InlinedContext.getFrame(i); - printDILineInfo(LineInfo); - } - } else { - DILineInfo LineInfo = Info->symbolizeCode(Offset); - printDILineInfo(LineInfo); - } - - outs() << "\n"; // Print extra empty line to mark the end of output. - outs().flush(); -} - -static bool parseModuleNameAndOffset(std::string &ModuleName, - std::string &ModuleOffsetStr) { - static const int kMaxInputStringLength = 1024; - static const char kDelimiters[] = " \n"; +ClDemangle("demangle", cl::init(true), cl::desc("Demangle function names")); + +static bool parseCommand(bool &IsData, std::string &ModuleName, + uint64_t &ModuleOffset) { + const char *kDataCmd = "DATA "; + const char *kCodeCmd = "CODE "; + const int kMaxInputStringLength = 1024; + const char kDelimiters[] = " \n"; char InputString[kMaxInputStringLength]; if (!fgets(InputString, sizeof(InputString), stdin)) return false; + IsData = false; ModuleName = ""; - ModuleOffsetStr = ""; + std::string ModuleOffsetStr = ""; + char *pos = InputString; + if (strncmp(pos, kDataCmd, strlen(kDataCmd)) == 0) { + IsData = true; + pos += strlen(kDataCmd); + } else if (strncmp(pos, kCodeCmd, strlen(kCodeCmd)) == 0) { + IsData = false; + pos += strlen(kCodeCmd); + } else { + // If no cmd, assume it's CODE. + IsData = false; + } // FIXME: Handle case when filename is given in quotes. 
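// Editorial note (not part of the patch): parseCommand() reads one request
// per line of stdin: an optional "DATA " or "CODE " prefix, a module path,
// and an offset (parsed with getAsInteger(0, ...), so hex and decimal both
// work). Invented examples:
//
//   CODE /usr/bin/example 0x4004f0    -> function name plus file:line:column
//   DATA /usr/bin/example 0x601040    -> symbol name, start address and size
//   /usr/bin/example 0x4004f0         -> no prefix defaults to CODE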
- if (char *FilePath = strtok(InputString, kDelimiters)) { + if (char *FilePath = strtok(pos, kDelimiters)) { ModuleName = FilePath; - if (char *OffsetStr = strtok((char*)0, kDelimiters)) + if (char *OffsetStr = strtok((char *)0, kDelimiters)) ModuleOffsetStr = OffsetStr; } + if (StringRef(ModuleOffsetStr).getAsInteger(0, ModuleOffset)) + return false; return true; } @@ -308,15 +85,22 @@ int main(int argc, char **argv) { // Print stack trace if we signal out. sys::PrintStackTraceOnErrorSignal(); PrettyStackTraceProgram X(argc, argv); - llvm_shutdown_obj Y; // Call llvm_shutdown() on exit. + llvm_shutdown_obj Y; // Call llvm_shutdown() on exit. cl::ParseCommandLineOptions(argc, argv, "llvm symbolizer for compiler-rt\n"); - ToolInvocationPath = argv[0]; + LLVMSymbolizer::Options Opts(ClUseSymbolTable, ClPrintFunctions, + ClPrintInlining, ClDemangle); + LLVMSymbolizer Symbolizer(Opts); + bool IsData = false; std::string ModuleName; - std::string ModuleOffsetStr; - while (parseModuleNameAndOffset(ModuleName, ModuleOffsetStr)) { - symbolize(ModuleName, ModuleOffsetStr); + uint64_t ModuleOffset; + while (parseCommand(IsData, ModuleName, ModuleOffset)) { + std::string Result = + IsData ? Symbolizer.symbolizeData(ModuleName, ModuleOffset) + : Symbolizer.symbolizeCode(ModuleName, ModuleOffset); + outs() << Result << "\n"; + outs().flush(); } return 0; } diff --git a/tools/lto/LTOCodeGenerator.cpp b/tools/lto/LTOCodeGenerator.cpp index e87b378..477bd2d 100644 --- a/tools/lto/LTOCodeGenerator.cpp +++ b/tools/lto/LTOCodeGenerator.cpp @@ -387,6 +387,7 @@ bool LTOCodeGenerator::generateObjectFile(raw_ostream &out, FunctionPassManager *codeGenPasses = new FunctionPassManager(mergedModule); codeGenPasses->add(new DataLayout(*_target->getDataLayout())); + _target->addAnalysisPasses(*codeGenPasses); formatted_raw_ostream Out(out); diff --git a/tools/lto/LTOModule.cpp b/tools/lto/LTOModule.cpp index 23a8fb4..ff67769 100644 --- a/tools/lto/LTOModule.cpp +++ b/tools/lto/LTOModule.cpp @@ -733,7 +733,8 @@ namespace { return Symbols.end(); } - RecordStreamer(MCContext &Context) : MCStreamer(Context) {} + RecordStreamer(MCContext &Context) + : MCStreamer(SK_RecordStreamer, Context) {} virtual void EmitInstruction(const MCInst &Inst) { // Scan for values. @@ -771,6 +772,7 @@ namespace { // Noop calls. virtual void ChangeSection(const MCSection *Section) {} + virtual void InitToTextSection() {} virtual void InitSections() {} virtual void EmitAssemblerFlag(MCAssemblerFlag Flag) {} virtual void EmitThumbFunc(MCSymbol *Func) {} @@ -803,6 +805,10 @@ namespace { const MCSymbol *Label, unsigned PointerSize) {} virtual void FinishImpl() {} + + static bool classof(const MCStreamer *S) { + return S->getKind() == SK_RecordStreamer; + } }; } // end anonymous namespace diff --git a/tools/lto/Makefile b/tools/lto/Makefile index 3610fed..ab2e16e 100644 --- a/tools/lto/Makefile +++ b/tools/lto/Makefile @@ -51,7 +51,7 @@ ifeq ($(HOST_OS),Darwin) endif # If we're doing an Apple-style build, add the LTO object path. 
- ifeq ($(RC_BUILDIT),YES) + ifeq ($(RC_XBS),YES) TempFile := $(shell mkdir -p ${OBJROOT}/dSYMs ; mktemp ${OBJROOT}/dSYMs/llvm-lto.XXXXXX) LLVMLibsOptions := $(LLVMLibsOptions) \ -Wl,-object_path_lto -Wl,$(TempFile) diff --git a/tools/macho-dump/macho-dump.cpp b/tools/macho-dump/macho-dump.cpp index 20deda9..3bd3ecc 100644 --- a/tools/macho-dump/macho-dump.cpp +++ b/tools/macho-dump/macho-dump.cpp @@ -337,7 +337,7 @@ static int DumpDataInCodeDataCommand(MachOObject &Obj, InMemoryStruct<macho::LinkeditDataLoadCommand> LLC; Obj.ReadLinkeditDataLoadCommand(LCI, LLC); if (!LLC) - return Error("unable to read segment load command"); + return Error("unable to read data-in-code load command"); outs() << " ('dataoff', " << LLC->DataOffset << ")\n" << " ('datasize', " << LLC->DataSize << ")\n" @@ -361,6 +361,31 @@ static int DumpDataInCodeDataCommand(MachOObject &Obj, return 0; } +static int DumpLinkerOptionsCommand(MachOObject &Obj, + const MachOObject::LoadCommandInfo &LCI) { + InMemoryStruct<macho::LinkerOptionsLoadCommand> LOLC; + Obj.ReadLinkerOptionsLoadCommand(LCI, LOLC); + if (!LOLC) + return Error("unable to read linker options load command"); + + outs() << " ('count', " << LOLC->Count << ")\n" << " ('_strings', [\n"; + + uint64_t DataSize = LOLC->Size - sizeof(macho::LinkerOptionsLoadCommand); + StringRef Data = Obj.getData( + LCI.Offset + sizeof(macho::LinkerOptionsLoadCommand), DataSize); + for (unsigned i = 0; i != LOLC->Count; ++i) { + std::pair<StringRef,StringRef> Split = Data.split('\0'); + outs() << "\t\""; + outs().write_escaped(Split.first); + outs() << "\",\n"; + Data = Split.second; + } + outs() <<" ])\n"; + + return 0; +} + static int DumpLoadCommand(MachOObject &Obj, unsigned Index) { const MachOObject::LoadCommandInfo &LCI = Obj.getLoadCommandInfo(Index); @@ -390,6 +415,9 @@ static int DumpLoadCommand(MachOObject &Obj, unsigned Index) { case macho::LCT_DataInCode: Res = DumpDataInCodeDataCommand(Obj, LCI); break; + case macho::LCT_LinkerOptions: + Res = DumpLinkerOptionsCommand(Obj, LCI); + break; default: Warning("unknown load command: " + Twine(LCI.Command.Type)); break; diff --git a/tools/opt/CMakeLists.txt b/tools/opt/CMakeLists.txt index 32de6d4..cf5e5a8 100644 --- a/tools/opt/CMakeLists.txt +++ b/tools/opt/CMakeLists.txt @@ -1,4 +1,4 @@ -set(LLVM_LINK_COMPONENTS ${LLVM_TARGETS_TO_BUILD} bitreader asmparser bitwriter instrumentation scalaropts ipo vectorize) +set(LLVM_LINK_COMPONENTS ${LLVM_TARGETS_TO_BUILD} bitreader asmparser bitwriter instrumentation scalaropts objcarcopts ipo vectorize) add_llvm_tool(opt AnalysisWrappers.cpp diff --git a/tools/opt/GraphPrinters.cpp b/tools/opt/GraphPrinters.cpp index c7481e5..f271966 100644 --- a/tools/opt/GraphPrinters.cpp +++ b/tools/opt/GraphPrinters.cpp @@ -14,81 +14,10 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/Dominators.h" -#include "llvm/IR/Value.h" #include "llvm/Pass.h" -#include "llvm/Support/GraphWriter.h" -#include "llvm/Support/ToolOutputFile.h" -using namespace llvm; - -template<typename GraphType> -static void WriteGraphToFile(raw_ostream &O, const std::string &GraphName, - const GraphType &GT) { - std::string Filename = GraphName + ".dot"; - O << "Writing '" << Filename << "'..."; - std::string ErrInfo; - tool_output_file F(Filename.c_str(), ErrInfo); - - if (ErrInfo.empty()) { - WriteGraph(F.os(), GT); - F.os().close(); - if (!F.os().has_error()) { - O << "\n"; - F.keep(); - return; - } - } - O << " error
opening file for writing!\n"; - F.os().clear_error(); -} - - -//===----------------------------------------------------------------------===// -// Call Graph Printer -//===----------------------------------------------------------------------===// - -namespace llvm { - template<> - struct DOTGraphTraits<CallGraph*> : public DefaultDOTGraphTraits { - - DOTGraphTraits (bool isSimple=false) : DefaultDOTGraphTraits(isSimple) {} - - static std::string getGraphName(CallGraph *F) { - return "Call Graph"; - } - static std::string getNodeLabel(CallGraphNode *Node, CallGraph *Graph) { - if (Node->getFunction()) - return ((Value*)Node->getFunction())->getName(); - return "external node"; - } - }; -} - - -namespace { - struct CallGraphPrinter : public ModulePass { - static char ID; // Pass ID, replacement for typeid - CallGraphPrinter() : ModulePass(ID) {} - - virtual bool runOnModule(Module &M) { - WriteGraphToFile(llvm::errs(), "callgraph", &getAnalysis<CallGraph>()); - return false; - } - - void print(raw_ostream &OS, const llvm::Module*) const {} - - virtual void getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<CallGraph>(); - AU.setPreservesAll(); - } - }; -} - -char CallGraphPrinter::ID = 0; -static RegisterPass<CallGraphPrinter> P2("dot-callgraph", - "Print Call Graph to 'dot' file"); +using namespace llvm; //===----------------------------------------------------------------------===// // DomInfoPrinter Pass diff --git a/tools/opt/LLVMBuild.txt b/tools/opt/LLVMBuild.txt index b174431..a866d12 100644 --- a/tools/opt/LLVMBuild.txt +++ b/tools/opt/LLVMBuild.txt @@ -19,4 +19,4 @@ type = Tool name = opt parent = Tools -required_libraries = AsmParser BitReader BitWriter IPO Instrumentation Scalar all-targets +required_libraries = AsmParser BitReader BitWriter IPO Instrumentation Scalar ObjCARC all-targets diff --git a/tools/opt/Makefile b/tools/opt/Makefile index ee7e1cf..79ed815 100644 --- a/tools/opt/Makefile +++ b/tools/opt/Makefile @@ -9,6 +9,6 @@ LEVEL := ../.. 
TOOLNAME := opt -LINK_COMPONENTS := bitreader bitwriter asmparser instrumentation scalaropts ipo vectorize all-targets +LINK_COMPONENTS := bitreader bitwriter asmparser instrumentation scalaropts objcarcopts ipo vectorize all-targets include $(LEVEL)/Makefile.common diff --git a/tools/opt/opt.cpp b/tools/opt/opt.cpp index ac8323b..81a2de2 100644 --- a/tools/opt/opt.cpp +++ b/tools/opt/opt.cpp @@ -26,8 +26,8 @@ #include "llvm/DebugInfo.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Module.h" +#include "llvm/LinkAllIR.h" #include "llvm/LinkAllPasses.h" -#include "llvm/LinkAllVMCore.h" #include "llvm/MC/SubtargetFeature.h" #include "llvm/PassManager.h" #include "llvm/Support/Debug.h" @@ -567,6 +567,7 @@ int main(int argc, char **argv) { PassRegistry &Registry = *PassRegistry::getPassRegistry(); initializeCore(Registry); initializeScalarOpts(Registry); + initializeObjCARCOpts(Registry); initializeVectorization(Registry); initializeIPO(Registry); initializeAnalysis(Registry); diff --git a/unittests/ADT/APFloatTest.cpp b/unittests/ADT/APFloatTest.cpp index 7345e12..2789835 100644 --- a/unittests/ADT/APFloatTest.cpp +++ b/unittests/ADT/APFloatTest.cpp @@ -794,6 +794,32 @@ TEST(APFloatTest, convert) { test.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven, &losesInfo); EXPECT_EQ(4294967295.0, test.convertToDouble()); EXPECT_FALSE(losesInfo); + + test = APFloat::getSNaN(APFloat::IEEEsingle); + APFloat X87SNaN = APFloat::getSNaN(APFloat::x87DoubleExtended); + test.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven, + &losesInfo); + EXPECT_TRUE(test.bitwiseIsEqual(X87SNaN)); + EXPECT_FALSE(losesInfo); + + test = APFloat::getQNaN(APFloat::IEEEsingle); + APFloat X87QNaN = APFloat::getQNaN(APFloat::x87DoubleExtended); + test.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven, + &losesInfo); + EXPECT_TRUE(test.bitwiseIsEqual(X87QNaN)); + EXPECT_FALSE(losesInfo); + + test = APFloat::getSNaN(APFloat::x87DoubleExtended); + test.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven, + &losesInfo); + EXPECT_TRUE(test.bitwiseIsEqual(X87SNaN)); + EXPECT_FALSE(losesInfo); + + test = APFloat::getQNaN(APFloat::x87DoubleExtended); + test.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven, + &losesInfo); + EXPECT_TRUE(test.bitwiseIsEqual(X87QNaN)); + EXPECT_FALSE(losesInfo); } TEST(APFloatTest, PPCDoubleDouble) { diff --git a/unittests/ADT/APIntTest.cpp b/unittests/ADT/APIntTest.cpp index 2a4c5b4..f129fa7 100644 --- a/unittests/ADT/APIntTest.cpp +++ b/unittests/ADT/APIntTest.cpp @@ -56,6 +56,14 @@ TEST(APIntTest, i33_Count) { #endif TEST(APIntTest, i65_Count) { + APInt i65(65, 0, true); + EXPECT_EQ(65u, i65.countLeadingZeros()); + EXPECT_EQ(0u, i65.countLeadingOnes()); + EXPECT_EQ(0u, i65.getActiveBits()); + EXPECT_EQ(1u, i65.getActiveWords()); + EXPECT_EQ(65u, i65.countTrailingZeros()); + EXPECT_EQ(0u, i65.countPopulation()); + APInt i65minus(65, 0, true); i65minus.setBit(64); EXPECT_EQ(0u, i65minus.countLeadingZeros()); @@ -514,4 +522,14 @@ TEST(APIntTest, Rotate) { EXPECT_EQ(Rot, Big.rotr(144)); } +TEST(APIntTest, Splat) { + APInt ValA(8, 0x01); + EXPECT_EQ(ValA, APInt::getSplat(8, ValA)); + EXPECT_EQ(APInt(64, 0x0101010101010101ULL), APInt::getSplat(64, ValA)); + + APInt ValB(3, 5); + EXPECT_EQ(APInt(4, 0xD), APInt::getSplat(4, ValB)); + EXPECT_EQ(APInt(15, 0xDB6D), APInt::getSplat(15, ValB)); +} + } diff --git a/unittests/ADT/CMakeLists.txt b/unittests/ADT/CMakeLists.txt index 94f7fda..9aad793 100644 --- a/unittests/ADT/CMakeLists.txt +++ 
b/unittests/ADT/CMakeLists.txt @@ -18,12 +18,15 @@ set(ADTSources IntEqClassesTest.cpp IntervalMapTest.cpp IntrusiveRefCntPtrTest.cpp + MapVectorTest.cpp + OptionalTest.cpp PackedVectorTest.cpp SCCIteratorTest.cpp SmallPtrSetTest.cpp SmallStringTest.cpp SmallVectorTest.cpp SparseBitVectorTest.cpp + SparseMultiSetTest.cpp SparseSetTest.cpp StringMapTest.cpp StringRefTest.cpp diff --git a/unittests/ADT/MapVectorTest.cpp b/unittests/ADT/MapVectorTest.cpp new file mode 100644 index 0000000..11178bc --- /dev/null +++ b/unittests/ADT/MapVectorTest.cpp @@ -0,0 +1,55 @@ +//===- unittest/ADT/MapVectorTest.cpp - MapVector unit tests ----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "gtest/gtest.h" +#include "llvm/ADT/MapVector.h" +#include <utility> + +using namespace llvm; + +TEST(MapVectorTest, insert_pop) { + MapVector<int, int> MV; + std::pair<MapVector<int, int>::iterator, bool> R; + + R = MV.insert(std::make_pair(1, 2)); + ASSERT_EQ(R.first, MV.begin()); + EXPECT_EQ(R.first->first, 1); + EXPECT_EQ(R.first->second, 2); + EXPECT_TRUE(R.second); + + R = MV.insert(std::make_pair(1, 3)); + ASSERT_EQ(R.first, MV.begin()); + EXPECT_EQ(R.first->first, 1); + EXPECT_EQ(R.first->second, 2); + EXPECT_FALSE(R.second); + + R = MV.insert(std::make_pair(4, 5)); + ASSERT_NE(R.first, MV.end()); + EXPECT_EQ(R.first->first, 4); + EXPECT_EQ(R.first->second, 5); + EXPECT_TRUE(R.second); + + EXPECT_EQ(MV.size(), 2u); + EXPECT_EQ(MV[1], 2); + EXPECT_EQ(MV[4], 5); + + MV.pop_back(); + EXPECT_EQ(MV.size(), 1u); + EXPECT_EQ(MV[1], 2); + + R = MV.insert(std::make_pair(4, 7)); + ASSERT_NE(R.first, MV.end()); + EXPECT_EQ(R.first->first, 4); + EXPECT_EQ(R.first->second, 7); + EXPECT_TRUE(R.second); + + EXPECT_EQ(MV.size(), 2u); + EXPECT_EQ(MV[1], 2); + EXPECT_EQ(MV[4], 7); +} diff --git a/unittests/ADT/OptionalTest.cpp b/unittests/ADT/OptionalTest.cpp new file mode 100644 index 0000000..21e3847 --- /dev/null +++ b/unittests/ADT/OptionalTest.cpp @@ -0,0 +1,284 @@ +//===- llvm/unittest/ADT/OptionalTest.cpp - Optional unit tests -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "gtest/gtest.h" +#include "llvm/ADT/Optional.h" +using namespace llvm; + +namespace { + +struct NonDefaultConstructible { + static unsigned CopyConstructions; + static unsigned Destructions; + static unsigned CopyAssignments; + explicit NonDefaultConstructible(int) { + } + NonDefaultConstructible(const NonDefaultConstructible&) { + ++CopyConstructions; + } + NonDefaultConstructible &operator=(const NonDefaultConstructible&) { + ++CopyAssignments; + return *this; + } + ~NonDefaultConstructible() { + ++Destructions; + } + static void ResetCounts() { + CopyConstructions = 0; + Destructions = 0; + CopyAssignments = 0; + } +}; + +unsigned NonDefaultConstructible::CopyConstructions = 0; +unsigned NonDefaultConstructible::Destructions = 0; +unsigned NonDefaultConstructible::CopyAssignments = 0; + +// Test fixture +class OptionalTest : public testing::Test { +}; + +TEST_F(OptionalTest, NonDefaultConstructibleTest) { + Optional<NonDefaultConstructible> O; + EXPECT_FALSE(O); +} + +TEST_F(OptionalTest, ResetTest) { + NonDefaultConstructible::ResetCounts(); + Optional<NonDefaultConstructible> O(NonDefaultConstructible(3)); + EXPECT_EQ(1u, NonDefaultConstructible::CopyConstructions); + EXPECT_EQ(0u, NonDefaultConstructible::CopyAssignments); + EXPECT_EQ(1u, NonDefaultConstructible::Destructions); + NonDefaultConstructible::ResetCounts(); + O.reset(); + EXPECT_EQ(0u, NonDefaultConstructible::CopyConstructions); + EXPECT_EQ(0u, NonDefaultConstructible::CopyAssignments); + EXPECT_EQ(1u, NonDefaultConstructible::Destructions); +} + +TEST_F(OptionalTest, InitializationLeakTest) { + NonDefaultConstructible::ResetCounts(); + Optional<NonDefaultConstructible>(NonDefaultConstructible(3)); + EXPECT_EQ(1u, NonDefaultConstructible::CopyConstructions); + EXPECT_EQ(0u, NonDefaultConstructible::CopyAssignments); + EXPECT_EQ(2u, NonDefaultConstructible::Destructions); +} + +TEST_F(OptionalTest, CopyConstructionTest) { + NonDefaultConstructible::ResetCounts(); + { + Optional<NonDefaultConstructible> A(NonDefaultConstructible(3)); + EXPECT_EQ(1u, NonDefaultConstructible::CopyConstructions); + EXPECT_EQ(0u, NonDefaultConstructible::CopyAssignments); + EXPECT_EQ(1u, NonDefaultConstructible::Destructions); + NonDefaultConstructible::ResetCounts(); + Optional<NonDefaultConstructible> B(A); + EXPECT_EQ(1u, NonDefaultConstructible::CopyConstructions); + EXPECT_EQ(0u, NonDefaultConstructible::CopyAssignments); + EXPECT_EQ(0u, NonDefaultConstructible::Destructions); + NonDefaultConstructible::ResetCounts(); + } + EXPECT_EQ(0u, NonDefaultConstructible::CopyConstructions); + EXPECT_EQ(0u, NonDefaultConstructible::CopyAssignments); + EXPECT_EQ(2u, NonDefaultConstructible::Destructions); +} + +TEST_F(OptionalTest, ConstructingCopyAssignmentTest) { + NonDefaultConstructible::ResetCounts(); + { + Optional<NonDefaultConstructible> A(NonDefaultConstructible(3)); + Optional<NonDefaultConstructible> B; + EXPECT_EQ(1u, NonDefaultConstructible::CopyConstructions); + EXPECT_EQ(0u, NonDefaultConstructible::CopyAssignments); + EXPECT_EQ(1u, NonDefaultConstructible::Destructions); + NonDefaultConstructible::ResetCounts(); + B = A; + EXPECT_EQ(1u, NonDefaultConstructible::CopyConstructions); + EXPECT_EQ(0u, NonDefaultConstructible::CopyAssignments); + EXPECT_EQ(0u, NonDefaultConstructible::Destructions); + NonDefaultConstructible::ResetCounts(); + } + EXPECT_EQ(0u, NonDefaultConstructible::CopyConstructions); + EXPECT_EQ(0u, 
NonDefaultConstructible::CopyAssignments); + EXPECT_EQ(2u, NonDefaultConstructible::Destructions); +} + +TEST_F(OptionalTest, CopyingCopyAssignmentTest) { + NonDefaultConstructible::ResetCounts(); + { + Optional<NonDefaultConstructible> A(NonDefaultConstructible(3)); + Optional<NonDefaultConstructible> B(NonDefaultConstructible(4)); + EXPECT_EQ(2u, NonDefaultConstructible::CopyConstructions); + EXPECT_EQ(0u, NonDefaultConstructible::CopyAssignments); + EXPECT_EQ(2u, NonDefaultConstructible::Destructions); + NonDefaultConstructible::ResetCounts(); + B = A; + EXPECT_EQ(0u, NonDefaultConstructible::CopyConstructions); + EXPECT_EQ(1u, NonDefaultConstructible::CopyAssignments); + EXPECT_EQ(0u, NonDefaultConstructible::Destructions); + NonDefaultConstructible::ResetCounts(); + } + EXPECT_EQ(0u, NonDefaultConstructible::CopyConstructions); + EXPECT_EQ(0u, NonDefaultConstructible::CopyAssignments); + EXPECT_EQ(2u, NonDefaultConstructible::Destructions); +} + +TEST_F(OptionalTest, DeletingCopyAssignmentTest) { + NonDefaultConstructible::ResetCounts(); + { + Optional<NonDefaultConstructible> A; + Optional<NonDefaultConstructible> B(NonDefaultConstructible(3)); + EXPECT_EQ(1u, NonDefaultConstructible::CopyConstructions); + EXPECT_EQ(0u, NonDefaultConstructible::CopyAssignments); + EXPECT_EQ(1u, NonDefaultConstructible::Destructions); + NonDefaultConstructible::ResetCounts(); + B = A; + EXPECT_EQ(0u, NonDefaultConstructible::CopyConstructions); + EXPECT_EQ(0u, NonDefaultConstructible::CopyAssignments); + EXPECT_EQ(1u, NonDefaultConstructible::Destructions); + NonDefaultConstructible::ResetCounts(); + } + EXPECT_EQ(0u, NonDefaultConstructible::CopyConstructions); + EXPECT_EQ(0u, NonDefaultConstructible::CopyAssignments); + EXPECT_EQ(0u, NonDefaultConstructible::Destructions); +} + +TEST_F(OptionalTest, NullCopyConstructionTest) { + NonDefaultConstructible::ResetCounts(); + { + Optional<NonDefaultConstructible> A; + Optional<NonDefaultConstructible> B; + EXPECT_EQ(0u, NonDefaultConstructible::CopyConstructions); + EXPECT_EQ(0u, NonDefaultConstructible::CopyAssignments); + EXPECT_EQ(0u, NonDefaultConstructible::Destructions); + NonDefaultConstructible::ResetCounts(); + B = A; + EXPECT_EQ(0u, NonDefaultConstructible::CopyConstructions); + EXPECT_EQ(0u, NonDefaultConstructible::CopyAssignments); + EXPECT_EQ(0u, NonDefaultConstructible::Destructions); + NonDefaultConstructible::ResetCounts(); + } + EXPECT_EQ(0u, NonDefaultConstructible::CopyConstructions); + EXPECT_EQ(0u, NonDefaultConstructible::CopyAssignments); + EXPECT_EQ(0u, NonDefaultConstructible::Destructions); +} + +#if LLVM_HAS_RVALUE_REFERENCES +struct MoveOnly { + static unsigned MoveConstructions; + static unsigned Destructions; + static unsigned MoveAssignments; + int val; + explicit MoveOnly(int val) : val(val) { + } + MoveOnly(MoveOnly&& other) { + val = other.val; + ++MoveConstructions; + } + MoveOnly &operator=(MoveOnly&& other) { + val = other.val; + ++MoveAssignments; + return *this; + } + ~MoveOnly() { + ++Destructions; + } + static void ResetCounts() { + MoveConstructions = 0; + Destructions = 0; + MoveAssignments = 0; + } +}; + +unsigned MoveOnly::MoveConstructions = 0; +unsigned MoveOnly::Destructions = 0; +unsigned MoveOnly::MoveAssignments = 0; + +TEST_F(OptionalTest, MoveOnlyNull) { + MoveOnly::ResetCounts(); + Optional<MoveOnly> O; + EXPECT_EQ(0u, MoveOnly::MoveConstructions); + EXPECT_EQ(0u, MoveOnly::MoveAssignments); + EXPECT_EQ(0u, MoveOnly::Destructions); +} + +TEST_F(OptionalTest, MoveOnlyConstruction) { + 
MoveOnly::ResetCounts(); + Optional<MoveOnly> O(MoveOnly(3)); + EXPECT_TRUE((bool)O); + EXPECT_EQ(3, O->val); + EXPECT_EQ(1u, MoveOnly::MoveConstructions); + EXPECT_EQ(0u, MoveOnly::MoveAssignments); + EXPECT_EQ(1u, MoveOnly::Destructions); +} + +TEST_F(OptionalTest, MoveOnlyMoveConstruction) { + Optional<MoveOnly> A(MoveOnly(3)); + MoveOnly::ResetCounts(); + Optional<MoveOnly> B(std::move(A)); + EXPECT_FALSE((bool)A); + EXPECT_TRUE((bool)B); + EXPECT_EQ(3, B->val); + EXPECT_EQ(1u, MoveOnly::MoveConstructions); + EXPECT_EQ(0u, MoveOnly::MoveAssignments); + EXPECT_EQ(1u, MoveOnly::Destructions); +} + +TEST_F(OptionalTest, MoveOnlyAssignment) { + MoveOnly::ResetCounts(); + Optional<MoveOnly> O; + O = MoveOnly(3); + EXPECT_TRUE((bool)O); + EXPECT_EQ(3, O->val); + EXPECT_EQ(1u, MoveOnly::MoveConstructions); + EXPECT_EQ(0u, MoveOnly::MoveAssignments); + EXPECT_EQ(1u, MoveOnly::Destructions); +} + +TEST_F(OptionalTest, MoveOnlyInitializingAssignment) { + Optional<MoveOnly> A(MoveOnly(3)); + Optional<MoveOnly> B; + MoveOnly::ResetCounts(); + B = std::move(A); + EXPECT_FALSE((bool)A); + EXPECT_TRUE((bool)B); + EXPECT_EQ(3, B->val); + EXPECT_EQ(1u, MoveOnly::MoveConstructions); + EXPECT_EQ(0u, MoveOnly::MoveAssignments); + EXPECT_EQ(1u, MoveOnly::Destructions); +} + +TEST_F(OptionalTest, MoveOnlyNullingAssignment) { + Optional<MoveOnly> A; + Optional<MoveOnly> B(MoveOnly(3)); + MoveOnly::ResetCounts(); + B = std::move(A); + EXPECT_FALSE((bool)A); + EXPECT_FALSE((bool)B); + EXPECT_EQ(0u, MoveOnly::MoveConstructions); + EXPECT_EQ(0u, MoveOnly::MoveAssignments); + EXPECT_EQ(1u, MoveOnly::Destructions); +} + +TEST_F(OptionalTest, MoveOnlyAssigningAssignment) { + Optional<MoveOnly> A(MoveOnly(3)); + Optional<MoveOnly> B(MoveOnly(4)); + MoveOnly::ResetCounts(); + B = std::move(A); + EXPECT_FALSE((bool)A); + EXPECT_TRUE((bool)B); + EXPECT_EQ(3, B->val); + EXPECT_EQ(0u, MoveOnly::MoveConstructions); + EXPECT_EQ(1u, MoveOnly::MoveAssignments); + EXPECT_EQ(1u, MoveOnly::Destructions); +} +#endif + +} // end anonymous namespace + diff --git a/unittests/ADT/SparseMultiSetTest.cpp b/unittests/ADT/SparseMultiSetTest.cpp new file mode 100644 index 0000000..032990e --- /dev/null +++ b/unittests/ADT/SparseMultiSetTest.cpp @@ -0,0 +1,235 @@ +//===--- ADT/SparseMultiSetTest.cpp - SparseMultiSet unit tests -*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/SparseMultiSet.h" +#include "gtest/gtest.h" + +using namespace llvm; + +namespace { + +typedef SparseMultiSet<unsigned> USet; + +// Empty set tests. +TEST(SparseMultiSetTest, EmptySet) { + USet Set; + EXPECT_TRUE(Set.empty()); + EXPECT_EQ(0u, Set.size()); + + Set.setUniverse(10); + + // Lookups on empty set. + EXPECT_TRUE(Set.find(0) == Set.end()); + EXPECT_TRUE(Set.find(9) == Set.end()); + + // Same thing on a const reference. + const USet &CSet = Set; + EXPECT_TRUE(CSet.empty()); + EXPECT_EQ(0u, CSet.size()); + EXPECT_TRUE(CSet.find(0) == CSet.end()); + USet::const_iterator I = CSet.find(5); + EXPECT_TRUE(I == CSet.end()); +} + +// Single entry set tests.
+TEST(SparseMultiSetTest, SingleEntrySet) { + USet Set; + Set.setUniverse(10); + USet::iterator I = Set.insert(5); + EXPECT_TRUE(I != Set.end()); + EXPECT_TRUE(*I == 5); + + EXPECT_FALSE(Set.empty()); + EXPECT_EQ(1u, Set.size()); + + EXPECT_TRUE(Set.find(0) == Set.end()); + EXPECT_TRUE(Set.find(9) == Set.end()); + + EXPECT_FALSE(Set.contains(0)); + EXPECT_TRUE(Set.contains(5)); + + // Extra insert. + I = Set.insert(5); + EXPECT_TRUE(I != Set.end()); + EXPECT_TRUE(I == ++Set.find(5)); + I--; + EXPECT_TRUE(I == Set.find(5)); + + // Find a non-existent element. + I = Set.find(1); + EXPECT_TRUE(I == Set.end()); + EXPECT_EQ(2u, Set.size()); + EXPECT_EQ(5u, *Set.find(5)); + + // Erase iterator. + I = Set.find(5); + EXPECT_TRUE(I != Set.end()); + I = Set.erase(I); + EXPECT_TRUE(I != Set.end()); + I = Set.erase(I); + EXPECT_TRUE(I == Set.end()); + EXPECT_TRUE(Set.empty()); +} + +// Multiple entry set tests. +TEST(SparseMultiSetTest, MultipleEntrySet) { + USet Set; + Set.setUniverse(10); + + Set.insert(5); + Set.insert(5); + Set.insert(5); + Set.insert(3); + Set.insert(2); + Set.insert(1); + Set.insert(4); + EXPECT_EQ(7u, Set.size()); + + // Erase last element by key. + EXPECT_TRUE(Set.erase(Set.find(4)) == Set.end()); + EXPECT_EQ(6u, Set.size()); + EXPECT_FALSE(Set.contains(4)); + EXPECT_TRUE(Set.find(4) == Set.end()); + + // Erase first element by key. + EXPECT_EQ(3u, Set.count(5)); + EXPECT_TRUE(Set.find(5) != Set.end()); + EXPECT_TRUE(Set.erase(Set.find(5)) != Set.end()); + EXPECT_EQ(5u, Set.size()); + EXPECT_EQ(2u, Set.count(5)); + + Set.insert(6); + Set.insert(7); + EXPECT_EQ(7u, Set.size()); + + // Erase tail by iterator. + EXPECT_TRUE(Set.getTail(6) == Set.getHead(6)); + USet::iterator I = Set.erase(Set.find(6)); + EXPECT_TRUE(I == Set.end()); + EXPECT_EQ(6u, Set.size()); + + // Erase tails by iterator. + EXPECT_EQ(2u, Set.count(5)); + I = Set.getTail(5); + I = Set.erase(I); + EXPECT_TRUE(I == Set.end()); + --I; + EXPECT_EQ(1u, Set.count(5)); + EXPECT_EQ(5u, *I); + I = Set.erase(I); + EXPECT_TRUE(I == Set.end()); + EXPECT_EQ(0u, Set.count(5)); + + Set.insert(8); + Set.insert(8); + Set.insert(8); + Set.insert(8); + Set.insert(8); + + // Erase all the 8s + EXPECT_EQ(5, std::distance(Set.getHead(8), Set.end())); + Set.eraseAll(8); + EXPECT_EQ(0, std::distance(Set.getHead(8), Set.end())); + + // Clear and resize the universe. + Set.clear(); + EXPECT_EQ(0u, Set.size()); + EXPECT_FALSE(Set.contains(3)); + Set.setUniverse(1000); + + // Add more than 256 elements. + for (unsigned i = 100; i != 800; ++i) + Set.insert(i); + + for (unsigned i = 0; i != 10; ++i) + Set.eraseAll(i); + + for (unsigned i = 100; i != 800; ++i) + EXPECT_EQ(1u, Set.count(i)); + + EXPECT_FALSE(Set.contains(99)); + EXPECT_FALSE(Set.contains(800)); + EXPECT_EQ(700u, Set.size()); +} + +// Test out iterators +TEST(SparseMultiSetTest, Iterators) { + USet Set; + Set.setUniverse(100); + + Set.insert(0); + Set.insert(1); + Set.insert(2); + Set.insert(0); + Set.insert(1); + Set.insert(0); + + USet::RangePair RangePair = Set.equal_range(0); + USet::iterator B = RangePair.first; + USet::iterator E = RangePair.second; + + // Move the iterators around, going to end and coming back.
+ EXPECT_EQ(3, std::distance(B, E)); + EXPECT_EQ(B, --(--(--E))); + EXPECT_EQ(++(++(++E)), Set.end()); + EXPECT_EQ(B, --(--(--E))); + EXPECT_EQ(++(++(++E)), Set.end()); + + // Insert into the tail, and move around again + Set.insert(0); + EXPECT_EQ(B, --(--(--(--E)))); + EXPECT_EQ(++(++(++(++E))), Set.end()); + EXPECT_EQ(B, --(--(--(--E)))); + EXPECT_EQ(++(++(++(++E))), Set.end()); + + // Erase a tail, and move around again + USet::iterator Erased = Set.erase(Set.getTail(0)); + EXPECT_EQ(Erased, E); + EXPECT_EQ(B, --(--(--E))); + + USet Set2; + Set2.setUniverse(11); + Set2.insert(3); + EXPECT_TRUE(!Set2.contains(0)); + EXPECT_TRUE(!Set.contains(3)); + + EXPECT_EQ(Set2.getHead(3), Set2.getTail(3)); + EXPECT_EQ(Set2.getHead(0), Set2.getTail(0)); + B = Set2.find(3); + EXPECT_EQ(Set2.find(3), --(++B)); +} + +struct Alt { + unsigned Value; + explicit Alt(unsigned x) : Value(x) {} + unsigned getSparseSetIndex() const { return Value - 1000; } +}; + +TEST(SparseMultiSetTest, AltStructSet) { + typedef SparseMultiSet<Alt> ASet; + ASet Set; + Set.setUniverse(10); + Set.insert(Alt(1005)); + + ASet::iterator I = Set.find(5); + ASSERT_TRUE(I != Set.end()); + EXPECT_EQ(1005u, I->Value); + + Set.insert(Alt(1006)); + Set.insert(Alt(1006)); + I = Set.erase(Set.find(6)); + ASSERT_TRUE(I != Set.end()); + EXPECT_EQ(1006u, I->Value); + I = Set.erase(Set.find(6)); + ASSERT_TRUE(I == Set.end()); + + EXPECT_TRUE(Set.contains(5)); + EXPECT_FALSE(Set.contains(6)); +} +} // namespace diff --git a/unittests/ADT/TripleTest.cpp b/unittests/ADT/TripleTest.cpp index 7c3ab97..b402896 100644 --- a/unittests/ADT/TripleTest.cpp +++ b/unittests/ADT/TripleTest.cpp @@ -407,6 +407,11 @@ TEST(TripleTest, getOSVersion) { unsigned Major, Minor, Micro; T = Triple("i386-apple-darwin9"); + EXPECT_TRUE(T.isMacOSX()); + EXPECT_FALSE(T.isiOS()); + EXPECT_FALSE(T.isArch16Bit()); + EXPECT_TRUE(T.isArch32Bit()); + EXPECT_FALSE(T.isArch64Bit()); T.getMacOSXVersion(Major, Minor, Micro); EXPECT_EQ((unsigned)10, Major); EXPECT_EQ((unsigned)5, Minor); @@ -417,6 +422,11 @@ TEST(TripleTest, getOSVersion) { EXPECT_EQ((unsigned)0, Micro); T = Triple("x86_64-apple-darwin9"); + EXPECT_TRUE(T.isMacOSX()); + EXPECT_FALSE(T.isiOS()); + EXPECT_FALSE(T.isArch16Bit()); + EXPECT_FALSE(T.isArch32Bit()); + EXPECT_TRUE(T.isArch64Bit()); T.getMacOSXVersion(Major, Minor, Micro); EXPECT_EQ((unsigned)10, Major); EXPECT_EQ((unsigned)5, Minor); @@ -427,6 +437,11 @@ TEST(TripleTest, getOSVersion) { EXPECT_EQ((unsigned)0, Micro); T = Triple("x86_64-apple-macosx"); + EXPECT_TRUE(T.isMacOSX()); + EXPECT_FALSE(T.isiOS()); + EXPECT_FALSE(T.isArch16Bit()); + EXPECT_FALSE(T.isArch32Bit()); + EXPECT_TRUE(T.isArch64Bit()); T.getMacOSXVersion(Major, Minor, Micro); EXPECT_EQ((unsigned)10, Major); EXPECT_EQ((unsigned)4, Minor); @@ -437,6 +452,11 @@ TEST(TripleTest, getOSVersion) { EXPECT_EQ((unsigned)0, Micro); T = Triple("x86_64-apple-macosx10.7"); + EXPECT_TRUE(T.isMacOSX()); + EXPECT_FALSE(T.isiOS()); + EXPECT_FALSE(T.isArch16Bit()); + EXPECT_FALSE(T.isArch32Bit()); + EXPECT_TRUE(T.isArch64Bit()); T.getMacOSXVersion(Major, Minor, Micro); EXPECT_EQ((unsigned)10, Major); EXPECT_EQ((unsigned)7, Minor); @@ -447,6 +467,11 @@ TEST(TripleTest, getOSVersion) { EXPECT_EQ((unsigned)0, Micro); T = Triple("armv7-apple-ios"); + EXPECT_FALSE(T.isMacOSX()); + EXPECT_TRUE(T.isiOS()); + EXPECT_FALSE(T.isArch16Bit()); + EXPECT_TRUE(T.isArch32Bit()); + EXPECT_FALSE(T.isArch64Bit()); T.getMacOSXVersion(Major, Minor, Micro); EXPECT_EQ((unsigned)10, Major); EXPECT_EQ((unsigned)4, Minor); @@ -457,6 
+482,11 @@ TEST(TripleTest, getOSVersion) { EXPECT_EQ((unsigned)0, Micro); T = Triple("armv7-apple-ios5.0"); + EXPECT_FALSE(T.isMacOSX()); + EXPECT_TRUE(T.isiOS()); + EXPECT_FALSE(T.isArch16Bit()); + EXPECT_TRUE(T.isArch32Bit()); + EXPECT_FALSE(T.isArch64Bit()); T.getMacOSXVersion(Major, Minor, Micro); EXPECT_EQ((unsigned)10, Major); EXPECT_EQ((unsigned)4, Minor); diff --git a/unittests/Bitcode/BitReaderTest.cpp b/unittests/Bitcode/BitReaderTest.cpp index 98cb814..f33af2f 100644 --- a/unittests/Bitcode/BitReaderTest.cpp +++ b/unittests/Bitcode/BitReaderTest.cpp @@ -45,9 +45,9 @@ static Module *makeLLVMModule() { } static void writeModuleToBuffer(SmallVectorImpl<char> &Buffer) { - Module *Mod = makeLLVMModule(); + OwningPtr<Module> Mod(makeLLVMModule()); raw_svector_ostream OS(Buffer); - WriteBitcodeToFile(Mod, OS); + WriteBitcodeToFile(Mod.get(), OS); } TEST(BitReaderTest, MaterializeFunctionsForBlockAddr) { // PR11677 @@ -55,7 +55,7 @@ TEST(BitReaderTest, MaterializeFunctionsForBlockAddr) { // PR11677 writeModuleToBuffer(Mem); MemoryBuffer *Buffer = MemoryBuffer::getMemBuffer(Mem.str(), "test", false); std::string errMsg; - Module *m = getLazyBitcodeModule(Buffer, getGlobalContext(), &errMsg); + OwningPtr<Module> m(getLazyBitcodeModule(Buffer, getGlobalContext(), &errMsg)); PassManager passes; passes.add(createVerifierPass()); passes.run(*m); diff --git a/unittests/ExecutionEngine/JIT/CMakeLists.txt b/unittests/ExecutionEngine/JIT/CMakeLists.txt index 11cf784..3d33e4c 100644 --- a/unittests/ExecutionEngine/JIT/CMakeLists.txt +++ b/unittests/ExecutionEngine/JIT/CMakeLists.txt @@ -19,7 +19,9 @@ if( LLVM_USE_INTEL_JITEVENTS ) ) set(LLVM_LINK_COMPONENTS ${LLVM_LINK_COMPONENTS} + DebugInfo IntelJITEvents + Object ) endif( LLVM_USE_INTEL_JITEVENTS ) diff --git a/unittests/ExecutionEngine/JIT/JITTest.cpp b/unittests/ExecutionEngine/JIT/JITTest.cpp index 0a9ee82..30dadc9 100644 --- a/unittests/ExecutionEngine/JIT/JITTest.cpp +++ b/unittests/ExecutionEngine/JIT/JITTest.cpp @@ -161,7 +161,7 @@ public: uintptr_t ActualSizeResult; }; std::vector<StartExceptionTableCall> startExceptionTableCalls; - virtual uint8_t* startExceptionTable(const Function* F, + virtual uint8_t *startExceptionTable(const Function *F, uintptr_t &ActualSize) { uintptr_t InitialActualSize = ActualSize; uint8_t *Result = Base->startExceptionTable(F, ActualSize); @@ -203,14 +203,21 @@ bool LoadAssemblyInto(Module *M, const char *assembly) { class JITTest : public testing::Test { protected: + virtual RecordingJITMemoryManager *createMemoryManager() { + return new RecordingJITMemoryManager; + } + virtual void SetUp() { M = new Module("<main>", Context); - RJMM = new RecordingJITMemoryManager; + RJMM = createMemoryManager(); RJMM->setPoisonMemory(true); std::string Error; + TargetOptions Options; + Options.JITExceptionHandling = true; TheJIT.reset(EngineBuilder(M).setEngineKind(EngineKind::JIT) .setJITMemoryManager(RJMM) - .setErrorStr(&Error).create()); + .setErrorStr(&Error) + .setTargetOptions(Options).create()); ASSERT_TRUE(TheJIT.get() != NULL) << Error; } @@ -297,6 +304,46 @@ TEST(JIT, GlobalInFunction) { #endif // !defined(__arm__) && !defined(__powerpc__) +// Regression test for a bug. The JITEmitter wasn't checking to verify that +// it hadn't run out of space while generating the DWARF exception information +// for an emitted function. 
+ +class ExceptionMemoryManagerMock : public RecordingJITMemoryManager { + public: + virtual uint8_t *startExceptionTable(const Function *F, + uintptr_t &ActualSize) { + // Force an insufficient size the first time through. + bool ChangeActualSize = false; + if (ActualSize == 0) + ChangeActualSize = true; + uint8_t *result = + RecordingJITMemoryManager::startExceptionTable(F, ActualSize); + if (ChangeActualSize) + ActualSize = 1; + return result; + } +}; + +class JITExceptionMemoryTest : public JITTest { + protected: + virtual RecordingJITMemoryManager *createMemoryManager() { + return new ExceptionMemoryManagerMock; + } +}; + +TEST_F(JITExceptionMemoryTest, ExceptionTableOverflow) { + Function *F = Function::Create(TypeBuilder<void(void), false>::get(Context), + Function::ExternalLinkage, + "func1", M); + BasicBlock *Block = BasicBlock::Create(Context, "block", F); + IRBuilder<> Builder(Block); + Builder.CreateRetVoid(); + TheJIT->getPointerToFunction(F); + ASSERT_TRUE(RJMM->startExceptionTableCalls.size() == 2); + ASSERT_TRUE(RJMM->deallocateExceptionTableCalls.size() == 1); + ASSERT_TRUE(RJMM->endExceptionTableCalls.size() == 1); +} + int PlusOne(int arg) { return arg + 1; } diff --git a/unittests/ExecutionEngine/JIT/Makefile b/unittests/ExecutionEngine/JIT/Makefile index 9e0bb9e..ef8b827 100644 --- a/unittests/ExecutionEngine/JIT/Makefile +++ b/unittests/ExecutionEngine/JIT/Makefile @@ -24,7 +24,7 @@ ifeq ($(USE_INTEL_JITEVENTS), 1) CPPFLAGS += -I$(INTEL_JITEVENTS_INCDIR) # Link against the LLVM Intel JIT Evens interface library - LINK_COMPONENTS += inteljitevents + LINK_COMPONENTS += debuginfo inteljitevents object endif ifeq ($(USE_OPROFILE), 1) diff --git a/unittests/ExecutionEngine/MCJIT/MCJITTestBase.h b/unittests/ExecutionEngine/MCJIT/MCJITTestBase.h index 4604aa5..fc774ab 100644 --- a/unittests/ExecutionEngine/MCJIT/MCJITTestBase.h +++ b/unittests/ExecutionEngine/MCJIT/MCJITTestBase.h @@ -52,7 +52,7 @@ protected: , MArch("") , Builder(Context) , MM(new SectionMemoryManager) - , HostTriple(LLVM_HOSTTRIPLE) + , HostTriple(sys::getProcessTriple()) { InitializeNativeTarget(); InitializeNativeTargetAsmPrinter(); diff --git a/unittests/IR/AttributesTest.cpp b/unittests/IR/AttributesTest.cpp new file mode 100644 index 0000000..2368bdf --- /dev/null +++ b/unittests/IR/AttributesTest.cpp @@ -0,0 +1,34 @@ +//===- llvm/unittest/IR/AttributesTest.cpp - Attributes unit tests --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details.
+// +//===----------------------------------------------------------------------===// + +#include "llvm/IR/Attributes.h" +#include "llvm/IR/LLVMContext.h" +#include "gtest/gtest.h" +using namespace llvm; + +namespace { + +TEST(Attributes, Uniquing) { + LLVMContext C; + + Attribute AttrA = Attribute::get(C, Attribute::AlwaysInline); + Attribute AttrB = Attribute::get(C, Attribute::AlwaysInline); + EXPECT_EQ(AttrA, AttrB); + + AttributeSet ASs[] = { + AttributeSet::get(C, 1, Attribute::ZExt), + AttributeSet::get(C, 2, Attribute::SExt) + }; + + AttributeSet SetA = AttributeSet::get(C, ASs); + AttributeSet SetB = AttributeSet::get(C, ASs); + EXPECT_EQ(SetA, SetB); +} + +} // end anonymous namespace diff --git a/unittests/IR/CMakeLists.txt b/unittests/IR/CMakeLists.txt index 4ff94f7..aed4597 100644 --- a/unittests/IR/CMakeLists.txt +++ b/unittests/IR/CMakeLists.txt @@ -5,6 +5,7 @@ set(LLVM_LINK_COMPONENTS ) set(IRSources + AttributesTest.cpp ConstantsTest.cpp DominatorTreeTest.cpp IRBuilderTest.cpp diff --git a/unittests/IR/ConstantsTest.cpp b/unittests/IR/ConstantsTest.cpp index be34c1e..fee38b8 100644 --- a/unittests/IR/ConstantsTest.cpp +++ b/unittests/IR/ConstantsTest.cpp @@ -121,16 +121,48 @@ TEST(ConstantsTest, FP128Test) { EXPECT_TRUE(isa<ConstantFP>(X)); } -#define CHECK(x, y) { \ - std::string __s; \ - raw_string_ostream __o(__s); \ - cast<ConstantExpr>(x)->getAsInstruction()->print(__o); \ - __o.flush(); \ - EXPECT_EQ(std::string(" <badref> = " y), __s); \ +TEST(ConstantsTest, PointerCast) { + LLVMContext &C(getGlobalContext()); + Type *Int8PtrTy = Type::getInt8PtrTy(C); + Type *Int32PtrTy = Type::getInt32PtrTy(C); + Type *Int64Ty = Type::getInt64Ty(C); + VectorType *Int8PtrVecTy = VectorType::get(Int8PtrTy, 4); + VectorType *Int32PtrVecTy = VectorType::get(Int32PtrTy, 4); + VectorType *Int64VecTy = VectorType::get(Int64Ty, 4); + + // ptrtoint i8* to i64 + EXPECT_EQ(Constant::getNullValue(Int64Ty), + ConstantExpr::getPointerCast( + Constant::getNullValue(Int8PtrTy), Int64Ty)); + + // bitcast i8* to i32* + EXPECT_EQ(Constant::getNullValue(Int32PtrTy), + ConstantExpr::getPointerCast( + Constant::getNullValue(Int8PtrTy), Int32PtrTy)); + + // ptrtoint <4 x i8*> to <4 x i64> + EXPECT_EQ(Constant::getNullValue(Int64VecTy), + ConstantExpr::getPointerCast( + Constant::getNullValue(Int8PtrVecTy), Int64VecTy)); + + // bitcast <4 x i8*> to <4 x i32*> + EXPECT_EQ(Constant::getNullValue(Int32PtrVecTy), + ConstantExpr::getPointerCast( + Constant::getNullValue(Int8PtrVecTy), Int32PtrVecTy)); +} + +#define CHECK(x, y) { \ + std::string __s; \ + raw_string_ostream __o(__s); \ + Instruction *__I = cast<ConstantExpr>(x)->getAsInstruction(); \ + __I->print(__o); \ + delete __I; \ + __o.flush(); \ + EXPECT_EQ(std::string(" <badref> = " y), __s); \ } TEST(ConstantsTest, AsInstructionsTest) { - Module *M = new Module("MyModule", getGlobalContext()); + OwningPtr<Module> M(new Module("MyModule", getGlobalContext())); Type *Int64Ty = Type::getInt64Ty(getGlobalContext()); Type *Int32Ty = Type::getInt32Ty(getGlobalContext()); diff --git a/unittests/IR/DominatorTreeTest.cpp b/unittests/IR/DominatorTreeTest.cpp index 3a527ad..4e5af93 100644 --- a/unittests/IR/DominatorTreeTest.cpp +++ b/unittests/IR/DominatorTreeTest.cpp @@ -1,3 +1,12 @@ +//===- llvm/unittests/IR/DominatorTreeTest.cpp - DominatorTree unit tests -===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details.
+// +//===----------------------------------------------------------------------===// + #include "llvm/Analysis/Dominators.h" #include "llvm/Assembly/Parser.h" #include "llvm/IR/Instructions.h" @@ -182,7 +191,7 @@ namespace llvm { TEST(DominatorTree, Unreachable) { DPass *P = new DPass(); - Module *M = makeLLVMModule(P); + OwningPtr<Module> M(makeLLVMModule(P)); PassManager Passes; Passes.add(P); Passes.run(*M); diff --git a/unittests/IR/IRBuilderTest.cpp b/unittests/IR/IRBuilderTest.cpp index c56721f..fecc4a4 100644 --- a/unittests/IR/IRBuilderTest.cpp +++ b/unittests/IR/IRBuilderTest.cpp @@ -30,8 +30,8 @@ protected: /*isVarArg=*/false); F = Function::Create(FTy, Function::ExternalLinkage, "", M.get()); BB = BasicBlock::Create(getGlobalContext(), "", F); - GV = new GlobalVariable(Type::getFloatTy(getGlobalContext()), true, - GlobalValue::ExternalLinkage); + GV = new GlobalVariable(*M, Type::getFloatTy(getGlobalContext()), true, + GlobalValue::ExternalLinkage, 0); } virtual void TearDown() { @@ -115,6 +115,7 @@ TEST_F(IRBuilderTest, GetIntTy) { IntegerType *IntPtrTy = Builder.getIntPtrTy(DL); unsigned IntPtrBitSize = DL->getPointerSizeInBits(0); EXPECT_EQ(IntPtrTy, IntegerType::get(getGlobalContext(), IntPtrBitSize)); + delete DL; } TEST_F(IRBuilderTest, FastMathFlags) { diff --git a/unittests/IR/InstructionsTest.cpp b/unittests/IR/InstructionsTest.cpp index 601a84b..9f66af1 100644 --- a/unittests/IR/InstructionsTest.cpp +++ b/unittests/IR/InstructionsTest.cpp @@ -162,6 +162,11 @@ TEST(InstructionsTest, VectorGep) { ICmpInst *ICmp1 = new ICmpInst(ICmpInst::ICMP_ULT, PtrVecA, PtrVecB); EXPECT_NE(ICmp0, ICmp1); // suppress warning. + BasicBlock* BB0 = BasicBlock::Create(C); + // Test InsertAtEnd ICmpInst constructor. + ICmpInst *ICmp2 = new ICmpInst(*BB0, ICmpInst::ICMP_SGE, PtrVecA, PtrVecB); + EXPECT_NE(ICmp0, ICmp2); // suppress warning. 
+ GetElementPtrInst *Gep0 = GetElementPtrInst::Create(PtrVecA, C2xi32a); GetElementPtrInst *Gep1 = GetElementPtrInst::Create(PtrVecA, C2xi32b); GetElementPtrInst *Gep2 = GetElementPtrInst::Create(PtrVecB, C2xi32a); @@ -187,10 +192,10 @@ TEST(InstructionsTest, VectorGep) { "2:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80" ":128:128-n8:16:32:64-S128"); // Make sure we don't crash - GetPointerBaseWithConstantOffset(Gep0, Offset, TD); - GetPointerBaseWithConstantOffset(Gep1, Offset, TD); - GetPointerBaseWithConstantOffset(Gep2, Offset, TD); - GetPointerBaseWithConstantOffset(Gep3, Offset, TD); + GetPointerBaseWithConstantOffset(Gep0, Offset, &TD); + GetPointerBaseWithConstantOffset(Gep1, Offset, &TD); + GetPointerBaseWithConstantOffset(Gep2, Offset, &TD); + GetPointerBaseWithConstantOffset(Gep3, Offset, &TD); // Gep of Geps GetElementPtrInst *GepII0 = GetElementPtrInst::Create(Gep0, C2xi32b); @@ -223,6 +228,9 @@ TEST(InstructionsTest, VectorGep) { delete Gep2; delete Gep3; + ICmp2->eraseFromParent(); + delete BB0; + delete ICmp0; delete ICmp1; delete PtrVecA; diff --git a/unittests/IR/WaymarkTest.cpp b/unittests/IR/WaymarkTest.cpp index 69fc4da..cf7d76d 100644 --- a/unittests/IR/WaymarkTest.cpp +++ b/unittests/IR/WaymarkTest.cpp @@ -27,27 +27,29 @@ TEST(WaymarkTest, NativeArray) { static uint8_t tail[22] = "s02s33s30y2y0s1x0syxS"; Value * values[22]; std::transform(tail, tail + 22, values, char2constant); - FunctionType *FT = FunctionType::get(Type::getVoidTy(getGlobalContext()), true); + FunctionType *FT = FunctionType::get(Type::getVoidTy(getGlobalContext()), true); Function *F = Function::Create(FT, GlobalValue::ExternalLinkage); - const CallInst *A = CallInst::Create(F, makeArrayRef(values)); + const CallInst *A = CallInst::Create(F, makeArrayRef(values)); ASSERT_NE(A, (const CallInst*)NULL); ASSERT_EQ(1U + 22, A->getNumOperands()); - const Use *U = &A->getOperandUse(0); - const Use *Ue = &A->getOperandUse(22); + const Use *U = &A->getOperandUse(0); + const Use *Ue = &A->getOperandUse(22); for (; U != Ue; ++U) { EXPECT_EQ(A, U->getUser()); } + delete A; } TEST(WaymarkTest, TwoBit) { Use* many = (Use*)calloc(sizeof(Use), 8212 + 1); ASSERT_TRUE(many); - Use::initTags(many, many + 8212); - for (const Use *U = many, *Ue = many + 8212 - 1; U != Ue; ++U) + Use::initTags(many, many + 8212); + for (Use *U = many, *Ue = many + 8212 - 1; U != Ue; ++U) { - EXPECT_EQ((User*)(Ue + 1), U->getUser()); + EXPECT_EQ(reinterpret_cast<User *>(Ue + 1), U->getUser()); } + free(many); } } // end anonymous namespace diff --git a/unittests/Option/OptionParsingTest.cpp b/unittests/Option/OptionParsingTest.cpp index 76e2549..30944d9 100644 --- a/unittests/Option/OptionParsingTest.cpp +++ b/unittests/Option/OptionParsingTest.cpp @@ -7,6 +7,7 @@ // //===----------------------------------------------------------------------===// +#include "llvm/ADT/OwningPtr.h" #include "llvm/Option/Arg.h" #include "llvm/Option/ArgList.h" #include "llvm/Option/Option.h" @@ -60,8 +61,12 @@ const char *Args[] = { TEST(Support, OptionParsing) { TestOptTable T; unsigned MAI, MAC; - InputArgList *AL = T.ParseArgs(Args, Args + (sizeof(Args) / sizeof(Args[0])), MAI, MAC); - + OwningPtr<InputArgList> + AL(T.ParseArgs(Args, + Args + (sizeof(Args) / sizeof(Args[0])), + MAI, + MAC)); + // Check they all exist. 
EXPECT_TRUE(AL->hasArg(OPT_A)); EXPECT_TRUE(AL->hasArg(OPT_B)); diff --git a/unittests/Support/CMakeLists.txt b/unittests/Support/CMakeLists.txt index dd42585..b4b982f 100644 --- a/unittests/Support/CMakeLists.txt +++ b/unittests/Support/CMakeLists.txt @@ -13,6 +13,7 @@ add_llvm_unittest(SupportTests ConstantRangeTest.cpp DataExtractorTest.cpp EndianTest.cpp + ErrorOrTest.cpp FileOutputBufferTest.cpp IntegersSubsetTest.cpp LeakDetectorTest.cpp diff --git a/unittests/Support/ErrorOrTest.cpp b/unittests/Support/ErrorOrTest.cpp new file mode 100644 index 0000000..aa0ddd5 --- /dev/null +++ b/unittests/Support/ErrorOrTest.cpp @@ -0,0 +1,104 @@ +//===- unittests/ErrorOrTest.cpp - ErrorOr.h tests ------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Support/ErrorOr.h" + +#include "gtest/gtest.h" + +#include <memory> + +using namespace llvm; + +namespace { + +ErrorOr<int> t1() {return 1;} +ErrorOr<int> t2() {return make_error_code(errc::invalid_argument);} + +TEST(ErrorOr, SimpleValue) { + ErrorOr<int> a = t1(); + EXPECT_TRUE(a); + EXPECT_EQ(1, *a); + + a = t2(); + EXPECT_FALSE(a); + EXPECT_EQ(errc::invalid_argument, a); +#ifdef EXPECT_DEBUG_DEATH + EXPECT_DEBUG_DEATH(*a, "Cannot get value when an error exists"); +#endif +} + +#if LLVM_HAS_CXX11_STDLIB +ErrorOr<std::unique_ptr<int> > t3() { + return std::unique_ptr<int>(new int(3)); +} +#endif + +TEST(ErrorOr, Types) { + int x; + ErrorOr<int&> a(x); + *a = 42; + EXPECT_EQ(42, x); + + EXPECT_FALSE(ErrorOr<void>(make_error_code(errc::broken_pipe))); + EXPECT_TRUE(ErrorOr<void>(make_error_code(errc::success))); + +#if LLVM_HAS_CXX11_STDLIB + // Move only types. + EXPECT_EQ(3, **t3()); +#endif +} + +struct B {}; +struct D : B {}; + +TEST(ErrorOr, Covariant) { + ErrorOr<B*> b(ErrorOr<D*>(0)); + b = ErrorOr<D*>(0); + +#if LLVM_HAS_CXX11_STDLIB + ErrorOr<std::unique_ptr<B> > b1(ErrorOr<std::unique_ptr<D> >(0)); + b1 = ErrorOr<std::unique_ptr<D> >(0); +#endif +} +} // end anon namespace + +struct InvalidArgError { + InvalidArgError() {} + InvalidArgError(std::string S) : ArgName(S) {} + std::string ArgName; +}; + +namespace llvm { +template<> +struct ErrorOrUserDataTraits<InvalidArgError> : true_type { + static error_code error() { + return make_error_code(errc::invalid_argument); + } +}; +} // end namespace llvm + +ErrorOr<int> t4() { + return InvalidArgError("adena"); +} + +ErrorOr<void> t5() { + return InvalidArgError("pie"); +} + +namespace { +TEST(ErrorOr, UserErrorData) { + ErrorOr<int> a = t4(); + EXPECT_EQ(errc::invalid_argument, a); + EXPECT_EQ("adena", t4().getError<InvalidArgError>().ArgName); + + ErrorOr<void> b = t5(); + EXPECT_EQ(errc::invalid_argument, b); + EXPECT_EQ("pie", b.getError<InvalidArgError>().ArgName); +} +} // end anon namespace diff --git a/unittests/Support/ManagedStatic.cpp b/unittests/Support/ManagedStatic.cpp index 79eb098..a413761 100644 --- a/unittests/Support/ManagedStatic.cpp +++ b/unittests/Support/ManagedStatic.cpp @@ -26,17 +26,34 @@ namespace test1 { *ms; return NULL; } + + // Valgrind's leak checker complains about glibc's stack allocation. + // To appease Valgrind, we provide our own stack for each thread.
+ void *allocate_stack(pthread_attr_t &a, size_t n = 65536) { + void *stack = malloc(n); + pthread_attr_init(&a); +#if defined(__linux__) + pthread_attr_setstack(&a, stack, n); +#endif + return stack; + } } TEST(Initialize, MultipleThreads) { // Run this test under tsan: http://code.google.com/p/data-race-test/ + pthread_attr_t a1, a2; + void *p1 = test1::allocate_stack(a1); + void *p2 = test1::allocate_stack(a2); + llvm_start_multithreaded(); pthread_t t1, t2; - pthread_create(&t1, NULL, test1::helper, NULL); - pthread_create(&t2, NULL, test1::helper, NULL); + pthread_create(&t1, &a1, test1::helper, NULL); + pthread_create(&t2, &a2, test1::helper, NULL); pthread_join(t1, NULL); pthread_join(t2, NULL); + free(p1); + free(p2); llvm_stop_multithreaded(); } #endif diff --git a/unittests/Support/YAMLIOTest.cpp b/unittests/Support/YAMLIOTest.cpp index afa71cc..0993d8c 100644 --- a/unittests/Support/YAMLIOTest.cpp +++ b/unittests/Support/YAMLIOTest.cpp @@ -782,11 +782,17 @@ namespace yaml { struct MappingTraits<KindAndFlags> { static void mapping(IO &io, KindAndFlags& kf) { io.mapRequired("kind", kf.kind); - // type of flags field varies depending on kind field - if ( kf.kind == kindA ) - io.mapRequired("flags", *((AFlags*)&kf.flags)); - else - io.mapRequired("flags", *((BFlags*)&kf.flags)); + // Type of "flags" field varies depending on "kind" field. + // Copy through a correctly typed local to avoid breaking strict aliasing rules. + if (kf.kind == kindA) { + AFlags aflags = static_cast<AFlags>(kf.flags); + io.mapRequired("flags", aflags); + kf.flags = aflags; + } else { + BFlags bflags = static_cast<BFlags>(kf.flags); + io.mapRequired("flags", bflags); + kf.flags = bflags; + } } }; } diff --git a/utils/FileCheck/FileCheck.cpp b/utils/FileCheck/FileCheck.cpp index a0eeb0e..563cab0 100644 --- a/utils/FileCheck/FileCheck.cpp +++ b/utils/FileCheck/FileCheck.cpp @@ -242,7 +242,7 @@ bool Pattern::ParsePattern(StringRef PatternStr, SourceMgr &SM, } // Name can't start with a digit. - if (isdigit(Name[0])) { + if (isdigit(static_cast<unsigned char>(Name[0]))) { SM.PrintMessage(SMLoc::getFromPointer(Name.data()), SourceMgr::DK_Error, "invalid name in named regex"); return true; @@ -587,9 +587,13 @@ struct CheckString { : Pat(P), Loc(L), IsCheckNext(isCheckNext) {} }; -/// CanonicalizeInputFile - Remove duplicate horizontal space from the specified -/// memory buffer, free it, and return a new one. -static MemoryBuffer *CanonicalizeInputFile(MemoryBuffer *MB) { +/// Canonicalize whitespace in the input file. Line endings are replaced +/// with UNIX-style '\n'. +/// +/// \param PreserveHorizontal Don't squash consecutive horizontal whitespace +/// characters to a single space. +static MemoryBuffer *CanonicalizeInputFile(MemoryBuffer *MB, + bool PreserveHorizontal) { SmallString<128> NewFile; NewFile.reserve(MB->getBufferSize()); @@ -600,8 +604,9 @@ static MemoryBuffer *CanonicalizeInputFile(MemoryBuffer *MB) { continue; } - // If current char is not a horizontal whitespace, dump it to output as is. - if (*Ptr != ' ' && *Ptr != '\t') { + // If current char is not a horizontal whitespace or if horizontal + // whitespace canonicalization is disabled, dump it to output as is. + if (PreserveHorizontal || (*Ptr != ' ' && *Ptr != '\t')) { NewFile.push_back(*Ptr); continue; } @@ -637,9 +642,8 @@ static bool ReadCheckFile(SourceMgr &SM, MemoryBuffer *F = File.take(); // If we want to canonicalize whitespace, strip excess whitespace from the
- if (!NoCanonicalizeWhiteSpace) - F = CanonicalizeInputFile(F); + // buffer containing the CHECK lines. Remove DOS style line endings. + F = CanonicalizeInputFile(F, NoCanonicalizeWhiteSpace); SM.AddNewSourceBuffer(F, SMLoc()); @@ -807,8 +811,8 @@ int main(int argc, char **argv) { } // Remove duplicate spaces in the input file if requested. - if (!NoCanonicalizeWhiteSpace) - F = CanonicalizeInputFile(F); + // Remove DOS style line endings. + F = CanonicalizeInputFile(F, NoCanonicalizeWhiteSpace); SM.AddNewSourceBuffer(F, SMLoc()); diff --git a/utils/TableGen/AsmMatcherEmitter.cpp b/utils/TableGen/AsmMatcherEmitter.cpp index bc0cf43..6faf819 100644 --- a/utils/TableGen/AsmMatcherEmitter.cpp +++ b/utils/TableGen/AsmMatcherEmitter.cpp @@ -958,8 +958,12 @@ static std::string getEnumNameForToken(StringRef Str) { case ':': Res += "_COLON_"; break; case '!': Res += "_EXCLAIM_"; break; case '.': Res += "_DOT_"; break; + case '<': Res += "_LT_"; break; + case '>': Res += "_GT_"; break; default: - if (isalnum(*it)) + if ((*it >= 'A' && *it <= 'Z') || + (*it >= 'a' && *it <= 'z') || + (*it >= '0' && *it <= '9')) Res += *it; else Res += "_" + utostr((unsigned) *it) + "_"; @@ -1723,7 +1727,7 @@ static void emitConvertFuncs(CodeGenTarget &Target, StringRef ClassName, << " default: llvm_unreachable(\"invalid conversion entry!\");\n" << " case CVT_Reg:\n" << " Operands[*(p + 1)]->setMCOperandNum(NumMCOperands);\n" - << " Operands[*(p + 1)]->setConstraint(\"m\");\n" + << " Operands[*(p + 1)]->setConstraint(\"r\");\n" << " ++NumMCOperands;\n" << " break;\n" << " case CVT_Tied:\n" @@ -1754,7 +1758,8 @@ static void emitConvertFuncs(CodeGenTarget &Target, StringRef ClassName, // Remember this converter for the kind enum. unsigned KindID = OperandConversionKinds.size(); - OperandConversionKinds.insert("CVT_" + AsmMatchConverter); + OperandConversionKinds.insert("CVT_" + + getEnumNameForToken(AsmMatchConverter)); // Add the converter row for this instruction. ConversionTable.push_back(std::vector<uint8_t>()); @@ -1762,7 +1767,8 @@ static void emitConvertFuncs(CodeGenTarget &Target, StringRef ClassName, ConversionTable.back().push_back(CVT_Done); // Add the handler to the conversion driver function. - CvtOS << " case CVT_" << AsmMatchConverter << ":\n" + CvtOS << " case CVT_" + << getEnumNameForToken(AsmMatchConverter) << ":\n" << " " << AsmMatchConverter << "(Inst, Operands);\n" << " break;\n"; @@ -1800,6 +1806,7 @@ static void emitConvertFuncs(CodeGenTarget &Target, StringRef ClassName, // the index of its entry in the vector). std::string Name = "CVT_" + (Op.Class->isRegisterClass() ? "Reg" : Op.Class->RenderMethod); + Name = getEnumNameForToken(Name); bool IsNewConverter = false; unsigned ID = getConverterOperandID(Name, OperandConversionKinds, @@ -1823,9 +1830,13 @@ static void emitConvertFuncs(CodeGenTarget &Target, StringRef ClassName, // Add a handler for the operand number lookup. 
OpOS << " case " << Name << ":\n" - << " Operands[*(p + 1)]->setMCOperandNum(NumMCOperands);\n" - << " Operands[*(p + 1)]->setConstraint(\"m\");\n" - << " NumMCOperands += " << OpInfo.MINumOperands << ";\n" + << " Operands[*(p + 1)]->setMCOperandNum(NumMCOperands);\n"; + + if (Op.Class->isRegisterClass()) + OpOS << " Operands[*(p + 1)]->setConstraint(\"r\");\n"; + else + OpOS << " Operands[*(p + 1)]->setConstraint(\"m\");\n"; + OpOS << " NumMCOperands += " << OpInfo.MINumOperands << ";\n" << " break;\n"; break; } @@ -2867,6 +2878,15 @@ void AsmMatcherEmitter::run(raw_ostream &OS) { OS << "(MatchClassKind)it->Classes[i]);\n"; OS << " if (Diag == Match_Success)\n"; OS << " continue;\n"; + OS << " // If the generic handler indicates an invalid operand\n"; + OS << " // failure, check for a special case.\n"; + OS << " if (Diag == Match_InvalidOperand) {\n"; + OS << " Diag = validateTargetOperandClass(Operands[i+1],\n"; + OS.indent(43); + OS << "(MatchClassKind)it->Classes[i]);\n"; + OS << " if (Diag == Match_Success)\n"; + OS << " continue;\n"; + OS << " }\n"; OS << " // If this operand is broken for all of the instances of this\n"; OS << " // mnemonic, keep track of it so we can report loc info.\n"; OS << " // If we already had a match that only failed due to a\n"; diff --git a/utils/TableGen/AsmWriterEmitter.cpp b/utils/TableGen/AsmWriterEmitter.cpp index a4114d9..ac8d896 100644 --- a/utils/TableGen/AsmWriterEmitter.cpp +++ b/utils/TableGen/AsmWriterEmitter.cpp @@ -842,8 +842,11 @@ void AsmWriterEmitter::EmitPrintAliasInstruction(raw_ostream &O) { if (!IAP->isOpMapped(ROName)) { IAP->addOperand(ROName, i); + Record *R = CGA->ResultOperands[i].getRecord(); + if (R->isSubClassOf("RegisterOperand")) + R = R->getValueAsDef("RegClass"); Cond = std::string("MRI.getRegClass(") + Target.getName() + "::" + - CGA->ResultOperands[i].getRecord()->getName() + "RegClassID)" + R->getName() + "RegClassID)" ".contains(MI->getOperand(" + llvm::utostr(i) + ").getReg())"; IAP->addCond(Cond); } else { @@ -863,12 +866,18 @@ void AsmWriterEmitter::EmitPrintAliasInstruction(raw_ostream &O) { break; } - case CodeGenInstAlias::ResultOperand::K_Imm: - Cond = std::string("MI->getOperand(") + - llvm::utostr(i) + ").getImm() == " + - llvm::utostr(CGA->ResultOperands[i].getImm()); + case CodeGenInstAlias::ResultOperand::K_Imm: { + std::string Op = "MI->getOperand(" + llvm::utostr(i) + ")"; + + // Just because the alias has an immediate result doesn't mean the + // MCInst will. An MCExpr could be present, for example. + IAP->addCond(Op + ".isImm()"); + + Cond = Op + ".getImm() == " + + llvm::utostr(CGA->ResultOperands[i].getImm()); + IAP->addCond(Cond); + break; + } case CodeGenInstAlias::ResultOperand::K_Reg: // If this is zero_reg, something's playing tricks we're not // equipped to handle.
diff --git a/utils/TableGen/CodeGenMapTable.cpp b/utils/TableGen/CodeGenMapTable.cpp index 1653d67..ee32aa1 100644 --- a/utils/TableGen/CodeGenMapTable.cpp +++ b/utils/TableGen/CodeGenMapTable.cpp @@ -533,12 +533,11 @@ static void emitEnums(raw_ostream &OS, RecordKeeper &Records) { II = ColFieldValueMap.begin(), IE = ColFieldValueMap.end(); II != IE; II++) { std::vector<Init*> FieldValues = (*II).second; - unsigned FieldSize = FieldValues.size(); // Delete duplicate entries from ColFieldValueMap - for (unsigned i = 0; i < FieldSize - 1; i++) { + for (unsigned i = 0; i < FieldValues.size() - 1; i++) { Init *CurVal = FieldValues[i]; - for (unsigned j = i+1; j < FieldSize; j++) { + for (unsigned j = i+1; j < FieldValues.size(); j++) { if (CurVal == FieldValues[j]) { FieldValues.erase(FieldValues.begin()+j); } @@ -547,9 +546,9 @@ static void emitEnums(raw_ostream &OS, RecordKeeper &Records) { // Emit enumerated values for the column fields. OS << "enum " << (*II).first << " {\n"; - for (unsigned i = 0; i < FieldSize; i++) { + for (unsigned i = 0, endFV = FieldValues.size(); i < endFV; i++) { OS << "\t" << (*II).first << "_" << FieldValues[i]->getAsUnquotedString(); - if (i != FieldValues.size() - 1) + if (i != endFV - 1) OS << ",\n"; else OS << "\n};\n\n"; diff --git a/utils/TableGen/CodeGenRegisters.cpp b/utils/TableGen/CodeGenRegisters.cpp index 20d439f..993b8db 100644 --- a/utils/TableGen/CodeGenRegisters.cpp +++ b/utils/TableGen/CodeGenRegisters.cpp @@ -636,8 +636,10 @@ struct TupleExpander : SetTheory::Expander { Elts.insert(NewReg); // Copy Proto super-classes. - for (unsigned i = 0, e = Proto->getSuperClasses().size(); i != e; ++i) - NewReg->addSuperClass(Proto->getSuperClasses()[i]); + ArrayRef<Record *> Supers = Proto->getSuperClasses(); + ArrayRef<SMRange> Ranges = Proto->getSuperClassRanges(); + for (unsigned i = 0, e = Supers.size(); i != e; ++i) + NewReg->addSuperClass(Supers[i], Ranges[i]); // Copy Proto fields. for (unsigned i = 0, e = Proto->getValues().size(); i != e; ++i) { @@ -701,7 +703,9 @@ CodeGenRegisterClass::CodeGenRegisterClass(CodeGenRegBank &RegBank, Record *R) // Rename anonymous register classes. if (R->getName().size() > 9 && R->getName()[9] == '.') { static unsigned AnonCounter = 0; - R->setName("AnonRegClass_"+utostr(AnonCounter++)); + R->setName("AnonRegClass_" + utostr(AnonCounter)); + // MSVC2012 ICEs if AnonCounter++ is directly passed to utostr. + ++AnonCounter; } std::vector<Record*> TypeList = R->getValueAsListOfDefs("RegTypes"); @@ -1196,6 +1200,12 @@ void CodeGenRegBank::computeSubRegIndexLaneMasks() { if (Idx->getComposites().empty()) { Idx->LaneMask = 1u << Bit; // Share bit 31 in the unlikely case there are more than 32 leafs. + // + // Sharing bits is harmless; it allows graceful degradation in targets + // with more than 32 vector lanes. They simply get a limited resolution + // view of lanes beyond the 32nd. + // + // See also the comment for getSubRegIndexLaneMask(). if (Bit < 31) ++Bit; } else { Idx->LaneMask = 0; diff --git a/utils/TableGen/CodeGenSchedule.cpp b/utils/TableGen/CodeGenSchedule.cpp index c653c49..23b79fc 100644 --- a/utils/TableGen/CodeGenSchedule.cpp +++ b/utils/TableGen/CodeGenSchedule.cpp @@ -1380,8 +1380,22 @@ void CodeGenSchedModels::collectProcResources() { SCI != SCE; ++SCI) { if (SCI->ItinClassDef) collectItinProcResources(SCI->ItinClassDef); - else + else { + // This class may have a default ReadWrite list which can be overridden by + // InstRW definitions.
+ if (!SCI->InstRWs.empty()) { + for (RecIter RWI = SCI->InstRWs.begin(), RWE = SCI->InstRWs.end(); + RWI != RWE; ++RWI) { + Record *RWModelDef = (*RWI)->getValueAsDef("SchedModel"); + IdxVec ProcIndices(1, getProcModel(RWModelDef).Index); + IdxVec Writes, Reads; + findRWs((*RWI)->getValueAsListOfDefs("OperandReadWrites"), + Writes, Reads); + collectRWResources(Writes, Reads, ProcIndices); + } + } collectRWResources(SCI->Writes, SCI->Reads, SCI->ProcIndices); + } } // Add resources separately defined by each subtarget. RecVec WRDefs = Records.getAllDerivedDefinitions("WriteRes"); diff --git a/utils/TableGen/DAGISelMatcherEmitter.cpp b/utils/TableGen/DAGISelMatcherEmitter.cpp index 4345708..93f84ce 100644 --- a/utils/TableGen/DAGISelMatcherEmitter.cpp +++ b/utils/TableGen/DAGISelMatcherEmitter.cpp @@ -132,7 +132,7 @@ static uint64_t EmitVBRValue(uint64_t Val, raw_ostream &OS) { return NumBytes+1; } -/// EmitMatcherOpcodes - Emit bytes for the specified matcher and return +/// EmitMatcher - Emit bytes for the specified matcher and return /// the number of bytes emitted. unsigned MatcherTableEmitter:: EmitMatcher(const Matcher *N, unsigned Indent, unsigned CurrentIdx, diff --git a/utils/TableGen/DisassemblerEmitter.cpp b/utils/TableGen/DisassemblerEmitter.cpp index 2d11d24..5a2a41b 100644 --- a/utils/TableGen/DisassemblerEmitter.cpp +++ b/utils/TableGen/DisassemblerEmitter.cpp @@ -127,8 +127,9 @@ void EmitDisassembler(RecordKeeper &Records, raw_ostream &OS) { // ARM and Thumb have a CHECK() macro to deal with DecodeStatuses. if (Target.getName() == "ARM" || - Target.getName() == "Thumb") { - EmitFixedLenDecoder(Records, OS, "ARM", + Target.getName() == "Thumb" || + Target.getName() == "AArch64") { + EmitFixedLenDecoder(Records, OS, Target.getName() == "AArch64" ? "AArch64" : "ARM", "if (!Check(S, ", ")) return MCDisassembler::Fail;", "S", "MCDisassembler::Fail", " MCDisassembler::DecodeStatus S = " diff --git a/utils/TableGen/IntrinsicEmitter.cpp b/utils/TableGen/IntrinsicEmitter.cpp index c4549d1..df4d847 100644 --- a/utils/TableGen/IntrinsicEmitter.cpp +++ b/utils/TableGen/IntrinsicEmitter.cpp @@ -221,27 +221,28 @@ enum IIT_Info { IIT_I16 = 3, IIT_I32 = 4, IIT_I64 = 5, - IIT_F32 = 6, - IIT_F64 = 7, - IIT_V2 = 8, - IIT_V4 = 9, - IIT_V8 = 10, - IIT_V16 = 11, - IIT_V32 = 12, - IIT_MMX = 13, + IIT_F16 = 6, + IIT_F32 = 7, + IIT_F64 = 8, + IIT_V2 = 9, + IIT_V4 = 10, + IIT_V8 = 11, + IIT_V16 = 12, + IIT_V32 = 13, IIT_PTR = 14, IIT_ARG = 15, - + // Values from 16+ are only encodable with the inefficient encoding. - IIT_METADATA = 16, - IIT_EMPTYSTRUCT = 17, - IIT_STRUCT2 = 18, - IIT_STRUCT3 = 19, - IIT_STRUCT4 = 20, - IIT_STRUCT5 = 21, - IIT_EXTEND_VEC_ARG = 22, - IIT_TRUNC_VEC_ARG = 23, - IIT_ANYPTR = 24 + IIT_MMX = 16, + IIT_METADATA = 17, + IIT_EMPTYSTRUCT = 18, + IIT_STRUCT2 = 19, + IIT_STRUCT3 = 20, + IIT_STRUCT4 = 21, + IIT_STRUCT5 = 22, + IIT_EXTEND_VEC_ARG = 23, + IIT_TRUNC_VEC_ARG = 24, + IIT_ANYPTR = 25 }; @@ -261,6 +262,7 @@ static void EncodeFixedValueType(MVT::SimpleValueType VT, switch (VT) { default: PrintFatalError("unhandled MVT in intrinsic!"); + case MVT::f16: return Sig.push_back(IIT_F16); case MVT::f32: return Sig.push_back(IIT_F32); case MVT::f64: return Sig.push_back(IIT_F64); case MVT::Metadata: return Sig.push_back(IIT_METADATA); @@ -532,9 +534,8 @@ EmitAttributes(const std::vector<CodeGenIntrinsic> &Ints, raw_ostream &OS) { N = ++AttrNum; } - // Emit an array of AttributeWithIndex. 
Most intrinsics will have - // at least one entry, for the function itself (index ~1), which is - // usually nounwind. + // Emit an array of AttributeSet. Most intrinsics will have at least one + // entry, for the function itself (index ~1), which is usually nounwind. OS << " static const uint8_t IntrinsicsToAttributesMap[] = {\n"; for (unsigned i = 0, e = Ints.size(); i != e; ++i) { @@ -545,7 +546,7 @@ EmitAttributes(const std::vector<CodeGenIntrinsic> &Ints, raw_ostream &OS) { } OS << " };\n\n"; - OS << " AttributeWithIndex AWI[" << maxArgAttrs+1 << "];\n"; + OS << " AttributeSet AS[" << maxArgAttrs+1 << "];\n"; OS << " unsigned NumAttrs = 0;\n"; OS << " if (id != 0) {\n"; OS << " SmallVector<Attribute::AttrKind, 8> AttrVec;\n"; @@ -583,7 +584,7 @@ EmitAttributes(const std::vector<CodeGenIntrinsic> &Ints, raw_ostream &OS) { ++ai; } while (ai != ae && intrinsic.ArgumentAttributes[ai].first == argNo); - OS << " AWI[" << numAttrs++ << "] = AttributeWithIndex::get(C, " + OS << " AS[" << numAttrs++ << "] = AttributeSet::get(C, " << argNo+1 << ", AttrVec);\n"; } } @@ -607,7 +608,7 @@ EmitAttributes(const std::vector<CodeGenIntrinsic> &Ints, raw_ostream &OS) { OS << " AttrVec.push_back(Attribute::ReadNone);\n"; break; } - OS << " AWI[" << numAttrs++ << "] = AttributeWithIndex::get(C, " + OS << " AS[" << numAttrs++ << "] = AttributeSet::get(C, " << "AttributeSet::FunctionIndex, AttrVec);\n"; } @@ -621,7 +622,7 @@ EmitAttributes(const std::vector<CodeGenIntrinsic> &Ints, raw_ostream &OS) { OS << " }\n"; OS << " }\n"; - OS << " return AttributeSet::get(C, ArrayRef<AttributeWithIndex>(AWI, " + OS << " return AttributeSet::get(C, ArrayRef<AttributeSet>(AS, " "NumAttrs));\n"; OS << "}\n"; OS << "#endif // GET_INTRINSIC_ATTRIBUTES\n\n"; diff --git a/utils/TableGen/PseudoLoweringEmitter.cpp b/utils/TableGen/PseudoLoweringEmitter.cpp index 64aaee7..1ea6f79 100644 --- a/utils/TableGen/PseudoLoweringEmitter.cpp +++ b/utils/TableGen/PseudoLoweringEmitter.cpp @@ -252,6 +252,7 @@ void PseudoLoweringEmitter::emitLoweringEmitter(raw_ostream &o) { MIOpNo += Dest.Operands[OpNo].MINumOperands; } if (Dest.Operands.isVariadic) { + MIOpNo = Source.Operands.size() + 1; o << " // variable_ops\n"; o << " for (unsigned i = " << MIOpNo << ", e = MI->getNumOperands(); i != e; ++i)\n" diff --git a/utils/TableGen/SubtargetEmitter.cpp b/utils/TableGen/SubtargetEmitter.cpp index 3b7d006..fc8d00d 100644 --- a/utils/TableGen/SubtargetEmitter.cpp +++ b/utils/TableGen/SubtargetEmitter.cpp @@ -1108,6 +1108,7 @@ void SubtargetEmitter::EmitProcessorModels(raw_ostream &OS) { EmitProcessorProp(OS, PI->ModelDef, "MinLatency", ','); EmitProcessorProp(OS, PI->ModelDef, "LoadLatency", ','); EmitProcessorProp(OS, PI->ModelDef, "HighLatency", ','); + EmitProcessorProp(OS, PI->ModelDef, "ILPWindow", ','); EmitProcessorProp(OS, PI->ModelDef, "MispredictPenalty", ','); OS << " " << PI->Index << ", // Processor ID\n"; if (PI->hasInstrSchedModel()) diff --git a/utils/TableGen/X86RecognizableInstr.cpp b/utils/TableGen/X86RecognizableInstr.cpp index b99a6eb..b00f7ea 100644 --- a/utils/TableGen/X86RecognizableInstr.cpp +++ b/utils/TableGen/X86RecognizableInstr.cpp @@ -119,6 +119,7 @@ namespace X86Local { #define TWO_BYTE_EXTENSION_TABLES \ EXTENSION_TABLE(00) \ EXTENSION_TABLE(01) \ + EXTENSION_TABLE(0d) \ EXTENSION_TABLE(18) \ EXTENSION_TABLE(71) \ EXTENSION_TABLE(72) \ diff --git a/utils/buildit/build_llvm b/utils/buildit/build_llvm index 6aee831..c056b97 100755 --- a/utils/buildit/build_llvm +++ b/utils/buildit/build_llvm @@ -77,6 +77,45 @@ rm 
$SRC_DIR/Makefile || exit 1 # Now create our own by editing the top-level Makefile, deleting every line marked "Apple-style": sed -e '/[Aa]pple-style/d' -e '/include.*GNUmakefile/d' $ORIG_SRC_DIR/Makefile > $SRC_DIR/Makefile || exit 1 +SUBVERSION=`echo $RC_ProjectSourceVersion | sed -e 's/.*\.\([0-9]*\).*/\1/'` +if [ "x$SUBVERSION" != "x$RC_ProjectSourceVersion" ]; then + LLVM_SUBMIT_SUBVERSION=`printf "%02d" $SUBVERSION` + RC_ProjectSourceVersion=`echo $RC_ProjectSourceVersion | sed -e 's/\..*//'` + LLVM_SUBMIT_VERSION=$RC_ProjectSourceVersion +fi +if [ "x$LLVM_SUBMIT_SUBVERSION" = "x00" -o "x$LLVM_SUBMIT_SUBVERSION" = "x0" ]; then + LLVM_VERSION="$LLVM_SUBMIT_VERSION" +else + LLVM_VERSION="$LLVM_SUBMIT_VERSION-$LLVM_SUBMIT_SUBVERSION" +fi + +# Figure out how many make processes to run. +SYSCTL=`sysctl -n hw.activecpu` +# sysctl -n hw.* does not work when invoked via B&I chroot /BuildRoot. +# Builders can default to 2, since even if they are single processor, +# nothing else is running on the machine. +if [ -z "$SYSCTL" ]; then + SYSCTL=2 +fi +JOBS_FLAG="-j $SYSCTL" + +COMMON_CONFIGURE_OPTS="\ + --prefix=$DEST_DIR$DEST_ROOT \ + --enable-assertions=$LLVM_ASSERTIONS \ + --enable-optimized=$LLVM_OPTIMIZED \ + --disable-bindings" + +COMMON_MAKEFLAGS="\ + UNIVERSAL=1 \ + UNIVERSAL_SDK_PATH=$SDKROOT \ + NO_RUNTIME_LIBS=1 \ + DISABLE_EDIS=1 \ + REQUIRES_RTTI=1 \ + DEBUG_SYMBOLS=1 \ + LLVM_SUBMIT_VERSION=$LLVM_SUBMIT_VERSION \ + LLVM_SUBMIT_SUBVERSION=$LLVM_SUBMIT_SUBVERSION \ + VERBOSE=1" + # Build the LLVM tree universal. mkdir -p $DIR/obj-llvm || exit 1 cd $DIR/obj-llvm || exit 1 @@ -89,6 +128,7 @@ if [ "$ARM_HOSTED_BUILD" = yes ]; then for prog in ar nm ranlib strip lipo ld as ; do P=$DIR/bin/arm-apple-darwin$DARWIN_VERS-${prog} T=`xcrun -sdk $SDKROOT -find ${prog}` + ln -s $T $DIR/bin/$prog echo '#!/bin/sh' > $P || exit 1 echo 'exec '$T' "$@"' >> $P || exit 1 chmod a+x $P || exit 1 @@ -97,80 +137,74 @@ if [ "$ARM_HOSTED_BUILD" = yes ]; then for prog in clang clang++ ; do P=$DIR/bin/arm-apple-darwin$DARWIN_VERS-${prog} T=`xcrun -sdk $SDKROOT -find ${prog}` + ln -s $T $DIR/bin/$prog echo '#!/bin/sh' > $P || exit 1 echo 'exec '$T' -arch armv7 -isysroot '${SDKROOT}' "$@"' >> $P || exit 1 chmod a+x $P || exit 1 done PATH=$DIR/bin:$PATH -fi -if [ "$ARM_HOSTED_BUILD" = yes ]; then - configure_opts="--enable-targets=arm --host=arm-apple-darwin10 \ - --target=arm-apple-darwin10 --build=i686-apple-darwin10" -elif [ "$IOS_SIM_BUILD" = yes ]; then - # Use a non-standard "darwin_sim" host triple to trigger a cross-build. - configure_opts="--enable-targets=x86 --host=i686-apple-darwin_sim \ - --build=i686-apple-darwin10" + unset SDKROOT && \ + $SRC_DIR/configure $COMMON_CONFIGURE_OPTS \ + --enable-targets=arm \ + --host=arm-apple-darwin10 \ + --target=arm-apple-darwin10 \ + --build=i686-apple-darwin10 \ + --program-prefix="" \ + || exit 1 + + if [ -n "$IPHONEOS_DEPLOYMENT_TARGET" ]; then + COMMON_MAKEFLAGS="$COMMON_MAKEFLAGS \ + DEPLOYMENT_TARGET=-mios-version-min=$IPHONEOS_DEPLOYMENT_TARGET" + fi + + make $JOBS_FLAG $COMMON_MAKEFLAGS SDKROOT= UNIVERSAL_ARCH="$HOSTS" \ + CXXFLAGS="-DLLVM_VERSION_INFO='\" Apple Build #$LLVM_VERSION\"'" + if [ $? != 0 ] ; then + echo "error: LLVM 'make' failed!" + exit 1 + fi + else - configure_opts="--enable-targets=arm,x86" -fi +# not $ARM_HOSTED_BUILD + + export CC=`xcrun -find clang` + export CXX=`xcrun -find clang++` + + if [ "$IOS_SIM_BUILD" = yes ]; then + # Use a non-standard "darwin_sim" host triple to trigger a cross-build. 
+ configure_opts="--enable-targets=x86 --host=i686-apple-darwin_sim \ + --build=i686-apple-darwin10" + if [ -n "$IPHONEOS_DEPLOYMENT_TARGET" ]; then + COMMON_MAKEFLAGS="$COMMON_MAKEFLAGS \ + DEPLOYMENT_TARGET=-mios-simulator-version-min=$IPHONEOS_DEPLOYMENT_TARGET" + fi + else + configure_opts="--enable-targets=arm,x86" + if [ -n "$MACOSX_DEPLOYMENT_TARGET" ]; then + COMMON_MAKEFLAGS="$COMMON_MAKEFLAGS \ + DEPLOYMENT_TARGET=-mmacosx-version-min=$MACOSX_DEPLOYMENT_TARGET" + fi + fi -if [ "$ARM_HOSTED_BUILD" != yes ]; then if [ $SDKROOT ]; then CPPFLAGS="$CPPFLAGS -isysroot $SDKROOT" fi for host in $HOSTS; do :; done CPPFLAGS="$CPPFLAGS -arch $host" -fi -if [ \! -f Makefile.config ]; then - $SRC_DIR/configure --prefix=$DEST_DIR$DEST_ROOT $configure_opts \ - --enable-assertions=$LLVM_ASSERTIONS \ - --enable-optimized=$LLVM_OPTIMIZED \ - --disable-bindings \ + $SRC_DIR/configure $COMMON_CONFIGURE_OPTS $configure_opts \ + --program-prefix="" \ CPPFLAGS="$CPPFLAGS" \ || exit 1 -fi - -SUBVERSION=`echo $RC_ProjectSourceVersion | sed -e 's/.*\.\([0-9]*\).*/\1/'` - -if [ "x$SUBVERSION" != "x$RC_ProjectSourceVersion" ]; then - LLVM_SUBMIT_SUBVERSION=`printf "%02d" $SUBVERSION` - RC_ProjectSourceVersion=`echo $RC_ProjectSourceVersion | sed -e 's/\..*//'` - LLVM_SUBMIT_VERSION=$RC_ProjectSourceVersion -fi -if [ "x$LLVM_SUBMIT_SUBVERSION" = "x00" -o "x$LLVM_SUBMIT_SUBVERSION" = "x0" ]; then - LLVM_VERSION="$LLVM_SUBMIT_VERSION" -else - LLVM_VERSION="$LLVM_SUBMIT_VERSION-$LLVM_SUBMIT_SUBVERSION" -fi - -# Figure out how many make processes to run. -SYSCTL=`sysctl -n hw.activecpu` -# sysctl -n hw.* does not work when invoked via B&I chroot /BuildRoot. -# Builders can default to 2, since even if they are single processor, -# nothing else is running on the machine. -if [ -z "$SYSCTL" ]; then - SYSCTL=2 -fi -JOBS_FLAG="-j $SYSCTL" - -make $JOBS_FLAG $OPTIMIZE_OPTS UNIVERSAL=1 UNIVERSAL_ARCH="$HOSTS" \ - UNIVERSAL_SDK_PATH=$SDKROOT \ - NO_RUNTIME_LIBS=1 \ - DISABLE_EDIS=1 \ - REQUIRES_RTTI=1 \ - DEBUG_SYMBOLS=1 \ - LLVM_SUBMIT_VERSION=$LLVM_SUBMIT_VERSION \ - LLVM_SUBMIT_SUBVERSION=$LLVM_SUBMIT_SUBVERSION \ - CXXFLAGS="-DLLVM_VERSION_INFO='\" Apple Build #$LLVM_VERSION\"'" \ - VERBOSE=1 - -if [ $? != 0 ] ; then + make $JOBS_FLAG $COMMON_MAKEFLAGS UNIVERSAL_ARCH="$HOSTS" \ + CXXFLAGS="-DLLVM_VERSION_INFO='\" Apple Build #$LLVM_VERSION\"'" + if [ $? != 0 ] ; then echo "error: LLVM 'make' failed!" exit 1 + fi fi ################################################################################ @@ -185,14 +219,7 @@ rm -rf * || exit 1 cd $DIR/obj-llvm || exit 1 # Install the tree into the destination directory. -make $LOCAL_MAKEFLAGS $OPTIMIZE_OPTS UNIVERSAL=1 UNIVERSAL_ARCH="$HOSTS" \ - NO_RUNTIME_LIBS=1 \ - DISABLE_EDIS=1 \ - DEBUG_SYMBOLS=1 \ - LLVM_SUBMIT_VERSION=$LLVM_SUBMIT_VERSION \ - LLVM_SUBMIT_SUBVERSION=$LLVM_SUBMIT_SUBVERSION \ - OPTIMIZE_OPTION='-O3' VERBOSE=1 install - +make $JOBS_FLAG $COMMON_MAKEFLAGS UNIVERSAL_ARCH="$HOSTS" install if ! test $? == 0 ; then echo "error: LLVM 'make install' failed!" exit 1 @@ -207,6 +234,16 @@ RC_ProjectSourceSubversion=`printf "%d" $LLVM_MINOR_VERSION` echo "#define LLVM_VERSION ${RC_ProjectSourceVersion}" > $DEST_DIR$DEST_ROOT/include/llvm/Version.h echo "#define LLVM_MINOR_VERSION ${RC_ProjectSourceSubversion}" >> $DEST_DIR$DEST_ROOT/include/llvm/Version.h +# Run unifdef to preprocess the installed headers to reflect whether this +# was a debug or release build. 
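The unifdef pass described above rewrites each installed header in place so that debug-only declarations agree with the flavor of the libraries that were actually built; the loop that follows applies it file by file. As a concrete illustration on a hypothetical header (not a file from this tree):

// Hypothetical installed header, before unifdef runs:
struct Example {
#ifndef NDEBUG
  void dump() const;        // assertions build (-UNDEBUG -D_DEBUG): kept;
#endif                      // release build (-DNDEBUG ...): removed, #if too
#ifdef _DEBUG
  int DebugCounter;         // kept only when _DEBUG is forced on
#endif
#ifdef LLVM_ENABLE_DUMP
  void dumpVerbose() const; // -ULLVM_ENABLE_DUMP strips this from release
#endif
};

Because unifdef writes back over the same path (-o $file $file), the shipped headers cannot declare members or methods that the compiled binaries lack.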
+for file in `find $DEST_DIR$DEST_ROOT/include -type f -print`; do + if [ "$LLVM_ASSERTIONS" = yes ]; then + unifdef -UNDEBUG -D_DEBUG -o $file $file + else + unifdef -DNDEBUG -U_DEBUG -ULLVM_ENABLE_DUMP -o $file $file + fi +done + # Find the right version of strip to use. STRIP=strip if [ -n "$SDKROOT" ]; then @@ -263,9 +300,10 @@ cd $SYM_DIR || exit 1 rm -rf * || exit 1 # Generate .dSYM files +DSYMUTIL=`xcrun -find dsymutil` find $DEST_DIR -perm -0111 -type f \ ! \( -name '*.la' -o -name gccas -o -name gccld -o -name llvm-config -o -name '*.a' \) \ - -print | xargs -n 1 -P ${SYSCTL} dsymutil + -print | xargs -n 1 -P ${SYSCTL} ${DSYMUTIL} # Save .dSYM files and .a archives cd $DEST_DIR || exit 1 diff --git a/utils/git/find-rev b/utils/git/find-rev index a6161db..059ca0b 100755 --- a/utils/git/find-rev +++ b/utils/git/find-rev @@ -5,9 +5,9 @@ import os, sys, subprocess def main(): from optparse import OptionParser, OptionGroup parser = OptionParser("usage: %prog [options] <repo> <revision>") - parser.add_option("", "--dump-section-data", dest="dumpSectionData", - help="Dump the contents of sections", - action="store_true", default=False) + parser.add_option("", "--branch", dest="branch", + help="Ref for the branch to search [%default]", + action="store", default="git-svn") (opts, args) = parser.parse_args() if len(args) != 2: @@ -21,7 +21,7 @@ def main(): parser.error("invalid revision argument (not an integer)") os.chdir(repo) - p = subprocess.Popen(['git', 'rev-list', 'git-svn', '--pretty'], + p = subprocess.Popen(['git', 'rev-list', opts.branch, '--pretty'], stdout=subprocess.PIPE) bestRev = bestCommit = None diff --git a/utils/kate/llvm.xml b/utils/kate/llvm.xml index 074fa16..1778cfc 100644 --- a/utils/kate/llvm.xml +++ b/utils/kate/llvm.xml @@ -90,6 +90,7 @@ <item> readonly </item> <item> ssp </item> <item> sspreq </item> + <item> sspstrong </item> </list> <list name="types"> <item> float </item> diff --git a/utils/lit/MANIFEST.in b/utils/lit/MANIFEST.in new file mode 100644 index 0000000..6491a02 --- /dev/null +++ b/utils/lit/MANIFEST.in @@ -0,0 +1,7 @@ +include TODO lit.py +recursive-include tests * +global-exclude *pyc +global-exclude *~ +prune tests/Output +prune tests/*/Output +prune tests/*/*/Output diff --git a/utils/lit/TODO b/utils/lit/TODO index 6d7f7ea..d2ff842 100644 --- a/utils/lit/TODO +++ b/utils/lit/TODO @@ -7,3 +7,20 @@ - Support valgrind in all configs, and LLVM style valgrind. - Support a timeout / ulimit. + + - Rename 'lit' injected variable for config to be lit_config. + + - Allow import of 'lit' in test suite definitions. + + - Create an explicit test suite object (instead of using the top-level + TestingConfig object). + + - Allow 'lit' driver to cooperate with test suites to add options (or at least + sanitize accepted params). + + - Consider move to identifying all tests by path-to-test-suite and then path to + subtest, and don't use test suite names. + + - Consider move to change workflow to always load suites, then resolve command + line arguments. + diff --git a/utils/lit/lit/ExampleTests/Clang/lit.cfg b/utils/lit/lit/ExampleTests/Clang/lit.cfg index 1e1e807..9295bd9 100644 --- a/utils/lit/lit/ExampleTests/Clang/lit.cfg +++ b/utils/lit/lit/ExampleTests/Clang/lit.cfg @@ -14,7 +14,7 @@ config.test_format = lit.formats.ShTest(execute_external = True) # suffixes: A list of file extensions to treat as test files. config.suffixes = ['.c', '.cpp', '.m', '.mm'] -# target_triple: Used by ShTest and TclTest formats for XFAIL checks. 
+# target_triple: Used by ShTest format for XFAIL checks. config.target_triple = 'foo' ### diff --git a/utils/lit/lit/ExampleTests/LLVM.InTree/test/Bar/data.txt b/utils/lit/lit/ExampleTests/LLVM.InTree/test/Bar/data.txt new file mode 100644 index 0000000..45b983b --- /dev/null +++ b/utils/lit/lit/ExampleTests/LLVM.InTree/test/Bar/data.txt @@ -0,0 +1 @@ +hi diff --git a/utils/lit/lit/ExampleTests/LLVM.InTree/test/Bar/dg.exp b/utils/lit/lit/ExampleTests/LLVM.InTree/test/Bar/dg.exp deleted file mode 100644 index 2bda07a..0000000 --- a/utils/lit/lit/ExampleTests/LLVM.InTree/test/Bar/dg.exp +++ /dev/null @@ -1,6 +0,0 @@ -load_lib llvm.exp - -if { [llvm_supports_target X86] } { - RunLLVMTests [lsort [glob -nocomplain $srcdir/$subdir/*.{ll}]] -} - diff --git a/utils/lit/lit/ExampleTests/LLVM.InTree/test/Bar/pct-S.ll b/utils/lit/lit/ExampleTests/LLVM.InTree/test/Bar/pct-S.ll new file mode 100644 index 0000000..3ff3633 --- /dev/null +++ b/utils/lit/lit/ExampleTests/LLVM.InTree/test/Bar/pct-S.ll @@ -0,0 +1 @@ +; RUN: grep "hi" %S/data.txt diff --git a/utils/lit/lit/ExampleTests/LLVM.InTree/test/lit.cfg b/utils/lit/lit/ExampleTests/LLVM.InTree/test/lit.cfg index 3fdd63c..533c445 100644 --- a/utils/lit/lit/ExampleTests/LLVM.InTree/test/lit.cfg +++ b/utils/lit/lit/ExampleTests/LLVM.InTree/test/lit.cfg @@ -8,11 +8,11 @@ import os config.name = 'LLVM' # testFormat: The test format to use to interpret tests. -config.test_format = lit.formats.TclTest() +config.test_format = lit.formats.ShTest() # suffixes: A list of file extensions to treat as test files, this is actually # set by on_clone(). -config.suffixes = [] +config.suffixes = [ '.ll' ] # test_source_root: The root path where tests are located. config.test_source_root = os.path.dirname(__file__) @@ -64,74 +64,3 @@ if config.test_exec_root is None: lit.load_config(config, site_cfg) raise SystemExit -### - -# Load site data from DejaGNU's site.exp. -import re -site_exp = {} -# FIXME: Implement lit.site.cfg. -for line in open(os.path.join(config.llvm_obj_root, 'test', 'site.exp')): - m = re.match('set ([^ ]+) "([^"]*)"', line) - if m: - site_exp[m.group(1)] = m.group(2) - -excludes = [] - -# Provide target_triple for use in XFAIL. -config.target_triple = site_exp['target_triplet'] - -# Provide llvm_supports_target for use in local configs. -targets = set(site_exp["TARGETS_TO_BUILD"].split()) -def llvm_supports_target(name): - return name in targets - -# Provide on_clone hook for reading 'dg.exp'. -import os -simpleLibData = re.compile(r"""load_lib llvm.exp - -RunLLVMTests \[lsort \[glob -nocomplain \$srcdir/\$subdir/\*\.(.*)\]\]""", - re.MULTILINE) -conditionalLibData = re.compile(r"""load_lib llvm.exp - -if.*\[ ?(llvm[^ ]*) ([^ ]*) ?\].*{ - *RunLLVMTests \[lsort \[glob -nocomplain \$srcdir/\$subdir/\*\.(.*)\]\] -\}""", re.MULTILINE) -def on_clone(parent, cfg, for_path): - def addSuffixes(match): - if match[0] == '{' and match[-1] == '}': - cfg.suffixes = ['.' + s for s in match[1:-1].split(',')] - else: - cfg.suffixes = ['.' + match] - - libPath = os.path.join(os.path.dirname(for_path), - 'dg.exp') - if not os.path.exists(libPath): - cfg.unsupported = True - return - - # Reset unsupported, in case we inherited it. - cfg.unsupported = False - lib = open(libPath).read().strip() - - # Check for a simple library. - m = simpleLibData.match(lib) - if m: - addSuffixes(m.group(1)) - return - - # Check for a conditional test set. 
- m = conditionalLibData.match(lib) - if m: - funcname,arg,match = m.groups() - addSuffixes(match) - - func = globals().get(funcname) - if not func: - lit.error('unsupported predicate %r' % funcname) - elif not func(arg): - cfg.unsupported = True - return - # Otherwise, give up. - lit.error('unable to understand %r:\n%s' % (libPath, lib)) - -config.on_clone = on_clone diff --git a/utils/lit/lit/ExampleTests/LLVM.InTree/test/lit.site.cfg b/utils/lit/lit/ExampleTests/LLVM.InTree/test/lit.site.cfg index 3bfee54..d45f3ac 100644 --- a/utils/lit/lit/ExampleTests/LLVM.InTree/test/lit.site.cfg +++ b/utils/lit/lit/ExampleTests/LLVM.InTree/test/lit.site.cfg @@ -1,8 +1,5 @@ # -*- Python -*- -## Autogenerated by Makefile ## -# Do not edit! - # Preserve some key paths for use by main LLVM test suite config. config.llvm_obj_root = os.path.dirname(os.path.dirname(__file__)) diff --git a/utils/lit/lit/ExampleTests/LLVM.InTree/test/site.exp b/utils/lit/lit/ExampleTests/LLVM.InTree/test/site.exp deleted file mode 100644 index 2b60cb9..0000000 --- a/utils/lit/lit/ExampleTests/LLVM.InTree/test/site.exp +++ /dev/null @@ -1,10 +0,0 @@ -## these variables are automatically generated by make ## -# Do not edit here. If you wish to override these values -# edit the last section -set target_triplet "x86_64-apple-darwin10" -set TARGETS_TO_BUILD "X86 Sparc PowerPC ARM Mips PIC16 XCore MSP430 Blackfin MSIL CppBackend" -set srcroot "/Volumes/Data/ddunbar/llvm" -set objroot "/Volumes/Data/ddunbar/llvm.obj.64" -set srcdir "/Volumes/Data/ddunbar/llvm/test" -set objdir "/Volumes/Data/ddunbar/llvm.obj.64/test" -## All variables above are generated by configure. Do Not Edit ## diff --git a/utils/lit/lit/ExampleTests/LLVM.OutOfTree/obj/test/lit.site.cfg b/utils/lit/lit/ExampleTests/LLVM.OutOfTree/obj/test/lit.site.cfg index bdcc35e..94a02d8 100644 --- a/utils/lit/lit/ExampleTests/LLVM.OutOfTree/obj/test/lit.site.cfg +++ b/utils/lit/lit/ExampleTests/LLVM.OutOfTree/obj/test/lit.site.cfg @@ -1,8 +1,5 @@ # -*- Python -*- -## Autogenerated by Makefile ## -# Do not edit! - # Preserve some key paths for use by main LLVM test suite config. config.llvm_obj_root = os.path.dirname(os.path.dirname(__file__)) diff --git a/utils/lit/lit/ExampleTests/LLVM.OutOfTree/obj/test/site.exp b/utils/lit/lit/ExampleTests/LLVM.OutOfTree/obj/test/site.exp deleted file mode 100644 index 2b60cb9..0000000 --- a/utils/lit/lit/ExampleTests/LLVM.OutOfTree/obj/test/site.exp +++ /dev/null @@ -1,10 +0,0 @@ -## these variables are automatically generated by make ## -# Do not edit here. If you wish to override these values -# edit the last section -set target_triplet "x86_64-apple-darwin10" -set TARGETS_TO_BUILD "X86 Sparc PowerPC ARM Mips PIC16 XCore MSP430 Blackfin MSIL CppBackend" -set srcroot "/Volumes/Data/ddunbar/llvm" -set objroot "/Volumes/Data/ddunbar/llvm.obj.64" -set srcdir "/Volumes/Data/ddunbar/llvm/test" -set objdir "/Volumes/Data/ddunbar/llvm.obj.64/test" -## All variables above are generated by configure. 
Do Not Edit ## diff --git a/utils/lit/lit/ExampleTests/LLVM.OutOfTree/src/test/Foo/dg.exp b/utils/lit/lit/ExampleTests/LLVM.OutOfTree/src/test/Foo/dg.exp deleted file mode 100644 index 2bda07a..0000000 --- a/utils/lit/lit/ExampleTests/LLVM.OutOfTree/src/test/Foo/dg.exp +++ /dev/null @@ -1,6 +0,0 @@ -load_lib llvm.exp - -if { [llvm_supports_target X86] } { - RunLLVMTests [lsort [glob -nocomplain $srcdir/$subdir/*.{ll}]] -} - diff --git a/utils/lit/lit/ExampleTests/LLVM.OutOfTree/src/test/lit.cfg b/utils/lit/lit/ExampleTests/LLVM.OutOfTree/src/test/lit.cfg index 3fdd63c..533c445 100644 --- a/utils/lit/lit/ExampleTests/LLVM.OutOfTree/src/test/lit.cfg +++ b/utils/lit/lit/ExampleTests/LLVM.OutOfTree/src/test/lit.cfg @@ -8,11 +8,11 @@ import os config.name = 'LLVM' # testFormat: The test format to use to interpret tests. -config.test_format = lit.formats.TclTest() +config.test_format = lit.formats.ShTest() # suffixes: A list of file extensions to treat as test files, this is actually # set by on_clone(). -config.suffixes = [] +config.suffixes = [ '.ll' ] # test_source_root: The root path where tests are located. config.test_source_root = os.path.dirname(__file__) @@ -64,74 +64,3 @@ if config.test_exec_root is None: lit.load_config(config, site_cfg) raise SystemExit -### - -# Load site data from DejaGNU's site.exp. -import re -site_exp = {} -# FIXME: Implement lit.site.cfg. -for line in open(os.path.join(config.llvm_obj_root, 'test', 'site.exp')): - m = re.match('set ([^ ]+) "([^"]*)"', line) - if m: - site_exp[m.group(1)] = m.group(2) - -excludes = [] - -# Provide target_triple for use in XFAIL. -config.target_triple = site_exp['target_triplet'] - -# Provide llvm_supports_target for use in local configs. -targets = set(site_exp["TARGETS_TO_BUILD"].split()) -def llvm_supports_target(name): - return name in targets - -# Provide on_clone hook for reading 'dg.exp'. -import os -simpleLibData = re.compile(r"""load_lib llvm.exp - -RunLLVMTests \[lsort \[glob -nocomplain \$srcdir/\$subdir/\*\.(.*)\]\]""", - re.MULTILINE) -conditionalLibData = re.compile(r"""load_lib llvm.exp - -if.*\[ ?(llvm[^ ]*) ([^ ]*) ?\].*{ - *RunLLVMTests \[lsort \[glob -nocomplain \$srcdir/\$subdir/\*\.(.*)\]\] -\}""", re.MULTILINE) -def on_clone(parent, cfg, for_path): - def addSuffixes(match): - if match[0] == '{' and match[-1] == '}': - cfg.suffixes = ['.' + s for s in match[1:-1].split(',')] - else: - cfg.suffixes = ['.' + match] - - libPath = os.path.join(os.path.dirname(for_path), - 'dg.exp') - if not os.path.exists(libPath): - cfg.unsupported = True - return - - # Reset unsupported, in case we inherited it. - cfg.unsupported = False - lib = open(libPath).read().strip() - - # Check for a simple library. - m = simpleLibData.match(lib) - if m: - addSuffixes(m.group(1)) - return - - # Check for a conditional test set. - m = conditionalLibData.match(lib) - if m: - funcname,arg,match = m.groups() - addSuffixes(match) - - func = globals().get(funcname) - if not func: - lit.error('unsupported predicate %r' % funcname) - elif not func(arg): - cfg.unsupported = True - return - # Otherwise, give up. 
- lit.error('unable to understand %r:\n%s' % (libPath, lib)) - -config.on_clone = on_clone diff --git a/utils/lit/lit/ExampleTests/TclTest/lit.local.cfg b/utils/lit/lit/ExampleTests/TclTest/lit.local.cfg deleted file mode 100644 index 6a37129..0000000 --- a/utils/lit/lit/ExampleTests/TclTest/lit.local.cfg +++ /dev/null @@ -1,5 +0,0 @@ -# -*- Python -*- - -config.test_format = lit.formats.TclTest() - -config.suffixes = ['.ll'] diff --git a/utils/lit/lit/ExampleTests/TclTest/stderr-pipe.ll b/utils/lit/lit/ExampleTests/TclTest/stderr-pipe.ll deleted file mode 100644 index 6c55fe8..0000000 --- a/utils/lit/lit/ExampleTests/TclTest/stderr-pipe.ll +++ /dev/null @@ -1 +0,0 @@ -; RUN: gcc -### > /dev/null |& grep {gcc version} diff --git a/utils/lit/lit/ExampleTests/TclTest/tcl-redir-1.ll b/utils/lit/lit/ExampleTests/TclTest/tcl-redir-1.ll deleted file mode 100644 index 61240ba..0000000 --- a/utils/lit/lit/ExampleTests/TclTest/tcl-redir-1.ll +++ /dev/null @@ -1,7 +0,0 @@ -; RUN: echo 'hi' > %t.1 | echo 'hello' > %t.2 -; RUN: not grep 'hi' %t.1 -; RUN: grep 'hello' %t.2 - - - - diff --git a/utils/lit/lit/ExampleTests/lit.cfg b/utils/lit/lit/ExampleTests/lit.cfg index 2629918..164daba 100644 --- a/utils/lit/lit/ExampleTests/lit.cfg +++ b/utils/lit/lit/ExampleTests/lit.cfg @@ -19,8 +19,8 @@ config.test_source_root = None # root). config.test_exec_root = None -# target_triple: Used by ShTest and TclTest formats for XFAIL checks. +# target_triple: Used by ShTest format for XFAIL checks. config.target_triple = 'foo' -# available_features: Used by ShTest and TclTest formats for REQUIRES checks. +# available_features: Used by ShTest format for REQUIRES checks. config.available_features.add('some-feature-name') diff --git a/utils/lit/lit/LitConfig.py b/utils/lit/lit/LitConfig.py index 0a359a3..9bcf20b 100644 --- a/utils/lit/lit/LitConfig.py +++ b/utils/lit/lit/LitConfig.py @@ -12,16 +12,15 @@ class LitConfig: import Test # Provide access to built-in formats. - import LitFormats as formats + import TestFormats as formats # Provide access to built-in utility functions. import Util as util def __init__(self, progname, path, quiet, useValgrind, valgrindLeakCheck, valgrindArgs, - useTclAsSh, noExecute, ignoreStdErr, debug, isWindows, - params): + params, config_prefix = None): # The name of the test runner. self.progname = progname # The items to add to the PATH environment variable. @@ -30,7 +29,6 @@ class LitConfig: self.useValgrind = bool(useValgrind) self.valgrindLeakCheck = bool(valgrindLeakCheck) self.valgrindUserArgs = list(valgrindArgs) - self.useTclAsSh = bool(useTclAsSh) self.noExecute = noExecute self.ignoreStdErr = ignoreStdErr self.debug = debug @@ -38,6 +36,12 @@ class LitConfig: self.params = dict(params) self.bashPath = None + # Configuration files to look for when discovering test suites. 
+ self.config_prefix = config_prefix or 'lit' + self.config_name = '%s.cfg' % (self.config_prefix,) + self.site_config_name = '%s.site.cfg' % (self.config_prefix,) + self.local_config_name = '%s.local.cfg' % (self.config_prefix,) + self.numErrors = 0 self.numWarnings = 0 @@ -80,7 +84,7 @@ class LitConfig: break if self.bashPath is None: - self.warning("Unable to find 'bash', running Tcl tests internally.") + self.warning("Unable to find 'bash'.") self.bashPath = '' return self.bashPath diff --git a/utils/lit/lit/LitFormats.py b/utils/lit/lit/LitFormats.py deleted file mode 100644 index 931d107..0000000 --- a/utils/lit/lit/LitFormats.py +++ /dev/null @@ -1,3 +0,0 @@ -from TestFormats import FileBasedTest -from TestFormats import GoogleTest, ShTest, TclTest -from TestFormats import SyntaxCheckTest, OneCommandPerFileTest diff --git a/utils/lit/lit/ShUtil.py b/utils/lit/lit/ShUtil.py index dda622a..50f7910 100644 --- a/utils/lit/lit/ShUtil.py +++ b/utils/lit/lit/ShUtil.py @@ -35,7 +35,7 @@ class ShLexer: if ('|' in chunk or '&' in chunk or '<' in chunk or '>' in chunk or "'" in chunk or '"' in chunk or - '\\' in chunk): + ';' in chunk or '\\' in chunk): return None self.pos = self.pos - 1 + len(chunk) @@ -48,7 +48,7 @@ class ShLexer: str = c while self.pos != self.end: c = self.look() - if c.isspace() or c in "|&": + if c.isspace() or c in "|&;": break elif c in '><': # This is an annoying case; we treat '2>' as a single token so @@ -129,7 +129,7 @@ class ShLexer: lex_one_token - Lex a single 'sh' token. """ c = self.eat() - if c in ';!': + if c == ';': return (c,) if c == '|': if self.maybe_eat('|'): @@ -219,9 +219,6 @@ class ShParser: def parse_pipeline(self): negate = False - if self.look() == ('!',): - self.lex() - negate = True commands = [self.parse_command()] while self.look() == ('|',): @@ -253,9 +250,9 @@ class TestShLexer(unittest.TestCase): return list(ShLexer(str, *args, **kwargs).lex()) def test_basic(self): - self.assertEqual(self.lex('a|b>c&d<e'), + self.assertEqual(self.lex('a|b>c&d<e;f'), ['a', ('|',), 'b', ('>',), 'c', ('&',), 'd', - ('<',), 'e']) + ('<',), 'e', (';',), 'f']) def test_redirection_tokens(self): self.assertEqual(self.lex('a2>c'), @@ -317,10 +314,6 @@ class TestShParse(unittest.TestCase): Command(['c'], [])], False)) - self.assertEqual(self.parse('! a'), - Pipeline([Command(['a'], [])], - True)) - def test_list(self): self.assertEqual(self.parse('a ; b'), Seq(Pipeline([Command(['a'], [])], False), @@ -349,5 +342,10 @@ class TestShParse(unittest.TestCase): '||', Pipeline([Command(['c'], [])], False))) + self.assertEqual(self.parse('a; b'), + Seq(Pipeline([Command(['a'], [])], False), + ';', + Pipeline([Command(['b'], [])], False))) + if __name__ == '__main__': unittest.main() diff --git a/utils/lit/lit/TclUtil.py b/utils/lit/lit/TclUtil.py deleted file mode 100644 index 4a3f345..0000000 --- a/utils/lit/lit/TclUtil.py +++ /dev/null @@ -1,322 +0,0 @@ -import itertools - -from ShCommands import Command, Pipeline - -def tcl_preprocess(data): - # Tcl has a preprocessing step to replace escaped newlines. - i = data.find('\\\n') - if i == -1: - return data - - # Replace '\\\n' and subsequent whitespace by a single space. 
- n = len(data) - str = data[:i] - i += 2 - while i < n and data[i] in ' \t': - i += 1 - return str + ' ' + data[i:] - -class TclLexer: - """TclLexer - Lex a string into "words", following the Tcl syntax.""" - - def __init__(self, data): - self.data = tcl_preprocess(data) - self.pos = 0 - self.end = len(self.data) - - def at_end(self): - return self.pos == self.end - - def eat(self): - c = self.data[self.pos] - self.pos += 1 - return c - - def look(self): - return self.data[self.pos] - - def maybe_eat(self, c): - """ - maybe_eat(c) - Consume the character c if it is the next character, - returning True if a character was consumed. """ - if self.data[self.pos] == c: - self.pos += 1 - return True - return False - - def escape(self, c): - if c == 'a': - return '\x07' - elif c == 'b': - return '\x08' - elif c == 'f': - return '\x0c' - elif c == 'n': - return '\n' - elif c == 'r': - return '\r' - elif c == 't': - return '\t' - elif c == 'v': - return '\x0b' - elif c in 'uxo': - raise ValueError,'Invalid quoted character %r' % c - else: - return c - - def lex_braced(self): - # Lex until whitespace or end of string, the opening brace has already - # been consumed. - - str = '' - while 1: - if self.at_end(): - raise ValueError,"Unterminated '{' quoted word" - - c = self.eat() - if c == '}': - break - elif c == '{': - str += '{' + self.lex_braced() + '}' - elif c == '\\' and self.look() in '{}': - str += self.eat() - else: - str += c - - return str - - def lex_quoted(self): - str = '' - - while 1: - if self.at_end(): - raise ValueError,"Unterminated '\"' quoted word" - - c = self.eat() - if c == '"': - break - elif c == '\\': - if self.at_end(): - raise ValueError,'Missing quoted character' - - str += self.escape(self.eat()) - else: - str += c - - return str - - def lex_unquoted(self, process_all=False): - # Lex until whitespace or end of string. - str = '' - while not self.at_end(): - if not process_all: - if self.look().isspace() or self.look() == ';': - break - - c = self.eat() - if c == '\\': - if self.at_end(): - raise ValueError,'Missing quoted character' - - str += self.escape(self.eat()) - elif c == '[': - raise NotImplementedError, ('Command substitution is ' - 'not supported') - elif c == '$' and not self.at_end() and (self.look().isalpha() or - self.look() == '{'): - raise NotImplementedError, ('Variable substitution is ' - 'not supported') - else: - str += c - - return str - - def lex_one_token(self): - if self.maybe_eat('"'): - return self.lex_quoted() - elif self.maybe_eat('{'): - # Check for argument substitution. 
- if not self.maybe_eat('*'): - return self.lex_braced() - - if not self.maybe_eat('}'): - return '*' + self.lex_braced() - - if self.at_end() or self.look().isspace(): - return '*' - - raise NotImplementedError, "Argument substitution is unsupported" - else: - return self.lex_unquoted() - - def lex(self): - while not self.at_end(): - c = self.look() - if c in ' \t': - self.eat() - elif c in ';\n': - self.eat() - yield (';',) - else: - yield self.lex_one_token() - -class TclExecCommand: - kRedirectPrefixes1 = ('<', '>') - kRedirectPrefixes2 = ('<@', '<<', '2>', '>&', '>>', '>@') - kRedirectPrefixes3 = ('2>@', '2>>', '>>&', '>&@') - kRedirectPrefixes4 = ('2>@1',) - - def __init__(self, args): - self.args = iter(args) - - def lex(self): - try: - return self.args.next() - except StopIteration: - return None - - def look(self): - next = self.lex() - if next is not None: - self.args = itertools.chain([next], self.args) - return next - - def parse_redirect(self, tok, length): - if len(tok) == length: - arg = self.lex() - if arg is None: - raise ValueError,'Missing argument to %r redirection' % tok - else: - tok,arg = tok[:length],tok[length:] - - if tok[0] == '2': - op = (tok[1:],2) - else: - op = (tok,) - return (op, arg) - - def parse_pipeline(self): - if self.look() is None: - raise ValueError,"Expected at least one argument to exec" - - commands = [Command([],[])] - while 1: - arg = self.lex() - if arg is None: - break - elif arg == '|': - commands.append(Command([],[])) - elif arg == '|&': - # Write this as a redirect of stderr; it must come first because - # stdout may have already been redirected. - commands[-1].redirects.insert(0, (('>&',2),'1')) - commands.append(Command([],[])) - elif arg[:4] in TclExecCommand.kRedirectPrefixes4: - commands[-1].redirects.append(self.parse_redirect(arg, 4)) - elif arg[:3] in TclExecCommand.kRedirectPrefixes3: - commands[-1].redirects.append(self.parse_redirect(arg, 3)) - elif arg[:2] in TclExecCommand.kRedirectPrefixes2: - commands[-1].redirects.append(self.parse_redirect(arg, 2)) - elif arg[:1] in TclExecCommand.kRedirectPrefixes1: - commands[-1].redirects.append(self.parse_redirect(arg, 1)) - else: - commands[-1].args.append(arg) - - return Pipeline(commands, False, pipe_err=True) - - def parse(self): - ignoreStderr = False - keepNewline = False - - # Parse arguments. 
- while 1: - next = self.look() - if not isinstance(next, str) or next[0] != '-': - break - - if next == '--': - self.lex() - break - elif next == '-ignorestderr': - ignoreStderr = True - elif next == '-keepnewline': - keepNewline = True - else: - raise ValueError,"Invalid exec argument %r" % next - - return (ignoreStderr, keepNewline, self.parse_pipeline()) - -### - -import unittest - -class TestTclLexer(unittest.TestCase): - def lex(self, str, *args, **kwargs): - return list(TclLexer(str, *args, **kwargs).lex()) - - def test_preprocess(self): - self.assertEqual(tcl_preprocess('a b'), 'a b') - self.assertEqual(tcl_preprocess('a\\\nb c'), 'a b c') - - def test_unquoted(self): - self.assertEqual(self.lex('a b c'), - ['a', 'b', 'c']) - self.assertEqual(self.lex(r'a\nb\tc\ '), - ['a\nb\tc ']) - self.assertEqual(self.lex(r'a \\\$b c $\\'), - ['a', r'\$b', 'c', '$\\']) - - def test_braced(self): - self.assertEqual(self.lex('a {b c} {}'), - ['a', 'b c', '']) - self.assertEqual(self.lex(r'a {b {c\n}}'), - ['a', 'b {c\\n}']) - self.assertEqual(self.lex(r'a {b\{}'), - ['a', 'b{']) - self.assertEqual(self.lex(r'{*}'), ['*']) - self.assertEqual(self.lex(r'{*} a'), ['*', 'a']) - self.assertEqual(self.lex(r'{*} a'), ['*', 'a']) - self.assertEqual(self.lex('{a\\\n b}'), - ['a b']) - - def test_quoted(self): - self.assertEqual(self.lex('a "b c"'), - ['a', 'b c']) - - def test_terminators(self): - self.assertEqual(self.lex('a\nb'), - ['a', (';',), 'b']) - self.assertEqual(self.lex('a;b'), - ['a', (';',), 'b']) - self.assertEqual(self.lex('a ; b'), - ['a', (';',), 'b']) - -class TestTclExecCommand(unittest.TestCase): - def parse(self, str): - return TclExecCommand(list(TclLexer(str).lex())).parse() - - def test_basic(self): - self.assertEqual(self.parse('echo hello'), - (False, False, - Pipeline([Command(['echo', 'hello'], [])], - False, True))) - self.assertEqual(self.parse('echo hello | grep hello'), - (False, False, - Pipeline([Command(['echo', 'hello'], []), - Command(['grep', 'hello'], [])], - False, True))) - - def test_redirect(self): - self.assertEqual(self.parse('echo hello > a >b >>c 2> d |& e'), - (False, False, - Pipeline([Command(['echo', 'hello'], - [(('>&',2),'1'), - (('>',),'a'), - (('>',),'b'), - (('>>',),'c'), - (('>',2),'d')]), - Command(['e'], [])], - False, True))) - -if __name__ == '__main__': - unittest.main() diff --git a/utils/lit/lit/Test.py b/utils/lit/lit/Test.py index db2e032..9471e3a 100644 --- a/utils/lit/lit/Test.py +++ b/utils/lit/lit/Test.py @@ -7,6 +7,10 @@ class TestResult: self.name = name self.isFailure = isFailure + def __repr__(self): + return '%s%r' % (self.__class__.__name__, + (self.name, self.isFailure)) + PASS = TestResult('PASS', False) XFAIL = TestResult('XFAIL', False) FAIL = TestResult('FAIL', True) diff --git a/utils/lit/lit/TestFormats.py b/utils/lit/lit/TestFormats.py index d1c0558..a272976 100644 --- a/utils/lit/lit/TestFormats.py +++ b/utils/lit/lit/TestFormats.py @@ -54,28 +54,36 @@ class GoogleTest(object): else: yield ''.join(nested_tests) + ln + def getTestsInExecutable(self, testSuite, path_in_suite, execpath, + litConfig, localConfig): + if not execpath.endswith(self.test_suffix): + return + (dirname, basename) = os.path.split(execpath) + # Discover the tests in this executable. 
+ for testname in self.getGTestTests(execpath, litConfig, localConfig): + testPath = path_in_suite + (dirname, basename, testname) + yield Test.Test(testSuite, testPath, localConfig) + def getTestsInDirectory(self, testSuite, path_in_suite, litConfig, localConfig): source_path = testSuite.getSourcePath(path_in_suite) for filename in os.listdir(source_path): - # Check for the one subdirectory (build directory) tests will be in. - if not '.' in self.test_sub_dir: + filepath = os.path.join(source_path, filename) + if os.path.isdir(filepath): + # Iterate over executables in a directory. if not os.path.normcase(filename) in self.test_sub_dir: continue - - filepath = os.path.join(source_path, filename) - if not os.path.isdir(filepath): - continue - - for subfilename in os.listdir(filepath): - if subfilename.endswith(self.test_suffix): + for subfilename in os.listdir(filepath): execpath = os.path.join(filepath, subfilename) - - # Discover the tests in this executable. - for name in self.getGTestTests(execpath, litConfig, - localConfig): - testPath = path_in_suite + (filename, subfilename, name) - yield Test.Test(testSuite, testPath, localConfig) + for test in self.getTestsInExecutable( + testSuite, path_in_suite, execpath, + litConfig, localConfig): + yield test + elif ('.' in self.test_sub_dir): + for test in self.getTestsInExecutable( + testSuite, path_in_suite, filepath, + litConfig, localConfig): + yield test def execute(self, test, litConfig): testPath,testName = os.path.split(test.getSourcePath()) @@ -124,14 +132,6 @@ class ShTest(FileBasedTest): return TestRunner.executeShTest(test, litConfig, self.execute_external) -class TclTest(FileBasedTest): - def __init__(self, ignoreStdErr=False): - self.ignoreStdErr = ignoreStdErr - - def execute(self, test, litConfig): - litConfig.ignoreStdErr = self.ignoreStdErr - return TestRunner.executeTclTest(test, litConfig) - ### import re @@ -221,12 +221,3 @@ class OneCommandPerFileTest: report += """Output:\n--\n%s--""" % diags return Test.FAIL, report - -class SyntaxCheckTest(OneCommandPerFileTest): - def __init__(self, compiler, dir, extra_cxx_args=[], *args, **kwargs): - cmd = [compiler, '-x', 'c++', '-fsyntax-only'] + extra_cxx_args - OneCommandPerFileTest.__init__(self, cmd, dir, - useTempInput=1, *args, **kwargs) - - def createTempInput(self, tmp, test): - print >>tmp, '#include "%s"' % test.source_path diff --git a/utils/lit/lit/TestRunner.py b/utils/lit/lit/TestRunner.py index 75182b8..07fb43f 100644 --- a/utils/lit/lit/TestRunner.py +++ b/utils/lit/lit/TestRunner.py @@ -49,13 +49,14 @@ def executeShCmd(cmd, cfg, cwd, results): return executeShCmd(cmd.rhs, cfg, cwd, results) if cmd.op == '&': - raise NotImplementedError,"unsupported test command: '&'" + raise InternalShellError(cmd,"unsupported shell operator: '&'") if cmd.op == '||': res = executeShCmd(cmd.lhs, cfg, cwd, results) if res != 0: res = executeShCmd(cmd.rhs, cfg, cwd, results) return res + if cmd.op == '&&': res = executeShCmd(cmd.lhs, cfg, cwd, results) if res is None: @@ -98,7 +99,7 @@ def executeShCmd(cmd, cfg, cwd, results): elif r[0] == ('<',): redirects[0] = [r[1], 'r', None] else: - raise NotImplementedError,"Unsupported redirect: %r" % (r,) + raise InternalShellError(j,"Unsupported redirect: %r" % (r,)) # Map from the final redirections to something subprocess can handle. 
final_redirects = [] @@ -107,14 +108,14 @@ def executeShCmd(cmd, cfg, cwd, results): result = input elif r == (1,): if index == 0: - raise NotImplementedError,"Unsupported redirect for stdin" + raise InternalShellError(j,"Unsupported redirect for stdin") elif index == 1: result = subprocess.PIPE else: result = subprocess.STDOUT elif r == (2,): if index != 2: - raise NotImplementedError,"Unsupported redirect on stdout" + raise InternalShellError(j,"Unsupported redirect on stdout") result = subprocess.PIPE else: if r[2] is None: @@ -256,9 +257,8 @@ def executeScriptInternal(test, litConfig, tmpBase, commands, cwd): try: exitCode = executeShCmd(cmd, test.config, cwd, results) except InternalShellError,e: - out = '' - err = e.message - exitCode = 255 + exitCode = 127 + results.append((e.command, '', e.message, exitCode)) out = err = '' for i,(cmd, cmd_out,cmd_err,res) in enumerate(results): @@ -269,82 +269,6 @@ def executeScriptInternal(test, litConfig, tmpBase, commands, cwd): return out, err, exitCode -def executeTclScriptInternal(test, litConfig, tmpBase, commands, cwd): - import TclUtil - cmds = [] - for ln in commands: - # Given the unfortunate way LLVM's test are written, the line gets - # backslash substitution done twice. - ln = TclUtil.TclLexer(ln).lex_unquoted(process_all = True) - - try: - tokens = list(TclUtil.TclLexer(ln).lex()) - except: - return (Test.FAIL, "Tcl lexer error on: %r" % ln) - - # Validate there are no control tokens. - for t in tokens: - if not isinstance(t, str): - return (Test.FAIL, - "Invalid test line: %r containing %r" % (ln, t)) - - try: - cmds.append(TclUtil.TclExecCommand(tokens).parse_pipeline()) - except: - return (Test.FAIL, "Tcl 'exec' parse error on: %r" % ln) - - if litConfig.useValgrind: - for pipeline in cmds: - if pipeline.commands: - # Only valgrind the first command in each pipeline, to avoid - # valgrinding things like grep, not, and FileCheck. - cmd = pipeline.commands[0] - cmd.args = litConfig.valgrindArgs + cmd.args - - cmd = cmds[0] - for c in cmds[1:]: - cmd = ShUtil.Seq(cmd, '&&', c) - - # FIXME: This is lame, we shouldn't need bash. See PR5240. - bashPath = litConfig.getBashPath() - if litConfig.useTclAsSh and bashPath: - script = tmpBase + '.script' - - # Write script file - f = open(script,'w') - print >>f, 'set -o pipefail' - cmd.toShell(f, pipefail = True) - f.close() - - if 0: - print >>sys.stdout, cmd - print >>sys.stdout, open(script).read() - print >>sys.stdout - return '', '', 0 - - command = [litConfig.getBashPath(), script] - out,err,exitCode = executeCommand(command, cwd=cwd, - env=test.config.environment) - - return out,err,exitCode - else: - results = [] - try: - exitCode = executeShCmd(cmd, test.config, cwd, results) - except InternalShellError,e: - results.append((e.command, '', e.message + '\n', 255)) - exitCode = 255 - - out = err = '' - - for i,(cmd, cmd_out, cmd_err, res) in enumerate(results): - out += 'Command %d: %s\n' % (i, ' '.join('"%s"' % s for s in cmd.args)) - out += 'Command %d Result: %r\n' % (i, res) - out += 'Command %d Output:\n%s\n\n' % (i, cmd_out) - out += 'Command %d Stderr:\n%s\n\n' % (i, cmd_err) - - return out, err, exitCode - def executeScript(test, litConfig, tmpBase, commands, cwd): bashPath = litConfig.getBashPath(); isWin32CMDEXE = (litConfig.isWindows and not bashPath) @@ -429,8 +353,6 @@ def parseIntegratedTestScript(test, normalize_slashes=False, ('%{pathsep}', os.pathsep), ('%t', tmpBase + '.tmp'), ('%T', tmpDir), - # FIXME: Remove this once we kill DejaGNU. 
- ('%abs_tmp', tmpBase + '.tmp'), ('#_MARKER_#', '%')]) # Collect the test lines from the script. @@ -506,17 +428,14 @@ def parseIntegratedTestScript(test, normalize_slashes=False, isXFail = isExpectedFail(test, xfails) return script,isXFail,tmpBase,execdir -def formatTestOutput(status, out, err, exitCode, failDueToStderr, script): +def formatTestOutput(status, out, err, exitCode, script): output = StringIO.StringIO() print >>output, "Script:" print >>output, "--" print >>output, '\n'.join(script) print >>output, "--" print >>output, "Exit Code: %r" % exitCode, - if failDueToStderr: - print >>output, "(but there was output on stderr)" - else: - print >>output + print >>output if out: print >>output, "Command Output (stdout):" print >>output, "--" @@ -529,53 +448,6 @@ def formatTestOutput(status, out, err, exitCode, failDueToStderr, script): print >>output, "--" return (status, output.getvalue()) -def executeTclTest(test, litConfig): - if test.config.unsupported: - return (Test.UNSUPPORTED, 'Test is unsupported') - - # Parse the test script, normalizing slashes in substitutions on Windows - # (since otherwise Tcl style lexing will treat them as escapes). - res = parseIntegratedTestScript(test, normalize_slashes=kIsWindows) - if len(res) == 2: - return res - - script, isXFail, tmpBase, execdir = res - - if litConfig.noExecute: - return (Test.PASS, '') - - # Create the output directory if it does not already exist. - Util.mkdir_p(os.path.dirname(tmpBase)) - - res = executeTclScriptInternal(test, litConfig, tmpBase, script, execdir) - if len(res) == 2: - return res - - # Test for failure. In addition to the exit code, Tcl commands are - # considered to fail if there is any standard error output. - out,err,exitCode = res - if isXFail: - ok = exitCode != 0 or err and not litConfig.ignoreStdErr - if ok: - status = Test.XFAIL - else: - status = Test.XPASS - else: - ok = exitCode == 0 and (not err or litConfig.ignoreStdErr) - if ok: - status = Test.PASS - else: - status = Test.FAIL - - if ok: - return (status,'') - - # Set a flag for formatTestOutput so it can explain why the test was - # considered to have failed, despite having an exit code of 0. - failDueToStderr = exitCode == 0 and err and not litConfig.ignoreStdErr - - return formatTestOutput(status, out, err, exitCode, failDueToStderr, script) - def executeShTest(test, litConfig, useExternalSh, extra_substitutions=[]): if test.config.unsupported: @@ -617,7 +489,4 @@ def executeShTest(test, litConfig, useExternalSh, if ok: return (status,'') - # Sh tests are not considered to fail just from stderr output. - failDueToStderr = False - - return formatTestOutput(status, out, err, exitCode, failDueToStderr, script) + return formatTestOutput(status, out, err, exitCode, script) diff --git a/utils/lit/lit/discovery.py b/utils/lit/lit/discovery.py new file mode 100644 index 0000000..c869a67 --- /dev/null +++ b/utils/lit/lit/discovery.py @@ -0,0 +1,234 @@ +""" +Test discovery functions. +""" + +import os +import sys + +from lit.TestingConfig import TestingConfig +from lit import LitConfig, Test + +def dirContainsTestSuite(path, lit_config): + cfgpath = os.path.join(path, lit_config.site_config_name) + if os.path.exists(cfgpath): + return cfgpath + cfgpath = os.path.join(path, lit_config.config_name) + if os.path.exists(cfgpath): + return cfgpath + +def getTestSuite(item, litConfig, cache): + """getTestSuite(item, litConfig, cache) -> (suite, relative_path) + + Find the test suite containing @arg item. + + @retval (None, ...) 
- Indicates no test suite contains @arg item. + @retval (suite, relative_path) - The suite that @arg item is in, and its + relative path inside that suite. + """ + def search1(path): + # Check for a site config or a lit config. + cfgpath = dirContainsTestSuite(path, litConfig) + + # If we didn't find a config file, keep looking. + if not cfgpath: + parent,base = os.path.split(path) + if parent == path: + return (None, ()) + + ts, relative = search(parent) + return (ts, relative + (base,)) + + # We found a config file, load it. + if litConfig.debug: + litConfig.note('loading suite config %r' % cfgpath) + + cfg = TestingConfig.frompath(cfgpath, None, litConfig, mustExist = True) + source_root = os.path.realpath(cfg.test_source_root or path) + exec_root = os.path.realpath(cfg.test_exec_root or path) + return Test.TestSuite(cfg.name, source_root, exec_root, cfg), () + + def search(path): + # Check for an already instantiated test suite. + res = cache.get(path) + if res is None: + cache[path] = res = search1(path) + return res + + # Canonicalize the path. + item = os.path.realpath(item) + + # Skip files and virtual components. + components = [] + while not os.path.isdir(item): + parent,base = os.path.split(item) + if parent == item: + return (None, ()) + components.append(base) + item = parent + components.reverse() + + ts, relative = search(item) + return ts, tuple(relative + tuple(components)) + +def getLocalConfig(ts, path_in_suite, litConfig, cache): + def search1(path_in_suite): + # Get the parent config. + if not path_in_suite: + parent = ts.config + else: + parent = search(path_in_suite[:-1]) + + # Load the local configuration. + source_path = ts.getSourcePath(path_in_suite) + cfgpath = os.path.join(source_path, litConfig.local_config_name) + if litConfig.debug: + litConfig.note('loading local config %r' % cfgpath) + return TestingConfig.frompath(cfgpath, parent, litConfig, + mustExist = False, + config = parent.clone(cfgpath)) + + def search(path_in_suite): + key = (ts, path_in_suite) + res = cache.get(key) + if res is None: + cache[key] = res = search1(path_in_suite) + return res + + return search(path_in_suite) + +def getTests(path, litConfig, testSuiteCache, localConfigCache): + # Find the test suite for this input and its relative path. + ts,path_in_suite = getTestSuite(path, litConfig, testSuiteCache) + if ts is None: + litConfig.warning('unable to find test suite for %r' % path) + return (),() + + if litConfig.debug: + litConfig.note('resolved input %r to %r::%r' % (path, ts.name, + path_in_suite)) + + return ts, getTestsInSuite(ts, path_in_suite, litConfig, + testSuiteCache, localConfigCache) + +def getTestsInSuite(ts, path_in_suite, litConfig, + testSuiteCache, localConfigCache): + # Check that the source path exists (errors here are reported by the + # caller). + source_path = ts.getSourcePath(path_in_suite) + if not os.path.exists(source_path): + return + + # Check if the user named a test directly. + if not os.path.isdir(source_path): + lc = getLocalConfig(ts, path_in_suite[:-1], litConfig, localConfigCache) + yield Test.Test(ts, path_in_suite, lc) + return + + # Otherwise we have a directory to search for tests, start by getting the + # local configuration. + lc = getLocalConfig(ts, path_in_suite, litConfig, localConfigCache) + + # Search for tests. + if lc.test_format is not None: + for res in lc.test_format.getTestsInDirectory(ts, path_in_suite, + litConfig, lc): + yield res + + # Search subdirectories. 
+ for filename in os.listdir(source_path): + # FIXME: This doesn't belong here? + if filename in ('Output', '.svn') or filename in lc.excludes: + continue + + # Ignore non-directories. + file_sourcepath = os.path.join(source_path, filename) + if not os.path.isdir(file_sourcepath): + continue + + # Check for nested test suites, first in the execpath in case there is a + # site configuration and then in the source path. + file_execpath = ts.getExecPath(path_in_suite + (filename,)) + if dirContainsTestSuite(file_execpath, litConfig): + sub_ts, subiter = getTests(file_execpath, litConfig, + testSuiteCache, localConfigCache) + elif dirContainsTestSuite(file_sourcepath, litConfig): + sub_ts, subiter = getTests(file_sourcepath, litConfig, + testSuiteCache, localConfigCache) + else: + # Otherwise, continue loading from inside this test suite. + subiter = getTestsInSuite(ts, path_in_suite + (filename,), + litConfig, testSuiteCache, + localConfigCache) + sub_ts = None + + N = 0 + for res in subiter: + N += 1 + yield res + if sub_ts and not N: + litConfig.warning('test suite %r contained no tests' % sub_ts.name) + +def find_tests_for_inputs(lit_config, inputs): + """ + find_tests_for_inputs(lit_config, inputs) -> [Test] + + Given a configuration object and a list of input specifiers, find all the + tests to execute. + """ + + # Expand '@...' form in inputs. + actual_inputs = [] + for input in inputs: + if os.path.exists(input) or not input.startswith('@'): + actual_inputs.append(input) + else: + f = open(input[1:]) + try: + for ln in f: + ln = ln.strip() + if ln: + actual_inputs.append(ln) + finally: + f.close() + + # Load the tests from the inputs. + tests = [] + test_suite_cache = {} + local_config_cache = {} + for input in actual_inputs: + prev = len(tests) + tests.extend(getTests(input, lit_config, + test_suite_cache, local_config_cache)[1]) + if prev == len(tests): + lit_config.warning('input %r contained no tests' % input) + + # If there were any errors during test discovery, exit now. + if lit_config.numErrors: + print >>sys.stderr, '%d errors, exiting.' % lit_config.numErrors + sys.exit(2) + + return tests + +def load_test_suite(inputs): + import platform + import unittest + from lit.LitTestCase import LitTestCase + + # Create the global config object. + litConfig = LitConfig.LitConfig(progname = 'lit', + path = [], + quiet = False, + useValgrind = False, + valgrindLeakCheck = False, + valgrindArgs = [], + noExecute = False, + ignoreStdErr = False, + debug = False, + isWindows = (platform.system()=='Windows'), + params = {}) + + tests = find_tests_for_inputs(litConfig, inputs) + + # Return a unittest test suite which just runs the tests in order. + return unittest.TestSuite([LitTestCase(test, litConfig) for test in tests]) + diff --git a/utils/lit/lit/main.py b/utils/lit/lit/main.py index 25bbcbd..da961ee 100755 --- a/utils/lit/lit/main.py +++ b/utils/lit/lit/main.py @@ -12,18 +12,10 @@ import ProgressBar import TestRunner import Util -from TestingConfig import TestingConfig import LitConfig import Test -# Configuration files to look for when discovering test suites. These can be -# overridden with --config-prefix. -# -# FIXME: Rename to 'config.lit', 'site.lit', and 'local.lit' ? 
-gConfigName = 'lit.cfg' -gSiteConfigName = 'lit.site.cfg' - -kLocalConfigName = 'lit.local.cfg' +import lit.discovery class TestingProgressDisplay: def __init__(self, opts, numTests, progressBar=None): @@ -137,166 +129,6 @@ class Tester(threading.Thread): test.setResult(result, output, elapsed) self.display.update(test) -def dirContainsTestSuite(path): - cfgpath = os.path.join(path, gSiteConfigName) - if os.path.exists(cfgpath): - return cfgpath - cfgpath = os.path.join(path, gConfigName) - if os.path.exists(cfgpath): - return cfgpath - -def getTestSuite(item, litConfig, cache): - """getTestSuite(item, litConfig, cache) -> (suite, relative_path) - - Find the test suite containing @arg item. - - @retval (None, ...) - Indicates no test suite contains @arg item. - @retval (suite, relative_path) - The suite that @arg item is in, and its - relative path inside that suite. - """ - def search1(path): - # Check for a site config or a lit config. - cfgpath = dirContainsTestSuite(path) - - # If we didn't find a config file, keep looking. - if not cfgpath: - parent,base = os.path.split(path) - if parent == path: - return (None, ()) - - ts, relative = search(parent) - return (ts, relative + (base,)) - - # We found a config file, load it. - if litConfig.debug: - litConfig.note('loading suite config %r' % cfgpath) - - cfg = TestingConfig.frompath(cfgpath, None, litConfig, mustExist = True) - source_root = os.path.realpath(cfg.test_source_root or path) - exec_root = os.path.realpath(cfg.test_exec_root or path) - return Test.TestSuite(cfg.name, source_root, exec_root, cfg), () - - def search(path): - # Check for an already instantiated test suite. - res = cache.get(path) - if res is None: - cache[path] = res = search1(path) - return res - - # Canonicalize the path. - item = os.path.realpath(item) - - # Skip files and virtual components. - components = [] - while not os.path.isdir(item): - parent,base = os.path.split(item) - if parent == item: - return (None, ()) - components.append(base) - item = parent - components.reverse() - - ts, relative = search(item) - return ts, tuple(relative + tuple(components)) - -def getLocalConfig(ts, path_in_suite, litConfig, cache): - def search1(path_in_suite): - # Get the parent config. - if not path_in_suite: - parent = ts.config - else: - parent = search(path_in_suite[:-1]) - - # Load the local configuration. - source_path = ts.getSourcePath(path_in_suite) - cfgpath = os.path.join(source_path, kLocalConfigName) - if litConfig.debug: - litConfig.note('loading local config %r' % cfgpath) - return TestingConfig.frompath(cfgpath, parent, litConfig, - mustExist = False, - config = parent.clone(cfgpath)) - - def search(path_in_suite): - key = (ts, path_in_suite) - res = cache.get(key) - if res is None: - cache[key] = res = search1(path_in_suite) - return res - - return search(path_in_suite) - -def getTests(path, litConfig, testSuiteCache, localConfigCache): - # Find the test suite for this input and its relative path. - ts,path_in_suite = getTestSuite(path, litConfig, testSuiteCache) - if ts is None: - litConfig.warning('unable to find test suite for %r' % path) - return (),() - - if litConfig.debug: - litConfig.note('resolved input %r to %r::%r' % (path, ts.name, - path_in_suite)) - - return ts, getTestsInSuite(ts, path_in_suite, litConfig, - testSuiteCache, localConfigCache) - -def getTestsInSuite(ts, path_in_suite, litConfig, - testSuiteCache, localConfigCache): - # Check that the source path exists (errors here are reported by the - # caller). 
- source_path = ts.getSourcePath(path_in_suite) - if not os.path.exists(source_path): - return - - # Check if the user named a test directly. - if not os.path.isdir(source_path): - lc = getLocalConfig(ts, path_in_suite[:-1], litConfig, localConfigCache) - yield Test.Test(ts, path_in_suite, lc) - return - - # Otherwise we have a directory to search for tests, start by getting the - # local configuration. - lc = getLocalConfig(ts, path_in_suite, litConfig, localConfigCache) - - # Search for tests. - if lc.test_format is not None: - for res in lc.test_format.getTestsInDirectory(ts, path_in_suite, - litConfig, lc): - yield res - - # Search subdirectories. - for filename in os.listdir(source_path): - # FIXME: This doesn't belong here? - if filename in ('Output', '.svn') or filename in lc.excludes: - continue - - # Ignore non-directories. - file_sourcepath = os.path.join(source_path, filename) - if not os.path.isdir(file_sourcepath): - continue - - # Check for nested test suites, first in the execpath in case there is a - # site configuration and then in the source path. - file_execpath = ts.getExecPath(path_in_suite + (filename,)) - if dirContainsTestSuite(file_execpath): - sub_ts, subiter = getTests(file_execpath, litConfig, - testSuiteCache, localConfigCache) - elif dirContainsTestSuite(file_sourcepath): - sub_ts, subiter = getTests(file_sourcepath, litConfig, - testSuiteCache, localConfigCache) - else: - # Otherwise, continue loading from inside this test suite. - subiter = getTestsInSuite(ts, path_in_suite + (filename,), - litConfig, testSuiteCache, - localConfigCache) - sub_ts = None - - N = 0 - for res in subiter: - N += 1 - yield res - if sub_ts and not N: - litConfig.warning('test suite %r contained no tests' % sub_ts.name) - def runTests(numThreads, litConfig, provider, display): # If only using one testing thread, don't use threads at all; this lets us # profile, among other things. @@ -316,50 +148,8 @@ def runTests(numThreads, litConfig, provider, display): except KeyboardInterrupt: sys.exit(2) -def load_test_suite(inputs): - import unittest - - # Create the global config object. - litConfig = LitConfig.LitConfig(progname = 'lit', - path = [], - quiet = False, - useValgrind = False, - valgrindLeakCheck = False, - valgrindArgs = [], - useTclAsSh = False, - noExecute = False, - ignoreStdErr = False, - debug = False, - isWindows = (platform.system()=='Windows'), - params = {}) - - # Load the tests from the inputs. - tests = [] - testSuiteCache = {} - localConfigCache = {} - for input in inputs: - prev = len(tests) - tests.extend(getTests(input, litConfig, - testSuiteCache, localConfigCache)[1]) - if prev == len(tests): - litConfig.warning('input %r contained no tests' % input) - - # If there were any errors during test discovery, exit now. - if litConfig.numErrors: - print >>sys.stderr, '%d errors, exiting.' % litConfig.numErrors - sys.exit(2) - - # Return a unittest test suite which just runs the tests in order. - def get_test_fn(test): - return unittest.FunctionTestCase( - lambda: test.config.test_format.execute( - test, litConfig), - description = test.getFullName()) - - from LitTestCase import LitTestCase - return unittest.TestSuite([LitTestCase(test, litConfig) for test in tests]) - -def main(builtinParameters = {}): # Bump the GIL check interval, its more important to get any one thread to a +def main(builtinParameters = {}): + # Bump the GIL check interval, its more important to get any one thread to a # blocking operation (hopefully exec) than to try and unblock other threads. 
# # FIXME: This is a hack. @@ -442,9 +232,6 @@ def main(builtinParameters = {}): # Bump the GIL check interval, its more imp group.add_option("", "--show-suites", dest="showSuites", help="Show discovered test suites", action="store_true", default=False) - group.add_option("", "--no-tcl-as-sh", dest="useTclAsSh", - help="Don't run Tcl scripts using 'sh'", - action="store_false", default=True) group.add_option("", "--repeat", dest="repeatTests", metavar="N", help="Repeat tests N times (for timing)", action="store", default=None, type=int) @@ -455,12 +242,6 @@ def main(builtinParameters = {}): # Bump the GIL check interval, its more imp if not args: parser.error('No inputs specified') - if opts.configPrefix is not None: - global gConfigName, gSiteConfigName, kLocalConfigName - gConfigName = '%s.cfg' % opts.configPrefix - gSiteConfigName = '%s.site.cfg' % opts.configPrefix - kLocalConfigName = '%s.local.cfg' % opts.configPrefix - if opts.numThreads is None: # Python <2.5 has a race condition causing lit to always fail with numThreads>1 # http://bugs.python.org/issue1731717 @@ -489,50 +270,20 @@ def main(builtinParameters = {}): # Bump the GIL check interval, its more imp useValgrind = opts.useValgrind, valgrindLeakCheck = opts.valgrindLeakCheck, valgrindArgs = opts.valgrindArgs, - useTclAsSh = opts.useTclAsSh, noExecute = opts.noExecute, ignoreStdErr = False, debug = opts.debug, isWindows = (platform.system()=='Windows'), - params = userParams) + params = userParams, + config_prefix = opts.configPrefix) - # Expand '@...' form in inputs. - actual_inputs = [] - for input in inputs: - if os.path.exists(input) or not input.startswith('@'): - actual_inputs.append(input) - else: - f = open(input[1:]) - try: - for ln in f: - ln = ln.strip() - if ln: - actual_inputs.append(ln) - finally: - f.close() - - - # Load the tests from the inputs. - tests = [] - testSuiteCache = {} - localConfigCache = {} - for input in actual_inputs: - prev = len(tests) - tests.extend(getTests(input, litConfig, - testSuiteCache, localConfigCache)[1]) - if prev == len(tests): - litConfig.warning('input %r contained no tests' % input) - - # If there were any errors during test discovery, exit now. - if litConfig.numErrors: - print >>sys.stderr, '%d errors, exiting.' 
% litConfig.numErrors - sys.exit(2) + tests = lit.discovery.find_tests_for_inputs(litConfig, inputs) if opts.showSuites: - suitesAndTests = dict([(ts,[]) - for ts,_ in testSuiteCache.values() - if ts]) + suitesAndTests = {} for t in tests: + if t.suite not in suitesAndTests: + suitesAndTests[t.suite] = [] suitesAndTests[t.suite].append(t) print '-- Test Suites --' diff --git a/utils/lit/tests/.coveragerc b/utils/lit/tests/.coveragerc new file mode 100644 index 0000000..c886d0a --- /dev/null +++ b/utils/lit/tests/.coveragerc @@ -0,0 +1,11 @@ +# .coveragerc to control coverage.py +[run] +branch = False +parallel = True +source = lit + +[html] +directory = coverage_html_report + +[report] +omit = Inputs diff --git a/utils/lit/tests/Inputs/discovery/lit.cfg b/utils/lit/tests/Inputs/discovery/lit.cfg new file mode 100644 index 0000000..3513bff --- /dev/null +++ b/utils/lit/tests/Inputs/discovery/lit.cfg @@ -0,0 +1,5 @@ +config.name = 'top-level-suite' +config.suffixes = ['.txt'] +config.test_format = lit.formats.ShTest() +config.test_source_root = None +config.test_exec_root = None diff --git a/utils/lit/tests/Inputs/discovery/subdir/lit.local.cfg b/utils/lit/tests/Inputs/discovery/subdir/lit.local.cfg new file mode 100644 index 0000000..5ae6b3c --- /dev/null +++ b/utils/lit/tests/Inputs/discovery/subdir/lit.local.cfg @@ -0,0 +1 @@ +config.suffixes = ['.py'] diff --git a/utils/lit/tests/Inputs/discovery/subdir/test-three.py b/utils/lit/tests/Inputs/discovery/subdir/test-three.py new file mode 100644 index 0000000..b80b60b --- /dev/null +++ b/utils/lit/tests/Inputs/discovery/subdir/test-three.py @@ -0,0 +1 @@ +# RUN: true diff --git a/utils/lit/tests/Inputs/discovery/subsuite/lit.cfg b/utils/lit/tests/Inputs/discovery/subsuite/lit.cfg new file mode 100644 index 0000000..0c2979d --- /dev/null +++ b/utils/lit/tests/Inputs/discovery/subsuite/lit.cfg @@ -0,0 +1,5 @@ +config.name = 'sub-suite' +config.suffixes = ['.txt'] +config.test_format = lit.formats.ShTest() +config.test_source_root = None +config.test_exec_root = None diff --git a/utils/lit/tests/Inputs/discovery/subsuite/test-one.txt b/utils/lit/tests/Inputs/discovery/subsuite/test-one.txt new file mode 100644 index 0000000..b80b60b --- /dev/null +++ b/utils/lit/tests/Inputs/discovery/subsuite/test-one.txt @@ -0,0 +1 @@ +# RUN: true diff --git a/utils/lit/tests/Inputs/discovery/subsuite/test-two.txt b/utils/lit/tests/Inputs/discovery/subsuite/test-two.txt new file mode 100644 index 0000000..b80b60b --- /dev/null +++ b/utils/lit/tests/Inputs/discovery/subsuite/test-two.txt @@ -0,0 +1 @@ +# RUN: true diff --git a/utils/lit/tests/Inputs/discovery/test-one.txt b/utils/lit/tests/Inputs/discovery/test-one.txt new file mode 100644 index 0000000..b80b60b --- /dev/null +++ b/utils/lit/tests/Inputs/discovery/test-one.txt @@ -0,0 +1 @@ +# RUN: true diff --git a/utils/lit/tests/Inputs/discovery/test-two.txt b/utils/lit/tests/Inputs/discovery/test-two.txt new file mode 100644 index 0000000..b80b60b --- /dev/null +++ b/utils/lit/tests/Inputs/discovery/test-two.txt @@ -0,0 +1 @@ +# RUN: true diff --git a/utils/lit/tests/Inputs/shtest-format/external_shell/fail.txt b/utils/lit/tests/Inputs/shtest-format/external_shell/fail.txt new file mode 100644 index 0000000..1e74be5 --- /dev/null +++ b/utils/lit/tests/Inputs/shtest-format/external_shell/fail.txt @@ -0,0 +1,3 @@ +# Run a command that fails with an error on stderr.
+# +# RUN: cat "does-not-exist" diff --git a/utils/lit/tests/Inputs/shtest-format/external_shell/lit.local.cfg b/utils/lit/tests/Inputs/shtest-format/external_shell/lit.local.cfg new file mode 100644 index 0000000..d14d147 --- /dev/null +++ b/utils/lit/tests/Inputs/shtest-format/external_shell/lit.local.cfg @@ -0,0 +1 @@ +config.test_format = lit.formats.ShTest(execute_external=True) diff --git a/utils/lit/tests/Inputs/shtest-format/external_shell/pass.txt b/utils/lit/tests/Inputs/shtest-format/external_shell/pass.txt new file mode 100644 index 0000000..b80b60b --- /dev/null +++ b/utils/lit/tests/Inputs/shtest-format/external_shell/pass.txt @@ -0,0 +1 @@ +# RUN: true diff --git a/utils/lit/tests/Inputs/shtest-format/fail.txt b/utils/lit/tests/Inputs/shtest-format/fail.txt new file mode 100644 index 0000000..49932c3 --- /dev/null +++ b/utils/lit/tests/Inputs/shtest-format/fail.txt @@ -0,0 +1 @@ +# RUN: false diff --git a/utils/lit/tests/Inputs/shtest-format/lit.cfg b/utils/lit/tests/Inputs/shtest-format/lit.cfg new file mode 100644 index 0000000..78dd1bf --- /dev/null +++ b/utils/lit/tests/Inputs/shtest-format/lit.cfg @@ -0,0 +1,7 @@ +config.name = 'shtest-format' +config.suffixes = ['.txt'] +config.test_format = lit.formats.ShTest() +config.test_source_root = None +config.test_exec_root = None +config.target_triple = 'x86_64-unknown-unknown' +config.available_features.add('a-present-feature') diff --git a/utils/lit/tests/Inputs/shtest-format/no-test-line.txt b/utils/lit/tests/Inputs/shtest-format/no-test-line.txt new file mode 100644 index 0000000..f2316bd --- /dev/null +++ b/utils/lit/tests/Inputs/shtest-format/no-test-line.txt @@ -0,0 +1 @@ +# Empty! diff --git a/utils/lit/tests/Inputs/shtest-format/pass.txt b/utils/lit/tests/Inputs/shtest-format/pass.txt new file mode 100644 index 0000000..b80b60b --- /dev/null +++ b/utils/lit/tests/Inputs/shtest-format/pass.txt @@ -0,0 +1 @@ +# RUN: true diff --git a/utils/lit/tests/Inputs/shtest-format/requires-missing.txt b/utils/lit/tests/Inputs/shtest-format/requires-missing.txt new file mode 100644 index 0000000..9e6648d --- /dev/null +++ b/utils/lit/tests/Inputs/shtest-format/requires-missing.txt @@ -0,0 +1,2 @@ +RUN: true +REQUIRES: a-missing-feature diff --git a/utils/lit/tests/Inputs/shtest-format/requires-present.txt b/utils/lit/tests/Inputs/shtest-format/requires-present.txt new file mode 100644 index 0000000..064f707 --- /dev/null +++ b/utils/lit/tests/Inputs/shtest-format/requires-present.txt @@ -0,0 +1,2 @@ +RUN: true +REQUIRES: a-present-feature diff --git a/utils/lit/tests/Inputs/shtest-format/unsupported_dir/lit.local.cfg b/utils/lit/tests/Inputs/shtest-format/unsupported_dir/lit.local.cfg new file mode 100644 index 0000000..462e3dc --- /dev/null +++ b/utils/lit/tests/Inputs/shtest-format/unsupported_dir/lit.local.cfg @@ -0,0 +1 @@ +config.unsupported = True diff --git a/utils/lit/tests/Inputs/shtest-format/unsupported_dir/some-test.txt b/utils/lit/tests/Inputs/shtest-format/unsupported_dir/some-test.txt new file mode 100644 index 0000000..b80b60b --- /dev/null +++ b/utils/lit/tests/Inputs/shtest-format/unsupported_dir/some-test.txt @@ -0,0 +1 @@ +# RUN: true diff --git a/utils/lit/tests/Inputs/shtest-format/xfail-feature.txt b/utils/lit/tests/Inputs/shtest-format/xfail-feature.txt new file mode 100644 index 0000000..bd6241f --- /dev/null +++ b/utils/lit/tests/Inputs/shtest-format/xfail-feature.txt @@ -0,0 +1,2 @@ +# RUN: false +# XFAIL: a-present-feature diff --git a/utils/lit/tests/Inputs/shtest-format/xfail-target.txt 
b/utils/lit/tests/Inputs/shtest-format/xfail-target.txt new file mode 100644 index 0000000..36760be --- /dev/null +++ b/utils/lit/tests/Inputs/shtest-format/xfail-target.txt @@ -0,0 +1,2 @@ +RUN: false +XFAIL: x86_64 diff --git a/utils/lit/tests/Inputs/shtest-format/xfail.txt b/utils/lit/tests/Inputs/shtest-format/xfail.txt new file mode 100644 index 0000000..6814cda --- /dev/null +++ b/utils/lit/tests/Inputs/shtest-format/xfail.txt @@ -0,0 +1,2 @@ +RUN: false +XFAIL: * diff --git a/utils/lit/tests/Inputs/shtest-format/xpass.txt b/utils/lit/tests/Inputs/shtest-format/xpass.txt new file mode 100644 index 0000000..764d217 --- /dev/null +++ b/utils/lit/tests/Inputs/shtest-format/xpass.txt @@ -0,0 +1,2 @@ +RUN: true +XFAIL: x86_64 diff --git a/utils/lit/tests/Inputs/shtest-shell/error-0.txt b/utils/lit/tests/Inputs/shtest-shell/error-0.txt new file mode 100644 index 0000000..631c8df --- /dev/null +++ b/utils/lit/tests/Inputs/shtest-shell/error-0.txt @@ -0,0 +1,3 @@ +# Check error on an internal shell error (unable to find command). +# +# RUN: not-a-real-command diff --git a/utils/lit/tests/Inputs/shtest-shell/error-1.txt b/utils/lit/tests/Inputs/shtest-shell/error-1.txt new file mode 100644 index 0000000..e5c8be6 --- /dev/null +++ b/utils/lit/tests/Inputs/shtest-shell/error-1.txt @@ -0,0 +1,3 @@ +# Check error on a shell parsing failure. +# +# RUN: echo "missing quote diff --git a/utils/lit/tests/Inputs/shtest-shell/error-2.txt b/utils/lit/tests/Inputs/shtest-shell/error-2.txt new file mode 100644 index 0000000..a976286 --- /dev/null +++ b/utils/lit/tests/Inputs/shtest-shell/error-2.txt @@ -0,0 +1,3 @@ +# Check error on an unsupported redirect. +# +# RUN: echo "hello" 3>&1 diff --git a/utils/lit/tests/Inputs/shtest-shell/lit.cfg b/utils/lit/tests/Inputs/shtest-shell/lit.cfg new file mode 100644 index 0000000..4878b65 --- /dev/null +++ b/utils/lit/tests/Inputs/shtest-shell/lit.cfg @@ -0,0 +1,5 @@ +config.name = 'shtest-shell' +config.suffixes = ['.txt'] +config.test_format = lit.formats.ShTest() +config.test_source_root = None +config.test_exec_root = None diff --git a/utils/lit/tests/Inputs/shtest-shell/redirects.txt b/utils/lit/tests/Inputs/shtest-shell/redirects.txt new file mode 100644 index 0000000..6be88b6 --- /dev/null +++ b/utils/lit/tests/Inputs/shtest-shell/redirects.txt @@ -0,0 +1,41 @@ +# Check stdout redirect (> and >>). +# +# RUN: echo "not-present" > %t.stdout-write +# RUN: echo "is-present" > %t.stdout-write +# RUN: FileCheck --check-prefix=STDOUT-WRITE < %t.stdout-write %s +# +# STDOUT-WRITE-NOT: not-present +# STDOUT-WRITE: is-present +# +# RUN: echo "appended-line" >> %t.stdout-write +# RUN: FileCheck --check-prefix=STDOUT-APPEND < %t.stdout-write %s +# +# STDOUT-APPEND: is-present +# STDOUT-APPEND: appended-line + + +# Check stderr redirect (2> and 2>>). +# +# RUN: echo "not-present" > %t.stderr-write +# RUN: %S/write-to-stderr.sh 2> %t.stderr-write +# RUN: FileCheck --check-prefix=STDERR-WRITE < %t.stderr-write %s +# +# STDERR-WRITE-NOT: not-present +# STDERR-WRITE: a line on stderr +# +# RUN: %S/write-to-stderr.sh 2>> %t.stderr-write +# RUN: FileCheck --check-prefix=STDERR-APPEND < %t.stderr-write %s +# +# STDERR-APPEND: a line on stderr +# STDERR-APPEND: a line on stderr + + +# Check combined redirect (&>).
+# +# RUN: echo "not-present" > %t.combined +# RUN: %S/write-to-stdout-and-stderr.sh &> %t.combined +# RUN: FileCheck --check-prefix=COMBINED-WRITE < %t.combined %s +# +# COMBINED-WRITE-NOT: not-present +# COMBINED-WRITE: a line on stdout +# COMBINED-WRITE: a line on stderr diff --git a/utils/lit/tests/Inputs/shtest-shell/sequencing-0.txt b/utils/lit/tests/Inputs/shtest-shell/sequencing-0.txt new file mode 100644 index 0000000..6578db2 --- /dev/null +++ b/utils/lit/tests/Inputs/shtest-shell/sequencing-0.txt @@ -0,0 +1,28 @@ +# Check sequencing operations. +# +# RUN: echo "first-line" > %t.out && echo "second-line" >> %t.out +# RUN: FileCheck --check-prefix CHECK-AND < %t.out %s +# +# CHECK-AND: first-line +# CHECK-AND: second-line +# +# The false case of && is tested in sequencing-1.txt + + +# RUN: echo "first-line" > %t.out || echo "second-line" >> %t.out +# RUN: FileCheck --check-prefix CHECK-OR-1 < %t.out %s +# +# CHECK-OR-1: first-line +# CHECK-OR-1-NOT: second-line + +# RUN: false || echo "second-line" > %t.out +# RUN: FileCheck --check-prefix CHECK-OR-2 < %t.out %s +# +# CHECK-OR-2: second-line + + +# RUN: echo "first-line" > %t.out; echo "second-line" >> %t.out +# RUN: FileCheck --check-prefix CHECK-SEQ < %t.out %s +# +# CHECK-SEQ: first-line +# CHECK-SEQ: second-line diff --git a/utils/lit/tests/Inputs/shtest-shell/sequencing-1.txt b/utils/lit/tests/Inputs/shtest-shell/sequencing-1.txt new file mode 100644 index 0000000..5a1794c --- /dev/null +++ b/utils/lit/tests/Inputs/shtest-shell/sequencing-1.txt @@ -0,0 +1,2 @@ +# RUN: false && true +# XFAIL: * diff --git a/utils/lit/tests/Inputs/shtest-shell/write-to-stderr.sh b/utils/lit/tests/Inputs/shtest-shell/write-to-stderr.sh new file mode 100755 index 0000000..ead3fd3 --- /dev/null +++ b/utils/lit/tests/Inputs/shtest-shell/write-to-stderr.sh @@ -0,0 +1,3 @@ +#!/bin/sh + +echo "a line on stderr" 1>&2 diff --git a/utils/lit/tests/Inputs/shtest-shell/write-to-stdout-and-stderr.sh b/utils/lit/tests/Inputs/shtest-shell/write-to-stdout-and-stderr.sh new file mode 100755 index 0000000..f20de5d --- /dev/null +++ b/utils/lit/tests/Inputs/shtest-shell/write-to-stdout-and-stderr.sh @@ -0,0 +1,4 @@ +#!/bin/sh + +echo "a line on stdout" +echo "a line on stderr" 1>&2 diff --git a/utils/lit/tests/Inputs/unittest-adaptor/lit.cfg b/utils/lit/tests/Inputs/unittest-adaptor/lit.cfg new file mode 100644 index 0000000..52de709 --- /dev/null +++ b/utils/lit/tests/Inputs/unittest-adaptor/lit.cfg @@ -0,0 +1,5 @@ +config.name = 'unittest-adaptor' +config.suffixes = ['.txt'] +config.test_format = lit.formats.ShTest() +config.test_source_root = None +config.test_exec_root = None diff --git a/utils/lit/tests/Inputs/unittest-adaptor/test-one.txt b/utils/lit/tests/Inputs/unittest-adaptor/test-one.txt new file mode 100644 index 0000000..b80b60b --- /dev/null +++ b/utils/lit/tests/Inputs/unittest-adaptor/test-one.txt @@ -0,0 +1 @@ +# RUN: true diff --git a/utils/lit/tests/Inputs/unittest-adaptor/test-two.txt b/utils/lit/tests/Inputs/unittest-adaptor/test-two.txt new file mode 100644 index 0000000..49932c3 --- /dev/null +++ b/utils/lit/tests/Inputs/unittest-adaptor/test-two.txt @@ -0,0 +1 @@ +# RUN: false diff --git a/utils/lit/tests/discovery.py b/utils/lit/tests/discovery.py new file mode 100644 index 0000000..54b99d3 --- /dev/null +++ b/utils/lit/tests/discovery.py @@ -0,0 +1,25 @@ +# Check the basic discovery process, including a sub-suite.
+# +# RUN: %{lit} %{inputs}/discovery \ +# RUN: -j 1 --debug --no-execute --show-suites -v > %t.out 2> %t.err +# RUN: FileCheck --check-prefix=CHECK-BASIC-OUT < %t.out %s +# RUN: FileCheck --check-prefix=CHECK-BASIC-ERR < %t.err %s +# +# CHECK-BASIC-ERR: loading suite config '{{.*}}/tests/Inputs/discovery/lit.cfg' +# CHECK-BASIC-ERR: loading local config '{{.*}}/tests/Inputs/discovery/subdir/lit.local.cfg' +# CHECK-BASIC-ERR: loading suite config '{{.*}}/tests/Inputs/discovery/subsuite/lit.cfg' +# +# CHECK-BASIC-OUT: -- Test Suites -- +# CHECK-BASIC-OUT: sub-suite - 2 tests +# CHECK-BASIC-OUT: Source Root: +# CHECK-BASIC-OUT: Exec Root : +# CHECK-BASIC-OUT: top-level-suite - 3 tests +# CHECK-BASIC-OUT: Source Root: +# CHECK-BASIC-OUT: Exec Root : +# +# CHECK-BASIC-OUT: -- Testing: 5 tests, 1 threads -- +# CHECK-BASIC-OUT: PASS: sub-suite :: test-one +# CHECK-BASIC-OUT: PASS: sub-suite :: test-two +# CHECK-BASIC-OUT: PASS: top-level-suite :: subdir/test-three +# CHECK-BASIC-OUT: PASS: top-level-suite :: test-one +# CHECK-BASIC-OUT: PASS: top-level-suite :: test-two diff --git a/utils/lit/tests/lit.cfg b/utils/lit/tests/lit.cfg new file mode 100644 index 0000000..32760ce --- /dev/null +++ b/utils/lit/tests/lit.cfg @@ -0,0 +1,36 @@ +# -*- Python -*- + +import os + +# Configuration file for the 'lit' test runner. + +# name: The name of this test suite. +config.name = 'lit' + +# testFormat: The test format to use to interpret tests. +config.test_format = lit.formats.ShTest(execute_external=False) + +# suffixes: A list of file extensions to treat as test files. +config.suffixes = ['.py'] + +# excludes: A list of individual files to exclude. +config.excludes = ['Inputs'] + +# test_source_root: The root path where tests are located. +config.test_source_root = os.path.dirname(__file__) +config.test_exec_root = config.test_source_root + +config.target_triple = None + +src_root = os.path.join(config.test_source_root, '..') +config.environment['PYTHONPATH'] = src_root +config.substitutions.append(('%{src_root}', src_root)) +config.substitutions.append(('%{inputs}', os.path.join( + src_root, 'tests', 'Inputs'))) +config.substitutions.append(('%{lit}', os.path.join(src_root, 'lit.py'))) + +# Enable coverage.py reporting, assuming the coverage module has been installed +# and sitecustomize.py in the virtualenv has been modified appropriately. +if lit.params.get('check-coverage', None): + config.environment['COVERAGE_PROCESS_START'] = os.path.join( + os.path.dirname(__file__), ".coveragerc") diff --git a/utils/lit/tests/shell-parsing.py b/utils/lit/tests/shell-parsing.py new file mode 100644 index 0000000..f644132 --- /dev/null +++ b/utils/lit/tests/shell-parsing.py @@ -0,0 +1,3 @@ +# Just run the ShUtil unit tests. +# +# RUN: python -m lit.ShUtil diff --git a/utils/lit/tests/shtest-format.py b/utils/lit/tests/shtest-format.py new file mode 100644 index 0000000..4b36873 --- /dev/null +++ b/utils/lit/tests/shtest-format.py @@ -0,0 +1,43 @@ +# Check the various features of the ShTest format. +# +# RUN: not %{lit} -j 1 -v %{inputs}/shtest-format > %t.out +# RUN: FileCheck < %t.out %s +# +# END. 
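The RUN lines in these lit self-tests lean on the %{lit} and %{inputs} substitutions that tests/lit.cfg above registers in config.substitutions. As a rough sketch of how such (pattern, replacement) pairs expand a script line before it runs (lit's TestRunner also handles builtins such as %s, %S and %t; the paths below are invented for illustration):

    def apply_substitutions(line, substitutions):
        # Each pair is applied in order with a plain string replace, so
        # later pairs see the result of earlier ones.
        for key, value in substitutions:
            line = line.replace(key, value)
        return line

    # Hypothetical values standing in for what tests/lit.cfg computes:
    subs = [('%{lit}', '/work/llvm/utils/lit/lit.py'),
            ('%{inputs}', '/work/llvm/utils/lit/tests/Inputs')]
    print apply_substitutions('not %{lit} -j 1 -v %{inputs}/shtest-format', subs)
    # not /work/llvm/utils/lit/lit.py -j 1 -v /work/llvm/utils/lit/tests/Inputs/shtest-format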
+ +# CHECK: -- Testing: + +# CHECK: FAIL: shtest-format :: external_shell/fail.txt +# CHECK: *** TEST 'shtest-format :: external_shell/fail.txt' FAILED *** +# CHECK: Command Output (stderr): +# CHECK: cat: does-not-exist: No such file or directory +# CHECK: -- + +# CHECK: PASS: shtest-format :: external_shell/pass.txt + +# CHECK: FAIL: shtest-format :: fail.txt + +# CHECK: UNRESOLVED: shtest-format :: no-test-line.txt +# CHECK: PASS: shtest-format :: pass.txt +# CHECK: UNSUPPORTED: shtest-format :: requires-missing.txt +# CHECK: PASS: shtest-format :: requires-present.txt +# CHECK: UNSUPPORTED: shtest-format :: unsupported_dir/some-test.txt +# CHECK: XFAIL: shtest-format :: xfail-feature.txt +# CHECK: XFAIL: shtest-format :: xfail-target.txt +# CHECK: XFAIL: shtest-format :: xfail.txt +# CHECK: XPASS: shtest-format :: xpass.txt +# CHECK: Testing Time + +# CHECK: Unexpected Passing Tests (1) +# CHECK: shtest-format :: xpass.txt + +# CHECK: Failing Tests (2) +# CHECK: shtest-format :: external_shell/fail.txt +# CHECK: shtest-format :: fail.txt + +# CHECK: Expected Passes : 3 +# CHECK: Expected Failures : 3 +# CHECK: Unsupported Tests : 2 +# CHECK: Unresolved Tests : 1 +# CHECK: Unexpected Passes : 1 +# CHECK: Unexpected Failures: 2 diff --git a/utils/lit/tests/shtest-shell.py b/utils/lit/tests/shtest-shell.py new file mode 100644 index 0000000..32479e1 --- /dev/null +++ b/utils/lit/tests/shtest-shell.py @@ -0,0 +1,33 @@ +# Check the internal shell handling component of the ShTest format. +# +# RUN: not %{lit} -j 1 -v %{inputs}/shtest-shell > %t.out +# RUN: FileCheck < %t.out %s +# +# END. + +# CHECK: -- Testing: + +# CHECK: FAIL: shtest-shell :: error-0.txt +# CHECK: *** TEST 'shtest-shell :: error-0.txt' FAILED *** +# CHECK: Command 0: "not-a-real-command" +# CHECK: Command 0 Result: 127 +# CHECK: Command 0 Stderr: +# CHECK: 'not-a-real-command': command not found +# CHECK: *** + +# FIXME: The output here sucks. +# +# CHECK: FAIL: shtest-shell :: error-1.txt +# CHECK: *** TEST 'shtest-shell :: error-1.txt' FAILED *** +# CHECK: shell parser error on: 'echo "missing quote' +# CHECK: *** + +# CHECK: FAIL: shtest-shell :: error-2.txt +# CHECK: *** TEST 'shtest-shell :: error-2.txt' FAILED *** +# CHECK: Unsupported redirect: +# CHECK: *** + +# CHECK: PASS: shtest-shell :: redirects.txt +# CHECK: PASS: shtest-shell :: sequencing-0.txt +# CHECK: XFAIL: shtest-shell :: sequencing-1.txt +# CHECK: Failing Tests (3) diff --git a/utils/lit/tests/unittest-adaptor.py b/utils/lit/tests/unittest-adaptor.py new file mode 100644 index 0000000..243dd41 --- /dev/null +++ b/utils/lit/tests/unittest-adaptor.py @@ -0,0 +1,18 @@ +# Check the lit adaptation to run under unittest. +# +# RUN: python %s %{inputs}/unittest-adaptor 2> %t.err +# RUN: FileCheck < %t.err %s +# +# CHECK: unittest-adaptor :: test-one.txt ... ok +# CHECK: unittest-adaptor :: test-two.txt ... FAIL + +import unittest +import sys + +import lit +import lit.discovery + +input_path = sys.argv[1] +unittest_suite = lit.discovery.load_test_suite([input_path]) +runner = unittest.TextTestRunner(verbosity=2) +runner.run(unittest_suite) diff --git a/utils/lit/tests/usage.py b/utils/lit/tests/usage.py new file mode 100644 index 0000000..e10d613 --- /dev/null +++ b/utils/lit/tests/usage.py @@ -0,0 +1,6 @@ +# Basic sanity check that usage works.
+# +# RUN: %{lit} --help > %t.out +# RUN: FileCheck < %t.out %s +# +# CHECK: Usage: lit.py [options] {file-or-path} diff --git a/utils/lit/utils/README.txt b/utils/lit/utils/README.txt new file mode 100644 index 0000000..81862ba --- /dev/null +++ b/utils/lit/utils/README.txt @@ -0,0 +1,2 @@ +Utilities for the project that aren't intended to be part of a source +distribution. diff --git a/utils/lit/utils/check-coverage b/utils/lit/utils/check-coverage new file mode 100755 index 0000000..bb3d17e --- /dev/null +++ b/utils/lit/utils/check-coverage @@ -0,0 +1,50 @@ +#!/bin/sh + +prog=$(basename $0) + +# Expect to be run from the parent lit directory. +if [ ! -f setup.py ] || [ ! -d lit ]; then + printf 1>&2 "%s: expected to be run from base lit directory\n" "$prog" + exit 1 +fi + +# Parse command line arguments. +if [ "$1" == "--generate-html" ]; then + GENERATE_HTML=1 + shift +fi + +# If invoked with no arguments, run all the tests. +if [ $# == "0" ]; then + set -- "tests" +fi + +# Check that the active python has been modified to enable coverage in its +# sitecustomize. +if ! python -c \ + 'import sitecustomize, sys; sys.exit("coverage" not in dir(sitecustomize))' \ + &> /dev/null; then + printf 1>&2 "error: active python does not appear to enable coverage in its 'sitecustomize.py'\n" + exit 1 +fi + +# First, remove any existing coverage data files. +rm -f tests/.coverage +find tests -name .coverage.\* -exec rm {} \; + +# Next, run the tests. +lit -sv --param check-coverage=1 "$@" + +# Next, move all the data files from subdirectories up. +find tests/* -name .coverage.\* -exec mv {} tests \; + +# Combine all the data files. +(cd tests && python -m coverage combine) + +# Finally, generate the report. +(cd tests && python -m coverage report) + +# Generate the HTML report, if requested. +if [ ! -z "$GENERATE_HTML" ]; then + (cd tests && python -m coverage html) +fi diff --git a/utils/lit/utils/check-sdist b/utils/lit/utils/check-sdist new file mode 100755 index 0000000..6186446 --- /dev/null +++ b/utils/lit/utils/check-sdist @@ -0,0 +1,44 @@ +#!/bin/sh + +if [ $# == 1 ]; then + cd $1 +fi + +# Create a list of all the files in the source tree, excluding various things we +# know don't belong. +echo "Creating current directory contents list." +find . | \ + grep -v '^\./.gitignore' | \ + grep -v '^\./dist' | \ + grep -v '^\./utils' | \ + grep -v '^\./venv' | \ + grep -v '^\./lit.egg-info' | \ + grep -v '^\./lit/ExampleTests' | \ + grep -v '/Output' | \ + grep -v '__pycache__' | \ + grep -v '.pyc$' | grep -v '~$' | \ + sort > /tmp/lit_source_files.txt + +# Create the source distribution. +echo "Creating source distribution." +rm -rf lit.egg-info dist +python setup.py sdist > /tmp/lit_sdist_log.txt + +# Creating list of files in source distribution. +echo "Creating source distribution file list." +tar zft dist/lit*.tar.gz | \ + sed -e 's#lit-[0-9.dev]*/#./#' | \ + sed -e 's#/$##' | \ + grep -v '^\./PKG-INFO' | \ + grep -v '^\./setup.cfg' | \ + grep -v '^\./lit.egg-info' | \ + sort > /tmp/lit_sdist_files.txt + +# Diff the files. +echo "Running diff..." +if (diff /tmp/lit_source_files.txt /tmp/lit_sdist_files.txt); then + echo "Diff is clean!" +else + echo "error: there were differences in the source lists!" 
+ exit 1 +fi diff --git a/utils/llvm-compilers-check b/utils/llvm-compilers-check index 623ebc6..3173027 100755 --- a/utils/llvm-compilers-check +++ b/utils/llvm-compilers-check @@ -1,11 +1,11 @@ #!/usr/bin/python3 ##===- utils/llvmbuild - Build the LLVM project ----------------*-python-*-===## -# +# # The LLVM Compiler Infrastructure # # This file is distributed under the University of Illinois Open Source # License. See LICENSE.TXT for details. -# +# ##===----------------------------------------------------------------------===## # # This script builds many different flavors of the LLVM ecosystem. It @@ -147,6 +147,8 @@ def add_options(parser): help=("Do not build dragonegg")) parser.add_option("--no-install", default=False, action="store_true", help=("Do not do installs")) + parser.add_option("--keep-going", default=False, action="store_true", + help=("Keep going after failures")) return def check_options(parser, options, valid_builds): @@ -282,7 +284,7 @@ class Builder(threading.Thread): for key, value in env.items(): execenv[key] = value - + self.logger.debug("[" + prefix + "] " + "env " + str(env) + " " + " ".join(command)); @@ -299,6 +301,11 @@ class Builder(threading.Thread): + str(line, "utf-8").rstrip()) line = proc.stdout.readline() + (stdoutdata, stderrdata) = proc.communicate() + retcode = proc.wait() + + return retcode + except: traceback.print_exc() @@ -327,6 +334,7 @@ class Builder(threading.Thread): self.logger.debug("Start Gather") gather = True line = proc.stdout.readline() + except: traceback.print_exc() self.logger.debug(includes) @@ -353,16 +361,16 @@ class Builder(threading.Thread): configure_flags = dict( llvm=dict(debug=["--prefix=" + self.install_prefix, - "--with-extra-options=-Werror", + "--enable-werror", "--enable-assertions", "--disable-optimized", "--with-gcc-toolchain=" + cxxroot], release=["--prefix=" + self.install_prefix, - "--with-extra-options=-Werror", + "--enable-werror", "--enable-optimized", "--with-gcc-toolchain=" + cxxroot], paranoid=["--prefix=" + self.install_prefix, - "--with-extra-options=-Werror", + "--enable-werror", "--enable-assertions", "--enable-expensive-checks", "--disable-optimized", @@ -438,7 +446,7 @@ class Builder(threading.Thread): for component in components: comp = component[:] - + if (self.options.no_dragonegg): if (comp == 'dragonegg'): self.logger.info("Skipping " + component + " in " @@ -458,43 +466,74 @@ class Builder(threading.Thread): "").split()) self.logger.info("Configuring " + component + " in " + builddir) - self.configure(component, srcdir, builddir, - config_args, - configure_env[comp_key][build]) - - self.logger.info("Building " + component + " in " + builddir) - self.logger.info("Build: make " + str(make_flags[comp_key][build])) - self.make(component, srcdir, builddir, - make_flags[comp_key][build], - make_env[comp_key][build]) - - if (not self.options.no_install): - self.logger.info("Installing " + component + " in " + installdir) - self.make(component, srcdir, builddir, - make_install_flags[comp_key][build], - make_install_env[comp_key][build]) - - self.logger.info("Testing " + component + " in " + builddir) - self.logger.info("Test: make " - + str(make_check_flags[comp_key][build])) - self.make(component, srcdir, builddir, - make_check_flags[comp_key][build], - make_check_env[comp_key][build]) - + configrc = self.configure(component, srcdir, builddir, + config_args, + configure_env[comp_key][build]) + + if (configrc == None) : + self.logger.info("[None] Failed to configure " + component + " in " + installdir) 
+ + if (configrc == 0 or self.options.keep_going) : + self.logger.info("Building " + component + " in " + builddir) + self.logger.info("Build: make " + str(make_flags[comp_key][build])) + buildrc = self.make(component, srcdir, builddir, + make_flags[comp_key][build], + make_env[comp_key][build]) + + if (buildrc == None) : + self.logger.info("[None] Failed to build " + component + " in " + installdir) + + if (buildrc == 0 or self.options.keep_going) : + self.logger.info("Testing " + component + " in " + builddir) + self.logger.info("Test: make " + + str(make_check_flags[comp_key][build])) + testrc = self.make(component, srcdir, builddir, + make_check_flags[comp_key][build], + make_check_env[comp_key][build]) + + if (testrc == None) : + self.logger.info("[None] Failed to test " + component + " in " + installdir) + + if ((testrc == 0 or self.options.keep_going) + and not self.options.no_install): + self.logger.info("Installing " + component + " in " + installdir) + self.make(component, srcdir, builddir, + make_install_flags[comp_key][build], + make_install_env[comp_key][build]) + else : + self.logger.info("Failed testing " + component + " in " + installdir) + + else : + self.logger.info("Failed to build " + component + " in " + installdir) + + else : + self.logger.info("Failed to configure " + component + " in " + installdir) def configure(self, component, srcdir, builddir, flags, env): + prefix = self.component_abbrev[component.replace("-", "_")] + self.logger.debug("Configure " + str(flags) + " " + str(srcdir) + " -> " + str(builddir)) configure_files = dict( llvm=[(srcdir + "/configure", builddir + "/Makefile")], - dragonegg=[("","")]) + dragonegg=[(None,None)]) doconfig = False for conf, mf in configure_files[component.replace("-", "_")]: + if conf is None: + # No configure necessary + return 0 + if not os.path.exists(conf): - return + self.logger.info("[" + prefix + "] Configure failed, no configure script " + conf) + return -1 + + if not os.path.exists(mf): + self.logger.info("[" + prefix + "] Configure failed, no makefile " + mf) + return -1 + if os.path.exists(conf) and os.path.exists(mf): confstat = os.stat(conf) makestat = os.stat(mf) @@ -506,16 +545,17 @@ class Builder(threading.Thread): break if not doconfig and not self.options.force_configure: - return + return 0 program = srcdir + "/configure" if not is_executable(program): - return + self.logger.info("[" + prefix + "] Configure failed, cannot execute " + program) + return -1 args = [program] args += ["--verbose"] args += flags - self.execute(args, builddir, env, component) + return self.execute(args, builddir, env, component) def make(self, component, srcdir, builddir, flags, env): program = find_executable("make") @@ -527,7 +567,7 @@ class Builder(threading.Thread): args = [program] args += flags - self.execute(args, builddir, env, component) + return self.execute(args, builddir, env, component) # Global constants build_abbrev = dict(debug="dbg", release="opt", paranoid="par") diff --git a/utils/llvm-lit/llvm-lit.in b/utils/llvm-lit/llvm-lit.in index 768dc51..87878d5 100755 --- a/utils/llvm-lit/llvm-lit.in +++ b/utils/llvm-lit/llvm-lit.in @@ -13,8 +13,7 @@ sys.path.insert(0, os.path.join(llvm_source_root, 'utils', 'lit')) # Set up some builtin parameters, so that by default the LLVM test suite # configuration file knows how to find the object tree. 
builtin_parameters = { - 'build_config' : "@CMAKE_CFG_INTDIR@", - 'build_mode' : "@RUNTIME_BUILD_MODE@", + 'build_mode' : "@CMAKE_CFG_INTDIR@", 'llvm_site_config' : os.path.join(llvm_obj_root, 'test', 'lit.site.cfg') } diff --git a/utils/llvm.grm b/utils/llvm.grm index 322036b..d65f075 100644 --- a/utils/llvm.grm +++ b/utils/llvm.grm @@ -174,7 +174,9 @@ FuncAttr ::= noreturn | sspreq | returns_twice | nonlazybind - | address_safety + | sanitize_address + | sanitize_thread + | sanitize_memory ; OptFuncAttrs ::= + _ | OptFuncAttrs FuncAttr ; diff --git a/utils/llvm.natvis b/utils/llvm.natvis new file mode 100644 index 0000000..6b4ef83 --- /dev/null +++ b/utils/llvm.natvis @@ -0,0 +1,181 @@ +<?xml version="1.0" encoding="utf-8"?>
+<!--
+Visual Studio 2012 Native Debugging Visualizers for LLVM
+
+Put this file into "%USERPROFILE%\Documents\Visual Studio 2012\Visualizers"
+or create a symbolic link so it updates automatically.
+-->
+<AutoVisualizer xmlns="http://schemas.microsoft.com/vstudio/debugger/natvis/2010">
+ <Type Name="llvm::SmallVector<*,*>">
+ <DisplayString Condition="(($T1*)EndX - ($T1*)BeginX) == 0">empty</DisplayString>
+ <DisplayString Condition="(($T1*)EndX - ($T1*)BeginX) != 0">{{ size={($T1*)EndX - ($T1*)BeginX} }}</DisplayString>
+ <Expand>
+ <Item Name="[size]">($T1*)EndX - ($T1*)BeginX</Item>
+ <Item Name="[capacity]">($T1*)CapacityX - ($T1*)BeginX</Item>
+ <ArrayItems>
+ <Size>($T1*)EndX - ($T1*)BeginX</Size>
+ <ValuePointer>($T1*)BeginX</ValuePointer>
+ </ArrayItems>
+ </Expand>
+ </Type>
+
+ <Type Name="llvm::SmallVectorImpl<*>">
+ <DisplayString Condition="(($T1*)EndX - ($T1*)BeginX) == 0">empty</DisplayString>
+ <DisplayString Condition="(($T1*)EndX - ($T1*)BeginX) != 0">[{($T1*)EndX - ($T1*)BeginX}]</DisplayString>
+ <Expand>
+ <Item Name="[size]">($T1*)EndX - ($T1*)BeginX</Item>
+ <Item Name="[capacity]">($T1*)CapacityX - ($T1*)BeginX</Item>
+ <ArrayItems>
+ <Size>($T1*)EndX - ($T1*)BeginX</Size>
+ <ValuePointer>($T1*)BeginX</ValuePointer>
+ </ArrayItems>
+ </Expand>
+ </Type>
+
+ <Type Name="llvm::SmallString<*>">
+ <DisplayString>{BeginX,s}</DisplayString>
+ <StringView>BeginX,s</StringView>
+ <Expand>
+ <Item Name="[size]">(char*)EndX - (char*)BeginX</Item>
+ <Item Name="[capacity]">(char*)CapacityX - (char*)BeginX</Item>
+ <ArrayItems>
+ <Size>(char*)EndX - (char*)BeginX</Size>
+ <ValuePointer>(char*)BeginX</ValuePointer>
+ </ArrayItems>
+ </Expand>
+ </Type>
+
+ <Type Name="llvm::StringRef">
+ <DisplayString>[{Length}] {Data,s}</DisplayString>
+ <StringView>Data,s</StringView>
+ <Expand>
+ <Item Name="[length]">Length</Item>
+ <ArrayItems>
+ <Size>Length</Size>
+ <ValuePointer>Data</ValuePointer>
+ </ArrayItems>
+ </Expand>
+ </Type>
+
+ <Type Name="llvm::PointerIntPair<*,*,*,*>">
+ <DisplayString>{Value & PointerBitMask} [{(Value >> IntShift) & IntMask}]</DisplayString>
+ <Expand>
+ <Item Name="[ptr]">Value & PointerBitMask</Item>
+ <Item Name="[int]">(Value >> IntShift) & IntMask</Item>
+ </Expand>
+ </Type>
+
+ <Type Name="llvm::PointerUnion<*,*>">
+ <DisplayString Condition="((Val.Value >> Val.IntShift) & Val.IntMask) == 0">[P1] {($T1)(Val.Value & Val.PointerBitMask)}</DisplayString>
+ <DisplayString Condition="((Val.Value >> Val.IntShift) & Val.IntMask) != 0">[P2] {($T2)(Val.Value & Val.PointerBitMask)}</DisplayString>
+ <Expand>
+ <Item Name="[ptr]" Condition="((Val.Value >> Val.IntShift) & Val.IntMask) == 0">($T1)(Val.Value & Val.PointerBitMask)</Item>
+ <Item Name="[ptr]" Condition="((Val.Value >> Val.IntShift) & Val.IntMask) != 0">($T2)(Val.Value & Val.PointerBitMask)</Item>
+ </Expand>
+ </Type>
+
+ <Type Name="llvm::PointerUnion3<*,*,*>">
+ <DisplayString Condition="(Val.Val.Value & 2) != 2 && (Val.Val.Value & 1) != 1">[P1] {($T1)((Val.Val.Value >> 2) << 2)}</DisplayString>
+ <DisplayString Condition="(Val.Val.Value & 2) == 2">[P2] {($T2)((Val.Val.Value >> 2) << 2)}</DisplayString>
+ <DisplayString Condition="(Val.Val.Value & 1) == 1">[P3] {($T3)((Val.Val.Value >> 2) << 2)}</DisplayString>
+ <Expand>
+ <Item Name="[ptr]" Condition="(Val.Val.Value & 2) != 2 && (Val.Val.Value & 1) != 1">($T1)((Val.Val.Value >> 2) << 2)</Item>
+ <Item Name="[ptr]" Condition="(Val.Val.Value & 2) == 2">($T2)((Val.Val.Value >> 2) << 2)</Item>
+ <Item Name="[ptr]" Condition="(Val.Val.Value & 1) == 1">($T3)((Val.Val.Value >> 2) << 2)</Item>
+ </Expand>
+ </Type>
+
+ <Type Name="llvm::PointerUnion4<*,*,*,*>">
+ <DisplayString Condition="(Val.Val.Value & 3) != 3 && (Val.Val.Value & 2) != 2 && (Val.Val.Value & 1) != 1">[P1] {($T1)((Val.Val.Value >> 2) << 2)}</DisplayString>
+ <DisplayString Condition="(Val.Val.Value & 3) != 3 && (Val.Val.Value & 2) == 2">[P2] {($T2)((Val.Val.Value >> 2) << 2)}</DisplayString>
+ <DisplayString Condition="(Val.Val.Value & 3) != 3 && (Val.Val.Value & 1) == 1">[P3] {($T3)((Val.Val.Value >> 2) << 2)}</DisplayString>
+ <DisplayString Condition="(Val.Val.Value & 3) == 3">[P4] {($T4)((Val.Val.Value >> 2) << 2)}</DisplayString>
+ <Expand>
+ <Item Name="[ptr]" Condition="(Val.Val.Value & 3) != 3 && (Val.Val.Value & 2) != 2 && (Val.Val.Value & 1) != 1">($T1)((Val.Val.Value >> 2) << 2)</Item>
+ <Item Name="[ptr]" Condition="(Val.Val.Value & 3) != 3 && (Val.Val.Value & 2) == 2">($T2)((Val.Val.Value >> 2) << 2)</Item>
+ <Item Name="[ptr]" Condition="(Val.Val.Value & 3) != 3 && (Val.Val.Value & 1) == 1">($T3)((Val.Val.Value >> 2) << 2)</Item>
+ <Item Name="[ptr]" Condition="(Val.Val.Value & 3) == 3">($T4)((Val.Val.Value >> 2) << 2)</Item>
+ </Expand>
+ </Type>
+
+ <Type Name="llvm::iplist<*,*>">
+ <DisplayString Condition="Head == 0">{{ empty }}</DisplayString>
+ <DisplayString Condition="Head != 0">{{ head={Head} }}</DisplayString>
+ <Expand>
+ <LinkedListItems>
+ <HeadPointer>Head</HeadPointer>
+ <NextPointer>Next</NextPointer>
+ <ValueNode>this</ValueNode>
+ </LinkedListItems>
+ </Expand>
+ </Type>
+
+ <Type Name="llvm::IntrusiveRefCntPtr<*>">
+ <DisplayString Condition="Obj == 0">empty</DisplayString>
+ <DisplayString Condition="(Obj != 0) && (Obj->ref_cnt == 1)">RefPtr [1 ref] {*Obj}</DisplayString>
+ <DisplayString Condition="(Obj != 0) && (Obj->ref_cnt != 1)">RefPtr [{Obj->ref_cnt} refs] {*Obj}</DisplayString>
+ <Expand>
+ <Item Condition="Obj != 0" Name="[refs]">Obj->ref_cnt</Item>
+ <Item Condition="Obj != 0" Name="[ptr]">Obj</Item>
+ </Expand>
+ </Type>
+
+ <Type Name="llvm::OwningPtr<*>">
+ <DisplayString Condition="Ptr == 0">empty</DisplayString>
+ <DisplayString Condition="Ptr != 0">OwningPtr {*Ptr}</DisplayString>
+ <Expand>
+ <Item Condition="Ptr != 0" Name="[ptr]">Ptr</Item>
+ </Expand>
+ </Type>
+
+ <Type Name="llvm::SmallPtrSet<*,*>">
+ <DisplayString Condition="CurArray == SmallArray">{{ [Small Mode] elements={NumElements}, arraySize={CurArraySize} }}</DisplayString>
+ <DisplayString Condition="CurArray != SmallArray">{{ [Big Mode] elements={NumElements}, arraySize={CurArraySize} }}</DisplayString>
+ <Expand>
+ <Item Name="[NumElements]">NumElements</Item>
+ <Item Name="[CurArraySize]">CurArraySize</Item>
+ <IndexListItems>
+ <Size>CurArraySize + 1</Size>
+ <ValueNode>($T1*)&CurArray[$i]</ValueNode>
+ </IndexListItems>
+ </Expand>
+ </Type>
+
+ <Type Name="llvm::DenseMap<*,*,*>">
+ <DisplayString Condition="NumEntries == 0">empty</DisplayString>
+ <DisplayString Condition="NumEntries != 0">{{ entries={NumEntries}, buckets={NumBuckets} }}</DisplayString>
+ <Expand>
+ <Item Name="[NumEntries]">NumEntries</Item>
+ <Item Name="[NumBuckets]">NumBuckets</Item>
+ <ArrayItems>
+ <Size>NumBuckets</Size>
+ <ValuePointer>Buckets</ValuePointer>
+ </ArrayItems>
+ </Expand>
+ </Type>
+
+ <Type Name="llvm::StringMap<*,*>">
+ <DisplayString>{{ NumBuckets={NumBuckets}, ItemSize={ItemSize} }}</DisplayString>
+ <Expand>
+ <Item Name="[NumBuckets]">NumBuckets</Item>
+ <Item Name="[ItemSize]">ItemSize</Item>
+ <IndexListItems>
+ <Size>NumBuckets</Size>
+ <ValueNode>(llvm::StringMapEntry<$T1>*)TheTable[$i]</ValueNode>
+ </IndexListItems>
+ </Expand>
+ </Type>
+
+ <Type Name="llvm::StringMapEntry<*>">
+ <DisplayString Condition="StrLen == 0">empty</DisplayString>
+ <DisplayString Condition="StrLen != 0">({((llvm::StringMapEntry<$T1>*)this)+1,s}, {second})</DisplayString>
+ <Expand>
+ <Item Name="[key]">((llvm::StringMapEntry<$T1>*)this)+1,s</Item>
+ <Item Name="[value]" Condition="StrLen != 0">second</Item>
+ </Expand>
+ </Type>
+
+ <Type Name="llvm::Triple">
+ <DisplayString>{Data}</DisplayString>
+ </Type>
+</AutoVisualizer>
diff --git a/utils/testgen/mc-bundling-x86-gen.py b/utils/testgen/mc-bundling-x86-gen.py index 832e841..5c1c6c4 100644 --- a/utils/testgen/mc-bundling-x86-gen.py +++ b/utils/testgen/mc-bundling-x86-gen.py @@ -1,3 +1,4 @@ + #!/usr/bin/python # Auto-generates an exhaustive and repetitive test for correct bundle-locked @@ -40,7 +41,7 @@ def print_bundle_locked_sequence(len, align_to_end=False): def generate(align_to_end=False): print(PREAMBLE) - + ntest = 0 for instlen in range(1, BUNDLE_SIZE + 1): for offset in range(0, BUNDLE_SIZE): @@ -56,13 +57,15 @@ def generate(align_to_end=False): base_offset = ntest * 2 * BUNDLE_SIZE inst_orig_offset = base_offset + offset # had it not been padded... - def print_check(adjusted_offset=None): + def print_check(adjusted_offset=None, nop_split_offset=None): if adjusted_offset is not None: print('# CHECK: {0:x}: nop'.format(inst_orig_offset)) + if nop_split_offset is not None: + print('# CHECK: {0:x}: nop'.format(nop_split_offset)) print('# CHECK: {0:x}: incl'.format(adjusted_offset)) else: print('# CHECK: {0:x}: incl'.format(inst_orig_offset)) - + if align_to_end: if offset + instlen == BUNDLE_SIZE: # No padding needed @@ -72,9 +75,13 @@ def generate(align_to_end=False): offset_to_end = base_offset + (BUNDLE_SIZE - instlen) print_check(offset_to_end) else: # offset + instlen > BUNDLE_SIZE - # Pad to end at next bundle boundary + # Pad to end at next bundle boundary, splitting the nop sequence + # at the nearest bundle boundary + offset_to_nearest_bundle = base_offset + BUNDLE_SIZE offset_to_end = base_offset + (BUNDLE_SIZE * 2 - instlen) - print_check(offset_to_end) + if offset_to_nearest_bundle == offset_to_end: + offset_to_nearest_bundle = None + print_check(offset_to_end, offset_to_nearest_bundle) else: if offset + instlen > BUNDLE_SIZE: # Padding needed @@ -94,5 +101,3 @@ if __name__ == '__main__': help='generate .bundle_lock with align_to_end option') args = argparser.parse_args() generate(align_to_end=args.align_to_end) - - diff --git a/utils/unittest/UnitTestMain/TestMain.cpp b/utils/unittest/UnitTestMain/TestMain.cpp index b35bae5..ce32b73 100644 --- a/utils/unittest/UnitTestMain/TestMain.cpp +++ b/utils/unittest/UnitTestMain/TestMain.cpp @@ -8,6 +8,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Config/config.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Signals.h" #include "gtest/gtest.h" @@ -22,6 +23,7 @@ int main(int argc, char **argv) { llvm::sys::PrintStackTraceOnErrorSignal(); testing::InitGoogleTest(&argc, argv); + llvm::cl::ParseCommandLineOptions(argc, argv); # if defined(LLVM_ON_WIN32) // Disable all of the possible ways Windows conspires to make automated diff --git a/utils/unittest/googletest/gtest-printers.cc b/utils/unittest/googletest/gtest-printers.cc index ed63c7b..205a394 100644 --- a/utils/unittest/googletest/gtest-printers.cc +++ b/utils/unittest/googletest/gtest-printers.cc @@ -127,7 +127,7 @@ namespace internal { // Depending on the value of a char (or wchar_t), we print it in one // of three formats: // - as is if it's a printable ASCII (e.g. 'a', '2', ' '), -// - as a hexidecimal escape sequence (e.g. '\x7F'), or +// - as a hexadecimal escape sequence (e.g. '\x7F'), or // - as a special escape sequence (e.g. '\r', '\n'). 
enum CharFormat { kAsIs, @@ -230,7 +230,7 @@ void PrintCharAndCodeTo(Char c, ostream* os) { return; *os << " (" << String::Format("%d", c).c_str(); - // For more convenience, we print c's code again in hexidecimal, + // For more convenience, we print c's code again in hexadecimal, // unless c was already printed in the form '\x##' or the code is in // [1, 9]. if (format == kHexEscape || (1 <= c && c <= 9)) { diff --git a/utils/valgrind/x86_64-pc-linux-gnu.supp b/utils/valgrind/x86_64-pc-linux-gnu.supp index fc863b8..c8e5cd0 100644 --- a/utils/valgrind/x86_64-pc-linux-gnu.supp +++ b/utils/valgrind/x86_64-pc-linux-gnu.supp @@ -33,6 +33,12 @@ } { + We don't care about cmp + Memcheck:Cond + obj:/usr/bin/cmp +} + +{ We don't care if grep leaks Memcheck:Leak obj:/bin/grep diff --git a/utils/vim/llvm.vim b/utils/vim/llvm.vim index 753549f..830476f 100644 --- a/utils/vim/llvm.vim +++ b/utils/vim/llvm.vim @@ -35,7 +35,7 @@ syn keyword llvmStatement uitofp ule ult umax umin une uno unreachable unwind syn keyword llvmStatement urem va_arg xchg xor zext " Keywords. -syn keyword llvmKeyword acq_rel acquire address_safety addrspace alias align +syn keyword llvmKeyword acq_rel acquire sanitize_address addrspace alias align syn keyword llvmKeyword alignstack alwaysinline appending arm_aapcs_vfpcc syn keyword llvmKeyword arm_aapcscc arm_apcscc asm atomic available_externally syn keyword llvmKeyword blockaddress byval c catch cc ccc cleanup coldcc common @@ -51,10 +51,11 @@ syn keyword llvmKeyword noimplicitfloat noinline nonlazybind noredzone noreturn syn keyword llvmKeyword nounwind optsize personality private protected syn keyword llvmKeyword ptx_device ptx_kernel readnone readonly release syn keyword llvmKeyword returns_twice section seq_cst sideeffect signext -syn keyword llvmKeyword singlethread spir_func spir_kernel sret ssp sspreq tail -syn keyword llvmKeyword target thread_local to triple unnamed_addr unordered -syn keyword llvmKeyword uwtable volatile weak weak_odr x86_fastcallcc -syn keyword llvmKeyword x86_stdcallcc x86_thiscallcc zeroext +syn keyword llvmKeyword singlethread spir_func spir_kernel sret ssp sspreq +syn keyword llvmKeyword sspstrong tail target thread_local to triple +syn keyword llvmKeyword unnamed_addr unordered uwtable volatile weak weak_odr +syn keyword llvmKeyword x86_fastcallcc x86_stdcallcc x86_thiscallcc zeroext +syn keyword llvmKeyword sanitize_thread sanitize_memory " Obsolete keywords. syn keyword llvmError getresult begin end
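For reference, the nop-splitting arithmetic that mc-bundling-x86-gen.py gains above can be restated on its own. This is a minimal sketch assuming the generator's 16-byte BUNDLE_SIZE; it mirrors the align_to_end branch where offset + instlen exceeds the bundle, so the instruction is placed to end exactly at the close of the following bundle and the nop padding in front of it is split at the nearest bundle boundary:

    BUNDLE_SIZE = 16

    def split_padding(base_offset, instlen):
        # Where the nop run is interrupted by a bundle boundary, and where
        # the padded instruction starts (so that it ends at base + 2*BUNDLE_SIZE).
        offset_to_nearest_bundle = base_offset + BUNDLE_SIZE
        offset_to_end = base_offset + (BUNDLE_SIZE * 2 - instlen)
        if offset_to_nearest_bundle == offset_to_end:
            # Split point and instruction coincide: the padding is contiguous.
            offset_to_nearest_bundle = None
        return offset_to_nearest_bundle, offset_to_end

    print split_padding(0x40, 5)   # prints (80, 91): nops split at 0x50, incl at 0x5b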